def getTagTermAffinityScores(questions, includeCounts=True, frequentWords=None):
    """Build the tag/term affinity table for a collection of questions.

    For every frequent word ``w`` and tag ``t`` the table stores the share of
    questions tagged ``t`` whose body contains ``w`` at least once:
    ``ttas[w][t] = (#questions tagged t containing w) / (#questions tagged t)``.

    Args:
        questions: mapping whose values expose ``tags`` (an iterable of tag
            ids) and ``bodyByte`` (the offset passed to ``seek`` on
            ``posts_body_file`` before reading that question's body line).
        includeCounts: when true, also return the per-tag question counts.
        frequentWords: collection of words to keep. When falsy (``None`` or
            empty) it is computed from ``questions`` through
            ``wordvectors.getFrequentWords``.

    Returns:
        ``(ttas, finalTagCounts)`` when ``includeCounts`` is true, otherwise
        just ``ttas``. ``finalTagCounts`` only holds tags attached to at
        least 50 questions; ``ttas`` itself is not filtered by that threshold.
    """
    if not frequentWords:
        frequentWords = set(wordvectors.getFrequentWords(questions)[0])

    ttas = {}
    tagCounts = {}

    # The context manager guarantees the body file is closed even when
    # seeking, reading or tokenising a post raises.
    with codecs.open(posts_body_file, 'r', 'utf-8') as infile_body:
        for question in questions.values():
            for tagID in question.tags:
                tagCounts[tagID] = tagCounts.get(tagID, 0) + 1

            infile_body.seek(question.bodyByte)
            postWords = wordvectors.getWordsFromPost(infile_body.readline())

            # set(): a word counts once per question, however often it occurs.
            for word in set(postWords):
                if word not in frequentWords:
                    continue
                inner_dict = ttas.setdefault(word, {})
                for tagID in question.tags:
                    inner_dict[tagID] = inner_dict.get(tagID, 0) + 1

    # Turn raw co-occurrence counts into per-tag fractions. Only existing
    # keys are reassigned, so the dict is safe to update while iterating.
    for inner_dict in ttas.values():
        for tagID in inner_dict:
            inner_dict[tagID] = float(inner_dict[tagID]) / tagCounts[tagID]

    if not includeCounts:
        return ttas

    # Only tags seen on at least 50 questions are reported back.
    finalTagCounts = {}
    for (tagID, count) in tagCounts.items():
        if count >= 50:
            finalTagCounts[tagID] = count
    return (ttas, finalTagCounts)
def getTagTermBasedRankingScores(questionBody, ttas, tagCounts):
    """Score every tag in ``tagCounts`` against the words of a question body.

    For each tag the raw score is ``1 - prod(1 - ttas[w].get(tag, 0))`` taken
    over every word returned by ``wordvectors.getWordsFromPost`` that is a key
    of ``ttas``. The raw scores are then rescaled as
    ``(raw - low) / (high - low)``.

    Args:
        questionBody: text handed to ``wordvectors.getWordsFromPost``.
        ttas: mapping word -> {tagID: affinity}.
        tagCounts: mapping whose keys are the candidate tag ids.

    Returns:
        dict mapping each tag id of ``tagCounts`` to its rescaled score.
    """
    tokens = wordvectors.getWordsFromPost(questionBody)

    # Running product of (1 - affinity) per candidate tag.
    leftover = dict.fromkeys(tagCounts, 1.0)
    for token in tokens:
        if token not in ttas:
            continue
        affinities = ttas[token]
        for tag in leftover:
            leftover[tag] *= (1 - affinities.get(tag, 0))

    rawScores = {tag: 1 - remaining for (tag, remaining) in leftover.items()}

    # Both bounds are seeded with 0.0, so the lower bound never rises above
    # zero and the upper bound never drops below it.
    seeded = [0.0] + list(rawScores.values())
    high = max(seeded)
    low = min(seeded)

    # A zero spread would divide by zero; fall back to a divisor of 1.
    spread = (high - low) or 1

    return {tag: (raw - low) / spread for (tag, raw) in rawScores.items()}