def calc_similarity(quest, parag):
    """Return a lexical similarity score between a question and a paragraph.

    Each distinct question word contributes a weight depending on the best
    match found in the paragraph, tried in order of strictness:
      exact surface word      -> 1.0
      shared lemma            -> 1.0
      shared stem             -> 0.95
      shared crop (prefix)    -> 0.80
      synonymous stem pair    -> 0.70
      best word2vec cosine    -> cos * 0.5   (last resort)
    The accumulated weight is averaged over the number of distinct question
    words, so the result is normally in roughly [0, 1].

    NOTE(review): relies on module-level names defined elsewhere in this
    project: TextNormalizer, syn_stems, w2v, v_cosine.
    """
    quest_words = set(TextNormalizer.tokenize_words(quest))
    if not quest_words:
        # BUGFIX: the original divided by len(quest_words) unconditionally,
        # raising ZeroDivisionError on an empty/whitespace-only question.
        return 0.0

    parag_words = set(TextNormalizer.tokenize_words(parag))
    parag_lemmas = set(TextNormalizer.tokenize_lemmas(parag))
    parag_stems = set(TextNormalizer.tokenize_stems(parag))
    parag_crops = set(TextNormalizer.tokenize_crops(parag))

    sim = 0.0
    for qword in quest_words:
        # Exact surface match.
        if qword in parag_words:
            sim += 1.0
            continue

        # Lemma match.
        qlemma = TextNormalizer.lemmatize_word(qword)
        if qlemma in parag_lemmas:
            sim += 1.0
            continue

        # Stem match.
        qstem = TextNormalizer.stem_word(qword)
        if qstem in parag_stems:
            sim += 0.95
            continue

        # Cropped-prefix match.
        qcrop = TextNormalizer.crop_word(qword)
        if qcrop in parag_crops:
            sim += 0.80
            continue

        # Synonym lookup on (question stem, paragraph stem) pairs.
        if any((qstem, pstem) in syn_stems for pstem in parag_stems):
            sim += 0.70
            continue

        # Last resort: best word2vec cosine against any paragraph word.
        if qword in w2v:
            qvec = w2v[qword]
            best_cos = None
            for pword in parag_words:
                if pword in w2v:
                    c = v_cosine(qvec, w2v[pword])
                    if best_cos is None or c > best_cos:
                        best_cos = c
            # BUGFIX: the original seeded max_cos with -1e38 and added
            # max_cos*0.5 even when NO paragraph word was in w2v, which
            # poisoned the score with a huge negative value.  Only add the
            # cosine term when at least one comparison actually happened.
            if best_cos is not None:
                sim += best_cos * 0.5

    return sim / len(quest_words)
def filter_NEs(tokens):
    """Return the cropped forms of the tokens that pass the is_NE predicate.

    NOTE(review): relies on module-level is_NE and TextNormalizer defined
    elsewhere in this project.
    """
    cropped = []
    for token in tokens:
        if is_NE(token):
            cropped.append(TextNormalizer.crop_word(token))
    return cropped