# Encode the paragraph words into the input matrix, growing the
# vocabulary on demand.
for word_pos, word in enumerate(parag_words[:max_len]):  # clip to the matrix width, as in the loop below
    if word not in word2id:
        word2id[word] = len(word2id)
    X_parag[index, word_pos] = word2id[word]

# Pick the paragraph sentence with the highest share of question stems.
quest_stems = TextNormalizer.tokenize_stems(question)
quest_set = set(quest_stems)
denom = float(len(quest_set)) or 1.0  # guard against an empty question

max_intersect3 = 0.0
best_sent = u''
for parag_sent in segmenter.split(paragraph):
    parag_stems = TextNormalizer.tokenize_stems(Abbrev.normalize_abbrev(parag_sent))
    intersect3 = len(quest_set & set(parag_stems)) / denom
    if intersect3 > max_intersect3:
        max_intersect3 = intersect3
        best_sent = parag_sent

# Encode the best-matching sentence into a second input matrix.
parag_words = TextNormalizer.tokenize_words(best_sent)
for word_pos, word in enumerate(parag_words[:max_len]):
    if word not in word2id:
        word2id[word] = len(word2id)
    X_parag1[index, word_pos] = word2id[word]
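# --- Illustration (not part of the original pipeline) ---
# A minimal, self-contained sketch of the grow-on-demand vocabulary
# encoding used above: each unseen token gets the next integer id, and
# token rows are written into a fixed-width integer matrix. The names
# encode_rows and word2id_demo are hypothetical; only numpy is assumed.
import numpy as np

def encode_rows(tokenized_rows, max_len, word2id):
    """Map tokens to ids, growing word2id on demand; clip each row to max_len."""
    X = np.zeros((len(tokenized_rows), max_len), dtype=np.int32)
    for row, words in enumerate(tokenized_rows):
        for pos, word in enumerate(words[:max_len]):
            word2id.setdefault(word, len(word2id))
            X[row, pos] = word2id[word]
    return X

# Usage: reserving id 0 for padding keeps real tokens distinct from the
# zeros that fill short rows; ids stay stable because word2id is shared.
word2id_demo = {u'': 0}
X_demo = encode_rows([[u'a', u'b', u'c'], [u'a', u'd']], 5, word2id_demo)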
# Preprocess the question once and prepare its stem and crop token sets.
question1 = TextNormalizer.preprocess_question_str(question)
quest_stems = set(TextNormalizer.tokenize_stems(question1))
quest_crops = set(TextNormalizer.tokenize_crops(question1))
denom_stems = float(len(quest_stems)) or 1.0  # guard against an empty question
denom_crops = float(len(quest_crops)) or 1.0

max_crop_match = 0.0
best_crop_match_sent = u''
max_stem_match = 0.0
best_stem_match_sent = u''

# Score every paragraph sentence by its stem and crop overlap with the
# question, logging each sentence, and keep the best match of each kind.
wrt.write('\n\nid={}\n'.format(index))
for i, parag_sent in enumerate(segmenter.split(paragraph)):
    wrt.write(u'P[{}]\t{}\n'.format(i, parag_sent))
    parag_stems = set(TextNormalizer.tokenize_stems(parag_sent))
    parag_crops = set(TextNormalizer.tokenize_crops(parag_sent))
    match_stems = len(parag_stems & quest_stems) / denom_stems
    match_crops = len(parag_crops & quest_crops) / denom_crops
    if match_stems > max_stem_match:
        max_stem_match = match_stems
        best_stem_match_sent = parag_sent
    if match_crops > max_crop_match:
        max_crop_match = match_crops
        best_crop_match_sent = parag_sent
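# --- Illustration (not part of the original pipeline) ---
# A minimal sketch of the best-sentence selection above: score each
# paragraph sentence by the fraction of question tokens it covers and
# keep the argmax. The project's TextNormalizer, Abbrev and segmenter
# helpers are replaced by a naive lowercase word split, so there is no
# stemming or cropping here; select_best_sentence is a hypothetical name.
import re

def simple_tokens(text):
    """Naive stand-in for tokenize_stems/tokenize_crops."""
    return set(re.findall(r'\w+', text.lower(), re.UNICODE))

def select_best_sentence(question, sentences):
    quest_set = simple_tokens(question)
    denom = float(len(quest_set)) or 1.0  # guard against an empty question
    best_score, best_sent = 0.0, u''
    for sent in sentences:
        score = len(quest_set & simple_tokens(sent)) / denom
        if score > best_score:
            best_score, best_sent = score, sent
    return best_sent, best_score

# Usage: the overlap is normalized by the question length only, so a long
# sentence is not penalized for extra words, matching the code above.
sent, score = select_best_sentence(u'who found it',
                                   [u'He lost it.', u'She found it.'])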