# For every row of the train/test frames, compute the best lexical overlap
# (share of question stems found in a sentence) between the question and any
# single sentence of the row's paragraph; store it as 'max_intersect3'.
segmenter = Segmenter()
print('Segmentation of paragraph texts...')
for name, df in [('train', dftrain), ('test', dftest)]:
    for index, row in tqdm.tqdm(df.iterrows(), total=df.shape[0],
                                desc="Calculating question similarities for " + name):
        question = TextNormalizer.preprocess_question_str(row.question)
        paragraph = row.paragraph
        quest_stems = TextNormalizer.tokenize_stems(u' '.join(
            TextNormalizer.preprocess_question(question)))
        quest_set = set(quest_stems)
        denom = float(len(quest_set))
        max_intersect3 = 0.0
        # Guard: a question that yields no stems after preprocessing would
        # otherwise raise ZeroDivisionError; its similarity is defined as 0.
        if denom > 0.0:
            for parag_sent in segmenter.split(paragraph):
                parag_stems = TextNormalizer.tokenize_stems(
                    Abbrev.normalize_abbrev(parag_sent))
                # Fraction of unique question stems present in this sentence.
                intersect3 = len(quest_set & set(parag_stems)) / denom
                max_intersect3 = max(max_intersect3, intersect3)
        df.loc[index, 'max_intersect3'] = max_intersect3
# ----------------------------------------------------------------------------
# NOTE(review): this fragment starts mid-iteration of an outer loop that is
# not visible here — `index`, `paragraph`, `question`, `quest_stems`,
# `quest_crops`, `denom_stems`, `denom_crops` and the running `max_*_match` /
# `best_*_match_sent` values all come from that out-of-view context.
# It writes a per-row diagnostic report: each paragraph sentence with its
# stem/crop overlap against the question, then the best-matching sentences.
wrt.write('\n\nid={}\n'.format(index))
for i, parag_sent in enumerate(segmenter.split(paragraph)):
    wrt.write(u'P[{}]\t{}\n'.format(i, parag_sent))
    parag_stems = set(TextNormalizer.tokenize_stems(parag_sent))
    parag_crops = set(TextNormalizer.tokenize_crops(parag_sent))
    # Overlap ratios: shared tokens divided by the question's token count
    # (denominators computed outside this fragment — presumably nonzero;
    # TODO confirm against the enclosing loop).
    match_stems = len(parag_stems & quest_stems) / denom_stems
    match_crops = len(parag_crops & quest_crops) / denom_crops
    # Track the best-scoring sentence for each tokenization scheme.
    if match_stems > max_stem_match:
        max_stem_match = match_stems
        best_stem_match_sent = parag_sent
    if match_crops > max_crop_match:
        max_crop_match = match_crops
        best_crop_match_sent = parag_sent
wrt.write(u'Q\t{}\n'.format(question))
q2 = u' '.join(TextNormalizer.preprocess_question(question))
wrt.write(u'Z\t{}\n'.format(q2))
wrt.write(u'Stem match ==> {} with {}\n'.format(max_stem_match, best_stem_match_sent))
wrt.write(u'Crop match ==> {} with {}\n'.format(max_crop_match, best_crop_match_sent))
# NOTE(review): wrt0/wrt1 are opened outside this view; their relation to
# `wrt` and the nesting level of these closes cannot be confirmed from here.
wrt0.close()
wrt1.close()