Example #1
0
# A single splitter instance is reused for every paragraph below.
segmenter = Segmenter()

print('Segmentation of paragraph texts...')

# For every row of both frames, write into column 'max_intersect3' the largest
# value, over the segments returned by segmenter.split(paragraph), of
#     |question stems  &  segment stems| / |question stems|
# i.e. the share of distinct question stems found in a single segment.
for name, df in [('train', dftrain), ('test', dftest)]:

    for index, row in tqdm.tqdm(
            df.iterrows(),
            total=df.shape[0],
            desc="Calculating question similarities for " + name):

        question = TextNormalizer.preprocess_question_str(row.question)
        paragraph = row.paragraph

        quest_stems = TextNormalizer.tokenize_stems(u' '.join(
            TextNormalizer.preprocess_question(question)))
        quest_set = set(quest_stems)
        # float() keeps `/` a true division even without
        # `from __future__ import division`.
        denom = float(len(quest_set))

        max_intersect3 = 0.0

        # A question that produced no stems cannot overlap with anything.
        # Skipping the scan keeps the 0.0 default and avoids dividing by a
        # zero denominator (previously a ZeroDivisionError).
        if quest_set:
            for parag_sent in segmenter.split(paragraph):
                parag_stems = TextNormalizer.tokenize_stems(
                    Abbrev.normalize_abbrev(parag_sent))
                intersect3 = len(quest_set & set(parag_stems)) / denom
                max_intersect3 = max(max_intersect3, intersect3)

        df.loc[index, 'max_intersect3'] = max_intersect3

# ----------------------------------------------------------------------------
Example #2
0
    # Write a human-readable record for the current sample to `wrt`: every
    # paragraph segment, the question, its preprocessed form, and the best
    # stem/crop matches.
    # NOTE(review): `wrt`, `index`, `paragraph`, `question`, `quest_stems`,
    # `quest_crops`, `denom_stems`, `denom_crops`, `max_stem_match`,
    # `max_crop_match` and the `best_*_match_sent` variables are all set before
    # this fragment; presumably the max/best values are reset per sample --
    # confirm against the enclosing loop.
    wrt.write('\n\nid={}\n'.format(index))
    for i, parag_sent in enumerate(segmenter.split(paragraph)):
        # One 'P[i]' line per segment returned by segmenter.split().
        wrt.write(u'P[{}]\t{}\n'.format(i, parag_sent))

        parag_stems = set(TextNormalizer.tokenize_stems(parag_sent))
        parag_crops = set(TextNormalizer.tokenize_crops(parag_sent))

        # Size of the overlap with the question's token sets, scaled by the
        # precomputed denominators.
        # NOTE(review): a zero denominator would raise ZeroDivisionError here;
        # verify that the code defining denom_stems/denom_crops rules it out.
        match_stems = len(parag_stems & quest_stems) / denom_stems
        match_crops = len(parag_crops & quest_crops) / denom_crops

        # Strict '>' means the earliest segment wins when scores tie.
        if match_stems > max_stem_match:
            max_stem_match = match_stems
            best_stem_match_sent = parag_sent

        if match_crops > max_crop_match:
            max_crop_match = match_crops
            best_crop_match_sent = parag_sent

    # 'Q' line: the question as held in `question`.
    wrt.write(u'Q\t{}\n'.format(question))

    # 'Z' line: the output of preprocess_question() joined with single spaces.
    q2 = u' '.join(TextNormalizer.preprocess_question(question))
    wrt.write(u'Z\t{}\n'.format(q2))

    # Summary lines: best score and the segment that achieved it.
    wrt.write(u'Stem match ==> {} with {}\n'.format(max_stem_match,
                                                    best_stem_match_sent))
    wrt.write(u'Crop match ==> {} with {}\n'.format(max_crop_match,
                                                    best_crop_match_sent))

# Close both output writers (they are opened outside the visible fragment).
# NOTE(review): these calls are skipped if anything above raises; consider
# opening the writers in `with` blocks where they are created.
wrt0.close()
wrt1.close()