Exemple #1
0
        # Encode every paragraph token as an integer id, registering unseen
        # tokens in word2id with the next free id (the current dict size).
        # NOTE(review): unlike the best-sentence loop further down, this loop is
        # not clipped to max_len here -- confirm parag_words was truncated
        # upstream or that X_parag is wide enough.
        for word_pos, word in enumerate(parag_words):
            if word not in word2id:
                word2id[word] = len(word2id)

            X_parag[index, word_pos] = word2id[word]

        # Distinct stems of the question; the overlap score of a sentence is the
        # share of these stems that also occur in the sentence.
        quest_stems = TextNormalizer.tokenize_stems(question)
        quest_set = set(quest_stems)
        denom = float(len(quest_set))

        max_intersect3 = 0.0
        best_sent = u''

        for parag_sent in segmenter.split(paragraph):
            parag_stems = TextNormalizer.tokenize_stems(Abbrev.normalize_abbrev(parag_sent))
            # A question that yields no stems cannot overlap with anything;
            # score it as 0.0 instead of dividing by zero.
            if denom:
                intersect3 = len(quest_set & set(parag_stems)) / denom
            else:
                intersect3 = 0.0

            # Strict '>' keeps the earliest sentence on ties and leaves
            # best_sent empty when no sentence shares a stem with the question.
            if intersect3 > max_intersect3:
                max_intersect3 = intersect3
                best_sent = parag_sent

        # Encode the best-matching sentence, clipped to max_len tokens, using the
        # same shared vocabulary as the full paragraph.
        parag_words = TextNormalizer.tokenize_words(best_sent)
        for word_pos, word in enumerate(parag_words[0:max_len]):
            if word not in word2id:
                word2id[word] = len(word2id)

            X_parag1[index, word_pos] = word2id[word]

Exemple #2
0
    # Preprocess the question once, then collect its distinct stems and crops
    # (as produced by TextNormalizer.tokenize_stems / tokenize_crops).
    question1 = TextNormalizer.preprocess_question_str(question)
    quest_stems = set(TextNormalizer.tokenize_stems(question1))
    quest_crops = set(TextNormalizer.tokenize_crops(question1))

    # Denominators for the overlap ratios; either may be 0.0 when the question
    # yields no stems or no crops.
    denom_stems = float(len(quest_stems))
    denom_crops = float(len(quest_crops))

    max_crop_match = 0.0
    best_crop_match_sent = u''

    max_stem_match = 0.0
    best_stem_match_sent = u''

    wrt.write('\n\nid={}\n'.format(index))
    for i, parag_sent in enumerate(segmenter.split(paragraph)):
        # Dump every candidate sentence with its position in the paragraph.
        wrt.write(u'P[{}]\t{}\n'.format(i, parag_sent))

        parag_stems = set(TextNormalizer.tokenize_stems(parag_sent))
        parag_crops = set(TextNormalizer.tokenize_crops(parag_sent))

        # Share of the question's stems / crops found in this sentence. An
        # empty question set scores 0.0 rather than raising ZeroDivisionError.
        if denom_stems:
            match_stems = len(parag_stems & quest_stems) / denom_stems
        else:
            match_stems = 0.0

        if denom_crops:
            match_crops = len(parag_crops & quest_crops) / denom_crops
        else:
            match_crops = 0.0

        # Strict '>' keeps the earliest sentence on ties; the two criteria are
        # tracked independently and may select different sentences.
        if match_stems > max_stem_match:
            max_stem_match = match_stems
            best_stem_match_sent = parag_sent

        if match_crops > max_crop_match:
            max_crop_match = match_crops
            best_crop_match_sent = parag_sent