Exemple #1
0
def url_sets_similarity(entry):
    """Attach a URL-overlap feature to ``entry`` and return it.

    For each of ``entry['question1_document']`` and
    ``entry['question2_document']``, gathers the ``lemma`` of every token
    whose ``like_url`` flag is truthy into a set. The two sets are handed
    to ``jaccard_index`` and the result is written to
    ``entry['url_similarity_feature']``.

    NOTE(review): the documents look like spaCy ``Doc`` objects (tokens
    exposing ``like_url`` and ``lemma``) -- confirm against the caller.

    Args:
        entry: Record supporting item access; modified in place.

    Returns:
        The same ``entry`` object with the new feature set.
    """
    first_urls = {
        token.lemma
        for token in entry['question1_document']
        if token.like_url
    }
    second_urls = {
        token.lemma
        for token in entry['question2_document']
        if token.like_url
    }
    entry['url_similarity_feature'] = jaccard_index(first_urls, second_urls)
    return entry
Exemple #2
0
def unigram_idf_cutoff_similarity(entry):
    """Attach one unigram-overlap feature per IDF cutoff and return ``entry``.

    For each cutoff in 1, 5, 7.5, 10, 12.5 and 15, both question documents
    are passed through ``filter_words_with_minimum_idf`` with that cutoff,
    the two results are compared with ``jaccard_index``, and the score is
    stored under ``unigram_idf_cutoff_similarity_<cutoff>_feature``.

    NOTE(review): presumably the filter keeps only words whose IDF is at
    least the cutoff -- verify against ``filter_words_with_minimum_idf``.

    Args:
        entry: Record supporting item access; must provide
            ``question1_document`` and ``question2_document``. Modified
            in place.

    Returns:
        The same ``entry`` object with the six features set.
    """
    # Keys are spelled out (not formatted) so they stay exactly as the
    # downstream consumers expect, including the "7.5" / "12.5" forms.
    cutoff_features = (
        (1, 'unigram_idf_cutoff_similarity_1_feature'),
        (5, 'unigram_idf_cutoff_similarity_5_feature'),
        (7.5, 'unigram_idf_cutoff_similarity_7.5_feature'),
        (10, 'unigram_idf_cutoff_similarity_10_feature'),
        (12.5, 'unigram_idf_cutoff_similarity_12.5_feature'),
        (15, 'unigram_idf_cutoff_similarity_15_feature'),
    )
    for min_idf, feature_key in cutoff_features:
        first_words = filter_words_with_minimum_idf(
            entry['question1_document'], min_idf)
        second_words = filter_words_with_minimum_idf(
            entry['question2_document'], min_idf)
        entry[feature_key] = jaccard_index(first_words, second_words)
    return entry
Exemple #3
0
def object_sets_similarity(entry):
    """Attach an object-overlap feature to ``entry`` and return it.

    Runs ``get_objects`` on ``entry['question1_document']`` and
    ``entry['question2_document']``, compares the two results with
    ``jaccard_index`` and stores the score under
    ``entry['objects_similarity_feature']``.

    Args:
        entry: Record supporting item access; modified in place.

    Returns:
        The same ``entry`` object with the new feature set.
    """
    first_objects = get_objects(entry['question1_document'])
    second_objects = get_objects(entry['question2_document'])
    entry['objects_similarity_feature'] = jaccard_index(
        first_objects, second_objects)
    return entry
Exemple #4
0
def parse_heads_sets_similarity(entry):
    """Attach a parse-head-overlap feature to ``entry`` and return it.

    Runs ``get_heads`` on ``entry['question1_document']`` and
    ``entry['question2_document']``, compares the two results with
    ``jaccard_index`` and stores the score under
    ``entry['heads_similarity_feature']``.

    Args:
        entry: Record supporting item access; modified in place.

    Returns:
        The same ``entry`` object with the new feature set.
    """
    first_heads = get_heads(entry['question1_document'])
    second_heads = get_heads(entry['question2_document'])
    entry['heads_similarity_feature'] = jaccard_index(
        first_heads, second_heads)
    return entry
Exemple #5
0
def entity_sets_similarity(entry):
    """Attach an entity-overlap feature to ``entry`` and return it.

    Builds the set of ``.text`` values from the ``.ents`` of each question
    document, compares the two sets with ``jaccard_index`` and stores the
    score under ``entry['entities_similarity_feature']``.

    NOTE(review): ``.ents`` / ``.text`` suggest spaCy ``Doc`` objects and
    named-entity spans -- confirm against the caller.

    Args:
        entry: Record supporting item access; modified in place.

    Returns:
        The same ``entry`` object with the new feature set.
    """
    first_entities = {
        span.text for span in entry['question1_document'].ents
    }
    second_entities = {
        span.text for span in entry['question2_document'].ents
    }
    entry['entities_similarity_feature'] = jaccard_index(
        first_entities, second_entities)
    return entry
Exemple #6
0
def non_alphanumeric_sets_similarity(entry):
    """Attach a non-alphanumeric-character overlap feature to ``entry``.

    Passes ``entry['question1']`` and ``entry['question2']`` through
    ``get_non_alphanumeric_characters``, turns each result into a set,
    compares the sets with ``jaccard_index`` and stores the score under
    ``entry['non_alphanumeric_similarity_feature']``.

    Unlike the document-based features, this one reads the ``question1`` /
    ``question2`` fields rather than the ``*_document`` ones.

    Args:
        entry: Record supporting item access; modified in place.

    Returns:
        The same ``entry`` object with the new feature set.
    """
    first_symbols = set(get_non_alphanumeric_characters(entry['question1']))
    second_symbols = set(get_non_alphanumeric_characters(entry['question2']))
    entry['non_alphanumeric_similarity_feature'] = jaccard_index(
        first_symbols, second_symbols)
    return entry