def url_sets_similarity(entry):
    """Attach ``url_similarity_feature`` to ``entry`` and return ``entry``.

    For each of ``entry['question1_document']`` and
    ``entry['question2_document']`` the ``lemma`` attribute of every token
    whose ``like_url`` flag is truthy is gathered into a set.  The two sets
    are handed to ``jaccard_index`` and its result is stored under
    ``'url_similarity_feature'``.

    ``entry`` is modified in place; the same object is returned.
    """
    # NOTE(review): the documents look like spaCy ``Doc`` objects (tokens
    # exposing ``lemma`` / ``like_url``) -- confirm against the caller.
    first_url_lemmas = {
        token.lemma
        for token in entry['question1_document']
        if token.like_url
    }
    second_url_lemmas = {
        token.lemma
        for token in entry['question2_document']
        if token.like_url
    }

    entry['url_similarity_feature'] = jaccard_index(
        first_url_lemmas, second_url_lemmas)
    return entry
def unigram_idf_cutoff_similarity(entry):
    """Attach one similarity feature per IDF cutoff and return ``entry``.

    For each cutoff in 1, 5, 7.5, 10, 12.5 and 15, both question documents
    are passed through ``filter_words_with_minimum_idf`` with that cutoff,
    the two results are compared with ``jaccard_index``, and the score is
    stored under ``'unigram_idf_cutoff_similarity_<cutoff>_feature'``.

    ``entry`` is modified in place; the same object is returned.
    """
    # Each cutoff is paired with its literal output key so the key spelling
    # never depends on how a float happens to be formatted.
    cutoff_features = (
        (1, 'unigram_idf_cutoff_similarity_1_feature'),
        (5, 'unigram_idf_cutoff_similarity_5_feature'),
        (7.5, 'unigram_idf_cutoff_similarity_7.5_feature'),
        (10, 'unigram_idf_cutoff_similarity_10_feature'),
        (12.5, 'unigram_idf_cutoff_similarity_12.5_feature'),
        (15, 'unigram_idf_cutoff_similarity_15_feature'),
    )

    for minimum_idf, feature_key in cutoff_features:
        first_words = filter_words_with_minimum_idf(
            entry['question1_document'], minimum_idf)
        second_words = filter_words_with_minimum_idf(
            entry['question2_document'], minimum_idf)
        entry[feature_key] = jaccard_index(first_words, second_words)

    return entry
def object_sets_similarity(entry):
    """Attach ``objects_similarity_feature`` to ``entry`` and return ``entry``.

    ``get_objects`` is applied to ``entry['question1_document']`` and to
    ``entry['question2_document']``; the two results are compared with
    ``jaccard_index`` and the score is stored under
    ``'objects_similarity_feature'``.

    ``entry`` is modified in place; the same object is returned.
    """
    first_objects = get_objects(entry['question1_document'])
    second_objects = get_objects(entry['question2_document'])

    entry['objects_similarity_feature'] = jaccard_index(
        first_objects, second_objects)
    return entry
def parse_heads_sets_similarity(entry):
    """Attach ``heads_similarity_feature`` to ``entry`` and return ``entry``.

    ``get_heads`` is applied to ``entry['question1_document']`` and to
    ``entry['question2_document']``; the two results are compared with
    ``jaccard_index`` and the score is stored under
    ``'heads_similarity_feature'``.

    ``entry`` is modified in place; the same object is returned.
    """
    first_heads = get_heads(entry['question1_document'])
    second_heads = get_heads(entry['question2_document'])

    entry['heads_similarity_feature'] = jaccard_index(
        first_heads, second_heads)
    return entry
def entity_sets_similarity(entry):
    """Attach ``entities_similarity_feature`` to ``entry`` and return ``entry``.

    The ``text`` of every item in the ``ents`` attribute of each question
    document is gathered into a set.  The two sets are handed to
    ``jaccard_index`` and its result is stored under
    ``'entities_similarity_feature'``.

    ``entry`` is modified in place; the same object is returned.
    """
    # NOTE(review): ``.ents`` is presumably spaCy's named-entity spans --
    # confirm against the code that builds the documents.
    first_entity_texts = {
        span.text for span in entry['question1_document'].ents
    }
    second_entity_texts = {
        span.text for span in entry['question2_document'].ents
    }

    entry['entities_similarity_feature'] = jaccard_index(
        first_entity_texts, second_entity_texts)
    return entry
def non_alphanumeric_sets_similarity(entry):
    """Attach ``non_alphanumeric_similarity_feature`` and return ``entry``.

    ``get_non_alphanumeric_characters`` is applied to ``entry['question1']``
    and ``entry['question2']`` (not the ``*_document`` keys), each result is
    deduplicated with ``set``, and the two sets are compared with
    ``jaccard_index``.  The score is stored under
    ``'non_alphanumeric_similarity_feature'``.

    ``entry`` is modified in place; the same object is returned.
    """
    # NOTE(review): 'question1' / 'question2' are presumably the raw
    # question strings -- confirm against the caller.
    first_symbols = set(get_non_alphanumeric_characters(entry['question1']))
    second_symbols = set(get_non_alphanumeric_characters(entry['question2']))

    entry['non_alphanumeric_similarity_feature'] = jaccard_index(
        first_symbols, second_symbols)
    return entry