Example #1
def main():
    stops = stopwords()

    train_path = "../data/train.csv"
    test_path = "../data/test.csv"

    Fields = FieldClass()

    train_df = pd.read_csv(train_path, sep='\t')
    test_df = pd.read_csv(test_path, sep='\t')

    train_df['question1'] = train_df['question1'].map(
        lambda x: str(x).lower().split())
    train_df['question2'] = train_df['question2'].map(
        lambda x: str(x).lower().split())

    test_df['question1'] = test_df['question1'].map(
        lambda x: str(x).lower().split())
    test_df['question2'] = test_df['question2'].map(
        lambda x: str(x).lower().split())

    train_qs = pd.Series(train_df['question1'].tolist() +
                         train_df['question2'].tolist())

    words = [x for y in train_qs for x in y]
    counts = Counter(words)
    weights = {word: get_weight(count) for word, count in counts.items()}
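    # Note: get_weight is not defined in this snippet; it is assumed to follow
    # the inverse-frequency scheme used in the generate_h1 example below, roughly
    #   def get_weight(count, eps=10000, min_count=2):
    #       return 0 if count < min_count else 1 / (count + eps)
    # so words seen fewer than min_count times (likely typos) get weight 0 and
    # frequent words are smoothed by eps.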

    f = functools.partial(word_match_share, stops=stops)

    train_df['word_match'] = train_df.apply(f, axis=1, raw=True)
    test_df['word_match'] = test_df.apply(f, axis=1, raw=True)

    train_df['jaccard'] = train_df.apply(jaccard, axis=1, raw=True)
    test_df['jaccard'] = test_df.apply(jaccard, axis=1, raw=True)

    train_df['wc_diff'] = train_df.apply(wc_diff, axis=1, raw=True)
    test_df['wc_diff'] = test_df.apply(wc_diff, axis=1, raw=True)

    train_df['wc_diff_unique'] = train_df.apply(wc_diff_unique,
                                                axis=1,
                                                raw=True)
    test_df['wc_diff_unique'] = test_df.apply(wc_diff_unique, axis=1, raw=True)

    train_df['wc_ratio_unique'] = train_df.apply(wc_ratio_unique,
                                                 axis=1,
                                                 raw=True)
    test_df['wc_ratio_unique'] = test_df.apply(wc_ratio_unique,
                                               axis=1,
                                               raw=True)

    f = functools.partial(wc_diff_unique_stop, stops=stops)
    train_df['wc_diff_unq_stop'] = train_df.apply(f, axis=1, raw=True)
    test_df['wc_diff_unq_stop'] = test_df.apply(f, axis=1, raw=True)

    f = functools.partial(wc_ratio_unique_stop, stops=stops)
    train_df["wc_ratio_unique_stop"] = train_df.apply(f, axis=1, raw=True)
    test_df["wc_ratio_unique_stop"] = test_df.apply(f, axis=1, raw=True)

    train_df["same_start"] = train_df.apply(same_start_word, axis=1, raw=True)
    test_df["same_start"] = test_df.apply(same_start_word, axis=1, raw=True)

    train_df["char_diff"] = train_df.apply(char_diff, axis=1, raw=True)
    test_df["char_diff"] = test_df.apply(char_diff, axis=1, raw=True)

    f = functools.partial(char_diff_unique_stop, stops=stops)
    train_df["char_diff_unq_stop"] = train_df.apply(f, axis=1, raw=True)
    test_df["char_diff_unq_stop"] = test_df.apply(f, axis=1, raw=True)

    train_df["total_unique_words"] = train_df.apply(total_unique_words,
                                                    axis=1,
                                                    raw=True)
    test_df["total_unique_words"] = test_df.apply(total_unique_words,
                                                  axis=1,
                                                  raw=True)

    f = functools.partial(total_unq_words_stop, stops=stops)
    train_df["total_unq_words_stop"] = train_df.apply(f, axis=1, raw=True)
    test_df["total_unq_words_stop"] = test_df.apply(f, axis=1, raw=True)

    train_df["char_ratio"] = train_df.apply(char_ratio, axis=1, raw=True)
    test_df["char_ratio"] = test_df.apply(char_ratio, axis=1, raw=True)

    f = functools.partial(tfidf_word_match_share, weights=weights)
    train_df["tfidf_wm"] = train_df.apply(f, axis=1, raw=True)
    test_df["tfidf_wm"] = test_df.apply(f, axis=1, raw=True)

    f = functools.partial(tfidf_word_match_share_stops,
                          stops=stops,
                          weights=weights)
    train_df["tfidf_wm_stops"] = train_df.apply(f, axis=1, raw=True)
    test_df["tfidf_wm_stops"] = test_df.apply(f, axis=1, raw=True)

    # counter
    compute_counters(train_df, test_df)

    # distance

    train_df["levenstein1"] = train_df.apply(
        lambda r: levenshtein1(r[Fields.question1], r[Fields.question2]),
        axis=1)
    test_df[Fields.levenstein1] = test_df.apply(
        lambda r: levenshtein1(r[Fields.question1], r[Fields.question2]),
        axis=1)
    train_df[Fields.levenstein2] = train_df.apply(
        lambda r: levenshtein2(r[Fields.question1], r[Fields.question2]),
        axis=1)
    test_df[Fields.levenstein2] = test_df.apply(
        lambda r: levenshtein2(r[Fields.question1], r[Fields.question2]),
        axis=1)
    train_df[Fields.sorensen] = train_df.apply(
        lambda r: sorencen(r[Fields.question1], r[Fields.question2]), axis=1)
    test_df[Fields.sorensen] = test_df.apply(
        lambda r: sorencen(r[Fields.question1], r[Fields.question2]), axis=1)
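    # sorencen (note the helper's spelling) is assumed to compute the
    # Sorensen-Dice distance between the two token lists, mirroring the
    # Levenshtein helpers above.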

    # fuzzy
    train_df[Fields.qratio] = train_df.apply(lambda row: fuzz.QRatio(
        str(row[Fields.question1]), str(row[Fields.question2])),
                                             axis=1)
    test_df[Fields.qratio] = test_df.apply(lambda row: fuzz.QRatio(
        str(row[Fields.question1]), str(row[Fields.question2])),
                                           axis=1)
    quality_qratio = compute_quality(train_df, Fields.qratio)

    train_df[Fields.wratio] = train_df.apply(lambda row: fuzz.WRatio(
        str(row[Fields.question1]), str(row[Fields.question2])),
                                             axis=1)
    test_df[Fields.wratio] = test_df.apply(lambda row: fuzz.WRatio(
        str(row[Fields.question1]), str(row[Fields.question2])),
                                           axis=1)
    quality_wratio = compute_quality(train_df, Fields.wratio)

    train_df[Fields.partial_ratio] = train_df.apply(
        lambda row: fuzz.partial_ratio(str(row[Fields.question1]),
                                       str(row[Fields.question2])),
        axis=1)
    test_df[Fields.partial_ratio] = test_df.apply(
        lambda row: fuzz.partial_ratio(str(row[Fields.question1]),
                                       str(row[Fields.question2])),
        axis=1)
    quality_partial_ratio = compute_quality(train_df, Fields.partial_ratio)

    train_df[Fields.partial_token_set_ratio] = train_df.apply(
        lambda row: fuzz.partial_token_set_ratio(str(row[Fields.question1]),
                                                 str(row[Fields.question2])),
        axis=1)
    test_df[Fields.partial_token_set_ratio] = test_df.apply(
        lambda row: fuzz.partial_token_set_ratio(str(row[Fields.question1]),
                                                 str(row[Fields.question2])),
        axis=1)
    quality_partial_token_set_ratio = compute_quality(
        train_df, Fields.partial_token_set_ratio)

    train_df[Fields.partial_token_sort_ratio] = train_df.apply(
        lambda row: fuzz.partial_token_sort_ratio(str(row[Fields.question1]),
                                                  str(row[Fields.question2])),
        axis=1)
    test_df[Fields.partial_token_sort_ratio] = test_df.apply(
        lambda row: fuzz.partial_token_sort_ratio(str(row[Fields.question1]),
                                                  str(row[Fields.question2])),
        axis=1)
    quality_partial_token_sort_ratio = compute_quality(
        train_df, Fields.partial_token_sort_ratio)

    train_df[Fields.token_set_ratio] = train_df.apply(
        lambda row: fuzz.token_set_ratio(str(row[Fields.question1]),
                                         str(row[Fields.question2])),
        axis=1)
    test_df[Fields.token_set_ratio] = test_df.apply(
        lambda row: fuzz.token_set_ratio(str(row[Fields.question1]),
                                         str(row[Fields.question2])),
        axis=1)
    quality_token_set_ratio = compute_quality(train_df, Fields.token_set_ratio)

    train_df[Fields.token_sort_ratio] = train_df.apply(
        lambda row: fuzz.token_sort_ratio(str(row[Fields.question1]),
                                          str(row[Fields.question2])),
        axis=1)
    test_df[Fields.token_sort_ratio] = test_df.apply(
        lambda row: fuzz.token_sort_ratio(str(row[Fields.question1]),
                                          str(row[Fields.question2])),
        axis=1)
    quality_token_sort_ratio = compute_quality(train_df,
                                               Fields.token_sort_ratio)

    train_df.to_csv(train_path, sep='\t', index=False)
    test_df.to_csv(test_path, sep='\t', index=False)
Example #2
data['len_q1'] = data.question1.apply(lambda x: len(str(x)))
data['len_q2'] = data.question2.apply(lambda x: len(str(x)))
data['diff_len'] = data.len_q1 - data.len_q2
data['len_char_q1'] = data.question1.apply(
    lambda x: len(''.join(set(str(x).replace(' ', '')))))
data['len_char_q2'] = data.question2.apply(
    lambda x: len(''.join(set(str(x).replace(' ', '')))))
data['len_word_q1'] = data.question1.apply(lambda x: len(str(x).split()))
data['len_word_q2'] = data.question2.apply(lambda x: len(str(x).split()))
data['common_words'] = data.apply(lambda x: len(
    set(str(x['question1']).lower().split()).intersection(
        set(str(x['question2']).lower().split()))),
                                  axis=1)
data['fuzz_qratio'] = data.apply(
    lambda x: fuzz.QRatio(str(x['question1']), str(x['question2'])), axis=1)
data['fuzz_WRatio'] = data.apply(
    lambda x: fuzz.WRatio(str(x['question1']), str(x['question2'])), axis=1)
data['fuzz_partial_ratio'] = data.apply(
    lambda x: fuzz.partial_ratio(str(x['question1']), str(x['question2'])),
    axis=1)
data['fuzz_partial_token_set_ratio'] = data.apply(
    lambda x: fuzz.partial_token_set_ratio(str(x['question1']),
                                           str(x['question2'])),
    axis=1)
data['fuzz_partial_token_sort_ratio'] = data.apply(
    lambda x: fuzz.partial_token_sort_ratio(str(x['question1']),
                                            str(x['question2'])),
    axis=1)
data['fuzz_token_set_ratio'] = data.apply(
    lambda x: fuzz.token_set_ratio(str(x['question1']), str(x['question2'])),
    axis=1)
Example #3
    def testQuickRatioEqual(self):
        self.assertEqual(fuzz.QRatio(self.s1, self.s1a), 100)
Example #4
def build_features(data, stops):
    X = pd.DataFrame()

    log.info('Calculate tfidf')
    qs = pd.Series(data['question1'].tolist() + data['question2'].tolist())
    st = time.time()
    weights = calculate_tfidf(qs)
    log.info('...time for cal tfidf: %.2f m' % ((time.time() - st) / 60))
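    # calculate_tfidf is assumed to return a {word: weight} mapping built from
    # the pooled question corpus, analogous to the Counter/get_weight scheme
    # used in the other examples in this document.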
    del qs

    log.info('Building features')
    X['len_q1'] = data.question1.apply(word_len)  # 1:Length of Q1 str
    X['len_q2'] = data.question2.apply(word_len)  # 2:Length of Q2 str
    X['len_diff'] = abs(X.len_q1 -
                        X.len_q2)  # 3:Length difference between Q1 and Q2

    log.info('Building char features')
    X['len_char_q1'] = data.q1_split.apply(
        word_len_char)  # 4:Char length of Q1
    X['len_char_q2'] = data.q2_split.apply(
        word_len_char)  # 5:Char length of Q2
    X['len_char_diff'] = data.apply(
        len_char_diff, axis=1,
        raw=True)  # 6:Char length difference between Q1 and Q2
    X['char_diff_unq_stop'] = data.apply(char_diff_unique_stop,
                                         stops=stops,
                                         axis=1,
                                         raw=True)  # 7: set(6)
    X['char_ratio'] = data.apply(char_ratio, axis=1,
                                 raw=True)  # 8:Char length Q1 / char length Q2

    log.info('Building word count features')
    X['word_count_q1'] = data.q1_split.apply(word_count)  # 9:Word count of Q1
    X['word_count_q2'] = data.q2_split.apply(word_count)  # 10:Word count of Q2
    X['word_count_diff'] = data.apply(
        wc_diff, axis=1,
        raw=True)  # 11:Word count difference between  Q1 and Q2
    X['word_count_ratio'] = data.apply(
        wc_ratio, axis=1, raw=True)  # 12:Word count Q1 / word count Q2

    X['total_unique_words'] = data.apply(
        total_unique_words, axis=1, raw=True)  # 13:Word count set(Q1 + Q2)
    X['wc_diff_unique'] = data.apply(
        wc_diff_unique, axis=1,
        raw=True)  # 14:Word count set(Q1) - word count set(Q2)
    X['wc_ratio_unique'] = data.apply(
        wc_ratio_unique, axis=1,
        raw=True)  # 15:Word count set(Q1) / word count set(Q2)

    X['total_unq_words_stop'] = data.apply(total_unq_words_stop,
                                           stops=stops,
                                           axis=1,
                                           raw=True)  # 16: 13 - stop words
    X['wc_diff_unique_stop'] = data.apply(wc_diff_unique_stop,
                                          stops=stops,
                                          axis=1,
                                          raw=True)  # 17: 14 - stop words
    X['wc_ratio_unique_stop'] = data.apply(wc_ratio_unique_stop,
                                           stops=stops,
                                           axis=1,
                                           raw=True)  # 18: 15 - stop words

    log.info('Building mark features')
    X['same_start'] = data.apply(same_start_word, axis=1,
                                 raw=True)  # 19 same start = 1 else = 0
    X['same_end'] = data.apply(same_end_word, axis=1,
                               raw=True)  # 20 same end = 1 else = 0

    X['num_capital_q1'] = data.question1.apply(num_capital)  # 21
    X['num_capital_q2'] = data.question2.apply(num_capital)  # 22
    X['num_capital_diff'] = abs(X.num_capital_q1 - X.num_capital_q2)  # 23

    X['num_ques_mark_q1'] = data.question1.apply(num_ques_mark)  # 24
    X['num_ques_mark_q2'] = data.question2.apply(num_ques_mark)  # 25
    X['num_ques_mark_diff'] = abs(X.num_ques_mark_q1 -
                                  X.num_ques_mark_q2)  # 26

    log.info('Building another features')
    # 27 ~ 27+28(14*2)-1=54: First word in sentence(one hot)
    for start in common_start:
        X['start_%s_%s' % (start, 'q1')] = data.q1_split.apply(start_with,
                                                               args=(start, ))
    for start in common_start:  # run separately so the csv looks nicer (closer to one hot)
        X['start_%s_%s' % (start, 'q2')] = data.q2_split.apply(start_with,
                                                               args=(start, ))

    X['common_words'] = data.apply(common_words, axis=1,
                                   raw=True)  # 55: number of words the two questions share
    X['common_words_unique'] = data.apply(common_words_unit, axis=1,
                                          raw=True)  # 56: number of letters the two questions share

    X['word_match'] = data.apply(word_match_share, axis=1,
                                 raw=True)  # 57: word overlap ratio between Q1 and Q2
    X['word_match_stops'] = data.apply(
        word_match_share_stops, stops=stops, axis=1,
        raw=True)  # 58: word overlap ratio between Q1 and Q2, excluding stop words
    X['tfidf_wm'] = data.apply(
        tfidf_word_match_share, weights=weights, axis=1,
        raw=True)  # 59: word overlap ratio between Q1 and Q2 (TF-IDF weighted)
    X['tfidf_wm_stops'] = data.apply(
        tfidf_word_match_share_stops,
        stops=stops,
        weights=weights,
        axis=1,
        raw=True)  # 60: word overlap ratio between Q1 and Q2, excluding stop words (TF-IDF weighted)

    log.info('Building fuzzy features')
    # 61~67:Build fuzzy features
    X['fuzz_qratio'] = data.apply(
        lambda x: fuzz.QRatio(str(x['question1']), str(x['question2'])),
        axis=1)
    X['fuzz_WRatio'] = data.apply(
        lambda x: fuzz.WRatio(str(x['question1']), str(x['question2'])),
        axis=1)
    X['fuzz_partial_ratio'] = data.apply(
        lambda x: fuzz.partial_ratio(str(x['question1']), str(x['question2'])),
        axis=1)
    X['fuzz_partial_token_set_ratio'] = data.apply(
        lambda x: fuzz.partial_token_set_ratio(str(x['question1']),
                                               str(x['question2'])),
        axis=1)
    X['fuzz_partial_token_sort_ratio'] = data.apply(
        lambda x: fuzz.partial_token_sort_ratio(str(x['question1']),
                                                str(x['question2'])),
        axis=1)
    X['fuzz_token_set_ratio'] = data.apply(lambda x: fuzz.token_set_ratio(
        str(x['question1']), str(x['question2'])),
                                           axis=1)
    X['fuzz_token_sort_ratio'] = data.apply(lambda x: fuzz.token_sort_ratio(
        str(x['question1']), str(x['question2'])),
                                            axis=1)

    X['jaccard'] = data.apply(jaccard, axis=1, raw=True)  # 68:jaccard distance

    log.info('Build word2vec/glove distance features')
    # Build word2vec/glove distance features
    X['wmd'] = data.apply(lambda x: wmd(x['q1_split'], x['q2_split']),
                          axis=1)  # 69
    X['norm_wmd'] = data.apply(
        lambda x: norm_wmd(x['q1_split'], x['q2_split']), axis=1)  # 70
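    # wmd / norm_wmd are assumed to wrap gensim's KeyedVectors.wmdistance
    # (Word Mover's Distance); norm_wmd presumably uses L2-normalised word
    # vectors, so the two features differ only in vector scaling.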

    question1_vectors = np.zeros((data.shape[0], 300))

    log.info('Sent2Vec')
    # Sent2Vec
    for i, q in tqdm(enumerate(data.question1.values)):
        question1_vectors[i, :] = sent2vec(q)

    question2_vectors = np.zeros((data.shape[0], 300))
    for i, q in tqdm(enumerate(data.question2.values)):
        question2_vectors[i, :] = sent2vec(q)
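    # sent2vec is assumed to average the 300-d embeddings of the in-vocabulary
    # tokens of a sentence, e.g. something like
    #   vecs = [model[w] for w in str(q).lower().split() if w in model]
    #   return np.mean(vecs, axis=0) if vecs else np.zeros(300)
    # and the np.nan_to_num calls below guard against degenerate sentences.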

    log.info('Building distance features')
    # Build distance features
    X['cosine_distance'] = [
        cosine(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                       np.nan_to_num(question2_vectors))
    ]
    X['cityblock_distance'] = [
        cityblock(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                          np.nan_to_num(question2_vectors))
    ]
    X['canberra_distance'] = [
        canberra(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                         np.nan_to_num(question2_vectors))
    ]
    X['minkowski_distance'] = [
        minkowski(x, y, 3) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                             np.nan_to_num(question2_vectors))
    ]
    X['braycurtis_distance'] = [
        braycurtis(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                           np.nan_to_num(question2_vectors))
    ]

    X['skew_q1vec'] = [skew(x) for x in np.nan_to_num(question1_vectors)]
    X['skew_q2vec'] = [skew(x) for x in np.nan_to_num(question2_vectors)]
    X['kur_q1vec'] = [kurtosis(x) for x in np.nan_to_num(question1_vectors)]
    X['kur_q2vec'] = [kurtosis(x)
                      for x in np.nan_to_num(question2_vectors)]  # 79

    # LDA features
    topics_q1 = data.question1.apply(
        lambda x: dict(lda_model[dictionary.doc2bow(clean_doc(x))]))
    for idx in range(num_topics):
        X['lda_topic_%s_%s' %
          (idx, 'q1')] = topics_q1.apply(lambda x: x.get(idx, 0))
    del topics_q1
    topics_q2 = data.question2.apply(
        lambda x: dict(lda_model[dictionary.doc2bow(clean_doc(x))]))
    for idx in range(num_topics):
        X['lda_topic_%s_%s' %
          (idx, 'q2')] = topics_q2.apply(lambda x: x.get(idx, 0))
    del topics_q2

    # LSI features
    topics_q1 = data.question1.apply(
        lambda x: dict(lsi_model[dictionary.doc2bow(clean_doc(x))]))
    for idx in range(num_topics):
        X['lsi_topic_%s_%s' %
          (idx, 'q1')] = topics_q1.apply(lambda x: x.get(idx, 0))
    del topics_q1
    topics_q2 = data.question2.apply(
        lambda x: dict(lsi_model[dictionary.doc2bow(clean_doc(x))]))
    for idx in range(num_topics):
        X['lsi_topic_%s_%s' %
          (idx, 'q2')] = topics_q2.apply(lambda x: x.get(idx, 0))
    del topics_q2

    return X
Example #5
data['text2_lower'] = data['question2'].apply(lambda x: x.lower())
data['common_noun_cnt'] = [
    count_common_nouns(nltk.word_tokenize(lem.lemmatize(x[0], "v")),
                       nltk.word_tokenize(lem.lemmatize(x[1], "v")), nouns)
    for x in data[['question1', 'question2']].values
]


# Initial Features


data['len_q1'] = data.question1.apply(lambda x: len(str(x)))
data['len_q2'] = data.question2.apply(lambda x: len(str(x)))
data['diff_len'] = data.len_q1 - data.len_q2
data['len_char_q1'] = data.question1.apply(lambda x: len(''.join(set(str(x).replace(' ', '')))))
data['len_char_q2'] = data.question2.apply(lambda x: len(''.join(set(str(x).replace(' ', '')))))
data['len_word_q1'] = data.question1.apply(lambda x: len(str(x).split()))
data['len_word_q2'] = data.question2.apply(lambda x: len(str(x).split()))
data['common_words'] = data.apply(lambda x: len(set(str(x['question1']).lower().split()).intersection(set(str(x['question2']).lower().split()))), axis=1)
data['fuzz_qratio'] = data.apply(lambda x: fuzz.QRatio(str(x['question1']), str(x['question2'])), axis=1)
data['fuzz_WRatio'] = data.apply(lambda x: fuzz.WRatio(str(x['question1']), str(x['question2'])), axis=1)
data['fuzz_partial_ratio'] = data.apply(lambda x: fuzz.partial_ratio(str(x['question1']), str(x['question2'])), axis=1)
data['fuzz_partial_token_set_ratio'] = data.apply(lambda x: fuzz.partial_token_set_ratio(str(x['question1']), str(x['question2'])), axis=1)
data['fuzz_partial_token_sort_ratio'] = data.apply(lambda x: fuzz.partial_token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1)
data['fuzz_token_set_ratio'] = data.apply(lambda x: fuzz.token_set_ratio(str(x['question1']), str(x['question2'])), axis=1)
data['fuzz_token_sort_ratio'] = data.apply(lambda x: fuzz.token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1)


data['wmd'] = data.apply(lambda x: wmd(x['question1'], x['question2']), axis=1)


data['norm_wmd'] = data.apply(lambda x: norm_wmd(x['question1'], x['question2']), axis=1)

question1_vectors = np.zeros((data.shape[0], 300))
error_count = 0
Example #6
def predict():
    #question1 = 'What practical applications might evolve from the discovery of the Higgs Boson ?'
    #question2 = 'What are some practical benefits of discovery of the Higgs Boson ?'
    question1 = request.form["question1"]
    question2 = request.form["question2"]

    diff_len = len(str(question1)) - len(str(question2))
    common_words = len(
        set(str(question1).lower().split()).intersection(
            set(str(question2).lower().split())))
    fuzz_qratio = fuzz.QRatio(str(question1), str(question2))
    fuzz_WRatio = fuzz.WRatio(str(question1), str(question2))
    fuzz_partial_ratio = fuzz.partial_ratio(str(question1), str(question2))
    fuzz_partial_token_set_ratio = fuzz.partial_token_set_ratio(
        str(question1), str(question2))
    fuzz_partial_token_sort_ratio = fuzz.partial_token_sort_ratio(
        str(question1), str(question2))
    fuzz_token_set_ratio = fuzz.token_set_ratio(str(question1), str(question2))
    fuzz_token_sort_ratio = fuzz.token_sort_ratio(str(question1),
                                                  str(question2))
    wmd = wordmoverdistance(question1, question2)

    question1_vectors = sent2vec(question1)
    question2_vectors = sent2vec(question2)

    cosine_distance = cosine(question1_vectors, question2_vectors)
    cityblock_distance = cityblock(question1_vectors, question2_vectors)
    canberra_distance = canberra(question1_vectors, question2_vectors)
    euclidean_distance = euclidean(question1_vectors, question2_vectors)
    minkowski_distance = minkowski(question1_vectors, question2_vectors, 3)
    braycurtis_distance = braycurtis(question1_vectors, question2_vectors)

    X = np.array([
        diff_len, common_words, fuzz_qratio, fuzz_WRatio, fuzz_partial_ratio,
        fuzz_partial_token_set_ratio, fuzz_partial_token_sort_ratio,
        fuzz_token_set_ratio, fuzz_token_sort_ratio, wmd, cosine_distance,
        cityblock_distance, canberra_distance, euclidean_distance,
        minkowski_distance, braycurtis_distance
    ])

    X_to_render = X

    X = X_Scaler.transform(X.reshape(1, -1))

    classifier = pickle.load(open('models/ANN.model', 'rb'))

    y_ann_pred = classifier.predict(X)
    y_random_forest_pred = randomForest.predict(X)
    y_logistic_pred = logistic.predict(X)
    y_knn_pred = knn.predict(X)

    print(y_random_forest_pred)
    print(y_logistic_pred)
    print(y_knn_pred)
    # return render_template("result.html", X_to_render = X_to_render, y_random_forest_pred = y_random_forest_pred, y_logistic_pred = y_logistic_pred, y_knn_pred = y_knn_pred)
    return render_template("result.html",
                           X_to_render=X_to_render,
                           y_random_forest_pred=y_random_forest_pred,
                           y_logistic_pred=y_logistic_pred,
                           y_knn_pred=y_knn_pred,
                           y_ann_pred=y_ann_pred[0])
Example #7
    def preprocess(df):
        df_features = pd.DataFrame(index=df.index)
        df_intermediate = pd.DataFrame(index=df.index)

        print("--> Compute tokens...")
        df_intermediate["clean_a"] = df.text_a_text.apply(lambda x: clean(x))
        df_intermediate["clean_b"] = df.text_b_text.apply(lambda x: clean(x))

        df_intermediate["words_a"] = df_intermediate.apply(
            lambda row: row.clean_a.split(" "), axis=1)
        df_intermediate["words_b"] = df_intermediate.apply(
            lambda row: row.clean_b.split(" "), axis=1)

        df_intermediate["words_clean_a"] = df_intermediate.apply(
            lambda row: set([w for w in row.words_a if w not in en_stop]),
            axis=1)
        df_intermediate["words_clean_b"] = df_intermediate.apply(
            lambda row: set([w for w in row.words_b if w not in en_stop]),
            axis=1)

        df_intermediate["stop_a"] = df_intermediate.apply(
            lambda row: set([w for w in row.words_a if w in en_stop]), axis=1)
        df_intermediate["stop_b"] = df_intermediate.apply(
            lambda row: set([w for w in row.words_b if w in en_stop]), axis=1)

        print("--> Compute common words features...")
        df_intermediate["common_stop_words"] = df_intermediate.apply(
            lambda row: row.stop_a.intersection(row.stop_b), axis=1)
        df_intermediate["common_words"] = df_intermediate.apply(
            lambda row: set(row.words_a).intersection(set(row.words_b)),
            axis=1)
        df_intermediate["common_clean_words"] = df_intermediate.apply(
            lambda row: row.words_clean_a.intersection(row.words_clean_b),
            axis=1)

        df_intermediate[
            "common_stop_words_cnt"] = df_intermediate.common_stop_words.apply(
                lambda x: len(x))
        df_intermediate[
            "common_words_cnt"] = df_intermediate.common_words.apply(
                lambda x: len(x))
        df_intermediate[
            "common_clean_words_cnt"] = df_intermediate.common_clean_words.apply(
                lambda x: len(x))

        df_features["common_stop_words_ratio_min"] = df_intermediate.apply(
            lambda x: x.common_stop_words_cnt /
            (min(len(x["stop_a"]), len(x["stop_b"])) + 0.0001),
            axis=1)
        df_features["common_words_ratio_min"] = df_intermediate.apply(
            lambda x: x.common_words_cnt /
            (min(len(x["words_a"]), len(x["words_b"])) + 0.0001),
            axis=1)
        df_features["common_clean_words_ratio_min"] = df_intermediate.apply(
            lambda x: x.common_clean_words_cnt /
            (min(len(x["words_clean_a"]), len(x["words_clean_b"])) + 0.0001),
            axis=1)

        df_features["common_stop_words_ratio_max"] = df_intermediate.apply(
            lambda x: x.common_stop_words_cnt /
            (max(len(x["stop_a"]), len(x["stop_b"])) + 0.0001),
            axis=1)
        df_features["common_words_ratio_max"] = df_intermediate.apply(
            lambda x: x.common_words_cnt /
            (max(len(x["words_a"]), len(x["words_b"])) + 0.0001),
            axis=1)
        df_features["common_clean_words_ratio_max"] = df_intermediate.apply(
            lambda x: x.common_clean_words_cnt /
            (max(len(x["words_clean_a"]), len(x["words_clean_b"])) + 0.0001),
            axis=1)

        print("--> Compute general NLP features...")
        df_features["same_last_token"] = df_intermediate.apply(
            lambda x: int(x.words_a[-1] == x.words_b[-1]), axis=1)
        df_features["same_first_token"] = df_intermediate.apply(
            lambda x: int(x.words_a[0] == x.words_b[0]), axis=1)

        df_features["length_diff"] = df_intermediate.apply(
            lambda x: abs(len(x.words_a) - len(x.words_b)), axis=1)
        df_features["avg_length"] = df_intermediate.apply(
            lambda x: (len(x.words_a) + len(x.words_b)) / 2, axis=1)

        # Number of capital letters feature
        df_intermediate["a_n_capital"] = n_capital_letters(df["text_a_text"])
        df_intermediate["b_n_capital"] = n_capital_letters(df["text_b_text"])
        df_features["max_n_capital"] = df_intermediate[[
            "a_n_capital", "b_n_capital"
        ]].max(axis=1)
        df_features["min_n_capital"] = df_intermediate[[
            "a_n_capital", "b_n_capital"
        ]].min(axis=1)
        df_features["n_capital_diff"] = np.abs(df_intermediate["a_n_capital"] -
                                               df_intermediate["b_n_capital"])

        # Number related features
        df_intermediate["a_has_number"] = df.text_a_text.apply(
            lambda x: has_number(x))
        df_intermediate["b_has_number"] = df.text_b_text.apply(
            lambda x: has_number(x))
        df_features["max_has_number"] = df_intermediate[[
            "a_has_number", "b_has_number"
        ]].max(axis=1)
        df_features["min_has_number"] = df_intermediate[[
            "a_has_number", "b_has_number"
        ]].min(axis=1)

        # Adopted from https://github.com/abhishekkrthakur/is_that_a_duplicate_quora_question
        print("--> Compute fuzzy features...")
        df_features['fuzz_qratio'] = df.apply(lambda x: fuzz.QRatio(
            str(x["text_a_text"]), str(x["text_b_text"])),
                                              axis=1)
        df_features['fuzz_WRatio'] = df.apply(lambda x: fuzz.WRatio(
            str(x["text_a_text"]), str(x["text_b_text"])),
                                              axis=1)
        df_features['fuzz_partial_ratio'] = df.apply(
            lambda x: fuzz.partial_ratio(str(x["text_a_text"]),
                                         str(x["text_b_text"])),
            axis=1)
        df_features['fuzz_partial_token_set_ratio'] = df.apply(
            lambda x: fuzz.partial_token_set_ratio(str(x["text_a_text"]),
                                                   str(x["text_b_text"])),
            axis=1)
        df_features['fuzz_partial_token_sort_ratio'] = df.apply(
            lambda x: fuzz.partial_token_sort_ratio(str(x["text_a_text"]),
                                                    str(x["text_b_text"])),
            axis=1)
        df_features['fuzz_token_set_ratio'] = df.apply(
            lambda x: fuzz.token_set_ratio(str(x["text_a_text"]),
                                           str(x["text_b_text"])),
            axis=1)
        df_features['fuzz_token_sort_ratio'] = df.apply(
            lambda x: fuzz.token_sort_ratio(str(x["text_a_text"]),
                                            str(x["text_b_text"])),
            axis=1)

        print("--> Compute longest substring...")
        df_features["longest_substring_ratio"] = df.apply(
            lambda x: get_longest_substring_ratio(x["text_a_text"], x[
                "text_b_text"]),
            axis=1)

        return df_features
Example #8
def predict():

        q1 = request.form['q1']
        q2 = request.form['q2']

        inference_point['freq_qid1']=train_org[train_org['question1']==q1].shape[0]
        inference_point['freq_qid2']=train_org[train_org['question2']==q2].shape[0]

        inference_point['q1len']=len(q1)
        inference_point['q2len']=len(q2)
        inference_point['q1_n_words']=len(q1.split(" "))
        inference_point['q2_n_words']=len(q2.split(" "))


        w1 = set(map(lambda word: word.lower().strip(), q1.split(" ")))
        w2 = set(map(lambda word: word.lower().strip(), q2.split(" ")))

        inference_point['word_Common'] =1.0 * len(w1 & w2)
        inference_point['word_Total']= 1.0 * (len(w1) + len(w2))
        inference_point['word_share']= 1.0 * len(w1 & w2)/(len(w1) + len(w2))
        inference_point['freq_q1+q2'] = inference_point['freq_qid1']+inference_point['freq_qid2']
        inference_point['freq_q1-q2'] = inference_point['freq_qid1']-inference_point['freq_qid2']



        q1=preprocess(q1)
        q2=preprocess(q2)

        token_features=get_token_features(q1,q2)

        inference_point["cwc_min"]      =  token_features[0]
        inference_point["cwc_max"]      =  token_features[1]
        inference_point["csc_min"]      =  token_features[2]
        inference_point["csc_max"]       = token_features[3]
        inference_point["ctc_min"]       = token_features[4]
        inference_point["ctc_max"]       = token_features[5]
        inference_point["last_word_eq"]  = token_features[6]
        inference_point["first_word_eq"] = token_features[7]
        inference_point["abs_len_diff"]  = token_features[8]
        inference_point["mean_len"]     =  token_features[9]

        inference_point['longest_substr_ratio']=len(list(lcs(q1,q2))[0])
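        # lcs is an external helper assumed to yield the longest common
        # substring(s) of q1 and q2; note that the value stored here is the raw
        # length of that substring, not a length ratio.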

        inference_point['token_set_ratio'] =fuzz.token_set_ratio(q1,q2)
        inference_point['token_sort_ratio'] =fuzz.token_sort_ratio(q1,q2)
        inference_point['fuzz_ratio'] =fuzz.QRatio(q1,q2)
        inference_point['fuzz_partial_ratio'] =fuzz.partial_ratio(q1,q2)




        q1_vec=tfidf_w2v(q1)
        q2_vec=tfidf_w2v(q2)

        for i in range(len(q1_vec)):
            inference_point[str(i)+'_x']=q1_vec[i]
            inference_point[str(i)+'_y']=q2_vec[i]

        inference_point['fuzz_ratio']
        len(inference_point)

# cols
        X=pd.DataFrame(inference_point,index=[0])
        X=X[cols]

        x = xgb.DMatrix(X)
        pred=bst.predict(x)
        if pred > 0.5:
            return render_template("results.html", sim='Similar', score=pred)
        else:
            return render_template("results.html", sim='Dissimilar', score=pred)
Example #9
train = train[['label', 'words_x', 'words_y']]
train.columns = ['label', 'words1', 'words2']
len_train = train.shape[0]

test = pd.merge(test, question, left_on=['q1'], right_on=['qid'], how='left')
test = pd.merge(test, question, left_on=['q2'], right_on=['qid'], how='left')
test = test[['words_x', 'words_y']]
test.columns = ['words1', 'words2']

df_feat = pd.DataFrame()
df_data = pd.concat([train, test])

# output the similarity scores as features
# https://blog.csdn.net/sunyao_123/article/details/76942809
df_feat['fuzz_words_qratio'] = df_data.apply(
    lambda row: fuzz.QRatio(str(row['words1']), str(row['words2'])), axis=1)
df_feat['fuzz_words_WRatio'] = df_data.apply(
    lambda row: fuzz.WRatio(str(row['words1']), str(row['words2'])), axis=1)
df_feat['fuzz_words_partial_ratio'] = df_data.apply(
    lambda row: fuzz.partial_ratio(str(row['words1']), str(row['words2'])),
    axis=1)
df_feat['fuzz_words_partial_token_set_ratio'] = df_data.apply(
    lambda row: fuzz.partial_token_set_ratio(str(row['words1']),
                                             str(row['words2'])),
    axis=1)
df_feat['fuzz_words_partial_token_sort_ratio'] = df_data.apply(
    lambda row: fuzz.partial_token_sort_ratio(str(row['words1']),
                                              str(row['words2'])),
    axis=1)
df_feat['fuzz_words_token_set_ratio'] = df_data.apply(
    lambda row: fuzz.token_set_ratio(str(row['words1']), str(row['words2'])),
    axis=1)
Example #10
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from fuzzywuzzy import string_processing
a = 'my india'
b = ['Cirque du Soleil-Zarkana', 'this is my india', 'I love my india']
print(fuzz.QRatio(a, b))
print(process.extractOne(a, b, scorer=fuzz.ratio))
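# process.extractOne scans the choices in b and returns the best
# (choice, score) pair, e.g. a tuple such as ('I love my india', <score>),
# whereas fuzz.ratio and fuzz.QRatio score a single pair of strings.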
Example #11
for projectIndex in range(len(projectList["scratchpads"])):
    url = projectList["scratchpads"][projectIndex]["url"]
    id = url.split("/")[-1]

    print("Checking against project " + id)

    # Get the projects data
    project = requests.get("https://www.khanacademy.org/api/labs/scratchpads/" + id)
    projectCode = project.json()["revision"]["code"]

    # Compare the code
    a = fuzz.ratio(originalCode, projectCode)
    b = fuzz.partial_ratio(originalCode, projectCode)
    c = fuzz.token_sort_ratio(originalCode, projectCode)
    d = fuzz.partial_token_sort_ratio(originalCode, projectCode)
    e = fuzz.QRatio(originalCode, projectCode)

    data = [a, b, c, d, e]

    # Process the data and output it
    outputHTML = outputHTML + "<tr><td><a href=\"" + url + "\" target=\"_blank\"><p>" + id + "</p></a></td>"
    for value in data:
        color = ""
        if value < 58:
            color = "#7ffe00"
        elif value >= 58 and value < 75:
            color = "#ffff00"
        elif value >= 75 and value < 87:
            color = "#fe7f00"
        elif value >= 87:
            color = "#fe007f"
Example #12
def read_data():

    data = pd.read_csv("data/test.csv")
    #var = str( data.loc[data["id"]==53,"question1"])
    #print var#cleanser(var)

    print "lock and load"

    #augmenting data with basic features
    data['len_q1'] = data.question1.apply(lambda x: len(str(x)))
    data['len_q2'] = data.question2.apply(lambda x: len(str(x)))
    data['diff_len'] = data.len_q1 - data.len_q2
    data['len_char_q1'] = data.question1.apply(
        lambda x: len("".join(str(x).split())))
    data['len_char_q2'] = data.question2.apply(
        lambda x: len("".join(str(x).split())))
    data['len_word_q1'] = data.question1.apply(lambda x: len(str(x).split()))
    data['len_word_q2'] = data.question2.apply(lambda x: len(str(x).split()))
    data['common_words'] = data.apply(lambda x: len(
        set(str(x['question1']).lower().split()).intersection(
            set(str(x['question2']).lower().split()))),
                                      axis=1)
    col_basic = [
        'len_q1', 'len_q2', 'diff_len', 'len_char_q1', 'len_char_q2',
        'len_word_q1', 'len_word_q2', 'common_words'
    ]

    #Levenshtein Distance features

    data['q_ratio'] = data.apply(
        lambda x: fuzz.QRatio(str(x['question1']), str(x['question2'])),
        axis=1)
    data['w_ratio'] = data.apply(
        lambda x: fuzz.WRatio(str(x['question1']), str(x['question2'])),
        axis=1)
    data['partial_ratio'] = data.apply(
        lambda x: fuzz.partial_ratio(str(x['question1']), str(x['question2'])),
        axis=1)
    data['partial_token_set_ratio'] = data.apply(
        lambda x: fuzz.partial_token_set_ratio(str(x['question1']),
                                               str(x['question2'])),
        axis=1)
    data['partial_token_sort_ratio'] = data.apply(
        lambda x: fuzz.partial_token_sort_ratio(str(x['question1']),
                                                str(x['question2'])),
        axis=1)
    data['token_set_ratio'] = data.apply(lambda x: fuzz.token_set_ratio(
        str(x['question1']), str(x['question2'])),
                                         axis=1)
    data['token_sort_ratio'] = data.apply(
        lambda x: fuzz.token_sort_ratio(str(x['question1']),
                                        str(x['question2'])),
        axis=1)

    col_levenshtein = [
        'q_ratio', 'w_ratio', 'partial_ratio', 'partial_token_set_ratio',
        'partial_token_sort_ratio', 'token_set_ratio', 'token_sort_ratio'
    ]

    question_vec_cols_q1, question_vec_cols_q2 = [], []

    print "levenshtein and basic features done!"

    vector_features, col_distance = question_to_vector(data['question1'],
                                                       data['question2'])
    col_distance = ['WMD_basic', 'WMD_normalized'] + col_distance

    print "assigning vector values"
    for i in range(0, 300):
        question_vec_cols_q1.append("vec_val_" + str(i) + "_q1")
        data["vec_val_" + str(i) + "_q1"] = vector_features[:, i]
        question_vec_cols_q2.append("vec_val_" + str(i) + "_q2")
        data["vec_val_" + str(i) + "_q2"] = vector_features[:, 300 + i]

    for i, key in enumerate(col_distance):
        data[key] = vector_features[:, 600 + i]

    data["zero_vec_check_q1"] = vector_features[:, 613]
    data["zero_vec_check_q2"] = vector_features[:, 614]

    header = ['id', 'is_duplicate']
    header.extend(col_basic)
    header.extend(col_levenshtein)
    header.extend(col_distance)
    header.extend(question_vec_cols_q1)
    header.extend(question_vec_cols_q2)
    header.append('zero_vec_check_q1')
    header.append('zero_vec_check_q2')
    print "writing csv"
    data.to_csv(TRANSFORMED_DATA_FILE_PATH, columns=header, sep='\t')
    print "done!"
Example #13
def generate_h1(train_file, test_file, train_feature_file, test_feature_file,
                feature_map_file):

    df_train = pd.read_csv(train_file)
    df_test = pd.read_csv(test_file)

    print("Original data: X_train: {}, X_test: {}".format(
        df_train.shape, df_test.shape))

    print("Features processing, be patient...")

    # If a word appears only once, we ignore it completely (likely a typo)
    # Epsilon defines a smoothing constant, which makes the effect of extremely rare words smaller
    def get_weight(count, eps=10000, min_count=2):
        return 0 if count < min_count else 1 / (count + eps)
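    # e.g. a word seen once gets weight 0 (below min_count), a word seen twice
    # gets 1 / 10002 ~ 1.0e-04 and a word seen 1000 times gets 1 / 11000 ~ 9.1e-05,
    # so eps keeps rare-but-valid words from dominating the weighting.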

    train_qs = pd.Series(df_train['question1'].tolist() +
                         df_train['question2'].tolist()).astype(str)
    words = (" ".join(train_qs)).lower().split()
    counts = Counter(words)
    weights = {word: get_weight(count) for word, count in counts.items()}

    stops = set(stopwords.words("english"))

    def word_shares(row):
        q1 = set(str(row['question1']).lower().split())
        q1words = q1.difference(stops)
        if len(q1words) == 0:
            return '0:0:0:0:0'

        q2 = set(str(row['question2']).lower().split())
        q2words = q2.difference(stops)
        if len(q2words) == 0:
            return '0:0:0:0:0'

        q1stops = q1.intersection(stops)
        q2stops = q2.intersection(stops)

        shared_words = q1words.intersection(q2words)
        shared_weights = [weights.get(w, 0) for w in shared_words]
        total_weights = [weights.get(w, 0) for w in q1words
                         ] + [weights.get(w, 0) for w in q2words]

        R1 = np.sum(shared_weights) / np.sum(total_weights)  #tfidf share
        R2 = len(shared_words) / (len(q1words) + len(q2words))  #count share
        R31 = len(q1stops) / len(q1words)  #stops in q1
        R32 = len(q2stops) / len(q2words)  #stops in q2
        return '{}:{}:{}:{}:{}'.format(R1, R2, len(shared_words), R31, R32)
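    # word_shares packs five numbers into one string: tfidf share, count share,
    # shared word count, stop-word ratio of q1, stop-word ratio of q2; e.g. a
    # pair sharing 2 of its 8 non-stop words might yield '0.18:0.25:2:0.4:0.5'.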

    df = pd.concat([df_train, df_test])
    df['word_shares'] = df.apply(word_shares, axis=1, raw=True)

    x = pd.DataFrame()
    # in word_shares, field 0 is the tfidf share (R1) and field 1 the plain count share (R2)
    x['word_match'] = df['word_shares'].apply(lambda x: float(x.split(':')[1]))
    x['tfidf_word_match'] = df['word_shares'].apply(
        lambda x: float(x.split(':')[0]))
    x['shared_count'] = df['word_shares'].apply(
        lambda x: float(x.split(':')[2]))

    x['stops1_ratio'] = df['word_shares'].apply(
        lambda x: float(x.split(':')[3]))
    x['stops2_ratio'] = df['word_shares'].apply(
        lambda x: float(x.split(':')[4]))
    x['diff_stops_r'] = x['stops1_ratio'] - x['stops2_ratio']

    x['len_q1'] = df['question1'].apply(lambda x: len(str(x)))
    x['len_q2'] = df['question2'].apply(lambda x: len(str(x)))
    x['diff_len'] = x['len_q1'] - x['len_q2']

    x['len_char_q1'] = df['question1'].apply(
        lambda x: len(str(x).replace(' ', '')))
    x['len_char_q2'] = df['question2'].apply(
        lambda x: len(str(x).replace(' ', '')))
    x['diff_len_char'] = x['len_char_q1'] - x['len_char_q2']

    x['len_word_q1'] = df['question1'].apply(lambda x: len(str(x).split()))
    x['len_word_q2'] = df['question2'].apply(lambda x: len(str(x).split()))
    x['diff_len_word'] = x['len_word_q1'] - x['len_word_q2']

    x['avg_world_len1'] = x['len_char_q1'] / x['len_word_q1']
    x['avg_world_len2'] = x['len_char_q2'] / x['len_word_q2']
    x['diff_avg_word'] = x['avg_world_len1'] - x['avg_world_len2']

    x['fuzz_qratio'] = df.apply(
        lambda x: fuzz.QRatio(str(x['question1']), str(x['question2'])),
        axis=1)

    x['fuzz_WRatio'] = df.apply(
        lambda x: fuzz.WRatio(str(x['question1']), str(x['question2'])),
        axis=1)

    x['fuzz_partial_ratio'] = df.apply(
        lambda x: fuzz.partial_ratio(str(x['question1']), str(x['question2'])),
        axis=1)

    x['fuzz_partial_token_set_ratio'] = df.apply(
        lambda x: fuzz.partial_token_set_ratio(str(x['question1']),
                                               str(x['question2'])),
        axis=1)

    x['fuzz_partial_token_sort_ratio'] = df.apply(
        lambda x: fuzz.partial_token_sort_ratio(str(x['question1']),
                                                str(x['question2'])),
        axis=1)

    x['fuzz_token_set_ratio'] = df.apply(lambda x: fuzz.token_set_ratio(
        str(x['question1']), str(x['question2'])),
                                         axis=1)

    x['fuzz_token_sort_ratio'] = df.apply(lambda x: fuzz.token_sort_ratio(
        str(x['question1']), str(x['question2'])),
                                          axis=1)

    x['word2vec_similarity'] = df.apply(
        lambda x: PhraseVector(str(x['question1'])).CosineSimilarity(
            PhraseVector(str(x['question2'])).vector),
        axis=1)

    feature_names = list(x.columns.values)
    print("Features: {}".format(feature_names))

    x.fillna(0, inplace=True)

    x_train = x[:df_train.shape[0]]
    x_test = x[df_train.shape[0]:]
    y_train = df_train[TARGET].values

    if 1:  # Now we oversample the negative class - on your own risk of overfitting!
        pos_train = x_train[y_train == 1]
        neg_train = x_train[y_train == 0]

        print("Oversampling started for proportion: {}".format(
            len(pos_train) / (len(pos_train) + len(neg_train))))
        p = 0.165
        scale = ((float(len(pos_train)) /
                  (len(pos_train) + len(neg_train))) / p) - 1
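        # e.g. if positives are ~37% of the training data and p = 0.165, then
        # scale ~ 0.37 / 0.165 - 1 ~ 1.24: the negatives are duplicated once in
        # the loop and then ~24% of them are appended again below.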
        while scale > 1:
            neg_train = pd.concat([neg_train, neg_train])
            scale -= 1
        neg_train = pd.concat(
            [neg_train, neg_train[:int(scale * len(neg_train))]])
        print("Oversampling done, new proportion: {}".format(
            len(pos_train) / (len(pos_train) + len(neg_train))))

        x_train = pd.concat([pos_train, neg_train])
        y_train = (np.zeros(len(pos_train)) + 1).tolist() + np.zeros(
            len(neg_train)).tolist()
        del pos_train, neg_train

    logging.info('saving features')
    save_data(x_train, y_train, train_feature_file)
    save_data(x_test, None, test_feature_file)

    with open(feature_map_file, 'w') as f:
        for i, col in enumerate(x.columns):
            f.write('{}\t{}\tq\n'.format(i, col))
Example #14
question2 = input("enter question 2:")
#calculation of features from the input question
len_1 = len(question1)
len_2 = len(question2)
diff_len = len_1 - len_2
len_char_q1 = len(''.join(set(str(question1).replace(' ', ''))))
len_char_q2 = len(''.join(set(str(question2).replace(' ', ''))))
len_word_q1 = len(str(question1).split())
len_word_q2 = len(str(question2).split())
common_words = len(
    set(str(question1).lower().split()).intersection(
        set(str(question2).lower().split())))

#fuzzy
from fuzzywuzzy import fuzz
fuzz_qratio = fuzz.QRatio(str(question1), str(question2))
fuzz_WRatio = fuzz.WRatio(str(question1), str(question2))
fuzz_partial_ratio = fuzz.partial_ratio(str(question1), str(question2))
fuzz_partial_token_set_ratio = fuzz.partial_token_set_ratio(
    str(question1), str(question2))
fuzz_partial_token_sort_ratio = fuzz.partial_token_sort_ratio(
    str(question1), str(question2))
fuzz_token_set_ratio = fuzz.token_set_ratio(str(question1), str(question2))
fuzz_token_sort_ratio = fuzz.token_sort_ratio(str(question1), str(question2))

#wmd
import gensim
#model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)
#sen2vec
import scipy
question1_vectors = scipy.sparse.lil_matrix((dataset.shape[0], 300))
Example #15
def extract_features(model, sentence_1, sentence_2):

    features = []
    #preprocessing each pair of sentences and tokenize them
    sentence_1 = review_to_words(sentence_1)
    sentence_2 = review_to_words(sentence_2)
    tokens_1 = sentence_1.split()
    tokens_2 = sentence_2.split()

    #compute average of Glove word vectors for each sentence
    vec1 = np.zeros([300], dtype=float)
    vec2 = np.zeros([300], dtype=float)
    counter = 1
    for i in range(len(tokens_1)):
        if tokens_1[i] in model:
            counter += 1
            vec1 += model[tokens_1[i]]
    vec1 = vec1 / counter
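    # counter starts at 1, so this is really an average over (matched tokens + 1);
    # it also keeps the division safe when no token of the sentence is in the model.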

    counter = 1
    for i in range(len(tokens_2)):
        if tokens_2[i] in model:
            counter += 1
            vec2 += model[tokens_2[i]]
    vec2 = vec2 / counter

    #add different features
    features.append(cosine_sim(vec1, vec2))
    features.append(euclidean_distance(vec1, vec2))
    features.append(jaccard_similarity(sentence_1, sentence_2))
    features.append(distance_title_len(sentence_1, sentence_2))
    features.append(get_longest_substr_ratio(sentence_1, sentence_2))
    features.append(distance_bigrams_same(sentence_1, sentence_2))

    SAFE_DIV = 0.0001
    token_features = [0.0] * 9

    q1_tokens = sentence_1.split()
    q2_tokens = sentence_2.split()

    if len(q1_tokens) == 0 or len(q2_tokens) == 0:
        return features

    q1_words = set([word for word in q1_tokens if word not in STOP_WORDS])
    q2_words = set([word for word in q2_tokens if word not in STOP_WORDS])
    q1_stops = set([word for word in q1_tokens if word in STOP_WORDS])
    q2_stops = set([word for word in q2_tokens if word in STOP_WORDS])

    common_word_count = len(q1_words.intersection(q2_words))
    common_stop_count = len(q1_stops.intersection(q2_stops))
    common_token_count = len(set(q1_tokens).intersection(set(q2_tokens)))

    token_features[0] = common_word_count / (
        min(len(q1_words), len(q2_words)) + SAFE_DIV)
    token_features[1] = common_word_count / (
        max(len(q1_words), len(q2_words)) + SAFE_DIV)
    token_features[2] = common_stop_count / (
        min(len(q1_stops), len(q2_stops)) + SAFE_DIV)
    token_features[3] = common_stop_count / (
        max(len(q1_stops), len(q2_stops)) + SAFE_DIV)
    token_features[4] = common_token_count / (
        min(len(q1_tokens), len(q2_tokens)) + SAFE_DIV)
    token_features[5] = common_token_count / (
        max(len(q1_tokens), len(q2_tokens)) + SAFE_DIV)
    token_features[6] = int(q1_tokens[-1] == q2_tokens[-1])
    token_features[7] = int(q1_tokens[0] == q2_tokens[0])
    token_features[8] = (len(q1_tokens) + len(q2_tokens)) / 2

    for i in token_features:
        features.append(i)

    #use fuzzy wuzzy library
    features.append(fuzz.token_set_ratio(sentence_1, sentence_2))
    features.append(fuzz.token_sort_ratio(sentence_1, sentence_2))
    features.append(fuzz.QRatio(sentence_1, sentence_2))
    features.append(fuzz.partial_ratio(sentence_1, sentence_2))

    return features
Example #16
def generate_h1(train_file, test_file, train_feature_file, test_feature_file,
                feature_map_file):

    df_train = pd.read_csv(train_file)
    df_test = pd.read_csv(test_file)

    print("Original data: X_train: {}, X_test: {}".format(
        df_train.shape, df_test.shape))

    print("Features processing, be patient...")

    df = pd.concat([df_train, df_test])

    x = pd.DataFrame()

    x['fuzz_qratio'] = df.apply(
        lambda x: fuzz.QRatio(str(x['question1']), str(x['question2'])),
        axis=1)

    x['fuzz_WRatio'] = df.apply(
        lambda x: fuzz.WRatio(str(x['question1']), str(x['question2'])),
        axis=1)

    x['fuzz_partial_ratio'] = df.apply(
        lambda x: fuzz.partial_ratio(str(x['question1']), str(x['question2'])),
        axis=1)

    x['fuzz_partial_token_set_ratio'] = df.apply(
        lambda x: fuzz.partial_token_set_ratio(str(x['question1']),
                                               str(x['question2'])),
        axis=1)

    x['fuzz_partial_token_sort_ratio'] = df.apply(
        lambda x: fuzz.partial_token_sort_ratio(str(x['question1']),
                                                str(x['question2'])),
        axis=1)

    x['fuzz_token_set_ratio'] = df.apply(lambda x: fuzz.token_set_ratio(
        str(x['question1']), str(x['question2'])),
                                         axis=1)

    x['fuzz_token_sort_ratio'] = df.apply(lambda x: fuzz.token_sort_ratio(
        str(x['question1']), str(x['question2'])),
                                          axis=1)

    feature_names = list(x.columns.values)
    print("Features: {}".format(feature_names))

    x.fillna(0, inplace=True)

    x_train = x[:df_train.shape[0]]
    x_test = x[df_train.shape[0]:]
    y_train = df_train[TARGET].values

    logging.info('saving features')
    save_data(x_train, y_train, train_feature_file)
    save_data(x_test, None, test_feature_file)

    with open(feature_map_file, 'w') as f:
        for i, col in enumerate(x.columns):
            f.write('{}\t{}\tq\n'.format(i, col))
Example #17
    def get_feature_engineered_data(self, flag):

        #Creating an object of FeatureEngineeringFunctions
        obj = FeatureEngineeringFunctions()

        if flag == 1:
            train = pd.read_csv('../LematizedFiles/trainlem.csv',
                                engine='python')
        else:
            train = pd.read_csv('../LematizedFiles/testlem.csv',
                                engine='python')

        obj.convert_to_string(train)

        #Data Parameter used for some feature engineering attributes
        EPSILON = 0.0000001
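        # EPSILON keeps the log/ratio features below well-defined when a count
        # is zero, e.g. np.log(0 + EPSILON) instead of log(0).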

        #Map works element wise on a series
        #Apply works row/column wise on a dataframe
        train['q1_word_num'] = train['question1'].map(obj.words_count)
        train['q2_word_num'] = train['question2'].map(obj.words_count)

        train['q1_length'] = train['question1'].map(obj.length)
        train['q2_length'] = train['question2'].map(obj.length)

        train['word_num_difference'] = abs(train.q1_word_num -
                                           train.q2_word_num)
        train['length_difference'] = abs(train.q1_length - train.q2_length)

        train['q1_has_fullstop'] = train.question1.apply(
            lambda x: int('.' in x))
        train['q2_has_fullstop'] = train.question2.apply(
            lambda x: int('.' in x))

        train['q1_digit_count'] = train.question1.apply(
            lambda question: sum([word.isdigit() for word in question]))
        train['q2_digit_count'] = train.question2.apply(
            lambda question: sum([word.isdigit() for word in question]))
        train['digit_count_difference'] = abs(train.q1_digit_count -
                                              train.q2_digit_count)

        train['q1_capital_char_count'] = train.question1.apply(
            lambda question: sum([word.isupper() for word in question]))
        train['q2_capital_char_count'] = train.question2.apply(
            lambda question: sum([word.isupper() for word in question]))
        train['capital_char_count_difference'] = abs(
            train.q1_capital_char_count - train.q2_capital_char_count)

        train['q1_has_math_expression'] = train.question1.apply(
            lambda x: int('[math]' in x))
        train['q2_has_math_expression'] = train.question2.apply(
            lambda x: int('[math]' in x))

        train['common_words'] = train[['question1',
                                       'question2']].apply(obj.count_common,
                                                           axis=1)
        train['lem_common_words'] = train[['lem_question1', 'lem_question2'
                                           ]].apply(obj.count_common, axis=1)

        train['log_word_share'] = np.log(
            train[['question1', 'question2']].apply(obj.count_common, axis=1) +
            EPSILON)
        train['lem_log_word_share'] = np.log(
            train[['lem_question1', 'lem_question2']].apply(obj.count_common,
                                                            axis=1) + EPSILON)

        train['word_share_squared'] = (train[['question1', 'question2'
                                              ]].apply(obj.count_common,
                                                       axis=1)**2)
        train['lem_word_share_squared'] = (train[[
            'lem_question1', 'lem_question2'
        ]].apply(obj.count_common, axis=1)**2)

        train['word_share_sqrt'] = np.sqrt(train[['question1', 'question2'
                                                  ]].apply(obj.count_common,
                                                           axis=1))
        train['lem_word_share_sqrt'] = np.sqrt(
            train[['lem_question1', 'lem_question2']].apply(obj.count_common,
                                                            axis=1))

        train['log_length_difference'] = np.log(train.length_difference +
                                                EPSILON)
        train['length_difference_squared'] = train.length_difference**2
        train['length_difference_sqrt'] = np.sqrt(train.length_difference)

        train['log_lem_tfidf'] = np.log(train.lem_tfidf_word_match + EPSILON)
        train['lem_tfidf_squared'] = train.lem_tfidf_word_match**2
        train['lem_tfidf_sqrt'] = np.sqrt(train.lem_tfidf_word_match)

        train['total_unique_words'] = train[['question1', 'question2'
                                             ]].apply(obj.total_unique_words,
                                                      axis=1)
        train['word_count_ratio'] = train[['question1', 'question2'
                                           ]].apply(obj.word_count_ratio,
                                                    axis=1)

        train['fuzz_qratio'] = train.apply(
            lambda x: fuzz.QRatio(str(x['question1']), str(x['question2'])),
            axis=1)
        train['fuzz_WRatio'] = train.apply(
            lambda x: fuzz.WRatio(str(x['question1']), str(x['question2'])),
            axis=1)
        train['fuzz_partial_ratio'] = train.apply(lambda x: fuzz.partial_ratio(
            str(x['question1']), str(x['question2'])),
                                                  axis=1)
        train['fuzz_partial_token_set_ratio'] = train.apply(
            lambda x: fuzz.partial_token_set_ratio(str(x['question1']),
                                                   str(x['question2'])),
            axis=1)
        train['fuzz_partial_token_sort_ratio'] = train.apply(
            lambda x: fuzz.partial_token_sort_ratio(str(x['question1']),
                                                    str(x['question2'])),
            axis=1)
        train['fuzz_token_set_ratio'] = train.apply(
            lambda x: fuzz.token_set_ratio(str(x['question1']),
                                           str(x['question2'])),
            axis=1)
        train['fuzz_token_sort_ratio'] = train.apply(
            lambda x: fuzz.token_sort_ratio(str(x['question1']),
                                            str(x['question2'])),
            axis=1)

        train['cosine_score'] = train.apply(get_cosine, axis=1, raw=True)

        features = [
            'q1_word_num', 'q2_word_num', 'word_num_difference', 'q1_length',
            'q2_length', 'length_difference', 'q1_has_fullstop',
            'q2_has_fullstop', 'q1_digit_count', 'q2_digit_count',
            'digit_count_difference', 'q1_capital_char_count',
            'q2_capital_char_count', 'capital_char_count_difference',
            'q1_has_math_expression', 'q2_has_math_expression',
            'log_length_difference', 'log_word_share', 'word_share_squared',
            'word_share_sqrt', 'length_difference_squared',
            'length_difference_sqrt', 'common_words', 'lem_common_words',
            'lem_log_word_share', 'lem_word_share_squared',
            'lem_word_share_sqrt', 'tfidf_word_match', 'lem_tfidf_word_match',
            'intersection_count', 'log_lem_tfidf', 'lem_tfidf_squared',
            'lem_tfidf_sqrt', 'total_unique_words', 'word_count_ratio',
            'fuzz_qratio', 'fuzz_WRatio', 'fuzz_partial_ratio',
            'fuzz_partial_token_set_ratio', 'fuzz_partial_token_sort_ratio',
            'fuzz_token_set_ratio', 'fuzz_token_sort_ratio', 'cosine_score',
            'q1_freq', 'q2_freq'
        ]

        if flag == 1:
            target = 'is_duplicate'
            X = train[features]
            Y = train[target]
            return X, Y
        else:
            X = train[features]
            return X
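# Hedged usage sketch (not part of the original example). Assuming this method
# belongs to a class that also prepares the lem_tfidf / q1_freq style columns it
# reads, it would typically be called once per split, e.g.:
#
#   fe = FeatureEngineering()          # hypothetical owning class
#   X_train, y_train = fe.get_feature_engineered_data(flag=1)
#   X_test = fe.get_feature_engineered_data(flag=0)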
Exemple #18
0
        data_line = line.rstrip().split(',')
        ele1 = data_line[0].split('||')
        ele2 = data_line[1].split('||')
        kod1 = ele1[0]
        firm1 = ele1[1]
        kod2 = ele2[0]
        firm2 = ele2[1]
        # score = fuzz.token_set_ratio(firm1, firm2)

        score_r = fuzz.ratio(firm1, firm2)
        score_pr = fuzz.partial_ratio(firm1, firm2)
        score_tsor = fuzz.token_sort_ratio(firm1, firm2)
        score_tser = fuzz.token_set_ratio(firm1, firm2)
        score_ptsor = fuzz.partial_token_sort_ratio(firm1, firm2)
        score_ptser = fuzz.partial_token_set_ratio(firm1, firm2)
        score_qr = fuzz.QRatio(firm1, firm2)
        score_uqr = fuzz.UQRatio(firm1, firm2)
        score_wr = fuzz.WRatio(firm1, firm2)
        score_uwr = fuzz.UWRatio(firm1, firm2)

        # print('kod1:' + kod1)
        # print('firm1:' + firm1)
        # print('kod2:' + kod2)
        # print('firm2:' + firm2)
        # print('score:' + str(score))

        # if score_r > 90 or score_pr > 90 or score_tsor > 90 or score_tser > 90 or score_ptsor > 90 or score_ptser > 90 \
        #         or score_qr > 90 or score_uqr > 90 or score_wr > 90 or score_uwr > 90:

        if score_tser > 90:
            #NOTE: the body of this branch is truncated in the source; the tuple
            #contents below are an assumption kept only so the snippet parses
            temp3 = (kod1, firm1, kod2, firm2, score_tser)

df_train = pd.read_csv('../data/input/atec_nlp_sim_train.csv',
                       # the opening of this read_csv call is missing in the source;
                       # the path above is an assumption based on the "_add" file read just below
                       header=None,
                       sep='\t')
df_train.columns = ['line', 'q1', 'q2', 'label']

df_train_add = pd.read_csv('../data/input/atec_nlp_sim_train_add.csv',
                           encoding='utf-8-sig',
                           header=None,
                           sep='\t')
df_train_add.columns = ['line', 'q1', 'q2', 'label']
df_train = pd.concat([df_train, df_train_add], axis=0, sort=False)

df_feat = pd.DataFrame()
df_feat['fuzz_ratio'] = df_train.apply(
    lambda row: fuzz.ratio(str(row.q1), str(row.q2)), axis=1)
df_feat['fuzz_qratio'] = df_train.apply(
    lambda row: fuzz.QRatio(str(row.q1), str(row.q2)), axis=1)
df_feat['fuzz_wratio'] = df_train.apply(
    lambda row: fuzz.WRatio(str(row.q1), str(row.q2)), axis=1)
df_feat['fuzz_partial_ratio'] = df_train.apply(
    lambda row: fuzz.partial_ratio(str(row.q1), str(row.q2)), axis=1)
df_feat['fuzz_partial_token_set_ratio'] = df_train.apply(
    lambda row: fuzz.partial_token_set_ratio(str(row.q1), str(row.q2)), axis=1)
df_feat['fuzz_partial_token_sort_ratio'] = df_train.apply(
    lambda row: fuzz.partial_token_sort_ratio(str(row.q1), str(row.q2)),
    axis=1)
df_feat['fuzz_token_set_ratio'] = df_train.apply(
    lambda row: fuzz.token_set_ratio(str(row.q1), str(row.q2)), axis=1)
df_feat['fuzz_token_sort_ratio'] = df_train.apply(
    lambda row: fuzz.token_sort_ratio(str(row.q1), str(row.q2)), axis=1)

df_feat.to_csv('subfeas/train_feature_fuzz.csv', index=False)
def create_all_simple_features(list_pairs_train,list_pairs_test,texts,\
                               ids2ind,word_vectors,embedding_matrix,glove_embedding,sequences\
                               ,data_full_word,my_p=50,n_lsa=40):
    '''
    Create the set of basic features from the lists of question pairs.
    param:  list_pairs_train: list of pairs of questions to compare for training
            list_pairs_test: list of pairs of questions to compare for the test phase
            texts: dict mapping question id -> question text
            ids2ind: index of each question in the vectorized matrices
            word_vectors: word2vec-based word vectors (gensim object)
            embedding_matrix: word2vec-based embedding matrix
            glove_embedding: GloVe-based embedding matrix
            sequences: sequence indices (keras object)
            data_full_word: tokenized documents used to train the doc2vec models
            my_p: dimension of the DBOW doc2vec vectors
            n_lsa: number of axes kept by the truncated SVD used for the LSA embedding
    return: matrix train (nb_sample, nb_features)
            matrix test (nb_sample, nb_features)
    '''
    vec = TfidfVectorizer()
    A = vec.fit_transform(texts.values())
    LSA_features= TruncatedSVD(n_components=n_lsa).fit_transform(A)
    vec_count = CountVectorizer()
    B = vec_count.fit_transform(texts.values())
    LSA_bis_features= TruncatedSVD(n_components=n_lsa).fit_transform(B)
    
    stemmer = nltk.stem.SnowballStemmer('english')

    ###Stem doc
    # texts is a dict keyed by question id, so iterate over its values, not its keys
    documents = [[stemmer.stem(word) for word in sentence.split(" ")] for sentence in texts.values()]
    documents = [' '.join(doc) for doc in documents]

    vec_stem = TfidfVectorizer()
    C = vec_stem.fit_transform(documents)
    LSA_features_stem= TruncatedSVD(n_components=n_lsa).fit_transform(C)

    vec_count_stem = CountVectorizer()
    D = vec_count_stem.fit_transform(documents)
    LSA_bis_features_stem= TruncatedSVD(n_components=n_lsa).fit_transform(D)
    
    #########Init###########
    N_train = len(list_pairs_train)
    N_test= len(list_pairs_test)
    X_train = np.zeros((N_train,135))
    X_test = np.zeros((N_test,135))
    
    cleaned_docs= data_full_word
    d2v_training_data = []
    
    for idx,doc in enumerate(cleaned_docs):
        d2v_training_data.append(LabeledSentence(words=doc,tags=[idx]))
        if idx % round(len(cleaned_docs)/10) == 0:
            print(idx)
    
    d2v_dm = Doc2Vec(d2v_training_data, 
                 size=200,
                 iter=6,
                 window=5, 
                 min_count=3, 
                 workers=4)
    
    d2v_dm.delete_temporary_training_data(keep_doctags_vectors=True, 
                                      keep_inference=True)

    d2v_dbow = Doc2Vec(d2v_training_data, 
                       size=my_p, 
                       window=4,
                       iter=6,
                       min_count=3, 
                       dm=0, 
                       workers=4)
    
    d2v_dbow.delete_temporary_training_data(keep_doctags_vectors=True, 
                                            keep_inference=True)
    #####Create features for training###########
    for i in range(N_train):
        q1 = list_pairs_train[i][0]
        q2 = list_pairs_train[i][1]
        
        X_train[i,0] = 1- cosine_similarity(A[ids2ind[q1],:], A[ids2ind[q2],:])
        X_train[i,2] = abs(len(texts[q1].split()) - len(texts[q2].split()))
        X_train[i,3] = min(word_vectors.wv.wmdistance((texts[q1].lower()).split(),(texts[q2].lower()).split()),100000) #WM distance
        X_train[i,4] = min(matching((texts[q1].lower()).split(),(texts[q2].lower()).split()),7)        
        X_train[i,5]= 1- cosine_similarity(LSA_features[ids2ind[q1],:].reshape(1, -1), LSA_features[ids2ind[q2],:].reshape(1, -1))
        if (len(sequences[ids2ind[q1]])>0)and(len(sequences[ids2ind[q2]])>0) :
            mean_pos_1 = (embedding_matrix[sequences[ids2ind[q1]],:]).sum(axis=0)
            mean_pos_2 = (embedding_matrix[sequences[ids2ind[q2]],:]).sum(axis=0)
            mean_pos_1= mean_pos_1 / np.sqrt((mean_pos_1 ** 2).sum())
            mean_pos_2= mean_pos_2 / np.sqrt((mean_pos_2 ** 2).sum())
            X_train[i,1] = 1- cosine_similarity(mean_pos_1.reshape(1, -1),mean_pos_2.reshape(1, -1))
            mean_pos_1_gv = (glove_embedding[sequences[ids2ind[q1]],:]).sum(axis=0)
            mean_pos_2_gv = (glove_embedding[sequences[ids2ind[q2]],:]).sum(axis=0)
            if np.sum(mean_pos_1_gv)!=0:
                mean_pos_1_gv= mean_pos_1_gv / np.sqrt((mean_pos_1_gv ** 2).sum())
            if np.sum(mean_pos_2_gv)!=0:
                mean_pos_2_gv= mean_pos_2_gv / np.sqrt((mean_pos_2_gv ** 2).sum())
            X_train[i,15] = 1- cosine_similarity(mean_pos_1_gv.reshape(1, -1),mean_pos_2_gv.reshape(1, -1))
            X_train[i,17] = np.linalg.norm(mean_pos_1.reshape(1, -1)-mean_pos_2.reshape(1, -1))
            X_train[i,18] = np.linalg.norm(mean_pos_1_gv.reshape(1, -1)-mean_pos_2_gv.reshape(1, -1))
            X_train[i,35]= distance.cityblock(mean_pos_1,mean_pos_2)
            X_train[i,36]= distance.jaccard(mean_pos_1,mean_pos_2)
            X_train[i,37]= distance.canberra(mean_pos_1,mean_pos_2)
            X_train[i,38]= distance.minkowski(mean_pos_1,mean_pos_2,3)
            X_train[i,39]= distance.braycurtis(mean_pos_1,mean_pos_2)
            X_train[i,40]= distance.cityblock(mean_pos_1_gv,mean_pos_2_gv)
            X_train[i,41]= distance.jaccard(mean_pos_1_gv,mean_pos_2_gv)
            X_train[i,42]= distance.canberra(mean_pos_1_gv,mean_pos_2_gv)
            X_train[i,43]= distance.minkowski(mean_pos_1_gv,mean_pos_2_gv,3)
            X_train[i,44]= distance.braycurtis(mean_pos_1_gv,mean_pos_2_gv)
        else:
            X_train[i,1] = -1
            X_train[i,15]= -1
            X_train[i,17] = -1
            X_train[i,18] = -1
            X_train[i,35:45]=-1
            
        X_train[i,6] = fuzz.partial_ratio(texts[q1],texts[q2])/100
        X_train[i,7] = fuzz.QRatio(texts[q1],texts[q2])/100
        X_train[i,8] = 1 - cosine_similarity(LSA_bis_features[ids2ind[q1],:].reshape(1, -1), LSA_bis_features[ids2ind[q2],:].reshape(1, -1))
    
        d2v1 = d2v_dm.infer_vector(cleaned_docs[ids2ind[q1]])  
        d2v2 = d2v_dm.infer_vector(cleaned_docs[ids2ind[q2]])  
        
        d2vbow1 = d2v_dbow.infer_vector(cleaned_docs[ids2ind[q1]])  
        d2vbow2 = d2v_dbow.infer_vector(cleaned_docs[ids2ind[q2]])
        
        X_train[i,9] = 1 - cosine_similarity(d2vbow1.reshape(1, -1), d2vbow2.reshape(1, -1))
        X_train[i,10] = 1 - cosine_similarity(d2v1.reshape(1, -1), d2v2.reshape(1, -1))
        X_train[i,11] = dice((texts[q1].lower()).split(),(texts[q2].lower()).split())
        X_train[i,12] = jaccard((texts[q1].lower()).split(),(texts[q2].lower()).split())
        X_train[i,13] = overlap((texts[q1].lower()).split(),(texts[q2].lower()).split())
        X_train[i,14] = cosine_wd((texts[q1].lower()).split(),(texts[q2].lower()).split())
        X_train[i,16]= np.linalg.norm(LSA_features[ids2ind[q1],:].reshape(1, -1) - LSA_features[ids2ind[q2],:].reshape(1, -1))
        X_train[i,19] = np.linalg.norm(LSA_bis_features[ids2ind[q1],:].reshape(1, -1)- LSA_bis_features[ids2ind[q2],:].reshape(1, -1))
        X_train[i,20] = np.linalg.norm(d2vbow1.reshape(1, -1) -  d2vbow2.reshape(1, -1))
        X_train[i,21] = np.linalg.norm(d2v1.reshape(1, -1) -  d2v2.reshape(1, -1))
        X_train[i,22] = np.linalg.norm(A[ids2ind[q1],:].todense() - A[ids2ind[q2],:].todense())
        X_train[i,23]=  max(min(presence_of_why(texts[q1].lower()),1),min(presence_of_why(texts[q2].lower()),1))
        X_train[i,24]= max(min(presence_of_what(texts[q1].lower()),1),min(presence_of_what(texts[q2].lower()),1))
        X_train[i,25]= max(min(presence_of_when(texts[q1].lower()),1),min(presence_of_when(texts[q2].lower()),1))
        X_train[i,26]= max(min(presence_of_where(texts[q1].lower()),1),min(presence_of_where(texts[q2].lower()),1))
        X_train[i,27]= max(min(presence_of_how(texts[q1].lower()),1),min(presence_of_how(texts[q2].lower()),1))
        X_train[i,28]= min(min(presence_of_why(texts[q1].lower()),1),min(presence_of_why(texts[q2].lower()),1))
        X_train[i,29]= min(min(presence_of_what(texts[q1].lower()),1),min(presence_of_what(texts[q2].lower()),1))
        X_train[i,30]= min(min(presence_of_when(texts[q1].lower()),1),min(presence_of_when(texts[q2].lower()),1))
        X_train[i,31]= min(min(presence_of_where(texts[q1].lower()),1),min(presence_of_where(texts[q2].lower()),1))
        X_train[i,32]=  min(min(presence_of_how(texts[q1].lower()),1),min(presence_of_how(texts[q2].lower()),1))
        X_train[i,33]= fuzz.token_set_ratio(texts[q1],texts[q2])/100
        X_train[i,34]= fuzz.token_sort_ratio(texts[q1],texts[q2])/100
        X_train[i,45] = abs(len(texts[q1].lower())-len(texts[q2].lower()))
        X_train[i,46] = abs(len([j for j in texts[q1] if j=='?'])-len([j for j in texts[q2] if j=='?']))
        X_train[i,47] = len(texts[q1].split()) + len(texts[q2].split())
        X_train[i,48] = distance.cityblock(d2v1.reshape(1, -1),d2v2.reshape(1, -1))
        X_train[i,49] = distance.jaccard(d2v1.reshape(1, -1),d2v2.reshape(1, -1))
        X_train[i,50] = distance.canberra(d2v1.reshape(1, -1),d2v2.reshape(1, -1))
        X_train[i,51] = distance.minkowski(d2v1.reshape(1, -1),d2v2.reshape(1, -1),3)
        X_train[i,52] = distance.braycurtis(d2v1.reshape(1, -1),d2v2.reshape(1, -1))
        X_train[i,53] = distance.cityblock(LSA_features[ids2ind[q1],:].reshape(1, -1),LSA_features[ids2ind[q2],:].reshape(1, -1))
        X_train[i,54] = distance.jaccard(LSA_features[ids2ind[q1],:].reshape(1, -1),LSA_features[ids2ind[q2],:].reshape(1, -1))
        X_train[i,55] = distance.canberra(LSA_features[ids2ind[q1],:].reshape(1, -1),LSA_features[ids2ind[q2],:].reshape(1, -1))
        X_train[i,56] = distance.minkowski(LSA_features[ids2ind[q1],:].reshape(1, -1),LSA_features[ids2ind[q2],:].reshape(1, -1),3)
        X_train[i,57] = distance.braycurtis(LSA_features[ids2ind[q1],:].reshape(1, -1),LSA_features[ids2ind[q2],:].reshape(1, -1))
        X_train[i,58] = unmatching((texts[q1].lower()).split(),(texts[q2].lower()).split())
        X_train[i,59] = is_first_word_same((texts[q1].lower()).split(),(texts[q2].lower()).split())
        X_train[i,60] = is_last_word_same((texts[q1].lower()).split(),(texts[q2].lower()).split())
        
    
        #####Using QID
        X_train[i,61] = abs(int(q1) - int(q2))
        X_train[i,62] = abs((int(q1) + int(q2))/2)
        X_train[i,63] = abs(min(int(q1),int(q2)))

        ####Using N-grams
        X_train[i,64] = common_n_grams((texts[q1].lower()).split(),(texts[q2].lower()).split(),2)
        X_train[i,65] = common_n_grams((texts[q1].lower()).split(),(texts[q2].lower()).split(),3)
        X_train[i,66] = common_n_grams((texts[q1].lower()).split(),(texts[q2].lower()).split(),4)
        X_train[i,67] = common_n_grams((texts[q1].lower()).split(),(texts[q2].lower()).split(),5)
        X_train[i,68] = common_n_grams((texts[q1].lower()).split(),(texts[q2].lower()).split(),6)
        X_train[i,69] = not_common_n_grams((texts[q1].lower()).split(),(texts[q2].lower()).split(),2)
        X_train[i,70] = not_common_n_grams((texts[q1].lower()).split(),(texts[q2].lower()).split(),3)
        X_train[i,71] = not_common_n_grams((texts[q1].lower()).split(),(texts[q2].lower()).split(),4)
        X_train[i,72] = not_common_n_grams((texts[q1].lower()).split(),(texts[q2].lower()).split(),5)
        X_train[i,73] = not_common_n_grams((texts[q1].lower()).split(),(texts[q2].lower()).split(),6)

        X_train[i,74] = dice((texts[q1].lower()).split(),(texts[q2].lower()).split(),stemming=False)
        X_train[i,75] = jaccard((texts[q1].lower()).split(),(texts[q2].lower()).split(),stemming=False)
        X_train[i,76] = overlap((texts[q1].lower()).split(),(texts[q2].lower()).split(),stemming=False)
        X_train[i,77] = cosine_wd((texts[q1].lower()).split(),(texts[q2].lower()).split(),stemming=False)
        X_train[i,78] = min(matching((texts[q1].lower()).split(),(texts[q2].lower()).split(),stemming=False),7)
        X_train[i,79] = unmatching((texts[q1].lower()).split(),(texts[q2].lower()).split(),stemming=False)
        X_train[i,80] = common_n_grams((texts[q1].lower()).split(),(texts[q2].lower()).split(),2,stemming=False)
        X_train[i,81] = common_n_grams((texts[q1].lower()).split(),(texts[q2].lower()).split(),3,stemming=False)
        X_train[i,82] = common_n_grams((texts[q1].lower()).split(),(texts[q2].lower()).split(),4,stemming=False)
        X_train[i,83] = common_n_grams((texts[q1].lower()).split(),(texts[q2].lower()).split(),5,stemming=False)
        X_train[i,84] = common_n_grams((texts[q1].lower()).split(),(texts[q2].lower()).split(),6,stemming=False)
        X_train[i,85] = not_common_n_grams((texts[q1].lower()).split(),(texts[q2].lower()).split(),2,stemming=False)
        X_train[i,86] = not_common_n_grams((texts[q1].lower()).split(),(texts[q2].lower()).split(),3,stemming=False)
        X_train[i,87] = not_common_n_grams((texts[q1].lower()).split(),(texts[q2].lower()).split(),4,stemming=False)
        X_train[i,88] = not_common_n_grams((texts[q1].lower()).split(),(texts[q2].lower()).split(),5,stemming=False)
        X_train[i,89] = not_common_n_grams((texts[q1].lower()).split(),(texts[q2].lower()).split(),6,stemming=False)

    
        X_train[i,90] = distance.cityblock(LSA_bis_features[ids2ind[q1],:].reshape(1, -1),LSA_bis_features[ids2ind[q2],:].reshape(1, -1))
        X_train[i,91] = distance.jaccard(LSA_bis_features[ids2ind[q1],:].reshape(1, -1),LSA_bis_features[ids2ind[q2],:].reshape(1, -1))
        X_train[i,92] = distance.canberra(LSA_bis_features[ids2ind[q1],:].reshape(1, -1),LSA_bis_features[ids2ind[q2],:].reshape(1, -1))
        X_train[i,93] = distance.minkowski(LSA_bis_features[ids2ind[q1],:].reshape(1, -1),LSA_bis_features[ids2ind[q2],:].reshape(1, -1),3)
        X_train[i,94] = distance.braycurtis(LSA_bis_features[ids2ind[q1],:].reshape(1, -1),LSA_bis_features[ids2ind[q2],:].reshape(1, -1))
                
        
        X_train[i,95] = distance.cityblock(A[ids2ind[q1],:].todense().reshape(1, -1),A[ids2ind[q2],:].todense().reshape(1, -1))
        X_train[i,96] = distance.jaccard(A[ids2ind[q1],:].todense().reshape(1, -1),A[ids2ind[q2],:].todense().reshape(1, -1))
        X_train[i,97] = distance.canberra(A[ids2ind[q1],:].todense().reshape(1, -1),A[ids2ind[q2],:].todense().reshape(1, -1))
        X_train[i,98] = distance.minkowski(A[ids2ind[q1],:].todense().reshape(1, -1),A[ids2ind[q2],:].todense().reshape(1, -1),3)
        X_train[i,99] = distance.braycurtis(A[ids2ind[q1],:].todense().reshape(1, -1),A[ids2ind[q2],:].todense().reshape(1, -1))
        
        
        X_train[i,100] = 1- cosine_similarity(B[ids2ind[q1],:], B[ids2ind[q2],:])
        X_train[i,101] = np.linalg.norm(B[ids2ind[q1],:].todense() - B[ids2ind[q2],:].todense())
        X_train[i,102] = distance.cityblock(B[ids2ind[q1],:].todense().reshape(1, -1),B[ids2ind[q2],:].todense().reshape(1, -1))
        X_train[i,103] = distance.jaccard(B[ids2ind[q1],:].todense().reshape(1, -1),B[ids2ind[q2],:].todense().reshape(1, -1))
        X_train[i,104] = distance.canberra(B[ids2ind[q1],:].todense().reshape(1, -1),B[ids2ind[q2],:].todense().reshape(1, -1))
        X_train[i,105] = distance.minkowski(B[ids2ind[q1],:].todense().reshape(1, -1),B[ids2ind[q2],:].todense().reshape(1, -1),3)
        X_train[i,106] = distance.braycurtis(B[ids2ind[q1],:].todense().reshape(1, -1),B[ids2ind[q2],:].todense().reshape(1, -1))
        
        X_train[i,107] = 1- cosine_similarity(C[ids2ind[q1],:], C[ids2ind[q2],:])
        X_train[i,108] = np.linalg.norm(C[ids2ind[q1],:].todense() - C[ids2ind[q2],:].todense())
        X_train[i,109] = distance.cityblock(C[ids2ind[q1],:].todense().reshape(1, -1),C[ids2ind[q2],:].todense().reshape(1, -1))
        X_train[i,110] = distance.jaccard(C[ids2ind[q1],:].todense().reshape(1, -1),C[ids2ind[q2],:].todense().reshape(1, -1))
        X_train[i,111] = distance.canberra(C[ids2ind[q1],:].todense().reshape(1, -1),C[ids2ind[q2],:].todense().reshape(1, -1))
        X_train[i,112] = distance.minkowski(C[ids2ind[q1],:].todense().reshape(1, -1),C[ids2ind[q2],:].todense().reshape(1, -1),3)
        X_train[i,113] = distance.braycurtis(C[ids2ind[q1],:].todense().reshape(1, -1),C[ids2ind[q2],:].todense().reshape(1, -1))
        
        X_train[i,114] = 1- cosine_similarity(D[ids2ind[q1],:], D[ids2ind[q2],:])
        X_train[i,115] = np.linalg.norm(D[ids2ind[q1],:].todense() - D[ids2ind[q2],:].todense())
        X_train[i,116] = distance.cityblock(D[ids2ind[q1],:].todense().reshape(1, -1),D[ids2ind[q2],:].todense().reshape(1, -1))
        X_train[i,117] = distance.jaccard(D[ids2ind[q1],:].todense().reshape(1, -1),D[ids2ind[q2],:].todense().reshape(1, -1))
        X_train[i,118] = distance.canberra(D[ids2ind[q1],:].todense().reshape(1, -1),D[ids2ind[q2],:].todense().reshape(1, -1))
        X_train[i,119] = distance.minkowski(D[ids2ind[q1],:].todense().reshape(1, -1),D[ids2ind[q2],:].todense().reshape(1, -1),3)
        X_train[i,120] = distance.braycurtis(D[ids2ind[q1],:].todense().reshape(1, -1),D[ids2ind[q2],:].todense().reshape(1, -1))
        
        X_train[i,121] = 1 - cosine_similarity(LSA_features_stem[ids2ind[q1],:].reshape(1, -1), LSA_features_stem[ids2ind[q2],:].reshape(1, -1))
        X_train[i,122] = np.linalg.norm(LSA_features_stem[ids2ind[q1],:].reshape(1, -1)- LSA_features_stem[ids2ind[q2],:].reshape(1, -1))
        X_train[i,123] = distance.cityblock(LSA_features_stem[ids2ind[q1],:].reshape(1, -1),LSA_features_stem[ids2ind[q2],:].reshape(1, -1))
        X_train[i,124] = distance.jaccard(LSA_features_stem[ids2ind[q1],:].reshape(1, -1),LSA_features_stem[ids2ind[q2],:].reshape(1, -1))
        X_train[i,125] = distance.canberra(LSA_features_stem[ids2ind[q1],:].reshape(1, -1),LSA_features_stem[ids2ind[q2],:].reshape(1, -1))
        X_train[i,126] = distance.minkowski(LSA_features_stem[ids2ind[q1],:].reshape(1, -1),LSA_features_stem[ids2ind[q2],:].reshape(1, -1),3)
        X_train[i,127] = distance.braycurtis(LSA_features_stem[ids2ind[q1],:].reshape(1, -1),LSA_features_stem[ids2ind[q2],:].reshape(1, -1))

        X_train[i,128] = 1 - cosine_similarity(LSA_bis_features_stem[ids2ind[q1],:].reshape(1, -1), LSA_bis_features_stem[ids2ind[q2],:].reshape(1, -1))
        X_train[i,129] = np.linalg.norm(LSA_bis_features_stem[ids2ind[q1],:].reshape(1, -1)- LSA_bis_features_stem[ids2ind[q2],:].reshape(1, -1))
        X_train[i,130] = distance.cityblock(LSA_bis_features_stem[ids2ind[q1],:].reshape(1, -1),LSA_bis_features_stem[ids2ind[q2],:].reshape(1, -1))
        X_train[i,131] = distance.jaccard(LSA_bis_features_stem[ids2ind[q1],:].reshape(1, -1),LSA_bis_features_stem[ids2ind[q2],:].reshape(1, -1))
        X_train[i,132] = distance.canberra(LSA_bis_features_stem[ids2ind[q1],:].reshape(1, -1),LSA_bis_features_stem[ids2ind[q2],:].reshape(1, -1))
        X_train[i,133] = distance.minkowski(LSA_bis_features_stem[ids2ind[q1],:].reshape(1, -1),LSA_bis_features_stem[ids2ind[q2],:].reshape(1, -1),3)
        X_train[i,134] = distance.braycurtis(LSA_bis_features_stem[ids2ind[q1],:].reshape(1, -1),LSA_bis_features_stem[ids2ind[q2],:].reshape(1, -1))



        
    #####Create features for test###########
    for i in range(N_test):
        q1 = list_pairs_test[i][0]
        q2 = list_pairs_test[i][1]
        X_test[i,0] = 1- cosine_similarity(A[ids2ind[q1],:], A[ids2ind[q2],:])
        X_test[i,2] = abs(len(texts[q1].split()) - len(texts[q2].split()))
        X_test[i,3] = min(word_vectors.wv.wmdistance((texts[q1].lower()).split(),(texts[q2].lower()).split()),100000) #WM distance
        X_test[i,4] = min(matching((texts[q1].lower()).split(),(texts[q2].lower()).split()),7)
        X_test[i,5]= 1- cosine_similarity(LSA_features[ids2ind[q1],:].reshape(1, -1), LSA_features[ids2ind[q2],:].reshape(1, -1))
    
        if (len(sequences[ids2ind[q1]])>0)and(len(sequences[ids2ind[q2]])>0) :
            mean_pos_1 = (embedding_matrix[sequences[ids2ind[q1]],:]).sum(axis=0)
            mean_pos_2 = (embedding_matrix[sequences[ids2ind[q2]],:]).sum(axis=0)
            mean_pos_1= mean_pos_1 / np.sqrt((mean_pos_1 ** 2).sum())
            mean_pos_2= mean_pos_2 / np.sqrt((mean_pos_2 ** 2).sum())
            X_test[i,1] = 1- cosine_similarity(mean_pos_1.reshape(1, -1),mean_pos_2.reshape(1, -1))
            mean_pos_1_gv = (glove_embedding[sequences[ids2ind[q1]],:]).sum(axis=0)
            mean_pos_2_gv = (glove_embedding[sequences[ids2ind[q2]],:]).sum(axis=0)
            if np.sum(mean_pos_1_gv)!=0:
                mean_pos_1_gv= mean_pos_1_gv / np.sqrt((mean_pos_1_gv ** 2).sum())
            if np.sum(mean_pos_2_gv)!=0:
                mean_pos_2_gv= mean_pos_2_gv / np.sqrt((mean_pos_2_gv ** 2).sum())
            X_test[i,15] = 1- cosine_similarity(mean_pos_1_gv.reshape(1, -1),mean_pos_2_gv.reshape(1, -1))
            X_test[i,17] = np.linalg.norm(mean_pos_1.reshape(1, -1)-mean_pos_2.reshape(1, -1))
            X_test[i,18] = np.linalg.norm(mean_pos_1_gv.reshape(1, -1)-mean_pos_2_gv.reshape(1, -1))
            X_test[i,35]= distance.cityblock(mean_pos_1,mean_pos_2)
            X_test[i,36]= distance.jaccard(mean_pos_1,mean_pos_2)
            X_test[i,37]= distance.canberra(mean_pos_1,mean_pos_2)
            X_test[i,38]= distance.minkowski(mean_pos_1,mean_pos_2,3)
            X_test[i,39]= distance.braycurtis(mean_pos_1,mean_pos_2)
            X_test[i,40]= distance.cityblock(mean_pos_1_gv,mean_pos_2_gv)
            X_test[i,41]= distance.jaccard(mean_pos_1_gv,mean_pos_2_gv)
            X_test[i,42]= distance.canberra(mean_pos_1_gv,mean_pos_2_gv)
            X_test[i,43]= distance.minkowski(mean_pos_1_gv,mean_pos_2_gv,3)
            X_test[i,44]= distance.braycurtis(mean_pos_1_gv,mean_pos_2_gv)
        else:
            X_test[i,1] = -1
            X_test[i,15]= -1
            X_test[i,17] = -1
            X_test[i,18] = -1 
            X_test[i,35:45]=-1
        X_test[i,6] = fuzz.partial_ratio(texts[q1],texts[q2])/100
        X_test[i,7] = fuzz.QRatio(texts[q1],texts[q2])/100
        X_test[i,8] = 1 - cosine_similarity(LSA_bis_features[ids2ind[q1],:].reshape(1, -1), LSA_bis_features[ids2ind[q2],:].reshape(1, -1))
        
        d2v1 = d2v_dm.infer_vector(cleaned_docs[ids2ind[q1]])  
        d2v2 = d2v_dm.infer_vector(cleaned_docs[ids2ind[q2]])  
        
        d2vbow1 = d2v_dbow.infer_vector(cleaned_docs[ids2ind[q1]])  
        d2vbow2 = d2v_dbow.infer_vector(cleaned_docs[ids2ind[q2]])
        
        X_test[i,9] = 1 - cosine_similarity(d2vbow1.reshape(1, -1), d2vbow2.reshape(1, -1))
        X_test[i,10] = 1 - cosine_similarity(d2v1.reshape(1, -1), d2v2.reshape(1, -1))
        X_test[i,11] = dice((texts[q1].lower()).split(),(texts[q2].lower()).split())
        X_test[i,12] = jaccard((texts[q1].lower()).split(),(texts[q2].lower()).split())
        X_test[i,13] = overlap((texts[q1].lower()).split(),(texts[q2].lower()).split())
        X_test[i,14] = cosine_wd((texts[q1].lower()).split(),(texts[q2].lower()).split())
        X_test[i,16]= np.linalg.norm(LSA_features[ids2ind[q1],:].reshape(1, -1) - LSA_features[ids2ind[q2],:].reshape(1, -1))
        X_test[i,19] = np.linalg.norm(LSA_bis_features[ids2ind[q1],:].reshape(1, -1)- LSA_bis_features[ids2ind[q2],:].reshape(1, -1))
        X_test[i,20] = np.linalg.norm(d2vbow1.reshape(1, -1) -  d2vbow2.reshape(1, -1))
        X_test[i,21] = np.linalg.norm(d2v1.reshape(1, -1) -  d2v2.reshape(1, -1))
        X_test[i,22] = np.linalg.norm(A[ids2ind[q1],:].todense() - A[ids2ind[q2],:].todense())
        X_test[i,23]=  max(min(presence_of_why(texts[q1].lower()),1),min(presence_of_why(texts[q2].lower()),1))
        X_test[i,24]= max(min(presence_of_what(texts[q1].lower()),1),min(presence_of_what(texts[q2].lower()),1))
        X_test[i,25]= max(min(presence_of_when(texts[q1].lower()),1),min(presence_of_when(texts[q2].lower()),1))
        X_test[i,26]= max(min(presence_of_where(texts[q1].lower()),1),min(presence_of_where(texts[q2].lower()),1))
        X_test[i,27]= max(min(presence_of_how(texts[q1].lower()),1),min(presence_of_how(texts[q2].lower()),1))
        X_test[i,28]= min(min(presence_of_why(texts[q1].lower()),1),min(presence_of_why(texts[q2].lower()),1))
        X_test[i,29]= min(min(presence_of_what(texts[q1].lower()),1),min(presence_of_what(texts[q2].lower()),1))
        X_test[i,30]= min(min(presence_of_when(texts[q1].lower()),1),min(presence_of_when(texts[q2].lower()),1))
        X_test[i,31]= min(min(presence_of_where(texts[q1].lower()),1),min(presence_of_where(texts[q2].lower()),1))
        X_test[i,32]=  min(min(presence_of_how(texts[q1].lower()),1),min(presence_of_how(texts[q2].lower()),1))
        X_test[i,33]= fuzz.token_set_ratio(texts[q1],texts[q2])/100
        X_test[i,34]= fuzz.token_sort_ratio(texts[q1],texts[q2])/100
        X_test[i,45] = abs(len(texts[q1].lower())-len(texts[q2].lower()))
        X_test[i,46] = abs(len([j for j in texts[q1] if j=='?'])-len([j for j in texts[q2] if j=='?']))
        X_test[i,47] = len(texts[q1].split()) + len(texts[q2].split())
        X_test[i,48] = distance.cityblock(d2v1.reshape(1, -1),d2v2.reshape(1, -1))
        X_test[i,49] = distance.jaccard(d2v1.reshape(1, -1),d2v2.reshape(1, -1))
        X_test[i,50] = distance.canberra(d2v1.reshape(1, -1),d2v2.reshape(1, -1))
        X_test[i,51] = distance.minkowski(d2v1.reshape(1, -1),d2v2.reshape(1, -1),3)
        X_test[i,52] = distance.braycurtis(d2v1.reshape(1, -1),d2v2.reshape(1, -1))
        X_test[i,53] = distance.cityblock(LSA_features[ids2ind[q1],:].reshape(1, -1),LSA_features[ids2ind[q2],:].reshape(1, -1))
        X_test[i,54] = distance.jaccard(LSA_features[ids2ind[q1],:].reshape(1, -1),LSA_features[ids2ind[q2],:].reshape(1, -1))
        X_test[i,55] = distance.canberra(LSA_features[ids2ind[q1],:].reshape(1, -1),LSA_features[ids2ind[q2],:].reshape(1, -1))
        X_test[i,56] = distance.minkowski(LSA_features[ids2ind[q1],:].reshape(1, -1),LSA_features[ids2ind[q2],:].reshape(1, -1),3)
        X_test[i,57] = distance.braycurtis(LSA_features[ids2ind[q1],:].reshape(1, -1),LSA_features[ids2ind[q2],:].reshape(1, -1))
        X_test[i,58] = unmatching((texts[q1].lower()).split(),(texts[q2].lower()).split())
        X_test[i,59] = is_first_word_same((texts[q1].lower()).split(),(texts[q2].lower()).split())
        X_test[i,60] = is_last_word_same((texts[q1].lower()).split(),(texts[q2].lower()).split())
        X_test[i,61] = abs(int(q1) - int(q2))
        X_test[i,62] = abs((int(q1) + int(q2))/2)
        X_test[i,63] = abs(min(int(q1),int(q2)))
        X_test[i,64] = common_n_grams((texts[q1].lower()).split(),(texts[q2].lower()).split(),2)
        X_test[i,65] = common_n_grams((texts[q1].lower()).split(),(texts[q2].lower()).split(),3)
        X_test[i,66] = common_n_grams((texts[q1].lower()).split(),(texts[q2].lower()).split(),4)
        X_test[i,67] = common_n_grams((texts[q1].lower()).split(),(texts[q2].lower()).split(),5)
        X_test[i,68] = common_n_grams((texts[q1].lower()).split(),(texts[q2].lower()).split(),6)
        X_test[i,69] = not_common_n_grams((texts[q1].lower()).split(),(texts[q2].lower()).split(),2)
        X_test[i,70] = not_common_n_grams((texts[q1].lower()).split(),(texts[q2].lower()).split(),3)
        X_test[i,71] = not_common_n_grams((texts[q1].lower()).split(),(texts[q2].lower()).split(),4)
        X_test[i,72] = not_common_n_grams((texts[q1].lower()).split(),(texts[q2].lower()).split(),5)
        X_test[i,73] = not_common_n_grams((texts[q1].lower()).split(),(texts[q2].lower()).split(),6)

        X_test[i,74] = dice((texts[q1].lower()).split(),(texts[q2].lower()).split(),stemming=False)
        X_test[i,75] = jaccard((texts[q1].lower()).split(),(texts[q2].lower()).split(),stemming=False)
        X_test[i,76] = overlap((texts[q1].lower()).split(),(texts[q2].lower()).split(),stemming=False)
        X_test[i,77] = cosine_wd((texts[q1].lower()).split(),(texts[q2].lower()).split(),stemming=False)
        X_test[i,78] = min(matching((texts[q1].lower()).split(),(texts[q2].lower()).split(),stemming=False),7)
        X_test[i,79] = unmatching((texts[q1].lower()).split(),(texts[q2].lower()).split(),stemming=False)
        X_test[i,80] = common_n_grams((texts[q1].lower()).split(),(texts[q2].lower()).split(),2,stemming=False)
        X_test[i,81] = common_n_grams((texts[q1].lower()).split(),(texts[q2].lower()).split(),3,stemming=False)
        X_test[i,82] = common_n_grams((texts[q1].lower()).split(),(texts[q2].lower()).split(),4,stemming=False)
        X_test[i,83] = common_n_grams((texts[q1].lower()).split(),(texts[q2].lower()).split(),5,stemming=False)
        X_test[i,84] = common_n_grams((texts[q1].lower()).split(),(texts[q2].lower()).split(),6,stemming=False)
        X_test[i,85] = not_common_n_grams((texts[q1].lower()).split(),(texts[q2].lower()).split(),2,stemming=False)
        X_test[i,86] = not_common_n_grams((texts[q1].lower()).split(),(texts[q2].lower()).split(),3,stemming=False)
        X_test[i,87] = not_common_n_grams((texts[q1].lower()).split(),(texts[q2].lower()).split(),4,stemming=False)
        X_test[i,88] = not_common_n_grams((texts[q1].lower()).split(),(texts[q2].lower()).split(),5,stemming=False)
        X_test[i,89] = not_common_n_grams((texts[q1].lower()).split(),(texts[q2].lower()).split(),6,stemming=False)


        X_test[i,90] = distance.cityblock(LSA_bis_features[ids2ind[q1],:].reshape(1, -1),LSA_bis_features[ids2ind[q2],:].reshape(1, -1))
        X_test[i,91] = distance.jaccard(LSA_bis_features[ids2ind[q1],:].reshape(1, -1),LSA_bis_features[ids2ind[q2],:].reshape(1, -1))
        X_test[i,92] = distance.canberra(LSA_bis_features[ids2ind[q1],:].reshape(1, -1),LSA_bis_features[ids2ind[q2],:].reshape(1, -1))
        X_test[i,93] = distance.minkowski(LSA_bis_features[ids2ind[q1],:].reshape(1, -1),LSA_bis_features[ids2ind[q2],:].reshape(1, -1),3)
        X_test[i,94] = distance.braycurtis(LSA_bis_features[ids2ind[q1],:].reshape(1, -1),LSA_bis_features[ids2ind[q2],:].reshape(1, -1))
                
        
        X_test[i,95] = distance.cityblock(A[ids2ind[q1],:].todense().reshape(1, -1),A[ids2ind[q2],:].todense().reshape(1, -1))
        X_test[i,96] = distance.jaccard(A[ids2ind[q1],:].todense().reshape(1, -1),A[ids2ind[q2],:].todense().reshape(1, -1))
        X_test[i,97] = distance.canberra(A[ids2ind[q1],:].todense().reshape(1, -1),A[ids2ind[q2],:].todense().reshape(1, -1))
        X_test[i,98] = distance.minkowski(A[ids2ind[q1],:].todense().reshape(1, -1),A[ids2ind[q2],:].todense().reshape(1, -1),3)
        X_test[i,99] = distance.braycurtis(A[ids2ind[q1],:].todense().reshape(1, -1),A[ids2ind[q2],:].todense().reshape(1, -1))
        
        
        X_test[i,100] = 1- cosine_similarity(B[ids2ind[q1],:], B[ids2ind[q2],:])
        X_test[i,101] = np.linalg.norm(B[ids2ind[q1],:].todense() - B[ids2ind[q2],:].todense())
        X_test[i,102] = distance.cityblock(B[ids2ind[q1],:].todense().reshape(1, -1),B[ids2ind[q2],:].todense().reshape(1, -1))
        X_test[i,103] = distance.jaccard(B[ids2ind[q1],:].todense().reshape(1, -1),B[ids2ind[q2],:].todense().reshape(1, -1))
        X_test[i,104] = distance.canberra(B[ids2ind[q1],:].todense().reshape(1, -1),B[ids2ind[q2],:].todense().reshape(1, -1))
        X_test[i,105] = distance.minkowski(B[ids2ind[q1],:].todense().reshape(1, -1),B[ids2ind[q2],:].todense().reshape(1, -1),3)
        X_test[i,106] = distance.braycurtis(B[ids2ind[q1],:].todense().reshape(1, -1),B[ids2ind[q2],:].todense().reshape(1, -1))
        
        X_test[i,107] = 1- cosine_similarity(C[ids2ind[q1],:], C[ids2ind[q2],:])
        X_test[i,108] = np.linalg.norm(C[ids2ind[q1],:].todense() - C[ids2ind[q2],:].todense())
        X_test[i,109] = distance.cityblock(C[ids2ind[q1],:].todense().reshape(1, -1),C[ids2ind[q2],:].todense().reshape(1, -1))
        X_test[i,110] = distance.jaccard(C[ids2ind[q1],:].todense().reshape(1, -1),C[ids2ind[q2],:].todense().reshape(1, -1))
        X_test[i,111] = distance.canberra(C[ids2ind[q1],:].todense().reshape(1, -1),C[ids2ind[q2],:].todense().reshape(1, -1))
        X_test[i,112] = distance.minkowski(C[ids2ind[q1],:].todense().reshape(1, -1),C[ids2ind[q2],:].todense().reshape(1, -1),3)
        X_test[i,113] = distance.braycurtis(C[ids2ind[q1],:].todense().reshape(1, -1),C[ids2ind[q2],:].todense().reshape(1, -1))
        
        X_test[i,114] = 1- cosine_similarity(D[ids2ind[q1],:], D[ids2ind[q2],:])
        X_test[i,115] = np.linalg.norm(D[ids2ind[q1],:].todense() - D[ids2ind[q2],:].todense())
        X_test[i,116] = distance.cityblock(D[ids2ind[q1],:].todense().reshape(1, -1),D[ids2ind[q2],:].todense().reshape(1, -1))
        X_test[i,117] = distance.jaccard(D[ids2ind[q1],:].todense().reshape(1, -1),D[ids2ind[q2],:].todense().reshape(1, -1))
        X_test[i,118] = distance.canberra(D[ids2ind[q1],:].todense().reshape(1, -1),D[ids2ind[q2],:].todense().reshape(1, -1))
        X_test[i,119] = distance.minkowski(D[ids2ind[q1],:].todense().reshape(1, -1),D[ids2ind[q2],:].todense().reshape(1, -1),3)
        X_test[i,120] = distance.braycurtis(D[ids2ind[q1],:].todense().reshape(1, -1),D[ids2ind[q2],:].todense().reshape(1, -1))
        
        X_test[i,121] = 1 - cosine_similarity(LSA_features_stem[ids2ind[q1],:].reshape(1, -1), LSA_features_stem[ids2ind[q2],:].reshape(1, -1))
        X_test[i,122] = np.linalg.norm(LSA_features_stem[ids2ind[q1],:].reshape(1, -1)- LSA_features_stem[ids2ind[q2],:].reshape(1, -1))
        X_test[i,123] = distance.cityblock(LSA_features_stem[ids2ind[q1],:].reshape(1, -1),LSA_features_stem[ids2ind[q2],:].reshape(1, -1))
        X_test[i,124] = distance.jaccard(LSA_features_stem[ids2ind[q1],:].reshape(1, -1),LSA_features_stem[ids2ind[q2],:].reshape(1, -1))
        X_test[i,125] = distance.canberra(LSA_features_stem[ids2ind[q1],:].reshape(1, -1),LSA_features_stem[ids2ind[q2],:].reshape(1, -1))
        X_test[i,126] = distance.minkowski(LSA_features_stem[ids2ind[q1],:].reshape(1, -1),LSA_features_stem[ids2ind[q2],:].reshape(1, -1),3)
        X_test[i,127] = distance.braycurtis(LSA_features_stem[ids2ind[q1],:].reshape(1, -1),LSA_features_stem[ids2ind[q2],:].reshape(1, -1))

        X_test[i,128] = 1 - cosine_similarity(LSA_bis_features_stem[ids2ind[q1],:].reshape(1, -1), LSA_bis_features_stem[ids2ind[q2],:].reshape(1, -1))
        X_test[i,129] = np.linalg.norm(LSA_bis_features_stem[ids2ind[q1],:].reshape(1, -1)- LSA_bis_features_stem[ids2ind[q2],:].reshape(1, -1))
        X_test[i,130] = distance.cityblock(LSA_bis_features_stem[ids2ind[q1],:].reshape(1, -1),LSA_bis_features_stem[ids2ind[q2],:].reshape(1, -1))
        X_test[i,131] = distance.jaccard(LSA_bis_features_stem[ids2ind[q1],:].reshape(1, -1),LSA_bis_features_stem[ids2ind[q2],:].reshape(1, -1))
        X_test[i,132] = distance.canberra(LSA_bis_features_stem[ids2ind[q1],:].reshape(1, -1),LSA_bis_features_stem[ids2ind[q2],:].reshape(1, -1))
        X_test[i,133] = distance.minkowski(LSA_bis_features_stem[ids2ind[q1],:].reshape(1, -1),LSA_bis_features_stem[ids2ind[q2],:].reshape(1, -1),3)
        X_test[i,134] = distance.braycurtis(LSA_bis_features_stem[ids2ind[q1],:].reshape(1, -1),LSA_bis_features_stem[ids2ind[q2],:].reshape(1, -1))



    return X_train,X_test
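# Hedged usage sketch (not part of the original code): assuming the inputs have
# been built upstream (texts as a dict id -> question, ids2ind mapping ids to
# matrix rows, word2vec/GloVe matrices and keras sequences aligned with them),
# the feature matrices would be produced along these lines:
#
#   X_tr, X_te = create_all_simple_features(list_pairs_train, list_pairs_test,
#                                           texts, ids2ind, word_vectors,
#                                           embedding_matrix, glove_embedding,
#                                           sequences, data_full_word,
#                                           my_p=50, n_lsa=40)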
Exemple #21
0
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz

df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
len_train = df_train.shape[0]

df_feat = pd.DataFrame()
df_data = pd.concat([
    df_train[['question1', 'question2']], df_test[['question1', 'question2']]
],
                    axis=0)

df_feat['fuzz_qratio'] = df_data.apply(
    lambda row: fuzz.QRatio(str(row['question1']), str(row['question2'])),
    axis=1)
df_feat['fuzz_WRatio'] = df_data.apply(
    lambda row: fuzz.WRatio(str(row['question1']), str(row['question2'])),
    axis=1)
df_feat['fuzz_partial_ratio'] = df_data.apply(lambda row: fuzz.partial_ratio(
    str(row['question1']), str(row['question2'])),
                                              axis=1)
df_feat['fuzz_partial_token_set_ratio'] = df_data.apply(
    lambda row: fuzz.partial_token_set_ratio(str(row['question1']),
                                             str(row['question2'])),
    axis=1)
df_feat['fuzz_partial_token_sort_ratio'] = df_data.apply(
    lambda row: fuzz.partial_token_sort_ratio(str(row['question1']),
                                              str(row['question2'])),
    axis=1)
Exemple #22
0
x_train['shared_words_length'] = temp_df['allR'].apply(lambda x: float(x.split(':')[4]))

# Set 2
x_train['tfidf'] = df_train.apply(tfidf_word_match_share, axis = 1, raw = True)

# Set 3
x_train['q1_word_count'] = df_train['question1'].apply(lambda x: len(str(x).lower().split()))
x_train['q2_word_count'] = df_train['question2'].apply(lambda x: len(str(x).lower().split()))
x_train['diff_word_count'] = x_train['q1_word_count'] - x_train['q2_word_count']

x_train['q1_char_count_withspace'] = df_train['question1'].apply(lambda x: len(str(x)))
x_train['q2_char_count_withspace'] = df_train['question2'].apply(lambda x: len(str(x)))
x_train['diff_char_count_withspace'] = x_train['q1_char_count_withspace'] - x_train['q2_char_count_withspace']

# Set 4
x_train['fuzz_qratio'] = df_train.apply(lambda x: fuzz.QRatio(str(x['question1']), str(x['question2'])), axis=1)
x_train['fuzz_WRatio'] = df_train.apply(lambda x: fuzz.WRatio(str(x['question1']), str(x['question2'])), axis=1)
x_train['fuzz_partial_ratio'] = df_train.apply(lambda x: fuzz.partial_ratio(str(x['question1']), str(x['question2'])), axis=1)
x_train['fuzz_partial_token_set_ratio'] = df_train.apply(lambda x: fuzz.partial_token_set_ratio(str(x['question1']), str(x['question2'])), axis=1)
x_train['fuzz_partial_token_sort_ratio'] = df_train.apply(lambda x: fuzz.partial_token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1)
x_train['fuzz_token_set_ratio'] = df_train.apply(lambda x: fuzz.token_set_ratio(str(x['question1']), str(x['question2'])), axis=1)
x_train['fuzz_token_sort_ratio'] = df_train.apply(lambda x: fuzz.token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1)

# Set 5
x_train['jaccard_dist'] = df_train.apply(jaccard_dist, axis = 1)
x_train['cosine_dist'] = df_train.apply(cosine_dist, axis = 1)

del temp_df

################################################################################
################################################################################
Exemple #23
0
import pickle

from fuzzywuzzy import fuzz as fuzzy  # assumed import: this fragment calls the ratio functions through the name `fuzzy`

if __name__ == '__main__':
    # Switch between the two blocks below depending on which split to featurize
    # (in the source both were present and the first pair was immediately overwritten).
    # data_path = './data/train.pickle'
    # output_path = './data/train_fuzzy.pickle'

    data_path = './data/test.pickle'
    output_path = './data/test_fuzzy.pickle'

    with open(data_path, 'rb') as f:
        data = pickle.load(f)

    result = []
    for i in range(len(data)):
        sentence_q1 = ' '.join(data[i]['question1'])
        sentence_q2 = ' '.join(data[i]['question2'])

        qratio = fuzzy.QRatio(sentence_q1, sentence_q2)
        wratio = fuzzy.WRatio(sentence_q1, sentence_q2)
        ratio = fuzzy.ratio(sentence_q1, sentence_q2)
        partial_ratio = fuzzy.partial_ratio(sentence_q1, sentence_q2)
        partial_token_set_ratio = fuzzy.partial_token_set_ratio(
            sentence_q1, sentence_q2)
        partial_token_sort_ratio = fuzzy.partial_token_sort_ratio(
            sentence_q1, sentence_q2)
        token_set_ratio = fuzzy.token_set_ratio(sentence_q1, sentence_q2)
        token_sort_ratio = fuzzy.token_sort_ratio(sentence_q1, sentence_q2)

        fuzzyee = [
            qratio, wratio, ratio, partial_ratio, partial_token_sort_ratio,
            token_set_ratio, token_sort_ratio
        ]
        fuzzyee.append(partial_token_set_ratio)
Exemple #24
0
def engineer(data):
    import pickle
    import pandas as pd
    import numpy as np
    import gensim
    from fuzzywuzzy import fuzz
    import nltk
    from nltk import pos_tag
    from nltk.stem.wordnet import WordNetLemmatizer
    from nltk.corpus import stopwords
    from tqdm import tqdm
    from scipy.spatial.distance import cosine, euclidean
    from nltk import word_tokenize
    import re

    stop_words = stopwords.words('english')

    def clean_text(text):
        """ Pre process and convert texts to a list of words """
        text = str(text)
        text = text.lower()

        # Clean the text
        text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
        text = re.sub(r"what's", "what is ", text)
        text = re.sub(r"\'s", " is ", text)
        text = re.sub(r"\'ve", " have ", text)
        text = re.sub(r"can't", "cannot ", text)
        text = re.sub(r"n't", " not ", text)
        text = re.sub(r"i'm", "i am ", text)
        text = re.sub(r"\'re", " are ", text)
        text = re.sub(r"\'d", " would ", text)
        text = re.sub(r"\'ll", " will ", text)
        text = re.sub(r",", " ", text)
        text = re.sub(r"\.", " ", text)
        text = re.sub(r"!", " ! ", text)
        text = re.sub(r"\/", " ", text)
        text = re.sub(r"\^", " ^ ", text)
        text = re.sub(r"\+", " + ", text)
        text = re.sub(r"[a-z]+\-[a-z]+", "", text)
        text = re.sub(r"[a-z]+\-", "", text)
        text = re.sub(r"\-[a-z]+", "", text)
        text = re.sub(r"\-", " - ", text)
        text = re.sub(r"\=", " = ", text)
        text = re.sub(r"'", " ", text)
        text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
        text = re.sub(r":", " : ", text)
        text = re.sub(r" e g ", " eg ", text)
        text = re.sub(r" b g ", " bg ", text)
        text = re.sub(r" u s ", " american ", text)
        text = re.sub(r"\0s", "0", text)
        text = re.sub(r" 9 11 ", "911", text)
        text = re.sub(r"e - mail", "email", text)

        return text

    model = gensim.models.KeyedVectors.load_word2vec_format('D:\papers\Project Material\data\GoogleNews-vectors'
                                                            '-negative300.bin.gz',
                                                            binary=True)

    def wmd(s1, s2):
        s1 = str(s1).lower().split()
        s2 = str(s2).lower().split()
        stop_words = stopwords.words('english')
        s1 = [w for w in s1 if w not in stop_words]
        s2 = [w for w in s2 if w not in stop_words]
        return round(model.wmdistance(s1, s2), 3)
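    # Illustrative aside (not from the original code): with the pretrained vectors
    # loaded above, wmd returns a non-negative float that shrinks as the two
    # sentences get closer in meaning, e.g. (exact values depend on the model):
    #   wmd('How do I learn Python?', 'What is the best way to learn Python?')
    #   wmd('How do I learn Python?', 'How do I learn Python?')  # -> 0.0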

    def sent2vec(s):
        words = str(s).lower()
        words = word_tokenize(words)
        words = [w for w in words if not w in stop_words]
        words = [w for w in words if w.isalpha()]
        M = []
        for w in words:
            try:
                M.append(model[w])
            except:
                continue
        M = np.array(M)
        v = M.sum(axis=0)
        return v / np.sqrt((v ** 2).sum())

    wh = ['where', 'why', 'what', 'who', 'whom', 'how', 'when', 'is', 'am', 'are', 'has', 'have', 'had', 'do', 'does',
          'did']
    for x in wh:
        if x in stop_words:
            stop_words.remove(x)

    for s in data.head()['question1']:
        print(s, '\n')

    data['question1'] = data.question1.apply(lambda x: clean_text(x))
    data['question2'] = data.question2.apply(lambda x: clean_text(x))

    for s in data.head()['question1']:
        print(s, '\n')

    # Added Features.
    data['word_overlap'] = [set(x[0].split()) & set(x[1].split()) for x in data[['question1', 'question2']].values]
    data['common_word_cnt'] = data['word_overlap'].str.len()

    data['text1_nostop'] = data['question1'].apply(lambda x: " ".join(x for x in x.split() if x not in stop_words))
    data['text2_nostop'] = data['question2'].apply(lambda x: " ".join(x for x in x.split() if x not in stop_words))

    data['word_overlap'] = [set(x[0].split()) & set(x[1].split()) for x in
                            data[['text1_nostop', 'text2_nostop']].values]
    data['common_nonstop_word_cnt'] = data['word_overlap'].str.len()

    data['char_cnt_1'] = data['question1'].str.len()
    data['char_cnt_2'] = data['question2'].str.len()
    data['char_cnt_diff'] = (data['char_cnt_1'] - data['char_cnt_2']) ** 2
    data['word_cnt_1'] = data['question1'].apply(lambda x: len(str(x).split()))
    data['word_cnt_2'] = data['question2'].apply(lambda x: len(str(x).split()))
    data['word_cnt_diff'] = (data['word_cnt_1'] - data['word_cnt_2']) ** 2
    data['len_q1'] = data.question1.apply(lambda x: len(str(x)))
    data['len_q2'] = data.question2.apply(lambda x: len(str(x)))
    data['diff_len'] = (data.len_q1 - data.len_q2) ** 2

    text1 = list(data['question1'])
    text2 = list(data['question2'])
    corpus1 = ' '.join(text1)
    corpus2 = ' '.join(text2)
    corpus = corpus1.lower() + corpus2.lower()

    lem = WordNetLemmatizer()

    corpus = lem.lemmatize(corpus, "v")
    # corpus = stem.stem(corpus)

    tags = pos_tag(corpus.split())
    nouns = [i[0] for i in tags if i[1] in ("NN", "NNS", "NNP", "NNPS")]

    def count_common_nouns(var1, var2, var3):
        count = 0
        for i in var1:
            if (i in var2) & (i in var3):
                count += 1
        return count

    data['text1_lower'] = data['question1'].apply(lambda x: x.lower())
    data['text2_lower'] = data['question2'].apply(lambda x: x.lower())
    data['common_noun_cnt'] = [
        count_common_nouns(nltk.word_tokenize(lem.lemmatize(x[0], "v")), nltk.word_tokenize(lem.lemmatize(x[1], "v")),
                           nouns) for x in data[['question1', 'question2']].values]

    # FUZZ WUZZ Features
    data['fuzz_qratio'] = data.apply(lambda x: fuzz.QRatio(str(x['question1']), str(x['question2'])), axis=1)
    data['fuzz_WRatio'] = data.apply(lambda x: fuzz.WRatio(str(x['question1']), str(x['question2'])), axis=1)
    data['fuzz_partial_ratio'] = data.apply(lambda x: fuzz.partial_ratio(str(x['question1']), str(x['question2'])),
                                            axis=1)
    data['fuzz_partial_token_set_ratio'] = data.apply(
        lambda x: fuzz.partial_token_set_ratio(str(x['question1']), str(x['question2'])), axis=1)
    data['fuzz_partial_token_sort_ratio'] = data.apply(
        lambda x: fuzz.partial_token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1)
    data['fuzz_token_set_ratio'] = data.apply(lambda x: fuzz.token_set_ratio(str(x['question1']), str(x['question2'])),
                                              axis=1)
    data['fuzz_token_sort_ratio'] = data.apply(
        lambda x: fuzz.token_sort_ratio(str(x['question1']), str(x['question2'])),
        axis=1)

    data['wmd'] = data.apply(lambda x: wmd(x['question1'], x['question2']), axis=1)

    question1_vectors = np.zeros((data.shape[0], 300))
    error_count = 0

    for i, q in tqdm(enumerate(data.question1.values)):
        question1_vectors[i, :] = sent2vec(q)

    question2_vectors = np.zeros((data.shape[0], 300))
    for i, q in tqdm(enumerate(data.question2.values)):
        question2_vectors[i, :] = sent2vec(q)

    data['cosine_distance'] = [round(cosine(x, y), 3) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                                        np.nan_to_num(question2_vectors))]

    data['euclidean_distance'] = [round(euclidean(x, y), 3) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                                              np.nan_to_num(question2_vectors))]

    to_remove = ['word_overlap', 'text1_lower', 'text2_lower', 'question1', 'question2', 'test_id', 'text1_nostop',
                 'text2_nostop']
    data = data.drop(to_remove, axis=1) 
    data.to_csv(r'D:\papers\Project Material\new_try\CQA\forum\\revised.csv', index=False)
    return data
def fuzz_QRatio(sentences):
    sen = sentences.split("\001")
    return fuzz.QRatio(sen[0], sen[1])
Exemple #26
0
    def testFuzzy(self):
        print(
            'ratio',
            fuzz.ratio('MISSION HOSPITAL',
                       'MISSION HOSPITAL REGIONAL MEDICAL CENTER'))
        print(
            'ratio',
            fuzz.ratio('MISSION HOSPITAL REGIONAL MEDICAL CENTER',
                       'MISSION HOSPITAL'))

        print(
            'partial_ratio',
            fuzz.partial_ratio('MISSION HOSPITAL',
                               'MISSION HOSPITAL REGIONAL MEDICAL CENTER'))
        print(
            'partial_ratio',
            fuzz.partial_ratio('MISSION HOSPITAL REGIONAL MEDICAL CENTER',
                               'MISSION HOSPITAL'))

        print(
            'token_sort_ratio',
            fuzz.token_sort_ratio('MISSION HOSPITAL',
                                  'MISSION HOSPITAL REGIONAL MEDICAL CENTER'))
        print(
            'token_sort_ratio',
            fuzz.token_sort_ratio('MISSION HOSPITAL REGIONAL MEDICAL CENTER',
                                  'MISSION HOSPITAL'))

        print(
            'partial_token_sort_ratio',
            fuzz.partial_token_sort_ratio(
                'MISSION HOSPITAL',
                'MISSION HOSPITAL REGIONAL MEDICAL CENTER'))
        print(
            'partial_token_sort_ratio',
            fuzz.partial_token_sort_ratio(
                'MISSION HOSPITAL REGIONAL MEDICAL CENTER',
                'MISSION HOSPITAL',
            ))

        print(
            'token_set_ratio',
            fuzz.token_set_ratio('MISSION HOSPITAL',
                                 'MISSION HOSPITAL REGIONAL MEDICAL CENTER'))
        print(
            'token_set_ratio',
            fuzz.token_set_ratio('MISSION HOSPITAL REGIONAL MEDICAL CENTER',
                                 'MISSION HOSPITAL'))

        print(
            'partial_token_set_ratio',
            fuzz.partial_token_set_ratio(
                'MISSION HOSPITAL',
                'MISSION HOSPITAL REGIONAL MEDICAL CENTER'))
        print(
            'partial_token_set_ratio',
            fuzz.partial_token_set_ratio(
                'MISSION HOSPITAL REGIONAL MEDICAL CENTER',
                'MISSION HOSPITAL',
            ))

        print(
            'QRatio',
            fuzz.QRatio('MISSION HOSPITAL',
                        'MISSION HOSPITAL REGIONAL MEDICAL CENTER'))
        print(
            'QRatio',
            fuzz.QRatio('MISSION HOSPITAL REGIONAL MEDICAL CENTER',
                        'MISSION HOSPITAL'))

        print(
            'UQRatio',
            fuzz.UQRatio('MISSION HOSPITAL',
                         'MISSION HOSPITAL REGIONAL MEDICAL CENTER'))
        print(
            'UQRatio',
            fuzz.UQRatio('MISSION HOSPITAL REGIONAL MEDICAL CENTER',
                         'MISSION HOSPITAL'))

        print(
            'WRatio',
            fuzz.WRatio('MISSION HOSPITAL',
                        'MISSION HOSPITAL REGIONAL MEDICAL CENTER'))
        print(
            'WRatio',
            fuzz.WRatio(
                'MISSION HOSPITAL REGIONAL MEDICAL CENTER',
                'MISSION HOSPITAL',
            ))

        print(
            'UWRatio',
            fuzz.UWRatio('MISSION HOSPITAL',
                         'MISSION HOSPITAL REGIONAL MEDICAL CENTER'))
        print(
            'UWRatio',
            fuzz.UWRatio('MISSION HOSPITAL REGIONAL MEDICAL CENTER',
                         'MISSION HOSPITAL'))

        pass
Exemple #27
0
    def testQuickRatioNotEqual(self):
        self.assertNotEqual(fuzz.QRatio(self.s1, self.s3), 100)
# ratio of difference in these lengths to total length
df['diff_len_word_ratio'] = abs(df.len_word_q1 - df.len_word_q2) / (df.len_word_q1 + df.len_word_q2)

# Number of common words in question1 and question2
df['common_words'] = df.apply(lambda x: len(set(str(x['question1']).lower().split()).intersection(set(str(x['question2']).lower().split()))), axis=1)

# ratio of number of common words to average length of the questions
df['common_words_ratio'] = 2* df.common_words / (df.len_word_q1 + df.len_word_q2)
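# Illustrative aside (not from the original code): if question1 has 8 words,
# question2 has 6 and they share 4, then
#   common_words_ratio = 2 * 4 / (8 + 6) ≈ 0.571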

df.to_csv('fs1.csv', index=False)

#Fuzzy features

# Q-ratio
df['fuzz_Qratio'] = df.apply(lambda x: fuzz.QRatio(str(x['question1']), str(x['question2'])), axis=1)

# W-ratio
df['fuzz_WRatio'] = df.apply(lambda x: fuzz.WRatio(str(x['question1']), str(x['question2'])), axis=1)

# Partial ratio
df['fuzz_partial_ratio'] = df.apply(lambda x: fuzz.partial_ratio(str(x['question1']), str(x['question2'])), axis=1)

# Partial token set ratio
df['fuzz_partial_token_set_ratio'] = df.apply(lambda x: fuzz.partial_token_set_ratio(str(x['question1']), str(x['question2'])), axis=1)

# Partial token sort ratio
df['fuzz_partial_token_sort_ratio'] = df.apply(lambda x: fuzz.partial_token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1)

# Token set ratio
df['fuzz_token_set_ratio'] = df.apply(lambda x: fuzz.token_set_ratio(str(x['question1']), str(x['question2'])), axis=1)
Exemple #29
0
    def testQuickRatioCaseInsensitive(self):
        self.assertEqual(fuzz.QRatio(self.s1, self.s2), 100)
vectorizer = CountVectorizer()
features = vectorizer.fit_transform(corpus2).todense()

for feature in features:
    print("cosine ", cosine_distances(features[0], feature))
for feature in features:
    print("euclidean ", euclidean_distances(features[0], feature))

print()
for sentence in corpus2:
    print("levenshtein ", distance.levenshtein(corpus2[0], sentence))
for sentence in corpus2:
    print("jaccard ", distance.jaccard(corpus2[0], sentence))
for sentence in corpus2:
    print("fuzzy ", fuzz.QRatio(corpus2[0], sentence))



# from owlready2 import *

# repository = "/Users/jairoandresarizacastaneda/Desktop/IoT segundo semestre/codigo gateway/ontology/repository"
# filename = "hub_iot_qos.owl"
# onto_path.append(repository)
# onto_hub = get_ontology("file:///"+repository+"/"+filename).load()

# for individual in onto_hub.individuals():
#     # print(individual)
#     # if individual.aboutProperty:
#     #     print("prop", individual.aboutProperty)
#     if isinstance(individual, onto_hub.Service):