def main():
    stops = stopwords()
    train_path = "../data/train.csv"
    test_path = "../data/test.csv"
    Fields = FieldClass()

    train_df = pd.read_csv(train_path, sep='\t')
    test_df = pd.read_csv(test_path, sep='\t')

    # Lower-case and tokenize both questions.
    train_df['question1'] = train_df['question1'].map(lambda x: str(x).lower().split())
    train_df['question2'] = train_df['question2'].map(lambda x: str(x).lower().split())
    test_df['question1'] = test_df['question1'].map(lambda x: str(x).lower().split())
    test_df['question2'] = test_df['question2'].map(lambda x: str(x).lower().split())

    # Word weights estimated from the training questions.
    train_qs = pd.Series(train_df['question1'].tolist() + train_df['question2'].tolist())
    words = [x for y in train_qs for x in y]
    counts = Counter(words)
    weights = {word: get_weight(count) for word, count in counts.items()}

    f = functools.partial(word_match_share, stops=stops)
    train_df['word_match'] = train_df.apply(f, axis=1, raw=True)
    test_df['word_match'] = test_df.apply(f, axis=1, raw=True)

    train_df['jaccard'] = train_df.apply(jaccard, axis=1, raw=True)
    test_df['jaccard'] = test_df.apply(jaccard, axis=1, raw=True)

    train_df['wc_diff'] = train_df.apply(wc_diff, axis=1, raw=True)
    test_df['wc_diff'] = test_df.apply(wc_diff, axis=1, raw=True)

    train_df['wc_diff_unique'] = train_df.apply(wc_diff_unique, axis=1, raw=True)
    test_df['wc_diff_unique'] = test_df.apply(wc_diff_unique, axis=1, raw=True)

    train_df['wc_ratio_unique'] = train_df.apply(wc_ratio_unique, axis=1, raw=True)
    test_df['wc_ratio_unique'] = test_df.apply(wc_ratio_unique, axis=1, raw=True)

    f = functools.partial(wc_diff_unique_stop, stops=stops)
    train_df['wc_diff_unq_stop'] = train_df.apply(f, axis=1, raw=True)
    test_df['wc_diff_unq_stop'] = test_df.apply(f, axis=1, raw=True)

    f = functools.partial(wc_ratio_unique_stop, stops=stops)
    train_df['wc_ratio_unique_stop'] = train_df.apply(f, axis=1, raw=True)
    test_df['wc_ratio_unique_stop'] = test_df.apply(f, axis=1, raw=True)

    train_df['same_start'] = train_df.apply(same_start_word, axis=1, raw=True)
    test_df['same_start'] = test_df.apply(same_start_word, axis=1, raw=True)

    train_df['char_diff'] = train_df.apply(char_diff, axis=1, raw=True)
    test_df['char_diff'] = test_df.apply(char_diff, axis=1, raw=True)

    f = functools.partial(char_diff_unique_stop, stops=stops)
    train_df['char_diff_unq_stop'] = train_df.apply(f, axis=1, raw=True)
    test_df['char_diff_unq_stop'] = test_df.apply(f, axis=1, raw=True)

    train_df['total_unique_words'] = train_df.apply(total_unique_words, axis=1, raw=True)
    test_df['total_unique_words'] = test_df.apply(total_unique_words, axis=1, raw=True)

    f = functools.partial(total_unq_words_stop, stops=stops)
    train_df['total_unq_words_stop'] = train_df.apply(f, axis=1, raw=True)
    test_df['total_unq_words_stop'] = test_df.apply(f, axis=1, raw=True)

    train_df['char_ratio'] = train_df.apply(char_ratio, axis=1, raw=True)
    test_df['char_ratio'] = test_df.apply(char_ratio, axis=1, raw=True)

    f = functools.partial(tfidf_word_match_share, weights=weights)
    train_df['tfidf_wm'] = train_df.apply(f, axis=1, raw=True)
    test_df['tfidf_wm'] = test_df.apply(f, axis=1, raw=True)

    f = functools.partial(tfidf_word_match_share_stops, stops=stops, weights=weights)
    train_df['tfidf_wm_stops'] = train_df.apply(f, axis=1, raw=True)
    test_df['tfidf_wm_stops'] = test_df.apply(f, axis=1, raw=True)

    # counter
    compute_counters(train_df, test_df)

    # distance
    train_df[Fields.levenstein1] = train_df.apply(
        lambda r: levenshtein1(r[Fields.question1], r[Fields.question2]), axis=1)
    test_df[Fields.levenstein1] = test_df.apply(
        lambda r: levenshtein1(r[Fields.question1], r[Fields.question2]), axis=1)
    train_df[Fields.levenstein2] = train_df.apply(
        lambda r: levenshtein2(r[Fields.question1], r[Fields.question2]), axis=1)
    test_df[Fields.levenstein2] = test_df.apply(
        lambda r: levenshtein2(r[Fields.question1], r[Fields.question2]), axis=1)
    train_df[Fields.sorensen] = train_df.apply(
        lambda r: sorencen(r[Fields.question1], r[Fields.question2]), axis=1)
    test_df[Fields.sorensen] = test_df.apply(
        lambda r: sorencen(r[Fields.question1], r[Fields.question2]), axis=1)

    # fuzzy
    train_df[Fields.qratio] = train_df.apply(
        lambda row: fuzz.QRatio(str(row[Fields.question1]), str(row[Fields.question2])), axis=1)
    test_df[Fields.qratio] = test_df.apply(
        lambda row: fuzz.QRatio(str(row[Fields.question1]), str(row[Fields.question2])), axis=1)
    quality_qratio = compute_quality(train_df, Fields.qratio)

    train_df[Fields.wratio] = train_df.apply(
        lambda row: fuzz.WRatio(str(row[Fields.question1]), str(row[Fields.question2])), axis=1)
    test_df[Fields.wratio] = test_df.apply(
        lambda row: fuzz.WRatio(str(row[Fields.question1]), str(row[Fields.question2])), axis=1)
    quality_wratio = compute_quality(train_df, Fields.wratio)

    train_df[Fields.partial_ratio] = train_df.apply(
        lambda row: fuzz.partial_ratio(str(row[Fields.question1]), str(row[Fields.question2])), axis=1)
    test_df[Fields.partial_ratio] = test_df.apply(
        lambda row: fuzz.partial_ratio(str(row[Fields.question1]), str(row[Fields.question2])), axis=1)
    quality_partial_ratio = compute_quality(train_df, Fields.partial_ratio)

    train_df[Fields.partial_token_set_ratio] = train_df.apply(
        lambda row: fuzz.partial_token_set_ratio(str(row[Fields.question1]), str(row[Fields.question2])), axis=1)
    test_df[Fields.partial_token_set_ratio] = test_df.apply(
        lambda row: fuzz.partial_token_set_ratio(str(row[Fields.question1]), str(row[Fields.question2])), axis=1)
    quality_partial_token_set_ratio = compute_quality(train_df, Fields.partial_token_set_ratio)

    train_df[Fields.partial_token_sort_ratio] = train_df.apply(
        lambda row: fuzz.partial_token_sort_ratio(str(row[Fields.question1]), str(row[Fields.question2])), axis=1)
    test_df[Fields.partial_token_sort_ratio] = test_df.apply(
        lambda row: fuzz.partial_token_sort_ratio(str(row[Fields.question1]), str(row[Fields.question2])), axis=1)
    quality_partial_token_sort_ratio = compute_quality(train_df, Fields.partial_token_sort_ratio)

    train_df[Fields.token_set_ratio] = train_df.apply(
        lambda row: fuzz.token_set_ratio(str(row[Fields.question1]), str(row[Fields.question2])), axis=1)
    test_df[Fields.token_set_ratio] = test_df.apply(
        lambda row: fuzz.token_set_ratio(str(row[Fields.question1]), str(row[Fields.question2])), axis=1)
    quality_token_set_ratio = compute_quality(train_df, Fields.token_set_ratio)

    train_df[Fields.token_sort_ratio] = train_df.apply(
        lambda row: fuzz.token_sort_ratio(str(row[Fields.question1]), str(row[Fields.question2])), axis=1)
    test_df[Fields.token_sort_ratio] = test_df.apply(
        lambda row: fuzz.token_sort_ratio(str(row[Fields.question1]), str(row[Fields.question2])), axis=1)
    quality_token_sort_ratio = compute_quality(train_df, Fields.token_sort_ratio)

    train_df.to_csv(train_path, sep='\t', index=False)
    test_df.to_csv(test_path, sep='\t', index=False)
data['len_q1'] = data.question1.apply(lambda x: len(str(x)))
data['len_q2'] = data.question2.apply(lambda x: len(str(x)))
data['diff_len'] = data.len_q1 - data.len_q2
data['len_char_q1'] = data.question1.apply(
    lambda x: len(''.join(set(str(x).replace(' ', '')))))
data['len_char_q2'] = data.question2.apply(
    lambda x: len(''.join(set(str(x).replace(' ', '')))))
data['len_word_q1'] = data.question1.apply(lambda x: len(str(x).split()))
data['len_word_q2'] = data.question2.apply(lambda x: len(str(x).split()))
data['common_words'] = data.apply(
    lambda x: len(set(str(x['question1']).lower().split()).intersection(
        set(str(x['question2']).lower().split()))), axis=1)
data['fuzz_qratio'] = data.apply(
    lambda x: fuzz.QRatio(str(x['question1']), str(x['question2'])), axis=1)
data['fuzz_WRatio'] = data.apply(
    lambda x: fuzz.WRatio(str(x['question1']), str(x['question2'])), axis=1)
data['fuzz_partial_ratio'] = data.apply(
    lambda x: fuzz.partial_ratio(str(x['question1']), str(x['question2'])), axis=1)
data['fuzz_partial_token_set_ratio'] = data.apply(
    lambda x: fuzz.partial_token_set_ratio(str(x['question1']), str(x['question2'])), axis=1)
data['fuzz_partial_token_sort_ratio'] = data.apply(
    lambda x: fuzz.partial_token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1)
data['fuzz_token_set_ratio'] = data.apply(
    lambda x: fuzz.token_set_ratio(str(x['question1']), str(x['question2'])), axis=1)
def testQuickRatioEqual(self):
    self.assertEqual(fuzz.QRatio(self.s1, self.s1a), 100)
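# A minimal standalone sketch of the behavior the test above relies on
# (illustrative strings, not the suite's fixtures): QRatio fully preprocesses
# both inputs -- case folding and punctuation stripping -- before computing the
# Levenshtein-based ratio, so trivially different spellings still score 100.
from fuzzywuzzy import fuzz

assert fuzz.QRatio("new york mets", "new york mets") == 100
assert fuzz.QRatio("New York Mets!", "new york mets") == 100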
def build_features(data, stops):
    X = pd.DataFrame()
    log.info('Calculate tfidf')
    qs = pd.Series(data['question1'].tolist() + data['question2'].tolist())
    st = time.time()
    weights = calculate_tfidf(qs)
    log.info('...time for calculating tfidf: %.2f m' % ((time.time() - st) / 60))
    del qs

    log.info('Building features')
    X['len_q1'] = data.question1.apply(word_len)  # 1: Length of Q1 string
    X['len_q2'] = data.question2.apply(word_len)  # 2: Length of Q2 string
    X['len_diff'] = abs(X.len_q1 - X.len_q2)  # 3: Length difference between Q1 and Q2

    log.info('Building char features')
    X['len_char_q1'] = data.q1_split.apply(word_len_char)  # 4: Char length of Q1
    X['len_char_q2'] = data.q2_split.apply(word_len_char)  # 5: Char length of Q2
    X['len_char_diff'] = data.apply(len_char_diff, axis=1, raw=True)  # 6: Char length difference between Q1 and Q2
    X['char_diff_unq_stop'] = data.apply(char_diff_unique_stop, stops=stops, axis=1, raw=True)  # 7: set(6)
    X['char_ratio'] = data.apply(char_ratio, axis=1, raw=True)  # 8: Char length Q1 / char length Q2

    log.info('Building word count features')
    X['word_count_q1'] = data.q1_split.apply(word_count)  # 9: Word count of Q1
    X['word_count_q2'] = data.q2_split.apply(word_count)  # 10: Word count of Q2
    X['word_count_diff'] = data.apply(wc_diff, axis=1, raw=True)  # 11: Word count difference between Q1 and Q2
    X['word_count_ratio'] = data.apply(wc_ratio, axis=1, raw=True)  # 12: Word count Q1 / word count Q2
    X['total_unique_words'] = data.apply(total_unique_words, axis=1, raw=True)  # 13: Word count set(Q1 + Q2)
    X['wc_diff_unique'] = data.apply(wc_diff_unique, axis=1, raw=True)  # 14: Word count set(Q1) - word count set(Q2)
    X['wc_ratio_unique'] = data.apply(wc_ratio_unique, axis=1, raw=True)  # 15: Word count set(Q1) / word count set(Q2)
    X['total_unq_words_stop'] = data.apply(total_unq_words_stop, stops=stops, axis=1, raw=True)  # 16: 13 - stop words
    X['wc_diff_unique_stop'] = data.apply(wc_diff_unique_stop, stops=stops, axis=1, raw=True)  # 17: 14 - stop words
    X['wc_ratio_unique_stop'] = data.apply(wc_ratio_unique_stop, stops=stops, axis=1, raw=True)  # 18: 15 - stop words

    log.info('Building mark features')
    X['same_start'] = data.apply(same_start_word, axis=1, raw=True)  # 19: same start = 1 else 0
    X['same_end'] = data.apply(same_end_word, axis=1, raw=True)  # 20: same end = 1 else 0
    X['num_capital_q1'] = data.question1.apply(num_capital)  # 21
    X['num_capital_q2'] = data.question2.apply(num_capital)  # 22
    X['num_capital_diff'] = abs(X.num_capital_q1 - X.num_capital_q2)  # 23
    X['num_ques_mark_q1'] = data.question1.apply(num_ques_mark)  # 24
    X['num_ques_mark_q2'] = data.question2.apply(num_ques_mark)  # 25
    X['num_ques_mark_diff'] = abs(X.num_ques_mark_q1 - X.num_ques_mark_q2)  # 26

    log.info('Building other features')
    # 27 ~ 27+28(14*2)-1=54: First word in sentence (one hot)
    for start in common_start:
        X['start_%s_%s' % (start, 'q1')] = data.q1_split.apply(start_with, args=(start, ))
    for start in common_start:  # Separate loop so the csv reads more like one-hot columns
        X['start_%s_%s' % (start, 'q2')] = data.q2_split.apply(start_with, args=(start, ))
    X['common_words'] = data.apply(common_words, axis=1, raw=True)  # 55: count of identical words between Q1 and Q2
    X['common_words_unique'] = data.apply(common_words_unit, axis=1, raw=True)  # 56: count of identical characters between Q1 and Q2
    X['word_match'] = data.apply(word_match_share, axis=1, raw=True)  # 57: word overlap ratio between Q1 and Q2
    X['word_match_stops'] = data.apply(word_match_share_stops, stops=stops, axis=1, raw=True)  # 58: word overlap ratio without stop words between Q1 and Q2
    X['tfidf_wm'] = data.apply(tfidf_word_match_share, weights=weights, axis=1, raw=True)  # 59: word overlap ratio between Q1 and Q2 (TF-IDF weighted)
    X['tfidf_wm_stops'] = data.apply(tfidf_word_match_share_stops, stops=stops, weights=weights, axis=1, raw=True)  # 60: word overlap ratio without stop words between Q1 and Q2 (TF-IDF weighted)

    log.info('Building fuzzy features')
    # 61~67: fuzzy features
    X['fuzz_qratio'] = data.apply(
        lambda x: fuzz.QRatio(str(x['question1']), str(x['question2'])), axis=1)
    X['fuzz_WRatio'] = data.apply(
        lambda x: fuzz.WRatio(str(x['question1']), str(x['question2'])), axis=1)
    X['fuzz_partial_ratio'] = data.apply(
        lambda x: fuzz.partial_ratio(str(x['question1']), str(x['question2'])), axis=1)
    X['fuzz_partial_token_set_ratio'] = data.apply(
        lambda x: fuzz.partial_token_set_ratio(str(x['question1']), str(x['question2'])), axis=1)
    X['fuzz_partial_token_sort_ratio'] = data.apply(
        lambda x: fuzz.partial_token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1)
    X['fuzz_token_set_ratio'] = data.apply(
        lambda x: fuzz.token_set_ratio(str(x['question1']), str(x['question2'])), axis=1)
    X['fuzz_token_sort_ratio'] = data.apply(
        lambda x: fuzz.token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1)
    X['jaccard'] = data.apply(jaccard, axis=1, raw=True)  # 68: jaccard distance

    log.info('Build word2vec/glove distance features')
    X['wmd'] = data.apply(lambda x: wmd(x['q1_split'], x['q2_split']), axis=1)  # 69
    X['norm_wmd'] = data.apply(lambda x: norm_wmd(x['q1_split'], x['q2_split']), axis=1)  # 70

    # Sent2Vec
    question1_vectors = np.zeros((data.shape[0], 300))
    log.info('Sent2Vec')
    for i, q in tqdm(enumerate(data.question1.values)):
        question1_vectors[i, :] = sent2vec(q)
    question2_vectors = np.zeros((data.shape[0], 300))
    for i, q in tqdm(enumerate(data.question2.values)):
        question2_vectors[i, :] = sent2vec(q)

    log.info('Building distance features')
    X['cosine_distance'] = [cosine(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                           np.nan_to_num(question2_vectors))]
    X['cityblock_distance'] = [cityblock(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                                 np.nan_to_num(question2_vectors))]
    X['canberra_distance'] = [canberra(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                               np.nan_to_num(question2_vectors))]
    X['minkowski_distance'] = [minkowski(x, y, 3) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                                    np.nan_to_num(question2_vectors))]
    X['braycurtis_distance'] = [braycurtis(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                                   np.nan_to_num(question2_vectors))]
    X['skew_q1vec'] = [skew(x) for x in np.nan_to_num(question1_vectors)]
    X['skew_q2vec'] = [skew(x) for x in np.nan_to_num(question2_vectors)]
    X['kur_q1vec'] = [kurtosis(x) for x in np.nan_to_num(question1_vectors)]
    X['kur_q2vec'] = [kurtosis(x) for x in np.nan_to_num(question2_vectors)]  # 79

    # LDA features
    topics_q1 = data.question1.apply(
        lambda x: dict(lda_model[dictionary.doc2bow(clean_doc(x))]))
    for idx in range(num_topics):
        X['lda_topic_%s_%s' % (idx, 'q1')] = topics_q1.apply(lambda x: x.get(idx, 0))
    del topics_q1
    topics_q2 = data.question2.apply(
        lambda x: dict(lda_model[dictionary.doc2bow(clean_doc(x))]))
    for idx in range(num_topics):
        X['lda_topic_%s_%s' % (idx, 'q2')] = topics_q2.apply(lambda x: x.get(idx, 0))
    del topics_q2

    # LSI features
    topics_q1 = data.question1.apply(
        lambda x: dict(lsi_model[dictionary.doc2bow(clean_doc(x))]))
    for idx in range(num_topics):
        X['lsi_topic_%s_%s' % (idx, 'q1')] = topics_q1.apply(lambda x: x.get(idx, 0))
    del topics_q1
    topics_q2 = data.question2.apply(
        lambda x: dict(lsi_model[dictionary.doc2bow(clean_doc(x))]))
    for idx in range(num_topics):
        X['lsi_topic_%s_%s' % (idx, 'q2')] = topics_q2.apply(lambda x: x.get(idx, 0))
    del topics_q2

    return X
data['text2_lower'] = data['question2'].apply(lambda x: x.lower())
data['common_noun_cnt'] = [
    count_common_nouns(nltk.word_tokenize(lem.lemmatize(x[0], "v")),
                       nltk.word_tokenize(lem.lemmatize(x[1], "v")), nouns)
    for x in data[['question1', 'question2']].values
]

# Initial Features
data['len_q1'] = data.question1.apply(lambda x: len(str(x)))
data['len_q2'] = data.question2.apply(lambda x: len(str(x)))
data['diff_len'] = data.len_q1 - data.len_q2
data['len_char_q1'] = data.question1.apply(lambda x: len(''.join(set(str(x).replace(' ', '')))))
data['len_char_q2'] = data.question2.apply(lambda x: len(''.join(set(str(x).replace(' ', '')))))
data['len_word_q1'] = data.question1.apply(lambda x: len(str(x).split()))
data['len_word_q2'] = data.question2.apply(lambda x: len(str(x).split()))
data['common_words'] = data.apply(
    lambda x: len(set(str(x['question1']).lower().split()).intersection(
        set(str(x['question2']).lower().split()))), axis=1)
data['fuzz_qratio'] = data.apply(lambda x: fuzz.QRatio(str(x['question1']), str(x['question2'])), axis=1)
data['fuzz_WRatio'] = data.apply(lambda x: fuzz.WRatio(str(x['question1']), str(x['question2'])), axis=1)
data['fuzz_partial_ratio'] = data.apply(lambda x: fuzz.partial_ratio(str(x['question1']), str(x['question2'])), axis=1)
data['fuzz_partial_token_set_ratio'] = data.apply(lambda x: fuzz.partial_token_set_ratio(str(x['question1']), str(x['question2'])), axis=1)
data['fuzz_partial_token_sort_ratio'] = data.apply(lambda x: fuzz.partial_token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1)
data['fuzz_token_set_ratio'] = data.apply(lambda x: fuzz.token_set_ratio(str(x['question1']), str(x['question2'])), axis=1)
data['fuzz_token_sort_ratio'] = data.apply(lambda x: fuzz.token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1)
data['wmd'] = data.apply(lambda x: wmd(x['question1'], x['question2']), axis=1)
data['norm_wmd'] = data.apply(lambda x: norm_wmd(x['question1'], x['question2']), axis=1)
question1_vectors = np.zeros((data.shape[0], 300))
error_count = 0
def predict():
    # question1 = 'What practical applications might evolve from the discovery of the Higgs Boson ?'
    # question2 = 'What are some practical benefits of discovery of the Higgs Boson ?'
    question1 = request.form["question1"]
    question2 = request.form["question2"]

    diff_len = len(str(question1)) - len(str(question2))
    common_words = len(set(str(question1).lower().split()).intersection(
        set(str(question2).lower().split())))
    fuzz_qratio = fuzz.QRatio(str(question1), str(question2))
    fuzz_WRatio = fuzz.WRatio(str(question1), str(question2))
    fuzz_partial_ratio = fuzz.partial_ratio(str(question1), str(question2))
    fuzz_partial_token_set_ratio = fuzz.partial_token_set_ratio(str(question1), str(question2))
    fuzz_partial_token_sort_ratio = fuzz.partial_token_sort_ratio(str(question1), str(question2))
    fuzz_token_set_ratio = fuzz.token_set_ratio(str(question1), str(question2))
    fuzz_token_sort_ratio = fuzz.token_sort_ratio(str(question1), str(question2))
    wmd = wordmoverdistance(question1, question2)

    question1_vectors = sent2vec(question1)
    question2_vectors = sent2vec(question2)
    cosine_distance = cosine(question1_vectors, question2_vectors)
    cityblock_distance = cityblock(question1_vectors, question2_vectors)
    canberra_distance = canberra(question1_vectors, question2_vectors)
    euclidean_distance = euclidean(question1_vectors, question2_vectors)
    minkowski_distance = minkowski(question1_vectors, question2_vectors, 3)
    braycurtis_distance = braycurtis(question1_vectors, question2_vectors)

    X = np.array([
        diff_len, common_words, fuzz_qratio, fuzz_WRatio, fuzz_partial_ratio,
        fuzz_partial_token_set_ratio, fuzz_partial_token_sort_ratio,
        fuzz_token_set_ratio, fuzz_token_sort_ratio, wmd, cosine_distance,
        cityblock_distance, canberra_distance, euclidean_distance,
        minkowski_distance, braycurtis_distance
    ])
    X_to_render = X
    X = X_Scaler.transform(X.reshape(1, -1))

    classifier = pickle.load(open('models/ANN.model', 'rb'))
    y_ann_pred = classifier.predict(X)
    y_random_forest_pred = randomForest.predict(X)
    y_logistic_pred = logistic.predict(X)
    y_knn_pred = knn.predict(X)
    print(y_random_forest_pred)
    print(y_logistic_pred)
    print(y_knn_pred)

    return render_template("result.html",
                           X_to_render=X_to_render,
                           y_random_forest_pred=y_random_forest_pred,
                           y_logistic_pred=y_logistic_pred,
                           y_knn_pred=y_knn_pred,
                           y_ann_pred=y_ann_pred[0])
def preprocess(df):
    df_features = pd.DataFrame(index=df.index)
    df_intermediate = pd.DataFrame(index=df.index)

    print("--> Compute tokens...")
    df_intermediate["clean_a"] = df.text_a_text.apply(lambda x: clean(x))
    df_intermediate["clean_b"] = df.text_b_text.apply(lambda x: clean(x))
    df_intermediate["words_a"] = df_intermediate.apply(lambda row: row.clean_a.split(" "), axis=1)
    df_intermediate["words_b"] = df_intermediate.apply(lambda row: row.clean_b.split(" "), axis=1)
    df_intermediate["words_clean_a"] = df_intermediate.apply(
        lambda row: set([w for w in row.words_a if w not in en_stop]), axis=1)
    df_intermediate["words_clean_b"] = df_intermediate.apply(
        lambda row: set([w for w in row.words_b if w not in en_stop]), axis=1)
    df_intermediate["stop_a"] = df_intermediate.apply(
        lambda row: set([w for w in row.words_a if w in en_stop]), axis=1)
    df_intermediate["stop_b"] = df_intermediate.apply(
        lambda row: set([w for w in row.words_b if w in en_stop]), axis=1)

    print("--> Compute common words features...")
    df_intermediate["common_stop_words"] = df_intermediate.apply(
        lambda row: row.stop_a.intersection(row.stop_b), axis=1)
    df_intermediate["common_words"] = df_intermediate.apply(
        lambda row: set(row.words_a).intersection(set(row.words_b)), axis=1)
    df_intermediate["common_clean_words"] = df_intermediate.apply(
        lambda row: row.words_clean_a.intersection(row.words_clean_b), axis=1)
    df_intermediate["common_stop_words_cnt"] = df_intermediate.common_stop_words.apply(lambda x: len(x))
    df_intermediate["common_words_cnt"] = df_intermediate.common_words.apply(lambda x: len(x))
    df_intermediate["common_clean_words_cnt"] = df_intermediate.common_clean_words.apply(lambda x: len(x))
    df_features["common_stop_words_ratio_min"] = df_intermediate.apply(
        lambda x: x.common_stop_words_cnt / (min(len(x["stop_a"]), len(x["stop_b"])) + 0.0001), axis=1)
    df_features["common_words_ratio_min"] = df_intermediate.apply(
        lambda x: x.common_words_cnt / (min(len(x["words_a"]), len(x["words_b"])) + 0.0001), axis=1)
    df_features["common_clean_words_ratio_min"] = df_intermediate.apply(
        lambda x: x.common_clean_words_cnt / (min(len(x["words_clean_a"]), len(x["words_clean_b"])) + 0.0001), axis=1)
    df_features["common_stop_words_ratio_max"] = df_intermediate.apply(
        lambda x: x.common_stop_words_cnt / (max(len(x["stop_a"]), len(x["stop_b"])) + 0.0001), axis=1)
    df_features["common_words_ratio_max"] = df_intermediate.apply(
        lambda x: x.common_words_cnt / (max(len(x["words_a"]), len(x["words_b"])) + 0.0001), axis=1)
    df_features["common_clean_words_ratio_max"] = df_intermediate.apply(
        lambda x: x.common_clean_words_cnt / (max(len(x["words_clean_a"]), len(x["words_clean_b"])) + 0.0001), axis=1)

    print("--> Compute general NLP features...")
    df_features["same_last_token"] = df_intermediate.apply(
        lambda x: int(x.words_a[-1] == x.words_b[-1]), axis=1)
    df_features["same_first_token"] = df_intermediate.apply(
        lambda x: int(x.words_a[0] == x.words_b[0]), axis=1)
    df_features["length_diff"] = df_intermediate.apply(
        lambda x: abs(len(x.words_a) - len(x.words_b)), axis=1)
    df_features["avg_length"] = df_intermediate.apply(
        lambda x: (len(x.words_a) + len(x.words_b)) / 2, axis=1)

    # Number of capital letters feature
    df_intermediate["a_n_capital"] = n_capital_letters(df["text_a_text"])
    df_intermediate["b_n_capital"] = n_capital_letters(df["text_b_text"])
    df_features["max_n_capital"] = df_intermediate[["a_n_capital", "b_n_capital"]].max(axis=1)
    df_features["min_n_capital"] = df_intermediate[["a_n_capital", "b_n_capital"]].min(axis=1)
    df_features["n_capital_diff"] = np.abs(df_intermediate["a_n_capital"] -
                                           df_intermediate["b_n_capital"])

    # Number related features
    df_intermediate["a_has_number"] = df.text_a_text.apply(lambda x: has_number(x))
    df_intermediate["b_has_number"] = df.text_b_text.apply(lambda x: has_number(x))
    df_features["max_has_number"] = df_intermediate[["a_has_number", "b_has_number"]].max(axis=1)
    df_features["min_has_number"] = df_intermediate[["a_has_number", "b_has_number"]].min(axis=1)

    # Adopted from https://github.com/abhishekkrthakur/is_that_a_duplicate_quora_question
    print("--> Compute fuzzy features...")
    df_features['fuzz_qratio'] = df.apply(
        lambda x: fuzz.QRatio(str(x["text_a_text"]), str(x["text_b_text"])), axis=1)
    df_features['fuzz_WRatio'] = df.apply(
        lambda x: fuzz.WRatio(str(x["text_a_text"]), str(x["text_b_text"])), axis=1)
    df_features['fuzz_partial_ratio'] = df.apply(
        lambda x: fuzz.partial_ratio(str(x["text_a_text"]), str(x["text_b_text"])), axis=1)
    df_features['fuzz_partial_token_set_ratio'] = df.apply(
        lambda x: fuzz.partial_token_set_ratio(str(x["text_a_text"]), str(x["text_b_text"])), axis=1)
    df_features['fuzz_partial_token_sort_ratio'] = df.apply(
        lambda x: fuzz.partial_token_sort_ratio(str(x["text_a_text"]), str(x["text_b_text"])), axis=1)
    df_features['fuzz_token_set_ratio'] = df.apply(
        lambda x: fuzz.token_set_ratio(str(x["text_a_text"]), str(x["text_b_text"])), axis=1)
    df_features['fuzz_token_sort_ratio'] = df.apply(
        lambda x: fuzz.token_sort_ratio(str(x["text_a_text"]), str(x["text_b_text"])), axis=1)

    print("--> Compute longest substring...")
    df_features["longest_substring_ratio"] = df.apply(
        lambda x: get_longest_substring_ratio(x["text_a_text"], x["text_b_text"]), axis=1)

    return df_features
def predict():
    q1 = request.form['q1']
    q2 = request.form['q2']

    inference_point['freq_qid1'] = train_org[train_org['question1'] == q1].shape[0]
    inference_point['freq_qid2'] = train_org[train_org['question2'] == q2].shape[0]
    inference_point['q1len'] = len(q1)
    inference_point['q2len'] = len(q2)
    inference_point['q1_n_words'] = len(q1.split(" "))
    inference_point['q2_n_words'] = len(q2.split(" "))

    w1 = set(map(lambda word: word.lower().strip(), q1.split(" ")))
    w2 = set(map(lambda word: word.lower().strip(), q2.split(" ")))
    inference_point['word_Common'] = 1.0 * len(w1 & w2)
    inference_point['word_Total'] = 1.0 * (len(w1) + len(w2))
    inference_point['word_share'] = 1.0 * len(w1 & w2) / (len(w1) + len(w2))
    inference_point['freq_q1+q2'] = inference_point['freq_qid1'] + inference_point['freq_qid2']
    inference_point['freq_q1-q2'] = inference_point['freq_qid1'] - inference_point['freq_qid2']

    q1 = preprocess(q1)
    q2 = preprocess(q2)
    token_features = get_token_features(q1, q2)
    inference_point["cwc_min"] = token_features[0]
    inference_point["cwc_max"] = token_features[1]
    inference_point["csc_min"] = token_features[2]
    inference_point["csc_max"] = token_features[3]
    inference_point["ctc_min"] = token_features[4]
    inference_point["ctc_max"] = token_features[5]
    inference_point["last_word_eq"] = token_features[6]
    inference_point["first_word_eq"] = token_features[7]
    inference_point["abs_len_diff"] = token_features[8]
    inference_point["mean_len"] = token_features[9]
    inference_point['longest_substr_ratio'] = len(list(lcs(q1, q2))[0])
    inference_point['token_set_ratio'] = fuzz.token_set_ratio(q1, q2)
    inference_point['token_sort_ratio'] = fuzz.token_sort_ratio(q1, q2)
    inference_point['fuzz_ratio'] = fuzz.QRatio(q1, q2)
    inference_point['fuzz_partial_ratio'] = fuzz.partial_ratio(q1, q2)

    q1_vec = tfidf_w2v(q1)
    q2_vec = tfidf_w2v(q2)
    for i in range(len(q1_vec)):
        inference_point[str(i) + '_x'] = q1_vec[i]
        inference_point[str(i) + '_y'] = q2_vec[i]

    X = pd.DataFrame(inference_point, index=[0])
    X = X[cols]
    x = xgb.DMatrix(X)
    pred = bst.predict(x)
    if pred > 0.5:
        return render_template("results.html", sim='Similar', score=pred)
    else:
        return render_template("results.html", sim='Dissimilar', score=pred)
train = train[['label', 'words_x', 'words_y']]
train.columns = ['label', 'words1', 'words2']
len_train = train.shape[0]

test = pd.merge(test, question, left_on=['q1'], right_on=['qid'], how='left')
test = pd.merge(test, question, left_on=['q2'], right_on=['qid'], how='left')
test = test[['words_x', 'words_y']]
test.columns = ['words1', 'words2']

df_feat = pd.DataFrame()
df_data = pd.concat([train, test])

# Output the similarity scores
# https://blog.csdn.net/sunyao_123/article/details/76942809
df_feat['fuzz_words_qratio'] = df_data.apply(
    lambda row: fuzz.QRatio(str(row['words1']), str(row['words2'])), axis=1)
df_feat['fuzz_words_WRatio'] = df_data.apply(
    lambda row: fuzz.WRatio(str(row['words1']), str(row['words2'])), axis=1)
df_feat['fuzz_words_partial_ratio'] = df_data.apply(
    lambda row: fuzz.partial_ratio(str(row['words1']), str(row['words2'])), axis=1)
df_feat['fuzz_words_partial_token_set_ratio'] = df_data.apply(
    lambda row: fuzz.partial_token_set_ratio(str(row['words1']), str(row['words2'])), axis=1)
df_feat['fuzz_words_partial_token_sort_ratio'] = df_data.apply(
    lambda row: fuzz.partial_token_sort_ratio(str(row['words1']), str(row['words2'])), axis=1)
df_feat['fuzz_words_token_set_ratio'] = df_data.apply(
    lambda row: fuzz.token_set_ratio(str(row['words1']), str(row['words2'])), axis=1)
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

a = 'my india'
b = ['Cirque du Soleil-Zarkana', 'this is my india', 'I love my india']

# QRatio compares two strings, so score each candidate individually...
for candidate in b:
    print(fuzz.QRatio(a, candidate))
# ...or let process pick the best match from the whole list.
print(process.extractOne(a, b, scorer=fuzz.ratio))
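# Hedged variant of the lookup above (not in the original snippet; reuses a, b
# and the imports from it): process accepts any fuzz scorer, so the same list
# can be ranked with QRatio and trimmed to the top matches.
print(process.extract(a, b, scorer=fuzz.QRatio, limit=2))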
for projectIndex in range(len(projectList["scratchpads"])):
    url = projectList["scratchpads"][projectIndex]["url"]
    id = url.split("/")[-1]
    print("Checking against project " + id)

    # Get the project's data
    project = requests.get("https://www.khanacademy.org/api/labs/scratchpads/" + id)
    projectCode = project.json()["revision"]["code"]

    # Compare the code
    a = fuzz.ratio(originalCode, projectCode)
    b = fuzz.partial_ratio(originalCode, projectCode)
    c = fuzz.token_sort_ratio(originalCode, projectCode)
    d = fuzz.partial_token_sort_ratio(originalCode, projectCode)
    e = fuzz.QRatio(originalCode, projectCode)
    data = [a, b, c, d, e]

    # Process the data and output it
    outputHTML = outputHTML + "<tr><td><a href=\"" + url + "\" target=\"_blank\"><p>" + id + "</p></a></td>"
    for value in data:
        color = ""
        if value < 58:
            color = "#7ffe00"
        elif value < 75:
            color = "#ffff00"
        elif value < 87:
            color = "#fe7f00"
        else:
            color = "#fe007f"
def read_data():
    data = pd.read_csv("data/test.csv")
    # var = str(data.loc[data["id"] == 53, "question1"])
    # print(var)  # cleanser(var)
    print("lock and load")

    # Augment data with basic features
    data['len_q1'] = data.question1.apply(lambda x: len(str(x)))
    data['len_q2'] = data.question2.apply(lambda x: len(str(x)))
    data['diff_len'] = data.len_q1 - data.len_q2
    data['len_char_q1'] = data.question1.apply(lambda x: len("".join(str(x).split())))
    data['len_char_q2'] = data.question2.apply(lambda x: len("".join(str(x).split())))
    data['len_word_q1'] = data.question1.apply(lambda x: len(str(x).split()))
    data['len_word_q2'] = data.question2.apply(lambda x: len(str(x).split()))
    data['common_words'] = data.apply(
        lambda x: len(set(str(x['question1']).lower().split()).intersection(
            set(str(x['question2']).lower().split()))), axis=1)
    col_basic = [
        'len_q1', 'len_q2', 'diff_len', 'len_char_q1', 'len_char_q2',
        'len_word_q1', 'len_word_q2', 'common_words'
    ]

    # Levenshtein distance features
    data['q_ratio'] = data.apply(
        lambda x: fuzz.QRatio(str(x['question1']), str(x['question2'])), axis=1)
    data['w_ratio'] = data.apply(
        lambda x: fuzz.WRatio(str(x['question1']), str(x['question2'])), axis=1)
    data['partial_ratio'] = data.apply(
        lambda x: fuzz.partial_ratio(str(x['question1']), str(x['question2'])), axis=1)
    data['partial_token_set_ratio'] = data.apply(
        lambda x: fuzz.partial_token_set_ratio(str(x['question1']), str(x['question2'])), axis=1)
    data['partial_token_sort_ratio'] = data.apply(
        lambda x: fuzz.partial_token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1)
    data['token_set_ratio'] = data.apply(
        lambda x: fuzz.token_set_ratio(str(x['question1']), str(x['question2'])), axis=1)
    data['token_sort_ratio'] = data.apply(
        lambda x: fuzz.token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1)
    col_levenshtein = [
        'q_ratio', 'w_ratio', 'partial_ratio', 'partial_token_set_ratio',
        'partial_token_sort_ratio', 'token_set_ratio', 'token_sort_ratio'
    ]
    question_vec_cols_q1, question_vec_cols_q2 = [], []
    print("levenshtein and basic features done!")

    vector_features, col_distance = question_to_vector(data['question1'], data['question2'])
    col_distance = ['WMD_basic', 'WMD_normalized'] + col_distance
    print("assigning vector values")
    for i in range(0, 300):
        question_vec_cols_q1.append("vec_val_" + str(i) + "_q1")
        data["vec_val_" + str(i) + "_q1"] = vector_features[:, i]
        question_vec_cols_q2.append("vec_val_" + str(i) + "_q2")
        data["vec_val_" + str(i) + "_q2"] = vector_features[:, 300 + i]
    for i, key in enumerate(col_distance):
        data[key] = vector_features[:, 600 + i]
    data["zero_vec_check_q1"] = vector_features[:, 613]
    data["zero_vec_check_q2"] = vector_features[:, 614]

    header = ['id', 'is_duplicate']
    header.extend(col_basic)
    header.extend(col_levenshtein)
    header.extend(col_distance)
    header.extend(question_vec_cols_q1)
    header.extend(question_vec_cols_q2)
    header.append('zero_vec_check_q1')
    header.append('zero_vec_check_q2')
    print("writing csv")
    data.to_csv(TRANSFORMED_DATA_FILE_PATH, columns=header, sep='\t')
    print("done!")
def generate_h1(train_file, test_file, train_feature_file, test_feature_file,
                feature_map_file):
    df_train = pd.read_csv(train_file)
    df_test = pd.read_csv(test_file)
    print("Original data: X_train: {}, X_test: {}".format(df_train.shape, df_test.shape))
    print("Features processing, be patient...")

    # If a word appears only once, we ignore it completely (likely a typo).
    # eps is a smoothing constant that shrinks the effect of extremely rare words.
    def get_weight(count, eps=10000, min_count=2):
        return 0 if count < min_count else 1 / (count + eps)

    train_qs = pd.Series(df_train['question1'].tolist() +
                         df_train['question2'].tolist()).astype(str)
    words = (" ".join(train_qs)).lower().split()
    counts = Counter(words)
    weights = {word: get_weight(count) for word, count in counts.items()}
    stops = set(stopwords.words("english"))

    def word_shares(row):
        q1 = set(str(row['question1']).lower().split())
        q1words = q1.difference(stops)
        if len(q1words) == 0:
            return '0:0:0:0:0'
        q2 = set(str(row['question2']).lower().split())
        q2words = q2.difference(stops)
        if len(q2words) == 0:
            return '0:0:0:0:0'
        q1stops = q1.intersection(stops)
        q2stops = q2.intersection(stops)
        shared_words = q1words.intersection(q2words)
        shared_weights = [weights.get(w, 0) for w in shared_words]
        total_weights = [weights.get(w, 0) for w in q1words] + \
                        [weights.get(w, 0) for w in q2words]
        R1 = np.sum(shared_weights) / np.sum(total_weights)  # tfidf share
        R2 = len(shared_words) / (len(q1words) + len(q2words))  # count share
        R31 = len(q1stops) / len(q1words)  # stops in q1
        R32 = len(q2stops) / len(q2words)  # stops in q2
        return '{}:{}:{}:{}:{}'.format(R1, R2, len(shared_words), R31, R32)

    df = pd.concat([df_train, df_test])
    df['word_shares'] = df.apply(word_shares, axis=1, raw=True)

    x = pd.DataFrame()
    x['word_match'] = df['word_shares'].apply(lambda x: float(x.split(':')[0]))
    x['tfidf_word_match'] = df['word_shares'].apply(lambda x: float(x.split(':')[1]))
    x['shared_count'] = df['word_shares'].apply(lambda x: float(x.split(':')[2]))
    x['stops1_ratio'] = df['word_shares'].apply(lambda x: float(x.split(':')[3]))
    x['stops2_ratio'] = df['word_shares'].apply(lambda x: float(x.split(':')[4]))
    x['diff_stops_r'] = x['stops1_ratio'] - x['stops2_ratio']

    x['len_q1'] = df['question1'].apply(lambda x: len(str(x)))
    x['len_q2'] = df['question2'].apply(lambda x: len(str(x)))
    x['diff_len'] = x['len_q1'] - x['len_q2']
    x['len_char_q1'] = df['question1'].apply(lambda x: len(str(x).replace(' ', '')))
    x['len_char_q2'] = df['question2'].apply(lambda x: len(str(x).replace(' ', '')))
    x['diff_len_char'] = x['len_char_q1'] - x['len_char_q2']
    x['len_word_q1'] = df['question1'].apply(lambda x: len(str(x).split()))
    x['len_word_q2'] = df['question2'].apply(lambda x: len(str(x).split()))
    x['diff_len_word'] = x['len_word_q1'] - x['len_word_q2']
    x['avg_world_len1'] = x['len_char_q1'] / x['len_word_q1']
    x['avg_world_len2'] = x['len_char_q2'] / x['len_word_q2']
    x['diff_avg_word'] = x['avg_world_len1'] - x['avg_world_len2']

    x['fuzz_qratio'] = df.apply(
        lambda x: fuzz.QRatio(str(x['question1']), str(x['question2'])), axis=1)
    x['fuzz_WRatio'] = df.apply(
        lambda x: fuzz.WRatio(str(x['question1']), str(x['question2'])), axis=1)
    x['fuzz_partial_ratio'] = df.apply(
        lambda x: fuzz.partial_ratio(str(x['question1']), str(x['question2'])), axis=1)
    x['fuzz_partial_token_set_ratio'] = df.apply(
        lambda x: fuzz.partial_token_set_ratio(str(x['question1']), str(x['question2'])), axis=1)
    x['fuzz_partial_token_sort_ratio'] = df.apply(
        lambda x: fuzz.partial_token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1)
    x['fuzz_token_set_ratio'] = df.apply(
        lambda x: fuzz.token_set_ratio(str(x['question1']), str(x['question2'])), axis=1)
    x['fuzz_token_sort_ratio'] = df.apply(
        lambda x: fuzz.token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1)
    x['word2vec_similarity'] = df.apply(
        lambda x: PhraseVector(str(x['question1'])).CosineSimilarity(
            PhraseVector(str(x['question2'])).vector), axis=1)

    feature_names = list(x.columns.values)
    print("Features: {}".format(feature_names))
    x.fillna(0, inplace=True)

    x_train = x[:df_train.shape[0]]
    x_test = x[df_train.shape[0]:]
    y_train = df_train[TARGET].values

    if 1:  # Oversample the negative class -- at your own risk of overfitting!
        pos_train = x_train[y_train == 1]
        neg_train = x_train[y_train == 0]
        print("Oversampling started for proportion: {}".format(
            len(pos_train) / (len(pos_train) + len(neg_train))))
        p = 0.165
        scale = ((float(len(pos_train)) / (len(pos_train) + len(neg_train))) / p) - 1
        while scale > 1:
            neg_train = pd.concat([neg_train, neg_train])
            scale -= 1
        neg_train = pd.concat([neg_train, neg_train[:int(scale * len(neg_train))]])
        print("Oversampling done, new proportion: {}".format(
            len(pos_train) / (len(pos_train) + len(neg_train))))
        x_train = pd.concat([pos_train, neg_train])
        y_train = (np.zeros(len(pos_train)) + 1).tolist() + np.zeros(len(neg_train)).tolist()
        del pos_train, neg_train

    logging.info('saving features')
    save_data(x_train, y_train, train_feature_file)
    save_data(x_test, None, test_feature_file)
    with open(feature_map_file, 'w') as f:
        for i, col in enumerate(x.columns):
            f.write('{}\t{}\tq\n'.format(i, col))
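# A standalone sanity check of the smoothing above (illustrative, not part of
# the original pipeline; redefines get_weight outside the function for the
# check): singleton words get zero weight, and eps=10000 dominates small
# counts, so rare-but-repeated words all land near 1/eps.
def get_weight(count, eps=10000, min_count=2):
    return 0 if count < min_count else 1 / (count + eps)

assert get_weight(1) == 0
assert abs(get_weight(2) - 1 / 10002) < 1e-12
assert abs(get_weight(2) / get_weight(100) - 1.0) < 0.02  # within ~1% of each other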
question2 = input("enter question 2:")

# Calculate features from the input questions
len_1 = len(question1)
len_2 = len(question2)
diff_len = len_1 - len_2
len_char_q1 = len(''.join(set(str(question1).replace(' ', ''))))
len_char_q2 = len(''.join(set(str(question2).replace(' ', ''))))
len_word_q1 = len(str(question1).split())
len_word_q2 = len(str(question2).split())
common_words = len(set(str(question1).lower().split()).intersection(
    set(str(question2).lower().split())))

# Fuzzy features
from fuzzywuzzy import fuzz
fuzz_qratio = fuzz.QRatio(str(question1), str(question2))
fuzz_WRatio = fuzz.WRatio(str(question1), str(question2))
fuzz_partial_ratio = fuzz.partial_ratio(str(question1), str(question2))
fuzz_partial_token_set_ratio = fuzz.partial_token_set_ratio(str(question1), str(question2))
fuzz_partial_token_sort_ratio = fuzz.partial_token_sort_ratio(str(question1), str(question2))
fuzz_token_set_ratio = fuzz.token_set_ratio(str(question1), str(question2))
fuzz_token_sort_ratio = fuzz.token_sort_ratio(str(question1), str(question2))

# WMD
import gensim
# model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)

# sent2vec
import scipy
question1_vectors = scipy.sparse.lil_matrix((dataset.shape[0], 300))
def extract_features(model, sentence_1, sentence_2):
    features = []

    # Preprocess each sentence and tokenize
    sentence_1 = review_to_words(sentence_1)
    sentence_2 = review_to_words(sentence_2)
    tokens_1 = sentence_1.split()
    tokens_2 = sentence_2.split()

    # Compute the average of GloVe word vectors for each sentence
    vec1 = np.zeros([300], dtype=float)
    vec2 = np.zeros([300], dtype=float)
    counter = 1  # starts at 1, which also guards against division by zero
    for i in range(len(tokens_1)):
        if tokens_1[i] in model:
            counter += 1
            vec1 += model[tokens_1[i]]
    vec1 = vec1 / counter
    counter = 1
    for i in range(len(tokens_2)):
        if tokens_2[i] in model:
            counter += 1
            vec2 += model[tokens_2[i]]
    vec2 = vec2 / counter

    # Add different distance features
    features.append(cosine_sim(vec1, vec2))
    features.append(euclidean_distance(vec1, vec2))
    features.append(jaccard_similarity(sentence_1, sentence_2))
    features.append(distance_title_len(sentence_1, sentence_2))
    features.append(get_longest_substr_ratio(sentence_1, sentence_2))
    features.append(distance_bigrams_same(sentence_1, sentence_2))

    SAFE_DIV = 0.0001
    token_features = [0.0] * 9
    q1_tokens = sentence_1.split()
    q2_tokens = sentence_2.split()
    if len(q1_tokens) == 0 or len(q2_tokens) == 0:
        # Note: this early return yields a shorter feature vector than the full path.
        return features

    q1_words = set([word for word in q1_tokens if word not in STOP_WORDS])
    q2_words = set([word for word in q2_tokens if word not in STOP_WORDS])
    q1_stops = set([word for word in q1_tokens if word in STOP_WORDS])
    q2_stops = set([word for word in q2_tokens if word in STOP_WORDS])
    common_word_count = len(q1_words.intersection(q2_words))
    common_stop_count = len(q1_stops.intersection(q2_stops))
    common_token_count = len(set(q1_tokens).intersection(set(q2_tokens)))

    token_features[0] = common_word_count / (min(len(q1_words), len(q2_words)) + SAFE_DIV)
    token_features[1] = common_word_count / (max(len(q1_words), len(q2_words)) + SAFE_DIV)
    token_features[2] = common_stop_count / (min(len(q1_stops), len(q2_stops)) + SAFE_DIV)
    token_features[3] = common_stop_count / (max(len(q1_stops), len(q2_stops)) + SAFE_DIV)
    token_features[4] = common_token_count / (min(len(q1_tokens), len(q2_tokens)) + SAFE_DIV)
    token_features[5] = common_token_count / (max(len(q1_tokens), len(q2_tokens)) + SAFE_DIV)
    token_features[6] = int(q1_tokens[-1] == q2_tokens[-1])
    token_features[7] = int(q1_tokens[0] == q2_tokens[0])
    token_features[8] = (len(q1_tokens) + len(q2_tokens)) / 2
    for i in token_features:
        features.append(i)

    # fuzzywuzzy similarity scores
    features.append(fuzz.token_set_ratio(sentence_1, sentence_2))
    features.append(fuzz.token_sort_ratio(sentence_1, sentence_2))
    features.append(fuzz.QRatio(sentence_1, sentence_2))
    features.append(fuzz.partial_ratio(sentence_1, sentence_2))
    return features
def generate_h1(train_file, test_file, train_feature_file, test_feature_file,
                feature_map_file):
    df_train = pd.read_csv(train_file)
    df_test = pd.read_csv(test_file)
    print("Original data: X_train: {}, X_test: {}".format(df_train.shape, df_test.shape))
    print("Features processing, be patient...")

    df = pd.concat([df_train, df_test])
    x = pd.DataFrame()
    x['fuzz_qratio'] = df.apply(
        lambda x: fuzz.QRatio(str(x['question1']), str(x['question2'])), axis=1)
    x['fuzz_WRatio'] = df.apply(
        lambda x: fuzz.WRatio(str(x['question1']), str(x['question2'])), axis=1)
    x['fuzz_partial_ratio'] = df.apply(
        lambda x: fuzz.partial_ratio(str(x['question1']), str(x['question2'])), axis=1)
    x['fuzz_partial_token_set_ratio'] = df.apply(
        lambda x: fuzz.partial_token_set_ratio(str(x['question1']), str(x['question2'])), axis=1)
    x['fuzz_partial_token_sort_ratio'] = df.apply(
        lambda x: fuzz.partial_token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1)
    x['fuzz_token_set_ratio'] = df.apply(
        lambda x: fuzz.token_set_ratio(str(x['question1']), str(x['question2'])), axis=1)
    x['fuzz_token_sort_ratio'] = df.apply(
        lambda x: fuzz.token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1)

    feature_names = list(x.columns.values)
    print("Features: {}".format(feature_names))
    x.fillna(0, inplace=True)

    x_train = x[:df_train.shape[0]]
    x_test = x[df_train.shape[0]:]
    y_train = df_train[TARGET].values

    logging.info('saving features')
    save_data(x_train, y_train, train_feature_file)
    save_data(x_test, None, test_feature_file)
    with open(feature_map_file, 'w') as f:
        for i, col in enumerate(x.columns):
            f.write('{}\t{}\tq\n'.format(i, col))
def get_feature_engineered_data(self, flag):
    # Create an object holding the feature engineering functions
    obj = FeatureEngineeringFunctions()
    if flag == 1:
        train = pd.read_csv('../LematizedFiles/trainlem.csv', engine='python')
    else:
        train = pd.read_csv('../LematizedFiles/testlem.csv', engine='python')
    obj.convert_to_string(train)

    # Smoothing constant used by some of the engineered features
    EPSILON = 0.0000001

    # map works element-wise on a Series; apply works row/column-wise on a DataFrame
    train['q1_word_num'] = train['question1'].map(obj.words_count)
    train['q2_word_num'] = train['question2'].map(obj.words_count)
    train['q1_length'] = train['question1'].map(obj.length)
    train['q2_length'] = train['question2'].map(obj.length)
    train['word_num_difference'] = abs(train.q1_word_num - train.q2_word_num)
    train['length_difference'] = abs(train.q1_length - train.q2_length)
    train['q1_has_fullstop'] = train.question1.apply(lambda x: int('.' in x))
    train['q2_has_fullstop'] = train.question2.apply(lambda x: int('.' in x))
    train['q1_digit_count'] = train.question1.apply(
        lambda question: sum([word.isdigit() for word in question]))
    train['q2_digit_count'] = train.question2.apply(
        lambda question: sum([word.isdigit() for word in question]))
    train['digit_count_difference'] = abs(train.q1_digit_count - train.q2_digit_count)
    train['q1_capital_char_count'] = train.question1.apply(
        lambda question: sum([word.isupper() for word in question]))
    train['q2_capital_char_count'] = train.question2.apply(
        lambda question: sum([word.isupper() for word in question]))
    train['capital_char_count_difference'] = abs(
        train.q1_capital_char_count - train.q2_capital_char_count)
    train['q1_has_math_expression'] = train.question1.apply(lambda x: int('[math]' in x))
    train['q2_has_math_expression'] = train.question2.apply(lambda x: int('[math]' in x))
    train['common_words'] = train[['question1', 'question2']].apply(obj.count_common, axis=1)
    train['lem_common_words'] = train[['lem_question1', 'lem_question2']].apply(obj.count_common, axis=1)
    train['log_word_share'] = np.log(
        train[['question1', 'question2']].apply(obj.count_common, axis=1) + EPSILON)
    train['lem_log_word_share'] = np.log(
        train[['lem_question1', 'lem_question2']].apply(obj.count_common, axis=1) + EPSILON)
    train['word_share_squared'] = (
        train[['question1', 'question2']].apply(obj.count_common, axis=1) ** 2)
    train['lem_word_share_squared'] = (
        train[['lem_question1', 'lem_question2']].apply(obj.count_common, axis=1) ** 2)
    train['word_share_sqrt'] = np.sqrt(
        train[['question1', 'question2']].apply(obj.count_common, axis=1))
    train['lem_word_share_sqrt'] = np.sqrt(
        train[['lem_question1', 'lem_question2']].apply(obj.count_common, axis=1))
    train['log_length_difference'] = np.log(train.length_difference + EPSILON)
    train['length_difference_squared'] = train.length_difference ** 2
    train['length_difference_sqrt'] = np.sqrt(train.length_difference)
    train['log_lem_tfidf'] = np.log(train.lem_tfidf_word_match + EPSILON)
    train['lem_tfidf_squared'] = train.lem_tfidf_word_match ** 2
    train['lem_tfidf_sqrt'] = np.sqrt(train.lem_tfidf_word_match)
    train['total_unique_words'] = train[['question1', 'question2']].apply(obj.total_unique_words, axis=1)
    train['word_count_ratio'] = train[['question1', 'question2']].apply(obj.word_count_ratio, axis=1)
    train['fuzz_qratio'] = train.apply(
        lambda x: fuzz.QRatio(str(x['question1']), str(x['question2'])), axis=1)
    train['fuzz_WRatio'] = train.apply(
        lambda x: fuzz.WRatio(str(x['question1']), str(x['question2'])), axis=1)
    train['fuzz_partial_ratio'] = train.apply(
        lambda x: fuzz.partial_ratio(str(x['question1']), str(x['question2'])), axis=1)
    train['fuzz_partial_token_set_ratio'] = train.apply(
        lambda x: fuzz.partial_token_set_ratio(str(x['question1']), str(x['question2'])), axis=1)
    train['fuzz_partial_token_sort_ratio'] = train.apply(
        lambda x: fuzz.partial_token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1)
    train['fuzz_token_set_ratio'] = train.apply(
        lambda x: fuzz.token_set_ratio(str(x['question1']), str(x['question2'])), axis=1)
    train['fuzz_token_sort_ratio'] = train.apply(
        lambda x: fuzz.token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1)
    train['cosine_score'] = train.apply(get_cosine, axis=1, raw=True)

    features = [
        'q1_word_num', 'q2_word_num', 'word_num_difference', 'q1_length',
        'q2_length', 'length_difference', 'q1_has_fullstop', 'q2_has_fullstop',
        'q1_digit_count', 'q2_digit_count', 'digit_count_difference',
        'q1_capital_char_count', 'q2_capital_char_count',
        'capital_char_count_difference', 'q1_has_math_expression',
        'q2_has_math_expression', 'log_length_difference', 'log_word_share',
        'word_share_squared', 'word_share_sqrt', 'length_difference_squared',
        'length_difference_sqrt', 'common_words', 'lem_common_words',
        'lem_log_word_share', 'lem_word_share_squared', 'lem_word_share_sqrt',
        'tfidf_word_match', 'lem_tfidf_word_match', 'intersection_count',
        'log_lem_tfidf', 'lem_tfidf_squared', 'lem_tfidf_sqrt',
        'total_unique_words', 'word_count_ratio', 'fuzz_qratio', 'fuzz_WRatio',
        'fuzz_partial_ratio', 'fuzz_partial_token_set_ratio',
        'fuzz_partial_token_sort_ratio', 'fuzz_token_set_ratio',
        'fuzz_token_sort_ratio', 'cosine_score', 'q1_freq', 'q2_freq'
    ]
    if flag == 1:
        target = 'is_duplicate'
        X = train[features]
        Y = train[target]
        return X, Y
    else:
        X = train[features]
        return X
data_line = line.rstrip().split(',')
ele1 = data_line[0].split('||')
ele2 = data_line[1].split('||')
kod1 = ele1[0]
firm1 = ele1[1]
kod2 = ele2[0]
firm2 = ele2[1]

# score = fuzz.token_set_ratio(firm1, firm2)
score_r = fuzz.ratio(firm1, firm2)
score_pr = fuzz.partial_ratio(firm1, firm2)
score_tsor = fuzz.token_sort_ratio(firm1, firm2)
score_tser = fuzz.token_set_ratio(firm1, firm2)
score_ptsor = fuzz.partial_token_sort_ratio(firm1, firm2)
score_ptser = fuzz.partial_token_set_ratio(firm1, firm2)
score_qr = fuzz.QRatio(firm1, firm2)
score_uqr = fuzz.UQRatio(firm1, firm2)
score_wr = fuzz.WRatio(firm1, firm2)
score_uwr = fuzz.UWRatio(firm1, firm2)

# print('kod1:' + kod1)
# print('firm1:' + firm1)
# print('kod2:' + kod2)
# print('firm2:' + firm2)
# print('score:' + str(score))

# if score_r > 90 or score_pr > 90 or score_tsor > 90 or score_tser > 90 or score_ptsor > 90 or score_ptser > 90 \
#         or score_qr > 90 or score_uqr > 90 or score_wr > 90 or score_uwr > 90:
if score_tser > 90:
    temp3 = (
                       header=None, sep='\t')
df_train.columns = ['line', 'q1', 'q2', 'label']
df_train_add = pd.read_csv('../data/input/atec_nlp_sim_train_add.csv',
                           encoding='utf-8-sig', header=None, sep='\t')
df_train_add.columns = ['line', 'q1', 'q2', 'label']
df_train = pd.concat([df_train, df_train_add], axis=0, sort=False)

df_feat = pd.DataFrame()
df_feat['fuzz_ratio'] = df_train.apply(
    lambda row: fuzz.ratio(str(row.q1), str(row.q2)), axis=1)
df_feat['fuzz_qratio'] = df_train.apply(
    lambda row: fuzz.QRatio(str(row.q1), str(row.q2)), axis=1)
df_feat['fuzz_wratio'] = df_train.apply(
    lambda row: fuzz.WRatio(str(row.q1), str(row.q2)), axis=1)
df_feat['fuzz_partial_ratio'] = df_train.apply(
    lambda row: fuzz.partial_ratio(str(row.q1), str(row.q2)), axis=1)
df_feat['fuzz_partial_token_set_ratio'] = df_train.apply(
    lambda row: fuzz.partial_token_set_ratio(str(row.q1), str(row.q2)), axis=1)
df_feat['fuzz_partial_token_sort_ratio'] = df_train.apply(
    lambda row: fuzz.partial_token_sort_ratio(str(row.q1), str(row.q2)), axis=1)
df_feat['fuzz_token_set_ratio'] = df_train.apply(
    lambda row: fuzz.token_set_ratio(str(row.q1), str(row.q2)), axis=1)
df_feat['fuzz_token_sort_ratio'] = df_train.apply(
    lambda row: fuzz.token_sort_ratio(str(row.q1), str(row.q2)), axis=1)
df_feat.to_csv('subfeas/train_feature_fuzz.csv', index=False)
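# Side note (illustrative, not part of the script above; the two example
# strings are hypothetical atec-style questions): fuzz.QRatio and fuzz.WRatio
# ASCII-fold their inputs by default (force_ascii=True), which strips Chinese
# text entirely and yields 0; the unicode-preserving variants are fuzz.UQRatio
# and fuzz.UWRatio, as used in the firm-matching snippet earlier.
from fuzzywuzzy import fuzz

q1, q2 = u'花呗如何还款', u'花呗怎么还款'
print(fuzz.QRatio(q1, q2))   # 0: ASCII folding removes every character
print(fuzz.UQRatio(q1, q2))  # a meaningful unicode-aware score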
def create_all_simple_features(list_pairs_train, list_pairs_test, texts,
                               ids2ind, word_vectors, embedding_matrix,
                               glove_embedding, sequences, data_full_word,
                               my_p=50, n_lsa=40):
    '''
    Create the set of basic features from the lists of pairs.

    param:
        list_pairs_train: list of pairs of questions to compare for training
        list_pairs_test: list of pairs of questions to compare for the test phase
        ids2ind: index of each question
        word_vectors: w2vec-based word vectors (gensim object)
        embedding_matrix: w2vec-based word matrix
        sequences: sequence indices (keras object)
        n_lsa: number of axes used for the PCA in the LSA embedding
    return:
        matrix train (nb_sample, nb_features)
        matrix test (nb_sample, nb_features)
    '''
    vec = TfidfVectorizer()
    A = vec.fit_transform(texts.values())
    LSA_features = TruncatedSVD(n_components=n_lsa).fit_transform(A)
    vec_count = CountVectorizer()
    B = vec_count.fit_transform(texts.values())
    LSA_bis_features = TruncatedSVD(n_components=n_lsa).fit_transform(B)

    # Stem the documents
    stemmer = nltk.stem.SnowballStemmer('english')
    documents = [[stemmer.stem(word) for word in sentence.split(" ")] for sentence in texts]
    documents = [' '.join(doc) for doc in documents]
    vec_stem = TfidfVectorizer()
    C = vec_stem.fit_transform(documents)
    LSA_features_stem = TruncatedSVD(n_components=n_lsa).fit_transform(C)
    vec_count_stem = CountVectorizer()
    D = vec_count_stem.fit_transform(documents)
    LSA_bis_features_stem = TruncatedSVD(n_components=n_lsa).fit_transform(D)

    # Init
    N_train = len(list_pairs_train)
    N_test = len(list_pairs_test)
    X_train = np.zeros((N_train, 135))
    X_test = np.zeros((N_test, 135))

    cleaned_docs = data_full_word
    d2v_training_data = []
    for idx, doc in enumerate(cleaned_docs):
        d2v_training_data.append(LabeledSentence(words=doc, tags=[idx]))
        if idx % round(len(cleaned_docs) / 10) == 0:
            print(idx)
    d2v_dm = Doc2Vec(d2v_training_data, size=200, iter=6, window=5, min_count=3, workers=4)
    d2v_dm.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)
    d2v_dbow = Doc2Vec(d2v_training_data, size=my_p, window=4, iter=6, min_count=3, dm=0, workers=4)
    d2v_dbow.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

    # Create features for training
    for i in range(N_train):
        q1 = list_pairs_train[i][0]
        q2 = list_pairs_train[i][1]
        X_train[i, 0] = 1 - cosine_similarity(A[ids2ind[q1], :], A[ids2ind[q2], :])
        X_train[i, 2] = abs(len(texts[q1].split()) - len(texts[q2].split()))
        X_train[i, 3] = min(word_vectors.wv.wmdistance((texts[q1].lower()).split(),
                                                       (texts[q2].lower()).split()), 100000)  # WM distance
        X_train[i, 4] = min(matching((texts[q1].lower()).split(), (texts[q2].lower()).split()), 7)
        X_train[i, 5] = 1 - cosine_similarity(LSA_features[ids2ind[q1], :].reshape(1, -1),
                                              LSA_features[ids2ind[q2], :].reshape(1, -1))
        if (len(sequences[ids2ind[q1]]) > 0) and (len(sequences[ids2ind[q2]]) > 0):
            mean_pos_1 = (embedding_matrix[sequences[ids2ind[q1]], :]).sum(axis=0)
            mean_pos_2 = (embedding_matrix[sequences[ids2ind[q2]], :]).sum(axis=0)
            mean_pos_1 = mean_pos_1 / np.sqrt((mean_pos_1 ** 2).sum())
            mean_pos_2 = mean_pos_2 / np.sqrt((mean_pos_2 ** 2).sum())
            X_train[i, 1] = 1 - cosine_similarity(mean_pos_1.reshape(1, -1), mean_pos_2.reshape(1, -1))
            mean_pos_1_gv = (glove_embedding[sequences[ids2ind[q1]], :]).sum(axis=0)
            mean_pos_2_gv = (glove_embedding[sequences[ids2ind[q2]], :]).sum(axis=0)
            if np.sum(mean_pos_1_gv) > 0:
                mean_pos_1_gv = mean_pos_1_gv / np.sqrt((mean_pos_1_gv ** 2).sum())
            if np.sum(mean_pos_2_gv) > 0:
                mean_pos_2_gv = mean_pos_2_gv / np.sqrt((mean_pos_2_gv ** 2).sum())
            X_train[i, 15] = 1 - cosine_similarity(mean_pos_1_gv.reshape(1, -1), mean_pos_2_gv.reshape(1, -1))
            X_train[i, 17] = np.linalg.norm(mean_pos_1.reshape(1, -1) - mean_pos_2.reshape(1, -1))
            X_train[i, 18] = np.linalg.norm(mean_pos_1_gv.reshape(1, -1) - mean_pos_2_gv.reshape(1, -1))
            X_train[i, 35] = distance.cityblock(mean_pos_1, mean_pos_2)
            X_train[i, 36] = distance.jaccard(mean_pos_1, mean_pos_2)
            X_train[i, 37] = distance.canberra(mean_pos_1, mean_pos_2)
            X_train[i, 38] = distance.minkowski(mean_pos_1, mean_pos_2, 3)
            X_train[i, 39] = distance.braycurtis(mean_pos_1, mean_pos_2)
            X_train[i, 40] = distance.cityblock(mean_pos_1_gv, mean_pos_2_gv)
            X_train[i, 41] = distance.jaccard(mean_pos_1_gv, mean_pos_2_gv)
            X_train[i, 42] = distance.canberra(mean_pos_1_gv, mean_pos_2_gv)
            X_train[i, 43] = distance.minkowski(mean_pos_1_gv, mean_pos_2_gv, 3)
            X_train[i, 44] = distance.braycurtis(mean_pos_1_gv, mean_pos_2_gv)
        else:
            X_train[i, 1] = -1
            X_train[i, 15] = -1
            X_train[i, 17] = -1
            X_train[i, 18] = -1
            X_train[i, 35:44] = -1
        X_train[i, 6] = fuzz.partial_ratio(texts[q1], texts[q2]) / 100
        X_train[i, 7] = fuzz.QRatio(texts[q1], texts[q2]) / 100
        X_train[i, 8] = 1 - cosine_similarity(LSA_bis_features[ids2ind[q1], :].reshape(1, -1),
                                              LSA_bis_features[ids2ind[q2], :].reshape(1, -1))
        d2v1 = d2v_dm.infer_vector(cleaned_docs[ids2ind[q1]])
        d2v2 = d2v_dm.infer_vector(cleaned_docs[ids2ind[q2]])
        d2vbow1 = d2v_dbow.infer_vector(cleaned_docs[ids2ind[q1]])
        d2vbow2 = d2v_dbow.infer_vector(cleaned_docs[ids2ind[q2]])
        X_train[i, 9] = 1 - cosine_similarity(d2vbow1.reshape(1, -1), d2vbow2.reshape(1, -1))
        X_train[i, 10] = 1 - cosine_similarity(d2v1.reshape(1, -1), d2v2.reshape(1, -1))
        X_train[i, 11] = dice((texts[q1].lower()).split(), (texts[q2].lower()).split())
        X_train[i, 12] = jaccard((texts[q1].lower()).split(), (texts[q2].lower()).split())
        X_train[i, 13] = overlap((texts[q1].lower()).split(), (texts[q2].lower()).split())
        X_train[i, 14] = cosine_wd((texts[q1].lower()).split(), (texts[q2].lower()).split())
        X_train[i, 16] = np.linalg.norm(LSA_features[ids2ind[q1], :].reshape(1, -1) -
                                        LSA_features[ids2ind[q2], :].reshape(1, -1))
        X_train[i, 19] = np.linalg.norm(LSA_bis_features[ids2ind[q1], :].reshape(1, -1) -
                                        LSA_bis_features[ids2ind[q2], :].reshape(1, -1))
        X_train[i, 20] = np.linalg.norm(d2vbow1.reshape(1, -1) - d2vbow2.reshape(1, -1))
        X_train[i, 21] = np.linalg.norm(d2v1.reshape(1, -1) - d2v2.reshape(1, -1))
        X_train[i, 22] = np.linalg.norm(A[ids2ind[q1], :].todense() - A[ids2ind[q2], :].todense())
        X_train[i, 23] = max(min(presence_of_why(texts[q1].lower()), 1), min(presence_of_why(texts[q2].lower()), 1))
        X_train[i, 24] = max(min(presence_of_what(texts[q1].lower()), 1), min(presence_of_what(texts[q2].lower()), 1))
        X_train[i, 25] = max(min(presence_of_when(texts[q1].lower()), 1), min(presence_of_when(texts[q2].lower()), 1))
        X_train[i, 26] = max(min(presence_of_where(texts[q1].lower()), 1), min(presence_of_where(texts[q2].lower()), 1))
        X_train[i, 27] = max(min(presence_of_how(texts[q1].lower()), 1), min(presence_of_how(texts[q2].lower()), 1))
        X_train[i, 28] = min(min(presence_of_why(texts[q1].lower()), 1), min(presence_of_why(texts[q2].lower()), 1))
        X_train[i, 29] = min(min(presence_of_what(texts[q1].lower()), 1), min(presence_of_what(texts[q2].lower()), 1))
        X_train[i, 30] = min(min(presence_of_when(texts[q1].lower()), 1), min(presence_of_when(texts[q2].lower()), 1))
        X_train[i, 31] = min(min(presence_of_where(texts[q1].lower()), 1), min(presence_of_where(texts[q2].lower()), 1))
        X_train[i, 32] = min(min(presence_of_how(texts[q1].lower()), 1), min(presence_of_how(texts[q2].lower()), 1))
        X_train[i, 33] = fuzz.token_set_ratio(texts[q1], texts[q2]) / 100
        X_train[i, 34] = fuzz.token_sort_ratio(texts[q1], texts[q2]) / 100
        X_train[i, 45] = abs(len(texts[q1].lower()) - len(texts[q2].lower()))
        X_train[i, 46] = abs(len([j for j in texts[q1] if j == '?']) - len([j for j in texts[q2] if j == '?']))
        X_train[i, 47] = len(texts[q1].split()) + len(texts[q2].split())
        X_train[i, 48] = distance.cityblock(d2v1.reshape(1, -1), d2v2.reshape(1, -1))
        X_train[i, 49] = distance.jaccard(d2v1.reshape(1, -1), d2v2.reshape(1, -1))
        X_train[i, 50] = distance.canberra(d2v1.reshape(1, -1), d2v2.reshape(1, -1))
        X_train[i, 51] = distance.minkowski(d2v1.reshape(1, -1), d2v2.reshape(1, -1), 3)
        X_train[i, 52] = distance.braycurtis(d2v1.reshape(1, -1), d2v2.reshape(1, -1))
        X_train[i, 53] = distance.cityblock(LSA_features[ids2ind[q1], :].reshape(1, -1), LSA_features[ids2ind[q2], :].reshape(1, -1))
        X_train[i, 54] = distance.jaccard(LSA_features[ids2ind[q1], :].reshape(1, -1), LSA_features[ids2ind[q2], :].reshape(1, -1))
        X_train[i, 55] = distance.canberra(LSA_features[ids2ind[q1], :].reshape(1, -1), LSA_features[ids2ind[q2], :].reshape(1, -1))
        X_train[i, 56] = distance.minkowski(LSA_features[ids2ind[q1], :].reshape(1, -1), LSA_features[ids2ind[q2], :].reshape(1, -1), 3)
        X_train[i, 57] = distance.braycurtis(LSA_features[ids2ind[q1], :].reshape(1, -1), LSA_features[ids2ind[q2], :].reshape(1, -1))
        X_train[i, 58] = unmatching((texts[q1].lower()).split(), (texts[q2].lower()).split())
        X_train[i, 59] = is_first_word_same((texts[q1].lower()).split(), (texts[q2].lower()).split())
        X_train[i, 60] = is_last_word_same((texts[q1].lower()).split(), (texts[q2].lower()).split())
        # Using QID
        X_train[i, 61] = abs(int(q1) - int(q2))
        X_train[i, 62] = abs((int(q1) + int(q2)) / 2)
        X_train[i, 63] = abs(min(int(q1), int(q2)))
        # Using n-grams
        X_train[i, 64] = common_n_grams((texts[q1].lower()).split(), (texts[q2].lower()).split(), 2)
        X_train[i, 65] = common_n_grams((texts[q1].lower()).split(), (texts[q2].lower()).split(), 3)
        X_train[i, 66] = common_n_grams((texts[q1].lower()).split(), (texts[q2].lower()).split(), 4)
        X_train[i, 67] = common_n_grams((texts[q1].lower()).split(), (texts[q2].lower()).split(), 5)
        X_train[i, 68] = common_n_grams((texts[q1].lower()).split(), (texts[q2].lower()).split(), 6)
        X_train[i, 69] = not_common_n_grams((texts[q1].lower()).split(), (texts[q2].lower()).split(), 2)
        X_train[i, 70] = not_common_n_grams((texts[q1].lower()).split(), (texts[q2].lower()).split(), 3)
        X_train[i, 71] = not_common_n_grams((texts[q1].lower()).split(), (texts[q2].lower()).split(), 4)
        X_train[i, 72] = not_common_n_grams((texts[q1].lower()).split(), (texts[q2].lower()).split(), 5)
        X_train[i, 73] = not_common_n_grams((texts[q1].lower()).split(), (texts[q2].lower()).split(), 6)
        X_train[i, 74] = dice((texts[q1].lower()).split(), (texts[q2].lower()).split(), stemming=False)
        X_train[i, 75] = jaccard((texts[q1].lower()).split(), (texts[q2].lower()).split(), stemming=False)
        X_train[i, 76] = overlap((texts[q1].lower()).split(), (texts[q2].lower()).split(), stemming=False)
        X_train[i, 77] = cosine_wd((texts[q1].lower()).split(), (texts[q2].lower()).split(), stemming=False)
        X_train[i, 78] = min(matching((texts[q1].lower()).split(), (texts[q2].lower()).split(), stemming=False), 7)
        X_train[i, 79] = unmatching((texts[q1].lower()).split(), (texts[q2].lower()).split(), stemming=False)
        X_train[i, 80] = common_n_grams((texts[q1].lower()).split(), (texts[q2].lower()).split(), 2, stemming=False)
        X_train[i, 81] = common_n_grams((texts[q1].lower()).split(), (texts[q2].lower()).split(), 3, stemming=False)
        X_train[i, 82] = common_n_grams((texts[q1].lower()).split(), (texts[q2].lower()).split(), 4, stemming=False)
        X_train[i, 83] = common_n_grams((texts[q1].lower()).split(), (texts[q2].lower()).split(), 5, stemming=False)
        X_train[i, 84] = common_n_grams((texts[q1].lower()).split(), (texts[q2].lower()).split(), 6, stemming=False)
        X_train[i, 85] = not_common_n_grams((texts[q1].lower()).split(), (texts[q2].lower()).split(), 2, stemming=False)
        X_train[i, 86] = not_common_n_grams((texts[q1].lower()).split(), (texts[q2].lower()).split(), 3, stemming=False)
        X_train[i, 87] = not_common_n_grams((texts[q1].lower()).split(), (texts[q2].lower()).split(), 4, stemming=False)
        X_train[i, 88] = not_common_n_grams((texts[q1].lower()).split(), (texts[q2].lower()).split(), 5, stemming=False)
        X_train[i, 89] = not_common_n_grams((texts[q1].lower()).split(), (texts[q2].lower()).split(), 6, stemming=False)
        X_train[i, 90] = distance.cityblock(LSA_bis_features[ids2ind[q1], :].reshape(1, -1), LSA_bis_features[ids2ind[q2], :].reshape(1, -1))
        X_train[i, 91] = distance.jaccard(LSA_bis_features[ids2ind[q1], :].reshape(1, -1), LSA_bis_features[ids2ind[q2], :].reshape(1, -1))
        X_train[i, 92] = distance.canberra(LSA_bis_features[ids2ind[q1], :].reshape(1, -1), LSA_bis_features[ids2ind[q2], :].reshape(1, -1))
        X_train[i, 93] = distance.minkowski(LSA_bis_features[ids2ind[q1], :].reshape(1, -1), LSA_bis_features[ids2ind[q2], :].reshape(1, -1), 3)
        X_train[i, 94] = distance.braycurtis(LSA_bis_features[ids2ind[q1], :].reshape(1, -1), LSA_bis_features[ids2ind[q2], :].reshape(1, -1))
        X_train[i, 95] = distance.cityblock(A[ids2ind[q1], :].todense().reshape(1, -1), A[ids2ind[q2], :].todense().reshape(1, -1))
        X_train[i, 96] = distance.jaccard(A[ids2ind[q1], :].todense().reshape(1, -1), A[ids2ind[q2], :].todense().reshape(1, -1))
        X_train[i, 97] = distance.canberra(A[ids2ind[q1], :].todense().reshape(1, -1), A[ids2ind[q2], :].todense().reshape(1, -1))
        X_train[i, 98] = distance.minkowski(A[ids2ind[q1], :].todense().reshape(1, -1), A[ids2ind[q2], :].todense().reshape(1, -1), 3)
        X_train[i, 99] = distance.braycurtis(A[ids2ind[q1], :].todense().reshape(1, -1), A[ids2ind[q2], :].todense().reshape(1, -1))
        X_train[i, 100] = 1 - cosine_similarity(B[ids2ind[q1], :], B[ids2ind[q2], :])
        X_train[i, 101] = np.linalg.norm(B[ids2ind[q1], :].todense() - B[ids2ind[q2], :].todense())
        X_train[i, 102] = distance.cityblock(B[ids2ind[q1], :].todense().reshape(1, -1), B[ids2ind[q2], :].todense().reshape(1, -1))
        X_train[i, 103] = distance.jaccard(B[ids2ind[q1], :].todense().reshape(1, -1), B[ids2ind[q2], :].todense().reshape(1, -1))
        X_train[i, 104] = distance.canberra(B[ids2ind[q1], :].todense().reshape(1, -1), B[ids2ind[q2], :].todense().reshape(1, -1))
        X_train[i, 105] = distance.minkowski(B[ids2ind[q1], :].todense().reshape(1, -1), B[ids2ind[q2], :].todense().reshape(1, -1), 3)
        X_train[i, 106] = distance.braycurtis(B[ids2ind[q1], :].todense().reshape(1, -1), B[ids2ind[q2], :].todense().reshape(1, -1))
        X_train[i, 107] = 1 - cosine_similarity(C[ids2ind[q1], :], C[ids2ind[q2], :])
        X_train[i, 108] = np.linalg.norm(C[ids2ind[q1], :].todense() - C[ids2ind[q2], :].todense())
        X_train[i, 109] = distance.cityblock(C[ids2ind[q1], :].todense().reshape(1, -1), C[ids2ind[q2], :].todense().reshape(1, -1))
        X_train[i, 110] = distance.jaccard(C[ids2ind[q1], :].todense().reshape(1, -1), C[ids2ind[q2], :].todense().reshape(1, -1))
        X_train[i, 111] = distance.canberra(C[ids2ind[q1], :].todense().reshape(1, -1), C[ids2ind[q2], :].todense().reshape(1, -1))
        X_train[i, 112] = distance.minkowski(C[ids2ind[q1], :].todense().reshape(1, -1), C[ids2ind[q2], :].todense().reshape(1, -1), 3)
        X_train[i, 113] = distance.braycurtis(C[ids2ind[q1], :].todense().reshape(1, -1), C[ids2ind[q2], :].todense().reshape(1, -1))
        X_train[i, 114] = 1 - cosine_similarity(D[ids2ind[q1], :], D[ids2ind[q2], :])
cosine_similarity(D[ids2ind[q1],:], D[ids2ind[q2],:]) X_train[i,115] = np.linalg.norm(D[ids2ind[q1],:].todense() - D[ids2ind[q2],:].todense()) X_train[i,116] = distance.cityblock(D[ids2ind[q1],:].todense().reshape(1, -1),D[ids2ind[q2],:].todense().reshape(1, -1)) X_train[i,117] = distance.jaccard(D[ids2ind[q1],:].todense().reshape(1, -1),D[ids2ind[q2],:].todense().reshape(1, -1)) X_train[i,118] = distance.canberra(D[ids2ind[q1],:].todense().reshape(1, -1),D[ids2ind[q2],:].todense().reshape(1, -1)) X_train[i,119] = distance.minkowski(D[ids2ind[q1],:].todense().reshape(1, -1),D[ids2ind[q2],:].todense().reshape(1, -1),3) X_train[i,120] = distance.braycurtis(D[ids2ind[q1],:].todense().reshape(1, -1),D[ids2ind[q2],:].todense().reshape(1, -1)) X_train[i,121] = 1 - cosine_similarity(LSA_features_stem[ids2ind[q1],:].reshape(1, -1), LSA_features_stem[ids2ind[q2],:].reshape(1, -1)) X_train[i,122] = np.linalg.norm(LSA_features_stem[ids2ind[q1],:].reshape(1, -1)- LSA_features_stem[ids2ind[q2],:].reshape(1, -1)) X_train[i,123] = distance.cityblock(LSA_features_stem[ids2ind[q1],:].reshape(1, -1),LSA_features_stem[ids2ind[q2],:].reshape(1, -1)) X_train[i,124] = distance.jaccard(LSA_features_stem[ids2ind[q1],:].reshape(1, -1),LSA_features_stem[ids2ind[q2],:].reshape(1, -1)) X_train[i,125] = distance.canberra(LSA_features_stem[ids2ind[q1],:].reshape(1, -1),LSA_features_stem[ids2ind[q2],:].reshape(1, -1)) X_train[i,126] = distance.minkowski(LSA_features_stem[ids2ind[q1],:].reshape(1, -1),LSA_features_stem[ids2ind[q2],:].reshape(1, -1),3) X_train[i,127] = distance.braycurtis(LSA_features_stem[ids2ind[q1],:].reshape(1, -1),LSA_features_stem[ids2ind[q2],:].reshape(1, -1)) X_train[i,128] = 1 - cosine_similarity(LSA_bis_features_stem[ids2ind[q1],:].reshape(1, -1), LSA_bis_features_stem[ids2ind[q2],:].reshape(1, -1)) X_train[i,129] = np.linalg.norm(LSA_bis_features_stem[ids2ind[q1],:].reshape(1, -1)- LSA_bis_features_stem[ids2ind[q2],:].reshape(1, -1)) X_train[i,130] = distance.cityblock(LSA_bis_features_stem[ids2ind[q1],:].reshape(1, -1),LSA_bis_features_stem[ids2ind[q2],:].reshape(1, -1)) X_train[i,131] = distance.jaccard(LSA_bis_features_stem[ids2ind[q1],:].reshape(1, -1),LSA_bis_features_stem[ids2ind[q2],:].reshape(1, -1)) X_train[i,132] = distance.canberra(LSA_bis_features_stem[ids2ind[q1],:].reshape(1, -1),LSA_bis_features_stem[ids2ind[q2],:].reshape(1, -1)) X_train[i,133] = distance.minkowski(LSA_bis_features_stem[ids2ind[q1],:].reshape(1, -1),LSA_bis_features_stem[ids2ind[q2],:].reshape(1, -1),3) X_train[i,134] = distance.braycurtis(LSA_bis_features_stem[ids2ind[q1],:].reshape(1, -1),LSA_bis_features_stem[ids2ind[q2],:].reshape(1, -1)) #####Create features for test########### for i in range(N_test): q1 = list_pairs_test[i][0] q2 = list_pairs_test[i][1] X_test[i,0] = 1- cosine_similarity(A[ids2ind[q1],:], A[ids2ind[q2],:]) X_test[i,2] = abs(len(texts[q1].split()) - len(texts[q2].split())) X_test[i,3] = min(word_vectors.wv.wmdistance((texts[q1].lower()).split(),(texts[q2].lower()).split()),100000) #WM distance X_test[i,4] = min(matching((texts[q1].lower()).split(),(texts[q2].lower()).split()),7) X_test[i,5]= 1- cosine_similarity(LSA_features[ids2ind[q1],:].reshape(1, -1), LSA_features[ids2ind[q2],:].reshape(1, -1)) if (len(sequences[ids2ind[q1]])>0)and(len(sequences[ids2ind[q2]])>0) : mean_pos_1 = (embedding_matrix[sequences[ids2ind[q1]],:]).sum(axis=0) mean_pos_2 = (embedding_matrix[sequences[ids2ind[q2]],:]).sum(axis=0) mean_pos_1= mean_pos_1 / np.sqrt((mean_pos_1 ** 2).sum()) mean_pos_2= mean_pos_2 / 
np.sqrt((mean_pos_2 ** 2).sum()) X_test[i,1] = 1- cosine_similarity(mean_pos_1.reshape(1, -1),mean_pos_2.reshape(1, -1)) mean_pos_1_gv = (glove_embedding[sequences[ids2ind[q1]],:]).sum(axis=0) mean_pos_2_gv = (glove_embedding[sequences[ids2ind[q2]],:]).sum(axis=0) if np.sum(mean_pos_1_gv)!=0: mean_pos_1_gv= mean_pos_1_gv / np.sqrt((mean_pos_1_gv ** 2).sum()) if np.sum(mean_pos_2_gv)!=0: mean_pos_2_gv= mean_pos_2_gv / np.sqrt((mean_pos_2_gv ** 2).sum()) X_test[i,15] = 1- cosine_similarity(mean_pos_1_gv.reshape(1, -1),mean_pos_2_gv.reshape(1, -1)) X_test[i,17] = np.linalg.norm(mean_pos_1.reshape(1, -1)-mean_pos_2.reshape(1, -1)) X_test[i,18] = np.linalg.norm(mean_pos_1_gv.reshape(1, -1)-mean_pos_2_gv.reshape(1, -1)) X_test[i,35]= distance.cityblock(mean_pos_1,mean_pos_2) X_test[i,36]= distance.jaccard(mean_pos_1,mean_pos_2) X_test[i,37]= distance.canberra(mean_pos_1,mean_pos_2) X_test[i,38]= distance.minkowski(mean_pos_1,mean_pos_2,3) X_test[i,39]= distance.braycurtis(mean_pos_1,mean_pos_2) X_test[i,40]= distance.cityblock(mean_pos_1_gv,mean_pos_2_gv) X_test[i,41]= distance.jaccard(mean_pos_1_gv,mean_pos_2_gv) X_test[i,42]= distance.canberra(mean_pos_1_gv,mean_pos_2_gv) X_test[i,43]= distance.minkowski(mean_pos_1_gv,mean_pos_2_gv,3) X_test[i,44]= distance.braycurtis(mean_pos_1_gv,mean_pos_2_gv) else: X_test[i,1] = -1 X_test[i,15]= -1 X_test[i,17] = -1 X_test[i,18] = -1 X_test[i,35:44]=-1 X_test[i,6] = fuzz.partial_ratio(texts[q1],texts[q2])/100 X_test[i,7] = fuzz.QRatio(texts[q1],texts[q2])/100 X_test[i,8] = 1 - cosine_similarity(LSA_bis_features[ids2ind[q1],:].reshape(1, -1), LSA_bis_features[ids2ind[q2],:].reshape(1, -1)) d2v1 = d2v_dm.infer_vector(cleaned_docs[ids2ind[q1]]) d2v2 = d2v_dm.infer_vector(cleaned_docs[ids2ind[q2]]) d2vbow1 = d2v_dbow.infer_vector(cleaned_docs[ids2ind[q1]]) d2vbow2 = d2v_dbow.infer_vector(cleaned_docs[ids2ind[q2]]) X_test[i,9] = 1 - cosine_similarity(d2vbow1.reshape(1, -1), d2vbow2.reshape(1, -1)) X_test[i,10] = 1 - cosine_similarity(d2v1.reshape(1, -1), d2v2.reshape(1, -1)) X_test[i,11] = dice((texts[q1].lower()).split(),(texts[q2].lower()).split()) X_test[i,12] = jaccard((texts[q1].lower()).split(),(texts[q2].lower()).split()) X_test[i,13] = overlap((texts[q1].lower()).split(),(texts[q2].lower()).split()) X_test[i,14] = cosine_wd((texts[q1].lower()).split(),(texts[q2].lower()).split()) X_test[i,16]= np.linalg.norm(LSA_features[ids2ind[q1],:].reshape(1, -1) - LSA_features[ids2ind[q2],:].reshape(1, -1)) X_test[i,19] = np.linalg.norm(LSA_bis_features[ids2ind[q1],:].reshape(1, -1)- LSA_bis_features[ids2ind[q2],:].reshape(1, -1)) X_test[i,20] = np.linalg.norm(d2vbow1.reshape(1, -1) - d2vbow2.reshape(1, -1)) X_test[i,21] = np.linalg.norm(d2v1.reshape(1, -1) - d2v2.reshape(1, -1)) X_test[i,22] = np.linalg.norm(A[ids2ind[q1],:].todense() - A[ids2ind[q2],:].todense()) X_test[i,23]= max(min(presence_of_why(texts[q1].lower()),1),min(presence_of_why(texts[q2].lower()),1)) X_test[i,24]= max(min(presence_of_what(texts[q1].lower()),1),min(presence_of_what(texts[q2].lower()),1)) X_test[i,25]= max(min(presence_of_when(texts[q1].lower()),1),min(presence_of_when(texts[q2].lower()),1)) X_test[i,26]= max(min(presence_of_where(texts[q1].lower()),1),min(presence_of_where(texts[q2].lower()),1)) X_test[i,27]= max(min(presence_of_how(texts[q1].lower()),1),min(presence_of_how(texts[q2].lower()),1)) X_test[i,28]= min(min(presence_of_why(texts[q1].lower()),1),min(presence_of_why(texts[q2].lower()),1)) X_test[i,29]= 
min(min(presence_of_what(texts[q1].lower()),1),min(presence_of_what(texts[q2].lower()),1)) X_test[i,30]= min(min(presence_of_when(texts[q1].lower()),1),min(presence_of_when(texts[q2].lower()),1)) X_test[i,31]= min(min(presence_of_where(texts[q1].lower()),1),min(presence_of_where(texts[q2].lower()),1)) X_test[i,32]= min(min(presence_of_how(texts[q1].lower()),1),min(presence_of_how(texts[q2].lower()),1)) X_test[i,33]= fuzz.token_set_ratio(texts[q1],texts[q2])/100 X_test[i,34]= fuzz.token_sort_ratio(texts[q1],texts[q2])/100 X_test[i,45] = abs(len(texts[q1].lower())-len(texts[q2].lower())) X_test[i,46] = abs(len([j for j in texts[q1] if j=='?'])-len([j for j in texts[q2] if j=='?'])) X_test[i,47] = len(texts[q1].split()) + len(texts[q2].split()) X_test[i,48] = distance.cityblock(d2v1.reshape(1, -1),d2v2.reshape(1, -1)) X_test[i,49] = distance.jaccard(d2v1.reshape(1, -1),d2v2.reshape(1, -1)) X_test[i,50] = distance.canberra(d2v1.reshape(1, -1),d2v2.reshape(1, -1)) X_test[i,51] = distance.minkowski(d2v1.reshape(1, -1),d2v2.reshape(1, -1),3) X_test[i,52] = distance.braycurtis(d2v1.reshape(1, -1),d2v2.reshape(1, -1)) X_test[i,53] = distance.cityblock(LSA_features[ids2ind[q1],:].reshape(1, -1),LSA_features[ids2ind[q2],:].reshape(1, -1)) X_test[i,54] = distance.jaccard(LSA_features[ids2ind[q1],:].reshape(1, -1),LSA_features[ids2ind[q2],:].reshape(1, -1)) X_test[i,55] = distance.canberra(LSA_features[ids2ind[q1],:].reshape(1, -1),LSA_features[ids2ind[q2],:].reshape(1, -1)) X_test[i,56] = distance.minkowski(LSA_features[ids2ind[q1],:].reshape(1, -1),LSA_features[ids2ind[q2],:].reshape(1, -1),3) X_test[i,57] = distance.braycurtis(LSA_features[ids2ind[q1],:].reshape(1, -1),LSA_features[ids2ind[q2],:].reshape(1, -1)) X_test[i,58] = unmatching((texts[q1].lower()).split(),(texts[q2].lower()).split()) X_test[i,59] = is_first_word_same((texts[q1].lower()).split(),(texts[q2].lower()).split()) X_test[i,60] = is_last_word_same((texts[q1].lower()).split(),(texts[q2].lower()).split()) X_test[i,61] = abs(int(q1) - int(q2)) X_test[i,62] = abs((int(q1) + int(q2))/2) X_test[i,63] = abs(min(int(q1),int(q2))) X_test[i,64] = common_n_grams((texts[q1].lower()).split(),(texts[q2].lower()).split(),2) X_test[i,65] = common_n_grams((texts[q1].lower()).split(),(texts[q2].lower()).split(),3) X_test[i,66] = common_n_grams((texts[q1].lower()).split(),(texts[q2].lower()).split(),4) X_test[i,67] = common_n_grams((texts[q1].lower()).split(),(texts[q2].lower()).split(),5) X_test[i,68] = common_n_grams((texts[q1].lower()).split(),(texts[q2].lower()).split(),6) X_test[i,69] = not_common_n_grams((texts[q1].lower()).split(),(texts[q2].lower()).split(),2) X_test[i,70] = not_common_n_grams((texts[q1].lower()).split(),(texts[q2].lower()).split(),3) X_test[i,71] = not_common_n_grams((texts[q1].lower()).split(),(texts[q2].lower()).split(),4) X_test[i,72] = not_common_n_grams((texts[q1].lower()).split(),(texts[q2].lower()).split(),5) X_test[i,73] = not_common_n_grams((texts[q1].lower()).split(),(texts[q2].lower()).split(),6) X_test[i,74] = dice((texts[q1].lower()).split(),(texts[q2].lower()).split(),stemming=False) X_test[i,75] = jaccard((texts[q1].lower()).split(),(texts[q2].lower()).split(),stemming=False) X_test[i,76] = overlap((texts[q1].lower()).split(),(texts[q2].lower()).split(),stemming=False) X_test[i,77] = cosine_wd((texts[q1].lower()).split(),(texts[q2].lower()).split(),stemming=False) X_test[i,78] = min(matching((texts[q1].lower()).split(),(texts[q2].lower()).split(),stemming=False),7) X_test[i,79] = 
unmatching((texts[q1].lower()).split(),(texts[q2].lower()).split(),stemming=False) X_test[i,80] = common_n_grams((texts[q1].lower()).split(),(texts[q2].lower()).split(),2,stemming=False) X_test[i,81] = common_n_grams((texts[q1].lower()).split(),(texts[q2].lower()).split(),3,stemming=False) X_test[i,82] = common_n_grams((texts[q1].lower()).split(),(texts[q2].lower()).split(),4,stemming=False) X_test[i,83] = common_n_grams((texts[q1].lower()).split(),(texts[q2].lower()).split(),5,stemming=False) X_test[i,84] = common_n_grams((texts[q1].lower()).split(),(texts[q2].lower()).split(),6,stemming=False) X_test[i,85] = not_common_n_grams((texts[q1].lower()).split(),(texts[q2].lower()).split(),2,stemming=False) X_test[i,86] = not_common_n_grams((texts[q1].lower()).split(),(texts[q2].lower()).split(),3,stemming=False) X_test[i,87] = not_common_n_grams((texts[q1].lower()).split(),(texts[q2].lower()).split(),4,stemming=False) X_test[i,88] = not_common_n_grams((texts[q1].lower()).split(),(texts[q2].lower()).split(),5,stemming=False) X_test[i,89] = not_common_n_grams((texts[q1].lower()).split(),(texts[q2].lower()).split(),6,stemming=False) X_test[i,90] = distance.cityblock(LSA_bis_features[ids2ind[q1],:].reshape(1, -1),LSA_bis_features[ids2ind[q2],:].reshape(1, -1)) X_test[i,91] = distance.jaccard(LSA_bis_features[ids2ind[q1],:].reshape(1, -1),LSA_bis_features[ids2ind[q2],:].reshape(1, -1)) X_test[i,92] = distance.canberra(LSA_bis_features[ids2ind[q1],:].reshape(1, -1),LSA_bis_features[ids2ind[q2],:].reshape(1, -1)) X_test[i,93] = distance.minkowski(LSA_bis_features[ids2ind[q1],:].reshape(1, -1),LSA_bis_features[ids2ind[q2],:].reshape(1, -1),3) X_test[i,94] = distance.braycurtis(LSA_bis_features[ids2ind[q1],:].reshape(1, -1),LSA_bis_features[ids2ind[q2],:].reshape(1, -1)) X_test[i,95] = distance.cityblock(A[ids2ind[q1],:].todense().reshape(1, -1),A[ids2ind[q2],:].todense().reshape(1, -1)) X_test[i,96] = distance.jaccard(A[ids2ind[q1],:].todense().reshape(1, -1),A[ids2ind[q2],:].todense().reshape(1, -1)) X_test[i,97] = distance.canberra(A[ids2ind[q1],:].todense().reshape(1, -1),A[ids2ind[q2],:].todense().reshape(1, -1)) X_test[i,98] = distance.minkowski(A[ids2ind[q1],:].todense().reshape(1, -1),A[ids2ind[q2],:].todense().reshape(1, -1),3) X_test[i,99] = distance.braycurtis(A[ids2ind[q1],:].todense().reshape(1, -1),A[ids2ind[q2],:].todense().reshape(1, -1)) X_test[i,100] = 1- cosine_similarity(B[ids2ind[q1],:], B[ids2ind[q2],:]) X_test[i,101] = np.linalg.norm(B[ids2ind[q1],:].todense() - B[ids2ind[q2],:].todense()) X_test[i,102] = distance.cityblock(B[ids2ind[q1],:].todense().reshape(1, -1),B[ids2ind[q2],:].todense().reshape(1, -1)) X_test[i,103] = distance.jaccard(B[ids2ind[q1],:].todense().reshape(1, -1),B[ids2ind[q2],:].todense().reshape(1, -1)) X_test[i,104] = distance.canberra(B[ids2ind[q1],:].todense().reshape(1, -1),B[ids2ind[q2],:].todense().reshape(1, -1)) X_test[i,105] = distance.minkowski(B[ids2ind[q1],:].todense().reshape(1, -1),B[ids2ind[q2],:].todense().reshape(1, -1),3) X_test[i,106] = distance.braycurtis(B[ids2ind[q1],:].todense().reshape(1, -1),B[ids2ind[q2],:].todense().reshape(1, -1)) X_test[i,107] = 1- cosine_similarity(C[ids2ind[q1],:], C[ids2ind[q2],:]) X_test[i,108] = np.linalg.norm(C[ids2ind[q1],:].todense() - C[ids2ind[q2],:].todense()) X_test[i,109] = distance.cityblock(C[ids2ind[q1],:].todense().reshape(1, -1),C[ids2ind[q2],:].todense().reshape(1, -1)) X_test[i,110] = distance.jaccard(C[ids2ind[q1],:].todense().reshape(1, -1),C[ids2ind[q2],:].todense().reshape(1, -1)) 
X_test[i,111] = distance.canberra(C[ids2ind[q1],:].todense().reshape(1, -1),C[ids2ind[q2],:].todense().reshape(1, -1)) X_test[i,112] = distance.minkowski(C[ids2ind[q1],:].todense().reshape(1, -1),C[ids2ind[q2],:].todense().reshape(1, -1),3) X_test[i,113] = distance.braycurtis(C[ids2ind[q1],:].todense().reshape(1, -1),C[ids2ind[q2],:].todense().reshape(1, -1)) X_test[i,114] = 1- cosine_similarity(D[ids2ind[q1],:], D[ids2ind[q2],:]) X_test[i,115] = np.linalg.norm(D[ids2ind[q1],:].todense() - D[ids2ind[q2],:].todense()) X_test[i,116] = distance.cityblock(D[ids2ind[q1],:].todense().reshape(1, -1),D[ids2ind[q2],:].todense().reshape(1, -1)) X_test[i,117] = distance.jaccard(D[ids2ind[q1],:].todense().reshape(1, -1),D[ids2ind[q2],:].todense().reshape(1, -1)) X_test[i,118] = distance.canberra(D[ids2ind[q1],:].todense().reshape(1, -1),D[ids2ind[q2],:].todense().reshape(1, -1)) X_test[i,119] = distance.minkowski(D[ids2ind[q1],:].todense().reshape(1, -1),D[ids2ind[q2],:].todense().reshape(1, -1),3) X_test[i,120] = distance.braycurtis(D[ids2ind[q1],:].todense().reshape(1, -1),D[ids2ind[q2],:].todense().reshape(1, -1)) X_test[i,121] = 1 - cosine_similarity(LSA_features_stem[ids2ind[q1],:].reshape(1, -1), LSA_features_stem[ids2ind[q2],:].reshape(1, -1)) X_test[i,122] = np.linalg.norm(LSA_features_stem[ids2ind[q1],:].reshape(1, -1)- LSA_features_stem[ids2ind[q2],:].reshape(1, -1)) X_test[i,123] = distance.cityblock(LSA_features_stem[ids2ind[q1],:].reshape(1, -1),LSA_features_stem[ids2ind[q2],:].reshape(1, -1)) X_test[i,124] = distance.jaccard(LSA_features_stem[ids2ind[q1],:].reshape(1, -1),LSA_features_stem[ids2ind[q2],:].reshape(1, -1)) X_test[i,125] = distance.canberra(LSA_features_stem[ids2ind[q1],:].reshape(1, -1),LSA_features_stem[ids2ind[q2],:].reshape(1, -1)) X_test[i,126] = distance.minkowski(LSA_features_stem[ids2ind[q1],:].reshape(1, -1),LSA_features_stem[ids2ind[q2],:].reshape(1, -1),3) X_test[i,127] = distance.braycurtis(LSA_features_stem[ids2ind[q1],:].reshape(1, -1),LSA_features_stem[ids2ind[q2],:].reshape(1, -1)) X_test[i,128] = 1 - cosine_similarity(LSA_bis_features_stem[ids2ind[q1],:].reshape(1, -1), LSA_bis_features_stem[ids2ind[q2],:].reshape(1, -1)) X_test[i,129] = np.linalg.norm(LSA_bis_features_stem[ids2ind[q1],:].reshape(1, -1)- LSA_bis_features_stem[ids2ind[q2],:].reshape(1, -1)) X_test[i,130] = distance.cityblock(LSA_bis_features_stem[ids2ind[q1],:].reshape(1, -1),LSA_bis_features_stem[ids2ind[q2],:].reshape(1, -1)) X_test[i,131] = distance.jaccard(LSA_bis_features_stem[ids2ind[q1],:].reshape(1, -1),LSA_bis_features_stem[ids2ind[q2],:].reshape(1, -1)) X_test[i,132] = distance.canberra(LSA_bis_features_stem[ids2ind[q1],:].reshape(1, -1),LSA_bis_features_stem[ids2ind[q2],:].reshape(1, -1)) X_test[i,133] = distance.minkowski(LSA_bis_features_stem[ids2ind[q1],:].reshape(1, -1),LSA_bis_features_stem[ids2ind[q2],:].reshape(1, -1),3) X_test[i,134] = distance.braycurtis(LSA_bis_features_stem[ids2ind[q1],:].reshape(1, -1),LSA_bis_features_stem[ids2ind[q2],:].reshape(1, -1)) return X_train,X_test
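# --- Illustrative call (not from the original code) ---
# A sketch of how create_all_simple_features might be invoked once the
# upstream objects exist; texts, ids2ind, word_vectors, embedding_matrix,
# glove_embedding, sequences and data_full_word are assumed to come from
# preprocessing steps outside this file.
# X_train, X_test = create_all_simple_features(
#     list_pairs_train, list_pairs_test, texts, ids2ind, word_vectors,
#     embedding_matrix, glove_embedding, sequences, data_full_word,
#     my_p=50, n_lsa=40)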
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz

df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
len_train = df_train.shape[0]

df_feat = pd.DataFrame()
# Stack train and test so each fuzzy feature is computed once over all pairs.
df_data = pd.concat([
    df_train[['question1', 'question2']],
    df_test[['question1', 'question2']]
], axis=0)

df_feat['fuzz_qratio'] = df_data.apply(
    lambda row: fuzz.QRatio(str(row['question1']), str(row['question2'])), axis=1)
df_feat['fuzz_WRatio'] = df_data.apply(
    lambda row: fuzz.WRatio(str(row['question1']), str(row['question2'])), axis=1)
df_feat['fuzz_partial_ratio'] = df_data.apply(lambda row: fuzz.partial_ratio(
    str(row['question1']), str(row['question2'])), axis=1)
df_feat['fuzz_partial_token_set_ratio'] = df_data.apply(
    lambda row: fuzz.partial_token_set_ratio(str(row['question1']),
                                             str(row['question2'])), axis=1)
df_feat['fuzz_partial_token_sort_ratio'] = df_data.apply(
    lambda row: fuzz.partial_token_sort_ratio(str(row['question1']),
                                              str(row['question2'])), axis=1)
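# len_train is computed above but unused in this fragment; a sketch of how the
# stacked features would typically be split back into train and test parts
# (the variable names below are illustrative, not from the original script).
# .iloc is used because the concatenated frame has duplicated index values.
x_train_feat = df_feat.iloc[:len_train]
x_test_feat = df_feat.iloc[len_train:]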
x_train['shared_words_length'] = temp_df['allR'].apply(lambda x: float(x.split(':')[4]))

# Set 2
x_train['tfidf'] = df_train.apply(tfidf_word_match_share, axis=1, raw=True)

# Set 3
x_train['q1_word_count'] = df_train['question1'].apply(lambda x: len(str(x).lower().split()))
x_train['q2_word_count'] = df_train['question2'].apply(lambda x: len(str(x).lower().split()))
x_train['diff_word_count'] = x_train['q1_word_count'] - x_train['q2_word_count']
x_train['q1_char_count_withspace'] = df_train['question1'].apply(lambda x: len(str(x)))
x_train['q2_char_count_withspace'] = df_train['question2'].apply(lambda x: len(str(x)))
x_train['diff_char_count_withspace'] = (x_train['q1_char_count_withspace']
                                        - x_train['q2_char_count_withspace'])

# Set 4
x_train['fuzz_qratio'] = df_train.apply(lambda x: fuzz.QRatio(str(x['question1']), str(x['question2'])), axis=1)
x_train['fuzz_WRatio'] = df_train.apply(lambda x: fuzz.WRatio(str(x['question1']), str(x['question2'])), axis=1)
x_train['fuzz_partial_ratio'] = df_train.apply(lambda x: fuzz.partial_ratio(str(x['question1']), str(x['question2'])), axis=1)
x_train['fuzz_partial_token_set_ratio'] = df_train.apply(lambda x: fuzz.partial_token_set_ratio(str(x['question1']), str(x['question2'])), axis=1)
x_train['fuzz_partial_token_sort_ratio'] = df_train.apply(lambda x: fuzz.partial_token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1)
x_train['fuzz_token_set_ratio'] = df_train.apply(lambda x: fuzz.token_set_ratio(str(x['question1']), str(x['question2'])), axis=1)
x_train['fuzz_token_sort_ratio'] = df_train.apply(lambda x: fuzz.token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1)

# Set 5
x_train['jaccard_dist'] = df_train.apply(jaccard_dist, axis=1)
x_train['cosine_dist'] = df_train.apply(cosine_dist, axis=1)

del temp_df
################################################################################
################################################################################
if __name__ == '__main__':
    import pickle
    from fuzzywuzzy import fuzz as fuzzy

    # Pick one input/output pair; the original assigned the train paths and
    # then immediately overwrote them with the test paths, so the train pair
    # is kept here as a comment.
    # data_path = './data/train.pickle'
    # output_path = './data/train_fuzzy.pickle'
    data_path = './data/test.pickle'
    output_path = './data/test_fuzzy.pickle'

    with open(data_path, 'rb') as f:
        data = pickle.load(f)

    result = []
    for i in range(len(data)):
        sentence_q1 = ' '.join(data[i]['question1'])
        sentence_q2 = ' '.join(data[i]['question2'])
        qratio = fuzzy.QRatio(sentence_q1, sentence_q2)
        wratio = fuzzy.WRatio(sentence_q1, sentence_q2)
        ratio = fuzzy.ratio(sentence_q1, sentence_q2)
        partial_ratio = fuzzy.partial_ratio(sentence_q1, sentence_q2)
        partial_token_set_ratio = fuzzy.partial_token_set_ratio(
            sentence_q1, sentence_q2)
        partial_token_sort_ratio = fuzzy.partial_token_sort_ratio(
            sentence_q1, sentence_q2)
        token_set_ratio = fuzzy.token_set_ratio(sentence_q1, sentence_q2)
        token_sort_ratio = fuzzy.token_sort_ratio(sentence_q1, sentence_q2)
        # partial_token_set_ratio was appended after the fact in the original;
        # the resulting order of the eight scores is preserved here.
        fuzzyee = [
            qratio, wratio, ratio, partial_ratio,
            partial_token_sort_ratio, token_set_ratio, token_sort_ratio,
            partial_token_set_ratio
        ]
        result.append(fuzzyee)  # the original fragment never collected the scores

    # Write the scores out (the original fragment stopped before using output_path)
    with open(output_path, 'wb') as f:
        pickle.dump(result, f)
def engineer(data):
    import pickle
    import pandas as pd
    import numpy as np
    import gensim
    from fuzzywuzzy import fuzz
    import nltk
    from nltk import pos_tag
    from nltk.stem.wordnet import WordNetLemmatizer
    from nltk.corpus import stopwords
    from tqdm import tqdm
    from scipy.spatial.distance import cosine, euclidean
    from nltk import word_tokenize
    import re

    stop_words = stopwords.words('english')

    def clean_text(text):
        """Pre-process a question and return it as a normalized string."""
        text = str(text)
        text = text.lower()
        # Clean the text
        text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
        text = re.sub(r"what's", "what is ", text)
        text = re.sub(r"\'s", " is ", text)
        text = re.sub(r"\'ve", " have ", text)
        text = re.sub(r"can't", "cannot ", text)
        text = re.sub(r"n't", " not ", text)
        text = re.sub(r"i'm", "i am ", text)
        text = re.sub(r"\'re", " are ", text)
        text = re.sub(r"\'d", " would ", text)
        text = re.sub(r"\'ll", " will ", text)
        text = re.sub(r",", " ", text)
        text = re.sub(r"\.", " ", text)
        text = re.sub(r"!", " ! ", text)
        text = re.sub(r"\/", " ", text)
        text = re.sub(r"\^", " ^ ", text)
        text = re.sub(r"\+", " + ", text)
        text = re.sub(r"[a-z]+\-[a-z]+", "", text)
        text = re.sub(r"[a-z]+\-", "", text)
        text = re.sub(r"\-[a-z]+", "", text)
        text = re.sub(r"\-", " - ", text)
        text = re.sub(r"\=", " = ", text)
        text = re.sub(r"'", " ", text)
        text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
        text = re.sub(r":", " : ", text)
        text = re.sub(r" e g ", " eg ", text)
        text = re.sub(r" b g ", " bg ", text)
        text = re.sub(r" u s ", " american ", text)
        text = re.sub(r"\0s", "0", text)
        text = re.sub(r" 9 11 ", "911", text)
        text = re.sub(r"e - mail", "email", text)
        return text

    # Raw string so the Windows backslashes are not treated as escape sequences
    model = gensim.models.KeyedVectors.load_word2vec_format(
        r'D:\papers\Project Material\data\GoogleNews-vectors-negative300.bin.gz',
        binary=True)

    def wmd(s1, s2):
        s1 = str(s1).lower().split()
        s2 = str(s2).lower().split()
        stop_words = stopwords.words('english')
        s1 = [w for w in s1 if w not in stop_words]
        s2 = [w for w in s2 if w not in stop_words]
        return round(model.wmdistance(s1, s2), 3)

    def sent2vec(s):
        words = str(s).lower()
        words = word_tokenize(words)
        words = [w for w in words if w not in stop_words]
        words = [w for w in words if w.isalpha()]
        M = []
        for w in words:
            try:
                M.append(model[w])
            except KeyError:  # out-of-vocabulary word
                continue
        M = np.array(M)
        v = M.sum(axis=0)
        # If no word was in vocabulary this yields nan; np.nan_to_num below handles it.
        return v / np.sqrt((v ** 2).sum())

    # Keep question words and auxiliaries out of the stop list: they carry signal here.
    wh = ['where', 'why', 'what', 'who', 'whom', 'how', 'when', 'is', 'am',
          'are', 'has', 'have', 'had', 'do', 'does', 'did']
    for x in wh:
        if x in stop_words:
            stop_words.remove(x)

    for s in data.head()['question1']:
        print(s, '\n')
    data['question1'] = data.question1.apply(lambda x: clean_text(x))
    data['question2'] = data.question2.apply(lambda x: clean_text(x))
    for s in data.head()['question1']:
        print(s, '\n')

    # Added features.
    data['word_overlap'] = [set(x[0].split()) & set(x[1].split())
                            for x in data[['question1', 'question2']].values]
    data['common_word_cnt'] = data['word_overlap'].str.len()
    data['text1_nostop'] = data['question1'].apply(
        lambda x: " ".join(w for w in x.split() if w not in stop_words))
    data['text2_nostop'] = data['question2'].apply(
        lambda x: " ".join(w for w in x.split() if w not in stop_words))
    data['word_overlap'] = [set(x[0].split()) & set(x[1].split())
                            for x in data[['text1_nostop', 'text2_nostop']].values]
    data['common_nonstop_word_cnt'] = data['word_overlap'].str.len()
    data['char_cnt_1'] = data['question1'].str.len()
    data['char_cnt_2'] = data['question2'].str.len()
    data['char_cnt_diff'] = (data['char_cnt_1'] - data['char_cnt_2']) ** 2
    data['word_cnt_1'] = data['question1'].apply(lambda x: len(str(x).split()))
    data['word_cnt_2'] = data['question2'].apply(lambda x: len(str(x).split()))
    data['word_cnt_diff'] = (data['word_cnt_1'] - data['word_cnt_2']) ** 2
    data['len_q1'] = data.question1.apply(lambda x: len(str(x)))
    data['len_q2'] = data.question2.apply(lambda x: len(str(x)))
    data['diff_len'] = (data.len_q1 - data.len_q2) ** 2

    text1 = list(data['question1'])
    text2 = list(data['question2'])
    corpus1 = ' '.join(text1)
    corpus2 = ' '.join(text2)
    corpus = corpus1.lower() + corpus2.lower()
    lem = WordNetLemmatizer()
    # Note: lemmatize expects a single word, so calling it on the whole corpus
    # string is effectively a no-op; the POS tagging below still works on the
    # raw tokens.
    corpus = lem.lemmatize(corpus, "v")
    # corpus = stem.stem(corpus)
    tags = pos_tag(corpus.split())
    nouns = [i[0] for i in tags if i[1] in ("NN", "NNS", "NNP", "NNPS")]

    def count_common_nouns(words1, words2, nouns):
        count = 0
        for w in words1:
            if (w in words2) & (w in nouns):
                count += 1
        return count

    data['text1_lower'] = data['question1'].apply(lambda x: x.lower())
    data['text2_lower'] = data['question2'].apply(lambda x: x.lower())
    data['common_noun_cnt'] = [
        count_common_nouns(nltk.word_tokenize(lem.lemmatize(x[0], "v")),
                           nltk.word_tokenize(lem.lemmatize(x[1], "v")),
                           nouns)
        for x in data[['question1', 'question2']].values]

    # FuzzyWuzzy features
    data['fuzz_qratio'] = data.apply(
        lambda x: fuzz.QRatio(str(x['question1']), str(x['question2'])), axis=1)
    data['fuzz_WRatio'] = data.apply(
        lambda x: fuzz.WRatio(str(x['question1']), str(x['question2'])), axis=1)
    data['fuzz_partial_ratio'] = data.apply(
        lambda x: fuzz.partial_ratio(str(x['question1']), str(x['question2'])), axis=1)
    data['fuzz_partial_token_set_ratio'] = data.apply(
        lambda x: fuzz.partial_token_set_ratio(str(x['question1']), str(x['question2'])), axis=1)
    data['fuzz_partial_token_sort_ratio'] = data.apply(
        lambda x: fuzz.partial_token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1)
    data['fuzz_token_set_ratio'] = data.apply(
        lambda x: fuzz.token_set_ratio(str(x['question1']), str(x['question2'])), axis=1)
    data['fuzz_token_sort_ratio'] = data.apply(
        lambda x: fuzz.token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1)

    data['wmd'] = data.apply(lambda x: wmd(x['question1'], x['question2']), axis=1)

    question1_vectors = np.zeros((data.shape[0], 300))
    for i, q in tqdm(enumerate(data.question1.values)):
        question1_vectors[i, :] = sent2vec(q)
    question2_vectors = np.zeros((data.shape[0], 300))
    for i, q in tqdm(enumerate(data.question2.values)):
        question2_vectors[i, :] = sent2vec(q)

    data['cosine_distance'] = [round(cosine(x, y), 3)
                               for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                 np.nan_to_num(question2_vectors))]
    data['euclidean_distance'] = [round(euclidean(x, y), 3)
                                  for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                    np.nan_to_num(question2_vectors))]

    to_remove = ['word_overlap', 'text1_lower', 'text2_lower',
                 'question1', 'question2', 'test_id',
                 'text1_nostop', 'text2_nostop']
    data = data.drop(to_remove, axis=1)
    data.to_csv(r'D:\papers\Project Material\new_try\CQA\forum\revised.csv', index=False)
    return data
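# Hedged usage sketch (not from the original file): engineer() expects a
# dataframe with question1, question2 and test_id columns, e.g. the raw Quora
# test csv; the filename below is illustrative.
import pandas as pd
features = engineer(pd.read_csv('test.csv'))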
def fuzz_QRatio(sentences):
    # The two questions arrive packed in a single string, separated by "\001"
    sen = sentences.split("\001")
    return fuzz.QRatio(sen[0], sen[1])
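# Example: scoring one "\001"-packed pair (the sample questions are
# illustrative; a near-duplicate pair like this scores close to 100).
from fuzzywuzzy import fuzz
print(fuzz_QRatio("how do i learn python\001how can i learn python"))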
def testFuzzy(self):
    # Print every fuzzywuzzy scorer for the same pair, in both argument orders.
    a = 'MISSION HOSPITAL'
    b = 'MISSION HOSPITAL REGIONAL MEDICAL CENTER'
    scorers = [
        ('ratio', fuzz.ratio),
        ('partial_ratio', fuzz.partial_ratio),
        ('token_sort_ratio', fuzz.token_sort_ratio),
        ('partial_token_sort_ratio', fuzz.partial_token_sort_ratio),
        ('token_set_ratio', fuzz.token_set_ratio),
        ('partial_token_set_ratio', fuzz.partial_token_set_ratio),
        ('QRatio', fuzz.QRatio),
        ('UQRatio', fuzz.UQRatio),
        ('WRatio', fuzz.WRatio),
        ('UWRatio', fuzz.UWRatio),
    ]
    for name, scorer in scorers:
        print(name, scorer(a, b))
        print(name, scorer(b, a))
def testQuickRatioNotEqual(self):
    self.assertNotEqual(fuzz.QRatio(self.s1, self.s3), 100)
# ratio of difference in these lengths to total length
df['diff_len_word_ratio'] = abs(df.len_word_q1 - df.len_word_q2) / (df.len_word_q1 + df.len_word_q2)
# Number of common words in question1 and question2
df['common_words'] = df.apply(
    lambda x: len(set(str(x['question1']).lower().split())
                  .intersection(set(str(x['question2']).lower().split()))), axis=1)
# ratio of number of common words to average length of the questions
df['common_words_ratio'] = 2 * df.common_words / (df.len_word_q1 + df.len_word_q2)
df.to_csv('fs1.csv', index=False)

# Fuzzy features
# Q-ratio
df['fuzz_Qratio'] = df.apply(lambda x: fuzz.QRatio(str(x['question1']), str(x['question2'])), axis=1)
# W-ratio
df['fuzz_WRatio'] = df.apply(lambda x: fuzz.WRatio(str(x['question1']), str(x['question2'])), axis=1)
# Partial ratio
df['fuzz_partial_ratio'] = df.apply(lambda x: fuzz.partial_ratio(str(x['question1']), str(x['question2'])), axis=1)
# Partial token set ratio
df['fuzz_partial_token_set_ratio'] = df.apply(lambda x: fuzz.partial_token_set_ratio(str(x['question1']), str(x['question2'])), axis=1)
# Partial token sort ratio
df['fuzz_partial_token_sort_ratio'] = df.apply(lambda x: fuzz.partial_token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1)
# Token set ratio
df['fuzz_token_set_ratio'] = df.apply(lambda x: fuzz.token_set_ratio(str(x['question1']), str(x['question2'])), axis=1)
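# Worked example for common_words_ratio (hypothetical pair): both questions
# have 5 words and share {'how', 'cook', 'rice'}, so the ratio is 2*3/(5+5) = 0.6.
q1, q2 = 'how do i cook rice', 'how to cook rice fast'
common = len(set(q1.split()) & set(q2.split()))
print(2 * common / (len(q1.split()) + len(q2.split())))  # 0.6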
def testQuickRatioCaseInsensitive(self):
    self.assertEqual(fuzz.QRatio(self.s1, self.s2), 100)
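# QRatio runs fuzzywuzzy's full_process on both inputs (lowercasing and
# stripping non-alphanumeric characters) before comparing, which is what the
# case-insensitivity test above relies on. The strings here are illustrative:
from fuzzywuzzy import fuzz
assert fuzz.QRatio('new york mets', 'NEW YORK METS') == 100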
vectorizer = CountVectorizer()
features = vectorizer.fit_transform(corpus2).todense()

# Compare every sentence against the first one with several metrics
for feature in features:
    print("cosine ", cosine_distances(features[0], feature))
for feature in features:
    print("euclidean ", euclidean_distances(features[0], feature))
print()
for sentence in corpus2:
    print("levenshtein ", distance.levenshtein(corpus2[0], sentence))
for sentence in corpus2:
    print("jaccard ", distance.jaccard(corpus2[0], sentence))
for sentence in corpus2:
    print("fuzzy ", fuzz.QRatio(corpus2[0], sentence))