Example #1
import pandas as pd
from sklearn.model_selection import train_test_split
from dependency_parse import get_pos_dep
from semantic_similarity import align
from features import percentage_semantic_similarity_both

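# Load the question-pair training data (question1, question2, is_duplicate label).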
data = pd.read_csv("input/train.csv")
X = data[['question1', 'question2']].values
y = data['is_duplicate'].values
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.33,
                                                    random_state=42)

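# Sanity-check the pipeline on the first 10 training pairs.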
X_sample = X_train[:10]
for x in X_sample:
    print(x[0])
    print(x[1])
    S = get_pos_dep(x[0])
    T = get_pos_dep(x[1])
    A, info_A = align(S, T)
    p = percentage_semantic_similarity_both(S, T, A)
    print('Percentage Semantic Similarity:', p)
    print('\n')
Example #2
import gzip
import pickle

import pandas as pd

from dependency_parse import get_pos_dep
from semantic_similarity import align
from features import *

len_test = pd.read_csv('../../input/test.csv').shape[0]

with gzip.open('stanford_corenlp_test.nlp', 'rb') as handle:
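    # One pickled CoreNLP record per test row, read back in order.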
    rows = []
    for i in range(len_test):
        try:
            d = pickle.load(handle)
            id = d['id']
            # print(id)
            #print(d['q1']['raw'])
            #print(d['q2']['raw'])
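            # Rebuild the POS/dependency structures from the cached CoreNLP output.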
            S = get_pos_dep(d['q1']['toks'], d['q1']['deps'])
            T = get_pos_dep(d['q2']['toks'], d['q2']['deps'])
            #print(S)
            #print(T)
            A = align(S, T)
            #print(A)
            #print('Len(S):',len(S))
            #print('Len(T):', len(T))
            #print('Len(A):', len(A))
            # Semantic Similarity Features
            S_sem_sim = percentage_semantic_similarity_one(S, A)
            T_sem_sim = percentage_semantic_similarity_one(T, A)
            sem_sim = percentage_semantic_similarity_both(S, T, A)
            # Noun Features
            S_unmatch_n, T_unmatch_n = number_unmatched(
                S, T, A, 'n', inferred_pos=True)
            # ... remaining features omitted here; see Example #3 for the full set
        except EOFError:
            break
Example #3
import gzip
import pickle

import pandas as pd

from dependency_parse import get_pos_dep
from semantic_similarity import align
from features import *


def train_features():
    """Extract alignment-based similarity features for every training pair and save them to CSV."""
    with gzip.open('data_dump/stanfordData_train_ner.nlp', 'rb') as handle:
        rows = []
        while True:
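            # Stream pickled records until EOFError marks the end of the dump.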
            try:
                d = pickle.load(handle)
                id = d['id']
                print(id)
                is_duplicate = d['is_duplicate']
                #print(d['q1']['raw'])
                #print(d['q2']['raw'])
                S = get_pos_dep(d['q1']['toks'], d['q1']['deps'])
                T = get_pos_dep(d['q2']['toks'], d['q2']['deps'])
                #print(S)
                #print(T)
                A = align(S, T)
                #print(A)
                #print('Len(S):',len(S))
                #print('Len(T):', len(T))
                #print('Len(A):', len(A))
                # Semantic Similarity Features
                S_sem_sim = percentage_semantic_similarity_one(S, A)
                T_sem_sim = percentage_semantic_similarity_one(T, A)
                sem_sim = percentage_semantic_similarity_both(S, T, A)
                # Noun Features
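                # 'n'/'a'/'v' appear to be coarse WordNet-style tags
                # (inferred_pos=True); 'PRP'/'WP'/'CD' below are Penn Treebank
                # tags matched verbatim (inferred_pos=False).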
                S_unmatch_n, T_unmatch_n = number_unmatched(S,
                                                            T,
                                                            A,
                                                            'n',
                                                            inferred_pos=True)
                S_unmatch_n_p, T_unmatch_n_p = percent_unmatched(
                    S, T, A, 'n', inferred_pos=True)
                # Adjective Features
                S_unmatch_a, T_unmatch_a = number_unmatched(S,
                                                            T,
                                                            A,
                                                            'a',
                                                            inferred_pos=True)
                S_unmatch_a_p, T_unmatch_a_p = percent_unmatched(
                    S, T, A, 'a', inferred_pos=True)
                # Verb Features
                S_unmatch_v, T_unmatch_v = number_unmatched(S,
                                                            T,
                                                            A,
                                                            'v',
                                                            inferred_pos=True)
                S_unmatch_v_p, T_unmatch_v_p = percent_unmatched(
                    S, T, A, 'v', inferred_pos=True)
                # Personal Pronoun Feature
                S_unmatch_pp, T_unmatch_pp = number_unmatched(
                    S, T, A, 'PRP', inferred_pos=False)
                S_unmatch_pp_p, T_unmatch_pp_p = percent_unmatched(
                    S, T, A, 'PRP', inferred_pos=False)
                # WH-Pronoun Feature
                S_unmatch_wp, T_unmatch_wp = number_unmatched(
                    S, T, A, 'WP', inferred_pos=False)
                S_unmatch_wp_p, T_unmatch_wp_p = percent_unmatched(
                    S, T, A, 'WP', inferred_pos=False)
                # Numbers Feature
                S_unmatch_num, T_unmatch_num = number_unmatched(
                    S, T, A, 'CD', inferred_pos=False)
                S_unmatch_num_p, T_unmatch_num_p = percent_unmatched(
                    S, T, A, 'CD', inferred_pos=False)
                # NER Feature
                S_unmatch_ner, T_unmatch_ner = ner_unmatched(S, T)
                # Length Difference Feature
                len_dif = len_difference(S, T)
                len_dif_p = len_difference_p(S, T)
                # Put all features in a row
                features_row = [
                    id, S_sem_sim, T_sem_sim, sem_sim, S_unmatch_n,
                    T_unmatch_n, S_unmatch_n_p, T_unmatch_n_p, S_unmatch_a,
                    T_unmatch_a, S_unmatch_a_p, T_unmatch_a_p, S_unmatch_v,
                    T_unmatch_v, S_unmatch_v_p, T_unmatch_v_p, S_unmatch_pp,
                    T_unmatch_pp, S_unmatch_pp_p, T_unmatch_pp_p, S_unmatch_wp,
                    T_unmatch_wp, S_unmatch_wp_p, T_unmatch_wp_p,
                    S_unmatch_num, T_unmatch_num, S_unmatch_num_p,
                    T_unmatch_num_p, S_unmatch_ner, T_unmatch_ner, len_dif,
                    len_dif_p, is_duplicate
                ]
                rows.append(features_row)
            except EOFError:
                break

        columns = [
            'id', 'S_sem_sim', 'T_sem_sim', 'sem_sim', 'S_unmatch_n',
            'T_unmatch_n', 'S_unmatch_n_p', 'T_unmatch_n_p', 'S_unmatch_a',
            'T_unmatch_a', 'S_unmatch_a_p', 'T_unmatch_a_p', 'S_unmatch_v',
            'T_unmatch_v', 'S_unmatch_v_p', 'T_unmatch_v_p', 'S_unmatch_pp',
            'T_unmatch_pp', 'S_unmatch_pp_p', 'T_unmatch_pp_p', 'S_unmatch_wp',
            'T_unmatch_wp', 'S_unmatch_wp_p', 'T_unmatch_wp_p',
            'S_unmatch_num', 'T_unmatch_num', 'S_unmatch_num_p',
            'T_unmatch_num_p', 'S_unmatch_ner', 'T_unmatch_ner', 'len_dif',
            'len_dif_p', 'is_duplicate'
        ]
        # Build the frame from the raw rows; wrapping them in np.array first
        # would coerce the mixed-type columns to strings.
        df = pd.DataFrame(rows, columns=columns)
        df.to_csv('Features/dean_train_features.csv', index=False)
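
A minimal usage sketch, assuming the pickled CoreNLP dump and the Features/ output directory already exist:

if __name__ == '__main__':
    train_features()
    # Reload the written feature table to confirm the run.
    features = pd.read_csv('Features/dean_train_features.csv')
    print(features.shape)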