Example #1

def create_feature(data_file, vectorizer, pipeline):
    if os.path.exists(feature_output_file(data_file)):
        print('File exists {}.'.format(feature_output_file(data_file)))
        return

    df = nltk_tokenize(data_file)
    print(sys.argv[0], data_file, file=sys.stderr)
    column_name_prefix = 'f{0}'.format(
        os.path.basename(feature_output_file(data_file)).split('_')[0])
    X1 = pipeline.transform(
        vectorizer.transform(df.question1.values.astype(str)))
    X2 = pipeline.transform(
        vectorizer.transform(df.question2.values.astype(str)))
    X = np.hstack((X1, X2))

    column_names = pd.Index(
        [column_name_prefix + '.' + str(i) for i in range(X.shape[1])])
    # assign every feature column in one shot
    df[column_names] = X
    df[column_names].to_csv(feature_output_file(data_file),
                            index=False,
                            float_format='%.5f')
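
Every example on this page calls a shared helper, feature_output_file, that is not shown here. Judging from how the column prefix is recovered (os.path.basename(...).split('_')[0]), it maps a data file to a feature CSV whose basename starts with a numeric feature ID. A minimal sketch under that assumption; the features/ directory and the ID-from-script-name rule are guesses, not the repository's code:

import os
import sys

def feature_output_file(data_file, feature_creator_file=None):
    # Hypothetical reconstruction: take the feature ID from the creator
    # script's filename (e.g. feature_123.py -> '123') and prepend it to
    # the data file's basename, e.g. train.csv -> features/123_train.csv.
    creator = feature_creator_file if feature_creator_file else sys.argv[0]
    feature_id = os.path.splitext(os.path.basename(creator))[0].split('_')[-1]
    return os.path.join('features',
                        '{}_{}'.format(feature_id, os.path.basename(data_file)))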
Example #2
    def create_features(self, input_file):
        output_file = feature_output_file(input_file)

        start_time = time.time()
        print("Start to create features {} {}".format(sys.argv[0], input_file),
              file=sys.stderr)
        if os.path.exists(output_file):
            print('File exists {}.'.format(feature_output_file(output_file)))
            return

        data = self.read_data(input_file)
        values = self.calculate_features(data)
        print("Finished to create features {} {}: {:.2f} [s]".format(
            sys.argv[0], input_file,
            time.time() - start_time),
              file=sys.stderr)

        start_time = time.time()
        print("Start to write features {} {}".format(sys.argv[0], input_file),
              file=sys.stderr)
        self.write_feature(column_name=self.get_column_name(input_file),
                           output_file=output_file,
                           values=values)
        print("Finished to write features {} {}: {:.2f} [s]".format(
            sys.argv[0], input_file,
            time.time() - start_time),
              file=sys.stderr)
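
Example #2 is a method from what appears to be a template-method base class: read_data, calculate_features, write_feature, and get_column_name are hooks implemented by concrete feature classes. A minimal sketch of such a subclass, with the base-class name and hook signatures assumed from this snippet alone:

import pandas as pd

class QuestionLengthDiff(FeatureCreatorBase):  # base-class name is assumed
    def read_data(self, input_file):
        return pd.read_csv(input_file)

    def calculate_features(self, data):
        # One value per row: absolute length difference of the two questions.
        return (data.question1.astype(str).str.len() -
                data.question2.astype(str).str.len()).abs().values

    def write_feature(self, column_name, output_file, values):
        pd.DataFrame({column_name: values}).to_csv(
            output_file, index=False, float_format='%.5f')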
Example #3
def create_feature(data_file, neighbor_sets: defaultdict, neighbor_weights: defaultdict):
    if os.path.exists(feature_output_file(data_file)):
        print('File exists {}.'.format(feature_output_file(data_file)))
        return
    df = pd.read_csv(data_file)
    print(sys.argv[0], data_file, file=sys.stderr)

    column_name_prefix = 'f{0}'.format(os.path.basename(feature_output_file(data_file)).split('_')[0])
    column_names = []
    features = [
        ('identity', calculate_identity),
        ('q1_weight_avg', calculate_q1_weight_avg),
        ('q2_weight_avg', calculate_q2_weight_avg),
        ('q1_q2_intersection_weight_sum', calculate_q1_q2_intersection_weight_sum),
        ('q1_q2_intersection_weight_avg', calculate_q1_q2_intersection_weight_avg),
        ('q1_q2_intersection_weight_max', calculate_q1_q2_intersection_weight_max),
        ('q1_q2_intersection_weight_min', calculate_q1_q2_intersection_weight_min),
        ('q1_q2_intersection_weight_diff_max', calculate_q1_q2_intersection_weight_diff_max),
        ('q1_q2_intersection_weight_diff_min', calculate_q1_q2_intersection_weight_diff_min)
    ]

    for (column_name_suffix, feature_calculator) in features:
        column_name = column_name_prefix + '.' + column_name_suffix
        df[column_name] = feature_calculator(df, neighbor_sets, neighbor_weights)
        column_names.append(column_name)
    column_names = pd.Index(column_names)
    df[column_names].to_csv(feature_output_file(data_file), index=False, float_format='%.5f')
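
None of the calculate_* helpers in Example #3 are shown. Assuming neighbor_sets maps each question string to the set of questions it was paired with anywhere in the data, and neighbor_weights maps a question to a scalar weight, one plausible implementation of the intersection-sum feature looks like this (a sketch, not the repository's code):

def calculate_q1_q2_intersection_weight_sum(df, neighbor_sets, neighbor_weights):
    # For each pair, sum the weights of the questions that co-occur with
    # both q1 and q2 (their shared neighbors in the pairing graph).
    values = []
    for q1, q2 in zip(df.question1.astype(str), df.question2.astype(str)):
        shared = neighbor_sets[q1] & neighbor_sets[q2]
        values.append(sum(neighbor_weights[q] for q in shared))
    return values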
Example #4
def create_feature(data_file):
    if os.path.exists(feature_output_file(data_file)):
        print('File exists {}.'.format(feature_output_file(data_file)))
        return

    df = nltk_pos_tag(data_file)
    print(sys.argv[0], data_file, file=sys.stderr)
    column_name_prefix = 'f{0}'.format(
        os.path.basename(feature_output_file(data_file)).split('_')[0])

    q1_past = []
    q2_past = []
    for i, row in df.iterrows():
        q1_has = 0
        for w, t in ast.literal_eval(row['question1']):
            if t == 'VBD':
                q1_has = 1
                break
        q1_past.append(q1_has)

        q2_has = 0
        for w, t in ast.literal_eval(row['question2']):
            if t == 'VBD':
                q2_has = 1
                break
        q2_past.append(q2_has)
    column_names = [column_name_prefix + '.1', column_name_prefix + '.2']
    df[column_names[0]] = q1_past
    df[column_names[1]] = q2_past
    df[column_names].to_csv(feature_output_file(data_file),
                            index=False,
                            float_format='%.5f')
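
'VBD' is the Penn Treebank tag for a past-tense verb, so Example #4 flags whether each question contains one. The two copy-pasted loops could be collapsed into a helper; a sketch that keeps the same ast.literal_eval parsing of the stored (word, tag) lists:

import ast

def has_pos_tag(tagged_question, tags=('VBD',)):
    # tagged_question is the stringified list of (word, tag) tuples
    # produced by nltk_pos_tag; returns 1 if any tag matches.
    return int(any(t in tags for _, t in ast.literal_eval(tagged_question)))

# q1_past = df['question1'].map(has_pos_tag).tolist()
# q2_past = df['question2'].map(has_pos_tag).tolist()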
Example #5
def create_features_files(train_path, test_path):
    print(train_path, test_path)
    if os.path.exists(feature_output_file(train_path)) and os.path.exists(
            feature_output_file(test_path)):
        print('Files exist {}, {}.'.format(
            feature_output_file(train_path), feature_output_file(test_path)))
        return

    print('Preprocessing')
    train = nltk_tokenize(train_path)
    for q1, q2, dup in tqdm(
            zip(train.question1.astype(str), train.question2.astype(str),
                train.is_duplicate)):
        if dup:
            diff_pairs = list_diff_pairs(q1, q2)
            dup_counter.update(diff_pairs)

    print('features >= MIN_FREQ: {}'.format(
        sum(1 for t, freq in dup_counter.most_common() if freq >= MIN_FREQ)))

    print('Creating feature for train')
    create_features(train_path)

    print('Creating feature for test')
    create_features(test_path)
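
list_diff_pairs is not defined on this page. From how it is used here, counting word pairs that occur frequently among duplicate questions, a plausible reading is that it crosses the words unique to each side. A hedged sketch:

from itertools import product

def list_diff_pairs(q1, q2):
    # Assumed behavior: pair up the words that appear in only one of the
    # two questions, as sorted tuples so (a, b) and (b, a) count as one pair.
    words1, words2 = set(q1.lower().split()), set(q2.lower().split())
    return [tuple(sorted(p)) for p in product(words1 - words2, words2 - words1)]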
Example #6
def create_feature(data_file, vectorizer):
    if os.path.exists(feature_output_file(data_file)):
        print('File exists {}.'.format(feature_output_file(data_file)))
        return

    df = nltk_tokenize(data_file)
    print(sys.argv[0], data_file, file=sys.stderr)
    column_name_prefix = 'f{0}'.format(
        os.path.basename(feature_output_file(data_file)).split('_')[0])
    diff_pairs = [
        list_diff_pairs(q1, q2) for q1, q2 in tqdm(
            zip(df.question1.astype(str), df.question2.astype(str)))
    ]
    X = vectorizer.transform(diff_pairs)

    column_names = pd.Index(
        [column_name_prefix + '.' + str(i) for i in range(X.shape[1])])
    # assign every feature column in one shot
    df[column_names] = X
    df[column_names].to_csv(feature_output_file(data_file),
                            index=False,
                            float_format='%.5f')
Example #7
def create_word_match_feature(data_file):
    if os.path.exists(feature_output_file(data_file)):
        print('File exists {}.'.format(feature_output_file(data_file)))
        return
    df = nltk_stemming(data_file)
    print(sys.argv[0], data_file, file=sys.stderr)
    column_name = 'f{0}'.format(os.path.basename(feature_output_file(data_file)).split('_')[0])
    df[column_name] = df.apply(word_match_share, axis=1, raw=True)
    df[[column_name]].to_csv(feature_output_file(data_file), index=False, float_format='%.5f')
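
word_match_share is the classic benchmark feature from the Quora competition kernels: the fraction of non-stopword words each question shares with the other. A sketch in that widely-used form; note that with raw=True the real helper receives a plain ndarray row, so it presumably indexes positions, while names are used here for clarity:

from nltk.corpus import stopwords  # requires the NLTK stopwords corpus

stops = set(stopwords.words('english'))

def word_match_share(row):
    q1words = {w for w in str(row['question1']).lower().split() if w not in stops}
    q2words = {w for w in str(row['question2']).lower().split() if w not in stops}
    if not q1words or not q2words:
        # one question is empty or all stopwords
        return 0.0
    shared = q1words & q2words
    return 2.0 * len(shared) / (len(q1words) + len(q2words))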
Example #8
def create_feature(data_file):
    if os.path.exists(feature_output_file(data_file)):
        print('File exists {}.'.format(feature_output_file(data_file)))
        return
    print(sys.argv[0], file=sys.stderr)
    df = nltk_pos_tag(data_file)
    print(df.head(), file=sys.stderr)
    column_name = 'f{0}'.format(os.path.basename(feature_output_file(data_file)).split('_')[0])
    df[column_name] = df.apply(pnoun_jaccard, axis=1, raw=True).values
    df[[column_name]].to_csv(feature_output_file(data_file), index=False, float_format='%.5f')
Example #9
def create_word_match_feature(data_file, model: gensim.models.KeyedVectors):
    if os.path.exists(feature_output_file(data_file)):
        print('File exists {}.'.format(feature_output_file(data_file)))
        return
    df = nltk_tokenize(data_file)

    print(sys.argv[0], data_file, file=sys.stderr)
    column_name = 'f{0}'.format(os.path.basename(feature_output_file(data_file)).split('_')[0])
    df[column_name] = df.apply(wmd, axis=1, raw=True, model=model)
    df[[column_name]].to_csv(feature_output_file(data_file), index=False, float_format='%.5f')
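
wmd is presumably Word Mover's Distance; gensim's KeyedVectors already ships wmdistance, so the helper can be a thin wrapper (a sketch, with the same raw=True row-indexing caveat as above):

def wmd(row, model):
    # Word Mover's Distance between the two tokenized questions.
    q1 = str(row['question1']).lower().split()
    q2 = str(row['question2']).lower().split()
    return model.wmdistance(q1, q2)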
Example #10
def create_features_files(train_path, test_path):
    print(train_path, test_path)
    if os.path.exists(feature_output_file(train_path)) and os.path.exists(
            feature_output_file(test_path)):
        print('Files exist {}, {}.'.format(
            feature_output_file(train_path), feature_output_file(test_path)))
        return

    print('Creating feature for train')
    create_features(train_path)

    print('Creating feature for test')
    create_features(test_path)
Example #11
def create_feature(data_file):
    if os.path.exists(feature_output_file(data_file)):
        print('File exists {}.'.format(feature_output_file(data_file)))
        return

    question1_vectors, question2_vectors = sentence2vec(data_file)

    print(sys.argv[0], data_file, file=sys.stderr)
    df = pd.DataFrame()
    column_name = 'f{0}'.format(os.path.basename(feature_output_file(data_file)).split('_')[0])
    df[column_name] = np.nan_to_num([kurtosis(x) for x in np.nan_to_num(question2_vectors)])

    df[[column_name]].to_csv(feature_output_file(data_file), index=False, float_format='%.5f')
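
sentence2vec is shared by Examples #11 and #21 but never shown (note that Example #11 computes the kurtosis of the question2 vectors only and leaves question1_vectors unused). One common definition is the mean of the word vectors in each question; a sketch under that assumption, with model a word-embedding lookup and dim its vector size:

import numpy as np
import pandas as pd

def sentence2vec(data_file, model=None, dim=300):
    df = pd.read_csv(data_file)

    def embed(text):
        # Mean of the in-vocabulary word vectors; zeros if none are found.
        words = [w for w in str(text).lower().split() if w in model]
        if not words:
            return np.zeros(dim)
        return np.mean([model[w] for w in words], axis=0)

    return (np.array([embed(t) for t in df.question1]),
            np.array([embed(t) for t in df.question2]))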
Example #12

def create_feature(data_file, df):
    if os.path.exists(feature_output_file(data_file)):
        print('File exists {}.'.format(feature_output_file(data_file)))
        return
    column_name_prefix = 'f{0}'.format(
        os.path.basename(feature_output_file(data_file)).split('_')[0])
    q1_see_later = column_name_prefix + '.q1'
    q2_see_later = column_name_prefix + '.q2'
    column_names = [q1_see_later, q2_see_later]
    out_df = pd.DataFrame()
    out_df[q1_see_later] = df['see_later1'].tolist()
    out_df[q2_see_later] = df['see_later2'].tolist()
    out_df[column_names].to_csv(feature_output_file(data_file), index=False)
Example #13
def create_feature(data_file, vectorizer: TfidfVectorizer):
    if os.path.exists(feature_output_file(data_file)):
        print('File exists {}.'.format(feature_output_file(data_file)))
        return
    df = convert(nltk_pos_tag(data_file))

    cosine_values = []
    q1vec = vectorizer.transform(df['question1'].apply(
        lambda x: x if isinstance(x, str) else '').values)
    q2vec = vectorizer.transform(df['question2'].apply(
        lambda x: x if isinstance(x, str) else '').values)
    for i in range(df.shape[0]):
        cosine_values.append(round(float(np.dot(q1vec[i], q2vec[i].T)[0, 0]), 5))
    column_name = 'f{0}'.format(os.path.basename(feature_output_file(data_file)).split('_')[0])
    df[column_name] = cosine_values
    df[[column_name]].to_csv(feature_output_file(data_file), index=False, float_format='%.5f')
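
Example #13 works because TfidfVectorizer L2-normalizes rows by default (norm='l2'), so each sparse dot product is already a cosine similarity. The Python loop can be replaced by one vectorized expression:

import numpy as np

# Row-wise cosine: element-wise product of the two L2-normalized TF-IDF
# matrices, summed across the vocabulary axis.
cosine_values = np.asarray(q1vec.multiply(q2vec).sum(axis=1)).ravel().round(5)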
Example #14
def create_features_files(train_path, test_path):
    if os.path.exists(feature_output_file(train_path)) and os.path.exists(
            feature_output_file(test_path)):
        print('Files exist {}, {}.'.format(
            feature_output_file(train_path), feature_output_file(test_path)))
        return

    global limiter
    limiter = DepthLimiter()

    print('Creating feature for train')
    create_features(train_path)

    print('Creating feature for test')
    create_features(test_path)
Example #15
def create_feature(data_file, vectorizer):
    if os.path.exists(feature_output_file(data_file)):
        print('File exists {}.'.format(feature_output_file(data_file)))
        return

    df = nltk_stemming(data_file)
    print(sys.argv[0], data_file, file=sys.stderr)
    column_name = 'f{0}'.format(os.path.basename(feature_output_file(data_file)).split('_')[0])
    X1 = vectorizer.transform(df.question1.values.astype(str))
    X2 = vectorizer.transform(df.question2.values.astype(str))
    values = []
    for i in tqdm(range(X1.shape[0])):
        # row-wise dot product of the two sparse TF-IDF rows
        values.append(float(X1[i].multiply(X2[i]).sum()))
    df[column_name] = values
    df[[column_name]].to_csv(feature_output_file(data_file), index=False, float_format='%.5f')
Example #16
def create_features(data_path):
    data = pd.read_csv(data_path)

    df = pd.DataFrame()
    df['dep_depth_limit'] = Parallel(n_jobs=-1, verbose=5)(
        delayed(create_feature)(q1, q2) for q1, q2 in zip(
            data.question1.astype(str), data.question2.astype(str)))
    df.to_csv(feature_output_file(data_path), index=False, float_format='%.5f')
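
Examples #16, #19, and #20 all use the same joblib pattern: delayed(create_feature) captures the arguments, and Parallel fans the calls out across cores while preserving input order, so the resulting list lines up with the DataFrame's rows. A self-contained demo of the pattern (the worker here is a stand-in, not the repository's create_feature):

from joblib import Parallel, delayed

def pair_feature(q1, q2):
    # Stand-in worker: any pure function of the pair works here.
    return abs(len(q1.split()) - len(q2.split()))

pairs = [('how old are you', 'what is your age'), ('hi there', 'hello')]
values = Parallel(n_jobs=-1)(delayed(pair_feature)(q1, q2) for q1, q2 in pairs)
# values == [0, 1], in the same order as `pairs`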
Example #17
def create_feature(data_file, model):
    if os.path.exists(feature_output_file(data_file)):
        print('File exists {}.'.format(feature_output_file(data_file)))
        return

    df = nltk_tokenize(data_file)
    print(sys.argv[0], data_file, file=sys.stderr)
    column_name = 'f{0}'.format(os.path.basename(feature_output_file(data_file)).split('_')[0])

    values = np.zeros((df.shape[0]))
    for i in tqdm(range(df.shape[0])):
        q1 = df.question1.values[i]
        q2 = df.question2.values[i]
        values[i] = calculate_distance(q1, q2, model)

    df[column_name] = values
    df[[column_name]].to_csv(feature_output_file(data_file), index=False, float_format='%.5f')
Example #18
def create_feature(data_file, questions_dict, q1_vc, q2_vc):
    if os.path.exists(feature_output_file(data_file)):
        print('File exists {}.'.format(feature_output_file(data_file)))
        return

    df = pd.read_csv(data_file)
    column_name_prefix = 'f{0}'.format(os.path.basename(feature_output_file(data_file)).split('_')[0])
    q1_hash = column_name_prefix + '.q1_hash.cat'
    q2_hash = column_name_prefix + '.q2_hash.cat'
    q1_freq = column_name_prefix + '.q1_freq'
    q2_freq = column_name_prefix + '.q2_freq'

    column_names = [q1_freq, q2_freq]
    df[q1_hash] = df['question1'].map(questions_dict)
    df[q2_hash] = df['question2'].map(questions_dict)
    df[q1_freq] = df[q1_hash].map(lambda x: try_apply_dict(x, q1_vc) + try_apply_dict(x, q2_vc))
    df[q2_freq] = df[q2_hash].map(lambda x: try_apply_dict(x, q1_vc) + try_apply_dict(x, q2_vc))
    df[column_names].to_csv(feature_output_file(data_file), index=False)
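
try_apply_dict is undefined on this page; given that q1_vc and q2_vc look like frequency tables (e.g. value_counts results), it plausibly returns the count for a key and 0 for unseen or NaN keys. A sketch under that assumption:

def try_apply_dict(key, table):
    # Tolerant frequency lookup: unseen (or NaN) keys count as zero.
    try:
        return table[key]
    except KeyError:
        return 0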
Example #19

def create_features(data_path):
    data = nltk_tokenize(data_path)

    feature_dicts = Parallel(n_jobs=-1, verbose=3)(
        delayed(create_feature)(q1, q2) for q1, q2 in zip(
            data.question1.astype(str), data.question2.astype(str)))

    df = pd.DataFrame(feature_dicts)
    df.to_csv(feature_output_file(data_path), index=False, float_format='%.5f')
Example #20
def create_features(data_path):
    print('data_path file: {}'.format(data_path))
    data = pd.read_csv(data_path)

    features = Parallel(n_jobs=-1, verbose=5)(
        delayed(create_feature)(q1, q2) for q1, q2 in zip(
            data.question1.astype(str), data.question2.astype(str)))
    df = pd.DataFrame(features)
    df.to_csv(feature_output_file(data_path), index=False, float_format='%.5f')
Example #21
def create_feature(data_file, model: gensim.models.KeyedVectors):
    if os.path.exists(feature_output_file(data_file)):
        print('File exists {}.'.format(feature_output_file(data_file)))
        return

    question1_vectors, question2_vectors = sentence2vec(data_file)

    print(sys.argv[0], data_file, file=sys.stderr)
    df = pd.DataFrame()
    column_name = 'f{0}'.format(
        os.path.basename(feature_output_file(data_file)).split('_')[0])
    df[column_name] = np.nan_to_num([
        jaccard(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                        np.nan_to_num(question2_vectors))
    ])

    df[[column_name]].to_csv(feature_output_file(data_file),
                             index=False,
                             float_format='%.5f')
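
The jaccard in Example #21 is applied to dense sentence vectors. scipy ships scipy.spatial.distance.jaccard, but that is a boolean dissimilarity; for real-valued vectors a generalized (Ruzicka) form is common. A hedged sketch of the latter, assuming that is what the repository intends:

import numpy as np

def jaccard(x, y):
    # Generalized Jaccard similarity: sum of element-wise minima over sum
    # of element-wise maxima (best suited to non-negative vectors).
    denom = np.maximum(x, y).sum()
    return np.minimum(x, y).sum() / denom if denom else 0.0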
Example #22

def create_word_match_feature(data_file, model: gensim.models.Doc2Vec):
    if os.path.exists(feature_output_file(data_file)):
        print('File exists {}.'.format(feature_output_file(data_file)))
        return

    df = nltk_tokenize(data_file)
    print(sys.argv[0], data_file, file=sys.stderr)
    column_name_prefix = 'f{0}'.format(os.path.basename(feature_output_file(data_file)).split('_')[0])

    X1 = calc_document_vector(df.question1.values.astype(str).tolist(), model)
    X2 = calc_document_vector(df.question2.values.astype(str).tolist(), model)
    X = np.hstack((X1, X2))

    column_names = []
    for i in tqdm(range(X.shape[1])):
        column_name = column_name_prefix + '.' + str(i)
        df[column_name] = X[:, i]
        column_names.append(column_name)
    column_names = pd.Index(column_names)
    print('Starting to write the feature file')
    df[column_names].to_csv(feature_output_file(data_file), index=False, float_format='%.5f')
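
calc_document_vector in Example #22 takes raw question strings plus a Doc2Vec model; a plausible implementation infers one embedding per document with gensim's infer_vector:

import numpy as np

def calc_document_vector(documents, model):
    # One inferred Doc2Vec embedding per document (sketch).
    return np.array([model.infer_vector(str(doc).lower().split())
                     for doc in documents])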
Example #23
def create_features_files(train_path, test_path):
    print(train_path, test_path)
    if os.path.exists(feature_output_file(train_path)) and os.path.exists(feature_output_file(test_path)):
        print('Files exist {}, {}.'.format(
            feature_output_file(train_path), feature_output_file(test_path)))
        return

    print('Preprocessing')
    train = nltk_tokenize(train_path)
    for q1, q2, dup in tqdm(zip(train.question1.astype(str), train.question2.astype(str), train.is_duplicate)):
        words1 = q1.split()
        words2 = q2.split()
        all_counter.update(words1)
        all_counter.update(words2)
        if dup:
            dup_counter.update(words1)
            dup_counter.update(words2)

    print('Creating feature for train')
    create_features(train_path)

    print('Creating feature for test')
    create_features(test_path)
Example #24
def create_feature(data_file):
    if os.path.exists(feature_output_file(data_file)):
        print('File exists {}.'.format(feature_output_file(data_file)))
        return

    global creator
    if creator is None:
        print('Initializing DependencyNgramsCreator')
        creator = DependencyNgramsCreator()

    print('read: {}'.format(data_file))
    df = pd.read_csv(data_file)

    df_features = create_df(df,
                            'dep_2grams_common_ratio_stop',
                            n=2,
                            skip_stopwords=True)

    print('write: {}'.format(feature_output_file(data_file)))
    df_features.to_csv(feature_output_file(data_file),
                       index=False,
                       float_format='%.5f')
Example #25
def create_feature(train_path, test_path, n, skip_stopwords):
    if os.path.exists(feature_output_file(train_path)) and os.path.exists(
            feature_output_file(test_path)):
        print('Files exist {}, {}.'.format(
            feature_output_file(train_path), feature_output_file(test_path)))
        return

    global creator
    global vectorizer
    print('start preprocessing')
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)

    creator = DependencyNgramsCreator(n=n, skip_stopwords=skip_stopwords)

    train_q1_ngram_lists = create_ngrams_lists(train.question1.astype(str))
    train_q2_ngram_lists = create_ngrams_lists(train.question2.astype(str))
    test_q1_ngram_lists = create_ngrams_lists(test.question1.astype(str))
    test_q2_ngram_lists = create_ngrams_lists(test.question2.astype(str))

    vectorizer = TfidfVectorizer(tokenizer=lambda a: a,
                                 lowercase=False,
                                 min_df=10,
                                 max_df=0.5)
    vectorizer.fit(train_q1_ngram_lists + train_q2_ngram_lists +
                   test_q1_ngram_lists + test_q2_ngram_lists)

    print('finish preprocessing')

    print('train')
    train_q1_tfidf = vectorizer.transform(train_q1_ngram_lists)
    train_q2_tfidf = vectorizer.transform(train_q2_ngram_lists)
    train_feature = pd.DataFrame()
    train_feature['dep_2grams_sum_tfidf_q1'] = np.array(
        train_q1_tfidf.sum(axis=1)).flatten()
    train_feature['dep_2grams_sum_tfidf_q2'] = np.array(
        train_q2_tfidf.sum(axis=1)).flatten()
    train_feature['dep_2grams_tfidf_cosine'] = np.array(
        train_q1_tfidf.multiply(train_q2_tfidf).sum(axis=1)).flatten()
    train_feature.to_csv(feature_output_file(train_path),
                         index=False,
                         float_format='%.5f')

    print('test')
    test_q1_tfidf = vectorizer.transform(test_q1_ngram_lists)
    test_q2_tfidf = vectorizer.transform(test_q2_ngram_lists)
    test_feature = pd.DataFrame()
    test_feature['dep_2grams_sum_tfidf_q1'] = np.array(
        test_q1_tfidf.sum(axis=1)).flatten()
    test_feature['dep_2grams_sum_tfidf_q2'] = np.array(
        test_q2_tfidf.sum(axis=1)).flatten()
    test_feature['dep_2grams_tfidf_cosine'] = np.array(
        test_q1_tfidf.multiply(test_q2_tfidf).sum(axis=1)).flatten()
    test_feature.to_csv(feature_output_file(test_path),
                        index=False,
                        float_format='%.5f')
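
The tokenizer=lambda a: a trick in Example #25 makes TfidfVectorizer accept pre-tokenized lists unchanged (lowercase=False is required so the preprocessor does not try to lowercase a list); min_df=10 and max_df=0.5 then prune rare and ubiquitous n-grams. A minimal demo of the same trick:

from sklearn.feature_extraction.text import TfidfVectorizer

docs = [['a_b', 'b_c'], ['b_c', 'c_d']]  # already-tokenized n-gram lists
vec = TfidfVectorizer(tokenizer=lambda a: a, lowercase=False)
X = vec.fit_transform(docs)              # 2 x len(vocabulary) sparse matrix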
Example #26
    def get_column_name(self, input_file):
        return 'f{0}'.format(
            os.path.basename(feature_output_file(input_file)).split('_')[0])
Example #27
def check_feature_existence(feature_creator_file, data_prefix):
    for k, file_name in generate_filename_from_prefix(data_prefix):
        if not os.path.exists(
                feature_output_file(file_name, feature_creator_file)):
            return False
    return True
Example #28

    def get_column_name(self, input_file):
        prefix = 'f{0}'.format(
            os.path.basename(feature_output_file(input_file)).split('_')[0])
        return "{},{},{}".format(prefix + ".mc", prefix + ".jc", prefix + ".ds")
Example #29
def create_feature(data_file, features: pd.DataFrame):
    if os.path.exists(feature_output_file(data_file)):
        print('File exists {}.'.format(feature_output_file(data_file)))
        return
    features.to_csv(feature_output_file(data_file), index=False)