def create_feature(data_file, vectorizer, pipeline):
    if os.path.exists(feature_output_file(data_file)):
        print('File exists {}.'.format(feature_output_file(data_file)))
        return
    df = nltk_tokenize(data_file)
    print(sys.argv[0], data_file, file=sys.stderr)
    column_name_prefix = 'f{0}'.format(
        os.path.basename(feature_output_file(data_file)).split('_')[0])
    X1 = pipeline.transform(
        vectorizer.transform(df.question1.values.astype(str)))
    X2 = pipeline.transform(
        vectorizer.transform(df.question2.values.astype(str)))
    # Stack the question1 and question2 feature matrices side by side.
    X = np.hstack((X1, X2))
    column_names = []
    for i in tqdm(range(X.shape[1])):
        column_name = column_name_prefix + '.' + str(i)
        df[column_name] = X[:, i]
        column_names.append(column_name)
    column_names = pd.Index(column_names)
    df[column_names].to_csv(feature_output_file(data_file),
                            index=False, float_format='%.5f')
def create_features(self, input_file):
    output_file = feature_output_file(input_file)
    start_time = time.time()
    print("Start creating features {} {}".format(sys.argv[0], input_file),
          file=sys.stderr)
    if os.path.exists(output_file):
        print('File exists {}.'.format(output_file))
        return
    data = self.read_data(input_file)
    values = self.calculate_features(data)
    print("Finished creating features {} {}: {:.2f} [s]".format(
        sys.argv[0], input_file, time.time() - start_time), file=sys.stderr)
    start_time = time.time()
    print("Start writing features {} {}".format(sys.argv[0], input_file),
          file=sys.stderr)
    self.write_feature(column_name=self.get_column_name(input_file),
                       output_file=output_file,
                       values=values)
    print("Finished writing features {} {}: {:.2f} [s]".format(
        sys.argv[0], input_file, time.time() - start_time), file=sys.stderr)
def create_feature(data_file, neighbor_sets: defaultdict,
                   neighbor_weights: defaultdict):
    if os.path.exists(feature_output_file(data_file)):
        print('File exists {}.'.format(feature_output_file(data_file)))
        return
    df = pd.read_csv(data_file)
    print(sys.argv[0], data_file, file=sys.stderr)
    column_name_prefix = 'f{0}'.format(
        os.path.basename(feature_output_file(data_file)).split('_')[0])
    column_names = []
    # (suffix, calculator) pairs: each calculator maps the frame plus the
    # neighbor statistics to one feature column.
    features = [
        ('identity', calculate_identity),
        ('q1_weight_avg', calculate_q1_weight_avg),
        ('q2_weight_avg', calculate_q2_weight_avg),
        ('q1_q2_intersection_weight_sum', calculate_q1_q2_intersection_weight_sum),
        ('q1_q2_intersection_weight_avg', calculate_q1_q2_intersection_weight_avg),
        ('q1_q2_intersection_weight_max', calculate_q1_q2_intersection_weight_max),
        ('q1_q2_intersection_weight_min', calculate_q1_q2_intersection_weight_min),
        ('q1_q2_intersection_weight_diff_max', calculate_q1_q2_intersection_weight_diff_max),
        ('q1_q2_intersection_weight_diff_min', calculate_q1_q2_intersection_weight_diff_min),
    ]
    for column_name_suffix, feature_calculator in features:
        column_name = column_name_prefix + '.' + column_name_suffix
        df[column_name] = feature_calculator(df, neighbor_sets, neighbor_weights)
        column_names.append(column_name)
    column_names = pd.Index(column_names)
    df[column_names].to_csv(feature_output_file(data_file),
                            index=False, float_format='%.5f')
def create_feature(data_file):
    if os.path.exists(feature_output_file(data_file)):
        print('File exists {}.'.format(feature_output_file(data_file)))
        return
    df = nltk_pos_tag(data_file)
    print(sys.argv[0], data_file, file=sys.stderr)
    column_name_prefix = 'f{0}'.format(
        os.path.basename(feature_output_file(data_file)).split('_')[0])
    # Binary flags: does each question contain a past-tense verb (POS tag VBD)?
    q1_past = []
    q2_past = []
    for i, row in df.iterrows():
        q1_has = 0
        for w, t in ast.literal_eval(row['question1']):
            if t == 'VBD':
                q1_has = 1
                break
        q1_past.append(q1_has)
        q2_has = 0
        for w, t in ast.literal_eval(row['question2']):
            if t == 'VBD':
                q2_has = 1
                break
        q2_past.append(q2_has)
    column_names = [column_name_prefix + '.1', column_name_prefix + '.2']
    df[column_names[0]] = q1_past
    df[column_names[1]] = q2_past
    df[column_names].to_csv(feature_output_file(data_file),
                            index=False, float_format='%.5f')
def create_features_files(train_path, test_path): print(train_path, test_path) if os.path.exists(feature_output_file(train_path)) and os.path.exists( feature_output_file(test_path)): print('File exists {}.'.format(feature_output_file(train_path)) + ", " + feature_output_file(test_path)) return print('Preprocessing') train = nltk_tokenize(train_path) for q1, q2, dup in tqdm( zip(train.question1.astype(str), train.question2.astype(str), train.is_duplicate)): if dup: diff_pairs = list_diff_pairs(q1, q2) dup_counter.update(diff_pairs) print('freatures >= MIN_FREQ: {}'.format( sum(1 for t, freq in dup_counter.most_common() if freq >= MIN_FREQ))) print('Creating feature for train') create_features(train_path) print('Creating feature for test') create_features(test_path)
def create_feature(data_file, vectorizer):
    if os.path.exists(feature_output_file(data_file)):
        print('File exists {}.'.format(feature_output_file(data_file)))
        return
    df = nltk_tokenize(data_file)
    print(sys.argv[0], data_file, file=sys.stderr)
    column_name_prefix = 'f{0}'.format(
        os.path.basename(feature_output_file(data_file)).split('_')[0])
    # Pair each question1 with its question2 and vectorize the word diffs.
    diff_pairs = [
        list_diff_pairs(q1, q2) for q1, q2 in tqdm(
            zip(df.question1.astype(str), df.question2.astype(str)))
    ]
    # Densify once so the columns can be assigned directly below (assumes the
    # diff-pair vocabulary is small enough to hold in memory).
    X = vectorizer.transform(diff_pairs).toarray()
    column_names = []
    for i in range(X.shape[1]):
        column_name = column_name_prefix + '.' + str(i)
        df[column_name] = X[:, i]
        column_names.append(column_name)
    column_names = pd.Index(column_names)
    df[column_names].to_csv(feature_output_file(data_file),
                            index=False, float_format='%.5f')
def create_word_match_feature(data_file):
    if os.path.exists(feature_output_file(data_file)):
        print('File exists {}.'.format(feature_output_file(data_file)))
        return
    df = nltk_stemming(data_file)
    print(sys.argv[0], data_file, file=sys.stderr)
    column_name = 'f{0}'.format(
        os.path.basename(feature_output_file(data_file)).split('_')[0])
    df[column_name] = df.apply(word_match_share, axis=1, raw=True)
    df[[column_name]].to_csv(feature_output_file(data_file),
                             index=False, float_format='%.5f')
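# `word_match_share` is defined elsewhere in the repo. A minimal sketch of the
# widely used Quora-competition variant is given below for reference; treat the
# exact body as an assumption, not the repo's verbatim helper. Note that the
# caller above passes raw=True, so the real helper receives a positional
# ndarray rather than a labeled Series; this sketch uses named access for
# readability.
from nltk.corpus import stopwords

STOPWORDS = set(stopwords.words('english'))

def word_match_share_sketch(row):
    # Share of non-stopword tokens common to both questions.
    q1_words = {w for w in str(row['question1']).lower().split()
                if w not in STOPWORDS}
    q2_words = {w for w in str(row['question2']).lower().split()
                if w not in STOPWORDS}
    if not q1_words or not q2_words:
        return 0.0
    shared = len(q1_words & q2_words)
    return 2.0 * shared / (len(q1_words) + len(q2_words))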
def create_feature(data_file):
    if os.path.exists(feature_output_file(data_file)):
        print('File exists {}.'.format(feature_output_file(data_file)))
        return
    print(sys.argv[0], file=sys.stderr)
    df = nltk_pos_tag(data_file)
    print(df.head(), file=sys.stderr)
    column_name = 'f{0}'.format(
        os.path.basename(feature_output_file(data_file)).split('_')[0])
    # Jaccard similarity over the proper nouns of the two questions.
    df[column_name] = df.apply(pnoun_jaccard, axis=1, raw=True).values
    df[[column_name]].to_csv(feature_output_file(data_file),
                             index=False, float_format='%.5f')
def create_word_match_feature(data_file, model: gensim.models.KeyedVectors):
    if os.path.exists(feature_output_file(data_file)):
        print('File exists {}.'.format(feature_output_file(data_file)))
        return
    df = nltk_tokenize(data_file)
    print(sys.argv[0], data_file, file=sys.stderr)
    column_name = 'f{0}'.format(
        os.path.basename(feature_output_file(data_file)).split('_')[0])
    # Word Mover's Distance between the two questions under the given model.
    df[column_name] = df.apply(wmd, axis=1, raw=True, model=model)
    df[[column_name]].to_csv(feature_output_file(data_file),
                             index=False, float_format='%.5f')
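# `wmd` is likewise defined elsewhere; a plausible sketch built on gensim's
# KeyedVectors.wmdistance is shown below (an assumption about the helper, not
# its verbatim body; wmdistance requires the pyemd package). As above, the
# real helper is applied with raw=True and so receives a positional ndarray.
def wmd_sketch(q1, q2, model):
    # Word Mover's Distance between two whitespace-tokenized questions.
    return model.wmdistance(str(q1).lower().split(), str(q2).lower().split())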
def create_features_files(train_path, test_path): print(train_path, test_path) if os.path.exists(feature_output_file(train_path)) and os.path.exists( feature_output_file(test_path)): print('File exists {}.'.format(feature_output_file(train_path)) + ", " + feature_output_file(test_path)) return print('Creating feature for train') create_features(train_path) print('Creating feature for test') create_features(test_path)
def create_feature(data_file):
    if os.path.exists(feature_output_file(data_file)):
        print('File exists {}.'.format(feature_output_file(data_file)))
        return
    question1_vectors, question2_vectors = sentence2vec(data_file)
    print(sys.argv[0], data_file, file=sys.stderr)
    df = pd.DataFrame()
    column_name = 'f{0}'.format(
        os.path.basename(feature_output_file(data_file)).split('_')[0])
    # Kurtosis of each question2 sentence vector (the question1 vectors are
    # produced by sentence2vec but not used by this feature).
    df[column_name] = np.nan_to_num(
        [kurtosis(x) for x in np.nan_to_num(question2_vectors)])
    df[[column_name]].to_csv(feature_output_file(data_file),
                             index=False, float_format='%.5f')
def create_feature(data_file, df):
    if os.path.exists(feature_output_file(data_file)):
        print('File exists {}.'.format(feature_output_file(data_file)))
        return
    column_name_prefix = 'f{0}'.format(
        os.path.basename(feature_output_file(data_file)).split('_')[0])
    q1_see_later = column_name_prefix + '.q1'
    q2_see_later = column_name_prefix + '.q2'
    column_names = [q1_see_later, q2_see_later]
    out_df = pd.DataFrame()
    out_df[q1_see_later] = df['see_later1'].tolist()
    out_df[q2_see_later] = df['see_later2'].tolist()
    out_df[column_names].to_csv(feature_output_file(data_file), index=False)
def create_feature(data_file, vectorizer: TfidfVectorizer):
    if os.path.exists(feature_output_file(data_file)):
        print('File exists {}.'.format(feature_output_file(data_file)))
        return
    df = convert(nltk_pos_tag(data_file))
    cosine_values = []
    q1vec = vectorizer.transform(
        df['question1'].apply(lambda x: x if isinstance(x, str) else '').values)
    q2vec = vectorizer.transform(
        df['question2'].apply(lambda x: x if isinstance(x, str) else '').values)
    # TF-IDF rows are L2-normalized, so the dot product is the cosine similarity.
    for i in range(df.shape[0]):
        cosine_values.append(round(float(q1vec[i].dot(q2vec[i].T)[0, 0]), 5))
    column_name = 'f{0}'.format(
        os.path.basename(feature_output_file(data_file)).split('_')[0])
    df[column_name] = cosine_values
    df[[column_name]].to_csv(feature_output_file(data_file),
                             index=False, float_format='%.5f')
def create_features_files(train_path, test_path): if os.path.exists(feature_output_file(train_path)) and os.path.exists( feature_output_file(test_path)): print('File exists {}.'.format(feature_output_file(train_path)) + ", " + feature_output_file(test_path)) return global limiter limiter = DepthLimiter() print('Creating feature for train') create_features(train_path) print('Creating feature for test') create_features(test_path)
def create_feature(data_file, vectorizer):
    if os.path.exists(feature_output_file(data_file)):
        print('File exists {}.'.format(feature_output_file(data_file)))
        return
    df = nltk_stemming(data_file)
    print(sys.argv[0], data_file, file=sys.stderr)
    column_name = 'f{0}'.format(
        os.path.basename(feature_output_file(data_file)).split('_')[0])
    X1 = vectorizer.transform(df.question1.values.astype(str))
    X2 = vectorizer.transform(df.question2.values.astype(str))
    values = []
    for i in tqdm(range(X1.shape[0])):
        # Row-wise dot product; extract the scalar from the 1x1 sparse result.
        values.append(float(X1[i].dot(X2[i].T)[0, 0]))
    df[column_name] = values
    df[[column_name]].to_csv(feature_output_file(data_file),
                             index=False, float_format='%.5f')
def create_features(data_path):
    data = pd.read_csv(data_path)
    df = pd.DataFrame()
    # One dependency-depth value per question pair, computed in parallel.
    df['dep_depth_limit'] = Parallel(n_jobs=-1, verbose=5)(
        delayed(create_feature)(q1, q2) for q1, q2 in zip(
            data.question1.astype(str), data.question2.astype(str)))
    df.to_csv(feature_output_file(data_path), index=False, float_format='%.5f')
def create_feature(data_file, model):
    if os.path.exists(feature_output_file(data_file)):
        print('File exists {}.'.format(feature_output_file(data_file)))
        return
    df = nltk_tokenize(data_file)
    print(sys.argv[0], data_file, file=sys.stderr)
    column_name = 'f{0}'.format(
        os.path.basename(feature_output_file(data_file)).split('_')[0])
    values = np.zeros(df.shape[0])
    for i in tqdm(range(df.shape[0])):
        q1 = df.question1.values[i]
        q2 = df.question2.values[i]
        values[i] = calculate_distance(q1, q2, model)
    df[column_name] = values
    df[[column_name]].to_csv(feature_output_file(data_file),
                             index=False, float_format='%.5f')
def create_feature(data_file, questions_dict, q1_vc, q2_vc):
    if os.path.exists(feature_output_file(data_file)):
        print('File exists {}.'.format(feature_output_file(data_file)))
        return
    df = pd.read_csv(data_file)
    column_name_prefix = 'f{0}'.format(
        os.path.basename(feature_output_file(data_file)).split('_')[0])
    q1_hash = column_name_prefix + '.q1_hash.cat'
    q2_hash = column_name_prefix + '.q2_hash.cat'
    q1_freq = column_name_prefix + '.q1_freq'
    q2_freq = column_name_prefix + '.q2_freq'
    column_names = [q1_freq, q2_freq]
    # Map each question to its id, then count how often that id appears on
    # either side of the pairs (the question-frequency feature).
    df[q1_hash] = df['question1'].map(questions_dict)
    df[q2_hash] = df['question2'].map(questions_dict)
    df[q1_freq] = df[q1_hash].map(
        lambda x: try_apply_dict(x, q1_vc) + try_apply_dict(x, q2_vc))
    df[q2_freq] = df[q2_hash].map(
        lambda x: try_apply_dict(x, q1_vc) + try_apply_dict(x, q2_vc))
    df[column_names].to_csv(feature_output_file(data_file), index=False)
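# `try_apply_dict` is a small helper defined elsewhere in the repo; the sketch
# below captures its assumed behavior: look a key up in a value-counts mapping
# and fall back to 0 for unseen keys (including ids missing from the mapping).
def try_apply_dict_sketch(key, counts):
    try:
        return counts[key]
    except KeyError:
        return 0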
def create_features(data_path):
    data = nltk_tokenize(data_path)
    # Each worker returns a dict of feature values for one question pair.
    feature_dicts = Parallel(n_jobs=-1, verbose=3)(
        delayed(create_feature)(q1, q2) for q1, q2 in zip(
            data.question1.astype(str), data.question2.astype(str)))
    df = pd.DataFrame(feature_dicts)
    df.to_csv(feature_output_file(data_path), index=False, float_format='%.5f')
def create_features(data_path):
    print('data_path file: {}'.format(data_path))
    data = pd.read_csv(data_path)  # [:1000] to subsample for a quick run
    features = Parallel(n_jobs=-1, verbose=5)(
        delayed(create_feature)(q1, q2) for q1, q2 in zip(
            data.question1.astype(str), data.question2.astype(str)))
    df = pd.DataFrame(features)
    df.to_csv(feature_output_file(data_path), index=False, float_format='%.5f')
def create_feature(data_file, model: gensim.models.KeyedVectors):
    if os.path.exists(feature_output_file(data_file)):
        print('File exists {}.'.format(feature_output_file(data_file)))
        return
    question1_vectors, question2_vectors = sentence2vec(data_file)
    print(sys.argv[0], data_file, file=sys.stderr)
    df = pd.DataFrame()
    column_name = 'f{0}'.format(
        os.path.basename(feature_output_file(data_file)).split('_')[0])
    # Jaccard distance between the paired sentence vectors.
    df[column_name] = np.nan_to_num([
        jaccard(x, y)
        for (x, y) in zip(np.nan_to_num(question1_vectors),
                          np.nan_to_num(question2_vectors))
    ])
    df[[column_name]].to_csv(feature_output_file(data_file),
                             index=False, float_format='%.5f')
def create_word_match_feature(data_file, model: gensim.models.Doc2Vec):
    if os.path.exists(feature_output_file(data_file)):
        print('File exists {}.'.format(feature_output_file(data_file)))
        return
    df = nltk_tokenize(data_file)
    print(sys.argv[0], data_file, file=sys.stderr)
    column_name_prefix = 'f{0}'.format(
        os.path.basename(feature_output_file(data_file)).split('_')[0])
    X1 = calc_document_vector(df.question1.values.astype(str).tolist(), model)
    X2 = calc_document_vector(df.question2.values.astype(str).tolist(), model)
    # Stack the question1 and question2 document vectors side by side.
    X = np.hstack((X1, X2))
    column_names = []
    for i in tqdm(range(X.shape[1])):
        column_name = column_name_prefix + '.' + str(i)
        df[column_name] = X[:, i]
        column_names.append(column_name)
    column_names = pd.Index(column_names)
    print('Start to write dataset')
    df[column_names].to_csv(feature_output_file(data_file),
                            index=False, float_format='%.5f')
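# `calc_document_vector` is defined elsewhere; a plausible sketch using
# gensim's Doc2Vec.infer_vector is shown below. The tokenization and the
# function's exact contract are assumptions, not the repo's verbatim helper.
def calc_document_vector_sketch(documents, model):
    # One inferred paragraph vector per input document.
    return np.array([model.infer_vector(doc.lower().split())
                     for doc in documents])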
def create_features_files(train_path, test_path):
    print(train_path, test_path)
    if os.path.exists(feature_output_file(train_path)) and os.path.exists(
            feature_output_file(test_path)):
        print('File exists {}, {}.'.format(feature_output_file(train_path),
                                           feature_output_file(test_path)))
        return
    print('Preprocessing')
    train = nltk_tokenize(train_path)
    # Word counts over all questions, and separately over duplicate pairs.
    for q1, q2, dup in tqdm(zip(train.question1.astype(str),
                                train.question2.astype(str),
                                train.is_duplicate)):
        words1 = q1.split()
        words2 = q2.split()
        all_counter.update(words1)
        all_counter.update(words2)
        if dup:
            dup_counter.update(words1)
            dup_counter.update(words2)
    print('Creating feature for train')
    create_features(train_path)
    print('Creating feature for test')
    create_features(test_path)
def create_feature(data_file):
    if os.path.exists(feature_output_file(data_file)):
        print('File exists {}.'.format(feature_output_file(data_file)))
        return
    # Lazily build the dependency n-grams creator once per process.
    global creator
    if creator is None:
        print('initializing creator')
        creator = DependencyNgramsCreator()
    print('read: {}'.format(data_file))
    df = pd.read_csv(data_file)
    df_features = create_df(df, 'dep_2grams_common_ratio_stop',
                            n=2, skip_stopwords=True)
    print('write: {}'.format(feature_output_file(data_file)))
    df_features.to_csv(feature_output_file(data_file),
                       index=False, float_format='%.5f')
def create_feature(train_path, test_path, n, skip_stopwords):
    if os.path.exists(feature_output_file(train_path)) and os.path.exists(
            feature_output_file(test_path)):
        print('File exists {}, {}.'.format(feature_output_file(train_path),
                                           feature_output_file(test_path)))
        return
    global creator
    global vectorizer
    print('start preprocessing')
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)
    creator = DependencyNgramsCreator(n=n, skip_stopwords=skip_stopwords)
    train_q1_ngram_lists = create_ngrams_lists(train.question1.astype(str))
    train_q2_ngram_lists = create_ngrams_lists(train.question2.astype(str))
    test_q1_ngram_lists = create_ngrams_lists(test.question1.astype(str))
    test_q2_ngram_lists = create_ngrams_lists(test.question2.astype(str))
    # The n-grams are already tokenized lists, so pass them through unchanged.
    vectorizer = TfidfVectorizer(tokenizer=lambda a: a, lowercase=False,
                                 min_df=10, max_df=0.5)
    vectorizer.fit(train_q1_ngram_lists + train_q2_ngram_lists +
                   test_q1_ngram_lists + test_q2_ngram_lists)
    print('finish preprocessing')

    print('train')
    train_q1_tfidf = vectorizer.transform(train_q1_ngram_lists)
    train_q2_tfidf = vectorizer.transform(train_q2_ngram_lists)
    train_feature = pd.DataFrame()
    train_feature['dep_2grams_sum_tfidf_q1'] = np.array(
        train_q1_tfidf.sum(axis=1)).flatten()
    train_feature['dep_2grams_sum_tfidf_q2'] = np.array(
        train_q2_tfidf.sum(axis=1)).flatten()
    # Rows are L2-normalized, so this elementwise-product sum is the cosine.
    train_feature['dep_2grams_tfidf_cosine'] = np.array(
        train_q1_tfidf.multiply(train_q2_tfidf).sum(axis=1)).flatten()
    train_feature.to_csv(feature_output_file(train_path),
                         index=False, float_format='%.5f')

    print('test')
    test_q1_tfidf = vectorizer.transform(test_q1_ngram_lists)
    test_q2_tfidf = vectorizer.transform(test_q2_ngram_lists)
    test_feature = pd.DataFrame()
    test_feature['dep_2grams_sum_tfidf_q1'] = np.array(
        test_q1_tfidf.sum(axis=1)).flatten()
    test_feature['dep_2grams_sum_tfidf_q2'] = np.array(
        test_q2_tfidf.sum(axis=1)).flatten()
    test_feature['dep_2grams_tfidf_cosine'] = np.array(
        test_q1_tfidf.multiply(test_q2_tfidf).sum(axis=1)).flatten()
    test_feature.to_csv(feature_output_file(test_path),
                        index=False, float_format='%.5f')
def get_column_name(self, input_file):
    # Feature column name prefix, derived from the feature output file name.
    return 'f{0}'.format(
        os.path.basename(feature_output_file(input_file)).split('_')[0])
def check_feature_existence(feature_creator_file, data_prefix):
    # True only if every generated data file already has its feature output.
    for k, file_name in generate_filename_from_prefix(data_prefix):
        if not os.path.exists(
                feature_output_file(file_name, feature_creator_file)):
            return False
    return True
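# Hypothetical driver showing how the helpers above are typically wired
# together; the data prefix is a placeholder and `create_feature`'s signature
# varies per feature script, so treat this purely as a usage sketch.
if __name__ == '__main__':
    if not check_feature_existence(__file__, 'data/input'):
        for _, file_name in generate_filename_from_prefix('data/input'):
            create_feature(file_name)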
def get_column_name(self, input_file):
    prefix = 'f{0}'.format(
        os.path.basename(feature_output_file(input_file)).split('_')[0])
    # Three feature columns share the prefix, with consistent '.xx' suffixes.
    return "{},{},{}".format(prefix + ".mc", prefix + ".jc", prefix + ".ds")
def create_feature(data_file, features: pd.DataFrame):
    if os.path.exists(feature_output_file(data_file)):
        print('File exists {}.'.format(feature_output_file(data_file)))
        return
    features.to_csv(feature_output_file(data_file), index=False)