def prepare(self):
    """Fit ``self.vectorizer`` on the questions of the train and test files.

    The 'train' and 'test' entries of ``self.input_files`` are each loaded
    through ``nltk_stemming_without_stopwords``. The fitting corpus is the
    concatenation, in this order, of train ``question1``, train
    ``question2``, test ``question1`` and test ``question2``, with every
    entry cast to ``str``.
    """
    frames = [
        nltk_stemming_without_stopwords(self.input_files[split])
        for split in ('train', 'test')
    ]

    corpus = []
    for frame in frames:
        for column in ('question1', 'question2'):
            corpus.extend(frame[column].tolist())

    # NOTE(review): astype(str) presumably turns missing questions into the
    # literal token 'nan' -- confirm this is intended for the vocabulary.
    train_qs = pd.Series(corpus).astype(str)
    self.vectorizer.fit(train_qs.values)
def read_data(self, data_file):
    """Vectorize both question columns and pair them up row by row.

    Args:
        data_file: Path handed to ``nltk_stemming_without_stopwords``.

    Returns:
        A list with one ``(q1_row, q2_col)`` tuple per input row, where
        ``q1_row`` is row ``i`` of the vectorized ``question1`` matrix and
        ``q2_col`` is column ``i`` of the *transposed* vectorized
        ``question2`` matrix.
    """
    # Load and preprocess the file once; both columns come from the same frame.
    df = nltk_stemming_without_stopwords(data_file)
    X1 = self.vectorizer.transform(df['question1'].fillna("").tolist())
    # Transposed so that each question2 vector is available as a column.
    X2 = self.vectorizer.transform(df['question2'].fillna("").tolist()).T
    return [(X1.getrow(i), X2.getcol(i)) for i in range(X1.shape[0])]
def read_data(self, data_file):
    """Return the per-row sums of the vectorized question columns.

    Args:
        data_file: Path handed to ``nltk_stemming_without_stopwords``.

    Returns:
        A 2-column array: column 0 holds, for each row, the sum over all
        vectorizer features of ``question1``; column 1 holds the same for
        ``question2``.
    """
    # Load and preprocess the file once; both columns come from the same frame.
    df = nltk_stemming_without_stopwords(data_file)

    def _row_sums(column):
        # Sum of all feature values per row, shaped as a single column.
        matrix = self.vectorizer.transform(df[column].fillna("").tolist())
        return np.array(matrix.sum(axis=1)).reshape(-1, 1)

    X1 = _row_sums('question1')
    X2 = _row_sums('question2')
    return np.hstack((X1, X2))
def read_data(self, data_file):
    """Return the difference between the vectorized question columns.

    Args:
        data_file: Path handed to ``nltk_stemming_without_stopwords``.

    Returns:
        ``transform(question1) - transform(question2)``, computed with
        ``self.vectorizer`` after replacing missing questions by "".
    """
    # Load and preprocess the file once; both columns come from the same frame.
    df = nltk_stemming_without_stopwords(data_file)
    X1 = self.vectorizer.transform(df['question1'].fillna("").tolist())
    X2 = self.vectorizer.transform(df['question2'].fillna("").tolist())
    return X1 - X2
def main():
    """Fit a TfKLdVectorizer plus a 10-component SVD and emit features.

    The vectorizer is fitted on the question pairs and labels of the train
    frame concatenated with the frame returned by ``get_test_df``. The SVD
    pipeline is fitted on the vectorized questions. ``create_feature`` is
    then called for every file yielded by ``generate_filename_from_prefix``.
    """
    options = common_feature_parser().parse_args()

    train_file = dict(generate_filename_from_prefix(options.data_prefix))['train']
    df_train = nltk_stemming_without_stopwords(train_file)
    test_file = dict(generate_filename_from_prefix(options.data_prefix))['test']
    df_test = get_test_df(test_file)

    vectorizer = TfKLdVectorizer(alpha=1, divergence='js',
                                 ngram_range=(1, 2), max_df=0.4, min_df=5)

    def _train_then_test(column):
        # Train values first, test values appended after them.
        return pd.Series(df_train[column].tolist() + df_test[column].tolist())

    train_q1s = _train_then_test('question1').astype(str)
    train_q2s = _train_then_test('question2').astype(str)
    train_ys = _train_then_test('is_duplicate').astype(int)
    vectorizer.fit(train_q1s, train_q2s, train_ys)

    # All first questions followed by all second questions.
    train_qs = pd.Series(train_q1s.tolist() + train_q2s.tolist()).astype(str)
    value_qs = vectorizer.transform(train_qs)
    print(value_qs)

    pipeline = make_pipeline(TruncatedSVD(n_components=10))
    pipeline.fit(value_qs)

    # NOTE(review): the create_feature visible in this chunk takes only
    # (data_file, vectorizer) -- confirm which definition this main uses.
    for _, file_name in generate_filename_from_prefix(options.data_prefix):
        create_feature(data_file=file_name, vectorizer=vectorizer,
                       pipeline=pipeline)
def get_test_df(filename):
    """Load the test frame and attach binary labels from a submission file.

    The submission ``gbm_34.json.gbm_34.json.submission.csv`` is read from
    the ``output`` directory that sits next to the directory containing
    ``filename`` and inner-joined on ``test_id``. Only rows whose
    ``is_duplicate`` score lies strictly between 0.01 and 0.05, or is above
    0.8, are kept; the score is then binarized at 0.5.

    Args:
        filename: Path of the test data file.

    Returns:
        The filtered frame with ``is_duplicate`` replaced by 0/1 integers.
    """
    df_test = nltk_stemming_without_stopwords(filename)

    # os.path.join keeps the path relative when `filename` has no directory
    # part; plain concatenation would yield the absolute '/../output/...'.
    submission_path = os.path.join(
        os.path.dirname(filename), '..', 'output',
        'gbm_34.json.gbm_34.json.submission.csv')
    test_submission = pd.read_csv(submission_path)

    df_test = df_test.merge(test_submission, how='inner', on='test_id')

    score = df_test.is_duplicate
    selected = ((score < 0.05) & (score > 0.01)) | (score > 0.8)
    # Copy so the column assignment below writes to an independent frame
    # rather than to a slice of the merged one.
    df_test = df_test[selected].copy()
    df_test['is_duplicate'] = (df_test.is_duplicate > 0.5).astype(int)
    return df_test
def main():
    """Fit a TF-IDF vectorizer on the train questions and emit features.

    The vectorizer (``ngram_range=(3, 3)``, ``max_df=0.5``, ``min_df=2``,
    L2 norm) is fitted on ``question1`` followed by ``question2`` of the
    train file, each cast to ``str``. ``create_feature`` is then called for
    every file yielded by ``generate_filename_from_prefix``.
    """
    options = common_feature_parser().parse_args()
    df_train = nltk_stemming_without_stopwords(
        dict(generate_filename_from_prefix(options.data_prefix))['train'])
    train_qs = pd.Series(df_train['question1'].tolist() +
                         df_train['question2'].tolist()).astype(str)

    vectorizer = TfidfVectorizer(max_df=0.5, min_df=2, norm='l2',
                                 ngram_range=(3, 3))
    # Only the fitted state is needed here; fit() avoids building a
    # transformed matrix that would be thrown away immediately.
    vectorizer.fit(train_qs.values)

    for _, file_name in generate_filename_from_prefix(options.data_prefix):
        create_feature(data_file=file_name, vectorizer=vectorizer)
def main():
    """Fit a CountVectorizer + 10-component SVD pipeline and emit features.

    The pipeline is fitted on ``question1`` followed by ``question2`` of the
    train file, each cast to ``str``. It is then passed as the ``vectorizer``
    argument of ``create_feature`` for every file yielded by
    ``generate_filename_from_prefix``.
    """
    options = common_feature_parser().parse_args()

    train_path = dict(generate_filename_from_prefix(options.data_prefix))['train']
    df_train = nltk_stemming_without_stopwords(train_path)

    first_questions = df_train['question1'].tolist()
    second_questions = df_train['question2'].tolist()
    train_qs = pd.Series(first_questions + second_questions).astype(str)

    bag_of_words = CountVectorizer(max_df=0.5, min_df=2)
    reducer = TruncatedSVD(n_components=10)
    pipeline = make_pipeline(bag_of_words, reducer)
    pipeline.fit(train_qs.values)

    for _, file_name in generate_filename_from_prefix(options.data_prefix):
        create_feature(data_file=file_name, vectorizer=pipeline)
def create_feature(data_file, vectorizer):
    """Write the vectorized question features of ``data_file`` to CSV.

    Nothing is done if the output file already exists. Otherwise
    ``question1`` and ``question2`` are transformed separately, stacked
    side by side, and written with one column per feature. Columns are
    named ``f<id>.<index>``, where ``<id>`` is the part of the output file's
    base name before the first underscore.

    Args:
        data_file: Path of the input data file.
        vectorizer: Fitted object exposing ``transform``.
            NOTE(review): ``np.hstack`` presumably needs dense 2-D output
            from ``transform`` -- confirm for each vectorizer passed in.
    """
    # Resolve the output path once instead of on every use.
    output_file = feature_output_file(data_file)
    if os.path.exists(output_file):
        print('File exists {}.'.format(output_file))
        return

    df = nltk_stemming_without_stopwords(data_file)
    print(sys.argv[0], data_file, file=sys.stderr)

    column_name_prefix = 'f{0}'.format(
        os.path.basename(output_file).split('_')[0])

    X1 = vectorizer.transform(df.question1.values.astype(str))
    X2 = vectorizer.transform(df.question2.values.astype(str))
    X = np.hstack((X1, X2))

    column_names = ['{0}.{1}'.format(column_name_prefix, i)
                    for i in range(X.shape[1])]
    # Build the feature table in one step; the input frame is only needed
    # for the question columns and is never written out.
    features = pd.DataFrame(X, columns=column_names)
    features.to_csv(output_file, index=False, float_format='%.5f')