def main():
    options = common_feature_parser().parse_args()
    df_train = nltk_tokenize(dict(generate_filename_from_prefix(options.data_prefix))['train'])
    df_test = nltk_tokenize(dict(generate_filename_from_prefix(options.data_prefix))['test'])
    # Count how often each diff pair appears among duplicate question pairs.
    dup_counter = Counter()
    for q1, q2, dup in tqdm(zip(df_train.question1.astype(str),
                                df_train.question2.astype(str),
                                df_train.is_duplicate)):
        if dup:
            dup_counter.update(list_diff_pairs(q1, q2))
    # Keep only the diff pairs that are frequent among duplicates.
    train_diff_pairs_ = [list_diff_pairs(q1, q2)
                         for q1, q2 in tqdm(zip(df_train.question1.astype(str),
                                                df_train.question2.astype(str)))]
    train_diff_pairs = [pair for pair in chain.from_iterable(train_diff_pairs_)
                        if dup_counter[pair] >= MIN_FREQ]
    test_diff_pairs_ = [list_diff_pairs(q1, q2)
                        for q1, q2 in tqdm(zip(df_test.question1.astype(str),
                                               df_test.question2.astype(str)))]
    test_diff_pairs = [pair for pair in chain.from_iterable(test_diff_pairs_)
                       if dup_counter[pair] >= MIN_FREQ]
    pipeline = make_pipeline(
        # Input is pre-tokenized, so the tokenizer is the identity function.
        CountVectorizer(max_df=0.5, min_df=MIN_FREQ, dtype=np.int32,
                        tokenizer=lambda a: a, lowercase=False),
        NMF(n_components=10, random_state=1, l1_ratio=.15, verbose=True))
    pipeline.fit(train_diff_pairs + test_diff_pairs)
    for k, file_name in generate_filename_from_prefix(options.data_prefix):
        create_feature(data_file=file_name, vectorizer=pipeline)
def main():
    options = common_feature_parser().parse_args()
    df_train = nltk_tokenize(dict(generate_filename_from_prefix(options.data_prefix))['train'])
    df_test = nltk_tokenize(dict(generate_filename_from_prefix(options.data_prefix))['test'])
    train_diff_pairs = [list_diff_pairs(q1, q2)
                        for q1, q2 in tqdm(zip(df_train.question1.astype(str),
                                               df_train.question2.astype(str)))]
    test_diff_pairs = [list_diff_pairs(q1, q2)
                       for q1, q2 in tqdm(zip(df_test.question1.astype(str),
                                              df_test.question2.astype(str)))]
    pipeline = make_pipeline(
        CountVectorizer(max_df=0.5, min_df=100, dtype=np.int32,
                        tokenizer=lambda a: a, lowercase=False),
        NMF(n_components=10, random_state=1, l1_ratio=.15, verbose=True))
    pipeline.fit(train_diff_pairs + test_diff_pairs)
    for k, file_name in generate_filename_from_prefix(options.data_prefix):
        create_feature(data_file=file_name, vectorizer=pipeline)
def main():
    options = common_feature_parser().parse_args()
    train_file = dict(generate_filename_from_prefix(options.data_prefix))['train']
    test_file = dict(generate_filename_from_prefix(options.data_prefix))['test']
    train_df = pd.read_csv(train_file)
    test_df = pd.read_csv(test_file)
    test_df['is_duplicate'] = -1
    test_df.rename(columns={'test_id': 'id'}, inplace=True)
    df = train_df.append(test_df)
    df.reset_index(inplace=True)

    from collections import Counter
    # Walk the combined frame from the last row to the first, recording how
    # many times each question text has already been seen *after* the current
    # row; reversing the lists aligns the counts with the original row order.
    see_later1 = []
    see_later2 = []
    sentence_counter1 = Counter()
    sentence_counter2 = Counter()
    for i in tqdm(range(df.shape[0])):
        row = df.iloc[-i - 1]
        q1 = str(row['question1'])
        q2 = str(row['question2'])
        see_later1.append(sentence_counter1[q1])
        see_later2.append(sentence_counter2[q2])
        sentence_counter1[q1] += 1
        sentence_counter2[q2] += 1
    df['see_later1'] = list(reversed(see_later1))
    df['see_later2'] = list(reversed(see_later2))
    create_feature(data_file=train_file, df=df[df.is_duplicate >= 0])
    create_feature(data_file=test_file, df=df[df.is_duplicate < 0])
def main():
    options = common_feature_parser().parse_args()
    df_train = nltk_stemming_without_stopwords(
        dict(generate_filename_from_prefix(options.data_prefix))['train'])
    df_test = get_test_df(
        dict(generate_filename_from_prefix(options.data_prefix))['test'])
    vectorizer = TfKLdVectorizer(alpha=1, divergence='js',
                                 ngram_range=(1, 2), max_df=0.4, min_df=5)
    train_q1s = pd.Series(df_train['question1'].tolist() +
                          df_test['question1'].tolist()).astype(str)
    train_q2s = pd.Series(df_train['question2'].tolist() +
                          df_test['question2'].tolist()).astype(str)
    train_ys = pd.Series(df_train['is_duplicate'].tolist() +
                         df_test['is_duplicate'].tolist()).astype(int)
    vectorizer.fit(train_q1s, train_q2s, train_ys)
    train_qs = pd.Series(train_q1s.tolist() + train_q2s.tolist()).astype(str)
    value_qs = vectorizer.transform(train_qs)
    print(value_qs)
    pipeline = make_pipeline(TruncatedSVD(n_components=10))
    pipeline.fit(value_qs)
    for k, file_name in generate_filename_from_prefix(options.data_prefix):
        create_feature(data_file=file_name, vectorizer=vectorizer, pipeline=pipeline)
def main():
    options = common_feature_parser().parse_args()
    # from https://www.kaggle.com/jturkewitz/magic-features-0-03-gain/notebook
    train_orig = pd.read_csv(dict(generate_filename_from_prefix(options.data_prefix))['train'])
    test_orig = pd.read_csv(dict(generate_filename_from_prefix(options.data_prefix))['test'])
    df1 = train_orig[['question1']].copy()
    df2 = train_orig[['question2']].copy()
    df1_test = test_orig[['question1']].copy()
    df2_test = test_orig[['question2']].copy()
    df2.rename(columns={'question2': 'question1'}, inplace=True)
    df2_test.rename(columns={'question2': 'question1'}, inplace=True)

    # Map every unique question text (train + test, both sides) to an integer id.
    train_questions = df1.append(df2)
    train_questions = train_questions.append(df1_test)
    train_questions = train_questions.append(df2_test)
    train_questions.drop_duplicates(subset=['question1'], inplace=True)
    train_questions.reset_index(inplace=True, drop=True)
    questions_dict = pd.Series(train_questions.index.values,
                               index=train_questions.question1.values).to_dict()

    train_cp = train_orig.copy()
    test_cp = test_orig.copy()
    train_cp.drop(['qid1', 'qid2'], axis=1, inplace=True)
    test_cp['is_duplicate'] = -1
    test_cp.rename(columns={'test_id': 'id'}, inplace=True)
    comb = pd.concat([train_cp, test_cp])
    comb['q1_hash'] = comb['question1'].map(questions_dict)
    comb['q2_hash'] = comb['question2'].map(questions_dict)
    q1_vc = comb.q1_hash.value_counts().to_dict()
    q2_vc = comb.q2_hash.value_counts().to_dict()
    for k, file_name in generate_filename_from_prefix(options.data_prefix):
        create_feature(data_file=file_name, questions_dict=questions_dict,
                       q1_vc=q1_vc, q2_vc=q2_vc)
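# Hedged sketch: the downstream create_feature for this script is not shown.
# Following the jturkewitz kernel cited above, it presumably maps each question
# to its shared id and emits the combined occurrence count as a "magic"
# frequency feature. The helper and column names below are illustrative
# assumptions, not code from this repo.
def _sketch_magic_frequency_features(df, questions_dict, q1_vc, q2_vc):
    q1_hash = df['question1'].map(questions_dict)
    q2_hash = df['question2'].map(questions_dict)
    # Total occurrences of each question on either side of any pair.
    freq = lambda h: q1_vc.get(h, 0) + q2_vc.get(h, 0)
    return pd.DataFrame({'q1_freq': q1_hash.map(freq),
                         'q2_freq': q2_hash.map(freq)})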
def main():
    file_prefix = '0.1925_lstm_leak_203_101_0.26_0.35_36'
    options = common_feature_parser().parse_args()
    input_files = dict(generate_filename_from_prefix(options.data_prefix))
    train_file = os.path.join(os.path.dirname(input_files['train']),
                              '../lstm/', file_prefix + '.stacking.csv')
    test_file = os.path.join(os.path.dirname(input_files['train']),
                             '../lstm/', file_prefix + '.submission.csv')
    neighbor_sets, neighbor_weights = prepare_graph_with_filenames(options, train_file, test_file)
    for k, file_name in generate_filename_from_prefix(options.data_prefix):
        create_feature(data_file=file_name,
                       neighbor_sets=neighbor_sets,
                       neighbor_weights=neighbor_weights)
def main():
    options = common_feature_parser().parse_args()
    df_train = nltk_stemming(dict(generate_filename_from_prefix(options.data_prefix))['train'])
    train_qs = pd.Series(df_train['question1'].tolist() +
                         df_train['question2'].tolist()).astype(str)
    vectorizer = TfidfVectorizer(max_df=0.5, min_df=1, norm='l2')
    vectorizer.fit_transform(train_qs.values)
    for k, file_name in generate_filename_from_prefix(options.data_prefix):
        create_feature(data_file=file_name, vectorizer=vectorizer)
def main():
    options = common_feature_parser().parse_args()
    print(sys.argv[0], file=sys.stderr)
    df_train = convert(nltk_pos_tag(dict(generate_filename_from_prefix(options.data_prefix))['train']))
    print(df_train.head(), file=sys.stderr)
    train_qs = pd.Series(df_train['question1'].tolist() +
                         df_train['question2'].tolist()).astype(str)
    vectorizer = TfidfVectorizer(max_df=0.5, min_df=1, norm='l2')
    vectorizer.fit_transform(train_qs.values)
    for k, file_name in generate_filename_from_prefix(options.data_prefix):
        create_feature(data_file=file_name, vectorizer=vectorizer)
def main():
    options = common_feature_parser().parse_args()
    df_train = nltk_stemming(dict(generate_filename_from_prefix(options.data_prefix))['train'])
    train_qs = pd.Series(df_train['question1'].tolist() +
                         df_train['question2'].tolist()).astype(str)
    pipeline = make_pipeline(
        TfidfVectorizer(max_df=0.5, min_df=2, norm='l2', ngram_range=(1, 2)),
        NMF(n_components=10, random_state=1, l1_ratio=.15))
    pipeline.fit(train_qs.values)
    for k, file_name in generate_filename_from_prefix(options.data_prefix):
        create_feature(data_file=file_name, vectorizer=pipeline)
def main():
    options = common_feature_parser().parse_args()
    df_train = nltk_stemming_without_stopwords(
        dict(generate_filename_from_prefix(options.data_prefix))['train'])
    train_qs = pd.Series(df_train['question1'].tolist() +
                         df_train['question2'].tolist()).astype(str)
    pipeline = make_pipeline(CountVectorizer(max_df=0.5, min_df=2),
                             TruncatedSVD(n_components=10))
    pipeline.fit(train_qs.values)
    for k, file_name in generate_filename_from_prefix(options.data_prefix):
        create_feature(data_file=file_name, vectorizer=pipeline)
def main():
    options = common_feature_parser().parse_args()
    df_train = nltk_tokenize(
        dict(generate_filename_from_prefix(options.data_prefix))['train'])
    train_qs = pd.Series(df_train['question1'].tolist() +
                         df_train['question2'].tolist()).astype(str)
    pipeline = make_pipeline(
        TfidfVectorizer(max_df=0.5, min_df=2, norm='l2'),
        # `n_topics` was renamed `n_components` in scikit-learn 0.19.
        LatentDirichletAllocation(n_components=10, random_state=1))
    pipeline.fit(train_qs.values)
    for k, file_name in generate_filename_from_prefix(options.data_prefix):
        create_feature(data_file=file_name, vectorizer=pipeline)
def main():
    options = common_feature_parser().parse_args()
    df_train = nltk_stemming_without_stopwords(
        dict(generate_filename_from_prefix(options.data_prefix))['train'])
    train_qs = pd.Series(df_train['question1'].tolist() +
                         df_train['question2'].tolist()).astype(str)
    vectorizer = CountVectorizer(max_df=0.5, min_df=2, ngram_range=(2, 2), binary=True)
    vectorizer.fit_transform(train_qs.values)
    for k, file_name in generate_filename_from_prefix(options.data_prefix):
        create_feature(data_file=file_name, vectorizer=vectorizer)
def main():
    options = common_feature_parser().parse_args()
    # from https://www.kaggle.com/jturkewitz/magic-features-0-03-gain/notebook
    train_df = pd.read_csv(
        dict(generate_filename_from_prefix(options.data_prefix))['train'])
    test_df = pd.read_csv(
        dict(generate_filename_from_prefix(options.data_prefix))['test'])
    counter = Counter()
    features = {
        'train': calculate_features(train_df, counter),
        'test': calculate_features(test_df, counter),
    }
    for k, file_name in generate_filename_from_prefix(options.data_prefix):
        create_feature(data_file=file_name, features=features[k])
def dump_graph(options):
    node_file = get_node_filename(options)
    edge_file = get_edge_filename(options)
    if os.path.exists(node_file) and os.path.exists(edge_file):
        print("File exists {} and {}".format(node_file, edge_file))
        return
    train_df = pd.read_csv(
        dict(generate_filename_from_prefix(options.data_prefix))['train'])
    test_df = pd.read_csv(
        dict(generate_filename_from_prefix(options.data_prefix))['test'])
    edges, question2id = build_graph(test_df, train_df)
    dump_load_file(node_file, question2id)
    with open(edge_file, 'w') as f:
        for e in edges:
            print(e, file=f)
def prepare_graph_with_filenames(options, train_file, test_file):
    input_files = dict(generate_filename_from_prefix(options.data_prefix))
    print('Stacking ingredients: {} and {}'.format(train_file, test_file),
          file=sys.stderr)
    neighbor_sets = defaultdict(set)
    neighbor_weights = defaultdict(dict)
    dfs = []
    df_train = pd.read_csv(input_files['train'])
    df_train['prob'] = pd.read_csv(train_file)['prediction']
    dfs.append(df_train)
    df_test = pd.read_csv(input_files['test'])
    df_test['prob'] = pd.read_csv(test_file)['is_duplicate']
    dfs.append(df_test)
    # Build an undirected question graph whose edge weights are the stacked
    # predictions (train) or submission probabilities (test).
    for df in dfs:
        for q1, q2, value in tqdm(zip(df.question1.astype(str),
                                      df.question2.astype(str),
                                      df.prob)):
            neighbor_sets[q1].add(q2)
            neighbor_weights[q1][q2] = value
            neighbor_sets[q2].add(q1)
            neighbor_weights[q2][q1] = value
    return neighbor_sets, neighbor_weights
def prepare_graph(options, file_prefix):
    input_files = dict(generate_filename_from_prefix(options.data_prefix))
    train_file = os.path.join(os.path.dirname(input_files['train']),
                              '../output/', file_prefix + '.model.train.pred')
    test_file = os.path.join(os.path.dirname(input_files['train']),
                             '../output/', file_prefix + '.submission.csv')
    return prepare_graph_with_filenames(options, train_file, test_file)
def main():
    parser = common_feature_parser()
    parser.add_argument('--google_word2vec',
                        default='data/input/GoogleNews-vectors-negative300.bin',
                        type=str)
    options = parser.parse_args()
    model = gensim.models.KeyedVectors.load_word2vec_format(options.google_word2vec, binary=True)
    for k, file_name in generate_filename_from_prefix(options.data_prefix):
        create_feature(data_file=file_name, model=model)
def main():
    options = common_feature_parser().parse_args()
    df_train = nltk_stemming(dict(generate_filename_from_prefix(options.data_prefix))['train'])
    df_test = nltk_stemming(dict(generate_filename_from_prefix(options.data_prefix))['test'])
    train_qs = pd.Series(df_train['question1'].tolist() +
                         df_train['question2'].tolist() +
                         df_test['question1'].tolist() +
                         df_test['question2'].tolist()).astype(str)
    pipeline = make_pipeline(
        TfidfVectorizer(max_df=0.5, min_df=2, norm='l2',
                        ngram_range=(1, 2), dtype=np.float32),
        TruncatedSVD(n_components=50, n_iter=10))
    pipeline.fit(train_qs.values)
    for k, file_name in generate_filename_from_prefix(options.data_prefix):
        create_feature(data_file=file_name, vectorizer=pipeline)
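# Usage note (an assumption about the unshown create_feature helper): the
# fitted pipeline above is an LSA model, so each question string can be
# embedded with e.g. `pipeline.transform(['what is machine learning'])`,
# yielding a (1, 50) array, and pairwise features derived from the two
# question embeddings.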
def main():
    file_prefix = 'xgb_cross_0.json.xgb_cross_0.json'
    options = common_feature_parser().parse_args()
    neighbor_sets, neighbor_weights = prepare_graph(options, file_prefix)
    for k, file_name in generate_filename_from_prefix(options.data_prefix):
        create_feature(data_file=file_name,
                       neighbor_sets=neighbor_sets,
                       neighbor_weights=neighbor_weights)
def main():
    options = common_feature_parser().parse_args()
    df_train = nltk_tokenize(
        dict(generate_filename_from_prefix(options.data_prefix))['train'])
    df_test = nltk_tokenize(
        dict(generate_filename_from_prefix(options.data_prefix))['test'])
    train_qs = pd.Series(df_train['question1'].tolist() +
                         df_train['question2'].tolist() +
                         df_test['question1'].tolist() +
                         df_test['question2'].tolist()).astype(str)
    pipeline = make_pipeline(
        CountVectorizer(max_df=0.5, min_df=2, max_features=200),
        TfidfTransformer(norm='l2'),
        TruncatedSVD(n_components=10))
    pipeline.fit(train_qs.values)
    for k, file_name in generate_filename_from_prefix(options.data_prefix):
        create_feature(data_file=file_name, vectorizer=pipeline)
def main():
    options = common_feature_parser().parse_args()
    # from https://www.kaggle.com/jturkewitz/magic-features-0-03-gain/notebook
    train_df = pd.read_csv(
        dict(generate_filename_from_prefix(options.data_prefix))['train'])
    test_df = pd.read_csv(
        dict(generate_filename_from_prefix(options.data_prefix))['test'])
    # Unite every question pair so that questions connected through any chain
    # of pairs end up in the same component.
    uf = UnionFind()
    for i, row in tqdm(train_df.iterrows()):
        uf.unite(str(row['question1']), str(row['question2']))
    for i, row in tqdm(test_df.iterrows()):
        uf.unite(str(row['question1']), str(row['question2']))
    features = {
        'train': calculate_features(train_df, uf),
        'test': calculate_features(test_df, uf),
    }
    joblib.dump(uf, 'tmp-union-find.pkl')
    for k, file_name in generate_filename_from_prefix(options.data_prefix):
        create_feature(data_file=file_name, features=features[k])
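# Hedged sketch: the UnionFind class instantiated above is defined elsewhere in
# this repo. A minimal disjoint-set over hashable keys (question strings) with
# path compression would look roughly like this; only the method name `unite`
# is taken from the call sites above.
class UnionFind:
    def __init__(self):
        self.parent = {}

    def find(self, x):
        # First sight of a key creates a singleton set.
        self.parent.setdefault(x, x)
        if self.parent[x] != x:
            self.parent[x] = self.find(self.parent[x])  # path compression
        return self.parent[x]

    def unite(self, x, y):
        root_x, root_y = self.find(x), self.find(y)
        if root_x != root_y:
            self.parent[root_x] = root_y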
def generate_data_files(config, options):
    for feature_id in config['features']:
        feature_creator_file = get_feature_creator_file(feature_id)
        if check_feature_existence(feature_creator_file, config['data_prefix']):
            print('Feature file for {} exists.'.format(feature_creator_file),
                  file=sys.stderr)
            continue
        commands = ["python3", feature_creator_file,
                    "--data_prefix", config['data_prefix']]
        if options.train_only:
            commands.append("--train_only")
        subprocess.call(commands)
    data_files = dict(generate_filename_from_prefix(config['data_prefix']))
    return data_files
def main():
    m = gensim.models.Doc2Vec.load('../data/input/enwiki_dbow/doc2vec.bin')
    options = common_feature_parser().parse_args()
    for k, file_name in generate_filename_from_prefix(options.data_prefix):
        create_word_match_feature(data_file=file_name, model=m)
def main():
    options = common_feature_parser().parse_args()
    for k, file_name in generate_filename_from_prefix(options.data_prefix):
        create_feature(data_file=file_name)
def __init__(self, options):
    self.input_files = dict(generate_filename_from_prefix(options.data_prefix))
    self.train_only = options.train_only
    self.n_threads = options.n_threads
def main():
    options = common_feature_parser().parse_args()
    model = gensim.models.KeyedVectors.load_word2vec_format(
        'data/input/GoogleNews-vectors-negative300.bin', binary=True)
    for k, file_name in generate_filename_from_prefix(options.data_prefix):
        create_word_match_feature(data_file=file_name, model=model)
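# Hedged illustration: create_word_match_feature is defined elsewhere. One
# standard pairwise feature these vectors enable is Word Mover's Distance
# between the token lists of the two questions, via the gensim API:
#
#     distance = model.wmdistance(str(q1).lower().split(),
#                                 str(q2).lower().split())
#
# This shows what the loaded KeyedVectors support, not necessarily what the
# helper actually computes.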
def check_feature_existence(feature_creator_file, data_prefix):
    for k, file_name in generate_filename_from_prefix(data_prefix):
        if not os.path.exists(feature_output_file(file_name, feature_creator_file)):
            return False
    return True