Example #1
def main():
    options = common_feature_parser().parse_args()
    df_train = nltk_tokenize(dict(generate_filename_from_prefix(options.data_prefix))['train'])
    df_test = nltk_tokenize(dict(generate_filename_from_prefix(options.data_prefix))['test'])

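    # Count how often each difference pair occurs, looking at duplicate question
    # pairs only (list_diff_pairs, defined elsewhere in the repository, presumably
    # returns the token pairs by which q1 and q2 differ).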
    dup_counter = Counter()
    for q1, q2, dup in tqdm(zip(df_train.question1.astype(str), df_train.question2.astype(str),
                                df_train.is_duplicate)):
        if dup:
            dup_counter.update(list_diff_pairs(q1, q2))

    train_diff_pairs_ = [list_diff_pairs(q1, q2)
                        for q1, q2 in tqdm(zip(df_train.question1.astype(str), df_train.question2.astype(str)))]
    train_diff_pairs = [pair for pair in list(chain.from_iterable(train_diff_pairs_)) if dup_counter[pair] >= MIN_FREQ]

    test_diff_pairs_ = [list_diff_pairs(q1, q2)
                       for q1, q2 in tqdm(zip(df_test.question1.astype(str), df_test.question2.astype(str)))]
    test_diff_pairs = [pair for pair in list(chain.from_iterable(test_diff_pairs_)) if dup_counter[pair] >= MIN_FREQ]

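    # Vectorize the surviving difference pairs with an identity tokenizer (the
    # inputs are already tokenized) and compress the counts into 10 NMF components.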
    pipeline = make_pipeline(
        CountVectorizer(max_df=0.5, min_df=MIN_FREQ, dtype=np.int32, tokenizer=lambda a: a, lowercase=False),
        NMF(n_components=10, random_state=1, l1_ratio=.15, verbose=True)
    )
    pipeline.fit(train_diff_pairs + test_diff_pairs)

    for k, file_name in generate_filename_from_prefix(options.data_prefix):
        create_feature(data_file=file_name, vectorizer=pipeline)
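list_diff_pairs itself is defined elsewhere in the repository. A minimal sketch of what such a helper could look like, assuming it pairs up the tokens that occur in only one of the two questions (the actual implementation may differ):

def list_diff_pairs(q1, q2):
    # Hypothetical sketch: split on whitespace and keep the tokens unique to each side.
    tokens1, tokens2 = q1.split(), q2.split()
    only_in_q1 = [w for w in tokens1 if w not in tokens2]
    only_in_q2 = [w for w in tokens2 if w not in tokens1]
    # Every cross-combination of the differing tokens becomes one candidate pair.
    return [(w1, w2) for w1 in only_in_q1 for w2 in only_in_q2]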
Example #2
def main():
    options = common_feature_parser().parse_args()
    df_train = nltk_tokenize(
        dict(generate_filename_from_prefix(options.data_prefix))['train'])
    df_test = nltk_tokenize(
        dict(generate_filename_from_prefix(options.data_prefix))['test'])

    train_diff_pairs = [
        list_diff_pairs(q1, q2) for q1, q2 in tqdm(
            zip(df_train.question1.astype(str), df_train.question2.astype(
                str)))
    ]
    test_diff_pairs = [
        list_diff_pairs(q1, q2) for q1, q2 in tqdm(
            zip(df_test.question1.astype(str), df_test.question2.astype(str)))
    ]

    pipeline = make_pipeline(
        CountVectorizer(max_df=0.5,
                        min_df=100,
                        dtype=np.int32,
                        tokenizer=lambda a: a,
                        lowercase=False),
        NMF(n_components=10, random_state=1, l1_ratio=.15, verbose=True))
    pipeline.fit(train_diff_pairs + test_diff_pairs)

    for k, file_name in generate_filename_from_prefix(options.data_prefix):
        create_feature(data_file=file_name, vectorizer=pipeline)
Example #3
def main():
    options = common_feature_parser().parse_args()
    train_file = dict(generate_filename_from_prefix(
        options.data_prefix))['train']
    test_file = dict(generate_filename_from_prefix(
        options.data_prefix))['test']

    train_df = pd.read_csv(train_file)
    test_df = pd.read_csv(test_file)
    test_df['is_duplicate'] = -1
    test_df.rename(columns={'test_id': 'id'}, inplace=True)
    df = train_df.append(test_df)
    df.reset_index(inplace=True)
    from collections import Counter
    see_later1 = []
    see_later2 = []
    sentence_counter1 = Counter()
    sentence_counter2 = Counter()

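    # Walk the combined frame from the last row to the first, recording for each
    # question how many times the same text has already been encountered on that
    # side; after reversing, see_later1/see_later2 count how often the question
    # reappears in later rows.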
    for i in tqdm(range(df.shape[0])):
        row = df.iloc[-i - 1]
        q1 = str(row['question1'])
        q2 = str(row['question2'])
        see_later1.append(sentence_counter1[q1])
        see_later2.append(sentence_counter2[q2])
        sentence_counter1[q1] += 1
        sentence_counter2[q2] += 1
    df['see_later1'] = list(reversed(see_later1))
    df['see_later2'] = list(reversed(see_later2))
    create_feature(data_file=train_file, df=df[df.is_duplicate >= 0])
    create_feature(data_file=test_file, df=df[df.is_duplicate < 0])
Example #4
def main():
    options = common_feature_parser().parse_args()
    df_train = nltk_stemming_without_stopwords(
        dict(generate_filename_from_prefix(options.data_prefix))['train'])
    df_test = get_test_df(
        dict(generate_filename_from_prefix(options.data_prefix))['test'])

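    # TfKLdVectorizer is a project-specific vectorizer; judging by its arguments it
    # appears to weight n-gram statistics by a divergence ('js' presumably meaning
    # Jensen-Shannon) between classes, which is why fit() below also takes the labels.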
    vectorizer = TfKLdVectorizer(alpha=1,
                                 divergence='js',
                                 ngram_range=(1, 2),
                                 max_df=0.4,
                                 min_df=5)
    train_q1s = pd.Series(df_train['question1'].tolist() +
                          df_test['question1'].tolist()).astype(str)
    train_q2s = pd.Series(df_train['question2'].tolist() +
                          df_test['question2'].tolist()).astype(str)
    train_ys = pd.Series(df_train['is_duplicate'].tolist() +
                         df_test['is_duplicate'].tolist()).astype(int)
    vectorizer.fit(train_q1s, train_q2s, train_ys)

    train_qs = pd.Series(train_q1s.tolist() + train_q2s.tolist()).astype(str)
    value_qs = vectorizer.transform(train_qs)
    print(value_qs)

    pipeline = make_pipeline(TruncatedSVD(n_components=10))
    pipeline.fit(value_qs)

    for k, file_name in generate_filename_from_prefix(options.data_prefix):
        create_feature(data_file=file_name,
                       vectorizer=vectorizer,
                       pipeline=pipeline)
Example #5
def main():
    options = common_feature_parser().parse_args()
    # from https://www.kaggle.com/jturkewitz/magic-features-0-03-gain/notebook
    train_orig = pd.read_csv(dict(generate_filename_from_prefix(options.data_prefix))['train'])
    test_orig = pd.read_csv(dict(generate_filename_from_prefix(options.data_prefix))['test'])

    df1 = train_orig[['question1']].copy()
    df2 = train_orig[['question2']].copy()
    df1_test = test_orig[['question1']].copy()
    df2_test = test_orig[['question2']].copy()

    df2.rename(columns={'question2': 'question1'}, inplace=True)
    df2_test.rename(columns={'question2': 'question1'}, inplace=True)

    train_questions = df1.append(df2)
    train_questions = train_questions.append(df1_test)
    train_questions = train_questions.append(df2_test)
    train_questions.drop_duplicates(subset=['question1'], inplace=True)

    train_questions.reset_index(inplace=True, drop=True)
    questions_dict = pd.Series(train_questions.index.values, index=train_questions.question1.values).to_dict()
    train_cp = train_orig.copy()
    test_cp = test_orig.copy()
    train_cp.drop(['qid1', 'qid2'], axis=1, inplace=True)

    test_cp['is_duplicate'] = -1
    test_cp.rename(columns={'test_id': 'id'}, inplace=True)
    comb = pd.concat([train_cp, test_cp])

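    # Map every question string to its integer id and count how often each id shows
    # up on either side of a pair; create_feature presumably turns these counts into
    # the "magic" frequency features described in the linked kernel.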
    comb['q1_hash'] = comb['question1'].map(questions_dict)
    comb['q2_hash'] = comb['question2'].map(questions_dict)
    q1_vc = comb.q1_hash.value_counts().to_dict()
    q2_vc = comb.q2_hash.value_counts().to_dict()
    for k, file_name in generate_filename_from_prefix(options.data_prefix):
        create_feature(data_file=file_name, questions_dict=questions_dict, q1_vc=q1_vc, q2_vc=q2_vc)
Example #6
def main():
    file_prefix = 'xgb_cross_0.json.xgb_cross_0.json'
    options = common_feature_parser().parse_args()
    neighbor_sets, neighbor_weights = prepare_graph(options, file_prefix)
    for k, file_name in generate_filename_from_prefix(options.data_prefix):
        create_feature(data_file=file_name,
                       neighbor_sets=neighbor_sets,
                       neighbor_weights=neighbor_weights)
Example #7
def main():
    parser = common_feature_parser()
    parser.add_argument('--google_word2vec', default='data/input/GoogleNews-vectors-negative300.bin', type=str)
    options = parser.parse_args()

    model = gensim.models.KeyedVectors.load_word2vec_format(options.google_word2vec, binary=True)
    for k, file_name in generate_filename_from_prefix(options.data_prefix):
        create_feature(data_file=file_name, model=model)
Example #8
def main():
    file_prefix = '0.1925_lstm_leak_203_101_0.26_0.35_36'
    options = common_feature_parser().parse_args()
    input_files = dict(generate_filename_from_prefix(options.data_prefix))
    train_file = os.path.join(os.path.dirname(input_files['train']), '../lstm/', file_prefix + '.stacking.csv')
    test_file = os.path.join(os.path.dirname(input_files['train']), '../lstm/', file_prefix + '.submission.csv')
    neighbor_sets, neighbor_weights = prepare_graph_with_filenames(options, train_file, test_file)
    for k, file_name in generate_filename_from_prefix(options.data_prefix):
        create_feature(data_file=file_name, neighbor_sets=neighbor_sets, neighbor_weights=neighbor_weights)
Example #9
def main():
    options = common_feature_parser().parse_args()

    df_train = nltk_stemming(dict(generate_filename_from_prefix(options.data_prefix))['train'])
    train_qs = pd.Series(df_train['question1'].tolist() + df_train['question2'].tolist()).astype(str)
    vectorizer = TfidfVectorizer(max_df=0.5, min_df=1, norm='l2')
    vectorizer.fit_transform(train_qs.values)

    for k, file_name in generate_filename_from_prefix(options.data_prefix):
        create_feature(data_file=file_name, vectorizer=vectorizer)
Example #10
def main():
    options = common_feature_parser().parse_args()
    print(sys.argv[0], file=sys.stderr)
    df_train = convert(nltk_pos_tag(dict(generate_filename_from_prefix(options.data_prefix))['train']))
    print(df_train.head(), file=sys.stderr)

    train_qs = pd.Series(df_train['question1'].tolist() + df_train['question2'].tolist()).astype(str)
    vectorizer = TfidfVectorizer(max_df=0.5, min_df=1, norm='l2')
    vectorizer.fit_transform(train_qs.values)

    for k, file_name in generate_filename_from_prefix(options.data_prefix):
        create_feature(data_file=file_name, vectorizer=vectorizer)
Example #11
def main():
    options = common_feature_parser().parse_args()
    df_train = nltk_stemming_without_stopwords(
        dict(generate_filename_from_prefix(options.data_prefix))['train'])

    train_qs = pd.Series(df_train['question1'].tolist() +
                         df_train['question2'].tolist()).astype(str)

    pipeline = make_pipeline(CountVectorizer(max_df=0.5, min_df=2),
                             TruncatedSVD(n_components=10))
    pipeline.fit(train_qs.values)

    for k, file_name in generate_filename_from_prefix(options.data_prefix):
        create_feature(data_file=file_name, vectorizer=pipeline)
Example #12
def main():
    options = common_feature_parser().parse_args()
    df_train = nltk_stemming(dict(generate_filename_from_prefix(options.data_prefix))['train'])

    train_qs = pd.Series(df_train['question1'].tolist() + df_train['question2'].tolist()).astype(str)

    pipeline = make_pipeline(
        TfidfVectorizer(max_df=0.5, min_df=2, norm='l2', ngram_range=(1, 2)),
        NMF(n_components=10, random_state=1, l1_ratio=.15)
    )
    pipeline.fit(train_qs.values)

    for k, file_name in generate_filename_from_prefix(options.data_prefix):
        create_feature(data_file=file_name, vectorizer=pipeline)
Example #13
def main():
    options = common_feature_parser().parse_args()
    df_train = nltk_stemming_without_stopwords(
        dict(generate_filename_from_prefix(options.data_prefix))['train'])

    train_qs = pd.Series(df_train['question1'].tolist() +
                         df_train['question2'].tolist()).astype(str)
    vectorizer = CountVectorizer(max_df=0.5,
                                 min_df=2,
                                 ngram_range=(2, 2),
                                 binary=True)
    vectorizer.fit_transform(train_qs.values)

    for k, file_name in generate_filename_from_prefix(options.data_prefix):
        create_feature(data_file=file_name, vectorizer=vectorizer)
Example #14
def main():
    options = common_feature_parser().parse_args()
    df_train = nltk_tokenize(
        dict(generate_filename_from_prefix(options.data_prefix))['train'])

    train_qs = pd.Series(df_train['question1'].tolist() +
                         df_train['question2'].tolist()).astype(str)

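    # TF-IDF followed by a 10-topic LDA; note that newer scikit-learn versions name
    # this parameter n_components instead of n_topics.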
    pipeline = make_pipeline(
        TfidfVectorizer(max_df=0.5, min_df=2, norm='l2'),
        LatentDirichletAllocation(n_topics=10, random_state=1))
    pipeline.fit(train_qs.values)

    for k, file_name in generate_filename_from_prefix(options.data_prefix):
        create_feature(data_file=file_name, vectorizer=pipeline)
Example #15
def main():
    options = common_feature_parser().parse_args()
    # from https://www.kaggle.com/jturkewitz/magic-features-0-03-gain/notebook
    train_df = pd.read_csv(
        dict(generate_filename_from_prefix(options.data_prefix))['train'])
    test_df = pd.read_csv(
        dict(generate_filename_from_prefix(options.data_prefix))['test'])
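    # The same Counter instance is shared between the train and test passes, so
    # whatever statistics calculate_features accumulates in it cover both sets.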
    counter = Counter()
    features = {
        'train': calculate_features(train_df, counter),
        'test': calculate_features(test_df, counter)
    }

    for k, file_name in generate_filename_from_prefix(options.data_prefix):
        create_feature(data_file=file_name, features=features[k])
Example #16
def main():
    options = common_feature_parser().parse_args()
    df_train = nltk_stemming(dict(generate_filename_from_prefix(options.data_prefix))['train'])
    df_test = nltk_stemming(dict(generate_filename_from_prefix(options.data_prefix))['test'])
    train_qs = pd.Series(df_train['question1'].tolist() +
                         df_train['question2'].tolist() +
                         df_test['question1'].tolist() +
                         df_test['question2'].tolist()).astype(str)

    pipeline = make_pipeline(
        TfidfVectorizer(max_df=0.5, min_df=2, norm='l2', ngram_range=(1, 2), dtype=np.float32),
        TruncatedSVD(n_components=50, n_iter=10)
    )
    pipeline.fit(train_qs.values)

    for k, file_name in generate_filename_from_prefix(options.data_prefix):
        create_feature(data_file=file_name, vectorizer=pipeline)
Example #17
def main():
    options = common_feature_parser().parse_args()
    df_train = nltk_tokenize(
        dict(generate_filename_from_prefix(options.data_prefix))['train'])
    df_test = nltk_tokenize(
        dict(generate_filename_from_prefix(options.data_prefix))['test'])
    train_qs = pd.Series(df_train['question1'].tolist() +
                         df_train['question2'].tolist() +
                         df_test['question1'].tolist() +
                         df_test['question2'].tolist()).astype(str)

    pipeline = make_pipeline(
        CountVectorizer(max_df=0.5, min_df=2, max_features=200),
        TfidfTransformer(norm='l2'), TruncatedSVD(n_components=10))
    pipeline.fit(train_qs.values)

    for k, file_name in generate_filename_from_prefix(options.data_prefix):
        create_feature(data_file=file_name, vectorizer=pipeline)
Example #18
def main():
    options = common_feature_parser().parse_args()
    # from https://www.kaggle.com/jturkewitz/magic-features-0-03-gain/notebook
    train_df = pd.read_csv(
        dict(generate_filename_from_prefix(options.data_prefix))['train'])
    test_df = pd.read_csv(
        dict(generate_filename_from_prefix(options.data_prefix))['test'])
    uf = UnionFind()
    for i, row in tqdm(train_df.iterrows()):
        uf.unite(str(row['question1']), str(row['question2']))
    for i, row in tqdm(test_df.iterrows()):
        uf.unite(str(row['question1']), str(row['question2']))

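    # After uniting every question pair, questions that are linked directly or
    # transitively end up in the same union-find component; calculate_features
    # presumably derives its features from these components.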
    features = {
        'train': calculate_features(train_df, uf),
        'test': calculate_features(test_df, uf)
    }
    joblib.dump(uf, 'tmp-union-find.pkl')

    for k, file_name in generate_filename_from_prefix(options.data_prefix):
        create_feature(data_file=file_name, features=features[k])
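UnionFind is another helper defined elsewhere in the repository. A minimal sketch of a string-keyed union-find exposing the unite interface used above, assuming nothing beyond parent bookkeeping (the real implementation may also track component sizes):

class UnionFind:
    def __init__(self):
        # Each known key maps to its parent; roots map to themselves.
        self.parent = {}

    def find(self, x):
        # Path-compressing find; unseen keys become their own singleton root.
        self.parent.setdefault(x, x)
        if self.parent[x] != x:
            self.parent[x] = self.find(self.parent[x])
        return self.parent[x]

    def unite(self, x, y):
        # Merge the components that contain x and y.
        self.parent[self.find(x)] = self.find(y)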
Example #19
def main():
    parser = common_feature_parser()
    options = parser.parse_args()
    feature_creator = WordMatchCount(options)
    feature_creator.create()
Example #20
def main():
    options = common_feature_parser().parse_args()
    for k, file_name in generate_filename_from_prefix(options.data_prefix):
        create_feature(data_file=file_name)
Example #21
def main():
    options = common_feature_parser().parse_args()
    model = gensim.models.KeyedVectors.load_word2vec_format(
        'data/input/GoogleNews-vectors-negative300.bin', binary=True)
    for k, file_name in generate_filename_from_prefix(options.data_prefix):
        create_word_match_feature(data_file=file_name, model=model)
Example #22
def main():
    parser = common_feature_parser()
    options = parser.parse_args()
    feature_creator = NGramTfidfRelativeDifference(options, ngram_range=(3, 3))
    feature_creator.create()
Example #23
def main():
    options = common_feature_parser().parse_args()
    train_path = os.path.join(options.data_prefix, 'train.csv')
    test_path = os.path.join(options.data_prefix, 'test.csv')

    create_features_files(train_path, test_path)
Example #24
def main():
    parser = common_feature_parser()
    options = parser.parse_args()
    feature_creator = CharNGramSimilarityFeatureCreator(options, ngram_range=(5, 5))
    feature_creator.create()
Example #25
def main():
    options = common_feature_parser().parse_args()
    train_path = os.path.join(options.data_prefix, 'train.csv')
    test_path = os.path.join(options.data_prefix, 'test.csv')
    create_feature(train_path, test_path, n=2, skip_stopwords=False)
Example #26
def main():
    options = common_feature_parser().parse_args()
    create_feature(os.path.join(options.data_prefix, 'train.csv'))
    create_feature(os.path.join(options.data_prefix, 'test.csv'))
Example #27
def main():
    parser = common_feature_parser()
    options = parser.parse_args()
    dump_graph(options)
Example #28
def main():
    parser = common_feature_parser()
    options = parser.parse_args()
    feature_creator = FeatureCreator(options)
    feature_creator.create()
Example #29
def main():
    m = gensim.models.Doc2Vec.load('../data/input/enwiki_dbow/doc2vec.bin')
    options = common_feature_parser().parse_args()
    for k, file_name in generate_filename_from_prefix(options.data_prefix):
        create_word_match_feature(data_file=file_name, model=m)