Example #1
0
def run_tfidf(obj_field, target_field, generator, n_gram):
    """Build an n-gram TFIDF feature from two columns of the module-level
    ``df`` and pickle it under features/.

    ``generator`` is a feature-extractor class taking the two corpora and
    an ``ngram`` keyword; its ``transform()`` result is what gets saved.
    """
    source_corpus = df[obj_field].values
    target_corpus = df[target_field].values
    extractor = generator(source_corpus, target_corpus, ngram=n_gram)
    feature = extractor.transform()
    print(feature.shape)
    out_path = "features/feature_%d_gram_%s_%s_%s.pkl" % (
        n_gram, extractor.__name__(), obj_field, target_field)
    to_pkl(feature, out_path)
Example #2
0
def run_char_dist_sim(obj_field, target_field, generator):
    """Compute a character-distribution similarity feature between two
    columns of the module-level ``df`` and pickle it under features/.
    """
    source_corpus = df[obj_field].values
    target_corpus = df[target_field].values
    extractor = generator(source_corpus, target_corpus)
    feature = extractor.transform()
    print(feature.shape)
    out_path = "features/feature_%s_%s_%s.pkl" % (extractor.__name__(),
                                                  obj_field, target_field)
    to_pkl(feature, out_path)
Example #3
0
def run_lsa_ngram(df, field):
    """Pickle LSA features over word 1-, 2- and 3-grams of ``df[field]``.

    One feature file is written per n-gram size, using the SVD settings
    from ``config``.
    """
    corpus = df[field].values
    for gram in (1, 2, 3):
        extractor = LSA_Word_Ngram(corpus, None, gram, config.SVD_DIM,
                                   config.SVD_N_ITER)
        feature = extractor.transform()
        out_path = "features/feature_lsa_word_%d_gram_%s.pkl" % (gram,
                                                                 field)
        to_pkl(feature, out_path)
Example #4
0
def run_tfidf_char_ngram_cosinesim(obj_field, target_field):
    """Pickle char-level TFIDF cosine-similarity features between two
    columns of the module-level ``df`` for n-gram sizes 1, 2 and 3.
    """
    source_corpus = df[obj_field].values
    target_corpus = df[target_field].values
    for gram in (1, 2, 3):
        extractor = TFIDF_Char_Ngram_CosineSim(source_corpus,
                                               target_corpus, gram)
        feature = extractor.transform()
        print(feature.shape)
        out_path = "features/feature_tfidf_cosinesim_char_%d_gram_%s_%s.pkl" % (
            gram, obj_field, target_field)
        to_pkl(feature, out_path)
Example #5
0
def feature_combine(feature_dir, save_path="features/train/X_10.pkl"):
    """Concatenate every pickled feature in ``feature_dir`` into one matrix.

    Only files whose name starts with "feature" are loaded; 1-D arrays are
    promoted to column vectors before horizontal concatenation.

    Args:
        feature_dir: directory containing the pickled feature arrays.
        save_path: where the combined matrix is pickled. Defaults to the
            previously hard-coded location for backward compatibility.
    """
    features = []
    for file_name in os.listdir(feature_dir):
        if not file_name.startswith("feature"):
            continue
        feature = load_pkl(os.path.join(feature_dir, file_name))
        if feature.ndim == 1:
            # Promote (n,) to (n, 1) so it can be stacked column-wise.
            feature = feature.reshape(-1, 1)
        features.append(feature)
    print("features", len(features))
    X = np.concatenate(features, axis=1)
    print("X shape is:", X.shape)
    to_pkl(X, save_path)
Example #6
0
def run_lsa_ngram_cooc(obj_field, target_field, generator):
    """Pickle LSA co-occurrence features for every (obs_ngram, target_ngram)
    pair in {1, 2} x {1, 2}, built from two columns of the module-level
    ``df``.

    BUG FIX: the original save path did not include the n-gram sizes, so
    all four (obs_ngram, target_ngram) combinations were written to the
    same file and each loop iteration overwrote the previous result. The
    path now embeds both sizes, matching the naming convention of the
    other run_* helpers (e.g. run_tfidf).
    """
    obj_corpus = df[obj_field].values
    tgt_corpus = df[target_field].values
    for obs_ngram in [1, 2]:
        for target_ngram in [1, 2]:
            ext = generator(obj_corpus,
                            tgt_corpus,
                            obs_ngram=obs_ngram,
                            target_ngram=target_ngram)
            x = ext.transform()
            print(x.shape)
            save_path = "features/feature_%s_%d_%d_gram_%s_%s.pkl" % (
                ext.__name__(), obs_ngram, target_ngram, obj_field,
                target_field)
            to_pkl(x, save_path)
def train(train_data, y, num_each_group):
    """Fit an XGBRanker on grouped training data and pickle the model.

    Rows are assumed to arrive in contiguous groups of ``num_each_group``;
    a group-id column is prepended to the feature matrix so the ranker can
    see group boundaries.

    Args:
        train_data: 2-D feature matrix, one row per ranking candidate.
        y: relevance labels, one per row.
        num_each_group: number of consecutive rows forming one group.

    Returns:
        The fitted XGBRanker (also pickled to ``config.model_save_path``).

    Raises:
        ValueError: if the row count is not a multiple of num_each_group.
    """
    print("Start Training...")
    case_num = train_data.shape[0]
    # BUG FIX: the original asserted CASE_NUM % GROUPS_NUM == 0, which can
    # pass even when rows do not split evenly into groups (e.g. 10 rows
    # with groups of 4 gives GROUPS_NUM=2 and 10 % 2 == 0, then the group
    # column is 8 rows long and concatenate fails). Check divisibility by
    # the group size itself, and raise instead of assert so the validation
    # survives `python -O`.
    if case_num % num_each_group != 0:
        raise ValueError(
            "train_data rows (%d) not divisible by num_each_group (%d)"
            % (case_num, num_each_group))
    groups_num = case_num // num_each_group

    # First column = group id, repeated once per group member.
    X_groups = np.arange(0, groups_num).repeat(num_each_group)
    X = np.concatenate([X_groups[:, None], train_data], axis=1)

    ranker = XGBRanker(n_estimators=150,
                       learning_rate=0.1,
                       subsample=1.0,
                       max_depth=6)

    ranker.fit(X, y, eval_metric=['ndcg', 'map@5-'])

    to_pkl(ranker, config.model_save_path)
    return ranker
def predict(test_data, ranker, num_each_group, predict_save):
    """Score grouped test rows with ``ranker`` and write chosen predictions.

    For every group of ``num_each_group`` consecutive rows the candidate
    with the highest score is selected. The matching entry from the
    "preds" list of each line in ``config.test_sample_file`` is written,
    one per line, to ``predict_save``; the raw winning indices are also
    pickled to ``config.predict_index_save``.

    Raises:
        ValueError: if the row count is not a multiple of num_each_group.
    """
    print("Start predicting...")
    case_num = test_data.shape[0]
    # BUG FIX: validate divisibility by the group size (the original
    # checked CASE_NUM % GROUPS_NUM, which can pass on uneven splits), and
    # raise rather than assert so the check survives `python -O`.
    if case_num % num_each_group != 0:
        raise ValueError(
            "test_data rows (%d) not divisible by num_each_group (%d)"
            % (case_num, num_each_group))
    groups_num = case_num // num_each_group

    X_groups = np.arange(0, groups_num).repeat(num_each_group)
    X = np.concatenate([X_groups[:, None], test_data], axis=1)
    y_pred = ranker.predict(X)
    # BUG FIX: the original reshaped with a hard-coded group size of 3,
    # ignoring num_each_group; use the parameter so any group size works.
    y_pred = y_pred.reshape(-1, num_each_group)
    res = y_pred.argmax(axis=1).tolist()
    to_pkl(res, config.predict_index_save)
    # `with` replaces the manual codecs.open()/close() pair so the output
    # file is closed even if an exception is raised mid-write.
    with codecs.open(predict_save, 'w') as fw, \
            open(config.test_sample_file, 'r') as fr:
        for ix, line in zip(res, fr):
            data = json.loads(line.strip("\n"))
            preds = data["preds"]
            fw.write(preds[ix] + "\n")
Example #9
0
def dumps_y(df):
    """Pickle the "score" column of ``df`` as the training target vector."""
    labels = df["score"].values
    to_pkl(labels, "features/train/y_10.pkl")
Example #10
0
def dump_df_feature(df, fields):
    """Pickle each requested ``df`` column as its own raw feature file."""
    for name in fields:
        column_values = df[name].values
        out_path = "features/feature_%s.pkl" % (name)
        to_pkl(column_values, out_path)