Example #1
def main():
    predict_result_file = from_project_root(
        "lstm_model/result/result_predict-1700.csv")
    distribution_file = from_project_root(
        "lstm_model/result/erro_distribution-1700.csv")
    get_the_error_label_distribution(predict_result_file, distribution_file)
    pass
Example #2
def main(result_dir):

    all_predict_files = read_all_filenames(from_project_root(result_dir))

    all_predict_results = []
    for predict_file in all_predict_files:
        all_predict_results.append(pk.load(open(predict_file,'rb')))

    predict_all = []
    predict_all_pro = []
    for i in range(len(all_predict_results[0])):

        predict_one_merge = np.array([0.0] * 19)

        for j in range(len(all_predict_results)):

            predict_one_merge = predict_one_merge + np.array(all_predict_results[j][i])

        # average the merged probabilities over all models
        predict_one_merge = predict_one_merge / len(all_predict_results)
        max_index = np.where(predict_one_merge == np.max(predict_one_merge))[0][0]
        predict_all.append(max_index + 1)
        predict_all_pro.append(predict_one_merge)

    predict_context, predict_labels = Data_helper.get_predict_data(from_project_root(
        "lstm_model/processed_data/one_gram/filter-1gram_phrase_level_data_200_dev.csv"))

    macro_f1 = f1_score(predict_labels, predict_all, average='macro')
    accuracy_score1 = accuracy_score(predict_labels, predict_all, normalize=True)

    print("macro_f1:{}".format(macro_f1))
    print("accuracy:{}".format(accuracy_score1))

    # save
    pk.dump(predict_all_pro, open(result_dir + "/predict_merge_dev.pk", 'wb'))
Example #3
def main():

    # exit()
    # calculate df
    # train_file = from_project_root("lstm_model/processed_data/phrase_level_data.csv")
    # df_pickle = from_project_root("lstm_model/processed_data/one_gram/phrase_level_df.pk")
    # cal_df(train_file,df_pickle)
    # exit()

    # reduce the dimensionality of the bag-of-words tf_bdc weights
    # tfbdc_word_bag_pickle = from_project_root("lstm_model/processed_data/vector/tfbdc_1gram_300000_Xy.pk")
    # pca_tfbdc_pickle = from_project_root("lstm_model/processed_data/vector/pca_tfbdc_1gram_300000_Xy.csv")
    # pca(tfbdc_word_bag_pickle,pca_tfbdc_pickle)
    # exit()

    # build the vocabulary dict from train_file
    # train_file = from_project_root("lstm_model/processed_data/one_gram/filter-1gram_phrase_level_data_200.csv")
    # vocab_pickle = from_project_root("lstm_model/processed_data/one_gram/filter-1gram_phrase_level_data_200_vocab.pk")
    # create_vocab_dict(train_file,vocab_pickle)
    # exit()

    # convert the pickle file to a csv file
    pickle_file = from_project_root(
        "lstm_model/processed_data/one_gram/filter-1gram_phrase_level_data_200_vocab.pk"
    )
    save_csv_file = from_project_root(
        "lstm_model/processed_data/one_gram/filter-1gram_phrase_level_data_200_vocab.csv"
    )
    transfer_pk_to_csv(pickle_file, save_csv_file)
    exit()
Example #4
def main():

    # convert one-gram data into n-gram data
    # n_gram = 2
    # phrase_train_file = from_project_root("lstm_model/processed_data/phrase_level_data.csv")
    # n_gram_phrase_train_file = from_project_root("lstm_model/processed_data/two_gram/{}-gram_phrase_level_data.csv".format(n_gram))
    # create_n_gram_sentence(n_gram,phrase_train_file,n_gram_phrase_train_file)
    # exit()

    # filter each sentence
    bdc_pickle = from_project_root(
        "lstm_model/processed_data/one_gram/phrase_level_1gram_bdc.json")
    tf_pickle = from_project_root(
        "lstm_model/processed_data/one_gram/phrase_level_1gram_tf.json")
    dc_pickle = from_project_root(
        "lstm_model/processed_data/one_gram/phrase_level_1gram_dc.json")

    train_file = from_project_root(
        "lstm_model/processed_data/phrase_level_data.csv")
    processed_data_file = from_project_root(
        "lstm_model/processed_data/one_gram/filter-1gram_phrase_level_data_200.csv"
    )
    pre_processed_sen(bdc_pickle,
                      tf_pickle,
                      dc_pickle,
                      train_file,
                      processed_data_file,
                      limit_word=200)
    pass
Example #5
def main():
    model_url = from_project_root(
        "data/model/exhaustive_model_epoch16_0.723039.pt")
    test_url = from_project_root("data/genia.test.iob2")
    model = torch.load(model_url)
    evaluate(model, test_url)
    pass
Example #6
def main(name):
    # load data from pickle
    pk_url = from_project_root("processed_data/vector/stacked_dc_idf_" + name +
                               "_36.pk")
    print("loading data from", pk_url)
    X, y, X_test = joblib.load(pk_url)

    train_url = from_project_root("data/multilabel_" + name + ".csv")
    test_url = from_project_root("data/test_processed.csv")

    print(X.shape, y.shape, X_test.shape)
    clf = XGBClassifier(n_jobs=-1)  # xgb's default n_jobs=1

    result = get_result_from_stacking(clf, X, y, X_test)
    test_public = pd.read_csv(test_url)['id']
    output_str = 'content_id,subject,sentiment_value,sentiment_word\n'
    for jjj in range(len(result)):
        output_str += "%s,0,%s,\n" % (test_public[jjj], result[jjj])
    outfile = open('result_36' + name + '.csv', 'w')
    outfile.write(output_str)
    outfile.close()

    save_url = from_project_root(
        "processed_data/vector/{}_dc_idf_xgb.pk".format(X.shape[1] //
                                                        N_CLASSES))
    joblib.dump(
        gen_data_for_stacking(clf,
                              X,
                              y,
                              X_test,
                              n_splits=5,
                              random_state=19950717,
                              name=name), save_url)

    pass
Example #7
def gen_feature_stacking_result(gen_type='val'):
    """ generate feature stacking result data

    Args:
        gen_type: val or test

    Returns:
        X, y, X_test

    """
    params = load_params()
    print("len(params) =", len(params))
    save_url = from_project_root("data/vector/stacked_%s_XyX_%s_%d_%sc.pk"
                                 % (('one' if ONLY_SINGLE else 'all'), gen_type, len(load_params()), LABEL_COL))
    print("stacking data will be saved at", save_url)
    if gen_type == 'val':
        train_url = from_project_root("data/preliminary/train_ex.csv")
        test_url = from_project_root("data/preliminary/test_gold_ex.csv")
        # train_url = from_project_root("data/preliminary/train_exs.csv")
        # test_url = from_project_root("data/preliminary/best_subject_exs.csv")
    elif gen_type == 'test':
        train_url = from_project_root("data/train_2_ex.csv")
        test_url = from_project_root("data/test_public_2v3_ex.csv")
    else:
        print("error, gen_type should be 'val' or 'test'")
        return

    joblib.dump(feature_stacking(train_url, test_url, use_proba=True, random_state=RANDOM_STATE,
                                 drop_words=DROP_WORDS, only_single=ONLY_SINGLE), save_url)
Example #8
def main():
    proba_dict = {
        from_project_root('processed_data/result/result1.csv'): 0.1,
        from_project_root('processed_data/result/result2.csv'): 0.9,
    }
    save_url = from_project_root('processed_data/result.csv')
    merge_probas(proba_dict, save_url)
    pass
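merge_probas itself is not shown in this example. Below is a minimal sketch of what such a weighted merge could look like, assuming each result file contains only per-class probability columns with one row per sample (the column layout and the _sketch name are assumptions, not the project's code):

# Hypothetical sketch of a weighted probability merge; merge_probas is not shown above.
import pandas as pd

def merge_probas_sketch(proba_dict, save_url):
    merged = None
    for url, weight in proba_dict.items():
        df = pd.read_csv(url)  # assumed: every column is a per-class probability
        merged = df * weight if merged is None else merged + df * weight
    merged.to_csv(save_url, index=False)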
Example #9
def main():
    data_urls = [
        from_project_root("data/genia.train.iob2"),
        from_project_root("data/genia.dev.iob2"),
        from_project_root("data/genia.test.iob2")
    ]
    prepare_vocab(data_urls, update=True, min_count=1)
    pass
Example #10
def calc_bdc(data_url=DATA_URL, update=False, ngram=1):
    """ calc the bdc value of all tokens

    Args:
        data_url: url to data file
        update: update the dict even if it exists
        ngram: max_n for ngram

    Returns:
        dict: bdc dict {word: bdc_value}

    """
    level = 'phrase' if 'phrase' in data_url else 'word'
    bdc_url = from_project_root(
        "processed_data/saved_weight/{}_level_{}gram_bdc.json".format(
            level, ngram))
    dc_url = from_project_root(
        "processed_data/saved_weight/{}_level_{}gram_dc.json".format(
            level, ngram))
    if not update and exists(bdc_url):
        return ju.load(bdc_url)

    labels, sentences = load_raw_data(data_url, ngram=ngram)
    word_label_dict = collections.defaultdict(dict)  # store f(t, c_i)
    label_words_num = collections.defaultdict(int)  # to store all f(c_i)
    for label, sentence in tqdm(zip(labels, sentences), total=len(labels)):
        label_words_num[label] += len(sentence)
        for word in sentence:
            try:
                word_label_dict[word][label] += 1
            except KeyError:
                word_label_dict[word][label] = 1

    bdc_dict = collections.defaultdict(float)
    dc_dict = collections.defaultdict(float)
    for word in tqdm(word_label_dict):

        # for calc dc
        arr = np.array(list(
            word_label_dict[word].values()))  # f(t, c_i) for all labels
        arr = arr / arr.sum()  # f(t, c_i) / f(t)
        arr = np.log(arr) * arr
        dc_dict[word] = 1 + arr.sum() / np.log(len(label_words_num))  # norm

        # for calc bdc
        for label in word_label_dict[word]:
            word_label_dict[word][label] /= label_words_num[
                label]  # p(t, c_i) = f(t, c_i) / f(c_i)
        arr = np.array(list(
            word_label_dict[word].values()))  # p(t, c_i) for all labels
        arr = arr / arr.sum()  # p(t, c_i) / sum(p(t, c_i))
        arr = np.log(arr) * arr
        bdc_dict[word] = 1 + arr.sum() / np.log(len(label_words_num))  # norm

    # sort and save the calculated results
    ju.dump(ju.sort_dict_by_value(bdc_dict), bdc_url)
    ju.dump(ju.sort_dict_by_value(dc_dict), dc_url)
    return bdc_dict
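As a quick illustration of the dc/bdc formulas implemented above, here is a self-contained toy computation for a single token (the counts are made up for illustration only):

# Toy dc/bdc computation for one token; counts are illustrative only.
import numpy as np

word_label_counts = {"c1": 4, "c2": 1}    # f(t, c_i): occurrences of the token per class
label_words_num = {"c1": 100, "c2": 100}  # f(c_i): total number of tokens per class

# dc: 1 + entropy of f(t, c_i) / f(t), normalized by log(#classes)
arr = np.array(list(word_label_counts.values()), dtype=float)
arr = arr / arr.sum()
dc = 1 + (np.log(arr) * arr).sum() / np.log(len(label_words_num))

# bdc: same, but on p(t, c_i) = f(t, c_i) / f(c_i), renormalized to sum to 1
p = np.array([word_label_counts[c] / label_words_num[c] for c in label_words_num])
p = p / p.sum()
bdc = 1 + (np.log(p) * p).sum() / np.log(len(label_words_num))

print("dc = %.3f, bdc = %.3f" % (dc, bdc))  # equal here because the class sizes are equal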
Example #11
def main():
    # load_data(data_url, update=False)
    data_urls = [
        from_project_root("data/Germ/germ.train.iob2"),
        from_project_root("data/Germ/germ.dev.iob2"),
        from_project_root("data/Germ/germ.test.iob2")
    ]
    prepare_vocab(data_urls, update=True, min_count=1)
    pass
Example #12
def main():
    model_url = from_project_root("data/model/best_model.pt")
    print("loading model from", model_url)
    model = torch.load(model_url)
    # model = torch.load(model_url, map_location='cpu')
    test_url = from_project_root("data/genia/genia.test.iob2")
    evaluate(model, test_url)
    # predict_on_iob2(model, test_url)
    pass
Example #13
def split_data(param_data_df):
    """
    Split the data into train and validation sets.
    :param param_data_df:
    :return:
    """
    train_df, validation_df = train_test_split(param_data_df, test_size=0.2)
    train_filename = from_project_root("data/small_train.csv")
    test_filename = from_project_root("data/small_test.csv")
    write_data_df(train_filename, train_df)
    write_data_df(test_filename, validation_df)
Example #14
def rcnn_rcnn_attention():

    rcnn_model = from_project_root(
        "lstm_model/result/prob_rcnnon_cv0.789744.csv")
    rnn_cnn_attention = from_project_root(
        "lstm_model/result/result_rcnn_0.775.pk")

    rcnn_pro = []
    with open(rcnn_model, 'r', encoding='utf-8') as f:
        for line in f.readlines()[1:]:
            pro_list = np.array(line.strip().split(',')[:-1]).astype(
                np.float32)
            rcnn_pro.append(pro_list)

    rcnn_attention_pro = pk.load(open(rnn_cnn_attention, 'rb'))

    predict_merge = []
    predict_pro_merge = []
    for i in range(len(rcnn_attention_pro)):  # number of samples to predict

        one_predict_merge = (np.array(rcnn_pro[i]) / 5 +
                             np.array(rcnn_attention_pro[i])) / 2

        predict_pro_merge.append(one_predict_merge)

        max_index = np.where(
            one_predict_merge == np.max(one_predict_merge))[0][0]
        print(max_index)
        predict_merge.append(max_index + 1)  # the final predicted class label

    predict_context, ids = Data_helper.get_predict_data(
        from_project_root(
            "lstm_model/processed_data/phrase_level_test_data.csv"))

    # save the merged probabilities
    pk.dump(
        predict_pro_merge,
        open(
            from_project_root(
                "lstm_model/result/pro_rcnn_rnn_cnn_attention_0.794.pk"),
            'wb'))

    # save the prediction results
    with open(from_project_root(
            "lstm_model/result/result_rcnn_rnn_cnn_attention.csv"),
              'w',
              encoding='utf-8') as f:
        f.write("id,class\n")
        for i in range(len(ids)):
            f.write("{},{}\n".format(ids[i], predict_merge[i]))

    pass
Example #15
def read_data_df(filename, data_type):
    """
    Read the csv file in chunks.
    :param filename:
    :param data_type:
    :return:
    """
    filename = from_project_root(filename)
    if data_type == "train":
        data_df = pd.read_csv(filename,
                              chunksize=10000,
                              dtype={
                                  "id": str,
                                  "article": str,
                                  "word_seg": str,
                                  "class": np.int
                              },
                              engine="c")
    elif data_type == "test":
        data_df = pd.read_csv(filename,
                              chunksize=10000,
                              dtype={
                                  "id": str,
                                  "article": str,
                                  "word_seg": str
                              },
                              engine="c")
    tr_list = []
    for tr in data_df:
        tr_list.append(tr)
    data_df = pd.concat(tr_list)
    return data_df
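A minimal usage sketch of the chunked reader above (the file path is a placeholder); reading with chunksize lets pandas parse a large file piece by piece before the pieces are concatenated into one DataFrame:

# Illustrative call only; "data/train_set.csv" is a placeholder path.
train_df = read_data_df("data/train_set.csv", "train")
print(train_df.shape)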
Example #16
def main():

    json_url = from_project_root("processed_data/entity2contents.json")
    json_data = json_util.load(json_url)

    print(json_data["红楼梦"])
    exit()
Example #17
def load_raw_data(data_url, ngram=1):
    """ load data to get labels list and sentences list, set ngram=None if you
        want every sentence to be a space separated string instead of ngram list

    Args:
        data_url: url to data file
        ngram: generate ngram in sentence

    Returns:
        (list, list): labels and sentences

    """
    if not exists(data_url):
        generate_level_data(from_project_root("data/train_set.csv"))

    with open(data_url, "r", encoding="utf-8") as data_file:
        labels = list()
        sentences = list()
        print("loading data from \n ", data_url)
        s_time = time()
        for line in data_file:
            line = line.split(',')
            labels.append(int(line[0]))
            if ngram is not None:
                sentences.append(sentence_to_ngram(line[1], ngram))
            else:
                sentences.append(line[1])
        e_time = time()
        print("finished loading in %.3f seconds\n" % (e_time - s_time))
    return labels, sentences
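sentence_to_ngram is defined elsewhere in the project; the following is a minimal sketch of what such a helper might do, assuming space-separated tokens and underscore-joined n-grams (both assumptions, and the _sketch name is hypothetical):

# Hypothetical sketch of sentence_to_ngram; not the project's actual implementation.
def sentence_to_ngram_sketch(sentence, ngram=1):
    tokens = sentence.split()  # assumed: tokens are space separated
    grams = []
    for n in range(1, ngram + 1):  # collect 1-grams up to n-grams
        grams += ['_'.join(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]
    return grams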
Example #18
def gen_rematch_val():
    """ Use train data of rematch to generate gold result of test data in preliminary

    """
    train_df = pd.read_csv(from_project_root("data/train_2.csv"))
    test_df = pd.read_csv(
        from_project_root("data/preliminary/test_public.csv"))
    val_df = test_df.merge(train_df, on='content') \
        .drop(columns=['content_id_y']) \
        .rename(columns={'content_id_x': 'content_id'})
    val_df.to_csv(from_project_root('data/preliminary/test_gold.csv'),
                  index=False)

    test_df = pd.read_csv(from_project_root("data/test_public_2.csv"))
    test_df = test_df[~test_df['content_id'].isin(val_df['content_id'])]
    test_df.to_csv('data/test_2.csv', index=False)
Example #19
def transform(train_url=TRAIN_URL,
              test_url=None,
              column='word_seg',
              tw_type=TW_TYPE):
    """

    Args:
        column: column to use
        train_url: str, url to train data (with header)
        test_url: url to test data
        tw_type: str, term weighting type {idf, dc, bdc}

    Returns:
        X, y, X_test: vectorized data

    """
    data_url = from_project_root("processed_data\phrase_level_data.csv")
    if column == 'article':
        data_url = data_url.replace('phrase', 'word')

    if tw_type == 'idf':
        return tfidf_transform(train_url, test_url, column=column)
    elif tw_type == 'dc':
        dc_dict = cw.calc_dc(data_url, ngram=MAX_N)
        return dict_transform(dc_dict, train_url, test_url, column=column)
    elif tw_type == 'bdc':
        bdc_dict = cw.calc_bdc(data_url, ngram=MAX_N)
        return dict_transform(bdc_dict, train_url, test_url, column=column)
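dict_transform is defined elsewhere; below is a minimal sketch of the underlying idea, scaling raw term counts by a per-term weight such as the dc/bdc values computed above (the vectorizer settings and the _sketch name are assumptions, not the project's code):

# Hypothetical sketch of dict-based term weighting; dict_transform itself is not shown here.
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

def dict_transform_sketch(weight_dict, texts):
    vectorizer = CountVectorizer(tokenizer=str.split, lowercase=False)  # assumed settings
    X = vectorizer.fit_transform(texts)
    # scale each column (term) by its weight, defaulting to 0 for terms missing from the dict
    weights = np.array([weight_dict.get(t, 0.0) for t in vectorizer.get_feature_names_out()])
    return X.multiply(weights).tocsr()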
Example #20
def main():
    print("data generating...")
    xy_url = from_project_root(
        "processed_data/vector/{}_tf{}_{}gram_{}_XyN.pk".format(
            COLUMN, TW_TYPE, MAX_N, MAX_FEATURES))
    # test_url = None
    test_url = from_project_root('data/test_set.csv')
    if test_url:
        xy_url = xy_url.replace('XyN', 'XyX_test')
    print("generated (X, y, X_test) will be saved at", xy_url)
    X, y, X_test = transform(TRAIN_URL,
                             test_url,
                             column=COLUMN,
                             tw_type=TW_TYPE)
    joblib.dump((X, y, X_test), xy_url)
    pass
Example #21
def ft_process(data_url=None):
    """ process data into what ft model need, and save it into './processed_data' dir

    Args:
        data_url: url to original .csv data

    Returns:
        str: url to saved processed data

    """
    save_filename = basename(data_url).replace('.csv', '_ft.csv')
    save_url = from_project_root("embedding_model/processed_data/" + save_filename)

    # the file specified by data_url has already been processed
    if exists(save_url):
        return save_url
    if data_url is not None:
        labels, sentences = load_raw_data(data_url)
    else:
        train_df = load_to_df(TRAIN_URL)
        labels = train_df['class'].values
        sentences = train_df['word_seg']

    with open(save_url, "w", encoding='utf-8', newline='\n') as ft_file:
        for i in range(len(labels)):
            label = FT_LABEL_PREFIX + str(labels[i])
            sentence = ' '.join(sentences[i])
            ft_file.write('{} {}\n'.format(label, sentence))
    return save_url
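For reference, each written line follows fastText's supervised-training format. Assuming FT_LABEL_PREFIX is the conventional '__label__' (an assumption, since its value is not shown here), one line is assembled like this:

# Illustrative only; the FT_LABEL_PREFIX value and the tokens are placeholders.
FT_LABEL_PREFIX = "__label__"
label, sentence = 3, ["token1", "token2", "token3"]
line = '{} {}\n'.format(FT_LABEL_PREFIX + str(label), ' '.join(sentence))
print(line)  # -> "__label__3 token1 token2 token3"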
Example #22
def main():
    params = load_params()
    print("len(params) =", len(params))
    save_url = from_project_root(
        "processed_data/vector/stacked_dc_idf_lsvc_%d.pk" % len(params))
    joblib.dump(feature_stacking(use_proba=True, random_state=RANDOM_STATE),
                save_url)
Example #23
def evaluate(pred_url, use_senti=True):
    """ evaluate result file of preliminary test data

    Args:
        pred_url: str, url of predicted result file
        use_senti: bool, use sentiment_value column or not

    """
    usecols = ['content_id', 'subject']
    if use_senti:
        usecols.append('sentiment_value')
    true_df = pd.read_csv(from_project_root('data/preliminary/test_gold.csv'),
                          usecols=usecols)
    pred_df = pd.read_csv(pred_url, usecols=usecols)

    # tp: number of correct predictions
    # fp: number of wrong or extra predictions
    # fn: number of missed predictions
    tp = len(true_df.merge(pred_df, on=usecols))
    fp = len(pred_df) - tp
    fn = len(true_df) - tp
    print("metrics on test set of preliminary%s:" %
          ("" if use_senti else " without sentiment"))
    print(" tp = %d, fp = %d, fn = %d, n_samples = %d" % (tp, fp, fn, tp + fn))
    recall = tp / (tp + fn)
    precision = tp / (tp + fp)
    micro_f1 = 2 * recall * precision / (recall + precision)
    print(" recall = %f, precision = %f, micro_f1 = %f\n" %
          (recall, precision, micro_f1))
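A toy sanity check of the recall/precision/micro-F1 arithmetic above (the counts are made up):

# Made-up counts, only to illustrate the metric arithmetic.
tp, fp, fn = 60, 20, 40
recall = tp / (tp + fn)        # 60 / 100 = 0.60
precision = tp / (tp + fp)     # 60 / 80  = 0.75
micro_f1 = 2 * recall * precision / (recall + precision)  # ~0.667
print(recall, precision, micro_f1)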
Example #24
def predict(pro_file):

    data = pk.load(open(pro_file,'rb'))
    result = []
    for predict_one_merge in data:
        max_index = np.where(predict_one_merge == np.max(predict_one_merge))[0][0]
        result.append(max_index+1)

    predict_context, ids = Data_helper.get_predict_data(
        from_project_root("lstm_model/processed_data/phrase_level_test_data.csv"))

    # save the prediction results
    with open(from_project_root("hierarchicalAttention_Model/result/result_rcnn_rcnn_atten_han_5cv.csv"), 'w', encoding='utf-8') as f:
        f.write("id,class\n")
        for i in range(len(ids)):
            f.write("{},{}\n".format(ids[i], result[i]))
Example #25
def one_hot(param_data, sentence_type):
    """
    Compute the one-hot weight of each word.
    :param param_data:
    :param sentence_type:
    :return:
    """
    word_dictionary = []
    data = None
    if sentence_type == "phrase":
        data = param_data["word_seg"].values
    elif sentence_type == "word":
        data = param_data["article"].values
    for sentence in tqdm(data):
        word_list = sentence.split(" ")
        word_list_only = list(set(word_list))
        word_dictionary.extend(word_list_only)
    word_dictionary_count = Counter(word_dictionary)
    word_dictionary_only = list(word_dictionary_count.items())
    word_value = [1] * len(word_dictionary_only)
    word_df = pd.DataFrame(word_dictionary_only, columns=["word", "count"])
    word_df["weight"] = word_value
    filename = "processed_data/csv_weight/" + sentence_type + "_level_one_hot.csv"
    filename = from_project_root(filename)
    word_df.to_csv(filename, index=False)
Example #26
def calc_tf(data_url=DATA_URL, update=False, ngram=1):
    """ calc the tf value of all tokens

    Args:
        data_url: url to data file
        update: update the dict even if it exists
        ngram: max_n for ngram

    Returns:
        dict: tf dict {word: tf_value}

    """
    level = 'phrase' if 'phrase' in data_url else 'word'
    tf_url = from_project_root(
        "processed_data/saved_weight/{}_level_{}gram_tf.json".format(
            level, ngram))
    if not update and exists(tf_url):
        return ju.load(tf_url)

    tf_dict = collections.defaultdict(int)
    _, sentences = load_raw_data(data_url, ngram=ngram)
    for sentence in tqdm(sentences):
        for word in sentence:
            tf_dict[word] += 1

    ju.dump(ju.sort_dict_by_value(tf_dict, reverse=True), tf_url)
    return tf_dict
Example #27
def validate(pkl_url=None, cv=5, evaluating=False):
    """ do validating

        Args:
            pkl_url: load data from a pickle file, set to None to generate the data on the fly
            cv: number of cross-validation folds
            evaluating: whether to evaluate on test_gold

    """
    clfs = init_clfs()
    val_url = from_project_root("data/preliminary/test_gold_ex.csv")
    if pkl_url is not None:
        # load from pickle
        print("loading data from", pkl_url)
        X, y, X_val = joblib.load(pkl_url)
    else:
        train_url = from_project_root("data/preliminary/train_ex.csv")
        # generate from original csv
        X, y, X_val = generate_vectors(train_url,
                                       val_url,
                                       column='article',
                                       max_n=3,
                                       min_df=3,
                                       max_df=0.8,
                                       max_features=20000,
                                       trans_type='dc',
                                       sublinear_tf=True,
                                       balanced=True,
                                       multilabel_out=False,
                                       label_col='subjects',
                                       only_single=True,
                                       shuffle=True)

    print("data shapes:\n", X.shape, y.shape, X_val.shape)
    for name, clf in clfs.items():
        if len(y.shape) > 1:
            clf = OneVsRestClassifier(clf)
        print("cross validation on %s is running" % name)
        validate_clf(clf, X, y, cv=cv, scoring='f1_micro')
        if evaluating:
            print("metrics of %s classifier:" % name)
            clf.fit(X, y)
            y_true = pd.read_csv(val_url, usecols=list(map(
                str, range(10)))).values < 2
            y_pred = clf.predict(X_val)
            y_probas = predict_proba(clf, X_val)
            calc_metrics(y_true, y_pred, y_probas)
Example #28
def feature_stacking(n_splits=CV,
                     random_state=None,
                     use_proba=False,
                     verbose=False,
                     drop_words=DROP_WORDS):
    """

    Args:
        n_splits: n_splits for KFold
        random_state: random_state for KFold
        use_proba: True to predict probabilities of labels instead of labels
        verbose: True to print more info
        drop_words: drop_words for run_parallel

    Returns:
        X, y, X_test

    """

    # clf = OneVsRestClassifier(SVC(kernel='linear', probability=True)) # multilabel
    clf = OneVsRestClassifier(LinearSVCP())  # LinearSVC for multilabel
    # train_url = from_project_root("data/multilabel.csv")
    # test_url = from_project_root("data/test_processed.csv")
    train_url = from_project_root("../data/multilabel.csv")
    test_url = from_project_root("../data/test_processed.csv")

    # test_url = None
    X, y, X_test = generate_vectors(train_url, test_url,
                                    sublinear_tf=False)  # for X.shape

    params_list = load_params()
    parallel = joblib.Parallel(n_jobs=N_JOBS, verbose=True)

    rets = parallel(
        joblib.delayed(run_parallel)
        (ind, train_url, test_url, params, clf, n_splits, random_state,
         use_proba, verbose, drop_words)
        for ind, params in enumerate(params_list))
    rets = sorted(rets, key=lambda x: x[0])

    X_stack_train = np.empty((X.shape[0], 0), float)
    X_stack_test = np.empty((X_test.shape[0], 0), float)
    for ind, y_pred, y_pred_test in rets:
        X_stack_train = np.append(X_stack_train, y_pred, axis=1)
        X_stack_test = np.append(X_stack_test, y_pred_test, axis=1)
    return X_stack_train, y, X_stack_test
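run_parallel and the per-fold training it wraps are defined elsewhere; the core idea it presumably implements is out-of-fold prediction, sketched minimally below with scikit-learn (the function name, the single-label assumption, and labels being 0..n_classes-1 are all assumptions, not the project's code):

# Minimal out-of-fold stacking sketch; not the project's run_parallel implementation.
import numpy as np
from sklearn.model_selection import KFold

def out_of_fold_probas(clf, X, y, X_test, n_splits=5, random_state=None):
    # assumes a single-label integer y with labels 0..n_classes-1 and a clf exposing predict_proba
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    n_classes = len(np.unique(y))
    oof = np.zeros((X.shape[0], n_classes))
    test_pred = np.zeros((X_test.shape[0], n_classes))
    for train_idx, val_idx in kf.split(X):
        clf.fit(X[train_idx], y[train_idx])
        oof[val_idx] = clf.predict_proba(X[val_idx])        # out-of-fold train predictions
        test_pred += clf.predict_proba(X_test) / n_splits   # test predictions averaged over folds
    return oof, y, test_pred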
Example #29
def main():
    train_data_file = from_project_root(
        "lstm_model/processed_data/one_gram/filter-1gram_phrase_level_data_200.csv"
    )
    n_parts = 5
    dev_nums = 5000
    split_data_to_parts(train_data_file, n_parts, dev_nums)
    pass
Example #30
def main():
    kwargs = {
        'size': 300,
        'min_count': 5,
        'window': 5,
        'iter': 5,
        'sg': 1,
        'hs': 1
    }
    model = train_w2v_model(data_url=None, kwargs=kwargs)
    print(len(model.wv.vocab))

    wv_url = from_project_root(
        "embedding_model/models/wv_word_seg_300_5_5_5_1_1.txt")
    save_url = from_project_root("processed_data/vector/avg_wvs_300.pk")
    gen_data_for_clf(wv_url, save_url=save_url)
    pass