Code example #1
    # (snippet begins mid-loop; a plausible loop head, assumed from context, is restored)
    vob = set()
    for i in range(len(data)):
        vob = vob.union(set(data[i][0]))
    vecs = load_my_vecs("./output/word2vect.txt", vob)
    vecs = handle_unknow(list(vob), vecs)
    tag_to_ix = {"K": 0, "o": 1}
    # map each word to its row index in the pretrained weight matrix
    word_to_ix = {word: i for i, word in enumerate(vecs.keys())}
    pretrained_weight = list(vecs.values())

    train(X_train, word_to_ix, pretrained_weight)

    result = []
    for i in range(EPOCHS):
        # results on the training set
        yp_train = lstm.test("./model/lstm_w2v" + str(i) + ".pkl", X_train,
                             word_to_ix)
        p_t, r_t, f1_t = lstm.calculate(y_train, yp_train)
        # results on the test set
        yp_test = lstm.test("./model/lstm_w2v" + str(i) + ".pkl", X_test,
                            word_to_ix)
        p, r, f1 = lstm.calculate(y_test, yp_test)
        now = datetime.strftime(datetime.now(), "%m-%d %H:%M:%S")
        print(now, "epoch:", i)
        result.append(([p_t, r_t, f1_t], [p, r, f1]))
    with open("./output/lstmw2vresult.txt", "w", encoding="utf-8") as f:
        for i, j in result:
            f.write(" ".join(map(str, i)))
            f.write("\t")
            f.write(" ".join(map(str, j)))
            f.write("\n")
Code example #2
File: lstm_test.py Project: ab123498/MLDS2017
import os
import argparse
from lstm import test
# parser = argparse.ArgumentParser()
# parser.add_argument('testing_list')
# parser.add_argument('feat_path')
# option = parser.parse_args()
testing_dir = './MLDS_hw2_data/testing_data/feat'
testing_list = './MLDS_hw2_data/testing_id.txt'

test(testing_dir, testing_list)
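
The commented-out parser suggests these paths were meant to come from the command line; a sketch that re-enables it with the hard-coded paths as defaults (argument order taken from the comment; not verified against the project):

import argparse
from lstm import test

parser = argparse.ArgumentParser()
parser.add_argument('testing_list', nargs='?', default='./MLDS_hw2_data/testing_id.txt')
parser.add_argument('feat_path', nargs='?', default='./MLDS_hw2_data/testing_data/feat')
option = parser.parse_args()
test(option.feat_path, option.testing_list)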
Code example #3
val_x = dev_x
val_y = change_dev_y

model = model_input.LSTModel(
    class_num=class_num,
    lstm_dims=lstm_sizes,
    fc_size=fc_size,
    max_sent_len=max_sent_len,
)
model.build()
print("****model build finished***")

# # ==================train=================
# min_dev_loss = model_input.train(
#     model,
#     learning_rate,
#     train_x,
#     train_y,
#     val_x,
#     val_y,
#     max_epochs,
#     batch_size,
#     keep_prob,
#     l2reg,
#     show_step=show_step
# )
# logger.info(f' ** The minimum dev_loss is {min_dev_loss}')

# ==================test =================
model_input.test(model, test_x, test_y, batch_size)
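
To train before evaluating, the commented-out block above implies a call of this shape; a sketch, assuming model_input.train accepts exactly the arguments listed in the comment:

min_dev_loss = model_input.train(
    model, learning_rate,
    train_x, train_y, val_x, val_y,
    max_epochs, batch_size, keep_prob, l2reg,
    show_step=show_step,
)
logger.info(f' ** The minimum dev_loss is {min_dev_loss}')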
Code example #4
def experiment_model(
    selection_problem,
    selection_test_fold,
    selection_source,
    selection_test_source,
    selection_count,
    selection_random_seed,
    selection_tag,
    selection_reject_minimum,
    selection_overwrite,
    al_threshold,
    embedding_type,
    embedding_shape,
    embedding_overwrite,
    model_type,
    model_arch_num,
    model_layer_sizes,
    model_maxlen,
    model_batch_size,
    model_learning_rate,
    model_epochs,
    model_num,
    experiment_tag,
    verbose=True,
    params=None
):
    # embed_df, sel_df, name = experiment_dataset(
    #     selection_problem,
    #     selection_source,
    #     selection_count,
    #     selection_random_seed,
    #     selection_reject_minimum,
    #     selection_overwrite,
    #     embedding_type,
    #     embedding_shape,
    #     embedding_overwrite,
    #     verbose=verbose
    # )


    embed_df, sel_df, name, test_selection_df, test_embedding_df, al_selection_df, al_embedding_df = experiment_dataset(
        selection_problem,
        selection_test_fold,
        selection_source,
        selection_test_source,
        selection_count,
        selection_random_seed,
        selection_tag,
        selection_reject_minimum,
        selection_overwrite,
        al_threshold,
        embedding_type,
        embedding_shape,
        embedding_overwrite,
    )

    X = embed_df
    X_test = test_embedding_df
    X_al_test = al_embedding_df
    target_col = ""
    if selection_problem == "reliability":
        target_col = "reliable"
        y = sel_df.reliable
        y_test = test_selection_df.reliable
        y_al_test = al_selection_df.reliable
    elif selection_problem == "biased" or selection_problem == "extreme_biased" or selection_problem == "bias_direction": # NOTE: unsure if this is where bias_direction should go?
        target_col = "biased"
        y = sel_df.biased
        y_test = test_selection_df.biased
        y_al_test = al_selection_df.biased

    # pad as needed
    data_width = 0
    if embedding_shape == "sequence":
        X = lstm.pad_data(X, maxlen=model_maxlen)
        X_test = lstm.pad_data(X_test, maxlen=model_maxlen)
        X_al_test = lstm.pad_data(X_al_test, maxlen=model_maxlen)

        # data_width is the embedding width (number of columns) of the padded data
        data_width = X.shape[-1]
        if model_type == "cnn":
            X = np.reshape(X, (X.shape[0], model_maxlen*data_width, 1))
            X_test = np.reshape(X_test, (X_test.shape[0], model_maxlen*data_width, 1))
            X_al_test = np.reshape(X_al_test, (X_al_test.shape[0], model_maxlen*data_width, 1))
    else:
        X = np.array(X)
        y = np.array(y)
        X_test = np.array(X_test)
        y_test = np.array(y_test)
        X_al_test = np.array(X_al_test)
        y_al_test = np.array(y_al_test)
        print(X)
        data_width = X.shape[-1]

    
    if selection_problem == "bias_direction" and model_type != "svm":
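        # one-hot encode the three direction classes for the keras models; the svm path keeps integer labels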
        y = keras.utils.to_categorical(y, num_classes=3)
        y_test = keras.utils.to_categorical(y_test, num_classes=3)
        y_al_test = keras.utils.to_categorical(y_al_test, num_classes=3)


    if "AL_TRAINING" in experiment_tag:
        model = svm.LinearSVC()
        print(X_al_test.shape, y_al_test.shape)
        cv_results = cross_validate(model, X_al_test, y_al_test, cv=10)
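        # 10-fold cross-validation; per-fold scores come back under cv_results["test_score"]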
        print("_"*80)
        print(cv_results["test_score"])
        results_scores = [float(num) for num in cv_results["test_score"]]
        total = sum(results_scores) / len(results_scores)
        print(total)

        save_data = {"average": float(total), "scores": results_scores}
        output_path = f"../data/output/{experiment_tag}"
        util.create_dir(output_path)
        with open(output_path + "/" + experiment_tag + ".json", 'w') as outfile:
            json.dump(save_data, outfile)
        exit()

    
    name = f'{experiment_tag}_{name}_{model_type}_{model_arch_num}_{model_num}_{model_maxlen}_{model_batch_size}_{model_learning_rate}'
        
    if model_type == "lstm":
        model, history, loss, acc, predictions = lstm.train_test(X, y, model_arch_num, model_layer_sizes, model_maxlen, model_batch_size, model_learning_rate, model_epochs, X_test, y_test, name, data_width, selection_problem)

        loss_al, acc_al, predictions_al = lstm.test(X_al_test, y_al_test, model_batch_size, model)
    elif model_type == "cnn":
        model, history, loss, acc, predictions = cnn.train_test(X, y, model_arch_num, model_layer_sizes, model_maxlen, model_batch_size, model_learning_rate, model_epochs, X_test, y_test, name)
    elif model_type == "nn":
        model, history, loss, acc, predictions = nn.train_test(X, y, model_arch_num, model_layer_sizes, model_maxlen, model_batch_size, model_learning_rate, model_epochs, X_test, y_test, name, data_width, selection_problem)

        loss_al, acc_al, predictions_al = nn.test(X_al_test, y_al_test, model_batch_size, model)
    elif model_type == "svm":
        model = svm.LinearSVC(random_state=42)
        model.fit(X, y)
        history = None
        loss = 0
        acc = model.score(X_test, y_test)
        predictions = model.predict(X_test)
        loss_al = 0
        acc_al = model.score(X_al_test, y_al_test)
        predictions_al = model.predict(X_al_test)
    print("Training done")

    logging.info("%s", str(test_selection_df[target_col].value_counts()))
    print(test_selection_df[target_col].value_counts())

    # turn predictions into dataframe
    #pred = pd.DataFrame({"predicted": predictions})
    #pred.index = test_selection_df.index
    
    if selection_problem == "bias_direction" and model_type != "svm":
        test_selection_df["predicted"] = np.argmax(predictions, axis=1)
        test_selection_df["pred_class"] = np.argmax(predictions, axis=1)
        
        al_selection_df["predicted"] = np.argmax(predictions_al, axis=1)
        al_selection_df["pred_class"] = np.argmax(predictions_al, axis=1)
    else:
        test_selection_df["predicted"] = predictions
        test_selection_df["pred_class"] = round(test_selection_df.predicted).astype(int)
        
        al_selection_df["predicted"] = predictions_al
        al_selection_df["pred_class"] = round(al_selection_df.predicted).astype(int)

    #al_unique_selection_df = []
    
    # get list of sources for MBC that aren't in training set
    training_sources = list(set(sel_df.source))
    mbc_sources = list(set(al_selection_df.Source))
    unseen_mbc_sources = [
        x for x in mbc_sources
        if x not in training_sources
        and not (x in util.MBC_to_NELA and util.MBC_to_NELA[x] in training_sources)
    ]
    al_unseen_selection_df = al_selection_df[al_selection_df.Source.isin(unseen_mbc_sources)]

    print("="*20, "TRAINING", "="*20)
    print(training_sources)
    print("="*20, "MBC", "="*20)
    print(mbc_sources)
    print("="*20, "UNSEEN", "="*20)
    print(unseen_mbc_sources)


    binary = selection_problem != "bias_direction"
    overall_counts = calculate_cm_counts(test_selection_df, target_col, binary=binary)
    overall_counts_al = calculate_cm_counts(al_selection_df, target_col, binary=binary)
    overall_counts_al_unseen = calculate_cm_counts(al_unseen_selection_df, target_col, binary=binary)  # only unseen sources

    # make output directory (based on experiment tag)
    output_path = f"../data/output/{experiment_tag}"
    breakdown_output_path = output_path + "/persource"
    albreakdown_output_path = output_path + "/alpersource"
    util.create_dir(output_path)
    util.create_dir(breakdown_output_path)
    util.create_dir(albreakdown_output_path)

    logging.info("Overall confusion analysis")
    confusion_analysis(overall_counts, output_path, experiment_tag, name, history, loss, acc, params, False)
    logging.info("Overall analysis complete")

    groups = test_selection_df.groupby(test_selection_df.source)
    logging.info("There are %i groups", len(groups))


    for group_name, group in groups:
        logging.info("Next group %s", group_name)

        group_counts = calculate_cm_counts(group, target_col, binary=binary)
            

        confusion_analysis(group_counts, breakdown_output_path, experiment_tag, name + "_persource", history, loss, acc, params, source=group_name)

    #with open("../data/output/" + name + "_predictions.pkl", 'wb') as outfile:
    with open(output_path + "/" + name + "_predictions.pkl", 'wb') as outfile:
        pickle.dump(test_selection_df, outfile)

    logging.info("*****-----------------------------------------*****")
    logging.info("Article-level analysis")
    confusion_analysis(overall_counts_al, output_path, experiment_tag, name + "_al", None, loss_al, acc_al, params, False)
    logging.info("--- (With only unseen sources)")
    confusion_analysis(overall_counts_al_unseen, output_path, experiment_tag, name + "_al_unseen", None, loss_al, acc_al, params, False)
    with open(output_path + "/" + name + "_al_unseensourcelist.json", 'w') as outfile:
        json.dump(unseen_mbc_sources, outfile)
    # TODO: move unseen source calc to bottom and redo groups?
    
    groups = al_selection_df.groupby(al_selection_df.Source)
    logging.info("There are %i al groups", len(groups))
    
    for group_name, group in groups:
        logging.info("Next group %s", group_name)

        group_counts = calculate_cm_counts(group, target_col, binary=binary)
        confusion_analysis(group_counts, albreakdown_output_path, experiment_tag, name + "_peralsource", None, loss_al, acc_al, params, source=group_name)
    with open(output_path + "/" + name + "_predictionsal.pkl", 'wb') as outfile:
        pickle.dump(al_selection_df, outfile)
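
calculate_cm_counts is a project helper not shown in this excerpt; for the binary case it presumably tallies confusion-matrix counts from target_col against pred_class, roughly as in this hypothetical sketch (not the project's actual implementation):

def calculate_cm_counts_sketch(df, target_col):
    # Hypothetical: tally the binary confusion matrix from labels vs. pred_class.
    tp = int(((df[target_col] == 1) & (df.pred_class == 1)).sum())
    tn = int(((df[target_col] == 0) & (df.pred_class == 0)).sum())
    fp = int(((df[target_col] == 0) & (df.pred_class == 1)).sum())
    fn = int(((df[target_col] == 1) & (df.pred_class == 0)).sum())
    return {"tp": tp, "fp": fp, "tn": tn, "fn": fn}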
Code example #5
File: train.py Project: matejMartinc/CLIN29
def main():
    print("Predict Youtube cross genre")
    directory = 'data/csv/'
    '''df_data, y = preprocess_data(directory, 'train_news_twitter.csv')
    df_test, test_y = preprocess_data(directory, 'youtube_train.csv')
    train_and_test(df_data, y, df_test, test_y, 100, 'youtube')

    print("Predict News cross genre")
    directory = 'data/csv/'
    df_data, y = preprocess_data(directory, 'train_youtube_twitter.csv')
    df_test, test_y = preprocess_data(directory, 'news_train.csv')
    train_and_test(df_data, y, df_test, test_y, 100, 'news')'''

    print("Predict Twitter cross genre")
    #directory = 'data/csv/'
    #df_data, y = preprocess_data(directory, 'twitter_train.csv')
    #df_test, test_y = preprocess_data(directory, 'twitter_train.csv')
    #print("Shape of train and test: ", df_data.shape, df_test.shape)
    #train_and_test(df_data, y, df_test, test_y, 100, 'twitter')
    '''directory = 'data/csv/'
    df_data, y, df_test, test_y = preprocess_data(directory, 'surprise_test.csv', split=True)
    print("Shape of train and test: ", df_data.shape, df_test.shape)
    train_and_test(df_data, y, df_test, test_y, 100, 'surprise')'''

    #cross genre
    '''model = 'models/news_model_cg_0.557.pt'
    model = torch.load(model)
    corpus = pickle.load(open('models/news_corpus_cg_0.557.pk', 'rb'))
    corpus.batch_size = 16
    model.batch_size = 16
    df_test, test_y = preprocess_data(directory, 'twitter_test.csv', predict=True)
    test(df_test, test_y, model, corpus, 'IJS-KD_CROSS_twitter_2', test=False)

    model = 'models/news_model_cg_0.557.pt'
    model = torch.load(model)
    corpus = pickle.load(open('models/news_corpus_cg_0.557.pk', 'rb'))
    corpus.batch_size = 10
    model.batch_size = 10
    df_test, test_y = preprocess_data(directory, 'news_test.csv', predict=True)
    test(df_test, test_y, model, corpus, 'IJS-KD_CROSS_news_1', test=False)

    model = 'models/youtube_model_cg_0.558.pt'
    model = torch.load(model)
    corpus = pickle.load(open('models/youtube_corpus_cg_0.558.pk', 'rb'))
    corpus.batch_size = 2
    model.batch_size = 2
    df_test, test_y = preprocess_data(directory, 'youtube_test.csv', predict=True)
    test(df_test, test_y, model, corpus, 'IJS-KD_CROSS_youtube_1', test=False)'''
    '''model = 'models/news_model_in.pt'
    model = torch.load(model)
    corpus = pickle.load(open('models/news_corpus_in.pk', 'rb'))
    corpus.batch_size = 1
    model.batch_size = 1
    df_test, test_y = preprocess_data(directory, 'surprise_test.csv', predict=True)
    test(df_test, test_y, model, corpus, 'IJS-KD_CROSS_kb_1', test=False)'''

    #in_genre

    model = 'models/youtube_model_in.pt'
    model = torch.load(model)
    corpus = pickle.load(open('models/youtube_corpus_in.pk', 'rb'))
    corpus.batch_size = 2
    model.batch_size = 2
    df_test, test_y = preprocess_data(directory,
                                      'youtube_test.csv',
                                      predict=True)
    test(df_test, test_y, model, corpus, 'IJS-KD_IN_youtube_1', test=False)
    '''model = 'models/news_model_in.pt'