Ejemplo n.º 1
0
def eval_ae():
    """Train and evaluate several classifiers on auto-encoded features.

    Steps, in order:

    1. Load all data through ``DataReader`` and split it into train,
       validation and test parts.
    2. Turn the raw splits into bag-of-words features.
    3. Obtain an encoder from ``get_encoder`` (requested size 4096) and
       encode the train, test and validation features with it.
    4. Train a neural network, a logistic regression, a random forest and
       a naive Bayes model on the encoded training set, handing each one
       to the evaluation helpers together with the encoded test set.

    Returns ``None``.  Any metric reporting is done by the evaluation
    helpers themselves (the original comments mention precision, recall,
    F1 score and accuracy -- not verifiable from this function alone).
    """
    from Models.logistic_regression import MultiClassLogisticRegression
    from Models.random_forest import RandomForest
    from Models.naive_bayes import NaiveBayes
    from Models.svm import SVM  # not used below; import kept as in the original

    # Load and split the data.
    reader = DataReader()
    frame = reader.get_all_data()
    (raw_train_x, raw_train_y,
     raw_val_x, raw_val_y,
     raw_test_x, raw_test_y) = get_train_validate_test_split(frame)

    # Bag-of-words featurisation of all three splits.
    train_x, train_y, val_x, val_y, test_x, test_y = (
        bag_of_words_full_no_empty_val_no_num_no_short_no_repeat(
            raw_train_x, raw_train_y,
            raw_val_x, raw_val_y,
            raw_test_x, raw_test_y))

    # Auto encoder of size 4096, then encode train, test and validation
    # features (same call order as before).
    encoder = get_encoder(train_x, test_x, 4096)
    encoded_train, encoded_test, encoded_val = [
        encoder.predict(features) for features in (train_x, test_x, val_x)
    ]

    # Neural network on the encoded features.
    print('neural net ae')
    region_codes = reader.get_region_labels()['Code']
    nn_model = _get_nn_model_bag_of_words_simple_scratch(
        encoded_train, train_y,
        encoded_val, val_y,
        region_codes,
        epochs=100,
        batch_size=256)
    eval_nn(nn_model, encoded_test, test_y)
    evaluate_model_nn(nn_model, encoded_test, test_y)

    # The remaining models all follow the same train / evaluate recipe.
    classical_models = (
        ('logistic regression ae', MultiClassLogisticRegression),
        ('random forest ae', RandomForest),
        ('naive bayes ae', NaiveBayes),
    )
    for banner, model_cls in classical_models:
        print(banner)
        classifier = model_cls()
        classifier.train(encoded_train, train_y)
        # eval_model receives an object that exposes the classifier as
        # ``.model``; a bare function object is used as that holder.
        holder = lambda: None
        holder.model = classifier
        eval_model(holder, encoded_test, test_y)
        evaluate_model(classifier, encoded_test, test_y)
def _get_naive_bayes_model_bag_of_words_full_save_missing(train_x, train_y):
    """Create, train and return a ``NaiveBayes`` model.

    The model is constructed with the name
    ``'NaiveBayes_bag_of_words_full_save_missing'`` and trained on
    ``train_x`` / ``train_y`` before being returned.
    """
    classifier = NaiveBayes(name='NaiveBayes_bag_of_words_full_save_missing')
    classifier.train(train_x, train_y)
    return classifier
def _get_naive_bayes_model_tfidf(train_x, train_y):
    """Create, train and return a ``NaiveBayes`` model.

    The model is constructed with the name ``'NaiveBayes_tfidf'`` and
    trained on ``train_x`` / ``train_y`` before being returned.
    """
    classifier = NaiveBayes(name='NaiveBayes_tfidf')
    classifier.train(train_x, train_y)
    return classifier
def _get_naive_bayes_model_doc2vec_simple_16384(train_x, train_y):
    """Create, train and return a ``NaiveBayes`` model.

    The model is constructed with the name
    ``'NaiveBayes_doc2vec_simple_16384'`` and trained on
    ``train_x`` / ``train_y`` before being returned.
    """
    classifier = NaiveBayes(name='NaiveBayes_doc2vec_simple_16384')
    classifier.train(train_x, train_y)
    return classifier
def _get_naive_bayes_model_bag_of_words_simple(train_x, train_y):
    """Create, train and return a ``NaiveBayes`` model.

    The model is constructed with the name
    ``'NaiveBayes_bag_of_words_simple'`` and trained on
    ``train_x`` / ``train_y`` before being returned.
    """
    classifier = NaiveBayes(name='NaiveBayes_bag_of_words_simple')
    classifier.train(train_x, train_y)
    return classifier
Ejemplo n.º 6
0
def _average_word_vectors(token_lists, keyed_vectors):
    """Return one averaged word embedding per token list.

    For every token list, the vectors of the tokens present in
    ``keyed_vectors`` are averaged element-wise.  A token list with no
    known token gets an all-zero vector of length
    ``keyed_vectors.vector_size`` instead, and a ``'****'`` marker line is
    printed for it.

    Args:
        token_lists: iterable of token sequences, one per document.
        keyed_vectors: gensim ``KeyedVectors`` holding the embeddings.

    Returns:
        ``numpy`` array built from the per-document vectors, in input order.
    """
    rows = []
    for tokens in token_lists:
        # ``token in keyed_vectors`` is the membership test supported by
        # both gensim 3.x and 4.x (``.wv.vocab`` is not available in 4.x).
        known = [
            keyed_vectors.get_vector(token)
            for token in tokens
            if token in keyed_vectors
        ]
        if known:
            rows.append(np.mean(known, axis=0))
        else:
            # Nothing to average: checked explicitly so no NaN mean (and
            # no "mean of empty slice" warning) is produced.
            print('****')
            rows.append(np.zeros(keyed_vectors.vector_size))
    return np.array(rows)


def eval_pub_med():
    """Train and evaluate several classifiers on averaged PubMed embeddings.

    Loads the pre-trained word2vec file ``wikipedia-pubmed-and-PMC-w2v.bin``
    (relative path, so it must be in the working directory), splits the
    data from ``DataReader`` into train / validation / test parts,
    tokenizes each part and represents every document by the average of
    the embeddings of its known words (all zeros when no word is known).

    A neural network, a logistic regression, a random forest and a naive
    Bayes model are then trained on the training representation and handed
    to the evaluation helpers together with the test representation.

    Returns ``None``.  Any metric reporting is done by the evaluation
    helpers themselves (the original comments mention precision, recall,
    F1 score and accuracy -- not verifiable from this function alone).
    """
    from gensim.models.keyedvectors import KeyedVectors
    # Need to download file from http://evexdb.org/pmresources/vec-space-models/wikipedia-pubmed-and-PMC-w2v.bin
    # Load the pubmed model
    model = KeyedVectors.load_word2vec_format(
        'wikipedia-pubmed-and-PMC-w2v.bin', binary=True)

    # Load data into train/validate/test sets
    data_reader = DataReader()
    df = data_reader.get_all_data()
    train_x_raw, train_y_raw, val_x_raw, val_y_raw, test_x_raw, test_y_raw = get_train_validate_test_split(
        df)

    # Tokenize each split and average the embeddings of its known words.
    # tokenize() also returns the labels, which replace the raw ones
    # (presumably because remove_empty=True can drop rows -- TODO confirm).
    tokens_train, train_y_raw = tokenize(train_x_raw,
                                         train_y_raw,
                                         save_missing_feature_as_string=False,
                                         remove_empty=True)
    pub_med_train = _average_word_vectors(tokens_train, model)

    tokens_val, val_y_raw = tokenize(val_x_raw,
                                     val_y_raw,
                                     save_missing_feature_as_string=False,
                                     remove_empty=True)
    pub_med_val = _average_word_vectors(tokens_val, model)

    tokens_test, test_y_raw = tokenize(test_x_raw,
                                       test_y_raw,
                                       save_missing_feature_as_string=False,
                                       remove_empty=True)
    pub_med_test = _average_word_vectors(tokens_test, model)

    # Neural network
    print("pubmed, nn")
    nn_model = _get_nn_model_bag_of_words_simple_scratch(
        pub_med_train,
        train_y_raw,
        pub_med_val,
        val_y_raw,
        data_reader.get_region_labels()['Code'],
        epochs=100,
        batch_size=256)
    eval_nn(nn_model, pub_med_test, test_y_raw)
    evaluate_model_nn(nn_model, pub_med_test, test_y_raw, plot_roc=False)

    # Logistic regression
    print("pubmed, logistic regression")
    from Models.logistic_regression import MultiClassLogisticRegression
    log_reg = MultiClassLogisticRegression()
    log_reg.train(pub_med_train, train_y_raw)
    eval_model(log_reg, pub_med_test, test_y_raw)
    evaluate_model(log_reg, pub_med_test, test_y_raw, plot_roc=False)

    # Random forest
    print("pubmed, random forest")
    from Models.random_forest import RandomForest
    rand_for = RandomForest()
    rand_for.train(pub_med_train, train_y_raw)
    eval_model(rand_for, pub_med_test, test_y_raw)
    evaluate_model(rand_for, pub_med_test, test_y_raw, plot_roc=False)

    # Naive Bayes
    print("pubmed, naivebayes")
    from Models.naive_bayes import NaiveBayes
    nb = NaiveBayes()
    nb.train(pub_med_train, train_y_raw)
    eval_model(nb, pub_med_test, test_y_raw)
    evaluate_model(nb, pub_med_test, test_y_raw, plot_roc=False)