Ejemplo n.º 1
0
if __name__ == "__main__":
    # These imports are executed only when the module is run as a script.
    from tfn.preprocess import Dataset
    from tfn.helper import export_results
    from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
    from sklearn.model_selection import train_test_split
    import argparse

    # Command line: one boolean switch, exposed as ``args.export``.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-x", "--export-results",
        dest="export",
        action='store_true',
        help="Exports results to results.csv",
    )
    args = parser.parse_args()

    # Load the 'twitter' dataset and hold part of it out for evaluation
    # (train_test_split is called with its default split settings).
    data = Dataset('twitter')
    X_train, X_test, y_train, y_test = train_test_split(data.X, data.y)

    # Fit on the training split, then predict the held-out split.
    grad_boost = GradientBoost()
    grad_boost.fit(X_train, y_train)
    y_pred = grad_boost.predict(X_test)
    print(y_pred)

    # All three scores are computed from the same predictions before any of
    # them is reported.
    acc = accuracy_score(y_test, y_pred)
    roc = roc_auc_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # Report each score rounded to four decimal places.
    report = (
        ('TF-IDF + xgb accuracy:', acc),
        ('TF-IDF + xgb AUC:', roc),
        ('TF-IDF + xgb F1:', f1),
    )
    for label, score in report:
        print(label, round(score, 4))
Ejemplo n.º 2
0
    # Restrict the embedding type to the two values handled below, so that an
    # unsupported value is rejected by argparse up front instead of surfacing
    # later as a NameError on the never-assigned X / y / emb_size.
    parser.add_argument("--emb-type",
                        "-t",
                        dest="type",
                        default="glove",
                        choices=("glove", "char"),
                        type=str,
                        help="Embedding type. Can be 'glove' or 'char'.")
    parser.add_argument("-x",
                        "--export-results",
                        dest="export",
                        action='store_true',
                        help="Exports results to results.csv")
    args = parser.parse_args()

    # Both embedding types load the dataset variant named after the type.
    data = Dataset(args.type)
    if args.type == "glove":
        # Word-level vectors: the feature width is taken from args.emb_size.
        emb_size = args.emb_size
        emb = GloveEmbedding(data.X, emb_size=emb_size, type=args.type)
        X = emb.corpus_vectors
    else:
        # "char" is the only other value that `choices` lets through; the
        # feature width is fixed at 100 for this embedding.
        emb = CharEmbedding(data.X)
        X = emb.X_enc
        emb_size = 100
    y = np.array(data.y)

    X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True)

    # print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

    # The second axis of X is used as the sequence length of the model.
    lstm = LSTMModel(num_features=emb_size, seq_length=X.shape[1])
Ejemplo n.º 3
0
                        help="Proportion of data used for testing.")
    # Fraction of the data reserved for validation; stored as a float in
    # args.val_prop and defaulting to 0.2.
    parser.add_argument("--val-prop",
                        "-v",
                        dest="val_prop",
                        default=0.2,
                        type=float,
                        help="Proportion of data used for validation.")
    # With action="store_false", args.no_export is True unless the flag is
    # given and becomes False when it is.
    # NOTE(review): that makes the attribute name read inverted (no_export is
    # True when exporting is wanted) - confirm the code that consumes
    # args.no_export treats it as "export enabled".
    parser.add_argument(
        "--no-export",
        "-n",
        dest="no_export",
        action="store_false",
        help="Results of running will not be stored in results.csv.")
    args = parser.parse_args()

    data_t = Dataset('twitter')

    # Load augmentation data
    if args.load_aug:
        # The file is expected to hold a pickled (num_copies, aug_t) tuple,
        # i.e. the same pair that the save branch below writes.
        # NOTE(review): pickle.load can execute arbitrary code while
        # unpickling; only files produced by this script (or otherwise
        # trusted) should be passed here.
        with open(AUG_PATH / args.load_aug, 'rb') as aug_file:
            num_copies, aug_t = pickle.load(aug_file)
    else:
        # Run data augmentation (takes a long time)
        num_copies = args.aug_copies
        aug_t = AugmentWithEmbeddings(data_t.X,
                                      data_t.y,
                                      num_copies=num_copies,
                                      replace_pr=args.repl_prob)
        # Optionally cache the freshly built augmentation together with the
        # number of copies it was built with.
        if args.save_aug:
            with open(AUG_PATH / args.save_aug, 'wb') as aug_file:
                pickle.dump((num_copies, aug_t), aug_file)
Ejemplo n.º 4
0
        for X_test in self.pred_test_loader:
            X_test = X_test[0]
            y_pred = self.model(X_test)
            if self.device == "cpu":
                predictions_list.append(y_pred.data.numpy())
            else:
                predictions_list.append(y_pred.data.cpu().numpy())
        predictions = np.vstack(predictions_list)
        return predictions


# Script entry point: trains the CNN model on GloVe vectors and predicts a
# held-out split.
if __name__ == '__main__':
    # Imported only when the module is run as a script.
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import accuracy_score, roc_auc_score, f1_score

    # Load the "glove" dataset variant and embed its corpus; X holds the
    # embedding's corpus vectors and y the dataset targets as a NumPy array.
    data = Dataset("glove")
    embedding = GloveEmbedding(data.X, emb_size=EMBEDDING_DIM, type="glove")
    X = embedding.corpus_vectors
    y = np.array(data.y)

    # The model is configured entirely from the module-level constants.
    cnn = CNNModel(EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT,
                   BATCH_SIZE)

    # Shuffled train/test split with scikit-learn's default proportions.
    X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True)

    # Adam with its default hyper-parameters is used; the SGD line below is a
    # disabled alternative.
    #optimiser = optim.SGD(self.model.parameters(), lr=lr, momentum=momentum)
    optimiser = optim.Adam(cnn.model.parameters())
    cnn.fit(X_train, y_train, optimiser, epochs=30)

    # Predictions for the held-out split.
    y_pred = cnn.predict(X_test)
Ejemplo n.º 5
0
    class Callback(CallbackAny2Vec):
        """Print the loss added during each finished training epoch.

        The value returned by ``model.get_latest_training_loss()`` is treated
        as a running total: the value recorded at the end of the previous
        epoch is subtracted from it before printing.
        """
        def __init__(self):
            # Index of the epoch reported next, and the loss value recorded
            # when the previous epoch ended.
            self.epoch, self.loss_previous_step = 0, 0

        def on_epoch_end(self, model):
            """Print this epoch's loss delta, then advance the counters."""
            running_total = model.get_latest_training_loss()
            epoch_loss = running_total - self.loss_previous_step
            print('Loss after epoch {}: {}'.format(self.epoch, epoch_loss))
            self.loss_previous_step = running_total
            self.epoch += 1


# Script entry point: a manual smoke test for the embedding classes.
if __name__ == "__main__":
    # Imported only when the module is run as a script.
    from tfn.preprocess import Dataset

    # Test GloveEmbedding
    # The embedding type is held in `emb_type` rather than `type`, so the
    # builtin `type` is not rebound at module scope for everything that runs
    # after this line.
    emb_type = 'glove'
    ds = Dataset(emb_type)
    emb = GloveEmbedding(ds.X, type=emb_type)

    # Show the shape of the embedded corpus, then the first document and its
    # vectors.
    print(emb.corpus_vectors.shape)
    print(emb.corpus[0])
    print(emb.corpus_vectors[0])

    # Test OneHotCharEmbedding
    # ds = Dataset('char')
    # emb = CharEmbedding(ds.X, train=True, training_path="../data/training.1600000.processed.noemoticon.csv")
Ejemplo n.º 6
0
    while (i < len(vocab)):
        temp = vocab[i].split()
        for ii in temp:
            new_vocab.append(ii)
        i = i + 1
    #print(new_vocab)
    #remove duplicate word in list
    newlist = sorted(set(new_vocab), key=lambda x: new_vocab.index(x))
    #print(newlist)
    print('Get ' + str(gram) + '-gram ' + str(target) + ' target')

    return newlist


# Script entry point: builds word lists per n-gram size and target, trains a
# word2vec model on the training text and draws one t-SNE plot per word list.
if __name__ == '__main__':
    # NOTE(review): this calls a private (underscore-prefixed) method of
    # Dataset; `y` is not used in the lines visible here.
    trainX, y = Dataset('twitter')._get_training_data_from_csv()

    # One word list per (n-gram size, target) pair.
    # NOTE(review): the first call uses length=500 while every other call
    # uses 50 - confirm this is intentional.
    # NOTE(review): "diaster" in these names and in the plot titles below is
    # presumably a misspelling of "disaster"; the titles are runtime strings
    # and are left unchanged here.
    vocab1_diaster = get_word(gram=1, target=1, length=500)
    vocab1_nondiaster = get_word(gram=1, target=0, length=50)
    vocab2_diaster = get_word(gram=2, target=1, length=50)
    vocab2_nondiaster = get_word(gram=2, target=0, length=50)
    vocab3_diaster = get_word(gram=3, target=1, length=50)
    vocab3_nondiaster = get_word(gram=3, target=0, length=50)

    # A single model built from the training corpus is shared by all plots.
    word2vecmod = word2vec_model(corpus=trainX, update=False)

    # One plot per word list; the third argument is passed as a string
    # describing the list.
    tsne_plot(word2vecmod, vocab1_diaster, "UniGram diaster word embedding")
    tsne_plot(word2vecmod, vocab1_nondiaster,
              "UniGram nondiaster word embedding")
    tsne_plot(word2vecmod, vocab2_diaster, "BiGram diaster word embedding")
    tsne_plot(word2vecmod, vocab2_nondiaster,