Example #1
def get_clean_reviews(articles):
    """Return a list of list of words."""

    clean_reviews = []
    for review_text in articles["review"]:
        clean_reviews.append(review_to_wordlist(review_text,
                                                remove_stopwords=True))
    return clean_reviews
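
review_to_wordlist is imported from elsewhere in the repository and its definition is not shown in these examples. A minimal sketch of what such a helper typically looks like, assuming bs4 and nltk as dependencies (strip HTML, keep letters only, lower-case, optionally drop English stopwords); this is a reconstruction from the call sites, not the repository's actual code:

# Sketch of the assumed review_to_wordlist helper: raw review text in,
# list of lower-cased word tokens out, optionally without stopwords.
import re
from bs4 import BeautifulSoup
from nltk.corpus import stopwords

def review_to_wordlist(review_text, remove_stopwords=False):
    text = BeautifulSoup(review_text, "html.parser").get_text()  # strip HTML tags
    text = re.sub("[^a-zA-Z]", " ", text)                        # keep letters only
    words = text.lower().split()
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]
    return words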

Example #2
    # Create clean_train_reviews and clean_test_reviews as we did before

    # Read data from files
    train_file = join(os.path.dirname(__file__), 'data', 'labeledTrainData.tsv')
    train = pd.read_csv(train_file, header=0, delimiter="\t", quoting=3)

    test_file = join(os.path.dirname(__file__), 'data', 'testData.tsv')
    test = pd.read_csv(test_file, header=0, delimiter="\t", quoting=3)


    print "Cleaning training reviews"
    clean_train_reviews = []
    for review in train["review"]:
        clean_train_reviews.append(review_to_wordlist(review,
            remove_stopwords=True))

    print "Cleaning test reviews"
    clean_test_reviews = []
    for review in test["review"]:
        clean_test_reviews.append(review_to_wordlist(review,
            remove_stopwords=True))


    # ****** Create bags of centroids

    # Pre-allocate an array for the training set bags of centroids (for speed)
    train_centroids = np.zeros((train["review"].size, num_clusters),
        dtype="float32")

    # Transform the training set reviews into bags of centroids
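
The snippet above ends right before the transform it announces. A plausible sketch of that bag-of-centroids step, assuming a word_centroid_map dict that maps each vocabulary word to its k-means cluster index (the clustering itself is not shown in these examples):

# Hypothetical helper for the "bags of centroids" transform above.
# word_centroid_map is assumed to be a dict {word: cluster index}
# built by clustering the word2vec vocabulary with k-means.
import numpy as np

def create_bag_of_centroids(wordlist, word_centroid_map):
    num_centroids = max(word_centroid_map.values()) + 1
    bag_of_centroids = np.zeros(num_centroids, dtype="float32")
    for word in wordlist:
        if word in word_centroid_map:
            bag_of_centroids[word_centroid_map[word]] += 1
    return bag_of_centroids

# The pre-allocated train_centroids array would then be filled row by row:
# for i, review in enumerate(clean_train_reviews):
#     train_centroids[i] = create_bag_of_centroids(review, word_centroid_map)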
Example #3
def main(args):
    print "loading reviews and labels from dataset"
    data = pd.read_csv('data/labeledTrainData.tsv.zip',
                       compression='zip',
                       delimiter='\t',
                       header=0,
                       quoting=3)
    reviews = data["review"]
    labels = list(data['sentiment'])
    sentences = []
    for review in reviews:
        if len(review) > 0:
            sentences.append(
                utils.review_to_wordlist(review.decode('utf8').strip(),
                                         remove_stopwords=True))
    print "loaded %d reviews from dataset" % len(sentences)

    word_dict = utils.build_vocab(sentences, max_words=10000)
    vec_reviews = utils.vectorize(sentences, word_dict, verbose=True)
    train_x = vec_reviews[0:20000]
    train_y = labels[0:20000]
    train_y = utils.one_hot(train_y, args.nb_classes)
    test_x = vec_reviews[20000:]
    test_y = labels[20000:]
    test_y = utils.one_hot(test_y, args.nb_classes)

    save_dir = args.save_dir
    log_dir = args.log_dir
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    with tf.Graph().as_default():
        config_proto = utils.get_config_proto()
        sess = tf.Session(config=config_proto)
        if args.model_type == "cnn":
            model = TextCNN(args, "TextCNN")
            test_batch = utils.get_batches(test_x, test_y, args.max_size)
        elif args.model_type in ["rnn", "bi_rnn"]:
            model = TextRNN(args, "TextRNN")
            test_batch = utils.get_batches(test_x,
                                           test_y,
                                           args.max_size,
                                           type="rnn")

        sess.run(tf.global_variables_initializer())
        summary_writer = tf.summary.FileWriter(log_dir, sess.graph)

        for epoch in range(1, args.nb_epochs + 1):
            print "epoch %d start" % epoch
            print "- " * 50

            loss = 0.
            total_reviews = 0
            accuracy = 0.
            if args.model_type == "cnn":
                train_batch = utils.get_batches(train_x, train_y,
                                                args.batch_size)
            elif args.model_type in ["rnn", "bi_rnn"]:
                train_batch = utils.get_batches(train_x,
                                                train_y,
                                                args.batch_size,
                                                type="rnn")
            epoch_start_time = time.time()
            step_start_time = epoch_start_time
            for idx, batch in enumerate(train_batch):
                reviews, reviews_length, labels = batch
                _, loss_t, accuracy_t, global_step, batch_size, summaries = model.train(
                    sess, reviews, reviews_length, labels, args.keep_prob)

                loss += loss_t * batch_size
                total_reviews += batch_size
                accuracy += accuracy_t * batch_size
                summary_writer.add_summary(summaries, global_step)

                if global_step % 50 == 0:
                    print "epoch %d, step %d, loss %f, accuracy %.4f, time %.2fs" % \
                      (epoch, global_step, loss_t, accuracy_t, time.time() - step_start_time)
                    step_start_time = time.time()

            epoch_time = time.time() - epoch_start_time
            print "%.2f seconds in this epoch" % (epoch_time)
            print "train loss %f, train accuracy %.4f" % (
                loss / total_reviews, accuracy / total_reviews)

            total_reviews = 0
            accuracy = 0.
            for batch in test_batch:
                reviews, reviews_length, labels = batch
                accuracy_t, batch_size = model.test(sess, reviews,
                                                    reviews_length, labels,
                                                    1.0)
                total_reviews += batch_size
                accuracy += accuracy_t * batch_size
            print "accuracy %.4f in %d test reviews" % (
                accuracy / total_reviews, total_reviews)
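
The utils module used in this example is not shown. The sketch below reconstructs plausible versions of the three preprocessing helpers from their call sites: build_vocab keeps the most frequent words, vectorize maps each tokenized review to a list of word indices, and one_hot expands integer labels into one-hot rows. The reserved indices for padding and unknown words are assumptions:

# Assumed signatures reconstructed from the calls above; the repository's
# real utils module may differ in detail.
from collections import Counter
import numpy as np

def build_vocab(sentences, max_words=10000):
    counts = Counter(w for sent in sentences for w in sent)
    # index 0 reserved for padding, index 1 for out-of-vocabulary words
    return {w: i + 2 for i, (w, _) in enumerate(counts.most_common(max_words))}

def vectorize(sentences, word_dict, verbose=False):
    vectors = []
    for i, sent in enumerate(sentences):
        vectors.append([word_dict.get(w, 1) for w in sent])
        if verbose and i % 5000 == 0:
            print("vectorized %d reviews" % i)
    return vectors

def one_hot(labels, nb_classes):
    out = np.zeros((len(labels), nb_classes), dtype="float32")
    out[np.arange(len(labels)), labels] = 1.0
    return out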
Example #4
def main(args):
    print "loading data and labels from dataset"
    train = pd.read_csv(args.train_dir)
    ch_train = pd.read_csv(args.chtrain_dir)
    x_train = train["comment_text"]
    x_chtrain = ch_train["comment_text"]
    target_cols = [
        'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'
    ]

    x = []
    x_ch = []
    for line in x_train:
        if len(line) > 0:
            x.append(utils.review_to_wordlist(line.strip()))
    print "loaded %d comments from dataset" % len(x)
    for line in x_chtrain:
        if len(line) > 0:
            x_ch.append(utils.review_to_wordlist_char(line.strip()))
    print "loaded %d char comments from dataset" % len(x_ch)
    y = train[target_cols].values

    index2word, word2index = utils.load_vocab(args.vocab_dir)
    index2char, char2index = utils.load_char(args.char_dir)
    x_vector = utils.vectorize(x, word2index, verbose=False)
    x_vector = np.array(x_vector)
    char_vector = utils.vectorize_char(x_ch, char2index, verbose=False)
    char_vector = np.array(char_vector)
    print char_vector[0]

    save_dir = os.path.join(args.save_dir, args.model_type)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    if args.model_type in ["cnn", "cnnfe", "chcnn", "chcnn2"]:
        max_step = args.max_step_cnn
        max_size = args.max_size_cnn
        nb_epochs = args.nb_epochs_cnn
    elif args.model_type in [
            "rnn", "rnnfe", "rnnfe2", "chrnn", "chrnnfe", "rcnn"
    ]:
        max_step = args.max_step_rnn
        max_size = args.max_size_rnn
        nb_epochs = args.nb_epochs_rnn

    ex_features = add_features("../data/train.csv")
    nfolds = args.nfolds
    skf = KFold(n_splits=nfolds, shuffle=True, random_state=2018)
    test_prob = []
    stack_logits = np.zeros((len(x_vector), len(target_cols)))
    for (f, (train_index, test_index)) in enumerate(skf.split(x_vector)):
        x_train, x_eval = x_vector[train_index], x_vector[test_index]
        char_train, char_eval = char_vector[train_index], char_vector[
            test_index]
        y_train, y_eval = y[train_index], y[test_index]
        with tf.Graph().as_default():
            config_proto = utils.get_config_proto()
            sess = tf.Session(config=config_proto)
            if args.model_type == "cnn":
                model = TextCNN(args, "TextCNN")
            elif args.model_type == "cnnfe":
                model = TextCNNFE(args, "TextCNNFE")
            elif args.model_type == "rnn":
                model = TextRNN(args, "TextRNN")
            elif args.model_type == "rnnfe":
                model = TextRNNFE(args, "TextRNNFE")
            elif args.model_type == "rcnn":
                model = TextRCNN(args, "TextRCNN")
            elif args.model_type == "attention":
                model = RNNWithAttention(args, "Attention")
            elif args.model_type == "chrnn":
                model = TextRNNChar(args, "TextRNNChar")
            elif args.model_type == "chcnn":
                model = TextCNNChar(args, "TextCNNChar")
            elif args.model_type == "chcnn2":
                model = TextCNNChar(args, "TextCNNChar2")
            elif args.model_type == "rnnfe2":
                model = TextRNNFE2(args, "TextCNNCharFE2")
            elif args.model_type == "chrnnfe":
                model = TextRNNCharFE(args, "TextCNNCharFE")
            else:
                raise ValueError("Unknown model_type %s" % args.model_type)
            sess.run(tf.global_variables_initializer())

            if args.use_ft:
                pretrain_dir = args.ft_dir
                print "use FastText word vector"
                embedding = utils.load_fasttext(pretrain_dir, index2word)
            else:
                pretrain_dir = args.glove_dir
                print "use Glove word vector"
                embedding = utils.load_glove(pretrain_dir, index2word)
            sess.run(model.embedding_init,
                     {model.embedding_placeholder: embedding})

            for line in model.tvars:
                print line

            print "training %s model for toxic comments classification" % (
                args.model_type)
            print "%d fold start training" % f
            for epoch in range(1, nb_epochs + 1):
                print "epoch %d start with lr %f" % (
                    epoch,
                    model.learning_rate.eval(session=sess)), "\n", "- " * 50
                loss, total_comments = 0.0, 0
                if args.model_type in ["cnn", "rnn", "rcnn"]:
                    train_batch = utils.get_batches(x_train, y_train,
                                                    args.batch_size,
                                                    args.max_len)
                    valid_batch = utils.get_batches(x_eval, y_eval, max_size,
                                                    args.max_len, False)

                elif args.model_type in ["chrnn", "chcnn", "chcnn2"]:
                    train_batch = utils.get_batches_with_char(
                        x_train, char_train, y_train, args.batch_size,
                        args.max_len)
                    valid_batch = utils.get_batches_with_char(
                        x_eval, char_eval, y_eval, max_size, args.max_len,
                        False)

                elif args.model_type in ["rnnfe", "cnnfe", "rnnfe2"]:
                    train_batch = utils.get_batches_with_fe(
                        x_train, y_train, ex_features, args.batch_size,
                        args.max_len)
                    valid_batch = utils.get_batches_with_fe(
                        x_eval, y_eval, ex_features, max_size, args.max_len,
                        False)

                elif args.model_type in ["chrnnfe"]:
                    train_batch = utils.get_batches_with_charfe(
                        x_train, char_train, y_train, ex_features,
                        args.batch_size, args.max_len)
                    valid_batch = utils.get_batches_with_charfe(
                        x_eval, char_eval, y_eval, ex_features, max_size,
                        args.max_len, False)

                epoch_start_time = time.time()
                step_start_time = epoch_start_time
                for idx, batch in enumerate(train_batch):
                    if args.model_type in ["cnn", "rnn", "rcnn"]:
                        comments, comments_length, labels = batch
                        _, loss_t, global_step, batch_size = model.train(
                            sess, comments, comments_length, labels)

                    elif args.model_type in ["chrnn", "chcnn", "chcnn2"]:
                        comments, comments_length, chs, labels = batch
                        _, loss_t, global_step, batch_size = model.train(
                            sess, comments, comments_length, chs, labels)

                    elif args.model_type in ["rnnfe", "cnnfe", "rnnfe2"]:
                        comments, comments_length, exs, labels = batch
                        _, loss_t, global_step, batch_size = model.train(
                            sess, comments, comments_length, labels, exs)

                    elif args.model_type in ["chrnnfe"]:
                        comments, comments_length, chs, exs, labels = batch
                        _, loss_t, global_step, batch_size = model.train(
                            sess, comments, comments_length, chs, labels, exs)

                    loss += loss_t * batch_size
                    total_comments += batch_size

                    if global_step % 200 == 0:
                        print "epoch %d step %d loss %f time %.2fs" % (
                            epoch, global_step, loss_t,
                            time.time() - step_start_time)
                        _ = run_valid(valid_batch, model, sess,
                                      args.model_type)
                        # model.saver.save(sess, os.path.join(save_dir, "model.ckpt"), global_step=global_step)
                        step_start_time = time.time()

                epoch_time = time.time() - epoch_start_time
                sess.run(model.learning_rate_decay_op)
                print "%.2f seconds in this epoch with train loss %f" % (
                    epoch_time, loss / total_comments)

            test_prob.append(run_test(args, model, sess))
            stack_logits[test_index] = run_valid(valid_batch, model, sess,
                                                 args.model_type)

    preds = np.zeros((test_prob[0].shape[0], len(target_cols)))
    for prob in test_prob:
        preds += prob
        print prob[0]
    preds /= len(test_prob)
    print len(test_prob)
    write_predict(stack_logits, args.model_type)
    write_results(preds, args.model_type)
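
write_predict and write_results are defined elsewhere in the repository. The sketch below shows what they plausibly do with the out-of-fold matrix and the averaged test probabilities, assuming the usual Kaggle toxic-comment submission layout; the output file names and the sample_submission.csv path are assumptions:

# Hypothetical output helpers matching the calls above.
import pandas as pd

TARGET_COLS = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

def write_predict(stack_logits, model_type):
    # out-of-fold predictions on the training set, useful for stacking
    df = pd.DataFrame(stack_logits, columns=TARGET_COLS)
    df.to_csv("stack_%s.csv" % model_type, index=False)

def write_results(preds, model_type):
    # averaged test-set probabilities written in submission format
    sub = pd.read_csv("../data/sample_submission.csv")  # assumed path
    sub[TARGET_COLS] = preds
    sub.to_csv("submission_%s.csv" % model_type, index=False)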
Example #5
def run_test(args, model, sess):
    test = pd.read_csv(args.test_dir)
    ch_test = pd.read_csv(args.chtest_dir)
    x_test = test["comment_text"]
    x_chtest = ch_test["comment_text"]
    target_cols = [
        'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'
    ]

    x = []
    x_ch = []
    for line in x_test:
        if len(line) > 0:
            x.append(utils.review_to_wordlist(line.strip()))
    print "loaded %d comments from test dataset" % len(x)
    for line in x_chtest:
        if len(line) > 0:
            x_ch.append(utils.review_to_wordlist_char(line.strip()))
    print "loaded %d char comments from test dataset" % len(x_ch)

    index2word, word2index = utils.load_vocab(args.vocab_dir)
    index2char, char2index = utils.load_char(args.char_dir)
    x_vector = utils.vectorize(x, word2index, verbose=False)
    x_vector = np.array(x_vector)
    char_vector = utils.vectorize_char(x_ch, char2index, verbose=False)
    char_vector = np.array(char_vector)

    ex_features = add_features("../data/test.csv")
    if args.model_type in ["cnn"]:
        test_batch = utils.get_test_batches(x_vector, args.max_size_cnn,
                                            args.max_len)
    elif args.model_type in ["rnn", "rcnn"]:
        test_batch = utils.get_test_batches(x_vector, args.max_size_rnn,
                                            args.max_len)
    elif args.model_type in ["chrnn"]:
        test_batch = utils.get_test_batches_with_char(x_vector, char_vector,
                                                      args.max_size_rnn,
                                                      args.max_len)
    elif args.model_type in ["chcnn", "chcnn2"]:
        test_batch = utils.get_test_batches_with_char(x_vector, char_vector,
                                                      args.batch_size,
                                                      args.max_len)
    elif args.model_type in ["rnnfe", "cnnfe", "rnnfe2"]:
        test_batch = utils.get_test_batches_with_fe(x_vector, ex_features,
                                                    args.max_size_rnn,
                                                    args.max_len)
    elif args.model_type in ["chrnnfe"]:
        test_batch = utils.get_test_batches_with_charfe(
            x_vector, char_vector, ex_features, args.max_size_rnn,
            args.max_len)

    total_logits = []
    for batch in test_batch:
        if args.model_type in ["cnn", "rnn", "rcnn"]:
            comments, comments_length = batch
            logits = model.get_logits(sess, comments, comments_length).tolist()
        elif args.model_type in ["chrnn", "chcnn", "chcnn2"]:
            comments, comments_length, chs = batch
            logits = model.get_logits(sess, comments, comments_length,
                                      chs).tolist()
        elif args.model_type in ["rnnfe", "cnnfe", "rnnfe2"]:
            comments, comments_length, exs = batch
            logits = model.get_logits(sess, comments, comments_length,
                                      exs).tolist()
        elif args.model_type in ["chrnnfe"]:
            comments, comments_length, chs, exs = batch
            logits = model.get_logits(sess, comments, comments_length, chs,
                                      exs).tolist()
        total_logits += logits
    return np.array(total_logits)
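
The get_test_batches family also lives in the unseen utils module. A minimal sketch of the plain word-only variant, reconstructed from how its batches are unpacked above (padded comment indices plus per-comment lengths, no labels):

# Sketch of the assumed get_test_batches helper: fixed-size batches of
# index sequences, zero-padded to max_len, plus each sequence's true length.
import numpy as np

def get_test_batches(x_vector, batch_size, max_len):
    for start in range(0, len(x_vector), batch_size):
        chunk = [seq[:max_len] for seq in x_vector[start:start + batch_size]]
        lengths = np.array([len(seq) for seq in chunk], dtype="int32")
        padded = np.zeros((len(chunk), max_len), dtype="int32")
        for i, seq in enumerate(chunk):
            padded[i, :len(seq)] = seq
        yield padded, lengths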