def get_clean_reviews(articles):
    """Return one cleaned word list per entry of ``articles["review"]``.

    Every review text is passed through ``review_to_wordlist`` with
    ``remove_stopwords=True``; results keep the order of the input column.
    """
    return [
        review_to_wordlist(text, remove_stopwords=True)
        for text in articles["review"]
    ]
# Build clean_train_reviews and clean_test_reviews the same way as before.

# Load the labeled training set and the test set from the local data folder.
_data_dir = join(os.path.dirname(__file__), 'data')
train_file = join(_data_dir, 'labeledTrainData.tsv')
train = pd.read_csv(train_file, header=0, delimiter="\t", quoting=3)
test_file = join(_data_dir, 'testData.tsv')
test = pd.read_csv(test_file, header=0, delimiter="\t", quoting=3)

# A parenthesised single-argument print behaves the same as the print
# statement on Python 2 and is also valid on Python 3.
print("Cleaning training reviews")
clean_train_reviews = [
    review_to_wordlist(review, remove_stopwords=True)
    for review in train["review"]
]

print("Cleaning test reviews")
clean_test_reviews = [
    review_to_wordlist(review, remove_stopwords=True)
    for review in test["review"]
]

# ****** Create bags of centroids
# One float32 row per training review and one column per cluster,
# allocated up front (for speed).
train_centroids = np.zeros((len(train["review"]), num_clusters),
                           dtype="float32")
# Transform the training set reviews into bags of centroids
def main(args):
    """Train a review sentiment classifier and report accuracy every epoch.

    Reads ``data/labeledTrainData.tsv.zip``, turns every non-empty review
    into a word list, builds a vocabulary (``max_words=10000``) and
    vectorizes the reviews.  The first 20000 examples are used for training
    and the remaining ones for evaluation.  The model selected by
    ``args.model_type`` is trained for ``args.nb_epochs`` epochs; summaries
    are written to ``args.log_dir`` and train loss/accuracy plus test
    accuracy are printed after each epoch.

    Args:
        args: Namespace providing ``model_type`` ("cnn", "rnn" or
            "bi_rnn"), ``nb_classes``, ``save_dir``, ``log_dir``,
            ``max_size``, ``batch_size``, ``keep_prob`` and ``nb_epochs``.
            It is also handed to the model constructors.

    Raises:
        ValueError: If ``args.model_type`` is not a supported model type.
    """
    # Fail before any data is loaded; previously an unsupported type only
    # surfaced later as a NameError on ``model`` / ``train_batch``.
    is_cnn = args.model_type == "cnn"
    if not is_cnn and args.model_type not in ["rnn", "bi_rnn"]:
        raise ValueError("Unknown model_type %s" % args.model_type)
    # The RNN variants ask the batch helper for its "rnn" batch type.
    batch_kwargs = {} if is_cnn else {"type": "rnn"}

    print("loadding reviews and labels from dataset")
    data = pd.read_csv('data/labeledTrainData.tsv.zip', compression='zip',
                       delimiter='\t', header=0, quoting=3)

    # Reviews and labels are filtered together so that a skipped (empty)
    # review can never shift the labels relative to the sentences.
    sentences = []
    labels = []
    for review, sentiment in zip(data["review"], data["sentiment"]):
        if len(review) > 0:
            # Only byte strings need decoding; text that is already
            # unicode is used as is.
            if isinstance(review, bytes):
                review = review.decode('utf8')
            sentences.append(
                utils.review_to_wordlist(review.strip(),
                                         remove_stopwords=True))
            labels.append(sentiment)
    print("loaded %d reviews from dataset" % len(sentences))

    word_dict = utils.build_vocab(sentences, max_words=10000)
    vec_reviews = utils.vectorize(sentences, word_dict, verbose=True)

    # Fixed split: leading examples for training, the rest for evaluation.
    train_size = 20000
    train_x = vec_reviews[0:train_size]
    train_y = utils.one_hot(labels[0:train_size], args.nb_classes)
    test_x = vec_reviews[train_size:]
    test_y = utils.one_hot(labels[train_size:], args.nb_classes)

    save_dir = args.save_dir
    log_dir = args.log_dir
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    with tf.Graph().as_default():
        config_proto = utils.get_config_proto()
        # The context manager closes the session (and releases its
        # resources) even when training is interrupted by an exception.
        with tf.Session(config=config_proto) as sess:
            if is_cnn:
                model = TextCNN(args, "TextCNN")
            else:
                model = TextRNN(args, "TextRNN")
            # NOTE(review): test_batch is built once and iterated again in
            # every epoch, which assumes utils.get_batches returns a
            # re-iterable sequence rather than a one-shot generator.
            test_batch = utils.get_batches(test_x, test_y, args.max_size,
                                           **batch_kwargs)

            sess.run(tf.global_variables_initializer())

            summary_writer = tf.summary.FileWriter(log_dir, sess.graph)
            try:
                for epoch in range(1, args.nb_epochs + 1):
                    print("epoch %d start" % epoch)
                    print("- " * 50)
                    loss = 0.
                    total_reviews = 0
                    accuracy = 0.

                    # Training batches are rebuilt at the start of each
                    # epoch.
                    train_batch = utils.get_batches(
                        train_x, train_y, args.batch_size, **batch_kwargs)

                    epoch_start_time = time.time()
                    step_start_time = epoch_start_time
                    for batch_reviews, batch_lengths, batch_labels in \
                            train_batch:
                        (_, loss_t, accuracy_t, global_step, batch_size,
                         summaries) = model.train(
                            sess, batch_reviews, batch_lengths,
                            batch_labels, args.keep_prob)

                        # Weight per-batch means by batch size so the epoch
                        # figures are per-example averages.
                        loss += loss_t * batch_size
                        total_reviews += batch_size
                        accuracy += accuracy_t * batch_size
                        summary_writer.add_summary(summaries, global_step)

                        if global_step % 50 == 0:
                            print("epoch %d, step %d, loss %f, "
                                  "accuracy %.4f, time %.2fs" % (
                                      epoch, global_step, loss_t,
                                      accuracy_t,
                                      time.time() - step_start_time))
                            step_start_time = time.time()

                    epoch_time = time.time() - epoch_start_time
                    print("%.2f seconds in this epoch" % (epoch_time))
                    print("train loss %f, train accuracy %.4f" % (
                        loss / total_reviews, accuracy / total_reviews))

                    # Evaluate on the held-out reviews with dropout
                    # disabled (keep probability 1.0).
                    total_reviews = 0
                    accuracy = 0.
                    for batch_reviews, batch_lengths, batch_labels in \
                            test_batch:
                        accuracy_t, batch_size = model.test(
                            sess, batch_reviews, batch_lengths,
                            batch_labels, 1.0)
                        total_reviews += batch_size
                        accuracy += accuracy_t * batch_size
                    print("accuracy %.4f in %d test reviews" % (
                        accuracy / total_reviews, total_reviews))
            finally:
                # Flush pending summaries and release the event file.
                summary_writer.close()
def _build_toxic_model(args):
    """Instantiate the network selected by ``args.model_type``.

    The second constructor argument of every model is reproduced exactly
    as it was, including the entries whose string does not match the class
    name, so previously used names stay valid.

    Raises:
        ValueError: If ``args.model_type`` names no known model.
    """
    model_type = args.model_type
    if model_type == "cnn":
        return TextCNN(args, "TextCNN")
    if model_type == "cnnfe":
        return TextCNNFE(args, "TextCNNFE")
    if model_type == "rnn":
        return TextRNN(args, "TextRNN")
    if model_type == "rnnfe":
        return TextRNNFE(args, "TextRNNFE")
    if model_type == "rcnn":
        return TextRCNN(args, "TextRCNN")
    if model_type == "attention":
        return RNNWithAttention(args, "Attention")
    if model_type == "chrnn":
        return TextRNNChar(args, "TextRNNChar")
    if model_type == "chcnn":
        return TextCNNChar(args, "TextCNNChar")
    if model_type == "chcnn2":
        return TextCNNChar(args, "TextCNNChar2")
    if model_type == "rnnfe2":
        return TextRNNFE2(args, "TextCNNCharFE2")
    if model_type == "chrnnfe":
        return TextRNNCharFE(args, "TextCNNCharFE")
    raise ValueError("Unknown model_type %s" % model_type)


def _make_toxic_batches(args, x, chars, y, ex_features, batch_size, *extra):
    """Build the batches for one data split with the matching batch helper.

    ``extra`` is forwarded positionally after ``args.max_len``; the
    validation split passes ``False`` there, the training split passes
    nothing and so keeps the helper's default.

    Raises:
        ValueError: If no batch helper is wired up for ``args.model_type``.
    """
    model_type = args.model_type
    if model_type in ["cnn", "rnn", "rcnn"]:
        return utils.get_batches(x, y, batch_size, args.max_len, *extra)
    if model_type in ["chrnn", "chcnn", "chcnn2"]:
        return utils.get_batches_with_char(
            x, chars, y, batch_size, args.max_len, *extra)
    if model_type in ["rnnfe", "cnnfe", "rnnfe2"]:
        return utils.get_batches_with_fe(
            x, y, ex_features, batch_size, args.max_len, *extra)
    if model_type in ["chrnnfe"]:
        return utils.get_batches_with_charfe(
            x, chars, y, ex_features, batch_size, args.max_len, *extra)
    raise ValueError("Unsupported model_type %s" % model_type)


def _toxic_train_step(model_type, model, sess, batch):
    """Run one training step and return ``(loss, global_step, batch_size)``.

    The batch layout depends on the model family, and ``model.train`` takes
    the labels before the extra features, so each family is unpacked and
    forwarded explicitly.
    """
    if model_type in ["cnn", "rnn", "rcnn"]:
        comments, comments_length, labels = batch
        result = model.train(sess, comments, comments_length, labels)
    elif model_type in ["chrnn", "chcnn", "chcnn2"]:
        comments, comments_length, chs, labels = batch
        result = model.train(sess, comments, comments_length, chs, labels)
    elif model_type in ["rnnfe", "cnnfe", "rnnfe2"]:
        comments, comments_length, exs, labels = batch
        result = model.train(sess, comments, comments_length, labels, exs)
    elif model_type in ["chrnnfe"]:
        comments, comments_length, chs, exs, labels = batch
        result = model.train(
            sess, comments, comments_length, chs, labels, exs)
    else:
        raise ValueError("Unsupported model_type %s" % model_type)
    _, loss_t, global_step, batch_size = result
    return loss_t, global_step, batch_size


def main(args):
    """Train a K-fold ensemble for toxic comment classification.

    Loads the word-level (``args.train_dir``) and character-level
    (``args.chtrain_dir``) training comments, vectorizes them, and runs
    ``args.nfolds``-fold cross validation.  For every fold a fresh graph,
    session and model are created, the embedding is initialised from
    FastText (``args.use_ft``) or GloVe vectors, and the model is trained
    with validation every 200 global steps.  After each fold the test-set
    predictions from ``run_test`` are collected and the ``run_valid``
    output for the held-out rows is stored for stacking.  Finally the test
    predictions are averaged over the folds and both result sets are
    written out with ``write_predict`` / ``write_results``.

    Args:
        args: Namespace with the data/vocabulary paths, ``model_type``,
            ``save_dir``, ``nfolds``, ``batch_size``, ``max_len``,
            ``use_ft``, ``ft_dir``/``glove_dir`` and the per-family
            ``max_size_*`` / ``nb_epochs_*`` settings.

    Raises:
        ValueError: If ``args.model_type`` has no batch-size/epoch
            configuration in this function.
    """
    # Resolve the per-family settings first so an unsupported model type
    # fails before any data is loaded.  "attention" can be constructed by
    # _build_toxic_model but has no settings or batch helper here; it used
    # to crash later with a NameError.
    if args.model_type in ["cnn", "cnnfe", "chcnn", "chcnn2"]:
        max_size = args.max_size_cnn
        nb_epochs = args.nb_epochs_cnn
    elif args.model_type in [
            "rnn", "rnnfe", "rnnfe2", "chrnn", "chrnnfe", "rcnn"
    ]:
        max_size = args.max_size_rnn
        nb_epochs = args.nb_epochs_rnn
    else:
        raise ValueError("Unsupported model_type %s" % args.model_type)

    print("loadding data and labels from dataset")
    train = pd.read_csv(args.train_dir)
    ch_train = pd.read_csv(args.chtrain_dir)
    target_cols = [
        'toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
        'identity_hate'
    ]

    x = [
        utils.review_to_wordlist(line.strip())
        for line in train["comment_text"] if len(line) > 0
    ]
    print("loaded %d comments from dataset" % len(x))
    x_ch = [
        utils.review_to_wordlist_char(line.strip())
        for line in ch_train["comment_text"] if len(line) > 0
    ]
    # Report the character-level count (this used to repeat len(x)).
    print("loaded %d comments from dataset" % len(x_ch))
    y = train[target_cols].values

    index2word, word2index = utils.load_vocab(args.vocab_dir)
    _, char2index = utils.load_char(args.char_dir)
    x_vector = np.array(utils.vectorize(x, word2index, verbose=False))
    char_vector = np.array(
        utils.vectorize_char(x_ch, char2index, verbose=False))
    print(char_vector[0])

    save_dir = os.path.join(args.save_dir, args.model_type)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    # NOTE(review): ex_features covers the whole training file and is
    # passed to the batch helpers unsliced, while x/y are fold subsets.
    # Confirm the helpers keep features and comments aligned.
    ex_features = add_features("../data/train.csv")

    nfolds = args.nfolds
    skf = KFold(n_splits=nfolds, shuffle=True, random_state=2018)
    test_prob = []
    stack_logits = np.zeros((len(x_vector), len(target_cols)))
    for (f, (train_index, test_index)) in enumerate(skf.split(x_vector)):
        x_train, x_eval = x_vector[train_index], x_vector[test_index]
        char_train, char_eval = (char_vector[train_index],
                                 char_vector[test_index])
        y_train, y_eval = y[train_index], y[test_index]
        with tf.Graph().as_default():
            config_proto = utils.get_config_proto()
            # Close each fold's session on exit; otherwise every fold
            # leaves an open session behind.
            with tf.Session(config=config_proto) as sess:
                model = _build_toxic_model(args)
                sess.run(tf.global_variables_initializer())

                if args.use_ft:
                    pretrain_dir = args.ft_dir
                    print("use FastText word vector")
                    embedding = utils.load_fasttext(pretrain_dir,
                                                    index2word)
                else:
                    pretrain_dir = args.glove_dir
                    print("use Glove word vector")
                    embedding = utils.load_glove(pretrain_dir, index2word)
                sess.run(model.embedding_init,
                         {model.embedding_placeholder: embedding})

                for line in model.tvars:
                    print(line)

                print("training %s model for toxic comments "
                      "classification" % (args.model_type))
                print("%d fold start training" % f)
                for epoch in range(1, nb_epochs + 1):
                    # Single formatted string that reproduces the output
                    # of the former three-item print statement.
                    print("epoch %d start with lr %f \n%s" % (
                        epoch, model.learning_rate.eval(session=sess),
                        "- " * 50))
                    loss, total_comments = 0.0, 0
                    train_batch = _make_toxic_batches(
                        args, x_train, char_train, y_train, ex_features,
                        args.batch_size)
                    valid_batch = _make_toxic_batches(
                        args, x_eval, char_eval, y_eval, ex_features,
                        max_size, False)

                    epoch_start_time = time.time()
                    step_start_time = epoch_start_time
                    for batch in train_batch:
                        loss_t, global_step, batch_size = \
                            _toxic_train_step(args.model_type, model,
                                              sess, batch)
                        loss += loss_t * batch_size
                        total_comments += batch_size

                        if global_step % 200 == 0:
                            print("epoch %d step %d loss %f time %.2fs" % (
                                epoch, global_step, loss_t,
                                time.time() - step_start_time))
                            run_valid(valid_batch, model, sess,
                                      args.model_type)
                            # Checkpointing is currently disabled:
                            # model.saver.save(sess, os.path.join(save_dir, "model.ckpt"), global_step=global_step)
                            step_start_time = time.time()

                    epoch_time = time.time() - epoch_start_time
                    sess.run(model.learning_rate_decay_op)
                    print("%.2f seconds in this epoch with train loss %f"
                          % (epoch_time, loss / total_comments))

                # Fold finished: keep its test predictions for averaging
                # and its held-out outputs for stacking.
                test_prob.append(run_test(args, model, sess))
                stack_logits[test_index] = run_valid(
                    valid_batch, model, sess, args.model_type)

    # Average the per-fold test predictions.
    preds = np.zeros((test_prob[0].shape[0], len(target_cols)))
    for prob in test_prob:
        preds += prob
        print(prob[0])
    preds /= len(test_prob)
    print(len(test_prob))
    write_predict(stack_logits, args.model_type)
    write_results(preds, args.model_type)
def run_test(args, model, sess):
    """Run ``model`` over the test set and return its outputs as an array.

    Loads the word-level (``args.test_dir``) and character-level
    (``args.chtest_dir``) test comments, vectorizes them with the stored
    vocabularies, builds the test batches that match ``args.model_type``
    and concatenates the ``model.get_logits`` output of every batch.

    Args:
        args: Namespace with the test/vocabulary paths, ``model_type``,
            ``max_len``, ``batch_size`` and ``max_size_cnn`` /
            ``max_size_rnn``.
        model: Model exposing ``get_logits(sess, ...)``.
        sess: Session the model is evaluated in.

    Returns:
        ``np.ndarray`` holding the per-batch outputs stacked in batch order.

    Raises:
        ValueError: If no test batch helper exists for ``args.model_type``.
    """
    test = pd.read_csv(args.test_dir)
    ch_test = pd.read_csv(args.chtest_dir)

    x = [
        utils.review_to_wordlist(line.strip())
        for line in test["comment_text"] if len(line) > 0
    ]
    print("loaded %d comments from test dataset" % len(x))
    x_ch = [
        utils.review_to_wordlist_char(line.strip())
        for line in ch_test["comment_text"] if len(line) > 0
    ]
    # Report the character-level count (this used to repeat len(x)).
    print("loaded %d comments from dataset" % len(x_ch))

    _, word2index = utils.load_vocab(args.vocab_dir)
    _, char2index = utils.load_char(args.char_dir)
    x_vector = np.array(utils.vectorize(x, word2index, verbose=False))
    char_vector = np.array(
        utils.vectorize_char(x_ch, char2index, verbose=False))
    ex_features = add_features("../data/test.csv")

    model_type = args.model_type
    if model_type in ["cnn"]:
        test_batch = utils.get_test_batches(
            x_vector, args.max_size_cnn, args.max_len)
    elif model_type in ["rnn", "rcnn"]:
        test_batch = utils.get_test_batches(
            x_vector, args.max_size_rnn, args.max_len)
    elif model_type in ["chrnn"]:
        test_batch = utils.get_test_batches_with_char(
            x_vector, char_vector, args.max_size_rnn, args.max_len)
    elif model_type in ["chcnn", "chcnn2"]:
        test_batch = utils.get_test_batches_with_char(
            x_vector, char_vector, args.batch_size, args.max_len)
    elif model_type in ["rnnfe", "cnnfe", "rnnfe2"]:
        test_batch = utils.get_test_batches_with_fe(
            x_vector, ex_features, args.max_size_rnn, args.max_len)
    elif model_type in ["chrnnfe"]:
        test_batch = utils.get_test_batches_with_charfe(
            x_vector, char_vector, ex_features, args.max_size_rnn,
            args.max_len)
    else:
        # Previously an unsupported type left test_batch undefined and
        # failed below with a NameError.
        raise ValueError("Unknown model_type %s" % model_type)

    total_logits = []
    for batch in test_batch:
        # Each family's batch -- (comments, lengths), (comments, lengths,
        # chs), (comments, lengths, exs) or (comments, lengths, chs, exs)
        # -- is passed to get_logits positionally in that same order.
        logits = model.get_logits(sess, *batch).tolist()
        total_logits.extend(logits)
    return np.array(total_logits)