def __init__(
        self,
        x_train,
        y_train,
        x_test,
        y_test,
        fnames_train,
        fnames_test,
        embedding,
        MAX_SEQUENCE_LENGTH,
        EMBEDDING_DIM=300,
        OPTIMIZER='adam',
        logger=None,
        opts=None,
        by_acc=True,
        n_classes=4,
):
    """ Vars """
    MODEL = opts.model
    # FOR RNN
    HIDDEN_UNITS = 4
    NUM_LAYERS = 1
    BATCH_SIZE = 64  # Any size is accepted

    # Encode string labels as integer ids, then one-hot them.
    labelencoder = LabelEncoder()
    y_train_ = np.array(y_train).astype(str)
    y_test_ = np.array(y_test).astype(str)
    labelencoder.fit(y_train_)
    y_train_ = labelencoder.transform(y_train_)
    y_test_ = labelencoder.transform(y_test_)
    n_values = len(np.unique(y_train_))
    # To one-hot
    y_train = to_categorical(y_train_, n_values)
    y_test = to_categorical(y_test_, n_values)
    logger.info("texts_rep_train: {}".format(x_train.shape))
    logger.info("y_train: {}".format(y_train.shape))

    # --------------------------------------------------
    # Tensorflow
    # --------------------------------------------------
    batch_size = tf.placeholder(tf.int64, name="batch_size")
    X = tf.placeholder(tf.int64, shape=[None, MAX_SEQUENCE_LENGTH])
    fnames_plc = tf.placeholder(tf.string, shape=[None], name="fnames_plc")
    y = tf.placeholder(tf.int64, shape=[None, n_classes])
    is_training = tf.placeholder_with_default(False, shape=[],
                                              name='is_training')
    dropout_keep_prob = tf.placeholder_with_default(1.0, shape=())
    lr = tf.placeholder(tf.float32, shape=[])

    # The pretrained embedding matrix becomes a trainable variable.
    # (glove_weights_initializer is unused: the variable below is
    # initialized directly from `embedding`.)
    glove_weights_initializer = tf.constant_initializer(embedding)
    print(glove_weights_initializer)
    embeddings = tf.Variable(
        # tf.random_uniform([2000, EMBEDDING_DIM], -1.0, 1.0)
        embedding)
    print(embeddings)

    """ GET THE MODEL """
    if MODEL == "CNN":
        logits = CNN.get_model(X,
                               W=embeddings,
                               is_training=is_training,
                               filters=opts.filters,
                               n_classes=n_classes,
                               logger=logger)
    elif MODEL == "RNN":
        logits = RNN.get_model(X,
                               W=embeddings,
                               dropout_keep_prob=dropout_keep_prob,
                               hidden_size=HIDDEN_UNITS,
                               n_classes=n_classes,
                               num_layers=NUM_LAYERS)
    print(logits)
    softmax = tf.nn.softmax(logits)
    num_params = np.sum([
        np.prod(v.get_shape().as_list()) for v in tf.trainable_variables()
    ])
    print("{} params to train".format(num_params))
    train_op, loss = train_ops.train_op(logits,
                                        y=y,
                                        learning_rate=lr,
                                        optimizer=OPTIMIZER)

    """ Embedding test """
    train_dataset = tf.data.Dataset.from_tensor_slices(
        (X, y, fnames_plc)).batch(batch_size).shuffle(buffer_size=12)
    test_dataset = tf.data.Dataset.from_tensor_slices(
        (X, y, fnames_plc)).batch(batch_size).shuffle(buffer_size=12)
    train_data = (x_train, y_train, fnames_train)
    test_data = (x_test, y_test, fnames_test)
    # Create an iterator of the correct shape and type.
    iter = tf.data.Iterator.from_structure(train_dataset.output_types,
                                           train_dataset.output_shapes)
    iter_test = tf.data.Iterator.from_structure(test_dataset.output_types,
                                                test_dataset.output_shapes)
    # Create the initialisation operations.
    train_init_op = iter.make_initializer(train_dataset)
    test_init_op = iter_test.make_initializer(test_dataset)
    epoch_start = 0

    ## Train
    sess = tf.Session()
    init_g = tf.global_variables_initializer()
    init_l = tf.local_variables_initializer()
    sess.run(init_g)
    sess.run(init_l)
    best_loss = 99999
    best_acc = 0
    for epoch in range(epoch_start, opts.epochs + 1):
        sess.run(train_init_op,
                 feed_dict={
                     X: train_data[0],
                     y: train_data[1],
                     fnames_plc: train_data[2],
                     batch_size: BATCH_SIZE,
                 })
        current_batch_index = 0
        # Note: calling get_next() every epoch adds new ops to the graph;
        # hoisting it out of the loop would avoid that.
        next_element = iter.get_next()
        loss_count = 0
        while True:
            try:
                data = sess.run([next_element])
            except tf.errors.OutOfRangeError:
                break
            current_batch_index += 1
            data = data[0]
            batch_x, batch_tgt, batch_fnames = data
            _, loss_result = sess.run(
                [train_op, loss],
                feed_dict={
                    X: batch_x,
                    y: batch_tgt,
                    lr: train_ops.lr_scheduler(epoch),
                    batch_size: BATCH_SIZE,
                    is_training: True,
                    dropout_keep_prob: 0.3,
                })
            # print("Loss: {}".format(loss_result))
            loss_count += loss_result
        loss_count = loss_count / current_batch_index
        acc = 0
        if not by_acc:
            # Model selection by training loss.
            if loss_count < best_loss:
                best_loss = loss_count
                logger.info("New best_loss : {}".format(best_loss))
                sesions.save_best(sess, opts.work_dir)
        else:
            """ ---------------- dev """
            # Model selection by accuracy; note this "dev" pass runs on the
            # test split.
            sess.run(test_init_op,
                     feed_dict={
                         X: test_data[0],
                         y: test_data[1],
                         fnames_plc: test_data[2],
                         batch_size: BATCH_SIZE,
                     })
            current_batch_index = 0
            next_element = iter_test.get_next()
            while True:
                try:
                    data = sess.run([next_element])
                except tf.errors.OutOfRangeError:
                    break
                current_batch_index += 1
                data = data[0]
                batch_x, batch_tgt, batch_fnames = data
                results = sess.run(
                    [softmax],
                    feed_dict={
                        X: batch_x,
                        y: batch_tgt,
                        fnames_plc: batch_fnames,
                        batch_size: BATCH_SIZE,
                        dropout_keep_prob: 1.0,
                        lr: train_ops.lr_scheduler(1)
                    })
                acc_aux = metrics.accuracy(X=results[0], y=batch_tgt)
                acc += acc_aux
            acc = acc / current_batch_index
            logger.info("----------")
            logger.info("Loss on epoch {} : {} - LR: {}".format(
                epoch, loss_count, train_ops.lr_scheduler(epoch)))
            logger.info("Acc Val Test {}".format(acc))
            if acc > best_acc:
                best_acc = acc
                logger.info("New acc : {}".format(best_acc))
                sesions.save_best(sess, opts.work_dir)

    """ ----------------- TEST ----------------- """
    logger.info("\n-- TEST --\n")
    logger.info("Restoring the best Checkpoint.")
    # restore_file = sesions.restore_from_best(sess, save_path)
    restore_file = sesions.restore_from_best(sess, opts.work_dir)
    if restore_file:
        logger.info("Best model restored")
    else:
        logger.info("Can't restore the best model.")
        exit()
    sess.run(test_init_op,
             feed_dict={
                 X: test_data[0],
                 y: test_data[1],
                 fnames_plc: test_data[2],
                 batch_size: BATCH_SIZE,
             })
    current_batch_index = 0
    next_element = iter_test.get_next()
    classifieds = []
    acc = 0  # reset: acc still holds the value from the last dev pass
    while True:
        try:
            data = sess.run([next_element])
        except tf.errors.OutOfRangeError:
            break
        current_batch_index += 1
        data = data[0]
        batch_x, batch_tgt, batch_fnames = data
        results = sess.run(
            [softmax],
            feed_dict={
                X: batch_x,
                y: batch_tgt,
                fnames_plc: batch_fnames,
                batch_size: BATCH_SIZE,
                dropout_keep_prob: 1.0,
                lr: train_ops.lr_scheduler(1)
            })
        results = results[0]
        acc_aux = metrics.accuracy(X=results, y=batch_tgt)
        acc += acc_aux
        for i in range(len(results)):
            hyp = [np.argmax(results[i], axis=-1)]
            hyp = labelencoder.inverse_transform(hyp)[0]  # real label
            doc_name = batch_fnames[i].decode("utf-8").split("/")[-1]
            tgt = [np.argmax(batch_tgt[i], axis=-1)]
            tgt = labelencoder.inverse_transform(tgt)[0]  # real label
            # To vote per document later.
            classifieds.append((hyp, doc_name, tgt))
    # print(classifieds)
    # exit()
    acc = acc / current_batch_index
    logger.info("----------")
    logger.info("Acc Val Test {}".format(acc))
    # per_doc = classify_per_doc(classifieds, logger=logger)
    # logger.info("----------")
    # logger.info("Acc Val Test Per document votation {}".format(metrics.accuracy_per_doc(per_doc)))
    # logger.info("----------")
    # [print(x) for x in classifieds_to_write]
    self.results = classifieds
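# ---------------------------------------------------------------------------
# Standalone sketch (not part of the class above) of the label pipeline used
# in the constructor: LabelEncoder maps string labels to integer ids,
# to_categorical one-hots them, and inverse_transform recovers the original
# label from an argmax at prediction time. The tf.keras import path is an
# assumption; the original may import to_categorical from standalone keras.
# ---------------------------------------------------------------------------
import numpy as np
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

labels = np.array(["bot", "human", "bot", "human"]).astype(str)
enc = LabelEncoder().fit(labels)
ids = enc.transform(labels)                        # e.g. [0, 1, 0, 1]
onehot = to_categorical(ids, len(np.unique(ids)))  # shape (4, 2)
hyp = enc.inverse_transform([int(np.argmax(onehot[0], axis=-1))])[0]
print(hyp)                                         # 'bot'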
def __init__(self,
             layers=[512, 256, 128, 64, 32],
             filters=[64, 128, 256, 512],
             MODEL="FF",
             HIDDEN_UNITS=32,
             NUM_LAYERS=2,
             do_val=False,
             OPTIMIZER='adam',
             DEV_SPLIT=0.2,
             NUM_EPOCH=50,
             min_ngram=1,
             up=5,
             max_features=None,
             dataset="PAN2019",
             logger=None,
             opts=None,
             DEBUG=False,
             lang="es"):
    """ Vars """
    BATCH_SIZE = opts.batch_size
    logger = logger or logging.getLogger(__name__)
    # MODEL = "RNN"
    # MODEL = "CNN"
    if not DEBUG:
        ## PAN
        path = opts.tr_data + '/' + lang
        path_test = opts.i + '/' + lang
        sent = 0
        if lang == 'en':
            sent = 0
        if do_val:
            txt_train = opts.file_i + "/{}/truth-train.txt".format(lang)
            txt_dev = opts.file_i + "/{}/truth-dev.txt".format(lang)
            dt_train = process.PAN2019(path=path,
                                       txt=txt_train,
                                       join_all=MODEL == "FF",
                                       sentiment_id=sent)
            dt_dev = process.PAN2019(path=path,
                                     txt=txt_dev,
                                     join_all=MODEL == "FF",
                                     sentiment_id=sent)
            fnames_dev = dt_dev.fnames
            y_dev = dt_dev.y
            x_dev = dt_dev.X
            # sent_dev = dt_dev.sentiment
            del dt_dev
        else:
            txt_train = opts.file_i + "/{}/truth.txt".format(lang)
            dt_train = process.PAN2019(path=path,
                                       txt=txt_train,
                                       join_all=MODEL == "FF",
                                       sentiment_id=sent)
        dt_test = process.PAN2019_Test(path=path_test,
                                       join_all=MODEL == "FF",
                                       sentiment_id=sent)
        n_classes = 2  # bot or not bot
        # sent_train = dt_train.sentiment
        # sent_test = dt_test.sentiment
        x_train = dt_train.X
        y_train = dt_train.y
        y2_train = dt_train.y2
        print(len(x_train))
        print(len(y_train))
        print(len(y2_train))
        x_test = dt_test.X
        # y_test = dt_test.y
        fnames_train = dt_train.fnames
        fnames_test = dt_test.fnames

        # Encode string labels as integer ids, then one-hot them.
        labelencoder = LabelEncoder()
        y_train_ = np.array(y_train).astype(str)
        # y_test_ = np.array(y_test).astype(str)
        labelencoder.fit(y_train_)
        y_train_ = labelencoder.transform(y_train_)
        # y_test_ = labelencoder.transform(y_test_)
        n_values = len(np.unique(y_train_))
        # To one-hot
        y_train = to_categorical(y_train_, n_values)
        # y_test = to_categorical(y_test_, n_values)

        if max_features:
            rep = TfidfVectorizer(ngram_range=(min_ngram, up),
                                  max_features=max_features)
        else:
            rep = TfidfVectorizer(ngram_range=(min_ngram, up))
        del dt_train
        del dt_test
        logger.info("fit_transform tfidf")
        texts_rep_train = rep.fit_transform(x_train)
        logger.info("To array")
        texts_rep_train = texts_rep_train.toarray()
        logger.info("transform tfidf")
        text_test_rep = rep.transform(x_test)
        logger.info("To array")
        text_test_rep = text_test_rep.toarray()
        if do_val:
            text_dev_rep = rep.transform(x_dev)
            text_dev_rep = text_dev_rep.toarray()
            y_dev_ = np.array(y_dev).astype(str)
            y_dev_ = labelencoder.transform(y_dev_)
            y_dev = to_categorical(y_dev_, n_values)
        if MODEL == "CNN":
            # Group the per-tweet TF-IDF rows into one (num_tweets, vocab)
            # block per author so the CNN sees a 2-D input.
            num = opts.num_tweets
            texts_rep_train = texts_rep_train.reshape(
                int(texts_rep_train.shape[0] / num), num,
                texts_rep_train.shape[1])
            text_test_rep = text_test_rep.reshape(
                int(text_test_rep.shape[0] / num), num,
                text_test_rep.shape[1])
    else:
        logger.info(" --------------- DEBUG ON ------------------")
        n_classes = 2
        n_vcab = 10000
        train_data = 128
        dev_data = 50
        texts_rep_train = np.random.randn(train_data, 100, n_vcab)
        text_test_rep = np.random.randn(dev_data, 100, n_vcab)
        y_train = np.eye(n_classes)[np.random.choice(n_classes, train_data)]
        y_test = np.eye(n_classes)[np.random.choice(n_classes, dev_data)]
        alphabet = list(
            'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789')
        np_alphabet = np.array(alphabet, dtype="|S1")
        fnames_train = np.random.choice(np_alphabet, [train_data])
        fnames_test = np.random.choice(np_alphabet, [dev_data])
        logger.info("Random data created")
    # print(len(sent_train))
    # print(texts_rep_train.shape)
    # texts_rep_train = np.concatenate((texts_rep_train, np.expand_dims(sentiment.append([blob.sentiment.polarity, blob.sentiment.subjectivity]), axis=1)), axis=-1)
    # text_test_rep = np.concatenate((text_test_rep, np.expand_dims(sent_test, axis=1)), axis=-1)
    # text_dev_rep = np.concatenate((text_dev_rep, np.expand_dims(sent_dev, axis=1)), axis=-1)
    # texts_rep_train = np.concatenate((texts_rep_train, sent_train), axis=-1)
    logger.info("texts_rep_train: {}".format(texts_rep_train.shape))
    logger.info("y_train: {}".format(y_train.shape))
    # X_train, X_val, X_test, y_train, y_val, y_test, MAX_SEQUENCE_LENGTH = prepare_data(
    #     dir_word_embeddings, fname_vocab, train_path, test_path, EMBEDDING_DIM,
    #     VALIDATION_SPLIT=DEV_SPLIT, MAX_SEQUENCE_LENGTH=MAX_SEQUENCE_LENGTH)

    # --------------------------------------------------
    # Tensorflow
    # --------------------------------------------------
    batch_size = tf.placeholder(tf.int64, name="batch_size")
    if MODEL == "CNN":
        X = tf.placeholder(tf.float32,
                           shape=[
                               None, texts_rep_train.shape[1],
                               texts_rep_train.shape[2]
                           ],
                           name="X")
    else:
        X = tf.placeholder(tf.float32,
                           shape=[None, len(texts_rep_train[0])],
                           name="X")
    print(X)
    y = tf.placeholder(tf.int64, shape=[None, n_classes], name="y")
    fnames_plc = tf.placeholder(tf.string, shape=[None], name="fnames_plc")
    lr = tf.placeholder(tf.float32, shape=[], name="lr")
    is_training = tf.placeholder_with_default(False, shape=[],
                                              name='is_training')
    dropout_keep_prob = tf.placeholder_with_default(1.0, shape=(),
                                                    name="dropout")

    """ GET THE MODEL """
    if MODEL == "CNN":
        logits = CNN.get_model(X,
                               is_training=is_training,
                               filters=filters,
                               n_classes=n_classes,
                               tf_idf=True,
                               logger=logger)
    elif MODEL == "RNN":
        logits = RNN.get_model(X,
                               dropout_keep_prob,
                               hidden_size=HIDDEN_UNITS,
                               n_classes=n_classes,
                               num_layers=NUM_LAYERS)
    elif MODEL == "FF":
        logits = FF.get_model(X,
                              dropout_keep_prob,
                              is_training=is_training,
                              layers=layers,
                              n_classes=n_classes)
    logger.info(logits)
    softmax = tf.nn.softmax(logits)
    num_params = np.sum([
        np.prod(v.get_shape().as_list()) for v in tf.trainable_variables()
    ])
    logger.info("{} params to train".format(num_params))
    train_op, loss = train_ops.train_op(logits,
                                        y,
                                        learning_rate=lr,
                                        optimizer=OPTIMIZER)

    """ Embedding test """
    # Train/dev datasets carry labels; the unlabeled test split does not.
    train_dataset = tf.data.Dataset.from_tensor_slices(
        (X, y, fnames_plc)).batch(batch_size).shuffle(buffer_size=12)
    dev_dataset = tf.data.Dataset.from_tensor_slices(
        (X, y, fnames_plc)).batch(batch_size).shuffle(buffer_size=12)
    test_dataset = tf.data.Dataset.from_tensor_slices(
        (X, fnames_plc)).batch(batch_size).shuffle(buffer_size=12)
    train_data = (texts_rep_train, y_train, fnames_train)
    if do_val:
        dev_data = (text_dev_rep, y_dev, fnames_dev)
    test_data = (text_test_rep, fnames_test)
    print(text_test_rep.shape)
    print(len(fnames_test))
    # Create an iterator of the correct shape and type.
    iter = tf.data.Iterator.from_structure(train_dataset.output_types,
                                           train_dataset.output_shapes)
    iter_test = tf.data.Iterator.from_structure(test_dataset.output_types,
                                                test_dataset.output_shapes)
    # Create the initialisation operations.
    train_init_op = iter.make_initializer(train_dataset)
    dev_init_op = iter.make_initializer(dev_dataset)
    test_init_op = iter_test.make_initializer(test_dataset)
    epoch_start = 0

    ## Train
    sess = tf.Session()
    init_g = tf.global_variables_initializer()
    init_l = tf.local_variables_initializer()
    sess.run(init_g)
    sess.run(init_l)
    best_acc = 0
    for epoch in range(epoch_start, NUM_EPOCH + 1):
        sess.run(train_init_op,
                 feed_dict={
                     X: train_data[0],
                     y: train_data[1],
                     fnames_plc: train_data[2],
                     batch_size: BATCH_SIZE,
                 })
        current_batch_index = 0
        next_element = iter.get_next()
        loss_count = 0
        while True:
            try:
                data = sess.run([next_element])
            except tf.errors.OutOfRangeError:
                break
            current_batch_index += 1
            data = data[0]
            batch_x, batch_tgt, batch_fnames = data
            _, loss_result = sess.run(
                [train_op, loss],
                feed_dict={
                    X: batch_x,
                    y: batch_tgt,
                    lr: train_ops.lr_scheduler(epoch),
                    batch_size: BATCH_SIZE,
                    is_training: True,
                    dropout_keep_prob: 0.3,
                })
            # print("Loss: {}".format(loss_result))
            loss_count += loss_result
        loss_count = loss_count / current_batch_index
        logger.info("Loss on epoch {} : {} - LR: {}".format(
            epoch, loss_count, train_ops.lr_scheduler(epoch)))
        acc = 0
        if do_val:
            print("Eval")
            ## Eval
            sess.run(dev_init_op,
                     feed_dict={
                         # sess.run(dev_init_op, feed_dict={
                         X: dev_data[0],
                         y: dev_data[1],
                         fnames_plc: dev_data[2],
                         batch_size: BATCH_SIZE,
                     })
            current_batch_index = 0
            next_element = iter.get_next()
            while True:
                try:
                    data = sess.run([next_element])
                except tf.errors.OutOfRangeError:
                    break
                current_batch_index += 1
                data = data[0]
                batch_x, batch_tgt, batch_fnames = data
                results = sess.run(
                    [softmax],
                    feed_dict={
                        X: batch_x,
                        y: batch_tgt,
                        lr: train_ops.lr_scheduler(epoch),
                        batch_size: BATCH_SIZE,
                        is_training: False,
                        dropout_keep_prob: 1.0
                    })
                results = results[0]
                acc_aux = metrics.accuracy(X=results, y=batch_tgt)
                acc += acc_aux
            acc = acc / current_batch_index
            print("Acc Val epoch {} : {}".format(epoch, acc))
            print("----------")
            if acc > best_acc:
                best_acc = acc
                logger.info("New acc : {}".format(best_acc))
                save_best(sess, opts.work_dir)

    if opts.testing and opts.do_val:
        logger.info("Model: {}".format(MODEL))
        logger.info("layers: {}".format(layers))
        logger.info("max_features: {}".format(max_features))
        logger.info("Min and max features: {} - {}".format(min_ngram, up))
        logger.info("Best acc : {}".format(best_acc))
        exit()

    """ ----------------- TEST ----------------- """
    logger.info("\n-- TEST --\n")
    logger.info("Restoring the best Checkpoint.")
    # restore_file = sesions.restore_from_best(sess, save_path)
    restore_file = restore_from_best(sess, opts.work_dir)
    if restore_file:
        logger.info("Best model restored")
    else:
        logger.info("Can't restore the best model.")
        exit()
    # Dummy (all-zero) labels: the unlabeled test iterator yields no targets,
    # but the `y` placeholder still has to be fed.
    Y_FALSA = np.random.randint(1, size=(BATCH_SIZE, n_classes))
    print(Y_FALSA.shape)
    logger.info("\n-- TEST --\n")
    sess.run(test_init_op,
             feed_dict={
                 X: test_data[0],
                 y: Y_FALSA,
                 fnames_plc: test_data[1],
                 batch_size: BATCH_SIZE,
             })
    current_batch_index = 0
    next_element = iter_test.get_next()
    loss_count = 0
    classifieds = []
    classifieds_to_write = []
    while True:
        try:
            data = sess.run([next_element])
        except tf.errors.OutOfRangeError:
            break
        current_batch_index += 1
        data = data[0]
        batch_x, batch_fnames = data
        results = sess.run(
            [softmax],
            feed_dict={
                X: batch_x,
                y: Y_FALSA,
                batch_size: BATCH_SIZE,
                dropout_keep_prob: 1.0,
                lr: train_ops.lr_scheduler(1)
            })
        for i in range(len(results[0])):
            # To write
            hyp = [np.argmax(results[0][i], axis=-1)]
            hyp = labelencoder.inverse_transform(hyp)[0]  # real label
            doc_name = batch_fnames[i].decode("utf-8").split("/")[-1]
            classifieds_to_write.append((doc_name, lang, hyp))
    logger.info("----------")
    logger.info("Writing results in output dir {}".format("{}/{}".format(
        opts.o, lang)))
    if sent != 2 and lang == 'en':
        # Re-read the English data with sentiment_id=2 before writing.
        dt_train = process.PAN2019(path=path,
                                   txt=txt_train,
                                   join_all=MODEL == "FF",
                                   sentiment_id=2)
        x_train = dt_train.X
        y2_train = dt_train.y2
        del dt_train
        if max_features:
            rep = TfidfVectorizer(ngram_range=(min_ngram, up),
                                  max_features=max_features)
        else:
            rep = TfidfVectorizer(ngram_range=(min_ngram, up))
        texts_rep_train = rep.fit_transform(x_train).toarray()
        dt_test = process.PAN2019_Test(path=path_test,
                                       join_all=MODEL == "FF",
                                       sentiment_id=2)
        x_test = dt_test.X
        del dt_test  # moved after the x_test read; it was deleted before use
        text_test_rep = rep.transform(x_test).toarray()
    process.write_from_array(classifieds_to_write,
                             "{}/{}".format(opts.o, lang), x_train,
                             texts_rep_train, y2_train, x_test, text_test_rep,
                             fnames_test)
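# ---------------------------------------------------------------------------
# Standalone sketch (not part of the class above) of the TF-IDF representation
# built in the constructor: n-grams in the range (min_ngram, up), optionally
# capped with max_features, fit on the training texts only and reused to
# transform dev/test. Toy strings stand in for the PAN2019 tweet texts that
# `process` actually loads.
# ---------------------------------------------------------------------------
from sklearn.feature_extraction.text import TfidfVectorizer

x_train_toy = ["hello world", "bots tweet a lot", "humans tweet too"]
x_test_toy = ["hello bots"]
rep = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
texts_rep_train = rep.fit_transform(x_train_toy).toarray()  # fit on train only
text_test_rep = rep.transform(x_test_toy).toarray()         # reuse vocabulary
print(texts_rep_train.shape, text_test_rep.shape)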
while True:
    try:
        data = sess.run([next_element])
    except tf.errors.OutOfRangeError:
        break
    current_batch_index += 1
    data = data[0]
    batch_x, batch_tgt, batch_fnames = data
    _, loss_result = sess.run(
        [train_op, loss],
        feed_dict={
            X: batch_x,
            y: batch_tgt,
            lr: train_ops.lr_scheduler(epoch),
            batch_size: BATCH_SIZE,
            is_training: True,
            dropout_keep_prob: 0.3,
        })
    # print("Loss: {}".format(loss_result))
    loss_count += loss_result
loss_count = loss_count / current_batch_index
logger.info("Loss on epoch {} : {} - LR: {}".format(
    epoch, loss_count, train_ops.lr_scheduler(epoch)))
acc = 0
# if do_val:
#     print("Eval")
#     ## Eval
#     sess.run(dev_init_op, feed_dict={
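# ---------------------------------------------------------------------------
# Minimal, self-contained sketch of the input pattern the loops above rely
# on: a reinitializable tf.data.Iterator fed through a placeholder and
# drained until tf.errors.OutOfRangeError. Written against the TF 1.x API
# used in this file; toy data only.
# ---------------------------------------------------------------------------
import numpy as np
import tensorflow as tf

data_plc = tf.placeholder(tf.float32, shape=[None, 2])
dataset = tf.data.Dataset.from_tensor_slices(data_plc).batch(2)
iterator = tf.data.Iterator.from_structure(dataset.output_types,
                                           dataset.output_shapes)
init_op = iterator.make_initializer(dataset)
next_batch = iterator.get_next()

with tf.Session() as sess:
    sess.run(init_op,
             feed_dict={data_plc: np.arange(10, dtype=np.float32).reshape(5, 2)})
    while True:
        try:
            print(sess.run(next_batch))  # three batches: 2 + 2 + 1 rows
        except tf.errors.OutOfRangeError:
            break  # dataset exhausted; rerun init_op to iterate again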
while True:
    try:
        data = sess.run([next_element])
    except tf.errors.OutOfRangeError:
        break
    current_batch_index += 1
    data = data[0]
    batch_x, batch_tgt = data
    _, loss_result = sess.run(
        [train_op, loss],
        feed_dict={
            X: batch_x,
            y: batch_tgt,
            batch_size: BATCH_SIZE,
            is_training: True,
            lr: train_ops.lr_scheduler(epoch),
            dropout_keep_prob: 0.5
        })
    loss_count += loss_result
loss_count = loss_count / current_batch_index
print("Loss on epoch {} : {}".format(epoch, loss_count))
print("Eval")
## Eval
sess.run(dev_init_op,
         feed_dict={
             # sess.run(dev_init_op, feed_dict={
             X: dev_data[0],
             y: dev_data[1],
             batch_size: BATCH_SIZE,
         })
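# ---------------------------------------------------------------------------
# Hedged sketch of what `metrics.accuracy(X=..., y=...)` plausibly computes,
# given how it is called above (softmax outputs vs. one-hot targets): the
# fraction of rows whose argmax matches. The real implementation lives in the
# project's `metrics` module and may differ.
# ---------------------------------------------------------------------------
import numpy as np

def accuracy(X, y):
    """Mean argmax agreement between predictions X and one-hot targets y."""
    return float(np.mean(np.argmax(X, axis=-1) == np.argmax(y, axis=-1)))

# Example: two of the three predictions match the targets -> 0.666...
preds = np.array([[0.9, 0.1], [0.2, 0.8], [0.6, 0.4]])
tgts = np.array([[1, 0], [1, 0], [1, 0]])
print(accuracy(X=preds, y=tgts))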