        capturing_data = True
    else:
        if not line:
            # Raising a bare string is invalid in Python 3; use an exception type.
            raise ValueError('Empty line during capturing phase')
        if line.startswith('END_DATA'):
            stream("Capture finished")
            capturing_data = False
            break
        else:
            decodeData(line)

stream("Decoding finished")
x_text, y = data_helpers.load_data_labels(datasets)

# Parameters
# ==================================================

# Data loading params
tf.flags.DEFINE_float("dev_sample_percentage", .1, "Percentage of the training data to use for validation")
tf.flags.DEFINE_string("positive_data_file", "./data/rt-polaritydata/rt-polarity.pos", "Data source for the positive data.")
tf.flags.DEFINE_string("negative_data_file", "./data/rt-polaritydata/rt-polarity.neg", "Data source for the negative data.")

# Model Hyperparameters
tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement") tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices") FLAGS = tf.flags.FLAGS FLAGS._parse_flags() print("\nParameters:") for attr, value in sorted(FLAGS.__flags.items()): print("{}={}".format(attr.upper(), value)) print("") # CHANGE THIS: Load data. Load your own data here if FLAGS.eval_train: # x_raw, y_test = data_helpers.load_data_and_labels(FLAGS.positive_data_file, FLAGS.negative_data_file) x_raw, y_test = data_helpers.load_data_labels(FLAGS.data_file, FLAGS.label_file) y_test = np.argmax(y_test, axis=1) else: x_raw = ["a masterpiece four years in the making", "everything is off."] y_test = [1, 0] # Map data into vocabulary vocab_path = os.path.join(FLAGS.checkpoint_dir, "..", "vocab") vocab_processor = learn.preprocessing.VocabularyProcessor.restore(vocab_path) x_test = np.array(list(vocab_processor.transform(x_raw))) print("\nEvaluating...\n") # Evaluation # ================================================== checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
test2 = "/home/xxliu10/bigdata/classification/originnewsdata/2.test" test3 = "/home/xxliu10/bigdata/classification/originnewsdata/3.test" test4 = "/home/xxliu10/bigdata/classification/originnewsdata/4.test" test5 = "/home/xxliu10/bigdata/classification/originnewsdata/5.test" # Data Preparation # ================================================== # Load data print("Loading data...") datasets = data_helpers.get_datasets_textinline(train1, train2, train3, train4, train5) testdatasets = data_helpers.get_datasets_textinline(test1, test2, test3, test4, test5) x_train_ns, y_train_ns = data_helpers.load_data_labels(datasets) x_dev_ns, y_dev_ns = data_helpers.load_data_labels(testdatasets) x_text = x_train_ns + x_dev_ns print(len(x_train_ns)) print(len(x_dev_ns)) print(len(x_text)) # Build vocabulary max_document_length = max([len(x.split(" ")) for x in x_text]) vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length) x_tr = np.array(list(vocab_processor.fit_transform(x_train_ns))) x_te = np.array(list(vocab_processor.fit_transform(x_dev_ns))) # Randomly shuffle data np.random.seed(10) tr_shuffle_indices = np.random.permutation(np.arange(len(y_train_ns))) te_shuffle_indices = np.random.permutation(np.arange(len(y_dev_ns)))
                    'soc.religion.christian', 'talk.politics.guns',
                    'talk.politics.mideast', 'talk.politics.misc',
                    'talk.religion.misc']
training_classes = ['comp.graphics', 'alt.atheism', 'comp.sys.mac.hardware',
                    'misc.forsale', 'rec.autos']

# Load data
print("Loading data...")
if dataset == "20newsgroup":
    # TODO: use the remove parameter
    datasets = data_helpers.get_datasets_20newsgroup(subset='train',
                                                     categories=training_classes,
                                                     remove=())
    x_text, y_train = data_helpers.load_data_labels_remove_SW(datasets)
else:
    # TODO: tweak parameters in the future
    datasets = data_helpers.get_datasets_localdata("./data/20newsgroup", categories=None)
    x_text, y_train = data_helpers.load_data_labels(datasets)
# Text is stored in x_text; labels are stored in y_train.

# Build vocabulary
max_document_length = max([len(x.split(" ")) for x in x_text])  # TODO: should be hardcoded to save time
print("Max document length: {}".format(max_document_length))
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
x_train = np.array(list(vocab_processor.fit_transform(x_text)))

# Randomly shuffle data
# np.random.seed(10)
# shuffle_indices = np.random.permutation(np.arange(len(y)))
# x_shuffled = x[shuffle_indices]
# y_shuffled = y[shuffle_indices]
print(x_train.shape)
dataset_name = cfg["datasets"]["default"]

# Data Preparation
# ==================================================

# Load data
print("Loading data...Cool Joey :-)")
datasets = None
if dataset_name == "HAR_small":
    datasets_train = data_helpers.get_datasets(
        cfg["datasets"][dataset_name]["training_data_file"]["path"])
    datasets_test = data_helpers.get_datasets(
        cfg["datasets"][dataset_name]["testing_data_file"]["path"])
x, y = data_helpers.load_data_labels(datasets_train)
x_test, y_test = data_helpers.load_data_labels(datasets_test)

# Randomly shuffle data
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]

# Split train/dev set
dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]
print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))
print("Train/Test split: {:d}/{:d}".format(len(y_train), len(y_test)))
# Misc Parameters
tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

# CHANGE THIS: Load data. Load your own data here
if FLAGS.eval_train:
    x_raw, y_test = data_helpers.load_data_labels()
    y_test = np.argmax(y_test, axis=1)
else:
    x_raw = ["a masterpiece four years in the making", "everything is off."]
    y_test = [1, 0]

# Map data into vocabulary
vocab_path = os.path.join(FLAGS.checkpoint_dir, "..", "vocab")
vocab_processor = learn.preprocessing.VocabularyProcessor.restore(vocab_path)
x_test = np.array(list(vocab_processor.transform(x_raw)))

print("\nEvaluating...\n")

# Evaluation
# ==================================================
checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
print("") datasets = None # CHANGE THIS: Load data. Load your own data here dataset_name = cfg["datasets"]["default"] if FLAGS.eval_train: if dataset_name == "mrpolarity": datasets = data_helpers.get_datasets_mrpolarity(cfg["datasets"][dataset_name]["positive_data_file"]["path"], cfg["datasets"][dataset_name]["negative_data_file"]["path"]) elif dataset_name == "20newsgroup": datasets = data_helpers.get_datasets_20newsgroup(subset="test", categories=cfg["datasets"][dataset_name]["categories"], shuffle=cfg["datasets"][dataset_name]["shuffle"], random_state=cfg["datasets"][dataset_name]["random_state"]) x_raw, y_test = data_helpers.load_data_labels(datasets) y_test = np.argmax(y_test, axis=1) print("Total number of test examples: {}".format(len(y_test))) else: if dataset_name == "mrpolarity": datasets = {"target_names": ['positive_examples', 'negative_examples']} x_raw = ["a masterpiece four years in the making", "everything is off."] y_test = [1, 0] else: datasets = {"target_names": ['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']} x_raw = ["The number of reported cases of gonorrhea in Colorado increased", "I am in the market for a 24-bit graphics card for a PC"] y_test = [2, 1] x_words_raw, x_tags, x_labels, x_trees, x_indices, y, y_labels = data_helpers.load_data_labels('/u/a/n/anant/Dropbox/539_project/generated_test_data/') x_words = x_words_raw
import numpy as np
import tensorflow as tf
from tensorflow.contrib import learn

import config
import data_helpers
from text_cnn import TextCNN

# params
print("\nparameters config:")
for k, v in config.config.items():
    print("{}={}".format(k, v))

# load data
x_text, y = data_helpers.load_data_labels(config.config["positive_data_file"],
                                          config.config["negative_data_file"])

# build vocabulary
max_document_length = max([len(x.split(" ")) for x in x_text])
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
x = np.array(list(vocab_processor.fit_transform(x_text)))

# randomly shuffle data
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]

# split train/dev set
dev_sample_index = -1 * int(config.config["dev_sample_percentage"] * float(len(y)))
x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
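# --- Aside: a minimal sketch of the negative-index dev split used above,
# with toy numbers (n=10, 20% dev) rather than real config values.
import numpy as np

n, dev_pct = 10, 0.2
data = np.arange(n)
dev_sample_index = -1 * int(dev_pct * float(n))  # -2
train, dev = data[:dev_sample_index], data[dev_sample_index:]
print(train)  # [0 1 2 3 4 5 6 7]
print(dev)    # [8 9]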
def main():
    import time
    start_time = time.time()
    FLAGS = flagClass()

    with open("config.yml", 'r') as ymlfile:
        cfg = yaml.load(ymlfile)

    dataset_name = cfg["datasets"]["default"]
    if FLAGS.enable_word_embeddings and cfg['word_embeddings']['default'] is not None:
        embedding_name = cfg['word_embeddings']['default']
        embedding_dimension = cfg['word_embeddings'][embedding_name]['dimension']
    else:
        embedding_dimension = FLAGS.embedding_dim

    # Data Preparation
    # ==================================================

    # Load data
    print("Loading data...")
    datasets = None
    if dataset_name == "mrpolarity":
        datasets = data_helpers.get_datasets_mrpolarity(
            cfg["datasets"][dataset_name]["positive_data_file"]["path"],
            cfg["datasets"][dataset_name]["negative_data_file"]["path"])
    elif dataset_name == 'spamham':
        datasets = data_helpers.get_datasets_mrpolarity(
            cfg["datasets"][dataset_name]["spam_file"]["path"],
            cfg["datasets"][dataset_name]["ham_file"]["path"])
    elif dataset_name == "20newsgroup":
        datasets = data_helpers.get_datasets_20newsgroup(
            subset="train",
            categories=cfg["datasets"][dataset_name]["categories"],
            shuffle=cfg["datasets"][dataset_name]["shuffle"],
            random_state=cfg["datasets"][dataset_name]["random_state"])
    elif dataset_name == "dbpedia":
        datasets = data_helpers.get_datasets_dbpedia(
            cfg["datasets"][dataset_name]["train_file"]["path"],
            cfg["datasets"][dataset_name]["train_file"]["limit"])
    elif dataset_name == "email":
        datasets = data_helpers.get_datasets_email(
            container_path=cfg["datasets"][dataset_name]["container_path"],
            categories=cfg["datasets"][dataset_name]["categories"],
            shuffle=cfg["datasets"][dataset_name]["shuffle"],
            random_state=cfg["datasets"][dataset_name]["random_state"])
    elif dataset_name == "localdata":
        datasets = data_helpers.get_datasets_localdata(
            container_path=cfg["datasets"][dataset_name]["container_path"],
            categories=cfg["datasets"][dataset_name]["categories"],
            shuffle=cfg["datasets"][dataset_name]["shuffle"],
            random_state=cfg["datasets"][dataset_name]["random_state"])
    x_text, y = data_helpers.load_data_labels(datasets)

    # Build vocabulary
    # To limit memory usage, you can cut off input text to the first 40 words.
    # Other research has shown that the first 40 words of a text (IMDB dataset?)
    # were representative of the content of the sentence for classification
    # purposes. Comment out one of the two lines below.
    # max_document_length = max([len(x.split(" ")) for x in x_text])
    max_document_length = 40  # read up to 40 words from each sentence
    vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
    x = np.array(list(vocab_processor.fit_transform(x_text)))

    # Randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]

    # Split train/dev set
    # TODO: This is very crude, should use cross-validation
    dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
    x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
    y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]
    print("Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_)))
    print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))
    print('Sequence_length={}'.format(x_train.shape[1]))

    # Training
    # ==================================================
    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            cnn = TextCNN(
                sequence_length=x_train.shape[1],
                num_classes=y_train.shape[1],
                vocab_size=len(vocab_processor.vocabulary_),
                embedding_size=embedding_dimension,
                filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))),
                num_filters=FLAGS.num_filters,
                l2_reg_lambda=FLAGS.l2_reg_lambda)

            # Define training procedure
            global_step = tf.Variable(0, name="global_step", trainable=False)
            optimizer = tf.train.AdamOptimizer(cnn.learning_rate)
            grads_and_vars = optimizer.compute_gradients(cnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

            # Keep track of gradient values and sparsity (optional)
            grad_summaries = []
            for g, v in grads_and_vars:
                if g is not None:
                    grad_hist_summary = tf.summary.histogram("{}/grad/hist".format(v.name), g)
                    sparsity_summary = tf.summary.scalar("{}/grad/sparsity".format(v.name),
                                                         tf.nn.zero_fraction(g))
                    grad_summaries.append(grad_hist_summary)
                    grad_summaries.append(sparsity_summary)
            grad_summaries_merged = tf.summary.merge(grad_summaries)

            # Output directory for models and summaries
            timestamp = str(int(time.time()))
            out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
            print("Writing to {}\n".format(out_dir))

            # Summaries for loss and accuracy
            loss_summary = tf.summary.scalar("loss", cnn.loss)
            acc_summary = tf.summary.scalar("accuracy", cnn.accuracy)

            # Train summaries
            train_summary_op = tf.summary.merge([loss_summary, acc_summary, grad_summaries_merged])
            train_summary_dir = os.path.join(out_dir, "summaries", "train")
            train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)

            # Dev summaries
            dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
            dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
            dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph)

            # Checkpoint directory.
            # TensorFlow assumes this directory already exists, so we need to create it.
            checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
            checkpoint_prefix = os.path.join(checkpoint_dir, "model")
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            saver = tf.train.Saver(tf.global_variables(), max_to_keep=FLAGS.num_checkpoints)

            # Write vocabulary
            vocab_processor.save(os.path.join(out_dir, "vocab"))

            # Initialize all variables
            sess.run(tf.global_variables_initializer())

            if FLAGS.enable_word_embeddings and cfg['word_embeddings']['default'] is not None:
                vocabulary = vocab_processor.vocabulary_
                initW = None
                if embedding_name == 'word2vec':
                    # Load embedding vectors from the word2vec file
                    print("Load word2vec file {}".format(cfg['word_embeddings']['word2vec']['path']))
                    initW = data_helpers.load_embedding_vectors_word2vec(
                        vocabulary,
                        cfg['word_embeddings']['word2vec']['path'],
                        cfg['word_embeddings']['word2vec']['binary'])
                    print("word2vec file has been loaded")
                elif embedding_name == 'glove':
                    # Load embedding vectors from the GloVe file
                    print("Load glove file {}".format(cfg['word_embeddings']['glove']['path']))
                    initW = data_helpers.load_embedding_vectors_glove(
                        vocabulary,
                        cfg['word_embeddings']['glove']['path'],
                        embedding_dimension)
                    print("glove file has been loaded\n")
                sess.run(cnn.W.assign(initW))

            def train_step(x_batch, y_batch, learning_rate):
                """A single training step."""
                feed_dict = {
                    cnn.input_x: x_batch,
                    cnn.input_y: y_batch,
                    cnn.dropout_keep_prob: FLAGS.dropout_keep_prob,
                    cnn.learning_rate: learning_rate
                }
                _, step, summaries, loss, accuracy = sess.run(
                    [train_op, global_step, train_summary_op, cnn.loss, cnn.accuracy],
                    feed_dict)
                time_str = datetime.datetime.now().isoformat()
                print("{}: step {}, loss {:g}, acc {:g}, learning_rate {:g}".format(
                    time_str, step, loss, accuracy, learning_rate))
                train_summary_writer.add_summary(summaries, step)

            def dev_step(x_batch, y_batch, writer=None):
                """Evaluates the model on a dev set."""
                feed_dict = {
                    cnn.input_x: x_batch,
                    cnn.input_y: y_batch,
                    cnn.dropout_keep_prob: 1.0
                }
                step, summaries, loss, accuracy, gr = sess.run(
                    [global_step, dev_summary_op, cnn.loss, cnn.accuracy, cnn.grad],
                    feed_dict)
                time_str = datetime.datetime.now().isoformat()
                print("{}: step {}, loss {:g}, acc {:g}, gr {}".format(
                    time_str, step, loss, accuracy, gr))
                if writer:
                    writer.add_summary(summaries, step)

            # Generate batches
            batches = data_helpers.batch_iter(list(zip(x_train, y_train)),
                                              FLAGS.batch_size, FLAGS.num_epochs)
            print("Number of epochs: {}".format(FLAGS.num_epochs))
            num_batches_per_epoch = int((len(x_train) - 1) / FLAGS.batch_size) + 1
            print("Batches per epoch: {}".format(num_batches_per_epoch))
            print("Batch size: {}".format(FLAGS.batch_size))

            # Use a dynamic learning rate: start high to speed up early training,
            # then decay exponentially towards min_learning_rate.
            max_learning_rate = 0.005
            min_learning_rate = 0.0001
            decay_speed = FLAGS.decay_coefficient * len(y_train) / FLAGS.batch_size

            # Training loop. For each batch...
            counter = 0
            for batch in batches:
                learning_rate = min_learning_rate + \
                    (max_learning_rate - min_learning_rate) * math.exp(-counter / decay_speed)
                counter += 1
                x_batch, y_batch = zip(*batch)
                train_step(x_batch, y_batch, learning_rate)
                current_step = tf.train.global_step(sess, global_step)
                if current_step % FLAGS.evaluate_every == 0:
                    print("\nEvaluation:")
                    dev_step(x_dev, y_dev, writer=dev_summary_writer)
                    print("")
                if current_step % FLAGS.checkpoint_every == 0:
                    path = saver.save(sess, checkpoint_prefix, global_step=current_step)
                    print("Saved model checkpoint to {}\n".format(path))

    print("runtime was " + str(time.time() - start_time))
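# --- Aside: a worked example of the exponential learning-rate decay above.
# The decay_speed value (1000.0) is made up for illustration; the max/min
# rates match the constants in the training script.
import math

max_lr, min_lr, decay_speed = 0.005, 0.0001, 1000.0
for counter in (0, 1000, 3000, 10000):
    lr = min_lr + (max_lr - min_lr) * math.exp(-counter / decay_speed)
    print(counter, round(lr, 6))
# 0     -> 0.005    (starts at max_learning_rate)
# 1000  -> 0.001903 (gap to min shrinks by a factor of e every decay_speed steps)
# 3000  -> 0.000344
# 10000 -> 0.0001   (approaches min_learning_rate)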
"Allow device soft device placement") tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices") FLAGS = tf.flags.FLAGS FLAGS._parse_flags() print("\nPamaeters:") for attr, value in sorted(FLAGS.__flags.items()): print("{}={}".format(attr.upper, value)) print("") # CHANGE THIS:load data if FLAGS.eval_train: x_raw, y_test = data_helpers.load_data_labels(FLAGS.positive_data_file, FLAGS.negative_data_file) y_test = np.argmax(y_test, axis=1) else: x_raw = ["a masterpiece four years in the making", "everything is off."] y_test = [1, 0] # Map data into vocabulary vocab_path = os.path.join(FLAGS.checkpoint_dir, "..", "vocab") vocab_processor = learn.preprocessing.VocabularyProcessor.restore(vocab_path) x_test = np.array(list(vocab_processor.transform(x_raw))) print("\nEvaluating...\n") checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir) graph = tf.Graph()
# FLAGS._parse_flags() is deprecated; use FLAGS.flag_values_dict() instead.
FLAGS.flag_values_dict()
print('\nParameters:')
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), str(value.value)))

# ===============================
# Data preprocessing

# Load data
print('loading data...')
x_text, y = data_helpers.load_data_labels(FLAGS.positive_data_file, FLAGS.negative_data_file)

# Build vocabulary, sized to the largest word count of any document
max_document_length = max([len(x.split(' ')) for x in x_text])
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length=max_document_length)
x = np.array(list(vocab_processor.fit_transform(x_text)))

# Shuffle data
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]

# Split train/dev set; the dev set is taken from the end of the shuffled data
dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
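# --- Aside: a minimal sketch of listing parsed flags on TF >= 1.5, where
# FLAGS._parse_flags() and FLAGS.__flags are deprecated. The flag defined
# here is just an example, not taken from the script above.
import tensorflow as tf

tf.flags.DEFINE_float("dev_sample_percentage", .1, "Fraction of data held out for validation")
FLAGS = tf.flags.FLAGS
for name, value in sorted(FLAGS.flag_values_dict().items()):
    print("{}={}".format(name.upper(), value))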
cfg["datasets"][dataset_name]["positive_data_file"]["path"], cfg["datasets"][dataset_name]["negative_data_file"]["path"]) elif dataset_name == "20newsgroup": datasets = data_helpers.get_datasets_20newsgroup( subset="train", categories=cfg["datasets"][dataset_name]["categories"], shuffle=cfg["datasets"][dataset_name]["shuffle"], random_state=cfg["datasets"][dataset_name]["random_state"]) elif dataset_name == "localdata": datasets = data_helpers.get_datasets_localdata( container_path=cfg["datasets"][dataset_name]["container_path"], categories=cfg["datasets"][dataset_name]["categories"], shuffle=cfg["datasets"][dataset_name]["shuffle"], random_state=cfg["datasets"][dataset_name]["random_state"]) x_text, y = data_helpers.load_data_labels(FLAGS.anger_dir, FLAGS.disgust_dir, FLAGS.fear_dir, FLAGS.neutral_dir, FLAGS.sadness_dir, FLAGS.surprise_dir) # Build vocabulary max_document_length = max([len(x.split(" ")) for x in x_text]) vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length) x = np.array(list(vocab_processor.fit_transform(x_text))) # Randomly shuffle data np.random.seed(10) shuffle_indices = np.random.permutation(np.arange(len(y))) x_shuffled = x[shuffle_indices] y_shuffled = y[shuffle_indices] # Split train/test set # TODO: This is very crude, should use cross-validation
def preprocess():
    # Data Preparation
    # ==================================================

    # Load data
    print("Loading data...")
    datasets = data_helpers.get_datasets_political_parties()
    x_text, y = data_helpers.load_data_labels(datasets)

    # Build vocabulary
    max_document_length = max([len(x.split(" ")) for x in x_text])
    vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)

    embedding = None  # only populated when pre-trained vectors are used
    if args.pre_trained:
        print('Load pre-trained word vectors')
        with open('fasttext_vocab_en.dat', 'rb') as fr:
            vocab = pickle.load(fr)
        embedding = np.load('fasttext_embedding_en.npy')
        vocab_processor.fit(vocab.keys())
        x = np.array(list(vocab_processor.transform(x_text)))
        embedding_size = FLAGS.fasttext_embedding_dim
        vocab_size = len(vocab)
    else:
        x = np.array(list(vocab_processor.fit_transform(x_text)))
        embedding_size = FLAGS.embedding_dim
        vocab_size = len(vocab_processor.vocabulary_)

    # Randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]

    # Split train/dev/test set (70/20/10)
    # TODO: This is very crude, should use cross-validation
    # Older dev-only split, kept for reference:
    # dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
    # x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
    # y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]
    train_frac = 0.7
    val_frac = 0.2
    test_frac = 0.1  # remainder

    def train_test_val_split(arr):
        n = len(arr)
        train_end = int(n * train_frac)
        val_end = train_end + int(n * val_frac)
        return arr[:train_end], arr[train_end:val_end], arr[val_end:]

    x_train, x_dev, x_test = train_test_val_split(x_shuffled)
    y_train, y_dev, y_test = train_test_val_split(y_shuffled)

    print("Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_)))
    print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))
    print('x_train', x_train.shape)
    print('y_train', y_train.shape)
    return x_train, y_train, vocab_processor, vocab_size, embedding_size, embedding, x_dev, y_dev, x_test, y_test
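# --- Aside: an equivalent 70/20/10 split written with np.split (illustrative
# sketch on toy data; note np.split takes cumulative cut indices, not fractions).
import numpy as np

data = np.arange(10)
n = len(data)
train, dev, test = np.split(data, [int(n * 0.7), int(n * 0.7) + int(n * 0.2)])
print(train)  # [0 1 2 3 4 5 6]
print(dev)    # [7 8]
print(test)   # [9]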
cfg["datasets"][dataset_name]["positive_data_file"]["path"], cfg["datasets"][dataset_name]["negative_data_file"]["path"]) elif dataset_name == "20newsgroup": datasets = data_helpers.get_datasets_20newsgroup( subset="train", categories=cfg["datasets"][dataset_name]["categories"], shuffle=cfg["datasets"][dataset_name]["shuffle"], random_state=cfg["datasets"][dataset_name]["random_state"]) elif dataset_name == "localdata": datasets = data_helpers.get_datasets_localdata( container_path=cfg["datasets"][dataset_name]["container_path"], categories=cfg["datasets"][dataset_name]["categories"], shuffle=cfg["datasets"][dataset_name]["shuffle"], random_state=cfg["datasets"][dataset_name]["random_state"]) x_text, y = data_helpers.load_data_labels(datasets) # x_test are the comments # y is 0 or 1 for negative/positive # Build vocabulary max_document_length = max([len(x.split(" ")) for x in x_text]) vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length) x = np.array(list(vocab_processor.fit_transform(x_text))) # Randomly shuffle data np.random.seed(10) shuffle_indices = np.random.permutation(np.arange(len(y))) x_shuffled = x[shuffle_indices] y_shuffled = y[shuffle_indices] # Split train/test set # TODO: This is very crude, should use cross-validation
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices") FLAGS = tf.flags.FLAGS FLAGS._parse_flags() print("\nParameters:") for attr, value in sorted(FLAGS.__flags.items()): print("{}={}".format(attr.upper(), value)) print("") # Data Preparation # ================================================== # Load data print("Loading data...") x, y, vocab_processor = data_helpers.load_data_labels(FLAGS.data_file, FLAGS.label_file) # Randomly shuffle data # np.random.seed(10) # shuffle_indices = np.random.permutation(np.arange(len(y))) # x_shuffled = x[shuffle_indices] # y_shuffled = y[shuffle_indices] x_shuffled = x y_shuffled = y # Split train/test set # TODO: This is very crude, should use cross-validation dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y))) x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:] y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:] print("Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_)))
def preprocess():
    # Data Preparation
    # ==================================================

    # Load data
    print("Loading data...")
    datasets = data_helpers.get_datasets_tobacco()
    x_text, y = data_helpers.load_data_labels(datasets)

    # Build vocabulary
    max_document_length = max([len(x.split(" ")) for x in x_text])
    vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
    x = np.array(list(vocab_processor.fit_transform(x_text)))

    # Randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]

    # Split train/dev/test set (70/20/10)
    # TODO: This is very crude, should use cross-validation
    # Older dev-only split, kept for reference:
    # dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
    # x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
    # y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]
    train_frac = 0.7
    val_frac = 0.2
    test_frac = 0.1  # remainder

    def train_test_val_split(arr):
        n = len(arr)
        train_end = int(n * train_frac)
        val_end = train_end + int(n * val_frac)
        return arr[:train_end], arr[train_end:val_end], arr[val_end:]

    x_train, x_dev, x_test = train_test_val_split(x_shuffled)
    y_train, y_dev, y_test = train_test_val_split(y_shuffled)

    print("Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_)))
    print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))
    print('x_train', x_train.shape)
    print('y_train', y_train.shape)
    return x_train, y_train, vocab_processor, x_dev, y_dev, x_test, y_test
if FLAGS.eval_train:
    if dataset_name == "mrpolarity":
        datasets = data_helpers.get_datasets_mrpolarity(
            cfg["datasets"][dataset_name]["positive_data_file"]["path"],
            cfg["datasets"][dataset_name]["negative_data_file"]["path"])
    elif dataset_name == "20newsgroup":
        datasets = data_helpers.get_datasets_20newsgroup(
            subset="test",
            categories=cfg["datasets"][dataset_name]["categories"],
            shuffle=cfg["datasets"][dataset_name]["shuffle"],
            random_state=cfg["datasets"][dataset_name]["random_state"])
    elif dataset_name == "abstract":
        datasets = data_helpers.get_datasets_abstract("data/")
    elif dataset_name == 'intents':
        datasets = data_helpers.get_datasets_intentst("data/")
    x_raw, y_test = data_helpers.load_data_labels(datasets)
    y_test = np.argmax(y_test, axis=1)
    print("Total number of test examples: {}".format(len(y_test)))
else:
    if dataset_name == "mrpolarity":
        x_raw = [
            "a masterpiece four years in the making",
            "everything is off."
        ]
        y_test = [1, 0]
    else:
        x_raw = [
            "Experimental results on a large number of real-world data sets show that the proposed algorithm outperforms existing HMC methods",
            "In this paper, we overcome these deficiencies by proposing a hierarchy-aware loss function that is more appropriate for HMC."
        ]
        y_test = [2, 1]
import csv
import datetime
import json
import os
import sys
import time

import numpy as np
import tensorflow as tf
from sklearn import metrics
from tensorflow.contrib import learn

import data_helpers
from text_cnn import TextCNN

params = json.loads(open('./parameters.json').read())
checkpoint_dir = sys.argv[1]
datasets = None

filename = '20_news_group_to_test.csv.zip'
x_raw, y_test, labels = data_helpers.load_data_labels(filename, 0)
y_test = np.argmax(y_test, axis=1)
print("Total number of test examples: {}".format(len(y_test)))

# Map data into vocabulary
vocab_path = os.path.join(checkpoint_dir, "..", "vocab")
vocab_processor = learn.preprocessing.VocabularyProcessor.restore(vocab_path)
x_test = np.array(list(vocab_processor.transform(x_raw)))

print("\nEvaluating...\n")

# Evaluation
# ==================================================
checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir)
graph = tf.Graph()
with graph.as_default():
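# --- Aside: a minimal sketch of how such a checkpoint is typically restored
# and queried in TF 1.x. The tensor names ("input_x", "dropout_keep_prob",
# "output/predictions") are assumptions matching the common TextCNN layout,
# not confirmed by the truncated snippet above, so this stays commented out:
#
#   with graph.as_default():
#       sess = tf.Session()
#       saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
#       saver.restore(sess, checkpoint_file)
#       input_x = graph.get_operation_by_name("input_x").outputs[0]
#       dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0]
#       predictions = graph.get_operation_by_name("output/predictions").outputs[0]
#       all_predictions = sess.run(predictions, {input_x: x_test, dropout_keep_prob: 1.0})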