for task in tasks:
    print('Running for task', task)
    checkpoint_dir = FLAGS.checkpoint_dir.format(task)

    # CHANGE THIS: Load data. Load your own data here
    x_raw, y_test = data_helpers.load_data_and_labels(FLAGS.data_dir, task, 'test')
    #y_test = np.argmax(y_test, axis=1)
    #print(y_test)

    # Map data into vocabulary
    #vocab_path = os.path.join(FLAGS.checkpoint_dir, "..", "vocab")
    #vocab_processor = learn.preprocessing.VocabularyProcessor.restore(vocab_path)
    #x_test = np.array(list(vocab_processor.transform(x_raw)))
    x_test, vocab_vector = data_helpers.build_vocabulary(x_raw)
    #np.save('tmp/x_test.data', x_test)
    #x_test = np.load('tmp/x_test.data.npy')
    #vocab_vector = np.load('tmp/vocab_vector.data.npy')

    print("\nEvaluating...\n")

    # Evaluation
    # ==================================================
    print(checkpoint_dir)
    checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir)
    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
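        # Sketch (assumption): the usual TF1 continuation of this evaluation
        # block. The tensor names "input_x", "dropout_keep_prob" and
        # "output/predictions" follow common TextCNN conventions and are not
        # confirmed by this repo.
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            # Restore the saved meta graph and the trained weights
            saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
            saver.restore(sess, checkpoint_file)
            # Look up the input/output tensors by name and run inference
            input_x = graph.get_operation_by_name("input_x").outputs[0]
            dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0]
            predictions = graph.get_operation_by_name("output/predictions").outputs[0]
            all_predictions = sess.run(predictions, {input_x: x_test, dropout_keep_prob: 1.0})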
# gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.333)
run_config = tf.ConfigProto()
run_config.gpu_options.allow_growth = True

# Output directory for models and summaries
out_dir = data_helpers.mkdir_if_not_exist("./runs")

# Load true_sentences and build vocab
true_sentences = data_helpers.read_and_clean_file(FLAGS.true_data_file)
padding_true_sentences = data_helpers.padding_sentences(true_sentences, FLAGS.padding_token, FLAGS.max_sentences_length)
# Question: should the vocabulary be built from the true sentences only, or
# from a full Chinese word dictionary? Here we use the true sentences.
voc, voc_size = data_helpers.build_vocabulary(padding_true_sentences, './runs/vocab')
true_data = np.array(data_helpers.sentence2matrix(true_sentences, FLAGS.max_sentences_length, voc))

# Randomly shuffle data
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(true_sentences)))
true_data_shuffled = true_data[shuffle_indices]
#fake_factors = fake_factor_dist.sample((FLAGS.batch_size, FLAGS.max_sentences_length, FLAGS.embedding_dim))

global_graph = tf.Graph()
with global_graph.as_default():
    sess = tf.Session(graph=global_graph)
    gan_model = GANModel(batch_size=FLAGS.batch_size,
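
# Sketch (assumption): one way to realize the commented-out fake_factors line
# above with TF1's tf.distributions; the standard-normal prior and the name
# fake_factor_dist are illustrative, not taken from this repo.
fake_factor_dist = tf.distributions.Normal(loc=0.0, scale=1.0)  # hypothetical noise prior
fake_factors = fake_factor_dist.sample(
    (FLAGS.batch_size, FLAGS.max_sentences_length, FLAGS.embedding_dim))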
#tasks = ['anger', 'fear', 'joy', 'sadness']
tasks = ['joy', 'sadness']
for task in tasks:
    # Load data
    print("Loading data...")
    x_text, y = data_helpers.load_data_and_labels(FLAGS.data_dir, task, 'train')

    # Build vocabulary
    #max_document_length = max([len(x.split(" ")) for x in x_text])
    #vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
    #x = np.array(list(vocab_processor.fit_transform(x_text)))
    x, vocab_vector = data_helpers.build_vocabulary(x_text)
    #np.save('tmp/x.data', x)
    #np.save('tmp/vocab_vector.data', vocab_vector)
    #x = np.load('tmp/x.data.npy')
    #vocab_vector = np.load('tmp/vocab_vector.data.npy')

    # Randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]

    # Split train/test set
    # TODO: This is very crude, should use cross-validation
    dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
    x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
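    # Sketch (assumption): a scikit-learn split addressing the TODO above,
    # replacing the crude index slice; reusing FLAGS.dev_sample_percentage as
    # the dev fraction is an assumption.
    from sklearn.model_selection import train_test_split
    x_train, x_dev, y_train, y_dev = train_test_split(
        x_shuffled, y_shuffled,
        test_size=FLAGS.dev_sample_percentage, random_state=10)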
                                                   FLAGS.shizheng_data_file,
                                                   FLAGS.tiyu_data_file,
                                                   FLAGS.yule_data_file)
sentences = data_helpers.padding_sentences(x_text, FLAGS.padding_token, max_sentence_len)
print("len(x_text)", len(x_text))
print("len(y)", len(y))

# Build vocabulary
voc = None
vocsize = None
if os.path.exists('./runs/vocab'):
    # When restoring a session, just reload the saved vocabulary
    voc, vocsize = data_helpers.read_vocabulary('./runs/vocab')
else:
    voc, vocsize = data_helpers.build_vocabulary(sentences, './runs/vocab')
x = np.array(data_helpers.sentence2matrix(sentences, max_sentence_len, voc))

# Randomly shuffle data
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]

# Split train/test set
data_len = len(x_shuffled)
x_train, x_dev, y_train, y_dev = train_test_split(x_shuffled, y_shuffled, test_size=FLAGS.dev_per, random_state=42)
print("Total/Train/Dev: {:d}/{:d}/{:d}".format(data_len, len(y_train), len(y_dev)))

# Training
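# Sketch (assumption): a minimal TF1 training loop over the split above. The
# placeholders (input_x, input_y), train_op, and FLAGS.num_epochs are
# hypothetical names, not confirmed by this repo; session config, dropout,
# and summaries are omitted.
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for epoch in range(FLAGS.num_epochs):
        for start in range(0, len(x_train), FLAGS.batch_size):
            batch_x = x_train[start:start + FLAGS.batch_size]
            batch_y = y_train[start:start + FLAGS.batch_size]
            sess.run(train_op, feed_dict={input_x: batch_x, input_y: batch_y})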