def toShakespeare(self):
    """Translate a line of modern text into Shakespearean style.

    The input sentence is read from ``self.modern_text`` and the vocabulary
    location from ``self.vocab_path``.

    Returns:
        string: The translated text, if generated.
    """
    args = load_arguments()
    vocab = Vocabulary(self.vocab_path, args.embedding, args.dim_emb)

    # TF1-style session; allow_growth avoids grabbing all GPU memory up front.
    session_config = tf.ConfigProto()
    session_config.gpu_options.allow_growth = True

    with tf.Session(config=session_config) as sess:
        model = Model(args, vocab)
        model.saver.restore(sess, args.model)

        # Beam search when a beam width > 1 was requested, greedy otherwise.
        if args.beam > 1:
            decoder = beam_search.Decoder(sess, args, vocab, model)
        else:
            decoder = greedy_decoding.Decoder(sess, args, vocab, model)

        batch = get_batch([self.modern_text], [1], vocab.word2id)
        _, transferred = decoder.rewrite(batch)
        return ' '.join(transferred[0])
def transform_text(text):
    """Run style transfer on a single sentence and return both rewrites.

    Args:
        text: A single input sentence (passed straight to ``get_batches``;
            presumably a token list — TODO confirm against get_batches).

    Returns:
        tuple: ``(data0_tsf, data1_tsf)`` — the input transferred toward
        style 1 and toward style 0 respectively, as lists of token lists.
    """
    tf.compat.v1.disable_eager_execution()
    args = load_arguments()

    # Override CLI defaults so the bundled yelp model is always used.
    overrides = vars(args)
    overrides['vocab'] = '../model/yelp.vocab'
    overrides['model'] = '../model/model'
    overrides['load_model'] = True
    overrides['beam'] = 8
    overrides['batch_size'] = 1

    inp = [text]
    vocab = Vocabulary(args.vocab, args.embedding, args.dim_emb)
    print('vocabulary size:', vocab.size)

    config = tf.compat.v1.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.compat.v1.Session(config=config) as sess:
        model = create_model(sess, args, vocab)
        decoder = beam_search.Decoder(sess, args, vocab, model)

        # Feed the same sentence as both the style-0 and style-1 input.
        batches, order0, order1 = get_batches(inp, inp, vocab.word2id,
                                              args.batch_size)

        data0_tsf, data1_tsf = [], []
        for batch in batches:
            rec, tsf = decoder.rewrite(batch)
            # First half of each batch is style 0, second half style 1.
            half = batch['size'] // 2
            print("rec:")
            print(rec)
            print("tsf:")
            print(tsf)
            data0_tsf += tsf[:half]
            data1_tsf += tsf[half:]

        # Undo the ordering applied by get_batches and drop padding entries.
        n0, n1 = len(inp), len(inp)
        data0_tsf = reorder(order0, data0_tsf)[:n0]
        data1_tsf = reorder(order1, data1_tsf)[:n1]
        print(data0_tsf)
        print(data1_tsf)
        # BUG FIX: the original computed the transferred text but discarded
        # it (implicit return None). Return the results so callers can use
        # them; callers that ignored the old None return are unaffected.
        return data0_tsf, data1_tsf
def main():
    """Drive the clustering pipeline.

    Optionally plots extension statistics and converts the dataset, then
    runs schema and/or document clustering for every k in the configured
    range.
    """
    print("ARGUMENTS: ")
    args = load_arguments()
    print("Arguments loaded. ")

    # Clamp the sweep range so it is never empty.
    if args.num_clusters_end < args.num_clusters_start:
        args.num_clusters_end = args.num_clusters_start

    if args.plot_extensions.lower() == 'y':
        extensions.plot_extensions(args.dataset_path, args.num_extensions)
    if args.convert.lower() == 'y':
        converting_utilities.convert(args.dataset_path, args.num_extensions)

    start = args.num_clusters_start
    end = args.num_clusters_end
    print("Clustering for all k: " + str(start) + "<=k<=" + str(end) + "...\n")

    for k in range(start, end + 1):
        if args.cluster_struct.lower() == 'y':
            schema_clustering.runflow(args.dataset_path, k,
                                      args.overwrite_distmat_struct,
                                      args.overwrite_plot_struct,
                                      args.fill_threshold)
        if args.cluster_text.lower() == 'y':
            document_clustering.runflow(k, args.overwrite_tokens_text,
                                        args.overwrite_clusters_text,
                                        args.dataset_path,
                                        args.minibatch_kmeans)
def main():
    """Train or cross-validate the link-prediction model on review data.

    Flow: configure theano, load review sentences and FLIC data, then either
    run k-fold cross-validation in parallel worker processes
    (``args.evaluate_model``) or train a single model on one split.

    NOTE(review): this is Python 2 code — ``dict.values()[0]`` indexing and
    ``random.Random.jumpahead`` do not exist on Python 3.
    """
    # Setting theano environment flags
    theano.config.floatX = 'float64'
    theano.config.optimizer = 'fast_run'
    theano.config.exception_verbosity = 'high'

    args = options.load_arguments()
    print(args)

    # Loading data
    print("Loading Review data.")
    prod_to_sent = load_reviews(args)
    flic = FLIC()
    load_flic_data(args, flic)
    # Sentence-embedding dimensionality read from the first product's first
    # entry (assumes shape (num_sentences, emb_dim) — TODO confirm).
    sent_emb_dim = np.shape(prod_to_sent.values()[0][0])[1]

    # Evaluate model using cross validation
    if args.evaluate_model:
        # If training gets cancelled using the SIGINT exception, the model
        # will still be evaluated on the testsets.
        signal_handler = GracefullExit(parent=True)

        # The following lists store the errors, accuracies and
        # num_samples_average of the k processes (from the k-fold cross
        # validation).
        error_train_l = []
        error_valid_l = []
        error_test_l = []
        accuracy_train_l = []
        accuracy_valid_l = []
        accuracy_test_l = []
        num_samp_train_l = []
        num_samp_valid_l = []
        num_samp_test_l = []

        # The number of runs in the crossvalidation loop (k-fold cross validation)
        k = args.num_cross_validation

        # The products in the testset is disjoint from the products in the
        # training and validation setes.
        if args.testset_has_new_products:
            cross_validation = CrossValidationProducts(k=k, args=args, perc_validation_data= 0.15)
        else:
            cross_validation = CrossValidationLinks(k=k, args=args, perc_validation_data=0.5)

        # Parallelize cross validation runs
        pool = NoDaemonPool(processes=k)
        rand_generator = random.Random()
        rand_generator.jumpahead(random.randint(1, 10000000))

        # A function to store the return values of the k processes (from the
        # k-fold cross validation). Registered as the apply_async callback,
        # which runs in the parent process, so appending to the closed-over
        # lists needs no locking.
        def store_results(return_value):
            error_train, samples_train, accuracy_train, \
            error_valid, samples_valid, accuracy_valid, \
            error_test, samples_test, accuracy_test = return_value

            # Mean number of sentences per drawn sample; x[2] presumably
            # holds the sampled sentences — verify against
            # _train_and_evaluate's return value.
            def average_sample_size(sampled_sents):
                return np.mean([len(samp) for samples in map(lambda x: x[2], sampled_sents) for samp in samples])

            error_train_l.append(error_train)
            num_samp_train_l.append(average_sample_size(samples_train))
            accuracy_train_l.append(accuracy_train)
            error_valid_l.append(error_valid)
            num_samp_valid_l.append(average_sample_size(samples_valid))
            accuracy_valid_l.append(accuracy_valid)
            error_test_l.append(error_test)
            num_samp_test_l.append(average_sample_size(samples_test))
            accuracy_test_l.append(accuracy_test)

        print("Starting Cross-Validation. I am process {}.".format(os.getpid()))
        k_0 = 0
        # Cross validation loop
        for D, D_valid, D_test, D_C, D_C_valid, D_C_test in cross_validation.next_sample():
            rand_generator.jumpahead(random.randint(1, 10000000))
            pool.apply_async(_train_and_evaluate,
                             (D, D_valid, D_test, D_C, D_C_valid, D_C_test,
                              flic, prod_to_sent, args, sent_emb_dim, k_0),
                             callback=store_results)
            k_0 += 1
        pool.close()
        pool.join()
        # Restore stdout (workers appear to have redirected it — confirm).
        sys.stdout = sys.__stdout__

        # Store the results in a Pandas DataFrame.
        store_results_in_DF(args, flic._dim_att, sent_emb_dim,
                            error_train_l, accuracy_train_l, num_samp_train_l,
                            error_valid_l, accuracy_valid_l, num_samp_valid_l,
                            error_test_l, accuracy_test_l, num_samp_test_l)

        # Console output for fast interpretation.
        print("Train error = {}".format(error_train_l))
        print("Mean = {}".format(np.array(error_train_l).mean(axis=0)))
        print("Std = {}".format(np.array(error_train_l).std(axis=0)))
        print("Average sample size = {}".format(np.mean(num_samp_train_l)))
        print("Accuracy = {}\n".format(accuracy_train_l))
        print("Validation error = {}".format(error_valid_l))
        print("Mean = {}".format(np.array(error_valid_l).mean(axis=0)))
        print("Std = {}".format(np.array(error_valid_l).std(axis=0)))
        print("Average sample size = {}".format(np.mean(num_samp_valid_l)))
        print("Accuracy = {}\n".format(accuracy_valid_l))
        print("Test error = {}".format(error_test_l))
        print("Mean = {}".format(np.array(error_test_l).mean(axis=0)))
        print("Std = {}".format(np.array(error_test_l).std(axis=0)))
        print("Average sample size = {}".format(np.mean(num_samp_test_l)))
        print("Accuracy = {}\n".format(accuracy_test_l))
    # Train the data
    else:
        # Loads the data
        print("Load training Data.")
        D, D_C, D_valid, D_C_valid, D_test, D_C_test = load_link_data(args, perc_validation_data=args.perc_validation_data, perc_test_data= args.perc_test_data)
        best_val_error = train_model(args, sent_emb_dim, flic, prod_to_sent, D, D_C, D_valid, D_C_valid)
        print("Best validation error = {}".format(best_val_error))
def online_transfer(neg_lines, write_file='neg.txt', style=0):
    """Rewrite each input sentence in the target style and log the results.

    For every line, writes three lines to ``write_file``: the input tokens,
    the model's reconstruction, and the style transfer.

    NOTE(review): relies on module-level ``vocab``, ``decoder`` and
    ``get_batch`` being defined before this is called — not visible here,
    confirm against the surrounding script.
    """
    with open(write_file, 'w') as f:
        for line in neg_lines:
            y = style
            batch = get_batch([line], [y], vocab.word2id)
            ori, tsf = decoder.rewrite(batch)
            # f.write('input: {}\n'.format(' '.join(line)))
            # f.write('original: {}\n'.format(' '.join(w for w in ori[0])))
            # f.write('transfer: {}\n'.format(' '.join(w for w in tsf[0])))
            f.write('{}\n'.format(' '.join(line)))
            f.write('{}\n'.format(' '.join(w for w in ori[0])))
            f.write('{}\n'.format(' '.join(w for w in tsf[0])))


if __name__ == '__main__':
    args = load_arguments()
    if not os.path.exists(args.model):
        os.system("mkdir -p {}".format(args.model))

    ##### data preparation #####
    # NOTE(review): this script block continues past the end of this chunk.
    if args.train or args.latent_train:
        # Prefer whichever training-path argument is the longer string.
        chosen = args.train if len(args.train) > len(args.latent_train) else \
            args.latent_train
        # train0 = load_sent(chosen + '.0', args.max_train_size)
        # train1 = load_sent(chosen + '.1', args.max_train_size)
        train0 = load_sent(chosen + 'formal', args.max_train_size)
        train1 = load_sent(chosen + 'informal', args.max_train_size)
        print('#sents of training file 0:', len(train0))
# " prec2={:.4f} generator time={:.4f} encoder time={:.4f} total test time={:.4f}\n").format( # r_mse, # r_p1, # r_prec1, # r_prec2, # gen_time, # enc_time, # time.time() - start_rational_time #)) data = str('%.5f' % r_mse) + "\t" + str( '%4.2f' % r_p1) + "\t" + str('%4.4f' % r_prec1) + "\t" + str( '%4.4f' % r_prec2) + "\t" + str('%4.2f' % gen_time) + "\t" + str( '%4.2f' % enc_time) + "\t" + str( '%4.2f' % prec_cal_time) + "\t" + str( '%4.2f' % (time.time() - start_rational_time) ) + "\t" + str(args.sparsity) + "\t" + str( args.coherent) + "\t" + str( args.max_epochs) + "\t" + str( args.cur_epoch) with open(args.graph_data_path, 'a') as g_f: print 'writning to file: ', data g_f.write(data + "\n") if __name__ == "__main__": args = options.load_arguments() main()
    # NOTE(review): fragment of a larger function — the enclosing def (which
    # defines args, max_len, embedding_layer, train_x/train_y, dev_y) is
    # outside this view.
    # Map dev sentences to id sequences, truncated to max_len tokens.
    dev_x = [ embedding_layer.map_to_ids(x)[:max_len] for x in dev_x ]

    if args.test:
        test_x, test_y = myio.read_annotations(args.test)
        test_x = [ embedding_layer.map_to_ids(x)[:max_len] for x in test_x ]

    if args.train:
        model = Model(
                    args = args,
                    embedding_layer = embedding_layer,
                    nclasses = len(train_y[0])
                )
        model.ready()

        #debug_func2 = theano.function(
        #        inputs = [ model.x, model.z ],
        #        outputs = model.generator.logpz
        #    )
        #theano.printing.debugprint(debug_func2)
        #return

        # Dev/test sets are optional; pass None when not requested.
        model.train(
                (train_x, train_y),
                (dev_x, dev_y) if args.dev else None,
                (test_x, test_y) if args.test else None
            )


if __name__=="__main__":
    args = options.load_arguments()
    main()
def main():
    """Sweep cluster counts and report the best-scoring k.

    Optionally plots extension stats and converts the dataset, then runs
    schema clustering and/or text clustering for every k in
    [num_clusters_start, num_clusters_end], tracking the k with the highest
    freqdrop score for each mode.
    """
    print("ARGUMENTS: ")
    args = load_arguments()
    print("Arguments loaded. ")

    # Clamp the sweep range so it is never empty.
    if args.num_clusters_end < args.num_clusters_start:
        args.num_clusters_end = args.num_clusters_start

    if args.plot_extensions.lower() == 'y':
        extensions.plot_extensions(args.dataset_path, args.num_extensions)
    if args.convert.lower() == 'y':
        converting_utilities.convert(args.dataset_path, args.num_extensions,
                                     args.num_processes)

    start = args.num_clusters_start
    end = args.num_clusters_end

    if args.cluster_struct.lower() == 'y':
        max_struct_score = 0
        optimal_num_clusters = start
        print("Clustering structured files for all k: " + str(start) +
              "<=k<=" + str(end) + "...\n")
        for num_clusters in range(start, end + 1):
            scores = schema_clustering.runflow(args.dataset_path,
                                               num_clusters,
                                               args.overwrite_distmat_struct,
                                               args.overwrite_plot_struct,
                                               args.fill_threshold)
            struct_score = scores[0]
            print("Schema clustering with k=" + str(num_clusters) +
                  " yields freqdrop score of " + str(struct_score))
            if struct_score > max_struct_score:
                max_struct_score = struct_score
                optimal_num_clusters = num_clusters
        print("k with highest freqdrop score:", str(optimal_num_clusters))

    if args.cluster_text.lower() == 'y':
        max_text_score = 0
        optimal_num_clusters = start
        allscores = []
        # Tokenize (or overwrite tokens) only on the first iteration; later
        # iterations reuse the cached tokens.
        retokenize = args.overwrite_tokens_text
        print("Clustering text files for all k: " + str(start) + "<=k<=" +
              str(end) + "...\n")
        for num_clusters in range(start, end + 1):
            if num_clusters > start:
                retokenize = "n"
            # BUG FIX: the original passed args.overwrite_tokens_text here,
            # so the retokenize variable (reset to "n" above) was never used
            # and tokens were rebuilt on every iteration.
            sil, frqdrop, text_score = document_clustering.runflow(
                num_clusters,
                retokenize,
                args.overwrite_clusters_text,
                args.dataset_path,
                args.minibatch_kmeans,
                args.num_processes)
            print("Text clustering with k=" + str(num_clusters) +
                  " yields freqdrop score of " + str(frqdrop))
            allscores.append(frqdrop)
            if frqdrop > max_text_score:
                max_text_score = frqdrop
                optimal_num_clusters = num_clusters
        for offset, score in enumerate(allscores):
            print("k=" + str(start + offset) + " cleanliness=" + str(score))
        print("k with highest cleanliness score:", str(optimal_num_clusters))
    # NOTE(review): fragment of a perplexity-evaluation function — the
    # enclosing def (which defines x, sess, model, args) is outside this view.
    batches = get_lm_batches(x, vocab.word2id, args.batch_size)
    tot_loss, n_words = 0, 0
    for batch in batches:
        # Accumulate total LM loss over the batch; dropout=1 disables
        # dropout for evaluation.
        tot_loss += sess.run(model.tot_loss,
            feed_dict={model.batch_size: batch['size'],
                       model.inputs: batch['inputs'],
                       model.targets: batch['targets'],
                       model.weights: batch['weights'],
                       model.dropout: 1})
        n_words += np.sum(batch['weights'])
    # Perplexity = exp(mean per-word loss).
    return np.exp(tot_loss / n_words)


if __name__ == '__main__':
    args = load_arguments()

    if args.train:
        train = load_sent(args.train)
        # Build the vocabulary file from the training data if absent.
        if not os.path.isfile(args.vocab):
            build_vocab(train, args.vocab)

    vocab = Vocabulary(args.vocab, args.embedding, args.dim_emb)
    # Python 2 print statement.
    print 'vocabulary size', vocab.size

    if args.dev:
        dev = load_sent(args.dev)
    if args.test:
        test = load_sent(args.test)
str(test_prediction_scores[k]) + '\n') summary_file.write('\nParameters:\n') for key in options.IMPORTANT_PARAMS: summary_file.write(str(key) + ": " + str(vars(args)[key]) + "\n") print( f'Load Time: {asMinutes(TRAIN_START_TIME-LOAD_START_TIME)}\nTrain Time: {asMinutes(EVAL_START_TIME-TRAIN_START_TIME)}\nEval Time: {asMinutes(time() - EVAL_START_TIME)}\nTotal Time: {asMinutes(time()-LOAD_START_TIME)}' ) def get_user_yesno_answer(question): answer = input(question + '(y/n)') if answer == 'y': return True elif answer == 'n': return False else: print("Please answer 'y' or 'n'") return (get_user_yesno_answer(question)) if __name__ == "__main__": ARGS = options.load_arguments() import torch torch.manual_seed(ARGS.seed) torch.backends.cudnn.deterministic = True torch.backends.cudnn.benchmark = False os.environ['CUDA_VISIBLE_DEVICES'] = ARGS.cuda_visible_devices import numpy as np np.random.seed(ARGS.seed) main(ARGS)