def prepare(path, suffix=''):
    data0 = load_sent(path + '.0' + suffix)
    data1 = load_sent(path + '.1' + suffix)
    x = data0 + data1
    y = [0] * len(data0) + [1] * len(data1)
    z = sorted(zip(x, y), key=lambda i: len(i[0]))
    return zip(*z)
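# `prepare` above relies on a `load_sent` helper that is not shown in these
# snippets. Below is a minimal sketch of what it is presumed to do (read one
# whitespace-tokenized sentence per line, optionally capped at `max_size`
# sentences); the name is taken from the snippets, but this signature and body
# are assumptions and may differ from the original codebase:
def load_sent(path, max_size=-1):
    """Hypothetical reader: returns a list of token lists, one per line."""
    sents = []
    with open(path) as f:
        for line in f:
            if 0 <= max_size <= len(sents):
                break
            sents.append(line.split())
    return sents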
def prepare(path, suffix=''):
    data0 = load_sent(path + 'formal' + suffix)
    data1 = load_sent(path + 'informal' + suffix)
    x = data0 + data1
    y = [0] * len(data0) + [1] * len(data1)
    z = sorted(zip(x, y), key=lambda i: len(i[0]))
    return zip(*z)
def prepare(path, suffix='', default_data=False):
    if default_data:
        data0 = load_sent(path + '.0' + suffix)
        data1 = load_sent(path + '.1' + suffix)
    else:
        data0 = load_sent('../data/anto2/sentiment.test.anto2.1')
        data1 = load_sent('../data/anto2/sentiment.test.anto2.0')
        # data0 = load_sent('../data/runtime/aa/epoch20.1.tsf')
        # data1 = load_sent('../data/runtime/aa/epoch20.0.tsf')
    x = data0 + data1
    y = [0] * len(data0) + [1] * len(data1)
    z = sorted(zip(x, y), key=lambda i: len(i[0]))
    return zip(*z)
if __name__ == '__main__':
    args = load_arguments()

    if not os.path.exists(args.model):
        os.system("mkdir -p {}".format(args.model))

    ##### data preparation #####
    if args.train or args.latent_train:
        chosen = args.train if len(args.train) > len(args.latent_train) else \
            args.latent_train
        # train0 = load_sent(chosen + '.0', args.max_train_size)
        # train1 = load_sent(chosen + '.1', args.max_train_size)
        train0 = load_sent(chosen + 'formal', args.max_train_size)
        train1 = load_sent(chosen + 'informal', args.max_train_size)
        print('#sents of training file 0:', len(train0))
        print('#sents of training file 1:', len(train1))

        if not os.path.isfile(args.vocab):
            build_vocab(train0 + train1, args.vocab)

    vocab = Vocabulary(args.vocab, args.embedding, args.dim_emb)
    print('vocabulary size:', vocab.size)

    if args.dev or args.latent_dev:
        chosen = args.dev if len(args.dev) > len(args.latent_dev) else \
            args.latent_dev
        dev0 = load_sent(chosen + 'formal')
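# `build_vocab` is called throughout these scripts but its definition is not
# included. A minimal sketch of the presumed behavior (count the tokens in the
# training sentences and write one token per line to the vocab file); the
# `min_count` parameter and the exact output format are assumptions, and the
# real implementation may reserve special symbols or apply other cutoffs:
from collections import Counter

def build_vocab(sents, vocab_path, min_count=1):
    """Hypothetical builder: writes each token seen in `sents` to `vocab_path`."""
    counts = Counter(word for sent in sents for word in sent)
    with open(vocab_path, 'w') as f:
        for word, count in counts.most_common():
            if count >= min_count:
                f.write(word + '\n')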
def run_model(args):
    time = datetime.now().timestamp()
    train_filename = "sarc/sarc.train"
    sp_model_path = "tmp/sarc_bpe"
    sp = spm.SentencePieceProcessor()

    ##### data preparation #####
    if args.train:
        logger = utils.init_logging(args, time)
        print("args: ", args)
        logger.info("args: " + str(args))
        no_of_epochs = args.max_epochs
        train0 = load_sent(args.train + '.0', args.max_train_size)
        train1 = load_sent(args.train + '.1', args.max_train_size)
        # train0, train1 = load_sent_csvgz(args.train, args.max_train_size)

        # if not os.path.isfile(train_filename):
        with open(train_filename, "w") as f:
            for sent in train0 + train1:
                f.write(" ".join(sent) + "\n")
        # if not os.path.isfile(train_filename+".1"):
        #     with open(train_filename+".1", "w") as f:
        #         for sent in train1:
        #             f.write(" ".join(sent)+"\n")

        print('#sents of training file 0:', len(train0))
        print('#sents of training file 1:', len(train1))
        logger.info('#sents of training file 0: ' + str(len(train0)))
        logger.info('#sents of training file 1: ' + str(len(train1)))

    # if not os.path.isfile(args.vocab):
    #     build_vocab(train0 + train1, args.vocab)
    # if not os.path.isfile(sp_model_path+".model") or not os.path.isfile(sp_model_path+".vocab"):
    if args.train:
        spm.SentencePieceTrainer.Train('--input=' + train_filename + ' --model_prefix=' + sp_model_path + ' \
            --vocab_size=10000 --hard_vocab_limit=false --bos_piece=<go> --eos_piece=<eos> --pad_id=0 \
            --bos_id=1 --eos_id=2 --unk_id=3 --user_defined_symbols=<url>,<at>,<hashtag>')
    sp.Load(sp_model_path + ".model")
    # vocab = Vocabulary(args.vocab, args.embedding, args.dim_emb)

    dev0 = []
    dev1 = []
    if args.dev:
        dev0 = load_sent(args.dev + '.0')
        dev1 = load_sent(args.dev + '.1')

    if args.predict:
        if args.model_path:
            # logger.info("Predicting a sample input\n---------------------\n")
            model = torch.load(args.model_path)
            model.training = False
            output = utils.predict(model, args.predict, args.target_sentiment, sp, args.beam)
            # output = output.replace(" ", "")
            # output_new = ""
            # # output = re.sub(r"(\s\s+)", " ", output)
            # for val in output:
            #     if val == " ":
            #         output_new += " "
            #     elif val == " ":
            #         pass
            #     else:
            #         output_new += val
            print(f"Input given: {args.predict} \nTarget sentiment: {args.target_sentiment} \nTranslated output: {output}")
            # logger.info(f"Input given: {args.predict} \nTarget sentiment: {args.target_sentiment} \nTranslated output: {output}")

    if args.test:
        file0 = open(args.test + ".0", "r")
        file1 = open(args.test + ".1", "r")
        saves_path = os.path.join(args.saves_path, utils.get_filename(args, time, "model"))
        Path(saves_path).mkdir(parents=True, exist_ok=True)
        out_file_0 = open(os.path.join(saves_path, "test_outputs_neg_to_pos"), "w")
        out_file_1 = open(os.path.join(saves_path, "test_outputs_pos_to_neg"), "w")
        model = torch.load(args.model_path)
        model.training = False
        test_neg = file0.readlines()
        for line in test_neg:
            output = utils.predict(model, line, 1, sp, args.beam)
            out_file_0.write(output + "\n")
        print("second")
        test_pos = file1.readlines()
        for line in test_pos:
            output = utils.predict(model, line, 0, sp, args.beam)
            out_file_1.write(output + "\n")
        # test0 = load_sent(args.test + '.0')
        # test1 = load_sent(args.test + '.1')
        # if args.model_path:
        #     saves_path = os.path.join(args.saves_path, utils.get_filename(args, time, "model"))
        #     Path(saves_path).mkdir(parents=True, exist_ok=True)
        #     model = torch.load(args.model_path)
        #     model.training = False
        #     batches0, batches1, _, _ = utils.get_batches(test0, test1, model.vocab.word2id, model.args.batch_size)
        #     output_file_0 = open(os.path.join(saves_path, "test_outputs_neg_to_pos"), "w")
        #     output_file_1 = open(os.path.join(saves_path, "test_outputs_pos_to_neg"), "w")
        #     for batch0, batch1 in zip(batches0, batches1):
        #         batch0 = batch0["enc_inputs"]
        #         batch1 = batch1["enc_inputs"]
        #         test_outputs_0 = utils.predict_batch(model, batch0, sentiment=1, beam_size=args.beam, plain_format=True)
        #         test_outputs_1 = utils.predict_batch(model, batch1, sentiment=0, beam_size=args.beam, plain_format=True)
        #         output_file_0.write('\n'.join(test_outputs_0) + '\n')
        #         output_file_1.write('\n'.join(test_outputs_1) + '\n')

    if args.train:
        summ_filename = 'runs/cross-alignment/' + utils.get_filename(args, time, "summary")
        writer = SummaryWriter(summ_filename)
        model = get_model(args, logger, sp)
        model.train_max_epochs(args, train0, train1, dev0, dev1, no_of_epochs,
                               writer, time, sp, save_epochs_flag=True)
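# The `sp` processor trained above is handed to `utils.predict`, whose code is
# not part of these snippets. A small usage sketch of the standard
# SentencePiece calls such a predict step would presumably rely on; the model
# file path matches the one produced above, and the sample sentence is only an
# illustration:
import sentencepiece as spm

sp = spm.SentencePieceProcessor()
sp.Load("tmp/sarc_bpe.model")
pieces = sp.EncodeAsPieces("this movie was surprisingly good")  # subword pieces
text = sp.DecodePieces(pieces)                                  # back to a plain string
print(pieces, text)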
    model = Model(args, vocab)
    if args.load_model:
        print 'Loading model from', args.model
        model.saver.restore(sess, args.model)
    else:
        print 'Creating model with fresh parameters.'
        sess.run(tf.global_variables_initializer())
    return model

if __name__ == '__main__':
    args = load_arguments()

    ##### data preparation #####
    if args.train:
        train0 = load_sent(args.train + '.0', args.max_train_size)
        train1 = load_sent(args.train + '.1', args.max_train_size)
        print '#sents of training file 0:', len(train0)
        print '#sents of training file 1:', len(train1)

        if not os.path.isfile(args.vocab):
            build_vocab(train0 + train1, args.vocab)

    vocab = Vocabulary(args.vocab, args.embedding, args.dim_emb)
    print 'vocabulary size:', vocab.size

    if args.dev:
        dev0 = load_sent(args.dev + '.0')
        dev1 = load_sent(args.dev + '.1')

    if args.test:
                       model.batch_size: batch['size'],
                       model.inputs: batch['inputs'],
                       model.targets: batch['targets'],
                       model.weights: batch['weights'],
                       model.dropout: 1})
        n_words += np.sum(batch['weights'])

    return np.exp(tot_loss / n_words)

if __name__ == '__main__':
    args = load_arguments()

    if args.train:
        train = load_sent(args.train)

        if not os.path.isfile(args.vocab):
            build_vocab(train, args.vocab)

    vocab = Vocabulary(args.vocab, args.embedding, args.dim_emb)
    print('vocabulary size', vocab.size)

    if args.dev:
        dev = load_sent(args.dev)

    if args.test:
        test = load_sent(args.test)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    if args.load_model:
        print 'Loading model from', args.model
        model.saver.restore(sess, args.model)
    else:
        print 'Creating model with fresh parameters.'
        sess.run(tf.global_variables_initializer())
    return model

if __name__ == '__main__':
    args = load_arguments()

    ##### data preparation #####
    if args.train:
        # KS train0 = load_sent(args.train + '.0', args.max_train_size)
        # KS train1 = load_sent(args.train + '.1', args.max_train_size)
        train0 = load_sent('../data/st/train.original', args.max_train_size)
        train1 = load_sent('../data/st/train.modern', args.max_train_size)
        print '#sents of training file 0:', len(train0)
        print '#sents of training file 1:', len(train1)

        dev0 = load_sent('../data/st/dev.original')
        dev1 = load_sent('../data/st/dev.modern')
        print '#sents of dev file 0:', len(dev0)
        print '#sents of dev file 1:', len(dev1)

        test0 = load_sent('../data/st/test.original')
        test1 = load_sent('../data/st/test.modern')
        print '#sents of test file 0:', len(test0)
        print '#sents of test file 1:', len(test1)

        if not os.path.isfile(args.vocab):
def prepare_test(path):
    x = load_sent(path)
    y = [1] * len(x)
    z = sorted(zip(x, y), key=lambda i: len(i[0]))
    return zip(*z)
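# Usage sketch for `prepare` / `prepare_test`: both return the length-sorted
# sentences and their style labels zipped together, so callers unpack the
# result into two parallel sequences. The file path below is a placeholder;
# note that under Python 3 `zip(*z)` is an iterator that can only be consumed
# once, so it is materialized with `list` here:
test_x, test_y = map(list, prepare_test('data/test.txt'))
print(len(test_x), 'sentences,', sum(test_y), 'labeled 1')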
    model = Model(args, vocab)
    if args.load_model:
        print 'Loading model from', args.model
        model.saver.restore(sess, args.model)
    else:
        print 'Creating model with fresh parameters.'
        sess.run(tf.global_variables_initializer())
    return model

if __name__ == '__main__':
    args = load_arguments()

    ##### data preparation #####
    if args.train:
        train0 = load_sent(args.train + '.0', args.max_train_size)
        train0_n = load_sent(args.train + '.noised1.0', args.max_train_size)
        train1 = load_sent(args.train + '.1', args.max_train_size)
        train1_n = load_sent(args.train + '.noised1.1', args.max_train_size)
        print '#sents of training file 0:', len(train0)
        print '#sents of training file 1:', len(train1)

        if not os.path.isfile(args.vocab):
            build_vocab(train0 + train1, args.vocab)

    vocab = Vocabulary(args.vocab, args.embedding, args.dim_emb)
    print 'vocabulary size:', vocab.size

    if args.dev:
        dev0 = load_sent(args.dev + '.0')
        dev0_n = load_sent(args.dev + '.noised1.0')
def run_model(args):
    time = datetime.now().timestamp()

    ##### data preparation #####
    if args.train:
        logger, saves_dir = utils.init_logging(args, time)
        print("args: ", args)
        logger.info("args: " + str(args))
        no_of_epochs = args.max_epochs
        train0 = load_sent(args.train + '.0', args.max_train_size, args.max_seq_length, args.sentence_flag)
        train1 = load_sent(args.train + '.1', args.max_train_size, args.max_seq_length, args.sentence_flag)
        print('#sents of training file 0:', len(train0))
        print('#sents of training file 1:', len(train1))
        logger.info('#sents of training file 0: ' + str(len(train0)))
        logger.info('#sents of training file 1: ' + str(len(train1)))

        # build vocab for every run
        if not os.path.isfile(args.vocab):
            build_vocab(train0 + train1, args.vocab)
        vocab = Vocabulary(args.vocab, args.embedding, args.dim_emb)

    dev0 = []
    dev1 = []
    if args.dev:
        dev0 = load_sent(args.dev + '.0', -1, args.max_seq_length, args.sentence_flag)
        dev1 = load_sent(args.dev + '.1', -1, args.max_seq_length, args.sentence_flag)

    if args.predict:
        if args.model_path:
            # logger.info("Predicting a sample input\n---------------------\n")
            device = torch.device("cuda:" + str(args.cuda_device) if torch.cuda.is_available() else "cpu")
            model = torch.load(args.model_path, map_location=device)
            model.training = False
            output = utils.predict(model, args.predict, args.target_sentiment, args.beam)
            print(f"Input given: {args.predict} \nTarget sentiment: {args.target_sentiment} \nTranslated output: {output}")
            # logger.info(f"Input given: {args.predict} \nTarget sentiment: {args.target_sentiment} \nTranslated output: {output}")

    if args.test:
        logger, saves_dir = utils.init_logging(args, time)
        print("args: ", args)
        logger.info("args: " + str(args))
        device = torch.device("cuda:" + str(args.cuda_device) if torch.cuda.is_available() else "cpu")
        file0 = open(args.test + ".0", "r")
        file1 = open(args.test + ".1", "r")
        saves_path = os.path.join(args.saves_path, utils.get_filename(args, time, ""))
        Path(saves_path).mkdir(parents=True, exist_ok=True)
        out_file_0 = open(os.path.join(saves_path, "test_outputs_neg_to_pos"), "w")
        out_file_1 = open(os.path.join(saves_path, "test_outputs_pos_to_neg"), "w")
        model = torch.load(args.model_path, map_location=device)
        model.training = False
        for line in file0:
            line = line.strip("\n")
            output = utils.predict(model, line, 1, args.beam)
            out_file_0.write(output + "\n")
        for line in file1:
            line = line.strip("\n")
            output = utils.predict(model, line, 0, args.beam)
            out_file_1.write(output + "\n")

    if args.train:
        summ_filename = 'runs/cross-alignment/' + utils.get_filename(args, time, "summary")
        writer = SummaryWriter(summ_filename)
        model = get_model(args, vocab, logger)
        model.train_max_epochs(saves_dir, args, train0, train1, dev0, dev1, vocab,
                               no_of_epochs, writer, time, save_epochs_flag=True)
        f.write('{}\n'.format(' '.join(line)))
        f.write('{}\n'.format(' '.join(w for w in ori[0])))
        f.write('{}\n'.format(' '.join(w for w in tsf[0])))

if __name__ == '__main__':
    args = load_arguments()

    if not os.path.exists(args.model):
        os.system("mkdir -p {}".format(args.model))

    ##### data preparation #####
    if args.train or args.latent_train:
        chosen = args.train if len(args.train) > len(args.latent_train) else \
            args.latent_train
        train0 = load_sent(chosen + '.0', args.max_train_size)
        train1 = load_sent(chosen + '.1', args.max_train_size)
        print('#sents of training file 0:', len(train0))
        print('#sents of training file 1:', len(train1))

        if not os.path.isfile(args.vocab):
            build_vocab(train0 + train1, args.vocab)

    vocab = Vocabulary(args.vocab, args.embedding, args.dim_emb)
    print('vocabulary size:', vocab.size)

    if args.dev or args.latent_dev:
        chosen = args.dev if len(args.dev) > len(args.latent_dev) else \
            args.latent_dev
        dev0 = load_sent(chosen + '.0')
        dev1 = load_sent(chosen + '.1')
        train0 = load_sent_lines(args.train + '.0', args.train_start, args.train_end)
        train1 = load_sent_lines(args.train + '.1', args.train_start, args.train_end)
        print '#sents of training file 0:', len(train0)
        print '#sents of training file 1:', len(train1)

        if not os.path.isfile(args.vocab):
            build_vocab(train0 + train1, args.vocab)

    vocab = Vocabulary(args.vocab, args.embedding, args.dim_emb)
    print 'vocabulary size:', vocab.size

    if args.dev:
        dev0 = load_sent(args.dev + '.0')
        dev1 = load_sent(args.dev + '.1')

    if args.test:
        test0 = load_sent(args.test + '.0')
        test1 = load_sent(args.test + '.1')

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        model = create_model(sess, args, vocab)

        if args.beam > 1:
            decoder = beam_search.Decoder(sess, args, vocab, model)
        else:
            decoder = greedy_decoding.Decoder(sess, args, vocab, model)
    for batch in batches:
        tot_loss += sess.run(model.tot_loss,
                             feed_dict={model.batch_size: batch['size'],
                                        model.inputs: batch['inputs'],
                                        model.targets: batch['targets'],
                                        model.weights: batch['weights'],
                                        model.dropout: 1})
        n_words += np.sum(batch['weights'])

    return np.exp(tot_loss / n_words)

if __name__ == '__main__':
    args = load_arguments()

    if args.train:
        train = load_sent(args.train)

        if not os.path.isfile(args.vocab):
            build_vocab(train, args.vocab)

    vocab = Vocabulary(args.vocab, args.embedding, args.dim_emb)
    print 'vocabulary size', vocab.size

    if args.dev:
        dev = load_sent(args.dev)

    if args.test:
        test = load_sent(args.test)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
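# The evaluation fragment above computes corpus-level perplexity as
# exp(summed token loss / number of tokens). A tiny self-contained
# illustration of that arithmetic with made-up batch sums (no TensorFlow
# session involved):
import numpy as np

tot_loss = 0.0
n_words = 0
for batch_loss, batch_words in [(120.0, 40), (95.0, 35)]:  # fabricated example sums
    tot_loss += batch_loss
    n_words += batch_words
perplexity = np.exp(tot_loss / n_words)  # exp(215 / 75) ~= 17.6
print('perplexity:', perplexity)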
def create_model(sess, args, vocab):
    model = Model(args, vocab)
    if args.load_model:
        print 'Loading model from', args.model
        model.saver.restore(sess, args.model)
    else:
        print 'Creating model with fresh parameters.'
        sess.run(tf.global_variables_initializer())
    return model

if __name__ == '__main__':
    args = load_arguments()

    ##### data preparation #####
    if args.train:
        train0 = load_sent(args.train + '.0', args.max_train_size)
        train1 = load_sent(args.train + '.1', args.max_train_size)
        print '#sents of training file 0:', len(train0)
        print '#sents of training file 1:', len(train1)

        if not os.path.isfile(args.vocab):
            build_vocab(train0 + train1, args.vocab)

    vocab = Vocabulary(args.vocab, args.embedding, args.dim_emb)
    print 'vocabulary size:', vocab.size

    if args.dev:
        dev0 = load_sent(args.dev + '.0')
        dev1 = load_sent(args.dev + '.1')

    if args.test:
        print 'Loading model from', args.model
        model.saver.restore(sess, args.model)
    else:
        print 'Creating model with fresh parameters.'
        sess.run(tf.global_variables_initializer())
    return model

if __name__ == '__main__':
    args = load_arguments()

    ##### data preparation #####
    if args.train:
        # 0 is the starting style!
        train0 = load_sent(args.train + '.0', args.max_train_size)
        train1 = load_sent(args.train + '.1', args.max_train_size)
        train2 = load_sent(args.train + '.2', args.max_train_size)
        train3 = load_sent(args.train + '.3', args.max_train_size)
        train4 = load_sent(args.train + '.4', args.max_train_size)
        train5 = load_sent(args.train + '.5', args.max_train_size)
        print '#sents of training file 0:', len(train0)
        print '#sents of training file 1:', len(train1)
        print '#sents of training file 2:', len(train2)
        print '#sents of training file 3:', len(train3)
        print '#sents of training file 4:', len(train4)
        print '#sents of training file 5:', len(train5)
        # loaded all three datasets here. Train once with 0-1 and once with 0-2
        print("=====got here training=====")