def init_real_trainng(self, data_loc=None):
    from utils.text_process import text_precess, text_to_code
    from utils.text_process import get_tokenlized, get_word_list, get_dict
    if data_loc is None:
        data_loc = 'data/image_coco.txt'
    self.sequence_length, self.vocab_size = text_precess(data_loc)
    generator = Generator(num_vocabulary=self.vocab_size, batch_size=self.batch_size,
                          emb_dim=self.emb_dim, hidden_dim=self.hidden_dim,
                          sequence_length=self.sequence_length, start_token=self.start_token)
    self.set_generator(generator)
    gen_dataloader = DataLoader(batch_size=self.batch_size, seq_length=self.sequence_length)
    oracle_dataloader = None
    dis_dataloader = None
    self.set_data_loader(gen_loader=gen_dataloader, dis_loader=dis_dataloader,
                         oracle_loader=oracle_dataloader)
    tokens = get_tokenlized(data_loc)
    word_set = get_word_list(tokens)
    [word_index_dict, index_word_dict] = get_dict(word_set)
    with open(self.oracle_file, 'w') as outfile:
        outfile.write(text_to_code(tokens, word_index_dict, self.sequence_length))
    return word_index_dict, index_word_dict
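# Note: each of these init routines encodes the tokenized corpus into self.oracle_file via
# utils.text_process.text_to_code, whose implementation is not shown in this excerpt. The sketch
# below is a hypothetical stand-in (token-to-index lookup plus padding to sequence_length); the
# real helper may differ, e.g. in its choice of padding index.
def text_to_code_sketch(tokens, word_index_dict, seq_len):
    pad_code = str(len(word_index_dict))  # assumed padding index
    lines = []
    for sentence in tokens:  # tokens: list of token lists, one per sentence
        codes = [str(word_index_dict[word]) for word in sentence[:seq_len]]
        codes += [pad_code] * (seq_len - len(codes))
        lines.append(' '.join(codes))
    return '\n'.join(lines) + '\n'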
def main():
    args = parser.parse_args()
    pp.pprint(vars(args))
    config = vars(args)

    model_path = resources_path(os.path.join('trained_models', config['model_name']))
    input_path = resources_path(os.path.join('inference_data', config['input_name']))
    data_file = resources_path(args.data_dir, '{}.txt'.format(args.dataset))
    sample_dir = resources_path(config['sample_dir'])
    oracle_file = os.path.join(sample_dir, 'oracle_{}.txt'.format(args.dataset))

    if args.dataset == 'emnlp_news':
        data_file, lda_file = create_subsample_data_file(data_file)
    else:
        lda_file = data_file

    seq_len, vocab_size, word_index_dict, index_word_dict = text_precess(data_file, oracle_file=oracle_file)
    print(index_word_dict)
    config['seq_len'] = seq_len
    config['vocab_size'] = vocab_size
    print('seq_len: %d, vocab_size: %d' % (seq_len, vocab_size))

    if config['topic']:
        topic_number = config['topic_number']
        oracle_loader = RealDataTopicLoader(args.batch_size, args.seq_len)
        oracle_loader.set_dataset(args.dataset)
        oracle_loader.topic_num = topic_number
        oracle_loader.set_dictionaries(word_index_dict, index_word_dict)
        oracle_loader.get_LDA(word_index_dict, index_word_dict, data_file)
        print(oracle_loader.model_index_word_dict)
        inference_main(oracle_loader, config, model_path, input_path)
def init_real_trainng(self, data_loc=None):
    from utils.text_process import text_precess, text_to_code
    from utils.text_process import get_tokenlized, get_word_list, get_dict
    if data_loc is None:
        data_loc = 'data/image_coco.txt'
    self.sequence_length, self.vocab_size = text_precess(data_loc)
    generator = Generator(num_vocabulary=self.vocab_size, batch_size=self.batch_size,
                          emb_dim=self.emb_dim, hidden_dim=self.hidden_dim,
                          sequence_length=self.sequence_length, start_token=self.start_token)
    self.set_generator(generator)
    discriminator = Discriminator(sequence_length=self.sequence_length, num_classes=2,
                                  vocab_size=self.vocab_size, emd_dim=self.emb_dim,
                                  filter_sizes=self.filter_size, num_filters=self.num_filters,
                                  l2_reg_lambda=self.l2_reg_lambda)
    self.set_discriminator(discriminator)
    gen_dataloader = DataLoader(batch_size=self.batch_size, seq_length=self.sequence_length)
    oracle_dataloader = None
    dis_dataloader = DisDataloader(batch_size=self.batch_size, seq_length=self.sequence_length)
    self.set_data_loader(gen_loader=gen_dataloader, dis_loader=dis_dataloader,
                         oracle_loader=oracle_dataloader)
    tokens = get_tokenlized(data_loc)
    word_set = get_word_list(tokens)
    [word_index_dict, index_word_dict] = get_dict(word_set)
    with open(self.oracle_file, 'w') as outfile:
        outfile.write(text_to_code(tokens, word_index_dict, self.sequence_length))
    return word_index_dict, index_word_dict
def init_real_trainng(self, data_loc=None):
    from utils.text_process import text_precess, text_to_code
    from utils.text_process import get_tokenlized, get_word_list, get_dict
    # from utils.text_process import get_dict
    if data_loc is None:
        data_loc = 'data/image_coco.txt'
    self.sequence_length, self.vocab_size = text_precess(data_loc)
    g_embeddings = tf.Variable(tf.random_normal(shape=[self.vocab_size, self.emb_dim], stddev=0.1))
    discriminator = Discriminator(sequence_length=self.sequence_length, num_classes=2,
                                  emd_dim=self.emb_dim, filter_sizes=self.filter_size,
                                  num_filters=self.num_filters, g_embeddings=g_embeddings,
                                  l2_reg_lambda=self.l2_reg_lambda)
    self.set_discriminator(discriminator)
    generator = Generator(num_vocabulary=self.vocab_size, batch_size=self.batch_size,
                          emb_dim=self.emb_dim, hidden_dim=self.hidden_dim,
                          sequence_length=self.sequence_length, g_embeddings=g_embeddings,
                          discriminator=discriminator, start_token=self.start_token)
    self.set_generator(generator)
    gen_dataloader = DataLoader(batch_size=self.batch_size, seq_length=self.sequence_length)
    oracle_dataloader = None
    dis_dataloader = DisDataloader(batch_size=self.batch_size, seq_length=self.sequence_length)
    self.set_data_loader(gen_loader=gen_dataloader, dis_loader=dis_dataloader,
                         oracle_loader=oracle_dataloader)
    tokens = get_tokenlized(data_loc)
    word_set = get_word_list(tokens)
    [word_index_dict, index_word_dict] = get_dict(word_set)
    with open(self.oracle_file, 'w') as outfile:
        outfile.write(text_to_code(tokens, word_index_dict, self.sequence_length))
    return word_index_dict, index_word_dict
def init_real_trainng(self, data_loc=None):
    from utils.text_process import text_precess, text_to_code
    from utils.text_process import get_tokenlized, get_word_list, get_dict
    if data_loc is None:
        data_loc = 'data/image_coco.txt'
    self.sequence_length, self.vocab_size = text_precess(data_loc)
    goal_out_size = sum(self.num_filters)
    discriminator = Discriminator(sequence_length=self.sequence_length, num_classes=2,
                                  vocab_size=self.vocab_size, dis_emb_dim=self.dis_embedding_dim,
                                  filter_sizes=self.filter_size, num_filters=self.num_filters,
                                  batch_size=self.batch_size, hidden_dim=self.hidden_dim,
                                  start_token=self.start_token, goal_out_size=goal_out_size,
                                  step_size=4, l2_reg_lambda=self.l2_reg_lambda)
    self.set_discriminator(discriminator)
    generator = Generator(num_classes=2, num_vocabulary=self.vocab_size, batch_size=self.batch_size,
                          emb_dim=self.emb_dim, dis_emb_dim=self.dis_embedding_dim,
                          goal_size=self.goal_size, hidden_dim=self.hidden_dim,
                          sequence_length=self.sequence_length, filter_sizes=self.filter_size,
                          start_token=self.start_token, num_filters=self.num_filters,
                          goal_out_size=goal_out_size, D_model=discriminator, step_size=4)
    self.set_generator(generator)
    gen_dataloader = DataLoader(batch_size=self.batch_size, seq_length=self.sequence_length)
    oracle_dataloader = None
    dis_dataloader = DisDataloader(batch_size=self.batch_size, seq_length=self.sequence_length)
    self.set_data_loader(gen_loader=gen_dataloader, dis_loader=dis_dataloader,
                         oracle_loader=oracle_dataloader)
    tokens = get_tokenlized(data_loc)
    word_set = get_word_list(tokens)
    [word_index_dict, index_word_dict] = get_dict(word_set)
    with open(self.oracle_file, 'w') as outfile:
        outfile.write(text_to_code(tokens, word_index_dict, self.sequence_length))
    return word_index_dict, index_word_dict
def main():
    args = parser.parse_args()
    pp.pprint(vars(args))
    config = vars(args)

    # train with different datasets
    if args.dataset == 'oracle':
        oracle_model = OracleLstm(num_vocabulary=args.vocab_size, batch_size=args.batch_size,
                                  emb_dim=args.gen_emb_dim, hidden_dim=args.hidden_dim,
                                  sequence_length=args.seq_len, start_token=args.start_token)
        oracle_loader = OracleDataLoader(args.batch_size, args.seq_len)
        gen_loader = OracleDataLoader(args.batch_size, args.seq_len)

        generator = models.get_generator(args.g_architecture, vocab_size=args.vocab_size,
                                         batch_size=args.batch_size, seq_len=args.seq_len,
                                         gen_emb_dim=args.gen_emb_dim, mem_slots=args.mem_slots,
                                         head_size=args.head_size, num_heads=args.num_heads,
                                         hidden_dim=args.hidden_dim, start_token=args.start_token)
        discriminator = models.get_discriminator(args.d_architecture, batch_size=args.batch_size,
                                                 seq_len=args.seq_len, vocab_size=args.vocab_size,
                                                 dis_emb_dim=args.dis_emb_dim, num_rep=args.num_rep,
                                                 sn=args.sn)
        oracle_train(generator, discriminator, oracle_model, oracle_loader, gen_loader, config)

    elif args.dataset in ['image_coco', 'emnlp_news']:
        data_file = os.path.join(args.data_dir, '{}.txt'.format(args.dataset))
        seq_len, vocab_size = text_precess(data_file)
        config['seq_len'] = seq_len  # override the sequence length
        config['vocab_size'] = vocab_size
        print('seq_len: %d, vocab_size: %d' % (seq_len, vocab_size))

        oracle_loader = RealDataLoader(args.batch_size, args.seq_len)

        generator = models.get_generator(args.g_architecture, vocab_size=vocab_size,
                                         batch_size=args.batch_size, seq_len=seq_len,
                                         gen_emb_dim=args.gen_emb_dim, mem_slots=args.mem_slots,
                                         head_size=args.head_size, num_heads=args.num_heads,
                                         hidden_dim=args.hidden_dim, start_token=args.start_token)
        discriminator = models.get_discriminator(args.d_architecture, batch_size=args.batch_size,
                                                 seq_len=seq_len, vocab_size=vocab_size,
                                                 dis_emb_dim=args.dis_emb_dim, num_rep=args.num_rep,
                                                 sn=args.sn)
        f_classifier = models.get_classifier(args.f_architecture, scope="f_classifier",
                                             batch_size=args.batch_size, seq_len=seq_len,
                                             vocab_size=vocab_size, dis_emb_dim=args.f_emb_dim,
                                             num_rep=args.num_rep, sn=args.sn)
        real_train(generator, discriminator, f_classifier, oracle_loader, config)
    else:
        raise NotImplementedError('{}: unknown dataset!'.format(args.dataset))
def init_real_trainng(self, data_loc=None):
    from utils.text_process import text_precess, text_to_code
    from utils.text_process import get_tokenlized, get_word_list, get_dict
    if data_loc is None:
        data_loc = 'data/image_coco.txt'
    self.sequence_length, self.vocab_size = text_precess(data_loc)
    generator = Generator(num_vocabulary=self.vocab_size, batch_size=self.batch_size,
                          emb_dim=self.emb_dim, hidden_dim=self.hidden_dim,
                          sequence_length=self.sequence_length, start_token=self.start_token)
    self.set_generator(generator)
    discriminator = Discriminator(sequence_length=self.sequence_length, num_classes=2,
                                  vocab_size=self.vocab_size, emd_dim=self.emb_dim,
                                  filter_sizes=self.filter_size, num_filters=self.num_filters,
                                  l2_reg_lambda=self.l2_reg_lambda)
    self.set_discriminator(discriminator)
    # create the data loaders
    gen_dataloader = DataLoader(batch_size=self.batch_size, seq_length=self.sequence_length)
    # for real training, the "oracle" data is the real-world text itself
    oracle_dataloader = None
    dis_dataloader = DisDataloader(batch_size=self.batch_size, seq_length=self.sequence_length)
    # the data pipeline is wired up here
    self.set_data_loader(gen_loader=gen_dataloader, dis_loader=dis_dataloader,
                         oracle_loader=oracle_dataloader)
    tokens = get_tokenlized(data_loc)
    word_set = get_word_list(tokens)
    [word_index_dict, index_word_dict] = get_dict(word_set)
    with open(self.oracle_file, 'w') as outfile:
        # encode the tokens and write them to the oracle file
        outfile.write(text_to_code(tokens, word_index_dict, self.sequence_length))
    return word_index_dict, index_word_dict
def init_real_trainng(self, data_loc=None):
    from utils.text_process import text_precess, text_to_code
    from utils.text_process import get_tokenlized, get_word_list, get_dict
    if data_loc is None:
        data_loc = 'data/image_coco.txt'
    self.sequence_length, self.vocab_size = text_precess(data_loc)
    generator = Generator(num_vocabulary=self.vocab_size, batch_size=self.batch_size,
                          emb_dim=self.emb_dim, hidden_dim=self.hidden_dim,
                          sequence_length=self.sequence_length, start_token=self.start_token)
    self.set_generator(generator)
    discriminator = Discriminator(sequence_length=self.sequence_length, num_classes=2,
                                  vocab_size=self.vocab_size, emd_dim=self.emb_dim,
                                  filter_sizes=self.filter_size, num_filters=self.num_filters,
                                  l2_reg_lambda=self.l2_reg_lambda, splited_steps=self.splited_steps)
    self.set_discriminator(discriminator)
    gen_dataloader = DataLoader(batch_size=self.batch_size, seq_length=self.sequence_length)
    oracle_dataloader = None
    dis_dataloader = DisDataloader(batch_size=self.batch_size, seq_length=self.sequence_length)
    self.set_data_loader(gen_loader=gen_dataloader, dis_loader=dis_dataloader,
                         oracle_loader=oracle_dataloader)
    tokens = get_tokenlized(data_loc)
    with open(self.oracle_file, 'w') as outfile:
        outfile.write(text_to_code(tokens, self.wi_dict, self.sequence_length))
def main():
    args = parser.parse_args()
    pp.pprint(vars(args))
    config = vars(args)

    # train with different datasets
    if args.dataset == 'oracle':
        oracle_model = OracleLstm(num_vocabulary=args.vocab_size, batch_size=args.batch_size,
                                  emb_dim=args.gen_emb_dim, hidden_dim=args.hidden_dim,
                                  sequence_length=args.seq_len, start_token=args.start_token)
        oracle_loader = OracleDataLoader(args.batch_size, args.seq_len)
        gen_loader = OracleDataLoader(args.batch_size, args.seq_len)

        generator = models.get_generator(args.g_architecture, vocab_size=args.vocab_size,
                                         batch_size=args.batch_size, seq_len=args.seq_len,
                                         gen_emb_dim=args.gen_emb_dim, mem_slots=args.mem_slots,
                                         head_size=args.head_size, num_heads=args.num_heads,
                                         hidden_dim=args.hidden_dim, start_token=args.start_token)
        discriminator = models.get_discriminator(args.d_architecture, batch_size=args.batch_size,
                                                 seq_len=args.seq_len, vocab_size=args.vocab_size,
                                                 dis_emb_dim=args.dis_emb_dim, num_rep=args.num_rep,
                                                 sn=args.sn)
        oracle_train(generator, discriminator, oracle_model, oracle_loader, gen_loader, config)

    elif args.dataset in ['image_coco', 'emnlp_news']:
        # custom dataset selected
        data_file = resources_path(args.data_dir, '{}.txt'.format(args.dataset))
        sample_dir = resources_path(config['sample_dir'])
        oracle_file = os.path.join(sample_dir, 'oracle_{}.txt'.format(args.dataset))
        data_dir = resources_path(config['data_dir'])
        if args.dataset == 'image_coco':
            test_file = os.path.join(data_dir, 'testdata/test_coco.txt')
        elif args.dataset == 'emnlp_news':
            test_file = os.path.join(data_dir, 'testdata/test_emnlp.txt')
        else:
            raise NotImplementedError('Unknown dataset!')

        if args.dataset == 'emnlp_news':
            data_file, lda_file = create_subsample_data_file(data_file)
        else:
            lda_file = data_file

        seq_len, vocab_size, word_index_dict, index_word_dict = text_precess(data_file, test_file,
                                                                             oracle_file=oracle_file)
        config['seq_len'] = seq_len
        config['vocab_size'] = vocab_size
        print('seq_len: %d, vocab_size: %d' % (seq_len, vocab_size))
        config['topic_loss_weight'] = args.topic_loss_weight

        if config['LSTM']:
            if config['topic']:
                topic_number = config['topic_number']
                oracle_loader = RealDataTopicLoader(args.batch_size, args.seq_len)
                oracle_loader.set_dataset(args.dataset)
                oracle_loader.set_files(data_file, lda_file)
                oracle_loader.topic_num = topic_number
                oracle_loader.set_dictionaries(word_index_dict, index_word_dict)

                generator = models.get_generator(args.g_architecture, vocab_size=vocab_size,
                                                 batch_size=args.batch_size, seq_len=seq_len,
                                                 gen_emb_dim=args.gen_emb_dim, mem_slots=args.mem_slots,
                                                 head_size=args.head_size, num_heads=args.num_heads,
                                                 hidden_dim=args.hidden_dim, start_token=args.start_token,
                                                 TopicInMemory=args.topic_in_memory, NoTopic=args.no_topic)

                from real.real_gan.real_topic_train_NoDiscr import real_topic_train_NoDiscr
                real_topic_train_NoDiscr(generator, oracle_loader, config, args)
            else:
                generator = models.get_generator(args.g_architecture, vocab_size=vocab_size,
                                                 batch_size=args.batch_size, seq_len=seq_len,
                                                 gen_emb_dim=args.gen_emb_dim, mem_slots=args.mem_slots,
                                                 head_size=args.head_size, num_heads=args.num_heads,
                                                 hidden_dim=args.hidden_dim, start_token=args.start_token)
                oracle_loader = RealDataLoader(args.batch_size, args.seq_len)
                oracle_loader.set_dictionaries(word_index_dict, index_word_dict)
                oracle_loader.set_dataset(args.dataset)
                oracle_loader.set_files(data_file, lda_file)
                oracle_loader.topic_num = config['topic_number']

                from real.real_gan.real_train_NoDiscr import real_train_NoDiscr
                real_train_NoDiscr(generator, oracle_loader, config, args)
        else:
            if config['topic']:
                topic_number = config['topic_number']
                oracle_loader = RealDataTopicLoader(args.batch_size, args.seq_len)
                oracle_loader.set_dataset(args.dataset)
                oracle_loader.set_files(data_file, lda_file)
                oracle_loader.topic_num = topic_number
                oracle_loader.set_dictionaries(word_index_dict, index_word_dict)

                generator = models.get_generator(args.g_architecture, vocab_size=vocab_size,
                                                 batch_size=args.batch_size, seq_len=seq_len,
                                                 gen_emb_dim=args.gen_emb_dim, mem_slots=args.mem_slots,
                                                 head_size=args.head_size, num_heads=args.num_heads,
                                                 hidden_dim=args.hidden_dim, start_token=args.start_token,
                                                 TopicInMemory=args.topic_in_memory, NoTopic=args.no_topic)
                discriminator = models.get_discriminator(args.d_architecture, batch_size=args.batch_size,
                                                         seq_len=seq_len, vocab_size=vocab_size,
                                                         dis_emb_dim=args.dis_emb_dim, num_rep=args.num_rep,
                                                         sn=args.sn)
                if not args.no_topic:
                    topic_discriminator = models.get_topic_discriminator(args.topic_architecture,
                                                                         batch_size=args.batch_size,
                                                                         seq_len=seq_len, vocab_size=vocab_size,
                                                                         dis_emb_dim=args.dis_emb_dim,
                                                                         num_rep=args.num_rep, sn=args.sn,
                                                                         discriminator=discriminator)
                else:
                    topic_discriminator = None
                from real.real_gan.real_topic_train import real_topic_train
                real_topic_train(generator, discriminator, topic_discriminator, oracle_loader, config, args)
            else:
                generator = models.get_generator(args.g_architecture, vocab_size=vocab_size,
                                                 batch_size=args.batch_size, seq_len=seq_len,
                                                 gen_emb_dim=args.gen_emb_dim, mem_slots=args.mem_slots,
                                                 head_size=args.head_size, num_heads=args.num_heads,
                                                 hidden_dim=args.hidden_dim, start_token=args.start_token)
                discriminator = models.get_discriminator(args.d_architecture, batch_size=args.batch_size,
                                                         seq_len=seq_len, vocab_size=vocab_size,
                                                         dis_emb_dim=args.dis_emb_dim, num_rep=args.num_rep,
                                                         sn=args.sn)
                oracle_loader = RealDataLoader(args.batch_size, args.seq_len)
                from real.real_gan.real_train import real_train
                real_train(generator, discriminator, oracle_loader, config, args)

    elif args.dataset in ['Amazon_Attribute']:
        # custom dataset selected
        data_dir = resources_path(config['data_dir'], "Amazon_Attribute")
        sample_dir = resources_path(config['sample_dir'])
        oracle_file = os.path.join(sample_dir, 'oracle_{}.txt'.format(args.dataset))
        train_file = os.path.join(data_dir, 'train.csv')
        dev_file = os.path.join(data_dir, 'dev.csv')
        test_file = os.path.join(data_dir, 'test.csv')
        # create_tokens_files(data_files=[train_file, dev_file, test_file])
        config_file = load_json(os.path.join(data_dir, 'config.json'))
        config = {**config, **config_file}  # merge dictionaries

        from real.real_gan.loaders.amazon_loader import RealDataAmazonLoader
        oracle_loader = RealDataAmazonLoader(args.batch_size, args.seq_len)
        oracle_loader.create_batches(data_file=[train_file, dev_file, test_file])
        oracle_loader.model_index_word_dict = load_json(join(data_dir, 'index_word_dict.json'))
        oracle_loader.model_word_index_dict = load_json(join(data_dir, 'word_index_dict.json'))

        generator = models.get_generator("amazon_attribute", vocab_size=config['vocabulary_size'],
                                         batch_size=args.batch_size, seq_len=config['seq_len'],
                                         gen_emb_dim=args.gen_emb_dim, mem_slots=args.mem_slots,
                                         head_size=args.head_size, num_heads=args.num_heads,
                                         hidden_dim=args.hidden_dim, start_token=args.start_token,
                                         user_num=config['user_num'], product_num=config['product_num'],
                                         rating_num=5)
        discriminator = models.get_discriminator("amazon_attribute", batch_size=args.batch_size,
                                                 seq_len=config['seq_len'], vocab_size=config['vocabulary_size'],
                                                 dis_emb_dim=args.dis_emb_dim, num_rep=args.num_rep,
                                                 sn=args.sn)

        from real.real_gan.amazon_attribute_train import amazon_attribute_train
        amazon_attribute_train(generator, discriminator, oracle_loader, config, args)

    elif args.dataset in ['CustomerReviews', 'imdb']:
        from real.real_gan.loaders.custom_reviews_loader import RealDataCustomerReviewsLoader
        from real.real_gan.customer_reviews_train import customer_reviews_train
        # custom dataset selected
        if args.dataset == 'CustomerReviews':
            data_dir = resources_path(config['data_dir'], "MovieReviews", "cr")
        elif args.dataset == 'imdb':
            data_dir = resources_path(config['data_dir'], "MovieReviews", 'movie', 'sstb')
        else:
            raise ValueError
        sample_dir = resources_path(config['sample_dir'])
        oracle_file = os.path.join(sample_dir, 'oracle_{}.txt'.format(args.dataset))
        train_file = os.path.join(data_dir, 'train.csv')
        # create_tokens_files(data_files=[train_file, dev_file, test_file])
        config_file = load_json(os.path.join(data_dir, 'config.json'))
        config = {**config, **config_file}  # merge dictionaries

        oracle_loader = RealDataCustomerReviewsLoader(args.batch_size, args.seq_len)
        oracle_loader.create_batches(data_file=[train_file])
        oracle_loader.model_index_word_dict = load_json(join(data_dir, 'index_word_dict.json'))
        oracle_loader.model_word_index_dict = load_json(join(data_dir, 'word_index_dict.json'))

        generator = models.get_generator("CustomerReviews", vocab_size=config['vocabulary_size'],
                                         batch_size=args.batch_size, start_token=args.start_token,
                                         seq_len=config['seq_len'], gen_emb_dim=args.gen_emb_dim,
                                         mem_slots=args.mem_slots, head_size=args.head_size,
                                         num_heads=args.num_heads, hidden_dim=args.hidden_dim,
                                         sentiment_num=config['sentiment_num'])
        discriminator_positive = models.get_discriminator("CustomerReviews", scope="discriminator_positive",
                                                          batch_size=args.batch_size, seq_len=config['seq_len'],
                                                          vocab_size=config['vocabulary_size'],
                                                          dis_emb_dim=args.dis_emb_dim, num_rep=args.num_rep,
                                                          sn=args.sn)
        discriminator_negative = models.get_discriminator("CustomerReviews", scope="discriminator_negative",
                                                          batch_size=args.batch_size, seq_len=config['seq_len'],
                                                          vocab_size=config['vocabulary_size'],
                                                          dis_emb_dim=args.dis_emb_dim, num_rep=args.num_rep,
                                                          sn=args.sn)
        customer_reviews_train(generator, discriminator_positive, discriminator_negative,
                               oracle_loader, config, args)
    else:
        raise NotImplementedError('{}: unknown dataset!'.format(args.dataset))

    print("RUN FINISHED")
    return
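# Each of these main() entry points assumes a module-level argparse `parser` built elsewhere in
# the script. A minimal sketch of the usual invocation guard (not shown in this excerpt) would be:
if __name__ == '__main__':
    main()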
def main():
    print('program start')
    from utils.text_process import text_precess, text_to_code  # TODO: move?
    from utils.text_process import get_tokenlized, get_word_list, get_dict

    random.seed(SEED)
    np.random.seed(SEED)
    assert START_TOKEN == 0

    # JJ added
    SEQ_LENGTH, vocab_size = text_precess(true_file, val_file)

    gen_data_loader = Gen_Data_loader(BATCH_SIZE, SEQ_LENGTH)
    gan_data_loader = Gen_Data_loader(BATCH_SIZE, SEQ_LENGTH)
    val_data_loader = Gen_Data_loader(BATCH_SIZE, SEQ_LENGTH)
    likelihood_data_loader = Gen_Data_loader(BATCH_SIZE, SEQ_LENGTH)  # For testing
    # vocab_size = 5000  # JJ added

    # Create training file and dicts
    tokens = get_tokenlized(true_file)
    val_tokens = get_tokenlized(val_file)
    word_set = get_word_list(tokens + val_tokens)
    [word_index_dict, index_word_dict] = get_dict(word_set)
    with open(oracle_file, 'w') as outfile:
        outfile.write(text_to_code(tokens, word_index_dict, SEQ_LENGTH))
    with open(val_oracle_file, 'w') as outfile:
        outfile.write(text_to_code(val_tokens, word_index_dict, SEQ_LENGTH))

    generator = Generator(vocab_size, BATCH_SIZE, EMB_DIM, HIDDEN_DIM, SEQ_LENGTH, START_TOKEN)
    # target_params = pickle.load(open('save/target_params_py3.pkl', 'rb'))
    # target_lstm = TARGET_LSTM(vocab_size, BATCH_SIZE, 32, 32, SEQ_LENGTH, START_TOKEN, target_params)  # The oracle model
    mediator = Mediator(vocab_size, BATCH_SIZE, EMB_DIM * 2, HIDDEN_DIM * 2, SEQ_LENGTH, START_TOKEN,
                        name="mediator", dropout_rate=M_DROPOUT_RATE, learning_rate=3e-3,
                        with_professor_forcing=False)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    sess.run(tf.global_variables_initializer())

    # First, use the oracle model to provide the positive examples, which are sampled from the oracle data distribution
    # generate_samples(sess, target_lstm, BATCH_SIZE, generated_num, positive_file)
    gen_data_loader.create_batches(oracle_file)  # positive_file
    gan_data_loader.create_batches(oracle_file)  # positive_file
    # generate_samples(sess, target_lstm, BATCH_SIZE, generated_num, eval_file)
    val_data_loader.create_batches(val_oracle_file)  # eval_file

    log = open('save/experiment-log.txt', 'w')
    log_nll = open('save/experiment-log-nll.txt', 'w')
    # log_jsd = open('save/experiment-log-jsd.txt', 'w')

    # pre-train generator (default 0 epochs)(not recommended)
    print('Start pre-training...')
    log.write('pre-training...\n')
    saver = tf.train.Saver(tf.global_variables())
    if RESTORE:
        saver.restore(sess, "saved_model/CoT")
    for epoch in range(PRE_EPOCH_NUM):
        loss = mle_epoch(sess, generator, gen_data_loader)
        if epoch % 1 == 0:
            generate_samples(sess, generator, BATCH_SIZE, generated_num, negative_file)
            likelihood_data_loader.create_batches(negative_file)
            # NOTE: target_lstm is commented out above, so this oracle-NLL check would fail
            # if PRE_EPOCH_NUM > 0.
            test_loss = target_loss(sess, target_lstm, likelihood_data_loader)
            print('pre-train epoch ', epoch, 'nll_oracle ', test_loss)
            buffer = 'epoch:\t' + str(epoch) + '\tnll_oracle:\t' + str(test_loss) + '\n'
            log_nll.write(buffer)
        if epoch % 1 == 0:
            test_loss = target_loss(sess, generator, val_data_loader)
            print('pre-train epoch ', epoch, 'nll_test ', test_loss)
            buffer = 'epoch:\t' + str(epoch) + '\tnll_test:\t' + str(test_loss) + '\n'
            log_nll.write(buffer)

    print('#########################################################################')
    toc = time.time()  # JJ
    print('Start Cooperative Training...')
    for iter_idx in range(TOTAL_BATCH):
        print('iteration: ' + str(iter_idx) + '\ntime: ' + str(time.time() - toc))
        toc = time.time()
        # Train the generator for one step
        for it in range(1):
            samples = generator.generate(sess)
            rewards = mediator.get_reward(sess, samples)
            feed = {generator.x: samples, generator.rewards: rewards}
            _ = sess.run(generator.g_updates, feed_dict=feed)
            # JJ -> loss, _ = sess.run([generator.g_loss, generator.g_updates], feed_dict=feed)

        # Test  # JJ delete
        '''
        if iter_idx % 100 == 0 or iter_idx == TOTAL_BATCH - 1:
            generate_samples(sess, generator, BATCH_SIZE, generated_num, negative_file)
            likelihood_data_loader.create_batches(negative_file)
            test_loss = target_loss(sess, target_lstm, likelihood_data_loader)
            buffer = 'batch:\t' + str(iter_idx) + '\tnll_oracle:\t' + str(test_loss) + '\n'
            print('batch: ', iter_idx, 'nll_oracle: ', test_loss)
            log_nll.write(buffer)
        '''
        if iter_idx % gen_data_loader.num_batch == 0:  # epochs instead of batches
            # if iter_idx % 100 == 0:
            test_loss = target_loss(sess, generator, val_data_loader)
            print('epoch:\t', iter_idx // gen_data_loader.num_batch, 'nll_test ', test_loss)
            buffer = 'epoch:\t' + str(iter_idx // gen_data_loader.num_batch) + '\tnll_test:\t' + str(test_loss) + '\n'
            # print('batch:\t', iter_idx, 'nll_test ', test_loss)
            # buffer = 'batch:\t' + str(iter_idx) + '\tnll_test:\t' + str(test_loss) + '\n'
            log_nll.write(buffer)
            saver.save(sess, "saved_model/CoT")

        # Train the mediator
        for _ in range(1):
            bnll_ = []
            """
            d_loss_ = []
            for it in range(3):
                feed = {
                    mediator.x0: gan_data_loader.next_batch(),
                    mediator.x1: generator.generate(sess)
                }
                d_loss, _ = sess.run([mediator.d_loss, mediator.d_update], feed)
                d_loss_.append(d_loss)
            """
            for it in range(1):
                feed = {
                    mediator.x0: gen_data_loader.next_batch(),
                    mediator.x1: generator.generate(sess)
                }
                bnll = sess.run(mediator.likelihood_loss, feed)
                bnll_.append(bnll)
                sess.run(mediator.dropout_on)
                _ = sess.run(mediator.likelihood_updates, feed)
                sess.run(mediator.dropout_off)
            if iter_idx % 10 == 0:
                bnll = np.mean(bnll_)
                print("mediator cooptrain iter#%d, balanced_nll %f" % (iter_idx, bnll))
                log.write("%d\t%f\n" % (iter_idx, bnll))
        # if iter_idx % gen_data_loader.num_batch == 0:
        #     jsd = jsd_calculate(sess, generator, target_lstm)
        #     print('cooptrain epoch#', iter_idx // gen_data_loader.num_batch, 'jsd ', jsd)
        #     log_jsd.write("%d\t%f\n" % (iter_idx // gen_data_loader.num_batch, jsd))
        #     saver.save(sess, "saved_model/CoT")

    log.close()
    log_nll.close()
def init_real_trainng(self, data_loc=None):
    from utils.text_process import text_precess, text_to_code
    from utils.text_process import get_tokenlized, get_word_list, get_dict
    if data_loc is None:
        data_loc = 'data/image_coco.txt'
    # Running this function directly from the console prints (38, 4682); end_token is 4681.
    # start_token is 0: the oracle file holds the encoded text and contains no start_token,
    # but at run time the initial input is the embedding that index 0 maps to.
    # Index 0 actually corresponds to a real word rather than a dedicated start token,
    # but using it uniformly for initialization also works.
    # return sequence_len + 1, len(word_index_dict) + 1
    self.sequence_length, self.vocab_size = text_precess(data_loc)
    end_token = self.vocab_size - 1
    # self.sequence_length += 1
    # goal_out_size = sum(self.num_filters)
    goal_out_size = self.emb_dim
    discriminator = Discriminator(sequence_length=self.sequence_length, num_classes=2,
                                  vocab_size=self.vocab_size, dis_emb_dim=self.dis_embedding_dim,
                                  filter_sizes=self.filter_size, num_filters=self.num_filters,
                                  batch_size=self.batch_size, hidden_dim=self.hidden_dim,
                                  start_token=self.start_token, goal_out_size=goal_out_size,
                                  step_size=self.step_size,
                                  l2_reg_lambda=self.l2_reg_lambda)  # add
    self.set_discriminator(discriminator)
    # reward_co=self.reward_co
    att_model = Att_dis(vocab_size=self.vocab_size, emd_dim=self.emb_dim,
                        sequence_length=self.sequence_length, batch_size=self.batch_size,
                        sess=self.sess, end_token=end_token)
    self.att_model = att_model
    generator = Generator(num_classes=2, num_vocabulary=self.vocab_size, batch_size=self.batch_size,
                          emb_dim=self.emb_dim, dis_emb_dim=self.dis_embedding_dim,
                          goal_size=self.goal_size, hidden_dim=self.hidden_dim,
                          sequence_length=self.sequence_length, filter_sizes=self.filter_size,
                          start_token=self.start_token, num_filters=self.num_filters,
                          goal_out_size=goal_out_size, D_model=discriminator, att_model=att_model,
                          step_size=self.step_size, sess=self.sess, end_token=end_token)
    self.set_generator(generator)
    gen_dataloader = DataLoader(batch_size=self.batch_size, seq_length=self.sequence_length,
                                end_token=end_token)
    oracle_dataloader = None
    dis_dataloader = DisDataloader(batch_size=self.batch_size, seq_length=self.sequence_length)
    self.set_data_loader(gen_loader=gen_dataloader, dis_loader=dis_dataloader,
                         oracle_loader=oracle_dataloader)
    tokens = get_tokenlized(data_loc)
    word_set = get_word_list(tokens)
    [word_index_dict, index_word_dict] = get_dict(word_set)
    with open(self.oracle_file, 'w') as outfile:
        outfile.write(text_to_code(tokens, word_index_dict, self.sequence_length))
    return word_index_dict, index_word_dict
def main():
    args = parser.parse_args()
    # pp.pprint(vars(args))
    config = vars(args)

    # train with different datasets
    if args.dataset == 'oracle':
        oracle_model = OracleLstm(num_vocabulary=args.vocab_size, batch_size=args.batch_size,
                                  emb_dim=args.gen_emb_dim, hidden_dim=args.hidden_dim,
                                  sequence_length=args.seq_len, start_token=args.start_token)
        oracle_loader = OracleDataLoader(args.batch_size, args.seq_len)
        gen_loader = OracleDataLoader(args.batch_size, args.seq_len)

        generator = models.get_generator(args.g_architecture, vocab_size=args.vocab_size,
                                         batch_size=args.batch_size, seq_len=args.seq_len,
                                         gen_emb_dim=args.gen_emb_dim, mem_slots=args.mem_slots,
                                         head_size=args.head_size, num_heads=args.num_heads,
                                         hidden_dim=args.hidden_dim, start_token=args.start_token)
        discriminator = models.get_discriminator(args.d_architecture, batch_size=args.batch_size,
                                                 seq_len=args.seq_len, vocab_size=args.vocab_size,
                                                 dis_emb_dim=args.dis_emb_dim, num_rep=args.num_rep,
                                                 sn=args.sn)
        oracle_train(generator, discriminator, oracle_model, oracle_loader, gen_loader, config)

    elif args.dataset in ['image_coco', 'emnlp_news', 'emnlp_news_small']:
        data_file = os.path.join(args.data_dir, '{}.txt'.format(args.dataset))
        seq_len, vocab_size, word_index_dict, index_word_dict = text_precess(data_file)
        config['seq_len'] = seq_len
        config['vocab_size'] = vocab_size
        # print('seq_len: %d, vocab_size: %d' % (seq_len, vocab_size))

        oracle_loader = RealDataLoader(args.batch_size, args.seq_len)

        generator = models.get_generator(args.g_architecture, vocab_size=vocab_size,
                                         batch_size=args.batch_size, seq_len=seq_len,
                                         gen_emb_dim=args.gen_emb_dim, mem_slots=args.mem_slots,
                                         head_size=args.head_size, num_heads=args.num_heads,
                                         hidden_dim=args.hidden_dim, start_token=args.start_token)
        discriminator = models.get_discriminator(args.d_architecture, batch_size=args.batch_size,
                                                 seq_len=seq_len, vocab_size=vocab_size,
                                                 dis_emb_dim=args.dis_emb_dim, num_rep=args.num_rep,
                                                 sn=args.sn)
        # print("gen params = ", count_params(generator.trainable_variables))
        # print("disc params = ", count_params(discriminator.trainable_variables))
        # sys.stdout.flush()

        load_model = False
        if config['load_saved_model'] != "":
            log_dir_path = os.path.dirname(config['load_saved_model'])
            config['log_dir'] = log_dir_path
            config['sample_dir'] = os.path.join(os.path.split(log_dir_path)[0], 'samples')
            index_word_dict = load_index_to_word_dict(os.path.join(config['log_dir'], "index_to_word_dict.json"))
            word_index_dict = {v: k for k, v in index_word_dict.items()}
            load_model = True
        else:
            if not os.path.exists(config['log_dir']):
                os.makedirs(config['log_dir'])
            json.dump(index_word_dict,
                      open(os.path.join(config['log_dir'], "index_to_word_dict.json"), 'w'))
            json.dump(word_index_dict,
                      open(os.path.join(config['log_dir'], "word_to_index_dict.json"), 'w'))

        pp.pprint(config)
        print('seq_len: %d, vocab_size: %d' % (seq_len, vocab_size))
        sys.stdout.flush()

        real_train(generator, discriminator, oracle_loader, config,
                   word_index_dict, index_word_dict, load_model=load_model)

        if args.dataset == "emnlp_news" or args.dataset == "emnlp_news_small":
            call(["python", 'bleu_post_training_emnlp.py',
                  os.path.join(os.path.split(config['log_dir'])[0], 'samples'), 'na'], cwd=".")
        elif args.dataset == "image_coco":
            call(["python", 'bleu_post_training.py',
                  os.path.join(os.path.split(config['log_dir'])[0], 'samples'), 'na'], cwd=".")

    elif args.dataset in ['ace0_small']:
        # data_file = os.path.join(args.data_dir, '{}.txt'.format(args.dataset))
        # seq_len, vocab_size, word_index_dict, index_word_dict = text_precess(data_file)
        seq_len = config['seq_len']
        vocab_size = config['vocab_size']
        # print('seq_len: %d, vocab_size: %d' % (seq_len, vocab_size))
        # oracle_loader = RealDataLoader(args.batch_size, args.seq_len)

        generator = models.get_generator(args.g_architecture, vocab_size=config['vocab_size'],
                                         batch_size=args.batch_size, seq_len=config['seq_len'],
                                         gen_emb_dim=args.gen_emb_dim, mem_slots=args.mem_slots,
                                         head_size=args.head_size, num_heads=args.num_heads,
                                         hidden_dim=args.hidden_dim, start_token=args.start_token)
        discriminator = models.get_discriminator(args.d_architecture, batch_size=args.batch_size,
                                                 seq_len=config['seq_len'], vocab_size=config['vocab_size'],
                                                 dis_emb_dim=args.dis_emb_dim, num_rep=args.num_rep,
                                                 sn=args.sn)
        # print("gen params = ", count_params(generator.trainable_variables))
        # print("disc params = ", count_params(discriminator.trainable_variables))
        # sys.stdout.flush()

        load_model = False
        if config['load_saved_model'] != "":
            log_dir_path = os.path.dirname(config['load_saved_model'])
            config['log_dir'] = log_dir_path
            config['sample_dir'] = os.path.join(os.path.split(log_dir_path)[0], 'samples')
            index_word_dict = load_index_to_word_dict(os.path.join(config['log_dir'], "index_to_word_dict.json"))
            word_index_dict = {v: k for k, v in index_word_dict.items()}
            load_model = True
        else:
            if not os.path.exists(config['log_dir']):
                os.makedirs(config['log_dir'])
            # json.dump(index_word_dict, open(os.path.join(config['log_dir'], "index_to_word_dict.json"), 'w'))
            # json.dump(word_index_dict, open(os.path.join(config['log_dir'], "word_to_index_dict.json"), 'w'))

        pp.pprint(config)
        print('seq_len: %d, vocab_size: %d' % (seq_len, vocab_size))
        sys.stdout.flush()

        real_train_traj(generator, discriminator, None, config, None, None, load_model=load_model)

        # if args.dataset == "emnlp_news" or args.dataset == "emnlp_news_small":
        #     call(["python", 'bleu_post_training_emnlp.py', os.path.join(os.path.split(config['log_dir'])[0], 'samples'), 'na'], cwd=".")
        # elif args.dataset == "image_coco":
        #     call(["python", 'bleu_post_training.py', os.path.join(os.path.split(config['log_dir'])[0], 'samples'), 'na'], cwd=".")
    else:
        raise NotImplementedError('{}: unknown dataset!'.format(args.dataset))
def init_real_trainng(self, data_loc=None):
    from utils.text_process import text_precess, text_to_code
    from utils.text_process import get_tokenlized, get_word_list, get_dict
    if data_loc is None:
        data_loc = 'data/image_coco.txt'
    self.sequence_length, self.vocab_size = text_precess(data_loc)
    goal_out_size = sum(self.num_filters)
    discriminator = Discriminator(sequence_length=self.sequence_length, num_classes=2,
                                  vocab_size=self.vocab_size, dis_emb_dim=self.dis_embedding_dim,
                                  filter_sizes=self.filter_size, num_filters=self.num_filters,
                                  batch_size=self.batch_size, hidden_dim=self.hidden_dim,
                                  start_token=self.start_token, goal_out_size=goal_out_size,
                                  step_size=4, l2_reg_lambda=self.l2_reg_lambda)
    self.set_discriminator(discriminator)
    generator = Generator(num_classes=2, num_vocabulary=self.vocab_size, batch_size=self.batch_size,
                          emb_dim=self.emb_dim, dis_emb_dim=self.dis_embedding_dim,
                          goal_size=self.goal_size, hidden_dim=self.hidden_dim,
                          sequence_length=self.sequence_length, filter_sizes=self.filter_size,
                          start_token=self.start_token, num_filters=self.num_filters,
                          goal_out_size=goal_out_size, D_model=discriminator, step_size=4)
    self.set_generator(generator)
    gen_dataloader = DataLoader(batch_size=self.batch_size, seq_length=self.sequence_length)
    oracle_dataloader = None
    dis_dataloader = DisDataloader(batch_size=self.batch_size, seq_length=self.sequence_length)
    self.set_data_loader(gen_loader=gen_dataloader, dis_loader=dis_dataloader,
                         oracle_loader=oracle_dataloader)
    tokens = get_tokenlized(data_loc)
    word_set = get_word_list(tokens)
    # [word_index_dict, index_word_dict] = get_dict(word_set)  # original
    # Hard-coded character-level dictionaries used in place of the one built from the data:
    [word_index_dict, index_word_dict] = [
        {'b': '0', 'a': '1', 'r': '2', 'n': '3', 'd': '4', 'c': '5', 'q': '6',
         'e': '7', 'g': '8', 'h': '9', 'i': '10', 'l': '11', 'k': '12', 'm': '13',
         'f': '14', 'p': '15', 's': '16', 't': '17', 'w': '18', 'y': '19', 'v': '20'},
        {'0': 'b', '1': 'a', '2': 'r', '3': 'n', '4': 'd', '5': 'c', '6': 'q',
         '7': 'e', '8': 'g', '9': 'h', '10': 'i', '11': 'l', '12': 'k', '13': 'm',
         '14': 'f', '15': 'p', '16': 's', '17': 't', '18': 'w', '19': 'y', '20': 'v'}
    ]
    with open(self.oracle_file, 'w') as outfile:
        outfile.write(text_to_code(tokens, word_index_dict, self.sequence_length))
    return word_index_dict, index_word_dict
def main():
    print('program start')
    from utils.text_process import text_precess, text_to_code  # TODO: move?
    from utils.text_process import get_tokenlized, get_word_list, get_dict

    random.seed(SEED)
    np.random.seed(SEED)
    assert START_TOKEN == 0

    SEQ_LENGTH, vocab_size = text_precess(true_file, val_file)

    gen_data_loader = Gen_Data_loader(BATCH_SIZE, SEQ_LENGTH)
    val_data_loader = Gen_Data_loader(BATCH_SIZE, SEQ_LENGTH)

    # Create training file and dicts
    tokens = get_tokenlized(true_file)
    val_tokens = get_tokenlized(val_file)
    word_set = get_word_list(tokens + val_tokens)
    [word_index_dict, index_word_dict] = get_dict(word_set)
    with open(oracle_file, 'w') as outfile:
        outfile.write(text_to_code(tokens, word_index_dict, SEQ_LENGTH))
    with open(val_oracle_file, 'w') as outfile:
        outfile.write(text_to_code(val_tokens, word_index_dict, SEQ_LENGTH))

    generator = Generator(vocab_size, BATCH_SIZE, EMB_DIM, HIDDEN_DIM, SEQ_LENGTH, START_TOKEN)
    # target_params = pickle.load(open('save/target_params_py3.pkl', 'rb'))
    # target_lstm = TARGET_LSTM(vocab_size, BATCH_SIZE, 32, 32, SEQ_LENGTH, START_TOKEN, target_params)  # The oracle model
    # replace target lstm with true data
    mediator = Generator(vocab_size, BATCH_SIZE * 2, EMB_DIM * 2, HIDDEN_DIM * 2, SEQ_LENGTH, START_TOKEN,
                         name="mediator", dropout_rate=M_DROPOUT_RATE)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    sess.run(tf.global_variables_initializer())

    gen_data_loader.create_batches(oracle_file)
    val_data_loader.create_batches(val_oracle_file)

    log = open('save/experiment-log.txt', 'w')
    log_nll = open('save/experiment-log-nll.txt', 'w')

    # pre-train generator (default 0 epochs)(not recommended)
    print('Start pre-training...')
    log.write('pre-training...\n')
    for epoch in range(PRE_EPOCH_NUM):
        loss = mle_epoch(sess, generator, gen_data_loader)
        if epoch % 5 == 0:
            generate_samples(sess, generator, BATCH_SIZE, generated_num, generator_file)
            # get_real_test_file(index_word_dict, generator_file, test_file)  # only needed in debugging
            test_loss = target_loss(sess, generator, val_data_loader)
            print('pre-train epoch ', epoch, 'nll_test ', test_loss)
            buffer = 'epoch:\t' + str(epoch) + '\tnll_test:\t' + str(test_loss) + '\n'
            log_nll.write(buffer)

    print('#########################################################################')
    toc = time.time()
    print('Start Cooperative Training...')
    for iter_idx in range(TOTAL_BATCH):
        print('iteration: ' + str(iter_idx) + '\ntime: ' + str(time.time() - toc))
        toc = time.time()
        # Train the generator for one step
        for it in range(1):
            samples = generator.generate(sess)
            rewards = mediator.get_reward(sess, np.concatenate([samples, samples], axis=0))
            feed = {generator.x: samples, generator.rewards: rewards[0:BATCH_SIZE]}
            loss, _ = sess.run([generator.g_loss, generator.g_updates], feed_dict=feed)

        # Test, removed oracle test
        if iter_idx % gen_data_loader.num_batch == 0:  # epochs instead of batches
            test_loss = target_loss(sess, generator, val_data_loader)
            print('epoch:\t', iter_idx // gen_data_loader.num_batch, 'nll_test ', test_loss)
            buffer = 'epoch:\t' + str(iter_idx // gen_data_loader.num_batch) + '\tnll_test:\t' + str(test_loss) + '\n'
            log_nll.write(buffer)
        if iter_idx == TOTAL_BATCH - 1:
            print('generating samples')
            generate_samples(sess, generator, BATCH_SIZE, generated_num, generator_file)
            get_real_test_file(index_word_dict, generator_file, test_file)

        # Train the mediator
        for _ in range(1):
            print('training mediator...')
            bnll_ = []
            collected_x = []
            ratio = 2
            for it in range(ratio):
                if it % 2 == 0:
                    x_batch = gen_data_loader.next_batch()
                else:
                    x_batch = generator.generate(sess)
                collected_x.append(x_batch)
            collected_x = np.reshape(collected_x, [-1, SEQ_LENGTH])
            np.random.shuffle(collected_x)
            collected_x = np.reshape(collected_x, [-1, BATCH_SIZE * 2, SEQ_LENGTH])
            for it in range(1):
                feed = {
                    mediator.x: collected_x[it],
                }
                print('running bnll sess')
                bnll = sess.run(mediator.likelihood_loss, feed)
                bnll_.append(bnll)
                print('running mediator and updating')
                sess.run(mediator.dropout_on)
                _ = sess.run(mediator.likelihood_updates, feed)
                sess.run(mediator.dropout_off)
            if iter_idx % 50 == 0:
                bnll = np.mean(bnll_)
                print("mediator cooptrain iter#%d, balanced_nll %f" % (iter_idx, bnll))
                log.write("%d\t%f\n" % (iter_idx, bnll))

    log.close()
    log_nll.close()