def text_precess(train_text_loc, test_text_loc=None, oracle_file=None) -> Tuple[int, int, dict, dict]:
    """
    Get the sequence length, the dictionary size, the word->index dict and the index->word dict.
    The results are cached on disk because recomputing them on large corpora is slow.

    :param train_text_loc: train file
    :param test_text_loc: test file
    :param oracle_file: if given and not yet present, the encoded training corpus is written here
    :return: length of the longest sentence, dict size (number of distinct words plus one),
             word->index dict, index->word dict
    """
    json_dict_wi_file = train_text_loc[:-4] + "_json_dict_wi.txt"
    json_dict_iw_file = train_text_loc[:-4] + "_json_dict_iw.txt"
    tokens_file = train_text_loc[:-4] + "_tokens.txt"
    try:
        # reuse cached dictionaries and tokens if they exist
        word_index_dict = load_json(json_dict_wi_file)
        index_word_dict = load_json(json_dict_iw_file)
        train_tokens, test_tokens = load_pickle(tokens_file)
    except FileNotFoundError:
        # compute dictionaries and tokens, then save them
        train_tokens = get_tokenized(train_text_loc)
        if test_text_loc is None:
            test_tokens = list()
        else:
            test_tokens = get_tokenized(test_text_loc)
        word_set = get_word_list(train_tokens + test_tokens)
        word_index_dict, index_word_dict = get_dict(word_set)
        write_json(json_dict_wi_file, word_index_dict)
        write_json(json_dict_iw_file, index_word_dict)
        write_pickle(tokens_file, train_tokens, test_tokens)

    if test_text_loc is None:
        sequence_len = len(max(train_tokens, key=len))
    else:
        sequence_len = max(len(max(train_tokens, key=len)), len(max(test_tokens, key=len)))

    if oracle_file:
        try:
            # the oracle file already exists: leave it untouched
            with open(oracle_file, 'r'):
                pass
        except FileNotFoundError:
            with open(oracle_file, 'w') as outfile:
                outfile.write(text_to_code(train_tokens,  # + test_tokens
                                           word_index_dict, sequence_len))

    return sequence_len, len(word_index_dict) + 1, word_index_dict, index_word_dict
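# Usage sketch (the paths below are illustrative, not repo defaults): a call such as
#   seq_len, vocab_size, word_index_dict, index_word_dict = text_precess(
#       'data/image_coco.txt', 'data/testdata/test_coco.txt',
#       oracle_file='samples/oracle_image_coco.txt')
# writes the two dictionary files and the pickled tokens next to the training file, so later
# calls on the same corpus skip tokenization and only recompute the sequence length.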
def get_sentences(self):
    generated_sentences = []
    data = load_json(self.json_file)
    for el in data['sentences']:
        generated_sentences.append(el['generated_sentence'])
    return generated_sentences
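# The metrics here all read the same JSON layout from disk; based on the fields accessed in
# these methods it looks roughly like this (values are illustrative):
# {
#   "sentences": [
#     {"generated_sentence": "a man riding a bike down a street",
#      "real_starting": "a man rides his bicycle down the street"},
#     ...
#   ]
# }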
def get_reference(self):
    # Tokenize the reference sentences from the test data file.
    reference = list()
    json_obj = load_json(self.test_data)
    for sentence in json_obj['sentences']:
        tokens = nltk.word_tokenize(sentence['generated_sentence'])
        reference.append(tokens)
    return reference[:self.sample_size]
def get_score(self):
    # Uniform n-gram weights, e.g. (0.5, 0.5) for bigram BLEU.
    ngram = self.gram
    bleu = list()
    weight = tuple(1. / ngram for _ in range(ngram))
    json_sentences = load_json(self.json_file)
    for ind, sentence in enumerate(json_sentences['sentences']):
        if ind >= self.sample_size:
            break
        generated, ground_truth = sentence['generated_sentence'], sentence['real_starting']
        bleu.append(calc_bleu(ground_truth, generated, weight))
    return sum(bleu) / len(bleu)
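# `calc_bleu` is a helper defined elsewhere in the repo. A minimal sketch of what such a helper
# could look like, under the assumption that it wraps NLTK's sentence-level BLEU with smoothing
# (the real helper may tokenize differently or expect pre-tokenized inputs):
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

def calc_bleu_sketch(reference, hypothesis, weight):
    # reference: list of token lists, hypothesis: token list, weight: uniform n-gram weights
    return sentence_bleu(reference, hypothesis, weight,
                         smoothing_function=SmoothingFunction().method1)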
def computeDistanceJaccard(self):
    jaccard_values = []
    data = load_json(self.json_file)
    for el in data['sentences']:
        generated_sentence = el['generated_sentence']
        values = []
        for real_sent in self.all_sentences:
            values.append(distJaccard(generated_sentence, real_sent))
        jaccard_values.append(1 - max(values))
    return np.mean(jaccard_values)
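# `distJaccard` is defined elsewhere in the repo. A minimal sketch under the assumption that it
# returns the Jaccard similarity between the word sets of the two sentences, so that
# 1 - max(values) above becomes the distance of a generated sentence to its closest real one:
def dist_jaccard_sketch(sentence_a, sentence_b):
    tokens_a, tokens_b = set(sentence_a.split()), set(sentence_b.split())
    if not tokens_a and not tokens_b:
        return 1.0
    return len(tokens_a & tokens_b) / len(tokens_a | tokens_b)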
def get_bleu(self):
    ngram = self.gram
    bleu = list()
    reference = self.get_reference()
    weight = tuple(1. / ngram for _ in range(ngram))
    json_obj = load_json(self.test_data)
    t = time.time()
    # for i, hypothesis in enumerate(json_obj['sentences']):
    #     hypothesis = hypothesis['generated_sentence']
    #     if i >= self.sample_size:
    #         break
    #     hypothesis = nltk.word_tokenize(hypothesis)
    #     bleu.append(calc_bleu(reference, hypothesis, weight))
    #     i += 1
    # return sum(bleu) / len(bleu)
    json_cropped = json_obj['sentences'][:self.sample_size]
    # we can swap out ProcessPoolExecutor for ThreadPoolExecutor
    with concurrent.futures.ProcessPoolExecutor(max_workers=10) as executor:
        for bleu_res in executor.map(procedure, json_cropped, repeat(reference), repeat(weight)):
            bleu.append(bleu_res)
    # print("Bleu executed in {}".format(time.time() - t))
    return sum(bleu) / len(bleu)
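# `procedure` is the worker mapped over the generated sentences; it is defined elsewhere in the
# repo. A plausible sketch, assuming it mirrors the commented-out sequential loop above
# (tokenize one hypothesis and score it against all references with the given weights):
def procedure_sketch(sentence_entry, reference, weight):
    hypothesis = nltk.word_tokenize(sentence_entry['generated_sentence'])
    return calc_bleu(reference, hypothesis, weight)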
def main():
    args = parser.parse_args()
    pp.pprint(vars(args))
    config = vars(args)

    # train with different datasets
    if args.dataset == 'oracle':
        oracle_model = OracleLstm(num_vocabulary=args.vocab_size, batch_size=args.batch_size,
                                  emb_dim=args.gen_emb_dim, hidden_dim=args.hidden_dim,
                                  sequence_length=args.seq_len, start_token=args.start_token)
        oracle_loader = OracleDataLoader(args.batch_size, args.seq_len)
        gen_loader = OracleDataLoader(args.batch_size, args.seq_len)

        generator = models.get_generator(args.g_architecture, vocab_size=args.vocab_size,
                                         batch_size=args.batch_size, seq_len=args.seq_len,
                                         gen_emb_dim=args.gen_emb_dim, mem_slots=args.mem_slots,
                                         head_size=args.head_size, num_heads=args.num_heads,
                                         hidden_dim=args.hidden_dim, start_token=args.start_token)
        discriminator = models.get_discriminator(args.d_architecture, batch_size=args.batch_size,
                                                 seq_len=args.seq_len, vocab_size=args.vocab_size,
                                                 dis_emb_dim=args.dis_emb_dim, num_rep=args.num_rep,
                                                 sn=args.sn)
        oracle_train(generator, discriminator, oracle_model, oracle_loader, gen_loader, config)

    elif args.dataset in ['image_coco', 'emnlp_news']:
        # custom dataset selected
        data_file = resources_path(args.data_dir, '{}.txt'.format(args.dataset))
        sample_dir = resources_path(config['sample_dir'])
        oracle_file = os.path.join(sample_dir, 'oracle_{}.txt'.format(args.dataset))

        data_dir = resources_path(config['data_dir'])
        if args.dataset == 'image_coco':
            test_file = os.path.join(data_dir, 'testdata/test_coco.txt')
        elif args.dataset == 'emnlp_news':
            test_file = os.path.join(data_dir, 'testdata/test_emnlp.txt')
        else:
            raise NotImplementedError('Unknown dataset!')

        if args.dataset == 'emnlp_news':
            data_file, lda_file = create_subsample_data_file(data_file)
        else:
            lda_file = data_file

        seq_len, vocab_size, word_index_dict, index_word_dict = text_precess(
            data_file, test_file, oracle_file=oracle_file)
        config['seq_len'] = seq_len
        config['vocab_size'] = vocab_size
        print('seq_len: %d, vocab_size: %d' % (seq_len, vocab_size))

        config['topic_loss_weight'] = args.topic_loss_weight

        if config['LSTM']:
            if config['topic']:
                topic_number = config['topic_number']
                oracle_loader = RealDataTopicLoader(args.batch_size, args.seq_len)
                oracle_loader.set_dataset(args.dataset)
                oracle_loader.set_files(data_file, lda_file)
                oracle_loader.topic_num = topic_number
                oracle_loader.set_dictionaries(word_index_dict, index_word_dict)

                generator = models.get_generator(args.g_architecture, vocab_size=vocab_size,
                                                 batch_size=args.batch_size, seq_len=seq_len,
                                                 gen_emb_dim=args.gen_emb_dim, mem_slots=args.mem_slots,
                                                 head_size=args.head_size, num_heads=args.num_heads,
                                                 hidden_dim=args.hidden_dim, start_token=args.start_token,
                                                 TopicInMemory=args.topic_in_memory, NoTopic=args.no_topic)

                from real.real_gan.real_topic_train_NoDiscr import real_topic_train_NoDiscr
                real_topic_train_NoDiscr(generator, oracle_loader, config, args)
            else:
                generator = models.get_generator(args.g_architecture, vocab_size=vocab_size,
                                                 batch_size=args.batch_size, seq_len=seq_len,
                                                 gen_emb_dim=args.gen_emb_dim, mem_slots=args.mem_slots,
                                                 head_size=args.head_size, num_heads=args.num_heads,
                                                 hidden_dim=args.hidden_dim, start_token=args.start_token)

                oracle_loader = RealDataLoader(args.batch_size, args.seq_len)
                oracle_loader.set_dictionaries(word_index_dict, index_word_dict)
                oracle_loader.set_dataset(args.dataset)
                oracle_loader.set_files(data_file, lda_file)
                oracle_loader.topic_num = config['topic_number']

                from real.real_gan.real_train_NoDiscr import real_train_NoDiscr
                real_train_NoDiscr(generator, oracle_loader, config, args)
        else:
            if config['topic']:
                topic_number = config['topic_number']
                oracle_loader = RealDataTopicLoader(args.batch_size, args.seq_len)
                oracle_loader.set_dataset(args.dataset)
                oracle_loader.set_files(data_file, lda_file)
                oracle_loader.topic_num = topic_number
                oracle_loader.set_dictionaries(word_index_dict, index_word_dict)

                generator = models.get_generator(args.g_architecture, vocab_size=vocab_size,
                                                 batch_size=args.batch_size, seq_len=seq_len,
                                                 gen_emb_dim=args.gen_emb_dim, mem_slots=args.mem_slots,
                                                 head_size=args.head_size, num_heads=args.num_heads,
                                                 hidden_dim=args.hidden_dim, start_token=args.start_token,
                                                 TopicInMemory=args.topic_in_memory, NoTopic=args.no_topic)
                discriminator = models.get_discriminator(args.d_architecture, batch_size=args.batch_size,
                                                         seq_len=seq_len, vocab_size=vocab_size,
                                                         dis_emb_dim=args.dis_emb_dim, num_rep=args.num_rep,
                                                         sn=args.sn)
                if not args.no_topic:
                    topic_discriminator = models.get_topic_discriminator(args.topic_architecture,
                                                                         batch_size=args.batch_size,
                                                                         seq_len=seq_len, vocab_size=vocab_size,
                                                                         dis_emb_dim=args.dis_emb_dim,
                                                                         num_rep=args.num_rep, sn=args.sn,
                                                                         discriminator=discriminator)
                else:
                    topic_discriminator = None

                from real.real_gan.real_topic_train import real_topic_train
                real_topic_train(generator, discriminator, topic_discriminator, oracle_loader, config, args)
            else:
                generator = models.get_generator(args.g_architecture, vocab_size=vocab_size,
                                                 batch_size=args.batch_size, seq_len=seq_len,
                                                 gen_emb_dim=args.gen_emb_dim, mem_slots=args.mem_slots,
                                                 head_size=args.head_size, num_heads=args.num_heads,
                                                 hidden_dim=args.hidden_dim, start_token=args.start_token)
                discriminator = models.get_discriminator(args.d_architecture, batch_size=args.batch_size,
                                                         seq_len=seq_len, vocab_size=vocab_size,
                                                         dis_emb_dim=args.dis_emb_dim, num_rep=args.num_rep,
                                                         sn=args.sn)
                oracle_loader = RealDataLoader(args.batch_size, args.seq_len)

                from real.real_gan.real_train import real_train
                real_train(generator, discriminator, oracle_loader, config, args)

    elif args.dataset in ['Amazon_Attribute']:
        # custom dataset selected
        data_dir = resources_path(config['data_dir'], "Amazon_Attribute")
        sample_dir = resources_path(config['sample_dir'])
        oracle_file = os.path.join(sample_dir, 'oracle_{}.txt'.format(args.dataset))
        train_file = os.path.join(data_dir, 'train.csv')
        dev_file = os.path.join(data_dir, 'dev.csv')
        test_file = os.path.join(data_dir, 'test.csv')

        # create_tokens_files(data_files=[train_file, dev_file, test_file])
        config_file = load_json(os.path.join(data_dir, 'config.json'))
        config = {**config, **config_file}  # merge dictionaries

        from real.real_gan.loaders.amazon_loader import RealDataAmazonLoader
        oracle_loader = RealDataAmazonLoader(args.batch_size, args.seq_len)
        oracle_loader.create_batches(data_file=[train_file, dev_file, test_file])
        oracle_loader.model_index_word_dict = load_json(join(data_dir, 'index_word_dict.json'))
        oracle_loader.model_word_index_dict = load_json(join(data_dir, 'word_index_dict.json'))

        generator = models.get_generator("amazon_attribute", vocab_size=config['vocabulary_size'],
                                         batch_size=args.batch_size, seq_len=config['seq_len'],
                                         gen_emb_dim=args.gen_emb_dim, mem_slots=args.mem_slots,
                                         head_size=args.head_size, num_heads=args.num_heads,
                                         hidden_dim=args.hidden_dim, start_token=args.start_token,
                                         user_num=config['user_num'], product_num=config['product_num'],
                                         rating_num=5)
        discriminator = models.get_discriminator("amazon_attribute", batch_size=args.batch_size,
                                                 seq_len=config['seq_len'], vocab_size=config['vocabulary_size'],
                                                 dis_emb_dim=args.dis_emb_dim, num_rep=args.num_rep,
                                                 sn=args.sn)

        from real.real_gan.amazon_attribute_train import amazon_attribute_train
        amazon_attribute_train(generator, discriminator, oracle_loader, config, args)

    elif args.dataset in ['CustomerReviews', 'imdb']:
        from real.real_gan.loaders.custom_reviews_loader import RealDataCustomerReviewsLoader
        from real.real_gan.customer_reviews_train import customer_reviews_train

        # custom dataset selected
        if args.dataset == 'CustomerReviews':
            data_dir = resources_path(config['data_dir'], "MovieReviews", "cr")
        elif args.dataset == 'imdb':
            data_dir = resources_path(config['data_dir'], "MovieReviews", 'movie', 'sstb')
        else:
            raise ValueError
        sample_dir = resources_path(config['sample_dir'])
        oracle_file = os.path.join(sample_dir, 'oracle_{}.txt'.format(args.dataset))
        train_file = os.path.join(data_dir, 'train.csv')

        # create_tokens_files(data_files=[train_file, dev_file, test_file])
        config_file = load_json(os.path.join(data_dir, 'config.json'))
        config = {**config, **config_file}  # merge dictionaries

        oracle_loader = RealDataCustomerReviewsLoader(args.batch_size, args.seq_len)
        oracle_loader.create_batches(data_file=[train_file])
        oracle_loader.model_index_word_dict = load_json(join(data_dir, 'index_word_dict.json'))
        oracle_loader.model_word_index_dict = load_json(join(data_dir, 'word_index_dict.json'))

        generator = models.get_generator("CustomerReviews", vocab_size=config['vocabulary_size'],
                                         batch_size=args.batch_size, start_token=args.start_token,
                                         seq_len=config['seq_len'], gen_emb_dim=args.gen_emb_dim,
                                         mem_slots=args.mem_slots, head_size=args.head_size,
                                         num_heads=args.num_heads, hidden_dim=args.hidden_dim,
                                         sentiment_num=config['sentiment_num'])
        discriminator_positive = models.get_discriminator("CustomerReviews", scope="discriminator_positive",
                                                          batch_size=args.batch_size, seq_len=config['seq_len'],
                                                          vocab_size=config['vocabulary_size'],
                                                          dis_emb_dim=args.dis_emb_dim, num_rep=args.num_rep,
                                                          sn=args.sn)
        discriminator_negative = models.get_discriminator("CustomerReviews", scope="discriminator_negative",
                                                          batch_size=args.batch_size, seq_len=config['seq_len'],
                                                          vocab_size=config['vocabulary_size'],
                                                          dis_emb_dim=args.dis_emb_dim, num_rep=args.num_rep,
                                                          sn=args.sn)

        customer_reviews_train(generator, discriminator_positive, discriminator_negative,
                               oracle_loader, config, args)
    else:
        raise NotImplementedError('{}: unknown dataset!'.format(args.dataset))

    print("RUN FINISHED")
    return
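# Entry point. The command below is illustrative only: the exact flag names (e.g. --dataset)
# depend on how `parser` is configured elsewhere in this module.
#
#   python main.py --dataset image_coco
if __name__ == '__main__':
    main()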