def create_dataset(file_type, folder, train_diffs, train_msgs,
                   test_diffs, test_msgs, valid_diffs, valid_msgs):
    train_diffs, train_msgs, train_cnt, vocab_diffs, vocab_msgs = get_dataset(
        file_type, train_diffs, train_msgs)
    test_diffs, test_msgs, test_cnt, _, _ = get_dataset(
        file_type, test_diffs, test_msgs)
    valid_diffs, valid_msgs, valid_cnt, _, _ = get_dataset(
        file_type, valid_diffs, valid_msgs)

    # Recreate the output folder, then write the three splits and the vocab
    remove_dir(folder)
    make_dirs(folder)
    save_dataset(folder, "train." + str(train_cnt), train_diffs, train_msgs)
    save_dataset(folder, "test." + str(test_cnt), test_diffs, test_msgs)
    save_dataset(folder, "valid." + str(valid_cnt), valid_diffs, valid_msgs)
    save_vocab(folder, vocab_diffs, vocab_msgs)
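# A minimal sketch of the `save_vocab(folder, vocab_diffs, vocab_msgs)` helper
# called above (hypothetical: the real implementation is not shown). It assumes
# one token per line, with separate files for the diff-side and message-side
# vocabularies; the file names are illustrative only.
import os

def save_vocab_sketch(folder, vocab_diffs, vocab_msgs):
    for name, vocab in (("vocab.diff", vocab_diffs), ("vocab.msg", vocab_msgs)):
        with open(os.path.join(folder, name), "w", encoding="utf-8") as f:
            for token in vocab:
                f.write(token + "\n")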
def train_compass(self, compass_text, overwrite=False):
    compass_exists = os.path.isfile(os.path.join(self.opath, "compass.model"))
    if compass_exists and overwrite is False:
        self.compass = Word2Vec.load(os.path.join(self.opath, "compass.model"))
        print("Compass loaded from file.")
    else:
        sentences = CreateCorpus(compass_text)
        print("Training the compass.")
        if compass_exists:
            print("Compass will be overwritten after training")
        self.compass = self.train_model(sentences)
        self.compass.save(os.path.join(self.opath, "compass.model"))
        save_vocab(self.compass, "compass_twec")
    self.gvocab = self.compass.wv.vocab
def train_slice(self, slice_text, save=True):
    if self.compass is None:
        raise Exception("Missing Compass")
    print("Training temporal embeddings: slice {}.".format(slice_text))
    sentences = CreateCorpus(slice_text)
    model = self.train_model(sentences)
    model_name = os.path.splitext(os.path.basename(slice_text))[0]
    self.trained_slices[model_name] = model
    # modified saving function to save in w2v format
    if save:
        model.save(os.path.join(self.opath, model_name + ".model"))
        # Save vocab
        save_vocab(model, f'{model_name}_twec')
    return self.trained_slices[model_name]
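# A minimal sketch of the `save_vocab(model, name)` helper used by the two
# methods above (hypothetical: the actual helper is not shown). It assumes the
# vocabulary of a gensim Word2Vec model should be dumped one word per line,
# relying on the pre-4.0 `model.wv.vocab` mapping that the surrounding code
# already uses; the output directory is an assumption.
import os

def save_vocab_sketch(model, name, out_dir="vocabs"):
    os.makedirs(out_dir, exist_ok=True)
    with open(os.path.join(out_dir, name + ".txt"), "w", encoding="utf-8") as f:
        for word in model.wv.vocab:  # gensim < 4.0 vocabulary mapping
            f.write(word + "\n")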
def main(
    path_data: str,
    epochs: int,
    batch: int,
    vector_size: int,
    window: int,
    path_vectors: str,
    max_vocab: int,
    min_count: int,
    alpha: float,
    lr: float,
    x_max: int,
    save_mode: int,
):
    print("Preprocessing...")
    first_indices, second_indices, freq, word_index, word_counts = preprocessing(
        path_data, max_vocab, min_count, window)
    vocab_size = len(word_counts) + 1
    print("Vocab size:", vocab_size)

    print("Training...")
    model = train(
        first_indices=first_indices,
        second_indices=second_indices,
        frequencies=freq,
        epochs=epochs,
        batch=batch,
        vector_size=vector_size,
        vocab_size=vocab_size,
        alpha=alpha,
        lr=lr,
        x_max=x_max,
    )

    print("Saving vocab...")
    utils.save_vocab(config.VOCAB, word_counts)

    print("Saving embeddings file...")
    # path_folder = config.EMBEDDINGS.split("/")[0]
    # if not os.path.isdir(path_folder):
    #     os.mkdir(path_folder)
    utils.save_word2vec_format(model, path_vectors, word_index, vector_size,
                               save_mode)
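# A minimal sketch of what `utils.save_word2vec_format` might do (hypothetical:
# the real utility is not shown). The standard word2vec text format is a header
# line "vocab_size vector_size" followed by one "word v1 v2 ... vN" line per
# word. `embeddings` is assumed to be a NumPy array of shape
# (vocab_size, vector_size) and `word_index` a dict mapping word -> row index.
def save_word2vec_format_sketch(embeddings, path, word_index, vector_size):
    with open(path, "w", encoding="utf-8") as f:
        f.write(f"{len(word_index)} {vector_size}\n")
        for word, idx in word_index.items():
            vector = " ".join(f"{x:.6f}" for x in embeddings[idx])
            f.write(f"{word} {vector}\n")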
    raise NotImplementedError()

image_postfix = ".png"
helper = DataHelper(args.annot_file, args.ques_file)

# Write dataset to file
with open(args.output_file, "w") as output_file:
    for i in range(len(helper.dataset['annotations'])):
        imd_id = helper.dataset['annotations'][i]['image_id']
        img_name = image_prefix + pad_with_zero(imd_id, args) + image_postfix
        ques_id = helper.dataset['annotations'][i]['question_id']
        question = helper.qqa[ques_id]['question']
        # Convert to comma-separated token string
        question = ','.join(question.strip().split())
        answer = helper.dataset['annotations'][i]['multiple_choice_answer']
        # each line contains: image_filename [tab] question [tab] answer
        output_file.write(img_name + "\t" + question + "\t" + answer + "\n")
print('Saved dataset file at: {}'.format(args.output_file))

# Read the newly created dataset file to build the vocabulary & save to disk
if args.vocab_file:
    save_vocab(args.output_file, args.vocab_file, args.min_word_count, args.num_cls)
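# A minimal sketch of the `save_vocab(dataset_file, vocab_file, ...)` call above
# (hypothetical: the real helper is not shown). It assumes the vocabulary is
# rebuilt from the question tokens in the TSV written above, keeping words seen
# at least `min_word_count` times plus the `num_cls` most frequent answers.
from collections import Counter

def save_vocab_sketch(dataset_file, vocab_file, min_word_count, num_cls):
    word_counts, answer_counts = Counter(), Counter()
    with open(dataset_file, encoding="utf-8") as f:
        for line in f:
            _, question, answer = line.rstrip("\n").split("\t")
            word_counts.update(question.split(","))
            answer_counts[answer] += 1
    words = [w for w, c in word_counts.items() if c >= min_word_count]
    answers = [a for a, _ in answer_counts.most_common(num_cls)]
    with open(vocab_file, "w", encoding="utf-8") as f:
        f.write("\n".join(words + answers))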
def main(args):
    device = torch.device('cuda' if args.gpu else 'cpu')

    # construct Field objects
    SRC = data.Field(lower=True, init_token='<bos>', eos_token='<eos>')
    TGT = data.Field(lower=True, init_token='<bos>', eos_token='<eos>')
    fields = [('src', SRC), ('tgt', TGT)]

    slen_filter = lambda x: args.src_minlen <= len(x.src) <= args.src_maxlen \
                        and args.tgt_minlen <= len(x.tgt) <= args.tgt_maxlen

    train_data = data.TabularDataset(
        path=args.train,
        format='tsv',
        fields=fields,
        filter_pred=slen_filter,
    )
    valid_data = data.TabularDataset(
        path=args.valid,
        format='tsv',
        fields=fields,
        filter_pred=slen_filter,
    )

    # construct Vocab objects
    SRC.build_vocab(train_data, min_freq=args.src_min_freq)
    if args.src_embed_path is not None:
        vector = utils.load_vector(args.src_embed_path)
        SRC.vocab.load_vectors(vector)

    TGT.build_vocab(train_data, min_freq=args.tgt_min_freq)
    if args.tgt_embed_path is not None:
        vector = utils.load_vector(args.tgt_embed_path)
        TGT.vocab.load_vectors(vector)

    # save fields
    if not os.path.exists(args.savedir):
        os.mkdir(args.savedir)
    utils.save_field(args.savedir, fields)
    utils.save_vocab(args.savedir, fields)

    # set iterator
    train_iter, valid_iter = data.BucketIterator.splits(
        (train_data, valid_data),
        batch_size=args.batch_size,
        sort_within_batch=True,
        sort_key=lambda x: len(x.src),
        repeat=False,
    )

    print(f'| [src] Dictionary: {len(SRC.vocab.itos)} types')
    print(f'| [tgt] Dictionary: {len(TGT.vocab.itos)} types')
    print('')

    for iter_name, iterator in [('train', train_iter), ('valid', valid_iter)]:
        file_path = args.train if iter_name == 'train' else args.valid
        data_object = train_data if iter_name == 'train' else valid_data
        print(f' {iter_name}: {file_path}')
        for name, field in fields:
            n_tokens, n_unk = utils.get_statics(iterator, name, field)
            n_tokens -= 2 * len(data_object)  # subtract <bos> and <eos> from n_tokens
            print(f'| [{name}] {n_tokens} tokens,', end='')
            print(f' coverage: {100*(n_tokens-n_unk)/n_tokens:.{4}}%')
        print('')

    # construct model
    model = Transformer(fields, args).to(device)
    criterion = nn.CrossEntropyLoss(ignore_index=TGT.vocab.stoi['<pad>'])
    optimizer_fn = utils.get_optimizer(args.optimizer)
    optimizer = optimizer_fn(model.parameters(), lr=args.lr)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=5)
    trainer = Trainer(model, criterion, optimizer, scheduler, args.clip, iteration=0)

    print('=============== MODEL ===============')
    print(model)
    print('')
    print('=============== OPTIMIZER ===============')
    print(optimizer)
    print('')

    epoch = 1
    max_epoch = args.max_epoch or math.inf
    max_update = args.max_update or math.inf
    best_loss = math.inf

    while epoch < max_epoch and trainer.n_updates < max_update \
            and args.min_lr < trainer.get_lr():
        # train
        with tqdm(train_iter, dynamic_ncols=True) as pbar:
            train_loss = 0.0
            trainer.model.train()
            for samples in pbar:
                bsz = samples.src.size(1)
                srcs = samples.src.to(device)
                tgts = samples.tgt.to(device)
                loss = trainer.step(srcs, tgts)
                train_loss += loss.item()

                # setting of progressbar
                pbar.set_description(f"epoch {str(epoch).zfill(3)}")
                progress_state = OrderedDict(loss=loss.item(),
                                             ppl=math.exp(loss.item()),
                                             bsz=len(samples),
                                             lr=trainer.get_lr(),
                                             clip=args.clip,
                                             num_updates=trainer.n_updates)
                pbar.set_postfix(progress_state)
        train_loss /= len(train_iter)

        print(f"| epoch {str(epoch).zfill(3)} | train ", end="")
        print(f"| loss {train_loss:.{4}} ", end="")
        print(f"| ppl {math.exp(train_loss):.{4}} ", end="")
        print(f"| lr {trainer.get_lr():.1e} ", end="")
        print(f"| clip {args.clip} ", end="")
        print(f"| num_updates {trainer.n_updates} |")

        # validation
        valid_loss = 0.0
        trainer.model.eval()
        for samples in valid_iter:
            bsz = samples.src.size(1)
            srcs = samples.src.to(device)
            tgts = samples.tgt.to(device)
            loss = trainer.step(srcs, tgts)
            valid_loss += loss.item()
        valid_loss /= len(valid_iter)

        print(f"| epoch {str(epoch).zfill(3)} | valid ", end="")
        print(f"| loss {valid_loss:.{4}} ", end="")
        print(f"| ppl {math.exp(valid_loss):.{4}} ", end="")
        print(f"| lr {trainer.get_lr():.1e} ", end="")
        print(f"| clip {args.clip} ", end="")
        print(f"| num_updates {trainer.n_updates} |")

        # saving model
        save_vars = {
            'epoch': epoch,
            'iteration': trainer.n_updates,
            'best_loss': valid_loss if valid_loss < best_loss else best_loss,
            'args': args,
            'weights': model.state_dict()
        }

        if valid_loss < best_loss:
            best_loss = valid_loss  # keep track of the best validation loss
            filename = os.path.join(args.savedir, 'checkpoint_best.pt')
            torch.save(save_vars, filename)
        if epoch % args.save_epoch == 0:
            filename = os.path.join(args.savedir, f'checkpoint_{epoch}.pt')
            torch.save(save_vars, filename)
        filename = os.path.join(args.savedir, 'checkpoint_last.pt')
        torch.save(save_vars, filename)

        # update
        trainer.scheduler.step(valid_loss)
        epoch += 1
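# A minimal sketch of the `utils.save_vocab(savedir, fields)` helper used above
# (hypothetical: the real utility is not shown). It assumes each torchtext
# Field's vocabulary should be written one token per line so it can be
# inspected or reloaded without unpickling the Field object itself.
import os

def save_vocab_sketch(savedir, fields):
    for name, field in fields:
        path = os.path.join(savedir, f'{name}_vocab.txt')
        with open(path, 'w', encoding='utf-8') as f:
            for token in field.vocab.itos:
                f.write(token + '\n')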
# RNN_UNITS = 512
# RNN_TYPE = 'lstm'
##############################

vocab_tone, idx2char_tone, char2idx_tone = build_vocab_tone(tone_dataframe)
dataset_tone = build_dataset_tone(tone_dataframe, vocab_tone, idx2char_tone,
                                  char2idx_tone, MAX_WORD_LENGTH)

# Path where the vocab will be saved
logs_dir = os.path.join(working_dir, 'logs')
os.makedirs(logs_dir, exist_ok=True)
vocab_file_tone = os.path.join(logs_dir, 'vocab_tone.json')
save_vocab(vocab_tone, idx2char_tone, char2idx_tone, vocab_file_tone)

dataset_train_tone, dataset_val_tone = split_dataset(dataset_tone)
dataset_train_tone = dataset_train_tone.batch(BATCH_SIZE, drop_remainder=True)
dataset_val_tone = dataset_val_tone.batch(BATCH_SIZE, drop_remainder=True)

model_tone = build_tonenet_model(
    name='ToneNetwork',
    vocab_size=len(vocab_tone),
    max_word_len=MAX_WORD_LENGTH,
    embedding_dim=EMBEDDING_DIM,
    rnn_type=RNN_TYPE,
    rnn_units=RNN_UNITS,
    learning_rate=0.01,
)
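# A minimal sketch of the `save_vocab(vocab, idx2..., ...2idx, vocab_file)`
# helper used in this and the following snippets (hypothetical: the real helper
# is not shown). Since the target is a .json file, the natural assumption is a
# single JSON object holding the vocabulary and both lookup tables.
import json

def save_vocab_sketch(vocab, idx2token, token2idx, vocab_file):
    with open(vocab_file, 'w', encoding='utf-8') as f:
        json.dump({
            'vocab': list(vocab),
            'idx2token': idx2token,
            'token2idx': token2idx,
        }, f, ensure_ascii=False)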
# Load the training dataset
word_sents, label_sents = conll_dataset_to_word_AND_label_sents("train")

# Count the number of occurrences of each lowercased word
nb_occurs = {}
for word_sent in word_sents:
    for word in word_sent:
        lword = word.lower()
        if lword not in nb_occurs:
            nb_occurs[lword] = 0
        nb_occurs[lword] += 1

# Keep only the most frequent words.
# This is done to improve generalization on never-seen-before words.
sorted_nb_occurs = sorted(nb_occurs.items(), key=lambda kv: kv[1], reverse=True)
sorted_nb_occurs = sorted_nb_occurs[:int(args.keep * len(nb_occurs))]

# Build and save vocabulary
# Rk: id 0 is reserved for padding and id 1 for never-seen-before words.
vocab = {}
for i, (lword, _) in enumerate(sorted_nb_occurs):
    vocab[lword] = i + 2
save_vocab(vocab)
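# Illustration of the id convention noted above (padding = 0, unknown = 1),
# assuming `vocab` maps lowercased words to ids starting at 2. This helper is
# hypothetical and not part of the original script.
def words_to_ids(words, vocab, pad_to=None):
    ids = [vocab.get(w.lower(), 1) for w in words]  # 1 = never-seen-before word
    if pad_to is not None:
        ids += [0] * (pad_to - len(ids))            # 0 = padding
    return ids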
def main(args):
    device = torch.device('cuda' if args.gpu else 'cpu')

    if args.re_training is None:
        TEXT = data.Field(lower=True, init_token='<bos>', eos_token='<eos>')
    else:
        basedir, _ = os.path.split(args.re_training)
        path = os.path.join(basedir, 'text.field')
        TEXT = utils.load_field(path)

    fields = [('text', TEXT)] if args.task in monolingual_tasks \
        else [('src', TEXT), ('tgt', TEXT)]

    slen_filter = lambda x: args.src_minlen <= len(x.src) <= args.src_maxlen \
                        and args.tgt_minlen <= len(x.tgt) <= args.tgt_maxlen

    # load training data
    if args.task == 'translation':
        train_data = data.TabularDataset(
            path=args.train,
            format='tsv',
            fields=fields,
            filter_pred=slen_filter,
        )
    else:  # `causal`, `masked`
        train_data = datasets.LanguageModelingDataset(
            path=args.train,
            text_field=TEXT,
            newline_eos=True,
        )

    # set Vocabulary object
    if args.re_training is None:
        TEXT.build_vocab(
            train_data,
            min_freq=args.min_freq,
            specials=['<sep>', '<mask>'],
        )
        if args.embed_path:
            vectors = utils.load_vector(args.embed_path)
            TEXT.vocab.load_vectors(vectors)

    if not os.path.exists(args.savedir):
        os.mkdir(args.savedir)

    # save a field object
    with open(os.path.join(args.savedir, 'text.field'), 'wb') as fout:
        dill.dump(TEXT, fout)
    utils.save_vocab(args.savedir, TEXT)

    # set training iterator
    if args.task == 'translation':
        train_iter = data.BucketIterator(
            train_data,
            batch_size=args.batch_size,
            sort_within_batch=True,
            sort_key=lambda x: len(x.src),
            repeat=False,
        )
    else:  # `causal`, `masked`
        train_iter = data.BPTTIterator(
            train_data,
            batch_size=args.batch_size,
            bptt_len=args.bptt_len,
            train=True,
            repeat=False,
            shuffle=True,
        )

    print(f'| [text] Dictionary: {len(TEXT.vocab.itos)} types')
    print('')
    print(f'train: {args.train}')
    for name, field in fields:
        n_tokens, n_unk = utils.get_statics(train_iter, name, field)
        print(f'| [{name}] {n_tokens} tokens,', end='')
        print(f' coverage: {100*(n_tokens-n_unk)/n_tokens:.{4}}%')
    print('')

    # build a model
    model_class = get_model(args.task)
    if args.re_training is None:
        epoch = 1
        iteration = 0
        best_loss = math.inf
        model = model_class(TEXT, args).to(device)
    else:
        load_vars = torch.load(args.re_training)
        epoch = load_vars['epoch'] + 1
        iteration = load_vars['iteration']
        best_loss = load_vars['best_loss']
        lm_args, lm_weights = load_vars['args'], load_vars['weights']
        model = model_class(TEXT, lm_args)
        model.load_state_dict(lm_weights)
        model.to(device)

    criterion = nn.CrossEntropyLoss(ignore_index=TEXT.vocab.stoi['<pad>'])
    optimizer_fn = utils.get_optimizer(args.optimizer)
    optimizer = optimizer_fn(model.parameters(), lr=args.lr)
    trainer = Trainer(model, criterion, optimizer, args.clip, iteration)

    # show the details of model and optimizer
    print('=============== MODEL ===============')
    print(model)
    print('')
    print('=============== OPTIMIZER ===============')
    print(optimizer)
    print('')

    max_epoch = args.max_epoch or math.inf
    max_update = args.max_update or math.inf
    assert not (max_epoch == math.inf and max_update == math.inf), \
        'Please set `--max-epoch` or `--max-update`.'
    while epoch <= max_epoch and trainer.n_updates <= max_update:
        # training
        with tqdm(train_iter, dynamic_ncols=True) as pbar:
            train_loss = 0.0
            trainer.model.train()
            for samples in pbar:
                if args.task in monolingual_tasks:
                    srcs = samples.text.to(device)
                    tgts = None
                    refs = None if args.task == 'masked' \
                        else samples.target.to(device)
                else:
                    srcs = samples.src.to(device)
                    tgts = samples.tgt.to(device)
                    refs = None
                loss = trainer.step(srcs, tgts, refs)
                train_loss += loss.item()

                # setting of progressbar
                pbar.set_description(f'epoch {str(epoch).zfill(3)}')
                progress_state = OrderedDict(task=args.task,
                                             loss=loss.item(),
                                             ppl=math.exp(loss.item()),
                                             bsz=srcs.size(1),
                                             lr=trainer.get_lr(),
                                             clip=args.clip,
                                             num_updates=trainer.n_updates)
                pbar.set_postfix(progress_state)
        train_loss /= len(train_iter)

        print(f'| epoch {str(epoch).zfill(3)} | train ', end='')
        print(f'| loss {train_loss:.{4}} ', end='')
        print(f'| ppl {math.exp(train_loss):.{4}} ', end='')
        print(f'| lr {trainer.get_lr():.1e} ', end='')
        print(f'| clip {args.clip} ', end='')
        print(f'| num_updates {trainer.n_updates} |')

        # saving model
        save_vars = {
            'epoch': epoch,
            'iteration': trainer.n_updates,
            'best_loss': train_loss if train_loss < best_loss else best_loss,
            'args': args,
            'weights': model.state_dict()
        }

        if train_loss < best_loss:
            best_loss = train_loss
            filename = os.path.join(args.savedir, 'checkpoint_best.pt')
            torch.save(save_vars, filename)
        if epoch % args.save_epoch == 0:
            filename = os.path.join(args.savedir, f'checkpoint_{epoch}.pt')
            torch.save(save_vars, filename)
        filename = os.path.join(args.savedir, 'checkpoint_last.pt')
        torch.save(save_vars, filename)

        # update
        epoch += 1
cfg.seed = cfg.seed if cfg.seed else random.randint(1, 10000)
print('Random seed: {}'.format(cfg.seed))
torch.manual_seed(cfg.seed)
np.random.seed(cfg.seed)
random.seed(cfg.seed)

result_json = pjoin(cfg.savepath, 'result.json') if cfg.resume_result_json else None

# DATA
dataset = AttributeDataLoader(mbsize=cfg.vae.batch_size,
                              max_seq_len=cfg.max_seq_len,
                              device=device,
                              attributes=cfg.attributes,
                              **cfg.data_kwargs)
dataset.print_stats()
utils.save_vocab(dataset.TEXT.vocab, cfg.vocab_path)

# MODEL
if cfg.model.pretrained_emb:
    cfg.model.pretrained_emb = dataset.get_vocab_vectors()
model = RNN_VAE(n_vocab=dataset.n_vocab, max_seq_len=cfg.max_seq_len,
                **cfg.model).to(device)
print(model)
if cfg.loadpath:
    model.load_state_dict(torch.load(cfg.loadpath))
    print('Loaded model from ' + cfg.loadpath)
# ---------------------------------------------#
##############################

vocab_rhyme, idx2syl_rhyme, syl2idx_rhyme = build_vocab_rhyme(divine_comedy)
dataset_rhyme = build_dataset_rhyme(divine_comedy, vocab_rhyme, idx2syl_rhyme,
                                    syl2idx_rhyme, seq_length=SEQ_LENGTH)

# Path where the vocab will be saved
logs_dir = os.path.join(working_dir, 'logs')
os.makedirs(logs_dir, exist_ok=True)
vocab_file_rhyme = os.path.join(logs_dir, 'vocab_rhyme.json')
save_vocab(vocab_rhyme, idx2syl_rhyme, syl2idx_rhyme, vocab_file_rhyme)

dataset_train_rhyme, dataset_val_rhyme = split_dataset(dataset_rhyme)
dataset_train_rhyme = dataset_train_rhyme.batch(BATCH_SIZE, drop_remainder=True)
dataset_val_rhyme = dataset_val_rhyme.batch(BATCH_SIZE, drop_remainder=True)

model_rhyme = build_model(
    name='RhymeNetwork',
    vocab_size=len(vocab_rhyme),
    seq_length=SEQ_LENGTH,
    embedding_dim=EMBEDDING_DIM,
    rnn_type=RNN_TYPE,
    rnn_units=RNN_UNITS,
    learning_rate=0.01,
)
def main(args):
    device = torch.device('cuda' if args.gpu else 'cpu')

    if args.model:
        basedir, _ = os.path.split(args.model)
        path = os.path.join(basedir, 'text.field')
        TEXT = utils.load_field(path)
    else:
        TEXT = data.Field(lower=True, init_token='<bos>', eos_token='<eos>')

    fields = [('src', TEXT), ('tgt', TEXT)] if args.mode else [('src', TEXT)]

    # load training data
    if args.mode == 'finetune':
        slen_filter = lambda x: args.src_minlen <= len(x.src) <= args.src_maxlen \
                            and args.tgt_minlen <= len(x.tgt) <= args.tgt_maxlen
        train_data = data.TabularDataset(
            path=args.train,
            format='tsv',
            fields=fields,
            filter_pred=slen_filter,
        )
    else:  # pre-train
        train_data = datasets.LanguageModelingDataset(
            path=args.train, text_field=TEXT, newline_eos=True)

    # set Vocabulary object
    if args.model is None:
        TEXT.build_vocab(
            train_data,
            min_freq=args.min_freq,
            specials=['<sep>', '<mask>'],
        )

    if not os.path.exists(args.savedir):
        os.mkdir(args.savedir)
    utils.save_field(args.savedir, [('text', TEXT)])
    utils.save_vocab(args.savedir, [('text', TEXT)])

    # set training iterator
    if args.mode == 'finetune':
        train_iter = data.BucketIterator(
            train_data,
            batch_size=args.batch_size,
            sort_within_batch=True,
            sort_key=lambda x: len(x.src),
            repeat=False,
        )
    else:  # pre-train
        train_iter = datasets.BPTTIterator(
            train_data,
            batch_size=args.batch_size,
            bptt_len=args.bptt_len,
            train=True,
            repeat=False,
            shuffle=True,
        )

    print(f'| [text] Dictionary: {len(TEXT.vocab.itos)} types')
    print('')
    print(f' train: {args.train}')
    utils.get_stats(train_iter, fields)

    # load validation data
    if args.valid is not None:
        if args.mode == 'finetune':
            valid_data = data.TabularDataset(
                path=args.valid,
                format='tsv',
                fields=fields,
                filter_pred=slen_filter,
            )
            valid_iter = data.BucketIterator(
                valid_data,
                batch_size=args.batch_size,
                sort_within_batch=True,
                sort_key=lambda x: len(x.src),
                train=False,
                repeat=False,
                shuffle=False,
            )
        else:  # pre-train
            valid_data = datasets.LanguageModelingDataset(
                path=args.valid, text_field=TEXT, newline_eos=True)
            valid_iter = datasets.BPTTIterator(
                valid_data,
                batch_size=args.batch_size,
                bptt_len=args.bptt_len,
                train=False,
                repeat=False,
                shuffle=False,
            )
        print(f'valid: {args.valid}')
        utils.get_stats(valid_iter, fields)

    # build a model
    if args.model:
        load_vars = torch.load(args.model)
        epoch = load_vars['epoch'] + 1
        best_loss = load_vars['best_loss']
        lm_args, lm_weights = load_vars['args'], load_vars['weights']
        model = TranslationLM(TEXT, lm_args)
        model.load_state_dict(lm_weights)
        model.to(device)
    else:
        epoch = 1
        best_loss = math.inf
        model = TranslationLM(TEXT, args).to(device)

    criterion = nn.CrossEntropyLoss(ignore_index=TEXT.vocab.stoi['<pad>'])
    optimizer_fn = utils.get_optimizer(args.optimizer)
    optimizer = optimizer_fn(model.parameters(), lr=args.lr)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min')

    # show the details of model and optimizer
    print('=============== MODEL ===============')
    print(model)
    print('')
    print('=============== OPTIMIZER ===============')
    print(optimizer)
    print('')

    max_epoch = (args.max_epoch or math.inf) + epoch

    while epoch < max_epoch and args.min_lr < optimizer.param_groups[0]['lr']:
        # training
        model.train()
        loss = step(epoch, args.mode, model, train_iter, criterion, optimizer, device)

        # validation
        if args.valid is not None:
            model.eval()
            loss = step(epoch, args.mode, model, valid_iter, criterion, optimizer, device)

        # saving model
        save_vars = {
            'epoch': epoch,
            'best_loss': loss if loss < best_loss else best_loss,
            'args': args,
            'weights': model.state_dict()
        }

        if loss < best_loss:
            best_loss = loss
            filename = os.path.join(args.savedir, 'checkpoint_best.pt')
            torch.save(save_vars, filename)
        if epoch % args.save_epoch == 0:
            filename = os.path.join(args.savedir, f'checkpoint_{epoch}.pt')
            torch.save(save_vars, filename)
        filename = os.path.join(args.savedir, 'checkpoint_last.pt')
        torch.save(save_vars, filename)

        # update
        scheduler.step(best_loss)
        epoch += 1
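# A minimal usage sketch (hypothetical) of the artifacts saved above: the
# pickled `text.field` object and the `checkpoint_best.pt` dictionary can be
# reloaded to rebuild the model for inference or further fine-tuning.
import os
import dill
import torch

def load_best_checkpoint(savedir, model_class):
    with open(os.path.join(savedir, 'text.field'), 'rb') as fin:
        TEXT = dill.load(fin)
    ckpt = torch.load(os.path.join(savedir, 'checkpoint_best.pt'))
    model = model_class(TEXT, ckpt['args'])
    model.load_state_dict(ckpt['weights'])
    return TEXT, model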
dataset = build_dataset(divine_comedy, vocab, idx2syl, syl2idx, seq_length=SEQ_LENGTH)

print("Corpus length: {} syllables".format(len(text_in_syls(divine_comedy))))
print("Vocab size:", len(vocab))

# Path where the vocab will be saved
logs_dir = os.path.join(working_dir, 'logs')
os.makedirs(logs_dir, exist_ok=True)
vocab_file = os.path.join(logs_dir, 'vocab.json')
save_vocab(vocab, idx2syl, syl2idx, vocab_file)

dataset_train, dataset_val = split_dataset(dataset)
dataset_train = dataset_train.batch(BATCH_SIZE, drop_remainder=True)
dataset_val = dataset_val.batch(BATCH_SIZE, drop_remainder=True)

model = build_model(
    vocab_size=len(vocab),
    seq_length=SEQ_LENGTH,
    embedding_dim=EMBEDDING_DIM,
    rnn_type=RNN_TYPE,
    rnn_units=RNN_UNITS,
    learning_rate=0.01,
)
elif args.model_architecture == 'fasttext':
    model = FT_gensim(size=args.size,
                      window=args.window,
                      min_count=args.min_count,
                      workers=args.threads,
                      sg=args.sg,
                      hs=args.hs,
                      negative=args.ns)

# build the vocabulary
model.build_vocab(sentences)

# train the model
model.train(sentences,
            epochs=model.epochs,
            total_examples=model.corpus_count,
            total_words=model.corpus_total_words)

elapsed = time.time()
logging.info(f'Training finished. Took {elapsed-start} s')
logging.info(f'Vocab size: {len(model.wv.vocab)}')

# Save model to disk
if args.format == 'gensim':
    model.wv.save(f'{MODELS_FOLDER / args.model_path}', separately=['vectors'])
elif args.format == 'w2v':
    model.wv.save_word2vec_format(f'{MODELS_FOLDER / args.model_path}.txt', binary=True)

# Save vocab to disk
save_vocab(model, args.vocab_path)
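# A minimal sketch of the `save_vocab(model, vocab_path)` helper used above
# (hypothetical: the real helper is not shown). It assumes the gensim (< 4.0)
# vocabulary should be dumped as "word<TAB>count" lines, sorted by frequency.
def save_vocab_sketch(model, vocab_path):
    items = sorted(model.wv.vocab.items(), key=lambda kv: kv[1].count, reverse=True)
    with open(vocab_path, 'w', encoding='utf-8') as f:
        for word, entry in items:
            f.write(f'{word}\t{entry.count}\n')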
def train():
    args = hparams()

    print("Load data...")
    x_text = utils.load_data(args.text_file)
    vocab_dict = utils.build_vocab(x_text)
    x_data = utils.transform(x_text, vocab_dict)
    x_data = x_data[:-2]
    y_data = x_data[1:]

    # Split train/test set
    dev_sample_index = -1 * int(args.dev_sample_percentage * float(len(x_data)))
    x_train, x_dev = x_data[:dev_sample_index], x_data[dev_sample_index:]
    y_train, y_dev = y_data[:dev_sample_index], y_data[dev_sample_index:]
    print("Train/Dev split: {:d}/{:d}".format(len(x_train), len(x_dev)))

    utils.save_vocab(vocab_dict, args.vocab_file)
    del x_text, x_data, y_data

    # Training
    sess = tf.Session()
    with sess.as_default():
        rnn = RNNLM(vocab_size=args.vocab_size,
                    embedding_dim=args.embedding_dim,
                    rnn_size=args.rnn_size,
                    num_layers=args.num_layers,
                    batch_size=args.batch_size,
                    training=True)

        # Define train_op
        global_step = tf.Variable(0, name="global_step", trainable=False)
        learning_rate = tf.Variable(args.learning_rate, name="learning_rate",
                                    trainable=False)
        optimizer = tf.train.AdamOptimizer(learning_rate)
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(rnn.loss, tvars),
                                          args.max_grad_norm)
        train_op = optimizer.apply_gradients(zip(grads, tvars),
                                             global_step=global_step)

        # Save model params
        checkpoint_dir = os.path.abspath(os.path.join(os.path.curdir, "checkpoints"))
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)
        checkpoint_file = os.path.join(checkpoint_dir, "model")

        # Save best model params
        dev_dir = os.path.abspath(os.path.join(os.path.curdir, "dev"))
        if not os.path.exists(dev_dir):
            os.makedirs(dev_dir)
        dev_file = os.path.join(dev_dir, "model")

        saver = tf.train.Saver(tf.global_variables(), max_to_keep=20)
        dev_loss = 2e+50

        # Initialize all variables
        sess.run(tf.global_variables_initializer())

        # Training loop...
        for epoch in range(args.num_epochs):
            # Generate batches
            x_batches = utils.batch_iter(x_train, args.sequence_length, args.batch_size)
            y_batches = utils.batch_iter(y_train, args.sequence_length, args.batch_size)
            initial_state = sess.run(rnn.initial_state)

            for x_batch, y_batch in zip(x_batches, y_batches):
                feed_dict = {
                    rnn.input_data: x_batch,
                    rnn.targets: y_batch,
                    rnn.input_keep_prob: args.input_keep_prob,
                    rnn.output_keep_prob: args.output_keep_prob,
                    rnn.initial_state: initial_state
                }
                _, step, loss = sess.run([train_op, global_step, rnn.loss],
                                         feed_dict=feed_dict)
                time_str = datetime.datetime.now().isoformat()
                print("{}: step {}, loss {:g}".format(time_str, step, loss))

                # Evaluate on dev set
                current_step = tf.train.global_step(sess, global_step)
                if current_step % args.checkpoint_steps == 0:
                    saver.save(sess, checkpoint_file, global_step=current_step)
                    print("Save model to %s" % checkpoint_file)
                if current_step % args.evaluate_steps == 0:
                    x_dev_batches = utils.batch_iter(x_dev, args.sequence_length,
                                                     args.batch_size)
                    y_dev_batches = utils.batch_iter(y_dev, args.sequence_length,
                                                     args.batch_size)
                    dev_losses = 0.0
                    i = 0
                    for x_dev_batch, y_dev_batch in zip(x_dev_batches, y_dev_batches):
                        dev_feed_dict = {
                            rnn.input_data: x_dev_batch,
                            rnn.targets: y_dev_batch,
                            rnn.input_keep_prob: 1.0,
                            rnn.output_keep_prob: 1.0,
                            rnn.initial_state: initial_state
                        }
                        step, loss = sess.run([global_step, rnn.loss],
                                              feed_dict=dev_feed_dict)
                        time_str = datetime.datetime.now().isoformat()
                        dev_losses += loss
                        i += 1
                    loss = dev_losses / i
                    print("Evaluate on dev set:")
                    print("{}: step {}, loss {:g}".format(time_str, step, loss))
                    if dev_loss > loss:
                        dev_loss = loss
                        saver.save(sess, dev_file, global_step=current_step)
                        print("Save better model to %s" % dev_file)
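# A minimal sketch of `utils.save_vocab(vocab_dict, vocab_file)` as called in
# the script above (hypothetical: the actual utility is not shown). It assumes
# `vocab_dict` maps token -> integer id and should be persisted as JSON so it
# can be reloaded for sampling or inference.
import json

def save_vocab_sketch(vocab_dict, vocab_file):
    with open(vocab_file, "w", encoding="utf-8") as f:
        json.dump(vocab_dict, f, ensure_ascii=False, indent=2)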
##############################

vocab_verse, idx2syl_verse, syl2idx_verse = build_vocab_verse(divine_comedy)
dataset_verse = build_dataset_verse(divine_comedy, vocab_verse, idx2syl_verse,
                                    syl2idx_verse, seq_length=SEQ_LENGTH)

# Path where the vocab will be saved
logs_dir = os.path.join(working_dir, 'logs')
os.makedirs(logs_dir, exist_ok=True)
vocab_file_verse = os.path.join(logs_dir, 'vocab_verse.json')
save_vocab(vocab_verse, idx2syl_verse, syl2idx_verse, vocab_file_verse)

dataset_train_verse, dataset_val_verse = split_dataset(dataset_verse)
dataset_train_verse = dataset_train_verse.batch(BATCH_SIZE, drop_remainder=True)
dataset_val_verse = dataset_val_verse.batch(BATCH_SIZE, drop_remainder=True)

model_verse = build_model(
    name='VerseNetwork',
    vocab_size=len(vocab_verse),
    seq_length=SEQ_LENGTH,
    embedding_dim=EMBEDDING_DIM,
    rnn_type=RNN_TYPE,
    rnn_units=RNN_UNITS,
    learning_rate=0.01,
)
dataset = build_dataset(divine_comedy, vocab, idx2char, char2idx, seq_length=SEQ_LENGTH)

print("Corpus length: {} characters".format(len(divine_comedy)))
print("Vocab size:", len(vocab))

# Path where the vocab will be saved
logs_dir = os.path.join(working_dir, 'logs')
os.makedirs(logs_dir, exist_ok=True)
vocab_file = os.path.join(logs_dir, 'vocab.json')
save_vocab(vocab, idx2char, char2idx, vocab_file)

dataset_train, dataset_val = split_dataset(dataset)
dataset_train = dataset_train.batch(BATCH_SIZE, drop_remainder=True)
dataset_val = dataset_val.batch(BATCH_SIZE, drop_remainder=True)

model = build_model(
    vocab_size=len(vocab),
    seq_length=SEQ_LENGTH,
    embedding_dim=EMBEDDING_DIM,
    rnn_type=RNN_TYPE,
    rnn_units=RNN_UNITS,
    learning_rate=0.01,
)
# EPOCHS = 200
# SEQ_LENGTH = 75
# EMBEDDING_DIM = 256
# RNN_UNITS = 1024
# RNN_TYPE = 'gru'
##############################

vocab, idx2word, word2idx = build_vocab(divine_comedy)

# Path where the vocab will be saved
logs_dir = os.path.join(working_dir, 'logs')
os.makedirs(logs_dir, exist_ok=True)
vocab_file = os.path.join(logs_dir, 'vocab.json')
save_vocab(vocab, idx2word, word2idx, vocab_file)

dataset = build_dataset(divine_comedy, vocab, idx2word, word2idx, seq_length=SEQ_LENGTH)

print("Corpus length: {} words".format(len(divine_comedy)))
print("Vocab size:", len(vocab))

dataset_train, dataset_val = split_dataset(dataset)

# for s in dataset_train.take(1).as_numpy_iterator():
#     print(s)