import math
import time

import numpy as np
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.nn.utils.rnn import pack_padded_sequence

# Project-specific helpers (DataProvider, CharTranslator, get_classifier,
# repackage_hidden, eval_translator, eval_classify, eval_model, save_checkpoint)
# are assumed to be imported from the repository's own modules.


# Training loop for the generative character/word translator model.
def main(params):
    dp = DataProvider(params)

    # Create vocabulary and author index
    if params['resume'] is None:
        if params['atoms'] == 'char':
            char_to_ix, ix_to_char = dp.createCharVocab(params['vocab_threshold'])
        else:
            char_to_ix, ix_to_char = dp.createWordVocab(params['vocab_threshold'])
        auth_to_ix, ix_to_auth = dp.createAuthorIdx()
    else:
        saved_model = torch.load(params['resume'])
        char_to_ix = saved_model['char_to_ix']
        auth_to_ix = saved_model['auth_to_ix']
        ix_to_auth = saved_model['ix_to_auth']
        ix_to_char = saved_model['ix_to_char']

    params['vocabulary_size'] = len(char_to_ix)
    params['num_output_layers'] = len(auth_to_ix)

    model = CharTranslator(params)
    # Set to train mode; this activates dropout
    model.train()

    # Initialize the optimizer (SGD or RMSprop)
    if params['use_sgd']:
        optim = torch.optim.SGD(model.parameters(), lr=params['learning_rate'],
                                momentum=params['decay_rate'])
    else:
        optim = torch.optim.RMSprop(model.parameters(), lr=params['learning_rate'],
                                    alpha=params['decay_rate'], eps=params['smooth_eps'])

    # Loss function
    if params['mode'] == 'generative':
        criterion = nn.CrossEntropyLoss()
    else:
        criterion = nn.NLLLoss()

    # Restore saved checkpoint
    if params['resume'] is not None:
        model.load_state_dict(saved_model['state_dict'])
        optim.load_state_dict(saved_model['optimizer'])

    total_loss = 0.
    start_time = time.time()
    hidden = model.init_hidden(params['batch_size'])
    hidden_zeros = model.init_hidden(params['batch_size'])

    # Initialize the hidden-state cache
    if params['randomize_batches']:
        dp.set_hid_cache(range(len(dp.data['docs'])), hidden_zeros)

    # Compute the iteration parameters
    epochs = params['max_epochs']
    total_seqs = dp.get_num_sents(split='train')
    iter_per_epoch = total_seqs // params['batch_size']
    total_iters = iter_per_epoch * epochs
    best_loss = 1000000.
    best_val = 1000.
    eval_every = int(iter_per_epoch * params['eval_interval'])

    # val_score = eval_model(dp, model, params, char_to_ix, auth_to_ix, split='val', max_docs=params['num_eval'])
    val_score = 0.
    val_rank = 1000

    eval_function = eval_translator if params['mode'] == 'generative' else eval_classify
    leakage = 0.  # params['leakage']

    print total_iters
    for i in xrange(total_iters):
        # TODO
        if params['split_generators']:
            c_aid = ix_to_auth[np.random.choice(auth_to_ix.values())]
        else:
            c_aid = None

        batch = dp.get_sentence_batch(params['batch_size'], split='train',
                                      atoms=params['atoms'], aid=c_aid,
                                      sample_by_len=params['sample_by_len'])
        inps, targs, auths, lens = dp.prepare_data(batch, char_to_ix, auth_to_ix,
                                                   maxlen=params['max_seq_len'])

        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to the start of the dataset.
        hidden = repackage_hidden(hidden)
        optim.zero_grad()

        # TODO
        if params['mode'] == 'generative':
            output, _ = model.forward_mltrain(inps, lens, inps, lens, hidden_zeros, auths=auths)
            targets = pack_padded_sequence(Variable(targs).cuda(), lens)
            loss = criterion(pack_padded_sequence(output, lens)[0], targets[0])
        else:
            # For the classifier, auths is the target
            output, hidden = model.forward_classify(inps, hidden, compute_softmax=True)
            targets = Variable(auths).cuda()
            loss = criterion(output, targets)

        loss.backward()
        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm(model.parameters(), params['grad_clip'])
        # Take an optimization step
        optim.step()

        total_loss += loss.data.cpu().numpy()[0]

        # Periodically evaluate on the validation split
        if i % eval_every == 0 and i > 0:
            val_rank, val_score = eval_function(dp, model, params, char_to_ix, auth_to_ix,
                                                split='val')

        # if i % iter_per_epoch == 0 and i > 0 and leakage > params['leakage_min']:
        #     leakage = leakage * params['leakage_decay']
        # if (i % iter_per_epoch == 0) and ((i // iter_per_epoch) >= params['lr_decay_st']):

        if i % params['log_interval'] == 0 and i > 0:
            cur_loss = total_loss / params['log_interval']
            elapsed = time.time() - start_time
            print('| epoch {:2.2f} | {:5d}/{:5d} batches | lr {:02.2e} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                      float(i) / iter_per_epoch, i, total_iters, params['learning_rate'],
                      elapsed * 1000 / params['log_interval'], cur_loss, math.exp(cur_loss)))
            total_loss = 0.

            if val_rank <= best_val:
                save_checkpoint({
                    'iter': i,
                    'arch': params,
                    'val_loss': val_rank,
                    'val_pplx': val_score,
                    'char_to_ix': char_to_ix,
                    'ix_to_char': ix_to_char,
                    'auth_to_ix': auth_to_ix,
                    'ix_to_auth': ix_to_auth,
                    'state_dict': model.state_dict(),
                    'loss': cur_loss,
                    'optimizer': optim.state_dict(),
                }, fappend=params['fappend'],
                   outdir=params['checkpoint_output_directory'])
                best_val = val_rank
            start_time = time.time()
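
# The training loops in this file call repackage_hidden() to truncate backpropagation
# through time. If that helper is not available from the repository's utilities, a
# minimal sketch following the standard PyTorch word_language_model idiom (and matching
# the Variable-era API used above) would look like this -- an assumption, not the
# repository's own definition:
def repackage_hidden(h):
    """Detach hidden states from their history so gradients stop at the batch boundary."""
    if type(h) == Variable:
        return Variable(h.data)
    else:
        return tuple(repackage_hidden(v) for v in h)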
# Training loop for the author classifier (the classifier-training counterpart of the
# function above; it appears to come from a separate training script, since both are
# named main()).
def main(params):
    dp = DataProvider(params)

    # Create vocabulary and author index
    if params['resume'] is None:
        if params['atoms'] == 'char':
            char_to_ix, ix_to_char = dp.create_char_vocab(params['vocab_threshold'])
        else:
            char_to_ix, ix_to_char = dp.create_word_vocab(params['vocab_threshold'])
        auth_to_ix, ix_to_auth = dp.create_author_idx()
    else:
        saved_model = torch.load(params['resume'])
        char_to_ix = saved_model['char_to_ix']
        auth_to_ix = saved_model['auth_to_ix']
        ix_to_char = saved_model['ix_to_char']

    params['vocabulary_size'] = len(char_to_ix)
    params['num_output_layers'] = len(auth_to_ix)
    print params['vocabulary_size'], params['num_output_layers']

    model = get_classifier(params)
    # Set to train mode; this activates dropout
    model.train()

    # Initialize the optimizer (SGD or RMSprop); decoder_W gets its own parameter
    # group so its weight decay can be set separately.
    if params['use_sgd']:
        optim = torch.optim.SGD(model.parameters(), lr=params['learning_rate'],
                                momentum=params['decay_rate'])
    else:
        optim = torch.optim.RMSprop(
            [{'params': [p[1] for p in model.named_parameters() if p[0] != 'decoder_W']},
             {'params': model.decoder_W, 'weight_decay': 0.000}],
            lr=params['learning_rate'], alpha=params['decay_rate'],
            eps=params['smooth_eps'])

    # Loss function, optionally with per-class weights to balance the classes
    if len(params['balance_loss']) == 0:
        criterion = nn.CrossEntropyLoss()
    else:
        criterion = nn.CrossEntropyLoss(torch.FloatTensor(params['balance_loss']).cuda())

    # Restore saved checkpoint
    if params['resume'] is not None:
        model.load_state_dict(saved_model['state_dict'])
        # optim.load_state_dict(saved_model['optimizer'])

    total_loss = 0.
    class_loss = 0.
    start_time = time.time()
    hidden = model.init_hidden(params['batch_size'])
    hidden_zeros = model.init_hidden(params['batch_size'])

    # Initialize the hidden-state cache
    if params['randomize_batches']:
        dp.set_hid_cache(range(len(dp.data['docs'])), hidden_zeros)

    # Compute the iteration parameters
    epochs = params['max_epochs']
    total_seqs = dp.get_num_sents(split='train')
    iter_per_epoch = total_seqs // params['batch_size']
    total_iters = iter_per_epoch * epochs
    best_loss = 0.
    best_val = 1000.
    eval_every = int(iter_per_epoch * params['eval_interval'])

    # val_score = eval_model(dp, model, params, char_to_ix, auth_to_ix, split='val', max_docs=params['num_eval'])
    val_score = 0.
    val_rank = 0

    eval_function = eval_model if params['mode'] == 'generative' else eval_classify
    leakage = params['leakage']

    for i in xrange(total_iters):
        # TODO
        if params['randomize_batches']:
            batch, reset_next = dp.get_rand_doc_batch(params['batch_size'], split='train')
            b_ids = [b['id'] for b in batch]
            hidden = dp.get_hid_cache(b_ids, hidden)
        elif params['use_sentences']:
            c_aid = None  # ix_to_auth[np.random.choice(auth_to_ix.values())]
            batch = dp.get_sentence_batch(params['batch_size'], split='train', aid=c_aid,
                                          atoms=params['atoms'],
                                          sample_by_len=params['sample_by_len'])
            hidden = hidden_zeros
        else:
            batch, reset_h = dp.get_doc_batch(split='train')
            if len(reset_h) > 0:
                # Reset the hidden states for which new docs have been sampled
                hidden[0].data.index_fill_(1, torch.LongTensor(reset_h).cuda(), 0.)
                hidden[1].data.index_fill_(1, torch.LongTensor(reset_h).cuda(), 0.)

        inps, targs, auths, lens = dp.prepare_data(batch, char_to_ix, auth_to_ix,
                                                   leakage=leakage)

        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to the start of the dataset.
        hidden = repackage_hidden(hidden)
        optim.zero_grad()

        # TODO
        if params['mode'] == 'generative':
            output, hidden = model.forward(inps, lens, hidden, auths)
            targets = pack_padded_sequence(Variable(targs).cuda(), lens)
            loss = criterion(pack_padded_sequence(output, lens)[0], targets[0])
        else:
            # For the classifier, auths is the target
            output, _ = model.forward_classify(targs, hidden, compute_softmax=False, lens=lens)
            targets = Variable(auths).cuda()
            lossClass = criterion(output, targets)
            if params['compression_layer']:
                # Add an L1 penalty on the compression layer weights
                loss = lossClass + (model.compression_W.weight.norm(p=1, dim=1)).mean()
            else:
                loss = lossClass

        loss.backward()
        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm(model.parameters(), params['grad_clip'])
        # Take an optimization step
        optim.step()

        total_loss += loss.data.cpu().numpy()[0]
        class_loss += lossClass.data.cpu().numpy()[0]

        # Save the hidden states in the cache for later use
        if params['randomize_batches']:
            if len(reset_next) > 0:
                hidden[0].data.index_fill_(1, torch.LongTensor(reset_next).cuda(), 0.)
                hidden[1].data.index_fill_(1, torch.LongTensor(reset_next).cuda(), 0.)
            dp.set_hid_cache(b_ids, hidden)

        if i % eval_every == 0 and i > 0:
            val_rank, val_score = eval_function(dp, model, params, char_to_ix, auth_to_ix,
                                                split='val', max_docs=params['num_eval'])

        if i % iter_per_epoch == 0 and i > 0 and leakage > params['leakage_min']:
            leakage = leakage * params['leakage_decay']

        # if (i % iter_per_epoch == 0) and ((i // iter_per_epoch) >= params['lr_decay_st']):
        if i % params['log_interval'] == 0 and i > 0:
            cur_loss = total_loss / params['log_interval']
            class_loss = class_loss / params['log_interval']
            elapsed = time.time() - start_time
            print('| epoch {:3.2f} | {:5d}/{:5d} batches | lr {:02.2e} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                      float(i) / iter_per_epoch, i, total_iters, params['learning_rate'],
                      elapsed * 1000 / params['log_interval'], cur_loss, math.exp(class_loss)))

            if val_rank >= best_loss:
                best_loss = val_rank
                save_checkpoint({
                    'iter': i,
                    'arch': params,
                    'val_mean_rank': val_rank,
                    'val_auc': val_score,
                    'char_to_ix': char_to_ix,
                    'ix_to_char': ix_to_char,
                    'auth_to_ix': auth_to_ix,
                    'state_dict': model.state_dict(),
                    'loss': cur_loss,
                    'optimizer': optim.state_dict(),
                }, fappend=params['fappend'],
                   outdir=params['checkpoint_output_directory'])
                best_val = val_rank
            start_time = time.time()
            total_loss = 0.
            class_loss = 0.
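
# A minimal sketch of how main(params) might be driven from the command line. The real
# scripts presumably build `params` with argparse and many more options; here we only
# assume a JSON config supplying the keys used above ('resume', 'atoms',
# 'vocab_threshold', 'batch_size', 'learning_rate', 'log_interval', ...). The file name
# and wiring are illustrative assumptions, not the repository's actual entry point.
if __name__ == '__main__':
    import argparse
    import json

    parser = argparse.ArgumentParser()
    parser.add_argument('config', help='path to a JSON file with the training parameters')
    cli_args = parser.parse_args()

    with open(cli_args.config) as f:
        params = json.load(f)
    main(params)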