def __init__(self, model, criterion, optimizer, print_every, cuda=True):
     """Keep references to the training components and create the logger.

     Args:
         model: stored on ``self.model``.
         criterion: stored on ``self.criterion``.
         optimizer: stored on ``self.optimizer``.
         print_every: stored on ``self.print_every``; presumably the
             progress-reporting interval -- confirm against the training loop.
         cuda: stored on ``self.cuda``; presumably toggles GPU use -- confirm.
     """
     # Collaborators handed in by the caller.
     self.model, self.criterion, self.optimizer = model, criterion, optimizer
     # Run-time settings.
     self.cuda, self.print_every = cuda, print_every
     # Epoch counter starts at 1.
     self.start_epoch = 1
     self.logger = make_logger('log.train')
Exemple #2
0
 def __init__(self, model, reward_type, ce_criterion, optimizer, print_every, cuda=True):
     """Keep the training components, build both criteria and the logger.

     Args:
         model: stored on ``self.model``.
         reward_type: forwarded to ``RLCriterion`` together with
             ``ce_criterion``.
         ce_criterion: wrapped by ``RLCriterion`` for training and used
             unchanged as the validation criterion.
         optimizer: stored on ``self.optimizer``.
         print_every: stored on ``self.print_every``; presumably the
             progress-reporting interval -- confirm against the training loop.
         cuda: stored on ``self.cuda``; presumably toggles GPU use -- confirm.
     """
     # Collaborators and run-time settings supplied by the caller.
     self.model, self.optimizer = model, optimizer
     self.cuda, self.print_every = cuda, print_every
     # Epoch counter starts at 1.
     self.start_epoch = 1

     # Validation uses the plain criterion; training wraps it in RLCriterion.
     self.valid_criterion = ce_criterion
     self.train_criterion = RLCriterion(reward_type, ce_criterion)

     self.logger = make_logger('log.train')
def main():
    """Build vocabularies and id-encoded train/valid sets, then save them.

    Options come from ``parse_args()``. Three files are written with
    ``torch.save``, each named ``opt.save_data`` plus a suffix:

    * ``.vocab.pt`` -- ``{'src': src_word2idx, 'tgt': tgt_word2idx}``
    * ``.train.pt`` -- list of ``(src_ids, tgt_ids)`` pairs, keeping only
      pairs within ``opt.src_seq_length`` / ``opt.tgt_seq_length``
    * ``.valid.pt`` -- list of ``(src_ids, tgt_ids)`` pairs, unfiltered

    Raises:
        AssertionError: if a source file and its target file yield a
            different number of converted lines.
    """
    logger = make_logger('log.preprocess')

    # parse arguments
    opt = parse_args()

    # Create the output directory. The directory part is empty when
    # save_data is a bare file name, in which case there is nothing to
    # create; makedirs also copes with nested paths and with the directory
    # appearing between a check and the creation.
    save_dir = os.path.dirname(opt.save_data)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    # build and save vocab
    print("Building vocab...")
    src_word2idx = build_vocab(opt.train_src, opt.src_vocab_size)
    # NOTE(review): the target vocab used to be built with src_vocab_size,
    # which looks like a copy-paste slip. Prefer a dedicated tgt_vocab_size
    # option when the parser defines one; otherwise keep the old behaviour.
    tgt_vocab_size = getattr(opt, 'tgt_vocab_size', None)
    if tgt_vocab_size is None:
        tgt_vocab_size = opt.src_vocab_size
    tgt_word2idx = build_vocab(opt.train_tgt, tgt_vocab_size)
    vocab = {'src': src_word2idx, 'tgt': tgt_word2idx}
    logger.info('Src vocab size: {}'.format(len(src_word2idx)))
    logger.info('Tgt vocab size: {}'.format(len(tgt_word2idx)))
    torch.save(vocab, opt.save_data+'.vocab.pt')

    # convert train text to ids
    print("Converting train text to ids...")
    train_src = convert_file_to_ids(opt.train_src, src_word2idx)
    train_tgt = convert_file_to_ids(opt.train_tgt, tgt_word2idx)
    print(len(train_src), len(train_tgt))
    # zip() would silently truncate to the shorter side, so check first.
    assert len(train_src) == len(train_tgt), \
        "train src/tgt line counts differ: {} vs {}".format(
            len(train_src), len(train_tgt))
    train = list(zip(train_src, train_tgt))
    logger.info("Train total lines: {}".format(len(train)))
    # Drop pairs where either side exceeds its configured maximum length.
    train = [
        (src, tgt) for src, tgt in train
        if len(src) <= opt.src_seq_length and len(tgt) <= opt.tgt_seq_length
    ]
    logger.info("Train after filtering (src: {}, tgt: {}): {}".format(opt.src_seq_length, opt.tgt_seq_length, len(train)))
    torch.save(train, opt.save_data+'.train.pt')

    # convert dev text to ids
    print("Converting valid text to ids...")
    valid_src = convert_file_to_ids(opt.valid_src, src_word2idx)
    valid_tgt = convert_file_to_ids(opt.valid_tgt, tgt_word2idx)
    assert len(valid_src) == len(valid_tgt), \
        "valid src/tgt line counts differ: {} vs {}".format(
            len(valid_src), len(valid_tgt))
    valid = list(zip(valid_src, valid_tgt))
    logger.info("Valid total lines: {}".format(len(valid)))
    torch.save(valid, opt.save_data+'.valid.pt')
Exemple #4
0
    def __init__(self,
                 model,
                 sample_type,
                 reward_type,
                 ce_criterion,
                 optimizer,
                 print_every,
                 cuda=True):
        """Set up the trainer state and select the batch sampling routine.

        Args:
            model: stored on ``self.model``.
            sample_type: ``'sample'`` selects ``sample_on_batch`` and
                ``'beam'`` selects ``beam_search_on_batch``; the chosen
                function is stored on ``self.sample_func``.
            reward_type: stored on ``self.reward_type`` and forwarded to
                ``RLCriterion`` together with ``ce_criterion``.
            ce_criterion: wrapped by ``RLCriterion`` for training and used
                unchanged as the validation criterion.
            optimizer: stored on ``self.optimizer``.
            print_every: stored on ``self.print_every``; presumably the
                progress-reporting interval -- confirm against the training
                loop.
            cuda: stored on ``self.cuda``; presumably toggles GPU use --
                confirm.

        Raises:
            ValueError: if ``sample_type`` is neither ``'sample'`` nor
                ``'beam'``.
        """
        # Resolve the sampler first so an invalid sample_type fails before
        # the criterion and the logger are created.
        if sample_type == 'sample':
            sample_func = sample_on_batch
        elif sample_type == 'beam':
            sample_func = beam_search_on_batch
        else:
            # ValueError is a subclass of Exception, so handlers written for
            # the previous generic Exception still catch it.
            raise ValueError(
                "Undefined sample type: {!r}.".format(sample_type))

        self.cuda = cuda
        self.model = model
        self.reward_type = reward_type
        self.train_criterion = RLCriterion(reward_type, ce_criterion)
        self.valid_criterion = ce_criterion
        self.optimizer = optimizer
        self.print_every = print_every
        # Epoch counter starts at 1.
        self.start_epoch = 1
        self.logger = make_logger('log.train')
        self.sample_func = sample_func