Example #1
def build_vocab(path, cutoff=0):
    # Requires a large amount of memory, but the vocab only needs to be built once
    if os.path.isdir(path):
        # To handle very large corpora, accumulate a character counter
        # file by file, then pass a single list of characters to the
        # vocab builder at the end.
        counter = Counter()
        filenames = sorted(os.listdir(path))
        for filename in filenames:
            with open(os.path.join(path, filename)) as f:
                for line in f:
                    counter.update(list(line))
        # remove infrequent characters from vocab
        for k in list(counter.keys()):
            if counter[k] < cutoff:
                del counter[k]
        # a singleton list of all characters
        data = [sorted([x[0] for x in counter.most_common()])]
        vocab = CharVocab(data)  # cutoff has already been applied above, so skip the argument
    else:
        with open(path) as f:
            lines = f.readlines()  # readlines() keeps the trailing '\n' on each line
        data = [list(line) for line in lines]
        vocab = CharVocab(data, cutoff=cutoff)
    return vocab
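As a usage note, here is a minimal sketch of how build_vocab might be driven; the paths and cutoff value are placeholders, and saving the result via state_dict()/torch.save follows the pattern used in Example #5.

# Hypothetical driver; the paths and the cutoff value are illustrative only.
import torch

# A single text file or a directory of text files both work, since
# build_vocab branches on os.path.isdir(path).
vocab = build_vocab('data/charlm/train', cutoff=10)

# Persist the vocab the same way Example #5 does, so it can later be
# restored with CharVocab.load_state_dict(torch.load(...)).
torch.save(vocab.state_dict(), 'saved_models/charlm_vocab.pt')
print('vocab size:', len(vocab))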
Example #2
 def init_vocab(self, data):
     assert not self.eval  # in eval mode the vocab must already exist, so it is never rebuilt
     charvocab = CharVocab(data, self.args['shorthand'])
     wordvocab = WordVocab(data,
                           self.args['shorthand'],
                           cutoff=7,
                           lower=True)
     uposvocab = WordVocab(data, self.args['shorthand'], idx=1)
     xposvocab = xpos_vocab_factory(data, self.args['shorthand'])
     featsvocab = FeatureVocab(data, self.args['shorthand'], idx=3)
     lemmavocab = WordVocab(data,
                            self.args['shorthand'],
                            cutoff=7,
                            idx=4,
                            lower=True)
     deprelvocab = WordVocab(data, self.args['shorthand'], idx=6)
     vocab = MultiVocab({
         'char': charvocab,
         'word': wordvocab,
         'upos': uposvocab,
         'xpos': xposvocab,
         'feats': featsvocab,
         'lemma': lemmavocab,
         'deprel': deprelvocab
     })
     return vocab
Example #3
 @classmethod
 def load(cls, filename, finetune=False):
     state = torch.load(filename, lambda storage, loc: storage)
     vocab = {'char': CharVocab.load_state_dict(state['vocab'])}
     model = cls(state['args'], vocab, state['pad'], state['is_forward_lm'])
     model.load_state_dict(state['state_dict'])
     model.eval()
     model.finetune = finetune  # set finetune status
     return model
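Assuming the class here is the CharacterLanguageModel used in Examples #5 and #6 (the snippet only shows cls, so this is an assumption), loading a saved checkpoint might look like the sketch below; the file path is a placeholder.

# Hypothetical usage of the load() classmethod; the .pt path is illustrative.
model = CharacterLanguageModel.load('saved_models/en_forward_charlm.pt', finetune=False)
# load() already calls model.eval(); pass finetune=True instead if a downstream
# trainer should be allowed to update the language model's parameters.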
Example #4
    def init_vocab(self, data):
        def from_model(model_filename):
            """ Try loading vocab from charLM model file. """
            state_dict = torch.load(model_filename, lambda storage, loc: storage)
            assert 'vocab' in state_dict, "Cannot find vocab in charLM model file."
            return state_dict['vocab']

        if self.eval:
            raise Exception("Vocab must exist for evaluation.")
        if self.args['charlm']:
            charvocab = CharVocab.load_state_dict(from_model(self.args['charlm_forward_file']))
        else: 
            charvocab = CharVocab(data, self.args['shorthand'])
        wordvocab = self.pretrain.vocab
        tagvocab = TagVocab(data, self.args['shorthand'], idx=1)
        vocab = MultiVocab({'char': charvocab,
                            'word': wordvocab,
                            'tag': tagvocab})
        return vocab
Example #5
def train(args):
    model_file = args['save_dir'] + '/' + args['save_name'] if args['save_name'] is not None \
        else '{}/{}_{}_charlm.pt'.format(args['save_dir'], args['shorthand'], args['direction'])
    vocab_file = args['save_dir'] + '/' + args['vocab_save_name'] if args['vocab_save_name'] is not None \
        else '{}/{}_vocab.pt'.format(args['save_dir'], args['shorthand'])

    if os.path.exists(vocab_file):
        logger.info('Loading existing vocab file')
        vocab = {
            'char':
            CharVocab.load_state_dict(
                torch.load(vocab_file, lambda storage, loc: storage))
        }
    else:
        logger.info('Building and saving vocab')
        vocab = {
            'char':
            build_vocab(args['train_file']
                        if args['train_dir'] is None else args['train_dir'],
                        cutoff=args['cutoff'])
        }
        torch.save(vocab['char'].state_dict(), vocab_file)
    logger.info("Training model with vocab size: {}".format(len(
        vocab['char'])))

    model = CharacterLanguageModel(
        args,
        vocab,
        is_forward_lm=args['direction'] == 'forward')
    if args['cuda']: model = model.cuda()
    params = [param for param in model.parameters() if param.requires_grad]
    optimizer = torch.optim.SGD(params,
                                lr=args['lr0'],
                                momentum=args['momentum'],
                                weight_decay=args['weight_decay'])
    criterion = torch.nn.CrossEntropyLoss()
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        verbose=True,
        factor=args['anneal'],
        patience=args['patience'])

    writer = None
    if args['summary']:
        from torch.utils.tensorboard import SummaryWriter
        summary_dir = '{}/{}_summary'.format(args['save_dir'], args['save_name']) if args['save_name'] is not None \
            else '{}/{}_{}_charlm_summary'.format(args['save_dir'], args['shorthand'], args['direction'])
        writer = SummaryWriter(log_dir=summary_dir)

    best_loss = None
    for epoch in range(args['epochs']):
        # load training data from train_dir if it is set, otherwise from train_file
        if args['train_dir'] is not None:
            train_path = args['train_dir']
        else:
            train_path = args['train_file']
        train_data = load_data(train_path, vocab, args['direction'])
        dev_data = load_data(args['eval_file'], vocab, args['direction'])
        train_epoch(args, vocab, train_data, model, params, optimizer,
                    criterion, epoch + 1)

        start_time = time.time()
        loss = evaluate_epoch(args, vocab, dev_data, model, criterion)
        ppl = math.exp(loss)
        elapsed = int(time.time() - start_time)
        scheduler.step(loss)
        logger.info(
            "| {:5d}/{:5d} epochs | time elapsed {:6d}s | loss {:5.2f} | ppl {:8.2f}"
            .format(
                epoch + 1,
                args['epochs'],
                elapsed,
                loss,
                ppl,
            ))
        if best_loss is None or loss < best_loss:
            best_loss = loss
            model.save(model_file)
            logger.info('new best model saved.')
        if writer:
            writer.add_scalar('dev_loss', loss, global_step=epoch + 1)
            writer.add_scalar('dev_ppl', ppl, global_step=epoch + 1)
    if writer:
        writer.close()
    return
Example #6
def train(args):
    model_file = args['save_dir'] + '/' + args['save_name'] if args['save_name'] is not None \
        else '{}/{}_{}_charlm.pt'.format(args['save_dir'], args['shorthand'], args['direction'])
    vocab_file = args['save_dir'] + '/' + args['vocab_save_name'] if args['vocab_save_name'] is not None \
        else '{}/{}_vocab.pt'.format(args['save_dir'], args['shorthand'])

    if os.path.exists(vocab_file):
        logger.info('Loading existing vocab file')
        vocab = {
            'char':
            CharVocab.load_state_dict(
                torch.load(vocab_file, lambda storage, loc: storage))
        }
    else:
        logger.info('Building and saving vocab')
        vocab = {
            'char':
            build_vocab(args['train_file']
                        if args['train_dir'] is None else args['train_dir'],
                        cutoff=args['cutoff'])
        }
        torch.save(vocab['char'].state_dict(), vocab_file)
    logger.info("Training model with vocab size: {}".format(len(
        vocab['char'])))

    model = CharacterLanguageModel(
        args,
        vocab,
        is_forward_lm=args['direction'] == 'forward')
    if args['cuda']: model = model.cuda()
    params = [param for param in model.parameters() if param.requires_grad]
    optimizer = torch.optim.SGD(params,
                                lr=args['lr0'],
                                momentum=args['momentum'],
                                weight_decay=args['weight_decay'])
    criterion = torch.nn.CrossEntropyLoss()
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        verbose=True,
        factor=args['anneal'],
        patience=args['patience'])

    writer = None
    if args['summary']:
        from torch.utils.tensorboard import SummaryWriter
        summary_dir = '{}/{}_summary'.format(args['save_dir'], args['save_name']) if args['save_name'] is not None \
            else '{}/{}_{}_charlm_summary'.format(args['save_dir'], args['shorthand'], args['direction'])
        writer = SummaryWriter(log_dir=summary_dir)

    # evaluate the model within each epoch if eval_steps is set
    eval_within_epoch = args['eval_steps'] > 0

    best_loss = None
    global_step = 0
    for epoch in range(1, args['epochs'] + 1):
        # load training data from train_dir if it is set, otherwise from train_file
        if args['train_dir'] is not None:
            train_path = args['train_dir']
        else:
            train_path = args['train_file']
        train_data = load_data(train_path, vocab, args['direction'])
        dev_data = load_file(args['eval_file'], vocab,
                             args['direction'])  # dev must be a single file

        # run over entire training set
        for data_chunk in train_data:
            batches = batchify(data_chunk, args['batch_size'])
            hidden = None
            total_loss = 0.0
            total_batches = math.ceil(
                (batches.size(1) - 1) / args['bptt_size'])
            iteration, i = 0, 0
            # over the data chunk
            while i < batches.size(1) - 1 - 1:
                model.train()
                global_step += 1
                start_time = time.time()
                bptt = args['bptt_size'] if np.random.random() < 0.95 \
                    else args['bptt_size'] / 2.
                # prevent excessively small or negative sequence lengths
                seq_len = max(5, int(np.random.normal(bptt, 5)))
                # prevent very large sequence length, must be <= 1.2 x bptt
                seq_len = min(seq_len, int(args['bptt_size'] * 1.2))
                data, target = get_batch(batches, i, seq_len)
                lens = [data.size(1) for _ in range(data.size(0))]  # all rows in the batch share the same length
                if args['cuda']:
                    data = data.cuda()
                    target = target.cuda()

                optimizer.zero_grad()
                output, hidden, decoded = model.forward(data, lens, hidden)
                loss = criterion(decoded.view(-1, len(vocab['char'])), target)
                total_loss += loss.data.item()
                loss.backward()

                torch.nn.utils.clip_grad_norm_(params, args['max_grad_norm'])
                optimizer.step()

                hidden = repackage_hidden(hidden)

                if (iteration + 1) % args['report_steps'] == 0:
                    cur_loss = total_loss / args['report_steps']
                    elapsed = time.time() - start_time  # start_time is reset every batch, so this already measures one batch
                    logger.info(
                        "| epoch {:5d} | {:5d}/{:5d} batches | sec/batch {:.6f} | loss {:5.2f} | ppl {:8.2f}"
                        .format(
                            epoch,
                            iteration + 1,
                            total_batches,
                            elapsed,
                            cur_loss,
                            math.exp(cur_loss),
                        ))
                    total_loss = 0.0

                iteration += 1
                i += seq_len

                # evaluate if necessary
                if eval_within_epoch and global_step % args['eval_steps'] == 0:
                    _, _, best_loss = evaluate_and_save(args, vocab, dev_data, model, criterion, scheduler, best_loss, \
                        global_step, model_file, writer)

        # if eval_steps isn't set, run evaluation after each epoch
        if not eval_within_epoch:
            _, _, best_loss = evaluate_and_save(args, vocab, dev_data, model, criterion, scheduler, best_loss, \
                                                epoch, model_file, writer)  # use epoch in place of global_step for logging

    if writer:
        writer.close()
    return
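Both train() variants read their configuration from a flat args dictionary (normally built by an argparse front end). The sketch below lists only the keys the two examples actually touch; the values are placeholders, not the project's real defaults.

# Illustrative configuration only; the values are placeholders.
args = {
    'save_dir': 'saved_models/charlm',
    'save_name': None,         # None -> '<shorthand>_<direction>_charlm.pt'
    'vocab_save_name': None,   # None -> '<shorthand>_vocab.pt'
    'shorthand': 'en_example',
    'direction': 'forward',    # or 'backward'
    'train_file': 'data/train.txt',
    'train_dir': None,         # a directory of files; takes precedence when set
    'eval_file': 'data/dev.txt',
    'cutoff': 0,               # drop characters rarer than this when building the vocab
    'cuda': False,
    'lr0': 1.0,
    'momentum': 0.0,
    'weight_decay': 0.0,
    'anneal': 0.25,
    'patience': 1,
    'epochs': 10,
    'summary': False,          # write TensorBoard summaries when True
    # additional keys used only by the Example #6 variant
    'eval_steps': 0,           # 0 disables within-epoch evaluation
    'batch_size': 32,
    'bptt_size': 250,
    'max_grad_norm': 0.25,
    'report_steps': 50,
}
train(args)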