Code example #1
def build_vocab(path, cutoff=0):
    # Requires a large amount of memory, but only needs to be built once
    if os.path.isdir(path):
        # to handle excessively large corpora, accumulate a character counter
        # file by file and pass the final list of characters to the vocab
        # builder at the end
        counter = Counter()
        filenames = sorted(os.listdir(path))
        for filename in filenames:
            with open(os.path.join(path, filename)) as f:
                for line in f:
                    counter.update(list(line))
        # remove infrequent characters from the vocab
        for k in list(counter.keys()):
            if counter[k] < cutoff:
                del counter[k]
        # a singleton list of all characters
        data = [sorted([x[0] for x in counter.most_common()])]
        vocab = CharVocab(data)  # cutoff already applied above
    else:
        with open(path) as f:
            lines = f.readlines()  # keep the trailing '\n' characters
        data = [list(line) for line in lines]
        vocab = CharVocab(data, cutoff=cutoff)
    return vocab
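
For reference, a minimal usage sketch of the builder above; the corpus path and vocab filename are placeholders, and the save/restore round trip simply mirrors what code example #6 below does with state_dict().

import torch

vocab = build_vocab('data/charlm/train.txt', cutoff=5)  # hypothetical corpus path
print('character vocab size:', len(vocab))
torch.save(vocab.state_dict(), 'char_vocab.pt')  # hypothetical filename
# restore later the same way example #6 does
vocab = CharVocab.load_state_dict(
    torch.load('char_vocab.pt', lambda storage, loc: storage))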
Code example #2
File: data.py Project: Evan-Feng/XDepParse
    def init_vocab(self, data):
        # vocabs are only built during training; for eval they must already exist
        assert self.eval == False
        charvocab = CharVocab(data, self.args['shorthand'])
        wordvocab = WordVocab(data,
                              self.args['shorthand'],
                              cutoff=self.cutoff,
                              lower=True)
        uposvocab = WordVocab(data, self.args['shorthand'], idx=1)
        xposvocab = xpos_vocab_factory(data, self.args['shorthand'])
        featsvocab = FeatureVocab(data, self.args['shorthand'], idx=3)
        lemmavocab = WordVocab(data,
                               self.args['shorthand'],
                               cutoff=self.cutoff,
                               idx=4,
                               lower=True)
        deprelvocab = WordVocab(data, self.args['shorthand'], idx=6)
        vocab = MultiVocab({
            'char': charvocab,
            'word': wordvocab,
            'upos': uposvocab,
            'xpos': xposvocab,
            'feats': featsvocab,
            'lemma': lemmavocab,
            'deprel': deprelvocab
        })
        return vocab
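
A brief note on the arguments above: judging from the variable names, each idx= value selects a different field of the same pre-processed sentences (UPOS, FEATS, LEMMA, and DEPREL here, with words and characters at the default index), and self.args['shorthand'] is the treebank shorthand passed through to every vocab.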
Code example #3
    @classmethod
    def load(cls, filename, finetune=False):
        state = torch.load(filename, lambda storage, loc: storage)  # map tensors to CPU
        vocab = {'char': CharVocab.load_state_dict(state['vocab'])}
        model = cls(state['args'], vocab, state['pad'], state['is_forward_lm'])
        model.load_state_dict(state['state_dict'])
        model.eval()
        model.finetune = finetune  # set finetune status
        return model
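
A hypothetical usage of the loader above, assuming it is the classmethod of the CharacterLanguageModel class trained and saved by code example #6; the checkpoint filename is a placeholder.

charlm = CharacterLanguageModel.load('en_forward_charlm.pt', finetune=True)
# the model is returned in eval() mode; the finetune flag only records whether
# downstream code is expected to keep updating the charLM's weights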
Code example #4
File: data.py Project: msinkec/classla-stanfordnlp
    def init_vocab(self, data):
        def from_model(model_filename):
            """ Try loading vocab from charLM model file. """
            state_dict = torch.load(model_filename,
                                    lambda storage, loc: storage)
            assert 'vocab' in state_dict, "Cannot find vocab in charLM model file."
            return state_dict['vocab']

        if self.eval:
            raise Exception("Vocab must exist for evaluation.")
        if self.args['charlm']:
            charvocab = CharVocab.load_state_dict(
                from_model(self.args['charlm_forward_file']))
        else:
            charvocab = CharVocab(data, self.args['shorthand'])
        wordvocab = self.pretrain.vocab
        tagvocab = TagVocab(data, self.args['shorthand'], idx=1)
        vocab = MultiVocab({
            'char': charvocab,
            'word': wordvocab,
            'tag': tagvocab
        })
        return vocab
Code example #5
File: data.py Project: Evan-Feng/XDepParse
    def init_vocab(self, data_list):
        # vocabs are only built during training; for eval they must already exist
        assert self.eval == False
        data_all = sum(data_list, [])  # flatten the per-file data into one list
        charvocab = CharVocab(data_all, self.args['shorthand'])

        # construct the word vocab from multiple files: keep only the first
        # `vocab_cutoff` units following the VOCAB_PREFIX entries of each
        # per-file vocab, then union them into one joint word set
        wordvocabs = [WordVocab(data, self.args['shorthand'], cutoff=0, lower=True)
                      for data in data_list]
        wordset = list(set(sum([v._id2unit[len(VOCAB_PREFIX):len(VOCAB_PREFIX) + self.args['vocab_cutoff']]
                                for v in wordvocabs], [])))
        wordvocab = wordvocabs[0]
        wordvocab._id2unit = VOCAB_PREFIX + wordset
        wordvocab._unit2id = {w: i for i, w in enumerate(wordvocab._id2unit)}
        print('Constructing a joint word vocabulary of size {} ...'.format(len(wordvocab)))

        uposvocab = WordVocab(data_all, self.args['shorthand'], idx=1)
        xposvocab = xpos_vocab_factory(data_all, self.args['shorthand'])
        featsvocab = FeatureVocab(data_all, self.args['shorthand'], idx=3)
        lemmavocab = WordVocab(data_all, self.args['shorthand'], cutoff=self.cutoff, idx=4, lower=True)
        vocab = MultiVocab({'char': charvocab,
                            'word': wordvocab,
                            'upos': uposvocab,
                            'xpos': xposvocab,
                            'feats': featsvocab,
                            'lemma': lemmavocab})
        return vocab
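
The slicing in the merge step above is easy to misread, so here is a toy, self-contained walk-through of the same pattern; all names and values below are made up for illustration.

from itertools import chain

PREFIX_DEMO = ['<PAD>', '<UNK>']   # stand-in for VOCAB_PREFIX
per_file_units = [['<PAD>', '<UNK>', 'the', 'a', 'cat'],
                  ['<PAD>', '<UNK>', 'the', 'dog', 'ran']]
k = 2                              # stand-in for args['vocab_cutoff']
joint = set(chain.from_iterable(
    units[len(PREFIX_DEMO):len(PREFIX_DEMO) + k] for units in per_file_units))
# joint == {'the', 'a', 'dog'}; example #5 then prepends VOCAB_PREFIX and
# rebuilds the unit<->id mappings over the merged word list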
Code example #6
def train(args):
    model_file = args['save_dir'] + '/' + args['save_name'] if args['save_name'] is not None \
        else '{}/{}_{}_charlm.pt'.format(args['save_dir'], args['shorthand'], args['direction'])
    vocab_file = args['save_dir'] + '/' + args['vocab_save_name'] if args['vocab_save_name'] is not None \
        else '{}/{}_vocab.pt'.format(args['save_dir'], args['shorthand'])

    if os.path.exists(vocab_file):
        logging.info('Loading existing vocab file')
        vocab = {
            'char':
            CharVocab.load_state_dict(
                torch.load(vocab_file, lambda storage, loc: storage))
        }
    else:
        logging.info('Building and saving vocab')
        vocab = {
            'char':
            build_vocab(args['train_file']
                        if args['train_dir'] is None else args['train_dir'],
                        cutoff=args['cutoff'])
        }
        torch.save(vocab['char'].state_dict(), vocab_file)
    print("Training model with vocab size: {}".format(len(vocab['char'])))

    model = CharacterLanguageModel(
        args,
        vocab,
        is_forward_lm=(args['direction'] == 'forward'))
    if args['cuda']: model = model.cuda()
    params = [param for param in model.parameters() if param.requires_grad]
    optimizer = torch.optim.SGD(params,
                                lr=args['lr0'],
                                momentum=args['momentum'],
                                weight_decay=args['weight_decay'])
    criterion = torch.nn.CrossEntropyLoss()
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        verbose=True,
        factor=args['anneal'],
        patience=args['patience'])

    best_loss = None
    for epoch in range(args['epochs']):
        # load training data from train_dir if given, otherwise from train_file
        if args['train_dir'] is not None:
            train_path = args['train_dir']
        else:
            train_path = args['train_file']
        train_data = load_data(train_path, vocab, args['direction'])
        dev_data = load_data(args['eval_file'], vocab, args['direction'])
        train_epoch(args, vocab, train_data, model, params, optimizer,
                    criterion, epoch + 1)

        start_time = time.time()
        loss = evaluate_epoch(args, vocab, dev_data, model, criterion)
        elapsed = int(time.time() - start_time)
        scheduler.step(loss)
        print(
            "| {:5d}/{:5d} epochs | time elapsed {:6d}s | loss {:5.2f} | ppl {:8.2f}"
            .format(
                epoch + 1,
                args['epochs'],
                elapsed,
                loss,
                math.exp(loss),
            ))
        if best_loss is None or loss < best_loss:
            best_loss = loss
            model.save(model_file)
            print('new best model saved.')
    return
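
A hypothetical way to drive train(); every value below is a placeholder, and only the keys read directly in the function body are shown, so the helpers it calls (load_data, train_epoch, evaluate_epoch) will likely need further keys of their own.

args = {
    'save_dir': 'saved_models/charlm', 'save_name': None, 'vocab_save_name': None,
    'shorthand': 'en_demo', 'direction': 'forward', 'cutoff': 5,
    'train_file': 'data/train.txt', 'train_dir': None, 'eval_file': 'data/dev.txt',
    'cuda': False, 'lr0': 20.0, 'momentum': 0.0, 'weight_decay': 0.0,
    'anneal': 0.25, 'patience': 1, 'epochs': 10,
}
train(args)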