Example #1
    def load(self, path, debug=False):
        """Load from disk

        Parameters
        ----------
        path : str
            path to the directory, which typically contains a config.pkl file and a model.bin file
        debug : bool
            whether to print the loaded config

        Returns
        -------
        DepParser
            parser itself
        """
        config = _Config.load(os.path.join(path, 'config.pkl'))
        if debug:
            print(config)
        with open(config.save_vocab_path, 'rb') as f:
            self._vocab = pickle.load(f)
        with mx.Context(mxnet_prefer_gpu()):
            self._parser = self.cls_parser(self._vocab,
                                           config.word_dims,
                                           config.tag_dims,
                                           config.dropout_emb,
                                           config.lstm_layers,
                                           config.lstm_hiddens,
                                           config.dropout_lstm_input,
                                           config.dropout_lstm_hidden,
                                           config.mlp_arc_size,
                                           config.mlp_rel_size,
                                           config.dropout_mlp,
                                           bert=config.bert_dim,
                                           debug=True)
            self._parser.load(config.save_model_path)
            self._parser.rnn.pret_word_embs.initialize(ctx=mxnet_prefer_gpu())
        return self
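
A hypothetical round trip with the load() method above. DepParser is the class named in the docstring; the model directory is a placeholder and is expected to contain the config.pkl and model.bin files the docstring mentions.

# hypothetical usage; the model directory is a placeholder
parser = DepParser()
parser.load('data/model/ptb-dep', debug=True)  # debug=True prints the loaded config
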
Example #2
    def __init__(self, model, detach: bool = True, context: mx.Context = None):
        """Contextual string embeddings of words, as proposed in Akbik et al., 2018.

        Parameters
        ----------
        model : str
            model string, one of 'news-forward', 'news-backward', 'mix-forward', 'mix-backward', 'german-forward',
            'german-backward', depending on which character language model is desired
        detach : bool
            if set to False, the gradient will propagate into the language model. This dramatically slows down
            training and often leads to worse results, so it is not recommended.
        context : mx.Context
            device to run the language model on; defaults to the preferred GPU
        """
        super().__init__()
        self.static_embeddings = detach
        self.context = context if context else mxnet_prefer_gpu()
        self.lm = ContextualStringModel.load_language_model(model, context=self.context)
        self.detach = detach
        if detach:
            self.lm.freeze()
            self.static_embeddings = True

        self.is_forward_lm = self.lm.is_forward_lm

        with self.context:
            dummy_sentence = Sentence()
            dummy_sentence.add_token(Token('hello'))
            embedded_dummy = self.embed(dummy_sentence)
            self.__embedding_length = len(embedded_dummy[0].get_token(1).get_embedding())
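
A minimal usage sketch of the constructor above, mirroring the dummy-sentence probe it performs internally. It assumes the constructor belongs to the CharLMEmbeddings class used elsewhere on this page; the model path is a placeholder.

# hypothetical usage; Sentence, Token and mxnet_prefer_gpu come from the same package
with mx.Context(mxnet_prefer_gpu()):
    lm_embeddings = CharLMEmbeddings('data/model/lm-news-forward')  # placeholder path
    sentence = Sentence()
    for w in 'I love Paris .'.split():
        sentence.add_token(Token(w))
    embedded = lm_embeddings.embed(sentence)
    print(len(embedded[0].get_token(1).get_embedding()))  # per-token embedding length
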
Example #3
    def load(self, path, debug=False):
        """Load from disk

        Parameters
        ----------
        path : str
            path to the directory, which typically contains a config.pkl file and a model.bin file
        debug : bool
            whether to print the loaded config

        Returns
        -------
        SDPParser
            parser itself
        """
        config: _Config = _Config.load(os.path.join(path, 'config.pkl'))
        if debug:
            print(config)
        self._vocab = vocab = ParserVocabulary.load(config.save_vocab_path)
        with mx.Context(mxnet_prefer_gpu()):
            self._parser = BiaffineDepParser(vocab,
                                             config.word_dims,
                                             config.tag_dims,
                                             config.dropout_emb,
                                             config.lstm_layers,
                                             config.lstm_hiddens,
                                             config.dropout_lstm_input,
                                             config.dropout_lstm_hidden,
                                             config.mlp_arc_size,
                                             config.mlp_rel_size,
                                             config.dropout_mlp,
                                             bert=config.bert_dim,
                                             debug=True)
            self._parser.initialize()
            self._parser.load(config.save_model_path)
        return self
    def fill(self, path):
        super().fill(path)
        for i, second_decoder in enumerate(self.arc_biaffines):
            sd_path = os.path.join(path, 'second_decoder{}.bin'.format(i))
            if os.path.isfile(sd_path):
                second_decoder.load_parameters(sd_path, ctx=mxnet_prefer_gpu())
                freeze(second_decoder)
    def fill(self, path):
        rnn_path = os.path.join(path, 'rnn.bin')
        if os.path.isfile(rnn_path):
            # print('load rnn')
            self.rnn.load_parameters(rnn_path, ctx=mxnet_prefer_gpu())
            freeze(self.rnn)

        for i, (mlp, decoder) in enumerate(zip(self.mlps, self.decoders)):
            mlp_path = os.path.join(path, 'mlp{}.bin'.format(i))
            if os.path.isfile(mlp_path):
                # print('load mlp')
                mlp.load_parameters(mlp_path, ctx=mxnet_prefer_gpu())
                freeze(mlp)

            decoder_path = os.path.join(path, 'decoder{}.bin'.format(i))
            if os.path.isfile(decoder_path):
                # print('load decoder')
                decoder.load_parameters(decoder_path, ctx=mxnet_prefer_gpu())
                freeze(decoder)
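
Both fill() methods above call a freeze() helper that is not shown on this page. A common Gluon idiom for freezing a block (an assumption about what this codebase's freeze() does) is to switch off gradient computation for every parameter:

import mxnet as mx
from mxnet import gluon

def freeze(block: gluon.Block):
    # stop computing and updating gradients for all parameters of the block
    for param in block.collect_params().values():
        param.grad_req = 'null'
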
    def load(self, load_path, ctx=None):
        """Load model

        Parameters
        ----------
        load_path : str
            path to model file
        ctx : mx.Context
            device to load the parameters onto; defaults to the preferred GPU
        """
        if not ctx:
            ctx = mxnet_prefer_gpu()
        self.load_parameters(load_path, allow_missing=True, ctx=ctx)
Example #7
    def evaluate(self,
                 test_files: List[str],
                 save_dir=None,
                 logger=None,
                 num_buckets_test=10,
                 test_batch_size=5000,
                 bert_path=None,
                 debug=False):
        """Run evaluation on test set

        Parameters
        ----------
        test_files : str
            path to test set
        save_dir : str
            where to store intermediate results and log
        logger : logging.logger
            logger for printing results
        num_buckets_test : int
            number of clusters for sentences from test set
        test_batch_size : int
            batch size of test set

        Returns
        -------
        tuple
            UAS, LAS
        """
        parser = self._parser
        with mx.Context(mxnet_prefer_gpu()):
            UF, LF, speed = evaluate_joint_official_script(parser,
                                                           self._vocab,
                                                           num_buckets_test,
                                                           test_batch_size,
                                                           test_files,
                                                           save_dir,
                                                           bert=bert_path,
                                                           debug=debug)
            score_str = 'Test\n'
            for dataset, uf, lf in zip(test_files, UF, LF):
                dataset = os.path.basename(dataset)
                uf = uf * 100
                lf = lf * 100
                score_str += '{} UF={:0.1f} LF={:0.1f}\n'.format(
                    dataset, uf, lf)
            LF = sum(LF) / len(LF) * 100
            if logger is None:
                logger = init_logger(save_dir, 'test.log')
            logger.info(score_str + '%d sents/s' % (speed))

        return LF
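
A hypothetical call to the evaluate() method above, assuming parser is an already-trained instance of the class that defines it; the file paths are placeholders.

# hypothetical usage; paths are placeholders
lf = parser.evaluate(test_files=['data/sdp/dm.test.conllu', 'data/sdp/pas.test.conllu'],
                     save_dir='data/model/joint')
print('macro-averaged LF = %.1f' % lf)
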
Example #8
    def load_from_file(cls,
                       model_folder,
                       context: mx.Context = None,
                       **kwargs):
        if context is None:
            context = mxnet_prefer_gpu()
        config_path = os.path.join(model_folder, 'config.pkl')
        with open(config_path, 'rb') as f:
            config = pickle.load(f)
            with context:
                embedding_types = [
                    WordEmbeddings(
                        '{}data/embedding/fasttext100.vec.txt'.format(
                            kwargs.get('word_embedding_path', ''))),

                    # comment in this line to use character embeddings
                    # CharacterEmbeddings(),

                    # comment in these lines to use contextual string embeddings
                    CharLMEmbeddings('{}data/model/lm-news-forward'.format(
                        kwargs.get('word_embedding_path', '')),
                                     context=context),
                    CharLMEmbeddings('{}data/model/lm-news-backward'.format(
                        kwargs.get('word_embedding_path', '')),
                                     context=context),
                ]

                embeddings = StackedEmbeddings(embeddings=embedding_types)
                model = SequenceTagger(hidden_size=config['hidden_size'],
                                       embeddings=embeddings,
                                       tag_dictionary=config['tag_dictionary'],
                                       tag_type=config['tag_type'],
                                       use_crf=config['use_crf'],
                                       use_rnn=config['use_rnn'],
                                       rnn_layers=config['rnn_layers'])
                model.load_parameters(os.path.join(model_folder, 'model.bin'),
                                      ctx=context)
            return model
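
A hypothetical call to load_from_file() above, assuming it is exposed as a classmethod of SequenceTagger (only the method body is shown here); the model folder and resource prefix are placeholders.

# hypothetical usage; model_folder must contain config.pkl and model.bin
with mx.Context(mxnet_prefer_gpu()):
    tagger = SequenceTagger.load_from_file('data/model/ner',
                                           word_embedding_path='/path/to/resources/')
    # word_embedding_path is prepended to the bundled data/embedding and data/model paths
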
Example #9
    def evaluate(self,
                 test_file,
                 save_dir=None,
                 logger=None,
                 num_buckets_test=10,
                 test_batch_size=5000,
                 bert_path=None):
        parser = self._parser
        vocab = self._vocab
        with mx.Context(mxnet_prefer_gpu()):
            UF, LF, speed = dep_evaluate_official_script(
                parser,
                vocab,
                num_buckets_test,
                test_batch_size,
                test_file,
                os.path.join(save_dir, 'test.predict.conll'),
                bert=bert_path)
        if logger is None:
            logger = init_logger(save_dir, 'test.log')
        logger.info('Test: UF=%.2f%% LF=%.2f%% %d sents/s' % (UF, LF, speed))

        return LF
Example #10
    def parse(self, sentence):
        """Parse raw sentence into ConllSentence

        Parameters
        ----------
        sentence : list
            a list of (word, tag) tuples

        Returns
        -------
        ConllSentence
            ConllSentence object
        """
        words = np.zeros((len(sentence) + 1, 1), np.int32)
        tags = np.zeros((len(sentence) + 1, 1), np.int32)
        words[0, 0] = ParserVocabulary.ROOT
        tags[0, 0] = ParserVocabulary.ROOT
        vocab = self._vocab

        for i, (word, tag) in enumerate(sentence):
            words[i + 1, 0] = vocab.word2id(word.lower())
            tags[i + 1, 0] = vocab.tag2id(tag)

        with mx.Context(mxnet_prefer_gpu()):
            outputs = self._parser.forward(words, tags)
        words = []
        for arc, rel, (word, tag) in zip(outputs[0][0], outputs[0][1],
                                         sentence):
            words.append(
                ConllWord(id=len(words) + 1,
                          form=word,
                          pos=tag,
                          head=arc,
                          relation=vocab.id2rel(rel)))
        return ConllSentence(words)
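
A hypothetical call to parse() above, assuming parser is an already-loaded instance; the output is a ConllSentence with one ConllWord per input token.

# hypothetical usage
sentence = [('Is', 'VBZ'), ('this', 'DT'), ('the', 'DT'), ('future', 'NN'), ('?', '.')]
print(parser.parse(sentence))  # CoNLL-formatted parse with predicted heads and relations
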
Example #11
    def train(self,
              train_file: List[str],
              dev_file: List[str],
              save_dir,
              pretrained_embeddings_file=None,
              min_occur_count=2,
              lstm_layers=3,
              word_dims=100,
              tag_dims=100,
              dropout_emb=0.33,
              lstm_hiddens=400,
              dropout_lstm_input=0.33,
              dropout_lstm_hidden=0.33,
              mlp_arc_size=500,
              mlp_rel_size=100,
              dropout_mlp=0.33,
              learning_rate=1e-3,
              decay=.75,
              decay_steps=5000,
              beta_1=.9,
              beta_2=.9,
              epsilon=1e-12,
              num_buckets_train=40,
              num_buckets_valid=10,
              train_iters=50000,
              train_batch_size=5000,
              dev_batch_size=5000,
              validate_every=100,
              save_after=5000,
              root='root',
              transfer=None,
              bert_path=None,
              debug=False):
        """Train a deep biaffine dependency parser

        Parameters
        ----------
        train_file : list of str
            paths to training sets, one per task
        dev_file : list of str
            paths to dev sets, one per task
        save_dir : str
            a directory for saving model and related meta-data
        pretrained_embeddings_file : str
            pre-trained embeddings file, plain text format
        min_occur_count : int
            threshold of rare words, which will be replaced with UNKs
        lstm_layers : int
            layers of lstm
        word_dims : int
            dimension of word embedding
        tag_dims : int
            dimension of tag embedding
        dropout_emb : float
            word dropout
        lstm_hiddens : int
            size of lstm hidden states
        dropout_lstm_input : float
            dropout on x in variational RNN
        dropout_lstm_hidden : float
            dropout on h in variational RNN
        mlp_arc_size : int
            output size of MLP for arc feature extraction
        mlp_rel_size : int
            output size of MLP for rel feature extraction
        dropout_mlp : float
            dropout on the output of LSTM
        learning_rate : float
            learning rate
        decay : float
            see ExponentialScheduler
        decay_steps : int
            see ExponentialScheduler
        beta_1 : float
            first moment decay rate of the Adam optimizer
        beta_2 : float
            second moment decay rate of the Adam optimizer
        epsilon : float
            epsilon of the Adam optimizer
        num_buckets_train : int
            number of buckets for training data set
        num_buckets_valid : int
            number of buckets for dev data set
        train_iters : int
            training iterations
        train_batch_size : int
            training batch size
        dev_batch_size : int
            batch size for the dev set
        validate_every : int
            validate on dev set every such number of batches
        save_after : int
            skip saving model in early epochs
        root : str
            token for ROOT
        transfer : str
            path to a previously trained model whose parameters and vocabulary are reused for transfer learning
        bert_path : list of str
            paths to pre-computed BERT features for the training and dev sets
        debug : bool
            debug mode

        Returns
        -------
        DepParser
            parser itself
        """
        logger = init_logger(save_dir)
        config = _Config(train_file, dev_file, None, save_dir,
                         pretrained_embeddings_file, min_occur_count,
                         lstm_layers, word_dims, tag_dims, dropout_emb,
                         lstm_hiddens, dropout_lstm_input, dropout_lstm_hidden,
                         mlp_arc_size, mlp_rel_size, dropout_mlp,
                         learning_rate, decay, decay_steps, beta_1, beta_2,
                         epsilon, num_buckets_train, num_buckets_valid, None,
                         train_iters, train_batch_size, 0, debug)
        if transfer:
            with open(os.path.join(transfer, 'vocab.pkl'), 'rb') as f:
                self._vocab = pickle.load(f)
            self._vocab.append(
                ParserVocabulary(
                    train_file[-1],
                    pretrained_embeddings_file,
                    min_occur_count,
                    root=root,
                    shared_vocab=self._vocab[0],
                ))
        else:
            for t, d in zip(train_file, dev_file):
                self._vocab.append(
                    ParserVocabulary(
                        t,
                        pretrained_embeddings_file,
                        min_occur_count,
                        root=root,
                        shared_vocab=None
                        if len(self._vocab) == 0 else self._vocab[0],
                    ))
        with open(config.save_vocab_path, 'wb') as f:
            pickle.dump(self._vocab, f)
        for voc in self._vocab:
            voc.log_info(logger)

        with mx.Context(mxnet_prefer_gpu()):
            data_loaders = [
                DataLoader(t,
                           num_buckets_train,
                           vocab,
                           bert=bert_path[0] if bert_path else None)
                for t, vocab in zip(train_file, self._vocab)
            ]
            config.bert_dim = data_loaders[0].bert_dim
            config.save()
            self._parser = parser = self.cls_parser(
                self._vocab,
                word_dims,
                tag_dims,
                dropout_emb,
                lstm_layers,
                lstm_hiddens,
                dropout_lstm_input,
                dropout_lstm_hidden,
                mlp_arc_size,
                mlp_rel_size,
                dropout_mlp,
                bert=data_loaders[0].bert_dim,
                debug=debug)
            if transfer:
                parser.transfer = True
                parser.fill(transfer)
            parser.initialize()
            scheduler = ExponentialScheduler(learning_rate, decay, decay_steps)
            optimizer = mx.optimizer.Adam(learning_rate,
                                          beta_1,
                                          beta_2,
                                          epsilon,
                                          lr_scheduler=scheduler)
            trainer = gluon.Trainer(parser.collect_params(),
                                    optimizer=optimizer)
            global_step = 0
            best_LF = 0.
            batch_id = 0
            epoch = 1
            total_epoch = math.ceil(train_iters / validate_every)
            logger.info("Epoch {} out of {}".format(epoch, total_epoch))
            bar = Progbar(target=min(validate_every, train_iters))
            gs = [
                dl.get_batches(batch_size=train_batch_size, shuffle=False)
                for dl in data_loaders
            ]
            while global_step < train_iters:
                arcs_tasks = []
                rels_tasks = []
                bert_tasks = []
                for g in gs:
                    words, bert, tags, arcs, rels = next(
                        g, (None, None, None, None, None))
                    if words is None:
                        break
                    arcs_tasks.append(arcs)
                    rels_tasks.append(rels)
                    bert_tasks.append(bert)

                if words is None:
                    gs = [
                        dl.get_batches(batch_size=train_batch_size,
                                       shuffle=False) for dl in data_loaders
                    ]
                    continue

                with autograd.record():
                    arc_accuracy, rel_accuracy, loss = parser.forward(
                        words, bert, tags, arcs_tasks, rels_tasks)
                    loss_value = loss.asscalar()
                loss.backward()
                trainer.step(train_batch_size)
                batch_id += 1
                try:
                    bar.update(batch_id,
                               exact=[("LR", rel_accuracy, 2),
                                      ("loss", loss_value)])
                except OverflowError:
                    pass  # sometimes loss can be 0 or infinity, crashes the bar

                global_step += 1
                if global_step % validate_every == 0:
                    batch_id = 0
                    UF, LF, speed = evaluate_joint_official_script(
                        parser,
                        self._vocab,
                        num_buckets_valid,
                        dev_batch_size,
                        dev_file,
                        os.path.join(save_dir, 'dev.predict.conllu'),
                        bert=None if bert_path is None else bert_path[1])
                    score_str = ''
                    for dataset, lf in zip(dev_file, LF):
                        dataset = os.path.basename(dataset).replace(
                            '.conllu', '')
                        lf = lf * 100
                        score_str += '{}={:0.1f} '.format(dataset, lf)
                    if transfer:
                        LF = LF[-1] * 100
                    else:
                        LF = sum(LF) / len(LF) * 100
                    score_str += '{}={:0.1f} '.format('avg', LF)
                    logger.info(score_str + '%d sents/s' % (speed))
                    epoch += 1
                    bar = Progbar(target=min(validate_every, train_iters -
                                             global_step))
                    if global_step > save_after and LF > best_LF:
                        logger.info('- new best score!')
                        best_LF = LF
                        parser.save(config.save_model_path)
                    if global_step < train_iters:
                        logger.info("Epoch {} out of {}".format(
                            epoch, total_epoch))

        # When validate_every is too big
        if not os.path.isfile(config.save_model_path) or best_LF == 0:
            parser.save(config.save_model_path)

        return self
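
A hypothetical invocation of the multi-task train() method above. JointParser stands in for the unnamed class that defines it, and all paths are placeholders.

# hypothetical usage; class name and paths are assumptions
parser = JointParser()
parser.train(train_file=['data/sdp/dm.train.conllu', 'data/sdp/pas.train.conllu'],
             dev_file=['data/sdp/dm.dev.conllu', 'data/sdp/pas.dev.conllu'],
             save_dir='data/model/joint',
             pretrained_embeddings_file='data/embedding/glove.6B.100d.txt')
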
Example #12
    def train(self,
              train_file,
              dev_file,
              save_dir,
              pretrained_embeddings_file=None,
              min_occur_count=2,
              lstm_layers=3,
              word_dims=100,
              tag_dims=100,
              dropout_emb=0.33,
              lstm_hiddens=400,
              dropout_lstm_input=0.33,
              dropout_lstm_hidden=0.33,
              mlp_arc_size=500,
              mlp_rel_size=100,
              dropout_mlp=0.33,
              learning_rate=1e-3,
              decay=.75,
              decay_steps=5000,
              beta_1=.9,
              beta_2=.9,
              epsilon=1e-12,
              num_buckets_train=40,
              num_buckets_valid=10,
              train_iters=50000,
              train_batch_size=5000,
              dev_batch_size=5000,
              validate_every=100,
              save_after=5000,
              root='root',
              bert_path=None,
              interpolation=0.5,
              debug=False):
        if pretrained_embeddings_file is None:
            word_dims = 0
        logger = init_logger(save_dir)
        config = _Config(train_file, dev_file, None, save_dir,
                         pretrained_embeddings_file, min_occur_count,
                         lstm_layers, word_dims, tag_dims, dropout_emb,
                         lstm_hiddens, dropout_lstm_input, dropout_lstm_hidden,
                         mlp_arc_size, mlp_rel_size, dropout_mlp,
                         learning_rate, decay, decay_steps, beta_1, beta_2,
                         epsilon, num_buckets_train, num_buckets_valid, None,
                         train_iters, train_batch_size, debug)

        self._vocab = vocab = ParserVocabulary(train_file,
                                               pretrained_embeddings_file,
                                               min_occur_count,
                                               root=root)
        vocab.save(config.save_vocab_path)
        vocab.log_info(logger)

        with mx.Context(mxnet_prefer_gpu()):
            data_loader = DepDataLoader(
                train_file,
                num_buckets_train,
                vocab,
                bert=bert_path[0] if bert_path else None)
            config.bert_dim = data_loader.bert_dim
            config.save()
            self._parser = parser = BiaffineDepParser(
                vocab,
                word_dims,
                tag_dims,
                dropout_emb,
                lstm_layers,
                lstm_hiddens,
                dropout_lstm_input,
                dropout_lstm_hidden,
                mlp_arc_size,
                mlp_rel_size,
                dropout_mlp,
                bert=data_loader.bert_dim,
                interpolation=interpolation,
                debug=debug)
            parser.initialize()
            scheduler = ExponentialScheduler(learning_rate, decay, decay_steps)
            optimizer = mx.optimizer.Adam(learning_rate,
                                          beta_1,
                                          beta_2,
                                          epsilon,
                                          lr_scheduler=scheduler)
            trainer = gluon.Trainer(parser.collect_params(),
                                    optimizer=optimizer)
            global_step = 0
            best_LF = 0.
            batch_id = 0
            epoch = 1
            total_epoch = math.ceil(train_iters / validate_every)
            logger.info("Epoch {} out of {}".format(epoch, total_epoch))
            bar = Progbar(target=min(validate_every, train_iters))
            while global_step < train_iters:
                for words, bert, tags, arcs, rels in data_loader.get_batches(
                        batch_size=train_batch_size, shuffle=False):
                    with autograd.record():
                        arc_accuracy, rel_accuracy, loss = parser.forward(
                            words, bert, tags, arcs, rels)
                        loss_value = loss.asscalar()
                    loss.backward()
                    trainer.step(train_batch_size)
                    batch_id += 1
                    try:
                        bar.update(batch_id,
                                   exact=[("LR", rel_accuracy, 2),
                                          ("loss", loss_value)])
                    except OverflowError:
                        pass  # sometimes loss can be 0 or infinity, crashes the bar

                    global_step += 1
                    if global_step % validate_every == 0:
                        batch_id = 0
                        UF, LF, speed = dep_evaluate_official_script(
                            parser,
                            vocab,
                            num_buckets_valid,
                            dev_batch_size,
                            dev_file,
                            os.path.join(save_dir, 'dev.predict.conllu'),
                            bert=None if bert_path is None else bert_path[1])
                        logger.info('Dev: LF=%.1f%% %d sents/s' % (LF, speed))
                        epoch += 1
                        bar = Progbar(target=min(validate_every, train_iters -
                                                 global_step))
                        if global_step > save_after and LF > best_LF:
                            logger.info('- new best score!')
                            best_LF = LF
                            parser.save(config.save_model_path)
                        if global_step < train_iters:
                            logger.info("Epoch {} out of {}".format(
                                epoch, total_epoch))

        # When validate_every is too big
        if not os.path.isfile(config.save_model_path):
            parser.save(config.save_model_path)

        return self
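
The optimizer wiring above pairs Adam with a project-specific ExponentialScheduler. Below is a self-contained sketch of the same pattern that substitutes MXNet's built-in FactorScheduler, since ExponentialScheduler itself is not shown on this page.

import mxnet as mx
from mxnet import gluon

net = gluon.nn.Dense(1)
net.initialize()
# multiply the learning rate by `factor` every `step` updates, mirroring decay/decay_steps above
scheduler = mx.lr_scheduler.FactorScheduler(step=5000, factor=0.75)
optimizer = mx.optimizer.Adam(learning_rate=1e-3, beta1=0.9, beta2=0.9,
                              epsilon=1e-12, lr_scheduler=scheduler)
trainer = gluon.Trainer(net.collect_params(), optimizer=optimizer)
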
Example #13
    def evaluate(self,
                 test_file,
                 save_dir=None,
                 logger=None,
                 num_buckets_test=10,
                 test_batch_size=5000,
                 bert_path=None,
                 chinese=False,
                 debug=False):
        """Run evaluation on test set

        Parameters
        ----------
        test_file : str
            path to test set
        save_dir : str
            where to store intermediate results and log
        logger : logging.logger
            logger for printing results
        num_buckets_test : int
            number of clusters for sentences from test set
        test_batch_size : int
            batch size of test set

        Returns
        -------
        tuple
            UAS, LAS
        """
        parser = self._parser
        vocab = self._vocab
        if logger is None:
            logger = init_logger(save_dir, 'test.log')
        with mx.Context(mxnet_prefer_gpu()):
            if chinese:
                result, speed = evaluate_chinese_sdp(
                    parser,
                    vocab,
                    num_buckets_test,
                    test_batch_size,
                    test_file,
                    os.path.join(save_dir, 'test.predict.conllu'),
                    bert=bert_path,
                    debug=debug)
                logger.info(test_file)
                for k, v in result.items():
                    logger.info('%s=%.2f%%' % (k, v))
                return result
            else:
                UF, LF, speed = evaluate_official_script(
                    parser,
                    vocab,
                    num_buckets_test,
                    test_batch_size,
                    test_file,
                    os.path.join(save_dir, os.path.basename(test_file)),
                    bert=bert_path,
                    debug=debug)
                UF = UF * 100
                LF = LF * 100
                logger.info('Test: UF=%.2f%% LF=%.2f%% %d sents/s' %
                            (UF, LF, speed))

                return LF
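
A hypothetical call to the evaluate() method above, assuming parser is an already-trained SDP parser; paths are placeholders. Note the two return shapes.

# hypothetical usage
lf = parser.evaluate(test_file='data/sdp/dm.test.conllu',
                     save_dir='data/model/dm')          # float: LF in percent
metrics = parser.evaluate(test_file='data/sdp/text.test.conllu',
                          save_dir='data/model/text',
                          chinese=True)                 # dict of metric name -> value
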
Example #14
    def train(self,
              base_path: str,
              sequence_length: int,
              learning_rate: float = 20,
              mini_batch_size: int = 100,
              anneal_factor: float = 0.25,
              patience: int = 10,
              clip=0.25,
              max_epochs: int = 10000):

        number_of_splits = len(self.corpus.train_files)
        val_data = self._batchify(self.corpus.valid, mini_batch_size)

        os.makedirs(base_path, exist_ok=True)
        loss_txt = os.path.join(base_path, 'loss.txt')
        savefile = os.path.join(base_path, 'best-lm.pt')

        try:
            with mx.Context(mxnet_prefer_gpu()):
                self.model.initialize()
                best_val_loss = 100000000
                scheduler = ReduceLROnPlateau(lr=learning_rate,
                                              verbose=True,
                                              factor=anneal_factor,
                                              patience=patience)
                optimizer = mx.optimizer.SGD(learning_rate=learning_rate,
                                             lr_scheduler=scheduler)
                trainer = gluon.Trainer(self.model.collect_params(),
                                        optimizer=optimizer)

                for epoch in range(1, max_epochs + 1):

                    print('Split %d' % epoch +
                          '\t - ({:%H:%M:%S})'.format(datetime.datetime.now()))

                    # for group in optimizer.param_groups:
                    #     learning_rate = group['lr']

                    train_slice = self.corpus.get_next_train_slice()

                    train_data = self._batchify(train_slice, mini_batch_size)
                    print('\t({:%H:%M:%S})'.format(datetime.datetime.now()))

                    # go into train mode
                    # self.model.train()

                    # reset variables
                    epoch_start_time = time.time()
                    total_loss = 0
                    start_time = time.time()

                    hidden = self.model.init_hidden(mini_batch_size)
                    cell = hidden.copy()

                    # vocabulary size, used to reshape the LM output for the loss
                    ntokens = len(self.corpus.dictionary)

                    # do batches
                    for batch, i in enumerate(
                            range(0,
                                  len(train_data) - 1, sequence_length)):

                        data, targets = self._get_batch(
                            train_data, i, sequence_length)

                        # Starting each batch, we detach the hidden state from how it was previously produced.
                        # If we didn't, the model would try backpropagating all the way to start of the dataset.
                        hidden = self._repackage_hidden(hidden)
                        cell = self._repackage_hidden(cell)

                        # self.model.zero_grad()
                        # optimizer.zero_grad()

                        # do the forward pass in the model
                        with autograd.record():
                            output, rnn_output, hidden, cell = self.model.forward(
                                data, hidden, cell)
                            # try to predict the targets
                            loss = self.loss_function(
                                output.reshape(-1, ntokens), targets).mean()
                            loss.backward()

                        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
                        # torch.nn.utils.clip_grad_norm_(self.model.parameters(), clip)

                        trainer.step(mini_batch_size)

                        total_loss += loss.asscalar()

                        if batch % self.log_interval == 0 and batch > 0:
                            cur_loss = total_loss.item() / self.log_interval
                            elapsed = time.time() - start_time
                            print(
                                '| split {:3d} /{:3d} | {:5d}/{:5d} batches | ms/batch {:5.2f} | '
                                'loss {:5.2f} | ppl {:8.2f}'.format(
                                    epoch, number_of_splits, batch,
                                    len(train_data) // sequence_length,
                                    elapsed * 1000 / self.log_interval,
                                    cur_loss, self._safe_exp(cur_loss)))
                            total_loss = 0
                            start_time = time.time()

                    print('epoch {} done! \t({:%H:%M:%S})'.format(
                        epoch, datetime.datetime.now()))
                    scheduler.step(cur_loss)

                    ###############################################################################
                    # TEST
                    ###############################################################################
                    # skip evaluation
                    # val_loss = self.evaluate(val_data, mini_batch_size, sequence_length)
                    # scheduler.step(val_loss)
                    #
                    # # Save the model if the validation loss is the best we've seen so far.
                    # if val_loss < best_val_loss:
                    #     self.model.save(savefile)
                    #     best_val_loss = val_loss
                    #     print('best loss so far {:5.2f}'.format(best_val_loss))
                    val_loss = cur_loss
                    if (self.corpus.current_train_file_index +
                            1) % 100 == 0 or self.corpus.is_last_slice:
                        self.model.save(savefile)

                    ###############################################################################
                    # print info
                    ###############################################################################
                    print('-' * 89)

                    local_split_number = epoch % number_of_splits
                    if local_split_number == 0:
                        local_split_number = number_of_splits

                    summary = '| end of split {:3d} /{:3d} | epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | ' \
                              'valid ppl {:8.2f} | learning rate {:3.2f}'.format(local_split_number,
                                                                                 number_of_splits,
                                                                                 epoch,
                                                                                 (time.time() - epoch_start_time),
                                                                                 val_loss,
                                                                                 self._safe_exp(val_loss),
                                                                                 learning_rate)

                    with open(loss_txt, "a") as myfile:
                        myfile.write('%s\n' % summary)

                    print(summary)
                    print('-' * 89)

        except KeyboardInterrupt:
            print('-' * 89)
            print('Exiting from training early')
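
The loop above detaches the recurrent state at every batch boundary through _repackage_hidden, whose implementation is not shown. For NDArray states it presumably amounts to something like this:

import mxnet as mx

def repackage_hidden(state):
    # detach the state from the computation graph so backprop stops at the batch boundary
    if isinstance(state, mx.nd.NDArray):
        return state.detach()
    return [repackage_hidden(s) for s in state]
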
Example #15
                                                train_file='train.short.tsv',
                                                test_file='test.tsv',
                                                dev_file='dev.tsv'
                                                # train_file='debug.tsv',
                                                # test_file='debug.tsv',
                                                # dev_file='debug.tsv'
                                                )
# 2. what tag do we want to predict?
tag_type = 'pos'

# 3. make the tag dictionary from the corpus
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
print(tag_dictionary.idx2item)

# 4. initialize embeddings
with mx.Context(mxnet_prefer_gpu()):
    embedding_types = [
        # WordEmbeddings('data/embedding/glove/glove.6B.100d.txt'),
        # WordEmbeddings('data/embedding/glove/glove.6B.100d.debug.txt'),
        # CharLMEmbeddings('data/model/lm-news-forward'),
        # CharLMEmbeddings('data/model/lm-news-backward'),
        BERTEmbeddings(['data/embedding/bert_large_cased/wsj.train.short.bert',
                        'data/embedding/bert_large_cased/wsj.dev.bert',
                        'data/embedding/bert_large_cased/wsj.test.bert']),
    ]

    embeddings = StackedEmbeddings(embeddings=embedding_types)

    # 5. initialize sequence tagger
    tagger = SequenceTagger(hidden_size=256,
                            embeddings=embeddings,
Example #16
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-01-13 15:01
import os
import pickle

import numpy as np
from bert_embedding import BertEmbedding

from bertsota.common.utils import mxnet_prefer_gpu

bert = BertEmbedding(model='bert_24_1024_16',
                     dataset_name='book_corpus_wiki_en_cased',
                     max_seq_length=270,
                     ctx=mxnet_prefer_gpu())


def embed_bert(sents):
    result = bert.embedding(sents)
    return [np.stack(s[2]) for s in result]


def make_bert_for(path, output):
    print(output)
    os.makedirs(os.path.dirname(output), exist_ok=True)
    total = 0
    with open(path) as src:
        batch = []
        tensor = []
        for line in src:
            line = line.strip()
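
The snippet above is truncated; a hypothetical call to its embed_bert helper, which returns one (num_tokens, hidden_size) array per input sentence, could look like this.

# hypothetical usage; sentences are plain strings
sents = ['BERT embeddings are contextual .',
         'Each token receives its own vector .']
tensors = embed_bert(sents)
print(tensors[0].shape)  # (num_tokens, 1024) for the bert_24_1024_16 model
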
Example #17
    def train(self,
              train_file,
              dev_file,
              save_dir,
              pretrained_embeddings_file=None,
              min_occur_count=2,
              lstm_layers=3,
              word_dims=100,
              tag_dims=100,
              dropout_emb=0.33,
              lstm_hiddens=400,
              dropout_lstm_input=0.33,
              dropout_lstm_hidden=0.33,
              mlp_arc_size=500,
              mlp_rel_size=100,
              dropout_mlp=0.33,
              learning_rate=1e-3,
              decay=.75,
              decay_steps=5000,
              beta_1=.9,
              beta_2=.9,
              epsilon=1e-12,
              num_buckets_train=40,
              num_buckets_valid=10,
              train_iters=50000,
              train_batch_size=5000,
              dev_batch_size=5000,
              validate_every=100,
              save_after=5000,
              root='root',
              bert_path=None,
              debug=False):
        """Train a deep biaffine dependency parser

        Parameters
        ----------
        train_file : str
            path to training set
        dev_file : str
            path to dev set
        save_dir : str
            a directory for saving model and related meta-data
        pretrained_embeddings_file : str
            pre-trained embeddings file, plain text format
        min_occur_count : int
            threshold of rare words, which will be replaced with UNKs
        lstm_layers : int
            layers of lstm
        word_dims : int
            dimension of word embedding
        tag_dims : int
            dimension of tag embedding
        dropout_emb : float
            word dropout
        lstm_hiddens : int
            size of lstm hidden states
        dropout_lstm_input : float
            dropout on x in variational RNN
        dropout_lstm_hidden : float
            dropout on h in variational RNN
        mlp_arc_size : int
            output size of MLP for arc feature extraction
        mlp_rel_size : int
            output size of MLP for rel feature extraction
        dropout_mlp : float
            dropout on the output of LSTM
        learning_rate : float
            learning rate
        decay : float
            see ExponentialScheduler
        decay_steps : int
            see ExponentialScheduler
        beta_1 : float
            first moment decay rate of the Adam optimizer
        beta_2 : float
            second moment decay rate of the Adam optimizer
        epsilon : float
            epsilon of the Adam optimizer
        num_buckets_train : int
            number of buckets for training data set
        num_buckets_valid : int
            number of buckets for dev data set
        train_iters : int
            training iterations
        train_batch_size : int
            training batch size
        dev_batch_size : int
            batch size for the dev set
        validate_every : int
            validate on dev set every such number of batches
        save_after : int
            skip saving model in early epochs
        root : str
            token for ROOT
        bert_path : list of str
            paths to pre-computed BERT features for the training and dev sets
        debug : bool
            debug mode

        Returns
        -------
        SDPParser
            parser itself
        """
        if pretrained_embeddings_file is None:
            word_dims = 0
        logger = init_logger(save_dir)
        config = _Config(train_file, dev_file, None, save_dir,
                         pretrained_embeddings_file, min_occur_count,
                         lstm_layers, word_dims, tag_dims, dropout_emb,
                         lstm_hiddens, dropout_lstm_input, dropout_lstm_hidden,
                         mlp_arc_size, mlp_rel_size, dropout_mlp,
                         learning_rate, decay, decay_steps, beta_1, beta_2,
                         epsilon, num_buckets_train, num_buckets_valid, None,
                         train_iters, train_batch_size, debug)

        self._vocab = vocab = ParserVocabulary(train_file,
                                               pretrained_embeddings_file,
                                               min_occur_count,
                                               root=root)
        vocab.save(config.save_vocab_path)
        vocab.log_info(logger)

        with mx.Context(mxnet_prefer_gpu()):
            data_loader = DataLoader(train_file,
                                     num_buckets_train,
                                     vocab,
                                     bert=bert_path[0] if bert_path else None)
            config.bert_dim = data_loader.bert_dim
            config.save()
            self._parser = parser = BiaffineParser(vocab,
                                                   word_dims,
                                                   tag_dims,
                                                   dropout_emb,
                                                   lstm_layers,
                                                   lstm_hiddens,
                                                   dropout_lstm_input,
                                                   dropout_lstm_hidden,
                                                   mlp_arc_size,
                                                   mlp_rel_size,
                                                   dropout_mlp,
                                                   bert=data_loader.bert_dim,
                                                   debug=debug)
            parser.initialize()
            scheduler = ExponentialScheduler(learning_rate, decay, decay_steps)
            optimizer = mx.optimizer.Adam(learning_rate,
                                          beta_1,
                                          beta_2,
                                          epsilon,
                                          lr_scheduler=scheduler)
            trainer = gluon.Trainer(parser.collect_params(),
                                    optimizer=optimizer)
            global_step = 0
            best_LF = 0.
            batch_id = 0
            epoch = 1
            total_epoch = math.ceil(train_iters / validate_every)
            logger.info("Epoch {} out of {}".format(epoch, total_epoch))
            bar = Progbar(target=min(validate_every, train_iters))
            while global_step < train_iters:
                for words, bert, tags, arcs, rels in data_loader.get_batches(
                        batch_size=train_batch_size, shuffle=False):
                    with autograd.record():
                        arc_accuracy, rel_accuracy, loss = parser.forward(
                            words, bert, tags, arcs, rels)
                        loss_value = loss.asscalar()
                    loss.backward()
                    trainer.step(train_batch_size)
                    batch_id += 1
                    try:
                        bar.update(batch_id,
                                   exact=[("LR", rel_accuracy, 2),
                                          ("loss", loss_value)])
                    except OverflowError:
                        pass  # sometimes loss can be 0 or infinity, crashes the bar

                    global_step += 1
                    if global_step % validate_every == 0:
                        batch_id = 0
                        UF, LF, speed = evaluate_official_script(
                            parser,
                            vocab,
                            num_buckets_valid,
                            dev_batch_size,
                            dev_file,
                            os.path.join(save_dir, 'dev.predict.conllu'),
                            bert=None if bert_path is None else bert_path[1])
                        LF = LF * 100
                        logger.info('Dev: LF=%.1f%% %d sents/s' % (LF, speed))
                        epoch += 1
                        bar = Progbar(target=min(validate_every, train_iters -
                                                 global_step))
                        if global_step > save_after and LF > best_LF:
                            logger.info('- new best score!')
                            best_LF = LF
                            parser.save(config.save_model_path)
                        if global_step < train_iters:
                            logger.info("Epoch {} out of {}".format(
                                epoch, total_epoch))

        # When validate_every is too big
        if not os.path.isfile(config.save_model_path):
            parser.save(config.save_model_path)

        return self
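
A hypothetical end-to-end run of the train() method above, chained with the load() and evaluate() methods shown earlier on this page. SDPParser is the class named in the docstring; all paths are placeholders.

# hypothetical usage; paths are placeholders
parser = SDPParser()
parser.train(train_file='data/sdp/dm.train.conllu',
             dev_file='data/sdp/dm.dev.conllu',
             save_dir='data/model/dm')
parser.load('data/model/dm')
parser.evaluate(test_file='data/sdp/dm.test.conllu', save_dir='data/model/dm')
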
Example #18
corpus = NLPTaskDataFetcher.fetch_column_corpus('data/conll03',
                                                columns,
                                                train_file='train.tsv',
                                                test_file='test.tsv',
                                                dev_file='dev.tsv',
                                                tag_to_biloes='ner')

# 2. what tag do we want to predict?
tag_type = 'ner'

# 3. make the tag dictionary from the corpus
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
print(tag_dictionary.idx2item)

# 4. initialize embeddings
with mx.Context(mxnet_prefer_gpu()):
    embedding_types = [
        WordEmbeddings('data/embedding/glove/glove.6B.100d.txt'),
        BERTEmbeddings([
            'data/embedding/bert_large_sum/conll03.train.bert',
            'data/embedding/bert_large_sum/conll03.dev.bert',
            'data/embedding/bert_large_sum/conll03.test.bert'
        ]),

        # comment in this line to use character embeddings
        # CharacterEmbeddings(),

        # comment in these lines to use contextual string embeddings
        CharLMEmbeddings('data/model/lm-news-forward'),
        CharLMEmbeddings('data/model/lm-news-backward'),
    ]
    def train(self,
              base_path: str,
              learning_rate: float = 0.1,
              mini_batch_size: int = 32,
              max_epochs: int = 100,
              anneal_factor: float = 0.5,
              patience: int = 2,
              save_model: bool = True,
              embeddings_in_memory: bool = True,
              train_with_dev: bool = False,
              context: mx.Context = None,
              show_test=False,
              cn=False) -> float:
        """

        :param base_path: a folder to store model, log etc.
        :param learning_rate:
        :param mini_batch_size:
        :param max_epochs:
        :param anneal_factor:
        :param patience:
        :param save_model:
        :param embeddings_in_memory:
        :param train_with_dev:
        :return: best dev f1
        """
        evaluation_method = 'F1'
        if self.model.tag_type in ['ner', 'np', 'srl']:
            evaluation_method = 'span-F1'
        if self.model.tag_type in ['pos', 'upos']:
            evaluation_method = 'accuracy'
        print(evaluation_method)

        os.makedirs(base_path, exist_ok=True)

        loss_txt = os.path.join(base_path, "loss.txt")
        open(loss_txt, "w", encoding='utf-8').close()

        anneal_mode = 'min' if train_with_dev else 'max'
        train_data = self.corpus.train

        # if training also uses dev data, include in training set
        if train_with_dev:
            train_data.extend(self.corpus.dev)

        # At any point you can hit Ctrl + C to break out of training early.
        try:
            with mx.Context(context if context else mxnet_prefer_gpu()):
                self.model.initialize()
                scheduler = ReduceLROnPlateau(lr=learning_rate,
                                              verbose=True,
                                              factor=anneal_factor,
                                              patience=patience,
                                              mode=anneal_mode)
                optimizer = mx.optimizer.SGD(learning_rate=learning_rate,
                                             lr_scheduler=scheduler,
                                             clip_gradient=5.0)
                trainer = gluon.Trainer(self.model.collect_params(),
                                        optimizer=optimizer)
                for epoch in range(0, max_epochs):
                    current_loss = 0
                    if not self.test_mode:
                        random.shuffle(train_data)

                    batches = [
                        train_data[x:x + mini_batch_size]
                        for x in range(0, len(train_data), mini_batch_size)
                    ]

                    batch_no = 0

                    for batch in batches:
                        batch_no += 1

                        # if batch_no % 100 == 0:
                        #     print("%d of %d (%f)" % (batch_no, len(batches), float(batch_no / len(batches))))

                        # Step 4. Compute the loss, gradients, and update the parameters by calling optimizer.step()
                        batch.sort(key=lambda x: len(x), reverse=True)
                        with autograd.record():
                            self.model.embeddings.embed(batch)
                            loss = self.model.neg_log_likelihood(
                                batch, self.model.tag_type)

                        current_loss += loss.sum().asscalar()

                        loss.backward()

                        # torch.nn.utils.clip_grad_norm_(self.model.parameters(), 5.0)

                        # optimizer.step()
                        trainer.step(len(batch))

                        sys.stdout.write(
                            "\r%.2f%%" %
                            (batch_no / float(len(batches)) * 100))
                        sys.stdout.flush()

                        if not embeddings_in_memory:
                            self.clear_embeddings_in_batch(batch)

                    current_loss /= len(train_data)

                    if not train_with_dev:
                        print('.. evaluating... dev... ')
                        dev_score, dev_fp, dev_result = self.evaluate(
                            self.corpus.dev,
                            base_path,
                            evaluation_method=evaluation_method,
                            embeddings_in_memory=embeddings_in_memory,
                            cn=cn)
                    else:
                        dev_fp = 0
                        dev_result = '_'

                    # anneal against train loss if training with dev, otherwise anneal against dev score
                    if train_with_dev:
                        scheduler.step(current_loss)
                    else:
                        scheduler.step(dev_score)

                    # save if model is current best and we use dev data for model selection
                    if save_model and not train_with_dev and dev_score == scheduler.best:
                        self.model.save(base_path)
                    summary = '%d' % epoch + '\t({:%H:%M:%S})'.format(datetime.datetime.now()) \
                              + '\t%f\t%d\t%f\tDEV   %d\t' % (
                                  current_loss, scheduler.num_bad_epochs, learning_rate, dev_fp) + dev_result
                    summary = summary.replace('\n', '')
                    if self.corpus.test and len(
                            self.corpus.test) and show_test:
                        print('test... ')
                        test_score, test_fp, test_result = self.evaluate(
                            self.corpus.test,
                            base_path,
                            evaluation_method=evaluation_method,
                            embeddings_in_memory=embeddings_in_memory,
                            cn=cn)
                        summary += '\tTEST   \t%d\t' % test_fp + test_result
                    with open(loss_txt, "a") as loss_file:
                        loss_file.write('%s\n' % summary)
                    print(summary)

            # if we do not use dev data for model selection, save final model
            if save_model and train_with_dev:
                self.model.save(base_path)

            return scheduler.best  # return maximum dev f1

        except KeyboardInterrupt:
            print('-' * 89)
            print('Exiting from training early')
            print('saving model')
            self.model.save(base_path + "/final-model")
            print('done')
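
A hypothetical driver for the trainer method above. The trainer class name is an assumption; it is taken to wrap the SequenceTagger and corpus built in the earlier snippets.

# hypothetical usage; SequenceTaggerTrainer is an assumed class name
trainer = SequenceTaggerTrainer(tagger, corpus)
trainer.train('data/model/ner',
              learning_rate=0.1,
              mini_batch_size=32,
              max_epochs=100,
              embeddings_in_memory=True)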