Exemple #1
0
    def train(cls):
        """
        Command-line entry point: parse options and train a POS tagger.

        Options are read from ``sys.argv[3:]`` and passed through
        ``merge_args_with_config``; the result is then used as a mapping
        and forwarded to ``POSFlairTagger.train`` as keyword arguments.
        ``train_path``, ``dev_path`` and ``model_path`` must be non-empty
        after merging.

        :return: None
        :raises SystemExit: status 0 after ``-h/--help``; status 1 (after
            printing the help text) on an argument error, on a missing
            required path, or on any other non-zero exit raised while the
            arguments are being prepared
        """
        parser = argparse.ArgumentParser(description='Train a pos tagger')
        parser.add_argument('--config_path',
                            type=str,
                            help='config file in json format')
        parser.add_argument('--train_path',
                            type=str,
                            help='file path to the training set')
        parser.add_argument('--dev_path',
                            type=str,
                            help='file path to the development set')
        parser.add_argument('--model_path',
                            type=str,
                            help='file path where the model will be saved')
        parser.add_argument('--word_embeddings',
                            type=str,
                            default='fasttext:crawl-300d-2M-subword',
                            help='word embeddings to use')
        # NOTE(review): `store_false` makes `flair` default to True and
        # passing --flair_embeddings turns it OFF, which contradicts the help
        # text. Left as-is because flipping it would change the default;
        # confirm the intended polarity.
        parser.add_argument('--flair_embeddings',
                            dest='flair',
                            action='store_false',
                            help='use Flair embeddings')
        parser.add_argument('--learning_rate', type=float, default=0.1)
        parser.add_argument('--mini_batch_size', type=int, default=32)
        parser.add_argument('--max_epochs', type=int, default=100)
        parser.add_argument('--anneal_factor', type=float, default=0.5)
        parser.add_argument('--embeddings_in_memory',
                            action='store_true',
                            help='store embeddings in GPU memory')

        try:
            args = parser.parse_args(sys.argv[3:])
            args = merge_args_with_config(args)
            # 'name:source' style spec is split into its components.
            args['pretrained_embeddings'] = args['word_embeddings'].split(':')
            for k in ('train_path', 'dev_path', 'model_path'):
                if not args[k]:
                    eprint('--{} is required'.format(k))
                    sys.exit(1)
            # Both corpora use the same two-column layout: token, then tag.
            for docs_key, path_key in (('trn_docs', 'train_path'),
                                       ('dev_docs', 'dev_path')):
                args[docs_key] = conll_to_documents(args[path_key],
                                                    headers={
                                                        0: 'text',
                                                        1: 'pos'
                                                    })
        except SystemExit as exc:
            # argparse exits with status 0 after printing --help itself; let
            # that through instead of printing the help a second time and
            # turning a successful exit into a failure.
            if exc.code in (0, None):
                raise
            parser.print_help()
            sys.exit(1)
        tagger = POSFlairTagger()
        tagger.train(**args)
Exemple #2
0
 def evaluate(cls):
     """
     Command-line entry point: load a saved POS tagger and evaluate it.

     Options are read from ``sys.argv[3:]``. ``--test_path`` is required;
     ``--model_path`` defaults to ``ELIT_POS_FLAIR_EN_MIXED``. The test file
     is read with column 0 as ``text`` and column 1 as ``pos``.

     :return: None
     :raises SystemExit: status 0 after ``-h/--help``; status 1 (after
         printing the help text) when the arguments cannot be parsed
     """
     parser = argparse.ArgumentParser(description='Evaluate a pos tagger')
     parser.add_argument('--model_path',
                         type=str,
                         default=ELIT_POS_FLAIR_EN_MIXED,
                         help='file path to the saved model')
     parser.add_argument('--test_path',
                         type=str,
                         required=True,
                         help='gold file in tsv format')
     try:
         args = parser.parse_args(sys.argv[3:])
     except SystemExit as exc:
         # argparse exits with status 0 after printing --help itself; let
         # that through instead of printing the help twice and reporting a
         # failure status.
         if exc.code in (0, None):
             raise
         parser.print_help()
         sys.exit(1)
     tagger = POSFlairTagger()
     tagger.load(args.model_path)
     tagger.evaluate(
         conll_to_documents(args.test_path, headers={
             0: 'text',
             1: 'pos'
         }))
Exemple #3
0
             **kwargs):
        """
        Load model
        :param model_path: path to stored model
        :param model_root: the root for model_path
        :param kwargs: not used
        :return: self
        """
        super().load(model_path, model_root=model_root, **kwargs)
        return self


if __name__ == '__main__':
    # Manual smoke run: restore a stored NER model, decode the test file,
    # print the NER field of one decoded sentence, then print the evaluation.
    ner_model_dir = 'data/model/ner/jumbo'
    ner_tagger = NERFlairTagger(mx.gpu(3))
    # Training call kept for reference (disabled):
    # ner_tagger.train(conll_to_documents('data/conll-03/debug/eng.trn'), conll_to_documents('data/conll-03/debug/eng.dev'),
    #                  ner_model_dir, pretrained_embeddings='data/embedding/glove/glove.6B.100d.debug.txt',
    #                  forward_language_model='data/model/lm-news-forward',
    #                  backward_language_model='data/model/lm-news-backward',
    #                  max_epochs=1)
    ner_tagger.load(ner_model_dir)

    # Column layout of the test file: token, POS tag, NER tag.
    column_names = {0: 'text', 1: 'pos', 2: 'ner'}
    test_docs = conll_to_documents('data/dat/en-ner.tst', headers=column_names)

    decoded_docs = ner_tagger.decode(test_docs)
    # Sentence at index 3 of the first decoded document.
    sample_sentence = decoded_docs[0][SENS][3]
    print(sample_sentence[NER])
    print(ner_tagger.evaluate(test_docs))
Exemple #4
0
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-10-03 16:27
from types import SimpleNamespace

from elit.component.embedding.fasttext import FastText
from elit.component.tagger.corpus import conll_to_documents, label_map_from_conll
from elit.component.token_tagger.cnn import CNNTokenTagger
from elit.util.mx import mxnet_prefer_gpu

# Train a CNN token tagger for POS on the PTB files, reload the saved model
# and evaluate it on the dev file.


def _pos_documents(path, **options):
    """Read a CoNLL file whose column 0 is the token and column 1 the POS tag."""
    return conll_to_documents(path, headers={0: 'text', 1: 'pos'}, **options)


# The label inventory is taken from the dev file.
label_map = label_map_from_conll('data/ptb/pos/dev.tsv')
print(label_map)

tagger = CNNTokenTagger(
    ctx=mxnet_prefer_gpu(),
    key='pos',
    embs=[FastText('https://elit-models.s3-us-west-2.amazonaws.com/cc.en.300.bin.zip')],
    input_config=SimpleNamespace(row=100, col=5, dropout=0.5),
    output_config=SimpleNamespace(num_class=len(label_map), flatten=True),
    label_map=label_map,
)
# 94.38  (presumably the score of an earlier run -- not verifiable from here)
save_path = 'data/model/cnntagger'
tagger.train(
    _pos_documents('data/ptb/pos/train.tsv', gold=True),
    _pos_documents('data/ptb/pos/dev.tsv', gold=True),
    save_path,
)
tagger.load(save_path)
# Error noted by the original author at this point:
# Parameter 'dense1.weight' is missing in file 'data/model/cnntagger.params', which contains parameters: 'dense0.weight', 'dense0.bias'. Set allow_missing=True to ignore missing parameters.
tagger.evaluate(_pos_documents('data/ptb/pos/dev.tsv'))
Exemple #5
0
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-06-16 11:37
from elit.component import POSFlairTagger
from elit.component.tagger.corpus import conll_to_documents
from elit.resources.pre_trained_models import ELIT_POS_FLAIR_EN_MIXED

# Evaluate the pre-trained mixed-English POS Flair model on the DDR test file.
model_path = ELIT_POS_FLAIR_EN_MIXED
tagger = POSFlairTagger()
tagger.load(model_path)

# In this file the token is in column 1 and the POS tag in column 3.
ddr_test_docs = conll_to_documents('data/dat/en-ddr.tst',
                                   headers={1: 'text', 3: 'pos'})
evaluation = tagger.evaluate(ddr_test_docs, output_dir='data/dat')
print(evaluation)  # 97.96%
Exemple #6
0
        """
        Load model
        :param model_path: path to stored model
        :param model_root: the root for model_path
        :param kwargs: not used
        :return: self
        """
        super().load(model_path, model_root=model_root, **kwargs)
        return self


if __name__ == '__main__':
    # Manual smoke run: restore a stored POS model, decode the WSJ test file,
    # print the POS field of one decoded sentence, then print the evaluation.
    pos_model_dir = 'data/model/pos/wsj'
    pos_tagger = POSFlairTagger(context=mx.gpu(3))
    pos_tagger.load(pos_model_dir)
    # Training call kept for reference (disabled):
    # pos_tagger.train(conll_to_documents('data/dat/en-pos.dev', headers={0: 'text', 1: 'pos'}),
    #                  conll_to_documents('data/dat/en-pos.dev', headers={0: 'text', 1: 'pos'}),
    #                  pos_model_dir, pretrained_embeddings='data/embedding/glove/glove.6B.100d.debug.txt',
    #                  forward_language_model='data/model/lm-news-forward',
    #                  backward_language_model='data/model/lm-news-backward',
    #                  max_epochs=1,
    #                  embeddings_in_memory=False)

    # Column layout of the test file: token, then POS tag.
    column_names = {0: 'text', 1: 'pos'}
    test_docs = conll_to_documents('data/wsj-pos/test.tsv', headers=column_names)

    decoded_docs = pos_tagger.decode(test_docs)
    # Sentence at index 3 of the first decoded document.
    sample_sentence = decoded_docs[0][SENS][3]
    print(sample_sentence[POS])
    print(pos_tagger.evaluate(test_docs))