Ejemplo n.º 1
0
import mxnet as mx

from bertsota.common.utils import mxnet_prefer_gpu
from bertsota.tagger.corpus import NLPTaskDataFetcher, NLPTask

# get training, test and dev data
from bertsota.tagger.embeddings import WordEmbeddings, StackedEmbeddings, BERTEmbeddings, CharLMEmbeddings
from bertsota.tagger.sequence_tagger_model import SequenceTagger
from bertsota.tagger.sequence_tagger_trainer import SequenceTaggerTrainer

model_path = 'data/model/conll03-pe-fe2'
columns = {0: 'text', 1: 'pos', 2: 'np', 3: 'ner'}
corpus = NLPTaskDataFetcher.fetch_column_corpus('data/conll03',
                                                columns,
                                                train_file='train.tsv',
                                                test_file='test.tsv',
                                                dev_file='dev.tsv',
                                                tag_to_biloes='ner')

# 2. what tag do we want to predict?
tag_type = 'ner'

# 3. make the tag dictionary from the corpus
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
print(tag_dictionary.idx2item)

# 4. initialize embeddings
with mx.Context(mxnet_prefer_gpu()):
    embedding_types = [
        WordEmbeddings('data/embedding/glove/glove.6B.100d.txt'),
        BERTEmbeddings([
Ejemplo n.º 2
0
import mxnet as mx

from bertsota.common.utils import mxnet_prefer_gpu
from bertsota.tagger.corpus import NLPTaskDataFetcher
from bertsota.tagger.embeddings import WordEmbeddings, CharLMEmbeddings, StackedEmbeddings, BERTEmbeddings
from bertsota.tagger.sequence_tagger_model import SequenceTagger
from bertsota.tagger.sequence_tagger_trainer import SequenceTaggerTrainer

model_path = 'data/model/wsj-pos-dotatt-belc'
columns = {0: 'text', 1: 'pos'}
corpus = NLPTaskDataFetcher.fetch_column_corpus('data/wsj-pos',
                                                columns,
                                                train_file='train.short.tsv',
                                                test_file='test.tsv',
                                                dev_file='dev.tsv'
                                                # train_file='debug.tsv',
                                                # test_file='debug.tsv',
                                                # dev_file='debug.tsv'
                                                )
# 2. what tag do we want to predict?
tag_type = 'pos'

# 3. make the tag dictionary from the corpus
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
print(tag_dictionary.idx2item)

# 4. initialize embeddings
with mx.Context(mxnet_prefer_gpu()):
    embedding_types = [
        # WordEmbeddings('data/embedding/glove/glove.6B.100d.txt'),
Ejemplo n.º 3
0
import os

import mxnet as mx

from bertsota.common.utils import mxnet_prefer_gpu
from bertsota.tagger.corpus import NLPTaskDataFetcher
from bertsota.tagger.embeddings import WordEmbeddings, CharLMEmbeddings, StackedEmbeddings, BERTEmbeddings
from bertsota.tagger.sequence_tagger_model import SequenceTagger
from bertsota.tagger.sequence_tagger_trainer import SequenceTaggerTrainer

model_path = 'data/model/ctb-pos3'

columns = {0: 'text', 1: 'pos'}
corpus = NLPTaskDataFetcher.fetch_column_corpus('data/ctb5.1-pos',
                                                columns,
                                                train_file='train.short.tsv',
                                                test_file='test.short.tsv',
                                                dev_file='dev.short.tsv')
# 2. what tag do we want to predict?
tag_type = 'pos'

# 3. make the tag dictionary from the corpus
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
print(tag_dictionary.idx2item)

# 4. initialize embeddings
with mx.Context(mxnet_prefer_gpu()):
    embedding_types = [
        WordEmbeddings('data/embedding/ctb.pos.fasttext.300.txt'),
        # BERTEmbeddings(['data/embedding/bert_base_sum/ctb.pos.train.bert',
        #                 'data/embedding/bert_base_sum/ctb.pos.dev.bert',
Ejemplo n.º 4
0
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-02-12 15:25
import mxnet as mx

from bertsota.common.utils import mxnet_prefer_gpu
from bertsota.tagger.corpus import NLPTaskDataFetcher
from bertsota.tagger.embeddings import WordEmbeddings, CharLMEmbeddings, StackedEmbeddings
from bertsota.tagger.sequence_tagger_model import SequenceTagger
from bertsota.tagger.sequence_tagger_trainer import SequenceTaggerTrainer

columns = {0: 'text', 1: 'pos'}
corpus = NLPTaskDataFetcher.fetch_column_corpus('data/wsj-pos',
                                                columns,
                                                train_file='dev.tsv',
                                                test_file='dev.tsv',
                                                dev_file='dev.tsv')
# 2. what tag do we want to predict?
tag_type = 'pos'

# 3. make the tag dictionary from the corpus
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
print(tag_dictionary.idx2item)

# 4. initialize embeddings
with mx.Context(mxnet_prefer_gpu()):
    embedding_types = [
        WordEmbeddings('data/embedding/glove/glove.6B.100d.debug.txt'),
        # CharLMEmbeddings('data/model/lm-news-forward'),
        # CharLMEmbeddings('data/model/lm-news-backward'),
    ]
Ejemplo n.º 5
0
import mxnet as mx

from bertsota.common.utils import mxnet_prefer_gpu
from bertsota.tagger.corpus import NLPTaskDataFetcher, NLPTask

# get training, test and dev data
from bertsota.tagger.embeddings import WordEmbeddings, StackedEmbeddings, BERTEmbeddings, CharLMEmbeddings
from bertsota.tagger.sequence_tagger_model import SequenceTagger
from bertsota.tagger.sequence_tagger_trainer import SequenceTaggerTrainer

model_path = 'data/model/ontoen-bert'
columns = {0: 'id', 1: 'text', 2: 'lemma', 3: 'ner'}
corpus = NLPTaskDataFetcher.fetch_column_corpus('data/ontonotes-en',
                                                columns,
                                                train_file='train.tsv',
                                                test_file='test.tsv',
                                                dev_file='dev.tsv',
                                                tag_to_biloes='ner',
                                                source_scheme='ioblu')

# 2. what tag do we want to predict?
tag_type = 'ner'

# 3. make the tag dictionary from the corpus
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
print(tag_dictionary.idx2item)

# 4. initialize embeddings
with mx.Context(mxnet_prefer_gpu()):
    embedding_types = [