コード例 #1
0
def read_data(data_folder='./conll_format'):
    """Load the column-formatted corpus (train, dev and test splits).

    Args:
        data_folder: Folder in which the train, test and dev files reside.
            Defaults to ``'./conll_format'``, the path that was previously
            hard-coded, so existing zero-argument calls behave as before.

    Returns:
        The corpus object produced by
        ``NLPTaskDataFetcher.fetch_column_corpus`` for the files
        ``arr.train``, ``arr.test`` and ``arr.dev`` inside ``data_folder``.
    """
    # Maps each column index of the data files to the name of the
    # annotation stored in that column.
    columns = {0: 'text', 1: 'pos', 2: 'np'}

    # Retrieve the corpus using the column format, the data folder and the
    # names of the train, test and dev files.
    corpus: TaggedCorpus = NLPTaskDataFetcher.fetch_column_corpus(
        data_folder,
        columns,
        train_file='arr.train',
        test_file='arr.test',
        dev_file='arr.dev',
    )

    return corpus
コード例 #2
0
ファイル: train_wv_glove.py プロジェクト: gungui98/ner
from flair.data import TaggedCorpus
from flair.data_fetcher import NLPTaskDataFetcher, NLPTask
from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, MemoryEmbeddings, CharacterEmbeddings
from typing import List
import torch

# 1. get the corpus
# `columns` maps each column index of the data files to the name of the
# annotation stored in that column (column 0 -> 'text', column 1 -> 'ner').
columns = {0: 'text', 1: 'ner'}
# Load the train/test/dev files from the "data1" folder using that layout.
corpus: TaggedCorpus = NLPTaskDataFetcher.fetch_column_corpus(
    "data1",
    columns,
    train_file="train.txt",
    test_file="test.txt",
    dev_file="dev.txt")
# Print the loaded corpus as a quick sanity check.
print(corpus)

# 2. what tag do we want to predict?
# Must match one of the annotation names declared in `columns` above.
tag_type = 'ner'

# 3. make the tag dictionary from the corpus
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
# Show the dictionary's index-to-item table for inspection.
print(tag_dictionary.idx2item)

# 4. initialize embeddings
embedding_types: List[TokenEmbeddings] = [

    # WordEmbeddings('glove'),

    # comment in this line to use character embeddings
    # CharacterEmbeddings(),
コード例 #3
0
import gensim
import re

from flair.data import Sentence, TaggedCorpus, Token
from flair.data_fetcher import NLPTaskDataFetcher, NLPTask
from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, CharLMEmbeddings
from typing import List

# Maps each column index of the data files to the name of the annotation
# stored in that column (0 -> 'text', 1 -> 'pos', 2 -> 'ner').
columns = {0: 'text', 1: 'pos', 2: 'ner'}

# Load the train/dev/test files from the current directory.
# NOTE(review): tag_to_biloes='ner' presumably asks the fetcher to convert the
# 'ner' tags to the BILOES/BIOES scheme -- confirm against the flair version in use.
corpus: TaggedCorpus = NLPTaskDataFetcher.fetch_column_corpus(".", columns, train_file="ned.train",
                                                        dev_file="ned.testa",
                                                        test_file="ned.testb",
                                                        tag_to_biloes='ner')

# The annotation we want to predict; matches the 'ner' column declared above.
tag_type = 'ner'



# Build the tag dictionary for the chosen tag type from the corpus.
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

# Print the loaded corpus as a quick sanity check.
print(corpus)

# Read the word vectors from the text-format (binary=False) word2vec file and
# re-save them under 'wiki.nl.vec.gensim', the path handed to WordEmbeddings below.
word_vectors = gensim.models.KeyedVectors.load_word2vec_format('wiki.nl.vec', binary=False)
word_vectors.save('wiki.nl.vec.gensim')

# Word embeddings backed by the file saved just above.
custom_embedding = WordEmbeddings('wiki.nl.vec.gensim')

# Character language-model embeddings loaded from two model files.
# NOTE(review): judging by the file names these are the forward and backward
# Dutch language models -- confirm the files exist and match this flair version.
char_lm_forward = CharLMEmbeddings('lm-nl-large-forward-v0.1.pt')
char_lm_backward = CharLMEmbeddings('lm-nl-large-backward-v0.1.pt')