def read_data():
    """Fetch the column-formatted corpus stored under ``./conll_format``.

    The data files are read with column 0 labelled ``'text'``, column 1
    labelled ``'pos'`` and column 2 labelled ``'np'``. The train, test and
    dev splits are the files ``arr.train``, ``arr.test`` and ``arr.dev``
    in that folder.

    Returns:
        The corpus object produced by
        ``NLPTaskDataFetcher.fetch_column_corpus`` (expected to be a
        ``TaggedCorpus``).
    """
    # Folder in which the train, test and dev files reside.
    corpus_dir = './conll_format'

    # Column index -> label name for each whitespace-separated field.
    column_layout = {0: 'text', 1: 'pos', 2: 'np'}

    # File name of each split, keyed by the fetcher's keyword argument.
    split_files = {
        'train_file': 'arr.train',
        'test_file': 'arr.test',
        'dev_file': 'arr.dev',
    }

    return NLPTaskDataFetcher.fetch_column_corpus(
        corpus_dir, column_layout, **split_files
    )
from flair.data import TaggedCorpus from flair.data_fetcher import NLPTaskDataFetcher, NLPTask from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, MemoryEmbeddings, CharacterEmbeddings from typing import List import torch # 1. get the corpus columns = {0: 'text', 1: 'ner'} corpus: TaggedCorpus = NLPTaskDataFetcher.fetch_column_corpus( "data1", columns, train_file="train.txt", test_file="test.txt", dev_file="dev.txt") print(corpus) # 2. what tag do we want to predict? tag_type = 'ner' # 3. make the tag dictionary from the corpus tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type) print(tag_dictionary.idx2item) # 4. initialize embeddings embedding_types: List[TokenEmbeddings] = [ # WordEmbeddings('glove'), # comment in this line to use character embeddings # CharacterEmbeddings(),
import re
from typing import List

import gensim
from flair.data import Sentence, TaggedCorpus, Token
from flair.data_fetcher import NLPTaskDataFetcher, NLPTask
from flair.embeddings import (
    TokenEmbeddings,
    WordEmbeddings,
    StackedEmbeddings,
    CharLMEmbeddings,
)

# Paths of the text-format word2vec vectors and of the gensim-saved copy.
_VECTORS_TEXT_PATH = 'wiki.nl.vec'
_VECTORS_GENSIM_PATH = 'wiki.nl.vec.gensim'

# Label we want to predict; also handed to the fetcher as `tag_to_biloes`.
tag_type = 'ner'

# Column index -> label name for the corpus files.
columns = {0: 'text', 1: 'pos', 2: 'ner'}

# Load the train/dev/test splits from the current directory.
corpus: TaggedCorpus = NLPTaskDataFetcher.fetch_column_corpus(
    ".",
    columns,
    train_file="ned.train",
    dev_file="ned.testa",
    test_file="ned.testb",
    tag_to_biloes=tag_type,
)

# Build the tag dictionary for the chosen tag type from the corpus.
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
print(corpus)

# Read the text-format (binary=False) word2vec file, save it through gensim,
# and point WordEmbeddings at the saved file.
word_vectors = gensim.models.KeyedVectors.load_word2vec_format(
    _VECTORS_TEXT_PATH, binary=False
)
word_vectors.save(_VECTORS_GENSIM_PATH)
custom_embedding = WordEmbeddings(_VECTORS_GENSIM_PATH)

# Character language-model embeddings, one per model file.
char_lm_forward = CharLMEmbeddings('lm-nl-large-forward-v0.1.pt')
char_lm_backward = CharLMEmbeddings('lm-nl-large-backward-v0.1.pt')