def load_specific_corpus(dirname, files):
    """Load a two-column (text, ner) corpus from *dirname*.

    Args:
        dirname: Folder handed to ``load_column_corpus`` as the data folder.
        files: Mapping with the keys ``'train_file'``, ``'dev_file'`` and
            ``'test_file'``. A falsy value leaves that split unspecified
            (``None`` is passed on); otherwise only the base name of the
            given path is passed on.

    Returns:
        The corpus returned by ``NLPTaskDataFetcher.load_column_corpus``.

    Raises:
        KeyError: If one of the three keys is missing from *files*.
    """
    def _base_name_or_none(key):
        # Only the file name is forwarded; the folder comes from ``dirname``.
        selected = files[key]
        return os.path.basename(selected) if selected else None

    train_file = _base_name_or_none('train_file')
    dev_file = _base_name_or_none('dev_file')
    test_file = _base_name_or_none('test_file')

    column_format = {0: 'text', 1: 'ner'}
    corpus: TaggedCorpus = NLPTaskDataFetcher.load_column_corpus(
        dirname,
        column_format,
        train_file=train_file,
        dev_file=dev_file,
        test_file=test_file)
    log.info(corpus)
    return corpus
def train_tagger(data_path, model_path):
    """Train a CRF sequence tagger for the ``'ct'`` tag.

    Args:
        data_path: Folder containing ``train.tsv`` and ``test.tsv`` in
            column format (text, pos, ct).
        model_path: Location handed to ``ModelTrainer.train`` as the
            output path.
    """
    tag_type = 'ct'
    column_format = {0: 'text', 1: 'pos', 2: 'ct'}

    # No dev file is named here; only the train and test files are given.
    corpus: TaggedCorpus = NLPTaskDataFetcher.load_column_corpus(
        data_path,
        column_format,
        train_file='train.tsv',
        test_file='test.tsv')

    # GloVe word vectors stacked with forward/backward character LMs.
    stacked_parts: List[TokenEmbeddings] = [
        WordEmbeddings('glove'),
        CharLMEmbeddings('news-forward'),
        CharLMEmbeddings('news-backward'),
    ]
    embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=stacked_parts)

    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

    tagger: SequenceTagger = SequenceTagger(
        hidden_size=256,
        embeddings=embeddings,
        tag_dictionary=tag_dictionary,
        tag_type=tag_type,
        use_crf=True)

    trainer: ModelTrainer = ModelTrainer(tagger, corpus)
    trainer.train(
        model_path,
        learning_rate=0.1,
        mini_batch_size=16,
        max_epochs=30)
def load_cropus(config):
    """Load the CoNLL-style corpus through flair and drop document markers.

    The expected layout of the data files is described at
    https://github.com/zalandoresearch/flair

    Args:
        config: Object whose ``path_data_root`` attribute is the data folder.

    Returns:
        A ``(train, dev, test)`` tuple of lists of sentences, with every
        sentence whose tokenized string equals ``'-DOCSTART-'`` removed.
    """
    # The entity column (index 3) is deliberately not named 'ner':
    # per the original author, flair would otherwise convert it to BIOES.
    column_format = {0: 'text', 1: 'pos', 2: 'np', 3: 'ner11'}

    # NOTE(review): the training split is read from 'eng.testb'; the
    # original 'eng.train' setting is commented out. Confirm this is
    # intentional and not a debugging leftover.
    corpus: TaggedCorpus = NLPTaskDataFetcher.load_column_corpus(
        config.path_data_root,
        column_format,
        # train_file='eng.train',
        train_file='eng.testb',
        test_file='eng.testb',
        dev_file='eng.testa')

    def _without_doc_markers(sentences):
        # Skip the document separator lines of the CoNLL corpus.
        return [
            sentence for sentence in sentences
            if sentence.to_tokenized_string() != '-DOCSTART-'
        ]

    filtered_train = _without_doc_markers(corpus.train)
    filtered_dev = _without_doc_markers(corpus.dev)
    filtered_test = _without_doc_markers(corpus.test)
    return filtered_train, filtered_dev, filtered_test
def train():
    """Train a GloVe + character-embedding CRF tagger for the ``"ner"`` tag.

    Data is read from ``../data/`` and the model is written to
    ``../models/``, both resolved against the module-level ``path``.
    The dev file is also used as the test file.
    """
    # Column layout: word, POS tag, label.
    column_format = {0: "word", 1: "postag", 2: "ner"}
    data_dir = os.path.join(path, "../data/")

    # The test split is the same file as the dev split.
    corpus: TaggedCorpus = NLPTaskDataFetcher.load_column_corpus(
        data_dir,
        column_format,
        train_file="onto.train",
        dev_file="onto.testa",
        test_file="onto.testa")
    print(corpus)

    # Label dictionary built from the corpus.
    tag_dictionary = corpus.make_tag_dictionary(tag_type="ner")
    print(tag_dictionary.idx2item)

    # GloVe word embeddings stacked with character embeddings.
    stacked_parts: List[TokenEmbeddings] = [
        WordEmbeddings("glove"),
        CharacterEmbeddings(),
    ]
    embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=stacked_parts)

    tagger: SequenceTagger = SequenceTagger(
        hidden_size=256,
        embeddings=embeddings,
        tag_dictionary=tag_dictionary,
        tag_type="ner",
        use_crf=True)
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)

    # Per the original author: the model is saved in this folder as
    # final-model.pt, and training takes hours, so use a GPU.
    output_dir = os.path.join(path, "../models/")
    trainer.train(
        output_dir,
        learning_rate=0.1,
        mini_batch_size=64,
        max_epochs=3)
def test_load_no_dev_data(tasks_base_path):
    """Check split sizes when loading ``fashion_nodev`` without file names.

    Expects 5 train, 1 dev and 1 test sentences.
    """
    data_dir = tasks_base_path / u'fashion_nodev'
    column_format = {0: u'text', 2: u'ner'}

    corpus = NLPTaskDataFetcher.load_column_corpus(data_dir, column_format)

    assert len(corpus.train) == 5
    assert len(corpus.dev) == 1
    assert len(corpus.test) == 1
def main(args):
    """Train a GloVe + Flair-embedding CRF NER tagger.

    The corpus folder is taken from the first ``data_file`` command-line
    argument and the model is written to ``resources/taggers/glove``.

    NOTE(review): the ``args`` parameter is overwritten immediately by
    ``parser.parse_args()``, so whatever the caller passes is ignored.
    """
    args = parser.parse_args()

    # 1. get the corpus
    columns = {0: 'word', 1: 'pos', 2: 'ner'}
    corpus_dir = Path(args.data_file[0])
    corpus: TaggedCorpus = NLPTaskDataFetcher.load_column_corpus(
        corpus_dir, columns, tag_to_biloes='ner')
    print(corpus)

    # 2. the tag to predict
    tag_type = 'ner'

    # 3. tag dictionary built from the corpus
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
    print(tag_dictionary.idx2item)

    # 4. embeddings: GloVe plus forward/backward contextual string
    #    embeddings. Other options (CharacterEmbeddings, BertEmbeddings,
    #    ELMoEmbeddings) can be added to this list.
    stacked_parts: List[TokenEmbeddings] = [
        WordEmbeddings('glove'),
        FlairEmbeddings('news-forward'),
        FlairEmbeddings('news-backward'),
    ]
    embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=stacked_parts)

    # 5. sequence tagger
    from flair.models import SequenceTagger
    tagger: SequenceTagger = SequenceTagger(
        hidden_size=256,
        embeddings=embeddings,
        tag_dictionary=tag_dictionary,
        tag_type=tag_type,
        use_crf=True)

    # 6. trainer
    from flair.trainers import ModelTrainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)

    # 7. training
    trainer.train(
        'resources/taggers/glove',
        learning_rate=0.1,
        mini_batch_size=32,
        max_epochs=50)
def test():
    """Load the BILOU restaurant corpus from ``../dataset/flair``.

    The tag is read from column 1 and the token text from column 3.
    The loaded corpus is not returned or used further.
    """
    # from flair.data import TaggedCorpus
    from flair.data_fetcher import NLPTaskDataFetcher

    column_format = {1: "ner", 3: "text"}
    corpus = NLPTaskDataFetcher.load_column_corpus(
        "../dataset/flair",
        column_format=column_format,
        train_file="train_res_bilou.txt",
        test_file="test_res_bilou.txt",
    )
def test_load_no_dev_data_explicit(tasks_base_path):
    """Check split sizes for ``fashion_nodev`` with explicit train/test files.

    Expects 5 train, 1 dev and 1 test sentences.
    """
    data_dir = tasks_base_path / u'fashion_nodev'
    column_format = {0: u'text', 2: u'ner'}

    corpus = NLPTaskDataFetcher.load_column_corpus(
        data_dir,
        column_format,
        train_file=u'train.tsv',
        test_file=u'test.tsv')

    assert len(corpus.train) == 5
    assert len(corpus.dev) == 1
    assert len(corpus.test) == 1
def test_load_no_dev_data(tasks_base_path):
    """Check split sizes when loading ``fashion_nodev`` without file names.

    In this variant the splits are obtained by calling ``corpus.train()``,
    ``corpus.dev()`` and ``corpus.test()``; 5, 1 and 1 items are expected.
    """
    data_dir = tasks_base_path / 'fashion_nodev'
    column_format = {0: 'text', 2: 'ner'}

    # get training, test and dev data
    corpus = NLPTaskDataFetcher.load_column_corpus(data_dir, column_format)

    train_items = list(corpus.train())
    assert len(train_items) == 5
    dev_items = list(corpus.dev())
    assert len(dev_items) == 1
    test_items = list(corpus.test())
    assert len(test_items) == 1
def test_load_no_dev_data(tasks_base_path):
    """Check split sizes when loading ``fashion_nodev`` without file names.

    Expects 5 train, 1 dev and 1 test sentences.
    """
    data_dir = tasks_base_path / "fashion_nodev"
    column_format = {0: "text", 2: "ner"}

    # get training, test and dev data
    corpus = NLPTaskDataFetcher.load_column_corpus(data_dir, column_format)

    train_size = len(corpus.train)
    assert train_size == 5
    dev_size = len(corpus.dev)
    assert dev_size == 1
    test_size = len(corpus.test)
    assert test_size == 1
def test_load_no_dev_data_explicit(tasks_base_path):
    """Check split sizes for ``fashion_nodev`` with explicit train/test files.

    Expects 5 train, 1 dev and 1 test sentences.
    """
    data_dir = tasks_base_path / 'fashion_nodev'
    column_format = {0: 'text', 2: 'ner'}

    # get training, test and dev data
    corpus = NLPTaskDataFetcher.load_column_corpus(
        data_dir,
        column_format,
        train_file='train.tsv',
        test_file='test.tsv',
    )

    train_size = len(corpus.train)
    assert train_size == 5
    dev_size = len(corpus.dev)
    assert dev_size == 1
    test_size = len(corpus.test)
    assert test_size == 1
def __init__(self, corpus_name: str):
    """Train a flair NER tagger for *corpus_name* and plot the results.

    The corpus is read from the DIRKSON model folder, a BERT + Flair
    stacked-embedding CRF tagger is trained on it, and the model plus
    training plots are written to a sub-folder named after
    *corpus_name*.

    Args:
        corpus_name: Prefix of the corpus files and name of the output
            sub-folder.
    """
    # NOTE(review): the training split is read from the file named by
    # loc.DIRKSON_VALIDATION_TXT and no dev file is given -- confirm.
    corpus = NLPTaskDataFetcher.load_column_corpus(
        loc.abs_path([loc.ASSETS, loc.MODELS, loc.DIRKSON]),
        {0: 'text', 1: 'ner'},
        train_file=corpus_name + loc.DIRKSON_VALIDATION_TXT,
        test_file=corpus_name + loc.DIRKSON_TEST_TXT)

    embedding_types = [
        BertEmbeddings('bert-base-uncased'),
        FlairEmbeddings('mix-forward'),
        FlairEmbeddings('mix-backward'),
    ]
    tag_type = 'ner'
    embeddings = StackedEmbeddings(embeddings=embedding_types)
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

    tagger: SequenceTagger = SequenceTagger(
        hidden_size=256,
        embeddings=embeddings,
        tag_dictionary=tag_dictionary,
        tag_type=tag_type,
        use_crf=True)
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)

    # Output folder for this corpus. The previous check,
    # `if not path.exists:`, tested the function object itself (always
    # truthy), so the folder was never created. makedirs with
    # exist_ok=True creates it when missing and is a no-op otherwise.
    # Assumes loc.abs_path returns the same path for the same arguments.
    model_dir = loc.abs_path(
        [loc.ASSETS, loc.MODELS, loc.DIRKSON, corpus_name])
    os.makedirs(model_dir, exist_ok=True)

    trainer.train(
        model_dir,
        learning_rate=0.1,
        mini_batch_size=32,
        max_epochs=150)

    plotter = Plotter()
    plotter.plot_training_curves(
        loc.abs_path([
            loc.ASSETS, loc.MODELS, loc.DIRKSON, corpus_name, loc.LOSS_TSV
        ]))
    plotter.plot_weights(
        loc.abs_path([
            loc.ASSETS, loc.MODELS, loc.DIRKSON, corpus_name,
            loc.WEIGHTS_TXT
        ]))
def infer():
    """Tag the test split with the saved model and write one tag per line.

    The model is loaded from ``../models/final-model.pt`` and the output
    goes to ``../output/lstm_output.txt`` (both relative to the
    module-level ``path``). Each token's ``'ner'`` tag value is written
    on its own line, with an empty line after every sentence.
    """
    # Column layout: word, POS tag.
    column_format = {0: "word", 1: "postag"}
    data_dir = os.path.join(path, "../data/")

    # Load the trained sequence tagger.
    saved_model = os.path.join(path, "../models/final-model.pt")
    tagger = SequenceTagger.load_from_file(saved_model)

    # Load the corpus; only its test split is used below.
    corpus: TaggedCorpus = NLPTaskDataFetcher.load_column_corpus(
        data_dir,
        column_format,
        train_file="onto.train",
        dev_file="onto.testa",
        test_file="onto.testb")
    print(corpus)

    # Per the original author, inference takes at least 30 minutes.
    print("Infering on test set...")
    tagged_sentences = tagger.predict(corpus.test)

    output_path = os.path.join(path, "../output/lstm_output.txt")
    with open(output_path, "w") as out_file:
        for tagged in tagged_sentences:
            out_file.writelines(
                "{}\n".format(token.tags['ner'].value)
                for token in tagged.tokens)
            # Blank line separates sentences.
            out_file.write("\n")
from flair.training_utils import EvaluationMetric
from flair.visual.training_curves import Plotter

# 1. get the corpus
# corpus: TaggedCorpus = NLPTaskDataFetcher.load_corpus(NLPTask.UD_ENGLISH)

# Column layout of the data files: token text followed by the
# DC, EE, EG, HG and 'comb' tag columns.
columns = {0: 'text', 1: 'DC', 2: 'EE', 3: 'EG', 4: 'HG', 5: 'comb'}

# this is the folder in which train, test and dev files reside
data_folder = 'data'

# retrieve corpus using column format, data folder and the names of the
# train, dev and test files
corpus: TaggedCorpus = NLPTaskDataFetcher.load_column_corpus(
    data_folder, columns,
    train_file='eda_train.txt',
    test_file='eda_test.txt',
    dev_file='eda_dev.txt')
print(corpus)

# 2. what tags do we want to predict?
# The 'comb' column is loaded above but is not listed here.
tag_types = [
    'DC',
    'EE',
    'EG',
    'HG',
]

# 3. make the tag dictionaries from the corpus (filled in further below)
tag_dictionaries = []
from torch.optim.adam import Adam
from typing import List
from hyperopt import hp
from flair.hyperparameter.param_selection import SearchSpace, Parameter
from flair.optim import SGDW
import os
import torch
import gensim

print(" ")

# Column layout of the data files: token, POS tag, sub-label, label.
columns = {0: 'token', 1: 'pos', 2: 'sublabel', 3: 'label'}
data_folder = "data/"

# Load the three splits from fixed file names inside data_folder.
corpus: TaggedCorpus = NLPTaskDataFetcher.load_column_corpus(
    data_folder, columns,
    train_file="train_selective.txt",
    test_file="test_selective.txt",
    dev_file="dev_selective.txt")

# Report the size of each split.
print(" ")
print("Train len: ", len(corpus.train))
print("Test len: ", len(corpus.test))
print("Dev len: ", len(corpus.dev))

# Show the first sentence of each split with its 'label' tags.
print(" ")
print("Train: ", corpus.train[0].to_tagged_string('label'))
print("Test: ", corpus.test[0].to_tagged_string('label'))
print("Dev: ", corpus.dev[0].to_tagged_string('label'))

# The tag to predict, and the dictionary of its values in the corpus.
tag_type = 'label'
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, FlairEmbeddings from typing import List import os import setGPU columns = {0: 'text', 1: 'ner'} os.chdir("/home/lpinna") # this is the folder in which train, test and dev files reside data_folder = '/home/lpinna/classification1/training/vol3' # 1. get the corpus corpus: TaggedCorpus = NLPTaskDataFetcher.load_column_corpus(data_folder, columns, train_file='train_20190426_v1.csv', test_file='test_20190426_v1.csv') print(corpus) # 2. what tag do we want to predict? tag_type = 'ner' # 3. make the tag dictionary from the corpus tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type) #print(tag_dictionary.idx2item) # 4. initialize embeddings embedding_types: List[TokenEmbeddings] = [
def main(train_file):
    """Train a GloVe CRF NER tagger and plot its training curves.

    Args:
        train_file: Name of the training file inside
            ``./eng_data_mini_onefile/``. The dev and test files are
            fixed to ``eng.testa`` and ``eng.testb``.

    The model, loss curve and weight plots are written under
    ``resources/taggers/example-ner``.
    """
    # 1. get the corpus
    # Columns 1 and 2 are both mapped to the empty tag name.
    column_format = {0: 'text', 1: '', 2: '', 3: 'ner'}
    data_dir = './eng_data_mini_onefile/'
    corpus: TaggedCorpus = NLPTaskDataFetcher.load_column_corpus(
        data_dir,
        column_format,
        train_file=train_file,
        test_file='eng.testb',
        dev_file='eng.testa')
    print(corpus)

    # 2. the tag to predict
    tag_type = 'ner'

    # 3. tag dictionary built from the corpus
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
    print(tag_dictionary.idx2item)

    # 4. embeddings: GloVe only. CharacterEmbeddings() or
    #    FlairEmbeddings('news-forward') / FlairEmbeddings('news-backward')
    #    can be added to this list.
    stacked_parts: List[TokenEmbeddings] = [
        WordEmbeddings('glove'),
    ]
    embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=stacked_parts)

    # 5. sequence tagger
    from flair.models import SequenceTagger
    tagger: SequenceTagger = SequenceTagger(
        hidden_size=256,
        embeddings=embeddings,
        tag_dictionary=tag_dictionary,
        tag_type=tag_type,
        use_crf=True)

    # 6. trainer
    from flair.trainers import ModelTrainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)

    # 7. training
    trainer.train(
        'resources/taggers/example-ner',
        learning_rate=0.1,
        mini_batch_size=32,
        max_epochs=150)

    # 8. plot training curves (optional)
    from flair.visual.training_curves import Plotter
    plotter = Plotter()
    plotter.plot_training_curves('resources/taggers/example-ner/loss.tsv')
    plotter.plot_weights('resources/taggers/example-ner/weights.txt')
from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, CharacterEmbeddings, FlairEmbeddings, CharLMEmbeddings, ELMoEmbeddings, BertEmbeddings
from pathlib import Path
from typing import List

# 1. get the corpus
# Column layout of the data files: token text, then the NER tag.
columns = {0: 'text', 1: 'ner'}

# this is the folder in which train, test and dev files reside
data_folder = './'

# retrieve corpus using column format, data folder and the names of the
# train, dev and test files
# NOTE(review): dev_file and test_file point at the same file.
corpus: TaggedCorpus = NLPTaskDataFetcher.load_column_corpus(
    data_folder, columns,
    train_file='customData/usDL/train.txt',
    test_file='customData/usDL/test.txt',
    dev_file='customData/usDL/test.txt')

# len(corpus.train)
# Show the first training sentence with its 'ner' tags.
print(corpus.train[0].to_tagged_string('ner'))

# 2. what tag do we want to predict?
tag_type = 'ner'

# 3. make the tag dictionary from the corpus
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
print(tag_dictionary.idx2item)

# Machine-specific absolute path; presumably an embeddings cache
# directory, judging by the variable name -- TODO confirm against its use.
cachedir = Path(
    '/media/bubbles/fecf5b15-5a64-477b-8192-f8508a986ffe/ai/nishant/embeddings'
)
args = parser.parse_args() print(vars(args)) column_format = {0: 'text', 1: 'ner'} # the datafiles generated by our scripts have columns: text ner [weight] if args.include_weight: column_format[2] = 'weight' # this can be modified to individual needs. data_folder = os.path.join(args.data_folder_prefix, args.folder_name) model_folder = os.path.join(args.model_folder_prefix, args.folder_name) if args.include_weight: model_folder += '_w' # print(column_format) corpus: Corpus = NLPTaskDataFetcher.load_column_corpus(data_folder, column_format=column_format, tag_to_biloes="ner") tag_type = 'ner' tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type) embedding_types: List[TokenEmbeddings] = [ # GloVe embeddings WordEmbeddings('glove'), # contextual string embeddings, forward FlairEmbeddings('news-forward'), # PooledFlairEmbeddings('news-forward', pooling='min'),
from flair.data import TaggedCorpus from flair.data_fetcher import NLPTaskDataFetcher, NLPTask from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, CharacterEmbeddings from typing import List #1. get the corpus # define columns columns = {0: 'text', 1: 'ner'} # this is the folder in which train, test and dev files reside data_folder = './data/OntoNote4NER' # retrieve corpus using column format, data folder and the names of the train, dev and test files corpus: TaggedCorpus = NLPTaskDataFetcher.load_column_corpus( data_folder, columns, train_file='train.char.bmes', test_file='test.char.bmes', dev_file='dev.char.bmes') print(corpus) # 2. what tag do we want to predict? tag_type = 'ner' # 3. make the tag dictionary from the corpus tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type) print(tag_dictionary.idx2item) # 4. initialize embeddings embedding_types: List[TokenEmbeddings] = [ WordEmbeddings('custom'), # WordEmbeddings('glove'),
# define column columns = {0: 'text', 1: 'ner'} config = configparser.ConfigParser() config.read('config') data_folder = config.get('data', 'data_folder') train_file = config.get('data', 'train_file') test_file = config.get('data', 'test_file') dev_file = config.get('data', 'dev_file') # retrieve corpus using column format, data folder and the names of the train, dev and test files corpus: TaggedCorpus = NLPTaskDataFetcher.load_column_corpus( data_folder, columns, train_file=train_file, test_file=test_file, dev_file=dev_file) print(corpus) tag_type = 'ner' tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type) ### used for OwnELMoEmbeddings from flair import device cuda_device = 0 if str(device) != 'cpu' else -1 model = allennlp.commands.elmo.ElmoEmbedder( options_file='path_to_pretrain_elmo_options.json', weight_file='path_to_pretrain_elmo_weights.hdf5',
from hyperopt import hp
from flair.hyperparameter.param_selection import SearchSpace, Parameter
from flair.optim import SGDW
import os
import torch
import gensim
from gensim.models import Word2Vec
from gensim.models import FastText
import sys

print(" ")

# Column layout of the data files: token, POS tag, sub-label, label.
columns = {0: 'token', 1:'pos', 2: 'sublabel', 3:'label'}
data_folder = "data/"

# The train, test and dev file names are taken, in that order, from the
# first three command-line arguments.
corpus: TaggedCorpus = NLPTaskDataFetcher.load_column_corpus(data_folder, columns,
                                                             train_file=sys.argv[1],
                                                             test_file=sys.argv[2],
                                                             dev_file=sys.argv[3])

# Report the size of each split.
print(" ")
print("Train len: ", len(corpus.train))
print("Test len: ", len(corpus.test))
print("Dev len: ", len(corpus.dev))

# Show the first sentence of each split with its 'label' tags.
print(" ")
print("Train: ", corpus.train[0].to_tagged_string('label'))
print("Test: ", corpus.test[0].to_tagged_string('label'))
print("Dev: ", corpus.dev[0].to_tagged_string('label'))

# The tag to predict, and the dictionary of its values in the corpus.
tag_type = 'label'
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
from flair.data import TaggedCorpus, MultiCorpus
from flair.data_fetcher import NLPTaskDataFetcher
from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, BertEmbeddings
from typing import List
from flair.data import Dictionary
import flair, torch

# Run flair on the CPU.
flair.device = torch.device('cpu')

# Column layout of the data files: token text, then the NER tag.
columns = {0: 'text', 1: 'ner'}
data_folder = '../'

# Two corpora are loaded from the same folder; the second uses the
# "84max" variants of the files.
corpus1: TaggedCorpus = NLPTaskDataFetcher.load_column_corpus(data_folder, columns, train_file="de-da-te-ta.10E-4percent.conll.train.txt",
                                                              test_file="de-da-te-ta.10E-4percent.conll.test.txt",
                                                              dev_file="de-da-te-ta.10E-4percent.conll.dev.txt")
corpus2: TaggedCorpus = NLPTaskDataFetcher.load_column_corpus(data_folder, columns, train_file="de-da-te-ta.10E-4percent.conll.84max.train.txt",
                                                              test_file="de-da-te-ta.10E-4percent.conll.84max.test.txt",
                                                              dev_file="de-da-te-ta.10E-4percent.conll.84max.dev.txt")

# Combine both corpora into one.
corpus = MultiCorpus([corpus1, corpus2])

# The tag to predict, and its dictionary built from the combined corpus.
tag_type = 'ner'
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
#tag_dictionary: Dictionary = Dictionary.load('../vocab/m.model')

# Word embeddings loaded from local gensim files.
glove_embedding = WordEmbeddings('../../glove/GLOVE/GloVe/vectors.gensim')
word2vec_embedding = WordEmbeddings('../../huawei_w2v/vector.gensim')
#bert_embedding = BertEmbeddings('../bert_pretraining/pretraining_outputs/pretraining_output_batch_size_32')

# Stack the 'tr' word embeddings with the two local embeddings.
embedding_types: List[TokenEmbeddings] = [WordEmbeddings('tr'), glove_embedding, word2vec_embedding]
#embedding_types: List[TokenEmbeddings] = [custom_embedding]
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

from flair.models import SequenceTagger

# CRF tagger with a 2-layer RNN over the stacked embeddings.
tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                        embeddings=embeddings,
                                        tag_dictionary=tag_dictionary,
                                        tag_type=tag_type,
                                        use_crf=True,
                                        use_rnn=True,
                                        rnn_layers=2)

from flair.trainers import ModelTrainer
def train_tagger(options):
    """Train an NER tagger on custom Flair language-model embeddings.

    Args:
        options: Settings object. The attributes read here are
            ``iob_dir``, ``correction_mode``, ``tagger_dir``,
            ``ner_cycle``, ``lm_domain``, ``lm_dir``,
            ``use_wiki_wordemb``, ``use_press_wordemb``, ``use_crf`` and
            ``train_patience``.

    The output folder name encodes the chosen configuration through the
    suffixes ``-stringemb``, ``-wordemb`` / ``-wordemb-pr`` and ``-crf``.
    """
    # Column 0 of the data files is not mapped.
    column_format = {1: 'text', 2: 'pos', 3: 'ner'}
    # The tag to predict.
    tag_type = 'ner'

    # Folder holding train.txt, dev.txt and test.txt.
    data_dir = options.iob_dir + '/' + options.correction_mode

    # Folder for the trained model and its training artefacts.
    folder_parts = [
        options.tagger_dir, options.ner_cycle, options.lm_domain,
        options.correction_mode
    ]
    tagger_folder = '/'.join(folder_parts) + '-stringemb'

    corpus: TaggedCorpus = NLPTaskDataFetcher.load_column_corpus(
        data_dir,
        column_format,
        train_file='train.txt',
        test_file='test.txt',
        dev_file='dev.txt')

    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

    # Forward and backward character language models for the domain.
    lm_prefix = options.lm_dir + options.lm_domain
    char_embeddings = [
        FlairEmbeddings(lm_prefix + '-fw/best-lm.pt', use_cache=False),
        FlairEmbeddings(lm_prefix + '-bw/best-lm.pt', use_cache=False)
    ]

    # Optionally prepend word embeddings; the wiki option takes
    # precedence over the press option.
    if options.use_wiki_wordemb:
        embedding_types: List[TokenEmbeddings] = (
            [WordEmbeddings('fr')] + char_embeddings)
        tagger_folder += '-wordemb'
    elif options.use_press_wordemb:
        embedding_types: List[TokenEmbeddings] = [
            WordEmbeddings('resources.d/embeddings/fasttext/pressfr-wikifr')
        ] + char_embeddings
        tagger_folder += '-wordemb-pr'
    else:
        embedding_types: List[TokenEmbeddings] = char_embeddings

    if options.use_crf:
        tagger_folder += '-crf'

    # Print information
    print(tagger_folder)
    print(corpus)
    print(tag_dictionary.idx2item)

    embeddings: StackedEmbeddings = StackedEmbeddings(
        embeddings=embedding_types)

    tagger: SequenceTagger = SequenceTagger(
        hidden_size=256,
        embeddings=embeddings,
        tag_dictionary=tag_dictionary,
        tag_type=tag_type,
        use_crf=options.use_crf)

    trainer: ModelTrainer = ModelTrainer(tagger, corpus)
    trainer.train(
        tagger_folder,
        learning_rate=0.1,
        mini_batch_size=32,
        max_epochs=50,
        patience=options.train_patience,
        # train_with_dev=True,
        anneal_against_train_loss=False,
        embeddings_in_memory=False)

    # Plot training curves (optional)
    plotter = Plotter()
    plotter.plot_training_curves(tagger_folder + '/loss.tsv')
    plotter.plot_weights(tagger_folder + '/weights.txt')