def train(self, intent_fst) -> None:
    """Train intent classifier and named entity recognizers."""
    # pylint: disable=E0401
    from flair.data import Sentence, Token

    # pylint: disable=E0401
    from flair.models import SequenceTagger, TextClassifier

    # pylint: disable=E0401
    from flair.embeddings import (
        FlairEmbeddings,
        StackedEmbeddings,
        DocumentRNNEmbeddings,
    )

    # pylint: disable=E0401
    from flair.data import TaggedCorpus

    # pylint: disable=E0401
    from flair.trainers import ModelTrainer

    # Directory to look for downloaded embeddings
    cache_dir = self.profile.read_path(
        self.profile.get("intent.flair.cache_dir", "flair/cache")
    )
    os.makedirs(cache_dir, exist_ok=True)

    # Directory to store generated models
    data_dir = self.profile.write_path(
        self.profile.get("intent.flair.data_dir", "flair/data")
    )

    if os.path.exists(data_dir):
        shutil.rmtree(data_dir)

    self.embeddings = self.profile.get("intent.flair.embeddings", [])
    assert len(self.embeddings) > 0, "No word embeddings"

    # Create directories to write training data to
    class_data_dir = os.path.join(data_dir, "classification")
    ner_data_dir = os.path.join(data_dir, "ner")
    os.makedirs(class_data_dir, exist_ok=True)
    os.makedirs(ner_data_dir, exist_ok=True)

    # Convert FST to training data
    # ----------------------------

    # { intent: [ { 'text': ..., 'entities': { ... } }, ... ] }
    sentences_by_intent: Dict[str, Any] = {}

    # Get sentences for training
    do_sampling = self.profile.get("intent.flair.do_sampling", True)
    start_time = time.time()

    if do_sampling:
        # Sample from each intent FST
        num_samples = int(self.profile.get("intent.flair.num_samples", 10000))
        intent_map_path = self.profile.read_path(
            self.profile.get("training.intent.intent_map", "intent_map.json")
        )

        with open(intent_map_path, "r") as intent_map_file:
            intent_map = json.load(intent_map_file)

        # Gather FSTs for all known intents
        fsts_dir = self.profile.write_dir(
            self.profile.get("speech_to_text.fsts_dir")
        )

        intent_fst_paths = {
            intent_id: os.path.join(fsts_dir, f"{intent_id}.fst")
            for intent_id in intent_map.keys()
        }

        # Generate samples
        self._logger.debug(
            "Generating %s sample(s) from %s intent(s)",
            num_samples,
            len(intent_fst_paths),
        )

        sentences_by_intent = sample_sentences_by_intent(
            intent_fst_paths, num_samples
        )
    else:
        # Exhaustively generate all sentences
        self._logger.debug(
            "Generating all possible sentences (may take a long time)"
        )
        sentences_by_intent = make_sentences_by_intent(intent_fst)

    sentence_time = time.time() - start_time
    self._logger.debug("Generated sentences in %s second(s)", sentence_time)

    # Get least common multiple in order to balance sentences by intent
    lcm_sentences = lcm(*(len(sents) for sents in sentences_by_intent.values()))

    # Generate examples
    class_sentences = []
    ner_sentences: Dict[str, List[Sentence]] = defaultdict(list)
    for intent_name, intent_sents in sentences_by_intent.items():
        num_repeats = max(1, lcm_sentences // len(intent_sents))
        for intent_sent in intent_sents:
            # Only train an intent classifier if there's more than one intent
            if len(sentences_by_intent) > 1:
                # Add balanced copies
                for _ in range(num_repeats):
                    class_sent = Sentence(labels=[intent_name])
                    for word in intent_sent["tokens"]:
                        class_sent.add_token(Token(word))

                    class_sentences.append(class_sent)

            if len(intent_sent["entities"]) == 0:
                continue  # no entities, no sequence tagger

            # Named entity recognition (NER) example
            token_idx = 0
            entity_start = {ev["start"]: ev for ev in intent_sent["entities"]}
            entity_end = {ev["end"]: ev for ev in intent_sent["entities"]}
            entity = None

            word_tags = []
            for word in intent_sent["tokens"]:
                # Determine tag label
                tag = "O" if not entity else f"I-{entity}"
                if token_idx in entity_start:
                    entity = entity_start[token_idx]["entity"]
                    tag = f"B-{entity}"

                word_tags.append((word, tag))  # word ner

                token_idx += len(word) + 1

                if (token_idx - 1) in entity_end:
                    entity = None

            # Add balanced copies
            for _ in range(num_repeats):
                ner_sent = Sentence()
                for word, tag in word_tags:
                    token = Token(word)
                    token.add_tag("ner", tag)
                    ner_sent.add_token(token)

                ner_sentences[intent_name].append(ner_sent)

    # Start training
    max_epochs = int(self.profile.get("intent.flair.max_epochs", 100))

    # Load word embeddings
    self._logger.debug("Loading word embeddings from %s", cache_dir)
    word_embeddings = [
        FlairEmbeddings(os.path.join(cache_dir, "embeddings", e))
        for e in self.embeddings
    ]

    if len(class_sentences) > 0:
        self._logger.debug("Training intent classifier")

        # Random 80/10/10 split
        class_train, class_dev, class_test = self._split_data(class_sentences)
        class_corpus = TaggedCorpus(class_train, class_dev, class_test)

        # Intent classification
        doc_embeddings = DocumentRNNEmbeddings(
            word_embeddings,
            hidden_size=512,
            reproject_words=True,
            reproject_words_dimension=256,
        )

        classifier = TextClassifier(
            doc_embeddings,
            label_dictionary=class_corpus.make_label_dictionary(),
            multi_label=False,
        )

        self._logger.debug(
            "Intent classifier has %s example(s)", len(class_sentences)
        )
        trainer = ModelTrainer(classifier, class_corpus)
        trainer.train(class_data_dir, max_epochs=max_epochs)
    else:
        self._logger.info("Skipping intent classifier training")

    if len(ner_sentences) > 0:
        self._logger.debug("Training %s NER sequence tagger(s)", len(ner_sentences))

        # Named entity recognition
        stacked_embeddings = StackedEmbeddings(word_embeddings)

        for intent_name, intent_ner_sents in ner_sentences.items():
            ner_train, ner_dev, ner_test = self._split_data(intent_ner_sents)
            ner_corpus = TaggedCorpus(ner_train, ner_dev, ner_test)

            tagger = SequenceTagger(
                hidden_size=256,
                embeddings=stacked_embeddings,
                tag_dictionary=ner_corpus.make_tag_dictionary(tag_type="ner"),
                tag_type="ner",
                use_crf=True,
            )

            ner_intent_dir = os.path.join(ner_data_dir, intent_name)
            os.makedirs(ner_intent_dir, exist_ok=True)

            self._logger.debug(
                "NER tagger for %s has %s example(s)",
                intent_name,
                len(intent_ner_sents),
            )
            trainer = ModelTrainer(tagger, ner_corpus)
            trainer.train(ner_intent_dir, max_epochs=max_epochs)
    else:
        self._logger.info("Skipping NER sequence tagger training")
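# The LCM balancing above repeats each intent's sentences until every intent
# contributes the same number of training examples. A minimal standalone
# sketch of that idea; the variadic lcm() helper here is an assumption, not
# the one imported by the method above.
from functools import reduce
from math import gcd

def lcm(*counts: int) -> int:
    """Least common multiple of all sentence counts."""
    return reduce(lambda a, b: a * b // gcd(a, b), counts, 1)

counts = {"SetTimer": 4, "GetWeather": 6}
target = lcm(*counts.values())                                    # 12
repeats = {name: max(1, target // n) for name, n in counts.items()}
# {'SetTimer': 3, 'GetWeather': 2} -> both intents yield 12 examples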
def load_classification_corpus(
        data_folder: Union[str, Path],
        train_file=None,
        test_file=None,
        dev_file=None,
        use_tokenizer: bool = True) -> TaggedCorpus:
    """
    Helper function to get a TaggedCorpus from text classification-formatted task data

    :param data_folder: base folder with the task data
    :param train_file: the name of the train file
    :param test_file: the name of the test file
    :param dev_file: the name of the dev file, if None, dev data is sampled from train
    :return: a TaggedCorpus with annotated train, dev and test data
    """

    if type(data_folder) == str:
        data_folder: Path = Path(data_folder)

    if train_file is not None:
        train_file = data_folder / train_file
    if test_file is not None:
        test_file = data_folder / test_file
    if dev_file is not None:
        dev_file = data_folder / dev_file

    # automatically identify train / test / dev files
    if train_file is None:
        for file in data_folder.iterdir():
            file_name = file.name
            if 'train' in file_name:
                train_file = file
            if 'test' in file_name:
                test_file = file
            if 'dev' in file_name:
                dev_file = file
            if 'testa' in file_name:
                dev_file = file
            if 'testb' in file_name:
                test_file = file

    log.info("Reading data from {}".format(data_folder))
    log.info("Train: {}".format(train_file))
    log.info("Dev: {}".format(dev_file))
    log.info("Test: {}".format(test_file))

    def make_read_text_classification_file(f):
        return lambda: NLPTaskDataFetcher.read_text_classification_file(
            f, use_tokenizer=use_tokenizer)

    sentences_train: Iterable[Sentence] = make_read_text_classification_file(train_file)
    sentences_test: Iterable[Sentence] = make_read_text_classification_file(test_file)

    if test_file is None or dev_file is None:
        total_number_of_sentences = 0
        for sentence in sentences_train():
            total_number_of_sentences += 1
        train_indexes = set(range(0, total_number_of_sentences))

    if dev_file is not None:
        sentences_dev: Iterable[Sentence] = make_read_text_classification_file(dev_file)
    else:
        dev_indexes = NLPTaskDataFetcher.__sample(train_indexes, 0.1)
        train_indexes = sorted(train_indexes - dev_indexes)
        sentences_dev: Iterable[Sentence] = NLPTaskDataFetcher.make_sample(
            sentences_train, dev_indexes)
        sentences_train = NLPTaskDataFetcher.make_sample(sentences_train, train_indexes)

    return TaggedCorpus(sentences_train, sentences_dev, sentences_test)
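# A minimal usage sketch for the lazy loader above, assuming FastText-style
# classification files ("__label__<class> <text>" per line) in a hypothetical
# ./data folder with these file names.
corpus = load_classification_corpus(
    "data",
    train_file="train.txt",
    dev_file="dev.txt",
    test_file="test.txt",
)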
import gensim
import re

from flair.data import Sentence, TaggedCorpus, Token
from flair.data_fetcher import NLPTaskDataFetcher, NLPTask
from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, CharLMEmbeddings
from typing import List

sentences_train: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(
    "universal-dependencies-1.2/UD_Bulgarian/bg-ud-train.conllu")
sentences_dev: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(
    "universal-dependencies-1.2/UD_Bulgarian/bg-ud-dev.conllu")
sentences_test: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(
    "universal-dependencies-1.2/UD_Bulgarian/bg-ud-test.conllu")

corpus: TaggedCorpus = TaggedCorpus(sentences_train, sentences_dev, sentences_test)

tag_type = 'upos'
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

# convert the FastText vectors to gensim format and wrap them as a custom embedding
word_vectors = gensim.models.KeyedVectors.load_word2vec_format('wiki.bg.vec', binary=False)
word_vectors.save('wiki.bg.vec.gensim')
custom_embedding = WordEmbeddings('custom', 'wiki.bg.vec.gensim')

char_lm_forward = CharLMEmbeddings('lm-bg-small-forward-v0.1.pt')
char_lm_backward = CharLMEmbeddings('lm-bg-small-backward-v0.1.pt')

embedding_types: List[TokenEmbeddings] = [
    custom_embedding,
    char_lm_forward,
    char_lm_backward,
]
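# The snippet stops at the embedding list; the usual next steps with these
# embeddings are stacking them and training a SequenceTagger. A sketch under
# that assumption -- the trainer class, output path, and hyperparameters are
# guesses, not taken from the source.
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer

embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                        embeddings=embeddings,
                                        tag_dictionary=tag_dictionary,
                                        tag_type=tag_type,
                                        use_crf=True)

trainer: ModelTrainer = ModelTrainer(tagger, corpus)
trainer.train('resources/taggers/upos-bulgarian', max_epochs=150)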
def load_classification_corpus(data_folder: Union[str, Path],
                               train_file=None,
                               test_file=None,
                               dev_file=None,
                               use_tokenizer: bool = True,
                               max_tokens_per_doc=-1) -> TaggedCorpus:
    """
    Helper function to get a TaggedCorpus from text classification-formatted task data

    :param data_folder: base folder with the task data
    :param train_file: the name of the train file
    :param test_file: the name of the test file
    :param dev_file: the name of the dev file, if None, dev data is sampled from train
    :return: a TaggedCorpus with annotated train, dev and test data
    """

    if type(data_folder) == str:
        data_folder: Path = Path(data_folder)

    if train_file is not None:
        train_file = data_folder / train_file
    if test_file is not None:
        test_file = data_folder / test_file
    if dev_file is not None:
        dev_file = data_folder / dev_file

    # automatically identify train / test / dev files
    if train_file is None:
        for file in data_folder.iterdir():
            file_name = file.name
            if "train" in file_name:
                train_file = file
            if "test" in file_name:
                test_file = file
            if "dev" in file_name:
                dev_file = file
            if "testa" in file_name:
                dev_file = file
            if "testb" in file_name:
                test_file = file

    log.info("Reading data from {}".format(data_folder))
    log.info("Train: {}".format(train_file))
    log.info("Dev: {}".format(dev_file))
    log.info("Test: {}".format(test_file))

    sentences_train: List[Sentence] = NLPTaskDataFetcher.read_text_classification_file(
        train_file, use_tokenizer=use_tokenizer, max_tokens_per_doc=max_tokens_per_doc)
    sentences_test: List[Sentence] = NLPTaskDataFetcher.read_text_classification_file(
        test_file, use_tokenizer=use_tokenizer, max_tokens_per_doc=max_tokens_per_doc)

    if dev_file is not None:
        sentences_dev: List[Sentence] = NLPTaskDataFetcher.read_text_classification_file(
            dev_file, use_tokenizer=use_tokenizer, max_tokens_per_doc=max_tokens_per_doc)
    else:
        sentences_dev: List[Sentence] = [
            sentences_train[i]
            for i in NLPTaskDataFetcher.__sample(len(sentences_train), 0.1)
        ]
        sentences_train = [x for x in sentences_train if x not in sentences_dev]

    return TaggedCorpus(sentences_train, sentences_dev, sentences_test)
def load_column_corpus(
        data_folder: Union[str, Path],
        column_format: Dict[int, str],
        train_file=None,
        test_file=None,
        dev_file=None,
        tag_to_biloes=None) -> TaggedCorpus:
    """
    Helper function to get a TaggedCorpus from CoNLL column-formatted task data such as CoNLL03 or CoNLL2000.

    :param data_folder: base folder with the task data
    :param column_format: a map specifying the column format
    :param train_file: the name of the train file
    :param test_file: the name of the test file
    :param dev_file: the name of the dev file, if None, dev data is sampled from train
    :param tag_to_biloes: whether to convert to BILOES tagging scheme
    :return: a TaggedCorpus with annotated train, dev and test data
    """

    if type(data_folder) == str:
        data_folder: Path = Path(data_folder)

    if train_file is not None:
        train_file = data_folder / train_file
    if test_file is not None:
        test_file = data_folder / test_file
    if dev_file is not None:
        dev_file = data_folder / dev_file

    # automatically identify train / test / dev files
    if train_file is None:
        for file in data_folder.iterdir():
            file_name = file.name
            if file_name.endswith('.gz'):
                continue
            if 'train' in file_name and not '54019' in file_name:
                train_file = file
            if 'dev' in file_name:
                dev_file = file
            if 'testa' in file_name:
                dev_file = file
            if 'testb' in file_name:
                test_file = file

        # if no test file is found, take any file with 'test' in name
        if test_file is None:
            for file in data_folder.iterdir():
                file_name = file.name
                if file_name.endswith('.gz'):
                    continue
                if 'test' in file_name:
                    test_file = file

    log.info("Reading data from {}".format(data_folder))
    log.info("Train: {}".format(train_file))
    log.info("Dev: {}".format(dev_file))
    log.info("Test: {}".format(test_file))

    def make_read_column_data(f):
        return lambda: NLPTaskDataFetcher.read_column_data(f, column_format)

    # get train and test data
    sentences_train: Iterable[Sentence] = make_read_column_data(train_file)

    # read in test file if exists, otherwise sample 10% of train data as test dataset
    if test_file is None or dev_file is None:
        total_number_of_sentences = 0
        for sentence in sentences_train():
            total_number_of_sentences += 1
        train_indexes = set(range(0, total_number_of_sentences))

    if test_file is not None:
        sentences_test: Iterable[Sentence] = make_read_column_data(test_file)
    else:
        test_indexes = NLPTaskDataFetcher.__sample(train_indexes, 0.1)
        train_indexes = sorted(train_indexes - test_indexes)
        sentences_test: Iterable[Sentence] = NLPTaskDataFetcher.make_sample(
            sentences_train, test_indexes)
        sentences_train = NLPTaskDataFetcher.make_sample(sentences_train, train_indexes)

    # read in dev file if exists, otherwise sample 10% of train data as dev dataset
    if dev_file is not None:
        sentences_dev: List[Sentence] = make_read_column_data(dev_file)
    else:
        dev_indexes = NLPTaskDataFetcher.__sample(train_indexes, 0.1)
        train_indexes = sorted(train_indexes - dev_indexes)
        sentences_dev: Iterable[Sentence] = NLPTaskDataFetcher.make_sample(
            sentences_train, dev_indexes)
        sentences_train = NLPTaskDataFetcher.make_sample(sentences_train, train_indexes)

    corpus = TaggedCorpus(sentences_train, sentences_dev, sentences_test,
                          name=data_folder.name)

    if tag_to_biloes is not None:
        # convert tag scheme to iobes
        for sentence in corpus.get_all_sentences():
            sentence.convert_tag_scheme(tag_type=tag_to_biloes, target_scheme='iobes')

    return corpus
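# A usage sketch for the column loader above, assuming CoNLL03-style files in
# a hypothetical ./conll_03 folder (testa is conventionally the dev split,
# testb the test split).
columns = {0: 'text', 1: 'pos', 2: 'np', 3: 'ner'}
corpus = load_column_corpus(
    'conll_03',
    columns,
    train_file='eng.train',
    test_file='eng.testb',
    dev_file='eng.testa',
    tag_to_biloes='ner')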
def fetch_data(task: NLPTask) -> TaggedCorpus:
    if task == NLPTask.CONLL_03 or task == NLPTask.ONTONER or task == NLPTask.FASHION:
        data_folder = os.path.join('resources', 'tasks', 'conll_03')
        if task == NLPTask.ONTONER:
            data_folder = os.path.join('resources', 'tasks', 'onto-ner')
        if task == NLPTask.FASHION:
            data_folder = os.path.join('resources', 'tasks', 'fashion')

        sentences_train: List[Sentence] = NLPTaskDataFetcher.read_conll_sequence_labeling_data(
            os.path.join(data_folder, 'eng.train'))
        sentences_dev: List[Sentence] = NLPTaskDataFetcher.read_conll_sequence_labeling_data(
            os.path.join(data_folder, 'eng.testa'))
        sentences_test: List[Sentence] = NLPTaskDataFetcher.read_conll_sequence_labeling_data(
            os.path.join(data_folder, 'eng.testb'))

        for sentence in sentences_train + sentences_test + sentences_dev:
            sentence: Sentence = sentence
            sentence.convert_tag_scheme(tag_type='ner', target_scheme='iobes')

        return TaggedCorpus(sentences_train, sentences_dev, sentences_test)

    if task == NLPTask.CONLL_2000:
        data_folder = os.path.join('resources', 'tasks', 'conll_2000')

        sentences_train: List[Sentence] = NLPTaskDataFetcher.read_conll_sequence_labeling_data(
            os.path.join(data_folder, 'train.txt'))
        sentences_dev: List[Sentence] = [
            sentences_train[i] for i in NLPTaskDataFetcher._sample()
        ]
        sentences_train = [x for x in sentences_train if x not in sentences_dev]
        sentences_test: List[Sentence] = NLPTaskDataFetcher.read_conll_sequence_labeling_data(
            os.path.join(data_folder, 'test.txt'))

        for sentence in sentences_train + sentences_test + sentences_dev:
            sentence: Sentence = sentence
            sentence.convert_tag_scheme(tag_type='np', target_scheme='iobes')

        return TaggedCorpus(sentences_train, sentences_dev, sentences_test)

    if task == NLPTask.UD_ENGLISH:
        data_folder = os.path.join('resources', 'tasks', 'ud')

        sentences_train: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(
            os.path.join(data_folder, 'en-ud-train.conllu'))
        sentences_dev: List[Sentence] = [
            sentences_train[i] for i in NLPTaskDataFetcher._sample()
        ]
        sentences_train = [x for x in sentences_train if x not in sentences_dev]
        sentences_test: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(
            os.path.join(data_folder, 'en-ud-dev.conllu'))

        return TaggedCorpus(sentences_train, sentences_dev, sentences_test)

    if task == NLPTask.UD_GERMAN:
        data_folder = os.path.join('resources', 'tasks', 'ud-ger')

        sentences_train: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(
            os.path.join(data_folder, 'de_gsd-ud-train.conllu'))
        sentences_test: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(
            os.path.join(data_folder, 'de_gsd-ud-test.conllu'))
        sentences_dev: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(
            os.path.join(data_folder, 'de_gsd-ud-dev.conllu'))

        return TaggedCorpus(sentences_train, sentences_dev, sentences_test)

    if task == NLPTask.ONTONOTES:
        data_folder = os.path.join('resources', 'tasks', 'ontonotes')

        sentences_train: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(
            os.path.join(data_folder, 'train.conllu'))
        sentences_test: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(
            os.path.join(data_folder, 'test.conllu'))
        sentences_dev: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(
            os.path.join(data_folder, 'dev.conllu'))

        return TaggedCorpus(sentences_train, sentences_dev, sentences_test)

    if task == NLPTask.CONLL_12:
        data_folder = os.path.join('resources', 'tasks', 'conll_12')

        sentences_train: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(
            os.path.join(data_folder, 'train.propbank.conllu'))
        sentences_test: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(
            os.path.join(data_folder, 'test.propbank.conllu'))
        sentences_dev: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(
            os.path.join(data_folder, 'dev.propbank.conllu'))

        return TaggedCorpus(sentences_train, sentences_dev, sentences_test)

    if task == NLPTask.SRL:
        data_folder = os.path.join('resources', 'tasks', 'srl')

        sentences_train: List[Sentence] = NLPTaskDataFetcher.read_conll_2_column_data(
            os.path.join(data_folder, 'train.srl.conll'), 'srl')
        sentences_test: List[Sentence] = NLPTaskDataFetcher.read_conll_2_column_data(
            os.path.join(data_folder, 'test.srl.conll'), 'srl')
        sentences_dev: List[Sentence] = NLPTaskDataFetcher.read_conll_2_column_data(
            os.path.join(data_folder, 'dev.srl.conll'), 'srl')

        return TaggedCorpus(sentences_train, sentences_dev, sentences_test)

    if task == NLPTask.PENN:
        data_folder = os.path.join('resources', 'tasks', 'penn')

        sentences_train: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(
            os.path.join(data_folder, 'train.conll'))
        sentences_dev: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(
            os.path.join(data_folder, 'valid.conll'))
        sentences_test: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(
            os.path.join(data_folder, 'test.conll'))

        return TaggedCorpus(sentences_train, sentences_dev, sentences_test)

    if task == NLPTask.CONLL_03_GERMAN:
        data_folder = os.path.join('resources', 'tasks', 'conll_03-ger')

        sentences_train: List[Sentence] = NLPTaskDataFetcher.read_conll_03_german(
            os.path.join(data_folder, 'deu.train'), tag_scheme='iobes')
        sentences_dev: List[Sentence] = NLPTaskDataFetcher.read_conll_03_german(
            os.path.join(data_folder, 'deu.testa'), tag_scheme='iobes')
        sentences_test: List[Sentence] = NLPTaskDataFetcher.read_conll_03_german(
            os.path.join(data_folder, 'deu.testb'), tag_scheme='iobes')

        return TaggedCorpus(sentences_train, sentences_dev, sentences_test)

    if task == NLPTask.GERMEVAL:
        data_folder = os.path.join('resources', 'tasks', 'germeval')

        sentences_train: List[Sentence] = NLPTaskDataFetcher.read_germeval(
            os.path.join(data_folder, 'NER-de-train.tsv'), tag_scheme='iobes')
        sentences_dev: List[Sentence] = NLPTaskDataFetcher.read_germeval(
            os.path.join(data_folder, 'NER-de-dev.tsv'), tag_scheme='iobes')
        sentences_test: List[Sentence] = NLPTaskDataFetcher.read_germeval(
            os.path.join(data_folder, 'NER-de-test.tsv'), tag_scheme='iobes')

        return TaggedCorpus(sentences_train, sentences_dev, sentences_test)

    if task == NLPTask.IMDB:
        data_folder = os.path.join('resources', 'tasks', 'imdb')

        sentences_train: List[Sentence] = NLPTaskDataFetcher.read_text_classification_file(
            os.path.join(data_folder, 'train.txt'))
        sentences_dev: List[Sentence] = NLPTaskDataFetcher.read_text_classification_file(
            os.path.join(data_folder, 'dev.txt'))
        sentences_test: List[Sentence] = NLPTaskDataFetcher.read_text_classification_file(
            os.path.join(data_folder, 'test.txt'))

        return TaggedCorpus(sentences_train, sentences_dev, sentences_test)

    if task == NLPTask.AG_NEWS:
        data_folder = os.path.join('resources', 'tasks', 'ag_news')

        sentences_train: List[Sentence] = NLPTaskDataFetcher.read_text_classification_file(
            os.path.join(data_folder, 'train.txt'))
        sentences_dev: List[Sentence] = NLPTaskDataFetcher.read_text_classification_file(
            os.path.join(data_folder, 'dev.txt'))
        sentences_test: List[Sentence] = NLPTaskDataFetcher.read_text_classification_file(
            os.path.join(data_folder, 'test.txt'))

        return TaggedCorpus(sentences_train, sentences_dev, sentences_test)
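# A quick sanity check after fetching, assuming the resources/tasks/... layout
# above is populated; NLPTask.UD_GERMAN is just an example choice.
corpus = fetch_data(NLPTask.UD_GERMAN)
print(len(corpus.train), len(corpus.dev), len(corpus.test))  # split sizes
tag_dictionary = corpus.make_tag_dictionary(tag_type='upos')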
def load_column_corpus(
        data_folder: Union[str, Path],
        column_format: Dict[int, str],
        train_file=None,
        test_file=None,
        dev_file=None,
        tag_to_biloes=None,
) -> TaggedCorpus:
    """
    Helper function to get a TaggedCorpus from CoNLL column-formatted task data such as CoNLL03 or CoNLL2000.

    :param data_folder: base folder with the task data
    :param column_format: a map specifying the column format
    :param train_file: the name of the train file
    :param test_file: the name of the test file
    :param dev_file: the name of the dev file, if None, dev data is sampled from train
    :param tag_to_biloes: whether to convert to BILOES tagging scheme
    :return: a TaggedCorpus with annotated train, dev and test data
    """

    if type(data_folder) == str:
        data_folder: Path = Path(data_folder)

    if train_file is not None:
        train_file = data_folder / train_file
    if test_file is not None:
        test_file = data_folder / test_file
    if dev_file is not None:
        dev_file = data_folder / dev_file

    # automatically identify train / test / dev files
    if train_file is None:
        for file in data_folder.iterdir():
            file_name = file.name
            if file_name.endswith(".gz"):
                continue
            if "train" in file_name and not "54019" in file_name:
                train_file = file
            if "dev" in file_name:
                dev_file = file
            if "testa" in file_name:
                dev_file = file
            if "testb" in file_name:
                test_file = file

        # if no test file is found, take any file with 'test' in name
        if test_file is None:
            for file in data_folder.iterdir():
                file_name = file.name
                if file_name.endswith(".gz"):
                    continue
                if "test" in file_name:
                    test_file = file

    log.info("Reading data from {}".format(data_folder))
    log.info("Train: {}".format(train_file))
    log.info("Dev: {}".format(dev_file))
    log.info("Test: {}".format(test_file))

    # get train and test data
    sentences_train: List[Sentence] = NLPTaskDataFetcher.read_column_data(
        train_file, column_format)

    # read in test file if exists, otherwise sample 10% of train data as test dataset
    if test_file is not None:
        sentences_test: List[Sentence] = NLPTaskDataFetcher.read_column_data(
            test_file, column_format)
    else:
        sentences_test: List[Sentence] = [
            sentences_train[i]
            for i in NLPTaskDataFetcher.__sample(len(sentences_train), 0.1)
        ]
        sentences_train = [x for x in sentences_train if x not in sentences_test]

    # read in dev file if exists, otherwise sample 10% of train data as dev dataset
    if dev_file is not None:
        sentences_dev: List[Sentence] = NLPTaskDataFetcher.read_column_data(
            dev_file, column_format)
    else:
        sentences_dev: List[Sentence] = [
            sentences_train[i]
            for i in NLPTaskDataFetcher.__sample(len(sentences_train), 0.1)
        ]
        sentences_train = [x for x in sentences_train if x not in sentences_dev]

    if tag_to_biloes is not None:
        # convert tag scheme to iobes
        for sentence in sentences_train + sentences_test + sentences_dev:
            sentence.convert_tag_scheme(tag_type=tag_to_biloes, target_scheme="iobes")

    return TaggedCorpus(sentences_train, sentences_dev, sentences_test,
                        name=data_folder.name)
def fetch_data(task: NLPTask) -> TaggedCorpus:
    """
    Helper function to fetch a TaggedCorpus for a specific NLPTask. For this to work you need to first download
    and put into the appropriate folder structure the corresponding NLP task data. The tutorials on
    https://github.com/zalandoresearch/flair give more info on how to do this. Alternatively, you can use this
    code to create your own data fetchers.
    :param task: specification of the NLPTask you wish to get
    :return: a TaggedCorpus consisting of train, dev and test data
    """

    data_folder = os.path.join('resources', 'tasks', str(task.value).lower())
    print("reading data from {}".format(data_folder))

    # the CoNLL 2000 task on chunking has three columns: text, pos and np (chunk)
    if task == NLPTask.CONLL_2000:
        columns = {0: 'text', 1: 'pos', 2: 'np'}

        return NLPTaskDataFetcher.fetch_column_corpus(
            data_folder,
            columns,
            train_file='train.txt',
            test_file='test.txt',
            tag_to_biloes='np')

    # many NER tasks follow the CoNLL 03 format with four columns: text, pos, np and ner tag
    if task == NLPTask.CONLL_03 or task == NLPTask.ONTONER or task == NLPTask.FASHION:
        columns = {0: 'text', 1: 'pos', 2: 'np', 3: 'ner'}

        return NLPTaskDataFetcher.fetch_column_corpus(
            data_folder,
            columns,
            train_file='eng.train',
            test_file='eng.testb',
            dev_file='eng.testa',
            tag_to_biloes='ner')

    # the CoNLL 03 task for German has an additional lemma column
    if task == NLPTask.CONLL_03_GERMAN:
        columns = {0: 'text', 1: 'lemma', 2: 'pos', 3: 'np', 4: 'ner'}

        return NLPTaskDataFetcher.fetch_column_corpus(
            data_folder,
            columns,
            train_file='deu.train',
            test_file='deu.testb',
            dev_file='deu.testa',
            tag_to_biloes='ner')

    # the GERMEVAL task only has two columns: text and ner
    if task == NLPTask.GERMEVAL:
        columns = {1: 'text', 2: 'ner'}

        return NLPTaskDataFetcher.fetch_column_corpus(
            data_folder,
            columns,
            train_file='NER-de-train.tsv',
            test_file='NER-de-test.tsv',
            dev_file='NER-de-dev.tsv',
            tag_to_biloes='ner')

    # WSD tasks may be put into this column format
    if task == NLPTask.WSD:
        columns = {0: 'text', 1: 'lemma', 2: 'pos', 3: 'sense'}

        return NLPTaskDataFetcher.fetch_column_corpus(
            data_folder,
            columns,
            train_file='semcor.tsv',
            test_file='semeval2015.tsv')

    # the UD corpora follow the CoNLL-U format, for which we have a special reader
    if task == NLPTask.UD_ENGLISH:
        # get train, test and dev data
        sentences_train: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(
            os.path.join(data_folder, 'en_ewt-ud-train.conllu'))
        sentences_test: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(
            os.path.join(data_folder, 'en_ewt-ud-test.conllu'))
        sentences_dev: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(
            os.path.join(data_folder, 'en_ewt-ud-dev.conllu'))

        return TaggedCorpus(sentences_train, sentences_dev, sentences_test)

    if task == NLPTask.UD_GERMAN:
        # get train, test and dev data
        sentences_train: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(
            os.path.join(data_folder, 'de_gsd-ud-train.conllu'))
        sentences_test: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(
            os.path.join(data_folder, 'de_gsd-ud-test.conllu'))
        sentences_dev: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(
            os.path.join(data_folder, 'de_gsd-ud-dev.conllu'))

        return TaggedCorpus(sentences_train, sentences_dev, sentences_test)

    if task == NLPTask.ONTONOTES:
        # get train, test and dev data
        sentences_train: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(
            os.path.join(data_folder, 'train.conllu'))
        sentences_test: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(
            os.path.join(data_folder, 'test.conllu'))
        sentences_dev: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(
            os.path.join(data_folder, 'dev.conllu'))

        return TaggedCorpus(sentences_train, sentences_dev, sentences_test)

    if task == NLPTask.CONLL_12:
        # get train, test and dev data
        sentences_train: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(
            os.path.join(data_folder, 'train.propbank.conllu'))
        sentences_test: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(
            os.path.join(data_folder, 'test.propbank.conllu'))
        sentences_dev: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(
            os.path.join(data_folder, 'dev.propbank.conllu'))

        return TaggedCorpus(sentences_train, sentences_dev, sentences_test)

    if task == NLPTask.PENN:
        sentences_train: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(
            os.path.join(data_folder, 'train.conll'))
        sentences_dev: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(
            os.path.join(data_folder, 'valid.conll'))
        sentences_test: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(
            os.path.join(data_folder, 'test.conll'))

        return TaggedCorpus(sentences_train, sentences_dev, sentences_test)

    # for text classifiers, we use our own special format
    if task == NLPTask.IMDB:
        sentences_train: List[Sentence] = NLPTaskDataFetcher.read_text_classification_file(
            os.path.join(data_folder, 'train.txt'))
        sentences_dev: List[Sentence] = NLPTaskDataFetcher.read_text_classification_file(
            os.path.join(data_folder, 'dev.txt'))
        sentences_test: List[Sentence] = NLPTaskDataFetcher.read_text_classification_file(
            os.path.join(data_folder, 'test.txt'))

        return TaggedCorpus(sentences_train, sentences_dev, sentences_test)

    # for text classifiers, we use our own special format
    if task == NLPTask.AG_NEWS:
        sentences_train: List[Sentence] = NLPTaskDataFetcher.read_text_classification_file(
            os.path.join(data_folder, 'train.txt'))
        sentences_dev: List[Sentence] = NLPTaskDataFetcher.read_text_classification_file(
            os.path.join(data_folder, 'dev.txt'))
        sentences_test: List[Sentence] = NLPTaskDataFetcher.read_text_classification_file(
            os.path.join(data_folder, 'test.txt'))

        return TaggedCorpus(sentences_train, sentences_dev, sentences_test)
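# The column-based branches above all delegate to fetch_column_corpus; calling
# it directly with the same arguments as the CoNLL-03 branch looks like this
# (the folder layout is assumed to be in place).
corpus = NLPTaskDataFetcher.fetch_column_corpus(
    os.path.join('resources', 'tasks', 'conll_03'),
    {0: 'text', 1: 'pos', 2: 'np', 3: 'ner'},
    train_file='eng.train',
    test_file='eng.testb',
    dev_file='eng.testa',
    tag_to_biloes='ner')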
def read_group(entities):
    sentences_dev = read_group_file('data/dev.tsv', entities)
    sentences_train = read_group_file('data/train.tsv', entities)
    # the dev split doubles as the test split here
    return TaggedCorpus(sentences_train, sentences_dev, sentences_dev)
from typing import List

from flair.data import Sentence, TaggedCorpus
from flair.data_fetcher import NLPTaskDataFetcher, NLPTask
from flair.embeddings import WordEmbeddings, CharLMEmbeddings, DocumentLSTMEmbeddings
from flair.models.text_classification_model import TextClassifier
from flair.trainers.text_classification_trainer import TextClassifierTrainer

sentences_train: List[Sentence] = NLPTaskDataFetcher.read_text_classification_file('training.preprocessed.txt')
sentences_dev: List[Sentence] = NLPTaskDataFetcher.read_text_classification_file('dev.preprocessed.txt')
sentences_test: List[Sentence] = NLPTaskDataFetcher.read_text_classification_file('test.preprocessed.txt')

corpus = TaggedCorpus(sentences_train, sentences_dev, sentences_test)

# 2. create the label dictionary
label_dict = corpus.make_label_dictionary()

# 3. make a list of word embeddings
word_embeddings = [WordEmbeddings('de-fasttext'),
                   CharLMEmbeddings('german-forward'),
                   CharLMEmbeddings('german-backward')]

# 4. init document embedding by passing list of word embeddings
document_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings(word_embeddings, hidden_states=32)

# 5. create the text classifier
classifier = TextClassifier(document_embeddings, label_dictionary=label_dict, multi_label=False)

# 6. initialize the text classifier trainer
trainer = TextClassifierTrainer(classifier, corpus, label_dict)
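# 7. start the training -- the snippet stops before this step; with the
# pre-0.4 TextClassifierTrainer used here, the call would look roughly like
# this (output path and hyperparameters are assumptions, not from the source).
trainer.train('resources/classifier',
              learning_rate=0.1,
              mini_batch_size=32,
              max_epochs=150)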
def train(self, X, y):
    X_text = X[:, self.args.TEXT_COL]
    y = y.flatten()
    # corpus: TaggedCorpus = NLPTaskDataFetcher.load_corpus(NLPTask.CONLL_03)
    train: List[Sentence] = []
    for tweet, label in zip(X_text, y):
        if tweet == '':
            tweet = 'dummy'
        s: Sentence = Sentence(tweet)
        s.add_label(str(label))
        train.append(s)

    # the training set is reused as both dev and test split
    corpus: TaggedCorpus = TaggedCorpus(train, train, train)

    # 2. create the label dictionary
    label_dict = corpus.make_label_dictionary()

    # 3. make a list of word embeddings
    word_embeddings = [
        glove_embeddings,
        #twitter_embeddings,
        # comment in this line to use character embeddings
        #CharacterEmbeddings(),
        # comment in flair embeddings for state-of-the-art results
        # FlairEmbeddings('news-forward'),
        fflair,
        # FlairEmbeddings('news-backward'),
        bflair
    ]

    # 4. initialize document embedding by passing list of word embeddings
    document_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings(
        word_embeddings,
        hidden_size=512,
        reproject_words=True,
        reproject_words_dimension=256,
    )

    # 5. create the text classifier
    classifier = TextClassifier(document_embeddings,
                                label_dictionary=label_dict,
                                multi_label=False)

    # 6. initialize the text classifier trainer
    trainer = ModelTrainer(classifier, corpus)
    self.model = trainer.model
    self.model.save = self.save
    self.model.save_checkpoint = self.save_checkpoint

    # 7. start the training
    trainer.train('../data/ecuador_earthquake_2016/models',
                  learning_rate=0.1,
                  mini_batch_size=32,
                  anneal_factor=0.5,
                  patience=5,
                  max_epochs=5)

    self.clf = classifier
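# After training, self.clf can classify new texts. A hedged sketch of a
# companion predict() method (hypothetical, not from the source); it relies on
# flair's TextClassifier.predict, which attaches labels to the sentences.
def predict(self, X):
    sentences = [Sentence(text if text else 'dummy')
                 for text in X[:, self.args.TEXT_COL]]
    self.clf.predict(sentences)
    return [s.labels[0].value for s in sentences]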
X_text = X[:, self.args.TEXT_COL]
y = y.flatten()
# corpus: TaggedCorpus = NLPTaskDataFetcher.load_corpus(NLPTask.CONLL_03)
train: List[Sentence] = []
for tweet, label in zip(X_text, y):
    if tweet == '':
        tweet = 'dummy'
    s: Sentence = Sentence(tweet)
    s.add_label(str(label))
    train.append(s)

# TaggedCorpus requires train, dev and test splits; reuse train for all three
corpus: TaggedCorpus = TaggedCorpus(train, train, train)

# 2. create the label dictionary
label_dict = corpus.make_label_dictionary()

# 3. make a list of word embeddings
word_embeddings = [
    WordEmbeddings('glove'),
    # comment in flair embeddings for state-of-the-art results
    # FlairEmbeddings('news-forward'),
    # FlairEmbeddings('news-backward'),
]

# 4. initialize document embedding by passing list of word embeddings
document_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings(
    word_embeddings,
    hidden_size=512,
    reproject_words=True,
    reproject_words_dimension=256,
)