Example 1
    def train(self, intent_fst) -> None:
        """Train intent classifier and named entity recognizers."""
        # pylint: disable=E0401
        from flair.data import Sentence, TaggedCorpus, Token
        from flair.embeddings import (
            DocumentRNNEmbeddings,
            FlairEmbeddings,
            StackedEmbeddings,
        )
        from flair.models import SequenceTagger, TextClassifier
        from flair.trainers import ModelTrainer

        # Directory to look for downloaded embeddings
        cache_dir = self.profile.read_path(
            self.profile.get("intent.flair.cache_dir", "flair/cache")
        )

        os.makedirs(cache_dir, exist_ok=True)

        # Directory to store generated models
        data_dir = self.profile.write_path(
            self.profile.get("intent.flair.data_dir", "flair/data")
        )

        if os.path.exists(data_dir):
            shutil.rmtree(data_dir)

        self.embeddings = self.profile.get("intent.flair.embeddings", [])
        assert len(self.embeddings) > 0, "No word embeddings"

        # Create directories to write training data to
        class_data_dir = os.path.join(data_dir, "classification")
        ner_data_dir = os.path.join(data_dir, "ner")
        os.makedirs(class_data_dir, exist_ok=True)
        os.makedirs(ner_data_dir, exist_ok=True)

        # Convert FST to training data
        # ----------------------------

        # { intent: [ { 'tokens': [...], 'entities': [ { 'entity': ..., 'start': ..., 'end': ... }, ... ] }, ... ] }
        sentences_by_intent: Dict[str, Any] = {}

        # Get sentences for training
        do_sampling = self.profile.get("intent.flair.do_sampling", True)
        start_time = time.time()

        if do_sampling:
            # Sample from each intent FST
            num_samples = int(self.profile.get("intent.flair.num_samples", 10000))
            intent_map_path = self.profile.read_path(
                self.profile.get("training.intent.intent_map", "intent_map.json")
            )

            with open(intent_map_path, "r") as intent_map_file:
                intent_map = json.load(intent_map_file)

            # Gather FSTs for all known intents
            fsts_dir = self.profile.write_dir(
                self.profile.get("speech_to_text.fsts_dir")
            )

            intent_fst_paths = {
                intent_id: os.path.join(fsts_dir, f"{intent_id}.fst")
                for intent_id in intent_map.keys()
            }

            # Generate samples
            self._logger.debug(
                "Generating %s sample(s) from %s intent(s)",
                num_samples,
                len(intent_fst_paths),
            )

            sentences_by_intent = sample_sentences_by_intent(
                intent_fst_paths, num_samples
            )
        else:
            # Exhaustively generate all sentences
            self._logger.debug(
                "Generating all possible sentences (may take a long time)"
            )
            sentences_by_intent = make_sentences_by_intent(intent_fst)

        sentence_time = time.time() - start_time
        self._logger.debug("Generated sentences in %s second(s)", sentence_time)

        # Get least common multiple in order to balance sentences by intent
        lcm_sentences = lcm(*(len(sents) for sents in sentences_by_intent.values()))

        # Generate examples
        class_sentences = []
        ner_sentences: Dict[str, List[Sentence]] = defaultdict(list)
        for intent_name, intent_sents in sentences_by_intent.items():
            num_repeats = max(1, lcm_sentences // len(intent_sents))
            for intent_sent in intent_sents:
                # Only train an intent classifier if there's more than one intent
                if len(sentences_by_intent) > 1:
                    # Add balanced copies
                    for _ in range(num_repeats):
                        class_sent = Sentence(labels=[intent_name])
                        for word in intent_sent["tokens"]:
                            class_sent.add_token(Token(word))

                        class_sentences.append(class_sent)

                if len(intent_sent["entities"]) == 0:
                    continue  # no entities, no sequence tagger

                # Named entity recognition (NER) example
                token_idx = 0
                entity_start = {ev["start"]: ev for ev in intent_sent["entities"]}
                entity_end = {ev["end"]: ev for ev in intent_sent["entities"]}
                entity = None

                word_tags = []
                for word in intent_sent["tokens"]:
                    # Determine tag label
                    tag = "O" if not entity else f"I-{entity}"
                    if token_idx in entity_start:
                        entity = entity_start[token_idx]["entity"]
                        tag = f"B-{entity}"

                    word_tags.append((word, tag))

                    # advance the character offset past this word and the trailing space
                    token_idx += len(word) + 1

                    if (token_idx - 1) in entity_end:
                        entity = None

                # Add balanced copies
                for _ in range(num_repeats):
                    ner_sent = Sentence()
                    for word, tag in word_tags:
                        token = Token(word)
                        token.add_tag("ner", tag)
                        ner_sent.add_token(token)

                    ner_sentences[intent_name].append(ner_sent)

        # Start training
        max_epochs = int(self.profile.get("intent.flair.max_epochs", 100))

        # Load word embeddings
        self._logger.debug("Loading word embeddings from %s", cache_dir)
        word_embeddings = [
            FlairEmbeddings(os.path.join(cache_dir, "embeddings", e))
            for e in self.embeddings
        ]

        if len(class_sentences) > 0:
            self._logger.debug("Training intent classifier")

            # Random 80/10/10 split
            class_train, class_dev, class_test = self._split_data(class_sentences)
            class_corpus = TaggedCorpus(class_train, class_dev, class_test)

            # Intent classification
            doc_embeddings = DocumentRNNEmbeddings(
                word_embeddings,
                hidden_size=512,
                reproject_words=True,
                reproject_words_dimension=256,
            )

            classifier = TextClassifier(
                doc_embeddings,
                label_dictionary=class_corpus.make_label_dictionary(),
                multi_label=False,
            )

            self._logger.debug(
                "Intent classifier has %s example(s)", len(class_sentences)
            )
            trainer = ModelTrainer(classifier, class_corpus)
            trainer.train(class_data_dir, max_epochs=max_epochs)
        else:
            self._logger.info("Skipping intent classifier training")

        if len(ner_sentences) > 0:
            self._logger.debug("Training %s NER sequence tagger(s)", len(ner_sentences))

            # Named entity recognition
            stacked_embeddings = StackedEmbeddings(word_embeddings)

            for intent_name, intent_ner_sents in ner_sentences.items():
                ner_train, ner_dev, ner_test = self._split_data(intent_ner_sents)
                ner_corpus = TaggedCorpus(ner_train, ner_dev, ner_test)

                tagger = SequenceTagger(
                    hidden_size=256,
                    embeddings=stacked_embeddings,
                    tag_dictionary=ner_corpus.make_tag_dictionary(tag_type="ner"),
                    tag_type="ner",
                    use_crf=True,
                )

                ner_intent_dir = os.path.join(ner_data_dir, intent_name)
                os.makedirs(ner_intent_dir, exist_ok=True)

                self._logger.debug(
                    "NER tagger for %s has %s example(s)",
                    intent_name,
                    len(intent_ner_sents),
                )
                trainer = ModelTrainer(tagger, ner_corpus)
                trainer.train(ner_intent_dir, max_epochs=max_epochs)
        else:
            self._logger.info("Skipping NER sequence tagger training")
Example 2
    def load_classification_corpus(
            data_folder: Union[str, Path],
            train_file=None,
            test_file=None,
            dev_file=None,
            use_tokenizer: bool = True) -> TaggedCorpus:
        """
        Helper function to get a TaggedCorpus from text classification-formatted task data

        :param data_folder: base folder with the task data
        :param train_file: the name of the train file
        :param test_file: the name of the test file
        :param dev_file: the name of the dev file, if None, dev data is sampled from train
        :param use_tokenizer: whether to run the tokenizer when building Sentences (otherwise tokens are split on whitespace)
        :return: a TaggedCorpus with annotated train, dev and test data
        """

        if type(data_folder) == str:
            data_folder: Path = Path(data_folder)

        if train_file is not None:
            train_file = data_folder / train_file
        if test_file is not None:
            test_file = data_folder / test_file
        if dev_file is not None:
            dev_file = data_folder / dev_file

        # automatically identify train / test / dev files
        if train_file is None:
            for file in data_folder.iterdir():
                file_name = file.name
                if 'train' in file_name:
                    train_file = file
                if 'test' in file_name:
                    test_file = file
                if 'dev' in file_name:
                    dev_file = file
                if 'testa' in file_name:
                    dev_file = file
                if 'testb' in file_name:
                    test_file = file

        log.info("Reading data from {}".format(data_folder))
        log.info("Train: {}".format(train_file))
        log.info("Dev: {}".format(dev_file))
        log.info("Test: {}".format(test_file))

        def make_read_text_classification_file(f):
            return lambda: NLPTaskDataFetcher.read_text_classification_file(f,
                                                                            use_tokenizer=use_tokenizer)

        sentences_train: Iterable[Sentence] = make_read_text_classification_file(train_file)
        sentences_test: Iterable[Sentence] = make_read_text_classification_file(test_file)

        if test_file is None or dev_file is None:
            total_number_of_sentences = 0
            for sentence in sentences_train():
                total_number_of_sentences += 1
            train_indexes = set(range(0, total_number_of_sentences))

        if dev_file is not None:
            sentences_dev: Iterable[Sentence] = make_read_text_classification_file(dev_file)
        else:
            dev_indexes = NLPTaskDataFetcher.__sample(train_indexes, 0.1)
            train_indexes = sorted(train_indexes - dev_indexes)
            sentences_dev: Iterable[Sentence] = NLPTaskDataFetcher.make_sample(sentences_train, dev_indexes)
            sentences_train = NLPTaskDataFetcher.make_sample(sentences_train, train_indexes)

        return TaggedCorpus(sentences_train, sentences_dev, sentences_test)
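# A hedged usage sketch for the helper above (not part of the original example).
# The data folder and file names are assumptions; the loader expects
# FastText-style "__label__<class> <text>" lines in each file, and the helper is
# assumed to be a staticmethod on NLPTaskDataFetcher (it calls other
# NLPTaskDataFetcher methods internally).
from flair.data_fetcher import NLPTaskDataFetcher

corpus = NLPTaskDataFetcher.load_classification_corpus(
    'resources/tasks/my_classification_task',
    train_file='train.txt',
    dev_file='dev.txt',
    test_file='test.txt')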
Example 3
import gensim
import re

from flair.data import Sentence, TaggedCorpus, Token
from flair.data_fetcher import NLPTaskDataFetcher, NLPTask
from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, CharLMEmbeddings
from typing import List

sentences_train: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(
    "universal-dependencies-1.2/UD_Bulgarian/bg-ud-train.conllu")
sentences_dev: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(
    "universal-dependencies-1.2/UD_Bulgarian/bg-ud-dev.conllu")
sentences_test: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(
    "universal-dependencies-1.2/UD_Bulgarian/bg-ud-test.conllu")

corpus: TaggedCorpus = TaggedCorpus(sentences_train, sentences_dev,
                                    sentences_test)

tag_type = 'upos'

tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

word_vectors = gensim.models.KeyedVectors.load_word2vec_format('wiki.bg.vec',
                                                               binary=False)
word_vectors.save('wiki.bg.vec.gensim')

custom_embedding = WordEmbeddings('custom', 'wiki.bg.vec.gensim')

char_lm_forward = CharLMEmbeddings('lm-bg-small-forward-v0.1.pt')
char_lm_backward = CharLMEmbeddings('lm-bg-small-backward-v0.1.pt')

embedding_types: List[TokenEmbeddings] = [
Example 4
    def load_classification_corpus(data_folder: Union[str, Path],
                                   train_file=None,
                                   test_file=None,
                                   dev_file=None,
                                   use_tokenizer: bool = True,
                                   max_tokens_per_doc=-1) -> TaggedCorpus:
        """
        Helper function to get a TaggedCorpus from text classification-formatted task data

        :param data_folder: base folder with the task data
        :param train_file: the name of the train file
        :param test_file: the name of the test file
        :param dev_file: the name of the dev file, if None, dev data is sampled from train
        :param use_tokenizer: whether to run the tokenizer when building Sentences (otherwise tokens are split on whitespace)
        :param max_tokens_per_doc: if positive, truncate each document to this many tokens
        :return: a TaggedCorpus with annotated train, dev and test data
        """

        if type(data_folder) == str:
            data_folder: Path = Path(data_folder)

        if train_file is not None:
            train_file = data_folder / train_file
        if test_file is not None:
            test_file = data_folder / test_file
        if dev_file is not None:
            dev_file = data_folder / dev_file

        # automatically identify train / test / dev files
        if train_file is None:
            for file in data_folder.iterdir():
                file_name = file.name
                if "train" in file_name:
                    train_file = file
                if "test" in file_name:
                    test_file = file
                if "dev" in file_name:
                    dev_file = file
                if "testa" in file_name:
                    dev_file = file
                if "testb" in file_name:
                    test_file = file

        log.info("Reading data from {}".format(data_folder))
        log.info("Train: {}".format(train_file))
        log.info("Dev: {}".format(dev_file))
        log.info("Test: {}".format(test_file))

        sentences_train: List[
            Sentence] = NLPTaskDataFetcher.read_text_classification_file(
                train_file,
                use_tokenizer=use_tokenizer,
                max_tokens_per_doc=max_tokens_per_doc)
        sentences_test: List[
            Sentence] = NLPTaskDataFetcher.read_text_classification_file(
                test_file,
                use_tokenizer=use_tokenizer,
                max_tokens_per_doc=max_tokens_per_doc)

        if dev_file is not None:
            sentences_dev: List[
                Sentence] = NLPTaskDataFetcher.read_text_classification_file(
                    dev_file,
                    use_tokenizer=use_tokenizer,
                    max_tokens_per_doc=max_tokens_per_doc)
        else:
            sentences_dev: List[Sentence] = [
                sentences_train[i]
                for i in NLPTaskDataFetcher.__sample(len(sentences_train), 0.1)
            ]
            sentences_train = [
                x for x in sentences_train if x not in sentences_dev
            ]

        return TaggedCorpus(sentences_train, sentences_dev, sentences_test)
Example 5
    def load_column_corpus(
            data_folder: Union[str, Path],
            column_format: Dict[int, str],
            train_file=None,
            test_file=None,
            dev_file=None,
            tag_to_biloes=None) -> TaggedCorpus:
        """
        Helper function to get a TaggedCorpus from CoNLL column-formatted task data such as CoNLL03 or CoNLL2000.

        :param data_folder: base folder with the task data
        :param column_format: a map specifying the column format
        :param train_file: the name of the train file
        :param test_file: the name of the test file
        :param dev_file: the name of the dev file, if None, dev data is sampled from train
        :param tag_to_biloes: whether to convert to BILOES tagging scheme
        :return: a TaggedCorpus with annotated train, dev and test data
        """

        if type(data_folder) == str:
            data_folder: Path = Path(data_folder)

        if train_file is not None:
            train_file = data_folder / train_file
        if test_file is not None:
            test_file = data_folder / test_file
        if dev_file is not None:
            dev_file = data_folder / dev_file

        # automatically identify train / test / dev files
        if train_file is None:
            for file in data_folder.iterdir():
                file_name = file.name
                if file_name.endswith('.gz'): continue
                if 'train' in file_name and not '54019' in file_name:
                    train_file = file
                if 'dev' in file_name:
                    dev_file = file
                if 'testa' in file_name:
                    dev_file = file
                if 'testb' in file_name:
                    test_file = file

            # if no test file is found, take any file with 'test' in name
            if test_file is None:
                for file in data_folder.iterdir():
                    file_name = file.name
                    if file_name.endswith('.gz'): continue
                    if 'test' in file_name:
                        test_file = file

        log.info("Reading data from {}".format(data_folder))
        log.info("Train: {}".format(train_file))
        log.info("Dev: {}".format(dev_file))
        log.info("Test: {}".format(test_file))

        def make_read_column_data(f):
            return lambda: NLPTaskDataFetcher.read_column_data(f, column_format)

        # get train and test data
        sentences_train: Iterable[Sentence] = make_read_column_data(train_file)

        # read in test file if exists, otherwise sample 10% of train data as test dataset

        if test_file is None or dev_file is None:
            total_number_of_sentences = 0
            for sentence in sentences_train():
                total_number_of_sentences += 1
            train_indexes = set(range(0, total_number_of_sentences))

        if test_file is not None:
            sentences_test: Iterable[Sentence] = make_read_column_data(test_file)
        else:
            test_indexes = NLPTaskDataFetcher.__sample(train_indexes, 0.1)
            train_indexes = sorted(train_indexes - test_indexes)
            sentences_test: Iterable[Sentence] = NLPTaskDataFetcher.make_sample(sentences_train, test_indexes)
            sentences_train = NLPTaskDataFetcher.make_sample(sentences_train, train_indexes)

        # read in dev file if exists, otherwise sample 10% of train data as dev dataset
        if dev_file is not None:
            sentences_dev: Iterable[Sentence] = make_read_column_data(dev_file)
        else:
            dev_indexes = NLPTaskDataFetcher.__sample(train_indexes, 0.1)
            train_indexes = sorted(train_indexes - dev_indexes)
            sentences_dev: Iterable[Sentence] = NLPTaskDataFetcher.make_sample(sentences_train, dev_indexes)
            sentences_train = NLPTaskDataFetcher.make_sample(sentences_train, train_indexes)

        corpus = TaggedCorpus(sentences_train, sentences_dev, sentences_test, name=data_folder.name)

        if tag_to_biloes is not None:
            # convert tag scheme to iobes
            for sentence in corpus.get_all_sentences():
                sentence.convert_tag_scheme(tag_type=tag_to_biloes, target_scheme='iobes')

        return corpus
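# A hedged usage sketch for load_column_corpus (not part of the original example).
# The column mapping and file names below are assumptions modelled on the
# CoNLL-03 layout used in the other examples in this listing.
from flair.data_fetcher import NLPTaskDataFetcher

columns = {0: 'text', 1: 'pos', 2: 'np', 3: 'ner'}
corpus = NLPTaskDataFetcher.load_column_corpus(
    'resources/tasks/conll_03',
    columns,
    train_file='eng.train',
    dev_file='eng.testa',
    test_file='eng.testb',
    tag_to_biloes='ner')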
Example 6
    def fetch_data(task: NLPTask) -> TaggedCorpus:

        if task == NLPTask.CONLL_03 or task == NLPTask.ONTONER or task == NLPTask.FASHION:

            data_folder = os.path.join('resources', 'tasks', 'conll_03')
            if task == NLPTask.ONTONER:
                data_folder = os.path.join('resources', 'tasks', 'onto-ner')
            if task == NLPTask.FASHION:
                data_folder = os.path.join('resources', 'tasks', 'fashion')

            sentences_train: List[
                Sentence] = NLPTaskDataFetcher.read_conll_sequence_labeling_data(
                    os.path.join(data_folder, 'eng.train'))
            sentences_dev: List[
                Sentence] = NLPTaskDataFetcher.read_conll_sequence_labeling_data(
                    os.path.join(data_folder, 'eng.testa'))
            sentences_test: List[
                Sentence] = NLPTaskDataFetcher.read_conll_sequence_labeling_data(
                    os.path.join(data_folder, 'eng.testb'))
            for sentence in sentences_train + sentences_test + sentences_dev:
                sentence: Sentence = sentence
                sentence.convert_tag_scheme(tag_type='ner',
                                            target_scheme='iobes')

            return TaggedCorpus(sentences_train, sentences_dev, sentences_test)

        if task == NLPTask.CONLL_2000:

            data_folder = os.path.join('resources', 'tasks', 'conll_2000')

            sentences_train: List[
                Sentence] = NLPTaskDataFetcher.read_conll_sequence_labeling_data(
                    os.path.join(data_folder, 'train.txt'))
            sentences_dev: List[Sentence] = [
                sentences_train[i] for i in NLPTaskDataFetcher._sample()
            ]
            sentences_train = [
                x for x in sentences_train if x not in sentences_dev
            ]
            sentences_test: List[
                Sentence] = NLPTaskDataFetcher.read_conll_sequence_labeling_data(
                    os.path.join(data_folder, 'test.txt'))
            for sentence in sentences_train + sentences_test + sentences_dev:
                sentence: Sentence = sentence
                sentence.convert_tag_scheme(tag_type='np',
                                            target_scheme='iobes')

            return TaggedCorpus(sentences_train, sentences_dev, sentences_test)

        if task == NLPTask.UD_ENGLISH:
            data_folder = os.path.join('resources', 'tasks', 'ud')
            sentences_train: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(
                os.path.join(data_folder, 'en-ud-train.conllu'))
            sentences_dev: List[Sentence] = [
                sentences_train[i] for i in NLPTaskDataFetcher._sample()
            ]
            sentences_train = [
                x for x in sentences_train if x not in sentences_dev
            ]
            sentences_test: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(
                os.path.join(data_folder, 'en-ud-dev.conllu'))

            return TaggedCorpus(sentences_train, sentences_dev, sentences_test)

        if task == NLPTask.UD_GERMAN:
            data_folder = os.path.join('resources', 'tasks', 'ud-ger')
            sentences_train: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(
                os.path.join(data_folder, 'de_gsd-ud-train.conllu'))
            sentences_test: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(
                os.path.join(data_folder, 'de_gsd-ud-test.conllu'))
            sentences_dev: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(
                os.path.join(data_folder, 'de_gsd-ud-dev.conllu'))

            return TaggedCorpus(sentences_train, sentences_dev, sentences_test)

        if task == NLPTask.ONTONOTES:
            data_folder = os.path.join('resources', 'tasks', 'ontonotes')
            sentences_train: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(
                os.path.join(data_folder, 'train.conllu'))
            sentences_test: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(
                os.path.join(data_folder, 'test.conllu'))
            sentences_dev: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(
                os.path.join(data_folder, 'dev.conllu'))

            return TaggedCorpus(sentences_train, sentences_dev, sentences_test)

        if task == NLPTask.CONLL_12:
            data_folder = os.path.join('resources', 'tasks', 'conll_12')
            sentences_train: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(
                os.path.join(data_folder, 'train.propbank.conllu'))
            sentences_test: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(
                os.path.join(data_folder, 'test.propbank.conllu'))
            sentences_dev: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(
                os.path.join(data_folder, 'dev.propbank.conllu'))

            return TaggedCorpus(sentences_train, sentences_dev, sentences_test)

        if task == NLPTask.SRL:
            data_folder = os.path.join('resources', 'tasks', 'srl')
            sentences_train: List[
                Sentence] = NLPTaskDataFetcher.read_conll_2_column_data(
                    os.path.join(data_folder, 'train.srl.conll'), 'srl')
            sentences_test: List[
                Sentence] = NLPTaskDataFetcher.read_conll_2_column_data(
                    os.path.join(data_folder, 'test.srl.conll'), 'srl')
            sentences_dev: List[
                Sentence] = NLPTaskDataFetcher.read_conll_2_column_data(
                    os.path.join(data_folder, 'dev.srl.conll'), 'srl')

            return TaggedCorpus(sentences_train, sentences_dev, sentences_test)

        if task == NLPTask.PENN:
            data_folder = os.path.join('resources', 'tasks', 'penn')
            sentences_train: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(
                os.path.join(data_folder, 'train.conll'))
            sentences_dev: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(
                os.path.join(data_folder, 'valid.conll'))
            sentences_test: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(
                os.path.join(data_folder, 'test.conll'))

            return TaggedCorpus(sentences_train, sentences_dev, sentences_test)

        if task == NLPTask.CONLL_03_GERMAN:
            data_folder = os.path.join('resources', 'tasks', 'conll_03-ger')
            sentences_train: List[
                Sentence] = NLPTaskDataFetcher.read_conll_03_german(
                    os.path.join(data_folder, 'deu.train'), tag_scheme='iobes')
            sentences_dev: List[
                Sentence] = NLPTaskDataFetcher.read_conll_03_german(
                    os.path.join(data_folder, 'deu.testa'), tag_scheme='iobes')
            sentences_test: List[
                Sentence] = NLPTaskDataFetcher.read_conll_03_german(
                    os.path.join(data_folder, 'deu.testb'), tag_scheme='iobes')
            return TaggedCorpus(sentences_train, sentences_dev, sentences_test)

        if task == NLPTask.GERMEVAL:
            data_folder = os.path.join('resources', 'tasks', 'germeval')
            sentences_train: List[Sentence] = NLPTaskDataFetcher.read_germeval(
                os.path.join(data_folder, 'NER-de-train.tsv'),
                tag_scheme='iobes')
            sentences_dev: List[Sentence] = NLPTaskDataFetcher.read_germeval(
                os.path.join(data_folder, 'NER-de-dev.tsv'),
                tag_scheme='iobes')
            sentences_test: List[Sentence] = NLPTaskDataFetcher.read_germeval(
                os.path.join(data_folder, 'NER-de-test.tsv'),
                tag_scheme='iobes')
            return TaggedCorpus(sentences_train, sentences_dev, sentences_test)

        if task == NLPTask.IMDB:
            data_folder = os.path.join('resources', 'tasks', 'imdb')
            sentences_train: List[
                Sentence] = NLPTaskDataFetcher.read_text_classification_file(
                    os.path.join(data_folder, 'train.txt'))
            sentences_dev: List[
                Sentence] = NLPTaskDataFetcher.read_text_classification_file(
                    os.path.join(data_folder, 'dev.txt'))
            sentences_test: List[
                Sentence] = NLPTaskDataFetcher.read_text_classification_file(
                    os.path.join(data_folder, 'test.txt'))
            return TaggedCorpus(sentences_train, sentences_dev, sentences_test)

        if task == NLPTask.AG_NEWS:
            data_folder = os.path.join('resources', 'tasks', 'ag_news')
            sentences_train: List[
                Sentence] = NLPTaskDataFetcher.read_text_classification_file(
                    os.path.join(data_folder, 'train.txt'))
            sentences_dev: List[
                Sentence] = NLPTaskDataFetcher.read_text_classification_file(
                    os.path.join(data_folder, 'dev.txt'))
            sentences_test: List[
                Sentence] = NLPTaskDataFetcher.read_text_classification_file(
                    os.path.join(data_folder, 'test.txt'))
            return TaggedCorpus(sentences_train, sentences_dev, sentences_test)
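# A hedged usage sketch for fetch_data (not part of the original example): it
# assumes the CoNLL-03 files have already been placed under
# resources/tasks/conll_03 as described in the flair tutorials.
from flair.data_fetcher import NLPTask, NLPTaskDataFetcher

corpus = NLPTaskDataFetcher.fetch_data(NLPTask.CONLL_03)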
Example 7
    def load_column_corpus(
        data_folder: Union[str, Path],
        column_format: Dict[int, str],
        train_file=None,
        test_file=None,
        dev_file=None,
        tag_to_biloes=None,
    ) -> TaggedCorpus:
        """
        Helper function to get a TaggedCorpus from CoNLL column-formatted task data such as CoNLL03 or CoNLL2000.

        :param data_folder: base folder with the task data
        :param column_format: a map specifying the column format
        :param train_file: the name of the train file
        :param test_file: the name of the test file
        :param dev_file: the name of the dev file, if None, dev data is sampled from train
        :param tag_to_biloes: whether to convert to BILOES tagging scheme
        :return: a TaggedCorpus with annotated train, dev and test data
        """

        if type(data_folder) == str:
            data_folder: Path = Path(data_folder)

        if train_file is not None:
            train_file = data_folder / train_file
        if test_file is not None:
            test_file = data_folder / test_file
        if dev_file is not None:
            dev_file = data_folder / dev_file

        # automatically identify train / test / dev files
        if train_file is None:
            for file in data_folder.iterdir():
                file_name = file.name
                if file_name.endswith(".gz"):
                    continue
                if "train" in file_name and not "54019" in file_name:
                    train_file = file
                if "dev" in file_name:
                    dev_file = file
                if "testa" in file_name:
                    dev_file = file
                if "testb" in file_name:
                    test_file = file

            # if no test file is found, take any file with 'test' in name
            if test_file is None:
                for file in data_folder.iterdir():
                    file_name = file.name
                    if file_name.endswith(".gz"):
                        continue
                    if "test" in file_name:
                        test_file = file

        log.info("Reading data from {}".format(data_folder))
        log.info("Train: {}".format(train_file))
        log.info("Dev: {}".format(dev_file))
        log.info("Test: {}".format(test_file))

        # get train and test data
        sentences_train: List[Sentence] = NLPTaskDataFetcher.read_column_data(
            train_file, column_format)

        # read in test file if exists, otherwise sample 10% of train data as test dataset
        if test_file is not None:
            sentences_test: List[
                Sentence] = NLPTaskDataFetcher.read_column_data(
                    test_file, column_format)
        else:
            sentences_test: List[Sentence] = [
                sentences_train[i]
                for i in NLPTaskDataFetcher.__sample(len(sentences_train), 0.1)
            ]
            sentences_train = [
                x for x in sentences_train if x not in sentences_test
            ]

        # read in dev file if exists, otherwise sample 10% of train data as dev dataset
        if dev_file is not None:
            sentences_dev: List[
                Sentence] = NLPTaskDataFetcher.read_column_data(
                    dev_file, column_format)
        else:
            sentences_dev: List[Sentence] = [
                sentences_train[i]
                for i in NLPTaskDataFetcher.__sample(len(sentences_train), 0.1)
            ]
            sentences_train = [
                x for x in sentences_train if x not in sentences_dev
            ]

        if tag_to_biloes is not None:
            # convert tag scheme to iobes
            for sentence in sentences_train + sentences_test + sentences_dev:
                sentence.convert_tag_scheme(tag_type=tag_to_biloes,
                                            target_scheme="iobes")

        return TaggedCorpus(sentences_train,
                            sentences_dev,
                            sentences_test,
                            name=data_folder.name)
Example 8
    def fetch_data(task: NLPTask) -> TaggedCorpus:
        """
        Helper function to fetch a TaggedCorpus for a specific NLPTask. For this to work, you first need to download
        the corresponding NLP task data and put it into the appropriate folder structure. The tutorials on
        https://github.com/zalandoresearch/flair give more info on how to do this. Alternatively, you can use this
        code to create your own data fetchers.
        :param task: specification of the NLPTask you wish to get
        :return: a TaggedCorpus consisting of train, dev and test data
        """

        data_folder = os.path.join('resources', 'tasks',
                                   str(task.value).lower())
        print("reading data from %s".format(data_folder))

        # the CoNLL 2000 task on chunking has three columns: text, pos and np (chunk)
        if task == NLPTask.CONLL_2000:
            columns = {0: 'text', 1: 'pos', 2: 'np'}

            return NLPTaskDataFetcher.fetch_column_corpus(
                data_folder,
                columns,
                train_file='train.txt',
                test_file='test.txt',
                tag_to_biloes='np')

        # many NER tasks follow the CoNLL 03 format with four columns: text, pos, np and ner tag
        if task == NLPTask.CONLL_03 or task == NLPTask.ONTONER or task == NLPTask.FASHION:
            columns = {0: 'text', 1: 'pos', 2: 'np', 3: 'ner'}

            return NLPTaskDataFetcher.fetch_column_corpus(
                data_folder,
                columns,
                train_file='eng.train',
                test_file='eng.testb',
                dev_file='eng.testa',
                tag_to_biloes='ner')

        # the CoNLL 03 task for German has an additional lemma column
        if task == NLPTask.CONLL_03_GERMAN:
            columns = {0: 'text', 1: 'lemma', 2: 'pos', 3: 'np', 4: 'ner'}

            return NLPTaskDataFetcher.fetch_column_corpus(
                data_folder,
                columns,
                train_file='deu.train',
                test_file='deu.testb',
                dev_file='deu.testa',
                tag_to_biloes='ner')

        # the GERMEVAL task only has two columns: text and ner
        if task == NLPTask.GERMEVAL:
            columns = {1: 'text', 2: 'ner'}

            return NLPTaskDataFetcher.fetch_column_corpus(
                data_folder,
                columns,
                train_file='NER-de-train.tsv',
                test_file='NER-de-test.tsv',
                dev_file='NER-de-dev.tsv',
                tag_to_biloes='ner')

        # WSD tasks may be put into this column format
        if task == NLPTask.WSD:
            columns = {0: 'text', 1: 'lemma', 2: 'pos', 3: 'sense'}

            return NLPTaskDataFetcher.fetch_column_corpus(
                data_folder,
                columns,
                train_file='semcor.tsv',
                test_file='semeval2015.tsv')

        # the UD corpora follow the CoNLL-U format, for which we have a special reader
        if task == NLPTask.UD_ENGLISH:
            # get train, test and dev data
            sentences_train: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(
                os.path.join(data_folder, 'en_ewt-ud-train.conllu'))
            sentences_test: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(
                os.path.join(data_folder, 'en_ewt-ud-test.conllu'))
            sentences_dev: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(
                os.path.join(data_folder, 'en_ewt-ud-dev.conllu'))

            return TaggedCorpus(sentences_train, sentences_dev, sentences_test)

        if task == NLPTask.UD_GERMAN:
            # get train, test and dev data
            sentences_train: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(
                os.path.join(data_folder, 'de_gsd-ud-train.conllu'))
            sentences_test: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(
                os.path.join(data_folder, 'de_gsd-ud-test.conllu'))
            sentences_dev: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(
                os.path.join(data_folder, 'de_gsd-ud-dev.conllu'))

            return TaggedCorpus(sentences_train, sentences_dev, sentences_test)

        if task == NLPTask.ONTONOTES:
            # get train, test and dev data
            sentences_train: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(
                os.path.join(data_folder, 'train.conllu'))
            sentences_test: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(
                os.path.join(data_folder, 'test.conllu'))
            sentences_dev: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(
                os.path.join(data_folder, 'dev.conllu'))

            return TaggedCorpus(sentences_train, sentences_dev, sentences_test)

        if task == NLPTask.CONLL_12:
            # get train, test and dev data
            sentences_train: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(
                os.path.join(data_folder, 'train.propbank.conllu'))
            sentences_test: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(
                os.path.join(data_folder, 'test.propbank.conllu'))
            sentences_dev: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(
                os.path.join(data_folder, 'dev.propbank.conllu'))
            return TaggedCorpus(sentences_train, sentences_dev, sentences_test)

        if task == NLPTask.PENN:
            sentences_train: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(
                os.path.join(data_folder, 'train.conll'))
            sentences_dev: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(
                os.path.join(data_folder, 'valid.conll'))
            sentences_test: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(
                os.path.join(data_folder, 'test.conll'))

            return TaggedCorpus(sentences_train, sentences_dev, sentences_test)

        # for text classifiers, we use our own special format
        if task == NLPTask.IMDB:
            sentences_train: List[
                Sentence] = NLPTaskDataFetcher.read_text_classification_file(
                    os.path.join(data_folder, 'train.txt'))
            sentences_dev: List[
                Sentence] = NLPTaskDataFetcher.read_text_classification_file(
                    os.path.join(data_folder, 'dev.txt'))
            sentences_test: List[
                Sentence] = NLPTaskDataFetcher.read_text_classification_file(
                    os.path.join(data_folder, 'test.txt'))
            return TaggedCorpus(sentences_train, sentences_dev, sentences_test)

        # for text classifiers, we use our own special format
        if task == NLPTask.AG_NEWS:
            sentences_train: List[
                Sentence] = NLPTaskDataFetcher.read_text_classification_file(
                    os.path.join(data_folder, 'train.txt'))
            sentences_dev: List[
                Sentence] = NLPTaskDataFetcher.read_text_classification_file(
                    os.path.join(data_folder, 'dev.txt'))
            sentences_test: List[
                Sentence] = NLPTaskDataFetcher.read_text_classification_file(
                    os.path.join(data_folder, 'test.txt'))
            return TaggedCorpus(sentences_train, sentences_dev, sentences_test)
Example 9
def read_group(entities):
    sentences_dev = read_group_file('data/dev.tsv', entities)
    sentences_train = read_group_file('data/train.tsv', entities)
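    # note: no separate test file is read here; the dev sentences double as the test split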
    return TaggedCorpus(sentences_train, sentences_dev, sentences_dev)
Example 10
from typing import List

from flair.data import Sentence, TaggedCorpus
from flair.data_fetcher import NLPTaskDataFetcher, NLPTask
from flair.embeddings import WordEmbeddings, CharLMEmbeddings, DocumentLSTMEmbeddings
from flair.models.text_classification_model import TextClassifier
from flair.trainers.text_classification_trainer import TextClassifierTrainer

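# 1. read the train, dev and test splits and build the corpus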
sentences_train: List[Sentence] = NLPTaskDataFetcher.read_text_classification_file('training.preprocessed.txt')
sentences_dev: List[Sentence] = NLPTaskDataFetcher.read_text_classification_file('dev.preprocessed.txt')
sentences_test: List[Sentence] = NLPTaskDataFetcher.read_text_classification_file('test.preprocessed.txt')

corpus = TaggedCorpus(sentences_train, sentences_dev, sentences_test)

# 2. create the label dictionary
label_dict = corpus.make_label_dictionary()

# 3. make a list of word embeddings
word_embeddings = [WordEmbeddings('de-fasttext'),
                   CharLMEmbeddings('german-forward'),
                   CharLMEmbeddings('german-backward')]

# 4. init document embedding by passing list of word embeddings
document_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings(word_embeddings, hidden_states=32)

# 5. create the text classifier
classifier = TextClassifier(document_embeddings, label_dictionary=label_dict, multi_label=False)

# 6. initialize the text classifier trainer
trainer = TextClassifierTrainer(classifier, corpus, label_dict)
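# A hedged continuation (not part of the original snippet): the output path is an
# assumption, and the keyword arguments follow the old flair text-classification
# tutorial; check your flair version's TextClassifierTrainer.train signature.
trainer.train('resources/classification/results',
              learning_rate=0.1,
              mini_batch_size=32,
              max_epochs=150)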
Example 11
    def train(self, X, y):

        X_text = X[:, self.args.TEXT_COL]
        y = y.flatten()
        #corpus: TaggedCorpus = NLPTaskDataFetcher.load_corpus(NLPTask.CONLL_03)

        train: List[Sentence] = []

        for tweet, label in zip(X_text, y):
            if tweet == '':
                tweet = 'dummy'
            s: Sentence = Sentence(tweet)
            s.add_label(str(label))
            train.append(s)

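        # the same sentence list serves as the train, dev and test splits here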
        corpus: TaggedCorpus = TaggedCorpus(train, train, train)

        # 2. create the label dictionary
        label_dict = corpus.make_label_dictionary()

        # 3. make a list of word embeddings
        word_embeddings = [
            glove_embeddings,
            #twitter_embeddings,
            # comment in this line to use character embeddings
            #CharacterEmbeddings(),
            # comment in flair embeddings for state-of-the-art results
            # FlairEmbeddings('news-forward'),
            fflair,
            # FlairEmbeddings('news-backward'),
            bflair
        ]

        # 4. initialize document embedding by passing list of word embeddings
        document_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings(
            word_embeddings,
            hidden_size=512,
            reproject_words=True,
            reproject_words_dimension=256,
        )
        # 5. create the text classifier
        classifier = TextClassifier(document_embeddings,
                                    label_dictionary=label_dict,
                                    multi_label=False)

        # 6. initialize the text classifier trainer
        trainer = ModelTrainer(classifier, corpus)

        self.model = trainer.model
        self.model.save = self.save
        self.model.save_checkpoint = self.save_checkpoint

        # 7. start the training
        trainer.train('../data/ecuador_earthquake_2016/models',
                      learning_rate=0.1,
                      mini_batch_size=32,
                      anneal_factor=0.5,
                      patience=5,
                      max_epochs=5)

        self.clf = classifier
Example 12
    #ds =

    X_text = X[:, self.args.TEXT_COL]
    y = y.flatten()
    # corpus: TaggedCorpus = NLPTaskDataFetcher.load_corpus(NLPTask.CONLL_03)

    train: List[Sentence] = []

    for tweet, label in zip(X_text, y):
        if tweet == '':
            tweet = 'dummy'
        s: Sentence = Sentence(tweet)
        s.add_label(str(label))
        train.append(s)

    corpus: TaggedCorpus = TaggedCorpus(train, train, train)  # TaggedCorpus expects train, dev and test splits; reuse train as in the previous example

    # 2. create the label dictionary
    label_dict = corpus.make_label_dictionary()

    # 3. make a list of word embeddings
    word_embeddings = [
        WordEmbeddings('glove'),

        # comment in flair embeddings for state-of-the-art results
        # FlairEmbeddings('news-forward'),
        # FlairEmbeddings('news-backward'),
    ]

    # 4. initialize document embedding by passing list of word embeddings
    document_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings(