Example #1
 def custom_tokenizer(text: str) -> List[Token]:
     return [Token(text, 0)]
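Such a callable can be handed to flair's Sentence constructor. A minimal usage sketch, assuming a flair version in which use_tokenizer accepts a plain callable rather than a Tokenizer object:

from flair.data import Sentence

# assumption: this flair version lets use_tokenizer be any function str -> List[Token]
sentence = Sentence("one indivisible unit", use_tokenizer=custom_tokenizer)
print(len(sentence))  # 1 -- the whole input text became a single Token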
Example #2
    def __init__(self, path_to_conll_file: Union[str, Path], in_memory: bool = True):
        """
        Instantiates a column dataset in CoNLL-U format.

        :param path_to_conll_file: Path to the CoNLL-U formatted file
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        """
        if type(path_to_conll_file) is str:
            path_to_conll_file = Path(path_to_conll_file)
        assert path_to_conll_file.exists()

        self.in_memory = in_memory
        self.path_to_conll_file = path_to_conll_file
        self.total_sentence_count: int = 0

        if self.in_memory:
            self.sentences: List[Sentence] = []
        else:
            self.indices: List[int] = []

        with open(str(self.path_to_conll_file), encoding="utf-8") as file:

            line = file.readline()
            position = 0
            sentence: Sentence = Sentence()
            while line:

                line = line.strip()
                fields: List[str] = re.split("\t+", line)
                if line == "":
                    if len(sentence) > 0:
                        self.total_sentence_count += 1
                        if self.in_memory:
                            self.sentences.append(sentence)
                        else:
                            self.indices.append(position)
                            position = file.tell()
                    sentence: Sentence = Sentence()

                elif line.startswith("#"):
                    line = file.readline()
                    continue
                elif "." in fields[0]:
                    line = file.readline()
                    continue
                elif "-" in fields[0]:
                    line = file.readline()
                    continue
                else:
                    token = Token(fields[1], head_id=int(fields[6]))
                    token.add_label("lemma", str(fields[2]))
                    token.add_label("upos", str(fields[3]))
                    token.add_label("pos", str(fields[4]))
                    token.add_label("dependency", str(fields[7]))

                    if len(fields) > 9 and 'SpaceAfter=No' in fields[9]:
                        token.whitespace_after = False

                    for morph in str(fields[5]).split("|"):
                        if "=" not in morph:
                            continue
                        token.add_label(morph.split("=")[0].lower(), morph.split("=")[1])

                    if len(fields) > 10 and str(fields[10]) == "Y":
                        token.add_label("frame", str(fields[11]))

                    sentence.add_token(token)

                line = file.readline()
            if len(sentence.tokens) > 0:
                self.total_sentence_count += 1
                if self.in_memory:
                    self.sentences.append(sentence)
                else:
                    self.indices.append(position)
Example #3
    def read_conll_ud(path_to_conll_file: str) -> List[Sentence]:
        sentences: List[Sentence] = []

        lines: List[str] = open(path_to_conll_file, encoding='utf-8'). \
            read().strip().split('\n')

        sentence: Sentence = Sentence()
        for line in lines:

            fields: List[str] = re.split(r"\s+", line)
            if line == '':
                if len(sentence) > 0:
                    sentences.append(sentence)
                sentence: Sentence = Sentence()

            elif line.startswith('#'):
                continue
            elif '.' in fields[0]:
                continue
            elif '-' in fields[0]:
                continue
            else:
                token = Token(fields[1], head_id=int(fields[6]))
                token.add_tag('lemma', str(fields[2]))
                token.add_tag('upos', str(fields[3]))
                token.add_tag('pos', str(fields[4]))
                token.add_tag('dependency', str(fields[7]))

                for morph in str(fields[5]).split('|'):
                    if not "=" in morph: continue
                    token.add_tag(
                        morph.split('=')[0].lower(),
                        morph.split('=')[1])

                if len(fields) > 10 and str(fields[10]) == 'Y':
                    token.add_tag('frame', str(fields[11]))

                sentence.add_token(token)

        if len(sentence.tokens) > 0:
            sentences.append(sentence)

        return sentences
Example #4
def form_sentence(tokens):
    s = Sentence()
    for w in tokens:
        s.add_token(Token(w))
    return s
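A quick usage example; to_tokenized_string is flair's standard Sentence helper for printing the token sequence:

sentence = form_sentence("the quick brown fox".split())
print(sentence.to_tokenized_string())  # the quick brown fox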
Example #5
    def __getitem__(self, index: int = 0) -> Sentence:

        if self.in_memory:
            sentence = self.sentences[index]
        else:
            with open(str(self.path_to_conll_file), encoding="utf-8") as file:
                file.seek(self.indices[index])
                line = file.readline()
                sentence: Sentence = Sentence()
                while line:

                    line = line.strip()
                    fields: List[str] = re.split("\t+", line)
                    if line == "":
                        if len(sentence) > 0:
                            break

                    elif line.startswith("#"):
                        line = file.readline()
                        continue
                    elif "." in fields[0]:
                        line = file.readline()
                        continue
                    elif "-" in fields[0]:
                        line = file.readline()
                        continue
                    else:
                        token = Token(fields[1], head_id=int(fields[6]))
                        token.add_label("lemma", str(fields[2]))
                        token.add_label("upos", str(fields[3]))
                        token.add_label("pos", str(fields[4]))
                        token.add_label("dependency", str(fields[7]))

                        if len(fields) > 9 and 'SpaceAfter=No' in fields[9]:
                            token.whitespace_after = False

                        for morph in str(fields[5]).split("|"):
                            if "=" not in morph:
                                continue
                            token.add_label(
                                morph.split("=")[0].lower(), morph.split("=")[1]
                            )

                        if len(fields) > 10 and str(fields[10]) == "Y":
                            token.add_label("frame", str(fields[11]))

                        sentence.add_token(token)

                    line = file.readline()
        return sentence
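Together with the __init__ from Example #2, this gives a map-style dataset that either serves sentences from memory or re-parses them at the stored file offsets. A usage sketch; the enclosing class is not named in the snippet, so UniversalDependenciesDataset below is an assumption:

# hypothetical class name and file path, for illustration only
dataset = UniversalDependenciesDataset("data/en_ewt-ud-train.conllu", in_memory=False)
print(dataset.total_sentence_count)  # sentences counted during the initial scan
print(dataset[0])                    # parsed on demand by the __getitem__ above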
Example #6
    def train(self, intent_fst) -> None:
        from flair.data import Sentence, Token
        from flair.models import SequenceTagger, TextClassifier
        from flair.embeddings import (
            FlairEmbeddings,
            StackedEmbeddings,
            DocumentRNNEmbeddings,
        )
        from flair.data import TaggedCorpus
        from flair.trainers import ModelTrainer

        # Directory to look for downloaded embeddings
        cache_dir = self.profile.read_path(
            self.profile.get("intent.flair.cache_dir", "flair/cache")
        )

        os.makedirs(cache_dir, exist_ok=True)

        # Directory to store generated models
        data_dir = self.profile.write_path(
            self.profile.get("intent.flair.data_dir", "flair/data")
        )

        if os.path.exists(data_dir):
            shutil.rmtree(data_dir)

        self.embeddings = self.profile.get("intent.flair.embeddings", [])
        assert len(self.embeddings) > 0, "No word embeddings"

        # Create directories to write training data to
        class_data_dir = os.path.join(data_dir, "classification")
        ner_data_dir = os.path.join(data_dir, "ner")
        os.makedirs(class_data_dir, exist_ok=True)
        os.makedirs(ner_data_dir, exist_ok=True)

        # Convert FST to training data
        class_data_path = os.path.join(class_data_dir, "train.txt")
        ner_data_path = os.path.join(ner_data_dir, "train.txt")

        # { intent: [ { 'text': ..., 'entities': { ... } }, ... ] }
        sentences_by_intent: Dict[str, Any] = {}

        # Get sentences for training
        do_sampling = self.profile.get("intent.flair.do_sampling", True)
        start_time = time.time()

        if do_sampling:
            # Sample from each intent FST
            num_samples = int(self.profile.get("intent.flair.num_samples", 10000))
            intent_map_path = self.profile.read_path(
                self.profile.get("training.intent.intent_map", "intent_map.json")
            )

            with open(intent_map_path, "r") as intent_map_file:
                intent_map = json.load(intent_map_file)

            # Gather FSTs for all known intents
            fsts_dir = self.profile.write_dir(
                self.profile.get("speech_to_text.fsts_dir")
            )

            intent_fst_paths = {
                intent_id: os.path.join(fsts_dir, f"{intent_id}.fst")
                for intent_id in intent_map.keys()
            }

            # Generate samples
            self._logger.debug(
                f"Generating {num_samples} sample(s) from {len(intent_fst_paths)} intent(s)"
            )

            sentences_by_intent = sample_sentences_by_intent(
                intent_fst_paths, num_samples
            )
        else:
            # Exhaustively generate all sentences
            self._logger.debug(
                "Generating all possible sentences (may take a long time)"
            )
            sentences_by_intent = make_sentences_by_intent(intent_fst)

        sentence_time = time.time() - start_time
        self._logger.debug(f"Generated sentences in {sentence_time} second(s)")

        # Get least common multiple in order to balance sentences by intent
        lcm_sentences = lcm(*(len(sents) for sents in sentences_by_intent.values()))

        # Generate examples
        class_sentences = []
        ner_sentences: Dict[str, List[Sentence]] = defaultdict(list)
        for intent_name, intent_sents in sentences_by_intent.items():
            num_repeats = max(1, lcm_sentences // len(intent_sents))
            for intent_sent in intent_sents:
                # Only train an intent classifier if there's more than one intent
                if len(sentences_by_intent) > 1:
                    # Add balanced copies
                    for i in range(num_repeats):
                        class_sent = Sentence(labels=[intent_name])
                        for word in intent_sent["tokens"]:
                            class_sent.add_token(Token(word))

                        class_sentences.append(class_sent)

                if len(intent_sent["entities"]) == 0:
                    continue  # no entities, no sequence tagger

                # Named entity recognition (NER) example
                token_idx = 0
                entity_start = {ev["start"]: ev for ev in intent_sent["entities"]}
                entity_end = {ev["end"]: ev for ev in intent_sent["entities"]}
                entity = None

                word_tags = []
                for word in intent_sent["tokens"]:
                    # Determine tag label
                    tag = "O" if not entity else f"I-{entity}"
                    if token_idx in entity_start:
                        entity = entity_start[token_idx]["entity"]
                        tag = f"B-{entity}"

                    word_tags.append((word, tag))

                    # advance the character offset past this word and the trailing space
                    token_idx += len(word) + 1

                    if (token_idx - 1) in entity_end:
                        entity = None

                # Add balanced copies
                for i in range(num_repeats):
                    ner_sent = Sentence()
                    for word, tag in word_tags:
                        token = Token(word)
                        token.add_tag("ner", tag)
                        ner_sent.add_token(token)

                    ner_sentences[intent_name].append(ner_sent)

        # Start training
        max_epochs = int(self.profile.get("intent.flair.max_epochs", 100))

        # Load word embeddings
        self._logger.debug(f"Loading word embeddings from {cache_dir}")
        word_embeddings = [
            FlairEmbeddings(os.path.join(cache_dir, "embeddings", e))
            for e in self.embeddings
        ]

        if len(class_sentences) > 0:
            self._logger.debug("Training intent classifier")

            # Random 80/10/10 split
            class_train, class_dev, class_test = self._split_data(class_sentences)
            class_corpus = TaggedCorpus(class_train, class_dev, class_test)

            # Intent classification
            doc_embeddings = DocumentRNNEmbeddings(
                word_embeddings,
                hidden_size=512,
                reproject_words=True,
                reproject_words_dimension=256,
            )

            classifier = TextClassifier(
                doc_embeddings,
                label_dictionary=class_corpus.make_label_dictionary(),
                multi_label=False,
            )

            self._logger.debug(
                f"Intent classifier has {len(class_sentences)} example(s)"
            )
            trainer = ModelTrainer(classifier, class_corpus)
            trainer.train(class_data_dir, max_epochs=max_epochs)
        else:
            self._logger.info("Skipping intent classifier training")

        if len(ner_sentences) > 0:
            self._logger.debug(f"Training {len(ner_sentences)} NER sequence tagger(s)")

            # Named entity recognition
            stacked_embeddings = StackedEmbeddings(word_embeddings)

            for intent_name, intent_ner_sents in ner_sentences.items():
                ner_train, ner_dev, ner_test = self._split_data(intent_ner_sents)
                ner_corpus = TaggedCorpus(ner_train, ner_dev, ner_test)

                tagger = SequenceTagger(
                    hidden_size=256,
                    embeddings=stacked_embeddings,
                    tag_dictionary=ner_corpus.make_tag_dictionary(tag_type="ner"),
                    tag_type="ner",
                    use_crf=True,
                )

                ner_intent_dir = os.path.join(ner_data_dir, intent_name)
                os.makedirs(ner_intent_dir, exist_ok=True)

                self._logger.debug(
                    f"NER tagger for {intent_name} has {len(intent_ner_sents)} example(s)"
                )
                trainer = ModelTrainer(tagger, ner_corpus)
                trainer.train(ner_intent_dir, max_epochs=max_epochs)
        else:
            self._logger.info("Skipping NER sequence tagger training")
Example #7
 def plain_tokenizer(text: str) -> Iterable[Token]:
     res = []
     for tok in text.split():
         res.append(Token(tok))
     return res
Example #8
def no_op_tokenizer(text: str) -> List[Token]:
    return [Token(text, idx=0, start_position=0)]
Example #9
    def __init__(self,
                 model,
                 fine_tune: bool = False,
                 chars_per_chunk: int = 512):
        """
        initializes contextual string embeddings using a character-level language model.
        :param model: model string, one of 'news-forward', 'news-backward', 'news-forward-fast', 'news-backward-fast',
                'mix-forward', 'mix-backward', 'german-forward', 'german-backward', 'polish-backward', 'polish-forward'
                depending on which character language model is desired.
        :param fine_tune: if set to True, the gradient will propagate into the language model. This dramatically slows down
                training and often leads to overfitting, so use with caution.
        :param chars_per_chunk: max number of chars per rnn pass to control speed/memory tradeoff. Higher means faster but requires
                more memory. Lower means slower but less memory.
        """
        super().__init__()

        cache_dir = Path("embeddings")

        aws_path: str = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources"

        self.PRETRAINED_MODEL_ARCHIVE_MAP = {
            # multilingual models
            "multi-forward":
            f"{aws_path}/embeddings-v0.4.3/lm-jw300-forward-v0.1.pt",
            "multi-backward":
            f"{aws_path}/embeddings-v0.4.3/lm-jw300-backward-v0.1.pt",
            "multi-v0-forward":
            f"{aws_path}/embeddings-v0.4/lm-multi-forward-v0.1.pt",
            "multi-v0-backward":
            f"{aws_path}/embeddings-v0.4/lm-multi-backward-v0.1.pt",
            "multi-v0-forward-fast":
            f"{aws_path}/embeddings-v0.4/lm-multi-forward-fast-v0.1.pt",
            "multi-v0-backward-fast":
            f"{aws_path}/embeddings-v0.4/lm-multi-backward-fast-v0.1.pt",
            # English models
            "en-forward":
            f"{aws_path}/embeddings-v0.4.1/big-news-forward--h2048-l1-d0.05-lr30-0.25-20/news-forward-0.4.1.pt",
            "en-backward":
            f"{aws_path}/embeddings-v0.4.1/big-news-backward--h2048-l1-d0.05-lr30-0.25-20/news-backward-0.4.1.pt",
            "en-forward-fast":
            f"{aws_path}/embeddings/lm-news-english-forward-1024-v0.2rc.pt",
            "en-backward-fast":
            f"{aws_path}/embeddings/lm-news-english-backward-1024-v0.2rc.pt",
            "news-forward":
            f"{aws_path}/embeddings-v0.4.1/big-news-forward--h2048-l1-d0.05-lr30-0.25-20/news-forward-0.4.1.pt",
            "news-backward":
            f"{aws_path}/embeddings-v0.4.1/big-news-backward--h2048-l1-d0.05-lr30-0.25-20/news-backward-0.4.1.pt",
            "news-forward-fast":
            f"{aws_path}/embeddings/lm-news-english-forward-1024-v0.2rc.pt",
            "news-backward-fast":
            f"{aws_path}/embeddings/lm-news-english-backward-1024-v0.2rc.pt",
            "mix-forward":
            f"{aws_path}/embeddings/lm-mix-english-forward-v0.2rc.pt",
            "mix-backward":
            f"{aws_path}/embeddings/lm-mix-english-backward-v0.2rc.pt",
            # Arabic
            "ar-forward":
            f"{aws_path}/embeddings-stefan-it/lm-ar-opus-large-forward-v0.1.pt",
            "ar-backward":
            f"{aws_path}/embeddings-stefan-it/lm-ar-opus-large-backward-v0.1.pt",
            # Bulgarian
            "bg-forward-fast":
            f"{aws_path}/embeddings-v0.3/lm-bg-small-forward-v0.1.pt",
            "bg-backward-fast":
            f"{aws_path}/embeddings-v0.3/lm-bg-small-backward-v0.1.pt",
            "bg-forward":
            f"{aws_path}/embeddings-stefan-it/lm-bg-opus-large-forward-v0.1.pt",
            "bg-backward":
            f"{aws_path}/embeddings-stefan-it/lm-bg-opus-large-backward-v0.1.pt",
            # Czech
            "cs-forward":
            f"{aws_path}/embeddings-stefan-it/lm-cs-opus-large-forward-v0.1.pt",
            "cs-backward":
            f"{aws_path}/embeddings-stefan-it/lm-cs-opus-large-backward-v0.1.pt",
            "cs-v0-forward":
            f"{aws_path}/embeddings-v0.4/lm-cs-large-forward-v0.1.pt",
            "cs-v0-backward":
            f"{aws_path}/embeddings-v0.4/lm-cs-large-backward-v0.1.pt",
            # Danish
            "da-forward":
            f"{aws_path}/embeddings-stefan-it/lm-da-opus-large-forward-v0.1.pt",
            "da-backward":
            f"{aws_path}/embeddings-stefan-it/lm-da-opus-large-backward-v0.1.pt",
            # German
            "de-forward":
            f"{aws_path}/embeddings/lm-mix-german-forward-v0.2rc.pt",
            "de-backward":
            f"{aws_path}/embeddings/lm-mix-german-backward-v0.2rc.pt",
            "de-historic-ha-forward":
            f"{aws_path}/embeddings-stefan-it/lm-historic-hamburger-anzeiger-forward-v0.1.pt",
            "de-historic-ha-backward":
            f"{aws_path}/embeddings-stefan-it/lm-historic-hamburger-anzeiger-backward-v0.1.pt",
            "de-historic-wz-forward":
            f"{aws_path}/embeddings-stefan-it/lm-historic-wiener-zeitung-forward-v0.1.pt",
            "de-historic-wz-backward":
            f"{aws_path}/embeddings-stefan-it/lm-historic-wiener-zeitung-backward-v0.1.pt",
            # Spanish
            "es-forward":
            f"{aws_path}/embeddings-v0.4/language_model_es_forward_long/lm-es-forward.pt",
            "es-backward":
            f"{aws_path}/embeddings-v0.4/language_model_es_backward_long/lm-es-backward.pt",
            "es-forward-fast":
            f"{aws_path}/embeddings-v0.4/language_model_es_forward/lm-es-forward-fast.pt",
            "es-backward-fast":
            f"{aws_path}/embeddings-v0.4/language_model_es_backward/lm-es-backward-fast.pt",
            # Basque
            "eu-forward":
            f"{aws_path}/embeddings-stefan-it/lm-eu-opus-large-forward-v0.2.pt",
            "eu-backward":
            f"{aws_path}/embeddings-stefan-it/lm-eu-opus-large-backward-v0.2.pt",
            "eu-v1-forward":
            f"{aws_path}/embeddings-stefan-it/lm-eu-opus-large-forward-v0.1.pt",
            "eu-v1-backward":
            f"{aws_path}/embeddings-stefan-it/lm-eu-opus-large-backward-v0.1.pt",
            "eu-v0-forward":
            f"{aws_path}/embeddings-v0.4/lm-eu-large-forward-v0.1.pt",
            "eu-v0-backward":
            f"{aws_path}/embeddings-v0.4/lm-eu-large-backward-v0.1.pt",
            # Persian
            "fa-forward":
            f"{aws_path}/embeddings-stefan-it/lm-fa-opus-large-forward-v0.1.pt",
            "fa-backward":
            f"{aws_path}/embeddings-stefan-it/lm-fa-opus-large-backward-v0.1.pt",
            # Finnish
            "fi-forward":
            f"{aws_path}/embeddings-stefan-it/lm-fi-opus-large-forward-v0.1.pt",
            "fi-backward":
            f"{aws_path}/embeddings-stefan-it/lm-fi-opus-large-backward-v0.1.pt",
            # French
            "fr-forward":
            f"{aws_path}/embeddings/lm-fr-charlm-forward.pt",
            "fr-backward":
            f"{aws_path}/embeddings/lm-fr-charlm-backward.pt",
            # Hebrew
            "he-forward":
            f"{aws_path}/embeddings-stefan-it/lm-he-opus-large-forward-v0.1.pt",
            "he-backward":
            f"{aws_path}/embeddings-stefan-it/lm-he-opus-large-backward-v0.1.pt",
            # Hindi
            "hi-forward":
            f"{aws_path}/embeddings-stefan-it/lm-hi-opus-large-forward-v0.1.pt",
            "hi-backward":
            f"{aws_path}/embeddings-stefan-it/lm-hi-opus-large-backward-v0.1.pt",
            # Croatian
            "hr-forward":
            f"{aws_path}/embeddings-stefan-it/lm-hr-opus-large-forward-v0.1.pt",
            "hr-backward":
            f"{aws_path}/embeddings-stefan-it/lm-hr-opus-large-backward-v0.1.pt",
            # Indonesian
            "id-forward":
            f"{aws_path}/embeddings-stefan-it/lm-id-opus-large-forward-v0.1.pt",
            "id-backward":
            f"{aws_path}/embeddings-stefan-it/lm-id-opus-large-backward-v0.1.pt",
            # Italian
            "it-forward":
            f"{aws_path}/embeddings-stefan-it/lm-it-opus-large-forward-v0.1.pt",
            "it-backward":
            f"{aws_path}/embeddings-stefan-it/lm-it-opus-large-backward-v0.1.pt",
            # Japanese
            "ja-forward":
            f"{aws_path}/embeddings-v0.4.1/lm__char-forward__ja-wikipedia-3GB/japanese-forward.pt",
            "ja-backward":
            f"{aws_path}/embeddings-v0.4.1/lm__char-backward__ja-wikipedia-3GB/japanese-backward.pt",
            # Dutch
            "nl-forward":
            f"{aws_path}/embeddings-stefan-it/lm-nl-opus-large-forward-v0.1.pt",
            "nl-backward":
            f"{aws_path}/embeddings-stefan-it/lm-nl-opus-large-backward-v0.1.pt",
            "nl-v0-forward":
            f"{aws_path}/embeddings-v0.4/lm-nl-large-forward-v0.1.pt",
            "nl-v0-backward":
            f"{aws_path}/embeddings-v0.4/lm-nl-large-backward-v0.1.pt",
            # Norwegian
            "no-forward":
            f"{aws_path}/embeddings-stefan-it/lm-no-opus-large-forward-v0.1.pt",
            "no-backward":
            f"{aws_path}/embeddings-stefan-it/lm-no-opus-large-backward-v0.1.pt",
            # Polish
            "pl-forward":
            f"{aws_path}/embeddings/lm-polish-forward-v0.2.pt",
            "pl-backward":
            f"{aws_path}/embeddings/lm-polish-backward-v0.2.pt",
            "pl-opus-forward":
            f"{aws_path}/embeddings-stefan-it/lm-pl-opus-large-forward-v0.1.pt",
            "pl-opus-backward":
            f"{aws_path}/embeddings-stefan-it/lm-pl-opus-large-backward-v0.1.pt",
            # Portuguese
            "pt-forward":
            f"{aws_path}/embeddings-v0.4/lm-pt-forward.pt",
            "pt-backward":
            f"{aws_path}/embeddings-v0.4/lm-pt-backward.pt",
            # Pubmed
            "pubmed-forward":
            f"{aws_path}/embeddings-v0.4.1/pubmed-2015-fw-lm.pt",
            "pubmed-backward":
            f"{aws_path}/embeddings-v0.4.1/pubmed-2015-bw-lm.pt",
            # Slovenian
            "sl-forward":
            f"{aws_path}/embeddings-stefan-it/lm-sl-opus-large-forward-v0.1.pt",
            "sl-backward":
            f"{aws_path}/embeddings-stefan-it/lm-sl-opus-large-backward-v0.1.pt",
            "sl-v0-forward":
            f"{aws_path}/embeddings-v0.3/lm-sl-large-forward-v0.1.pt",
            "sl-v0-backward":
            f"{aws_path}/embeddings-v0.3/lm-sl-large-backward-v0.1.pt",
            # Swedish
            "sv-forward":
            f"{aws_path}/embeddings-stefan-it/lm-sv-opus-large-forward-v0.1.pt",
            "sv-backward":
            f"{aws_path}/embeddings-stefan-it/lm-sv-opus-large-backward-v0.1.pt",
            "sv-v0-forward":
            f"{aws_path}/embeddings-v0.4/lm-sv-large-forward-v0.1.pt",
            "sv-v0-backward":
            f"{aws_path}/embeddings-v0.4/lm-sv-large-backward-v0.1.pt",
            # Tamil
            "ta-forward":
            f"{aws_path}/embeddings-stefan-it/lm-ta-opus-large-forward-v0.1.pt",
            "ta-backward":
            f"{aws_path}/embeddings-stefan-it/lm-ta-opus-large-backward-v0.1.pt",
        }

        if type(model) == str:

            # load model if in pretrained model map
            if model.lower() in self.PRETRAINED_MODEL_ARCHIVE_MAP:
                base_path = self.PRETRAINED_MODEL_ARCHIVE_MAP[model.lower()]
                model = cached_path(base_path, cache_dir=cache_dir)

            elif replace_with_language_code(
                    model) in self.PRETRAINED_MODEL_ARCHIVE_MAP:
                base_path = self.PRETRAINED_MODEL_ARCHIVE_MAP[
                    replace_with_language_code(model)]
                model = cached_path(base_path, cache_dir=cache_dir)

            elif not Path(model).exists():
                raise ValueError(
                    f'The given model "{model}" is not available or is not a valid path.'
                )

        from flair.models import LanguageModel

        if type(model) == LanguageModel:
            self.lm: LanguageModel = model
            self.name = f"Task-LSTM-{self.lm.hidden_size}-{self.lm.nlayers}-{self.lm.is_forward_lm}"
        else:
            self.lm: LanguageModel = LanguageModel.load_language_model(model)
            self.name = str(model)

        # embeddings are static if we don't do finetuning
        self.fine_tune = fine_tune
        self.static_embeddings = not fine_tune

        self.is_forward_lm: bool = self.lm.is_forward_lm
        self.chars_per_chunk: int = chars_per_chunk

        # embed a dummy sentence to determine embedding_length
        dummy_sentence: Sentence = Sentence()
        dummy_sentence.add_token(Token("hello"))
        embedded_dummy = self.embed(dummy_sentence)
        self.__embedding_length: int = len(
            embedded_dummy[0].get_token(1).get_embedding())

        # set to eval mode
        self.eval()
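In use, the class above behaves like flair's FlairEmbeddings: pick a key from the archive map and embed a Sentence token by token. A minimal sketch using the standard flair API:

from flair.data import Sentence
from flair.embeddings import FlairEmbeddings

embedding = FlairEmbeddings("news-forward")   # any key from the archive map above
sentence = Sentence("The grass is green .")
embedding.embed(sentence)

for token in sentence:
    print(token.text, token.embedding.shape)  # one contextual vector per token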
Example #10
    def __init__(
        self,
        path_to_column_file: Path,
        column_name_map: Dict[int, str],
        tags_to_bioes: List[str] = None,
        comment_symbol: str = '#',
        in_memory: bool = True,
        document_separator_token: str = None,
        encoding: str = "utf-8",
    ):
        """
        Instantiates a column dataset (typically used for sequence labeling or word-level prediction).

        :param path_to_column_file: path to the file with the column-formatted data
        :param column_name_map: a map specifying the column format
        :param tags_to_bioes: whether to convert to BIOES tagging scheme
        :param comment_symbol: if set, lines that begin with this symbol are treated as comments
        :param in_memory: If set to True, the dataset is kept in memory as Sentence objects, otherwise does disk reads
        :param document_separator_token: If provided, multiple sentences are read into one object. Provide the string token
        that indicates that a new document begins
        """
        assert path_to_column_file.exists()
        self.path_to_column_file = path_to_column_file
        self.tags_to_bioes = tags_to_bioes
        self.column_name_map = column_name_map
        self.comment_symbol = comment_symbol
        self.document_separator_token = document_separator_token

        # store either Sentence objects in memory, or only file offsets
        self.in_memory = in_memory
        if self.in_memory:
            self.sentences: List[Sentence] = []
        else:
            self.indices: List[int] = []

        self.total_sentence_count: int = 0

        # most data sets have the token text in the first column, if not, pass 'text' as column
        self.text_column: int = 0
        for column in self.column_name_map:
            if column_name_map[column] == "text":
                self.text_column = column

        # determine encoding of text file
        self.encoding = encoding

        sentence: Sentence = Sentence()
        with open(str(self.path_to_column_file), encoding=self.encoding) as f:

            line = f.readline()
            position = 0

            while line:

                if self.comment_symbol is not None and line.startswith(
                        comment_symbol):
                    line = f.readline()
                    continue

                if self.__line_completes_sentence(line):

                    if len(sentence) > 0:

                        sentence.infer_space_after()
                        if self.in_memory:
                            if self.tags_to_bioes is not None:
                                for tag in self.tags_to_bioes:
                                    sentence.convert_tag_scheme(
                                        tag_type=tag, target_scheme="iobes")
                            self.sentences.append(sentence)
                        else:
                            self.indices.append(position)
                            position = f.tell()
                        self.total_sentence_count += 1
                    sentence: Sentence = Sentence()

                else:
                    fields: List[str] = re.split(r"\s+", line)
                    token = Token(fields[self.text_column])
                    for column in column_name_map:
                        if len(fields) > column:
                            if column != self.text_column:
                                token.add_tag(self.column_name_map[column],
                                              fields[column])

                    if not line.isspace():
                        sentence.add_token(token)

                line = f.readline()

        if len(sentence.tokens) > 0:
            sentence.infer_space_after()
            if self.in_memory:
                self.sentences.append(sentence)
            else:
                self.indices.append(position)
            self.total_sentence_count += 1
Example #11
    def token_list_to_sentence(self, token_list: conllu.TokenList) -> Sentence:
        sentence: Sentence = Sentence()

        # Build the sentence tokens and add the annotations.
        for conllu_token in token_list:
            token = Token(conllu_token["form"])

            for field in self.token_annotation_fields:
                field_value: Any = conllu_token[field]
                if isinstance(field_value, dict):
                    # For fields that contain key-value annotations,
                    # we add the key as label type-name and the value as the label value.
                    for key, value in field_value.items():
                        token.add_label(typename=key, value=str(value))
                else:
                    token.add_label(typename=field, value=str(field_value))

            if conllu_token.get("misc") is not None:
                space_after: Optional[str] = conllu_token["misc"].get(
                    "SpaceAfter")
                if space_after == "No":
                    token.whitespace_after = False

            sentence.add_token(token)

        if "sentence_id" in token_list.metadata:
            sentence.add_label("sentence_id",
                               token_list.metadata["sentence_id"])

        if "relations" in token_list.metadata:
            for (
                    head_start,
                    head_end,
                    tail_start,
                    tail_end,
                    label,
            ) in token_list.metadata["relations"]:
                # head and tail span indices are 1-indexed and end index is inclusive
                head = Span(sentence.tokens[head_start - 1:head_end])
                tail = Span(sentence.tokens[tail_start - 1:tail_end])

                sentence.add_complex_label(
                    "relation", RelationLabel(value=label,
                                              head=head,
                                              tail=tail))

        # determine all NER label types in sentence and add all NER spans as sentence-level labels
        ner_label_types = []
        for token in sentence.tokens:
            for annotation in token.annotation_layers.keys():
                if annotation.startswith(
                        "ner") and annotation not in ner_label_types:
                    ner_label_types.append(annotation)

        for label_type in ner_label_types:
            spans = sentence.get_spans(label_type)
            for span in spans:
                sentence.add_complex_label(
                    "entity",
                    label=SpanLabel(span=span,
                                    value=span.tag,
                                    score=span.score),
                )

        return sentence
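The method expects a conllu.TokenList as produced by the conllu package; a sketch of building one is below. The reader instance and its token_annotation_fields are not shown in the snippet, so the final call is only indicated:

import conllu

# a two-token CoNLL-U sentence (tab-separated fields, metadata comment on top)
data = (
    "# sentence_id = 1\n"
    "1\tHello\thello\tINTJ\tUH\t_\t0\troot\t_\t_\n"
    "2\tworld\tworld\tNOUN\tNN\t_\t1\tvocative\t_\tSpaceAfter=No\n"
)
token_list = conllu.parse(data)[0]
# sentence = reader.token_list_to_sentence(token_list)  # 'reader': instance of the (unnamed) class above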
Example #12
    def create_sentlist_from_file_batchmax(self, data, maxlen=64, compare_column="cat"):
        """
        takes a pandas dataframe with columns 'tok' and 'sentstart' and creates a list of flair Sentence objects with tags.
        Each flair Sentence object may contain several real sentences, but at most maxlen tokens.
        The Sentence object stops at a sentence boundary, so it is often shorter than maxlen.
        Sentences longer than maxlen are split!
        If a line with token value "EOF" is encountered, a shorter flair Sentence object is returned,
        so no file boundaries are crossed.
        :param data: pandas dataframe with columns 'tok' and 'sentstart' (and the compare_column, if used)
        :param maxlen: maximum number of tokens per flair Sentence chunk
        :param compare_column: column holding the tag attached to each token; if "NaN", a placeholder "-" tag is used
        :return: list of flair Sentence objects
        """
        sent_list = []
        toklist = []
        catlist = []
        # the len_last_token is needed to add proper start/end pos for each sentence token
        len_last_token = 0
        # track the sentence that is currently being processed
        curr_sentence_tok = []
        curr_sentence_cat = []
        for index, row in data.iterrows():
            tok = str(row["tok"])
            if compare_column != "NaN":
                cat = str(row[compare_column])
            else:
                cat = "-"

            # if the current token is "EOF" this marks the end of sample file
            # chunks may not cross file boundaries, therefore end the sentence here in any case
            if tok == "EOF":
                # do not add this token to any list
                # merge toklist and curr_sentence_tok list to get all current tokens
                # and create a flair sentence
                toklist.extend(curr_sentence_tok)
                catlist.extend(curr_sentence_cat)
                self.logger.debug("create chunk at EOF with (len: {}): {}".format(len(toklist), toklist))
                self.logger.debug("catlist with (len: {}): {}".format(len(catlist), catlist))
                sent = Sentence()
                for i, tok in enumerate(toklist):
                    flair_tok = Token(str(tok), start_position=len_last_token)
                    len_last_token += len(tok) + 1
                    flair_tok.add_tag("cat", catlist[i])
                    sent.add_token(flair_tok)
                if len(sent.tokens) > 0:
                    sent_list.append(sent)
                len_last_token = 0
                toklist = []
                catlist = []
                # reset the curr sent lists as well
                curr_sentence_tok = []
                curr_sentence_cat = []

            else:
                # if we are at the start of a new sentence, add the contents of curr_sentence_tok
                # and curr_sentence_cat to the main lists and start a new curr_sentence
                if row["sentstart"] == "yes":
                    toklist.extend(curr_sentence_tok)
                    catlist.extend(curr_sentence_cat)
                    curr_sentence_tok = [tok]
                    curr_sentence_cat = [cat]
                else:
                    curr_sentence_tok.append(tok)
                    curr_sentence_cat.append(cat)

                # if the combined length of toklist and curr_sentence_tok is > maxlen now,
                # create a flair sentence with the tokens in toklist and reset it
                # the remaining tokens in curr_sentence_tok are saved for the next chunk
                if len(toklist) + len(curr_sentence_tok) > maxlen:
                    # if toklist is empty at this point, we have a sentence > maxlen
                    # and must split it. The last token currently in curr_sentence will
                    # be preserved for later so that the chunk is not too long
                    if len(toklist) == 0:
                        toklist.extend(curr_sentence_tok[0:-1])
                        catlist.extend(curr_sentence_cat[0:-1])
                        curr_sentence_tok = [curr_sentence_tok[-1]]
                        curr_sentence_cat = [curr_sentence_cat[-1]]
                        self.logger.debug("Sentence is split (len: {}): {}".format(len(toklist), toklist))

                    self.logger.debug("create chunk with (len: {}): {}".format(len(toklist), toklist))
                    self.logger.debug("catlist with (len: {}): {}".format(len(catlist), catlist))
                    sent = Sentence()
                    for i, tok in enumerate(toklist):
                        flair_tok = Token(str(tok), start_position=len_last_token)
                        len_last_token += len(tok) + 1
                        flair_tok.add_tag("cat", str(catlist[i]))
                        sent.add_token(flair_tok)
                    if len(sent.tokens) > 0:
                        sent_list.append(sent)
                    len_last_token = 0
                    toklist = []
                    catlist = []

        self.logger.debug("toklist: {}, curr_sent_tok: {}".format(len(toklist), len(curr_sentence_tok)))
        # if the loop is complete, empty the buffers and add them to the list
        if len(curr_sentence_tok) > 0:
            toklist.extend(curr_sentence_tok)
            catlist.extend(curr_sentence_cat)
            sent = Sentence()
            for i, tok in enumerate(toklist):
                flair_tok = Token(str(tok), start_position=len_last_token)
                len_last_token += len(tok) + 1
                flair_tok.add_tag("cat", str(catlist[i]))
                sent.add_token(flair_tok)
            if len(sent.tokens) > 0:
                sent_list.append(sent)
            len_last_token = 0

        return sent_list
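A usage sketch with a toy DataFrame; the enclosing class and its logger are not shown, so 'chunker' below stands in for an instance of it:

import pandas as pd

data = pd.DataFrame({
    "tok":       ["This", "is", "one", ".", "EOF", "Another", "sentence", "."],
    "sentstart": ["yes",  "no", "no",  "no", "no",  "yes",     "no",       "no"],
    "cat":       ["O",    "O",  "O",   "O",  "O",   "B-X",     "I-X",      "O"],
})
# chunks = chunker.create_sentlist_from_file_batchmax(data, maxlen=64, compare_column="cat")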
Example #13
def bert_embeddings(sentences, tokenized_contents, output_file=None):
    # Use bert_tokenizer to check whether the WordPiece token sequence exceeds BERT's length limit of 512
    bert_tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
    if output_file:
        f = open(output_file, 'w')
    # init embedding
    # init multilingual BERT
    bert_embedding = TransformerWordEmbeddings('bert-large-uncased')
    long_sent = False
    for i, (sent, sent_tokens) in enumerate(zip(sentences,
                                                tokenized_contents)):
        print("Encoding the {}th input sentence for BERT embedding!".format(i))
        # length of the BERT-tokenized sentence after WordPiece tokenization
        if len(bert_tokenizer.tokenize(sent[0])) >= 510:
            long_sent = True
            truncated_tokens = sent_tokens[:len(sent_tokens) // 2]
            sent_tokens = sent_tokens[len(sent_tokens) // 2:]

        # Using our own tokens (our own tokenization)
        tokens: List[Token] = [Token(token) for token in sent_tokens]

        # create an empty sentence
        sentence = Sentence()

        # add tokens from our own tokenization
        sentence.tokens = tokens

        bert_embedding.embed(sentence)

        for j, (token, st) in enumerate(zip(sentence, sent_tokens)):
            if token.text != st:
                raise ValueError("Invalid token text")
            if output_file:
                f.write(
                    token.text + " " +
                    " ".join([str(num)
                              for num in token.embedding.tolist()]) + '\n')
            else:
                print(token.text + " " +
                      " ".join([str(num)
                                for num in token.embedding.tolist()]) + '\n')

        if long_sent:
            # tokenization for the rest of the sentence
            truncated_tokens: List[Token] = [
                Token(token) for token in truncated_tokens
            ]
            # Create empty sentence
            truncated_sentence = Sentence()
            # add tokens from our own tokenization
            truncated_sentence.tokens = truncated_tokens
            bert_embedding.embed(truncated_sentence)
            for token in truncated_sentence:
                if output_file:
                    f.write(token.text + " " + " ".join(
                        [str(num) for num in token.embedding.tolist()]) + '\n')
                else:
                    print(token.text + " " + " ".join(
                        [str(num) for num in token.embedding.tolist()]) + '\n')
            long_sent = False

    if output_file:
        f.write('\n')
        f.close()
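A usage sketch with pre-tokenized input; the sentence and the output file name are made up for illustration:

sentences = [["The grass is green ."]]                        # raw text, first element is length-checked
tokenized_contents = [["The", "grass", "is", "green", "."]]   # our own tokenization
bert_embeddings(sentences, tokenized_contents, output_file="bert_vectors.txt")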
Example #14
def mock_ner_span(text, tag, start, end):
    span = Span([]).set_label("class", tag)
    span.start_pos = start
    span.end_pos = end
    span.tokens = [Token(text[start:end])]
    return span
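For example, using only the helper defined above:

text = "Berlin is green"
span = mock_ner_span(text, "LOC", 0, 6)
print(span.tokens[0].text, span.start_pos, span.end_pos)  # Berlin 0 6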
Example #15
    def read_column_data(
        path_to_column_file: Path,
        column_name_map: Dict[int, str],
        infer_whitespace_after: bool = True,
    ):
        """
        Reads a file in column format and produces a list of Sentence with token-level annotation as specified in the
        column_name_map. For instance, by passing "{0: 'text', 1: 'pos', 2: 'np', 3: 'ner'}" as column_name_map you
        specify that the first column is the text (lexical value) of the token, the second the PoS tag, the third
        the chunk and the fourth the NER tag.
        :param path_to_column_file: the path to the column file
        :param column_name_map: a map of column number to token annotation name
        :param infer_whitespace_after: if True, tries to infer whitespace_after field for Token
        :return: list of sentences
        """
        sentences: List[Sentence] = []

        try:
            lines: List[str] = open(
                str(path_to_column_file),
                encoding="utf-8").read().strip().split("\n")
        except UnicodeDecodeError:
            log.info(
                'UTF-8 can\'t read: {} ... using "latin-1" instead.'.format(
                    path_to_column_file))
            lines: List[str] = open(
                str(path_to_column_file),
                encoding="latin1").read().strip().split("\n")

        # most data sets have the token text in the first column, if not, pass 'text' as column
        text_column: int = 0
        for column in column_name_map:
            if column_name_map[column] == "text":
                text_column = column

        sentence: Sentence = Sentence()
        for line in lines:

            if line.startswith("#"):
                continue

            if line.strip() == "":
                if len(sentence) > 0:
                    sentence.infer_space_after()
                    sentences.append(sentence)
                sentence: Sentence = Sentence()

            else:
                fields: List[str] = re.split(r"\s+", line)
                token = Token(fields[text_column])
                for column in column_name_map:
                    if len(fields) > column:
                        if column != text_column:
                            token.add_tag(column_name_map[column],
                                          fields[column])

                sentence.add_token(token)

        if len(sentence.tokens) > 0:
            sentence.infer_space_after()
            sentences.append(sentence)

        return sentences
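Called, for instance, with the column map from the docstring. The enclosing class is not shown, so the bare call below is only a sketch:

from pathlib import Path

sentences = read_column_data(
    Path("data/train.txt"),                                   # hypothetical column-formatted file
    column_name_map={0: "text", 1: "pos", 2: "np", 3: "ner"},
)
print(len(sentences), sentences[0].to_tagged_string())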
Example #16
    def read_conll_ud(path_to_conll_file: Path) -> List[Sentence]:
        """
        Reads a file in CoNLL-U format and produces a list of Sentence with full morphosyntactic annotation.
        :param path_to_conll_file: the path to the CoNLL-U file
        :return: list of sentences
        """
        sentences: List[Sentence] = []

        lines: List[str] = open(path_to_conll_file, encoding='utf-8'). \
            read().strip().split('\n')

        sentence: Sentence = Sentence()
        for line in lines:

            fields: List[str] = re.split("\t+", line)
            if line == '':
                if len(sentence) > 0:
                    sentences.append(sentence)
                sentence: Sentence = Sentence()

            elif line.startswith('#'):
                continue
            elif '.' in fields[0]:
                continue
            elif '-' in fields[0]:
                continue
            else:
                token = Token(fields[1], head_id=int(fields[6]))
                token.add_tag('lemma', str(fields[2]))
                token.add_tag('upos', str(fields[3]))
                token.add_tag('pos', str(fields[4]))
                token.add_tag('dependency', str(fields[7]))

                for morph in str(fields[5]).split('|'):
                    if not "=" in morph: continue
                    token.add_tag(
                        morph.split('=')[0].lower(),
                        morph.split('=')[1])

                if len(fields) > 10 and str(fields[10]) == 'Y':
                    token.add_tag('frame', str(fields[11]))

                sentence.add_token(token)

        if len(sentence.tokens) > 0:
            sentences.append(sentence)

        return sentences
Example #17
    def read_conll_ud(path_to_conll_file: Path) -> List[Sentence]:
        """
        Reads a file in CoNLL-U format and produces a list of Sentence with full morphosyntactic annotation.
        :param path_to_conll_file: the path to the CoNLL-U file
        :return: list of sentences
        """
        sentences: List[Sentence] = []

        lines: List[str] = open(path_to_conll_file,
                                encoding="utf-8").read().strip().split("\n")

        sentence: Sentence = Sentence()
        for line in lines:

            fields: List[str] = re.split("\t+", line)
            if line == "":
                if len(sentence) > 0:
                    sentences.append(sentence)
                sentence: Sentence = Sentence()

            elif line.startswith("#"):
                continue
            elif "." in fields[0]:
                continue
            elif "-" in fields[0]:
                continue
            else:
                token = Token(fields[1], head_id=int(fields[6]))
                token.add_tag("lemma", str(fields[2]))
                token.add_tag("upos", str(fields[3]))
                token.add_tag("pos", str(fields[4]))
                token.add_tag("dependency", str(fields[7]))

                for morph in str(fields[5]).split("|"):
                    if not "=" in morph:
                        continue
                    token.add_tag(
                        morph.split("=")[0].lower(),
                        morph.split("=")[1])

                if len(fields) > 10 and str(fields[10]) == "Y":
                    token.add_tag("frame", str(fields[11]))

                sentence.add_token(token)

        if len(sentence.tokens) > 0:
            sentences.append(sentence)

        return sentences
Example #18
 def custom_tokenizer(text: str) -> List[Token]:
     global text_tokens_map
     tokens = text_tokens_map[text]
     tokens: List[Token] = [Token(token) for token in tokens]
     return tokens
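A sketch of how the lookup table and the tokenizer fit together, again assuming a flair version in which use_tokenizer accepts a plain callable:

from flair.data import Sentence

text_tokens_map = {"New York is big": ["New York", "is", "big"]}

sentence = Sentence("New York is big", use_tokenizer=custom_tokenizer)
print([t.text for t in sentence])  # ['New York', 'is', 'big']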