Example #1
    def read_conll_2_column_data(path_to_conll_file: str, tag_name: str):

        sentences: List[Sentence] = []

        lines: List[str] = open(path_to_conll_file). \
            read().strip().split('\n')

        sentence: Sentence = Sentence()
        for line in lines:

            if line == '':
                if len(sentence) > 0:
                    sentences.append(sentence)
                sentence: Sentence = Sentence()
            else:
                # print(line)
                fields: List[str] = re.split(r"\s+", line)
                token = Token(fields[0])
                token.add_tag(tag_name, fields[1])
                sentence.add_token(token)

        if len(sentence.tokens) > 0:
            sentences.append(sentence)

        return sentences
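
A minimal usage sketch for the reader above; the file path is hypothetical and the call assumes the function is reachable as a plain callable (it is defined without self, e.g. as a static method):

# data/train.txt is a hypothetical two-column file with one "<token> <tag>" pair per line
sentences = read_conll_2_column_data('data/train.txt', 'ner')
for token in sentences[0]:
    print(token.text, token.get_tag('ner').value)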
Example #2
    def read_germeval(path_to_conll_file: str,
                      tag_scheme='iob') -> List[Sentence]:
        sentences: List[Sentence] = []

        lines: List[str] = open(path_to_conll_file). \
            read().strip().split('\n')

        sentence: Sentence = Sentence()
        for line in lines:

            if line.startswith('#'):
                continue
            elif line == '':
                if len(sentence.tokens) > 0:
                    sentence.convert_tag_scheme(target_scheme=tag_scheme)
                    sentences.append(sentence)
                sentence: Sentence = Sentence()
            else:
                fields: List[str] = re.split(r"\s+", line)
                token = Token(fields[1])
                token.add_tag('ner', fields[2])
                sentence.add_token(token)

        if len(sentence.tokens) > 0:
            sentence.convert_tag_scheme(target_scheme=tag_scheme)
            sentences.append(sentence)

        return sentences
Example #3
    def _convert_to_flair(self, data, labels=None):
        """ Convert data and labels into a list of flair.data.Sentence objects.

            Parameters
            ----------
            data : list(list(str))
                list of list of tokens, each inner list represents a list of
                    tokens or words in sentence, and each outer list represents
                    a sentence.
            labels : list(list(str)), can be None
                list of list of NER tags corresponding to tokens in data.

            Returns
            -------
            sentences : list(flair.data.Sentence)
        """
        sentences = []
        if labels is None:
            labels = data
            use_dummy_labels = True
        else:
            use_dummy_labels = False
        for tokens, tags in zip(data, labels):
            sentence = Sentence()
            for token, tag in zip(tokens, tags):
                t = Token(token)
                if not use_dummy_labels:
                    t.add_tag("ner", tag)
                sentence.add_token(t)
            sentences.append(sentence)
        return sentences
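
A hedged usage sketch for the converter above; 'wrapper' stands in for whatever object exposes _convert_to_flair, and the tokens and tags are made up:

data = [["John", "lives", "in", "Berlin"]]
labels = [["B-PER", "O", "O", "B-LOC"]]
sentences = wrapper._convert_to_flair(data, labels)  # hypothetical instance
print(sentences[0].to_tagged_string())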
Example #4
def process_conll_doc(input_file_name, output_file_name):

    columns = {
        0: 'text',
        1: 'nero',
        2: 'nme',
        3: 'wiki',
    }
    with open(input_file_name,
              "r") as input_file, open(output_file_name, "w+") as output_file:
        doc = None
        docs = []
        spos = 0

        for line in input_file:
            if "DOCSTART" in line:
                if doc is None:
                    doc = Sentence()
                else:
                    docs.append(doc)
                    doc = Sentence()
                    spos = 0
            else:
                lsplit = line.split("\t")
                #print(lsplit)
                token = Token(lsplit[0].strip())
                for c in columns:
                    if c != 0:
                        if c < len(lsplit):
                            token.add_tag(columns[c], lsplit[c].strip())
                token.start_pos = spos
                token.end_pos = spos + len(token.text)
                spos = token.end_pos + 1
                doc.add_token(token)

        # the loop above only appends a document when the next DOCSTART line
        # appears, so the final document has to be added here
        if doc is not None and len(doc.tokens) > 0:
            docs.append(doc)

        for d in docs:

            myjson = {
                "text": unidecode.unidecode(d.to_tokenized_string()),
                "spans": []
            }
            res = requests.post(NEURAL_EL_SERVER, json=myjson)
            info = res.json()
            #print(info)
            for i in info:
                entity_ran = range(i[0], i[0] + i[1])
                #print(i[2] + " " + str(entity_ran))
                for t in d.tokens:
                    #print(t.text + " " + str(t.start_pos))
                    if t.start_position in entity_ran:
                        #print("found tag")
                        t.add_tag("pnme", i[2])

            for t in d:
                output_file.write(
                    t.text + "\t" + t.get_tag("nero").value + "\t" +
                    t.get_tag("nme").value + "\t" +
                    unidecode.unidecode(t.get_tag("wiki").value) + "\t" +
                    t.get_tag("pnme").value + "\n")
Example #5
def tag_it(token: Token, index, ner_spans):
    labels = [(start, end, label) for start, end, label in ner_spans if start <= index <= end]

    if len(labels) > 0:
        for start, end, label in labels:
            token.add_tag(TAG_TYPE, prefix_to_BIOES(label, start, end, index))
    else:
        token.add_tag(TAG_TYPE, 'O')
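
prefix_to_BIOES is referenced above but not shown; a plausible sketch of such a helper, assuming ner_spans carries inclusive (start, end) token indices as the range check above suggests:

def prefix_to_BIOES(label, start, end, index):
    # single-token span -> S-, first token -> B-, last token -> E-, otherwise I-
    if start == end:
        return 'S-' + label
    if index == start:
        return 'B-' + label
    if index == end:
        return 'E-' + label
    return 'I-' + label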
Example #6
def test_sentence_to_tagged_string():
    token1 = Token('I', 0)
    token2 = Token('love', 1, 0)
    token3 = Token('Berlin', 2, 1)
    token3.add_tag('ner', 'LOC')
    sentence = Sentence()
    sentence.add_token(token1)
    sentence.add_token(token2)
    sentence.add_token(token3)
    assert ('I love Berlin <LOC>' == sentence.to_tagged_string())
Example #7
    def read_column_data(path_to_column_file: Path,
                         column_name_map: Dict[int, str],
                         infer_whitespace_after: bool = True):
        """
        Reads a file in column format and produces a list of Sentence with token-level annotation as specified in the
        column_name_map. For instance, by passing "{0: 'text', 1: 'pos', 2: 'np', 3: 'ner'}" as column_name_map you
        specify that the first column is the text (lexical value) of the token, the second the PoS tag, the third
        the chunk and the fourth the NER tag.
        :param path_to_column_file: the path to the column file
        :param column_name_map: a map of column number to token annotation name
        :param infer_whitespace_after: if True, tries to infer whitespace_after field for Token
        :return: list of sentences
        """
        sentences: List[Sentence] = []

        try:
            lines: List[str] = open(str(path_to_column_file), encoding='utf-8').read().strip().split('\n')
        except:
            log.info('UTF-8 can\'t read: {} ... using "latin-1" instead.'.format(path_to_column_file))
            lines: List[str] = open(str(path_to_column_file), encoding='latin1').read().strip().split('\n')

        # most data sets have the token text in the first column, if not, pass 'text' as column
        text_column: int = 0
        for column in column_name_map:
            if column_name_map[column] == 'text':
                text_column = column

        sentence: Sentence = Sentence()
        for line in lines:

            if line.startswith('#'):
                continue

            if line.strip().replace('\ufeff', '') == '':
                if len(sentence) > 0:
                    sentence.infer_space_after()
                    sentences.append(sentence)
                sentence: Sentence = Sentence()

            else:
                fields: List[str] = re.split(r"\s+", line)
                token = Token(fields[text_column])
                for column in column_name_map:
                    if len(fields) > column:
                        if column != text_column:
                            token.add_tag(column_name_map[column], fields[column])

                sentence.add_token(token)

        if len(sentence.tokens) > 0:
            sentence.infer_space_after()
            sentences.append(sentence)

        return sentences
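
A short usage sketch following the column_name_map format described in the docstring; the path is hypothetical and the function is assumed to be callable as written:

from pathlib import Path

column_name_map = {0: 'text', 1: 'pos', 2: 'np', 3: 'ner'}
sentences = read_column_data(Path('data/conll03/train.txt'), column_name_map)
for token in sentences[0]:
    print(token.text, token.get_tag('ner').value)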
Example #8
def read_group_file(path_to_file, entities):
    sentences: List[Sentence] = []
    for line in open(path_to_file):
        sentence: Sentence = Sentence()
        labels_data, text = line.rstrip().split('\t')
        labels, tokens = data_to_bio(labels_data, text, entities)
        for label, token in zip(labels, tokens):
            token = Token(token)
            token.add_tag('ner', label)
            sentence.add_token(token)
        sentences.append(sentence)
    return sentences
Example #9
def test_sentence_to_tagged_string():
    token1 = Token("I", 0)
    token2 = Token("love", 1, 0)
    token3 = Token("Berlin", 2, 1)
    token3.add_tag("ner", "LOC")

    sentence: Sentence = Sentence()
    sentence.add_token(token1)
    sentence.add_token(token2)
    sentence.add_token(token3)

    assert "I love Berlin <LOC>" == sentence.to_tagged_string()
Example #10
    def read_column_data(path_to_column_file: str,
                         column_name_map: Dict[int, str]):
        """
        Reads a file in column format and produces a list of Sentence with token-level annotation as specified in the
        column_name_map. For instance, by passing "{0: 'text', 1: 'pos', 2: 'np', 3: 'ner'}" as column_name_map you
        specify that the first column is the text (lexical value) of the token, the second the PoS tag, the third
        the chunk and the fourth the NER tag.
        :param path_to_column_file: the path to the column file
        :param column_name_map: a map of column number to token annotation name
        :return: list of sentences
        """
        sentences: List[Sentence] = []

        lines: List[str] = open(path_to_column_file).read().strip().split('\n')

        # most data sets have the token text in the first column, if not, pass 'text' as column
        text_column: int = 0
        for column in column_name_map:
            if column_name_map[column] == 'text':
                text_column = column

        sentence: Sentence = Sentence()
        for line in lines:

            if line.startswith('#'):
                continue

            if line == '':
                if len(sentence) > 0:
                    sentences.append(sentence)
                sentence: Sentence = Sentence()

            else:
                fields: List[str] = re.split(r"\s+", line)
                token = Token(fields[text_column])
                for column in column_name_map:
                    if len(fields) > column:
                        if column != text_column:
                            token.add_tag(column_name_map[column],
                                          fields[column])
                sentence.add_token(token)

        if len(sentence.tokens) > 0:
            sentences.append(sentence)

        return sentences
Example #11
def standoff_to_flair_sents(
        docs: List[Document],
        tokenizer: Tokenizer,
        verbose=False) -> Tuple[List[Sentence], List[ParsedDoc]]:
    sents, parsed_docs = standoff_to_sents(docs=docs,
                                           tokenizer=tokenizer,
                                           verbose=verbose)

    flair_sents = []
    for sent in sents:
        flair_sent = Sentence()
        for token in sent:
            tok = Token(token.text)
            tok.add_tag(tag_type='ner', tag_value=token.label)
            flair_sent.add_token(tok)
        flair_sents.append(flair_sent)

    return flair_sents, parsed_docs
Example #12
    def __getitem__(self, index: int = 0) -> Sentence:

        if self.in_memory:
            sentence = self.sentences[index]

        else:
            with open(str(self.path_to_column_file),
                      encoding=self.encoding) as file:
                file.seek(self.indices[index])
                line = file.readline()
                sentence: Sentence = Sentence()
                while line:
                    if self.comment_symbol is not None and line.startswith(
                            self.comment_symbol):
                        line = file.readline()
                        continue

                    if self.__line_completes_sentence(line):
                        if len(sentence) > 0:
                            sentence.infer_space_after()
                            if self.tag_to_bioes is not None:
                                sentence.convert_tag_scheme(
                                    tag_type=self.tag_to_bioes,
                                    target_scheme="iobes")
                            return sentence

                    else:
                        fields: List[str] = re.split(r"[\t\n]", line)
                        token = Token(fields[self.text_column])
                        for column in self.column_name_map:
                            if len(fields) > column:
                                if column != self.text_column:
                                    token.add_tag(self.column_name_map[column],
                                                  fields[column])

                        if not line.isspace():
                            sentence.add_token(token)

                    line = file.readline()
        return sentence
Example #13
    def read_conll_sequence_labeling_data(path_to_conll_file: str):

        sentences: List[Sentence] = []

        lines: List[str] = open(path_to_conll_file). \
            read().strip().split('\n')

        sentence: Sentence = Sentence()
        for line in lines:

            if line == '':
                if len(sentence) > 0:
                    sentences.append(sentence)
                sentence: Sentence = Sentence()
            else:
                # print(line)
                fields: List[str] = re.split(r"\s+", line)
                token = Token(fields[0])
                token.add_tag('pos', fields[1])
                token.add_tag('np', fields[2])
                if len(fields) > 3:
                    token.add_tag('ner', fields[3])
                sentence.add_token(token)

        if len(sentence.tokens) > 0:
            sentences.append(sentence)

        return sentences
Example #14
    def read_conll_ud(path_to_conll_file: str) -> List[Sentence]:
        """
        Reads a file in CoNLL-U format and produces a list of Sentence with full morphosyntactic annotation
        :param path_to_conll_file: the path to the conll-u file
        :return: list of sentences
        """
        sentences: List[Sentence] = []

        lines: List[str] = open(path_to_conll_file, encoding='utf-8'). \
            read().strip().split('\n')

        sentence: Sentence = Sentence()
        for line in lines:

            fields: List[str] = re.split(r"\s+", line)
            if line == '':
                if len(sentence) > 0:
                    sentences.append(sentence)
                sentence: Sentence = Sentence()

            elif line.startswith('#'):
                continue
            elif '.' in fields[0]:
                continue
            elif '-' in fields[0]:
                continue
            else:
                token = Token(fields[1], head_id=int(fields[6]))
                token.add_tag('lemma', str(fields[2]))
                token.add_tag('upos', str(fields[3]))
                token.add_tag('pos', str(fields[4]))
                token.add_tag('dependency', str(fields[7]))

                for morph in str(fields[5]).split('|'):
                    if "=" not in morph:
                        continue
                    token.add_tag(morph.split('=')[0].lower(), morph.split('=')[1])

                if len(fields) > 10 and str(fields[10]) == 'Y':
                    token.add_tag('frame', str(fields[11]))

                sentence.add_token(token)

        if len(sentence.tokens) > 0: sentences.append(sentence)

        return sentences
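
For orientation, a hedged sketch of how the CoNLL-U fields above line up; the sample line and path are made up:

# A CoNLL-U data line has ten whitespace-separated fields:
#   0=ID  1=FORM  2=LEMMA  3=UPOS  4=XPOS  5=FEATS  6=HEAD  7=DEPREL  8=DEPS  9=MISC
# e.g. "1  Haus  Haus  NOUN  NN  Case=Nom|Gender=Neut|Number=Sing  0  root  _  _"
sentences = read_conll_ud('data/ud_german/train.conllu')  # hypothetical path
token = sentences[0].tokens[0]
print(token.text, token.get_tag('upos').value, token.get_tag('dependency').value)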
Example #15
    def train(self, intent_fst) -> None:
        from flair.data import Sentence, Token
        from flair.models import SequenceTagger, TextClassifier
        from flair.embeddings import (
            FlairEmbeddings,
            StackedEmbeddings,
            DocumentRNNEmbeddings,
        )
        from flair.data import TaggedCorpus
        from flair.trainers import ModelTrainer

        # Directory to look for downloaded embeddings
        cache_dir = self.profile.read_path(
            self.profile.get("intent.flair.cache_dir", "flair/cache")
        )

        os.makedirs(cache_dir, exist_ok=True)

        # Directory to store generated models
        data_dir = self.profile.write_path(
            self.profile.get("intent.flair.data_dir", "flair/data")
        )

        if os.path.exists(data_dir):
            shutil.rmtree(data_dir)

        self.embeddings = self.profile.get("intent.flair.embeddings", [])
        assert len(self.embeddings) > 0, "No word embeddings"

        # Create directories to write training data to
        class_data_dir = os.path.join(data_dir, "classification")
        ner_data_dir = os.path.join(data_dir, "ner")
        os.makedirs(class_data_dir, exist_ok=True)
        os.makedirs(ner_data_dir, exist_ok=True)

        # Convert FST to training data
        class_data_path = os.path.join(class_data_dir, "train.txt")
        ner_data_path = os.path.join(ner_data_dir, "train.txt")

        # { intent: [ { 'text': ..., 'entities': { ... } }, ... ] }
        sentences_by_intent: Dict[str, Any] = {}

        # Get sentences for training
        do_sampling = self.profile.get("intent.flair.do_sampling", True)
        start_time = time.time()

        if do_sampling:
            # Sample from each intent FST
            num_samples = int(self.profile.get("intent.flair.num_samples", 10000))
            intent_map_path = self.profile.read_path(
                self.profile.get("training.intent.intent_map", "intent_map.json")
            )

            with open(intent_map_path, "r") as intent_map_file:
                intent_map = json.load(intent_map_file)

            # Gather FSTs for all known intents
            fsts_dir = self.profile.write_dir(
                self.profile.get("speech_to_text.fsts_dir")
            )

            intent_fst_paths = {
                intent_id: os.path.join(fsts_dir, f"{intent_id}.fst")
                for intent_id in intent_map.keys()
            }

            # Generate samples
            self._logger.debug(
                f"Generating {num_samples} sample(s) from {len(intent_fst_paths)} intent(s)"
            )

            sentences_by_intent = sample_sentences_by_intent(
                intent_fst_paths, num_samples
            )
        else:
            # Exhaustively generate all sentences
            self._logger.debug(
                "Generating all possible sentences (may take a long time)"
            )
            sentences_by_intent = make_sentences_by_intent(intent_fst)

        sentence_time = time.time() - start_time
        self._logger.debug(f"Generated sentences in {sentence_time} second(s)")

        # Get least common multiple in order to balance sentences by intent
        lcm_sentences = lcm(*(len(sents) for sents in sentences_by_intent.values()))

        # Generate examples
        class_sentences = []
        ner_sentences: Dict[str, List[Sentence]] = defaultdict(list)
        for intent_name, intent_sents in sentences_by_intent.items():
            num_repeats = max(1, lcm_sentences // len(intent_sents))
            for intent_sent in intent_sents:
                # Only train an intent classifier if there's more than one intent
                if len(sentences_by_intent) > 1:
                    # Add balanced copies
                    for i in range(num_repeats):
                        class_sent = Sentence(labels=[intent_name])
                        for word in intent_sent["tokens"]:
                            class_sent.add_token(Token(word))

                        class_sentences.append(class_sent)

                if len(intent_sent["entities"]) == 0:
                    continue  # no entities, no sequence tagger

                # Named entity recognition (NER) example
                token_idx = 0
                entity_start = {ev["start"]: ev for ev in intent_sent["entities"]}
                entity_end = {ev["end"]: ev for ev in intent_sent["entities"]}
                entity = None

                word_tags = []
                for word in intent_sent["tokens"]:
                    # Determine tag label
                    tag = "O" if not entity else f"I-{entity}"
                    if token_idx in entity_start:
                        entity = entity_start[token_idx]["entity"]
                        tag = f"B-{entity}"

                    word_tags.append((word, tag))

                    # word ner
                    token_idx += len(word) + 1

                    if (token_idx - 1) in entity_end:
                        entity = None

                # Add balanced copies
                for i in range(num_repeats):
                    ner_sent = Sentence()
                    for word, tag in word_tags:
                        token = Token(word)
                        token.add_tag("ner", tag)
                        ner_sent.add_token(token)

                    ner_sentences[intent_name].append(ner_sent)

        # Start training
        max_epochs = int(self.profile.get("intent.flair.max_epochs", 100))

        # Load word embeddings
        self._logger.debug(f"Loading word embeddings from {cache_dir}")
        word_embeddings = [
            FlairEmbeddings(os.path.join(cache_dir, "embeddings", e))
            for e in self.embeddings
        ]

        if len(class_sentences) > 0:
            self._logger.debug("Training intent classifier")

            # Random 80/10/10 split
            class_train, class_dev, class_test = self._split_data(class_sentences)
            class_corpus = TaggedCorpus(class_train, class_dev, class_test)

            # Intent classification
            doc_embeddings = DocumentRNNEmbeddings(
                word_embeddings,
                hidden_size=512,
                reproject_words=True,
                reproject_words_dimension=256,
            )

            classifier = TextClassifier(
                doc_embeddings,
                label_dictionary=class_corpus.make_label_dictionary(),
                multi_label=False,
            )

            self._logger.debug(
                f"Intent classifier has {len(class_sentences)} example(s)"
            )
            trainer = ModelTrainer(classifier, class_corpus)
            trainer.train(class_data_dir, max_epochs=max_epochs)
        else:
            self._logger.info("Skipping intent classifier training")

        if len(ner_sentences) > 0:
            self._logger.debug(f"Training {len(ner_sentences)} NER sequence tagger(s)")

            # Named entity recognition
            stacked_embeddings = StackedEmbeddings(word_embeddings)

            for intent_name, intent_ner_sents in ner_sentences.items():
                ner_train, ner_dev, ner_test = self._split_data(intent_ner_sents)
                ner_corpus = TaggedCorpus(ner_train, ner_dev, ner_test)

                tagger = SequenceTagger(
                    hidden_size=256,
                    embeddings=stacked_embeddings,
                    tag_dictionary=ner_corpus.make_tag_dictionary(tag_type="ner"),
                    tag_type="ner",
                    use_crf=True,
                )

                ner_intent_dir = os.path.join(ner_data_dir, intent_name)
                os.makedirs(ner_intent_dir, exist_ok=True)

                self._logger.debug(
                    f"NER tagger for {intent_name} has {len(intent_ner_sents)} example(s)"
                )
                trainer = ModelTrainer(tagger, ner_corpus)
                trainer.train(ner_intent_dir, max_epochs=max_epochs)
        else:
            self._logger.info("Skipping NER sequence tagger training")
Example #16
def process_conll_doc(input_file_name, output_file_name, ner_model,
                      with_disambiguation, sim_level_disambig):

    nertagger = SequenceTagger.load(ner_model)
    columns = {
        0: 'text',
        1: 'nero',
        2: 'nme',
        3: 'wiki',
    }
    with open(input_file_name,
              "r") as input_file, open(output_file_name, "w+") as output_file:
        doc = None
        docs = []
        spos = 0

        for line in input_file:
            if "DOCSTART" in line:
                if doc is None:
                    doc = Sentence()
                else:
                    docs.append(doc)
                    doc = Sentence()
                    spos = 0
            else:
                lsplit = line.split("\t")
                #print(lsplit)
                token = Token(lsplit[0].strip())
                for c in columns:
                    if c != 0:
                        if c < len(lsplit):
                            token.add_tag(columns[c], lsplit[c].strip())
                token.start_pos = spos
                token.end_pos = spos + len(token.text)
                spos = token.end_pos + 1
                doc.add_token(token)

        # the loop above only appends a document when the next DOCSTART line
        # appears, so the final document has to be added here
        if doc is not None and len(doc.tokens) > 0:
            docs.append(doc)

        for d in docs:
            nertagger.predict(d)

            spans = []
            for nerspan in d.get_spans('ner'):
                start = nerspan.start_pos
                length = nerspan.end_pos - nerspan.start_pos
                spans.append({"start": start, "length": length})

            myjson = {
                "text": unidecode.unidecode(d.to_tokenized_string()),
                "spans": spans
            }

            res = requests.post(NEURAL_EL_SERVER, json=myjson)
            info = res.json()

            for nerspan in d.get_spans('ner'):
                for i in info:
                    if i[0] == nerspan.start_pos:
                        for t in nerspan.tokens:
                            t.add_tag("pnme", i[2])
                        break

            if with_disambiguation:
                searcher = load_disambiguation()
                for nerspan in d.get_spans('ner'):
                    if "pnme" not in nerspan.tokens[0].tags:
                        #print("calling with " + nerspan.text)
                        r = searcher.search(nerspan.text.lower(),
                                            sim_level_disambig)
                        #print(r)
                        if len(r) > 0:
                            d_tag = unidecode.unidecode(
                                (string.capwords(r[0]) +
                                 "_(disambiguation)").replace(" ", "_"))
                            for t2 in nerspan.tokens:
                                t2.add_tag("pnme", d_tag)

            for t in d:
                output_file.write(
                    t.text + "\t" + t.get_tag("nero").value + "\t" +
                    t.get_tag("nme").value + "\t" +
                    unidecode.unidecode(t.get_tag("wiki").value) + "\t" +
                    t.get_tag("pnme").value + "\n")
Example #17
    def create_sentlist_from_file_batchmax(self,
                                           data,
                                           maxlen=64,
                                           compare_column="cat"):
        """
        takes a pandas dataframe with columns 'tok' and 'sentstart' and creates a list of flair Sentence objects with tags.
        Each flair Sentence object may contain several real sentences, but at most maxlen tokens.
        The Sentence object stops at a sentence boundary, so it is often shorter than maxlen.
        Sentences longer than maxlen are split!
        If a line with token value "EOF" is encountered, a shorter flair Sentence object is returned,
        so no file boundaries are crossed
        :param data: pandas dataframe with columns 'tok' and 'sentstart' (and the compare_column, if any)
        :param maxlen: maximum number of tokens per flair Sentence object
        :param compare_column: name of the column holding the tag for each token; pass "NaN" to use a dummy tag
        :return: list of flair Sentence objects
        """
        sent_list = []
        toklist = []
        catlist = []
        # the len_last_token is needed to add proper start/end pos for each sentence token
        len_last_token = 0
        # track the sentence that is currently being processed
        curr_sentence_tok = []
        curr_sentence_cat = []
        for index, row in data.iterrows():
            tok = str(row["tok"])
            if compare_column != "NaN":
                cat = str(row[compare_column])
            else:
                cat = "-"

            # if the current token is "EOF" this marks the end of sample file
            # chunks may not cross file boundaries, therefore end the sentence here in any case
            if tok == "EOF":
                # do not add this token to any list
                # merge toklist and curr_sentence_tok list to get all current tokens
                # and create a flair sentence
                toklist.extend(curr_sentence_tok)
                catlist.extend(curr_sentence_cat)
                self.logger.debug(
                    "create chunk at EOF with (len: {}): {}".format(
                        len(toklist), toklist))
                self.logger.debug("catlist with (len: {}): {}".format(
                    len(catlist), catlist))
                sent = Sentence()
                for i, tok in enumerate(toklist):
                    flair_tok = Token(str(tok), start_position=len_last_token)
                    len_last_token += len(tok) + 1
                    flair_tok.add_tag("cat", catlist[i])
                    sent.add_token(flair_tok)
                if len(sent.tokens) > 0:
                    sent_list.append(sent)
                len_last_token = 0
                toklist = []
                catlist = []
                # reset the curr sent lists as well
                curr_sentence_tok = []
                curr_sentence_cat = []

            else:
                # if we are at the start of a new sentence, add the contents of curr_sentence_tok
                # and curr_sentence_cat to the main lists and start a new curr_sentence
                if row["sentstart"] == "yes":
                    toklist.extend(curr_sentence_tok)
                    catlist.extend(curr_sentence_cat)
                    curr_sentence_tok = [tok]
                    curr_sentence_cat = [cat]
                else:
                    curr_sentence_tok.append(tok)
                    curr_sentence_cat.append(cat)

                # if the combined length of toklist and curr_sentence_tok is > maxlen now,
                # create a flair sentence with the tokens in toklist and reset it
                # the remaining tokens in curr_sentence_tok are saved for the next chunk
                if len(toklist) + len(curr_sentence_tok) > maxlen:
                    # if toklist is empty at this point, we have a sentence > maxlen
                    # and must split it. The last token currently in curr_sentence will
                    # be preserved for later so that the chunk is not too long
                    if len(toklist) == 0:
                        toklist.extend(curr_sentence_tok[0:-1])
                        catlist.extend(curr_sentence_cat[0:-1])
                        curr_sentence_tok = [curr_sentence_tok[-1]]
                        curr_sentence_cat = [curr_sentence_cat[-1]]
                        self.logger.debug(
                            "Sentence is split (len: {}): {}".format(
                                len(toklist), toklist))

                    self.logger.debug("create chunk with (len: {}): {}".format(
                        len(toklist), toklist))
                    self.logger.debug("catlist with (len: {}): {}".format(
                        len(catlist), catlist))
                    sent = Sentence()
                    for i, tok in enumerate(toklist):
                        flair_tok = Token(str(tok),
                                          start_position=len_last_token)
                        len_last_token += len(tok) + 1
                        flair_tok.add_tag("cat", str(catlist[i]))
                        sent.add_token(flair_tok)
                    if len(sent.tokens) > 0:
                        sent_list.append(sent)
                    len_last_token = 0
                    toklist = []
                    catlist = []

        self.logger.debug("toklist: {}, curr_sent_tok: {}".format(
            len(toklist), len(curr_sentence_tok)))
        # if the loop is complete, empty the buffers and add them to the list
        if len(curr_sentence_tok) > 0:
            toklist.extend(curr_sentence_tok)
            catlist.extend(curr_sentence_cat)
            sent = Sentence()
            for i, tok in enumerate(toklist):
                flair_tok = Token(str(tok), start_position=len_last_token)
                len_last_token += len(tok) + 1
                flair_tok.add_tag("cat", str(catlist[i]))
                sent.add_token(flair_tok)
            if len(sent.tokens) > 0:
                sent_list.append(sent)
            len_last_token = 0

        return sent_list
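
A sketch of the expected input for the chunker above; 'preparer' is a hypothetical instance of the enclosing class, and the rows are made up:

import pandas as pd

data = pd.DataFrame({
    "tok":       ["Der", "Hund", "bellt", ".", "EOF"],
    "sentstart": ["yes", "no", "no", "no", "no"],
    "cat":       ["O", "B-ANIMAL", "O", "O", "-"],
})
chunks = preparer.create_sentlist_from_file_batchmax(data, maxlen=64, compare_column="cat")
print(len(chunks))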
Example #18
def process_conll_doc(input_file_name, output_file_name, ner_model,
                      with_disambiguation, sim_level_disambig):

    nertagger = SequenceTagger.load(ner_model)
    columns = {
        0: 'text',
        1: 'nero',
        2: 'nme',
        3: 'wiki',
    }
    with open(input_file_name,
              "r") as input_file, open(output_file_name, "w+") as output_file:
        doc = None
        docs = []
        spos = 0

        for line in input_file:
            if "DOCSTART" in line:
                if doc is None:
                    doc = Sentence()
                else:
                    docs.append(doc)
                    doc = Sentence()
                    spos = 0
            else:
                lsplit = line.split("\t")
                #print(lsplit)
                token = Token(lsplit[0].strip())
                for c in columns:
                    if c != 0:
                        if c < len(lsplit):
                            token.add_tag(columns[c], lsplit[c].strip())
                token.start_pos = spos
                token.end_pos = spos + len(token.text)
                spos = token.end_pos + 1
                doc.add_token(token)

        # the loop above only appends a document when the next DOCSTART line
        # appears, so the final document has to be added here
        if doc is not None and len(doc.tokens) > 0:
            docs.append(doc)

        for d in docs:
            nertagger.predict(d)

            centity = []
            newsent = []
            for token in d:
                #print(token)
                nertag = token.get_tag("ner").value
                #print(token.text + " " + nertag)
                if nertag[0:2] in ['B-', 'S-']:
                    if len(centity) != 0:
                        newsent.append("<entity>" + " ".join(centity) +
                                       "</entity>")
                        centity = []
                    centity.append(token.text)
                if nertag[0:2] in ['E-', 'I-']:
                    centity.append(token.text)
                if nertag == "O":
                    if len(centity) != 0:
                        newsent.append("<entity>" + " ".join(centity) +
                                       "</entity>")
                        centity = []
                    newsent.append(token.text)
            sent_for_ag = " ".join(newsent)
            agres = ag.disambiguate(sent_for_ag)

            for entity in d.get_spans('ner'):
                for r in agres:
                    if r["namedEntity"] == entity.text:
                        for t in entity.tokens:
                            t.add_tag("pnme", r["disambiguatedURL"])
                        break

            if with_disambiguation:
                searcher = load_disambiguation()
                for nerspan in d.get_spans('ner'):
                    if "pnme" not in nerspan.tokens[0].tags:
                        #print("calling with " + nerspan.text)
                        r = searcher.search(nerspan.text.lower(),
                                            sim_level_disambig)
                        #print(r)
                        if len(r) > 0:
                            d_tag = unidecode.unidecode(
                                (string.capwords(r[0]) +
                                 "_(disambiguation)").replace(" ", "_"))
                            for t2 in nerspan.tokens:
                                t2.add_tag("pnme", d_tag)

            for t in d:
                output_file.write(
                    t.text + "\t" + t.get_tag("nero").value + "\t" +
                    t.get_tag("nme").value + "\t" +
                    unidecode.unidecode(t.get_tag("wiki").value) + "\t" +
                    t.get_tag("pnme").value + "\n")
Example #19
    def test_check_input(self):
        """
        Test for check_input function
        """
        phone_sigs = [
            'cell', 'Cell', 'phone', 'Phone', 'Phone/fax', 'phone/fax',
            'Phone/Fax'
        ]
        fax_sigs = ['Fax', 'fax']

        # Check for email address
        sentence = Sentence()
        token = Token('hello')
        tag = 'S-email_id'
        token.add_tag('ner', tag)
        sentence.add_token(token)
        app.check_input(sentence)
        return_val = sentence[0].get_tag('ner').value
        self.assertNotEqual(return_val, tag)

        token = Token('*****@*****.**')
        sentence.add_token(token)
        app.check_input(sentence)
        return_val = sentence[1].get_tag('ner').value
        self.assertEqual(return_val, tag)

        token = Token('*****@*****.**')
        sentence.add_token(token)
        app.check_input(sentence)
        return_val = sentence[2].get_tag('ner').value
        self.assertNotEqual(return_val, tag)

        # Check for phone number
        for sig in phone_sigs:
            sentence = Sentence()
            token = Token(sig)
            tag = 'S-phone'
            token.add_tag('ner', tag)
            sentence.add_token(token)
            token = Token('123-456-7890')
            sentence.add_token(token)
            app.check_input(sentence)
            return_val = sentence[0].get_tag('ner').value
            self.assertNotEqual(return_val, tag)
            return_val = sentence[1].get_tag('ner').value
            self.assertEqual(return_val, tag)

        # Check for fax number
        for sig in fax_sigs:
            sentence = Sentence()
            token = Token(sig)
            tag = 'S-fax'
            token.add_tag('ner', tag)
            sentence.add_token(token)
            token = Token('123-456-7890')
            sentence.add_token(token)
            app.check_input(sentence)
            return_val = sentence[0].get_tag('ner').value
            self.assertNotEqual(return_val, tag)
            return_val = sentence[1].get_tag('ner').value
            self.assertEqual(return_val, tag)

        # Check for zipcode
        num = ''
        for i in range(10):
            num += str(i)
            sentence = Sentence()
            token = Token(num)
            tag = 'S-zipcode'
            sentence.add_token(token)
            app.check_input(sentence)
            return_val = sentence[0].get_tag('ner').value
            if len(num) == 5:
                self.assertEqual(return_val, tag)
            else:
                self.assertNotEqual(return_val, tag)
Example #20
    def __init__(
        self,
        path_to_column_file: Path,
        column_name_map: Dict[int, str],
        tag_to_bioes: str = None,
        comment_symbol: str = None,
        in_memory: bool = True,
        document_separator_token: str = None,
        encoding: str = "utf-8",
    ):
        """
        Instantiates a column dataset (typically used for sequence labeling or word-level prediction).

        :param path_to_column_file: path to the file with the column-formatted data
        :param column_name_map: a map specifying the column format
        :param tag_to_bioes: whether to convert to BIOES tagging scheme
        :param comment_symbol: if set, lines that begin with this symbol are treated as comments
        :param in_memory: If set to True, the dataset is kept in memory as Sentence objects, otherwise does disk reads
        :param document_separator_token: If provided, multiple sentences are read into one object. Provide the string token
        that indicates that a new document begins
        """
        assert path_to_column_file.exists()
        self.path_to_column_file = path_to_column_file
        self.tag_to_bioes = tag_to_bioes
        self.column_name_map = column_name_map
        self.comment_symbol = comment_symbol
        self.document_separator_token = document_separator_token

        # store either Sentence objects in memory, or only file offsets
        self.in_memory = in_memory
        if self.in_memory:
            self.sentences: List[Sentence] = []
        else:
            self.indices: List[int] = []

        self.total_sentence_count: int = 0

        # most data sets have the token text in the first column, if not, pass 'text' as column
        self.text_column: int = 0
        for column in self.column_name_map:
            if column_name_map[column] == "text":
                self.text_column = column

        # determine encoding of text file
        self.encoding = encoding

        sentence: Sentence = Sentence()
        with open(str(self.path_to_column_file), encoding=self.encoding) as f:

            line = f.readline()
            position = 0

            while line:

                if self.comment_symbol is not None and line.startswith(
                        comment_symbol):
                    line = f.readline()
                    continue

                if self.__line_completes_sentence(line):

                    if len(sentence) > 0:

                        sentence.infer_space_after()
                        if self.in_memory:
                            if self.tag_to_bioes is not None:
                                sentence.convert_tag_scheme(
                                    tag_type=self.tag_to_bioes,
                                    target_scheme="iobes")
                            self.sentences.append(sentence)
                        else:
                            self.indices.append(position)
                            position = f.tell()
                        self.total_sentence_count += 1
                    sentence: Sentence = Sentence()

                else:
                    fields: List[str] = re.split(r"[\t\n]", line)
                    token = Token(fields[self.text_column])
                    for column in column_name_map:
                        if len(fields) > column:
                            if column != self.text_column:
                                token.add_tag(self.column_name_map[column],
                                              fields[column])

                    if not line.isspace():
                        sentence.add_token(token)

                line = f.readline()

        if len(sentence.tokens) > 0:
            sentence.infer_space_after()
            if self.in_memory:
                self.sentences.append(sentence)
            else:
                self.indices.append(position)
            self.total_sentence_count += 1
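
A hedged instantiation sketch; the class name ColumnDataset and the file path are assumptions about the surrounding code this __init__ appears to belong to:

from pathlib import Path

dataset = ColumnDataset(  # hypothetical name of the enclosing class
    path_to_column_file=Path('data/conll03/train.txt'),
    column_name_map={0: 'text', 1: 'pos', 2: 'np', 3: 'ner'},
    tag_to_bioes='ner',
    in_memory=False,
)
print(dataset.total_sentence_count)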
Example #21
    def read_conll_ud(path_to_conll_file: Path) -> List[Sentence]:
        """
        Reads a file in CoNLL-U format and produces a list of Sentence with full morphosyntactic annotation
        :param path_to_conll_file: the path to the conll-u file
        :return: list of sentences
        """
        sentences: List[Sentence] = []

        lines: List[str] = open(path_to_conll_file,
                                encoding="utf-8").read().strip().split("\n")

        sentence: Sentence = Sentence()
        for line in lines:

            fields: List[str] = re.split(r"\t+", line)
            if line == "":
                if len(sentence) > 0:
                    sentences.append(sentence)
                sentence: Sentence = Sentence()

            elif line.startswith("#"):
                continue
            elif "." in fields[0]:
                continue
            elif "-" in fields[0]:
                continue
            else:
                token = Token(fields[1], head_id=int(fields[6]))
                token.add_tag("lemma", str(fields[2]))
                token.add_tag("upos", str(fields[3]))
                token.add_tag("pos", str(fields[4]))
                token.add_tag("dependency", str(fields[7]))

                for morph in str(fields[5]).split("|"):
                    if "=" not in morph:
                        continue
                    token.add_tag(
                        morph.split("=")[0].lower(),
                        morph.split("=")[1])

                if len(fields) > 10 and str(fields[10]) == "Y":
                    token.add_tag("frame", str(fields[11]))

                sentence.add_token(token)

        if len(sentence.tokens) > 0:
            sentences.append(sentence)

        return sentences
Example #22
    def read_conll_ud(path_to_conll_file: str) -> List[Sentence]:
        sentences: List[Sentence] = []

        lines: List[str] = open(path_to_conll_file, encoding='utf-8'). \
            read().strip().split('\n')

        sentence: Sentence = Sentence()
        for line in lines:

            fields: List[str] = re.split(r"\s+", line)
            if line == '':
                if len(sentence) > 0:
                    sentences.append(sentence)
                sentence: Sentence = Sentence()

            elif line.startswith('#'):
                continue
            elif '.' in fields[0]:
                continue
            elif '-' in fields[0]:
                continue
            else:
                token = Token(fields[1], head_id=int(fields[6]))
                token.add_tag('lemma', str(fields[2]))
                token.add_tag('upos', str(fields[3]))
                token.add_tag('pos', str(fields[4]))
                token.add_tag('dependency', str(fields[7]))

                for morph in str(fields[5]).split('|'):
                    if "=" not in morph:
                        continue
                    token.add_tag(
                        morph.split('=')[0].lower(),
                        morph.split('=')[1])

                if len(fields) > 10 and str(fields[10]) == 'Y':
                    token.add_tag('frame', str(fields[11]))

                sentence.add_token(token)

        if len(sentence.tokens) > 0: sentences.append(sentence)

        return sentences