Example 1
    def _split_known_tokens(self):
        """Split multi-word known tokens into one token per word and prepend
        IOB prefixes: "B-" for the first slice, "I-" for the rest."""
        # TODO: Remove the IOB functionalities, make optional
        known_tokens = []
        for k_token in self.known_tokens:
            sliced_text = k_token._text.split()

            if len(sliced_text) == 1:
                k_token.tag = "B-" + k_token.tag
                known_tokens.append(k_token)
                continue

            # Split the multi-word token into one token per whitespace slice
            split_tokens = create_tokens_from_slices(
                text=k_token._text,
                tag=k_token.tag,
                ordered_slices=sliced_text,
                offset_step=k_token._init_index)

            split_tokens[0].tag = "B-" + split_tokens[0].tag
            for t in split_tokens[1:]:
                t.tag = "I-" + t.tag

            known_tokens += split_tokens

        self.known_tokens = sort_tokens(known_tokens)
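
For reference, a minimal standalone sketch of the IOB convention the method applies (the entity text and the LOC tag are made up for illustration; this is not the project's API):

    # The first slice of a multi-word entity gets a "B-" prefix, the rest "I-".
    slices = "Rio de Janeiro".split()
    tags = ["B-LOC" if i == 0 else "I-LOC" for i in range(len(slices))]
    print(list(zip(slices, tags)))
    # [('Rio', 'B-LOC'), ('de', 'I-LOC'), ('Janeiro', 'I-LOC')]
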
Example 2
    def test_fit_known_tokens_create_token_to_the_right_and_left_between_2_tokens(
            self):
        # Create a known token that starts inside the first word ('Te|xto')
        # and ends inside the second ('par|a')
        text = 'xto par'
        init = self._text.find(text)
        end = init + len(text) - 1
        known_token = Token(text, init, end, "teste")

        # Only the first sentence will be used
        text = self._st1._text

        pipeline = NerCorpusPipeline(text, [known_token])
        pipeline.apply_processing_rules()

        word_tokens_after = pipeline.word_tokens

        # Ensure each word token's span matches its text
        for token in word_tokens_after:
            self.assertTrue(self._text[token._init_index:token._end_index +
                                       1] == token._text)

        # Ensure each known token's span matches its text
        for token in pipeline.known_tokens:
            self.assertTrue(self._text[token._init_index:token._end_index +
                                       1] == token._text)

        #self.assertTrue(len(word_tokens_after) == 4)
        self.assertTrue(
            set(['Te', 'xto', 'par', 'a', 'teste', '.']) == set([
                t._text for t in utils.sort_tokens(pipeline.known_tokens +
                                                   word_tokens_after)
            ]))
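
A worked trace of the spans this test exercises, assuming the fixture's first sentence is 'Texto para teste.' as the asserted token set suggests:

    text = 'Texto para teste.'
    span = 'xto par'
    init = text.find(span)      # 2 -> starts inside 'Texto'
    end = init + len(span) - 1  # 8 -> ends inside 'para'
    # Fitting trims the overlapped words around the known token:
    # 'Texto' -> 'Te' + 'xto', 'para' -> 'par' + 'a',
    # which yields the token texts {'Te', 'xto', 'par', 'a', 'teste', '.'}
    # asserted above.
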
Example 3
    def save_conll_file(self, save_path):
        """Write one "<text> <tag>" line per token, grouped by sentence, to save_path."""
        with open(save_path, "w") as f:
            for i, sentence in enumerate(self.sentences_tokens):
                # TODO: Create property to get the word tokens and known tokens together
                # TODO: Use auxiliary library for conll file manipulation
                # TODO: Build a smarter loop
                # TODO: Ensure all tokens were written - sanity test
                for token in sort_tokens(self.word_tokens + self.known_tokens):
                    if sentence.enclosing(token._init_index):
                        f.write(token._text + " " + token.tag + "\n")
                if i != len(self.sentences_tokens) - 1:
                    f.write("\n")
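
The loop writes one "<token text> <tag>" pair per line and a blank line between sentences, so the output follows the usual two-column CoNLL layout. For example, a known token 'Rio de Janeiro' tagged LOC would come out as shown below (illustrative; word tokens are written the same way with whatever their tag attribute holds):

    Rio B-LOC
    de I-LOC
    Janeiro I-LOC
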
Example 4
    def _ensure_sentence_enclosing(self):
        """Ensure every known token is fully enclosed by a single sentence,
        merging sentences when a known token spans a sentence boundary."""
        for token in self.known_tokens:
            sentence_start = [
                sentence for sentence in self.sentences_tokens
                if sentence.enclosing(token._init_index)
            ]
            sentence_end = [
                sentence for sentence in self.sentences_tokens
                if sentence.enclosing(token._end_index)
            ]

            # TODO: Create exceptions for asserts
            assert (len(sentence_start) == 1)
            assert (len(sentence_end) == 1)

            sentence_start = sentence_start[0]
            sentence_end = sentence_end[0]

            # Does the token end in the same sentence it started in?
            if sentence_start is sentence_end:
                continue

            # Merge sentences
            merged_sentence = Token(
                self._text[sentence_start._init_index:sentence_end._end_index +
                           1],
                sentence_start._init_index,
                sentence_end._end_index,
                tag='sentence')

            inside_flag = False
            sentences = []
            for sentence in self.sentences_tokens:
                # Skip the sentences from start to end; they are replaced by the merged one
                if sentence is sentence_start:
                    inside_flag = True
                elif sentence is sentence_end:
                    inside_flag = False
                elif not inside_flag:
                    sentences.append(sentence)

            sentences.append(merged_sentence)
            self.sentences_tokens = sort_tokens(sentences)
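
A sketch of the case this handles, reusing the Token(text, init, end, tag) constructor from the tests above (the sentence and spans are hypothetical):

    # Suppose the sentence splitter broke "S. Paulo" across two sentences:
    text = "Mora em S. Paulo desde 2001."
    sent1 = Token("Mora em S.", 0, 9, tag='sentence')
    sent2 = Token("Paulo desde 2001.", 11, 27, tag='sentence')
    entity = Token("S. Paulo", 8, 15, "LOC")
    # entity starts inside sent1 and ends inside sent2, so after
    # _ensure_sentence_enclosing the pipeline keeps a single merged sentence
    # Token covering indices 0..27 in place of sent1 and sent2.
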
Example 5
    def _fit_known_tokens(self):
        """
        Reserves the space of the known tokens by trimming out their regions in the words.
        E.i. if a word occupies the space of a known token 
        """

        for k_token in self.known_tokens:

            inside_flag = False
            token_span = []

            # TODO: Create a utility function for finding all the tokens in a specified span - apply it everywhere this pattern occurs
            for w_token in self.word_tokens:
                if w_token.enclosing(k_token._init_index):
                    inside_flag = True
                    token_span.append(w_token)
                    # The known token starts and ends inside the same word token
                    if w_token.enclosing(k_token._end_index):
                        break

                elif w_token.enclosing(k_token._end_index):
                    inside_flag = False
                    token_span.append(w_token)

                elif inside_flag:
                    token_span.append(w_token)

            for token in [
                    self._trim_token(token, k_token._init_index,
                                     k_token._end_index)
                    for token in token_span
            ]:
                if token:
                    self.word_tokens += token

            # TODO: Create a utility function for deleting all the tokens in a specified span - apply it everywhere this pattern occurs
            for token in token_span:
                self.word_tokens.remove(token)

            self.word_tokens = sort_tokens(self.word_tokens)
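
A minimal sketch of the span-finding helper the first TODO above asks for; the name find_tokens_in_span is an assumption, and it relies only on the Token.enclosing(index) predicate already used in _fit_known_tokens:

    def find_tokens_in_span(tokens, init_index, end_index):
        """Return the tokens (assumed sorted by position) that overlap [init_index, end_index]."""
        selected = []
        inside = False
        for token in tokens:
            if token.enclosing(init_index):
                inside = True
                selected.append(token)
                if token.enclosing(end_index):
                    break
            elif token.enclosing(end_index):
                selected.append(token)
                break
            elif inside:
                selected.append(token)
        return selected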