def test_get_entities_with_offsets():
    original_string = '(Mo -x 1 T x ) 3 Sb 7 with \uf084 x 0.1'
    tokens = [
        '(', 'Mo', '-', 'x', '1', 'T', 'x', ')', '3', 'Sb', '7',
        'with', '\uf084', 'x', '0', '.', '1'
    ]
    tags = [
        'B-<formula>', 'I-<formula>', 'I-<formula>', 'I-<formula>', 'I-<formula>',
        'I-<formula>', 'I-<formula>', 'I-<formula>', 'I-<formula>', 'I-<formula>',
        'I-<formula>', 'O', 'O', 'B-<variable>', 'B-<value>', 'I-<value>', 'I-<value>'
    ]
    types = [tag.split('-')[-1] for tag in tags]
    offsets = [(0, 1), (1, 3), (4, 5), (5, 6), (7, 8), (9, 10), (11, 12), (13, 14),
               (15, 16), (17, 19), (20, 21), (22, 26), (27, 28), (29, 30), (31, 32),
               (32, 33), (33, 34)]
    spaces = [
        offsets[offsetIndex][1] != offsets[offsetIndex + 1][0]
        for offsetIndex in range(0, len(offsets) - 1)
    ]

    # sanity check: every offset pair must point back to its token in the original string
    for index in range(0, len(offsets)):
        chunk = original_string[offsets[index][0]:offsets[index][1]]
        assert chunk == tokens[index]

    entities_with_offsets = get_entities_with_offsets(tags, offsets)
    # each entity is (chunk_type, chunk_start, chunk_end, pos_start, pos_end)
    assert len(entities_with_offsets) == 3

    entity0 = entities_with_offsets[0]
    assert entity0[0] == "<formula>"
    entity_text = original_string[entity0[3]:entity0[4] + 1]
    assert entity_text == "(Mo -x 1 T x ) 3 Sb 7"
    assert tokens[entity0[1]:entity0[2]] == tokenizeAndFilter(entity_text)[0]

    entity1 = entities_with_offsets[1]
    assert entity1[0] == "<variable>"
    entity_text = original_string[entity1[3]:entity1[4] + 1]
    assert entity_text == "x"
    assert tokens[entity1[1]:entity1[2]] == tokenizeAndFilter(entity_text)[0]

    entity2 = entities_with_offsets[2]
    assert entity2[0] == "<value>"
    entity_text = original_string[entity2[3]:entity2[4] + 1]
    assert entity_text == "0.1"
    assert tokens[entity2[1]:entity2[2]] == tokenizeAndFilter(entity_text)[0]
def test_tokenizer_filter(self):
    input = 'this is a test, but a stupid test!!'
    output = tokenizeAndFilter(input)
    assert len(output) == 2
    assert output[0] == ['this', 'is', 'a', 'test', ',', 'but', 'a', 'stupid', 'test', '!', '!']
    assert output[1] == [(0, 4), (5, 7), (8, 9), (10, 14), (14, 15), (16, 19), (20, 21),
                         (22, 28), (29, 33), (33, 34), (34, 35)]
def iter_tag(
    self, texts, output_format, features=None, tag_transformed: bool = False
) -> Union[dict, Iterable[List[Tuple[str, str]]]]:
    assert isinstance(texts, list)

    dataset_transformer = self.dataset_transformer_factory()
    transformed_texts, transformed_features = dataset_transformer.fit_transform_x_and_features(
        texts, features)
    preds_concatenated_iterable = iter_predict_texts_with_sliding_window_if_enabled(
        texts=transformed_texts,
        features=transformed_features,
        model=self.model,
        model_config=self.model_config,
        preprocessor=self.preprocessor,
        max_sequence_length=self.max_sequence_length,
        input_window_stride=self.input_window_stride,
        embeddings=self.embeddings)

    for i, pred_item in enumerate(preds_concatenated_iterable):
        LOGGER.debug('pred_item.shape: %s', pred_item.shape)
        LOGGER.debug('pred_item=%r', pred_item)
        pred = [pred_item]
        text = texts[i]
        if tag_transformed:
            text = transformed_texts[i]

        if isinstance(text, str):
            tokens, offsets = tokenizeAndFilter(text)
        else:
            # it is a list of strings, i.e. already tokenized;
            # in this case offsets are not available and JSON output is not possible
            tokens = text
            offsets = []

        LOGGER.debug('tokens: %s', tokens)

        tags = self._get_tags(pred)
        if not tag_transformed:
            tags = dataset_transformer.inverse_transform_y([tags])[0]
        LOGGER.debug('tags: %s', tags)

        if output_format == 'json':
            prob = self._get_prob(pred)
            piece = {}
            piece["text"] = text
            piece["entities"] = self._build_json_response(
                tokens, tags, prob, offsets)["entities"]
            yield piece
        else:
            the_tags = list(zip(tokens, tags))
            yield the_tags
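# Hedged usage sketch, not part of the library code above: it only assumes an object exposing
# the iter_tag() generator with the signature shown above. The helper name, the `tagger`
# parameter and the printed fields are illustrative, grounded in the 'json' branch of iter_tag.
def print_streamed_entities(tagger, texts):
    # consume iter_tag lazily: one result per input text, without materializing the full list
    for result in tagger.iter_tag(texts, output_format='json'):
        print(result["text"])
        for entity in result.get("entities", []):
            print(entity)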
def test_tokenizer_filter_with_breaklines(self):
    input = '\nthis is yet \u2666 another, dummy... test,\na [stupid] test?!'
    output = tokenizeAndFilter(input)
    assert len(output) == 2
    assert output[0] == ['this', 'is', 'yet', '\u2666', 'another', ',', 'dummy', '.', '.', '.',
                         'test', ',', 'a', '[', 'stupid', ']', 'test', '?', '!']
    assert output[1] == [(1, 5), (6, 8), (9, 12), (13, 14), (15, 22), (22, 23), (24, 29),
                         (29, 30), (30, 31), (31, 32), (33, 37), (37, 38), (39, 40), (41, 42),
                         (42, 48), (48, 49), (50, 54), (54, 55), (55, 56)]
def tag(self, texts, output_format, features=None):
    assert isinstance(texts, list)

    if output_format == 'json':
        res = {
            "software": "DeLFT",
            "date": datetime.datetime.now().isoformat(),
            "model": self.model_config.model_name,
            "texts": []
        }
    else:
        list_of_tags = []

    to_tokeniz = False
    if len(texts) > 0 and isinstance(texts[0], str):
        to_tokeniz = True

    if 'bert' in self.model_config.model_type.lower():
        preds = self.model.predict(texts, fold_id=-1)
        for i in range(0, len(preds)):
            pred = preds[i]
            text = texts[i]

            if isinstance(text, str):
                tokens, offsets = tokenizeAndFilter(text)
            else:
                # it is a list of strings, i.e. already tokenized;
                # in this case offsets are not available and JSON output is not possible
                tokens = text
                offsets = []

            tags = pred
            prob = None

            if output_format == 'json':
                piece = {}
                piece["text"] = text
                piece["entities"] = self._build_json_response(
                    text, tokens, tags, prob, offsets)["entities"]
                res["texts"].append(piece)
            else:
                the_tags = list(zip(tokens, tags))
                list_of_tags.append(the_tags)
    else:
        predict_generator = DataGenerator(
            texts, None,
            batch_size=self.model_config.batch_size,
            preprocessor=self.preprocessor,
            char_embed_size=self.model_config.char_embedding_size,
            max_sequence_length=self.model_config.max_sequence_length,
            embeddings=self.embeddings,
            tokenize=to_tokeniz,
            shuffle=False,
            features=features)

        nb_workers = 6
        multiprocessing = True
        # multiple workers will not work with ELMo due to GPU memory limit (with GTX 1080Ti 11GB)
        if self.embeddings.use_ELMo:
            # 0 workers means the prediction will be executed in the main thread
            nb_workers = 0
            multiprocessing = False

        steps_done = 0
        steps = len(predict_generator)
        for generator_output in predict_generator:
            if steps_done == steps:
                break
            preds = self.model.predict_on_batch(generator_output[0])
            for i in range(0, len(preds)):
                pred = [preds[i]]
                text = texts[i + (steps_done * self.model_config.batch_size)]

                if to_tokeniz:
                    tokens, offsets = tokenizeAndFilter(text)
                else:
                    # it is a list of strings, i.e. already tokenized;
                    # in this case offsets are not available and JSON output is not possible
                    tokens = text
                    offsets = []

                tags = self._get_tags(pred)
                prob = self._get_prob(pred)

                if output_format == 'json':
                    piece = {}
                    piece["text"] = text
                    piece["entities"] = self._build_json_response(
                        text, tokens, tags, prob, offsets)["entities"]
                    res["texts"].append(piece)
                else:
                    the_tags = list(zip(tokens, tags))
                    list_of_tags.append(the_tags)
            steps_done += 1

    if output_format == 'json':
        return res
    else:
        return list_of_tags
def tag(self, texts, output_format, features=None):
    if output_format == 'json':
        res = {
            "software": "DeLFT",
            "date": datetime.datetime.now().isoformat(),
            "model": self.model_config.model_name,
            "texts": []
        }
    else:
        list_of_tags = []

    to_tokeniz = False
    if len(texts) > 0 and isinstance(texts[0], str):
        to_tokeniz = True

    # dirty fix warning! in the particular case of using the tf-addons CRF layer with a
    # single sequence in the input batch, a tensor shape error can happen in the CRF
    # viterbi_decoding loop. To prevent this, we add a dummy second sequence to the batch
    # and discard its prediction afterwards.
    dummy_case = False
    if self.model_config.use_crf and not self.model_config.use_chain_crf and len(texts) == 1:
        if features is None:
            if to_tokeniz:
                texts.append(texts[0])
            else:
                texts.append(["dummy"])
        else:
            texts.append(texts[0])
            # add a dummy feature vector for the dummy sequence
            features.append(features[0])
        dummy_case = True
    # end of dirty fix

    generator = self.model.get_generator()
    predict_generator = generator(
        texts, None,
        batch_size=self.model_config.batch_size,
        preprocessor=self.preprocessor,
        bert_preprocessor=self.transformer_preprocessor,
        char_embed_size=self.model_config.char_embedding_size,
        max_sequence_length=self.model_config.max_sequence_length,
        embeddings=self.embeddings,
        tokenize=to_tokeniz,
        shuffle=False,
        features=features,
        output_input_offsets=True,
        use_chain_crf=self.model_config.use_chain_crf)

    steps_done = 0
    steps = len(predict_generator)
    for generator_output in predict_generator:
        if dummy_case and steps_done == 1:
            break
        if steps_done == steps:
            break

        if isinstance(predict_generator, DataGeneratorTransformers):
            # the model uses transformer embeddings, so we need the input offsets to realign
            # the predicted labels with the input tokens.
            # we remove the last vector of the generator output (the marked token offsets):
            # it is not expected by the model, but we need it to restore the labels correctly
            # (they are produced according to the wordpiece sub-tokenization, not the expected
            # tokenization)
            data = generator_output[0]
            input_offsets = data[-1]
            data = data[:-1]

            y_pred_batch = self.model.predict_on_batch(data)

            # results have been produced by a model using a transformer layer, so:
            # - the labels are sparse (integers), not one-hot encoded
            # - wordpiece-level labels must be mapped back to labels for normal tokens,
            #   using the offsets provided by the generator
            new_y_pred_batch = []
            for y_pred_text, offsets_text in zip(y_pred_batch, input_offsets):
                new_y_pred_text = []
                # this is the result per sequence, realign labels:
                for q in range(len(offsets_text)):
                    if offsets_text[q][0] == 0 and offsets_text[q][1] == 0:
                        # special token
                        continue
                    if offsets_text[q][0] != 0:
                        # added sub-token
                        continue
                    new_y_pred_text.append(y_pred_text[q])
                new_y_pred_batch.append(new_y_pred_text)
            preds = new_y_pred_batch
        else:
            # no transformer-specific handling needed on the input
            preds = self.model.predict_on_batch(generator_output[0])

        for i in range(0, len(preds)):
            pred = [preds[i]]
            text = texts[i + (steps_done * self.model_config.batch_size)]

            if to_tokeniz:
                tokens, offsets = tokenizeAndFilter(text)
            else:
                # it is a list of strings, i.e. already tokenized;
                # in this case offsets are not available and JSON output is not possible
                tokens = text
                offsets = []

            if not self.model_config.use_crf or self.model_config.use_chain_crf:
                tags = self._get_tags(pred)
                prob = self._get_prob(pred)
            else:
                tags = self._get_tags_sparse(pred)
                prob = self._get_prob_sparse(pred)

            if output_format == 'json':
                piece = {}
                piece["text"] = text
                piece["entities"] = self._build_json_response(
                    text, tokens, tags, prob, offsets)["entities"]
                res["texts"].append(piece)
            else:
                the_tags = list(zip(tokens, tags))
                list_of_tags.append(the_tags)
        steps_done += 1

    if output_format == 'json':
        return res
    else:
        return list_of_tags
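# Hedged usage sketch, not part of the library code above: it assumes a trained wrapper object
# exposing the tag() method with the signature shown above. The helper name, the `model`
# parameter and the serialization choices are illustrative; the structure of the returned dict
# ("texts" list with per-text "entities") is taken from the tag() implementation.
import json

def tag_to_json_string(model, texts):
    # batch tagging with JSON output, then pretty-print the whole response
    result = model.tag(texts, output_format='json')
    return json.dumps(result, indent=4, ensure_ascii=False)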
def load_data_and_labels_json_offsets(jsonCorpus, tokenizer=None):
    """
    Load data and labels from a JSON corpus where annotations are expressed with character
    offsets. This requires a tokenizer passed as parameter. If tokenizer is None, the generic
    Indo-European tokenizer is used.

    Note: the input file can be gzipped or not.

    {
        "lang": "en",
        "level": "sentence",
        "documents": [
            {
                "id": "10.1371/journal.pone.0198300",
                "body_text": [
                    {
                        "text": "The test was designed so that bacteria were collected at 1 hour and 6 hours after start time on each day of testing.",
                        "annotation_spans": [
                            {
                                "start": 30,
                                "end": 38,
                                "text": "bacteria",
                                "type": "dataset",
                                "datatype": "Tabular Data:Sample Table"
                            }
                        ]
                    }
                ]
            }
        ]
    }

    Returns:
        tuple(numpy array, numpy array): data and labels
    """
    if not os.path.exists(jsonCorpus):
        print("Invalid path file: ", jsonCorpus)
        return None, None

    all_tokens = []
    all_labels = []

    if jsonCorpus.endswith(".gz"):
        corpus_file = gzip.open(jsonCorpus, "rt")
    else:
        corpus_file = open(jsonCorpus, "rt")

    jsonDocuments = json.load(corpus_file)
    if "documents" in jsonDocuments:
        for jsonDocument in jsonDocuments["documents"]:
            if "body_text" in jsonDocument:
                for text_piece in jsonDocument["body_text"]:
                    if "text" in text_piece:
                        tokens = []
                        labels = []
                        text = text_piece["text"]
                        local_tokens, local_offsets = tokenizeAndFilter(text)

                        spans = []
                        if "annotation_spans" in text_piece:
                            for annotation_span in text_piece["annotation_spans"]:
                                local_type = None
                                if "type" in annotation_span:
                                    local_type = annotation_span["type"]
                                    local_type = local_type.replace(" ", "_")
                                spans.append([
                                    annotation_span["start"],
                                    annotation_span["end"],
                                    local_type
                                ])

                        i = 0
                        for local_token in local_tokens:
                            tokens.append(local_token)
                            offset = local_offsets[i]
                            found = False
                            for span in spans:
                                # token overlaps the annotation span: assign a BIO label
                                if span[0] <= offset[0] and (offset[1] <= span[1] or offset[0] < span[1]):
                                    if span[0] == offset[0]:
                                        labels.append("B-" + span[2])
                                    else:
                                        labels.append("I-" + span[2])
                                    found = True
                                    break
                            if not found:
                                labels.append("O")
                            i += 1

                        all_tokens.append(tokens)
                        all_labels.append(labels)

    corpus_file.close()

    final_tokens = np.asarray(all_tokens)
    final_labels = np.asarray(all_labels)

    return final_tokens, final_labels
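# Hedged usage sketch, not part of the library code above: loading a JSON corpus with offset
# annotations and inspecting the first labelled sequence. The corpus path is a placeholder.
def inspect_first_sequence(corpus_path="corpus/annotated_corpus.json.gz"):
    x_all, y_all = load_data_and_labels_json_offsets(corpus_path)
    if x_all is None:
        return
    # x_all[i] is the token list of sequence i, y_all[i] the aligned BIO labels
    for token, label in zip(x_all[0], y_all[0]):
        print(token, label)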
def test_tokenize_and_filter(self):
    tokens, offsets = tokenizeAndFilter("this is a simple text")
    assert len(tokens) == len(offsets)
    assert len(tokens) == 5
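# Hedged usage sketch, not part of the library code above: recovering the surface form of each
# token from the character offsets returned by tokenizeAndFilter(), the same property the
# offset-based tests above rely on. The helper name is illustrative.
def surface_forms(text):
    tokens, offsets = tokenizeAndFilter(text)
    # each (start, end) offset slices the original text back to its token
    return [text[start:end] for (start, end) in offsets]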