def test_get_entities_with_offsets():
    original_string = '(Mo -x 1 T x ) 3 Sb 7 with \uf084 x 0.1'
    tokens = [
        '(', 'Mo', '-', 'x', '1', 'T', 'x', ')', '3', 'Sb', '7',
        'with', '\uf084', 'x', '0', '.', '1'
    ]
    tags = [
        'B-<formula>', 'I-<formula>', 'I-<formula>', 'I-<formula>', 'I-<formula>',
        'I-<formula>', 'I-<formula>', 'I-<formula>', 'I-<formula>', 'I-<formula>',
        'I-<formula>', 'O', 'O', 'B-<variable>', 'B-<value>', 'I-<value>', 'I-<value>'
    ]
    types = [tag.split('-')[-1] for tag in tags]
    offsets = [(0, 1), (1, 3), (4, 5), (5, 6), (7, 8), (9, 10), (11, 12), (13, 14),
               (15, 16), (17, 19), (20, 21), (22, 26), (27, 28), (29, 30), (31, 32),
               (32, 33), (33, 34)]
    spaces = [
        offsets[offsetIndex][1] != offsets[offsetIndex + 1][0]
        for offsetIndex in range(0, len(offsets) - 1)
    ]

    # sanity check: every offset pair must point back to its token in the original string
    for index in range(0, len(offsets)):
        chunk = original_string[offsets[index][0]:offsets[index][1]]
        assert chunk == tokens[index]

    entities_with_offsets = get_entities_with_offsets(tags, offsets)
    # each entity is (chunk_type, chunk_start, chunk_end, pos_start, pos_end)
    assert len(entities_with_offsets) == 3

    entity0 = entities_with_offsets[0]
    assert entity0[0] == "<formula>"
    entity_text = original_string[entity0[3]:entity0[4] + 1]
    assert entity_text == "(Mo -x 1 T x ) 3 Sb 7"
    assert tokens[entity0[1]:entity0[2]] == tokenizeAndFilter(entity_text)[0]

    entity1 = entities_with_offsets[1]
    assert entity1[0] == "<variable>"
    entity_text = original_string[entity1[3]:entity1[4] + 1]
    assert entity_text == "x"
    assert tokens[entity1[1]:entity1[2]] == tokenizeAndFilter(entity_text)[0]

    entity2 = entities_with_offsets[2]
    assert entity2[0] == "<value>"
    entity_text = original_string[entity2[3]:entity2[4] + 1]
    assert entity_text == "0.1"
    assert tokens[entity2[1]:entity2[2]] == tokenizeAndFilter(entity_text)[0]
def test_tokenizer_filter(self):
    input = 'this is a test, but a stupid test!!'
    output = tokenizeAndFilter(input)
    assert len(output) == 2
    assert output[0] == ['this', 'is', 'a', 'test', ',', 'but', 'a', 'stupid', 'test', '!', '!']
    assert output[1] == [(0, 4), (5, 7), (8, 9), (10, 14), (14, 15), (16, 19), (20, 21),
                         (22, 28), (29, 33), (33, 34), (34, 35)]
def iter_tag(
    self, texts, output_format, features=None, tag_transformed: bool = False
) -> Union[dict, Iterable[List[Tuple[str, str]]]]:
    assert isinstance(texts, list)

    dataset_transformer = self.dataset_transformer_factory()
    transformed_texts, transformed_features = dataset_transformer.fit_transform_x_and_features(
        texts, features)
    preds_concatenated_iterable = iter_predict_texts_with_sliding_window_if_enabled(
        texts=transformed_texts,
        features=transformed_features,
        model=self.model,
        model_config=self.model_config,
        preprocessor=self.preprocessor,
        max_sequence_length=self.max_sequence_length,
        input_window_stride=self.input_window_stride,
        embeddings=self.embeddings)

    for i, pred_item in enumerate(preds_concatenated_iterable):
        LOGGER.debug('pred_item.shape: %s', pred_item.shape)
        LOGGER.debug('pred_item=%r', pred_item)
        pred = [pred_item]
        text = texts[i]
        if tag_transformed:
            text = transformed_texts[i]

        if isinstance(text, str):
            tokens, offsets = tokenizeAndFilter(text)
        else:
            # it is a list of strings, i.e. already tokenized;
            # in this case offsets are not available and JSON output is not possible
            tokens = text
            offsets = []

        LOGGER.debug('tokens: %s', tokens)

        tags = self._get_tags(pred)
        if not tag_transformed:
            tags = dataset_transformer.inverse_transform_y([tags])[0]
        LOGGER.debug('tags: %s', tags)

        if output_format == 'json':
            prob = self._get_prob(pred)
            piece = {}
            piece["text"] = text
            piece["entities"] = self._build_json_response(
                tokens, tags, prob, offsets)["entities"]
            yield piece
        else:
            the_tags = list(zip(tokens, tags))
            yield the_tags
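# Hedged usage sketch, not part of the library code above: it only assumes an object exposing
# the iter_tag() generator with the signature shown above. The helper name, the `tagger`
# parameter and the printed fields are illustrative, grounded in the 'json' branch of iter_tag.
def print_streamed_entities(tagger, texts):
    # consume iter_tag lazily: one result per input text, without materializing the full list
    for result in tagger.iter_tag(texts, output_format='json'):
        print(result["text"])
        for entity in result.get("entities", []):
            print(entity)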
def test_tokenizer_filter_with_breaklines(self):
    input = '\nthis is yet \u2666 another, dummy... test,\na [stupid] test?!'
    output = tokenizeAndFilter(input)
    assert len(output) == 2
    assert output[0] == ['this', 'is', 'yet', '\u2666', 'another', ',', 'dummy', '.', '.', '.',
                         'test', ',', 'a', '[', 'stupid', ']', 'test', '?', '!']
    assert output[1] == [(1, 5), (6, 8), (9, 12), (13, 14), (15, 22), (22, 23), (24, 29),
                         (29, 30), (30, 31), (31, 32), (33, 37), (37, 38), (39, 40), (41, 42),
                         (42, 48), (48, 49), (50, 54), (54, 55), (55, 56)]
def tag(self, texts, output_format, features=None):
    assert isinstance(texts, list)

    if output_format == 'json':
        res = {
            "software": "DeLFT",
            "date": datetime.datetime.now().isoformat(),
            "model": self.model_config.model_name,
            "texts": []
        }
    else:
        list_of_tags = []

    to_tokeniz = False
    if len(texts) > 0 and isinstance(texts[0], str):
        to_tokeniz = True

    if 'bert' in self.model_config.model_type.lower():
        preds = self.model.predict(texts, fold_id=-1)
        for i in range(0, len(preds)):
            pred = preds[i]
            text = texts[i]

            if isinstance(text, str):
                tokens, offsets = tokenizeAndFilter(text)
            else:
                # it is a list of strings, i.e. already tokenized;
                # in this case offsets are not available and JSON output is not possible
                tokens = text
                offsets = []

            tags = pred
            prob = None

            if output_format == 'json':
                piece = {}
                piece["text"] = text
                piece["entities"] = self._build_json_response(
                    text, tokens, tags, prob, offsets)["entities"]
                res["texts"].append(piece)
            else:
                the_tags = list(zip(tokens, tags))
                list_of_tags.append(the_tags)
    else:
        predict_generator = DataGenerator(
            texts, None,
            batch_size=self.model_config.batch_size,
            preprocessor=self.preprocessor,
            char_embed_size=self.model_config.char_embedding_size,
            max_sequence_length=self.model_config.max_sequence_length,
            embeddings=self.embeddings,
            tokenize=to_tokeniz,
            shuffle=False,
            features=features)

        nb_workers = 6
        multiprocessing = True
        # multiple workers will not work with ELMo due to GPU memory limit (with GTX 1080Ti 11GB)
        if self.embeddings.use_ELMo:
            # 0 workers means the prediction will be executed in the main thread
            nb_workers = 0
            multiprocessing = False

        steps_done = 0
        steps = len(predict_generator)
        for generator_output in predict_generator:
            if steps_done == steps:
                break
            preds = self.model.predict_on_batch(generator_output[0])
            for i in range(0, len(preds)):
                pred = [preds[i]]
                text = texts[i + (steps_done * self.model_config.batch_size)]

                if to_tokeniz:
                    tokens, offsets = tokenizeAndFilter(text)
                else:
                    # it is a list of strings, i.e. already tokenized;
                    # in this case offsets are not available and JSON output is not possible
                    tokens = text
                    offsets = []

                tags = self._get_tags(pred)
                prob = self._get_prob(pred)

                if output_format == 'json':
                    piece = {}
                    piece["text"] = text
                    piece["entities"] = self._build_json_response(
                        text, tokens, tags, prob, offsets)["entities"]
                    res["texts"].append(piece)
                else:
                    the_tags = list(zip(tokens, tags))
                    list_of_tags.append(the_tags)
            steps_done += 1

    if output_format == 'json':
        return res
    else:
        return list_of_tags
def tag(self, texts, output_format, features=None):
    if output_format == 'json':
        res = {
            "software": "DeLFT",
            "date": datetime.datetime.now().isoformat(),
            "model": self.model_config.model_name,
            "texts": []
        }
    else:
        list_of_tags = []

    to_tokeniz = False
    if len(texts) > 0 and isinstance(texts[0], str):
        to_tokeniz = True

    # dirty fix warning! in the particular case of using the tf-addons CRF layer with a
    # single sequence in the input batch, a tensor shape error can happen in the CRF
    # viterbi_decoding loop. To prevent this, we add a dummy second sequence to the batch
    # and discard its prediction afterwards.
    dummy_case = False
    if self.model_config.use_crf and not self.model_config.use_chain_crf and len(texts) == 1:
        if features is None:
            if to_tokeniz:
                texts.append(texts[0])
            else:
                texts.append(["dummy"])
        else:
            texts.append(texts[0])
            # add a dummy feature vector for the dummy sequence
            features.append(features[0])
        dummy_case = True
    # end of dirty fix

    generator = self.model.get_generator()
    predict_generator = generator(
        texts, None,
        batch_size=self.model_config.batch_size,
        preprocessor=self.preprocessor,
        bert_preprocessor=self.transformer_preprocessor,
        char_embed_size=self.model_config.char_embedding_size,
        max_sequence_length=self.model_config.max_sequence_length,
        embeddings=self.embeddings,
        tokenize=to_tokeniz,
        shuffle=False,
        features=features,
        output_input_offsets=True,
        use_chain_crf=self.model_config.use_chain_crf)

    steps_done = 0
    steps = len(predict_generator)
    for generator_output in predict_generator:
        if dummy_case and steps_done == 1:
            break
        if steps_done == steps:
            break

        if isinstance(predict_generator, DataGeneratorTransformers):
            # the model uses transformer embeddings, so we need the input offsets to realign
            # the predicted labels with the input tokens.
            # we remove the last vector of the generator output (the marked token offsets):
            # it is not expected by the model, but we need it to restore the labels correctly
            # (they are produced according to the wordpiece sub-tokenization, not the expected
            # tokenization)
            data = generator_output[0]
            input_offsets = data[-1]
            data = data[:-1]

            y_pred_batch = self.model.predict_on_batch(data)

            # results have been produced by a model using a transformer layer, so:
            # - the labels are sparse (integers), not one-hot encoded
            # - wordpiece-level labels must be mapped back to labels for normal tokens,
            #   using the offsets provided by the generator
            new_y_pred_batch = []
            for y_pred_text, offsets_text in zip(y_pred_batch, input_offsets):
                new_y_pred_text = []
                # this is the result per sequence, realign labels:
                for q in range(len(offsets_text)):
                    if offsets_text[q][0] == 0 and offsets_text[q][1] == 0:
                        # special token
                        continue
                    if offsets_text[q][0] != 0:
                        # added sub-token
                        continue
                    new_y_pred_text.append(y_pred_text[q])
                new_y_pred_batch.append(new_y_pred_text)
            preds = new_y_pred_batch
        else:
            # no transformer-specific handling needed on the input
            preds = self.model.predict_on_batch(generator_output[0])

        for i in range(0, len(preds)):
            pred = [preds[i]]
            text = texts[i + (steps_done * self.model_config.batch_size)]

            if to_tokeniz:
                tokens, offsets = tokenizeAndFilter(text)
            else:
                # it is a list of strings, i.e. already tokenized;
                # in this case offsets are not available and JSON output is not possible
                tokens = text
                offsets = []

            if not self.model_config.use_crf or self.model_config.use_chain_crf:
                tags = self._get_tags(pred)
                prob = self._get_prob(pred)
            else:
                tags = self._get_tags_sparse(pred)
                prob = self._get_prob_sparse(pred)

            if output_format == 'json':
                piece = {}
                piece["text"] = text
                piece["entities"] = self._build_json_response(
                    text, tokens, tags, prob, offsets)["entities"]
                res["texts"].append(piece)
            else:
                the_tags = list(zip(tokens, tags))
                list_of_tags.append(the_tags)
        steps_done += 1

    if output_format == 'json':
        return res
    else:
        return list_of_tags
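# Hedged usage sketch, not part of the library code above: it assumes a trained wrapper object
# exposing the tag() method with the signature shown above. The helper name, the `model`
# parameter and the serialization choices are illustrative; the structure of the returned dict
# ("texts" list with per-text "entities") is taken from the tag() implementation.
import json

def tag_to_json_string(model, texts):
    # batch tagging with JSON output, then pretty-print the whole response
    result = model.tag(texts, output_format='json')
    return json.dumps(result, indent=4, ensure_ascii=False)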
def load_data_and_labels_json_offsets(jsonCorpus, tokenizer=None):
    """
    Load data and labels from a JSON corpus where annotations are expressed with character
    offsets. This requires a tokenizer passed as parameter. If tokenizer is None, the generic
    Indo-European tokenizer is used.

    Note: the input file can be gzipped or not.

    {
        "lang": "en",
        "level": "sentence",
        "documents": [
            {
                "id": "10.1371/journal.pone.0198300",
                "body_text": [
                    {
                        "text": "The test was designed so that bacteria were collected at 1 hour and 6 hours after start time on each day of testing.",
                        "annotation_spans": [
                            {
                                "start": 30,
                                "end": 38,
                                "text": "bacteria",
                                "type": "dataset",
                                "datatype": "Tabular Data:Sample Table"
                            }
                        ]
                    }
                ]
            }
        ]
    }

    Returns:
        tuple(numpy array, numpy array): data and labels
    """
    if not os.path.exists(jsonCorpus):
        print("Invalid path file: ", jsonCorpus)
        return None, None

    all_tokens = []
    all_labels = []

    if jsonCorpus.endswith(".gz"):
        corpus_file = gzip.open(jsonCorpus, "rt")
    else:
        corpus_file = open(jsonCorpus, "rt")

    jsonDocuments = json.load(corpus_file)
    if "documents" in jsonDocuments:
        for jsonDocument in jsonDocuments["documents"]:
            if "body_text" in jsonDocument:
                for text_piece in jsonDocument["body_text"]:
                    if "text" in text_piece:
                        tokens = []
                        labels = []
                        text = text_piece["text"]
                        local_tokens, local_offsets = tokenizeAndFilter(text)

                        spans = []
                        if "annotation_spans" in text_piece:
                            for annotation_span in text_piece["annotation_spans"]:
                                local_type = None
                                if "type" in annotation_span:
                                    local_type = annotation_span["type"]
                                    local_type = local_type.replace(" ", "_")
                                spans.append([
                                    annotation_span["start"],
                                    annotation_span["end"],
                                    local_type
                                ])

                        i = 0
                        for local_token in local_tokens:
                            tokens.append(local_token)
                            offset = local_offsets[i]
                            found = False
                            for span in spans:
                                # token overlaps the annotation span: assign a BIO label
                                if span[0] <= offset[0] and (offset[1] <= span[1] or offset[0] < span[1]):
                                    if span[0] == offset[0]:
                                        labels.append("B-" + span[2])
                                    else:
                                        labels.append("I-" + span[2])
                                    found = True
                                    break
                            if not found:
                                labels.append("O")
                            i += 1

                        all_tokens.append(tokens)
                        all_labels.append(labels)

    corpus_file.close()

    final_tokens = np.asarray(all_tokens)
    final_labels = np.asarray(all_labels)

    return final_tokens, final_labels
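# Hedged usage sketch, not part of the library code above: loading a JSON corpus with offset
# annotations and inspecting the first labelled sequence. The corpus path is a placeholder.
def inspect_first_sequence(corpus_path="corpus/annotated_corpus.json.gz"):
    x_all, y_all = load_data_and_labels_json_offsets(corpus_path)
    if x_all is None:
        return
    # x_all[i] is the token list of sequence i, y_all[i] the aligned BIO labels
    for token, label in zip(x_all[0], y_all[0]):
        print(token, label)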
def test_tokenize_and_filter(self):
    tokens, offsets = tokenizeAndFilter("this is a simple text")
    assert len(tokens) == len(offsets)
    assert len(tokens) == 5
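# Hedged usage sketch, not part of the library code above: recovering the surface form of each
# token from the character offsets returned by tokenizeAndFilter(), the same property the
# offset-based tests above rely on. The helper name is illustrative.
def surface_forms(text):
    tokens, offsets = tokenizeAndFilter(text)
    # each (start, end) offset slices the original text back to its token
    return [text[start:end] for (start, end) in offsets]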