def finetune(self, Xs, Y=None, batch_size=None):
    Xs, Y_new = indico_to_finetune_sequence(Xs, labels=Y, multi_label=self.multi_label, none_value="<PAD>")
    Y = Y_new if Y is not None else None
    return super().finetune(Xs, Y=Y, batch_size=batch_size)
def test_three_overlapping_labels(self):
    raw = ["Indico Is the best"]
    finetunex = [
        ["Indico ", "Is the", " best"]
    ]
    finetuney = [
        [("<PAD>",), ("1", "2", "3"), ("1", "3")]
    ]
    indicox_pred, indicoy_pred = finetune_to_indico_sequence(raw, finetunex, finetuney)
    indicoy = [
        [
            {'start': 7, 'end': 13, 'label': '2', 'text': 'Is the'},
            {'start': 7, 'end': 18, 'label': '1', 'text': 'Is the best'},
            {'start': 7, 'end': 18, 'label': '3', 'text': 'Is the best'}
        ]
    ]
    self.assertEqual(indicoy, indicoy_pred)
    self.assertEqual(raw, indicox_pred)

    finetunex_pred, finetuney_pred = indico_to_finetune_sequence(raw, indicoy)
    self.assertEqual(finetunex_pred, finetunex)
    self.assertCountEqual(finetuney[0][0], finetuney_pred[0][0])
    self.assertCountEqual(finetuney[0][1], finetuney_pred[0][1])
    self.assertCountEqual(finetuney[0][2], finetuney_pred[0][2])
def finetune(self, X, Y, batch_size=None):
    """
    :param X: A list of text snippets. Format: [batch_size]
    :param Y: A list of lists of annotations. Format: [batch_size, n_annotations], where each annotation
              is of the form: {'start': char_idx, 'end': char_idx, 'label': 'label'}
    :param batch_size: integer number of examples per batch. When N_GPUS > 1, this number
                       corresponds to the number of training examples provided to each GPU.
    :param val_size: Float fraction or int number that represents the size of the validation set.
    :param val_interval: The interval for which validation is performed, measured in number of steps.
    """
    X, Y = indico_to_finetune_sequence(X, Y, none_value="<PAD>")
    self.target_type = SEQUENCE_LABELING
    return self._finetune(X, Y, batch_size=batch_size)
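# --- Format sketch (not library code) ---
# A minimal, runnable illustration of the X / Y layout documented in the docstring
# above. The example text and labels are made up; only the annotation schema
# ({'start', 'end', 'label'}) comes from the docstring.
X_example = ["Alice flew to Paris."]
Y_example = [[
    {'start': 0, 'end': 5, 'label': 'person'},
    {'start': 14, 'end': 19, 'label': 'location'},
]]

# Each annotation's character offsets index directly into its document.
for doc, annotations in zip(X_example, Y_example):
    for ann in annotations:
        print(doc[ann['start']:ann['end']], '->', ann['label'])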
def finetune(self, X, Y=None, batch_size=None):
    """
    :param X: A list of text snippets. Format: [batch_size]
    :param Y: A list of lists of annotations. Format: [batch_size, n_annotations], where each annotation
              is of the form: {'start': 0, 'end': 5, 'label': 'class', 'text': 'sample text'}
    :param batch_size: integer number of examples per batch. When N_GPUS > 1, this number
                       corresponds to the number of training examples provided to each GPU.
    :param val_size: Float fraction or int number that represents the size of the validation set.
    :param val_interval: The interval for which validation is performed, measured in number of steps.
    """
    fit_language_model_only = (Y is None)
    X, Y = indico_to_finetune_sequence(X, Y, none_value="<PAD>")
    arr_encoded = self._text_to_ids(X, Y=Y)
    labels = None if fit_language_model_only else arr_encoded.labels
    return self._training_loop(arr_encoded, Y=labels, batch_size=batch_size)
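# --- Usage sketch (not library code) ---
# The branch above means Y=None triggers language-model-only fitting: no target
# labels are passed to the training loop. The `model` object below is a
# hypothetical instance exposing this finetune() method; only the Y=None
# behaviour is taken from the code above.
unlabelled_docs = ["Plain text with no annotations."]
model.finetune(unlabelled_docs)            # Y is None: fits the language model only
model.finetune(X_example, Y_example)       # with annotations: also fits the labeling targets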
def predict(self, X, max_length=None):
    """
    Produces the most likely sequence labels for each document, as determined by the fine-tuned model.

    :param X: A list / array of text, shape [batch]
    :param max_length: the number of tokens to be included in the document representation.
                       Providing more than `max_length` tokens as input will result in truncation.
    :returns: list of annotations per document.
    """
    doc_subseqs, _ = indico_to_finetune_sequence(X)
    arr_encoded = self._text_to_ids(doc_subseqs)
    labels = self._predict(doc_subseqs, max_length=max_length)
    all_subseqs = []
    all_labels = []
    for text, label_seq, position_seq in zip(X, labels, arr_encoded.char_locs):
        start_of_token = 0
        doc_subseqs = []
        doc_labels = []
        for label, position in zip(label_seq, position_seq):
            if position == -1:
                # indicates padding / special tokens
                continue
            # if there is no current subsequence
            # or the current subsequence has the wrong label
            if not doc_subseqs or label != doc_labels[-1]:
                # start new subsequence
                doc_subseqs.append(text[start_of_token:position])
                doc_labels.append(label)
            else:
                # continue appending to current subsequence
                doc_subseqs[-1] += text[start_of_token:position]
            start_of_token = position
        all_subseqs.append(doc_subseqs)
        all_labels.append(doc_labels)
    _, doc_annotations = finetune_to_indico_sequence(
        raw_texts=X,
        subseqs=all_subseqs,
        labels=all_labels,
        subtoken_predictions=self.config.subtoken_predictions
    )
    return doc_annotations
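# --- Stand-alone sketch of the span-merging loop above (not library code) ---
# Consecutive character positions that share a label are folded into a single
# subsequence. The token end-offsets and labels below are made up.
text = "Indico is great"
token_labels = ["ORG", "<PAD>", "<PAD>"]
token_ends = [6, 9, 15]        # character offset where each token's text ends

start_of_token = 0
doc_subseqs, doc_labels = [], []
for label, position in zip(token_labels, token_ends):
    if not doc_subseqs or label != doc_labels[-1]:
        doc_subseqs.append(text[start_of_token:position])
        doc_labels.append(label)
    else:
        doc_subseqs[-1] += text[start_of_token:position]
    start_of_token = position

print(doc_subseqs)   # ['Indico', ' is great']
print(doc_labels)    # ['ORG', '<PAD>']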
def predict_proba(self, X, max_length=None):
    """
    Produces per-token class probabilities as determined by the fine-tuned model.

    :param X: A list / array of text, shape [batch]
    :param max_length: the number of tokens to be included in the document representation.
                       Providing more than `max_length` tokens as input will result in truncation.
    :returns: list of (token, probability) pairs per document.
    """
    doc_subseqs, _ = indico_to_finetune_sequence(X)
    arr_encoded = self._text_to_ids_with_labels(doc_subseqs)
    batch_probas = self._predict_proba(X, max_length=max_length)
    result = []
    for token_seq, proba_seq in zip(arr_encoded.tokens, batch_probas):
        result.append(list(zip(token_seq, proba_seq)))
    return result
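# --- Shape sketch of the return value above (not library code) ---
# predict_proba() pairs each sub-token with its per-class probabilities; one list
# of (token, probabilities) pairs per input document. Numbers below are made up.
example_result = [
    [("Indico", [0.91, 0.06, 0.03]),
     ("is",     [0.12, 0.80, 0.08])],
]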
def predict(self, X, max_length=None):
    """
    Produces the most likely sequence labels for each document, as determined by the fine-tuned model.

    :param X: A list / array of text, shape [batch]
    :param max_length: the number of tokens to be included in the document representation.
                       Providing more than `max_length` tokens as input will result in truncation.
    :returns: list of annotations per document.
    """
    doc_subseqs, _ = indico_to_finetune_sequence(X)
    max_length = max_length or self.config.max_length
    chunk_size = max_length - 2
    step_size = chunk_size // 3
    arr_encoded = self._text_to_ids(doc_subseqs)
    labels = []
    batch_probas = []
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore")
        for xmb, mmb in self._infer_prep(doc_subseqs, max_length=max_length):
            output = self._eval(
                self.predict_op,
                feed_dict={
                    self.X: xmb,
                    self.M: mmb,
                    self.do_dropout: DROPOUT_OFF
                }
            )
            prediction, probas = output.get(self.predict_op)
            batch_probas.extend(probas)
            formatted_predictions = self.label_encoder.inverse_transform(prediction)
            labels.extend(formatted_predictions)

    all_subseqs = []
    all_labels = []
    all_probs = []

    doc_idx = -1
    for chunk_idx, (label_seq, position_seq, proba_seq) in enumerate(
            zip(labels, arr_encoded.char_locs, batch_probas)):
        start_of_doc = arr_encoded.token_ids[chunk_idx][0][0] == self.encoder.start
        end_of_doc = (
            chunk_idx + 1 >= len(arr_encoded.char_locs) or
            arr_encoded.token_ids[chunk_idx + 1][0][0] == self.encoder.start
        )
        # Chunk idx for prediction.  Dividers at `step_size` increments.
        # [  1  |  1  |  2  |  3  |  3  ]
        if start_of_doc:
            # if this is the first chunk in a document, start accumulating from scratch
            doc_subseqs = []
            doc_labels = []
            doc_probs = []
            doc_idx += 1
            prob_accum = 0
            start_of_token = 0
            if not end_of_doc:
                # predict only on the first two thirds
                label_seq, position_seq, proba_seq = (
                    label_seq[:step_size * 2],
                    position_seq[:step_size * 2],
                    proba_seq[:step_size * 2]
                )
        else:
            if end_of_doc:
                # predict on the rest of the sequence
                label_seq, position_seq, proba_seq = (
                    label_seq[step_size:],
                    position_seq[step_size:],
                    proba_seq[step_size:]
                )
            else:
                # predict only on the middle third
                label_seq, position_seq, proba_seq = (
                    label_seq[step_size:step_size * 2],
                    position_seq[step_size:step_size * 2],
                    proba_seq[step_size:step_size * 2]
                )

        for label, position, proba in zip(label_seq, position_seq, proba_seq):
            if position == -1:
                # indicates padding / special tokens
                continue
            # if there is no current subsequence
            # or the current subsequence has the wrong label
            if not doc_subseqs or label != doc_labels[-1]:
                # start new subsequence
                doc_subseqs.append(X[doc_idx][start_of_token:position])
                doc_labels.append(label)
                doc_probs.append([proba])
            else:
                # continue appending to current subsequence
                doc_subseqs[-1] += X[doc_idx][start_of_token:position]
                doc_probs[-1].append(proba)
            start_of_token = position

        if end_of_doc:
            # last chunk in a document
            prob_dicts = []
            for prob_seq in doc_probs:
                # format probabilities as dictionary
                probs = np.mean(np.vstack(prob_seq), axis=0)
                prob_dicts.append(dict(zip(self.label_encoder.classes_, probs)))

            all_subseqs.append(doc_subseqs)
            all_labels.append(doc_labels)
            all_probs.append(prob_dicts)

    _, doc_annotations = finetune_to_indico_sequence(
        raw_texts=X,
        subseqs=all_subseqs,
        labels=all_labels,
        probs=all_probs,
        subtoken_predictions=self.config.subtoken_predictions
    )
    return doc_annotations
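# --- Stand-alone sketch of the chunk-trimming rule above (not library code) ---
# Long documents are split into overlapping chunks. Assuming chunks advance by
# `step_size`, the first chunk keeps its first two thirds, the last chunk keeps
# everything past its first third, and interior chunks keep only their middle
# third, so each character position is predicted by exactly one chunk.
max_length = 11
chunk_size = max_length - 2            # room for start / end special tokens
step_size = chunk_size // 3            # = 3

def kept_slice(start_of_doc, end_of_doc):
    if start_of_doc:
        return slice(None) if end_of_doc else slice(None, step_size * 2)
    if end_of_doc:
        return slice(step_size, None)
    return slice(step_size, step_size * 2)

positions = list(range(chunk_size))
print(positions[kept_slice(True, False)])    # first chunk   -> [0, 1, 2, 3, 4, 5]
print(positions[kept_slice(False, False)])   # interior      -> [3, 4, 5]
print(positions[kept_slice(False, True)])    # last chunk    -> [3, 4, 5, 6, 7, 8]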