def test_nested_labels(self):
    """
    Round-trip a document in which label '1' covers the whole text while
    labels '2' and '3' are nested inside it on the middle chunk.
    """
    documents = ["Indico Is the best"]
    expected_chunks = [["Indico ", "Is the", " best"]]
    expected_labels = [[("1", ), ("1", "2", "3"), ("1", )]]
    text_encoder = GPTEncoder()

    # finetune format -> indico format
    documents_pred, annotations_pred = finetune_to_indico_sequence(
        documents, expected_chunks, expected_labels, none_value="<PAD>"
    )
    expected_annotations = [
        [
            dict(start=0, end=18, label='1', text='Indico Is the best'),
            dict(start=7, end=13, label='2', text='Is the'),
            dict(start=7, end=13, label='3', text='Is the'),
        ]
    ]
    self.assertEqual(expected_annotations, annotations_pred)
    self.assertEqual(documents, documents_pred)

    # indico format -> finetune format
    chunks_pred, labels_pred, *_ = indico_to_finetune_sequence(
        documents, expected_annotations, encoder=text_encoder, none_value="<PAD>"
    )
    self.assertEqual(chunks_pred, expected_chunks)
    # Label order within a chunk is not significant, hence assertCountEqual.
    for position, chunk_labels in enumerate(expected_labels[0]):
        self.assertCountEqual(chunk_labels, labels_pred[0][position])
def test_overlapping_labels_with_single_label(self):
    """
    Convert two annotations that overlap on the character 'o' (offsets 5-6)
    with multi_label=False and check the resulting chunks and labels.
    """
    documents = ["Indico Rules"]
    annotations = [
        [
            dict(start=0, end=6, label='1', text='Indico'),
            dict(start=5, end=12, label='2', text='o Rules'),
        ]
    ]
    expected_chunks = [["Indic", "o", " Rules"]]
    expected_labels = [["1", "1", "2"]]

    chunks_pred, labels_pred, *_ = indico_to_finetune_sequence(
        documents,
        annotations,
        encoder=GPTEncoder(),
        none_value="<PAD>",
        multi_label=False,
    )

    self.assertEqual(chunks_pred, expected_chunks)
    for position, chunk_label in enumerate(expected_labels[0]):
        self.assertCountEqual(chunk_label, labels_pred[0][position])
def finetune(self, Xs, Y=None, batch_size=None):
    """
    Convert indico-style annotations to the finetune sequence format and
    delegate to the parent class's ``finetune``.

    :param Xs: A list of strings.
    :param Y: A list of labels in the same format as sequence labeling, but
        with an optional additional field of the form:
        ```
            {
                ...
                "association": {
                    "index": a,
                    "relationship": relationship_name
                }
                ...
            }
        ```
        where index is the index of the relationship target into the label
        list and relationship_name is the type of the relationship.
    :param batch_size: Forwarded unchanged to the parent ``finetune``.
    :raises FinetuneError: if ``config.association_types`` has not been set.
    """
    if self.config.association_types is None:
        raise FinetuneError(
            "Please set config.association_types before calling finetune."
        )

    converted = indico_to_finetune_sequence(
        Xs,
        encoder=self.input_pipeline.text_encoder,
        labels=Y,
        multi_label=False,
        none_value=self.config.pad_token,
    )
    token_texts, token_labels, assoc_types, assoc_targets, token_idxs = converted

    # Targets are only assembled when the caller actually supplied labels.
    targets = None
    if Y is not None:
        targets = list(zip(token_labels, assoc_types, assoc_targets, token_idxs))

    return super().finetune(token_texts, Y=targets, batch_size=batch_size)
def test_overlapping_gpt2_subtokens(self):
    """
    Round-trip two partially overlapping annotations through both converters
    using GPT2Encoder and subtoken_predictions=True.
    """
    documents = ["Indico Is the best hey"]
    expected_chunks = [["Indico", " Is the", " best", " hey"]]
    # NOTE(review): the last entry is a plain string, not a 1-tuple like its
    # neighbours (the original spelled it ("<PAD>") without a trailing comma).
    expected_labels = [[("1", ), ("1", "2"), ("2", ), "<PAD>"]]
    text_encoder = GPT2Encoder()

    # finetune format -> indico format
    documents_pred, annotations_pred = finetune_to_indico_sequence(
        documents,
        expected_chunks,
        expected_labels,
        none_value="<PAD>",
        subtoken_predictions=True,
    )
    expected_annotations = [
        [
            dict(start=0, end=13, label='1', text='Indico Is the'),
            dict(start=6, end=18, label='2', text=' Is the best'),
        ]
    ]
    self.assertEqual(expected_annotations, annotations_pred)
    self.assertEqual(documents, documents_pred)

    # indico format -> finetune format
    chunks_pred, labels_pred, *_ = indico_to_finetune_sequence(
        documents, expected_annotations, encoder=text_encoder, none_value="<PAD>"
    )
    self.assertEqual(chunks_pred, expected_chunks)
    # Only the first three chunk labels are compared; the trailing "<PAD>"
    # chunk is not checked.
    for position, chunk_labels in enumerate(expected_labels[0][:3]):
        self.assertCountEqual(chunk_labels, labels_pred[0][position])
def test_three_overlapping_labels(self):
    """
    Round-trip a document where three labels overlap on the middle chunk and
    two of them continue onto the final chunk.
    """
    def by_label(annotation):
        # Sort key that makes the comparison independent of annotation order.
        return annotation['label']

    documents = ["Indico Is the very best"]
    expected_chunks = [["Indico ", "Is the very", " best"]]
    expected_labels = [[("<PAD>", ), ("1", "2", "3"), ("1", "3")]]
    text_encoder = GPTEncoder()

    # finetune format -> indico format
    documents_pred, annotations_pred = finetune_to_indico_sequence(
        documents, expected_chunks, expected_labels, none_value="<PAD>"
    )
    annotations_pred = [sorted(doc, key=by_label) for doc in annotations_pred]

    unsorted_annotations = [
        dict(start=7, end=18, label='2', text='Is the very'),
        dict(start=7, end=23, label='1', text='Is the very best'),
        dict(start=7, end=23, label='3', text='Is the very best'),
    ]
    expected_annotations = [sorted(unsorted_annotations, key=by_label)]

    self.assertEqual(expected_annotations, annotations_pred)
    self.assertEqual(documents, documents_pred)

    # indico format -> finetune format
    chunks_pred, labels_pred, *_ = indico_to_finetune_sequence(
        documents, expected_annotations, encoder=text_encoder, none_value="<PAD>"
    )
    self.assertEqual(chunks_pred, expected_chunks)
    for position, chunk_labels in enumerate(expected_labels[0]):
        self.assertCountEqual(chunk_labels, labels_pred[0][position])
def test_roberta_failure(self):
    """
    Check the chunks and labels that indico_to_finetune_sequence produces with
    RoBERTaEncoderV2 for a newline-heavy document whose last three numbers
    are annotated.
    """
    documents = [
        'Margin Cost\n1357711\n593 2501\n1350\n700860\n65053899 06\n46032479 8308\n6452785 '
        '50\n3353\n12546915 246\n094 10828664\n7 8058\n53696576 25\n7654 260919\n646256 '
        '75300\n4\n091577 8177070\n012197121 38\n30414787 93\n6024915 600028\n8 2'
    ]
    annotations = [
        [
            {'text': '600028', 'label': '0000ff', 'start': 203, 'end': 209},
            {'text': '8', 'label': '0000ff', 'start': 210, 'end': 211},
            {'text': '2', 'label': '0000ff', 'start': 212, 'end': 213},
        ]
    ]

    chunks_pred, labels_pred, *_ = indico_to_finetune_sequence(
        documents, annotations, encoder=RoBERTaEncoderV2(), none_value="<PAD>"
    )

    # Everything before the first annotation is expected as one unlabelled chunk.
    unlabelled_prefix = (
        'Margin Cost\n1357711\n593 2501\n1350\n700860\n65053899 06\n46032479 8308\n6452785'
        ' 50\n3353\n12546915 246\n094 10828664\n7 8058\n53696576 25\n7654 260919\n646256 '
        '75300\n4\n091577 8177070\n012197121 38\n30414787 93\n6024915'
    )
    expected_chunks = [[unlabelled_prefix, ' 600028', '\n', '8', ' 2']]

    padding = ('<PAD>',)
    labelled = ('0000ff',)
    expected_labels = [[padding, labelled, padding, labelled, labelled]]

    self.assertEqual(chunks_pred, expected_chunks)
    self.assertEqual(labels_pred, expected_labels)
def finetune(self, Xs, Y=None, batch_size=None):
    """
    Convert the inputs (and labels, when given) to the finetune sequence
    format and delegate to the parent class's ``finetune``.

    :param Xs: Inputs handed to ``indico_to_finetune_sequence``.
    :param Y: Optional labels; when None, None is forwarded as the target.
    :param batch_size: Forwarded unchanged to the parent ``finetune``.
    """
    token_texts, token_labels, *_ = indico_to_finetune_sequence(
        Xs,
        encoder=self.input_pipeline.text_encoder,
        labels=Y,
        multi_label=self.multi_label,
        none_value=self.config.pad_token,
    )
    # Converted labels are only used when the caller supplied labels.
    targets = None if Y is None else token_labels
    return super().finetune(token_texts, Y=targets, batch_size=batch_size)
def finetune(self, X, Y=None, batch_size=None):
    """
    Convert the data of every SequenceLabeler task to the finetune sequence
    format, then delegate to the parent class's ``finetune``.

    :param X: A dictionary mapping from task name to inputs in the same format required for each of the models.
    :param Y: A dictionary mapping from task name to targets in the same format required for each of the models.
        May be None, in which case ``labels=None`` is passed to the converter
        and None is forwarded as the target.
    :param batch_size: Number of examples per batch. When N_GPUS > 1, this number corresponds to the number
        of training examples provided to each GPU.
    :return: Whatever the parent class's ``finetune`` returns.
    """
    sequence_tasks = [
        task_name
        for task_name, task_model in self.config.tasks.items()
        if task_model == SequenceLabeler
    ]

    if sequence_tasks:
        # Work on shallow copies so the caller's dictionaries are not
        # overwritten with converted data.
        X = dict(X)
        if Y is not None:
            Y = dict(Y)

    for task_name in sequence_tasks:
        # Y defaults to None; subscripting it unconditionally would raise
        # TypeError before any training started.
        task_labels = Y[task_name] if Y is not None else None
        # NOTE(review): unlike the single-task finetune methods, no encoder is
        # passed here and the pad value is hard-coded - confirm this is intended.
        task_inputs, task_targets, *_ = indico_to_finetune_sequence(
            X[task_name],
            labels=task_labels,
            multi_label=False,
            none_value="<PAD>",
        )
        X[task_name] = task_inputs
        if Y is not None:
            Y[task_name] = task_targets

    return super().finetune(X, Y=Y, batch_size=batch_size)
def finetune(self, Xs, Y=None, batch_size=None):
    """
    Convert the text inputs (and labels, when given) to the finetune sequence
    format and delegate to the parent class's ``finetune``.

    When ``config.use_auxiliary_info`` is set, ``Xs`` is read as
    ``[texts, context]``: only the texts are converted and the context is
    passed through untouched.

    :param Xs: Text inputs, or ``[texts, context]`` when auxiliary info is enabled.
    :param Y: Optional labels; when None, None is forwarded as the target.
    :param batch_size: Forwarded unchanged to the parent ``finetune``.
    """
    aux_enabled = self.config.use_auxiliary_info

    context = Xs[1] if aux_enabled else None
    text_inputs = Xs[0] if aux_enabled else Xs

    Xs_new, Y_new, _, _, _ = indico_to_finetune_sequence(
        text_inputs,
        encoder=self.input_pipeline.text_encoder,
        labels=Y,
        multi_label=self.multi_label,
        none_value=self.config.pad_token,
    )

    targets = None if Y is None else Y_new
    # Re-attach the untouched context next to the converted texts.
    model_inputs = [Xs_new, context] if aux_enabled else Xs_new
    return super().finetune(model_inputs, Y=targets, batch_size=batch_size)