コード例 #1
0
ファイル: test_utils.py プロジェクト: seeker1943/finetune
    def test_nested_labels(self):
        """Round-trip a document whose middle token carries three labels.

        First checks that ``finetune_to_indico_sequence`` turns the per-token
        label tuples into the expected character spans, then checks that
        ``indico_to_finetune_sequence`` maps those spans back to the same
        tokens and (order-insensitively) the same label tuples.
        """
        pad = "<PAD>"
        raw = ["Indico Is the best"]
        finetunex = [["Indico ", "Is the", " best"]]
        finetuney = [[("1", ), ("1", "2", "3"), ("1", )]]
        encoder = GPTEncoder()

        # Token-level labels -> span annotations.
        indicox_pred, indicoy_pred = finetune_to_indico_sequence(
            raw, finetunex, finetuney, none_value=pad)

        # (start, end, label, text) for every span expected in the document.
        expected_spans = [
            (0, 18, '1', 'Indico Is the best'),
            (7, 13, '2', 'Is the'),
            (7, 13, '3', 'Is the'),
        ]
        indicoy = [[
            {'start': start, 'end': end, 'label': label, 'text': text}
            for start, end, label, text in expected_spans
        ]]
        self.assertEqual(indicoy, indicoy_pred)
        self.assertEqual(raw, indicox_pred)

        # Span annotations -> token-level labels.
        finetunex_pred, finetuney_pred, *_ = indico_to_finetune_sequence(
            raw, indicoy, encoder=encoder, none_value=pad)
        self.assertEqual(finetunex_pred, finetunex)
        # Label order within a token is not significant, hence assertCountEqual.
        for token_idx in range(3):
            self.assertCountEqual(
                finetuney[0][token_idx], finetuney_pred[0][token_idx])
コード例 #2
0
ファイル: test_utils.py プロジェクト: seeker1943/finetune
    def test_overlapping_labels_with_single_label(self):
        """Convert two overlapping spans with ``multi_label=False``.

        The spans 'Indico' (0-6) and 'o Rules' (5-12) share the character
        'o'. The expected result is three tokens with exactly one label each,
        the shared 'o' being labelled '1'.
        """
        text = ["Indico Rules"]
        expected_tokens = [["Indic", "o", " Rules"]]
        expected_labels = [["1", "1", "2"]]

        first_span = {'start': 0, 'end': 6, 'label': '1', 'text': 'Indico'}
        second_span = {'start': 5, 'end': 12, 'label': '2', 'text': 'o Rules'}
        indicoy = [[first_span, second_span]]

        finetunex_pred, finetuney_pred, *_ = indico_to_finetune_sequence(
            text,
            indicoy,
            encoder=GPTEncoder(),
            none_value="<PAD>",
            multi_label=False)

        self.assertEqual(finetunex_pred, expected_tokens)
        # The expected labels are plain strings here, so assertCountEqual
        # compares them element by element against each predicted label.
        for token_idx in range(3):
            self.assertCountEqual(
                expected_labels[0][token_idx], finetuney_pred[0][token_idx])
コード例 #3
0
ファイル: association.py プロジェクト: yishuihanhan/finetune
    def finetune(self, Xs, Y=None, batch_size=None):
        """
        Convert the inputs with ``indico_to_finetune_sequence`` and delegate to the parent ``finetune``.

        :param Xs: A list of strings.
        :param Y: A list of labels in the same format as sequence labeling, but with an optional additional
        field of the form:
        ```
            {
                ...
                "association": {
                    "index": a,
                    "relationship": relationship_name
                }
                ...
            }
        ```
        where index is the index of the relationship target into the label list and relationship_name is the
        type of the relationship.
        :param batch_size: Passed through unchanged to the parent ``finetune``.
        :raises FinetuneError: If ``config.association_types`` has not been set.
        """
        if self.config.association_types is None:
            raise FinetuneError(
                "Please set config.association_types before calling finetune.")

        converted = indico_to_finetune_sequence(
            Xs,
            encoder=self.input_pipeline.text_encoder,
            labels=Y,
            multi_label=False,
            none_value=self.config.pad_token)
        Xs, Y_new, association_type, association_idx, idxs = converted

        if Y is None:
            # No targets were supplied, so none are forwarded either.
            targets = None
        else:
            # One (labels, association types, association indices, idxs)
            # tuple per document.
            targets = list(zip(Y_new, association_type, association_idx, idxs))
        return super().finetune(Xs, Y=targets, batch_size=batch_size)
コード例 #4
0
ファイル: test_utils.py プロジェクト: seeker1943/finetune
    def test_overlapping_gpt2_subtokens(self):
        """Round-trip overlapping labels with GPT-2 sub-token predictions.

        Checks that ``finetune_to_indico_sequence`` (called with
        ``subtoken_predictions=True``) produces the two expected overlapping
        spans, then that ``indico_to_finetune_sequence`` maps those spans back
        to the same tokens and, for every token, the same set of labels.
        """
        raw = ["Indico Is the best hey"]
        finetunex = [["Indico", " Is the", " best", " hey"]]
        # Every per-token label is a tuple. The trailing comma on the unlabeled
        # token matters: ("<PAD>") without it is just the string "<PAD>".
        finetuney = [[("1", ), ("1", "2"), ("2", ), ("<PAD>", )]]
        encoder = GPT2Encoder()
        indicox_pred, indicoy_pred = finetune_to_indico_sequence(
            raw,
            finetunex,
            finetuney,
            none_value="<PAD>",
            subtoken_predictions=True)

        indicoy = [[
            {
                'start': 0,
                'end': 13,
                'label': '1',
                'text': 'Indico Is the'
            },
            {
                'start': 6,
                'end': 18,
                'label': '2',
                'text': ' Is the best'
            },
        ]]
        self.assertEqual(indicoy, indicoy_pred)
        self.assertEqual(raw, indicox_pred)

        finetunex_pred, finetuney_pred, *_ = indico_to_finetune_sequence(
            raw, indicoy, encoder=encoder, none_value="<PAD>")
        self.assertEqual(finetunex_pred, finetunex)
        # Check every token, including the trailing unlabeled one; the length
        # check guards against zip() silently truncating a shorter prediction.
        self.assertEqual(len(finetuney[0]), len(finetuney_pred[0]))
        for expected, predicted in zip(finetuney[0], finetuney_pred[0]):
            # Label order within a token is not significant.
            self.assertCountEqual(expected, predicted)
コード例 #5
0
ファイル: test_utils.py プロジェクト: yishuihanhan/finetune
    def test_three_overlapping_labels(self):
        """Round-trip three labels that overlap on the same stretch of text.

        Label '2' covers only the middle token while '1' and '3' extend over
        the middle and last tokens; the first token is unlabeled. Spans are
        compared after sorting by label so that the order in which the
        conversion emits them does not matter.
        """
        def by_label(span):
            return span['label']

        raw = ["Indico Is the very best"]
        finetunex = [["Indico ", "Is the very", " best"]]
        finetuney = [[("<PAD>", ), ("1", "2", "3"), ("1", "3")]]
        encoder = GPTEncoder()

        # Token-level labels -> span annotations.
        indicox_pred, indicoy_pred = finetune_to_indico_sequence(
            raw, finetunex, finetuney, none_value="<PAD>")
        indicoy_pred = [sorted(doc, key=by_label) for doc in indicoy_pred]

        unsorted_spans = [
            {'start': 7, 'end': 18, 'label': '2', 'text': 'Is the very'},
            {'start': 7, 'end': 23, 'label': '1', 'text': 'Is the very best'},
            {'start': 7, 'end': 23, 'label': '3', 'text': 'Is the very best'},
        ]
        indicoy = [sorted(unsorted_spans, key=by_label)]
        self.assertEqual(indicoy, indicoy_pred)
        self.assertEqual(raw, indicox_pred)

        # Span annotations -> token-level labels.
        finetunex_pred, finetuney_pred, *_ = indico_to_finetune_sequence(
            raw, indicoy, encoder=encoder, none_value="<PAD>"
        )
        self.assertEqual(finetunex_pred, finetunex)
        # Label order within a token is not significant, hence assertCountEqual.
        for token_idx in range(3):
            self.assertCountEqual(
                finetuney[0][token_idx], finetuney_pred[0][token_idx])
コード例 #6
0
ファイル: test_utils.py プロジェクト: tc-wolf/finetune
 def test_roberta_failure(self):
     """Check span-to-token conversion with the RoBERTa encoder on a numeric table.

     The document consists of newline-separated numbers with three labelled
     spans at its very end. The expected output keeps everything before the
     first span as one unlabeled token and leaves the newline between
     '600028' and '8' as its own unlabeled token.
     """
     colour = '0000ff'
     documents = [
         'Margin Cost\n1357711\n593 2501\n1350\n700860\n65053899 06\n46032479 8308\n6452785 '
         '50\n3353\n12546915 246\n094 10828664\n7 8058\n53696576 25\n7654 260919\n646256 '
         '75300\n4\n091577 8177070\n012197121 38\n30414787 93\n6024915 600028\n8 2'
     ]
     # (text, start, end) of each labelled span; all share the same label.
     span_specs = [('600028', 203, 209), ('8', 210, 211), ('2', 212, 213)]
     annotations = [[
         {'text': span_text, 'label': colour, 'start': start, 'end': end}
         for span_text, start, end in span_specs
     ]]

     finetunex_pred, finetuney_pred, *_ = indico_to_finetune_sequence(
         documents, annotations, encoder=RoBERTaEncoderV2(), none_value="<PAD>"
     )

     unlabeled_prefix = (
         'Margin Cost\n1357711\n593 2501\n1350\n700860\n65053899 06\n46032479 8308\n6452785'
         ' 50\n3353\n12546915 246\n094 10828664\n7 8058\n53696576 25\n7654 260919\n646256 '
         '75300\n4\n091577 8177070\n012197121 38\n30414787 93\n6024915'
     )
     target_x = [[unlabeled_prefix, ' 600028', '\n', '8', ' 2']]

     unlabeled = ('<PAD>',)
     labelled = (colour,)
     target_y = [[unlabeled, labelled, unlabeled, labelled, labelled]]

     self.assertEqual(finetunex_pred, target_x)
     self.assertEqual(finetuney_pred, target_y)
コード例 #7
0
 def finetune(self, Xs, Y=None, batch_size=None):
     """Convert the inputs with ``indico_to_finetune_sequence``, then delegate to the parent ``finetune``.

     :param Xs: Inputs handed to ``indico_to_finetune_sequence``.
     :param Y: Optional labels handed to the same conversion. When ``None``,
         ``None`` is forwarded to the parent instead of the converted labels.
     :param batch_size: Passed through unchanged to the parent ``finetune``.
     :return: Whatever the parent ``finetune`` returns.
     """
     converted = indico_to_finetune_sequence(
         Xs,
         encoder=self.input_pipeline.text_encoder,
         labels=Y,
         multi_label=self.multi_label,
         none_value=self.config.pad_token)
     # Only the first two results are needed; the rest are ignored.
     Xs, Y_new, *_ = converted
     if Y is not None:
         Y = Y_new
     return super().finetune(Xs, Y=Y, batch_size=batch_size)
コード例 #8
0
    def finetune(self, X, Y=None, batch_size=None):
        """
        Convert the inputs of every sequence-labeling task, then delegate to the parent ``finetune``.

        For each task in ``self.config.tasks`` whose model is ``SequenceLabeler``, the task's entry in ``X``
        (and in ``Y``, when targets are given) is replaced in place by the output of
        ``indico_to_finetune_sequence``. Note that the caller's dictionaries are therefore modified.

        :param X: A dictionary mapping from task name to inputs in the same format required for each of the models.
        :param Y: A dictionary mapping from task name to targets in the same format required for each of the models.
                  May be ``None``, in which case only ``X`` is converted.
        :param batch_size: Number of examples per batch. When N_GPUS > 1, this number
                           corresponds to the number of training examples provided to each GPU.
        :return: Whatever the parent ``finetune`` returns.
        """
        sequence_tasks = [
            task_name for task_name, task_model in self.config.tasks.items()
            if task_model == SequenceLabeler
        ]
        for task_name in sequence_tasks:
            # Y defaults to None, so it must not be subscripted unconditionally.
            task_labels = Y[task_name] if Y is not None else None
            X[task_name], converted_labels, *_ = indico_to_finetune_sequence(
                X[task_name],
                labels=task_labels,
                multi_label=False,
                none_value="<PAD>")
            if Y is not None:
                Y[task_name] = converted_labels
        return super().finetune(X, Y=Y, batch_size=batch_size)
コード例 #9
0
    def finetune(self, Xs, Y=None, batch_size=None):
        """Convert the text inputs with ``indico_to_finetune_sequence``, then delegate to the parent ``finetune``.

        When ``config.use_auxiliary_info`` is set, ``Xs`` is treated as a pair:
        ``Xs[0]`` is converted and ``Xs[1]`` (the context) is passed through
        untouched, and the two are re-paired as a list for the parent call.

        :param Xs: The inputs, or an ``(inputs, context)`` pair when auxiliary
            info is enabled.
        :param Y: Optional labels handed to the conversion. When ``None``,
            ``None`` is forwarded to the parent instead of the converted labels.
        :param batch_size: Passed through unchanged to the parent ``finetune``.
        :return: Whatever the parent ``finetune`` returns.
        """
        with_context = self.config.use_auxiliary_info
        context = None
        if with_context:
            context, Xs = Xs[1], Xs[0]

        converted = indico_to_finetune_sequence(
            Xs,
            encoder=self.input_pipeline.text_encoder,
            labels=Y,
            multi_label=self.multi_label,
            none_value=self.config.pad_token,
        )
        # The conversion yields five values; only the first two are used.
        Xs_new, Y_new, _, _, _ = converted

        if Y is not None:
            Y = Y_new

        # Re-attach the unmodified context alongside the converted inputs.
        Xs = [Xs_new, context] if with_context else Xs_new
        return super().finetune(Xs, Y=Y, batch_size=batch_size)