Example #1
    def _data(self, name, path, type_name='dev', ignore_metric=False):
        """Build an EvalData bundle for a CoLA-style single-sentence task.

        Args:
            name: Display name for the evaluation set.
            path: Either a TSV path relative to ``self.data_dir`` or an
                already-built ``ExampleSet`` (used verbatim).
            type_name: Split tag. For ``'test'`` the sentence comes from
                column 1 and rows carry no gold label; otherwise the
                sentence is column 3 and the label column 1.
            ignore_metric: Forwarded to ``EvalData``.

        Returns:
            An ``EvalData`` with MCC as the critical metric.

        Raises:
            FileNotFoundError: If ``path`` is a string and the file is missing.
            ValueError: If ``path`` is neither a string nor an ExampleSet.
        """
        if isinstance(path, str):
            input_src = os.path.join(self.data_dir, path)
            # Explicit raise instead of `assert`: asserts are stripped when
            # Python runs with -O, which would turn a missing file into a
            # confusing downstream failure.
            if not os.path.exists(input_src):
                raise FileNotFoundError(f"{input_src} doesn't exist")
            data = self._read_tsv(input_src)
            if type_name == 'test':
                # NOTE(review): only the test branch skips a header row
                # (data[1:]); the other splits iterate every row. This matches
                # CoLA's layout, where only test.tsv has a header — confirm
                # against the actual data files before changing.
                examples = ExampleSet(
                    [ExampleInstance((l[1], )) for l in data[1:]])
            else:
                examples = ExampleSet([
                    ExampleInstance((l[3], ), self.label2id(l[1]))
                    for l in data
                ])
        elif isinstance(path, ExampleSet):
            examples = path
        else:
            raise ValueError('Input type of path not supported')

        predict_fn = self.get_predict_fn(examples)
        return EvalData(name,
                        examples,
                        metrics_fn=self.get_metrics_fn(),
                        predict_fn=predict_fn,
                        ignore_metric=ignore_metric,
                        critial_metrics=['mcc'])
Example #2
    def _data(self, name, path, type_name='dev'):
        """Build an EvalData bundle for an STS-B-style regression split.

        Reads a TSV under ``self.data_dir``; the sentence pair comes from
        columns 7 and 8, and non-test splits attach the float similarity
        score from column 9. The header row is always skipped.
        """
        input_src = os.path.join(self.data_dir, path)
        assert os.path.exists(input_src), f"{input_src} doesn't exists"
        rows = self._read_tsv(input_src)[1:]  # drop the header row
        predict_fn = self.get_predict_fn()
        if type_name == 'test':
            # Test split: no gold scores are available.
            instances = [ExampleInstance((r[7], r[8])) for r in rows]
        else:
            instances = [ExampleInstance((r[7], r[8]), float(r[9]))
                         for r in rows]
        return EvalData(name,
                        ExampleSet(instances),
                        metrics_fn=self.get_metrics_fn(),
                        predict_fn=predict_fn)
  def train_data(self, max_seq_len=512, dataset_size=None, epochs=1, mask_gen=None, **kwargs):
    """Load RACE middle+high training examples as a shuffled DynamicDataset.

    When ``dataset_size`` is None it defaults to ``len(examples) * epochs``.
    """
    examples = ExampleSet(
        self.load_jsonl(os.path.join(self.data_dir, 'train_middle.jsonl'))
        + self.load_jsonl(os.path.join(self.data_dir, 'train_high.jsonl')))
    size = dataset_size if dataset_size is not None else len(examples) * epochs
    feature_fn = self.get_feature_fn(max_seq_len=max_seq_len, mask_gen=mask_gen)
    return DynamicDataset(examples, feature_fn=feature_fn,
                          dataset_size=size, shuffle=True, **kwargs)
Example #4
    def _data(self, name, path, type_name='dev', ignore_metric=False):
        """Build an EvalData bundle for a sentence-pair classification split.

        Reads a TSV under ``self.data_dir``; the pair comes from columns 8
        and 9, and non-test splits map the last column to a label id. The
        header row is always skipped. Accuracy is the critical metric.
        """
        input_src = os.path.join(self.data_dir, path)
        assert os.path.exists(input_src), f"{input_src} doesn't exists"
        rows = self._read_tsv(input_src)[1:]  # drop the header row
        if type_name == 'test':
            # Test split carries no gold labels.
            instances = [ExampleInstance((r[8], r[9])) for r in rows]
        else:
            instances = [ExampleInstance((r[8], r[9]), self.label2id(r[-1]))
                         for r in rows]
        return EvalData(name,
                        ExampleSet(instances),
                        metrics_fn=self.get_metrics_fn(input_src),
                        predict_fn=self.get_predict_fn(),
                        ignore_metric=ignore_metric,
                        critial_metrics=['accuracy'])
Example #5
 def train_data(self,
                max_seq_len=512,
                dataset_size=None,
                epochs=1,
                mask_gen=None,
                **kwargs):
     """Build a shuffled DynamicDataset from the dev+split training pools.

     Features use float labels (regression) and training-mode processing.
     ``dataset_size`` defaults to ``len(examples) * epochs`` when None.
     """
     examples = ExampleSet(self.train_dev + self.train_split)
     size = len(examples) * epochs if dataset_size is None else dataset_size
     feature_fn = self.get_feature_fn(max_seq_len=max_seq_len,
                                      mask_gen=mask_gen,
                                      label_type='float',
                                      training=True)
     return DynamicDataset(examples,
                           feature_fn=feature_fn,
                           dataset_size=size,
                           shuffle=True,
                           **kwargs)
  def _data(self, name, path, type_name = 'dev', ignore_metric=False):
    """Build an EvalData bundle from one or more JSONL files.

    Args:
      name: Display name for the evaluation set.
      path: A single relative path or a list of relative paths under
        ``self.data_dir``; each file is loaded with ``self.load_jsonl``
        and the records are concatenated.
      type_name: Split tag (unused here; kept for a uniform signature).
      ignore_metric: Forwarded to ``EvalData``.

    Returns:
      An ``EvalData`` with accuracy as the critical metric.

    Raises:
      FileNotFoundError: If any of the input files is missing.
    """
    if isinstance(path, str):
      path = [path]
    data = []
    for p in path:
      input_src = os.path.join(self.data_dir, p)
      # Explicit raise instead of `assert`: asserts are stripped under -O,
      # which would silently defer the failure to load_jsonl.
      if not os.path.exists(input_src):
        raise FileNotFoundError(f"{input_src} doesn't exist")
      data.extend(self.load_jsonl(input_src))

    predict_fn = self.get_predict_fn()
    examples = ExampleSet(data)
    return EvalData(name, examples,
      metrics_fn = self.get_metrics_fn(), predict_fn = predict_fn, ignore_metric=ignore_metric, critial_metrics=['accuracy'])
Example #7
 def train_data(self,
                max_seq_len=512,
                dataset_size=None,
                epochs=1,
                mask_gen=None,
                **kwargs):
     """Load single-sentence classification training data from train.tsv.

     Column 0 holds the text and column 1 the label; the header row is
     skipped. ``dataset_size`` defaults to ``len(examples) * epochs``.
     """
     input_src = os.path.join(self.data_dir, 'train.tsv')
     assert os.path.exists(input_src), f"{input_src} doesn't exists"
     rows = self._read_tsv(input_src)[1:]  # drop the header row
     examples = ExampleSet(
         [ExampleInstance((r[0], ), self.label2id(r[1])) for r in rows])
     size = len(examples) * epochs if dataset_size is None else dataset_size
     return DynamicDataset(
         examples,
         feature_fn=self.get_feature_fn(max_seq_len=max_seq_len,
                                        mask_gen=mask_gen),
         dataset_size=size,
         shuffle=True,
         **kwargs)
Example #8
    def anli_data(self, name, path, type_name='dev', ignore_metric=False):
        """Build an EvalData bundle for one ANLI evaluation split.

        Reads a TSV under ``self.data_dir``; premise/hypothesis come from
        columns 1 and 2 and the label from column 3 (header row skipped).
        """
        input_src = os.path.join(self.data_dir, path)
        assert os.path.exists(input_src), f"{input_src} doesn't exists"
        rows = self._read_tsv(input_src)[1:]  # drop the header row
        examples = ExampleSet([
            ExampleInstance((r[1], r[2]), self.label2id(r[3])) for r in rows
        ])

        def _metric_fn(logits, labels):
            # Accuracy is the only metric reported for this split.
            return OrderedDict(accuracy=metric_accuracy(logits, labels))

        return EvalData(name,
                        examples,
                        metrics_fn=_metric_fn,
                        predict_fn=self.get_predict_fn(),
                        ignore_metric=ignore_metric,
                        critial_metrics=['accuracy'])
Example #9
    def train_data(self,
                   max_seq_len=512,
                   dataset_size=None,
                   epochs=1,
                   mask_gen=None,
                   **kwargs):
        """Build a shuffled DynamicDataset from the ANLI R1-R3 train TSVs.

        Each round's TSV provides premise/hypothesis in columns 1 and 2
        and the label in column 3 (header rows skipped). ``dataset_size``
        defaults to ``len(examples) * epochs`` when None.
        """
        instances = []
        for round_name in ('R1', 'R2', 'R3'):
            src = os.path.join(self.data_dir,
                               f'anli_v0.1/{round_name}/train.tsv')
            rows = self._read_tsv(src)[1:]  # drop the header row
            instances.extend(
                ExampleInstance((r[1], r[2]), self.label2id(r[3]))
                for r in rows)

        examples = ExampleSet(instances)
        if dataset_size is None:
            dataset_size = len(examples) * epochs
        return DynamicDataset(
            examples,
            feature_fn=self.get_feature_fn(max_seq_len=max_seq_len,
                                           mask_gen=mask_gen),
            dataset_size=dataset_size,
            shuffle=True,
            **kwargs)