Exemple #1
0
    def data_generator(self,
                       batch_size=1,
                       phase='train',
                       shuffle=True,
                       data=None):
        if phase != 'predict' and not self.dataset:
            raise ValueError("The dataset is None ! It isn't allowed.")
        if phase == 'train':
            shuffle = True
            examples = self.get_train_examples()
            self.num_examples['train'] = len(examples)
        elif phase == 'val' or phase == 'dev':
            shuffle = False
            examples = self.get_dev_examples()
            self.num_examples['dev'] = len(examples)
        elif phase == 'test':
            shuffle = False
            examples = self.get_test_examples()
            self.num_examples['test'] = len(examples)
        elif phase == 'predict':
            shuffle = False
            examples = []
            seq_id = 0

            for item in data:
                # set label in order to run the program
                if self.dataset:
                    label = list(self.label_map.keys())[0]
                else:
                    label = 0
                if len(item) == 1:
                    item_i = InputExample(guid=seq_id,
                                          text_a=item[0],
                                          label=label)
                elif len(item) == 2:
                    item_i = InputExample(guid=seq_id,
                                          text_a=item[0],
                                          text_b=item[1],
                                          label=label)
                else:
                    raise ValueError(
                        "The length of input_text is out of handling, which must be 1 or 2!"
                    )
                examples.append(item_i)
                seq_id += 1
        else:
            raise ValueError(
                "Unknown phase, which should be in ['train', 'dev', 'test', 'predict']."
            )

        def wrapper():
            if shuffle:
                np.random.shuffle(examples)

            for batch_data in self._prepare_batch_data(examples,
                                                       batch_size,
                                                       phase=phase):
                yield [batch_data]

        return wrapper
    def _read_tsv(self, input_file, quotechar=None):
        """Reads a tab separated value file."""
        with codecs.open(input_file, "r", encoding="UTF-8") as f:
            reader = csv.reader(f, delimiter=",", quotechar=quotechar)
            examples = []
            seq_id = 0
            header = next(reader)  # skip header
            for line in reader:
                example = InputExample(
                    guid=seq_id, label=[int(i) for i in line[0].split(' ')], text_a=line[1])
                seq_id += 1
                examples.append(example)

            return examples