def data_generator(self, batch_size=1, phase='train', shuffle=True, data=None):
    """Build a generator function that yields batches of examples for ``phase``.

    Args:
        batch_size: Batch size forwarded to ``self._prepare_batch_data``.
        phase: One of ``'train'``, ``'val'``/``'dev'``, ``'test'`` or
            ``'predict'``.
        shuffle: Kept for interface compatibility. The value passed in is
            overridden by the phase: only ``'train'`` shuffles.
        data: Iterable of 1- or 2-element sequences (``text_a`` and an
            optional ``text_b``). Only read, and required, when
            ``phase == 'predict'``.

    Returns:
        A zero-argument generator function. Each item it yields is a
        one-element list wrapping a batch from ``self._prepare_batch_data``.

    Raises:
        ValueError: If ``self.dataset`` is falsy for a non-predict phase, if
            ``data`` is None in the predict phase, if a predict item does not
            have 1 or 2 elements, or if ``phase`` is not recognised.
    """
    if phase != 'predict' and not self.dataset:
        raise ValueError("The dataset is None ! It isn't allowed.")

    if phase == 'train':
        shuffle = True
        examples = self.get_train_examples()
        self.num_examples['train'] = len(examples)
    elif phase in ('val', 'dev'):
        shuffle = False
        examples = self.get_dev_examples()
        self.num_examples['dev'] = len(examples)
    elif phase == 'test':
        shuffle = False
        examples = self.get_test_examples()
        self.num_examples['test'] = len(examples)
    elif phase == 'predict':
        shuffle = False
        if data is None:
            # Fail with a clear message instead of "NoneType is not iterable".
            raise ValueError(
                "The data must not be None when phase is 'predict'.")
        # Prediction inputs carry no label; a placeholder is set in order to
        # run the program. It does not depend on the item, so build it once.
        if self.dataset:
            label = list(self.label_map.keys())[0]
        else:
            label = 0
        examples = []
        for seq_id, item in enumerate(data):
            if len(item) == 1:
                example = InputExample(
                    guid=seq_id, text_a=item[0], label=label)
            elif len(item) == 2:
                example = InputExample(
                    guid=seq_id, text_a=item[0], text_b=item[1], label=label)
            else:
                raise ValueError(
                    "The length of input_text is out of handling, which must be 1 or 2!"
                )
            examples.append(example)
    else:
        raise ValueError(
            "Unknown phase, which should be in ['train', 'dev', 'test', 'predict']."
        )

    def wrapper():
        # Shuffling happens in place, once per call of the returned function.
        if shuffle:
            np.random.shuffle(examples)

        for batch_data in self._prepare_batch_data(
                examples, batch_size, phase=phase):
            yield [batch_data]

    return wrapper
def _read_tsv(self, input_file, quotechar=None):
    """Read a comma-separated file with a header row into examples.

    Despite the method name, fields are split on ``,``. The first row is
    treated as a header and discarded. For every following row, column 0 is
    split on single spaces and each piece is converted to ``int`` to form the
    label list, and column 1 is used as ``text_a``.

    Args:
        input_file: Path of the file to read; it is decoded as UTF-8.
        quotechar: Forwarded unchanged to ``csv.reader``.

    Returns:
        A list of ``InputExample`` objects whose ``guid`` is the zero-based
        index of the row after the header. The list is empty when the file is
        empty or holds only the header.
    """
    with codecs.open(input_file, "r", encoding="UTF-8") as f:
        reader = csv.reader(f, delimiter=",", quotechar=quotechar)
        # Skip the header; the default stops an empty file from raising
        # StopIteration.
        next(reader, None)
        return [
            InputExample(
                guid=seq_id,
                label=[int(i) for i in line[0].split(' ')],
                text_a=line[1])
            for seq_id, line in enumerate(reader)
        ]