def data_process(self, sep='\t'):
    """Prepare train/valid/test data and build batch generators.

    Converts non-CSV inputs to CSV, builds label index maps from the
    training set, splits a validation set out of the training data when
    no explicit validation file is given, and wraps each split in a
    Data_Generator stored on self.

    :param sep: field separator used when converting raw files to CSV.
    :return: None (results are stored on self as train/valid/test
        generators, label maps, and num_classes).
    """
    def to_csv(path):
        # Convert a raw data file to CSV once; a path already containing
        # '.csv' is assumed to be CSV already (original substring check kept).
        return path if '.csv' in path else data2csv(path, sep)

    self.train_data_path = to_csv(self.train_data_path)
    self.index2label, self.label2index, self.labels, train_data = data_preprocess(
        self.train_data_path)
    self.num_classes = len(self.index2label)

    if self.valid_data_path:
        self.valid_data_path = to_csv(self.valid_data_path)
        _, _, _, valid_data = data_preprocess(self.valid_data_path)
    else:
        # No validation file supplied: carve one out of the training set.
        train_data, valid_data = split(train_data, self.split)

    if self.test_data_path:
        self.test_data_path = to_csv(self.test_data_path)
        _, _, _, test_data = data_preprocess(self.test_data_path)
    else:
        test_data = []

    self.train_generator = Data_Generator(train_data, self.label2index,
                                          self.tokenizer, self.batch_size,
                                          self.max_len)
    self.valid_generator = Data_Generator(valid_data, self.label2index,
                                          self.tokenizer, self.batch_size,
                                          self.max_len)
    self.test_generator = Data_Generator(test_data, self.label2index,
                                         self.tokenizer, self.batch_size,
                                         self.max_len)
def data_process(self, sep='\t'):
    """Prepare train/valid/test data from JSON files and build generators.

    Loads the training set via json_data_process to obtain label index
    maps, optionally loads validation/test sets (otherwise splits a
    validation set out of the training data), and wraps each split in a
    datagenerator stored on self.

    :param sep: unused here; kept for interface compatibility with the
        CSV-based variant of this method.
    :return: None (results are stored on self).
    """
    self.index2label, self.label2index, self.labels, train_data = json_data_process(
        self.train_data_path)
    self.num_classes = len(self.index2label)

    if self.valid_data_path:
        _, _, _, valid_data = json_data_process(self.valid_data_path)
    else:
        # No validation file supplied: split one off the training data.
        train_data, valid_data = split(train_data, self.split)

    if self.test_data_path:
        _, _, _, test_data = json_data_process(self.test_data_path)
    else:
        test_data = []

    self.train_generator = datagenerator(train_data, self.label2index,
                                         self.tokenizer, self.batch_size,
                                         self.max_len)
    self.valid_generator = datagenerator(valid_data, self.label2index,
                                         self.tokenizer, self.batch_size,
                                         self.max_len)
    self.test_generator = datagenerator(test_data, self.label2index,
                                        self.tokenizer, self.batch_size,
                                        self.max_len)
def data_process(self):
    """Prepare sequence-labelling data and the training batch generator.

    Loads the training set with the module-level data_process helper,
    optionally loads validation/test sets (otherwise splits a validation
    set out of the training data), builds label index maps, and creates
    the training Data_Generator.

    :return: None (results are stored on self).
    """
    labels, train_data = data_process(self.train_data_path)

    if self.valid_data_path:
        _, self.valid_data = data_process(self.valid_data_path)
    else:
        # No validation file supplied: split one off the training data.
        train_data, self.valid_data = split(train_data, self.split)

    if self.test_data_path:
        _, self.test_data = data_process(self.test_data_path)
    else:
        # Keep the attribute defined even without a test file so later
        # reads of self.test_data cannot raise AttributeError.
        self.test_data = []

    self.index2label = dict(enumerate(labels))
    self.label2index = {j: i for i, j in self.index2label.items()}
    # *2 + 1: presumably two tags (e.g. B/I) per label plus one extra
    # (e.g. 'O') for a span-tagging scheme — TODO confirm against the
    # label encoding used by Data_Generator.
    self.num_classes = len(labels) * 2 + 1
    self.labels = labels
    # NOTE: argument order here differs from the sibling implementations
    # (batch_size/tokenizer swapped); preserved as the original had it.
    self.train_generator = Data_Generator(train_data, self.batch_size,
                                          self.tokenizer, self.label2index,
                                          self.max_len)
    logger.info('data process done')