def train_dataloader(self):
    # randomly shuffled training batches
    args = self.hparams.train.training
    train_sampler = RandomSampler()
    train_loader = DataSetIter(batch_size=args.batch_size,
                               dataset=self.train_dataset,
                               sampler=train_sampler,
                               drop_last=False)
    return train_loader
def test(self):
    # turn on the testing mode; clean up the history
    network = self._model
    self.mode(network, is_test=True)
    self.eval_history.clear()
    output, truths = defaultdict(list), defaultdict(list)
    data_iterator = Batch(self.data, self.batch_size, sampler=RandomSampler(), as_numpy=False)

    with torch.no_grad():
        for batch_x, batch_y in data_iterator:
            prediction = self.data_forward(network, batch_x)
            assert isinstance(prediction, dict)
            for k, v in prediction.items():
                output[k].append(v)
            for k, v in batch_y.items():
                truths[k].append(v)
        for k, v in output.items():
            output[k] = itertools.chain(*v)
        for k, v in truths.items():
            truths[k] = itertools.chain(*v)
        args = _build_args(self._evaluator, **output, **truths)
        eval_results = self._evaluator(**args)
    print("[tester] {}".format(self.print_eval_results(eval_results)))
    self.mode(network, is_test=False)
    return eval_results
def test_random_sampler():
    sampler = RandomSampler()
    data = [1, 3, 5, 7, 9, 2, 4, 6, 8, 10]
    ans = [data[i] for i in sampler(data)]
    assert len(ans) == len(data)
    for d in ans:
        assert d in data
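The test above relies on the sampler being a callable that returns a permutation of the dataset's indices. For reference, a minimal sampler with that contract could look like the sketch below; the `BaseSampler` base class and the use of `np.random.permutation` follow fastNLP's sampler module, but this sketch is an assumption rather than a verbatim copy:

import numpy as np

class BaseSampler(object):
    """A sampler maps a dataset to an ordering of its indices."""
    def __call__(self, data_set):
        raise NotImplementedError

class RandomSampler(BaseSampler):
    """Return the indices 0 .. len(data_set)-1 in random order."""
    def __call__(self, data_set):
        return list(np.random.permutation(len(data_set)))

# the test above then holds: sampler(data) is a permutation of range(len(data))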
def train_init_dataloader_for_model2(self):
    assert self.hparams.joint_training
    init_sampler = RandomSampler()
    args = self.hparams.train.init.model2
    init_loader = DataSetIter(batch_size=args.batch_size,
                              dataset=self.train_dataset_init_for_model2,
                              sampler=init_sampler,
                              drop_last=False)
    return init_loader
def train_init_dataloader(self):
    if self.train_dataset_init is None:
        return None
    init_sampler = RandomSampler()
    args = (self.hparams.train.init if not self.hparams.joint_training
            else self.hparams.train.init.model1)
    init_loader = DataSetIter(batch_size=args.batch_size,
                              dataset=self.train_dataset_init,
                              sampler=init_sampler,
                              drop_last=False)
    return init_loader
def init_data_loader(self, batch_size):
    self.train_data_loader = DataSetIter(self.train_set, batch_size, sampler=RandomSampler())
    self.dev_data_loader = DataSetIter(self.dev_set, batch_size, sampler=SequentialSampler())
    self.test_data_loader = DataSetIter(self.test_set, batch_size, sampler=SequentialSampler())
def __init__(self, dataset, batch_size, sampler=RandomSampler(), as_numpy=False):
    self.dataset = dataset
    self.batch_size = batch_size
    self.sampler = sampler
    self.as_numpy = as_numpy
    self.idx_list = None
    self.curidx = 0
    # equivalent to ceil(len(dataset) / batch_size)
    self.num_batches = len(dataset) // batch_size + int(len(dataset) % batch_size != 0)
    self.cur_batch_indices = None
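The constructor above mirrors fastNLP's `Batch`: an iterable that yields `(batch_x, batch_y)` dicts built from the dataset's input and target fields (see the tutorial snippets further below). A minimal usage sketch, assuming current fastNLP top-level imports and a toy, already index-encoded dataset:

from fastNLP import DataSet, Instance
from fastNLP.core.batch import Batch
from fastNLP.core.sampler import RandomSampler

ds = DataSet()
ds.append(Instance(words=[1, 2, 3], label=0))
ds.append(Instance(words=[4, 5], label=1))
ds.set_input("words")    # goes into batch_x
ds.set_target("label")   # goes into batch_y

for batch_x, batch_y in Batch(dataset=ds, batch_size=2, sampler=RandomSampler()):
    print(batch_x["words"], batch_y["label"])  # one padded tensor per field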
def train(self, network, train_data, dev_data=None):
    """General Training Procedure

    :param network: a model
    :param train_data: a DataSet instance, the training data
    :param dev_data: a DataSet instance, the validation data (optional)
    """
    # transfer model to gpu if available
    if torch.cuda.is_available() and self.use_cuda:
        self._model = network.cuda()
        # self._model is used to access model-specific loss
    else:
        self._model = network

    # define Tester over dev data
    if self.validate:
        default_valid_args = {"batch_size": self.batch_size, "pickle_path": self.pickle_path,
                              "use_cuda": self.use_cuda, "evaluator": self._evaluator}
        validator = self._create_validator(default_valid_args)
        logger.info("validator defined as {}".format(str(validator)))

    # optimizer and loss
    self.define_optimizer()
    logger.info("optimizer defined as {}".format(str(self._optimizer)))
    self.define_loss()
    logger.info("loss function defined as {}".format(str(self._loss_func)))

    # main training procedure
    start = time.time()
    logger.info("training epochs started")
    for epoch in range(1, self.n_epochs + 1):
        logger.info("training epoch {}".format(epoch))

        # turn on network training mode
        self.mode(network, is_test=False)
        # prepare mini-batch iterator
        data_iterator = Batch(train_data, batch_size=self.batch_size, sampler=RandomSampler(),
                              use_cuda=self.use_cuda)
        logger.info("prepared data iterator")

        # one forward and backward pass
        self._train_step(data_iterator, network, start=start, n_print=self.print_every_step, epoch=epoch)

        # validation
        if self.validate:
            if dev_data is None:
                raise RuntimeError(
                    "self.validate is True in trainer, but dev_data is None. Please provide the validation data.")
            logger.info("validation started")
            validator.test(network, dev_data)
def train(self):
    """Start Training.

    :return:
    """
    try:
        if torch.cuda.is_available() and self.use_cuda:
            self.model = self.model.cuda()

        self.mode(self.model, is_test=False)

        start = time.time()
        self.start_time = str(datetime.now().strftime('%Y-%m-%d-%H-%M-%S'))
        print("training epochs started " + self.start_time)

        if self.save_path is None:
            # no-op stand-in for SummaryWriter when no save_path is given
            class psudoSW:
                def __getattr__(self, item):
                    def pass_func(*args, **kwargs):
                        pass
                    return pass_func
            self._summary_writer = psudoSW()
        else:
            path = os.path.join(self.save_path, 'tensorboard_logs_{}'.format(self.start_time))
            self._summary_writer = SummaryWriter(path)

        epoch = 1
        while epoch <= self.n_epochs:
            data_iterator = Batch(self.train_data, batch_size=self.batch_size, sampler=RandomSampler(),
                                  as_numpy=False)

            self._train_epoch(data_iterator, self.model, epoch, self.dev_data, start)

            # a positive validate_every overrides end-of-epoch validation
            if self.dev_data and self.validate_every <= 0:
                self.do_validation()
            epoch += 1
    finally:
        self._summary_writer.close()
        del self._summary_writer
def test(self, network, dev_data):
    if torch.cuda.is_available() and self.use_cuda:
        self._model = network.cuda()
    else:
        self._model = network

    # turn on the testing mode; clean up the history
    self.mode(network, is_test=True)
    self.eval_history.clear()
    output_list = []
    truth_list = []

    data_iterator = Batch(dev_data, self.batch_size, sampler=RandomSampler(), use_cuda=self.use_cuda)

    for batch_x, batch_y in data_iterator:
        with torch.no_grad():
            prediction = self.data_forward(network, batch_x)
        output_list.append(prediction)
        truth_list.append(batch_y)
    eval_results = self.evaluate(output_list, truth_list)
    print("[tester] {}".format(self.print_eval_results(eval_results)))
    logger.info("[tester] {}".format(self.print_eval_results(eval_results)))
def _train_epoch(self):
    total_loss = 0
    corrects, samples = 0, 0
    n_tasks = len(self.task_lst)
    task_seq = list(np.random.permutation(n_tasks))
    empty_task = copy.deepcopy(self.empty_tasks)
    self.model.train()
    self.model.zero_grad()
    for cur_step in range(self.n_steps_per_epoch):
        for task_id in task_seq:
            if task_id in empty_task:
                continue
            task = find_task(task_id, self.task_lst)
            batch = next(task.train_data_loader, None)
            if batch is None:
                # empty_task.add(task_id)
                task.train_data_loader = DataSetIter(
                    task.train_set, self.batch_size, sampler=RandomSampler())
                task.train_data_loader = iter(task.train_data_loader)
                continue
            x, y = batch
            batch_task_id = x["task_id"].cuda()
            batch_x = x["x"].cuda()
            batch_y = y["y"].cuda()

            self.masker.before_forward(batch_task_id[0].item())
            if "seq_len" in x:
                seq_len = x["seq_len"].cuda()
                out = self.model(batch_task_id, batch_x, batch_y, seq_len)
            else:
                seq_len = None
                out = self.model(batch_task_id, batch_x, batch_y)
            loss, pred = out["loss"], out["pred"]

            self.steps += 1
            total_loss += loss.item()
            loss = loss / self.accumulation_steps
            loss.backward()
            self.masker.after_forward(batch_task_id[0].item())

            self.metrics[task_id].evaluate(pred, batch_y, seq_len)

            if self.steps % self.accumulation_steps == 0:
                nn.utils.clip_grad_value_(self.model.parameters(), 5)
                if self.scheduler is not None:
                    self.scheduler.step()
                self.optim.step()
                self.optim.zero_grad()

            if self.steps % self.print_every == 0:
                self.summary_writer.add_scalar(
                    "train_loss", total_loss / self.print_every, self.steps)
                score = self.metrics[task_id].get_metric()
                metric_name = "acc" if "acc" in score else "f1"
                score = score["acc"] if "acc" in score else score["f"]
                self.summary_writer.add_scalar("train_acc", score, self.steps)
                self.logger.info(" - Step {}: loss {}\t{}\t{}: {}".format(
                    self.steps,
                    total_loss / self.print_every,
                    task.task_name,
                    metric_name,
                    score,
                ))
                total_loss = 0
    if self.epoch_scheduler is not None:
        self.epoch_scheduler.step()
model = model.cuda()
if args.n_gpu > 1:
    model = torch.nn.DataParallel(model)
logger.info('done!')

logger.info('=========== preparing data: [{}] ==========='.format(args.task))
data_file = open('data/' + args.task + '.pkl', 'rb')
data = pickle.load(data_file)
data_file.close()
bsz = args.batch_size // args.accumulation_steps
logger.info('some examples:')
if args.task == 'MNLI':
    train_ds = text2feature(data['train'], tokenizer, args.task)
    train_dataloader = Batch(train_ds, bsz, sampler=RandomSampler())
    dev_matched_ds = text2feature(data['dev_matched'], tokenizer, args.task)
    dev_matched_dataloader = Batch(dev_matched_ds, bsz, sampler=SequentialSampler())
    dev_mismatched_ds = text2feature(data['dev_mismatched'], tokenizer, args.task)
    dev_mismatched_dataloader = Batch(dev_mismatched_ds, bsz, sampler=SequentialSampler())
    dev_dataloader = [dev_matched_dataloader, dev_mismatched_dataloader]
    test_matched_ds = text2feature(data['test_matched'], tokenizer, args.task, True)
    test_matched_dataloader = Batch(test_matched_ds, bsz, sampler=SequentialSampler())
    test_mismatched_ds = text2feature(data['test_mismatched'], tokenizer, args.task, True)
    test_mismatched_dataloader = Batch(test_mismatched_ds, bsz, sampler=SequentialSampler())
def __init__(self, train_data, model, loss=None, metrics=None, n_epochs=3, batch_size=32, print_every=50,
             validate_every=-1, dev_data=None, save_path=None, optimizer=Adam(lr=0.01, weight_decay=0),
             check_code_level=0, metric_key=None, sampler=RandomSampler(), prefetch=False, use_tqdm=True,
             use_cuda=False, callbacks=None):
    """
    :param DataSet train_data: the training data
    :param torch.nn.modules.module model: a PyTorch model
    :param LossBase loss: a loss object
    :param MetricBase metrics: a metric object or a list of metrics (List[MetricBase])
    :param int n_epochs: the number of training epochs
    :param int batch_size: batch size for training and validation
    :param int print_every: step interval to print next training information. Default: -1 (no print).
    :param int validate_every: step interval to do next validation. Default: -1 (validate every epoch).
    :param DataSet dev_data: the validation data
    :param str save_path: file path to save models
    :param Optimizer optimizer: an optimizer object
    :param int check_code_level: level of FastNLP code checker. -1: don't check; 0: ignore; 1: warning; 2: strict.
        `ignore` does not check unused fields; `warning` warns if some fields are not used; `strict` raises an
        error if some fields are not used. The check runs the code on a very small batch (two samples by default)
        to verify that it executes; in principle this does not modify any parameters. However, if (1) the model
        hard-codes the batch size to a fixed value, or (2) the model accumulates a forward-pass counter, a few
        extra forward passes may be run. In those cases it is recommended to set check_code_level to -1.
    :param str metric_key: a single indicator used to decide the best model based on metric results. It must be
        one of the keys returned by the FIRST metric in `metrics`. If the overall result gets better as the
        indicator gets smaller, add "-" in front of the string. For example::

            metric_key="-PPL"   # language model gets better as perplexity gets smaller

    :param BaseSampler sampler: method used to generate batch data.
    :param bool prefetch: whether to use an extra process to produce batch data.
    :param bool use_tqdm: whether to use tqdm to show train progress.
    :param callbacks: List[Callback]. Callbacks that hook into the training loop; features such as early
        stopping and negative sampling can be implemented through the callback mechanism.
    """
    super(Trainer, self).__init__()

    if not isinstance(train_data, DataSet):
        raise TypeError(f"The type of train_data must be fastNLP.DataSet, got {type(train_data)}.")
    if not isinstance(model, nn.Module):
        raise TypeError(f"The type of model must be torch.nn.Module, got {type(model)}.")

    # check metrics and dev_data
    if (not metrics) and dev_data is not None:
        raise ValueError("No metric for dev_data evaluation.")
    if metrics and (dev_data is None):
        raise ValueError("No dev_data for evaluations, pass dev_data or set metrics to None. ")

    # check save_path
    if not (save_path is None or isinstance(save_path, str)):
        raise ValueError("save_path can only be None or `str`.")

    # prepare evaluate
    metrics = _prepare_metrics(metrics)

    # parse metric_key
    # increase_better is True. It means the exp result gets better if the indicator increases.
    # It is true by default.
    self.increase_better = True
    if metric_key is not None:
        self.increase_better = False if metric_key[0] == "-" else True
        self.metric_key = metric_key[1:] if metric_key[0] == "+" or metric_key[0] == "-" else metric_key
    elif len(metrics) > 0:
        self.metric_key = metrics[0].__class__.__name__.lower().strip('metric')

    # prepare loss
    losser = _prepare_losser(loss)

    # sampler check
    if not isinstance(sampler, BaseSampler):
        raise ValueError("The type of sampler should be fastNLP.BaseSampler, got {}.".format(type(sampler)))

    if check_code_level > -1:
        _check_code(dataset=train_data, model=model, losser=losser, metrics=metrics, dev_data=dev_data,
                    metric_key=metric_key, check_level=check_code_level,
                    batch_size=min(batch_size, DEFAULT_CHECK_BATCH_SIZE))

    self.train_data = train_data
    self.dev_data = dev_data  # If None, no validation.
    self.model = model
    self.losser = losser
    self.metrics = metrics
    self.n_epochs = int(n_epochs)
    self.batch_size = int(batch_size)
    self.use_cuda = bool(use_cuda)
    self.save_path = save_path
    self.print_every = int(print_every)
    self.validate_every = int(validate_every) if validate_every != 0 else -1
    self.best_metric_indicator = None
    self.best_dev_epoch = None
    self.best_dev_step = None
    self.best_dev_perf = None
    self.sampler = sampler
    self.prefetch = prefetch
    self.callback_manager = CallbackManager(env={"trainer": self}, callbacks=callbacks)

    if isinstance(optimizer, torch.optim.Optimizer):
        self.optimizer = optimizer
    else:
        self.optimizer = optimizer.construct_from_pytorch(self.model.parameters())

    self.use_tqdm = use_tqdm
    self.print_every = abs(self.print_every)

    if self.dev_data is not None:
        self.tester = Tester(model=self.model,
                             data=self.dev_data,
                             metrics=self.metrics,
                             batch_size=self.batch_size,
                             use_cuda=self.use_cuda,
                             verbose=0)

    self.step = 0
    self.start_time = None  # start timestamp
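Given the signature above, a custom batch order is injected through the `sampler` argument (it must be a `BaseSampler` instance, and `RandomSampler()` is already the default). A hedged usage sketch, with placeholder data, model, and field names borrowed from the tutorial snippets below:

from fastNLP import Trainer, CrossEntropyLoss, AccuracyMetric
from fastNLP.core.sampler import RandomSampler

trainer = Trainer(train_data=train_data,                    # a fastNLP DataSet with input/target fields set
                  model=model,                              # a torch.nn.Module
                  loss=CrossEntropyLoss(target="label_seq"),
                  metrics=AccuracyMetric(target="label_seq"),
                  dev_data=test_data,
                  sampler=RandomSampler(),                  # explicit here, though it is the default
                  batch_size=32, n_epochs=3, save_path=None)
trainer.train()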
def testENAS(self):
    # read data from csv into a DataSet
    sample_path = "tutorials/sample_data/tutorial_sample_dataset.csv"
    dataset = DataSet.read_csv(sample_path, headers=('raw_sentence', 'label'), sep='\t')
    print(len(dataset))
    print(dataset[0])
    print(dataset[-3])

    dataset.append(Instance(raw_sentence='fake data', label='0'))
    # lowercase all text
    dataset.apply(lambda x: x['raw_sentence'].lower(), new_field_name='raw_sentence')
    # convert label to int
    dataset.apply(lambda x: int(x['label']), new_field_name='label')

    # split sentences on whitespace
    def split_sent(ins):
        return ins['raw_sentence'].split()
    dataset.apply(split_sent, new_field_name='words')

    # add sequence-length information
    dataset.apply(lambda x: len(x['words']), new_field_name='seq_len')
    print(len(dataset))
    print(dataset[0])

    # filter out data with DataSet.drop(func)
    dataset.drop(lambda x: x['seq_len'] <= 3)
    print(len(dataset))

    # specify which DataSet fields should be converted to tensors
    # set target: the gold labels used by loss/evaluate, for loss computation and model evaluation
    dataset.set_target("label")
    # set input: fields consumed by the model's forward
    dataset.set_input("words", "seq_len")

    # split into test set and training set
    test_data, train_data = dataset.split(0.5)
    print(len(test_data))
    print(len(train_data))

    # build the vocabulary with Vocabulary.add(word)
    vocab = Vocabulary(min_freq=2)
    train_data.apply(lambda x: [vocab.add(word) for word in x['words']])
    vocab.build_vocab()

    # index sentences with Vocabulary.to_index(word)
    train_data.apply(lambda x: [vocab.to_index(word) for word in x['words']], new_field_name='words')
    test_data.apply(lambda x: [vocab.to_index(word) for word in x['words']], new_field_name='words')
    print(test_data[0])

    # these preprocessing utilities can also be used for projects such as reinforcement learning or GANs
    from fastNLP.core.batch import Batch
    from fastNLP.core.sampler import RandomSampler

    batch_iterator = Batch(dataset=train_data, batch_size=2, sampler=RandomSampler())
    for batch_x, batch_y in batch_iterator:
        print("batch_x has: ", batch_x)
        print("batch_y has: ", batch_y)
        break

    from fastNLP.models.enas_model import ENASModel
    from fastNLP.models.enas_controller import Controller
    model = ENASModel(embed_num=len(vocab), num_classes=5)
    controller = Controller()

    from fastNLP.models.enas_trainer import ENASTrainer
    from copy import deepcopy

    # rename DataSet fields to match the parameter names of the model's forward
    train_data.rename_field('words', 'word_seq')  # input field matches the forward parameter
    train_data.rename_field('label', 'label_seq')
    test_data.rename_field('words', 'word_seq')
    test_data.rename_field('label', 'label_seq')

    loss = CrossEntropyLoss(pred="output", target="label_seq")
    metric = AccuracyMetric(pred="predict", target="label_seq")

    trainer = ENASTrainer(model=model, controller=controller, train_data=train_data, dev_data=test_data,
                          loss=CrossEntropyLoss(pred="output", target="label_seq"),
                          metrics=AccuracyMetric(pred="predict", target="label_seq"),
                          check_code_level=-1, save_path=None, batch_size=32, print_every=1,
                          n_epochs=3, final_epochs=1)
    trainer.train()
    print('Train finished!')

    # use Tester to evaluate on test_data
    from fastNLP import Tester
    tester = Tester(data=test_data, model=model, metrics=AccuracyMetric(pred="predict", target="label_seq"),
                    batch_size=4)
    acc = tester.test()
    print(acc)
def my_trainer(epochs, batch_size, lr, model_name, optimizer):
    lstm_model.to(device)
    loss_calc = nn.CrossEntropyLoss(reduction='mean')
    batch_iterator = Batch(dataset=train_data, batch_size=batch_size, sampler=RandomSampler())
    batch_iterator2 = Batch(dataset=dev_data, batch_size=batch_size, sampler=RandomSampler())
    loss_list = []
    metric_list = []
    # vali_loss_list = []
    count = 0
    min_perp = 0
    min_perp_epoch = 0
    for epo in range(epochs):
        for batch_x, batch_y in batch_iterator:
            x = batch_x['sentence'].cuda()
            y = batch_y['target'].cuda()
            optimizer.zero_grad()
            output = lstm_model(x)['pred']
            # seq_len = output.shape[2]
            loss = loss_calc(output, y)
            loss.backward()
            optimizer.step()
            with torch.no_grad():
                loss_list.append(loss.item())
                if count % 10 == 0:
                    print("step:", count, ", loss =", loss.item())
                count += 1
        perp = validation(batch_size, batch_iterator2)
        # vali_loss_list.append(vali_loss)
        if epo == 0 or min_perp >= perp:
            min_perp = perp
            # torch.save(lstm_model.state_dict(), model_name)
            min_perp_epoch = epo + 1
        with torch.no_grad():
            metric_list.append(perp)
            print("epochs =", epo + 1, ", perplexity =", perp)
        # print(gen_poem(lstm_model, vocab, "日"))
        # print(gen_poem(lstm_model, vocab, "红"))
        # print(gen_poem(lstm_model, vocab, "山"))
        # print(gen_poem(lstm_model, vocab, "夜"))
        # print(gen_poem(lstm_model, vocab, "湖"))
        # print(gen_poem(lstm_model, vocab, "海"))
        # print(gen_poem(lstm_model, vocab, "月"))
    print("finish train, best model in epoch", min_perp_epoch, ", perplexity =", min_perp)
    # torch.save(lstm_model.state_dict(), model_name+"_final")
    plt.plot(range(1, len(loss_list) + 1), loss_list, label='train_loss')
    plt.xlabel('steps')
    plt.ylabel('Loss')
    plt.title('Adam\nlearning_rate=%.1e, betas=(0.5, 0.99)' % (lr))
    plt.legend()
    plt.show()
    plt.plot(range(1, len(metric_list) + 1), metric_list, label='perplexity')
    plt.xlabel('epochs')
    plt.ylabel('Perplexity')
    plt.title('Adam\nlearning_rate=%.1e, betas=(0.5, 0.99)' % (lr))
    plt.legend()
    plt.show()
    return loss_list
def __init__(self, train_data, model, loss=None, metrics=None, n_epochs=3, batch_size=32, print_every=50,
             validate_every=-1, dev_data=None, use_cuda=False, save_path=None,
             optimizer=Adam(lr=0.01, weight_decay=0), check_code_level=0, metric_key=None,
             sampler=RandomSampler(), use_tqdm=True):
    """
    :param DataSet train_data: the training data
    :param torch.nn.modules.module model: a PyTorch model
    :param LossBase loss: a loss object
    :param MetricBase or List[MetricBase] metrics: a metric object or a list of metrics
    :param int n_epochs: the number of training epochs
    :param int batch_size: batch size for training and validation
    :param int print_every: step interval to print next training information. Default: -1 (no print).
    :param int validate_every: step interval to do next validation. Default: -1 (validate every epoch).
    :param DataSet dev_data: the validation data
    :param use_cuda:
    :param save_path: file path to save models
    :param Optimizer optimizer: an optimizer object
    :param int check_code_level: level of FastNLP code checker. -1: don't check; 0: ignore; 1: warning; 2: strict.
        `ignore` does not check unused fields; `warning` warns if some fields are not used; `strict` raises an
        error if some fields are not used.
    :param str metric_key: a single indicator used to decide the best model based on metric results. It must be
        one of the keys returned by the FIRST metric in `metrics`. If the overall result gets better as the
        indicator gets smaller, add a `-` character in front of the string. For example::

            metric_key="-PPL"   # language model gets better as perplexity gets smaller

    :param sampler: method used to generate batch data.
    :param use_tqdm: boolean, whether to use tqdm to show train progress.
    """
    super(Trainer, self).__init__()

    if not isinstance(train_data, DataSet):
        raise TypeError(f"The type of train_data must be fastNLP.DataSet, got {type(train_data)}.")
    if not isinstance(model, nn.Module):
        raise TypeError(f"The type of model must be torch.nn.Module, got {type(model)}.")

    # check metrics and dev_data
    if (not metrics) and dev_data is not None:
        raise ValueError("No metric for dev_data evaluation.")
    if metrics and (dev_data is None):
        raise ValueError("No dev_data for evaluations, pass dev_data or set metrics to None. ")

    # check save_path
    if not (save_path is None or isinstance(save_path, str)):
        raise ValueError("save_path can only be None or `str`.")

    # prepare evaluate
    metrics = _prepare_metrics(metrics)

    # parse metric_key
    # increase_better is True. It means the exp result gets better if the indicator increases.
    # It is true by default.
    self.increase_better = True
    if metric_key is not None:
        self.increase_better = False if metric_key[0] == "-" else True
        self.metric_key = metric_key[1:] if metric_key[0] == "+" or metric_key[0] == "-" else metric_key
    elif len(metrics) > 0:
        self.metric_key = metrics[0].__class__.__name__.lower().strip('metric')

    # prepare loss
    losser = _prepare_losser(loss)

    # sampler check
    if not isinstance(sampler, BaseSampler):
        raise ValueError("The type of sampler should be fastNLP.BaseSampler, got {}.".format(type(sampler)))

    if check_code_level > -1:
        _check_code(dataset=train_data, model=model, losser=losser, metrics=metrics, dev_data=dev_data,
                    metric_key=metric_key, check_level=check_code_level,
                    batch_size=min(batch_size, DEFAULT_CHECK_BATCH_SIZE))

    self.train_data = train_data
    self.dev_data = dev_data  # If None, no validation.
    self.model = model
    self.losser = losser
    self.metrics = metrics
    self.n_epochs = int(n_epochs)
    self.batch_size = int(batch_size)
    self.use_cuda = bool(use_cuda)
    self.save_path = save_path
    self.print_every = int(print_every)
    self.validate_every = int(validate_every)
    self.best_metric_indicator = None
    self.sampler = sampler

    if isinstance(optimizer, torch.optim.Optimizer):
        self.optimizer = optimizer
    else:
        self.optimizer = optimizer.construct_from_pytorch(self.model.parameters())

    self.use_tqdm = use_tqdm
    if self.use_tqdm:
        tester_verbose = 0
        self.print_every = abs(self.print_every)
    else:
        tester_verbose = 1

    if self.dev_data is not None:
        self.tester = Tester(model=self.model,
                             data=self.dev_data,
                             metrics=self.metrics,
                             batch_size=self.batch_size,
                             use_cuda=self.use_cuda,
                             verbose=tester_verbose)

    self.step = 0
    self.start_time = None  # start timestamp
def init_data_iterator(self, prop=0.8):
    train_data, test_data = get_text_classification_datasets()
    train_dataset = DataSet()
    valid_dataset = DataSet()
    length = len(train_data.data)
    for i in range(length):
        if i < int(prop * length):
            train_dataset.append(Instance(text=train_data.data[i], label=int(train_data.target[i])))
        else:
            valid_dataset.append(Instance(text=train_data.data[i], label=int(train_data.target[i])))
    test_dataset = DataSet()
    for i in range(len(test_data.data)):
        test_dataset.append(Instance(text=test_data.data[i], label=int(test_data.target[i])))

    trans = str.maketrans({key: None for key in string.punctuation})

    train_dataset.apply(lambda x: x['text'].lower().translate(trans), new_field_name='text')
    train_dataset.apply(lambda x: re.sub(pattern=r'\s', repl=' ', string=x['text']), new_field_name='text')
    train_dataset.apply(lambda x: x['text'].split(' '), new_field_name='text')
    train_dataset.apply(remove_empty, new_field_name='text')
    train_dataset.apply(pad_label, new_field_name='label_pad')

    valid_dataset.apply(lambda x: x['text'].lower().translate(trans), new_field_name='text')
    valid_dataset.apply(lambda x: re.sub(pattern=r'\s', repl=' ', string=x['text']), new_field_name='text')
    valid_dataset.apply(lambda x: x['text'].split(' '), new_field_name='text')
    valid_dataset.apply(remove_empty, new_field_name='text')
    valid_dataset.apply(pad_label, new_field_name='label_pad')

    test_dataset.apply(lambda x: x['text'].lower().translate(trans), new_field_name='text')
    test_dataset.apply(lambda x: re.sub(pattern=r'\s', repl=' ', string=x['text']), new_field_name='text')
    test_dataset.apply(lambda x: x['text'].split(' '), new_field_name='text')
    test_dataset.apply(remove_empty, new_field_name='text')
    test_dataset.apply(pad_label, new_field_name='label_pad')

    vocab = Vocabulary(min_freq=10)
    train_dataset.apply(lambda x: [vocab.add(word) for word in x['text']])
    vocab.build_vocab()

    train_dataset.apply(lambda x: [vocab.to_index(word) for word in x['text']], new_field_name='text_index')
    valid_dataset.apply(lambda x: [vocab.to_index(word) for word in x['text']], new_field_name='text_index')
    test_dataset.apply(lambda x: [vocab.to_index(word) for word in x['text']], new_field_name='text_index')

    train_dataset.set_input('text_index')
    train_dataset.set_target('label_pad')
    valid_dataset.set_input('text_index')
    valid_dataset.set_target('label_pad')
    test_dataset.set_input('text_index')
    test_dataset.set_target('label_pad')

    bs = self.args['data']['batch_size']
    train_batch = Batch(dataset=train_dataset, batch_size=bs, sampler=RandomSampler())
    valid_batch = Batch(dataset=valid_dataset, batch_size=bs, sampler=RandomSampler())
    test_batch = Batch(dataset=test_dataset, batch_size=bs, sampler=RandomSampler())
    self.input_dim = len(vocab)
    return train_batch, valid_batch, test_batch
def test_fastnlp_10min_tutorial(self):
    # read data from csv into a DataSet
    sample_path = "test/data_for_tests/tutorial_sample_dataset.csv"
    dataset = CSVLoader(headers=['raw_sentence', 'label'], sep=' ')._load(sample_path)
    print(len(dataset))
    print(dataset[0])
    print(dataset[-3])

    dataset.append(Instance(raw_sentence='fake data', label='0'))
    # lowercase all text
    dataset.apply(lambda x: x['raw_sentence'].lower(), new_field_name='raw_sentence')
    # convert label to int
    dataset.apply(lambda x: int(x['label']), new_field_name='label')

    # split sentences on whitespace
    def split_sent(ins):
        return ins['raw_sentence'].split()
    dataset.apply(split_sent, new_field_name='words')

    # add sequence-length information
    dataset.apply(lambda x: len(x['words']), new_field_name='seq_len')
    print(len(dataset))
    print(dataset[0])

    # filter out data with DataSet.drop(func)
    dataset.drop(lambda x: x['seq_len'] <= 3, inplace=True)
    print(len(dataset))

    # specify which DataSet fields should be converted to tensors
    # set target: the gold labels used by loss/evaluate, for loss computation and model evaluation
    dataset.set_target("label")
    # set input: fields consumed by the model's forward
    dataset.set_input("words", "seq_len")

    # split into test set and training set
    test_data, train_data = dataset.split(0.5)
    print(len(test_data))
    print(len(train_data))

    # build the vocabulary with Vocabulary.add(word)
    vocab = Vocabulary(min_freq=2)
    train_data.apply(lambda x: [vocab.add(word) for word in x['words']])
    vocab.build_vocab()

    # index sentences with Vocabulary.to_index(word)
    train_data.apply(lambda x: [vocab.to_index(word) for word in x['words']], new_field_name='words')
    test_data.apply(lambda x: [vocab.to_index(word) for word in x['words']], new_field_name='words')
    print(test_data[0])

    # these preprocessing utilities can also be used for projects such as reinforcement learning or GANs
    from fastNLP.core.batch import DataSetIter
    from fastNLP.core.sampler import RandomSampler

    batch_iterator = DataSetIter(dataset=train_data, batch_size=2, sampler=RandomSampler())
    for batch_x, batch_y in batch_iterator:
        print("batch_x has: ", batch_x)
        print("batch_y has: ", batch_y)
        break

    from fastNLP.models import CNNText
    model = CNNText((len(vocab), 50), num_classes=5, dropout=0.1)

    from fastNLP import Trainer
    from copy import deepcopy

    # rename DataSet fields to match the parameter names of the model's forward
    train_data.rename_field('label', 'label_seq')
    test_data.rename_field('label', 'label_seq')

    loss = CrossEntropyLoss(target="label_seq")
    metric = AccuracyMetric(target="label_seq")

    # instantiate a Trainer with the model and data, then train
    # first overfit on test_data (to make sure the model implementation is correct)
    copy_model = deepcopy(model)
    overfit_trainer = Trainer(train_data=test_data, model=copy_model, loss=loss, batch_size=32, n_epochs=5,
                              dev_data=test_data, metrics=metric, save_path=None)
    overfit_trainer.train()

    # train on train_data, validate on test_data
    trainer = Trainer(model=model, train_data=train_data, dev_data=test_data,
                      loss=CrossEntropyLoss(target="label_seq"),
                      metrics=AccuracyMetric(target="label_seq"),
                      save_path=None, batch_size=32, n_epochs=5)
    trainer.train()
    print('Train finished!')

    # use Tester to evaluate on test_data
    from fastNLP import Tester
    tester = Tester(data=test_data, model=model, metrics=AccuracyMetric(target="label_seq"), batch_size=4)
    acc = tester.test()
    print(acc)
def __init__(self, train_data, model, loss=None, metrics=None, n_epochs=3, batch_size=32, print_every=50,
             validate_every=-1, dev_data=None, use_cuda=False, save_path=None,
             optimizer=Adam(lr=0.01, weight_decay=0), check_code_level=0, metric_key=None,
             sampler=RandomSampler(), use_tqdm=True):
    super(Trainer, self).__init__()

    if not isinstance(train_data, DataSet):
        raise TypeError(f"The type of train_data must be fastNLP.DataSet, got {type(train_data)}.")
    if not isinstance(model, nn.Module):
        raise TypeError(f"The type of model must be torch.nn.Module, got {type(model)}.")

    # check metrics and dev_data
    if (not metrics) and dev_data is not None:
        raise ValueError("No metric for dev_data evaluation.")
    if metrics and (dev_data is None):
        raise ValueError("No dev_data for evaluations, pass dev_data or set metrics to None. ")

    # check save_path
    if not (save_path is None or isinstance(save_path, str)):
        raise ValueError("save_path can only be None or `str`.")

    # prepare evaluate
    metrics = _prepare_metrics(metrics)

    # parse metric_key
    # increase_better is True. It means the exp result gets better if the indicator increases.
    # It is true by default.
    self.increase_better = True
    if metric_key is not None:
        self.increase_better = False if metric_key[0] == "-" else True
        self.metric_key = metric_key[1:] if metric_key[0] == "+" or metric_key[0] == "-" else metric_key
    elif len(metrics) > 0:
        self.metric_key = metrics[0].__class__.__name__.lower().strip('metric')

    # prepare loss
    losser = _prepare_losser(loss)

    # sampler check
    if not isinstance(sampler, BaseSampler):
        raise ValueError("The type of sampler should be fastNLP.BaseSampler, got {}.".format(type(sampler)))

    if check_code_level > -1:
        _check_code(dataset=train_data, model=model, losser=losser, metrics=metrics, dev_data=dev_data,
                    metric_key=metric_key, check_level=check_code_level,
                    batch_size=min(batch_size, DEFAULT_CHECK_BATCH_SIZE))

    self.train_data = train_data
    self.dev_data = dev_data  # If None, no validation.
    self.model = model
    self.losser = losser
    self.metrics = metrics
    self.n_epochs = int(n_epochs)
    self.batch_size = int(batch_size)
    self.use_cuda = bool(use_cuda)
    self.save_path = save_path
    self.print_every = int(print_every)
    self.validate_every = int(validate_every)
    self.best_metric_indicator = None
    self.sampler = sampler

    if isinstance(optimizer, torch.optim.Optimizer):
        self.optimizer = optimizer
    else:
        self.optimizer = optimizer.construct_from_pytorch(self.model.parameters())

    self.use_tqdm = use_tqdm
    if self.use_tqdm:
        tester_verbose = 0
        self.print_every = abs(self.print_every)
    else:
        tester_verbose = 1

    if self.dev_data is not None:
        self.tester = Tester(model=self.model,
                             data=self.dev_data,
                             metrics=self.metrics,
                             batch_size=self.batch_size,
                             use_cuda=self.use_cuda,
                             verbose=tester_verbose)

    self.step = 0
    self.start_time = None  # start timestamp