def main():
    flow.config.gpu_device_num(args.gpu_num_per_node)
    flow.env.log_dir(args.log_dir)
    InitNodes(args)

    snapshot = Snapshot(args.model_save_dir, args.model_load_dir)
    summary = Summary(args.log_dir, args)

    if args.do_train:
        print('| Training Start')
        for epoch in range(args.num_epochs):
            metric = Metric(desc='finetune', print_steps=args.loss_print_every_n_iter,
                            summary=summary, batch_size=batch_size, keys=['loss'])
            for step in range(epoch_size):
                BertGlueFinetuneJob().async_get(
                    metric.metric_cb(step, epoch=epoch))

            run_eval_job(BertGlueEvalTrainJob, epoch_size, desc='train')
            run_eval_job(BertGlueEvalValJob, num_eval_steps, desc='eval')

        if args.save_last_snapshot:
            snapshot.save("last_snapshot")

    if args.do_eval:
        print('| Evaluation Start')
        run_eval_job(BertGlueEvalValJob, num_eval_steps, desc='eval')
def main():
    InitNodes(args)
    flow.env.grpc_use_no_signal()
    flow.env.log_dir(args.log_dir)

    summary = Summary(args.log_dir, args)
    snapshot = Snapshot(args.model_save_dir, args.model_load_dir)

    for epoch in range(args.num_epochs):
        metric = Metric(desc='train', calculate_batches=args.loss_print_every_n_iter,
                        summary=summary, save_summary_steps=epoch_size,
                        batch_size=train_batch_size, loss_key='loss')
        for i in range(epoch_size):
            TrainNet().async_get(metric.metric_cb(epoch, i))

        if args.val_data_dir:
            metric = Metric(desc='validation', calculate_batches=num_val_steps,
                            summary=summary, save_summary_steps=num_val_steps,
                            batch_size=val_batch_size)
            for i in range(num_val_steps):
                InferenceNet().async_get(metric.metric_cb(epoch, i))

        snapshot.save('epoch_{}'.format(epoch))
def main():
    InitNodes(args)
    flow.env.log_dir(args.log_dir)

    snapshot = Snapshot(args.model_save_dir, args.model_load_dir, args.save_init)

    print(" {} iter per epoch...".format(epoch_size))
    for epoch in range(1, args.num_epochs + 1):
        metric = Metric(
            desc="train",
            calculate_batches=args.loss_print_every_n_iter,
            batch_size=train_batch_size,
            loss_key="loss",
        )
        for i in range(epoch_size):
            TrainNet().async_get(metric.metric_cb(epoch, i))

        if args.val_data_dir:
            metric = Metric(
                desc="validation",
                calculate_batches=num_val_steps,
                batch_size=val_batch_size,
            )
            for i in range(num_val_steps):
                InferenceNet().async_get(metric.metric_cb(epoch, i))

        if epoch % args.save_epoch_interval == 0:
            snapshot.save("epoch_{}".format(epoch))

    if args.save_last:
        snapshot.save("epoch_{}".format("last"))
def main():
    flow.config.gpu_device_num(args.gpu_num_per_node)
    flow.env.log_dir(args.log_dir)
    InitNodes(args)

    snapshot = Snapshot(args.model_save_dir, args.model_load_dir)

    # if args.save_and_break:
    #     print("save model just after init and exit")
    #     snapshot.save("initial_snapshot")
    #     import sys
    #     sys.exit()

    for epoch in range(args.num_epochs):
        metric = Metric(desc='finetune', print_steps=args.loss_print_every_n_iter,
                        batch_size=batch_size, keys=['loss'])
        for step in range(epoch_size):
            BertGlueFinetuneJob().async_get(metric.metric_cb(step, epoch=epoch))

        # if 1:  # step % args.loss_print_every_n_iter == 0:
        run_eval_job(BertGlueEvalTrainJob, epoch_size, desc='train')
        run_eval_job(BertGlueEvalValJob, num_eval_steps, desc='eval')

    if args.save_last_snapshot:
        snapshot.save("last_snapshot")
def main():
    flow.config.gpu_device_num(args.gpu_num_per_node)
    flow.env.log_dir(args.log_dir)
    InitNodes(args)

    if args.do_train or args.do_eval:
        snapshot = Snapshot(args.model_save_dir, args.model_load_dir)

    if args.do_train:
        summary = Summary(args.log_dir, args)
        for epoch in range(args.num_epochs):
            metric = Metric(desc='train', print_steps=args.loss_print_every_n_iter,
                            summary=summary, batch_size=batch_size,
                            keys=['total_loss'])
            for step in range(epoch_size):
                SquadFinetuneJob().async_get(
                    metric.metric_cb(step, epoch=epoch))

        if args.save_last_snapshot:
            snapshot.save("last_snapshot")

    if args.do_eval:
        assert os.path.isdir(args.eval_data_dir)
        # Collect per-example start/end logits as RawResult entries,
        # then write the SQuAD prediction json from them.
        all_results = []
        for step in range(num_eval_steps):
            unique_ids, start_positions, end_positions = SquadDevJob().get()
            unique_ids = unique_ids.numpy()
            start_positions = start_positions.numpy()
            end_positions = end_positions.numpy()

            for unique_id, start_position, end_position in zip(
                    unique_ids, start_positions, end_positions):
                all_results.append(
                    RawResult(
                        unique_id=int(unique_id[0]),
                        start_logits=start_position.flatten().tolist(),
                        end_logits=end_position.flatten().tolist(),
                    ))

            if step % args.loss_print_every_n_iter == 0:
                print("{}/{}, num of results:{}".format(
                    step, num_eval_steps, len(all_results)))
                print("last uid:", unique_id[0])

        gen_eval_predict_json(args, all_results)
def main():
    flow.config.gpu_device_num(args.gpu_num_per_node)
    flow.env.log_dir(args.log_dir)
    InitNodes(args)

    snapshot = Snapshot(args.model_save_dir, args.model_load_dir)

    metric = Metric(desc='train', print_steps=args.loss_print_every_n_iter,
                    batch_size=batch_size,
                    keys=['total_loss', 'mlm_loss', 'nsp_loss'])
    for step in range(args.iter_num):
        PretrainJob().async_get(metric.metric_cb(step))
        # PretrainJob().async_get(metric.metric_cb(step, epoch=3))
        if (step + 1) % args.model_save_every_n_iter == 0:
            snapshot.save("snapshot_%d" % (step + 1))

    if args.save_last_snapshot:
        snapshot.save("last_snapshot")
def main():
    flow.config.gpu_device_num(args.gpu_num_per_node)
    flow.env.log_dir(args.log_dir)
    InitNodes(args)

    snapshot = Snapshot(args.model_save_dir, args.model_load_dir, args.model_save_init)

    print("num_accumulation_steps:", args.num_accumulation_steps)
    metric = Metric(
        desc="train",
        print_steps=args.loss_print_every_n_iter,
        # report metrics over the effective (accumulated) batch size
        batch_size=batch_size * args.num_accumulation_steps,
        keys=["total_loss", "mlm_loss", "nsp_loss"],
    )
    for step in range(args.iter_num):
        PretrainJob().async_get(metric.metric_cb(step))
        # PretrainJob().async_get(metric.metric_cb(step, epoch=3))
        if (step + 1) % args.model_save_every_n_iter == 0:
            snapshot.save("snapshot_%d" % (step + 1))

    if args.save_last_snapshot:
        snapshot.save("last_snapshot")
def main():
    flow.config.gpu_device_num(args.gpu_num_per_node)
    flow.env.log_dir(args.log_dir)
    InitNodes(args)

    if args.do_train:
        snapshot = Snapshot(args.model_save_dir, args.model_load_dir)
        summary = Summary(args.log_dir, args)
        best_dev_acc = 0.0
        best_result = {}
        for epoch in range(args.num_epochs):
            metric = Metric(desc='finetune', print_steps=args.loss_print_every_n_iter,
                            summary=summary, batch_size=batch_size, keys=['loss'])
            for step in range(epoch_size):
                BertGlueFinetuneJob().async_get(
                    metric.metric_cb(step, epoch=epoch))

            # if 1:  # step % args.loss_print_every_n_iter == 0:
            run_eval_job(BertGlueEvalTrainJob, epoch_size, desc='train')
            result = run_eval_job(BertGlueEvalValJob, num_eval_steps, desc='eval')

            save_model = False
            if task_name in acc_tasks and result['accuracy'] > best_dev_acc:
                best_dev_acc = result['accuracy']
                best_result = result
                save_model = True
                print('Best result:', result)

            # if task_name in corr_tasks and result['corr'] > best_dev_acc:
            #     best_dev_acc = result['corr']
            #     best_result = result
            #     save_model = True
            #     print('Best result:', result)

            if task_name in mcc_tasks and result['matthews_corrcoef'] > best_dev_acc:
                best_dev_acc = result['matthews_corrcoef']
                best_result = result
                save_model = True
                print('Best result:', result)

            if save_model:
                if not os.path.exists(args.model_save_dir):
                    os.makedirs(args.model_save_dir)
                # snapshot_save_path = os.path.join(args.model_save_dir)
                # print("Saving best model to {}".format(snapshot_save_path))
                snapshot.save('best')
                flow.sync_default_session()

        print('Best result:', best_result)
        print("Saving best model to " +
              os.path.join(args.model_save_dir, 'snapshot_best'))

        if args.serve_for_online:
            print('Deleting the optimizer params from model_save_dir...')
            remove_optimizer_params(
                os.path.join(args.model_save_dir, 'snapshot_best'))

        # if args.save_last_snapshot:
        #     snapshot.save("last_snapshot")

    if args.do_eval:
        print('Loading model...')
        print(args.model_save_dir)
        if not args.do_train:
            check_point = flow.train.CheckPoint()
            check_point.load(args.model_save_dir)
        print('Evaluation...')
        run_eval_job(BertGlueEvalValJob, num_eval_steps, desc='eval')
        # flow.tensorrt.write_int8_calibration("./int8_calibration")  # mkdir int8_calibration
        if args.val_data_dir:
            metric = Metric(desc='validation', calculate_batches=num_val_steps,
                            summary=summary, save_summary_steps=num_val_steps,
                            batch_size=val_batch_size)
            for i in range(num_val_steps):
                # if i <= 10:
                #     InferenceNet().get()
                # if i == 10:
                #     flow.tensorrt.cache_int8_calibration()
                # else:
                #     InferenceNet().async_get(metric.metric_cb(epoch, i))
                InferenceNet().async_get(metric.metric_cb(epoch, i))

        snapshot.save('epoch_{}'.format(epoch))


if __name__ == "__main__":
    main()
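
# ---------------------------------------------------------------------------
# Note: every main() above reads a module-level `args` namespace (fields such
# as gpu_num_per_node, log_dir, model_save_dir, model_load_dir, num_epochs,
# loss_print_every_n_iter, ...). The real scripts build it with their own
# config/argument helpers, which are not shown here. The sketch below is only
# a minimal, hypothetical illustration of how such a namespace could be
# assembled with argparse; the flag names mirror the attributes used above but
# are assumptions, not the scripts' actual command-line interface.
import argparse


def _parse_args_sketch():
    parser = argparse.ArgumentParser(description="training entry point (illustrative sketch)")
    parser.add_argument("--gpu_num_per_node", type=int, default=1)
    parser.add_argument("--log_dir", type=str, default="./output")
    parser.add_argument("--model_save_dir", type=str, default="./snapshots")
    parser.add_argument("--model_load_dir", type=str, default="")
    parser.add_argument("--num_epochs", type=int, default=3)
    parser.add_argument("--loss_print_every_n_iter", type=int, default=20)
    parser.add_argument("--save_last_snapshot", action="store_true")
    return parser.parse_args()

# args = _parse_args_sketch()  # would run before main() in an actual script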