def main():
    """GLUE fine-tuning entry point: optional training epochs, then evaluation.

    Reads all configuration from the module-level ``args``; per-epoch metrics
    are forwarded to ``summary`` and both splits are evaluated after every
    training epoch.
    """
    flow.config.gpu_device_num(args.gpu_num_per_node)
    flow.env.log_dir(args.log_dir)
    InitNodes(args)
    snapshot = Snapshot(args.model_save_dir, args.model_load_dir)
    summary = Summary(args.log_dir, args)
    if args.do_train:
        print('| Training Start')
        for epoch_idx in range(args.num_epochs):
            epoch_metric = Metric(
                desc='finetune',
                print_steps=args.loss_print_every_n_iter,
                summary=summary,
                batch_size=batch_size,
                keys=['loss'],
            )
            for batch_idx in range(epoch_size):
                callback = epoch_metric.metric_cb(batch_idx, epoch=epoch_idx)
                BertGlueFinetuneJob().async_get(callback)
            # End-of-epoch evaluation on both the train and eval splits.
            run_eval_job(BertGlueEvalTrainJob, epoch_size, desc='train')
            run_eval_job(BertGlueEvalValJob, num_eval_steps, desc='eval')
        if args.save_last_snapshot:
            snapshot.save("last_snapshot")
    if args.do_eval:
        print('| Evaluation Start')
        run_eval_job(BertGlueEvalValJob, num_eval_steps, desc='eval')
def main():
    """GLUE fine-tuning entry point (no TensorBoard summary variant).

    Runs ``args.num_epochs`` epochs of fine-tuning, evaluating both splits
    after each epoch, and optionally saves a final snapshot.
    """
    flow.config.gpu_device_num(args.gpu_num_per_node)
    flow.env.log_dir(args.log_dir)
    InitNodes(args)
    snapshot = Snapshot(args.model_save_dir, args.model_load_dir)
    for epoch in range(args.num_epochs):
        metric = Metric(desc='finetune',
                        print_steps=args.loss_print_every_n_iter,
                        batch_size=batch_size,
                        keys=['loss'])
        for step in range(epoch_size):
            BertGlueFinetuneJob().async_get(metric.metric_cb(step, epoch=epoch))
        # Evaluate on both splits at the end of every epoch.
        run_eval_job(BertGlueEvalTrainJob, epoch_size, desc='train')
        run_eval_job(BertGlueEvalValJob, num_eval_steps, desc='eval')
    if args.save_last_snapshot:
        snapshot.save("last_snapshot")
def main():
    """Train for ``args.num_epochs`` epochs, validating (when a validation
    data dir is given) and saving a snapshot at the end of every epoch."""
    InitNodes(args)
    flow.env.grpc_use_no_signal()
    flow.env.log_dir(args.log_dir)
    summary = Summary(args.log_dir, args)
    snapshot = Snapshot(args.model_save_dir, args.model_load_dir)
    for epoch in range(args.num_epochs):
        train_metric = Metric(
            desc='train',
            calculate_batches=args.loss_print_every_n_iter,
            summary=summary,
            save_summary_steps=epoch_size,
            batch_size=train_batch_size,
            loss_key='loss',
        )
        for batch in range(epoch_size):
            TrainNet().async_get(train_metric.metric_cb(epoch, batch))
        if args.val_data_dir:
            val_metric = Metric(
                desc='validation',
                calculate_batches=num_val_steps,
                summary=summary,
                save_summary_steps=num_val_steps,
                batch_size=val_batch_size,
            )
            for batch in range(num_val_steps):
                InferenceNet().async_get(val_metric.metric_cb(epoch, batch))
        snapshot.save('epoch_{}'.format(epoch))
def main():
    """Train with 1-based epoch numbering, periodic snapshots every
    ``args.save_epoch_interval`` epochs, and an optional final save."""
    InitNodes(args)
    flow.env.log_dir(args.log_dir)
    snapshot = Snapshot(args.model_save_dir, args.model_load_dir, args.save_init)
    print(" {} iter per epoch...".format(epoch_size))
    for epoch in range(1, args.num_epochs + 1):
        trainer_metric = Metric(
            desc="train",
            calculate_batches=args.loss_print_every_n_iter,
            batch_size=train_batch_size,
            loss_key="loss",
        )
        for batch in range(epoch_size):
            TrainNet().async_get(trainer_metric.metric_cb(epoch, batch))
        if args.val_data_dir:
            val_metric = Metric(
                desc="validation",
                calculate_batches=num_val_steps,
                batch_size=val_batch_size,
            )
            for batch in range(num_val_steps):
                InferenceNet().async_get(val_metric.metric_cb(epoch, batch))
        if epoch % args.save_epoch_interval == 0:
            snapshot.save("epoch_{}".format(epoch))
    if args.save_last:
        snapshot.save("epoch_{}".format("last"))
def main():
    """SQuAD fine-tuning and/or evaluation, selected by args.do_train / args.do_eval.

    Training runs the finetune job for num_epochs * epoch_size steps; evaluation
    collects per-example start/end logits into RawResult records and writes a
    prediction JSON via gen_eval_predict_json.
    """
    flow.config.gpu_device_num(args.gpu_num_per_node)
    flow.env.log_dir(args.log_dir)
    InitNodes(args)
    if args.do_train or args.do_eval:
        snapshot = Snapshot(args.model_save_dir, args.model_load_dir)
    if args.do_train:
        summary = Summary(args.log_dir, args)
        for epoch in range(args.num_epochs):
            metric = Metric(desc='train',
                            print_steps=args.loss_print_every_n_iter,
                            summary=summary,
                            batch_size=batch_size,
                            keys=['total_loss'])
            for step in range(epoch_size):
                SquadFinetuneJob().async_get(
                    metric.metric_cb(step, epoch=epoch))
        if args.save_last_snapshot:
            snapshot.save("last_snapshot")
    if args.do_eval:
        assert os.path.isdir(args.eval_data_dir)
        all_results = []
        for step in range(num_eval_steps):
            # Synchronous .get() so the batch's outputs are available immediately.
            unique_ids, start_positions, end_positions = SquadDevJob().get()
            unique_ids = unique_ids.numpy()
            start_positions = start_positions.numpy()
            end_positions = end_positions.numpy()
            # One RawResult per example in the batch.
            # assumes unique_ids rows are 1-element arrays (hence unique_id[0]) — TODO confirm
            for unique_id, start_position, end_position in zip(
                    unique_ids, start_positions, end_positions):
                all_results.append(
                    RawResult(
                        unique_id=int(unique_id[0]),
                        start_logits=start_position.flatten().tolist(),
                        end_logits=end_position.flatten().tolist(),
                    ))
            if step % args.loss_print_every_n_iter == 0:
                print("{}/{}, num of results:{}".format(
                    step, num_eval_steps, len(all_results)))
                # NOTE(review): reuses the inner loop variable after the loop;
                # raises NameError if a batch is ever empty — verify upstream.
                print("last uid:", unique_id[0])
        gen_eval_predict_json(args, all_results)
def main():
    """Restore a model from ``args.model_load_dir`` and run validation.

    Repeats the full validation pass ``args.num_epochs`` times; the metric
    callback is always given epoch index 0 so results are reported as a
    single validation stream.
    """
    InitNodes(args)
    assert args.model_load_dir, "Must have model load dir!"
    flow.env.log_dir(args.log_dir)
    print("Restoring model from {}.".format(args.model_load_dir))
    flow.load_variables(flow.checkpoint.get(args.model_load_dir))
    metric = Metric(desc="validation",
                    calculate_batches=num_val_steps,
                    batch_size=val_batch_size)
    # Epoch counter itself is unused; each pass reuses epoch id 0 on purpose.
    for _ in range(args.num_epochs):
        for j in range(num_val_steps):
            InferenceNet().async_get(metric.metric_cb(0, j))
def main():
    # Restore a model and (apparently) benchmark int8 online inference.
    InitNodes(args)
    assert args.model_load_dir, 'Must have model load dir!'
    flow.env.log_dir(args.log_dir)
    # Record the on-disk model size in the run summary.
    modelSize = getdirsize(args.model_load_dir)
    summary = Summary(args.log_dir, args, modelSize)
    # snapshot = Snapshot(args.model_save_dir, args.model_load_dir)
    print("Restoring model from {}.".format(args.model_load_dir))
    checkpoint = flow.train.CheckPoint()
    checkpoint.load(args.model_load_dir)
    if args.use_int8_online:
        # NOTE(review): SOURCE is truncated here — the body of this loop (and
        # the rest of the function) is not visible in this view; do not assume
        # what it does.
        for j in range(10):
def main():
    """Restore a checkpoint and run the validation job, repeating the full
    validation pass once per configured epoch (always reported as epoch 0)."""
    InitNodes(args)
    assert args.model_load_dir, 'Must have model load dir!'
    flow.env.log_dir(args.log_dir)
    summary = Summary(args.log_dir, args)
    # snapshot = Snapshot(args.model_save_dir, args.model_load_dir)
    print("Restoring model from {}.".format(args.model_load_dir))
    ckpt = flow.train.CheckPoint()
    ckpt.load(args.model_load_dir)
    val_metric = Metric(
        desc='validation',
        calculate_batches=num_val_steps,
        summary=summary,
        save_summary_steps=num_val_steps,
        batch_size=val_batch_size,
    )
    for _ in range(args.num_epochs):
        for step in range(num_val_steps):
            InferenceNet().async_get(val_metric.metric_cb(0, step))
def main():
    """Run BERT pre-training for ``args.iter_num`` iterations.

    Reports total/MLM/NSP losses through ``Metric`` and saves a snapshot
    every ``args.model_save_every_n_iter`` steps, plus an optional final one.
    """
    flow.config.gpu_device_num(args.gpu_num_per_node)
    flow.env.log_dir(args.log_dir)
    InitNodes(args)
    snapshot = Snapshot(args.model_save_dir, args.model_load_dir)
    metric = Metric(desc='train',
                    print_steps=args.loss_print_every_n_iter,
                    batch_size=batch_size,
                    keys=['total_loss', 'mlm_loss', 'nsp_loss'])
    for step in range(args.iter_num):
        PretrainJob().async_get(metric.metric_cb(step))
        # Periodic checkpoint; step is 0-based so save on multiples of the interval.
        if (step + 1) % args.model_save_every_n_iter == 0:
            snapshot.save("snapshot_%d" % (step + 1))
    if args.save_last_snapshot:
        snapshot.save("last_snapshot")
def main():
    """BERT pre-training driver with gradient accumulation.

    The effective batch size reported to ``Metric`` is the per-step batch
    size multiplied by ``args.num_accumulation_steps``.
    """
    flow.config.gpu_device_num(args.gpu_num_per_node)
    flow.env.log_dir(args.log_dir)
    InitNodes(args)
    snapshot = Snapshot(
        args.model_save_dir, args.model_load_dir, args.model_save_init)
    print("num_accumulation_steps:", args.num_accumulation_steps)
    train_metric = Metric(
        desc="train",
        print_steps=args.loss_print_every_n_iter,
        batch_size=batch_size * args.num_accumulation_steps,
        keys=["total_loss", "mlm_loss", "nsp_loss"],
    )
    for it in range(args.iter_num):
        PretrainJob().async_get(train_metric.metric_cb(it))
        if (it + 1) % args.model_save_every_n_iter == 0:
            snapshot.save("snapshot_%d" % (it + 1))
    if args.save_last_snapshot:
        snapshot.save("last_snapshot")
def main():
    """GLUE fine-tuning with best-checkpoint tracking.

    Training keeps the best dev accuracy (or Matthews correlation for MCC
    tasks) seen so far and saves a 'best' snapshot whenever it improves.
    Evaluation reloads the saved model only when training did not just run.
    """
    flow.config.gpu_device_num(args.gpu_num_per_node)
    flow.env.log_dir(args.log_dir)
    InitNodes(args)
    if args.do_train:
        snapshot = Snapshot(args.model_save_dir, args.model_load_dir)
        summary = Summary(args.log_dir, args)
        best_dev_acc = 0.0
        best_result = {}
        for epoch in range(args.num_epochs):
            metric = Metric(desc='finetune',
                            print_steps=args.loss_print_every_n_iter,
                            summary=summary,
                            batch_size=batch_size,
                            keys=['loss'])
            for step in range(epoch_size):
                BertGlueFinetuneJob().async_get(
                    metric.metric_cb(step, epoch=epoch))
            #if 1: #step % args.loss_print_every_n_iter == 0:
            run_eval_job(BertGlueEvalTrainJob, epoch_size, desc='train')
            result = run_eval_job(BertGlueEvalValJob, num_eval_steps, desc='eval')
            # Decide whether this epoch produced a new best model.
            save_model = False
            if task_name in acc_tasks and result['accuracy'] > best_dev_acc:
                best_dev_acc = result['accuracy']
                best_result = result
                save_model = True
                print('Best result:', result)
            # if task_name in corr_tasks and result['corr'] > best_dev_acc:
            #     best_dev_acc = result['corr']
            #     best_result = result
            #     save_model = True
            #     print('Best result:', result)
            if task_name in mcc_tasks and result[
                    'matthews_corrcoef'] > best_dev_acc:
                best_dev_acc = result['matthews_corrcoef']
                best_result = result
                save_model = True
                print('Best result:', result)
            if save_model:
                if not os.path.exists(args.model_save_dir):
                    os.makedirs(args.model_save_dir)
                # snapshot_save_path = os.path.join(args.model_save_dir)
                # print("Saving best model to {}".format(snapshot_save_path))
                snapshot.save('best')
                # Ensure the async save is flushed before continuing.
                flow.sync_default_session()
        print('Best result:', best_result)
        # NOTE(review): prints path 'snapshot_best' while the save above used
        # the name 'best' — verify Snapshot's on-disk naming convention.
        print("Saving best model to " +
              os.path.join(args.model_save_dir, 'snapshot_best'))
        if args.serve_for_online:
            print('Deleting the optimizer parmas from model_save_dir...')
            remove_optimizer_params(
                os.path.join(args.model_save_dir, 'snapshot_best'))
        # if args.save_last_snapshot:
        #     snapshot.save("last_snapshot")
    if args.do_eval:
        print('Loading model...')
        print(args.model_save_dir)
        # When training just ran, the session already holds the best weights;
        # otherwise reload them from disk first.
        if not args.do_train:
            check_point = flow.train.CheckPoint()
            check_point.load(args.model_save_dir)
        print('Evaluation...')
        run_eval_job(BertGlueEvalValJob, num_eval_steps, desc='eval')
def main():
    """Initialize nodes and logging, then record run configuration together
    with the model's on-disk size in the summary."""
    InitNodes(args)
    flow.env.log_dir(args.log_dir)
    size_on_disk = getdirsize(args.model_load_dir)
    run_summary = Summary(args.log_dir, args, size_on_disk)
def main():
    """Initialize nodes and logging, then record run configuration in the
    summary."""
    InitNodes(args)
    flow.env.log_dir(args.log_dir)
    run_summary = Summary(args.log_dir, args)
def main():
    """Knowledge-distillation fine-tuning (student + teacher) with
    best-checkpoint saving.

    Training first merges the student and teacher checkpoints into one temp
    dir so a single CheckPoint.load restores both, then tracks the best dev
    metric and rewrites args.model_save_dir with the best model.
    """
    flow.config.gpu_device_num(args.gpu_num_per_node)
    flow.env.log_dir(args.log_dir)
    InitNodes(args)
    check_point = flow.train.CheckPoint()
    summary = Summary(args.log_dir, args)
    if not os.path.exists(args.model_save_dir):
        os.makedirs(args.model_save_dir)
    if args.do_train:
        print('Combining two models into one dir')
        if not os.path.exists('./tmp'):
            os.makedirs('./tmp')
        # Merge student and teacher weights into one temp dir so a single
        # load call restores both networks.
        args.total_model = tempfile.mkdtemp(dir='./tmp')
        CopyFile(args.student_model, args.total_model)
        CopyFile(args.teacher_model, args.total_model)
        print('Loading model...')
        check_point.load(args.total_model)
        # # check_point.load(args.teacher_model)
        # # check_point.load(args.student_model)
        #
        print('Start training...')
        global_step = 0
        best_dev_acc = 0.0
        for epoch in range(args.num_epochs):
            metric = Metric(desc='finetune',
                            print_steps=args.loss_print_every_n_iter,
                            summary=summary,
                            batch_size=batch_size,
                            keys=['loss'])
            for step in range(epoch_size):
                DistilJob().async_get(metric.metric_cb(step, epoch=epoch))
                global_step += 1
                # if (global_step + 1) % args.model_save_every_n_iter == 0:
                #     if not os.path.exists(args.model_save_dir):
                #         os.makedirs(args.model_save_dir)
                #     snapshot_save_path = os.path.join(
                #         args.model_save_dir, "snapshot_%d" % (global_step + 1)
                #     )
                #     print("Saving model to {}.".format(snapshot_save_path))
                #     check_point.save(snapshot_save_path)
            # if args.pred_distill:
            print('EvalTrainJob...')
            run_eval_job(StudentBertGlueEvalTrainJob, epoch_size, desc='train')
            print('EvalValJob...')
            result = run_eval_job(StudentBertGlueEvalValJob,
                                  num_eval_steps,
                                  desc='eval')
            # Intermediate-layer distillation saves every epoch; prediction
            # distillation saves only on dev-metric improvement.
            if not args.pred_distill:
                save_model = True
            else:
                save_model = False
                if task_name in acc_tasks and result['accuracy'] > best_dev_acc:
                    best_dev_acc = result['accuracy']
                    save_model = True
                # if task_name in corr_tasks and result['corr'] > best_dev_acc:
                #     best_dev_acc = result['corr']
                #     save_model = True
                if task_name in mcc_tasks and result['matthews_corrcoef'] > best_dev_acc:
                    best_dev_acc = result['matthews_corrcoef']
                    save_model = True
            print('Best result:', result)
            if save_model:
                # Replace the previous best model wholesale.
                if os.path.exists(args.model_save_dir):
                    import shutil
                    shutil.rmtree(args.model_save_dir)
                if not os.path.exists(args.model_save_dir):
                    os.makedirs(args.model_save_dir)
                snapshot_save_path = os.path.join(args.model_save_dir)
                print("Saving best model to {}".format(snapshot_save_path))
                check_point.save(snapshot_save_path)
                # Flush the async save before the dir can be touched again.
                flow.sync_default_session()
        if args.save_last_snapshot:
            snapshot_save_path = args.model_save_dir
            if os.path.exists(args.model_save_dir):
                import shutil
                shutil.rmtree(args.model_save_dir)
            print("Saving model to {}".format(snapshot_save_path))
            check_point.save(snapshot_save_path)
            flow.sync_default_session()
        if global_step >= 100:
            # remove tmp total models
            print('Removing the tmp models...')
            import shutil
            shutil.rmtree(args.total_model)
        if args.serve_for_online:
            print('Deleting the teacher params and the optimizer parmas from model_save_dir...')
            remove_teacher_params(args.model_save_dir)
    if args.do_eval:
        print('Loading model...')
        print(args.model_save_dir)
        # Reload from disk only when training did not just populate the session.
        if not args.do_train:
            check_point.load(args.model_save_dir)
        print('Evaluation...')
        run_eval_job(StudentBertGlueEvalValJob, num_eval_steps, desc='eval')
def main():
    """Debug variant of the distillation driver: fresh-initialized checkpoint,
    synchronous training steps with periodic loss prints, best-model saving.
    """
    flow.config.enable_debug_mode(True)
    flow.config.gpu_device_num(args.gpu_num_per_node)
    flow.env.log_dir(args.log_dir)
    InitNodes(args)
    # Initialize variables from scratch rather than loading a checkpoint.
    check_point = flow.train.CheckPoint()
    check_point.init()
    summary = Summary(args.log_dir, args)
    if not os.path.exists(args.model_save_dir):
        os.makedirs(args.model_save_dir)
    if args.do_train:
        print('Start training...')
        global_step = 0
        best_dev_acc = 0.0
        print('epoch_size:', epoch_size)
        print('args.iter_num:', args.iter_num)
        for epoch in range(args.num_epochs):
            metric = Metric(desc='finetune',
                            print_steps=args.loss_print_every_n_iter,
                            summary=summary,
                            batch_size=batch_size,
                            keys=['loss'])
            for step in range(epoch_size):
                # Synchronous .get() so the loss is available for printing;
                # the async Metric-driven path is kept below, disabled.
                loss = DistilJob().get()
                if step % 10 == 0:
                    print('step/epoch_size:{}/{} epoch:{}'.format(
                        step, epoch_size, epoch))
                    print('loss:', loss['loss'].mean())
                # global_step+=1
                # DistilJob().async_get(metric.metric_cb(step, epoch=epoch))
            print('EvalTrainJob...')
            run_eval_job(StudentBertGlueEvalTrainJob, epoch_size, desc='train')
            print('EvalValJob...')
            result = run_eval_job(StudentBertGlueEvalValJob,
                                  num_eval_steps,
                                  desc='eval')
            # Track the best dev metric (accuracy, or Matthews corr for MCC tasks).
            save_model = False
            if task_name in acc_tasks and result['accuracy'] > best_dev_acc:
                best_dev_acc = result['accuracy']
                save_model = True
            # if task_name in corr_tasks and result['corr'] > best_dev_acc:
            #     best_dev_acc = result['corr']
            #     save_model = True
            if task_name in mcc_tasks and result[
                    'matthews_corrcoef'] > best_dev_acc:
                best_dev_acc = result['matthews_corrcoef']
                save_model = True
            print('Best result:', result)
            if save_model:
                if not os.path.exists(args.model_save_dir):
                    os.makedirs(args.model_save_dir)
                snapshot_save_path = os.path.join(args.model_save_dir)
                print("Saving best model to {}".format(snapshot_save_path))
                check_point.save(snapshot_save_path)
        if args.save_last_snapshot:
            snapshot_save_path = args.model_save_dir
            print("Saving model to {}".format(snapshot_save_path))
            check_point.save(snapshot_save_path)
    if args.do_eval:
        print('Loading model...')
        print(args.model_save_dir)
        # Reload from disk only when training did not just run in this session.
        if not args.do_train:
            check_point.load(args.model_save_dir)
        print('Evaluation...')
        run_eval_job(StudentBertGlueEvalValJob, num_eval_steps, desc='eval')