def main():
    InitNodes(args)
    flow.env.grpc_use_no_signal()
    flow.env.log_dir(args.log_dir)

    summary = Summary(args.log_dir, args)
    snapshot = Snapshot(args.model_save_dir, args.model_load_dir)

    for epoch in range(args.num_epochs):
        metric = Metric(desc='train',
                        calculate_batches=args.loss_print_every_n_iter,
                        summary=summary,
                        save_summary_steps=epoch_size,
                        batch_size=train_batch_size,
                        loss_key='loss')
        for i in range(epoch_size):
            TrainNet().async_get(metric.metric_cb(epoch, i))

        if args.val_data_dir:
            metric = Metric(desc='validation',
                            calculate_batches=num_val_steps,
                            summary=summary,
                            save_summary_steps=num_val_steps,
                            batch_size=val_batch_size)
            for i in range(num_val_steps):
                InferenceNet().async_get(metric.metric_cb(epoch, i))
        snapshot.save('epoch_{}'.format(epoch))
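
# `InitNodes`, `Summary`, `Snapshot`, `Metric`, `TrainNet`, `InferenceNet`, `args`, `epoch_size`,
# `train_batch_size`, etc. come from the surrounding benchmark code and are not part of these
# excerpts. As a rough, hypothetical sketch of the callback pattern only (the real Metric also
# handles `summary`, `save_summary_steps`, `keys`, `print_steps`, and the keyword form
# `metric_cb(step, epoch=...)` used by the BERT examples), `metric_cb` returns a closure that
# `async_get` invokes once the job's outputs are ready:
import time

class Metric(object):
    def __init__(self, desc='train', calculate_batches=10, batch_size=32,
                 loss_key='loss', **kwargs):
        self.desc = desc
        self.calculate_batches = calculate_batches
        self.batch_size = batch_size
        self.loss_key = loss_key
        self.losses = []
        self.timer = time.time()

    def metric_cb(self, epoch, step):
        def callback(outputs):
            # Assumes the job returns a dict like {'loss': blob} whose values expose .numpy().
            if self.loss_key is not None:
                self.losses.append(float(outputs[self.loss_key].numpy().mean()))
            if (step + 1) % self.calculate_batches == 0:
                elapsed = time.time() - self.timer
                throughput = self.calculate_batches * self.batch_size / elapsed
                loss = sum(self.losses) / max(len(self.losses), 1)
                print('{}: epoch {}, iter {}, loss {:.4f}, {:.1f} samples/s'.format(
                    self.desc, epoch, step + 1, loss, throughput))
                self.losses = []
                self.timer = time.time()
        return callback
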
def main():
    InitNodes(args)
    flow.env.log_dir(args.log_dir)

    snapshot = Snapshot(args.model_save_dir, args.model_load_dir,
                        args.save_init)

    print(" {} iter per epoch...".format(epoch_size))

    for epoch in range(1, args.num_epochs + 1):
        metric = Metric(
            desc="train",
            calculate_batches=args.loss_print_every_n_iter,
            batch_size=train_batch_size,
            loss_key="loss",
        )
        for i in range(epoch_size):
            TrainNet().async_get(metric.metric_cb(epoch, i))

        if args.val_data_dir:
            metric = Metric(
                desc="validation",
                calculate_batches=num_val_steps,
                batch_size=val_batch_size,
            )
            for i in range(num_val_steps):
                InferenceNet().async_get(metric.metric_cb(epoch, i))
        if epoch % args.save_epoch_interval == 0:
            snapshot.save("epoch_{}".format(epoch))

    if args.save_last:
        snapshot.save("epoch_{}".format("last"))
Example 3
def main():
    flow.config.gpu_device_num(args.gpu_num_per_node)
    flow.env.log_dir(args.log_dir)

    InitNodes(args)

    snapshot = Snapshot(args.model_save_dir, args.model_load_dir)

    summary = Summary(args.log_dir, args)
    if args.do_train:
        print('| Training Start')
        for epoch in range(args.num_epochs):
            metric = Metric(desc='finetune',
                            print_steps=args.loss_print_every_n_iter,
                            summary=summary,
                            batch_size=batch_size,
                            keys=['loss'])

            for step in range(epoch_size):
                BertGlueFinetuneJob().async_get(
                    metric.metric_cb(step, epoch=epoch))

            run_eval_job(BertGlueEvalTrainJob, epoch_size, desc='train')
            run_eval_job(BertGlueEvalValJob, num_eval_steps, desc='eval')

        if args.save_last_snapshot:
            snapshot.save("last_snapshot")

    if args.do_eval:
        print('| Evaluation Start')
        run_eval_job(BertGlueEvalValJob, num_eval_steps, desc='eval')
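
# `run_eval_job` above is provided by the GLUE utilities and returns a dict of metrics (a later
# excerpt reads result['accuracy'] and result['matthews_corrcoef']). A hypothetical sketch,
# assuming the eval job returns (logits, labels) per batch and that scikit-learn is available:
import numpy as np
from sklearn.metrics import accuracy_score, matthews_corrcoef

def run_eval_job(eval_job_func, num_steps, desc='eval'):
    labels, predictions = [], []
    for _ in range(num_steps):
        logits, label = eval_job_func().get()
        predictions.extend(np.argmax(logits.numpy(), axis=1).tolist())
        labels.extend(label.numpy().tolist())
    result = {
        'accuracy': accuracy_score(labels, predictions),
        'matthews_corrcoef': matthews_corrcoef(labels, predictions),
    }
    print('{} ({} steps): {}'.format(desc, num_steps, result))
    return result
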
def main():
    flow.config.gpu_device_num(args.gpu_num_per_node)
    flow.env.log_dir(args.log_dir)

    InitNodes(args)

    snapshot = Snapshot(args.model_save_dir, args.model_load_dir)
    #if args.save_and_break:
    #    print("save model just after init and exit")
    #    snapshot.save("initial_snapshot")
    #    import sys
    #    sys.exit()

    for epoch in range(args.num_epochs):
        metric = Metric(desc='finetune',
                        print_steps=args.loss_print_every_n_iter,
                        batch_size=batch_size,
                        keys=['loss'])

        for step in range(epoch_size):
            BertGlueFinetuneJob().async_get(metric.metric_cb(step,
                                                             epoch=epoch))
            #if 1: #step % args.loss_print_every_n_iter == 0:

        run_eval_job(BertGlueEvalTrainJob, epoch_size, desc='train')
        run_eval_job(BertGlueEvalValJob, num_eval_steps, desc='eval')

    if args.save_last_snapshot:
        snapshot.save("last_snapshot")
Example 5
def main():
    flow.config.gpu_device_num(args.gpu_num_per_node)
    flow.env.log_dir(args.log_dir)

    InitNodes(args)

    if args.do_train or args.do_eval:
        snapshot = Snapshot(args.model_save_dir, args.model_load_dir)

    if args.do_train:
        summary = Summary(args.log_dir, args)
        for epoch in range(args.num_epochs):
            metric = Metric(desc='train',
                            print_steps=args.loss_print_every_n_iter,
                            summary=summary,
                            batch_size=batch_size,
                            keys=['total_loss'])

            for step in range(epoch_size):
                SquadFinetuneJob().async_get(
                    metric.metric_cb(step, epoch=epoch))

        if args.save_last_snapshot:
            snapshot.save("last_snapshot")

    if args.do_eval:
        assert os.path.isdir(args.eval_data_dir)
        all_results = []
        for step in range(num_eval_steps):
            unique_ids, start_positions, end_positions = SquadDevJob().get()
            unique_ids = unique_ids.numpy()
            start_positions = start_positions.numpy()
            end_positions = end_positions.numpy()

            for unique_id, start_position, end_position in zip(
                    unique_ids, start_positions, end_positions):
                all_results.append(
                    RawResult(
                        unique_id=int(unique_id[0]),
                        start_logits=start_position.flatten().tolist(),
                        end_logits=end_position.flatten().tolist(),
                    ))

            if step % args.loss_print_every_n_iter == 0:
                print("{}/{}, num of results:{}".format(
                    step, num_eval_steps, len(all_results)))
                print("last uid:", unique_id[0])

        gen_eval_predict_json(args, all_results)
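
# `RawResult` and `gen_eval_predict_json` come from the SQuAD post-processing utilities. For
# reference, `RawResult` is conventionally the namedtuple used in BERT SQuAD evaluation; this
# definition is an assumption here, but it matches how the results are constructed above:
import collections

RawResult = collections.namedtuple("RawResult", ["unique_id", "start_logits", "end_logits"])
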
Example 6
def main():
    InitNodes(args)
    assert args.model_load_dir, "Must have model load dir!"

    flow.env.log_dir(args.log_dir)
    # snapshot = Snapshot(args.model_save_dir, args.model_load_dir)
    print("Restoring model from {}.".format(args.model_load_dir))
    flow.load_variables(flow.checkpoint.get(args.model_load_dir))
    metric = Metric(desc="validation",
                    calculate_batches=num_val_steps,
                    batch_size=val_batch_size)

    for i in range(args.num_epochs):
        for j in range(num_val_steps):
            InferenceNet().async_get(metric.metric_cb(0, j))
Example 7
def main():
    InitNodes(args)
    assert args.model_load_dir, 'Must have model load dir!'

    flow.env.log_dir(args.log_dir)
    summary = Summary(args.log_dir, args)
    # snapshot = Snapshot(args.model_save_dir, args.model_load_dir)
    print("Restoring model from {}.".format(args.model_load_dir))
    checkpoint = flow.train.CheckPoint()
    checkpoint.load(args.model_load_dir)
    metric = Metric(desc='validation',
                    calculate_batches=num_val_steps,
                    summary=summary,
                    save_summary_steps=num_val_steps,
                    batch_size=val_batch_size)

    for i in range(args.num_epochs):
        for j in range(num_val_steps):
            InferenceNet().async_get(metric.metric_cb(0, j))
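
# Examples 6 and 7 restore weights through two different interfaces, both shown in these excerpts:
# `flow.train.CheckPoint().load(...)` is the older one, `flow.load_variables(flow.checkpoint.get(...))`
# the newer one. A small helper (hypothetical, not part of the original scripts) that prefers the
# newer API and falls back to the older one:
import oneflow as flow

def restore_model(model_load_dir):
    if hasattr(flow, "checkpoint") and hasattr(flow.checkpoint, "get"):
        flow.load_variables(flow.checkpoint.get(model_load_dir))  # newer style
    else:
        check_point = flow.train.CheckPoint()  # older style
        check_point.load(model_load_dir)
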
Example 8
def main():
    flow.config.gpu_device_num(args.gpu_num_per_node)
    flow.env.log_dir(args.log_dir)

    InitNodes(args)

    snapshot = Snapshot(args.model_save_dir, args.model_load_dir)

    metric = Metric(desc='train',
                    print_steps=args.loss_print_every_n_iter,
                    batch_size=batch_size,
                    keys=['total_loss', 'mlm_loss', 'nsp_loss'])
    for step in range(args.iter_num):
        PretrainJob().async_get(metric.metric_cb(step))
        #PretrainJob().async_get(metric.metric_cb(step, epoch=3))
        if (step + 1) % args.model_save_every_n_iter == 0:
            snapshot.save("snapshot_%d" % (step + 1))

    if args.save_last_snapshot:
        snapshot.save("last_snapshot")
def main():
    flow.config.gpu_device_num(args.gpu_num_per_node)
    flow.env.log_dir(args.log_dir)

    InitNodes(args)

    snapshot = Snapshot(args.model_save_dir, args.model_load_dir, args.model_save_init)

    print("num_accumulation_steps:", args.num_accumulation_steps)
    metric = Metric(
        desc="train",
        print_steps=args.loss_print_every_n_iter,
        batch_size=batch_size * args.num_accumulation_steps,
        keys=["total_loss", "mlm_loss", "nsp_loss"],
    )

    for step in range(args.iter_num):
        PretrainJob().async_get(metric.metric_cb(step))
        # PretrainJob().async_get(metric.metric_cb(step, epoch=3))
        if (step + 1) % args.model_save_every_n_iter == 0:
            snapshot.save("snapshot_%d" % (step + 1))

    if args.save_last_snapshot:
        snapshot.save("last_snapshot")
def main():
    flow.config.gpu_device_num(args.gpu_num_per_node)
    flow.env.log_dir(args.log_dir)

    InitNodes(args)
    if args.do_train:
        snapshot = Snapshot(args.model_save_dir, args.model_load_dir)

        summary = Summary(args.log_dir, args)
        best_dev_acc = 0.0
        best_result = {}
        for epoch in range(args.num_epochs):
            metric = Metric(desc='finetune',
                            print_steps=args.loss_print_every_n_iter,
                            summary=summary,
                            batch_size=batch_size,
                            keys=['loss'])

            for step in range(epoch_size):
                BertGlueFinetuneJob().async_get(
                    metric.metric_cb(step, epoch=epoch))
                #if 1: #step % args.loss_print_every_n_iter == 0:

            run_eval_job(BertGlueEvalTrainJob, epoch_size, desc='train')
            result = run_eval_job(BertGlueEvalValJob,
                                  num_eval_steps,
                                  desc='eval')

            save_model = False
            if task_name in acc_tasks and result['accuracy'] > best_dev_acc:
                best_dev_acc = result['accuracy']
                best_result = result
                save_model = True
                print('Best result:', result)

            # if task_name in corr_tasks and result['corr'] > best_dev_acc:
            #     best_dev_acc = result['corr']
            #     best_result = result
            #     save_model = True
            #print('Best result:', result)

            if task_name in mcc_tasks and result[
                    'matthews_corrcoef'] > best_dev_acc:
                best_dev_acc = result['matthews_corrcoef']
                best_result = result
                save_model = True
                print('Best result:', result)

            if save_model:
                if not os.path.exists(args.model_save_dir):
                    os.makedirs(args.model_save_dir)
                # snapshot_save_path = os.path.join(args.model_save_dir)
                # print("Saving best model to {}".format(snapshot_save_path))
                snapshot.save('best')
                flow.sync_default_session()
        print('Best result:', best_result)
        print("Saving best model to " +
              os.path.join(args.model_save_dir, 'snapshot_best'))

        if args.serve_for_online:
            print('Deleting the optimizer params from model_save_dir...')
            remove_optimizer_params(
                os.path.join(args.model_save_dir, 'snapshot_best'))

        # if args.save_last_snapshot:
        #     snapshot.save("last_snapshot")
    if args.do_eval:
        print('Loading model...')
        print(args.model_save_dir)
        if not args.do_train:
            check_point = flow.train.CheckPoint()
            check_point.load(args.model_save_dir)
        print('Evaluation...')
        run_eval_job(BertGlueEvalValJob, num_eval_steps, desc='eval')
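
# `remove_optimizer_params` (used above before serving) is defined elsewhere. A hypothetical
# sketch that strips optimizer state from a saved snapshot so only inference weights remain; the
# '-v'/'-m'/'-momentum' suffixes are an assumption about how slot variables are named and should
# be adjusted to whatever the snapshot actually contains:
import os
import shutil

def remove_optimizer_params(snapshot_dir, slot_suffixes=("-v", "-m", "-momentum")):
    for name in os.listdir(snapshot_dir):
        path = os.path.join(snapshot_dir, name)
        if os.path.isdir(path) and name.endswith(slot_suffixes):
            print("Removing optimizer param: {}".format(path))
            shutil.rmtree(path)
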
Example 11
def main():
    parser = configs.get_parser()
    args = parser.parse_args()
    configs.print_args(args)

    flow.config.gpu_device_num(args.gpu_num_per_node)
    flow.config.enable_debug_mode(True)

    @flow.global_function(get_val_config(args))
    def IOTest():
        if args.train_data_dir:
            assert os.path.exists(args.train_data_dir)
            print("Loading data from {}".format(args.train_data_dir))
            (labels, images) = load_imagenet_for_training(args)
        else:
            print("Loading synthetic data.")
            (labels, images) = load_synthetic(args)
        outputs = {"images": images, "labels": labels}
        return outputs

    total_device_num = args.num_nodes * args.gpu_num_per_node
    train_batch_size = total_device_num * args.batch_size_per_device
    summary = Summary(args.log_dir, args, filename='io_test.csv')
    metric = Metric(desc='io_test',
                    calculate_batches=args.loss_print_every_n_iter,
                    summary=summary,
                    save_summary_steps=args.loss_print_every_n_iter,
                    batch_size=train_batch_size,
                    prediction_key=None)
    for i in range(1000):
        IOTest().async_get(metric.metric_cb(0, i))
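
# `get_val_config` above comes from the benchmark configs; a hypothetical minimal version built on
# `flow.function_config()` (the real one may set more options such as data types or placement):
import oneflow as flow

def get_val_config(args):
    config = flow.function_config()
    config.default_data_type(flow.float)
    return config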

def main():
    InitNodes(args)
    flow.env.log_dir(args.log_dir)
    modelSize = getdirsize(args.model_load_dir)
    summary = Summary(args.log_dir, args, modelSize)
    snapshot = Snapshot(args.model_save_dir, args.model_load_dir)

    for epoch in range(args.num_epochs):
        metric = Metric(desc='train', calculate_batches=args.loss_print_every_n_iter,
                        summary=summary, save_summary_steps=epoch_size,
                        batch_size=train_batch_size, loss_key='loss')
        for i in range(epoch_size):
            TrainNet().async_get(metric.metric_cb(epoch, i))
        # flow.tensorrt.write_int8_calibration("./int8_calibration") # mkdir int8_calibration
        if args.val_data_dir:
            metric = Metric(desc='validation', calculate_batches=num_val_steps, summary=summary,
                            save_summary_steps=num_val_steps, batch_size=val_batch_size)
            for i in range(num_val_steps):
                # For TensorRT int8 calibration, the first few iterations can instead use
                # InferenceNet().get() followed by flow.tensorrt.cache_int8_calibration(),
                # as the last example in this listing does.
                InferenceNet().async_get(metric.metric_cb(epoch, i))

def main():
    flow.config.gpu_device_num(args.gpu_num_per_node)
    flow.env.log_dir(args.log_dir)

    InitNodes(args)

    check_point = flow.train.CheckPoint()

    summary = Summary(args.log_dir, args)
    if not os.path.exists(args.model_save_dir):
        os.makedirs(args.model_save_dir)
    if args.do_train:
        print('Combining two models into one dir')
        if not os.path.exists('./tmp'):
            os.makedirs('./tmp')

        args.total_model = tempfile.mkdtemp(dir='./tmp')
        CopyFile(args.student_model, args.total_model)
        CopyFile(args.teacher_model, args.total_model)
        print('Loading model...')
        check_point.load(args.total_model)
        #     # check_point.load(args.teacher_model)
        #     # check_point.load(args.student_model)
        #
        print('Start training...')
        global_step = 0
        best_dev_acc = 0.0
        for epoch in range(args.num_epochs):
            metric = Metric(desc='finetune', print_steps=args.loss_print_every_n_iter, summary=summary,
                            batch_size=batch_size, keys=['loss'])

            for step in range(epoch_size):
                DistilJob().async_get(metric.metric_cb(step, epoch=epoch))
                global_step += 1
                # if (global_step + 1) % args.model_save_every_n_iter == 0:
                #     if not os.path.exists(args.model_save_dir):
                #         os.makedirs(args.model_save_dir)
                #     snapshot_save_path = os.path.join(
                #         args.model_save_dir, "snapshot_%d" % (global_step + 1)
                #     )
                #     print("Saving model to {}.".format(snapshot_save_path))
                #     check_point.save(snapshot_save_path)

            # if args.pred_distill:
            print('EvalTrainJob...')
            run_eval_job(StudentBertGlueEvalTrainJob, epoch_size, desc='train')
            print('EvalValJob...')
            result = run_eval_job(StudentBertGlueEvalValJob, num_eval_steps, desc='eval')
            if not args.pred_distill:
                save_model = True
            else:
                save_model = False
                if task_name in acc_tasks and result['accuracy'] > best_dev_acc:
                    best_dev_acc = result['accuracy']
                    save_model = True

                # if task_name in corr_tasks and result['corr'] > best_dev_acc:
                #     best_dev_acc = result['corr']
                #     save_model = True

                if task_name in mcc_tasks and result['matthews_corrcoef'] > best_dev_acc:
                    best_dev_acc = result['matthews_corrcoef']
                    save_model = True
                    print('Best result:', result)

                if save_model:
                    if os.path.exists(args.model_save_dir):
                        import shutil
                        shutil.rmtree(args.model_save_dir)
                    if not os.path.exists(args.model_save_dir):
                        os.makedirs(args.model_save_dir)
                    snapshot_save_path = os.path.join(args.model_save_dir)
                    print("Saving best model to {}".format(snapshot_save_path))
                    check_point.save(snapshot_save_path)
                    flow.sync_default_session()

        if args.save_last_snapshot:
            snapshot_save_path = args.model_save_dir
            if os.path.exists(args.model_save_dir):
                import shutil
                shutil.rmtree(args.model_save_dir)
            print("Saving model to {}".format(snapshot_save_path))
            check_point.save(snapshot_save_path)
            flow.sync_default_session()

        if global_step >= 100:
            # remove tmp total models
            print('Removing the tmp models...')
            import shutil
            shutil.rmtree(args.total_model)

        if args.serve_for_online:
            print('Deleting the teacher params and the optimizer params from model_save_dir...')
            remove_teacher_params(args.model_save_dir)

    if args.do_eval:
        print('Loading model...')
        print(args.model_save_dir)

        if not args.do_train:
            check_point.load(args.model_save_dir)
        print('Evaluation...')
        run_eval_job(StudentBertGlueEvalValJob, num_eval_steps, desc='eval')
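
# `CopyFile` above merges the teacher and the student snapshot directories into one temporary
# directory so a single `check_point.load()` restores both sets of variables. A hypothetical
# sketch, assuming each variable lives in its own subdirectory of the snapshot (as in OneFlow
# lazy-mode snapshots); the real helper may differ:
import os
import shutil

def CopyFile(src_dir, dst_dir):
    for name in os.listdir(src_dir):
        src = os.path.join(src_dir, name)
        dst = os.path.join(dst_dir, name)
        if os.path.isdir(src):
            shutil.copytree(src, dst)  # copy each variable directory wholesale
        else:
            shutil.copy2(src, dst)
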
def main():
    InitNodes(args)
    flow.env.log_dir(args.log_dir)
    modelSize = getdirsize(args.model_load_dir)
    summary = Summary(args.log_dir, args, modelSize)
    # snapshot = Snapshot(args.model_save_dir, args.model_load_dir)
    print("Restoring model from {}.".format(args.model_load_dir))
    checkpoint = flow.train.CheckPoint()
    checkpoint.load(args.model_load_dir)


    if args.use_int8_online:
        for j in range(10):
            InferenceNet().get()
            flow.tensorrt.cache_int8_calibration()
            flow.tensorrt.write_int8_calibration("./int8_calibration")

    warmup = 2
    for j in range(warmup):
        InferenceNet().get()
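    # Int8 flow used above: a few batches are run with InferenceNet().get() so TensorRT can collect
    # calibration data, flow.tensorrt.cache_int8_calibration() caches it, and
    # flow.tensorrt.write_int8_calibration("./int8_calibration") persists it (the training example
    # above notes the directory must exist first). The warmup iterations here presumably keep
    # engine build time out of the timed validation loop below.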

    metric = Metric(desc='validation', calculate_batches=num_val_steps, summary=summary,
                    save_summary_steps=num_val_steps, batch_size=val_batch_size)
    for i in range(args.num_epochs):
        for j in range(num_val_steps):
            InferenceNet().async_get(metric.metric_cb(0, j))


if __name__ == "__main__":
    main()