Code Example #1
def main():
    flow.config.gpu_device_num(args.gpu_num_per_node)
    flow.env.log_dir(args.log_dir)

    InitNodes(args)

    snapshot = Snapshot(args.model_save_dir, args.model_load_dir)

    summary = Summary(args.log_dir, args)
    if args.do_train:
        print('| Training Start')
        for epoch in range(args.num_epochs):
            metric = Metric(desc='finetune',
                            print_steps=args.loss_print_every_n_iter,
                            summary=summary,
                            batch_size=batch_size,
                            keys=['loss'])

            for step in range(epoch_size):
                BertGlueFinetuneJob().async_get(
                    metric.metric_cb(step, epoch=epoch))

            run_eval_job(BertGlueEvalTrainJob, epoch_size, desc='train')
            run_eval_job(BertGlueEvalValJob, num_eval_steps, desc='eval')

        if args.save_last_snapshot:
            snapshot.save("last_snapshot")

    if args.do_eval:
        print('| Evaluation Start')
        run_eval_job(BertGlueEvalValJob, num_eval_steps, desc='eval')
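
Note: each example on this page drives training through the same OneFlow callback pattern: JobName().async_get(cb) enqueues the job and invokes cb once its outputs arrive, and metric.metric_cb(step, epoch=epoch) is a closure factory that bakes the current step and epoch into that callback. Below is a minimal, self-contained sketch of that pattern; fake_job and make_metric_cb are hypothetical stand-ins for the project-specific job function and Metric class, and the call here runs synchronously rather than deferred.

def make_metric_cb(step, epoch):
    # Closure factory: capture step/epoch now, consume the job outputs later.
    def callback(outputs):
        print('epoch {} step {}: loss={:.4f}'.format(epoch, step, outputs['loss']))
    return callback

def fake_job(step):
    # Hypothetical stand-in for a OneFlow job; returns its outputs immediately.
    return {'loss': 1.0 / (step + 1)}

for epoch in range(2):
    for step in range(3):
        cb = make_metric_cb(step, epoch)
        cb(fake_job(step))  # async_get would defer this call until the job finishes
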
Code Example #2
def main():
    flow.config.gpu_device_num(args.gpu_num_per_node)
    flow.env.log_dir(args.log_dir)

    InitNodes(args)

    snapshot = Snapshot(args.model_save_dir, args.model_load_dir)

    summary = Summary(args.log_dir, args)
    for epoch in range(args.num_epochs):
        metric = Metric(desc='finetune',
                        print_steps=args.loss_print_every_n_iter,
                        summary=summary,
                        batch_size=batch_size,
                        keys=['loss'])

        for step in range(epoch_size):
            BertGlueFinetuneJob().async_get(metric.metric_cb(step, epoch=epoch))

        run_eval_job(BertGlueEvalTrainJob, epoch_size, desc='train')
        run_eval_job(BertGlueEvalValJob, num_eval_steps, desc='eval')

    if args.save_last_snapshot:
        snapshot.save("last_snapshot")
Code Example #3
def main():
    InitNodes(args)
    flow.env.grpc_use_no_signal()
    flow.env.log_dir(args.log_dir)

    summary = Summary(args.log_dir, args)
    snapshot = Snapshot(args.model_save_dir, args.model_load_dir)

    for epoch in range(args.num_epochs):
        metric = Metric(desc='train',
                        calculate_batches=args.loss_print_every_n_iter,
                        summary=summary,
                        save_summary_steps=epoch_size,
                        batch_size=train_batch_size,
                        loss_key='loss')
        for i in range(epoch_size):
            TrainNet().async_get(metric.metric_cb(epoch, i))

        if args.val_data_dir:
            metric = Metric(desc='validation',
                            calculate_batches=num_val_steps,
                            summary=summary,
                            save_summary_steps=num_val_steps,
                            batch_size=val_batch_size)
            for i in range(num_val_steps):
                InferenceNet().async_get(metric.metric_cb(epoch, i))
        snapshot.save('epoch_{}'.format(epoch))
Code Example #4
File: trainer.py    Project: Ereebay/Federated-Learning
    def train(self):
        # start training
        for rounds in range(cfg.Total.R):
            print(f'==================The {rounds + 1}-th round==================')
            training_client = random.sample(self.train_clients, 4)
            data_sum = 0
            datalist = []
            for client in training_client:
                # Record each client's dataset size once and accumulate the total.
                data = len(client.train_dataloader.dataset)
                data_sum += data
                datalist.append(data)

            for idx, client in enumerate(training_client):
                client.fork(self.server)
                agreement = datalist[idx] / data_sum
                metric = client.compute_gradients(cfg.Total.E, agreement)
                self.recorder.validate(client)
                self.recorder.printer(client)
            self.server.merge(training_client)

            print('Start personalization testing')
            for idx, client in enumerate(self.test_list):
                client.fork(self.server)
                print('Start adaptation process')
                client.adapt(cfg.Total.E)
                print('Start validation')
                self.recorder.validate(client)
                self.recorder.printer(client)

            self.recorder.validate(self.server)
            self.recorder.printer(self.server)

        self.recorder.finish()
        Summary()
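
Note: the agreement value above is the standard FedAvg-style data-proportional weight: each sampled client contributes in proportion to its share of the round's total training data. A minimal sketch of that computation, assuming sizes holds len(client.train_dataloader.dataset) for each sampled client:

def client_weights(sizes):
    # Normalize dataset sizes so the per-client weights sum to 1.0.
    total = sum(sizes)
    return [n / total for n in sizes]

print(client_weights([100, 300, 600]))  # -> [0.1, 0.3, 0.6]
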
Code Example #5
def main():
    flow.config.gpu_device_num(args.gpu_num_per_node)
    flow.env.log_dir(args.log_dir)

    InitNodes(args)

    if args.do_train or args.do_eval:
        snapshot = Snapshot(args.model_save_dir, args.model_load_dir)

    if args.do_train:
        summary = Summary(args.log_dir, args)
        for epoch in range(args.num_epochs):
            metric = Metric(desc='train',
                            print_steps=args.loss_print_every_n_iter,
                            summary=summary,
                            batch_size=batch_size,
                            keys=['total_loss'])

            for step in range(epoch_size):
                SquadFinetuneJob().async_get(
                    metric.metric_cb(step, epoch=epoch))

        if args.save_last_snapshot:
            snapshot.save("last_snapshot")

    if args.do_eval:
        assert os.path.isdir(args.eval_data_dir)
        all_results = []
        for step in range(num_eval_steps):
            unique_ids, start_positions, end_positions = SquadDevJob().get()
            unique_ids = unique_ids.numpy()
            start_positions = start_positions.numpy()
            end_positions = end_positions.numpy()

            for unique_id, start_position, end_position in zip(
                    unique_ids, start_positions, end_positions):
                all_results.append(
                    RawResult(
                        unique_id=int(unique_id[0]),
                        start_logits=start_position.flatten().tolist(),
                        end_logits=end_position.flatten().tolist(),
                    ))

            if step % args.loss_print_every_n_iter == 0:
                print("{}/{}, num of results:{}".format(
                    step, num_eval_steps, len(all_results)))
                print("last uid:", unique_id[0])

        gen_eval_predict_json(args, all_results)
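
Note: RawResult is not defined in this snippet; in the reference BERT SQuAD scripts it is conventionally a namedtuple with the three fields used above. A sketch under that assumption:

import collections

# Assumed definition (not shown in the snippet above): one record per
# evaluated example, matching the keyword arguments used in the loop.
RawResult = collections.namedtuple(
    'RawResult', ['unique_id', 'start_logits', 'end_logits'])

r = RawResult(unique_id=42, start_logits=[0.1, 0.9], end_logits=[0.8, 0.2])
print(r.unique_id, len(r.start_logits))
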
Code Example #6
def main():
    InitNodes(args)
    assert args.model_load_dir, 'Must have model load dir!'

    flow.env.log_dir(args.log_dir)

    modelSize = getdirsize(args.model_load_dir)

    summary = Summary(args.log_dir, args, modelSize)
    print("Restoring model from {}.".format(args.model_load_dir))
    checkpoint = flow.train.CheckPoint()
    checkpoint.load(args.model_load_dir)


    if args.use_int8_online:
        for j in range(10):
            pass  # loop body truncated in the original snippet
Code Example #7
def main():
    InitNodes(args)
    assert args.model_load_dir, 'Must have model load dir!'

    flow.env.log_dir(args.log_dir)
    summary = Summary(args.log_dir, args)
    print("Restoring model from {}.".format(args.model_load_dir))
    checkpoint = flow.train.CheckPoint()
    checkpoint.load(args.model_load_dir)
    metric = Metric(desc='validation',
                    calculate_batches=num_val_steps,
                    summary=summary,
                    save_summary_steps=num_val_steps,
                    batch_size=val_batch_size)

    for i in range(args.num_epochs):
        for j in range(num_val_steps):
            InferenceNet().async_get(metric.metric_cb(0, j))
Code Example #8
def main():
    flow.config.gpu_device_num(args.gpu_num_per_node)
    flow.env.log_dir(args.log_dir)

    InitNodes(args)

    snapshot = Snapshot(args.model_save_dir, args.model_load_dir)

    summary = Summary(args.log_dir, args)
    metric = Metric(desc='train',
                    print_steps=args.loss_print_every_n_iter,
                    summary=summary,
                    batch_size=batch_size,
                    keys=['total_loss', 'mlm_loss', 'nsp_loss'])
    for step in range(args.iter_num):
        PretrainJob().async_get(metric.metric_cb(step))
        if (step + 1) % args.model_save_every_n_iter == 0:
            snapshot.save("snapshot_%d" % (step + 1))

    if args.save_last_snapshot:
        snapshot.save("last_snapshot")
Code Example #9
def main():
    flow.config.gpu_device_num(args.gpu_num_per_node)
    flow.env.log_dir(args.log_dir)

    InitNodes(args)
    if args.do_train:
        snapshot = Snapshot(args.model_save_dir, args.model_load_dir)

        summary = Summary(args.log_dir, args)
        best_dev_acc = 0.0
        best_result = {}
        for epoch in range(args.num_epochs):
            metric = Metric(desc='finetune',
                            print_steps=args.loss_print_every_n_iter,
                            summary=summary,
                            batch_size=batch_size,
                            keys=['loss'])

            for step in range(epoch_size):
                BertGlueFinetuneJob().async_get(
                    metric.metric_cb(step, epoch=epoch))

            run_eval_job(BertGlueEvalTrainJob, epoch_size, desc='train')
            result = run_eval_job(BertGlueEvalValJob,
                                  num_eval_steps,
                                  desc='eval')

            save_model = False
            if task_name in acc_tasks and result['accuracy'] > best_dev_acc:
                best_dev_acc = result['accuracy']
                best_result = result
                save_model = True
                print('Best result:', result)

            if task_name in mcc_tasks and result[
                    'matthews_corrcoef'] > best_dev_acc:
                best_dev_acc = result['matthews_corrcoef']
                best_result = result
                save_model = True
                print('Best result:', result)

            if save_model:
                if not os.path.exists(args.model_save_dir):
                    os.makedirs(args.model_save_dir)
                snapshot.save('best')
                flow.sync_default_session()
        print('Best result:', best_result)
        print("Saving best model to " +
              os.path.join(args.model_save_dir, 'snapshot_best'))

        if args.serve_for_online:
            print('Deleting the optimizer params from model_save_dir...')
            remove_optimizer_params(
                os.path.join(args.model_save_dir, 'snapshot_best'))

    if args.do_eval:
        print('Loading model...')
        print(args.model_save_dir)
        if not args.do_train:
            check_point = flow.train.CheckPoint()
            check_point.load(args.model_save_dir)
        print('Evaluation...')
        run_eval_job(BertGlueEvalValJob, num_eval_steps, desc='eval')
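
Note: the selection logic above keeps a single "best" snapshot by comparing each epoch's dev metric against the running best and saving only on improvement. A self-contained sketch of that pattern, with a plain list of dev accuracies standing in for run_eval_job results:

def select_best(dev_accs):
    # Track the running best; an epoch "saves" only when it improves on it.
    best_epoch, best_acc = -1, 0.0
    for epoch, acc in enumerate(dev_accs):
        if acc > best_acc:
            best_epoch, best_acc = epoch, acc
    return best_epoch, best_acc

print(select_best([0.71, 0.78, 0.75]))  # -> (1, 0.78); only epochs 0 and 1 save
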
Code Example #10
    parser = configs.get_parser()
    args = parser.parse_args()
    configs.print_args(args)

    flow.config.gpu_device_num(args.gpu_num_per_node)
    flow.config.enable_debug_mode(True)

    @flow.global_function(get_val_config(args))
    def IOTest():
        if args.train_data_dir:
            assert os.path.exists(args.train_data_dir)
            print("Loading data from {}".format(args.train_data_dir))
            (labels, images) = load_imagenet_for_training(args)
        else:
            print("Loading synthetic data.")
            (labels, images) = load_synthetic(args)
        outputs = {"images": images, "labels": labels}
        return outputs

    total_device_num = args.num_nodes * args.gpu_num_per_node
    train_batch_size = total_device_num * args.batch_size_per_device
    summary = Summary(args.log_dir, args, filename='io_test.csv')
    metric = Metric(desc='io_test',
                    calculate_batches=args.loss_print_every_n_iter,
                    summary=summary,
                    save_summary_steps=args.loss_print_every_n_iter,
                    batch_size=train_batch_size,
                    prediction_key=None)
    for i in range(1000):
        IOTest().async_get(metric.metric_cb(0, i))
Code Example #11
def main():
    InitNodes(args)
    flow.env.log_dir(args.log_dir)
    modelSize = getdirsize(args.model_load_dir)
    summary = Summary(args.log_dir, args, modelSize)
Code Example #12
def main():
    InitNodes(args)
    flow.env.log_dir(args.log_dir)

    summary = Summary(args.log_dir, args)
Code Example #13
def main():
    flow.config.gpu_device_num(args.gpu_num_per_node)
    flow.env.log_dir(args.log_dir)

    InitNodes(args)

    check_point = flow.train.CheckPoint()

    summary = Summary(args.log_dir, args)
    if not os.path.exists(args.model_save_dir):
        os.makedirs(args.model_save_dir)
    if args.do_train:
        print('Combining two models into one dir')
        if not os.path.exists('./tmp'):
            os.makedirs('./tmp')

        args.total_model = tempfile.mkdtemp(dir='./tmp')
        CopyFile(args.student_model, args.total_model)
        CopyFile(args.teacher_model, args.total_model)
        print('Loading model...')
        check_point.load(args.total_model)
        print('Start training...')
        global_step = 0
        best_dev_acc = 0.0
        for epoch in range(args.num_epochs):
            metric = Metric(desc='finetune',
                            print_steps=args.loss_print_every_n_iter,
                            summary=summary,
                            batch_size=batch_size,
                            keys=['loss'])

            for step in range(epoch_size):
                DistilJob().async_get(metric.metric_cb(step, epoch=epoch))
                global_step += 1

            print('EvalTrainJob...')
            run_eval_job(StudentBertGlueEvalTrainJob, epoch_size, desc='train')
            print('EvalValJob...')
            result = run_eval_job(StudentBertGlueEvalValJob, num_eval_steps, desc='eval')
            if not args.pred_distill:
                save_model = True
            else:
                save_model = False
                if task_name in acc_tasks and result['accuracy'] > best_dev_acc:
                    best_dev_acc = result['accuracy']
                    save_model = True

                if task_name in mcc_tasks and result['matthews_corrcoef'] > best_dev_acc:
                    best_dev_acc = result['matthews_corrcoef']
                    save_model = True
                    print('Best result:', result)

            # Run the save check in both branches: save_model is set whether or
            # not args.pred_distill is enabled, so the block belongs at this level.
            if save_model:
                import shutil
                # Recreate the save dir so it holds only the best model.
                if os.path.exists(args.model_save_dir):
                    shutil.rmtree(args.model_save_dir)
                os.makedirs(args.model_save_dir)
                snapshot_save_path = args.model_save_dir
                print("Saving best model to {}".format(snapshot_save_path))
                check_point.save(snapshot_save_path)
                flow.sync_default_session()

        if args.save_last_snapshot:
            snapshot_save_path = args.model_save_dir
            if os.path.exists(args.model_save_dir):
                import shutil
                shutil.rmtree(args.model_save_dir)
            print("Saving model to {}".format(snapshot_save_path))
            check_point.save(snapshot_save_path)
            flow.sync_default_session()

        if global_step >= 100:
            # remove tmp total models
            print('Removing the tmp models...')
            import shutil
            shutil.rmtree(args.total_model)

        if args.serve_for_online:
            print('Deleting the teacher params and the optimizer params from model_save_dir...')
            remove_teacher_params(args.model_save_dir)

    if args.do_eval:
        print('Loading model...')
        print(args.model_save_dir)

        if not args.do_train:
            check_point.load(args.model_save_dir)
        print('Evaluation...')
        run_eval_job(StudentBertGlueEvalValJob, num_eval_steps, desc='eval')
Code Example #14
def main():
    flow.config.enable_debug_mode(True)
    flow.config.gpu_device_num(args.gpu_num_per_node)
    flow.env.log_dir(args.log_dir)

    InitNodes(args)

    check_point = flow.train.CheckPoint()
    check_point.init()

    summary = Summary(args.log_dir, args)
    if not os.path.exists(args.model_save_dir):
        os.makedirs(args.model_save_dir)

    if args.do_train:
        print('Start training...')
        global_step = 0
        best_dev_acc = 0.0
        print('epoch_size:', epoch_size)
        print('args.iter_num:', args.iter_num)
        for epoch in range(args.num_epochs):
            metric = Metric(desc='finetune',
                            print_steps=args.loss_print_every_n_iter,
                            summary=summary,
                            batch_size=batch_size,
                            keys=['loss'])

            for step in range(epoch_size):
                loss = DistilJob().get()
                if step % 10 == 0:
                    print('step/epoch_size:{}/{}   epoch:{}'.format(
                        step, epoch_size, epoch))
                    print('loss:', loss['loss'].mean())

            print('EvalTrainJob...')
            run_eval_job(StudentBertGlueEvalTrainJob, epoch_size, desc='train')
            print('EvalValJob...')
            result = run_eval_job(StudentBertGlueEvalValJob,
                                  num_eval_steps,
                                  desc='eval')

            save_model = False
            if task_name in acc_tasks and result['accuracy'] > best_dev_acc:
                best_dev_acc = result['accuracy']
                save_model = True

            if task_name in mcc_tasks and result[
                    'matthews_corrcoef'] > best_dev_acc:
                best_dev_acc = result['matthews_corrcoef']
                save_model = True
                print('Best result:', result)

            if save_model:
                if not os.path.exists(args.model_save_dir):
                    os.makedirs(args.model_save_dir)
                snapshot_save_path = args.model_save_dir
                print("Saving best model to {}".format(snapshot_save_path))
                check_point.save(snapshot_save_path)

        if args.save_last_snapshot:
            snapshot_save_path = args.model_save_dir
            print("Saving model to {}".format(snapshot_save_path))
            check_point.save(snapshot_save_path)

    if args.do_eval:
        print('Loading model...')
        print(args.model_save_dir)

        if not args.do_train:
            check_point.load(args.model_save_dir)
        print('Evaluation...')
        run_eval_job(StudentBertGlueEvalValJob, num_eval_steps, desc='eval')