def main():
    flow.config.gpu_device_num(args.gpu_num_per_node)
    flow.env.log_dir(args.log_dir)

    InitNodes(args)

    snapshot = Snapshot(args.model_save_dir, args.model_load_dir)
    #if args.save_and_break:
    #    print("save model just after init and exit")
    #    snapshot.save("initial_snapshot")
    #    import sys
    #    sys.exit()

    for epoch in range(args.num_epochs):
        metric = Metric(desc='finetune',
                        print_steps=args.loss_print_every_n_iter,
                        batch_size=batch_size,
                        keys=['loss'])
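        # Metric aggregates the 'loss' value reported by each step's callback and
        # presumably prints a running average every loss_print_every_n_iter steps.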

        for step in range(epoch_size):
            BertGlueFinetuneJob().async_get(metric.metric_cb(step,
                                                             epoch=epoch))
            #if 1: #step % args.loss_print_every_n_iter == 0:

        run_eval_job(BertGlueEvalTrainJob, epoch_size, desc='train')
        run_eval_job(BertGlueEvalValJob, num_eval_steps, desc='eval')

    if args.save_last_snapshot:
        snapshot.save("last_snapshot")
Example #2
def main():
    flow.config.gpu_device_num(args.gpu_num_per_node)
    flow.env.log_dir(args.log_dir)

    InitNodes(args)

    snapshot = Snapshot(args.model_save_dir, args.model_load_dir)

    summary = Summary(args.log_dir, args)
    if args.do_train:
        print('| Training Start')
        for epoch in range(args.num_epochs):
            metric = Metric(desc='finetune',
                            print_steps=args.loss_print_every_n_iter,
                            summary=summary,
                            batch_size=batch_size,
                            keys=['loss'])

            for step in range(epoch_size):
                BertGlueFinetuneJob().async_get(
                    metric.metric_cb(step, epoch=epoch))

            run_eval_job(BertGlueEvalTrainJob, epoch_size, desc='train')
            run_eval_job(BertGlueEvalValJob, num_eval_steps, desc='eval')

        if args.save_last_snapshot:
            snapshot.save("last_snapshot")

    if args.do_eval:
        print('| Evaluation Start')
        run_eval_job(BertGlueEvalValJob, num_eval_steps, desc='eval')
def main():
    flow.config.gpu_device_num(args.gpu_num_per_node)
    flow.env.log_dir(args.log_dir)

    InitNodes(args)

    if args.do_train or args.do_eval:
        snapshot = Snapshot(args.model_save_dir, args.model_load_dir)

    if args.do_train:
        summary = Summary(args.log_dir, args)
        for epoch in range(args.num_epochs):
            metric = Metric(desc='train',
                            print_steps=args.loss_print_every_n_iter,
                            summary=summary,
                            batch_size=batch_size,
                            keys=['total_loss'])

            for step in range(epoch_size):
                SquadFinetuneJob().async_get(
                    metric.metric_cb(step, epoch=epoch))

        if args.save_last_snapshot:
            snapshot.save("last_snapshot")

    if args.do_eval:
        assert os.path.isdir(args.eval_data_dir)
        all_results = []
        for step in range(num_eval_steps):
            unique_ids, start_positions, end_positions = SquadDevJob().get()
            unique_ids = unique_ids.numpy()
            start_positions = start_positions.numpy()
            end_positions = end_positions.numpy()

            for unique_id, start_position, end_position in zip(
                    unique_ids, start_positions, end_positions):
                all_results.append(
                    RawResult(
                        unique_id=int(unique_id[0]),
                        start_logits=start_position.flatten().tolist(),
                        end_logits=end_position.flatten().tolist(),
                    ))
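            # Each RawResult keeps one example's start/end logits; gen_eval_predict_json
            # below assembles them into the SQuAD prediction file.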

            if step % args.loss_print_every_n_iter == 0:
                print("{}/{}, num of results:{}".format(
                    step, num_eval_steps, len(all_results)))
                print("last uid:", unique_id[0])

        gen_eval_predict_json(args, all_results)
Example #4
def main():
    InitNodes(args)
    assert args.model_load_dir, "Must have model load dir!"

    flow.env.log_dir(args.log_dir)
    # snapshot = Snapshot(args.model_save_dir, args.model_load_dir)
    print("Restoring model from {}.".format(args.model_load_dir))
    flow.load_variables(flow.checkpoint.get(args.model_load_dir))
    metric = Metric(desc="validation",
                    calculate_batches=num_val_steps,
                    batch_size=val_batch_size)

    for i in range(args.num_epochs):
        for j in range(num_val_steps):
            InferenceNet().async_get(metric.metric_cb(0, j))
    def main(self):
        processed_list, alphabet, _, emb_dim = pkl.load(
            open(self.config['res_path'].format(self.config['dataset']), 'rb'))
        if isinstance(processed_list, dict):
            processed_list = [processed_list]
        scores = []
        for data_list in processed_list:
            train_data = MyDatasetLoader(self.config, data_list,
                                         'train').get_data()
            valid_data = MyDatasetLoader(self.config, data_list,
                                         'valid').get_data()
            test_data = MyDatasetLoader(self.config, data_list,
                                        'test').get_data()

            self.model = TextCNN(self.config, alphabet, emb_dim,
                                 self.device).to(self.device)
            for w in self.model.parameters():
                print(w.shape, w.requires_grad)
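            # Pass only parameters with requires_grad=True to Adam; frozen weights
            # (if any) are excluded from optimization.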
            self.optimizer = Adam(filter(lambda x: x.requires_grad,
                                         self.model.parameters()),
                                  lr=self.config['lr'],
                                  weight_decay=float(self.config['l2']),
                                  eps=float(self.config['esp']))
            self.metircs = Metric()
            score = self.forward(train_data, valid_data, test_data)
            scores.append(score)
        print('| valid best | global best |')
        print('| --- | --- |')
        for w in scores:
            print("| {:.4f} | {:.4f} |".format(w[0], w[1]))
        if len(scores) > 1:
            print('| valid Avg | global Avg |')
            print("| {:.4f} | {:.4f} |".format(
                np.mean([w[0] for w in scores]),
                np.mean([w[1] for w in scores])))
def train(model, loader, criterion, optimizer, epoch, device, opt):

    model.train()

    train_loss = 0.0
    losses = AverageMeter()
    metric = Metric(opt.num_classes)
    for i, (imgs, spatial_locations, word_vectors, targets_predicates,
            targets_confidences) in enumerate(loader):
        # compute outputs
        imgs = imgs.to(device)
        spatial_locations = spatial_locations.to(device)
        word_vectors = word_vectors.to(device)
        targets_confidences = targets_confidences.to(device)
        targets_predicates = targets_predicates.to(device)
        confidences, predicates = model(imgs, spatial_locations, word_vectors)

        # compute loss
        loss1 = criterion(confidences, targets_confidences)
        loss2 = criterion(predicates, targets_predicates)
        tot_loss = loss1 + loss2
        train_loss += tot_loss.item()

        losses.update(tot_loss.item(), imgs.size(0))
        predicates = torch.sigmoid(predicates)
        metric.update(predicates, targets_predicates)
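        # Accumulate the sigmoid-activated predictions so recall can be computed
        # over the full epoch in compute_metrics() below.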

        optimizer.zero_grad()
        tot_loss.backward()
        optimizer.step()

        # show information
        if (i + 1) % opt.log_interval == 0:
            avg_loss = train_loss / opt.log_interval
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, losses.count, len(loader.dataset),
                100. * (i + 1) / len(loader), avg_loss))
            train_loss = 0.0

    # show information
    recall = metric.compute_metrics()
    print('Train set ({:d} samples): Average loss: {:.4f}\tRecall: {:.4f}'.
          format(losses.count, losses.avg, recall))

    return losses.avg, recall
Example #7
def main():
    InitNodes(args)
    assert args.model_load_dir, 'Must have model load dir!'

    flow.env.log_dir(args.log_dir)
    summary = Summary(args.log_dir, args)
    # snapshot = Snapshot(args.model_save_dir, args.model_load_dir)
    print("Restoring model from {}.".format(args.model_load_dir))
    checkpoint = flow.train.CheckPoint()
    checkpoint.load(args.model_load_dir)
    metric = Metric(desc='validation',
                    calculate_batches=num_val_steps,
                    summary=summary,
                    save_summary_steps=num_val_steps,
                    batch_size=val_batch_size)

    for i in range(args.num_epochs):
        for j in range(num_val_steps):
            InferenceNet().async_get(metric.metric_cb(0, j))
Example #8
def main():
    flow.config.gpu_device_num(args.gpu_num_per_node)
    flow.env.log_dir(args.log_dir)

    InitNodes(args)

    snapshot = Snapshot(args.model_save_dir, args.model_load_dir)

    metric = Metric(desc='train',
                    print_steps=args.loss_print_every_n_iter,
                    batch_size=batch_size,
                    keys=['total_loss', 'mlm_loss', 'nsp_loss'])
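    # PretrainJob is expected to report the three losses listed in `keys`; Metric
    # matches them by name in each callback.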
    for step in range(args.iter_num):
        PretrainJob().async_get(metric.metric_cb(step))
        #PretrainJob().async_get(metric.metric_cb(step, epoch=3))
        if (step + 1) % args.model_save_every_n_iter == 0:
            snapshot.save("snapshot_%d" % (step + 1))

    if args.save_last_snapshot:
        snapshot.save("last_snapshot")
def main():
    InitNodes(args)
    flow.env.log_dir(args.log_dir)

    snapshot = Snapshot(args.model_save_dir, args.model_load_dir,
                        args.save_init)

    print(" {} iter per epoch...".format(epoch_size))

    for epoch in range(1, args.num_epochs + 1):
        metric = Metric(
            desc="train",
            calculate_batches=args.loss_print_every_n_iter,
            batch_size=train_batch_size,
            loss_key="loss",
        )
        for i in range(epoch_size):
            TrainNet().async_get(metric.metric_cb(epoch, i))

        if args.val_data_dir:
            metric = Metric(
                desc="validation",
                calculate_batches=num_val_steps,
                batch_size=val_batch_size,
            )
            for i in range(num_val_steps):
                InferenceNet().async_get(metric.metric_cb(epoch, i))
        if epoch % args.save_epoch_interval == 0:
            snapshot.save("epoch_{}".format(epoch))

    if args.save_last:
        snapshot.save("epoch_{}".format("last"))
def main():
    InitNodes(args)
    flow.env.grpc_use_no_signal()
    flow.env.log_dir(args.log_dir)

    summary = Summary(args.log_dir, args)
    snapshot = Snapshot(args.model_save_dir, args.model_load_dir)

    for epoch in range(args.num_epochs):
        metric = Metric(desc='train',
                        calculate_batches=args.loss_print_every_n_iter,
                        summary=summary,
                        save_summary_steps=epoch_size,
                        batch_size=train_batch_size,
                        loss_key='loss')
        for i in range(epoch_size):
            TrainNet().async_get(metric.metric_cb(epoch, i))

        if args.val_data_dir:
            metric = Metric(desc='validation',
                            calculate_batches=num_val_steps,
                            summary=summary,
                            save_summary_steps=num_val_steps,
                            batch_size=val_batch_size)
            for i in range(num_val_steps):
                InferenceNet().async_get(metric.metric_cb(epoch, i))
        snapshot.save('epoch_{}'.format(epoch))
def validate(model, loader, criterion, epoch, device, opt):

    model.eval()

    losses = AverageMeter()
    metric = Metric(opt.num_classes)
    with torch.no_grad():
        for i, (imgs, spatial_locations, word_vectors, targets_predicates, targets_confidences) in enumerate(loader):
            # compute outputs
            imgs = imgs.to(device)
            spatial_locations = spatial_locations.to(device)
            word_vectors = word_vectors.to(device)
            targets_confidences = targets_confidences.to(device)
            targets_predicates = targets_predicates.to(device)
            confidences, predicates = model(imgs, spatial_locations, word_vectors)

            # compute loss
            loss = criterion(predicates, targets_predicates)

            metric.update(predicates, targets_predicates)
            losses.update(loss.item(), imgs.size(0))


    # show information
    recall = metric.compute_metrics()
    print('Validation set ({:d} samples): Average loss: {:.4f}\tRecall: {:.4f}'.format(losses.count, losses.avg, recall))
    return losses.avg, recall
def main():
    flow.config.gpu_device_num(args.gpu_num_per_node)
    flow.env.log_dir(args.log_dir)

    InitNodes(args)

    snapshot = Snapshot(args.model_save_dir, args.model_load_dir, args.model_save_init)

    print("num_accumulation_steps:", args.num_accumulation_steps)
    metric = Metric(
        desc="train",
        print_steps=args.loss_print_every_n_iter,
        batch_size=batch_size * args.num_accumulation_steps,
        keys=["total_loss", "mlm_loss", "nsp_loss"],
    )

    for step in range(args.iter_num):
        PretrainJob().async_get(metric.metric_cb(step))
        # PretrainJob().async_get(metric.metric_cb(step, epoch=3))
        if (step + 1) % args.model_save_every_n_iter == 0:
            snapshot.save("snapshot_%d" % (step + 1))

    if args.save_last_snapshot:
        snapshot.save("last_snapshot")
def main():
    flow.config.gpu_device_num(args.gpu_num_per_node)
    flow.env.log_dir(args.log_dir)

    InitNodes(args)
    if args.do_train:
        snapshot = Snapshot(args.model_save_dir, args.model_load_dir)

        summary = Summary(args.log_dir, args)
        best_dev_acc = 0.0
        best_result = {}
        for epoch in range(args.num_epochs):
            metric = Metric(desc='finetune',
                            print_steps=args.loss_print_every_n_iter,
                            summary=summary,
                            batch_size=batch_size,
                            keys=['loss'])

            for step in range(epoch_size):
                BertGlueFinetuneJob().async_get(
                    metric.metric_cb(step, epoch=epoch))
                #if 1: #step % args.loss_print_every_n_iter == 0:

            run_eval_job(BertGlueEvalTrainJob, epoch_size, desc='train')
            result = run_eval_job(BertGlueEvalValJob,
                                  num_eval_steps,
                                  desc='eval')

            save_model = False
            if task_name in acc_tasks and result['accuracy'] > best_dev_acc:
                best_dev_acc = result['accuracy']
                best_result = result
                save_model = True
                print('Best result:', result)

            # if task_name in corr_tasks and result['corr'] > best_dev_acc:
            #     best_dev_acc = result['corr']
            #     best_result = result
            #     save_model = True
            #print('Best result:', result)

            if task_name in mcc_tasks and result[
                    'matthews_corrcoef'] > best_dev_acc:
                best_dev_acc = result['matthews_corrcoef']
                best_result = result
                save_model = True
                print('Best result:', result)

            if save_model:
                if not os.path.exists(args.model_save_dir):
                    os.makedirs(args.model_save_dir)
                # snapshot_save_path = os.path.join(args.model_save_dir)
                # print("Saving best model to {}".format(snapshot_save_path))
                snapshot.save('best')
                flow.sync_default_session()
        print('Best result:', best_result)
        print("Saving best model to " +
              os.path.join(args.model_save_dir, 'snapshot_best'))

        if args.serve_for_online:
            print('Deleting the optimizer params from model_save_dir...')
            remove_optimizer_params(
                os.path.join(args.model_save_dir, 'snapshot_best'))

        # if args.save_last_snapshot:
        #     snapshot.save("last_snapshot")
    if args.do_eval:
        print('Loading model...')
        print(args.model_save_dir)
        if not args.do_train:
            check_point = flow.train.CheckPoint()
            check_point.load(args.model_save_dir)
        print('Evaluation...')
        run_eval_job(BertGlueEvalValJob, num_eval_steps, desc='eval')
    parser = configs.get_parser()
    args = parser.parse_args()
    configs.print_args(args)

    flow.config.gpu_device_num(args.gpu_num_per_node)
    flow.config.enable_debug_mode(True)

    @flow.global_function(get_val_config(args))
    def IOTest():
        if args.train_data_dir:
            assert os.path.exists(args.train_data_dir)
            print("Loading data from {}".format(args.train_data_dir))
            (labels, images) = load_imagenet_for_training(args)
        else:
            print("Loading synthetic data.")
            (labels, images) = load_synthetic(args)
        outputs = {"images": images, "labels": labels}
        return outputs

    total_device_num = args.num_nodes * args.gpu_num_per_node
    train_batch_size = total_device_num * args.batch_size_per_device
    summary = Summary(args.log_dir, args, filename='io_test.csv')
    metric = Metric(desc='io_test',
                    calculate_batches=args.loss_print_every_n_iter,
                    summary=summary,
                    save_summary_steps=args.loss_print_every_n_iter,
                    batch_size=train_batch_size,
                    prediction_key=None)
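    # With prediction_key=None the metric presumably tracks only throughput
    # (samples/sec), making this a pure data-loading benchmark.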
    for i in range(1000):
        IOTest().async_get(metric.metric_cb(0, i))
Example #15
    from job_function_util import get_val_config

    parser = configs.get_parser()
    args = parser.parse_args()
    configs.print_args(args)

    flow.config.gpu_device_num(args.gpu_num_per_node)
    # flow.config.enable_debug_mode(True)
    @flow.global_function(get_val_config(args))
    def IOTest():
        if args.train_data_dir:
            assert os.path.exists(args.train_data_dir)
            print("Loading data from {}".format(args.train_data_dir))
            (labels, images) = load_imagenet_for_training(args)
        else:
            print("Loading synthetic data.")
            (labels, images) = load_synthetic(args)
        outputs = {"images": images, "labels": labels}
        return outputs

    total_device_num = args.num_nodes * args.gpu_num_per_node
    train_batch_size = total_device_num * args.batch_size_per_device
    metric = Metric(
        desc="io_test",
        calculate_batches=args.loss_print_every_n_iter,
        batch_size=train_batch_size,
        prediction_key=None,
    )
    for i in range(1000):
        IOTest().async_get(metric.metric_cb(0, i))
def getdirsize(model_dir):
    size = 0
    for root, dirs, files in os.walk(model_dir):
        for name in files:
            tmp = os.path.getsize(os.path.join(root, name))
            size += tmp
        # size += sum([os.path.getsize(os.path.join(root, name)) for name in files])
    return size


def main():
    InitNodes(args)
    flow.env.log_dir(args.log_dir)
    modelSize = getdirsize(args.model_load_dir)
    summary = Summary(args.log_dir, args, modelSize)
    snapshot = Snapshot(args.model_save_dir, args.model_load_dir)

    for epoch in range(args.num_epochs):
        metric = Metric(desc='train', calculate_batches=args.loss_print_every_n_iter,
                        summary=summary, save_summary_steps=epoch_size,
                        batch_size=train_batch_size, loss_key='loss')
        for i in range(epoch_size):
            TrainNet().async_get(metric.metric_cb(epoch, i))
        # flow.tensorrt.write_int8_calibration("./int8_calibration") # mkdir int8_calibration
        if args.val_data_dir:
            metric = Metric(desc='validation', calculate_batches=num_val_steps, summary=summary,
                            save_summary_steps=num_val_steps, batch_size=val_batch_size)
            for i in range(num_val_steps):
                InferenceNet().async_get(metric.metric_cb(epoch, i))
def main():
    flow.config.gpu_device_num(args.gpu_num_per_node)
    flow.env.log_dir(args.log_dir)

    InitNodes(args)

    check_point = flow.train.CheckPoint()

    summary = Summary(args.log_dir, args)
    if not os.path.exists(args.model_save_dir):
        os.makedirs(args.model_save_dir)
    if args.do_train:
        print('Combining two models into one dir')
        if not os.path.exists('./tmp'):
            os.makedirs('./tmp')

        args.total_model = tempfile.mkdtemp(dir='./tmp')
        CopyFile(args.student_model, args.total_model)
        CopyFile(args.teacher_model, args.total_model)
        print('Loading model...')
        check_point.load(args.total_model)
        #     # check_point.load(args.teacher_model)
        #     # check_point.load(args.student_model)
        #
        print('Start training...')
        global_step = 0
        best_dev_acc = 0.0
        for epoch in range(args.num_epochs):
            metric = Metric(desc='finetune', print_steps=args.loss_print_every_n_iter, summary=summary,
                            batch_size=batch_size, keys=['loss'])

            for step in range(epoch_size):
                DistilJob().async_get(metric.metric_cb(step, epoch=epoch))
                global_step += 1
                # if (global_step + 1) % args.model_save_every_n_iter == 0:
                #     if not os.path.exists(args.model_save_dir):
                #         os.makedirs(args.model_save_dir)
                #     snapshot_save_path = os.path.join(
                #         args.model_save_dir, "snapshot_%d" % (global_step + 1)
                #     )
                #     print("Saving model to {}.".format(snapshot_save_path))
                #     check_point.save(snapshot_save_path)

            # if args.pred_distill:
            print('EvalTrainJob...')
            run_eval_job(StudentBertGlueEvalTrainJob, epoch_size, desc='train')
            print('EvalValJob...')
            result = run_eval_job(StudentBertGlueEvalValJob, num_eval_steps, desc='eval')
            if not args.pred_distill:
                save_model = True
            else:
                save_model = False
                if task_name in acc_tasks and result['accuracy'] > best_dev_acc:
                    best_dev_acc = result['accuracy']
                    save_model = True

                # if task_name in corr_tasks and result['corr'] > best_dev_acc:
                #     best_dev_acc = result['corr']
                #     save_model = True

                if task_name in mcc_tasks and result['matthews_corrcoef'] > best_dev_acc:
                    best_dev_acc = result['matthews_corrcoef']
                    save_model = True
                    print('Best result:', result)

                if save_model:
                    if os.path.exists(args.model_save_dir):
                        import shutil
                        shutil.rmtree(args.model_save_dir)
                    if not os.path.exists(args.model_save_dir):
                        os.makedirs(args.model_save_dir)
                    snapshot_save_path = os.path.join(args.model_save_dir)
                    print("Saving best model to {}".format(snapshot_save_path))
                    check_point.save(snapshot_save_path)
                    flow.sync_default_session()

        if args.save_last_snapshot:
            snapshot_save_path = args.model_save_dir
            if os.path.exists(args.model_save_dir):
                import shutil
                shutil.rmtree(args.model_save_dir)
            print("Saving model to {}".format(snapshot_save_path))
            check_point.save(snapshot_save_path)
            flow.sync_default_session()

        if global_step >= 100:
            # remove tmp total models
            print('Removing the tmp models...')
            import shutil
            shutil.rmtree(args.total_model)

        if args.serve_for_online:
            print('Deleting the teacher params and the optimizer params from model_save_dir...')
            remove_teacher_params(args.model_save_dir)

    if args.do_eval:
        print('Loading model...')
        print(args.model_save_dir)

        if not args.do_train:
            check_point.load(args.model_save_dir)
        print('Evaluation...')
        run_eval_job(StudentBertGlueEvalValJob, num_eval_steps, desc='eval')
Example #18
def main():
    flow.config.enable_debug_mode(True)
    flow.config.gpu_device_num(args.gpu_num_per_node)
    flow.env.log_dir(args.log_dir)

    InitNodes(args)

    check_point = flow.train.CheckPoint()
    check_point.init()

    summary = Summary(args.log_dir, args)
    if not os.path.exists(args.model_save_dir):
        os.makedirs(args.model_save_dir)

    if args.do_train:
        print('Start training...')
        global_step = 0
        best_dev_acc = 0.0
        print('epoch_size:', epoch_size)
        print('args.iter_num:', args.iter_num)
        for epoch in range(args.num_epochs):
            metric = Metric(desc='finetune',
                            print_steps=args.loss_print_every_n_iter,
                            summary=summary,
                            batch_size=batch_size,
                            keys=['loss'])

            for step in range(epoch_size):
                loss = DistilJob().get()
                if step % 10 == 0:
                    print('step/epoch_size:{}/{}   epoch:{}'.format(
                        step, epoch_size, epoch))
                    print('loss:', loss['loss'].mean())
                # global_step+=1
                # DistilJob().async_get(metric.metric_cb(step, epoch=epoch))

            print('EvalTrainJob...')
            run_eval_job(StudentBertGlueEvalTrainJob, epoch_size, desc='train')
            print('EvalValJob...')
            result = run_eval_job(StudentBertGlueEvalValJob,
                                  num_eval_steps,
                                  desc='eval')

            save_model = False
            if task_name in acc_tasks and result['accuracy'] > best_dev_acc:
                best_dev_acc = result['accuracy']
                save_model = True

            # if task_name in corr_tasks and result['corr'] > best_dev_acc:
            #     best_dev_acc = result['corr']
            #     save_model = True

            if task_name in mcc_tasks and result[
                    'matthews_corrcoef'] > best_dev_acc:
                best_dev_acc = result['matthews_corrcoef']
                save_model = True
                print('Best result:', result)

            if save_model:
                if not os.path.exists(args.model_save_dir):
                    os.makedirs(args.model_save_dir)
                snapshot_save_path = os.path.join(args.model_save_dir)
                print("Saving best model to {}".format(snapshot_save_path))
                check_point.save(snapshot_save_path)

        if args.save_last_snapshot:
            snapshot_save_path = args.model_save_dir
            print("Saving model to {}".format(snapshot_save_path))
            check_point.save(snapshot_save_path)

    if args.do_eval:
        print('Loading model...')
        print(args.model_save_dir)

        if not args.do_train:
            check_point.load(args.model_save_dir)
        print('Evaluation...')
        run_eval_job(StudentBertGlueEvalValJob, num_eval_steps, desc='eval')
    summary = Summary(args.log_dir, args, modelSize)
    # snapshot = Snapshot(args.model_save_dir, args.model_load_dir)
    print("Restoring model from {}.".format(args.model_load_dir))
    checkpoint = flow.train.CheckPoint()
    checkpoint.load(args.model_load_dir)


    if args.use_int8_online:
        for j in range(10):
            InferenceNet().get()
            flow.tensorrt.cache_int8_calibration()
            flow.tensorrt.write_int8_calibration("./int8_calibration")

    warmup = 2
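    # A couple of warm-up inference runs, presumably so one-time costs (graph
    # compilation, TensorRT engine build) stay out of the measured validation passes.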
    for j in range(warmup):
        InferenceNet().get()

    metric = Metric(desc='validation', calculate_batches=num_val_steps, summary=summary,
                    save_summary_steps=num_val_steps, batch_size=val_batch_size)
    for i in range(args.num_epochs):
        for j in range(num_val_steps):
            InferenceNet().async_get(metric.metric_cb(0, j))


if __name__ == "__main__":
    main()
Example #20
    def compute_scores(self, gt, preds):
        ret = {'ndcg': Metric.ndcg(gt, preds), 'auc': Metric.auc(gt, preds)}
        return ret
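        # `gt` and `preds` are presumably parallel arrays of ground-truth labels and
        # model scores; the dict reports ranking quality (NDCG) and ROC-AUC.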
Example #21
    import os
    import config as configs
    from util import Summary, InitNodes, Metric
    from job_function_util import get_val_config
    parser = configs.get_parser()
    args = parser.parse_args()
    configs.print_args(args)

    flow.config.gpu_device_num(args.gpu_num_per_node)
    #flow.config.enable_debug_mode(True)
    @flow.global_function(get_val_config(args))
    def IOTest():
        if args.train_data_dir:
            assert os.path.exists(args.train_data_dir)
            print("Loading data from {}".format(args.train_data_dir))
            (labels, images) = load_imagenet_for_training(args)
        else:
            print("Loading synthetic data.")
            (labels, images) = load_synthetic(args)
        outputs = {"images": images, "labels": labels}
        return outputs

    total_device_num = args.num_nodes * args.gpu_num_per_node
    train_batch_size = total_device_num * args.batch_size_per_device
    summary = Summary(args.log_dir, args, filename='io_test.csv')
    metric = Metric(desc='io_test', calculate_batches=args.loss_print_every_n_iter,
                    summary=summary, save_summary_steps=args.loss_print_every_n_iter,
                    batch_size=train_batch_size, prediction_key=None)
    for i in range(1000):
        IOTest().async_get(metric.metric_cb(0, i))