Example #1
def main():
    # capture the config path from the run arguments
    # then process the json configuration file
    try:
        args = get_args()
        config = process_config(args.config)
    except Exception:
        print("missing or invalid arguments")
        exit(1)

    print('Create the data generator.')
    data_loader = RobertaDataLoader(config)

    print('Create the model.')
    model = RobertaModel(config)

    print('Create the experiment.')
    experiment = Experiment(api_key=config.exp.comet_api_key, project_name=config.exp.name, auto_output_logging="simple")
    
    print('Create the trainer.')
    trainer = RobertaTrainer(model.model, experiment, config, data_loader.get_train_data())
    
    with experiment.train():
        print('Start training the model.')
        trainer.train()
        model.save()

    with experiment.test():
        print('Predicting the testing data')
        trainer.predict(data_loader.get_test_data(), data_loader.get_tokenizer())
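All of these examples share the same comet_ml pattern: create an Experiment, then log inside the train()/validate()/test() context managers, which prefix metric names with their context. A minimal self-contained sketch of that pattern (hypothetical project name; assumes comet_ml is installed and COMET_API_KEY is set in the environment):

from comet_ml import Experiment

experiment = Experiment(project_name="demo")  # API key is read from COMET_API_KEY
experiment.log_parameters({"lr": 1e-3, "epochs": 2})

with experiment.train():
    experiment.log_metric("loss", 0.5, step=0)  # recorded as train_loss

with experiment.test():
    experiment.log_metric("accuracy", 0.9)  # recorded as test_accuracy

experiment.end()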
Example #2
def train(X, y, outdir, max_feat=30):
    experiment = Experiment(project_name='color-ml')

    with experiment.train():
        gp_kernel = RationalQuadratic(
            length_scale=0.1, length_scale_bounds=(1e-4, 0.5)) + WhiteKernel(
                0.01, (1e-3, 0.5e-1))
        gp = GaussianProcessRegressor(kernel=gp_kernel,
                                      n_restarts_optimizer=15,
                                      normalize_y=True)

        sfs = SFS(
            gp,
            k_features=max_feat,
            forward=True,
            floating=False,
            scoring='neg_mean_squared_error',
            cv=5,
            verbose=2,
            n_jobs=-1,
        )

        sfs = sfs.fit(X, y)

        joblib.dump(sfs, os.path.join(outdir, 'sfs.joblib'))

    return sfs
Example #3
def fit_validate(exp_params, k, data_path, write_path, others=None, custom_tag=''):
    """Fit model and compute metrics on train and validation set. Intended for hyperparameter search.

    Only logs final metrics and scatter plot of final embedding.

    Args:
        exp_params(dict): Parameter dict. Should at least have keys model_name, dataset_name & random_state.
            Other keys are assumed to be model parameters.
        k(int): Fold identifier.
        data_path(str): Data directory.
        write_path(str): Where to write temp files.
        others(dict): Other things to log to Comet experiment.
        custom_tag(str): Custom tag for comet experiment.

    """
    # Comet experiment
    exp = Experiment(parse_args=False)
    exp.disable_mp()
    custom_tag += '_validate'
    exp.add_tag(custom_tag)
    exp.log_parameters(exp_params)

    if others is not None:
        exp.log_others(others)

    # Parse experiment parameters
    model_name, dataset_name, random_state, model_params = parse_params(exp_params)

    # Fetch and split dataset.
    data_train = getattr(grae.data, dataset_name)(split='train', random_state=random_state, data_path=data_path)
    data_train, data_val = data_train.validation_split(random_state=FOLD_SEEDS[k])

    # Model
    m = getattr(grae.models, model_name)(random_state=FOLD_SEEDS[k], **model_params)
    m.write_path = write_path
    m.data_val = data_val

    with exp.train():
        m.fit(data_train)

        # Log plot
        m.comet_exp = exp
        m.plot(data_train, data_val, title=f'{model_name} : {dataset_name}')

        # Probe embedding
        prober = EmbeddingProber()
        prober.fit(model=m, dataset=data_train, mse_only=True)
        train_z, train_metrics = prober.score(data_train, is_train=True)

        # Log train metrics
        exp.log_metrics(train_metrics)

    with exp.validate():
        val_z, val_metrics = prober.score(data_val)

        # Log validation metrics
        exp.log_metrics(val_metrics)

    # Log marker to mark successful experiment
    exp.log_other('success', 1)
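A hedged call sketch for fit_validate; the model and dataset names below are stand-ins for whatever grae.models and grae.data actually export:

exp_params = {
    'model_name': 'GRAE',         # assumed to exist in grae.models
    'dataset_name': 'SwissRoll',  # assumed to exist in grae.data
    'random_state': 42,
    'lr': 1e-3,                   # remaining keys are forwarded to the model
}
fit_validate(exp_params, k=0, data_path='./data', write_path='./tmp', custom_tag='demo')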
Example #4
    def train(self):
        os.mkdir(self.paths['path'])
        if self.use_comet and self.api_key and self.project_name and self.workspace:
            experiment = Experiment(api_key=self.api_key,
                                    project_name=self.project_name,
                                    workspace=self.workspace)
            experiment.log_dataset_hash(self.train_dataset)
            experiment.add_tags([
                str(self.architecture), "text_generation",
                f"nb_labels_{self.number_labels}"
            ])
            with experiment.train():
                hist = self.fit_dataset(self.train_dataset, self.val_dataset,
                                        self.epochs)
            experiment.end()
        elif self.use_comet:
            raise ValueError(
                "Please provide an api_key, project_name and workspace for comet_ml"
            )
        else:
            callbacks = self.callback_func(
                tensorboard_dir=self.paths['tensorboard_path'],
                checkpoint_path=self.paths['checkpoint_path'])
            hist = self.model.fit_dataset(self.train_dataset, self.val_dataset,
                                          self.epochs, callbacks)

        self.metrics = get_metrics(hist, "sparse_categorical_accuracy")
        self.export_weights(self.model)
        self.export_info(self.model_info)
        self.export_metrics(self.metrics)
        self.export_tokenizer(self.tokenizer)
        if self.do_zip_model:
            self.zip_model()
Example #5
def train(train_data, model, optimizer, experiment: Experiment):
    global TRAIN_MINI_BATCH
    global EPOCH

    model.train()
    model.cuda()

    batches = len(train_data)
    total_loss = 0
    loss_func = nn.L1Loss()
    with experiment.train():
        for x, in tqdm(train_data):
            x = x.cuda()

            optimizer.zero_grad()
            prediction, _ = model(x)
            loss = loss_func(prediction, x)
            # loss = MyLoss(x, prediction)

            experiment.log_metric(
                "mini-batch loss", loss.item(), step=TRAIN_MINI_BATCH)
            TRAIN_MINI_BATCH += 1
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        average_loss = total_loss / batches
        experiment.log_metric("batch loss", average_loss, step=EPOCH)

    return average_loss
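A usage sketch for this loop with a stand-in autoencoder; it assumes a CUDA device (the loop calls .cuda()) and a model that returns a (reconstruction, latent) pair:

from comet_ml import Experiment
import torch
from torch import nn, optim
from torch.utils.data import DataLoader, TensorDataset

TRAIN_MINI_BATCH, EPOCH = 0, 0  # module-level globals expected by train()

class TinyAE(nn.Module):
    # stand-in autoencoder returning a (reconstruction, latent) pair
    def __init__(self):
        super().__init__()
        self.enc = nn.Linear(32, 8)
        self.dec = nn.Linear(8, 32)

    def forward(self, x):
        z = self.enc(x)
        return self.dec(z), z

loader = DataLoader(TensorDataset(torch.randn(256, 32)), batch_size=64)
model = TinyAE()
optimizer = optim.Adam(model.parameters())
avg_loss = train(loader, model, optimizer, Experiment(project_name="demo"))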
Example #6
def main():
    STARTTIME0 = time.strftime('run_%Y_%m_%d_%H_%M_%S')
    METRICS = []
    for ts_size in [3000, 5000, 5600]:
        for iteration in range(10):
            _, _, X_train, X_test, y_train, y_test, _ = process_data(
                size=ts_size)

            experiment = Experiment(api_key=os.environ['COMET_API_KEY'],
                                    project_name='color-ml')

            experiment.log_parameters(PARAMETERS_MEDIAN)

            with experiment.train():
                regressor_median = fit(X_train, y_train)

            metrics_dict = get_metrics_dict(regressor_median, X_test, y_test,
                                            experiment)
            metrics_dict['iteration'] = iteration
            metrics_dict['ts_size'] = ts_size

            METRICS.append(metrics_dict)

    df = pd.DataFrame(METRICS)
    df.to_csv('learningcurve_' + STARTTIME0 + '.csv')
    experiment.log_asset('learningcurve_' + STARTTIME0 + '.csv')
Example #7
def log_metrics(metrics: dict, comet_logger: Experiment, epoch: int,
                context_val: bool):
    if context_val:
        with comet_logger.validate():
            comet_logger.log_metrics(metrics, epoch=epoch)
    else:
        with comet_logger.train():
            comet_logger.log_metrics(metrics, epoch=epoch)
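A one-line usage sketch with made-up values; with context_val=True the names are recorded as validate_loss and validate_acc:

from comet_ml import Experiment

comet_logger = Experiment(project_name="demo")  # hypothetical project
log_metrics({"loss": 0.42, "acc": 0.91}, comet_logger, epoch=3, context_val=True)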
Example #8
def log_simclr_images(img1: Tensor, img2: Tensor, context_val: bool,
                      comet_logger: Experiment):

    if context_val:
        with comet_logger.validate():
            plot_simclr_images(img1.data[0].cpu(), img2.data[0].cpu(),
                               comet_logger)
    else:
        with comet_logger.train():
            plot_simclr_images(img1.data[0].cpu(), img2.data[0].cpu(),
                               comet_logger)
Example #9
    def _train_with_comet(self, train_dataset, val_dataset):
        experiment = Experiment(api_key=self.api_key,
                                project_name=self.project_name,
                                workspace=self.workspace)
        experiment.log_dataset_hash(train_dataset)
        experiment.add_tags([
            str(self.architecture), self.name,
            f"nb_labels_{self.label_encoder_classes_number}"
        ])
        with experiment.train():
            hist = self.fit_dataset(train_dataset, val_dataset)
        experiment.end()

        return hist
Example #10
def train(hyper_params):
    mnist = get_data()

    # Get graph definition, tensors and ops
    train_step, cross_entropy, accuracy, x, y, y_ = build_model_graph(
        hyper_params)

    experiment = Experiment(project_name="tf")
    experiment.log_parameters(hyper_params)
    experiment.log_dataset_hash(mnist)

    with tf.Session() as sess:
        with experiment.train():
            sess.run(tf.global_variables_initializer())
            experiment.set_model_graph(sess.graph)

            for i in range(hyper_params["steps"]):
                batch = mnist.train.next_batch(hyper_params["batch_size"])
                experiment.set_step(i)
                # Compute train accuracy every 10 steps
                if i % 10 == 0:
                    train_accuracy = accuracy.eval(feed_dict={
                        x: batch[0],
                        y_: batch[1]
                    })
                    print('step %d, training accuracy %g' %
                          (i, train_accuracy))
                    experiment.log_metric("accuracy", train_accuracy, step=i)

                # Update weights (back propagation)
                _, loss_val = sess.run([train_step, cross_entropy],
                                       feed_dict={
                                           x: batch[0],
                                           y_: batch[1]
                                       })

                experiment.log_metric("loss", loss_val, step=i)

        ### Finished Training ###

        with experiment.test():
            # Compute test accuracy
            acc = accuracy.eval(feed_dict={
                x: mnist.test.images,
                y_: mnist.test.labels
            })
            experiment.log_metric("accuracy", acc)
            print('test accuracy %g' % acc)
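A hedged call sketch; the values are made up, steps and batch_size are the keys the loop reads, and build_model_graph is assumed to consume the rest:

hyper_params = {"steps": 1000, "batch_size": 50, "learning_rate": 0.5}
train(hyper_params)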
Example #11
def log_hybrid2_images(
    img1: Tensor,
    img2: Tensor,
    params: Dict[str, Tensor],
    context_val: bool,
    comet_logger: Experiment,
):
    params = {k: v.data[0].cpu() for k, v in params.items()}
    if context_val:
        with comet_logger.validate():
            plot_hybrid2_images(img1.data[0].cpu(), img2.data[0].cpu(), params,
                                comet_logger)
    else:
        with comet_logger.train():
            plot_hybrid2_images(img1.data[0].cpu(), img2.data[0].cpu(), params,
                                comet_logger)
Example #12
    def fit(self, experiment: Experiment, cuda: bool = True) -> dict:
        # Prepare comet.ml logger
        experiment.log_parameters(self.hparams)  # log hyper parameters

        # noinspection PyUnresolvedReferences
        torch.backends.cudnn.benchmark = True  # enable cudnn benchmark mode for better performance
        self.is_cuda = cuda
        if self.is_cuda:
            self.cuda()  # move model to cuda

        train_set = self.prepare_dataset()
        self.criterion = self.prepare_criterion()
        optimizer = self.prepare_optimizers()

        best_model = {"loss": float("inf"), "epoch": -1, "state_dict": None}

        self.train()  # set the model to train mode

        with experiment.train():
            for e in range(self.epoch):
                self.before_train_epoch()

                batch_loss = 0
                for batch_idx, batch in enumerate(train_set):
                    optimizer.zero_grad()

                    loss = self.train_step(batch)

                    batch_loss += loss.item()

                    loss.backward()

                    _ = torch.nn.utils.clip_grad_norm_(self.parameters(), self.gradients_norm)

                    optimizer.step()

                avg_loss = batch_loss / len(train_set)
                experiment.log_metric("epoch_loss", avg_loss, step=e)
                # noinspection PyUnboundLocalVariable
                if avg_loss < best_model["loss"]:
                    best_model["loss"] = avg_loss
                    best_model["epoch"] = e
                    best_model["state_dict"] = deepcopy(self.state_dict())  # detach all weights with deepcopy

        return best_model
Example #13
class Logger(object):
    def __init__(self, dataset_name, model_name):
        self.model_name = model_name
        self.project_name = "%s-%s" % (dataset_name, self.model_name)
        self.logdir = os.path.join(hp.logdir, self.project_name)
        self.writer = SummaryWriter(log_dir=self.logdir)

        self.experiment = None
        if hp.comet_ml_api_key is not None:
            self.experiment = Experiment(api_key=hp.comet_ml_api_key,
                                         project_name=self.project_name,
                                         log_code=False)
            self.experiment.log_multiple_params(
                dict((name, getattr(hp, name)) for name in dir(hp)
                     if not name.startswith('__')))

    def log_step(self, phase, step, loss_dict, image_dict):
        if phase == 'train':
            if step % 50 == 0:
                if self.experiment is not None:
                    with self.experiment.train():
                        self.experiment.log_multiple_metrics(loss_dict,
                                                             step=step)

                # self.writer.add_scalar('lr', get_lr(), step)
                # self.writer.add_scalar('%s-step/loss' % phase, loss, step)
                for key in sorted(loss_dict):
                    self.writer.add_scalar('%s-step/%s' % (phase, key),
                                           loss_dict[key], step)

            if step % 1000 == 0:
                for key in sorted(image_dict):
                    self.writer.add_image('%s/%s' % (self.model_name, key),
                                          image_dict[key], step)

    def log_epoch(self, phase, step, loss_dict):
        for key in sorted(loss_dict):
            self.writer.add_scalar('%s/%s' % (phase, key), loss_dict[key],
                                   step)

        if phase == 'valid':
            if self.experiment is not None:
                with self.experiment.validate():
                    self.experiment.log_multiple_metrics(loss_dict, step=step)
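A hedged construction sketch for this Logger; the names are made up and the hp module is assumed to define logdir and comet_ml_api_key:

logger = Logger(dataset_name="ljspeech", model_name="tacotron")  # hypothetical names
logger.log_step("train", step=50,
                loss_dict={"loss": 1.23},
                image_dict={})  # images are only written every 1000 steps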
Example #14
def train(train_data, model, optimizer, experiment: Experiment):
    global TRAIN_MINI_BATCH

    model.train()
    model.cuda()

    batches = len(train_data)
    total_loss = 0
    predictions = []
    ground_truth = []
    with experiment.train():
        for x, y in tqdm(train_data):
            x = x.cuda()
            y = y.cuda().float().view(-1, 1)  # reshape assumed; the original line ended in a bare ".view"

            optimizer.zero_grad()
            prediction = model(x)
            loss = F.binary_cross_entropy(prediction, y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

            prediction = prediction >= 0.5
            predictions.append(prediction.detach().cpu().numpy())
            ground_truth.append(y.detach().cpu().numpy())

            experiment.log_metric("Mini batch loss",
                                  loss.item(),
                                  step=TRAIN_MINI_BATCH)
            TRAIN_MINI_BATCH += 1

            print(total_loss)

    average_loss = total_loss / batches
    predictions = np.concatenate(predictions)
    ground_truth = np.concatenate(ground_truth)

    accuracy = accuracy_score(ground_truth, predictions)
    f1score = f1_score(ground_truth, predictions)
    precision = precision_score(ground_truth, predictions)
    recall = recall_score(ground_truth, predictions)

    return average_loss, accuracy, f1score, precision, recall
Example #15
def log_pairwise_images(
    img1: Tensor,
    img2: Tensor,
    gt_pred: Dict[str, Tensor],
    context_val: bool,
    comet_logger: Experiment,
):
    gt_pred = {
        k: [v[0].data[0].cpu().numpy(), v[1].data[0].cpu().numpy()]
        for k, v in gt_pred.items()
    }
    if context_val:
        with comet_logger.validate():
            plot_pairwise_images(img1.data[0].cpu(), img2.data[0].cpu(),
                                 gt_pred, comet_logger)
    else:
        with comet_logger.train():
            plot_pairwise_images(img1.data[0].cpu(), img2.data[0].cpu(),
                                 gt_pred, comet_logger)
Example #16
def run(experiment: Experiment, params: argparse.Namespace):
    sb3_utils.set_random_seed(params.seed, using_cuda=use_cuda)
    env = helper.make_env(params, 'env')

    # Logs will be saved in log_dir/monitor.csv
    env = Monitor(env)

    with experiment.train():
        callback = SaveOnBestTrainingRewardCallback(experiment,
                                                    check_freq=1000)
        # Deactivate all the DQN extensions to have the original version
        # In practice, it is recommended to keep them activated
        model = DQN(CnnPolicy,
                    env,
                    learning_rate=params.learning_rate,
                    gamma=params.gamma,
                    seed=params.seed,
                    max_grad_norm=params.max_grad_norm,
                    verbose=1,
                    device=device,
                    policy_kwargs={'features_extractor_class': ColoringCNN})
        model.learn(total_timesteps=params.max_ts, callback=callback)
Example #17
def log_image(
    prediction: Tensor,
    y: Tensor,
    x: Tensor,
    gpu: bool,
    context_val: bool,
    comet_logger: Experiment,
):
    if gpu:
        pred_label = prediction.data[0].cpu().numpy()
        true_label = y.data[0].cpu().detach().numpy()
    else:
        pred_label = prediction[0].detach().numpy()
        true_label = y[0].detach().numpy()
    if context_val:
        with comet_logger.validate():
            plot_truth_vs_prediction(pred_label, true_label, x.data[0].cpu(),
                                     comet_logger)
    else:
        with comet_logger.train():
            plot_truth_vs_prediction(pred_label, true_label, x.data[0].cpu(),
                                     comet_logger)
Example #18
    def calibrate_ensemble(
        models: list,
        X_valid: np.array,
        y_valid: np.array,
        experiment: Experiment,
        voting: str = "soft",
        calibrate: str = "isotonic",
    ) -> Tuple[VotingClassifier, float]:
        """Collects base models into a voting classifier, trains it and then performs
        probability calibration

        Arguments:
            models {list} -- list of optimized base models
            X_valid {np.array} -- feature matrix
            y_valid {np.array} -- label vector
            experiment {Experiment} -- comet.ml experiment used as the training context

        Keyword Arguments:
            voting {str} -- voting mechanism (hard or soft) (default: {"soft"})
            calibrate {str} -- probability calibration method (none, isotonic, sigmoid) (default: {"isotonic"})

        Returns:
            [VotingClassifier, float] -- calibrated voting classifier and elapsed time
        """
        trainlogger.debug("calibrating and building ensemble model")
        startime = time.process_time()

        assert len(X_valid) == len(y_valid)
        # calibrate the base estimators
        with experiment.train():
            vc = VotingClassifier(models, voting=voting)
            trainlogger.debug("now calibrating the base estimators")

            vc._calibrate_base_estimators(calibrate, X_valid, y_valid)  # pylint:disable=protected-access

        endtime = time.process_time()
        elapsed_time = endtime - startime

        return vc, elapsed_time
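A hedged usage sketch with scikit-learn base estimators and random stand-in data, assuming the method is exposed statically; note it relies on a VotingClassifier variant exposing _calibrate_base_estimators, so plain scikit-learn may not suffice:

import numpy as np
from comet_ml import Experiment
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

X_valid = np.random.rand(100, 5)
y_valid = np.random.randint(0, 2, 100)
models = [("rf", RandomForestClassifier()), ("lr", LogisticRegression())]
experiment = Experiment(project_name="demo")  # hypothetical project
vc, elapsed = calibrate_ensemble(models, X_valid, y_valid, experiment,
                                 voting="soft", calibrate="isotonic")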
Example #19
def main(cmd=None, stdout=True):
    args = get_args(cmd, stdout)

    model_id = "seed_{}_strat_{}_noise_fn_{}_noise_fp_{}_num_passes_{}_seed_size_{}_model_{}_batch_size_{}_gamma_{}_label_budget_{}_epochs_{}".format(
        args.seed, args.strategy, args.noise_fn, args.noise_fp, args.num_passes, args.seed_size, args.model, args.batch_size, args.gamma, args.label_budget, args.epochs)

    logging.basicConfig(
        filename="{}/{}.txt".format(args.dout, model_id),
        format='%(asctime)s %(levelname)-8s %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S',
        level=logging.INFO)
    logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))

    logger = Experiment(comet_ml_key, project_name="ActiveDialogue")
    logger.log_parameters(vars(args))

    if args.model == "glad":
        model_arch = GLAD
    elif args.model == "gce":
        model_arch = GCE

    env = PartialEnv(load_dataset, model_arch, args)
    if args.seed_size:
        with logger.train():
            if not env.load('seed'):
                logging.info("No loaded seed. Training now.")
                env.seed_fit(args.seed_epochs, prefix="seed")
                logging.info("Seed completed.")
            else:
                logging.info("Loaded seed.")
                if args.force_seed:
                    logging.info("Training seed regardless.")
                    env.seed_fit(args.seed_epochs, prefix="seed")
        env.load('seed')

    use_strategy = False
    if args.strategy == "entropy":
        use_strategy = True
        strategy = partial_entropy
    elif args.strategy == "bald":
        use_strategy = True
        strategy = partial_bald

    if use_strategy:
        if args.threshold_strategy == "fixed":
            strategy = FixedThresholdStrategy(strategy, args, True)
        elif args.threshold_strategy == "variable":
            strategy = VariableThresholdStrategy(strategy, args, True)
        elif args.threshold_strategy == "randomvariable":
            strategy = StochasticVariableThresholdStrategy(
                strategy, args, True)

    ended = False
    i = 0

    initial_metrics = env.metrics(True)
    logger.log_current_epoch(i)
    logging.info("Initial metrics: {}".format(initial_metrics))
    for k, v in initial_metrics.items():
        logger.log_metric(k, v)

    with logger.train():
        while not ended:
            i += 1

            # Observe environment state
            logger.log_current_epoch(i)

            if env.can_label:
                # Obtain label request from strategy
                obs, preds = env.observe(20 if args.strategy ==
                                         "bald" else 1)
                if args.strategy != "bald":
                    preds = preds[0]
                if args.strategy == "aggressive":
                    label_request = aggressive(preds)
                elif args.strategy == "random":
                    label_request = random(preds)
                elif args.strategy == "passive":
                    label_request = passive(preds)
                elif use_strategy:
                    label_request = strategy.observe(preds)
                else:
                    raise ValueError()

                # Label solicitation
                labeled = env.label(label_request)
                if use_strategy:
                    strategy.update(
                        sum([
                            np.sum(s.flatten())
                            for s in label_request.values()
                        ]),
                        sum([
                            np.sum(np.ones_like(s).flatten())
                            for s in label_request.values()
                        ]))
            else:
                break

            # Environment stepping
            ended = env.step()
            # Fit every al_batch of items
            best = env.fit(prefix=model_id, reset_model=True)
            for k, v in best.items():
                logger.log_metric(k, v)
            env.load(prefix=model_id)

    # Final fit
    final_metrics = env.fit(epochs=args.final_epochs,
                            prefix="final_fit_" + model_id,
                            reset_model=True)
    for k, v in final_metrics.items():
        logger.log_metric("Final " + k, v)
        logging.info("Final " + k + ": " + str(v))
    logging.info("Run finished.")
Example #20
def main():
    global args, best_acc1
    args = parser.parse_args()

    #########################################################################################
    # Create options
    #########################################################################################
    if args.bert_model == "bert-base-uncased":
        question_features_path = BASE_EXTRACTED_QUES_FEATURES_PATH
    elif args.bert_model == "bert-base-multilingual-cased":
        question_features_path = CASED_EXTRACTED_QUES_FEATURES_PATH
    else:
        question_features_path = EXTRACTED_QUES_FEATURES_PATH

    options = {
        'vqa': {
            'trainsplit': args.vqa_trainsplit
        },
        'logs': {
            'dir_logs': args.dir_logs
        },
        'model': {
            'arch': args.arch,
            'seq2vec': {
                'type': args.st_type,
                'dropout': args.st_dropout,
                'fixed_emb': args.st_fixed_emb
            }
        },
        'optim': {
            'lr': args.learning_rate,
            'batch_size': args.batch_size,
            'epochs': args.epochs
        }
    }
    if args.path_opt is not None:
        with open(args.path_opt, 'r') as handle:
            options_yaml = yaml.load(handle, Loader=yaml.FullLoader)
        options = utils.update_values(options, options_yaml)
    print('## args')
    pprint(vars(args))
    print('## options')
    pprint(options)
    if args.help_opt:
        return

    # Set datasets options
    if 'vgenome' not in options:
        options['vgenome'] = None

    #########################################################################################
    # Create needed datasets
    #########################################################################################

    trainset = datasets.factory_VQA(options['vqa']['trainsplit'],
                                    options['vqa'], options['coco'],
                                    options['vgenome'])
    train_loader = trainset.data_loader(
        batch_size=options['optim']['batch_size'],
        num_workers=args.workers,
        shuffle=True)

    if options['vqa']['trainsplit'] == 'train':
        valset = datasets.factory_VQA('val', options['vqa'], options['coco'])
        val_loader = valset.data_loader(
            batch_size=options['optim']['batch_size'],
            num_workers=args.workers)

    if options['vqa']['trainsplit'] == 'trainval' or args.evaluate:
        testset = datasets.factory_VQA('test', options['vqa'], options['coco'])
        test_loader = testset.data_loader(
            batch_size=options['optim']['batch_size'],
            num_workers=args.workers)

    #########################################################################################
    # Create model, criterion and optimizer
    #########################################################################################

    model = models.factory(options['model'],
                           trainset.vocab_words(),
                           trainset.vocab_answers(),
                           cuda=True,
                           data_parallel=True)
    criterion = criterions.factory(options['vqa'], cuda=True)
    optimizer = torch.optim.Adam(
        filter(lambda p: p.requires_grad, model.parameters()),
        options['optim']['lr'])

    #########################################################################################
    # args.resume: resume from a checkpoint OR create logs directory
    #########################################################################################

    exp_logger = None
    if args.resume:
        args.start_epoch, best_acc1, exp_logger = load_checkpoint(
            model.module, optimizer,
            os.path.join(options['logs']['dir_logs'], args.resume))
    else:
        # Or create logs directory
        if os.path.isdir(options['logs']['dir_logs']):
            if click.confirm(
                    'Logs directory already exists in {}. Erase?'.format(
                        options['logs']['dir_logs']), default=False):
                os.system('rm -r ' + options['logs']['dir_logs'])
            else:
                return
        os.system('mkdir -p ' + options['logs']['dir_logs'])
        path_new_opt = os.path.join(options['logs']['dir_logs'],
                                    os.path.basename(args.path_opt))
        path_args = os.path.join(options['logs']['dir_logs'], 'args.yaml')
        with open(path_new_opt, 'w') as f:
            yaml.dump(options, f, default_flow_style=False)
        with open(path_args, 'w') as f:
            yaml.dump(vars(args), f, default_flow_style=False)

    if exp_logger is None:
        # Set loggers
        exp_name = os.path.basename(
            options['logs']['dir_logs'])  # add timestamp
        exp_logger = logger.Experiment(exp_name, options)
        exp_logger.add_meters('train', make_meters())
        exp_logger.add_meters('test', make_meters())
        if options['vqa']['trainsplit'] == 'train':
            exp_logger.add_meters('val', make_meters())
        exp_logger.info['model_params'] = utils.params_count(model)
        print('Model has {} parameters'.format(
            exp_logger.info['model_params']))

    #########################################################################################
    # args.evaluate: on valset OR/AND on testset
    #########################################################################################

    if args.evaluate:
        path_logger_json = os.path.join(options['logs']['dir_logs'],
                                        'logger.json')

        if options['vqa']['trainsplit'] == 'train':
            acc1, val_results = engine.validate(val_loader, model, criterion,
                                                exp_logger, args.start_epoch,
                                                args.print_freq)
            # save results and compute OpenEnd accuracy
            exp_logger.to_json(path_logger_json)
            save_results(val_results, args.start_epoch, valset.split_name(),
                         options['logs']['dir_logs'], options['vqa']['dir'])

        test_results, testdev_results = engine.test(test_loader, model,
                                                    exp_logger,
                                                    args.start_epoch,
                                                    args.print_freq)
        # save results and DOES NOT compute OpenEnd accuracy
        exp_logger.to_json(path_logger_json)
        save_results(test_results, args.start_epoch, testset.split_name(),
                     options['logs']['dir_logs'], options['vqa']['dir'])
        save_results(testdev_results, args.start_epoch,
                     testset.split_name(testdev=True),
                     options['logs']['dir_logs'], options['vqa']['dir'])
        return

    #########################################################################################
    # Begin training on train/val or trainval/test
    #########################################################################################
    experiment = Experiment(api_key="AgTGwIoRULRgnfVR5M8mZ5AfS",
                            project_name="vqa",
                            workspace="vuhoangminh")
    experiment.log_parameters(flatten(options))

    with experiment.train():
        for epoch in range(args.start_epoch + 1, options['optim']['epochs']):

            engine.train(train_loader, model, criterion, optimizer, exp_logger,
                         epoch, experiment, args.print_freq)

            if options['vqa']['trainsplit'] == 'train':
                # evaluate on validation set
                with experiment.validate():
                    acc1, val_results = engine.validate(
                        val_loader, model, criterion, exp_logger, epoch,
                        args.print_freq)
                    # this will be logged as validation accuracy based on the context.
                    experiment.log_metric("acc1", acc1)

                # remember best prec@1 and save checkpoint
                is_best = acc1 > best_acc1
                best_acc1 = max(acc1, best_acc1)
                save_checkpoint(
                    {
                        'epoch': epoch,
                        'arch': options['model']['arch'],
                        'best_acc1': best_acc1,
                        'exp_logger': exp_logger
                    }, model.module.state_dict(), optimizer.state_dict(),
                    options['logs']['dir_logs'], args.save_model,
                    args.save_all_from, is_best)

                # save results and compute OpenEnd accuracy
                save_results(val_results, epoch, valset.split_name(),
                             options['logs']['dir_logs'],
                             options['vqa']['dir'])

            else:
                test_results, testdev_results = engine.test(
                    test_loader,
                    model,
                    exp_logger,
                    epoch,
                    args.print_freq,
                    topk=3,
                    dict=io_utils.read_pickle(question_features_path),
                    bert_dim=options["model"]["dim_q"])

                # save checkpoint at every timestep
                save_checkpoint(
                    {
                        'epoch': epoch,
                        'arch': options['model']['arch'],
                        'best_acc1': best_acc1,
                        'exp_logger': exp_logger
                    }, model.module.state_dict(), optimizer.state_dict(),
                    options['logs']['dir_logs'], args.save_model,
                    args.save_all_from)

                # save results and DOES NOT compute OpenEnd accuracy
                save_results(test_results, epoch, testset.split_name(),
                             options['logs']['dir_logs'],
                             options['vqa']['dir'])
                save_results(testdev_results, epoch,
                             testset.split_name(testdev=True),
                             options['logs']['dir_logs'],
                             options['vqa']['dir'])
Example #21
    def train_and_evaluate(self, train_gen, val_gen, epochs):
        """
        """
        experiment = Experiment(
            api_key="VNQSdbR1pw33EkuHbUsGUSZWr",
            project_name="piratesofthecaribbean",
            workspace="florpi",
        )
        model = self.build()
        with experiment.train():
            model_path = os.path.join(self.directory,
                                      "cnn_{epoch:02d}-{val_loss:.2f}.hdf5")
            callbacks = [
                ModelCheckpoint(model_path, monitor="val_loss", mode="min"),
                # EarlyStopping(
                #     monitor="val_loss",
                #     mode="min",
                #     min_delta=0.1,
                #     patience=1,
                #     restore_best_weights=True,
                # ),
            ]
            model.fit(
                train_gen,
                epochs=epochs,
                validation_data=val_gen,
                callbacks=callbacks,
                class_weight=CLASS_WEIGHTS,
            )
        model.save(os.path.join(self.directory, "cnn_final.h5"))
        # Run validation
        with experiment.test():
            probabilities = []
            y_val_all = []
            # reset generator
            val_gen.reset()
            for idx, (X_val, y_val) in tqdm(enumerate(val_gen),
                                            desc="valset",
                                            total=val_gen._num_examples):
                y_val_all += y_val.tolist()
                probs = model.predict(X_val)
                probabilities += probs.tolist()
                if idx > val_gen._num_examples:
                    break

            y_true = np.argmax(y_val_all, axis=-1)
            y_pred = np.argmax(probabilities, axis=-1)
            visualize.plot_confusion_matrix(y_true,
                                            y_pred,
                                            classes=LABELS,
                                            normalize=True,
                                            experiment=experiment)

            visualize.plot_confusion_matrix(y_true,
                                            y_pred,
                                            classes=LABELS,
                                            normalize=False,
                                            experiment=experiment)
            experiment.log_confusion_matrix(y_true=y_true,
                                            y_predicted=y_pred,
                                            labels=LABELS)
        return model
Example #22
class Experiment:
    """
        A helper class to facilitate the training and validation procedure of the GoTurnRemix model

        Parameters
        ----------
        learning_rate: float
            Learning rate to train the model. The optimizer is SGD and the loss is L1 Loss
        image_size: int
            The size of the input image. This has to be fixed before the data is created
        data_path: Path
            Path to the data folder. If the folder name includes "pickle", then the data saved as pickles are loaded
        augment: bool
            Perform augmentation on the images before training
        logs_path: Path
            Path to save the validation predictions at the end of each epoch
        models_path: Path
            Path to save the model state at the end of each epoch
        save_name: str
            Name of the folder in which the logs and models are saved. If not provided, the current datetime is used
    """
    def __init__(self,
                 learning_rate: float,
                 image_size: int,
                 data_path: Path,
                 augment: bool = True,
                 logs_path: Path = None,
                 models_path: Path = None,
                 save_name: str = None,
                 comet_api: str = None):
        self.image_size = image_size
        self.logs_path = logs_path
        self.models_path = models_path
        self.model = GoTurnRemix()
        self.model.cuda()
        self.criterion = torch.nn.L1Loss()
        self.optimizer = torch.optim.SGD(filter(lambda p: p.requires_grad,
                                                self.model.parameters()),
                                         lr=learning_rate)
        self.model_name = str(datetime.datetime.now()).split('.')[0].replace(
            ':', '-').replace(' ', '-')
        self.model_name = save_name if save_name else self.model_name
        self.augment = augment
        self.data = Data(data_path,
                         target_size=self.image_size,
                         transforms=augment)
        self.comet = None
        if comet_api:
            self.comet = Comet(api_key=comet_api)
            self.comet.log_parameter('learning_rate', learning_rate)
            self.comet.log_parameter('image_size', image_size)
            self.comet.log_parameter('augment', augment)

    def __train_step__(self, data):
        """
        Performs one step of the training procedure

        Parameters
        ----------
        data
            data obtained from @Data.__getitem__

        Returns
        -------
           Loss at the end of training step
        """
        if self.comet:
            self.comet.train()
        previous_cropped, current_cropped, bbox, scale, crop = data
        previous_cropped = torch.div(previous_cropped, 255).float().cuda()
        current_cropped = torch.div(current_cropped, 255).float().cuda()
        previous_cropped = torch.autograd.Variable(previous_cropped,
                                                   requires_grad=True)
        current_cropped = torch.autograd.Variable(current_cropped,
                                                  requires_grad=True)
        bbox = bbox.requires_grad_(True).float().cuda()
        self.optimizer.zero_grad()
        preds = self.model(previous_cropped, current_cropped)

        del previous_cropped
        del current_cropped
        gc.collect()

        loss = self.criterion(preds, bbox)
        if self.comet:
            self.comet.log_metric('loss', loss)
        loss.backward()
        self.optimizer.step()
        return loss

    def __test__(self):
        """
        Test tracking of the model

        Returns
        -------
            Test loss and test predictions
        """
        # Set model to evaluation mode
        if self.comet:
            self.comet.test()
        self.model.eval()
        test_preds = []
        test_loss = []
        video_frames = self.data.video_frames[-1]
        video_annotations = self.data.video_annotations[-1]
        p_a = video_annotations[0]
        p_f = video_frames[0]
        test_preds.append(p_a)

        for i in tqdm(range(1, len(video_annotations)), desc='Testing'):
            c_a = video_annotations[i]
            c_f = video_frames[i]
            p_c, c_c, bbox, scale, crop = self.data.make_crops(
                p_f, c_f, p_a, c_a)
            p_c = torch.div(torch.from_numpy(p_c),
                            255).unsqueeze(0).float().cuda()
            c_c = torch.div(torch.from_numpy(c_c),
                            255).unsqueeze(0).float().cuda()
            bbox = torch.tensor(bbox, requires_grad=False).float().cuda()
            preds = self.model(p_c, c_c)

            del p_c
            del c_c
            gc.collect()

            loss = torch.nn.functional.l1_loss(preds, bbox)
            if self.comet:
                self.comet.log_metric('test_loss', loss)
            test_loss.append(loss.item())
            preds = self.data.get_bbox(preds.cpu().detach().numpy()[0],
                                       self.image_size, scale, crop)
            test_preds.append(preds)
            p_a = preds
            p_f = c_f
        return test_loss, test_preds

    def __validate__(self):
        """
        Performs validation on the model

        Returns
        -------
            Validation loss and validation predictions
        """
        # Set model to evaluation mode
        if self.comet:
            self.comet.validate()
        self.model.eval()
        validation_preds = []
        validation_loss = []
        video_frames = self.data.video_frames[-1]
        video_annotations = self.data.video_annotations[-1]
        p_a = video_annotations[0]
        p_f = video_frames[0]
        validation_preds.append(p_a)

        for i in tqdm(range(1, len(video_annotations)), desc='Validating'):
            c_a = video_annotations[i]
            c_f = video_frames[i]
            p_c, c_c, bbox, scale, crop = self.data.make_crops(
                p_f, c_f, p_a, c_a)
            p_c = torch.div(torch.from_numpy(p_c),
                            255).unsqueeze(0).float().cuda()
            c_c = torch.div(torch.from_numpy(c_c),
                            255).unsqueeze(0).float().cuda()
            bbox = torch.tensor(bbox, requires_grad=False).float().cuda()
            preds = self.model(p_c, c_c)

            del p_c
            del c_c
            gc.collect()

            loss = torch.nn.functional.l1_loss(preds, bbox)
            if self.comet:
                self.comet.log_metric('val_loss', loss)
            validation_loss.append(loss.item())
            preds = self.data.get_bbox(preds.cpu().detach().numpy()[0],
                                       self.image_size, scale, crop)
            validation_preds.append(preds)
            p_a = c_a
            p_f = c_f
        return validation_loss, validation_preds

    def train(self,
              epochs: int,
              batch_size: int,
              validate: bool = True,
              test: bool = True):
        """
        Trains the model for @epochs number of epochs

        Parameters
        ----------
        epochs: int
            Number of epochs to train the model
        batch_size: int
            The size of each batch when training the model
        validate: bool, default=True
            If True, validation occurs at the end of each epoch
            The results are saved in @logs_path and models are saved in @models_path
        test: bool, default=True
            If True, the model is tested for tracking at the end of the training procedure
            The results are saved in @logs_path

        Returns
        -------
            list: List containing the training loss at the end of each epoch
        """
        if self.comet:
            self.comet.log_parameter('epochs', epochs)
            self.comet.log_parameter('batch_size', batch_size)
        loss_per_epoch = []
        preds_per_epoch = []
        # Set the model to training mode
        self.model.train()
        # Create a DataLoader to feed data to the model
        dataloader = torch.utils.data.DataLoader(dataset=self.data,
                                                 batch_size=batch_size,
                                                 shuffle=True)

        # Run for @epochs number of epochs
        for epoch in range(epochs):
            if self.comet:
                self.comet.log_metric('epoch', epoch)
            running_loss = []
            for step, data in enumerate(
                    tqdm(dataloader,
                         total=int(len(self.data) / batch_size),
                         desc='Epoch {}'.format(epoch))):
                loss = self.__train_step__(data)
                running_loss.append(loss.item())
            training_loss = sum(running_loss) / len(running_loss)
            if self.comet:
                self.comet.log_metric('mean_train_loss', training_loss)
            loss_per_epoch.append(sum(running_loss) / len(running_loss))
            if validate:
                validation_loss, validation_preds = self.__validate__()
                if self.comet:
                    self.comet.log_metric('mean_validation_loss',
                                          validation_loss)
                preds_per_epoch.append(validation_preds)
                print('Validation loss: {}'.format(
                    sum(validation_loss) / len(validation_loss)))
            # Save the model at this stage
            if self.models_path:
                (self.models_path / self.model_name).mkdir(exist_ok=True)
                torch.save(self.model, (self.models_path / self.model_name /
                                        'epoch_{}'.format(epoch)).resolve())
            print('Training Loss: {}'.format(training_loss))
        # Save the validation frames, ground truths and predictions at this stage
        if self.logs_path:
            (self.logs_path / self.model_name).mkdir(exist_ok=True)
            save = {
                'frames': self.data.video_frames[-1],
                'truth': self.data.video_annotations[-1],
                'preds': preds_per_epoch
            }
            np.save(
                str((self.logs_path / self.model_name /
                     'preds_per_epoch.npy').resolve()), save)
        # Test the model and save the results
        if test:
            test_loss, test_preds = self.__test__()
            if self.logs_path:
                (self.logs_path / self.model_name).mkdir(exist_ok=True)
                save = {
                    'frames': self.data.video_frames[-1],
                    'truth': self.data.video_annotations[-1],
                    'preds': test_preds,
                    'loss': test_loss
                }
                np.save(
                    str((self.logs_path / self.model_name /
                         'test_preds.npy').resolve()), save)
        return loss_per_epoch
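A hedged construction sketch for this class; the values are made up, a CUDA device is required, and data_path must match whatever layout Data expects:

from pathlib import Path

exp = Experiment(learning_rate=1e-5,
                 image_size=224,  # assumed input size
                 data_path=Path("./data"),
                 logs_path=Path("./logs"),
                 models_path=Path("./models"),
                 comet_api=None)  # pass a Comet API key string to enable logging
loss_per_epoch = exp.train(epochs=5, batch_size=32, validate=True, test=True)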
Example #23
def run(args, train, sparse_evidences, claims_dict):
    BATCH_SIZE = args.batch_size
    LEARNING_RATE = args.learning_rate
    DATA_SAMPLING = args.data_sampling
    NUM_EPOCHS = args.epochs
    MODEL = args.model
    RANDOMIZE = args.no_randomize
    PRINT = args.print

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda:0" if use_cuda else "cpu")

    logger = Logger('./logs/{}'.format(time.strftime('%Y-%m-%d_%H-%M-%S')))  # readable timestamp instead of a raw struct_time

    if MODEL:
        print("Loading pretrained model...")
        model = torch.load(MODEL)  # the saved model object already contains its weights
    else:
        model = cdssm.CDSSM()
        model = model.cuda()
        model = model.to(device)

    if torch.cuda.device_count() > 0:
        print("Let's use", torch.cuda.device_count(), "GPU(s)!")
        model = nn.DataParallel(model)

    print("Created model with {:,} parameters.".format(
        putils.count_parameters(model)))

    print("Created dataset...")

    # use an 80/20 train/validate split!
    train_size = int(len(train) * 0.80)
    train_dataset = pytorch_data_loader.WikiDataset(
        train[:train_size],
        claims_dict,
        data_sampling=DATA_SAMPLING,
        sparse_evidences=sparse_evidences,
        randomize=RANDOMIZE)
    val_dataset = pytorch_data_loader.WikiDataset(
        train[train_size:],
        claims_dict,
        data_sampling=DATA_SAMPLING,
        sparse_evidences=sparse_evidences,
        randomize=RANDOMIZE)

    train_dataloader = DataLoader(train_dataset,
                                  batch_size=BATCH_SIZE,
                                  num_workers=0,
                                  shuffle=True,
                                  collate_fn=pytorch_data_loader.PadCollate())
    val_dataloader = DataLoader(val_dataset,
                                batch_size=BATCH_SIZE,
                                num_workers=0,
                                shuffle=True,
                                collate_fn=pytorch_data_loader.PadCollate())

    # Loss and optimizer
    criterion = torch.nn.NLLLoss()
    # criterion = torch.nn.SoftMarginLoss()
    # if torch.cuda.device_count() > 0:
    # print("Let's parallelize the backward pass...")
    # criterion = DataParallelCriterion(criterion)
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=LEARNING_RATE,
                                 weight_decay=1e-3)

    OUTPUT_FREQ = max(int((len(train_dataset) / BATCH_SIZE) * 0.02), 20)
    parameters = {
        "batch size": BATCH_SIZE,
        "epochs": NUM_EPOCHS,
        "learning rate": LEARNING_RATE,
        "optimizer": optimizer.__class__.__name__,
        "loss": criterion.__class__.__name__,
        "training size": train_size,
        "data sampling rate": DATA_SAMPLING,
        "data": args.data,
        "sparse_evidences": args.sparse_evidences,
        "randomize": RANDOMIZE,
        "model": MODEL
    }
    experiment = Experiment(api_key="YLsW4AvRTYGxzdDqlWRGCOhee",
                            project_name="clsm",
                            workspace="moinnadeem")
    experiment.add_tag("train")
    experiment.log_asset("cdssm.py")
    experiment.log_dataset_info(name=args.data)
    experiment.log_parameters(parameters)

    model_checkpoint_dir = "models/saved_model"
    for key, value in parameters.items():
        if type(value) == str:
            value = value.replace("/", "-")
        if key != "model":
            model_checkpoint_dir += "_{}-{}".format(key.replace(" ", "_"),
                                                    value)

    print("Training...")
    beginning_time = time.time()
    best_loss = torch.tensor(float("inf"),
                             dtype=torch.float)  # begin loss at infinity

    for epoch in range(NUM_EPOCHS):
        beginning_time = time.time()
        mean_train_acc = 0.0
        train_running_loss = 0.0
        train_running_accuracy = 0.0
        model.train()
        experiment.log_current_epoch(epoch)

        with experiment.train():
            for train_batch_num, inputs in enumerate(train_dataloader):
                claims_tensors, claims_text, evidences_tensors, evidences_text, labels = inputs

                claims_tensors = claims_tensors.cuda()
                evidences_tensors = evidences_tensors.cuda()
                labels = labels.cuda()
                #claims = claims.to(device).float()
                #evidences = evidences.to(device).float()
                #labels = labels.to(device)

                y_pred = model(claims_tensors, evidences_tensors)

                y = labels
                # y = y.unsqueeze(0)
                # y = y.unsqueeze(0)
                # y_pred = parallel.gather(y_pred, 0)

                y_pred = y_pred.squeeze()
                # y = y.squeeze()

                loss = criterion(y_pred, torch.max(y, 1)[1])
                # loss = criterion(y_pred, y)

                y = y.float()
                binary_y = torch.max(y, 1)[1]
                binary_pred = torch.max(y_pred, 1)[1]
                accuracy = (binary_y == binary_pred).to("cuda")
                accuracy = accuracy.float()
                accuracy = accuracy.mean()
                train_running_accuracy += accuracy.item()
                mean_train_acc += accuracy.item()
                train_running_loss += loss.item()

                if PRINT:
                    for idx in range(len(y)):
                        print(
                            "Claim: {}, Evidence: {}, Prediction: {}, Label: {}"
                            .format(claims_text[0], evidences_text[idx],
                                    torch.exp(y_pred[idx]), y[idx]))

                if (train_batch_num %
                        OUTPUT_FREQ) == 0 and train_batch_num > 0:
                    elapsed_time = time.time() - beginning_time
                    binary_y = torch.max(y, 1)[1]
                    binary_pred = torch.max(y_pred, 1)[1]
                    print(
                        "[{}:{}:{:3f}s] training loss: {}, training accuracy: {}, training recall: {}"
                        .format(
                            epoch, train_batch_num /
                            (len(train_dataset) / BATCH_SIZE), elapsed_time,
                            train_running_loss / OUTPUT_FREQ,
                            train_running_accuracy / OUTPUT_FREQ,
                            recall_score(binary_y.cpu().detach().numpy(),
                                         binary_pred.cpu().detach().numpy())))

                    # 1. Log scalar values (scalar summary)
                    info = {
                        'train_loss': train_running_loss / OUTPUT_FREQ,
                        'train_accuracy': train_running_accuracy / OUTPUT_FREQ
                    }

                    for tag, value in info.items():
                        experiment.log_metric(tag,
                                              value,
                                              step=train_batch_num *
                                              (epoch + 1))
                        logger.scalar_summary(tag, value, train_batch_num + 1)

                    ## 2. Log values and gradients of the parameters (histogram summary)
                    for tag, value in model.named_parameters():
                        tag = tag.replace('.', '/')
                        logger.histo_summary(tag,
                                             value.detach().cpu().numpy(),
                                             train_batch_num + 1)
                        logger.histo_summary(tag + '/grad',
                                             value.grad.detach().cpu().numpy(),
                                             train_batch_num + 1)

                    train_running_loss = 0.0
                    beginning_time = time.time()
                    train_running_accuracy = 0.0
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

        print("Running validation...")
        model.eval()
        pred = []
        true = []
        avg_loss = 0.0
        val_running_accuracy = 0.0
        val_running_loss = 0.0
        beginning_time = time.time()
        with experiment.validate():
            for val_batch_num, val_inputs in enumerate(val_dataloader):
                claims_tensors, claims_text, evidences_tensors, evidences_text, labels = val_inputs

                claims_tensors = claims_tensors.cuda()
                evidences_tensors = evidences_tensors.cuda()
                labels = labels.cuda()

                y_pred = model(claims_tensors, evidences_tensors)

                y = labels

                y_pred = y_pred.squeeze()

                loss = criterion(y_pred, torch.max(y, 1)[1])

                y = y.float()

                binary_y = torch.max(y, 1)[1]
                binary_pred = torch.max(y_pred, 1)[1]
                true.extend(binary_y.tolist())
                pred.extend(binary_pred.tolist())

                accuracy = (binary_y == binary_pred).float().mean()
                val_running_accuracy += accuracy.item()
                val_running_loss += loss.item()
                avg_loss += loss.item()

                if (val_batch_num % OUTPUT_FREQ) == 0 and val_batch_num > 0:
                    elapsed_time = time.time() - beginning_time
                    print(
                        "[{}:{}:{:3f}s] validation loss: {}, accuracy: {}, recall: {}"
                        .format(
                            epoch,
                            val_batch_num / (len(val_dataset) / BATCH_SIZE),
                            elapsed_time, val_running_loss / OUTPUT_FREQ,
                            val_running_accuracy / OUTPUT_FREQ,
                            recall_score(binary_y.cpu().detach().numpy(),
                                         binary_pred.cpu().detach().numpy())))

                    # 1. Log scalar values (scalar summary)
                    info = {
                        'val_accuracy': val_running_accuracy / OUTPUT_FREQ,
                        'val_loss': val_running_loss / OUTPUT_FREQ
                    }

                    for tag, value in info.items():
                        experiment.log_metric(tag,
                                              value,
                                              step=val_batch_num * (epoch + 1))
                        logger.scalar_summary(tag, value, val_batch_num + 1)

                    ## 2. Log values and gradients of the parameters (histogram summary)
                    for tag, value in model.named_parameters():
                        tag = tag.replace('.', '/')
                        logger.histo_summary(tag,
                                             value.detach().cpu().numpy(),
                                             val_batch_num + 1)
                        # No backward pass runs during validation, so grads may be absent.
                        if value.grad is not None:
                            logger.histo_summary(tag + '/grad',
                                                 value.grad.detach().cpu().numpy(),
                                                 val_batch_num + 1)

                    val_running_accuracy = 0.0
                    val_running_loss = 0.0
                    beginning_time = time.time()

        accuracy = accuracy_score(true, pred)
        print("[{}] mean accuracy: {}, mean loss: {}".format(
            epoch, accuracy, avg_loss / len(val_dataloader)))

        true = np.array(true).astype("int")
        pred = np.array(pred).astype("int")
        print(classification_report(true, pred))

        val_loss = avg_loss / len(val_dataloader)
        is_best = bool(val_loss <= best_loss)
        best_loss = torch.tensor(min(val_loss, best_loss.cpu().numpy()))

        putils.save_checkpoint(
            {
                "epoch": epoch,
                "model": model,
                "best_loss": best_loss
            },
            is_best,
            filename="{}_loss_{}".format(model_checkpoint_dir,
                                         best_loss.cpu().numpy()))
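
A note on the pattern above: torch.max(y, 1)[1] converts one-hot labels into class indices; it is simply an argmax over dimension 1. A minimal, standalone sketch of that equivalence, independent of the training code:

import torch

y_onehot = torch.tensor([[0., 1., 0.],
                         [1., 0., 0.]])
class_idx = torch.max(y_onehot, 1)[1]          # tensor([1, 0])
assert torch.equal(class_idx, y_onehot.argmax(dim=1))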
Example #24
0
def main(args):
    if args.dataset in ('FB15k-237', 'kinship', 'nations', 'umls', 'WN18RR', 'YAGO3-10'):
        S = joblib.load(args.data_path)
        train_set = FBDataset(S['train_data'], args.prefetch_to_gpu)
        valid_set = FBDataset(S['val_data'])
        test_set = FBDataset(S['test_data'])
    else:
        train_set = FBDataset(args.data_path % 'train', args.prefetch_to_gpu)
        valid_set = FBDataset(args.data_path % 'valid')
        test_set = FBDataset(args.data_path % 'test')
        print('50 Most Common Attributes')

    if args.prefetch_to_gpu:
        train_hash = set([r.tobytes() for r in train_set.dataset.cpu().numpy()])
    else:
        train_hash = set([r.tobytes() for r in train_set.dataset])

    all_hash = train_hash.copy()
    all_hash.update(set([r.tobytes() for r in valid_set.dataset]))
    all_hash.update(set([r.tobytes() for r in test_set.dataset]))
    logdir = args.outname_base + '_logs' + '/'
    if args.remove_old_run:
        shutil.rmtree(logdir)
    if not os.path.exists(logdir):
        os.makedirs(logdir)
    tflogger = tfLogger(logdir)
    ''' Comet Logging '''
    experiment = Experiment(api_key="Ht9lkWvTm58fRo9ccgpabq5zV", disabled= not args.do_log
                        ,project_name="graph-invariance-icml", workspace="joeybose")
    experiment.set_name(args.namestr)
    modelD = TransD(args.num_ent, args.num_rel, args.embed_dim, args.p)

    fairD_0, fairD_1, fairD_2 = None,None,None
    optimizer_fairD_0, optimizer_fairD_1, optimizer_fairD_2 = None,None,None
    filter_0, filter_1, filter_2 = None, None, None

    if args.debug:
        ipdb.set_trace()

    if args.load_transD:
        modelD.load(args.saved_path)

    if args.use_cuda:
        modelD.cuda()
    if args.use_attr:
        ''' Hard Coded to the most common attribute for now '''
        attr_data = [args.attr_mat,args.ent_to_idx,args.attr_to_idx,\
                args.reindex_attr_idx,args.attr_count]
        fairD_0 = FBDemParDisc(args.embed_dim,args.fair_att_0,'0',attr_data,args.use_cross_entropy)
        fairD_1 = FBDemParDisc(args.embed_dim,args.fair_att_1,'1',attr_data,args.use_cross_entropy)
        fairD_2 = FBDemParDisc(args.embed_dim,args.fair_att_2,'2',attr_data,args.use_cross_entropy)
        ''' Print the most common attributes '''
        for k in fairD_0.reindex_to_idx.keys():
            print(fairD_0.inv_attr_map[int(k)])

        ''' Initialize Optimizers '''
        if args.sample_mask:
            filter_0 = AttributeFilter(args.embed_dim,attribute='0')
            filter_1 = AttributeFilter(args.embed_dim,attribute='1')
            filter_2 = AttributeFilter(args.embed_dim,attribute='2')
            filter_0.cuda()
            filter_1.cuda()
            filter_2.cuda()
            optimizer_fairD_0 = optimizer(fairD_0.parameters(),'adam', args.lr)
            optimizer_fairD_1 = optimizer(fairD_1.parameters(),'adam',args.lr)
            optimizer_fairD_2 = optimizer(fairD_2.parameters(),'adam', args.lr)
        elif args.use_trained_filters:
            filter_0 = AttributeFilter(args.embed_dim,attribute='0')
            filter_1 = AttributeFilter(args.embed_dim,attribute='1')
            filter_2 = AttributeFilter(args.embed_dim,attribute='2')
            filter_0.cuda()
            filter_1.cuda()
            filter_2.cuda()
        else:
            optimizer_fairD_0 = optimizer(fairD_0.parameters(),'adam', args.lr)
            optimizer_fairD_1 = optimizer(fairD_1.parameters(),'adam',args.lr)
            optimizer_fairD_2 = optimizer(fairD_2.parameters(),'adam', args.lr)
            filter_0, filter_1, filter_2 = None, None, None

        if args.use_cuda:
            fairD_0.cuda()
            fairD_1.cuda()
            fairD_2.cuda()

    elif args.use_1_attr:
        attr_data = [args.attr_mat,args.ent_to_idx,args.attr_to_idx,\
                args.reindex_attr_idx,args.attr_count]
        fairD_1 = FBDemParDisc(args.embed_dim,args.fair_att_1,'1',attr_data,\
                use_cross_entropy=args.use_cross_entropy)
        fairD_1.cuda()
        optimizer_fairD_1 = optimizer(fairD_1.parameters(),'adam',args.lr)
    elif args.use_0_attr:
        attr_data = [args.attr_mat,args.ent_to_idx,args.attr_to_idx,\
                args.reindex_attr_idx,args.attr_count]
        fairD_0 = FBDemParDisc(args.embed_dim,args.fair_att_0,'0',attr_data,\
               use_cross_entropy=args.use_cross_entropy)
        optimizer_fairD_0 = optimizer(fairD_0.parameters(),'adam', args.lr)
        fairD_0.cuda()
    elif args.use_2_attr:
        attr_data = [args.attr_mat,args.ent_to_idx,args.attr_to_idx,\
                args.reindex_attr_idx,args.attr_count]
        fairD_2 = FBDemParDisc(args.embed_dim,args.fair_att_2,'2',attr_data,\
                use_cross_entropy=args.use_cross_entropy)
        optimizer_fairD_2 = optimizer(fairD_2.parameters(),'adam', args.lr)
        fairD_2.cuda()

    if args.load_filters:
        filter_0.load(args.filter_0_saved_path)
        filter_1.load(args.filter_1_saved_path)
        filter_2.load(args.filter_2_saved_path)

    ''' Create Sets '''
    fairD_set = [fairD_0,fairD_1,fairD_2]
    filter_set = [filter_0,filter_1,filter_2]
    optimizer_fairD_set = [optimizer_fairD_0, optimizer_fairD_1,\
            optimizer_fairD_2]

    D_monitor = OrderedDict()
    test_val_monitor = OrderedDict()

    if args.sample_mask and not args.use_trained_filters:
        optimizerD = optimizer(list(modelD.parameters()) + \
                list(filter_0.parameters()) + \
                list(filter_1.parameters()) + \
                list(filter_2.parameters()), 'adam', args.lr)
    else:
        optimizerD = optimizer(modelD.parameters(), 'adam_hyp3', args.lr)
        # optimizerD = optimizer(modelD.parameters(), 'adam', args.lr)
    schedulerD = lr_scheduler(optimizerD, args.decay_lr, args.num_epochs)

    loss_func = MarginRankingLoss(args.margin,1)

    _cst_inds = torch.LongTensor(np.arange(args.num_ent, \
            dtype=np.int64)[:,None]).cuda().repeat(1, args.batch_size//2)
    _cst_s = torch.LongTensor(np.arange(args.batch_size//2)).cuda()
    _cst_s_nb = torch.LongTensor(np.arange(args.batch_size//2,args.batch_size)).cuda()
    _cst_nb = torch.LongTensor(np.arange(args.batch_size)).cuda()

    if args.prefetch_to_gpu:
        train_loader = DataLoader(train_set, batch_size=args.batch_size, shuffle=True, drop_last=True,
                                  num_workers=0, collate_fn=collate_fn)
    else:
        train_loader = DataLoader(train_set, batch_size=args.batch_size, shuffle=True, drop_last=True,
                                  num_workers=4, pin_memory=True, collate_fn=collate_fn)
    if args.freeze_transD:
        freeze_model(modelD)

    ''' Joint Training '''
    if not args.dont_train:
        with experiment.train():
            for epoch in tqdm(range(1, args.num_epochs + 1)):
                train(train_loader,epoch,args,train_hash,modelD,optimizerD,\
                        tflogger,fairD_set,optimizer_fairD_set,filter_set,experiment)
                gc.collect()
                if args.decay_lr:
                    if args.decay_lr == 'ReduceLROnPlateau':
                        schedulerD.step(D_monitor['D_loss_epoch_avg'])
                    else:
                        schedulerD.step()

                if epoch % args.valid_freq == 0:
                    with torch.no_grad():
                        l_ranks, r_ranks = test(test_set,args,all_hash,\
                                modelD,tflogger,filter_set,experiment,subsample=20)
                        l_mean = l_ranks.mean()
                        r_mean = r_ranks.mean()
                        l_mrr = (1. / l_ranks).mean()
                        r_mrr = (1. / r_ranks).mean()
                        l_h10 = (l_ranks <= 10).mean()
                        r_h10 = (r_ranks <= 10).mean()
                        l_h5 = (l_ranks <= 5).mean()
                        r_h5 = (r_ranks <= 5).mean()
                        avg_mr = (l_mean + r_mean)/2
                        avg_mrr = (l_mrr+r_mrr)/2
                        avg_h10 = (l_h10+r_h10)/2
                        avg_h5 = (l_h5+r_h5)/2

                    if args.use_attr:
                        test_fairness(test_set,args, modelD,tflogger,\
                                fairD_0,attribute='0',\
                                epoch=epoch,experiment=experiment,filter_=filter_0)
                        test_fairness(test_set,args,modelD,tflogger,\
                                fairD_1,attribute='1',epoch=epoch,\
                                experiment=experiment,filter_=filter_1)
                        test_fairness(test_set,args, modelD,tflogger,\
                                fairD_2,attribute='2',epoch=epoch,\
                                experiment=experiment,filter_=filter_2)
                    elif args.use_0_attr:
                        test_fairness(test_set,args,modelD,tflogger,\
                                fairD_0,attribute='0',epoch=epoch,\
                                experiment=experiment,filter_=filter_0)
                    elif args.use_1_attr:
                        test_fairness(test_set,args,modelD,tflogger,\
                                fairD_1,attribute='1',epoch=epoch,\
                                experiment=experiment,filter_=filter_1)
                    elif args.use_2_attr:
                        test_fairness(test_set,args,modelD,tflogger,\
                                fairD_2,attribute='2',epoch=epoch,\
                                experiment=experiment,filter_=filter_2)

                    joblib.dump({'l_ranks':l_ranks,'r_ranks':r_ranks},args.outname_base+\
                                'epoch{}_validation_ranks.pkl'.format(epoch), compress=9)

                    print("Mean Rank is %f" %(float(avg_mr)))
                    if args.do_log: # Tensorboard logging
                        tflogger.scalar_summary('Mean Rank',float(avg_mr),epoch)
                        tflogger.scalar_summary('Mean Reciprocal Rank',float(avg_mrr),epoch)
                        tflogger.scalar_summary('Hit @10',float(avg_h10),epoch)
                        tflogger.scalar_summary('Hit @5',float(avg_h5),epoch)
                        experiment.log_metric("Mean Rank",float(avg_mr),step=counter)

                    modelD.save(args.outname_base+'D_epoch{}.pts'.format(epoch))

                if epoch % (args.valid_freq * 5) == 0:
                    l_ranks, r_ranks = test(test_set,args,all_hash,modelD,\
                            tflogger,filter_set,experiment,subsample=20)
                    l_mean = l_ranks.mean()
                    r_mean = r_ranks.mean()
                    l_mrr = (1. / l_ranks).mean()
                    r_mrr = (1. / r_ranks).mean()
                    l_h10 = (l_ranks <= 10).mean()
                    r_h10 = (r_ranks <= 10).mean()
                    l_h5 = (l_ranks <= 5).mean()
                    r_h5 = (r_ranks <= 5).mean()

    if args.sample_mask:
        filter_0.save(args.outname_base+'Filter_0.pts')
        filter_1.save(args.outname_base+'Filter_1.pts')
        filter_2.save(args.outname_base+'Filter_2.pts')

    if args.test_new_disc:
        ''' Testing with fresh discriminators '''
        args.use_attr = True
        args.use_trained_filters = True
        with experiment.test():
            args.force_ce = True
            if args.use_trained_filters:
                logdir_filter = args.outname_base + '_test_2_filter_logs' + '/'
                if args.remove_old_run:
                    shutil.rmtree(logdir_filter)
                if not os.path.exists(logdir_filter):
                    os.makedirs(logdir_filter)
                tflogger_filter = tfLogger(logdir_filter)

                args.use_trained_filters = True

                ''' Test With Filters '''
                if args.use_attr:
                    retrain_disc(args,experiment,train_loader,train_hash,test_set,modelD,\
                            optimizerD,tflogger_filter,filter_2=filter_2,filter_0=None,\
                            filter_1=None,attribute='2')
                    retrain_disc(args,experiment,train_loader,train_hash,test_set,modelD,\
                            optimizerD,tflogger_filter,filter_0,filter_1=None,\
                            filter_2=None,attribute='0')
                    retrain_disc(args,experiment,train_loader,train_hash,test_set,modelD,\
                            optimizerD,tflogger_filter,filter_1=filter_1,\
                            filter_0=None,filter_2=None,attribute='1')
                elif args.use_0_attr:
                    retrain_disc(args,experiment,train_loader,train_hash,test_set,modelD,\
                            optimizerD,tflogger_filter,filter_0,filter_1=None,\
                            filter_2=None,attribute='0')
                elif args.use_1_attr:
                    retrain_disc(args,experiment,train_loader,train_hash,test_set,modelD,\
                            optimizerD,tflogger_filter,filter_1=filter_1,\
                            filter_0=None,filter_2=None,attribute='1')
                elif args.use_2_attr:
                    retrain_disc(args,experiment,train_loader,train_hash,test_set,modelD,\
                            optimizerD,tflogger_filter,filter_2=filter_2,filter_0=None,\
                            filter_1=None,attribute='2')

            args.freeze_transD = True
            args.use_trained_filters = False
            logdir_no_filter = args.outname_base + '_test_no_2_filter_logs' + '/'
            if args.remove_old_run:
                shutil.rmtree(logdir_no_filter)
            if not os.path.exists(logdir_no_filter):
                os.makedirs(logdir_no_filter)
            tflogger_no_filter = tfLogger(logdir_no_filter)
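
The example above gates all Comet logging on a CLI flag via Experiment(disabled=...). A minimal sketch of that toggle, assuming the API key comes from the environment or comet config rather than being hardcoded:

from comet_ml import Experiment

def make_experiment(do_log: bool, run_name: str) -> Experiment:
    # disabled=True turns every log_* call into a no-op.
    exp = Experiment(disabled=not do_log)
    exp.set_name(run_name)
    return exp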
Example #25
0
class Trainer2D:
    def __init__(self, config):
        self.experiment = Experiment(api_key="CQ4yEzhJorcxul2hHE5gxVNGu",
                                     project_name="HIP")
        self.experiment.log_parameters(vars(config))
        self.config = config
        self.log_step = config.log_step
        self.model = conv2d.Conv2DPatches(image_size=config.image_size)
        print(self.model)
        self.d = get_dataloader2D(config)
        self.train_loader, self.test_loader = self.d
        self.train_loader_jig, self.test_loader_jig = get_dataloader2DJigSaw(
            config)
        self.net_optimizer = optim.Adam(self.model.parameters(), config.lr,
                                        [0.5, 0.9999])
        if torch.cuda.is_available():
            self.model = self.model.cuda()
        self.criterion_c = nn.CrossEntropyLoss()
        self.criterion_d = nn.MSELoss()
        self.epochs = config.epochs
        if torch.cuda.is_available():
            print("Using CUDA")
            self.model = self.model.cuda()
        #     self.model = self.model.cuda()
        self.pre_model_path = "./artifacts/pre_models/" + str(
            config.lr) + ".pth"
        self.model_path = "./artifacts/models/" + str(config.lr) + ".pth"
        self.image_size = config.image_size

    def pre_train(self):

        if os.path.isfile(self.pre_model_path):
            print("Using pre-trained model for solving the jigsaw puzzle")
            self.model = torch.load(self.pre_model_path)
        else:
            print("Starting pre-training and solving the jigsaw puzzle")
            for epoch in range(self.epochs):  # epoch count assumed from config
                print("Starting epoch {}".format(epoch))
                train_loader = iter(self.train_loader_jig)
                with self.experiment.train():
                    for i in range(len(train_loader)):
                        self.net_optimizer.zero_grad()
                        data, indexes, _ = next(train_loader)
                        data, indexes = self.to_var(data), self.to_var(
                            indexes).float()
                        B, L, H, W = data.size()
                        B, L, S = indexes.size()

                        jig_out, _ = self.model(data, True)
                        loss = self.criterion_d(jig_out, indexes.view(-1, S))
                        loss.backward()
                        self.net_optimizer.step()
                        self.experiment.log_metric("pre-loss", loss.item())
                        print("loss: {}".format(loss.item()))

            torch.save(self.model, self.pre_model_path)

    def train(self):
        if os.path.isfile(self.model_path):
            print("Using pre-trained model")
            self.model = torch.load(self.model_path)
        print("Starting training")
        if torch.cuda.is_available():
            self.model = self.model.cuda()
        for epoch in range(self.epochs):
            print("Starting epoch {}".format(epoch))
            train_loader = iter(self.train_loader)
            with self.experiment.train():
                for i in range(len(train_loader)):
                    self.net_optimizer.zero_grad()
                    data, landmarks, _ = next(train_loader)
                    data, landmarks = self.to_var(data), self.to_var(landmarks)
                    B, L, H, W = data.size()
                    B, L, S = landmarks.size()
                    # Pick, for every landmark, the slice it lies on.
                    y = landmarks[:, :, 1].view(B, L)
                    y_slices = torch.zeros([B, L, H, W],
                                           dtype=torch.float32)
                    if torch.cuda.is_available():
                        y_slices = y_slices.cuda()
                    for b in range(B):
                        y_slices[b] = data[b, y[b]]

                    jig_out, detected_points = self.model(y_slices)
                    # Normalize landmark coordinates to [0, 1] before the loss.
                    landmarks = landmarks.float() / self.image_size
                    loss = self.criterion_d(detected_points,
                                            landmarks[:, :, [0, 2]])
                    loss.backward()
                    self.net_optimizer.step()
                    self.experiment.log_metric("loss", loss.item())
                    print("loss: {}".format(loss.item()))
            if epoch % self.log_step == 0:
                with self.experiment.test():
                    self.evaluate()
                    evaluator = Evaluator(self, self.test_loader)
                    evaluator.report()
        torch.save(self.model, self.model_path)
        evaluator = Evaluator(self, self.test_loader)
        evaluator.report()

    def evaluate(self):
        test_loader = iter(self.test_loader)
        with self.experiment.test():
            loss = 0
            for i in range(len(test_loader)):
                data, landmarks, _ = next(test_loader)
                data, landmarks = self.to_var(data), self.to_var(landmarks)
                B, L, H, W = data.size()
                B, L, S = landmarks.size()
                y = landmarks[:, :, 1].view(B, L)
                y_slices = torch.zeros([B, L, H, W], dtype=torch.float32)
                if torch.cuda.is_available():
                    y_slices = y_slices.cuda()

                for b in range(B):
                    y_slices[b] = data[b, y[b]]

                jig_out, detected_points = self.model(y_slices)
                landmarks = landmarks.float() / self.image_size
                loss += self.criterion_d(detected_points,
                                         landmarks[:, :, [0, 2]]).item()
                self.plots(y_slices.cpu(), landmarks[:, :, [0, 2]],
                           detected_points)
            self.experiment.log_metric("loss", loss / len(test_loader))

    def plots(self, slices, real, predicted):
        figure, axes = plt.subplots(nrows=4, ncols=4, figsize=(15, 15))
        slices = slices[0].cpu().detach().numpy()
        real = real[0].cpu().detach().numpy()
        predicted = predicted[0].cpu().detach().numpy()
        real *= self.image_size
        predicted *= self.image_size
        s = 0
        # print(real.size())
        # print(predicted.size())
        for i in range(4):
            for j in range(4):
                axes[i, j].imshow(slices[s])
                x, z = real[s]
                axes[i, j].scatter(x, z, color="red")
                x, z = predicted[s]
                axes[i, j].scatter(x, z, color="blue")
                s += 1
        self.experiment.log_figure(figure=figure)
        figure.savefig("artifacts/predictions/img.png")
        plt.show()

    def to_var(self, x):
        """Converts numpy to variable."""
        if torch.cuda.is_available():
            x = x.cuda()
        return Variable(x, requires_grad=False)

    def to_data(self, x):
        """Converts variable to numpy."""
        if torch.cuda.is_available():
            x = x.cpu()
        return x.data.numpy()

    def predict(self, x):
        if torch.cuda.is_available():
            self.model = self.model.cuda()
            x = x.cuda()
        _, x = self.model(x)
        return x
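
The to_var and to_data helpers above date from pre-0.4 PyTorch, when tensors had to be wrapped in torch.autograd.Variable. Since PyTorch 0.4 that wrapper is a no-op; a sketch of the modern equivalents, under that assumption:

import torch

def to_device(x: torch.Tensor) -> torch.Tensor:
    # Tensors are differentiable on their own; no Variable wrapper needed.
    return x.cuda() if torch.cuda.is_available() else x

def to_numpy(x: torch.Tensor):
    return x.detach().cpu().numpy()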
Example #26
0
class Trainer():
    def __init__(self, log_dir, cfg):

        self.path = log_dir
        self.cfg = cfg

        if cfg.TRAIN.FLAG:
            self.model_dir = os.path.join(self.path, 'Model')
            self.log_dir = os.path.join(self.path, 'Log')
            mkdir_p(self.model_dir)
            mkdir_p(self.log_dir)
            self.writer = SummaryWriter(log_dir=self.log_dir)
            self.logfile = os.path.join(self.path, "logfile.log")
            sys.stdout = Logger(logfile=self.logfile)

        self.data_dir = cfg.DATASET.DATA_DIR
        self.max_epochs = cfg.TRAIN.MAX_EPOCHS
        self.snapshot_interval = cfg.TRAIN.SNAPSHOT_INTERVAL

        s_gpus = cfg.GPU_ID.split(',')
        self.gpus = [int(ix) for ix in s_gpus]
        self.num_gpus = len(self.gpus)

        self.batch_size = cfg.TRAIN.BATCH_SIZE
        self.lr = cfg.TRAIN.LEARNING_RATE

        torch.cuda.set_device(self.gpus[0])
        cudnn.benchmark = True

        sample = cfg.SAMPLE
        self.dataset = []
        self.dataloader = []
        self.use_feats = cfg.model.use_feats
        eval_split = cfg.EVAL if cfg.EVAL else 'val'
        train_split = cfg.DATASET.train_split
        if cfg.DATASET.DATASET == 'clevr':
            clevr_collate_fn = collate_fn
            cogent = cfg.DATASET.COGENT
            if cogent:
                print(f'Using CoGenT {cogent.upper()}')

            if cfg.TRAIN.FLAG:
                self.dataset = ClevrDataset(data_dir=self.data_dir,
                                            split=train_split + cogent,
                                            sample=sample,
                                            **cfg.DATASET.params)
                self.dataloader = DataLoader(dataset=self.dataset,
                                             batch_size=cfg.TRAIN.BATCH_SIZE,
                                             shuffle=True,
                                             num_workers=cfg.WORKERS,
                                             drop_last=True,
                                             collate_fn=clevr_collate_fn)

            self.dataset_val = ClevrDataset(data_dir=self.data_dir,
                                            split=eval_split + cogent,
                                            sample=sample,
                                            **cfg.DATASET.params)
            self.dataloader_val = DataLoader(dataset=self.dataset_val,
                                             batch_size=cfg.TEST_BATCH_SIZE,
                                             drop_last=False,
                                             shuffle=False,
                                             num_workers=cfg.WORKERS,
                                             collate_fn=clevr_collate_fn)

        elif cfg.DATASET.DATASET == 'gqa':
            if self.use_feats == 'spatial':
                gqa_collate_fn = collate_fn_gqa
            elif self.use_feats == 'objects':
                gqa_collate_fn = collate_fn_gqa_objs
            if cfg.TRAIN.FLAG:
                self.dataset = GQADataset(data_dir=self.data_dir,
                                          split=train_split,
                                          sample=sample,
                                          use_feats=self.use_feats,
                                          **cfg.DATASET.params)
                self.dataloader = DataLoader(dataset=self.dataset,
                                             batch_size=cfg.TRAIN.BATCH_SIZE,
                                             shuffle=True,
                                             num_workers=cfg.WORKERS,
                                             drop_last=True,
                                             collate_fn=gqa_collate_fn)

            self.dataset_val = GQADataset(data_dir=self.data_dir,
                                          split=eval_split,
                                          sample=sample,
                                          use_feats=self.use_feats,
                                          **cfg.DATASET.params)
            self.dataloader_val = DataLoader(dataset=self.dataset_val,
                                             batch_size=cfg.TEST_BATCH_SIZE,
                                             shuffle=False,
                                             num_workers=cfg.WORKERS,
                                             drop_last=False,
                                             collate_fn=gqa_collate_fn)

        # load model
        self.vocab = load_vocab(cfg)
        self.model, self.model_ema = mac.load_MAC(cfg, self.vocab)

        self.weight_moving_average(alpha=0)
        if cfg.TRAIN.RADAM:
            self.optimizer = RAdam(self.model.parameters(), lr=self.lr)
        else:
            self.optimizer = optim.Adam(self.model.parameters(), lr=self.lr)

        self.start_epoch = 0
        if cfg.resume_model:
            location = 'cuda' if cfg.CUDA else 'cpu'
            state = torch.load(cfg.resume_model, map_location=location)
            self.model.load_state_dict(state['model'])
            self.optimizer.load_state_dict(state['optim'])
            self.start_epoch = state['iter'] + 1
            state = torch.load(cfg.resume_model_ema, map_location=location)
            self.model_ema.load_state_dict(state['model'])

        if cfg.start_epoch is not None:
            self.start_epoch = cfg.start_epoch

        self.previous_best_acc = 0.0
        self.previous_best_epoch = 0
        self.previous_best_loss = 100
        self.previous_best_loss_epoch = 0

        self.total_epoch_loss = 0
        self.prior_epoch_loss = 10

        self.print_info()
        self.loss_fn = torch.nn.CrossEntropyLoss().cuda()

        self.comet_exp = Experiment(
            project_name=cfg.COMET_PROJECT_NAME,
            api_key=os.getenv('COMET_API_KEY'),
            workspace=os.getenv('COMET_WORKSPACE'),
            disabled=not cfg.logcomet,
        )
        if cfg.logcomet:
            exp_name = cfg_to_exp_name(cfg)
            print(exp_name)
            self.comet_exp.set_name(exp_name)
            self.comet_exp.log_parameters(flatten_json_iterative_solution(cfg))
            self.comet_exp.log_asset(self.logfile)
            self.comet_exp.log_asset_data(json.dumps(cfg, indent=4),
                                          file_name='cfg.json')
            self.comet_exp.set_model_graph(str(self.model))
            if cfg.cfg_file:
                self.comet_exp.log_asset(cfg.cfg_file)

        with open(os.path.join(self.path, 'cfg.json'), 'w') as f:
            json.dump(cfg, f, indent=4)

    def print_info(self):
        print('Using config:')
        pprint.pprint(self.cfg)
        print("\n")

        pprint.pprint("Size of train dataset: {}".format(len(self.dataset)))
        # print("\n")
        pprint.pprint("Size of val dataset: {}".format(len(self.dataset_val)))
        print("\n")

        print("Using MAC-Model:")
        pprint.pprint(self.model)
        print("\n")

    def weight_moving_average(self, alpha=0.999):
        for param1, param2 in zip(self.model_ema.parameters(),
                                  self.model.parameters()):
            param1.data *= alpha
            param1.data += (1.0 - alpha) * param2.data

    def set_mode(self, mode="train"):
        if mode == "train":
            self.model.train()
            self.model_ema.train()
        else:
            self.model.eval()
            self.model_ema.eval()

    def reduce_lr(self):
        epoch_loss = self.total_epoch_loss  # / float(len(self.dataset) // self.batch_size)
        lossDiff = self.prior_epoch_loss - epoch_loss
        if ((lossDiff < 0.015 and self.prior_epoch_loss < 0.5 and self.lr > 0.00002) or \
            (lossDiff < 0.008 and self.prior_epoch_loss < 0.15 and self.lr > 0.00001) or \
            (lossDiff < 0.003 and self.prior_epoch_loss < 0.10 and self.lr > 0.000005)):
            self.lr *= 0.5
            print("Reduced learning rate to {}".format(self.lr))
            for param_group in self.optimizer.param_groups:
                param_group['lr'] = self.lr
        self.prior_epoch_loss = epoch_loss
        self.total_epoch_loss = 0

    def save_models(self, iteration):
        save_model(self.model,
                   self.optimizer,
                   iteration,
                   self.model_dir,
                   model_name="model")
        save_model(self.model_ema,
                   None,
                   iteration,
                   self.model_dir,
                   model_name="model_ema")

    def train_epoch(self, epoch):
        cfg = self.cfg
        total_loss = 0.
        total_correct = 0
        total_samples = 0

        self.labeled_data = iter(self.dataloader)
        self.set_mode("train")

        dataset = tqdm(self.labeled_data, total=len(self.dataloader), ncols=20)

        for data in dataset:
            ######################################################
            # (1) Prepare training data
            ######################################################
            image, question, question_len, answer = data['image'], data[
                'question'], data['question_length'], data['answer']
            answer = answer.long()
            question = Variable(question)
            answer = Variable(answer)

            if cfg.CUDA:
                if self.use_feats == 'spatial':
                    image = image.cuda()
                elif self.use_feats == 'objects':
                    image = [e.cuda() for e in image]
                question = question.cuda()
                answer = answer.cuda().squeeze()
            else:
                answer = answer.squeeze()

            ############################
            # (2) Train Model
            ############################
            self.optimizer.zero_grad()

            scores = self.model(image, question, question_len)
            loss = self.loss_fn(scores, answer)
            loss.backward()

            if self.cfg.TRAIN.CLIP_GRADS:
                torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                               self.cfg.TRAIN.CLIP)

            self.optimizer.step()
            self.weight_moving_average()

            ############################
            # (3) Log Progress
            ############################
            correct = scores.detach().argmax(1) == answer
            total_correct += correct.sum().cpu().item()
            total_loss += loss.item() * answer.size(0)
            total_samples += answer.size(0)

            avg_loss = total_loss / total_samples
            train_accuracy = total_correct / total_samples
            # accuracy = correct.sum().cpu().numpy() / answer.shape[0]

            # if avg_loss == 0:
            #     avg_loss = loss.item()
            #     train_accuracy = accuracy
            # else:
            #     avg_loss = 0.99 * avg_loss + 0.01 * loss.item()
            #     train_accuracy = 0.99 * train_accuracy + 0.01 * accuracy
            # self.total_epoch_loss += loss.item() * answer.size(0)

            dataset.set_description(
                'Epoch: {}; Avg Loss: {:.5f}; Avg Train Acc: {:.5f}'.format(
                    epoch + 1, avg_loss, train_accuracy))

        self.total_epoch_loss = avg_loss

        metrics = {
            "loss": avg_loss,
            "accuracy": train_accuracy,
            "avg_loss": avg_loss,  # For Comet
            "avg_accuracy": train_accuracy,  # For Comet
        }
        return metrics

    def train(self):
        cfg = self.cfg
        print("Start Training")
        for epoch in range(self.start_epoch, self.max_epochs):

            with self.comet_exp.train():
                metrics = self.train_epoch(epoch)
                self.reduce_lr()
                metrics['epoch'] = epoch + 1
                metrics['lr'] = self.lr
                self.comet_exp.log_metrics(
                    metrics,
                    epoch=epoch + 1,
                )

            with self.comet_exp.validate():
                metrics = self.log_results(epoch, metrics)
                metrics['epoch'] = epoch + 1
                metrics['lr'] = self.lr
                self.comet_exp.log_metrics(
                    metrics,
                    epoch=epoch + 1,
                )

            if cfg.TRAIN.EALRY_STOPPING:
                if epoch - cfg.TRAIN.PATIENCE == self.previous_best_epoch:
                    # if epoch - cfg.TRAIN.PATIENCE == self.previous_best_loss_epoch:
                    print('Early stop')
                    break

        self.comet_exp.log_asset(self.logfile)
        self.save_models(self.max_epochs)
        self.writer.close()
        print("Finished Training")
        print(
            f"Highest validation accuracy: {self.previous_best_acc} at epoch {self.previous_best_epoch}"
        )

    def log_results(self, epoch, metrics, max_eval_samples=None):
        epoch += 1
        self.writer.add_scalar("avg_loss", metrics["loss"], epoch)
        self.writer.add_scalar("train_accuracy", metrics["accuracy"], epoch)

        metrics = self.calc_accuracy("validation",
                                     max_samples=max_eval_samples)
        self.writer.add_scalar("val_accuracy_ema", metrics['acc_ema'], epoch)
        self.writer.add_scalar("val_accuracy", metrics['acc'], epoch)
        self.writer.add_scalar("val_loss_ema", metrics['loss_ema'], epoch)
        self.writer.add_scalar("val_loss", metrics['loss'], epoch)

        print(
            "Epoch: {epoch}\tVal Acc: {acc},\tVal Acc EMA: {acc_ema},\tAvg Loss: {loss},\tAvg Loss EMA: {loss_ema},\tLR: {lr}"
            .format(epoch=epoch, lr=self.lr, **metrics))

        if metrics['acc'] > self.previous_best_acc:
            self.previous_best_acc = metrics['acc']
            self.previous_best_epoch = epoch
        if metrics['loss'] < self.previous_best_loss:
            self.previous_best_loss = metrics['loss']
            self.previous_best_loss_epoch = epoch

        if epoch % self.snapshot_interval == 0:
            self.save_models(epoch)

        return metrics

    def calc_accuracy(self, mode="train", max_samples=None):
        self.set_mode("validation")

        if mode == "train":
            loader = self.dataloader
        # elif (mode == "validation") or (mode == 'test'):
        #     loader = self.dataloader_val
        else:
            loader = self.dataloader_val

        total_correct = 0
        total_correct_ema = 0
        total_samples = 0
        total_loss = 0.
        total_loss_ema = 0.
        pbar = tqdm(loader, total=len(loader), desc=mode.upper(), ncols=20)
        for data in pbar:

            image, question, question_len, answer = data['image'], data[
                'question'], data['question_length'], data['answer']
            answer = answer.long()
            question = Variable(question)
            answer = Variable(answer)

            if self.cfg.CUDA:
                if self.use_feats == 'spatial':
                    image = image.cuda()
                elif self.use_feats == 'objects':
                    image = [e.cuda() for e in image]
                question = question.cuda()
                answer = answer.cuda().squeeze()

            with torch.no_grad():
                scores = self.model(image, question, question_len)
                scores_ema = self.model_ema(image, question, question_len)

                loss = self.loss_fn(scores, answer)
                loss_ema = self.loss_fn(scores_ema, answer)

            correct = scores.detach().argmax(1) == answer
            correct_ema = scores_ema.detach().argmax(1) == answer

            total_correct += correct.sum().cpu().item()
            total_correct_ema += correct_ema.sum().cpu().item()

            total_loss += loss.item() * answer.size(0)
            total_loss_ema += loss_ema.item() * answer.size(0)

            total_samples += answer.size(0)

            avg_acc = total_correct / total_samples
            avg_acc_ema = total_correct_ema / total_samples
            avg_loss = total_loss / total_samples
            avg_loss_ema = total_loss_ema / total_samples

            pbar.set_postfix({
                'Acc': f'{avg_acc:.5f}',
                'Acc Ema': f'{avg_acc_ema:.5f}',
                'Loss': f'{avg_loss:.5f}',
                'Loss Ema': f'{avg_loss_ema:.5f}',
            })

        return dict(acc=avg_acc,
                    acc_ema=avg_acc_ema,
                    loss=avg_loss,
                    loss_ema=avg_loss_ema)
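
weight_moving_average above maintains an exponential moving average of the model weights (calling it once with alpha=0 copies the live model into the EMA copy). A standalone sketch of the same update rule:

import torch

@torch.no_grad()
def ema_update(ema_model: torch.nn.Module, model: torch.nn.Module,
               alpha: float = 0.999) -> None:
    # ema_param <- alpha * ema_param + (1 - alpha) * param
    for p_ema, p in zip(ema_model.parameters(), model.parameters()):
        p_ema.mul_(alpha).add_(p, alpha=1.0 - alpha)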
Example #27
0
def main(_):
    experiment = Experiment(api_key="xXtJguCo8yFdU7dpjEpo6YbHw",
                            project_name=args.experiment_name)
    hyper_params = {
        "learning_rate": args.lr,
        "num_epochs": args.max_epoch,
        "batch_size": args.single_batch_size,
        "alpha": args.alpha,
        "beta": args.beta,
        "gamma": args.gamma,
        "loss": args.loss
    }
    experiment.log_multiple_params(hyper_params)

    # TODO: split file support
    with tf.Graph().as_default():
        global save_model_dir
        start_epoch = 0
        global_counter = 0

        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=cfg.GPU_MEMORY_FRACTION,
            visible_device_list=cfg.GPU_AVAILABLE,
            allow_growth=True)
        config = tf.ConfigProto(
            gpu_options=gpu_options,
            device_count={
                "GPU": cfg.GPU_USE_COUNT,
            },
            allow_soft_placement=True,
            log_device_placement=False,
        )
        with tf.Session(config=config) as sess:
            # sess=tf_debug.LocalCLIDebugWrapperSession(sess,ui_type='readline')
            model = RPN3D(cls=cfg.DETECT_OBJ,
                          single_batch_size=args.single_batch_size,
                          learning_rate=args.lr,
                          max_gradient_norm=5.0,
                          alpha=args.alpha,
                          beta=args.beta,
                          gamma=args.gamma,
                          loss_type=args.loss,
                          avail_gpus=cfg.GPU_AVAILABLE.split(','))
            # param init/restore
            if tf.train.get_checkpoint_state(save_model_dir):
                print("Reading model parameters from %s" % save_model_dir)
                model.saver.restore(sess,
                                    tf.train.latest_checkpoint(save_model_dir))
                start_epoch = model.epoch.eval() + 1
                global_counter = model.global_step.eval() + 1
            else:
                print("Created model with fresh parameters.")
                tf.global_variables_initializer().run()

            # train and validate
            is_summary = False

            summary_interval = 5
            summary_val_interval = 10
            summary_writer = tf.summary.FileWriter(log_dir, sess.graph)
            experiment.set_model_graph(sess.graph)

            # training
            with experiment.train():
                for epoch in range(start_epoch, args.max_epoch):
                    counter = 0
                    batch_time = time.time()
                    experiment.log_current_epoch(epoch)

                    for batch in iterate_data(
                            train_dir,
                            shuffle=True,
                            aug=True,
                            is_testset=False,
                            batch_size=args.single_batch_size *
                            cfg.GPU_USE_COUNT,
                            multi_gpu_sum=cfg.GPU_USE_COUNT):

                        counter += 1
                        global_counter += 1
                        experiment.set_step(global_counter)
                        if counter % summary_interval == 0:
                            is_summary = True
                        else:
                            is_summary = False
                        epochs = args.max_epoch
                        start_time = time.time()
                        ret = model.train_step(sess,
                                               batch,
                                               train=True,
                                               summary=is_summary)
                        forward_time = time.time() - start_time
                        batch_time = time.time() - batch_time
                        param = ret
                        params = {
                            "loss": param[0],
                            "cls_loss": param[1],
                            "cls_pos_loss": param[2],
                            "cls_neg_loss": param[3]
                        }
                        experiment.log_multiple_metrics(params)
                        # print(ret)
                        print(
                            'train: {} @ epoch:{}/{} loss: {:.4f} cls_loss: {:.4f} cls_pos_loss: {:.4f} cls_neg_loss: {:.4f} forward time: {:.4f} batch time: {:.4f}'
                            .format(counter, epoch, epochs, ret[0], ret[1],
                                    ret[2], ret[3], forward_time, batch_time))
                        # with open('log/train.txt', 'a') as f:
                        # f.write( 'train: {} @ epoch:{}/{} loss: {:.4f} cls_loss: {:.4f} cls_pos_loss: {:.4f} cls_neg_loss: {:.4f} forward time: {:.4f} batch time: {:.4f}'.format(counter,epoch, epochs, ret[0], ret[1], ret[2], ret[3], forward_time, batch_time))

                        #print(counter, summary_interval, counter % summary_interval)
                        if counter % summary_interval == 0:
                            print("summary_interval now")
                            summary_writer.add_summary(ret[-1], global_counter)

                        #print(counter, summary_val_interval, counter % summary_val_interval)
                        if counter % summary_val_interval == 0:
                            print("summary_val_interval now")
                            batch = sample_test_data(
                                val_dir,
                                args.single_batch_size * cfg.GPU_USE_COUNT,
                                multi_gpu_sum=cfg.GPU_USE_COUNT)

                            ret = model.validate_step(sess,
                                                      batch,
                                                      summary=True)
                            summary_writer.add_summary(ret[-1], global_counter)

                            try:
                                ret = model.predict_step(sess,
                                                         batch,
                                                         summary=True)
                                summary_writer.add_summary(
                                    ret[-1], global_counter)
                            except Exception as e:
                                print("prediction skipped due to error: {}".format(e))

                        if check_if_should_pause(args.tag):
                            model.saver.save(sess,
                                             os.path.join(
                                                 save_model_dir, 'checkpoint'),
                                             global_step=model.global_step)
                            print('pause and save model @ {} steps:{}'.format(
                                save_model_dir, model.global_step.eval()))
                            sys.exit(0)

                        batch_time = time.time()
                    experiment.log_epoch_end(epoch)
                    sess.run(model.epoch_add_op)

                    model.saver.save(sess,
                                     os.path.join(save_model_dir,
                                                  'checkpoint'),
                                     global_step=model.global_step)

                    # dump test data every 10 epochs
                    if (epoch + 1) % 10 == 0:
                        # create output folder
                        os.makedirs(os.path.join(args.output_path, str(epoch)),
                                    exist_ok=True)
                        os.makedirs(os.path.join(args.output_path, str(epoch),
                                                 'data'),
                                    exist_ok=True)
                        if args.vis:
                            os.makedirs(os.path.join(args.output_path,
                                                     str(epoch), 'vis'),
                                        exist_ok=True)

                        for batch in iterate_data(
                                val_dir,
                                shuffle=False,
                                aug=False,
                                is_testset=False,
                                batch_size=args.single_batch_size *
                                cfg.GPU_USE_COUNT,
                                multi_gpu_sum=cfg.GPU_USE_COUNT):

                            if args.vis:
                                tags, results, front_images, bird_views, heatmaps = model.predict_step(
                                    sess, batch, summary=False, vis=True)
                            else:
                                tags, results = model.predict_step(
                                    sess, batch, summary=False, vis=False)

                            for tag, result in zip(tags, results):
                                of_path = os.path.join(args.output_path,
                                                       str(epoch), 'data',
                                                       tag + '.txt')
                                with open(of_path, 'w+') as f:
                                    labels = box3d_to_label(
                                        [result[:, 1:8]], [result[:, 0]],
                                        [result[:, -1]],
                                        coordinate='lidar')[0]
                                    for line in labels:
                                        f.write(line)
                                    print('write out {} objects to {}'.format(
                                        len(labels), tag))
                            # dump visualizations
                            if args.vis:
                                for tag, front_image, bird_view, heatmap in zip(
                                        tags, front_images, bird_views,
                                        heatmaps):
                                    front_img_path = os.path.join(
                                        args.output_path, str(epoch), 'vis',
                                        tag + '_front.jpg')
                                    bird_view_path = os.path.join(
                                        args.output_path, str(epoch), 'vis',
                                        tag + '_bv.jpg')
                                    heatmap_path = os.path.join(
                                        args.output_path, str(epoch), 'vis',
                                        tag + '_heatmap.jpg')
                                    cv2.imwrite(front_img_path, front_image)
                                    cv2.imwrite(bird_view_path, bird_view)
                                    cv2.imwrite(heatmap_path, heatmap)

                        # execute evaluation code
                        cmd_1 = "./kitti_eval/launch_test.sh"
                        cmd_2 = os.path.join(args.output_path, str(epoch))
                        cmd_3 = os.path.join(args.output_path, str(epoch),
                                             'log')
                        os.system(" ".join([cmd_1, cmd_2, cmd_3]))

            print('train done. total epoch:{} iter:{}'.format(
                epoch, model.global_step.eval()))

            # finally save the model
            model.saver.save(sess,
                             os.path.join(save_model_dir, 'checkpoint'),
                             global_step=model.global_step)
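
The TensorFlow loop above keeps a manual global_counter and calls experiment.set_step before logging, so every subsequent metric attaches to that step. A minimal sketch of the pattern (disabled=True so the sketch runs without credentials):

from comet_ml import Experiment

experiment = Experiment(disabled=True)
global_step = 0
for epoch in range(2):
    for batch_loss in (0.9, 0.7, 0.5):      # stand-in loss values
        global_step += 1
        experiment.set_step(global_step)
        experiment.log_metric("loss", batch_loss)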
Example #28
0
def train(args, use_comet: bool = True):

    data_cls = funcs[args['dataset']]
    model_cls = funcs[args['model']]
    network = funcs[args['network']]

    print ('[INFO] Getting dataset...')
    data = data_cls()
    data.load_data()
    (x_train, y_train), (x_test, y_test) = (data.x_train, data.y_train), (data.x_test, data.y_test)
    classes = data.mapping
    
    # #Used for testing only
    # x_train = x_train[:100, :, :]
    # y_train = y_train[:100, :]
    # x_test = x_test[:100, :, :]
    # y_test = y_test[:100, :]
    # print ('[INFO] Training shape: ', x_train.shape, y_train.shape)
    # print ('[INFO] Test shape: ', x_test.shape, y_test.shape)
    # #delete these lines

    # hold out 20% of the test set as a validation split
    (x_test, x_valid, y_test, y_valid) = train_test_split(x_test, y_test, test_size=0.2, random_state=42)

    print ('[INFO] Training shape: ', x_train.shape, y_train.shape)
    print ('[INFO] Validation shape: ', x_valid.shape, y_valid.shape)
    print ('[INFO] Test shape: ', x_test.shape, y_test.shape)

    print ('[INFO] Setting up the model..')
    if args['network'] == 'lstmctc':
        network_args = {'backbone' : args['backbone'],
                        'seq_model' : args['seq'],
                        'bi' : args['bi']
                        }
        model = model_cls(network, data_cls, network_args)
    else:
        model = model_cls(network, data_cls)
    print (model)
    
    dataset = dict({
        'x_train' : x_train,
        'y_train' : y_train,
        'x_valid' : x_valid,
        'y_valid' : y_valid,
        'x_test' : x_test,
        'y_test' : y_test
    })

    if use_comet and not args['find_lr']:
        #create an experiment with your api key
        experiment = Experiment(api_key='WVBNRAfMLCBWslJAAsffxM4Gz',
                                project_name='iam_lines',
                                auto_param_logging=False)
        
        print ('[INFO] Starting Training...')
        #will log metrics with the prefix 'train_'   
        with experiment.train():
            _ = train_model(
                    model,
                    dataset,
                    batch_size=args['batch_size'],
                    epochs=args['epochs'],
                    name=args['network']
                    )

        print('[INFO] Starting Testing...')
        # will log metrics with the prefix 'test_'
        with experiment.test():
            score = model.evaluate(dataset, int(args['batch_size']))
            print(f'[INFO] Test evaluation: {score*100}...')
            metrics = {
                'accuracy': score
            }
            experiment.log_metrics(metrics)

        experiment.log_parameters(args)
        experiment.log_dataset_hash(x_train) #creates and logs a hash of your data 
        experiment.end()

    elif use_comet and args['find_lr']:

        _ = train_model(
                    model,
                    dataset,
                    batch_size=args['batch_size'],
                    epochs=args['epochs'],
                    FIND_LR=args['find_lr'],
                    name=args['network']
                    )

    else:

        print('[INFO] Starting Training...')
        train_model(
            model,
            dataset,
            batch_size=args['batch_size'],
            epochs=args['epochs'],
            name=args['network']
            )
        print('[INFO] Starting Testing...')
        score = model.evaluate(dataset, args['batch_size'])
        print(f'[INFO] Test evaluation: {score*100}...')

    if args['weights']:
        model.save_weights()
    
    if args['save_model']:
        model.save_model()
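
This function reappears as Example No. 30 below with one notable difference: there the validation set is carved out of the test data with stratification for equal class distribution, while here it is a plain 80/20 split. A self-contained sketch of both patterns on synthetic one-hot data (all shapes and names are illustrative only):

import numpy as np
from sklearn.model_selection import train_test_split

labels = np.arange(100) % 10       # exactly 10 samples per class
y = np.eye(10)[labels]             # one-hot targets, as in the snippets
x = np.random.rand(100, 28, 28)    # dummy images

# Plain 80/20 split (no class balancing), as in the function above.
x_test, x_valid, y_test, y_valid = train_test_split(
    x, y, test_size=0.2, random_state=42)

# Stratified 90/10 split, as in Example No. 30: recover integer labels
# from the one-hot rows and pass them via stratify.
x_test_s, x_valid_s, y_test_s, y_valid_s = train_test_split(
    x, y, test_size=0.1, stratify=labels, random_state=42)
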
Example No. 29
           kernel_size=params['filter_size'],
           padding='same',
           activation=params['activation']))
model.add(Dropout(params['dropout']))

model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',
              optimizer=params['optimizer'],
              metrics=['accuracy'])
# print model.summary() so it is preserved automatically in the `Output` tab
print(model.summary())
params.update({'total_number_of_parameters': model.count_params()})

#will log metrics with the prefix 'train_'
with experiment.train():
    model.fit(X_train,
              y_train,
              epochs=params['epochs'],
              batch_size=params['batch_size'],
              verbose=1,
              validation_data=(X_test, y_test))

#will log metrics with the prefix 'test_'
with experiment.test():
    loss, accuracy = model.evaluate(X_test, y_test)
    metrics = {'loss': loss, 'accuracy': accuracy}
    experiment.log_multiple_metrics(metrics)

experiment.log_multiple_params(params)
experiment.log_dataset_hash(X_train)  #creates and logs a hash of your data
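
The log_multiple_metrics and log_multiple_params calls above are the older comet_ml names for what the surrounding examples call log_metrics and log_parameters. The train/test context managers themselves simply prefix whatever is logged inside them; a minimal sketch of that behaviour (the API key is a placeholder):

from comet_ml import Experiment

experiment = Experiment(api_key='INSERT API KEY', project_name='demo')

with experiment.train():
    experiment.log_metric('accuracy', 0.91)   # recorded as train_accuracy

with experiment.test():
    experiment.log_metric('accuracy', 0.88)   # recorded as test_accuracy

experiment.end()
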
Example No. 30
def train(args, use_comet: bool = True):

    data_cls = funcs[args['dataset']]
    model_cls = funcs[args['model']]
    network = funcs[args['network']]

    print('[INFO] Getting dataset...')
    data = data_cls()
    (x_train, y_train), (x_test, y_test) = data.load_data()
    classes = data.mapping

    # #Used for testing only
    # x_train = x_train[:100, :, :]
    # y_train = y_train[:100, :]
    # x_test = x_test[:100, :, :]
    # y_test = y_test[:100, :]
    # print ('[INFO] Training shape: ', x_train.shape, y_train.shape)
    # print ('[INFO] Test shape: ', x_test.shape, y_test.shape)
    # #delete these lines

    y_test_labels = [
        np.where(y_test[idx] == 1)[0][0] for idx in range(len(y_test))
    ]
    # split the test data 90% test / 10% validation with equal class distribution
    (x_test, x_valid, y_test,
     y_valid) = train_test_split(x_test,
                                 y_test,
                                 test_size=0.1,
                                 stratify=y_test_labels,
                                 random_state=42)

    print('[INFO] Training shape: ', x_train.shape, y_train.shape)
    print('[INFO] Validation shape: ', x_valid.shape, y_valid.shape)
    print('[INFO] Test shape: ', x_test.shape, y_test.shape)

    print('[INFO] Setting up the model..')
    model = model_cls(network, data_cls)
    print(model)

    dataset = {
        'x_train': x_train,
        'y_train': y_train,
        'x_valid': x_valid,
        'y_valid': y_valid,
        'x_test': x_test,
        'y_test': y_test
    }

    if use_comet and not args['find_lr']:
        # create an experiment with your api key
        experiment = Experiment(api_key='INSERT API KEY',
                                project_name='emnist',
                                auto_param_logging=False)

        print('[INFO] Starting Training...')
        #will log metrics with the prefix 'train_'
        with experiment.train():
            _ = train_model(model,
                            dataset,
                            batch_size=args['batch_size'],
                            epochs=args['epochs'],
                            name=args['network'])

        print('[INFO] Starting Testing...')
        #will log metrics with the prefix 'test_'
        with experiment.test():
            loss, score = model.evaluate(dataset, args['batch_size'])
            print(f'[INFO] Test evaluation: {score*100}')
            metrics = {'loss': loss, 'accuracy': score}
            experiment.log_metrics(metrics)

        experiment.log_parameters(args)
        experiment.log_dataset_hash(
            x_train)  #creates and logs a hash of your data
        experiment.end()

    elif use_comet and args['find_lr']:

        _ = train_model(model,
                        dataset,
                        batch_size=args['batch_size'],
                        epochs=args['epochs'],
                        FIND_LR=args['find_lr'],
                        name=args['network'])

    else:

        print('[INFO] Starting Training...')
        train_model(model,
                    dataset,
                    batch_size=args['batch_size'],
                    epochs=args['epochs'],
                    name=args['network'])
        print('[INFO] Starting Testing...')
        loss, score = model.evaluate(dataset, args['batch_size'])
        print(f'[INFO] Test evaluation: {score*100}')

    if args['weights']:
        model.save_weights()

    if args['save_model']:
        model.save_model()
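
The label-recovery loop near the top of this example (np.where over each one-hot row) has a vectorized equivalent in a single np.argmax call; a small self-contained check on synthetic rows:

import numpy as np

y_test = np.eye(10)[[3, 7, 7, 0]]   # four synthetic one-hot rows

# Loop as written above: index of the 1 in each row.
labels_loop = [np.where(y_test[idx] == 1)[0][0] for idx in range(len(y_test))]

# Vectorized equivalent.
labels_vec = np.argmax(y_test, axis=1)

assert labels_loop == list(labels_vec)   # both give [3, 7, 7, 0]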