コード例 #1
0
ファイル: train.py プロジェクト: hbertrand/cookiecutter-pyml
def train(model,
          optimizer,
          loss_fun,
          train_loader,
          dev_loader,
          patience,
          output,
          max_epoch,
          use_progress_bar=True,
          start_from_scratch=False):
    """Run the training loop while shielding Orion from CUDA OOM crashes.

    Delegates the actual training to ``pytorch_train_impl``. When Orion is
    driving the hyper-parameter search and the model does not fit in GPU
    memory, a sentinel (bad) score is reported instead of crashing, so the
    optimizer learns to avoid over-sized configurations.
    """
    try:
        best_dev_metric = pytorch_train_impl(dev_loader, loss_fun, max_epoch,
                                             model, optimizer, output,
                                             patience, train_loader,
                                             use_progress_bar,
                                             start_from_scratch)
    except RuntimeError as err:
        # Only absorb the error for the Orion-managed OOM case; anything
        # else is a genuine failure and must propagate.
        is_oom = 'CUDA out of memory' in str(err)
        if not (orion.client.IS_ORION_ON and is_oom):
            raise err
        logger.error(err)
        logger.error(
            'model was out of memory - assigning a bad score to tell Orion to avoid'
            'too big model')
        # Sentinel: large negative metric becomes a huge (bad) objective below.
        best_dev_metric = -999

    # Orion always minimizes its objective, hence the sign flip on the metric.
    report_results([
        dict(
            name='dev_metric',
            type='objective',
            value=-best_dev_metric)
    ])
コード例 #2
0
ファイル: hyper_search.py プロジェクト: afansi/TT-Training
def main(_):
    """Run the main training process and report an Orion objective.

    The objective is the negated F1-score computed from the validation
    precision and recall (Orion minimizes, so smaller is better). When the
    run produced no metrics at all, a large penalty value is reported so
    the trial is scored as very poor.
    """
    run_metrics = main_process()
    default_value = 1000000.0
    orion_objective = default_value
    if run_metrics is not None:
        precision = run_metrics.get('val_precision', 0.0)
        recall = run_metrics.get('val_recall', 0.0)

        # Guard the F1 denominator against precision == recall == 0.
        denominator = precision + recall
        denominator = denominator if denominator > 0 else 1e-5

        f1_score = 2 * (precision * recall) / denominator

        # Negated because Orion minimizes the objective.
        orion_objective = -f1_score

    tf.logging.info("FOUND OBJECTIVE: {}".format(orion_objective))
    report_results([
        dict(name='orion_objective', type='objective', value=orion_objective)
    ])
コード例 #3
0
ファイル: train.py プロジェクト: jerpint/cookiecutter-pyml-1
def train(model, optimizer, loss_fun, train_loader, dev_loader, patience, output,
          max_epoch, use_progress_bar=True, start_from_scratch=False):  # pragma: no cover
    """Wrap the training loop and convert CUDA OOM errors into a bad score.

    When Orion is active, an out-of-memory failure is reported as a sentinel
    objective rather than crashing the trial, steering the search away from
    models that are too large.

    Args:
        model (obj): The neural network model object.
        optimizer (obj): Optimizer used during training.
        loss_fun (obj): Loss function that will be optimized.
        train_loader (obj): Dataloader for the training set.
        dev_loader (obj): Dataloader for the validation set.
        patience (int): max number of epochs without improving on `best_eval_score`.
            After this point, the train ends.
        output (str): Output directory.
        max_epoch (int): Max number of epochs to train for.
        use_progress_bar (bool): Use tqdm progress bar (can be disabled when logging).
        start_from_scratch (bool): Start training from scratch (ignore checkpoints)
    """
    try:
        best_dev_metric = train_impl(
            model, optimizer, loss_fun, train_loader, dev_loader, patience, output,
            max_epoch, use_progress_bar, start_from_scratch)
    except RuntimeError as err:
        is_oom_under_orion = (
            orion.client.IS_ORION_ON and 'CUDA out of memory' in str(err))
        if is_oom_under_orion:
            logger.error(err)
            logger.error('model was out of memory - assigning a bad score to tell Orion to avoid'
                         'too big model')
            # Sentinel metric: negated below into a very large objective.
            best_dev_metric = -999
        else:
            raise err

    # Orion minimizes its objective, so the dev metric is negated.
    report_results([dict(
        name='dev_metric',
        type='objective',
        value=-float(best_dev_metric))])
コード例 #4
0
def execute():
    """Execute a simple pipeline as an example.

    Reads a YAML configuration file given on the command line, evaluates
    the example function at ``config['x']``, and reports the objective and
    its gradient to Orion.
    """
    # 1. Receive inputs as you want
    parser = argparse.ArgumentParser()
    parser.add_argument('--configuration', required=True)
    inputs = parser.parse_args()

    with open(inputs.configuration, 'r') as f:
        config = yaml.safe_load(f)

    # 2. Perform computations
    y, dy = function(config['x'])

    # 3. Gather and report results
    report_results([
        dict(name='example_objective', type='objective', value=y),
        dict(name='example_gradient', type='gradient', value=[dy]),
    ])
コード例 #5
0
def execute():
    """Execute a simple pipeline as an example.

    Parses the trial point ``-x`` plus the metadata flags Orion passes to
    the black box. With ``--test-env`` set, verifies that every metadata
    argument matches the corresponding ``ORION_*`` environment variable,
    then evaluates the function and reports objective and gradient.
    """
    # 1. Receive inputs as you want
    parser = argparse.ArgumentParser()
    parser.add_argument("-x", type=float, required=True)
    parser.add_argument("--test-env", action="store_true")
    parser.add_argument("--experiment-id", type=str)
    parser.add_argument("--experiment-name", type=str)
    parser.add_argument("--experiment-version", type=str)
    parser.add_argument("--trial-id", type=str)
    parser.add_argument("--working-dir", type=str)

    inputs = parser.parse_args()

    if inputs.test_env:
        # Each CLI value must agree with the environment Orion exported.
        checks = [
            (inputs.experiment_id, "ORION_EXPERIMENT_ID"),
            (inputs.experiment_name, "ORION_EXPERIMENT_NAME"),
            (inputs.experiment_version, "ORION_EXPERIMENT_VERSION"),
            (inputs.trial_id, "ORION_TRIAL_ID"),
            (inputs.working_dir, "ORION_WORKING_DIR"),
        ]
        for cli_value, env_key in checks:
            assert cli_value == os.environ[env_key]

    # 2. Perform computations
    y, dy = function(inputs.x)

    # 3. Gather and report results
    report_results([
        dict(name="example_objective", type="objective", value=y),
        dict(name="example_gradient", type="gradient", value=[dy]),
    ])
コード例 #6
0
def execute():
    """Execute a simple pipeline as an example.

    Parses ``-x`` as the textual representation of a list of floats (e.g.
    "[1.0, 2.0]") plus an optional scalar ``-y``, evaluates the Rosenbrock
    function, and reports the result as the Orion objective.
    """
    # 1. Receive inputs as you want
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-x",
        type=str,
        required=True,
        help="Representation of a list of floating numbers of "
        "length at least 2.",
    )
    parser.add_argument(
        "-y",
        type=float,
        default=0,
        help="An optional float to check multi-dimensional inputs.",
    )
    inputs = parser.parse_args()

    # 2. Perform computations
    # Strip the surrounding brackets; numpy parses the comma-separated body.
    raw_list = inputs.x
    x = numpy.fromstring(raw_list[1:-1], sep=", ")
    objective = rosenbrock_function(x, inputs.y)

    # 3. Gather and report results
    report_results([dict(name="rosenbrock", type="objective", value=objective)])
コード例 #7
0
def main(argv=None):
    """Train a GCN on TCGA clinical task #113 over several trials and report
    the mean accuracy metric to Orion.

    Args:
        argv: optional argument list forwarded to ``parse_args``.
    """
    opt = parse_args(argv)

    tasks = TCGAMeta(download=True, preload=True)
    # Hard-coded task index — presumably a specific clinical task; TODO confirm.
    task = tasks[113]

    # Setup the results dictionary
    filename = "experiments/results/clinical-tasks.pkl"
    try:
        # NOTE(review): the file handle from open() is never closed, and the
        # broad `except Exception` silently falls back to a fresh DataFrame
        # on *any* failure (not just a missing file).
        results = pickle.load(open(filename, "rb"), encoding='latin1')
        print("Loaded Checkpointed Results")
    except Exception as e:
        print(e)
        # Despite the messages saying "Dictionary", this is a pandas DataFrame.
        results = pd.DataFrame(columns=[
            'task', 'acc_metric', 'model', 'graph', 'trial', 'train_size',
            'time_elapsed'
        ])
        print("Created a New Results Dictionary")

    train_size = 50
    trials = 3
    cuda = True
    exp = []  # per-trial metric values

    for trial in range(trials):
        # Trial index doubles as the seed so each trial is reproducible.
        model = GCN(cuda=cuda,
                    dropout=opt.dropout,
                    num_layer=opt.num_layer,
                    channels=opt.channels,
                    embedding=opt.embedding,
                    aggregation=opt.aggregation,
                    lr=opt.lr,
                    agg_reduce=opt.agg_reduce,
                    seed=trial)
        # Center the features, then scale by the *global* variance.
        # NOTE(review): .var() here is a single scalar, not per-feature —
        # confirm this is intended (and it re-applies on every trial).
        task._samples = task._samples - task._samples.mean(axis=0)
        task._samples = task._samples / task._samples.var()
        X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
            task._samples,
            task._labels,
            stratify=task._labels,
            train_size=train_size,
            test_size=len(task._labels) - train_size)
        # Adjacency from the GeneMania interaction graph, as sparse CSR.
        adj = sparse.csr_matrix(nx.to_numpy_matrix(GeneManiaGraph().nx_graph))
        model.fit(X_train, y_train, adj=adj)

        # Predict in chunks of 10 samples to bound memory usage.
        y_hat = []
        for chunk in get_every_n(X_test, 10):
            y_hat.extend(np.argmax(model.predict(chunk), axis=1).numpy())

        exp.append(model.metric(y_test, y_hat))
        print(exp)
    # Report the mean metric across trials; Orion treats it as the objective.
    report_results([{
        "name": "acc_metric",
        "type": "objective",
        "value": np.array(exp).mean()
    }])
コード例 #8
0
def main(argv=None):
    """Sweep neighborhood sizes around gene RPL4 and report the mean AUC.

    For each neighborhood size, trains a GCN to predict whether the RPL4
    expression value is positive, using only the genes in a BFS-sampled
    subgraph of the GeneMania graph, and scores it with ROC AUC.

    Args:
        argv: optional argument list forwarded to ``parse_args``.
    """
    opt = parse_args(argv)
    dataset = datasets.TCGADataset()
    # Center expression values per gene (column).
    dataset.df = dataset.df - dataset.df.mean(axis=0)

    gene_graph = GeneManiaGraph()
    # 16300 is treated below as "use the full graph" — presumably the total
    # number of genes in the GeneMania graph; TODO confirm.
    search_num_genes = [50, 100, 200, 300, 500, 1000, 2000, 4000, 8000, 16300]
    test_size = 300
    cuda = torch.cuda.is_available()
    exp = []  # per-size AUC values
    for num_genes in search_num_genes:
        # NOTE(review): start_time is recorded but never used.
        start_time = time.time()
        gene = "RPL4"
        model = GCN(cuda=cuda,
                    dropout=opt.dropout,
                    num_layer=opt.num_layer,
                    channels=opt.channels,
                    embedding=opt.embedding,
                    aggregation=opt.aggregation,
                    lr=opt.lr,
                    agg_reduce=opt.agg_reduce)
        # Binary label: 1 where the gene's (centered) expression is positive.
        dataset.labels = dataset.df[gene].where(
            dataset.df[gene] > 0).notnull().astype("int")
        dataset.labels = dataset.labels.values if type(
            dataset.labels) == pd.Series else dataset.labels
        X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
            dataset.df,
            dataset.labels,
            stratify=dataset.labels,
            train_size=opt.train_size,
            test_size=opt.test_size,
            random_state=opt.seed)
        if num_genes == 16300:
            neighbors = gene_graph.nx_graph
        else:
            neighbors = gene_graph.bfs_sample_neighbors(gene, num_genes)

        # Restrict features to the sampled neighborhood.
        X_train = X_train[list(neighbors.nodes)].copy()
        X_test = X_test[list(neighbors.nodes)].copy()
        # Clamp the target gene's own column so the label cannot leak in.
        X_train[gene] = 1
        X_test[gene] = 1
        adj = sparse.csr_matrix(nx.to_numpy_matrix(neighbors))
        model.fit(X_train, y_train, adj=adj)

        y_hat = model.predict(X_test)
        y_hat = np.argmax(y_hat, axis=1)
        auc = sklearn.metrics.roc_auc_score(y_test,
                                            np.asarray(y_hat).flatten())
        # Free the model before the next (possibly larger) neighborhood.
        del model
        exp.append(auc)
    # Orion objective: mean AUC over all neighborhood sizes.
    report_results([{
        "name": "auc",
        "type": "objective",
        "value": np.array(exp).mean()
    }])
コード例 #9
0
def execute():
    """Execute a simple pipeline as an example.

    Evaluates the example function at the command-line point ``-x`` and
    reports both the objective value and its gradient to Orion.
    """
    # 1. Receive inputs as you want
    parser = argparse.ArgumentParser()
    parser.add_argument("-x", type=float, required=True)
    inputs = parser.parse_args()

    # 2. Perform computations
    objective, gradient = function(inputs.x)

    # 3. Gather and report results
    report_results([
        dict(name="example_objective", type="objective", value=objective),
        dict(name="example_gradient", type="gradient", value=[gradient]),
    ])
コード例 #10
0
def run(args):
    """Toy training loop with early stopping that reports to Orion.

    Loads hyper-parameters from ``args.config`` (YAML), logs them, trains for
    at most ``args.max_epoch`` epochs with a fixed patience of 10 epochs
    without dev-metric improvement, and finally reports the (negated) best
    dev metric as the Orion objective.

    Args:
        args: parsed CLI arguments; must expose ``config`` (path to a YAML
            file) and ``max_epoch`` (int).
    """
    with open(args.config, 'r') as stream:
        hyper_params = load(stream)
    # BUG FIX: iterating a dict directly yields only the keys, so the
    # original `for k, v in hyper_params:` raised at unpack time (or
    # silently misbehaved for 2-char string keys). `.items()` yields pairs.
    for k, v in hyper_params.items():
        log_param(k, v)
        logger.info('hp "{}" => "{}"'.format(k, v))

    patience = 10
    not_improving_since = 0
    best_dev_metric = None
    for e in range(args.max_epoch):

        # change this part to do the real training / evaluation on dev.
        loss = do_training()
        dev_metric = eval_on_dev()

        log_metric("loss", loss, step=e)
        log_metric("dev_metric", dev_metric, step=e)

        # Higher dev metric is better; reset patience on improvement.
        if best_dev_metric is None or dev_metric > best_dev_metric:
            best_dev_metric = dev_metric
            not_improving_since = 0
            save_model()
        else:
            not_improving_since += 1

        logger.info('\ndone epoch {} => loss {} - dev metric {} (not improving'
                    ' since {} epoch)'.format(e, loss, dev_metric,
                                              not_improving_since))

        if not_improving_since >= patience:
            logger.info('done! best dev metric is {}'.format(best_dev_metric))
            break

    # useful so we can easily sort models w.r.t. the best dev evaluation
    log_metric("best_dev_metric", best_dev_metric)

    report_results([
        dict(
            name='dev_metric',
            type='objective',
            # note the minus - cause orion is always trying to minimize (cit. from the guide)
            value=-best_dev_metric)
    ])
コード例 #11
0
    def on_train_end(self, logs=None):
        """Log timing / early-stopping info and report the objective to Orion.

        Parameters
        ----------
        logs : dict, optional
            dictionary containing the model evaluation metrics, by default None
        """
        self.end_time = datetime.datetime.now()
        logging.info(f"Ending training at {self.end_time}")
        logging.info(f"Training duration: {self.end_time - self.start_time}")
        if self.stopped_epoch > 0:
            logging.info(f"Early stopping at epoch {self.stopped_epoch}")
        log_metric("best_valid_acc", self.best_valid_acc)
        # Orion minimizes, so the best validation accuracy is negated.
        objective = dict(name="valid_acc",
                         type="objective",
                         value=-self.best_valid_acc)
        report_results([objective])
コード例 #12
0
ファイル: black_box.py プロジェクト: 5l1v3r1/orion-1
def execute():
    """Execute a simple pipeline as an example.

    Reads the trial point ``-x`` and a ``--fidelity`` level in [0, 10].
    Lower fidelity injects more observation noise into the evaluation.
    Reports the objective value and its gradient to Orion.

    Raises:
        ValueError: if ``--fidelity`` is outside [0, 10].
    """
    # 1. Receive inputs as you want
    parser = argparse.ArgumentParser()
    parser.add_argument('-x', type=float, required=True)
    parser.add_argument('--fidelity', type=int, default=10)
    inputs = parser.parse_args()

    # Validate explicitly rather than with `assert`: asserts are stripped
    # when Python runs with -O, which would silently disable this check.
    if not 0 <= inputs.fidelity <= 10:
        raise ValueError(
            '--fidelity must be in [0, 10], got {}'.format(inputs.fidelity))

    # Lower fidelity -> more noise; the epsilon keeps the noise strictly
    # positive even at full fidelity.
    noise = (1 - inputs.fidelity / 10) + 0.0001

    # 2. Perform computations
    y, dy = function(inputs.x, noise)

    # 3. Gather and report results
    results = list()
    results.append(dict(name='example_objective', type='objective', value=y))
    results.append(dict(name='example_gradient', type='gradient', value=[dy]))

    report_results(results)
コード例 #13
0
def execute():
    """Execute a simple pipeline as an example.

    Creates a per-trial directory under ``--dir`` (failing loudly if it
    already exists), evaluates the example function at ``-x``, and reports
    objective and gradient to Orion.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-x', type=float, required=True)
    parser.add_argument('--dir', type=str, required=True)
    parser.add_argument('--name', type=str, required=True)
    parser.add_argument('--other-name', type=str, required=True)
    inputs = parser.parse_args()

    # The directory must not pre-exist: exist_ok=False makes makedirs raise
    # OSError on a duplicate, which is the behaviour being exercised here.
    target_dir = os.path.join(inputs.dir, inputs.other_name,
                              "my-exp-{}".format(inputs.name))
    os.makedirs(target_dir, exist_ok=False)

    y, dy = function(inputs.x)

    report_results([
        dict(name='example_objective', type='objective', value=y),
        dict(name='example_gradient', type='gradient', value=[dy]),
    ])
コード例 #14
0
ファイル: train.py プロジェクト: lebrice/ift6758.github.io
def main(hparams: HyperParameters, train_config: TrainConfig):
    """Run one training experiment and (optionally) report to Orion.

    Trains with the given hyper-parameters, logs results to a per-experiment
    file, and — when a validation split is configured — reports the
    validation loss as the Orion objective.

    Args:
        hparams: hyper-parameter bundle for the model/training.
        train_config: experiment-level settings (name, log dir, validation
            fraction, ...).
    """
    print("Experiment name:", train_config.experiment_name)
    print("Hyperparameters:", hparams)
    print("Train_config:", train_config)

    # create the results path so it's directly possible to call the "tail"
    # program to stream the results as they come in.
    # NOTE(review): this assumes the "logs" directory already exists — only
    # train_config.log_dir is created below; verify against callers.
    experiment_results_file = os.path.join(
        "logs", train_config.experiment_name + "-results.txt")
    with open(experiment_results_file, "a") as runs_results_file:
        pass

    # In DEBUG mode train on the small local fixture instead of ~/Train.
    train_data_dir = os.path.join(os.path.curdir,
                                  "debug_data") if DEBUG else "~/Train"
    # Create the required directories if not present.
    os.makedirs(train_config.log_dir, exist_ok=True)

    print("Training directory:", train_config.log_dir)

    # Capture all training output into train_log.txt for later inspection.
    with utils.log_to_file(os.path.join(train_config.log_dir,
                                        "train_log.txt")):
        results = train(train_data_dir, hparams, train_config)

        print(f"Saved model weights are located at '{train_config.log_dir}'")

    log_results(results)

    using_validation_set = train_config.validation_data_fraction != 0.0
    if using_validation_set:
        # Imported lazily so non-Orion runs don't need orion installed.
        from orion.client import report_results
        # Missing loss falls back to +inf, i.e. the worst possible objective.
        report_results([
            dict(
                name='validation_loss',
                type='objective',
                value=results.metrics_dict.get("loss", np.Inf),
            )
        ])

    print("TRAINING COMPLETE")
コード例 #15
0
ファイル: main.py プロジェクト: kiminh/bert_reranker
def main():
    """CLI entry point for the BERT re-ranker: train, validate, predict, or
    export embeddings/weights depending on the flags given.

    Exactly one mode flag is expected (--train / --validate / --predict /
    --file-to-emb / --save-weights-to); in --train mode the best dev metric
    is reported to Orion as the (negated) objective.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--config",
        help="config file with generic hyper-parameters,  such as optimizer, "
        "batch_size, ... -  in yaml format",
        required=True,
    )
    parser.add_argument(
        "--gpu",
        help="list of gpu ids to use. default is cpu. example: --gpu 0 1",
        type=int,
        nargs="+"
    )
    parser.add_argument(
        "--validation-interval",
        help="how often to run validation in one epoch - "
        "e.g., 0.5 means halfway - default 0.5",
        type=float,
        default=0.5,
    )
    parser.add_argument("--output", help="where to store models", required=True)
    parser.add_argument(
        "--no-model-restoring",
        help="will not restore any previous model weights (" "even if present)",
        action="store_true",
    )
    parser.add_argument(
        "--train",
        # BUG FIX: the original help text was a copy-paste of --validate's
        # ("will not train - will just evaluate on dev"), which is the
        # opposite of what this flag does.
        help="will train the model",
        action="store_true",
    )
    parser.add_argument(
        "--validate",
        help="will not train - will just evaluate on dev",
        action="store_true",
    )
    parser.add_argument(
        "--predict", help="will predict on the json file you provide as an arg"
    )
    parser.add_argument(
        "--predict-outliers", help="will use the sklearn model to predict outliers",
        action="store_true"
    )
    parser.add_argument(
        "--file-to-emb", help="will use this file as input to generate embeddings"
    )
    parser.add_argument(
        "--write-emb-to", help="will write question embeddings to this file"
    )
    parser.add_argument(
        "--save-weights-to",
        help="will save ONLY the model weights (not the pytorch lightning object)"
        " to this file",
    )
    parser.add_argument("--predict-to", help="(optional) write predictions here)")
    parser.add_argument(
        "--redirect-log",
        help="will intercept any stdout/err and log it",
        action="store_true",
    )
    parser.add_argument(
        "--num-workers", help="number of workers - default 2", type=int, default=0
    )
    parser.add_argument(
        "--print-sentence-stats",
        help="will print stats on the data",
        action="store_true",
    )
    parser.add_argument(
        "--multiple-thresholds",
        help="will print results for various thresholds",
        action="store_true",
    )
    parser.add_argument('--log', help='log to this file (in addition to stdout/err)')
    parser.add_argument("--debug", help="will log more info", action="store_true")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)

    # will log to a file if provided (useful for orion on cluster)
    if args.log is not None:
        handler = WatchedFileHandler(args.log)
        formatter = logging.Formatter(logging.BASIC_FORMAT)
        handler.setFormatter(formatter)
        root = logging.getLogger()
        root.setLevel(logging.INFO)
        root.addHandler(handler)
    if args.redirect_log or args.log:
        # Funnel print()/stderr output through the logger as well.
        sys.stdout = LoggerWriter(logger.info)
        sys.stderr = LoggerWriter(logger.warning)

    with open(args.config, "r") as stream:
        hyper_params = load(stream, Loader=yaml.FullLoader)

    # The GPU env var (space-separated ids) is only a fallback when --gpu is
    # not given on the command line.
    if args.gpu is None and 'GPU' in os.environ:
        gpu_string = os.environ['GPU']
        gpu = [int(x) for x in gpu_string.strip().split()]
    else:
        gpu = args.gpu

    ckpt_to_resume, ret_trainee, trainer = init_model(
        hyper_params,
        args.num_workers,
        args.output,
        args.validation_interval,
        gpu,
        args.no_model_restoring,
        args.debug,
        args.print_sentence_stats
    )

    if args.train:
        trainer.fit(ret_trainee)
        best_dev_result = float(trainer.early_stop_callback.best.cpu().numpy())
        report_results([dict(
            name='dev_metric',
            type='objective',
            # note the minus - cause orion is always trying to minimize (cit. from the guide)
            value=-float(best_dev_result))])
    elif args.validate:
        trainer.test(ret_trainee)
    elif args.predict:
        if not args.predict_to:
            raise ValueError('--predict also requires --predict-to')

        model_ckpt = torch.load(ckpt_to_resume, map_location=torch.device("cpu"))
        ret_trainee.load_state_dict(model_ckpt["state_dict"])
        if args.predict_outliers:
            with open(os.path.join(args.output, SKLEARN_MODEL_FILE_NAME), 'rb') as file:
                sklearn_model = pickle.load(file)
            predictor = PredictorWithOutlierDetector(ret_trainee, sklearn_model)
        else:
            predictor = Predictor(ret_trainee)
        predictor.generate_predictions(
            json_file=args.predict,
            predict_to=args.predict_to,
            multiple_thresholds=args.multiple_thresholds
        )
    elif args.file_to_emb:
        if args.write_emb_to is None:
            raise ValueError('please specify also --write-emb-to')
        model_ckpt = torch.load(ckpt_to_resume, map_location=torch.device("cpu"))
        ret_trainee.load_state_dict(model_ckpt["state_dict"])
        generate_embeddings(
            ret_trainee,
            input_file=args.file_to_emb,
            out_file=args.write_emb_to
        )
    elif args.save_weights_to is not None:
        torch.save(ret_trainee.retriever.state_dict(), args.save_weights_to)
    else:
        # BUG FIX: the original message mentioned a nonexistent --test flag;
        # the actual prediction mode is --predict.
        logger.warning("please select one between --train / --validate / --predict")
コード例 #16
0
ファイル: main_triplet.py プロジェクト: turpaultn/walle
    embed_set10 = "final_test10"
    test_embed_dir10 = os.path.join(embed_dir, embed_set10)
    df_test_embed10, _ = calculate_embedding(test_dl10, model_triplet, savedir=test_embed_dir10,
                                             concatenate="append")
    test_embed10 = DataLoadDf(df_test_embed10, encode_function_label, transform=Compose(trans_embedding))
    test_embed_loader10 = DataLoader(test_embed10, batch_size=batch_size_classif, shuffle=False,
                                     num_workers=num_workers, drop_last=False)

    model_triplet = to_cpu(model_triplet)
    classif_model = to_cuda_if_available(classif_model)
    classif_model.eval()
    mean_test_results1 = measure_classif(classif_model, test_embed_loader1,
                                         classes=classes,
                                         suffix_print="test1")

    mean_test_results10 = measure_classif(classif_model, test_embed_loader10,
                                          classes=classes,
                                          suffix_print="test10")

    print(f"Time of the program: {time.time() - t}")
    from orion.client import report_results

    report_results(
        [dict(
            name="mean_test_results",
            type="objective",
            value=float(100 - classif_state["macro_measure_valid"] * 100)
        )
        ]
    )
コード例 #17
0
def main():
    """Train an SVLAE model on calcium-fluorescence data and report to Orion.

    Parses optional hyper-parameter overrides from the CLI (learning rate,
    KL/L2 schedules, scales), builds the data loaders, model, objective,
    optimizer and scheduler, runs the RunManager training loop, reports the
    best validation loss as the Orion objective, and saves summary figures.
    """
    args = parser.parse_args()

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    hyperparams = load_parameters(args.hyperparameter_path)

    # Accumulates a compact description of every override, used to build a
    # unique run name below.
    orion_hp_string = ''
    if args.lr or args.log10_lr:
        # --log10-lr takes precedence: the search is done in log space.
        if args.log10_lr:
            lr = 10**args.log10_lr
        else:
            lr = args.lr
        hyperparams['optimizer']['lr_init'] = lr
        # Floor the scheduler at 1/1000 of the initial learning rate.
        hyperparams['scheduler']['lr_min'] = lr * 1e-3
        orion_hp_string += 'lr= %.4f\n' % lr

    if args.kl_obs_dur:
        hyperparams['objective']['kl_obs'][
            'schedule_dur'] = args.kl_obs_dur * args.kl_obs_dur_scale
        orion_hp_string += 'kl_obs_dur= %i\n' % (args.kl_obs_dur *
                                                 args.kl_obs_dur_scale)

    if args.kl_obs_max:
        hyperparams['objective']['kl_obs']['max'] = args.kl_obs_max
        orion_hp_string += 'kl_obs_max= %.3f\n' % (args.kl_obs_max)

    if args.kl_deep_max:
        hyperparams['objective']['kl_deep']['max'] = args.kl_deep_max
        orion_hp_string += 'kl_deep_max= %.3f\n' % (args.kl_deep_max)

    if args.deep_start_p:
        # The deep-network schedules start at a fraction of the kl_obs ramp.
        deep_start = int(args.deep_start_p * args.deep_start_p_scale *
                         hyperparams['objective']['kl_obs']['schedule_dur'])
        hyperparams['objective']['kl_deep']['schedule_start'] = deep_start
        hyperparams['objective']['l2']['schedule_start'] = deep_start
        hyperparams['model']['deep_unfreeze_step'] = deep_start
        orion_hp_string += 'deep_start= %i\n' % deep_start

    if args.l2_gen_scale or args.log10_l2_gen_scale:
        if args.log10_l2_gen_scale:
            l2_gen_scale = 10**args.log10_l2_gen_scale
        else:
            l2_gen_scale = args.l2_gen_scale
        hyperparams['objective']['l2_gen_scale'] = l2_gen_scale
        orion_hp_string += 'l2_gen_scale= %.3f\n' % l2_gen_scale

    if args.l2_con_scale or args.log10_l2_con_scale:
        if args.log10_l2_con_scale:
            l2_con_scale = 10**args.log10_l2_con_scale
        else:
            l2_con_scale = args.l2_con_scale
        hyperparams['objective']['l2_con_scale'] = l2_con_scale
        orion_hp_string += 'l2_con_scale= %.3f\n' % l2_con_scale

    # Build a run name from abbreviated model-size hyper-parameters plus the
    # Orion override string, e.g. "do64_go128_orion-lr0.0010".
    data_name = args.data_path.split('/')[-1]
    model_name = hyperparams['model_name']
    mhp_list = [
        key.replace('size', '').replace('deep', 'd').replace(
            'obs', 'o').replace('_', '')[:4] + str(val)
        for key, val in hyperparams['model'].items() if 'size' in key
    ]
    mhp_list.sort()
    hyperparams['run_name'] = '_'.join(mhp_list)
    orion_hp_string = orion_hp_string.replace('\n', '-').replace(' ',
                                                                 '').replace(
                                                                     '=', '')
    orion_hp_string = '_orion-' + orion_hp_string
    hyperparams['run_name'] += orion_hp_string
    save_loc = '%s/%s/%s/%s/' % (args.output_dir, data_name, model_name,
                                 hyperparams['run_name'])

    if not os.path.exists(save_loc):
        os.makedirs(save_loc)

    data_dict = read_data(args.data_path)
    train_data = torch.Tensor(data_dict['train_fluor']).to(device)
    valid_data = torch.Tensor(data_dict['valid_fluor']).to(device)

    num_trials, num_steps, input_size = train_data.shape

    train_ds = torch.utils.data.TensorDataset(train_data)
    valid_ds = torch.utils.data.TensorDataset(valid_data)
    train_dl = torch.utils.data.DataLoader(train_ds,
                                           batch_size=args.batch_size,
                                           shuffle=True)
    # Validation runs as a single full batch.
    valid_dl = torch.utils.data.DataLoader(valid_ds,
                                           batch_size=valid_data.shape[0])

    #     transforms = trf.Compose([trf.Normalize(mean=(train_data.mean(),), std=(train_data.std(),))])
    transforms = trf.Compose([])

    loglikelihood_obs = LogLikelihoodGaussian()
    loglikelihood_deep = LogLikelihoodPoissonSimplePlusL1(
        dt=float(data_dict['dt']))

    objective = SVLAE_Loss(
        loglikelihood_obs=loglikelihood_obs,
        loglikelihood_deep=loglikelihood_deep,
        loss_weight_dict={
            'kl_deep': hyperparams['objective']['kl_deep'],
            'kl_obs': hyperparams['objective']['kl_obs'],
            'l2': hyperparams['objective']['l2'],
            'recon_deep': hyperparams['objective']['recon_deep']
        },
        l2_con_scale=hyperparams['objective']['l2_con_scale'],
        l2_gen_scale=hyperparams['objective']['l2_gen_scale']).to(device)

    # Convert the calcium time constant from seconds to time steps.
    hyperparams['model']['obs']['tau']['value'] /= data_dict['dt']

    model = SVLAE_Net(
        input_size=input_size,
        factor_size=hyperparams['model']['factor_size'],
        obs_encoder_size=hyperparams['model']['obs_encoder_size'],
        obs_latent_size=hyperparams['model']['obs_latent_size'],
        obs_controller_size=hyperparams['model']['obs_controller_size'],
        deep_g_encoder_size=hyperparams['model']['deep_g_encoder_size'],
        deep_c_encoder_size=hyperparams['model']['deep_c_encoder_size'],
        deep_g_latent_size=hyperparams['model']['deep_g_latent_size'],
        deep_u_latent_size=hyperparams['model']['deep_u_latent_size'],
        deep_controller_size=hyperparams['model']['deep_controller_size'],
        generator_size=hyperparams['model']['generator_size'],
        prior=hyperparams['model']['prior'],
        clip_val=hyperparams['model']['clip_val'],
        generator_burn=hyperparams['model']['generator_burn'],
        dropout=hyperparams['model']['dropout'],
        do_normalize_factors=hyperparams['model']['normalize_factors'],
        factor_bias=hyperparams['model']['factor_bias'],
        max_norm=hyperparams['model']['max_norm'],
        deep_unfreeze_step=hyperparams['model']['deep_unfreeze_step'],
        obs_early_stop_step=hyperparams['model']['obs_early_stop_step'],
        obs_continue_step=hyperparams['model']['obs_continue_step'],
        ar1_start_step=hyperparams['model']['ar1_start_step'],
        obs_params=hyperparams['model']['obs'],
        device=device).to(device)

    total_params = 0
    for ix, (name, param) in enumerate(model.named_parameters()):
        print(ix, name, list(param.shape), param.numel(), param.requires_grad)
        total_params += param.numel()

    print('Total parameters: %i' % total_params)

    optimizer = opt.Adam([p for p in model.parameters() if p.requires_grad],
                         lr=hyperparams['optimizer']['lr_init'],
                         betas=hyperparams['optimizer']['betas'],
                         eps=hyperparams['optimizer']['eps'])

    scheduler = LFADS_Scheduler(
        optimizer=optimizer,
        mode='min',
        factor=hyperparams['scheduler']['scheduler_factor'],
        patience=hyperparams['scheduler']['scheduler_patience'],
        verbose=True,
        threshold=1e-4,
        threshold_mode='abs',
        cooldown=hyperparams['scheduler']['scheduler_cooldown'],
        min_lr=hyperparams['scheduler']['lr_min'])

    # NOTE(review): torch._np is a private alias for numpy — fragile across
    # torch versions; plain numpy would be safer.
    TIME = torch._np.arange(0, num_steps * data_dict['dt'], data_dict['dt'])

    plotter = {
        'train':
        Plotter(time=TIME,
                truth={
                    'rates': data_dict['train_rates'],
                    'spikes': data_dict['train_spikes'],
                    'latent': data_dict['train_latent']
                }),
        'valid':
        Plotter(time=TIME,
                truth={
                    'rates': data_dict['valid_rates'],
                    'spikes': data_dict['valid_spikes'],
                    'latent': data_dict['valid_latent']
                })
    }

    if args.use_tensorboard:
        import importlib
        if importlib.util.find_spec('torch.utils.tensorboard'):
            tb_folder = save_loc + 'tensorboard/'
            if not os.path.exists(tb_folder):
                os.mkdir(tb_folder)
            elif os.path.exists(tb_folder) and args.restart:
                # On restart, wipe the stale tensorboard logs.
                os.system('rm -rf %s' % tb_folder)
                os.mkdir(tb_folder)

            from torch.utils.tensorboard import SummaryWriter
            writer = SummaryWriter(tb_folder)
            rm_plotter = plotter
        else:
            writer = None
            rm_plotter = None
    else:
        writer = None
        rm_plotter = None

    run_manager = RunManager(model=model,
                             objective=objective,
                             optimizer=optimizer,
                             scheduler=scheduler,
                             train_dl=train_dl,
                             valid_dl=valid_dl,
                             transforms=transforms,
                             writer=writer,
                             plotter=rm_plotter,
                             max_epochs=args.max_epochs,
                             save_loc=save_loc,
                             do_health_check=args.do_health_check)

    run_manager.run()

    # Guard against NaN/inf losses so Orion still gets a (terrible) score.
    if not torch._np.isfinite(run_manager.best):
        run_manager.best = 1e8

    report_results(
        [dict(name='valid_loss', type='objective', value=run_manager.best)])

    fig_folder = save_loc + 'figs/'

    if os.path.exists(fig_folder):
        os.system('rm -rf %s' % fig_folder)
    os.mkdir(fig_folder)

    from matplotlib.figure import Figure
    import matplotlib
    matplotlib.use('Agg')
    fig_dict = plotter['valid'].plot_summary(model=run_manager.model,
                                             dl=run_manager.valid_dl)
    for k, v in fig_dict.items():
        # BUG FIX: fig_dict maps name (str) -> figure, and it is the *value*
        # that is saved; the original tested `type(k) == Figure` (the key),
        # so no figure was ever written. Test the value instead.
        if isinstance(v, Figure):
            v.savefig(fig_folder + k + '.svg')
コード例 #18
0
def learner(model,
            rollout_storage,
            train_params,
            ppo_params,
            ready_to_works,
            queue,
            sync_flag,
            rank=0,
            distributed=False,
            b=None):
    '''
    Train ``model`` with PPO on rollout experience produced by worker
    processes, then report a final SPL score to Orion.

    Per epoch the learner: waits for every worker to put its episode
    statistics on ``queue``, optionally synchronizes with other learners
    on barrier ``b``, runs one PPO update over ``rollout_storage``, logs
    metrics to TensorBoard, and sets the ``ready_to_works`` events so the
    workers start collecting the next epoch of experience.

    :param model: actor-critic network to optimize; when ``distributed``
        is True it is assumed to be DDP-wrapped (``model.module`` is used
        for checkpointing).
    :param rollout_storage: shared experience buffer consumed by
        ``agent.update``; also supplies the target ``device``.
    :param train_params: dict with keys ``epochs``, ``num_workers``,
        ``steps`` and ``world_size``.
    :param ppo_params: kwargs for the ``PPO`` agent; ``clip_param`` and
        ``max_kl`` are linearly annealed down to 0.001 over training.
    :param ready_to_works: one Event per worker; setting an event releases
        that worker to run the next rollout epoch.
    :param queue: queue on which each worker puts ``(rewards, steps, id)``
        when it finishes an epoch.
    :param sync_flag: shared value set to 1 on the last epoch to tell
        workers to exit.
    :param rank: learner rank; rank 0 writes periodic checkpoints.
    :param distributed: if True, aggregate statistics across learners via
        ``dist_mean``/``dist_sum``.
    :param b: optional barrier used to align learners before each update.
    :return: None; the negated SPL is reported via ``report_results``.
    '''

    print(f"learner with pid ({os.getpid()})  starts job")
    logger = TB_logger("ppo_ai2thor", rank)
    agent = PPO(actor_critic=model, **ppo_params)
    device = rollout_storage.device
    if distributed:
        # NOTE(review): world_size is computed but unused below; the global
        # step count uses train_params["world_size"] instead.
        world_size = dist.get_world_size()
    else:
        world_size = 1

    epochs = train_params["epochs"]
    # Annealing floors for the PPO clip range and KL limit.
    min_clip_param = 0.001
    min_kl = 0.001
    # start workers for next epoch
    _ = [e.set() for e in ready_to_works]
    # Training policy
    start_time = time.time()
    for epoch in range(epochs):
        # Linearly anneal clip_param and max_kl from their configured
        # values down to the floors as training progresses.
        agent.clip_param = (ppo_params['clip_param'] - min_clip_param) * (
            epochs - epoch) / epochs + min_clip_param
        agent.max_kl = (ppo_params['max_kl'] -
                        min_kl) * (epochs - epoch) / epochs + min_kl
        rollout_ret = []
        rollout_steps = []
        # wait until all workers finish a epoch
        for i in range(train_params["num_workers"]):
            # NOTE(review): `id` shadows the builtin; it is the worker id.
            rewards, steps, id = queue.get()
            print(
                f'Leaner rank:{rank} recieve worker:{id} done signal and reaches {i}th wokers'
            )
            rollout_ret.extend(rewards)
            rollout_steps.extend(steps)

        if b:
            print(f'Learner rank:{rank} wait')
            b.wait()
        print("Start training")

        # normalize advantage
        # if distributed:
        #     mean = rollout_storage.adv_buf.mean()
        #     var = rollout_storage.adv_buf.var()
        #     mean = dist_mean(mean)
        #     var = dist_mean(var)
        #     rollout_storage.normalize_adv(mean_std=(mean, torch.sqrt(var)))
        # else:
        #     rollout_storage.normalize_adv()

        # train with batch
        model.train()
        print('updating...')
        pi_loss, v_loss, kl, entropy = agent.update(rollout_storage,
                                                    distributed)
        v_mean = rollout_storage.val_buf.mean()
        model.eval()
        print("Finishes training")
        # start workers for next epoch
        if epoch == train_params["epochs"] - 1:
            # set exit flag to 1, and notify workers to exit
            sync_flag.value = 1
        _ = [e.set() for e in ready_to_works]

        # log statistics with TensorBoard
        ret_sum = np.sum(rollout_ret)
        steps_sum = np.sum(rollout_steps)
        episode_count = len(rollout_ret)

        #visdom
        # vis.line(X=[episode_count], Y=[ret_sum], win='training_Rewards'+str(rank), update='append')

        if distributed:
            # Aggregate metrics across learner processes: means for the
            # losses, sums for return/step/episode counts.
            pi_loss = dist_mean(pi_loss)
            v_loss = dist_mean(v_loss)
            kl = dist_mean(kl)
            entropy = dist_mean(entropy)
            v_mean = dist_mean(v_mean)
            ret_sum = dist_sum(torch.tensor(ret_sum).to(device))
            steps_sum = dist_sum(torch.tensor(steps_sum).to(device))
            episode_count = dist_sum(torch.tensor(episode_count).to(device))

        # Log info about epoch
        global_steps = (epoch +
                        1) * train_params["steps"] * train_params["world_size"]
        fps = global_steps / (time.time() - start_time)
        logger.log_info(f"Epoch [{epoch}] avg. FPS:[{fps:.2f}]")

        logger.add_scalar("KL", kl, global_steps)
        logger.add_scalar("Entropy", entropy, global_steps)
        logger.add_scalar("p_loss", pi_loss, global_steps)
        logger.add_scalar("v_loss", v_loss, global_steps)
        logger.add_scalar("v_mean", v_mean, global_steps)

        # print(agent.clip_param,agent.max_kl)
        logger.add_scalar("clip_ration", agent.clip_param, global_steps)
        logger.add_scalar("max_kl", agent.max_kl, global_steps)

        if episode_count > 0:
            # Return per 1000 environment steps, for comparable curves.
            ret_per_1000 = (ret_sum / steps_sum) * 1000
            logger.add_scalar("Return1000", ret_per_1000, global_steps)
            logger.log_info(
                f"Epoch [{epoch}] Steps {global_steps}: "
                f"return:({ret_per_1000:.1f}), sum:{ret_sum}, step_sum:{steps_sum}"
            )
        else:
            logger.log_info(f"Epoch [{epoch}] Steps {global_steps}: "
                            f"Goal is not reached in this epoch")

        if (epoch + 1) % 20 == 0 and rank == 0:
            # Periodic checkpoint, written by the rank-0 learner only.
            if distributed:
                torch.save(model.module.state_dict(),
                           f'model/ppo/model{epoch+1}.pt')
            else:
                torch.save(model.state_dict(), f'model/ppo/model{epoch+1}.pt')
        print("finish statistics")

    spl = evaluate_with_spl(model, rollout_storage)

    print('>>>>>>>>>>>>>>>>>>>>> Reporting...')
    # Orion minimizes the objective, so report the negated SPL
    # (higher SPL is better).
    report_results(
        [dict(name='validation_return', type='objective', value=-spl)])

    print(f"learner with pid ({os.getpid()})  finished job")
コード例 #19
0
def main():
    """Train an A2C / PPO / ACKTR agent on (possibly vectorized) Gym
    environments, then evaluate on fixed seeds and report the mean
    validation return to Orion as the objective.

    NOTE(review): depends on module-level ``args`` (parsed CLI options)
    and ``num_updates`` -- neither is defined inside this function.
    """
    print("#######")
    print(
        "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards"
    )
    print("#######")

    torch.set_num_threads(1)

    # Fixed seeds for the post-training evaluation environment.
    with open(args.eval_env_seeds_file, 'r') as f:
        eval_env_seeds = json.load(f)

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    envs = [
        make_env(args.env_name, args.seed, i, args.log_dir, args.add_timestep)
        for i in range(args.num_processes)
    ]

    eval_dir = os.path.join(args.log_dir, "eval/")
    if not os.path.exists(eval_dir):
        os.makedirs(eval_dir)
    eval_env = [
        make_env(args.env_name,
                 args.seed,
                 0,
                 eval_dir,
                 args.add_timestep,
                 early_resets=True)
    ]
    eval_env = DummyVecEnv(eval_env)

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    # Only 1-D observation spaces get observation/reward normalization.
    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs, gamma=args.gamma)

    if len(envs.observation_space.shape) == 1:
        # Don't touch rewards for evaluation
        eval_env = VecNormalize(eval_env, ret=False)
        # set running filter to be the same
        eval_env.ob_rms = envs.ob_rms

    # Stack `num_stack` frames along the leading (channel) dimension.
    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    actor_critic = Policy(obs_shape, envs.action_space, args.recurrent_policy)

    # NOTE(review): action_shape is computed but not used below.
    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if args.cuda:
        actor_critic.cuda()

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space, actor_critic.state_size)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        # Shift the stacked frames left and append the newest observation.
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)

    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, states = actor_critic.act(
                    rollouts.observations[step], rollouts.states[step],
                    rollouts.masks[step])
            cpu_actions = action.squeeze(1).cpu().numpy()

            # Obser reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)
            rollouts.insert(current_obs, states, action, action_log_prob,
                            value, reward, masks)

        # Bootstrap the value of the state following the last rollout step.
        with torch.no_grad():
            next_value = actor_critic.get_value(rollouts.observations[-1],
                                                rollouts.states[-1],
                                                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            # Saved alongside the observation-normalization statistics so
            # the model can be evaluated with the same input filter.
            save_model = [
                save_model,
                hasattr(envs, 'ob_rms') and envs.ob_rms or None
            ]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print(
                "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        final_rewards.mean(), final_rewards.median(),
                        final_rewards.min(), final_rewards.max(), dist_entropy,
                        value_loss, action_loss))

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo, args.num_frames)
            except IOError:
                pass

    validation_returns = evaluate_with_seeds(eval_env, actor_critic, args.cuda,
                                             eval_env_seeds)

    report_results([
        dict(name='validation_return',
             type='objective',
             value=np.mean(validation_returns))
    ])
コード例 #20
0
ファイル: train_model.py プロジェクト: yonghangzhou/KBRD
    def train(self):
        """Run the main training loop until a stopping condition fires.

        Repeatedly parleys the world while tracking epochs, examples and
        timers; stops when the max-epoch or max-train-time budget is
        reached, or when validation requests early stopping. Along the
        way it periodically logs, validates, and (on the primary worker)
        writes checkpoints. Afterwards it saves or reloads the best model,
        runs final valid/test evaluations, and reports ``-recall@50`` on
        the test set to Orion.

        :return: tuple ``(v_report, t_report)`` of the final validation
            and test evaluation reports.
        """
        if is_distributed():
            warn_once(
                "Distributed training outputs average-per-worker metrics during "
                "training, and may be slightly distorted. Validation/test are "
                "unadulterated."
            )
        opt = self.opt
        world = self.world
        with world:
            while True:
                # do one example / batch of examples
                world.parley()
                self.parleys += 1

                # get the total training examples done, compute epochs
                self._total_epochs = (
                    self._preempted_epochs +
                    num_workers() * self.world.get_total_epochs()
                )
                exs_per_epoch = self.world.num_examples()
                self._total_exs = int(np.round(self._total_epochs * exs_per_epoch))

                # and use the primary worker's timings for everything
                train_time, log_time, validate_time = sync_object((
                    self.train_time.time(),
                    self.log_time.time(),
                    self.validate_time.time()
                ))

                # check counters and timers
                if self._total_epochs >= self.max_num_epochs:
                    self.log()
                    print('[ num_epochs completed:{} time elapsed:{}s ]'.format(
                        self.max_num_epochs, train_time))
                    break
                if train_time > self.max_train_time:
                    print('[ max_train_time elapsed:{}s ]'.format(train_time))
                    break
                if log_time > self.log_every_n_secs:
                    self.log()
                # Validate on either a time budget or an epoch budget,
                # whichever fires first.
                if (
                    validate_time > self.val_every_n_secs or
                    self._total_epochs - self.last_valid_epoch
                        >= self.val_every_n_epochs
                ):
                    stop_training = self.validate()
                    self.last_valid_epoch = self._total_epochs
                    if stop_training:
                        break
                # Only the primary worker writes checkpoints.
                if (
                    self.save_time.time() > self.save_every_n_secs and
                    opt.get('model_file') and
                    is_primary_worker()
                ):
                    print("[ saving model checkpoint: {}.checkpoint".format(
                        opt['model_file']
                    ))
                    self.save_model('.checkpoint')
                    self.save_time.reset()

        if not self.saved and is_primary_worker():
            # save agent
            self.save_model()
        elif opt.get('model_file'):
            # reload best validation model
            self.agent = create_agent(opt)

        valid_world = _maybe_load_eval_world(self.agent, opt, 'valid')
        max_exs = opt['validation_max_exs'] if opt.get('short_final_eval') else -1
        v_report = run_eval(valid_world, opt, 'valid', max_exs, write_log=True)
        test_world = _maybe_load_eval_world(self.agent, opt, 'test')
        t_report = run_eval(test_world, opt, 'test', max_exs, write_log=True)
        from orion.client import report_results
        # Orion minimizes, so negate recall@50 (higher recall is better).
        report_results([dict(
            name='-recall@50',
            type='objective',
            value=-t_report['recall@50']
        )])

        if valid_world:
            valid_world.shutdown()
        if test_world:
            test_world.shutdown()

        return v_report, t_report
コード例 #21
0
def main():
    """Train a small CNN on MNIST and report the final-epoch test error
    rate to Orion as the optimization objective.

    All knobs (batch sizes, epochs, learning rate, momentum, seed, CUDA
    usage, logging cadence, model saving) come from the command line.
    """
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000,
                        metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=10, metavar='N',
                        help='number of epochs to train (default: 10)')
    parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
                        help='learning rate (default: 0.01)')
    parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
                        help='SGD momentum (default: 0.5)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument(
        '--log-interval', type=int, default=10, metavar='N',
        help='how many batches to wait before logging training status')
    parser.add_argument('--save-model', action='store_true', default=False,
                        help='For Saving the current Model')
    args = parser.parse_args()

    use_cuda = not args.no_cuda and torch.cuda.is_available()
    torch.manual_seed(args.seed)
    device = torch.device("cuda" if use_cuda else "cpu")

    # Extra loader workers and pinned memory only pay off when feeding a GPU.
    loader_kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
    mnist_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307, ), (0.3081, )),
    ])
    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST('../data', train=True, download=True,
                       transform=mnist_transform),
        batch_size=args.batch_size, shuffle=True, **loader_kwargs)
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST('../data', train=False, transform=mnist_transform),
        batch_size=args.test_batch_size, shuffle=True, **loader_kwargs)

    model = Net().to(device)
    optimizer = optim.SGD(model.parameters(), lr=args.lr,
                          momentum=args.momentum)

    # Train for the requested number of epochs; the objective reported to
    # Orion is the test error rate after the final epoch.
    for epoch in range(1, args.epochs + 1):
        train(args, model, device, train_loader, optimizer, epoch)
        test_error_rate = test(args, model, device, test_loader)

    report_results([
        dict(name='test_error_rate', type='objective', value=test_error_rate)
    ])

    if args.save_model:
        torch.save(model.state_dict(), "mnist_cnn.pt")
コード例 #22
0
#!/usr/bin/env python
import argparse
from orion.client import report_results


def sphere_func_2d(x, y):
    """Evaluate the 2-D sphere benchmark function ``f(x, y) = x**2 + y**2``."""
    return x ** 2 + y ** 2


if __name__ == '__main__':
    # Parse the 2-D coordinates, evaluate the sphere function, and hand
    # the value to Orion as the objective to minimize.
    parser = argparse.ArgumentParser()
    parser.add_argument('-x', help='the x coordinate', type=float)
    parser.add_argument('-y', help='the y coordinate', type=float)
    cli_args = parser.parse_args()
    objective = sphere_func_2d(cli_args.x, cli_args.y)
    report_results([
        dict(name='test_error_rate', type='objective', value=objective)
    ])
コード例 #23
0
def main():
    """Train a single-session LFADS model and report validation loss.

    Loads hyperparameters from ``args.hyperparameter_path`` (with optional
    CLI overrides), builds train/valid data loaders from ``args.data_path``,
    trains through ``RunManager``, reports the best validation loss to
    Orion, and finally saves summary figures of the trained model under
    the run's ``figs/`` folder.

    NOTE(review): relies on a module-level ``parser`` plus helpers defined
    elsewhere in this file (``load_parameters``, ``read_data``,
    ``LFADS_SingleSession_Net``, ``RunManager``, ``Plotter``, ...).
    """
    import math  # local import: used to sanity-check the reported objective

    args = parser.parse_args()

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    hyperparams = load_parameters(args.hyperparameter_path)

    # Optional CLI overrides of the loaded hyperparameters.
    if args.lr:
        hyperparams['optimizer']['lr_init'] = args.lr
        hyperparams['scheduler']['lr_min'] = args.lr * 1e-3

    if args.patience:
        hyperparams['scheduler']['scheduler_patience'] = args.patience

    if args.weight_schedule_dur:
        hyperparams['objective']['kl'][
            'weight_schedule_dur'] = args.weight_schedule_dur
        hyperparams['objective']['l2'][
            'weight_schedule_dur'] = args.weight_schedule_dur

    if args.kl_max:
        hyperparams['objective']['kl']['max'] = args.kl_max

    # The run name encodes the sorted model-size hyperparameters so runs
    # with different architectures land in different save folders.
    data_name = args.data_path.split('/')[-1]
    model_name = hyperparams['model_name']
    mhp_list = [
        key.replace('size', '').replace('_', '')[:4] + str(val)
        for key, val in hyperparams['model'].items() if 'size' in key
    ]
    mhp_list.sort()
    hyperparams['run_name'] = '_'.join(mhp_list) + '_retest'
    save_loc = '%s/%s/%s/%s/' % (args.output_dir, data_name, model_name,
                                 hyperparams['run_name'])

    if not os.path.exists(save_loc):
        os.makedirs(save_loc)

    data_dict = read_data(args.data_path)
    train_data = torch.Tensor(data_dict['train_%s' %
                                        args.data_suffix]).to(device)
    valid_data = torch.Tensor(data_dict['valid_%s' %
                                        args.data_suffix]).to(device)

    num_trials, num_steps, input_size = train_data.shape

    train_ds = torch.utils.data.TensorDataset(train_data)
    valid_ds = torch.utils.data.TensorDataset(valid_data)
    train_dl = torch.utils.data.DataLoader(train_ds,
                                           batch_size=args.batch_size,
                                           shuffle=True)
    # Validation is evaluated as one full-set batch.
    valid_dl = torch.utils.data.DataLoader(valid_ds,
                                           batch_size=valid_data.shape[0])

    transforms = trf.Compose([])

    loglikelihood = LogLikelihoodPoisson(dt=float(data_dict['dt']))

    objective = LFADS_Loss(
        loglikelihood=loglikelihood,
        loss_weight_dict={
            'kl': hyperparams['objective']['kl'],
            'l2': hyperparams['objective']['l2']
        },
        l2_con_scale=hyperparams['objective']['l2_con_scale'],
        l2_gen_scale=hyperparams['objective']['l2_gen_scale']).to(device)

    model = LFADS_SingleSession_Net(
        input_size=input_size,
        factor_size=hyperparams['model']['factor_size'],
        g_encoder_size=hyperparams['model']['g_encoder_size'],
        c_encoder_size=hyperparams['model']['c_encoder_size'],
        g_latent_size=hyperparams['model']['g_latent_size'],
        u_latent_size=hyperparams['model']['u_latent_size'],
        controller_size=hyperparams['model']['controller_size'],
        generator_size=hyperparams['model']['generator_size'],
        prior=hyperparams['model']['prior'],
        clip_val=hyperparams['model']['clip_val'],
        dropout=hyperparams['model']['dropout'],
        do_normalize_factors=hyperparams['model']['normalize_factors'],
        max_norm=hyperparams['model']['max_norm'],
        device=device).to(device)

    # Print a per-parameter summary and the total parameter count.
    total_params = 0
    for ix, (name, param) in enumerate(model.named_parameters()):
        print(ix, name, list(param.shape), param.numel(), param.requires_grad)
        total_params += param.numel()

    print('Total parameters: %i' % total_params)

    optimizer = opt.Adam(model.parameters(),
                         lr=hyperparams['optimizer']['lr_init'],
                         betas=hyperparams['optimizer']['betas'],
                         eps=hyperparams['optimizer']['eps'])

    scheduler = LFADS_Scheduler(
        optimizer=optimizer,
        mode='min',
        factor=hyperparams['scheduler']['scheduler_factor'],
        patience=hyperparams['scheduler']['scheduler_patience'],
        verbose=True,
        threshold=1e-4,
        threshold_mode='abs',
        cooldown=hyperparams['scheduler']['scheduler_cooldown'],
        min_lr=hyperparams['scheduler']['lr_min'])

    # Time axis used by the plotters for ground-truth comparisons.
    # NOTE(review): torch._np is a private alias for numpy; kept to match
    # the module's imports, but importing numpy directly is preferable.
    TIME = torch._np.arange(0, num_steps * data_dict['dt'], data_dict['dt'])

    train_truth = {}
    if 'train_rates' in data_dict.keys():
        train_truth['rates'] = data_dict['train_rates']
    if 'train_latent' in data_dict.keys():
        train_truth['latent'] = data_dict['train_latent']

    valid_truth = {}
    if 'valid_rates' in data_dict.keys():
        valid_truth['rates'] = data_dict['valid_rates']
    if 'valid_latent' in data_dict.keys():
        valid_truth['latent'] = data_dict['valid_latent']

    plotter = {
        'train': Plotter(time=TIME, truth=train_truth),
        'valid': Plotter(time=TIME, truth=valid_truth)
    }

    if args.use_tensorboard:
        import importlib
        if importlib.util.find_spec('torch.utils.tensorboard'):
            tb_folder = save_loc + 'tensorboard/'
            if not os.path.exists(tb_folder):
                os.mkdir(tb_folder)
            elif os.path.exists(tb_folder) and args.restart:
                # A restart wipes any stale TensorBoard logs for this run.
                os.system('rm -rf %s' % tb_folder)
                os.mkdir(tb_folder)

            from torch.utils.tensorboard import SummaryWriter
            writer = SummaryWriter(tb_folder)
            rm_plotter = plotter
        else:
            writer = None
            rm_plotter = None
    else:
        writer = None
        rm_plotter = None

    run_manager = RunManager(model=model,
                             objective=objective,
                             optimizer=optimizer,
                             scheduler=scheduler,
                             train_dl=train_dl,
                             valid_dl=valid_dl,
                             transforms=transforms,
                             writer=writer,
                             plotter=rm_plotter,
                             max_epochs=args.max_epochs,
                             save_loc=save_loc,
                             do_health_check=args.do_health_check)

    run_manager.run()

    # Guard against NaN/inf losses so Orion receives a usable (very bad)
    # score instead of a broken objective; mirrors the isfinite guard used
    # by the other RunManager-based trainer in this module.
    if not math.isfinite(run_manager.best):
        run_manager.best = 1e8

    report_results(
        [dict(name='valid_loss', type='objective', value=run_manager.best)])

    fig_folder = save_loc + 'figs/'

    if os.path.exists(fig_folder):
        os.system('rm -rf %s' % fig_folder)
    os.mkdir(fig_folder)

    # Save the validation summary figures of the trained model as SVGs.
    from matplotlib.figure import Figure
    import matplotlib
    matplotlib.use('Agg')
    fig_dict = plotter['valid'].plot_summary(model=run_manager.model,
                                             dl=run_manager.valid_dl)
    for k, v in fig_dict.items():
        if type(v) == Figure:
            v.savefig(fig_folder + k + '.svg')
コード例 #24
0
    # gin.config.register_finalize_hook(
    # lambda config: config[('', 'src.train.train')].update({'device': torch.device(config[('', 'src.train.train')].get('device','cpu'))}))

    gin.parse_config_files_and_bindings(args.config, args.gin_param)
    print(gin.operative_config_str())

    errors = []
    for random_seed in range(5):
        if args.savedir:
            os.makedirs(args.savedir, exist_ok=True)

            seed_savedir = f'{args.savedir}/{random_seed}'
        else:
            seed_savedir = None

        best_error = train(savedir=seed_savedir, random_seed=random_seed)
        errors.append(best_error)

    if args.aggregate_seeds == 'mean':
        objective = sum(errors) / len(errors)
    elif args.aggregate_seeds == 'min':
        objective = min(errors)

    print(f'{args.aggregate_seeds} error over seeds: {objective:2.2f}')

    report_results([
        dict(name=f'{args.aggregate_seeds}_error_over_seeds',
             type='objective',
             value=objective)
    ])
コード例 #25
0
def main():
    """Train a `Net` on CIFAR-10 and report the final test error to Orion.

    Parses command-line arguments, builds the augmented train / plain test
    data pipelines, constructs the model and optimizer, runs the training
    loop, and reports the last epoch's test error rate as the objective
    (Orion minimizes, so the raw error rate is reported unchanged).

    Raises:
        ValueError: if ``--opt`` names an optimizer with no construction
            branch (previously this silently left ``optimizer`` unbound).
        RuntimeError: if ``--nEpochs`` < 1, i.e. no test error was produced.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--batchSz', type=int, default=256)
    parser.add_argument('--nEpochs', type=int, default=100)
    parser.add_argument('--card', type=int, default=2)

    parser.add_argument('--no-cuda', action='store_true')
    parser.add_argument('--ica', type=float, default=1e-1)
    parser.add_argument('--ica-fc', type=float, default=0)
    parser.add_argument('--wd', type=float, default=1e-4)
    parser.add_argument('--save')
    parser.add_argument('--seed', type=int, default=1)
    parser.add_argument('--opt', type=str, default='adam',
                        choices=('sgd', 'adam', 'rmsprop', 'sgdw'))
    args = parser.parse_args()

    # Pin this process to the requested GPU before CUDA is initialized.
    os.environ["CUDA_VISIBLE_DEVICES"] = str(args.card)

    args.cuda = not args.no_cuda and torch.cuda.is_available()
    # NOTE(review): the default 'error.csv' is used as a *directory* below
    # (rmtree + makedirs + os.path.join) -- confirm that name is intended.
    args.save = args.save or 'error.csv'

    # Seed CPU and (when used) GPU RNGs for reproducibility.
    torch.manual_seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)

    # Start from a clean output directory.
    if os.path.exists(args.save):
        shutil.rmtree(args.save)
    os.makedirs(args.save, exist_ok=True)

    # Standard CIFAR-10 per-channel mean / std statistics.
    normMean = [0.49139968, 0.48215827, 0.44653124]
    normStd = [0.24703233, 0.24348505, 0.26158768]
    normTransform = transforms.Normalize(normMean, normStd)

    # Augment only the training split; the test split is just normalized.
    trainTransform = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normTransform
    ])
    testTransform = transforms.Compose([
        transforms.ToTensor(),
        normTransform
    ])

    kwargs = {'num_workers': 4, 'pin_memory': True} if args.cuda else {}
    trainLoader = DataLoader(
        dset.CIFAR10(root='./data', train=True, download=True,
                     transform=trainTransform),
        batch_size=args.batchSz, shuffle=True, **kwargs)

    testLoader = DataLoader(
        dset.CIFAR10(root='./data', train=False, download=True,
                     transform=testTransform),
        batch_size=args.batchSz, shuffle=False, **kwargs)

    # Four conv-layer ICA penalties followed by two fully-connected ones.
    net = Net([args.ica] * 4 + [args.ica_fc] * 2)

    print('  + Number of params: {}'.format(
        sum(p.data.nelement() for p in net.parameters())))
    if args.cuda:
        net = net.cuda()
        #net = nn.DataParallel(net, device_ids=[0,1])

    if args.opt == 'sgd':
        optimizer = optim.SGD(net.parameters(), lr=1e-1,
                              momentum=0.9, weight_decay=1e-4)
    elif args.opt == 'adam':
        optimizer = optim.Adam(net.parameters(), weight_decay=args.wd)
    elif args.opt == 'rmsprop':
        optimizer = optim.RMSprop(net.parameters(), weight_decay=1e-4)
    else:
        # BUG FIX: 'sgdw' was accepted by argparse but had no branch, which
        # left `optimizer` unbound and crashed with a NameError deep in the
        # training loop. Fail fast with a clear message instead.
        raise ValueError('unsupported optimizer: {}'.format(args.opt))

    trainF = open(os.path.join(args.save, 'train.csv'), 'w')
    testF = open(os.path.join(args.save, 'test.csv'), 'w')

    # Sentinel so a zero-epoch run is detected instead of raising NameError
    # at the report_results call below.
    test_error_rate = None
    try:
        for epoch in range(1, args.nEpochs + 1):
            adjust_opt(args.opt, optimizer, epoch)
            train(args, epoch, net, trainLoader, optimizer, trainF)
            test_error_rate = test(args, epoch, net, testLoader, optimizer,
                                   testF)
            torch.save(net, os.path.join(args.save, 'latest.pth'))
    finally:
        # BUG FIX: the CSV handles used to leak when training raised.
        trainF.close()
        testF.close()

    if test_error_rate is None:
        raise RuntimeError('no epochs were run; no test error to report')

    # Orion minimizes the objective, so the error rate is reported as-is.
    report_results([dict(
        name='test_error_rate',
        type='objective',
        value=test_error_rate)])
コード例 #26
0
# Fail loudly if the training loop never produced a score: Orion must never
# receive an unevaluated objective.
# BUG FIX: this was a bare `assert`, which is silently stripped when Python
# runs with optimizations enabled (-O), letting a sentinel -1 be reported.
if NDCGs_1model == -1:
    raise RuntimeError("Orion's objective not evaluated")




########  ORION  ########


# For Orion, print results (MongoDB,...)
# Orion always minimizes the objective, hence the negated NDCG (higher NDCG
# is better, so a larger NDCG yields a smaller objective).
report_results([dict(
    name='NDCG with genres',
    type='objective',
    value=-NDCGs_1model),
    ])

コード例 #27
0
    # will log to a file if provided
    if args.log is not None:
        # WatchedFileHandler re-opens the file if an external tool rotates it.
        handler = logging.handlers.WatchedFileHandler(args.log)
        formatter = logging.Formatter(logging.BASIC_FORMAT)
        handler.setFormatter(formatter)
        root = logging.getLogger()
        root.setLevel(logging.INFO)
        root.addHandler(handler)

    # Parse the learning-rate schedule from its string form into floats.
    # SECURITY NOTE(review): `eval` on a CLI argument executes arbitrary
    # code; ast.literal_eval would be the safe equivalent -- confirm no
    # caller relies on non-literal expressions before changing.
    args.learning_rate = [float(lr) for lr in eval(args.learning_rate)]

    logger.info(args)
    # Run training; the returned metric is what gets reported to Orion below.
    val_loss = train(args.data_dir,
                     args.csv_path,
                     args.splits_path,
                     args.output_dir,
                     target=args.target,
                     nb_epoch=args.epochs,
                     learning_rate=args.learning_rate,
                     batch_size=args.batch_size,
                     dropout=args.dropout,
                     optim=args.optim,
                     min_patients_per_label=args.min_patients,
                     seed=args.seed,
                     model_type=args.model_type,
                     architecture=args.arch,
                     data_augmentation=args.data_augmentation,
                     misc=args)

    # NOTE(review): the objective is named 'val_auc' but the variable is
    # called `val_loss` -- verify which metric `train` actually returns
    # (AUC should be maximized, a loss minimized; Orion minimizes).
    report_results([dict(name='val_auc', type='objective', value=val_loss)])