Example 1
def main():
    args = parse_args()
    setup_logging(args.logfile)

    log = get_logger()

    assert 0 <= args.hidden_fraction <= 1

    np.random.seed(args.random_seed)
    tf.set_random_seed(args.random_seed)
    log.info('*' * 100)
    log.info('[Starting MC experiment]')
    log_dict(log.info, vars(args))
    log.info('[Loading target GIs]')
    with open(args.target_gis, 'rb') as f:
        tgt_gis = cpkl.load(f)

    log.info('[Loading source GIs]')
    with open(args.source_gis, 'rb') as f:
        src_gis = cpkl.load(f)

    log.info('[Loading sim scores]')
    with open(args.sim_scores, 'rb') as f:
        sim_scores_data = cpkl.load(f)
    sim_scores = sim_scores_data['values']
    sim_scores = sim_scores / np.max(sim_scores)  # Normalize

    # log.info('\t- %d scores', len(sim_scores))

    hp_param_space = xsmf_param_space(args)

    results, models, training_curves, trials = \
        run_xsmf_experiment(tgt_gis=tgt_gis,
                            src_gis=src_gis,
                            space=hp_param_space,
                            sim_scores=sim_scores,
                            val_hf=args.val_hidden_fraction,
                            test_hf=args.hidden_fraction,
                            n_repeats=args.n_repeats,
                            hp_iters=args.n_hyperopt_iters,
                            hp_seed=args.random_seed)
    # Save results and other information
    log_results(results['summary'])
    with open(args.results_output, 'w') as f:
        json.dump(results, f, indent=2)

    with open(args.training_curve_output, 'wb') as f:
        cpkl.dump(training_curves, f)

    # TODO: save the models; they cannot be pickled at the moment.
    # We will need to implement from_dict and to_dict methods first.
    # Until then, the hyperopt trials are dumped in their place.
    with open(args.models_output, 'wb') as f:
        cpkl.dump(trials, f)

    with open(args.trials_output, 'wb') as f:
        cpkl.dump(trials, f)
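The TODO above notes that the trained models cannot be pickled yet. A minimal sketch of the from_dict/to_dict pattern it calls for, assuming the model state reduces to numpy factor matrices (class and attribute names here are hypothetical):

import numpy as np

class PicklableModel:
    """Hypothetical wrapper that keeps only picklable array state."""

    def __init__(self, U, V):
        self.U = U  # latent row factors (assumed)
        self.V = V  # latent column factors (assumed)

    def to_dict(self):
        # plain dicts of numpy arrays pickle cleanly, unlike TF graph objects
        return {'U': self.U, 'V': self.V}

    @classmethod
    def from_dict(cls, d):
        return cls(U=np.asarray(d['U']), V=np.asarray(d['V']))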
Example 2
def log_training_results(engine):
    step = True
    run_type = 'train'
    train_eval.run(data_loader['train'])
    y_pred, y = train_eval.state.output
    loss = criterion(y_pred, y)
    log_results(to_cpu(y_pred, convert_to_np=True),
                to_cpu(y, convert_to_np=True),
                to_cpu(loss, convert_to_np=True), run_type, step,
                engine.state.iteration, total_train_steps, writer)
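For context, a handler with this signature is typically registered on a PyTorch Ignite Engine. A minimal sketch, assuming log_training_results and its closed-over objects are defined as above:

from ignite.engine import Engine, Events

def train_step(engine, batch):
    # placeholder update step; the real one lives elsewhere in this project
    return batch

trainer = Engine(train_step)
# log training metrics at the end of every epoch
trainer.add_event_handler(Events.EPOCH_COMPLETED, log_training_results)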
Example 3
def train_model(model,
                train_dl,
                epochs,
                display_every=200,
                visualize_dir='samples'):
    '''
    Train loop.

    Args:
      model (nn.Module): main model, consisting of a generator that predicts
        the ab channels from the L channel of an L*a*b* image, and a
        discriminator that predicts whether the reconstructed L*a*b* image
        is real or fake
      train_dl (DataLoader): train dataloader of sampled COCO images
      epochs (int): number of epochs
      display_every (int): save a reconstructed L*a*b* image every this many
        iterations
      visualize_dir (str): directory where saved images are written
    '''

    # val_dl is assumed to be a validation DataLoader defined in the
    # enclosing scope; one fixed batch is fetched for periodic visualization.
    # Note: the loop below rebinds `data`, so this batch goes unused as written.
    data = next(iter(val_dl))
    for e in range(epochs):
        # create_loss_meters returns a dict of meter objects used to log the
        # losses of the complete network
        loss_meter_dict = create_loss_meters()
        i = 0
        for data in tqdm(train_dl):
            model.setup_input(data)
            model.optimize()
            # update the running loss meters
            update_losses(model, loss_meter_dict, count=data['L'].size(0))
            i += 1
            if i % display_every == 0:
                print(f"\nEpoch {e+1}/{epochs}")
                print(f"Iteration {i}/{len(train_dl)}")
                log_results(loss_meter_dict)  # print out the losses
                # save the model's current outputs
                visualize(model, data, save=True, outdir=visualize_dir)

    # save the model weights
    torch.save(model.state_dict(), 'colorization_model.pt')

    # serialize the full model object
    with open('colorization_model.pkl', 'wb') as f:
        pickle.dump(model, f)
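Because the loop above writes both a state_dict and a full pickle, reloading differs by format. A short sketch (ColorizationModel is a hypothetical constructor standing in for the real model class):

import pickle
import torch

# Option 1: rebuild the module, then restore its weights
model = ColorizationModel()  # hypothetical; must match the trained architecture
model.load_state_dict(torch.load('colorization_model.pt'))
model.eval()

# Option 2: unpickle the whole object; this requires the original class to be
# importable and is generally more fragile than the state_dict route
with open('colorization_model.pkl', 'rb') as f:
    model = pickle.load(f)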
Example 4
def train_model(model, train_dl, epochs, display_every=200):
    # valid_dl is assumed to be a validation DataLoader defined in the
    # enclosing scope; one fixed batch is fetched for visualizing the model
    # output at fixed intervals (the loop below rebinds `data`, so it goes
    # unused as written)
    data = next(iter(valid_dl))
    for e in range(epochs):
        # create_loss_meters returns a dict of meter objects used to log the
        # losses of the complete network
        loss_meter_dict = create_loss_meters()
        i = 0
        for data in tqdm(train_dl):
            model.setup_input(data)
            model.optimize()
            # update the running loss meters
            update_losses(model, loss_meter_dict, count=data["L"].size(0))
            i += 1
            if i % display_every == 0:
                print(f"\nEpoch {e+1}/{epochs}")
                print(f"Iteration {i}/{len(train_dl)}")
                log_results(loss_meter_dict)  # print out the losses
                visualize(model, data, save=False)  # display the model's outputs
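Both training loops lean on create_loss_meters, update_losses, and log_results. A plausible minimal implementation using running averages; the call signatures follow the usage above, but the bodies and the loss-term names are assumptions:

class AverageMeter:
    """Tracks the running average of a single loss term."""

    def __init__(self):
        self.count, self.sum, self.avg = 0, 0.0, 0.0

    def update(self, val, count=1):
        self.count += count
        self.sum += val * count
        self.avg = self.sum / self.count

def create_loss_meters():
    # one meter per GAN loss term (term names are assumptions)
    names = ('loss_D_fake', 'loss_D_real', 'loss_G_GAN', 'loss_G_L1')
    return {name: AverageMeter() for name in names}

def update_losses(model, loss_meter_dict, count):
    # read the latest scalar losses off the model and update each meter
    for name, meter in loss_meter_dict.items():
        meter.update(getattr(model, name).item(), count=count)

def log_results(loss_meter_dict):
    for name, meter in loss_meter_dict.items():
        print(f"{name}: {meter.avg:.5f}")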
Example 5
def main(train_images_dir,
         pipeline_config_path,
         output_directory,
         checkpoint_path,
         num_epochs=1,
         image_dict=None,
         labels_path=None,
         samples=None):
    detection_model, pipeline_proto, ckpt_manager = create_model(
        pipeline_config_path, output_directory, checkpoint_path)

    train_files = os.listdir(train_images_dir)
    random.shuffle(train_files)
    BATCH_SIZE = 32
    num_batches = (len(train_files) // BATCH_SIZE) - 1
    for epoch in range(num_epochs):
        for idx in range(num_batches):
            batch_files = train_files[BATCH_SIZE * idx:BATCH_SIZE * (idx + 1)]
            train_images_np, train_gt_box = load_images(
                train_images_dir, batch_files)
            train_image_tensors, gt_classes_one_hot_tensors, gt_box_tensors = \
                prepare_data(train_images_np, train_gt_box)
            detection_model, losses_dict = train_model(
                detection_model, train_images_np, train_image_tensors,
                gt_classes_one_hot_tensors, gt_box_tensors, ckpt_manager)
            logger.info(
                utils.log_results(epoch, num_epochs, idx, num_batches,
                                  losses_dict))
            if idx % 10 == 0:
                ckpt_manager.save()
                print('Checkpoint saved!')
    exporter_lib_v2.export_inference_graph(
        input_type='image_tensor',
        pipeline_config=pipeline_proto,
        trained_checkpoint_dir=os.path.join(output_directory, 'checkpoint'),
        output_directory=output_directory)
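Once export_inference_graph has run, the exported model can be reloaded for inference with the standard SavedModel API; a brief sketch, assuming the exporter's usual saved_model/ subdirectory layout:

import os
import tensorflow as tf

detect_fn = tf.saved_model.load(os.path.join(output_directory, 'saved_model'))
# detections = detect_fn(input_tensor) on a uint8 image batch tensor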
Example 6
def doc_classification(
    task_config,
    model_name_or_path,
    cache_dir,
    data_dir,
    save_dir,
    model_dir,
    run_name="0",
    lr=1e-05,
    warmup_steps=5000,
    balance_classes=True,
    embeds_dropout=0.1,
    epochs=200,  # large because we use early stopping by default
    batch_size=20,
    grad_acc_steps=1,
    early_stopping_metric="roc_auc",
    early_stopping_mode="max",
    early_stopping_patience=10,
    model_class="Bert",
    tokenizer_class="BertTokenizer",
    do_lower_case=False,
    do_train=True,
    do_eval=True,
    do_hpo=False,
    print_preds=False,
    print_dev_preds=False,
    max_seq_len=512,
    seed=11,
    eval_every=500,
    use_amp=False,
    use_cuda=True,
):
    # Load task config
    with open(task_config) as f:
        task_config = yaml.safe_load(f)

    # data_dir is joined with the '/' operator below, so coerce it to a Path
    data_dir = Path(data_dir)

    # Create the label list directly from the config, or (for large label
    # lists) read it from a file and split on spaces
    if isinstance(task_config["data"]["label_list"], list):
        label_list = task_config["data"]["label_list"]
    else:
        with open(data_dir / 'labels' /
                  task_config["data"]["label_list"]) as code_file:
            label_list = code_file.read().split(" ")

    # Register Outcome Metrics
    register_task_metrics(label_list)

    # General Settings
    set_all_seeds(seed=seed)
    device, n_gpu = initialize_device_settings(use_cuda=use_cuda,
                                               use_amp=use_amp)

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=model_name_or_path,
        tokenizer_class=tokenizer_class,
        do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a PyTorch Dataset
    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=max_seq_len,
        data_dir=data_dir,
        label_list=label_list,
        metric=task_config["metric"],
        multilabel=task_config["multilabel"],
        train_filename=task_config["data"]["train_filename"],
        dev_filename=task_config["data"]["dev_filename"],
        dev_split=task_config["data"]["dev_split"]
        if "dev_split" in task_config["data"] else None,
        test_filename=task_config["data"]["test_filename"],
        delimiter=task_config["data"]["parsing"]["delimiter"],
        quote_char=task_config["data"]["parsing"]["quote_char"],
        label_column_name=task_config["data"]["parsing"]["label_column"])

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a
    #    few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor,
                         caching=True,
                         cache_path=Path(cache_dir),
                         batch_size=batch_size)

    if do_train:

        # Setup MLFlow logger
        ml_logger = MLFlowLogger(tracking_uri=task_config["log_dir"])
        ml_logger.init_experiment(
            experiment_name=task_config["experiment_name"],
            run_name=f'{task_config["experiment_name"]}_{run_name}')

        # 4. Create an AdaptiveModel
        # a) which consists of a pretrained language model as a basis
        language_model = LanguageModel.load(model_name_or_path,
                                            language_model_class=model_class)

        # b) and a prediction head on top that is suited for our task

        # Define class weights
        if balance_classes:
            class_weights = data_silo.calculate_class_weights(
                task_name=task_config["task_type"])
        else:
            class_weights = None

        # Create Multi- or Single-Label Classification Heads
        if task_config["multilabel"]:

            prediction_head = MultiLabelTextClassificationHead(
                class_weights=class_weights, num_labels=len(label_list))

        else:
            prediction_head = ExtendedTextClassificationHead(
                class_weights=class_weights, num_labels=len(label_list))

        model = ExtendedAdaptiveModel(
            language_model=language_model,
            prediction_heads=[prediction_head],
            embeds_dropout_prob=embeds_dropout,
            lm_output_types=[task_config["output_type"]],
            device=device)

        # 5. Create an optimizer
        schedule_opts = {
            "name": "LinearWarmup",
            "num_warmup_steps": warmup_steps
        }

        model, optimizer, lr_schedule = initialize_optimizer(
            model=model,
            learning_rate=lr,
            device=device,
            n_batches=len(data_silo.loaders["train"]),
            n_epochs=epochs,
            use_amp=use_amp,
            grad_acc_steps=grad_acc_steps,
            schedule_opts=schedule_opts)

        # 6. Create an early stopping instance
        early_stopping = None
        if early_stopping_mode != "none":
            early_stopping = EarlyStopping(mode=early_stopping_mode,
                                           min_delta=0.0001,
                                           save_dir=model_dir,
                                           metric=early_stopping_metric,
                                           patience=early_stopping_patience)

        # 7. Feed everything to the Trainer, which takes care of growing our
        # model into a powerful plant and evaluates it from time to time

        trainer = ExtendedTrainer(model=model,
                                  optimizer=optimizer,
                                  data_silo=data_silo,
                                  epochs=epochs,
                                  n_gpu=n_gpu,
                                  lr_schedule=lr_schedule,
                                  evaluate_every=eval_every,
                                  early_stopping=early_stopping,
                                  device=device,
                                  grad_acc_steps=grad_acc_steps,
                                  evaluator_test=do_eval)

        def score_callback(eval_score, train_loss):
            tune.report(roc_auc_dev=eval_score, train_loss=train_loss)

        # 8. Train the model
        trainer.train(score_callback=score_callback if do_hpo else None)

        # 9. Save model if not saved in early stopping
        model.save(model_dir + "/final_model")
        processor.save(model_dir + "/final_model")

    if do_eval:
        # Load the newly trained model, or an existing one if training was skipped
        if not do_train:
            # keep model_dir a string: it is concatenated with '/...' paths below
            model_dir = model_name_or_path

        logger.info("###### Eval on TEST SET #####")

        evaluator_test = ExtendedEvaluator(
            data_loader=data_silo.get_data_loader("test"),
            tasks=data_silo.processor.tasks,
            device=device)

        # Load trained model for evaluation
        model = ExtendedAdaptiveModel.load(model_dir, device)
        model.connect_heads_with_processor(data_silo.processor.tasks,
                                           require_labels=True)

        # Evaluate
        results = evaluator_test.eval(model, return_preds_and_labels=True)

        # Log results
        utils.log_results(results,
                          dataset_name="test",
                          steps=len(evaluator_test.data_loader),
                          save_path=model_dir + "/eval_results.txt")

        if print_preds:
            # Print model test predictions
            utils.save_predictions(results,
                                   save_dir=model_dir,
                                   multilabel=task_config["multilabel"])

        if print_dev_preds:
            # Evaluate on dev set, e.g. for threshold tuning
            evaluator_dev = Evaluator(
                data_loader=data_silo.get_data_loader("dev"),
                tasks=data_silo.processor.tasks,
                device=device)
            dev_results = evaluator_dev.eval(model,
                                             return_preds_and_labels=True)
            utils.log_results(dev_results,
                              dataset_name="dev",
                              steps=len(evaluator_dev.data_loader),
                              save_path=model_dir + "/eval_dev_results.txt")

            # Print model dev predictions
            utils.save_predictions(dev_results,
                                   save_dir=model_dir,
                                   multilabel=task_config["multilabel"],
                                   dataset_name="dev")
Example 7
def main():
    args = parse_args()
    setup_logging(args.logfile)

    log = get_logger()
    assert 0 <= args.hidden_fraction <= 1
    
    np.random.seed(args.random_seed)
    tf.set_random_seed(args.random_seed)

    log.info('*' * 100)
    log.info('[Starting MC experiment]')
    log_dict(log.info, vars(args))

    log.info('[Loading input data]')

    with open(args.target_gis, 'rb') as f:
        gi_data = cpkl.load(f)

    row_genes = gi_data['rows']

    log.info('\t- setting up training and test sets')
    train_test_sets = [gi_train_test_split(gi_data, args.hidden_fraction)
                       for _ in range(args.n_repeats)]

    train_Xs, test_Xs, test_masks = zip(*train_test_sets)
    if args.mc_alg == 'NGMC':
        scalers = [MCScaler('0-1') for _ in range(args.n_repeats)]
    else:
        scalers = [MCScaler('std') for _ in range(args.n_repeats)]

    train_Xs = [scaler.fit_transform(X) for scaler, X in zip(scalers, train_Xs)]

    if args.mc_alg == 'PMF':
        imputed_Xs, models_info = train_pmf_models(
            train_Xs=train_Xs,
            rank=args.rank,
            iters=args.iters,
            lr=args.lr,
            lam=args.lambda_f,
            report_every=args.report_every)
    elif args.mc_alg == 'PMF_b':
        imputed_Xs, models_info = train_pmf_b_models(
            train_Xs=train_Xs,
            rank=args.rank,
            iters=args.iters,
            lr=args.lr,
            lam=args.lambda_f,
            lam_b=args.lambda_b,
            report_every=args.report_every)
    elif args.mc_alg == 'KPMF':
        L = get_laplacian(list(row_genes), args.target_ppi)
        imputed_Xs, models_info = train_kpmf_models(
            train_Xs=train_Xs,
            L=L,
            rank=args.rank,
            iters=args.iters,
            lr=args.lr,
            lambda_f=args.lambda_f,
            lambda_h=args.lambda_h,
            rl_lambda=args.rl_lambda,
            report_every=args.report_every)
    elif args.mc_alg == 'KPMF_b':
        L = get_laplacian(list(row_genes), args.target_ppi)
        imputed_Xs, models_info = train_kpmf_b_models(
            train_Xs=train_Xs,
            L=L,
            rank=args.rank,
            iters=args.iters,
            lr=args.lr,
            lambda_b=args.lambda_b,
            lambda_f=args.lambda_f,
            lambda_h=args.lambda_h,
            rl_lambda=args.rl_lambda,
            report_every=args.report_every)
    elif args.mc_alg == 'NGMC':
        ppi = nx.read_edgelist(args.target_ppi)
        A = get_ppi_data(list(row_genes), ppi, mode='normalized_adjacency')
        imputed_Xs, models_info = train_ngmc_models(
            train_Xs=train_Xs,
            A=A,
            rank=args.rank,
            iters=args.iters,
            lr=args.lr,
            alpha_p=args.alpha_p,
            lambda_f=args.lambda_f,
            lambda_h=args.lambda_h,
            lambda_p=args.lambda_p)
    elif args.mc_alg == 'XSMF':
        with open(args.source_gis, 'rb') as f:
            src_gi_data = cpkl.load(f)
        X_src = src_gi_data['values']
        X_src = MCScaler(mode='std').fit_transform(X_src)

        log.info('[Loading sim scores]')
        with open(args.sim_scores, 'rb') as f:
            sim_scores_data = cpkl.load(f)
        sim_scores = sim_scores_data['values']
        sim_scores = sim_scores / np.max(sim_scores)  # Normalize

        imputed_Xs, models_info = train_xsmf_models(
            train_Xs=train_Xs,
            X_src=X_src,
            sim_scores=sim_scores,
            rank=args.rank,
            iters=args.iters,
            lr=args.lr,
            lambda_sim=args.lambda_sim,
            lambda_src=args.lambda_src,
            lambda_u=args.lambda_u,
            lambda_v=args.lambda_v,
            lambda_us=args.lambda_us,
            lambda_vs=args.lambda_vs,
            report_every=args.report_every)
    elif args.mc_alg == 'KXSMF':
        with open(args.source_gis, 'rb') as f:
            src_gi_data = cpkl.load(f)
        X_src = src_gi_data['values']
        X_src = MCScaler(mode='std').fit_transform(X_src)

        log.info('[Loading sim scores]')
        with open(args.sim_scores, 'rb') as f:
            sim_scores_data = cpkl.load(f)
        sim_scores = sim_scores_data['values']
        sim_scores = sim_scores / np.max(sim_scores)  # Normalize

        L_tgt = get_laplacian(list(gi_data['rows']), args.target_ppi)
        L_src = get_laplacian(list(src_gi_data['rows']), args.source_ppi)
        log.warning('%s, %s' % L_src.shape)
        log.warning('%s, %s' % X_src.shape)

        imputed_Xs, models_info = train_kxsmf_models(
            train_Xs=train_Xs,
            X_src=X_src,
            L_tgt=L_tgt,
            L_src=L_src,
            sim_scores=sim_scores,
            rank=args.rank,
            iters=args.iters,
            lr=args.lr,
            lambda_sim=args.lambda_sim,
            lambda_src=args.lambda_src,
            lambda_u=args.lambda_u,
            lambda_v=args.lambda_v,
            lambda_us=args.lambda_us,
            lambda_vs=args.lambda_vs,
            lambda_tgt_rl=args.lambda_tgt_rl,
            lambda_src_rl=args.lambda_src_rl,
            report_every=args.report_every)
    elif args.mc_alg == 'KXSMF_b':
        with open(args.source_gis, 'rb') as f:
            src_gi_data = cpkl.load(f)
        X_src = src_gi_data['values']
        X_src = MCScaler(mode='std').fit_transform(X_src)

        log.info('[Loading sim scores]')
        with open(args.sim_scores, 'rb') as f:
            sim_scores_data = cpkl.load(f)
        sim_scores = sim_scores_data['values']
        sim_scores = sim_scores / np.max(sim_scores)  # Normalize

        L_tgt = get_laplacian(list(gi_data['rows']), args.target_ppi)
        L_src = get_laplacian(list(src_gi_data['rows']), args.source_ppi)
        log.warning('%s, %s' % L_src.shape)
        log.warning('%s, %s' % X_src.shape)

        imputed_Xs, models_info = train_kxsmfb_models(
            train_Xs=train_Xs,
            X_src=X_src,
            L_tgt=L_tgt,
            L_src=L_src,
            sim_scores=sim_scores,
            rank=args.rank,
            iters=args.iters,
            lr=args.lr,
            lambda_b=args.lambda_b,
            lambda_sim=args.lambda_sim,
            lambda_src=args.lambda_src,
            lambda_u=args.lambda_u,
            lambda_v=args.lambda_v,
            lambda_us=args.lambda_us,
            lambda_vs=args.lambda_vs,
            lambda_tgt_rl=args.lambda_tgt_rl,
            lambda_src_rl=args.lambda_src_rl,
            report_every=args.report_every)
    else:
        raise NotImplementedError(
            '{} option is invalid or not implemented'.format(args.mc_alg))
    
    # Note: take transposes here for XSMF, KXSMF
    imputed_Xs = [scaler.inverse_transform(X)
                  for scaler, X in zip(scalers, imputed_Xs)]

    results = evaluate_preds(test_Xs, imputed_Xs, test_masks)
    results, fold_results = summarize_results(results)
    log_results(results)

    results_dict = dict(summary=results, collected=fold_results, args=vars(args))

    pvals_data = None
    if args.pval_file:
        # A p-value file was provided: additionally evaluate on entries
        # deemed significant
        with open(args.pval_file, 'rb') as f:
            pvals_data = cpkl.load(f)
        assert np.all(pvals_data['cols'] == gi_data['cols'])
        assert np.all(pvals_data['rows'] == gi_data['rows'])

        pvals = pvals_data['values']
        # replace NaNs with a large sentinel so they are never significant
        pvals_filled = np.where(np.isnan(pvals), 1000, pvals)
        sig_mask = pvals_filled < args.pval_thresh

        sig_test_Xs = [np.where(sig_mask, _X, np.nan) for _X in test_Xs]
        sig_imputed_Xs = [np.where(sig_mask, _X, np.nan) for _X in imputed_Xs]

        sig_results = evaluate_preds(sig_test_Xs, sig_imputed_Xs, test_masks)
        sig_results, sig_fold_results = summarize_results(sig_results)
        log_results(sig_results)

        results_dict['sig_summary'] = sig_results
        results_dict['sig_collected'] = sig_fold_results

    with open(args.results_output, 'w') as f:
        json.dump(results_dict, f, indent=2)

    serialized_data = {
        'GIs': gi_data,
        'alg': args.mc_alg,
        'fold_data': dict(train_Xs=train_Xs, test_Xs=test_Xs, masks=test_masks),
        'imputed_Xs': imputed_Xs,
        'models_info': models_info,
        'pvals': pvals_data
    }

    with open(args.models_output, 'wb') as f:
        cpkl.dump(serialized_data, f)
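get_laplacian above pairs an ordered gene list with a PPI edge list. A rough sketch of what such a helper can look like with networkx; this is an illustration under stated assumptions, not the project's actual implementation:

import networkx as nx

def get_laplacian_sketch(genes, ppi_path):
    # build a graph over exactly the given genes, in the given order;
    # genes absent from the PPI remain isolated nodes
    ppi = nx.read_edgelist(ppi_path)
    g = nx.Graph()
    g.add_nodes_from(genes)
    g.add_edges_from((u, v) for u, v in ppi.edges() if u in g and v in g)
    return nx.laplacian_matrix(g, nodelist=genes).toarray()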
Example 8
def main():
    args = parse_args()
    setup_logging(args.logfile)

    log = get_logger()

    assert 0 <= args.hidden_fraction <= 1

    np.random.seed(args.random_seed)
    tf.set_random_seed(args.random_seed)
    log.info('*' * 100)
    log.info('[Starting MC experiment]')
    log_dict(log.info, vars(args))
    log.info('[Loading input data]')

    with open(args.input_file, 'rb') as f:
        obj = cpkl.load(f)

    # Set up experiments
    fit_params = None
    if args.mc_alg == 'PMF':
        param_space = pmf_param_space(args)
        run_experiment = run_pmf
    elif args.mc_alg == 'PMF_b':
        param_space = pmfb_param_space(args)
        run_experiment = run_pmfb
    elif args.mc_alg in ['KPMF', 'NGMC', 'KPMF_b']:
        # Experiments that need a PPI network; fail fast instead of hitting a
        # NameError below when no network is given
        if args.ppi is None:
            raise ValueError(
                'A PPI network file is required for {}'.format(args.mc_alg))
        ppi = nx.read_edgelist(args.ppi)

        if args.mc_alg == 'KPMF':
            L = get_ppi_data(obj['rows'], ppi, mode='laplacian')
            param_space = kpmf_param_space(args)
            run_experiment = run_kpmf
            fit_params = dict(L=L)
        elif args.mc_alg == 'KPMF_b':
            L = get_ppi_data(obj['rows'], ppi, mode='laplacian')
            param_space = kpmfb_param_space(args)
            run_experiment = run_kpmfb
            fit_params = dict(L=L)
        elif args.mc_alg == 'NGMC':
            P = get_ppi_data(obj['rows'], ppi, mode='normalized_adjacency')
            fit_params = dict(P=P)
            param_space = ngmc_param_space(args)
            run_experiment = run_ngmc
        else:
            raise NotImplementedError(
                '{} option is invalid or not implemented'.format(args.mc_alg))

    else:
        raise NotImplementedError(
            '{} option is invalid or not implemented'.format(args.mc_alg))

    # Run experimental protocol
    results, models, training_curves, trials = \
        run_experiment(obj,
                       param_space=param_space,
                       fit_params=fit_params,
                       val_hidden_fraction=args.val_hidden_fraction,
                       hidden_fraction=args.hidden_fraction,
                       n_repeats=args.n_repeats,
                       hyperopt_iters=args.n_hyperopt_iters,
                       seed=args.random_seed,
                       logistic=args.logistic)

    # Save results and other information
    log_results(results['summary'])
    with open(args.results_output, 'w') as f:
        json.dump(results, f, indent=2)

    with open(args.training_curve_output, 'wb') as f:
        cpkl.dump(training_curves, f)

    # TODO: save the models; they cannot be pickled at the moment.
    # We will need to implement from_dict and to_dict methods first.
    # Until then, the hyperopt trials are dumped in their place.
    with open(args.models_output, 'wb') as f:
        cpkl.dump(trials, f)

    with open(args.trials_output, 'wb') as f:
        cpkl.dump(trials, f)
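The trials object dumped above is presumably a hyperopt Trials instance, which pickles fine and can be reloaded for inspection later. A short sketch (the path is a placeholder):

import pickle

with open('trials.pkl', 'rb') as f:  # placeholder path
    trials = pickle.load(f)

# hyperopt's Trials records every evaluated point and its loss
print(trials.best_trial['result'])
for t in trials.trials:
    print(t['misc']['vals'], t['result'].get('loss'))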
Example 9
def outcome_pretraining(task_config,
                        model_name,
                        cache_dir,
                        run_name="0",
                        lr=1e-05,
                        warmup_steps=5000,
                        embeds_dropout=0.1,
                        epochs=200,  # large because we use early stopping by default
                        batch_size=20,
                        grad_acc_steps=1,
                        early_stopping_metric="loss",
                        early_stopping_mode="min",
                        early_stopping_patience=10,
                        model_class="Bert",
                        tokenizer_class="BertTokenizer",
                        do_lower_case=True,
                        do_train=True,
                        do_eval=True,
                        do_hpo=False,
                        max_seq_len=512,
                        seed=11,
                        eval_every=500,
                        use_amp=False,
                        use_cuda=True,
                        ):
    # Load task config
    with open(task_config) as f:
        task_config = yaml.safe_load(f)

    data_dir = Path(task_config["data"]["data_dir"])

    # General Settings
    set_all_seeds(seed=seed)
    device, n_gpu = initialize_device_settings(use_cuda=use_cuda, use_amp=use_amp)

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=model_name, tokenizer_class=tokenizer_class,
                               do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a PyTorch Dataset
    processor = OutcomePretrainingProcessor(tokenizer=tokenizer,
                                            max_seq_len=max_seq_len,
                                            data_dir=data_dir,
                                            train_filename=task_config["data"]["train_filename"],
                                            dev_filename=task_config["data"]["dev_filename"],
                                            seed=seed,
                                            max_size_admission=50,
                                            max_size_discharge=50,
                                            cache_dir=cache_dir)

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a
    #    few descriptive statistics of our datasets
    data_silo = OutcomePretrainingDataSilo(
        processor=processor,
        caching=True,
        cache_dir=cache_dir,
        batch_size=batch_size,
        max_multiprocessing_chunksize=200)

    if do_train:

        # Set save dir for experiment output
        save_dir = Path(task_config["output_dir"]) / f'{task_config["experiment_name"]}_{run_name}'

        # Use HPO config args if config is passed
        if do_hpo:
            save_dir = save_dir / tune.session.get_trial_name()
        else:
            exp_name = f"exp_{random.randint(100000, 999999)}"
            save_dir = save_dir / exp_name

        # Create save dir
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        # Setup MLFlow logger
        ml_logger = MLFlowLogger(tracking_uri=task_config["log_dir"])
        ml_logger.init_experiment(experiment_name=task_config["experiment_name"],
                                  run_name=f'{task_config["experiment_name"]}_{run_name}')

        # 4. Create an AdaptiveModel
        # a) which consists of a pretrained language model as a basis

        language_model = LanguageModel.load(model_name, language_model_class=model_class)

        # b) and a NextSentenceHead prediction head, or a TextClassificationHead if it is not a BERT model
        if model_class == "Bert":
            next_sentence_head = NextSentenceHead.load(model_class)
        else:
            next_sentence_head = TextClassificationHead(num_labels=2)

        model = AdaptiveModel(
            language_model=language_model,
            prediction_heads=[next_sentence_head],
            embeds_dropout_prob=embeds_dropout,
            lm_output_types=["per_sequence"],
            device=device,
        )

        # 5. Create an optimizer
        schedule_opts = {"name": "LinearWarmup",
                         "num_warmup_steps": warmup_steps}

        model, optimizer, lr_schedule = initialize_optimizer(
            model=model,
            learning_rate=lr,
            device=device,
            n_batches=len(data_silo.loaders["train"]),
            n_epochs=epochs,
            use_amp=use_amp,
            grad_acc_steps=grad_acc_steps,
            schedule_opts=schedule_opts)

        # 6. Create an early stopping instance
        early_stopping = None
        if early_stopping_mode != "none":
            early_stopping = EarlyStopping(
                mode=early_stopping_mode,
                min_delta=0.0001,
                save_dir=save_dir,
                metric=early_stopping_metric,
                patience=early_stopping_patience
            )

        # 7. Feed everything to the Trainer, which takes care of growing our
        # model into a powerful plant and evaluates it from time to time

        trainer = ExtendedTrainer(
            model=model,
            optimizer=optimizer,
            data_silo=data_silo,
            epochs=epochs,
            n_gpu=n_gpu,
            lr_schedule=lr_schedule,
            evaluate_every=eval_every,
            early_stopping=early_stopping,
            device=device,
            grad_acc_steps=grad_acc_steps,
            evaluator_test=do_eval
        )

        def score_callback(eval_score, train_loss):
            tune.report(roc_auc_dev=eval_score, train_loss=train_loss)

        # 8. Train the model
        trainer.train(score_callback=score_callback if do_hpo else None)

        # 9. Save model if not saved in early stopping
        model.save(save_dir / "final_model")
        processor.save(save_dir / "final_model")

    if do_eval:
        # Load newly trained model or existing model
        if do_train:
            model_dir = save_dir
        else:
            model_dir = Path(model_name)

        logger.info("###### Eval on TEST SET #####")

        evaluator_test = Evaluator(
            data_loader=data_silo.get_data_loader("test"),
            tasks=data_silo.processor.tasks,
            device=device
        )

        # Load trained model for evaluation
        model = AdaptiveModel.load(model_dir, device)
        model.connect_heads_with_processor(data_silo.processor.tasks, require_labels=True)

        # Evaluate
        results = evaluator_test.eval(model, return_preds_and_labels=True)

        # Log results
        utils.log_results(results, dataset_name="test", steps=len(evaluator_test.data_loader),
                          save_path=model_dir / "eval_results.txt")