Example #1
def do_train(dataloaders, params: MinkLocParams, debug=False, visualize=False):
    # Create model class
    s = get_datetime()
    model = model_factory(params)
    model_name = 'model_' + params.model_params.model + '_' + s
    print('Model name: {}'.format(model_name))
    weights_path = create_weights_folder()
    model_pathname = os.path.join(weights_path, model_name)
    if hasattr(model, 'print_info'):
        model.print_info()
    else:
        n_params = sum([param.nelement() for param in model.parameters()])
        print('Number of model parameters: {}'.format(n_params))

    # Move the model to the proper device before configuring the optimizer
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)

    print('Model device: {}'.format(device))

    loss_fn = make_loss(params)

    # Training elements
    if params.weight_decay is None or params.weight_decay == 0:
        optimizer = torch.optim.Adam(model.parameters(), lr=params.lr)
    else:
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=params.lr,
                                     weight_decay=params.weight_decay)

    if params.scheduler is None:
        scheduler = None
    elif params.scheduler == 'CosineAnnealingLR':
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, T_max=params.epochs + 1, eta_min=params.min_lr)
    elif params.scheduler == 'MultiStepLR':
        scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer, params.scheduler_milestones, gamma=0.1)
    else:
        raise NotImplementedError('Unsupported LR scheduler: {}'.format(params.scheduler))

    ###########################################################################
    # Initialize TensorBoard writer
    ###########################################################################

    now = datetime.now()
    logdir = os.path.join("../tf_logs", now.strftime("%Y%m%d-%H%M%S"))
    writer = SummaryWriter(logdir)

    ###########################################################################
    # Training loop
    ###########################################################################

    is_validation_set = 'val' in dataloaders
    if is_validation_set:
        phases = ['train', 'val']
    else:
        phases = ['train']

    # Training statistics
    stats = {'train': [], 'val': [], 'eval': []}

    for epoch in tqdm.tqdm(range(1, params.epochs + 1)):
        for phase in phases:
            if phase == 'train':
                model.train()
            else:
                model.eval()

            running_stats = []  # running stats for the current epoch

            count_batches = 0
            for batch, positives_mask, negatives_mask in dataloaders[phase]:
                # batch is a dict of tensors (one entry per input field, e.g. 'coords');
                # positives_mask / negatives_mask mark the positive and negative pairs for the loss
                count_batches += 1
                batch_stats = {}

                if debug and count_batches > 2:
                    break

                # Move everything to the device except 'coords' which must stay on CPU
                batch = {
                    e: batch[e].to(device) if e != 'coords' else batch[e]
                    for e in batch
                }

                n_positives = torch.sum(positives_mask).item()
                n_negatives = torch.sum(negatives_mask).item()
                if n_positives == 0 or n_negatives == 0:
                    # Skip a batch without positives or negatives
                    print(
                        'WARNING: Skipping batch without positive or negative examples'
                    )
                    continue

                optimizer.zero_grad()
                if visualize:
                    #visualize_batch(batch)
                    pass

                with torch.set_grad_enabled(phase == 'train'):
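                    # Gradients are enabled only in the 'train' phase; 'val' runs without them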
                    # Compute embeddings of all elements
                    embeddings = model(batch)
                    loss, temp_stats, _ = loss_fn(embeddings, positives_mask,
                                                  negatives_mask)

                    temp_stats = tensors_to_numbers(temp_stats)
                    batch_stats.update(temp_stats)
                    batch_stats['loss'] = loss.item()

                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                running_stats.append(batch_stats)
                # Prevent excessive GPU memory consumption by SparseTensors
                torch.cuda.empty_cache()

            # ******* PHASE END *******
            # Compute mean stats for the epoch
            epoch_stats = {}
            for key in running_stats[0].keys():
                temp = [e[key] for e in running_stats]
                epoch_stats[key] = np.mean(temp)

            stats[phase].append(epoch_stats)
            print_stats(epoch_stats, phase)

        # ******* EPOCH END *******

        if scheduler is not None:
            scheduler.step()

        loss_metrics = {'train': stats['train'][-1]['loss']}
        if 'val' in phases:
            loss_metrics['val'] = stats['val'][-1]['loss']
        writer.add_scalars('Loss', loss_metrics, epoch)

        if 'num_triplets' in stats['train'][-1]:
            nz_metrics = {'train': stats['train'][-1]['num_non_zero_triplets']}
            if 'val' in phases:
                nz_metrics['val'] = stats['val'][-1]['num_non_zero_triplets']
            writer.add_scalars('Non-zero triplets', nz_metrics, epoch)

        elif 'num_pairs' in stats['train'][-1]:
            nz_metrics = {
                'train_pos': stats['train'][-1]['pos_pairs_above_threshold'],
                'train_neg': stats['train'][-1]['neg_pairs_above_threshold']
            }
            if 'val' in phases:
                nz_metrics['val_pos'] = stats['val'][-1]['pos_pairs_above_threshold']
                nz_metrics['val_neg'] = stats['val'][-1]['neg_pairs_above_threshold']
            writer.add_scalars('Non-zero pairs', nz_metrics, epoch)

        if params.batch_expansion_th is not None:
            # Dynamic batch expansion
            epoch_train_stats = stats['train'][-1]
            if 'num_non_zero_triplets' not in epoch_train_stats:
                print(
                    'WARNING: Batch size expansion is enabled, but the loss function is not supported'
                )
            else:
                # Ratio of non-zero triplets
                rnz = (epoch_train_stats['num_non_zero_triplets'] /
                       epoch_train_stats['num_triplets'])
                if rnz < params.batch_expansion_th:
                    dataloaders['train'].batch_sampler.expand_batch()

    print('')

    # Save final model weights
    final_model_path = model_pathname + '_final.pth'
    torch.save(model.state_dict(), final_model_path)

    stats = {'train_stats': stats, 'params': params}

    # Evaluate the final model
    model.eval()
    final_eval_stats = evaluate(model, device, params)
    print('Final model:')
    print_eval_stats(final_eval_stats)
    stats['eval'] = {'final': final_eval_stats}
    print('')

    # Pickle training stats and parameters
    pickle_path = model_pathname + '_stats.pickle'
    with open(pickle_path, "wb") as f:
        pickle.dump(stats, f)

    # Append key experimental metrics to experiment summary file
    model_params_name = os.path.split(params.model_params.model_params_path)[1]
    config_name = os.path.split(params.params_path)[1]
    _, model_name = os.path.split(model_pathname)
    prefix = "{}, {}, {}".format(model_params_name, config_name, model_name)
    export_eval_stats("experiment_results.txt", prefix, final_eval_stats)


def main(args: Namespace):
    if args.unconditional:
        assert args.morgan_similarity_threshold == 0  # shouldn't care about inputs in this case

    i2s = None

    if args.checkpoint_dir is not None:
        assert args.checkpoint_path is None
        for root, _, files in os.walk(args.checkpoint_dir):
            for fname in files:
                if fname.endswith('.pt'):
                    args.checkpoint_path = os.path.join(root, fname)

    if args.checkpoint_path is not None:
        print('loading model from checkpoint')
        model, i2s = load_model(args)

    full_train_dataset = PairDataset(
        path=args.train_path,
        i2s=i2s,
        batch_size=args.batch_size,
        extra_vocab_path=args.extra_precursors_path,
        max_data=args.train_max_data)
    pair_datasets = full_train_dataset.split([0.9, 0.1], seed=0)
    train_dataset, val_dataset = pair_datasets[0], pair_datasets[1]
    predict_dataset = SourceDataset(path=args.val_path,
                                    i2s=train_dataset.i2s,
                                    s2i=train_dataset.s2i,
                                    pad_index=train_dataset.pad_index,
                                    start_index=train_dataset.start_index,
                                    end_index=train_dataset.end_index,
                                    batch_size=args.batch_size)

    if args.checkpoint_path is None:
        print('building model from scratch')
        model = Model(args=args,
                      vocab_size=len(train_dataset.i2s),
                      pad_index=train_dataset.pad_index,
                      start_index=train_dataset.start_index,
                      end_index=train_dataset.end_index)
        for param in model.parameters():
            if param.dim() == 1:
                nn.init.constant_(param, 0)
            else:
                nn.init.xavier_normal_(param)

    print(model)
    print('num params: {:,}'.format(
        sum(p.numel() for p in model.parameters() if p.requires_grad)))
    model = model.cuda()

    chemprop_predictor = ChempropPredictor(args)

    criterion = LossFunction(train_dataset.pad_index, args.kl_weight)
    optimizer = optim.Adam(model.parameters(), lr=args.init_lr)
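    # Multiply the learning rate by 0.9 after each epoch (scheduler.step() below)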
    scheduler = lr_scheduler.ExponentialLR(optimizer, 0.9)

    for epoch in range(args.epochs):
        print('epoch {}'.format(epoch))
        train_dataset.reshuffle(seed=epoch)
        train(model=model,
              train_dataset=train_dataset,
              criterion=criterion,
              optimizer=optimizer,
              max_grad_norm=args.max_grad_norm)
        val_loss = validate(model=model,
                            val_dataset=val_dataset,
                            criterion=criterion)
        os.makedirs(os.path.join(args.save_dir, 'epoch' + str(epoch)),
                    exist_ok=True)
        train_dataset.save(
            os.path.join(args.save_dir, 'epoch' + str(epoch),
                         'train_pairs.csv'))
        save_model(model=model,
                   i2s=train_dataset.i2s,
                   path=os.path.join(args.save_dir, 'epoch' + str(epoch),
                                     'val_loss_{}.pt'.format(val_loss)))
        predict(model=model,
                predict_dataset=predict_dataset,
                save_dir=os.path.join(args.save_dir, 'epoch' + str(epoch)),
                args=args,
                chemprop_predictor=chemprop_predictor
                if not args.no_predictor_at_val else None,
                sample=not args.greedy_prediction,
                num_predictions=args.val_num_predictions,
                print_filter_frac=args.print_filter_frac)
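        # Run the full generation/property evaluation only every args.evaluate_every epochs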
        if epoch % args.evaluate_every == 0:
            evaluate(pred_smiles_dir=os.path.join(args.save_dir,
                                                  'epoch' + str(epoch)),
                     train_path=args.train_path,
                     val_path=args.val_path,
                     checkpoint_dir=args.chemprop_dir,
                     computed_prop=args.computed_prop,
                     prop_min=args.prop_min,
                     sim_thresholds=[0.2, 0.4, 0.6, 0.8, 0.9, 1.0],
                     chemprop_predictor=chemprop_predictor,
                     prop_max=args.prop_max,
                     unconditional=args.unconditional)
        scheduler.step()

    if args.self_train_epochs > 0:
        # store parameters of current model for a loss to constrain it not to stray too far
        original_parameter_vector = parameters_to_vector(
            model.parameters()).data
        parameter_crit = nn.MSELoss()

        args.epoch_length = len(train_dataset.src) // 2

        # Get properties of target molecules in train set
        train_dataset.tgt_props = np.array(
            chemprop_predictor(train_dataset.tgt_smiles))

        augmented_train_dataset = deepcopy(train_dataset)

        epochs_to_dataset_creation = 0
        for epoch in range(args.epochs, args.epochs + args.self_train_epochs):
            print('self train epoch {}'.format(epoch))

            if epochs_to_dataset_creation == 0:
                train_dataset.reshuffle(seed=epoch)
                if args.self_train_max_data is not None:
                    self_train_dataset = deepcopy(train_dataset)
                    max_data = args.self_train_max_data
                    self_train_dataset.src = self_train_dataset.src[:max_data]
                    self_train_dataset.tgt = self_train_dataset.tgt[:max_data]
                    self_train_dataset.src_smiles = self_train_dataset.src_smiles[:max_data]
                    self_train_dataset.tgt_smiles = self_train_dataset.tgt_smiles[:max_data]
                    if hasattr(self_train_dataset, 'src_props'):
                        self_train_dataset.src_props = self_train_dataset.src_props[:max_data]
                    if hasattr(self_train_dataset, 'tgt_props'):
                        self_train_dataset.tgt_props = self_train_dataset.tgt_props[:max_data]
                else:
                    self_train_dataset = deepcopy(train_dataset)
                if args.extra_precursors_path is not None:
                    self_train_dataset.add_dummy_pairs(
                        args.extra_precursors_path)
                translations, props = generate_self_train_translations(
                    train_dataset=self_train_dataset,
                    model=model,
                    chemprop_predictor=chemprop_predictor,
                    args=args,
                    k=args.k)

                if not args.keep_translations:  # drop old translations and restart
                    augmented_train_dataset = deepcopy(self_train_dataset)

                if args.unconditional:
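                    # Unconditional setting: generated translations become the targets
                    # (sources are filled with the same strings as dummies)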
                    new_train_dataset = deepcopy(self_train_dataset)
                    new_train_dataset.tgt_smiles = translations
                    new_train_dataset.tgt = [
                        list(self_train_dataset.smiles2indices(smiles))
                        for smiles in new_train_dataset.tgt_smiles
                    ]
                    new_train_dataset.tgt = np.array(new_train_dataset.tgt)
                    new_train_dataset.src_smiles = translations  # any dummy is fine
                    new_train_dataset.src = [
                        list(self_train_dataset.smiles2indices(smiles))
                        for smiles in new_train_dataset.src_smiles
                    ]
                    new_train_dataset.src = np.array(new_train_dataset.src)
                else:
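                    # Conditional setting: replicate each source k times and pair it with
                    # its k generated translations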
                    new_train_dataset = deepcopy(self_train_dataset)
                    new_train_dataset.src = np.concatenate(
                        [self_train_dataset.src for _ in range(args.k)])
                    new_train_dataset.src_smiles = []
                    for _ in range(args.k):
                        new_train_dataset.src_smiles += self_train_dataset.src_smiles
                    new_train_dataset.tgt = []
                    for i in range(args.k):
                        new_train_dataset.tgt += [
                            translations[j][i]
                            for j in range(len(translations))
                        ]
                    new_train_dataset.tgt_smiles = [
                        self_train_dataset.indices2smiles(indices)
                        for indices in new_train_dataset.tgt
                    ]
                    new_train_dataset.tgt = np.array(new_train_dataset.tgt)
                if args.replace_old_dataset:
                    augmented_train_dataset = new_train_dataset
                else:
                    augmented_train_dataset.add(new_train_dataset)

                if not args.unconditional:
                    augmented_train_dataset.filter_dummy_pairs(
                        need_props=False)  # filters src == tgt pairs
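                # Reuse this augmented dataset for args.epochs_per_dataset epochs before regenerating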
                epochs_to_dataset_creation = args.epochs_per_dataset

            augmented_train_dataset.reshuffle(seed=epoch, need_props=False)
            epochs_to_dataset_creation -= 1
            train(model=model,
                  train_dataset=augmented_train_dataset,
                  criterion=criterion,
                  optimizer=optimizer,
                  max_grad_norm=args.max_grad_norm,
                  original_parameter_vector=original_parameter_vector,
                  parameter_crit=parameter_crit,
                  parameter_crit_weight=args.l2_diff_weight)
            val_loss = validate(model=model,
                                val_dataset=val_dataset,
                                criterion=criterion)
            os.makedirs(os.path.join(args.save_dir, 'epoch' + str(epoch)),
                        exist_ok=True)
            augmented_train_dataset.save(
                os.path.join(args.save_dir, 'epoch' + str(epoch),
                             'train_pairs.csv'))
            save_model(model=model,
                       i2s=train_dataset.i2s,
                       path=os.path.join(args.save_dir, 'epoch' + str(epoch),
                                         'val_loss_{}.pt'.format(val_loss)))
            predict(model=model,
                    predict_dataset=predict_dataset,
                    save_dir=os.path.join(args.save_dir, 'epoch' + str(epoch)),
                    args=args,
                    chemprop_predictor=chemprop_predictor
                    if not args.no_predictor_at_val else None,
                    sample=not args.greedy_prediction,
                    num_predictions=args.val_num_predictions,
                    print_filter_frac=args.print_filter_frac)
            evaluate(pred_smiles_dir=os.path.join(args.save_dir,
                                                  'epoch' + str(epoch)),
                     train_path=args.train_path,
                     val_path=args.val_path,
                     checkpoint_dir=args.chemprop_dir,
                     computed_prop=args.computed_prop,
                     prop_min=args.prop_min,
                     sim_thresholds=[0.2, 0.4, 0.6, 0.8, 0.9, 1.0],
                     chemprop_predictor=chemprop_predictor,
                     prop_max=args.prop_max,
                     unconditional=args.unconditional)
            scheduler.step()

    # Final evaluation on the held-out test set
    os.makedirs(os.path.join(args.save_dir, 'final_eval'), exist_ok=True)
    test_dataset = SourceDataset(path=args.test_path,
                                 i2s=train_dataset.i2s,
                                 s2i=train_dataset.s2i,
                                 pad_index=train_dataset.pad_index,
                                 start_index=train_dataset.start_index,
                                 end_index=train_dataset.end_index,
                                 batch_size=args.batch_size)
    predict(model=model,
            predict_dataset=test_dataset,
            save_dir=os.path.join(args.save_dir, 'final_eval'),
            args=args,
            chemprop_predictor=chemprop_predictor
            if not args.no_predictor_at_val else None,
            sample=not args.greedy_prediction,
            num_predictions=args.val_num_predictions,
            print_filter_frac=args.print_filter_frac)
    if args.final_eval_chemprop_dir is not None:
        args.computed_prop = None
        args.chemprop_dir = args.final_eval_chemprop_dir
        chemprop_predictor = ChempropPredictor(args)
    if args.final_eval_computed_prop is not None:
        args.chemprop_dir = None
        args.computed_prop = args.final_eval_computed_prop
        chemprop_predictor = ChempropPredictor(args)
    evaluate(pred_smiles_dir=os.path.join(args.save_dir, 'final_eval'),
             train_path=args.train_path,
             val_path=args.test_path,
             checkpoint_dir=args.chemprop_dir,
             computed_prop=args.computed_prop,
             prop_min=args.prop_min,
             sim_thresholds=[0.2, 0.4, 0.6, 0.8, 0.9, 1.0],
             chemprop_predictor=chemprop_predictor,
             prop_max=args.prop_max,
             unconditional=args.unconditional)