Example 1
def generate_fingerprints(args: Namespace, logger: Logger = None) -> List[List[float]]:
    """
    Generate the fingerprints.

    :param logger:
    :param args: Arguments.
    :return: A list of lists of target fingerprints.
    """

    checkpoint_path = args.checkpoint_paths[0]
    if logger is None:
        logger = create_logger('fingerprints', quiet=False)
    print('Loading data')
    test_data = get_data(path=args.data_path,
                         args=args,
                         use_compound_names=False,
                         max_data_size=float("inf"),
                         skip_invalid_smiles=False)
    test_data = MoleculeDataset(test_data)

    logger.info(f'Total size = {len(test_data):,}')
    logger.info('Generating...')
    # Load model
    model = load_checkpoint(checkpoint_path, cuda=args.cuda, current_args=args, logger=logger)
    model_preds = do_generate(
        model=model,
        data=test_data,
        args=args
    )

    return model_preds
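A minimal usage sketch for the function above. The Namespace fields shown are assumptions standing in for what the fingerprint parser would normally build, and the final save call mirrors the fingerprint branch in Example 3.

from argparse import Namespace

import numpy as np

# Hypothetical argument values; real runs get these from the CLI parser.
args = Namespace(
    checkpoint_paths=['model/fold_0/model.pt'],  # assumed checkpoint location
    data_path='data/test.csv',                   # assumed CSV of input SMILES
    output_path='fingerprints.npz',
    cuda=False,
)

fps = generate_fingerprints(args, logger=None)
np.savez_compressed(args.output_path, fps=fps)   # same save step as in Example 3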
Example 2
def generate_embeddings(args: Namespace, logger: Logger = None) -> None:
    """
    Generate atom and bond embeddings and save one pickle file per molecule.

    :param args: Arguments.
    :param logger: Logger; a default one is created if it is None.
    """
    if not os.path.exists(args.output_path):
        os.mkdir(args.output_path)

    checkpoint_path = args.checkpoint_paths[0]
    if logger is None:
        logger = create_logger('fingerprints', quiet=False)
    print('Loading data...')

    test_data = get_data(path=args.data_path,
                         args=args,
                         use_compound_names=True,
                         skip_invalid_smiles=False)
    molecule_ids = test_data.compound_names()

    logger.info(f'Total size = {len(test_data):,}')
    logger.info('Generating...')
    # Load model
    model = load_checkpoint(checkpoint_path,
                            cuda=args.cuda,
                            current_args=args,
                            logger=logger)

    # Switch to evaluation mode and disable bond dropping for inference.
    model.eval()
    args.bond_drop_rate = 0

    mol_collator = MolCollator(args=args, shared_dict={})

    mol_loader = DataLoader(test_data,
                            batch_size=args.batch_size,
                            shuffle=False,
                            num_workers=4,
                            collate_fn=mol_collator)
    curr_row = 0
    for item in mol_loader:
        # Only the packed graph batch and the atom/bond scopes (each molecule's
        # slice within the packed graph) are needed from the collated item.
        _, batch, _, _, _ = item
        _, _, _, _, _, a_scope, b_scope, _ = batch
        batch_preds = model(batch)
        batch_preds = {
            k: v.detach().cpu().numpy()
            for k, v in batch_preds.items()
        }
        atom_embeddings = np.hstack(
            [batch_preds['atom_from_atom'], batch_preds['atom_from_bond']])
        bond_embeddings = np.hstack(
            [batch_preds['bond_from_bond'], batch_preds['bond_from_atom']])

        # save each molecule embedding separately
        for i, ((a_start, a_len), (b_start, b_len)) in enumerate(zip(a_scope, b_scope)):
            mol_id = molecule_ids[curr_row]
            mol_id = str(curr_row) if mol_id is None else mol_id
            curr_row += 1
            save_path = os.path.join(args.output_path,
                                     f'{mol_id}_embeddings.pkl')
            # Scope indices are offset by 1 because position 0 in the packed graph is padding.
            atom_emb = atom_embeddings[a_start - 1:a_start + a_len - 1]
            bond_emb = bond_embeddings[b_start - 1:b_start + b_len - 1]
            with open(save_path, 'wb') as handle:
                pickle.dump({'atoms': atom_emb, 'bonds': bond_emb}, handle)
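Reading one of the per-molecule pickles back needs nothing beyond the standard library. A short sketch, assuming the output directory and molecule id below actually exist from a previous run:

import os
import pickle

output_path = 'embeddings_out'   # assumed: whatever was passed as args.output_path
mol_id = 'mol_0'                 # assumed: a compound name (or row index) from the input data

with open(os.path.join(output_path, f'{mol_id}_embeddings.pkl'), 'rb') as handle:
    emb = pickle.load(handle)

# 'atoms': atom_from_atom and atom_from_bond stacked column-wise, one row per atom.
# 'bonds': bond_from_bond and bond_from_atom stacked column-wise, one row per bond.
print(emb['atoms'].shape, emb['bonds'].shape)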
Example 3
if __name__ == '__main__':
    # setup random seed
    setup(seed=42)
    # Reference MolVocab so the import is not flagged as unused.
    mol_vocab = MolVocab
    # Suppress the RDKit logger.
    lg = RDLogger.logger()
    lg.setLevel(RDLogger.CRITICAL)

    args = parse_args()
    if args.parser_name == 'finetune':
        logger = create_logger(name='train',
                               save_dir=args.save_dir,
                               quiet=False)
        cross_validate(args, logger)
    elif args.parser_name == 'pretrain':
        logger = create_logger(name='pretrain', save_dir=args.save_dir)
        pretrain_model(args, logger)
    elif args.parser_name == "eval":
        logger = create_logger(name='eval',
                               save_dir=args.save_dir,
                               quiet=False)
        cross_validate(args, logger)
    elif args.parser_name == 'fingerprint':
        train_args = get_newest_train_args()
        logger = create_logger(name='fingerprint', save_dir=None, quiet=False)
        feas = generate_fingerprints(args, logger)
        np.savez_compressed(args.output_path, fps=feas)
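The fingerprint branch above writes a compressed NumPy archive under the key fps; loading it back is a one-liner. The file name here is an assumption standing in for args.output_path.

import numpy as np

fps = np.load('fingerprints.npz')['fps']   # one row of fingerprint values per molecule
print(fps.shape)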
Example 4
def make_predictions(args: Namespace, newest_train_args=None, smiles: List[str] = None):
    """
    Makes predictions. If smiles is provided, makes predictions on smiles.
    Otherwise makes predictions on args.test_data.

    :param args: Arguments.
    :param smiles: Smiles to make predictions on.
    :return: A list of lists of target predictions.
    """
    if args.gpu is not None:
        torch.cuda.set_device(args.gpu)

    print('Loading training args')

    path = args.checkpoint_paths[0]
    scaler, features_scaler = load_scalars(path)
    train_args = load_args(path)

    # Update args with training arguments saved in checkpoint
    for key, value in vars(train_args).items():
        if not hasattr(args, key):
            setattr(args, key, value)

    # update args with newest training args
    if newest_train_args is not None:
        for key, value in vars(newest_train_args).items():
            if not hasattr(args, key):
                setattr(args, key, value)

    # Work around a multiprocessing problem.
    args.debug = True

    logger = create_logger('predict', quiet=False)
    print('Loading data')
    args.task_names = get_task_names(args.data_path)
    if smiles is not None:
        test_data = get_data_from_smiles(smiles=smiles, skip_invalid_smiles=False)
    else:
        test_data = get_data(path=args.data_path, args=args,
                             use_compound_names=args.use_compound_names, skip_invalid_smiles=False)


    args.num_tasks = test_data.num_tasks()
    args.features_size = test_data.features_size()

    print('Validating SMILES')
    valid_indices = list(range(len(test_data)))
    full_data = test_data
    test_data = MoleculeDataset([test_data[i] for i in valid_indices])

    # Edge case if empty list of smiles is provided
    if len(test_data) == 0:
        return [None] * len(full_data)

    print(f'Test size = {len(test_data):,}')

    # Normalize features
    if hasattr(train_args, 'features_scaling') and train_args.features_scaling:
        test_data.normalize_features(features_scaler)

    # Predict with each model individually and sum predictions
    if hasattr(args, 'num_tasks'):
        sum_preds = np.zeros((len(test_data), args.num_tasks))
    print('Predicting...')
    shared_dict = {}
    # loss_func = torch.nn.BCEWithLogitsLoss()
    count = 0
    for checkpoint_path in tqdm(args.checkpoint_paths, total=len(args.checkpoint_paths)):
        # Load model
        model = load_checkpoint(checkpoint_path, cuda=args.cuda, current_args=args, logger=logger)
        model_preds, _ = predict(
            model=model,
            data=test_data,
            batch_size=args.batch_size,
            scaler=scaler,
            shared_dict=shared_dict,
            args=args,
            logger=logger,
            loss_func=None
        )

        if args.fingerprint:
            return model_preds

        sum_preds += np.array(model_preds, dtype=float)
        count += 1

    # Ensemble predictions
    avg_preds = sum_preds / len(args.checkpoint_paths)

    # Sanity check: one prediction row per test molecule.
    assert len(test_data) == len(avg_preds)

    # Record which input rows were valid.
    args.valid_indices = valid_indices
    avg_preds = np.array(avg_preds)
    test_smiles = full_data.smiles()
    return avg_preds, test_smiles
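A short sketch of consuming the return value. The task-name columns rely on args.task_names, which make_predictions sets from the data file; pandas and the output file name are assumptions, not part of the example above. Note that when args.fingerprint is set, the function instead returns the raw model predictions.

import pandas as pd

avg_preds, test_smiles = make_predictions(args)          # args prepared by the prediction parser

df = pd.DataFrame(avg_preds, columns=args.task_names)    # task names were set inside make_predictions
df.insert(0, 'smiles', test_smiles)
df.to_csv('predictions.csv', index=False)                # assumed output location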