Example #1
def load_gcnn_model():
    # global rlm_gcnn_scaler, rlm_gcnn_features_scaler
    # gcnn_scaler, gcnn_features_scaler = load_scalers('./models/gcnn_model.pt')
    print(f'Loading RLM graph convolutional neural network model',
          file=sys.stdout)
    rlm_gcnn_scaler_path = f'{rlm_base_models_path}/gcnn_model.pt'
    if path.exists(rlm_gcnn_scaler_path):
        rlm_gcnn_scaler, _ = load_scalers(rlm_gcnn_scaler_path)
    else:
        rlm_gcnn_scaler_url = f'{base_url}/gcnn_model.pt'
        rlm_gcnn_scaler_request = requests.get(rlm_gcnn_scaler_url)
        with tqdm.wrapattr(open(os.devnull, "wb"),
                           "write",
                           miniters=1,
                           desc=rlm_gcnn_scaler_url.split('/')[-1],
                           total=int(
                               rlm_gcnn_scaler_request.headers.get(
                                   'content-length', 0))) as fout:
            for chunk in rlm_gcnn_scaler_request.iter_content(chunk_size=4096):
                fout.write(chunk)
        with open(rlm_gcnn_scaler_path, 'wb') as rlm_gcnn_scaler_file:
            rlm_gcnn_scaler_file.write(rlm_gcnn_scaler_request.content)
        rlm_gcnn_scaler, _ = load_scalers(rlm_gcnn_scaler_path)

    rlm_gcnn_model = load_checkpoint(rlm_gcnn_scaler_path)

    return rlm_gcnn_scaler, rlm_gcnn_model
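A minimal call-site sketch for the helper above. The imports and module-level globals shown here are assumptions about how the original module is laid out; adjust them to your deployment.

# Sketch only: assumed imports and globals for load_gcnn_model().
import os
import sys
from os import path

import requests
from tqdm import tqdm
from chemprop.utils import load_checkpoint, load_scalers

rlm_base_models_path = './models'          # assumed local cache directory
base_url = 'https://example.org/models'    # hypothetical download mirror

rlm_gcnn_scaler, rlm_gcnn_model = load_gcnn_model()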
Example #2
def load_model(args: PredictArgs, generator: bool = False):
    """
    Function to load a model or ensemble of models from file. If generator is True, a generator of the respective model and scaler 
    objects is returned (memory efficient), else the full list (holding all models in memory, necessary for preloading).

    :param args: A :class:`~chemprop.args.PredictArgs` object containing arguments for
                 loading data and a model and making predictions.
    :param generator: A boolean to return a generator instead of a list of models and scalers.
    :return: A tuple of updated prediction arguments, training arguments, a list or generator object of models, a list or 
                 generator object of scalers, the number of tasks and their respective names.
    """
    print('Loading training args')
    train_args = load_args(args.checkpoint_paths[0])
    num_tasks, task_names = train_args.num_tasks, train_args.task_names

    update_prediction_args(predict_args=args, train_args=train_args)
    args: Union[PredictArgs, TrainArgs]

    # Load model and scalers
    models = (load_checkpoint(checkpoint_path, device=args.device) for checkpoint_path in args.checkpoint_paths)
    scalers = (load_scalers(checkpoint_path) for checkpoint_path in args.checkpoint_paths)
    if not generator:
        models = list(models)
        scalers = list(scalers)

    return args, train_args, models, scalers, num_tasks, task_names
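A possible driver for load_model, sketched with chemprop's PredictArgs; the file paths are placeholders, and generator=True keeps only one checkpoint in memory at a time.

from chemprop.args import PredictArgs

args = PredictArgs().parse_args([
    '--test_path', 'data/test.csv',        # hypothetical input file
    '--preds_path', 'preds.csv',           # hypothetical output file
    '--checkpoint_dir', 'ckpts',           # directory containing *.pt checkpoints
])
args, train_args, models, scalers, num_tasks, task_names = load_model(args, generator=True)
for model, scaler_tuple in zip(models, scalers):
    pass  # run predictions with this ensemble member, then let it be garbage-collected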
Example #3
def predict_smile(checkpoint_path: str, smile: str):
    """
    Makes a prediction for a single SMILES string using the model stored at checkpoint_path.

    :param checkpoint_path: Path to a trained model checkpoint (.pt file).
    :param smile: SMILES string to make a prediction on.
    :return: The prediction for the first task of the molecule.
    """
    smiles = [smile]
    args = Namespace()
    # print('Loading training args')
    scaler, features_scaler = load_scalers(checkpoint_path)
    train_args = load_args(checkpoint_path)

    # Update args with training arguments
    for key, value in vars(train_args).items():
        if not hasattr(args, key):
            setattr(args, key, value)

    # print('Loading data')
    if smiles is not None:
        test_data = get_data_from_smiles(smiles=smiles,
                                         skip_invalid_smiles=False)
    else:
        print("Enter Valid Smile String")
        return

    # print('Validating SMILES')
    valid_indices = [
        i for i in range(len(test_data)) if test_data[i].mol is not None
    ]
    full_data = test_data
    test_data = MoleculeDataset([test_data[i] for i in valid_indices])

    # Edge case if empty list of smiles is provided
    if len(test_data) == 0:
        return [None] * len(full_data)

    # Normalize features
    if train_args.features_scaling:
        test_data.normalize_features(features_scaler)

    # Predict with each model individually and sum predictions
    if args.dataset_type == 'multiclass':
        sum_preds = np.zeros(
            (len(test_data), args.num_tasks, args.multiclass_num_classes))
    else:
        sum_preds = np.zeros((len(test_data), args.num_tasks))

    model = load_checkpoint(checkpoint_path, cuda=args.cuda)
    model_preds = predict(model=model,
                          data=test_data,
                          batch_size=1,
                          scaler=scaler)
    sum_preds += np.array(model_preds)

    # Ensemble predictions
    return sum_preds[0][0]
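A hedged usage sketch: the checkpoint location is an assumption and 'CCO' (ethanol) is just an illustrative SMILES.

pred = predict_smile('./ckpts/model.pt', 'CCO')   # hypothetical checkpoint path
print(f'Predicted value: {pred}')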
Example #4
    def __init__(self, checkpoint_dir):
        self.checkpoints = []
        for root, _, files in os.walk(checkpoint_dir):
            for fname in files:
                if fname.endswith('.pt'):
                    fname = os.path.join(root, fname)
                    self.scaler, self.features_scaler = load_scalers(fname)
                    self.train_args = load_args(fname)
                    model = load_checkpoint(fname, cuda=True)
                    self.checkpoints.append(model)
Example #5
def load_gcnn_model(model_file_path, model_file_url):
    if path.exists(model_file_path):
        gcnn_scaler, _ = load_scalers(model_file_path)
    else:
        gcnn_scaler_request = requests.get(model_file_url)
        with tqdm.wrapattr(
                open(os.devnull, "wb"),
                "write",
                miniters=1,
                desc=model_file_url.split('/')[-1],
                total=int(gcnn_scaler_request.headers.get('content-length',
                                                          0))) as fout:
            for chunk in gcnn_scaler_request.iter_content(chunk_size=4096):
                fout.write(chunk)
        with open(model_file_path, 'wb') as gcnn_scaler_file:
            gcnn_scaler_file.write(gcnn_scaler_request.content)
        gcnn_scaler, _ = load_scalers(model_file_path)

    gcnn_model = load_checkpoint(model_file_path)

    return gcnn_scaler, gcnn_model
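This is the same download-or-load pattern as Example #1, but parameterized. A sketch of calling it with assumed locations:

gcnn_scaler, gcnn_model = load_gcnn_model(
    './models/gcnn_model.pt',                        # assumed local cache path
    'https://example.org/models/gcnn_model.pt')      # hypothetical remote URL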
Example #6
    def __init__(self, checkpoint_dir):
        self.features_generator = ['rdkit_2d_normalized']
        self.checkpoints, self.scalers, self.features_scalers = [], [], []
        for root, _, files in os.walk(checkpoint_dir):
            for fname in files:
                if fname.endswith('.pt'):
                    fname = os.path.join(root, fname)
                    scaler, features_scaler = load_scalers(fname)
                    self.scalers.append(scaler)
                    self.features_scalers.append(features_scaler)
                    model = load_checkpoint(fname)
                    self.checkpoints.append(model)
Example #7
    def __init__(self, args: InterpretArgs) -> None:
        self.args = args
        self.train_args = load_args(args.checkpoint_paths[0])

        # If features were used during training, they must be used when predicting
        if ((self.train_args.features_path is not None or self.train_args.features_generator is not None)
                and args.features_generator is None):
            raise ValueError('Features were used during training so they must be specified again during prediction '
                             'using the same type of features as before (with --features_generator <generator> '
                             'and using --no_features_scaling if applicable).')

        self.scaler, self.features_scaler = load_scalers(args.checkpoint_paths[0])
        self.checkpoints = [load_checkpoint(checkpoint_path, device=args.device) for checkpoint_path in args.checkpoint_paths]
Example #8
    def __init__(self, checkpoint_paths: List[str],
                 device: torch.device) -> None:
        self.train_args = load_args(checkpoint_paths[0])

        if self.train_args.features_path is not None and self.train_args.features_generator is None:
            raise ValueError(
                'Must specify features generator using --features_generator <generator> '
                'when using a model trained with additional features.')

        self.scaler, self.features_scaler = load_scalers(checkpoint_paths[0])
        self.checkpoints = [
            load_checkpoint(checkpoint_path, device=device)
            for checkpoint_path in checkpoint_paths
        ]
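A brief construction sketch for this initializer; EnsemblePredictor is a stand-in name for whichever class the method belongs to, and the checkpoint layout is an assumption.

import glob

import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
checkpoint_paths = sorted(glob.glob('ckpts/**/*.pt', recursive=True))  # assumed layout
predictor = EnsemblePredictor(checkpoint_paths=checkpoint_paths, device=device)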
Example #9
    def __init__(self, checkpoint_dir, features_generator=None):
        self.features_generator = features_generator
        self.checkpoints, self.scalers, self.features_scalers = [], [], []
        for root, _, files in os.walk(checkpoint_dir):
            for fname in files:
                if fname.endswith('.pt'):
                    fname = os.path.join(root, fname)

                    scaler, features_scaler, _, _ = load_scalers(fname)
                    self.scalers.append(scaler)
                    self.features_scalers.append(features_scaler)

                    model = load_checkpoint(fname, device=torch.device('cpu'))
                    self.checkpoints.append(model)
Example #10
    def __init__(self, args: Namespace):

        if args.computed_prop is not None:
            self.computed_prop = True

            if args.computed_prop == 'penalized_logp':
                self.scorer = penalized_logp
            elif args.computed_prop == 'logp':
                self.scorer = logp
            elif args.computed_prop == 'qed':
                self.scorer = qed
            elif args.computed_prop == 'sascore':
                self.scorer = sascore
            elif args.computed_prop == 'drd2':
                self.scorer = drd2
            else:
                raise ValueError
            return

        self.computed_prop = False

        chemprop_paths = []
        for root, _, files in os.walk(args.chemprop_dir):
            for fname in files:
                if fname.endswith('.pt'):
                    chemprop_paths.append(os.path.join(root, fname))

        self.scaler, self.features_scaler = load_scalers(chemprop_paths[0])
        self.train_args = load_args(chemprop_paths[0])
        if self.train_args.features_path is not None:
            self.train_args.features_path = None
            self.train_args.features_generator = ['rdkit_2d_normalized'
                                                  ]  # just assume this
        self.num_tasks = self.train_args.num_tasks
        self.batch_size = args.batch_size * 4
        self.features_generator = get_features_generator(
            args.features_generator[0]
        ) if args.features_generator is not None else None
        self.neg_threshold = args.neg_threshold
        self.prop_index = args.prop_index

        self.chemprop_models = []
        for checkpoint_path in chemprop_paths:
            self.chemprop_models.append(
                load_checkpoint(checkpoint_path, cuda=True))
Example #11
    def __init__(self, args: InterpretArgs) -> None:
        """
        :param args: A :class:`~chemprop.args.InterpretArgs` object containing arguments for interpretation.
        """
        self.args = args
        self.train_args = load_args(args.checkpoint_paths[0])

        # If features were used during training, they must be used when predicting
        if ((self.train_args.features_path is not None
             or self.train_args.features_generator is not None)
                and args.features_generator is None):
            raise ValueError(
                'Features were used during training so they must be specified again during prediction '
                'using the same type of features as before (with --features_generator <generator> '
                'and using --no_features_scaling if applicable).')

        if self.train_args.atom_descriptors_size > 0 or self.train_args.atom_features_size > 0 or self.train_args.bond_features_size > 0:
            raise NotImplementedError(
                'The interpret function does not yet work with additional atom or bond features'
            )

        self.scaler, self.features_scaler, self.atom_descriptor_scaler, self.bond_feature_scaler = load_scalers(
            args.checkpoint_paths[0])
        self.checkpoints = [
            load_checkpoint(checkpoint_path, device=args.device)
            for checkpoint_path in args.checkpoint_paths
        ]
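The 4-value unpacking above matches newer chemprop releases; older ones return only (scaler, features_scaler). A version-tolerant sketch, assuming only that load_scalers returns a tuple:

loaded = load_scalers(args.checkpoint_paths[0])
scaler, features_scaler = loaded[0], loaded[1]
atom_descriptor_scaler = loaded[2] if len(loaded) > 2 else None
bond_feature_scaler = loaded[3] if len(loaded) > 3 else None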
Example #12
def make_predictions(args: Namespace,
                     smiles: List[str] = None) -> List[Optional[List[float]]]:
    """
    Makes predictions. If smiles is provided, makes predictions on smiles. Otherwise makes predictions on args.test_data.

    :param args: Arguments.
    :param smiles: Smiles to make predictions on.
    :return: A list of lists of target predictions.
    """
    if args.gpu is not None:
        torch.cuda.set_device(args.gpu)

    print('Loading training args')
    scaler, features_scaler = load_scalers(args.checkpoint_paths[0])
    train_args = load_args(args.checkpoint_paths[0])

    # Update args with training arguments
    for key, value in vars(train_args).items():
        if not hasattr(args, key):
            setattr(args, key, value)

    print('Loading data')
    if smiles is not None:
        test_data = get_data_from_smiles(smiles=smiles,
                                         skip_invalid_smiles=False,
                                         args=args)
    else:
        test_data = get_data(path=args.test_path,
                             args=args,
                             use_compound_names=args.use_compound_names,
                             skip_invalid_smiles=False)

    print('Validating SMILES')
    valid_indices = [
        i for i in range(len(test_data)) if test_data[i].mol is not None
    ]
    full_data = test_data
    test_data = MoleculeDataset([test_data[i] for i in valid_indices])

    # Edge case if empty list of smiles is provided
    if len(test_data) == 0:
        return [None] * len(full_data)

    if args.use_compound_names:
        compound_names = test_data.compound_names()
    print(f'Test size = {len(test_data):,}')

    # Normalize features
    if train_args.features_scaling:
        test_data.normalize_features(features_scaler)

    # Predict with each model individually and sum predictions
    if args.dataset_type == 'multiclass':
        sum_preds = np.zeros(
            (len(test_data), args.num_tasks, args.multiclass_num_classes))
    else:
        sum_preds = np.zeros((len(test_data), args.num_tasks))
    print(
        f'Predicting with an ensemble of {len(args.checkpoint_paths)} models')
    #for checkpoint_path in tqdm(args.checkpoint_paths, total=len(args.checkpoint_paths)):
    # Load model
    model = load_checkpoint(args.checkpoint_path, cuda=args.cuda)
    test_preds, test_smiles_batch = predict(model=model,
                                            data=test_data,
                                            batch_size=args.batch_size,
                                            scaler=scaler)

    return test_preds, test_smiles_batch
Example #13
def visualize_encoding_property_space(args: Namespace):
    # Load data
    data = get_data(args.data_path)

    # Sort according to similarity measure
    if args.similarity_measure == 'property':
        data.sort(key=lambda d: d.targets[args.task_index])
    elif args.similarity_measure == 'random':
        data.shuffle(args.seed)
    else:
        raise ValueError(
            'similarity_measure "{}" not supported or not implemented yet.'.
            format(args.similarity_measure))

    # Load model and scalers
    model = load_checkpoint(args.checkpoint_path)
    scaler, features_scaler = load_scalers(args.checkpoint_path)
    data.normalize_features(features_scaler)

    # Random seed
    if args.seed is not None:
        random.seed(args.seed)

    # Generate visualizations
    for i in trange(args.num_examples):
        # Get random three molecules with similar properties
        index = random.randint(1, len(data) - 2)
        molecules = MoleculeDataset(data[index - 1:index + 2])
        molecule_targets = [t[args.task_index] for t in molecules.targets()]

        # Encode three molecules
        molecule_encodings = model.encoder(molecules.smiles())

        # Define interpolation
        def predict_property(point: List[int]) -> float:
            # Return true value on endpoints of triangle
            argmax = np.argmax(point)
            if point[argmax] == 1:
                return molecule_targets[argmax]

            # Interpolate and predict task value
            encoding = sum(point[j] * molecule_encodings[j]
                           for j in range(len(molecule_encodings)))
            pred = model.ffn(encoding).data.cpu().numpy()
            pred = scaler.inverse_transform(pred)
            pred = pred.item()

            return pred

        # Create visualization
        scale = 20
        fontsize = 6

        figure, tax = ternary.figure(scale=scale)
        tax.heatmapf(predict_property, boundary=True, style="hexagonal")
        tax.set_title("Property Prediction")
        tax.right_axis_label('{} ({:.6f}) -->'.format(
            molecules[0].smiles, molecules[0].targets[args.task_index]),
                             fontsize=fontsize)
        tax.left_axis_label('{} ({:.6f}) -->'.format(
            molecules[1].smiles, molecules[1].targets[args.task_index]),
                            fontsize=fontsize)
        tax.bottom_axis_label('<-- {} ({:.6f})'.format(
            molecules[2].smiles, molecules[2].targets[args.task_index]),
                              fontsize=fontsize)

        tax.savefig(os.path.join(args.save_dir, '{}.png'.format(i)))
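A small worked example of the barycentric mixing inside predict_property above: a ternary point weights the three molecule encodings and the weighted sum is what gets passed to the feed-forward head (illustrative 2-dimensional encodings).

import numpy as np

encodings = [np.array([1.0, 0.0]), np.array([0.0, 1.0]), np.array([1.0, 1.0])]
point = [0.5, 0.25, 0.25]                        # barycentric coordinates, sum to 1
mixed = sum(w * e for w, e in zip(point, encodings))
# mixed == array([0.75, 0.5]); model.ffn(mixed) would then be inverse-transformed by the scaler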
Example #14
def make_predictions(args: Namespace,
                     smiles: List[str] = None,
                     invalid_smiles_warning: str = None) -> List[List[float]]:
    """Makes predictions."""
    if args.gpu is not None:
        torch.cuda.set_device(args.gpu)

    if invalid_smiles_warning is not None:
        success_indices = []
        for i, s in enumerate(smiles):
            mol = Chem.MolFromSmiles(s)
            if mol is not None:
                success_indices.append(i)
        full_smiles = smiles
        smiles = [smiles[i] for i in success_indices]

    print('Loading training args')
    scaler, features_scaler = load_scalers(args.checkpoint_paths[0])
    train_args = load_args(args.checkpoint_paths[0])

    # Update args with training arguments
    for key, value in vars(train_args).items():
        if not hasattr(args, key):
            setattr(args, key, value)

    print('Loading data')
    if smiles is not None:
        test_data = get_data_from_smiles(smiles)
    else:
        test_data = get_data(args.test_path,
                             args,
                             use_compound_names=args.compound_names)
    test_smiles = test_data.smiles()
    if args.compound_names:
        compound_names = test_data.compound_names()
    print('Test size = {:,}'.format(len(test_data)))

    # Normalize features
    if train_args.features_scaling:
        test_data.normalize_features(features_scaler)

    # Predict with each model individually and sum predictions
    sum_preds = np.zeros((len(test_data), args.num_tasks))
    print('Predicting with an ensemble of {} models'.format(
        len(args.checkpoint_paths)))
    for checkpoint_path in tqdm(args.checkpoint_paths,
                                total=len(args.checkpoint_paths)):
        # Load model
        model = load_checkpoint(checkpoint_path, cuda=args.cuda)
        model_preds = predict(model=model,
                              data=test_data,
                              args=args,
                              scaler=scaler)
        sum_preds += np.array(model_preds)

    # Ensemble predictions
    avg_preds = sum_preds / args.ensemble_size
    avg_preds = avg_preds.tolist()

    # Save predictions
    assert len(test_data) == len(avg_preds)
    print('Saving predictions to {}'.format(args.preds_path))

    with open(args.preds_path, 'w') as f:
        if args.write_smiles:
            f.write('smiles,')
        if args.compound_names:
            f.write('compound_name,')
        f.write(','.join(args.task_names) + '\n')

        for i in range(len(avg_preds)):
            if args.write_smiles:
                f.write(test_smiles[i] + ',')
            if args.compound_names:
                f.write(compound_names[i] + ',')
            f.write(','.join(str(p) for p in avg_preds[i]) + '\n')

    if invalid_smiles_warning is not None:
        full_preds = [[invalid_smiles_warning]
                      for _ in range(len(full_smiles))]
        for i, si in enumerate(success_indices):
            full_preds[si] = avg_preds[i]
        return full_preds

    return avg_preds
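A minimal sketch of the ensemble-averaging step used above, with made-up predictions for two molecules and one task; note this example divides by args.ensemble_size, which is assumed to equal the number of checkpoints.

import numpy as np

per_model_preds = [np.array([[0.2], [0.8]]),    # hypothetical predictions, model 1
                   np.array([[0.4], [0.6]])]    # hypothetical predictions, model 2
avg_preds = np.mean(per_model_preds, axis=0)    # same as summing and dividing by ensemble size
# avg_preds == array([[0.3], [0.7]])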
Example #15
def make_predictions(args: PredictArgs,
                     smiles: List[str] = None) -> List[Optional[List[float]]]:
    """
    Makes predictions. If smiles is provided, makes predictions on smiles. Otherwise makes predictions on args.test_data.

    :param args: Arguments.
    :param smiles: Smiles to make predictions on.
    :return: A list of lists of target predictions.
    """
    print('Loading training args')
    scaler, features_scaler = load_scalers(args.checkpoint_paths[0])
    train_args = load_args(args.checkpoint_paths[0])
    num_tasks, task_names = train_args.num_tasks, train_args.task_names

    # If features were used during training, they must be used when predicting
    if ((train_args.features_path is not None
         or train_args.features_generator is not None)
            and args.features_path is None
            and args.features_generator is None):
        raise ValueError(
            'Features were used during training so they must be specified again during prediction '
            'using the same type of features as before (with either --features_generator or '
            '--features_path and using --no_features_scaling if applicable).')

    # Update predict args with training arguments to create a merged args object
    for key, value in vars(train_args).items():
        if not hasattr(args, key):
            setattr(args, key, value)
    args: Union[PredictArgs, TrainArgs]

    print('Loading data')
    if smiles is not None:
        full_data = get_data_from_smiles(
            smiles=smiles,
            skip_invalid_smiles=False,
            features_generator=args.features_generator)
    else:
        full_data = get_data(path=args.test_path,
                             args=args,
                             target_columns=[],
                             skip_invalid_smiles=False)

    print('Validating SMILES')
    full_to_valid_indices = {}
    valid_index = 0
    for full_index in range(len(full_data)):
        if full_data[full_index].mol is not None:
            full_to_valid_indices[full_index] = valid_index
            valid_index += 1

    test_data = MoleculeDataset(
        [full_data[i] for i in sorted(full_to_valid_indices.keys())])

    # Edge case if empty list of smiles is provided
    if len(test_data) == 0:
        return [None] * len(full_data)

    print(f'Test size = {len(test_data):,}')

    # Normalize features
    if args.features_scaling:
        test_data.normalize_features(features_scaler)

    # Initialize uncertainty estimator
    if args.uncertainty:
        uncertainty_estimator = uncertainty_estimator_builder(
            args.uncertainty)(args, test_data, scaler)

    # Predict with each model individually and sum predictions
    if not args.uncertainty:
        if args.dataset_type == 'multiclass':
            sum_preds = np.zeros(
                (len(test_data), num_tasks, args.multiclass_num_classes))
        else:
            sum_preds = np.zeros((len(test_data), num_tasks))

    # Create data loader
    test_data_loader = MoleculeDataLoader(dataset=test_data,
                                          batch_size=args.batch_size,
                                          num_workers=args.num_workers)

    print(
        f'Predicting with an ensemble of {len(args.checkpoint_paths)} models')
    for N, checkpoint_path in tqdm(enumerate(args.checkpoint_paths),
                                   total=len(args.checkpoint_paths)):
        # Load model
        model = load_checkpoint(checkpoint_path, device=args.device)
        model.training = False
        if not args.uncertainty:
            model_preds = predict(model=model,
                                  data_loader=test_data_loader,
                                  scaler=scaler)
            sum_preds += np.array(model_preds)
        else:
            uncertainty_estimator.process_model(model, N)

    # Ensemble predictions
    if not args.uncertainty:
        avg_preds = sum_preds / len(args.checkpoint_paths)
        avg_preds = avg_preds.tolist()
    else:
        avg_preds, avg_UQ = uncertainty_estimator.calculate_UQ()
        if type(avg_UQ) is tuple:
            aleatoric, epistemic = avg_UQ

    # Save predictions
    print(f'Saving predictions to {args.preds_path}')
    assert len(test_data) == len(avg_preds)
    makedirs(args.preds_path, isfile=True)

    # Get prediction column names
    if args.dataset_type == 'multiclass':
        task_names = [
            f'{name}_class_{i}' for name in task_names
            for i in range(args.multiclass_num_classes)
        ]
    else:
        task_names = task_names

    # Copy predictions over to full_data
    for full_index, datapoint in enumerate(full_data):
        valid_index = full_to_valid_indices.get(full_index, None)
        preds = avg_preds[valid_index] if valid_index is not None else [
            'Invalid SMILES'
        ] * len(task_names)

        if args.uncertainty:
            if not args.split_UQ:
                cur_UQ = avg_UQ[valid_index] if valid_index is not None else [
                    'Invalid SMILES'
                ] * len(task_names)
                datapoint.row['Uncertainty'] = cur_UQ
            elif args.split_UQ:
                cur_al = aleatoric[
                    valid_index] if valid_index is not None else [
                        'Invalid SMILES'
                    ] * len(task_names)
                cur_ep = epistemic[
                    valid_index] if valid_index is not None else [
                        'Invalid SMILES'
                    ] * len(task_names)
                datapoint.row['Aleatoric'] = cur_al
                datapoint.row['Epistemic'] = cur_ep

        if type(preds) is list:
            for pred_name, pred in zip(task_names, preds):
                datapoint.row[pred_name] = pred
        else:
            datapoint.row[task_names[0]] = preds

    # Save
    with open(args.preds_path, 'w') as f:
        writer = csv.DictWriter(f, fieldnames=full_data[0].row.keys())
        writer.writeheader()

        for datapoint in full_data:
            writer.writerow(datapoint.row)

    return avg_preds
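A worked example of the full_to_valid_indices bookkeeping above: valid molecules receive their ensemble averages in order, while invalid SMILES keep their row with a placeholder (numbers are illustrative).

full_to_valid_indices = {0: 0, 2: 1}            # row 1 held an invalid SMILES
avg_preds = [[0.12], [0.87]]
rows = [avg_preds[full_to_valid_indices[i]] if i in full_to_valid_indices
        else ['Invalid SMILES'] for i in range(3)]
# rows == [[0.12], ['Invalid SMILES'], [0.87]]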
Example #16
def make_predictions(
        args: PredictArgs,
        smiles: List[List[str]] = None) -> List[List[Optional[float]]]:
    """
    Loads data and a trained model and uses the model to make predictions on the data.

    If SMILES are provided, then makes predictions on smiles.
    Otherwise makes predictions on :code:`args.test_data`.

    :param args: A :class:`~chemprop.args.PredictArgs` object containing arguments for
                 loading data and a model and making predictions.
    :param smiles: List of list of SMILES to make predictions on.
    :return: A list of lists of target predictions.
    """
    print("Loading training args")
    train_args = load_args(args.checkpoint_paths[0])
    num_tasks, task_names = train_args.num_tasks, train_args.task_names

    update_prediction_args(predict_args=args, train_args=train_args)
    args: Union[PredictArgs, TrainArgs]

    if args.atom_descriptors == "feature":
        set_extra_atom_fdim(train_args.atom_features_size)

    if args.bond_features_path is not None:
        set_extra_bond_fdim(train_args.bond_features_size)

    # set explicit H option and reaction option
    set_explicit_h(train_args.explicit_h)
    set_reaction(train_args.reaction, train_args.reaction_mode)

    print("Loading data")
    if smiles is not None:
        full_data = get_data_from_smiles(
            smiles=smiles,
            skip_invalid_smiles=False,
            features_generator=args.features_generator,
        )
    else:
        full_data = get_data(
            path=args.test_path,
            smiles_columns=args.smiles_columns,
            target_columns=[],
            ignore_columns=[],
            skip_invalid_smiles=False,
            args=args,
            store_row=not args.drop_extra_columns,
        )

    print("Validating SMILES")
    full_to_valid_indices = {}
    valid_index = 0
    for full_index in range(len(full_data)):
        if all(mol is not None for mol in full_data[full_index].mol):
            full_to_valid_indices[full_index] = valid_index
            valid_index += 1

    test_data = MoleculeDataset(
        [full_data[i] for i in sorted(full_to_valid_indices.keys())])

    # Edge case if empty list of smiles is provided
    if len(test_data) == 0:
        return [None] * len(full_data)

    print(f"Test size = {len(test_data):,}")

    # Predict with each model individually and sum predictions
    if args.dataset_type == "multiclass":
        sum_preds = np.zeros(
            (len(test_data), num_tasks, args.multiclass_num_classes))
    else:
        sum_preds = np.zeros((len(test_data), num_tasks))

    # Create data loader
    test_data_loader = MoleculeDataLoader(
        dataset=test_data,
        batch_size=args.batch_size,
        num_workers=0 if sys.platform == "darwin" else args.num_workers,
    )

    # Partial results for variance robust calculation.
    if args.ensemble_variance:
        all_preds = np.zeros(
            (len(test_data), num_tasks, len(args.checkpoint_paths)))

    print(
        f"Predicting with an ensemble of {len(args.checkpoint_paths)} models")
    for index, checkpoint_path in enumerate(
            tqdm(args.checkpoint_paths, total=len(args.checkpoint_paths))):
        # Load model and scalers
        model = load_checkpoint(checkpoint_path, device=args.device)
        (
            scaler,
            features_scaler,
            atom_descriptor_scaler,
            bond_feature_scaler,
        ) = load_scalers(checkpoint_path)

        # Normalize features
        if (args.features_scaling or train_args.atom_descriptor_scaling
                or train_args.bond_feature_scaling):
            test_data.reset_features_and_targets()
            if args.features_scaling:
                test_data.normalize_features(features_scaler)
            if (train_args.atom_descriptor_scaling
                    and args.atom_descriptors is not None):
                test_data.normalize_features(atom_descriptor_scaler,
                                             scale_atom_descriptors=True)
            if train_args.bond_feature_scaling and args.bond_features_size > 0:
                test_data.normalize_features(bond_feature_scaler,
                                             scale_bond_features=True)

        # Make predictions
        model_preds = predict(model=model,
                              data_loader=test_data_loader,
                              scaler=scaler)
        sum_preds += np.array(model_preds)
        if args.ensemble_variance:
            all_preds[:, :, index] = model_preds

    # Ensemble predictions
    avg_preds = sum_preds / len(args.checkpoint_paths)
    avg_preds = avg_preds.tolist()

    if args.ensemble_variance:
        all_epi_uncs = np.var(all_preds, axis=2)
        all_epi_uncs = all_epi_uncs.tolist()

    # Save predictions
    print(f"Saving predictions to {args.preds_path}")
    assert len(test_data) == len(avg_preds)
    if args.ensemble_variance:
        assert len(test_data) == len(all_epi_uncs)
    makedirs(args.preds_path, isfile=True)

    # Get prediction column names
    if args.dataset_type == "multiclass":
        task_names = [
            f"{name}_class_{i}" for name in task_names
            for i in range(args.multiclass_num_classes)
        ]
    else:
        task_names = task_names

    # Copy predictions over to full_data
    for full_index, datapoint in enumerate(full_data):
        valid_index = full_to_valid_indices.get(full_index, None)
        preds = (avg_preds[valid_index] if valid_index is not None else
                 ["Invalid SMILES"] * len(task_names))
        if args.ensemble_variance:
            epi_uncs = (all_epi_uncs[valid_index] if valid_index is not None
                        else ["Invalid SMILES"] * len(task_names))

        # If extra columns have been dropped, add back in SMILES columns
        if args.drop_extra_columns:
            datapoint.row = OrderedDict()

            smiles_columns = args.smiles_columns

            for column, smiles in zip(smiles_columns, datapoint.smiles):
                datapoint.row[column] = smiles

        # Add predictions columns
        if args.ensemble_variance:
            for pred_name, pred, epi_unc in zip(task_names, preds, epi_uncs):
                datapoint.row[pred_name] = pred
                datapoint.row[pred_name + "_epi_unc"] = epi_unc
        else:
            for pred_name, pred in zip(task_names, preds):
                datapoint.row[pred_name] = pred

    # Save
    with open(args.preds_path, "w") as f:
        writer = csv.DictWriter(f, fieldnames=full_data[0].row.keys())
        writer.writeheader()

        for datapoint in full_data:
            writer.writerow(datapoint.row)

    return avg_preds
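A short sketch of the ensemble_variance path above: stack per-model predictions along a third axis and take the variance across models as the epistemic uncertainty (illustrative numbers).

import numpy as np

all_preds = np.stack([np.array([[0.2], [0.8]]),     # model 1, 2 molecules x 1 task
                      np.array([[0.4], [0.6]])],    # model 2
                     axis=2)                        # shape (2, 1, 2)
epi_uncs = np.var(all_preds, axis=2)                # shape (2, 1)
# epi_uncs == array([[0.01], [0.01]])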
Example #17
def molecule_fingerprint(
        args: PredictArgs,
        smiles: List[List[str]] = None) -> List[List[Optional[float]]]:
    """
    Loads data and a trained model and uses the model to encode fingerprint vectors for the data.

    :param args: A :class:`~chemprop.args.PredictArgs` object containing arguments for
                 loading data and a model and making predictions.
    :param smiles: List of list of SMILES to make predictions on.
    :return: A list of fingerprint vectors (list of floats)
    """

    print('Loading training args')
    train_args = load_args(args.checkpoint_paths[0])

    # Update args with training arguments
    update_prediction_args(predict_args=args,
                           train_args=train_args,
                           validate_feature_sources=False)
    args: Union[PredictArgs, TrainArgs]

    #set explicit H option and reaction option
    set_explicit_h(train_args.explicit_h)
    set_reaction(train_args.reaction, train_args.reaction_mode)

    print('Loading data')
    if smiles is not None:
        full_data = get_data_from_smiles(
            smiles=smiles,
            skip_invalid_smiles=False,
            features_generator=args.features_generator)
    else:
        full_data = get_data(path=args.test_path,
                             smiles_columns=args.smiles_columns,
                             target_columns=[],
                             ignore_columns=[],
                             skip_invalid_smiles=False,
                             args=args,
                             store_row=True)

    print('Validating SMILES')
    full_to_valid_indices = {}
    valid_index = 0
    for full_index in range(len(full_data)):
        if all(mol is not None for mol in full_data[full_index].mol):
            full_to_valid_indices[full_index] = valid_index
            valid_index += 1

    test_data = MoleculeDataset(
        [full_data[i] for i in sorted(full_to_valid_indices.keys())])

    # Edge case if empty list of smiles is provided
    if len(test_data) == 0:
        return [None] * len(full_data)

    print(f'Test size = {len(test_data):,}')

    # Create data loader
    test_data_loader = MoleculeDataLoader(dataset=test_data,
                                          batch_size=args.batch_size,
                                          num_workers=args.num_workers)

    # Load model
    print(f'Encoding smiles into a fingerprint vector from a single model')
    if len(args.checkpoint_paths) != 1:
        raise ValueError(
            "Fingerprint generation only supports one model, cannot use an ensemble"
        )

    model = load_checkpoint(args.checkpoint_paths[0], device=args.device)
    scaler, features_scaler, atom_descriptor_scaler, bond_feature_scaler = load_scalers(
        args.checkpoint_paths[0])

    # Normalize features
    if args.features_scaling or train_args.atom_descriptor_scaling or train_args.bond_feature_scaling:
        test_data.reset_features_and_targets()
        if args.features_scaling:
            test_data.normalize_features(features_scaler)
        if train_args.atom_descriptor_scaling and args.atom_descriptors is not None:
            test_data.normalize_features(atom_descriptor_scaler,
                                         scale_atom_descriptors=True)
        if train_args.bond_feature_scaling and args.bond_features_size > 0:
            test_data.normalize_features(bond_feature_scaler,
                                         scale_bond_features=True)

    # Make fingerprints
    model_preds = model_fingerprint(model=model, data_loader=test_data_loader)

    # Save predictions
    print(f'Saving predictions to {args.preds_path}')
    assert len(test_data) == len(model_preds)
    makedirs(args.preds_path, isfile=True)

    # Copy predictions over to full_data
    total_hidden_size = args.hidden_size * args.number_of_molecules
    for full_index, datapoint in enumerate(full_data):
        valid_index = full_to_valid_indices.get(full_index, None)
        preds = model_preds[valid_index] if valid_index is not None else [
            'Invalid SMILES'
        ] * total_hidden_size

        fingerprint_columns = [f'fp_{i}' for i in range(total_hidden_size)]
        for i in range(len(fingerprint_columns)):
            datapoint.row[fingerprint_columns[i]] = preds[i]

    # Write predictions
    with open(args.preds_path, 'w') as f:
        writer = csv.DictWriter(f,
                                fieldnames=args.smiles_columns +
                                fingerprint_columns,
                                extrasaction='ignore')
        writer.writeheader()
        for datapoint in full_data:
            writer.writerow(datapoint.row)

    return model_preds
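A hypothetical driver for the fingerprint function above, using the stock chemprop prediction flags; the paths are assumptions, and each returned row has hidden_size * number_of_molecules entries.

from chemprop.args import PredictArgs

args = PredictArgs().parse_args([
    '--test_path', 'data/test.csv',          # hypothetical input
    '--preds_path', 'fingerprints.csv',      # hypothetical output
    '--checkpoint_path', 'ckpts/model.pt',   # single checkpoint: ensembles are rejected above
])
fingerprints = molecule_fingerprint(args)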
Example #18
def make_predictions(args: Namespace,
                     smiles: List[str] = None) -> List[Optional[List[float]]]:
    """
    Makes predictions. If smiles is provided, makes predictions on smiles. Otherwise makes predictions on args.test_data.

    :param args: Arguments.
    :param smiles: Smiles to make predictions on.
    :return: A list of lists of target predictions.
    """
    if args.gpu is not None:
        torch.cuda.set_device(args.gpu)

    print('Loading training args')
    scaler, features_scaler = load_scalers(args.checkpoint_paths[0])
    train_args = load_args(args.checkpoint_paths[0])

    # Update args with training arguments
    for key, value in vars(train_args).items():
        if not hasattr(args, key):
            setattr(args, key, value)

    print('Loading data')
    if smiles is not None:
        test_data = get_data_from_smiles(smiles=smiles,
                                         skip_invalid_smiles=False,
                                         args=args)
    else:
        test_data = get_data(path=args.test_path,
                             args=args,
                             use_compound_names=args.use_compound_names,
                             skip_invalid_smiles=False)

    print('Validating SMILES')
    valid_indices = [
        i for i in range(len(test_data)) if test_data[i].mol is not None
    ]
    full_data = test_data
    test_data = MoleculeDataset([test_data[i] for i in valid_indices])

    # Edge case if empty list of smiles is provided
    if len(test_data) == 0:
        return [None] * len(full_data)

    if args.use_compound_names:
        compound_names = test_data.compound_names()
    print(f'Test size = {len(test_data):,}')

    # Normalize features
    if train_args.features_scaling:
        test_data.normalize_features(features_scaler)

    # Predict with each model individually and sum predictions
    if args.dataset_type == 'multiclass':
        sum_preds = np.zeros(
            (len(test_data), args.num_tasks, args.multiclass_num_classes))
        sum_ale_uncs = np.zeros(
            (len(test_data), args.num_tasks, args.multiclass_num_classes))
        sum_epi_uncs = np.zeros(
            (len(test_data), args.num_tasks, args.multiclass_num_classes))
    else:
        sum_preds = np.zeros((len(test_data), args.num_tasks))
        sum_ale_uncs = np.zeros((len(test_data), args.num_tasks))
        sum_epi_uncs = np.zeros((len(test_data), args.num_tasks))

    # Partial results for variance robust calculation.
    all_preds = np.zeros(
        (len(test_data), args.num_tasks, len(args.checkpoint_paths)))

    print(
        f'Predicting with an ensemble of {len(args.checkpoint_paths)} models')
    for index, checkpoint_path in enumerate(
            tqdm(args.checkpoint_paths, total=len(args.checkpoint_paths))):
        # Load model
        model = load_checkpoint(checkpoint_path, cuda=args.cuda)
        model_preds, ale_uncs, epi_uncs = predict(
            model=model,
            data=test_data,
            batch_size=args.batch_size,
            scaler=scaler,
            sampling_size=args.sampling_size)
        sum_preds += np.array(model_preds)
        if ale_uncs is not None:
            sum_ale_uncs += np.array(ale_uncs)
        if epi_uncs is not None:
            sum_epi_uncs += np.array(epi_uncs)
        if args.estimate_variance:
            all_preds[:, :, index] = model_preds
        print('\nmodel_preds\n', model_preds)
        print('ale_uncs\n', ale_uncs)

    # Ensemble predictions

    if args.estimate_variance:
        # Use ensemble variance to estimate uncertainty. This overwrites existing uncertainty estimates.
        # preds <- mean(preds), ale_uncs <- mean(ale_uncs), epi_uncs <- var(preds)
        avg_preds = sum_preds / len(args.checkpoint_paths)
        avg_preds = avg_preds.tolist()

        avg_ale_uncs = sum_ale_uncs / len(args.checkpoint_paths)
        avg_ale_uncs = avg_ale_uncs.tolist()

        avg_epi_uncs = np.var(all_preds, axis=2)
        avg_epi_uncs = avg_epi_uncs.tolist()

    else:
        # Use another method to estimate uncertainty.
        # preds <- mean(preds), ale_uncs <- mean(ale_uncs), epi_uncs <- mean(epi_uncs)
        avg_preds = sum_preds / len(args.checkpoint_paths)
        avg_preds = avg_preds.tolist()

        avg_ale_uncs = sum_ale_uncs / len(args.checkpoint_paths)
        avg_ale_uncs = avg_ale_uncs.tolist()

        avg_epi_uncs = sum_epi_uncs / len(args.checkpoint_paths)
        avg_epi_uncs = avg_epi_uncs.tolist()

    # Save predictions
    assert len(test_data) == len(avg_preds)
    assert len(test_data) == len(avg_ale_uncs)
    assert len(test_data) == len(avg_epi_uncs)

    print(f'Saving predictions to {args.preds_path}')

    # Put Nones for invalid smiles
    full_preds = [None] * len(full_data)
    full_ale_uncs = [None] * len(full_data)
    full_epi_uncs = [None] * len(full_data)

    for i, si in enumerate(valid_indices):
        full_preds[si] = avg_preds[i]
        full_ale_uncs[si] = avg_ale_uncs[i]
        full_epi_uncs[si] = avg_epi_uncs[i]

    avg_preds = full_preds
    avg_ale_uncs = full_ale_uncs
    avg_epi_uncs = full_epi_uncs

    test_smiles = full_data.smiles()

    # Write predictions
    with open(args.preds_path, 'w') as f:
        writer = csv.writer(f)

        header = []

        if args.use_compound_names:
            header.append('compound_names')

        header.append('smiles')

        if args.dataset_type == 'multiclass':
            for name in args.task_names:
                for i in range(args.multiclass_num_classes):
                    header.append(name + '_class' + str(i))
        else:
            header.extend(args.task_names)

            header.extend([tn + "_ale_unc" for tn in args.task_names])

            header.extend([tn + "_epi_unc" for tn in args.task_names])

        writer.writerow(header)

        for i in range(len(avg_preds)):
            row = []

            if args.use_compound_names:
                row.append(compound_names[i])

            row.append(test_smiles[i])

            if avg_preds[i] is not None:
                if args.dataset_type == 'multiclass':
                    for task_probs in avg_preds[i]:
                        row.extend(task_probs)
                else:
                    row.extend(avg_preds[i])
                    row.extend(avg_ale_uncs[i])
                    row.extend(avg_epi_uncs[i])
            else:
                if args.dataset_type == 'multiclass':
                    row.extend([''] * args.num_tasks *
                               args.multiclass_num_classes)
                else:
                    # Both the prediction, the aleatoric uncertainty and the epistemic uncertainty are None
                    row.extend([''] * 3 * args.num_tasks)

            writer.writerow(row)

    return avg_preds
Example #19
def make_predictions(args: PredictArgs,
                     smiles: List[str] = None) -> List[Optional[List[float]]]:
    """
    Makes predictions. If smiles is provided, makes predictions on smiles. Otherwise makes predictions on args.test_data.

    :param args: Arguments.
    :param smiles: Smiles to make predictions on.
    :return: A list of lists of target predictions.
    """
    print('Loading training args')
    scaler, features_scaler = load_scalers(args.checkpoint_paths[0])
    train_args = load_args(args.checkpoint_paths[0])
    num_tasks, task_names = train_args.num_tasks, train_args.task_names

    # If features were used during training, they must be used when predicting
    if ((train_args.features_path is not None
         or train_args.features_generator is not None)
            and args.features_path is None
            and args.features_generator is None):
        raise ValueError(
            'Features were used during training so they must be specified again during prediction '
            'using the same type of features as before (with either --features_generator or '
            '--features_path and using --no_features_scaling if applicable).')

    # Update predict args with training arguments to create a merged args object
    for key, value in vars(train_args).items():
        if not hasattr(args, key):
            setattr(args, key, value)
    args: Union[PredictArgs, TrainArgs]

    if args.parcel_size and args.max_data_size:
        num_iterations = math.ceil(args.max_data_size / args.parcel_size)
        max_data_size = args.parcel_size
        print('Using parcels: ' + str(num_iterations))
    else:
        num_iterations = 1
        max_data_size = args.max_data_size
        print('Not using parcels.')

    if args.parcel_offset:
        offset = args.parcel_offset * args.parcel_size
    else:
        offset = 0

    for iteration in range(num_iterations):
        print('Loading data')
        if smiles is not None:
            full_data = get_data_from_smiles(
                smiles=smiles,
                skip_invalid_smiles=False,
                features_generator=args.features_generator)
        else:
            print("Getting without SMILES")
            full_data = get_data(path=args.test_path,
                                 args=args,
                                 target_columns=[],
                                 max_data_size=max_data_size,
                                 data_offset=offset,
                                 skip_invalid_smiles=False)

        print('Validating SMILES')
        full_to_valid_indices = {}
        valid_index = 0
        for full_index in range(len(full_data)):
            if full_data[full_index].mol is not None:
                full_to_valid_indices[full_index] = valid_index
                valid_index += 1

        test_data = MoleculeDataset(
            [full_data[i] for i in sorted(full_to_valid_indices.keys())])

        # Edge case if empty list of smiles is provided
        if len(test_data) == 0:
            return [None] * len(full_data)

        print(f'Test size = {len(test_data):,}')

        # Normalize features
        if args.features_scaling:
            test_data.normalize_features(features_scaler)

        # Predict with each model individually and sum predictions
        if args.dataset_type == 'multiclass':
            sum_preds = np.zeros(
                (len(test_data), num_tasks, args.multiclass_num_classes))
        else:
            sum_preds = np.zeros((len(test_data), num_tasks))

        # Create data loader
        test_data_loader = MoleculeDataLoader(dataset=test_data,
                                              batch_size=args.batch_size,
                                              num_workers=args.num_workers)

        print(
            f'Predicting with an ensemble of {len(args.checkpoint_paths)} models'
        )
        for checkpoint_path in tqdm(args.checkpoint_paths,
                                    total=len(args.checkpoint_paths)):
            # Load model
            model = load_checkpoint(checkpoint_path, device=args.device)
            model_preds = predict(model=model,
                                  data_loader=test_data_loader,
                                  scaler=scaler)
            sum_preds += np.array(model_preds)

        # Ensemble predictions
        avg_preds = sum_preds / len(args.checkpoint_paths)
        avg_preds = avg_preds.tolist()

        # Save predictions
        if iteration != 0:
            name, ext = os.path.splitext(args.preds_path)
            preds_path = "{name}.{it}{ext}".format(name=name, it=iteration, ext=ext)
        else:
            preds_path = args.preds_path

        print(f'Saving predictions to {preds_path}')
        assert len(test_data) == len(avg_preds)
        makedirs(preds_path, isfile=True)

        # Get prediction column names
        if args.dataset_type == 'multiclass':
            task_names = [
                f'{name}_class_{i}' for name in task_names
                for i in range(args.multiclass_num_classes)
            ]
        else:
            task_names = task_names

        # Copy predictions over to full_data
        for full_index, datapoint in enumerate(full_data):
            valid_index = full_to_valid_indices.get(full_index, None)
            preds = avg_preds[valid_index] if valid_index is not None else [
                'Invalid SMILES'
            ] * len(task_names)

            for pred_name, pred in zip(task_names, preds):
                datapoint.row[pred_name] = pred

        with open(preds_path, 'w') as f:
            writer = csv.DictWriter(f, fieldnames=full_data[0].row.keys())
            writer.writeheader()

            for datapoint in full_data:
                writer.writerow(datapoint.row)

        offset = offset + args.parcel_size

    return avg_preds
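A worked example of the parcel arithmetic above, assuming max_data_size=10000, parcel_size=3000 and no initial parcel_offset; each iteration reads at most parcel_size rows starting at the running offset.

import math

max_data_size, parcel_size, parcel_offset = 10_000, 3_000, 0
num_iterations = math.ceil(max_data_size / parcel_size)        # 4 parcels
offsets = [parcel_offset * parcel_size + i * parcel_size
           for i in range(num_iterations)]
# offsets == [0, 3000, 6000, 9000]; the last parcel only has 1000 rows left to read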
Example #20
def make_predictions(
        args: PredictArgs,
        smiles: List[List[str]] = None) -> List[List[Optional[float]]]:
    """
    Loads data and a trained model and uses the model to make predictions on the data.

    If SMILES are provided, then makes predictions on smiles.
    Otherwise makes predictions on :code:`args.test_data`.

    :param args: A :class:`~chemprop.args.PredictArgs` object containing arguments for
                 loading data and a model and making predictions.
    :param smiles: List of list of SMILES to make predictions on.
    :return: A list of lists of target predictions.
    """
    print('Loading training args')
    train_args = load_args(args.checkpoint_paths[0])
    num_tasks, task_names = train_args.num_tasks, train_args.task_names

    # If features were used during training, they must be used when predicting
    if ((train_args.features_path is not None
         or train_args.features_generator is not None)
            and args.features_path is None
            and args.features_generator is None):
        raise ValueError(
            'Features were used during training so they must be specified again during prediction '
            'using the same type of features as before (with either --features_generator or '
            '--features_path and using --no_features_scaling if applicable).')

    # Update predict args with training arguments to create a merged args object
    for key, value in vars(train_args).items():
        if not hasattr(args, key):
            setattr(args, key, value)
    args: Union[PredictArgs, TrainArgs]

    print('Loading data')
    if smiles is not None:
        full_data = get_data_from_smiles(
            smiles=smiles,
            skip_invalid_smiles=False,
            features_generator=args.features_generator)
    else:
        full_data = get_data(path=args.test_path,
                             target_columns=[],
                             ignore_columns=[],
                             skip_invalid_smiles=False,
                             args=args,
                             store_row=True)

    print('Validating SMILES')
    full_to_valid_indices = {}
    valid_index = 0
    for full_index in range(len(full_data)):
        if all(mol is not None for mol in full_data[full_index].mol):
            full_to_valid_indices[full_index] = valid_index
            valid_index += 1

    test_data = MoleculeDataset(
        [full_data[i] for i in sorted(full_to_valid_indices.keys())])

    # Edge case if empty list of smiles is provided
    if len(test_data) == 0:
        return [None] * len(full_data)

    print(f'Test size = {len(test_data):,}')

    # Predict with each model individually and sum predictions
    if args.dataset_type == 'multiclass':
        sum_preds = np.zeros(
            (len(test_data), num_tasks, args.multiclass_num_classes))
    else:
        sum_preds = np.zeros((len(test_data), num_tasks))

    # Create data loader
    test_data_loader = MoleculeDataLoader(dataset=test_data,
                                          batch_size=args.batch_size,
                                          num_workers=args.num_workers)

    print(
        f'Predicting with an ensemble of {len(args.checkpoint_paths)} models')
    for checkpoint_path in tqdm(args.checkpoint_paths,
                                total=len(args.checkpoint_paths)):
        # Load model and scalers
        model = load_checkpoint(checkpoint_path, device=args.device)
        scaler, features_scaler = load_scalers(checkpoint_path)

        # Normalize features
        if args.features_scaling:
            test_data.reset_features_and_targets()
            test_data.normalize_features(features_scaler)

        # Make predictions
        model_preds = predict(model=model,
                              data_loader=test_data_loader,
                              scaler=scaler)
        sum_preds += np.array(model_preds)

    # Ensemble predictions
    avg_preds = sum_preds / len(args.checkpoint_paths)
    avg_preds = avg_preds.tolist()

    # Save predictions
    print(f'Saving predictions to {args.preds_path}')
    assert len(test_data) == len(avg_preds)
    makedirs(args.preds_path, isfile=True)

    # Get prediction column names
    if args.dataset_type == 'multiclass':
        task_names = [
            f'{name}_class_{i}' for name in task_names
            for i in range(args.multiclass_num_classes)
        ]
    else:
        task_names = task_names

    # Copy predictions over to full_data
    for full_index, datapoint in enumerate(full_data):
        valid_index = full_to_valid_indices.get(full_index, None)
        preds = avg_preds[valid_index] if valid_index is not None else [
            'Invalid SMILES'
        ] * len(task_names)

        for pred_name, pred in zip(task_names, preds):
            datapoint.row[pred_name] = pred

    # Save
    with open(args.preds_path, 'w') as f:
        writer = csv.DictWriter(f, fieldnames=full_data[0].row.keys())
        writer.writeheader()

        for datapoint in full_data:
            writer.writerow(datapoint.row)

    return avg_preds
Example #21
def molecule_fingerprint(
        args: FingerprintArgs,
        smiles: List[List[str]] = None) -> List[List[Optional[float]]]:
    """
    Loads data and a trained model and uses the model to encode fingerprint vectors for the data.

    :param args: A :class:`~chemprop.args.PredictArgs` object containing arguments for
                 loading data and a model and making predictions.
    :param smiles: List of list of SMILES to make predictions on.
    :return: A list of fingerprint vectors (list of floats)
    """

    print('Loading training args')
    train_args = load_args(args.checkpoint_paths[0])

    # Update args with training arguments
    # Input features only need to be supplied when using the FFN latent representation and the model calls for them.
    if args.fingerprint_type == 'MPN':
        validate_feature_sources = False
    else:
        validate_feature_sources = True
    update_prediction_args(predict_args=args,
                           train_args=train_args,
                           validate_feature_sources=validate_feature_sources)
    args: Union[FingerprintArgs, TrainArgs]

    # Set explicit H option and reaction option
    reset_featurization_parameters()
    if args.atom_descriptors == 'feature':
        set_extra_atom_fdim(train_args.atom_features_size)

    if args.bond_features_path is not None:
        set_extra_bond_fdim(train_args.bond_features_size)

    set_explicit_h(train_args.explicit_h)
    set_adding_hs(args.adding_h)
    if train_args.reaction:
        set_reaction(train_args.reaction, train_args.reaction_mode)
    elif train_args.reaction_solvent:
        set_reaction(True, train_args.reaction_mode)

    print('Loading data')
    if smiles is not None:
        full_data = get_data_from_smiles(
            smiles=smiles,
            skip_invalid_smiles=False,
            features_generator=args.features_generator)
    else:
        full_data = get_data(path=args.test_path,
                             smiles_columns=args.smiles_columns,
                             target_columns=[],
                             ignore_columns=[],
                             skip_invalid_smiles=False,
                             args=args,
                             store_row=True)

    print('Validating SMILES')
    full_to_valid_indices = {}
    valid_index = 0
    for full_index in range(len(full_data)):
        if all(mol is not None for mol in full_data[full_index].mol):
            full_to_valid_indices[full_index] = valid_index
            valid_index += 1

    test_data = MoleculeDataset(
        [full_data[i] for i in sorted(full_to_valid_indices.keys())])

    # Edge case if empty list of smiles is provided
    if len(test_data) == 0:
        return [None] * len(full_data)

    print(f'Test size = {len(test_data):,}')

    # Create data loader
    test_data_loader = MoleculeDataLoader(dataset=test_data,
                                          batch_size=args.batch_size,
                                          num_workers=args.num_workers)

    # Set fingerprint size
    if args.fingerprint_type == 'MPN':
        if args.atom_descriptors == "descriptor":  # special case when we have 'descriptor' extra dimensions need to be added
            total_fp_size = (
                args.hidden_size +
                test_data.atom_descriptors_size()) * args.number_of_molecules
        else:
            if args.reaction_solvent:
                total_fp_size = args.hidden_size + args.hidden_size_solvent
            else:
                total_fp_size = args.hidden_size * args.number_of_molecules
        if args.features_only:
            raise ValueError(
                'With features_only models, there is no latent MPN representation. Use last_FFN fingerprint type instead.'
            )
    elif args.fingerprint_type == 'last_FFN':
        if args.ffn_num_layers != 1:
            total_fp_size = args.ffn_hidden_size
        else:
            raise ValueError(
                'With a ffn_num_layers of 1, there is no latent FFN representation. Use MPN fingerprint type instead.'
            )
    else:
        raise ValueError(
            f'Fingerprint type {args.fingerprint_type} not supported')
    all_fingerprints = np.zeros(
        (len(test_data), total_fp_size, len(args.checkpoint_paths)))
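    # all_fingerprints has shape (num_valid_molecules, total_fp_size, num_checkpoints);
    # each model fills one slice along the last axis, and the slices are flattened into
    # per-model columns when the CSV is written below.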

    # Load model
    print(
        f'Encoding smiles into a fingerprint vector from {len(args.checkpoint_paths)} models.'
    )

    for index, checkpoint_path in enumerate(
            tqdm(args.checkpoint_paths, total=len(args.checkpoint_paths))):
        model = load_checkpoint(checkpoint_path, device=args.device)
        scaler, features_scaler, atom_descriptor_scaler, bond_feature_scaler = load_scalers(
            args.checkpoint_paths[index])

        # Normalize features
        if args.features_scaling or train_args.atom_descriptor_scaling or train_args.bond_feature_scaling:
            test_data.reset_features_and_targets()
            if args.features_scaling:
                test_data.normalize_features(features_scaler)
            if train_args.atom_descriptor_scaling and args.atom_descriptors is not None:
                test_data.normalize_features(atom_descriptor_scaler,
                                             scale_atom_descriptors=True)
            if train_args.bond_feature_scaling and args.bond_features_size > 0:
                test_data.normalize_features(bond_feature_scaler,
                                             scale_bond_features=True)

        # Make fingerprints
        model_fp = model_fingerprint(model=model,
                                     data_loader=test_data_loader,
                                     fingerprint_type=args.fingerprint_type)
        if args.fingerprint_type == 'MPN' and (
                args.features_path is not None or args.features_generator
        ):  # truncate any features from MPN fingerprint
            model_fp = np.array(model_fp)[:, :total_fp_size]
        all_fingerprints[:, :, index] = model_fp

    # Save predictions
    print(f'Saving predictions to {args.preds_path}')
    # assert len(test_data) == len(all_fingerprints) #TODO: add unit test for this
    makedirs(args.preds_path, isfile=True)

    # Set column names
    fingerprint_columns = []
    if args.fingerprint_type == 'MPN':
        if len(args.checkpoint_paths) == 1:
            for j in range(total_fp_size // args.number_of_molecules):
                for k in range(args.number_of_molecules):
                    fingerprint_columns.append(f'fp_{j}_mol_{k}')
        else:
            for j in range(total_fp_size // args.number_of_molecules):
                for i in range(len(args.checkpoint_paths)):
                    for k in range(args.number_of_molecules):
                        fingerprint_columns.append(f'fp_{j}_mol_{k}_model_{i}')

    else:  # args.fingerprint_type == 'last_FFN'
        if len(args.checkpoint_paths) == 1:
            for j in range(total_fp_size):
                fingerprint_columns.append(f'fp_{j}')
        else:
            for j in range(total_fp_size):
                for i in range(len(args.checkpoint_paths)):
                    fingerprint_columns.append(f'fp_{j}_model_{i}')

    # Copy predictions over to full_data
    for full_index, datapoint in enumerate(full_data):
        valid_index = full_to_valid_indices.get(full_index, None)
        preds = all_fingerprints[valid_index].reshape(
            (len(args.checkpoint_paths) * total_fp_size
             )) if valid_index is not None else ['Invalid SMILES'] * len(
                 args.checkpoint_paths) * total_fp_size

        for i in range(len(fingerprint_columns)):
            datapoint.row[fingerprint_columns[i]] = preds[i]

    # Write predictions
    with open(args.preds_path, 'w') as f:
        writer = csv.DictWriter(f,
                                fieldnames=args.smiles_columns +
                                fingerprint_columns,
                                extrasaction='ignore')
        writer.writeheader()
        for datapoint in full_data:
            writer.writerow(datapoint.row)

    return all_fingerprints
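A minimal invocation sketch for molecule_fingerprint above, assuming a chemprop-style FingerprintArgs as in the type hint. The flag names mirror the attributes read in the function but may differ between chemprop versions and forks, and the paths are placeholders.

from chemprop.args import FingerprintArgs  # assumed import path, as in recent chemprop releases

args = FingerprintArgs().parse_args([
    '--test_path', 'molecules.csv',           # CSV with a SMILES column
    '--checkpoint_dir', 'model_checkpoints',  # directory of trained .pt checkpoints
    '--preds_path', 'fingerprints.csv',
    '--fingerprint_type', 'MPN',              # or 'last_FFN' for the penultimate FFN layer
])

# Returns an array of shape (num_molecules, total_fp_size, num_checkpoints)
# and also writes one set of fingerprint columns per model to preds_path.
fingerprints = molecule_fingerprint(args)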
Example No. 22
def make_predictions(args: Namespace, smiles: List[str] = None) -> List[Optional[List[float]]]:
    """
    Makes predictions. If smiles is provided, makes predictions on smiles. Otherwise makes predictions on args.test_data.

    :param args: Arguments.
    :param smiles: Smiles to make predictions on.
    :return: A list of lists of target predictions.
    """
    if args.gpu is not None:
        torch.cuda.set_device(args.gpu)

    print('Loading training args')
    scaler, drug_scaler, cmpd_scaler = load_scalers(args.checkpoint_paths[0])
    train_args = load_args(args.checkpoint_paths[0])

    # Update args with training arguments
    for key, value in vars(train_args).items():
        if not hasattr(args, key):
            setattr(args, key, value)

    print('Loading data')
    if smiles is not None:
        test_data = get_data_from_smiles(smiles=smiles, skip_invalid_smiles=False)
    else:
        test_data = get_data(path=args.test_path, args=args, use_compound_names=args.use_compound_names, skip_invalid_smiles=False)

    print('Validating SMILES')
    valid_indices = [i for i in range(len(test_data)) if test_data[i].drug_mol is not None]
    full_data = test_data
    test_data = MolPairDataset([test_data[i] for i in valid_indices])

    # Edge case if empty list of smiles is provided
    if len(test_data) == 0:
        return [None] * len(full_data)

    if args.use_compound_names:
        compound_names = test_data.compound_names()
    print(f'Test size = {len(test_data):,}')

    # Normalize features
    if train_args.features_scaling:
        test_data.normalize_features(drug_scaler, cmpd_scaler)

    # Predict with each model individually and sum predictions
    if args.dataset_type == 'multiclass':
        sum_preds = np.zeros((len(test_data), args.num_tasks, args.multiclass_num_classes))
    else:
        sum_preds = np.zeros((len(test_data), args.num_tasks))
    print(f'Predicting with an ensemble of {len(args.checkpoint_paths)} models')
    for checkpoint_path in tqdm(args.checkpoint_paths, total=len(args.checkpoint_paths)):
        # Load model
        model = load_checkpoint(checkpoint_path, cuda=args.cuda)
        model_preds = predict(
            model=model,
            data=test_data,
            batch_size=args.batch_size,
            scaler=scaler  # TODO: Shouldn't this be the custom scalers if avail?
        )
        sum_preds += np.array(model_preds)

    # Ensemble predictions
    avg_preds = sum_preds / len(args.checkpoint_paths)
    avg_preds = avg_preds.tolist()

    # Save predictions
    assert len(test_data) == len(avg_preds)
    print(f'Saving predictions to {args.preds_path}')

    # Put Nones for invalid smiles
    full_preds = [None] * len(full_data)
    for i, si in enumerate(valid_indices):
        full_preds[si] = avg_preds[i]
    avg_preds = full_preds
    test_smiles = full_data.smiles()

    # Write predictions
    with open(args.preds_path, 'w') as f:
        writer = csv.writer(f)

        header = ['drugSMILE', 'cmpdSMILE']

        if args.dataset_type == 'multiclass':
            for name in args.task_names:
                for i in range(args.multiclass_num_classes):
                    header.append(name + '_class' + str(i))
        else:
            header.extend(args.task_names)
        writer.writerow(header)

        for i in range(len(avg_preds)):
            row = [test_smiles[i][0], test_smiles[i][1]]

            if avg_preds[i] is not None:
                if args.dataset_type == 'multiclass':
                    for task_probs in avg_preds[i]:
                        row.extend(task_probs)
                else:
                    row.extend(avg_preds[i])
            else:
                if args.dataset_type == 'multiclass':
                    row.extend([''] * args.num_tasks * args.multiclass_num_classes)
                else:
                    row.extend([''] * args.num_tasks)

            writer.writerow(row)

    return avg_preds
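A minimal invocation sketch for the paired-molecule variant above. The Namespace supplies only the prediction-time fields referenced in the function (gpu, cuda, checkpoint_paths, test_path, use_compound_names, batch_size, preds_path); model and task attributes (dataset_type, task_names, ...) are copied in from the training args stored in the checkpoint, and depending on the fork get_data may expect additional attributes. Paths and values are placeholders.

from argparse import Namespace

args = Namespace(
    gpu=None,                                   # set to a device index to call torch.cuda.set_device
    cuda=False,                                 # passed to load_checkpoint
    checkpoint_paths=['model_0.pt', 'model_1.pt'],
    test_path='drug_cmpd_pairs.csv',            # CSV with drug and compound SMILES columns
    use_compound_names=False,
    batch_size=50,
    preds_path='pair_predictions.csv',
)

avg_preds = make_predictions(args)              # averaged over the two checkpoints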
Example No. 23
def make_predictions(args: Namespace,
                     smiles: List[str] = None) -> List[Optional[List[float]]]:
    """
    Makes predictions. If smiles is provided, makes predictions on smiles. Otherwise makes predictions on args.test_data.

    :param args: Arguments.
    :param smiles: Smiles to make predictions on.
    :return: A list of lists of target predictions.
    """
    if args.gpu is not None:
        torch.cuda.set_device(args.gpu)

    print('Loading training args')
    scaler, features_scaler = load_scalers(args.checkpoint_paths[0])
    train_args = load_args(args.checkpoint_paths[0])
    data = smiles
    # Update args with training arguments
    for key, value in vars(train_args).items():
        if not hasattr(args, key):
            setattr(args, key, value)

    print('Loading data')
    # if smiles is not None:
    #     test_data = get_data_from_smiles_fast(smiles=smiles, skip_invalid_smiles=False)
    # else:
    #     test_data = get_data(path=args.test_path, args=args, use_compound_names=args.use_compound_names, skip_invalid_smiles=False)
    with open(args.test_path, 'r') as f:
        smiles = list(map(lambda x: x.split(',')[0].strip(),
                          f.readlines()[1:]))
    assert smiles is not None

    print('Validating SMILES')
    #
    # valid_indices = [i for i in range(len(test_data)) if test_data[i].mol is not None]
    # full_data = test_data
    # test_data = MoleculeDataset([test_data[i] for i in valid_indices])
    #
    # # Edge case if empty list of smiles is provided
    # if len(test_data) == 0:
    #     return [None] * len(full_data)
    #
    # if args.use_compound_names:
    #     compound_names = test_data.compound_names()
    # print(f'Test size = {len(test_data):,}')
    #
    # # Normalize features
    # if train_args.features_scaling:
    #     test_data.normalize_features(features_scaler)

    # Predict with each model individually and sum predictions
    # if args.dataset_type == 'multiclass':
    #     sum_preds = np.zeros((len(smiles), args.num_tasks, args.multiclass_num_classes))
    # else:
    # sum_preds = np.zeros((len(smiles), args.num_tasks))
    print(
        f'Predicting with an ensemble of {len(args.checkpoint_paths)} models')
    for checkpoint_path in tqdm(args.checkpoint_paths,
                                total=len(args.checkpoint_paths)):
        # Load model
        model = load_checkpoint(checkpoint_path, cuda=args.cuda)
        avg_preds = predict(model=model,
                            data=smiles,
                            batch_size=args.batch_size,
                            scaler=scaler,
                            args=args)
        # avg_preds += np.array(model_preds)

    # Ensemble predictions
    # avg_preds = sum_preds / len(args.checkpoint_paths)
    # avg_preds = avg_preds.tolist()

    # Save predictions
    print(len(smiles), len(avg_preds))
    assert len(smiles) == len(avg_preds)
    print(f'Saving predictions to {args.preds_path}')

    # Put Nones for invalid smiles
    full_preds = avg_preds
    # for i, si in enumerate(valid_indices):
    #     full_preds[si] = avg_preds[i]
    avg_preds = full_preds
    test_smiles = smiles

    # Write predictions
    with open(args.preds_path, 'w') as f:
        writer = csv.writer(f)

        header = []

        if args.use_compound_names:
            header.append('compound_names')

        header.append('smiles')

        if args.dataset_type == 'multiclass':
            for name in args.task_names:
                for i in range(args.multiclass_num_classes):
                    header.append(name + '_class' + str(i))
        else:
            header.extend(args.task_names)
        writer.writerow(header)

        for i in range(len(avg_preds)):
            row = []

            # if args.use_compound_names:
            #     row.append(compound_names[i])

            row.append(test_smiles[i])

            if avg_preds[i] is not None:
                if args.dataset_type == 'multiclass':
                    for task_probs in avg_preds[i]:
                        row.extend(task_probs)
                else:
                    row.extend(avg_preds[i])
            else:
                if args.dataset_type == 'multiclass':
                    row.extend([''] * args.num_tasks *
                               args.multiclass_num_classes)
                else:
                    row.extend([''] * args.num_tasks)

            writer.writerow(row)

    return avg_preds
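A similar invocation sketch for the variant above, which reads SMILES from the first column of args.test_path and, because the ensembling lines are commented out, keeps only the predictions from the last checkpoint in the loop. Field names come from the attributes the function references; file names and values are placeholders, and other forks may expect additional attributes.

import csv
from argparse import Namespace

# Write a tiny input file: a header row, then one SMILES per line in the first column.
with open('test_smiles.csv', 'w', newline='') as f:
    csv.writer(f).writerows([['smiles'], ['CCO'], ['c1ccccc1']])

args = Namespace(
    gpu=None,
    cuda=False,
    checkpoint_paths=['model.pt'],
    test_path='test_smiles.csv',
    use_compound_names=False,
    batch_size=50,
    preds_path='predictions.csv',
)

avg_preds = make_predictions(args)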