Code example #1
File: interpret.py  Project: tsenapathi/chemprop
    def __init__(self, args: InterpretArgs) -> None:
        """
        :param args: A :class:`~chemprop.args.InterpretArgs` object containing arguments for interpretation.
        """
        self.args = args
        self.train_args = load_args(args.checkpoint_paths[0])

        # If features were used during training, they must be used when predicting
        if ((self.train_args.features_path is not None
             or self.train_args.features_generator is not None)
                and args.features_generator is None):
            raise ValueError(
                'Features were used during training so they must be specified again during prediction '
                'using the same type of features as before (with --features_generator <generator> '
                'and using --no_features_scaling if applicable).')

        if self.train_args.atom_descriptors_size > 0 or self.train_args.atom_features_size > 0 or self.train_args.bond_features_size > 0:
            raise NotImplementedError(
                'The interpret function does not yet work with additional atom or bond features'
            )

        self.scaler, self.features_scaler, self.atom_descriptor_scaler, self.bond_feature_scaler = load_scalers(
            args.checkpoint_paths[0])
        self.checkpoints = [
            load_checkpoint(checkpoint_path, device=args.device)
            for checkpoint_path in args.checkpoint_paths
        ]
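The constructor above shows the loading pattern that most of the examples on this page share: read the training arguments and scalers from the first checkpoint, then load every checkpoint onto the target device. Below is a minimal standalone sketch of that pattern; the file names are placeholders, and note that load_scalers returns either 2 or 4 values depending on the chemprop version (compare examples #1 and #5).

# Minimal sketch of the shared checkpoint-loading pattern; paths are placeholders.
import torch
from chemprop.utils import load_args, load_checkpoint, load_scalers

checkpoint_paths = ['model_0.pt', 'model_1.pt']   # placeholder checkpoint files
train_args = load_args(checkpoint_paths[0])       # TrainArgs stored in the checkpoint
scalers = load_scalers(checkpoint_paths[0])       # target scaler first, feature scaler(s) after
models = [load_checkpoint(path, device=torch.device('cpu'))
          for path in checkpoint_paths]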
Code example #2
File: make_predictions.py  Project: chemprop/chemprop
def load_model(args: PredictArgs, generator: bool = False):
    """
    Function to load a model or ensemble of models from file. If generator is True, a generator of the respective model
    and scaler objects is returned (memory efficient); otherwise, full lists are returned (holding all models in memory,
    which is necessary for preloading).

    :param args: A :class:`~chemprop.args.PredictArgs` object containing arguments for
                 loading data and a model and making predictions.
    :param generator: A boolean to return a generator instead of a list of models and scalers.
    :return: A tuple of updated prediction arguments, training arguments, a list or generator object of models, a list or 
                 generator object of scalers, the number of tasks and their respective names.
    """
    print('Loading training args')
    train_args = load_args(args.checkpoint_paths[0])
    num_tasks, task_names = train_args.num_tasks, train_args.task_names

    update_prediction_args(predict_args=args, train_args=train_args)
    args: Union[PredictArgs, TrainArgs]

    # Load model and scalers
    models = (load_checkpoint(checkpoint_path, device=args.device) for checkpoint_path in args.checkpoint_paths)
    scalers = (load_scalers(checkpoint_path) for checkpoint_path in args.checkpoint_paths)
    if not generator:
        models = list(models)
        scalers = list(scalers)

    return args, train_args, models, scalers, num_tasks, task_names
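One way the generator option might be used, sketched under the assumption that args is an already-parsed PredictArgs with checkpoint_paths populated; the loop body is only indicated, not taken from this file.

# Hypothetical caller of load_model; args is assumed to be a parsed PredictArgs.
args, train_args, models, scalers, num_tasks, task_names = load_model(args, generator=True)

# With generator=True the checkpoints are yielded lazily, so only one model
# is held in memory at a time while iterating over the ensemble.
for model, scaler_tuple in zip(models, scalers):
    pass  # run predict(...) with this model and accumulate its predictions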
Code example #3
def predict_smile(checkpoint_path: str, smile: str):
    """
    Makes a prediction for a single SMILES string using a trained checkpoint.

    :param checkpoint_path: Path to a trained model checkpoint (.pt file).
    :param smile: SMILES string to make a prediction on.
    :return: The prediction for the first target.
    """
    smiles = [smile]
    args = Namespace()
    # print('Loading training args')
    scaler, features_scaler = load_scalers(checkpoint_path)
    train_args = load_args(checkpoint_path)

    # Update args with training arguments
    for key, value in vars(train_args).items():
        if not hasattr(args, key):
            setattr(args, key, value)

    # print('Loading data')
    if smiles is not None:
        test_data = get_data_from_smiles(smiles=smiles,
                                         skip_invalid_smiles=False)
    else:
        print("Enter Valid Smile String")
        return

    # print('Validating SMILES')
    valid_indices = [
        i for i in range(len(test_data)) if test_data[i].mol is not None
    ]
    full_data = test_data
    test_data = MoleculeDataset([test_data[i] for i in valid_indices])

    # Edge case if empty list of smiles is provided
    if len(test_data) == 0:
        return [None] * len(full_data)

    # Normalize features
    if train_args.features_scaling:
        test_data.normalize_features(features_scaler)

    # Predict with each model individually and sum predictions
    if args.dataset_type == 'multiclass':
        sum_preds = np.zeros(
            (len(test_data), args.num_tasks, args.multiclass_num_classes))
    else:
        sum_preds = np.zeros((len(test_data), args.num_tasks))

    model = load_checkpoint(checkpoint_path, cuda=args.cuda)
    model_preds = predict(model=model,
                          data=test_data,
                          batch_size=1,
                          scaler=scaler)
    sum_preds += np.array(model_preds)

    # Ensemble predictions
    return sum_preds[0][0]
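Calling the helper above is then a one-liner; the checkpoint path and SMILES below are placeholders, not values from the original snippet.

# Hypothetical call; 'model.pt' must be a chemprop checkpoint for a single-task model.
value = predict_smile('model.pt', 'CCO')
print(value)  # prediction for the first target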
Code example #4
File: interpret.py  Project: zhenghl2/chemprop
    def __init__(self, checkpoint_dir):
        self.checkpoints = []
        for root, _, files in os.walk(checkpoint_dir):
            for fname in files:
                if fname.endswith('.pt'):
                    fname = os.path.join(root, fname)
                    self.scaler, self.features_scaler = load_scalers(fname)
                    self.train_args = load_args(fname)
                    model = load_checkpoint(fname, cuda=True)
                    self.checkpoints.append(model)
Code example #5
    def __init__(self, args: InterpretArgs) -> None:
        self.args = args
        self.train_args = load_args(args.checkpoint_paths[0])

        # If features were used during training, they must be used when predicting
        if ((self.train_args.features_path is not None or self.train_args.features_generator is not None)
                and args.features_generator is None):
            raise ValueError('Features were used during training so they must be specified again during prediction '
                             'using the same type of features as before (with --features_generator <generator> '
                             'and using --no_features_scaling if applicable).')

        self.scaler, self.features_scaler = load_scalers(args.checkpoint_paths[0])
        self.checkpoints = [load_checkpoint(checkpoint_path, device=args.device) for checkpoint_path in args.checkpoint_paths]
Code example #6
    def __init__(self, checkpoint_paths: List[str],
                 device: torch.device) -> None:
        self.train_args = load_args(checkpoint_paths[0])

        if self.train_args.features_path is not None and self.train_args.features_generator is None:
            raise ValueError(
                'Must specify features generator using --features_generator <generator> '
                'when using a model trained with additional features.')

        self.scaler, self.features_scaler = load_scalers(checkpoint_paths[0])
        self.checkpoints = [
            load_checkpoint(checkpoint_path, device=device)
            for checkpoint_path in checkpoint_paths
        ]
Code example #7
File: views.py  Project: rmovva/chemprop
def upload_checkpoint(return_page: str):
    """
    Uploads a checkpoint .pt file.

    :param return_page: The name of the page to render after uploading the checkpoint file.
    """
    warnings, errors = [], []

    current_user = request.cookies.get('currentUser')

    if not current_user:
        # Use DEFAULT as current user if the client's cookie is not set.
        current_user = app.config['DEFAULT_USER_ID']

    ckpt = request.files['checkpoint']

    ckpt_name = request.form['checkpointName']

    # Create temporary file to get ckpt_args without losing data.
    with NamedTemporaryFile() as temp_file:
        ckpt.save(temp_file.name)

        ckpt_args = load_args(temp_file)

        ckpt_id, new_ckpt_name = db.insert_ckpt(ckpt_name, current_user,
                                                ckpt_args.dataset_type,
                                                ckpt_args.epochs, 1,
                                                ckpt_args.train_data_size)

        model_id = db.insert_model(ckpt_id)

        model_path = os.path.join(app.config['CHECKPOINT_FOLDER'],
                                  f'{model_id}.pt')

        if ckpt_name != new_ckpt_name:
            warnings.append(
                name_already_exists_message('Checkpoint', ckpt_name,
                                            new_ckpt_name))

        shutil.copy(temp_file.name, model_path)

    warnings, errors = json.dumps(warnings), json.dumps(errors)

    return redirect(
        url_for(return_page,
                checkpoint_upload_warnings=warnings,
                checkpoint_upload_errors=errors))
Code example #8
    def __init__(self, args: Namespace):

        if args.computed_prop is not None:
            self.computed_prop = True

            if args.computed_prop == 'penalized_logp':
                self.scorer = penalized_logp
            elif args.computed_prop == 'logp':
                self.scorer = logp
            elif args.computed_prop == 'qed':
                self.scorer = qed
            elif args.computed_prop == 'sascore':
                self.scorer = sascore
            elif args.computed_prop == 'drd2':
                self.scorer = drd2
            else:
                raise ValueError
            return

        self.computed_prop = False

        chemprop_paths = []
        for root, _, files in os.walk(args.chemprop_dir):
            for fname in files:
                if fname.endswith('.pt'):
                    chemprop_paths.append(os.path.join(root, fname))

        self.scaler, self.features_scaler = load_scalers(chemprop_paths[0])
        self.train_args = load_args(chemprop_paths[0])
        if self.train_args.features_path is not None:
            self.train_args.features_path = None
            self.train_args.features_generator = ['rdkit_2d_normalized']  # just assume this
        self.num_tasks = self.train_args.num_tasks
        self.batch_size = args.batch_size * 4
        self.features_generator = get_features_generator(
            args.features_generator[0]
        ) if args.features_generator is not None else None
        self.neg_threshold = args.neg_threshold
        self.prop_index = args.prop_index

        self.chemprop_models = []
        for checkpoint_path in chemprop_paths:
            self.chemprop_models.append(
                load_checkpoint(checkpoint_path, cuda=True))
Code example #9
def predict():
    """Renders the predict page and makes predictions if the method is POST."""
    if request.method == 'GET':
        return render_predict()

    # Get arguments
    ckpt_id = request.form['checkpointName']

    if request.form['textSmiles'] != '':
        smiles = request.form['textSmiles'].split()
    elif request.form['drawSmiles'] != '':
        smiles = [request.form['drawSmiles']]
    else:
        # Upload data file with SMILES
        data = request.files['data']
        data_name = secure_filename(data.filename)
        data_path = os.path.join(app.config['TEMP_FOLDER'], data_name)
        data.save(data_path)

        # Check if header is smiles
        possible_smiles = get_header(data_path)[0]
        smiles = [possible_smiles
                  ] if Chem.MolFromSmiles(possible_smiles) is not None else []

        # Get remaining smiles
        smiles.extend(get_smiles(data_path))

    models = db.get_models(ckpt_id)
    model_paths = [
        os.path.join(app.config['CHECKPOINT_FOLDER'], f'{model["id"]}.pt')
        for model in models
    ]

    task_names = load_task_names(model_paths[0])
    num_tasks = len(task_names)
    gpu = request.form.get('gpu')
    train_args = load_args(model_paths[0])

    # Build arguments
    arguments = [
        '--test_path', 'None',
        '--preds_path', os.path.join(app.config['TEMP_FOLDER'],
                                     app.config['PREDICTIONS_FILENAME']),
        '--checkpoint_paths', *model_paths
    ]

    if gpu is not None:
        if gpu == 'None':
            arguments.append('--no_cuda')
        else:
            arguments += ['--gpu', gpu]

    # Handle additional features
    if train_args.features_path is not None:
        # TODO: make it possible to specify the features generator if trained using features_path
        arguments += [
            '--features_generator', 'rdkit_2d_normalized',
            '--no_features_scaling'
        ]
    elif train_args.features_generator is not None:
        arguments += ['--features_generator', *train_args.features_generator]

        if not train_args.features_scaling:
            arguments.append('--no_features_scaling')

    # Parse arguments
    args = PredictArgs().parse_args(arguments)

    # Run predictions
    preds = make_predictions(args=args, smiles=smiles)

    if all(p is None for p in preds):
        return render_predict(errors=['All SMILES are invalid'])

    # Replace invalid smiles with message
    invalid_smiles_warning = 'Invalid SMILES String'
    preds = [
        pred if pred is not None else [invalid_smiles_warning] * num_tasks
        for pred in preds
    ]

    return render_predict(
        predicted=True,
        smiles=smiles,
        num_smiles=min(10, len(smiles)),
        show_more=max(0,
                      len(smiles) - 10),
        task_names=task_names,
        num_tasks=len(task_names),
        preds=preds,
        warnings=["List contains invalid SMILES strings"]
        if None in preds else None,
        errors=["No SMILES strings given"] if len(preds) == 0 else None)
Code example #10
def make_predictions(args: Namespace,
                     smiles: List[str] = None) -> List[Optional[List[float]]]:
    """
    Makes predictions. If smiles is provided, makes predictions on smiles. Otherwise makes predictions on args.test_data.

    :param args: Arguments.
    :param smiles: Smiles to make predictions on.
    :return: A list of lists of target predictions.
    """
    if args.gpu is not None:
        torch.cuda.set_device(args.gpu)

    print('Loading training args')
    scaler, features_scaler = load_scalers(args.checkpoint_paths[0])
    train_args = load_args(args.checkpoint_paths[0])

    # Update args with training arguments
    for key, value in vars(train_args).items():
        if not hasattr(args, key):
            setattr(args, key, value)

    print('Loading data')
    if smiles is not None:
        test_data = get_data_from_smiles(smiles=smiles,
                                         skip_invalid_smiles=False,
                                         args=args)
    else:
        test_data = get_data(path=args.test_path,
                             args=args,
                             use_compound_names=args.use_compound_names,
                             skip_invalid_smiles=False)

    print('Validating SMILES')
    valid_indices = [
        i for i in range(len(test_data)) if test_data[i].mol is not None
    ]
    full_data = test_data
    test_data = MoleculeDataset([test_data[i] for i in valid_indices])

    # Edge case if empty list of smiles is provided
    if len(test_data) == 0:
        return [None] * len(full_data)

    if args.use_compound_names:
        compound_names = test_data.compound_names()
    print(f'Test size = {len(test_data):,}')

    # Normalize features
    if train_args.features_scaling:
        test_data.normalize_features(features_scaler)

    # Predict with each model individually and sum predictions
    if args.dataset_type == 'multiclass':
        sum_preds = np.zeros(
            (len(test_data), args.num_tasks, args.multiclass_num_classes))
        sum_ale_uncs = np.zeros(
            (len(test_data), args.num_tasks, args.multiclass_num_classes))
        sum_epi_uncs = np.zeros(
            (len(test_data), args.num_tasks, args.multiclass_num_classes))
    else:
        sum_preds = np.zeros((len(test_data), args.num_tasks))
        sum_ale_uncs = np.zeros((len(test_data), args.num_tasks))
        sum_epi_uncs = np.zeros((len(test_data), args.num_tasks))

    # Partial results for variance robust calculation.
    all_preds = np.zeros(
        (len(test_data), args.num_tasks, len(args.checkpoint_paths)))

    print(
        f'Predicting with an ensemble of {len(args.checkpoint_paths)} models')
    for index, checkpoint_path in enumerate(
            tqdm(args.checkpoint_paths, total=len(args.checkpoint_paths))):
        # Load model
        model = load_checkpoint(checkpoint_path, cuda=args.cuda)
        model_preds, ale_uncs, epi_uncs = predict(
            model=model,
            data=test_data,
            batch_size=args.batch_size,
            scaler=scaler,
            sampling_size=args.sampling_size)
        sum_preds += np.array(model_preds)
        if ale_uncs is not None:
            sum_ale_uncs += np.array(ale_uncs)
        if epi_uncs is not None:
            sum_epi_uncs += np.array(epi_uncs)
        if args.estimate_variance:
            all_preds[:, :, index] = model_preds
        print('\nmodel_preds\n', model_preds)
        print('ale_uncs\n', ale_uncs)

    # Ensemble predictions

    if args.estimate_variance:
        # Use ensemble variance to estimate uncertainty. This overwrites existing uncertainty estimates.
        # preds <- mean(preds), ale_uncs <- mean(ale_uncs), epi_uncs <- var(preds)
        avg_preds = sum_preds / len(args.checkpoint_paths)
        avg_preds = avg_preds.tolist()

        avg_ale_uncs = sum_ale_uncs / len(args.checkpoint_paths)
        avg_ale_uncs = avg_ale_uncs.tolist()

        avg_epi_uncs = np.var(all_preds, axis=2)
        avg_epi_uncs = avg_epi_uncs.tolist()

    else:
        # Use another method to estimate uncertainty.
        # preds <- mean(preds), ale_uncs <- mean(ale_uncs), epi_uncs <- mean(epi_uncs)
        avg_preds = sum_preds / len(args.checkpoint_paths)
        avg_preds = avg_preds.tolist()

        avg_ale_uncs = sum_ale_uncs / len(args.checkpoint_paths)
        avg_ale_uncs = avg_ale_uncs.tolist()

        avg_epi_uncs = sum_epi_uncs / len(args.checkpoint_paths)
        avg_epi_uncs = avg_epi_uncs.tolist()

    # Save predictions
    assert len(test_data) == len(avg_preds)
    assert len(test_data) == len(avg_ale_uncs)
    assert len(test_data) == len(avg_epi_uncs)

    print(f'Saving predictions to {args.preds_path}')

    # Put Nones for invalid smiles
    full_preds = [None] * len(full_data)
    full_ale_uncs = [None] * len(full_data)
    full_epi_uncs = [None] * len(full_data)

    for i, si in enumerate(valid_indices):
        full_preds[si] = avg_preds[i]
        full_ale_uncs[si] = avg_ale_uncs[i]
        full_epi_uncs[si] = avg_epi_uncs[i]

    avg_preds = full_preds
    avg_ale_uncs = full_ale_uncs
    avg_epi_uncs = full_epi_uncs

    test_smiles = full_data.smiles()

    # Write predictions
    with open(args.preds_path, 'w') as f:
        writer = csv.writer(f)

        header = []

        if args.use_compound_names:
            header.append('compound_names')

        header.append('smiles')

        if args.dataset_type == 'multiclass':
            for name in args.task_names:
                for i in range(args.multiclass_num_classes):
                    header.append(name + '_class' + str(i))
        else:
            header.extend(args.task_names)

            header.extend([tn + "_ale_unc" for tn in args.task_names])

            header.extend([tn + "_epi_unc" for tn in args.task_names])

        writer.writerow(header)

        for i in range(len(avg_preds)):
            row = []

            if args.use_compound_names:
                row.append(compound_names[i])

            row.append(test_smiles[i])

            if avg_preds[i] is not None:
                if args.dataset_type == 'multiclass':
                    for task_probs in avg_preds[i]:
                        row.extend(task_probs)
                else:
                    row.extend(avg_preds[i])
                    row.extend(avg_ale_uncs[i])
                    row.extend(avg_epi_uncs[i])
            else:
                if args.dataset_type == 'multiclass':
                    row.extend([''] * args.num_tasks *
                               args.multiclass_num_classes)
                else:
                    # Both the prediction, the aleatoric uncertainty and the epistemic uncertainty are None
                    row.extend([''] * 3 * args.num_tasks)

            writer.writerow(row)

    return avg_preds
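The uncertainty bookkeeping in this example reduces to three array operations: the mean of the per-model predictions, the mean of the per-model aleatoric uncertainties, and, when --estimate_variance is set, the variance of the predictions across models as the epistemic term. A small numpy sketch with made-up shapes, just to illustrate the arithmetic:

# Illustrative only; shapes and values are invented.
import numpy as np

n_mols, n_tasks, n_models = 4, 2, 3
all_preds = np.random.rand(n_mols, n_tasks, n_models)  # stacked per-model predictions
all_ale = np.random.rand(n_mols, n_tasks, n_models)    # stacked per-model aleatoric uncertainties

avg_preds = all_preds.mean(axis=2)     # ensemble prediction
avg_ale_uncs = all_ale.mean(axis=2)    # averaged aleatoric uncertainty
epi_uncs = all_preds.var(axis=2)       # ensemble variance as epistemic uncertainty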
Code example #11
File: make_predictions.py  Project: aivant/chemprop
def make_predictions(
        args: PredictArgs,
        smiles: List[List[str]] = None) -> List[List[Optional[float]]]:
    """
    Loads data and a trained model and uses the model to make predictions on the data.

    If SMILES are provided, then makes predictions on smiles.
    Otherwise makes predictions on :code:`args.test_data`.

    :param args: A :class:`~chemprop.args.PredictArgs` object containing arguments for
                 loading data and a model and making predictions.
    :param smiles: List of list of SMILES to make predictions on.
    :return: A list of lists of target predictions.
    """
    print("Loading training args")
    train_args = load_args(args.checkpoint_paths[0])
    num_tasks, task_names = train_args.num_tasks, train_args.task_names

    update_prediction_args(predict_args=args, train_args=train_args)
    args: Union[PredictArgs, TrainArgs]

    if args.atom_descriptors == "feature":
        set_extra_atom_fdim(train_args.atom_features_size)

    if args.bond_features_path is not None:
        set_extra_bond_fdim(train_args.bond_features_size)

    # set explicit H option and reaction option
    set_explicit_h(train_args.explicit_h)
    set_reaction(train_args.reaction, train_args.reaction_mode)

    print("Loading data")
    if smiles is not None:
        full_data = get_data_from_smiles(
            smiles=smiles,
            skip_invalid_smiles=False,
            features_generator=args.features_generator,
        )
    else:
        full_data = get_data(
            path=args.test_path,
            smiles_columns=args.smiles_columns,
            target_columns=[],
            ignore_columns=[],
            skip_invalid_smiles=False,
            args=args,
            store_row=not args.drop_extra_columns,
        )

    print("Validating SMILES")
    full_to_valid_indices = {}
    valid_index = 0
    for full_index in range(len(full_data)):
        if all(mol is not None for mol in full_data[full_index].mol):
            full_to_valid_indices[full_index] = valid_index
            valid_index += 1

    test_data = MoleculeDataset(
        [full_data[i] for i in sorted(full_to_valid_indices.keys())])

    # Edge case if empty list of smiles is provided
    if len(test_data) == 0:
        return [None] * len(full_data)

    print(f"Test size = {len(test_data):,}")

    # Predict with each model individually and sum predictions
    if args.dataset_type == "multiclass":
        sum_preds = np.zeros(
            (len(test_data), num_tasks, args.multiclass_num_classes))
    else:
        sum_preds = np.zeros((len(test_data), num_tasks))

    # Create data loader
    test_data_loader = MoleculeDataLoader(
        dataset=test_data,
        batch_size=args.batch_size,
        num_workers=0 if sys.platform == "darwin" else args.num_workers,
    )

    # Partial results for variance robust calculation.
    if args.ensemble_variance:
        all_preds = np.zeros(
            (len(test_data), num_tasks, len(args.checkpoint_paths)))

    print(
        f"Predicting with an ensemble of {len(args.checkpoint_paths)} models")
    for index, checkpoint_path in enumerate(
            tqdm(args.checkpoint_paths, total=len(args.checkpoint_paths))):
        # Load model and scalers
        model = load_checkpoint(checkpoint_path, device=args.device)
        (
            scaler,
            features_scaler,
            atom_descriptor_scaler,
            bond_feature_scaler,
        ) = load_scalers(checkpoint_path)

        # Normalize features
        if (args.features_scaling or train_args.atom_descriptor_scaling
                or train_args.bond_feature_scaling):
            test_data.reset_features_and_targets()
            if args.features_scaling:
                test_data.normalize_features(features_scaler)
            if (train_args.atom_descriptor_scaling
                    and args.atom_descriptors is not None):
                test_data.normalize_features(atom_descriptor_scaler,
                                             scale_atom_descriptors=True)
            if train_args.bond_feature_scaling and args.bond_features_size > 0:
                test_data.normalize_features(bond_feature_scaler,
                                             scale_bond_features=True)

        # Make predictions
        model_preds = predict(model=model,
                              data_loader=test_data_loader,
                              scaler=scaler)
        sum_preds += np.array(model_preds)
        if args.ensemble_variance:
            all_preds[:, :, index] = model_preds

    # Ensemble predictions
    avg_preds = sum_preds / len(args.checkpoint_paths)
    avg_preds = avg_preds.tolist()

    if args.ensemble_variance:
        all_epi_uncs = np.var(all_preds, axis=2)
        all_epi_uncs = all_epi_uncs.tolist()

    # Save predictions
    print(f"Saving predictions to {args.preds_path}")
    assert len(test_data) == len(avg_preds)
    if args.ensemble_variance:
        assert len(test_data) == len(all_epi_uncs)
    makedirs(args.preds_path, isfile=True)

    # Get prediction column names
    if args.dataset_type == "multiclass":
        task_names = [
            f"{name}_class_{i}" for name in task_names
            for i in range(args.multiclass_num_classes)
        ]
    else:
        task_names = task_names

    # Copy predictions over to full_data
    for full_index, datapoint in enumerate(full_data):
        valid_index = full_to_valid_indices.get(full_index, None)
        preds = (avg_preds[valid_index] if valid_index is not None else
                 ["Invalid SMILES"] * len(task_names))
        if args.ensemble_variance:
            epi_uncs = (all_epi_uncs[valid_index] if valid_index is not None
                        else ["Invalid SMILES"] * len(task_names))

        # If extra columns have been dropped, add back in SMILES columns
        if args.drop_extra_columns:
            datapoint.row = OrderedDict()

            smiles_columns = args.smiles_columns

            for column, smiles in zip(smiles_columns, datapoint.smiles):
                datapoint.row[column] = smiles

        # Add predictions columns
        if args.ensemble_variance:
            for pred_name, pred, epi_unc in zip(task_names, preds, epi_uncs):
                datapoint.row[pred_name] = pred
                datapoint.row[pred_name + "_epi_unc"] = epi_unc
        else:
            for pred_name, pred in zip(task_names, preds):
                datapoint.row[pred_name] = pred

    # Save
    with open(args.preds_path, "w") as f:
        writer = csv.DictWriter(f, fieldnames=full_data[0].row.keys())
        writer.writeheader()

        for datapoint in full_data:
            writer.writerow(datapoint.row)

    return avg_preds
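A typical programmatic invocation of this function might look like the sketch below; the paths are placeholders, and the PredictArgs().parse_args pattern is the same one used in the web view of example #9.

# Hypothetical invocation; file paths are placeholders.
from chemprop.args import PredictArgs

args = PredictArgs().parse_args([
    '--test_path', 'test.csv',
    '--preds_path', 'preds.csv',
    '--checkpoint_paths', 'model_0.pt', 'model_1.pt',
])
preds = make_predictions(args=args)  # or make_predictions(args=args, smiles=[['CCO']])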
Code example #12
File: views.py  Project: Divide-By-0/chemprop
def predict():
    """Renders the predict page and makes predictions if the method is POST."""
    if request.method == 'GET':
        return render_predict()

    # Get arguments
    ckpt_id = request.form['checkpointName']

    if request.form['textSmiles'] != '':
        smiles = request.form['textSmiles'].split()
    elif request.form['drawSmiles'] != '':
        smiles = [request.form['drawSmiles']]
    else:
        print(" GOT HERE")
        # Upload data file with SMILES
        data = request.files['data']
        data_name = secure_filename(data.filename)
        data_path = os.path.join(app.config['TEMP_FOLDER'], data_name)
        data.save(data_path)

        # Check if header is smiles
        possible_smiles = get_header(data_path)[0]
        smiles = [possible_smiles] if Chem.MolFromSmiles(possible_smiles) is not None else []

        # Get remaining smiles
        smiles.extend(get_smiles(data_path))

    models = db.get_models(ckpt_id)
    model_paths = [os.path.join(app.config['CHECKPOINT_FOLDER'], f'{model["id"]}.pt') for model in models]

    task_names = load_task_names(model_paths[0])
    num_tasks = len(task_names)
    gpu = request.form.get('gpu')

    # Create and modify args
    args = load_args(model_paths[0])

    if args.features_path != None:
        args.features_generator = ["rdkit_2d_normalized"]
        args.features_path = None

    preds_path = os.path.join(app.config['TEMP_FOLDER'], app.config['PREDICTIONS_FILENAME'])
    args.test_path = 'None'  # TODO: Remove this hack to avoid assert crashing in modify_predict_args
    args.preds_path = preds_path
    args.checkpoint_paths = model_paths
    if gpu is not None:
        if gpu == 'None':
            args.no_cuda = True
        else:
            args.gpu = int(gpu)

    modify_predict_args(args)

    # Run predictions
    preds = make_predictions(args, smiles=smiles)

    if all(p is None for p in preds):
        return render_predict(errors=['All SMILES are invalid'])

    # Replace invalid smiles with message
    invalid_smiles_warning = "Invalid SMILES String"
    preds = [pred if pred is not None else [invalid_smiles_warning] * num_tasks for pred in preds]

    return render_predict(predicted=True,
                          smiles=smiles,
                          num_smiles=min(10, len(smiles)),
                          show_more=max(0, len(smiles)-10),
                          task_names=task_names,
                          num_tasks=len(task_names),
                          preds=preds,
                          warnings=["List contains invalid SMILES strings"] if None in preds else None,
                          errors=["No SMILES strings given"] if len(preds) == 0 else None)
Code example #13
def make_predictions(args: Namespace,
                     smiles: List[str] = None,
                     invalid_smiles_warning: str = None) -> List[List[float]]:
    """Makes predictions."""
    if args.gpu is not None:
        torch.cuda.set_device(args.gpu)

    if invalid_smiles_warning is not None:
        success_indices = []
        for i, s in enumerate(smiles):
            mol = Chem.MolFromSmiles(s)
            if mol is not None:
                success_indices.append(i)
        full_smiles = smiles
        smiles = [smiles[i] for i in success_indices]

    print('Loading training args')
    scaler, features_scaler = load_scalers(args.checkpoint_paths[0])
    train_args = load_args(args.checkpoint_paths[0])

    # Update args with training arguments
    for key, value in vars(train_args).items():
        if not hasattr(args, key):
            setattr(args, key, value)

    print('Loading data')
    if smiles is not None:
        test_data = get_data_from_smiles(smiles)
    else:
        test_data = get_data(args.test_path,
                             args,
                             use_compound_names=args.compound_names)
    test_smiles = test_data.smiles()
    if args.compound_names:
        compound_names = test_data.compound_names()
    print('Test size = {:,}'.format(len(test_data)))

    # Normalize features
    if train_args.features_scaling:
        test_data.normalize_features(features_scaler)

    # Predict with each model individually and sum predictions
    sum_preds = np.zeros((len(test_data), args.num_tasks))
    print('Predicting with an ensemble of {} models'.format(
        len(args.checkpoint_paths)))
    for checkpoint_path in tqdm(args.checkpoint_paths,
                                total=len(args.checkpoint_paths)):
        # Load model
        model = load_checkpoint(checkpoint_path, cuda=args.cuda)
        model_preds = predict(model=model,
                              data=test_data,
                              args=args,
                              scaler=scaler)
        sum_preds += np.array(model_preds)

    # Ensemble predictions
    avg_preds = sum_preds / args.ensemble_size
    avg_preds = avg_preds.tolist()

    # Save predictions
    assert len(test_data) == len(avg_preds)
    print('Saving predictions to {}'.format(args.preds_path))

    with open(args.preds_path, 'w') as f:
        if args.write_smiles:
            f.write('smiles,')
        if args.compound_names:
            f.write('compound_name,')
        f.write(','.join(args.task_names) + '\n')

        for i in range(len(avg_preds)):
            if args.write_smiles:
                f.write(test_smiles[i] + ',')
            if args.compound_names:
                f.write(compound_names[i] + ',')
            f.write(','.join(str(p) for p in avg_preds[i]) + '\n')

    if invalid_smiles_warning is not None:
        full_preds = [[invalid_smiles_warning]
                      for _ in range(len(full_smiles))]
        for i, si in enumerate(success_indices):
            full_preds[si] = avg_preds[i]
        return full_preds

    return avg_preds
Code example #14
def make_predictions(args: Namespace,
                     smiles: List[str] = None) -> List[Optional[List[float]]]:
    """
    Makes predictions. If smiles is provided, makes predictions on smiles. Otherwise makes predictions on args.test_data.

    :param args: Arguments.
    :param smiles: Smiles to make predictions on.
    :return: A list of lists of target predictions.
    """
    if args.gpu is not None:
        torch.cuda.set_device(args.gpu)

    print('Loading training args')
    scaler, features_scaler = load_scalers(args.checkpoint_paths[0])
    train_args = load_args(args.checkpoint_paths[0])

    # Update args with training arguments
    for key, value in vars(train_args).items():
        if not hasattr(args, key):
            setattr(args, key, value)

    print('Loading data')
    if smiles is not None:
        test_data = get_data_from_smiles(smiles=smiles,
                                         skip_invalid_smiles=False,
                                         args=args)
    else:
        test_data = get_data(path=args.test_path,
                             args=args,
                             use_compound_names=args.use_compound_names,
                             skip_invalid_smiles=False)

    print('Validating SMILES')
    valid_indices = [
        i for i in range(len(test_data)) if test_data[i].mol is not None
    ]
    full_data = test_data
    test_data = MoleculeDataset([test_data[i] for i in valid_indices])

    # Edge case if empty list of smiles is provided
    if len(test_data) == 0:
        return [None] * len(full_data)

    if args.use_compound_names:
        compound_names = test_data.compound_names()
    print(f'Test size = {len(test_data):,}')

    # Normalize features
    if train_args.features_scaling:
        test_data.normalize_features(features_scaler)

    # Predict with each model individually and sum predictions
    if args.dataset_type == 'multiclass':
        sum_preds = np.zeros(
            (len(test_data), args.num_tasks, args.multiclass_num_classes))
    else:
        sum_preds = np.zeros((len(test_data), args.num_tasks))
    print(
        f'Predicting with an ensemble of {len(args.checkpoint_paths)} models')
    #for checkpoint_path in tqdm(args.checkpoint_paths, total=len(args.checkpoint_paths)):
    # Load model
    model = load_checkpoint(args.checkpoint_path, cuda=args.cuda)
    test_preds, test_smiles_batch = predict(model=model,
                                            data=test_data,
                                            batch_size=args.batch_size,
                                            scaler=scaler)

    return test_preds, test_smiles_batch
Code example #15
def molecule_fingerprint(
        args: FingerprintArgs,
        smiles: List[List[str]] = None) -> List[List[Optional[float]]]:
    """
    Loads data and a trained model and uses the model to encode fingerprint vectors for the data.

    :param args: A :class:`~chemprop.args.PredictArgs` object containing arguments for
                 loading data and a model and making predictions.
    :param smiles: List of list of SMILES to make predictions on.
    :return: A list of fingerprint vectors (list of floats)
    """

    print('Loading training args')
    train_args = load_args(args.checkpoint_paths[0])

    # Update args with training arguments
    if args.fingerprint_type == 'MPN':  # only need to supply input features if using FFN latent representation and if model calls for them.
        validate_feature_sources = False
    else:
        validate_feature_sources = True
    update_prediction_args(predict_args=args,
                           train_args=train_args,
                           validate_feature_sources=validate_feature_sources)
    args: Union[FingerprintArgs, TrainArgs]

    #set explicit H option and reaction option
    reset_featurization_parameters()
    if args.atom_descriptors == 'feature':
        set_extra_atom_fdim(train_args.atom_features_size)

    if args.bond_features_path is not None:
        set_extra_bond_fdim(train_args.bond_features_size)

    set_explicit_h(train_args.explicit_h)
    set_adding_hs(args.adding_h)
    if train_args.reaction:
        set_reaction(train_args.reaction, train_args.reaction_mode)
    elif train_args.reaction_solvent:
        set_reaction(True, train_args.reaction_mode)

    print('Loading data')
    if smiles is not None:
        full_data = get_data_from_smiles(
            smiles=smiles,
            skip_invalid_smiles=False,
            features_generator=args.features_generator)
    else:
        full_data = get_data(path=args.test_path,
                             smiles_columns=args.smiles_columns,
                             target_columns=[],
                             ignore_columns=[],
                             skip_invalid_smiles=False,
                             args=args,
                             store_row=True)

    print('Validating SMILES')
    full_to_valid_indices = {}
    valid_index = 0
    for full_index in range(len(full_data)):
        if all(mol is not None for mol in full_data[full_index].mol):
            full_to_valid_indices[full_index] = valid_index
            valid_index += 1

    test_data = MoleculeDataset(
        [full_data[i] for i in sorted(full_to_valid_indices.keys())])

    # Edge case if empty list of smiles is provided
    if len(test_data) == 0:
        return [None] * len(full_data)

    print(f'Test size = {len(test_data):,}')

    # Create data loader
    test_data_loader = MoleculeDataLoader(dataset=test_data,
                                          batch_size=args.batch_size,
                                          num_workers=args.num_workers)

    # Set fingerprint size
    if args.fingerprint_type == 'MPN':
        if args.atom_descriptors == "descriptor":  # special case when we have 'descriptor' extra dimensions need to be added
            total_fp_size = (
                args.hidden_size +
                test_data.atom_descriptors_size()) * args.number_of_molecules
        else:
            if args.reaction_solvent:
                total_fp_size = args.hidden_size + args.hidden_size_solvent
            else:
                total_fp_size = args.hidden_size * args.number_of_molecules
        if args.features_only:
            raise ValueError(
                'With features_only models, there is no latent MPN representation. Use last_FFN fingerprint type instead.'
            )
    elif args.fingerprint_type == 'last_FFN':
        if args.ffn_num_layers != 1:
            total_fp_size = args.ffn_hidden_size
        else:
            raise ValueError(
                'With a ffn_num_layers of 1, there is no latent FFN representation. Use MPN fingerprint type instead.'
            )
    else:
        raise ValueError(
            f'Fingerprint type {args.fingerprint_type} not supported')
    all_fingerprints = np.zeros(
        (len(test_data), total_fp_size, len(args.checkpoint_paths)))

    # Load model
    print(
        f'Encoding smiles into a fingerprint vector from {len(args.checkpoint_paths)} models.'
    )

    for index, checkpoint_path in enumerate(
            tqdm(args.checkpoint_paths, total=len(args.checkpoint_paths))):
        model = load_checkpoint(checkpoint_path, device=args.device)
        scaler, features_scaler, atom_descriptor_scaler, bond_feature_scaler = load_scalers(
            args.checkpoint_paths[index])

        # Normalize features
        if args.features_scaling or train_args.atom_descriptor_scaling or train_args.bond_feature_scaling:
            test_data.reset_features_and_targets()
            if args.features_scaling:
                test_data.normalize_features(features_scaler)
            if train_args.atom_descriptor_scaling and args.atom_descriptors is not None:
                test_data.normalize_features(atom_descriptor_scaler,
                                             scale_atom_descriptors=True)
            if train_args.bond_feature_scaling and args.bond_features_size > 0:
                test_data.normalize_features(bond_feature_scaler,
                                             scale_bond_features=True)

        # Make fingerprints
        model_fp = model_fingerprint(model=model,
                                     data_loader=test_data_loader,
                                     fingerprint_type=args.fingerprint_type)
        if args.fingerprint_type == 'MPN' and (
                args.features_path is not None or args.features_generator
        ):  # truncate any features from MPN fingerprint
            model_fp = np.array(model_fp)[:, :total_fp_size]
        all_fingerprints[:, :, index] = model_fp

    # Save predictions
    print(f'Saving predictions to {args.preds_path}')
    # assert len(test_data) == len(all_fingerprints) #TODO: add unit test for this
    makedirs(args.preds_path, isfile=True)

    # Set column names
    fingerprint_columns = []
    if args.fingerprint_type == 'MPN':
        if len(args.checkpoint_paths) == 1:
            for j in range(total_fp_size // args.number_of_molecules):
                for k in range(args.number_of_molecules):
                    fingerprint_columns.append(f'fp_{j}_mol_{k}')
        else:
            for j in range(total_fp_size // args.number_of_molecules):
                for i in range(len(args.checkpoint_paths)):
                    for k in range(args.number_of_molecules):
                        fingerprint_columns.append(f'fp_{j}_mol_{k}_model_{i}')

    else:  # args.fingerprint_type == 'last_FFN'
        if len(args.checkpoint_paths) == 1:
            for j in range(total_fp_size):
                fingerprint_columns.append(f'fp_{j}')
        else:
            for j in range(total_fp_size):
                for i in range(len(args.checkpoint_paths)):
                    fingerprint_columns.append(f'fp_{j}_model_{i}')

    # Copy predictions over to full_data
    for full_index, datapoint in enumerate(full_data):
        valid_index = full_to_valid_indices.get(full_index, None)
        preds = all_fingerprints[valid_index].reshape(
            (len(args.checkpoint_paths) * total_fp_size
             )) if valid_index is not None else ['Invalid SMILES'] * len(
                 args.checkpoint_paths) * total_fp_size

        for i in range(len(fingerprint_columns)):
            datapoint.row[fingerprint_columns[i]] = preds[i]

    # Write predictions
    with open(args.preds_path, 'w') as f:
        writer = csv.DictWriter(f,
                                fieldnames=args.smiles_columns +
                                fingerprint_columns,
                                extrasaction='ignore')
        writer.writeheader()
        for datapoint in full_data:
            writer.writerow(datapoint.row)

    return all_fingerprints
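A sketch of how this fingerprint function might be driven from code rather than the command line; the paths are placeholders and all other FingerprintArgs options are left at their defaults.

# Hypothetical invocation; file paths are placeholders.
from chemprop.args import FingerprintArgs

args = FingerprintArgs().parse_args([
    '--test_path', 'test.csv',
    '--preds_path', 'fingerprints.csv',
    '--checkpoint_paths', 'model_0.pt',
    '--fingerprint_type', 'MPN',
])
fps = molecule_fingerprint(args=args)  # shape: (num_molecules, fp_size, num_checkpoints)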
Code example #16
File: make_predictions.py  Project: Miserlou/chemprop
def make_predictions(args: PredictArgs,
                     smiles: List[str] = None) -> List[Optional[List[float]]]:
    """
    Makes predictions. If smiles is provided, makes predictions on smiles. Otherwise makes predictions on args.test_data.

    :param args: Arguments.
    :param smiles: Smiles to make predictions on.
    :return: A list of lists of target predictions.
    """
    print('Loading training args')
    scaler, features_scaler = load_scalers(args.checkpoint_paths[0])
    train_args = load_args(args.checkpoint_paths[0])
    num_tasks, task_names = train_args.num_tasks, train_args.task_names

    # If features were used during training, they must be used when predicting
    if ((train_args.features_path is not None
         or train_args.features_generator is not None)
            and args.features_path is None
            and args.features_generator is None):
        raise ValueError(
            'Features were used during training so they must be specified again during prediction '
            'using the same type of features as before (with either --features_generator or '
            '--features_path and using --no_features_scaling if applicable).')

    # Update predict args with training arguments to create a merged args object
    for key, value in vars(train_args).items():
        if not hasattr(args, key):
            setattr(args, key, value)
    args: Union[PredictArgs, TrainArgs]

    if args.parcel_size and args.max_data_size:
        num_iterations = math.ceil(args.max_data_size / args.parcel_size)
        max_data_size = args.parcel_size
        print('Using parcels: ' + str(num_iterations))
    else:
        num_iterations = 1
        max_data_size = args.max_data_size
        print('Not using parcels.')

    if args.parcel_offset:
        offset = args.parcel_offset * args.parcel_size
    else:
        offset = 0

    for iteration in range(num_iterations):
        print('Loading data')
        if smiles is not None:
            full_data = get_data_from_smiles(
                smiles=smiles,
                skip_invalid_smiles=False,
                features_generator=args.features_generator)
        else:
            print("Getting without SMILES")
            full_data = get_data(path=args.test_path,
                                 args=args,
                                 target_columns=[],
                                 max_data_size=max_data_size,
                                 data_offset=offset,
                                 skip_invalid_smiles=False)

        print('Validating SMILES')
        full_to_valid_indices = {}
        valid_index = 0
        for full_index in range(len(full_data)):
            if full_data[full_index].mol is not None:
                full_to_valid_indices[full_index] = valid_index
                valid_index += 1

        test_data = MoleculeDataset(
            [full_data[i] for i in sorted(full_to_valid_indices.keys())])

        # Edge case if empty list of smiles is provided
        if len(test_data) == 0:
            return [None] * len(full_data)

        print(f'Test size = {len(test_data):,}')

        # Normalize features
        if args.features_scaling:
            test_data.normalize_features(features_scaler)

        # Predict with each model individually and sum predictions
        if args.dataset_type == 'multiclass':
            sum_preds = np.zeros(
                (len(test_data), num_tasks, args.multiclass_num_classes))
        else:
            sum_preds = np.zeros((len(test_data), num_tasks))

        # Create data loader
        test_data_loader = MoleculeDataLoader(dataset=test_data,
                                              batch_size=args.batch_size,
                                              num_workers=args.num_workers)

        print(
            f'Predicting with an ensemble of {len(args.checkpoint_paths)} models'
        )
        for checkpoint_path in tqdm(args.checkpoint_paths,
                                    total=len(args.checkpoint_paths)):
            # Load model
            model = load_checkpoint(checkpoint_path, device=args.device)
            model_preds = predict(model=model,
                                  data_loader=test_data_loader,
                                  scaler=scaler)
            sum_preds += np.array(model_preds)

        # Ensemble predictions
        avg_preds = sum_preds / len(args.checkpoint_paths)
        avg_preds = avg_preds.tolist()

        # Save predictions
        if iteration != 0:
            name, ext = os.path.splitext(args.preds_path)
            preds_path = "{name}.{it}.csv".format(name=name, it=iteration)
        else:
            preds_path = args.preds_path

        print(f'Saving predictions to {preds_path}')
        assert len(test_data) == len(avg_preds)
        makedirs(preds_path, isfile=True)

        # Get prediction column names
        if args.dataset_type == 'multiclass':
            task_names = [
                f'{name}_class_{i}' for name in task_names
                for i in range(args.multiclass_num_classes)
            ]
        else:
            task_names = task_names

        # Copy predictions over to full_data
        for full_index, datapoint in enumerate(full_data):
            valid_index = full_to_valid_indices.get(full_index, None)
            preds = avg_preds[valid_index] if valid_index is not None else [
                'Invalid SMILES'
            ] * len(task_names)

            for pred_name, pred in zip(task_names, preds):
                datapoint.row[pred_name] = pred

        with open(preds_path, 'w') as f:
            writer = csv.DictWriter(f, fieldnames=full_data[0].row.keys())
            writer.writeheader()

            for datapoint in full_data:
                writer.writerow(datapoint.row)

        offset = offset + args.parcel_size

    return avg_preds
Code example #17
def molecule_fingerprint(
        args: PredictArgs,
        smiles: List[List[str]] = None) -> List[List[Optional[float]]]:
    """
    Loads data and a trained model and uses the model to encode fingerprint vectors for the data.

    :param args: A :class:`~chemprop.args.PredictArgs` object containing arguments for
                 loading data and a model and making predictions.
    :param smiles: List of list of SMILES to make predictions on.
    :return: A list of fingerprint vectors (list of floats)
    """

    print('Loading training args')
    train_args = load_args(args.checkpoint_paths[0])

    # Update args with training arguments
    update_prediction_args(predict_args=args,
                           train_args=train_args,
                           validate_feature_sources=False)
    args: Union[PredictArgs, TrainArgs]

    #set explicit H option and reaction option
    set_explicit_h(train_args.explicit_h)
    set_reaction(train_args.reaction, train_args.reaction_mode)

    print('Loading data')
    if smiles is not None:
        full_data = get_data_from_smiles(
            smiles=smiles,
            skip_invalid_smiles=False,
            features_generator=args.features_generator)
    else:
        full_data = get_data(path=args.test_path,
                             smiles_columns=args.smiles_columns,
                             target_columns=[],
                             ignore_columns=[],
                             skip_invalid_smiles=False,
                             args=args,
                             store_row=True)

    print('Validating SMILES')
    full_to_valid_indices = {}
    valid_index = 0
    for full_index in range(len(full_data)):
        if all(mol is not None for mol in full_data[full_index].mol):
            full_to_valid_indices[full_index] = valid_index
            valid_index += 1

    test_data = MoleculeDataset(
        [full_data[i] for i in sorted(full_to_valid_indices.keys())])

    # Edge case if empty list of smiles is provided
    if len(test_data) == 0:
        return [None] * len(full_data)

    print(f'Test size = {len(test_data):,}')

    # Create data loader
    test_data_loader = MoleculeDataLoader(dataset=test_data,
                                          batch_size=args.batch_size,
                                          num_workers=args.num_workers)

    # Load model
    print(f'Encoding smiles into a fingerprint vector from a single model')
    if len(args.checkpoint_paths) != 1:
        raise ValueError(
            "Fingerprint generation only supports one model, cannot use an ensemble"
        )

    model = load_checkpoint(args.checkpoint_paths[0], device=args.device)
    scaler, features_scaler, atom_descriptor_scaler, bond_feature_scaler = load_scalers(
        args.checkpoint_paths[0])

    # Normalize features
    if args.features_scaling or train_args.atom_descriptor_scaling or train_args.bond_feature_scaling:
        test_data.reset_features_and_targets()
        if args.features_scaling:
            test_data.normalize_features(features_scaler)
        if train_args.atom_descriptor_scaling and args.atom_descriptors is not None:
            test_data.normalize_features(atom_descriptor_scaler,
                                         scale_atom_descriptors=True)
        if train_args.bond_feature_scaling and args.bond_features_size > 0:
            test_data.normalize_features(bond_feature_scaler,
                                         scale_bond_features=True)

    # Make fingerprints
    model_preds = model_fingerprint(model=model, data_loader=test_data_loader)

    # Save predictions
    print(f'Saving predictions to {args.preds_path}')
    assert len(test_data) == len(model_preds)
    makedirs(args.preds_path, isfile=True)

    # Copy predictions over to full_data
    total_hidden_size = args.hidden_size * args.number_of_molecules
    for full_index, datapoint in enumerate(full_data):
        valid_index = full_to_valid_indices.get(full_index, None)
        preds = model_preds[valid_index] if valid_index is not None else [
            'Invalid SMILES'
        ] * total_hidden_size

        fingerprint_columns = [f'fp_{i}' for i in range(total_hidden_size)]
        for i in range(len(fingerprint_columns)):
            datapoint.row[fingerprint_columns[i]] = preds[i]

    # Write predictions
    with open(args.preds_path, 'w') as f:
        writer = csv.DictWriter(f,
                                fieldnames=args.smiles_columns +
                                fingerprint_columns,
                                extrasaction='ignore')
        writer.writeheader()
        for datapoint in full_data:
            writer.writerow(datapoint.row)

    return model_preds
Code example #18
def upload_checkpoint(return_page: str):
    """
    Uploads a checkpoint .pt file.

    :param return_page: The name of the page to render after uploading the checkpoint file.
    """
    warnings, errors = [], []

    current_user = request.cookies.get('currentUser')

    if not current_user:
        # Use DEFAULT as current user if the client's cookie is not set.
        current_user = app.config['DEFAULT_USER_ID']

    ckpt = request.files['checkpoint']

    ckpt_name = request.form['checkpointName']
    ckpt_ext = os.path.splitext(ckpt.filename)[1]

    # Collect paths to all uploaded checkpoints (and unzip if necessary)
    temp_dir = TemporaryDirectory()
    ckpt_paths = []

    if ckpt_ext.endswith('.pt'):
        ckpt_path = os.path.join(temp_dir.name, 'model.pt')
        ckpt.save(ckpt_path)
        ckpt_paths = [ckpt_path]

    elif ckpt_ext.endswith('.zip'):
        ckpt_dir = os.path.join(temp_dir.name, 'models')
        zip_path = os.path.join(temp_dir.name, 'models.zip')
        ckpt.save(zip_path)

        with zipfile.ZipFile(zip_path, mode='r') as z:
            z.extractall(ckpt_dir)

        for root, _, fnames in os.walk(ckpt_dir):
            ckpt_paths += [
                os.path.join(root, fname) for fname in fnames
                if fname.endswith('.pt')
            ]

    else:
        errors.append(
            f'Uploaded checkpoint(s) file must be either .pt or .zip but got {ckpt_ext}'
        )

    # Insert checkpoints into database
    if len(ckpt_paths) > 0:
        ckpt_args = load_args(ckpt_paths[0])
        ckpt_id, new_ckpt_name = db.insert_ckpt(ckpt_name, current_user,
                                                ckpt_args.dataset_type,
                                                ckpt_args.epochs,
                                                len(ckpt_paths),
                                                ckpt_args.train_data_size)

        # Warn once (not once per model) if the checkpoint name had to be changed
        if ckpt_name != new_ckpt_name:
            warnings.append(
                name_already_exists_message('Checkpoint', ckpt_name,
                                            new_ckpt_name))

        for ckpt_path in ckpt_paths:
            model_id = db.insert_model(ckpt_id)
            model_path = os.path.join(app.config['CHECKPOINT_FOLDER'],
                                      f'{model_id}.pt')
            shutil.copy(ckpt_path, model_path)

    temp_dir.cleanup()

    warnings, errors = json.dumps(warnings), json.dumps(errors)

    return redirect(
        url_for(return_page,
                checkpoint_upload_warnings=warnings,
                checkpoint_upload_errors=errors))
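The .pt/.zip handling above can be read as a small standalone helper; the following sketch mirrors that logic outside the Flask handler (the function name and its arguments are hypothetical, not part of the original route):

import os
import zipfile
from typing import List


def collect_checkpoint_paths(upload_path: str, extract_dir: str) -> List[str]:
    # Return paths to all .pt files contained in an uploaded .pt or .zip file,
    # following the same walk-and-filter logic as upload_checkpoint above.
    ext = os.path.splitext(upload_path)[1]
    if ext == '.pt':
        return [upload_path]
    if ext == '.zip':
        with zipfile.ZipFile(upload_path, mode='r') as z:
            z.extractall(extract_dir)
        return [
            os.path.join(root, fname)
            for root, _, fnames in os.walk(extract_dir)
            for fname in fnames if fname.endswith('.pt')
        ]
    raise ValueError(f'Expected a .pt or .zip file but got {ext}')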
Code example #19
def make_predictions(
        args: PredictArgs,
        smiles: List[List[str]] = None) -> List[List[Optional[float]]]:
    """
    Loads data and a trained model and uses the model to make predictions on the data.

    If SMILES are provided, then makes predictions on smiles.
    Otherwise makes predictions on :code:`args.test_data`.

    :param args: A :class:`~chemprop.args.PredictArgs` object containing arguments for
                 loading data and a model and making predictions.
    :param smiles: List of list of SMILES to make predictions on.
    :return: A list of lists of target predictions.
    """
    print('Loading training args')
    train_args = load_args(args.checkpoint_paths[0])
    num_tasks, task_names = train_args.num_tasks, train_args.task_names

    # If features were used during training, they must be used when predicting
    if ((train_args.features_path is not None
         or train_args.features_generator is not None)
            and args.features_path is None
            and args.features_generator is None):
        raise ValueError(
            'Features were used during training so they must be specified again during prediction '
            'using the same type of features as before (with either --features_generator or '
            '--features_path and using --no_features_scaling if applicable).')

    # Update predict args with training arguments to create a merged args object
    for key, value in vars(train_args).items():
        if not hasattr(args, key):
            setattr(args, key, value)
    args: Union[PredictArgs, TrainArgs]

    print('Loading data')
    if smiles is not None:
        full_data = get_data_from_smiles(
            smiles=smiles,
            skip_invalid_smiles=False,
            features_generator=args.features_generator)
    else:
        full_data = get_data(path=args.test_path,
                             target_columns=[],
                             ignore_columns=[],
                             skip_invalid_smiles=False,
                             args=args,
                             store_row=True)

    print('Validating SMILES')
    full_to_valid_indices = {}
    valid_index = 0
    for full_index in range(len(full_data)):
        if all(mol is not None for mol in full_data[full_index].mol):
            full_to_valid_indices[full_index] = valid_index
            valid_index += 1

    test_data = MoleculeDataset(
        [full_data[i] for i in sorted(full_to_valid_indices.keys())])

    # Edge case if empty list of smiles is provided
    if len(test_data) == 0:
        return [None] * len(full_data)

    print(f'Test size = {len(test_data):,}')

    # Predict with each model individually and sum predictions
    if args.dataset_type == 'multiclass':
        sum_preds = np.zeros(
            (len(test_data), num_tasks, args.multiclass_num_classes))
    else:
        sum_preds = np.zeros((len(test_data), num_tasks))

    # Create data loader
    test_data_loader = MoleculeDataLoader(dataset=test_data,
                                          batch_size=args.batch_size,
                                          num_workers=args.num_workers)

    print(
        f'Predicting with an ensemble of {len(args.checkpoint_paths)} models')
    for checkpoint_path in tqdm(args.checkpoint_paths,
                                total=len(args.checkpoint_paths)):
        # Load model and scalers
        model = load_checkpoint(checkpoint_path, device=args.device)
        scaler, features_scaler = load_scalers(checkpoint_path)

        # Normalize features
        if args.features_scaling:
            test_data.reset_features_and_targets()
            test_data.normalize_features(features_scaler)

        # Make predictions
        model_preds = predict(model=model,
                              data_loader=test_data_loader,
                              scaler=scaler)
        sum_preds += np.array(model_preds)

    # Ensemble predictions
    avg_preds = sum_preds / len(args.checkpoint_paths)
    avg_preds = avg_preds.tolist()

    # Save predictions
    print(f'Saving predictions to {args.preds_path}')
    assert len(test_data) == len(avg_preds)
    makedirs(args.preds_path, isfile=True)

    # Get prediction column names
    if args.dataset_type == 'multiclass':
        task_names = [
            f'{name}_class_{i}' for name in task_names
            for i in range(args.multiclass_num_classes)
        ]

    # Copy predictions over to full_data
    for full_index, datapoint in enumerate(full_data):
        valid_index = full_to_valid_indices.get(full_index, None)
        preds = avg_preds[valid_index] if valid_index is not None else [
            'Invalid SMILES'
        ] * len(task_names)

        for pred_name, pred in zip(task_names, preds):
            datapoint.row[pred_name] = pred

    # Save
    with open(args.preds_path, 'w') as f:
        writer = csv.DictWriter(f, fieldnames=full_data[0].row.keys())
        writer.writeheader()

        for datapoint in full_data:
            writer.writerow(datapoint.row)

    return avg_preds
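For context, the function above is normally driven from the command line through a PredictArgs object; a minimal usage sketch, assuming a chemprop version matching this code and hypothetical file paths:

from chemprop.args import PredictArgs
from chemprop.train import make_predictions

# Hypothetical paths; --checkpoint_dir (or --checkpoint_path) fills args.checkpoint_paths.
args = PredictArgs().parse_args([
    '--test_path', 'data/test.csv',
    '--checkpoint_dir', 'ckpts',
    '--preds_path', 'preds.csv',
])
preds = make_predictions(args)
# Passing smiles=[['CCO'], ['c1ccccc1']] instead predicts on those SMILES rather than test_path.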
Code example #20
File: make_predictions.py  Project: opnbnch/chemprop
def make_predictions(args: PredictArgs,
                     smiles: List[str] = None) -> List[Optional[List[float]]]:
    """
    Makes predictions. If smiles is provided, makes predictions on smiles. Otherwise makes predictions on args.test_data.

    :param args: Arguments.
    :param smiles: Smiles to make predictions on.
    :return: A list of lists of target predictions.
    """
    print('Loading training args')
    scaler, features_scaler = load_scalers(args.checkpoint_paths[0])
    train_args = load_args(args.checkpoint_paths[0])
    num_tasks, task_names = train_args.num_tasks, train_args.task_names

    # If features were used during training, they must be used when predicting
    if ((train_args.features_path is not None
         or train_args.features_generator is not None)
            and args.features_path is None
            and args.features_generator is None):
        raise ValueError(
            'Features were used during training so they must be specified again during prediction '
            'using the same type of features as before (with either --features_generator or '
            '--features_path and using --no_features_scaling if applicable).')

    # Update predict args with training arguments to create a merged args object
    for key, value in vars(train_args).items():
        if not hasattr(args, key):
            setattr(args, key, value)
    args: Union[PredictArgs, TrainArgs]

    print('Loading data')
    if smiles is not None:
        full_data = get_data_from_smiles(
            smiles=smiles,
            skip_invalid_smiles=False,
            features_generator=args.features_generator)
    else:
        full_data = get_data(path=args.test_path,
                             args=args,
                             target_columns=[],
                             skip_invalid_smiles=False)

    print('Validating SMILES')
    full_to_valid_indices = {}
    valid_index = 0
    for full_index in range(len(full_data)):
        if full_data[full_index].mol is not None:
            full_to_valid_indices[full_index] = valid_index
            valid_index += 1

    test_data = MoleculeDataset(
        [full_data[i] for i in sorted(full_to_valid_indices.keys())])

    # Edge case if empty list of smiles is provided
    if len(test_data) == 0:
        return [None] * len(full_data)

    print(f'Test size = {len(test_data):,}')

    # Normalize features
    if args.features_scaling:
        test_data.normalize_features(features_scaler)

    # Initialize uncertainty estimator
    if args.uncertainty:
        uncertainty_estimator = uncertainty_estimator_builder(
            args.uncertainty)(args, test_data, scaler)

    # Predict with each model individually and sum predictions
    if not args.uncertainty:
        if args.dataset_type == 'multiclass':
            sum_preds = np.zeros(
                (len(test_data), num_tasks, args.multiclass_num_classes))
        else:
            sum_preds = np.zeros((len(test_data), num_tasks))

    # Create data loader
    test_data_loader = MoleculeDataLoader(dataset=test_data,
                                          batch_size=args.batch_size,
                                          num_workers=args.num_workers)

    print(
        f'Predicting with an ensemble of {len(args.checkpoint_paths)} models')
    for N, checkpoint_path in tqdm(enumerate(args.checkpoint_paths),
                                   total=len(args.checkpoint_paths)):
        # Load model and put it in evaluation mode
        model = load_checkpoint(checkpoint_path, device=args.device)
        model.eval()  # setting model.training = False directly would not propagate to submodules
        if not args.uncertainty:
            model_preds = predict(model=model,
                                  data_loader=test_data_loader,
                                  scaler=scaler)
            sum_preds += np.array(model_preds)
        else:
            uncertainty_estimator.process_model(model, N)

    # Ensemble predictions
    if not args.uncertainty:
        avg_preds = sum_preds / len(args.checkpoint_paths)
        avg_preds = avg_preds.tolist()
    else:
        avg_preds, avg_UQ = uncertainty_estimator.calculate_UQ()
        if type(avg_UQ) is tuple:
            aleatoric, epistemic = avg_UQ

    # Save predictions
    print(f'Saving predictions to {args.preds_path}')
    assert len(test_data) == len(avg_preds)
    makedirs(args.preds_path, isfile=True)

    # Get prediction column names
    if args.dataset_type == 'multiclass':
        task_names = [
            f'{name}_class_{i}' for name in task_names
            for i in range(args.multiclass_num_classes)
        ]

    # Copy predictions over to full_data
    for full_index, datapoint in enumerate(full_data):
        valid_index = full_to_valid_indices.get(full_index, None)
        preds = avg_preds[valid_index] if valid_index is not None else [
            'Invalid SMILES'
        ] * len(task_names)

        if args.uncertainty:
            if not args.split_UQ:
                cur_UQ = avg_UQ[valid_index] if valid_index is not None else [
                    'Invalid SMILES'
                ] * len(task_names)
                datapoint.row['Uncertainty'] = cur_UQ
            elif args.split_UQ:
                cur_al = aleatoric[
                    valid_index] if valid_index is not None else [
                        'Invalid SMILES'
                    ] * len(task_names)
                cur_ep = epistemic[
                    valid_index] if valid_index is not None else [
                        'Invalid SMILES'
                    ] * len(task_names)
                datapoint.row['Aleatoric'] = cur_al
                datapoint.row['Epistemic'] = cur_ep

        if type(preds) is list:
            for pred_name, pred in zip(task_names, preds):
                datapoint.row[pred_name] = pred
        else:
            datapoint.row[task_names[0]] = preds

    # Save
    with open(args.preds_path, 'w') as f:
        writer = csv.DictWriter(f, fieldnames=full_data[0].row.keys())
        writer.writeheader()

        for datapoint in full_data:
            writer.writerow(datapoint.row)

    return avg_preds
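The fork above hands uncertainty off to an external uncertainty_estimator; as a rough, self-contained illustration of the underlying idea (not the fork's implementation), ensemble predictions can be summarized by their across-model mean and variance, with the variance serving as an epistemic-uncertainty proxy:

import numpy as np

# Dummy stand-in for predictions from 5 models on 3 molecules and 2 tasks.
per_model_preds = np.random.rand(5, 3, 2)
avg_preds = per_model_preds.mean(axis=0)   # ensemble prediction, shape (3, 2)
epistemic = per_model_preds.var(axis=0)    # spread across models, shape (3, 2)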
Code example #21
def make_predictions(args: Namespace, smiles: List[str] = None) -> List[Optional[List[float]]]:
    """
    Makes predictions. If smiles is provided, makes predictions on smiles. Otherwise makes predictions on args.test_data.

    :param args: Arguments.
    :param smiles: Smiles to make predictions on.
    :return: A list of lists of target predictions.
    """
    if args.gpu is not None:
        torch.cuda.set_device(args.gpu)

    print('Loading training args')
    scaler, drug_scaler, cmpd_scaler = load_scalers(args.checkpoint_paths[0])
    train_args = load_args(args.checkpoint_paths[0])

    # Update args with training arguments
    for key, value in vars(train_args).items():
        if not hasattr(args, key):
            setattr(args, key, value)

    print('Loading data')
    if smiles is not None:
        test_data = get_data_from_smiles(smiles=smiles, skip_invalid_smiles=False)
    else:
        test_data = get_data(path=args.test_path, args=args, use_compound_names=args.use_compound_names, skip_invalid_smiles=False)

    print('Validating SMILES')
    valid_indices = [i for i in range(len(test_data)) if test_data[i].drug_mol is not None]
    full_data = test_data
    test_data = MolPairDataset([test_data[i] for i in valid_indices])

    # Edge case if empty list of smiles is provided
    if len(test_data) == 0:
        return [None] * len(full_data)

    if args.use_compound_names:
        compound_names = test_data.compound_names()
    print(f'Test size = {len(test_data):,}')

    # Normalize features
    if train_args.features_scaling:
        test_data.normalize_features(drug_scaler, cmpd_scaler)

    # Predict with each model individually and sum predictions
    if args.dataset_type == 'multiclass':
        sum_preds = np.zeros((len(test_data), args.num_tasks, args.multiclass_num_classes))
    else:
        sum_preds = np.zeros((len(test_data), args.num_tasks))
    print(f'Predicting with an ensemble of {len(args.checkpoint_paths)} models')
    for checkpoint_path in tqdm(args.checkpoint_paths, total=len(args.checkpoint_paths)):
        # Load model
        model = load_checkpoint(checkpoint_path, cuda=args.cuda)
        model_preds = predict(
            model=model,
            data=test_data,
            batch_size=args.batch_size,
            scaler=scaler  # TODO: Shouldn't this be the custom scalers if avail?
        )
        sum_preds += np.array(model_preds)

    # Ensemble predictions
    avg_preds = sum_preds / len(args.checkpoint_paths)
    avg_preds = avg_preds.tolist()

    # Save predictions
    assert len(test_data) == len(avg_preds)
    print(f'Saving predictions to {args.preds_path}')

    # Put Nones for invalid smiles
    full_preds = [None] * len(full_data)
    for i, si in enumerate(valid_indices):
        full_preds[si] = avg_preds[i]
    avg_preds = full_preds
    test_smiles = full_data.smiles()

    # Write predictions
    with open(args.preds_path, 'w') as f:
        writer = csv.writer(f)

        header = ['drugSMILE', 'cmpdSMILE']

        if args.dataset_type == 'multiclass':
            for name in args.task_names:
                for i in range(args.multiclass_num_classes):
                    header.append(name + '_class' + str(i))
        else:
            header.extend(args.task_names)
        writer.writerow(header)

        for i in range(len(avg_preds)):
            row = [test_smiles[i][0], test_smiles[i][1]]

            if avg_preds[i] is not None:
                if args.dataset_type == 'multiclass':
                    for task_probs in avg_preds[i]:
                        row.extend(task_probs)
                else:
                    row.extend(avg_preds[i])
            else:
                if args.dataset_type == 'multiclass':
                    row.extend([''] * args.num_tasks * args.multiclass_num_classes)
                else:
                    row.extend([''] * args.num_tasks)

            writer.writerow(row)

    return avg_preds
Code example #22
def make_predictions(args: Namespace,
                     smiles: List[str] = None) -> List[Optional[List[float]]]:
    """
    Makes predictions. If smiles is provided, makes predictions on smiles. Otherwise makes predictions on args.test_data.

    :param args: Arguments.
    :param smiles: Smiles to make predictions on.
    :return: A list of lists of target predictions.
    """
    if args.gpu is not None:
        torch.cuda.set_device(args.gpu)

    print('Loading training args')
    scaler, features_scaler = load_scalers(args.checkpoint_paths[0])
    train_args = load_args(args.checkpoint_paths[0])
    data = smiles
    # Update args with training arguments
    for key, value in vars(train_args).items():
        if not hasattr(args, key):
            setattr(args, key, value)

    print('Loading data')
    # if smiles is not None:
    #     test_data = get_data_from_smiles_fast(smiles=smiles, skip_invalid_smiles=False)
    # else:
    #     test_data = get_data(path=args.test_path, args=args, use_compound_names=args.use_compound_names, skip_invalid_smiles=False)
    with open(args.test_path, 'r') as f:
        smiles = list(map(lambda x: x.split(',')[0].strip(),
                          f.readlines()[1:]))
    assert (smiles is not None)

    print('Validating SMILES')
    #
    # valid_indices = [i for i in range(len(test_data)) if test_data[i].mol is not None]
    # full_data = test_data
    # test_data = MoleculeDataset([test_data[i] for i in valid_indices])
    #
    # # Edge case if empty list of smiles is provided
    # if len(test_data) == 0:
    #     return [None] * len(full_data)
    #
    # if args.use_compound_names:
    #     compound_names = test_data.compound_names()
    # print(f'Test size = {len(test_data):,}')
    #
    # # Normalize features
    # if train_args.features_scaling:
    #     test_data.normalize_features(features_scaler)

    # Predict with each model individually and sum predictions
    # if args.dataset_type == 'multiclass':
    #     sum_preds = np.zeros((len(smiles), args.num_tasks, args.multiclass_num_classes))
    # else:
    # sum_preds = np.zeros((len(smiles), args.num_tasks))
    print(
        f'Predicting with an ensemble of {len(args.checkpoint_paths)} models')
    for checkpoint_path in tqdm(args.checkpoint_paths,
                                total=len(args.checkpoint_paths)):
        # Load model
        model = load_checkpoint(checkpoint_path, cuda=args.cuda)
        avg_preds = predict(model=model,
                            data=smiles,
                            batch_size=args.batch_size,
                            scaler=scaler,
                            args=args)
        # avg_preds += np.array(model_preds)

    # Ensemble predictions
    # avg_preds = sum_preds / len(args.checkpoint_paths)
    # avg_preds = avg_preds.tolist()

    # Save predictions
    print(len(smiles), len(avg_preds))
    assert len(smiles) == len(avg_preds)
    print(f'Saving predictions to {args.preds_path}')

    # Put Nones for invalid smiles
    full_preds = avg_preds
    # for i, si in enumerate(valid_indices):
    #     full_preds[si] = avg_preds[i]
    avg_preds = full_preds
    test_smiles = smiles

    # Write predictions
    with open(args.preds_path, 'w') as f:
        writer = csv.writer(f)

        header = []

        if args.use_compound_names:
            header.append('compound_names')

        header.append('smiles')

        if args.dataset_type == 'multiclass':
            for name in args.task_names:
                for i in range(args.multiclass_num_classes):
                    header.append(name + '_class' + str(i))
        else:
            header.extend(args.task_names)
        writer.writerow(header)

        for i in range(len(avg_preds)):
            row = []

            # if args.use_compound_names:
            # row.append(compound_names[i])

            row.append(test_smiles[i])

            if avg_preds[i] is not None:
                if args.dataset_type == 'multiclass':
                    for task_probs in avg_preds[i]:
                        row.extend(task_probs)
                else:
                    row.extend(avg_preds[i])
            else:
                if args.dataset_type == 'multiclass':
                    row.extend([''] * args.num_tasks *
                               args.multiclass_num_classes)
                else:
                    row.extend([''] * args.num_tasks)

            writer.writerow(row)

    return avg_preds
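One caveat on the last example: splitting each raw line on the first comma breaks if the SMILES column is quoted or the file has a different layout. A sketch of the same read using the csv module, assuming SMILES sit in the first column of a headered CSV (the path is hypothetical, standing in for args.test_path):

import csv

with open('test.csv', newline='') as f:
    reader = csv.reader(f)
    next(reader)  # skip the header row
    smiles = [row[0].strip() for row in reader if row]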