Example #1
# Imports needed by this snippet. `Args` is assumed to be a Tap argument
# class defined alongside this function in the surrounding Chemprop script;
# the chemprop.data import path is assumed to match that script.
from collections import defaultdict

import numpy as np

from chemprop.data import get_data, get_header


def average_duplicates(args: Args):
    """Averages duplicate data points in a dataset."""
    print('Loading data')
    header = get_header(args.data_path)
    data = get_data(path=args.data_path,
                    smiles_columns=args.smiles_columns,
                    target_columns=args.target_columns)
    print(f'Data size = {len(data):,}')

    # Map SMILES string to lists of targets
    smiles_in_order = []
    smiles_to_targets = defaultdict(list)
    for smiles, targets in zip(data.smiles(flatten=True), data.targets()):
        smiles_to_targets[smiles].append(targets)
        if len(smiles_to_targets[smiles]) == 1:
            smiles_in_order.append(smiles)

    # Find duplicates
    duplicate_count = 0
    stds = []
    new_data = []
    for smiles in smiles_in_order:
        all_targets = smiles_to_targets[smiles]
        duplicate_count += len(all_targets) - 1
        num_tasks = len(all_targets[0])

        targets_by_task = [[] for _ in range(num_tasks)]
        for task in range(num_tasks):
            for targets in all_targets:
                if targets[task] is not None:
                    targets_by_task[task].append(targets[task])

        stds.append([
            np.std(task_targets) if len(task_targets) > 0 else 0.0
            for task_targets in targets_by_task
        ])
        means = [
            np.mean(task_targets) if len(task_targets) > 0 else None
            for task_targets in targets_by_task
        ]
        new_data.append((smiles, means))

    print(f'Number of duplicates = {duplicate_count:,}')
    print(f'Duplicate standard deviation per task = '
          f'{", ".join(f"{std:.4e}" for std in np.mean(stds, axis=0))}')
    print(f'New data size = {len(new_data):,}')

    # Save new data
    with open(args.save_path, 'w') as f:
        f.write(','.join(header) + '\n')

        for smiles, avg_targets in new_data:
            f.write(smiles + ',' + ','.join(
                str(value) if value is not None else ''
                for value in avg_targets) + '\n')
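A minimal usage sketch, assuming this function lives in a standalone Tap-based script (the flags mirror the `Args` attributes used above; the file names are placeholders, not repo defaults):

if __name__ == '__main__':
    # Hypothetical invocation; paths are assumptions.
    average_duplicates(Args().parse_args([
        '--data_path', 'raw.csv',
        '--save_path', 'deduped.csv',
    ]))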
Example #2
def interpret(args: InterpretArgs) -> None:
    """
    Runs interpretation of a Chemprop model using the Monte Carlo Tree Search algorithm.

    :param args: A :class:`~chemprop.args.InterpretArgs` object containing arguments for interpretation.
    """

    if args.number_of_molecules != 1:
        raise ValueError(
            "Interpreting is currently only available for single-molecule models."
        )

    global C_PUCT, MIN_ATOMS

    chemprop_model = ChempropModel(args)

    def scoring_function(smiles: List[str]) -> List[float]:
        return chemprop_model(smiles)[:, args.property_id - 1]

    C_PUCT = args.c_puct
    MIN_ATOMS = args.min_atoms

    all_smiles = get_smiles(path=args.data_path,
                            smiles_columns=args.smiles_columns)
    header = get_header(path=args.data_path)

    property_name = header[
        args.property_id] if len(header) > args.property_id else 'score'
    print(f'smiles,{property_name},rationale,rationale_score')

    for smiles in all_smiles:
        score = scoring_function([smiles])[0]
        if score > args.prop_delta:
            rationales = mcts(smiles=smiles[0],
                              scoring_function=scoring_function,
                              n_rollout=args.rollout,
                              max_atoms=args.max_atoms,
                              prop_delta=args.prop_delta)
        else:
            rationales = []

        if len(rationales) == 0:
            print(f'{smiles[0]},{score:.3f},,')
        else:
            min_size = min(len(x.atoms) for x in rationales)
            min_rationales = [
                x for x in rationales if len(x.atoms) == min_size
            ]
            rats = sorted(min_rationales, key=lambda x: x.P, reverse=True)
            print(f'{smiles[0]},{score:.3f},{rats[0].smiles},{rats[0].P:.3f}')
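A hedged sketch of driving interpret from a script, assuming InterpretArgs follows chemprop's Tap-based CLI (the paths and checkpoint flag below are placeholders):

if __name__ == '__main__':
    # Hypothetical invocation; file paths are assumptions.
    interpret(InterpretArgs().parse_args([
        '--data_path', 'test.csv',
        '--checkpoint_dir', 'ckpts',
        '--property_id', '1',
    ]))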
Example #3
def make_predictions(
        args: PredictArgs,
        smiles: List[List[str]] = None) -> List[List[Optional[float]]]:
    """
    Loads data and a trained model and uses the model to make predictions on the data.

    If SMILES are provided, then makes predictions on smiles.
    Otherwise makes predictions on :code:`args.test_path`.

    :param args: A :class:`~chemprop.args.PredictArgs` object containing arguments for
                 loading data and a model and making predictions.
    :param smiles: List of list of SMILES to make predictions on.
    :return: A list of lists of target predictions.
    """
    print('Loading training args')
    train_args = load_args(args.checkpoint_paths[0])
    num_tasks, task_names = train_args.num_tasks, train_args.task_names

    # If features were used during training, they must be used when predicting
    if ((train_args.features_path is not None
         or train_args.features_generator is not None)
            and args.features_path is None
            and args.features_generator is None):
        raise ValueError(
            'Features were used during training so they must be specified again during prediction '
            'using the same type of features as before (with either --features_generator or '
            '--features_path and using --no_features_scaling if applicable).')

    # If atom-descriptors were used during training, they must be used when predicting and vice-versa
    if train_args.atom_descriptors != args.atom_descriptors:
        raise ValueError(
            'The use of atom descriptors is inconsistent between training and prediction. If atom descriptors '
            'were used during training, they must be specified again during prediction using the same type of '
            'descriptors as before. If they were not used during training, they cannot be specified during prediction.'
        )

    # Update predict args with training arguments to create a merged args object
    for key, value in vars(train_args).items():
        if not hasattr(args, key):
            setattr(args, key, value)
    args: Union[PredictArgs, TrainArgs]

    if args.atom_descriptors == 'feature':
        set_extra_atom_fdim(train_args.atom_features_size)

    print('Loading data')
    if smiles is not None:
        full_data = get_data_from_smiles(
            smiles=smiles,
            skip_invalid_smiles=False,
            features_generator=args.features_generator)
    else:
        full_data = get_data(path=args.test_path,
                             target_columns=[],
                             ignore_columns=[],
                             skip_invalid_smiles=False,
                             args=args,
                             store_row=not args.drop_extra_columns)

    print('Validating SMILES')
    full_to_valid_indices = {}
    valid_index = 0
    for full_index in range(len(full_data)):
        if all(mol is not None for mol in full_data[full_index].mol):
            full_to_valid_indices[full_index] = valid_index
            valid_index += 1

    test_data = MoleculeDataset(
        [full_data[i] for i in sorted(full_to_valid_indices.keys())])

    # Edge case: an empty list of SMILES (or all SMILES invalid)
    if len(test_data) == 0:
        return [None] * len(full_data)

    print(f'Test size = {len(test_data):,}')

    # Predict with each model individually and sum predictions
    if args.dataset_type == 'multiclass':
        sum_preds = np.zeros(
            (len(test_data), num_tasks, args.multiclass_num_classes))
    else:
        sum_preds = np.zeros((len(test_data), num_tasks))

    # Create data loader
    test_data_loader = MoleculeDataLoader(dataset=test_data,
                                          batch_size=args.batch_size,
                                          num_workers=args.num_workers)

    print(
        f'Predicting with an ensemble of {len(args.checkpoint_paths)} models')
    for checkpoint_path in tqdm(args.checkpoint_paths,
                                total=len(args.checkpoint_paths)):
        # Load model and scalers
        model = load_checkpoint(checkpoint_path, device=args.device)
        scaler, features_scaler = load_scalers(checkpoint_path)

        # Normalize features
        if args.features_scaling:
            test_data.reset_features_and_targets()
            test_data.normalize_features(features_scaler)

        # Make predictions
        model_preds = predict(model=model,
                              data_loader=test_data_loader,
                              scaler=scaler)
        sum_preds += np.array(model_preds)

    # Ensemble predictions
    avg_preds = sum_preds / len(args.checkpoint_paths)
    avg_preds = avg_preds.tolist()

    # Save predictions
    print(f'Saving predictions to {args.preds_path}')
    assert len(test_data) == len(avg_preds)
    makedirs(args.preds_path, isfile=True)

    # Get prediction column names
    if args.dataset_type == 'multiclass':
        task_names = [
            f'{name}_class_{i}' for name in task_names
            for i in range(args.multiclass_num_classes)
        ]

    # Copy predictions over to full_data
    for full_index, datapoint in enumerate(full_data):
        valid_index = full_to_valid_indices.get(full_index, None)
        preds = (avg_preds[valid_index] if valid_index is not None
                 else ['Invalid SMILES'] * len(task_names))

        # If extra columns have been dropped, add back in SMILES columns
        if args.drop_extra_columns:
            datapoint.row = OrderedDict()

            smiles_columns = args.smiles_columns

            if None in smiles_columns:
                smiles_columns = get_header(
                    args.test_path)[:len(smiles_columns)]

            for column, smiles in zip(smiles_columns, datapoint.smiles):
                datapoint.row[column] = smiles

        # Add predictions columns
        for pred_name, pred in zip(task_names, preds):
            datapoint.row[pred_name] = pred

    # Save
    with open(args.preds_path, 'w') as f:
        writer = csv.DictWriter(f, fieldnames=full_data[0].row.keys())
        writer.writeheader()

        for datapoint in full_data:
            writer.writerow(datapoint.row)

    return avg_preds
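A short usage sketch for programmatic prediction on in-memory SMILES, mirroring how Example #4 builds its arguments; the checkpoint path is a placeholder:

# Hypothetical programmatic use; 'model.pt' stands in for a trained checkpoint.
args = PredictArgs().parse_args([
    '--test_path', 'None',  # unused when smiles is passed directly
    '--preds_path', 'preds.csv',
    '--checkpoint_paths', 'model.pt',
])
preds = make_predictions(args=args, smiles=[['CCO'], ['c1ccccc1']])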
Example #4
def predict():
    """Renders the predict page and makes predictions if the method is POST."""
    if request.method == 'GET':
        return render_predict()

    # Get arguments
    ckpt_id = request.form['checkpointName']

    if request.form['textSmiles'] != '':
        smiles = request.form['textSmiles'].split()
    elif request.form['drawSmiles'] != '':
        smiles = [request.form['drawSmiles']]
    else:
        # Upload data file with SMILES
        data = request.files['data']
        data_name = secure_filename(data.filename)
        data_path = os.path.join(app.config['TEMP_FOLDER'], data_name)
        data.save(data_path)

        # Check if header is smiles
        possible_smiles = get_header(data_path)[0]
        if Chem.MolFromSmiles(possible_smiles) is not None:
            smiles = [possible_smiles]
        else:
            smiles = []

        # Get remaining smiles
        smiles.extend(get_smiles(data_path))

    models = db.get_models(ckpt_id)
    model_paths = [
        os.path.join(app.config['CHECKPOINT_FOLDER'], f'{model["id"]}.pt')
        for model in models
    ]

    task_names = load_task_names(model_paths[0])
    num_tasks = len(task_names)
    gpu = request.form.get('gpu')
    train_args = load_args(model_paths[0])

    # Build arguments
    arguments = [
        '--test_path', 'None',
        '--preds_path', os.path.join(app.config['TEMP_FOLDER'],
                                     app.config['PREDICTIONS_FILENAME']),
        '--checkpoint_paths', *model_paths
    ]

    if gpu is not None:
        if gpu == 'None':
            arguments.append('--no_cuda')
        else:
            arguments += ['--gpu', gpu]

    # Handle additional features
    if train_args.features_path is not None:
        # TODO: make it possible to specify the features generator if trained using features_path
        arguments += [
            '--features_generator', 'rdkit_2d_normalized',
            '--no_features_scaling'
        ]
    elif train_args.features_generator is not None:
        arguments += ['--features_generator', *train_args.features_generator]

        if not train_args.features_scaling:
            arguments.append('--no_features_scaling')

    # Parse arguments
    args = PredictArgs().parse_args(arguments)

    # Run predictions
    preds = make_predictions(args=args, smiles=smiles)

    if all(p is None for p in preds):
        return render_predict(errors=['All SMILES are invalid'])

    # Record which predictions are missing, then replace invalid smiles with a message
    invalid_smiles_warning = 'Invalid SMILES String'
    contains_invalid = any(pred is None for pred in preds)
    preds = [
        pred if pred is not None else [invalid_smiles_warning] * num_tasks
        for pred in preds
    ]

    return render_predict(
        predicted=True,
        smiles=smiles,
        num_smiles=min(10, len(smiles)),
        show_more=max(0,
                      len(smiles) - 10),
        task_names=task_names,
        num_tasks=len(task_names),
        preds=preds,
        warnings=["List contains invalid SMILES strings"]
        if None in preds else None,
        errors=["No SMILES strings given"] if len(preds) == 0 else None)
Example #5
def test_bad_path(self):
    """Test that a bad provided path raises FileNotFoundError."""
    bad_path = os.path.join(self.temp_dir.name, 'bad_path.csv')
    with self.assertRaises(FileNotFoundError):
        get_header(bad_path)
Example #6
def test_correct_file(self):
    """Test that a correct input file returns the expected header."""
    header = get_header(os.path.join(self.temp_dir.name, 'dummy_data.csv'))
    self.assertEqual(header, ['column0', 'column1'])
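These tests rely on a temp_dir fixture and a dummy_data.csv created elsewhere in the test class; a minimal sketch of the assumed setup, using tempfile:

import os
import tempfile
import unittest

# Hypothetical fixture; the real test class may name things differently.
class GetHeaderTestCase(unittest.TestCase):
    def setUp(self):
        self.temp_dir = tempfile.TemporaryDirectory()
        with open(os.path.join(self.temp_dir.name, 'dummy_data.csv'), 'w') as f:
            f.write('column0,column1\nCC,10\nCCC,15\n')

    def tearDown(self):
        self.temp_dir.cleanup()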