Ejemplo n.º 1
0
def average_duplicates(args: Args):
    """Averages duplicate data points in a dataset.

    Rows that share a SMILES string are merged into a single row. For each
    task, the merged target is the mean of the non-missing (non-``None``)
    values for that SMILES; a task with no values at all is written as an
    empty field. Molecules are written in the order they first appear.

    :param args: Arguments object; this function reads ``data_path``,
                 ``smiles_column``, ``target_columns`` and ``save_path``.
    """
    print('Loading data')
    header = get_header(args.data_path)
    data = get_data(path=args.data_path,
                    smiles_column=args.smiles_column,
                    target_columns=args.target_columns)
    print(f'Data size = {len(data):,}')

    # Map SMILES string to lists of targets, remembering first-seen order
    smiles_in_order = []
    smiles_to_targets = defaultdict(list)
    for smiles, targets in zip(data.smiles(), data.targets()):
        if smiles not in smiles_to_targets:
            smiles_in_order.append(smiles)
        smiles_to_targets[smiles].append(targets)

    # Find duplicates
    duplicate_count = 0
    stds = []
    new_data = []
    for smiles in smiles_in_order:
        all_targets = smiles_to_targets[smiles]
        duplicate_count += len(all_targets) - 1

        # Regroup the values per task, dropping missing (None) entries
        targets_by_task = [
            [targets[task] for targets in all_targets
             if targets[task] is not None]
            for task in range(len(all_targets[0]))
        ]

        # A task without any value contributes a std of 0.0 and a mean of None
        stds.append([
            np.std(task_targets) if len(task_targets) > 0 else 0.0
            for task_targets in targets_by_task
        ])
        means = [
            np.mean(task_targets) if len(task_targets) > 0 else None
            for task_targets in targets_by_task
        ]
        new_data.append((smiles, means))

    print(f'Number of duplicates = {duplicate_count:,}')
    # Average the per-molecule standard deviations over molecules, per task.
    # The number is formatted on its own (no stray ':' prefix in the output).
    std_summary = ', '.join(f'{std:.4e}' for std in np.mean(stds, axis=0))
    print(f'Duplicate standard deviation per task = {std_summary}')
    print(f'New data size = {len(new_data):,}')

    # Save new data
    with open(args.save_path, 'w') as f:
        f.write(','.join(header) + '\n')

        for smiles, avg_targets in new_data:
            f.write(smiles + ',' + ','.join(
                str(value) if value is not None else ''
                for value in avg_targets) + '\n')
Ejemplo n.º 2
0
def interpret(args: InterpretArgs) -> pd.DataFrame:
    """Prints and returns the selected rationale for every molecule in the data file.

    Each molecule is scored with the model; ``mcts`` is only run when the
    score exceeds ``args.prop_delta``. Among the rationales returned, the ones
    with the fewest atoms are kept and the one with the highest ``P`` is
    reported. One comma-separated line per molecule is printed to stdout.

    :param args: Arguments object; this function reads ``property_id``,
                 ``c_puct``, ``min_atoms``, ``data_path``, ``smiles_column``,
                 ``prop_delta``, ``rollout`` and ``max_atoms``.
    :return: A DataFrame with columns ``smiles``, ``rationale_smiles`` and
             ``rationale_score``. Molecules without a rationale get ``'N/A'``
             and ``0``.
    """
    global C_PUCT, MIN_ATOMS

    chemprop_model = ChempropModel(args)

    def scoring_function(smiles: List[str]) -> List[float]:
        # property_id is 1-based; the model output columns are 0-based
        return chemprop_model(smiles)[:, args.property_id - 1]

    # Module-level settings are overwritten from the arguments
    C_PUCT = args.c_puct
    MIN_ATOMS = args.min_atoms

    all_smiles = get_smiles(path=args.data_path,
                            smiles_column=args.smiles_column)
    header = get_header(path=args.data_path)

    # Fall back to a generic column name when the header is too short
    property_name = header[
        args.property_id] if len(header) > args.property_id else 'score'
    print(f'smiles,{property_name},rationale,rationale_score')

    rat_smiles = []
    rat_scores = []

    for smiles in all_smiles:
        score = scoring_function([smiles])[0]
        if score > args.prop_delta:
            rationales = mcts(smiles=smiles,
                              scoring_function=scoring_function,
                              n_rollout=args.rollout,
                              max_atoms=args.max_atoms,
                              prop_delta=args.prop_delta)
        else:
            rationales = []

        if len(rationales) == 0:
            rat_smiles.append('N/A')
            rat_scores.append(0)
            print(f'{smiles},{score:.3f},,')
            continue

        # Keep only the smallest rationales, then take the highest-P one.
        # max() returns the first maximal element, matching what a stable
        # descending sort followed by [0] would select.
        min_size = min(len(x.atoms) for x in rationales)
        min_rationales = [x for x in rationales if len(x.atoms) == min_size]
        best = max(min_rationales, key=lambda x: x.P)

        rat_smiles.append(best.smiles)
        rat_scores.append(best.P)
        print(f'{smiles},{score:.3f},{best.smiles},{best.P:.3f}')

    return pd.DataFrame(
        list(zip(all_smiles, rat_smiles, rat_scores)),
        columns=['smiles', 'rationale_smiles', 'rationale_score'])
Ejemplo n.º 3
0
def average_duplicates(args):
    """Merge rows that share a SMILES string by averaging their targets.

    Reads the dataset at ``args.data_path``, averages the non-missing target
    values per task for every distinct SMILES, prints summary statistics and
    writes the merged rows (preceded by the original header) to
    ``args.save_path``.
    """
    print('Loading data')
    header = get_header(args.data_path)
    data = get_data(args.data_path)
    print('Data size = {:,}'.format(len(data)))

    # Group every target row under its SMILES string
    grouped = defaultdict(list)
    for smi, row in zip(data.smiles(), data.targets()):
        grouped[smi].append(row)

    # Collapse each group into one averaged row
    n_duplicates = 0
    per_molecule_stds = []
    averaged_rows = []
    for smi, rows in grouped.items():
        n_duplicates += len(rows) - 1

        # One list of present (non-None) values per task
        columns = [
            [row[task] for row in rows if row[task] is not None]
            for task in range(len(rows[0]))
        ]

        per_molecule_stds.append(
            [np.std(col) if col else 0.0 for col in columns])
        averaged_rows.append(
            (smi, [np.mean(col) if col else None for col in columns]))

    print('Number of duplicates = {:,}'.format(n_duplicates))
    task_stds = np.mean(per_molecule_stds, axis=0)
    formatted_stds = ['{:.4e}'.format(std) for std in task_stds]
    print('Duplicate standard deviation per task = {}'.format(
        ', '.join(formatted_stds)))
    print('New data size = {:,}'.format(len(averaged_rows)))

    # Write the header followed by one line per distinct SMILES
    with open(args.save_path, 'w') as f:
        f.write(','.join(header) + '\n')

        for smi, averages in averaged_rows:
            # Tasks without any value are written as empty fields
            cells = ['' if value is None else str(value)
                     for value in averages]
            f.write(smi + ',' + ','.join(cells) + '\n')
Ejemplo n.º 4
0
def predict():
    """Renders the predict page and makes predictions if the method is POST.

    SMILES are taken, in order of precedence, from the ``textSmiles`` form
    field, the ``drawSmiles`` form field, or an uploaded ``data`` file. The
    checkpoint models selected by ``checkpointName`` are then used to predict.

    :return: The rendered predict page, with predictions, warnings or errors.
    """
    if request.method == 'GET':
        return render_predict()

    # Get arguments
    ckpt_id = request.form['checkpointName']

    if request.form['textSmiles'] != '':
        smiles = request.form['textSmiles'].split()
    elif request.form['drawSmiles'] != '':
        smiles = [request.form['drawSmiles']]
    else:
        # Upload data file with SMILES
        data = request.files['data']
        data_name = secure_filename(data.filename)
        data_path = os.path.join(app.config['TEMP_FOLDER'], data_name)
        data.save(data_path)

        # Check if header is smiles
        possible_smiles = get_header(data_path)[0]
        smiles = [possible_smiles
                  ] if Chem.MolFromSmiles(possible_smiles) is not None else []

        # Get remaining smiles
        smiles.extend(get_smiles(data_path))

    # Report empty input explicitly; previously this case fell through to the
    # "All SMILES are invalid" branch because all() of an empty list is True.
    if len(smiles) == 0:
        return render_predict(errors=['No SMILES strings given'])

    models = db.get_models(ckpt_id)
    model_paths = [
        os.path.join(app.config['CHECKPOINT_FOLDER'], f'{model["id"]}.pt')
        for model in models
    ]

    task_names = load_task_names(model_paths[0])
    num_tasks = len(task_names)
    gpu = request.form.get('gpu')
    train_args = load_args(model_paths[0])

    # Build arguments
    arguments = [
        '--test_path', 'None', '--preds_path',
        os.path.join(app.config['TEMP_FOLDER'],
                     app.config['PREDICTIONS_FILENAME']), '--checkpoint_paths',
        *model_paths
    ]

    if gpu is not None:
        if gpu == 'None':
            arguments.append('--no_cuda')
        else:
            arguments += ['--gpu', gpu]

    # Handle additional features
    if train_args.features_path is not None:
        # TODO: make it possible to specify the features generator if trained using features_path
        arguments += [
            '--features_generator', 'rdkit_2d_normalized',
            '--no_features_scaling'
        ]
    elif train_args.features_generator is not None:
        arguments += ['--features_generator', *train_args.features_generator]

        if not train_args.features_scaling:
            arguments.append('--no_features_scaling')

    # Parse arguments
    args = PredictArgs().parse_args(arguments)

    # Run predictions
    preds = make_predictions(args=args, smiles=smiles)

    if all(p is None for p in preds):
        return render_predict(errors=['All SMILES are invalid'])

    # Record whether any SMILES was invalid BEFORE the None entries are
    # replaced below; checking afterwards could never find a None.
    has_invalid = any(pred is None for pred in preds)

    # Replace invalid smiles with message
    invalid_smiles_warning = 'Invalid SMILES String'
    preds = [
        pred if pred is not None else [invalid_smiles_warning] * num_tasks
        for pred in preds
    ]

    return render_predict(
        predicted=True,
        smiles=smiles,
        num_smiles=min(10, len(smiles)),
        show_more=max(0,
                      len(smiles) - 10),
        task_names=task_names,
        num_tasks=len(task_names),
        preds=preds,
        warnings=["List contains invalid SMILES strings"]
        if has_invalid else None,
        errors=None)
Ejemplo n.º 5
0
def predict():
    """Renders the predict page and makes predictions if the method is POST.

    SMILES are taken, in order of precedence, from the ``textSmiles`` form
    field, the ``drawSmiles`` form field, or an uploaded ``data`` file. The
    arguments stored with the first checkpoint model are loaded and adjusted
    for prediction before the models are run.

    :return: The rendered predict page, with predictions, warnings or errors.
    """
    if request.method == 'GET':
        return render_predict()

    # Get arguments
    ckpt_id = request.form['checkpointName']

    if request.form['textSmiles'] != '':
        smiles = request.form['textSmiles'].split()
    elif request.form['drawSmiles'] != '':
        smiles = [request.form['drawSmiles']]
    else:
        # Upload data file with SMILES
        data = request.files['data']
        data_name = secure_filename(data.filename)
        data_path = os.path.join(app.config['TEMP_FOLDER'], data_name)
        data.save(data_path)

        # Check if header is smiles
        possible_smiles = get_header(data_path)[0]
        smiles = [possible_smiles] if Chem.MolFromSmiles(possible_smiles) is not None else []

        # Get remaining smiles
        smiles.extend(get_smiles(data_path))

    # Report empty input explicitly; previously this case fell through to the
    # "All SMILES are invalid" branch because all() of an empty list is True.
    if len(smiles) == 0:
        return render_predict(errors=['No SMILES strings given'])

    models = db.get_models(ckpt_id)
    model_paths = [os.path.join(app.config['CHECKPOINT_FOLDER'], f'{model["id"]}.pt') for model in models]

    task_names = load_task_names(model_paths[0])
    num_tasks = len(task_names)
    gpu = request.form.get('gpu')

    # Create and modify args
    args = load_args(model_paths[0])

    # A model trained with a features file is predicted with a fixed generator
    if args.features_path is not None:
        args.features_generator = ["rdkit_2d_normalized"]
        args.features_path = None

    preds_path = os.path.join(app.config['TEMP_FOLDER'], app.config['PREDICTIONS_FILENAME'])
    args.test_path = 'None'  # TODO: Remove this hack to avoid assert crashing in modify_predict_args
    args.preds_path = preds_path
    args.checkpoint_paths = model_paths
    if gpu is not None:
        if gpu == 'None':
            args.no_cuda = True
        else:
            args.gpu = int(gpu)

    modify_predict_args(args)

    # Run predictions
    preds = make_predictions(args, smiles=smiles)

    if all(p is None for p in preds):
        return render_predict(errors=['All SMILES are invalid'])

    # Record whether any SMILES was invalid BEFORE the None entries are
    # replaced below; checking afterwards could never find a None.
    has_invalid = any(pred is None for pred in preds)

    # Replace invalid smiles with message
    invalid_smiles_warning = "Invalid SMILES String"
    preds = [pred if pred is not None else [invalid_smiles_warning] * num_tasks for pred in preds]

    return render_predict(predicted=True,
                          smiles=smiles,
                          num_smiles=min(10, len(smiles)),
                          show_more=max(0, len(smiles)-10),
                          task_names=task_names,
                          num_tasks=len(task_names),
                          preds=preds,
                          warnings=["List contains invalid SMILES strings"] if has_invalid else None,
                          errors=None)
Ejemplo n.º 6
0

if __name__ == "__main__":
    # Script entry point: score every molecule in the data file and run the
    # rationale search for those whose score exceeds the threshold.
    args = Args().parse_args()

    # Module-level search settings taken from the command line
    C_PUCT = args.c_puct
    MIN_ATOMS = args.min_atoms

    chemprop_model = ChempropModel(
        checkpoint_dir=args.checkpoint_dir,
        device=args.device,
    )

    def scoring_function(smiles: List[str]) -> List[float]:
        # property_id is 1-based; the model output columns are 0-based
        predictions = chemprop_model(smiles)
        return predictions[:, args.property_id - 1]

    all_smiles = get_smiles(path=args.data_path)
    header = get_header(path=args.data_path)

    # Use the header's column name when available, otherwise a generic label
    if len(header) > args.property_id:
        property_name = header[args.property_id]
    else:
        property_name = 'score'
    print(f'smiles,{property_name},rationale,rationale_score')

    for smiles in all_smiles:
        score = scoring_function([smiles])[0]
        # Only molecules scoring above the threshold are searched
        rationales = mcts(
            smiles=smiles,
            scoring_function=scoring_function,
            n_rollout=args.rollout,
            max_atoms=args.max_atoms,
            prop_delta=args.prop_delta,
        ) if score > args.prop_delta else []
Ejemplo n.º 7
0
def predict():
    """Renders the predict page and makes predictions if the method is POST.

    SMILES are taken from an uploaded ``data`` file when one is present,
    otherwise from the ``textSmiles`` form field, otherwise from the
    ``drawSmiles`` form field. The checkpoint named by ``checkpointName`` is
    then used to predict.

    :return: The rendered predict page, with predictions, warnings or errors.
    """
    if request.method == 'GET':
        return render_predict()

    # Get arguments
    checkpoint_name = request.form['checkpointName']

    if 'data' in request.files:
        # Upload data file with SMILES
        data = request.files['data']
        data_name = secure_filename(data.filename)
        data_path = os.path.join(app.config['TEMP_FOLDER'], data_name)
        data.save(data_path)

        # Check if header is smiles
        possible_smiles = get_header(data_path)[0]
        smiles = [possible_smiles
                  ] if Chem.MolFromSmiles(possible_smiles) is not None else []

        # Get remaining smiles
        smiles.extend(get_smiles(data_path))
    elif request.form['textSmiles'] != '':
        smiles = request.form['textSmiles'].split()
    else:
        smiles = [request.form['drawSmiles']]

    # Report empty input explicitly; previously this case fell through to the
    # "All SMILES are invalid" branch because all() of an empty list is True.
    if len(smiles) == 0:
        return render_predict(errors=['No SMILES strings given'])

    checkpoint_path = os.path.join(app.config['CHECKPOINT_FOLDER'],
                                   checkpoint_name)
    task_names = load_task_names(checkpoint_path)
    num_tasks = len(task_names)
    gpu = request.form.get('gpu')

    # Create and modify args
    parser = ArgumentParser()
    add_predict_args(parser)
    # Parse an empty argument list to obtain the defaults. Calling
    # parse_args() with no argument would read the web server's own sys.argv.
    args = parser.parse_args([])

    preds_path = os.path.join(app.config['TEMP_FOLDER'],
                              app.config['PREDICTIONS_FILENAME'])
    args.test_path = 'None'  # TODO: Remove this hack to avoid assert crashing in modify_predict_args
    args.preds_path = preds_path
    args.checkpoint_path = checkpoint_path
    args.write_smiles = True
    if gpu is not None:
        if gpu == 'None':
            args.no_cuda = True
        else:
            args.gpu = int(gpu)

    modify_predict_args(args)

    # Run predictions
    preds = make_predictions(args, smiles=smiles)

    if all(p is None for p in preds):
        return render_predict(errors=['All SMILES are invalid'])

    # Record whether any SMILES was invalid BEFORE the None entries are
    # replaced below; checking afterwards could never find a None.
    has_invalid = any(pred is None for pred in preds)

    # Replace invalid smiles with message
    invalid_smiles_warning = "Invalid SMILES String"
    preds = [
        pred if pred is not None else [invalid_smiles_warning] * num_tasks
        for pred in preds
    ]

    return render_predict(
        predicted=True,
        smiles=smiles,
        num_smiles=min(10, len(smiles)),
        show_more=max(0,
                      len(smiles) - 10),
        task_names=task_names,
        num_tasks=len(task_names),
        preds=preds,
        warnings=["List contains invalid SMILES strings"]
        if has_invalid else None,
        errors=None)