Esempio n. 1
0
def main():
    """Command-line entry point: cluster the molecules of an SD file.

    Parses the command-line options, reads every record of the input file
    into memory, computes a distance matrix with ``calc_distance_matrix``,
    clusters it with ``cluster`` using the ``--threshold`` value and hands
    the result to ``write_clusters_to_sdfs`` together with the output base
    name and the ``--gzip`` flag.
    """
    # Function-scope import so this block does not depend on the file header.
    from contextlib import closing

    parser = argparse.ArgumentParser(
        description='Clustering with SuCOS and RDKit')
    parser.add_argument(
        '-i',
        '--input',
        help='Input file in SDF format. Can be gzipped (*.gz).')
    parser.add_argument(
        '-o',
        '--output',
        default="cluster",
        help="Base name for output files in SDF format. " +
        "e.g. if value is 'output' then files like output1.sdf, output2.sdf will be created"
    )
    parser.add_argument(
        '--gzip',
        action='store_true',
        help=
        'Gzip the outputs generating files like output1.sdf.gz, output2.sdf.gz'
    )
    parser.add_argument('-t',
                        '--threshold',
                        type=float,
                        default=0.8,
                        help='Clustering threshold')

    args = parser.parse_args()
    utils.log("SuCOS Cluster Args: ", args)

    # The supplier is exhausted into a list while the file is open, so the
    # handle can be released before the (potentially long) clustering step.
    with closing(utils.open_file_for_reading(args.input)) as input_file:
        mols = list(Chem.ForwardSDMolSupplier(input_file))
    # NOTE(review): the supplier may yield None for records it cannot parse;
    # presumably calc_distance_matrix expects valid molecules - confirm.

    matrix = calc_distance_matrix(mols)
    clusters = cluster(matrix, threshold=args.threshold)
    write_clusters_to_sdfs(mols, clusters, args.output, gzip=args.gzip)
Esempio n. 2
0
def process(
    refmol_filename,
    inputs_filename,
    outputs_filename,
    refmol_index=None,
    refmol_format=None,
    tani=False,
    score_mode=FeatMaps.FeatMapScoreMode.All,
):
    """Score every molecule of an SD file against one reference molecule.

    The reference molecule is read with ``utils.read_single_molecule`` and
    its raw features are computed once. Each readable input molecule is then
    scored with ``get_SucosScore``; the three returned values are stored as
    the ``SuCOS_Score``, ``SuCOS_FeatureMap_Score`` and either
    ``SuCOS_Tanimoto_Score`` (``tani`` true) or ``SuCOS_Protrude_Score``
    (``tani`` false) properties before the molecule is written to the output.

    Args:
        refmol_filename: File holding the reference molecule.
        inputs_filename: SD file with the molecules to score.
        outputs_filename: SD file that receives the scored molecules.
        refmol_index: Forwarded as ``index`` to ``utils.read_single_molecule``.
        refmol_format: Forwarded as ``format`` to
            ``utils.read_single_molecule``.
        tani: Forwarded to ``get_SucosScore``; also selects the property name
            used for the third score.
        score_mode: Forwarded to ``get_SucosScore``.

    Molecules that the supplier returns as ``None`` are skipped. A
    ``ValueError`` raised while scoring or writing one molecule is logged and
    counted, and processing continues with the next molecule.
    """
    # Function-scope import so this block does not depend on the file header.
    from contextlib import closing

    ref_mol = utils.read_single_molecule(refmol_filename,
                                         index=refmol_index,
                                         format=refmol_format)
    ref_features = getRawFeatures(ref_mol)

    count = 0   # records read from the input, including unreadable ones
    total = 0   # molecules scored and written
    errors = 0  # molecules whose scoring raised ValueError

    # closing() guarantees the handles are released even when an unexpected
    # exception aborts the loop.
    with closing(utils.open_file_for_reading(inputs_filename)) as input_file:
        suppl = Chem.ForwardSDMolSupplier(input_file)
        with closing(utils.open_file_for_writing(outputs_filename)) as output_file, \
                closing(Chem.SDWriter(output_file)) as writer:
            for mol in suppl:
                count += 1
                if mol is None:
                    continue
                try:
                    sucos_score, fm_score, val3 = get_SucosScore(
                        ref_mol,
                        mol,
                        tani=tani,
                        ref_features=ref_features,
                        score_mode=score_mode,
                    )
                    mol.SetDoubleProp("SuCOS_Score", sucos_score)
                    mol.SetDoubleProp("SuCOS_FeatureMap_Score", fm_score)
                    if tani:
                        mol.SetDoubleProp("SuCOS_Tanimoto_Score", val3)
                    else:
                        mol.SetDoubleProp("SuCOS_Protrude_Score", val3)
                    utils.log("Scores:", sucos_score, fm_score, val3)
                    writer.write(mol)
                    total += 1
                except ValueError as e:
                    errors += 1
                    # Exceptions have no ``message`` attribute on Python 3;
                    # str(e) works on every version.
                    utils.log("Molecule", count, "failed to score:", str(e))
            writer.flush()

    # ``count`` is what was read, ``total`` is what was successfully written.
    utils.log("Completed.", count, "processed, ", total, "succeeded, ", errors,
              "errors")
Esempio n. 3
0
def process(inputfilename, clusterfilenames, outputfilename, mode):
    """Score input molecules against every molecule of every cluster file.

    Each cluster file is read once and the raw features of its molecules are
    cached. Every readable input molecule is then compared with every cached
    cluster molecule through ``sucos.get_SucosScore`` (``tani=False``).

    * ``mode == 'max'``: the comparison with the highest SuCOS score wins;
      its three scores, the cluster file's base name and the 1-based index of
      the winning molecule within that file are stored as ``Max_SuCOS_*``
      properties.
    * ``mode == 'cum'``: the three scores are summed over all comparisons and
      stored as ``Cum_SuCOS_*`` properties.

    Only molecules whose resulting SuCOS score is greater than zero are
    written to the output; the others are logged and omitted.

    Args:
        inputfilename: SD file with the molecules to score.
        clusterfilenames: Iterable of SD file names, one per cluster.
        outputfilename: SD file that receives the scored molecules.
        mode: ``'max'`` or ``'cum'``.

    Raises:
        ValueError: If ``mode`` is neither ``'max'`` nor ``'cum'``.
    """
    # Function-scope import so this block does not depend on the file header.
    from contextlib import closing

    # Validate before any file is opened: previously an invalid mode was only
    # detected inside the innermost loop, i.e. not at all when nothing was
    # compared, and otherwise only after the output file had been created.
    if mode not in ('max', 'cum'):
        raise ValueError("Invalid mode: " + str(mode))

    all_clusters = {}
    for filename in clusterfilenames:
        cluster = []
        with closing(utils.open_file_for_reading(filename)) as cluster_file:
            suppl = Chem.ForwardSDMolSupplier(cluster_file)
            for i, mol in enumerate(suppl, start=1):
                if not mol:
                    utils.log("WARNING: failed to generate molecule", i,
                              "in cluster", filename)
                    continue
                try:
                    features = sucos.getRawFeatures(mol)
                except Exception:
                    # Best effort: skip this molecule, but let
                    # KeyboardInterrupt/SystemExit propagate.
                    utils.log(
                        "WARNING: failed to generate features for molecule",
                        i, "in cluster", filename)
                    continue
                cluster.append((mol, features))
        all_clusters[filename] = cluster

    comparisons = 0

    # closing() guarantees the handles are released even when an unexpected
    # exception aborts the loop.
    with closing(utils.open_file_for_reading(inputfilename)) as input_file:
        suppl = Chem.ForwardSDMolSupplier(input_file)
        with closing(utils.open_file_for_writing(outputfilename)) as output_file, \
                closing(Chem.SDWriter(output_file)) as writer:
            for mol_num, mol in enumerate(suppl, start=1):
                if not mol:
                    utils.log("WARNING: failed to generate molecule", mol_num,
                              "in input")
                    continue
                try:
                    query_features = sucos.getRawFeatures(mol)
                except Exception:
                    utils.log(
                        "WARNING: failed to generate features for molecule",
                        mol_num, "in input")
                    continue

                # [SuCOS, feature map, protrude]: best hit in 'max' mode,
                # running sums in 'cum' mode.
                scores = [0, 0, 0]
                cluster_name = None
                cluster_index = None
                for clusterfilename, cluster in all_clusters.items():
                    for index, (hit, ref_features) in enumerate(cluster,
                                                                start=1):
                        comparisons += 1
                        sucos_score, fm_score, vol_score = sucos.get_SucosScore(
                            hit,
                            mol,
                            tani=False,
                            ref_features=ref_features,
                            query_features=query_features)
                        if mode == 'max':
                            if sucos_score > scores[0]:
                                scores = [sucos_score, fm_score, vol_score]
                                cluster_name = clusterfilename
                                cluster_index = index
                        else:
                            scores[0] += sucos_score
                            scores[1] += fm_score
                            scores[2] += vol_score

                if scores[0] <= 0:
                    utils.log("Molecule", mol_num,
                              "did not overlay. Omitting from results")
                    continue

                if mode == 'max':
                    # A positive best score implies a winning hit was
                    # recorded, so cluster_name/cluster_index are set here.
                    mol.SetDoubleProp("Max_SuCOS_Score", scores[0])
                    mol.SetDoubleProp("Max_SuCOS_FeatureMap_Score", scores[1])
                    mol.SetDoubleProp("Max_SuCOS_Protrude_Score", scores[2])
                    mol.SetProp("Max_SuCOS_Cluster",
                                os.path.basename(cluster_name))
                    mol.SetIntProp("Max_SuCOS_Index", cluster_index)
                else:
                    mol.SetDoubleProp("Cum_SuCOS_Score", scores[0])
                    mol.SetDoubleProp("Cum_SuCOS_FeatureMap_Score", scores[1])
                    mol.SetDoubleProp("Cum_SuCOS_Protrude_Score", scores[2])

                writer.write(mol)
            writer.flush()

    utils.log("Completed", comparisons, "comparisons")
Esempio n. 4
0
def process(inputfilename, clusterfilenames, outputfilename, filter_value,
            filter_field):
    """Annotate input molecules with max and cumulative SuCOS scores.

    Each cluster file is read once and the raw features of its molecules are
    cached. Every readable input molecule is compared with every cached
    cluster molecule through ``sucos.get_SucosScore`` (``tani=False``). Both
    the best comparison (``Max_SuCOS_*`` properties, plus the cluster file's
    base name and the 1-based index of the best hit when there is one) and
    the sums over all comparisons (``Cum_SuCOS_*`` properties) are stored on
    the molecule. Score values that are not greater than zero are stored
    as 0.

    Args:
        inputfilename: SD file with the molecules to score.
        clusterfilenames: Iterable of SD file names, one per cluster.
        outputfilename: SD file that receives the annotated molecules.
        filter_value: Threshold; when filtering is active only molecules
            whose ``filter_field`` value is strictly greater are written.
            ``None`` disables filtering.
        filter_field: Name of the property to compare with ``filter_value``.
            An empty/None value disables filtering.

    When filtering is active, molecules that lack ``filter_field`` are not
    written.
    """
    # Function-scope import so this block does not depend on the file header.
    from contextlib import closing

    def _floor_at_zero(value):
        # Same expression the properties always used: anything that is not
        # strictly positive is recorded as 0.
        return value if value > 0 else 0

    # ``filter_value`` is compared against None rather than tested for
    # truthiness so that a legitimate threshold of 0 is not silently ignored.
    filtering = filter_value is not None and bool(filter_field)

    all_clusters = {}
    for filename in clusterfilenames:
        cluster = []
        with closing(utils.open_file_for_reading(filename)) as cluster_file:
            suppl = Chem.ForwardSDMolSupplier(cluster_file)
            for i, mol in enumerate(suppl, start=1):
                if not mol:
                    utils.log("WARNING: failed to generate molecule", i,
                              "in cluster", filename)
                    continue
                try:
                    features = sucos.getRawFeatures(mol)
                except Exception:
                    # Best effort: skip this molecule, but let
                    # KeyboardInterrupt/SystemExit propagate.
                    utils.log(
                        "WARNING: failed to generate features for molecule",
                        i, "in cluster", filename)
                    continue
                cluster.append((mol, features))
        all_clusters[filename] = cluster

    comparisons = 0

    # closing() guarantees the handles are released even when an unexpected
    # exception aborts the loop.
    with closing(utils.open_file_for_reading(inputfilename)) as input_file:
        suppl = Chem.ForwardSDMolSupplier(input_file)
        with closing(utils.open_file_for_writing(outputfilename)) as output_file, \
                closing(Chem.SDWriter(output_file)) as writer:
            for mol_num, mol in enumerate(suppl, start=1):
                if not mol:
                    utils.log("WARNING: failed to generate molecule", mol_num,
                              "in input")
                    continue
                try:
                    query_features = sucos.getRawFeatures(mol)
                except Exception:
                    utils.log(
                        "WARNING: failed to generate features for molecule",
                        mol_num, "in input")
                    continue

                # [SuCOS, feature map, protrude] for the best hit and for the
                # running sums respectively.
                scores_max = [0, 0, 0]
                scores_cum = [0, 0, 0]
                cluster_name = None
                cluster_index = None
                for clusterfilename, cluster in all_clusters.items():
                    for index, (hit, ref_features) in enumerate(cluster,
                                                                start=1):
                        comparisons += 1
                        sucos_score, fm_score, vol_score = sucos.get_SucosScore(
                            hit,
                            mol,
                            tani=False,
                            ref_features=ref_features,
                            query_features=query_features)

                        if sucos_score > scores_max[0]:
                            scores_max = [sucos_score, fm_score, vol_score]
                            cluster_name = clusterfilename
                            cluster_index = index

                        scores_cum[0] += sucos_score
                        scores_cum[1] += fm_score
                        scores_cum[2] += vol_score

                mol.SetDoubleProp("Max_SuCOS_Score",
                                  _floor_at_zero(scores_max[0]))
                mol.SetDoubleProp("Max_SuCOS_FeatureMap_Score",
                                  _floor_at_zero(scores_max[1]))
                mol.SetDoubleProp("Max_SuCOS_Protrude_Score",
                                  _floor_at_zero(scores_max[2]))

                # Only present when at least one comparison scored above zero.
                if cluster_name:
                    mol.SetProp("Max_SuCOS_Cluster",
                                os.path.basename(cluster_name))
                    mol.SetIntProp("Max_SuCOS_Index", cluster_index)

                mol.SetDoubleProp("Cum_SuCOS_Score",
                                  _floor_at_zero(scores_cum[0]))
                mol.SetDoubleProp("Cum_SuCOS_FeatureMap_Score",
                                  _floor_at_zero(scores_cum[1]))
                mol.SetDoubleProp("Cum_SuCOS_Protrude_Score",
                                  _floor_at_zero(scores_cum[2]))

                if not filtering:
                    writer.write(mol)
                elif mol.HasProp(filter_field):
                    if mol.GetDoubleProp(filter_field) > filter_value:
                        writer.write(mol)
            writer.flush()

    utils.log("Completed", comparisons, "comparisons")