Example #1
0
def doO3Dalign(i,
               mol,
               qmol,
               threshold,
               perfect_score,
               writer,
               conformerProps=None,
               minEnergy=None):
    """Align every conformer of ``mol`` onto ``qmol`` and write the best one.

    Each alignment object returned by ``rdMolAlign.GetO3AForProbeConfs`` is
    aligned and scored; the position of the highest scoring alignment in that
    sequence is used as the conformer id that is written.

    :param i: record number, only used in the log message
    :param mol: probe molecule whose conformers are aligned
    :param qmol: query (reference) molecule
    :param threshold: if falsy every molecule is written, otherwise the
        molecule is only written when ``perfect_score - best_score`` is
        below this value
    :param perfect_score: score that a perfect alignment would have
    :param writer: object with a ``write(mol, confId=...)`` method
    :param conformerProps: optional mapping of conformer id to a mapping that
        contains ``conformers.field_EnergyAbs``
    :param minEnergy: optional minimum energy used to compute the energy delta
    :return: 1 if the molecule was written, 0 otherwise
    """
    pyO3As = rdMolAlign.GetO3AForProbeConfs(mol, qmol)
    best_score = 0.0
    best_align = None
    conf_id = -1
    for j, pyO3A in enumerate(pyO3As):
        # Align() is always called so that every conformer gets aligned,
        # exactly as before; only the best result is remembered.
        align = pyO3A.Align()
        score = pyO3A.Score()
        if score > best_score:
            best_score = score
            best_align = align
            conf_id = j

    #utils.log("Best score = ",best_score)
    if not threshold or perfect_score - best_score < threshold:
        # Report and store the values of the BEST conformer (the one that is
        # written below), not those of whichever conformer was processed last.
        utils.log(i, best_align, best_score,
                  Chem.MolToSmiles(mol, isomericSmiles=True))
        mol.SetDoubleProp(field_O3DAScore, float(best_score))
        if conformerProps and minEnergy:
            eAbs = conformerProps[conf_id][(conformers.field_EnergyAbs)]
            eDelta = eAbs - minEnergy
            if eAbs:
                mol.SetDoubleProp(conformers.field_EnergyAbs, eAbs)
            if eDelta:
                mol.SetDoubleProp(conformers.field_EnergyDelta, eDelta)
        writer.write(mol, confId=conf_id)
        return 1
    return 0
Example #2
0
def fragment(mol, mode, quiet=False):
    """Return the biggest fragment of ``mol``.

    :param mol: molecule that may consist of several disconnected fragments
    :param mode: ``'hac'`` to choose by heavy atom count, ``'mw'`` to choose
        by molecular weight
    :param quiet: suppress the log message when True
    :return: ``mol`` itself when it has at most one fragment, otherwise the
        biggest fragment with the properties of ``mol`` copied onto it
    :raises ValueError: if there are several fragments and ``mode`` is not
        one of the supported values
    """
    frags = Chem.GetMolFrags(mol, asMols=True)

    # Nothing to choose between. This also covers a molecule that yields no
    # fragments at all, which previously fell through to an unbound variable.
    if len(frags) <= 1:
        return mol

    # TODO - handle ties
    # max() returns the first of several equally big fragments, which is what
    # the former "strictly greater than" loop did, but it also yields a result
    # when every fragment has a size of zero.
    if mode == 'hac':
        biggest_mol = max(frags, key=lambda frag: frag.GetNumHeavyAtoms())
        criterion = "based on HAC"
    elif mode == 'mw':
        biggest_mol = max(frags, key=Descriptors.MolWt)
        criterion = "based on MW"
    else:
        raise ValueError('Invalid fragment mode:', mode)

    if not quiet:
        utils.log("Chose fragment from ", len(frags), criterion)

    # copy the properties across
    for name in mol.GetPropNames():
        biggest_mol.SetProp(name, mol.GetProp(name))
    return biggest_mol
Example #3
0
def main():
    """Command line entry point: parse the arguments and split the input."""

    # --- command line definition -------------------------------------------
    arg_parser = argparse.ArgumentParser(description='RDKit Input Splitter')
    utils.add_default_input_args(arg_parser)
    arg_parser.add_argument(
        '-o', '--output',
        required=True,
        help="Directory name for output files (no extension).")
    arg_parser.add_argument(
        '-f', '--field',
        required=True,
        help="field to use to split input. Output files will have the name of this field's value")
    arg_parser.add_argument(
        '--meta',
        action='store_true',
        help='Write metadata and metrics files')

    options = arg_parser.parse_args()
    utils.log("Splitter Args: ", options)

    # --- execute -----------------------------------------------------------
    generated = split(options.input,
                      options.informat,
                      options.field,
                      options.output,
                      options.meta)
    utils.log("Files generated:", " ".join(generated))
Example #4
0
def main():
    """Command line entry point: compute plane-of-best-fit values per molecule.

    Every readable input molecule is embedded in 3D, then ``PBFev`` and
    ``PBFRD`` are evaluated and stored as the ``angle_<n>`` and ``distance``
    properties before all results are written in one go.
    """

    ### command line args defintions #########################################
    parser = argparse.ArgumentParser(
        description='Calculate plane of best fit for molecules')
    utils.add_default_io_args(parser)
    args = parser.parse_args()
    utils.log("PBFEV args: ", args)
    input_file, output, suppl, writer, output_base = utils.default_open_input_output(
        args.input, args.informat, args.output, 'PBFEV', args.outformat)
    i = 0
    count = 0
    errors = 0
    out_results = []
    for mol in suppl:
        i += 1
        # The supplier yields None for unreadable records; this has to be
        # checked BEFORE the molecule is handed to the embedder.
        if mol is None: continue
        # EmbedMolecule reports failure with a negative conformer id, in which
        # case there are no 3D coordinates to work with.
        if AllChem.EmbedMolecule(mol) < 0:
            errors += 1
            utils.log("Failed to embed molecule", i)
            continue
        out_vector = PBFev(mol)
        if out_vector is None: continue
        rd = PBFRD(mol)
        mol.SetDoubleProp("distance", rd)
        for j, angle in enumerate(out_vector):
            mol.SetDoubleProp("angle" + "_" + str(j), angle)
        out_results.append(mol)
    count = write_out(out_results, count, writer, args.outformat)
    utils.log("Handled " + str(i) + " molecules, resulting in " + str(count) +
              " outputs")
    if errors:
        utils.log("WARNING:", errors, "molecules could not be embedded")
    writer.flush()
    writer.close()
    input_file.close()
    output.close()
Example #5
0
def main():
    """Command line entry point for the Tmax/Cmax simulation plot."""

    ### command line args defintions ##################################

    parser = argparse.ArgumentParser(description='Tmax/Cmax simulation')
    parser.add_argument('--half-life', type=float, required=True, help='half life (hours)')
    parser.add_argument('--absorption', type=float, required=True, help='half life absorption (hours)')
    parser.add_argument('--dose', type=float, required=True, help='initial dose (mg)')
    parser.add_argument('--auc', type=float, required=True, help='AUC (mg/L*hr)')
    parser.add_argument('--time', type=float, required=True, help='time (h)')

    parser.add_argument('--plot-height', type=int, default=4, help='plot height')
    parser.add_argument('--plot-width', type=int, default=10, help='plot width')
    parser.add_argument('--font-size', type=int, default=12, help='font size')

    parser.add_argument('-o', '--output', type=str, default='cmax.png', help='output file name')

    parser.add_argument('-q', '--quiet', action='store_true', help='Quiet mode')

    args = parser.parse_args()
    utils.log("Tmax/Cmax simulation Args: ", args)

    ### execute #######################################################

    # --quiet used to be parsed but never forwarded, so it had no effect.
    generatePlot(args.half_life, args.absorption, args.dose, args.auc, args.time,
                 quiet=args.quiet,
                 plot_width=args.plot_width, plot_height=args.plot_height, font_size=args.font_size,
                 filename=args.output)
Example #6
0
def main():
    """Command line entry point for the SMoG2016 docking calculation.

    Sets the module level ``WRITER``, ``THRESHOLD`` and ``PDB_PATH`` that
    ``run_dock`` relies on, runs ``run_dock`` for every input molecule and
    optionally writes the metrics.
    """
    global WRITER, THRESHOLD
    global PDB_PATH
    parser = argparse.ArgumentParser(
        description='SMoG2016 - Docking calculation.')
    utils.add_default_io_args(parser)
    parser.add_argument(
        '--no-gzip',
        action='store_true',
        help='Do not compress the output (STDOUT is never compressed')
    parser.add_argument('-pdb', '--pdb_file', help="PDB file for scoring")
    parser.add_argument('-t',
                        '--threshold',
                        help="The maximum score to allow",
                        default=None)
    parser.add_argument('--thin', action='store_true', help='Thin output mode')

    args = parser.parse_args()

    utils.log("SMoG2016 Args: ", args)

    smog_path = "/usr/local/SMoG2016_Rev1/"
    if args.threshold:
        THRESHOLD = float(args.threshold)
    else:
        THRESHOLD = None

    PDB_PATH = "/tmp/pdb_file.pdb"
    # Now copy it to PDB_PATH -> silly SMOG bug requires underscore in the filename!
    shutil.copy(args.pdb_file, PDB_PATH)

    # Open up the input file
    input, suppl = utils.default_open_input(args.input, args.informat)
    # Open the ouput file. --thin is forwarded here; it used to be parsed
    # and then silently ignored.
    output, WRITER, output_base = utils.default_open_output(
        args.output,
        "SMoG2016",
        args.outformat,
        compress=not args.no_gzip,
        thinOutput=args.thin)

    # Cd to the route of the action
    # TODO - can this be done without changing dir? It gives problems in finding the input files and in writing the metrics
    cwd = os.getcwd()
    os.chdir(smog_path)
    try:
        # Iterate over the molecules
        # TODO - restore parallel processing, but need to ensure the order of molecules is preserved
        pool = ThreadPool(1)
        try:
            pool.map(run_dock, suppl)
        finally:
            # release the worker thread even if a molecule failed
            pool.close()
            pool.join()
        # Close the file
        WRITER.close()
    finally:
        # always go back, otherwise the metrics end up in the wrong directory
        os.chdir(cwd)

    if args.meta:
        utils.write_metrics(
            output_base, {
                '__InputCount__': COUNTER,
                '__OutputCount__': SUCCESS,
                'SMoG2016': COUNTER
            })

    utils.log("SMoG2016 complete")
Example #7
0
def main():
    """Command line entry point for the PLI scoring calculation.

    Sets the module level ``PDB_PATH``, ``WRITER`` and (when given)
    ``THRESHOLD`` that ``run_dock`` relies on, runs ``run_dock`` for every
    input molecule in a thread pool and optionally writes the metrics.
    """
    global PDB_PATH, WRITER, THRESHOLD
    parser = argparse.ArgumentParser(
        description='SMoG2016 - Docking calculation.')
    utils.add_default_io_args(parser)
    parser.add_argument(
        '--no-gzip',
        action='store_true',
        help='Do not compress the output (STDOUT is never compressed')
    parser.add_argument('-pdb', '--pdb_file', help="PDB file for scoring")
    parser.add_argument('-t',
                        '--threshold',
                        type=float,
                        help="The maximum score to allow",
                        default=None)
    parser.add_argument(
        '--threads',
        type=int,
        help="Number of threads to used. Default is the number of cores",
        default=None)
    parser.add_argument('--thin', action='store_true', help='Thin output mode')

    args = parser.parse_args()
    utils.log("PLI Args: ", args)

    # Open up the input file
    input, suppl = utils.default_open_input(args.input, args.informat)
    # Open the ouput file
    output, WRITER, output_base = utils.default_open_output(
        args.output,
        "plip",
        args.outformat,
        compress=not args.no_gzip,
        thinOutput=args.thin)

    PDB_PATH = args.pdb_file
    # Compare against None: a threshold of 0.0 is a legitimate value that a
    # plain truthiness test would silently drop.
    if args.threshold is not None:
        THRESHOLD = args.threshold

    # Iterate over the molecules
    # WARNING - if using parallel processing the order of molecules is not preserved. Set args.threads to 1 to ensure this.
    if args.threads is not None:
        num_threads = args.threads
    else:
        num_threads = multiprocessing.cpu_count()
    pool = ThreadPool(num_threads)
    pool.map(run_dock, suppl)
    pool.close()
    pool.join()
    # Close the file
    WRITER.close()

    if args.meta:
        utils.write_metrics(output_base, {
            '__InputCount__': COUNTER,
            '__OutputCount__': SUCCESS,
            'PLI': COUNTER
        })
Example #8
0
def filter_by_molwt(mol, minMw, maxMw, quiet=False):
    """Tell whether the molecular weight of ``mol`` lies within the limits.

    :param mol: molecule to test
    :param minMw: lower limit, or None for no lower limit
    :param maxMw: upper limit, or None for no upper limit
    :param quiet: suppress the log message for a rejected molecule
    :return: True if the molecule passes, False if it is outside a limit
    """
    weight = Descriptors.MolWt(mol)

    # Work out which limit (if any) is violated; the lower one is tested first.
    if minMw is not None and weight < minMw:
        violation = ("<", minMw)
    elif maxMw is not None and weight > maxMw:
        violation = (">", maxMw)
    else:
        return True

    if not quiet:
        utils.log("MolWt", weight, *violation)
    return False
Example #9
0
def filter_by_heavy_atom_count(mol, minCount, maxCount, quiet=False):
    """Tell whether the heavy atom count of ``mol`` lies within the limits.

    :param mol: molecule to test (needs a ``GetNumHeavyAtoms()`` method)
    :param minCount: lower limit, or None for no lower limit
    :param maxCount: upper limit, or None for no upper limit
    :param quiet: suppress the log message for a rejected molecule
    :return: True if the molecule passes, False if it is outside a limit
    """
    heavy_atoms = mol.GetNumHeavyAtoms()

    # Work out which limit (if any) is violated; the lower one is tested first.
    if minCount is not None and heavy_atoms < minCount:
        violation = ("<", minCount)
    elif maxCount is not None and heavy_atoms > maxCount:
        violation = (">", maxCount)
    else:
        return True

    if not quiet:
        utils.log("HAC", heavy_atoms, *violation)
    return False
Example #10
0
def SelectDiverseSubset(mols, clusters, distances, count, field, maximise,
                        score, quiet):
    """Pick molecules from the clusters in round-robin fashion.

    The clusters are visited in turn. On each visit the next candidate of the
    cluster is removed from it; it is picked if it is the first candidate of
    that cluster or if ``GetClosestDistance`` to the molecules already picked
    from that cluster is below ``score``. Iteration stops once ``count``
    molecules were picked or after ``len(mols)`` visits.

    :param mols: the molecules, indexed by the values held in ``clusters``
    :param clusters: sequence of clusters, each a sequence of molecule indices
    :param distances: passed through to ``GetClosestDistance``
    :param count: number of molecules to pick
    :param field: if set, only molecules having this property are candidates
        and the candidates are ordered using ``FetchScore``
    :param maximise: passed through to ``FetchScore``
    :param score: limit that the closest distance must stay below
    :param quiet: suppress the per-molecule log messages
    :return: list with, for every cluster, the list of picked indices
    """
    total = len(mols)
    num_clusters = len(clusters)

    # one (initially empty) list of picks per cluster
    pickedList = [[] for _ in range(num_clusters)]

    # the candidates per cluster, in the order they will be tried
    if field:
        clustersList = [
            sorted((x for x in clusters[n] if mols[x].HasProp(field)),
                   key=lambda idx: FetchScore(idx, mols, field, maximise))
            for n in range(num_clusters)
        ]
    else:
        clustersList = [list(clusters[n]) for n in range(num_clusters)]

    totalIter = 0
    pickedCount = 0

    while totalIter < total and pickedCount < count:
        clusterNum = totalIter % num_clusters
        totalIter += 1

        candidates = clustersList[clusterNum]
        if not candidates:
            # cluster has been exhausted
            continue

        chosen = pickedList[clusterNum]
        # remove that item from the cluster so that it's not tried again
        molIndex = candidates.pop(0)

        if not chosen:
            # first time for this cluster
            chosen.append(molIndex)
            pickedCount += 1
            if not quiet:
                utils.log("Cluster", clusterNum, "initialised with",
                          molIndex)
            continue

        closestDist = GetClosestDistance(distances, molIndex, chosen)
        if closestDist < score:
            chosen.append(molIndex)
            pickedCount += 1
            if not quiet:
                utils.log("Cluster", clusterNum, "added", molIndex,
                          "with score", closestDist)
        elif not quiet:
            utils.log("Cluster", clusterNum, "discarded", molIndex,
                      "with score", closestDist)

    utils.log("Picked", pickedCount, "using", totalIter, "iterations")
    return pickedList
Example #11
0
def split(input, informat, fieldName, outputBase, writeMetrics):
    """Splits the input into separate files. The name of each file, and the file that each record is written to,
    is determined by the value of the fieldName property.

    :param input: input as accepted by ``utils.default_open_input``
    :param informat: input format as accepted by ``utils.default_open_input``
    :param fieldName: name of the property whose value selects the output file
    :param outputBase: prefix of the generated file names
    :param writeMetrics: whether to write the metrics once finished
    :return: list of the names of the generated files
    """

    input, suppl = utils.default_open_input(input, informat)

    i = 0
    written = 0
    writers = {}
    outputs = []
    filenames = []
    for mol in suppl:
        i += 1
        if mol is None: continue
        if not mol.HasProp(fieldName):
            utils.log("Skipping molecule", i, "- did not contain field",
                      fieldName)
            continue
        value = mol.GetProp(fieldName)
        if value:
            s = str(value)
            # "in" instead of dict.has_key(), which no longer exists in Python 3
            if s not in writers:
                # first record with this value: open a new output file for it
                name = outputBase + s
                output, writer = utils.default_open_output_sdf(
                    name, outputBase, False, False)
                filenames.append(name + '.sdf')
                outputs.append(output)
                writers[s] = writer
            writers[s].write(mol)
            written += 1

    utils.log("Generated", len(writers), "outputs from", i, "records")

    input.close()
    for k in writers:
        writers[k].close()
    for o in outputs:
        o.close()

    if writeMetrics:
        utils.write_metrics(outputBase, {
            '__InputCount__': i,
            '__OutputCount__': written,
            'Splitter': i
        })

    return filenames
Example #12
0
def run_dock(mol):
    """Score one molecule with PLI and write it if it passes the threshold.

    Reads the module level ``PDB_PATH``, ``THRESHOLD``, ``WRITER`` and
    ``lock`` and updates the module level ``COUNTER`` and ``SUCCESS``.
    Molecules for which no answer was obtained, or whose ``pliff_score``
    exceeds ``THRESHOLD``, are not written.

    :param mol: the molecule to score
    """
    global WRITER, COUNTER, SUCCESS, THRESHOLD
    answer_dict = run_and_get_ans(mol, PDB_PATH)
    # This function is run from a thread pool, so the shared counters are
    # only ever modified while holding the lock.
    with lock:
        COUNTER += 1
    if not answer_dict:
        utils.log("FAILED MOL", Chem.MolToSmiles(mol))
        return
    if THRESHOLD is not None:
        if answer_dict["system"]["pliff_score"] > THRESHOLD:
            return
    for ans in answer_dict["system"]:
        if ans.startswith(u"pliff"):
            mol.SetDoubleProp(str(ans), answer_dict["system"][ans])
    utils.log("SCORED MOL:", Chem.MolToSmiles(mol), answer_dict)
    # "with" guarantees the lock is released even if writing fails; otherwise
    # every other worker would block forever.
    with lock:
        WRITER.write(mol)
        SUCCESS += 1
        WRITER.flush()
Example #13
0
def process_mol_conformers(mol, i, numConfs, maxAttempts, pruneRmsThresh,
                           clusterMethod, clusterThreshold,
                           minimizeIterations):
    """Generate the conformers of ``mol``, calculate energies and cluster them.

    :param mol: the molecule to generate conformers for
    :param i: molecule number, only used in the log messages
    :param numConfs: passed through to ``gen_conformers``
    :param maxAttempts: passed through to ``gen_conformers``
    :param pruneRmsThresh: passed through to ``gen_conformers``
    :param clusterMethod: if falsy no clustering is done, otherwise passed
        through to ``cluster_conformers``
    :param clusterThreshold: passed through to ``cluster_conformers``
    :param minimizeIterations: passed through to ``calc_energy``
    :return: tuple of (dict of conformer id to an ordered dict of properties,
        lowest energy found). The energy keeps its initial large sentinel
        value if no conformer produced an energy.
    """
    #utils.log("generating conformers for molecule",i)
    # generate the conformers
    conformerIds = gen_conformers(mol, numConfs, maxAttempts, pruneRmsThresh,
                                  True, True, True)
    conformerPropsDict = {}
    minEnergy = 9999999999999
    for conformerId in conformerIds:
        #utils.log("Processing conf",i,conformerId)
        # energy minimise (optional) and energy calculation
        props = collections.OrderedDict()
        energy = calc_energy(mol, conformerId, minimizeIterations, props)
        # test against None so that an energy of exactly 0 is not ignored
        if energy is not None and energy < minEnergy:
            minEnergy = energy
        conformerPropsDict[conformerId] = props
    # cluster the conformers
    if clusterMethod:
        rmsClusters = cluster_conformers(mol, clusterMethod, clusterThreshold)
        utils.log("Molecule", i, "generated", len(conformerIds),
                  "conformers and", len(rmsClusters), "clusters")

        for clusterNumber, cluster in enumerate(rmsClusters, 1):
            rmsWithinCluster = align_conformers(mol, cluster)
            # the first member of a cluster is treated as its centroid
            centroid = cluster[0] + 1
            for idx, conformerId in enumerate(cluster):
                props = conformerPropsDict[conformerId]
                props[field_ClusterNum] = clusterNumber
                props[field_ClusterCentroid] = centroid
                if idx > 0:
                    # rmsWithinCluster has no entry for the centroid itself
                    props[field_RMSToCentroid] = rmsWithinCluster[idx - 1]
                else:
                    props[field_RMSToCentroid] = 0.0
    else:
        utils.log("Molecule", i + 1, "generated", len(conformerIds),
                  "conformers")

    return conformerPropsDict, minEnergy
Example #14
0
def main():
    """Command line entry point: convert an SD file to JSON.

    :return: the number of molecules written
    """

    ### command line args defintions #########################################

    parser = argparse.ArgumentParser(description='RDKit Sdf2Json')
    parser.add_argument('-i', '--input', help="Input SD file, if not defined the STDIN is used")
    parser.add_argument('-o', '--output', help="Base name for output json file (no extension). If not defined then SDTOUT is used for the structures and output is used as base name of the other files.")
    parser.add_argument('--exclude', help="Optional list of fields (comma separated) to exclude from the output.")


    args = parser.parse_args()
    utils.log("Screen Args: ", args)

    # Default base name. It also applies when reading from STDIN, in which
    # case "base" used to be left undefined and the next log call failed.
    base = "json"
    if args.input:
        lowered = args.input.lower()
        if lowered.endswith(".sdf"):
            base = args.input[:-4]
        elif lowered.endswith(".sdf.gz"):
            base = args.input[:-7]
    utils.log("Base:", base)


    input,output,suppl,writer,output_base = utils.default_open_input_output(args.input, "sdf", args.output, base, "json")
    if args.exclude:
        excludes = args.exclude.split(",")
        utils.log("Excluding", excludes)
    else:
        excludes = None

    i=0
    count = 0
    for mol in suppl:
        i +=1
        if mol is None: continue
        if excludes:
            for exclude in excludes:
                if mol.HasProp(exclude): mol.ClearProp(exclude)
        writer.write(mol)
        count += 1

    utils.log("Converted", count, " molecules")

    writer.flush()
    writer.close()
    input.close()
    output.close()

    utils.write_metrics(output_base, {'__InputCount__':i, '__OutputCount__':count, 'RDKitSdf2Json':count})

    return count
Example #15
0
def enumerateStereoIsomers(mol):
    """Enumerate the stereoisomers of ``mol``.

    For every combination produced by ``_spam`` the chiral tags of the chiral
    centres of ``mol`` are set in place and a copy of the molecule is
    collected.

    :param mol: the molecule to enumerate; its chiral tags are modified
    :return: list of molecules, or ``[mol]`` when no chiral centres were found
    """
    chiralCentres = Chem.FindMolChiralCenters(mol, includeUnassigned=True)
    # nothing to enumerate: hand back the molecule itself
    if chiralCentres == []:
        return [mol]

    # chiral tag to apply for each bit value
    tagForBit = {
        0: Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CW,
        1: Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CCW,
    }

    isomers = []
    # one combination of bits per isomer, one bit per chiral centre
    for bits in _spam(len(chiralCentres)):
        for centreId, bit in enumerate(bits):
            atomId = chiralCentres[centreId][0]
            tag = tagForBit.get(bit)
            if tag is not None:
                mol.GetAtomWithIdx(atomId).SetChiralTag(tag)
        snapshot = copy(mol)
        utils.log("Enumerated ", Chem.MolToSmiles(mol, isomericSmiles=True))
        isomers.append(snapshot)
    return isomers
Example #16
0
def run_and_get_ans(mol, pdb_path):
    """Score ``mol`` against a protein with the ``pli`` executable.

    The molecule is written as a mol block to a temporary ``.sdf`` file that
    is handed to ``pli`` as the ligand; the file is removed again afterwards.

    :param mol: the ligand molecule
    :param pdb_path: path of the protein PDB file
    :return: the JSON output of ``pli`` parsed into Python objects, or None
        if ``pli`` produced no output
    """
    # local import so that this function does not depend on the module header
    import os

    ligand_file = tempfile.NamedTemporaryFile("w", suffix=".sdf",
                                              delete=False)
    smogmol = ligand_file.name
    try:
        # the context manager closes (but, with delete=False, keeps) the file
        with ligand_file:
            ligand_file.write(Chem.MolToMolBlock(mol))
        utils.log("PDB: " + pdb_path + " ligand: " + smogmol)
        # Run command
        pli_path = "/usr/local/pli/bin/pli"
        cmd = [
            pli_path, "-protein", pdb_path, "-ligand", smogmol, "-mode", "score",
            "-output", "system,scores", "-exact_voronoi_areas", "0", "-selection",
            "ligand", "-oformat", "json", "-minimise", "1", "-warnings", "0",
            "-min_max_iter", "10"
        ]
        utils.log("PLI CMD: " + " ".join(cmd))
        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE)
        # communicate() reads all of stdout AND waits for the process to end,
        # so no defunct child processes are left behind
        me, _ = proc.communicate()
    finally:
        # one temporary file per molecule would otherwise pile up
        try:
            os.remove(smogmol)
        except OSError:
            pass
    # Parse the output
    if not me:
        return None
    return json.loads(me)
Example #17
0
def split(input, informat, fieldName, outputBase):
    """Splits the input into separate files. The name of each file, and the file that each record is written to,
    is determined by the value of the fieldName property.

    :param input: input as accepted by ``utils.default_open_input``
    :param informat: input format as accepted by ``utils.default_open_input``
    :param fieldName: name of the property whose value selects the output file
    :param outputBase: prefix of the generated file names
    :return: list of the names of the generated files
    """

    input, suppl = utils.default_open_input(input, informat)

    i = 0
    writers = {}
    outputs = []
    filenames = []
    for mol in suppl:
        i += 1
        if mol is None: continue
        # skip records that lack the field instead of failing on GetProp()
        if not mol.HasProp(fieldName):
            utils.log("Skipping molecule", i, "- did not contain field",
                      fieldName)
            continue
        value = mol.GetProp(fieldName)
        if value:
            s = str(value)
            # "in" instead of dict.has_key(), which no longer exists in Python 3
            if s not in writers:
                # first record with this value: open a new output file for it
                name = outputBase + s
                output, writer = utils.default_open_output_sdf(
                    name, outputBase, False, False)
                filenames.append(name + '.sdf')
                outputs.append(output)
                writers[s] = writer
            writers[s].write(mol)

    utils.log("Generated", len(writers), "outputs from", i, "records")

    input.close()
    for k in writers:
        writers[k].close()
    for o in outputs:
        o.close()

    return filenames
Example #18
0
def main():
    """Command line entry point: prepare a PDB file as pdbqt and/or mol2.

    :raises ValueError: if neither output format was requested
    """
    cli = argparse.ArgumentParser(description='Open babel PDB prepare')
    cli.add_argument('--no-gzip', action='store_true',
                     help='Do not compress the output')
    cli.add_argument('-i', '--input', help="PDB file for converting")
    cli.add_argument('-o', '--output',
                     help="Base name for output files (no extension).")
    cli.add_argument('-mol2', '--mol2', action='store_true',
                     help='Output as Mol2 format.')
    cli.add_argument('-pdbqt', '--pdbqt', action='store_true',
                     help='Output as pdbqt format.')
    cli.add_argument('--meta', action='store_true',
                     help='Write metrics files')
    cli.add_argument('-prot', '--protonate', type=float,
                     help="protonate at this pH (optional)")

    opts = cli.parse_args()

    utils.log("Prepare Args: ", opts)

    # the requested formats, in the order they are generated
    requested = []
    if opts.pdbqt:
        requested.append("pdbqt")
    if opts.mol2:
        requested.append("mol2")

    if not requested:
        raise ValueError(
            "Must specify at least one output fromat: mol2 and/or pdbqt")

    for fmt in requested:
        utils.log("Preparing as " + fmt)
        # the converter option is "-o" followed by the format name
        execute(opts.input, opts.output, fmt, "-o" + fmt, opts.protonate,
                opts.no_gzip)

    utils.log("Preparation complete")
Example #19
0
def run_dock(mol):
    """Score one molecule with SMoG2016 and write it if it passes the threshold.

    Reads the module level ``THRESHOLD``, ``WRITER`` and ``lock`` and updates
    the module level ``COUNTER`` and ``SUCCESS``. Molecules for which no
    answer was obtained, or whose score exceeds ``THRESHOLD``, are not
    written.

    :param mol: the molecule to score
    """
    global COUNTER
    global SUCCESS
    global THRESHOLD
    answer = run_and_get_ans(mol)
    # This function is run from a thread pool, so the shared counters are
    # only ever modified while holding the lock.
    with lock:
        COUNTER += 1
    if answer is None:
        utils.log("FAILED MOL", Chem.MolToSmiles(mol))
        return
    if THRESHOLD is not None:
        # Goes through the logger rather than print(): STDOUT may be carrying
        # the molecule output and must not receive debug text.
        # NOTE(review): assumes utils.log does not write to STDOUT - confirm.
        utils.log(answer, THRESHOLD)
        if answer > THRESHOLD:
            utils.log("UNDER THRESHOLD", Chem.MolToSmiles(mol))
            return
    mol.SetDoubleProp("SMoG2016_SCORE", answer)
    utils.log("SCORED MOL:", Chem.MolToSmiles(mol), answer)
    # Write ligand. "with" guarantees the lock is released even if writing
    # fails; otherwise every other worker would block forever.
    with lock:
        SUCCESS += 1
        WRITER.write(mol)
        WRITER.flush()
    return
Example #20
0
        datasetMetaProps=datasetMetaProps,
        fieldMetaProps=fieldMetaProps)

    inputs = 0
    totalCount = 0
    totalErrors = 0
    for mol in suppl:
        inputs += 1
        if mol:
            count, errors = generate_conformers(inputs, mol, args.num, ref_mol,
                                                WRITER, args.core_smi)
            totalCount += count
            totalErrors += errors

    input.close()
    WRITER.close()

    if totalErrors > 0:
        utils.log("WARNING:", totalErrors, "conformers failed to generate")

    # write metrics
    if args.meta:
        metrics = {
            '__InputCount__': inputs,
            '__OutputCount__': totalCount,
            'RDKitConstrainedConformer': totalCount
        }
        if totalErrors > 0:
            metrics['__ErrorCount__'] = totalErrors
        utils.write_metrics(output_base, metrics)
Example #21
0
def main():
    """Command line entry point for similarity screening against a query.

    Every input molecule is optionally reduced to its biggest fragment,
    filtered by heavy atom count and molecular weight, and written with its
    similarity to the query structure if that similarity lies within the
    ``--simmin`` / ``--simmax`` range.

    :return: the number of molecules written
    :raises ValueError: if no query structure was given or it cannot be read
    """

    ### command line args defintions #########################################

    parser = argparse.ArgumentParser(description='RDKit screen')
    group = parser.add_mutually_exclusive_group()
    group.add_argument(
        '--qsmiles',
        help='query structure as smiles (incompatible with -qmolfile arg)')
    group.add_argument(
        '--qmolfile',
        help=
        'query structure as filename in molfile format (incompatible with -qsmiles arg)'
    )
    parser.add_argument('--simmin',
                        type=float,
                        default=0.7,
                        help='similarity lower cutoff (1.0 means identical)')
    parser.add_argument('--simmax',
                        type=float,
                        default=1.0,
                        help='similarity upper cutoff (1.0 means identical)')
    parser.add_argument('-d',
                        '--descriptor',
                        type=str.lower,
                        choices=list(descriptors.keys()),
                        default='rdkit',
                        help='descriptor or fingerprint type (default rdkit)')
    parser.add_argument('-m',
                        '--metric',
                        type=str.lower,
                        choices=list(metrics.keys()),
                        default='tanimoto',
                        help='similarity metric (default tanimoto)')
    parser.add_argument(
        '-f',
        '--fragment',
        choices=['hac', 'mw'],
        help=
        'Find single fragment if more than one (hac = biggest by heavy atom count, mw = biggest by mol weight )'
    )
    parser.add_argument('--hacmin', type=int, help='Min heavy atom count')
    parser.add_argument('--hacmax', type=int, help='Max heavy atom count')
    parser.add_argument('--mwmin', type=float, help='Min mol weight')
    parser.add_argument('--mwmax', type=float, help='Max mol weight')
    utils.add_default_io_args(parser)
    parser.add_argument('--thin', action='store_true', help='Thin output mode')
    parser.add_argument('-q',
                        '--quiet',
                        action='store_true',
                        help='Quiet mode')

    args = parser.parse_args()
    utils.log("Screen Args: ", args)

    descriptor = descriptors[args.descriptor.lower()]
    metric = metrics[args.metric.lower()]

    if args.qsmiles:
        query_rdkitmol = Chem.MolFromSmiles(args.qsmiles)
    elif args.qmolfile:
        query_rdkitmol = Chem.MolFromMolFile(args.qmolfile)
    else:
        raise ValueError('No query structure specified')

    # The RDKit readers return None instead of raising when the structure
    # cannot be parsed; fail here with a clear message rather than somewhere
    # inside the fingerprint calculation.
    if query_rdkitmol is None:
        raise ValueError('Query structure could not be read')

    query_fp = descriptor(query_rdkitmol)

    input, output, suppl, writer, output_base = utils.default_open_input_output(
        args.input,
        args.informat,
        args.output,
        'screen',
        args.outformat,
        thinOutput=args.thin)

    i = 0
    count = 0
    for mol in suppl:
        i += 1
        if mol is None: continue
        if args.fragment:
            mol = filter.fragment(mol, args.fragment, quiet=args.quiet)
        if not filter.filter(mol,
                             minHac=args.hacmin,
                             maxHac=args.hacmax,
                             minMw=args.mwmin,
                             maxMw=args.mwmax,
                             quiet=args.quiet):
            continue
        target_fp = descriptor(mol)
        sim = metric(query_fp, target_fp)

        if args.simmin <= sim <= args.simmax:
            count += 1
            if not args.quiet:
                utils.log(i, sim)
            mol.SetDoubleProp(field_Similarity, sim)
            writer.write(mol)

    utils.log("Found", count, "similar molecules")

    writer.flush()
    writer.close()
    input.close()
    output.close()

    if args.meta:
        utils.write_metrics(output_base, {
            '__InputCount__': i,
            '__OutputCount__': count,
            'RDKitScreen': i
        })

    return count
Example #22
0
def main():
    """Command line entry point for the molecule standardiser / enumerator.

    In standardize mode every molecule is replaced by its standardised form
    (keeping its ``uuid``); otherwise tautomers and/or stereoisomers are
    enumerated and each result records the number, and the ``uuid`` if there
    is one, of the molecule it was generated from.

    :return: the number of molecules written
    :raises ValueError: if standardization is combined with an enumeration
    """

    ### command line args defintions #########################################

    parser = argparse.ArgumentParser(
        description='RDKit molecule standardiser / enumerator')
    utils.add_default_io_args(parser)
    parser.add_argument('-et',
                        '--enumerate_tauts',
                        action='store_true',
                        help='Enumerate all tautomers')
    parser.add_argument('-es',
                        '--enumerate_stereo',
                        action='store_true',
                        help='Enumerate all stereoisomers')
    parser.add_argument(
        '-st',
        '--standardize',
        action='store_true',
        help='Standardize molecules. Cannot  be true if enumerate is on.')
    parser.add_argument('-stm',
                        '--standardize_method',
                        default="molvs",
                        choices=STANDARD_MOL_METHODS.keys(),
                        help="Chose the method to standardize.")

    args = parser.parse_args()

    if args.standardize and args.enumerate_tauts:
        raise ValueError("Cannot Enumerate Tautomers and Standardise")

    if args.standardize and args.enumerate_stereo:
        raise ValueError("Cannot Enumerate Stereo and Standardise")

    if args.standardize:
        getStandardMolecule = STANDARD_MOL_METHODS[args.standardize_method]

    input, output, suppl, writer, output_base = utils.default_open_input_output(
        args.input, args.informat, args.output, 'sanify', args.outformat)
    i = 0
    count = 0
    errors = 0
    for mol in suppl:
        i += 1
        if mol is None: continue

        # The uuid is optional (the code below already copes with it being
        # empty), so a molecule without the property must not abort the run.
        sourceUuid = mol.GetProp("uuid") if mol.HasProp("uuid") else None

        if args.standardize:
            # we keep the original UUID as there is still a 1-to-1 relationship between the input and outputs
            oldUUID = sourceUuid
            inputCanSmiles = Chem.MolToSmiles(mol,
                                              isomericSmiles=True,
                                              canonical=True)
            try:
                std = getStandardMolecule(mol)
                outputCanSmiles = Chem.MolToSmiles(std,
                                                   isomericSmiles=True,
                                                   canonical=True)
                if oldUUID:
                    std.SetProp("uuid", oldUUID)
                #utils.log("Standardized", i, inputCanSmiles, ">>", outputCanSmiles)
                if inputCanSmiles == outputCanSmiles:
                    std.SetProp("Standardised", "False")
                else:
                    std.SetProp("Standardised", "True")
            except Exception:
                # Best effort: a molecule that cannot be standardised is
                # passed through marked as an error. "Exception" rather than
                # a bare except so that Ctrl-C / SystemExit still work.
                errors += 1
                utils.log("Error standardizing", sys.exc_info()[0])
                std = mol
                std.SetProp("Standardised", "Error")

            count = write_out([std], count, writer)
        else:
            # we want a new UUID generating as we are generating new molecules
            parentUuid = sourceUuid

            results = []
            results.append(mol)

            if args.enumerate_tauts:
                utils.log("Enumerating tautomers")
                results = enumerateTautomers(mol)

            if args.enumerate_stereo:
                utils.log("Enumerating steroisomers")
                mols = results
                results = []
                for m in mols:
                    enumerated = enumerateStereoIsomers(m)
                    results.extend(enumerated)

            for m in results:
                m.ClearProp("uuid")
                m.SetIntProp("SourceMolNum", i)
                if parentUuid:
                    m.SetProp("SourceMolUUID", parentUuid)

            count = write_out(results, count, writer)

    utils.log("Handled " + str(i) + " molecules, resulting in " + str(count) +
              " outputs")

    writer.flush()
    writer.close()
    input.close()
    output.close()

    if args.meta:
        utils.write_metrics(
            output_base, {
                '__InputCount__': i,
                '__OutputCount__': count,
                '__ErrorCount__': errors,
                'RDKitSanify': count
            })

    return count
Example #23
0
def generatePlot(t_hf, t_hf_a, D, AUC, tn, quiet=False,
                 plot_height=4, plot_width=10, font_size=12, filename='cmax.png'):
    """Plot concentration and percent-absorbed curves against time and save them.

    Two rate constants are derived from the half-lives (``kel = ln2 / t_hf``
    and ``ka = ln2 / t_hf_a``), from which Tmax, Cmax and V_F are computed.
    Both curves are then evaluated at 101 evenly spaced time points from 0 to
    ``tn`` and drawn side by side in a single figure.

    Args:
        t_hf: Half-life used to derive ``kel`` (presumably the elimination
            half-life; confirm with the caller). Must be positive.
        t_hf_a: Half-life used to derive ``ka`` (presumably the absorption
            half-life; confirm with the caller). Must be positive and must
            differ from ``t_hf``, because the formulas divide by ``ka - kel``.
        D: Dose term used in ``V_F = D / kel / AUC`` and in the curve.
        AUC: Area-under-curve term; must be non-zero.
        tn: Last time point of the plotted range (the x axis is labelled in
            hours).
        quiet: When true the derived parameters are not logged.
        plot_height: Figure height passed to ``plt.figure``.
        plot_width: Figure width passed to ``plt.figure``.
        font_size: Font size for axis labels and titles.
        filename: Path the figure is saved to.

    Raises:
        ZeroDivisionError: if ``t_hf == t_hf_a`` or a divisor above is zero.
    """
    kel = math.log(2) / t_hf
    ka = math.log(2) / t_hf_a
    Tmax = (math.log(ka) - math.log(kel)) / (ka - kel)
    Cmax = math.exp(-kel * Tmax) * kel * AUC
    V_F = D / kel / AUC

    if not quiet:
        utils.log('------------------------------------------------------------------------------------------')
        utils.log('kel \t', kel)
        utils.log('ka \t', ka)
        utils.log('Tmax \t', Tmax)
        utils.log('Cmax \t', Cmax)
        utils.log('V_F \t', V_F)
        utils.log('------------------------------------------------------------------------------------------')

    # Float division so an integer tn is not truncated under Python 2.
    step = tn / 100.0

    b_time = []
    c_cp = []
    d_perc = []
    for i in range(101):
        # Time points are accumulated step by step, starting from zero.
        t = 0 if i == 0 else b_time[i - 1] + step
        b_time.append(t)
        c_cp.append(ka * D / V_F / (ka - kel) *
                    (math.exp(-kel * t) - math.exp(-ka * t)))
        d_perc.append(100 - 100 * math.exp(-ka * t))

    # Creating the visualisation: concentration on the left, absorption on the right
    plt.figure(figsize=(plot_width, plot_height))
    plt.subplot(1, 2, 1)

    plt.plot(b_time, c_cp, linewidth=2, linestyle='dashed', color='coral')
    plt.xlabel('Time (h)', fontsize=font_size)
    plt.ylabel('Cp(mg/L)', fontsize=font_size)
    plt.title('cp Vs Time', color='coral', fontsize=font_size)
    plt.grid(True)
    #plt.yscale('log')   #Change the Y scale to logscale

    plt.subplot(1, 2, 2)
    plt.plot(b_time, d_perc, linewidth=2, linestyle='dashed')
    plt.xlabel('Time (h)', fontsize=font_size)
    plt.ylabel('% Absorbed', fontsize=font_size)
    plt.title('%Absorbed Vs Time', color='dodgerblue', fontsize=font_size)
    plt.grid(True)

    plt.savefig(filename)
Example #24
0
def main():
    """Screen input molecules for similarity against a set of query structures.

    Query structures are read from exactly one of ``--qsmiles``, ``--qsdf``
    or ``--qjson`` and fingerprinted once with the selected descriptor. Each
    input molecule is optionally reduced to a single fragment, filtered by
    heavy atom count / molecular weight, fingerprinted and compared with every
    query. A molecule with at least one similarity inside
    ``[--simmin, --simmax]`` is written out carrying the per-query scores, the
    best score, the best query name (or index when no name is available) and
    the number of hits.

    Returns:
        The number of molecules written.

    Raises:
        ValueError: if none of the query structure options is given.
    """

    ### command line args defintions #########################################

    parser = argparse.ArgumentParser(description='RDKit screen')
    group = parser.add_mutually_exclusive_group()
    group.add_argument(
        '--qsmiles',
        help=
        'filename of query structures as smiles (incompatible with --qsdf and --qjson args)'
    )
    group.add_argument(
        '--qsdf',
        help=
        'filename of query structures as sdfile (incompatible with --qsmiles and --qjson args)'
    )
    group.add_argument(
        '--qjson',
        help=
        'filename of query structures as MoleculeObject JSON (incompatible with --qsmiles and --qsdf args)'
    )
    parser.add_argument('--qsmilesTitleLine',
                        action='store_true',
                        help='the smiles file has a title line')
    parser.add_argument('--qsmilesDelimiter',
                        default='\t',
                        help='delimiter for smiles file (default is tab)')
    parser.add_argument(
        '--qsmilesColumn',
        type=int,
        default=0,
        help='column in smiles file with the smiles (default is first column)')
    parser.add_argument(
        '--qsmilesNameColumn',
        type=int,
        default=1,
        help='column in smiles file with ID (default is second column)')
    parser.add_argument(
        '--qprop',
        help=
        'property name in query molecules to report. If not defined (or property is not present) '
        +
        'then name property is not written. JSON format uses the UUID as default'
    )

    parser.add_argument('--simmin',
                        type=float,
                        default=0.7,
                        help='similarity lower cutoff (1.0 means identical)')
    parser.add_argument('--simmax',
                        type=float,
                        default=1.0,
                        help='similarity upper cutoff (1.0 means identical)')
    parser.add_argument('-d',
                        '--descriptor',
                        type=str.lower,
                        choices=list(descriptors.keys()),
                        default='rdkit',
                        help='descriptor or fingerprint type (default rdkit)')
    parser.add_argument('-m',
                        '--metric',
                        type=str.lower,
                        choices=list(metrics.keys()),
                        default='tanimoto',
                        help='similarity metric (default tanimoto)')
    parser.add_argument(
        '-f',
        '--fragment',
        choices=['hac', 'mw'],
        help=
        'Find single fragment if more than one (hac = biggest by heavy atom count, mw = biggest by mol weight )'
    )
    parser.add_argument('--hacmin', type=int, help='Min heavy atom count')
    parser.add_argument('--hacmax', type=int, help='Max heavy atom count')
    parser.add_argument('--mwmin', type=float, help='Min mol weight')
    parser.add_argument('--mwmax', type=float, help='Max mol weight')
    utils.add_default_io_args(parser)
    parser.add_argument('--thin', action='store_true', help='Thin output mode')
    parser.add_argument('-q',
                        '--quiet',
                        action='store_true',
                        help='Quiet mode')

    args = parser.parse_args()
    utils.log("Screen Args: ", args)

    descriptor = descriptors[args.descriptor.lower()]
    metric = metrics[args.metric.lower()]

    # Open whichever query source was requested. Only the SDF and JSON readers
    # hand back a handle that needs closing afterwards.
    propName = args.qprop
    if args.qsmiles:
        queryMolsupplier = utils.default_open_input_smiles(
            args.qsmiles,
            delimiter=args.qsmilesDelimiter,
            smilesColumn=args.qsmilesColumn,
            nameColumn=args.qsmilesNameColumn,
            titleLine=args.qsmilesTitleLine)
        queryInput = None
    elif args.qsdf:
        queryInput, queryMolsupplier = utils.default_open_input_sdf(args.qsdf)
    elif args.qjson:
        queryInput, queryMolsupplier = utils.default_open_input_json(
            args.qjson, lazy=False)
        if not propName:
            propName = "uuid"
    else:
        raise ValueError('No query structure specified')

    # Fingerprint every query once up front; keyed by the query molecule so
    # its properties can still be read while scoring.
    queryFps = {}
    utils.log("Preparing query fingerprints")
    count = 0
    for q in queryMolsupplier:
        count += 1
        if q:
            queryFps[q] = descriptor(q)
        else:
            utils.log("WARNING: Failed to parse Molecule", count)
    if queryInput:
        queryInput.close()

    input, output, suppl, writer, output_base = utils.default_open_input_output(
        args.input, args.informat, args.output, 'screen_multi', args.outformat)

    # OK, all looks good so we can hope that things will run OK.
    # But before we start lets write the metadata so that the results can be handled.
    #if args.meta:
    #    t = open(output_base + '_types.txt', 'w')
    #    t.write(field_Similarity + '=integer\n')
    #    t.flush()
    #    t.close()

    i = 0
    count = 0
    for mol in suppl:
        i += 1
        if mol is None: continue
        if args.fragment:
            mol = filter.fragment(mol, args.fragment, quiet=args.quiet)
        if not filter.filter(mol,
                             minHac=args.hacmin,
                             maxHac=args.hacmax,
                             minMw=args.mwmin,
                             maxMw=args.mwmax,
                             quiet=args.quiet):
            continue
        targetFp = descriptor(mol)
        idx = 0
        hits = 0
        # Best-hit state is reset for every target molecule. bestIdx doubles
        # as the "no hit recorded yet" marker so that a first hit scoring
        # exactly 0 is still recorded.
        bestScore = 0
        bestIdx = None
        bestName = None
        for queryMol in queryFps:
            idx += 1
            sim = metric(queryFps[queryMol], targetFp)
            # Only read the name property when the query actually has it;
            # otherwise fall back to index-based field names.
            if propName and queryMol.HasProp(propName):
                name = str(queryMol.GetProp(propName))
            else:
                name = None
            if sim >= args.simmin and sim <= args.simmax:
                hits += 1
                if not args.quiet:
                    utils.log(i, idx, sim)
                if bestIdx is None or sim > bestScore:
                    bestScore = sim
                    bestIdx = idx
                    # Track the name of the current best hit, even if empty,
                    # so it never lags behind bestIdx.
                    bestName = name
                if name:
                    mol.SetDoubleProp(field_Similarity + "_" + name, sim)
                else:
                    mol.SetDoubleProp(
                        field_Similarity + "_" + str(idx) + "_Score", sim)

        if hits > 0:
            count += 1
            mol.SetDoubleProp(field_Similarity + "_BestScore", bestScore)
            if bestName:
                mol.SetProp(field_Similarity + "_BestName", bestName)
            else:
                mol.SetIntProp(field_Similarity + "_BestIndex", bestIdx)
            mol.SetIntProp(field_Similarity + "_Count", hits)
            writer.write(mol)

    utils.log("Found", count, "similar molecules")

    writer.flush()
    writer.close()
    input.close()
    output.close()

    if args.meta:
        utils.write_metrics(output_base, {
            '__InputCount__': i,
            '__OutputCount__': count,
            'RDKitScreen': count
        })

    return count
Example #25
0
def main():
    """Align input molecules onto a query molecule with Open3DAlign.

    The query is read from the ``query`` file (``--qmolidx`` selects the
    record), stripped of hydrogens and aligned to a copy of itself to obtain
    the reference ("perfect") score. Each input molecule is then aligned to
    the query via ``doO3Dalign``; when ``--num`` is greater than zero the
    existing conformers are discarded and new ones generated first, and
    energy fields are added to the output metadata.

    The ``--emin`` value is forwarded to conformer generation, so the
    requested number of minimisation iterations is applied (the default of 0
    means none).
    """

    parser = argparse.ArgumentParser(description='Open3DAlign with RDKit')
    parser.add_argument('query', help='query molfile')
    parser.add_argument(
        '--qmolidx',
        help="Query molecule index in SD file if not the first",
        type=int,
        default=1)
    parser.add_argument(
        '-t',
        '--threshold',
        type=float,
        help='score cuttoff relative to alignment of query to itself')
    parser.add_argument(
        '-n',
        '--num',
        default=0,
        type=int,
        help=
        'number of conformers to generate, if None then input structures are assumed to already be 3D'
    )
    parser.add_argument('-a',
                        '--attempts',
                        default=0,
                        type=int,
                        help='number of attempts to generate conformers')
    parser.add_argument('-r',
                        '--rmsd',
                        type=float,
                        default=1.0,
                        help='prune RMSD threshold for excluding conformers')
    parser.add_argument(
        '-e',
        '--emin',
        type=int,
        default=0,
        help=
        'energy minimisation iterations for generated confomers (default of 0 means none)'
    )
    utils.add_default_io_args(parser)

    args = parser.parse_args()
    utils.log("o3dAlign Args: ", args)

    qmol = utils.read_single_molecule(args.query, index=args.qmolidx)
    qmol = Chem.RemoveHs(qmol)
    # Independent copy of the query, used only for the self-alignment below.
    qmol2 = Chem.Mol(qmol)

    source = "conformers.py"
    datasetMetaProps = {
        "source": source,
        "description": "Open3DAlign using RDKit " + rdBase.rdkitVersion
    }
    clsMappings = {"O3DAScore": "java.lang.Float"}
    fieldMetaProps = [{
        "fieldName": "O3DAScore",
        "values": {
            "source": source,
            "description": "Open3DAlign alignment score"
        }
    }]
    if args.num > 0:
        # we generate the conformers so will add energy info
        clsMappings["EnergyDelta"] = "java.lang.Float"
        clsMappings["EnergyAbs"] = "java.lang.Float"
        fieldMetaProps.append({
            "fieldName": "EnergyDelta",
            "values": {
                "source": source,
                "description": "Energy difference to lowest energy conformer"
            }
        })
        fieldMetaProps.append({
            "fieldName": "EnergyAbs",
            "values": {
                "source": source,
                "description": "Absolute energy"
            }
        })

    input, output, suppl, writer, output_base = utils.default_open_input_output(
        args.input,
        args.informat,
        args.output,
        'o3dAlign',
        args.outformat,
        valueClassMappings=clsMappings,
        datasetMetaProps=datasetMetaProps,
        fieldMetaProps=fieldMetaProps)

    # Aligning the query onto itself gives the reference score that the
    # threshold is measured against.
    pyO3A = rdMolAlign.GetO3A(qmol2, qmol)
    perfect_align = pyO3A.Align()
    perfect_score = pyO3A.Score()
    utils.log('Perfect score:', perfect_align, perfect_score,
              Chem.MolToSmiles(qmol, isomericSmiles=True), qmol.GetNumAtoms())

    # i counts only the molecules that could be read (None records are
    # skipped before it is incremented), count the molecules written, and
    # total the conformers seen.
    i = 0
    count = 0
    total = 0
    for mol in suppl:
        if mol is None: continue
        if args.num > 0:
            mol.RemoveAllConformers()
            # No clustering is requested here (cluster and threshold are None);
            # the user's --emin value drives energy minimisation.
            conformerProps, minEnergy = conformers.process_mol_conformers(
                mol, i, args.num, args.attempts, args.rmsd, None, None,
                args.emin)
            mol = Chem.RemoveHs(mol)
            count += doO3Dalign(i,
                                mol,
                                qmol,
                                args.threshold,
                                perfect_score,
                                writer,
                                conformerProps=conformerProps,
                                minEnergy=minEnergy)
        else:
            mol = Chem.RemoveHs(mol)
            count += doO3Dalign(i, mol, qmol, args.threshold, perfect_score,
                                writer)
        i += 1
        total += mol.GetNumConformers()

    input.close()
    writer.flush()
    writer.close()
    output.close()

    if args.meta:
        utils.write_metrics(output_base, {
            '__InputCount__': i,
            '__OutputCount__': count,
            'RDKitO3DAlign': total
        })
Example #26
0
def main():
    """Command-line entry point for the RDKit molecule filter.

    Reads molecules from the default input, optionally keeps only one
    fragment of each, drops those failing the heavy-atom-count / molecular
    weight limits, and writes the survivors. Output can be capped with
    ``--limit`` and split into numbered chunk files with ``--chunksize``
    (zero padded to ``--digits``). Metrics are written when ``--meta`` is set.
    """

    ### command line args defintions #########################################

    cli = argparse.ArgumentParser(description='RDKit filter')
    cli.add_argument(
        '-f', '--fragment', choices=['hac', 'mw'],
        help='Find single fragment if more than one (hac = biggest by heavy atom count, mw = biggest by mol weight )')
    cli.add_argument('--hacmin', type=int, help='Min heavy atom count')
    cli.add_argument('--hacmax', type=int, help='Max heavy atom count')
    cli.add_argument('--mwmin', type=float, help='Min mol weight')
    cli.add_argument('--mwmax', type=float, help='Max mol weight')
    cli.add_argument('-l', '--limit', type=int,
                     help='Limit output to this many records')
    cli.add_argument(
        '-c', '--chunksize', type=int,
        help='Split output into chunks of size c. Output will always be files. Names like filter1.sdf.gz, filter2.sdf.gz ...')
    cli.add_argument(
        '-d', '--digits', type=int, default=0,
        help='When splitting zero pad the file name to this many digits so that they are in sorted order. Names like filter001.sdf.gz, filter002.sdf.gz ...')
    cli.add_argument(
        '--no-gzip', action='store_true',
        help='Do not compress the output (STDOUT is never compressed')
    # WARNING: thin output is not appropriate when using --fragment
    cli.add_argument('--thin', action='store_true', help='Thin output mode')
    cli.add_argument(
        '-q', '--quiet', action='store_true',
        help='Quiet mode - suppress reporting reason for filtering')
    utils.add_default_io_args(cli)
    args = cli.parse_args()
    utils.log("Filter Args: ", args)

    source_handle, suppl = utils.default_open_input(args.input, args.informat)

    gzip_output = not args.no_gzip
    chunkNum = 1

    # Open the first (or only) output. In chunk mode every file is named from
    # the base name plus the zero padded chunk number.
    if args.chunksize:
        output_base = args.output if args.output else 'filter'
        first_chunk = output_base + str(chunkNum).zfill(args.digits)
        output, writer, current_name = utils.default_open_output(
            first_chunk, first_chunk, args.outformat, compress=gzip_output)
    else:
        output, writer, current_name = utils.default_open_output(
            args.output, "filter", args.outformat, compress=gzip_output)
        output_base = current_name

    utils.log("Writing to " + current_name)

    seen = 0      # records pulled from the input, readable or not
    written = 0   # molecules that passed the filter and were written
    for mol in suppl:
        if args.limit and written >= args.limit:
            break
        seen += 1
        if mol is None:
            continue
        if args.fragment:
            mol = fragment(mol, args.fragment, quiet=args.quiet)
        passed = filter(mol,
                        minHac=args.hacmin,
                        maxHac=args.hacmax,
                        minMw=args.mwmin,
                        maxMw=args.mwmax,
                        quiet=args.quiet)
        if not passed:
            continue

        # Roll over to a fresh chunk file once the current one is full.
        if args.chunksize and written > 0 and written % args.chunksize == 0:
            writer.close()
            output.close()
            chunkNum += 1
            next_chunk = output_base + str(chunkNum).zfill(args.digits)
            utils.log("Writing to " + next_chunk)
            output, writer, next_chunk = utils.default_open_output(
                next_chunk, next_chunk, args.outformat, compress=gzip_output)

        written += 1
        writer.write(mol)

    utils.log("Filtered", seen, "down to", written, "molecules")
    if args.chunksize:
        utils.log("Wrote", chunkNum, "chunks")
        too_few_digits = args.digits > 0 and len(str(chunkNum)) > args.digits
        if too_few_digits:
            utils.log(
                "WARNING: not enough digits specified for the number of chunks"
            )

    writer.flush()
    writer.close()
    source_handle.close()
    output.close()

    if args.meta:
        metrics_out = {
            '__InputCount__': seen,
            '__OutputCount__': written,
            'RDKitFilter': seen
        }
        utils.write_metrics(output_base, metrics_out)
Example #27
0
def main():
    """Generate conformers for the input molecules and write them out.

    Molecules come either from ``--smiles`` (a single structure) or from the
    default input. For each molecule hydrogens are added, conformers are
    generated (and optionally clustered / minimised) by
    ``process_mol_conformers``, hydrogens are removed again and the conformers
    are written with ``write_conformers``.

    The cluster threshold defaults to 0.3 for TFD and 2.0 otherwise when
    ``--threshold`` is not supplied. The misspelt cluster choice ``tdf`` is
    still accepted for backward compatibility and is treated as ``tfd``.
    """

    ### command line args defintions #########################################

    parser = argparse.ArgumentParser(description='RDKit conformers')
    parser.add_argument('-n',
                        '--num',
                        type=int,
                        default=1,
                        help='number of conformers to generate')
    parser.add_argument('-a',
                        '--attempts',
                        type=int,
                        default=0,
                        help='number of attempts')
    parser.add_argument('-r',
                        '--rmsd',
                        type=float,
                        default=1.0,
                        help='prune RMSD threshold')
    parser.add_argument(
        '-c',
        '--cluster',
        type=str.lower,
        # 'tdf' is the historical misspelling of 'tfd'; kept as an alias so
        # existing invocations keep working.
        choices=['rmsd', 'tfd', 'tdf'],
        help='Cluster method (RMSD or TFD). If None then no clustering')
    parser.add_argument(
        '-t',
        '--threshold',
        type=float,
        help='cluster threshold (default of 2.0 for RMSD and 0.3 for TFD)')
    parser.add_argument(
        '-e',
        '--emin',
        type=int,
        default=0,
        help='energy minimisation iterations (default of 0 means none)')
    utils.add_default_io_args(parser)
    parser.add_argument(
        '--smiles',
        help=
        'input structure as smiles (incompatible with using files or stdin for input)'
    )

    args = parser.parse_args()

    # Normalise the legacy spelling so the rest of the code only sees 'tfd'.
    if args.cluster == 'tdf':
        args.cluster = 'tfd'

    # Only fill in the default when no threshold was given at all; an
    # explicit value of 0 is respected.
    if args.threshold is None:
        if args.cluster == 'tfd':
            args.threshold = 0.3
        else:
            args.threshold = 2.0

    utils.log("Conformers Args: ", args)

    source = "conformers.py"
    datasetMetaProps = {
        "source": source,
        "description":
        "Conformer generation using RDKit " + rdBase.rdkitVersion
    }
    clsMappings = {
        "RMSToCentroid": "java.lang.Float",
        "EnergyDelta": "java.lang.Float",
        "EnergyAbs": "java.lang.Float",
        "ConformerNum": "java.lang.Integer",
        "ClusterCentroid": "java.lang.Integer",
        "ClusterNum": "java.lang.Integer",
        "StructureNum": "java.lang.Integer"
    }
    fieldMetaProps = [{
        "fieldName": "RMSToCentroid",
        "values": {
            "source": source,
            "description": "RMS distance to the cluster centroid"
        }
    }, {
        "fieldName": "EnergyDelta",
        "values": {
            "source": source,
            "description": "Energy difference to lowest energy structure"
        }
    }, {
        "fieldName": "EnergyAbs",
        "values": {
            "source": source,
            "description": "Absolute energy"
        }
    }, {
        "fieldName": "ConformerNum",
        "values": {
            "source": source,
            "description": "Conformer number"
        }
    }, {
        "fieldName": "ClusterCentroid",
        "values": {
            "source": source,
            "description": "Conformer number of the cluster centroid"
        }
    }, {
        "fieldName": "ClusterNum",
        "values": {
            "source": source,
            "description": "Cluster number"
        }
    }, {
        "fieldName": "StructureNum",
        "values": {
            "source": source,
            "description": "Structure number this conformer was generated from"
        }
    }]

    if args.smiles:
        # Single structure given on the command line: there is no input
        # stream to open (or close later).
        mol = Chem.MolFromSmiles(args.smiles)
        suppl = [mol]
        input = None
        output, writer, output_base = utils.default_open_output(
            args.output,
            'conformers',
            args.outformat,
            valueClassMappings=clsMappings,
            datasetMetaProps=datasetMetaProps,
            fieldMetaProps=fieldMetaProps)
    else:
        input, output, suppl, writer, output_base = utils.default_open_input_output(
            args.input,
            args.informat,
            args.output,
            'conformers',
            args.outformat,
            valueClassMappings=clsMappings,
            datasetMetaProps=datasetMetaProps,
            fieldMetaProps=fieldMetaProps)

    # OK, all looks good so we can hope that things will run OK.
    # But before we start lets write the metadata so that the results can be handled.
    #if args.meta:
    #    t = open(output_base + '_types.txt', 'w')
    #    t.write(field_StructureNum + '=integer\n')
    #    t.write(field_StructureNum + '=integer\n')
    #    t.write(field_ConformerNum + '=integer\n')
    #    t.write(field_EnergyAbs + '=double\n')
    #    t.write(field_EnergyDelta + '=double\n')
    #    if args.emin > 0:
    #        t.write(field_MinimizationConverged + '=boolean\n')
    #    if args.cluster:
    #        t.write(field_RMSToCentroid + '=double\n')
    #        t.write(field_ClusterNum + '=integer\n')
    #        t.write(field_ClusterCentroid + '=integer\n')
    #    t.flush()
    #    t.close()

    # i counts the molecules processed (unreadable ones are skipped and not
    # counted); count totals the conformers written.
    i = 0
    count = 0
    for mol in suppl:
        if mol is None: continue
        m = Chem.AddHs(mol)
        conformerPropsDict, minEnergy = process_mol_conformers(
            m, i, args.num, args.attempts, args.rmsd, args.cluster,
            args.threshold, args.emin)
        m = Chem.RemoveHs(m)
        write_conformers(m, i, conformerPropsDict, minEnergy, writer)
        count = count + m.GetNumConformers()
        i += 1

    if input:
        input.close()
    writer.flush()
    writer.close()
    output.close()

    if args.meta:
        utils.write_metrics(output_base, {
            '__InputCount__': i,
            '__OutputCount__': count,
            'RDKitConformer': count
        })
Example #28
0
def main():
    """Cluster the input molecules with the Butina algorithm and write the results.

    Input molecules are fragmented and fingerprinted, clustered with
    ``ClusterFps`` using a distance cutoff of ``1 - threshold``, and,
    when ``--num`` is given, reduced to a diverse subset with
    ``SelectDiverseSubset``. Every molecule that belongs to a final cluster
    is written with its cluster number set as a property.

    Raises:
        ValueError: if the descriptor lookup yields ``None``, if ``--field``
            is given without ``--num``, or if ``--field`` is given without
            ``--min`` or ``--max``.
    """

    ### command line args defintions #########################################

    parser = argparse.ArgumentParser(description='RDKit Butina Cluster')
    parser.add_argument(
        '-t',
        '--threshold',
        type=float,
        default=0.7,
        help='similarity clustering threshold (1.0 means identical)')
    parser.add_argument('-d',
                        '--descriptor',
                        type=str.lower,
                        choices=list(descriptors.keys()),
                        default='rdkit',
                        help='descriptor or fingerprint type (default rdkit)')
    parser.add_argument('-m',
                        '--metric',
                        type=str.lower,
                        choices=list(metrics.keys()),
                        default='tanimoto',
                        help='similarity metric (default tanimoto)')
    parser.add_argument(
        '-n',
        '--num',
        type=int,
        help='maximum number to pick for diverse subset selection')
    parser.add_argument(
        '-e',
        '--exclude',
        type=float,
        default=0.9,
        help=
        'threshold for excluding structures in diverse subset selection (1.0 means identical)'
    )
    parser.add_argument(
        '--fragment-method',
        choices=['hac', 'mw'],
        default='hac',
        help=
        'Approach to find biggest fragment if more than one (hac = biggest by heavy atom count, mw = biggest by mol weight)'
    )
    parser.add_argument(
        '--output-fragment',
        action='store_true',
        help='Output the biggest fragment rather than the original molecule')
    parser.add_argument(
        '-f',
        '--field',
        help='field to use to optimise diverse subset selection')
    group = parser.add_mutually_exclusive_group()
    group.add_argument(
        '--min',
        action='store_true',
        help='pick lowest value specified by the --field option')
    group.add_argument(
        '--max',
        action='store_true',
        help='pick highest value specified by the --field option')

    utils.add_default_io_args(parser)
    parser.add_argument('-q',
                        '--quiet',
                        action='store_true',
                        help='Quiet mode')
    parser.add_argument('--thin', action='store_true', help='Thin output mode')

    args = parser.parse_args()
    utils.log("Cluster Args: ", args)

    descriptor = descriptors[args.descriptor]
    if descriptor is None:
        raise ValueError('Invalid descriptor name ' + args.descriptor)

    if args.field and not args.num:
        raise ValueError(
            '--num argument must be specified for diverse subset selection')
    if args.field and not (args.min or args.max):
        raise ValueError(
            '--min or --max argument must be specified for diverse subset selection'
        )

    # handle metadata
    source = "cluster_butina.py"
    datasetMetaProps = {
        "source": source,
        "description": "Butina clustering using RDKit " + rdBase.rdkitVersion
    }
    clsMappings = {"Cluster": "java.lang.Integer"}
    fieldMetaProps = [{
        "fieldName": "Cluster",
        "values": {
            "source": source,
            "description": "Cluster number"
        }
    }]

    input, output, suppl, writer, output_base = utils.default_open_input_output(
        args.input,
        args.informat,
        args.output,
        'cluster_butina',
        args.outformat,
        thinOutput=args.thin,
        valueClassMappings=clsMappings,
        datasetMetaProps=datasetMetaProps,
        fieldMetaProps=fieldMetaProps)

    ### generate fingerprints
    #mols = [x for x in suppl if x is not None]
    #fps = [descriptor(x) for x in mols]

    # mols and fps are passed in empty and populated in place by the helper.
    mols = []
    fps = []
    mol_utils.fragmentAndFingerprint(
        suppl,
        mols,
        fps,
        descriptor,
        fragmentMethod=args.fragment_method,
        outputFragment=args.output_fragment,
        quiet=args.quiet)

    input.close()

    ### do clustering
    utils.log("Clustering with descriptor", args.descriptor, "metric",
              args.metric, "and threshold", args.threshold)
    # The similarity threshold is converted to a distance cutoff.
    clusters, dists, matrix = ClusterFps(fps, args.metric,
                                         1.0 - args.threshold)

    utils.log("Found", len(clusters), "clusters")

    ### generate diverse subset if specified
    if args.num:
        utils.log("Generating diverse subset")
        # diverse subset selection is specified
        finalClusters = SelectDiverseSubset(mols, clusters, dists, args.num,
                                            args.field, args.max, args.exclude,
                                            args.quiet)
    else:
        finalClusters = clusters

    utils.log("Found", len(finalClusters), "clusters")
    lookup = ClustersToMap(finalClusters)

    if not args.quiet:
        utils.log("Final Clusters:", finalClusters)

    ### write the results
    i = 0
    result_count = 0
    for mol in mols:
        # 'in' works on both Python 2 and 3 (dict.has_key is Python 2 only).
        if i in lookup:
            if args.thin:
                utils.clear_mol_props(mol, ["uuid"])
            cluster = lookup[i]
            mol.SetIntProp(field_Cluster, cluster)
            writer.write(mol)
            result_count += 1
        i += 1

    writer.flush()
    writer.close()
    output.close()

    if args.meta:
        status_str = str(result_count) + ' results from ' + str(
            len(finalClusters)) + ' clusters'
        utils.write_metrics(
            output_base, {
                '__StatusMessage__': status_str,
                '__InputCount__': i,
                '__OutputCount__': result_count,
                'RDKitCluster': i
            })
Example #29
0
def main():
    """Run a named reaction from the poised filter over the input molecules.

    Each readable input molecule is passed to ``Filter.perform_reaction``
    together with the reagent library supplier and the output writer; the
    running product count it returns is logged and reported in the metrics.
    With ``--multi`` a set of per-reaction writers is also created in the
    output directory and closed at the end.

    Raises:
        ValueError: if ``--multi`` is given without an output location.
    """
    ### command line args defintions #########################################

    ### Define the reactions available
    # Imported here rather than at module level, as in the original code.
    from poised_filter import Filter
    filter_to_use = Filter()

    parser = argparse.ArgumentParser(description='RDKit rxn process')
    utils.add_default_io_args(parser)
    parser.add_argument('-q',
                        '--quiet',
                        action='store_true',
                        help='Quiet mode')
    parser.add_argument('-m',
                        '--multi',
                        action='store_true',
                        help='Output one file for each reaction')
    parser.add_argument('-r',
                        '--reaction',
                        choices=list(filter_to_use.poised_reactions.keys()),
                        help='Name of reaction to be run')
    parser.add_argument('-rl',
                        '--reagent_lib',
                        help="Input SD file, if not defined the STDIN is used")
    parser.add_argument(
        '-rlf',
        '--reagent_lib_format',
        choices=['sdf', 'json'],
        help="Input format. When using STDIN this must be specified.")

    args = parser.parse_args()
    utils.log("Screen Args: ", args)

    if not args.output and args.multi:
        raise ValueError(
            "Must specify output location when writing individual result files"
        )

    input, suppl = utils.default_open_input(args.input, args.informat)
    reagent_input, reagent_suppl = utils.default_open_input(
        args.reagent_lib, args.reagent_lib_format)
    output, writer, output_base = utils.default_open_output(
        args.output, "rxn_maker", args.outformat)

    i = 0
    count = 0

    if args.multi:
        dir_base = os.path.dirname(args.output)
        writer_dict = filter_to_use.get_writers(dir_base)
    else:
        writer_dict = None
        dir_base = None

    for mol in suppl:
        i += 1
        if mol is None: continue
        # Return a dict/class here - indicating which filters passed
        count = filter_to_use.perform_reaction(mol, args.reaction,
                                               reagent_suppl, writer, count)

    utils.log("Created", count, "molecules from a total of ", i,
              "input molecules")

    writer.flush()
    writer.close()
    if input:
        input.close()
    # The reagent library handle was opened above and must be released too.
    if reagent_input:
        reagent_input.close()
    if output:
        output.close()
    # close the individual writers
    if writer_dict:
        for key in writer_dict:
            writer_dict[key].close()

    if args.meta:
        utils.write_metrics(
            output_base, {
                '__InputCount__': i,
                '__OutputCount__': count,
                'RxnSmartsFilter': count
            })
Example #30
0
def main():
    """Command-line entry point for the Butina cluster similarity matrix.

    Reads molecules from the configured input, computes the selected
    descriptor for every molecule that parsed successfully, clusters them via
    ``cluster_butina.ClusterFps`` and writes one record per reported pair:

    * a self-pair (value 1.0) for every molecule, and
    * both orientations (x, y) and (y, x) of every pair whose matrix value is
      greater than ``--matrixThreshold``.

    When ``--meta`` is given, a metrics file is written containing the number
    of input records read and the number of records written.

    Raises:
        ValueError: if the requested descriptor name has no implementation.
    """

    ### command line args definitions #########################################

    parser = argparse.ArgumentParser(description='RDKit Butina Cluster Matrix')
    utils.add_default_input_args(parser)
    parser.add_argument('-o', '--output', help="Base name for output file (no extension). If not defined then SDTOUT is used for the structures and output is used as base name of the other files.")
    parser.add_argument('-of', '--outformat', choices=['tsv', 'json'], default='tsv', help="Output format. Defaults to 'tsv'.")
    parser.add_argument('--meta', action='store_true', help='Write metadata and metrics files')
    parser.add_argument('-t', '--threshold', type=float, default=0.7, help='Similarity clustering threshold (1.0 means identical)')
    parser.add_argument('-mt', '--matrixThreshold', type=float, default=0.5, help='Threshold for outputting values (1.0 means identical)')
    parser.add_argument('-d', '--descriptor', type=str.lower, choices=list(cluster_butina.descriptors.keys()), default='rdkit', help='descriptor or fingerprint type (default rdkit)')
    parser.add_argument('-m', '--metric', type=str.lower, choices=list(cluster_butina.metrics.keys()), default='tanimoto', help='similarity metric (default tanimoto)')
    parser.add_argument('-q', '--quiet', action='store_true', help='Quiet mode')

    args = parser.parse_args()
    utils.log("Cluster Matrix Args: ", args)

    # .get() so that an unknown name reaches the ValueError below instead of
    # surfacing as a bare KeyError (argparse `choices` normally prevents this).
    descriptor = cluster_butina.descriptors.get(args.descriptor)
    if descriptor is None:
        raise ValueError('Invalid descriptor name ' + args.descriptor)

    input_handle, suppl = utils.default_open_input(args.input, args.informat)

    # handle metadata
    source = "cluster_butina_matrix.py"
    datasetMetaProps = {"source": source, "description": "Butina clustering using RDKit " + rdBase.rdkitVersion}
    clsMappings = {
        "Cluster1": "java.lang.Integer",
        "Cluster2": "java.lang.Integer",
        "ID1": "java.lang.String",
        "ID2": "java.lang.String",
        "M1": "java.lang.String",
        "M2": "java.lang.String",
        "Similarity": "java.lang.Float"
    }
    fieldMetaProps = [{"fieldName": "Cluster", "values": {"source": source, "description": "Cluster number"}}]

    # Ordered so that the output columns always appear in this sequence.
    fieldNames = collections.OrderedDict()
    fieldNames['ID1'] = 'ID1'
    fieldNames['ID2'] = 'ID2'
    fieldNames['Cluster1'] = 'Cluster1'
    fieldNames['Cluster2'] = 'Cluster2'
    fieldNames['Similarity'] = 'Similarity'
    fieldNames['M1'] = 'M1'
    fieldNames['M2'] = 'M2'

    writer, output_base = utils.create_simple_writer(args.output, 'cluster_butina_matrix', args.outformat, fieldNames,
                                                     valueClassMappings=clsMappings, datasetMetaProps=datasetMetaProps, fieldMetaProps=fieldMetaProps)

    ### generate fingerprints
    # Count every record read (including ones that failed to parse) so the
    # metrics report the real input size; only parsed molecules are clustered.
    input_count = 0
    mols = []
    for mol in suppl:
        input_count += 1
        if mol is not None:
            mols.append(mol)
    fps = [descriptor(mol) for mol in mols]
    if input_handle:
        input_handle.close()

    ### do clustering
    utils.log("Clustering with descriptor", args.descriptor, "metric", args.metric, "and threshold", args.threshold)
    # ClusterFps takes a distance cutoff, hence 1.0 - similarity threshold.
    # The second returned value (the flat distance list) is not needed here.
    clusters, _dists, matrix = cluster_butina.ClusterFps(fps, args.metric, 1.0 - args.threshold)
    utils.log("Found", len(clusters), "clusters")

    MapClusterToMols(clusters, mols)

    if not args.quiet:
        utils.log("Clusters:", clusters)

    writer.writeHeader()

    size = len(matrix)
    count = 0
    for i in range(size):
        # self-pair for molecule i
        writer.write(create_values(mols, i, i, 1.0))
        count += 1
        for j, similarity in enumerate(matrix[i]):
            if similarity > args.matrixThreshold:
                # the matrix is the lower left segment without the diagonal,
                # so row i holds the values for molecule i + 1
                x = j
                y = i + 1
                writer.write(create_values(mols, x, y, similarity))
                writer.write(create_values(mols, y, x, similarity))
                count += 2
    # The loop above emits the self-pair for molecules 0..size-1 only; the
    # last molecule (index `size`) still needs its own. Skipped when there are
    # no molecules at all, since there is then no index `size` to report
    # (presumably create_values indexes into mols -- confirm).
    if mols:
        writer.write(create_values(mols, size, size, 1.0))
        count += 1

    writer.writeFooter()
    writer.close()

    if args.meta:
        # Use the explicit input counter rather than the loop variable `i`,
        # which held a matrix row index (and is unbound for an empty matrix).
        utils.write_metrics(output_base, {'__InputCount__': input_count, '__OutputCount__': count, 'RDKitCluster': input_count})