def enumerateMol(mol, fragment):
    """
    Enumerate a single molecule.

    :param mol: The molecule to enumerate
    :param fragment: The fragmentation method, 'hac' or 'mw'. If not specified the whole molecule is passed on
    (to Dimorphite, according to the original note) without fragmenting
    :return: Whatever run_with_mol_list returns for a one-element list holding the (optionally fragmented) molecule
    """
    # Only reduce to a single fragment when a method was requested
    target = mol_utils.fragment(mol, fragment) if fragment else mol

    # run_with_mol_list works on lists, so wrap the single molecule
    return run_with_mol_list([target])
Exemple #2
0
def standardize(mol, neutralize, fragment):
    """
    Standardize a molecule: clean it up, then optionally pick a single fragment and optionally neutralize it.

    :param mol: The molecule to standardize
    :param neutralize: Boolean for whether to neutralize the molecule
    :param fragment: The approach for choosing the largest fragment. Either 'hac' or 'mw'. If not specified the whole
    molecule is used.
    :return: The standardized molecule
    """
    cleaned = rdMolStandardize.Cleanup(mol)

    # We use our own largest fragment picker as the RDKit one behaves slightly differently
    if fragment:
        cleaned = mol_utils.fragment(cleaned, fragment)

    # Neutralization is applied last, after any fragment selection
    return uncharger.uncharge(cleaned) if neutralize else cleaned
Exemple #3
0
def main():
    """
    Command line entry point: screen input molecules against several query structures.

    Query structures are read from a smiles, SDF or JSON file and converted to
    fingerprints with the selected descriptor. Every input molecule that passes the
    optional fragment/size filters is compared with each query using the selected
    metric. Molecules with at least one similarity inside [simmin, simmax] are written
    out with per-query score properties plus best score, best name (or best index when
    the best query has no usable name) and hit count properties.

    :return: The number of molecules written
    :raises ValueError: If no query structure file was specified
    """

    ### command line args definitions #########################################

    parser = argparse.ArgumentParser(description='RDKit screen')
    group = parser.add_mutually_exclusive_group()
    group.add_argument(
        '--qsmiles',
        help=
        'filename of query structures as smiles (incompatible with --qsdf and --qjson args)'
    )
    group.add_argument(
        '--qsdf',
        help=
        'filename of query structures as sdfile (incompatible with --qsmiles and --qjson args)'
    )
    group.add_argument(
        '--qjson',
        help=
        'filename of query structures as MoleculeObject JSON (incompatible with --qsmiles and --qsdf args)'
    )
    parser.add_argument('--qsmilesTitleLine',
                        action='store_true',
                        help='the smiles file has a title line')
    parser.add_argument('--qsmilesDelimiter',
                        default='\t',
                        help='delimiter for smiles file (default is tab)')
    parser.add_argument(
        '--qsmilesColumn',
        type=int,
        default=0,
        help='column in smiles file with the smiles (default is first column)')
    parser.add_argument(
        '--qsmilesNameColumn',
        type=int,
        default=1,
        help='column in smiles file with ID (default is second column)')
    parser.add_argument(
        '--qprop',
        help=
        'property name in query molecules to report. If not defined (or property is not present) '
        +
        'then name property is not written. JSON format uses the UUID as default'
    )

    parser.add_argument('--simmin',
                        type=float,
                        default=0.7,
                        help='similarity lower cutoff (1.0 means identical)')
    parser.add_argument('--simmax',
                        type=float,
                        default=1.0,
                        help='similarity upper cutoff (1.0 means identical)')
    parser.add_argument('-d',
                        '--descriptor',
                        type=str.lower,
                        choices=list(descriptors.keys()),
                        default='rdkit',
                        help='descriptor or fingerprint type (default rdkit)')
    parser.add_argument('-m',
                        '--metric',
                        type=str.lower,
                        choices=list(metrics.keys()),
                        default='tanimoto',
                        help='similarity metric (default tanimoto)')
    parser.add_argument(
        '-f',
        '--fragment',
        choices=['hac', 'mw'],
        help=
        'Find single fragment if more than one (hac = biggest by heavy atom count, mw = biggest by mol weight )'
    )
    parser.add_argument('--hacmin', type=int, help='Min heavy atom count')
    parser.add_argument('--hacmax', type=int, help='Max heavy atom count')
    parser.add_argument('--mwmin', type=float, help='Min mol weight')
    parser.add_argument('--mwmax', type=float, help='Max mol weight')
    parameter_utils.add_default_io_args(parser)
    # NOTE(review): --thin is accepted here but is not forwarded when the output
    # is opened below - confirm whether that is intentional.
    parser.add_argument('--thin', action='store_true', help='Thin output mode')
    parser.add_argument('-q',
                        '--quiet',
                        action='store_true',
                        help='Quiet mode')

    args = parser.parse_args()
    utils.log("Screen Args: ", args)

    descriptor = descriptors[args.descriptor.lower()]
    metric = metrics[args.metric.lower()]

    propName = args.qprop
    if args.qsmiles:
        queryMolsupplier = rdkit_utils.default_open_input_smiles(
            args.qsmiles,
            delimiter=args.qsmilesDelimiter,
            smilesColumn=args.qsmilesColumn,
            nameColumn=args.qsmilesNameColumn,
            titleLine=args.qsmilesTitleLine)
        queryInput = None
    elif args.qsdf:
        queryInput, queryMolsupplier = rdkit_utils.default_open_input_sdf(
            args.qsdf)
    elif args.qjson:
        queryInput, queryMolsupplier = rdkit_utils.default_open_input_json(
            args.qjson, lazy=False)
        if not propName:
            propName = "uuid"
    else:
        raise ValueError('No query structure specified')

    # One (fingerprint, name) pair per successfully parsed query, in input order.
    # The name is resolved once here instead of once per target/query pair, and is
    # None when no property was requested or the query does not carry it.
    queries = []
    utils.log("Preparing query fingerprints")
    count = 0
    for q in queryMolsupplier:
        count += 1
        if q:
            if propName and q.HasProp(propName):
                name = str(q.GetProp(propName))
            else:
                name = None
            queries.append((descriptor(q), name))
        else:
            utils.log("WARNING: Failed to parse Molecule", count)
    if queryInput:
        queryInput.close()

    source, sink, suppl, writer, output_base = rdkit_utils.default_open_input_output(
        args.input, args.informat, args.output, 'screen_multi', args.outformat)

    i = 0
    count = 0
    try:
        for mol in suppl:
            i += 1
            if mol is None:
                continue
            if args.fragment:
                mol = mol_utils.fragment(mol, args.fragment, quiet=args.quiet)
            if not filter.filter(mol,
                                 minHac=args.hacmin,
                                 maxHac=args.hacmax,
                                 minMw=args.mwmin,
                                 maxMw=args.mwmax,
                                 quiet=args.quiet):
                continue
            targetFp = descriptor(mol)
            hits = 0
            # None (rather than 0) marks "no hit yet" so that a hit scoring exactly
            # 0 still records a best index.
            bestScore = None
            bestIdx = None
            bestName = None
            # idx is the 1-based position of the query among the parsed queries
            for idx, (queryFp, name) in enumerate(queries, 1):
                sim = metric(queryFp, targetFp)
                if not (args.simmin <= sim <= args.simmax):
                    continue
                hits += 1
                if not args.quiet:
                    utils.log(i, idx, sim)
                if bestScore is None or sim > bestScore:
                    bestScore = sim
                    bestIdx = idx
                    # Always track the name of the current best so that a stale
                    # name from a lower scoring hit is never reported.
                    bestName = name
                if name:
                    mol.SetDoubleProp(field_Similarity + "_" + name, sim)
                else:
                    mol.SetDoubleProp(
                        field_Similarity + "_" + str(idx) + "_Score", sim)

            if hits > 0:
                count += 1
                mol.SetDoubleProp(field_Similarity + "_BestScore", bestScore)
                if bestName:
                    mol.SetProp(field_Similarity + "_BestName", bestName)
                else:
                    mol.SetIntProp(field_Similarity + "_BestIndex", bestIdx)
                mol.SetIntProp(field_Similarity + "_Count", hits)
                writer.write(mol)

        utils.log("Found", count, "similar molecules")
    finally:
        # Release the streams even if screening fails part way through
        writer.flush()
        writer.close()
        source.close()
        sink.close()

    if args.meta:
        utils.write_metrics(output_base, {
            '__InputCount__': i,
            '__OutputCount__': count,
            'RDKitScreen': count
        })

    return count
Exemple #4
0
def main():
    """
    Command line entry point: screen input molecules against a single query structure.

    The query is given either as a smiles string or as a molfile. Every input molecule
    that passes the optional fragment/size filters is compared with the query using the
    selected descriptor and metric; molecules whose similarity lies inside
    [simmin, simmax] get the similarity written as a property and are written out.

    :raises ValueError: If no query structure was specified or it could not be parsed
    """

    ### command line args definitions #########################################

    parser = argparse.ArgumentParser(description='RDKit screen')
    group = parser.add_mutually_exclusive_group()
    group.add_argument(
        '--qsmiles',
        help='query structure as smiles (incompatible with --qmolfile arg)')
    group.add_argument(
        '--qmolfile',
        help=
        'query structure as filename in molfile format (incompatible with --qsmiles arg)'
    )
    parser.add_argument('--simmin',
                        type=float,
                        default=0.7,
                        help='similarity lower cutoff (1.0 means identical)')
    parser.add_argument('--simmax',
                        type=float,
                        default=1.0,
                        help='similarity upper cutoff (1.0 means identical)')
    parser.add_argument('-d',
                        '--descriptor',
                        type=str.lower,
                        choices=list(descriptors.keys()),
                        default='rdkit',
                        help='descriptor or fingerprint type (default rdkit)')
    parser.add_argument('-m',
                        '--metric',
                        type=str.lower,
                        choices=list(metrics.keys()),
                        default='tanimoto',
                        help='similarity metric (default tanimoto)')
    parser.add_argument(
        '-f',
        '--fragment',
        choices=['hac', 'mw'],
        help=
        'Find single fragment if more than one (hac = biggest by heavy atom count, mw = biggest by mol weight )'
    )
    parser.add_argument('--hacmin', type=int, help='Min heavy atom count')
    parser.add_argument('--hacmax', type=int, help='Max heavy atom count')
    parser.add_argument('--mwmin', type=float, help='Min mol weight')
    parser.add_argument('--mwmax', type=float, help='Max mol weight')
    parameter_utils.add_default_io_args(parser)
    parser.add_argument('--thin', action='store_true', help='Thin output mode')
    parser.add_argument('-q',
                        '--quiet',
                        action='store_true',
                        help='Quiet mode')

    args = parser.parse_args()
    utils.log("Screen Args: ", args)

    descriptor = descriptors[args.descriptor.lower()]
    metric = metrics[args.metric.lower()]

    if args.qsmiles:
        query_rdkitmol = Chem.MolFromSmiles(args.qsmiles)
    elif args.qmolfile:
        query_rdkitmol = Chem.MolFromMolFile(args.qmolfile)
    else:
        raise ValueError('No query structure specified')

    # The RDKit parsers signal an unparsable structure by returning None; fail
    # here with a clear message instead of inside the descriptor call.
    if query_rdkitmol is None:
        raise ValueError('Failed to parse query structure')

    query_fp = descriptor(query_rdkitmol)

    source, sink, suppl, writer, output_base = rdkit_utils.default_open_input_output(
        args.input,
        args.informat,
        args.output,
        'screen',
        args.outformat,
        thinOutput=args.thin)

    i = 0
    count = 0
    try:
        for mol in suppl:
            i += 1
            if mol is None:
                continue
            if args.fragment:
                mol = mol_utils.fragment(mol, args.fragment, quiet=args.quiet)
            if not filter.filter(mol,
                                 minHac=args.hacmin,
                                 maxHac=args.hacmax,
                                 minMw=args.mwmin,
                                 maxMw=args.mwmax,
                                 quiet=args.quiet):
                continue
            target_fp = descriptor(mol)
            sim = metric(query_fp, target_fp)

            if args.simmin <= sim <= args.simmax:
                count += 1
                if not args.quiet:
                    utils.log(i, sim)
                mol.SetDoubleProp(field_Similarity, sim)
                writer.write(mol)

        utils.log("Found", count, "similar molecules")
    finally:
        # Release the streams even if screening fails part way through
        writer.flush()
        writer.close()
        source.close()
        sink.close()

    if args.meta:
        utils.write_metrics(output_base, {
            '__InputCount__': i,
            '__OutputCount__': count,
            'RDKitScreen': i
        })
def main():
    """
    Command line entry point: filter input molecules and optionally rewrite their fields.

    Molecules are optionally reduced to a single fragment and then passed through
    filter() with the heavy atom count, mol weight, rotatable bond and logP limits from
    the command line. Molecules that pass have their fields renamed/deleted and
    transformed as requested and are written out, optionally limited to a maximum
    number of records and/or split into chunk files of a fixed size.

    :raises ValueError: If a --rename or --transform argument is malformed
    """
    ### command line args definitions #########################################

    parser = argparse.ArgumentParser(description='RDKit filter')
    parser.add_argument(
        '-f',
        '--fragment',
        choices=['hac', 'mw'],
        help=
        'Find single fragment if more than one (hac = biggest by heavy atom count, mw = biggest by mol weight)'
    )
    parser.add_argument('--hacmin', type=int, help='Min heavy atom count')
    parser.add_argument('--hacmax', type=int, help='Max heavy atom count')
    parser.add_argument('--mwmin', type=float, help='Min mol weight')
    parser.add_argument('--mwmax', type=float, help='Max mol weight')
    parser.add_argument('--rotbmin',
                        type=float,
                        help='Min rotatable bond count')
    parser.add_argument('--rotbmax',
                        type=float,
                        help='Max rotatable bond count')
    parser.add_argument('--logpmin', type=float, help='Min logP')
    parser.add_argument('--logpmax', type=float, help='Max logP')
    parser.add_argument('-l',
                        '--limit',
                        type=int,
                        help='Limit output to this many records')
    parser.add_argument(
        '-c',
        '--chunksize',
        type=int,
        help=
        'Split output into chunks of size c. Output will always be files. Names like filter1.sdf.gz, filter2.sdf.gz ...'
    )
    parser.add_argument(
        '-d',
        '--digits',
        type=int,
        default=0,
        help=
        'When splitting zero pad the file name to this many digits so that they are in sorted order. Names like filter001.sdf.gz, filter002.sdf.gz ...'
    )
    parser.add_argument('-r',
                        '--rename',
                        action='append',
                        help='Rename field (fromname:toname)')
    parser.add_argument(
        '-t',
        '--transform',
        action='append',
        help='Transform field value(fieldname:regex:type). ' +
        'Regex is in the form of /regex/substitution/ (the 3 slashes are required). '
        +
        'Type is of int, float, boolean or string. The type is optional and if not specified then string is assumed. '
        +
        'Transformation occurs after field renaming so specify the new name.')
    parser.add_argument('--delete', action='append', help='Delete field')
    parser.add_argument(
        '--no-gzip',
        action='store_true',
        help='Do not compress the output (STDOUT is never compressed')
    # WARNING: thin output is not appropriate when using --fragment
    parser.add_argument('--thin', action='store_true', help='Thin output mode')
    parser.add_argument(
        '-q',
        '--quiet',
        action='store_true',
        help='Quiet mode - suppress reporting reason for filtering')
    parameter_utils.add_default_io_args(parser)
    args = parser.parse_args()
    utils.log("Filter Args: ", args)

    # Maps original field name -> new name; a value of None means "delete the field"
    field_renames = {}
    if args.rename:
        for spec in args.rename:
            parts = spec.split(':')
            if len(parts) != 2:
                raise ValueError('Invalid field rename argument: %s' % spec)
            field_renames[parts[0]] = parts[1]
    if args.delete:
        # Registered after the renames, so a delete wins over a rename of the same field
        for f in args.delete:
            field_renames[f] = None

    # Per-field transform definitions, all keyed by field name
    field_regexes = {}
    field_replacements = {}
    field_types = {}
    if args.transform:
        for spec in args.transform:
            parts = spec.split(':')
            if len(parts) < 2 or len(parts) > 3:
                raise ValueError('Invalid field transform argument: %s' % spec)
            # "/regex/substitution/" splits into ['', regex, substitution, '']
            terms = parts[1].split('/')
            utils.log("|".join(terms) + str(len(terms)))
            if len(terms) < 3:
                raise ValueError(
                    'Invalid field transform argument (expected /regex/substitution/): %s'
                    % spec)
            field_regexes[parts[0]] = re.compile(terms[1])
            field_replacements[parts[0]] = terms[2]
            if len(parts) == 3:
                type_name = parts[2]
            else:
                type_name = 'string'
            field_types[parts[0]] = type_name
            utils.log("Created transform of " + terms[1] + " to " + terms[2] +
                      " using type of " + type_name)

    # Strings that the 'boolean' transform type treats as False (compared after
    # stripping and lower-casing); any other value is treated as True.
    false_values = ('', 'false', 'f', 'no', 'n', '0')

    source, suppl = rdkit_utils.default_open_input(args.input, args.informat)

    chunkNum = 1
    if args.chunksize:
        if args.output:
            output_base = args.output
        else:
            output_base = 'filter'
        output_base_chunk = output_base + str(chunkNum).zfill(args.digits)
        sink, writer, output_base_chunk = rdkit_utils.default_open_output(
            output_base_chunk,
            output_base_chunk,
            args.outformat,
            thinOutput=args.thin,
            compress=not args.no_gzip)
    else:
        sink, writer, output_base_chunk = rdkit_utils.default_open_output(
            args.output,
            "filter",
            args.outformat,
            thinOutput=args.thin,
            compress=not args.no_gzip)
        output_base = output_base_chunk

    utils.log("Writing to " + output_base_chunk)

    i = 0
    count = 0
    try:
        for mol in suppl:
            if args.limit and count >= args.limit:
                break
            i += 1
            if mol is None:
                continue
            if args.fragment:
                mol = mol_utils.fragment(mol, args.fragment, quiet=args.quiet)
            if not filter(mol,
                          minHac=args.hacmin,
                          maxHac=args.hacmax,
                          minMw=args.mwmin,
                          maxMw=args.mwmax,
                          minRotb=args.rotbmin,
                          maxRotb=args.rotbmax,
                          minLogp=args.logpmin,
                          maxLogp=args.logpmax,
                          quiet=args.quiet):
                continue
            if args.chunksize:
                if count > 0 and count % args.chunksize == 0:
                    # new chunk, so create new writer
                    writer.close()
                    sink.close()
                    chunkNum += 1
                    output_base_chunk = output_base + str(chunkNum).zfill(
                        args.digits)
                    utils.log("Writing to " + output_base_chunk)
                    sink, writer, output_base_chunk = rdkit_utils.default_open_output(
                        output_base_chunk,
                        output_base_chunk,
                        args.outformat,
                        thinOutput=args.thin,
                        compress=not args.no_gzip)

            for from_name in field_renames:
                to_name = field_renames[from_name]
                if mol.HasProp(from_name):
                    val = mol.GetProp(from_name)
                    mol.ClearProp(from_name)
                    if to_name:
                        mol.SetProp(to_name, val)

            for fieldname in field_regexes:
                # Molecules that lack the field are left untouched; asking for a
                # missing property would raise rather than return None.
                if not mol.HasProp(fieldname):
                    continue
                p = mol.GetProp(fieldname)
                regex = field_regexes[fieldname]
                q = regex.sub(field_replacements[fieldname], p)
                type_name = field_types[fieldname]
                if type_name == 'int':
                    mol.SetIntProp(fieldname, int(q))
                elif type_name == 'float':
                    mol.SetDoubleProp(fieldname, float(q))
                elif type_name == 'boolean':
                    # bool() on a string is True for any non-empty text, including
                    # "false" and "0", so interpret the text explicitly.
                    mol.SetBoolProp(fieldname,
                                    q.strip().lower() not in false_values)
                else:
                    mol.SetProp(fieldname, q)

            count += 1
            writer.write(mol)

        utils.log("Filtered", i, "down to", count, "molecules")
        if args.chunksize:
            utils.log("Wrote", chunkNum, "chunks")
            if (args.digits > 0 and len(str(chunkNum)) > args.digits):
                utils.log(
                    "WARNING: not enough digits specified for the number of chunks"
                )
    finally:
        # Release the current streams even if filtering fails part way through
        writer.flush()
        writer.close()
        source.close()
        sink.close()

    if args.meta:
        utils.write_metrics(output_base, {
            '__InputCount__': i,
            '__OutputCount__': count,
            'RDKitFilter': i
        })