Python write_squonk_datasetmetadata Examples

Programming Language: Python

Namespace/Package Name: pipelines_utils.utils

Method/Function: write_squonk_datasetmetadata

Examples at hotexamples.com: 5

Python write_squonk_datasetmetadata - 5 examples found. These are the top-rated real-world Python examples of pipelines_utils.utils.write_squonk_datasetmetadata extracted from open source projects. You can rate examples to help us improve the quality of examples.

Example #1

Show file

def default_open_output_json(outputDef, outputBase, thinOutput, compress,
                             valueClassMappings, datasetMetaProps,
                             fieldMetaProps):
    """Write the Squonk dataset metadata, then open a JSON writer for the data.

    The metadata is written first via ``utils.write_squonk_datasetmetadata``
    using ``outputBase``; the data stream is then opened with
    ``utils.open_output(outputDef, 'data', compress)``.

    Returns:
        A ``(output, writer)`` tuple, where ``writer`` wraps ``output`` and is
        a ``ThinJsonWriter`` when ``thinOutput`` is truthy, otherwise a
        ``ThickJsonWriter``.
    """
    # Squonk needs the metadata written alongside the data
    utils.write_squonk_datasetmetadata(outputBase, thinOutput,
                                       valueClassMappings, datasetMetaProps,
                                       fieldMetaProps)

    output = utils.open_output(outputDef, 'data', compress)

    writer_cls = ThinJsonWriter if thinOutput else ThickJsonWriter
    return output, writer_cls(output)

Example #2

Show file

File: filter_interactions.py Project: pk-organics/pipelines

def main():
    """Command-line entry point for the interaction filter.

    Parses the command-line arguments, opens the input and output through
    ``rdkit_utils``, hands the records to ``execute`` together with a
    ``<output_base>.report`` file, and closes every handle afterwards (also
    when ``execute`` raises). When the requested output format is ``json`` the
    Squonk dataset metadata is re-written with the final record count, and
    when ``args.meta`` is truthy a metrics file is written as well.
    """
    ### command line args definitions #########################################

    parser = argparse.ArgumentParser(description='Filter interactions')
    parameter_utils.add_default_io_args(parser)
    parser.add_argument('-f',
                        '--group-by-field',
                        required=True,
                        help='Field to group records by (must be sequential)')
    parser.add_argument('-s',
                        '--score-field',
                        required=True,
                        help='Field to use to rank records within a group')
    parser.add_argument('-d',
                        '--score-descending',
                        action='store_true',
                        help='Sort records in descending order')
    parser.add_argument('-x',
                        '--stats-fields',
                        nargs='*',
                        help='Field to use for summary statistics')

    parser.add_argument('-q',
                        '--quiet',
                        action='store_true',
                        help='Quiet mode')
    # NOTE(review): --thin is parsed but thinOutput is hard-coded to False
    # below; confirm whether the flag is meant to be honoured.
    parser.add_argument('--thin', action='store_true', help='Thin output mode')
    parser.add_argument(
        '--no-gzip',
        action='store_true',
        help='Do not compress the output (STDOUT is never compressed)')

    args = parser.parse_args()
    utils.log("filter_interactions: ", args)

    # handle metadata
    source = "filter_interactions.py"
    datasetMetaProps = {
        "source": source,
        "description": "Filter by interactions"
    }
    # No extra fields are declared by this script, so both the class mappings
    # and the field metadata are empty.
    clsMappings = {}
    fieldMetaProps = []

    input_handle, suppl = rdkit_utils.default_open_input(
        args.input, args.informat)
    output, writer, output_base = rdkit_utils.default_open_output(
        args.output,
        'filter_interactions',
        args.outformat,
        thinOutput=False,
        valueClassMappings=clsMappings,
        datasetMetaProps=datasetMetaProps,
        fieldMetaProps=fieldMetaProps,
        compress=not args.no_gzip)

    try:
        # The context manager guarantees the report is closed even if
        # execute() fails part-way through.
        with open(output_base + '.report', 'wt') as report_file:
            count, total, errors = execute(suppl, writer, report_file,
                                           args.group_by_field,
                                           args.score_field,
                                           args.score_descending,
                                           args.stats_fields)

        utils.log(count, total, errors)
    finally:
        # Release the handles on both the success and the failure path.
        if input_handle:
            input_handle.close()
        writer.flush()
        writer.close()
        output.close()

    # re-write the metadata as we now know the size
    # NOTE(review): sibling scripts compare the result of
    # utils.determine_output_format(args.outformat) rather than the raw
    # argument; confirm whether that normalisation is needed here too.
    if args.outformat == 'json':
        utils.write_squonk_datasetmetadata(output_base,
                                           False,
                                           clsMappings,
                                           datasetMetaProps,
                                           fieldMetaProps,
                                           size=total)

    if args.meta:
        utils.write_metrics(
            output_base, {
                '__InputCount__': count,
                '__OutputCount__': total,
                '__ErrorCount__': errors,
                'FilterInteractions': total
            })

Example #3

Show file

File: sanifier.py Project: pk-organics/pipelines

def main():
    """Command-line entry point for the RDKit standardizer / enumerator.

    Parses the command-line arguments and then, for every non-None molecule
    from the input supplier, either standardizes it (``--standardize``) or
    optionally enumerates tautomers and/or stereoisomers, writing the results
    through ``write_out``. Afterwards the Squonk dataset metadata is
    re-written with the output count when the output format is ``json``, and
    a metrics file is written when ``args.meta`` is truthy.

    Returns:
        The number of outputs reported by ``write_out``.

    Raises:
        ValueError: if ``--standardize`` is combined with either enumerate
            option, or if SMILES molecule format is requested for SDF output.
    """

    ### command line args definitions #########################################

    parser = argparse.ArgumentParser(
        description='RDKit molecule standardizer / enumerator')
    parameter_utils.add_default_io_args(parser)
    parser.add_argument('-et',
                        '--enumerate_tauts',
                        action='store_true',
                        help='Enumerate all tautomers')
    parser.add_argument('-es',
                        '--enumerate_stereo',
                        action='store_true',
                        help='Enumerate all stereoisomers')
    parser.add_argument(
        '-st',
        '--standardize',
        action='store_true',
        help='Standardize molecules. Cannot  be true if enumerate is on.')
    parser.add_argument('-stm',
                        '--standardize_method',
                        default="molvs",
                        choices=STANDARD_MOL_METHODS.keys(),
                        help="Choose the method to standardize.")
    parser.add_argument('-mf',
                        '--mol_format',
                        choices=['smiles', 'mol_2d', 'mol_3d'],
                        help="Format for molecules.")

    args = parser.parse_args()

    utils.log("Sanifier Args: ", args)

    if args.standardize and args.enumerate_tauts:
        raise ValueError("Cannot Enumerate Tautomers and Standardize")

    if args.standardize and args.enumerate_stereo:
        raise ValueError("Cannot Enumerate Stereo and Standardize")

    if args.outformat == 'sdf' and args.mol_format == 'smiles':
        raise ValueError("Smiles cannot be used when outputting as SDF")

    if args.standardize:
        getStandardMolecule = STANDARD_MOL_METHODS[args.standardize_method]

    # handle metadata
    source = "sanifier.py"
    datasetMetaProps = {
        "source": source,
        "description": "Enumerate tautomers and stereoisomers"
    }
    clsMappings = {
        "EnumTautIsoSourceMolUUID": "java.lang.String",
        "EnumTautIsoSourceMolIdx": "java.lang.Integer"
    }
    fieldMetaProps = [{
        "fieldName": "EnumTautIsoSourceMolUUID",
        "values": {
            "source": source,
            "description": "UUID of source molecule"
        }
    }, {
        "fieldName": "EnumTautIsoSourceMolIdx",
        "values": {
            "source": source,
            "description": "Index of source molecule"
        }
    }]

    oformat = utils.determine_output_format(args.outformat)

    input_handle, output, suppl, writer, output_base = rdkit_utils. \
        default_open_input_output(args.input, args.informat, args.output,
                                  'sanifier', args.outformat,
                                  thinOutput=False, valueClassMappings=clsMappings,
                                  datasetMetaProps=datasetMetaProps,
                                  fieldMetaProps=fieldMetaProps)

    i = 0  # number of records read, including unparsable (None) ones
    count = 0  # running output count as returned by write_out
    errors = 0  # number of molecules whose standardization failed
    for mol in suppl:
        i += 1
        if mol is None:
            continue

        if args.standardize:
            # we keep the original UUID as there is still a 1-to-1 relationship between the input and outputs
            # Guard the lookup the same way the enumerate branch does, so a
            # molecule without a uuid property does not abort the whole run.
            oldUUID = mol.GetProp("uuid") if mol.HasProp("uuid") else None
            inputCanSmiles = Chem.MolToSmiles(mol,
                                              isomericSmiles=True,
                                              canonical=True)
            try:
                std = getStandardMolecule(mol)
                outputCanSmiles = Chem.MolToSmiles(std,
                                                   isomericSmiles=True,
                                                   canonical=True)
                if oldUUID:
                    std.SetProp("uuid", oldUUID)
                #utils.log("Standardized", i, inputCanSmiles, ">>", outputCanSmiles)
                if inputCanSmiles == outputCanSmiles:
                    std.SetProp("Standardized", "False")
                else:
                    std.SetProp("Standardized", "True")
            except Exception:
                # Best effort: record the failure and pass the input molecule
                # through unchanged. KeyboardInterrupt / SystemExit are
                # deliberately no longer swallowed.
                errors += 1
                utils.log("Error standardizing", sys.exc_info()[0])
                std = mol
                std.SetProp("Standardized", "Error")

            count = write_out([std], count, writer, args.mol_format,
                              args.outformat)
        else:
            # we want a new UUID generating as we are generating new molecules
            if mol.HasProp('uuid'):
                parentUuid = mol.GetProp("uuid")
            else:
                parentUuid = None

            results = []

            if args.enumerate_tauts:
                utils.log("Enumerating tautomers")
                results = enumerateTautomers(mol)
            else:
                results.append(mol)

            if args.enumerate_stereo:
                utils.log("Enumerating steroisomers")
                mols = results
                results = []
                for m in mols:
                    enumerated = enumerateStereoIsomers(m)
                    results.extend(enumerated)

            for m in results:
                # copy the src mol props
                for name in mol.GetPropNames():
                    m.SetProp(name, mol.GetProp(name))
                # add our new props
                m.ClearProp("uuid")
                m.SetIntProp("EnumTautIsoSourceMolIdx", i)
                if parentUuid:
                    m.SetProp("EnumTautIsoSourceMolUUID", parentUuid)

            count = write_out(results, count, writer, args.mol_format,
                              args.outformat)

    utils.log("Handled " + str(i) + " molecules, resulting in " + str(count) +
              " outputs")

    writer.flush()
    writer.close()
    # Guarded for consistency with the sibling scripts, which allow the input
    # handle to be falsy.
    if input_handle:
        input_handle.close()
    output.close()

    # re-write the metadata as we now know the size
    if oformat == 'json':
        utils.write_squonk_datasetmetadata(output_base,
                                           False,
                                           clsMappings,
                                           datasetMetaProps,
                                           fieldMetaProps,
                                           size=count)

    if args.meta:
        utils.write_metrics(
            output_base, {
                '__InputCount__': i,
                '__OutputCount__': count,
                '__ErrorCount__': errors,
                'RDKitSanify': count
            })

    return count

Example #4

Show file

File: enumerate_charges.py Project: pk-organics/pipelines

def main():
    """Command-line entry point for charge enumeration.

    Parses the command-line arguments, opens the input and output through
    ``rdkit_utils``, and for every non-None molecule calls ``enumerateMol``
    and writes the results with ``writeEnumeratedMols``. ``sys.argv`` is
    temporarily replaced around each ``enumerateMol`` call and always
    restored. Afterwards the handles are closed, the Squonk dataset metadata
    is re-written with the output total when the output format is ``json``,
    and a metrics file is written when ``args.meta`` is truthy.
    """
    ### command line args definitions #########################################

    parser = argparse.ArgumentParser(description='Enumerate charges')
    parser.add_argument(
        '--fragment-method',
        choices=['hac', 'mw'],
        help=
        'Approach to find biggest fragment if more than one (hac = biggest by heavy atom count, mw = biggest by mol weight)'
    )
    parser.add_argument('--min-ph',
                        help='The min pH to consider',
                        type=float,
                        default=5.0)
    parser.add_argument('--max-ph',
                        help='The max pH to consider',
                        type=float,
                        default=9.0)

    parameter_utils.add_default_io_args(parser)
    parser.add_argument('-q',
                        '--quiet',
                        action='store_true',
                        help='Quiet mode')
    # NOTE(review): --thin is parsed but thinOutput is hard-coded to False
    # below; confirm whether the flag is meant to be honoured.
    parser.add_argument('--thin', action='store_true', help='Thin output mode')

    args = parser.parse_args()
    utils.log("Enumerate charges: ", args)

    # handle metadata
    source = "enumerate_charges.py"
    datasetMetaProps = {
        "source": source,
        "description": "Enumerate charges using Dimorphite-dl"
    }
    clsMappings = {
        "EnumChargesSrcMolUUID": "java.lang.String",
        "EnumChargesSrcMolIdx": "java.lang.Integer"
    }
    fieldMetaProps = [{
        "fieldName": "EnumChargesSrcMolUUID",
        "values": {
            "source": source,
            "description": "UUID of source molecule"
        }
    }, {
        "fieldName": "EnumChargesSrcMolIdx",
        "values": {
            "source": source,
            "description": "Index of source molecule"
        }
    }]

    oformat = utils.determine_output_format(args.outformat)

    input_handle, output, suppl, writer, output_base = rdkit_utils. \
        default_open_input_output(args.input, args.informat, args.output,
                                  'enumerateCharges', args.outformat,
                                  thinOutput=False, valueClassMappings=clsMappings,
                                  datasetMetaProps=datasetMetaProps,
                                  fieldMetaProps=fieldMetaProps)

    count = 0  # molecules processed (None entries are skipped)
    total = 0  # sum of the first value returned by writeEnumeratedMols
    errors = 0  # sum of the second value returned by writeEnumeratedMols
    min_ph = args.min_ph
    max_ph = args.max_ph

    # this hacky bit is needed because the dimorphite entrypoint assumes its args are passed using argparse
    # but it doesn't understand our args, so we need to switch between the two sets of args.
    dimorphite_sys_argv = sys.argv[:1]
    dimorphite_sys_argv.append('--min_ph')
    dimorphite_sys_argv.append(str(min_ph))
    dimorphite_sys_argv.append('--max_ph')
    dimorphite_sys_argv.append(str(max_ph))
    fragment = args.fragment_method

    try:
        for mol in suppl:
            if mol is None:
                continue
            count += 1
            orig_sys_argv = sys.argv[:]
            sys.argv = dimorphite_sys_argv
            try:
                enum_mols = enumerateMol(mol, fragment)
            finally:
                # Always put our own argv back, even if enumeration fails, so
                # the process is never left with the substituted arguments.
                sys.argv = orig_sys_argv
            t, e = writeEnumeratedMols(mol, enum_mols, writer, count)
            total += t
            errors += e

        utils.log(count, total, errors)
    finally:
        # Release the handles on both the success and the failure path.
        if input_handle:
            input_handle.close()
        writer.flush()
        writer.close()
        output.close()

    # re-write the metadata as we now know the size
    if oformat == 'json':
        utils.write_squonk_datasetmetadata(output_base,
                                           False,
                                           clsMappings,
                                           datasetMetaProps,
                                           fieldMetaProps,
                                           size=total)

    if args.meta:
        utils.write_metrics(
            output_base, {
                '__InputCount__': count,
                '__OutputCount__': total,
                '__ErrorCount__': errors,
                'EnumerateChargesDimporphite': total
            })

Example #5

Show file

def main():
    """Command-line entry point for the prepare_3d script.

    Parses the command-line arguments, opens the input (a SMILES supplier when
    the input format is ``smi``, otherwise the default input) and the output
    through ``rdkit_utils``, delegates the processing to ``execute`` and
    closes every handle afterwards (also when ``execute`` raises). When the
    output format is ``json`` the Squonk dataset metadata is re-written with
    the output total, and when ``args.meta`` is truthy a metrics file is
    written as well.
    """
    ### command line args definitions #########################################

    # NOTE(review): the description, the field names and the metrics key below
    # all refer to charge enumeration while the log prefix and source name say
    # prepare_3d; they look copied from another script - confirm before
    # changing, since downstream consumers may depend on these strings.
    parser = argparse.ArgumentParser(description='Enumerate charges')
    parser.add_argument('-i',
                        '--input',
                        help="Input file, if not defined the STDIN is used")
    parser.add_argument(
        '-if',
        '--informat',
        choices=['sdf', 'json', 'smi'],
        help="Input format. When using STDIN this must be specified.")
    parameter_utils.add_default_output_args(parser)
    # NOTE(review): --fragment-method and --name-field are parsed but are not
    # forwarded to execute() below - confirm whether that is intentional.
    parser.add_argument(
        '--fragment-method',
        choices=['hac', 'mw'],
        help=
        'Approach to find biggest fragment if more than one (hac = biggest by heavy atom count, mw = biggest by mol weight)'
    )
    parser.add_argument(
        "--minimize",
        type=int,
        default=0,
        help="number of minimisation cycles when generating 3D molecules")
    parser.add_argument('--enumerate-chirals',
                        help='Enumerate undefined chiral centers',
                        type=int,
                        default=0)
    parser.add_argument('--smiles-field',
                        help='Add the SMILES as a field of this name')
    parser.add_argument(
        '--name-field',
        help=
        'Use this field in the input as the name field in the output. If not specified the SMILES is used'
    )
    parser.add_argument('--include-hydrogens',
                        action='store_true',
                        help='Include hydrogens in the output')
    parser.add_argument("--min-charge",
                        type=int,
                        help="Minimum charge of molecule to process")
    parser.add_argument("--max-charge",
                        type=int,
                        help="Maximum charge of molecule to process")
    parser.add_argument("--num-charges",
                        type=int,
                        help="Maximum number of atoms with a charge")

    parser.add_argument('-q',
                        '--quiet',
                        action='store_true',
                        help='Quiet mode')
    # NOTE(review): --thin is parsed but thinOutput is hard-coded to False
    # below; confirm whether the flag is meant to be honoured.
    parser.add_argument('--thin', action='store_true', help='Thin output mode')
    parser.add_argument(
        '--no-gzip',
        action='store_true',
        help='Do not compress the output (STDOUT is never compressed)')

    args = parser.parse_args()
    utils.log("prepare_3d: ", args)

    # handle metadata
    source = "prepare_3d.py"
    datasetMetaProps = {
        "source": source,
        "description": "Enumerate undefined stereo isomers and generate 3D"
    }
    clsMappings = {
        "EnumChargesSrcMolUUID": "java.lang.String",
        "EnumChargesSrcMolIdx": "java.lang.Integer"
    }
    fieldMetaProps = [{
        "fieldName": "EnumChargesSrcMolUUID",
        "values": {
            "source": source,
            "description": "UUID of source molecule"
        }
    }, {
        "fieldName": "EnumChargesSrcMolIdx",
        "values": {
            "source": source,
            "description": "Index of source molecule"
        }
    }]

    oformat = utils.determine_output_format(args.outformat)
    if args.informat == 'smi':
        suppl = rdkit_utils.default_open_input_smiles(args.input,
                                                      delimiter='\t',
                                                      titleLine=False)
        # the SMILES path yields only a supplier, so there is nothing to close
        input_handle = None
    else:
        input_handle, suppl = rdkit_utils.default_open_input(
            args.input, args.informat)
    output, writer, output_base = rdkit_utils.default_open_output(
        args.output,
        'enumerate_molecules',
        args.outformat,
        thinOutput=False,
        valueClassMappings=clsMappings,
        datasetMetaProps=datasetMetaProps,
        fieldMetaProps=fieldMetaProps,
        compress=not args.no_gzip)

    try:
        count, total, errors = execute(
            suppl,
            writer,
            minimize=args.minimize,
            enumerate_chirals=args.enumerate_chirals,
            smiles_field=args.smiles_field,
            include_hs=args.include_hydrogens,
            min_charge=args.min_charge,
            max_charge=args.max_charge,
            num_charges=args.num_charges)

        utils.log(count, total, errors)
    finally:
        # Release the handles on both the success and the failure path.
        if input_handle:
            input_handle.close()
        writer.flush()
        writer.close()
        output.close()

    # re-write the metadata as we now know the size
    if oformat == 'json':
        utils.write_squonk_datasetmetadata(output_base,
                                           False,
                                           clsMappings,
                                           datasetMetaProps,
                                           fieldMetaProps,
                                           size=total)

    if args.meta:
        utils.write_metrics(
            output_base, {
                '__InputCount__': count,
                '__OutputCount__': total,
                '__ErrorCount__': errors,
                'EnumerateChargesDimporphite': total
            })