parser.add_argument('--min-hac',
                        type=int, default=0,
                        help='Limit processing to molecules with at least this'
                             ' number of heavy atoms')
    parser.add_argument('--max-hac',
                        type=int, default=0,
                        help='Limit processing to molecules with no more than'
                             ' this number of heavy atoms')

    args = parser.parse_args()

    # Create the output directory
    if not os.path.exists(args.output):
        os.mkdir(args.output)
    if not os.path.isdir(args.output):
        error('output ({}) is not a directory'.format(args.output))

    # Sanity-check key arguments
    if args.build_number < 1:
        error('build-number cannot be less then 1 ({})'.format(args.build_number))
    if args.limit < 0:
        error('limit cannot be -ve ({})'.format(args.limit))
    if args.min_hac < 0:
        error('min-hac cannot be -ve ({})'.format(args.min_hac))
    if args.max_hac < 0:
        error('max-hac cannot be -ve ({})'.format(args.max_hac))

    # -------
    # Stage 1 - Process our standardised Vendor File
    # -------
def extract_vendor_compounds(suppliermol_gzip_file,
                             suppliermol_edges_gzip_file,
                             supplier_id,
                             vendor_standard_filename,
                             limit,
                             min_hac,
                             max_hac):
    """Extract vendor compounds from a standardised vendor file.

    The gzip-compressed, tab-separated file is read line by line. Its
    header is checked against the module-level `expected_min_num_cols` and
    `expected_input_cols`; any mismatch is reported through `error()`.

    Every data row that passes the HAC filter is: -

    -   added to `vendor_compounds` (a repeated compound ID is reported
        through `error()`)
    -   recorded against its SMILES. When the isomeric and non-isomeric
        SMILES columns differ the compound goes into `isomol_smiles`,
        `compound_isomer_map` and `non_isomol_isomol_smiles`, otherwise it
        goes into `non_isomol_smiles`
    -   written as one row to the SupplierMol node file and one row to
        the SupplierMol-to-Supplier edges file

    The SMILES values are taken from the file as-is, nothing is
    re-standardised here. Both output files are supplied (already open)
    by the caller.

    :param suppliermol_gzip_file: The SupplierMol node file
    :param suppliermol_edges_gzip_file: The SupplierMol to Supplier edges file
    :param supplier_id: The ID of the supplier node
    :param vendor_standard_filename: The compressed standard file to process
    :param limit: If non-zero, limit processing to only the first N molecules
    :param min_hac: Minimum HAC (0 for no minimum)
    :param max_hac: Maximum HAC (0 for no maximum)

    :returns: The number of molecules processed
    """

    # Only the counters are re-bound here. The module-level maps and sets
    # are mutated in place, which needs no 'global' declaration.
    global num_vendor_iso_mols
    global num_vendor_mols

    logger.info('Processing %s...', vendor_standard_filename)

    num_processed = 0
    with gzip.open(vendor_standard_filename, 'rt') as gzip_file:

        # Check first line (a tab-delimited header).
        # This is a basic sanity-check to make sure the important column
        # names are what we expect.

        hdr = gzip_file.readline()
        field_names = hdr.split('\t')
        # Expected minimum number of columns...
        if len(field_names) < expected_min_num_cols:
            error('expected at least {} columns found {}'.
                  format(expected_min_num_cols, len(field_names)))
        # Check salient columns...
        for col_num in expected_input_cols:
            if field_names[col_num].strip() != expected_input_cols[col_num]:
                error('expected "{}" in column {} found "{}"'.
                      format(expected_input_cols[col_num],
                             col_num,
                             field_names[col_num]))

        # Columns look right...

        for line in gzip_file:

            fields = line.split('\t')
            # Ignore blank lines (they yield a single field at most).
            if len(fields) <= 1:
                continue

            osmiles = fields[osmiles_col]
            hac = int(fields[hac_col])
            iso = fields[iso_smiles_col]
            noniso = fields[noniso_smiles_col]
            # Strip any trailing newline/whitespace from the ID
            compound_id = fields[compound_col].rstrip()

            # If min/max HAC have been provided
            # use them to eliminate compounds.
            # A max_hac of zero means 'no maximum'.
            if hac < min_hac or (max_hac and hac > max_hac):
                continue

            # Add the compound (expected to be unique)
            # to our set of 'all compounds'.
            if compound_id in vendor_compounds:
                error('Duplicate compound ID ({})'.format(compound_id))
            vendor_compounds.add(compound_id)

            # Is it isomeric?
            num_vendor_mols += 1
            if iso != noniso:

                # Yes - record the compound against its isomeric SMILES
                # (starting a new set if the SMILES has not been seen).
                num_vendor_iso_mols += 1
                isomol_smiles.setdefault(iso, set()).add(compound_id)
                compound_isomer_map[compound_id] = iso
                # Put a lookup of iso representation from the non-iso
                non_isomol_isomol_smiles.setdefault(noniso, set()).add(iso)

            else:

                # Not an isomeric representation
                non_isomol_smiles.setdefault(noniso, set()).add(compound_id)

            # Write the SupplierMol entry
            suppliermol_gzip_file.write('{},"{}",Available\n'.
                                        format(compound_id,
                                               osmiles))

            # And add a suitable 'Availability' relationship with the Supplier
            suppliermol_edges_gzip_file. \
                write('{},{},Availability\n'.
                      format(compound_id,
                             supplier_id))

            # Enough?
            num_processed += 1
            if limit and num_processed >= limit:
                break

    return num_processed
# Example no. 3
# 0
def extract_vendor_compounds(suppliermol_gzip_file,
                             suppliermol_edges_gzip_file, supplier_id,
                             vendor_standard_filename, limit, min_hac,
                             max_hac):
    """Process the given file and extract vendor and activity information.

    The gzip-compressed file is read line by line and each line is split
    on whitespace. The header is checked against the module-level
    `expected_min_num_cols` and `expected_input_cols`; any mismatch is
    reported through `error()`.

    For every data row the activity column is converted to a float. A
    value that cannot be converted is counted in
    `num_activity_value_failures` and treated as 0.0 (this happens before
    the HAC filter is applied).

    Every row that passes the HAC filter is: -

    -   added to `vendor_compounds` (a repeated compound ID is reported
        through `error()`)
    -   given an entry in `assay_compound_values` (compound ID to activity)
    -   recorded against its SMILES. When the isomeric and non-isomeric
        SMILES columns differ the compound goes into `isomol_smiles`,
        `compound_isomer_map` and `non_isomol_isomol_smiles`, otherwise it
        goes into `non_isomol_smiles`
    -   written as one row to the SupplierMol node file and one row to
        the SupplierMol-to-Supplier edges file

    :param suppliermol_gzip_file: The SupplierMol node file
    :param suppliermol_edges_gzip_file: The SupplierMol to Supplier edges file
    :param supplier_id: The ID of the supplier node
    :param vendor_standard_filename: The compressed standard file to process
    :param limit: If non-zero, limit processing to only the first N molecules
    :param min_hac: Minimum HAC (0 for no minimum)
    :param max_hac: Maximum HAC (0 for no maximum)

    :returns: The number of molecules processed
    """
    # Only the counters are re-bound here. The module-level maps and sets
    # are mutated in place, which needs no 'global' declaration.
    global num_vendor_iso_mols
    global num_vendor_mols
    global num_activity_value_failures

    logger.info('Processing %s...', vendor_standard_filename)

    num_processed = 0
    with gzip.open(vendor_standard_filename, 'rt') as gzip_file:

        # Check first line (a whitespace-delimited header).
        # This is a basic sanity-check to make sure the important column
        # names are what we expect.

        hdr = gzip_file.readline()
        field_names = hdr.split()
        # Expected minimum number of columns...
        if len(field_names) < expected_min_num_cols:
            error('expected at least {} columns found {}'.format(
                expected_min_num_cols, len(field_names)))
        # Check salient columns...
        for col_num in expected_input_cols:
            if field_names[col_num].strip() != expected_input_cols[col_num]:
                error('expected "{}" in column {} found "{}"'.format(
                    expected_input_cols[col_num], col_num,
                    field_names[col_num]))

        # OK - looks like the column names are right.
        # let's load the data...

        for line in gzip_file:

            fields = line.split()
            # Ignore blank lines. Without this an empty line
            # would fail with an IndexError below.
            if len(fields) <= 1:
                continue

            osmiles = fields[osmiles_col]
            hac = int(fields[hac_col])
            iso = fields[iso_smiles_col]
            noniso = fields[noniso_smiles_col]
            compound_id = fields[compound_col]
            # An unparseable activity is counted and replaced by 0.0
            try:
                activity = float(fields[activity_col])
            except ValueError:
                activity = 0.0
                num_activity_value_failures += 1

            # If min/max HAC have been provided
            # use them to eliminate compounds.
            # A max_hac of zero means 'no maximum'.
            if hac < min_hac or (max_hac and hac > max_hac):
                continue

            # The compound ID must be unique
            if compound_id in vendor_compounds:
                error('Duplicate compound "{}"'.format(compound_id))
            vendor_compounds.add(compound_id)

            # Add activity to compound-id map...
            assay_compound_values[compound_id] = activity

            # Is it isomeric?
            num_vendor_mols += 1
            if iso != noniso:

                # Yes - record the compound against its isomeric SMILES
                # (starting a new set if the SMILES has not been seen).
                num_vendor_iso_mols += 1
                isomol_smiles.setdefault(iso, set()).add(compound_id)
                compound_isomer_map[compound_id] = iso
                # Put a lookup of iso representation from the non-iso
                non_isomol_isomol_smiles.setdefault(noniso, set()).add(iso)

            else:

                # Not an isomeric representation
                non_isomol_smiles.setdefault(noniso, set()).add(compound_id)

            # Write the SupplierMol entry
            suppliermol_gzip_file.write('{},"{}",Available\n'.format(
                compound_id, osmiles))

            # And add a suitable 'Availability' relationship with the Supplier
            suppliermol_edges_gzip_file. \
                write('{},{},Availability\n'.
                      format(compound_id, supplier_id))

            # Enough?
            num_processed += 1
            if limit and num_processed >= limit:
                break

    return num_processed