# Optional heavy-atom-count (HAC) bounds.
# A value of 0 (the default) leaves the corresponding bound unset.
parser.add_argument('--min-hac',
                    type=int, default=0,
                    help='Limit processing to molecules with at least this'
                         ' number of heavy atoms')
parser.add_argument('--max-hac',
                    type=int, default=0,
                    help='Limit processing to molecules with no more than'
                         ' this number of heavy atoms')

args = parser.parse_args()

# Create the output directory
# (only the final path component is created, parents must already exist).
if not os.path.exists(args.output):
    os.mkdir(args.output)
# The path may have existed already as something other than a directory.
if not os.path.isdir(args.output):
    error('output ({}) is not a directory'.format(args.output))

# Sanity-check key arguments
if args.build_number < 1:
    error('build-number cannot be less than 1 ({})'.format(args.build_number))
if args.limit < 0:
    error('limit cannot be -ve ({})'.format(args.limit))
if args.min_hac < 0:
    error('min-hac cannot be -ve ({})'.format(args.min_hac))
if args.max_hac < 0:
    error('max-hac cannot be -ve ({})'.format(args.max_hac))

# -------
# Stage 1 - Process our standardised Vendor File
# -------
def extract_vendor_compounds(suppliermol_gzip_file,
                             suppliermol_edges_gzip_file,
                             supplier_id,
                             vendor_standard_filename,
                             limit,
                             min_hac,
                             max_hac):
    """Process the given standard file and extract the vendor compounds.

    The file is read as gzip-compressed, tab-separated text. The first line
    is a header whose column names are checked against the module-level
    ``expected_input_cols`` before any data is read.

    For every compound that passes the HAC filter this function:

    -   writes a ``<compound id>,"<original SMILES>",Available`` row
        to the SupplierMol node file
    -   writes a ``<compound id>,<supplier id>,Availability`` row
        to the SupplierMol-to-Supplier edges file
    -   records the compound in the module-level ``vendor_compounds`` set
        and in the isomer/non-isomer SMILES lookup maps

    A compound is treated as isomeric when its isomeric SMILES column
    differs from its non-isomeric SMILES column.

    NOTE(review): ``error()`` is defined elsewhere and is assumed
    not to return - confirm.

    :param suppliermol_gzip_file: The SupplierMol node file
                                  (an open, writable text file object)
    :param suppliermol_edges_gzip_file: The SupplierMol to Supplier edges
                                        file (an open, writable text file
                                        object)
    :param supplier_id: The ID of the supplier node
    :param vendor_standard_filename: The compressed standard file to process
    :param limit: If non-zero, limit processing to only the first N molecules
    :param min_hac: Minimum HAC (0 for no minimum)
    :param max_hac: Maximum HAC (0 for no maximum)
    :returns: The number of molecules processed
    :raises ValueError: If a HAC column value is not an integer
    """
    # Only the counters are re-bound here. The lookup maps and sets
    # are mutated in place and so need no 'global' declaration.
    global num_vendor_iso_mols
    global num_vendor_mols

    logger.info('Processing %s...', vendor_standard_filename)

    num_processed = 0
    with gzip.open(vendor_standard_filename, 'rt') as gzip_file:

        # Check first line (a tab-delimited header).
        # This is a basic sanity-check to make sure the important column
        # names are what we expect.
        hdr = gzip_file.readline()
        field_names = hdr.split('\t')
        # Expected minimum number of columns...
        if len(field_names) < expected_min_num_cols:
            error('expected at least {} columns found {}'.
                  format(expected_min_num_cols, len(field_names)))
        # Check salient columns...
        for col_num in expected_input_cols:
            if field_names[col_num].strip() != expected_input_cols[col_num]:
                error('expected "{}" in column {} found "{}"'.
                      format(expected_input_cols[col_num],
                             col_num,
                             field_names[col_num]))

        # Columns look right...

        for line in gzip_file:

            fields = line.split('\t')
            # Skip blank (or otherwise column-less) lines
            if len(fields) <= 1:
                continue

            osmiles = fields[osmiles_col]
            hac = int(fields[hac_col])
            iso = fields[iso_smiles_col]
            noniso = fields[noniso_smiles_col]
            # The ID may be the last column, so strip the line ending
            compound_id = fields[compound_col].rstrip()

            # If min/max HAC have been provided
            # use them to eliminate compounds.
            # (a max_hac of zero means 'no maximum')
            if hac < min_hac:
                continue
            if max_hac and hac > max_hac:
                continue

            # Add the compound (expected to be unique)
            # to our set of 'all compounds'.
            if compound_id in vendor_compounds:
                error('Duplicate compound ID ({})'.format(compound_id))
            vendor_compounds.add(compound_id)

            # Is it isomeric?
            num_vendor_mols += 1
            if iso != noniso:
                # Yes
                num_vendor_iso_mols += 1
                # Add the compound to the set of compounds
                # that share this standardised isomeric SMILES
                # (starting a new set if this is the first).
                isomol_smiles.setdefault(iso, set()).add(compound_id)
                compound_isomer_map[compound_id] = iso
                # Put a lookup of iso representation from the non-iso
                non_isomol_isomol_smiles.setdefault(noniso, set()).add(iso)
            else:
                # Not an isomeric representation
                non_isomol_smiles.setdefault(noniso, set()).add(compound_id)

            # Write the SupplierMol entry
            suppliermol_gzip_file.write('{},"{}",Available\n'.
                                        format(compound_id, osmiles))
            # And add a suitable 'Availability' relationship with the Supplier
            suppliermol_edges_gzip_file.write('{},{},Availability\n'.
                                              format(compound_id,
                                                     supplier_id))

            # Enough?
            num_processed += 1
            if limit and num_processed >= limit:
                break

    return num_processed
def extract_vendor_compounds(suppliermol_gzip_file,
                             suppliermol_edges_gzip_file,
                             supplier_id,
                             vendor_standard_filename,
                             limit,
                             min_hac,
                             max_hac):
    """Process the given standard file and extract the vendor compounds.

    The file is read as gzip-compressed text whose columns are separated
    by whitespace (``str.split()`` with no argument). The first line is a
    header whose column names are checked against the module-level
    ``expected_input_cols`` before any data is read.

    For every compound that passes the HAC filter this function:

    -   writes a ``<compound id>,"<original SMILES>",Available`` row
        to the SupplierMol node file
    -   writes a ``<compound id>,<supplier id>,Availability`` row
        to the SupplierMol-to-Supplier edges file
    -   records the compound in the module-level ``vendor_compounds`` set,
        its activity in ``assay_compound_values`` and its SMILES in the
        isomer/non-isomer lookup maps

    An activity value that cannot be converted to a float is counted in
    ``num_activity_value_failures`` and recorded as 0.0. The conversion
    takes place before the HAC filter, so failures are also counted for
    compounds that are subsequently filtered out.

    A compound is treated as isomeric when its isomeric SMILES column
    differs from its non-isomeric SMILES column.

    NOTE(review): ``error()`` is defined elsewhere and is assumed
    not to return - confirm.

    :param suppliermol_gzip_file: The SupplierMol node file
                                  (an open, writable text file object)
    :param suppliermol_edges_gzip_file: The SupplierMol to Supplier edges
                                        file (an open, writable text file
                                        object)
    :param supplier_id: The ID of the supplier node
    :param vendor_standard_filename: The compressed standard file to process
    :param limit: If non-zero, limit processing to only the first N molecules
    :param min_hac: Minimum HAC (0 for no minimum)
    :param max_hac: Maximum HAC (0 for no maximum)
    :returns: The number of molecules processed
    :raises ValueError: If a HAC column value is not an integer
    """
    # Only the counters are re-bound here. The lookup maps and sets
    # are mutated in place and so need no 'global' declaration.
    global num_vendor_iso_mols
    global num_vendor_mols
    global num_activity_value_failures

    logger.info('Processing %s...', vendor_standard_filename)

    num_processed = 0
    with gzip.open(vendor_standard_filename, 'rt') as gzip_file:

        # Check first line (a whitespace-delimited header).
        # This is a basic sanity-check to make sure the important column
        # names are what we expect.
        hdr = gzip_file.readline()
        field_names = hdr.split()
        # Expected minimum number of columns...
        if len(field_names) < expected_min_num_cols:
            error('expected at least {} columns found {}'.format(
                expected_min_num_cols, len(field_names)))
        # Check salient columns...
        for col_num in expected_input_cols:
            if field_names[col_num].strip() != expected_input_cols[col_num]:
                error('expected "{}" in column {} found "{}"'.format(
                    expected_input_cols[col_num],
                    col_num,
                    field_names[col_num]))

        # OK - looks like the column names are right.
        # let's load the data...

        for line in gzip_file:

            fields = line.split()
            # Skip blank (or otherwise column-less) lines
            # rather than failing on the column lookups below.
            if len(fields) <= 1:
                continue

            osmiles = fields[osmiles_col]
            hac = int(fields[hac_col])
            iso = fields[iso_smiles_col]
            noniso = fields[noniso_smiles_col]
            compound_id = fields[compound_col]
            # A non-numeric activity is counted and left as 0.0
            activity = 0.0
            try:
                activity = float(fields[activity_col].rstrip())
            except ValueError:
                num_activity_value_failures += 1

            # If min/max HAC have been provided
            # use them to eliminate compounds.
            # (a max_hac of zero means 'no maximum')
            if hac < min_hac:
                continue
            if max_hac and hac > max_hac:
                continue

            # The compound ID must be unique
            if compound_id in vendor_compounds:
                error('Duplicate compound "{}"'.format(compound_id))
            vendor_compounds.add(compound_id)

            # Add activity to compound-id map...
            assay_compound_values[compound_id] = activity

            # Is it isomeric?
            num_vendor_mols += 1
            if iso != noniso:
                # Yes
                num_vendor_iso_mols += 1
                # Add the compound to the set of compounds
                # that share this standardised isomeric SMILES
                # (starting a new set if this is the first).
                isomol_smiles.setdefault(iso, set()).add(compound_id)
                compound_isomer_map[compound_id] = iso
                # Put a lookup of iso representation from the non-iso
                non_isomol_isomol_smiles.setdefault(noniso, set()).add(iso)
            else:
                # Not an isomeric representation
                non_isomol_smiles.setdefault(noniso, set()).add(compound_id)

            # Write the SupplierMol entry
            suppliermol_gzip_file.write('{},"{}",Available\n'.format(
                compound_id, osmiles))
            # And add a suitable 'Availability' relationship with the Supplier
            suppliermol_edges_gzip_file.write('{},{},Availability\n'.format(
                compound_id, supplier_id))

            # Enough?
            num_processed += 1
            if limit and num_processed >= limit:
                break

    return num_processed