def standardise_vendor_compounds(output_file, file_name, limit):
    """Process the given file and standardise the vendor information,
    writing it as tab-separated fields to the output.

    As we load the Vendor compounds we 'standardise' the SMILES and
    determine whether they represent an isomer or not.

    The first line of the input is treated as a whitespace-delimited
    header and is checked against the module-level
    ``expected_min_num_cols`` and ``expected_input_cols`` before any
    compound line is read. Problems are reported through ``error()``
    (presumably fatal -- confirm against its definition).

    :param output_file: The tab-separated standardised output file
    :param file_name: The file to process (opened as plain text)
    :param limit: Limit processing to this number of values (or all if 0)
    :returns: The number of items processed (i.e. written to the output)
    """
    global vendor_compounds
    global num_vendor_mols
    global num_vendor_molecule_failures

    logger.info('Standardising %s...', file_name)

    line_num = 0
    num_processed = 0

    with open(file_name, 'rt') as input_file:

        # Check first line (a space-delimited header).
        # This is a basic sanity-check to make sure the important column
        # names are what we expect.

        hdr = input_file.readline()
        field_names = hdr.split()

        # Expected minimum number of columns...
        if len(field_names) < expected_min_num_cols:
            # Report the required column *count*
            # (not the column-name mapping).
            error('expected at least {} columns found {}'.format(
                expected_min_num_cols, len(field_names)))

        # Check salient columns (ignoring case)...
        for col_num in expected_input_cols:
            actual_name = field_names[col_num].strip().lower()
            if actual_name != expected_input_cols[col_num]:
                error('expected "{}" in column {} found "{}"'.format(
                    expected_input_cols[col_num], col_num, actual_name))

        # Columns look right...

        for line in input_file:

            # line_num counts every data line, including skipped ones.
            line_num += 1
            fields = line.split()
            if len(fields) <= 1:
                # Blank (or single-token) line - nothing to process.
                continue

            if line_num % report_rate == 0:
                logger.info(' ...at compound {:,}'.format(line_num))

            osmiles = fields[smiles_col]
            compound_id = real_prefix + fields[compound_col]

            # Add the compound (expected to be unique)
            # to our set of 'all compounds'.
            if compound_id in vendor_compounds:
                error('Duplicate compound ID ({})'.format(compound_id))
            vendor_compounds.add(compound_id)

            # Standardise and update the global counters.
            # A record that cannot be standardised is counted
            # as a failure and skipped.
            std_info = standardise_utils.standardise(osmiles)
            if not std_info.std:
                num_vendor_molecule_failures += 1
                continue
            num_vendor_mols += 1

            # Write the standardised data
            output = [
                osmiles,
                std_info.iso,
                std_info.noniso,
                std_info.hac,
                compound_id,
            ]
            output_file.write('\t'.join(output) + '\n')

            # Enough?
            num_processed += 1
            if limit and num_processed >= limit:
                break

    return num_processed
def standardise_vendor_compounds(output_file, file_name, limit):
    """Process the given file and standardise the vendor information,
    writing it as tab-separated fields to the output.

    As we load the Vendor compounds we 'standardise' the SMILES and
    determine whether they represent an isomer or not.

    The input is read as a gzip-compressed SD file. Each molecule's
    ``chembl_id`` property supplies the compound identifier and its
    SMILES is generated from the parsed molecule. Duplicate identifiers
    are reported through ``error()`` (presumably fatal -- confirm against
    its definition).

    :param output_file: The tab-separated standardised output file
    :param file_name: The (compressed) file to process
    :param limit: Limit processing to this number of values (or all if 0)
    :returns: The number of items processed (i.e. written to the output)
    """
    global vendor_compounds
    global num_vendor_mols
    global num_vendor_molecule_failures

    logger.info('Standardising %s...', file_name)

    count = 0
    num_processed = 0

    # Length of the textual prefix stripped from each 'chembl_id' value.
    chembl_tag_len = len('CHEMBL')

    # Named 'sdf_file' (rather than 'input') so that the
    # builtin of that name is not shadowed.
    with gzip.open(file_name, 'rb') as sdf_file:

        suppl = Chem.ForwardSDMolSupplier(sdf_file)
        for mol in suppl:

            # count covers every record, including unreadable ones.
            count += 1
            if mol is None:
                # The supplier could not parse this record - skip it.
                logger.warning("Failed to handle record %s", count)
                continue

            chembl_id = mol.GetProp('chembl_id')

            if count % report_rate == 0:
                logger.info(' ...at compound {:,}'.format(count))

            osmiles = Chem.MolToSmiles(mol)

            # Generate the ID. The data in the chembl_id field looks like
            # CHEMBL153534 so we need to strip off the CHEMBL bit, as our
            # generated ID looks like CHEMBL:153534
            compound_id = vendor_prefix + chembl_id[chembl_tag_len:]

            # Add the compound (expected to be unique)
            # to our set of 'all compounds'.
            if compound_id in vendor_compounds:
                error('Duplicate compound ID ({})'.format(compound_id))
            vendor_compounds.add(compound_id)

            # Standardise and update the global counters.
            # A record that cannot be standardised is counted
            # as a failure and skipped.
            std_info = standardise_utils.standardise(osmiles)
            if not std_info.std:
                num_vendor_molecule_failures += 1
                continue
            num_vendor_mols += 1

            # Write the standardised data
            output = [
                osmiles,
                std_info.iso,
                std_info.noniso,
                std_info.hac,
                compound_id,
            ]
            output_file.write('\t'.join(output) + '\n')

            # Enough?
            num_processed += 1
            if limit and num_processed >= limit:
                break

    return num_processed