def import_mol_dir(mine_db, target, name_field="Name", overwrite=False):
    """Import a directory of molfiles as a MINE database.

    Every directory entry whose file name contains ".mol" is read with RDKit.
    Entries that RDKit cannot parse are skipped. One record describing the
    import is written to the database's ``meta_data`` collection at the end.

    :param mine_db: a MINE object, the database to insert the compounds into
    :param target: a path, the molfile directory to be loaded
    :param name_field: a string, the field for the compound name
    :param overwrite: a bool, if true, new compounds replace the old
        compounds in the database
    """
    mol_suffix = '.mol'
    for file_name in os.listdir(target):
        if mol_suffix not in file_name:
            continue
        # MolFromMolFile (rdkit) generates Mol objects from .mol files
        mol = AllChem.MolFromMolFile(os.path.join(target, file_name))
        # MolFromMolFile yields a falsy value when parsing fails
        if not mol:
            continue
        # The compound name is the file name without its ".mol" extension.
        # str.rstrip('.mol') must not be used here: it strips any trailing
        # run of the characters '.', 'm', 'o', 'l' and would turn
        # "methanol.mol" into "methan".
        if file_name.endswith(mol_suffix):
            name = file_name[:-len(mol_suffix)]
        else:
            name = file_name
        # Create hashkey for the compound
        comphash = utils.compound_hash(mol)
        # If we don't want to overwrite and the compound (comphash) already
        # exists, only add this name to the stored compound's name field
        if not overwrite and mine_db.compounds.count({"_id": comphash}):
            mine_db.compounds.update({"_id": comphash},
                                     {"$addToSet": {name_field: name}})
        # Otherwise insert (or replace) the compound in the database
        else:
            mine_db.insert_compound(
                mol,
                compound_dict={name_field: [name], 'Generation': 0},
                pubchem_db=None, kegg_db=None, modelseed_db=None)
    # Add to log file (metadata)
    mine_db.meta_data.insert({
        "Timestamp": datetime.datetime.now(),
        "Action": "MolFiles Imported",
        "Filepath": target
    })
def parse_comps(field):
    """Parse one side of a reaction into stoichiometry tuples and atom counts.

    ``field`` is split on ' // ' into compound names; a name that occurs
    several times is counted, and that count is used as its stoichiometry.
    Relies on ``metacyc2hash``, ``inserted`` and ``mine_db`` from the
    enclosing scope. Compounds not yet recorded in ``inserted`` are inserted
    into ``mine_db`` as generation 0 compounds.

    :param field: a string of compound names separated by ' // '
    :return: a tuple of (list of stoich tuples, Counter of element counts)
    :raises ValueError: if a compound name is not a key of ``metacyc2hash``
    """
    atoms = collections.Counter()
    half_rxn = []
    compounds = collections.Counter(field.split(' // '))
    for comp, stoich in compounds.items():
        if comp not in metacyc2hash:
            raise ValueError('Undefined Compound: %s' % comp)
        # NOTE(review): despite its name, metacyc2hash appears to map names
        # to molecule objects (the value is passed to RDKit) - confirm
        mol = metacyc2hash[comp]
        formula = AllChem.CalcMolFormula(mol)
        # Raw string: '\d' is an invalid escape sequence in a normal literal
        for element, count in re.findall(r'([A-Z][a-z]*)(\d*)', formula):
            # An element symbol without a number occurs once in the formula
            atoms[element] += (int(count) if count else 1) * stoich
        if comp not in inserted:
            mine_db.insert_compound(mol, {'Generation': 0})
            inserted.add(comp)
        half_rxn.append(
            utils.stoich_tuple(stoich, utils.compound_hash(mol)))
    return half_rxn, atoms
def _add_compound(self, id, smi, mol=None, type='Predicted'):
    """Register a compound in the internal compound dictionary.

    The compound is keyed by the hash of its SMILES string. If that hash is
    already present, the existing entry is left untouched. For a new entry,
    an SVG drawing is also written when both ``self.image_dir`` and
    ``self.mine`` are set.

    :param id: the value stored under the entry's 'ID' key
    :param smi: SMILES string of the compound
    :param mol: optional molecule object; built from ``smi`` when falsy
    :param type: the value stored under the entry's 'Type' key
    :return: the hashed _id of the compound
    """
    is_coreactant = type == 'Coreactant'
    _id = utils.compound_hash(smi, is_coreactant)
    self._raw_compounds[smi] = _id

    # A compound already stored by a prior generation must not be
    # overwritten, so stop here when the hashed id is known.
    if _id in self.compounds:
        return _id

    if not mol:
        mol = AllChem.MolFromSmiles(smi)
    inchi_key = AllChem.InchiToInchiKey(AllChem.MolToInchi(mol))
    record = {
        'ID': id,
        '_id': _id,
        "SMILES": smi,
        'Inchikey': inchi_key,
        'Type': type,
        'Generation': self.generation,
        'Formula': AllChem.CalcMolFormula(mol),
        '_atom_count': self._get_atom_count(mol),
        'Charge': AllChem.GetFormalCharge(mol),
        'Reactant_in': [],
        'Product_of': [],
        "Sources": []
    }
    self.compounds[_id] = record

    # Sources are not tracked for coreactants (ids beginning with 'X')
    if _id[0] == 'X':
        del self.compounds[_id]['Sources']

    # Images are only produced when building a mine with an image directory
    if not (self.image_dir and self.mine):
        return _id

    try:
        with open(os.path.join(self.image_dir, _id + '.svg'),
                  'w') as outfile:
            drawable = rdMolDraw2D.PrepareMolForDrawing(mol)
            canvas = rdMolDraw2D.MolDraw2DSVG(1000, 1000)
            canvas.DrawMolecule(drawable)
            canvas.FinishDrawing()
            outfile.write(canvas.GetDrawingText())
    except OSError:
        print("Unable to generate image for %s" % smi)
    return _id
def insert_compound(self, mol_object, compound_dict=None, bulk=None,
                    kegg_db="KEGG", pubchem_db='PubChem-8-28-2015',
                    modelseed_db='ModelSEED'):
    """Save an RDKit molecule as a compound entry in the MINE.

    Computes the fields derived from the molecule, merges them into
    ``compound_dict`` (calculated values win over supplied ones) and
    stores the result. An existing compound with the same _id is replaced.

    :param mol_object: The compound to be stored
    :type mol_object: RDKit Mol object
    :param compound_dict: Additional information about the compound to be
        stored. Overwritten by calculated values. Modified in place.
    :type compound_dict: dict
    :param bulk: A pymongo bulk operation object. If falsy, the compound
        is saved to the database immediately
    :param kegg_db: The ID of the KEGG Mongo database
    :type kegg_db: str
    :param pubchem_db: The ID of the PubChem Mongo database
    :type pubchem_db: str
    :param modelseed_db: The ID of the ModelSEED Mongo database
    :type modelseed_db: str
    :return: The hashed _id of the compound
    :rtype: str
    """
    if compound_dict is None:
        compound_dict = {}

    # Representations of the molecule and the properties derived from it
    smiles = AllChem.MolToSmiles(mol_object, True)
    compound_dict['SMILES'] = smiles
    inchi = AllChem.MolToInchi(mol_object)
    compound_dict['Inchi'] = inchi
    compound_dict['Inchikey'] = AllChem.InchiToInchiKey(inchi)
    mass = AllChem.CalcExactMolWt(mol_object)
    compound_dict['Mass'] = mass
    compound_dict['Formula'] = AllChem.CalcMolFormula(mol_object)
    compound_dict['Charge'] = AllChem.GetFormalCharge(mol_object)

    # Fingerprints are stored as the indices of the bits that are set
    maccs_bits = list(
        AllChem.GetMACCSKeysFingerprint(mol_object).GetOnBits())
    compound_dict['MACCS'] = maccs_bits
    compound_dict['len_MACCS'] = len(maccs_bits)
    rdkit_bits = list(AllChem.RDKFingerprint(mol_object).GetOnBits())
    compound_dict['RDKit'] = rdkit_bits
    compound_dict['len_RDKit'] = len(rdkit_bits)

    compound_dict['logP'] = AllChem.CalcCrippenDescriptors(mol_object)[0]

    is_coreactant = compound_dict.get('Type') == 'Coreactant'
    comp_id = utils.compound_hash(smiles, is_coreactant)
    compound_dict['_id'] = comp_id

    # The atom count is not stored with the compound
    compound_dict.pop('_atom_count', None)

    # Caching this for rapid reaction mass change calculation
    self._mass_cache[comp_id] = mass

    # Reaction membership may arrive as the string form of a Python
    # literal; a non-empty string is parsed back into the object itself.
    for rxn_field in ('Reactant_in', 'Product_of'):
        raw_value = compound_dict.get(rxn_field)
        if isinstance(raw_value, str) and raw_value:
            compound_dict[rxn_field] = ast.literal_eval(raw_value)

    # Link to the external databases, which needs an InChI key
    if compound_dict['Inchikey']:
        external_links = (
            (kegg_db, [('Pathways', 'Pathways'), ('Names', 'Names'),
                       ('DB_links', 'DB_links'), ('Enzymes', 'Enzymes')]),
            (pubchem_db, [('COMPOUND_CID', 'DB_links.PubChem')]),
            (modelseed_db, [('DB_links', 'DB_links')]),
        )
        for db_name, copied_fields in external_links:
            if db_name:
                compound_dict = self.link_to_external_database(
                    db_name, compound=compound_dict,
                    fields_to_copy=copied_fields)

    # Natural product likeness score; the model is loaded on first use
    if not self.np_model:
        self.np_model = np.readNPModel()
    compound_dict["NP_likeness"] = np.scoreMol(mol_object, self.np_model)

    compound_dict = utils.convert_sets_to_lists(compound_dict)

    # Assign a MINE id to the compound
    if self.id_db:
        known = self.id_db.compounds.find_one(
            {"Inchikey": compound_dict['Inchikey']},
            {'MINE_id': 1, "Pos_CFM_spectra": 1, "Neg_CFM_spectra": 1})
        if known:
            # Reuse the id (and any stored spectra) of the known compound
            compound_dict['MINE_id'] = known['MINE_id']
            for spectra_key in ('Pos_CFM_spectra', 'Neg_CFM_spectra'):
                if spectra_key in known:
                    compound_dict[spectra_key] = known[spectra_key]
        else:
            # Unknown compound: the new id is the current number of
            # compounds in the id database
            compound_dict['MINE_id'] = self.id_db.compounds.count()
            self.id_db.compounds.save(compound_dict)

    # With a bulk operation, queue an upsert; otherwise save right away
    if bulk:
        bulk.find({'_id': compound_dict['_id']}).upsert().\
            replace_one(compound_dict)
    else:
        self.compounds.save(compound_dict)
    return compound_dict['_id']
neutralise=options.bnice, image_dir=options.image_dir, database=options.database) # Create a directory for image output file if it doesn't already exist if options.image_dir and not os.path.exists(options.image_dir): os.mkdir(options.image_dir) # If starting compound specified as SMILES string, then add it if options.smiles: pk._add_compound("Start", options.smiles, type='Starting Compound') else: pk.load_compound_set(compound_file=options.compound_file) # Generate reaction network pk.transform_all(max_generations=options.generations, num_workers=options.max_workers) if options.pruning_whitelist: mols = [ pk._mol_from_dict(line) for line in utils.file_to_dict_list(options.pruning_whitelist) ] pk.prune_network([utils.compound_hash(x) for x in mols if x]) # Save to database (e.g. Mongo) if present, otherwise create output file if options.database: print("Saving results to %s" % options.database) pk.save_to_MINE(options.database) else: pk.assign_ids() pk.write_compound_output_file(options.output_dir + '/compounds.tsv') pk.write_reaction_output_file(options.output_dir + '/reactions.tsv') print("Execution took %s seconds." % (time.time() - t1))