def link_to_external_database(self, external_database, compound=None,
                              match_field="Inchikey", fields_to_copy=None):
    """Look for matching compounds in another database (i.e. PubChem) and
    copy the requested fields from every match into this database.

    :param external_database: The name of the database to search for
        matching compounds. It is passed to ``MINE(...)``.
    :type external_database: str
    :param compound: The compound to search for external links. If falsy
        (None by default), every compound returned by
        ``self.compounds.find()`` is linked and saved back.
    :type compound: dict
    :param match_field: The field to search on for matching compounds
    :type match_field: str
    :param fields_to_copy: Data to copy into the mine database. The first
        element of each tuple is the field name in the external database.
        The second element is the field name in the MINE database where the
        data will be copied. ``None`` (the default) copies nothing.
    :type fields_to_copy: list(tuple)
    :return: When ``compound`` is given, the result of
        ``utils.convert_sets_to_lists`` on the updated compound; otherwise
        None.
    :rtype: dict or None
    """
    # The default of None used to crash when the projection was built.
    # Materialising also lets a one-shot iterable be reused for both the
    # projection and the copy loop (and across the recursive calls below).
    fields_to_copy = list(fields_to_copy or ())

    if compound:
        ext = MINE(external_database)

        # Only fetch the fields we intend to copy. "_id" is excluded unless
        # it is explicitly listed as a source field.
        projection = {"_id": 0}
        projection.update((field[0], 1) for field in fields_to_copy)

        # Find compounds that share the match_field value in the other
        # database
        matches = ext.compounds.find(
            {match_field: compound[match_field]}, projection)
        for ext_comp in matches:
            for field in fields_to_copy:
                # NOTE(review): membership is tested on the raw field name,
                # so a dotted source path is only copied when the document
                # has that literal top-level key - confirm this is intended.
                if field[0] not in ext_comp:
                    continue
                # dict_merge merges two dictionaries using sets to avoid
                # duplicate values
                utils.dict_merge(
                    compound,
                    utils.save_dotted_field(
                        field[1],
                        utils.get_dotted_field(ext_comp, field[0])))
        return utils.convert_sets_to_lists(compound)

    # If compound is None, link all compounds in database
    for comp in self.compounds.find():
        # NOTE(review): if self.compounds is a PyMongo collection, `save`
        # is deprecated in PyMongo 3 and removed in PyMongo 4
        # (`replace_one` is the replacement) - confirm the driver version.
        self.compounds.save(
            self.link_to_external_database(
                external_database,
                compound=comp,
                match_field=match_field,
                fields_to_copy=fields_to_copy))
def export_mol(mine_db, target, name_field='_id'):
    """Exports compounds from the database as MDL molfiles

    One ``<name>.mol`` file is written into ``target`` for every compound
    whose ``_id`` matches the regex ``^C``.

    :param mine_db: The database to export
    :type mine_db: a MINE object
    :param target: a directory in which to place the files. It is created,
        together with any missing parent directories, if it does not exist.
    :type target: str
    :param name_field: the field to provide names for the mol files. Must be
        unique & universal. A name containing "." is resolved with
        ``utils.get_dotted_field``; list values are joined with commas.
    :type name_field: str
    :raises ValueError: if ``name_field`` does not exist for every compound
        in the database
    :return: None
    :rtype: None
    """
    # Create the output directory if it doesn't yet exist. makedirs with
    # exist_ok also handles nested paths and avoids the check-then-create
    # race of os.path.exists() followed by os.mkdir().
    os.makedirs(target, exist_ok=True)

    # Let user know if name_field does not exist for every compound in the
    # database. Note this check covers all compounds, not only the ones
    # exported below.
    total = mine_db.compounds.find().count()
    named = mine_db.compounds.find({name_field: {'$exists': 1}}).count()
    if total != named:
        raise ValueError(
            '%s does not exist for every compound in the database'
            % name_field)

    for compound in mine_db.compounds.find({'_id': {'$regex': '^C'}}):
        # Create Mol object from SMILES code for each compound using
        # MolFromSmiles (rdkit). The positional True is RDKit's `sanitize`
        # flag; the dict replaces CoA and R with * before parsing.
        mol = AllChem.MolFromSmiles(compound['SMILES'], True,
                                    {'CoA': '*', 'R': "*"})

        # Resolve the file name without writing back into the document.
        if "." in name_field:
            name = utils.get_dotted_field(compound, name_field)
        else:
            name = compound[name_field]

        # Make things more compact and look nicer
        if isinstance(name, list):
            name = ','.join(name)

        # Use MolToMolFile (rdkit) to create a mol file from the Mol object
        # with the file path specified.
        AllChem.MolToMolFile(mol, os.path.join(target, name + '.mol'))
def export_mol(mine_db: MINE, target: str, name_field: str = "_id") -> None:
    """Export compounds from the database as MDL molfiles.

    One ``<name>.mol`` file is written into ``target`` for every compound
    whose ``_id`` matches the regex ``^C``.

    Parameters
    ----------
    mine_db : MINE
        MINE object that contains the database.
    target : str
        Directory in which to place the files. It is created, together with
        any missing parent directories, if it does not exist.
    name_field : str, optional
        Field to provide names for the mol files. Must be unique and
        universal. A name containing "." is resolved with
        ``utils.get_dotted_field``; list values are joined with commas.
        By default, "_id".

    Raises
    ------
    ValueError
        If ``name_field`` does not exist for every compound in the database.
    """
    # Create the output directory if it doesn't yet exist. makedirs with
    # exist_ok also handles nested paths and avoids the check-then-create
    # race of os.path.exists() followed by os.mkdir().
    os.makedirs(target, exist_ok=True)

    # Let user know if name_field does not exist for every compound in the
    # database. Note this check covers all compounds, not only the ones
    # exported below.
    total = mine_db.compounds.find().count()
    named = mine_db.compounds.find({name_field: {"$exists": 1}}).count()
    if total != named:
        raise ValueError(
            f"{name_field} does not exist for every compound in the database")

    for compound in mine_db.compounds.find({"_id": {"$regex": "^C"}}):
        # Create Mol object from SMILES code for each compound using
        # MolFromSmiles (rdkit). The positional True is RDKit's `sanitize`
        # flag; the dict replaces CoA and R with * before parsing.
        mol = AllChem.MolFromSmiles(compound["SMILES"], True,
                                    {"CoA": "*", "R": "*"})

        # Resolve the file name without writing back into the document.
        if "." in name_field:
            name = utils.get_dotted_field(compound, name_field)
        else:
            name = compound[name_field]

        # Make things more compact and look nicer
        if isinstance(name, list):
            name = ",".join(name)

        # Use MolToMolFile (rdkit) to create a mol file from the Mol object
        # with the file path specified.
        AllChem.MolToMolFile(mol, os.path.join(target, name + ".mol"))
def make_hash_dict(db, key_field):
    """Build a lookup from each value stored under ``key_field`` to the
    ``_id`` of the compound that holds it.

    Only compounds for which ``key_field`` exists are queried. The field is
    read with ``utils.get_dotted_field`` and iterated, so every element
    becomes a key; when the same element occurs in several compounds the
    compound returned last by the cursor wins.

    NOTE(review): the field value is iterated as-is, so a plain string value
    would contribute one key per character - presumably the field is
    expected to hold a list; confirm against callers.

    :param db: database object exposing a ``compounds`` collection
    :param key_field: name of the field whose values become the keys
    :type key_field: str
    :return: mapping of field value to compound ``_id``
    :rtype: dict
    """
    query = {key_field: {'$exists': 1}}
    projection = {key_field: 1}
    return {
        value: document['_id']
        for document in db.compounds.find(query, projection)
        for value in utils.get_dotted_field(document, key_field)
    }