Exemple #1
0
def import_mol_dir(mine_db, target, name_field="Name", overwrite=False):
    """Imports a directory of molfiles as a MINE database

    Only files whose extension is exactly ``.mol`` are read. The compound
    name is the file name with that extension removed. After all files are
    processed, one record describing the import is written to
    ``mine_db.meta_data``.

    :param mine_db: a MINE object, the database to insert the compound into
    :param target: a path, the molfile directory to be loaded
    :param name_field: a string, the field for the compound name
    :param overwrite: a bool, if true, new compounds replace the old compounds
        in the database
    """
    for filename in os.listdir(target):
        # splitext removes the real suffix; str.rstrip('.mol') would strip any
        # trailing run of the characters '.', 'm', 'o', 'l' and mangle names
        # such as "ethanol.mol" -> "ethan"
        name, extension = os.path.splitext(filename)
        if extension != ".mol":
            continue
        # MolFromMolFile (rdkit) generates Mol objects from .mol files
        mol = AllChem.MolFromMolFile(os.path.join(target, filename))
        # Skip files that rdkit could not turn into a Mol object
        if not mol:
            continue
        # Create hashkey for the compound
        comphash = utils.compound_hash(mol)
        already_stored = (not overwrite
                          and mine_db.compounds.count({"_id": comphash}))
        if already_stored:
            # Keep the existing compound and only add this file's name to its
            # set of names
            mine_db.compounds.update({"_id": comphash},
                                     {"$addToSet": {
                                         name_field: name
                                     }})
        else:
            # Overwriting is allowed or the compound is new: insert it
            mine_db.insert_compound(mol,
                                    compound_dict={
                                        name_field: [name],
                                        'Generation': 0
                                    },
                                    pubchem_db=None,
                                    kegg_db=None,
                                    modelseed_db=None)
    # Add to log file (metadata)
    mine_db.meta_data.insert({
        "Timestamp": datetime.datetime.now(),
        "Action": "MolFiles Imported",
        "Filepath": target
    })
Exemple #2
0
 def parse_comps(field):
     """Parse one side of a reaction into stoichiometry and atom counts.

     ``field`` is split on ``' // '``; a compound id that appears several
     times is counted, and that count is used as its stoichiometric
     coefficient. Each id is looked up in ``metacyc2hash`` (taken from the
     enclosing scope) and the first time an id is seen it is also inserted
     into ``mine_db`` as a generation-0 compound.

     :param field: a string of compound ids separated by ``' // '``
     :return: a tuple ``(half_rxn, atoms)`` where ``half_rxn`` is a list of
         ``utils.stoich_tuple`` entries and ``atoms`` is a Counter mapping
         element symbols to the total atom count, weighted by stoichiometry
     :raises ValueError: if a compound id is not in ``metacyc2hash``
     """
     atoms = collections.Counter()
     half_rxn = []
     # Counter collapses repeated ids into (id, multiplicity) pairs
     for comp, stoich in collections.Counter(field.split(' // ')).items():
         if comp not in metacyc2hash:
             raise ValueError('Undefined Compound: %s' % comp)
         mol = metacyc2hash[comp]
         # Raw string: '\d' in a plain literal is an invalid escape sequence.
         # Each match is (element symbol, optional count); no count means 1.
         for element, count in re.findall(r'([A-Z][a-z]*)(\d*)',
                                          AllChem.CalcMolFormula(mol)):
             atoms[element] += (int(count) if count else 1) * stoich
         if comp not in inserted:
             mine_db.insert_compound(mol, {'Generation': 0})
             inserted.add(comp)
         half_rxn.append(
             utils.stoich_tuple(stoich, utils.compound_hash(mol)))
     return half_rxn, atoms
Exemple #3
0
 def _add_compound(self, id, smi, mol=None, type='Predicted'):
     """Register a compound in the internal dictionary and return its hash.

     The SMILES string is hashed with ``utils.compound_hash`` and the hash is
     stored in ``self._raw_compounds``. If the hash is not yet a key of
     ``self.compounds``, a new record is created. When both
     ``self.image_dir`` and ``self.mine`` are set, an SVG drawing of the
     molecule is also written to ``<image_dir>/<hash>.svg``.

     :param id: the value stored under the record's 'ID' key
     :param smi: the SMILES string of the compound
     :param mol: an optional Mol object; built from ``smi`` when not given
     :param type: the compound type, 'Coreactant' changes how it is hashed
     :return: the hashed _id of the compound
     """
     _id = utils.compound_hash(smi, type == 'Coreactant')
     self._raw_compounds[smi] = _id

     # A compound already recorded (e.g. in a prior generation) must not be
     # overwritten, so there is nothing more to do for it
     if _id in self.compounds:
         return _id

     if not mol:
         mol = AllChem.MolFromSmiles(smi)
     inchikey = AllChem.InchiToInchiKey(AllChem.MolToInchi(mol))
     record = {
         'ID': id,
         '_id': _id,
         "SMILES": smi,
         'Inchikey': inchikey,
         'Type': type,
         'Generation': self.generation,
         'Formula': AllChem.CalcMolFormula(mol),
         '_atom_count': self._get_atom_count(mol),
         'Charge': AllChem.GetFormalCharge(mol),
         'Reactant_in': [],
         'Product_of': [],
         "Sources": []
     }
     self.compounds[_id] = record
     # Hashes starting with 'X' are coreactants; their sources are not tracked
     if _id[0] == 'X':
         del self.compounds[_id]['Sources']

     # Images are only produced when building a mine with an image directory
     if not (self.image_dir and self.mine):
         return _id
     try:
         svg_path = os.path.join(self.image_dir, _id + '.svg')
         with open(svg_path, 'w') as svg_file:
             prepared = rdMolDraw2D.PrepareMolForDrawing(mol)
             drawer = rdMolDraw2D.MolDraw2DSVG(1000, 1000)
             drawer.DrawMolecule(prepared)
             drawer.FinishDrawing()
             svg_file.write(drawer.GetDrawingText())
     except OSError:
         print("Unable to generate image for %s" % smi)
     return _id
Exemple #4
0
    def insert_compound(self,
                        mol_object,
                        compound_dict=None,
                        bulk=None,
                        kegg_db="KEGG",
                        pubchem_db='PubChem-8-28-2015',
                        modelseed_db='ModelSEED'):
        """Save an RDKit molecule as a compound entry in the MINE.

        Calculated fields (SMILES, InChI, InChI key, mass, formula, charge,
        fingerprints, logP, natural-product likeness and the hashed _id) are
        written into ``compound_dict``, replacing any values already stored
        under those keys. The dictionary passed by the caller is modified in
        place. Preexisting compounds in the MINE are overwritten on _id
        collision.

        :param mol_object: The compound to be stored
        :type mol_object: RDKit Mol object
        :param compound_dict: Additional information about the compound to be
            stored. Overwritten by calculated values.
        :type compound_dict: dict
        :param bulk: A pymongo bulk operation object. If None, the compound
            is immediately saved to the database
        :param kegg_db: The ID of the KEGG Mongo database
        :type kegg_db: str
        :param pubchem_db: The ID of the PubChem Mongo database
        :type pubchem_db: str
        :param modelseed_db: The ID of the ModelSEED Mongo database
        :type modelseed_db: str
        :return: The hashed _id of the compound
        :rtype: str
        """
        if compound_dict is None:
            compound_dict = {}

        # Structure representations and scalar properties of the molecule
        smiles = AllChem.MolToSmiles(mol_object, True)
        compound_dict['SMILES'] = smiles
        inchi = AllChem.MolToInchi(mol_object)
        compound_dict['Inchi'] = inchi
        compound_dict['Inchikey'] = AllChem.InchiToInchiKey(inchi)
        mass = AllChem.CalcExactMolWt(mol_object)
        compound_dict['Mass'] = mass
        compound_dict['Formula'] = AllChem.CalcMolFormula(mol_object)
        compound_dict['Charge'] = AllChem.GetFormalCharge(mol_object)

        # Fingerprints are stored as the indices of the bits that are set
        maccs_bits = list(
            AllChem.GetMACCSKeysFingerprint(mol_object).GetOnBits())
        compound_dict['MACCS'] = maccs_bits
        compound_dict['len_MACCS'] = len(maccs_bits)
        rdkit_bits = list(AllChem.RDKFingerprint(mol_object).GetOnBits())
        compound_dict['RDKit'] = rdkit_bits
        compound_dict['len_RDKit'] = len(rdkit_bits)

        compound_dict['logP'] = AllChem.CalcCrippenDescriptors(mol_object)[0]
        comp_id = utils.compound_hash(
            smiles, compound_dict.get('Type') == 'Coreactant')
        compound_dict['_id'] = comp_id
        compound_dict.pop('_atom_count', None)
        # Caching this for rapid reaction mass change calculation
        self._mass_cache[comp_id] = mass

        # Reaction membership may arrive as the string form of a Python
        # literal; non-empty strings are parsed back into the literal value
        for rxn_key in ('Reactant_in', 'Product_of'):
            raw_value = compound_dict.get(rxn_key)
            if isinstance(raw_value, str) and raw_value:
                compound_dict[rxn_key] = ast.literal_eval(raw_value)

        # Store links to external databases where compound is present. A
        # database whose ID is falsy (e.g. None) is skipped.
        if compound_dict['Inchikey']:
            external_links = (
                (kegg_db, [('Pathways', 'Pathways'),
                           ('Names', 'Names'),
                           ('DB_links', 'DB_links'),
                           ('Enzymes', 'Enzymes')]),
                (pubchem_db, [('COMPOUND_CID', 'DB_links.PubChem')]),
                (modelseed_db, [('DB_links', 'DB_links')]),
            )
            for db_id, fields in external_links:
                if db_id:
                    compound_dict = self.link_to_external_database(
                        db_id, compound=compound_dict, fields_to_copy=fields)

        # Natural product likeness score; the model is loaded on first use
        if not self.np_model:
            self.np_model = np.readNPModel()
        compound_dict["NP_likeness"] = np.scoreMol(mol_object, self.np_model)

        compound_dict = utils.convert_sets_to_lists(compound_dict)

        # Assign a MINE id to the compound
        if self.id_db:
            known = self.id_db.compounds.find_one(
                {"Inchikey": compound_dict['Inchikey']}, {
                    'MINE_id': 1,
                    "Pos_CFM_spectra": 1,
                    "Neg_CFM_spectra": 1
                })
            if not known:
                # Unknown compound: the new id is the current number of
                # compounds in the id database
                compound_dict['MINE_id'] = self.id_db.compounds.count()
                self.id_db.compounds.save(compound_dict)
            else:
                # Known compound: reuse its id and any stored spectra
                compound_dict['MINE_id'] = known['MINE_id']
                for spectra_key in ('Pos_CFM_spectra', 'Neg_CFM_spectra'):
                    if spectra_key in known:
                        compound_dict[spectra_key] = known[spectra_key]

        # Save directly, or queue an upsert on the bulk operation if given
        if not bulk:
            self.compounds.save(compound_dict)
        else:
            bulk.find({'_id': compound_dict['_id']}).upsert().\
                replace_one(compound_dict)
        return compound_dict['_id']
Exemple #5
0
                 neutralise=options.bnice,
                 image_dir=options.image_dir,
                 database=options.database)
    # Create a directory for image output file if it doesn't already exist
    if options.image_dir and not os.path.exists(options.image_dir):
        os.mkdir(options.image_dir)
    # If starting compound specified as SMILES string, then add it
    if options.smiles:
        pk._add_compound("Start", options.smiles, type='Starting Compound')
    else:
        pk.load_compound_set(compound_file=options.compound_file)
    # Generate reaction network
    pk.transform_all(max_generations=options.generations,
                     num_workers=options.max_workers)
    if options.pruning_whitelist:
        mols = [
            pk._mol_from_dict(line)
            for line in utils.file_to_dict_list(options.pruning_whitelist)
        ]
        pk.prune_network([utils.compound_hash(x) for x in mols if x])
    # Save to database (e.g. Mongo) if present, otherwise create output file
    if options.database:
        print("Saving results to %s" % options.database)
        pk.save_to_MINE(options.database)
    else:
        pk.assign_ids()
        pk.write_compound_output_file(options.output_dir + '/compounds.tsv')
        pk.write_reaction_output_file(options.output_dir + '/reactions.tsv')

    print("Execution took %s seconds." % (time.time() - t1))