Example #1
0
def smiles_extraction(cas_keys, main):
    """Store the SMILES for the most frequent CAS key under ``new_smiles[main]``.

    ``cas_keys`` is a sequence of CAS keys; ``main`` is the key written in the
    module-level ``new_smiles`` mapping. Nothing is stored when ``cas_keys``
    is empty or when PubChem has no match for the winning key.

    When ``mode`` raises ``statistics.StatisticsError`` every key that occurs
    more than once is looked up instead, and the SMILES that were found are
    stored as a list wrapped in another list.

    NOTE(review): since Python 3.8 ``statistics.mode`` no longer raises for
    multimodal data, so the tie branch is probably only reached on older
    interpreters -- confirm the targeted Python version.
    """
    if len(cas_keys) == 0:
        return

    try:
        best_key = mode(cas_keys)  # most frequent CAS key
        hits = pcp.get_compounds(best_key, 'name')
        if len(hits) != 0:
            # the first hit is treated as the best match for the key
            new_smiles[main] = hits[0].isomeric_smiles

    except statistics.StatisticsError:  # no single most frequent key
        repeated_keys = [key for key in set(cas_keys) if cas_keys.count(key) > 1]
        tied_smiles = []
        for key in repeated_keys:
            hits = pcp.get_compounds(key, 'name')
            if len(hits) != 0:
                tied_smiles.append(hits[0].isomeric_smiles)

        new_smiles[main] = [tied_smiles]
Example #2
0
    def save(self, *args, additional_data=None, cid2=False, **kwargs):
        """
        Fill in missing chemical fields from PubChem, then save the object.

        When any of ``smiles``, ``cid_number`` or ``chemical_properties`` is
        empty, the compound is looked up on PubChem by ``inchikey`` if the
        object has that attribute, otherwise by its SMILES string.

        Args:
            additional_data: accepted for interface compatibility; not used here.
            cid2: when true and the SMILES has several '.'-separated parts,
                the CID of the first part is stored in ``cid_number_2``
                (lookup failures are ignored).

        Raises:
            ValidationError: when the PubChem lookup returns nothing or the
                request is rejected.
        """
        if not all([self.smiles, self.cid_number, self.chemical_properties]):
            try:
                if hasattr(self, 'inchikey'):
                    pcp_data = pcp.get_compounds(self.inchikey, 'inchikey')[0]
                else:
                    pcp_data = pcp.get_compounds(self.smiles, 'smiles')[0]
            except (IndexError, pcp.BadRequestError) as exc:
                raise ValidationError('Something went wrong') from exc

            if not self.iupac_name:
                self.iupac_name = pcp_data.iupac_name or 'n/a'
            self.smiles = pcp_data.isomeric_smiles or pcp_data.canonical_smiles or ''
            self.set_chemical_data(pcp_query=pcp_data)
        if not self.chemical_name:
            # The parentheses matter: without them the conditional expression
            # wrapped the whole ``or`` chain, so a synonyms value of 'n/a'
            # produced '' without ever consulting the scraped name.
            self.chemical_name = self.scrape_compound_name(self.cid_number) or (
                self.synonyms.split(',')[0] if self.synonyms != 'n/a' else '')
        if cid2 and len(self.smiles.split('.')) > 1:
            try:
                self.cid_number_2 = pcp.get_compounds(
                    self.smiles.split('.')[0], 'smiles')[0].cid
            except (IndexError, pcp.BadRequestError):
                # best effort: the second CID is optional
                pass
        # Over-long values are blanked rather than truncated.
        if len(self.smiles) > 200:
            self.smiles = ''
        if self.iupac_name and len(self.iupac_name) > 250:
            self.iupac_name = ''
        if hasattr(self, 'activity') and not self.activity and hasattr(
                self, 'category') and self.category == 1:
            act_find = FindActivity(self.chemical_name)
            self.activity = act_find.activity
        super(CompoundMixin, self).save(*args, **kwargs)
Example #3
0
def FingerprintCOMP(a1, a2, key):
    """Print a pairwise similarity report for two compound names.

    Both names are looked up on PubChem. For every pair of matches the
    property selected by ``key`` is compared with ``difflib.SequenceMatcher``
    and the score plus all matching blocks are printed.

    Args:
        a1: name of the first compound.
        a2: name of the second compound.
        key: ``"isomeric_smiles"``, ``"molecular_formula"`` or
            ``"fingerprint"``; any other value falls back to the fingerprint.
    """
    known = ("isomeric_smiles", "molecular_formula", "fingerprint")
    attribute = key if key in known else "fingerprint"

    first_matches = pcp.get_compounds(a1, "name")
    second_matches = pcp.get_compounds(a2, "name")

    # Each print call takes one argument, so the output is identical on
    # Python 2 and Python 3.
    for first in first_matches:
        a = getattr(first, attribute)
        print("Input 1 = " + a1)
        print(a)
        print(len(a))
        for second in second_matches:
            b = getattr(second, attribute)
            print("Input 2 = " + a2)
            print(b)
            print(len(b))
            s = difflib.SequenceMatcher(None, a, b)
            print(a1 + " and " + a2 + " Have a similarity score of " + str(
                s.ratio() * 100))
            for block in s.get_matching_blocks():
                print("Input 1 1[%d] and Input 2[%d] match for %d elements" % block)
Example #4
0
def get_molecule_cid(file_name, test=False):
    """
    Return the first PubChem match for the molecule encoded in a file name.

    The name is split on "_". The second piece is tried first as a CAS
    number; if that lookup fails (or the name has no "_"), the last piece
    with its extension removed is tried as a compound name.

    Args:
        file_name: file name such as ``prefix_<cas>_<name>.ext``.
        test: when true, print the CAS candidate and the result.

    Returns:
        The first object returned by ``pcp.get_compounds`` or ``None`` when
        neither lookup succeeds, so the molecule can be disregarded.
    """
    parts = file_name.split("_")
    cas = parts[1] if len(parts) > 1 else None
    if test:
        print("Get CID, CAS: ", cas)

    # Fallback query: last underscore-separated piece without its extension.
    name = parts[-1].split(".")[0]

    cid = None
    for query in (cas, name):
        if query is None:
            continue
        try:
            cid = pcp.get_compounds(query, 'name')[0]
        except Exception:
            # lookup failed or returned nothing: try the next candidate
            continue
        break

    if test:
        print("Get CID, CID: ", cid)
    return cid
Example #5
0
def mol_prop_gen(dataframe,outputpickle):
    """Build a descriptor table for every row of *dataframe* and pickle it.

    For each row the ``InChI`` value is looked up on PubChem, selected
    compound properties are collected, an RDKit molecule is built from the
    isomeric SMILES and ``fps_plus_mw`` descriptors are computed. The
    descriptor columns and the property columns are joined side by side and
    appended to the running result, which is written to *outputpickle* after
    every successful row.

    Rows whose PubChem lookup fails are reported with ``'line bypassed'`` and
    skipped; they no longer reuse the previous row's data. Every row is
    processed exactly once (the first row used to be added twice).

    Args:
        dataframe: table with ``Name``, ``InChI``, ``RT`` and ``System``
            columns; the progress message assumes an integer index.
        outputpickle: path passed to ``DataFrame.to_pickle``.
    """
    wanted = ['cactvs_fingerprint', 'isomeric_smiles', 'xlogp',
              'rotatable_bond_count', 'charge', 'complexity',
              'exact_mass', 'fingerprint']
    frames = []

    for index, row in dataframe.iterrows():
        try:
            cmpd = pcp.get_compounds(row['InChI'], 'inchi')
            props = cmpd[0].to_dict(properties=wanted)
        except Exception:
            print('line bypassed')
            continue

        props['mol'] = Chem.MolFromSmiles(props['isomeric_smiles'])
        props['RT'] = row['RT']
        props['Name'] = row['Name']
        props['System'] = row['System']
        newdf = pd.DataFrame(props, index=[index])

        # One descriptor row, labelled like the property row so that the
        # column-wise concat lines the two up.
        descdf = pd.DataFrame(np.array(fps_plus_mw(props['mol']))).T
        descdf.index = [index]

        frames.append(pd.concat([descdf, newdf], axis=1))
        finaldf = pd.concat(frames)
        print('on index ' + str(index+1) + ' of ' + str(len(dataframe)))
        # written after every row so that partial results survive a crash
        finaldf.to_pickle(outputpickle)


#mol_prop_gen('PredRetDB.csv','test.pickle')
Example #6
0
    def set_chem_data(self):
        """Enrich every entry of ``self.drugs_data`` with PubChem data.

        Each entry is a dict with a ``cid_number``. On success the dict gains
        ``smiles``, ``inchikey``, ``iupac_name`` and ``chemical_properties``,
        plus ``cid_number_2`` when the SMILES has several '.'-separated
        parts. Entries whose lookup fails are removed from the list.

        Returns:
            ``self.drugs_data`` (the same list, modified in place).
        """
        # Iterate over a snapshot: removing from the list that is being
        # iterated would make the loop skip the entry after each removal.
        for d in list(self.drugs_data):
            try:
                pcp_query = pcp.get_compounds(d['cid_number'], 'cid')[0]
                smiles = pcp_query.canonical_smiles

                d.update({
                    'smiles':
                    smiles,
                    'inchikey':
                    pcp_query.inchikey,
                    'iupac_name':
                    pcp_query.iupac_name
                    or cirpy.resolve(smiles, 'iupac_name', ['smiles']),
                    'chemical_properties':
                    dict_from_query_object(smiles, pcp_query, additional=True),
                })
                if len(smiles.split('.')) > 1:
                    d.update({
                        'cid_number_2':
                        pcp.get_compounds(smiles.split('.')[0],
                                          'smiles')[0].cid
                    })
            except (IndexError, TypeError, pcp.BadRequestError):
                self.drugs_data.remove(d)
        return self.drugs_data
def get_iupac_name_from_smiles(smiles):
    """Return the IUPAC name for *smiles*, or ``None`` if the match is not unique.

    The PubChem query is repeated once when the first attempt raises
    ``pcp.PubChemHTTPError``; an error on the second attempt propagates.
    """
    try:
        matches = pcp.get_compounds(smiles, 'smiles')
    except pcp.PubChemHTTPError:
        # single retry
        matches = pcp.get_compounds(smiles, 'smiles')

    if len(matches) == 1:
        return matches[0].iupac_name
    return None
Example #8
0
def convert_feature_to_standard_pubchem(df):
    """Resolve the rows of *df* against PubChem using three columns in turn.

    The columns are tried in this order, each only for rows that earlier
    columns did not resolve:

    1. column B (index 1), looked up after stripping leading ``'``;
    2. column E (index 4);
    3. column A (index 0), translated from Dutch to English first.

    After each pass the matches are handed to ``_fill_df_pubchem`` and the
    ``flag`` column records which source column produced a match.

    Missing cells are detected with ``pd.isna`` so that NaN values are
    skipped no matter which float object represents them.

    Returns:
        A new DataFrame with the columns ``iupac``, ``synonyms``,
        ``compounds``, ``formulae``, ``cid`` and ``flag``.
    """

    def _lookup_by_name(values):
        # Missing cells yield an empty match list instead of a query.
        return [
            [] if pd.isna(value) else pcp.get_compounds(value, 'name')
            for value in values
        ]

    # create an empty df for later use
    dummyarray = np.empty((len(df), 6))
    dummyarray[:] = np.nan
    df0 = pd.DataFrame(
        dummyarray,
        columns=['iupac', 'synonyms', 'compounds', 'formulae', 'cid', 'flag'])

    # column B: CAS numbers
    print('2nd')
    df2 = pd.Series(df.iloc[:, 1].copy()).replace('nan', np.nan)
    # remove ' from the beginning of the cas number
    df2 = [
        np.nan if pd.isna(cell) else str(cell).lstrip("'") for cell in df2
    ]
    idx2 = _lookup_by_name(df2)
    df0 = _fill_df_pubchem(df0, idx2)
    mask2 = [bool(elem) for elem in idx2]
    df0.loc[mask2, 'flag'] = 'from column B'

    # column E: only for rows that column B did not resolve
    print('3rd')
    df3 = df.iloc[:, 4].copy()
    df3[mask2] = np.nan
    idx3 = _lookup_by_name(df3)
    df0 = _fill_df_pubchem(df0, idx3)
    mask3 = [bool(elem) for elem in idx3]
    df0.loc[mask3, 'flag'] = 'from column E'

    # column A: given names, only for rows that are still unresolved
    print('1st')
    df1 = df.iloc[:, 0].copy()
    df1[list(pd.Series(mask3) | pd.Series(mask2))] = np.nan
    name_transed_cls = [
        [] if pd.isna(name) else Translator().translate(
            name, src='nl', dest='en') for name in df1
    ]
    idx1 = [
        pcp.get_compounds(component.text, 'name') if component else []
        for component in name_transed_cls
    ]
    df0 = _fill_df_pubchem(df0, idx1)
    mask1 = [bool(elem) for elem in idx1]
    df0.loc[mask1, 'flag'] = 'from column A'

    return df0
Example #9
0
def geometry_from_pubchem(name, structure=None):
    """Look up a molecule by name on PubChem and return its geometry.

    Args:
        name: the molecule's name as PubChem expects it.
        structure: ``'2d'`` or ``'3d'`` to request that record type only.
            With the default ``None`` the 3D record is requested first and
            the 2D record is used only when no 3D record is returned, which
            keeps the behaviour from before this argument existed.
            ``'3d'`` is the recommended value.

    Returns:
        A list of ``(element, (x, y, z))`` tuples with distances in Angstrom
        (``z`` is 0 when the record has no z coordinate), or ``None`` when
        PubChem returns no structure.

    Raises:
        ValueError: if ``structure`` is not ``'2d'``, ``'3d'`` or ``None``.
    """
    import pubchempy

    if structure in ['2d', '3d']:
        record_types = (structure,)
    elif structure is None:
        # Prefer the 3-D geometry; fall back to 2-D when it is unavailable.
        record_types = ('3d', '2d')
    else:
        raise ValueError('Incorrect value for the argument structure=%s' %
                         structure)

    pubchempy_molecule = None
    for record_type in record_types:
        pubchempy_molecule = pubchempy.get_compounds(
            name, 'name', record_type=record_type)
        if pubchempy_molecule:
            break

    # Nothing found (empty list or None).
    if not pubchempy_molecule:
        print(
            'Unable to find structure info in the PubChem database for the specified molecule "%s".'
            % name)
        return None

    atoms = pubchempy_molecule[0].to_dict(properties=['atoms'])['atoms']
    geometry = []
    for atom in atoms:
        position = (atom['x'], atom['y'], atom.get('z', 0))
        geometry.append((atom['element'], position))

    return geometry
def get_iupac_name_from_smiles(smiles):
    """Return the IUPAC name for *smiles*, or ``None`` if the match is not unique.

    Raises ``RuntimeError`` when the module-level ``pcp`` is falsy. The
    PubChem query is repeated once when the first attempt raises
    ``pcp.PubChemHTTPError``; an error on the second attempt propagates.
    """
    if not pcp:
        raise RuntimeError('No Pubchempy')

    try:
        matches = pcp.get_compounds(smiles, 'smiles')
    except pcp.PubChemHTTPError:
        # single retry
        matches = pcp.get_compounds(smiles, 'smiles')

    if len(matches) == 1:
        return matches[0].iupac_name
    return None
Example #11
0
def process_bioactive_identifier(request):
    """Answer a lookup by CAS number or InChIKey with a JSON payload.

    Reads ``cas_number`` and ``inchikey`` from ``request.GET``. If a matching
    ``Bioactive`` already exists, its URL and name are returned. Otherwise
    the compound is resolved through CIRpy and PubChem and its descriptive
    data are returned; when nothing is found the response carries an
    ``error`` entry instead.
    """
    params = request.GET
    cas_no = params.get('cas_number')
    inchikey = params.get('inchikey', '').strip()

    existing = None
    if cas_no:
        existing = Bioactive.objects.filter(
            chemical_properties__synonyms__icontains=cas_no).first()
    elif inchikey:
        existing = Bioactive.objects.filter(inchikey__exact=inchikey).first()
    if existing:
        return JsonResponse({
            'object_exists': existing.get_absolute_url(),
            'object_exists_name': str(existing),
        })

    try:
        iupac_name = None
        if cas_no:
            smiles = cirpy.query(cas_no, 'smiles')[0].value
            if '.' in smiles:
                # keep the first '.'-separated part longer than 5 characters
                smiles = [i for i in smiles.split('.') if len(i) > 5][0]
            identifier, namespace, resolvers = smiles, 'smiles', ['smiles']
        else:
            identifier, namespace, resolvers = (inchikey, 'inchikey',
                                                ['stdinchikey'])
        pcp_query = pcp.get_compounds(identifier, namespace)[0]
        if not pcp_query.iupac_name:
            # PubChem has no IUPAC name: ask CIRpy instead
            iupac_name = cirpy.resolve(identifier, 'iupac_name', resolvers)
        if not pcp_query.cid:
            raise IndexError
    except (IndexError, pcp.BadRequestError):
        return JsonResponse({'error': 'No compound found for this CAS number'})

    cid = pcp_query.cid
    return JsonResponse({
        'chemical_name':
        Bioactive.scrape_compound_name(cid),
        'iupac_name':
        pcp_query.iupac_name or iupac_name or 'n/a',
        'inchikey':
        pcp_query.inchikey,
        'structure_url':
        'https://pubchem.ncbi.nlm.nih.gov/image/imgsrv.fcgi?cid={}&t=l'.
        format(cid),
        'hidden_cid':
        cid,
        'smiles':
        pcp_query.isomeric_smiles or pcp_query.canonical_smiles or '',
    })
Example #12
0
    def _load_from_pubchem(self, name):
        '''
        Method used to load data from pubchem

        name - name of the compound to search; a value that converts to an
               int is treated as a PubChem CID

        The 3d record is requested first and the 2d record only if no 3d
        record is returned. On success ``self.name`` and ``self.atoms`` are
        set, a 3d structure is additionally saved as an xyz file and
        ``self._mol_load_finish()`` is called. When no record is found only
        an error message is emitted.

        returns nothing
        '''

        import pubchempy as pcp

        try:
            name = int(name)
        except (TypeError, ValueError):
            pass

        namespace = 'cid' if isinstance(name, int) else 'name'

        record_type = '3d'
        mol = pcp.get_compounds(name, namespace, record_type=record_type)

        if len(mol) == 0:
            utils.message(
                f'Error: Could not find 3d structure of {name}... Attempting to find 2d structure...',
                'red')
            record_type = '2d'
            mol = pcp.get_compounds(name, namespace, record_type=record_type)

        if len(mol) == 0:
            utils.message(f'Error: No structural data found for {name}.',
                          'red')
            return

        mol = mol[0]

        coords = np.asarray([[a.x, a.y, a.z] for a in mol.atoms])
        # Elementwise comparison on purpose: missing coordinates become 0.
        coords = np.where(coords == None, 0, coords).astype(float)
        elements = np.asarray([a.element for a in mol.atoms])

        # ``name`` is an int when the lookup was done by CID.
        label = str(name)
        self.name = label.capitalize()

        self.atoms = [
            Atom(element, coord) for element, coord in zip(elements, coords)
        ]
        if record_type == '3d':
            self.save_to_xyz(os.getcwd() +
                             rf'\Molecules\{label.lower()}.xyz')

        self._mol_load_finish()
Example #13
0
def getChemicalName(smiles):
    """Return the IUPAC name of the first PubChem match for *smiles*.

    If the lookup fails for any reason a message is printed and the SMILES
    string itself is returned. Only ``Exception`` subclasses are handled, so
    ``KeyboardInterrupt`` and ``SystemExit`` still propagate.
    """
    try:
        return pcp.get_compounds(smiles, "smiles")[0].iupac_name
    except Exception:
        print("Pubchempy could not convert SMILES to a IUPAC name")
        return smiles
Example #14
0
def sid_to_smiles(sid):
    """Return the isomeric SMILES of the compound behind a PubChem SID.

    The substance's standardized CID is used to fetch the compound; the
    first result's ``isomeric_smiles`` is returned.
    """
    standardized_cid = pc.Substance.from_sid(sid).standardized_cid
    matches = pc.get_compounds(standardized_cid)
    return matches[0].isomeric_smiles
Example #15
0
def import_from_pubchem():
    """Write PubChem data for every compound from ``read_csv()`` to a workbook.

    For each compound (first field: SMILES, second field: copied to the
    sheet unchanged) the PubChem matches are fetched and a PNG image is
    downloaded into the ``images`` folder below ``CURR_PATH``. Every match
    adds one row to ``to_database.xlsx`` with the SMILES, the second input
    field, the IUPAC name and the molecular formula.

    The workbook is closed even if a lookup or download raises, so rows
    written up to that point are kept.
    """
    compounds = read_csv()
    # the workbook is created next to the .py file, with a single sheet
    workbook = xlsxwriter.Workbook(filename='to_database.xlsx')
    try:
        worksheet = workbook.add_worksheet(name='results')
        row = 1

        print('\nEstabelecendo conexão com o PubChem...')
        for comp in compounds:
            smiles = comp[0]

            # fetch the data from PubChem
            results = pcp.get_compounds(smiles, 'smiles')
            # download the image of the compound
            pcp.download('PNG',
                         os.path.join(CURR_PATH, 'images', smiles + '.png'),
                         smiles,
                         'smiles',
                         overwrite=True)

            # one sheet row per match
            for c in results:
                # iupac_name can be None; avoid a TypeError in the message
                print('\nComposto ' + (c.iupac_name or ''))
                worksheet.write(row, 0, smiles)
                worksheet.write(row, 1, comp[1])
                worksheet.write(row, 2, c.iupac_name)
                worksheet.write(row, 3, c.molecular_formula)

                row += 1
    finally:
        workbook.close()
    print('Pronto! Compostos Atualizados')
def get_compound_from_smiles(smiles):
    """Fetch the PubChem compound for *smiles*, retrying on failure.

    Up to five attempts are made, with a one second pause between attempts.

    Returns:
        The ``pcp.Compound`` built from the first match's CID, or ``None``
        when the match has no CID (no PubChem record) or every attempt
        failed.
    """
    attempts = 5
    time_delay = 1  # in seconds
    while attempts >= 1:
        try:
            compounds = pcp.get_compounds(smiles, namespace='smiles')
            cid = compounds[0].cid
            if cid is None:
                print(
                    'No PubChem record'
                )  # https://pubchempy.readthedocs.io/en/latest/guide/gettingstarted.html
                return None
            return pcp.Compound.from_cid(cid)
        except Exception:
            attempts -= 1
            print('Could not get compound. ' + str(attempts) +
                  ' attempts remaining.')
            if attempts:
                # no point in waiting after the final attempt
                time.sleep(time_delay)

    print('Failed to get compound from smiles after exhausting all attempts')
    return None
Example #17
0
def get_smiles(idx, names=None, cids=None, sids=None, binding_db=None):
    """Find a SMILES string for entry *idx*, trying several sources in order.

    The sources are, in order:

    1. ``binding_db`` row whose ``PubChem CID`` equals ``cids[idx]``;
    2. ``binding_db`` row whose ``PubChem SID`` equals ``sids[idx]``;
    3. ``Compound.from_cid`` for ``cids[idx]``;
    4. the source id of ``Substance.from_sid(sids[idx])``, resolved through
       ``CompoundResource`` or, failing that, the ``ChEMBL ID of Ligand``
       column of ``binding_db``;
    5. a name search with ``names[idx]``.

    A source that raises any ``Exception`` is skipped.

    Returns:
        The SMILES from the first source that succeeds, or ``np.nan``.
    """

    def _from_binding_db(column, value):
        matches = binding_db.loc[binding_db[column] == value]
        return matches["Ligand SMILES"].values[0]

    def _local_by_cid():
        return _from_binding_db('PubChem CID', int(cids[idx]))

    def _local_by_sid():
        return _from_binding_db('PubChem SID', int(sids[idx]))

    def _pubchem_by_cid():
        return str(Compound.from_cid(int(cids[idx])).isomeric_smiles)

    def _by_source_id():
        chembl_id = Substance.from_sid(sids[idx]).source_id
        print(chembl_id)
        try:
            return CompoundResource().get(chembl_id)["smiles"]
        except Exception:
            return _from_binding_db("ChEMBL ID of Ligand", chembl_id)

    def _pubchem_by_name():
        return get_compounds(names[idx], 'name')[0].isomeric_smiles

    strategies = (_local_by_cid, _local_by_sid, _pubchem_by_cid,
                  _by_source_id, _pubchem_by_name)
    for strategy in strategies:
        try:
            return strategy()
        except Exception:
            continue
    return np.nan
Example #18
0
    def get_name_from_pubchem(self, smiles: str) -> Optional[str]:
        """Look up a molecule name on PubChem.

        The first PubChem match supplies two candidates: its first synonym
        ("traditional") and its IUPAC name ("iupac"). The first non-empty
        candidate in the order given by ``self.name_preference`` is
        lower-cased and passed to ``self._process_match``.

        Args:
            smiles: A SMILES string.

        Returns:
            ``None`` if PubChem rejects the request or has no match,
            otherwise the result of ``self._process_match``.
        """
        try:
            comp = get_compounds(smiles, namespace="smiles")[0]
        except (BadRequestError, IndexError):
            return None

        synonyms = comp.synonyms
        candidates = {
            "traditional": synonyms[0] if synonyms else None,
            "iupac": comp.iupac_name,
        }

        match = next(
            (candidates[source] for source in self.name_preference
             if source in candidates and candidates[source]),
            None,
        )
        if isinstance(match, str):
            match = match.lower()

        return self._process_match(smiles, match)
Example #19
0
def process_cas(request):
    """Answer a CAS-number lookup with a JSON payload.

    If an ``Odorant`` with the given ``cas_number`` exists, its URL and name
    are returned. Otherwise the CAS number is resolved to a SMILES string
    through CIRpy and to a PubChem compound, and the compound data are
    returned. Every failure path returns a JSON ``error`` entry, so the view
    always produces a response.
    """
    cas_no = request.GET.get('cas_number')
    try:
        obj = Odorant.objects.get(cas_number__exact=cas_no)
    except ObjectDoesNotExist:
        pass
    else:
        return JsonResponse({
            'object_exists': obj.get_absolute_url(),
            'object_exists_name': str(obj),
        })

    not_found = {'error': 'No compound found for this CAS number'}
    try:
        smiles = cirpy.query(cas_no, 'smiles')[0].value
        pcp_query = pcp.get_compounds(smiles, 'smiles')[0]
        cid_no = pcp_query.cid
    except (IndexError, pcp.BadRequestError):
        return JsonResponse(not_found)
    if not (smiles and cid_no):
        # previously fell through and returned None
        return JsonResponse(not_found)

    return JsonResponse({
        'chemical_name': Odorant.scrape_compound_name(cid_no),
        'iupac_name': pcp_query.iupac_name,
        # plain '&': this value is a URL, not HTML markup
        'structure_url': 'https://pubchem.ncbi.nlm.nih.gov/image/imgsrv.fcgi?cid={}&t=l'.format(cid_no),
        'hidden_cid': cid_no,
        'smiles': smiles,
    })
Example #20
0
def cid_df_to_smiles(df, cid_colname):
    """
    Add a ``SMILES`` column holding the canonical SMILES of every CID.

    Args:
        df : pandas dataframe with PubChem CID numbers
        cid_colname (str) : name of the column that contains the CIDs

    Returns:
        df : the same dataframe, modified in place, with a ``SMILES`` column
             (the string ``'none'`` where the lookup failed)
        unsuccessful_list : list of CIDs for which no SMILES was found

    """
    res = []
    unsuccessful_list = []
    # Iterate over the column itself: ``iterrows`` builds one Series per row
    # and may upcast integer CIDs to floats when other columns are floats.
    for cid in df[cid_colname]:
        try:
            res.append(pc.get_compounds(cid)[0].canonical_smiles)
        except Exception:
            res.append('none')
            unsuccessful_list.append(cid)

    df['SMILES'] = res
    #df.to_csv(r'../datasets/df_cleaned_kegg_with_smiles.csv')

    return df, unsuccessful_list
Example #21
0
    def _set_inchi_pcc(self, in_str, pcp_type, elem):
        """Check pubchem compounds via API for both an inchikey and any available compound details
        """
        if not in_str:
            return 0

        try:
            pccs = pcp.get_compounds(in_str, pcp_type)
        except pcp.BadRequestError as e:
            print(e)
            return 0
        except pcp.TimeoutError as e:
            print(e)
            return 0
        except pcp.ServerError as e:
            print(e)
            return 0
        except URLError as e:
            print(e)
            return 0
        except BadStatusLine as e:
            print(e)
            return 0

        if pccs:
            pcc = pccs[elem]
            self.compound_info['inchikey_id'] = pcc.inchikey
            self.compound_info['pubchem_id'] = pcc.cid
            self.compound_info['molecular_formula'] = pcc.molecular_formula
            self.compound_info['molecular_weight'] = pcc.molecular_weight
            self.compound_info['exact_mass'] = pcc.exact_mass
            self.compound_info['smiles'] = pcc.canonical_smiles

            if len(pccs) > 1:
                print('WARNING, multiple compounds for ', self.compound_info)
Example #22
0
    def download_from_list(self, name_list, type="name"):
        """Given a list of compounds, download them

        Parameters
        ----------
        name_list
            Identifiers to look up.
        type
            Namespace of the identifiers, passed to ``get_compounds``.

        Returns
        -------
        list
            One entry per input: the first match, or ``None`` when the
            lookup failed or returned nothing.
        """
        results = []
        for n in name_list:
            try:
                matches = get_compounds(n, namespace=type)
            except Exception:
                print("Ligand %s not found" % n)
                matches = []

            results.append(matches[0] if len(matches) else None)

        return results
Example #23
0
def getCompoundsInfo(ChemIDList, MoleculeNameList):
    """Search PubChem for each molecule name and export the hits to Excel.

    ``ChemIDList`` and ``MoleculeNameList`` are paired element by element.
    Every PubChem match adds one record (ChemID, name, molecular formula,
    isomeric SMILES, CID); after each record the complete table collected so
    far is written to ``outputPath + "SmilesResult.xlsx"``. The function
    waits three seconds after each molecule.
    """
    table = {
        'ChemID': [],
        'CompoundName': [],
        'molecular_formula': [],
        'smiles': [],
        'cid': [],
    }
    pairs = list(zip(ChemIDList, MoleculeNameList))
    for i, (chemID, molecule) in enumerate(pairs):
        matches = pubchempy.get_compounds(molecule, namespace='name')
        for compound in matches:
            formula = compound.molecular_formula
            smiles = compound.isomeric_smiles
            cid = compound.cid
            table['ChemID'].append(chemID)
            table['CompoundName'].append(molecule)
            table['molecular_formula'].append(formula)
            table['smiles'].append(smiles)
            table['cid'].append(cid)

            # rewrite the whole file after every record
            pd.DataFrame(table).to_excel(outputPath+"SmilesResult.xlsx",index=False)
            print(f'{i}-{chemID}:{molecule} write to excel file successfully!')
        time.sleep(3)
Example #24
0
def apply_get_compounds(mol_name):
    """Return the isomeric SMILES of the first PubChem match for *mol_name*.

    An empty string is returned when the lookup fails or has no result.
    Only ``Exception`` subclasses are handled, so ``KeyboardInterrupt`` and
    ``SystemExit`` still propagate.
    """
    try:
        return pcp.get_compounds(mol_name, 'name')[0].isomeric_smiles
    except Exception:
        return ''
def get_from_pubchem(name, path=structures_folder):
    '''
    Function that downloads a molecule from pubchem
    Save the molecule to path

    name - name of molecule
    path - path to save molecule to

    Returns the path of the xyz file that was written.
    Raises Exception when PubChem has no 3d record for the name.
    '''

    import pubchempy as pcp

    mols = pcp.get_compounds(name, 'name', record_type='3d')
    # check if there was a match
    if len(mols) == 0:
        raise Exception(f'Molecule {name} not found on PubChem or on disk.')

    atoms = mols[0].atoms
    coords = np.asarray([[a.x, a.y, a.z] for a in atoms])
    # Elementwise comparison on purpose: missing coordinates become 0.
    coords = np.where(coords == None, 0, coords).astype(float)
    elements = np.asarray([a.element for a in atoms])

    mol_path = path + '\\' + name + '.xyz'

    # save xyz file to disk: atom count, comment line, one line per atom
    with open(mol_path, 'w+') as f:
        f.write(f'{len(elements)}\n')
        f.write('Downloaded from PubChem\n')
        for e, (x, y, z) in zip(elements, coords):
            f.write(
                f'{e: <2} \t {x: >8.5f} \t {y: >8.5f} \t {z: >8.5f}\n'
            )

    return mol_path
Example #26
0
def get_molecular_weight(name):
    """Return the PubChem molecular weight of the first match for *name*.

    When the lookup fails the placeholder value 300 is returned.
    """
    try:
        cpd = pcp.get_compounds(name, "name")
        return cpd[0].to_dict(
            properties=["molecular_weight"])["molecular_weight"]
    except Exception:
        # not a real mw, but it's in the middle - therefore it won't be
        # misused in min/max calculations
        return 300
Example #27
0
def name2inchikey(odor_names):
    """
    Translate odor names to InChIKeys via a PubChem name search.

    Names with no match or with more than one match are skipped with a
    warning.

    Args:
        odor_names (iterable of strs): the odors to be translated to
        inchikeys

    Returns:
        odor_inchikeys (list of strs): the inchikeys for the input odors
        that had exactly one match
        inchikey2name (dict: str -> str): for quickly getting the name for an
        inchikey

    """
    # TODO support single str inputs too
    odor_inchikeys = []
    inchikey2name = dict()
    for o in odor_names:
        matches = pcp.get_compounds(o, 'name')

        if len(matches) > 1:
            # One literal, so that ``format`` fills the placeholder (it used
            # to bind to the second string only).
            print('WARNING: more than one pubchem match for {}. '
                  'ambiguous!'.format(o))
            continue

        if len(matches) == 0:
            print('WARNING: no pubchem matches found for {}!'.format(o))
            continue

        match = matches[0]

        odor_inchikeys.append(match.inchikey)
        inchikey2name[match.inchikey] = o

    return odor_inchikeys, inchikey2name
Example #28
0
    def _default(self):
        """Print a table of structures for the requested name or identifier.

        With ``by_name`` set, PubChem is searched by name and every match
        contributes a row (first synonym, ``'pubchem.compound'``, CID,
        InChI). Otherwise the identifier is resolved through UniChem in the
        requested namespace and yields at most one row. When there are no
        rows a message is written to stderr instead.
        """
        pargs = self.app.pargs
        if pargs.by_name:
            compounds = pubchempy.get_compounds(pargs.name_or_id, 'name')
            results = [[c.synonyms[0], 'pubchem.compound', c.cid, c.inchi]
                       for c in compounds]
        else:
            unichem = bioservices.UniChem()
            structure = unichem.get_structure(int(float(pargs.name_or_id)),
                                              pargs.namespace)
            results = []
            if structure:
                results.append(['', pargs.namespace, pargs.name_or_id,
                                structure['standardinchi']])

        if not results:
            print('Unable to find structure', file=sys.stderr)
            return

        # Column widths: the widest cell, but never narrower than the header.
        minimum_widths = (4, 9, 2, 9)
        lens = [
            max(minimum, max(len(str(r[column])) for r in results))
            for column, minimum in enumerate(minimum_widths)
        ]
        row_format = '{{:<{}}}  {{:<{}}}  {{:<{}}}  {{:<{}}}'.format(*lens)
        print(row_format.format('Name', 'Namespace', 'ID', 'Structure'))
        print(row_format.format(*['=' * width for width in lens]))
        for result in results:
            print(row_format.format(*result))
def get_pubchem_by_name(dir):
    """Write a ``*_smiles`` companion file for every file in a directory.

    Files whose name already contains "smiles" are skipped. For every other
    file each line is split on ',' and ':' and the second-to-last field (the
    only field, if there is just one) is searched on PubChem by name. The
    canonical SMILES of the first match of each line is written, one per
    line, to ``<name>_smiles<ext>`` in the same directory.

    Paths are built with ``os.path.join`` so ``dir`` does not need a
    trailing separator, and every ``print`` call takes a single string so
    the output is the same on Python 2 and Python 3.

    Args:
        dir: directory that holds the input files.
    """
    files = [
        f for f in os.listdir(dir) if os.path.isfile(os.path.join(dir, f))
    ]
    for in_file in files:
        file_name, extension = os.path.splitext(str(in_file))
        if "smiles" in file_name:
            continue

        out_file = file_name + "_smiles" + extension
        print("processing: %s writing to: %s" % (in_file, out_file))
        in_path = os.path.join(dir, in_file)
        out_path = os.path.join(dir, out_file)
        with open(in_path, 'r') as source, open(out_path, 'w') as output:
            smiles_lines = []
            for line in source:
                name = re.split(',|:', line)[-2:][0]
                query_results = pubchempy.get_compounds(name, "name")
                if len(query_results) != 1:
                    print("Query for: %s yielded %s results" %
                          (name, len(query_results)))
                    if len(query_results) > 1:
                        print("Using 1st result:  %s" %
                              query_results[0].canonical_smiles)
                if len(query_results) > 0:
                    smiles_lines.append(
                        query_results[0].canonical_smiles + '\n')

            output.write("".join(smiles_lines))
Example #30
0
def convertsmiles():
    """Look up the compound named in ``t_name`` and display its structure.

    Clears the ``t_smiles``, ``t_sol``, ``t_lip`` and ``t_sasc`` fields,
    queries PubChem by name, stores the isomeric SMILES of the first hit in
    ``t_smiles``, renders the structure to ``tmp.png`` and draws that image
    on ``canvas``.  When PubChem returns no hit the fields stay cleared and
    nothing is drawn.
    """
    # The PhotoImage is stored in a module-level name so a reference to it
    # outlives this call.
    global image_

    for field in (t_smiles, t_sol, t_lip, t_sasc):
        field.set('')

    molecule = pcp.get_compounds(t_name.get(), 'name')
    print('molecule')

    if not molecule:
        # Indexing an empty result list would raise IndexError.
        print('no compound found')
        return

    best_match = molecule[0]
    print(best_match)
    print('isomeric_smile', best_match.isomeric_smiles)
    mol_isomeric_smiles = best_match.isomeric_smiles
    t_smiles.set(mol_isomeric_smiles)

    mol_ = Chem.MolFromSmiles(mol_isomeric_smiles)
    Draw.MolToFile(mol_, 'tmp.png')

    image_open = Image.open('tmp.png')
    image_ = ImageTk.PhotoImage(image_open, master=frame1)

    canvas.create_image(150, 75, image=image_)
Example #31
0
def product_prediction_algorithm(target_smiles_list, df, row):
    """Apply the reaction at ``df`` position ``row`` to each target and tabulate the products.

    For every ``x`` in ``range(len(df))`` the helper ``explore_substrate`` is
    called with ``target_smiles_list[x]``, ``row`` and ``df``.  Positions for
    which that call, or reading the ``Substrates`` / ``Products`` /
    ``Enzymes`` columns at ``row``, raises are skipped.  Each product is then
    looked up on PubChem and labelled 'Novel' or 'Found in Pubchem'.

    Args:
        target_smiles_list: Sequence indexed by position; each item is passed
            to ``explore_substrate``.
        df: DataFrame with 'Substrates', 'Products' and 'Enzymes' columns.
        row: Integer position of the reaction row in ``df``.

    Returns:
        Tuple ``(Results, product_pictures)``: a DataFrame de-duplicated on
        its 'Products' column and the grid image returned by
        ``Draw.MolsToGridImage`` for all products.
    """
    start = timeit.default_timer()
    substrates = []
    products = []
    results = []
    enzymes = []
    # NOTE(review): the loop runs over len(df) but indexes target_smiles_list;
    # a shorter list simply yields skipped positions -- confirm this is intended.
    for x in range(len(df)):
        try:
            # Compute every value before appending so a failure part-way
            # through cannot leave the lists with different lengths.
            result = explore_substrate(target_smiles_list[x], row, df)
            substrate = df['Substrates'].iloc[row]
            product = df['Products'].iloc[row]
            enzyme = df['Enzymes'].iloc[row]
        except Exception:
            # Best effort: a target that cannot be processed is skipped.
            continue
        results.append(result)
        substrates.append(substrate)
        products.append(product)
        enzymes.append(enzyme)

    stop = timeit.default_timer()

    print("Finished reactions... searching hits on PubChem... ")

    # Search PubChem for each product; a result list that prints as
    # '[Compound()]' is treated as "not in PubChem".
    novel_compounds = []
    for mol in results:
        try:
            # NOTE(review): this argument order is unusual for
            # get_compounds(identifier, namespace, searchtype) -- left as-is,
            # verify against the PubChemPy version in use.
            searches = pcp.get_compounds('CanonicalSMILES', str(Chem.MolToSmiles(mol)), 'smiles')
        except Exception:
            continue
        if str(searches) == '[Compound()]':
            novel_compounds.append(mol)

    # Put things in a DataFrame.
    unique_prods_smiles = [Chem.MolToSmiles(mol) for mol in results]
    novel_smiles = set(Chem.MolToSmiles(mol) for mol in novel_compounds)
    novels = [
        'Novel' if smi in novel_smiles else 'Found in Pubchem'
        for smi in unique_prods_smiles
    ]

    Results = pd.DataFrame({'Native Substrate': substrates, 'Native Product': products, 'Products': unique_prods_smiles, 'Novel Compound?': novels, 'Enzymes': enzymes})
    # NOTE(review): ``cols=`` is the legacy pandas spelling of ``subset=``;
    # kept because the targeted pandas version is not visible here.
    Results = Results.drop_duplicates(cols=['Products'])

    print('Novel Compounds found...')
    print(len(novel_compounds))
    print("Runtime...")
    print(stop - start)
    product_pictures = Draw.MolsToGridImage(results, molsPerRow=8, includeAtomNumbers=False)

    return Results, product_pictures
Example #32
0
def chunks_get_compounds(l, n):
    """Yield the ``pcb.get_compounds`` result for each successive n-sized chunk of l.

    The chunking recipe is from Stack Overflow.  A chunk whose lookup raises
    is reported on stdout and skipped, so the generator may yield fewer
    results than there are chunks.
    """
    for i in range(0, len(l), n):
        try:
            yield pcb.get_compounds(l[i:i + n])
        except Exception as e:
            print(e)
def pubchem_search(comp_name, search_type='name'):
    """Look up a compound on PubChem and return its main identifiers.

    Only the first record returned by PubChem is used.  Any exception raised
    during the lookup is logged and swallowed; fields that were not filled
    in keep their empty-string default.

    Args:
        comp_name: Identifier to search for.
        search_type: PubChem namespace passed to ``get_compounds``.

    Returns:
        Tuple ``(iupac, inchi, inchi_key, smiles, cid, formula, synonyms,
        structure)``.  ``synonyms`` is a ';'-joined string of the synonyms
        accepted by ``get_relevant_synonym``; ``structure`` is always ''.
    """
    iupac = ''
    inchi = ''
    inchi_key = ''
    smiles = ''
    cid = ''
    formula = ''
    synonyms = ''
    structure = ''

    try:
        # For this to work on Mac, run: cd "/Applications/Python 3.6/"; sudo "./Install Certificates.command
        # or comment out the line below:
        # ssl._create_default_https_context = ssl._create_unverified_context  # If no root certificates installed
        pubchem_compound = get_compounds(comp_name, namespace=search_type)
        try:
            compound = pubchem_compound[0]  # Only read the first record from PubChem = preferred entry
        except IndexError:
            compound = None
            logger.info('Could not find PubChem compound for %s', comp_name)   # Nothing was found

        if compound:
            inchi = compound.inchi
            inchi_key = compound.inchikey
            smiles = compound.canonical_smiles
            # Guard against a missing IUPAC name: calling .replace on None
            # would abort the extraction after only some fields were set.
            iupac = compound.iupac_name or ''
            iupac = iupac.replace('~', '').replace('{', '').replace('}', '')
            cid = compound.cid
            formula = compound.molecular_formula
            synonyms = ';'.join(
                synonym for synonym in compound.synonyms
                if get_relevant_synonym(synonym))

            logger.debug('Searching PubChem for "%s", got cid "%s" and iupac name "%s"',
                         comp_name, cid, iupac)
    except Exception as error:
        # %-style arguments keep the handler itself from raising when
        # comp_name is not a string.
        logger.error("Unable to search PubChem for compound %s", comp_name)
        logger.error(error)

    return iupac, inchi, inchi_key, smiles, cid, formula, synonyms, structure
Example #34
0
import pandas as pd
import pubchempy as pcp
# Look up the canonical SMILES of every compound name in SAMPL4.csv, echo each
# name/SMILES pair, then print the table with the SMILES added as 'NSMILES'.
df = pd.read_csv('SAMPL4.csv')

# Query PubChem once per name and reuse the answer for both the progress
# output and the new column.
nsmiles = []
for oname in df.NAME:
    cs = pcp.get_compounds(oname, 'name')
    canonical = cs[0].canonical_smiles  # first hit only
    print("%s %s" % (oname, canonical))
    nsmiles.append(canonical)

df['NSMILES'] = nsmiles
print(df.to_csv(index=False))
Example #35
0
# Assemble the training table: positive SMILES first, then negative SMILES,
# with activity labels 1 and 0 respectively.
smiles = list(positive_smiles) + list(negative_smiles)

activity.extend([1] * len(positive_smiles))
activity.extend([0] * len(negative_smiles))

# Append an IUPAC name for every negative SMILES; lookups that fail for any
# reason are recorded as 'error parsing' so the columns keep equal length.
for i in negative_smiles:
    try:
        results = pcp.get_compounds(str(i), 'smiles')
        # Read the CID attribute directly instead of slicing it out of repr().
        cpn = pcp.Compound.from_cid(int(results[0].cid))
        compound_names.append(cpn.iupac_name)
    except Exception:
        compound_names.append('error parsing')

# <codecell>

training_set = pd.DataFrame({'Smiles': smiles, 'Activity': activity, 'Compound Name': compound_names})

# <codecell>

import numpy as np
import pandas as pd
from random import sample
Example #36
0
def get_name(name):
    """Return the isomeric SMILES of the first PubChem hit for ``name``.

    A newline string is returned when the name lookup yields no hit.
    """
    hits = pcp.get_compounds(name, 'name')
    try:
        smiles = hits[0].isomeric_smiles
    except IndexError:
        smiles = "\n"
    return smiles
Example #37
0
def rf_find_alternative_substrates(enzyme, dataset, threshold):
    """Screen ``dataset`` for candidates that ``rf_classifier`` labels as substrates of ``enzyme``.

    The known substrates of ``enzyme`` are the positive examples.  Negative
    examples are the 12 entries of the module-level ``endogenous_steroids``
    with the lowest Tanimoto similarity to the first positive structure.

    Args:
        enzyme: Passed to ``retrieve_enzyme_substrates``.
        dataset: Indexable collection of candidates, read by position.
        threshold: Passed through to ``rf_classifier``.

    Returns:
        Tuple ``(values, training_set)``: the output of
        ``canonicalize_smiles`` for the accepted candidates not already in
        the training set, and the training DataFrame with columns 'Smiles',
        'Activity' and 'Compound Name'.
    """
    # Grab the substrates of the given enzyme and their SMILES.
    compound_names, positive_smiles = retrieve_enzyme_substrates(enzyme)

    # Build molecules and compute 2D coordinates for each of them.
    positive_structures = [Chem.MolFromSmiles(smi) for smi in positive_smiles]
    endogenous_structures = [Chem.MolFromSmiles(smi) for smi in endogenous_steroids]

    for m in positive_structures:
        AllChem.Compute2DCoords(m)
    for m in endogenous_structures:
        AllChem.Compute2DCoords(m)

    positive_fps = [AllChem.GetMorganFingerprintAsBitVect(x, 2) for x in positive_structures]
    endogenous_fps = [AllChem.GetMorganFingerprintAsBitVect(x, 2) for x in endogenous_structures]

    # Pair endogenous structures with their similarity to the first
    # positive structure, least similar first.
    sims = DataStructs.BulkTanimotoSimilarity(positive_fps[0], endogenous_fps)
    nbrs = sorted(zip(sims, endogenous_structures), reverse=False)

    # Use the 12 least similar structures as negative examples.
    negative_structures = [x[1] for x in nbrs[:12]]
    negative_smiles = [Chem.MolToSmiles(m) for m in negative_structures]

    # Positives first, then negatives, labelled 1 and 0.
    smiles = list(positive_smiles) + negative_smiles
    activity = [1] * len(positive_smiles) + [0] * len(negative_smiles)

    # compound_names already holds the positives' names; append one name per
    # negative so it lines up with ``smiles``.
    for smi in negative_smiles:
        try:
            results = pcp.get_compounds(str(smi), 'smiles')
            # Read the CID attribute directly instead of slicing it out of repr().
            cpn = pcp.Compound.from_cid(int(results[0].cid))
            compound_names.append(cpn.iupac_name)
        except Exception:
            compound_names.append('error parsing')

    # Our model.
    training_set = pd.DataFrame({'Smiles': smiles, 'Activity': activity, 'Compound Name': compound_names})

    values = []
    mols = training_set['Smiles'].tolist()
    activities = training_set['Activity'].tolist()

    # Positional access is kept deliberately: ``dataset`` is only known to
    # support len() and integer indexing.
    for i in range(len(dataset)):
        if str(rf_classifier(mols, activities, dataset[i], threshold)) == '[1]':
            if dataset[i] not in mols:
                values.append(dataset[i])
    values = canonicalize_smiles(values)
    return values, training_set
Example #38
0
def cross_validate_sliding_threshold(ec_number):
    '''Cross-validate the classifier across a sliding score threshold.

    Positive examples come from the module-level ``positive_smiles``;
    negative examples are the 12 entries of ``endogenous_steroids`` least
    similar (Tanimoto) to the first positive structure.  For each threshold
    from 0 to 1 in steps of 0.1, up to 100 rounds hold out a random 25% of
    the training set, score each held-out molecule with ``rf_validate`` and
    compare predicted with actual labels using
    ``difflib.SequenceMatcher.ratio``.

    NOTE(review): ``ec_number`` is not used by this function.

    Returns:
        List with one ``mean`` of the per-round ratios for each threshold.
    '''
    # Build molecules for the positive hits and all endogenous steroids and
    # compute their 2D coordinates.
    positive_structures = [Chem.MolFromSmiles(smi) for smi in positive_smiles]
    endogenous_structures = [Chem.MolFromSmiles(smi) for smi in endogenous_steroids]

    for m in positive_structures:
        AllChem.Compute2DCoords(m)
    for m in endogenous_structures:
        AllChem.Compute2DCoords(m)

    positive_fps = [AllChem.GetMorganFingerprintAsBitVect(x, 2) for x in positive_structures]
    endogenous_fps = [AllChem.GetMorganFingerprintAsBitVect(x, 2) for x in endogenous_structures]

    # Pair endogenous structures with their similarity scores, least similar first.
    sims = DataStructs.BulkTanimotoSimilarity(positive_fps[0], endogenous_fps)
    nbrs = sorted(zip(sims, endogenous_structures), reverse=False)

    # Grab the most dissimilar matches; 12 is an arbitrary choice.
    negative_structures = [x[1] for x in nbrs[:12]]
    negative_smiles = [Chem.MolToSmiles(m) for m in negative_structures]

    # Positives first, then negatives, labelled 1 and 0.
    smiles = list(positive_smiles) + negative_smiles
    activity = [1] * len(positive_smiles) + [0] * len(negative_smiles)

    # Look names up in the same order as ``smiles`` so each row's
    # 'Compound Name' belongs to that row's SMILES.
    compound_names = []
    for smi in smiles:
        try:
            results = pcp.get_compounds(str(smi), 'smiles')
            # Read the CID attribute directly instead of slicing it out of repr().
            cpn = pcp.Compound.from_cid(int(results[0].cid))
            compound_names.append(cpn.iupac_name)
        except Exception:
            compound_names.append('error parsing')

    training_set = pd.DataFrame({'Smiles': smiles, 'Activity': activity, 'Compound Name': compound_names})

    # Thresholds 0.0, 0.1, ..., 1.0.
    percentages = list(np.arange(0, 1.05, 0.10))

    validation_per_percent = []
    for threshold in percentages:
        cross_validation_scores = []
        for _ in range(100):
            # Random row labels for the 25% hold-out.
            rindex = np.array(sample(range(len(training_set)), int(len(training_set) * .25)))

            # Split into held-out rows and the remaining training rows.
            pulled_data = training_set.loc[rindex]
            pulled_smiles = pulled_data['Smiles'].tolist()
            training_set_culled = training_set.drop(rindex)

            predicted_values = []
            actual_values = pulled_data['Activity'].tolist()
            mols = training_set_culled['Smiles'].tolist()
            activities = training_set_culled['Activity'].tolist()

            try:
                for j in range(len(pulled_smiles)):
                    if rf_validate(mols, activities, pulled_smiles[j]) > threshold:
                        if pulled_smiles[j] not in mols:
                            predicted_values.append(1)
                    else:
                        predicted_values.append(0)
                # Ratcliff/Obershelp similarity between predicted and actual labels.
                sm = difflib.SequenceMatcher(None, predicted_values, actual_values)
                cross_validation_scores.append(sm.ratio())
            except Exception:
                # Best effort: a round that fails contributes no score.
                pass
        validation_per_percent.append(mean(cross_validation_scores))

    return validation_per_percent
Example #39
0
# Read the record file named on the command line, look its InChI up on
# PubChem and compare the molecular formulas.
names = []
formula = ""
inchi = ""

# For each tagged line, keep everything after the first space-separated token
# (with the remaining tokens concatenated).
with open(sys.argv[1]) as record:
    for line in record:
        if "CH$NAME" in line:
            names.append(''.join(line.split(" ")[1:]).strip().lower())
        if "CH$FORMULA" in line:
            formula = ''.join(line.split(" ")[1:]).strip()
        if "CH$IUPAC" in line:
            inchi = ''.join(line.split(" ")[1:]).strip()
        if "CH$SMILES" in line:
            smiles = ''.join(line.split(" ")[1:]).strip()

results = pcp.get_compounds(inchi, namespace=u'inchi')

# Exactly one match is required to continue.
if len(results) != 1:
    print("#results != 1; exiting")
    sys.exit(1)

if results[0].molecular_formula == formula:
    print('Formula matches ' + '\033[92m[OK]\033[0m')
else:
    print(results[0].molecular_formula)
    print(formula)
    print('Formulas different ' + '\033[93m[OK]\033[0m')

synonyms = [x.encode('utf-8').lower() for x in results[0].synonyms]

common_names = []
Example #40
0
def get_pubchem_cids(com, user=None):
    """Return the PubChem CIDs known or derivable for compound ``com``.

    CIDs already stored among ``com.identifiers`` are returned directly.
    Otherwise CIDs are derived from the compound's ChemSpider, KEGG and
    Wikidata identifiers, in that order; each new CID is also registered on
    ``com`` through ``Compound.add_identifier``.

    Args:
        com: Compound object exposing ``identifiers``.
        user: Optional user object, forwarded to ``Compound.inchi``.

    Returns:
        List of CIDs without duplicates.
    """
    filter_identifiers = gnomics.objects.auxiliary_files.identifier.filter_identifiers
    Compound = gnomics.objects.compound.Compound

    cid_array = []

    for iden in filter_identifiers(com.identifiers, ["cid", "pubchem cid", "pubchem compound", "pubchem compound id", "pubchem compound identifier"]):
        if iden["identifier"] not in cid_array:
            cid_array.append(iden["identifier"])

    if cid_array:
        return cid_array

    ids_completed = []

    for iden in filter_identifiers(com.identifiers, ["chemspider", "chemspider id", "chemspider identifier", "cs id", "csid"]):
        if iden["identifier"] not in ids_completed:
            ids_completed.append(iden["identifier"])
            results = pubchem.get_compounds(Compound.inchi(com, user=user), 'inchi')

            for x in results:
                # Skip CIDs already collected, as the KEGG and Wikidata
                # branches below do.
                if x.cid not in cid_array:
                    cid_array.append(x.cid)
                    Compound.add_identifier(com, identifier=x.cid, identifier_type="PubChem CID", language=None, source="PubChem")

        # NOTE(review): this warning only fires for an identifier that was
        # already processed -- confirm the intended placement.
        elif user is None:
            print("Cannot use ChemSpider conversion when user is None. Please create and pass a valid user with a ChemSpider security token to this method.")

    for iden in filter_identifiers(com.identifiers, ["kegg compound", "kegg compound id", "kegg compound identifier", "kegg", "kegg compound accession", "kegg id", "kegg identifier", "kegg accession"]):
        if iden["identifier"] in ids_completed:
            continue
        ids_completed.append(iden["identifier"])

        for kegg_com in Compound.kegg_compound_db_entry(com):

            # Returns PubChem SID.
            pubchem_sid = str(kegg_com["DBLINKS"]["PubChem"])

            # Get CIDs from SIDs.
            server = "https://pubchem.ncbi.nlm.nih.gov/rest/pug"
            ext = "/substance/sid/" + str(pubchem_sid) + "/JSONP"

            r = requests.get(server+ext, headers={"Content-Type": "application/json"})

            if not r.ok:
                print("Something went wrong when trying to access the PubChem PUG REST.")
                continue

            str_r = r.text
            try:
                # Strip the JSONP wrapper: keep what lies between the first
                # '(' and the last ')'.
                l_index = str_r.index("(") + 1
                r_index = str_r.rindex(")")
                res = str_r[l_index:r_index]
                decoded = json.loads(res)
                for temp_com in decoded["PC_Substances"][0]["compound"]:
                    if ("id" in temp_com
                            and temp_com["id"]["type"] == 1
                            and "id" in temp_com["id"]
                            and "cid" in temp_com["id"]["id"]):
                        found_cid = temp_com["id"]["id"]["cid"]
                        if found_cid not in cid_array:
                            cid_array.append(found_cid)
                            Compound.add_identifier(com, identifier=found_cid, identifier_type="PubChem CID", source="KEGG", language=None)

            except ValueError:
                print("Input is not in a JSONP format.")

    for iden in filter_identifiers(com.identifiers, ["wikidata", "wikidata accession", "wikidata id", "wikidata identifier"]):
        if iden["identifier"] in ids_completed:
            continue
        ids_completed.append(iden["identifier"])

        for wikidata_object in Compound.wikidata(com):

            found_array = gnomics.objects.auxiliary_files.wiki.wikidata_property_check(wikidata_object, "pubchem cid", wikidata_property_language = "en")

            for x in found_array:
                if x not in cid_array:
                    cid_array.append(x)
                    Compound.add_identifier(com, identifier = x, identifier_type = "PubChem CID", language = None, source = "Wikidata")

    return cid_array
Example #41
0

import pandas as pd

# Map every row of compound_names.csv to the PubChem CIDs found for it by
# name, and save one (row, CID) pair per output row in mydb.csv.
data = pd.read_csv("compound_names.csv")

outdf = pd.DataFrame(columns=('Compound Name', 'CID'))

# Next free row label in outdf.
next_row = 0

for compound in data.values:
    matches = get_compounds(compound, 'name')
    for item in matches:
        outdf.loc[next_row] = [compound, item.cid]
        next_row += 1

outdf.to_csv('mydb.csv')






Example #42
0
def prediction_algorithm2(target_smiles, df):
    '''Run one target SMILES against every reaction row of ``df``.

    Each row is first screened with ``scan_atoms``; the verdicts are stored
    in a new 'Anabolic Compatible' column of ``df`` (the caller's frame is
    modified).  Rows whose verdict is 'proceed' are then passed to
    ``explore_substrate``; rows for which that fails are skipped.  Each
    product is looked up on PubChem and labelled 'Novel' or
    'Found in Pubchem'.

    Args:
        target_smiles: Target structure; converted with ``str()``.
        df: DataFrame with 'Substrates', 'Products' and 'Enzymes' columns.

    Returns:
        Tuple ``(Results, product_pictures)``: a DataFrame de-duplicated on
        its 'Products' column and the grid image returned by
        ``Draw.MolsToGridImage`` for all products.
    '''
    start = timeit.default_timer()
    TestUni = df
    chol = str(target_smiles)
    tally = []
    for z in range(len(TestUni)):
        try:
            t = Chem.MolFromSmiles(chol)
            s = Chem.MolFromSmiles(TestUni['Substrates'].iloc[z])
            p = Chem.MolFromSmiles(TestUni['Products'].iloc[z])
            verdict = scan_atoms(t, s, p)
        except Exception:
            # A row that cannot be screened is excluded from the next stage.
            tally.append('do not proceed')
        else:
            tally.append(verdict)
            # Progress output every tenth successfully screened row.
            if z % 10 == 0 and z != 0:
                print(z)

    TestUni['Anabolic Compatible'] = tally
    TestUni2 = TestUni[TestUni['Anabolic Compatible'] == 'proceed']
    print("Potential hits...")
    print(len(TestUni2))

    anabolic = chol
    substrates = []
    products = []
    results = []
    enzymes = []
    for x in range(len(TestUni2)):
        try:
            # Compute every value before appending so a failure part-way
            # through cannot leave the lists with different lengths.
            result = explore_substrate(anabolic, x, TestUni2)
            substrate = TestUni2['Substrates'].iloc[x]
            product = TestUni2['Products'].iloc[x]
            enzyme = TestUni2['Enzymes'].iloc[x]
        except Exception:
            # Best effort: a row that cannot be processed is skipped.
            continue
        results.append(result)
        substrates.append(substrate)
        products.append(product)
        enzymes.append(enzyme)

    stop = timeit.default_timer()

    print("Finished reactions... searching hits on PubChem... ")

    # Search PubChem for each product; a result list that prints as
    # '[Compound()]' is treated as "not in PubChem".
    novel_compounds = []
    for mol in results:
        # NOTE(review): this argument order is unusual for
        # get_compounds(identifier, namespace, searchtype) -- left as-is,
        # verify against the PubChemPy version in use.
        searches = pcp.get_compounds('CanonicalSMILES', str(Chem.MolToSmiles(mol)), 'smiles')
        if str(searches) == '[Compound()]':
            novel_compounds.append(mol)

    # Put things in a DataFrame.
    unique_prods_smiles = [Chem.MolToSmiles(mol) for mol in results]
    novel_smiles = set(Chem.MolToSmiles(mol) for mol in novel_compounds)
    novels = [
        'Novel' if smi in novel_smiles else 'Found in Pubchem'
        for smi in unique_prods_smiles
    ]
    Results = pd.DataFrame({'Native Substrate': substrates, 'Native Product': products, 'Products': unique_prods_smiles, 'Novel Compound?': novels, 'Enzymes': enzymes})
    # NOTE(review): ``cols=`` is the legacy pandas spelling of ``subset=``;
    # kept because the targeted pandas version is not visible here.
    Results = Results.drop_duplicates(cols=['Products'])

    print('Novel Compounds found...')
    print(len(novel_compounds))
    print("Runtime...")
    print(stop - start)
    product_pictures = Draw.MolsToGridImage(results, molsPerRow=8, includeAtomNumbers=False)

    return Results, product_pictures
Example #43
0
def search(query, user=None, search_type=None, source="chemspider", mass_plus_minus=0.001):
    """Search ChemSpider and/or PubChem for compounds matching ``query``.

    Args:
        query: Search term (name, formula, mass, SMILES, InChI, ...).
        user: Optional user object; ChemSpider searches need its
            ``chemspider_security_token``.
        search_type: ``None`` for a plain search, or one of the type names
            handled below.  An unrecognised type falls back to ``None``.
        source: "chemspider", "pubchem" or "all" (case-insensitive).  Any
            other value falls back to ChemSpider when a token is available
            and to PubChem otherwise.
        mass_plus_minus: Tolerance passed to the ChemSpider mass search.

    Returns:
        A list of ``gnomics.objects.compound.Compound`` objects, except for
        the PubChem search types "substructure", "superstructure",
        "similarity", "identity", "sdf" and "cid", which return the result
        of ``pubchem.get_compounds`` unchanged.
    """
    result_set = []
    source_name = source.lower()

    if source_name in ["chemspider", "all"]:
        if user is not None and user.chemspider_security_token is not None:
            cs = chemspider(user.chemspider_security_token)
            if search_type is None:
                hits = cs.search(query)
            elif search_type == "formula" or search_type == "molecular formula":
                hits = cs.simple_search_by_formula(query)
            elif search_type == "mass" and mass_plus_minus is not None:
                hits = cs.simple_search_by_mass(query, mass_plus_minus)
            else:
                print("No valid search type for ChemSpider was provided.")
                print("Continuing with search type 'None'...")
                return search(query, user = user, search_type = None, source = "chemspider")
            for result in hits:
                temp_com = gnomics.objects.compound.Compound(identifier = result.csid, identifier_type = "ChemSpider ID", source = "ChemSpider", name = None)
                result_set.append(temp_com)
        elif source_name == "chemspider":
            print("Searching with ChemSpider requires the creation of a User object with a valid ChemSpider security token. Information on obtaining such a token can be found here: 'http://www.chemspider.com/AboutServices.aspx?'.\n")

            print("Continuing with PubChem search...\n")
            return search(query, source = "pubchem")

    if source_name in ["pubchem", "all"]:

        if search_type is None:
            try:
                server = "https://pubchem.ncbi.nlm.nih.gov/rest/pug"
                ext = "/compound/name/" + str(query) + "/synonyms/JSONP"
                r = requests.get(server+ext, headers={"Content-Type": "application/json"})
                # Raises for an unsuccessful response; handled below.
                r.raise_for_status()
                str_r = r.text
                try:
                    # Strip the JSONP wrapper: keep what lies between the
                    # first '(' and the last ')'.
                    l_index = str_r.index("(") + 1
                    r_index = str_r.rindex(")")
                except ValueError:
                    print("Input is not in a JSONP format.")
                    exit()
                res = str_r[l_index:r_index]
                decoded = json.loads(res)
                for result in decoded["InformationList"]["Information"]:
                    result_cid = result["CID"]
                    temp_com = gnomics.objects.compound.Compound(identifier = result_cid, identifier_type = "PubChem CID", source = "PubChem", name = result["Synonym"][0])
                    result_set.append(temp_com)
            except requests.exceptions.RequestException as e:
                print(e)
                print("No results found.")

        elif search_type in ("substructure", "superstructure", "similarity", "identity", "sdf", "cid"):
            # These types hand back the raw PubChem result.
            return pubchem.get_compounds(query, search_type)

        elif search_type.lower() in ("smiles", "inchi"):
            for temp_com in pubchem.get_compounds(query, search_type.lower()):
                # Prefer the first synonym as display name, else the IUPAC name.
                synonyms = temp_com.synonyms
                display_name = synonyms[0] if synonyms else temp_com.iupac_name
                new_com = gnomics.objects.compound.Compound(identifier = temp_com.cid, identifier_type = "PubChem CID", source = "PubChem", name = display_name)
                result_set.append(new_com)

        elif search_type == "cas":
            server = "https://pubchem.ncbi.nlm.nih.gov/rest/pug"
            ext = "/compound/name/" + str(query) + "/synonyms/JSONP"
            r = requests.get(server+ext, headers={"Content-Type": "application/json"})
            if not r.ok:
                print("There was a problem attempting to access the PubChem PUG REST service.")
            else:
                str_r = r.text
                try:
                    l_index = str_r.index("(") + 1
                    r_index = str_r.rindex(")")
                except ValueError:
                    print("Input is not in a JSONP format.")
                    exit()
                res = str_r[l_index:r_index]
                decoded = json.loads(res)
                for result in decoded["InformationList"]["Information"]:
                    # One result is added per synonym equal to the query.
                    for syn in result["Synonym"]:
                        if syn == query:
                            result_cid = result["CID"]
                            temp_com = gnomics.objects.compound.Compound(identifier = result_cid, identifier_type = "PubChem CID", source = "PubChem", name = result["Synonym"][0])
                            result_set.append(temp_com)
        else:
            print("No valid search type for PubChem was provided.")
            print("Continuing with search type 'None'...")
            return search(query, user = None, search_type = None, source = "pubchem")

    if source_name not in ("chemspider", "pubchem", "all"):
        print("No valid search source was provided.")
        if user is not None and user.chemspider_security_token is not None:
            print("Because user and ChemSpider security token are provided, continuing with ChemSpider search...")
            return search(query, user = user, search_type = None, source = "chemspider")
        # No usable token: fall back to PubChem.  Checking the token on a
        # missing user here would raise AttributeError.
        print("Because either user not provided or ChemSpider security token is not valid, continuing with PubChem search...")
        return search(query, user = None, search_type = None, source = "pubchem")

    return result_set