def generate_fingerprint(smiles): if smiles: compound = Compound(compound_string=smiles, identifier_type='smiles', suppress_hydrogens=True) fingerprint = compound.get_bitmap_fingerprint() fp = {x for x in str(fingerprint)[1:-1].split(', ')} return list(fp) else: return []
def get_compound_container(self, chem_id, suppress_hydrogens=True): try: return Compound(compound_string=chem_id, identifier_type='smiles', suppress_hydrogens=suppress_hydrogens) except ValueError as e: try: return Compound(compound_string=chem_id, identifier_type='inchi', suppress_hydrogens=suppress_hydrogens) except ValueError as e: raise e
def default_fingerprint_from_inchi(inchi): c = Compound(compound_string=inchi, identifier_type='inchi') # this should accommodate different types! fp = c.get_fingerprint() fp_size = 1024 fp_array = numpy.zeros(fp_size) for fp_bit in range(fp_size): fp_array[fp_bit] = fp.get(fp_bit) return fp_array
def desalt_compound(smiles): desalted_smiles = [] desalted_ikeys = [] if smiles: for single_compound in smiles.split('.'): desalted_smiles.append(single_compound) try: compound = Compound(compound_string=single_compound, identifier_type='smiles') ikey = compound.get_inchi_key() desalted_ikeys.append(ikey) except Exception as e: desalted_ikeys.append('') return desalted_smiles, desalted_ikeys
def get(self): args = request.args compound_structure = args['structure'] if 'format' in args and args['format'] == 'inchikey': search_results = SearchAPI.exec_freetext_search(compound_structure, indices=('reframe')) for x in search_results: if x['id'] == compound_structure: compound_structure = x['smiles'] break try: compound = Compound(compound_string=compound_structure, identifier_type='smiles') except Exception as e: try: print('executed inchi conv') compound = Compound(compound_string=compound_structure, identifier_type='inchi') compound.get_inchi_key() except Exception as e: print(e) response = { 'status': 'fail', 'message': 'Invalid SMILES or InChI' } return make_response(jsonify(response)), 400 svg_xml = compound.get_svg() response = { 'status': 'success', 'compound_svg': svg_xml } return make_response(jsonify(response)), 200
def get(self): args = request.args compound_structure = args['compound_structure'] try: compound = Compound(compound_string=compound_structure, identifier_type='smiles') except Exception as e: try: compound = Compound(compound_string=compound_structure, identifier_type='inchi') compound.get_inchi_key() except Exception as e: print(e) response = { 'status': 'fail', 'message': 'Invalid SMILES or InChI' } return make_response(jsonify(response)), 400 molfile = compound.get_molfile() response = { 'status': 'success', 'molfile': molfile } return make_response(jsonify(response)), 200
def main(datapath): data = scipy.io.loadmat(datapath + '/data_GNPS.mat') fp_size = 1024 new_fps = [] for i in data['inchi']: print('Processing sample %s' % i) inchi = i[0][0] print(inchi) c = Compound(compound_string=inchi, identifier_type='inchi') fp = c.get_fingerprint() fp_array = numpy.zeros(fp_size) for fp_bit in range(fp_size): fp_array[fp_bit] = fp.get(fp_bit) new_fps.append(fp_array) with open(datapath + '/cdk_fingerprints.bin', 'wb') as f: pickle.dump(new_fps, f)
def generate_fingerprint(smiles, compound_id, main_label, qid): if smiles: compound = Compound(compound_string=smiles, identifier_type='smiles', suppress_hydrogens=True) fingerprint = compound.get_bitmap_fingerprint() fp = {x for x in str(fingerprint)[1:-1].split(', ')} # if only compound id is set as a label, try to set something more useful if compound_id in compound_id_fp_map: sim_item = compound_id_fp_map[compound_id] if sim_item[1] == compound_id: sim_item[1] = main_label else: compound_id_fp_map.update( {compound_id: (compound_id, main_label, qid, fp)}) return list(fp) else: return []
def get(self): args = request.args compound_structure = args['structure'] try: compound = Compound(compound_string=compound_structure, identifier_type='smiles') except ValueError as e: try: compound = Compound(compound_string=compound_structure, identifier_type='inchi') except ValueError as e: response = { 'status': 'fail', 'message': 'Invalid SMILES or InChI' } return make_response(jsonify(response)), 401 svg_xml = compound.get_svg() response = {'status': 'success', 'compound_svg': svg_xml} return make_response(jsonify(response)), 200
def fingerprint_from_inchi(inchi, fingerprint_type=None): c = Compound(compound_string=inchi, identifier_type='inchi') if fingerprint_type == 'cdk_default': fingerprinter = c.cdk.fingerprint.Fingerprinter() elif fingerprint_type == 'substructure': fingerprinter = c.cdk.fingerprint.SubstructureFingerprinter() elif fingerprint_type == 'klekota-roth': fingerprinter = c.cdk.fingerprint.KlekotaRothFingerprinter() else: raise SystemExit('Unknown fingerprint type: {}'.format(fingerprint_type)) fp = fingerprinter.getBitFingerprint(c.mol_container) fp_size = fp.size() fp_array = numpy.zeros(fp_size) for fp_bit in range(fp_size): fp_array[fp_bit] = fp.get(fp_bit) return fp_array
def fingerprint_from_smiles(smiles, fingerprint_type=None): if fingerprint_type is None: fingerprint = numpy.array([]) for fp_type in ('cdk_default', 'substructure', 'klekota-roth'): fingerprint = numpy.hstack((fingerprint, fingerprint_from_smiles(smiles, fp_type))) return fingerprint c = Compound(compound_string=smiles, identifier_type='smiles') if fingerprint_type == 'cdk_default': fingerprinter = c.cdk.fingerprint.Fingerprinter() elif fingerprint_type == 'substructure': fingerprinter = c.cdk.fingerprint.SubstructureFingerprinter() elif fingerprint_type == 'klekota-roth': fingerprinter = c.cdk.fingerprint.KlekotaRothFingerprinter() else: raise SystemExit('Unknown fingerprint type: {}'.format(fingerprint_type)) fp = fingerprinter.getBitFingerprint(c.mol_container) fp_size = fp.size() fp_array = numpy.zeros(fp_size) for fp_bit in range(fp_size): fp_array[fp_bit] = fp.get(fp_bit) return fp_array
def main(): test_inchis = [ 'InChI=1S/C23H18ClF2N3O3S/c1-2-9-33(31,32)29-19-8-7-18(25)20(21(19)26)22(30)17-12-28-23-16(17)10-14(11-27-23)13-3-5-15(24)6-4-13/h3-8,10-12,29H,2,9H2,1H3,(H,27,28)', 'InChI=1S/C33H42N4O6/c1-7-20-19(6)32(42)37-27(20)14-25-18(5)23(10-12-31(40)41)29(35-25)15-28-22(9-11-30(38)39)17(4)24(34-28)13-26-16(3)21(8-2)33(43)36-26/h15,26-27,35H,7-14H2,1-6H3,(H,36,43)(H,37,42)(H,38,39)(H,40,41)/b28-15-/t26-,27-/m0/s1', 'InChI=1S/C21H25ClFN3O3/c1-2-28-20-10-19(24)18(22)9-17(20)21(27)25-11-16-13-26(7-8-29-16)12-14-3-5-15(23)6-4-14/h3-6,9-10,16H,2,7-8,11-13,24H2,1H3,(H,25,27)', 'InChI=1S/C16H12FN3O3/c1-19-14-7-6-10(20(22)23)8-12(14)16(18-9-15(19)21)11-4-2-3-5-13(11)17/h2-8H,9H2,1H3', 'InChI=1S/C10H17N3O6S/c11-5(10(18)19)1-2-7(14)13-6(4-20)9(17)12-3-8(15)16/h5-6,20H,1-4,11H2,(H,12,17)(H,13,14)(H,15,16)(H,18,19)/t5-,6-/m0/s1', 'InChI=1S/C13H16N2O/c1-8-13-11(5-6-14-8)10-4-3-9(16-2)7-12(10)15-13/h3-4,7-8,14-15H,5-6H2,1-2H3', 'InChI=1S/C27H44O2/c1-19-10-13-23(28)18-22(19)12-11-21-9-7-17-27(5)24(14-15-25(21)27)20(2)8-6-16-26(3,4)29/h11-12,20,23-25,28-29H,1,6-10,13-18H2,2-5H3/b21-11+,22-12-/t20-,23+,24-,25+,27-/m1/s1', 'InChI=1S/C40H56/c1-31(19-13-21-33(3)25-27-37-35(5)23-15-29-39(37,7)8)17-11-12-18-32(2)20-14-22-34(4)26-28-38-36(6)24-16-30-40(38,9)10/h11-14,17-23,25-28,37H,15-16,24,29-30H2,1-10H3/b12-11+,19-13+,20-14+,27-25+,28-26+,31-17+,32-18+,33-21+,34-22+/t37-/m0/s1', 'InChI=1S/C11H14N4O5/c1-14-3-13-9-6(10(14)19)12-4-15(9)11-8(18)7(17)5(2-16)20-11/h3-5,7-8,11,16-18H,2H2,1H3', 'InChI=1S/C27H44O2/c1-18(2)8-6-9-19(3)24-13-14-25-21(10-7-15-27(24,25)5)11-12-22-16-23(28)17-26(29)20(22)4/h11-12,18-19,23-26,28-29H,4,6-10,13-17H2,1-3,5H3/b21-11+,22-12-/t19-,23-,24-,25+,26+,27-/m1/s1', 'InChI=1S/C9H14N5O4P/c1-6(18-5-19(15,16)17)2-14-4-13-7-8(10)11-3-12-9(7)14/h3-4,6H,2,5H2,1H3,(H2,10,11,12)(H2,15,16,17)/t6-/m1/s1', 'InChI=1S/C51H79NO13/c1-30-16-12-11-13-17-31(2)42(61-8)28-38-21-19-36(7)51(60,65-38)48(57)49(58)52-23-15-14-18-39(52)50(59)64-43(33(4)26-37-20-22-40(53)44(27-37)62-9)29-41(54)32(3)25-35(6)46(56)47(63-10)45(55)34(5)24-30/h11-13,16-17,25,30,32-34,36-40,42-44,46-47,53,56,60H,14-15,18-24,26-29H2,1-10H3/b13-11+,16-12+,31-17+,35-25+/t30-,32-,33-,34-,36-,37+,38+,39+,40-,42+,43+,44-,46-,47+,51-/m1/s1' ] for inchi in test_inchis: cmpnd = Compound(compound_string=inchi, identifier_type='inchi') print(cmpnd.get_smiles()) print(cmpnd.get_inchi_key()) print(cmpnd.get_inchi()) print(cmpnd.get_mol2()) print(cmpnd.get_fingerprint()) print('----------------------------') # group of compounds with same connectivity but different configuration: # https://pubchem.ncbi.nlm.nih.gov/rest/rdf/inchikey/MNQDKWZEUULFPX-UHFFFAOYSA-M.html smiles = [ '[Ba++].[O-][Fe]([O-])(=O)=O', 'CCN1C2=CC=CC=C2SC1=CC=CC=CC3=[N+](C4=CC=CC=C4S3)CC.[I-]', 'CCN\\1C2=CC=CC=C2S/C1=C\C=C\C=C\C3=[N+](C4=CC=CC=C4S3)CC.[I-]', 'CCN\\1C2=CC=CC=C2S/C1=C/C=C/C=C/C3=[N+](C4=CC=CC=C4S3)CC.[I-]', 'CCN\\1C2=CC=CC=C2S/C1=C\\C=C\\C=C/C3=[N+](C4=CC=CC=C4S3)CC.[I-]', 'CCN\\1C2=CC=CC=C2S/C1=C/C=C/C=CC3=[N+](C4=CC=CC=C4S3)CC.[I-]', 'CC1=CC=CC=C1OCC2=CC=CC=C2/C(=N\OC)/C(=O)OC', 'CCCCCC/C=C\CCCCCCCC(=O)O', 'CC(C)(C)c1nc(c2cccc(NS(=O)(=O)c3c(F)cccc3F)c2F)c(s1)c4ccnc(N)n4', 'CC(C)(C)C1=NC(C2=CC=CC(NS(=O)(=O)N(C)(CC))=C2F)=C(S1)C4=CC=NC(N)=N4', 'C1=CC2=C(C=C1O)C(=CN2)C[C@@H](C(=O)[O-])[NH3+]', 'CN/C(=C\[N+](=O)[O-])/NCCSCC1=CC=C(O1)CN(C)C', 'CN/C(=C/[N+](=O)[O-])/NCCSCC1=CC=C(O1)CN(C)C', 'COCCOC[C@H](CC1(CCCC1)C(=O)N[C@@H]2CC[C@@H](CC2)C(=O)O)C(=O)Oc3ccc4CCCc4c3', 'C1=C(N=C(S1)N=C(N)N)CSCC/C(=N/S(=O)(=O)N)/N', 'C[C@]([C@H]1C[C@@]23CC[C@@]1([C@H]4[C@@]25CCN([C@@H]3CC6=C5C(=C(C=C6)O)O4)CC7CC7)OC)(C(C)(C)CC)O', 'CC(=O)O[Hg]c1cc(ccc1O)C(CC(C)(C)C)(C)C', 'CC(=O)O.CC(C)(C)CC(C)(C)[C]1C=CC(=C=C1)[O-].[Hg+]', 'N/C(N)=C([N+]([O-])=O)\[N+]([O-])=O', 'CC(C)C1=C(C(=C(N1CC[C@H](C[C@H](CC(=O)O)O)O)C2=CC=C(C=C2)F)C3=CC=CC=C3)C(=O)NC4=CC=CC=C4', 'c1cc(ccc1/N=N/c2ccc(c(c2)OS(=O)O)N)OS(=O)O.[Na+].[Na+]', 'Clc1ccc2Nc4ccccc4C(=N\c2c1)/N3CCNCC3', '[Yb][Yb][Yb][Ag][Ag]', 'N[C@@H](CSSC[C@H](N)C(O)=O)C(O)=O' 'CC1(C\\2CCC1(C(=O)/C2=C/c3ccc(cc3)C=O)CS(=O)(=O)[O-])C.[Na+]', 'CNC(=O)C1=CC=CC=C1NC2=NC(=NC=C2Cl)NC3=CC=C(C=C3)N4CCN(CCCN)CC4', 'OC(=O)CN/C(=N\c1ccc(C#N)cc1)NC2CCCCCCCC2', 'N[C@@]12C[C@]3(O[N+]([O-])=O)C[C@@](C2)(CC)C[C@@](C1)(CC)C3', 'C1C2CC3CC1(ON(OO))CC(C2)(C3)N', '[N+](=O)([O-])OC12CC3(CC(CC(C1)(C3)N)(C2)CC)CC', 'COc1cc(c(cc1C(=O)N[C@@H]2CC[N@@]3CCC[C@H]2C3)Cl)N', 'OCN(C(=O)N(CO)C)', '[O-][n+]1cc[n+](c2c1cccc2)[O-]', '[2H]C([2H])([2H])C([2H])([2H])C([2H])([2H])C([2H])([2H])C([2H])([2H])C([2H])([2H])C([2H])([2H])C([2H])([2H])C([2H])([2H])C([2H])([2H])C([2H])([2H])C([2H])([2H])C(O)=O', 'CS(O)(=O)=O.[H][C@@]12CC(C)C(C(=O)CN3CCN(CC3)c3cc(nc(n3)N3CCCC3)N3CCCC3)[C@@]1(C)CC=C1[C@@]2([H])CCC2=CC(=O)C=C[C@]12C', 'OCCCC(O)=O', 'Cc1nnc(s1)SCC2=C(N3[C@@H]([C@@H](C3=O)NC(=O)Cn4cnnn4)SC2)C(=O)[O-]', 'CC(=O)Oc1ccc(cc1)C(c1ccc(OC(C)=O)cc1)c1ccccn1' ] for smile in smiles: try: cmpnd = Compound(compound_string=smile, identifier_type='smiles') print(cmpnd.get_smiles(smiles_type='isomeric')) print(cmpnd.get_smiles(smiles_type='unique')) print(cmpnd.get_smiles(smiles_type='absolute')) print(cmpnd.get_smiles(smiles_type='generic')) print(cmpnd.get_inchi_key()) print(cmpnd.get_inchi()) print(cmpnd.get_mol2()) print(cmpnd.get_fingerprint()) print( cmpnd.get_tanimoto( Compound(compound_string='C1C2CC3CC1(ON(OO))CC(C2)(C3)N', identifier_type='smiles'))) print( cmpnd.get_tanimoto_from_bitset( Compound(compound_string='C1C2CC3CC1(ON(OO))CC(C2)(C3)N', identifier_type='smiles'))) print(cmpnd.get_molfile()) print('----------------------------') except ValueError as e: print(e) cmpnd = Compound( compound_string= 'InChI=1S/C5H10N2O3/c6-3(5(9)10)1-2-4(7)8/h3H,1-2,6H2,(H2,7,8)(H,9,10)/p-1', identifier_type='inchi') print(cmpnd.get_smiles(smiles_type='generic')) print(cmpnd.get_inchi_key()) print(cmpnd.get_inchi())
# Convert MIBiG SMILES to InChI keys mibig_inchi_keys = {} failing_smiles = [] cnt = 0 with open('compunds_structures_2.0.csv', 'r') as f: for l in csv.reader(f): if cnt is 0: cnt += 1 continue if len(l) is 0: continue mibig_id, compound_name, smiles, pubchem_id = l if smiles == '': continue # print(smiles) mibig_compound = Compound(compound_string=smiles, identifier_type='smiles') try: mibig_inchi_key = mibig_compound.get_inchi_key() except: failing_smiles.append(l) continue mibig_inchi_first_block = mibig_inchi_key.split('-')[0] if mibig_inchi_first_block in mibig_inchi_keys: mibig_inchi_keys[mibig_inchi_first_block].append( ((mibig_id, compound_name, pubchem_id), mibig_inchi_key, smiles)) #.split('_')[0]) else: mibig_inchi_keys[mibig_inchi_first_block] = [ ((mibig_id, compound_name, pubchem_id), mibig_inchi_key, smiles) ] #.split('_'[0])]