def test_targets_to_mols(self):
    """Check that known molecules appear among those returned for given targets.

    For each gene name the first ``resolve_target`` hit supplies the target
    ChEMBL ID, ``targets_to_mols`` is called on those IDs, and the result is
    serialised in one output format. Every case only asserts that a known
    subset of tokens is present in the serialised line.
    """

    def serialised_tokens(gene_names, fmt, separator=None):
        # separator=None splits on any whitespace, exactly like a bare split().
        target_ids = [resolve_target(gene)[0]['target_chembl_id'] for gene in gene_names]
        molecules = targets_to_mols(target_ids)
        line = get_serializer(fmt).serialize_line(molecules)
        return set(line.strip().split(separator))

    expected_chembl_ids = {
        'CHEMBL6', 'CHEMBL25', 'CHEMBL10', 'CHEMBL30',
        'CHEMBL50', 'CHEMBL71', 'CHEMBL28', 'CHEMBL66',
    }
    self.assertTrue(expected_chembl_ids.issubset(serialised_tokens(['POLK'], 'chembl_id', ',')))

    expected_smiles = {
        'CN1C(=O)N(CC(=O)c2ccccc2)C(=O)c3c1nc(N4CCC[C@H](N)C4)n3CC=C(C)C',
        'C[C@]12CCC(=O)C=C1CC[C@@H]3[C@@H]2CC[C@@]4(C)[C@H]3CC[C@@]4(O)C#C',
        'CS(=O)(=O)N1CCC(CN([C@@H]2CC[C@@]3(CC3C2)c4cccc(c4)C#N)C(=O)Nc5ccc(F)c(F)c5)CC1',
        'O=S(=O)(NCCCN1CCN(CC1)c2nsc3ccccc23)c4cccc5scnc45',
        'Cl.CC(C)n1nc(C(=O)NCC2CCN(CCNS(=O)(=O)C)CC2)c3ccccc13',
        'CC1CN(CC(=O)N[C@@H]2C3CC4CC2C[C@@](C4)(C3)C(=O)N)S(=O)(=O)N(C1)c5c(Cl)cc(Cl)cc5Cl',
        'Oc1cccc2CC(CN3CCC4(CC3)CCc5ccccc45)NCc12',
        '[O-][N+](=O)c1ccc(cc1)c2nc(c3ccc(F)cc3)c([nH]2)c4ccncc4',
    }
    self.assertTrue(expected_smiles.issubset(serialised_tokens(['HERG'], 'smiles')))

    expected_inchis = {
        'InChI=1S/C7H6O2/c8-7(9)6-4-2-1-3-5-6/h1-5H,(H,8,9)',
        'InChI=1S/C11H7NS/c13-8-12-11-7-3-5-9-4-1-2-6-10(9)11/h1-7H',
        'InChI=1S/CHCl3/c2-1(3)4/h1H',
        'InChI=1S/H4N2/c1-2/h1-2H2',
        'InChI=1S/C8H8N4/c9-11-8-7-4-2-1-3-6(7)5-10-12-8/h1-5H,9H2,(H,11,12)',
        'InChI=1S/CH4N2O2/c2-1(4)3-5/h5H,(H3,2,3,4)',
        'InChI=1S/CH3O5P/c2-1(3)7(4,5)6/h(H,2,3)(H2,4,5,6)',
        'InChI=1S/C4H11NO3/c5-4(1-6,2-7)3-8/h6-8H,1-3,5H2',
        'InChI=1S/C9H9Cl2N3O/c10-6-2-1-3-7(11)5(6)4-8(15)14-9(12)13/h1-3H,4H2,(H4,12,13,14,15)',
    }
    self.assertTrue(expected_inchis.issubset(serialised_tokens(['CMKAR1'], 'inchi', '\t')))

    expected_inchi_keys = {
        'OELFLUMRDSZNSF-BRWVUGGUSA-N',
        'KJPRLNWUNMBNBZ-QPJJXVBHSA-N',
        'VEPKQEUBKLEPRA-UHFFFAOYSA-N',
        'ACGUYXCXAPNIKK-UHFFFAOYSA-N',
        'MVGSNCBCUWPVDA-MFOYZWKCSA-N',
        'JZHXLYHEWUWHQL-LVYIWIAJSA-N',
        'CUVBGWMAORETGV-UHFFFAOYSA-N',
        'DVSMVUMYJDOPJQ-UHFFFAOYSA-N',
    }
    self.assertTrue(
        expected_inchi_keys.issubset(serialised_tokens(['ADRA2L1', 'VIPR1'], 'inchi_key', ',')))
def main():
    """Resolve each non-empty input line as a name and write it serialised.

    Options come from ``get_options()``. Lines are read from
    ``options.input`` (stdin when unset) and results are written to
    ``options.output`` (stdout when unset) using the serializer selected by
    ``options.format``. When ``options.human`` is set the serializer's header
    is written first. When ``options.parent`` is set the resolved value is
    passed through ``get_parents`` before serialisation.

    Returns ``None``. If no serializer matches ``options.format`` an error is
    written to stderr and nothing is processed.
    """
    options = get_options()
    with open(options.input) if options.input else sys.stdin as in_f, \
            open(options.output, 'w') if options.output else sys.stdout as out_f:
        serializer_cls = get_serializer(options.format)
        if not serializer_cls:
            # write() accepts exactly one string; the former two-argument call
            # raised TypeError instead of reporting the problem.
            sys.stderr.write('Unsupported format: {}\n'.format(options.format))
            return
        if options.human:
            serializer_cls.write_header(out_f)
        for line in in_f:
            name = line.strip()
            if not name:
                continue
            resolved = None
            try:
                resolved = resolve(name, options.single)
            except Exception:
                # Deliberate best effort: a name that fails to resolve is still
                # serialised, with ``resolved`` left as None.
                pass
            if options.parent:
                resolved = get_parents(resolved)
            out_f.write(
                serializer_cls.serialize_line(resolved, human=options.human, name=name))
def test_inchi_serialiser(self):
    """Check the 'inchi' serializer's header and its output for 'Vigamox'.

    The expected line holds two tab-separated InChI strings followed by a
    newline. ``human=True`` alone is expected to leave the line unchanged;
    adding ``name`` is expected to prefix the name and a tab.
    """
    first_inchi = (
        'InChI=1S/C21H24FN3O4/c1-29-20-17-13(19(26)14(21(27)28)9-25(17)12-4-5-12)7-15(22)18(20)24-8-11'
        '-3-2-6-23-16(11)10-24/h7,9,11-12,16,23H,2-6,8,10H2,1H3,(H,27,28)/t11-,16+/m0/s1'
    )
    second_inchi = (
        'InChI=1S/C21'
        'H24FN3O4.ClH/c1-29-20-17-13(19(26)14(21(27)28)9-25(17)12-4-5-12)7-15(22)18(20)24-8-11-3-2-6-2'
        '3-16(11)10-24;/h7,9,11-12,16,23H,2-6,8,10H2,1H3,(H,27,28);1H/t11-,16+;/m0./s1'
    )
    plain_line = first_inchi + '\t' + second_inchi + '\n'
    named_line = 'Vigamox\t' + plain_line

    serialiser = get_serializer('inchi')
    mols = resolve('Vigamox')

    header = StringIO()
    serialiser.write_header(header)
    self.assertEqual(header.getvalue(), 'Name:\tInChI:\n')

    cases = (
        ({}, plain_line),
        ({'human': True}, plain_line),
        ({'human': True, 'name': 'Vigamox'}, named_line),
    )
    for kwargs, expected in cases:
        self.assertEqual(serialiser.serialize_line(mols, **kwargs), expected)
def main():
    """Map target identifiers from each input line to molecules and write them.

    Options come from ``get_options()``. Lines are read from
    ``options.input`` (stdin when unset) and results are written to
    ``options.output`` (stdout when unset) using the serializer selected by
    ``options.dest_format``. Only the first whitespace-delimited field of a
    line is used; it is split on commas into identifiers. Each identifier is
    passed to ``resolve_target`` and falsy results are dropped. The remaining
    values are handed to ``targets_to_mols`` and also joined with commas to
    form the ``name`` given to the serializer.

    Returns ``None``. If no serializer matches ``options.dest_format`` an
    error is written to stderr and nothing is processed.
    """
    options = get_options()
    with open(options.input) if options.input else sys.stdin as in_f, \
            open(options.output, 'w') if options.output else sys.stdout as out_f:
        serializer_cls = get_serializer(options.dest_format)
        if not serializer_cls:
            # write() accepts exactly one string; the former two-argument call
            # raised TypeError instead of reporting the problem.
            sys.stderr.write('Unsupported format: {}\n'.format(options.dest_format))
            return
        if options.human:
            serializer_cls.write_header(out_f)
        for line in in_f:
            fields = line.split()
            if not fields:
                # Blank or whitespace-only line: indexing the first field
                # would raise IndexError, so skip it.
                continue
            identifiers = fields[0].split(',')
            valid_identifiers = []
            for identifier in identifiers:
                target = resolve_target(identifier)
                if not target:
                    continue
                valid_identifiers.append(target)
            mols = targets_to_mols(valid_identifiers,
                                   only_ids=(options.dest_format == 'chembl_id'),
                                   include_parents=options.parent,
                                   chunk_size=int(options.chunk))
            out_f.write(serializer_cls.serialize_line(mols, human=options.human,
                                                      name=','.join(valid_identifiers)))
def main():
    """Resolve each non-empty input line as a name and write it serialised.

    Options come from ``get_options()``. Lines are read from
    ``options.input`` (stdin when unset) and results are written to
    ``options.output`` (stdout when unset) using the serializer selected by
    ``options.format``. When ``options.human`` is set the serializer's header
    is written first. When ``options.parent`` is set the resolved value is
    passed through ``get_parents`` before serialisation.

    Returns ``None``. If no serializer matches ``options.format`` an error is
    written to stderr and nothing is processed.
    """
    options = get_options()
    with open(options.input) if options.input else sys.stdin as in_f, \
            open(options.output, 'w') if options.output else sys.stdout as out_f:
        serializer_cls = get_serializer(options.format)
        if not serializer_cls:
            # write() accepts exactly one string; the former two-argument call
            # raised TypeError instead of reporting the problem.
            sys.stderr.write('Unsupported format: {}\n'.format(options.format))
            return
        if options.human:
            serializer_cls.write_header(out_f)
        for line in in_f:
            name = line.strip()
            if not name:
                continue
            resolved = None
            try:
                resolved = resolve(name, options.single)
            except Exception:
                # Deliberate best effort: a name that fails to resolve is still
                # serialised, with ``resolved`` left as None.
                pass
            if options.parent:
                resolved = get_parents(resolved)
            out_f.write(
                serializer_cls.serialize_line(resolved, human=options.human, name=name))
def test_targets_to_mols(self):
    """Check that known molecules appear among those returned for given targets.

    For each gene name the first ``resolve_target`` hit supplies the target
    ChEMBL ID, ``targets_to_mols`` is called on those IDs, and the result is
    serialised in one output format. Every case only asserts that a known
    subset of tokens is present in the serialised line.
    """

    def serialised_tokens(gene_names, fmt, separator=None):
        # separator=None splits on any whitespace, exactly like a bare split().
        target_ids = [resolve_target(gene)[0]['target_chembl_id'] for gene in gene_names]
        molecules = targets_to_mols(target_ids)
        line = get_serializer(fmt).serialize_line(molecules)
        return set(line.strip().split(separator))

    expected_chembl_ids = {
        'CHEMBL6', 'CHEMBL25', 'CHEMBL10', 'CHEMBL30',
        'CHEMBL50', 'CHEMBL71', 'CHEMBL28', 'CHEMBL66',
    }
    self.assertTrue(expected_chembl_ids.issubset(serialised_tokens(['POLK'], 'chembl_id', ',')))

    expected_smiles = {
        'CN1C(=O)N(CC(=O)c2ccccc2)C(=O)c3c1nc(N4CCC[C@H](N)C4)n3CC=C(C)C',
        'C[C@]12CCC(=O)C=C1CC[C@@H]3[C@@H]2CC[C@@]4(C)[C@H]3CC[C@@]4(O)C#C',
        'CS(=O)(=O)N1CCC(CN([C@@H]2CC[C@@]3(CC3C2)c4cccc(c4)C#N)C(=O)Nc5ccc(F)c(F)c5)CC1',
        'O=S(=O)(NCCCN1CCN(CC1)c2nsc3ccccc23)c4cccc5scnc45',
        'Cl.CC(C)n1nc(C(=O)NCC2CCN(CCNS(=O)(=O)C)CC2)c3ccccc13',
        'CC1CN(CC(=O)N[C@@H]2C3CC4CC2C[C@@](C4)(C3)C(=O)N)S(=O)(=O)N(C1)c5c(Cl)cc(Cl)cc5Cl',
        'Oc1cccc2CC(CN3CCC4(CC3)CCc5ccccc45)NCc12',
        '[O-][N+](=O)c1ccc(cc1)c2nc(c3ccc(F)cc3)c([nH]2)c4ccncc4',
    }
    self.assertTrue(expected_smiles.issubset(serialised_tokens(['HERG'], 'smiles')))

    expected_inchis = {
        'InChI=1S/C7H6O2/c8-7(9)6-4-2-1-3-5-6/h1-5H,(H,8,9)',
        'InChI=1S/C11H7NS/c13-8-12-11-7-3-5-9-4-1-2-6-10(9)11/h1-7H',
        'InChI=1S/CHCl3/c2-1(3)4/h1H',
        'InChI=1S/H4N2/c1-2/h1-2H2',
        'InChI=1S/C8H8N4/c9-11-8-7-4-2-1-3-6(7)5-10-12-8/h1-5H,9H2,(H,11,12)',
        'InChI=1S/CH4N2O2/c2-1(4)3-5/h5H,(H3,2,3,4)',
        'InChI=1S/CH3O5P/c2-1(3)7(4,5)6/h(H,2,3)(H2,4,5,6)',
        'InChI=1S/C4H11NO3/c5-4(1-6,2-7)3-8/h6-8H,1-3,5H2',
        'InChI=1S/C9H9Cl2N3O/c10-6-2-1-3-7(11)5(6)4-8(15)14-9(12)13/h1-3H,4H2,(H4,12,13,14,15)',
    }
    self.assertTrue(expected_inchis.issubset(serialised_tokens(['CMKAR1'], 'inchi', '\t')))

    expected_inchi_keys = {
        'OELFLUMRDSZNSF-BRWVUGGUSA-N',
        'KJPRLNWUNMBNBZ-QPJJXVBHSA-N',
        'VEPKQEUBKLEPRA-UHFFFAOYSA-N',
        'ACGUYXCXAPNIKK-UHFFFAOYSA-N',
        'MVGSNCBCUWPVDA-MFOYZWKCSA-N',
        'JZHXLYHEWUWHQL-LVYIWIAJSA-N',
        'CUVBGWMAORETGV-UHFFFAOYSA-N',
        'DVSMVUMYJDOPJQ-UHFFFAOYSA-N',
    }
    self.assertTrue(
        expected_inchi_keys.issubset(serialised_tokens(['ADRA2L1', 'VIPR1'], 'inchi_key', ',')))
def test_sdf_serialiser(self):
    """Check that the 'sdf' and 'mol' serializers both emit the ``sdf`` fixture.

    For each format the header is expected to be empty, and every
    ``serialize_line`` variant (plain, ``human=True``, ``human=True`` with a
    name) is expected to equal the module-level ``sdf`` value.
    """
    for fmt in ('sdf', 'mol'):
        serialiser = get_serializer(fmt)
        mols = resolve('Vigamox')

        header = StringIO()
        serialiser.write_header(header)
        self.assertEqual(header.getvalue(), '')

        # The serialised text doubles as the failure message for easier debugging.
        self.assertEqual(serialiser.serialize_line(mols), sdf,
                         serialiser.serialize_line(mols))
        for kwargs in ({'human': True}, {'human': True, 'name': 'Vigamox'}):
            self.assertEqual(serialiser.serialize_line(mols, **kwargs), sdf)
def test_chembl_id_serialiser(self):
    """Check the 'chembl_id' serializer's header and its output for 'Vigamox'.

    ``human=True`` alone is expected to leave the line unchanged; adding
    ``name`` is expected to prefix the name and a tab.
    """
    ids_line = 'CHEMBL32,CHEMBL1200735\n'

    serialiser = get_serializer('chembl_id')
    mols = resolve('Vigamox')

    header = StringIO()
    serialiser.write_header(header)
    self.assertEqual(header.getvalue(), 'Name:\tChEMBL ID:\n')

    cases = (
        ({}, ids_line),
        ({'human': True}, ids_line),
        ({'human': True, 'name': 'Vigamox'}, 'Vigamox\t' + ids_line),
    )
    for kwargs, expected in cases:
        self.assertEqual(serialiser.serialize_line(mols, **kwargs), expected)
def test_chembl_key_serialiser(self):
    """Check the 'inchi_key' serializer's header and its output for 'Vigamox'.

    ``human=True`` alone is expected to leave the line unchanged; adding
    ``name`` is expected to prefix the name and a tab.
    """
    keys_line = 'FABPRXSRWADJSP-MEDUHNTESA-N,IDIIJJHBXUESQI-DFIJPDEKSA-N\n'

    serialiser = get_serializer('inchi_key')
    mols = resolve('Vigamox')

    header = StringIO()
    serialiser.write_header(header)
    self.assertEqual(header.getvalue(), 'Name:\tInChI Key:\n')

    cases = (
        ({}, keys_line),
        ({'human': True}, keys_line),
        ({'human': True, 'name': 'Vigamox'}, 'Vigamox\t' + keys_line),
    )
    for kwargs, expected in cases:
        self.assertEqual(serialiser.serialize_line(mols, **kwargs), expected)
def test_mols_to_targets(self):
    """Check ``mols_to_targets`` results across several output formats.

    Covers an organism-filtered lookup with and without parents (exact set
    match), subset checks for gene names, ChEMBL IDs and UniProt accessions,
    and that ``only_ids=True`` yields the same ChEMBL ID set as the default.
    """

    def target_tokens(fmt, molecule_ids, **lookup_kwargs):
        # Look up first, then fetch the serializer, then split the line on commas.
        found = mols_to_targets(molecule_ids, **lookup_kwargs)
        line = get_serializer(fmt).serialize_line(found)
        return set(line.strip().split(','))

    self.assertEqual(
        target_tokens('uniprot', ['CHEMBL819'], organism='Escherichia coli'),
        {'A1E3K9', 'P62593', 'P35695'})
    self.assertEqual(
        target_tokens('uniprot', ['CHEMBL819'], organism='Escherichia coli',
                      include_parents=True),
        {'A1E3K9', 'P62593', 'P35695', 'P00811'})

    expected_genes = {
        'POLK', 'ADRA2RL2', 'CCR2', 'CHRM5', 'ADRA2L1',
        'CMKAR1', 'PTPRC', 'HADH2', 'ITGAB', 'VIPR1',
    }
    self.assertTrue(expected_genes.issubset(target_tokens('gene_name', ['CHEMBL25', 'CHEMBL1'])))

    expected_target_ids = {
        'CHEMBL213', 'CHEMBL1916', 'CHEMBL205',
        'CHEMBL4071', 'CHEMBL1909043', 'CHEMBL2622',
    }
    targets_a = target_tokens('chembl_id', ['CHEMBL2', 'CHEMBL1737'])
    self.assertTrue(expected_target_ids.issubset(targets_a))
    targets_b = target_tokens('chembl_id', ['CHEMBL2', 'CHEMBL1737'], only_ids=True)
    self.assertTrue(expected_target_ids.issubset(targets_b))
    self.assertEqual(targets_a, targets_b)

    drugs = ['Viagra', 'Gleevec']
    drug_ids = [resolve(drug)[0]['molecule_chembl_id'] for drug in drugs]
    expected_accessions = {
        'Q9BQI3', 'P00523', 'O15111', 'P22612', 'P25021',
        'P16591', 'Q13153', 'Q8WXR4', 'P28564',
    }
    self.assertTrue(expected_accessions.issubset(target_tokens('uniprot', drug_ids)))
def main():
    """Run a similarity search for the identifiers on each input line.

    Options come from ``get_options()``. ``options.threshold`` must convert
    to an integer in [70, 100]; it is passed to the similarity filter as a
    string. ``options.source_format`` must be one of
    ``AVAILABLE_SOURCE_FORMATS``; for ``'sdf'`` the input is first passed
    through ``convert_to_smiles`` and the result is used as the line source.
    Only the first whitespace-delimited field of a line is used; it is split
    on commas. Identifiers matching ``chembl_id_regex`` or ``smiles_regex``
    are searched; results are de-duplicated on ``'molecule_chembl_id'`` and
    written with the serializer selected by ``options.dest_format``.

    Returns ``None``. Invalid options are reported on stderr and end the run
    early.
    """
    similarity = new_client.similarity
    options = get_options()

    try:
        threshold = int(options.threshold)
    except (TypeError, ValueError):
        # Only conversion failures are expected here; a bare ``except`` would
        # also swallow KeyboardInterrupt and SystemExit.
        sys.stderr.write('Threshold should be an integer in range [70-100]')
        return
    if threshold not in range(70, 101):
        sys.stderr.write('Threshold should be an integer in range [70-100]')
        return
    threshold = str(threshold)

    source_format = options.source_format.lower()
    if source_format not in AVAILABLE_SOURCE_FORMATS:
        # write() accepts exactly one string; the former two-argument call
        # raised TypeError instead of reporting the problem.
        sys.stderr.write('Unsupported source format: {}\n'.format(options.source_format))
        return

    inp = sys.stdin
    if source_format == 'sdf':
        with open(options.input) if options.input else sys.stdin as in_f:
            # The converted data replaces the original input, so clear the
            # path to stop it being reopened below.
            options.input = None
            inp = convert_to_smiles(in_f)

    with open(options.input) if options.input else inp as in_f, \
            open(options.output, 'w') if options.output else sys.stdout as out_f:
        serializer_cls = get_serializer(options.dest_format)
        if not serializer_cls:
            sys.stderr.write('Unsupported format: {}\n'.format(options.dest_format))
            return
        if options.human:
            serializer_cls.write_header(out_f)
        for line in in_f:
            fields = line.split()
            # Lines read from a file keep their newline, so a plain truthiness
            # test never catches blank lines; check the split fields instead
            # to avoid IndexError. A 'smiles' header line is skipped as well.
            if not fields or line.lower().startswith('smiles'):
                continue
            identifiers = fields[0].split(',')
            valid_identifiers = []
            sim = []
            for identifier in identifiers:
                if chembl_id_regex.match(identifier):
                    valid_identifiers.append(identifier)
                    sim.extend(list(similarity.filter(chembl_id=identifier,
                                                      similarity=threshold)))
                elif smiles_regex.match(identifier):
                    valid_identifiers.append(identifier)
                    sim.extend(list(similarity.filter(smiles=identifier,
                                                      similarity=threshold)))
            # Keep one entry per molecule_chembl_id (the last one seen wins).
            sim = list({v['molecule_chembl_id']: v for v in sim}.values())
            out_f.write(serializer_cls.serialize_line(sim, human=options.human,
                                                      name=','.join(valid_identifiers)))
def test_chembl_id_serialiser(self):
    """Check the 'chembl_id' serializer's header and its output for 'Vigamox'.

    ``human=True`` alone is expected to leave the line unchanged; adding
    ``name`` is expected to prefix the name and a tab.
    """
    ids_line = 'CHEMBL32,CHEMBL1200735\n'

    serialiser = get_serializer('chembl_id')
    mols = resolve('Vigamox')

    header = StringIO()
    serialiser.write_header(header)
    self.assertEqual(header.getvalue(), 'Name:\tChEMBL ID:\n')

    cases = (
        ({}, ids_line),
        ({'human': True}, ids_line),
        ({'human': True, 'name': 'Vigamox'}, 'Vigamox\t' + ids_line),
    )
    for kwargs, expected in cases:
        self.assertEqual(serialiser.serialize_line(mols, **kwargs), expected)
def test_sdf_serialiser(self):
    """Check that the 'sdf' and 'mol' serializers both emit the ``sdf`` fixture.

    For each format the header is expected to be empty, and every
    ``serialize_line`` variant (plain, ``human=True``, ``human=True`` with a
    name) is expected to equal the module-level ``sdf`` value.
    """
    for fmt in ('sdf', 'mol'):
        serialiser = get_serializer(fmt)
        mols = resolve('Vigamox')

        header = StringIO()
        serialiser.write_header(header)
        self.assertEqual(header.getvalue(), '')

        # The serialised text doubles as the failure message for easier debugging.
        self.assertEqual(serialiser.serialize_line(mols), sdf,
                         serialiser.serialize_line(mols))
        for kwargs in ({'human': True}, {'human': True, 'name': 'Vigamox'}):
            self.assertEqual(serialiser.serialize_line(mols, **kwargs), sdf)
def test_smiles_serialiser(self):
    """Check the 'smi' serializer's header and its output for 'Vigamox'.

    The expected output is one SMILES per line. ``human=True`` alone is
    expected to leave the lines unchanged; adding ``name`` is expected to
    append a space and the name to each line.
    """
    first_smiles = 'COc1c(N2C[C@@H]3CCCN[C@@H]3C2)c(F)cc4C(=O)C(=CN(C5CC5)c14)C(=O)O'
    second_smiles = 'Cl.COc1c(N2C[C@@H]3CCCN[C@@H]3C2)c(F)cc4C(=O)C(=CN(C5CC5)c14)C(=O)O'
    plain_lines = first_smiles + '\n' + second_smiles + '\n'
    named_lines = first_smiles + ' Vigamox\n' + second_smiles + ' Vigamox\n'

    serialiser = get_serializer('smi')
    mols = resolve('Vigamox')

    header = StringIO()
    serialiser.write_header(header)
    self.assertEqual(header.getvalue(), 'SMILES Name\n')

    cases = (
        ({}, plain_lines),
        ({'human': True}, plain_lines),
        ({'human': True, 'name': 'Vigamox'}, named_lines),
    )
    for kwargs, expected in cases:
        self.assertEqual(serialiser.serialize_line(mols, **kwargs), expected)
def test_chembl_key_serialiser(self):
    """Check the 'inchi_key' serializer's header and its output for 'Vigamox'.

    ``human=True`` alone is expected to leave the line unchanged; adding
    ``name`` is expected to prefix the name and a tab.
    """
    keys_line = 'FABPRXSRWADJSP-MEDUHNTESA-N,IDIIJJHBXUESQI-DFIJPDEKSA-N\n'

    serialiser = get_serializer('inchi_key')
    mols = resolve('Vigamox')

    header = StringIO()
    serialiser.write_header(header)
    self.assertEqual(header.getvalue(), 'Name:\tInChI Key:\n')

    cases = (
        ({}, keys_line),
        ({'human': True}, keys_line),
        ({'human': True, 'name': 'Vigamox'}, 'Vigamox\t' + keys_line),
    )
    for kwargs, expected in cases:
        self.assertEqual(serialiser.serialize_line(mols, **kwargs), expected)
def main():
    """Run a substructure search for the identifiers on each input line.

    Options come from ``get_options()``. ``options.source_format`` must be
    one of ``AVAILABLE_SOURCE_FORMATS``; for ``'sdf'`` the input is first
    passed through ``convert_to_smiles`` and the result is used as the line
    source. Only the first whitespace-delimited field of a line is used; it
    is split on commas. Identifiers matching ``chembl_id_regex`` or
    ``smiles_regex`` are searched; results are de-duplicated on
    ``'molecule_chembl_id'`` and written with the serializer selected by
    ``options.dest_format``.

    Returns ``None``. Invalid options are reported on stderr and end the run
    early.
    """
    substructure = new_client.substructure
    options = get_options()

    source_format = options.source_format.lower()
    if source_format not in AVAILABLE_SOURCE_FORMATS:
        # write() accepts exactly one string; the former two-argument call
        # raised TypeError instead of reporting the problem.
        sys.stderr.write('Unsupported source format: {}\n'.format(options.source_format))
        return

    inp = sys.stdin
    if source_format == 'sdf':
        with open(options.input) if options.input else sys.stdin as in_f:
            # The converted data replaces the original input, so clear the
            # path to stop it being reopened below.
            options.input = None
            inp = convert_to_smiles(in_f)

    with open(options.input) if options.input else inp as in_f, \
            open(options.output, 'w') if options.output else sys.stdout as out_f:
        serializer_cls = get_serializer(options.dest_format)
        if not serializer_cls:
            sys.stderr.write('Unsupported format: {}\n'.format(options.dest_format))
            return
        if options.human:
            serializer_cls.write_header(out_f)
        for line in in_f:
            fields = line.split()
            # Lines read from a file keep their newline, so a plain truthiness
            # test never catches blank lines; check the split fields instead
            # to avoid IndexError. A 'smiles' header line is skipped as well.
            if not fields or line.lower().startswith('smiles'):
                continue
            identifiers = fields[0].split(',')
            valid_identifiers = []
            sub = []
            for identifier in identifiers:
                if chembl_id_regex.match(identifier):
                    valid_identifiers.append(identifier)
                    sub.extend(list(substructure.filter(chembl_id=identifier)))
                elif smiles_regex.match(identifier):
                    valid_identifiers.append(identifier)
                    sub.extend(list(substructure.filter(smiles=identifier)))
            # Keep one entry per molecule_chembl_id (the last one seen wins).
            sub = list({v['molecule_chembl_id']: v for v in sub}.values())
            out_f.write(serializer_cls.serialize_line(sub, human=options.human,
                                                      name=','.join(valid_identifiers)))
def main():
    """Run a substructure search for the identifiers on each input line.

    Options come from ``get_options()``. ``options.source_format`` must be
    one of ``AVAILABLE_SOURCE_FORMATS``; for ``'sdf'`` the input is first
    passed through ``convert_to_smiles`` and the result is used as the line
    source. Only the first whitespace-delimited field of a line is used; it
    is split on commas. Identifiers matching ``chembl_id_regex`` or
    ``smiles_regex`` are searched; results are de-duplicated on
    ``'molecule_chembl_id'`` and written with the serializer selected by
    ``options.dest_format``.

    Returns ``None``. Invalid options are reported on stderr and end the run
    early.
    """
    substructure = new_client.substructure
    options = get_options()

    source_format = options.source_format.lower()
    if source_format not in AVAILABLE_SOURCE_FORMATS:
        # write() accepts exactly one string; the former two-argument call
        # raised TypeError instead of reporting the problem.
        sys.stderr.write('Unsupported source format: {}\n'.format(options.source_format))
        return

    inp = sys.stdin
    if source_format == 'sdf':
        with open(options.input) if options.input else sys.stdin as in_f:
            # The converted data replaces the original input, so clear the
            # path to stop it being reopened below.
            options.input = None
            inp = convert_to_smiles(in_f)

    with open(options.input) if options.input else inp as in_f, \
            open(options.output, 'w') if options.output else sys.stdout as out_f:
        serializer_cls = get_serializer(options.dest_format)
        if not serializer_cls:
            sys.stderr.write('Unsupported format: {}\n'.format(options.dest_format))
            return
        if options.human:
            serializer_cls.write_header(out_f)
        for line in in_f:
            fields = line.split()
            # Lines read from a file keep their newline, so a plain truthiness
            # test never catches blank lines; check the split fields instead
            # to avoid IndexError. A 'smiles' header line is skipped as well.
            if not fields or line.lower().startswith('smiles'):
                continue
            identifiers = fields[0].split(',')
            valid_identifiers = []
            sub = []
            for identifier in identifiers:
                if chembl_id_regex.match(identifier):
                    valid_identifiers.append(identifier)
                    sub.extend(list(substructure.filter(chembl_id=identifier)))
                elif smiles_regex.match(identifier):
                    valid_identifiers.append(identifier)
                    sub.extend(list(substructure.filter(smiles=identifier)))
            # Keep one entry per molecule_chembl_id (the last one seen wins).
            sub = list({v['molecule_chembl_id']: v for v in sub}.values())
            out_f.write(serializer_cls.serialize_line(sub, human=options.human,
                                                      name=','.join(valid_identifiers)))
def main():
    """Map molecule identifiers from each input line to targets and write them.

    Options come from ``get_options()``. ``options.source_format`` must be
    one of ``AVAILABLE_SOURCE_FORMATS``; for ``'sdf'`` the input is first
    passed through ``convert_to_smiles`` and the result is used as the line
    source. Only the first whitespace-delimited field of a line is used; it
    is split on commas. Identifiers matching ``chembl_id_regex`` are used
    as-is; those matching ``smiles_regex`` are passed to ``resolve`` and the
    ``'molecule_chembl_id'`` of every hit is used. The collected IDs go to
    ``mols_to_targets`` and the result is written with the serializer
    selected by ``options.dest_format``.

    Returns ``None``. Invalid options are reported on stderr and end the run
    early.
    """
    options = get_options()

    source_format = options.source_format.lower()
    if source_format not in AVAILABLE_SOURCE_FORMATS:
        # write() accepts exactly one string; the former two-argument call
        # raised TypeError instead of reporting the problem.
        sys.stderr.write('Unsupported source format: {}\n'.format(options.source_format))
        return

    inp = sys.stdin
    if source_format == 'sdf':
        with open(options.input) if options.input else sys.stdin as in_f:
            # The converted data replaces the original input, so clear the
            # path to stop it being reopened below.
            options.input = None
            inp = convert_to_smiles(in_f)

    with open(options.input) if options.input else inp as in_f, \
            open(options.output, 'w') if options.output else sys.stdout as out_f:
        serializer_cls = get_serializer(options.dest_format)
        if not serializer_cls:
            sys.stderr.write('Unsupported format: {}\n'.format(options.dest_format))
            return
        if options.human:
            serializer_cls.write_header(out_f)
        for line in in_f:
            fields = line.split()
            # Lines read from a file keep their newline, so a plain truthiness
            # test never catches blank lines; check the split fields instead
            # to avoid IndexError. A 'smiles' header line is skipped as well.
            if not fields or line.lower().startswith('smiles'):
                continue
            identifiers = fields[0].split(',')
            valid_identifiers = []
            for identifier in identifiers:
                if chembl_id_regex.match(identifier):
                    valid_identifiers.append(identifier)
                elif smiles_regex.match(identifier):
                    valid_identifiers.extend(
                        [x['molecule_chembl_id'] for x in resolve(identifier)])
            targets = mols_to_targets(valid_identifiers,
                                      organism=options.organism,
                                      only_ids=(options.dest_format == 'chembl_id'),
                                      include_parents=options.parent,
                                      chunk_size=int(options.chunk))
            out_f.write(serializer_cls.serialize_line(targets, human=options.human,
                                                      name=','.join(valid_identifiers)))
def test_inchi_serialiser(self):
    """Check the 'inchi' serializer's header and its output for 'Vigamox'.

    The expected line holds two tab-separated InChI strings followed by a
    newline. ``human=True`` alone is expected to leave the line unchanged;
    adding ``name`` is expected to prefix the name and a tab.
    """
    first_inchi = (
        'InChI=1S/C21H24FN3O4/c1-29-20-17-13(19(26)14(21(27)28)9-25(17)12-4-5-12)7-15(22)18(20)24-8-11'
        '-3-2-6-23-16(11)10-24/h7,9,11-12,16,23H,2-6,8,10H2,1H3,(H,27,28)/t11-,16+/m0/s1'
    )
    second_inchi = (
        'InChI=1S/C21'
        'H24FN3O4.ClH/c1-29-20-17-13(19(26)14(21(27)28)9-25(17)12-4-5-12)7-15(22)18(20)24-8-11-3-2-6-2'
        '3-16(11)10-24;/h7,9,11-12,16,23H,2-6,8,10H2,1H3,(H,27,28);1H/t11-,16+;/m0./s1'
    )
    plain_line = first_inchi + '\t' + second_inchi + '\n'
    named_line = 'Vigamox\t' + plain_line

    serialiser = get_serializer('inchi')
    mols = resolve('Vigamox')

    header = StringIO()
    serialiser.write_header(header)
    self.assertEqual(header.getvalue(), 'Name:\tInChI:\n')

    cases = (
        ({}, plain_line),
        ({'human': True}, plain_line),
        ({'human': True, 'name': 'Vigamox'}, named_line),
    )
    for kwargs, expected in cases:
        self.assertEqual(serialiser.serialize_line(mols, **kwargs), expected)
def test_smiles_serialiser(self):
    """Check the 'smi' serializer's header and its output for 'Vigamox'.

    The expected output is one SMILES per line. ``human=True`` alone is
    expected to leave the lines unchanged; adding ``name`` is expected to
    append a space and the name to each line.
    """
    first_smiles = 'COc1c(N2C[C@@H]3CCCN[C@@H]3C2)c(F)cc4C(=O)C(=CN(C5CC5)c14)C(=O)O'
    second_smiles = 'Cl.COc1c(N2C[C@@H]3CCCN[C@@H]3C2)c(F)cc4C(=O)C(=CN(C5CC5)c14)C(=O)O'
    plain_lines = first_smiles + '\n' + second_smiles + '\n'
    named_lines = first_smiles + ' Vigamox\n' + second_smiles + ' Vigamox\n'

    serialiser = get_serializer('smi')
    mols = resolve('Vigamox')

    header = StringIO()
    serialiser.write_header(header)
    self.assertEqual(header.getvalue(), 'SMILES Name\n')

    cases = (
        ({}, plain_lines),
        ({'human': True}, plain_lines),
        ({'human': True, 'name': 'Vigamox'}, named_lines),
    )
    for kwargs, expected in cases:
        self.assertEqual(serialiser.serialize_line(mols, **kwargs), expected)
def test_mols_to_targets(self):
    """Check ``mols_to_targets`` results across several output formats.

    Covers an organism-filtered lookup with and without parents (exact set
    match), subset checks for gene names, ChEMBL IDs and UniProt accessions,
    and that ``only_ids=True`` yields the same ChEMBL ID set as the default.
    """

    def target_tokens(fmt, molecule_ids, **lookup_kwargs):
        # Look up first, then fetch the serializer, then split the line on commas.
        found = mols_to_targets(molecule_ids, **lookup_kwargs)
        line = get_serializer(fmt).serialize_line(found)
        return set(line.strip().split(','))

    self.assertEqual(
        target_tokens('uniprot', ['CHEMBL819'], organism='Escherichia coli'),
        {'A1E3K9', 'P62593', 'P35695'})
    self.assertEqual(
        target_tokens('uniprot', ['CHEMBL819'], organism='Escherichia coli',
                      include_parents=True),
        {'A1E3K9', 'P62593', 'P35695', 'P00811'})

    expected_genes = {
        'POLK', 'ADRA2RL2', 'CCR2', 'CHRM5', 'ADRA2L1',
        'CMKAR1', 'PTPRC', 'HADH2', 'ITGAB', 'VIPR1',
    }
    self.assertTrue(expected_genes.issubset(target_tokens('gene_name', ['CHEMBL25', 'CHEMBL1'])))

    expected_target_ids = {
        'CHEMBL213', 'CHEMBL1916', 'CHEMBL205',
        'CHEMBL4071', 'CHEMBL1909043', 'CHEMBL2622',
    }
    targets_a = target_tokens('chembl_id', ['CHEMBL2', 'CHEMBL1737'])
    self.assertTrue(expected_target_ids.issubset(targets_a))
    targets_b = target_tokens('chembl_id', ['CHEMBL2', 'CHEMBL1737'], only_ids=True)
    self.assertTrue(expected_target_ids.issubset(targets_b))
    self.assertEqual(targets_a, targets_b)

    drugs = ['Viagra', 'Gleevec']
    drug_ids = [resolve(drug)[0]['molecule_chembl_id'] for drug in drugs]
    expected_accessions = {
        'Q9BQI3', 'P00523', 'O15111', 'P22612', 'P25021',
        'P16591', 'Q13153', 'Q8WXR4', 'P28564',
    }
    self.assertTrue(expected_accessions.issubset(target_tokens('uniprot', drug_ids)))
def main():
    """Run a similarity search for the identifiers on each input line.

    Options come from ``get_options()``. ``options.threshold`` must convert
    to an integer in [70, 100]; it is passed to the similarity filter as a
    string. ``options.source_format`` must be one of
    ``AVAILABLE_SOURCE_FORMATS``; for ``'sdf'`` the input is first passed
    through ``convert_to_smiles`` and the result is used as the line source.
    Only the first whitespace-delimited field of a line is used; it is split
    on commas. Identifiers matching ``chembl_id_regex`` or ``smiles_regex``
    are searched; results are de-duplicated on ``'molecule_chembl_id'`` and
    written with the serializer selected by ``options.dest_format``.

    Returns ``None``. Invalid options are reported on stderr and end the run
    early.
    """
    similarity = new_client.similarity
    options = get_options()

    try:
        threshold = int(options.threshold)
    except (TypeError, ValueError):
        # Only conversion failures are expected here; a bare ``except`` would
        # also swallow KeyboardInterrupt and SystemExit.
        sys.stderr.write('Threshold should be an integer in range [70-100]')
        return
    if threshold not in range(70, 101):
        sys.stderr.write('Threshold should be an integer in range [70-100]')
        return
    threshold = str(threshold)

    source_format = options.source_format.lower()
    if source_format not in AVAILABLE_SOURCE_FORMATS:
        # write() accepts exactly one string; the former two-argument call
        # raised TypeError instead of reporting the problem.
        sys.stderr.write('Unsupported source format: {}\n'.format(options.source_format))
        return

    inp = sys.stdin
    if source_format == 'sdf':
        with open(options.input) if options.input else sys.stdin as in_f:
            # The converted data replaces the original input, so clear the
            # path to stop it being reopened below.
            options.input = None
            inp = convert_to_smiles(in_f)

    with open(options.input) if options.input else inp as in_f, \
            open(options.output, 'w') if options.output else sys.stdout as out_f:
        serializer_cls = get_serializer(options.dest_format)
        if not serializer_cls:
            sys.stderr.write('Unsupported format: {}\n'.format(options.dest_format))
            return
        if options.human:
            serializer_cls.write_header(out_f)
        for line in in_f:
            fields = line.split()
            # Lines read from a file keep their newline, so a plain truthiness
            # test never catches blank lines; check the split fields instead
            # to avoid IndexError. A 'smiles' header line is skipped as well.
            if not fields or line.lower().startswith('smiles'):
                continue
            identifiers = fields[0].split(',')
            valid_identifiers = []
            sim = []
            for identifier in identifiers:
                if chembl_id_regex.match(identifier):
                    valid_identifiers.append(identifier)
                    sim.extend(list(similarity.filter(chembl_id=identifier,
                                                      similarity=threshold)))
                elif smiles_regex.match(identifier):
                    valid_identifiers.append(identifier)
                    sim.extend(list(similarity.filter(smiles=identifier,
                                                      similarity=threshold)))
            # Keep one entry per molecule_chembl_id (the last one seen wins).
            sim = list({v['molecule_chembl_id']: v for v in sim}.values())
            out_f.write(serializer_cls.serialize_line(sim, human=options.human,
                                                      name=','.join(valid_identifiers)))