def main(): # Read in a SD or SMILES file - then write out into a specified directory parser = argparse.ArgumentParser( description= "Convert a SMILES or SDFile to input for Astex Fragment network.") parser.add_argument("--input") parser.add_argument("--input_format", default="smi") parser.add_argument("--base_dir") parser.add_argument("--iso_flag", default=True) args = parser.parse_args() attrs = [] id = 0 mols = parse_mols(args.input, args.input_format) for x in tqdm(mols): if x is None: continue attr = Attr( Chem.CanonSmiles(Chem.MolToSmiles(x, isomericSmiles=True)), ["EM", x.GetProp("_Name")], ) attrs.append(attr) id += 1 if not os.path.isdir(args.base_dir): os.mkdir(args.base_dir) # Build the network node_holder = NodeHolder(iso_flag=args.iso_flag) node_holder = build_network(attrs, node_holder) # Write the data out write_data(args.base_dir, node_holder, attrs)
def process_smiles(smiles, id='1', recurse=True, no_output=False, verbosity=0): attrs = [] # print("Original SMILES: " + args.smiles) # mol = Chem.MolFromSmiles(args.smiles) # if args.standardize: # mol = standardize(mol) # print("Standardized SMILES: " + Chem.MolToSmiles(mol)) # smiles = Chem.CanonSmiles(Chem.MolToSmiles(mol, isomericSmiles=True)) # print("Canonical SMILES: " + smiles) attr = Attr(smiles, ["EM", id]) attrs.append(attr) # Build the network node_holder = NodeHolder(iso_flag=False) max_frags = 0 # print("Recurse:",recurse) node_holder = build_network(attrs, node_holder, max_frags, smiles, verbosity, recurse=recurse) # Write the data out if not no_output: for node in node_holder.get_nodes(): print(str(node)) for edge in node_holder.get_edges(): print(str(edge)) for attr in attrs: print(str(attr)) print("Number of nodes: " + str(len(node_holder.get_nodes())) + " edges: " + str(len(node_holder.get_edges())))
def main(): # Read in a SD or SMILES file - then write out into a specified directory parser = argparse.ArgumentParser( description= "Convert a SMILES or SDFile to input for Astex Fragment network.") parser.add_argument("--input") parser.add_argument("--input_format", default="smi") parser.add_argument("--base_dir") parser.add_argument("--isomeric", dest="iso_flag", action="store_true") parser.add_argument("--non_isomeric", dest="iso_flag", action="store_false") group = parser.add_mutually_exclusive_group() group.add_argument("-v", dest="verbosity", action="store_const", const=1) group.add_argument("-vv", dest="verbosity", action="store_const", const=2) parser.set_defaults(verbosity=0) parser.set_defaults(iso_flag=True) args = parser.parse_args() # Do we have an input and base directory? if not args.input: print('ERROR: Must specify an input') sys.exit(1) if not os.path.isfile(args.input): print('ERROR: input (%s) does not exist' % args.input) sys.exit(1) if not args.base_dir: print('ERROR: Must specify a base directory') sys.exit(1) if not os.path.isdir(args.base_dir): print('ERROR:input base directory (%s) does not exist' % args.base_dir) sys.exit(1) tqdm_disable = True if args.verbosity else False attrs = [] id = 0 mols = parse_mols(args.input, args.input_format) for x in tqdm(mols, disable=tqdm_disable): print("Processing " + Chem.MolToSmiles(x, isomericSmiles=True)) if x is None: continue attr = Attr( Chem.CanonSmiles(Chem.MolToSmiles(x, isomericSmiles=True)), ["EM", x.GetProp("_Name")], ) attrs.append(attr) id += 1 if not os.path.isdir(args.base_dir): os.mkdir(args.base_dir) # Build the network node_holder = NodeHolder(iso_flag=args.iso_flag) max_frags = 0 node_holder = build_network(attrs, node_holder, max_frags, args.base_dir, args.verbosity) # Write the data out write_data(args.base_dir, node_holder, attrs)
def main(): # Read in a SD or SMILES file - then write out into a specified directory parser = argparse.ArgumentParser( description="Convert a SMILES to nodes, edges and attributes" ) parser.add_argument("--smiles") parser.add_argument("--id") parser.add_argument("--standardize", action="store_true") parser.add_argument("--isomeric", dest="iso_flag", action="store_true") parser.add_argument("--non_isomeric", dest="iso_flag", action="store_false") group = parser.add_mutually_exclusive_group() group.add_argument("-v", dest="verbosity", action="store_const", const=1) group.add_argument("-vv", dest="verbosity", action="store_const", const=2) parser.set_defaults(verbosity=0) parser.set_defaults(iso_flag=True) args = parser.parse_args() # Do we have an input and base directory? if not args.smiles: print('ERROR: Must specify a SMILES') sys.exit(1) attrs = [] print("Original SMILES: " + args.smiles) mol = Chem.MolFromSmiles(args.smiles) if args.standardize: mol = standardize(mol) print("Standardized SMILES: " + Chem.MolToSmiles(mol)) smiles = Chem.CanonSmiles(Chem.MolToSmiles(mol, isomericSmiles=True)) print("Canonical SMILES: " + smiles) id = args.id if id is None: id = "smiles1" attr = Attr(smiles, ["EM", id]) attrs.append(attr) # Build the network node_holder = NodeHolder(iso_flag=args.iso_flag) max_frags = 0 node_holder = build_network(attrs, node_holder, max_frags, smiles, args.verbosity) # Write the data out for node in node_holder.node_list: print(str(node)) for edge in node_holder.get_edges(): print(str(edge)) for attr in attrs: print(str(attr)) print("Number of nodes: " + str(len(node_holder.node_list))) print("Number of edges: " + str(len(node_holder.get_edges())))
def test_generate_nodes(self): """ Test we can generate nodes for the basic data. :return: """ try: nodes = [x for x in open("frag/tests/data/nodes.txt").readlines()] edges = [x.split() for x in open("frag/tests/data/edges.txt").readlines()] attrs = [ Attr(input_str=x) for x in open("frag/tests/data/attributes.txt").readlines() ] except IOError: nodes = [x for x in open("data/nodes.txt").readlines()] edges = [x.split() for x in open("data/edges.txt").readlines()] attrs = [Attr(input_str=x) for x in open("data/attributes.txt").readlines()] node_holder = NodeHolder(iso_flag=True) node_holder = build_network(attrs, node_holder) # Create the nodes and test with output self.assertEqual(len(node_holder.node_list), len(nodes)) # This doesn't work yet(we get 3687 edges - should be 3691 # Close enough - and the output looks right... self.assertEqual(len(node_holder.get_edges()), 3687)
def fragment_mol(smiles, base_dir, verbosity): attrs = [] attr = Attr(smiles, ["EM"]) attrs.append(attr) # Build the network node_holder = NodeHolder(iso_flag=False) max_frags = 0 node_holder = build_network(attrs, node_holder, max_frags, base_dir, verbosity=verbosity, recurse=False) return node_holder
def fragment_mol(self, smiles, verbosity=0) -> object: """Performs the fragmentation process for a SMILES. Returns: Fragdata object with Node/Edge data to write to files. """ # Note that in this version, only one SMILES is sent in here. # There seemed to be some strange issues with edges if a combined node holder is used TBI attrs = [] attr = Attr(smiles, ["EM"]) attrs.append(attr) node_holder = NodeHolder(iso_flag=False) node_holder = build_network(attrs, node_holder, base_dir=None, verbosity=verbosity, recurse=False) return node_holder
def fragment_mol(self, smiles, verbosity=0) -> object: """Performs the fragmentation process for a SMILES. Returns: NodeHolder object. """ attrs = [] attr = Attr(smiles, ["EM"]) attrs.append(attr) #print('fragment smiles: {}'.format(smiles)) # Build the network node_holder = NodeHolder(iso_flag=False) node_holder = build_network(attrs, node_holder, base_dir=None, verbosity=verbosity, recurse=False) return node_holder
def fragment_mols(input_smiles, verbosity=0, recurse=False): #print("Fragmenting", smiles) attrs = [] for smiles in input_smiles: attr = Attr(smiles, ["EM"]) attrs.append(attr) # Build the network # print("Processing", len(input_smiles), "mols") # print('INPUT ', ','.join(sorted(input_smiles))) node_holder = NodeHolder(iso_flag=False) node_holder = build_network(attrs, node_holder, base_dir=None, verbosity=verbosity, recurse=recurse) # output_smiles = [n.SMILES for n in node_holder.get_nodes()] # print('OUTPUT', ','.join(sorted(output_smiles))) frag_data = group_data(node_holder, input_smiles) # print("Groups:", len(frag_data.parent_data)) return frag_data
def fragment_mol(smiles, verbosity=0): attrs = [] attr = Attr(smiles, ["EM"]) attrs.append(attr) # Build the network node_holder = NodeHolder(iso_flag=False) node_holder = build_network(attrs, node_holder, base_dir=None, verbosity=verbosity, recurse=False) # Write the data out # print(str(node_holder.size())) # for node in node_holder.node_list: # print(str(node)) # for edge in node_holder.get_edges(): # print(str(edge)) return node_holder
from tqdm import tqdm from frag.network.models import Attr if __name__ == "__main__": parser = argparse.ArgumentParser( description= 'Decorate a library of molecules for insertion to the database.') parser.add_argument('--input_smi') parser.add_argument('--output_attr') args = parser.parse_args() out_smi = open(args.output_attr, "w") for mol in tqdm( Chem.SmilesMolSupplier(args.input_smi, delimiter=',', smilesColumn=1, nameColumn=0)): this_smi = Chem.MolToSmiles(mol, isomericSmiles=True) new_smis = decorate_smi(this_smi) new_murck = decorate_smi(MurckoScaffold.MurckoScaffoldSmiles(this_smi)) # mol_frags = get_fragments(Chem.MolFromSmiles(this_smi),iso_labels=False) # new_smis.extend([x.replace("Xe","At") for x in mol_frags]) new_smis.extend(new_murck) new_smis = list(set(new_smis)) # Do this on original and on Murcko Scaffold name = mol.GetProp("_Name") new_attr = Attr(this_smi, ["EM", name]) out_smi.write(str(new_attr) + "\n") for i, smi in enumerate(new_smis): new_attr = Attr(smi, ["EM", name + "_" + str(i)]) out_smi.write(str(new_attr) + "\n")
def test_compare_iso_non_iso(self): """ Test that the iso flag makes a difference. :return: """ input_smis = ["C#CC(C)(C)NC[C@]1(O)CCCN2CCCC[C@@H]21"] test_iso_node_list = [ "C#CC(C)(C)NC", "OC1CCCN2CCCCC12", "O", "C#CC(C)(C)NCC1CCCN2CCCCC12", "C#CC(C)(C)NC[C@]1(O)CCCN2CCCC[C@@H]21", "C1CCN2CCCCC2C1", "C#CC(C)(C)NC.O", ] test_non_iso_node_list = [ "C#CC(C)(C)NC", "OC1CCCN2CCCCC12", "O", "C#CC(C)(C)NCC1CCCN2CCCCC12", "C#CC(C)(C)NCC1(O)CCCN2CCCCC21", "C1CCN2CCCCC2C1", "C#CC(C)(C)NC.O", ] test_iso_edge_list = [ "EDGE C#CC(C)(C)NC[C@]1(O)CCCN2CCCC[C@@H]21 OC1CCCN2CCCCC12 FG|C#CC(C)(C)NC[Xe]|CCC(C)(C)NC[100Xe]|RING|OC1([Xe])CCCN2CCCCC21|O[C@@]1([100Xe])CCCC2CCCC[C@@H]21", "EDGE OC1CCCN2CCCCC12 O RING|[Xe]C1CCCN2CCCCC12|[100Xe]C1CCCC2CCCCC12|FG|O[Xe]|O[100Xe]", "EDGE C#CC(C)(C)NC[C@]1(O)CCCN2CCCC[C@@H]21 C#CC(C)(C)NC.O RING|[Xe]C1([Xe])CCCN2CCCCC21|[100Xe][C@]1([101Xe])CCCC2CCCC[C@@H]21|FG|C#CC(C)(C)NC[Xe].O[Xe]|CCC(C)(C)NC[100Xe].O[101Xe]", "EDGE OC1CCCN2CCCCC12 C1CCN2CCCCC2C1 FG|O[Xe]|O[100Xe]|RING|[Xe]C1CCCN2CCCCC12|[100Xe]C1CCCC2CCCCC12", "EDGE C#CC(C)(C)NC.O C#CC(C)(C)NC FG|O|O|FG|C#CC(C)(C)NC|CCC(C)(C)NC", "EDGE C#CC(C)(C)NC[C@]1(O)CCCN2CCCC[C@@H]21 C#CC(C)(C)NCC1CCCN2CCCCC12 FG|O[Xe]|O[101Xe]|RING|C#CC(C)(C)NCC1([Xe])CCCN2CCCCC21|CCC(C)(C)NC[C@@]1([101Xe])CCCC2CCCC[C@@H]21", "EDGE C#CC(C)(C)NCC1CCCN2CCCCC12 C1CCN2CCCCC2C1 FG|C#CC(C)(C)NC[Xe]|CCC(C)(C)NC[100Xe]|RING|[Xe]C1CCCN2CCCCC12|[100Xe]C1CCCC2CCCCC12", "EDGE C#CC(C)(C)NC.O O FG|C#CC(C)(C)NC|CCC(C)(C)NC|FG|O|O", "EDGE C#CC(C)(C)NCC1CCCN2CCCCC12 C#CC(C)(C)NC RING|[Xe]C1CCCN2CCCCC12|[100Xe]C1CCCC2CCCCC12|FG|C#CC(C)(C)NC[Xe]|CCC(C)(C)NC[100Xe]", ] test_non_iso_edge_list = [ "EDGE C#CC(C)(C)NCC1(O)CCCN2CCCCC21 C#CC(C)(C)NCC1CCCN2CCCCC12 FG|O[Xe]|O[101Xe]|RING|C#CC(C)(C)NCC1([Xe])CCCN2CCCCC21|CCC(C)(C)NCC1([101Xe])CCCC2CCCCC21", "EDGE OC1CCCN2CCCCC12 C1CCN2CCCCC2C1 FG|O[Xe]|O[100Xe]|RING|[Xe]C1CCCN2CCCCC12|[100Xe]C1CCCC2CCCCC12", "EDGE C#CC(C)(C)NCC1(O)CCCN2CCCCC21 C#CC(C)(C)NC.O RING|[Xe]C1([Xe])CCCN2CCCCC21|[100Xe]C1([101Xe])CCCC2CCCCC21|FG|C#CC(C)(C)NC[Xe].O[Xe]|CCC(C)(C)NC[100Xe].O[101Xe]", "EDGE OC1CCCN2CCCCC12 O RING|[Xe]C1CCCN2CCCCC12|[100Xe]C1CCCC2CCCCC12|FG|O[Xe]|O[100Xe]", "EDGE C#CC(C)(C)NCC1CCCN2CCCCC12 C1CCN2CCCCC2C1 FG|C#CC(C)(C)NC[Xe]|CCC(C)(C)NC[100Xe]|RING|[Xe]C1CCCN2CCCCC12|[100Xe]C1CCCC2CCCCC12", "EDGE C#CC(C)(C)NC.O O FG|C#CC(C)(C)NC|CCC(C)(C)NC|FG|O|O", "EDGE C#CC(C)(C)NCC1(O)CCCN2CCCCC21 OC1CCCN2CCCCC12 FG|C#CC(C)(C)NC[Xe]|CCC(C)(C)NC[100Xe]|RING|OC1([Xe])CCCN2CCCCC21|OC1([100Xe])CCCC2CCCCC21", "EDGE C#CC(C)(C)NC.O C#CC(C)(C)NC FG|O|O|FG|C#CC(C)(C)NC|CCC(C)(C)NC", "EDGE C#CC(C)(C)NCC1CCCN2CCCCC12 C#CC(C)(C)NC RING|[Xe]C1CCCN2CCCCC12|[100Xe]C1CCCC2CCCCC12|FG|C#CC(C)(C)NC[Xe]|CCC(C)(C)NC[100Xe]", ] attrs = [Attr(input_smi) for input_smi in input_smis] node_holder = NodeHolder(iso_flag=False) node_holder = build_network(attrs, node_holder) non_iso_node_list = [x.SMILES for x in node_holder.node_list] non_iso_edge_list = [str(x) for x in node_holder.edge_list] self.assertListEqual(sorted(non_iso_node_list), sorted(test_non_iso_node_list)) self.assertListEqual(sorted(non_iso_edge_list), sorted(test_non_iso_edge_list)) node_holder = NodeHolder(iso_flag=True) node_holder = build_network(attrs, node_holder) iso_node_list = [x.SMILES for x in node_holder.node_list] iso_edge_list = [str(x) for x in node_holder.edge_list] self.assertListEqual(sorted(iso_node_list), sorted(test_iso_node_list)) self.assertListEqual(sorted(iso_edge_list), sorted(test_iso_edge_list))