def write_read_cycle(self):
    """Round-trip self.mol through write_smiles/read_smiles and assert the
    regenerated graph equals a normalized copy of the original."""
    smiles = write_smiles(self.mol)
    note(self.mol.nodes(data=True))
    note(self.mol.edges(data=True))
    note(smiles)
    # self.mol can exist in a mixed implicit/explicit H style. The reference
    # must be one or the other, since we can't read in mixed mode. We want
    # to be sure we produce the correct answer in both cases though.
    for expl_H in (False, True):
        ref_mol = self.mol.copy()
        # Fill in missing 'charge'/'hcount' attributes with their defaults
        # so the comparison below is attribute-for-attribute fair.
        for node in ref_mol:
            attrs = ref_mol.nodes[node]
            for key, default in (('charge', 0), ('hcount', 0)):
                attrs.setdefault(key, default)
        if expl_H:
            add_explicit_hydrogens(ref_mol)
        else:
            remove_explicit_hydrogens(ref_mol)
        found = read_smiles(smiles, explicit_hydrogen=expl_H,
                            reinterpret_aromatic=False)
        note(found.nodes(data=True))
        note(found.edges(data=True))
        assertEqualGraphs(ref_mol, found)
def test_write_smiles(node_data, edge_data, expl_h):
    """A molecule written to SMILES must read back into an equal graph."""
    mol = make_mol(node_data, edge_data)
    regenerated = read_smiles(write_smiles(mol),
                              explicit_hydrogen=expl_h,
                              reinterpret_aromatic=False)
    assertEqualGraphs(mol, regenerated)
def LAZY_maker(infile, outfile, DOI, SOFTWARE, METHOD, NAME, EMAIL):
    """Parse a simulation output file, derive a SMILES string for the
    polymer network it describes, and write the record as ``<outfile>.json``.

    Parameters
    ----------
    infile : str
        Path to the raw output file; its parent directory name (minus a
        5-character suffix) supplies the 'PHASE' field.
    outfile : str
        Output path stem; '.json' is appended.
    DOI, SOFTWARE, METHOD, NAME, EMAIL :
        Provenance metadata stored on the record.
        NOTE(review): SOFTWARE is accepted but never stored in the
        dictionary — confirm whether it should be.
    """
    # 'with' guarantees the handle is closed even if parsing raises
    # (the original explicit open()/close() pairs leaked on error).
    with open(infile, 'r') as fh:
        text = fh.read().splitlines()

    DataDictionary = OutputParser(text)
    DataDictionary['DOI'] = DOI
    DataDictionary['METHOD'] = METHOD
    DataDictionary['NAME'] = NAME
    DataDictionary['EMAIL'] = EMAIL
    # Fraction of the first architecture entry's component over its total.
    DataDictionary['fAfC'] = (DataDictionary['ARCH'][0][3]
                              / np.sum(DataDictionary['ARCH'][0][3]))[0]
    # NSC: taken from the second architecture entry; 0 when only one exists.
    if len(DataDictionary['ARCH']) == 1:
        DataDictionary['NSC'] = 0
    else:
        DataDictionary['NSC'] = DataDictionary['ARCH'][1][3][0]
    # Phase name comes from the parent directory, minus a 5-char suffix.
    DataDictionary['PHASE'] = infile.split('/')[-2][:-5]

    chainlist = list_convert_object(DataDictionary['ARCH'])
    G = molecule(chainlist).create_network()
    DataDictionary['SMILES'] = write_smiles(G)
    DataDictionary['CLASSIFICATION'] = ['Polymers', 'Bottlebrushes',
                                        'Computational', 'SCFT']

    pif_dict = dict_to_pif(DataDictionary)
    print(f'Writing...{outfile}')
    # 'w' instead of 'w+': the file is only written, never read back here.
    with open(f'{outfile}.json', 'w') as fh:
        fh.write(pif_dict)
def graph_to_canonical_smiles(G, allHsExplicit=True):
    """Return Indigo's canonical SMILES for a molecule graph.

    G is a graph representing a molecule whose nodes carry an 'element'
    attribute. The canonical string does not include explicit hydrogens.
    """
    raw_smiles = write_smiles(G)
    session = Indigo()
    molecule = session.loadMolecule(raw_smiles)
    molecule.aromatize()
    return molecule.canonicalSmiles()
def find_elements():  # list all the elements appeared
    """Scan every dataset's SMILES strings, collect the set of element
    tokens that appear (after stripping stereo info), and write them plus
    the longest token count to 'element_list.txt'.

    Relies on a module-level `paths` mapping of dataset name -> directory;
    each directory is assumed to contain 'names_smiles.txt' with SMILES in
    the second column — TODO confirm against the data layout.
    """
    # Aromatic one-letter atoms that are legitimately lowercase in SMILES.
    special_case = ['b', 'c', 'o', 'p', 's']
    element_list = []
    longest_len = 0
    for path_name in paths:
        path = paths[path_name]
        df_smiles = pd.read_csv(os.path.join(path, 'names_smiles.txt'))
        smiles_list = np.array(df_smiles.iloc[:, 1])
        for smiles in smiles_list:
            # Round-trip through the graph form to normalize the string.
            mol = read_smiles(smiles)
            for node in mol.nodes:
                if 'stereo' in mol.nodes[node]:
                    mol.nodes[node].pop('stereo')  # discard stereo infomation by hand
            new_smiles = write_smiles(mol)
            # Walk the normalized string character by character, merging
            # two-character element symbols (e.g. 'Cl', 'Br') into one token.
            length = 0
            for i, ele in enumerate(new_smiles):
                ele = str(ele)
                #assert ele != 'n', 'SIMPLIFICATION FAILS'
                # A lowercase char (not an aromatic atom) right after an
                # uppercase char is the second letter of a two-char element:
                # it was already consumed by the previous iteration.
                if ele.islower() and (not ele in special_case) and i > 0 and \
                        str(new_smiles[i-1]).isupper():
                    # is the suffix of an element
                    continue
                # Uppercase char followed by a non-aromatic lowercase char
                # starts a two-char element symbol.
                if ele.isupper() and i < len(new_smiles) - 1 and str(new_smiles[i+1]).islower() \
                        and (not str(new_smiles[i+1]) in special_case):
                    # an element with 2 chars
                    ele = ele + str(new_smiles[i+1])
                length += 1
                if not ele in element_list:
                    element_list.append(ele)
            # Track the longest token sequence seen across all SMILES.
            if length > longest_len:
                longest_len = length
    print(element_list)
    # Space-separated tokens, then the longest length as the final field.
    with open('element_list.txt', 'w') as f:
        for item in element_list:
            f.write("%s " % item)
        f.write(f'{longest_len}')
#for idx, ele in enumerate(string): # mol.nodes[idx]['element'] = ele # mol.add_edges_from([(idx,idx+1)]) # count+=1 print(write_smiles(G)) # [O-]C(=O)C([C])([C])[C] # fill_valence(mol, respect_hcount=True) #print(write_smiles(mol)) # [O-]C(=O)C(C)(C)C #plt.subplot(121) #nx.draw(G, with_labels=True, font_weight='bold') #plt.subplot(122) # nx.draw(G) #plt.show()
def augment_smiles(rdata, data_choice, make_1d_pading=True):
    """Augment each molecule's SMILES by rewriting it from several leaf
    atoms, one-hot encode the results, and cache everything as JSON under
    ``data/<data_choice>/data.json``. Returns (records, max_length).

    NOTE(review): `make_1d_pading` is accepted but never used in this
    body — confirm whether it is dead or consumed elsewhere.
    """
    # Cached result short-circuits all the work below.
    if os.path.exists(f'data/{data_choice}/data.json'):
        with open(f'data/{data_choice}/data.json', 'r') as f:
            new_data = json.load(f)
        return new_data, 320
    if not os.path.exists('./element_list.txt'):
        print('no element list found!\n')
        find_elements()  # find all appeared elements if the list has not been abtained
    new_data = []
    # element_list.txt: space-separated tokens, final field is a length.
    with open('element_list.txt') as f:
        lines = f.readlines()
    element_list = lines[0].split(sep=' ')[:-1]
    #longest_len = int(lines[0].split(sep=' ')[-1])
    # Length is pinned to 320 rather than the recorded maximum.
    longest_len = 320
    for item in rdata:
        name = item['name']
        smiles = item['SMILES']
        # Labels/weights only exist outside the test split.
        if data_choice != 'test':
            label = item['label']
            weight = item['weight']
        mol = read_smiles(smiles)
        for node in mol.nodes:
            if 'stereo' in mol.nodes[node]:
                mol.nodes[node].pop('stereo')  # discard stereo infomation by hand
        degrees = np.array([mol.degree(idx) for idx in mol.nodes])
        # find leaf nodes to generate different smiles for one mol
        leaf_nodes = np.array(list(mol.nodes), dtype=int)[degrees == 1]
        # Fall back to the first node when the molecule has no leaves
        # (e.g. a pure ring).
        if leaf_nodes.shape[0] == 0:
            leaf_nodes = [0]
        try:
            # Subsample evenly spaced leaves so each molecule yields a
            # bounded number of augmented strings.
            if len(leaf_nodes) > 5 and data_choice != 'test':
                length = len(leaf_nodes)
                idx_list = [idx for idx in range(0, length, length//4)]
                leaf_nodes = leaf_nodes[idx_list]  # at most 5 examples
            # One rewritten SMILES per chosen starting atom.
            new_smiles_list = [write_smiles(mol, start=list(mol.nodes)[leaf_node])
                               for leaf_node in leaf_nodes]
            onehots_list = generate_onehots(new_smiles_list, element_list)
            if data_choice != 'test':
                tmp = [write_dict(name, onehots, smiles, label, weight, len(leaf_nodes))
                       for onehots, smiles in zip(onehots_list, new_smiles_list)]
            else:
                tmp = [write_dict(name, onehots, smiles)
                       for onehots, smiles in zip(onehots_list, new_smiles_list)]
        except Exception as inst:
            # Best-effort: skip molecules that fail to rewrite/encode.
            print(inst)
            continue
        new_data += tmp
    # ndarray -> list so the records are JSON-serializable.
    for item in new_data:
        item['onehots'] = item['onehots'].tolist()
    with open(f'data/{data_choice}/data.json', 'w') as f:
        json.dump(new_data, f)
    return new_data, longest_len