def snomaData(fname, popposi = False):
    """Load a CSV of (SMILES, label) rows into torch_geometric ``Data`` objects.

    The first CSV row is treated as a header and skipped. For every other row,
    column 0 is parsed with pysmiles and column 1 is used as the integer label.

    Node features are 40 wide: a 32-wide one-hot selected through the
    module-level ``elemap`` for the element, followed by an 8-wide one-hot of
    the hydrogen count. Each bond is stored in both directions.

    :param fname: path of the CSV file.
    :param popposi: when true, every sample whose label equals 1 is added
        24 times (the same object repeated) instead of once.
    :return: list of ``torch_geometric.data.Data`` objects.
    """
    # One-hot lookup tables; rows are copied into ``x`` below.
    element_onehots = torch.eye(32)
    hcount_onehots = torch.eye(8)

    datas = []
    with open(fname) as csvf:
        rows = csv.reader(csvf)
        next(rows, None)  # skip the header row
        for row in rows:
            gf = pysmiles.read_smiles(row[0])
            n_nodes = gf.number_of_nodes()
            n_edges = gf.number_of_edges()
            elements = gf.nodes(data='element')
            hcounts = gf.nodes(data='hcount')

            x = torch.empty((n_nodes, 40))
            for node in range(n_nodes):
                x[node] = torch.cat((element_onehots[elemap[elements[node]]],
                                     hcount_onehots[hcounts[node]]))

            edge_index = torch.empty((2, 2 * n_edges))
            for pos, bond in enumerate(gf.edges):
                edge_index[:, 2 * pos] = torch.tensor(bond)
                edge_index[:, 2 * pos + 1] = torch.tensor(bond[::-1])

            edge_attr = torch.empty((2 * n_edges, 1), dtype=torch.long)
            for pos, bond in enumerate(gf.edges(data='order')):
                # order - 1, with every positive value bumped by one more;
                # the long tensor truncates fractional results on assignment.
                code = bond[2] - 1
                if code > 0:
                    code += 1.0
                edge_attr[2 * pos, 0] = code
                edge_attr[2 * pos + 1, 0] = code

            datum = torch_geometric.data.Data(
                x=x,
                edge_index=edge_index.to(torch.long),
                edge_attr=edge_attr,
                y=torch.tensor([[int(row[1])]], dtype=torch.float))
            if datum.y == 1 and popposi:
                datas += 24 * [datum]
            else:
                datas.append(datum)
    return datas
def test_write_smiles(node_data, edge_data, expl_h):
    """Check that writing a molecule to SMILES and reading it back is lossless.

    The molecule is built with ``make_mol``, serialised with ``write_smiles``,
    parsed again without aromaticity reinterpretation, and compared to the
    original graph.
    """
    expected = make_mol(node_data, edge_data)
    roundtripped = read_smiles(
        write_smiles(expected),
        explicit_hydrogen=expl_h,
        reinterpret_aromatic=False,
    )
    assertEqualGraphs(expected, roundtripped)
def __init__(self, smiles_string: str, y_list: list, atom_info_path='../raw_data/atom_info.txt'):
    """
    Build the graph data object for one molecule.

    Args:
        smiles_string (string): SMILES for the molecule.
        y_list (list or int): list of multilabels or single label.
        atom_info_path (string): path made available as ``self.atom_info_path``
            while the features are extracted.
    """
    import contextlib

    # Create the graph from SMILES with stdout silenced (pysmiles output about
    # isomeric information). The context managers close the devnull handle and
    # put back whatever stdout was active before, even if parsing raises.
    with open(os.devnull, 'w') as devnull, contextlib.redirect_stdout(devnull):
        self.graph = read_smiles(smiles_string)
    self.atom_info_path = atom_info_path

    if isinstance(y_list, list):
        y = torch.tensor(y_list, dtype=torch.float32)
    else:
        # a single label becomes a (1, 1) tensor
        y = torch.tensor(y_list, dtype=torch.float32).view(1, -1)

    # inherit superclass from torch-geometric
    super().__init__(x=torch.tensor(self.extract_features(), dtype=torch.float),
                     edge_index=torch.tensor(self.graph_to_edge_index(), dtype=torch.long),
                     y=y)

    # remove the helper attributes, necessary to inherit from superclass
    del self.graph
    del self.atom_info_path
def get_reward_fitness(state, X, ts, char_idx, chars, net):
    """Score generated SMILES strings with the classifier ``net``.

    :param state: parent genes, compared element-wise against ``X``; the
        comparison result must expose ``.values``.
    :param X: generated SMILES strings (object with ``.values``, or an array).
    :param ts, char_idx, chars: forwarded to ``dimY`` to encode ``X``.
    :param net: model called as ``net(tensor, None)``.
    :return: array whose columns are ``[smiles, reward, fitness]``.
    """
    X_seed = dimY(X, ts, char_idx, chars)
    out = net(T.from_numpy(X_seed).float(), None).detach().numpy()
    out_cat = np.argmax(out, axis=1)

    # Penalizing fitness for fake data
    fitness = np.where(out_cat == 0, -1 * out[:, 0], out[:, 1])
    # Reward +1 for class 1 and -10 for class 0
    reward = np.where(out_cat == 0, -10, 1)
    arr = np.hstack((reward.reshape(len(reward), 1),
                     fitness.reshape(len(fitness), 1)))

    try:
        X = X.values.reshape(len(X.values), 1)
    except AttributeError:
        # X has no ``.values``; treat it as an array already
        X = X.reshape(len(X), 1)
    # [smiles,reward,fitness]
    arr = np.hstack((X, arr))

    # Penalizing reward for generating child gene same as parent
    same_genes = (state == X.reshape(len(X))).values
    for i, is_same in enumerate(same_genes):
        if is_same:
            arr[i][1] = -10

    # Penalizing for wrong smiles: anything pysmiles cannot parse
    for i, g in enumerate(arr):
        try:
            read_smiles(g[0])
        except Exception:
            arr[i][1] = -10
    return arr
def write_read_cycle(self):
    """Write ``self.mol`` to SMILES and verify it reads back unchanged.

    ``self.mol`` may mix implicit and explicit hydrogens, but a reference can
    only be one style because reading cannot produce a mixed molecule. The
    check is therefore run once per style.
    """
    smiles = write_smiles(self.mol)
    note(self.mol.nodes(data=True))
    note(self.mol.edges(data=True))
    note(smiles)

    for expl_H in (False, True):
        ref_mol = self.mol.copy()
        # Fill in the attributes the reader always sets.
        for node in ref_mol:
            attrs = ref_mol.nodes[node]
            attrs.setdefault('charge', 0)
            attrs.setdefault('hcount', 0)

        normalise = add_explicit_hydrogens if expl_H else remove_explicit_hydrogens
        normalise(ref_mol)

        found = read_smiles(smiles, explicit_hydrogen=expl_H,
                            reinterpret_aromatic=False)
        note(found.nodes(data=True))
        note(found.edges(data=True))
        assertEqualGraphs(ref_mol, found)
def sm2graph(smiles,size, weight = None):
    """Turn a SMILES string into a padded random-walk normalised matrix.

    The molecule is built with RDKit (``mol_to_nx``) and, if that fails for
    any reason, with pysmiles instead.

    :param smiles: SMILES string.
    :param size: side length the matrices are padded to.
    :param weight: edge attribute used as the adjacency weight (``None`` = 1).
    :return: ``(rwL, mole, ar)`` where ``rwL`` is the inverse degree matrix
        times the self-looped adjacency, ``mole`` is the node view of
        ``'element'`` and ``ar`` the node view of ``'aromatic'``.
    """
    try:
        mol = Chem.MolFromSmiles(smiles)
        mol = mol_to_nx(mol)
    except Exception:
        mol = read_smiles(smiles)

    #normalized Laplacian matrix
    # nL = nx.normalized_laplacian_matrix(mol,weight = weight).todense().A
    # nL = np.pad(nL,(0,size-nL.shape[0]))

    #adjacent matrix (with self loops), padded to size x size
    # to_numpy_array returns an ndarray directly; to_numpy_matrix(...).A did
    # the same but was removed in NetworkX 3.0.
    adj = nx.to_numpy_array(mol, weight=weight)
    adj = adj + np.eye(adj.shape[0])
    adj = np.pad(adj, (0, size - adj.shape[0]))

    #degree matrix (degree + 1 for the self loop)
    # NOTE(review): the diagonal slot is ``node - 1``; for 0-based node ids
    # node 0 lands in the last slot. Confirm the node numbering.
    de = np.zeros((size, size))
    for node, degree in mol.degree:
        de[node - 1][node - 1] = degree + 1

    #feature
    mole = mol.nodes(data='element')

    #random walk normalized Laplacian matrix
    di = de  # same array: the inversion below is done in place
    di[di != 0] = 1 / di[di != 0]
    rwL = di @ adj

    #is aromatic
    ar = mol.nodes(data='aromatic')
    return rwL, mole, ar
def get_value(self):
    """Build a molecule from the PDB path and/or SMILES entered in the widget.

    When both inputs are given they must be isomorphic on ``'element'``; the
    PDB node attributes and graph attributes are then copied onto the SMILES
    molecule. Errors are shown in a ``QErrorMessage`` dialog.

    :return: the resulting molecule, or ``False`` when an input could not be
        read, the two molecules do not match, or no molecule was produced.
    """
    def show_error(message):
        dialog = QErrorMessage()
        dialog.showMessage(message)
        dialog.exec_()

    pdb_mol = None
    filename = self._pth_widget.text()
    if filename:
        try:
            pdb_mol = read_pdb(filename)
        except Exception as err:
            self._pth_widget.setText('')
            show_error(str(err))
            return False
        pdb_mol = pdb_mol[0]
        pdb_mol.graph['name'] = Path(filename).stem
        if not pdb_mol.edges:
            # no bonds in the file: let MakeBonds add them
            system = System()
            system.add_molecule(pdb_mol)
            MakeBonds(allow_name=False).run_system(system)
        if not self.hydrogen_checkbox.checkState():
            remove_explicit_hydrogens(pdb_mol)

    smiles_mol = None
    smiles = self._smiles_widget.text()
    if smiles:
        try:
            smiles_mol = read_smiles(smiles)
        except Exception as err:
            show_error(str(err))
            self._smiles_widget.setText('')
            return False
        smiles_mol.graph['smiles'] = smiles
        smiles_mol.graph['name'] = smiles
        if self.hydrogen_checkbox.checkState():
            add_explicit_hydrogens(smiles_mol)

    if pdb_mol and smiles_mol:
        matcher = nx.isomorphism.GraphMatcher(
            pdb_mol, smiles_mol,
            nx.isomorphism.categorical_node_match('element', None))
        match = next(matcher.isomorphisms_iter(), {})
        if not match:
            show_error('Smiles and PDB molecule are not isomorphic!')
            return False
        for pdb_idx, smi_idx in match.items():
            smiles_mol.nodes[smi_idx].update(pdb_mol.nodes[pdb_idx])
        smiles_mol.graph.update(pdb_mol.graph)

    return (smiles_mol or pdb_mol) or False
def res(self, smile, adj_list, feature_list):
    """Append the adjacency matrix and one-hot features of one molecule.

    :param smile: SMILES string, parsed with explicit hydrogens.
    :param adj_list: list that receives the adjacency matrix (ndarray).
    :param feature_list: list that receives the ``oneHot`` features (ndarray).
    :raises Exception: whatever ``read_smiles`` raises for an unparsable
        string; the string is printed first. Neither list is modified then.
    """
    try:
        mol_with_H = read_smiles(smile, explicit_hydrogen=True)
    except Exception:
        # Show which input failed, then propagate the real error instead of
        # falling through to an unbound ``mol_with_H``.
        print(smile)
        raise
    # to_numpy_array replaces to_numpy_matrix (removed in NetworkX 3.0)
    A = nx.to_numpy_array(mol_with_H)
    X = oneHot(smile)
    adj_list.append(np.array(A))
    feature_list.append(np.array(X))
def predict(self, input: JsonSerializable):
    """
    This is a dummy test model.
    It counts atoms in a SMILES string and returns the per-element totals
    (hydrogens made explicit) as a JSON object string.
    """
    mol = read_smiles(input, explicit_hydrogen=True)
    elements = (atom for _, atom in mol.nodes(data="element"))
    return json.dumps(collections.Counter(elements))
def evaluate_smiles(smiles_string):
    """Print the predicted solubility class for one SMILES string.

    Uses the module-level ``model`` and ``device``. The molecule is parsed
    with explicit hydrogens, elements are one-hot encoded with
    ``element_to_onehot`` and the bond list becomes the ``edge_index``.

    :param smiles_string: SMILES string to classify.
    """
    classes = ['insoluble', 'slightly soluble', 'soluble']
    G = read_smiles(smiles_string, explicit_hydrogen=True)  # decode smiles string
    # convert element to one-hot vector
    feature = element_to_onehot(np.asarray(G.nodes(data='element'))[:, 1])
    # Edge array as (E, 2); the reshape keeps two columns even when the
    # molecule has no bonds, which would otherwise break the column indexing.
    edges = np.asarray(list(G.edges), dtype=np.int64).reshape(-1, 2)
    # reformat edge array to torch geometric suitable format: (2, E)
    index = np.ascontiguousarray(edges.T)
    # create torch geometric Data object
    d = Data(x=torch.tensor(feature, dtype=torch.float),
             edge_index=torch.tensor(index, dtype=torch.long))
    data = d.to(device)  # send data to device memory
    model.eval()  # set model to evaluate mode
    with torch.no_grad():  # inference only, no autograd bookkeeping
        scores = torch.softmax(model(data), dim=0)
    print(classes[torch.argmax(scores).item()])
def oneHot(smiles):
    """One-hot encode the atoms of a SMILES string as ``[C, N, O, H]`` rows.

    Atoms of the hydrogen-implicit molecule come first: C, N and O set columns
    0, 1 and 2; any other element yields an all-zero row. One ``[0, 0, 0, 1]``
    row is then appended for every extra node the explicit-hydrogen parse has.

    :param smiles: SMILES string.
    :return: list of 4-element lists.
    """
    heavy = read_smiles(smiles)
    with_h = read_smiles(smiles, explicit_hydrogen=True)

    column_of = {"C": 0, "N": 1, "O": 2}
    rows = []
    for _, element in heavy.nodes(data='element'):
        encoding = [0, 0, 0, 0]
        column = column_of.get(element)
        if column is not None:
            encoding[column] = 1
        rows.append(encoding)

    rows.extend([0, 0, 0, 1] for _ in range(len(heavy), len(with_h)))
    return rows
def build_graph(smiles): """ Constructs a NetworkX graph out of a SMILES representation of a molecule from the train/test data. :param smiles: a string object of SMILES format :return: nx.Graph: A graph describing a molecule. Nodes will have an 'element', 'aromatic' and a 'charge', and if `explicit_hydrogen` is False a 'hcount'. Depending on the input, they will also have 'isotope' and 'class' information. Edges will have an 'order'. """ ''' can access node data and edge data when the graph is in networkx format dgl.from_networkx(g) converts networkx to dgl graph but the node data and edge data doesnt seem to be transferred Goal: save the node feats and edge feats of networkx as tensor and set them to dgl graph ndata and edata Question: Do we save ndata as ('C', 'C', 'C', 'O', 'C') or do we create one hot vectors like in the hw ''' # read the smile graphs in using pysmiles & build network g = pysmiles.read_smiles(smiles) # get the features from the graph and convert to tensor elems = g.nodes(data='element') h_count = g.nodes(data='hcount') aros = g.nodes(data='aromatic') raw_node_feats = [] for elem, data, aro in zip(elems, h_count, aros): node = list(elem) node.append(data[1]) node.append(aro[1] * 1) raw_node_feats.append(node) na = np.array(list(raw_node_feats)) byte_node_feats = tf.convert_to_tensor(na[:, 1]) # turn the byte string node feats into one_hot node feats node_feats = pt_lookup(byte_node_feats).numpy() node_feats[:, -2] = na[:, 2] node_feats[:, -1] = na[:, 3] node_feats = tf.convert_to_tensor(node_feats) # get edge data and extract bonds, double them, then convert to tensor edata = g.edges(data='order') bonds = list(edata) na = np.array(bonds) tup = zip(na[:, 2], na[:, 2]) bond_data = tf.convert_to_tensor(list(itertools.chain(*tup))) bond_data = tf.cast(bond_data, tf.float32) # build dgl graph dgl_graph = dgl.from_networkx(g) dgl_graph.ndata['node_feats'] = node_feats dgl_graph.edata['edge_feats'] = bond_data return dgl_graph
def read_from_pysmiles(num=10):
    """Print one SMILES string from the training CSV and its parsed elements.

    The SMILES column is index 1 when ``num`` is 10 (layout
    ``id,smiles,activity``) and index 0 otherwise. The entry at position 1 of
    the extracted column is the one that gets parsed.
    """
    train_path, test_path, dev_path = train_test_path(num)
    f_csv = OpenCSV(train_path)
    # id,smiles,activity
    smiles_column = 1 if num == 10 else 0
    SMILES_list = [row[smiles_column] for row in f_csv]
    SMILES = SMILES_list[1]
    print(SMILES)
    m = pysmiles.read_smiles(SMILES)
    print(m.nodes(data='element'))
def __init__(self, file_name):
    """Load a CSV with 'smiles' and 'activity' columns and parse every molecule.

    Besides the raw columns this stores, per molecule, the pysmiles graph, the
    bond-order weighted adjacency matrix, a graph rebuilt from that matrix,
    and the node view of element symbols.

    :param file_name: path of the CSV file.
    """
    table = pd.read_csv(file_name)
    self.data = table
    self.smiles = table['smiles']
    self.labels = table['activity']
    self.mols = list(map(read_smiles, self.smiles))
    self.periodic_table = Chem.GetPeriodicTable()

    adjacency = []
    for molecule in self.mols:
        adjacency.append(nx.to_numpy_matrix(molecule, weight='order'))
    self.ams = adjacency
    self.graphs = [nx.from_numpy_matrix(matrix) for matrix in adjacency]
    self.element_lists = [molecule.nodes(data='element')
                          for molecule in self.mols]
def build_graph(smiles):
    """
    Parse a SMILES string from the train/test data into a NetworkX graph.

    :param smiles: a string object of SMILES format
    :return: nx.Graph describing the molecule. Nodes carry 'element',
        'aromatic' and 'charge', plus 'hcount' when hydrogens are implicit;
        'isotope' and 'class' appear when present in the input. Edges carry
        'order'.
    """
    # TODO: Initialize a DGL Graph
    return pysmiles.read_smiles(smiles)
def predict(self, input: List[JsonSerializable]):
    """
    This is a dummy test model.
    It counts atoms in each SMILES string of the first batch in ``input`` and
    returns ``[[{"atoms": counts}, ...]]`` with one entry per item.
    """
    batch = input[0]
    output = []
    for item in batch:
        mol = read_smiles(item["input"], explicit_hydrogen=True)
        counts = collections.defaultdict(int)
        for _, atom in mol.nodes(data="element"):
            counts[atom] += 1
        output.append({"atoms": counts})
    return [output]
def process(self):
    """Convert ``self.csv_file`` into a list of graph samples and save it.

    The header row is skipped. Column 0 of each remaining row is parsed with
    pysmiles (implicit hydrogens) and converted with ``convert_networkx``;
    column 1 becomes the integer label. The list is written to
    ``self.processed_file`` with ``torch.save`` and also returned.
    """
    bond_order = set()
    atom_types = set()
    processed = []
    with open(self.csv_file, 'r') as f:
        rows = csv.reader(f, delimiter=',')
        next(rows, None)  # Ignore header
        for item in tqdm(rows):
            smiles = item[0]
            mol_graph = pysmiles.read_smiles(smiles, explicit_hydrogen=False)
            data = convert_networkx(mol_graph, self.ATOM_TYPES)
            # NOTE(review): 'simles' looks like a typo for 'smiles', but it is
            # a stored key that readers of the file may depend on.
            data['simles'] = smiles
            data['label'] = torch.LongTensor([int(item[1])])
            processed.append(data)
    torch.save(processed, self.processed_file)
    return processed
def load_data_file(path, mode, pos_lim=-1, neg_lim=-1):
    """Read a comma-separated SMILES file into ``[graph, label, smiles]`` rows.

    The first line is skipped as a header. Standard output is silenced while
    the molecules are parsed and is always restored afterwards, also when
    reading or parsing raises.

    :param path: file to read.
    :param mode: ``"test"`` (SMILES in column 0, label forced to -1),
        ``"train"`` (``_, smiles, label``) or ``"tdt"`` (``smiles, label``).
    :param pos_lim: maximum number of non-zero-label rows kept (<= 0: no limit).
    :param neg_lim: maximum number of zero-label rows kept (<= 0: no limit).
    :return: list of ``[networkx graph, int label, smiles string]``.
    :raises ValueError: if a data line is read with an unknown ``mode``.
    """
    import contextlib

    num_pos = 0
    num_neg = 0
    data = []
    with open(os.devnull, "w") as devnull, contextlib.redirect_stdout(devnull):
        # Goes to devnull; it only becomes visible if the redirect failed.
        print("stdout not shut down correctly")
        with open(path, "r") as fil:
            fil.readline()  # header
            for i, line in enumerate(fil):
                fields = line.strip().split(",")
                if mode == "test":
                    mol, label = fields[0], -1
                elif mode == "train":
                    _, mol, label = fields
                elif mode == "tdt":
                    mol, label = fields
                else:
                    raise ValueError("unknown mode: %r" % (mode,))
                label = int(label)

                if label >= 0:
                    if label == 0:
                        num_neg += 1
                    else:
                        num_pos += 1
                    neg_full = neg_lim > 0 and num_neg > neg_lim
                    pos_full = pos_lim > 0 and num_pos > pos_lim
                    # Both quotas exhausted: stop reading. This check runs
                    # before the per-class skip so that it is reachable.
                    if pos_full and neg_full:
                        break
                    if neg_full if label == 0 else pos_full:
                        continue

                mol_g = read_smiles(mol)  # convert the SMILES string to a networkx graph
                data.append([mol_g, int(label), mol])

                if i % 1000 == 0:
                    sys.stderr.write("%d\n" % i)

    print("stdout recoverd.")
    return data
def readf(fname):
    """Register every element seen in a SMILES CSV in the global ``cnt``.

    The header row is skipped; column 0 of each other row is parsed with
    pysmiles. Each element not yet in ``cnt`` is assigned the next free
    index, i.e. the size of ``cnt`` at that moment.

    :param fname: path of the CSV file.
    """
    with open(fname) as csvf:
        rows = csv.reader(csvf)
        next(rows, None)  # skip the header row
        for row in rows:
            gf = pysmiles.read_smiles(row[0])
            for _, ele in gf.nodes(data='element'):
                # (an occurrence count was once kept here instead of an index)
                if ele not in cnt:
                    cnt[ele] = len(cnt)
def smiles_to_formula(smiles_string):
    """Return the chemical formula string of a SMILES molecule.

    Atoms (hydrogens explicit) are counted per element. Elements are emitted
    in the iteration order of ``ATOM_MASSES``; a count of 1 is written without
    a number and elements that do not occur are left out.

    :param smiles_string: SMILES string.
    :return: formula string, or ``None`` when the molecule contains an
        element that is not a key of ``ATOM_MASSES``.
    """
    mol = pysmiles.read_smiles(smiles_string, explicit_hydrogen=True)

    atom_counts = dict.fromkeys(ATOM_MASSES, 0)
    for _, atom in mol.nodes(data="element"):
        if atom not in atom_counts:
            return None
        atom_counts[atom] += 1

    pieces = []
    for atom, count in atom_counts.items():
        if count == 1:
            pieces.append(atom)
        elif count != 0:
            pieces.append("{}{}".format(atom, count))
    return "".join(pieces)
def read_raw(filename, dataset, device, no_h):
    """Read a raw dataset file into molecule graphs, targets and atom features.

    For ``dataset == 'covid-19'`` the file is comma separated with 2 fields
    per line and a header line that is skipped; the target is ``int`` of
    field 1. Otherwise the file is tab separated with 13 fields per line and
    the target is the float vector of fields 2 onward. Field 0 is the SMILES.

    Blank lines (including a trailing empty line) are skipped.

    :param filename: path of the raw file.
    :param dataset: dataset name selecting the layout above.
    :param device: torch device for the target tensors and the features.
    :param no_h: when false, hydrogens are added with RDKit before the SMILES
        is canonicalised.
    :return: ``(mols, targets, features)``.
    :raises AssertionError: if a non-blank line has the wrong field count.
    """
    if dataset == 'covid-19':
        assertion_len = 2
        smile_idx = 0
        separator = ','
    else:
        assertion_len = 13
        smile_idx = 0
        separator = '\t'

    all_smiles = []
    mols = []
    targets = []
    with open(filename) as f:
        if assertion_len == 2:
            f.readline()  # header line
        progress = tqdm(f)
        progress.set_description('Reading raw data ... ')
        for line in progress:
            # Lines read from a file always carry their newline, so compare
            # the stripped text to detect blank lines.
            if not line.strip():
                continue
            fields = line.strip().split(separator)
            assert len(fields) == assertion_len
            m = Chem.MolFromSmiles(fields[smile_idx])
            if not no_h:
                m = Chem.AddHs(m)
            smiles = Chem.MolToSmiles(m)
            all_smiles.append(smiles)
            if assertion_len == 2:
                target = torch.tensor(int(fields[1]), device=device)
            else:
                target = torch.tensor([float(i) for i in fields[2:]],
                                      device=device)
            targets.append(target)
            # '[H]' is rewritten to '[G]' before handing the string to pysmiles
            mol = read_smiles(smiles.replace('[H]', '[G]'),
                              explicit_hydrogen=False,
                              reinterpret_aromatic=True)
            mols.append(mol)
    features = extract_atom_feature(all_smiles, device, no_h)
    return mols, targets, features
def find_elements():
    """Collect every distinct token of the stereo-free SMILES strings.

    For each directory in the module-level ``paths`` the second column of
    ``names_smiles.txt`` is read, each SMILES is parsed, stripped of 'stereo'
    node attributes and written back out. The rewritten string is scanned
    character by character; an uppercase letter followed by a lowercase
    letter outside ``special_case`` is merged into one two-character token.

    Prints the token list and writes it, space separated and followed by the
    largest token count of any single string, to ``element_list.txt``.
    """
    # list all the elements appeared
    # lowercase letters that stand on their own instead of being a suffix
    special_case = ['b', 'c', 'o', 'p', 's']
    element_list = []
    longest_len =0
    for path_name in paths:
        path = paths[path_name]
        df_smiles = pd.read_csv(os.path.join(path, 'names_smiles.txt'))
        smiles_list = np.array(df_smiles.iloc[:, 1])
        for smiles in smiles_list:
            mol = read_smiles(smiles)
            for node in mol.nodes:
                if 'stereo' in mol.nodes[node]:
                    mol.nodes[node].pop('stereo') # discard stereo information by hand
            new_smiles = write_smiles(mol)
            # number of tokens counted in this string
            length = 0
            for i, ele in enumerate(new_smiles):
                ele = str(ele)
                #assert ele != 'n', 'SIMPLIFICATION FAILS'
                if ele.islower() and (not ele in special_case) and i > 0 and\
                        str(new_smiles[i-1]).isupper(): # is the suffix of an element
                    continue
                if ele.isupper() and i < len(new_smiles) - 1 and str(new_smiles[i+1]).islower() \
                        and (not str(new_smiles[i+1]) in special_case): # an element with 2 chars
                    ele = ele + str(new_smiles[i+1])
                length += 1
                if not ele in element_list:
                    element_list.append(ele)
            if length > longest_len:
                longest_len = length
    print(element_list)
    with open('element_list.txt', 'w') as f:
        for item in element_list:
            f.write("%s " % item)
        # the longest token count goes last, after the trailing space
        f.write(f'{longest_len}')
def main():
    """Draw the molecule named on the command line with turtle graphics.

    The SMILES string is taken from ``--smiles_string`` when given, otherwise
    looked up in ``dict_aa`` under ``args.mol``.
    """
    # Parse command line arguments to get a smiles string
    args = parse_arguments()
    smiles_string = args.smiles_string or dict_aa[args.mol]
    print("Drawing molecule:\n{}".format(smiles_string))

    # Parse the string into a graph object
    g = read_smiles(smiles_string)

    # Mark the cyclic edges as rings. This stays best effort (the molecule is
    # drawn without ring marks on failure) but no longer swallows
    # KeyboardInterrupt / SystemExit.
    try:
        g = structure.mark_rings(g)
    except Exception:
        pass

    # init turtle window and start drawing, wait for window events
    draw.init()
    draw.molecule(g)
    draw.done()
def save_mol_img(mols, f_name='tmp.png', is_test=False):
    """Draw the first molecule in ``mols`` that can be processed to an image.

    Molecules are tried in order; any exception while handling one moves on
    to the next, and the loop stops after the first success.

    :param mols: iterable of RDKit molecules.
    :param f_name: output file name.
    :param is_test: when true, a ``random_string()`` is inserted before the
        extension of ``f_name`` for each attempt.
    """
    orig_f_name = f_name
    for a_mol in mols:
        try:
            if Chem.MolToSmiles(a_mol) is not None:
                print('Generating molecule')
                if is_test:
                    # start again from the caller's name for every attempt
                    f_name = orig_f_name
                    f_split = f_name.split('.')
                    f_split[-1] = random_string() + '.' + f_split[-1]
                    # joined without a separator: 'tmp.png' -> 'tmp<random>.png'
                    f_name = ''.join(f_split)
                rdkit.Chem.Draw.MolToFile(a_mol, f_name)
                a_smi = Chem.MolToSmiles(a_mol)
                # result unused; the call raises (and so skips the break) if
                # pysmiles cannot parse the SMILES
                mol_graph = read_smiles(a_smi)
                break
            # if not is_test:
            #     break
        except:
            # NOTE(review): bare except also hides KeyboardInterrupt
            continue
def augment_dataset(C , dataset , k = 5):
    """Augment non-zero-label samples by inserting a carbon into their SMILES.

    For ``C.pos_aug`` rounds, every sample whose label is not 0 gets a copy in
    which a 'C' is inserted between two adjacent uppercase letters at a
    randomly drawn position (up to ``len(smiles) - 1`` draws; the SMILES is
    left unchanged if none fits). Copies made in one round are part of the
    input of the next. The final list is shuffled.

    :param C: config object providing ``pos_aug``.
    :param dataset: list of ``[graph, label, smiles]``.
    :param k: unused.
    :return: the augmented, shuffled dataset.
    """
    for _round in range(C.pos_aug):
        additions = []
        for _graph, label, smiles in dataset:
            if int(label) == 0:
                continue
            for _attempt in range(1, len(smiles)):
                pos = random.randint(1, len(smiles) - 1)
                left, right = smiles[pos - 1], smiles[pos]
                left_ok = left.isalpha() and left.isupper()
                right_ok = right.isalpha() and right.isupper()
                if left_ok and right_ok:
                    smiles = smiles[:pos] + 'C' + smiles[pos:]
                    break
            additions.append([pysmiles.read_smiles(smiles), label, smiles])
        dataset = dataset + additions
    random.shuffle(dataset)
    return dataset
def __init__(self, smiles): """ Initalize Molecule Class :param smiles: smiles string """ # Cheminformatics section self.smiles = smiles self.pybel_mol = readstring("smi", smiles) self.pybel_mol.make3D() self.mol_formula = self.pybel_mol.formula # self.rd_mol = Chem.MolFromSmiles(smiles) # self.rd_mol = Chem.AddHs(self.rd_mol) # AllChem.EmbedMolecule(self.rd_mol) # AllChem.MMFFOptimizeMolecule(self.rd_mol) try: self.pysmiles_mol = read_smiles(smiles, explicit_hydrogen=True) except ValueError: self.pysmiles_mol = PySmilesCopy([0, 1, 1]) # compositional section self.natoms = len(self.pybel_mol.atoms) self.position = np.array([0.0, 0.0, 0.0], dtype=float) self.atoms = [] for i in range(self.natoms): atom = self.pybel_mol.atoms[i] self.atoms.append(Atom(atom.atomicnum, atom.coords)) # geometrical section self.bonds = [] self.bond_orders = [] for bond in self.pysmiles_mol.edges(data='order'): self.bonds.append(bond[:-1]) self.bond_orders.append(bond[2]) self.angles = [] for i in range(self.natoms): bonds = self.get_bonds(i) for j in range(len(bonds) - 1): self.angles.append(sort_bend_angle(bonds[j] + bonds[j + 1])) self.torsions = [] for index1 in range(self.natoms): bonds = self.get_bonds(index1) if len(bonds) >= 2: for i, middle_bond in enumerate(bonds): index2 = [atom for atom in middle_bond if atom != index1][0] bonds2 = self.get_bonds(index2) if len(bonds2) >= 2: if i == len(bonds) - 1: bond1 = bonds[0] else: bond1 = bonds[i + 1] for bond in bonds2: if sorted(bond) != sorted(middle_bond): bond2 = bond self.torsions.append( sort_torsion_bonds(bond1, middle_bond, bond2)) else: continue
from property_prediction.data_utils import TaskDataLoader
import networkx as nx
from pysmiles import read_smiles

# Report how many graph nodes pysmiles produces for each molecule of the task
# and collect the indices of molecules that consist of a single node.
task = 'FreeSolv'
path = '../datasets/{}.csv'.format(task)
data_loader = TaskDataLoader(task, path)
smiles_list, y = data_loader.load_property_data()

indices = []
for i in range(len(smiles_list)):
    smiles = smiles_list[i]
    number_of_nodes = read_smiles(smiles).number_of_nodes()
    print('number of nodes for index ', i, ' is: ', number_of_nodes)
    if number_of_nodes != 1:
        continue
    indices.append(i)
    print(smiles)
print(indices)
wf.write(data) file = open('REAL.csv', 'r', encoding='utf-8') f = csv.reader(file) for idx, line in enumerate(f): if idx ==0: continue if len(line) == 0: continue name, smiles, _, group = line[:4] filename = str(group) + 'REAL' + name.split('-')[1] print(smiles, filename, group) mol = read_smiles(str(smiles)) labels = mol.nodes(data='element') node_labels = ['0'] for label in labels: node_labels += [label[1]] matrix = nx.to_numpy_matrix(mol, weight='order').tolist() content = [[0 for i in range(len(matrix) + 1)]] for ma in matrix: content.append([0] + ma) print() with open('group/smiles'+ group + '/' + filename +'.txt', 'w') as f: f.write(smiles) writeFile('group/' + group + '/' + filename + '.txt', content, node_labels)
import pysmiles import dgl import matplotlib.pyplot as plt import networkx as nx import numpy as np import pdb g = pysmiles.read_smiles( "CN1CC[C@@]23C=C[C@@H](C[C@@H]2OC4=C(C=CCC(=C34)C1)OC)O.Br") #g = pysmiles.read_smiles("O=[N+]([O-])C(Br)(CO)CO") for i in range(30): for j in range(30): try: e = g[i][j] except Exception: continue print("%d - %d" % (i, j), e) for i in range(min(30, len(g.nodes))): print(g.nodes[i]) pdb.set_trace() #g = dgl.DGLGraph(g) def draw(g): #g = g.to_networkx().to_undirected() def make_color(x): if x == 'C':
return edges_occ chemicals = pd.read_pickle("smiles.pickle") chemicals_data = [] print(chemicals) headers = [] targets = [] i = 0 for index, row in chemicals.iterrows(): if not is_tree(G): continue i+=1 if i%1 == 0: print(i) G = read_smiles(row["smiles"], explicit_hydrogen=True) try: mol_weight = MW(CAS_from_any(row["chemicals"])) boiling_point = Tb(CAS_from_any(row['chemicals'])) except(ValueError): continue if boiling_point == None or mol_weight == None: continue occ = count_atom_occurencies(G) occ.update({'boiling_point': boiling_point}) """ try: with timeout(2, exception=RuntimeError): occ.update({'GP_index': calculate_indices.calcuate_pisanski(G)}) except RuntimeError: continue