def clustering(self, smiles: list, verbose=0):
    """Group SMILES strings into clusters by Tanimoto similarity.

    Args:
        smiles (list): list of SMILES strings.
        verbose (bool): whether to show a progress bar.

    Returns:
        list: SMILES clusters (each a list of Smiles objects).
    """
    self.clusters = dict()
    n_clusters = 0
    iterator = tqdm(smiles) if verbose else smiles
    for raw in iterator:
        mol = Smiles(raw)
        if mol.rdkit_mol is None:
            # Unparsable SMILES: skip it.
            continue
        match_idx = self._find_similarity(mol)
        if match_idx is None:
            # No sufficiently similar cluster exists: open a new one.
            self.clusters[n_clusters] = [mol]
            n_clusters += 1
        else:
            self._update_cluster(match_idx, mol)
    # Flatten the index -> cluster mapping into a plain list of clusters.
    self.clusters = list(self.clusters.values())
    return self.clusters
def create_graph(self, smi, idx, q):
    """Convert a SMILES string to a (graph, label, idx) tuple and enqueue it.

    Args:
        smi: SMILES string to convert.
        idx: index associated with this sample.
        q: queue that receives the ``(graph, label, idx)`` tuple.
    """
    try:
        graph = Smiles(smi).to_graph(sparse=True)
    except AttributeError:
        # Invalid SMILES: nothing is enqueued for this sample.
        return
    fingerprint = get_filtered_fingerprint(smi)
    # Shape the fingerprint as a 1 x n long tensor.
    label = torch.tensor(list(fingerprint), dtype=torch.long).unsqueeze(0)
    q.put((graph, label, idx))
def create_graph(self, data, idx, q):
    """Convert a (smiles, fingerprint) pair to a graph sample and enqueue it.

    Args:
        data: pair where ``data[0]`` is a SMILES string and ``data[1]`` is a
            precomputed fingerprint sequence.
        idx: index associated with this sample.
        q: queue that receives the ``(graph, label, idx)`` tuple.
    """
    try:
        graph = Smiles(data[0]).to_graph(sparse=True)
    except AttributeError:
        # Invalid SMILES: nothing is enqueued for this sample.
        return
    # Shape the precomputed fingerprint as a 1 x n long tensor.
    label = torch.tensor(data[1], dtype=torch.long).unsqueeze(0)
    q.put((graph, label, idx))
def smiles2graph(smiles):
    """Convert a SMILES string to a torch_geometric ``Data`` graph.

    Args:
        smiles (str): SMILES string of the molecule.

    Returns:
        Data: graph with float node-feature matrix ``x`` and long COO
        ``edge_index``.

    Raises:
        AttributeError: if the SMILES string cannot be parsed into a
            molecule (propagated from ``to_graph``).
    """
    # The original ``try/except AttributeError: raise`` was a no-op wrapper
    # (it only re-raised the same exception) and has been removed.
    graph = Smiles(smiles).to_graph(sparse=True)
    x = torch.tensor(graph["atom_features"], dtype=torch.float)
    coo = graph["adjacency"].tocoo()
    edge_idx = torch.tensor([coo.row, coo.col], dtype=torch.long)
    return Data(x=x, edge_index=edge_idx)
def process(self, smiles_col, ecfp_col, label_col=None):
    """Build and save the processed graph dataset from the raw CSV file.

    Args:
        smiles_col (str): name of the CSV column holding SMILES strings.
        ecfp_col (str): name of the column holding fingerprint bit strings
            (a string of '0'/'1' characters per row).
        label_col (str, optional): name of an extra label column; when given,
            the label is attached to each ``Data`` object.
    """
    data_list = list()
    df = pd.read_csv(self.raw_paths[0])
    if label_col is None:
        it = zip(df[smiles_col], df[ecfp_col])
    else:
        it = zip(df[smiles_col], df[ecfp_col], df[label_col])
    for item in it:
        smiles, fp = item[0], item[1]
        smi = Smiles(smiles)
        try:
            graph = smi.to_graph(sparse=True)
        except AttributeError:
            # Unparsable SMILES: skip this row entirely.
            continue
        x = torch.tensor(graph["atom_features"], dtype=torch.float)
        coo = graph["adjacency"].tocoo()
        edge_idx = torch.tensor([coo.row, coo.col], dtype=torch.long)
        # Fingerprint is stored as a string of 0/1 characters; the previous
        # ``list(map(int, list(fp.strip())))`` had a redundant inner list().
        y = torch.tensor([int(c) for c in fp.strip()],
                         dtype=torch.long)[None, :]
        if label_col is None:
            data_list.append(Data(x=x, edge_index=edge_idx, y=y))
        else:
            data_list.append(
                Data(x=x, edge_index=edge_idx, y=y, label=item[2]))
    if self.pre_filter is not None:
        data_list = [data for data in data_list if self.pre_filter(data)]
    if self.pre_transform is not None:
        data_list = [self.pre_transform(data) for data in data_list]
    data, slices = self.collate(data_list)
    torch.save((data, slices), self.processed_paths[0])
def write_graphs(inpath, outpath, prefix=None):
    """Convert the JAK dataset CSV to graph files.

    Args:
        inpath (str): path to the input CSV; each line is ``id,smiles,label``.
        outpath (str): output path handed to the graph writer.
        prefix (str, optional): filename prefix passed to the writer.
    """
    smiles = list()
    labels = list()
    with open(inpath, "r") as inf:
        # Iterate the file directly instead of the manual readline() loop,
        # which duplicated the advance-before-continue bookkeeping.
        for line in inf:
            _, sm, lb = line.strip().split(",")
            if MolFromSmiles(sm) is None:
                # Skip rows whose SMILES RDKit cannot parse.
                continue
            smiles.append(Smiles(sm))
            labels.append(lb)
    writer = GraphWriter(smiles)
    writer.write(outpath, prefix=prefix, graph_labels=labels)
def write_graphs(inpath, outpath, prefix=None):
    """Convert a plain SMILES file to graphs labelled with fingerprints.

    Args:
        inpath (str): path to the input file; one SMILES string per line.
        outpath (str): output path handed to the graph writer.
        prefix (str, optional): filename prefix passed to the writer.
    """
    smiles = list()
    fps = list()
    pb = tqdm()
    with open(inpath, "r") as inf:
        # Iterate the file directly instead of the manual readline() loop,
        # which duplicated the advance-before-continue bookkeeping.
        for line in inf:
            sm = line.strip()
            if MolFromSmiles(sm) is None:
                # Skip lines whose SMILES RDKit cannot parse.
                continue
            smiles.append(Smiles(sm))
            # Store the filtered fingerprint as a comma-separated string.
            fps.append(",".join(map(str, get_filtered_fingerprint(sm))))
            pb.update(1)
    # The bar was previously left open; close it to release its resources.
    pb.close()
    writer = GraphWriter(smiles)
    writer.write(outpath, prefix=prefix, graph_labels=fps)
def _graph_helper(self, smi):
    """Convert a SMILES string into node features and an edge index.

    Args:
        smi (str): SMILES string of the molecule.

    Returns:
        tuple: ``(x, edge_idx)`` — float atom-feature matrix and long COO
        edge-index tensor.
    """
    graph = Smiles(smi).to_graph(sparse=True)
    features = torch.tensor(graph["atom_features"], dtype=torch.float)
    coo = graph["adjacency"].tocoo()
    indices = torch.tensor([coo.row, coo.col], dtype=torch.long)
    return features, indices