def test_smiles(self):
    """Check that every non-empty SMILES string in hwi-compounds.csv parses.

    Reads the tab-delimited file ``<self.path>/../data/hwi-compounds.csv``
    and, for each row that has a value in its ``smiles`` column, asserts
    that ``smilin`` returns something other than ``None``.
    """
    # NOTE(review): 'rb' is the mode Python 2's csv module expects; on
    # Python 3 csv needs a text-mode file (newline='') -- confirm the
    # interpreter this test suite targets.
    with open('%s/../data/hwi-compounds.csv' % self.path, 'rb') as fh:
        reader = csv.DictReader(fh, delimiter="\t")
        for row in reader:
            smiles = row['smiles']
            # Truthiness covers both the empty string and the None that
            # DictReader fills in for rows shorter than the header.
            if not smiles:
                continue
            mol = smilin(smiles)
            assert mol is not None, "failed to parse smiles: %r" % (smiles,)
def _set_summary_stats(self, path):
    """
    Set summary data for each compound (ex. mw,density,smiles).

    Reads the tab-delimited file at ``path`` into a table keyed by the
    lower-cased ``name`` column, then walks every component of every
    cocktail in ``self.cocktails``:

    * components whose name is not in the table are logged and left
      untouched;
    * ``molecular_weight`` and ``density`` are set to the float value of
      the matching column, or to ``None`` when the column is absent/empty;
    * ``smiles`` is set when the column is non-empty, and a message is
      logged if ``smilin`` cannot parse it (the attribute is still set).

    :param path: path of the tab-delimited summary file.
    """
    cols = ['molecular_weight', 'density']

    # NOTE(review): 'rb' is the mode Python 2's csv module expects; on
    # Python 3 csv needs a text-mode file (newline='') -- confirm the
    # interpreter this module targets.
    with open(path, 'rb') as csvfile:
        reader = csv.DictReader(csvfile, delimiter="\t")
        # Later rows with the same (lower-cased) name replace earlier ones.
        data = {row['name'].lower(): row for row in reader}

    for ck in self.cocktails:
        for cp in ck.components:
            # NOTE(review): table keys are lower-cased but cp.name is
            # looked up as-is -- presumably component names are already
            # lower-case; confirm against the cocktail loader.
            row = data.get(cp.name)
            if row is None:
                logger.info("Missing summary data for compound: %s", cp.name)
                continue

            for key in cols:
                # .get() + truthiness treats a missing column, an empty
                # string and a None-filled short row all as "no value".
                value = row.get(key)
                setattr(cp, key, float(value) if value else None)

            smiles = row.get('smiles')
            if not smiles:
                logger.info("Missing smiles data for compound: %s", cp.name)
                continue

            cp.smiles = smiles
            try:
                # Parsed only to validate the string; the result is unused.
                smilin(cp.smiles)
            except Exception:
                logger.info(
                    "Invalid smiles format, failed to parse smiles for compound: %s",
                    cp.name,
                )
def mol(self):
    """Return the molecule parsed from ``self.smiles``, or ``None``.

    Results are memoised in the module-level ``_mol_cache`` keyed by the
    SMILES string. ``None`` is returned when ``self.smiles`` is ``None``
    or when ``smilin`` fails to parse it; a failed parse is logged at
    critical level and is not cached.
    """
    smiles = self.smiles
    if smiles in _mol_cache:
        return _mol_cache[smiles]
    if smiles is None:
        return None

    try:
        mol = smilin(smiles)
    except Exception:
        # Best-effort: report the bad input but let the caller carry on.
        logger.critical(
            "Invalid smiles format, failed to parse smiles for compound: %s",
            self.name,
        )
        return None

    _mol_cache[smiles] = mol
    return mol
def test_smilein(self):
    """Parse every line of smiles.txt and print a dump of each molecule.

    Reads ``<self.path>/smiles.txt``, strips each line, parses it with
    ``smilin`` and prints, per molecule: one line per bond (symbol, bond
    order, bond type, fixed flag), one line per atom (atom and the sum of
    its bond orders), and finally the input string with the ``arbsmiles()``
    and ``cansmiles()`` outputs. There are no assertions; the test fails
    only if one of those calls raises.
    """
    # Text mode so the lines are str on Python 3 as well as Python 2.
    with open("{}/smiles.txt".format(self.path), 'r') as fh:
        smiles_strings = [line.strip() for line in fh]

    for smile in smiles_strings:
        mol = smilin(smile)
        out = mol.arbsmiles()
        can = mol.cansmiles()
        # print(<single string>) produces the same output whether print is
        # the Python 2 statement or the Python 3 function.
        for bond in mol.bonds:
            print("%s %s %s %s" % (bond.symbol, bond.bondorder,
                                   bond.bondtype, bond.fixed))
        for atom in mol.atoms:
            print("%s %s" % (atom, atom.sumBondOrders()))
        print("%s %s %s" % (smile, out, can))
def test_ecfp(self):
    """Check ECFP feature counts for butyramide (the ECFP paper's example).

    The molecule is parsed once from ``CCCC(=O)N`` and fingerprinted at
    radii 0 to 3; the number of entries in each fingerprint is compared
    with the expected count.
    """
    butyramide = smilin("CCCC(=O)N")

    # (radius, expected number of features)
    expected_sizes = (
        (0, 5),    # ECFP_0
        (1, 11),   # ECFP_2
        (2, 14),   # ECFP_4
        (3, 14),   # ECFP_6
    )
    for radius, size in expected_sizes:
        features = ecfp(butyramide, radius=radius)
        assert len(features) == size
def molecule_embedding(filename, radius=3, n_bits=1024):
    """Build fixed-length binary ECFP fingerprints from a spreadsheet.

    The file is loaded with ``pd.read_excel`` and its ``MOLENAME`` and
    ``SMILES`` columns are walked in lockstep. Each SMILES string is parsed
    with ``smilin`` and fingerprinted with ``ecfp``; every key of the
    resulting fingerprint is folded into a bit position with
    ``key % n_bits`` (assumes the keys are integers -- TODO confirm against
    ``ecfp``). Molecules that fail to parse or to fingerprint are reported
    on stdout and skipped.

    Args:
        filename: path (or anything ``pd.read_excel`` accepts) of the
            spreadsheet.
        radius: radius passed to ``ecfp``. Defaults to 3.
        n_bits: length of each folded fingerprint. Defaults to 1024.

    Returns:
        A list of fingerprints, each a list of ``n_bits`` ints (0 or 1).
        Fingerprints are stored per ``MOLENAME`` value, so a repeated name
        keeps only its last fingerprint; the names themselves are not
        returned.

    Raises:
        ValueError: if ``n_bits`` is not positive.
    """
    if n_bits <= 0:
        raise ValueError("n_bits must be a positive integer")

    fingerprints = {}
    df = pd.read_excel(filename)
    for mol_id, smile in zip(df.MOLENAME, df.SMILES):
        try:
            mol = smilin(smile)
        except Exception:
            print('Bad mol for', mol_id)
            continue

        try:
            fp = ecfp(mol, radius=radius)
            bits = [0] * n_bits
            for nbrhood_hash in fp.keys():
                bits[nbrhood_hash % n_bits] = 1
        except Exception:
            print('Bad ecfp for', mol_id)
            continue

        fingerprints[mol_id] = bits

    return list(fingerprints.values())