def append(self, newms):
    """
    Extend the training set with extra molecules.

    Every entry of `newms` is wrapped in `crc.RDMol` (with the force field
    stored in `self.ff`), relaxed with `optg()` followed by `optg2()`, and
    its `ats` attribute is collected. `self.ms` is then rebuilt as
    `self.a1 + self.a2 + <new entries> + self.ts`.

    Note: the new entries are kept only inside `self.ms`; a later call
    rebuilds `self.ms` from scratch and therefore drops the entries added
    by an earlier call.
    """

    def _relaxed_ats(mol):
        # build the molecule object and run both optimisation passes
        rdm = crc.RDMol(mol, forcefield=self.ff)
        rdm.optg()
        rdm.optg2()
        return rdm.ats

    extra = [_relaxed_ats(mol) for mol in newms]
    self.ms = self.a1 + self.a2 + extra + self.ts
def diagnose1(fa, fq):
    """
    Print the angle/dihedral values of one amon next to those of the query.

    Both files are loaded with `crk.RDMol` and their angles and dihedral
    angles are computed (`wH=F, key='ia'`). The amon's 'smiles_indigo'
    property is turned into a SMARTS pattern, which is matched against both
    molecules. `get_angles34` then yields, per match, a sequence of type
    keys and a sequence of values. Only the first amon match is used as the
    reference; every query match is re-ordered to the key order of that
    reference before the values and their differences are printed.

    fa: file of the amon
    fq: file of the query molecule
    """

    def _load_with_angles(fname):
        mol = crk.RDMol(fname)
        mol.get_angles(wH=F, key='ia')
        mol.get_dihedral_angles(wH=F, key='ia')
        return mol

    amon = _load_with_angles(fa)
    query = _load_with_angles(fq)

    pattern = crk.smi2patt(amon.prop['smiles_indigo'])
    print('patt=', pattern)
    substruct = Chem.MolFromSmarts(pattern)
    matches_a = amon.m.GetSubstructMatches(substruct)
    matches_q = query.m.GetSubstructMatches(substruct)

    types_a, vals_a = get_angles34(amon.dangs, matches_a)
    types_q, raw_vals_q = get_angles34(query.dangs, matches_q)

    # bring every query match into the key order of the first amon match
    vals_q = []
    for i, q_types in enumerate(types_q):
        lookup = dict(zip(q_types, raw_vals_q[i]))
        vals_q.append([lookup[key] for key in types_a[0]])

    print(' amon:')
    print(types_a[0])
    print(list(vals_a[0]))
    print(' query:')
    for row in vals_q:
        print(row)
    print(' Difference:')
    for row in vals_q:
        # NOTE(review): `row` is a plain list, so this subtraction relies on
        # vals_a[0] supporting reflected subtraction (e.g. a numpy array) --
        # confirm against get_angles34.
        print(list(row - vals_a[0]))
def __init__(self, fs, rsmi, props=['HF'], istart=0):
    """
    Collect the conformers in `fs`, keeping for the Gaussian outputs only
    those whose SMILES still matches the reference `rsmi`.

    Parameters
    ----------
    fs : list of str
        Input files. The last three characters select the reader:
        'log'/'out' are parsed with `GR0` (G09 output, per the original
        comment) and checked against `rsmi`; 'mol'/'sdf' are read with
        `crk.RDMol` and accepted without a check. Files with any other
        extension are silently skipped.
    rsmi : str
        Reference SMILES. A log/out file is kept if `rsmi` equals either
        the SMILES perceived by `cmm.Mol` or `get_alternative` of it.
    props : list of str
        Names of the properties to collect (never modified here).
    istart : int
        Forwarded to `GR0`.

    Attributes set
    --------------
    nc0 : number of input files
    nc : number of accepted conformers
    fsc, ms, cso, ys : per accepted conformer, the file name, the `atoms`
        object, the molecule descriptor and the property values.
        For log/out files `cso` holds the `cmm.Mol` object and `ys` a
        dict {prop: value}; for mol/sdf files `cso` holds the
        'smiles_indigo' string and `ys` a list of values.
    fs_diss : log/out files rejected (SMILES mismatch or failed perception)
    fs_redundant : initialised empty, not filled here
    """
    self.nc0 = len(fs)
    self.fs_diss = []
    self.fs_redundant = []

    fsc = []
    cso = []  # mol objects
    ms = []  # ase mols
    ys = []
    for f in fs:
        fmt = f[-3:]
        if fmt in ('log', 'out'):  # G09 output file
            dic = GR0(f, istart=istart)[-1]
            # `np.int` was only an alias of the builtin and has been
            # removed from NumPy; plain `int` gives the same dtype.
            zs = np.array(dic['Atomic_numbers'], int)
            coords = np.array(dic['Positions'])
            m = atoms(zs, coords)
            try:
                co = cmm.Mol(zs, coords, ican=True)
                can2 = get_alternative(co.can)
                if rsmi not in (co.can, can2):
                    print("#ERROR: %s has a SMILES %s, differs from %s" %
                          (f, co.can, rsmi))
                    self.fs_diss.append(f)
                    continue
                _ys = {key: get_val(dic, key) for key in props}
            except Exception:
                # Any failure while perceiving the molecule or reading a
                # property is reported as a radical and the file is
                # dropped. `Exception` (not a bare except) lets Ctrl-C and
                # SystemExit through.
                print("#ERROR: this is a radical!")
                self.fs_diss.append(f)
                continue
            ys.append(_ys)
            ms.append(m)
            cso.append(co)
            fsc.append(f)
        elif fmt in ('mol', 'sdf'):
            oo = crk.RDMol(f)
            ms.append(atoms(oo.zs, oo.coords))
            cso.append(oo.prop['smiles_indigo'])
            fsc.append(f)
            ys.append([oo.prop[k] for k in props])

    self.cso = cso
    self.ms = ms
    self.fsc = fsc
    self.nc = len(cso)
    self.ys = ys
def query(self, fq, idQ=None, k=7):
    """
    Load the query molecule and cache its data on the instance.

    Parameters
    ----------
    fq : str
        File of the query molecule, read with `crk.RDMol`.
    idQ : optional
        Identifier of the query. Required (asserted) when conformer maps
        are in use, i.e. when `self.cmaps[0]` is true.
    k : int
        Passed to `coa.ParentMols` as `k`.

    Attributes set
    --------------
    fq, idQ : the arguments as given
    amons_q : `cans` of the `coa.ParentMols` object built from the query
        SMILES (property 'smiles_<self.ctk>')
    ys_q : array of shape (1, len(self.props)); properties missing from
        the file are filled with NaN
    zs_q, coords_q : atomic numbers and coordinates of the query
    nsheav_q : one-element list with the number of atoms having Z > 1
    objq : the `crk.RDMol` object, with atom types, angles and dihedral
        angles computed
    """
    if self.cmaps[0]:
        assert idQ is not None
    # Set regardless of cmaps; the value is only required when maps exist.
    self.idQ = idQ
    self.fq = fq

    objq = crk.RDMol(fq)
    zs, coords, ydic = objq.zs, objq.coords, objq.prop
    smi = ydic['smiles_%s' % self.ctk]

    _ys = [ydic[key] if key in ydic.keys() else np.nan
           for key in self.props]
    nheav = (np.array(zs) > 1).sum()

    reduce_namons = T
    # Honour the caller's `k`; it used to be hard-coded to 7 here, which
    # silently ignored the argument.
    ao = coa.ParentMols([smi], reduce_namons, wg=F, imap=T, k=k)

    self.amons_q = ao.cans
    self.ys_q = np.array([_ys])
    # `np.int` has been removed from NumPy; the builtin is equivalent.
    self.zs_q = np.array(zs, int)
    self.nsheav_q = [nheav]
    self.coords_q = np.array(coords)

    objq.get_atypes()
    objq.get_angles(wH=F, key='ia')
    objq.get_dihedral_angles(wH=F, key='ia')
    self.objq = objq
def __init__(self, obj, addh=True):
    """
    Build `crk.RDMol` objects from SMILES strings and/or SMILES files.

    Parameters
    ----------
    obj : str or list/tuple of str
        Each string is either the path of an existing file (one SMILES
        per line, surrounding whitespace stripped) or a SMILES string
        itself. Any other type yields an empty set of molecules.
    addh : bool
        Currently unused; kept for interface compatibility.

    Attributes set
    --------------
    nm : number of SMILES strings collected
    objs : list of `crk.RDMol`, one per SMILES string
    """

    def _expand(item):
        # An existing path is read line by line, anything else is taken
        # to be a SMILES string.
        if os.path.exists(item):
            with open(item) as fid:
                return [line.strip() for line in fid]
        return [item]

    if isinstance(obj, str):
        items = [obj]
    elif isinstance(obj, (list, tuple)):
        items = obj
    else:
        items = []

    smis = []
    for item in items:
        # Each list element is expanded on its own. The previous code
        # opened the whole list `obj` here, and used the Python-2-only
        # `file()` builtin without closing the handle.
        smis += _expand(item)

    self.nm = len(smis)
    self.objs = [crk.RDMol(si) for si in smis]
def __init__(self, objs, wd='targets/', reduce_namons=T, wg=F,
             i3d=T, a1=T, level=2, exta=0, k=7, k2=7,
             ff='mmff94', owt=F):
    """
    Assemble target molecules, their generic amons and (optionally)
    extended amons into one list of molecules, `self.ms`.

    vars
    ==============
    objs: one target or a list of targets (a single str is wrapped in a
          list); each is passed to `crc.RDMol` and `coo.smi2oem`
    wd: folder the target geometries (<can>.xyz / <can>.sdf) are written
        to; created if missing
    reduce_namons, wg, i3d, k, k2: forwarded to `coa.ParentMols`
    a1: if true, generate the generic amons of the targets
    level: extended amons level, 1 or 2
    exta: maximal N_I of extended amons; 0 disables extended amons
    ff: force field name handed to `crc.RDMol` / `coa.ParentMols`
    owt: overwrite target (when writing sdf file for target)

    attributes set
    ==============
    ff: the force field name
    ts: molecules of the targets
    a1: molecules of the generic amons (empty if the `a1` flag is false)
    a2: molecules of the extended amons (empty if `exta` is 0)
    ms: a1 + a2 + ts
    """
    if isinstance(objs, str):
        objs = [objs]
    os.makedirs(wd, exist_ok=True)
    self.ff = ff

    # first get 3d geom of targets
    ts = []
    fts = []
    for obj in objs:
        om = crc.RDMol(obj, forcefield=ff)
        om.optg()
        om.optg2()
        ats = om.ats
        can = coo.oem2can(coo.smi2oem(obj)[1])
        # os.path.join also works when `wd` has no trailing separator
        f1 = os.path.join(wd, '%s.xyz' % can)
        f2 = os.path.join(wd, '%s.sdf' % can)
        if owt or not (os.path.exists(f2) and os.path.exists(f1)):
            om.write_sdf(f2)
            om.write_xyz(f1)
        else:
            # both files already exist: reuse the stored geometry
            ats = cc.obj2mol(f1, ['E'])
        fts.append(f2)
        ts.append(ats)
    self.ts = ts

    # generic amons
    # The result list needs its own name: it used to be called `a1`, which
    # overwrote the `a1` flag with an empty list and so disabled this
    # branch for every caller.
    amons1 = []
    if a1:
        imap = F if len(fts) == 1 else T
        oa = coa.ParentMols(fts, reduce_namons, label=None,
                            imap=imap, fixGeom=F, i3d=i3d, wg=wg,
                            k=k, k2=k2, iprt=T, forcefield=ff,
                            thresh=0.01, debug=F)
        for mi in oa.ms:
            om = crc.RDMol(mi, forcefield=ff)
            om.iFFOpt = T
            om.optg2()
            amons1.append(om.ats)
    self.a1 = amons1

    a2 = []
    assert exta >= 0, '#ERROR: `exta: N_I of extended amons shoud be >= 0'
    if exta:  # add extended amons
        oa2 = coae.transform(objs[0])
        oa2.get_newolds()
        # return values are not needed here; the call is kept as is
        _newms, _newms2 = oa2.T(level=level)
        oa2.get_amons_extended(k=exta)
        for mi in oa2.amons_extended:
            om = crc.RDMol(mi, forcefield=ff)
            om.optg()
            om.optg2()
            a2.append(om.ats)
    self.a2 = a2

    self.ms = amons1 + a2 + ts
"OC12C3C4C1N4C32"] # the last one is highly strained, may have problem in acquring g0 elif n == 1: f = args[0] if f[-3:] in ['smi','can']: objs = [ si.strip() for si in file(f).readlines() ] else: # either an xyz file or a SMILES string objs = args else: objs = args isf = False nobj = len(objs) for i,obj in enumerate(objs): if not os.path.isfile(obj): f = tpf.NamedTemporaryFile(dir='/tmp').name + '.xyz' m0 = cir.RDMol(obj, doff=True) m0.write_xyz(f) else: isf = True f = obj o = cc.molecule(f, isimple=T) can = 'None' iok = T if nobj > 1: if trial: try: m = Mol(o.zs, o.coords, ican=True) can = m.can except: iok = F #print(' conversion failed!')#pass else:
def __init__(self, fd, fd2=None, fcanr=None, h5f=None,
             imb=False, props=['E'], ctk='oechem'):
    """
    Load amon conformers from `frag*.sdf` files and build the maps between
    conformers and molecular graphs (unique SMILES).

    Parameters
    ----------
    fd : str
        Folder with the `frag*.sdf` files. Files sharing a SMILES must be
        listed consecutively, because conformer indices are assigned to
        graphs in contiguous blocks.
    fd2 : str, optional
        Second folder with additional conformers. Only those whose SMILES
        is already among the amons are added.
    fcanr : str, optional
        File containing canonical SMILES for reference (from which the
        conformers were generated), one per line.
    h5f : str, optional
        File read with `dd.io.load`; its 'maps' entry holds, per row,
        conformer indices padded with -1.
    imb : bool
        If true, angles and dihedral angles are computed for every
        conformer (many-body terms).
    props : list of str
        Names of the properties read from every conformer.
    ctk : str
        Cheminformatics toolkit whose SMILES is used, i.e. the property
        'smiles_<ctk>' (rdkit, indigo, openbabel or oechem per the
        original comment).

    Raises
    ------
    AssertionError
        No sdf file was found, or the file names differ in length.
    RuntimeError
        The number of amons in `fcanr` differs from the number of unique
        SMILES found, or max(maps) != (number of sdf files) - 1.
    """
    # private copy, so that the shared default list is never aliased
    props = list(props)
    self.ctk = ctk
    if fd.endswith('/'):
        fd = fd[:-1]
    self.fda = fd

    fs = io2.cmdout('ls %s/frag*.sdf' % fd)
    nf = len(fs)
    # test for "nothing found" first, so an empty folder is reported as such
    assert nf > 0, '#ERROR: no *.sdf found'
    assert len({len(f) for f in fs}) == 1, \
        '#ERROR: filename length may differ'

    objs = []
    s1 = []  # unique smiles strings, in order of first appearance
    ncs = []  # number of conformers per unique smiles
    for f in fs:
        obj = crk.RDMol(f)
        objs.append(obj)
        smi = obj.prop['smiles_%s' % ctk]
        if smi in s1:
            ncs[s1.index(smi)] += 1
        else:
            s1.append(smi)
            ncs.append(1)

    if fcanr is not None:
        with open(fcanr) as fid:
            _amons = [si.strip() for si in fid if si.strip()]
        # The count has to be taken before it is compared, and the error
        # is a *mismatch* of the two counts.
        if len(_amons) != len(s1):
            print("#ERROR: #amons from sdf's inconsistent with that in %s" %
                  fcanr)
            print(" This means that some amons (graph) may have been")
            print(" hiscarded due to dissociation")
            raise RuntimeError(
                'number of amons in %s (%d) differs from the number of '
                'unique SMILES in the sdf files (%d)' %
                (fcanr, len(_amons), len(s1)))
        # NOTE(review): graph indices below follow the order of `s1`, while
        # the lookup for `fd2` conformers uses the order of `_amons`; this
        # presumably requires `fcanr` to list the amons in the same order
        # -- confirm.
    else:
        _amons = s1
    self.amons = _amons

    nm = len(s1)
    cidxs = np.arange(nf).astype(int)
    # [ics1[im], ics2[im]) is the block of conformer indices of graph `im`
    ics2 = np.array(np.cumsum(ncs), dtype=int)
    ics1 = np.array(np.concatenate(([0], ics2[:-1])), dtype=int)
    c2amap = []
    for im in range(nm):
        c2amap += [im] * ncs[im]
    a2cmap = [list(cidxs[ics1[im]:ics2[im]]) for im in range(nm)]

    cmaps = [F]
    # if provided h5 file, read conformer IDs from it
    if h5f is not None:
        dt = dd.io.load(h5f)
        _maps = dt['maps']
        if np.max(_maps) != nf - 1:
            print(' * Error: #sdf files .ne. max(maps)+1!!')
            raise RuntimeError(
                'max(maps) = %s, but %d sdf files were found in %s' %
                (np.max(_maps), nf, fd))
        cmaps = [T, _maps]

    # add new conformers from another folder
    ic2 = nf  # running index of the next accepted conformer
    if fd2 is not None:
        fs2 = io2.cmdout('ls %s/frag*sdf' % fd2)
        # NOTE(review): every file of `fd2` is appended to `fs`, including
        # those rejected below, so `fs[i]` and conformer `i` can get out of
        # step for i >= nf -- confirm whether callers rely on that.
        fs += fs2
        for f2 in fs2:
            obj2 = crk.RDMol(f2)
            smi = obj2.prop['smiles_%s' % self.ctk]
            if smi in _amons:
                im2 = _amons.index(smi)
                c2amap.append(im2)
                a2cmap[im2] += [ic2]
                ic2 += 1
                objs.append(obj2)
    c2amap = np.array(c2amap, dtype=int)

    if cmaps[0]:
        # Now update the maps: every row additionally receives the new
        # conformers (index >= nf) of all graphs it already refers to.
        # Without `fd2` there are no such conformers, and the rows are
        # merely left-packed.
        _maps = cmaps[1]
        nt, ncmax = _maps.shape
        _maps2 = []
        ncs_added = []
        for j in range(nt):
            kept = _maps[j][_maps[j] > -1]
            ims_j = np.unique(c2amap[kept])
            ics_j = []
            for im in ims_j:
                t2 = np.array(a2cmap[im], dtype=int)
                ics_j += list(t2[t2 >= nf])
            ncs_added.append(len(ics_j))
            _maps2.append(list(kept) + ics_j)
        nadd = max(ncs_added)
        print(
            ' ## for some query molecule, %d conformers at most are added as new amons'
            % nadd)
        maps2 = np.full((nt, ncmax + nadd), -1, dtype=int)
        for j, row in enumerate(_maps2):
            maps2[j, :len(row)] = row
        cmaps = [T, maps2]

    self.c2amap = c2amap  # (amon/conformer) to (molecule graph) map
    self.a2cmap = a2cmap
    self.cmaps = cmaps

    nct = ic2
    self.nmt = nm
    self.nct = nct
    self.fs = fs

    objsc = []
    ys = []
    zs = []
    nsheav = []
    nas = []
    coords = []
    for objc in objs[:nct]:
        zs += list(objc.zs)
        nas.append(objc.na)
        coords += list(objc.coords)
        nsheav.append(objc.nheav)
        if imb:  # calc many-body terms
            objc.get_angles(wH=F, key='ia')
            objc.get_dihedral_angles(wH=F, key='ia')
        objsc.append(objc)
        ys.append([objc.prop[key] for key in props])

    # `np.int` has been removed from NumPy; the builtin `int` is equivalent
    self.nas = np.array(nas, int)
    self.zs = np.array(zs, int)
    self.coords = np.array(coords)
    self.nsheav = np.array(nsheav, int)
    self.objsc = objsc
    self.ics1 = ics1
    self.ics2 = ics2
    self.props = props
    self.ys = np.array(ys)