def collect(fdr='.', nrt=None, out=None, csv=None):
    """ Collect simulation reports in a folder. """
    fns = sorted(f for f in ls(fdr) if f.endswith('pgz'))

    # configuration, training history, and benchmarks
    cfg, hst, bmk = [], [], []
    for i, f in enumerate(fns):
        if nrt is not None and not i < nrt:
            break
        f = pt.join(fdr, f)
        print(f)
        pgz = lpz(f)

        # 1) collect training history
        hs = pgz.pop('hst')
        hst.append(hs)

        # 2) collect simulation configuration
        cf = ['fam', 'xtp', 'frq', 'mdl', 'rsq', 'gdy', 'gtp']
        cf = dict((k, v) for k, v in pgz.items() if k in cf)
        cf['nxp'] = '{}x{}'.format(pgz['gmx'].shape[0], pgz['gmx'].shape[2])
        cf['nwk'] = pgz['dim']
        cf = pd.Series(cf)
        cfg.append(cf)

        # 3) collect reference benchmarks, also append the performance of NNT
        bmk.append(pgz.pop('bmk').reset_index())

    # concatenation
    _df = []
    for c, b in zip(cfg, bmk):
        _df.append(pd.concat([pd.DataFrame([c] * b.shape[0]), b], axis=1))
    bmk = pd.concat(_df)

    # non-NNT methods do not rely on these parameters
    bmk.loc[bmk.mtd != 'nnt', ['gtp', 'nwk', 'xtp']] = '-'

    # configuration keys and report keys
    cfk = cf.index.tolist() + ['mtd', 'par', 'key']
    _gp = bmk.groupby(cfk)

    # means, stds, and iteration count of 'val'
    _mu = _gp.val.mean().rename('mu')
    _sd = _gp.val.std().rename('sd')
    _it = _gp.val.count().rename('itr')
    rpt = pd.concat([_mu, _sd, _it], axis=1).reset_index()
    rpt = rpt.loc[:, cfk + ['mu', 'sd', 'itr']]

    # do the same for training history
    hst = pd.concat(hst)
    _gp = hst.groupby('ep')
    _it = _gp.terr.count().rename('itr')
    hst = pd.concat([_gp.mean(numeric_only=True), _it], axis=1).reset_index()

    # save and return
    ret = Bunch(bmk=bmk, hst=hst, rpt=rpt)
    if out:
        spz(out, ret)
    if csv:
        rpt.to_csv(csv)
    return ret
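# A minimal usage sketch for the report collector above, assuming the module
# aliases used throughout this file (`ls` = os.listdir, `pt` = os.path,
# `lpz`/`spz` = pickle-gzip load/save). Folder and file names are hypothetical.
def _demo_collect_report():
    ret = collect(fdr='sim_out', nrt=100, out='report.pgz', csv='report.csv')
    print(ret.rpt.head())  # per-configuration mean/sd/iteration count of 'val'
    print(ret.hst.head())  # training history averaged per epoch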
def collect(fdr):
    """ Collect encoder output in a folder. """
    eot = []
    hof = []
    pcs = []
    # __n = 0
    for fname in sorted(ls(fdr)):
        if not fname.endswith('pgz'):
            continue
        # if not __n < 10:
        #     break
        # __n = __n + 1
        fname = pt.join(fdr, fname)
        print(fname)
        output = lpz(fname)
        eot.append(output['eot'])
        hof.append(output['hof'])
        if not isinstance(output['pcs'], Exception):
            pcs.append(output['pcs'][:, 0:16])

    import numpy as np
    eot = np.array(eot, 'f')
    hof = np.array(hof, 'f')
    pcs = np.array(pcs, 'f')
    hof = np.transpose(hof, [1, 2, 0])
    pcs = np.transpose(pcs, [1, 2, 0])
    ret = {'eot': eot, 'hof': hof, 'pcs': pcs}
    return ret
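# A usage sketch for the encoder-output collector above; the folder name is
# hypothetical. After stacking, `hof` and `pcs` are transposed so the run
# (file) axis comes last.
def _demo_collect_encoder():
    ret = collect('enc_out')
    print(ret['eot'].shape)  # (n_runs,): training error per run
    print(ret['hof'].shape)  # (n_subjects, n_features, n_runs)
    print(ret['pcs'].shape)  # (n_subjects, 16, n_runs): leading 16 PCs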
def plt1(rpt, key='REL', log=True):
    """ Plot supervised learning report. """
    # load report from file if necessary.
    sim = ['fam', 'frq', 'mdl', 'nxp']
    nnt = ['gtp', 'xtp', 'nwk']
    mtd = ['mtd', 'par']
    if isinstance(rpt, str) and rpt.endswith('pgz'):
        rpt = lpz(rpt)

    # the benchmark records
    bmk = rpt.bmk

    # title
    ttl = bmk.iloc[0][sim]
    ttl = ', '.join('{}={}'.format(k, v) for k, v in ttl.items())

    # method grouping
    grp = nnt + mtd

    # plot of relative error
    err = bmk[bmk.key == key].loc[:, nnt + mtd + ['val']]
    err = err[err.mtd != 'nul']

    # sample some data points to craft boxplot stats
    X, L = [], []
    for l, g in err.groupby(grp):
        if 'nnt' in l:
            l = "{nwk:>10}.{mtd}".format(**g.iloc[0])
        else:
            l = "{par:>10}.{mtd}".format(**g.iloc[0])
        x = np.array(g.val)
        X.append(x)
        L.append(l)
    X = np.array(X).T
    S = cbook.boxplot_stats(X, labels=L)

    # plot
    plt.close('all')
    plt.title(ttl)
    ax = plt.axes()
    if log:
        ax.set_yscale('log')
    ax.bxp(S)

    # draw a line at y=1
    x0, x1 = ax.get_xbound()
    zx, zy = np.linspace(x0, x1, 10), np.ones(10)
    ax.plot(zx, zy, linestyle='--', color='red', linewidth=.5)

    for tick in ax.get_xticklabels():
        tick.set_rotation(90)
    return rpt, plt
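# A usage sketch for plt1, assuming a report produced by the collector above
# (either the returned Bunch or its *.pgz dump); filenames are hypothetical.
def _demo_plt1():
    rpt, gfx = plt1('report.pgz', key='REL', log=True)
    gfx.savefig('rel_err.png')  # plt1 returns the pyplot module for reuse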
def main(vcf, nep=20, out=None, rdp=None, **kwd):
    """ Performance test for sigmoid and relu autoencoders.
    -- vcf: the genomic file.
    -- nep: number of epochs to go through.
    -- out: output location.
    -- rdp: reduce network depth by this much.
    ** sav: location to save training progress.
    ** mdp: maximum network depth.
    """
    # handle filenames
    stm = pt.join(pt.dirname(vcf), pt.basename(vcf).split('.')[0])
    if not pt.exists(vcf):
        # name correction
        if pt.exists(stm + '.vcf'):
            vcf = stm + '.vcf'
        elif pt.exists(stm + '.vcf.gz'):
            vcf = stm + '.vcf.gz'
        else:
            raise Exception('non-existing: ', vcf)

    sav = kwd.get('sav', '.')
    if pt.isdir(sav):
        sav = pt.join(sav, pt.basename(stm))
    if not sav.endswith('.pgz'):
        sav = sav + '.pgz'

    # prepare data
    gmx, sbj = loadVCF(vcf)
    gmx = gmx.reshape(gmx.shape[0], -1).astype('f')

    # progress recovery:
    if pt.exists(sav):
        print(sav, ": exists,")

        # do not continue to training?
        ovr = kwd.pop('ovr', 0)
        if ovr == 0:
            print(" skipped.")
            return kwd
    else:
        ovr = 2

    # resume progress, use network stored in {sav}.
    if ovr == 1:
        # options in {kwd} take precedence over {sav}.
        sdt = lpz(sav)
        sdt.update(kwd)
        kwd = sdt
        print("continue training.")
    else:
        # restart the training
        print("restart training.")

    # perform PCA on genome data if necessary
    pcs = kwd.pop('pcs', None)
    if pcs is None:
        try:
            pca = PCA(n_components=gmx.shape[0])
            pcs = pca.fit_transform(gmx)
        except np.linalg.LinAlgError as e:
            pcs = e
    kwd.update(pcs=pcs)

    hlt = kwd.get('hlt', 0)  # halted?
    if hlt > 0:
        print('NT: Halt.\nNT: Done.')
        return kwd

    mdp = kwd.pop('mdp', None)  # maximum network depth
    lrt = kwd.pop('lrt', 1e-4)  # learning rates
    gdy = kwd.pop('gdy', 0)     # some greedy pre-training?

    # train the network, create it if necessary
    nwk = kwd.pop('nwk', None)
    if nwk is None:
        # train autoencoders, each layer roughly halves the dimensionality
        dim = [gmx.shape[1]] + [1024 // 2**_ for _ in range(16) if 2**_ <= 1024]
        dim = dim[:mdp]
        nwk = SAE.from_dim(dim, s='sigmoid', **kwd)
        print('create NT: ', nwk)

    print('NT: begin')
    tnr = SAE.Train(nwk, gmx, gmx, lrt=lrt, gdy=gdy, nep=nep, **kwd)

    lrt = tnr.lrt.get_value()  # updated learning rate
    hof = nwk.ec(gmx).eval()   # high order features
    eot = tnr.terr().item()    # error of training
    hlt = tnr.hlt              # halting status
    if hlt > 0:
        print('NT: Halt.')
    print('NT: Done.')

    # update, save the progress, then return
    kwd.update(nwk=nwk, lrt=lrt, hof=hof, eot=eot, hlt=hlt, sbj=sbj)
    spz(sav, kwd)
    return kwd
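# A usage sketch, assuming a VCF (or .vcf.gz) file readable by loadVCF; the
# filename is hypothetical. `ovr` decides what to do with saved progress
# (0: skip, 1: resume, otherwise restart).
def _demo_autoencoder_test():
    ret = main('cohort.vcf.gz', nep=50, mdp=5, ovr=1, sav='progress')
    print(ret['eot'])        # final training error
    print(ret['hof'].shape)  # high-order features, one row per subject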
def plot_hist(sim, out=None, gui=0):
    """ Rearrange simulation pgz, report.
    sim: the outputs organized in a list of dictionaries.
    """
    if isinstance(sim, str):
        sim = lpz(sim)

    # performance measures
    import matplotlib as mpl
    if not gui:
        mpl.use('Agg')
    import matplotlib.pyplot as gc  # graphics context

    s0 = sim[0]
    x = s0['hst']['ep']  # shared x axis
    r2 = s0['rsq']       # shared r2
    frq = s0['frq']
    nep = s0['nep']
    fam = s0['fam']
    N, P = s0['gmx'].shape[0:3:2]
    gtp = s0['gtp']      # shared genome type
    mdl = s0['mdl']
    ttl = dict(r2=r2, N=N, P=P, gtp=gtp, frq=frq, fam=fam, nep=nep, mdl=mdl)
    ttl = ','.join(['='.join([str(k), str(v)]) for k, v in ttl.items()])
    print(ttl)

    if out is None:
        out = '.'
    if pt.isdir(out):
        out = pt.join(out, ttl)

    # benchmarks
    bmk = np.concatenate([_['bmk'] for _ in sim])

    # course types
    xtp = set(_['xtp'] for _ in sim)
    cs = 'bgrcmykw'
    for i, t in enumerate(xtp):
        sub = [s for s in sim if s['xtp'] == t]
        nwk = str(sub[0]['dim'])

        # histories
        h = np.array([_['hst'] for _ in sub])

        # early stop
        early = np.argmin(h['verr'].mean(0))

        # error plot
        gc.subplot(2, 1, 1)
        gc.loglog(x, h['verr'].mean(0), c=cs[i], ls='-', lw=2, label=t)
        gc.loglog(x, h['terr'].mean(0), c=cs[i], ls='--', lw=2, label='_' + t)
        if i == len(xtp) - 1:
            gc.loglog([early, early], gc.ylim(), c=cs[i], ls='-', lw=2)
        if i == 0:
            gc.ylabel(r'error')
            gc.title(ttl)
            gc.legend()

        # correlation plot
        gc.subplot(2, 1, 2)
        acc = 'tauc' if fam == 'bin' else 'vcor'
        ylab = r'$auc(y, \hat{y})$' if fam == 'bin' else r'$corr(y, \hat{y})$'
        gc.loglog(x, h[acc].mean(0), c=cs[i], lw=2, label=t)
        if i == len(xtp) - 1:
            gc.loglog([early, early], gc.ylim(), c=cs[i], ls='-', lw=2)

        # record
        acc = h[acc].max(1).mean()
        pgz = np.array([('nnt', '{!s:>10}'.format(t), acc)], bmk.dtype)
        bmk = np.concatenate([bmk, pgz])
        print("DNN: {:s} {:s} {:.3f}".format(t, nwk, acc))

        # axis and labels should be plotted only once
        if i == 0:
            # horizontal line to show r2
            gc.loglog(x, np.repeat(r2, x.size), 'r', lw=2, label=r'$r^2$')
            # other decoration elements
            gc.ylabel(ylab)
            gc.xlabel(r'epoch')
            gc.legend(loc=4)

    # fo = out + '.bmk'
    # np.savetxt(fo, bmk, '%s', header=' '.join(bmk.dtype.names), comments='')
    # fo = out + '.png'
    # gc.savefig(fo)
    return gc, bmk
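# A usage sketch for plot_hist; 'sim.pgz' is a hypothetical dump of the
# simulation list described in the docstring. gui=0 selects the Agg backend
# so the figure can be saved without a display.
def _demo_plot_hist():
    gfx, bmk = plot_hist('sim.pgz', out='.', gui=0)
    gfx.savefig('hist.png')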
def ept(fnm, out=None):
    """ Export training result in text format.
    -- fnm: filename of training progress.
    -- out: where to save the export.
    """
    pwd = os.getcwd()
    fnm = pt.abspath(fnm)

    import tempfile
    tpd = tempfile.mkdtemp()

    if out is None:
        out = pwd
    if pt.isdir(out):
        out = pt.join(out, pt.basename(fnm).split('.')[0])
    if not (out.endswith('.tgz') or out.endswith('.tar.gz')):
        out = out + '.tgz'
    out = pt.abspath(out)

    # read the training progress
    dat = lpz(fnm)
    [dat.pop(_) for _ in ['nwk', 'cvm', 'cvw', 'nft', 'ovr']]
    dat['fnm'] = fnm
    dat['out'] = out

    # genomic matrix
    gmx = dat.pop('gmx').astype('i1')
    np.savetxt(pt.join(tpd, 'gx0.txt'), gmx[:, 0, :], '%d')
    np.savetxt(pt.join(tpd, 'gx1.txt'), gmx[:, 1, :], '%d')

    # genomic map
    np.savetxt(pt.join(tpd, 'gmp.txt'), dat.pop('gmp'), '%d\t%d\t%s')

    # subjects
    np.savetxt(pt.join(tpd, 'sbj.txt'), dat.pop('sbj'), '%s')

    # untyped subjects (indices)
    # np.savetxt(pt.join(tpd, 'usb.txt'), dat.pop('usb'), '%d')

    # untyped variants (indices)
    # np.savetxt(pt.join(tpd, 'ugv.txt'), dat.pop('ugv'), '%d')

    # CV errors
    cve = dat.pop('cve')
    np.savetxt(pt.join(tpd, 'cve.txt'), cve, '%.8f')

    # high-order features
    hof = dat.pop('hof')
    np.savetxt(pt.join(tpd, 'hof.txt'), hof, '%.8f')

    # meta information
    inf = open(pt.join(tpd, 'inf.txt'), 'w')
    for k, v in dat.items():
        inf.write('{}={}\n'.format(k, v))
    inf.close()

    # pack the output, delete individual files
    import tarfile
    import shutil

    os.chdir(tpd)  # go to the packing dir
    try:
        tar = tarfile.open(out, 'w:gz')
        [tar.add(_) for _ in os.listdir('.')]
        shutil.rmtree(tpd, True)
        tar.close()
    except Exception as e:
        print(e)
    os.chdir(pwd)  # back to the working dir
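# A usage sketch for the exporter above; filenames are hypothetical. The
# export is a .tgz holding one text file per item (gx0.txt, gx1.txt,
# gmp.txt, sbj.txt, cve.txt, hof.txt, inf.txt).
def _demo_ept():
    import tarfile
    ept('progress.pgz', out='export')  # writes export.tgz
    with tarfile.open('export.tgz') as tar:
        print(tar.getnames())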
def main(fnm='../../raw/W09/1004', **kwd):
    """ The fine-tune procedure for a Stacked Autoencoder (SAE).

    -- fnm: pathname to the input, supposedly the saved progress after the
    pre-training. If {fnm} points to a directory, a file is randomly chosen
    from it.

    ** ae1: depth of the sub SA.
    """
    new_lrt = kwd.pop('lrt', None)  # new learning rate
    new_hte = kwd.pop('hte', None)  # new halting error

    # randomly pick a data file if {fnm} is a directory and no record
    # exists in the saved progress:
    if pt.isdir(fnm):
        fnm = pt.join(fnm, np.random.choice(os.listdir(fnm)))
    kwd.update(fnm=fnm)

    # load data from {fnm}, but parameters in {kwd} take precedence.
    kwd.update((k, v) for k, v in lpz(fnm).items() if k not in kwd)

    # check saved progress and overwrite options:
    sav = kwd.get('sav', '.')
    if pt.isdir(sav):
        sav = pt.join(sav, pt.basename(fnm).split('.')[0])
    if pt.exists(sav + '.pgz'):
        print(sav, ": exists,")
        ovr = kwd.pop('ovr', 0)  # overwrite?
        if ovr == 0 or ovr > 2:  # do not overwrite the progress
            print(" skipped.")
            return kwd
    else:
        ovr = 2

    # resume progress, use network stored in {sav}.
    if ovr == 1:
        kwd.pop('cvw', None)  # use saved networks for CV
        kwd.pop('cvl', None)  # use saved CV LRT
        kwd.pop('cvh', None)  # use saved CV halting state
        kwd.pop('cve', None)  # use saved CV halting error
        kwd.pop('lrt', None)  # use saved learning rate for training
        kwd.pop('nwk', None)  # use saved network for training

        # remaining options in {kwd} take precedence over {sav}.
        sdt = lpz(sav)
        sdt.update(kwd)
        kwd = sdt
        print("continue training.")
    else:  # restart the training
        kwd.pop('lrt', None)  # do not use archived NT LRT
        kwd.pop('cvl', None)  # do not use archived CV LRT
        kwd.pop('cve', None)  # do not use archived CV errors
        kwd.pop('cvh', None)  # do not use archived CV halting state
        print("restart training.")

    # <-- __x, w, npt, ptn, ... do it.
    gmx = kwd['gmx']
    nsb = gmx.shape[0]                      # sample size
    xmx = gmx.reshape(nsb, -1).astype('f')  # training data
    ngv = xmx.shape[-1]                     # feature size
    mdp = kwd.pop('wdp', 16)                # maximum network depth

    # learning rates
    lrt = new_lrt if new_lrt else kwd.pop('lrt', 1e-4)
    dim = [ngv // 2**_ for _ in range(mdp) if 2**_ <= ngv]

    # cross-validation networks
    cvk = kwd.get('cvk', 2)                     # K
    cvm = kwd.get('cvm', cv_msk(xmx, cvk))      # mask
    cvh = kwd.pop('cvh', [None] * cvk)          # halting
    cvl = kwd.pop('cvl', [lrt] * cvk)           # learning rates
    cvw = kwd.pop('cvw', [None] * cvk)          # slots for CV networks
    cve = kwd.pop('cve', np.ndarray((cvk, 2)))  # error

    # tune the network: (1) CV
    for i, m in enumerate(cvm):
        msg = 'CV: {:02d}/{:02d}'.format(i + 1, cvk)
        if cvh[i]:
            msg = msg + ' halted.'
            print(msg)
            continue
        print(msg)

        if cvw[i] is None:
            cvw[i] = SAE.from_dim(dim, s='relu', **kwd)
            cvw[i][-1].s = 'sigmoid'
            # suggest no layer-wise treatment (relu)
            gdy = kwd.get('gdy', False)
        else:
            # suggest no layer-wise treatment
            gdy = kwd.get('gdy', False)

        kwd = ftn_sae(cvw[i], xmx[-m], xmx[+m], gdy=gdy, lrt=cvl[i], **kwd)

        # collect the output
        ftn = kwd.pop('ftn')
        cvl[i] = ftn.lrt.get_value()  # CV learning rate
        cve[i, 0] = ftn.terr()        # CV training error
        cve[i, 1] = ftn.verr()        # CV validation error
        cvh[i] = ftn.hlt              # CV halting?

    # update
    kwd.update(cvk=cvk, cvm=cvm, cvh=cvh, cvl=cvl, cve=cve, cvw=cvw)

    # (2) normal training
    # force continuation of training till a new halting error?
    if new_hte:
        [kwd.pop(_, None) for _ in ['hte', 'hof', 'eot', 'eov']]
        hte = new_hte
    else:
        # mean CV training error as halting error
        hte = kwd.pop('hte', cve[:, 0].mean())

    # NT only happens when all CV is halted.
    if all(cvh) and 'hof' not in kwd:
        # create normal network if necessary
        nwk = kwd.pop('nwk', None)
        if nwk is None:
            nwk = SAE.from_dim(dim, s='relu', **kwd)
            nwk[-1].s = 'sigmoid'
            # suggest no layer-wise treatment (relu)
            gdy = kwd.get('gdy', False)
        else:
            # suggest no layer-wise treatment
            gdy = kwd.get('gdy', False)

        print('NT: HTE = {}'.format(hte))
        kwd = ftn_sae(nwk, xmx, xmx, gdy=gdy, lrt=lrt, hte=hte, **kwd)
        ftn = kwd.pop('ftn')
        lrt = ftn.lrt.get_value()  # learning rate

        # update
        kwd.update(nwk=nwk, lrt=lrt, hte=hte)

        # when NT halts, save the high order features
        if ftn.hlt:
            kwd['hof'] = nwk.ec(xmx).eval()
            kwd['eot'] = ftn.terr()
            kwd['eov'] = ftn.verr()
            print('NT: halted.')
    elif all(cvh) and 'hof' in kwd:
        print('NT: halted.')
    else:
        print('NT: Not Ready.')  # not ready for NT

    # save
    if sav:
        print("write to: ", sav)
        spz(sav, kwd)

    kwd = dict((k, v) for k, v in kwd.items() if v is not None)
    return kwd
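# The CV loop above indexes the data as xmx[-m] (training rows) and xmx[+m]
# (validation rows), which implies each mask m from cv_msk is a boolean row
# selector; unary minus acted as logical NOT on boolean arrays in older
# NumPy (modern NumPy requires ~m). cv_msk is defined elsewhere in this
# repo; this is only a sketch of the implied behavior.
def _cv_msk_sketch(x, k):
    import numpy as np
    n = x.shape[0]
    fold = np.random.permutation(n) % k   # random fold assignment
    return [fold == i for i in range(k)]  # m[i] marks fold i's rows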
def main(fnm='../../sim/W09/10_PTN', **kwd):
    """ The fine-tune procedure for a Stacked Autoencoder (SAE).

    -- fnm: pathname to the input, supposedly the saved progress after the
    pre-training. If {fnm} points to a directory, a file is randomly chosen
    from it.

    ** ae1: depth of the sub SA.
    """
    # randomly pick pre-trained progress if {fnm} is a directory and no
    # record exists in the saved progress:
    if pt.isdir(fnm):
        fnm = pt.join(fnm, np.random.choice(os.listdir(fnm)))
    kwd.update(fnm=fnm)

    # load data from {fnm}, but let parameters in {kwd} take precedence
    # over those in {fnm}
    _ = list(kwd.keys())
    kwd.update((k, v) for k, v in lpz(fnm).items() if k not in _)

    # check saved progress and overwrite options:
    sav = kwd.get('sav', pt.basename(fnm).split('.')[0])
    if pt.exists(sav + '.pgz'):
        print(sav, ": exists,")
        ovr = kwd.pop('ovr', 0)  # overwrite?
        if ovr == 0 or ovr > 2:  # do not overwrite the progress
            print(" skipped.")
            return kwd
    else:
        ovr = 2

    # resume progress, use network stored in {sav}.
    if ovr == 1:
        kwd.pop('cvw', None)  # use saved networks for CV
        kwd.pop('nwk', None)  # use saved network for training
        kwd.pop('cvl', None)  # use saved CV LRT
        kwd.pop('cvh', None)  # use saved CV halting state
        kwd.pop('cve', None)  # use saved CV halting error
        kwd.pop('lrt', None)  # use saved learning rate for training

        # remaining options in {kwd} take precedence over {sav}.
        sdt = lpz(sav)
        sdt.update(kwd)
        kwd = sdt
        print("continue training.")
    else:  # restart the training
        kwd.pop('lrt', None)  # do not use archived NT LRT
        kwd.pop('cvl', None)  # do not use archived CV LRT
        kwd.pop('cve', None)  # do not use archived CV errors
        kwd.pop('cvh', None)  # do not use archived CV halting state
        print("restart training.")

    # <-- __x, w, npt, ptn, ... do it.
    xmx = kwd['xmx']           # training data
    nwk = kwd['nwk']           # the whole network, not a subset
    dph = len(nwk.sa)          # depth of the network
    ae1 = kwd.get('ae1', dph)  # encoding depth -- autoencoder one

    # cross-validation
    cvk = kwd['cvk']  # CV folds
    cvm = kwd['cvm']  # CV partition mask
    cvw = kwd['cvw']  # CV networks

    # CV halting
    cvh = kwd.get('cvh', [False] * cvk)

    # learning rates for normal training and CV
    lrt = kwd.pop('lrt', .01)
    cvl = kwd.pop('cvl', [lrt] * cvk)

    # NT halting error and CV errors
    hte = kwd.pop('hte', .005)
    cve = kwd.pop('cve', np.ndarray((cvk, 2)))

    # create error tables if necessary:
    if kwd.get('etb') is None:
        kwd['etb'] = np.zeros([dph + 1, 2]) - 1.0
        kwd['etb'][0] = 0  # raw data has zero error
    etb = kwd['etb']

    # create high order feature (HOF) table:
    if kwd.get('hof') is None:
        kwd['hof'] = [None] * (dph + 1)
    hof = kwd['hof']

    # raw data as trivial HOF
    _ = kwd['gmx'][:, :, kwd['ugv'] < 1]
    _ = _.reshape(_.shape[0], -1)
    hof[0] = _

    # fine-tuning
    # 1) for CV:
    for i, m in enumerate(cvm):
        msg = 'CV: {:02d}/{:02d}'.format(i + 1, cvk)
        if cvh[i]:
            msg = msg + ' halted.'
            print(msg)
            continue
        print(msg)

        kwd = ftn_sae(cvw[i], xmx[-m], xmx[+m], lrt=cvl[i], **kwd)

        # collect the output
        ftn = kwd.pop('ftn')
        cvl[i] = ftn.lrt.get_value()  # learning rate
        cve[i, 0] = ftn.terr()        # CV training error
        cve[i, 1] = ftn.verr()        # CV validation error
        cvh[i] = ftn.hlt.get_value()  # CV halting

    # mean CV validation error
    etb[ae1, 1] = cve[:, 1].mean()

    # mean CV training error as NT halting error
    hte = cve[:, 0].mean()

    # 2) for normal training:
    # happens when all CV is halted or converged.
    if np.all(cvh):
        print('NT: HTE = {}'.format(hte))
        kwd = ftn_sae(nwk, xmx, xmx, lrt=lrt, hte=hte, **kwd)
        ftn = kwd.pop('ftn')
        lrt = ftn.lrt.get_value()  # learning rate
        etb[ae1, 0] = ftn.terr()   # normal error
        if ftn.hlt.get_value():
            print('NT: halted.')
            # high order features for all individuals, with typed variants
            _ = kwd['gmx'][:, :, kwd['ugv'] < 1]
            _ = _.reshape(_.shape[0], -1)
            hof[ae1] = ftn.nnt.ec(_).eval()
    else:
        print('NT: HTE = ??')  # not ready for NT

    # 3) update progress and save.
    kwd.update(cvl=cvl, cve=cve, cvh=cvh, lrt=lrt, etb=etb, hof=hof)
    if sav:
        print("write to: ", sav)
        spz(sav, kwd)

    kwd = dict((k, v) for k, v in kwd.items() if v is not None)
    return kwd
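# The error table etb above has one row per encoding depth (row 0 is the raw
# data), column 0 the normal-training error and column 1 the mean CV
# validation error; untuned depths keep the -1.0 placeholder. A sketch for
# picking the best-validated depth from a returned progress dictionary:
def _demo_best_depth():
    import numpy as np
    ret = main('../../sim/W09/10_PTN')  # progress dictionary
    etb = ret['etb']
    tuned = etb[:, 1] >= 0              # depths tuned so far
    best = np.flatnonzero(tuned)[np.argmin(etb[tuned, 1])]
    print('best encoding depth:', best)  # index into ret['hof']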
def ept(fnm, out=None):
    """ Export training result in text format.
    -- fnm: filename of training progress.
    -- out: where to save the export.
    """
    pwd = os.getcwd()
    fnm = pt.abspath(fnm)

    import tempfile
    tpd = tempfile.mkdtemp()

    if out is None:
        out = pwd
    if pt.isdir(out):
        out = pt.join(out, pt.basename(fnm).split('.')[0])
    if not (out.endswith('.tgz') or out.endswith('.tar.gz')):
        out = out + '.tgz'
    out = pt.abspath(out)

    # read the training progress
    dat = lpz(fnm)
    [dat.pop(_) for _ in ['gmx', 'cvw', 'sav', 'nep', 'nft', 'ovr']]
    dat['fnm'] = fnm
    dat['out'] = out

    # genomic map
    np.savetxt(pt.join(tpd, 'gmp.txt'), dat.pop('gmp'), '%d\t%d\t%s')

    # genomic data in dosage format
    np.savetxt(pt.join(tpd, 'dsg.txt'), dat.pop('dsg'), '%d')

    # subjects
    np.savetxt(pt.join(tpd, 'sbj.txt'), dat.pop('sbj'), '%s')

    # untyped subjects (indices)
    np.savetxt(pt.join(tpd, 'usb.txt'), dat.pop('usb'), '%d')

    # untyped variants (indices)
    np.savetxt(pt.join(tpd, 'ugv.txt'), dat.pop('ugv'), '%d')

    # final high-order features
    # xmx, nwk = dat.pop('xmx'), dat.pop('nwk')
    # np.savetxt(pt.join(tpd, 'hff.txt'), nwk.ec(xmx).eval(), '%.8f')

    # sub high-order features
    hof = dat.pop('hof')
    for i in range(len(hof)):
        if hof[i] is None:
            continue
        np.savetxt(pt.join(tpd, 'hf{}.txt'.format(i)), hof[i], '%.8f')

    # error table
    np.savetxt(pt.join(tpd, 'etb.txt'), dat.pop('etb'), '%.8f')

    # CV masks
    np.savetxt(pt.join(tpd, 'cvm.txt'), dat.pop('cvm'), '%d')

    # meta information
    inf = open(pt.join(tpd, 'inf.txt'), 'w')
    for k, v in dat.items():
        inf.write('{}={}\n'.format(k, v))
    inf.close()

    # pack the output, delete individual files
    import tarfile
    import shutil

    os.chdir(tpd)  # go to the packing dir
    try:
        tar = tarfile.open(out, 'w:gz')
        [tar.add(_) for _ in os.listdir('.')]
        shutil.rmtree(tpd, True)
        tar.close()
    except Exception as e:
        print(e)
    os.chdir(pwd)  # back to the working dir
def main(fnm='../../sim/W08/10_PTN', **kwd):
    """ The fine-tune procedure for a Stacked Autoencoder (SAE).

    -- fnm: pathname to the input, supposedly the saved progress after the
    pre-training. If {fnm} points to a directory, a file is randomly chosen
    from it.

    ** ae1: depth of the sub SA.
    """
    # randomly pick pre-trained progress if {fnm} is a directory and no
    # record exists in the saved progress:
    if pt.isdir(fnm):
        fnm = pt.join(fnm, np.random.choice(os.listdir(fnm)))
    kwd.update(fnm=fnm)

    # load data from {fnm}, but let parameters in {kwd} take precedence
    # over those in {fnm}
    _ = list(kwd.keys())
    kwd.update((k, v) for k, v in lpz(fnm).items() if k not in _)

    # check saved progress and overwrite options:
    sav = kwd.get('sav', pt.basename(fnm).split('.')[0])
    if pt.exists(sav + '.pgz'):
        print(sav, ": exists,")
        ovr = kwd.pop('ovr', 0)  # overwrite?
        if ovr == 0 or ovr > 2:  # do not overwrite the progress
            print(" skipped.")
            return kwd
    else:
        ovr = 2  # no saved progress, restart anyway

    # resumed fine-tuning, use network stored in {sav} if possible;
    # continue with the progress
    if ovr == 1:
        kwd.pop('cvw', None)  # use saved ANNs for CV
        kwd.pop('nwk', None)  # use saved ANNs for training
        kwd.pop('cvl', None)  # use saved learning rate for CV
        kwd.pop('lrt', None)  # use saved learning rate for training

        # remaining options in {kwd} take precedence over {sav}.
        sdt = lpz(sav)
        sdt.update(kwd)
        kwd = sdt
        print("continue training.")

    if ovr == 2:  # restart the training
        kwd.pop('lrt', None)  # do not use archived LRT for training
        kwd.pop('cvl', None)  # do not use archived LRT for CV
        print("restart training.")

    # <-- __x, w, npt, ptn, ... do it.
    xmx = kwd['xmx']           # training data
    nwk = kwd['nwk']           # the whole network, not a subset
    dph = len(nwk.sa)          # depth of the network
    ae1 = kwd.get('ae1', dph)  # encoding depth -- autoencoder one

    # cross-validation
    cvk = kwd['cvk']  # CV folds
    cvm = kwd['cvm']  # CV partition mask
    cvw = kwd['cvw']  # CV networks

    # learning rates for normal training and CV
    lrt = kwd.pop('lrt', .01)
    cvl = kwd.pop('cvl', [lrt] * cvk)

    # create error tables if necessary:
    if kwd.get('etb') is None:
        kwd['etb'] = np.zeros([dph + 1, 2]) - 1.0
        kwd['etb'][0] = 0  # raw data has zero error
    etb = kwd['etb']

    # create high order feature (HOF) table:
    if kwd.get('hof') is None:
        kwd['hof'] = [None] * (dph + 1)
        kwd['hof'][0] = xmx  # raw data as trivial HOF
    hof = kwd['hof']

    # fine-tuning
    # 1) for CV:
    cve = np.ndarray(cvk)  # CV errors
    for i, m in enumerate(cvm):
        print('CV: {:02d}/{:02d}'.format(i + 1, cvk))
        kwd = ftn_sae(cvw[i], xmx[-m], xmx[+m], lrt=cvl[i], **kwd)

        # collect the output
        ftn = kwd.pop('ftn')
        cvl[i] = ftn.lrt.get_value()  # learning rate
        cve[i] = ftn.verr()           # CV errors
    etb[ae1, 1] = cve.mean()          # mean CV error

    # 2) for normal training:
    print('NT:')
    kwd = ftn_sae(nwk, xmx, xmx, lrt=lrt, **kwd)
    ftn = kwd.pop('ftn')
    lrt = ftn.lrt.get_value()  # learning rate
    etb[ae1, 0] = ftn.terr()   # normal error

    # high order features
    hof[ae1] = ftn.nnt.ec(xmx).eval()

    # 3) update progress
    kwd.update(cvl=cvl, lrt=lrt, etb=etb, hof=hof)

    # save the progress
    if sav:
        print("write to: ", sav)
        spz(sav, kwd)

    kwd = dict((k, v) for k, v in kwd.items() if v is not None)
    return kwd
def ept(fnm, out=None):
    """ Export training result in text format.
    -- fnm: filename of training progress.
    -- out: where to save the export.
    """
    pwd = os.getcwd()
    fnm = pt.abspath(fnm)

    import tempfile
    tpd = tempfile.mkdtemp()

    if out is None:
        out = pwd
    if pt.isdir(out):
        out = pt.join(out, pt.basename(fnm).split('.')[0])
    if not (out.endswith('.tgz') or out.endswith('.tar.gz')):
        out = out + '.tgz'
    out = pt.abspath(out)

    # read the training progress
    dat = lpz(fnm)
    [dat.pop(_) for _ in ['gmx', 'cvw', 'sav', 'nep', 'nft', 'ovr']]
    dat['fnm'] = fnm
    dat['out'] = out

    # genomic map
    np.savetxt(pt.join(tpd, 'gmp.txt'), dat.pop('gmp'), '%d\t%d\t%s')

    # genomic data in dosage format
    np.savetxt(pt.join(tpd, 'dsg.txt'), dat.pop('dsg'), '%d')

    # subjects
    np.savetxt(pt.join(tpd, 'sbj.txt'), dat.pop('sbj'), '%s')

    # final high-order features
    xmx, nwk = dat.pop('xmx'), dat.pop('nwk')
    np.savetxt(pt.join(tpd, 'hff.txt'), nwk.ec(xmx).eval(), '%.8f')

    # sub high-order features
    hof = dat.pop('hof')
    for i in range(len(hof)):
        if hof[i] is None:
            continue
        np.savetxt(pt.join(tpd, 'hf{}.txt'.format(i)), hof[i], '%.8f')

    # error table
    np.savetxt(pt.join(tpd, 'etb.txt'), dat.pop('etb'), '%.8f')

    # CV masks
    np.savetxt(pt.join(tpd, 'cvm.txt'), dat.pop('cvm'), '%d')

    # meta information
    inf = open(pt.join(tpd, 'inf.txt'), 'w')
    for k, v in dat.items():
        inf.write('{}={}\n'.format(k, v))
    inf.close()

    # pack the output, delete individual files
    import tarfile
    import shutil

    os.chdir(tpd)  # go to the packing dir
    try:
        tar = tarfile.open(out, 'w:gz')
        [tar.add(_) for _ in os.listdir('.')]
        shutil.rmtree(tpd, True)
        tar.close()
    except Exception as e:
        print(e)
    os.chdir(pwd)  # back to the working dir
def main(fnm, **kwd):
    """ The fine-tune procedure for a Stacked Autoencoder (SAE).

    -- fnm: pathname to the input, supposedly the saved progress after the
    pre-training. If {fnm} points to a directory, a file is randomly chosen
    from it.
    """
    # randomly pick pre-trained progress if {fnm} is a directory and no
    # record exists in the saved progress:
    if pt.isdir(fnm):
        fnm = pt.join(fnm, np.random.choice(os.listdir(fnm)))
    kwd.update(fnm=fnm)

    fdr, fbn = pt.split(fnm)
    fpx = fbn.split('.', 2)[0]

    # read {fnm}, but parameters in {kwd} take precedence over those in {fnm}
    keys = list(kwd.keys()) + ['lrt']
    kwd.update((k, v) for k, v in lpz(fnm).items() if k not in keys)

    # check saved progress and overwrite options:
    sav = kwd.get('sav', '.')
    if sav is None:
        sav = pt.join(fdr, fpx)
    if pt.isdir(sav):
        sav = pt.join(sav, fpx)
    if pt.exists(sav + '.pgz') or pt.exists(sav):
        print(sav, ": exists,")
        ovr = kwd.pop('ovr', 0)  # overwrite?
        if ovr == 0 or ovr > 2:  # do not overwrite the progress
            print(" skipped.")
            return kwd
    else:
        ovr = 2

    # resume progress, use network stored in {sav}.
    if ovr == 1:
        # remaining options in {kwd} take precedence over {sav}, but always
        # use the saved network, even if one is available in {fnm}.
        sdt = lpz(sav)  # load

        # should we restart the training due to failure?
        eot = sdt.get('eot', 0.0)
        rte = kwd.pop('rte', 1e9)
        if eot > rte:
            ovr = 2
            print('NT: RTE = {}'.format(rte))
            print('NT: EOT = {}'.format(sdt.get('eot')))
            print("NT: eot > rte, re-try.")
        else:
            kwd.pop('nwk', None)  # use saved network
            kwd.pop('lrt', None)  # use saved learning rate
            kwd.pop('eot', None)  # use saved error
            sdt.update(kwd)
            kwd = sdt
            print("continue.")
    else:  # restart the training
        print("restart.")

    # training data
    if 'gx0' in kwd and 'gx1' in kwd:
        gmx = np.concatenate(
            [kwd['gx0'][:, np.newaxis], kwd['gx1'][:, np.newaxis]], 1)
    else:
        gmx = kwd['gmx']

    # take out part of the samples?
    nsb = kwd.get('nsb', None)
    if nsb is not None and nsb < gmx.shape[0]:
        idx = np.sort(np.random.permutation(gmx.shape[0])[:nsb])
        gmx = gmx[idx, :]
        kwd['sbj'] = kwd['sbj'][idx]
    else:
        nsb = gmx.shape[0]

    # genomic copy 1 & 2
    kwd['gx0'] = gmx[:, 0, :]
    kwd['gx1'] = gmx[:, 1, :]

    # training format
    xmx = gmx.reshape(nsb, -1).astype('f')

    # the dimensions
    ngv = xmx.shape[-1]
    dim = [ngv] + [ngv // 2**d for d in range(1, 16) if 2**d <= ngv]

    # learning rates
    lrt = kwd.pop('lrt', .001)
    nep = kwd.pop('nep', 20)

    # halt already?
    hte = kwd.pop('hte', .001)
    eot = kwd.pop('eot', 1e12)
    ste = kwd.pop('ste', 1e10)
    print('NT: HTE = {}'.format(hte))
    print('NT: EOT = {}'.format(eot))
    if eot < hte:  # halted?
        print('NT: eot < hte')
        print('NT: Halt.\nNT: Done.')
        return kwd
    if eot > ste:  # too much to even bother?
        print('NT: eot > ste')
        print('NT: Skip.\nNT: Done.')
        return kwd

    # train the network, create it if necessary
    nwk = kwd.pop('nwk', None)
    if nwk is None:
        nwk = SAE(dim, s='sigmoid', **kwd)
        print('create NT: ', nwk)

    # limit the working network
    wdp = kwd.pop('wdp', None)

    # wd0 indexes the lowest of the new layers revealed on top of an
    # existing optimal working network
    wd0 = kwd.pop('wd0', 0)

    # pre-train the new layers when they are revealed for the first time.
    if wd0 > 0 and ovr == 2:
        # output from the previously trained network at the bottom.
        xm1 = nwk.sub(None, wd0).ec(xmx).eval()

        # the new stacks.
        nw1 = nwk.sub(wd0, wdp)
        pep = kwd.pop('pep', nep)
        ptr = SAE.Train(nw1, xm1, xm1, lrt=lrt, nep=pep)
        eot = ptr.terr()
        nep = max(20, nep - pep)

    # fine-tuning
    wnk = nwk.sub(None, wdp)
    ftn = SAE.Train(wnk, xmx, xmx, lrt=lrt, hte=hte, nep=nep, **kwd)
    lrt = ftn.lrt.get_value()
    eot = ftn.terr()
    eov = ftn.verr()
    eph = kwd.pop('eph', 0) + ftn.ep.get_value()
    hof = ftn.nwk.ec(xmx).eval()
    rsd = xmx - ftn.nwk(xmx).eval()  # the residual
    if ftn.hlt:
        print('NT: Halt.', ftn.hlt)

    # 3) update progress and save.
    kwd.update(nwk=nwk, wdp=wdp, lrt=lrt, hof=hof, rsd=rsd,
               eot=eot, eov=eov, eph=eph)
    kwd.pop('gh0', None)
    kwd.pop('gh1', None)
    kwd.pop('gmx', None)
    if sav:
        print("write to: ", sav)
        spz(sav, kwd)

    print('NT: Done.')
    kwd = dict((k, v) for k, v in kwd.items() if v is not None)
    return kwd
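# A usage sketch of the progressive-depth workflow this variant supports:
# wdp caps the working depth, wd0 marks the first newly revealed layer
# (pre-trained once before joint fine-tuning), and rte forces a restart when
# the archived training error is too high. Names and values are hypothetical.
def _demo_progressive_depth():
    ret = main('pretrain.pgz', sav='ftn', ovr=1, wd0=2, wdp=4, nep=30, rte=0.5)
    print(ret['eot'], ret['eov'])  # training / validation error
    print(ret['eph'])              # cumulative epochs across resumed runs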