Ejemplo n.º 1
0
def collect(fdr='.', nrt=None, out=None, csv=None):
    """ collect simulation reports in a folder.

    -- fdr: folder to scan; only file names ending with 'pgz' are loaded,
    in sorted name order.
    -- nrt: load at most this many reports; None means no limit.
    -- out: if given, save the collected result there via spz().
    -- csv: if given, write the summary report there as CSV.

    Returns a Bunch holding:
    bmk: the benchmark records of all reports, each row prefixed with the
    configuration of the report it came from;
    hst: the training history averaged by epoch ('ep'), plus the number
    of records behind each average ('itr');
    rpt: mean ('mu'), standard deviation ('sd') and count ('itr') of 'val'
    for every combination of configuration, 'mtd', 'par' and 'key'.

    Raises ValueError when no report was loaded.
    """
    fns = sorted(f for f in ls(fdr) if f.endswith('pgz'))

    # entries of a saved report that describe the simulation configuration
    keys = ['fam', 'xtp', 'frq', 'mdl', 'rsq', 'gdy', 'gtp']

    # configuration, training history, and benchmarks
    cfg, hst, bmk = [], [], []
    for i, f in enumerate(fns):
        if nrt is not None and not i < nrt:
            break
        f = pt.join(fdr, f)
        print(f)
        pgz = lpz(f)

        # 1) collect training history
        hst.append(pgz.pop('hst'))

        # 2) collect simulation configuration, plus the data dimensions
        # (first and third axis of 'gmx') and the network shape ('dim')
        cf = dict((k, v) for k, v in pgz.items() if k in keys)
        cf['nxp'] = '{}x{}'.format(pgz['gmx'].shape[0], pgz['gmx'].shape[2])
        cf['nwk'] = pgz['dim']
        cfg.append(pd.Series(cf))

        # 3) collect reference benchmarks, also append the performance of NNT
        bmk.append(pgz.pop('bmk').reset_index())

    if not cfg:
        raise ValueError('no report to collect in: {}'.format(fdr))

    # concatenation: repeat each configuration once per benchmark row, then
    # put configuration and benchmark columns side by side.  {axis} must be
    # passed by keyword, pandas no longer accepts it positionally.
    bmk = pd.concat([
        pd.concat([pd.DataFrame([c] * b.shape[0]), b], axis=1)
        for c, b in zip(cfg, bmk)])
    # non-NNT methods do not rely on these parameters
    bmk.loc[bmk.mtd != 'nnt', ['gtp', 'nwk', 'xtp']] = '-'

    # configuration keys (taken from the last report) and report keys
    cfk = cfg[-1].index.tolist() + ['mtd', 'par', 'key']
    val = bmk.groupby(cfk).val
    # means, stds, and iteration count of 'val'
    rpt = pd.concat([
        val.mean().rename('mu'),
        val.std().rename('sd'),
        val.count().rename('itr')], axis=1).reset_index()
    rpt = rpt.loc[:, cfk + ['mu', 'sd', 'itr']]

    # do the same for training history
    hst = pd.concat(hst)
    _gp = hst.groupby('ep')
    _it = _gp.terr.count().rename('itr')
    hst = pd.concat([_gp.mean(numeric_only=True), _it], axis=1).reset_index()

    # save and return
    ret = Bunch(bmk=bmk, hst=hst, rpt=rpt)
    if out:
        spz(out, ret)
    if csv:
        rpt.to_csv(csv)
    return ret
Ejemplo n.º 2
0
def collect(fdr):
    """ gather the encoder output saved in a folder.

    Every file in {fdr} whose name ends with 'pgz' is loaded, in sorted
    name order.  From each one the entries 'eot' and 'hof' are kept, and,
    unless 'pcs' holds an Exception, the first 16 columns of 'pcs'.

    Returns a dictionary of float32 arrays: 'eot' stacked by file, and
    'hof' / 'pcs' stacked by file with the file axis moved to the end.
    """
    errors, features, components = [], [], []
    for name in sorted(ls(fdr)):
        if name.endswith('pgz'):
            path = pt.join(fdr, name)
            print(path)
            saved = lpz(path)
            errors.append(saved['eot'])
            features.append(saved['hof'])
            # a failed PCA is stored as the exception it raised; skip it
            pcs = saved['pcs']
            if not isinstance(pcs, Exception):
                components.append(pcs[:, 0:16])

    import numpy as np
    eot = np.array(errors, dtype='f')
    # stack by file first, then move the file axis to the last position
    hof = np.array(features, dtype='f').transpose(1, 2, 0)
    pcs = np.array(components, dtype='f').transpose(1, 2, 0)
    return dict(eot=eot, hof=hof, pcs=pcs)
Ejemplo n.º 3
0
def plt1(rpt, key='REL', log=True):
    """ draw box plots of one benchmark measure, one box per method.

    -- rpt: the collected report (an object with a {bmk} table), or the
    name of a 'pgz' file to load it from.
    -- key: only benchmark records with this value of 'key' are plotted.
    -- log: use a logarithmic y axis?

    Returns the report and the pyplot module holding the figure.
    """
    sim_cols = ['fam', 'frq', 'mdl', 'nxp']  # shown in the title
    nnt_cols = ['gtp', 'xtp', 'nwk']
    mtd_cols = ['mtd', 'par']

    # load the report from file if necessary.
    if isinstance(rpt, str) and rpt.endswith('pgz'):
        rpt = lpz(rpt)
    bmk = rpt.bmk

    # the title is built from the first benchmark record
    head = bmk.iloc[0][sim_cols]
    ttl = ', '.join('{}={}'.format(k, v) for k, v in head.items())

    # records of the requested measure, without the 'nul' method
    grp = nnt_cols + mtd_cols
    err = bmk[bmk.key == key].loc[:, grp + ['val']]
    err = err[err.mtd != 'nul']

    # one column of values and one label per method group
    values, labels = [], []
    for gkey, rows in err.groupby(grp):
        if 'nnt' in gkey:
            label = "{nwk:>10}.{mtd}".format(**rows.iloc[0])
        else:
            label = "{par:>10}.{mtd}".format(**rows.iloc[0])
        values.append(np.array(rows.val))
        labels.append(label)
    stats = cbook.boxplot_stats(np.array(values).T, labels=labels)

    # plot
    plt.close('all')
    plt.title(ttl)
    ax = plt.axes()
    if log:
        ax.set_yscale('log')
    ax.bxp(stats)

    # dashed reference line at y=1 across the whole x range
    lower, upper = ax.get_xbound()
    ax.plot(np.linspace(lower, upper, 10), np.ones(10),
            linestyle='--', color='red', linewidth=.5)

    # vertical tick labels
    for tick in ax.get_xticklabels():
        tick.set_rotation(90)
    return rpt, plt
Ejemplo n.º 4
0
def main(vcf, nep=20, out=None, rdp=None, **kwd):
    """ Performance test for sigmoid, relu autoencoders.
    -- vcf: the genomic file; when it does not exist, the same stem with
    the extension '.vcf' or '.vcf.gz' is tried.
    -- nep: number of epoches to go through.
    -- out: output location (not used by this function).
    -- rdp: reduce network depth by this much (not used by this function).

    ** sav: location to save training progress, defaults to '.'; a
    directory is completed with the stem of {vcf}, and the extension
    '.pgz' is appended when missing.
    ** ovr: what to do when {sav} exists: 0 = skip (default), 1 = resume
    from the saved progress, anything else = restart.
    ** mdp: maximum network depth.
    ** lrt: learning rate, defaults to 1e-4.
    ** gdy: greedy pre-training option handed to the trainer, defaults to 0.
    ** pcs: principal components of the data; computed when absent.
    ** nwk: the network to train; created when absent.

    Returns the dictionary of options and results, which is also what is
    saved to {sav} after training.
    """

    # handle filenames
    stm = pt.join(pt.dirname(vcf), pt.basename(vcf).split('.')[0])
    if not pt.exists(vcf):  # name correction
        if pt.exists(stm + '.vcf'):
            vcf = stm + '.vcf'
        elif pt.exists(stm + '.vcf.gz'):
            vcf = stm + '.vcf.gz'
        else:
            raise Exception('non-existing: ', vcf)

    sav = kwd.get('sav', '.')
    if pt.isdir(sav):
        sav = pt.join(sav, pt.basename(stm))
    if not sav.endswith('.pgz'):
        sav = sav + '.pgz'

    # progress recover:
    if pt.exists(sav):
        print(sav, ": exists,")

        # do not continue to training?
        ovr = kwd.pop('ovr', 0)
        if ovr == 0:
            print(" skipped.")
            return kwd
    else:
        ovr = 2

    # resume progress, use network stored in {sav}.
    if ovr == 1:
        # options in {kwd} take precedence over {sav}.
        sdt = lpz(sav)
        sdt.update(kwd)
        kwd = sdt
        print("continue training.")
    else:  # restart the training
        print("restart training.")

    # prepare data, only now that we know the run is not skipped: flatten
    # everything but the first axis, one row per subject.
    gmx, sbj = loadVCF(vcf)
    gmx = gmx.reshape(gmx.shape[0], -1).astype('f')

    # perform PCA on genome data if necessary; a failed decomposition is
    # recorded by storing the exception in place of the components.
    pcs = kwd.pop('pcs', None)
    if pcs is None:
        try:
            pca = PCA(n_components=gmx.shape[0])
            pcs = pca.fit_transform(gmx)
        except numpy.linalg.LinAlgError as e:
            pcs = e
    kwd.update(pcs=pcs)

    hlt = kwd.get('hlt', 0)  # halted?
    if hlt > 0:
        print('NT: Halt.\nNT: Done.')
        return kwd
    mdp = kwd.pop('mdp', None)  # maximum network depth
    lrt = kwd.pop('lrt', 1e-4)  # learing rates
    gdy = kwd.pop('gdy', 0)  # some greedy pre-training?

    # train the network, create it if necessary
    nwk = kwd.pop('nwk', None)
    if nwk is None:
        # train autoencoders, each layer roughfly halves the dimensionality:
        # the input size, then 1024, 512, ..., 1, cut at {mdp} entries.
        dim = [gmx.shape[1]
               ] + [1024 // 2**_ for _ in range(16) if 2**_ <= 1024]
        dim = dim[:mdp]

        nwk = SAE.from_dim(dim, s='sigmoid', **kwd)
        print('create NT: ', nwk)

    print('NT: begin')
    tnr = SAE.Train(nwk, gmx, gmx, lrt=lrt, gdy=gdy, nep=nep, **kwd)
    lrt = tnr.lrt.get_value()  # updated learning rate
    hof = nwk.ec(gmx).eval()  # high order features
    eot = tnr.terr().item()  # error of training
    hlt = tnr.hlt  # halting status
    if hlt > 0:
        print('NT: Halt.')
    print('NT: Done.')

    # update, save the progress, then return
    kwd.update(nwk=nwk, lrt=lrt, hof=hof, eot=eot, hlt=hlt, sbj=sbj)
    spz(sav, kwd)

    return kwd
Ejemplo n.º 5
0
def plot_hist(sim, out=None, gui=0):
    """ plot the training histories of a set of simulations.

    -- sim: the outputs organized in a list of dictionaries, or the name of
    a file to load that list from via lpz().
    -- out: output location, defaults to '.'; a directory is completed
    with the plot title. Only needed by the saving code, which is
    currently commented out.
    -- gui: when false, the non-interactive 'Agg' backend is selected.

    Returns the pyplot module holding the figure, and the benchmark table
    of all simulations extended with one 'nnt' record per course type.
    """
    if isinstance(sim, str):
        sim = lpz(sim)

    # performance measures
    # the backend has to be chosen before pyplot is imported.
    import matplotlib as mpl
    if not gui:
        mpl.use('Agg')
    import matplotlib.pyplot as gc  # graphics context

    # settings below are read from the first simulation only.
    # NOTE(review): presumably all simulations share them -- confirm.
    s0 = sim[0]
    x = s0['hst']['ep']  # shared x axis
    r2 = s0['rsq']  # shared r2
    frq = s0['frq']
    nep = s0['nep']
    fam = s0['fam']
    # sizes of the first and the third axis of the genomic matrix
    N, P = s0['gmx'].shape[0:3:2]

    # shared genome type
    gtp = s0['gtp']
    mdl = s0['mdl']
    # the title lists all shared settings as 'key=value' pairs
    ttl = dict(r2=r2, N=N, P=P, gtp=gtp, frq=frq, fam=fam, nep=nep, mdl=mdl)
    ttl = ','.join(['='.join([str(k), str(v)]) for k, v in ttl.items()])
    print(ttl)

    if out is None:
        out = '.'
    if pt.isdir(out):
        out = pt.join(out, ttl)

    # benchmarks
    bmk = np.concatenate([_['bmk'] for _ in sim])

    # course types
    xtp = set(_['xtp'] for _ in sim)
    cs = 'bgrcmykw'  # one color per course type
    for i, t in enumerate(xtp):
        # simulations of this course type
        sub = [s for s in sim if s['xtp'] == t]
        nwk = str(sub[0]['dim'])

        # histories
        # NOTE(review): the field access below assumes each 'hst' is a
        # structured array of equal length -- confirm.
        h = np.array([_['hst'] for _ in sub])

        # early stop
        # index of the lowest validation error, averaged over simulations
        early = np.argmin(h['verr'].mean(0))

        # error plot
        # validation error as solid line, training error as dashed line
        gc.subplot(2, 1, 1)
        gc.loglog(x, h['verr'].mean(0), c=cs[i], ls='-', lw=2, label=t)
        gc.loglog(x, h['terr'].mean(0), c=cs[i], ls='--', lw=2, label='_' + t)
        # vertical line at the early stop, drawn for the last type only
        if i == len(xtp) - 1:
            gc.loglog([early, early], gc.ylim(), c=cs[i], ls='-', lw=2)

        if i == 0:
            gc.ylabel(r'error')
            gc.title(ttl)
        gc.legend()

        # correlation plot
        # the history field and the axis label depend on the family
        gc.subplot(2, 1, 2)
        acc = 'tauc' if fam == 'bin' else 'vcor'
        ylab = r'$auc(y, \hat{y})$' if fam == 'bin' else r'$corr(y, \hat{y})$'
        gc.loglog(x, h[acc].mean(0), c=cs[i], lw=2, label=t)
        if i == len(xtp) - 1:
            gc.loglog([early, early], gc.ylim(), c=cs[i], ls='-', lw=2)

        # record
        # best value of each simulation, averaged, appended to the
        # benchmarks as method 'nnt'.
        acc = h[acc].max(1).mean()
        pgz = np.array([('nnt', '{!s:>10}'.format(t), acc)], bmk.dtype)
        bmk = np.concatenate([bmk, pgz])
        print("DNN: {:s} {:s} {:.3f}".format(t, nwk, acc))

        # axis, labels should be plot only once
        if i == 0:
            # horizontal line to show r2
            gc.loglog(x, np.repeat(r2, x.size), 'r', lw=2, label=r'$r^2$')

            # other decoration elements
            gc.ylabel(ylab)
            gc.xlabel(r'epoch')
        gc.legend(loc=4)

    # saving of the benchmarks and of the figure is currently disabled:
    # fo = out + '.bmk'
    # np.savetxt(fo, bmk, '%s', header=' '.join(bmk.dtype.names), comments='')

    # fo = out + '.png'
    # gc.savefig(fo)

    return gc, bmk
Ejemplo n.º 6
0
def ept(fnm, out=None):
    """ Export training result in text format.
    -- fnm: filename of training progress.
    -- out: where to save the export, defaults to the working directory;
    a directory is completed with the stem of {fnm}, and the extension
    '.tgz' is appended unless the name already ends with '.tgz' or
    '.tar.gz'.

    The export is a gzipped tarball of text files: gx0.txt and gx1.txt
    (the two slices along the second axis of 'gmx'), gmp.txt, sbj.txt,
    cve.txt, hof.txt, and inf.txt with the remaining entries written as
    'key=value' lines.

    Raises KeyError when the saved progress lacks an expected entry; a
    failure while packing is printed instead of raised.
    """
    import shutil
    import tarfile
    import tempfile

    pwd = os.getcwd()
    fnm = pt.abspath(fnm)

    if out is None:
        out = pwd
    if pt.isdir(out):
        out = pt.join(out, pt.basename(fnm).split('.')[0])
    # a tuple is required here: "not A or B" would append '.tgz' to every
    # name ending with '.tar.gz'.
    if not out.endswith(('.tgz', '.tar.gz')):
        out = out + '.tgz'
    out = pt.abspath(out)

    # read the training progress, drop what is not exported
    dat = lpz(fnm)
    for key in ['nwk', 'cvm', 'cvw', 'nft', 'ovr']:
        dat.pop(key)

    dat['fnm'] = fnm
    dat['out'] = out

    # individual files go to a temporary directory, which is removed no
    # matter how the export ends.
    tpd = tempfile.mkdtemp()
    try:
        # genomic matrix
        gmx = dat.pop('gmx').astype('i1')
        np.savetxt(pt.join(tpd, 'gx0.txt'), gmx[:, 0, :], '%d')
        np.savetxt(pt.join(tpd, 'gx1.txt'), gmx[:, 1, :], '%d')

        # genomic map
        np.savetxt(pt.join(tpd, 'gmp.txt'), dat.pop('gmp'), '%d\t%d\t%s')

        # subjects
        np.savetxt(pt.join(tpd, 'sbj.txt'), dat.pop('sbj'), '%s')

        # CV-errors
        np.savetxt(pt.join(tpd, 'cve.txt'), dat.pop('cve'), '%.8f')

        # high-order features
        np.savetxt(pt.join(tpd, 'hof.txt'), dat.pop('hof'), '%.8f')

        # meta information: whatever is left
        with open(pt.join(tpd, 'inf.txt'), 'w') as inf:
            for k, v in dat.items():
                inf.write('{}={}\n'.format(k, v))

        # pack the output; members are stored under their bare file names,
        # so there is no need to change the working directory.
        try:
            with tarfile.open(out, 'w:gz') as tar:
                for name in os.listdir(tpd):
                    tar.add(pt.join(tpd, name), arcname=name)
        except Exception as e:
            print(e)
    finally:
        # delete individual files
        shutil.rmtree(tpd, True)
Ejemplo n.º 7
0
def main(fnm='../../raw/W09/1004', **kwd):
    """ the fine-tune procedure for Stacked Autoencoder(SAE).

    -- fnm: pathname to the input data. If {fnm} points to a directory, a
    file is randomly chosen from it.

    ** lrt: new learning rate, overrides any saved one.
    ** hte: new halting error; forces the normal training to continue
    until it is reached.
    ** sav: where to save the progress, defaults to '.'; a directory is
    completed with the stem of {fnm}.
    ** ovr: what to do when the saved progress exists: 0 or above 2 = skip,
    1 = resume, 2 = restart.
    ** wdp: maximum network depth, defaults to 16.
    ** cvk: number of cross-validation folds, defaults to 2.
    ** gdy: greedy training option handed to ftn_sae(), defaults to False.

    Cross-validation (CV) networks are tuned first; the normal training
    (NT) on all data only runs once every CV network has halted, with the
    mean CV training error as halting error unless {hte} is given.

    Returns the progress dictionary without its None entries.
    """
    new_lrt = kwd.pop('lrt', None)  # new learning rate
    new_hte = kwd.pop('hte', None)  # new halting error

    # randomly pick data file if {fnm} is a directory and no record
    # exists in the saved progress:
    if pt.isdir(fnm):
        fnm = pt.join(fnm, np.random.choice(os.listdir(fnm)))
    kwd.update(fnm=fnm)

    # load data from {fnm}, but parameters in {kwd} takes precedence.
    for k, v in lpz(fnm).items():
        kwd.setdefault(k, v)

    # check saved progress and overwrite options:
    sav = kwd.get('sav', '.')
    if pt.isdir(sav):
        sav = pt.join(sav, pt.basename(fnm).split('.')[0])
    if pt.exists(sav + '.pgz'):
        print(sav, ": exists,")
        ovr = kwd.pop('ovr', 0)  # overwrite?

        if ovr == 0 or ovr > 2:  # do not overwrite the progress
            print(" skipped.")
            return kwd
    else:
        ovr = 2

    # resume progress, use network stored in {sav}.
    if ovr == 1:
        # use the saved CV networks, CV learning rates, CV halting states,
        # CV errors, and the saved learning rate and network for training
        for k in ['cvw', 'cvl', 'cvh', 'cve', 'lrt', 'nwk']:
            kwd.pop(k, None)

        # remaining options in {kwd} take precedence over {sav}.
        sdt = lpz(sav)
        sdt.update(kwd)
        kwd = sdt
        print("continue training.")
    else:  # restart the training
        # do not use archived learning rates, CV errors or halting states
        for k in ['lrt', 'cvl', 'cve', 'cvh']:
            kwd.pop(k, None)
        print("restart training.")

    # <-- __x, w, npt, ptn, ... do it.
    gmx = kwd['gmx']
    nsb = gmx.shape[0]                      # sample size
    xmx = gmx.reshape(nsb, -1).astype('f')  # training data
    ngv = xmx.shape[-1]                     # feature size
    mdp = kwd.pop('wdp', 16)                # maximum network depth
    # learing rates
    lrt = new_lrt if new_lrt else kwd.pop('lrt', 1e-4)
    # layer sizes: the feature size halved again and again
    dim = [ngv//2**_ for _ in range(mdp) if 2**_ <= ngv]

    # cross-validation networks
    cvk = kwd.get('cvk', 2)                     # K
    cvm = kwd.get('cvm', cv_msk(xmx, cvk))      # mask
    cvh = kwd.pop('cvh', [None] * cvk)          # halting
    cvl = kwd.pop('cvl', [lrt] * cvk)           # learning rates
    cvw = kwd.pop('cvw', [None] * cvk)          # slots for CV networks
    cve = kwd.pop('cve', np.ndarray((cvk, 2)))  # error

    # tune the network: (1) CV
    for i, m in enumerate(cvm):
        msg = 'CV: {:02d}/{:02d}'.format(i + 1, cvk)
        if cvh[i]:
            msg = msg + ' halted.'
            print(msg)
            continue

        print(msg)
        if cvw[i] is None:
            cvw[i] = SAE.from_dim(dim, s='relu', **kwd)
            cvw[i][-1].s = 'sigmoid'

        # suggest no layer-wise treatment
        gdy = kwd.get('gdy', False)

        # explicit options override same-named entries of {kwd}; passing
        # both as keywords would raise TypeError for a duplicated name.
        # NOTE(review): xmx[-m] / xmx[+m] look like the two sides of the
        # CV partition -- confirm what cv_msk() returns.
        arg = dict(kwd, gdy=gdy, lrt=cvl[i])
        kwd = ftn_sae(cvw[i], xmx[-m], xmx[+m], **arg)

        # collect the output
        ftn = kwd.pop('ftn')
        cvl[i] = ftn.lrt.get_value()  # CV learning rate
        cve[i, 0] = ftn.terr()        # CV training error
        cve[i, 1] = ftn.verr()        # CV validation error
        cvh[i] = ftn.hlt              # CV halting?
    # update
    kwd.update(cvk=cvk, cvm=cvm, cvh=cvh, cvl=cvl, cve=cve, cvw=cvw)

    # (2) normal training
    # force continue of training till new halting error?
    if new_hte:
        for k in ['hte', 'hof', 'eot', 'eov']:
            kwd.pop(k, None)
        hte = new_hte
    else:
        # mean CV training error as halting error
        hte = kwd.pop('hte', cve[:, 0].mean())

    # NT only happens when all CV is halted.
    if all(cvh) and 'hof' not in kwd:
        # create normal network if necessary
        nwk = kwd.pop('nwk', None)
        if nwk is None:
            nwk = SAE.from_dim(dim, s='relu', **kwd)
            nwk[-1].s = 'sigmoid'

        # suggest no layer-wise treatment
        gdy = kwd.get('gdy', False)

        print('NT: HTE = {}'.format(hte))
        arg = dict(kwd, gdy=gdy, lrt=lrt, hte=hte)
        kwd = ftn_sae(nwk, xmx, xmx, **arg)
        ftn = kwd.pop('ftn')
        lrt = ftn.lrt.get_value()  # learning rate

        # update
        kwd.update(nwk=nwk, lrt=lrt, hte=hte)

        # when NT halt, save the high order features
        if ftn.hlt:
            kwd['hof'] = nwk.ec(xmx).eval()
            kwd['eot'] = ftn.terr()
            kwd['eov'] = ftn.verr()
            print('NT: halted.')
    elif all(cvh) and 'hof' in kwd:
        print('NT: halted.')
    else:
        print('NT: Not Ready.')  # not ready for NT

    # save
    if sav:
        print("write to: ", sav)
        spz(sav, kwd)

    kwd = dict((k, v) for k, v in kwd.items() if v is not None)
    return kwd
Ejemplo n.º 8
0
def main(fnm='../../sim/W09/10_PTN', **kwd):
    """ the fine-tune procedure for Stacked Autoencoder(SAE).

    -- fnm: pathname to the input, supposingly the saved progress after the
    pre-training. If {fnm} points to a directory, a file is randomly chosen
    from it.

    ** ae1: depth of the sub SA, defaults to the depth of the network.
    ** sav: where to save the progress, defaults to the stem of {fnm}.
    ** ovr: what to do when the saved progress exists: 0 or above 2 = skip,
    1 = resume, 2 = restart.
    ** lrt: learning rate for normal training, defaults to .01.

    Cross-validation (CV) networks are tuned first; the normal training
    (NT) only runs once every CV network has halted, with the mean CV
    training error as halting error.

    Returns the progress dictionary without its None entries.
    """
    # randomly pick pre-trained progress if {fnm} is a directory and no record
    # exists in the saved progress:
    if pt.isdir(fnm):
        fnm = pt.join(fnm, np.random.choice(os.listdir(fnm)))
    kwd.update(fnm=fnm)

    # load data from {fnm}, but let parameters in {kwd} takes precedence over
    # those in {fnm}
    for k, v in lpz(fnm).items():
        kwd.setdefault(k, v)

    # check saved progress and overwrite options:
    sav = kwd.get('sav', pt.basename(fnm).split('.')[0])
    if pt.exists(sav + '.pgz'):
        print(sav, ": exists,")
        ovr = kwd.pop('ovr', 0)  # overwrite?

        if ovr == 0 or ovr > 2:  # do not overwrite the progress
            print(" skipped.")
            return kwd
    else:
        ovr = 2

    # resume progress, use network stored in {sav}.
    if ovr == 1:
        # use the saved networks (CV and training), CV learning rates, CV
        # halting states, CV errors, and the saved learning rate
        for k in ['cvw', 'nwk', 'cvl', 'cvh', 'cve', 'lrt']:
            kwd.pop(k, None)

        # remaining options in {kwd} take precedence over {sav}.
        sdt = lpz(sav)
        sdt.update(kwd)
        kwd = sdt
        print("continue training.")
    else:  # restart the training
        # do not use archived learning rates, CV errors or halting states
        for k in ['lrt', 'cvl', 'cve', 'cvh']:
            kwd.pop(k, None)
        print("restart training.")

    # <-- __x, w, npt, ptn, ... do it.
    xmx = kwd['xmx']  # training data
    nwk = kwd['nwk']  # the whole network, not subset
    dph = len(nwk.sa)  # depth of the network
    ae1 = kwd.get('ae1', dph)  # encoding depth -- autoencoder one

    # cross-validation
    cvk = kwd['cvk']  # CV folds
    cvm = kwd['cvm']  # CV partitaion mask
    cvw = kwd['cvw']  # CV networks
    # CV halting
    cvh = kwd.get('cvh', [False] * cvk)

    # learing rates for normal training and CV
    lrt = kwd.pop('lrt', .01)
    cvl = kwd.pop('cvl', [lrt] * cvk)

    # CV errors; any given NT halting error is discarded, since the mean
    # CV training error is used instead (see below).
    kwd.pop('hte', None)
    cve = kwd.pop('cve', np.ndarray((cvk, 2)))

    # create error tables if necessary:
    if kwd.get('etb') is None:
        kwd['etb'] = np.zeros([dph + 1, 2]) - 1.0
        kwd['etb'][0] = 0  # raw data has zero error
    etb = kwd['etb']

    # create high order feature (HOF) table:
    if kwd.get('hof') is None:
        kwd['hof'] = [None] * (dph + 1)
    hof = kwd['hof']

    # raw data as trivial HOF: keep the variants whose 'ugv' entry is
    # below 1, then flatten all but the first axis.
    _ = kwd['gmx'][:, :, kwd['ugv'] < 1]
    _ = _.reshape(_.shape[0], -1)
    hof[0] = _

    # fine-tuning
    # 1) for CV:
    for i, m in enumerate(cvm):
        msg = 'CV: {:02d}/{:02d}'.format(i + 1, cvk)
        if cvh[i]:
            msg = msg + ' halted.'
            print(msg)
            continue
        print(msg)
        # NOTE(review): xmx[-m] / xmx[+m] look like the two sides of the
        # CV partition -- confirm the type of the mask.
        kwd = ftn_sae(cvw[i], xmx[-m], xmx[+m], lrt=cvl[i], **kwd)

        # collect the output
        ftn = kwd.pop('ftn')
        cvl[i] = ftn.lrt.get_value()  # learning rate
        cve[i, 0] = ftn.terr()  # CV training error
        cve[i, 1] = ftn.verr()  # CV validation error
        cvh[i] = ftn.hlt.get_value()  # CV halting

    # mean CV validation error
    etb[ae1, 1] = cve[:, 1].mean()

    # mean CV training error as NT halting error
    hte = cve[:, 0].mean()

    # 2) for normal training:
    # happens when all CV is halted or converged.
    if np.all(cvh):
        print('NT: HTE = {}'.format(hte))
        kwd = ftn_sae(nwk, xmx, xmx, lrt=lrt, hte=hte, **kwd)
        ftn = kwd.pop('ftn')
        lrt = ftn.lrt.get_value()  # learning rate
        etb[ae1, 0] = ftn.terr()  # normal error
        if ftn.hlt.get_value():
            print('NT: halted.')

        # high order features for all individuals, with typed variants
        _ = kwd['gmx'][:, :, kwd['ugv'] < 1]
        _ = _.reshape(_.shape[0], -1)
        hof[ae1] = ftn.nnt.ec(_).eval()
    else:
        print('NT: HTE = ??')  # not ready for NT

    # 3) update progress and saving.
    kwd.update(cvl=cvl, cve=cve, cvh=cvh, lrt=lrt, etb=etb, hof=hof)
    if sav:
        print("write to: ", sav)
        spz(sav, kwd)

    kwd = dict((k, v) for k, v in kwd.items() if v is not None)
    return kwd
Ejemplo n.º 9
0
def ept(fnm, out=None):
    """ Export training result in text format.
    -- fnm: filename of training progress.
    -- out: where to save the export, defaults to the working directory;
    a directory is completed with the stem of {fnm}, and the extension
    '.tgz' is appended unless the name already ends with '.tgz' or
    '.tar.gz'.

    The export is a gzipped tarball of text files: gmp.txt, dsg.txt,
    sbj.txt, usb.txt, ugv.txt, one hf{i}.txt per available entry of 'hof',
    etb.txt, cvm.txt, and inf.txt with the remaining entries written as
    'key=value' lines.

    Raises KeyError when the saved progress lacks an expected entry; a
    failure while packing is printed instead of raised.
    """
    import shutil
    import tarfile
    import tempfile

    pwd = os.getcwd()
    fnm = pt.abspath(fnm)

    if out is None:
        out = pwd
    if pt.isdir(out):
        out = pt.join(out, pt.basename(fnm).split('.')[0])
    # a tuple is required here: "not A or B" would append '.tgz' to every
    # name ending with '.tar.gz'.
    if not out.endswith(('.tgz', '.tar.gz')):
        out = out + '.tgz'
    out = pt.abspath(out)

    # read the training progress, drop what is not exported.  'nep' used
    # to be listed twice, which made the second removal fail.
    # NOTE(review): the sibling exporters also drop 'nft' -- confirm
    # whether it should be dropped here as well.
    dat = lpz(fnm)
    for key in ['gmx', 'cvw', 'sav', 'nep', 'ovr']:
        dat.pop(key)
    dat['fnm'] = fnm
    dat['out'] = out

    # individual files go to a temporary directory, which is removed no
    # matter how the export ends.
    tpd = tempfile.mkdtemp()
    try:
        # genomic map
        np.savetxt(pt.join(tpd, 'gmp.txt'), dat.pop('gmp'), '%d\t%d\t%s')

        # genomic data in dosage format
        np.savetxt(pt.join(tpd, 'dsg.txt'), dat.pop('dsg'), '%d')

        # subjects
        np.savetxt(pt.join(tpd, 'sbj.txt'), dat.pop('sbj'), '%s')

        # untyped subjects (indices)
        np.savetxt(pt.join(tpd, 'usb.txt'), dat.pop('usb'), '%d')

        # untyped variants (indices)
        np.savetxt(pt.join(tpd, 'ugv.txt'), dat.pop('ugv'), '%d')

        # sub high-order features, one file per available depth
        for i, hf in enumerate(dat.pop('hof')):
            if hf is None:
                continue
            np.savetxt(pt.join(tpd, 'hf{}.txt'.format(i)), hf, '%.8f')

        # error table
        np.savetxt(pt.join(tpd, 'etb.txt'), dat.pop('etb'), '%.8f')

        # CV masks
        np.savetxt(pt.join(tpd, 'cvm.txt'), dat.pop('cvm'), '%d')

        # meta information: whatever is left
        with open(pt.join(tpd, 'inf.txt'), 'w') as inf:
            for k, v in dat.items():
                inf.write('{}={}\n'.format(k, v))

        # pack the output; members are stored under their bare file names,
        # so there is no need to change the working directory.
        try:
            with tarfile.open(out, 'w:gz') as tar:
                for name in os.listdir(tpd):
                    tar.add(pt.join(tpd, name), arcname=name)
        except Exception as e:
            print(e)
    finally:
        # delete individual files
        shutil.rmtree(tpd, True)
Ejemplo n.º 10
0
def main(fnm='../../sim/W08/10_PTN', **kwd):
    """ the fine-tune procedure for Stacked Autoencoder(SAE).

    -- fnm: pathname to the input, supposingly the saved progress after the
    pre-training. If {fnm} points to a directory, a file is randomly chosen
    from it.

    ** ae1: depth of the sub SA, defaults to the depth of the network.
    ** sav: where to save the progress, defaults to the stem of {fnm}.
    ** ovr: what to do when the saved progress exists: 0 or above 2 = skip,
    1 = resume, 2 = restart.
    ** lrt: learning rate for normal training, defaults to .01.

    Every cross-validation (CV) network is tuned, then the whole network
    is tuned on all data (NT).

    Returns the progress dictionary without its None entries.
    """
    # randomly pick pre-trained progress if {fnm} is a directory and no record
    # exists in the saved progress:
    if pt.isdir(fnm):
        fnm = pt.join(fnm, np.random.choice(os.listdir(fnm)))
    kwd.update(fnm=fnm)

    # load data from {fnm}, but let parameters in {kwd} takes precedence over
    # those in {fnm}
    for k, v in lpz(fnm).items():
        kwd.setdefault(k, v)

    # check saved progress and overwrite options:
    sav = kwd.get('sav', pt.basename(fnm).split('.')[0])
    if pt.exists(sav + '.pgz'):
        print(sav, ": exists,")
        ovr = kwd.pop('ovr', 0)  # overwrite?

        if ovr == 0 or ovr > 2:  # do not overwrite the progress
            print(" skipped.")
            return kwd

        # resumed fine-tuneing,  use network stored in {sav} if possible
        # continue with the progress
        if ovr == 1:
            # use the saved networks and learning rates, for both CV and
            # training
            for k in ['cvw', 'nwk', 'cvl', 'lrt']:
                kwd.pop(k, None)

            # remaining options in {kwd} take precedence over {sav}; the
            # saved progress is only loaded here, where it is needed.
            sdt = lpz(sav)
            sdt.update(kwd)
            kwd = sdt
            print("continue training.")
    else:
        ovr = 2                 # no saved progress, restart anyway
    if ovr == 2:                # restart the training
        kwd.pop('lrt', None)    # do not use archived LRT for training
        kwd.pop('cvl', None)    # do not use archived LRT for CV
        print("restart training.")

    # <-- __x, w, npt, ptn, ... do it.
    xmx = kwd['xmx']            # training data
    nwk = kwd['nwk']            # the whole network, not subset
    dph = len(nwk.sa)           # depth of the network
    ae1 = kwd.get('ae1', dph)   # encoding depth -- autoencoder one

    # cross-validation
    cvk = kwd['cvk']            # CV folds
    cvm = kwd['cvm']            # CV partitaion mask
    cvw = kwd['cvw']            # CV networks

    # learing rates for normal training and CV
    lrt = kwd.pop('lrt', .01)
    cvl = kwd.pop('cvl', [lrt] * cvk)

    # create error tables if necessary:
    if kwd.get('etb') is None:
        kwd['etb'] = np.zeros([dph + 1, 2]) - 1.0
        kwd['etb'][0] = 0       # raw data has zero error
    etb = kwd['etb']

    # create high order feature (HOF) table:
    if kwd.get('hof') is None:
        kwd['hof'] = [None] * (dph + 1)
        kwd['hof'][0] = xmx     # raw data as trivial HOF
    hof = kwd['hof']

    # fine-tuning
    # 1) for CV:
    cve = np.ndarray(cvk)       # CV errors, one slot per fold
    for i, m in enumerate(cvm):
        print('CV: {:02d}/{:02d}'.format(i+1, cvk))
        # NOTE(review): xmx[-m] / xmx[+m] look like the two sides of the
        # CV partition -- confirm the type of the mask.
        kwd = ftn_sae(cvw[i], xmx[-m], xmx[+m], lrt=cvl[i], **kwd)

        # collect the output
        ftn = kwd.pop('ftn')
        cvl[i] = ftn.lrt.get_value()  # learning rate
        cve[i] = ftn.verr()           # CV errors
    etb[ae1, 1] = cve.mean()          # mean CV error

    # 2) for normal training:
    print('NT:')
    kwd = ftn_sae(nwk, xmx, xmx, lrt=lrt, **kwd)
    ftn = kwd.pop('ftn')
    lrt = ftn.lrt.get_value()    # learning rate
    etb[ae1, 0] = ftn.terr()     # normal error

    # high order features
    hof[ae1] = ftn.nnt.ec(xmx).eval()

    # 3) update progress
    kwd.update(cvl=cvl, lrt=lrt, etb=etb, hof=hof)

    # save the progress
    if sav:
        print("write to: ", sav)
        spz(sav, kwd)

    kwd = dict((k, v) for k, v in kwd.items() if v is not None)
    return kwd
Ejemplo n.º 11
0
def ept(fnm, out=None):
    """ Export training result in text format.
    -- fnm: filename of training progress.
    -- out: where to save the export, defaults to the working directory;
    a directory is completed with the stem of {fnm}, and the extension
    '.tgz' is appended unless the name already ends with '.tgz' or
    '.tar.gz'.

    The export is a gzipped tarball of text files: gmp.txt, dsg.txt,
    sbj.txt, hff.txt (the saved network's encoding of 'xmx'), one hf{i}.txt
    per available entry of 'hof', etb.txt, cvm.txt, and inf.txt with the
    remaining entries written as 'key=value' lines.

    Raises KeyError when the saved progress lacks an expected entry; a
    failure while packing is printed instead of raised.
    """
    import shutil
    import tarfile
    import tempfile

    pwd = os.getcwd()
    fnm = pt.abspath(fnm)

    if out is None:
        out = pwd
    if pt.isdir(out):
        out = pt.join(out, pt.basename(fnm).split('.')[0])
    # a tuple is required here: "not A or B" would append '.tgz' to every
    # name ending with '.tar.gz'.
    if not out.endswith(('.tgz', '.tar.gz')):
        out = out + '.tgz'
    out = pt.abspath(out)

    # read the training progress, drop what is not exported
    dat = lpz(fnm)
    for key in ['gmx', 'cvw', 'sav', 'nep', 'nft', 'ovr']:
        dat.pop(key)
    dat['fnm'] = fnm
    dat['out'] = out

    # individual files go to a temporary directory, which is removed no
    # matter how the export ends.
    tpd = tempfile.mkdtemp()
    try:
        # genomic map
        np.savetxt(pt.join(tpd, 'gmp.txt'), dat.pop('gmp'), '%d\t%d\t%s')

        # genomic data in dosage format
        np.savetxt(pt.join(tpd, 'dsg.txt'), dat.pop('dsg'), '%d')

        # subjects
        np.savetxt(pt.join(tpd, 'sbj.txt'), dat.pop('sbj'), '%s')

        # final high-order features
        xmx, nwk = dat.pop('xmx'), dat.pop('nwk')
        np.savetxt(pt.join(tpd, 'hff.txt'), nwk.ec(xmx).eval(), '%.8f')

        # sub high-order features, one file per available depth
        for i, hf in enumerate(dat.pop('hof')):
            if hf is None:
                continue
            np.savetxt(pt.join(tpd, 'hf{}.txt'.format(i)), hf, '%.8f')

        # error table
        np.savetxt(pt.join(tpd, 'etb.txt'), dat.pop('etb'), '%.8f')

        # CV masks
        np.savetxt(pt.join(tpd, 'cvm.txt'), dat.pop('cvm'), '%d')

        # meta information: whatever is left
        with open(pt.join(tpd, 'inf.txt'), 'w') as inf:
            for k, v in dat.items():
                inf.write('{}={}\n'.format(k, v))

        # pack the output; members are stored under their bare file names,
        # so there is no need to change the working directory.
        try:
            with tarfile.open(out, 'w:gz') as tar:
                for name in os.listdir(tpd):
                    tar.add(pt.join(tpd, name), arcname=name)
        except Exception as e:
            print(e)
    finally:
        # delete individual files
        shutil.rmtree(tpd, True)
Ejemplo n.º 12
0
def main(fnm, **kwd):
    """ the fine-tune procedure for Stacked Autoencoder(SAE).

    -- fnm: pathname to the input, supposingly the saved progress after the
    pre-training. If {fnm} points to a directory, a file is randomly chosen
    from it.

    ** sav: where to save the progress, defaults to '.'; None means next
    to {fnm}; a directory is completed with the stem of {fnm}.
    ** ovr: what to do when the saved progress exists: 0 or above 2 = skip,
    1 = resume, 2 = restart.
    ** rte: when resuming, restart instead if the saved training error is
    above this value; defaults to 1e9.
    ** nsb: use a random subset of this many samples.
    ** lrt: learning rate, defaults to .001.
    ** nep: number of epochs, defaults to 20.
    ** hte: halting error, defaults to .001; nothing is done when the
    training error is already below it.
    ** ste: nothing is done when the training error is above this value;
    defaults to 1e10.
    ** wdp: depth of the working network; None means the whole network.
    ** wd0: index of the lowest layer newly added on top of an already
    tuned network; defaults to 0.
    ** pep: number of epochs for pre-training the new layers, defaults to
    {nep}.

    Returns the progress dictionary; None entries are removed unless the
    function returns early.
    """
    # randomly pick pre-trained progress if {fnm} is a directory and no record
    # exists in the saved progress:
    if pt.isdir(fnm):
        fnm = pt.join(fnm, np.random.choice(os.listdir(fnm)))
    kwd.update(fnm=fnm)
    fdr, fbn = pt.split(fnm)
    fpx = fbn.split('.', 2)[0]

    # read {fnm}, but parameters in {kwd} takes precedence over those in
    # {fnm}; a learning rate stored in {fnm} is never used.
    keys = list(kwd.keys()) + ['lrt']
    kwd.update((k, v) for k, v in lpz(fnm).items() if k not in keys)

    # check saved progress and overwrite options:
    sav = kwd.get('sav', '.')
    if sav is None:
        sav = pt.join(fdr, fpx)
    if pt.isdir(sav):
        sav = pt.join(sav, fpx)
    if pt.exists(sav + '.pgz') or pt.exists(sav):
        print(sav, ": exists,")
        ovr = kwd.pop('ovr', 0)  # overwrite?
        if ovr == 0 or ovr > 2:  # do not overwrite the progress
            print(" skipped.")
            return kwd
    else:
        ovr = 2

    # resume progress, use network stored in {sav}.
    if ovr == 1:
        # remaining options in {kwd} take precedence over {sav}, but always use
        # saved network, even if there is one available in {fnm}.
        sdt = lpz(sav)          # load

        # should we restart the training due to failure?
        eot = sdt.get('eot', 0.0)
        rte = kwd.pop('rte', 1e9)
        if eot > rte:
            ovr = 2
            # {rte} has been taken out of {kwd}, report the local value
            print('NT: RTE = {}'.format(rte))
            print('NT: EOT = {}'.format(sdt.get('eot')))
            print("NT: eot > rte, re-try.")
        else:
            kwd.pop('nwk', None)    # use saved network
            kwd.pop('lrt', None)    # use saved learning rate
            kwd.pop('eot', None)    # use saved error
            sdt.update(kwd)
            kwd = sdt
            print("continue.")
    else:                       # restart the training
        print("restart.")

    # training data, rebuilt from the two genomic copies when available
    if 'gx0' in kwd and 'gx1' in kwd:
        gmx = np.concatenate([
            kwd['gx0'][:, np.newaxis],
            kwd['gx1'][:, np.newaxis]], 1)
    else:
        gmx = kwd['gmx']

    # take out part of the samples?  a random subset, kept in the
    # original order.
    nsb = kwd.get('nsb', None)
    if nsb is not None and nsb < gmx.shape[0]:
        idx = np.sort(np.random.permutation(gmx.shape[0])[:nsb])
        gmx = gmx[idx, :]
        kwd['sbj'] = kwd['sbj'][idx]
    else:
        nsb = gmx.shape[0]

    # genomic copy 1 & 2
    kwd['gx0'] = gmx[:, 0, :]
    kwd['gx1'] = gmx[:, 1, :]

    # training format: one flat row per sample
    xmx = gmx.reshape(nsb, -1).astype('f')

    # the dimensions: the feature size, then halved again and again
    ngv = xmx.shape[-1]
    dim = [ngv] + [ngv//2**d for d in range(1, 16) if 2**d <= ngv]

    # learing rates
    lrt = kwd.pop('lrt', .001)
    nep = kwd.pop('nep', 20)

    # Halt already?
    hte = kwd.pop('hte', .001)
    eot = kwd.pop('eot', 1e12)
    ste = kwd.pop('ste', 1e10)
    print('NT: HTE = {}'.format(hte))
    print('NT: EOT = {}'.format(eot))
    if eot < hte:               # halted?
        print('NT: eot < hte')
        print('NT: Halt.\nNT: Done.')
        return kwd
    if eot > ste:               # to much to even bother?
        print('NT: eot > ste')
        print('NT: Skip.\nNT: Done.')
        return kwd

    # train the network, create it if necessary
    nwk = kwd.pop('nwk', None)
    if nwk is None:
        nwk = SAE(dim, s='sigmoid', **kwd)
        print('create NT: ', nwk)

    # limit the working network
    wdp = kwd.pop('wdp', None)

    # wd0 indexing the lowest of new layers revealed on top of an existing
    # optimal working network
    wd0 = kwd.pop('wd0', 0)

    # pre-train the new layers when they are revealed for the first time.
    if wd0 > 0 and ovr == 2:
        # output from previously trained network at the bottom.
        xm1 = nwk.sub(None, wd0).ec(xmx).eval()
        # the new stacks.
        nw1 = nwk.sub(wd0, wdp)
        pep = kwd.pop('pep', nep)
        ptr = SAE.Train(nw1, xm1, xm1, lrt=lrt, nep=pep)
        eot = ptr.terr()
        # the epochs spent on pre-training are taken off the fine-tuning,
        # which still gets at least 20.
        nep = max(20, nep - pep)

    # fine-tuning
    wnk = nwk.sub(None, wdp)
    ftn = SAE.Train(wnk, xmx, xmx, lrt=lrt, hte=hte, nep=nep, **kwd)
    lrt = ftn.lrt.get_value()
    eot = ftn.terr()
    eov = ftn.verr()
    eph = kwd.pop('eph', 0) + ftn.ep.get_value()  # accumulated epochs
    hof = ftn.nwk.ec(xmx).eval()
    rsd = xmx - ftn.nwk(xmx).eval()  # the residual

    if ftn.hlt:
        print('NT: Halt.', ftn.hlt)

    # 3) update progress and save.
    kwd.update(nwk=nwk, wdp=wdp, lrt=lrt,
               hof=hof, rsd=rsd,
               eot=eot, eov=eov, eph=eph)

    # NOTE(review): 'gh0' / 'gh1' are not set anywhere in this function;
    # 'gx0' / 'gx1' are, and they are read back on resume -- confirm
    # these keys are the intended ones.
    kwd.pop('gh0', None)
    kwd.pop('gh1', None)
    kwd.pop('gmx', None)
    if sav:
        print("write to: ", sav)
        spz(sav, kwd)
    print('NT: Done.')

    kwd = dict((k, v) for k, v in kwd.items() if v is not None)
    return kwd