def LassoRegionOptimizer(str_f_iosg, str_f_st, str_refgem,
                         str_of_seq, str_of_fesrep, str_of_md,
                         int_regn_start=0, int_regn_end=100, int_regn_step=5,
                         cv=5, niter=1000, njob=1):
    """Scan flanking-region sizes and keep the one with the best Lasso r2.

    Upstream sizes ``int_regn_start, int_regn_start + int_regn_step, ...``
    up to and including ``int_regn_end`` are tried; the matching downstream
    size is ``max(upstream - 3, 0)``.  For every candidate the features from
    ``ExtractSeqFeature`` are merged with the score table on ``sgID`` and
    handed to ``ml.LassoSelector``.  The candidate with the highest ``r2``
    is extracted and fitted once more, and that final fit is written out.

    Args:
        str_f_iosg: first argument forwarded to ``ExtractSeqFeature``.
        str_f_st: path of a tab-separated table with a header row.  Only its
            first and last columns are kept; they must include ``sgID`` (the
            merge key), and the last column is used as the regression target.
        str_refgem: second argument forwarded to ``ExtractSeqFeature``.
        str_of_seq: path the feature table of the best region is written to
            (tab-separated, no index column).
        str_of_fesrep: destination handed to ``__SaveFeatureReport``.
        str_of_md: destination handed to ``ml.SaveModel``.
        int_regn_start: smallest upstream size tried.
        int_regn_end: largest upstream size tried (inclusive).
        int_regn_step: increment between upstream sizes.
        cv, niter, njob: forwarded unchanged to ``ml.LassoSelector``.

    Returns:
        None.

    Raises:
        ValueError: if the start/end/step combination yields no candidate.
    """
    int_ups_regn = np.arange(int_regn_start, int_regn_end + 1, int_regn_step)
    # Downstream size trails the upstream size by 3 and is clamped at 0.
    int_dws_regn = np.where((int_ups_regn - 3) > 0, int_ups_regn - 3, 0)
    arr_r2 = np.zeros_like(int_ups_regn, dtype=np.double)

    dfm_st = pd.read_csv(str_f_st, sep='\t', index_col=None)
    # Positional selection: ``.iloc`` replaces the removed ``df[[0, -1]]``
    # integer fallback on label-indexed columns.
    dfm_st = dfm_st.iloc[:, [0, -1]]

    if int_ups_regn.size == 0:
        # argmax() on an empty array would raise a less helpful ValueError.
        raise ValueError('empty region scan: check int_regn_start, '
                         'int_regn_end and int_regn_step')

    def _build_design(int_ups, int_dws):
        # Extract features for one region and split them into X / y.
        dfm_seq = ExtractSeqFeature(str_f_iosg, str_refgem, int_ups, int_dws)
        dfm_all = pd.merge(dfm_seq, dfm_st, on='sgID')
        # Column 0 is skipped (presumably the sgID key -- confirm against
        # ExtractSeqFeature); the last column is the merged-in target.
        dfm_feat = dfm_all.iloc[:, 1:-1]
        x = np.array(dfm_feat, dtype=np.double)
        y = np.array(dfm_all.iloc[:, [-1]], dtype=np.double).ravel()
        return dfm_seq, dfm_feat, x, y

    for i, (int_ups, int_dws) in enumerate(zip(int_ups_regn, int_dws_regn)):
        _, _, x, y = _build_design(int_ups, int_dws)
        arr_r2[i] = ml.LassoSelector(x, y, cv, niter, njob).r2

    int_best = arr_r2.argmax()
    int_ups = int_ups_regn[int_best]
    int_dws = int_dws_regn[int_best]

    # Re-extract and re-fit on the winning region; the scan kept only r2.
    dfm_seq, dfm_x, x, y = _build_design(int_ups, int_dws)
    dfm_seq.to_csv(str_of_seq, sep='\t', index=False)
    md = ml.LassoSelector(x, y, cv, niter, njob)

    lst_fs = list(dfm_x.columns[md.idx])
    fr = Fr(fs=lst_fs, idx=md.idx, ups=int_ups, dws=int_dws, niter=niter,
            cv=cv, cor=md.cor, r2=md.r2)
    __SaveFeatureReport(fr, str_of_fesrep)
    mdl = dict(med='lasso', model=md.model, idx=md.idx,
               ups=int_ups, dws=int_dws)
    ml.SaveModel(mdl, str_of_md)
def LogitRegionOptimizer(str_f_iosg, str_f_st, str_refgem,
                         str_of_seq, str_of_fesrep, str_of_md,
                         int_regn_start=0, int_regn_end=100, int_regn_step=5,
                         cv=5, njob=1):
    """Scan flanking-region sizes and keep the one with the best accuracy.

    Upstream sizes ``int_regn_start, int_regn_start + int_regn_step, ...``
    up to and including ``int_regn_end`` are tried; the matching downstream
    size is ``max(upstream - 3, 0)``.  For every candidate the features from
    ``ExtractSeqFeature`` are merged with the label table on ``sgID`` and
    handed to ``ml.LogitSelector``.  The candidate with the highest ``accu``
    is extracted and fitted once more, and that final fit is written out.

    Args:
        str_f_iosg: first argument forwarded to ``ExtractSeqFeature``.
        str_f_st: path of a tab-separated table with a header row.  Only its
            first and last columns are kept; they must include ``sgID`` (the
            merge key), and the last column is cast to ``int`` and used as
            the class label.
        str_refgem: second argument forwarded to ``ExtractSeqFeature``.
        str_of_seq: path the feature table of the best region is written to
            (tab-separated, no index column).
        str_of_fesrep: destination handed to ``__SaveFeatureReport``.
        str_of_md: destination handed to ``ml.SaveModel``.
        int_regn_start: smallest upstream size tried.
        int_regn_end: largest upstream size tried (inclusive).
        int_regn_step: increment between upstream sizes.
        cv, njob: forwarded unchanged to ``ml.LogitSelector``.

    Returns:
        None.

    Raises:
        ValueError: if the start/end/step combination yields no candidate.
    """
    int_ups_regn = np.arange(int_regn_start, int_regn_end + 1, int_regn_step)
    # Downstream size trails the upstream size by 3 and is clamped at 0.
    int_dws_regn = np.where((int_ups_regn - 3) > 0, int_ups_regn - 3, 0)
    arr_accu = np.zeros_like(int_ups_regn, dtype=np.double)

    dfm_st = pd.read_csv(str_f_st, sep='\t', index_col=None)
    # Positional selection: ``.iloc`` replaces the removed ``df[[0, -1]]``
    # integer fallback on label-indexed columns.
    dfm_st = dfm_st.iloc[:, [0, -1]]

    if int_ups_regn.size == 0:
        # argmax() on an empty array would raise a less helpful ValueError.
        raise ValueError('empty region scan: check int_regn_start, '
                         'int_regn_end and int_regn_step')

    def _build_design(int_ups, int_dws):
        # Extract features for one region and split them into X / y.
        dfm_seq = ExtractSeqFeature(str_f_iosg, str_refgem, int_ups, int_dws)
        dfm_all = pd.merge(dfm_seq, dfm_st, on='sgID')
        # Column 0 is skipped (presumably the sgID key -- confirm against
        # ExtractSeqFeature); the last column is the merged-in label.
        dfm_feat = dfm_all.iloc[:, 1:-1]
        x = np.array(dfm_feat, dtype=np.double)
        # Builtin ``int`` is what the removed ``np.int`` alias resolved to.
        # y stays a single-column 2-D array, exactly as before.
        # NOTE(review): the Lasso functions ravel y; confirm LogitSelector
        # really wants the (n, 1) shape.
        y = np.array(dfm_all.iloc[:, [-1]], dtype=int)
        return dfm_seq, dfm_feat, x, y

    for i, (int_ups, int_dws) in enumerate(zip(int_ups_regn, int_dws_regn)):
        _, _, x, y = _build_design(int_ups, int_dws)
        arr_accu[i] = ml.LogitSelector(x, y, cv, njob).accu

    int_best = arr_accu.argmax()
    int_ups = int_ups_regn[int_best]
    int_dws = int_dws_regn[int_best]

    # Re-extract and re-fit on the winning region; the scan kept only accu.
    dfm_seq, dfm_x, x, y = _build_design(int_ups, int_dws)
    dfm_seq.to_csv(str_of_seq, sep='\t', index=False)
    mdc = ml.LogitSelector(x, y, cv, njob)

    lst_fs = list(dfm_x.columns[mdc.idx])
    frc = Frc(fs=lst_fs, ups=int_ups, dws=int_dws,
              cv=cv, accu=mdc.accu, prec=mdc.prec, rec=mdc.rec,
              f1=mdc.f1, au=mdc.au)
    __SaveFeatureReport(frc, str_of_fesrep)
    mdlc = dict(med='logit', model=mdc.model, idx=mdc.idx,
                ups=int_ups, dws=int_dws)
    ml.SaveModel(mdlc, str_of_md)
def MtLassoFeatureSelection(lst_sg, lst_st, lst_ref, lst_seq,
                            str_of_fesrep, int_ups=30, int_dws=27, flt_lmd=0.5):
    """Select features jointly over several datasets with ``ml.MtLassoSelector``.

    For each dataset ``i`` the features from ``ExtractSeqFeature`` are written
    to ``lst_seq[i]``, merged on ``sgID`` with the first and last columns of
    the table at ``lst_st[i]``, and split into a feature matrix and a target
    column.  All datasets are stacked row-wise and passed, together with the
    cumulative row offsets of each dataset, to ``ml.MtLassoSelector``.

    Args:
        lst_sg: per-dataset first argument for ``ExtractSeqFeature``.
        lst_st: per-dataset paths of tab-separated score tables.
        lst_ref: per-dataset second argument for ``ExtractSeqFeature``.
        lst_seq: per-dataset output paths for the extracted feature tables.
        str_of_fesrep: destination handed to ``__SaveFeatureReport``.
        int_ups, int_dws: region sizes forwarded to ``ExtractSeqFeature``.
        flt_lmd: forwarded unchanged to ``ml.MtLassoSelector``.

    Returns:
        None.

    Raises:
        IndexError: if ``lst_st``, ``lst_ref`` or ``lst_seq`` is shorter
            than ``lst_sg``.
    """
    lst_x = []
    lst_y = []
    ind = [0]
    lst_header = []
    # enumerate() replaces the Python-2-only xrange(); the other lists are
    # still indexed by i so a length mismatch keeps raising IndexError.
    for i, str_sg in enumerate(lst_sg):
        dfm = ExtractSeqFeature(str_sg, lst_ref[i], int_ups, int_dws)
        dfm.to_csv(lst_seq[i], sep='\t', index=False)
        dfm_y = pd.read_csv(lst_st[i], sep='\t', index_col=None)
        # Positional first/last column; ``.iloc`` replaces the removed
        # integer-list fallback of ``df[[0, -1]]``.
        dfm_y = dfm_y.iloc[:, [0, -1]]
        dfm = pd.merge(dfm, dfm_y, on='sgID')
        # Column 0 is skipped (presumably sgID -- confirm); last is the target.
        lst_x.append(np.array(dfm.iloc[:, 1:-1], dtype=np.double))
        # Target is kept as an (n, 1) column so vstack yields an (N, 1) array.
        lst_y.append(np.array(dfm.iloc[:, [-1]], dtype=np.double))
        ind.append(dfm.shape[0])
        if i == 0:
            # Feature names are taken from the first dataset only.
            lst_header = dfm.columns[1:-1].tolist()

    # The per-dataset arrays are already float64, so no extra cast is needed.
    x = np.vstack(lst_x)
    y = np.vstack(lst_y)
    # Row counts -> cumulative offsets marking where each dataset starts/ends.
    ind = np.cumsum(ind)
    # Solver options are passed through verbatim.
    # NOTE(review): meaning of init/rFlag/rsL2/nFlag is defined by
    # ml.MtLassoSelector, which is not visible here.
    dict_opts = dict(init=2, rFlag=1, rsL2=0, ind=ind, nFlag=2)
    idx = ml.MtLassoSelector(x, y, flt_lmd, dict_opts)
    lst_fs = [lst_header[i] for i in idx]
    fr = Fr(fs=lst_fs, ups=int_ups, dws=int_dws)
    __SaveFeatureReport(fr, str_of_fesrep)
def FeatureEval(str_f_iosg, str_f_md, str_refgem, str_of_seq, str_of_st):
    """Score sequences with a saved model and write id/score pairs.

    The model dict loaded from ``str_f_md`` supplies the region sizes
    (``'ups'``, ``'dws'``), the method tag (``'med'``), the fitted model
    (``'model'``) and the selected feature indices (``'idx'``).

    Args:
        str_f_iosg: first argument forwarded to ``ExtractSeqFeature``.
        str_f_md: path handed to ``ml.LoadModel``.
        str_refgem: second argument forwarded to ``ExtractSeqFeature``.
        str_of_seq: output path for the extracted feature table
            (tab-separated, no index column).
        str_of_st: output path for the two-column table made of the first
            feature-table column and the new ``score`` column.

    Returns:
        None.
    """
    mdl = ml.LoadModel(str_f_md)
    dfm = ExtractSeqFeature(str_f_iosg, str_refgem, mdl['ups'], mdl['dws'])
    dfm.to_csv(str_of_seq, sep='\t', index=False)

    # Every column after the first is a feature (``.iloc`` replaces the
    # removed ``.ix`` indexer).
    x = np.array(dfm.iloc[:, 1:], dtype=np.double)
    # An unrecognised 'med' leaves y as None, so the score column is empty;
    # this fall-through is kept from the original behaviour.
    y = None
    if mdl['med'] == 'lasso':
        y = ml.LassoEval(mdl['model'], x, mdl['idx'])
    elif mdl['med'] == 'logit':
        y = ml.LogitEval(mdl['model'], x, mdl['idx'])
    dfm['score'] = y

    # First column plus the last one, which is the freshly appended score
    # (assuming the feature table had no 'score' column already).
    dfm_y = dfm.iloc[:, [0, -1]]
    dfm_y.to_csv(str_of_st, sep='\t', index=False)
def LogitFeatureSelection(
    str_f_iosg,
    str_f_st,
    str_refgem,
    str_of_seq,
    str_of_fesrep,
    str_of_md,
    int_ups=30,
    int_dws=27,
    cv=5,
    niter=1000,
    njob=1,
):
    """Fit ``ml.LogitSelector`` on one fixed region and save report and model.

    Args:
        str_f_iosg: first argument forwarded to ``ExtractSeqFeature``.
        str_f_st: path of a tab-separated table with a header row.  Only its
            first and last columns are kept; they must include ``sgID`` (the
            merge key), and the last column is cast to ``int`` and used as
            the class label.
        str_refgem: second argument forwarded to ``ExtractSeqFeature``.
        str_of_seq: output path for the extracted feature table
            (tab-separated, no index column).
        str_of_fesrep: destination handed to ``__SaveFeatureReport``.
        str_of_md: destination handed to ``ml.SaveModel``.
        int_ups, int_dws: region sizes forwarded to ``ExtractSeqFeature``.
        cv, niter, njob: forwarded unchanged to ``ml.LogitSelector``.

    Returns:
        None.
    """
    dfm_seq = ExtractSeqFeature(str_f_iosg, str_refgem, int_ups, int_dws)
    dfm_seq.to_csv(str_of_seq, sep="\t", index=False)

    dfm_st = pd.read_csv(str_f_st, sep="\t", index_col=None)
    # Positional first/last column; ``.iloc`` replaces the removed
    # integer-list fallback of ``df[[0, -1]]``.
    dfm_st = dfm_st.iloc[:, [0, -1]]
    dfm_all = pd.merge(dfm_seq, dfm_st, on="sgID")
    # Column 0 is skipped (presumably sgID -- confirm); last is the label.
    dfm_x = dfm_all.iloc[:, 1:-1]

    x = np.array(dfm_x, dtype=np.double)
    # Builtin ``int`` is what the removed ``np.int`` alias resolved to.
    # y stays a single-column 2-D array, exactly as before.
    y = np.array(dfm_all.iloc[:, [-1]], dtype=int)

    # NOTE(review): LogitRegionOptimizer in this file calls ml.LogitSelector
    # with four arguments (x, y, cv, njob) and builds Frc without idx/niter;
    # confirm which signature matches the current ml module.
    mdc = ml.LogitSelector(x, y, cv, niter, njob)
    lst_fs = list(dfm_x.columns[mdc.idx])
    frc = Frc(
        fs=lst_fs,
        idx=mdc.idx,
        ups=int_ups,
        dws=int_dws,
        niter=niter,
        cv=cv,
        accu=mdc.accu,
        prec=mdc.prec,
        rec=mdc.rec,
        f1=mdc.f1,
        au=mdc.au,
    )
    __SaveFeatureReport(frc, str_of_fesrep)
    mdlc = dict(med="logit", model=mdc.model, idx=mdc.idx, ups=int_ups, dws=int_dws)
    ml.SaveModel(mdlc, str_of_md)
def LassoFeatureSelection(
    str_f_iosg, str_f_st, str_refgem, str_of_seq, str_of_fesrep, str_of_md, int_ups=30, int_dws=27, cv=5, njob=1
):
    """Fit ``ml.LassoSelector`` on one fixed region and save report and model.

    Args:
        str_f_iosg: first argument forwarded to ``ExtractSeqFeature``.
        str_f_st: path of a tab-separated table with a header row.  Only its
            first and last columns are kept; they must include ``sgID`` (the
            merge key), and the last column is used as the regression target.
        str_refgem: second argument forwarded to ``ExtractSeqFeature``.
        str_of_seq: output path for the extracted feature table
            (tab-separated, no index column).
        str_of_fesrep: destination handed to ``__SaveFeatureReport``.
        str_of_md: destination handed to ``ml.SaveModel``.
        int_ups, int_dws: region sizes forwarded to ``ExtractSeqFeature``.
        cv, njob: forwarded unchanged to ``ml.LassoSelector``.

    Returns:
        None.
    """
    dfm_seq = ExtractSeqFeature(str_f_iosg, str_refgem, int_ups, int_dws)
    dfm_seq.to_csv(str_of_seq, sep="\t", index=False)

    dfm_st = pd.read_csv(str_f_st, sep="\t", index_col=None)
    # Positional first/last column; ``.iloc`` replaces the removed
    # integer-list fallback of ``df[[0, -1]]``.
    dfm_st = dfm_st.iloc[:, [0, -1]]
    dfm_all = pd.merge(dfm_seq, dfm_st, on="sgID")
    # Column 0 is skipped (presumably sgID -- confirm); last is the target.
    dfm_x = dfm_all.iloc[:, 1:-1]

    x = np.array(dfm_x, dtype=np.double)
    y = np.array(dfm_all.iloc[:, [-1]], dtype=np.double).ravel()

    # NOTE(review): LassoRegionOptimizer in this file calls ml.LassoSelector
    # with five arguments (x, y, cv, niter, njob); confirm which signature
    # matches the current ml module.
    md = ml.LassoSelector(x, y, cv, njob)
    lst_fs = list(dfm_x.columns[md.idx])
    fr = Fr(fs=lst_fs, ups=int_ups, dws=int_dws, cv=cv, cor=md.cor, r2=md.r2)
    __SaveFeatureReport(fr, str_of_fesrep)
    mdl = dict(med="lasso", model=md.model, idx=md.idx, ups=int_ups, dws=int_dws)
    ml.SaveModel(mdl, str_of_md)
# Example #7
# 0
def FeatureEval(str_f_iosg, str_f_md, str_refgem, str_of_seq, str_of_st):
    """Score sequences with a saved model and write an annotated, ranked table.

    The model dict loaded from ``str_f_md`` supplies the region sizes
    (``"ups"``, ``"dws"``), the method tag (``"med"``), the fitted model
    (``"model"``) and the selected feature indices (``"idx"``).  The scores
    are joined back onto the input table by ``sgID``, the ``c_site`` column
    is dropped, and rows are sorted by descending score.

    Args:
        str_f_iosg: path of a header-less, tab-separated table with exactly
            seven columns (sgID, chrom, strand, sbeg, send, qseq, c_site);
            also the first argument forwarded to ``ExtractSeqFeature``.
        str_f_md: path handed to ``ml.LoadModel``.
        str_refgem: second argument forwarded to ``ExtractSeqFeature``.
        str_of_seq: output path for the extracted feature table
            (tab-separated, no index column).
        str_of_st: output path for the ranked result table.

    Returns:
        None.
    """
    mdl = ml.LoadModel(str_f_md)
    dfm = ExtractSeqFeature(str_f_iosg, str_refgem, mdl["ups"], mdl["dws"])
    dfm.to_csv(str_of_seq, sep="\t", index=False)

    # Every column after the first is a feature (``.iloc`` replaces the
    # removed ``.ix`` indexer).
    x = np.array(dfm.iloc[:, 1:], dtype=np.double)
    # An unrecognised "med" leaves y as None, so the score column is empty;
    # this fall-through is kept from the original behaviour.
    y = None
    if mdl["med"] == "lasso":
        y = ml.LassoEval(mdl["model"], x, mdl["idx"])
    elif mdl["med"] == "logit":
        y = ml.LogitEval(mdl["model"], x, mdl["idx"])
    dfm["score"] = y

    # First column (must be sgID for the merge below) plus the appended score.
    dfm_y = dfm.iloc[:, [0, -1]]
    dfm_iosg = pd.read_csv(str_f_iosg, sep="\t", header=None, index_col=None)
    dfm_iosg.columns = ["sgID", "chrom", "strand", "sbeg", "send", "qseq", "c_site"]
    dfm_out = pd.merge(dfm_iosg, dfm_y, on="sgID")
    dfm_out.drop("c_site", axis=1, inplace=True)
    # sort_values() is the replacement for the removed sort_index(by=...).
    dfm_out.sort_values(by=["score"], ascending=False, inplace=True)
    dfm_out.to_csv(str_of_st, sep="\t", index=False)