def LassoRegionOptimizer(
    str_f_iosg,
    str_f_st,
    str_refgem,
    str_of_seq,
    str_of_fesrep,
    str_of_md,
    int_regn_start=0,
    int_regn_end=100,
    int_regn_step=5,
    cv=5,
    niter=1000,
    njob=1,
):
    """Scan candidate regions, keep the one with the best Lasso r2, save the fit.

    Candidate ``ups`` values are ``arange(int_regn_start, int_regn_end + 1,
    int_regn_step)``; each is paired with ``dws = max(ups - 3, 0)``.  For every
    pair the features returned by ``ExtractSeqFeature`` are merged with the
    response table on ``'sgID'`` and scored with ``ml.LassoSelector(...).r2``.
    The pair with the highest r2 (the first one on ties) is extracted again,
    refit, and its outputs are written.

    Args:
        str_f_iosg: Forwarded to ``ExtractSeqFeature`` as its first argument.
        str_f_st: Path of a tab-separated table; only its first and last
            columns are used, and the merge requires a column named 'sgID'.
        str_refgem: Forwarded to ``ExtractSeqFeature`` as its second argument.
        str_of_seq: Output path for the feature table of the chosen region
            (tab-separated, no index column).
        str_of_fesrep: Forwarded to ``__SaveFeatureReport`` with the report.
        str_of_md: Forwarded to ``ml.SaveModel`` with the model dictionary.
        int_regn_start: First candidate ``ups`` value.
        int_regn_end: Upper bound (inclusive) of the candidate ``ups`` values.
        int_regn_step: Spacing between candidate ``ups`` values.
        cv: Forwarded to ``ml.LassoSelector`` and recorded in the report.
        niter: Forwarded to ``ml.LassoSelector`` and recorded in the report.
        njob: Forwarded to ``ml.LassoSelector``.

    Returns:
        None.  Results are written via ``to_csv``, ``__SaveFeatureReport`` and
        ``ml.SaveModel``.
    """
    int_ups_regn = np.arange(int_regn_start, int_regn_end + 1, int_regn_step)
    # dws trails ups by 3 and is floored at 0.
    int_dws_regn = np.where((int_ups_regn - 3) > 0, int_ups_regn - 3, 0)
    arr_r2 = np.zeros_like(int_ups_regn, dtype=np.double)

    dfm_st = pd.read_csv(str_f_st, sep='\t', index_col=None)
    # Positional selection: `.ix` and integer-list `df[[0, -1]]` no longer
    # work on current pandas, `.iloc` works on old and new releases alike.
    dfm_st = dfm_st.iloc[:, [0, -1]]

    def _split_xy(dfm_seq):
        # Join features with the response and split into (feature frame, x, y).
        # Column 0 is skipped (presumably the sgID key -- TODO confirm) and the
        # last column is the response taken from `dfm_st`.
        dfm_xy = pd.merge(dfm_seq, dfm_st, on='sgID')
        dfm_feat = dfm_xy.iloc[:, 1:-1]
        arr_x = np.array(dfm_feat, dtype=np.double)
        arr_y = np.array(dfm_xy.iloc[:, [-1]], dtype=np.double).ravel()
        return dfm_feat, arr_x, arr_y

    # Score every candidate region.
    for i, (int_ups, int_dws) in enumerate(zip(int_ups_regn, int_dws_regn)):
        dfm_seq = ExtractSeqFeature(str_f_iosg, str_refgem, int_ups, int_dws)
        _, x, y = _split_xy(dfm_seq)
        arr_r2[i] = ml.LassoSelector(x, y, cv, niter, njob).r2

    # Refit on the best-scoring region and persist everything.
    int_best = arr_r2.argmax()
    int_ups = int_ups_regn[int_best]
    int_dws = int_dws_regn[int_best]
    dfm_seq = ExtractSeqFeature(str_f_iosg, str_refgem, int_ups, int_dws)
    dfm_seq.to_csv(str_of_seq, sep='\t', index=False)
    dfm_x, x, y = _split_xy(dfm_seq)
    md = ml.LassoSelector(x, y, cv, niter, njob)

    lst_fs = list(dfm_x.columns[md.idx])
    fr = Fr(fs=lst_fs, idx=md.idx, ups=int_ups, dws=int_dws, niter=niter, cv=cv, cor=md.cor, r2=md.r2)
    __SaveFeatureReport(fr, str_of_fesrep)
    mdl = dict(med='lasso', model=md.model, idx=md.idx, ups=int_ups, dws=int_dws)
    ml.SaveModel(mdl, str_of_md)
def LogitRegionOptimizer(
    str_f_iosg,
    str_f_st,
    str_refgem,
    str_of_seq,
    str_of_fesrep,
    str_of_md,
    int_regn_start=0,
    int_regn_end=100,
    int_regn_step=5,
    cv=5,
    njob=1,
):
    """Scan candidate regions, keep the one with the best logit accuracy, save it.

    Candidate ``ups`` values are ``arange(int_regn_start, int_regn_end + 1,
    int_regn_step)``; each is paired with ``dws = max(ups - 3, 0)``.  For every
    pair the features returned by ``ExtractSeqFeature`` are merged with the
    label table on ``'sgID'`` and scored with ``ml.LogitSelector(...).accu``.
    The pair with the highest accuracy (the first one on ties) is extracted
    again, refit, and its outputs are written.

    Args:
        str_f_iosg: Forwarded to ``ExtractSeqFeature`` as its first argument.
        str_f_st: Path of a tab-separated table; only its first and last
            columns are used, and the merge requires a column named 'sgID'.
        str_refgem: Forwarded to ``ExtractSeqFeature`` as its second argument.
        str_of_seq: Output path for the feature table of the chosen region
            (tab-separated, no index column).
        str_of_fesrep: Forwarded to ``__SaveFeatureReport`` with the report.
        str_of_md: Forwarded to ``ml.SaveModel`` with the model dictionary.
        int_regn_start: First candidate ``ups`` value.
        int_regn_end: Upper bound (inclusive) of the candidate ``ups`` values.
        int_regn_step: Spacing between candidate ``ups`` values.
        cv: Forwarded to ``ml.LogitSelector`` and recorded in the report.
        njob: Forwarded to ``ml.LogitSelector``.

    Returns:
        None.  Results are written via ``to_csv``, ``__SaveFeatureReport`` and
        ``ml.SaveModel``.
    """
    int_ups_regn = np.arange(int_regn_start, int_regn_end + 1, int_regn_step)
    # dws trails ups by 3 and is floored at 0.
    int_dws_regn = np.where((int_ups_regn - 3) > 0, int_ups_regn - 3, 0)
    arr_accu = np.zeros_like(int_ups_regn, dtype=np.double)

    dfm_st = pd.read_csv(str_f_st, sep='\t', index_col=None)
    # Positional selection: `.ix` and integer-list `df[[0, -1]]` no longer
    # work on current pandas, `.iloc` works on old and new releases alike.
    dfm_st = dfm_st.iloc[:, [0, -1]]

    def _split_xy(dfm_seq):
        # Join features with the labels and split into (feature frame, x, y).
        # Column 0 is skipped (presumably the sgID key -- TODO confirm) and the
        # last column is the label taken from `dfm_st`.
        dfm_xy = pd.merge(dfm_seq, dfm_st, on='sgID')
        dfm_feat = dfm_xy.iloc[:, 1:-1]
        arr_x = np.array(dfm_feat, dtype=np.double)
        # Builtin `int` replaces the removed `np.int` alias.  y is left as a
        # single-column 2-D array (it is not flattened here).
        arr_y = np.array(dfm_xy.iloc[:, [-1]], dtype=int)
        return dfm_feat, arr_x, arr_y

    # Score every candidate region.
    for i, (int_ups, int_dws) in enumerate(zip(int_ups_regn, int_dws_regn)):
        dfm_seq = ExtractSeqFeature(str_f_iosg, str_refgem, int_ups, int_dws)
        _, x, y = _split_xy(dfm_seq)
        arr_accu[i] = ml.LogitSelector(x, y, cv, njob).accu

    # Refit on the best-scoring region and persist everything.
    int_best = arr_accu.argmax()
    int_ups = int_ups_regn[int_best]
    int_dws = int_dws_regn[int_best]
    dfm_seq = ExtractSeqFeature(str_f_iosg, str_refgem, int_ups, int_dws)
    dfm_seq.to_csv(str_of_seq, sep='\t', index=False)
    dfm_x, x, y = _split_xy(dfm_seq)
    mdc = ml.LogitSelector(x, y, cv, njob)

    lst_fs = list(dfm_x.columns[mdc.idx])
    frc = Frc(fs=lst_fs, ups=int_ups, dws=int_dws, cv=cv, accu=mdc.accu, prec=mdc.prec, rec=mdc.rec, f1=mdc.f1, au=mdc.au)
    __SaveFeatureReport(frc, str_of_fesrep)
    mdlc = dict(med='logit', model=mdc.model, idx=mdc.idx, ups=int_ups, dws=int_dws)
    ml.SaveModel(mdlc, str_of_md)
def MtLassoFeatureSelection(lst_sg, lst_st, lst_ref, lst_seq, str_of_fesrep, int_ups=30, int_dws=27, flt_lmd=0.5):
    """Select features jointly across several datasets with ``ml.MtLassoSelector``.

    The four lists are parallel: entry ``i`` of each describes one dataset.
    For every dataset the features from ``ExtractSeqFeature`` are written to
    ``lst_seq[i]``, merged on ``'sgID'`` with the first and last columns of the
    table at ``lst_st[i]``, and stacked row-wise with the other datasets.

    Args:
        lst_sg: Per-dataset first argument for ``ExtractSeqFeature``.
        lst_st: Per-dataset paths of tab-separated response tables.
        lst_ref: Per-dataset second argument for ``ExtractSeqFeature``.
        lst_seq: Per-dataset output paths for the extracted feature tables.
        str_of_fesrep: Forwarded to ``__SaveFeatureReport`` with the report.
        int_ups: Forwarded to ``ExtractSeqFeature`` and recorded in the report.
        int_dws: Forwarded to ``ExtractSeqFeature`` and recorded in the report.
        flt_lmd: Forwarded to ``ml.MtLassoSelector`` as its third argument.

    Returns:
        None.  The feature report is written via ``__SaveFeatureReport``.

    Raises:
        IndexError: If ``lst_st``, ``lst_ref`` or ``lst_seq`` is shorter than
            ``lst_sg``.
    """
    lst_x = []
    lst_y = []
    ind = [0]
    lst_header = []
    # `enumerate` replaces the Python-2-only `xrange(len(...))` loop.
    for i, str_sg in enumerate(lst_sg):
        dfm = ExtractSeqFeature(str_sg, lst_ref[i], int_ups, int_dws)
        dfm.to_csv(lst_seq[i], sep='\t', index=False)
        # Positional selection via `.iloc`: `.ix` and integer-list
        # `df[[0, -1]]` no longer work on current pandas.
        dfm_y = pd.read_csv(lst_st[i], sep='\t', index_col=None).iloc[:, [0, -1]]
        dfm = pd.merge(dfm, dfm_y, on='sgID')
        lst_x.append(np.array(dfm.iloc[:, 1:-1], dtype=np.double))
        # Keep y as an (n, 1) column so the per-dataset blocks can be vstacked.
        lst_y.append(np.array(dfm.iloc[:, [-1]], dtype=np.double))
        ind.append(dfm.shape[0])
        if i == 0:
            # Feature names are taken from the first dataset only.
            lst_header = dfm.columns[1:-1].tolist()

    x = np.double(np.vstack(lst_x))
    y = np.double(np.vstack(lst_y))
    # Cumulative row counts: ind[k]..ind[k+1] delimits dataset k in x and y.
    ind = np.cumsum(ind)
    dict_opts = dict(init=2, rFlag=1, rsL2=0, ind=ind, nFlag=2)
    idx = ml.MtLassoSelector(x, y, flt_lmd, dict_opts)

    lst_fs = [lst_header[i] for i in idx]
    fr = Fr(fs=lst_fs, ups=int_ups, dws=int_dws)
    __SaveFeatureReport(fr, str_of_fesrep)
def FeatureEval(str_f_iosg, str_f_md, str_refgem, str_of_seq, str_of_st):
    """Score extracted features with a saved model and write first-column/score pairs.

    The model dictionary loaded from ``str_f_md`` supplies the ``'ups'`` and
    ``'dws'`` values for ``ExtractSeqFeature``, the evaluator to use
    (``'med'``: ``'lasso'`` or ``'logit'``), the fitted ``'model'`` and the
    feature index ``'idx'``.

    Args:
        str_f_iosg: Forwarded to ``ExtractSeqFeature`` as its first argument.
        str_f_md: Forwarded to ``ml.LoadModel``.
        str_refgem: Forwarded to ``ExtractSeqFeature`` as its second argument.
        str_of_seq: Output path for the extracted feature table
            (tab-separated, no index column).
        str_of_st: Output path for the table holding the first feature-table
            column and the new ``score`` column.

    Returns:
        None.

    Raises:
        ValueError: If the model's ``'med'`` entry is neither ``'lasso'`` nor
            ``'logit'``.
    """
    # NOTE(review): this file defines FeatureEval a second time further down;
    # the later definition replaces this one when the module is loaded.
    # Confirm which version is meant to be kept.
    mdl = ml.LoadModel(str_f_md)
    dfm = ExtractSeqFeature(str_f_iosg, str_refgem, mdl['ups'], mdl['dws'])
    dfm.to_csv(str_of_seq, sep='\t', index=False)
    # Every column after the first is treated as a numeric feature; `.iloc`
    # replaces `.ix`, which no longer exists on current pandas.
    x = np.array(dfm.iloc[:, 1:], dtype=np.double)

    if mdl['med'] == 'lasso':
        y = ml.LassoEval(mdl['model'], x, mdl['idx'])
    elif mdl['med'] == 'logit':
        y = ml.LogitEval(mdl['model'], x, mdl['idx'])
    else:
        # Fail loudly instead of writing a score column filled with None.
        raise ValueError('unsupported model type: %r' % (mdl['med'],))

    dfm['score'] = y
    # First column plus the freshly appended score column.
    dfm_y = dfm.iloc[:, [0, -1]]
    dfm_y.to_csv(str_of_st, sep='\t', index=False)
def LogitFeatureSelection(
    str_f_iosg,
    str_f_st,
    str_refgem,
    str_of_seq,
    str_of_fesrep,
    str_of_md,
    int_ups=30,
    int_dws=27,
    cv=5,
    niter=1000,
    njob=1,
):
    """Fit ``ml.LogitSelector`` on one fixed region and save report and model.

    Features from ``ExtractSeqFeature`` are written to ``str_of_seq``, merged
    on ``'sgID'`` with the first and last columns of the table at
    ``str_f_st``, and passed to ``ml.LogitSelector``.

    Args:
        str_f_iosg: Forwarded to ``ExtractSeqFeature`` as its first argument.
        str_f_st: Path of a tab-separated table; only its first and last
            columns are used, and the merge requires a column named 'sgID'.
        str_refgem: Forwarded to ``ExtractSeqFeature`` as its second argument.
        str_of_seq: Output path for the extracted feature table
            (tab-separated, no index column).
        str_of_fesrep: Forwarded to ``__SaveFeatureReport`` with the report.
        str_of_md: Forwarded to ``ml.SaveModel`` with the model dictionary.
        int_ups: Forwarded to ``ExtractSeqFeature``; stored in report and model.
        int_dws: Forwarded to ``ExtractSeqFeature``; stored in report and model.
        cv: Forwarded to ``ml.LogitSelector`` and recorded in the report.
        niter: Forwarded to ``ml.LogitSelector`` and recorded in the report.
        njob: Forwarded to ``ml.LogitSelector``.

    Returns:
        None.  Results are written via ``to_csv``, ``__SaveFeatureReport`` and
        ``ml.SaveModel``.
    """
    dfm_seq = ExtractSeqFeature(str_f_iosg, str_refgem, int_ups, int_dws)
    dfm_seq.to_csv(str_of_seq, sep="\t", index=False)

    dfm_st = pd.read_csv(str_f_st, sep="\t", index_col=None)
    # Positional selection via `.iloc`: `.ix` and integer-list `df[[0, -1]]`
    # no longer work on current pandas.
    dfm_st = dfm_st.iloc[:, [0, -1]]
    dfm_xy = pd.merge(dfm_seq, dfm_st, on="sgID")

    # Column 0 is skipped (presumably the sgID key -- TODO confirm); the last
    # column is the label taken from the table at `str_f_st`.
    dfm_x = dfm_xy.iloc[:, 1:-1]
    x = np.array(dfm_x, dtype=np.double)
    # Builtin `int` replaces the removed `np.int` alias.  y is left as a
    # single-column 2-D array (it is not flattened here).
    y = np.array(dfm_xy.iloc[:, [-1]], dtype=int)

    # NOTE(review): LogitRegionOptimizer in this file calls
    # ml.LogitSelector(x, y, cv, njob) without `niter` and builds Frc without
    # `idx`/`niter`.  Confirm which call shape matches ml.LogitSelector / Frc.
    mdc = ml.LogitSelector(x, y, cv, niter, njob)

    lst_fs = list(dfm_x.columns[mdc.idx])
    frc = Frc(
        fs=lst_fs,
        idx=mdc.idx,
        ups=int_ups,
        dws=int_dws,
        niter=niter,
        cv=cv,
        accu=mdc.accu,
        prec=mdc.prec,
        rec=mdc.rec,
        f1=mdc.f1,
        au=mdc.au,
    )
    __SaveFeatureReport(frc, str_of_fesrep)
    mdlc = dict(med="logit", model=mdc.model, idx=mdc.idx, ups=int_ups, dws=int_dws)
    ml.SaveModel(mdlc, str_of_md)
def LassoFeatureSelection(
    str_f_iosg, str_f_st, str_refgem, str_of_seq, str_of_fesrep, str_of_md, int_ups=30, int_dws=27, cv=5, njob=1
):
    """Fit ``ml.LassoSelector`` on one fixed region and save report and model.

    Features from ``ExtractSeqFeature`` are written to ``str_of_seq``, merged
    on ``'sgID'`` with the first and last columns of the table at
    ``str_f_st``, and passed to ``ml.LassoSelector``.

    Args:
        str_f_iosg: Forwarded to ``ExtractSeqFeature`` as its first argument.
        str_f_st: Path of a tab-separated table; only its first and last
            columns are used, and the merge requires a column named 'sgID'.
        str_refgem: Forwarded to ``ExtractSeqFeature`` as its second argument.
        str_of_seq: Output path for the extracted feature table
            (tab-separated, no index column).
        str_of_fesrep: Forwarded to ``__SaveFeatureReport`` with the report.
        str_of_md: Forwarded to ``ml.SaveModel`` with the model dictionary.
        int_ups: Forwarded to ``ExtractSeqFeature``; stored in report and model.
        int_dws: Forwarded to ``ExtractSeqFeature``; stored in report and model.
        cv: Forwarded to ``ml.LassoSelector`` and recorded in the report.
        njob: Forwarded to ``ml.LassoSelector``.

    Returns:
        None.  Results are written via ``to_csv``, ``__SaveFeatureReport`` and
        ``ml.SaveModel``.
    """
    dfm_seq = ExtractSeqFeature(str_f_iosg, str_refgem, int_ups, int_dws)
    dfm_seq.to_csv(str_of_seq, sep="\t", index=False)

    dfm_st = pd.read_csv(str_f_st, sep="\t", index_col=None)
    # Positional selection via `.iloc`: `.ix` and integer-list `df[[0, -1]]`
    # no longer work on current pandas.
    dfm_st = dfm_st.iloc[:, [0, -1]]
    dfm_xy = pd.merge(dfm_seq, dfm_st, on="sgID")

    # Column 0 is skipped (presumably the sgID key -- TODO confirm); the last
    # column is the response taken from the table at `str_f_st`.
    dfm_x = dfm_xy.iloc[:, 1:-1]
    x = np.array(dfm_x, dtype=np.double)
    y = np.array(dfm_xy.iloc[:, [-1]], dtype=np.double).ravel()

    # NOTE(review): LassoRegionOptimizer in this file calls
    # ml.LassoSelector(x, y, cv, niter, njob) and builds Fr with `idx` and
    # `niter`.  If the selector takes its arguments positionally in that
    # order, `njob` lands in the `niter` slot here -- confirm against ml.
    md = ml.LassoSelector(x, y, cv, njob)

    lst_fs = list(dfm_x.columns[md.idx])
    fr = Fr(fs=lst_fs, ups=int_ups, dws=int_dws, cv=cv, cor=md.cor, r2=md.r2)
    __SaveFeatureReport(fr, str_of_fesrep)
    mdl = dict(med="lasso", model=md.model, idx=md.idx, ups=int_ups, dws=int_dws)
    ml.SaveModel(mdl, str_of_md)
def FeatureEval(str_f_iosg, str_f_md, str_refgem, str_of_seq, str_of_st):
    """Score extracted features with a saved model and write an annotated, ranked table.

    The model dictionary loaded from ``str_f_md`` supplies the ``"ups"`` and
    ``"dws"`` values for ``ExtractSeqFeature``, the evaluator to use
    (``"med"``: ``"lasso"`` or ``"logit"``), the fitted ``"model"`` and the
    feature index ``"idx"``.  The scores are joined back onto the input table
    on ``"sgID"``, the ``c_site`` column is dropped, and rows are sorted by
    descending score.

    Args:
        str_f_iosg: Path of a header-less, tab-separated table with seven
            columns (named here sgID, chrom, strand, sbeg, send, qseq,
            c_site); also forwarded to ``ExtractSeqFeature``.
        str_f_md: Forwarded to ``ml.LoadModel``.
        str_refgem: Forwarded to ``ExtractSeqFeature`` as its second argument.
        str_of_seq: Output path for the extracted feature table
            (tab-separated, no index column).
        str_of_st: Output path for the scored table (tab-separated, no index
            column).

    Returns:
        None.

    Raises:
        ValueError: If the model's ``"med"`` entry is neither ``"lasso"`` nor
            ``"logit"``.
    """
    # NOTE(review): an earlier definition of FeatureEval exists in this file;
    # being defined later, this one is the version in effect after import.
    mdl = ml.LoadModel(str_f_md)
    dfm = ExtractSeqFeature(str_f_iosg, str_refgem, mdl["ups"], mdl["dws"])
    dfm.to_csv(str_of_seq, sep="\t", index=False)
    # Every column after the first is treated as a numeric feature; `.iloc`
    # replaces `.ix`, which no longer exists on current pandas.
    x = np.array(dfm.iloc[:, 1:], dtype=np.double)

    if mdl["med"] == "lasso":
        y = ml.LassoEval(mdl["model"], x, mdl["idx"])
    elif mdl["med"] == "logit":
        y = ml.LogitEval(mdl["model"], x, mdl["idx"])
    else:
        # Fail loudly instead of writing a score column filled with None.
        raise ValueError("unsupported model type: %r" % (mdl["med"],))

    dfm["score"] = y
    # First column (must be "sgID" for the merge below) plus the new score.
    dfm_y = dfm.iloc[:, [0, -1]]

    dfm_iosg = pd.read_csv(str_f_iosg, sep="\t", header=None, index_col=None)
    dfm_iosg.columns = ["sgID", "chrom", "strand", "sbeg", "send", "qseq", "c_site"]
    dfm_out = pd.merge(dfm_iosg, dfm_y, on="sgID")
    dfm_out.drop("c_site", axis=1, inplace=True)
    # `sort_index(by=...)` is gone from current pandas; `sort_values` is the
    # supported way to order rows by a column.
    dfm_out.sort_values(by=["score"], ascending=False, inplace=True)
    dfm_out.to_csv(str_of_st, sep="\t", index=False)