def eval(flags):
    """Score a saved prediction CSV against the ground-truth labels.

    NOTE(review): shadows the builtin ``eval``; name kept for caller
    compatibility.

    Args:
        flags: config object; uses ``pred_path`` (CSV holding
            ``class1..classN`` probability columns) and ``fold``
            (validation split index, used for non-stage1 paths).

    Side effects:
        Prints ``(pred_path, cross-entropy score)`` to stdout.
    """
    name = flags.pred_path
    yp = pd.read_csv(name)
    # Number of probability columns named like 'class1', 'class2', ...
    classes = len([i for i in yp.columns.values if 'class' in i])
    yp = yp[['class%d' % i for i in range(1, classes + 1)]].values
    myDB = personalDB(flags, name="full")
    if "stage1" in name:
        # Stage-1 labels are 1-based in the table; shift to 0-based.
        y = myDB.data['test_variants_filter']['Class'] - 1
    else:
        # Otherwise score against this fold's validation rows.
        myDB.get_split()
        va = myDB.split[flags.fold][1]
        y = np.argmax(myDB.y[va], axis=1)
    # Labels are 0-indexed, so any label >= classes has no matching
    # prediction column: collapse the label set via the 4-class mapping.
    # (Fixed off-by-one: original used '> classes', which missed the
    # boundary case max(y) == classes.)
    if np.max(y) >= classes:
        y = np.argmax(to4c(onehot_encode(y)), axis=1)
    score = cross_entropy(y, yp)
    print(name, score, '\n')
def train_rnn(flags):
    """Train the RNN model on the personal database."""
    db = personalDB(flags)
    BaoRnn(flags, db).train()
def train_embedding(flags):
    """Train the word-embedding model on the personal database."""
    db = personalDB(flags)
    BaoEmbedding(flags, db).train()
def show_embedding(flags):
    """Visualize the trained CBOW embedding weight matrix."""
    db = personalDB(flags)
    BaoEmbedding(flags, db).show_embedding("CBOW/embedding/w:0")
def predict_nn(flags):
    """Run inference with the feed-forward NN model."""
    db = personalDB(flags)
    BaoNN(flags, db).predict()
def build_feature(flags, feas=None):
    """Assemble stacked feature matrices for the train/test rows.

    Args:
        flags: config object; uses ``task``, ``fold``, ``folds``, and
            ``load_path`` (the last only when 'd2v' is requested).
        feas: list of feature-group names to build; defaults to the
            standard eight groups below.
            NOTE(review): when 'meta' is in ``flags.task`` this mutates a
            caller-supplied ``feas`` list via append — confirm intended.

    Returns:
        (X, y, Xt, yt, te_rows): horizontally stacked train features and
        labels, test features and labels, and the test-row selector.
    """
    if feas is None:
        feas = ['domain','full_text','gene_text','var_text','share','gene','pattern','onehot']
    if 'meta' in flags.task:
        feas.append('meta')
    # Full-corpus DB with per-sample tf-idf over all three text tables.
    myDB = personalDB(flags, name='full')
    myDB.get_per_sample_tfidf(['training_text','test_text_filter','stage2_test_text'], "Text")
    myDB.get_split()
    # Gene-window and variation-window text DBs (window W=10, bar=0).
    geneDB = geneTextDB(flags, W=10, bar=0)
    geneDB.poke()
    geneDB.get_per_sample_tfidf(['training_text','test_text_filter','stage2_test_text'], "Text")
    varDB = geneTextDB(flags, tag='variation', W=10, bar=0)
    varDB.poke()
    varDB.get_per_sample_tfidf(['training_text','test_text_filter','stage2_test_text'], "Text")
    fold, folds = flags.fold, flags.folds  # NOTE(review): ``folds`` is unused below
    if fold >= 0:
        # Cross-validation mode: take this fold's stored train/test split.
        tr_rows, te_rows = myDB.split[fold]
        tr_rows = tr_rows.tolist()
        te_rows = te_rows.tolist()
    else:
        # Full-train mode: no train rows; test set selected by task name.
        tr_rows = None
        te_rows = 'stage2' if 'stage2' in flags.task else 'stage1'
    # Vocabulary selection (term-frequency mode) for each text source.
    words = myDB.select_top_k_words(['training_text','test_text_filter'], "Text", mode="tf", k=5, slack=0)
    myDB.get_words(words)
    gwords = geneDB.select_top_k_words(['training_text','test_text_filter'], "Text", mode="tf", k=2, slack=0)
    geneDB.get_words(gwords)
    vwords = varDB.select_top_k_words(['training_text','test_text_filter'], "Text", mode="tf", k=5, slack=0)
    varDB.get_words(vwords)
    # Each enabled group appends (train, test) feature arrays via fill().
    X, Xt = [], []
    if 'meta' in feas:
        # Meta-features from saved model predictions; earlier backup paths
        # kept commented out for reference.
        paths = [#"comps/personal/baobao/backup/91_88_87_86_74",
                 # "comps/personal/baobao/backup/107_110_105_102_102",
                 "comps/personal/baobao/backup/4c_52_51_47_46_39"]
        for path in paths:
            fill(X, Xt, *get_meta(myDB, tr_rows, te_rows, path))
    if 'text_len' in feas:
        fill(X, Xt, *get_text_len(myDB, tr_rows, te_rows))
    if 'domain' in feas:
        fill(X, Xt, *domain(myDB, tr_rows, te_rows))
    if 'full_text' in feas:
        for mode in ["tf"]:
            fill(X, Xt, *get_count(myDB, tr_rows, te_rows, mode))
    if 'gene_text' in feas:
        for mode in ["tf"]:
            fill(X, Xt, *get_count(geneDB, tr_rows, te_rows, mode))
    if 'var_text' in feas:
        for mode in ["tf"]:
            fill(X, Xt, *get_count(varDB, tr_rows, te_rows, mode))
    if 'share' in feas:
        fill(X, Xt, *get_share(myDB, tr_rows, te_rows))
    if 'pattern' in feas:
        # Mutation-nomenclature regexes (e.g. 'A123B') plus 'del'; further
        # candidate patterns kept commented out.
        patterns = [r'[a-zA-Z][0-9]+[a-zA-Z]*','del']#,'ins','fus','trunc','methy','amp','sil','expr','splice','exon'
        fill(X, Xt, *get_pattern(myDB, tr_rows, te_rows, patterns))
    if 'gene' in feas:
        fill(X, Xt, *get_gene(myDB, tr_rows, te_rows))
    if 'd2v' in feas:
        fill(X, Xt, *get_d2v(flags.load_path, tr_rows, te_rows))
    if 'onehot' in feas:
        fill(X, Xt, *onehot_gene(myDB, tr_rows, te_rows))
    y, yt = myDB.gety(tr_rows, te_rows)
    return np.hstack(X), y, np.hstack(Xt), yt, te_rows
def preprocess(flags):
    """Build/refresh the full personal database caches."""
    db = personalDB(flags, name="full")
    db.poke()
def train_d2v(flags):
    """Train the doc2vec model on the personal database."""
    db = personalDB(flags)
    D2V(flags, db).train()
def show_d2v(flags):
    """Visualize the trained doc2vec embedding weight matrix."""
    db = personalDB(flags)
    D2V(flags, db).show_embedding("D2V/embedding/w:0")
def test_cnn(flags):
    """Run inference with the CNN model over the full DB plus gene-text DB."""
    db = personalDB(flags, name='full')
    gene_db = geneTextDB(flags, W=10, bar=0)
    BaoCnn(flags, db, gene_db).predict()
def train_cnn(flags):
    """Train the CNN model over the full DB plus gene-text DB."""
    db = personalDB(flags, name='full')
    gene_db = geneTextDB(flags, W=10, bar=0)
    BaoCnn(flags, db, gene_db).train()