コード例 #1
0
ファイル: eval.py プロジェクト: daxiongshu/kaggle-review
def eval(flags):
    """Score the prediction CSV at ``flags.pred_path`` and print the result.

    The CSV is expected to hold columns named ``class1`` .. ``classN``; N is
    taken as the number of columns whose name contains ``'class'``.  Labels
    come from the ``test_variants_filter`` table when the path contains
    ``"stage1"``, otherwise from the validation rows of fold ``flags.fold``.
    Prints the path, the ``cross_entropy`` score and a newline; returns None.
    """
    pred_path = flags.pred_path
    pred_frame = pd.read_csv(pred_path)

    # Number of prediction columns present in the file.
    n_classes = sum(1 for col in pred_frame.columns.values if 'class' in col)
    wanted_cols = ['class%d' % k for k in range(1, n_classes + 1)]
    probs = pred_frame[wanted_cols].values

    db = personalDB(flags, name="full")
    if "stage1" not in pred_path:
        db.get_split()
        valid_rows = db.split[flags.fold][1]
        labels = np.argmax(db.y[valid_rows], axis=1)
    else:
        # Stored classes are shifted down by one to line up with column order.
        labels = db.data['test_variants_filter']['Class'] - 1

    # Labels span more classes than the file predicts: remap via to4c.
    # NOTE(review): labels look 0-based, so `>=` may be intended -- confirm.
    if np.max(labels) > n_classes:
        labels = np.argmax(to4c(onehot_encode(labels)), axis=1)

    result = cross_entropy(labels, probs)
    print(pred_path, result, '\n')
コード例 #2
0
def train_rnn(flags):
    """Create a ``personalDB`` from ``flags`` and train a ``BaoRnn`` on it."""
    database = personalDB(flags)
    BaoRnn(flags, database).train()
コード例 #3
0
def train_embedding(flags):
    """Create a ``personalDB`` from ``flags`` and train a ``BaoEmbedding`` on it."""
    database = personalDB(flags)
    BaoEmbedding(flags, database).train()
コード例 #4
0
def show_embedding(flags):
    """Ask a ``BaoEmbedding`` model to show the ``CBOW/embedding/w:0`` variable."""
    database = personalDB(flags)
    embedder = BaoEmbedding(flags, database)
    embedder.show_embedding("CBOW/embedding/w:0")
コード例 #5
0
ファイル: nn.py プロジェクト: daxiongshu/kaggle-review
def predict_nn(flags):
    """Create a ``personalDB`` from ``flags`` and run ``BaoNN.predict`` on it."""
    database = personalDB(flags)
    BaoNN(flags, database).predict()
コード例 #6
0
ファイル: fe.py プロジェクト: daxiongshu/kaggle-review
def build_feature(flags,feas=None):
    """Assemble the train/test feature matrices for the requested feature groups.

    Args:
        flags: configuration object; this function reads ``flags.task``,
            ``flags.fold`` and (only for the ``'d2v'`` group) ``flags.load_path``.
        feas: iterable of feature-group names to include.  When None, a default
            set is used.  ``'meta'`` is added automatically when ``'meta'``
            occurs in ``flags.task``.  The caller's list is never modified.

    Returns:
        A tuple ``(X, y, Xt, yt, te_rows)`` where ``X`` / ``Xt`` are the
        horizontally stacked train / test feature blocks, ``y`` / ``yt`` come
        from ``myDB.gety`` and ``te_rows`` identifies the test rows (a list of
        row indices for ``fold >= 0``, otherwise ``'stage1'`` or ``'stage2'``).
    """
    if feas is None:
        feas = ['domain','full_text','gene_text','var_text','share','gene','pattern','onehot']
    if 'meta' in flags.task:
        # Build a new list instead of appending in place, so a list passed in
        # by the caller is not mutated (and does not grow across calls).
        feas = list(feas) + ['meta']

    myDB = personalDB(flags,name='full')
    myDB.get_per_sample_tfidf(['training_text','test_text_filter','stage2_test_text'],"Text")
    myDB.get_split()

    # Text database built around the gene tag (geneTextDB's default tag).
    geneDB = geneTextDB(flags,W=10,bar=0)
    geneDB.poke()
    geneDB.get_per_sample_tfidf(['training_text','test_text_filter','stage2_test_text'],"Text")

    # Same kind of database, built around the variation tag.
    varDB = geneTextDB(flags,tag='variation',W=10,bar=0)
    varDB.poke()
    varDB.get_per_sample_tfidf(['training_text','test_text_filter','stage2_test_text'],"Text")

    fold = flags.fold
    if fold>=0:
        # Cross-validation: use the row indices of the selected fold.
        tr_rows,te_rows = myDB.split[fold]
        tr_rows = tr_rows.tolist()
        te_rows = te_rows.tolist()
    else:
        # No fold selected: the test set is named by stage instead of by rows.
        tr_rows = None
        te_rows = 'stage2' if 'stage2' in flags.task else 'stage1'

    # Restrict each database to its top-k words by term frequency.
    words = myDB.select_top_k_words(['training_text','test_text_filter'],"Text",mode="tf",k=5,slack=0)
    myDB.get_words(words)

    gwords = geneDB.select_top_k_words(['training_text','test_text_filter'],"Text",mode="tf",k=2,slack=0)
    geneDB.get_words(gwords)

    vwords = varDB.select_top_k_words(['training_text','test_text_filter'],"Text",mode="tf",k=5,slack=0)
    varDB.get_words(vwords)

    # Per-group feature blocks; presumably ``fill`` appends the train block to
    # X and the test block to Xt -- confirm against its definition.  The order
    # of the calls below fixes the column order of the stacked result.
    X,Xt = [],[]

    if 'meta' in feas:
        paths = [#"comps/personal/baobao/backup/91_88_87_86_74",
           # "comps/personal/baobao/backup/107_110_105_102_102",
           "comps/personal/baobao/backup/4c_52_51_47_46_39"]
        for path in paths:
            fill(X,Xt,*get_meta(myDB, tr_rows, te_rows, path))
    if 'text_len' in feas:
        fill(X,Xt,*get_text_len(myDB, tr_rows, te_rows))
    if 'domain' in feas:
        fill(X,Xt,*domain(myDB, tr_rows, te_rows))

    if 'full_text' in feas:
        fill(X,Xt,*get_count(myDB, tr_rows, te_rows, "tf"))

    if 'gene_text' in feas:
        fill(X,Xt,*get_count(geneDB, tr_rows, te_rows, "tf"))

    if 'var_text' in feas:
        fill(X,Xt,*get_count(varDB, tr_rows, te_rows, "tf"))

    if 'share' in feas:
        fill(X,Xt,*get_share(myDB, tr_rows, te_rows))

    if 'pattern' in feas:
        patterns = [r'[a-zA-Z][0-9]+[a-zA-Z]*','del']#,'ins','fus','trunc','methy','amp','sil','expr','splice','exon']
        fill(X,Xt,*get_pattern(myDB, tr_rows, te_rows,patterns))

    if 'gene' in feas:
        fill(X,Xt,*get_gene(myDB, tr_rows, te_rows))

    if 'd2v' in feas:
        fill(X,Xt,*get_d2v(flags.load_path, tr_rows, te_rows))

    if 'onehot' in feas:
        fill(X,Xt,*onehot_gene(myDB, tr_rows, te_rows))

    y,yt = myDB.gety(tr_rows,te_rows)
    return np.hstack(X),y,np.hstack(Xt),yt,te_rows
コード例 #7
0
def preprocess(flags):
    """Create the ``"full"`` ``personalDB`` for ``flags`` and call its ``poke``."""
    personalDB(flags, name="full").poke()
コード例 #8
0
def train_d2v(flags):
    """Create a ``personalDB`` from ``flags`` and train a ``D2V`` model on it."""
    database = personalDB(flags)
    D2V(flags, database).train()
コード例 #9
0
def show_d2v(flags):
    """Ask a ``D2V`` model to show the ``D2V/embedding/w:0`` variable."""
    database = personalDB(flags)
    doc2vec = D2V(flags, database)
    doc2vec.show_embedding("D2V/embedding/w:0")
コード例 #10
0
def test_cnn(flags):
    """Build a ``BaoCnn`` from the full and gene-text databases and call ``predict``."""
    full_db = personalDB(flags, name='full')
    gene_db = geneTextDB(flags, W=10, bar=0)
    BaoCnn(flags, full_db, gene_db).predict()
コード例 #11
0
def train_cnn(flags):
    """Build a ``BaoCnn`` from the full and gene-text databases and call ``train``."""
    full_db = personalDB(flags, name='full')
    gene_db = geneTextDB(flags, W=10, bar=0)
    BaoCnn(flags, full_db, gene_db).train()