def self_train():
    """Self-training (pseudo-labelling) loop on CIFAR-10.

    Trains an initial model on a small labelled subset of batch 1, then for a
    fixed number of iterations: predicts on the unlabelled remainder, adds the
    most confident predictions (within 0.5% of the iteration's maximum
    confidence) as pseudo-labels, and retrains. Accuracy is reported against
    held-out data from a *different* batch each iteration.

    Relies on module-level helpers: ``utils`` (unpickle/data_fromCIFAR10/
    predict_data/accy), ``net.Net3``, and Keras ``load_model``.
    """
    # NOTE: renamed from `dict`/`iter` — don't shadow Python builtins.
    train_dict = utils.unpickle('cifar_10/data_batch_1')
    test_dict = utils.unpickle('cifar_10/data_batch_2')
    # dict content = [b'batch_label', b'labels', b'data', b'filenames']

    label_data, label = utils.data_fromCIFAR10(train_dict, 500)
    # BUG FIX: the test set must come from the second batch (test_dict) so it
    # is disjoint from both the labelled and unlabelled training data; the
    # original read it from the same batch as the training data even though
    # test_dict was loaded and the comment below says "another batch".
    test_data, test_label = utils.data_fromCIFAR10(test_dict, 5000)

    # Initial model trained on the small labelled subset; it is persisted to
    # 'test.h5' and reloaded from disk at the start of every iteration.
    model = net.Net3(label_data, label, save_model='test.h5', validate=0.1)

    all_data, all_label = utils.data_fromCIFAR10(train_dict, 10000)

    # Everything past index 6000 is treated as unlabelled; the true labels
    # are kept only for potential inspection/debugging.
    unlabel_data = all_data[6000:]
    true_label = all_label[6000:]

    predict_max_set = []
    n_iter = 10

    for i in range(n_iter):
        print('Iteration {}'.format(i + 1))
        model = load_model('test.h5')
        predict_raw = utils.predict_data(unlabel_data, model)
        prediction = np.argmax(predict_raw, axis=1)
        # Global maximum confidence this iteration; used as the pseudo-label
        # acceptance threshold below.
        predict_max = np.max(predict_raw)
        predict_max_set.append(predict_max)

        # Test data from another batch, separate from label and unlabel data.
        test_raw = utils.predict_data(test_data, model)
        test_predict = np.argmax(test_raw, axis=1).reshape(-1, 1)

        # Accept an unlabelled sample as a pseudo-label only when its top
        # confidence is within 0.5% of the iteration's maximum confidence.
        aug_data = []
        aug_label = []
        for k in range(predict_raw.shape[0]):
            if np.max(predict_raw[k]) > predict_max * 0.995:
                aug_data.append(unlabel_data[k])
                aug_label.append(prediction[k])

        # (removed: a no-op `prediction.reshape(-1, 1)` whose result was
        # discarded, and unused `count`/`index` locals)

        new_label_data = label_data + aug_data
        new_label = label + aug_label

        utils.accy(test_predict, test_label)

        # Retrain from the previous checkpoint with the augmented set; Net3
        # both loads and re-saves 'test.h5'.
        model = net.Net3(new_label_data,
                         new_label,
                         'test.h5',
                         'test.h5',
                         validate=0.05)
        del model

        if i == n_iter - 1:
            utils.accy(test_predict, test_label)
# Beispiel #2
# 0
def predict_29(val,register,app,video,act):
    """Train a model and predict activity for users who registered on day 29.

    Builds per-user features over a 2-day window after registration, trains on
    earlier registration cohorts, and either validates (val truthy) or
    produces a final submission. Relies on module-level helpers docount/doiq/
    is_active/predict_data/showresults/showtop/getbest — presumably defined
    elsewhere in this project (not visible here).
    """
    def get_features(df,d1,d2):
        # Per-user features over the inclusive day window [d1, d2].
        # Day columns are rebased so that day 0 is d1 and `lastday` (= d2-d1)
        # is the final day of the window; for the 2-day windows used by the
        # callers below, lastday-1 is therefore the first day.
        # NOTE(review): assigning to .day on a boolean-indexed slice relies on
        # pandas chained-assignment behavior (SettingWithCopy territory) —
        # verify against the pandas version in use.
        tapp = app[(app.day>=d1) & (app.day<=d2)]
        tact = act[(act.day>=d1) & (act.day<=d2)]
        tvideo = video[(video.day>=d1) & (video.day<=d2)]
        tapp.day = tapp.day - d1
        tact.day = tact.day - d1
        tvideo.day = tvideo.day - d1
        lastday = d2-d1
        #df['register_time'] = d2-df.register_day+1

        # docount(df, t, prefix, keys) evidently adds a count column named
        # '<prefix>$<key>#' (the derived columns below read exactly that name).
        df = docount(df,tapp,'app',['user_id']);gc.collect()
        df = docount(df,tapp[tapp.day==lastday],'last_day_app',['user_id']);gc.collect()
        #df['app_mean#'] = df['app$user_id#']/2
        df = docount(df,tvideo,'video',['user_id']);gc.collect()
        # Small epsilon avoids division by zero when the window has no events.
        df['videorate'] = df['video$user_id#']/(tvideo.shape[0]+0.000001)
        #df['video_mean#'] = df['video$user_id#']/2
        df = docount(df,tact,'act',['user_id']);gc.collect()
        df = docount(df,tact[tact.day==lastday],'last_day_act',['user_id']);gc.collect()
        df = docount(df,tact[tact.day==lastday-1],'first_day_act',['user_id']);gc.collect()
        df['actrate'] = df['act$user_id#']/(tact.shape[0]+0.000001)
        df['last_day_actrate'] = df['last_day_act$user_id#']/(tact.shape[0]+0.000001)
        df['first_day_actrate'] = df['first_day_act$user_id#']/(tact.shape[0]+0.000001)
        df['actrate_gap'] = df['last_day_actrate'] - df['first_day_actrate']
        df['act_gap'] = df['last_day_act$user_id#'] - df['first_day_act$user_id#']
        #df['act_mean#'] = df['act$user_id#']/2
        #page_list = list(tact['page'].unique())
        def iszero(s):
            # Binary indicator: 1 when the user had any events, else 0.
            if s==0:
                return 0
            return 1
        df['act0'] = df['act$user_id#'].apply(iszero)
        df['video0'] = df['video$user_id#'].apply(iszero)



        # NOTE(review): tact[mask1][mask2] chains two boolean masks built from
        # differently-sized frames; this works only if pandas aligns the second
        # mask by index — confirm on the pandas version in use.
        for c in [1]:
            df = docount(df,tact[tact.day==lastday][tact['page']==c],'last_day_act_page='+str(c),['user_id']);gc.collect()

        # Per-page counts and their share of the user's total actions.
        for c in [0,1,2,3,4]:
            df = docount(df,tact[tact['page']==c],'act_page='+str(c),['user_id']);gc.collect()
            df['act_page='+str(c)+'$user_id#rate'] = df['act_page='+str(c)+'$user_id#']/(df['act$user_id#']+0.00001)

        # Combined page groups (pages 2+3, and 0+2+3).
        df['act_page=23$user_id#'] = df['act_page=2$user_id#'] + df['act_page=3$user_id#']
        df['act_page=023$user_id#'] = df['act_page=2$user_id#'] + df['act_page=3$user_id#']+df['act_page=0$user_id#']


        action_list = list(tact['action_type'].unique())
        # Per-action-type counts (overall and last-day) plus rate features.
        for c in [0,1,2,3,4,5]:
            df = docount(df,tact[tact['action_type']==c],'action_type='+str(c),['user_id']);gc.collect()
            df = docount(df,tact[tact.day==lastday][tact['action_type']==c],'last_day_action_type='+str(c),['user_id']);gc.collect()
            df['action_type='+str(c)+'$user_id#rate'] = df['action_type='+str(c)+'$user_id#']/(df['act$user_id#']+0.00001)


        # Temporary alias so doiq can group act rows by this user as an author.
        df['author_id'] = df['user_id']

        # doiq(df, t, prefix, keys, col) evidently adds a column named
        # '<prefix>$<key>_by_<col>_iq' — presumably a unique-count ("iq") of
        # `col` per key; verify against the helper's definition.
        df = doiq(df,tact[tact.day==lastday],'last_day_act',['user_id'],'video_id');gc.collect()
        df = doiq(df,tact[tact.day==lastday],'last_day_act',['user_id'],'author_id');gc.collect()
        # Mean distinct videos per distinct author interacted with (last day).
        df['last_day_act$author_video_m'] = df['last_day_act$user_id_by_video_id_iq']/df['last_day_act$user_id_by_author_id_iq']

        df = doiq(df,tact[tact.day==lastday-1],'first_day_act',['user_id'],'video_id');gc.collect()
        df = doiq(df,tact[tact.day==lastday-1],'first_day_act',['user_id'],'author_id');gc.collect()
        df['first_day_act$author_video_m'] = df['first_day_act$user_id_by_video_id_iq']/df['first_day_act$user_id_by_author_id_iq']


        # Same ratio over the last two days of the window.
        df = doiq(df,tact[tact.day>=lastday-1],'last2_day_act',['user_id'],'video_id');gc.collect()
        df = doiq(df,tact[tact.day>=lastday-1],'last2_day_act',['user_id'],'author_id');gc.collect()
        df['last2_day_act$author_video_m'] = df['last2_day_act$user_id_by_video_id_iq']/df['last2_day_act$user_id_by_author_id_iq']



        # Drop the raw registration day and the temporary alias column.
        del df['register_day'],df['author_id']
        return df

    def get_features_all(df,df1):
        # Cross-cohort features computed over train+test together, then split
        # back apart. `df` is train, `df1` is test; they are concatenated so
        # population-level counts see both.
        lendf = len(df)
        df= df.append(df1)
        del df1
        gc.collect()
        # Count of users sharing each register_type across the whole pool.
        df = docount(df,df,'ALL',['register_type'])

        del df['user_id']

        # NOTE(review): `ccc`, `ccc1`, and `ddd` below are leftover feature
        # inventories from earlier experiments — they are assigned but never
        # read; only `used` determines the final feature set.
        ccc = ['device_type', 'register_type', 'action_type=0$user_id#rate', 'act_page=1$user_id#', 'first_day_act$user_id_by_author_id_iq', 'action_type=2$user_id#rate', 'act_page=0$user_id#rate', 'last_day_act$author_video_m', 'action_type=1$user_id#rate', 'act_page=2$user_id#', 'actrate', 'last_day_act$user_id_by_author_id_iq', 'app$user_id#', 'last_day_act_page=1$user_id#', 'act_page=3$user_id#rate', 'last_day_action_type=0$user_id#', 'first_day_act$user_id_by_video_id_iq', 'videorate', 'act_page=1$user_id#rate', 'last2_day_act$user_id_by_author_id_iq', 'last2_day_act$user_id_by_video_id_iq', 'first_day_actrate', 'act_page=2$user_id#rate', 'last_day_actrate', 'first_day_act$author_video_m', 'last2_day_act$author_video_m', 'ALL$register_type#', 'act_page=0$user_id#', 'actrate_gap', 'action_type=3$user_id#rate', 'last_day_act$user_id#', 'act$user_id#', 'last_day_act$user_id_by_video_id_iq', 'action_type=0$user_id#', 'action_type=1$user_id#', 'act_gap', 'action_type=2$user_id#', 'action_type=3$user_id#', 'first_day_act$user_id#', 'act_page=3$user_id#', 'act_page=4$user_id#rate', 'video$user_id#', 'last_day_action_type=1$user_id#', 'act_page=23$user_id#', 'act_page=023$user_id#', 'act_page=4$user_id#', 'last_day_action_type=2$user_id#', 'last_day_action_type=3$user_id#', 'action_type=5$user_id#rate', 'action_type=5$user_id#', 'last_day_app$user_id#', 'last_day_action_type=4$user_id#', 'action_type=4$user_id#', 'last_day_action_type=5$user_id#', 'act0', 'action_type=4$user_id#rate', 'video0']
        ccc1 = [ ]

        ddd = ['action_type=2$user_id#rate','action_type=1$user_id#rate','last_day_act$user_id_by_author_id_iq',
               'last_day_act_page=1$user_id#','act_page=3$user_id#rate','first_day_act$user_id_by_video_id_iq',
               'videorate','act_page=1$user_id#rate','last2_day_act$user_id_by_author_id_iq','last2_day_act$user_id_by_video_id_iq',
               'act_page=2$user_id#rate','last_day_actrate', 'first_day_act$author_video_m','last2_day_act$author_video_m',
               'ALL$register_type#','act_page=0$user_id#','actrate_gap','action_type=3$user_id#rate',
               'last_day_act$user_id#','act$user_id#','last_day_act$user_id_by_video_id_iq', 'action_type=0$user_id#',
               'action_type=1$user_id#','act_gap', 'action_type=2$user_id#','action_type=3$user_id#',
               'first_day_act$user_id#', 'act_page=3$user_id#','act_page=4$user_id#rate', 'video$user_id#',
               'last_day_action_type=1$user_id#','act_page=23$user_id#', 'act_page=023$user_id#','act_page=4$user_id#',
               'last_day_action_type=2$user_id#','last_day_action_type=3$user_id#', 'action_type=5$user_id#rate',
               'action_type=5$user_id#', 'last_day_app$user_id#','last_day_action_type=4$user_id#',
               'action_type=4$user_id#','last_day_action_type=5$user_id#', 'act0', 'action_type=4$user_id#rate', 'video0']

        # Final feature whitelist actually fed to the model.
        used = ['device_type', 'register_type', 'action_type=0$user_id#rate', 'act_page=1$user_id#',
                'first_day_act$user_id_by_author_id_iq', 'act_page=0$user_id#rate','last_day_act$author_video_m',
                'act_page=2$user_id#','actrate','app$user_id#', 'last_day_action_type=0$user_id#',
                'first_day_actrate', 'action_type=5$user_id#rate', ]

        df = df[used]



        # Split back: first `lendf` rows are train, the rest are test.
        df1 = df[lendf:]
        df = df[:lendf]
        return df,df1
    
    path = '../data1/29/'

    # Feature frames are cached as CSVs under `path`; each branch rebuilds
    # only when its cache file is missing.
    if val:
        # Validation mode: users registered on day 22 are the hold-out set
        # (features from days 22-23, labels from days 24-30).
        if os.path.exists(path+'val_df.csv'):
            test_df = pd.read_csv(path+'val_df.csv')
            val_y = pd.read_csv(path+'val_y.csv')
        else:
            test_df = register[(register.register_day==22)]
            test_df = get_features(test_df,22,23)
            val_y = is_active(test_df,24,30,app,video,act)
            test_df.to_csv(path+'val_df.csv',index=False)
            val_y.to_csv(path+'val_y.csv',index=False)
        val_y = val_y['Y']
        if os.path.exists(path+'val_train_df.csv'):
            train_df = pd.read_csv(path+'val_train_df.csv')
            train_y = pd.read_csv(path+'val_train_y.csv')
        else:
            # One cohort per registration day 1..21: features from the first
            # two days after registration, label from the following week.
            train_df = pd.DataFrame()
            train_y = pd.DataFrame()
            for i in range(1,22):
                df = register[(register.register_day==i)]
                y = is_active(df,i+2,i+8,app,video,act)
                df = get_features(df,i,i+1)
                train_df = train_df.append(df)
                train_y = train_y.append(y)
            train_df.to_csv(path+'val_train_df.csv',index=False)
            train_y.to_csv(path+'val_train_y.csv',index=False)
    else:
        # Submission mode: predict for the day-29 registration cohort.
        if os.path.exists(path+'test_df.csv'):
            test_df = pd.read_csv(path+'test_df.csv')
        else:
            test_df = register[(register.register_day==29)]
            test_df = get_features(test_df,29,30)
            test_df.to_csv(path+'test_df.csv',index=False)

        if os.path.exists(path+'train_df.csv'):
            train_df = pd.read_csv(path+'train_df.csv')
            train_y = pd.read_csv(path+'train_y.csv')
        else:
            # Reuse the validation-mode caches when present (cohorts 1..21 +
            # the day-22 hold-out) instead of recomputing cohorts 1..22.
            if os.path.exists(path+'val_train_df.csv'):
                train_df = pd.read_csv(path+'val_train_df.csv')
                train_y = pd.read_csv(path+'val_train_y.csv')
                val_df = pd.read_csv(path+'val_df.csv')
                val_y = pd.read_csv(path+'val_y.csv')
                train_df = train_df.append(val_df)
                train_y = train_y.append(val_y)
            else:
                train_df = pd.DataFrame()
                train_y = pd.DataFrame()
                for i in range(1,23):
                    df = register[(register.register_day==i)]
                    y = is_active(df,i+2,i+8,app,video,act)
                    df = get_features(df,i,i+1)
                    train_df = train_df.append(df)
                    train_y = train_y.append(y)
            train_df.to_csv(path+'train_df.csv',index=False)
            train_y.to_csv(path+'train_y.csv',index=False)
    train_y = train_y['Y']

    # Preserve ids before get_features_all drops the user_id column.
    ids = test_df['user_id']
    train_df,test_df = get_features_all(train_df,test_df)

    # predict_data presumably trains (10 rounds/folds?) and returns in-sample
    # train predictions plus test predictions — verify against its definition.
    pre_train,test_y = predict_data(train_df,train_y,10,test_df,importance=1)

    if val==1:
        print (len(train_y),sum(train_y))
        showresults(train_y,pre_train)
        showresults(val_y,test_y)
        showtop(val_y,test_y,nums=1337)
        showtop(train_y,pre_train,nums=19589)
        # th=0.4: select by probability threshold in validation mode.
        return ids,test_y,getbest(ids,test_y,th=0.4)
    else:
        showresults(train_y,pre_train)
        showtop(train_y,pre_train,nums=20926)
        # rank=1294: select a fixed number of top-ranked users for submission.
        return ids,test_y,getbest(ids,test_y,rank=1294)
# Beispiel #3
# 0
def predict_30(val, register, app, video, act):
    """Train a model and predict activity for users who registered on day 30.

    Single-day feature window (registration day only). Trains on cohorts for
    days 1..23 and either validates on the tail cohorts (val truthy) or
    predicts for the day-30 cohort. Relies on module-level helpers docount/
    doiq/domean/is_active/predict_data/showresults/showfalse/showtop/getbest.
    """
    def get_features0(df, d):
        # Per-user features from a single day `d` of video and act events.
        #tapp = app[app.day==d]
        tvideo = video[video.day == d]
        tact = act[act.day == d]
        #df = docount(df,tapp,'app',['user_id']);gc.collect()
        # docount evidently adds a count column named '<prefix>$<key>#'
        # (the derived columns below read exactly that name).
        df = docount(df, tvideo, 'video', ['user_id'])
        gc.collect()
        # Epsilon guards against division by zero on empty days.
        df['videorate'] = df['video$user_id#'] / (tvideo.shape[0] + 0.000001)
        df = docount(df, tact, 'act', ['user_id'])
        gc.collect()
        df['actrate'] = df['act$user_id#'] / (tact.shape[0] + 0.000001)

        # Per-page counts plus each page's share of the user's total actions.
        page_list = list(tact['page'].unique())
        for c in [0, 1, 2, 3, 4]:
            df = docount(df, tact[tact['page'] == c], 'act_page=' + str(c),
                         ['user_id'])
            gc.collect()
            df['act_page=' + str(c) +
               '$user_id#rate'] = df['act_page=' + str(c) + '$user_id#'] / (
                   df['act$user_id#'] + 0.00001)

        # Combined page groups (pages 2+3, and 0+2+3).
        df['act_page=23$user_id#'] = df['act_page=2$user_id#'] + df[
            'act_page=3$user_id#']
        df['act_page=023$user_id#'] = df['act_page=2$user_id#'] + df[
            'act_page=3$user_id#'] + df['act_page=0$user_id#']

        # Per-action-type counts and rates.
        action_list = list(tact['action_type'].unique())
        for c in [0, 1, 2, 3, 4, 5]:
            df = docount(df, tact[tact['action_type'] == c],
                         'action_type=' + str(c), ['user_id'])
            gc.collect()
            df['action_type=' + str(c) +
               '$user_id#rate'] = df['action_type=' + str(c) + '$user_id#'] / (
                   df['act$user_id#'] + 0.00001)

        df['action_type=01$user_id#'] = df['action_type=0$user_id#'] + df[
            'action_type=1$user_id#']

        def iszero(s):
            # Binary indicator: 1 when the user had any events, else 0.
            if s == 0:
                return 0
            return 1

        # NOTE(review): this 'pageall' computation looks broken — the loop
        # multiplies by act_page=0 (ignoring c) and the result is then
        # overwritten by the final iszero line, so 'pageall' ends up identical
        # to iszero(act_page=0). Left as-is to preserve behavior; confirm
        # intent with the author.
        df['pageall'] = df['act_page=0$user_id#'].apply(iszero)
        for c in [1, 2, 3, 4]:
            df['pageall'] = df['pageall'] * df['act_page=0$user_id#']
        df['pageall'] = df['act_page=0$user_id#'].apply(iszero)

        # NOTE(review): same pattern here — the loop writes 'pageall' (not
        # 'actionall'), and 'actionall' is overwritten afterwards.
        df['actionall'] = df['action_type=0$user_id#'].apply(iszero)
        for c in [1, 2, 3, 4, 5]:
            df['pageall'] = df['pageall'] * df['action_type=0$user_id#']
        df['actionall'] = df['action_type=0$user_id#'].apply(iszero)

        df['act0'] = df['act$user_id#'].apply(iszero)
        df['video0'] = df['video$user_id#'].apply(iszero)

        def bigact(s):
            # Bucket the action count into 0..5 by tens, capped at 5 (>=50).
            if s >= 50:
                return 5
            else:
                return int(s / 10)

        df['act$user_id#10'] = df['act$user_id#'].apply(bigact)

        # Temporary alias so rows can be grouped by this user as an author.
        df['author_id'] = df['user_id']
        df = docount(df, tact, 'act', ['author_id'])
        gc.collect()
        # doiq evidently adds '<prefix>$<key>_by_<col>_iq' — presumably a
        # unique-count of `col` per key; verify against the helper.
        df = doiq(df, tact, 'act', ['user_id'], 'video_id')
        gc.collect()
        df = doiq(df, tact, 'act', ['user_id'], 'author_id')
        gc.collect()

        # Mean distinct videos per distinct author the user interacted with.
        df['act$author_video_m'] = df['act$user_id_by_video_id_iq'] / df[
            'act$user_id_by_author_id_iq']

        # Drop the raw registration day and the temporary alias column.
        del df['register_day'], df['author_id']
        return df

    def get_features_all(df, df1):
        # Cross-cohort features computed over train+test together, then split
        # back apart. `df` is train, `df1` is test.
        lendf = len(df)
        df = df.append(df1)
        del df1
        gc.collect()

        # Mean act-count per register_type over the combined population
        # (domean presumably adds 'All$<key>_by_<col>_mean' — the `ccc` list
        # below references exactly that name; verify against the helper).
        for c in ['act$user_id#']:
            #df = domean(df,df,'All',['device_type'],c);gc.collect()
            df = domean(df, df, 'All', ['register_type'], c)
            gc.collect()
            #df = dovar(df,df,'All',['register_type'],c);gc.collect()
        # Population counts per register_type and per device_type.
        df = docount(df, df, 'ALL', ['register_type'])
        df = docount(df, df, 'ALL', ['device_type'])

        del df['user_id'],

        # NOTE(review): `ccc`, `ccc1`, and `ddd` are leftover feature
        # inventories from earlier experiments — assigned but never read;
        # only `used` determines the final feature set.
        ccc = [
            'device_type', 'actrate', 'All$register_type_by_act$user_id#_mean',
            'act_page=1$user_id#', 'action_type=0$user_id#rate',
            'action_type=1$user_id#rate', 'register_type',
            'act$user_id_by_author_id_iq', 'act$user_id_by_video_id_iq',
            'videorate', 'act_page=1$user_id#rate', 'act$author_video_m',
            'action_type=2$user_id#rate', 'act_page=3$user_id#rate',
            'act_page=0$user_id#', 'action_type=0$user_id#',
            'act_page=2$user_id#', 'act_page=2$user_id#rate',
            'action_type=1$user_id#', 'act$user_id#',
            'act_page=4$user_id#rate', 'act_page=0$user_id#rate', 'pageall',
            'act_page=4$user_id#', 'action_type=3$user_id#rate',
            'act_page=23$user_id#', 'act_page=3$user_id#', 'video$user_id#',
            'action_type=2$user_id#', 'action_type=3$user_id#',
            'act_page=023$user_id#', 'act$author_id#',
            'action_type=01$user_id#', 'action_type=5$user_id#rate',
            'ALL$register_type#', 'action_type=5$user_id#', 'act$user_id#10',
            'action_type=4$user_id#', 'actionall',
            'action_type=4$user_id#rate', 'act0', 'video0'
        ]
        ccc1 = []

        ddd = [
            'All$register_type_by_act$user_id#_mean',
            'act_page=1$user_id#',
            'action_type=1$user_id#rate',
            'act$user_id_by_author_id_iq',
            'act$user_id_by_video_id_iq',
            'act$author_video_m',
            'act_page=2$user_id#',
            'act_page=2$user_id#rate',
            'action_type=1$user_id#',
            'act$user_id#',
            'act_page=4$user_id#rate',
            'act_page=4$user_id#',
            'action_type=3$user_id#rate',
            'act_page=23$user_id#',
            'act_page=3$user_id#',
            'video$user_id#',
            'action_type=2$user_id#',
            'action_type=3$user_id#',
            'act$author_id#',
            'action_type=01$user_id#',
            'ALL$register_type#',
            'ALL$device_type#',
            'action_type=5$user_id#rate',
            'action_type=5$user_id#',
            'act$user_id#10',
            'action_type=4$user_id#',
            'actionall',
            'action_type=4$user_id#rate',
            'act0',
        ]

        # Final feature whitelist actually fed to the model.
        used = [
            'device_type',
            'register_type',
            'actrate',
            'action_type=0$user_id#rate',
            'videorate',
            'act_page=1$user_id#rate',
            'action_type=2$user_id#rate',
            'act_page=3$user_id#rate',
            'act_page=0$user_id#',
            'action_type=0$user_id#',
            'act_page=0$user_id#rate',
            'pageall',
            'act_page=023$user_id#',
            'video0',
            'All$register_type_by_act$user_id#_mean',
            'ALL$register_type#',
        ]

        df = df[used]

        # Split back: first `lendf` rows are train, the rest are test.
        df1 = df[lendf:]
        df = df[:lendf]
        return df, df1

    path = '../data1/30/'
    # Training frames are cached as CSVs; rebuilt only when missing.
    if os.path.exists(path + 'train_df.csv'):
        train_df = pd.read_csv(path + 'train_df.csv')
        train_y = pd.read_csv(path + 'train_y.csv')

    else:
        # One cohort per registration day 1..23: features from the
        # registration day itself, label from the following week.
        train_df = pd.DataFrame()
        train_y = pd.DataFrame()
        for i in range(1, 24):
            df = register[register.register_day == i]
            y = is_active(df, i + 1, i + 7, app, video, act)
            df = get_features0(df, i)
            train_df = train_df.append(df)
            train_y = train_y.append(y)
            if i == 22:
                # Row index where the validation split starts (cohorts > 22).
                # Only printed here; the val branch below uses a hard-coded
                # copy of this number.
                valst = len(train_df)
                print(valst)

        train_df.to_csv(path + 'train_df.csv', index=False)
        train_y.to_csv(path + 'train_y.csv', index=False)

    train_y = train_y['Y']
    if val:
        #35134
        # NOTE(review): hard-coded split point copied from the print above —
        # becomes stale if the cohort construction changes.
        valst = 35134
        test_df = train_df[valst:]
        val_y = train_y[valst:]
        train_df = train_df[:valst]
        train_y = train_y[:valst]
    else:
        # Submission mode: predict for the day-30 registration cohort.
        if os.path.exists(path + 'test_df.csv'):
            test_df = pd.read_csv(path + 'test_df.csv')
        else:
            test_df = register[register.register_day == 30]
            test_df = get_features0(test_df, 30)
            test_df.to_csv(path + 'test_df.csv', index=False)

    #train_df['Y'] = train_y
    #act0train = train_df[train_df['act$user_id#']==0]
    #print(len(act0train),len(act0train[act0train['Y']==1]))
    #del train_df['Y']
    #act0ids = test_df[test_df['act$user_id#']==0]['user_id']

    # Preserve ids before get_features_all drops the user_id column.
    ids = test_df['user_id']
    train_df, test_df = get_features_all(train_df, test_df)

    # predict_data presumably trains (10 rounds/folds?) and returns in-sample
    # train predictions plus test predictions — verify against its definition.
    pre_train, test_y = predict_data(train_df,
                                     train_y,
                                     10,
                                     test_df,
                                     importance=1)

    if val == 1:
        print(len(train_y), sum(train_y))
        showresults(train_y, pre_train)
        showresults(val_y, test_y)
        showfalse(ids, test_df, val_y, test_y)
        showtop(val_y, test_y, nums=1457)
        showtop(train_y, pre_train, nums=23260)
        #showtop(train_y,pre_train,nums=15485)
        #showprecision(val_y,test_y)
        #showprecision(train_y,pre_train)
        # th=0.4: select by probability threshold in validation mode.
        return ids, test_y, getbest(ids, test_y, th=0.4)
    else:
        showresults(train_y, pre_train)
        showtop(train_y, pre_train, nums=24717)
        #showtop(train_y,pre_train,nums=16943)
        #showprecision(train_y,pre_train)
        # rank=1490: select a fixed number of top-ranked users for submission.
        return ids, test_y, getbest(ids, test_y, rank=1490)
# Beispiel #4
# 0
def predict_1_23(val, register, app, video, act):
    path = '../data1/1_23/'

    def get_features(df, d1, d2):
        tapp = app[(app.day >= d1) & (app.day <= d2)]
        tact = act[(act.day >= d1) & (act.day <= d2)]
        tvideo = video[(video.day >= d1) & (video.day <= d2)]
        tapp.day = tapp.day - d1
        tact.day = tact.day - d1
        tvideo.day = tvideo.day - d1
        lastday = d2 - d1
        #app
        df = docount(df, tapp, 'app', ['user_id'])
        #df = domin(df,tapp,'app',['user_id'],'day')
        df = domax(df, tapp, 'app', ['user_id'], 'day')

        df['last_app_day'] = lastday - df['app$user_id_by_day_max'] + 1
        #df['app_day_gap'] = df['app$user_id_by_day_max']- df['app$user_id_by_day_min']+1
        df['app_day_missing'] = df['register_time'] - df['app$user_id#']
        df['app_mean#'] = df['app$user_id#'] / df['register_time']
        del df['app$user_id#'], df['app$user_id_by_day_max']

        df = dovar(df, tapp, 'app', ['user_id'], 'day')
        #df = domean(df,tapp[tapp.day>lastday-8],'app_last_8',['user_id'],'day')
        #df = dovar(df,tapp[tapp.day>lastday-8],'app_last_8',['user_id'],'day')

        for i in range(8):
            df = docount(df, tapp[tapp.day >= lastday - i],
                         'app_last_' + str(i), ['user_id'])
            if i >= 3:
                df = domean(df, tapp[tapp.day >= lastday - i],
                            'app_last_' + str(i), ['user_id'], 'day')
                df = dovar(df, tapp[tapp.day >= lastday - i],
                           'app_last_' + str(i), ['user_id'], 'day')
        #df = docount(df,tapp[tapp.day>lastday-7],'app_last_7',['user_id'])
        #df = docount(df,tapp[tapp.day>lastday-3],'app_last_3',['user_id'])
        #df = docount(df,tapp[tapp.day==lastday],'app_last_1',['user_id'])

        gc.collect()
        #video
        df = docount(df, tvideo, 'video', ['user_id'])
        df = domin(df, tvideo, 'video', ['user_id'], 'day')
        df = domax(df, tvideo, 'video', ['user_id'], 'day')
        df = doiq(df, tvideo, 'video', ['user_id'], 'day')
        df['last_video_day'] = lastday - df['video$user_id_by_day_max'] + 1
        df['first_video_day'] = lastday - df['video$user_id_by_day_min'] + 1
        df['video_day_gap'] = df['video$user_id_by_day_max'] - df[
            'video$user_id_by_day_min'] + 1
        #df['video_day_missing'] = df['register_time'] - df['video$user_id_by_day_iq']
        df['video_mean#'] = df['video$user_id#'] / df['register_time']
        del df['video$user_id#'], df['video$user_id_by_day_max'], df[
            'video$user_id_by_day_min']

        df = dovar(df, tvideo, 'video', ['user_id'], 'day')
        df = domean(df, tvideo[tvideo.day > lastday - 8], 'video_last_8',
                    ['user_id'], 'day')
        df = dovar(df, tvideo[tvideo.day > lastday - 8], 'video_last_8',
                   ['user_id'], 'day')

        df = docount(df, tvideo[tvideo.day > lastday - 8], 'video_last_8',
                     ['user_id'])
        #df = docount(df,tvideo[tvideo.day>lastday-3],'video_last_3',['user_id'])
        #df = docount(df,tvideo[tvideo.day==lastday],'video_last_1',['user_id'])
        gc.collect()
        #act
        gp = tact.groupby(['user_id', 'day']).size().unstack()
        df = pd.merge(df,
                      gp.max(1).rename('actcount_max').reset_index(),
                      on=['user_id'],
                      how='left')
        df = pd.merge(df,
                      gp.mean(1).rename('actcount_mean').reset_index(),
                      on=['user_id'],
                      how='left')
        df = pd.merge(df,
                      gp.var(1).rename('actcount_var').reset_index(),
                      on=['user_id'],
                      how='left')

        df = docount(df, tact, 'act', ['user_id'])
        df = domin(df, tact, 'act', ['user_id'], 'day')
        df = domax(df, tact, 'act', ['user_id'], 'day')
        df = doiq(df, tact, 'act', ['user_id'], 'day')
        #df['last_act_day'] = lastday - df['act$user_id_by_day_max']+1
        df['act_day_gap'] = df['act$user_id_by_day_max'] - df[
            'act$user_id_by_day_min'] + 1
        df['act_day_missing'] = df['register_time'] - df[
            'act$user_id_by_day_iq']
        df['act_mean#'] = df['act$user_id#'] / df['register_time']
        del df['act$user_id#']

        df = dovar(df, tact, 'act', ['user_id'], 'day')
        #df = domean(df,tact[tact.day>lastday-8],'act_last_8',['user_id'],'day')
        #df = dovar(df,tact[tact.day>lastday-8],'act_last_8',['user_id'],'day')

        for i in range(8):
            df = docount(df, tact[tact.day >= lastday - i],
                         'act_last_' + str(i), ['user_id'])
            if i >= 3:
                df = domean(df, tact[tact.day >= lastday - i],
                            'act_last_' + str(i), ['user_id'], 'day')
                df = dovar(df, tact[tact.day >= lastday - i],
                           'act_last_' + str(i), ['user_id'], 'day')

                gp = tact[tact.day >= lastday - i].groupby(
                    ['user_id', 'day']).size().unstack()
                df = pd.merge(df,
                              gp.max(1).rename('act_last_' + str(i) +
                                               '_actcount_max').reset_index(),
                              on=['user_id'],
                              how='left')
                df = pd.merge(
                    df,
                    gp.mean(1).rename('act_last_' + str(i) +
                                      '_actcount_mean').reset_index(),
                    on=['user_id'],
                    how='left')
                df = pd.merge(df,
                              gp.var(1).rename('act_last_' + str(i) +
                                               '_actcount_var').reset_index(),
                              on=['user_id'],
                              how='left')
        #df = docount(df,tact[tact.day>lastday-7],'act_last_7',['user_id'])
        #df = docount(df,tact[tact.day>lastday-3],'act_last_3',['user_id'])
        #df = docount(df,tact[tact.day==lastday],'act_last_1',['user_id'])
        gc.collect()

        page_list = list(tact['page'].unique())
        for c in page_list:
            df = docount(df, tact[tact['page'] == c], 'act_page=' + str(c),
                         ['user_id'])
            df['act_page=' + str(c) +
               '$user_id#'] = df['act_page=' + str(c) +
                                 '$user_id#'] / df['register_time']

        for c in page_list:
            # Tail of a per-page loop (loop header is above this chunk):
            # count each user's actions on page `c` in the last 8 days.
            df = docount(df,
                         tact[(tact['page'] == c) & (tact.day > lastday - 8)],
                         'act_last_8_page=' + str(c), ['user_id'])
        # Same per-page counts, restricted to the last 3 days of the window.
        for c in page_list:
            df = docount(df,
                         tact[(tact['page'] == c) & (tact.day > lastday - 3)],
                         'act_last_3_page=' + str(c), ['user_id'])

        # Alias user_id as author_id so docount can count how often each user
        # appears as the *author* of an action; normalise by account age.
        df['author_id'] = df['user_id']
        df = docount(df, tact, 'act', ['author_id'])
        df['act$author_id#'] = df['act$author_id#'] / df['register_time']

        # Distinct authors each user interacted with, scaled by register_time.
        # NOTE(review): `doiq` presumably computes a per-user nunique of the
        # last column ("id quantity") — confirm against its definition.
        df = doiq(df, tact, 'act', ['user_id'], 'author_id')
        df['act$user_id_by_author_id_iq'] = df[
            'act$user_id_by_author_id_iq'] / df['register_time']

        # Same, but distinct videos per user.
        df = doiq(df, tact, 'act', ['user_id'], 'video_id')
        df['act$user_id_by_video_id_iq'] = df[
            'act$user_id_by_video_id_iq'] / df['register_time']

        # Distinct authors / videos over trailing windows of 1..8 days.
        for i in range(8):
            df = doiq(df, tact[tact.day >= lastday - i], 'act_last_' + str(i),
                      ['user_id'], 'author_id')
            df = doiq(df, tact[tact.day >= lastday - i], 'act_last_' + str(i),
                      ['user_id'], 'video_id')

        # Per-action-type counts (type 4 deliberately skipped here),
        # normalised by register_time.
        #action_list = list(tact['action_type'].unique())
        for c in [0, 1, 2, 3, 5]:
            df = docount(df, tact[tact['action_type'] == c],
                         'action_type=' + str(c), ['user_id'])
            gc.collect()
            df['action_type=' + str(c) +
               '$user_id#'] = df['action_type=' + str(c) +
                                 '$user_id#'] / df['register_time']
        # Per-action-type counts over the trailing 8- and 3-day windows.
        for c in [0, 1, 2, 3]:
            df = docount(
                df,
                tact[(tact['action_type'] == c) & (tact.day > lastday - 8)],
                'act_last_8_action_type=' + str(c), ['user_id'])
        for c in [0, 1, 2, 3]:
            df = docount(
                df,
                tact[(tact['action_type'] == c) & (tact.day > lastday - 3)],
                'act_last_3_action_type=' + str(c), ['user_id'])
        # Disabled experiment: longest streak of consecutive active days.
        ''' 
        def getmaxcontinuedays(s):
            s = np.array(s)
            ans = 0
            t = 0
            for i in s:
                if i>0:
                    t =  t+ 1
                else:
                    if t>ans:
                        ans = t
                    t = 0
            if t>ans:
                ans=t
            return ans
  
        gp = tapp.groupby(['user_id','day']).size().unstack()
        gp = gp.fillna(0)
        
        #print (gp)
        gp['app_max_continue_days'] = gp.apply(getmaxcontinuedays,axis=1)
        #print (gp)
        df = pd.merge(df,gp.reset_index()[['user_id','app_max_continue_days']],on=['user_id'],how='left') 
         
        gp = tact.groupby(['user_id','day']).size().unstack()
        gp = gp.fillna(0)
        
        #print (gp)
        gp['act_max_continue_days'] = gp.apply(getmaxcontinuedays,axis=1)
        #print (gp)
        df = pd.merge(df,gp.reset_index()[['user_id','act_max_continue_days']],on=['user_id'],how='left') 
        '''

        # Drop the temporary alias before returning the feature table.
        del df['author_id']
        gc.collect()

        return df

    def get_features_all(df, df1):
        """Align train and test feature frames, drop user_id, split back.

        Concatenating both partitions before dropping columns guarantees
        train and test end up with identical column sets.

        Args:
            df: training feature frame (kept first).
            df1: test feature frame (appended after the training rows).

        Returns:
            (train_part, test_part) with ``user_id`` removed from both.
        """
        lendf = len(df)
        # DataFrame.append was deprecated and removed in pandas 2.0;
        # pd.concat is the supported equivalent.
        df = pd.concat([df, df1])
        del df1
        gc.collect()

        del df['user_id']

        # Positional split back into the original train / test partitions.
        df1 = df[lendf:]
        df = df[:lendf]
        return df, df1

    # Build two training windows and one test window from registration day.
    # register_time = number of days between registration and the end of the
    # label window for that partition.
    # NOTE(review): df1/df2/test_df are slices of `register`; assigning a new
    # column on a slice raises SettingWithCopyWarning — consider .copy().
    df1 = register[register.register_day < 10]
    df1['register_time'] = 17 - register.register_day
    df2 = register[register.register_day < 17]
    df2['register_time'] = 24 - register.register_day

    test_df = register[register.register_day < 24]
    test_df['register_time'] = 31 - test_df.register_day

    del df1['register_day'], df2['register_day'], test_df['register_day']

    # Labels are cached on disk; recompute with is_active only when missing.
    if os.path.exists(path + 'train_y1.csv'):
        train_y1 = pd.read_csv(path + 'train_y1.csv')

    else:
        train_y1 = is_active(df1, 17, 23, app, video, act)
        train_y1.to_csv(path + 'train_y1.csv', index=False)
    train_y1 = train_y1['Y']
    if os.path.exists(path + 'train_y2.csv'):
        train_y2 = pd.read_csv(path + 'train_y2.csv')

    else:
        train_y2 = is_active(df2, 24, 30, app, video, act)
        train_y2.to_csv(path + 'train_y2.csv', index=False)
    train_y2 = train_y2['Y']

    # Feature tables are cached the same way.
    if os.path.exists(path + 'df1.csv'):
        df1 = pd.read_csv(path + 'df1.csv')
    else:
        df1 = get_features(df1, 1, 16)
        df1.to_csv(path + 'df1.csv', index=False)

    if os.path.exists(path + 'df2.csv'):
        df2 = pd.read_csv(path + 'df2.csv')
    else:
        df2 = get_features(df2, 1, 23)
        df2.to_csv(path + 'df2.csv', index=False)

    # val mode: train on window 1, validate on window 2.
    # Otherwise: train on both windows, predict on the real test window.
    if val:
        train_df = df1
        test_df = df2
        train_y = train_y1
        val_y = train_y2
    else:
        if os.path.exists(path + 'test_df.csv'):
            test_df = pd.read_csv(path + 'test_df.csv')
        else:
            test_df = get_features(test_df, 1, 30)
            test_df.to_csv(path + 'test_df.csv', index=False)

        train_df = df1.append(df2)
        train_y = train_y1.append(train_y2)
        #train_df = df2
        #train_y = train_y2

    del df1, df2
    gc.collect()
    # Keep user ids aside before get_features_all drops the column.
    ids = test_df['user_id']
    train_df, test_df = get_features_all(train_df, test_df)
    # Disabled experiment: split out users with zero activity ("js") and
    # model them separately.
    '''
    train_df['Y'] = train_y
    print (len(train_df))
    train_js = train_df[train_df['act_mean#']==0]  
    train_df = train_df[train_df['act_mean#']>0]  
    print (len(train_df))
    train_y = train_df['Y']
    del train_df['Y']
    train_y_js = train_js['Y']
    del train_js['Y']
    
    test_df['Y'] = val_y
    test_js =  test_df[test_df['act_mean#']==0] 
    test_df =  test_df[test_df['act_mean#']>0] 
    val_y = test_df['Y']
    del test_df['Y']
    js_y = test_js['Y']
    del test_js['Y']
    '''
    # 10-fold (presumably — confirm in predict_data) training + prediction.
    pre_train, test_y = predict_data(train_df,
                                     train_y,
                                     10,
                                     test_df,
                                     importance=1)
    #pre_train_js,test_y_js = predict_data(train_js,train_y_js,10,test_js,importance=1)
    '''
    test_df['Y'] = val_y
    test_df['Y1'] = test_y
    test_js =  test_df[test_df['act_mean#']==0] 
    print(len(test_js))
    print(len(test_js[test_js['Y1']>=0.4]))
    print(len(test_js[(test_js['Y1']>=0.4) & (test_js['Y']==1)]))
    test_df[(test_df['act_mean#']==0) & (test_df['Y1']>=0.4)]['Y1'] = 0
    print (len(test_df[(test_df['act_mean#']==0) & (test_df['Y1']>=0.4)]))
    test_y[(test_df['act_mean#']==0) & (test_df['Y1']>=0.4)] = 0
    '''

    # Report metrics and return (ids, raw scores, thresholded selection).
    if val == 1:
        showresults(val_y, test_y)
        showtop(val_y, test_y, nums=10705)
        return ids, test_y, getbest(ids, test_y, rank=10705)
    else:
        showresults(train_y, pre_train)
        showtop(train_y, pre_train, nums=16449)
        return ids, test_y, getbest(ids, test_y, th=0.4)
def DNN(model_train='', save_model='', validate=0):
    """Train and evaluate a fully-connected CIFAR-10 classifier.

    Args:
        model_train: path of an existing Keras model to load (optional).
        save_model: path where the (re)trained model is saved (optional).
        validate: validation split fraction passed to ``fit`` (0 = none).

    Returns:
        The trained Keras ``Model``.
    """
    # Renamed from `dict`, which shadowed the builtin.
    train_batch = utils.unpickle('cifar_10/data_batch_1')
    test_batch = utils.unpickle('cifar_10/data_batch_2')

    LX, LY = utils.data_fromCIFAR10(train_batch, 10000)
    test_data, test_label = utils.data_fromCIFAR10(test_batch, 10000)

    x_train = np.reshape(LX, (np.shape(LX)[0], 32, 32, 3))
    y_train = to_categorical(LY, 10)
    # (Removed dead code: the original also loaded allData/UX/true_label
    # from data_batch_1 but never used them.)

    # Flatten -> 1024 -> 256 -> 10 MLP with dropout after every layer.
    img_shape = (32, 32, 3)
    input_img = Input(shape=img_shape)

    classifier = Flatten()(input_img)
    classifier = Dropout(0.5)(classifier)
    classifier = Dense(1024, activation='relu')(classifier)
    classifier = Dropout(0.5)(classifier)
    classifier = Dense(256, activation='relu')(classifier)
    classifier = Dropout(0.5)(classifier)

    classifier = Dense(10, activation='softmax')(classifier)

    dnn = Model(inputs=input_img, outputs=classifier)
    #adam2 = keras.optimizers.Adam(lr=0.0003, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
    dnn.compile(loss='categorical_crossentropy', optimizer='adam',
                metrics=['accuracy'])

    dnn.summary()

    early_stop = EarlyStopping(monitor='val_loss', patience=5, mode='min',
                               min_delta=0)
    model_check = ModelCheckpoint(save_model, monitor='val_loss', verbose=1,
                                  save_best_only=True, save_weights_only=False)

    if model_train and save_model:
        # Fine-tune an existing model, then persist and reload the best one.
        print('Load model and save new train model.')
        if os.path.isfile(model_train):
            dnn = load_model(model_train)

        if validate:
            dnn.fit(x_train, y_train, batch_size=100, epochs=20,
                    validation_split=validate,
                    callbacks=[model_check, early_stop])
        else:
            dnn.fit(x_train, y_train, batch_size=100, epochs=20,
                    callbacks=[model_check, early_stop])
        dnn.save(save_model)
        dnn = load_model(save_model)
    elif model_train:
        # Evaluation only: load a previously trained model.
        print('Load ae_model {}'.format(model_train))
        dnn = load_model(model_train)
    elif save_model:
        # Train from scratch and save.
        print('Build ae_model and save. File: {}'.format(save_model))
        dnn.fit(x_train, y_train, batch_size=100, epochs=40,
                validation_split=validate,
                callbacks=[model_check, early_stop])

        dnn.save(save_model)
    else:
        # Train from scratch without persisting anything.
        print('Build ae_model only.')
        dnn.fit(x_train, y_train, batch_size=100, epochs=30,
                validation_split=validate)

    # Accuracy on the held-out batch.
    prediction = utils.predict_data(test_data, dnn)
    prediction = np.argmax(prediction, axis=1)

    correct = sum(1 for t, p in zip(test_label, prediction) if t == p)

    print('Correct Rate = {}'.format(correct / np.shape(test_label)[0]))

    return dnn
# Beispiel #6
# 0
# Fragment of a train/predict driver (surrounding context not in this chunk).
# Align train/test feature columns, then train and predict.
train_df, test_df = get_features_all(train_df, test_df)
# Categorical-feature list passed to the model wrapper.
cfl = ['device_type', 'kmeans', 'register_type', 'register_time']
if val:
    # Validation mode: score against the held-out window labels.
    pre_train, test_y = predict_data_val(train_df,
                                         train_y,
                                         10,
                                         test_df,
                                         val_y,
                                         importance=1,
                                         cf_list=cfl)
    #pre_train,test_y = predict_data(train_df,train_y,10,test_df,importance=1,loss = 1,nb=56)
else:
    # Submission mode: train on everything, predict the real test window.
    pre_train, test_y = predict_data(train_df,
                                     train_y,
                                     10,
                                     test_df,
                                     importance=0,
                                     loss=1,
                                     nb=99)

sp = 1

if val == 1:
    showresults(val_y, test_y)
    #showtop(val_y,test_y,nums=18223)
    showtop(val_y, test_y, nums=15428)
    showfalse(ids, test_df, val_y, test_y)
else:
    showresults(train_y, pre_train)
    if sp:
        # NOTE(review): this branch is cut off here in the source listing.
        df_1_28 = register[register.register_day <= 28]
# Beispiel #7
# 0
def getreal(Fwy):
    """Fetch realtime traffic data per station, predict speeds, and render a
    folium map plus summary statistics.

    Args:
        Fwy: freeway number (101, 280, 680 or 880).
             NOTE(review): the parameter is immediately overwritten with 101
             below ("Required Listbox" placeholder), so the argument is
             currently ignored — confirm whether this is intentional.

    Returns:
        (my_map, timetak, avgocc, avgspeed, avgvisibility, avgwindspeed,
         avgprecipitation, incidentcount)
    """
    # Mid-freeway sensor stations and the on/off-ramp stations per freeway.
    list280 = [400319, 407710, 403402]
    onoff280 = [405428, 410590]
    list680 = [400335, 420614, 404690]
    onoff680 = [409056, 408289]
    list101 = [400868, 401472, 400119, 400661]
    onoff101 = [402883, 409308]
    list880 = [400844, 401871, 401545, 400284, 400662]
    onoff880 = [403200, 403098]
    # stations_list = [400319,407710,403402,400335,420614,404690,400868,400661,400119,401472,400844,401871,401545,400284,400662]
    Fwy = 101  # Required Listbox
    # pointA #dummy textbox
    # pointB #dummy textbox
    if (Fwy == 101):
        stations_list = list101
        onoff = onoff101
    elif (Fwy == 280):
        stations_list = list280
        onoff = onoff280
    elif (Fwy == 680):
        stations_list = list680
        onoff = onoff680
    else:
        stations_list = list880
        onoff = onoff880

    # Columns returned by the realtime API and the subset fed to the model.
    cols = [
        'station', 'timestamp_', 'occupancy', 'hourlyprecipitation',
        'hourlywindspeed', 'hourlyvisibility', 'incident', 'day_of_week_num',
        'hour_of_day', 'weekend', 'speed'
    ]
    colstomod = ['occupancy', 'day_of_week_num', 'hour_of_day', 'speed']
    final = pd.DataFrame(columns=cols)
    pred_speeds = np.array([])
    # For each station: pull realtime rows, reframe as lag features, and
    # predict the next speed with the station's saved LSTM/keras model.
    for station in stations_list:
        url = "https://n8nucy5gbh.execute-api.us-east-2.amazonaws.com/production/realtime/?station=" + str(
            station)
        r = requests.get(url=url)
        data = r.json()
        df = pd.read_json(data, orient='columns')[cols]
        print("counting", df.count())
        dfx = df.set_index(['station', 'timestamp_'
                            ]).sort_values(['station',
                                            'timestamp_'])[colstomod]
        stationid = station
        modelfile = "../models/" + str(Fwy) + "_" + str(
            stationid) + "_" + "speed.h5"
        #define how many timesteps to look back as input with the variable n_lag.
        n_lag = 3
        #define how many timesteps ahead to predict with the variable n_steps.
        n_steps = 1
        treframed, tkey, tscaled, tscaler1 = utils.format_model_data(
            dfx, n_lag, n_steps)
        # Drop the future-value columns we are not predicting.
        # NOTE(review): indices 12-14 are hard-coded to the reframed layout
        # produced by format_model_data — verify if colstomod changes.
        treframed.drop(treframed.columns[[12, 13, 14]], axis=1, inplace=True)
        tinv_y, tinv_yhat = utils.predict_data(treframed, modelfile, tscaler1)
        y_actual = pd.DataFrame(tinv_y)
        y_predicted = pd.DataFrame((tinv_yhat))
        df_2020 = pd.concat([y_actual, y_predicted], axis=1)
        col = ['y_actual', 'y_predicted']
        df_2020.columns = col
        # Keep only the latest predicted speed per station.
        pred_speeds = np.append(pred_speeds, tinv_yhat[-1])
        final = final.append(df)

    # Normalise dtypes, then aggregate per station.
    final['station'] = final['station'].astype('int64')
    final['occupancy'] = final['occupancy'].astype('float')
    final['speed'] = final['speed'].astype('float')
    final['hourlyprecipitation'] = final['hourlyprecipitation'].astype('float')
    final['hourlywindspeed'] = final['hourlywindspeed'].astype('float')
    final['hourlyvisibility'] = final['hourlyvisibility'].astype('int32')
    final['incident'] = final['incident'].astype('int32')
    final_data = final.set_index(['station'])
    finals = final_data.groupby(final_data.index).agg({
        'speed':
        'mean',
        'incident':
        'sum',
        'occupancy':
        'mean',
        'hourlyprecipitation':
        'mean',
        'hourlywindspeed':
        'mean',
        'hourlyvisibility':
        'mean'
    })
    finals['p_speed'] = pred_speeds
    finals = finals.reset_index(['station'])
    finals.head()

    # Should have this part merged stationwise with above cell later
    df_traffic_metadata = pd.read_csv("station_meta_finalv2.csv",
                                      sep=',',
                                      header=0)
    onoff_withmeta_df = df_traffic_metadata[df_traffic_metadata['ID'].isin(
        onoff)]
    # NOTE(review): inplace drop_duplicates on a slice of another frame —
    # SettingWithCopyWarning; consider .copy() first.
    onoff_withmeta_df.drop_duplicates(subset='ID', inplace=True)
    withmeta_df = finals.merge(df_traffic_metadata,
                               left_on="station",
                               right_on="ID",
                               how="left").round(3)
    withmeta_df.head()

    #Time Taken
    # Travel time: order stations geographically (on-ramp .. off-ramp),
    # then sum distance/avg-speed between consecutive stations.
    sorter = [onoff[0]] + stations_list + [onoff[1]]
    sorterIndex = dict(zip(sorter, range(len(sorter))))
    a = df_traffic_metadata[df_traffic_metadata['ID'].isin(onoff +
                                                           stations_list)]
    a['Rank'] = a['ID'].map(sorterIndex)
    cols1 = ['ID', 'Fwy', 'Latitude', 'Longitude']
    a = a.sort_values(['Rank'])[cols1]
    a.drop_duplicates(subset=['ID'], inplace=True)
    # Ramps reuse the nearest station's predicted speed.
    a['speed'] = [pred_speeds[0]] + list(pred_speeds) + [pred_speeds[-1]]
    tim = np.array([])
    for i in range(1, len(a)):
        p1 = (a.iloc[i - 1][2], a.iloc[i - 1][3])
        p2 = (a.iloc[i][2], a.iloc[i][3])
        dist = geodesic(p1, p2).miles
        spd = (a.iloc[i - 1][4] + a.iloc[i][4]) / 2
        t = dist / spd
        tim = np.append(tim, t)
    # Hours -> minutes.
    timetak = sum(tim * 60).round(2)
    print(timetak)
    #Map

    # Load each freeway's GPX track for the map polylines.
    #101
    gpx_file101 = open('101.gpx', 'r')
    gpx101 = gpxpy.parse(gpx_file101)
    points101 = []
    for track in gpx101.tracks:
        for segment in track.segments:
            for point in segment.points:
                points101.append(tuple([point.latitude, point.longitude]))

    #280
    gpx_file280 = open('280.gpx', 'r')
    gpx280 = gpxpy.parse(gpx_file280)
    points280 = []
    for track in gpx280.tracks:
        for segment in track.segments:
            for point in segment.points:
                points280.append(tuple([point.latitude, point.longitude]))

    #680
    gpx_file680 = open('680.gpx', 'r')
    gpx680 = gpxpy.parse(gpx_file680)
    points680 = []
    for track in gpx680.tracks:
        for segment in track.segments:
            for point in segment.points:
                points680.append(tuple([point.latitude, point.longitude]))

    #880
    gpx_file880 = open('880.gpx', 'r')
    gpx880 = gpxpy.parse(gpx_file880)
    points880 = []
    for track in gpx880.tracks:
        for segment in track.segments:
            for point in segment.points:
                points880.append(tuple([point.latitude, point.longitude]))

    ave_lat = sum(p[0] for p in points880) / len(points880)
    ave_lon = sum(p[1] for p in points880) / len(points880)

    # Load map centred on average coordinates
    my_map = folium.Map(location=[ave_lat, ave_lon],
                        zoom_start=9,
                        tiles="Stamen Terrain")
    # Only the selected freeway's layer is shown by default.
    # NOTE(review): plain `if`s (not elif); with Fwy forced to 101 above only
    # the first branch ever runs — any other value would leave fg* undefined.
    if (Fwy == 101):
        fg101 = folium.FeatureGroup(name="U.S 101", show=True)
        fg280 = folium.FeatureGroup(name="I280", show=False)
        fg680 = folium.FeatureGroup(name="I680", show=False)
        fg880 = folium.FeatureGroup(name="I880", show=False)
    if (Fwy == 280):
        fg280 = folium.FeatureGroup(name="I280", show=True)
        fg101 = folium.FeatureGroup(name="U.S 101", show=False)
        fg680 = folium.FeatureGroup(name="I680", show=False)
        fg880 = folium.FeatureGroup(name="I880", show=False)
    if (Fwy == 680):
        fg680 = folium.FeatureGroup(name="I680", show=True)
        fg101 = folium.FeatureGroup(name="U.S 101", show=False)
        fg280 = folium.FeatureGroup(name="I280", show=False)
        fg880 = folium.FeatureGroup(name="I880", show=False)
    if (Fwy == 880):
        fg880 = folium.FeatureGroup(name="I880", show=True)
        fg101 = folium.FeatureGroup(name="U.S 101", show=False)
        fg280 = folium.FeatureGroup(name="I280", show=False)
        fg680 = folium.FeatureGroup(name="I680", show=False)
    ###Changes from here
    # Blue car markers: one per sensor station, with a stats popup.
    for row in withmeta_df.itertuples():
        popuptext = "<b>Station:</b>"+str(row.station)+"<br>"+"<b>City:</b>"+str(row.City)+"<br>"+ \
        "<b>Direction:</b>"+str(row.Dir)+"<br>"+ \
        "<b>Predicted Speed:</b>"+str(row.p_speed)+"<br>"+ \
        "<b>Avg Occupancy:</b>"+str(row.occupancy)+"<br>"+ \
        "<b>Avg Precipitation:</b>"+str(row.hourlyprecipitation)+"<br>"+ \
        "<b>Avg Windspeed:</b>"+str(row.hourlywindspeed)+"<br>"+ \
        "<b>Avg Visibility:</b>"+str(row.hourlyvisibility)+"<br>"+ \
        "<b>Incident Count:</b>"+str(row.incident)
        test = folium.Html(popuptext, script=True)
        popup = folium.Popup(test, max_width=200)
        if row.Fwy == 101:
            fg101.add_child(
                folium.Marker(location=[row.Latitude, row.Longitude],
                              popup=popup,
                              icon=folium.Icon(color='blue',
                                               prefix='fa',
                                               icon='car')))
        if row.Fwy == 280:
            fg280.add_child(
                folium.Marker(location=[row.Latitude, row.Longitude],
                              popup=popup,
                              icon=folium.Icon(color='blue',
                                               prefix='fa',
                                               icon='car')))
        if row.Fwy == 680:
            fg680.add_child(
                folium.Marker(location=[row.Latitude, row.Longitude],
                              popup=popup,
                              icon=folium.Icon(color='blue',
                                               prefix='fa',
                                               icon='car')))
        if row.Fwy == 880:
            fg880.add_child(
                folium.Marker(location=[row.Latitude, row.Longitude],
                              popup=popup,
                              icon=folium.Icon(color='blue',
                                               prefix='fa',
                                               icon='car')))

    # Red circle markers: on/off-ramp stations (no popup).
    for row in onoff_withmeta_df.itertuples():
        if row.Fwy == 101:
            fg101.add_child(
                folium.Marker(location=[row.Latitude, row.Longitude],
                              icon=folium.Icon(color='red',
                                               prefix='fa',
                                               icon='circle')))
        if row.Fwy == 280:
            fg280.add_child(
                folium.Marker(location=[row.Latitude, row.Longitude],
                              icon=folium.Icon(color='red',
                                               prefix='fa',
                                               icon='circle')))
        if row.Fwy == 680:
            fg680.add_child(
                folium.Marker(location=[row.Latitude, row.Longitude],
                              icon=folium.Icon(color='red',
                                               prefix='fa',
                                               icon='circle')))
        if row.Fwy == 880:
            fg880.add_child(
                folium.Marker(location=[row.Latitude, row.Longitude],
                              icon=folium.Icon(color='red',
                                               prefix='fa',
                                               icon='circle')))

    # Freeway polylines from the GPX tracks, one per layer.
    folium.PolyLine(points101, color="black", weight=2.5,
                    opacity=1).add_to(fg101)
    folium.PolyLine(points280, color="purple", weight=2.5,
                    opacity=1).add_to(fg280)
    folium.PolyLine(points680, color="green", weight=2.5,
                    opacity=1).add_to(fg680)
    folium.PolyLine(points880, color="yellow", weight=2.5,
                    opacity=1).add_to(fg880)
    my_map.add_child(fg101)
    my_map.add_child(fg280)
    my_map.add_child(fg680)
    my_map.add_child(fg880)

    folium.LayerControl().add_to(my_map)

    # Travel-time legend (currently not added to the map; see the commented
    # add_child call below).
    legend_html = "<div style=\"position: fixed; \
                                bottom: 10px; left: 30px; width: 220px; height: 70px;\
                                border:2px solid grey; z-index:9999; font-size:14px; + \
                                \"><br>\
                                &nbsp;&nbsp;&nbsp;<i class=\"fa fa-car fa-2x\" style=\"color:purpule\"></i>\
                                &nbsp; Travel Time: &nbsp;" + str(
        timetak.round(2)) + " mins" + "<br/><br/>\
                    </div>"

    #my_map.get_root().html.add_child(folium.Element(legend_html))
    my_map.save('./static/Map.html')
    # my_map
    # Direction-level aggregates for the dashboard summary strip.
    withmeta_df.drop_duplicates(subset=['station'], inplace=True)
    finavg = withmeta_df.groupby('Dir').agg({
        'speed': 'mean',
        'incident': 'sum',
        'occupancy': 'mean',
        'hourlyprecipitation': 'mean',
        'hourlywindspeed': 'mean',
        'hourlyvisibility': 'mean'
    })
    finavg = finavg.reset_index()
    # finavg.head()
    avgocc = str(finavg['occupancy'][0].astype('float').round(1))
    avgspeed = str(finavg['speed'][0].astype('float').round(1))
    avgvisibility = str(finavg['hourlyvisibility'][0].astype('float').round(1))
    avgwindspeed = str(finavg['hourlywindspeed'][0].astype('float').round(1))
    avgprecipitation = str(
        finavg['hourlyprecipitation'][0].astype('float').round(1))
    incidentcount = str(finavg['incident'][0].astype('int'))
    return my_map, timetak, avgocc, avgspeed, avgvisibility, avgwindspeed, avgprecipitation, incidentcount
# Beispiel #8
# 0
def predict_1_28(val, register, app, video, act):
    """Train on two historical windows and predict user activity.

    Args:
        val: 1 -> validation mode (train on window 1, score on window 2);
             otherwise train on both windows and predict the test window.
        register, app, video, act: raw event tables.

    Returns:
        (ids, test_y, best) — user ids, raw scores, selected users.
    """
    path = '../data1/1_28/'

    def get_features_all(df, df1):
        # Concatenate train+test so global count features are computed over
        # both consistently, then split back by original length.
        lendf = len(df)
        # pd.concat replaces DataFrame.append (removed in pandas 2.0).
        df = pd.concat([df, df1])
        del df1
        gc.collect()

        df = docount(df, df, 'ALL', ['register_type'])
        df = docount(df, df, 'ALL', ['device_type'])

        del df['user_id']

        df1 = df[lendf:]
        df = df[:lendf]
        return df, df1

    # Build windows; .copy() so the register_time assignments below operate
    # on real frames rather than slices (avoids SettingWithCopyWarning).
    df1 = register[register.register_day < 15].copy()
    df1['register_time'] = 17 - register.register_day
    df2 = register[register.register_day < 22].copy()
    df2['register_time'] = 24 - register.register_day
    # BUG FIX: the original used chained indexing
    # (df2[df2['register_time'] > 16]['register_time'] = 16), which assigns
    # into a temporary copy and silently does nothing.  Use .loc.
    df2.loc[df2['register_time'] > 16, 'register_time'] = 16

    test_df = register[register.register_day < 29].copy()
    test_df['register_time'] = 31 - test_df.register_day
    # BUG FIX: the original repeated the df2 capping line here; the intent is
    # clearly to cap the freshly built test_df the same way.
    test_df.loc[test_df['register_time'] > 16, 'register_time'] = 16

    del df1['register_day'], df2['register_day'], test_df['register_day']

    # Labels are cached on disk; recompute only when missing.
    if os.path.exists(path + 'train_y1.csv'):
        train_y1 = pd.read_csv(path + 'train_y1.csv')
    else:
        train_y1 = is_active(df1, 17, 23, app, video, act)
        train_y1.to_csv(path + 'train_y1.csv', index=False)
    train_y1 = train_y1['Y']
    if os.path.exists(path + 'train_y2.csv'):
        train_y2 = pd.read_csv(path + 'train_y2.csv')
    else:
        train_y2 = is_active(df2, 24, 30, app, video, act)
        train_y2.to_csv(path + 'train_y2.csv', index=False)
    train_y2 = train_y2['Y']

    # Feature tables are cached the same way.
    if os.path.exists(path + 'df1.csv'):
        df1 = pd.read_csv(path + 'df1.csv')
    else:
        df1 = get_features_ks(df1, 1, 16, app, video, act)
        df1.to_csv(path + 'df1.csv', index=False)

    if os.path.exists(path + 'df2.csv'):
        df2 = pd.read_csv(path + 'df2.csv')
    else:
        df2 = get_features_ks(df2, 8, 23, app, video, act)
        df2.to_csv(path + 'df2.csv', index=False)

    if val:
        # Validation mode: train on window 1, validate on window 2.
        train_df = df1
        test_df = df2
        train_y = train_y1
        val_y = train_y2
    else:
        if os.path.exists(path + 'test_df.csv'):
            test_df = pd.read_csv(path + 'test_df.csv')
        else:
            test_df = get_features_ks(test_df, 15, 30, app, video, act)
            test_df.to_csv(path + 'test_df.csv', index=False)

        train_df = pd.concat([df1, df2])
        train_y = pd.concat([train_y1, train_y2])

    del df1, df2
    gc.collect()
    # Keep user ids aside before get_features_all drops the column.
    ids = test_df['user_id']
    train_df, test_df = get_features_all(train_df, test_df)

    pre_train, test_y = predict_data(train_df, train_y, 10, test_df,
                                     importance=1)

    # Report metrics and return (ids, raw scores, selected users).
    if val == 1:
        showresults(val_y, test_y)
        showtop(val_y, test_y, nums=15428)
        showtop(val_y, test_y, nums=15905)
        showfalse(ids, test_df, val_y, test_y)
        return ids, test_y, getbest(ids, test_y, th=0.4)
    else:
        showresults(train_y, pre_train)
        showtop(train_y, pre_train, nums=25713)
        return ids, test_y, getbest(ids, test_y, rank=22088)
def autoencoder():
    """Semi-supervised CIFAR-10 pipeline: pretrain an autoencoder on
    labelled + unlabelled data, then evaluate a previously trained
    classifier head on a held-out batch.

    NOTE(review): much of the setup below (layer freezing, data_aug of the
    labelled set, callbacks, `iter`) is currently unused because ae_dnn is
    overwritten by load_model('dnn_model_1.h5') before prediction.
    """
    # NOTE(review): `dict` shadows the builtin.
    dict = utils.unpickle('cifar_10/data_batch_1')
    test_dict = utils.unpickle('cifar_10/data_batch_2')
    # dict content = [b'batch_label', b'labels', b'data', b'filenames']

    # 500 labelled samples; 10000 test samples from the other batch.
    LX, LY = utils.data_fromCIFAR10(dict, 500)
    test_data, test_label = utils.data_fromCIFAR10(test_dict, 10000)
    #LX, LY = utils.data_fromCIFAR10(dict, 5000)
    #print("label data number {}, label number {}".format(np.shape(LX), np.shape(LY)))

    #model = net.Net1(train_data, train_lable, save_model='test.h5', validate=0.1)
    #model = train(LX, LY, save_model='test.h5', validate=0.1)
    #model = train(LX, LY)

    # Treat samples 6000+ of the same batch as unlabelled data
    # (true labels kept only for reference).
    allData, allLabel = utils.data_fromCIFAR10(dict, 10000)
    #UX = UX[1000:]
    #print("unlabel data number line102", np.shape(UX))

    UX = allData[6000:]
    true_label = allLabel[6000:]

    # Autoencoder trains on labelled+unlabelled images, augmented;
    # targets are the same images scaled to [0, 1].
    train_data = np.concatenate((LX, UX), axis=0)
    train_data, _ = data_aug(train_data, np.ones((train_data.shape[0], 1)))

    X_normal = np.asarray(train_data, dtype='float32') / 255.0

    # Train the autoencoder only if no saved model exists yet.
    if not os.path.isfile('ae_model.h5'):
        ae_model, ae_dnn = net.Autoencoder(train_data,
                                           X_normal,
                                           model_train='ae_model.h5',
                                           save_model='ae_model.h5',
                                           validate=0.1)
    else:
        ae_model, ae_dnn = net.Autoencoder(train_data,
                                           X_normal,
                                           model_train='ae_model.h5',
                                           validate=0.1)
    '''ae_predict = utils.predict_data(test_data, ae_model)
    ae_predict = np.argmax(ae_predict, axis=1)'''

    #print('auto encoder prediction dim = {}'.format(np.shape(ae_predict)))

    #plt.plot(np.asarray(ae_predict))
    #plt.show()

    #ae_model= load_model('ae_model.h5')

    #ae_model.summary()

    # CNN part

    # Freeze the encoder so only the classifier head would train.
    for layer in ae_model.layers:
        layer.trainable = False


#ae_dnn.summary()
    x_train, y_train = data_aug(LX, LY)

    y_train = to_categorical(y_train, 10)

    iter = 10

    save_model = 'dnn_model.h5'

    early_stop = EarlyStopping(monitor='val_loss',
                               patience=5,
                               mode='min',
                               min_delta=0)
    model_check = ModelCheckpoint(save_model,
                                  monitor='val_loss',
                                  verbose=1,
                                  save_best_only=True,
                                  save_weights_only=False)

    # Evaluate a previously trained classifier head on the test batch.
    ae_dnn = load_model('dnn_model_1.h5')
    prediction = utils.predict_data(test_data, ae_dnn)
    prediction = np.argmax(prediction, axis=1)

    count = 0
    for i in range(np.shape(test_label)[0]):
        if (test_label[i] == prediction[i]):
            count += 1

    print('Correct Rate = {}'.format(count / np.shape(test_label)[0]))
Beispiel #10
0
def predict_24_28(val,register,app,video,act):
    """Predict which users who registered on days 24-28 will be active.

    Features are built from a 7-day observation window; labels come from the
    following 7-day window (via is_active).  Training stacks many sliding
    windows; intermediate feature/label frames are cached as CSVs under
    ../data1/24_28/ so repeated runs skip the expensive feature build.

    Parameters:
        val: truthy -> validation mode (train on windows starting days 1-10,
            evaluate on users registered days 17-21 with labels from days
            24-30); falsy -> real test prediction for registrations 24-28.
        register, app, video, act: log DataFrames; assumed to carry 'day' and
            'user_id' columns ('page', 'action_type', 'author_id', 'video_id'
            where used) -- TODO confirm against the data loader.

    Returns:
        (ids, test_y, best): user ids of the scored rows, predicted scores,
        and the ranked/thresholded selection from getbest().
    """

    def get_features(df,d1,d2):
        """Build per-user features for df from logs restricted to days
        [d1, d2], with 'day' rebased so the window starts at 0."""
        # Restrict each log to the observation window.
        tapp = app[(app.day>=d1) & (app.day<=d2)]
        tact = act[(act.day>=d1) & (act.day<=d2)]
        tvideo = video[(video.day>=d1) & (video.day<=d2)]
        # Rebase absolute day to window-relative day (0 .. d2-d1).
        tapp.day = tapp.day - d1
        tact.day = tact.day - d1
        tvideo.day = tvideo.day - d1
        lastday = d2-d1

        # Account age in days as of the window end.
        df['register_time'] = d2-df.register_day+1
        del df['register_day']
        
        #app
        # App launches per user; kept only as a per-day mean.
        df = docount(df,tapp,'app',['user_id'])
        df['app_mean#'] = df['app$user_id#']/df['register_time']
        #df = domax(df,tapp,'app',['user_id'],'day')
        #df['last_app_day'] = lastday - df['app$user_id_by_day_max']+1
        del df['app$user_id#']
        #df['app_day_missing'] = df['register_time'] - df['app$user_id#']
        #df['app$user_id#'] = df['app$user_id#']/df['register_time']
        
        #df = dovar(df,tapp,'app',['user_id'],'day')
        #df = docount(df,tapp[tapp.day>lastday-2],'app_last_2',['user_id'])        
        #df = docount(df,tapp[tapp.day>lastday-1],'app_last_1',['user_id']) 
        #df = docount(df,tapp[tapp.day==lastday],'app_last_1',['user_id'])
        gc.collect()
        #video
        #df = docount(df,tvideo,'video',['user_id'])
        #df['video_mean#'] = df['video$user_id#']/df['register_time']
        #df = domax(df,tvideo,'video',['user_id'],'day')
        #df['last_video_day'] = lastday - df['video$user_id_by_day_max']+1
        #del df['video$user_id_by_day_max']
        #df = doiq(df,tvideo,'video',['user_id'],'day')
        #df['last_video_day'] = lastday - df['video$user_id_by_day_max']+1
        #df['video_day_missing'] = df['register_time'] - df['video$user_id_by_day_iq']
        #df['video$user_id#'] = df['video$user_id#']/df['register_time']
        
        #df = dovar(df,tvideo,'video',['user_id'],'day')     
        # Video uploads in the final 2 / 3 days of the window (recency signal).
        df = docount(df,tvideo[tvideo.day>lastday-2],'video_last_2',['user_id'])
        df = docount(df,tvideo[tvideo.day>lastday-3],'video_last_3',['user_id'])
        #df = docount(df,tvideo[tvideo.day==lastday],'video_last_1',['user_id'])
        gc.collect()
        #act
        #gp = act.groupby(['user_id','day']).size().unstack()
        #df = pd.merge(df,gp.max(1).rename('actcount_max').reset_index(),on=['user_id'],how='left')   
        #df = pd.merge(df,gp.mean(1).rename('actcount_mean').reset_index(),on=['user_id'],how='left')
        #df = pd.merge(df,gp.var(1).rename('actcount_var').reset_index(),on=['user_id'],how='left')        
        
        #df = docount(df,tact,'act',['user_id'])
        #df['act_mean#'] = df['act$user_id#']/df['register_time']
        # Days since the user's last action within the window.
        df = domax(df,tact,'act',['user_id'],'day')
        df['last_act_day'] = lastday - df['act$user_id_by_day_max']+1
        del df['act$user_id_by_day_max']
        #df = doiq(df,tact,'act',['user_id'],'day')
        #df['last_act_day'] = lastday - df['act$user_id_by_day_max']+1
        #df['act_day_missing'] = df['register_time'] - df['act$user_id_by_day_iq']
        #df['act$user_id#'] = df['act$user_id#']/df['register_time']
        
        #gp = tact.groupby(['user_id','day']).size().unstack()
        #df = pd.merge(df,gp.max(1).rename('actcount_max').reset_index(),on=['user_id'],how='left')   
        #df = pd.merge(df,gp.mean(1).rename('actcount_mean').reset_index(),on=['user_id'],how='left')
        #df = pd.merge(df,gp.var(1).rename('actcount_var').reset_index(),on=['user_id'],how='left')

        #df = dovar(df,tact,'act',['user_id'],'day')      
        # Action counts in the final 2 / 3 days of the window.
        df = docount(df,tact[tact.day>lastday-2],'act_last_2',['user_id']) 
        df = docount(df,tact[tact.day>lastday-3],'act_last_3',['user_id'])
        #df = docount(df,tact[tact.day==lastday],'act_last_1',['user_id'])
        gc.collect()
        
        #page_list = list(tact['page'].unique())
                
        # Recent action counts broken down by page (pages 0-3 only).
        for c in [0,1,2,3]: 
            df = docount(df,tact[(tact['page']==c) & (tact.day>lastday-3)],'act_last_3_page='+str(c),['user_id']) 
            df = docount(df,tact[(tact['page']==c) & (tact.day>lastday-2)],'act_last_2_page='+str(c),['user_id'])
            df = docount(df,tact[(tact['page']==c) & (tact.day>lastday-1)],'act_last_1_page='+str(c),['user_id']) 
        
        # Distinct authors / videos interacted with in the last 3, 2, 1 days.
        df = doiq(df,tact[tact.day>lastday-3],'act_last_3',['user_id'],'author_id')  
        df = doiq(df,tact[tact.day>lastday-3],'act_last_3',['user_id'],'video_id')
        
        df = doiq(df,tact[tact.day>lastday-2],'act_last_2',['user_id'],'author_id')  
        df = doiq(df,tact[tact.day>lastday-2],'act_last_2',['user_id'],'video_id')
        
        df = doiq(df,tact[tact.day>lastday-1],'act_last_1',['user_id'],'author_id')  
        df = doiq(df,tact[tact.day>lastday-1],'act_last_1',['user_id'],'video_id')
        
        # Recent action counts broken down by action_type (types 0-3 only).
        for c in [0,1,2,3]: 
            df = docount(df,tact[(tact['action_type']==c) & (tact.day>lastday-3)],'act_last_3_action_type='+str(c),['user_id'])
            df = docount(df,tact[(tact['action_type']==c) & (tact.day>lastday-2)],'act_last_2_action_type='+str(c),['user_id'])
            df = docount(df,tact[(tact['action_type']==c) & (tact.day>lastday-1)],'act_last_1_action_type='+str(c),['user_id'])

        
        gc.collect()
        
        
        return df
    
    
    path = '../data1/24_28/'
    if val:
        # Validation mode: evaluation window = users registered days 17-21,
        # features from days 17-23, labels = activity in days 24-30.
        if os.path.exists(path+'val_df.csv'):
            test_df = pd.read_csv(path+'val_df.csv')
            val_y = pd.read_csv(path+'val_y.csv')
        else:
            test_df = register[(register.register_day>=17) & (register.register_day<=21)]
            test_df = get_features(test_df,17,23)
            val_y = is_active(test_df,24,30,app,video,act)
            test_df.to_csv(path+'val_df.csv',index=False)
            val_y.to_csv(path+'val_y.csv',index=False)
        val_y = val_y['Y']
        if os.path.exists(path+'val_train_df.csv'):
            train_df = pd.read_csv(path+'val_train_df.csv')
            train_y = pd.read_csv(path+'val_train_y.csv')
        else:    
            # Stack 10 sliding windows: registrations i..i+4, features from
            # days i..i+6, labels from days i+7..i+13.
            train_df = pd.DataFrame()   
            train_y = pd.DataFrame()                  
            for i in range(1,11):
                df = register[(register.register_day>=i) & (register.register_day<=i+4)]
                y = is_active(df,i+7,i+13,app,video,act)
                df = get_features(df,i,i+6)
                train_df = train_df.append(df)
                train_y = train_y.append(y)
            train_df.to_csv(path+'val_train_df.csv',index=False)
            train_y.to_csv(path+'val_train_y.csv',index=False)
    else:
        # Test mode: score users registered days 24-28, features from 24-30.
        if os.path.exists(path+'test_df.csv'):
            test_df = pd.read_csv(path+'test_df.csv')
        else:
            test_df = register[(register.register_day>=24) & (register.register_day<=28)]
            test_df = get_features(test_df,24,30)
            test_df.to_csv(path+'test_df.csv',index=False)
                               
        if os.path.exists(path+'train_df.csv'):
            train_df = pd.read_csv(path+'train_df.csv')
            train_y = pd.read_csv(path+'train_y.csv')
        else:            
            if os.path.exists(path+'val_train_df.csv'):
                # Reuse the cached validation windows (1-10) and extend with
                # windows 11-17 to cover the full training range.
                train_df = pd.read_csv(path+'val_train_df.csv')
                train_y = pd.read_csv(path+'val_train_y.csv') 
                for i in range(11,18):
                    df = register[(register.register_day>=i) & (register.register_day<=i+4)]
                    y = is_active(df,i+7,i+13,app,video,act)
                    df = get_features(df,i,i+6)
                    train_df = train_df.append(df)
                    train_y = train_y.append(y)  
            else:
                # No cache at all: build every window 1-17 from scratch.
                train_df = pd.DataFrame()   
                train_y = pd.DataFrame()                  
                for i in range(1,18):
                    df = register[(register.register_day>=i) & (register.register_day<=i+4)]
                    y = is_active(df,i+7,i+13,app,video,act)
                    df = get_features(df,i,i+6)
                    train_df = train_df.append(df)
                    train_y = train_y.append(y)  
            train_df.to_csv(path+'train_df.csv',index=False)
            train_y.to_csv(path+'train_y.csv',index=False)                 
    train_y = train_y['Y']
    #print(sum(train_y)/len(train_y))
        
    def get_features_all(df,df1):
        """Apply shared final transforms (currently: drop user_id) to the
        concatenation of train df and test df1, then split them back so both
        see identical columns."""
        lendf = len(df)
        
        df= df.append(df1)
        del df1
        gc.collect()
        
        #for c in ['act_last_2$user_id#']:
        #    df = domean(df,df,'All',['device_type'],c);gc.collect()
        #    df = domean(df,df,'All',['register_type'],c);gc.collect()
            
        #del df
            
        #ccc = ['device_type', 'app_mean#', 'register_type', 'register_time', 'act_last_3_page=1$user_id#', 'last_act_day', 'act_last_3$user_id_by_video_id_iq', 'act_last_3_page=2$user_id#', 'act_last_3$user_id_by_author_id_iq', 'act_last_3_action_type=1$user_id#', 'act_last_1$user_id_by_author_id_iq', 'act_last_3_page=3$user_id#', 'act_last_3_page=0$user_id#', 'act_last_1$user_id_by_video_id_iq', 'act_last_2$user_id_by_author_id_iq', 'act_last_3$user_id#', 'act_last_2$user_id_by_video_id_iq', 'act_last_3_action_type=2$user_id#', 'act_last_3_action_type=0$user_id#', 'act_last_2_page=2$user_id#', 'act_last_2_page=1$user_id#', 'act_last_2_page=3$user_id#', 'act_last_1_page=1$user_id#', 'act_last_2$user_id#', 'act_last_2_page=0$user_id#', 'act_last_1_action_type=0$user_id#', 'act_last_2_action_type=1$user_id#', 'act_last_1_page=2$user_id#', 'act_last_3_action_type=3$user_id#', 'act_last_1_page=3$user_id#', 'act_last_2_action_type=0$user_id#', 'video_last_3$user_id#', 'act_last_1_page=0$user_id#', 'act_last_2_action_type=2$user_id#', 'act_last_2_action_type=3$user_id#', 'video_last_2$user_id#', 'act_last_1_action_type=1$user_id#', 'act_last_1_action_type=3$user_id#', 'act_last_1_action_type=2$user_id#']
        #for i in range(38,39):
        #    del df[ccc[i]]            
            
            
        
        # user_id is an identifier, not a feature.
        del df['user_id']
        #del df['last_app_day'],df['last_video_day'],df['video_last_1$user_id#'],df['app_last_1$user_id#']
        #del df['act_last_1$user_id#'],df['app_last_2$user_id#']
        
 
        df1 = df[lendf:]
        df = df[:lendf]
        return df,df1
        
    
    # Capture ids before user_id is dropped from the feature frames.
    ids = test_df['user_id']
    train_df,test_df = get_features_all(train_df,test_df)    
    
    pre_train,test_y = predict_data(train_df,train_y,10,test_df,importance=1)
    #print(test_y)
    if val==1:   
        print (len(train_y),sum(train_y))
        showresults(val_y,test_y) 
        showtop(val_y,test_y,nums=4723)
        showtop(train_y,pre_train,nums=38507)
        #return ids,test_y,getbest(ids,test_y,rank=4723)
        return ids,test_y,getbest(ids,test_y,th=0.4)
    else:
        showresults(train_y,pre_train)     
        showtop(train_y,pre_train,nums=70275)
        return ids,test_y,getbest(ids,test_y,rank=5498)