Exemple #1
0
def predict_30(val, register, app, video, act):
    def get_features0(df, d):
        """Build same-day activity features for the users in ``df``.

        Only the ``video`` and ``act`` rows whose ``day`` equals ``d`` are
        used.  ``docount`` and ``doiq`` are helpers defined outside this
        function; the column names read back below
        (``'<prefix>$user_id#'``, ``'<prefix>$user_id_by_<col>_iq'``) are the
        ones this code expects them to add.

        Args:
            df: Registration rows to enrich; must contain ``user_id`` and
                ``register_day``.
            d: Day whose video/act records feed the features.

        Returns:
            The enriched frame, with ``register_day`` and the temporary
            ``author_id`` column removed.
        """
        pages = [0, 1, 2, 3, 4]
        action_types = [0, 1, 2, 3, 4, 5]

        tvideo = video[video.day == d]
        tact = act[act.day == d]

        df = docount(df, tvideo, 'video', ['user_id'])
        gc.collect()
        # The tiny epsilon keeps the division defined when the day has no rows.
        df['videorate'] = df['video$user_id#'] / (tvideo.shape[0] + 0.000001)
        df = docount(df, tact, 'act', ['user_id'])
        gc.collect()
        df['actrate'] = df['act$user_id#'] / (tact.shape[0] + 0.000001)

        # Per-page counts and their share of the user's total activity.
        for c in pages:
            prefix = 'act_page=' + str(c)
            df = docount(df, tact[tact['page'] == c], prefix, ['user_id'])
            gc.collect()
            df[prefix + '$user_id#rate'] = df[prefix + '$user_id#'] / (
                df['act$user_id#'] + 0.00001)

        df['act_page=23$user_id#'] = (df['act_page=2$user_id#'] +
                                      df['act_page=3$user_id#'])
        df['act_page=023$user_id#'] = (df['act_page=2$user_id#'] +
                                       df['act_page=3$user_id#'] +
                                       df['act_page=0$user_id#'])

        # Per-action-type counts and their share of the user's total activity.
        for c in action_types:
            prefix = 'action_type=' + str(c)
            df = docount(df, tact[tact['action_type'] == c], prefix,
                         ['user_id'])
            gc.collect()
            df[prefix + '$user_id#rate'] = df[prefix + '$user_id#'] / (
                df['act$user_id#'] + 0.00001)

        df['action_type=01$user_id#'] = (df['action_type=0$user_id#'] +
                                         df['action_type=1$user_id#'])

        def iszero(s):
            # Despite the name this is a "non-zero" flag: 0 -> 0, else 1.
            if s == 0:
                return 0
            return 1

        def all_nonzero(frame, columns):
            # 1 only when every listed count column is non-zero for the row.
            flag = frame[columns[0]].apply(iszero)
            for name in columns[1:]:
                flag = flag * frame[name].apply(iszero)
            return flag

        # FIX: the previous loops ignored the loop variable, multiplied
        # 'pageall' by the raw action_type=0 count inside the 'actionall'
        # loop, and then overwrote the results.  Both flags now combine every
        # page / action type.  Cached train_df.csv / test_df.csv written
        # before this fix still hold the old values and must be regenerated.
        df['pageall'] = all_nonzero(
            df, ['act_page=' + str(c) + '$user_id#' for c in pages])
        df['actionall'] = all_nonzero(
            df, ['action_type=' + str(c) + '$user_id#' for c in action_types])

        df['act0'] = df['act$user_id#'].apply(iszero)
        df['video0'] = df['video$user_id#'].apply(iszero)

        def bigact(s):
            # Bucket the activity count in tens, capped at bucket 5 (>= 50).
            if s >= 50:
                return 5
            return int(s / 10)

        df['act$user_id#10'] = df['act$user_id#'].apply(bigact)

        # Copy user_id into author_id so docount can count the act rows in
        # which this user appears in the author_id column.
        df['author_id'] = df['user_id']
        df = docount(df, tact, 'act', ['author_id'])
        gc.collect()
        df = doiq(df, tact, 'act', ['user_id'], 'video_id')
        gc.collect()
        df = doiq(df, tact, 'act', ['user_id'], 'author_id')
        gc.collect()

        df['act$author_video_m'] = (df['act$user_id_by_video_id_iq'] /
                                    df['act$user_id_by_author_id_iq'])

        del df['register_day'], df['author_id']
        return df

    def get_features_all(df, df1):
        """Add pooled group statistics and select the model's input columns.

        The two frames are stacked so that the group statistics are computed
        over train and test rows together, then split back at the original
        row boundary.

        Args:
            df: Training feature frame.
            df1: Test (or validation) feature frame with the same columns.

        Returns:
            ``(train, test)`` restricted to the columns listed in ``used``.
        """
        lendf = len(df)
        # DataFrame.append was removed in pandas 2.0; pd.concat with default
        # arguments stacks the rows the same way.
        df = pd.concat([df, df1])
        del df1
        gc.collect()

        df = domean(df, df, 'All', ['register_type'], 'act$user_id#')
        gc.collect()
        df = docount(df, df, 'ALL', ['register_type'])
        # NOTE(review): the device_type count is not part of `used` below, so
        # this call only costs time; kept because docount is defined elsewhere
        # and its behaviour cannot be checked from here.
        df = docount(df, df, 'ALL', ['device_type'])

        del df['user_id']

        # Columns actually fed to the model.
        used = [
            'device_type',
            'register_type',
            'actrate',
            'action_type=0$user_id#rate',
            'videorate',
            'act_page=1$user_id#rate',
            'action_type=2$user_id#rate',
            'act_page=3$user_id#rate',
            'act_page=0$user_id#',
            'action_type=0$user_id#',
            'act_page=0$user_id#rate',
            'pageall',
            'act_page=023$user_id#',
            'video0',
            'All$register_type_by_act$user_id#_mean',
            'ALL$register_type#',
        ]
        df = df[used]

        # Positional split back into the original train / test parts.
        df1 = df[lendf:]
        df = df[:lendf]
        return df, df1

    path = '../data1/30/'
    if os.path.exists(path + 'train_df.csv'):
        train_df = pd.read_csv(path + 'train_df.csv')
        train_y = pd.read_csv(path + 'train_y.csv')

    else:
        train_df = pd.DataFrame()
        train_y = pd.DataFrame()
        for i in range(1, 24):
            df = register[register.register_day == i]
            y = is_active(df, i + 1, i + 7, app, video, act)
            df = get_features0(df, i)
            train_df = train_df.append(df)
            train_y = train_y.append(y)
            if i == 22:
                valst = len(train_df)
                print(valst)

        train_df.to_csv(path + 'train_df.csv', index=False)
        train_y.to_csv(path + 'train_y.csv', index=False)

    train_y = train_y['Y']
    if val:
        #35134
        valst = 35134
        test_df = train_df[valst:]
        val_y = train_y[valst:]
        train_df = train_df[:valst]
        train_y = train_y[:valst]
    else:
        if os.path.exists(path + 'test_df.csv'):
            test_df = pd.read_csv(path + 'test_df.csv')
        else:
            test_df = register[register.register_day == 30]
            test_df = get_features0(test_df, 30)
            test_df.to_csv(path + 'test_df.csv', index=False)

    #train_df['Y'] = train_y
    #act0train = train_df[train_df['act$user_id#']==0]
    #print(len(act0train),len(act0train[act0train['Y']==1]))
    #del train_df['Y']
    #act0ids = test_df[test_df['act$user_id#']==0]['user_id']

    ids = test_df['user_id']
    train_df, test_df = get_features_all(train_df, test_df)

    pre_train, test_y = predict_data(train_df,
                                     train_y,
                                     10,
                                     test_df,
                                     importance=1)

    if val == 1:
        print(len(train_y), sum(train_y))
        showresults(train_y, pre_train)
        showresults(val_y, test_y)
        showfalse(ids, test_df, val_y, test_y)
        showtop(val_y, test_y, nums=1457)
        showtop(train_y, pre_train, nums=23260)
        #showtop(train_y,pre_train,nums=15485)
        #showprecision(val_y,test_y)
        #showprecision(train_y,pre_train)
        return ids, test_y, getbest(ids, test_y, th=0.4)
    else:
        showresults(train_y, pre_train)
        showtop(train_y, pre_train, nums=24717)
        #showtop(train_y,pre_train,nums=16943)
        #showprecision(train_y,pre_train)
        return ids, test_y, getbest(ids, test_y, rank=1490)
Exemple #2
0
def predict_29(val,register,app,video,act):
    def get_features(df, d1, d2):
        """Build activity features from the day window ``[d1, d2]``.

        Days are re-based so that ``d1`` becomes day 0 and ``d2`` becomes
        ``lastday``.  The "first_day" features use day ``lastday - 1``, which
        is day 0 only for a two-day window; every call in this function's
        enclosing scope passes ``d2 == d1 + 1``.

        ``docount`` and ``doiq`` are helpers defined outside this function;
        the column names read back below are the ones this code expects them
        to add.

        Args:
            df: Registration rows to enrich; must contain ``user_id`` and
                ``register_day``.
            d1: First day of the window (inclusive).
            d2: Last day of the window (inclusive).

        Returns:
            The enriched frame, with ``register_day`` and the temporary
            ``author_id`` column removed.
        """
        # .assign builds new frames, so the re-based day column is never
        # written onto a filtered slice (avoids SettingWithCopyWarning).
        tapp = app[(app.day >= d1) & (app.day <= d2)].assign(
            day=lambda t: t.day - d1)
        tact = act[(act.day >= d1) & (act.day <= d2)].assign(
            day=lambda t: t.day - d1)
        tvideo = video[(video.day >= d1) & (video.day <= d2)].assign(
            day=lambda t: t.day - d1)
        lastday = d2 - d1

        # Window slices reused below.
        last_act = tact[tact.day == lastday]
        first_act = tact[tact.day == lastday - 1]
        last2_act = tact[tact.day >= lastday - 1]

        df = docount(df, tapp, 'app', ['user_id'])
        gc.collect()
        df = docount(df, tapp[tapp.day == lastday], 'last_day_app',
                     ['user_id'])
        gc.collect()
        df = docount(df, tvideo, 'video', ['user_id'])
        gc.collect()
        df['videorate'] = df['video$user_id#'] / (tvideo.shape[0] + 0.000001)
        df = docount(df, tact, 'act', ['user_id'])
        gc.collect()
        df = docount(df, last_act, 'last_day_act', ['user_id'])
        gc.collect()
        df = docount(df, first_act, 'first_day_act', ['user_id'])
        gc.collect()

        # All three rates share the same denominator: the number of act rows
        # in the whole window.
        total_act = tact.shape[0] + 0.000001
        df['actrate'] = df['act$user_id#'] / total_act
        df['last_day_actrate'] = df['last_day_act$user_id#'] / total_act
        df['first_day_actrate'] = df['first_day_act$user_id#'] / total_act
        df['actrate_gap'] = df['last_day_actrate'] - df['first_day_actrate']
        df['act_gap'] = (df['last_day_act$user_id#'] -
                         df['first_day_act$user_id#'])

        def iszero(s):
            # Despite the name this is a "non-zero" flag: 0 -> 0, else 1.
            if s == 0:
                return 0
            return 1

        df['act0'] = df['act$user_id#'].apply(iszero)
        df['video0'] = df['video$user_id#'].apply(iszero)

        # Filtering the already-filtered last-day slice replaces the chained
        # tact[mask_a][mask_b] indexing, which made pandas reindex the second
        # mask (with a warning) to select the same rows.
        for c in [1]:
            df = docount(df, last_act[last_act['page'] == c],
                         'last_day_act_page=' + str(c), ['user_id'])
            gc.collect()

        for c in [0, 1, 2, 3, 4]:
            prefix = 'act_page=' + str(c)
            df = docount(df, tact[tact['page'] == c], prefix, ['user_id'])
            gc.collect()
            df[prefix + '$user_id#rate'] = df[prefix + '$user_id#'] / (
                df['act$user_id#'] + 0.00001)

        df['act_page=23$user_id#'] = (df['act_page=2$user_id#'] +
                                      df['act_page=3$user_id#'])
        df['act_page=023$user_id#'] = (df['act_page=2$user_id#'] +
                                       df['act_page=3$user_id#'] +
                                       df['act_page=0$user_id#'])

        for c in [0, 1, 2, 3, 4, 5]:
            prefix = 'action_type=' + str(c)
            df = docount(df, tact[tact['action_type'] == c], prefix,
                         ['user_id'])
            gc.collect()
            df = docount(df, last_act[last_act['action_type'] == c],
                         'last_day_' + prefix, ['user_id'])
            gc.collect()
            df[prefix + '$user_id#rate'] = df[prefix + '$user_id#'] / (
                df['act$user_id#'] + 0.00001)

        df['author_id'] = df['user_id']

        # For each window: doiq over video_id and over author_id, then their
        # ratio.
        for name, events in (('last_day_act', last_act),
                             ('first_day_act', first_act),
                             ('last2_day_act', last2_act)):
            df = doiq(df, events, name, ['user_id'], 'video_id')
            gc.collect()
            df = doiq(df, events, name, ['user_id'], 'author_id')
            gc.collect()
            df[name + '$author_video_m'] = (
                df[name + '$user_id_by_video_id_iq'] /
                df[name + '$user_id_by_author_id_iq'])

        del df['register_day'], df['author_id']
        return df

    def get_features_all(df, df1):
        """Add the pooled register_type count and select the model's columns.

        The two frames are stacked so that the count is computed over train
        and test rows together, then split back at the original row boundary.

        Args:
            df: Training feature frame.
            df1: Test (or validation) feature frame with the same columns.

        Returns:
            ``(train, test)`` restricted to the columns listed in ``used``.
        """
        lendf = len(df)
        # DataFrame.append was removed in pandas 2.0; pd.concat with default
        # arguments stacks the rows the same way.
        df = pd.concat([df, df1])
        del df1
        gc.collect()
        # NOTE(review): 'ALL$register_type#' is not part of `used` below, so
        # this call only costs time; kept because docount is defined elsewhere
        # and its behaviour cannot be checked from here.
        df = docount(df, df, 'ALL', ['register_type'])

        del df['user_id']

        # Columns actually fed to the model.
        used = [
            'device_type',
            'register_type',
            'action_type=0$user_id#rate',
            'act_page=1$user_id#',
            'first_day_act$user_id_by_author_id_iq',
            'act_page=0$user_id#rate',
            'last_day_act$author_video_m',
            'act_page=2$user_id#',
            'actrate',
            'app$user_id#',
            'last_day_action_type=0$user_id#',
            'first_day_actrate',
            'action_type=5$user_id#rate',
        ]
        df = df[used]

        # Positional split back into the original train / test parts.
        df1 = df[lendf:]
        df = df[:lendf]
        return df, df1
    
    path = '../data1/29/'
    
    if val:
        if os.path.exists(path+'val_df.csv'):
            test_df = pd.read_csv(path+'val_df.csv')
            val_y = pd.read_csv(path+'val_y.csv')
        else:
            test_df = register[(register.register_day==22)]
            test_df = get_features(test_df,22,23)
            val_y = is_active(test_df,24,30,app,video,act)
            test_df.to_csv(path+'val_df.csv',index=False)
            val_y.to_csv(path+'val_y.csv',index=False)
        val_y = val_y['Y']
        if os.path.exists(path+'val_train_df.csv'):
            train_df = pd.read_csv(path+'val_train_df.csv')
            train_y = pd.read_csv(path+'val_train_y.csv')
        else:    
            train_df = pd.DataFrame()   
            train_y = pd.DataFrame()                  
            for i in range(1,22):
                df = register[(register.register_day==i)]
                y = is_active(df,i+2,i+8,app,video,act)
                df = get_features(df,i,i+1)
                train_df = train_df.append(df)
                train_y = train_y.append(y)
            train_df.to_csv(path+'val_train_df.csv',index=False)
            train_y.to_csv(path+'val_train_y.csv',index=False)
    else:
        if os.path.exists(path+'test_df.csv'):
            test_df = pd.read_csv(path+'test_df.csv')
        else:
            test_df = register[(register.register_day==29)]
            test_df = get_features(test_df,29,30)
            test_df.to_csv(path+'test_df.csv',index=False)
                               
        if os.path.exists(path+'train_df.csv'):
            train_df = pd.read_csv(path+'train_df.csv')
            train_y = pd.read_csv(path+'train_y.csv')
        else:            
            if os.path.exists(path+'val_train_df.csv'):
                train_df = pd.read_csv(path+'val_train_df.csv')
                train_y = pd.read_csv(path+'val_train_y.csv')
                val_df = pd.read_csv(path+'val_df.csv')
                val_y = pd.read_csv(path+'val_y.csv')
                train_df = train_df.append(val_df)
                train_y = train_y.append(val_y)
            else:
                train_df = pd.DataFrame()   
                train_y = pd.DataFrame()                  
                for i in range(1,23):
                    df = register[(register.register_day==i)]
                    y = is_active(df,i+2,i+8,app,video,act)
                    df = get_features(df,i,i+1)
                    train_df = train_df.append(df)
                    train_y = train_y.append(y)  
            train_df.to_csv(path+'train_df.csv',index=False)
            train_y.to_csv(path+'train_y.csv',index=False)                 
    train_y = train_y['Y']

    ids = test_df['user_id']
    train_df,test_df = get_features_all(train_df,test_df)
    
    pre_train,test_y = predict_data(train_df,train_y,10,test_df,importance=1)
    
    if val==1:   
        print (len(train_y),sum(train_y))
        showresults(train_y,pre_train)
        showresults(val_y,test_y) 
        showtop(val_y,test_y,nums=1337)
        showtop(train_y,pre_train,nums=19589)
        return ids,test_y,getbest(ids,test_y,th=0.4)
    else:
        showresults(train_y,pre_train)     
        showtop(train_y,pre_train,nums=20926)
        return ids,test_y,getbest(ids,test_y,rank=1294)
def predict_1_23(val, register, app, video, act):
    path = '../data1/1_23/'

    def get_features(df, d1, d2):
        """Build long-window app / video / act features for ``[d1, d2]``.

        Days are re-based so that ``d1`` becomes day 0 and ``d2`` becomes
        ``lastday``.  Features are appended in a fixed order (app, video, act,
        pages, author/video, action types).

        ``docount``, ``domin``, ``domax``, ``domean``, ``dovar`` and ``doiq``
        are helpers defined outside this function; the column names read back
        below are the ones this code expects them to add.

        Args:
            df: User rows to enrich; must contain ``user_id`` and
                ``register_time``.
            d1: First day of the window (inclusive).
            d2: Last day of the window (inclusive).

        Returns:
            The enriched frame with the temporary ``author_id`` column
            removed.
        """
        # .assign builds new frames, so the re-based day column is never
        # written onto a filtered slice (avoids SettingWithCopyWarning).
        tapp = app[(app.day >= d1) & (app.day <= d2)].assign(
            day=lambda t: t.day - d1)
        tact = act[(act.day >= d1) & (act.day <= d2)].assign(
            day=lambda t: t.day - d1)
        tvideo = video[(video.day >= d1) & (video.day <= d2)].assign(
            day=lambda t: t.day - d1)
        lastday = d2 - d1

        def merge_daily_count_stats(frame, events, prefix):
            # Left-merge max / mean / var of each user's per-day row counts.
            per_day = events.groupby(['user_id', 'day']).size().unstack()
            stats = (
                ('max', per_day.max(axis=1)),
                ('mean', per_day.mean(axis=1)),
                ('var', per_day.var(axis=1)),
            )
            for stat_name, values in stats:
                column = prefix + 'actcount_' + stat_name
                frame = pd.merge(frame,
                                 values.rename(column).reset_index(),
                                 on=['user_id'],
                                 how='left')
            return frame

        # ---- app ----
        df = docount(df, tapp, 'app', ['user_id'])
        df = domax(df, tapp, 'app', ['user_id'], 'day')

        df['last_app_day'] = lastday - df['app$user_id_by_day_max'] + 1
        df['app_day_missing'] = df['register_time'] - df['app$user_id#']
        df['app_mean#'] = df['app$user_id#'] / df['register_time']
        del df['app$user_id#'], df['app$user_id_by_day_max']

        df = dovar(df, tapp, 'app', ['user_id'], 'day')

        # 'app_last_<i>' covers the final i + 1 days of the window.
        for i in range(8):
            recent_app = tapp[tapp.day >= lastday - i]
            name = 'app_last_' + str(i)
            df = docount(df, recent_app, name, ['user_id'])
            if i >= 3:
                df = domean(df, recent_app, name, ['user_id'], 'day')
                df = dovar(df, recent_app, name, ['user_id'], 'day')

        gc.collect()

        # ---- video ----
        df = docount(df, tvideo, 'video', ['user_id'])
        df = domin(df, tvideo, 'video', ['user_id'], 'day')
        df = domax(df, tvideo, 'video', ['user_id'], 'day')
        df = doiq(df, tvideo, 'video', ['user_id'], 'day')
        df['last_video_day'] = lastday - df['video$user_id_by_day_max'] + 1
        df['first_video_day'] = lastday - df['video$user_id_by_day_min'] + 1
        df['video_day_gap'] = (df['video$user_id_by_day_max'] -
                               df['video$user_id_by_day_min'] + 1)
        df['video_mean#'] = df['video$user_id#'] / df['register_time']
        del df['video$user_id#'], df['video$user_id_by_day_max'], df[
            'video$user_id_by_day_min']

        df = dovar(df, tvideo, 'video', ['user_id'], 'day')
        # Unlike the 'app_last_<i>' windows, this one is exclusive: the final
        # 8 days.
        video_last8 = tvideo[tvideo.day > lastday - 8]
        df = domean(df, video_last8, 'video_last_8', ['user_id'], 'day')
        df = dovar(df, video_last8, 'video_last_8', ['user_id'], 'day')
        df = docount(df, video_last8, 'video_last_8', ['user_id'])
        gc.collect()

        # ---- act ----
        df = merge_daily_count_stats(df, tact, '')

        df = docount(df, tact, 'act', ['user_id'])
        df = domin(df, tact, 'act', ['user_id'], 'day')
        df = domax(df, tact, 'act', ['user_id'], 'day')
        df = doiq(df, tact, 'act', ['user_id'], 'day')
        df['act_day_gap'] = (df['act$user_id_by_day_max'] -
                             df['act$user_id_by_day_min'] + 1)
        df['act_day_missing'] = (df['register_time'] -
                                 df['act$user_id_by_day_iq'])
        df['act_mean#'] = df['act$user_id#'] / df['register_time']
        del df['act$user_id#']

        df = dovar(df, tact, 'act', ['user_id'], 'day')

        for i in range(8):
            recent_act = tact[tact.day >= lastday - i]
            name = 'act_last_' + str(i)
            df = docount(df, recent_act, name, ['user_id'])
            if i >= 3:
                df = domean(df, recent_act, name, ['user_id'], 'day')
                df = dovar(df, recent_act, name, ['user_id'], 'day')
                df = merge_daily_count_stats(df, recent_act, name + '_')
        gc.collect()

        # ---- pages ----
        # NOTE(review): the page values come from the data in this window, so
        # the set and order of the act_page columns can differ between calls.
        page_list = list(tact['page'].unique())
        act_last8 = tact[tact.day > lastday - 8]
        act_last3 = tact[tact.day > lastday - 3]

        for c in page_list:
            column = 'act_page=' + str(c) + '$user_id#'
            df = docount(df, tact[tact['page'] == c], 'act_page=' + str(c),
                         ['user_id'])
            # Divide the count by register_time.
            df[column] = df[column] / df['register_time']

        for c in page_list:
            df = docount(df, act_last8[act_last8['page'] == c],
                         'act_last_8_page=' + str(c), ['user_id'])
        for c in page_list:
            df = docount(df, act_last3[act_last3['page'] == c],
                         'act_last_3_page=' + str(c), ['user_id'])

        # ---- author / video ----
        # Copy user_id into author_id so docount can count the act rows in
        # which this user appears in the author_id column.
        df['author_id'] = df['user_id']
        df = docount(df, tact, 'act', ['author_id'])
        df['act$author_id#'] = df['act$author_id#'] / df['register_time']

        df = doiq(df, tact, 'act', ['user_id'], 'author_id')
        df['act$user_id_by_author_id_iq'] = (
            df['act$user_id_by_author_id_iq'] / df['register_time'])

        df = doiq(df, tact, 'act', ['user_id'], 'video_id')
        df['act$user_id_by_video_id_iq'] = (
            df['act$user_id_by_video_id_iq'] / df['register_time'])

        for i in range(8):
            recent_act = tact[tact.day >= lastday - i]
            name = 'act_last_' + str(i)
            df = doiq(df, recent_act, name, ['user_id'], 'author_id')
            df = doiq(df, recent_act, name, ['user_id'], 'video_id')

        # ---- action types ----
        # Action type 4 is not included in this loop.
        for c in [0, 1, 2, 3, 5]:
            column = 'action_type=' + str(c) + '$user_id#'
            df = docount(df, tact[tact['action_type'] == c],
                         'action_type=' + str(c), ['user_id'])
            gc.collect()
            df[column] = df[column] / df['register_time']
        for c in [0, 1, 2, 3]:
            df = docount(df, act_last8[act_last8['action_type'] == c],
                         'act_last_8_action_type=' + str(c), ['user_id'])
        for c in [0, 1, 2, 3]:
            df = docount(df, act_last3[act_last3['action_type'] == c],
                         'act_last_3_action_type=' + str(c), ['user_id'])

        del df['author_id']
        gc.collect()

        return df

    def get_features_all(df, df1):
        """Drop ``user_id`` from both frames using one stacked frame.

        The frames are stacked, ``user_id`` is removed, and the result is
        split back at the original row boundary.

        Args:
            df: Training feature frame.
            df1: Test (or validation) feature frame.

        Returns:
            ``(train, test)`` without the ``user_id`` column.
        """
        lendf = len(df)
        # DataFrame.append was removed in pandas 2.0; pd.concat with default
        # arguments stacks the rows the same way.
        df = pd.concat([df, df1])
        del df1
        gc.collect()

        del df['user_id']

        # Positional split back into the original train / test parts.
        df1 = df[lendf:]
        df = df[:lendf]
        return df, df1

    df1 = register[register.register_day < 10]
    df1['register_time'] = 17 - register.register_day
    df2 = register[register.register_day < 17]
    df2['register_time'] = 24 - register.register_day

    test_df = register[register.register_day < 24]
    test_df['register_time'] = 31 - test_df.register_day

    del df1['register_day'], df2['register_day'], test_df['register_day']

    if os.path.exists(path + 'train_y1.csv'):
        train_y1 = pd.read_csv(path + 'train_y1.csv')

    else:
        train_y1 = is_active(df1, 17, 23, app, video, act)
        train_y1.to_csv(path + 'train_y1.csv', index=False)
    train_y1 = train_y1['Y']
    if os.path.exists(path + 'train_y2.csv'):
        train_y2 = pd.read_csv(path + 'train_y2.csv')

    else:
        train_y2 = is_active(df2, 24, 30, app, video, act)
        train_y2.to_csv(path + 'train_y2.csv', index=False)
    train_y2 = train_y2['Y']

    if os.path.exists(path + 'df1.csv'):
        df1 = pd.read_csv(path + 'df1.csv')
    else:
        df1 = get_features(df1, 1, 16)
        df1.to_csv(path + 'df1.csv', index=False)

    if os.path.exists(path + 'df2.csv'):
        df2 = pd.read_csv(path + 'df2.csv')
    else:
        df2 = get_features(df2, 1, 23)
        df2.to_csv(path + 'df2.csv', index=False)

    if val:
        train_df = df1
        test_df = df2
        train_y = train_y1
        val_y = train_y2
    else:
        if os.path.exists(path + 'test_df.csv'):
            test_df = pd.read_csv(path + 'test_df.csv')
        else:
            test_df = get_features(test_df, 1, 30)
            test_df.to_csv(path + 'test_df.csv', index=False)

        train_df = df1.append(df2)
        train_y = train_y1.append(train_y2)
        #train_df = df2
        #train_y = train_y2

    del df1, df2
    gc.collect()
    ids = test_df['user_id']
    train_df, test_df = get_features_all(train_df, test_df)
    '''
    train_df['Y'] = train_y
    print (len(train_df))
    train_js = train_df[train_df['act_mean#']==0]  
    train_df = train_df[train_df['act_mean#']>0]  
    print (len(train_df))
    train_y = train_df['Y']
    del train_df['Y']
    train_y_js = train_js['Y']
    del train_js['Y']
    
    test_df['Y'] = val_y
    test_js =  test_df[test_df['act_mean#']==0] 
    test_df =  test_df[test_df['act_mean#']>0] 
    val_y = test_df['Y']
    del test_df['Y']
    js_y = test_js['Y']
    del test_js['Y']
    '''
    pre_train, test_y = predict_data(train_df,
                                     train_y,
                                     10,
                                     test_df,
                                     importance=1)
    #pre_train_js,test_y_js = predict_data(train_js,train_y_js,10,test_js,importance=1)
    '''
    test_df['Y'] = val_y
    test_df['Y1'] = test_y
    test_js =  test_df[test_df['act_mean#']==0] 
    print(len(test_js))
    print(len(test_js[test_js['Y1']>=0.4]))
    print(len(test_js[(test_js['Y1']>=0.4) & (test_js['Y']==1)]))
    test_df[(test_df['act_mean#']==0) & (test_df['Y1']>=0.4)]['Y1'] = 0
    print (len(test_df[(test_df['act_mean#']==0) & (test_df['Y1']>=0.4)]))
    test_y[(test_df['act_mean#']==0) & (test_df['Y1']>=0.4)] = 0
    '''

    if val == 1:
        showresults(val_y, test_y)
        showtop(val_y, test_y, nums=10705)
        return ids, test_y, getbest(ids, test_y, rank=10705)
    else:
        showresults(train_y, pre_train)
        showtop(train_y, pre_train, nums=16449)
        return ids, test_y, getbest(ids, test_y, th=0.4)
Exemple #4
0
                                     nb=99)

# Final reporting / submission step of the script.  It relies on names bound
# earlier in the file (val, val_y, test_y, train_y, pre_train, ids, test_df,
# register, app, video, act) and on helpers defined elsewhere (showresults,
# showtop, showfalse, getbest, getbest1).

# When truthy, the submission is stitched together from three predictors
# (this script's scores for register_day <= 28, plus predict_29 and
# predict_30); when falsy, a single ranking over all ids is used.
sp = 1

if val == 1:
    # Validation run: only print diagnostics, nothing is written to disk.
    showresults(val_y, test_y)
    #showtop(val_y,test_y,nums=18223)
    showtop(val_y, test_y, nums=15428)
    showfalse(ids, test_df, val_y, test_y)
else:
    # Submission run: report the fit on the training data, then select users.
    showresults(train_y, pre_train)
    if sp:
        # Users registered on days 1-28 are selected from this script's scores.
        df_1_28 = register[register.register_day <= 28]
        #df_29_30 = register[register.register_day>28]
        ans_1_28 = getbest1(df_1_28, ids, test_y, rank=22088)
        #ans_29_30 = getbest1(df_29_30,ids,test_y,th=0.4)
        #print (len(ans_1_28),len(ans_29_30))
        # Imported here so the extra predictors are only loaded when needed.
        from predict_30 import predict_30
        from predict_29 import predict_29
        ids29, test_y29, ans29 = predict_29(val, register, app, video, act)
        ids30, test_y30, ans30 = predict_30(val, register, app, video, act)
        # NOTE(review): presumably each answer is a list of user ids, so `+`
        # concatenates them -- confirm against getbest / getbest1.
        ans = ans_1_28 + ans29 + ans30

    else:
        ans = getbest(ids, test_y, rank=22088)
    print(len(ans))
    import time
    # Timestamp (local time) keeps successive submission files from
    # overwriting each other.
    name = time.strftime('%Y-%m-%d_%H_%M_%S', time.localtime(time.time()))
    submission = pd.DataFrame({'user_id': ans})
    # Written without index and without a header row.
    submission.to_csv('ksn_submit' + name + '.csv', index=False, header=None)
def predict_1_28(val,register,app,video,act):
    """Train on two historical windows and select users registered on days 1-28.

    Two training sets are derived from ``register``:

    * users with ``register_day < 15``: features from ``get_features_ks`` over
      days 1-16, labels from ``is_active`` over days 17-23;
    * users with ``register_day < 22``: features over days 8-23, labels over
      days 24-30.

    Every intermediate frame is cached as CSV under ``path``; an existing
    cache file is read back instead of being rebuilt.

    Args:
        val: Truthy -> validation mode: fit on the first set and score the
            second one.  Falsy -> fit on both sets and score the users with
            ``register_day < 29`` (features over days 15-30).
        register: Registration frame; must provide ``register_day``.
        app, video, act: Log frames, passed through to ``is_active`` and
            ``get_features_ks``.

    Returns:
        ``(ids, test_y, selected)`` where ``ids`` is the ``user_id`` column of
        the scored frame, ``test_y`` is the second value returned by
        ``predict_data`` and ``selected`` is the result of ``getbest``
        (``th=0.4`` when ``val == 1``, otherwise ``rank=22088``).
    """
    path = '../data1/1_28/'

    def get_features_all(df,df1):
        """Add the features that need train and test rows together, then split."""
        lendf = len(df)
        # DataFrame.append was removed in pandas 2.0; pd.concat is equivalent.
        df = pd.concat([df, df1])
        del df1
        gc.collect()

        df = docount(df,df,'ALL',['register_type'])
        df = docount(df,df,'ALL',['device_type'])

        del df['user_id']

        # The first `lendf` rows are the training rows, the rest the test rows.
        return df[:lendf], df[lendf:]

    def load_or_build(filename, build):
        """Return the cached CSV if it exists, else build, cache and return it."""
        cached = path + filename
        if os.path.exists(cached):
            return pd.read_csv(cached)
        frame = build()
        frame.to_csv(cached, index=False)
        return frame

    # .copy() so the column edits below act on independent frames rather than
    # on slices of `register` (avoids SettingWithCopy surprises).
    raw1 = register[register.register_day<15].copy()
    raw1['register_time'] = 17 - raw1.register_day

    raw2 = register[register.register_day<22].copy()
    raw2['register_time'] = 24 - raw2.register_day
    # Cap at 16, the length of the 16-day feature window.  The original
    # `df2[mask]['register_time'] = 16` assigned into a temporary copy and
    # therefore never changed anything.
    raw2.loc[raw2['register_time']>16, 'register_time'] = 16

    raw_test = register[register.register_day<29].copy()
    raw_test['register_time'] = 31 - raw_test.register_day
    # The original repeated the df2 line here (copy/paste slip); the cap is
    # meant for the test frame, whose feature window is also 16 days long.
    raw_test.loc[raw_test['register_time']>16, 'register_time'] = 16

    del raw1['register_day'], raw2['register_day'], raw_test['register_day']

    # NOTE(review): cache files written before the clipping fix may still hold
    # unclipped register_time values -- delete them to rebuild.
    train_y1 = load_or_build(
        'train_y1.csv', lambda: is_active(raw1,17,23,app,video,act))['Y']
    train_y2 = load_or_build(
        'train_y2.csv', lambda: is_active(raw2,24,30,app,video,act))['Y']

    df1 = load_or_build(
        'df1.csv', lambda: get_features_ks(raw1,1,16,app,video,act))
    df2 = load_or_build(
        'df2.csv', lambda: get_features_ks(raw2,8,23,app,video,act))

    if val:
        # Validation: fit on the first window, evaluate on the second.
        train_df = df1
        test_df = df2
        train_y = train_y1
        val_y = train_y2
    else:
        test_df = load_or_build(
            'test_df.csv', lambda: get_features_ks(raw_test,15,30,app,video,act))
        # pd.concat replaces the removed DataFrame.append / Series.append.
        train_df = pd.concat([df1, df2])
        train_y = pd.concat([train_y1, train_y2])

    del df1,df2
    gc.collect()

    # Keep the ids before get_features_all drops the user_id column.
    ids = test_df['user_id']
    train_df,test_df = get_features_all(train_df,test_df)

    # An earlier experiment handled users with act_mean# == 0 through a
    # separate model / post-filter; it is disabled and was removed from here.
    pre_train,test_y = predict_data(train_df,train_y,10,test_df,importance=1)

    if val==1:
        showresults(val_y,test_y)
        showtop(val_y,test_y,nums=15428)
        showtop(val_y,test_y,nums=15905)
        showfalse(ids,test_df,val_y,test_y)
        return ids,test_y,getbest(ids,test_y,th=0.4)
    else:
        showresults(train_y,pre_train)
        showtop(train_y,pre_train,nums=25713)
        return ids,test_y,getbest(ids,test_y,rank=22088)
def predict_24_28(val,register,app,video,act):
    """Score users registered on days 24-28 with a model fit on sliding windows.

    A training window that starts on day ``i`` takes the users registered on
    days ``i..i+4``, builds their features from the logs of days ``i..i+6``
    and labels them with ``is_active`` over days ``i+7..i+13``.

    * ``val`` truthy: windows ``i = 1..10`` are used for training; the scored
      frame holds users registered on days 17-21 (features 17-23) with
      labels from days 24-30.
    * ``val`` falsy: windows ``i = 1..17`` are used for training; the scored
      frame holds users registered on days 24-28 (features 24-30).

    All frames are cached as CSV under ``path`` and re-read when present.

    Args:
        val: Selects validation (truthy) or submission (falsy) mode.
        register: Registration frame; must provide ``register_day`` and
            ``user_id``.
        app, video, act: Log frames with a ``day`` column; ``act`` also needs
            ``page``, ``action_type``, ``author_id`` and ``video_id``.

    Returns:
        ``(ids, test_y, selected)`` where ``ids`` is the ``user_id`` column of
        the scored frame, ``test_y`` is the second value returned by
        ``predict_data`` and ``selected`` is the result of ``getbest``
        (``th=0.4`` when ``val == 1``, otherwise ``rank=5498``).
    """

    def get_features(df,d1,d2):
        """Build the per-user features from the logs of days ``d1..d2``."""
        # Copies keep the column edits below off slices of the callers' frames.
        df = df.copy()
        tapp = app[(app.day>=d1) & (app.day<=d2)].copy()
        tact = act[(act.day>=d1) & (act.day<=d2)].copy()
        tvideo = video[(video.day>=d1) & (video.day<=d2)].copy()
        # Re-base the days so that every window starts at day 0.
        tapp['day'] = tapp['day'] - d1
        tact['day'] = tact['day'] - d1
        tvideo['day'] = tvideo['day'] - d1
        lastday = d2-d1

        df['register_time'] = d2-df.register_day+1
        del df['register_day']

        # app: launches per day since registration (the raw count is dropped).
        df = docount(df,tapp,'app',['user_id'])
        df['app_mean#'] = df['app$user_id#']/df['register_time']
        del df['app$user_id#']
        gc.collect()

        # video: counts over the last 2 and the last 3 days of the window.
        df = docount(df,tvideo[tvideo.day>lastday-2],'video_last_2',['user_id'])
        df = docount(df,tvideo[tvideo.day>lastday-3],'video_last_3',['user_id'])
        gc.collect()

        # act: distance (in days) between the last action and the window end.
        df = domax(df,tact,'act',['user_id'],'day')
        df['last_act_day'] = lastday - df['act$user_id_by_day_max']+1
        del df['act$user_id_by_day_max']

        # The "last n days" slices of the action log are reused by every
        # feature family below, so they are filtered only once.
        recent = {n: tact[tact.day>lastday-n] for n in (1, 2, 3)}

        df = docount(df,recent[2],'act_last_2',['user_id'])
        df = docount(df,recent[3],'act_last_3',['user_id'])
        gc.collect()

        # Per-page counts; the (3, 2, 1) order keeps the column order stable.
        for c in [0,1,2,3]:
            for n in (3, 2, 1):
                window = recent[n]
                df = docount(df,window[window['page']==c],
                             'act_last_'+str(n)+'_page='+str(c),['user_id'])

        # doiq features over author_id and video_id for each recent slice.
        for n in (3, 2, 1):
            df = doiq(df,recent[n],'act_last_'+str(n),['user_id'],'author_id')
            df = doiq(df,recent[n],'act_last_'+str(n),['user_id'],'video_id')

        # Per-action-type counts, same layout as the page counts.
        for c in [0,1,2,3]:
            for n in (3, 2, 1):
                window = recent[n]
                df = docount(df,window[window['action_type']==c],
                             'act_last_'+str(n)+'_action_type='+str(c),['user_id'])

        gc.collect()
        return df

    def build_windows(first_days):
        """Return ``(feature_frames, label_frames)`` for the given start days."""
        frames, labels = [], []
        for i in first_days:
            df = register[(register.register_day>=i) & (register.register_day<=i+4)]
            # Labels are taken before the features, as in the original loop.
            labels.append(is_active(df,i+7,i+13,app,video,act))
            frames.append(get_features(df,i,i+6))
        return frames, labels

    def get_features_all(df,df1):
        """Stack train and test, drop ``user_id`` and split them back."""
        lendf = len(df)
        # DataFrame.append was removed in pandas 2.0; pd.concat is equivalent.
        df = pd.concat([df, df1])
        del df1
        gc.collect()

        del df['user_id']

        return df[:lendf], df[lendf:]

    path = '../data1/24_28/'
    if val:
        if os.path.exists(path+'val_df.csv'):
            test_df = pd.read_csv(path+'val_df.csv')
            val_y = pd.read_csv(path+'val_y.csv')
        else:
            test_df = register[(register.register_day>=17) & (register.register_day<=21)]
            test_df = get_features(test_df,17,23)
            val_y = is_active(test_df,24,30,app,video,act)
            test_df.to_csv(path+'val_df.csv',index=False)
            val_y.to_csv(path+'val_y.csv',index=False)
        val_y = val_y['Y']

        if os.path.exists(path+'val_train_df.csv'):
            train_df = pd.read_csv(path+'val_train_df.csv')
            train_y = pd.read_csv(path+'val_train_y.csv')
        else:
            # One concat at the end replaces the removed DataFrame.append and
            # avoids re-copying the accumulated frame on every iteration.
            frames, labels = build_windows(range(1,11))
            train_df = pd.concat(frames)
            train_y = pd.concat(labels)
            train_df.to_csv(path+'val_train_df.csv',index=False)
            train_y.to_csv(path+'val_train_y.csv',index=False)
    else:
        if os.path.exists(path+'test_df.csv'):
            test_df = pd.read_csv(path+'test_df.csv')
        else:
            test_df = register[(register.register_day>=24) & (register.register_day<=28)]
            test_df = get_features(test_df,24,30)
            test_df.to_csv(path+'test_df.csv',index=False)

        if os.path.exists(path+'train_df.csv'):
            train_df = pd.read_csv(path+'train_df.csv')
            train_y = pd.read_csv(path+'train_y.csv')
        else:
            if os.path.exists(path+'val_train_df.csv'):
                # Windows 1..10 are already cached by a validation run; only
                # windows 11..17 have to be added.
                cached_df = pd.read_csv(path+'val_train_df.csv')
                cached_y = pd.read_csv(path+'val_train_y.csv')
                frames, labels = build_windows(range(11,18))
                train_df = pd.concat([cached_df] + frames)
                train_y = pd.concat([cached_y] + labels)
            else:
                frames, labels = build_windows(range(1,18))
                train_df = pd.concat(frames)
                train_y = pd.concat(labels)
            train_df.to_csv(path+'train_df.csv',index=False)
            train_y.to_csv(path+'train_y.csv',index=False)
    train_y = train_y['Y']

    # Keep the ids before get_features_all drops the user_id column.
    ids = test_df['user_id']
    train_df,test_df = get_features_all(train_df,test_df)

    pre_train,test_y = predict_data(train_df,train_y,10,test_df,importance=1)
    if val==1:
        print (len(train_y),sum(train_y))
        showresults(val_y,test_y)
        showtop(val_y,test_y,nums=4723)
        showtop(train_y,pre_train,nums=38507)
        return ids,test_y,getbest(ids,test_y,th=0.4)
    else:
        showresults(train_y,pre_train)
        showtop(train_y,pre_train,nums=70275)
        return ids,test_y,getbest(ids,test_y,rank=5498)