def predict_30(val, register, app, video, act):
    """Train and apply the model built from registration-day activity only.

    Training rows are the users registered on each of days 1..23, described
    by features computed from that same day of ``video`` and ``act`` and
    labelled by ``is_active(df, day + 1, day + 7, ...)``.  Intermediate frames
    are cached as CSV files under ``../data1/30/``.

    Args:
        val: Truthy to hold out the tail of the training frame for
            validation; falsy to predict for users registered on day 30.
        register: Registration log; needs ``user_id``, ``register_day``,
            ``register_type`` and ``device_type`` columns.
        app: App-launch log, only forwarded to ``is_active``.
        video: Video log with ``user_id`` and ``day`` columns.
        act: Activity log with ``user_id``, ``day``, ``page``,
            ``action_type``, ``video_id`` and ``author_id`` columns.

    Returns:
        Tuple ``(ids, test_y, selected)``: the ``user_id`` column of the
        scored frame, the scores returned by ``predict_data``, and the result
        of ``getbest`` (threshold 0.4 when ``val == 1``, rank 1490 otherwise).
    """
    path = '../data1/30/'

    def iszero(s):
        # Despite the name this is a "non-zero" flag: 0 -> 0, anything else -> 1.
        if s == 0:
            return 0
        return 1

    def bigact(s):
        # Bucket a count into steps of ten, capped at bucket 5 for s >= 50.
        if s >= 50:
            return 5
        return int(s / 10)

    def get_features0(df, d):
        """Attach per-user counts and ratios computed from day ``d`` only."""
        tvideo = video[video.day == d]
        tact = act[act.day == d]

        df = docount(df, tvideo, 'video', ['user_id'])
        gc.collect()
        df['videorate'] = df['video$user_id#'] / (tvideo.shape[0] + 0.000001)
        df = docount(df, tact, 'act', ['user_id'])
        gc.collect()
        df['actrate'] = df['act$user_id#'] / (tact.shape[0] + 0.000001)

        for c in [0, 1, 2, 3, 4]:
            name = 'act_page=' + str(c)
            df = docount(df, tact[tact['page'] == c], name, ['user_id'])
            gc.collect()
            df[name + '$user_id#rate'] = df[name + '$user_id#'] / (
                df['act$user_id#'] + 0.00001)
        df['act_page=23$user_id#'] = (df['act_page=2$user_id#'] +
                                      df['act_page=3$user_id#'])
        df['act_page=023$user_id#'] = (df['act_page=2$user_id#'] +
                                       df['act_page=3$user_id#'] +
                                       df['act_page=0$user_id#'])

        for c in [0, 1, 2, 3, 4, 5]:
            name = 'action_type=' + str(c)
            df = docount(df, tact[tact['action_type'] == c], name,
                         ['user_id'])
            gc.collect()
            df[name + '$user_id#rate'] = df[name + '$user_id#'] / (
                df['act$user_id#'] + 0.00001)
        df['action_type=01$user_id#'] = (df['action_type=0$user_id#'] +
                                         df['action_type=1$user_id#'])

        # NOTE(review): 'pageall' ends up as flag(page 0 count) multiplied
        # five times by the action_type=0 count, and 'actionall' is only
        # flag(action_type=0 count).  The names suggest "all pages / all
        # actions seen" was intended, but 'pageall' is a model input, so the
        # values are kept exactly as they were computed before - confirm
        # before changing.
        df['pageall'] = df['act_page=0$user_id#'].apply(iszero)
        for _ in range(5):
            df['pageall'] = df['pageall'] * df['action_type=0$user_id#']
        df['actionall'] = df['action_type=0$user_id#'].apply(iszero)

        df['act0'] = df['act$user_id#'].apply(iszero)
        df['video0'] = df['video$user_id#'].apply(iszero)
        df['act$user_id#10'] = df['act$user_id#'].apply(bigact)

        # Count rows of tact whose author_id equals the user's own id.
        df['author_id'] = df['user_id']
        df = docount(df, tact, 'act', ['author_id'])
        gc.collect()
        df = doiq(df, tact, 'act', ['user_id'], 'video_id')
        gc.collect()
        df = doiq(df, tact, 'act', ['user_id'], 'author_id')
        gc.collect()
        df['act$author_video_m'] = (df['act$user_id_by_video_id_iq'] /
                                    df['act$user_id_by_author_id_iq'])
        del df['register_day'], df['author_id']
        return df

    def get_features_all(df, df1):
        """Add group statistics over train+test together and select columns."""
        lendf = len(df)
        # DataFrame.append no longer exists in pandas >= 2.0.
        df = pd.concat([df, df1])
        gc.collect()
        df = domean(df, df, 'All', ['register_type'], 'act$user_id#')
        gc.collect()
        df = docount(df, df, 'ALL', ['register_type'])
        df = docount(df, df, 'ALL', ['device_type'])
        del df['user_id']
        used = [
            'device_type',
            'register_type',
            'actrate',
            'action_type=0$user_id#rate',
            'videorate',
            'act_page=1$user_id#rate',
            'action_type=2$user_id#rate',
            'act_page=3$user_id#rate',
            'act_page=0$user_id#',
            'action_type=0$user_id#',
            'act_page=0$user_id#rate',
            'pageall',
            'act_page=023$user_id#',
            'video0',
            'All$register_type_by_act$user_id#_mean',
            'ALL$register_type#',
        ]
        df = df[used]
        # Split back by position into the original train / test parts.
        return df.iloc[:lendf], df.iloc[lendf:]

    if os.path.exists(path + 'train_df.csv'):
        train_df = pd.read_csv(path + 'train_df.csv')
        train_y = pd.read_csv(path + 'train_y.csv')
    else:
        frames = []
        labels = []
        rows = 0
        for i in range(1, 24):
            df = register[register.register_day == i]
            y = is_active(df, i + 1, i + 7, app, video, act)
            df = get_features0(df, i)
            frames.append(df)
            labels.append(y)
            rows += len(df)
            if i == 22:
                # Number of training rows through day 22 (validation split).
                print(rows)
        train_df = pd.concat(frames)
        train_y = pd.concat(labels)
        train_df.to_csv(path + 'train_df.csv', index=False)
        train_y.to_csv(path + 'train_y.csv', index=False)
    train_y = train_y['Y']

    if val:
        # Hard-coded split position; it is fixed here because the value is
        # not available when the frames are loaded from the CSV cache.
        valst = 35134
        test_df = train_df.iloc[valst:]
        val_y = train_y.iloc[valst:]
        train_df = train_df.iloc[:valst]
        train_y = train_y.iloc[:valst]
    else:
        if os.path.exists(path + 'test_df.csv'):
            test_df = pd.read_csv(path + 'test_df.csv')
        else:
            test_df = register[register.register_day == 30]
            test_df = get_features0(test_df, 30)
            test_df.to_csv(path + 'test_df.csv', index=False)

    ids = test_df['user_id']
    train_df, test_df = get_features_all(train_df, test_df)
    pre_train, test_y = predict_data(train_df, train_y, 10, test_df,
                                     importance=1)
    if val == 1:
        print(len(train_y), sum(train_y))
        showresults(train_y, pre_train)
        showresults(val_y, test_y)
        showfalse(ids, test_df, val_y, test_y)
        showtop(val_y, test_y, nums=1457)
        showtop(train_y, pre_train, nums=23260)
        return ids, test_y, getbest(ids, test_y, th=0.4)
    else:
        showresults(train_y, pre_train)
        showtop(train_y, pre_train, nums=24717)
        return ids, test_y, getbest(ids, test_y, rank=1490)
def predict_29(val, register, app, video, act):
    """Train and apply the model built from a user's first two days.

    Training rows are the users registered on day ``i``, described by
    features from days ``i..i+1`` and labelled by
    ``is_active(df, i + 2, i + 8, ...)``.  Intermediate frames are cached as
    CSV files under ``../data1/29/``.

    Args:
        val: Truthy to validate on users registered on day 22 (training on
            days 1..21); falsy to train on days 1..22 and predict for users
            registered on day 29.
        register: Registration log; needs ``user_id``, ``register_day``,
            ``register_type`` and ``device_type`` columns.
        app: App-launch log with ``user_id`` and ``day`` columns.
        video: Video log with ``user_id`` and ``day`` columns.
        act: Activity log with ``user_id``, ``day``, ``page``,
            ``action_type``, ``video_id`` and ``author_id`` columns.

    Returns:
        Tuple ``(ids, test_y, selected)``: the ``user_id`` column of the
        scored frame, the scores returned by ``predict_data``, and the result
        of ``getbest`` (threshold 0.4 when ``val == 1``, rank 1294 otherwise).
    """
    path = '../data1/29/'

    def iszero(s):
        # Despite the name this is a "non-zero" flag: 0 -> 0, anything else -> 1.
        if s == 0:
            return 0
        return 1

    def get_features(df, d1, d2):
        """Attach per-user features computed from days ``d1..d2`` inclusive."""
        tapp = app[(app.day >= d1) & (app.day <= d2)]
        tact = act[(act.day >= d1) & (act.day <= d2)]
        tvideo = video[(video.day >= d1) & (video.day <= d2)]
        # Re-base days so the window starts at 0.  assign() builds new frames
        # instead of writing into filtered slices of the inputs.
        tapp = tapp.assign(day=tapp.day - d1)
        tact = tact.assign(day=tact.day - d1)
        tvideo = tvideo.assign(day=tvideo.day - d1)
        lastday = d2 - d1

        last_act = tact[tact.day == lastday]
        first_act = tact[tact.day == lastday - 1]
        last2_act = tact[tact.day >= lastday - 1]

        df = docount(df, tapp, 'app', ['user_id'])
        gc.collect()
        df = docount(df, tapp[tapp.day == lastday], 'last_day_app',
                     ['user_id'])
        gc.collect()
        df = docount(df, tvideo, 'video', ['user_id'])
        gc.collect()
        df['videorate'] = df['video$user_id#'] / (tvideo.shape[0] + 0.000001)

        df = docount(df, tact, 'act', ['user_id'])
        gc.collect()
        df = docount(df, last_act, 'last_day_act', ['user_id'])
        gc.collect()
        df = docount(df, first_act, 'first_day_act', ['user_id'])
        gc.collect()
        total_acts = tact.shape[0] + 0.000001
        df['actrate'] = df['act$user_id#'] / total_acts
        df['last_day_actrate'] = df['last_day_act$user_id#'] / total_acts
        df['first_day_actrate'] = df['first_day_act$user_id#'] / total_acts
        df['actrate_gap'] = df['last_day_actrate'] - df['first_day_actrate']
        df['act_gap'] = (df['last_day_act$user_id#'] -
                         df['first_day_act$user_id#'])

        df['act0'] = df['act$user_id#'].apply(iszero)
        df['video0'] = df['video$user_id#'].apply(iszero)

        for c in [1]:
            # One combined mask instead of tact[mask1][mask2], which made
            # pandas reindex the second boolean key against the first result.
            df = docount(df,
                         tact[(tact.day == lastday) & (tact['page'] == c)],
                         'last_day_act_page=' + str(c), ['user_id'])
            gc.collect()
        for c in [0, 1, 2, 3, 4]:
            name = 'act_page=' + str(c)
            df = docount(df, tact[tact['page'] == c], name, ['user_id'])
            gc.collect()
            df[name + '$user_id#rate'] = df[name + '$user_id#'] / (
                df['act$user_id#'] + 0.00001)
        df['act_page=23$user_id#'] = (df['act_page=2$user_id#'] +
                                      df['act_page=3$user_id#'])
        df['act_page=023$user_id#'] = (df['act_page=2$user_id#'] +
                                       df['act_page=3$user_id#'] +
                                       df['act_page=0$user_id#'])

        for c in [0, 1, 2, 3, 4, 5]:
            name = 'action_type=' + str(c)
            df = docount(df, tact[tact['action_type'] == c], name,
                         ['user_id'])
            gc.collect()
            df = docount(
                df,
                tact[(tact.day == lastday) & (tact['action_type'] == c)],
                'last_day_action_type=' + str(c), ['user_id'])
            gc.collect()
            df[name + '$user_id#rate'] = df[name + '$user_id#'] / (
                df['act$user_id#'] + 0.00001)

        df['author_id'] = df['user_id']
        for events, prefix in ((last_act, 'last_day_act'),
                               (first_act, 'first_day_act'),
                               (last2_act, 'last2_day_act')):
            df = doiq(df, events, prefix, ['user_id'], 'video_id')
            gc.collect()
            df = doiq(df, events, prefix, ['user_id'], 'author_id')
            gc.collect()
            df[prefix + '$author_video_m'] = (
                df[prefix + '$user_id_by_video_id_iq'] /
                df[prefix + '$user_id_by_author_id_iq'])
        del df['register_day'], df['author_id']
        return df

    def get_features_all(df, df1):
        """Add group statistics over train+test together and select columns."""
        lendf = len(df)
        # DataFrame.append no longer exists in pandas >= 2.0.
        df = pd.concat([df, df1])
        gc.collect()
        df = docount(df, df, 'ALL', ['register_type'])
        del df['user_id']
        used = [
            'device_type',
            'register_type',
            'action_type=0$user_id#rate',
            'act_page=1$user_id#',
            'first_day_act$user_id_by_author_id_iq',
            'act_page=0$user_id#rate',
            'last_day_act$author_video_m',
            'act_page=2$user_id#',
            'actrate',
            'app$user_id#',
            'last_day_action_type=0$user_id#',
            'first_day_actrate',
            'action_type=5$user_id#rate',
        ]
        df = df[used]
        # Split back by position into the original train / test parts.
        return df.iloc[:lendf], df.iloc[lendf:]

    def build_training(days):
        """Compute features and labels for every registration day in ``days``."""
        frames = []
        labels = []
        for i in days:
            df = register[(register.register_day == i)]
            y = is_active(df, i + 2, i + 8, app, video, act)
            frames.append(get_features(df, i, i + 1))
            labels.append(y)
        return pd.concat(frames), pd.concat(labels)

    if val:
        if os.path.exists(path + 'val_df.csv'):
            test_df = pd.read_csv(path + 'val_df.csv')
            val_y = pd.read_csv(path + 'val_y.csv')
        else:
            test_df = register[(register.register_day == 22)]
            test_df = get_features(test_df, 22, 23)
            val_y = is_active(test_df, 24, 30, app, video, act)
            test_df.to_csv(path + 'val_df.csv', index=False)
            val_y.to_csv(path + 'val_y.csv', index=False)
        val_y = val_y['Y']
        if os.path.exists(path + 'val_train_df.csv'):
            train_df = pd.read_csv(path + 'val_train_df.csv')
            train_y = pd.read_csv(path + 'val_train_y.csv')
        else:
            train_df, train_y = build_training(range(1, 22))
            train_df.to_csv(path + 'val_train_df.csv', index=False)
            train_y.to_csv(path + 'val_train_y.csv', index=False)
    else:
        if os.path.exists(path + 'test_df.csv'):
            test_df = pd.read_csv(path + 'test_df.csv')
        else:
            test_df = register[(register.register_day == 29)]
            test_df = get_features(test_df, 29, 30)
            test_df.to_csv(path + 'test_df.csv', index=False)
        if os.path.exists(path + 'train_df.csv'):
            train_df = pd.read_csv(path + 'train_df.csv')
            train_y = pd.read_csv(path + 'train_y.csv')
        else:
            if os.path.exists(path + 'val_train_df.csv'):
                # Reuse the validation caches: days 1..21 plus day 22.
                # Assumes val_df.csv / val_y.csv exist alongside them.
                train_df = pd.concat([
                    pd.read_csv(path + 'val_train_df.csv'),
                    pd.read_csv(path + 'val_df.csv'),
                ])
                train_y = pd.concat([
                    pd.read_csv(path + 'val_train_y.csv'),
                    pd.read_csv(path + 'val_y.csv'),
                ])
            else:
                train_df, train_y = build_training(range(1, 23))
            train_df.to_csv(path + 'train_df.csv', index=False)
            train_y.to_csv(path + 'train_y.csv', index=False)
    train_y = train_y['Y']

    ids = test_df['user_id']
    train_df, test_df = get_features_all(train_df, test_df)
    pre_train, test_y = predict_data(train_df, train_y, 10, test_df,
                                     importance=1)
    if val == 1:
        print(len(train_y), sum(train_y))
        showresults(train_y, pre_train)
        showresults(val_y, test_y)
        showtop(val_y, test_y, nums=1337)
        showtop(train_y, pre_train, nums=19589)
        return ids, test_y, getbest(ids, test_y, th=0.4)
    else:
        showresults(train_y, pre_train)
        showtop(train_y, pre_train, nums=20926)
        return ids, test_y, getbest(ids, test_y, rank=1294)
def predict_1_23(val, register, app, video, act):
    """Train and apply the model for users registered on days 1..23.

    Two training sets are built: users registered before day 10 with features
    from days 1..16 (labels from ``is_active(df, 17, 23, ...)``) and users
    registered before day 17 with features from days 1..23 (labels from
    ``is_active(df, 24, 30, ...)``).  Intermediate frames are cached as CSV
    files under ``../data1/1_23/``.

    Args:
        val: Truthy to train on the first set and validate on the second;
            falsy to train on both and predict for users registered before
            day 24 using features from days 1..30.
        register: Registration log; needs ``user_id`` and ``register_day``
            columns.
        app: App-launch log with ``user_id`` and ``day`` columns.
        video: Video log with ``user_id`` and ``day`` columns.
        act: Activity log with ``user_id``, ``day``, ``page``,
            ``action_type``, ``video_id`` and ``author_id`` columns.

    Returns:
        Tuple ``(ids, test_y, selected)``: the ``user_id`` column of the
        scored frame, the scores returned by ``predict_data``, and the result
        of ``getbest`` (rank 10705 when ``val == 1``, threshold 0.4
        otherwise).
    """
    path = '../data1/1_23/'

    def merge_daily_count_stats(df, events, prefix):
        """Left-merge max/mean/var of each user's per-day row counts."""
        per_day = events.groupby(['user_id', 'day']).size().unstack()
        stats = (
            ('max', per_day.max(axis=1)),
            ('mean', per_day.mean(axis=1)),
            ('var', per_day.var(axis=1)),
        )
        for stat_name, values in stats:
            column = values.rename(prefix + 'actcount_' + stat_name)
            df = pd.merge(df, column.reset_index(), on=['user_id'],
                          how='left')
        return df

    def get_features(df, d1, d2):
        """Attach per-user features computed from days ``d1..d2`` inclusive.

        ``df`` must already carry a ``register_time`` column; it is used as
        the divisor for the per-day averages.
        """
        tapp = app[(app.day >= d1) & (app.day <= d2)]
        tact = act[(act.day >= d1) & (act.day <= d2)]
        tvideo = video[(video.day >= d1) & (video.day <= d2)]
        # Re-base days so the window starts at 0.  assign() builds new frames
        # instead of writing into filtered slices of the inputs.
        tapp = tapp.assign(day=tapp.day - d1)
        tact = tact.assign(day=tact.day - d1)
        tvideo = tvideo.assign(day=tvideo.day - d1)
        lastday = d2 - d1

        # --- app launches ---
        df = docount(df, tapp, 'app', ['user_id'])
        df = domax(df, tapp, 'app', ['user_id'], 'day')
        df['last_app_day'] = lastday - df['app$user_id_by_day_max'] + 1
        df['app_day_missing'] = df['register_time'] - df['app$user_id#']
        df['app_mean#'] = df['app$user_id#'] / df['register_time']
        del df['app$user_id#'], df['app$user_id_by_day_max']
        df = dovar(df, tapp, 'app', ['user_id'], 'day')
        for i in range(8):
            name = 'app_last_' + str(i)
            recent = tapp[tapp.day >= lastday - i]
            df = docount(df, recent, name, ['user_id'])
            if i >= 3:
                df = domean(df, recent, name, ['user_id'], 'day')
                df = dovar(df, recent, name, ['user_id'], 'day')
        gc.collect()

        # --- videos ---
        df = docount(df, tvideo, 'video', ['user_id'])
        df = domin(df, tvideo, 'video', ['user_id'], 'day')
        df = domax(df, tvideo, 'video', ['user_id'], 'day')
        df = doiq(df, tvideo, 'video', ['user_id'], 'day')
        df['last_video_day'] = lastday - df['video$user_id_by_day_max'] + 1
        df['first_video_day'] = lastday - df['video$user_id_by_day_min'] + 1
        df['video_day_gap'] = (df['video$user_id_by_day_max'] -
                               df['video$user_id_by_day_min'] + 1)
        df['video_mean#'] = df['video$user_id#'] / df['register_time']
        del (df['video$user_id#'], df['video$user_id_by_day_max'],
             df['video$user_id_by_day_min'])
        df = dovar(df, tvideo, 'video', ['user_id'], 'day')
        recent_video = tvideo[tvideo.day > lastday - 8]
        df = domean(df, recent_video, 'video_last_8', ['user_id'], 'day')
        df = dovar(df, recent_video, 'video_last_8', ['user_id'], 'day')
        df = docount(df, recent_video, 'video_last_8', ['user_id'])
        gc.collect()

        # --- activity ---
        df = merge_daily_count_stats(df, tact, '')
        df = docount(df, tact, 'act', ['user_id'])
        df = domin(df, tact, 'act', ['user_id'], 'day')
        df = domax(df, tact, 'act', ['user_id'], 'day')
        df = doiq(df, tact, 'act', ['user_id'], 'day')
        df['act_day_gap'] = (df['act$user_id_by_day_max'] -
                             df['act$user_id_by_day_min'] + 1)
        df['act_day_missing'] = (df['register_time'] -
                                 df['act$user_id_by_day_iq'])
        df['act_mean#'] = df['act$user_id#'] / df['register_time']
        del df['act$user_id#']
        df = dovar(df, tact, 'act', ['user_id'], 'day')
        for i in range(8):
            name = 'act_last_' + str(i)
            recent = tact[tact.day >= lastday - i]
            df = docount(df, recent, name, ['user_id'])
            if i >= 3:
                df = domean(df, recent, name, ['user_id'], 'day')
                df = dovar(df, recent, name, ['user_id'], 'day')
                df = merge_daily_count_stats(df, recent, name + '_')
        gc.collect()

        page_list = list(tact['page'].unique())
        for c in page_list:
            column = 'act_page=' + str(c) + '$user_id#'
            df = docount(df, tact[tact['page'] == c], 'act_page=' + str(c),
                         ['user_id'])
            df[column] = df[column] / df['register_time']
        for c in page_list:
            df = docount(df,
                         tact[(tact['page'] == c) & (tact.day > lastday - 8)],
                         'act_last_8_page=' + str(c), ['user_id'])
        for c in page_list:
            df = docount(df,
                         tact[(tact['page'] == c) & (tact.day > lastday - 3)],
                         'act_last_3_page=' + str(c), ['user_id'])

        # Count rows of tact whose author_id equals the user's own id.
        df['author_id'] = df['user_id']
        df = docount(df, tact, 'act', ['author_id'])
        df['act$author_id#'] = df['act$author_id#'] / df['register_time']
        df = doiq(df, tact, 'act', ['user_id'], 'author_id')
        df['act$user_id_by_author_id_iq'] = (
            df['act$user_id_by_author_id_iq'] / df['register_time'])
        df = doiq(df, tact, 'act', ['user_id'], 'video_id')
        df['act$user_id_by_video_id_iq'] = (
            df['act$user_id_by_video_id_iq'] / df['register_time'])
        for i in range(8):
            name = 'act_last_' + str(i)
            recent = tact[tact.day >= lastday - i]
            df = doiq(df, recent, name, ['user_id'], 'author_id')
            df = doiq(df, recent, name, ['user_id'], 'video_id')

        for c in [0, 1, 2, 3, 5]:
            column = 'action_type=' + str(c) + '$user_id#'
            df = docount(df, tact[tact['action_type'] == c],
                         'action_type=' + str(c), ['user_id'])
            gc.collect()
            df[column] = df[column] / df['register_time']
        for c in [0, 1, 2, 3]:
            df = docount(
                df,
                tact[(tact['action_type'] == c) & (tact.day > lastday - 8)],
                'act_last_8_action_type=' + str(c), ['user_id'])
        for c in [0, 1, 2, 3]:
            df = docount(
                df,
                tact[(tact['action_type'] == c) & (tact.day > lastday - 3)],
                'act_last_3_action_type=' + str(c), ['user_id'])

        del df['author_id']
        gc.collect()
        return df

    def get_features_all(df, df1):
        """Drop ``user_id`` from both frames, keeping their column sets aligned."""
        lendf = len(df)
        # DataFrame.append no longer exists in pandas >= 2.0.
        df = pd.concat([df, df1])
        gc.collect()
        del df['user_id']
        # Split back by position into the original train / test parts.
        return df.iloc[:lendf], df.iloc[lendf:]

    # Work on explicit copies so new columns are not written into filtered
    # slices of ``register``.  register_time counts the days from
    # registration to the end of each feature window, inclusive.
    df1 = register[register.register_day < 10].copy()
    df1['register_time'] = 17 - df1.register_day
    df2 = register[register.register_day < 17].copy()
    df2['register_time'] = 24 - df2.register_day
    test_df = register[register.register_day < 24].copy()
    test_df['register_time'] = 31 - test_df.register_day
    del df1['register_day'], df2['register_day'], test_df['register_day']

    if os.path.exists(path + 'train_y1.csv'):
        train_y1 = pd.read_csv(path + 'train_y1.csv')
    else:
        train_y1 = is_active(df1, 17, 23, app, video, act)
        train_y1.to_csv(path + 'train_y1.csv', index=False)
    train_y1 = train_y1['Y']
    if os.path.exists(path + 'train_y2.csv'):
        train_y2 = pd.read_csv(path + 'train_y2.csv')
    else:
        train_y2 = is_active(df2, 24, 30, app, video, act)
        train_y2.to_csv(path + 'train_y2.csv', index=False)
    train_y2 = train_y2['Y']

    if os.path.exists(path + 'df1.csv'):
        df1 = pd.read_csv(path + 'df1.csv')
    else:
        df1 = get_features(df1, 1, 16)
        df1.to_csv(path + 'df1.csv', index=False)
    if os.path.exists(path + 'df2.csv'):
        df2 = pd.read_csv(path + 'df2.csv')
    else:
        df2 = get_features(df2, 1, 23)
        df2.to_csv(path + 'df2.csv', index=False)

    if val:
        train_df = df1
        test_df = df2
        train_y = train_y1
        val_y = train_y2
    else:
        if os.path.exists(path + 'test_df.csv'):
            test_df = pd.read_csv(path + 'test_df.csv')
        else:
            test_df = get_features(test_df, 1, 30)
            test_df.to_csv(path + 'test_df.csv', index=False)
        train_df = pd.concat([df1, df2])
        # Series.append no longer exists in pandas >= 2.0 either.
        train_y = pd.concat([train_y1, train_y2])
    del df1, df2
    gc.collect()

    ids = test_df['user_id']
    train_df, test_df = get_features_all(train_df, test_df)
    pre_train, test_y = predict_data(train_df, train_y, 10, test_df,
                                     importance=1)
    if val == 1:
        showresults(val_y, test_y)
        showtop(val_y, test_y, nums=10705)
        return ids, test_y, getbest(ids, test_y, rank=10705)
    else:
        showresults(train_y, pre_train)
        showtop(train_y, pre_train, nums=16449)
        return ids, test_y, getbest(ids, test_y, th=0.4)
nb=99) sp = 1 if val == 1: showresults(val_y, test_y) #showtop(val_y,test_y,nums=18223) showtop(val_y, test_y, nums=15428) showfalse(ids, test_df, val_y, test_y) else: showresults(train_y, pre_train) if sp: df_1_28 = register[register.register_day <= 28] #df_29_30 = register[register.register_day>28] ans_1_28 = getbest1(df_1_28, ids, test_y, rank=22088) #ans_29_30 = getbest1(df_29_30,ids,test_y,th=0.4) #print (len(ans_1_28),len(ans_29_30)) from predict_30 import predict_30 from predict_29 import predict_29 ids29, test_y29, ans29 = predict_29(val, register, app, video, act) ids30, test_y30, ans30 = predict_30(val, register, app, video, act) ans = ans_1_28 + ans29 + ans30 else: ans = getbest(ids, test_y, rank=22088) print(len(ans)) import time name = time.strftime('%Y-%m-%d_%H_%M_%S', time.localtime(time.time())) submission = pd.DataFrame({'user_id': ans}) submission.to_csv('ksn_submit' + name + '.csv', index=False, header=None)
def predict_1_28(val, register, app, video, act):
    """Train on two historical windows and score users registered on days 1-28.

    Two training sets are derived from ``register``:

    * users with ``register_day < 15``: features from ``get_features_ks`` over
      days 1-16, labels from ``is_active`` over days 17-23;
    * users with ``register_day < 22``: features over days 8-23, labels over
      days 24-30.

    When ``val`` is truthy the first set is used for training and the second
    for validation.  Otherwise both sets are used for training and the users
    with ``register_day < 29`` (features over days 15-30) are scored.

    Every intermediate frame is cached as CSV under ``../data1/1_28/`` and is
    re-read instead of recomputed when the file already exists.

    Args:
        val: Truthy to run in validation mode; the validation report and the
            threshold-based selection are used only when ``val == 1``.
        register: DataFrame with at least ``register_day`` and ``user_id``
            columns; ``register_type`` and ``device_type`` are passed to
            ``docount`` as grouping keys.
        app: Forwarded unchanged to ``is_active`` / ``get_features_ks``.
        video: Forwarded unchanged to ``is_active`` / ``get_features_ks``.
        act: Forwarded unchanged to ``is_active`` / ``get_features_ks``.

    Returns:
        A tuple ``(ids, test_y, selected)`` where ``ids`` is the ``user_id``
        column of the scored frame, ``test_y`` is the second value returned by
        ``predict_data`` and ``selected`` is the result of ``getbest``
        (``th=0.4`` when ``val == 1``, ``rank=22088`` otherwise).
    """
    path = '../data1/1_28/'
    # Every feature window below (1-16, 8-23, 15-30) spans 16 days, so
    # ``register_time`` is capped at that length.
    max_register_time = 16

    def _load_or_build(filename, build):
        """Return the cached CSV ``path + filename`` or build and cache it."""
        target = path + filename
        if os.path.exists(target):
            return pd.read_csv(target)
        frame = build()
        frame.to_csv(target, index=False)
        return frame

    def get_features_all(df, df1):
        """Add counts computed over train+test together, then split again."""
        lendf = len(df)
        # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
        df = pd.concat([df, df1])
        gc.collect()
        df = docount(df, df, 'ALL', ['register_type'])
        df = docount(df, df, 'ALL', ['device_type'])
        del df['user_id']
        # Positional split: assumes docount keeps the row order -- TODO confirm.
        return df[:lendf], df[lendf:]

    # Work on explicit copies so that adding/removing columns never touches
    # (or warns about) the caller's ``register`` frame.
    reg1 = register[register.register_day < 15].copy()
    reg1['register_time'] = 17 - reg1.register_day  # at most 16 already

    # The original tried to cap these with
    # ``df2[df2['register_time'] > 16]['register_time'] = 16``, which assigns
    # into a temporary copy and therefore did nothing; the second occurrence
    # also targeted df2 instead of test_df.  ``clip`` applies the intended cap.
    # NOTE: cached df2.csv / test_df.csv written before this fix still hold
    # features derived from the uncapped values; delete them to rebuild.
    reg2 = register[register.register_day < 22].copy()
    reg2['register_time'] = (24 - reg2.register_day).clip(
        upper=max_register_time)

    reg_test = register[register.register_day < 29].copy()
    reg_test['register_time'] = (31 - reg_test.register_day).clip(
        upper=max_register_time)

    del reg1['register_day'], reg2['register_day'], reg_test['register_day']

    # Labels for the two training windows.
    train_y1 = _load_or_build(
        'train_y1.csv', lambda: is_active(reg1, 17, 23, app, video, act))['Y']
    train_y2 = _load_or_build(
        'train_y2.csv', lambda: is_active(reg2, 24, 30, app, video, act))['Y']

    # Features for the two training windows.
    df1 = _load_or_build(
        'df1.csv', lambda: get_features_ks(reg1, 1, 16, app, video, act))
    df2 = _load_or_build(
        'df2.csv', lambda: get_features_ks(reg2, 8, 23, app, video, act))

    if val:
        # Validation: train on the earlier window, evaluate on the later one.
        train_df = df1
        test_df = df2
        train_y = train_y1
        val_y = train_y2
    else:
        test_df = _load_or_build(
            'test_df.csv',
            lambda: get_features_ks(reg_test, 15, 30, app, video, act))
        train_df = pd.concat([df1, df2])
        train_y = pd.concat([train_y1, train_y2])
        # The concatenated copies are all that is needed from here on.
        del df1, df2
        gc.collect()

    # Keep the ids before get_features_all drops the user_id column.
    ids = test_df['user_id']
    train_df, test_df = get_features_all(train_df, test_df)

    pre_train, test_y = predict_data(train_df, train_y, 10, test_df,
                                     importance=1)

    if val == 1:
        showresults(val_y, test_y)
        showtop(val_y, test_y, nums=15428)
        showtop(val_y, test_y, nums=15905)
        showfalse(ids, test_df, val_y, test_y)
        return ids, test_y, getbest(ids, test_y, th=0.4)
    else:
        showresults(train_y, pre_train)
        showtop(train_y, pre_train, nums=25713)
        # Alternative selection kept from the original: getbest(..., th=0.4).
        return ids, test_y, getbest(ids, test_y, rank=22088)
def predict_24_28(val, register, app, video, act):
    """Train on sliding 7-day windows and score users registered on days 24-28.

    Each training sample set takes the users whose ``register_day`` lies in
    ``[i, i + 4]``, builds features from the logs of days ``i .. i + 6`` and
    labels them with ``is_active`` over days ``i + 7 .. i + 13``.

    * Validation mode (``val`` truthy): windows ``i = 1 .. 10`` train, and
      the cohort registered on days 17-21 (features over 17-23, labels over
      24-30) is evaluated.
    * Otherwise: windows ``i = 1 .. 17`` train, and the cohort registered on
      days 24-28 (features over 24-30) is scored.

    Intermediate frames are cached as CSV under ``../data1/24_28/`` and are
    re-read instead of recomputed when the files already exist.

    Args:
        val: Truthy to run in validation mode; the validation report and the
            threshold-based selection are used only when ``val == 1``.
        register: DataFrame with at least ``register_day`` and ``user_id``.
        app: Log frame with at least a ``day`` column; grouped by ``user_id``.
        video: Log frame with at least a ``day`` column; grouped by
            ``user_id``.
        act: Log frame with at least ``day``, ``page`` and ``action_type``
            columns; ``author_id`` and ``video_id`` are passed to ``doiq``.

    Returns:
        A tuple ``(ids, test_y, selected)`` where ``ids`` is the ``user_id``
        column of the scored frame, ``test_y`` is the second value returned by
        ``predict_data`` and ``selected`` is the result of ``getbest``
        (``th=0.4`` when ``val == 1``, ``rank=5498`` otherwise).
    """

    def get_features(df, d1, d2):
        """Build per-user features from the logs of days ``d1 .. d2``."""
        # Explicit copies: the day columns are shifted below and must not
        # write through to (or warn about) the caller's frames.
        tapp = app[(app.day >= d1) & (app.day <= d2)].copy()
        tact = act[(act.day >= d1) & (act.day <= d2)].copy()
        tvideo = video[(video.day >= d1) & (video.day <= d2)].copy()
        # Re-base days so the window always runs from 0 to ``lastday``.
        tapp['day'] = tapp['day'] - d1
        tact['day'] = tact['day'] - d1
        tvideo['day'] = tvideo['day'] - d1
        lastday = d2 - d1

        df = df.copy()
        df['register_time'] = d2 - df.register_day + 1
        del df['register_day']

        # app: only the per-day average of the count is kept.
        df = docount(df, tapp, 'app', ['user_id'])
        df['app_mean#'] = df['app$user_id#'] / df['register_time']
        del df['app$user_id#']
        gc.collect()

        # video: counts over the last 2 and last 3 days of the window.
        df = docount(df, tvideo[tvideo.day > lastday - 2], 'video_last_2',
                     ['user_id'])
        df = docount(df, tvideo[tvideo.day > lastday - 3], 'video_last_3',
                     ['user_id'])
        gc.collect()

        # act: distance of the latest activity day from the end of the window.
        df = domax(df, tact, 'act', ['user_id'], 'day')
        df['last_act_day'] = lastday - df['act$user_id_by_day_max'] + 1
        del df['act$user_id_by_day_max']

        df = docount(df, tact[tact.day > lastday - 2], 'act_last_2',
                     ['user_id'])
        df = docount(df, tact[tact.day > lastday - 3], 'act_last_3',
                     ['user_id'])
        gc.collect()

        # Trailing slices of the act log, reused by the breakdowns below.
        # Iteration order (3, 2, 1) reproduces the original column order.
        recent = {n: tact[tact.day > lastday - n] for n in (3, 2, 1)}

        for c in [0, 1, 2, 3]:
            for n in (3, 2, 1):
                part = recent[n]
                df = docount(df, part[part['page'] == c],
                             'act_last_' + str(n) + '_page=' + str(c),
                             ['user_id'])

        for n in (3, 2, 1):
            df = doiq(df, recent[n], 'act_last_' + str(n), ['user_id'],
                      'author_id')
            df = doiq(df, recent[n], 'act_last_' + str(n), ['user_id'],
                      'video_id')

        for c in [0, 1, 2, 3]:
            for n in (3, 2, 1):
                part = recent[n]
                df = docount(df, part[part['action_type'] == c],
                             'act_last_' + str(n) + '_action_type=' + str(c),
                             ['user_id'])
        gc.collect()
        return df

    def _build_windows(first, stop):
        """Return feature and label frames for windows ``first .. stop - 1``."""
        feats, labels = [], []
        for i in range(first, stop):
            cohort = register[(register.register_day >= i)
                              & (register.register_day <= i + 4)]
            # Labels are computed before feature extraction, as in the
            # original code.
            labels.append(is_active(cohort, i + 7, i + 13, app, video, act))
            feats.append(get_features(cohort, i, i + 6))
        return feats, labels

    def get_features_all(df, df1):
        """Stack train and test, drop ``user_id`` and split them again."""
        lendf = len(df)
        # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
        df = pd.concat([df, df1])
        gc.collect()
        del df['user_id']
        return df[:lendf], df[lendf:]

    path = '../data1/24_28/'

    if val:
        if os.path.exists(path + 'val_df.csv'):
            test_df = pd.read_csv(path + 'val_df.csv')
            val_y = pd.read_csv(path + 'val_y.csv')
        else:
            test_df = register[(register.register_day >= 17)
                               & (register.register_day <= 21)]
            test_df = get_features(test_df, 17, 23)
            # Here labels are derived from the feature frame (original order
            # of operations preserved).
            val_y = is_active(test_df, 24, 30, app, video, act)
            test_df.to_csv(path + 'val_df.csv', index=False)
            val_y.to_csv(path + 'val_y.csv', index=False)
        val_y = val_y['Y']

        if os.path.exists(path + 'val_train_df.csv'):
            train_df = pd.read_csv(path + 'val_train_df.csv')
            train_y = pd.read_csv(path + 'val_train_y.csv')
        else:
            feats, labels = _build_windows(1, 11)
            # One concat instead of repeated append: same rows, same order,
            # without re-copying the accumulated frame on every iteration.
            train_df = pd.concat(feats)
            train_y = pd.concat(labels)
            train_df.to_csv(path + 'val_train_df.csv', index=False)
            train_y.to_csv(path + 'val_train_y.csv', index=False)
    else:
        if os.path.exists(path + 'test_df.csv'):
            test_df = pd.read_csv(path + 'test_df.csv')
        else:
            test_df = register[(register.register_day >= 24)
                               & (register.register_day <= 28)]
            test_df = get_features(test_df, 24, 30)
            test_df.to_csv(path + 'test_df.csv', index=False)

        if os.path.exists(path + 'train_df.csv'):
            train_df = pd.read_csv(path + 'train_df.csv')
            train_y = pd.read_csv(path + 'train_y.csv')
        else:
            if os.path.exists(path + 'val_train_df.csv'):
                # Windows 1-10 are already cached by a validation run; only
                # windows 11-17 have to be added.
                cached_df = pd.read_csv(path + 'val_train_df.csv')
                cached_y = pd.read_csv(path + 'val_train_y.csv')
                feats, labels = _build_windows(11, 18)
                train_df = pd.concat([cached_df] + feats)
                train_y = pd.concat([cached_y] + labels)
            else:
                feats, labels = _build_windows(1, 18)
                train_df = pd.concat(feats)
                train_y = pd.concat(labels)
            train_df.to_csv(path + 'train_df.csv', index=False)
            train_y.to_csv(path + 'train_y.csv', index=False)

    train_y = train_y['Y']

    # Keep the ids before get_features_all drops the user_id column.
    ids = test_df['user_id']
    train_df, test_df = get_features_all(train_df, test_df)

    pre_train, test_y = predict_data(train_df, train_y, 10, test_df,
                                     importance=1)

    if val == 1:
        print(len(train_y), sum(train_y))
        showresults(val_y, test_y)
        showtop(val_y, test_y, nums=4723)
        showtop(train_y, pre_train, nums=38507)
        # Alternative selection kept from the original: getbest(..., rank=4723).
        return ids, test_y, getbest(ids, test_y, th=0.4)
    else:
        showresults(train_y, pre_train)
        showtop(train_y, pre_train, nums=70275)
        return ids, test_y, getbest(ids, test_y, rank=5498)