def self_train():
    """Self-training (pseudo-labelling) loop on CIFAR-10.

    Trains an initial ``net.Net3`` model on a small labelled seed set from
    batch 1, then for a fixed number of rounds pseudo-labels the most
    confident unlabelled samples and retrains on the augmented set,
    reporting held-out test accuracy each round via ``utils.accy``.
    """
    # dict content = [b'batch_label', b'labels', b'data', b'filenames']
    batch = utils.unpickle('cifar_10/data_batch_1')       # renamed from `dict` (builtin shadowing)
    test_batch = utils.unpickle('cifar_10/data_batch_2')
    # Small labelled seed set from batch 1.
    label_data, label = utils.data_fromCIFAR10(batch, 500)
    # BUGFIX: the test set must come from the second batch, "separate from
    # label and unlabel data" (original read it from batch 1, overlapping
    # the labelled/unlabelled pools, and never used the loaded test batch).
    test_data, test_label = utils.data_fromCIFAR10(test_batch, 5000)
    # Initial model trained on the labelled seed only.
    model = net.Net3(label_data, label, save_model='test.h5', validate=0.1)
    # Unlabelled pool: the tail of batch 1; true labels kept for inspection.
    allData, allLabel = utils.data_fromCIFAR10(batch, 10000)
    unlabel_data = allData[6000:]
    true_label = allLabel[6000:]
    predict_maxSet = []
    n_iter = 10   # renamed from `iter` (builtin shadowing)
    for i in range(n_iter):
        print('Iteration {}'.format(i + 1))
        model = load_model('test.h5')
        # Class-probability scores for the unlabelled pool.
        predict_raw = utils.predict_data(unlabel_data, model)
        prediction = np.argmax(predict_raw, axis=1)
        predict_max = np.max(predict_raw)   # single global max confidence
        predict_maxSet.append(predict_max)
        # Predictions on the held-out test batch.
        test_raw = utils.predict_data(test_data, model)
        test_raw = np.argmax(test_raw, axis=1)
        test_predict = test_raw.reshape(-1, 1)
        # Pseudo-label samples whose confidence is within 0.5% of the max.
        aug_data = []
        aug_label = []
        for k in range(predict_raw.shape[0]):
            if np.max(predict_raw[k]) > (predict_max * 0.995):
                aug_data.append(unlabel_data[k])
                aug_label.append(prediction[k])
        # (Removed dead no-op `prediction.reshape(-1, 1)` whose result was
        # discarded, and the unused `count`/`index` locals.)
        new_labelData = label_data + aug_data
        new_label = label + aug_label
        utils.accy(test_predict, test_label)
        # Retrain from the previous checkpoint on the augmented set.
        model = net.Net3(new_labelData, new_label, 'test.h5', 'test.h5',
                         validate=0.05)
        del model
        if i == n_iter - 1:
            utils.accy(test_predict, test_label)
def predict_29(val,register,app,video,act):
    """Activity model for users who registered on day 29.

    Builds per-user features from the two days following registration,
    trains/predicts with ``predict_data`` and returns
    ``(ids, test_y, picks)`` where ``picks`` comes from ``getbest``.

    val -- truthy: validation mode (train on day 1-21 registrations,
           validate on day-22 registrations); falsy: train on days 1-22
           and predict for day-29 registrations.
    register/app/video/act -- presumably the raw per-event log DataFrames,
           each with a 'day' column and 'user_id' -- confirm with caller.
    Intermediate feature tables are cached as CSV under ../data1/29/.
    """
    def get_features(df,d1,d2):
        # Per-user features from the activity window [d1, d2].
        # Helper column conventions, inferred from the columns read below:
        #   docount(df, t, p, [k])   -> adds column 'p$k#' (counts per k)
        #   doiq(df, t, p, [k], c)   -> adds 'p$k_by_c_iq' (presumably the
        #     number of distinct c per k -- helpers are defined elsewhere
        #     in the project; confirm there).
        tapp = app[(app.day>=d1) & (app.day<=d2)]
        tact = act[(act.day>=d1) & (act.day<=d2)]
        tvideo = video[(video.day>=d1) & (video.day<=d2)]
        # Rebase days so the window starts at 0 (note: assigns into slices,
        # so pandas may emit SettingWithCopyWarning).
        tapp.day = tapp.day - d1
        tact.day = tact.day - d1
        tvideo.day = tvideo.day - d1
        lastday = d2-d1  # last in-window day; 1 at every call site (d2 == d1+1)
        #df['register_time'] = d2-df.register_day+1
        # App-launch counts: whole window and last day only.
        df = docount(df,tapp,'app',['user_id']);gc.collect()
        df = docount(df,tapp[tapp.day==lastday],'last_day_app',['user_id']);gc.collect()
        #df['app_mean#'] = df['app$user_id#']/2
        # Video counts and the user's share of all window video events
        # (epsilon avoids division by zero on an empty window).
        df = docount(df,tvideo,'video',['user_id']);gc.collect()
        df['videorate'] = df['video$user_id#']/(tvideo.shape[0]+0.000001)
        #df['video_mean#'] = df['video$user_id#']/2
        # Action counts: whole window, last day, and "first" day
        # (lastday-1, i.e. the earlier of the two window days).
        df = docount(df,tact,'act',['user_id']);gc.collect()
        df = docount(df,tact[tact.day==lastday],'last_day_act',['user_id']);gc.collect()
        df = docount(df,tact[tact.day==lastday-1],'first_day_act',['user_id']);gc.collect()
        df['actrate'] = df['act$user_id#']/(tact.shape[0]+0.000001)
        df['last_day_actrate'] = df['last_day_act$user_id#']/(tact.shape[0]+0.000001)
        df['first_day_actrate'] = df['first_day_act$user_id#']/(tact.shape[0]+0.000001)
        # Day-over-day activity trend features.
        df['actrate_gap'] = df['last_day_actrate'] - df['first_day_actrate']
        df['act_gap'] = df['last_day_act$user_id#'] - df['first_day_act$user_id#']
        #df['act_mean#'] = df['act$user_id#']/2
        #page_list = list(tact['page'].unique())
        def iszero(s):
            # Binary "has any activity" flag: 0 -> 0, anything else -> 1.
            if s==0:
                return 0
            return 1
        df['act0'] = df['act$user_id#'].apply(iszero)
        df['video0'] = df['video$user_id#'].apply(iszero)
        # Last-day counts for page 1 only.
        for c in [1]:
            df = docount(df,tact[tact.day==lastday][tact['page']==c],'last_day_act_page='+str(c),['user_id']);gc.collect()
        # Per-page counts plus their share of the user's total actions.
        for c in [0,1,2,3,4]:
            df = docount(df,tact[tact['page']==c],'act_page='+str(c),['user_id']);gc.collect()
            df['act_page='+str(c)+'$user_id#rate'] = df['act_page='+str(c)+'$user_id#']/(df['act$user_id#']+0.00001)
        # Combined page groupings.
        df['act_page=23$user_id#'] = df['act_page=2$user_id#'] + df['act_page=3$user_id#']
        df['act_page=023$user_id#'] = df['act_page=2$user_id#'] + df['act_page=3$user_id#']+df['act_page=0$user_id#']
        action_list = list(tact['action_type'].unique())  # NOTE(review): unused
        # Per-action-type counts (window + last day) and their shares.
        for c in [0,1,2,3,4,5]:
            df = docount(df,tact[tact['action_type']==c],'action_type='+str(c),['user_id']);gc.collect()
            df = docount(df,tact[tact.day==lastday][tact['action_type']==c],'last_day_action_type='+str(c),['user_id']);gc.collect()
            df['action_type='+str(c)+'$user_id#rate'] = df['action_type='+str(c)+'$user_id#']/(df['act$user_id#']+0.00001)
        # Mirror user_id as author_id so author-keyed features merge onto df.
        df['author_id'] = df['user_id']
        # Distinct videos/authors touched on the last day, the first day and
        # the last two days, plus videos-per-author ratios for each window.
        df = doiq(df,tact[tact.day==lastday],'last_day_act',['user_id'],'video_id');gc.collect()
        df = doiq(df,tact[tact.day==lastday],'last_day_act',['user_id'],'author_id');gc.collect()
        df['last_day_act$author_video_m'] = df['last_day_act$user_id_by_video_id_iq']/df['last_day_act$user_id_by_author_id_iq']
        df = doiq(df,tact[tact.day==lastday-1],'first_day_act',['user_id'],'video_id');gc.collect()
        df = doiq(df,tact[tact.day==lastday-1],'first_day_act',['user_id'],'author_id');gc.collect()
        df['first_day_act$author_video_m'] = df['first_day_act$user_id_by_video_id_iq']/df['first_day_act$user_id_by_author_id_iq']
        df = doiq(df,tact[tact.day>=lastday-1],'last2_day_act',['user_id'],'video_id');gc.collect()
        df = doiq(df,tact[tact.day>=lastday-1],'last2_day_act',['user_id'],'author_id');gc.collect()
        df['last2_day_act$author_video_m'] = df['last2_day_act$user_id_by_video_id_iq']/df['last2_day_act$user_id_by_author_id_iq']
        del df['register_day'],df['author_id']
        return df
    def get_features_all(df,df1):
        # Features computed over train+test together (global statistics),
        # then split back apart; also restricts to the `used` column set.
        lendf = len(df)
        df= df.append(df1)
        del df1
        gc.collect()
        # Frequency of the user's register_type over the combined set.
        df = docount(df,df,'ALL',['register_type'])
        del df['user_id']
        # ccc/ccc1/ddd are bookkeeping lists (presumably feature-importance
        # order and drop candidates from earlier experiments); only `used`
        # affects behaviour below.
        ccc = ['device_type', 'register_type', 'action_type=0$user_id#rate',
               'act_page=1$user_id#', 'first_day_act$user_id_by_author_id_iq',
               'action_type=2$user_id#rate', 'act_page=0$user_id#rate',
               'last_day_act$author_video_m', 'action_type=1$user_id#rate',
               'act_page=2$user_id#', 'actrate',
               'last_day_act$user_id_by_author_id_iq', 'app$user_id#',
               'last_day_act_page=1$user_id#', 'act_page=3$user_id#rate',
               'last_day_action_type=0$user_id#',
               'first_day_act$user_id_by_video_id_iq', 'videorate',
               'act_page=1$user_id#rate',
               'last2_day_act$user_id_by_author_id_iq',
               'last2_day_act$user_id_by_video_id_iq', 'first_day_actrate',
               'act_page=2$user_id#rate', 'last_day_actrate',
               'first_day_act$author_video_m', 'last2_day_act$author_video_m',
               'ALL$register_type#', 'act_page=0$user_id#', 'actrate_gap',
               'action_type=3$user_id#rate', 'last_day_act$user_id#',
               'act$user_id#', 'last_day_act$user_id_by_video_id_iq',
               'action_type=0$user_id#', 'action_type=1$user_id#', 'act_gap',
               'action_type=2$user_id#', 'action_type=3$user_id#',
               'first_day_act$user_id#', 'act_page=3$user_id#',
               'act_page=4$user_id#rate', 'video$user_id#',
               'last_day_action_type=1$user_id#', 'act_page=23$user_id#',
               'act_page=023$user_id#', 'act_page=4$user_id#',
               'last_day_action_type=2$user_id#',
               'last_day_action_type=3$user_id#', 'action_type=5$user_id#rate',
               'action_type=5$user_id#', 'last_day_app$user_id#',
               'last_day_action_type=4$user_id#', 'action_type=4$user_id#',
               'last_day_action_type=5$user_id#', 'act0',
               'action_type=4$user_id#rate', 'video0']
        ccc1 = [ ]
        ddd = ['action_type=2$user_id#rate','action_type=1$user_id#rate','last_day_act$user_id_by_author_id_iq',
               'last_day_act_page=1$user_id#','act_page=3$user_id#rate','first_day_act$user_id_by_video_id_iq',
               'videorate','act_page=1$user_id#rate','last2_day_act$user_id_by_author_id_iq','last2_day_act$user_id_by_video_id_iq',
               'act_page=2$user_id#rate','last_day_actrate',
               'first_day_act$author_video_m','last2_day_act$author_video_m',
               'ALL$register_type#','act_page=0$user_id#','actrate_gap','action_type=3$user_id#rate',
               'last_day_act$user_id#','act$user_id#','last_day_act$user_id_by_video_id_iq',
               'action_type=0$user_id#', 'action_type=1$user_id#','act_gap',
               'action_type=2$user_id#','action_type=3$user_id#',
               'first_day_act$user_id#', 'act_page=3$user_id#','act_page=4$user_id#rate',
               'video$user_id#', 'last_day_action_type=1$user_id#','act_page=23$user_id#',
               'act_page=023$user_id#','act_page=4$user_id#',
               'last_day_action_type=2$user_id#','last_day_action_type=3$user_id#',
               'action_type=5$user_id#rate', 'action_type=5$user_id#',
               'last_day_app$user_id#','last_day_action_type=4$user_id#',
               'action_type=4$user_id#','last_day_action_type=5$user_id#',
               'act0', 'action_type=4$user_id#rate', 'video0']
        # Final model features.
        used = ['device_type', 'register_type', 'action_type=0$user_id#rate',
                'act_page=1$user_id#', 'first_day_act$user_id_by_author_id_iq',
                'act_page=0$user_id#rate','last_day_act$author_video_m',
                'act_page=2$user_id#','actrate','app$user_id#',
                'last_day_action_type=0$user_id#', 'first_day_actrate',
                'action_type=5$user_id#rate', ]
        df = df[used]
        df1 = df[lendf:]
        df = df[:lendf]
        return df,df1
    path = '../data1/29/'
    if val:
        # Validation mode: day-22 registrations are the "test" fold.
        if os.path.exists(path+'val_df.csv'):
            test_df = pd.read_csv(path+'val_df.csv')
            val_y = pd.read_csv(path+'val_y.csv')
        else:
            test_df = register[(register.register_day==22)]
            test_df = get_features(test_df,22,23)
            # Label: active anywhere in days 24-30.
            val_y = is_active(test_df,24,30,app,video,act)
            test_df.to_csv(path+'val_df.csv',index=False)
            val_y.to_csv(path+'val_y.csv',index=False)
        val_y = val_y['Y']
        if os.path.exists(path+'val_train_df.csv'):
            train_df = pd.read_csv(path+'val_train_df.csv')
            train_y = pd.read_csv(path+'val_train_y.csv')
        else:
            # One training fold per registration day 1..21: features from
            # [i, i+1], label from activity in [i+2, i+8].
            train_df = pd.DataFrame()
            train_y = pd.DataFrame()
            for i in range(1,22):
                df = register[(register.register_day==i)]
                y = is_active(df,i+2,i+8,app,video,act)
                df = get_features(df,i,i+1)
                train_df = train_df.append(df)
                train_y = train_y.append(y)
            train_df.to_csv(path+'val_train_df.csv',index=False)
            train_y.to_csv(path+'val_train_y.csv',index=False)
    else:
        # Submission mode: predict for day-29 registrations.
        if os.path.exists(path+'test_df.csv'):
            test_df = pd.read_csv(path+'test_df.csv')
        else:
            test_df = register[(register.register_day==29)]
            test_df = get_features(test_df,29,30)
            test_df.to_csv(path+'test_df.csv',index=False)
        if os.path.exists(path+'train_df.csv'):
            train_df = pd.read_csv(path+'train_df.csv')
            train_y = pd.read_csv(path+'train_y.csv')
        else:
            if os.path.exists(path+'val_train_df.csv'):
                # Reuse the validation-mode caches and fold in day 22.
                train_df = pd.read_csv(path+'val_train_df.csv')
                train_y = pd.read_csv(path+'val_train_y.csv')
                val_df = pd.read_csv(path+'val_df.csv')
                val_y = pd.read_csv(path+'val_y.csv')
                train_df = train_df.append(val_df)
                train_y = train_y.append(val_y)
            else:
                # Build all folds 1..22 from scratch.
                train_df = pd.DataFrame()
                train_y = pd.DataFrame()
                for i in range(1,23):
                    df = register[(register.register_day==i)]
                    y = is_active(df,i+2,i+8,app,video,act)
                    df = get_features(df,i,i+1)
                    train_df = train_df.append(df)
                    train_y = train_y.append(y)
            train_df.to_csv(path+'train_df.csv',index=False)
            train_y.to_csv(path+'train_y.csv',index=False)
    train_y = train_y['Y']
    ids = test_df['user_id']
    train_df,test_df = get_features_all(train_df,test_df)
    # 10 presumably = number of CV folds/rounds inside predict_data -- confirm.
    pre_train,test_y = predict_data(train_df,train_y,10,test_df,importance=1)
    if val==1:
        print (len(train_y),sum(train_y))
        showresults(train_y,pre_train)
        showresults(val_y,test_y)
        showtop(val_y,test_y,nums=1337)
        showtop(train_y,pre_train,nums=19589)
        return ids,test_y,getbest(ids,test_y,th=0.4)
    else:
        showresults(train_y,pre_train)
        showtop(train_y,pre_train,nums=20926)
        return ids,test_y,getbest(ids,test_y,rank=1294)
def predict_30(val, register, app, video, act):
    """Activity model for users who registered on day 30.

    Uses single-day (registration-day) features only, since day-30 users
    have just one observed day. Returns ``(ids, test_y, picks)``.

    val -- truthy: validation mode (split the cached training folds at a
           fixed row offset); falsy: predict for day-30 registrations.
    register/app/video/act -- presumably the raw per-event log DataFrames,
           each with a 'day' column and 'user_id' -- confirm with caller.
    Intermediate feature tables are cached as CSV under ../data1/30/.
    """
    def get_features0(df, d):
        # Per-user features from the single day d.
        # Column conventions inferred from use below: docount adds
        # '<prefix>$<key>#', doiq adds '<prefix>$<key>_by_<col>_iq'
        # (helpers defined elsewhere in the project).
        #tapp = app[app.day==d]
        tvideo = video[video.day == d]
        tact = act[act.day == d]
        #df = docount(df,tapp,'app',['user_id']);gc.collect()
        df = docount(df, tvideo, 'video', ['user_id'])
        gc.collect()
        # Share of all that day's video events (epsilon avoids 0/0).
        df['videorate'] = df['video$user_id#'] / (tvideo.shape[0] + 0.000001)
        df = docount(df, tact, 'act', ['user_id'])
        gc.collect()
        df['actrate'] = df['act$user_id#'] / (tact.shape[0] + 0.000001)
        page_list = list(tact['page'].unique())  # NOTE(review): unused
        # Per-page counts and shares of the user's total actions.
        for c in [0, 1, 2, 3, 4]:
            df = docount(df, tact[tact['page'] == c], 'act_page=' + str(c), ['user_id'])
            gc.collect()
            df['act_page=' + str(c) + '$user_id#rate'] = df['act_page=' + str(c) + '$user_id#'] / (
                df['act$user_id#'] + 0.00001)
        # Combined page groupings.
        df['act_page=23$user_id#'] = df['act_page=2$user_id#'] + df['act_page=3$user_id#']
        df['act_page=023$user_id#'] = df['act_page=2$user_id#'] + df['act_page=3$user_id#'] + df['act_page=0$user_id#']
        action_list = list(tact['action_type'].unique())  # NOTE(review): unused
        # Per-action-type counts and shares.
        for c in [0, 1, 2, 3, 4, 5]:
            df = docount(df, tact[tact['action_type'] == c], 'action_type=' + str(c), ['user_id'])
            gc.collect()
            df['action_type=' + str(c) + '$user_id#rate'] = df['action_type=' + str(c) + '$user_id#'] / (
                df['act$user_id#'] + 0.00001)
        df['action_type=01$user_id#'] = df['action_type=0$user_id#'] + df['action_type=1$user_id#']
        def iszero(s):
            # Binary "has any activity" flag: 0 -> 0, anything else -> 1.
            if s == 0:
                return 0
            return 1
        # NOTE(review): the two loops below re-read page 0 / action_type 0
        # instead of page/action c, and each 'pageall'/'actionall' result is
        # immediately overwritten by apply(iszero) -- the final values are
        # simply iszero of the page-0 / action-0 counts. Looks unintended;
        # preserved as-is (behaviour unchanged either way).
        df['pageall'] = df['act_page=0$user_id#'].apply(iszero)
        for c in [1, 2, 3, 4]:
            df['pageall'] = df['pageall'] * df['act_page=0$user_id#']
        df['pageall'] = df['act_page=0$user_id#'].apply(iszero)
        df['actionall'] = df['action_type=0$user_id#'].apply(iszero)
        for c in [1, 2, 3, 4, 5]:
            df['pageall'] = df['pageall'] * df['action_type=0$user_id#']
        df['actionall'] = df['action_type=0$user_id#'].apply(iszero)
        df['act0'] = df['act$user_id#'].apply(iszero)
        df['video0'] = df['video$user_id#'].apply(iszero)
        def bigact(s):
            # Bucket the action count into 0..5 (decade buckets, capped).
            if s >= 50:
                return 5
            else:
                return int(s / 10)
        df['act$user_id#10'] = df['act$user_id#'].apply(bigact)
        # Mirror user_id as author_id so author-keyed features merge onto df.
        df['author_id'] = df['user_id']
        df = docount(df, tact, 'act', ['author_id'])
        gc.collect()
        # Distinct videos/authors the user touched, and their ratio.
        df = doiq(df, tact, 'act', ['user_id'], 'video_id')
        gc.collect()
        df = doiq(df, tact, 'act', ['user_id'], 'author_id')
        gc.collect()
        df['act$author_video_m'] = df['act$user_id_by_video_id_iq'] / df['act$user_id_by_author_id_iq']
        del df['register_day'], df['author_id']
        return df
    def get_features_all(df, df1):
        # Global (train+test) statistics, then split back and keep `used`.
        lendf = len(df)
        df = df.append(df1)
        del df1
        gc.collect()
        for c in ['act$user_id#']:
            #df = domean(df,df,'All',['device_type'],c);gc.collect()
            # Mean action count per register_type.
            df = domean(df, df, 'All', ['register_type'], c)
            gc.collect()
            #df = dovar(df,df,'All',['register_type'],c);gc.collect()
        # Frequencies of the user's register_type and device_type.
        df = docount(df, df, 'ALL', ['register_type'])
        df = docount(df, df, 'ALL', ['device_type'])
        del df['user_id'],
        # ccc/ccc1/ddd are bookkeeping lists (presumably importance order /
        # drop candidates); only `used` affects behaviour below.
        ccc = [
            'device_type', 'actrate', 'All$register_type_by_act$user_id#_mean',
            'act_page=1$user_id#', 'action_type=0$user_id#rate',
            'action_type=1$user_id#rate', 'register_type',
            'act$user_id_by_author_id_iq', 'act$user_id_by_video_id_iq',
            'videorate', 'act_page=1$user_id#rate', 'act$author_video_m',
            'action_type=2$user_id#rate', 'act_page=3$user_id#rate',
            'act_page=0$user_id#', 'action_type=0$user_id#',
            'act_page=2$user_id#', 'act_page=2$user_id#rate',
            'action_type=1$user_id#', 'act$user_id#', 'act_page=4$user_id#rate',
            'act_page=0$user_id#rate', 'pageall', 'act_page=4$user_id#',
            'action_type=3$user_id#rate', 'act_page=23$user_id#',
            'act_page=3$user_id#', 'video$user_id#', 'action_type=2$user_id#',
            'action_type=3$user_id#', 'act_page=023$user_id#',
            'act$author_id#', 'action_type=01$user_id#',
            'action_type=5$user_id#rate', 'ALL$register_type#',
            'action_type=5$user_id#', 'act$user_id#10',
            'action_type=4$user_id#', 'actionall',
            'action_type=4$user_id#rate', 'act0', 'video0'
        ]
        ccc1 = []
        ddd = [
            'All$register_type_by_act$user_id#_mean', 'act_page=1$user_id#',
            'action_type=1$user_id#rate', 'act$user_id_by_author_id_iq',
            'act$user_id_by_video_id_iq', 'act$author_video_m',
            'act_page=2$user_id#', 'act_page=2$user_id#rate',
            'action_type=1$user_id#', 'act$user_id#', 'act_page=4$user_id#rate',
            'act_page=4$user_id#', 'action_type=3$user_id#rate',
            'act_page=23$user_id#', 'act_page=3$user_id#', 'video$user_id#',
            'action_type=2$user_id#', 'action_type=3$user_id#',
            'act$author_id#', 'action_type=01$user_id#', 'ALL$register_type#',
            'ALL$device_type#', 'action_type=5$user_id#rate',
            'action_type=5$user_id#', 'act$user_id#10',
            'action_type=4$user_id#', 'actionall',
            'action_type=4$user_id#rate', 'act0',
        ]
        # Final model features.
        used = [
            'device_type', 'register_type', 'actrate',
            'action_type=0$user_id#rate', 'videorate',
            'act_page=1$user_id#rate', 'action_type=2$user_id#rate',
            'act_page=3$user_id#rate', 'act_page=0$user_id#',
            'action_type=0$user_id#', 'act_page=0$user_id#rate', 'pageall',
            'act_page=023$user_id#', 'video0',
            'All$register_type_by_act$user_id#_mean', 'ALL$register_type#',
        ]
        df = df[used]
        df1 = df[lendf:]
        df = df[:lendf]
        return df, df1
    path = '../data1/30/'
    if os.path.exists(path + 'train_df.csv'):
        train_df = pd.read_csv(path + 'train_df.csv')
        train_y = pd.read_csv(path + 'train_y.csv')
    else:
        # One fold per registration day 1..23: features from day i,
        # label from activity in [i+1, i+7].
        train_df = pd.DataFrame()
        train_y = pd.DataFrame()
        for i in range(1, 24):
            df = register[register.register_day == i]
            y = is_active(df, i + 1, i + 7, app, video, act)
            df = get_features0(df, i)
            train_df = train_df.append(df)
            train_y = train_y.append(y)
            if i == 22:
                # Row offset where the validation fold (days 23+) starts.
                valst = len(train_df)
                print(valst)
        train_df.to_csv(path + 'train_df.csv', index=False)
        train_y.to_csv(path + 'train_y.csv', index=False)
    train_y = train_y['Y']
    if val:
        #35134
        # Hard-coded split offset matching the day-22 boundary of the
        # cached CSVs (see `valst` print above).
        valst = 35134
        test_df = train_df[valst:]
        val_y = train_y[valst:]
        train_df = train_df[:valst]
        train_y = train_y[:valst]
    else:
        if os.path.exists(path + 'test_df.csv'):
            test_df = pd.read_csv(path + 'test_df.csv')
        else:
            test_df = register[register.register_day == 30]
            test_df = get_features0(test_df, 30)
            test_df.to_csv(path + 'test_df.csv', index=False)
    #train_df['Y'] = train_y
    #act0train = train_df[train_df['act$user_id#']==0]
    #print(len(act0train),len(act0train[act0train['Y']==1]))
    #del train_df['Y']
    #act0ids = test_df[test_df['act$user_id#']==0]['user_id']
    ids = test_df['user_id']
    train_df, test_df = get_features_all(train_df, test_df)
    # 10 presumably = number of CV folds/rounds inside predict_data -- confirm.
    pre_train, test_y = predict_data(train_df, train_y, 10, test_df, importance=1)
    if val == 1:
        print(len(train_y), sum(train_y))
        showresults(train_y, pre_train)
        showresults(val_y, test_y)
        showfalse(ids, test_df, val_y, test_y)
        showtop(val_y, test_y, nums=1457)
        showtop(train_y, pre_train, nums=23260)
        #showtop(train_y,pre_train,nums=15485)
        #showprecision(val_y,test_y)
        #showprecision(train_y,pre_train)
        return ids, test_y, getbest(ids, test_y, th=0.4)
    else:
        showresults(train_y, pre_train)
        showtop(train_y, pre_train, nums=24717)
        #showtop(train_y,pre_train,nums=16943)
        #showprecision(train_y,pre_train)
        return ids, test_y, getbest(ids, test_y, rank=1490)
def predict_1_23(val, register, app, video, act):
    """Activity model for users who registered on days 1-23.

    Unlike predict_29/predict_30, these users have a long observed
    history, so features span the whole window from day 1 up to the
    cut-off (16/23/30). Returns ``(ids, test_y, picks)``.

    val -- truthy: train on the day<10 cohort (window 1-16, label 17-23)
           and validate on the day<17 cohort (window 1-23, label 24-30);
           falsy: train on both cohorts and predict for day<24 users with
           the full 1-30 window.
    register/app/video/act -- presumably the raw per-event log DataFrames,
           each with a 'day' column and 'user_id' -- confirm with caller.
    Intermediate tables are cached as CSV under ../data1/1_23/.
    """
    path = '../data1/1_23/'
    def get_features(df, d1, d2):
        # Per-user features over the window [d1, d2]. Column conventions,
        # inferred from use: docount -> 'p$k#', domax/domin/domean/dovar ->
        # 'p$k_by_col_max/min/mean/var', doiq -> 'p$k_by_col_iq'
        # (presumably distinct counts; helpers defined elsewhere).
        # df must already carry a 'register_time' column (set by caller).
        tapp = app[(app.day >= d1) & (app.day <= d2)]
        tact = act[(act.day >= d1) & (act.day <= d2)]
        tvideo = video[(video.day >= d1) & (video.day <= d2)]
        # Rebase days so the window starts at 0 (writes into slices --
        # pandas may emit SettingWithCopyWarning).
        tapp.day = tapp.day - d1
        tact.day = tact.day - d1
        tvideo.day = tvideo.day - d1
        lastday = d2 - d1
        #app
        df = docount(df, tapp, 'app', ['user_id'])
        #df = domin(df,tapp,'app',['user_id'],'day')
        df = domax(df, tapp, 'app', ['user_id'], 'day')
        # Days since the user last launched the app.
        df['last_app_day'] = lastday - df['app$user_id_by_day_max'] + 1
        #df['app_day_gap'] = df['app$user_id_by_day_max']- df['app$user_id_by_day_min']+1
        # Days with no launch, and launches per registered day.
        df['app_day_missing'] = df['register_time'] - df['app$user_id#']
        df['app_mean#'] = df['app$user_id#'] / df['register_time']
        del df['app$user_id#'], df['app$user_id_by_day_max']
        df = dovar(df, tapp, 'app', ['user_id'], 'day')
        #df = domean(df,tapp[tapp.day>lastday-8],'app_last_8',['user_id'],'day')
        #df = dovar(df,tapp[tapp.day>lastday-8],'app_last_8',['user_id'],'day')
        # Trailing-window launch stats for the last i+1 days, i = 0..7.
        for i in range(8):
            df = docount(df, tapp[tapp.day >= lastday - i], 'app_last_' + str(i), ['user_id'])
            if i >= 3:
                df = domean(df, tapp[tapp.day >= lastday - i], 'app_last_' + str(i), ['user_id'], 'day')
                df = dovar(df, tapp[tapp.day >= lastday - i], 'app_last_' + str(i), ['user_id'], 'day')
        #df = docount(df,tapp[tapp.day>lastday-7],'app_last_7',['user_id'])
        #df = docount(df,tapp[tapp.day>lastday-3],'app_last_3',['user_id'])
        #df = docount(df,tapp[tapp.day==lastday],'app_last_1',['user_id'])
        gc.collect()
        #video
        df = docount(df, tvideo, 'video', ['user_id'])
        df = domin(df, tvideo, 'video', ['user_id'], 'day')
        df = domax(df, tvideo, 'video', ['user_id'], 'day')
        df = doiq(df, tvideo, 'video', ['user_id'], 'day')
        # Recency/span of video activity.
        df['last_video_day'] = lastday - df['video$user_id_by_day_max'] + 1
        df['first_video_day'] = lastday - df['video$user_id_by_day_min'] + 1
        df['video_day_gap'] = df['video$user_id_by_day_max'] - df['video$user_id_by_day_min'] + 1
        #df['video_day_missing'] = df['register_time'] - df['video$user_id_by_day_iq']
        df['video_mean#'] = df['video$user_id#'] / df['register_time']
        del df['video$user_id#'], df['video$user_id_by_day_max'], df['video$user_id_by_day_min']
        df = dovar(df, tvideo, 'video', ['user_id'], 'day')
        # Last-8-days video stats.
        df = domean(df, tvideo[tvideo.day > lastday - 8], 'video_last_8', ['user_id'], 'day')
        df = dovar(df, tvideo[tvideo.day > lastday - 8], 'video_last_8', ['user_id'], 'day')
        df = docount(df, tvideo[tvideo.day > lastday - 8], 'video_last_8', ['user_id'])
        #df = docount(df,tvideo[tvideo.day>lastday-3],'video_last_3',['user_id'])
        #df = docount(df,tvideo[tvideo.day==lastday],'video_last_1',['user_id'])
        gc.collect()
        #act
        # Per-user per-day action counts; max/mean/var across days.
        gp = tact.groupby(['user_id', 'day']).size().unstack()
        df = pd.merge(df, gp.max(1).rename('actcount_max').reset_index(), on=['user_id'], how='left')
        df = pd.merge(df, gp.mean(1).rename('actcount_mean').reset_index(), on=['user_id'], how='left')
        df = pd.merge(df, gp.var(1).rename('actcount_var').reset_index(), on=['user_id'], how='left')
        df = docount(df, tact, 'act', ['user_id'])
        df = domin(df, tact, 'act', ['user_id'], 'day')
        df = domax(df, tact, 'act', ['user_id'], 'day')
        df = doiq(df, tact, 'act', ['user_id'], 'day')
        #df['last_act_day'] = lastday - df['act$user_id_by_day_max']+1
        df['act_day_gap'] = df['act$user_id_by_day_max'] - df['act$user_id_by_day_min'] + 1
        df['act_day_missing'] = df['register_time'] - df['act$user_id_by_day_iq']
        df['act_mean#'] = df['act$user_id#'] / df['register_time']
        del df['act$user_id#']
        df = dovar(df, tact, 'act', ['user_id'], 'day')
        #df = domean(df,tact[tact.day>lastday-8],'act_last_8',['user_id'],'day')
        #df = dovar(df,tact[tact.day>lastday-8],'act_last_8',['user_id'],'day')
        # Trailing-window action stats for the last i+1 days, i = 0..7;
        # richer per-day aggregates once the window is >= 4 days.
        for i in range(8):
            df = docount(df, tact[tact.day >= lastday - i], 'act_last_' + str(i), ['user_id'])
            if i >= 3:
                df = domean(df, tact[tact.day >= lastday - i], 'act_last_' + str(i), ['user_id'], 'day')
                df = dovar(df, tact[tact.day >= lastday - i], 'act_last_' + str(i), ['user_id'], 'day')
                gp = tact[tact.day >= lastday - i].groupby(['user_id', 'day']).size().unstack()
                df = pd.merge(df, gp.max(1).rename('act_last_' + str(i) + '_actcount_max').reset_index(), on=['user_id'], how='left')
                df = pd.merge(df, gp.mean(1).rename('act_last_' + str(i) + '_actcount_mean').reset_index(), on=['user_id'], how='left')
                df = pd.merge(df, gp.var(1).rename('act_last_' + str(i) + '_actcount_var').reset_index(), on=['user_id'], how='left')
        #df = docount(df,tact[tact.day>lastday-7],'act_last_7',['user_id'])
        #df = docount(df,tact[tact.day>lastday-3],'act_last_3',['user_id'])
        #df = docount(df,tact[tact.day==lastday],'act_last_1',['user_id'])
        gc.collect()
        # Per-page counts (normalised by registered days) and trailing
        # 8-day / 3-day per-page counts.
        page_list = list(tact['page'].unique())
        for c in page_list:
            df = docount(df, tact[tact['page'] == c], 'act_page=' + str(c), ['user_id'])
            df['act_page=' + str(c) + '$user_id#'] = df['act_page=' + str(c) + '$user_id#'] / df['register_time']
        for c in page_list:
            df = docount(df, tact[(tact['page'] == c) & (tact.day > lastday - 8)], 'act_last_8_page=' + str(c), ['user_id'])
        for c in page_list:
            df = docount(df, tact[(tact['page'] == c) & (tact.day > lastday - 3)], 'act_last_3_page=' + str(c), ['user_id'])
        # Mirror user_id as author_id so author-keyed features merge onto df.
        df['author_id'] = df['user_id']
        df = docount(df, tact, 'act', ['author_id'])
        df['act$author_id#'] = df['act$author_id#'] / df['register_time']
        # Distinct authors/videos touched, normalised by registered days.
        df = doiq(df, tact, 'act', ['user_id'], 'author_id')
        df['act$user_id_by_author_id_iq'] = df['act$user_id_by_author_id_iq'] / df['register_time']
        df = doiq(df, tact, 'act', ['user_id'], 'video_id')
        df['act$user_id_by_video_id_iq'] = df['act$user_id_by_video_id_iq'] / df['register_time']
        # Trailing-window distinct author/video counts.
        for i in range(8):
            df = doiq(df, tact[tact.day >= lastday - i], 'act_last_' + str(i), ['user_id'], 'author_id')
            df = doiq(df, tact[tact.day >= lastday - i], 'act_last_' + str(i), ['user_id'], 'video_id')
        #action_list = list(tact['action_type'].unique())
        # Per-action-type counts (type 4 deliberately skipped here),
        # normalised by registered days, plus trailing 8-/3-day counts.
        for c in [0, 1, 2, 3, 5]:
            df = docount(df, tact[tact['action_type'] == c], 'action_type=' + str(c), ['user_id'])
            gc.collect()
            df['action_type=' + str(c) + '$user_id#'] = df['action_type=' + str(c) + '$user_id#'] / df['register_time']
        for c in [0, 1, 2, 3]:
            df = docount(df, tact[(tact['action_type'] == c) & (tact.day > lastday - 8)], 'act_last_8_action_type=' + str(c), ['user_id'])
        for c in [0, 1, 2, 3]:
            df = docount(df, tact[(tact['action_type'] == c) & (tact.day > lastday - 3)], 'act_last_3_action_type=' + str(c), ['user_id'])
        # Dead experiment kept by the author (max-consecutive-active-days
        # feature); inert string literal, not executed.
        '''
        def getmaxcontinuedays(s):
            s = np.array(s)
            ans = 0
            t = 0
            for i in s:
                if i>0:
                    t = t+ 1
                else:
                    if t>ans:
                        ans = t
                    t = 0
            if t>ans:
                ans=t
            return ans
        gp = tapp.groupby(['user_id','day']).size().unstack()
        gp = gp.fillna(0)
        #print (gp)
        gp['app_max_continue_days'] = gp.apply(getmaxcontinuedays,axis=1)
        #print (gp)
        df = pd.merge(df,gp.reset_index()[['user_id','app_max_continue_days']],on=['user_id'],how='left')
        gp = tact.groupby(['user_id','day']).size().unstack()
        gp = gp.fillna(0)
        #print (gp)
        gp['act_max_continue_days'] = gp.apply(getmaxcontinuedays,axis=1)
        #print (gp)
        df = pd.merge(df,gp.reset_index()[['user_id','act_max_continue_days']],on=['user_id'],how='left')
        '''
        del df['author_id']
        gc.collect()
        return df
    def get_features_all(df, df1):
        # Concatenate train+test, drop user_id, split back apart.
        # (The commented list/loop below is a previous feature-pruning
        # experiment, preserved as the author left it.)
        lendf = len(df)
        df = df.append(df1)
        del df1
        gc.collect()
        #ccc = ['app_mean#', 'last_app_day', 'app$user_id_by_day_var', 'act$user_id_by_day_var', 'device_type', 'act$user_id_by_video_id_iq', 'app_last_4$user_id_by_day_var', 'act_last_0$user_id_by_author_id_iq', 'app_last_4$user_id#', 'register_type', 'act$user_id_by_day_max', 'actcount_var', 'act_last_0$user_id#', 'act_mean#', 'actcount_max', 'act_last_7$user_id_by_day_var', 'app_last_7$user_id_by_day_var', 'app_last_1$user_id#', 'action_type=2$user_id#', 'act_page=1$user_id#', 'action_type=0$user_id#', 'act_last_1$user_id#', 'app_last_5$user_id#', 'act$user_id_by_day_min', 'act_page=3$user_id#', 'act$user_id_by_day_iq', 'actcount_mean', 'act_last_0$user_id_by_video_id_iq', 'act_last_2$user_id_by_author_id_iq', 'app_last_7$user_id_by_day_mean', 'act_last_8_action_type=2$user_id#', 'act_last_8_page=1$user_id#', 'act_last_4$user_id_by_day_mean', 'act$user_id_by_author_id_iq', 'app_last_5$user_id_by_day_mean', 'act_day_gap', 'app_day_missing', 'act_last_7_actcount_var', 'action_type=3$user_id#', 'act_last_4_actcount_var', 'act_last_1$user_id_by_author_id_iq', 'app_last_3$user_id_by_day_var', 'act_last_3_actcount_var', 'act_last_1$user_id_by_video_id_iq', 'act_last_3_page=1$user_id#', 'act_page=2$user_id#', 'act_page=0$user_id#', 'act_last_3$user_id_by_video_id_iq', 'act_last_6_actcount_max', 'app_last_2$user_id#', 'act_last_2$user_id#', 'app_last_6$user_id_by_day_mean', 'act_last_6_actcount_var', 'act_last_3_action_type=2$user_id#', 'act_last_6$user_id_by_video_id_iq', 'act_last_7$user_id_by_video_id_iq', 'act_last_5_actcount_var', 'act_last_3$user_id#', 'act_last_7$user_id_by_author_id_iq', 'act_last_2$user_id_by_video_id_iq', 'act_last_8_page=3$user_id#', 'act_page=4$user_id#', 'act_last_7_actcount_max', 'act_last_5$user_id_by_day_var', 'act_last_7$user_id_by_day_mean', 'act_last_8_action_type=0$user_id#', 'act_last_3_actcount_max', 'app_last_5$user_id_by_day_var', 'app_last_0$user_id#', 'app_last_6$user_id_by_day_var', 'act_day_missing', 'action_type=1$user_id#', 'act_last_6_actcount_mean', 'act_last_6$user_id_by_day_mean', 'act_last_3$user_id_by_author_id_iq', 'act_last_8_page=0$user_id#', 'act_last_3_actcount_mean', 'act_last_6$user_id_by_author_id_iq', 'video_last_8$user_id_by_day_var', 'act_last_5$user_id_by_day_mean', 'act_last_3_page=0$user_id#', 'register_time', 'act_last_3$user_id_by_day_var', 'last_video_day', 'act_last_6$user_id_by_day_var', 'act_last_4$user_id#', 'act_last_5$user_id_by_author_id_iq', 'act_last_4$user_id_by_author_id_iq', 'first_video_day', 'video_mean#', 'act_last_8_action_type=3$user_id#', 'act_last_3_action_type=0$user_id#', 'act_last_3_page=3$user_id#', 'app_last_4$user_id_by_day_mean', 'app_last_3$user_id#', 'act_last_8_page=4$user_id#', 'act_last_6$user_id#', 'act_last_3$user_id_by_day_mean', 'act_last_7$user_id#', 'act_last_5$user_id_by_video_id_iq', 'video_last_8$user_id_by_day_mean', 'act_last_4$user_id_by_day_var', 'act_last_7_actcount_mean', 'app_last_7$user_id#', 'video$user_id_by_day_var', 'act_last_5_actcount_max', 'act_last_3_page=4$user_id#', 'act_last_8_page=2$user_id#', 'act_last_5$user_id#', 'act_last_4_actcount_max', 'video$user_id_by_day_iq', 'act_last_4$user_id_by_video_id_iq', 'act_last_5_actcount_mean', 'act$author_id#', 'app_last_6$user_id#', 'act_last_4_actcount_mean', 'act_last_8_action_type=1$user_id#', 'video_day_gap', 'act_last_3_action_type=1$user_id#', 'act_last_3_page=2$user_id#', 'app_last_3$user_id_by_day_mean', 'action_type=5$user_id#', 'video_last_8$user_id#', 'act_last_3_action_type=3$user_id#']
        #for i in range(100,124):
        #    del df[ccc[i]]
        del df['user_id']
        df1 = df[lendf:]
        df = df[:lendf]
        return df, df1
    # Cohorts and their observed-history lengths (register_time =
    # cut-off-day + 1 - register_day; aligned by index with `register`).
    df1 = register[register.register_day < 10]
    df1['register_time'] = 17 - register.register_day
    df2 = register[register.register_day < 17]
    df2['register_time'] = 24 - register.register_day
    test_df = register[register.register_day < 24]
    test_df['register_time'] = 31 - test_df.register_day
    del df1['register_day'], df2['register_day'], test_df['register_day']
    # Labels: active anywhere in the 7 days after each cohort's cut-off.
    if os.path.exists(path + 'train_y1.csv'):
        train_y1 = pd.read_csv(path + 'train_y1.csv')
    else:
        train_y1 = is_active(df1, 17, 23, app, video, act)
        train_y1.to_csv(path + 'train_y1.csv', index=False)
    train_y1 = train_y1['Y']
    if os.path.exists(path + 'train_y2.csv'):
        train_y2 = pd.read_csv(path + 'train_y2.csv')
    else:
        train_y2 = is_active(df2, 24, 30, app, video, act)
        train_y2.to_csv(path + 'train_y2.csv', index=False)
    train_y2 = train_y2['Y']
    # Cached feature tables for both training cohorts.
    if os.path.exists(path + 'df1.csv'):
        df1 = pd.read_csv(path + 'df1.csv')
    else:
        df1 = get_features(df1, 1, 16)
        df1.to_csv(path + 'df1.csv', index=False)
    if os.path.exists(path + 'df2.csv'):
        df2 = pd.read_csv(path + 'df2.csv')
    else:
        df2 = get_features(df2, 1, 23)
        df2.to_csv(path + 'df2.csv', index=False)
    if val:
        # Validate cohort 1 -> cohort 2.
        train_df = df1
        test_df = df2
        train_y = train_y1
        val_y = train_y2
    else:
        # Full 1-30 window features for the submission cohort.
        if os.path.exists(path + 'test_df.csv'):
            test_df = pd.read_csv(path + 'test_df.csv')
        else:
            test_df = get_features(test_df, 1, 30)
            test_df.to_csv(path + 'test_df.csv', index=False)
        train_df = df1.append(df2)
        train_y = train_y1.append(train_y2)
    #train_df = df2
    #train_y = train_y2
    del df1, df2
    gc.collect()
    ids = test_df['user_id']
    train_df, test_df = get_features_all(train_df, test_df)
    # Dead experiment (separating zero-action users); inert string literal.
    '''
    train_df['Y'] = train_y
    print (len(train_df))
    train_js = train_df[train_df['act_mean#']==0]
    train_df = train_df[train_df['act_mean#']>0]
    print (len(train_df))
    train_y = train_df['Y']
    del train_df['Y']
    train_y_js = train_js['Y']
    del train_js['Y']
    test_df['Y'] = val_y
    test_js = test_df[test_df['act_mean#']==0]
    test_df = test_df[test_df['act_mean#']>0]
    val_y = test_df['Y']
    del test_df['Y']
    js_y = test_js['Y']
    del test_js['Y']
    '''
    # 10 presumably = number of CV folds/rounds inside predict_data -- confirm.
    pre_train, test_y = predict_data(train_df, train_y, 10, test_df, importance=1)
    #pre_train_js,test_y_js = predict_data(train_js,train_y_js,10,test_js,importance=1)
    # Dead experiment (zeroing confident predictions for zero-action users).
    '''
    test_df['Y'] = val_y
    test_df['Y1'] = test_y
    test_js = test_df[test_df['act_mean#']==0]
    print(len(test_js))
    print(len(test_js[test_js['Y1']>=0.4]))
    print(len(test_js[(test_js['Y1']>=0.4) & (test_js['Y']==1)]))
    test_df[(test_df['act_mean#']==0) & (test_df['Y1']>=0.4)]['Y1'] = 0
    print (len(test_df[(test_df['act_mean#']==0) & (test_df['Y1']>=0.4)]))
    test_y[(test_df['act_mean#']==0) & (test_df['Y1']>=0.4)] = 0
    '''
    if val == 1:
        showresults(val_y, test_y)
        showtop(val_y, test_y, nums=10705)
        return ids, test_y, getbest(ids, test_y, rank=10705)
    else:
        showresults(train_y, pre_train)
        showtop(train_y, pre_train, nums=16449)
        return ids, test_y, getbest(ids, test_y, th=0.4)
def DNN(model_train='', save_model='', validate=0):
    """Build/load, train and evaluate a fully-connected classifier on CIFAR-10.

    Data comes from pickled CIFAR-10 batches: ``data_batch_1`` for training,
    ``data_batch_2`` for testing.

    Parameters
    ----------
    model_train : str
        Path of an existing ``.h5`` model to load; '' builds a fresh network.
    save_model : str
        Path to save the trained model to; '' skips saving.
    validate : float
        Validation split fraction passed to ``fit`` (0 disables the
        validation split only in the load-and-retrain branch).

    Returns
    -------
    The trained Keras ``Model``, evaluated on batch 2 (accuracy printed).
    """
    # Pickled CIFAR-10 batches. Each dict holds
    # [b'batch_label', b'labels', b'data', b'filenames'].
    dict = utils.unpickle('cifar_10/data_batch_1')
    test_dict = utils.unpickle('cifar_10/data_batch_2')
    LX, LY = utils.data_fromCIFAR10(dict, 10000)
    test_data, test_label = utils.data_fromCIFAR10(test_dict, 10000)
    # Reshape flat rows into (N, 32, 32, 3) images; one-hot encode labels.
    x_train = LX
    x_train = np.reshape(x_train, (np.shape(x_train)[0], 32, 32, 3))
    y_train = LY
    y_train = to_categorical(y_train, 10)
    # NOTE(review): UX/true_label are computed but never used in this
    # function — presumably left over from the self-training variant.
    allData, allLabel = utils.data_fromCIFAR10(dict, 10000)
    UX = allData[6000:]
    true_label = allLabel[6000:]
    # Dense classifier: flatten -> 1024 -> 256 -> 10 softmax, dropout 0.5
    # between every layer.
    img_shape = (32, 32, 3)
    input_img = Input(shape=img_shape)
    classifier = Flatten()(input_img)
    classifier = Dropout(0.5)(classifier)
    classifier = Dense(1024, activation='relu')(classifier)
    classifier = Dropout(0.5)(classifier)
    classifier = Dense(256, activation='relu')(classifier)
    classifier = Dropout(0.5)(classifier)
    classifier = Dense(10, activation='softmax')(classifier)
    dnn = Model(inputs=input_img, outputs=classifier)
    #adam2 = keras.optimizers.Adam(lr=0.0003, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
    dnn.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    dnn.summary()
    # Early stopping + checkpointing on validation loss.
    early_stop = EarlyStopping(monitor='val_loss', patience=5, mode='min', min_delta=0)
    model_check = ModelCheckpoint(save_model, monitor='val_loss', verbose=1,
                                  save_best_only=True, save_weights_only=False)
    if (model_train and save_model):
        # Resume training from an existing model, then re-save.
        print('Load model and save new train model.')
        if os.path.isfile(model_train):
            dnn = load_model(model_train)
        if (validate):
            dnn.fit(x_train, y_train, batch_size=100, epochs=20,
                    validation_split=validate, callbacks=[model_check, early_stop])
        else:
            dnn.fit(x_train, y_train, batch_size=100, epochs=20,
                    callbacks=[model_check, early_stop])
        dnn.save(save_model)
        # Reload so we return the checkpointed best weights.
        dnn = load_model(save_model)
    elif model_train:
        # Inference-only: just load the given model, no training.
        print('Load ae_model {}'.format(model_train))
        dnn = load_model(model_train)
    elif save_model:
        # Fresh training run with persistence.
        print('Build ae_model and save. File: {}'.format(save_model))
        dnn.fit(x_train, y_train, batch_size=100, epochs=40,
                validation_split=validate, callbacks=[model_check, early_stop])
        dnn.save(save_model)
    else:
        # Fresh training run, nothing saved.
        print('Build ae_model only.')
        dnn.fit(x_train, y_train, batch_size=100, epochs=30,
                validation_split=validate)
    # Accuracy on the held-out batch (argmax over class probabilities).
    prediction = utils.predict_data(test_data, dnn)
    prediction = np.argmax(prediction, axis=1)
    count = 0
    for i in range(np.shape(test_label)[0]):
        if (test_label[i] == prediction[i]):
            count += 1
    print('Correct Rate = {}'.format(count / np.shape(test_label)[0]))
    return dnn
train_df, test_df = get_features_all(train_df, test_df) cfl = ['device_type', 'kmeans', 'register_type', 'register_time'] if val: pre_train, test_y = predict_data_val(train_df, train_y, 10, test_df, val_y, importance=1, cf_list=cfl) #pre_train,test_y = predict_data(train_df,train_y,10,test_df,importance=1,loss = 1,nb=56) else: pre_train, test_y = predict_data(train_df, train_y, 10, test_df, importance=0, loss=1, nb=99) sp = 1 if val == 1: showresults(val_y, test_y) #showtop(val_y,test_y,nums=18223) showtop(val_y, test_y, nums=15428) showfalse(ids, test_df, val_y, test_y) else: showresults(train_y, pre_train) if sp: df_1_28 = register[register.register_day <= 28]
def getreal(Fwy):
    """Fetch live station data for one freeway, predict per-station speeds,
    estimate end-to-end travel time, and render an interactive folium map.

    Parameters
    ----------
    Fwy : int
        Freeway number: 101, 280 or 680; any other value falls back to 880.

    Returns
    -------
    tuple
        ``(my_map, timetak, avgocc, avgspeed, avgvisibility, avgwindspeed,
        avgprecipitation, incidentcount)`` — the folium map, travel time in
        minutes, and stringified per-direction averages.
    """
    # Mainline station IDs and the (on-ramp, off-ramp) stations per freeway.
    list280 = [400319, 407710, 403402]
    onoff280 = [405428, 410590]
    list680 = [400335, 420614, 404690]
    onoff680 = [409056, 408289]
    list101 = [400868, 401472, 400119, 400661]
    onoff101 = [402883, 409308]
    list880 = [400844, 401871, 401545, 400284, 400662]
    onoff880 = [403200, 403098]

    # BUGFIX: the original re-assigned `Fwy = 101` right here, which made the
    # parameter dead — every call rendered the U.S. 101 results regardless of
    # the requested freeway. The override has been removed.
    if (Fwy == 101):
        stations_list = list101
        onoff = onoff101
    elif (Fwy == 280):
        stations_list = list280
        onoff = onoff280
    elif (Fwy == 680):
        stations_list = list680
        onoff = onoff680
    else:
        stations_list = list880
        onoff = onoff880

    cols = [
        'station', 'timestamp_', 'occupancy', 'hourlyprecipitation',
        'hourlywindspeed', 'hourlyvisibility', 'incident',
        'day_of_week_num', 'hour_of_day', 'weekend', 'speed'
    ]
    colstomod = ['occupancy', 'day_of_week_num', 'hour_of_day', 'speed']
    final = pd.DataFrame(columns=cols)
    pred_speeds = np.array([])

    for station in stations_list:
        # Real-time feed for one station.
        url = "https://n8nucy5gbh.execute-api.us-east-2.amazonaws.com/production/realtime/?station=" + str(
            station)
        r = requests.get(url=url)
        data = r.json()
        df = pd.read_json(data, orient='columns')[cols]
        print("counting", df.count())
        dfx = df.set_index(['station', 'timestamp_'
                            ]).sort_values(['station', 'timestamp_'])[colstomod]
        modelfile = "../models/" + str(Fwy) + "_" + str(
            station) + "_" + "speed.h5"
        n_lag = 3    # timesteps of history fed as model input
        n_steps = 1  # timesteps predicted ahead
        treframed, tkey, tscaled, tscaler1 = utils.format_model_data(
            dfx, n_lag, n_steps)
        # Drop columns 12-14 from the reframed matrix — presumably the t+1
        # target columns emitted by format_model_data; confirm against utils.
        treframed.drop(treframed.columns[[12, 13, 14]], axis=1, inplace=True)
        tinv_y, tinv_yhat = utils.predict_data(treframed, modelfile, tscaler1)
        # Keep only the most recent prediction for this station.
        # (Removed: an unused y_actual/y_predicted comparison DataFrame the
        # original built per station and never read.)
        pred_speeds = np.append(pred_speeds, tinv_yhat[-1])
        final = final.append(df)

    # Normalise dtypes before aggregation.
    final['station'] = final['station'].astype('int64')
    final['occupancy'] = final['occupancy'].astype('float')
    final['speed'] = final['speed'].astype('float')
    final['hourlyprecipitation'] = final['hourlyprecipitation'].astype('float')
    final['hourlywindspeed'] = final['hourlywindspeed'].astype('float')
    final['hourlyvisibility'] = final['hourlyvisibility'].astype('int32')
    final['incident'] = final['incident'].astype('int32')
    final_data = final.set_index(['station'])
    finals = final_data.groupby(final_data.index).agg({
        'speed': 'mean',
        'incident': 'sum',
        'occupancy': 'mean',
        'hourlyprecipitation': 'mean',
        'hourlywindspeed': 'mean',
        'hourlyvisibility': 'mean'
    })
    finals['p_speed'] = pred_speeds
    finals = finals.reset_index(['station'])

    # Station metadata (coordinates, city, direction).
    df_traffic_metadata = pd.read_csv("station_meta_finalv2.csv",
                                      sep=',',
                                      header=0)
    # FIX: drop_duplicates(inplace=True) on a boolean-mask slice operates on
    # a copy (SettingWithCopy) — reassign the result instead.
    onoff_withmeta_df = df_traffic_metadata[
        df_traffic_metadata['ID'].isin(onoff)].drop_duplicates(subset='ID')
    withmeta_df = finals.merge(df_traffic_metadata,
                               left_on="station",
                               right_on="ID",
                               how="left").round(3)

    # --- Travel time: order stations on-ramp -> mainline -> off-ramp. ---
    sorter = [onoff[0]] + stations_list + [onoff[1]]
    sorterIndex = dict(zip(sorter, range(len(sorter))))
    a = df_traffic_metadata[df_traffic_metadata['ID'].isin(
        onoff + stations_list)].copy()  # .copy() avoids SettingWithCopy below
    a['Rank'] = a['ID'].map(sorterIndex)
    cols1 = ['ID', 'Fwy', 'Latitude', 'Longitude']
    a = a.sort_values(['Rank'])[cols1]
    a = a.drop_duplicates(subset=['ID'])
    # Ramps reuse the nearest mainline prediction.
    a['speed'] = [pred_speeds[0]] + list(pred_speeds) + [pred_speeds[-1]]
    tim = np.array([])
    for i in range(1, len(a)):
        p1 = (a.iloc[i - 1][2], a.iloc[i - 1][3])
        p2 = (a.iloc[i][2], a.iloc[i][3])
        dist = geodesic(p1, p2).miles
        # Segment speed = average of its two endpoint speeds.
        spd = (a.iloc[i - 1][4] + a.iloc[i][4]) / 2
        tim = np.append(tim, dist / spd)
    timetak = sum(tim * 60).round(2)  # hours -> minutes
    print(timetak)

    # --- Map rendering. ---
    def _route_points(gpx_path):
        # Flatten every track segment of a GPX file into (lat, lon) tuples.
        # FIX: the original left the file handles open; use a context manager.
        with open(gpx_path, 'r') as fh:
            g = gpxpy.parse(fh)
        return [(point.latitude, point.longitude)
                for track in g.tracks
                for segment in track.segments
                for point in segment.points]

    points101 = _route_points('101.gpx')
    points280 = _route_points('280.gpx')
    points680 = _route_points('680.gpx')
    points880 = _route_points('880.gpx')

    # Map centred on the 880 route's average coordinate (as in the original).
    ave_lat = sum(p[0] for p in points880) / len(points880)
    ave_lon = sum(p[1] for p in points880) / len(points880)
    my_map = folium.Map(location=[ave_lat, ave_lon],
                        zoom_start=9,
                        tiles="Stamen Terrain")
    # One layer per freeway; only the requested one is visible initially.
    fg101 = folium.FeatureGroup(name="U.S 101", show=(Fwy == 101))
    fg280 = folium.FeatureGroup(name="I280", show=(Fwy == 280))
    fg680 = folium.FeatureGroup(name="I680", show=(Fwy == 680))
    fg880 = folium.FeatureGroup(name="I880", show=(Fwy == 880))
    groups = {101: fg101, 280: fg280, 680: fg680, 880: fg880}

    # Blue car markers: mainline stations with prediction popups.
    for row in withmeta_df.itertuples():
        popuptext = "<b>Station:</b>"+str(row.station)+"<br>"+"<b>City:</b>"+str(row.City)+"<br>"+ \
                    "<b>Direction:</b>"+str(row.Dir)+"<br>"+ \
                    "<b>Predicted Speed:</b>"+str(row.p_speed)+"<br>"+ \
                    "<b>Avg Occupancy:</b>"+str(row.occupancy)+"<br>"+ \
                    "<b>Avg Precipitation:</b>"+str(row.hourlyprecipitation)+"<br>"+ \
                    "<b>Avg Windspeed:</b>"+str(row.hourlywindspeed)+"<br>"+ \
                    "<b>Avg Visibility:</b>"+str(row.hourlyvisibility)+"<br>"+ \
                    "<b>Incident Count:</b>"+str(row.incident)
        test = folium.Html(popuptext, script=True)
        popup = folium.Popup(test, max_width=200)
        fg = groups.get(row.Fwy)
        if fg is not None:
            fg.add_child(
                folium.Marker(location=[row.Latitude, row.Longitude],
                              popup=popup,
                              icon=folium.Icon(color='blue',
                                               prefix='fa',
                                               icon='car')))
    # Red circle markers: on/off-ramp stations (no popup).
    for row in onoff_withmeta_df.itertuples():
        fg = groups.get(row.Fwy)
        if fg is not None:
            fg.add_child(
                folium.Marker(location=[row.Latitude, row.Longitude],
                              icon=folium.Icon(color='red',
                                               prefix='fa',
                                               icon='circle')))

    folium.PolyLine(points101, color="black", weight=2.5, opacity=1).add_to(fg101)
    folium.PolyLine(points280, color="purple", weight=2.5, opacity=1).add_to(fg280)
    folium.PolyLine(points680, color="green", weight=2.5, opacity=1).add_to(fg680)
    folium.PolyLine(points880, color="yellow", weight=2.5, opacity=1).add_to(fg880)
    my_map.add_child(fg101)
    my_map.add_child(fg280)
    my_map.add_child(fg680)
    my_map.add_child(fg880)
    folium.LayerControl().add_to(my_map)
    # (Removed: a malformed legend_html string whose add_child call was
    # already commented out in the original — it was never rendered.)
    my_map.save('./static/Map.html')

    # --- Per-direction averages returned to the caller. ---
    withmeta_df.drop_duplicates(subset=['station'], inplace=True)
    finavg = withmeta_df.groupby('Dir').agg({
        'speed': 'mean',
        'incident': 'sum',
        'occupancy': 'mean',
        'hourlyprecipitation': 'mean',
        'hourlywindspeed': 'mean',
        'hourlyvisibility': 'mean'
    })
    finavg = finavg.reset_index()
    avgocc = str(finavg['occupancy'][0].astype('float').round(1))
    avgspeed = str(finavg['speed'][0].astype('float').round(1))
    avgvisibility = str(finavg['hourlyvisibility'][0].astype('float').round(1))
    avgwindspeed = str(finavg['hourlywindspeed'][0].astype('float').round(1))
    avgprecipitation = str(
        finavg['hourlyprecipitation'][0].astype('float').round(1))
    incidentcount = str(finavg['incident'][0].astype('int'))
    return my_map, timetak, avgocc, avgspeed, avgvisibility, avgwindspeed, avgprecipitation, incidentcount
def predict_1_28(val, register, app, video, act):
    """Train/evaluate an activity-prediction model for users registered on
    days 1-28, with CSV caching of labels and features under ../data1/1_28/.

    Parameters
    ----------
    val : int
        1 = validation mode (train on window 1, validate on window 2);
        0 = final mode (train on both windows, score the test window).
    register, app, video, act : DataFrame
        Raw log tables (register has ``user_id`` and ``register_day``).

    Returns
    -------
    tuple
        ``(ids, test_y, best)`` — test user ids, predicted scores, and the
        thresholded/ranked selection from ``getbest``.
    """
    path = '../data1/1_28/'

    def get_features_all(df, df1):
        # Global count features computed over the train+test concatenation,
        # then split back into the original two frames.
        lendf = len(df)
        df = df.append(df1)
        del df1
        gc.collect()
        df = docount(df, df, 'ALL', ['register_type'])
        df = docount(df, df, 'ALL', ['device_type'])
        del df['user_id']
        df1 = df[lendf:]
        df = df[:lendf]
        return df, df1

    # Three registration cohorts; register_time = days observed before the
    # label window starts. .copy() avoids SettingWithCopy on the slices.
    df1 = register[register.register_day < 15].copy()
    df1['register_time'] = 17 - register.register_day
    df2 = register[register.register_day < 22].copy()
    df2['register_time'] = 24 - register.register_day
    # BUGFIX: the original used chained indexing
    # (`df2[mask]['register_time'] = 16`), which assigns into a temporary
    # copy and silently does nothing. `.loc` makes the cap actually apply.
    df2.loc[df2['register_time'] > 16, 'register_time'] = 16
    test_df = register[register.register_day < 29].copy()
    test_df['register_time'] = 31 - test_df.register_day
    # NOTE(review): the original repeated the df2 cap at this point; it was
    # probably intended for test_df. Preserved on df2 (now a no-op) until
    # the intent is confirmed.
    df2.loc[df2['register_time'] > 16, 'register_time'] = 16
    del df1['register_day'], df2['register_day'], test_df['register_day']

    # Labels: active in days 17-23 (window 1) / 24-30 (window 2), cached.
    if os.path.exists(path + 'train_y1.csv'):
        train_y1 = pd.read_csv(path + 'train_y1.csv')
    else:
        train_y1 = is_active(df1, 17, 23, app, video, act)
        train_y1.to_csv(path + 'train_y1.csv', index=False)
    train_y1 = train_y1['Y']
    if os.path.exists(path + 'train_y2.csv'):
        train_y2 = pd.read_csv(path + 'train_y2.csv')
    else:
        train_y2 = is_active(df2, 24, 30, app, video, act)
        train_y2.to_csv(path + 'train_y2.csv', index=False)
    train_y2 = train_y2['Y']

    # Features per cohort, cached.
    if os.path.exists(path + 'df1.csv'):
        df1 = pd.read_csv(path + 'df1.csv')
    else:
        df1 = get_features_ks(df1, 1, 16, app, video, act)
        df1.to_csv(path + 'df1.csv', index=False)
    if os.path.exists(path + 'df2.csv'):
        df2 = pd.read_csv(path + 'df2.csv')
    else:
        df2 = get_features_ks(df2, 8, 23, app, video, act)
        df2.to_csv(path + 'df2.csv', index=False)

    if val:
        # Validation: window 1 trains, window 2 is held out.
        train_df = df1
        test_df = df2
        train_y = train_y1
        val_y = train_y2
    else:
        # Final: both windows train, the day-1..28 cohort is scored.
        if os.path.exists(path + 'test_df.csv'):
            test_df = pd.read_csv(path + 'test_df.csv')
        else:
            test_df = get_features_ks(test_df, 15, 30, app, video, act)
            test_df.to_csv(path + 'test_df.csv', index=False)
        train_df = df1.append(df2)
        train_y = train_y1.append(train_y2)
    del df1, df2
    gc.collect()

    ids = test_df['user_id']
    train_df, test_df = get_features_all(train_df, test_df)
    pre_train, test_y = predict_data(train_df, train_y, 10, test_df, importance=1)

    if val == 1:
        showresults(val_y, test_y)
        showtop(val_y, test_y, nums=15428)
        showtop(val_y, test_y, nums=15905)
        showfalse(ids, test_df, val_y, test_y)
        return ids, test_y, getbest(ids, test_y, th=0.4)
    else:
        showresults(train_y, pre_train)
        showtop(train_y, pre_train, nums=25713)
        return ids, test_y, getbest(ids, test_y, rank=22088)
def autoencoder():
    """Semi-supervised CIFAR-10 pipeline: pretrain an autoencoder on
    labelled + unlabelled data, then evaluate a saved DNN head on batch 2.

    Uses 500 labelled images and images 6000+ of batch 1 as "unlabelled"
    data; batch 2 serves as the test set. Prints the test accuracy of the
    model loaded from 'dnn_model_1.h5'; returns nothing.
    """
    # Pickled CIFAR-10 batches. Each dict holds
    # [b'batch_label', b'labels', b'data', b'filenames'].
    dict = utils.unpickle('cifar_10/data_batch_1')
    test_dict = utils.unpickle('cifar_10/data_batch_2')
    LX, LY = utils.data_fromCIFAR10(dict, 500)
    test_data, test_label = utils.data_fromCIFAR10(test_dict, 10000)
    #LX, LY = utils.data_fromCIFAR10(dict, 5000)
    #print("label data number {}, label number {}".format(np.shape(LX), np.shape(LY)))
    # Unlabelled pool: the tail of batch 1 (labels kept only for analysis).
    allData, allLabel = utils.data_fromCIFAR10(dict, 10000)
    UX = allData[6000:]
    true_label = allLabel[6000:]
    # Autoencoder input: labelled + unlabelled images, augmented; the
    # reconstruction target X_normal is the [0,1]-scaled augmented data.
    train_data = np.concatenate((LX, UX), axis=0)
    train_data, _ = data_aug(train_data, np.ones((train_data.shape[0], 1)))
    X_normal = np.asarray(train_data, dtype='float32') / 255.0
    # Train the AE from scratch on first run; afterwards reuse the cache.
    if not os.path.isfile('ae_model.h5'):
        ae_model, ae_dnn = net.Autoencoder(train_data, X_normal,
                                           model_train='ae_model.h5',
                                           save_model='ae_model.h5',
                                           validate=0.1)
    else:
        ae_model, ae_dnn = net.Autoencoder(train_data, X_normal,
                                           model_train='ae_model.h5',
                                           validate=0.1)
    '''ae_predict = utils.predict_data(test_data, ae_model)
    ae_predict = np.argmax(ae_predict, axis=1)'''
    #print('auto encoder prediction dim = {}'.format(np.shape(ae_predict)))
    # Freeze the encoder so only the classifier head would train.
    for layer in ae_model.layers:
        layer.trainable = False
    # Labelled data, augmented and one-hot encoded for the classifier head.
    x_train, y_train = data_aug(LX, LY)
    y_train = to_categorical(y_train, 10)
    # NOTE(review): iter/save_model/early_stop/model_check are set up but
    # never used below — the fine-tuning loop appears to have been removed,
    # and a previously trained head is loaded directly instead.
    iter = 10
    save_model = 'dnn_model.h5'
    early_stop = EarlyStopping(monitor='val_loss', patience=5, mode='min',
                               min_delta=0)
    model_check = ModelCheckpoint(save_model, monitor='val_loss', verbose=1,
                                  save_best_only=True, save_weights_only=False)
    ae_dnn = load_model('dnn_model_1.h5')
    # Test accuracy of the loaded head on batch 2.
    prediction = utils.predict_data(test_data, ae_dnn)
    prediction = np.argmax(prediction, axis=1)
    count = 0
    for i in range(np.shape(test_label)[0]):
        if (test_label[i] == prediction[i]):
            count += 1
    print('Correct Rate = {}'.format(count / np.shape(test_label)[0]))
def predict_24_28(val, register, app, video, act):
    """Predict activity for users registered on days 24-28 ("new" users),
    using sliding 5-day registration windows as training examples.

    Each training example covers users registered in a 5-day window
    [i, i+4], features from [i, i+6], and a label window [i+7, i+13].
    Features/labels are cached as CSVs under ../data1/24_28/.

    Parameters
    ----------
    val : int
        1 = validation mode (validate on the day 17-21 cohort);
        0 = final mode (score the day 24-28 cohort).
    register, app, video, act : DataFrame
        Raw log tables; app/video/act each carry a ``day`` column.

    Returns
    -------
    tuple
        ``(ids, test_y, best)`` — test user ids, predicted scores, and the
        thresholded/ranked selection from ``getbest``.
    """

    def get_features(df, d1, d2):
        # Build per-user features from logs restricted to days [d1, d2],
        # re-based so that day 0 == d1 and `lastday` == d2 - d1.
        tapp = app[(app.day >= d1) & (app.day <= d2)]
        tact = act[(act.day >= d1) & (act.day <= d2)]
        tvideo = video[(video.day >= d1) & (video.day <= d2)]
        tapp.day = tapp.day - d1
        tact.day = tact.day - d1
        tvideo.day = tvideo.day - d1
        lastday = d2 - d1
        # Days a user has been registered inside this window.
        df['register_time'] = d2 - df.register_day + 1
        del df['register_day']
        # --- app launches: mean daily launch rate only. ---
        df = docount(df, tapp, 'app', ['user_id'])
        df['app_mean#'] = df['app$user_id#'] / df['register_time']
        #df = domax(df,tapp,'app',['user_id'],'day')
        #df['last_app_day'] = lastday - df['app$user_id_by_day_max']+1
        del df['app$user_id#']
        #df['app_day_missing'] = df['register_time'] - df['app$user_id#']
        #df['app$user_id#'] = df['app$user_id#']/df['register_time']
        #df = dovar(df,tapp,'app',['user_id'],'day')
        #df = docount(df,tapp[tapp.day>lastday-2],'app_last_2',['user_id'])
        #df = docount(df,tapp[tapp.day>lastday-1],'app_last_1',['user_id'])
        #df = docount(df,tapp[tapp.day==lastday],'app_last_1',['user_id'])
        gc.collect()
        # --- video creations: counts over the last 2 / 3 days. ---
        # (several alternative video aggregates were tried and commented out:
        #  totals, per-day means/max/var, distinct-day counts)
        df = docount(df, tvideo[tvideo.day > lastday - 2], 'video_last_2', ['user_id'])
        df = docount(df, tvideo[tvideo.day > lastday - 3], 'video_last_3', ['user_id'])
        #df = docount(df,tvideo[tvideo.day==lastday],'video_last_1',['user_id'])
        gc.collect()
        # --- actions: recency plus short-window counts. ---
        # (alternative act aggregates — totals, per-day stats, distinct-day
        #  counts — were tried and commented out)
        df = domax(df, tact, 'act', ['user_id'], 'day')
        # Days since the user's most recent action in the window.
        df['last_act_day'] = lastday - df['act$user_id_by_day_max'] + 1
        del df['act$user_id_by_day_max']
        df = docount(df, tact[tact.day > lastday - 2], 'act_last_2', ['user_id'])
        df = docount(df, tact[tact.day > lastday - 3], 'act_last_3', ['user_id'])
        #df = docount(df,tact[tact.day==lastday],'act_last_1',['user_id'])
        gc.collect()
        # Per-page action counts over the last 3 / 2 / 1 days.
        #page_list = list(tact['page'].unique())
        for c in [0, 1, 2, 3]:
            df = docount(df, tact[(tact['page'] == c) & (tact.day > lastday - 3)], 'act_last_3_page=' + str(c), ['user_id'])
            df = docount(df, tact[(tact['page'] == c) & (tact.day > lastday - 2)], 'act_last_2_page=' + str(c), ['user_id'])
            df = docount(df, tact[(tact['page'] == c) & (tact.day > lastday - 1)], 'act_last_1_page=' + str(c), ['user_id'])
        # Distinct authors / videos interacted with over the last 3 / 2 / 1 days.
        df = doiq(df, tact[tact.day > lastday - 3], 'act_last_3', ['user_id'], 'author_id')
        df = doiq(df, tact[tact.day > lastday - 3], 'act_last_3', ['user_id'], 'video_id')
        df = doiq(df, tact[tact.day > lastday - 2], 'act_last_2', ['user_id'], 'author_id')
        df = doiq(df, tact[tact.day > lastday - 2], 'act_last_2', ['user_id'], 'video_id')
        df = doiq(df, tact[tact.day > lastday - 1], 'act_last_1', ['user_id'], 'author_id')
        df = doiq(df, tact[tact.day > lastday - 1], 'act_last_1', ['user_id'], 'video_id')
        # Per-action-type counts over the last 3 / 2 / 1 days.
        for c in [0, 1, 2, 3]:
            df = docount(df, tact[(tact['action_type'] == c) & (tact.day > lastday - 3)], 'act_last_3_action_type=' + str(c), ['user_id'])
            df = docount(df, tact[(tact['action_type'] == c) & (tact.day > lastday - 2)], 'act_last_2_action_type=' + str(c), ['user_id'])
            df = docount(df, tact[(tact['action_type'] == c) & (tact.day > lastday - 1)], 'act_last_1_action_type=' + str(c), ['user_id'])
        gc.collect()
        return df

    path = '../data1/24_28/'
    if val:
        # Validation cohort: registered days 17-21, labelled on days 24-30.
        if os.path.exists(path + 'val_df.csv'):
            test_df = pd.read_csv(path + 'val_df.csv')
            val_y = pd.read_csv(path + 'val_y.csv')
        else:
            test_df = register[(register.register_day >= 17) & (register.register_day <= 21)]
            test_df = get_features(test_df, 17, 23)
            val_y = is_active(test_df, 24, 30, app, video, act)
            test_df.to_csv(path + 'val_df.csv', index=False)
            val_y.to_csv(path + 'val_y.csv', index=False)
        val_y = val_y['Y']
        # Training examples: sliding windows i = 1..10.
        if os.path.exists(path + 'val_train_df.csv'):
            train_df = pd.read_csv(path + 'val_train_df.csv')
            train_y = pd.read_csv(path + 'val_train_y.csv')
        else:
            train_df = pd.DataFrame()
            train_y = pd.DataFrame()
            for i in range(1, 11):
                df = register[(register.register_day >= i) & (register.register_day <= i + 4)]
                y = is_active(df, i + 7, i + 13, app, video, act)
                df = get_features(df, i, i + 6)
                train_df = train_df.append(df)
                train_y = train_y.append(y)
            train_df.to_csv(path + 'val_train_df.csv', index=False)
            train_y.to_csv(path + 'val_train_y.csv', index=False)
    else:
        # Final cohort: registered days 24-28, featured on days 24-30.
        if os.path.exists(path + 'test_df.csv'):
            test_df = pd.read_csv(path + 'test_df.csv')
        else:
            test_df = register[(register.register_day >= 24) & (register.register_day <= 28)]
            test_df = get_features(test_df, 24, 30)
            test_df.to_csv(path + 'test_df.csv', index=False)
        if os.path.exists(path + 'train_df.csv'):
            train_df = pd.read_csv(path + 'train_df.csv')
            train_y = pd.read_csv(path + 'train_y.csv')
        else:
            # Extend the cached validation training set (windows 1-10) with
            # windows 11-17 when available; otherwise build all 17 windows.
            if os.path.exists(path + 'val_train_df.csv'):
                train_df = pd.read_csv(path + 'val_train_df.csv')
                train_y = pd.read_csv(path + 'val_train_y.csv')
                for i in range(11, 18):
                    df = register[(register.register_day >= i) & (register.register_day <= i + 4)]
                    y = is_active(df, i + 7, i + 13, app, video, act)
                    df = get_features(df, i, i + 6)
                    train_df = train_df.append(df)
                    train_y = train_y.append(y)
            else:
                train_df = pd.DataFrame()
                train_y = pd.DataFrame()
                for i in range(1, 18):
                    df = register[(register.register_day >= i) & (register.register_day <= i + 4)]
                    y = is_active(df, i + 7, i + 13, app, video, act)
                    df = get_features(df, i, i + 6)
                    train_df = train_df.append(df)
                    train_y = train_y.append(y)
            train_df.to_csv(path + 'train_df.csv', index=False)
            train_y.to_csv(path + 'train_y.csv', index=False)
    train_y = train_y['Y']
    #print(sum(train_y)/len(train_y))

    def get_features_all(df, df1):
        # Concatenate train+test, drop ids, and split back — placeholder for
        # globally-computed features (the domean experiments are disabled).
        lendf = len(df)
        df = df.append(df1)
        del df1
        gc.collect()
        #for c in ['act_last_2$user_id#']:
        #    df = domean(df,df,'All',['device_type'],c);gc.collect()
        #    df = domean(df,df,'All',['register_type'],c);gc.collect()
        # (a long commented-out feature list `ccc` used for ad-hoc feature
        #  ablation — `for i in range(38,39): del df[ccc[i]]` — was removed
        #  here for readability; see version control for the full list)
        del df['user_id']
        #del df['last_app_day'],df['last_video_day'],df['video_last_1$user_id#'],df['app_last_1$user_id#']
        #del df['act_last_1$user_id#'],df['app_last_2$user_id#']
        df1 = df[lendf:]
        df = df[:lendf]
        return df, df1

    ids = test_df['user_id']
    train_df, test_df = get_features_all(train_df, test_df)
    pre_train, test_y = predict_data(train_df, train_y, 10, test_df, importance=1)
    #print(test_y)
    if val == 1:
        print(len(train_y), sum(train_y))
        showresults(val_y, test_y)
        showtop(val_y, test_y, nums=4723)
        showtop(train_y, pre_train, nums=38507)
        #return ids,test_y,getbest(ids,test_y,rank=4723)
        return ids, test_y, getbest(ids, test_y, th=0.4)
    else:
        showresults(train_y, pre_train)
        showtop(train_y, pre_train, nums=70275)
        return ids, test_y, getbest(ids, test_y, rank=5498)