def submit_partial_merge(base, folder, all_blended=False):
    """Overlay partial 'treva' submission CSVs onto a base submission and upload.

    base        -- filename of the base submission under <root>/data/submits/.
    folder      -- subfolder under <root>/submit/ holding the partial CSVs.
    all_blended -- if True use only files whose name contains 'blend',
                   otherwise only files without 'blend' in the name.

    Side effects: writes the merged CSV under <root>/submit/ and uploads it
    via submiter. Rows present in the partial files replace the matching
    row_ids of the base submission.
    """
    root_path = '/home/workspace/checkins'
    folder = "%s/submit/%s" % (root_path, folder)
    stamp = str(datetime.now().strftime("%Y%m%d_%H%M%S"))
    output = "%s/submit/treva_overwrite_%s_all_blended_%s.csv" % (root_path, stamp, all_blended)
    # select either the blended or the raw partial files
    if all_blended:
        tfiles = [f for f in listdir(folder) if 'blend' in f]
    else:
        tfiles = [f for f in listdir(folder) if 'blend' not in f]
    # # remove old batch
    # print("tfiles before removing old batch: %i" % len(tfiles))
    # old_partials = [f for f in listdir(root_path + "/submit/treva_merge")]
    # tfiles = [f for f in tfiles if f not in old_partials]
    # print("tfiles after removing old batch: %i" % len(tfiles))
    # concat and merge
    df_treva = [pd.read_csv("%s/%s" % (folder, f)) for f in tfiles]
    df_treva = pd.concat(df_treva).sort_values(by='row_id')
    df_base = pd.read_csv("%s/data/submits/%s" % (root_path, base))
    # drop base rows that the partial results overwrite
    df_base = df_base[~df_base.row_id.isin(df_treva.row_id.values)]
    df_overwrite = pd.concat([df_base, df_treva]).sort_values(by='row_id')
    df_overwrite[['row_id', 'place_id']].sort_values(by='row_id').to_csv(output, index=False)
    # sanity check: row_id counts should line up (duplicates would show here)
    print("ensure dim:", len(df_treva), len(set(df_treva.row_id.values)), len(set(df_overwrite.row_id.values)))
    print("overwrite output written in %s @ %s" % (output, datetime.now()))
    submiter.submiter().submit(entry=output, message="treva submit_partial_merge with %s and all_blended=%s" % (base, all_blended))
def main(stamp, x_step, y_step, do_submit=False, base=None):
    """Post-process grid results under <root>/submit/<stamp>, write a submission CSV.

    stamp          -- run folder name under /home/workspace/checkins/submit/.
    x_step, y_step -- grid step sizes forwarded to process_all.
    do_submit      -- when truthy, upload the CSV via submiter.
    base           -- optional base submission in ./data/submits/ whose rows
                      fill in row_ids missing from this run.

    NOTE(review): `process_all` is defined elsewhere in this module; its
    `score_all` return presumably is a list of (score, length) pairs — the
    weighted average below depends on that. Raises ZeroDivisionError if
    score_all is empty.
    """
    start_time = time.time()
    timestamp = str(datetime.now().strftime("%Y%m%d_%H%M%S"))
    path = '/home/workspace/checkins/submit/%s' % stamp
    score_all, df_all = process_all(path=path, size=10.0, x_step=x_step, y_step=y_step)
    # length-weighted average of per-grid scores
    score_avg = sum([v * l for v, l in score_all]) / sum([l for v, l in score_all])
    print("score_avg: ", score_avg)
    sub_fname = '/home/workspace/checkins/submit/%s_%s_cv%.4f.csv' % (stamp, timestamp, score_avg)
    # columns 0/1/2 hold the top-3 place predictions; join as one string
    df_all['place_id'] = [" ".join([str(k) for k in l]) for l in df_all[[0, 1, 2]].values.tolist()]
    if base:
        # backfill rows not covered by this run from the base submission
        df_base = pd.read_csv("./data/submits/%s" % (base))
        df_base = df_base[~df_base.row_id.isin(df_all.row_id.values)]
        df_all = pd.concat([df_base, df_all]).sort_values(by='row_id')
    df_all[['row_id', 'place_id']].to_csv(sub_fname, index=False)
    print("submit file written in %s" % sub_fname)
    if do_submit:
        submiter.submiter().submit(entry=sub_fname, message="%s_%s_cv%s" % (stamp, timestamp, score_avg))
    print("[Finish!] Elapsed %.1f secs" % (time.time() - start_time))
def train_alg(self, alg, keep_model=False, submit=False, upload=False, mdl_config=None):
    """Train `alg` on the current data split and return the validation score.

    alg        -- algorithm key ('skrf', 'knn', 'xgb', ...).
    keep_model -- keep intermediate model files instead of clearing them.
    submit     -- also predict on the test split and write a submit file.
    upload     -- when submitting, upload the file via submiter.
    mdl_config -- optional dict of model hyper-parameters.

    Fixes vs. original: the `mdl_config={}` mutable default (shared across
    calls) is replaced by a None sentinel, and the pickle dump now closes
    its file handle via a context manager.
    """
    if mdl_config is None:
        mdl_config = {}
    # get data
    start_time = time.time()
    norm = self.params.get('norm')
    df_train, df_valid, df_test = self.pas.get_data()
    # train & test
    print("[train_alg]: alg=%s, mdl_config=%s" % (alg, mdl_config))
    self.tra.train(df_train, alg=alg, mdl_config=mdl_config, norm=norm)
    train_score, valid_score = 0, 0
    if self.params['size'] <= 0.5:
        # eva.train only when dev.
        _, train_score = self.eva.evaluate(df_train, title='Eva.Train', norm=norm)
    if len(df_valid) > 0:
        valids_total, valid_score = self.eva.evaluate(df_valid, title='Eva.Test', norm=norm)
        # persist validation predictions for later blending/analysis
        with open("%s/valid/valid_%s.pkl" % (self.params['root'], self.params['stamp']), 'wb') as fh:
            pickle.dump([valids_total, df_valid], fh)
        # self.eva.gen_submit_file(valids_total, valid_score, title='valid')
    if alg in ['skrf', 'skrfp', 'sket', 'sketp']:
        print("[skrf feature_importance]", self.get_feature_importance())
    # save & clear
    if not keep_model:
        self.eva.clear_meta_files()
    if submit:
        preds_total, _ = self.eva.evaluate(df_test, title='Submit', norm=norm)
        sfile = self.eva.gen_submit_file(preds_total, valid_score)
        if upload:
            submiter.submiter().submit(entry=sfile, message=self.params)
    print("[Finished!] Elapsed time overall for %.2f secs" % (time.time() - start_time))
    return valid_score
def blending_flow(va_paths, te_paths, top_w=2, submit=False):
    """Blend per-model predictions and write a knn2 submission CSV.

    va_paths/te_paths -- pickle files of validation/test prediction dicts
                         (validation dicts carry 'score' and 'ytest' keys).
    top_w             -- weight for the best-scoring model; the rest get 1.
    submit            -- when truthy, upload the CSV via submiter.

    NOTE(review): `stamp` and `blendor` are module-level names defined
    elsewhere in this file.

    Fix vs. original: pickles were loaded via `open(...)` inside list
    comprehensions, leaking the file handles; now each file is closed.
    """
    va_preds = []
    for path in va_paths:
        with open(path, 'rb') as fh:
            va_preds.append(pickle.load(fh))
    te_preds = []
    for path in te_paths:
        with open(path, 'rb') as fh:
            te_preds.append(pickle.load(fh))
    # give the best validation model extra weight
    scores = [v['score'][1] for v in va_preds]
    best_mdl = scores.index(max(scores))
    mdl_weights = [(top_w if mi == best_mdl else 1) for mi in range(len(va_preds))]
    print("scores=%s, mdl_weights=%s" % (scores, mdl_weights))
    # blending (the validation pass is only for its printed diagnostics)
    _ = blendor(va_preds, mdl_weights, ytest=va_preds[0]['ytest'])
    blended_submits = blendor(te_preds, mdl_weights, ytest=None)
    # output
    output = "./submit/knn2_blended_%s.csv" % (stamp)
    df = pd.DataFrame(blended_submits)
    df['row_id'] = df.index
    df['place_id'] = df[[0, 1, 2]].astype(str).apply(lambda x: ' '.join(x), axis=1)
    df.drop([0, 1, 2], axis=1, inplace=True)
    df[['row_id', 'place_id']].sort_values(by='row_id').to_csv(output, index=False)
    print("submit file written in %s @ %s" % (output, datetime.now()))
    if submit:
        submiter.submiter().submit(entry=output, message="knn2")
def submit_partial_merge(base, folder, all_blended=False):
    """Merge partial 'treva' CSVs over a base submission and upload the result.

    Partial files under <root>/submit/<folder> replace matching row_ids of
    the base submission <root>/data/submits/<base>; the merged file is
    written to <root>/submit/ and uploaded via submiter.
    """
    root_path = '/home/workspace/checkins'
    folder = "%s/submit/%s" % (root_path, folder)
    stamp = str(datetime.now().strftime("%Y%m%d_%H%M%S"))
    output = "%s/submit/treva_overwrite_%s_all_blended_%s.csv" % (root_path, stamp, all_blended)
    # all_blended=True -> only files whose name contains 'blend'; else the rest
    tfiles = [f for f in listdir(folder) if ('blend' in f) == bool(all_blended)]
    # load every partial file and stack them, ordered by row_id
    frames = [pd.read_csv("%s/%s" % (folder, fname)) for fname in tfiles]
    df_treva = pd.concat(frames).sort_values(by='row_id')
    # keep only base rows that the partials do not overwrite
    df_base = pd.read_csv("%s/data/submits/%s" % (root_path, base))
    keep_mask = ~df_base.row_id.isin(df_treva.row_id.values)
    df_overwrite = pd.concat([df_base[keep_mask], df_treva]).sort_values(by='row_id')
    df_overwrite[['row_id', 'place_id']].sort_values(by='row_id').to_csv(output, index=False)
    # diagnostics: spot duplicated row_ids by comparing counts vs unique counts
    print("ensure dim:", len(df_treva), len(set(df_treva.row_id.values)), len(set(df_overwrite.row_id.values)))
    print("overwrite output written in %s @ %s" % (output, datetime.now()))
    submiter.submiter().submit(entry=output, message="treva submit_partial_merge with %s and all_blended=%s" % (base, all_blended))
def run(self, cmd=None):
    """Dispatch blending experiments by `cmd`; auto-upload when do_upload is set.

    cmd options: 'gs_top_w', 'gs_rank_w', 'gs_top_n', 'debug', 'average',
    'average_but_top', or None (plain launch).

    Fix vs. original: the gs_top_w progress line used '%i' on float weights
    (1.5/1.7/1.9), logging 'gs_top_w=1' for every run; use '%s'.
    """
    #---------------------------------------------
    if cmd == 'gs_top_w':
        # grid-search the extra weight given to the top model
        for top_w in [1.5, 1.7, 1.9]:
            self.top_w = {0: top_w}
            stamp = "gs_top_w%s_%s" % (top_w, str(datetime.now().strftime("%Y%m%d_%H%M%S")))
            self.launch(stamp=stamp)
            print("[RUN] done gs_top_w=%s" % (top_w))
    #---------------------------------------------
    elif cmd == 'gs_rank_w':
        # grid-search rank-position weights
        rank_ws = [
            [1, 0.8, 0.6],
            [1, 0.8, 0.4],
            [1, 0.6, 0.4],
            [1, 0.6, 0.1],
            [1, 0.4, 0.2],
        ]
        for rank_w in rank_ws:
            self.rank_w = rank_w
            stamp = "gs_rank_ws_%s_%s" % ("_".join([str(w) for w in rank_w]), str(datetime.now().strftime("%Y%m%d_%H%M%S")))
            self.launch(stamp=stamp)
            print("[RUN] done gs_rank_ws=%s" % (rank_w))
    #---------------------------------------------
    elif cmd == 'gs_top_n':
        # grid-search number of models blended; uploads each result, then returns
        self.init_models()
        all_mdl_names = [(k, 1) for k, v in self.mdl_names]
        for n in [8, 15, 30]:
            print("[gs_top_n] n=%i" % n)
            self.mdl_names = all_mdl_names[:n]
            self.launch()
            submiter.submiter().submit(entry=self.output_fname, message=self.mdl_names)
        return
    #---------------------------------------------
    elif cmd == 'debug':
        # cap processed rows for a fast debug pass
        self.do_corr_rows = 100000
        self.do_blend_rows = 100000
        self.launch()
    elif cmd == 'average':
        # equal weight for every model
        self.init_models()
        self.mdl_names = [(k, 1) for k, v in self.mdl_names]
        self.launch()
    elif cmd == 'average_but_top':
        # equal weights except the first model counts double
        self.init_models()
        self.mdl_names = [((k, 1) if idx > 0 else (k, 2)) for idx, (k, v) in enumerate(self.mdl_names)]
        self.launch()
    else:
        self.launch()
    # auto-submit
    if self.do_upload:
        submiter.submiter().submit(entry=self.output_fname, message=self.mdl_names)
def generate_submission(preds, sfile, msg, submit=None):
    """Write `preds` (N x 4 int64 array: row_id + top-3 place_ids) to CSV.

    sfile  -- output path; file gets a 'row_id,place_id' header, rows sorted
              by row_id, with the three place ids space-separated.
    msg    -- message forwarded to submiter when uploading.
    submit -- when truthy, upload the written file via submiter.

    Fix vs. original: `.view(np.int)` — the `np.int` alias was deprecated in
    NumPy 1.20 and removed in 1.24, and on some platforms it did not match
    the 'i8' (int64) fields; use np.int64 explicitly.
    """
    print('Writing submission file')
    # sort rows by row_id via a structured view on field 'f0'
    preds = np.sort(preds.view('i8,i8,i8,i8'), order=['f0'], axis=0).view(np.int64)
    with open(sfile, "w") as out:
        out.write("row_id,place_id\n")
        rows = [''] * preds.shape[0]
        for num in range(preds.shape[0]):
            rows[num] = '%d,%d %d %d\n' % (preds[num, 0], preds[num, 1], preds[num, 2], preds[num, 3])
        out.writelines(rows)
    if submit:
        submiter.submiter().submit(entry=sfile, message=msg)
def evaluate_model(self, evaluate=False, submit=False, upload=False):
    """Evaluate an already-trained model and optionally write/upload a submission.

    evaluate -- score the validation split (otherwise valid_score stays 0.0).
    submit   -- predict on the test split and write a submit file.
    upload   -- when submitting, upload the file via submiter.
    """
    print("[Evaluate_model] with params=%s" % (self.params))
    start_time = time.time()
    norm = self.params.get('norm')
    df_train, df_valid, df_test = self.pas.get_data()
    valid_score = 0.0
    if evaluate:
        _, valid_score = self.eva.evaluate(df_valid, title='Test', norm=norm)
    if submit:
        # valid_score (possibly 0.0) is embedded into the submit filename
        preds_total, _ = self.eva.evaluate(df_test, title='Submit', norm=norm)
        sfile = self.eva.gen_submit_file(preds_total, valid_score)
        if upload:
            submiter.submiter().submit(entry=sfile, message=self.params)
    print("[Finished!] evaluate_model for %.2f secs" % (time.time() - start_time))
def generate_submission(preds, sfile, msg, submit=None):
    """Write `preds` (N x 4 int64 array: row_id + top-3 place_ids) to CSV.

    sfile  -- output path; header 'row_id,place_id', rows sorted by row_id,
              the three place ids space-separated.
    msg    -- message forwarded to submiter when uploading.
    submit -- when truthy, upload the written file via submiter.

    Fix vs. original: `np.int` was deprecated in NumPy 1.20 and removed in
    1.24; it also mapped to the platform C long, not necessarily the 'i8'
    (int64) fields used here. Use np.int64.
    """
    print('Writing submission file')
    # sort rows by row_id via a structured view on field 'f0'
    preds = np.sort(preds.view('i8,i8,i8,i8'), order=['f0'], axis=0).view(np.int64)
    with open(sfile, "w") as out:
        out.write("row_id,place_id\n")
        rows = [''] * preds.shape[0]
        for num in range(preds.shape[0]):
            rows[num] = '%d,%d %d %d\n' % (preds[num, 0], preds[num, 1], preds[num, 2], preds[num, 3])
        out.writelines(rows)
    if submit:
        submiter.submiter().submit(entry=sfile, message=msg)
def run(self, cmd=None):
    """Dispatch blending experiments by `cmd`; auto-upload when do_upload is set.

    cmd options: 'gs_top_w', 'gs_rank_w', 'gs_top_n', 'debug', 'average',
    'average_but_top', or None (plain launch).

    Fix vs. original: the gs_top_w progress line used '%i' on the float
    weights (1.5/1.7/1.9), so every iteration logged 'gs_top_w=1'; '%s'
    prints the actual weight.
    """
    #---------------------------------------------
    if cmd == 'gs_top_w':
        # grid-search the extra weight of the top model
        for top_w in [1.5, 1.7, 1.9]:
            self.top_w = {0: top_w}
            stamp = "gs_top_w%s_%s" % (top_w, str(datetime.now().strftime("%Y%m%d_%H%M%S")))
            self.launch(stamp=stamp)
            print("[RUN] done gs_top_w=%s" % (top_w))
    #---------------------------------------------
    elif cmd == 'gs_rank_w':
        # grid-search rank-position weights
        rank_ws = [
            [1, 0.8, 0.6],
            [1, 0.8, 0.4],
            [1, 0.6, 0.4],
            [1, 0.6, 0.1],
            [1, 0.4, 0.2],
        ]
        for rank_w in rank_ws:
            self.rank_w = rank_w
            stamp = "gs_rank_ws_%s_%s" % ("_".join([str(w) for w in rank_w]), str(datetime.now().strftime("%Y%m%d_%H%M%S")))
            self.launch(stamp=stamp)
            print("[RUN] done gs_rank_ws=%s" % (rank_w))
    #---------------------------------------------
    elif cmd == 'gs_top_n':
        # grid-search how many models to blend; uploads each result, then returns
        self.init_models()
        all_mdl_names = [(k, 1) for k, v in self.mdl_names]
        for n in [8, 15, 30]:
            print("[gs_top_n] n=%i" % n)
            self.mdl_names = all_mdl_names[:n]
            self.launch()
            submiter.submiter().submit(entry=self.output_fname, message=self.mdl_names)
        return
    #---------------------------------------------
    elif cmd == 'debug':
        # cap processed rows for a fast debug pass
        self.do_corr_rows = 100000
        self.do_blend_rows = 100000
        self.launch()
    elif cmd == 'average':
        # equal weight for every model
        self.init_models()
        self.mdl_names = [(k, 1) for k, v in self.mdl_names]
        self.launch()
    elif cmd == 'average_but_top':
        # equal weights except the first model counts double
        self.init_models()
        self.mdl_names = [((k, 1) if idx > 0 else (k, 2)) for idx, (k, v) in enumerate(self.mdl_names)]
        self.launch()
    else:
        self.launch()
    # auto-submit
    if self.do_upload:
        submiter.submiter().submit(entry=self.output_fname, message=self.mdl_names)
def train_alg(self, alg, keep_model=False, submit=False, upload=False, mdl_config=None):
    """Train `alg` on the current split; return the validation score.

    alg        -- algorithm key ('skrf', 'knn', 'xgb', ...).
    keep_model -- keep intermediate model files instead of clearing them.
    submit     -- predict on the test split and write a submit file.
    upload     -- when submitting, upload via submiter.
    mdl_config -- optional dict of model hyper-parameters.

    Fixes vs. original: mutable default `mdl_config={}` replaced by a None
    sentinel (the dict was shared between calls); the pickle dump now uses
    a context manager so the file handle is closed.
    """
    if mdl_config is None:
        mdl_config = {}
    # get data
    start_time = time.time()
    norm = self.params.get('norm')
    df_train, df_valid, df_test = self.pas.get_data()
    # train & test
    print("[train_alg]: alg=%s, mdl_config=%s" % (alg, mdl_config))
    self.tra.train(df_train, alg=alg, mdl_config=mdl_config, norm=norm)
    train_score, valid_score = 0, 0
    if self.params['size'] <= 0.5:
        # eva.train only when dev.
        _, train_score = self.eva.evaluate(df_train, title='Eva.Train', norm=norm)
    if len(df_valid) > 0:
        valids_total, valid_score = self.eva.evaluate(df_valid, title='Eva.Test', norm=norm)
        # persist validation predictions for later blending/analysis
        with open("%s/valid/valid_%s.pkl" % (self.params['root'], self.params['stamp']), 'wb') as fh:
            pickle.dump([valids_total, df_valid], fh)
        # self.eva.gen_submit_file(valids_total, valid_score, title='valid')
    if alg in ['skrf', 'skrfp', 'sket', 'sketp']:
        print("[skrf feature_importance]", self.get_feature_importance())
    # save & clear
    if not keep_model:
        self.eva.clear_meta_files()
    if submit:
        preds_total, _ = self.eva.evaluate(df_test, title='Submit', norm=norm)
        sfile = self.eva.gen_submit_file(preds_total, valid_score)
        if upload:
            submiter.submiter().submit(entry=sfile, message=self.params)
    print("[Finished!] Elapsed time overall for %.2f secs" % (time.time() - start_time))
    return valid_score
def conclude(cross_validation, test, ytest, max3index, max3placeids, indices):
    """Either score predictions (CV mode) or write a submission CSV.

    cross_validation -- 1 for scoring mode, anything else writes a CSV.
    test             -- DataFrame indexed by row_id, with a 'grid_cell' column.
    ytest            -- true place ids (used only in CV mode).
    max3index        -- unused in this function (kept for the caller's signature).
    max3placeids     -- N x >=3 array of top-3 predicted place ids per row.
    indices          -- N x >=3 array of top-3 candidate indices, scored in CV mode.

    Returns (indices_score, map3) in CV mode, else (None, None).

    NOTE(review): `alg`, `stamp` and `calcgridwisemap3` are module-level
    names defined elsewhere in this file — confirm before moving this code.
    """
    if cross_validation == 1:
        # MAP@3: 1, 1/2, 1/3 credit for a hit at rank 1/2/3
        indices = ([1 / 1.0, 1 / 2.0, 1 / 3.0] * (ytest[:, None] == indices[:, 0:3])).sum() / indices[np.nonzero(indices[:, 0])].shape[0]
        map3 = ([1 / 1.0, 1 / 2.0, 1 / 3.0] * (ytest[:, None] == max3placeids[:, 0:3])).sum() / max3placeids[np.nonzero(max3placeids[:, 0])].shape[0]
        ## calculation assumes unique values
        print('indices: %.5f, map@3: %.5f' % (indices, map3))
        ## calculate map3 for each grid
        max3placeids1 = pd.DataFrame({
            'row_id': test.index.values,
            'grid_cell': test['grid_cell'],
            'ytest': ytest.values,
            'id1': max3placeids[:, 0],
            'id2': max3placeids[:, 1],
            'id3': max3placeids[:, 2]
        })
        # NOTE(review): gridwisemap3 is neither returned nor printed —
        # presumably calcgridwisemap3 prints/logs per-grid scores itself.
        gridwisemap3 = max3placeids1.groupby('grid_cell').apply(calcgridwisemap3)
        print("[Finish!] @ %s" % (datetime.now()))
        return indices, map3
    else:
        print('writing submission file...')
        max3placeids = pd.DataFrame({
            'row_id': test.index.values,
            'id1': max3placeids[:, 0],
            'id2': max3placeids[:, 1],
            'id3': max3placeids[:, 2]
        })
        # join the three ids into the 'id1 id2 id3' submission format
        max3placeids['place_id'] = max3placeids.id1.astype(str).str.cat(
            [max3placeids.id2.astype(str), max3placeids.id3.astype(str)], sep=' ')
        sfile = './submit/%s_%s.csv' % (alg, stamp)
        max3placeids[['row_id', 'place_id']].to_csv(sfile, header=True, index=False)
        print("[Finish!] @ %s" % datetime.now())
        # upload intentionally disabled (dead branch kept by the author)
        if False:
            submiter.submiter().submit(entry=sfile, message="knn2")
        return None, None
def main(stamp, x_step, y_step, do_submit=False, base=None):
    """Aggregate grid results from <root>/submit/<stamp> into one submission.

    stamp          -- run folder name under /home/workspace/checkins/submit/.
    x_step, y_step -- grid steps forwarded to process_all.
    do_submit      -- when truthy, upload the written CSV via submiter.
    base           -- optional base submission in ./data/submits/ used to
                      backfill row_ids missing from this run.

    NOTE(review): `process_all` is defined elsewhere; `score_all` is treated
    as (score, length) pairs for the weighted average below.
    """
    start_time = time.time()
    timestamp = str(datetime.now().strftime("%Y%m%d_%H%M%S"))
    path = '/home/workspace/checkins/submit/%s' % stamp
    score_all, df_all = process_all(path=path, size=10.0, x_step=x_step, y_step=y_step)
    # length-weighted mean of per-grid CV scores
    score_avg = sum([v * l for v, l in score_all]) / sum([l for v, l in score_all])
    print("score_avg: ", score_avg)
    sub_fname = '/home/workspace/checkins/submit/%s_%s_cv%.4f.csv' % (stamp, timestamp, score_avg)
    # columns 0/1/2 are the top-3 predictions; join them space-separated
    df_all['place_id'] = [" ".join([str(k) for k in l]) for l in df_all[[0, 1, 2]].values.tolist()]
    if base:
        # rows not produced by this run fall back to the base submission
        df_base = pd.read_csv("./data/submits/%s" % (base))
        df_base = df_base[~df_base.row_id.isin(df_all.row_id.values)]
        df_all = pd.concat([df_base, df_all]).sort_values(by='row_id')
    df_all[['row_id', 'place_id']].to_csv(sub_fname, index=False)
    print("submit file written in %s" % sub_fname)
    if do_submit:
        submiter.submiter().submit(entry=sub_fname, message="%s_%s_cv%s" % (stamp, timestamp, score_avg))
    print("[Finish!] Elapsed %.1f secs" % (time.time() - start_time))
def blending_flow(va_paths, te_paths, top_w=2, submit=False):
    """Blend per-model prediction pickles and write a knn2 submission CSV.

    va_paths/te_paths -- pickle files of validation/test prediction dicts
                         (validation dicts carry 'score' and 'ytest' keys).
    top_w             -- weight for the best-scoring model; others get 1.
    submit            -- when truthy, upload the CSV via submiter.

    NOTE(review): `stamp` and `blendor` are module-level names defined
    elsewhere in this file.

    Fix vs. original: the pickle files were opened inside list
    comprehensions and never closed; each handle is now closed via `with`.
    """
    va_preds = []
    for path in va_paths:
        with open(path, 'rb') as fh:
            va_preds.append(pickle.load(fh))
    te_preds = []
    for path in te_paths:
        with open(path, 'rb') as fh:
            te_preds.append(pickle.load(fh))
    # weight the best validation model higher than the rest
    scores = [v['score'][1] for v in va_preds]
    best_mdl = scores.index(max(scores))
    mdl_weights = [(top_w if mi == best_mdl else 1) for mi in range(len(va_preds))]
    print("scores=%s, mdl_weights=%s" % (scores, mdl_weights))
    # blending (validation pass only produces diagnostics; result unused)
    _ = blendor(va_preds, mdl_weights, ytest=va_preds[0]['ytest'])
    blended_submits = blendor(te_preds, mdl_weights, ytest=None)
    # output
    output = "./submit/knn2_blended_%s.csv" % (stamp)
    df = pd.DataFrame(blended_submits)
    df['row_id'] = df.index
    df['place_id'] = df[[0, 1, 2]].astype(str).apply(lambda x: ' '.join(x), axis=1)
    df.drop([0, 1, 2], axis=1, inplace=True)
    df[['row_id', 'place_id']].sort_values(by='row_id').to_csv(output, index=False)
    print("submit file written in %s @ %s" % (output, datetime.now()))
    if submit:
        submiter.submiter().submit(entry=output, message="knn2")
def conclude(cross_validation, test, ytest, max3index, max3placeids, indices):
    """Score top-3 predictions in CV mode, or write the submission CSV.

    cross_validation -- 1 selects scoring mode; any other value writes a CSV.
    test             -- DataFrame indexed by row_id with a 'grid_cell' column.
    ytest            -- true place ids (CV mode only).
    max3index        -- unused here (kept for the caller's signature).
    max3placeids     -- N x >=3 array of predicted place ids.
    indices          -- N x >=3 array of candidate indices scored in CV mode.

    Returns (indices_score, map3) in CV mode, else (None, None).

    NOTE(review): `alg`, `stamp` and `calcgridwisemap3` are module-level
    names defined elsewhere in this file — confirm before reusing this code.
    """
    if cross_validation == 1:
        # MAP@3: credit 1, 1/2, 1/3 for a hit at rank 1, 2, 3
        indices = ([1 / 1.0, 1 / 2.0, 1 / 3.0] * (ytest[:, None] == indices[:, 0:3])).sum() / indices[np.nonzero(indices[:, 0])].shape[0]
        map3 = ([1 / 1.0, 1 / 2.0, 1 / 3.0] * (ytest[:, None] == max3placeids[:, 0:3])).sum() / max3placeids[np.nonzero(max3placeids[:, 0])].shape[0]
        ## calculation assumes unique values
        print('indices: %.5f, map@3: %.5f' % (indices, map3))
        ## calculate map3 for each grid
        max3placeids1 = pd.DataFrame({
            'row_id': test.index.values,
            'grid_cell': test['grid_cell'],
            'ytest': ytest.values,
            'id1': max3placeids[:, 0],
            'id2': max3placeids[:, 1],
            'id3': max3placeids[:, 2]
        })
        # NOTE(review): result not returned/printed here — presumably
        # calcgridwisemap3 reports per-grid scores itself.
        gridwisemap3 = max3placeids1.groupby('grid_cell').apply(calcgridwisemap3)
        print("[Finish!] @ %s" % (datetime.now()))
        return indices, map3
    else:
        print('writing submission file...')
        max3placeids = pd.DataFrame({
            'row_id': test.index.values,
            'id1': max3placeids[:, 0],
            'id2': max3placeids[:, 1],
            'id3': max3placeids[:, 2]
        })
        # 'id1 id2 id3' submission format
        max3placeids['place_id'] = max3placeids.id1.astype(str).str.cat(
            [max3placeids.id2.astype(str), max3placeids.id3.astype(str)], sep=' ')
        sfile = './submit/%s_%s.csv' % (alg, stamp)
        max3placeids[['row_id', 'place_id']].to_csv(sfile, header=True, index=False)
        print("[Finish!] @ %s" % datetime.now())
        # upload deliberately disabled (author kept the dead branch)
        if False:
            submiter.submiter().submit(entry=sfile, message="knn2")
        return None, None
def run(self):
    """Top-level experiment dispatcher keyed on self.params['alg'].

    The run command both selects the algorithm (its prefix before '_') and
    names the experiment, e.g. 'skrf_gs_time' runs grid-search over time
    thresholds with the 'skrf' algorithm.

    Fix vs. original: the 'xgb_gs_params' branch called
    `self.train_alg(alg, params={...})`, but train_alg has no `params`
    keyword (it is `mdl_config`) — that branch raised TypeError.
    """
    run_cmd = self.params['alg']
    alg = run_cmd.split('_')[0]
    print("[RUN_CMD] %s" % run_cmd)
    start_time = time.time()
    #==========================================
    # Shared config
    #==========================================
    if 'knn' in run_cmd:
        # knn feature scaling weights; x_cols restricted to weighted features
        self.params['norm'] = {
            'x': 500,
            'y': 1000,
            'hour': 4,
            'logacc': 1,
            'weekday': 3,
            'qday': 1,
            'month': 2,
            'year': 10,
            'day': 1. / 22,
        }
        # self.params['norm'] = {'x': 700, 'y':1100, 'hour':4, 'qday':1, 'logacc':1, 'weekday':3, 'day':1./22., 'month':2, 'year':10}
        self.params['x_cols'] = [x for x in self.params['x_cols'] if x in self.params['norm'].keys()]
        self.params['x_step'] = 0.25
        self.params['y_step'] = 0.25
    if 'try_inter' in run_cmd:
        self.params['x_inter'] = 2
        self.params['y_inter'] = 2
        self.params['mdl_weights'] = (0.4, 1, 0.4)
    if 'try_large_grid' in run_cmd:
        self.params['x_step'] = 0.4
        self.params['y_step'] = 0.4
    #==========================================
    # Choose-one config
    #==========================================
    if run_cmd == 'all':
        for a in ['skrf', 'skrfp', 'sket', 'sketp', 'knn', 'xgb', 'skgbc']:
            self.init_team()
            self.train_alg(a)
    #------------------------------------------
    elif 'skrf_reverse_valid_split_time' in run_cmd:
        self.params['train_test_split_time'] = 100000
        self.params['place_min_last_checkin'] = None
        self.init_team()
        self.train_alg(alg)
    #------------------------------------------
    elif '_grid_step' in run_cmd:
        # for x_step in [0.04, 0.05, 0.08, 0.1, 0.2]:
        #     for y_step in [0.04, 0.05, 0.08, 0.1, 0.2]:
        for x_step in [0.1, 0.2, 0.5, 1]:
            for y_step in [0.1, 0.2, 0.5, 1]:
                print("=====[%s for step=(%.2f, %.2f)]=====" % (run_cmd, x_step, y_step))
                self.params['x_step'] = x_step
                self.params['y_step'] = y_step
                self.init_team()
                self.train_alg(alg)
    #------------------------------------------
    elif run_cmd == 'knn_grid_weights':
        # self.params['norm'] = {'x': 700, 'y':1100, 'hour':4, 'qday':1, 'logacc':1, 'weekday':3, 'day':1./22., 'month':2, 'year':10}
        for logacc in np.arange(1, 10, 2):
            for qday in np.arange(1, 10, 2):
                print("[knn_grid_weights] logacc=%i, qday=%i" % (logacc, qday))
                self.params['norm']['logacc'] = logacc
                self.params['norm']['qday'] = qday
                self.init_team()
                self.train_alg(alg)
    #------------------------------------------
    elif run_cmd == 'skrf_recursive_feature_elimination':
        # greedy backward elimination over non-fixed features
        fixed_feats = {'logacc', 'qday', 'x', 'y', 'hour', 'weekday', 'year', 'month'}
        feats = set(self.all_feats)
        print("[RFE] checking x_cols for %s" % (feats - fixed_feats))
        while True:
            scores = {}
            self.params['x_cols'] = list(feats)
            self.init_team()
            scores['all'] = self.train_alg(alg)
            print("[RFE] baseline = %.4f" % scores['all'])
            for af in (feats - fixed_feats):
                self.params['x_cols'] = [a for a in feats if a != af]
                self.init_team()
                print("[RFE] x_cols remove [%s], using %s" % (af, self.params['x_cols']))
                scores[af] = self.train_alg(alg)
            rm_feat, rm_score = sorted(scores.items(), key=operator.itemgetter(1), reverse=True)[0]
            # drop the feature whose removal hurts least (within 0.01 of baseline)
            if rm_score > scores['all'] - 0.01:
                print("[RFE] base_score = %.4f, remove %s to achieve %.4f" % (scores['all'], rm_feat, rm_score))
                feats -= set([rm_feat])
            else:
                print("[RFE] finished since no feature shall be removed!")
                break
    #------------------------------------------
    elif 'skrf_mdl_weights' in run_cmd:
        for sw in np.arange(0, 1.2, 0.1):
            self.params['mdl_weights'] = (sw, 0, 1.0, 0, sw)
            self.init_team()
            self.train_alg(alg)
    #------------------------------------------
    elif 'skrf_preprocessing' in run_cmd:
        for en in [0, 1]:
            self.params['en_preprocessing'] = en
            self.init_team()
            self.train_alg(alg)
    #------------------------------------------
    elif 'skrf_max_cands' in run_cmd:
        for proc in ['W', 'H']:
            for cants in np.arange(10, 50, 10):
                self.params['en_preprocessing'] = proc
                self.params['max_cands'] = cants
                self.init_team()
                self.train_alg(alg)
    #------------------------------------------
    elif 'skrf_remove_distance_outlier' in run_cmd:
        for std in np.arange(1, 3, 0.5):
            self.params['remove_distance_outlier'] = std
            self.init_team()
            self.train_alg(alg)
    #------------------------------------------
    elif run_cmd == 'skrf_feats_sel':
        all_feats = self.all_feats
        # baseline
        self.params['x_cols'] = all_feats
        self.init_team()
        self.train_alg(alg)
        # drop 1 feature
        for af in all_feats:
            self.params['x_cols'] = [a for a in all_feats if a != af]
            self.init_team()
            self.train_alg(alg)
    #------------------------------------------
    elif run_cmd == 'skrf_gs_time':
        for mfc in [None, 200000, 250000, 300000]:
            for tmt in [None]:  #, 400000, 500000, 600000, 700000]:
                self.params['place_max_first_checkin'] = mfc
                self.params['train_max_time'] = tmt
                self.init_team()
                self.train_alg(alg)
    #------------------------------------------
    elif run_cmd == 'skrf_gs_loc_th':
        # for th_y in np.arange(1.5, 2.5, 0.1):
        #     for th_x in np.arange(0.6, 2, 0.2):
        for th_y in np.arange(1.7, 2.5, 0.2):
            for th_x in np.arange(2.3, 3.5, 0.2):
                print("[SKRF_GS_LOC_TH]: th_x=%s, th_y=%s" % (th_x, th_y))
                self.params['loc_th_x'] = th_x
                self.params['loc_th_y'] = th_y
                self.init_team()
                self.evaluate_model(evaluate=True, submit=False)
    #------------------------------------------
    elif run_cmd == 'skrf_place_min_checkin':
        for mc in np.arange(0, 5, 1):
            self.params['place_min_checkin'] = mc
            self.init_team()
            self.train_alg(alg)
    #------------------------------------------
    elif run_cmd == 'skrf_gs_time_th_wd':
        for pth in np.arange(0, 0.005, 0.001):
            self.params['time_th_wd'] = pth
            self.init_team()
            self.evaluate_model(evaluate=True, submit=False)
    #------------------------------------------
    elif run_cmd == 'skrf_gs_time_th_hr':
        for pth in np.arange(0.005, 0.02, 0.002):
            self.params['time_th_hr'] = pth
            self.init_team()
            self.train_alg(alg)
    #------------------------------------------
    elif run_cmd == 'skrf_gs_popu_th':
        for pth in np.arange(0, 0.005, 0.001):
            self.params['popu_th'] = pth
            self.init_team()
            self.evaluate_model(evaluate=True, submit=False)
    #------------------------------------------
    elif run_cmd in ['skrf_gs_params', 'skrfp_gs_params']:
        self.init_team()
        for max_feats in [0.3, 0.35, 0.4]:
            for n_estimators in [500]:
                for max_depth in [15]:
                    self.train_alg(alg, mdl_config={'n_estimators': n_estimators, 'max_depth': max_depth, 'max_features': max_feats})
    elif run_cmd in ['sket_gs_params', 'sketp_gs_params']:
        self.init_team()
        for n_estimators in [800, 1200, 1500]:
            for max_depth in [13, 15, 18]:
                for max_feats in ['auto', 0.4, 0.5, 0.6]:
                    self.train_alg(alg, mdl_config={'n_estimators': n_estimators, 'max_depth': max_depth, 'max_features': max_feats})
    elif run_cmd == 'xgb_gs_params':
        self.init_team()
        for n_estimators in [30, 35, 40]:
            for max_depth in [3, 4, 5]:
                for learning_rate in [0.1]:
                    # fixed: was `params=` which train_alg does not accept
                    self.train_alg(alg, mdl_config={'n_estimators': n_estimators, 'max_depth': max_depth, 'learning_rate': learning_rate})
    #------------------------------------------
    elif run_cmd == 'skrf_place_min_last_checkin':
        for mlc in [550000, 650000]:
            self.params['place_min_last_checkin'] = mlc
            self.params['stamp'] = "%s_%s_%i" % (self.params['alg'], self.timestamp, mlc / 1e4)
            self.init_team()
            self.train_alg(alg, submit=True)
    #------------------------------------------
    elif run_cmd == 'skrf_train_min_time':
        for mlc in [0, 50000, 100000, 150000, 200000]:
            self.params['train_min_time'] = mlc
            self.params['stamp'] = "%s_%s_%i" % (self.params['alg'], self.timestamp, mlc / 1e4)
            self.init_team()
            self.train_alg(alg, submit=True)
    #------------------------------------------
    elif 'submit_rf_family' in run_cmd:
        self.params['train_test_split_time'] = 1e10  # use all samples for training
        self.init_team()
        for a in ['skrf', 'skrfp']:
            self.train_alg(a, keep_model=True, submit=True, upload=True)
        self.train_alg('knn', submit=True, upload=True)
    elif 'submit_et_family' in run_cmd:
        self.params['train_test_split_time'] = 1e10  # use all samples for training
        self.init_team()
        for a in ['sket', 'sketp']:
            self.train_alg(a, submit=True, upload=True)
    elif 'submit_full' in run_cmd:
        self.params['train_test_split_time'] = 1e10  # use all samples for training
        self.init_team()
        self.train_alg(alg, submit=True, upload=True)
    elif '_submit' in run_cmd:
        self.init_team()
        self.train_alg(alg, keep_model=True, submit=True)
    elif 'eva_exist' in run_cmd:
        self.init_team()
        self.evaluate_model(evaluate=True, submit=False)
    elif 'smt_exist' in run_cmd:
        self.params['train_test_split_time'] = 1e10
        self.init_team()
        self.evaluate_model(evaluate=False, submit=True, upload=True)
    #------------------------------------------
    elif 'fast' in run_cmd:
        # fast flow debug
        self.init_team()
        self.train_alg(alg, mdl_config={'n_estimators': 5})
    #------------------------------------------
    elif run_cmd == 'treva_cv':
        self.init_team()
        df_train, df_valid, df_test = self.pas.get_data()
        tva = treva.trainer(self.params)
        tva.train(df_train, df_valid, df_test)
    #------------------------------------------
    elif 'treva' in run_cmd:
        if 'elite' in run_cmd:
            self.params['train_test_split_time'] = 1e10
        else:
            self.params['train_test_split_time'] = 700000
        self.init_team()
        df_train, df_valid, df_test = self.pas.get_data()
        tva = treva.trainer(self.params)
        sfile = tva.train(df_train, df_valid, df_test)
        submiter.submiter().submit(entry=sfile, message=self.params)
    elif run_cmd == 'tuner':
        self.init_team()
        df_train, df_valid, _ = self.pas.get_data()
        # 10 random 0.08 x 0.08 grid cells
        grids = []
        for i in range(10):
            xb, yb = int(125 * random()) * 0.08, int(125 * random()) * 0.08
            grids += [(xb, xb + 0.08, yb, yb + 0.08)]
        print(grids)
        df_all = pd.concat([df_train, df_valid])
        all_scores = tuner.tuner(df_all, grids)
    else:
        # single model
        self.init_team()
        self.train_alg(alg)
    #------------------------------------------
    print("[Finished!] Elapsed time overall for %.2f secs" % (time.time() - start_time))
def run(self):
    """Dispatch one experiment flow based on ``self.params['alg']``.

    The command string (``run_cmd``) selects one of many grid-search /
    submit / debug flows; its prefix before the first ``_`` names the base
    algorithm (e.g. ``skrf_gs_time`` -> ``skrf``).  Most branches mutate
    ``self.params``, re-initialize the team, and train or evaluate; some
    also submit results.  Returns None; prints progress and total runtime.
    """
    run_cmd = self.params['alg']
    alg = run_cmd.split('_')[0]  # base algorithm name, e.g. 'skrf', 'knn'
    print("[RUN_CMD] %s" % run_cmd)
    start_time = time.time()
    # ==========================================
    # Shared config (applied before the choose-one dispatch below)
    # ==========================================
    if 'knn' in run_cmd:
        # Per-feature scaling weights for the KNN distance metric.
        self.params['norm'] = {
            'x': 500,
            'y': 1000,
            'hour': 4,
            'logacc': 1,
            'weekday': 3,
            'qday': 1,
            'month': 2,
            'year': 10,
            'day': 1. / 22,
        }
        # self.params['norm'] = {'x': 700, 'y':1100, 'hour':4, 'qday':1, 'logacc':1, 'weekday':3, 'day':1./22., 'month':2, 'year':10}
        # KNN only uses features that have a norm weight defined.
        self.params['x_cols'] = [
            x for x in self.params['x_cols'] if x in self.params['norm']
        ]
        self.params['x_step'] = 0.25
        self.params['y_step'] = 0.25
    if 'try_inter' in run_cmd:
        self.params['x_inter'] = 2
        self.params['y_inter'] = 2
        self.params['mdl_weights'] = (0.4, 1, 0.4)
    if 'try_large_grid' in run_cmd:
        self.params['x_step'] = 0.4
        self.params['y_step'] = 0.4
    # ==========================================
    # Choose-one config
    # ==========================================
    if run_cmd == 'all':
        for a in ['skrf', 'skrfp', 'sket', 'sketp', 'knn', 'xgb', 'skgbc']:
            self.init_team()
            self.train_alg(a)
    # ------------------------------------------
    elif 'skrf_reverse_valid_split_time' in run_cmd:
        self.params['train_test_split_time'] = 100000
        self.params['place_min_last_checkin'] = None
        self.init_team()
        self.train_alg(alg)
    # ------------------------------------------
    elif '_grid_step' in run_cmd:
        # for x_step in [0.04, 0.05, 0.08, 0.1, 0.2]:
        #     for y_step in [0.04, 0.05, 0.08, 0.1, 0.2]:
        for x_step in [0.1, 0.2, 0.5, 1]:
            for y_step in [0.1, 0.2, 0.5, 1]:
                print("=====[%s for step=(%.2f, %.2f)]=====" %
                      (run_cmd, x_step, y_step))
                self.params['x_step'] = x_step
                self.params['y_step'] = y_step
                self.init_team()
                self.train_alg(alg)
    # ------------------------------------------
    elif run_cmd == 'knn_grid_weights':
        # self.params['norm'] = {'x': 500, 'y':1000, 'hour':4, 'qday':1, 'logacc':1, 'weekday':3, 'day':1./22., 'month':2, 'year':10}
        for logacc in np.arange(1, 10, 2):
            for qday in np.arange(1, 10, 2):
                print("[knn_grid_weights] logacc=%i, qday=%i" %
                      (logacc, qday))
                self.params['norm']['logacc'] = logacc
                self.params['norm']['qday'] = qday
                self.init_team()
                self.train_alg(alg)
    # ------------------------------------------
    elif run_cmd == 'skrf_recursive_feature_elimination':
        # Features that are never candidates for elimination.
        fixed_feats = {
            'logacc', 'qday', 'x', 'y', 'hour', 'weekday', 'year', 'month'
        }
        feats = set(self.all_feats)
        print("[RFE] checking x_cols for %s" % (feats - fixed_feats))
        while True:
            scores = {}
            # Baseline score with the full current feature set.
            self.params['x_cols'] = list(feats)
            self.init_team()
            scores['all'] = self.train_alg(alg)
            print("[RFE] baseline = %.4f" % scores['all'])
            # Score each candidate with one feature left out.
            for af in (feats - fixed_feats):
                self.params['x_cols'] = [a for a in feats if a != af]
                self.init_team()
                print("[RFE] x_cols remove [%s], using %s" %
                      (af, self.params['x_cols']))
                scores[af] = self.train_alg(alg)
            # BUGFIX: exclude the 'all' baseline entry when picking the
            # feature to drop.  Previously max() could select 'all' itself
            # (whenever the baseline beat every leave-one-out run), making
            # `feats -= {'all'}` a no-op and the loop spin forever.
            candidates = {k: v for k, v in scores.items() if k != 'all'}
            if not candidates:
                print("[RFE] finished since no feature shall be removed!")
                break
            rm_feat, rm_score = max(candidates.items(),
                                    key=operator.itemgetter(1))
            if rm_score > scores['all'] - 0.01:
                print("[RFE] base_score = %.4f, remove %s to achieve %.4f" %
                      (scores['all'], rm_feat, rm_score))
                feats -= {rm_feat}
            else:
                print("[RFE] finished since no feature shall be removed!")
                break
    # ------------------------------------------
    elif 'skrf_mdl_weights' in run_cmd:
        for sw in np.arange(0, 1.2, 0.1):
            self.params['mdl_weights'] = (sw, 0, 1.0, 0, sw)
            self.init_team()
            self.train_alg(alg)
    # ------------------------------------------
    elif 'skrf_preprocessing' in run_cmd:
        for en in [0, 1]:
            self.params['en_preprocessing'] = en
            self.init_team()
            self.train_alg(alg)
    # ------------------------------------------
    elif 'skrf_max_cands' in run_cmd:
        for proc in ['W', 'H']:
            for cants in np.arange(10, 50, 10):
                self.params['en_preprocessing'] = proc
                self.params['max_cands'] = cants
                self.init_team()
                self.train_alg(alg)
    # ------------------------------------------
    elif 'skrf_remove_distance_outlier' in run_cmd:
        for std in np.arange(1, 3, 0.5):
            self.params['remove_distance_outlier'] = std
            self.init_team()
            self.train_alg(alg)
    # ------------------------------------------
    elif run_cmd == 'skrf_feats_sel':
        all_feats = self.all_feats
        # baseline
        self.params['x_cols'] = all_feats
        self.init_team()
        self.train_alg(alg)
        # drop 1 feature
        for af in all_feats:
            self.params['x_cols'] = [a for a in all_feats if a != af]
            self.init_team()
            self.train_alg(alg)
    # ------------------------------------------
    elif run_cmd == 'skrf_gs_time':
        for mfc in [None, 200000, 250000, 300000]:
            for tmt in [None]:  #, 400000, 500000, 600000, 700000]:
                self.params['place_max_first_checkin'] = mfc
                self.params['train_max_time'] = tmt
                self.init_team()
                self.train_alg(alg)
    # ------------------------------------------
    elif run_cmd == 'skrf_gs_loc_th':
        # for th_y in np.arange(1.5, 2.5, 0.1):
        #     for th_x in np.arange(0.6, 2, 0.2):
        for th_y in np.arange(1.7, 2.5, 0.2):
            for th_x in np.arange(2.3, 3.5, 0.2):
                print("[SKRF_GS_LOC_TH]: th_x=%s, th_y=%s" % (th_x, th_y))
                self.params['loc_th_x'] = th_x
                self.params['loc_th_y'] = th_y
                self.init_team()
                self.evaluate_model(evaluate=True, submit=False)
    # ------------------------------------------
    elif run_cmd == 'skrf_place_min_checkin':
        for mc in np.arange(0, 5, 1):
            self.params['place_min_checkin'] = mc
            self.init_team()
            self.train_alg(alg)
    # ------------------------------------------
    elif run_cmd == 'skrf_gs_time_th_wd':
        for pth in np.arange(0, 0.005, 0.001):
            self.params['time_th_wd'] = pth
            self.init_team()
            self.evaluate_model(evaluate=True, submit=False)
    # ------------------------------------------
    elif run_cmd == 'skrf_gs_time_th_hr':
        for pth in np.arange(0.005, 0.02, 0.002):
            self.params['time_th_hr'] = pth
            self.init_team()
            self.train_alg(alg)
    # ------------------------------------------
    elif run_cmd == 'skrf_gs_popu_th':
        for pth in np.arange(0, 0.005, 0.001):
            self.params['popu_th'] = pth
            self.init_team()
            self.evaluate_model(evaluate=True, submit=False)
    # ------------------------------------------
    elif run_cmd in ['skrf_gs_params', 'skrfp_gs_params']:
        self.init_team()
        for max_feats in [0.3, 0.35, 0.4]:
            for n_estimators in [500]:
                for max_depth in [15]:
                    self.train_alg(alg,
                                   mdl_config={
                                       'n_estimators': n_estimators,
                                       'max_depth': max_depth,
                                       'max_features': max_feats
                                   })
    elif run_cmd in ['sket_gs_params', 'sketp_gs_params']:
        self.init_team()
        for n_estimators in [800, 1200, 1500]:
            for max_depth in [13, 15, 18]:
                for max_feats in ['auto', 0.4, 0.5, 0.6]:
                    self.train_alg(alg,
                                   mdl_config={
                                       'n_estimators': n_estimators,
                                       'max_depth': max_depth,
                                       'max_features': max_feats
                                   })
    elif run_cmd == 'xgb_gs_params':
        self.init_team()
        # NOTE(review): this branch passes `params=` while the rf/et grid
        # searches pass `mdl_config=` — confirm train_alg really accepts
        # both kwargs for the xgb path.
        for n_estimators in [30, 35, 40]:
            for max_depth in [3, 4, 5]:
                for learning_rate in [0.1]:
                    self.train_alg(alg,
                                   params={
                                       'n_estimators': n_estimators,
                                       'max_depth': max_depth,
                                       'learning_rate': learning_rate
                                   })
    # ------------------------------------------
    elif run_cmd == 'skrf_place_min_last_checkin':
        for mlc in [550000, 650000]:
            self.params['place_min_last_checkin'] = mlc
            self.params['stamp'] = "%s_%s_%i" % (self.params['alg'],
                                                 self.timestamp, mlc / 1e4)
            self.init_team()
            self.train_alg(alg, submit=True)
    # ------------------------------------------
    elif run_cmd == 'skrf_train_min_time':
        for mlc in [0, 50000, 100000, 150000, 200000]:
            self.params['train_min_time'] = mlc
            self.params['stamp'] = "%s_%s_%i" % (self.params['alg'],
                                                 self.timestamp, mlc / 1e4)
            self.init_team()
            self.train_alg(alg, submit=True)
    # ------------------------------------------
    elif 'submit_rf_family' in run_cmd:
        self.params[
            'train_test_split_time'] = 1e10  # use all samples for training
        self.init_team()
        for a in ['skrf', 'skrfp']:
            self.train_alg(a, keep_model=True, submit=True, upload=True)
        self.train_alg('knn', submit=True, upload=True)
    elif 'submit_et_family' in run_cmd:
        self.params[
            'train_test_split_time'] = 1e10  # use all samples for training
        self.init_team()
        for a in ['sket', 'sketp']:
            self.train_alg(a, submit=True, upload=True)
    elif 'submit_full' in run_cmd:
        self.params[
            'train_test_split_time'] = 1e10  # use all samples for training
        self.init_team()
        self.train_alg(alg, submit=True, upload=True)
    elif '_submit' in run_cmd:
        self.init_team()
        self.train_alg(alg, keep_model=True, submit=True)
    elif 'eva_exist' in run_cmd:
        self.init_team()
        self.evaluate_model(evaluate=True, submit=False)
    elif 'smt_exist' in run_cmd:
        self.params['train_test_split_time'] = 1e10
        self.init_team()
        self.evaluate_model(evaluate=False, submit=True, upload=True)
    # ------------------------------------------
    elif 'fast' in run_cmd:  # fast flow debug
        self.init_team()
        self.train_alg(alg, mdl_config={'n_estimators': 5})
    # ------------------------------------------
    elif run_cmd == 'treva_cv':
        self.init_team()
        df_train, df_valid, df_test = self.pas.get_data()
        tva = treva.trainer(self.params)
        tva.train(df_train, df_valid, df_test)
    # ------------------------------------------
    elif 'treva' in run_cmd:
        if 'elite' in run_cmd:
            self.params['train_test_split_time'] = 1e10
        else:
            self.params['train_test_split_time'] = 700000
        self.init_team()
        df_train, df_valid, df_test = self.pas.get_data()
        tva = treva.trainer(self.params)
        sfile = tva.train(df_train, df_valid, df_test)
        submiter.submiter().submit(entry=sfile, message=self.params)
    elif run_cmd == 'tuner':
        self.init_team()
        df_train, df_valid, _ = self.pas.get_data()
        # Pick 10 random 0.08 x 0.08 grid cells to tune on.
        grids = []
        for i in range(10):
            xb, yb = int(125 * random()) * 0.08, int(125 * random()) * 0.08
            grids += [(xb, xb + 0.08, yb, yb + 0.08)]
        print(grids)
        df_all = pd.concat([df_train, df_valid])
        all_scores = tuner.tuner(df_all, grids)
    else:  # single model
        self.init_team()
        self.train_alg(alg)
    # ------------------------------------------
    print("[Finished!] Elapsed time overall for %.2f secs" %
          (time.time() - start_time))