def train_wrapper(params):
    """Cross-validate one parameter set and package the outcome for hyperopt.

    Calls ``cross_validate(params, X, y)`` using the module-level ``X`` and
    ``y``, and keeps only the per-fold losses (the second returned value is
    ignored).

    Returns a dict meant to be recorded in the hyperopt trials:
        'loss'      -- mean of the cross-validation losses
        'status'    -- ``STATUS_OK``
        'eval_time' -- local timestamp formatted as YYYY-MM-DD HH:MM:SS
        'params'    -- the ``params`` object that was evaluated
    """
    fold_losses, _ignored = cross_validate(params, X, y)

    mean_loss = np.mean(fold_losses)
    evaluated_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    trial_record = {
        'loss': mean_loss,
        'status': STATUS_OK,
        'eval_time': evaluated_at,
        'params': params,
    }
    return trial_record
#----For smaller data sets------#
# The estimators below either do not work or take very long to train on large
# sparse datasets, and they require a dense matrix (.todense()).
#clf = ensemble.RandomForestRegressor(n_estimators=50); clfname='RFReg_50'
#clf = ensemble.ExtraTreesRegressor(n_estimators=30)  #n_jobs = -1 if running in a main() loop
#clf = ensemble.GradientBoostingRegressor(n_estimators=700, learning_rate=.1, max_depth=1,
#                                         random_state=888, loss='ls'); clf_name='GBM'

# Active choice: AdaBoost over a depth-3 regression tree.
# NOTE(review): clf_name is not assigned next to this estimator; the calls below
# rely on it having been set earlier in the script -- confirm.
base_tree = tree.DecisionTreeRegressor(
    compute_importances=None,
    criterion='mse',
    max_depth=3,
    max_features=None,
    min_density=None,
    min_samples_leaf=1,
    min_samples_split=2,
    random_state=None,
    splitter='best',
)
clf = ensemble.AdaBoostRegressor(
    base_estimator=base_tree,
    n_estimators=150,
    learning_rate=.5,
    loss='linear',
    random_state=None,
)
#clf = gaussian_process.GaussianProcess(corr='cubic', theta0=1e-2, thetaL=1e-4, thetaU=1e-1, random_start=100)
#clf = neighbors.KNeighborsRegressor(100, weights='uniform', algorithm = 'auto'); clf_name='KNN_200'

################################################################################################
#---Different methods of cross validation---#
# May require mtxTrn.toarray()

# CV on the training matrix with the urlid values prepended as an extra sparse
# column; pred_fg=True, and the returned predictions are kept in cv_preds.
cv_preds = train.cross_validate(
    hstack([sparse.csr_matrix(dfTrn.urlid.values).transpose(), mtxTrn]),
    mtxTrnTarget.ravel(),
    folds=10,
    SEED=42,
    test_size=.1,
    clf=clf,
    clf_name=clf_name,
    pred_fg=True,
)

# CV on the plain training matrix; the return value is not kept.
train.cross_validate(
    mtxTrn,
    mtxTrnTarget.ravel(),
    folds=8,
    SEED=888,
    test_size=.1,
    clf=clf,
    clf_name=clf_name,
    pred_fg=False,
)

# Temporal CV: takes both the train and test matrices and their targets.
train.cross_validate_temporal(
    mtxTrn,
    mtxTest,
    mtxTrnTarget.ravel(),
    mtxTestTarget.ravel(),
    clf=clf,
    clf_name=clf_name,
    pred_fg=False,
)

# Benchmark CV using the 'global_mean' benchmark instead of an estimator.
train.cross_validate_using_benchmark(
    'global_mean',
    dfTrn,
    mtxTrn,
    mtxTrnTarget,
    folds=20,
)

################################################################################################
#---Calculate the degree of variance between ground truth and the mean of the CV predictions.----#
#---Returns a list of all training records with their average variance---#
train.calc_cv_preds_var(dfTrn, cv_preds)

################################################################################################
#--Use estimator for manual predictions--#
# May require mtxTest.toarray()
dfTest, clf = train.predict(
    mtxTrn,
    mtxTrnTarget.ravel(),
    mtxTest,
    dfTest,
    clf,
    clf_name,
)
#--------------Machine Learning (woohoo, we finally got to the good stuff)------------------------#
#quant_features = ['user_average_stars','user_review_count','calc_total_checkins','bus_stars','bus_review_count']
quant_features = [
    'bus_stars',
    'user_average_stars',
    'bus_review_count',
    'user_review_count',
    'calc_total_checkins',
    'calc_cat_avg',
]

# Pick the train/test frames used for this run, then standardize the
# quantitative columns listed above.
dfTrn_ML = dfTrn_All_5_8
dfTest_ML = dfTest_All_1_5
mtxTrn, mtxTest = features.standardize(dfTrn_ML, dfTest_ML, quant_features)

#--Combine the standardized quant features and the vectorized categorical features--#
#mtxTrn = hstack([mtxTrn,vecTrn_BusOpen]) #vecTrn_BusOpen,vecTrn_Cats,vecTrn_Zip,
#mtxTest = hstack([mtxTest,vecTest_BusOpen]) #vecTest_Master_Cats,vecTest_Master_Zip,

#--Test without the vecZip and vecCats--#
#mtxTrn = hstack([mtxTrn,vecTrn_BusOpen])
#mtxTest = hstack([mtxTest,vecTest_Master_BusOpen])

#--select target--#
# DataFrame.ix and DataFrame.as_matrix() were removed in pandas 1.0. Selecting
# with a one-element column list and taking .values gives the same 2-D,
# single-column array on both old and current pandas releases.
mtxTarget = dfTrn_ML[['rev_stars']].values

#--Use classifier for cross validation--#
# May require mtxTrn.toarray()
train.cross_validate(mtxTrn, mtxTarget, clf, folds=10, SEED=42, test_size=.2)

#--Use classifier for predictions--#
# May require mtxTest.toarray()
dfTest_ML, clf = train.predict(mtxTrn, mtxTarget, mtxTest, dfTest_ML, clf, clf_name)

#--Save predictions to file--#
train.save_predictions(dfTest_ML, clf_name, '_All_1_5_KitchenSink', submission_no)
#---------End Machine Learning Section-------------#

#------------------------------Optional Steps----------------------------------#
#--Memory cleanup prior to running the memory intensive classifiers--#
dfTrn, dfTest, dfAll = utils.data_garbage_collection(dfTrn, dfTest, dfAll)

#--use a benchmark instead of a classifier--#
# dfTrn is indexed as a sequence of frames here: element 0 is inner-joined to
# element 1 on business_id to form the feature matrix, and the target is the
# rev_stars column of element 0. The expressions are kept inline so no extra
# module-level reference to the merged matrix is retained.
benchmark_preds = train.cross_validate_using_benchmark(
    '3.5',
    dfTrn,
    dfTrn[0].merge(dfTrn[1], how='inner', on='business_id').values,
    dfTrn[0][['rev_stars']].values,
    folds=3,
    SEED=42,
    test_size=.15,
)
'calc_total_checkins', 'calc_cat_avg' ] dfTrn_ML = dfTrn_All_5_8 dfTest_ML = dfTest_All_1_5 mtxTrn, mtxTest = features.standardize(dfTrn_ML, dfTest_ML, quant_features) #--Combine the standardized quant features and the vectorized categorical features--# #mtxTrn = hstack([mtxTrn,vecTrn_BusOpen]) #vecTrn_BusOpen,vecTrn_Cats,vecTrn_Zip, #mtxTest = hstack([mtxTest,vecTest_BusOpen]) #vecTest_Master_Cats,vecTest_Master_Zip, #--Test without the vecZip and vecCats--# #mtxTrn = hstack([mtxTrn,vecTrn_BusOpen]) #mtxTest = hstack([mtxTest,vecTest_Master_BusOpen]) #--select target--# mtxTarget = dfTrn_ML.ix[:, ['rev_stars']].as_matrix() #--Use classifier for cross validation--# train.cross_validate(mtxTrn, mtxTarget, clf, folds=10, SEED=42, test_size=.2) #may require mtxTrn.toarray() #--Use classifier for predictions--# dfTest_ML, clf = train.predict(mtxTrn, mtxTarget, mtxTest, dfTest_ML, clf, clf_name) #may require mtxTest.toarray() #--Save predictions to file--# train.save_predictions(dfTest_ML, clf_name, '_All_1_5_KitchenSink', submission_no) #---------End Machine Learning Section-------------# #------------------------------Optional Steps----------------------------------# #--Memory cleanup prior to running the memory intensive classifiers--# dfTrn, dfTest, dfAll = utils.data_garbage_collection(dfTrn, dfTest, dfAll)
#----------------------------------------#
#--------- Machine Learning--------------#
#----------------------------------------#

#--select target--#
# DataFrame.ix and DataFrame.as_matrix() were removed in pandas 1.0. Selecting
# with a one-element column list and taking .values gives the same 2-D,
# single-column array on both old and current pandas releases.
mtxTarget = frmTrn_All[['rev_votes_useful']].values

#--select classifier--#
## Common options: ensemble -- RandomForestClassifier, RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
##                 linear_model -- SGDRegressor, Lasso
#clf = linear_model.LassoCV(cv=3)
#clf = linear_model.ElasticNet()
#clf = ensemble.RandomForestRegressor(n_estimators=50)
clf = linear_model.SGDRegressor(alpha=0.001, n_iter=1000, shuffle=True)
clfname = 'SGD_001_1000'

#--Use classifier for cross validation--#
train.cross_validate(mtxTrn, mtxTarget, clf, folds=10, SEED=42, test_size=.15)

#--Use classifier for predictions--#
# In this script train.predict's single return value replaces the test frame.
frmTest_All = train.predict(mtxTrn, mtxTarget, mtxTest, frmTest_All, clf, clfname)
#frmTest_NoVotes = train.predict(mtxTrn,mtxTarget,mtxTest,frmTest_NoVotes,clf,clfname)
#frmTest_NoUser = train.predict(mtxTrn,mtxTarget,mtxTest,frmTest_NoUser,clf,clfname)

#--Save predictions to file--#
train.save_predictions(frmTest_All, clfname)
#train.save_predictions(frmTest_NoVotes,clfname)
#train.save_predictions(frmTest_NoUser,clfname)

#--Save model to joblib file--#
train.save_model(clf, clfname)

#--Load model from joblib file--#
def _override_remote_api_predictions(dfPredictions):
    """Set num_votes to 1 and num_comments to 0 for the remote_api segment.

    Rows whose ``source`` column equals ``'remote_api_created'`` are updated in
    place; all other rows are left untouched. The frame is also returned.
    """
    is_remote_api = dfPredictions['source'] == 'remote_api_created'
    # One masked assignment per column replaces the row-by-row chained
    # assignment (df.col[x] = value), which pandas may apply to a copy.
    dfPredictions.loc[is_remote_api, 'num_votes'] = 1
    dfPredictions.loc[is_remote_api, 'num_comments'] = 0
    return dfPredictions


def main():
    """Run the segment-model pipeline as directed by the program settings.

    Each stage is switched by a 'y'/'n' flag in the settings returned by
    ``utils.load_settings()``:

    * ``use_cached_data`` -- load cached ``dfTrn``/``dfTest``; otherwise load
      the raw files, clean them and add the hand-crafted features.
    * ``generate_cv_score`` / ``cv_method`` -- split the training data for
      cross-validation ('april', 'march' or 'list_split').
    * ``export_cv_predictions_all_models`` / ``..._new_models`` -- produce
      and/or export per-model CV predictions.
    * ``export_predictions_all_models`` / ``..._new_models`` /
      ``export_predictions_total`` -- produce and/or export per-model test
      predictions.
    * ``export_cached_models`` -- cache the segment models.
    * ``export_cv_predictions_total`` / ``export_predictions_total`` -- merge
      the segment predictions into one frame, apply the remote_api rule and
      export it.

    NOTE(review): this function reached review with its indentation
    collapsed; block nesting was reconstructed from the comments and the data
    flow -- confirm against version control.
    """
    log.info('********New program instance started********')

    #-------------Load Environment----------------------#
    #Get program settings and model settings from SETTINGS.json file in root directory
    settings, model_settings = utils.load_settings()

    #If not using cached data, then load raw data, clean/munge it, create hand-crafted features, slice it for CV
    # dfCV is loaded in both branches but is not read again in this function.
    if settings['use_cached_data'] == 'y':
        log.info('==========LOADING CACHED FEATURES===========')
        dfTrn = data_io.load_cached_object('dfTrn')
        dfTest = data_io.load_cached_object('dfTest')
        dfCV = data_io.load_flatfile_to_df('Data/CV.csv')
    else:
        #-------Data Loading/Cleaning/Munging------------#
        #Load the data
        log.info('===============LOADING DATA=================')
        dfTrn = data_io.load_flatfile_to_df(settings['file_data_train'])
        dfTest = data_io.load_flatfile_to_df(settings['file_data_test'])
        dfCV = data_io.load_flatfile_to_df('Data/CV.csv')

        #Clean/Munge the data
        log.info('=======CLEANING AND MUNGING DATA============')
        dfTrn = munge.clean(dfTrn)
        dfTest = munge.clean(dfTest)

        #-------Feature creation-------------------------#
        #Add all currently used hand crafted features to dataframes
        log.info('====CREATING HAND-CRAFTED DATA FEATURES=====')
        features.add(dfTrn)
        features.add(dfTest)

    #---------Data slicing/parsing--------------------------#
    #Split data for CV
    # NOTE(review): dfTrnCV/dfTestCV stay undefined when this flag is not 'y'
    # or cv_method matches none of the branches, yet the CV stages below use
    # them -- confirm the settings always keep these consistent.
    if settings['generate_cv_score'] == 'y':
        log.info('=====SPLITTING DATA FOR CROSS-VALIDATION====')
        if settings['cv_method'] == 'april':
            # Plain decimal month: a leading-zero literal (04) is a syntax
            # error on Python 3 and has the same value on Python 2.
            dfTrnCV, dfTestCV = munge.temporal_split(dfTrn, (2013, 4, 1))
        elif settings['cv_method'] == 'march':
            #take an additional week from February b/c of lack of remote_api source issues in March
            dfTrnCV, dfTestCV = munge.temporal_split(dfTrn, (2013, 2, 21))
        elif settings['cv_method'] == 'list_split':
            #load stored list of data points and use those for CV
            dfCVlist = pd.DataFrame({'id': data_io.load_cached_object('Cache/cv_issue_ids.pkl'),
                                     'dummy': 0})
            dfTrnCV, dfTestCV = munge.list_split(dfTrn, dfCVlist)

    #--------------Modeling-------------------------#
    #If cached models exist then load them for reuse into segment_models.  Then run through model_settings and for
    # each model where 'use_cached_model' is false then clear the cached model and recreate it fresh
    log.info('=========LOADING CACHED MODELS==============')
    segment_models = data_io.load_cached_object('segment_models')
    if segment_models is None:
        log.info('=========CACHED MODELS NOT LOADED===========')
        # model_settings is keyed by model name (see the lookups below), so the
        # flag has to be set on the per-model settings dict, not on the key.
        for model in model_settings:
            model_settings[model]['use_cached_model'] = 'n'
        segment_models = []

    #Initialize new model for models not set to use cache
    log.info('=======INITIALIZING UN-CACHED MODELS========')
    index = 0
    for model in model_settings:
        model_config = model_settings[model]
        if model_config['use_cached_model'] == 'n':
            new_model = ensembles.Model(model_name=model, target=model_config['target'],
                                        segment=model_config['segment'],
                                        estimator_class=model_config['estimator_class'],
                                        estimator_params=model_config['estimator_params'],
                                        features=model_config['features'],
                                        postprocess_scalar=model_config['postprocess_scalar'])
            #Flag the model as not cached, so that it does not get skipped when running the modeling process
            new_model.use_cached_model = 'n'
            #Project specific model attributes not part of base class
            new_model.KNN_neighborhood_threshold = model_config['KNN_neighborhood_threshold']
            new_model.sub_zip_neighborhood = model_config['sub_zip_neighborhood']
            # Overwrite the cached model in this slot when there is one; a
            # freshly created (empty) list has no slot yet, so grow it instead
            # of raising IndexError.
            if index < len(segment_models):
                segment_models[index] = new_model
            else:
                segment_models.append(new_model)
            log.info('Model %s intialized at index %i' % (model, index))
        # NOTE(review): index advances for every configured model so that a
        # cached model keeps its own slot -- confirm against the original
        # indentation.
        index += 1

    #Cross validate all segment models (optional)
    if settings['export_cv_predictions_all_models'] == 'y' or settings['export_cv_predictions_new_models'] == 'y':
        log.info('============CROSS VALIDATION================')
        for model in segment_models[:]:
            #If model has cached CV predictions then skip predicting and just export them (if selected in settings)
            if hasattr(model, 'dfCVPredictions'):
                log.info('Cached CV predictions found. Using cached CV predictions.')
                if settings['export_cv_predictions_all_models'] == 'y':
                    data_io.save_predictions(model.dfCVPredictions, model.target, model_name=model.model_name,
                                             directory=settings['dir_submissions'],
                                             estimator_class=model.estimator_class, note='CV_list')
            else:
                print_model_header(model)
                #Prepare segment model:  segment and create feature vectors for the CV data set
                dfTrn_Segment, dfTest_Segment = prepare_segment_model(dfTrnCV, dfTestCV, model)
                #Generate CV predictions
                train.cross_validate(model, settings, dfTrn_Segment, dfTest_Segment)
                #Cache the CV predictions as a dataframe stored in each segment model
                # Plain column selection replaces .ix[:, cols], which was
                # removed in pandas 1.0; the selected frame is the same.
                model.dfCVPredictions = dfTest_Segment[['id', model.target]]
                if settings['export_cv_predictions_new_models'] == 'y':
                    data_io.save_predictions(model.dfCVPredictions, model.target, model_name=model.model_name,
                                             directory=settings['dir_submissions'],
                                             estimator_class=model.estimator_class, note='CV_list')

    #Generate predictions on test set for all segment models (optional)
    if settings['export_predictions_all_models'] == 'y' or settings['export_predictions_new_models'] == 'y'\
            or settings['export_predictions_total'] == 'y':
        log.info('=======GENERATING TEST PREDICTIONS==========')
        for model in segment_models[:]:
            #If model has cached test predictions then skip predicting and just export them (if selected in settings)
            if hasattr(model, 'dfPredictions'):
                log.info('Cached test predictions found for model %s. Using cached predictions.'
                         % model.model_name)
                if settings['export_predictions_all_models'] == 'y':
                    data_io.save_predictions(model.dfPredictions, model.target, model_name=model.model_name,
                                             directory=settings['dir_submissions'],
                                             estimator_class=model.estimator_class, note='TESTset')
            else:
                print_model_header(model)
                #Prepare segment model:  segment and create feature vectors for the full TEST data set
                dfTrn_Segment, dfTest_Segment = prepare_segment_model(dfTrn, dfTest, model)
                #Generate TEST set predictions
                model.predict(dfTrn_Segment, dfTest_Segment)
                if settings['export_predictions_all_models'] == 'y' or settings['export_predictions_new_models'] == 'y':
                    data_io.save_predictions(model.dfPredictions, model.target, model_name=model.model_name,
                                             directory=settings['dir_submissions'],
                                             estimator_class=model.estimator_class, note='TESTset')
                log.info(utils.line_break())

    #Cache the trained models and predictions to file (optional)
    if settings['export_cached_models'] == 'y':
        log.info('==========EXPORTING CACHED MODELS===========')
        data_io.save_cached_object(segment_models, 'segment_models')

    #Merge each segment model's CV predictions into a master dataframe and export it (optional)----#
    if settings['export_cv_predictions_total'] == 'y':
        log.info('====MERGING CV PREDICTIONS FROM SEGMENTS====')
        dfTestPredictionsTotal = merge_segment_predictions(segment_models, dfTestCV, cv=True)
        #---Apply post process rules to master dataframe---#
        #Set all votes and comments for remote_api segment to 1 and 0
        # NOTE(review): 'source' is merged in from dfTest even though these are
        # CV predictions built from dfTestCV, and the column is not dropped
        # before export (the test branch below does drop it) -- confirm both
        # are intended.
        dfTestPredictionsTotal = dfTestPredictionsTotal.merge(dfTest[['source', 'id']], on='id', how='left')
        _override_remote_api_predictions(dfTestPredictionsTotal)
        #Export
        timestamp = datetime.now().strftime('%m-%d-%y_%H%M')
        filename = 'Submits/'+timestamp+'--bryan_CV_predictions.csv'
        dfTestPredictionsTotal.to_csv(filename)

    #Merge each segment model's TEST predictions into a master dataframe and export it (optional)----#
    if settings['export_predictions_total'] == 'y':
        log.info('===MERGING TEST PREDICTIONS FROM SEGMENTS===')
        dfTestPredictionsTotal = merge_segment_predictions(segment_models, dfTest)
        #---Apply post process rules to master dataframe---#
        #Set all votes and comments for remote_api segment to 1 and 0
        dfTestPredictionsTotal = dfTestPredictionsTotal.merge(dfTest[['source', 'id']], on='id', how='left')
        _override_remote_api_predictions(dfTestPredictionsTotal)
        del dfTestPredictionsTotal['source']
        #Export
        filename = 'bryan_test_predictions.csv'
        data_io.save_combined_predictions(dfTestPredictionsTotal, settings['dir_submissions'], filename)

    #End main
    log.info('********Program ran successfully.  Exiting********')
def cross_validate_pipeline(X, y):
    """Run ``train.cross_validate`` on the features ``X`` and targets ``y``.

    Whatever ``train.cross_validate`` returns is discarded; this function
    itself returns ``None``.
    """
    train.cross_validate(X, y)
splitter='best'), n_estimators=150, learning_rate=.5, loss='linear', random_state=None) #clf = gaussian_process.GaussianProcess(corr='cubic', theta0=1e-2, thetaL=1e-4, thetaU=1e-1, random_start=100) #clf = neighbors.KNeighborsRegressor(100, weights='uniform', algorithm = 'auto');clf_name='KNN_200' ################################################################################################ #---Different methods of cross validation---# #May require mtxTrn.toarray() cv_preds = train.cross_validate(hstack( [sparse.csr_matrix(dfTrn.urlid.values).transpose(), mtxTrn]), mtxTrnTarget.ravel(), folds=10, SEED=42, test_size=.1, clf=clf, clf_name=clf_name, pred_fg=True) train.cross_validate(mtxTrn, mtxTrnTarget.ravel(), folds=8, SEED=888, test_size=.1, clf=clf, clf_name=clf_name, pred_fg=False) train.cross_validate_temporal(mtxTrn, mtxTest, mtxTrnTarget.ravel(),