def extract_audio(input_file):
    output_args = ['-y', '-vn', '-acodec', 'copy']
    output = '{}/input_audio.aac'.format(TMP_FOLDER)
    ff = ffmpy.FFmpeg(inputs={input_file: None}, outputs={output: output_args})
    try:
        ff.run()
    except Exception as e:
        line_break(3)
        print('Failed to extract audio.\n{}'.format(e))
    return output
def increase_audio(input_audio, amt):
    output_args = ['-y', '-af', 'volume=3, bass=g=5, treble=g=-10']
    audio_file = None
    for idx in xrange(0, amt):
        input_file = audio_file or input_audio
        output = '{}/tmp_audio_{}.wav'.format(TMP_FOLDER, idx)
        ff = ffmpy.FFmpeg(inputs={input_file: None}, outputs={output: output_args})
        try:
            ff.run()
            audio_file = output
        except Exception as e:
            line_break(3)
            print('Failed to increase audio.\n{}'.format(e))
    return audio_file
def deep_fry_video(input_file, video_dip):
    emojified_video = add_random_emojis(input_file)
    inputs = create_inputs(emojified_video)
    output_args = create_base_args() + create_filter_args()
    for idx in xrange(0, video_dip):
        output = '{}/deep_fried_{}.mp4'.format(TMP_FOLDER, idx)
        outputs = create_outputs(output, output_args)
        ff = ffmpy.FFmpeg(inputs=inputs, outputs=outputs)
        try:
            ff.run()
            inputs = create_inputs(output)
        except Exception as e:
            line_break(3)
            print('Failed to deep fry video.\n{}'.format(e))
    return output
def print_model_header(model):
    """Print header with model info"""
    features_list = map(str, model.features.keys())
    features_list.sort()
    log.info(utils.line_break())
    log.info('MODEL: %s SEGMENT: %s TARGET: %s' % (model.model_name, model.segment, model.target))
    log.info('FEATURES: %s' % features_list)
    log.info('ESTIMATOR CLASS: %s' % model.estimator)
    log.info('POST-PROCESS SCALAR: %s' % model.postprocess_scalar)
def add_random_emojis(input_file):
    """
    Overlays emojis at random angles, sizes, durations, and start frames over a
    given input file. The number of emojis is based on the input file length.
    """
    emoji_filters = create_emoji_filters(input_file)
    inputs = create_inputs(input_file, emoji_filters)
    output_args = ['-an'] + create_base_args()
    output_args += ['-filter_complex', ''.join(emoji_filters)]
    tmp_output = '{}/emojied_video.mp4'.format(TMP_FOLDER)
    outputs = create_outputs(tmp_output, output_args)
    ff = ffmpy.FFmpeg(inputs=inputs, outputs=outputs)
    try:
        ff.run()
        return tmp_output
    except Exception as e:
        line_break(3)
        print('Failed to add emojis.\n{}'.format(e))
def create_final_video(fried_video, boosted_audio, output_file):
    inputs = OrderedDict([
        (fried_video, None),
        (boosted_audio, None),
    ])
    outputs = OrderedDict([
        (output_file, ['-y', '-vcodec', 'libx264']),
    ])
    ff = ffmpy.FFmpeg(inputs=inputs, outputs=outputs)
    try:
        ff.run()
        line_break(3)
        print('Successfully deep fried video at {}!'.format(output_file))
        line_break(3)
        return output_file
    except Exception as e:
        line_break(3)
        print('Failed to create final video.\n{}'.format(e))
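# A minimal driver sketch showing how the helpers above are meant to chain together.
# This function is not part of the original file; the name deep_fry and the default
# argument values are hypothetical, assuming only the four functions defined above.
def deep_fry(input_file, output_file, audio_boost_passes=3, video_dip=3):
    raw_audio = extract_audio(input_file)                          # copy the audio track out untouched
    boosted_audio = increase_audio(raw_audio, audio_boost_passes)  # re-encode it repeatedly with the volume/bass filter
    fried_video = deep_fry_video(input_file, video_dip)            # emoji overlay plus repeated lossy video passes
    return create_final_video(fried_video, boosted_audio, output_file)  # mux boosted audio and fried video together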
def cross_validate_temporal(mtxTrn, mtxTest, mtxTrnTarget, mtxTestTarget, model):
    start_time = datetime.now()
    log.info('Temporal CV started at: %s' % (datetime.now().strftime('%m-%d-%y %H:%M')))
    utils.line_break()
    train_cv = mtxTrn
    test_cv = mtxTest
    y_target = mtxTrnTarget
    y_true = mtxTestTarget
    #If target variable has been transformed, transform y_true back to normal state for comparison to predictions
    y_true = [np.exp(x)-1 for x in y_true]
    #--------Hyperparameter optimization---------#
    #Make predictions
    try:
        model.estimator.fit(train_cv, y_target)
        preds = model.estimator.predict(test_cv)
    except TypeError:
        model.estimator.fit(train_cv.todense(), y_target)
        preds = model.estimator.predict(test_cv.todense())
    #----------Post processing rules----------#
    #If target variable has been transformed, transform predictions back to original state
    preds = [np.exp(x)-1 for x in preds]
    #Apply scalar
    if model.postprocess_scalar != 1:
        preds = [x*model.postprocess_scalar for x in preds]
    #set <0 predictions to 0 if views or comments, set <1 predictions to 1 if votes
    if model.target == 'num_votes':
        preds = [1 if x < 1 else x for x in preds]
    else:
        preds = [0 if x < 0 else x for x in preds]
    ##score the prediction by measuring the error using the chosen error metric
    score = ml_metrics.rmsle(y_true, preds)
    finish_time = datetime.now()
    log.info('Error Measure: %f' % score)
    log.info('Prediction metrics: mean=%f, std dev=%f, min/max= %f/%f'
             % (np.mean(preds), np.std(preds), np.min(preds), np.max(preds)))
    utils.line_break()
    log.info('Temporal CV completed at: %s. Total runtime: %s'
             % (datetime.now().strftime('%m-%d-%y %H:%M'), str(finish_time-start_time)))
    utils.line_break()
    return preds
def cross_validate_using_benchmark(benchmark_name, dfTrn, mtxTrn, mtxTarget, model, folds=5, SEED=42, test_size=.15):
    fold_scores = []
    SEED = SEED * time.localtime().tm_sec
    start_time = datetime.now()
    log.info('Benchmark CV started at: %s' % (datetime.now().strftime('%m-%d-%y %H:%M')))
    utils.line_break()
    for i in range(folds):
        #For each fold, create a test set (test_holdout) by randomly holding out X% of the data as CV set, where X is test_size (default .15)
        train_cv, test_cv, y_target, y_true = cross_validation.train_test_split(mtxTrn, mtxTarget, test_size=test_size, random_state=SEED*i+10)
        #If target variable has been transformed, transform y_true back to normal state for comparison to predictions
        y_true = [np.exp(x)-1 for x in y_true]
        #Calc benchmarks and use them to make a prediction
        benchmark_preds = 0
        if benchmark_name == 'global_mean':
            benchmark_preds = [13.899 for x in test_cv]
        if benchmark_name == 'all_ones':
            #find user avg stars mean
            benchmark_preds = [1 for x in test_cv]
        if benchmark_name == '9999':
            #find user avg stars mean
            benchmark_preds = [9999 for x in test_cv]
        log.info('Using benchmark %s:' % (benchmark_name))
        #For this CV fold, measure the error
        score = ml_metrics.rmsle(y_true, benchmark_preds)
        #print score
        fold_scores += [score]
        log.info('RMSLE (fold %d/%d): %f' % (i + 1, folds, score))
    ##Now that folds are complete, calculate and print the results
    finish_time = datetime.now()
    log.info('Prediction metrics: mean=%f, std dev=%f, min/max= %f/%f'
             % (np.mean(fold_scores), np.std(fold_scores), np.min(fold_scores), np.max(fold_scores)))
    utils.line_break()
    log.info('CV completed at: %s. Total runtime: %s' % (datetime.now().strftime('%m-%d-%y %H:%M'), str(finish_time-start_time)))
    utils.line_break()
def cross_validate_kfold(mtxTrn, mtxTarget, model, folds=5, SEED=42, test_size=.15, pred_fg='false'):
    fold_scores = []
    SEED = SEED * time.localtime().tm_sec
    start_time = datetime.now()
    log.info('K-Fold CV started at: %s' % (datetime.now().strftime('%m-%d-%y %H:%M')))
    utils.line_break()
    #If predictions are wanted, initialize the dict so that its length will match all records in the training set,
    #even if not all records are predicted during the CV (randomness is a bitch)
    if pred_fg == 'true':
        cv_preds = {key[0]: [] for key in mtxTrn.getcol(0).toarray()}
    for i in range(folds):
        ##For each fold, create a test set (test_cv) by randomly holding out test_size% of the data as CV set
        train_cv, test_cv, y_target, y_true = \
            cross_validation.train_test_split(mtxTrn, mtxTarget, test_size=test_size, random_state=i*SEED+1)
        #If target variable has been transformed, transform y_true back to normal state for comparison to predictions
        y_true = [np.exp(x)-1 for x in y_true]
        #if predictions are wanted, slice off the first column from the train and test cv sets; it contains the ID.
        #test_cv2 keeps the ID column so predictions can be mapped back to their records below.
        if pred_fg == 'true':
            #TODO: create dense matrix copies for the clf's that only use dense matrices
            train_cv = sparse.csr_matrix(train_cv)[:, 1:]
            test_cv2 = sparse.csr_matrix(test_cv)
            test_cv = sparse.csr_matrix(test_cv)[:, 1:]
        #----------Hyperparameter optimization------#
        try:
            model.estimator.fit(train_cv, y_target)
            preds = model.estimator.predict(test_cv)
        except TypeError:
            model.estimator.fit(train_cv.todense(), y_target)
            preds = model.estimator.predict(test_cv.todense())
        #----------Post processing rules----------#
        #If target variable has been transformed, transform predictions back to original state
        preds = [np.exp(x)-1 for x in preds]
        #Apply scalar
        if model.postprocess_scalar != 1:
            preds = [x*model.postprocess_scalar for x in preds]
        #set <0 predictions to 0 if views or comments, set <1 predictions to 1 if votes
        if model.target == 'num_votes':
            preds = [1 if x < 1 else x for x in preds]
        else:
            preds = [0 if x < 0 else x for x in preds]
        ##For each fold, score the prediction by measuring the error using the chosen error metric
        score = ml_metrics.rmsle(y_true, preds)
        fold_scores += [score]
        log.info('RMSLE (fold %d/%d): %f' % (i + 1, folds, score))
        ##If we want to record predictions, then for each fold add the predictions to the cv_preds dict for later output
        if pred_fg == 'true':
            for j in range(0, test_cv2.shape[0]):
                if test_cv2.getcol(0).toarray()[j][0] in cv_preds.keys():
                    cv_preds[test_cv2.getcol(0).toarray()[j][0]] += [preds[j]]
                else:
                    cv_preds[test_cv2.getcol(0).toarray()[j][0]] = [preds[j]]
    ##Now that folds are complete, calculate and print the results
    finish_time = datetime.now()
    log.info('Prediction metrics: mean=%f, std dev=%f, min/max= %f/%f'
             % (np.mean(fold_scores), np.std(fold_scores), np.min(fold_scores), np.max(fold_scores)))
    utils.line_break()
    log.info('K-Fold CV completed at: %s. Total runtime: %s' % (datetime.now().strftime('%m-%d-%y %H:%M'), str(finish_time-start_time)))
    utils.line_break()
    if pred_fg == 'true':
        return cv_preds
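# All three CV routines above score with ml_metrics.rmsle. For reference, a minimal
# NumPy sketch of the same metric under the standard RMSLE definition (this is not
# the ml_metrics implementation, just an illustration of what is being measured):
import numpy as np

def rmsle_sketch(y_true, y_pred):
    """sqrt(mean((log(pred + 1) - log(actual + 1)) ** 2))"""
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.sqrt(np.mean((np.log1p(y_pred) - np.log1p(y_true)) ** 2))

# Example: rmsle_sketch([1, 13, 0], [2, 10, 0]) returns a small positive float.
# This also explains the np.exp(x) - 1 steps in the CV code: the models are trained
# on log1p-transformed targets, so targets and predictions are mapped back to the
# original scale before the RMSLE comparison.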
def main():
    #---Load environment settings from SETTINGS.json in root directory and build filepaths for all base submissions---#
    settings = utils.load_settings('SETTINGS.json')
    base_filepaths = (settings['file_bryan_submission'],
                      settings['file_miroslaw_submission'])
    segment_weights = settings['ensemble_segment_weights']
    segments = segment_weights.keys()
    targets = segment_weights[segments[0]].keys()
    #---Output the segment weights to be used for ensemble averaging of base submissions---#
    log.info('==========ENSEMBLE WEIGHTS (B,M)============')
    for segment in segment_weights:
        log.info(segment.upper()+':')
        for target in segment_weights[segment]:
            log.info('  '+target.upper()+' -- ['+segment_weights[segment][target]['0']+','+
                     segment_weights[segment][target]['1']+']')
    #---Load each base submission to a list of dataframes---#
    base_subs = []
    for file in base_filepaths:
        try:
            base_subs.append(pd.read_csv(file).set_index(['id'], drop=False).sort())
            log.info('Base submission successfully loaded: %s.' % file)
        except IOError:
            log.info('Base submission file does not exist: %s. Run base model to generate, or update filepath.' % file)
            sys.exit('---Exiting---')
    utils.line_break()
    #---Load id's labeled with segments to a dataframe used for segment based averaging---#
    file = settings['file_segment_ids']
    try:
        segment_ids = pd.read_csv(file)
        log.info('Segment IDs successfully loaded from: %s.' % file)
    except IOError:
        log.info('Segment IDs file does not exist: %s. Update filepath in SETTINGS.json.' % file)
    utils.line_break()
    #---Transform base predictions to log space prior to averaging, if selected in settings---#
    if settings['avg_log_space'] == 'y':
        log.info('Transforming base predictions to log space prior to averaging.')
        for i in range(len(base_subs)):
            for target in targets:
                base_subs[i][target] = np.log(base_subs[i][target]+1)
    utils.line_break()
    #---Apply segment based weights to each base submission then combine them to create ensemble submission---#
    log.info('Applying segment weights to base submissions then combining to create ensemble.')
    for i in range(len(base_subs)):
        #Merge the segment labels from the segment id's file with the base submission dataframe
        base_subs[i] = base_subs[i].merge(segment_ids, on='id', how='inner')
        for segment in segments:
            for target in targets:
                base_subs[i][target][base_subs[i]['Segment'] == segment] \
                    *= float(segment_weights[segment][target][str(i)])
        del base_subs[i]['Segment']
    ensemble_sub = base_subs[0].ix[:]
    for i in range(len(base_subs)-1):
        for target in targets:
            ensemble_sub[target] += base_subs[i+1][target]
    utils.line_break()
    #---Transform ensemble predictions back to normal, if use log space averaging was selected in settings---#
    if settings['avg_log_space'] == 'y':
        log.info('Transforming ensemble predictions back to normal from log space.')
        for target in targets:
            ensemble_sub[target] = np.exp(ensemble_sub[target])-1
    utils.line_break()
    #---Apply any final target scalars to ensemble predictions---#
    for target in targets:
        ensemble_sub[target] *= float(settings['target_scalars'][target])
    #---Output ensemble submission to directory set in SETTINGS.json, appending creation date and time---#
    timestamp = datetime.now().strftime('%m-%d-%y_%H%M')
    filename = settings['dir_ensemble_submissions']+'ensemble_predictions_'+timestamp+'.csv'
    ensemble_sub.to_csv(filename, index=False)
    log.info('Ensemble submission saved: %s' % filename)
    utils.line_break()
    #End main
    log.info('Program executed successfully without error! Exiting.')
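# For reference, a sketch of the SETTINGS.json shape that the ensemble main() above
# expects. Key names come from the lookups in the code; every value below is an
# illustrative placeholder, not the project's actual configuration.
EXAMPLE_SETTINGS = {
    'file_bryan_submission': 'Submits/bryan_test_predictions.csv',
    'file_miroslaw_submission': 'Submits/miroslaw_test_predictions.csv',
    'file_segment_ids': 'Data/segment_ids.csv',
    'dir_ensemble_submissions': 'Submits/',
    'avg_log_space': 'y',
    'target_scalars': {'num_views': '1.0', 'num_votes': '1.0', 'num_comments': '1.0'},
    'ensemble_segment_weights': {
        # one entry per segment; per target, weight '0' applies to the first base
        # submission (Bryan) and weight '1' to the second (Miroslaw)
        'remote_api_created': {
            'num_views': {'0': '0.5', '1': '0.5'},
            'num_votes': {'0': '0.5', '1': '0.5'},
            'num_comments': {'0': '0.5', '1': '0.5'},
        },
    },
}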
def setup_repo():
    """Set up repository / checkout"""
    timestamp = run("date '+%Y%m%d%H%M%S'")
    timestamp_with_dots = timestamp[0:len(timestamp) - 2] + '.' + timestamp[len(timestamp)-2:len(timestamp)]
    sha = local('git ls-remote %(repo)s %(branch)s' % {'repo': env.repository, 'branch': env.branch}, capture=True)
    sha = sha.split("\t")[0]
    cache_dir = '%(app_dir)s/shared/cached-copy' % {'app_dir': env.applicationdir}
    deploy_dir = '%(app_dir)s/releases/%(timestamp)s' % {'app_dir': env.applicationdir, 'timestamp': timestamp}
    geo_lite_file = '%(app_dir)s/shared/config/GeoLiteCity.dat' % {'app_dir': env.applicationdir}
    if(files.exists(cache_dir)):
        with cd(cache_dir):
            run('git fetch -q origin')
            run('git reset -q --hard %(sha)s' % {'sha': sha})
            run('git clean -q -d -x -f')
    else:
        run('git clone -q %(repo)s %(cache_dir)s' % {'repo': env.repository, 'cache_dir': cache_dir})
        with cd('%(cache_dir)s' % {'cache_dir': cache_dir}):
            run('git checkout -q -b %(user)s %(sha)s;' % {'sha': sha, 'user': env.user})
    with cd(cache_dir):
        run('cp -RPp %(cache_dir)s %(deploy_dir)s' % {'cache_dir': cache_dir, 'deploy_dir': deploy_dir})
        run('echo %(sha)s > %(deploy_dir)s/REVISION' % {'sha': sha, 'deploy_dir': deploy_dir})
    with cd(deploy_dir):
        run('bundle install --gemfile %(deploy_dir)s/Gemfile --path %(app_dir)s/shared/bundle --deployment --quiet --without development test cucumber' % {'deploy_dir': deploy_dir, 'app_dir': env.applicationdir})
        run('./script/gem_downgrade_time')
    run('chmod -R g+w %(deploy_dir)s' % {'deploy_dir': deploy_dir})
    run('rm -rf %(deploy_dir)s/log %(deploy_dir)s/public/system %(deploy_dir)s/tmp/pids' % {'deploy_dir': deploy_dir})
    run('mkdir -p %(deploy_dir)s/public' % {'deploy_dir': deploy_dir})
    run('mkdir -p %(deploy_dir)s/tmp' % {'deploy_dir': deploy_dir})
    run('ln -s %(app_dir)s/shared/log %(deploy_dir)s/log' % {'app_dir': env.applicationdir, 'deploy_dir': deploy_dir})
    run('ln -s %(app_dir)s/shared/system %(deploy_dir)s/public/system' % {'app_dir': env.applicationdir, 'deploy_dir': deploy_dir})
    run('ln -s %(app_dir)s/shared/pids %(deploy_dir)s/tmp/pids' % {'app_dir': env.applicationdir, 'deploy_dir': deploy_dir})
    run("find %(deploy_dir)s/public/images %(deploy_dir)s/public/stylesheets %(deploy_dir)s/public/javascripts -exec touch -t %(timestamp_with_dots)s {} ';'; true" % {'deploy_dir': deploy_dir, 'timestamp': timestamp, 'timestamp_with_dots': timestamp_with_dots})
    if(env.env == 'production'):
        with cd(deploy_dir):
            run('bundle exec whenever --clear-crontab %(app_name)s' % {'app_name': env.application})
    if(not files.exists(geo_lite_file)):
        utils.line_break()
        print("ERROR: GeoLiteCity file doesn't exist: %(file)s" % {'file': geo_lite_file})
        utils.line_break()
        return False
    fs = [
        {'file': 'shards-replication.yml', 'final_file': 'shards.yml'},
        {'file': 'database.yml'},
        {'file': 'core.yml'},
        {'file': 'authorize_net.yml'},
        {'file': 'braintree.yml'},
        {'file': 'google_maps.yml'},
        {'file': 'server.yml'},
        {'file': 's3.yml'},
        {'file': 'GeoLiteCity.dat'},
        {'file': 'unicorn.rb'}
    ]
    for f in fs:
        try:
            final = f['final_file']
        except KeyError:
            final = f['file']
        run('ln -nfs %(app_dir)s/shared/config/%(f)s %(deploy_dir)s/config/%(final)s' % {'f': f['file'], 'final': final, 'app_dir': env.applicationdir, 'deploy_dir': deploy_dir})
    run('ln -nfs %(app_dir)s/shared/cache %(deploy_dir)s/public/cache' % {'app_dir': env.applicationdir, 'deploy_dir': deploy_dir})
    run('ls -x %(app_dir)s/releases' % {'app_dir': env.applicationdir})
    with cd(deploy_dir):
        run('bundle exec rake RAILS_ENV=%(state)s db:migrate compass:compile db:seed 1> /dev/null' % {'state': env.env})
        run('ln -sf %(deploy_dir)s %(app_dir)s/current' % {'deploy_dir': deploy_dir, 'app_dir': env.applicationdir})
        run('bundle exec jammit')
        run('cp public/robots_disallow.txt public/robots.txt')
    run('rm -f %(app_dir)s/current' % {'app_dir': env.applicationdir})
    run('ln -s %(deploy_dir)s %(app_dir)s/current' % {'deploy_dir': deploy_dir, 'app_dir': env.applicationdir})
    with cd('%(deploy_dir)s/..' % {'deploy_dir': deploy_dir}):
        #Keep only the most recent releases; remove the older ones
        to_delete = 5
        dirs = run("ls -ltr | awk '{print $8}'").split('\n')
        total = len(dirs)
        if(total > to_delete):
            del_dirs = dirs[0:total - to_delete]
            for d in del_dirs:
                d = d[0:len(d)-1]
                run('rm -Rf %(dd)s' % {'dd': d})
    with cd('%(app_dir)s/current' % {'app_dir': env.applicationdir}):
        run('bundle exec whenever --update-crontab %(app_name)s --set environment=%(state)s' % {'state': env.env, 'app_name': env.application})
    if(env.env == 'staging'):
        utils.line_break()
        print(red("Killing unicorns, the bastards..."))
        utils.line_break()
        with settings(warn_only=True):
            run('pkill -KILL -f unicorn')
            run('pkill -KILL -f delayed')
        if(files.exists('%(app_dir)s/current/config/unicorn/%(state)s.rb' % {'state': env.env, 'app_dir': env.applicationdir})):
            with cd('%(app_dir)s/current' % {'app_dir': env.applicationdir}):
                run('BUNDLE_GEMFILE=%(app_dir)s/current/Gemfile bundle exec unicorn_rails -c %(app_dir)s/current/config/unicorn/%(state)s.rb -E %(state)s -D' % {'app_dir': env.applicationdir, 'state': env.env})
    elif(env.env == 'production'):
        if(files.exists('%(app_dir)s/current/tmp/pids/unicorn.pid' % {'app_dir': env.applicationdir})):
            print("PRODUCTION UNICORN RELOAD VOILA")
        with cd('%(app_dir)s' % {'app_dir': env.applicationdir}):
            with cd('%(app_dir)s/current' % {'app_dir': env.applicationdir}):
                run('bundle exec rake page_cache:refresher:disable_all cache:clear_rescue cache:clear_storehouse dj:enable dj:start RAILS_ENV=%(state)s' % {'state': env.env})
            run('rm -fr shared/cache/*')
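# The deploy task above relies on a handful of Fabric env attributes being set
# elsewhere (e.g. in per-environment tasks). A sketch with placeholder values only;
# the attribute names are taken from the code, the values are hypothetical.
from fabric.api import env

env.repository = 'git@example.com:org/app.git'  # git remote cloned into shared/cached-copy
env.branch = 'master'                           # branch whose HEAD sha gets deployed
env.user = 'deploy'                             # ssh user; also used as the checkout branch name
env.application = 'app'                         # app name passed to whenever's crontab flags
env.applicationdir = '/var/www/app'             # root directory containing releases/ and shared/
env.env = 'staging'                             # 'staging' or 'production'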
def main():
    log.info('********New program instance started********')
    #-------------Load Environment----------------------#
    #Get program settings and model settings from SETTINGS.json file in root directory
    settings, model_settings = utils.load_settings()
    #If not using cached data, then load raw data, clean/munge it, and create hand-crafted features
    if settings['use_cached_data'] == 'y':
        log.info('==========LOADING CACHED FEATURES===========')
        dfTrn = data_io.load_cached_object('dfTrn')
        dfTest = data_io.load_cached_object('dfTest')
        dfCV = data_io.load_flatfile_to_df('Data/CV.csv')
    else:
        #-------Data Loading/Cleaning/Munging------------#
        #Load the data
        log.info('===============LOADING DATA=================')
        dfTrn = data_io.load_flatfile_to_df(settings['file_data_train'])
        dfTest = data_io.load_flatfile_to_df(settings['file_data_test'])
        dfCV = data_io.load_flatfile_to_df('Data/CV.csv')
        #Clean/Munge the data
        log.info('=======CLEANING AND MUNGING DATA============')
        dfTrn = munge.clean(dfTrn)
        dfTest = munge.clean(dfTest)
        #-------Feature creation-------------------------#
        #Add all currently used hand crafted features to dataframes
        log.info('====CREATING HAND-CRAFTED DATA FEATURES=====')
        features.add(dfTrn)
        features.add(dfTest)
    #---------Data slicing/parsing--------------------------#
    #Split data for CV
    if settings['generate_cv_score'] == 'y':
        log.info('=====SPLITTING DATA FOR CROSS-VALIDATION====')
        if settings['cv_method'] == 'april':
            dfTrnCV, dfTestCV = munge.temporal_split(dfTrn, (2013, 4, 1))
        elif settings['cv_method'] == 'march':
            #take an additional week from February b/c of lack of remote_api source issues in March
            dfTrnCV, dfTestCV = munge.temporal_split(dfTrn, (2013, 2, 21))
        elif settings['cv_method'] == 'list_split':
            #load stored list of data points and use those for CV
            dfCVlist = pd.DataFrame({'id': data_io.load_cached_object('Cache/cv_issue_ids.pkl'), 'dummy': 0})
            dfTrnCV, dfTestCV = munge.list_split(dfTrn, dfCVlist)
    #--------------Modeling-------------------------#
    #If cached models exist then load them for reuse into segment_models. Then run through model_settings and for
    #each model where 'use_cached_model' is false then clear the cached model and recreate it fresh
    log.info('=========LOADING CACHED MODELS==============')
    segment_models = data_io.load_cached_object('segment_models')
    if segment_models is None:
        log.info('=========CACHED MODELS NOT LOADED===========')
        for model in model_settings:
            model_settings[model]['use_cached_model'] = 'n'
        segment_models = []
    #Initialize new model for models not set to use cache
    log.info('=======INITIALIZING UN-CACHED MODELS========')
    index = 0
    for model in model_settings:
        if model_settings[model]['use_cached_model'] == 'n':
            new_model = ensembles.Model(model_name=model, target=model_settings[model]['target'],
                                        segment=model_settings[model]['segment'],
                                        estimator_class=model_settings[model]['estimator_class'],
                                        estimator_params=model_settings[model]['estimator_params'],
                                        features=model_settings[model]['features'],
                                        postprocess_scalar=model_settings[model]['postprocess_scalar'])
            #Flag the model as not cached, so that it does not get skipped when running the modeling process
            new_model.use_cached_model = 'n'
            #Project specific model attributes not part of base class
            new_model.KNN_neighborhood_threshold = model_settings[model]['KNN_neighborhood_threshold']
            new_model.sub_zip_neighborhood = model_settings[model]['sub_zip_neighborhood']
            #Replace the cached model at this position, or append when starting from an empty list
            if index < len(segment_models):
                segment_models[index] = new_model
            else:
                segment_models.append(new_model)
            log.info('Model %s initialized at index %i' % (model, index))
        index += 1
    #Cross validate all segment models (optional)
    if settings['export_cv_predictions_all_models'] == 'y' or settings['export_cv_predictions_new_models'] == 'y':
        log.info('============CROSS VALIDATION================')
        for model in segment_models[:]:
            #If model has cached CV predictions then skip predicting and just export them (if selected in settings)
            if hasattr(model, 'dfCVPredictions'):
                log.info('Cached CV predictions found. Using cached CV predictions.')
                if settings['export_cv_predictions_all_models'] == 'y':
                    data_io.save_predictions(model.dfCVPredictions, model.target, model_name=model.model_name,
                                             directory=settings['dir_submissions'],
                                             estimator_class=model.estimator_class, note='CV_list')
            else:
                print_model_header(model)
                #Prepare segment model: segment and create feature vectors for the CV data set
                dfTrn_Segment, dfTest_Segment = prepare_segment_model(dfTrnCV, dfTestCV, model)
                #Generate CV predictions
                train.cross_validate(model, settings, dfTrn_Segment, dfTest_Segment)
                #Cache the CV predictions as a dataframe stored in each segment model
                model.dfCVPredictions = dfTest_Segment.ix[:, ['id', model.target]]
                if settings['export_cv_predictions_new_models'] == 'y':
                    data_io.save_predictions(model.dfCVPredictions, model.target, model_name=model.model_name,
                                             directory=settings['dir_submissions'],
                                             estimator_class=model.estimator_class, note='CV_list')
    #Generate predictions on test set for all segment models (optional)
    if settings['export_predictions_all_models'] == 'y' or settings['export_predictions_new_models'] == 'y'\
            or settings['export_predictions_total'] == 'y':
        log.info('=======GENERATING TEST PREDICTIONS==========')
        for model in segment_models[:]:
            #If model has cached test predictions then skip predicting and just export them (if selected in settings)
            if hasattr(model, 'dfPredictions'):
                log.info('Cached test predictions found for model %s. Using cached predictions.' % model.model_name)
                if settings['export_predictions_all_models'] == 'y':
                    data_io.save_predictions(model.dfPredictions, model.target, model_name=model.model_name,
                                             directory=settings['dir_submissions'],
                                             estimator_class=model.estimator_class, note='TESTset')
            else:
                print_model_header(model)
                #Prepare segment model: segment and create feature vectors for the full TEST data set
                dfTrn_Segment, dfTest_Segment = prepare_segment_model(dfTrn, dfTest, model)
                #Generate TEST set predictions
                model.predict(dfTrn_Segment, dfTest_Segment)
                if settings['export_predictions_all_models'] == 'y' or settings['export_predictions_new_models'] == 'y':
                    data_io.save_predictions(model.dfPredictions, model.target, model_name=model.model_name,
                                             directory=settings['dir_submissions'],
                                             estimator_class=model.estimator_class, note='TESTset')
            log.info(utils.line_break())
    #Cache the trained models and predictions to file (optional)
    if settings['export_cached_models'] == 'y':
        log.info('==========EXPORTING CACHED MODELS===========')
        data_io.save_cached_object(segment_models, 'segment_models')
    #----Merge each segment model's CV predictions into a master dataframe and export it (optional)----#
    if settings['export_cv_predictions_total'] == 'y':
        log.info('====MERGING CV PREDICTIONS FROM SEGMENTS====')
        dfTestPredictionsTotal = merge_segment_predictions(segment_models, dfTestCV, cv=True)
        #---Apply post process rules to master dataframe---#
        #Set all votes and comments for remote_api segment to 1 and 0
        dfTestPredictionsTotal = dfTestPredictionsTotal.merge(dfTest.ix[:][['source', 'id']], on='id', how='left')
        for x in dfTestPredictionsTotal.index:
            if dfTestPredictionsTotal.source[x] == 'remote_api_created':
                dfTestPredictionsTotal.num_votes[x] = 1
                dfTestPredictionsTotal.num_comments[x] = 0
        #Export
        timestamp = datetime.now().strftime('%m-%d-%y_%H%M')
        filename = 'Submits/'+timestamp+'--bryan_CV_predictions.csv'
        dfTestPredictionsTotal.to_csv(filename)
    #----Merge each segment model's TEST predictions into a master dataframe and export it (optional)----#
    if settings['export_predictions_total'] == 'y':
        log.info('===MERGING TEST PREDICTIONS FROM SEGMENTS===')
        dfTestPredictionsTotal = merge_segment_predictions(segment_models, dfTest)
        #---Apply post process rules to master dataframe---#
        #Set all votes and comments for remote_api segment to 1 and 0
        dfTestPredictionsTotal = dfTestPredictionsTotal.merge(dfTest.ix[:][['source', 'id']], on='id', how='left')
        for x in dfTestPredictionsTotal.index:
            if dfTestPredictionsTotal.source[x] == 'remote_api_created':
                dfTestPredictionsTotal.num_votes[x] = 1
                dfTestPredictionsTotal.num_comments[x] = 0
        del dfTestPredictionsTotal['source']
        #Export
        filename = 'bryan_test_predictions.csv'
        data_io.save_combined_predictions(dfTestPredictionsTotal, settings['dir_submissions'], filename)
    #End main
    log.info('********Program ran successfully. Exiting********')
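# Sketch of a single model_settings entry as consumed by the initialization loop in
# main() above. Key names come from the dictionary lookups in the code; the model
# name and all values below are illustrative placeholders only.
EXAMPLE_MODEL_SETTINGS = {
    'votes_remote_api': {
        'target': 'num_votes',
        'segment': 'remote_api_created',
        'estimator_class': 'GradientBoostingRegressor',
        'estimator_params': {'n_estimators': 100},
        'features': {'description_length': 'numeric'},
        'postprocess_scalar': 1,
        'use_cached_model': 'n',
        'KNN_neighborhood_threshold': 50,
        'sub_zip_neighborhood': 'n',
    },
}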