def main(RUNID='run001', START_DT_STR=None, MODELFILENAME='sm',
         PICKLE_DATA=False, DO_TESTS=False, PROFILE=False, verbose=False,
         debug=False, RELOAD=False, n_cpus=1, PICKLE_NAMES=None):
    '''
    Runs our series model or triggered series model job based on the
    runtime conditions and run parameters.

    Which model is built is decided by the module-level RUNTYPE
    ('trigger' or 'series'); it is not a parameter of this function.

    IN:
        RUNID - str - str name for the folder where output will be stored
            and the name of the json (without extension) containing run
            parameters for seriesmodel or triggeredseriesmodel
        START_DT_STR - str - timestamp as a string to append to the logfile
            name.  Set in the header global params of capstone
        MODELFILENAME - str - filename of model (for pickling)
        PICKLE_DATA - bool - if the raw data should be pickled after loading
            into a data frame (forwarded to reload_data)
        DO_TESTS - bool - if unittests should be run (True), or a job run
            (False)
        PROFILE - bool - if memory profiling should be performed (True)
        verbose - bool - when set to true, verbose output
        debug - bool - whether a full dataset should be used (False), or a
            smaller set of time points (True)
        RELOAD - bool - whether data should be loaded from pickle (False), or
            reloaded from raw data (True).  Set to true only for first run on
            a new instance, then set to False for future runs to save load
            time.
        n_cpus - int - number of cpus to use for multiprocessing jobs.
            In this function it is only forwarded to print_job_info.
        PICKLE_NAMES - list of str - file names of the X (features)
            dataframe, y (labels) data and used column headers, in that
            order.  When RELOAD is False this is where the data is loaded
            from.  None (the default) means
            ['Xdf.pkl', 'ydf.pkl', 'used_column_headers.pkl'].
            NOTE(review): reload_data is not given PICKLE_NAMES here, so
            where reloaded data gets saved is decided inside reload_data -
            confirm.
    OUT: None
    RAISES:
        ValueError - when a job run is requested (DO_TESTS is False) and
            RUNTYPE is neither 'trigger' nor 'series'
    '''
    # A None sentinel replaces the former mutable list default.
    if PICKLE_NAMES is None:
        PICKLE_NAMES = ['Xdf.pkl', 'ydf.pkl', 'used_column_headers.pkl']

    def _load_pickled_data():
        '''Unpickle X, y and the used column headers; log how long it took.'''
        start = time.time()
        ptf('\n>> Unpickling data ...\n', LOGFILE)
        X = my_unpickle(PICKLE_NAMES[0])
        y = my_unpickle(PICKLE_NAMES[1])
        used_column_headers = my_unpickle(PICKLE_NAMES[2])
        end = time.time()
        ptf('Data unpickled in %d seconds (%d total trials)'
            % ((end - start), len(X)), LOGFILE)
        return X, y, used_column_headers

    RUNID = command_line_process(RUNID)

    # prepare to run job
    LOGFILENAME = 'log_%s_%s.txt' % (RUNID, START_DT_STR)
    LOGFILE = create_logfile(RUNID, LOGFILENAME)

    # try/finally so the logfile is closed even when the job fails part way
    try:
        # get the run conditions for the runid from the json
        # NOTE excludes verbose and debug flags - those are fit parameters
        # and excludes runid since that is set up above
        with open(RUNID + '.json') as f:
            run_params = json.load(f, object_hook=ascii_encode_dict)

        # to see if more ram is used for more cpus
        n_jobs = run_params['detection_model_arguments']['n_jobs']

        ### Unittests ###
        if DO_TESTS:
            X, y, used_column_headers = _load_pickled_data()
            run_tsm_unittests(X, y, used_column_headers.values,
                              verbose=verbose, logfile=LOGFILE)
            return

        # Fail fast: without this an unknown RUNTYPE only shows up as an
        # unbound `sm` after all the data has already been loaded.
        if RUNTYPE not in ('trigger', 'series'):
            raise ValueError(
                "RUNTYPE must be 'trigger' or 'series', got %r" % (RUNTYPE,))

        bigstart = time.time()

        # start memory profiling
        if PROFILE:
            # The helper has to be *called*; unpacking the bare function
            # object raised a TypeError.
            # NOTE(review): assumes start_memory_profiling takes no
            # arguments and returns the two trackers - confirm.
            tr, tr_sm = start_memory_profiling()

        # output run conditions to screen and logfile
        if RUNTYPE == 'trigger':
            ptf('*** %s - TRIGGERED SERIES MODEL - ***' % RUNID)
        else:
            ptf('*** %s - SERIES MODEL - ***' % RUNID)
        print_job_info(run_params, n_jobs, n_cpus, RUNID, START_DT_STR,
                       LOGFILE=LOGFILE, debug=debug, profile=PROFILE,
                       verbose=verbose, start=True)

        if RELOAD:
            # df and df_raw are not used below; the 5-way unpack is kept so
            # the shape of reload_data's return value is still checked.
            X, y, used_column_headers, df, df_raw = reload_data(
                LOGFILE, PICKLE_DATA)
        else:
            X, y, used_column_headers = _load_pickled_data()

        # the model constructors receive the logfile and runid as run params
        run_params['logfile'] = LOGFILE
        run_params['runid'] = RUNID

        # create model
        if RUNTYPE == 'trigger':
            sm = TriggeredSeriesModel(used_column_headers.values,
                                      **run_params)
        else:
            sm = SeriesModel(**run_params)

        # Altogether now
        print('** DOING THE FIT **')
        sm.fit(X, y, verbose=verbose, debug=debug)

        bigend = time.time()
        ptf('====> %d seconds (%0.1f mins)'
            % ((bigend - bigstart), (bigend - bigstart) / 60.0), LOGFILE)

        print_job_info(run_params, n_jobs, n_cpus, RUNID, START_DT_STR,
                       LOGFILE=LOGFILE, debug=debug, profile=PROFILE,
                       verbose=verbose, start=False)
        print_run_details(X, sm, LOGFILE)
        save_model(sm, RUNID, MODELFILENAME, LOGFILE=LOGFILE)

        ## VIEW RESULTS
        if RUNTYPE == 'trigger':
            make_trigger_plots(sm, y, RUNID, debug=debug)
        else:
            make_series_plots(sm)

        if PROFILE:
            # LOGFILE=None is what the original passed.
            # NOTE(review): presumably this keeps the profile output on
            # screen only - confirm against print_memory_profiles.
            print_memory_profiles(sm, tr, tr_sm, LOGFILE=None)
    finally:
        LOGFILE.close()
# to see if more ram is used for more cpus n_jobs = run_params['detection_model_arguments']['n_jobs'] ### Unittests ### if DO_TESTS: start = time.time() ptf( '\n>> Unpickling data ...\n', LOGFILE) X = my_unpickle(PICKLE_NAMES[0]) y = my_unpickle(PICKLE_NAMES[1]) used_column_headers = my_unpickle(PICKLE_NAMES[2]) end = time.time() ptf( 'Data unpickled in %d seconds (%d total trials)' % ((end-start), len(X)), LOGFILE) tsm_unit = run_tsm_unittests(X, y, used_column_headers.values, verbose=verbose, logfile=LOGFILE) # sm_unit = run_unittests(X_test, y_test, verbose=False) else: # ouptput run conditions to screen and logfile bigstart = time.time() # start memory profiling if PROFILE: tr, tr_sm = start_memory_profiling if RUNTYPE == 'trigger': ptf('*** %s - TRIGGERED SERIES MODEL - ***' % RUNID) elif RUNTYPE == 'series': ptf('*** %s - SERIES MODEL - ***' % RUNID)
def main(RUNID='run001', START_DT_STR=None, MODELFILENAME='sm',
         PICKLE_DATA=False, DO_TESTS=False, PROFILE=False, verbose=False,
         debug=False, RELOAD=False, n_cpus=1, PICKLE_NAMES=None):
    '''
    Runs our series model or triggered series model job based on the
    runtime conditions and run parameters.

    Which model is built is decided by the module-level RUNTYPE
    ('trigger' or 'series'); it is not a parameter of this function.

    IN:
        RUNID - str - str name for the folder where output will be stored
            and the name of the json (without extension) containing run
            parameters for seriesmodel or triggeredseriesmodel
        START_DT_STR - str - timestamp as a string to append to the logfile
            name.  Set in the header global params of capstone
        MODELFILENAME - str - filename of model (for pickling)
        PICKLE_DATA - bool - if the raw data should be pickled after loading
            into a data frame (forwarded to reload_data)
        DO_TESTS - bool - if unittests should be run (True), or a job run
            (False)
        PROFILE - bool - if memory profiling should be performed (True)
        verbose - bool - when set to true, verbose output
        debug - bool - whether a full dataset should be used (False), or a
            smaller set of time points (True)
        RELOAD - bool - whether data should be loaded from pickle (False), or
            reloaded from raw data (True).  Set to true only for first run on
            a new instance, then set to False for future runs to save load
            time.
        n_cpus - int - number of cpus to use for multiprocessing jobs.
            In this function it is only forwarded to print_job_info.
        PICKLE_NAMES - list of str - file names of the X (features)
            dataframe, y (labels) data and used column headers, in that
            order.  When RELOAD is False this is where the data is loaded
            from.  None (the default) means
            ['Xdf.pkl', 'ydf.pkl', 'used_column_headers.pkl'].
            NOTE(review): reload_data is not given PICKLE_NAMES here, so
            where reloaded data gets saved is decided inside reload_data -
            confirm.
    OUT: None
    RAISES:
        ValueError - when a job run is requested (DO_TESTS is False) and
            RUNTYPE is neither 'trigger' nor 'series'
    '''
    # A None sentinel replaces the former mutable list default.
    if PICKLE_NAMES is None:
        PICKLE_NAMES = ['Xdf.pkl', 'ydf.pkl', 'used_column_headers.pkl']

    def _load_pickled_data():
        '''Unpickle X, y and the used column headers; log how long it took.'''
        start = time.time()
        ptf('\n>> Unpickling data ...\n', LOGFILE)
        X = my_unpickle(PICKLE_NAMES[0])
        y = my_unpickle(PICKLE_NAMES[1])
        used_column_headers = my_unpickle(PICKLE_NAMES[2])
        end = time.time()
        ptf('Data unpickled in %d seconds (%d total trials)'
            % ((end - start), len(X)), LOGFILE)
        return X, y, used_column_headers

    RUNID = command_line_process(RUNID)

    # prepare to run job
    LOGFILENAME = 'log_%s_%s.txt' % (RUNID, START_DT_STR)
    LOGFILE = create_logfile(RUNID, LOGFILENAME)

    # try/finally so the logfile is closed even when the job fails part way
    try:
        # get the run conditions for the runid from the json
        # NOTE excludes verbose and debug flags - those are fit parameters
        # and excludes runid since that is set up above
        with open(RUNID + '.json') as f:
            run_params = json.load(f, object_hook=ascii_encode_dict)

        # to see if more ram is used for more cpus
        n_jobs = run_params['detection_model_arguments']['n_jobs']

        ### Unittests ###
        if DO_TESTS:
            X, y, used_column_headers = _load_pickled_data()
            run_tsm_unittests(X, y, used_column_headers.values,
                              verbose=verbose, logfile=LOGFILE)
            return

        # Fail fast: without this an unknown RUNTYPE only shows up as an
        # unbound `sm` after all the data has already been loaded.
        if RUNTYPE not in ('trigger', 'series'):
            raise ValueError(
                "RUNTYPE must be 'trigger' or 'series', got %r" % (RUNTYPE,))

        bigstart = time.time()

        # start memory profiling
        if PROFILE:
            # The helper has to be *called*; unpacking the bare function
            # object raised a TypeError.
            # NOTE(review): assumes start_memory_profiling takes no
            # arguments and returns the two trackers - confirm.
            tr, tr_sm = start_memory_profiling()

        # output run conditions to screen and logfile
        if RUNTYPE == 'trigger':
            ptf('*** %s - TRIGGERED SERIES MODEL - ***' % RUNID)
        else:
            ptf('*** %s - SERIES MODEL - ***' % RUNID)
        print_job_info(run_params, n_jobs, n_cpus, RUNID, START_DT_STR,
                       LOGFILE=LOGFILE, debug=debug, profile=PROFILE,
                       verbose=verbose, start=True)

        if RELOAD:
            # df and df_raw are not used below; the 5-way unpack is kept so
            # the shape of reload_data's return value is still checked.
            X, y, used_column_headers, df, df_raw = reload_data(
                LOGFILE, PICKLE_DATA)
        else:
            X, y, used_column_headers = _load_pickled_data()

        # the model constructors receive the logfile and runid as run params
        run_params['logfile'] = LOGFILE
        run_params['runid'] = RUNID

        # create model
        if RUNTYPE == 'trigger':
            sm = TriggeredSeriesModel(used_column_headers.values,
                                      **run_params)
        else:
            sm = SeriesModel(**run_params)

        # Altogether now
        print('** DOING THE FIT **')
        sm.fit(X, y, verbose=verbose, debug=debug)

        bigend = time.time()
        ptf('====> %d seconds (%0.1f mins)'
            % ((bigend - bigstart), (bigend - bigstart) / 60.0), LOGFILE)

        print_job_info(run_params, n_jobs, n_cpus, RUNID, START_DT_STR,
                       LOGFILE=LOGFILE, debug=debug, profile=PROFILE,
                       verbose=verbose, start=False)
        print_run_details(X, sm, LOGFILE)
        save_model(sm, RUNID, MODELFILENAME, LOGFILE=LOGFILE)

        ## VIEW RESULTS
        if RUNTYPE == 'trigger':
            make_trigger_plots(sm, y, RUNID, debug=debug)
        else:
            make_series_plots(sm)

        if PROFILE:
            # LOGFILE=None is what the original passed.
            # NOTE(review): presumably this keeps the profile output on
            # screen only - confirm against print_memory_profiles.
            print_memory_profiles(sm, tr, tr_sm, LOGFILE=None)
    finally:
        LOGFILE.close()