def _get_input(): """ Get tf.data.Dataset object according to command-line flags for testing using tf.estimator.Estimator Returns: dataset : elements structured as [features, labels] feature structure can be seen in postbatch_fn in mjsynth.py """ # WARNING: More than two filters causes SEVERE throughput slowdown filter_fn = filters.input_filter_fn \ ( min_image_width=FLAGS.min_image_width, max_image_width=FLAGS.max_image_width, min_string_length=FLAGS.min_string_length, max_string_length=FLAGS.max_string_length ) # Get data according to flags dataset = pipeline.get_data(use_static_data=True, base_dir=FLAGS.test_path, file_patterns=str.split( FLAGS.filename_pattern, ','), num_threads=FLAGS.num_input_threads, batch_size=FLAGS.batch_size, filter_fn=filter_fn, num_epochs=1) return dataset
def _get_input(): """ Get tf.data.Dataset according to command-line flags for training using tf.estimator.Estimator Note: Default behavior is bucketing according to default bucket boundaries listed in pipeline.get_data Returns: dataset : elements structured as [features, labels] feature structure can be seen in postbatch_fn in mjsynth.py or maptextsynth.py for static or dynamic data pipelines respectively """ # WARNING: More than two filters causes SEVERE throughput slowdown filter_fn = filters.input_filter_fn \ ( min_image_width=FLAGS.min_image_width, max_image_width=FLAGS.max_image_width, min_string_length=FLAGS.min_string_length, max_string_length=FLAGS.max_string_length, check_input=(not FLAGS.static_data) ) gpu_batch_size = FLAGS.batch_size // FLAGS.num_gpus # Pack keyword arguments into dictionary data_args = { 'num_threads': FLAGS.num_input_threads, 'batch_size': gpu_batch_size, 'filter_fn': filter_fn } if FLAGS.static_data: # Pack data stream-specific parameters data_args['base_dir'] = FLAGS.train_path data_args['file_patterns'] = str.split(FLAGS.filename_pattern, ',') else: data_args['synth_config_file'] = FLAGS.synth_config_file data_args['use_ipc_synth'] = FLAGS.ipc_synth if not FLAGS.bucket_data: data_args['boundaries'] = None # Turn off bucketing (on by default) elif not FLAGS.static_data: # Extra buckets for the wider synthetic data data_args['boundaries'] = [ 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416, 448, 480, 512 ] # Get data according to flags dataset = pipeline.get_data(FLAGS.static_data, **data_args) return dataset
def run_results_analysis(run_log_df):
    # get_data and gen_decile_table are project helpers defined elsewhere

    # Build a quoted, comma-separated list of model ids for the SQL IN clause
    model_id_list = list(run_log_df.model_id)
    model_id_list_str = "'" + "','".join(map(str, model_id_list)) + "'"

    results_stats_tablename = "model.results_stats"
    query_results_stats = ('select *'
                           ' from ' + results_stats_tablename +
                           ' where model_id in (' + model_id_list_str + ')')
    results_stats_df = get_data(query_results_stats)

    valid_tablename = "model.valid"
    query_valid = ('select *'
                   ' from ' + valid_tablename +
                   ' where model_id in (' + model_id_list_str + ')')
    #valid_df = get_data(query_valid)

    features_tablename = "model.feature_importances"
    query_features = ('select *'
                      ' from ' + features_tablename +
                      ' where model_id in (' + model_id_list_str + ')')
    #features_df = get_data(query_features)

    # Select the model with the highest precision_at_1
    model_top_precision = list(results_stats_df.sort_values(
        'precision_at_1', ascending=False)['model_id'])[0]

    valid_top_precision_tablename = "model.valid"
    query_valid_top_precision = ('select *'
                                 ' from ' + valid_top_precision_tablename +
                                 " where model_id in ('" +
                                 model_top_precision + "')")
    print(query_valid_top_precision)
    valid_top_precision_df = get_data(query_valid_top_precision)

    gen_decile_table(model_top_precision, valid_top_precision_df)
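# A hedged alternative sketch for building the model_id IN (...) filter with
# query parameters instead of string concatenation.  It assumes a DB-API
# connection (e.g. psycopg2) is available as `conn`; the function above only
# exposes the higher-level get_data(query) helper, so this is illustrative
# rather than a drop-in replacement.
import pandas as pd

def _fetch_results_stats(conn, model_id_list):
    placeholders = ','.join(['%s'] * len(model_id_list))
    query = ('select * from model.results_stats'
             ' where model_id in (' + placeholders + ')')
    # pandas passes the parameters through the DB-API driver for quoting
    return pd.read_sql(query, conn, params=[str(m) for m in model_id_list])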
def _get_input(): """ Get tf.data.Dataset object according to command-line flags for evaluation using tf.estimator.Estimator Note: Default behavior is bucketing according to default bucket boundaries listed in pipeline.get_data Returns: features, labels feature structure can be seen in postbatch_fn in mjsynth.py or maptextsynth.py for static or dynamic data pipelines respectively """ # WARNING: More than two filters causes SEVERE throughput slowdown filter_fn = filters.input_filter_fn \ ( min_image_width=FLAGS.min_image_width, max_image_width=FLAGS.max_image_width, min_string_length=FLAGS.min_string_length, max_string_length=FLAGS.max_string_length ) # Pack keyword arguments into dictionary data_args = { 'base_dir': FLAGS.test_path, 'file_patterns': str.split(FLAGS.filename_pattern, ','), 'num_threads': FLAGS.num_input_threads, 'batch_size': FLAGS.batch_size, 'filter_fn': filter_fn } if not FLAGS.bucket_data: # Turn off bucketing (on by default in pipeline) data_args['boundaries'] = None # Get data according to flags dataset = pipeline.get_data(use_static_data=True, **data_args) return dataset
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import scale


def run_feature_heatmap():
    # get_data is the project's database query helper, defined elsewhere
    tablename = "model.results_stats"
    break_window = "'3Year'"
    past_year = "6"
    static_features = ("'diameters,pipe_age,install_year_imputed,musym,"
                       "rocktype1,rocktype2,zone_name,material_imputed'")
    query_models = ('select model_id,model_name'
                    ' from ' + tablename +
                    ' where break_window = ' + break_window +
                    ' and past_year = ' + past_year +
                    ' and static_features = ' + static_features)
    print(query_models)
    model_df = get_data(query_models)

    model_name_list = list(model_df.model_name)
    model_id_list = list(model_df.model_id.values.ravel())
    model_id_list_str = "'" + "','".join(map(str, model_id_list)) + "'"

    tablename = "model.feature_importances"
    query_features = ('select *'
                      ' from ' + tablename +
                      ' where model_id in (' + model_id_list_str + ')')
    print(query_features)
    feature_imp = get_data(query_features)

    # Check that all models share the same feature set by comparing each
    # model's sorted feature names against reference model
    # '1471225699.39507699'; print the id of any model that differs
    reference_features = feature_imp.feature[
        feature_imp.model_id == '1471225699.39507699'].sort_values(
            inplace=False)
    for model_id in model_id_list:
        model_features = feature_imp.feature[
            feature_imp.model_id == model_id].sort_values(inplace=False)
        for i in (model_features.values == reference_features.values):
            if not i:
                print(model_id)

    feature_names_list = list(reference_features)

    # Stack each model's importances (sorted by feature name) into a
    # models x features matrix
    for model_id in model_id_list[:1]:
        feature_imp_matrix = feature_imp[
            feature_imp.model_id == model_id].sort_values(
                "feature", inplace=False).importance.values
    for model_id in model_id_list[1:]:
        b = feature_imp[feature_imp.model_id == model_id].sort_values(
            "feature", inplace=False).importance.values
        feature_imp_matrix = np.vstack((feature_imp_matrix, b))

    # Transpose to features x models and standardize each model's column
    feature_imp_matrix_normd = scale(np.transpose(feature_imp_matrix),
                                     axis=0, with_mean=True, with_std=True,
                                     copy=True)
    print(feature_imp_matrix_normd.shape)

    plt.figure(figsize=(50, 50))
    sns.heatmap(feature_imp_matrix_normd,  # annot=True,
                square=True,
                linewidths=0.5,
                yticklabels=feature_names_list,
                xticklabels=model_name_list)
    plt.savefig('feature_model_heatmap.png', bbox_inches='tight')
    plt.close()
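# A hedged alternative sketch: the manual vstack loop above can also be
# expressed as a single pandas pivot, producing the same models x features
# layout (assuming each (model_id, feature) pair appears exactly once in
# model.feature_importances).  This helper name is hypothetical.
import pandas as pd

def _importance_matrix(feature_imp):
    wide = feature_imp.pivot(index='model_id', columns='feature',
                             values='importance')
    # Rows: models, columns: features sorted by name, matching the loop above
    return wide.sort_index(axis=1)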