Example #1
def _get_input():
    """
    Get tf.data.Dataset object according to command-line flags for testing
    using tf.estimator.Estimator
    Returns:
      dataset : elements structured as [features, labels]
                feature structure can be seen in postbatch_fn 
                in mjsynth.py
    """

    # WARNING: More than two filters causes SEVERE throughput slowdown
    filter_fn = filters.input_filter_fn(
        min_image_width=FLAGS.min_image_width,
        max_image_width=FLAGS.max_image_width,
        min_string_length=FLAGS.min_string_length,
        max_string_length=FLAGS.max_string_length)

    # Get data according to flags
    dataset = pipeline.get_data(use_static_data=True,
                                base_dir=FLAGS.test_path,
                                file_patterns=FLAGS.filename_pattern.split(','),
                                num_threads=FLAGS.num_input_threads,
                                batch_size=FLAGS.batch_size,
                                filter_fn=filter_fn,
                                num_epochs=1)
    return dataset
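This input function is meant to be handed to a tf.estimator.Estimator as its input_fn. A minimal wiring sketch (model_fn and the model_dir value are assumptions, not shown on this page):

import tensorflow as tf

# Hypothetical usage; model_fn is assumed to be defined elsewhere in the project.
estimator = tf.estimator.Estimator(model_fn=model_fn, model_dir='./model')
print(estimator.evaluate(input_fn=_get_input))  # one pass, since num_epochs=1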
Example #2
def _get_input():
    """
    Get tf.data.Dataset according to command-line flags for training 
    using tf.estimator.Estimator

    Note: Default behavior is bucketing according to default bucket boundaries
    listed in pipeline.get_data

    Returns:
      dataset : elements structured as [features, labels]
                feature structure can be seen in postbatch_fn 
                in mjsynth.py or maptextsynth.py for static or dynamic
                data pipelines respectively
    """

    # WARNING: More than two filters causes SEVERE throughput slowdown
    filter_fn = filters.input_filter_fn(
        min_image_width=FLAGS.min_image_width,
        max_image_width=FLAGS.max_image_width,
        min_string_length=FLAGS.min_string_length,
        max_string_length=FLAGS.max_string_length,
        check_input=(not FLAGS.static_data))

    gpu_batch_size = FLAGS.batch_size // FLAGS.num_gpus

    # Pack keyword arguments into dictionary
    data_args = {
        'num_threads': FLAGS.num_input_threads,
        'batch_size': gpu_batch_size,
        'filter_fn': filter_fn
    }

    if FLAGS.static_data:  # Pack data stream-specific parameters
        data_args['base_dir'] = FLAGS.train_path
        data_args['file_patterns'] = FLAGS.filename_pattern.split(',')
    else:
        data_args['synth_config_file'] = FLAGS.synth_config_file
        data_args['use_ipc_synth'] = FLAGS.ipc_synth

    if not FLAGS.bucket_data:
        data_args['boundaries'] = None  # Turn off bucketing (on by default)
    elif not FLAGS.static_data:  # Extra buckets for the wider synthetic data
        data_args['boundaries'] = [
            32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416, 448,
            480, 512
        ]

    # Get data according to flags
    dataset = pipeline.get_data(FLAGS.static_data, **data_args)

    return dataset
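The boundaries list above bins variable-width images so that each batch contains similarly sized examples. pipeline.get_data is not shown on this page, but width-bucketing of this kind is commonly built on tf.data.experimental.bucket_by_sequence_length (TF 1.13+); a rough sketch, with the element structure assumed:

import tensorflow as tf

# Hypothetical sketch: assumes each unbatched element is (features, labels)
# with features['width'] holding the image width in pixels.
def _element_width(features, labels):
    return tf.cast(features['width'], tf.int32)

boundaries = [32, 64, 96, 128]              # example boundaries
batch_sizes = [32] * (len(boundaries) + 1)  # one batch size per bucket
dataset = dataset.apply(
    tf.data.experimental.bucket_by_sequence_length(
        element_length_func=_element_width,
        bucket_boundaries=boundaries,
        bucket_batch_sizes=batch_sizes))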
Example #3
def run_results_analysis(run_log_df):
    """Fetch stats for the models listed in run_log_df and build a decile
    table for the model with the top precision_at_1."""

    model_id_list = list(run_log_df.model_id)
    model_id_list_str = "'" + "','".join(map(str, model_id_list)) + "'"

    results_stats_tablename = "model.results_stats"
    query_results_stats = ('select *'
                           ' from ' + results_stats_tablename +
                           ' where model_id in (' + model_id_list_str + ')')
    results_stats_df = get_data(query_results_stats)

    valid_tablename = "model.valid"
    query_valid = ('select *'
                   ' from ' + valid_tablename + ' where model_id in (' +
                   model_id_list_str + ')')
    # valid_df = get_data(query_valid)

    features_tablename = "model.feature_importances"
    query_features = ('select *'
                      ' from ' + features_tablename + ' where model_id in (' +
                      model_id_list_str + ')')
    # features_df = get_data(query_features)

    model_top_precision = results_stats_df.sort_values(
        'precision_at_1', ascending=False)['model_id'].iloc[0]

    valid_top_precision_tablename = "model.valid"
    query_valid_top_precision = ('select *'
                                 ' from ' + valid_top_precision_tablename +
                                 " where model_id in ('" +
                                 model_top_precision + "')")
    print(query_valid_top_precision)

    valid_top_precision_df = get_data(query_valid_top_precision)
    gen_decile_table(model_top_precision, valid_top_precision_df)
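All of the queries above splice model ids into SQL by string concatenation. If get_data wraps a DB-API connection (an assumption; its implementation is not shown here), a parameterized query avoids the hand-rolled quoting:

import pandas as pd

# Hypothetical parameterized variant; assumes a psycopg2-style connection
# `conn`. '= any(%(ids)s)' is PostgreSQL-specific syntax.
query = 'select * from model.results_stats where model_id = any(%(ids)s)'
results_stats_df = pd.read_sql(query, conn, params={'ids': model_id_list})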
Example #4
def _get_input():
    """
    Get tf.data.Dataset object according to command-line flags for evaluation
    using tf.estimator.Estimator

    Note: Default behavior is bucketing according to default bucket boundaries
    listed in pipeline.get_data

    Returns:
      dataset : elements structured as [features, labels]
                feature structure can be seen in postbatch_fn 
                in mjsynth.py or maptextsynth.py for static or dynamic
                data pipelines respectively
    """

    # WARNING: More than two filters causes SEVERE throughput slowdown
    filter_fn = filters.input_filter_fn(
        min_image_width=FLAGS.min_image_width,
        max_image_width=FLAGS.max_image_width,
        min_string_length=FLAGS.min_string_length,
        max_string_length=FLAGS.max_string_length)

    # Pack keyword arguments into dictionary
    data_args = {
        'base_dir': FLAGS.test_path,
        'file_patterns': FLAGS.filename_pattern.split(','),
        'num_threads': FLAGS.num_input_threads,
        'batch_size': FLAGS.batch_size,
        'filter_fn': filter_fn
    }

    if not FLAGS.bucket_data:  # Turn off bucketing (on by default in pipeline)
        data_args['boundaries'] = None

    # Get data according to flags
    dataset = pipeline.get_data(use_static_data=True, **data_args)

    return dataset
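The Estimator unpacks each (features, labels) element of the returned dataset itself; to inspect a single batch outside the Estimator, a one-shot iterator works in TF 1.x graph mode (a debugging sketch, not part of the original file):

import tensorflow as tf

dataset = _get_input()
features, labels = dataset.make_one_shot_iterator().get_next()
with tf.Session() as sess:
    batch_features, batch_labels = sess.run([features, labels])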
Example #5
def run_feature_heatmap():
    """Plot a normalized heatmap of feature importances across models."""
    tablename = "model.results_stats"
    break_window = "'3Year'"
    past_year = "6"
    static_features = "'diameters,pipe_age,install_year_imputed,musym,rocktype1,rocktype2,zone_name,material_imputed'"
    query_models = ('select model_id,model_name'
                    ' from ' + tablename + ' where break_window = ' +
                    break_window + ' and past_year = ' + past_year +
                    ' and static_features = ' + static_features)
    print(query_models)

    model_df = get_data(query_models)
    model_name_list = list(model_df.model_name)
    model_id_list = list(model_df.model_id.values.ravel())

    model_id_list_str = "'" + "','".join(map(str, model_id_list)) + "'"

    tablename = "model.feature_importances"
    query_features = ('select *'
                      ' from ' + tablename + ' where model_id in (' +
                      model_id_list_str + ')')
    print(query_features)

    feature_imp = get_data(query_features)

    # Check that all models have the same features by comparing each
    # model's sorted feature list against a reference model's
    ref_features = list(feature_imp.feature[
        feature_imp.model_id == '1471225699.39507699'].sort_values())
    for model_id in model_id_list:
        model_features = list(feature_imp.feature[
            feature_imp.model_id == model_id].sort_values())
        if model_features != ref_features:
            print(model_id)

    feature_names_list = ref_features

    # Stack per-model importance vectors (one row per model, columns
    # ordered by sorted feature name)
    feature_imp_matrix = np.vstack([
        feature_imp[feature_imp.model_id == model_id]
        .sort_values('feature').importance.values
        for model_id in model_id_list
    ])

    # Standardize each model's importance vector (columns of the
    # transposed matrix) to zero mean and unit variance
    feature_imp_matrix_normd = scale(np.transpose(feature_imp_matrix),
                                     axis=0,
                                     with_mean=True,
                                     with_std=True,
                                     copy=True)

    plt.figure(figsize=(50, 50))
    sns.heatmap(feature_imp_matrix_normd,
                # annot=True,
                square=True,
                linewidths=0.5,
                yticklabels=feature_names_list,
                xticklabels=model_name_list)
    plt.savefig('feature_model_heatmap.png', bbox_inches='tight')
    plt.close()
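The stacking step above can also be written as a single pandas pivot, which keeps feature and model labels attached to the matrix (a sketch, assuming feature_imp has one row per (model_id, feature) pair):

# Hypothetical alternative to the vstack loop; selecting columns by
# model_id_list preserves the original model ordering.
imp_table = feature_imp.pivot(index='feature', columns='model_id',
                              values='importance')[model_id_list]
feature_imp_matrix = imp_table.to_numpy().T  # one row per model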