def write_narr_grids_to_file(frontal_grid_table, pickle_file_name):
    """Writes one or more NARR* grids to file.

    * NARR = North American Regional Reanalysis

    :param frontal_grid_table: pandas DataFrame with the following columns.
        Each row is one valid time.
    frontal_grid_table.unix_time_sec: Valid time.
    frontal_grid_table.warm_front_row_indices: length-W numpy array with row
        indices (integers) of grid cells intersected by a warm front.
    frontal_grid_table.warm_front_column_indices: Same as above, except for
        columns.
    frontal_grid_table.cold_front_row_indices: length-C numpy array with row
        indices (integers) of grid cells intersected by a cold front.
    frontal_grid_table.cold_front_column_indices: Same as above, except for
        columns.

    :param pickle_file_name: Path to output file.
    """

    error_checking.assert_columns_in_dataframe(
        frontal_grid_table, REQUIRED_GRID_COLUMNS)
    file_system_utils.mkdir_recursive_if_necessary(file_name=pickle_file_name)

    pickle_file_handle = open(pickle_file_name, 'wb')
    pickle.dump(frontal_grid_table[REQUIRED_GRID_COLUMNS], pickle_file_handle)
    pickle_file_handle.close()
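# Hypothetical usage sketch for `write_narr_grids_to_file`.  Column names
# follow the docstring above; the exact contents of REQUIRED_GRID_COLUMNS are
# an assumption here, and the file path is made up.
import numpy
import pandas

frontal_grid_table = pandas.DataFrame({
    'unix_time_sec': [1512061200],
    'warm_front_row_indices': [numpy.array([10, 10, 11], dtype=int)],
    'warm_front_column_indices': [numpy.array([5, 6, 6], dtype=int)],
    'cold_front_row_indices': [numpy.array([20, 21], dtype=int)],
    'cold_front_column_indices': [numpy.array([3, 3], dtype=int)]
})

write_narr_grids_to_file(frontal_grid_table, 'narr_frontal_grids.p')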
def write_station_metadata_to_processed_file(station_metadata_table,
                                             csv_file_name):
    """Writes metadata for weather stations to file.

    This is considered a "processed file," as opposed to a "raw file".  A "raw
    file" is one taken directly from another database, in the native format of
    said database.  For examples, see
    `hfmetar_io.read_station_metadata_from_raw_file` and
    `ok_mesonet_io.read_station_metadata_from_raw_file`.

    :param station_metadata_table: pandas DataFrame with the following columns.
    station_metadata_table.station_id: String ID for station.
    station_metadata_table.station_name: Verbose name for station.
    station_metadata_table.latitude_deg: Latitude (deg N).
    station_metadata_table.longitude_deg: Longitude (deg E).
    station_metadata_table.elevation_m_asl: Elevation (metres above sea level).
    station_metadata_table.utc_offset_hours [optional]: Local time minus UTC.

    :param csv_file_name: Path to output file.
    """

    error_checking.assert_columns_in_dataframe(
        station_metadata_table, REQUIRED_STATION_METADATA_COLUMNS)

    file_system_utils.mkdir_recursive_if_necessary(file_name=csv_file_name)
    station_metadata_table.to_csv(
        csv_file_name, header=True, columns=STATION_METADATA_COLUMNS,
        index=False)
def write_processed_file(tornado_table, csv_file_name):
    """Writes tornado reports to CSV file.

    This is considered a "processed file," as opposed to a "raw file" (one
    taken directly from the Storm Events database).  Raw files with tornado
    reports are handled by storm_events_io.py.

    :param tornado_table: pandas DataFrame with the following columns.
    tornado_table.start_time_unix_sec: Start time.
    tornado_table.end_time_unix_sec: End time.
    tornado_table.start_latitude_deg: Latitude (deg N) of start point.
    tornado_table.start_longitude_deg: Longitude (deg E) of start point.
    tornado_table.end_latitude_deg: Latitude (deg N) of end point.
    tornado_table.end_longitude_deg: Longitude (deg E) of end point.
    tornado_table.fujita_rating: F-scale or EF-scale rating (integer from
        0...5).
    tornado_table.width_metres: Tornado width (metres).

    :param csv_file_name: Path to output file.
    """

    error_checking.assert_columns_in_dataframe(
        tornado_table, MANDATORY_COLUMNS)

    file_system_utils.mkdir_recursive_if_necessary(file_name=csv_file_name)
    tornado_table.to_csv(
        csv_file_name, header=True, columns=MANDATORY_COLUMNS, index=False)
def _check_input_data_for_learning(
        input_table, feature_names, target_name=None):
    """Checks input data (to machine-learning model) for errors.

    :param input_table: pandas DataFrame, where each row is one example (data
        point).
    :param feature_names: 1-D list with names of features (predictor
        variables).  Each feature must be a column of input_table.
    :param target_name: Name of target variable (predictand).  Must be a
        column of input_table.  All values must be 0 or 1.
    """

    error_checking.assert_is_string_list(feature_names)
    error_checking.assert_is_numpy_array(
        numpy.array(feature_names), num_dimensions=1)

    if target_name is None:
        error_checking.assert_columns_in_dataframe(input_table, feature_names)
        return

    error_checking.assert_is_string(target_name)
    error_checking.assert_columns_in_dataframe(
        input_table, feature_names + [target_name])

    target_values = input_table[target_name].values
    error_checking.assert_is_integer_numpy_array(target_values)
    error_checking.assert_is_geq_numpy_array(target_values, 0)
    error_checking.assert_is_leq_numpy_array(target_values, 1)
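# Minimal sketch of how `_check_input_data_for_learning` might be called.  The
# DataFrame and column names below are invented; a non-binary target value
# (e.g., 2) would fail the final range checks.
import pandas

training_table = pandas.DataFrame({
    'reflectivity_dbz': [35., 50., 20.],
    'low_level_shear_s01': [0.001, 0.004, 0.002],
    'tornado_label': [0, 1, 0]
})

_check_input_data_for_learning(
    input_table=training_table,
    feature_names=['reflectivity_dbz', 'low_level_shear_s01'],
    target_name='tornado_label')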
def write_processed_file(wind_table, csv_file_name=None, write_mode='w'):
    """Writes wind observations to file.

    This is considered a "processed file," as opposed to a "raw file".  A "raw
    file" is one taken directly from another database, in the native format of
    said database.  For examples, see `madis_io.read_winds_from_raw_file` and
    `ok_mesonet_io.read_winds_from_raw_file`.

    :param wind_table: pandas DataFrame with the following columns.
    wind_table.station_id: String ID for station.
    wind_table.station_name: Verbose name for station.
    wind_table.latitude_deg: Latitude (deg N).
    wind_table.longitude_deg: Longitude (deg E).
    wind_table.elevation_m_asl: Elevation (metres above sea level).
    wind_table.unix_time_sec: Valid time in Unix format.
    wind_table.u_wind_m_s01: u-wind (metres per second).
    wind_table.v_wind_m_s01: v-wind (metres per second).

    :param csv_file_name: Path to output file.
    :param write_mode: Any string accepted by the built-in method `open`.
    """

    error_checking.assert_columns_in_dataframe(wind_table, WIND_COLUMNS)
    file_system_utils.mkdir_recursive_if_necessary(file_name=csv_file_name)

    write_header = not os.path.isfile(csv_file_name) or 'w' in write_mode
    wind_table.to_csv(
        csv_file_name, header=write_header, columns=WIND_COLUMNS, index=False,
        mode=write_mode)
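# Sketch of the header logic above: the first call (mode 'w') writes the
# header; a second call with mode 'a' appends rows without repeating it, since
# the file now exists.  The table contents are invented, and WIND_COLUMNS is
# assumed to list the eight columns in the docstring.
import pandas

wind_table = pandas.DataFrame({
    'station_id': ['KOUN'], 'station_name': ['Norman, Oklahoma'],
    'latitude_deg': [35.24], 'longitude_deg': [262.59],
    'elevation_m_asl': [357.], 'unix_time_sec': [1306264800],
    'u_wind_m_s01': [5.], 'v_wind_m_s01': [-3.]
})

write_processed_file(wind_table, csv_file_name='winds.csv', write_mode='w')
write_processed_file(wind_table, csv_file_name='winds.csv', write_mode='a')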
def test_assert_columns_in_dataframe_true(self):
    """Checks assert_columns_in_dataframe.

    In this case, input is pandas DataFrame with all desired columns.
    """

    error_checking.assert_columns_in_dataframe(
        DATAFRAME, COLUMNS_IN_DATAFRAME)
def test_assert_columns_in_dataframe_missing_columns(self):
    """Checks assert_columns_in_dataframe.

    In this case, input is pandas DataFrame but is missing one of the desired
    columns.
    """

    with self.assertRaises(KeyError):
        error_checking.assert_columns_in_dataframe(
            DATAFRAME, FAKE_COLUMNS_IN_DATAFRAME)
def read_normalization_params_from_file(pickle_file_name):
    """Reads normalization parameters from Pickle file.

    :param pickle_file_name: Path to input file.
    :return: radar_table_no_height: See doc for `write_normalization_params`.
    :return: radar_table_with_height: Same.
    :return: sounding_table_no_height: Same.
    :return: sounding_table_with_height: Same.
    """

    # TODO(thunderhoser): Move this to normalization.py or something.
    pickle_file_handle = open(pickle_file_name, 'rb')
    radar_table_no_height = pickle.load(pickle_file_handle)
    radar_table_with_height = pickle.load(pickle_file_handle)
    sounding_table_no_height = pickle.load(pickle_file_handle)
    sounding_table_with_height = pickle.load(pickle_file_handle)
    pickle_file_handle.close()

    error_checking.assert_columns_in_dataframe(
        radar_table_no_height, NORMALIZATION_COLUMNS_NO_HEIGHT)
    error_checking.assert_columns_in_dataframe(
        radar_table_with_height, NORMALIZATION_COLUMNS_WITH_HEIGHT)
    error_checking.assert_columns_in_dataframe(
        sounding_table_no_height, NORMALIZATION_COLUMNS_NO_HEIGHT)
    error_checking.assert_columns_in_dataframe(
        sounding_table_with_height, NORMALIZATION_COLUMNS_WITH_HEIGHT)

    return (radar_table_no_height, radar_table_with_height,
            sounding_table_no_height, sounding_table_with_height)
def read_polylines_from_file(pickle_file_name):
    """Reads one or more frontal polylines from Pickle file.

    :param pickle_file_name: Path to input file.
    :return: front_table: See documentation for `write_polylines_to_file`.
    """

    pickle_file_handle = open(pickle_file_name, 'rb')
    front_table = pickle.load(pickle_file_handle)
    pickle_file_handle.close()

    error_checking.assert_columns_in_dataframe(
        front_table, REQUIRED_POLYLINE_COLUMNS)
    return front_table
def read_file(pickle_file_name):
    """Reads tracking data from Pickle file.

    :param pickle_file_name: Path to input file.
    :return: storm_object_table: See documentation for `write_file`.
    """

    pickle_file_handle = open(pickle_file_name, 'rb')
    storm_object_table = pickle.load(pickle_file_handle)
    pickle_file_handle.close()

    error_checking.assert_columns_in_dataframe(
        storm_object_table, REQUIRED_COLUMNS)
    return storm_object_table
def read_storm_to_winds_table(pickle_file_name):
    """Reads linkages (storm-to-wind associations) from Pickle file.

    :param pickle_file_name: Path to input file.
    :return: storm_to_winds_table: pandas DataFrame with columns documented in
        `write_storm_to_winds_table`.
    """

    pickle_file_handle = open(pickle_file_name, 'rb')
    storm_to_winds_table = pickle.load(pickle_file_handle)
    pickle_file_handle.close()

    error_checking.assert_columns_in_dataframe(
        storm_to_winds_table, REQUIRED_COLUMNS_TO_WRITE)
    return storm_to_winds_table
def _read_intermediate_results(temp_file_name):
    """Reads intermediate best-track results for a subset of storm objects.

    :param temp_file_name: Path to intermediate file.
    :return: storm_object_table: See documentation for
        `_write_intermediate_results`.
    """

    pickle_file_handle = open(temp_file_name, 'rb')
    storm_object_table = pickle.load(pickle_file_handle)
    pickle_file_handle.close()

    error_checking.assert_columns_in_dataframe(
        storm_object_table, INTERMEDIATE_COLUMNS)
    return storm_object_table
def read_narr_grids_from_file(pickle_file_name):
    """Reads one or more NARR* grids from file.

    * NARR = North American Regional Reanalysis

    :param pickle_file_name: Path to input file.
    :return: frontal_grid_table: See documentation for
        `write_narr_grids_to_file`.
    """

    pickle_file_handle = open(pickle_file_name, 'rb')
    frontal_grid_table = pickle.load(pickle_file_handle)
    pickle_file_handle.close()

    error_checking.assert_columns_in_dataframe(
        frontal_grid_table, REQUIRED_GRID_COLUMNS)
    return frontal_grid_table
def read_processed_file(pickle_file_name):
    """Reads tracking data from file.

    This file should contain both polygons and track statistics for one time
    step and one tracking scale.

    :param pickle_file_name: Path to input file.
    :return: storm_object_table: See documentation for `write_processed_file`.
    """

    pickle_file_handle = open(pickle_file_name, 'rb')
    storm_object_table = pickle.load(pickle_file_handle)
    pickle_file_handle.close()

    error_checking.assert_columns_in_dataframe(
        storm_object_table, MANDATORY_COLUMNS)
    return storm_object_table
def read_file(pickle_file_name):
    """Reads normalization parameters from Pickle file.

    :param pickle_file_name: Path to input file.
    :return: norm_table_no_height: See doc for `write_file`.
    :return: norm_table_with_height: Same.
    """

    pickle_file_handle = open(pickle_file_name, 'rb')
    norm_table_no_height = pickle.load(pickle_file_handle)
    norm_table_with_height = pickle.load(pickle_file_handle)
    pickle_file_handle.close()

    error_checking.assert_columns_in_dataframe(
        norm_table_no_height, TABLE_COLUMNS)
    error_checking.assert_columns_in_dataframe(
        norm_table_with_height, TABLE_COLUMNS)

    return norm_table_no_height, norm_table_with_height
def read_normalization_params_from_file(pickle_file_name):
    """Reads normalization parameters from Pickle file.

    :param pickle_file_name: Path to input file.
    :return: radar_table_no_height: See doc for `write_normalization_params`.
    :return: radar_table_with_height: Same.
    :return: sounding_table_no_height: Same.
    :return: sounding_table_with_height: Same.
    """

    if not os.path.isfile(pickle_file_name):
        pickle_file_name = pickle_file_name.replace(
            '/condo/swatwork/ralager', '/scratch/ralager')
    if not os.path.isfile(pickle_file_name):
        pickle_file_name = pickle_file_name.replace(
            '/scratch/ralager', '/glade/scratch/ryanlage')
    if not os.path.isfile(pickle_file_name):
        pickle_file_name = pickle_file_name.replace(
            '/glade/scratch/ryanlage', '/glade/work/ryanlage')
    if not os.path.isfile(pickle_file_name):
        pickle_file_name = pickle_file_name.replace(
            '/glade/work/ryanlage', '/condo/swatwork/ralager')
    if not os.path.isfile(pickle_file_name):
        pickle_file_name = pickle_file_name.replace(
            '/condo/swatwork/ralager', '/condo/swatcommon/common')
    if not os.path.isfile(pickle_file_name):
        pickle_file_name = pickle_file_name.replace(
            '/condo/swatwork/ralager', '/scratch/ralager')

    # TODO(thunderhoser): Move this to normalization.py or something.
    pickle_file_handle = open(pickle_file_name, 'rb')
    radar_table_no_height = pickle.load(pickle_file_handle)
    radar_table_with_height = pickle.load(pickle_file_handle)
    sounding_table_no_height = pickle.load(pickle_file_handle)
    sounding_table_with_height = pickle.load(pickle_file_handle)
    pickle_file_handle.close()

    error_checking.assert_columns_in_dataframe(
        radar_table_no_height, NORMALIZATION_COLUMNS_NO_HEIGHT)
    error_checking.assert_columns_in_dataframe(
        radar_table_with_height, NORMALIZATION_COLUMNS_WITH_HEIGHT)
    error_checking.assert_columns_in_dataframe(
        sounding_table_no_height, NORMALIZATION_COLUMNS_NO_HEIGHT)
    error_checking.assert_columns_in_dataframe(
        sounding_table_with_height, NORMALIZATION_COLUMNS_WITH_HEIGHT)

    return (radar_table_no_height, radar_table_with_height,
            sounding_table_no_height, sounding_table_with_height)
def write_polylines_to_file(front_table, pickle_file_name):
    """Writes one or more frontal polylines to Pickle file.

    :param front_table: pandas DataFrame with the following columns.  Each row
        is one front.
    front_table.front_type: Type of front (examples: "warm", "cold").
    front_table.unix_time_sec: Valid time.
    front_table.latitudes_deg: 1-D numpy array of latitudes (deg N) along
        front.
    front_table.longitudes_deg: 1-D numpy array of longitudes (deg E) along
        front.

    :param pickle_file_name: Path to output file.
    """

    error_checking.assert_columns_in_dataframe(
        front_table, REQUIRED_POLYLINE_COLUMNS)
    file_system_utils.mkdir_recursive_if_necessary(file_name=pickle_file_name)

    pickle_file_handle = open(pickle_file_name, 'wb')
    pickle.dump(front_table[REQUIRED_POLYLINE_COLUMNS], pickle_file_handle)
    pickle_file_handle.close()
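# Hypothetical round trip with the polyline reader and writer defined in this
# module.  Column names follow the docstring; REQUIRED_POLYLINE_COLUMNS is
# assumed to match them, and the file path is made up.
import numpy
import pandas

front_table = pandas.DataFrame({
    'front_type': ['warm', 'cold'],
    'unix_time_sec': [1512061200, 1512061200],
    'latitudes_deg': [numpy.array([40., 41.]), numpy.array([35., 36.])],
    'longitudes_deg': [numpy.array([265., 266.]), numpy.array([260., 261.])]
})

write_polylines_to_file(front_table, 'fronts_2017120100.p')
front_table = read_polylines_from_file('fronts_2017120100.p')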
def write_normalization_params(
        pickle_file_name, radar_table_no_height, radar_table_with_height,
        sounding_table_no_height, sounding_table_with_height):
    """Writes normalization parameters to Pickle file.

    :param pickle_file_name: Path to output file.
    :param radar_table_no_height: Single-indexed pandas DataFrame.  Each index
        is a field name (accepted by `radar_utils.check_field_name`).  Must
        contain the following columns.
    radar_table_no_height.mean_value: Mean value for the given field.
    radar_table_no_height.standard_deviation: Standard deviation.
    radar_table_no_height.min_value: Minimum value.
    radar_table_no_height.max_value: Max value.

    :param radar_table_with_height: Double-indexed pandas DataFrame.  Each
        index is a tuple with (field_name, height_m_agl), where `field_name`
        is accepted by `radar_utils.check_field_name` and `height_m_agl` is in
        metres above ground level.  Must contain the following columns.
    radar_table_with_height.mean_value: Mean value for the given field.
    radar_table_with_height.standard_deviation: Standard deviation.

    :param sounding_table_no_height: Single-indexed pandas DataFrame.  Each
        index is a field name (accepted by `soundings.check_field_name`).
        Columns should be the same as in `radar_table_no_height`.
    :param sounding_table_with_height: Double-indexed pandas DataFrame.  Each
        index is a tuple with (field_name, height_m_agl), where `field_name`
        is accepted by `soundings.check_field_name` and `height_m_agl` is in
        metres above ground level.  Columns should be the same as in
        `radar_table_with_height`.
    """

    # TODO(thunderhoser): Move this to normalization.py or something.
    error_checking.assert_columns_in_dataframe(
        radar_table_no_height, NORMALIZATION_COLUMNS_NO_HEIGHT)
    error_checking.assert_columns_in_dataframe(
        radar_table_with_height, NORMALIZATION_COLUMNS_WITH_HEIGHT)
    error_checking.assert_columns_in_dataframe(
        sounding_table_no_height, NORMALIZATION_COLUMNS_NO_HEIGHT)
    error_checking.assert_columns_in_dataframe(
        sounding_table_with_height, NORMALIZATION_COLUMNS_WITH_HEIGHT)

    file_system_utils.mkdir_recursive_if_necessary(file_name=pickle_file_name)

    pickle_file_handle = open(pickle_file_name, 'wb')
    pickle.dump(radar_table_no_height, pickle_file_handle)
    pickle.dump(radar_table_with_height, pickle_file_handle)
    pickle.dump(sounding_table_no_height, pickle_file_handle)
    pickle.dump(sounding_table_with_height, pickle_file_handle)
    pickle_file_handle.close()
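# Sketch of the two DataFrame shapes that `write_normalization_params`
# expects.  Field names and values are invented; the point is the indexing: a
# plain index of field names for the "no height" table, and a pandas
# MultiIndex of (field_name, height_m_agl) tuples for the "with height" table.
import pandas

radar_table_no_height = pandas.DataFrame(
    {'mean_value': [30.], 'standard_deviation': [10.],
     'min_value': [0.], 'max_value': [75.]},
    index=['reflectivity_dbz'])

radar_table_with_height = pandas.DataFrame(
    {'mean_value': [30., 25.], 'standard_deviation': [10., 9.]},
    index=pandas.MultiIndex.from_tuples(
        [('reflectivity_dbz', 1000), ('reflectivity_dbz', 2000)]))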
def write_file(pickle_file_name, norm_table_no_height, norm_table_with_height):
    """Writes normalization parameters to Pickle file.

    :param pickle_file_name: Path to output file.
    :param norm_table_no_height: pandas DataFrame created by
        `finalize_params`, containing one set of params for each variable.
        This table should be single-indexed (field name only).
    :param norm_table_with_height: pandas DataFrame created by
        `finalize_params`, containing one set of params for each
        variable/height.  This table should be double-indexed (field name,
        then height in metres above ground level).
    """

    error_checking.assert_columns_in_dataframe(
        norm_table_no_height, TABLE_COLUMNS)
    error_checking.assert_columns_in_dataframe(
        norm_table_with_height, TABLE_COLUMNS)

    file_system_utils.mkdir_recursive_if_necessary(file_name=pickle_file_name)

    pickle_file_handle = open(pickle_file_name, 'wb')
    pickle.dump(norm_table_no_height, pickle_file_handle)
    pickle.dump(norm_table_with_height, pickle_file_handle)
    pickle_file_handle.close()
def check_statistic_table(statistic_table, require_storm_objects=True):
    """Ensures that pandas DataFrame contains shape statistics.

    :param statistic_table: pandas DataFrame.
    :param require_storm_objects: Boolean flag.  If True, statistic_table must
        contain columns "storm_id" and "unix_time_sec".  If False,
        statistic_table does not need these columns.
    :return: statistic_column_names: 1-D list containing names of columns with
        shape statistics.
    :raises: ValueError: if statistic_table does not contain any columns with
        shape statistics.
    """

    statistic_column_names = get_statistic_columns(statistic_table)
    if statistic_column_names is None:
        raise ValueError(
            'statistic_table does not contain any column with shape '
            'statistics.')

    if require_storm_objects:
        error_checking.assert_columns_in_dataframe(
            statistic_table, STORM_COLUMNS_TO_KEEP)

    return statistic_column_names
def check_label_table(label_table, require_storm_objects=True):
    """Ensures that pandas DataFrame contains labels.

    :param label_table: pandas DataFrame.
    :param require_storm_objects: Boolean flag.  If True, label_table must
        contain columns "storm_id" and "unix_time_sec".  If False, label_table
        does not need these columns.
    :return: label_column_names: 1-D list containing names of columns with
        regression or classification labels.
    :raises: ValueError: if label_table does not contain any columns with
        regression or classification labels.
    """

    label_column_names = get_label_columns(label_table)
    if label_column_names is None:
        raise ValueError(
            'label_table does not contain any column with regression or '
            'classification labels.')

    if require_storm_objects:
        error_checking.assert_columns_in_dataframe(
            label_table, MANDATORY_COLUMNS)

    return label_column_names
def check_feature_table(feature_table, require_storm_objects=True):
    """Ensures that pandas DataFrame contains features and labels.

    feature_table must contain one or more feature columns.

    feature_table must contain either 1 or 2 label columns.  If 2 columns,
    there must be one regression label L_r and one classification label L_c,
    where L_r is the regression version of L_c.

    :param feature_table: pandas DataFrame.
    :param require_storm_objects: Boolean flag.  If True, feature_table must
        contain columns "storm_id" and "unix_time_sec".  If False,
        feature_table does not need these columns.
    :return: feature_column_names: 1-D list containing names of columns with
        features.
    :return: regression_label_column_name: Name of column with regression
        label.  If there is no regression label, this will be None.
    :return: classification_label_column_name: Name of column with
        classification label.  If there is no classification label, this will
        be None.
    :raises: ValueError: if feature_table does not contain any feature columns.
    :raises: ValueError: if feature_table does not contain exactly one label
        column.
    """

    feature_column_names = radar_stats.get_statistic_columns(feature_table)

    shape_stat_column_names = shape_stats.get_statistic_columns(feature_table)
    if shape_stat_column_names:
        if feature_column_names:
            feature_column_names += shape_stat_column_names
        else:
            feature_column_names = shape_stat_column_names

    sounding_stat_column_names = soundings.get_sounding_stat_columns(
        feature_table)
    if sounding_stat_column_names:
        if feature_column_names:
            feature_column_names += sounding_stat_column_names
        else:
            feature_column_names = sounding_stat_column_names

    if feature_column_names is None:
        raise ValueError(
            'feature_table does not contain any columns with features '
            '(predictor variables).')

    regression_label_column_names = labels.get_regression_label_columns(
        feature_table)
    if regression_label_column_names and len(
            regression_label_column_names) == 1:
        regression_label_column_name = regression_label_column_names[0]
    else:
        regression_label_column_name = None

    classification_label_column_names = (
        labels.get_classification_label_columns(feature_table))
    if classification_label_column_names and len(
            classification_label_column_names) == 1:
        classification_label_column_name = classification_label_column_names[0]
    else:
        classification_label_column_name = None

    if regression_label_column_name and classification_label_column_name:
        classification_param_dict = labels.column_name_to_label_params(
            classification_label_column_name)

        this_regression_label_column_name = (
            labels.get_column_name_for_regression_label(
                min_lead_time_sec=classification_param_dict[
                    labels.MIN_LEAD_TIME_NAME],
                max_lead_time_sec=classification_param_dict[
                    labels.MAX_LEAD_TIME_NAME],
                min_distance_metres=classification_param_dict[
                    labels.MIN_DISTANCE_NAME],
                max_distance_metres=classification_param_dict[
                    labels.MAX_DISTANCE_NAME],
                percentile_level=classification_param_dict[
                    labels.PERCENTILE_LEVEL_NAME]))

        if this_regression_label_column_name != regression_label_column_name:
            regression_label_column_name = None
            classification_label_column_name = None

    if not (regression_label_column_name or classification_label_column_name):
        error_string = (
            '\n\n' + str(regression_label_column_names) +
            str(classification_label_column_names) +
            '\n\nfeature_table should contain one regression-label column, '
            'one classification-label column, or a classification-label '
            'column with the corresponding regression-label column.  '
            'Instead, contains label columns listed above.')
        raise ValueError(error_string)

    if require_storm_objects:
        error_checking.assert_columns_in_dataframe(
            feature_table, STORM_TO_WIND_COLUMNS_TO_KEEP)

    return (feature_column_names, regression_label_column_name,
            classification_label_column_name)
def _transform_each_marginal_to_uniform(new_feature_table,
                                        orig_feature_table=None):
    """Transforms marginal distribution of each feature to uniform distribution.

    This method transforms data in `new_feature_table` only.

    If `orig_feature_table` is None, the transformation for feature "x" in the
    [i]th example will be based on the percentile score of
    new_feature_table["x"].values[i] in new_feature_table["x"].values.

    If `orig_feature_table` is specified, the transformation for feature "x"
    in the [i]th example will be based on the percentile score of
    new_feature_table["x"].values[i] in orig_feature_table["x"].values.

    P = number of original examples
    Q = number of new examples
    M = number of features

    :param new_feature_table: pandas DataFrame with Q rows and M columns.
        Column names are feature names.
    :param orig_feature_table: pandas DataFrame with P rows and M columns.
        Column names are feature names.
    :return: transformed_new_feature_table: Same as input, except that the
        marginal distribution of each column is uniform.
    """

    # TODO(thunderhoser): I could probably make this faster for cases where
    # `orig_feature_table` is specified.

    feature_names = list(new_feature_table)
    new_feature_matrix = new_feature_table[feature_names].to_numpy()

    if orig_feature_table is not None:
        error_checking.assert_columns_in_dataframe(
            orig_feature_table, feature_names)
        orig_feature_matrix = orig_feature_table[feature_names].to_numpy()

    num_features = len(feature_names)
    num_new_examples = new_feature_matrix.shape[0]
    transformed_new_feature_table = None

    for j in range(num_features):
        new_indices_to_use = numpy.where(
            numpy.invert(numpy.isnan(new_feature_matrix[:, j])))[0]
        transformed_values = numpy.full(num_new_examples, 0.5)

        if orig_feature_table is None:
            these_ranks = scipy.stats.rankdata(
                new_feature_matrix[new_indices_to_use, j], method='average')
            transformed_values[new_indices_to_use] = (
                these_ranks / len(new_indices_to_use))
        else:
            orig_indices_to_use = numpy.where(
                numpy.invert(numpy.isnan(orig_feature_matrix[:, j])))[0]

            for i in new_indices_to_use:
                transformed_values[i] = scipy.stats.percentileofscore(
                    orig_feature_matrix[orig_indices_to_use, j],
                    new_feature_matrix[i, j], kind='weak') / 100

        if transformed_new_feature_table is None:
            transformed_new_feature_table = pandas.DataFrame.from_dict(
                {feature_names[j]: transformed_values})
        else:
            transformed_new_feature_table = (
                transformed_new_feature_table.assign(
                    **{feature_names[j]: transformed_values}))

    return transformed_new_feature_table
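# Toy demonstration of `_transform_each_marginal_to_uniform`.  With no
# `orig_feature_table`, each column is replaced by its rank-based percentile
# scores, so the non-NaN values [20, 50, 35] map to [1/3, 1, 2/3] and the NaN
# entry maps to the default 0.5.  Column names are invented.
import numpy
import pandas

new_feature_table = pandas.DataFrame({
    'reflectivity_dbz': [20., 50., 35., numpy.nan],
    'shear_s01': [0.001, 0.004, 0.002, 0.003]
})

print(_transform_each_marginal_to_uniform(new_feature_table))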
def test_assert_columns_in_dataframe_tuple(self):
    """Checks assert_columns_in_dataframe when input is tuple."""

    with self.assertRaises(TypeError):
        error_checking.assert_columns_in_dataframe(
            REAL_NUMBER_TUPLE, FAKE_COLUMNS_IN_DATAFRAME)
def test_assert_columns_in_dataframe_numpy_array(self):
    """Checks assert_columns_in_dataframe when input is numpy array."""

    with self.assertRaises(TypeError):
        error_checking.assert_columns_in_dataframe(
            REAL_NUMPY_ARRAY, FAKE_COLUMNS_IN_DATAFRAME)