def subset_by_standard_atmo(prediction_dict, standard_atmo_enum):
    """Subsets examples by standard-atmosphere type.

    :param prediction_dict: See doc for `write_file`.
    :param standard_atmo_enum: See doc for
        `example_utils.check_standard_atmo_type`.
    :return: prediction_dict: Same as input but with fewer examples.
    """

    example_utils.check_standard_atmo_type(standard_atmo_enum)

    all_standard_atmo_enums = example_utils.parse_example_ids(
        prediction_dict[EXAMPLE_IDS_KEY]
    )[example_utils.STANDARD_ATMO_FLAGS_KEY]

    desired_indices = numpy.where(
        all_standard_atmo_enums == standard_atmo_enum
    )[0]

    return subset_by_index(
        prediction_dict=prediction_dict, desired_indices=desired_indices
    )

def subset_by_zenith_angle(
        prediction_dict, min_zenith_angle_rad, max_zenith_angle_rad,
        max_inclusive=None):
    """Subsets examples by solar zenith angle.

    :param prediction_dict: See doc for `write_file`.
    :param min_zenith_angle_rad: Minimum zenith angle (radians).
    :param max_zenith_angle_rad: Maximum zenith angle (radians).
    :param max_inclusive: Boolean flag.  If True (False),
        `max_zenith_angle_rad` will be included in (excluded from) the subset.
    :return: prediction_dict: Same as input but with fewer examples.
    """

    error_checking.assert_is_geq(min_zenith_angle_rad, 0.)
    error_checking.assert_is_leq(
        max_zenith_angle_rad, MAX_ZENITH_ANGLE_RADIANS
    )
    error_checking.assert_is_greater(
        max_zenith_angle_rad, min_zenith_angle_rad
    )

    if max_inclusive is None:
        max_inclusive = max_zenith_angle_rad == MAX_ZENITH_ANGLE_RADIANS

    error_checking.assert_is_boolean(max_inclusive)

    all_zenith_angles_rad = example_utils.parse_example_ids(
        prediction_dict[EXAMPLE_IDS_KEY]
    )[example_utils.ZENITH_ANGLES_KEY]

    min_flags = all_zenith_angles_rad >= min_zenith_angle_rad

    if max_inclusive:
        max_flags = all_zenith_angles_rad <= max_zenith_angle_rad
    else:
        max_flags = all_zenith_angles_rad < max_zenith_angle_rad

    desired_indices = numpy.where(numpy.logical_and(min_flags, max_flags))[0]

    return subset_by_index(
        prediction_dict=prediction_dict, desired_indices=desired_indices
    )

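# --- Hedged usage sketch (editor addition, not part of the original module) ---
# A minimal example of how `subset_by_zenith_angle` might be called, assuming
# `prediction_dict` comes from `read_file` in this module.  The 0.7-rad cutoff
# is purely illustrative.
def _demo_subset_by_zenith_angle(prediction_file_name):
    """Keeps only examples with small solar zenith angles (sketch)."""

    prediction_dict = read_file(prediction_file_name)

    # With max_inclusive left as None, the upper bound is treated as exclusive
    # here, because 0.7 < MAX_ZENITH_ANGLE_RADIANS.
    small_angle_dict = subset_by_zenith_angle(
        prediction_dict=prediction_dict,
        min_zenith_angle_rad=0., max_zenith_angle_rad=0.7
    )
    return small_angle_dict
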
def test_parse_example_ids(self):
    """Ensures correct output from parse_example_ids."""

    metadata_dict = example_utils.parse_example_ids(EXAMPLE_ID_STRINGS)

    these_latitudes_deg_n = metadata_dict[example_utils.LATITUDES_KEY]
    these_longitudes_deg_e = metadata_dict[example_utils.LONGITUDES_KEY]
    these_albedos = metadata_dict[example_utils.ALBEDOS_KEY]
    these_zenith_angles_rad = metadata_dict[example_utils.ZENITH_ANGLES_KEY]
    these_times_unix_sec = metadata_dict[example_utils.VALID_TIMES_KEY]
    these_standard_atmo_flags = metadata_dict[
        example_utils.STANDARD_ATMO_FLAGS_KEY
    ]
    these_10m_temps_kelvins = metadata_dict[
        example_utils.TEMPERATURES_10M_KEY
    ]

    self.assertTrue(numpy.allclose(
        these_latitudes_deg_n, LATITUDES_FOR_ID_DEG_N, atol=TOLERANCE
    ))
    self.assertTrue(numpy.allclose(
        these_longitudes_deg_e, LONGITUDES_FOR_ID_DEG_E, atol=TOLERANCE
    ))
    self.assertTrue(numpy.allclose(
        these_albedos, ALBEDOS_FOR_ID, atol=TOLERANCE
    ))
    self.assertTrue(numpy.allclose(
        these_zenith_angles_rad, ZENITH_ANGLES_FOR_ID_RAD, atol=TOLERANCE
    ))
    self.assertTrue(numpy.array_equal(
        these_times_unix_sec, TIMES_FOR_ID_UNIX_SEC
    ))
    self.assertTrue(numpy.array_equal(
        these_standard_atmo_flags, STANDARD_ATMO_FLAGS_FOR_ID
    ))
    self.assertTrue(numpy.allclose(
        these_10m_temps_kelvins, TEMPERATURES_FOR_ID_KELVINS, atol=TOLERANCE
    ))

def subset_by_month(prediction_dict, desired_month):
    """Subsets examples by month.

    :param prediction_dict: See doc for `write_file`.
    :param desired_month: Desired month (integer from 1...12).
    :return: prediction_dict: Same as input but with fewer examples.
    """

    error_checking.assert_is_integer(desired_month)
    error_checking.assert_is_geq(desired_month, 1)
    error_checking.assert_is_leq(desired_month, 12)

    all_times_unix_sec = example_utils.parse_example_ids(
        prediction_dict[EXAMPLE_IDS_KEY]
    )[example_utils.VALID_TIMES_KEY]

    all_months = numpy.array([
        int(time_conversion.unix_sec_to_string(t, '%m'))
        for t in all_times_unix_sec
    ], dtype=int)

    desired_indices = numpy.where(all_months == desired_month)[0]

    return subset_by_index(
        prediction_dict=prediction_dict, desired_indices=desired_indices
    )

def subset_by_albedo(prediction_dict, min_albedo, max_albedo,
                     max_inclusive=None):
    """Subsets examples by albedo.

    :param prediction_dict: See doc for `write_file`.
    :param min_albedo: Minimum albedo (unitless).
    :param max_albedo: Maximum albedo (unitless).
    :param max_inclusive: Boolean flag.  If True (False), `max_albedo` will be
        included in (excluded from) the subset.
    :return: prediction_dict: Same as input but with fewer examples.
    """

    error_checking.assert_is_geq(min_albedo, 0.)
    error_checking.assert_is_leq(max_albedo, 1.)
    error_checking.assert_is_greater(max_albedo, min_albedo)

    if max_inclusive is None:
        max_inclusive = max_albedo == 1.

    error_checking.assert_is_boolean(max_inclusive)

    all_albedos = example_utils.parse_example_ids(
        prediction_dict[EXAMPLE_IDS_KEY]
    )[example_utils.ALBEDOS_KEY]

    min_flags = all_albedos >= min_albedo

    if max_inclusive:
        max_flags = all_albedos <= max_albedo
    else:
        max_flags = all_albedos < max_albedo

    desired_indices = numpy.where(numpy.logical_and(min_flags, max_flags))[0]

    return subset_by_index(
        prediction_dict=prediction_dict, desired_indices=desired_indices
    )

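# --- Hedged usage sketch (editor addition, not part of the original module) ---
# The subsetting functions above all return a smaller `prediction_dict`, so
# they can be chained.  This sketch keeps July examples over dark surfaces;
# the month and albedo thresholds are illustrative only.
def _demo_chained_subsetting(prediction_file_name):
    """Subsets predictions by month, then by albedo (sketch)."""

    prediction_dict = read_file(prediction_file_name)

    prediction_dict = subset_by_month(
        prediction_dict=prediction_dict, desired_month=7
    )
    prediction_dict = subset_by_albedo(
        prediction_dict=prediction_dict, min_albedo=0., max_albedo=0.2
    )
    return prediction_dict
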
def _run_experiment_one_example(
        example_dict, example_index, max_noise_k_day01, high_res_pressures_pa,
        high_res_heights_m_asl, first_interp_method_name,
        second_interp_method_name, interp_fluxes, output_dir_name):
    """Runs interpolation experiment for one example (one profile).

    H = number of levels in high-resolution grid

    :param example_dict: Dictionary in format returned by
        `example_io.read_file`.
    :param example_index: Will run experiment for [i]th example, where
        i = `example_index`.
    :param max_noise_k_day01: See documentation at top of file.
    :param high_res_pressures_pa: length-H numpy array of pressures (Pascals)
        in high-resolution grid.
    :param high_res_heights_m_asl: length-H numpy array of heights (metres
        above sea level) in high-resolution grid.
    :param first_interp_method_name: See documentation at top of file.
    :param second_interp_method_name: Same.
    :param interp_fluxes: Same.
    :param output_dir_name: Same.
    :return: max_difference_k_day01: Column-max difference between
        low-resolution and fake low-resolution heating rates.
    """

    example_id_string = (
        example_dict[example_utils.EXAMPLE_IDS_KEY][example_index]
    )
    metadata_dict = example_utils.parse_example_ids([example_id_string])

    surface_height_m_asl = geodetic_utils._get_elevation(
        latitude_deg=metadata_dict[example_utils.LATITUDES_KEY][0],
        longitude_deg=metadata_dict[example_utils.LONGITUDES_KEY][0]
    )[0]

    low_res_heights_m_agl = example_dict[example_utils.HEIGHTS_KEY]
    low_res_heights_m_asl = surface_height_m_asl + low_res_heights_m_agl
    low_res_pressures_pa = standard_atmo.height_to_pressure(
        low_res_heights_m_asl
    )

    low_res_heating_rates_k_day01 = example_utils.get_field_from_dict(
        example_dict=example_dict,
        field_name=example_utils.SHORTWAVE_HEATING_RATE_NAME
    )[example_index, :]

    if interp_fluxes:
        low_res_down_fluxes_w_m02 = example_utils.get_field_from_dict(
            example_dict=example_dict,
            field_name=example_utils.SHORTWAVE_DOWN_FLUX_NAME
        )[example_index, :]

        interp_object = interp1d(
            x=low_res_pressures_pa[::-1], y=low_res_down_fluxes_w_m02[::-1],
            kind=first_interp_method_name, bounds_error=False,
            fill_value='extrapolate', assume_sorted=True
        )
        high_res_down_fluxes_w_m02 = interp_object(high_res_pressures_pa)

        low_res_up_fluxes_w_m02 = example_utils.get_field_from_dict(
            example_dict=example_dict,
            field_name=example_utils.SHORTWAVE_UP_FLUX_NAME
        )[example_index, :]

        interp_object = interp1d(
            x=low_res_pressures_pa[::-1], y=low_res_up_fluxes_w_m02[::-1],
            kind=first_interp_method_name, bounds_error=False,
            fill_value='extrapolate', assume_sorted=True
        )
        high_res_up_fluxes_w_m02 = interp_object(high_res_pressures_pa)

        high_res_heating_rates_k_day01 = _fluxes_to_heating_rate(
            down_fluxes_w_m02=high_res_down_fluxes_w_m02,
            up_fluxes_w_m02=high_res_up_fluxes_w_m02,
            pressures_pa=high_res_pressures_pa
        )

        interp_object = interp1d(
            x=high_res_pressures_pa[::-1], y=high_res_down_fluxes_w_m02[::-1],
            kind=second_interp_method_name, bounds_error=True,
            assume_sorted=True
        )
        fake_low_res_down_fluxes_w_m02 = interp_object(low_res_pressures_pa)

        interp_object = interp1d(
            x=high_res_pressures_pa[::-1], y=high_res_up_fluxes_w_m02[::-1],
            kind=second_interp_method_name, bounds_error=True,
            assume_sorted=True
        )
        fake_low_res_up_fluxes_w_m02 = interp_object(low_res_pressures_pa)

        fake_low_res_heating_rates_k_day01 = _fluxes_to_heating_rate(
            down_fluxes_w_m02=fake_low_res_down_fluxes_w_m02,
            up_fluxes_w_m02=fake_low_res_up_fluxes_w_m02,
            pressures_pa=low_res_pressures_pa
        )
    else:
        interp_object = interp1d(
            x=low_res_pressures_pa[::-1],
            y=low_res_heating_rates_k_day01[::-1],
            kind=first_interp_method_name, bounds_error=False,
            fill_value='extrapolate', assume_sorted=True
        )
        high_res_heating_rates_k_day01 = interp_object(high_res_pressures_pa)

        noise_values_k_day01 = numpy.random.uniform(
            low=-max_noise_k_day01, high=max_noise_k_day01,
            size=high_res_heating_rates_k_day01.shape
        )
        noise_values_k_day01 *= (
            max_noise_k_day01 /
            numpy.max(numpy.absolute(noise_values_k_day01))
        )
        high_res_heating_rates_k_day01 += noise_values_k_day01

        interp_object = interp1d(
            x=high_res_pressures_pa[::-1],
            y=high_res_heating_rates_k_day01[::-1],
            kind=second_interp_method_name, bounds_error=True,
            assume_sorted=True
        )
        fake_low_res_heating_rates_k_day01 = interp_object(
            low_res_pressures_pa
        )

    high_res_heights_m_agl = high_res_heights_m_asl - surface_height_m_asl

    figure_object, axes_object = profile_plotting.plot_one_variable(
        values=low_res_heating_rates_k_day01,
        heights_m_agl=low_res_heights_m_agl, use_log_scale=True,
        line_colour=LOW_RES_COLOUR, line_width=LOW_RES_LINE_WIDTH
    )
    profile_plotting.plot_one_variable(
        values=high_res_heating_rates_k_day01,
        heights_m_agl=high_res_heights_m_agl, use_log_scale=True,
        line_colour=HIGH_RES_COLOUR, line_width=HIGH_RES_LINE_WIDTH,
        figure_object=figure_object
    )
    profile_plotting.plot_one_variable(
        values=fake_low_res_heating_rates_k_day01,
        heights_m_agl=low_res_heights_m_agl, use_log_scale=True,
        line_colour=FAKE_LOW_RES_COLOUR, line_width=FAKE_LOW_RES_LINE_WIDTH,
        figure_object=figure_object
    )

    axes_object.set_xlim(left=-0.5)
    y_max = axes_object.get_ylim()[1]
    axes_object.set_ylim(top=y_max * 1.05)

    max_difference_k_day01 = numpy.max(numpy.absolute(
        low_res_heating_rates_k_day01 - fake_low_res_heating_rates_k_day01
    ))

    title_string = (
        'Max diff between low-res and reconstructed low-res = {0:.4f} K day'
    ).format(max_difference_k_day01)
    title_string = title_string + r'$^{-1}$'
    axes_object.set_title(title_string, fontsize=20)

    output_file_name = '{0:s}/{1:s}.jpg'.format(
        output_dir_name, example_id_string
    )
    print('Saving figure to: "{0:s}"...'.format(output_file_name))
    figure_object.savefig(
        output_file_name, dpi=FIGURE_RESOLUTION_DPI, pad_inches=0,
        bbox_inches='tight'
    )
    pyplot.close(figure_object)

    return max_difference_k_day01

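# --- Hedged sketch (editor addition) ---
# `_fluxes_to_heating_rate` is not shown in this section.  The sketch below
# illustrates the standard flux-convergence relation that such a helper
# presumably implements (an assumption, not confirmed from the source):
# heating rate = -(g / c_p) * d(F_down - F_up)/dp, converted from K s^-1 to
# K day^-1.
def _fluxes_to_heating_rate_sketch(down_fluxes_w_m02, up_fluxes_w_m02,
                                   pressures_pa):
    """Converts flux profiles to heating rates (illustrative only)."""

    grav_accel_m_s02 = 9.80665
    dry_air_spec_heat_j_kg01_k01 = 1004.
    seconds_to_days = 86400.

    # Net downward flux; its convergence with pressure gives the heating.
    net_fluxes_w_m02 = down_fluxes_w_m02 - up_fluxes_w_m02
    heating_rates_k_s01 = (
        -(grav_accel_m_s02 / dry_air_spec_heat_j_kg01_k01) *
        numpy.gradient(net_fluxes_w_m02, pressures_pa)
    )
    return seconds_to_days * heating_rates_k_s01
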
def _run(input_file_name, top_output_dir_name):
    """Splits predictions by site (point location).

    This is effectively the main method.

    :param input_file_name: See documentation at top of file.
    :param top_output_dir_name: Same.
    :raises: ValueError: if any example cannot be assigned to a site.
    """

    # Read data.
    print('Reading data from: "{0:s}"...'.format(input_file_name))
    prediction_dict = prediction_io.read_file(input_file_name)

    example_metadata_dict = example_utils.parse_example_ids(
        prediction_dict[prediction_io.EXAMPLE_IDS_KEY]
    )
    example_latitudes_deg_n = number_rounding.round_to_nearest(
        example_metadata_dict[example_utils.LATITUDES_KEY],
        LATLNG_TOLERANCE_DEG
    )
    example_longitudes_deg_e = number_rounding.round_to_nearest(
        example_metadata_dict[example_utils.LONGITUDES_KEY],
        LATLNG_TOLERANCE_DEG
    )
    example_longitudes_deg_e = lng_conversion.convert_lng_positive_in_west(
        example_longitudes_deg_e
    )

    num_examples = len(example_latitudes_deg_n)
    example_written_flags = numpy.full(num_examples, False, dtype=bool)

    site_names = list(SITE_NAME_TO_LATLNG.keys())
    num_sites = len(site_names)

    for j in range(num_sites):
        this_site_latitude_deg_n = SITE_NAME_TO_LATLNG[site_names[j]][0]
        this_site_longitude_deg_e = SITE_NAME_TO_LATLNG[site_names[j]][1]

        these_indices = numpy.where(numpy.logical_and(
            numpy.absolute(
                example_latitudes_deg_n - this_site_latitude_deg_n
            ) <= LATLNG_TOLERANCE_DEG,
            numpy.absolute(
                example_longitudes_deg_e - this_site_longitude_deg_e
            ) <= LATLNG_TOLERANCE_DEG
        ))[0]

        this_prediction_dict = prediction_io.subset_by_index(
            prediction_dict=copy.deepcopy(prediction_dict),
            desired_indices=these_indices
        )

        this_output_file_name = '{0:s}/{1:s}/predictions.nc'.format(
            top_output_dir_name, site_names[j]
        )
        print('Writing {0:d} examples to: "{1:s}"...'.format(
            len(these_indices), this_output_file_name
        ))

        if len(these_indices) == 0:
            continue

        example_written_flags[these_indices] = True

        prediction_io.write_file(
            netcdf_file_name=this_output_file_name,
            scalar_target_matrix=
            this_prediction_dict[prediction_io.SCALAR_TARGETS_KEY],
            vector_target_matrix=
            this_prediction_dict[prediction_io.VECTOR_TARGETS_KEY],
            scalar_prediction_matrix=
            this_prediction_dict[prediction_io.SCALAR_PREDICTIONS_KEY],
            vector_prediction_matrix=
            this_prediction_dict[prediction_io.VECTOR_PREDICTIONS_KEY],
            heights_m_agl=this_prediction_dict[prediction_io.HEIGHTS_KEY],
            example_id_strings=
            this_prediction_dict[prediction_io.EXAMPLE_IDS_KEY],
            model_file_name=this_prediction_dict[prediction_io.MODEL_FILE_KEY]
        )

    if numpy.all(example_written_flags):
        return

    # bad_latitudes_deg_n = (
    #     example_latitudes_deg_n[example_written_flags == False]
    # )
    # bad_longitudes_deg_e = (
    #     example_longitudes_deg_e[example_written_flags == False]
    # )
    # bad_coord_matrix = numpy.transpose(numpy.vstack((
    #     bad_latitudes_deg_n, bad_longitudes_deg_e
    # )))
    # bad_coord_matrix = numpy.unique(bad_coord_matrix, axis=0)
    # print(bad_coord_matrix)

    error_string = (
        '{0:d} of {1:d} examples could not be assigned to a site.  This is a '
        'BIG PROBLEM.'
    ).format(numpy.sum(example_written_flags == False), num_examples)

    raise ValueError(error_string)

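# --- Hedged sketch (editor addition) ---
# `SITE_NAME_TO_LATLNG` is defined elsewhere in this script.  Based on how it
# is indexed above ([0] = latitude in deg N, [1] = longitude in deg E), it
# presumably maps each site name to a (latitude, longitude) pair, as in the
# toy dictionary below.  The names and coordinates here are made up.
SITE_NAME_TO_LATLNG_SKETCH = {
    'example_site_a': (70.0, 203.0),
    'example_site_b': (-2.1, 147.4)
}
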
def read_file(netcdf_file_name, exclude_summit_greenland=False,
              id_strings_to_read=None, allow_missing_ids=False):
    """Reads learning examples from NetCDF file.

    E = number of examples
    H = number of heights
    P_s = number of scalar predictors
    P_v = number of vector predictors
    T_s = number of scalar targets
    T_v = number of vector targets

    :param netcdf_file_name: Path to input file.
    :param exclude_summit_greenland: Boolean flag.  If True, will not read data
        from Summit, Greenland.
    :param id_strings_to_read: 1-D list of IDs for examples to read.  If None,
        will read all examples.
    :param allow_missing_ids: [used only if `id_strings_to_read is not None`]
        Boolean flag.  If True, will allow missing IDs.  If False, will throw
        error for missing IDs.
    :return: example_dict: Dictionary with the following keys.
    example_dict['scalar_predictor_matrix']: numpy array (E x P_s) with values
        of scalar predictors.
    example_dict['scalar_predictor_names']: list (length P_s) with names of
        scalar predictors.
    example_dict['vector_predictor_matrix']: numpy array (E x H x P_v) with
        values of vector predictors.
    example_dict['vector_predictor_names']: list (length P_v) with names of
        vector predictors.
    example_dict['scalar_target_matrix']: numpy array (E x T_s) with values of
        scalar targets.
    example_dict['scalar_target_names']: list (length T_s) with names of
        scalar targets.
    example_dict['vector_target_matrix']: numpy array (E x H x T_v) with
        values of vector targets.
    example_dict['vector_target_names']: list (length T_v) with names of
        vector targets.
    example_dict['valid_times_unix_sec']: length-E numpy array of valid times
        (Unix seconds).
    example_dict['heights_m_agl']: length-H numpy array of heights (metres
        above ground level).
    example_dict['standard_atmo_flags']: length-E numpy array of flags (each
        in the list `STANDARD_ATMO_ENUMS`).
    example_dict['example_id_strings']: length-E list of example IDs.
    """

    # TODO(thunderhoser): This is a HACK.
    if not os.path.isfile(netcdf_file_name):
        netcdf_file_name = netcdf_file_name.replace(
            '/home/ryan.lagerquist', '/home/ralager'
        )

    dataset_object = netCDF4.Dataset(netcdf_file_name)

    example_id_strings = [
        str(id) for id in netCDF4.chartostring(
            dataset_object.variables[example_utils.EXAMPLE_IDS_KEY][:]
        )
    ]

    if id_strings_to_read is None:
        num_examples = dataset_object.dimensions[EXAMPLE_DIMENSION_KEY].size
        indices_to_read = numpy.linspace(
            0, num_examples - 1, num=num_examples, dtype=int
        )
    else:
        exclude_summit_greenland = False

        indices_to_read = example_utils.find_examples(
            all_id_strings=example_id_strings,
            desired_id_strings=id_strings_to_read,
            allow_missing=allow_missing_ids
        )
        indices_to_read = indices_to_read[indices_to_read >= 0]

    error_checking.assert_is_boolean(exclude_summit_greenland)

    # TODO(thunderhoser): This is a HACK to deal with potentially bad data.
    if exclude_summit_greenland:
        metadata_dict = example_utils.parse_example_ids(example_id_strings)
        latitudes_deg_n = metadata_dict[example_utils.LATITUDES_KEY]
        longitudes_deg_e = lng_conversion.convert_lng_positive_in_west(
            metadata_dict[example_utils.LONGITUDES_KEY]
        )

        bad_flags = numpy.logical_and(
            numpy.isclose(latitudes_deg_n, SUMMIT_LATITUDE_DEG_N, atol=1e-4),
            numpy.isclose(longitudes_deg_e, SUMMIT_LONGITUDE_DEG_E, atol=1e-4)
        )
        good_indices = numpy.where(numpy.invert(bad_flags))[0]

        warning_string = (
            'Removing {0:d} of {1:d} examples (profiles), because they are at '
            'Summit GL.'
        ).format(
            len(indices_to_read) - len(good_indices), len(indices_to_read)
        )
        warnings.warn(warning_string)

        indices_to_read = indices_to_read[good_indices]

    example_dict = {
        example_utils.EXAMPLE_IDS_KEY:
        [example_id_strings[k] for k in indices_to_read]
    }

    string_keys = [
        example_utils.SCALAR_PREDICTOR_NAMES_KEY,
        example_utils.VECTOR_PREDICTOR_NAMES_KEY,
        example_utils.SCALAR_TARGET_NAMES_KEY,
        example_utils.VECTOR_TARGET_NAMES_KEY
    ]
    main_data_keys = [
        example_utils.SCALAR_PREDICTOR_VALS_KEY,
        example_utils.VECTOR_PREDICTOR_VALS_KEY,
        example_utils.SCALAR_TARGET_VALS_KEY,
        example_utils.VECTOR_TARGET_VALS_KEY
    ]
    integer_keys = [
        example_utils.VALID_TIMES_KEY,
        example_utils.STANDARD_ATMO_FLAGS_KEY
    ]

    for this_key in string_keys:
        example_dict[this_key] = [
            str(n) for n in netCDF4.chartostring(
                dataset_object.variables[this_key][:]
            )
        ]

    for this_key in main_data_keys:
        example_dict[this_key] = numpy.array(
            dataset_object.variables[this_key][indices_to_read, ...],
            dtype=float
        )

    for this_key in integer_keys:
        example_dict[this_key] = numpy.array(numpy.round(
            dataset_object.variables[this_key][indices_to_read]
        ), dtype=int)

    example_dict[example_utils.HEIGHTS_KEY] = numpy.array(
        dataset_object.variables[example_utils.HEIGHTS_KEY][:], dtype=float
    )

    dataset_object.close()
    return example_dict

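# --- Hedged usage sketch (editor addition) ---
# Reading only a requested subset of examples by ID.  The file path and ID
# strings passed in are placeholders; real IDs must follow the format parsed
# by `example_utils.parse_example_ids`.
def _demo_read_specific_examples(netcdf_file_name, desired_id_strings):
    """Reads only the requested example IDs from one file (sketch)."""

    example_dict = read_file(
        netcdf_file_name=netcdf_file_name,
        id_strings_to_read=desired_id_strings, allow_missing_ids=True
    )
    print('Read {0:d} of {1:d} requested examples.'.format(
        len(example_dict[example_utils.EXAMPLE_IDS_KEY]),
        len(desired_id_strings)
    ))
    return example_dict
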
def _run(input_file_name, min_latitude_deg, max_latitude_deg,
         min_longitude_deg, max_longitude_deg, latitude_spacing_deg,
         longitude_spacing_deg, output_dir_name):
    """Splits predictions by spatial region.

    This is effectively the main method.

    :param input_file_name: See documentation at top of file.
    :param min_latitude_deg: Same.
    :param max_latitude_deg: Same.
    :param min_longitude_deg: Same.
    :param max_longitude_deg: Same.
    :param latitude_spacing_deg: Same.
    :param longitude_spacing_deg: Same.
    :param output_dir_name: Same.
    """

    # Read data.
    print('Reading data from: "{0:s}"...'.format(input_file_name))
    prediction_dict = prediction_io.read_file(input_file_name)

    example_metadata_dict = example_utils.parse_example_ids(
        prediction_dict[prediction_io.EXAMPLE_IDS_KEY]
    )
    example_latitudes_deg = example_metadata_dict[example_utils.LATITUDES_KEY]
    example_longitudes_deg = (
        example_metadata_dict[example_utils.LONGITUDES_KEY]
    )

    these_limits_deg = numpy.array([
        min_latitude_deg, max_latitude_deg,
        min_longitude_deg, max_longitude_deg
    ])

    if numpy.any(numpy.isnan(these_limits_deg)):
        min_latitude_deg = numpy.min(example_latitudes_deg)
        max_latitude_deg = numpy.max(example_latitudes_deg)
        min_longitude_deg = numpy.min(example_longitudes_deg)
        max_longitude_deg = numpy.max(example_longitudes_deg)

    # Create grid.
    grid_point_latitudes_deg, grid_point_longitudes_deg = (
        misc.create_latlng_grid(
            min_latitude_deg=min_latitude_deg,
            max_latitude_deg=max_latitude_deg,
            latitude_spacing_deg=latitude_spacing_deg,
            min_longitude_deg=min_longitude_deg,
            max_longitude_deg=max_longitude_deg,
            longitude_spacing_deg=longitude_spacing_deg
        )
    )

    num_grid_rows = len(grid_point_latitudes_deg)
    num_grid_columns = len(grid_point_longitudes_deg)

    grid_edge_latitudes_deg, grid_edge_longitudes_deg = (
        grids.get_latlng_grid_cell_edges(
            min_latitude_deg=grid_point_latitudes_deg[0],
            min_longitude_deg=grid_point_longitudes_deg[0],
            lat_spacing_deg=numpy.diff(grid_point_latitudes_deg[:2])[0],
            lng_spacing_deg=numpy.diff(grid_point_longitudes_deg[:2])[0],
            num_rows=num_grid_rows, num_columns=num_grid_columns
        )
    )

    print(SEPARATOR_STRING)

    for i in range(num_grid_rows):
        for j in range(num_grid_columns):
            these_indices = grids.find_events_in_grid_cell(
                event_x_coords_metres=example_longitudes_deg,
                event_y_coords_metres=example_latitudes_deg,
                grid_edge_x_coords_metres=grid_edge_longitudes_deg,
                grid_edge_y_coords_metres=grid_edge_latitudes_deg,
                row_index=i, column_index=j, verbose=False
            )

            this_prediction_dict = prediction_io.subset_by_index(
                prediction_dict=copy.deepcopy(prediction_dict),
                desired_indices=these_indices
            )

            this_num_examples = len(
                this_prediction_dict[prediction_io.EXAMPLE_IDS_KEY]
            )
            if this_num_examples == 0:
                continue

            this_output_file_name = prediction_io.find_file(
                directory_name=output_dir_name, grid_row=i, grid_column=j,
                raise_error_if_missing=False
            )
            print('Writing {0:d} examples to: "{1:s}"...'.format(
                this_num_examples, this_output_file_name
            ))

            prediction_io.write_file(
                netcdf_file_name=this_output_file_name,
                scalar_target_matrix=
                this_prediction_dict[prediction_io.SCALAR_TARGETS_KEY],
                vector_target_matrix=
                this_prediction_dict[prediction_io.VECTOR_TARGETS_KEY],
                scalar_prediction_matrix=
                this_prediction_dict[prediction_io.SCALAR_PREDICTIONS_KEY],
                vector_prediction_matrix=
                this_prediction_dict[prediction_io.VECTOR_PREDICTIONS_KEY],
                heights_m_agl=this_prediction_dict[prediction_io.HEIGHTS_KEY],
                example_id_strings=
                this_prediction_dict[prediction_io.EXAMPLE_IDS_KEY],
                model_file_name=
                this_prediction_dict[prediction_io.MODEL_FILE_KEY]
            )

    print(SEPARATOR_STRING)

    grid_metafile_name = prediction_io.find_grid_metafile(
        prediction_dir_name=output_dir_name, raise_error_if_missing=False
    )

    print('Writing grid metadata to: "{0:s}"...'.format(grid_metafile_name))
    prediction_io.write_grid_metafile(
        grid_point_latitudes_deg=grid_point_latitudes_deg,
        grid_point_longitudes_deg=grid_point_longitudes_deg,
        netcdf_file_name=grid_metafile_name
    )

def write_file(
        netcdf_file_name, scalar_target_matrix, vector_target_matrix,
        scalar_prediction_matrix, vector_prediction_matrix, heights_m_agl,
        example_id_strings, model_file_name, isotonic_model_file_name=None):
    """Writes predictions to NetCDF file.

    E = number of examples
    H = number of heights
    T_s = number of scalar targets
    T_v = number of vector targets

    :param netcdf_file_name: Path to output file.
    :param scalar_target_matrix: numpy array (E x T_s) with actual values of
        scalar targets.
    :param vector_target_matrix: numpy array (E x H x T_v) with actual values
        of vector targets.
    :param scalar_prediction_matrix: Same as `scalar_target_matrix` but with
        predicted values.
    :param vector_prediction_matrix: Same as `vector_target_matrix` but with
        predicted values.
    :param heights_m_agl: length-H numpy array of heights (metres above ground
        level).
    :param example_id_strings: length-E list of IDs created by
        `example_utils.create_example_ids`.
    :param model_file_name: Path to file with trained model (readable by
        `neural_net.read_model`).
    :param isotonic_model_file_name: Path to file with trained isotonic-
        regression models (readable by `isotonic_regression.read_file`) used
        to make predictions.  If isotonic regression was not used, leave this
        as None.
    """

    # Check input args.
    error_checking.assert_is_numpy_array_without_nan(scalar_target_matrix)
    error_checking.assert_is_numpy_array(
        scalar_target_matrix, num_dimensions=2
    )

    error_checking.assert_is_numpy_array_without_nan(scalar_prediction_matrix)
    error_checking.assert_is_numpy_array(
        scalar_prediction_matrix,
        exact_dimensions=numpy.array(scalar_target_matrix.shape, dtype=int)
    )

    error_checking.assert_is_numpy_array_without_nan(vector_target_matrix)
    error_checking.assert_is_numpy_array(
        vector_target_matrix, num_dimensions=3
    )

    num_examples = scalar_target_matrix.shape[0]
    expected_dim = numpy.array(
        (num_examples,) + vector_target_matrix.shape[1:], dtype=int
    )
    error_checking.assert_is_numpy_array(
        vector_target_matrix, exact_dimensions=expected_dim
    )

    error_checking.assert_is_numpy_array_without_nan(vector_prediction_matrix)
    error_checking.assert_is_numpy_array(
        vector_prediction_matrix,
        exact_dimensions=numpy.array(vector_target_matrix.shape, dtype=int)
    )

    num_heights = vector_target_matrix.shape[1]
    error_checking.assert_is_greater_numpy_array(heights_m_agl, 0.)
    error_checking.assert_is_numpy_array(
        heights_m_agl,
        exact_dimensions=numpy.array([num_heights], dtype=int)
    )

    error_checking.assert_is_numpy_array(
        numpy.array(example_id_strings),
        exact_dimensions=numpy.array([num_examples], dtype=int)
    )
    example_utils.parse_example_ids(example_id_strings)

    error_checking.assert_is_string(model_file_name)
    if isotonic_model_file_name is None:
        isotonic_model_file_name = ''
    error_checking.assert_is_string(isotonic_model_file_name)

    # Write to NetCDF file.
    file_system_utils.mkdir_recursive_if_necessary(file_name=netcdf_file_name)
    dataset_object = netCDF4.Dataset(
        netcdf_file_name, 'w', format='NETCDF3_64BIT_OFFSET'
    )

    dataset_object.setncattr(MODEL_FILE_KEY, model_file_name)
    dataset_object.setncattr(ISOTONIC_MODEL_FILE_KEY, isotonic_model_file_name)

    num_examples = vector_target_matrix.shape[0]

    dataset_object.createDimension(EXAMPLE_DIMENSION_KEY, num_examples)
    dataset_object.createDimension(
        HEIGHT_DIMENSION_KEY, vector_target_matrix.shape[1]
    )
    dataset_object.createDimension(
        VECTOR_TARGET_DIMENSION_KEY, vector_target_matrix.shape[2]
    )

    num_scalar_targets = scalar_target_matrix.shape[1]
    if num_scalar_targets > 0:
        dataset_object.createDimension(
            SCALAR_TARGET_DIMENSION_KEY, scalar_target_matrix.shape[1]
        )

    if num_examples == 0:
        num_id_characters = 1
    else:
        num_id_characters = numpy.max(
            numpy.array([len(id) for id in example_id_strings])
        )

    dataset_object.createDimension(EXAMPLE_ID_CHAR_DIM_KEY, num_id_characters)

    this_string_format = 'S{0:d}'.format(num_id_characters)
    example_ids_char_array = netCDF4.stringtochar(
        numpy.array(example_id_strings, dtype=this_string_format)
    )

    dataset_object.createVariable(
        EXAMPLE_IDS_KEY, datatype='S1',
        dimensions=(EXAMPLE_DIMENSION_KEY, EXAMPLE_ID_CHAR_DIM_KEY)
    )
    dataset_object.variables[EXAMPLE_IDS_KEY][:] = numpy.array(
        example_ids_char_array
    )

    dataset_object.createVariable(
        HEIGHTS_KEY, datatype=numpy.float32, dimensions=HEIGHT_DIMENSION_KEY
    )
    dataset_object.variables[HEIGHTS_KEY][:] = heights_m_agl

    if num_scalar_targets > 0:
        dataset_object.createVariable(
            SCALAR_TARGETS_KEY, datatype=numpy.float32,
            dimensions=(EXAMPLE_DIMENSION_KEY, SCALAR_TARGET_DIMENSION_KEY)
        )
        dataset_object.variables[SCALAR_TARGETS_KEY][:] = scalar_target_matrix

        dataset_object.createVariable(
            SCALAR_PREDICTIONS_KEY, datatype=numpy.float32,
            dimensions=(EXAMPLE_DIMENSION_KEY, SCALAR_TARGET_DIMENSION_KEY)
        )
        dataset_object.variables[SCALAR_PREDICTIONS_KEY][:] = (
            scalar_prediction_matrix
        )

    these_dimensions = (
        EXAMPLE_DIMENSION_KEY, HEIGHT_DIMENSION_KEY,
        VECTOR_TARGET_DIMENSION_KEY
    )

    dataset_object.createVariable(
        VECTOR_TARGETS_KEY, datatype=numpy.float32,
        dimensions=these_dimensions
    )
    dataset_object.variables[VECTOR_TARGETS_KEY][:] = vector_target_matrix

    dataset_object.createVariable(
        VECTOR_PREDICTIONS_KEY, datatype=numpy.float32,
        dimensions=these_dimensions
    )
    dataset_object.variables[VECTOR_PREDICTIONS_KEY][:] = (
        vector_prediction_matrix
    )

    dataset_object.close()

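# --- Hedged usage sketch (editor addition) ---
# Writing a tiny prediction file and reading it back.  Shapes follow the
# docstring above (E examples, H = 3 heights, T_s = 1 scalar target, T_v = 1
# vector target).  The example IDs and model path are caller-supplied
# placeholders and must be valid for `example_utils.parse_example_ids` and
# `neural_net.read_model` in real use.
def _demo_write_and_read(netcdf_file_name, example_id_strings,
                         model_file_name):
    """Round-trips a small set of predictions through NetCDF (sketch)."""

    num_examples = len(example_id_strings)
    heights_m_agl = numpy.array([10., 100., 1000.])

    write_file(
        netcdf_file_name=netcdf_file_name,
        scalar_target_matrix=numpy.full((num_examples, 1), 300.),
        scalar_prediction_matrix=numpy.full((num_examples, 1), 290.),
        vector_target_matrix=numpy.full((num_examples, 3, 1), 1.),
        vector_prediction_matrix=numpy.full((num_examples, 3, 1), 1.1),
        heights_m_agl=heights_m_agl,
        example_id_strings=example_id_strings,
        model_file_name=model_file_name
    )
    return read_file(netcdf_file_name)
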
def get_raw_examples(example_file_name, num_examples, example_dir_name,
                     example_id_file_name):
    """Returns raw examples.

    The difference between `get_raw_examples` and `get_examples_for_inference`
    is that `get_raw_examples` returns examples in their raw form, *not*
    pre-processed to be fed through a model for inference.

    :param example_file_name: See doc for `get_examples_for_inference`.
    :param num_examples: Same.
    :param example_dir_name: Same.
    :param example_id_file_name: Same.
    :return: example_dict: See doc for `example_io.read_file`.
    """

    error_checking.assert_is_string(example_file_name)
    use_specific_ids = example_file_name == ''

    if use_specific_ids:
        error_checking.assert_is_string(example_id_file_name)

        print('Reading desired example IDs from: "{0:s}"...'.format(
            example_id_file_name
        ))
        example_id_strings = read_example_ids_from_netcdf(
            example_id_file_name
        )

        valid_times_unix_sec = example_utils.parse_example_ids(
            example_id_strings
        )[example_utils.VALID_TIMES_KEY]

        example_file_names = example_io.find_many_files(
            directory_name=example_dir_name,
            first_time_unix_sec=numpy.min(valid_times_unix_sec),
            last_time_unix_sec=numpy.max(valid_times_unix_sec)
        )

        num_files = len(example_file_names)
        example_dicts = [dict()] * num_files

        for i in range(num_files):
            print('Reading data from: "{0:s}"...'.format(
                example_file_names[i]
            ))
            example_dicts[i] = example_io.read_file(example_file_names[i])

        example_dict = example_utils.concat_examples(example_dicts)

        good_indices = example_utils.find_examples(
            all_id_strings=example_dict[example_utils.EXAMPLE_IDS_KEY],
            desired_id_strings=example_id_strings, allow_missing=False
        )
        example_dict = example_utils.subset_by_index(
            example_dict=example_dict, desired_indices=good_indices
        )
    else:
        error_checking.assert_is_string(example_dir_name)
        error_checking.assert_is_integer(num_examples)
        error_checking.assert_is_greater(num_examples, 0)

        print('Reading data from: "{0:s}"...'.format(example_file_name))
        example_dict = example_io.read_file(example_file_name)

        num_examples_total = len(example_dict[example_utils.VALID_TIMES_KEY])
        desired_indices = numpy.linspace(
            0, num_examples_total - 1, num=num_examples_total, dtype=int
        )

        if num_examples < num_examples_total:
            desired_indices = numpy.random.choice(
                desired_indices, size=num_examples, replace=False
            )

        example_dict = example_utils.subset_by_index(
            example_dict=example_dict, desired_indices=desired_indices
        )

    return example_dict

def _run(tropical_example_dir_name, non_tropical_example_dir_name, year,
         assorted1_example_dir_name, assorted2_example_dir_name):
    """Splits examples into Assorted1 and Assorted2 sites.

    This is effectively the main method.

    :param tropical_example_dir_name: See documentation at top of file.
    :param non_tropical_example_dir_name: Same.
    :param year: Same.
    :param assorted1_example_dir_name: Same.
    :param assorted2_example_dir_name: Same.
    """

    tropical_example_file_name = example_io.find_file(
        directory_name=tropical_example_dir_name, year=year,
        raise_error_if_missing=True
    )
    non_tropical_example_file_name = example_io.find_file(
        directory_name=non_tropical_example_dir_name, year=year,
        raise_error_if_missing=True
    )

    print('Reading data from: "{0:s}"...'.format(tropical_example_file_name))
    tropical_example_dict = example_io.read_file(tropical_example_file_name)

    print('Reading data from: "{0:s}"...'.format(
        non_tropical_example_file_name
    ))
    non_tropical_example_dict = example_io.read_file(
        non_tropical_example_file_name
    )

    example_dict = example_utils.concat_examples([
        tropical_example_dict, non_tropical_example_dict
    ])
    del tropical_example_dict, non_tropical_example_dict

    example_metadata_dict = example_utils.parse_example_ids(
        example_dict[example_utils.EXAMPLE_IDS_KEY]
    )
    example_latitudes_deg_n = (
        example_metadata_dict[example_utils.LATITUDES_KEY]
    )
    example_longitudes_deg_e = lng_conversion.convert_lng_positive_in_west(
        example_metadata_dict[example_utils.LONGITUDES_KEY]
    )

    example_coord_matrix = numpy.transpose(numpy.vstack((
        example_latitudes_deg_n, example_longitudes_deg_e
    )))
    assorted2_coord_matrix = numpy.transpose(numpy.vstack((
        ASSORTED2_LATITUDES_DEG_N, ASSORTED2_LONGITUDES_DEG_E
    )))

    distance_matrix_deg2 = euclidean_distances(
        X=example_coord_matrix, Y=assorted2_coord_matrix, squared=True
    )
    assorted2_flags = numpy.any(
        distance_matrix_deg2 <= TOLERANCE_DEG2, axis=1
    )

    assorted2_example_dict = example_utils.subset_by_index(
        example_dict=copy.deepcopy(example_dict),
        desired_indices=numpy.where(assorted2_flags)[0]
    )
    assorted2_example_file_name = example_io.find_file(
        directory_name=assorted2_example_dir_name, year=year,
        raise_error_if_missing=False
    )

    print('Writing {0:d} examples in set Assorted2 to: "{1:s}"...'.format(
        len(assorted2_example_dict[example_utils.VALID_TIMES_KEY]),
        assorted2_example_file_name
    ))
    example_io.write_file(
        example_dict=assorted2_example_dict,
        netcdf_file_name=assorted2_example_file_name
    )

    assorted1_example_dict = example_utils.subset_by_index(
        example_dict=example_dict,
        desired_indices=numpy.where(numpy.invert(assorted2_flags))[0]
    )
    assorted1_example_file_name = example_io.find_file(
        directory_name=assorted1_example_dir_name, year=year,
        raise_error_if_missing=False
    )

    print('Writing {0:d} examples in set Assorted1 to: "{1:s}"...'.format(
        len(assorted1_example_dict[example_utils.VALID_TIMES_KEY]),
        assorted1_example_file_name
    ))
    example_io.write_file(
        example_dict=assorted1_example_dict,
        netcdf_file_name=assorted1_example_file_name
    )

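# --- Hedged sketch (editor addition) ---
# Why the site matching above works: each example is flagged as Assorted2 if
# its squared Euclidean distance in (latitude, longitude) degree space to ANY
# Assorted2 site is within TOLERANCE_DEG2, so `numpy.any(..., axis=1)` over
# the example-by-site distance matrix yields one Boolean flag per example.
# Toy illustration with made-up coordinates:
def _demo_site_matching():
    """Illustrates the distance-based site matching (sketch)."""

    example_coord_matrix = numpy.array([[10., 100.], [50., 250.]])
    site_coord_matrix = numpy.array([[10.01, 100.01]])

    distance_matrix_deg2 = euclidean_distances(
        X=example_coord_matrix, Y=site_coord_matrix, squared=True
    )
    return numpy.any(distance_matrix_deg2 <= TOLERANCE_DEG2, axis=1)
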