def test_combine_first_both_none(self):
    """Combine row 0 of the first fixture with row 3 of the second.

    Expects a single output row in which both value columns equal 2.
    """
    left = COMBINE_TEST_DATA_1.copy()
    right = COMBINE_TEST_DATA_2.copy()

    combined = combine_tables([left[0:1], right[3:4]], ["key"])

    self.assertEqual(1, len(combined))
    # Checked in the same order as the columns are listed here
    expected_values = (("value_column_1", 2), ("value_column_2", 2))
    for column, expected in expected_values:
        self.assertEqual(expected, combined.loc[0, column])
def test_combine_second_right_none(self):
    """Combine row 3 of the first fixture with row 2 of the second.

    Expects a single output row with value_column_1 == 2 and
    value_column_2 == 1.
    """
    left = COMBINE_TEST_DATA_1.copy()
    right = COMBINE_TEST_DATA_2.copy()

    combined = combine_tables([left[3:4], right[2:3]], ["key"])

    self.assertEqual(1, len(combined))
    # Checked in the same order as the columns are listed here
    expected_values = (("value_column_1", 2), ("value_column_2", 1))
    for column, expected in expected_values:
        self.assertEqual(expected, combined.loc[0, column])
def test_combine_all_none(self):
    """Combine row 0 of both fixtures.

    Expects a single output row in which both value columns are null.
    """
    left = COMBINE_TEST_DATA_1.copy()
    right = COMBINE_TEST_DATA_2.copy()

    combined = combine_tables([left[0:1], right[0:1]], ["key"])

    self.assertEqual(1, len(combined))
    for column in ("value_column_1", "value_column_2"):
        self.assertTrue(isnull(combined.loc[0, column]))
def test_combine_second_left_none(self):
    """Combine row 3 of the first fixture with row 1 of the second.

    Expects a single output row with value_column_1 == "1" and
    value_column_2 == "2" (string values).
    """
    left = TEST_DATA_1.copy()
    right = TEST_DATA_2.copy()

    combined = combine_tables([left[3:4], right[1:2]], ["key"])

    self.assertEqual(1, len(combined))
    # Checked in the same order as the columns are listed here
    expected_values = (("value_column_1", "1"), ("value_column_2", "2"))
    for column, expected in expected_values:
        self.assertEqual(expected, combined.loc[0, column])
def test_combine_first_right_none(self):
    """Combine row 2 of the first fixture with row 3 of the second.

    Expects a single output row in which both value columns equal "2"
    (string values).
    """
    left = TEST_DATA_1.copy()
    right = TEST_DATA_2.copy()

    combined = combine_tables([left[2:3], right[3:4]], ["key"])

    self.assertEqual(1, len(combined))
    # Checked in the same order as the columns are listed here
    expected_values = (("value_column_1", "2"), ("value_column_2", "2"))
    for column, expected in expected_values:
        self.assertEqual(expected, combined.loc[0, column])
def process_location(station_cache: Dict[str, DataFrame], stations: DataFrame, location: Series):
    """Build the combined records for one location from already-cached stations.

    Arguments:
        station_cache: Mapping of station ID to that station's records. It is
            only read here; stations missing from it are skipped.
        stations: Table of known stations; must provide an `id` column plus
            whatever `haversine_distance` needs to compute distances.
        location: Record exposing `key`, `lat` and `lon`.

    Returns:
        An empty DataFrame with `_OUTPUT_COLUMNS` when no station lies within
        `_DISTANCE_THRESHOLD` or none of the candidates is cached. Otherwise
        the output of `combine_tables`, restricted to the columns named in
        `_OUTPUT_COLUMNS` that are actually present.
    """
    candidates = stations.copy()
    candidates["key"] = location.key

    # Distance from every known station to this location
    candidates["distance"] = NoaaGsodDataSource.haversine_distance(
        candidates, location.lat, location.lon
    )

    # Keep at most the 10 closest stations that fall within the threshold
    within_range = candidates[candidates["distance"] < _DISTANCE_THRESHOLD]
    candidates = within_range.sort_values("distance").iloc[:10]

    # Early exit: nothing in range, or nothing in range has cached records
    has_cached_station = any(
        station_id in station_cache for station_id in candidates["id"].values
    )
    if len(candidates) == 0 or not has_cached_station:
        return DataFrame(columns=_OUTPUT_COLUMNS)

    candidates = candidates.rename(
        columns={"id": "noaa_station", "distance": "noaa_distance"}
    )

    # Attach the station metadata to the records of every cached station
    merged_tables = []
    for station_id in candidates["noaa_station"].values:
        cached = station_cache.get(station_id)
        if cached is not None:
            merged_tables.append(cached.merge(candidates, on="noaa_station"))

    # Tables are handed over farthest-first; presumably later tables take
    # precedence in combine_tables -- TODO confirm against its implementation
    combined = combine_tables(reversed(merged_tables), ["date", "key"])

    # Return all the available data from the records
    available = [name for name in _OUTPUT_COLUMNS if name in combined.columns]
    return combined[available]
def station_records(station_cache: Dict[str, DataFrame], stations: DataFrame, location: Series):
    """Build the combined records for one location, downloading stations as needed.

    Arguments:
        station_cache: Mapping of station ID to that station's records. It is
            mutated: every nearby station not yet present is downloaded and
            stored under its ID.
        stations: Table of known stations; must provide an `id` column plus
            whatever `haversine_distance` needs to compute distances.
        location: Record exposing `key`, `lat` and `lon`.

    Returns:
        An empty DataFrame with `_OUTPUT_COLUMNS` when no station lies within
        `_DISTANCE_THRESHOLD`. Otherwise the output of `combine_tables`,
        restricted to the columns named in `_OUTPUT_COLUMNS` that are
        actually present.
    """

    def is_mapped_column(column):
        # Only read the CSV columns that we know how to rename
        return column in _COLUMN_MAPPING

    candidates = stations.copy()
    candidates["key"] = location.key

    # Distance from every known station to this location
    candidates["distance"] = NoaaGhcnDataSource.haversine_distance(
        candidates, location.lat, location.lon
    )

    # Keep at most the 20 closest stations that fall within the threshold
    within_range = candidates[candidates["distance"] < _DISTANCE_THRESHOLD]
    candidates = within_range.sort_values("distance").iloc[:20]

    # Early exit: no stations found within distance threshold
    if len(candidates) == 0:
        return DataFrame(columns=_OUTPUT_COLUMNS)

    # Download the records of every nearby station that is not cached yet
    for station_id in candidates["id"].values:
        if station_id in station_cache:
            continue

        # Use our mirror since NOAA's website is very flaky
        records = read_csv(
            _STATION_URL_TPL.format(station_id), usecols=is_mapped_column
        )
        records = records.rename(columns=_COLUMN_MAPPING)

        # Convert temperature to correct values
        for column in ("minimum_temperature", "maximum_temperature"):
            records[column] = records[column].apply(NoaaGhcnDataSource.fix_temp)

        # Keep only the records dated after 2019-12-31, then cache them
        station_cache[station_id] = records[records["date"] > "2019-12-31"]

    candidates = candidates.rename(
        columns={"id": "noaa_station", "distance": "noaa_distance"}
    )

    # Attach the station metadata to each station's cached records; the merge
    # uses whichever columns the two tables have in common
    merged_tables = [
        station_cache[station_id].merge(candidates)
        for station_id in candidates["noaa_station"].values
    ]

    # Tables are handed over farthest-first; presumably later tables take
    # precedence in combine_tables -- TODO confirm against its implementation
    combined = combine_tables(reversed(merged_tables), ["date", "key"])

    # Return all the available data from the records
    available = [name for name in _OUTPUT_COLUMNS if name in combined.columns]
    return combined[available]