def get_noaa_data(start_time, end_time):
    """Fetch NOAA ISD weather observations for the given time window.

    Downloads the selected columns from the NOAA ISD open dataset, keeps only
    rows whose ``usaf`` station id is in the module-level ``usaf_list``, and
    returns the filtered frame with a fresh 0..n-1 index.

    :param start_time: inclusive start of the window (datetime)
    :param end_time: inclusive end of the window (datetime)
    :return: filtered pandas DataFrame of training rows
    """
    columns = [
        'usaf', 'wban', 'datetime', 'latitude', 'longitude', 'elevation',
        'windAngle', 'windSpeed', 'temperature', 'stationName', 'p_k'
    ]
    isd = NoaaIsdWeather(start_time, end_time, cols=columns)
    noaa_df = isd.to_pandas_dataframe()

    # Keep only the stations of interest (usaf_list is defined at module
    # level elsewhere in this file).
    df_filtered = noaa_df[noaa_df["usaf"].isin(usaf_list)]

    # BUG FIX: reset_index returns a new DataFrame — the original discarded
    # the result, leaving the filtered frame with its sparse source index.
    df_filtered = df_filtered.reset_index(drop=True)

    print("Received {0} rows of training data between {1} and {2}".format(
        df_filtered.shape[0], start_time, end_time))
    return df_filtered
def __get_weather_data_for_day(self, day, lat, lon):
    '''
    Gets weather data for a given day and pushes it to eventhub.

    Pipeline: download the NOAA ISD data for `day`, locate the station
    nearest to (lat, lon), filter the frame to that station, push the
    rows to eventhub via __push_weather_data_to_farmbeats, and (when a
    job-status blob SAS URL is configured) record success/failure there.

    :param day: datetime of the single day to fetch (used as both start
        and end of the NoaaIsdWeather range)
    :param lat: target latitude to match against available stations
    :param lon: target longitude to match against available stations
    :raises JobError: wraps any failure as an INTERNAL_ERROR after the
        job-status blob (if configured) has been marked unsuccessful
    '''
    try:
        # get data for given date range.
        start_time = time.time()
        LOG.info("Getting data for " + day.strftime("%m/%d/%Y, %H:%M:%S"))
        # Same datetime for start and end: fetch just this one day.
        weather_data = NoaaIsdWeather(day, day)
        LOG.info("Successfully got data for " + day.strftime("%m/%d/%Y, %H:%M:%S"))

        # get the data into a pandas data frame, so we can filter and process
        weather_data_df = weather_data.to_pandas_dataframe()
        LOG.info("Took {} seconds to get the data.".format(time.time() - start_time))

        # out of the lat longs available get the nearest points
        LOG.info("Finding the nearest latitude and longitude from the available data")
        (nearest_lat, nearest_lon) = UtilFunctions.find_nearest_lat_longs_in_data(
            weather_data_df, lat, lon)
        LOG.info("nearest lat, lon: [" + str(nearest_lat) + "," + str(nearest_lon) + "]")

        # filter the data to this lat and lon
        # NOTE(review): exact float equality works here because nearest_lat /
        # nearest_lon are presumably values taken from this same frame —
        # confirm find_nearest_lat_longs_in_data returns in-frame values.
        LOG.info("Filtering the data to nearest lat, lon")
        filtered_weather_data = weather_data_df[
            (weather_data_df['latitude'] == nearest_lat)
            & (weather_data_df['longitude'] == nearest_lon)]
        LOG.info(filtered_weather_data)

        # push the data to eventhub
        LOG.info("Pushing data to eventhub")
        wdl_id = self.__push_weather_data_to_farmbeats(filtered_weather_data)
        LOG.info("Successfully pushed data")

        # Update the status for the job
        if FLAGS.job_status_blob_sas_url:
            msg = "Weather data pushed for start_date: {} to end_date: {}\n for nearest_lat: {}, nearest_lon: {}\n provided lat:{}, lon:{}".format(
                FLAGS.start_date, FLAGS.end_date, nearest_lat, nearest_lon,
                FLAGS.latitude, FLAGS.longitude)
            writer = JobStatusWriter(FLAGS.job_status_blob_sas_url)
            output_writer = writer.get_output_writer()
            # NOTE(review): prop keys carry a trailing ": " — confirm the
            # job-status consumer expects that exact key format.
            output_writer.set_prop("WeatherDataLocationId: ", wdl_id)
            output_writer.set_prop("Message: ", msg)
            writer.set_success(True)
            writer.flush()
    except Exception as err:
        # Update the status in failure
        if FLAGS.job_status_blob_sas_url:
            writer = JobStatusWriter(FLAGS.job_status_blob_sas_url)
            writer.set_success(False)
            writer.flush()
        # Re-raise as the job-level error type so the caller sees one
        # uniform failure surface.
        raise JobError(str(err), JobConstants.INTERNAL_ERROR, False)
def __get_weather_data_for_date_range(self, start_date, end_date):
    '''
    Returns all the weather data for a given date range.

    :param start_date: range start — datetime, or a date string parseable
        by dateutil
    :param end_date: range end — datetime, or a date string parseable
        by dateutil
    :return: NoaaIsdWeather dataset covering [start_date, end_date]
    '''
    # BUG FIX: the original ignored both arguments and re-parsed
    # FLAGS.start_date / FLAGS.end_date, so callers could never request a
    # different range. Honor the caller's values; parse only when they
    # arrive as strings.
    if isinstance(start_date, str):
        start_date = parser.parse(start_date)
    if isinstance(end_date, str):
        end_date = parser.parse(end_date)
    return NoaaIsdWeather(start_date, end_date)
# BUG FIX: the script used calendar.monthrange and os.makedirs without
# importing calendar or os, raising NameError on first use.
import os
from calendar import monthrange
from datetime import datetime, timedelta

from azureml.core import Dataset, Datastore, Workspace
from azureml.opendatasets import NoaaIsdWeather

# get workspace and datastore
ws = Workspace.from_config()
dstore = ws.get_default_datastore()

# adjust parameters as needed
target_years = list(range(2010, 2020))
start_month = 1

# get data: one parquet file per (year, month), Florida stations only
for year in target_years:
    for month in range(start_month, 12 + 1):
        path = 'weather-data/{}/{:02d}/'.format(year, month)
        try:
            start = datetime(year, month, 1)
            # monthrange(...)[1] is the last day of the month; +1 day makes
            # the range cover the full final day.
            end = datetime(year, month, monthrange(year, month)[1]) + timedelta(days=1)
            isd = NoaaIsdWeather(start, end).to_pandas_dataframe()
            isd = isd[isd['stationName'].str.contains('FLORIDA', regex=True, na=False)]
            os.makedirs(path, exist_ok=True)
            isd.to_parquet(path + 'data.parquet')
        except Exception as e:
            # Best-effort: some months have no data; report and continue.
            print('Month {} in year {} likely has no data.\n'.format(
                month, year))
            print('Exception: {}'.format(e))
# imports
import pickle
from datetime import datetime

from azureml.opendatasets import NoaaIsdWeather
from sklearn.linear_model import LinearRegression

# Pull two weeks of NOAA ISD weather observations (Jan 1-14, 2019).
start = datetime(2019, 1, 1)
end = datetime(2019, 1, 14)
isd = NoaaIsdWeather(start, end)

# Load into pandas, zero-fill missing values, keep Florida stations only.
df = isd.to_pandas_dataframe().fillna(0)
florida_rows = df['stationName'].str.contains('FLORIDA', regex=True, na=False)
df = df[florida_rows]

# Column names: inputs for training and the regression target.
X_features = ['latitude', 'longitude', 'temperature', 'windAngle', 'windSpeed']
y_features = ['elevation']

# Persist the training dataset for later reference.
training_dataset = df[X_features + y_features]
training_dataset.to_csv('training.csv', index=False)

# Fit an ordinary least-squares model predicting elevation from the features.
X = training_dataset[X_features]
y = training_dataset[y_features]
model = LinearRegression().fit(X, y)

# Serialize the fitted model to disk.
with open('elevation-regression-model.pkl', 'wb') as f:
    pickle.dump(model, f)