def get(cls):
    if cls.fse is None:
        from azure.storage.blob import BlobClient

        logger.debug("========PREPARING TO DOWNLOAD FSE MODEL")
        storage_account_name = args.storage_account_name
        container_client_credential = ClientSecretCredential(
            tenant_id=args.directory_id,
            client_id=args.application_id,
            client_secret=SERVICE_PRINCIPAL_SECRET)
        blob = BlobClient(
            account_url=f"https://{storage_account_name}.blob.core.windows.net",
            container_name=args.domain_expert_container_name,
            blob_name=args.fse_folder + "/" + args.fse_file,
            credential=container_client_credential)
        file_size = blob.get_blob_properties().size
        if not os.path.exists(args.fse_file) or (
                os.path.exists(args.fse_file)
                and os.stat(args.fse_file).st_size != file_size):
            with open(args.fse_file, "wb") as my_blob:
                blob_data = blob.download_blob()
                blob_data.readinto(my_blob)
        fse, ilist, jobs_lookup = dill.load(gzip.open(args.fse_file, "rb"))
        cls.fse = fse
        cls.ilist = ilist
        cls.jobs_lookup = jobs_lookup
    return cls.fse, cls.ilist, cls.jobs_lookup
def test_user_delegation_sas_for_container(self):
    # SAS URL is calculated from storage key, so this test runs live only
    pytest.skip("Current Framework Cannot Support OAUTH")
    if TestMode.need_recording_file(self.test_mode):
        return

    # Arrange
    token_credential = self.generate_oauth_token()
    service_client = BlobServiceClient(self._get_oauth_account_url(), credential=token_credential)
    user_delegation_key = service_client.get_user_delegation_key(
        datetime.utcnow(), datetime.utcnow() + timedelta(hours=1))

    container_client = service_client.create_container(self.get_resource_name('oauthcontainer'))
    token = container_client.generate_shared_access_signature(
        expiry=datetime.utcnow() + timedelta(hours=1),
        permission=ContainerPermissions.READ,
        user_delegation_key=user_delegation_key,
        account_name='emilydevtest')

    blob_client = container_client.get_blob_client(self.get_resource_name('oauthblob'))
    blob_content = self.get_random_text_data(1024)
    blob_client.upload_blob(blob_content, length=len(blob_content))

    # Act
    new_blob_client = BlobClient(blob_client.url, credential=token)
    content = new_blob_client.download_blob()

    # Assert
    self.assertEqual(blob_content, b"".join(list(content)).decode('utf-8'))
def blob_to_df(blob_client: BlobClient,
               pandas_kwargs: Optional[Dict[str, Any]] = None) -> pd.DataFrame:
    """Download a blob and return a pandas DataFrame."""
    # check for kwargs
    if not pandas_kwargs:
        pandas_kwargs = {}

    # check the file extension
    extension = Path(blob_client.blob_name).suffix

    # download blob and return DataFrame
    data_stream = io.BytesIO(blob_client.download_blob().readall())
    if extension == ".csv":
        return pd.read_csv(data_stream, **pandas_kwargs)
    if extension == ".txt":
        return pd.read_table(data_stream, **pandas_kwargs)
    if extension == ".json":
        return pd.read_json(data_stream, **pandas_kwargs)
    if extension in [".xls", ".xlsx"]:
        if extension == ".xls" and "engine" not in pandas_kwargs.keys():
            pandas_kwargs.update({"engine": "xlrd"})
        elif "engine" not in pandas_kwargs.keys():
            pandas_kwargs.update({"engine": "openpyxl"})
        return pd.read_excel(data_stream, **pandas_kwargs)
    raise TypeError(f"{extension} files are not yet supported.")
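# A minimal usage sketch for blob_to_df above, assuming a SAS-authenticated
# client; the account URL, container, blob name, and token below are
# hypothetical placeholders rather than values taken from these examples.
from azure.storage.blob import BlobClient

example_client = BlobClient(
    account_url="https://<account>.blob.core.windows.net",  # hypothetical
    container_name="<container>",                           # hypothetical
    blob_name="data/example.csv",                           # hypothetical
    credential="<sas-token>")                               # hypothetical
df = blob_to_df(example_client, pandas_kwargs={"sep": ","})
print(df.head())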
def test_get_blob_by_full_url(storage_account, sas_token):
    client = BlobClient(
        f"https://{storage_account}.blob.core.windows.net/{CONTAINER_NAME}/{FILE_NAME}",
        credential=sas_token,
    )
    content = client.download_blob().content_as_text()
    assert len(content) > 0
def download_blob(container_name, blob_name):
    blob = BlobClient(env.REPORTS_STGACCT_URI, container_name, blob_name, credential=CREDENTIALS)
    blob_data = blob.download_blob()
    return blob_data
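# A short usage sketch for download_blob above: the function returns the
# downloader object from BlobClient.download_blob(), so the caller reads the
# bytes out of it; "reports" and "monthly.csv" are hypothetical names.
downloader = download_blob("reports", "monthly.csv")
report_bytes = downloader.readall()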
def downloadFile(funcData: dict, blobClient: BlobClient) -> str:
    try:
        localFileName = getUniqueTempFileName()
        with open(localFileName, 'wb') as download_file:
            download_file.write(blobClient.download_blob().readall())
        return localFileName
    except Exception as _e:
        raise RuntimeError(
            'Exiting: unable to download source file: {:s}. Error message: {:s}'.format(
                json.dumps(funcData), str(_e))) from _e
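# getUniqueTempFileName is referenced above but not defined in the snippet;
# a minimal sketch, assuming any unique path in the system temp directory is
# acceptable for the downloaded copy.
import os
import tempfile


def getUniqueTempFileName() -> str:
    # mkstemp creates the file and returns an open descriptor plus its path;
    # close the descriptor so the caller can reopen the path for writing.
    fd, path = tempfile.mkstemp()
    os.close(fd)
    return path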
def download_blob(blob_id: str, local_path: str) -> None:
    """Download file from remote storage to local path."""
    bc = BlobClient(account_url=account_url,
                    container_name=facts_container,
                    blob_name=blob_id,
                    snapshot=None,
                    credential=facts_sas_token)
    with open(local_path, "wb") as download_file:
        download_file.write(bc.download_blob().readall())
    return
def get(cls, args, SERVICE_PRINCIPAL_SECRET, logger):
    if "model" not in cls.gensim_model:
        from azure.storage.blob import BlobClient

        # logger.debug("========PREPARING TO SETUP GENSIM")
        storage_account_name = args.storage_account_name
        container_client_credential = ClientSecretCredential(
            tenant_id=args.directory_id,
            client_id=args.application_id,
            client_secret=SERVICE_PRINCIPAL_SECRET)
        blob = BlobClient(
            account_url=f"https://{storage_account_name}.blob.core.windows.net",
            container_name=args.domain_expert_container_name,
            blob_name=args.domain_expert_folder_path + "/" + args.domain_expert_file_name,
            credential=container_client_credential)
        file_size = blob.get_blob_properties().size
        if not os.path.exists(args.domain_expert_file_name) or (
                os.path.exists(args.domain_expert_file_name)
                and os.stat(args.domain_expert_file_name).st_size != file_size):
            print("getting domain expert")
            with open(args.domain_expert_file_name, "wb") as my_blob:
                if logger is not None:
                    logger.debug("downloading domain expert")
                else:
                    print("downloading domain expert")
                blob_data = blob.download_blob()
                blob_data.readinto(my_blob)
                # logger.debug("end downloading")
        if logger is not None:
            logger.debug("after domain expert download")
        else:
            print("after domain expert download")
        from gensim.models import Word2Vec
        try:
            cls.gensim_model["model"] = Word2Vec.load(args.domain_expert_file_name)
            # logger.debug("========MODEL SET")
            cls.gensim_model["vocab"] = cls.gensim_model["model"].wv.vocab
            # logger.debug("========VOCAB SET")
        except Exception as e:
            if logger is not None:
                logger.debug(f"******************========== EXCEPTION ENCOUNTERED {e}")
            else:
                print(f"******************========== EXCEPTION ENCOUNTERED {e}")
    return cls.gensim_model
def test_create_container_with_public_access_blob(self):
    # Arrange
    container_name = self._get_container_reference()

    # Act
    container = self.bsc.get_container_client(container_name)
    created = container.create_container(public_access='blob')

    blob = container.get_blob_client("blob1")
    blob.upload_blob(u'xyz')

    anonymous_service = BlobClient(self._get_account_url(),
                                   container=container_name,
                                   blob="blob1")

    # Assert
    self.assertTrue(created)
    anonymous_service.download_blob()
def __getitem__(self, item: str):
    blob = BlobClient(account_url=self.account_url,
                      container_name=self.container_name,
                      blob_name=item,
                      credential=self.credential)
    blob_data = b''
    for data in blob.download_blob():
        blob_data += data
    return blob_data
def download_from_azure(self, source_file, dest_file):
    try:
        blob = BlobClient(account_url=env.AZURE_STORAGE_ACCOUNT,
                          container_name=self.container_name,
                          blob_name=source_file,
                          credential=env.STORAGE_SAS_KEY)
        with open(join_paths([get_project_root(), dest_file]), "wb+") as f:
            data = blob.download_blob()
            data.readinto(f)
    except azure.core.exceptions.ResourceNotFoundError:
        logger.info('Blob not found %s', source_file)
def download_file(self, filename):
    src_blob = BlobClient(
        self.blob_service_client.url,
        container_name=self.settings.storage_container,
        blob_name=filename,
        credential=self.sas_token
    )
    path_to_file = os.path.dirname(filename)
    download_path_for_file = os.path.join(self.settings.download_location, path_to_file)
    os.makedirs(download_path_for_file, exist_ok=True)

    download_file_path = os.path.join(self.settings.download_location, filename)
    print("\nDownloading blob to \n\t" + download_file_path)
    with open(download_file_path, "wb") as download_file:
        download_file.write(src_blob.download_blob().readall())
    return
def _download(account_url: str, container_name: str, credential: str,
              blob_name: str) -> Tuple[Optional[bytes], str, Optional[str]]:
    """Internal method to download YAMLs from Azure via BlobClient.

    Arguments:
        account_url {str} -- Azure account url
        container_name {str} -- Azure container name
        credential {str} -- Azure credential token
        blob_name {str} -- Blob name to download

    Returns:
        tuple -- URL content, target file and placeholder for error
    """
    blob = BlobClient(account_url=account_url,
                      container_name=container_name,
                      credential=credential,
                      blob_name=blob_name)
    return (blob.download_blob().readall(), blob_name, None)
def downloadapi(container, blob, accountname=None, accountkey=None, replace=True, blob_target=None):
    block_blob_service = BlobClient(accountname, container, blob, credential=accountkey)
    blob_target = blob_target or os.path.join(os.getcwd(), blob)
    if not replace and os.path.isfile(blob_target):
        logger.info(
            "will skip download, {} already exists and replace=False".format(blob_target))
        return
    logger.info("downloading '{}/{}' to '{}'".format(container, blob, blob_target))
    with open(blob_target, "wb") as f:
        blob_data = block_blob_service.download_blob()
        blob_data.readinto(f)
    logger.info("finished download")
def get(cls, args, SERVICE_PRINCIPAL_SECRET, logger):
    if cls.big_taxo is None:
        from azure.storage.blob import BlobClient

        if logger is not None:
            logger.debug("========PREPARING TO DOWNLOAD TAXONOMY")
        else:
            print("========PREPARING TO DOWNLOAD TAXONOMY")
        storage_account_name = args.storage_account_name
        container_client_credential = ClientSecretCredential(
            tenant_id=args.directory_id,
            client_id=args.application_id,
            client_secret=SERVICE_PRINCIPAL_SECRET)
        blob = BlobClient(
            account_url=f"https://{storage_account_name}.blob.core.windows.net",
            container_name=args.domain_expert_container_name,
            blob_name=args.domain_expert_folder_path + "/taxo/" + "taxo_de.dill.gz",
            credential=container_client_credential)
        file_size = blob.get_blob_properties().size
        if not os.path.exists("taxo_de.dill.gz") or (
                os.path.exists("taxo_de.dill.gz")
                and os.stat("taxo_de.dill.gz").st_size != file_size):
            with open("taxo_de.dill.gz", "wb") as my_blob:
                # print("downloading")
                blob_data = blob.download_blob()
                blob_data.readinto(my_blob)
        general_taxo_dict = dill.load(gzip.open("taxo_de.dill.gz", "rb"))
        cls.big_taxo = general_taxo_dict
    return cls.big_taxo
def download_blob(blob: BlobClient, blob_info: BlobProperties, destination: str,
                  overwrite: bool, attempt=0) -> dict:
    destination_filename = pathlib.Path(f'{destination}/{blob.blob_name}')
    blob_md5 = blob_info['metadata']['md5']
    operation = {'operation': 'no-op'}  # Default return

    if not overwrite and os.path.isfile(destination_filename):
        log.error(f'file {destination_filename} already exists and is not set to overwrite.')
        local_md5 = get_md5sum(destination_filename)
        log.error(f'local md5: {local_md5}, azure md5: {blob_md5}')
        return operation
    elif overwrite and os.path.isfile(destination_filename):
        local_md5 = get_md5sum(destination_filename)
        log.info(f'file {destination_filename} already exists locally. md5: {local_md5}')
        if local_md5 == blob_md5:
            log.info(f'local md5sum matches azure md5sum of {local_md5}')
            return operation

    log.info(f'Downloading {blob.blob_name} to {destination}/{blob.blob_name}.')
    log.debug('Creating path %s.', destination_filename.parent)
    os.makedirs(destination_filename.parent, exist_ok=True)
    with open(destination_filename, 'wb') as fp:
        download_client = blob.download_blob()
        operation = download_client.download_to_stream(fp)

    local_md5 = get_md5sum(destination_filename)
    if local_md5 == blob_md5:
        log.info(f'downloaded local md5sum of {destination_filename} matches azure md5sum of {local_md5}')
    else:
        log.error(f'downloaded file {destination_filename} md5sum mismatch with cloud.')
        if attempt >= 2:
            log.error(f'{destination_filename} md5sum mismatch after 3 tries downloading, giving up.')
            return operation
        attempt += 1
        operation = download_blob(blob, blob_info, destination, overwrite, attempt)
    return operation
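# get_md5sum is referenced above but not defined in the snippet; a minimal
# sketch, assuming the blob's 'md5' metadata entry holds a hex-encoded digest
# of the file contents.
import hashlib


def get_md5sum(path) -> str:
    md5 = hashlib.md5()
    with open(path, 'rb') as fp:
        # Read in chunks so large downloads do not need to fit in memory.
        for chunk in iter(lambda: fp.read(8192), b''):
            md5.update(chunk)
    return md5.hexdigest()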
            # (snippet begins mid-function; the opening of sublists_to_dummies is not included)
            frame.loc[key] = f
        else:
            f = np.zeros(len(categories))
            for j in i[sublist]:
                f[categories.index(j)] = 1
            frame.loc[d] = f
    except:
        pass
    ann = annotator.split("-")[4]
    final_frame = frame.add_prefix(str(ann))
    return final_frame


filename = "../data/temp_annotations.jsonl"
with open(filename, "wb") as f:
    f.write(blob.download_blob().readall())

annotations = list(read_jsonl(filename))
uq_annotators = set([x["_session_id"] for x in annotations])
dfs = []
for annotator in uq_annotators:
    sub_annotations = [an for an in annotations if an["_session_id"] == annotator]
    sub_df = pd.DataFrame(sub_annotations)
    dummy_df = sublists_to_dummies(sub_df, "accept", annotator=annotator, index_key="text")
    final_df = dummy_df.reset_index()
    dfs.append(final_df)
df_final = reduce(lambda left, right: pd.merge(left, right, on="index"), dfs)
print(df_final)
import azure.functions as func
from azure.storage.blob import BlobClient
import pickle
import pandas as pd

blob = BlobClient(
    account_url="https://blobhikeathon.blob.core.windows.net",
    container_name="blobcon",
    blob_name="model.pkl",
    credential="qC8kJ7CvBvoEDAFHrNy2E3VJNCKFXkEyh2wb2yozxOkN+r7yGBgYxMy+cwS8UjEjj7hm3+tQWAj0bzAp3YVZog=="
)
with open("model.pkl", "wb") as f:
    data = blob.download_blob()
    data.readinto(f)
model = pickle.load(open('model.pkl', 'rb'))

blob = BlobClient(
    account_url="https://blobhikeathon.blob.core.windows.net",
    container_name="blobcon",
    blob_name="test.csv",
    credential="qC8kJ7CvBvoEDAFHrNy2E3VJNCKFXkEyh2wb2yozxOkN+r7yGBgYxMy+cwS8UjEjj7hm3+tQWAj0bzAp3YVZog=="
)
with open("test.csv", "wb") as f:
    data = blob.download_blob()
    data.readinto(f)
def main(mytimer: func.TimerRequest) -> None:
    utc_timestamp = datetime.datetime.utcnow().replace(
        tzinfo=datetime.timezone.utc).isoformat()

    # Determine execution environment
    # if "rdo-env-production" in dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get():
    #     DEVELOPMENT = False

    historical_period = 5  # days
    forecasts_per_day = 4  # 0, 6, 12, 18

    def d_print(phrase):
        print("{} {}".format(datetime.datetime.now(), phrase))

    ## North American Mesoscale forecast for Alaska
    nam_AK_forecast_url_gf = "https://nomads.ncep.noaa.gov/cgi-bin/filter_nam_alaskanest.pl"
    nam_AK_forecast_dir_format = "nam.{}{:02d}{:02d}"
    # {1} is the year in YYYY format
    # {2} is the month in MM format
    # {3} is the day in DD format
    nam_AK_forecast_file_format = "nam.t{:02d}z.alaskanest.hiresf{:02d}.tm00.grib2"
    # {1} is time the forecast is made in hours 00, 06, 12, 18
    # {2} is the forecast hour (in the future) 00, 01, 02...60
    nam_AK_forecast_folder = "nam_alaska_forecasts"
    nam_AK_forecast_var_list = ['PRES',   # Pressure (Pa)
                                'TMP',    # Temperature (K)
                                'SPFH',   # Specific humidity (kg/kg)
                                'SSRUN',  # Storm surface runoff (kg/m^2)
                                'BGRUN',  # Baseflow-groundwater runoff (kg/m^2)
                                'SNOD',   # Snow depth (m)
                                'EVP',    # Evaporation (kg/m^2)
                                'PRATE',  # Precipitation rate (kg/m^2/s)
                                'DSWRF',  # Downward short-wave radiation (W/m^2)
                                'DLWRF',  # Downward long-wave radiation (W/m^2)
                                'SOILM',  # Soil moisture content (kg/m^2)
                                'TSOIL',  # Soil temperature (K)
                                'APCP',   # Total precipitation (kg/m^2)
                                'UGRD',   # U-component of wind (m/s)
                                'VGRD',   # V-component of wind (m/s)
                                'CNWAT',  # Plant canopy surface water (kg/m^2)
                                'NCPCP',  # Large-scale precipitation (non-convective) (kg/m^2)
                                'LHTFL']  # Latent heat net flux (W/m^2)

    ## Inputs
    red_dog_lat = 68.0756
    red_dog_lon = -162.8561
    red_dog_NAM_lat = 68.07837   # closest points in the NAM dataset
    red_dog_NAM_lon = -162.85785
    start_hour = 0
    end_hour = 45

    # ADLS path to save files
    save_directory_folder = "Raw_Data/Weather_NAM/"
    model_output_file_name_fmt = "Model_Data/Model_Output_TDS/{}"
    # if DEVELOPMENT:
    #     save_directory_folder = save_directory_folder.replace("Raw_Data", "Raw_Data_Dev")
    #     model_output_file_name_fmt = model_output_file_name_fmt.replace("Model_Data", "Model_Data_Dev")
    save_directory_fmt = save_directory_folder + "{}{:02d}{:02d}/{:02d}/"  # YYYYMMDD/HH

    # Query parameters for file retrieval
    query_params = {
        'file': None,                        # file name
        'subregion': 'on',                   # retrieve data within bounds
        'leftlon': red_dog_lon - 0.29,       # left longitude bounds
        'rightlon': red_dog_lon + 0.21,      # right longitude bounds
        'toplat': red_dog_lat + 0.21,        # top latitude bounds
        'bottomlat': red_dog_lat - 0.30,     # bottom latitude bounds
        'dir': None,                         # forecast directory
        'lev_surface': 'on',                 # surface level
        'lev_10_m_above_ground': 'on',       # 10 m above ground
        'lev_0-2_m_below_ground': 'on',      # 0-2 m below ground
        'lev_0-0.1_m_below_ground': 'on',    # 0-0.1 m below ground
        'lev_0.1-0.4_m_below_ground': 'on',  # 0.1-0.4 m below ground
        'lev_0-1_m_below_ground': 'on',      # 0-1 m below ground
        'lev_0.4-1_m_below_ground': 'on'     # 0.4-1 m below ground
    }

    # Current time in UTC
    current_time = datetime.datetime.utcnow()
    d_print(">>> Downloading NAM Alaska forecasts ({})".format(current_time))

    # Populate forecast directory
    forecast_dir = nam_AK_forecast_dir_format.format(current_time.year, current_time.month, current_time.day)
    d_print("... using directory: " + forecast_dir)

    # Populate query parameters (sans file name)
    query_params["dir"] = "/" + forecast_dir
    query_params = {**query_params, **{"var_" + v: "on" for v in nam_AK_forecast_var_list}}

    # Define download parameters and create directory if not existing
    forecast_prod_hour = current_time.hour - (current_time.hour % 6)  # 0, 6, 12, 18
    d_print("... using forecast production hour: {}".format(forecast_prod_hour))
    save_directory = save_directory_fmt.format(current_time.year, current_time.month,
                                               current_time.day, forecast_prod_hour)

    temp_dir = tempfile.TemporaryDirectory()
    print(temp_dir.name)

    # Loop through forecast files and download
    start_download_time = time.time()
    for forecast_hour in range(start_hour, end_hour + 1):
        query_params["file"] = nam_AK_forecast_file_format.format(forecast_prod_hour, forecast_hour)
        d_print("... ... retrieving data from file: " + query_params["file"])
        forecast_response_temp_file_name = temp_dir.name + "/" + query_params["file"]
        print("forecast_response_temp_file_name", forecast_response_temp_file_name)
        blob_path_file_name = save_directory + query_params["file"]

        # Create the BlobServiceClient object which will be used to create a container client
        conn_str = os.environ["AzureBlobStorage"]
        blob_service_client = BlobServiceClient.from_connection_string(conn_str)

        # Container name
        container_name = os.environ["ContainerName"]

        # Create the container client
        container_client = blob_service_client.get_container_client(container_name)

        #### UPLOAD TO AZURE ####
        # Create a blob client using the local file name as the name for the blob
        blob_client = blob_service_client.get_blob_client(container=container_name, blob=blob_path_file_name)

        retries = 0
        while True:
            try:
                response = requests.get(nam_AK_forecast_url_gf, params=query_params, timeout=120)
                response.raise_for_status()  # Raise exception for bad status code
                open(forecast_response_temp_file_name, "wb").write(response.content)

                print("\nUploading to Azure Storage as blob:\n\t" + blob_path_file_name)
                # Upload the created file to Azure Blob
                with open(forecast_response_temp_file_name, "rb") as data:
                    blob_client.upload_blob(data, overwrite=True)
                break
            except:
                retries += 1
                time.sleep(retries * 5)
                if retries == 2:
                    raise Exception("Max number of retries exceeded")

    d_print("<<< Download complete in {:.2f} seconds".format(time.time() - start_download_time))

    d_print(">>> Starting post processing from GRIB2 to Parquet")
    data = {}
    for forecast_hour in range(start_hour, end_hour + 1):
        file_path = temp_dir.name + "/" + nam_AK_forecast_file_format.format(forecast_prod_hour, forecast_hour)
        print("file_path", file_path)
        grb = pygrib.open(file_path)
        for measure in grb:
            # Split label string into constituents
            # Ex:
            # 1:Surface pressure:Pa (instant):polar_stereographic:surface:level 0:fcst time 30 hrs:from 202004061800
            description = str(measure).split(":")[1].replace(" ", "_")
            measurement_type = str(measure).split(":")[2].split("(")[1].strip(")")
            level = "_".join(str(measure).split(":")[5].split()[1:])
            forecast_date = str(measure).split(":")[7].split()[1]

            # Extract arrays for values and coordinates
            values, lats, lons = measure.data()

            # Build column name and forecast date
            column = "{}_{}_{}".format(description, level, measurement_type)
            forecast_time = datetime.datetime.strptime(forecast_date, "%Y%m%d%H%M")
            forecast_time += timedelta(hours=forecast_hour)

            # Iterate through values
            for idx, value in np.ndenumerate(values):
                coords = (lats[idx], lons[idx])
                if coords not in data:
                    data[coords] = {}
                if forecast_time not in data[coords]:
                    data[coords][forecast_time] = {}
                data[coords][forecast_time][column] = value

    # Construct dictionary of dataframes and concatenate
    data_df = {k: pd.DataFrame(data[k]).T for k, v in data.items()}
    comb_output_file_name_fmt = "nam.t{:02d}z.alaskanest.hirescombined.tm00.parquet"
    comb_output_file_name = save_directory + comb_output_file_name_fmt.format(forecast_prod_hour)
    data_df_concat = pd.concat(data_df, axis=0)

    # temp location for parquet
    local_path_blob_concat_parquet = temp_dir.name + '/data_df_concat.parquet'

    # Save the historical file
    df_historical = pd.DataFrame()
    df_historical.to_parquet(local_path_blob_concat_parquet)

    #### UPLOAD TO AZURE ####
    # Create a blob client using the local file name as the name for the blob
    blob_client = blob_service_client.get_blob_client(container=container_name, blob=comb_output_file_name)

    # Upload the created file to Azure Blob
    with open(local_path_blob_concat_parquet, "rb") as data:
        blob_client.upload_blob(data, overwrite=True)

    d_print("<<< Completed post processing")

    d_print(">>> Starting post processing from GRIB2 to Parquet")
    data = {}
    for forecast_hour in range(start_hour, end_hour + 1):
        file_path = temp_dir.name + "/" + nam_AK_forecast_file_format.format(forecast_prod_hour, forecast_hour)
        grb = pygrib.open(file_path)
        print(grb)

    blob_name_starts_with = "Raw_Data/Weather_NAM/"
    # if DEVELOPMENT:
    #     blob_name_starts_with = blob_name_starts_with.replace("Raw_Data", "Raw_Data_Dev")
    blob_list = container_client.list_blobs(name_starts_with=blob_name_starts_with)

    date_list = []
    for blob in blob_list:
        blob_folder_date = blob.name.split("/")[2]
        if blob_folder_date not in date_list:
            date_list.append(blob_folder_date)
    last_5_day_date_list = date_list[-1 * (historical_period + 1):]

    forecast_files = []
    # looping through dates in last_5_day_date_list
    for weather_date in last_5_day_date_list:
        hour_list = []
        blob_list = container_client.list_blobs(name_starts_with=blob_name_starts_with + weather_date + '/')
        # making an hour_list from hours in last_5_day_date_list
        for blob in blob_list:
            blob_folder_hour = blob.name.split("/")[3]
            if blob_folder_hour not in hour_list:
                hour_list.append(blob_folder_hour)
        # making a list of forecast Parquet files
        for hour in hour_list:
            blob_list_for_parquet = container_client.list_blobs(
                name_starts_with=blob_name_starts_with + weather_date + '/' + hour)
            for blob in blob_list_for_parquet:
                if '.parquet' in blob.name:
                    forecast_files.append(blob.name)

    # Subset forecast files to last 5 days
    forecast_files = forecast_files[-1 * (historical_period * forecasts_per_day + 1):]

    local_path_blob_parquet = temp_dir.name + '/forecast.parquet'

    # Merge first 6h of every forecast into shared dataframe, excluding the most recent file,
    # which is merged in entirety
    prev_accumulated = pd.DataFrame()
    account_url = os.environ["AccountUrl"]
    credential = os.environ["AccountCredential"]
    for forecast_file_path in forecast_files:
        blob = BlobClient(account_url=account_url,
                          container_name=container_name,
                          blob_name=forecast_file_path,
                          credential=credential)
        with open(local_path_blob_parquet, "wb") as f:
            data = blob.download_blob()
            data.readinto(f)
        df = pd.read_parquet(local_path_blob_parquet)

        acc_col = [x for x in df.columns if x.endswith('_accum')]
        inst_col = [x.replace('_accum', '_instant') for x in acc_col]

        # transform accumulated columns to instant by storing the instant elements in a cloned dataframe
        df_accumulated = df[acc_col].reset_index()

        # mask for values to change (hours 0,2,3,5,6,8,9,11,12,14,15,17,18,20,21,23 without the first one in the file)
        change_mask = (((df_accumulated['level_2'].dt.hour - 1) % 3) > 0) & (
            df_accumulated['level_2'] > df_accumulated['level_2'].min())
        # mask for values to subtract from values to change (hours 1,2,4,5,7,8,10,11,13,14,16,17,19,20,22,23)
        subtract_mask = ((df_accumulated['level_2'].dt.hour - 1) % 3) < 2

        # subtract previous column values for masked flows (change_mask and subtract_mask)
        df_accumulated.loc[change_mask, acc_col] = df_accumulated.loc[change_mask, acc_col].values - \
            df_accumulated.loc[subtract_mask, acc_col].values
        df_accumulated.set_index(['level_0', 'level_1', 'level_2'], inplace=True)

        # rename column names from *_accum to *_instant
        df_accumulated.rename(columns={x: y for x, y in zip(acc_col, inst_col)}, inplace=True)

        # write the cloned instant columns to the original dataframe
        for col in inst_col:
            df[col] = df_accumulated[col]

        # update the instant values according to the 6h forecast of the previous forecast
        # and store 6h forecast for next iteration
        df.update(prev_accumulated)
        prev_accumulated = df[inst_col][
            df.index.get_level_values(2) == df.index.get_level_values(2).min() +
            timedelta(hours=24 / forecasts_per_day)]

        # truncate forecast at 6h for all except for the last forecast file
        if forecast_file_path != forecast_files[-1]:
            df = df[
                df.index.get_level_values(2) < df.index.get_level_values(2).min() +
                timedelta(hours=24 / forecasts_per_day)]

        df_historical = df_historical.append(df)

    # sort the indices
    df_historical.sort_index(inplace=True)

    # temp location for parquet
    local_path_blob_historical_parquet = temp_dir.name + '/historical.parquet'

    # Save the historical file
    df_historical.to_parquet(local_path_blob_historical_parquet)

    #### UPLOAD TO AZURE ####
    # Create a blob client using the local file name as the name for the blob
    blob_client = blob_service_client.get_blob_client(
        container=container_name,
        blob=model_output_file_name_fmt.format("nam.hirescombined.5days.parquet"))

    # Upload the created file to Azure Blob
    with open(local_path_blob_historical_parquet, "rb") as data:
        blob_client.upload_blob(data, overwrite=True)

    d_print("<<< Merging complete")

    d_print(">>> Starting image processing from Parquet to PNG")
    # Reset index and select coordinates closest to RDO
    plot_df = data_df_concat.reset_index().round({"level_0": 5, "level_1": 5})
    # plot_df = plot_df[(plot_df.level_0 == red_dog_NAM_lat) & (plot_df.level_1 == red_dog_NAM_lon)]
    # plot_df.set_index("level_2", inplace=True)  # Datetime is the index
    plot_df = plot_df.groupby("level_2").mean()

    # Engineer features
    # Temperature (K) -> (C) and (F)
    # Snow depth (m)
    # Precipitation rate (kg/m^2/s) -> (mm)
    # Storm surface runoff (kg/m^2) -> instant
    plot_df["Temperature_0_instant_C"] = plot_df["Temperature_0_instant"] - 273.15
    plot_df["Temperature_0_instant_F"] = (plot_df["Temperature_0_instant"] - 273.15) * (9.0 / 5.0) + 32.0
    plot_df["Precipitation_rate_0_instant_mm"] = plot_df[
        "Precipitation_rate_0_instant"] * 60.0 * 60.0  # 60 min/hr * 60 sec/min

    # SSRUN
    storm_surface_runoff_0_instant = []
    last_val = None
    for idx, row in plot_df.iterrows():
        ssrun = row["Storm_surface_runoff_0_accum"]
        if len(storm_surface_runoff_0_instant) == 0 or idx.hour % 3 == 1:
            storm_surface_runoff_0_instant.append(ssrun)
        else:
            storm_surface_runoff_0_instant.append(ssrun - last_val)
        last_val = ssrun
    plot_df["Storm_surface_runoff_0_instant"] = storm_surface_runoff_0_instant

    # Adjust timezone to AKST
    plot_df.index = plot_df.index.tz_localize(tz='UTC')
    plot_df.index = plot_df.index.tz_convert(tz="US/Alaska")
    plot_df.index = plot_df.index.tz_localize(None)

    # Create plots
    for column in ["Temperature_0_instant_C", "Snow_depth_0_instant",
                   "Precipitation_rate_0_instant_mm", "Storm_surface_runoff_0_instant"]:
        plt.rc('font', size=14)
        plt.rc('xtick', color="#999999")
        plt.rc('ytick', color="#999999")
        plt.rc('axes', edgecolor='#999999')
        plt.figure(figsize=(5, 2.5))
        plt.xticks(rotation=65)
        ax = plt.gca()
        ax.xaxis.set_major_formatter(mdates.DateFormatter('%m/%d %HH'))
        plt.plot_date(plot_df.index, plot_df[column], lw=2.0, ls="-", c="#336699")

        # temp location for image
        local_path_blob_image = temp_dir.name + '/image.png'

        # Save the image locally in the temp location specified above
        plt.savefig(local_path_blob_image, dpi=96, bbox_inches="tight", transparent=True)

        # Raw_Data location on Azure
        output_file_name = save_directory + "nam.t{:02d}z.".format(forecast_prod_hour) + column + ".tm00.png"
        # Model_Output location on Azure
        model_output_file_name = model_output_file_name_fmt.format(column + ".png")

        #### UPLOAD TO AZURE ####
        # Loop through the Blob locations above and store the image from temp location to Azure Blob respectively.
        # Create a blob client using the local file name as the name for the blob
        for blob_location in [output_file_name, model_output_file_name]:
            blob_client = blob_service_client.get_blob_client(container=container_name, blob=blob_location)

            # Upload the image from temp location to Azure Blob
            with open(local_path_blob_image, "rb") as data:
                blob_client.upload_blob(data, overwrite=True)

        plt.close()

    d_print("<<< Completed image processing")
def rec(gstcod):
    try:
        # Read data from Blob Storage
        blob = BlobClient(
            account_url="https://recommendtest.blob.core.windows.net/",
            container_name="demo",
            blob_name="Guest Recommender.csv",
            credential="eZ6rUu9dIV3I0ZZgyXDm2yAe+dJJ8m7C3YTlMuOUqHD7EWck9hLFVEamxJa9RQgIty81t32zNPjUie9Mt4rd9Q=="
        )
        data = blob.download_blob()
        df = pd.DataFrame([
            x.replace('\r', '').split(',')
            for x in data.content_as_text().split('\n')
        ])
        df.columns = df.iloc[0]
        df = df.drop(0)
        df = df.reset_index(drop=True)
        pd.set_option('display.max_columns', None)

        df['ARRDAT'] = pd.to_datetime(df['ARRDAT'], format='%Y%m%d')
        df['DEPDAT'] = pd.to_datetime(df['DEPDAT'], format='%Y%m%d')
        df['RMNTS'] = df['DEPDAT'] - df['ARRDAT']
        df['RMNTS'] = df['RMNTS'].astype('timedelta64[D]')
        df.drop([
            'REGNUB', 'FOLNUB', 'ROMNUB', 'ARRDAT', 'DEPDAT', 'TRC', 'COM',
            'CHQ', 'TRD', 'CAS', 'ADQ', 'CRD', 'ADC', 'BOH', 'ADV', 'POT'
        ], axis=1, inplace=True)

        # Cast the service columns to float, as data read from blob storage arrives as text,
        # then normalize each by the number of room nights (RMNTS)
        service_cols = [
            'TRS', 'PHT', 'SPC', 'BBD', 'GBR', 'JVH', 'MIT', 'EP', 'VCH',
            'RNT', 'ITV', 'CCF', 'PHC', 'TIP', 'EXB', 'STD', 'ART', 'SEC',
            'NOT', 'FAX', 'FST', 'BB', 'RMS', 'ITB', 'HTM', 'TRF', 'RTN',
            'LCA', 'PLT', 'PLC', 'BTB', 'FNC', 'SSG', 'LLT', 'LAU', 'IDD',
            'JBR', 'MIS', 'AP', 'FX'
        ]
        df[service_cols] = df[service_cols].astype(float)
        df[service_cols] = df[service_cols].div(df.RMNTS, axis=0)

        df.loc[df['RMNTS'] == 0]
        df.isnull().sum()  # This is to confirm that division by 0 has caused the NaN
        df.dropna(
            axis=0, how='any', inplace=True
        )  # if any value in the row is NaN, it will be removed. Else use how='all'

        df.drop(['SPC'], axis=1, inplace=True)
        df['BBD'].describe()
        df.drop(['BBD'], axis=1, inplace=True)
        df.drop(['GBR'], axis=1, inplace=True)

        # Find all columns of the dataframe that have all 0 values in them.
        # (It would be better to drop these in one go than one by one.)
        zeros = df.loc[:, (df == 0).all()]  # 17 columns of the dataframe are fully 0s
        df.drop(columns=zeros, axis=1, inplace=True)  # zeros was assigned columns of the df in the previous cell

        mms = MinMaxScaler([0, 5])  # The parameter passed is the range of values: min=0 and max=5
        scaled_cols = [
            'PHT', 'JVH', 'MIT', 'EP', 'VCH', 'EXB', 'ART', 'SEC', 'FAX',
            'BB', 'RMS', 'ITB', 'HTM', 'TRF', 'RTN', 'BTB', 'FNC', 'LAU',
            'MIS', 'RMNTS'
        ]
        df[scaled_cols] = mms.fit_transform(df[scaled_cols])
        df = df.round(decimals=2)

        # Drop guest names to avoid confusion, as several guests share the same name
        df = df.drop(['GSTNAM'], axis=1)
        df[df['GSTCOD'] == gstcod]

        # Group by guest code and return the mean of the rest of the columns
        df1 = df.groupby(['GSTCOD']).mean()
        df1.reset_index(inplace=True)
        df2 = df1.melt(id_vars=['GSTCOD'], var_name='Services', value_name='Rating')

        inputuser = df2[df2['GSTCOD'] == gstcod]
        inputuser[inputuser['Rating'] != 0]
        except_inputuser = df2[df2['GSTCOD'] != gstcod]
        except_inputuser

        usersubset = except_inputuser[except_inputuser['Services'].isin(
            inputuser['Services'].tolist())][except_inputuser['Rating'] != 0]
        userSubsetGroup = usersubset.groupby(['GSTCOD'])
        userSubsetGroup = sorted(userSubsetGroup, key=lambda x: len(x[1]), reverse=True)
        userSubsetGroup = userSubsetGroup[0:100]

        pearsonCorrelationDict = {}
        for name, group in userSubsetGroup:
            # Start by sorting the input and current user group so the values aren't mixed up later on
            group = group.sort_values(by='Services')
            inputuser = inputuser.sort_values(by='Services')
            # Get the N for the formula
            nRatings = len(group)
            # Get the ratings for the services that they both have in common
            temp_df = inputuser[inputuser['Services'].isin(group['Services'].tolist())]
            # Store them in a temporary buffer variable in list format to facilitate future calculations
            tempRatingList = temp_df['Rating'].tolist()
            # Also put the current user group ratings in list format
            tempGroupList = group['Rating'].tolist()
            # Calculate the Pearson correlation between the two users, x and y
            Sxx = sum([i**2 for i in tempRatingList]) - pow(sum(tempRatingList), 2) / float(nRatings)
            Syy = sum([i**2 for i in tempGroupList]) - pow(sum(tempGroupList), 2) / float(nRatings)
            Sxy = sum(i * j for i, j in zip(tempRatingList, tempGroupList)) - \
                sum(tempRatingList) * sum(tempGroupList) / float(nRatings)
            # If the denominator is different from zero, divide; else the correlation is 0
            if Sxx != 0 and Syy != 0:
                pearsonCorrelationDict[name] = Sxy / sqrt(Sxx * Syy)
            else:
                pearsonCorrelationDict[name] = 0

        # Changed three lines due to cached result error
        pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
        pearsonDF = pearsonDF.reset_index()
        pearsonDF = pearsonDF.rename(columns={'index': 'GSTCOD', 0: 'similarityIndex'}, inplace=False)

        topUsers = pearsonDF.sort_values(by='similarityIndex', ascending=False)[0:50]
        except_inputuser1 = except_inputuser[except_inputuser['Rating'] != 0]
        topUsersRating = topUsers.merge(except_inputuser1, left_on='GSTCOD', right_on='GSTCOD', how='inner')
        topUsersRating['weightedRating'] = topUsersRating['similarityIndex'] * topUsersRating['Rating']

        tempTopUsersRating = topUsersRating.groupby('Services').sum()[['similarityIndex', 'weightedRating']]
        tempTopUsersRating.columns = ['sum_similarityIndex', 'sum_weightedRating']
        tempTopUsersRating.head()

        recommendation_df = pd.DataFrame()
        recommendation_df['weighted average recommendation score'] = (
            tempTopUsersRating['sum_weightedRating'] / tempTopUsersRating['sum_similarityIndex'])
        recommendation_df['Services'] = tempTopUsersRating.index
        recommendation_df = recommendation_df.sort_values(
            by='weighted average recommendation score', ascending=False)

        recommendList = recommendation_df.values.tolist()
        return recommendList
    except Exception as e:
        return ('Error while passing GSTCOD : ' + str(e))