Example #1
    def get(cls, args, SERVICE_PRINCIPAL_SECRET, logger):

        if cls.fse is None:
            from azure.storage.blob import BlobClient
            logger.debug("========PREPARING TO DOWNLOAD FSE MODEL")

            storage_account_name = args.storage_account_name
            container_client_credential = ClientSecretCredential(
                tenant_id=args.directory_id,
                client_id=args.application_id,
                client_secret=SERVICE_PRINCIPAL_SECRET)
            blob = BlobClient(
                account_url=
                f"https://{storage_account_name}.blob.core.windows.net",
                container_name=args.domain_expert_container_name,
                blob_name=args.fse_folder + "/" + args.fse_file,
                credential=container_client_credential)

            file_size = blob.get_blob_properties().size

            if not os.path.exists(args.fse_file) or os.stat(
                    args.fse_file).st_size != file_size:
                with open(args.fse_file, "wb") as my_blob:
                    blob_data = blob.download_blob()
                    blob_data.readinto(my_blob)

            fse, ilist, jobs_lookup = dill.load(gzip.open(args.fse_file, "rb"))
            cls.fse = fse
            cls.ilist = ilist
            cls.jobs_lookup = jobs_lookup

        return cls.fse, cls.ilist, cls.jobs_lookup
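
The enclosing class is not shown here; assuming it is a simple cache holder with the same calling convention as Examples 8 and 15 below, a call site might look like this sketch (the class name is a placeholder):

# Hypothetical call site; FseCache stands in for the enclosing class, and
# args, SERVICE_PRINCIPAL_SECRET and logger are assumed to be in scope.
fse, ilist, jobs_lookup = FseCache.get(args, SERVICE_PRINCIPAL_SECRET, logger)
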
Example #2
    def test_user_delegation_sas_for_container(self):
        # SAS URL is calculated from storage key, so this test runs live only
        pytest.skip("Current Framework Cannot Support OAUTH")
        if TestMode.need_recording_file(self.test_mode):
            return

        # Arrange
        token_credential = self.generate_oauth_token()
        service_client = BlobServiceClient(self._get_oauth_account_url(),
                                           credential=token_credential)
        user_delegation_key = service_client.get_user_delegation_key(
            datetime.utcnow(),
            datetime.utcnow() + timedelta(hours=1))

        container_client = service_client.create_container(
            self.get_resource_name('oauthcontainer'))
        token = container_client.generate_shared_access_signature(
            expiry=datetime.utcnow() + timedelta(hours=1),
            permission=ContainerPermissions.READ,
            user_delegation_key=user_delegation_key,
            account_name='emilydevtest')

        blob_client = container_client.get_blob_client(
            self.get_resource_name('oauthblob'))
        blob_content = self.get_random_text_data(1024)
        blob_client.upload_blob(blob_content, length=len(blob_content))

        # Act
        new_blob_client = BlobClient(blob_client.url, credential=token)
        content = new_blob_client.download_blob()

        # Assert
        self.assertEqual(blob_content, b"".join(list(content)).decode('utf-8'))
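
generate_shared_access_signature on the container client and ContainerPermissions appear to come from an early preview of the SDK; in current azure-storage-blob releases the same user-delegation SAS is usually produced with the module-level generate_container_sas helper. A minimal sketch, assuming the same service_client and user_delegation_key as above:

# Sketch only: equivalent SAS generation on recent azure-storage-blob releases.
from datetime import datetime, timedelta
from azure.storage.blob import ContainerSasPermissions, generate_container_sas

token = generate_container_sas(
    account_name=service_client.account_name,
    container_name=container_client.container_name,
    user_delegation_key=user_delegation_key,
    permission=ContainerSasPermissions(read=True),
    expiry=datetime.utcnow() + timedelta(hours=1),
)
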
Example #3
def blob_to_df(blob_client: BlobClient,
               pandas_kwargs: Optional[Dict[str, Any]] = None) -> pd.DataFrame:
    """Download a blob and return a pandas DataFrame."""

    # check for kwargs
    if not pandas_kwargs:
        pandas_kwargs = {}

    # check the file extension
    extension = Path(blob_client.blob_name).suffix

    # download blob and return DataFrame
    data_stream = io.BytesIO(blob_client.download_blob().readall())
    if extension == ".csv":
        return pd.read_csv(data_stream, **pandas_kwargs)
    if extension == ".txt":
        return pd.read_table(data_stream, **pandas_kwargs)
    if extension == ".json":
        return pd.read_json(data_stream, **pandas_kwargs)
    if extension in [".xls", ".xlsx"]:
        if extension == ".xls" and "engine" not in pandas_kwargs.keys():
            pandas_kwargs.update({"engine": "xlrd"})
        elif "engine" not in pandas_kwargs.keys():
            pandas_kwargs.update({"engine": "openpyxl"})
        return pd.read_excel(data_stream, **pandas_kwargs)

    raise TypeError(f"{extension} files are not yet supported.")
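
A hypothetical call, just to show the shape of the arguments; the account URL, container, blob name and SAS token below are placeholders:

# Hypothetical usage; every value below is a placeholder.
from azure.storage.blob import BlobClient

client = BlobClient(
    account_url="https://<account>.blob.core.windows.net",
    container_name="datasets",
    blob_name="sales/2021-01.csv",
    credential="<sas-token>",
)
df = blob_to_df(client, pandas_kwargs={"sep": ";"})
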
Example #4
def test_get_blob_by_full_url(storage_account, sas_token):
    client = BlobClient(
        f"https://{storage_account}.blob.core.windows.net/{CONTAINER_NAME}/{FILE_NAME}",
        credential=sas_token,
    )
    content = client.download_blob().content_as_text()
    assert len(content) > 0
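
Passing the full blob URL as the first positional argument relies on preview-era behaviour; on recent versions of azure-storage-blob the constructor expects explicit container_name and blob_name, so the full-URL style is usually written with the from_blob_url factory instead. A sketch:

# Sketch: the same client built with the dedicated factory method.
blob_url = f"https://{storage_account}.blob.core.windows.net/{CONTAINER_NAME}/{FILE_NAME}"
client = BlobClient.from_blob_url(blob_url, credential=sas_token)
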
Example #5
def download_blob(container_name, blob_name):
    blob = BlobClient(env.REPORTS_STGACCT_URI,
                      container_name,
                      blob_name,
                      credential=CREDENTIALS)
    blob_data = blob.download_blob()
    return blob_data
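
download_blob() returns a StorageStreamDownloader rather than raw bytes, so callers of this helper still have to read the content out of it; a hypothetical caller (the container and blob names are placeholders):

# Hypothetical caller; container and blob names are placeholders.
downloader = download_blob("reports", "2021/summary.csv")
payload = downloader.readall()  # whole blob in memory as bytes

# Or stream straight into a local file instead of holding it in memory:
with open("summary.csv", "wb") as fh:
    download_blob("reports", "2021/summary.csv").readinto(fh)
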
Example #6
def downloadFile(funcData: dict, blobClient: BlobClient) -> str:
    try:
        localFileName = getUniqueTempFileName()
        with open(localFileName, 'wb') as download_file:
            download_file.write(blobClient.download_blob().readall())
        return localFileName
    except Exception as _e:
        raise RuntimeError('Exiting due to unable to download source file: {:s}. Error message: {:s}'.format(json.dumps(funcData), str(_e))) from _e
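
getUniqueTempFileName() is not shown above; a minimal stand-in built on the standard library could look like this sketch (the prefix is an arbitrary choice):

# Assumed helper, not part of the original snippet; the prefix is arbitrary.
import os
import tempfile

def getUniqueTempFileName() -> str:
    fd, path = tempfile.mkstemp(prefix="blob-download-")
    os.close(fd)  # downloadFile() reopens the path itself
    return path
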
Example #7
def download_blob(blob_id: str, local_path: str) -> None:
    """Download file from remote storage to local path."""
    bc = BlobClient(account_url=account_url, container_name=facts_container,
                    blob_name=blob_id, snapshot=None,
                    credential=facts_sas_token)
    with open(local_path, "wb") as download_file:
        download_file.write(bc.download_blob().readall())
    return
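
account_url, facts_container and facts_sas_token are module-level names that this snippet does not define; one plausible way to supply them is from environment variables (the variable names below are assumptions):

# Assumed module-level configuration; the environment variable names are illustrative.
import os

account_url = os.environ["FACTS_STORAGE_ACCOUNT_URL"]
facts_container = os.environ["FACTS_CONTAINER_NAME"]
facts_sas_token = os.environ["FACTS_SAS_TOKEN"]
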
Example #8
    def get(cls, args, SERVICE_PRINCIPAL_SECRET, logger):

        if "model" not in cls.gensim_model:
            from azure.storage.blob import BlobClient
            # logger.debug("========PREPARING TO SETUP GENSIM")

            storage_account_name = args.storage_account_name
            container_client_credential = ClientSecretCredential(
                tenant_id=args.directory_id,
                client_id=args.application_id,
                client_secret=SERVICE_PRINCIPAL_SECRET)
            blob = BlobClient(
                account_url=
                f"https://{storage_account_name}.blob.core.windows.net",
                container_name=args.domain_expert_container_name,
                blob_name=args.domain_expert_folder_path + "/" +
                args.domain_expert_file_name,
                credential=container_client_credential)

            file_size = blob.get_blob_properties().size

            if not os.path.exists(args.domain_expert_file_name) or os.stat(
                    args.domain_expert_file_name).st_size != file_size:
                print("getting domain expert")

                with open(args.domain_expert_file_name, "wb") as my_blob:
                    if logger is not None:
                        logger.debug("downloading domain expert")
                    else:
                        print("downloading domain expert")
                    blob_data = blob.download_blob()
                    blob_data.readinto(my_blob)
                    # logger.debug("end downloading")

            if logger is not None:
                logger.debug("after domain expert download")
            else:
                print("after domain expert download")
            from gensim.models import Word2Vec

            try:
                cls.gensim_model["model"] = Word2Vec.load(
                    args.domain_expert_file_name)
                # logger.debug("========MODEL SET")
                cls.gensim_model["vocab"] = cls.gensim_model["model"].wv.vocab
                # logger.debug("========VOCAB SET")
            except Exception as e:
                if logger is not None:
                    logger.debug(
                        f"******************========== EXCEPTION ENCOUNTERED {e}"
                    )
                else:
                    print(
                        f"******************========== EXCEPTION ENCOUNTERED {e}"
                    )

        return cls.gensim_model
Example #9
    def test_create_container_with_public_access_blob(self):
        # Arrange
        container_name = self._get_container_reference()

        # Act
        container = self.bsc.get_container_client(container_name)
        created = container.create_container(public_access='blob')

        blob = container.get_blob_client("blob1")
        blob.upload_blob(u'xyz')

        anonymous_service = BlobClient(self._get_account_url(),
                                       container=container_name,
                                       blob="blob1")

        # Assert
        self.assertTrue(created)
        anonymous_service.download_blob()
Example #10
    def __getitem__(self, item: str):
        blob = BlobClient(account_url=self.account_url,
                          container_name=self.container_name,
                          blob_name=item,
                          credential=self.credential)

        blob_data = b''
        for data in blob.download_blob():
            blob_data += data
        return blob_data
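
Iterating the downloader directly also relies on preview-era behaviour; on recent azure-storage-blob versions the same result is usually obtained with readall() or, to keep the chunked style, chunks(). A sketch of the equivalent logic as a standalone helper:

# Sketch: the same lookup as a standalone helper for recent SDK versions.
from azure.storage.blob import BlobClient

def read_blob_bytes(account_url: str, container_name: str, item: str, credential) -> bytes:
    blob = BlobClient(account_url=account_url,
                      container_name=container_name,
                      blob_name=item,
                      credential=credential)
    # chunks() replaces direct iteration over the downloader; readall() also works.
    return b"".join(blob.download_blob().chunks())
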
Example #11
    def download_from_azure(self, source_file, dest_file):
        try:
            blob = BlobClient(account_url=env.AZURE_STORAGE_ACCOUNT,
                              container_name=self.container_name,
                              blob_name=source_file,
                              credential=env.STORAGE_SAS_KEY)
            with open(join_paths([get_project_root(), dest_file]), "wb+") as f:
                data = blob.download_blob()
                data.readinto(f)
        except azure.core.exceptions.ResourceNotFoundError:
            logger.info('Blob not found %s', source_file)
Example #12
    def download_file(self, filename):

        src_blob = BlobClient(
            self.blob_service_client.url,
            container_name=self.settings.storage_container,
            blob_name=filename,
            credential=self.sas_token
        )

        path_to_file = os.path.dirname(filename)
        download_path_for_file = os.path.join(self.settings.download_location, path_to_file)
        os.makedirs(download_path_for_file, exist_ok=True)
        download_file_path = os.path.join(self.settings.download_location, filename)
        print("\nDownloading blob to \n\t" + download_file_path)

        with open(download_file_path, "wb") as download_file:
            download_file.write(src_blob.download_blob().readall())
        return
Example #13
def _download(account_url: str, container_name: str, credential: str,
              blob_name: str) -> Tuple[Optional[bytes], str, Optional[str]]:
    """Internal method to download YAML's from Azure via BlobClient
    Arguments:
        account_url {str} -- Azure account url
        container_name {str} -- Azure container name
        credential {str} -- Azure credential token
        blob_name {str} -- Blob name to download
    Returns:
        tuple -- URL content, target file and placeholder for error
    """

    blob = BlobClient(account_url=account_url,
                      container_name=container_name,
                      credential=credential,
                      blob_name=blob_name)

    return (blob.download_blob().readall(), blob_name, None)
Example #14
def downloadapi(container,
                blob,
                accountname=None,
                accountkey=None,
                replace=True,
                blob_target=None):
    block_blob_service = BlobClient(accountname,
                                    container,
                                    blob,
                                    credential=accountkey)
    blob_target = blob_target or os.path.join(os.getcwd(), blob)
    if not replace and os.path.isfile(blob_target):
        logger.info(
            "will skip download, {} already exists and replace=False".format(
                blob_target))
        return
    logger.info("downloading '{}/{}' to '{}'".format(container, blob,
                                                     blob_target))
    with open(blob_target, "wb") as f:
        blob_data = block_blob_service.download_blob()
        blob_data.readinto(f)
    logger.info("finished download")
Example #15
    def get(cls, args, SERVICE_PRINCIPAL_SECRET, logger):

        if cls.big_taxo is None:
            from azure.storage.blob import BlobClient

            if logger is not None:
                logger.debug("========PREPARING TO DOWNLOAD TAXONOMY")
            else:
                print("========PREPARING TO DOWNLOAD TAXONOMY")

            storage_account_name = args.storage_account_name
            container_client_credential = ClientSecretCredential(
                tenant_id=args.directory_id,
                client_id=args.application_id,
                client_secret=SERVICE_PRINCIPAL_SECRET)

            blob = BlobClient(
                account_url=
                f"https://{storage_account_name}.blob.core.windows.net",
                container_name=args.domain_expert_container_name,
                blob_name=args.domain_expert_folder_path + "/taxo/" +
                "taxo_de.dill.gz",
                credential=container_client_credential)

            file_size = blob.get_blob_properties().size

            if os.path.exists("taxo_de.dill.gz") is False or (
                    os.path.exists("taxo_de.dill.gz")
                    and os.stat("taxo_de.dill.gz").st_size != file_size):
                with open("taxo_de.dill.gz", "wb") as my_blob:
                    # print("downloading")
                    blob_data = blob.download_blob()
                    blob_data.readinto(my_blob)

            general_taxo_dict = dill.load(gzip.open("taxo_de.dill.gz", "rb"))
            cls.big_taxo = general_taxo_dict

        return cls.big_taxo
Example #16
def download_blob(blob: BlobClient, blob_info: BlobProperties, destination: str, overwrite: bool, attempt=0) -> dict:
    destination_filename = pathlib.Path(f'{destination}/{blob.blob_name}')
    blob_md5 = blob_info['metadata']['md5']
    operation = {'operation': 'no-op'} # Default return

    if not overwrite and os.path.isfile(destination_filename):
        log.error(f'file {destination_filename} already exists and is not set to overwrite.')
        local_md5 = get_md5sum(destination_filename)
        log.error(f'local md5: {local_md5}, azure md5: {blob_md5}')
        return operation
    elif overwrite and os.path.isfile(destination_filename):
        local_md5 = get_md5sum(destination_filename)
        log.info(f'file {destination_filename} already exists locally. md5: {local_md5}')
        if local_md5 == blob_md5:
            log.info(f'local md5sum matches azure md5sum of {local_md5}')
            return operation

    log.info(f'Downloading {blob.blob_name} to {destination}/{blob.blob_name}.')
    log.debug('Creating path %s.', destination_filename.parent)
    os.makedirs(destination_filename.parent, exist_ok=True)

    with open(destination_filename, 'wb') as fp:
        download_client = blob.download_blob()
        operation = download_client.download_to_stream(fp)
    local_md5 = get_md5sum(destination_filename)

    if local_md5 == blob_md5:
        log.info(f'downloaded local md5sum of {destination_filename} matches azure md5sum of {local_md5}')
    else:
        log.error(f'downloaded file {destination_filename} md5sum mismatch with cloud.')
        if attempt >= 2:
            log.error(f'{destination_filename} md5sum mismatch after 3 tries downloading, giving up.')
            return operation
        attempt += 1
        operation = download_blob(blob, blob_info, destination, overwrite, attempt)

    return operation
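
get_md5sum() is referenced but not defined in this example; a typical implementation reads the file in chunks and returns a hex digest, which is what the comparison against the md5 metadata value above assumes:

# Assumed helper; reads in chunks so large downloads need not fit in memory.
import hashlib

def get_md5sum(path, chunk_size: int = 4 * 1024 * 1024) -> str:
    digest = hashlib.md5()
    with open(path, "rb") as fh:
        for chunk in iter(lambda: fh.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()
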
Example #17
                        frame.loc[key] = f
                else:
                    f = np.zeros(len(categories))
                    for j in i[sublist]:
                        f[categories.index(j)] = 1
                    frame.loc[d] = f
            except:
                pass
    ann = annotator.split("-")[4]
    final_frame = frame.add_prefix(str(ann))
    return final_frame


filename = "../data/temp_annotations.jsonl"
with open(filename, "wb") as f:
    f.write(blob.download_blob().readall())

annotations = list(read_jsonl(filename))
uq_annotators = set([x["_session_id"] for x in annotations])

dfs = []
for annotator in uq_annotators:
    sub_annotations = [an for an in annotations if an["_session_id"] == annotator]
    sub_df = pd.DataFrame(sub_annotations)
    dummy_df = sublists_to_dummies(sub_df, "accept", annotator=annotator, index_key="text")
    final_df = dummy_df.reset_index()
    dfs.append(final_df)

df_final = reduce(lambda left, right: pd.merge(left, right, on="index"), dfs)
print(df_final)
Example #18
import azure.functions as func
from azure.storage.blob import BlobClient
import pickle
import pandas as pd

blob = BlobClient(
    account_url="https://blobhikeathon.blob.core.windows.net",
    container_name="blobcon",
    blob_name="model.pkl",
    credential=
    "qC8kJ7CvBvoEDAFHrNy2E3VJNCKFXkEyh2wb2yozxOkN+r7yGBgYxMy+cwS8UjEjj7hm3+tQWAj0bzAp3YVZog=="
)

with open("model.pkl", "wb") as f:
    data = blob.download_blob()
    data.readinto(f)

model = pickle.load(open('model.pkl', 'rb'))

blob = BlobClient(
    account_url="https://blobhikeathon.blob.core.windows.net",
    container_name="blobcon",
    blob_name="test.csv",
    credential=
    "qC8kJ7CvBvoEDAFHrNy2E3VJNCKFXkEyh2wb2yozxOkN+r7yGBgYxMy+cwS8UjEjj7hm3+tQWAj0bzAp3YVZog=="
)

with open("test.csv", "wb") as f:
    data = blob.download_blob()
    data.readinto(f)
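
What happens next with the downloaded artifacts is not shown; assuming model.pkl holds a scikit-learn style estimator and test.csv contains only feature columns, the follow-on step might look like this:

# Assumed follow-on step; column handling depends on how the model was trained.
test_df = pd.read_csv("test.csv")
predictions = model.predict(test_df)
print(predictions[:10])
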
Example #19
def main(mytimer: func.TimerRequest) -> None:
    utc_timestamp = datetime.datetime.utcnow().replace(
        tzinfo=datetime.timezone.utc).isoformat()

    # Determine execution environment
    #if "rdo-env-production" in dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get():
    #  DEVELOPMENT = False

    historical_period = 5 # days
    forecasts_per_day = 4 # 0, 6, 12, 18

    def d_print(phrase):
        print("{} {}".format(datetime.datetime.now(), phrase))


    ## North American Mesoscale forecast for Alaska
    nam_AK_forecast_url_gf = "https://nomads.ncep.noaa.gov/cgi-bin/filter_nam_alaskanest.pl"
    nam_AK_forecast_dir_format = "nam.{}{:02d}{:02d}"
    # {1} is the year in YYYY format
    # {2} is the month in MM format
    # {3} is the day in DD format
    nam_AK_forecast_file_format = "nam.t{:02d}z.alaskanest.hiresf{:02d}.tm00.grib2"
    # {1} is time the forecast is made in hours 00, 06, 12, 18
    # {2} is the forecast hour (in the future) 00, 01, 02...60
    nam_AK_forecast_folder = "nam_alaska_forecasts"
    nam_AK_forecast_var_list = ['PRES',  # Pressure (Pa)
                                'TMP',  # Temperature (K)
                                'SPFH',  # Specific humidity (kg/kg)
                                'SSRUN',  # Storm surface runoff (kg/m^2)
                                'BGRUN',  # Baseflow-groundwater runoff (kg/m^2)
                                'SNOD',  # Snow depth (m)
                                'EVP',  # Evaporation (kg/m^2)
                                'PRATE',  # Precipitation rate (kg/m^2/s)
                                'DSWRF',  # Downward short-wave radiation (W/m^2)
                                'DLWRF',  # Downward long-wave radiation (W/m^2)
                                'SOILM',  # Soil moisture content (kg/m^2)
                                'TSOIL',  # Soil temperature (K)
                                'APCP',  # Total precipitation (kg/m^2)
                                'UGRD',  # U-component of wind (m/s)
                                'VGRD',  # V-component of wind (m/s)
                                'CNWAT',  # Plant canopy surface water (kg/m^2)
                                'NCPCP',  # Large-scale precipitation (non-convective) (kg/m^2)
                                'LHTFL']  # Latent heat net flux (W/m^2)

    ## Inputs
    red_dog_lat = 68.0756
    red_dog_lon = -162.8561

    red_dog_NAM_lat = 68.07837  # closest points in the NAM dataset
    red_dog_NAM_lon = -162.85785

    start_hour = 0
    end_hour = 45

    # ADLS path to save files
    save_directory_folder = "Raw_Data/Weather_NAM/"
    model_output_file_name_fmt = "Model_Data/Model_Output_TDS/{}"
    # if DEVELOPMENT:
    #    save_directory_folder = save_directory_folder.replace("Raw_Data", "Raw_Data_Dev")
    #    model_output_file_name_fmt = model_output_file_name_fmt.replace("Model_Data", "Model_Data_Dev")
    save_directory_fmt = save_directory_folder + "{}{:02d}{:02d}/{:02d}/"  # YYYYMMDD/HH

    # Query parameters for file retrieval
    query_params = {
        'file': None,  # file name
        'subregion': 'on',  # retrieve data within bounds
        'leftlon': red_dog_lon - 0.29,  # left longitude bounds
        'rightlon': red_dog_lon + 0.21,  # right longitude bounds
        'toplat': red_dog_lat + 0.21,  # top latitude bounds
        'bottomlat': red_dog_lat - 0.30,  # bottom latitude bounds
        'dir': None,  # forecast directory
        'lev_surface': 'on',  # surface level
        'lev_10_m_above_ground': 'on',  # 10 m above ground
        'lev_0-2_m_below_ground': 'on',  # 0-2 m below ground
        'lev_0-0.1_m_below_ground': 'on',  # 0-0.1 m below ground
        'lev_0.1-0.4_m_below_ground': 'on',  # 0.1-0.4 m below ground
        'lev_0-1_m_below_ground': 'on',  # 0-1 m below ground
        'lev_0.4-1_m_below_ground': 'on'  # 0.4-1 m below ground
    }

    # Current time in UTC
    current_time = datetime.datetime.utcnow()
    d_print(">>> Downloading NAM Alaska forecasts ({})".format(current_time))

    # Populate forecast directory
    forecast_dir = nam_AK_forecast_dir_format.format(current_time.year, current_time.month, current_time.day)
    d_print("... using directory: " + forecast_dir)

    # Populate query parameters (sans file name)
    query_params["dir"] = "/" + forecast_dir
    query_params = {**query_params,
                    **{"var_" + v: "on" for v in nam_AK_forecast_var_list}}

    # Define download parameters and create directory if not existing
    forecast_prod_hour = current_time.hour - (current_time.hour % 6)  # 0, 6, 12, 18
    d_print("... using forecast production hour: {}".format(forecast_prod_hour))
    save_directory = save_directory_fmt.format(current_time.year, current_time.month,
                                               current_time.day, forecast_prod_hour)

    temp_dir = tempfile.TemporaryDirectory()
    print(temp_dir.name)

    # Loop through forecast files and download
    start_download_time = time.time()
    for forecast_hour in range(start_hour, end_hour + 1):
        query_params["file"] = nam_AK_forecast_file_format.format(forecast_prod_hour, forecast_hour)
        d_print("... ... retrieving data from file: " + query_params["file"])
        forecast_response_temp_file_name = temp_dir.name + "/" + query_params["file"]
        print("forecast_response_temp_file_name", forecast_response_temp_file_name)
        blob_path_file_name = save_directory + query_params["file"]

        # Create the BlobServiceClient object which will be used to create a container client
        conn_str = os.environ["AzureBlobStorage"] 
        blob_service_client = BlobServiceClient.from_connection_string(conn_str)

        # Container name
        container_name =  os.environ["ContainerName"]

        # Create the container client
        container_client = blob_service_client.get_container_client(container_name)

        #### UPLOAD TO AZURE ####
        # Create a blob client using the local file name as the name for the blob
        blob_client = blob_service_client.get_blob_client(container=container_name, blob=blob_path_file_name)

        retries = 0

        while True:
            try:
                response = requests.get(nam_AK_forecast_url_gf, params=query_params, timeout=120)
                response.raise_for_status() # Raise exception for bad status code
                with open(forecast_response_temp_file_name, "wb") as grib_file:
                    grib_file.write(response.content)

                print("\nUploading to Azure Storage as blob:\n\t" + blob_path_file_name)

                # Upload the created file to Azure Blob
                with open(forecast_response_temp_file_name, "rb") as data:
                    blob_client.upload_blob(data, overwrite=True)

                break
            except Exception:
                retries += 1
                time.sleep(retries * 5)
                if retries == 2:
                    raise Exception("Max number of retries exceeded")

    d_print("<<< Download complete in {:.2f} seconds".format(time.time() - start_download_time))
    
    
    d_print(">>> Starting post processing from GRIB2 to Parquet")

    data = {}
    for forecast_hour in range(start_hour, end_hour + 1):
        file_path = temp_dir.name + "/" + nam_AK_forecast_file_format.format(forecast_prod_hour, forecast_hour)
        print("file_path", file_path)
        grb = pygrib.open(file_path)
        for measure in grb:
            # Split label string into constituents
            # Ex:
            # 1:Surface pressure:Pa (instant):polar_stereographic:surface:level 0:fcst time 30 hrs:from 202004061800
            description = str(measure).split(":")[1].replace(" ", "_")
            measurement_type = str(measure).split(":")[2].split("(")[1].strip(")")
            level = "_".join(str(measure).split(":")[5].split()[1:])
            forecast_date = str(measure).split(":")[7].split()[1]

            # Extract arrays for values and coordinates
            values, lats, lons = measure.data()

            # Build column name and forecast date
            column = "{}_{}_{}".format(description, level, measurement_type)
            forecast_time = datetime.datetime.strptime(forecast_date, "%Y%m%d%H%M")
            forecast_time += timedelta(hours=forecast_hour)

            # Iterate through values
            for idx, value in np.ndenumerate(values):
                coords = (lats[idx], lons[idx])
                if coords not in data:
                    data[coords] = {}
                if forecast_time not in data[coords]:
                    data[coords][forecast_time] = {}
                data[coords][forecast_time][column] = value

    # Construct dictionary of dataframes and concatenate
    data_df = {k: pd.DataFrame(data[k]).T for k, v in data.items()}

    comb_output_file_name_fmt = "nam.t{:02d}z.alaskanest.hirescombined.tm00.parquet"
    comb_output_file_name = save_directory  + comb_output_file_name_fmt.format(forecast_prod_hour)
    data_df_concat = pd.concat(data_df, axis=0)

    # temp location for parquet
    local_path_blob_concat_parquet = temp_dir.name + '/data_df_concat.parquet'

    # Initialize the running historical frame used below and save the combined forecast
    df_historical = pd.DataFrame()
    data_df_concat.to_parquet(local_path_blob_concat_parquet)

    #### UPLOAD TO AZURE ####
    # Create a blob client using the local file name as the name for the blob
    blob_client = blob_service_client.get_blob_client(container=container_name, blob=comb_output_file_name)

    # Upload the created file to Azure Blob
    with open(local_path_blob_concat_parquet, "rb") as data:
        blob_client.upload_blob(data, overwrite=True)

    d_print("<<< Completed post processing")


    d_print(">>> Starting post processing from GRIB2 to Parquet")

    data = {}
    for forecast_hour in range(start_hour, end_hour + 1):
        file_path = temp_dir.name + "/" + nam_AK_forecast_file_format.format(forecast_prod_hour, forecast_hour)
        grb = pygrib.open(file_path)
        print(grb)

    blob_name_starts_with = "Raw_Data/Weather_NAM/"

    # if DEVELOPMENT:
    #     blob_name_starts_with = blob_name_starts_with.replace("Raw_Data", "Raw_Data_Dev")

    blob_list = container_client.list_blobs(name_starts_with=blob_name_starts_with)

    date_list = []

    for blob in blob_list:
        blob_folder_date = blob.name.split("/")[2]
        if blob_folder_date not in date_list:
            date_list.append(blob_folder_date)

    last_5_day_date_list = date_list[-1 * (historical_period + 1):]

    forecast_files = []

    # looping through dates in last_5_day_date_list
    for weather_date in last_5_day_date_list:
        hour_list = []
        blob_list = container_client.list_blobs(name_starts_with=blob_name_starts_with + weather_date + '/')

        # making a hour_list from hours in last_5_day_date_list
        for blob in blob_list:
            blob_folder_hour = blob.name.split("/")[3]
            if blob_folder_hour not in hour_list:
                hour_list.append(blob_folder_hour)

        # making a list of forecast Parquet files
        for hour in hour_list:
            blob_list_for_parquet = container_client.list_blobs(
                name_starts_with=blob_name_starts_with + weather_date + '/' + hour)
            for blob in blob_list_for_parquet:
                if '.parquet' in blob.name:
                    forecast_files.append(blob.name)

    # Subset forecast files to last 5 days
    forecast_files = forecast_files[-1 * (historical_period * forecasts_per_day + 1):]

    local_path_blob_parquet = temp_dir.name + '/forecast.parquet'

    # Merge first 6h of every forecast into shared dataframe, excluding the most recent file, which is merged in entirety
    prev_accumulated = pd.DataFrame()

    account_url = os.environ["AccountUrl"]
    credential =  os.environ["AccountCredential"]

    for forecast_file_path in forecast_files:

        blob = BlobClient(account_url = account_url,
                          container_name=container_name,
                          blob_name=forecast_file_path,
                          credential=credential)

        with open(local_path_blob_parquet, "wb") as f:
            data = blob.download_blob()
            data.readinto(f)

        df = pd.read_parquet(local_path_blob_parquet)
        acc_col = [x for x in df.columns if x.endswith('_accum')]
        inst_col = [x.replace('_accum', '_instant') for x in acc_col]

        # transform accumulated columns to instant by storing the instant elements in a cloned dataframe
        df_accumulated = df[acc_col].reset_index()

        # mask for values to change (hours 0,2,3,5,6,8,9,11,12,14,15,17,18,20,21,23 without the first on in the file)
        change_mask = (((df_accumulated['level_2'].dt.hour - 1) % 3) > 0) & (
                    df_accumulated['level_2'] > df_accumulated['level_2'].min())

        # mask for values to subtract from values to change (hours 1,2,4,5,7,8,10,11,13,14,16,17,19,20,22,23)
        subtract_mask = ((df_accumulated['level_2'].dt.hour - 1) % 3) < 2

        # subtract previous column values for masked flows (change_mask and subtract_mask)
        df_accumulated.loc[change_mask, acc_col] = df_accumulated.loc[change_mask, acc_col].values - df_accumulated.loc[
            subtract_mask, acc_col].values
        df_accumulated.set_index(['level_0', 'level_1', 'level_2'], inplace=True)

        # rename column names from *_accum to *_instant
        df_accumulated.rename(columns={x: y for x, y in zip(acc_col, inst_col)}, inplace=True)

        # write the cloned instant columns to the original dataframe
        for col in inst_col:
            df[col] = df_accumulated[col]

        # update the instant values according to the 6h forecast of the previous forecast and store 6h forecast for next iteration
        df.update(prev_accumulated)
        prev_accumulated = df[inst_col][
            df.index.get_level_values(2) == df.index.get_level_values(2).min() + timedelta(hours=24 / forecasts_per_day)]

        # truncate forecast at 6h for all except for the last forecast file
        if forecast_file_path != forecast_files[-1]:
            df = df[
                df.index.get_level_values(2) < df.index.get_level_values(2).min() + timedelta(hours=24 / forecasts_per_day)]

        df_historical = df_historical.append(df)

    # sort the indices
    df_historical.sort_index(inplace=True)

    # temp location for parquet
    local_path_blob_historical_parquet = temp_dir.name + '/historical.parquet'

    # Save the historical file
    df_historical.to_parquet(local_path_blob_historical_parquet)

    #### UPLOAD TO AZURE ####
    # Create a blob client using the local file name as the name for the blob
    blob_client = blob_service_client.get_blob_client(container=container_name, blob=model_output_file_name_fmt.format(
        "nam.hirescombined.5days.parquet"))

    # Upload the created file to Azure Blob
    with open(local_path_blob_historical_parquet, "rb") as data:
        blob_client.upload_blob(data, overwrite=True)

    d_print("<<< Merging complete")


    d_print(">>> Starting image processing from Parquet to PNG")

    # Reset index and select coordinates closest to RDO
    plot_df = data_df_concat.reset_index().round({"level_0": 5, "level_1": 5})
    # plot_df = plot_df[(plot_df.level_0 == red_dog_NAM_lat) & (plot_df.level_1 == red_dog_NAM_lon)]
    # plot_df.set_index("level_2", inplace=True) # Datetime is the index
    plot_df = plot_df.groupby("level_2").mean()

    # Engineer features
    #    Temperature (K) -> (C) and (F)
    #    Snow depth (m)
    #    Precipitation rate (kg/m^2/s) -> (mm)
    #    Storm surface runoff (kg/m^2) -> instant
    plot_df["Temperature_0_instant_C"] = plot_df["Temperature_0_instant"] - 273.15
    plot_df["Temperature_0_instant_F"] = (plot_df["Temperature_0_instant"] - 273.15) * (9.0 / 5.0) + 32.0
    plot_df["Precipitation_rate_0_instant_mm"] = plot_df[
                                                     "Precipitation_rate_0_instant"] * 60.0 * 60.0  # 60 min/hr * 60 sec/min
    # SSRUN
    storm_surface_runoff_0_instant = []
    last_val = None
    for idx, row in plot_df.iterrows():
        ssrun = row["Storm_surface_runoff_0_accum"]
        if len(storm_surface_runoff_0_instant) == 0 or idx.hour % 3 == 1:
            storm_surface_runoff_0_instant.append(ssrun)
        else:
            storm_surface_runoff_0_instant.append(ssrun - last_val)
        last_val = ssrun
    plot_df["Storm_surface_runoff_0_instant"] = storm_surface_runoff_0_instant

    # Adjust timezone to AKST
    plot_df.index = plot_df.index.tz_localize(tz='UTC')
    plot_df.index = plot_df.index.tz_convert(tz="US/Alaska")
    plot_df.index = plot_df.index.tz_localize(None)

    # Create plots
    for column in ["Temperature_0_instant_C", "Snow_depth_0_instant", "Precipitation_rate_0_instant_mm",
                   "Storm_surface_runoff_0_instant"]:
        plt.rc('font', size=14)
        plt.rc('xtick', color="#999999")
        plt.rc('ytick', color="#999999")
        plt.rc('axes', edgecolor='#999999')

        plt.figure(figsize=(5, 2.5))
        plt.xticks(rotation=65)

        ax = plt.gca()
        ax.xaxis.set_major_formatter(mdates.DateFormatter('%m/%d %HH'))
        plt.plot_date(plot_df.index, plot_df[column], lw=2.0, ls="-", c="#336699")

        # temp location for image
        local_path_blob_image = temp_dir.name + '/image.png'

        # Save the image locally in the temp location specified above
        plt.savefig(local_path_blob_image, dpi=96, bbox_inches="tight", transparent=True)

        # Raw_Data location on Azure
        output_file_name = save_directory + "nam.t{:02d}z.".format(forecast_prod_hour) + column + ".tm00.png"

        # Model_Output location on Azure
        model_output_file_name = model_output_file_name_fmt.format(column + ".png")

        #### UPLOAD TO AZURE ####
        # Loop through the Blob locations above and store the image from temp location to Azure Blob respectively.
        # Create a blob client using the local file name as the name for the blob
        for blob_location in [output_file_name, model_output_file_name]:
            blob_client = blob_service_client.get_blob_client(container=container_name, blob=blob_location)

            # Upload the image from temp location to Azure Blob
            with open(local_path_blob_image, "rb") as data:
                blob_client.upload_blob(data, overwrite=True)

        plt.close()

    d_print("<<< Completed image processing")
Example #20
def rec(gstcod):
    try:
        # Read data from Blob Storage
        blob = BlobClient(
            account_url="https://recommendtest.blob.core.windows.net/",
            container_name="demo",
            blob_name="Guest Recommender.csv",
            credential=
            "eZ6rUu9dIV3I0ZZgyXDm2yAe+dJJ8m7C3YTlMuOUqHD7EWck9hLFVEamxJa9RQgIty81t32zNPjUie9Mt4rd9Q=="
        )

        data = blob.download_blob()
        df = pd.DataFrame([
            x.replace('\r', '').split(',')
            for x in data.content_as_text().split('\n')
        ])
        df.columns = df.iloc[0]
        df = df.drop(0)
        df = df.reset_index(drop=True)

        pd.set_option('display.max_columns', None)

        df['ARRDAT'] = pd.to_datetime(df['ARRDAT'], format='%Y%m%d')

        df['DEPDAT'] = pd.to_datetime(df['DEPDAT'], format='%Y%m%d')

        df['RMNTS'] = df['DEPDAT'] - df['ARRDAT']

        df['RMNTS'] = df['RMNTS'].astype('timedelta64[D]')

        df.drop([
            'REGNUB', 'FOLNUB', 'ROMNUB', 'ARRDAT', 'DEPDAT', 'TRC', 'COM',
            'CHQ', 'TRD', 'CAS', 'ADQ', 'CRD', 'ADC', 'BOH', 'ADV', 'POT'
        ],
                axis=1,
                inplace=True)

        # Cast columns to float, since the data is read from Blob Storage as strings
        df[[
            'TRS', 'PHT', 'SPC', 'BBD', 'GBR', 'JVH', 'MIT', 'EP', 'VCH',
            'RNT', 'ITV', 'CCF', 'PHC', 'TIP', 'EXB', 'STD', 'ART', 'SEC',
            'NOT', 'FAX', 'FST', 'BB', 'RMS', 'ITB', 'HTM', 'TRF', 'RTN',
            'LCA', 'PLT', 'PLC', 'BTB', 'FNC', 'SSG', 'LLT', 'LAU', 'IDD',
            'JBR', 'MIS', 'AP', 'FX'
        ]] = df[[
            'TRS', 'PHT', 'SPC', 'BBD', 'GBR', 'JVH', 'MIT', 'EP', 'VCH',
            'RNT', 'ITV', 'CCF', 'PHC', 'TIP', 'EXB', 'STD', 'ART', 'SEC',
            'NOT', 'FAX', 'FST', 'BB', 'RMS', 'ITB', 'HTM', 'TRF', 'RTN',
            'LCA', 'PLT', 'PLC', 'BTB', 'FNC', 'SSG', 'LLT', 'LAU', 'IDD',
            'JBR', 'MIS', 'AP', 'FX'
        ]].astype(float)

        df[[
            'TRS', 'PHT', 'SPC', 'BBD', 'GBR', 'JVH', 'MIT', 'EP', 'VCH',
            'RNT', 'ITV', 'CCF', 'PHC', 'TIP', 'EXB', 'STD', 'ART', 'SEC',
            'NOT', 'FAX', 'FST', 'BB', 'RMS', 'ITB', 'HTM', 'TRF', 'RTN',
            'LCA', 'PLT', 'PLC', 'BTB', 'FNC', 'SSG', 'LLT', 'LAU', 'IDD',
            'JBR', 'MIS', 'AP', 'FX'
        ]] = df[[
            'TRS', 'PHT', 'SPC', 'BBD', 'GBR', 'JVH', 'MIT', 'EP', 'VCH',
            'RNT', 'ITV', 'CCF', 'PHC', 'TIP', 'EXB', 'STD', 'ART', 'SEC',
            'NOT', 'FAX', 'FST', 'BB', 'RMS', 'ITB', 'HTM', 'TRF', 'RTN',
            'LCA', 'PLT', 'PLC', 'BTB', 'FNC', 'SSG', 'LLT', 'LAU', 'IDD',
            'JBR', 'MIS', 'AP', 'FX'
        ]].div(df.RMNTS, axis=0)

        df.loc[df['RMNTS'] == 0]

        df.isnull().sum(
        )  # This is to confirm that division by 0 has caused the NaN

        df.dropna(
            axis=0, how='any', inplace=True
        )  # if any value in the row is NaN, it will be removed. Else use how='all'

        df.drop(['SPC'], axis=1, inplace=True)

        df['BBD'].describe()

        df.drop(['BBD'], axis=1, inplace=True)

        df.drop(['GBR'], axis=1, inplace=True)

        # #### Find all columns of the dataframe that have all 0 values in it. (It would be better to drop these in one go, than one by one).

        zeros = df.loc[:, (
            df == 0).all()]  # 17 columns of the dataframe are fully 0s

        df.drop(columns=zeros, axis=1, inplace=True
                )  # zeros was assigned columns of the df in the previous cell

        mms = MinMaxScaler(
            [0, 5])  # The parameter passed the range of values min=0 and max=5

        df[[
            'PHT', 'JVH', 'MIT', 'EP', 'VCH', 'EXB', 'ART', 'SEC', 'FAX', 'BB',
            'RMS', 'ITB', 'HTM', 'TRF', 'RTN', 'BTB', 'FNC', 'LAU', 'MIS',
            'RMNTS'
        ]] = mms.fit_transform(df[[
            'PHT', 'JVH', 'MIT', 'EP', 'VCH', 'EXB', 'ART', 'SEC', 'FAX', 'BB',
            'RMS', 'ITB', 'HTM', 'TRF', 'RTN', 'BTB', 'FNC', 'LAU', 'MIS',
            'RMNTS'
        ]])

        df = df.round(decimals=2)

        df = df.drop(
            ['GSTNAM'], axis=1
        )  # Drop guest names to avoid confusion, since several guests share the same name

        df[df['GSTCOD'] == gstcod]

        #Grouping by guest code and returning the mean of rest of the columns
        df1 = df.groupby(['GSTCOD']).mean()
        df1.reset_index(inplace=True)
        df2 = df1.melt(id_vars=['GSTCOD'],
                       var_name='Services',
                       value_name='Rating')
        inputuser = df2[df2['GSTCOD'] == gstcod]
        inputuser[inputuser['Rating'] != 0]
        except_inputuser = df2[df2['GSTCOD'] != gstcod]
        except_inputuser
        usersubset = except_inputuser[
            except_inputuser['Services'].isin(inputuser['Services'].tolist())
            & (except_inputuser['Rating'] != 0)]
        userSubsetGroup = usersubset.groupby(['GSTCOD'])
        userSubsetGroup = sorted(userSubsetGroup,
                                 key=lambda x: len(x[1]),
                                 reverse=True)
        userSubsetGroup = userSubsetGroup[0:100]
        pearsonCorrelationDict = {}

        for name, group in userSubsetGroup:
            #Let's start by sorting the input and current user group so the values aren't mixed up later on
            group = group.sort_values(by='Services')
            inputuser = inputuser.sort_values(by='Services')
            #Get the N for the formula
            nRatings = len(group)
            #Get the ratings for the services that they both have in common
            temp_df = inputuser[inputuser['Services'].isin(
                group['Services'].tolist())]
            #And then store them in a temporary buffer variable in a list format to facilitate future calculations
            tempRatingList = temp_df['Rating'].tolist()
            #Let's also put the current user group ratings in a list format
            tempGroupList = group['Rating'].tolist()
            #Now let's calculate the pearson correlation between two users, so called, x and y
            Sxx = sum([i**2 for i in tempRatingList
                       ]) - pow(sum(tempRatingList), 2) / float(nRatings)
            Syy = sum([i**2 for i in tempGroupList
                       ]) - pow(sum(tempGroupList), 2) / float(nRatings)
            Sxy = sum(
                i * j for i, j in zip(tempRatingList, tempGroupList)
            ) - sum(tempRatingList) * sum(tempGroupList) / float(nRatings)
            #If the denominator is different than zero, then divide, else, 0 correlation.
            if Sxx != 0 and Syy != 0:
                pearsonCorrelationDict[name] = Sxy / sqrt(Sxx * Syy)
            else:
                pearsonCorrelationDict[name] = 0

        # Changed three lines due to cached result error
        pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict,
                                           orient='index')
        pearsonDF = pearsonDF.reset_index()
        pearsonDF = pearsonDF.rename(columns={
            'index': 'GSTCOD',
            0: 'similarityIndex'
        },
                                     inplace=False)

        topUsers = pearsonDF.sort_values(by='similarityIndex',
                                         ascending=False)[0:50]
        except_inputuser1 = except_inputuser[except_inputuser['Rating'] != 0]
        topUsersRating = topUsers.merge(except_inputuser1,
                                        left_on='GSTCOD',
                                        right_on='GSTCOD',
                                        how='inner')

        topUsersRating['weightedRating'] = topUsersRating[
            'similarityIndex'] * topUsersRating['Rating']
        tempTopUsersRating = topUsersRating.groupby('Services').sum()[[
            'similarityIndex', 'weightedRating'
        ]]
        tempTopUsersRating.columns = [
            'sum_similarityIndex', 'sum_weightedRating'
        ]
        tempTopUsersRating.head()

        recommendation_df = pd.DataFrame()
        recommendation_df[
            'weighted average recommendation score'] = tempTopUsersRating[
                'sum_weightedRating'] / tempTopUsersRating[
                    'sum_similarityIndex']
        recommendation_df['Services'] = tempTopUsersRating.index
        recommendation_df = recommendation_df.sort_values(
            by='weighted average recommendation score', ascending=False)
        recommendList = recommendation_df.values.tolist()

        return recommendList

    except Exception as e:
        return ('Error while passing GSTCOD : ' + str(e))
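
A hypothetical call, with a placeholder guest code; each row of the returned list holds the weighted score followed by the service name (on failure the function returns an error string instead):

# Hypothetical call with a placeholder guest code; print the top five services.
recommendations = rec('G000123')
if isinstance(recommendations, list):
    for score, service in recommendations[:5]:
        print(f'{service}: {score:.2f}')
else:
    print(recommendations)  # error message string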