Esempio n. 1
0
class DataSetCreator:
    """Create new, versioned DataSet documents through a ``DataSetHelper``.

    Each document records the build states, a UTC creation timestamp, a
    publication flag, a status and a version number. Unless local test mode
    is enabled, every version after the first is throttled: a new document
    is only created once ``TimeInMinsToWaitBeforeCreateNewDataSet`` minutes
    have passed since the latest one.
    """

    # Values of the UseLocalTestXMLFile setting that mean "disabled".
    # Settings arrive as strings, so "false" has to be recognised explicitly;
    # a plain truthiness test would treat it as enabled.
    _DISABLED_SETTING_VALUES = ("", "0", "false", "no", "off")

    def __init__(self):
        self.dsh = DataSetHelper()

    @classmethod
    def _use_local_test_xml_file(cls):
        """Return True when the UseLocalTestXMLFile setting is switched on.

        A missing setting, an empty string, or any of "0", "false", "no",
        "off" (case-insensitive) count as off; every other value counts as on.
        """
        raw_value = os.environ.get("UseLocalTestXMLFile") or ""
        return raw_value.strip().lower() not in cls._DISABLED_SETTING_VALUES

    def load_new_dataset_doc(self):
        """Build the next DataSet document and store it via the helper.

        Raises:
            DataSetTooEarlyError: when local test mode is off, the new
                version is not 1, and not enough time has passed since the
                latest document was created.
        """
        dataset_doc = self.get_next_dataset_doc()

        # TODO: apw: Ensure that UseLocalTestXMLFile is set to false in local.settings.json before going live.
        throttled = (
            not self._use_local_test_xml_file()
            and dataset_doc["version"] != 1
        )
        if (
            throttled
            and not self.has_enough_time_elaspsed_since_last_dataset_created()
        ):
            raise DataSetTooEarlyError

        self.dsh.create_item(dataset_doc)
        logging.info(f"Created new version {dataset_doc['version']} DataSet")

    def get_next_dataset_doc(self):
        """Return a dict describing the next, not yet stored, DataSet."""
        next_version_number = self.get_next_dataset_version_number()
        return {
            "builds": get_builds_value(),
            "created_at": datetime.now(timezone.utc).isoformat(),
            "is_published": False,
            "status": "in progress",
            "version": next_version_number,
        }

    def get_next_dataset_version_number(self):
        """Return 1 when no documents exist, else the latest version + 1."""
        if self.get_number_of_dataset_docs() == 0:
            return 1

        # int() guards against the helper handing the version back as a string.
        return int(self.dsh.get_latest_version_number()) + 1

    def get_number_of_dataset_docs(self):
        """Return how many documents the helper's query returns."""
        query = "SELECT * FROM c "
        return len(self.dsh.query_items(query))

    def has_enough_time_elaspsed_since_last_dataset_created(self):
        """Return True when the configured waiting time has passed.

        The waiting time is read, in minutes, from the
        ``TimeInMinsToWaitBeforeCreateNewDataSet`` environment variable
        (a missing variable raises ``KeyError``).
        """
        minutes_since_latest = get_time_in_minutes_since_given_datetime(
            self.get_datetime_of_latest_dataset_doc()
        )
        minutes_to_wait = int(
            os.environ["TimeInMinsToWaitBeforeCreateNewDataSet"]
        )
        return not (minutes_to_wait > minutes_since_latest)

    def get_datetime_of_latest_dataset_doc(self):
        """Return the ``created_at`` value of the latest document, converted.

        The first result of the version query is used; an empty result raises
        ``IndexError``.
        """
        max_version_number = self.dsh.get_latest_version_number()
        query = f"SELECT * FROM c WHERE c.version = {max_version_number}"
        latest_doc = self.dsh.query_items(query)[0]
        return convert_dt_str_to_dt_object(latest_doc["created_at"])
def main(msgin: func.QueueMessage, msgout: func.Out[str]):
    """Create the UKRLP lookups for the latest DataSet version.

    Steps carried out by this queue-triggered function:

    * Loads the HESA XML, either from the local test file (when the
      ``UseLocalTestXMLFile`` setting is enabled) or from blob storage.

    * Loads the Welsh universities CSV from blob storage.

    * Sets the "institutions" status to "in progress".

    * Runs ``LookupCreator.create_ukrlp_lookups`` for the latest version.

    * Appends the UKPRNs for which UKRLP returned no info to the incoming
      message and forwards the result to ``msgout``.

    On any exception a failure email is sent, the error is logged and the
    exception is re-raised.

    Args:
        msgin: The queue message that triggered the function.
        msgout: Output binding receiving the incoming message body plus the
            list of UKPRNs without UKRLP info.
    """
    # TODO: apw: Ensure that UseLocalTestXMLFile is set to false in local.settings.json before going live.
    # Settings are strings, so "false" must be parsed explicitly; a plain
    # truthiness test would treat it as enabled.
    use_local_test_XML_file = (
        os.environ.get("UseLocalTestXMLFile") or ""
    ).strip().lower() not in ("", "0", "false", "no", "off")

    msgerror = ""

    mail_helper = MailHelper()
    environment = os.environ["Environment"]

    dsh = DataSetHelper()

    try:
        logging.info("CreateUkrlp message queue triggered")

        function_start_datetime = datetime.today().strftime(
            "%d-%m-%Y %H:%M:%S")

        logging.info(
            f"CreateUkrlp function started on {function_start_datetime}")

        blob_helper = BlobHelper()

        storage_container_name = os.environ["AzureStorageHesaContainerName"]
        storage_blob_name = os.environ["AzureStorageHesaBlobName"]

        if use_local_test_XML_file:
            # The context manager closes the file handle even if read() fails.
            with open(os.environ["LocalTestXMLFile"], "r") as mock_xml_source_file:
                xml_string = mock_xml_source_file.read()
        else:
            xml_string = blob_helper.get_str_file(storage_container_name,
                                                  storage_blob_name)

        version = dsh.get_latest_version_number()

        storage_container_name = os.environ[
            "AzureStorageWelshUnisContainerName"]
        storage_blob_name = os.environ["AzureStorageWelshUnisBlobName"]

        csv_string = blob_helper.get_str_file(storage_container_name,
                                              storage_blob_name)

        # Parse the xml and create the lookups

        logging.info(f"using version number: {version}")
        dsh.update_status("institutions", "in progress")
        lookup_creator = LookupCreator(xml_string, csv_string, version)
        ukrlp_no_info_list = lookup_creator.create_ukrlp_lookups()

        msgerror += f"\n\nUKRLP did not return info for the following {len(ukrlp_no_info_list)} ukprn(s):\n"
        msgerror += "".join(f"\t{ukprn}\n" for ukprn in ukrlp_no_info_list)

        function_end_datetime = datetime.today().strftime("%d-%m-%Y %H:%M:%S")

        logging.info(
            f"CreateUkrlp successfully finished on {function_end_datetime}")

        msgout.set(msgin.get_body().decode("utf-8") + msgerror)

    except Exception:
        # Unexpected exception. One timestamp feeds both formats so the mail
        # body and subject cannot disagree around midnight.
        failed_at = datetime.today()
        function_fail_datetime = failed_at.strftime("%d-%m-%Y %H:%M:%S")
        function_fail_date = failed_at.strftime("%d.%m.%Y")

        mail_helper.send_message(
            f"Automated data import failed on {function_fail_datetime} at CreateUkrlp"
            + msgin.get_body().decode("utf-8") + msgerror,
            f"Data Import {environment} - {function_fail_date} - Failed")

        logging.error(f"CreateUkrlp failed on {function_fail_datetime}")
        logging.error(traceback.format_exc())

        # Raise to Azure; a bare raise keeps the original traceback intact.
        raise
def main(msgin: func.QueueMessage, msgout: func.Out[str]):
    """Create the institution documents for the latest DataSet version.

    Steps carried out by this queue-triggered function:

    * Loads the HESA XML, either from the local test file (when the
      ``UseLocalTestXMLFile`` setting is enabled) or from blob storage.

    * Sets the "institutions" status to "in progress".

    * Runs ``InstitutionDocs.create_institution_docs`` for the latest
      version and sets the "institutions" status to "succeeded".

    * Forwards the incoming message body to ``msgout``.

    On failure the "institutions" status is set to "failed", a failure email
    is sent, the error is logged and an exception is raised to the host.

    Args:
        msgin: The queue message that triggered the function.
        msgout: Output binding receiving the incoming message body.

    Raises:
        Exception: with a fixed explanatory message when
            ``exceptions.StopEtlPipelineWarningException`` was caught;
            otherwise the original exception is re-raised.
    """
    # TODO: apw: Ensure that UseLocalTestXMLFile is set to false in local.settings.json before going live.
    # Settings are strings, so "false" must be parsed explicitly; a plain
    # truthiness test would treat it as enabled.
    use_local_test_XML_file = (
        os.environ.get("UseLocalTestXMLFile") or ""
    ).strip().lower() not in ("", "0", "false", "no", "off")

    msgerror = ""

    mail_helper = MailHelper()
    environment = os.environ["Environment"]

    dsh = DataSetHelper()

    try:
        logging.info("CreateInst message queue triggered\n")

        function_start_datetime = datetime.today().strftime(
            "%d-%m-%Y %H:%M:%S")

        logging.info(
            f"CreateInst function started on {function_start_datetime}")

        # NOTE(review): earlier comments here described the incoming XML blob
        # as gzip-compressed (a workaround for Python Functions not being
        # triggered correctly by large blobs). No decompression happens in
        # this function; presumably BlobHelper.get_str_file deals with it -
        # confirm.

        blob_helper = BlobHelper()

        storage_container_name = os.environ["AzureStorageHesaContainerName"]
        storage_blob_name = os.environ["AzureStorageHesaBlobName"]

        if use_local_test_XML_file:
            # The context manager closes the file handle even if read() fails.
            with open(os.environ["LocalTestXMLFile"], "r") as mock_xml_source_file:
                xml_string = mock_xml_source_file.read()
        else:
            xml_string = blob_helper.get_str_file(storage_container_name,
                                                  storage_blob_name)

        version = dsh.get_latest_version_number()

        # LOADING - extract data and load JSON Documents

        logging.info(f"using version number: {version}")
        dsh.update_status("institutions", "in progress")

        inst_docs = InstitutionDocs(xml_string, version)
        inst_docs.create_institution_docs()
        dsh.update_status("institutions", "succeeded")

        function_end_datetime = datetime.today().strftime("%d-%m-%Y %H:%M:%S")

        logging.info(
            f"CreateInst successfully finished on {function_end_datetime}")

        msgout.set(msgin.get_body().decode("utf-8") + msgerror)

    except exceptions.StopEtlPipelineWarningException:

        # A WARNING is raised while the function is running and
        # StopEtlPipelineOnWarning=True. For example, the incoming raw XML
        # is not valid against its XSD
        error_message = (
            "A WARNING has been encountered while the function is running. "
            "The function will be stopped since StopEtlPipelineOnWarning is "
            "set to TRUE in the Application Settings.")
        logging.error(error_message)

        # One timestamp feeds both formats so the mail body and subject
        # cannot disagree around midnight.
        failed_at = datetime.today()
        function_fail_datetime = failed_at.strftime("%d-%m-%Y %H:%M:%S")
        function_fail_date = failed_at.strftime("%d.%m.%Y")

        mail_helper.send_message(
            f"Automated data import failed on {function_fail_datetime} at CreateInst"
            + msgin.get_body().decode("utf-8") + msgerror,
            f"Data Import {environment} - {function_fail_date} - Failed")

        logging.error(f"CreateInst failed on {function_fail_datetime}")
        dsh.update_status("institutions", "failed")
        raise Exception(error_message)

    except Exception:
        # Unexpected exception
        dsh.update_status("institutions", "failed")

        failed_at = datetime.today()
        function_fail_datetime = failed_at.strftime("%d-%m-%Y %H:%M:%S")
        function_fail_date = failed_at.strftime("%d.%m.%Y")

        mail_helper.send_message(
            f"Automated data import failed on {function_fail_datetime} at CreateInst"
            + msgin.get_body().decode("utf-8") + msgerror,
            f"Data Import {environment} - {function_fail_date} - Failed")

        logging.error(f"CreateInst failed on {function_fail_datetime}",
                      exc_info=True)

        # Raise to Azure; a bare raise keeps the original traceback intact.
        raise
def main(msgin: func.QueueMessage):
    """Rebuild the course search index for the latest DataSet version.

    Triggered by a queue message. The function sets the "search" status to
    "in progress", builds the synonyms and the index, loads the courses of
    the latest version into the index and sets "search" to "succeeded".
    When ``dsh.have_all_builds_succeeded()`` is true it builds the
    institutions, subjects and version JSON files and sets "root" to
    "succeeded"; otherwise it sets "root" to "failed". A completion email is
    then sent.

    On any exception both "search" and "root" are set to "failed", a failure
    email is sent, the error is logged and the exception is re-raised.

    Args:
        msgin: The queue message that triggered the function; its body is
            appended to the notification emails.
    """
    timestamp_format = "%d-%m-%Y %H:%M:%S"
    date_format = "%d.%m.%Y"

    msgerror = ""
    mail_helper = MailHelper()
    environment = os.environ["Environment"]

    dsh = DataSetHelper()

    try:
        logging.info("CourseSearchBuilder message queue triggered \n")

        started_at = datetime.today().strftime(timestamp_format)
        logging.info(f"CourseSearchBuilder function started on {started_at}")

        # Connection settings for the search service.
        api_key = os.environ["SearchAPIKey"]
        search_url = os.environ["SearchURL"]
        api_version = os.environ["AzureSearchAPIVersion"]

        version = dsh.get_latest_version_number()

        dsh.update_status("search", "in progress")

        search.build_synonyms(search_url, api_key, api_version)
        search.build_index(search_url, api_key, api_version, version)

        course_docs = utils.get_courses_by_version(version)
        course_count = len(course_docs)

        logging.info(
            "attempting to load courses to azure search\n"
            "                        "
            f"number_of_courses: {course_count}\n"
        )

        search.load_index(
            search_url, api_key, api_version, version, course_docs
        )
        dsh.update_status("search", "succeeded")
        # Drop the reference to the course list once it has been loaded.
        course_docs = None

        if not dsh.have_all_builds_succeeded():
            dsh.update_status("root", "failed")
        else:
            build_institutions_json_files()
            build_subjects_json_file()
            build_version_json_file()

            dsh.update_status("root", "succeeded")

        finished_at = datetime.today().strftime(timestamp_format)
        finished_on = datetime.today().strftime(date_format)

        completion_body = (
            f"Automated data import completed on {finished_at}"
            + msgin.get_body().decode("utf-8")
            + msgerror
        )
        completion_subject = (
            f"Data Import {environment} - {finished_on} - Completed"
        )
        mail_helper.send_message(completion_body, completion_subject)

        logging.info(
            f"CourseSearchBuilder successfully finished on {finished_at}"
        )

    except Exception as err:
        # Unexpected exception: mark both statuses as failed before notifying.
        dsh.update_status("search", "failed")
        dsh.update_status("root", "failed")

        failed_at = datetime.today().strftime(timestamp_format)
        failed_on = datetime.today().strftime(date_format)

        failure_body = (
            f"Automated data import failed on {failed_at} at CourseSearchBuilder"
            + msgin.get_body().decode("utf-8")
            + msgerror
        )
        failure_subject = f"Data Import {environment} - {failed_on} - Failed"
        mail_helper.send_message(failure_body, failure_subject)

        logging.error(f"CourseSearchBuilder failed on {failed_at}")
        logging.error(traceback.format_exc())

        # Raise to Azure
        raise err