Example #1
0
 def test_initialisation(self, mock_get_cosmos_client):
     """Constructing a DataSetHelper with a mocked cosmos client must not raise."""
     mock_get_cosmos_client.return_value = mock.MagicMock()
     try:
         DataSetHelper()
     # Catch Exception rather than everything, so KeyboardInterrupt and
     # SystemExit still propagate instead of being reported as a failure.
     except Exception:
         self.fail(
             "DataSetHelper initialisation raised unexpected Exception")
Example #2
0
def build_subjects_json_file():
    """Build the subjects JSON file for the latest dataset version and upload it.

    Queries the subjects collection for every document whose ``version``
    equals the latest dataset version, converts each one with
    ``get_subject_entry``, sorts the entries by their ``"english_name"`` key
    and writes the result (JSON, 4-space indent, UTF-8) to blob storage.

    The container and blob names are read from the
    ``AzureStorageJSONFilesContainerName`` and
    ``AzureStorageSubjectsJSONFileBlobName`` environment variables.

    Raises:
        KeyError: if either storage environment variable is missing, or a
            subject entry has no ``"english_name"`` key.
    """
    version = DataSetHelper().get_latest_version_number()
    blob_helper = BlobHelper()

    cosmos_db_client = get_cosmos_client()
    collection_link = get_collection_link("AzureCosmosDbDatabaseId",
                                          "AzureCosmosDbSubjectsCollectionId")

    query = f"SELECT * from c where c.version = {version}"
    options = {"enableCrossPartitionQuery": True}

    subjects = sorted(
        (get_subject_entry(subject)
         for subject in cosmos_db_client.QueryItems(
             collection_link, query, options)),
        key=lambda entry: entry["english_name"])

    # json.dumps produces the same text as json.dump into a StringIO, without
    # a buffer that has to be closed on every exit path.
    encoded_file = json.dumps(subjects, indent=4).encode('utf-8')

    storage_container_name = os.environ["AzureStorageJSONFilesContainerName"]
    storage_blob_name = os.environ["AzureStorageSubjectsJSONFileBlobName"]
    blob_helper.write_stream_file(storage_container_name, storage_blob_name,
                                  encoded_file)
Example #3
0
    def test_update_status(self, mock_get_cosmos_client):
        """update_status must upsert the latest doc with the new build status."""
        helper = DataSetHelper()

        # The latest dataset document starts with the "courses" build pending.
        helper.get_latest_doc = mock.MagicMock(return_value={
            "version": 3,
            "builds": {"courses": {"status": "pending"}},
            "updated_at": "dave",
        })
        helper.cosmos_client.UpsertItem = mock.MagicMock()

        helper.update_status("courses", "in progress", "dave")

        # Exactly one upsert, against the dataset collection, carrying the
        # same document with only the build status changed.
        helper.cosmos_client.UpsertItem.assert_called_once_with(
            "dbs/test-db-id/colls/test-dataset-collection-id",
            {
                "version": 3,
                "builds": {"courses": {"status": "in progress"}},
                "updated_at": "dave",
            })
Example #4
0
    def test_have_all_builds_succeeded_with_one_pending(
            self, mock_get_cosmos_client):
        """A single pending build makes have_all_builds_succeeded return falsy."""
        helper = DataSetHelper()

        # Every build has succeeded except "courses".
        status_by_build = {
            "courses": "pending",
            "institutions": "succeeded",
            "search": "succeeded",
            "subjects": "succeeded",
        }
        latest_doc = {
            "version": 3,
            "builds": {
                build: {"status": status}
                for build, status in status_by_build.items()
            },
        }

        helper.get_latest_doc = mock.MagicMock(return_value=latest_doc)
        self.assertFalse(helper.have_all_builds_succeeded())
Example #5
0
class DataSetCreator:
    """Creates the next DataSet document through a DataSetHelper."""

    def __init__(self):
        """Create the DataSetHelper used for all dataset queries and writes."""
        self.dsh = DataSetHelper()

    def load_new_dataset_doc(self):
        """Build the next dataset document and store it.

        Unless the ``UseLocalTestXMLFile`` environment variable is set, any
        version other than 1 is only created once enough time has elapsed
        since the previous dataset document was created.

        Raises:
            DataSetTooEarlyError: if the waiting period has not yet elapsed.
        """
        dataset_doc = self.get_next_dataset_doc()

        # TODO: apw: Ensure that UseLocalTestXMLFile is set to false in local.settings.json before going live.
        # NOTE(review): os.environ.get returns a string, so any non-empty
        # value (including "false") is truthy here - confirm this is intended.
        use_local_test_XML_file = os.environ.get('UseLocalTestXMLFile')
        if (not use_local_test_XML_file
                and dataset_doc["version"] != 1
                and not self.has_enough_time_elaspsed_since_last_dataset_created()):
            raise DataSetTooEarlyError

        self.dsh.create_item(dataset_doc)
        logging.info(f"Created new version {dataset_doc['version']} DataSet")

    def get_next_dataset_doc(self):
        """Return a new, unpublished dataset document for the next version.

        The document is marked "in progress" and ``created_at`` is the
        current UTC time in ISO 8601 format.
        """
        next_version_number = self.get_next_dataset_version_number()
        return {
            "builds": get_builds_value(),
            "created_at": datetime.now(timezone.utc).isoformat(),
            "is_published": False,
            "status": "in progress",
            "version": next_version_number,
        }

    def get_next_dataset_version_number(self):
        """Return 1 when no dataset docs exist, else the latest version + 1."""
        if self.get_number_of_dataset_docs() == 0:
            return 1

        # The helper's value is passed through int() before incrementing.
        return int(self.dsh.get_latest_version_number()) + 1

    def get_number_of_dataset_docs(self):
        """Return how many documents the helper's query returns in total."""
        query = "SELECT * FROM c "
        return len(self.dsh.query_items(query))

    def has_enough_time_elaspsed_since_last_dataset_created(self):
        """Return True once the configured waiting period has elapsed.

        The waiting period, in minutes, is read from the
        ``TimeInMinsToWaitBeforeCreateNewDataSet`` environment variable and
        compared with the age of the latest dataset document.
        """
        dt_of_latest_dataset_doc = self.get_datetime_of_latest_dataset_doc()
        minutes_since_latest_doc = get_time_in_minutes_since_given_datetime(
            dt_of_latest_dataset_doc
        )
        minutes_to_wait = int(
            os.environ["TimeInMinsToWaitBeforeCreateNewDataSet"]
        )
        return minutes_to_wait <= minutes_since_latest_doc

    def get_datetime_of_latest_dataset_doc(self):
        """Return ``created_at`` of the latest-version doc, converted from its string form."""
        max_version_number = self.dsh.get_latest_version_number()
        query = f"SELECT * FROM c WHERE c.version = {max_version_number}"
        latest_doc = self.dsh.query_items(query)[0]
        return convert_dt_str_to_dt_object(latest_doc["created_at"])
def build_version_json_file():
    """Write ``version.json`` containing the latest dataset version to blob storage.

    The file is a JSON object with a single ``"version"`` key (4-space
    indent, UTF-8). The container name is read from the
    ``AzureStorageJSONFilesContainerName`` environment variable.

    Raises:
        KeyError: if the container environment variable is missing.
    """
    version = DataSetHelper().get_latest_version_number()

    blob_helper = BlobHelper()

    # json.dumps avoids the intermediate StringIO, which was never closed.
    encoded_file = json.dumps({"version": version}, indent=4).encode('utf-8')

    storage_container_name = os.environ["AzureStorageJSONFilesContainerName"]
    blob_helper.write_stream_file(storage_container_name, "version.json",
                                  encoded_file)
def build_institutions_json_files():
    """Build and upload the English and Welsh institutions JSON files.

    Queries the institutions collection for every document whose ``version``
    equals the latest dataset version, then writes two blobs:

    * English: one entry per institution whose ``pub_ukprn_name`` is a string.
    * Welsh: one entry per institution, using ``pub_ukprn_welsh_name`` when
      it is a string and falling back to ``pub_ukprn_name`` otherwise;
      institutions where neither is a string are skipped.

    The English file is uploaded before the Welsh entries are built.

    Raises:
        KeyError: if a required environment variable or document key is
            missing.
    """
    version = DataSetHelper().get_latest_version_number()
    blob_helper = BlobHelper()

    cosmos_db_client = get_cosmos_client()
    collection_link = get_collection_link(
        "AzureCosmosDbDatabaseId", "AzureCosmosDbInstitutionsCollectionId")

    query = f"SELECT * from c where c.version = {version}"
    options = {"enableCrossPartitionQuery": True}

    institution_list = list(
        cosmos_db_client.QueryItems(collection_link, query, options))

    # English file.
    institutions = []
    for val in institution_list:
        institution = val["institution"]
        if isinstance(institution["pub_ukprn_name"], str):
            institutions.append(get_inst_entry(institution["pub_ukprn_name"]))

    _upload_sorted_institutions(
        blob_helper, institutions,
        "AzureStorageInstitutionsENJSONFileBlobName")

    # Welsh file: prefer the Welsh name, fall back to the English one.
    institutions = []
    for val in institution_list:
        institution = val["institution"]
        inst_name = institution["pub_ukprn_name"]
        inst_welsh_name = institution["pub_ukprn_welsh_name"]

        if isinstance(inst_welsh_name, str):
            institutions.append(get_inst_entry(inst_welsh_name))
        elif isinstance(inst_name, str):
            institutions.append(get_inst_entry(inst_name))

    _upload_sorted_institutions(
        blob_helper, institutions,
        "AzureStorageInstitutionsCYJSONFileBlobName")


def _upload_sorted_institutions(blob_helper, institutions, blob_name_setting):
    """Sort entries by "order_by_name", serialise them as JSON and upload.

    ``blob_name_setting`` is the name of the environment variable that holds
    the target blob name; the container name always comes from
    ``AzureStorageJSONFilesContainerName``.
    """
    institutions.sort(key=lambda entry: entry["order_by_name"])

    # json.dumps needs no StringIO buffer, so nothing is left open if the
    # upload raises.
    encoded_file = json.dumps(institutions, indent=4).encode('utf-8')

    storage_container_name = os.environ["AzureStorageJSONFilesContainerName"]
    storage_blob_name = os.environ[blob_name_setting]
    blob_helper.write_stream_file(storage_container_name, storage_blob_name,
                                  encoded_file)
def main(msgin: func.QueueMessage, msgout: func.Out[str]):
    """Create the UKRLP lookup tables for later use.

    This queue-triggered Azure Function carries out the following steps:

    * Reads the HESA XML, either from the local test file (when the
      ``UseLocalTestXMLFile`` environment variable is set) or from blob
      storage.

    * Reads the Welsh universities CSV from blob storage.

    * Builds the UKRLP lookups with ``LookupCreator`` for the latest dataset
      version.

    * Forwards the incoming message body on ``msgout``, followed by the list
      of UKPRNs for which UKRLP returned no information.

    On any exception a failure email is sent, the error is logged and the
    exception is re-raised to Azure.
    """
    # TODO: apw: Ensure that UseLocalTestXMLFile is set to false in local.settings.json before going live.
    use_local_test_XML_file = os.environ.get('UseLocalTestXMLFile')

    msgerror = ""

    mail_helper = MailHelper()
    environment = os.environ["Environment"]

    dsh = DataSetHelper()

    try:
        logging.info("CreateUkrlp message queue triggered")

        function_start_datetime = datetime.today().strftime(
            "%d-%m-%Y %H:%M:%S")

        logging.info(
            f"CreateUkrlp function started on {function_start_datetime}")

        blob_helper = BlobHelper()

        storage_container_name = os.environ["AzureStorageHesaContainerName"]
        storage_blob_name = os.environ["AzureStorageHesaBlobName"]

        if use_local_test_XML_file:
            # Context manager so the file handle is closed once read.
            with open(os.environ["LocalTestXMLFile"],
                      "r") as mock_xml_source_file:
                xml_string = mock_xml_source_file.read()
        else:
            xml_string = blob_helper.get_str_file(storage_container_name,
                                                  storage_blob_name)

        version = dsh.get_latest_version_number()

        storage_container_name = os.environ[
            "AzureStorageWelshUnisContainerName"]
        storage_blob_name = os.environ["AzureStorageWelshUnisBlobName"]

        csv_string = blob_helper.get_str_file(storage_container_name,
                                              storage_blob_name)

        # Parse the xml and create the lookups

        logging.info(f"using version number: {version}")
        # NOTE(review): this sets the "institutions" build status from the
        # UKRLP function - confirm that is the intended build key.
        dsh.update_status("institutions", "in progress")
        lookup_creator = LookupCreator(xml_string, csv_string, version)
        ukrlp_no_info_list = lookup_creator.create_ukrlp_lookups()

        msgerror += f"\n\nUKRLP did not return info for the following {len(ukrlp_no_info_list)} ukprn(s):\n"
        msgerror += "".join(f"\t{ukprn}\n" for ukprn in ukrlp_no_info_list)

        function_end_datetime = datetime.today().strftime("%d-%m-%Y %H:%M:%S")

        logging.info(
            f"CreateUkrlp successfully finished on {function_end_datetime}")

        msgout.set(msgin.get_body().decode("utf-8") + msgerror)

    except Exception:
        # Unexpected exception
        function_fail_datetime = datetime.today().strftime("%d-%m-%Y %H:%M:%S")
        function_fail_date = datetime.today().strftime("%d.%m.%Y")

        mail_helper.send_message(
            f"Automated data import failed on {function_fail_datetime} at CreateUkrlp"
            + msgin.get_body().decode("utf-8") + msgerror,
            f"Data Import {environment} - {function_fail_date} - Failed")

        logging.error(f"CreateUkrlp failed on {function_fail_datetime}")
        logging.error(traceback.format_exc())

        # Raise to Azure; a bare raise re-raises the active exception as-is.
        raise
def main(msgin: func.QueueMessage, msgout: func.Out[str]):
    """Create the institution documents for the latest dataset version.

    This queue-triggered Azure Function reads the HESA XML (from the local
    test file when the ``UseLocalTestXMLFile`` environment variable is set,
    otherwise from blob storage), builds the institution documents with
    ``InstitutionDocs`` and records the "institutions" build status on the
    dataset. On success the incoming message body is forwarded on ``msgout``.

    On failure the "institutions" build is marked "failed", a failure email
    is sent and an exception is raised to Azure.

    Raises:
        Exception: a generic exception carrying an explanatory message when
            ``StopEtlPipelineWarningException`` was raised; any other
            exception is re-raised unchanged.
    """
    # TODO: apw: Ensure that UseLocalTestXMLFile is set to false in local.settings.json before going live.
    use_local_test_XML_file = os.environ.get('UseLocalTestXMLFile')

    msgerror = ""

    mail_helper = MailHelper()
    environment = os.environ["Environment"]

    dsh = DataSetHelper()

    try:
        logging.info("CreateInst message queue triggered\n")

        function_start_datetime = datetime.today().strftime(
            "%d-%m-%Y %H:%M:%S")

        logging.info(
            f"CreateInst function started on {function_start_datetime}")

        # DECOMPRESSION - Decompress the compressed HESA XML
        # The XML blob provided to this function will be gzip compressed.
        # This is a work around for a limitation discovered in Azure,
        # where Functions written in Python do not get triggered correctly
        # with large blobs. Tests showed this is not a limitation with
        # Functions written in C#.
        # NOTE(review): no decompression is visible in this function itself;
        # presumably get_str_file handles it - confirm.

        blob_helper = BlobHelper()

        storage_container_name = os.environ["AzureStorageHesaContainerName"]
        storage_blob_name = os.environ["AzureStorageHesaBlobName"]

        if use_local_test_XML_file:
            # Context manager so the file handle is closed once read.
            with open(os.environ["LocalTestXMLFile"],
                      "r") as mock_xml_source_file:
                xml_string = mock_xml_source_file.read()
        else:
            xml_string = blob_helper.get_str_file(storage_container_name,
                                                  storage_blob_name)

        version = dsh.get_latest_version_number()

        # LOADING - extract data and load JSON Documents

        logging.info(f"using version number: {version}")
        dsh.update_status("institutions", "in progress")

        inst_docs = InstitutionDocs(xml_string, version)
        inst_docs.create_institution_docs()
        dsh.update_status("institutions", "succeeded")

        function_end_datetime = datetime.today().strftime("%d-%m-%Y %H:%M:%S")

        logging.info(
            f"CreateInst successfully finished on {function_end_datetime}")

        msgout.set(msgin.get_body().decode("utf-8") + msgerror)

    except exceptions.StopEtlPipelineWarningException as warning_exc:

        # A WARNING is raised while the function is running and
        # StopEtlPipelineOnWarning=True. For example, the incoming raw XML
        # is not valid against its XSD
        error_message = (
            "A WARNING has been encountered while the function is running. "
            "The function will be stopped since StopEtlPipelineOnWarning is "
            "set to TRUE in the Application Settings.")
        logging.error(error_message)

        function_fail_datetime = datetime.today().strftime("%d-%m-%Y %H:%M:%S")
        function_fail_date = datetime.today().strftime("%d.%m.%Y")

        mail_helper.send_message(
            f"Automated data import failed on {function_fail_datetime} at CreateInst"
            + msgin.get_body().decode("utf-8") + msgerror,
            f"Data Import {environment} - {function_fail_date} - Failed")

        logging.error(f"CreateInst failed on {function_fail_datetime}")
        dsh.update_status("institutions", "failed")
        # Chain explicitly so the original warning is kept as the cause.
        raise Exception(error_message) from warning_exc

    except Exception:
        # Unexpected exception
        dsh.update_status("institutions", "failed")

        function_fail_datetime = datetime.today().strftime("%d-%m-%Y %H:%M:%S")
        function_fail_date = datetime.today().strftime("%d.%m.%Y")

        mail_helper.send_message(
            f"Automated data import failed on {function_fail_datetime} at CreateInst"
            + msgin.get_body().decode("utf-8") + msgerror,
            f"Data Import {environment} - {function_fail_date} - Failed")

        logging.error(f"CreateInst failed on {function_fail_datetime}",
                      exc_info=True)

        # Raise to Azure; a bare raise re-raises the active exception as-is.
        raise
Example #10
0
 def __init__(self):
     """Create the DataSetHelper instance kept on this object as ``self.dsh``."""
     self.dsh = DataSetHelper()
def main(msgin: func.QueueMessage):
    """Build the course search index and, if all builds succeeded, the JSON files.

    This queue-triggered Azure Function:

    * Builds the search synonyms and index, then loads the courses of the
      latest dataset version into the index, recording the "search" build
      status on the dataset.

    * When every build has succeeded, builds the institutions, subjects and
      version JSON files and marks the "root" status "succeeded"; otherwise
      marks "root" as "failed".

    * Sends a completion email.

    On any exception both "search" and "root" are marked "failed", a failure
    email is sent, the error is logged and the exception is re-raised to
    Azure.
    """
    msgerror = ""
    mail_helper = MailHelper()
    environment = os.environ["Environment"]

    dsh = DataSetHelper()

    try:

        logging.info(
            "CourseSearchBuilder message queue triggered \n"
        )

        function_start_datetime = datetime.today().strftime("%d-%m-%Y %H:%M:%S")

        logging.info(
            f"CourseSearchBuilder function started on {function_start_datetime}"
        )

        api_key = os.environ["SearchAPIKey"]
        search_url = os.environ["SearchURL"]
        api_version = os.environ["AzureSearchAPIVersion"]

        version = dsh.get_latest_version_number()

        dsh.update_status("search", "in progress")

        search.build_synonyms(search_url, api_key, api_version)

        search.build_index(search_url, api_key, api_version, version)

        courses = utils.get_courses_by_version(version)

        number_of_courses = len(courses)

        logging.info(
            f"attempting to load courses to azure search\n\
                        number_of_courses: {number_of_courses}\n"
        )

        search.load_index(search_url, api_key, api_version, version, courses)
        dsh.update_status("search", "succeeded")
        # Drop the reference to the course list before building the JSON files.
        courses = None

        if dsh.have_all_builds_succeeded():
            build_institutions_json_files()
            build_subjects_json_file()
            build_version_json_file()

            dsh.update_status("root", "succeeded")
        else:
            dsh.update_status("root", "failed")

        function_end_datetime = datetime.today().strftime("%d-%m-%Y %H:%M:%S")
        function_end_date = datetime.today().strftime("%d.%m.%Y")

        mail_helper.send_message(
            f"Automated data import completed on {function_end_datetime}" + msgin.get_body().decode("utf-8") + msgerror,
            f"Data Import {environment} - {function_end_date} - Completed"
        )

        logging.info(
            f"CourseSearchBuilder successfully finished on {function_end_datetime}"
        )

    except Exception:
        # Unexpected exception
        # NOTE(review): "search" is marked failed even when the failure
        # happened after the search build succeeded - confirm this is intended.
        dsh.update_status("search", "failed")
        dsh.update_status("root", "failed")

        function_fail_datetime = datetime.today().strftime("%d-%m-%Y %H:%M:%S")
        function_fail_date = datetime.today().strftime("%d.%m.%Y")

        mail_helper.send_message(
            f"Automated data import failed on {function_fail_datetime} at CourseSearchBuilder" + msgin.get_body().decode("utf-8") + msgerror,
            f"Data Import {environment} - {function_fail_date} - Failed"
        )

        logging.error(f"CourseSearchBuilder failed on {function_fail_datetime}")
        logging.error(traceback.format_exc())

        # Raise to Azure; a bare raise re-raises the active exception as-is.
        raise