def test_initialisation(self, mock_get_cosmos_client):
    mock_get_cosmos_client.return_value = mock.MagicMock()
    try:
        DataSetHelper()
    except Exception:
        self.fail("DataSetHelper initialisation raised unexpected Exception")
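# NOTE: the test methods in this section take a mock_get_cosmos_client
# argument, which implies the test class patches get_cosmos_client where
# DataSetHelper imports it. A minimal sketch of that assumed scaffolding;
# the module path "dataset_helper" is an assumption for illustration.
import unittest
from unittest import mock


@mock.patch("dataset_helper.get_cosmos_client")
class DataSetHelperTestCase(unittest.TestCase):
    # test_initialisation, test_update_status and the other tests shown in
    # this section would live inside a class decorated like this, each
    # receiving the patched get_cosmos_client as its second argument.
    ...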
def build_subjects_json_file():
    version = DataSetHelper().get_latest_version_number()
    blob_helper = BlobHelper()

    cosmos_db_client = get_cosmos_client()
    collection_link = get_collection_link(
        "AzureCosmosDbDatabaseId", "AzureCosmosDbSubjectsCollectionId")
    query = f"SELECT * from c where c.version = {version}"
    options = {"enableCrossPartitionQuery": True}
    subjects_list = list(
        cosmos_db_client.QueryItems(collection_link, query, options))

    subjects_file = io.StringIO()
    subjects = []
    for subject in subjects_list:
        subject_entry = get_subject_entry(subject)
        subjects.append(subject_entry)
    subjects.sort(key=lambda x: x["english_name"])
    json.dump(subjects, subjects_file, indent=4)

    encoded_file = subjects_file.getvalue().encode('utf-8')
    storage_container_name = os.environ["AzureStorageJSONFilesContainerName"]
    storage_blob_name = os.environ["AzureStorageSubjectsJSONFileBlobName"]
    blob_helper.write_stream_file(storage_container_name, storage_blob_name,
                                  encoded_file)
    subjects_file.close()
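# NOTE: get_subject_entry is not shown in this section. A minimal sketch,
# assuming the subject documents expose code and name fields at the top
# level; the exact field names (other than "english_name", which the sort
# key above requires) are assumptions for illustration.
def get_subject_entry(subject):
    return {
        "code": subject.get("code"),
        "english_name": subject.get("english_name"),
        "welsh_name": subject.get("welsh_name"),
    }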
def test_update_status(self, mock_get_cosmos_client):
    dsh = DataSetHelper()
    latest_dataset_doc = {}
    latest_dataset_doc["version"] = 3
    latest_dataset_doc["builds"] = {"courses": {"status": "pending"}}
    latest_dataset_doc["updated_at"] = "dave"
    dsh.get_latest_doc = mock.MagicMock(return_value=latest_dataset_doc)
    dsh.cosmos_client.UpsertItem = mock.MagicMock()

    dsh.update_status("courses", "in progress", "dave")

    expected_connection_link = (
        "dbs/test-db-id/colls/test-dataset-collection-id")
    expected_dataset_doc = {}
    expected_dataset_doc["version"] = 3
    expected_dataset_doc["builds"] = {"courses": {"status": "in progress"}}
    expected_dataset_doc["updated_at"] = "dave"
    dsh.cosmos_client.UpsertItem.assert_called_once_with(
        expected_connection_link, expected_dataset_doc)
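# NOTE: a sketch of the DataSetHelper.update_status behaviour the test above
# asserts: fetch the latest dataset doc, set the named build's status and the
# doc's updated_at, then upsert the doc via the stored collection link. The
# collection-link attribute and the optional updated_at handling are
# assumptions for illustration, not the repository's implementation.
def update_status(self, item, value, updated_at=None):
    latest_doc = self.get_latest_doc()
    latest_doc["builds"][item]["status"] = value
    if updated_at:
        latest_doc["updated_at"] = updated_at
    # self.collection_link is assumed to be built in __init__ from the
    # database and collection ids (e.g. "dbs/<db-id>/colls/<collection-id>")
    self.cosmos_client.UpsertItem(self.collection_link, latest_doc)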
def test_have_all_builds_succeeded_with_one_pending(
        self, mock_get_cosmos_client):
    dsh = DataSetHelper()
    latest_dataset_doc = {}
    latest_dataset_doc["version"] = 3
    latest_dataset_doc["builds"] = {
        "courses": {"status": "pending"},
        "institutions": {"status": "succeeded"},
        "search": {"status": "succeeded"},
        "subjects": {"status": "succeeded"},
    }
    dsh.get_latest_doc = mock.MagicMock(return_value=latest_dataset_doc)
    self.assertFalse(dsh.have_all_builds_succeeded())
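# NOTE: a sketch of the have_all_builds_succeeded behaviour this test
# exercises: with one build still "pending" it must return False. The body
# is an assumption for illustration, not the repository's implementation.
def have_all_builds_succeeded(self):
    builds = self.get_latest_doc()["builds"]
    return all(build["status"] == "succeeded" for build in builds.values())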
class DataSetCreator:
    def __init__(self):
        self.dsh = DataSetHelper()

    def load_new_dataset_doc(self):
        dataset_doc = self.get_next_dataset_doc()

        # TODO: apw: Ensure that UseLocalTestXMLFile is set to false in
        # local.settings.json before going live.
        # Note: any non-empty value (including the string "false") is
        # treated as truthy here.
        use_local_test_XML_file = os.environ.get('UseLocalTestXMLFile')

        if not use_local_test_XML_file:
            if dataset_doc["version"] != 1:
                if not self.has_enough_time_elapsed_since_last_dataset_created():
                    raise DataSetTooEarlyError
        self.dsh.create_item(dataset_doc)
        logging.info(f"Created new version {dataset_doc['version']} DataSet")

    def get_next_dataset_doc(self):
        next_version_number = self.get_next_dataset_version_number()
        dataset_doc = {}
        dataset_doc["builds"] = get_builds_value()
        dataset_doc["created_at"] = datetime.now(timezone.utc).isoformat()
        dataset_doc["is_published"] = False
        dataset_doc["status"] = "in progress"
        dataset_doc["version"] = next_version_number
        return dataset_doc

    def get_next_dataset_version_number(self):
        if self.get_number_of_dataset_docs() == 0:
            return 1
        return int(self.dsh.get_latest_version_number()) + 1

    def get_number_of_dataset_docs(self):
        query = "SELECT * FROM c"
        data_set_list = self.dsh.query_items(query)
        return len(data_set_list)

    def has_enough_time_elapsed_since_last_dataset_created(self):
        dt_of_latest_dataset_doc = self.get_datetime_of_latest_dataset_doc()
        time_in_minutes_since_latest_dataset_doc = (
            get_time_in_minutes_since_given_datetime(dt_of_latest_dataset_doc)
        )
        time_in_minutes_to_wait = int(
            os.environ["TimeInMinsToWaitBeforeCreateNewDataSet"]
        )
        return time_in_minutes_since_latest_dataset_doc >= time_in_minutes_to_wait

    def get_datetime_of_latest_dataset_doc(self):
        max_version_number = self.dsh.get_latest_version_number()
        query = f"SELECT * FROM c WHERE c.version = {max_version_number}"
        latest_doc = self.dsh.query_items(query)[0]
        return convert_dt_str_to_dt_object(latest_doc["created_at"])
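# NOTE: minimal sketches of the two date helpers DataSetCreator relies on.
# The names match the calls above, but the bodies are assumptions for
# illustration; created_at is written with
# datetime.now(timezone.utc).isoformat(), so fromisoformat() round-trips it.
from datetime import datetime, timezone


def convert_dt_str_to_dt_object(dt_str):
    # ISO-8601 string (as written by get_next_dataset_doc) back to a
    # timezone-aware datetime
    return datetime.fromisoformat(dt_str)


def get_time_in_minutes_since_given_datetime(dt):
    # whole minutes elapsed between dt and now, both in UTC
    return int((datetime.now(timezone.utc) - dt).total_seconds() // 60)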
def build_version_json_file():
    version = DataSetHelper().get_latest_version_number()
    blob_helper = BlobHelper()

    version_file = io.StringIO()
    version_json = {}
    version_json["version"] = version
    json.dump(version_json, version_file, indent=4)

    encoded_file = version_file.getvalue().encode('utf-8')
    storage_container_name = os.environ["AzureStorageJSONFilesContainerName"]
    blob_helper.write_stream_file(storage_container_name, "version.json",
                                  encoded_file)
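# NOTE: BlobHelper is not shown in this section. A plausible sketch of the
# write_stream_file method used by the three build_*_json_file functions
# above, based on the azure-storage-blob SDK; the connection-string
# environment variable name is an assumption for illustration.
import os

from azure.storage.blob import BlobServiceClient


class BlobHelper:
    def __init__(self):
        self.blob_service_client = BlobServiceClient.from_connection_string(
            os.environ["AzureWebJobsStorage"])

    def write_stream_file(self, container_name, blob_name, encoded_file):
        # upload the UTF-8 encoded JSON bytes, replacing any existing blob
        blob_client = self.blob_service_client.get_blob_client(
            container=container_name, blob=blob_name)
        blob_client.upload_blob(encoded_file, overwrite=True)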
def build_institutions_json_files():
    version = DataSetHelper().get_latest_version_number()
    blob_helper = BlobHelper()

    cosmos_db_client = get_cosmos_client()
    collection_link = get_collection_link(
        "AzureCosmosDbDatabaseId", "AzureCosmosDbInstitutionsCollectionId")
    query = f"SELECT * from c where c.version = {version}"
    options = {"enableCrossPartitionQuery": True}
    institution_list = list(
        cosmos_db_client.QueryItems(collection_link, query, options))

    # English institutions file
    institutions_file = io.StringIO()
    institutions = []
    for val in institution_list:
        institution = val["institution"]
        if isinstance(institution["pub_ukprn_name"], str):
            inst_entry = get_inst_entry(institution["pub_ukprn_name"])
            institutions.append(inst_entry)
    institutions.sort(key=lambda x: x["order_by_name"])
    json.dump(institutions, institutions_file, indent=4)

    encoded_file = institutions_file.getvalue().encode('utf-8')
    storage_container_name = os.environ["AzureStorageJSONFilesContainerName"]
    storage_blob_name = os.environ[
        "AzureStorageInstitutionsENJSONFileBlobName"]
    blob_helper.write_stream_file(storage_container_name, storage_blob_name,
                                  encoded_file)
    institutions_file.close()

    # Welsh institutions file (falls back to the English name when no
    # Welsh name is available)
    institutions_file = io.StringIO()
    institutions = []
    for val in institution_list:
        institution = val["institution"]
        inst_name = institution["pub_ukprn_name"]
        inst_welsh_name = institution["pub_ukprn_welsh_name"]
        if isinstance(inst_welsh_name, str):
            inst_entry = get_inst_entry(inst_welsh_name)
            institutions.append(inst_entry)
        elif isinstance(inst_name, str):
            inst_entry = get_inst_entry(inst_name)
            institutions.append(inst_entry)
    institutions.sort(key=lambda x: x["order_by_name"])
    json.dump(institutions, institutions_file, indent=4)

    encoded_file = institutions_file.getvalue().encode('utf-8')
    storage_container_name = os.environ["AzureStorageJSONFilesContainerName"]
    storage_blob_name = os.environ[
        "AzureStorageInstitutionsCYJSONFileBlobName"]
    blob_helper.write_stream_file(storage_container_name, storage_blob_name,
                                  encoded_file)
    institutions_file.close()
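# NOTE: get_inst_entry is not shown in this section. A minimal sketch; the
# sort key "order_by_name" is required by the sorts above, but the
# lower-cased, article-stripped form of it is an assumption for illustration.
def get_inst_entry(name):
    order_by_name = name.lower()
    if order_by_name.startswith("the "):
        order_by_name = order_by_name[4:]
    return {"name": name, "order_by_name": order_by_name}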
def main(msgin: func.QueueMessage, msgout: func.Out[str]):
    """Creates the UKRLP lookup tables for later use.

    This Azure Function carries out the following steps:
    * Decompresses the XML HESA DataSet
    * Parses the INSTITUTION data from the DataSet
    * Retrieves enrichment data from the UKRLP API for each institution
    * Creates a lookup item for each Institution and writes it to CosmosDB
    * Currently, once completed successfully, this function triggers the Etl
      function by copying the compressed XML passed in to a Blob storage
      container monitored by the Etl function.
    """

    # TODO: apw: Ensure that UseLocalTestXMLFile is set to false in
    # local.settings.json before going live.
    # Note: any non-empty value (including the string "false") is treated
    # as truthy here.
    use_local_test_XML_file = os.environ.get('UseLocalTestXMLFile')

    msgerror = ""

    mail_helper = MailHelper()
    environment = os.environ["Environment"]

    dsh = DataSetHelper()

    try:
        logging.info("CreateUkrlp message queue triggered")

        function_start_datetime = datetime.today().strftime(
            "%d-%m-%Y %H:%M:%S")
        logging.info(
            f"CreateUkrlp function started on {function_start_datetime}")

        blob_helper = BlobHelper()

        storage_container_name = os.environ["AzureStorageHesaContainerName"]
        storage_blob_name = os.environ["AzureStorageHesaBlobName"]

        if use_local_test_XML_file:
            mock_xml_source_file = open(os.environ["LocalTestXMLFile"], "r")
            xml_string = mock_xml_source_file.read()
        else:
            xml_string = blob_helper.get_str_file(storage_container_name,
                                                  storage_blob_name)

        version = dsh.get_latest_version_number()

        storage_container_name = os.environ[
            "AzureStorageWelshUnisContainerName"]
        storage_blob_name = os.environ["AzureStorageWelshUnisBlobName"]

        csv_string = blob_helper.get_str_file(storage_container_name,
                                              storage_blob_name)

        # Parse the xml and create the lookups
        logging.info(f"using version number: {version}")
        dsh.update_status("institutions", "in progress")

        lookup_creator = LookupCreator(xml_string, csv_string, version)
        ukrlp_no_info_list = lookup_creator.create_ukrlp_lookups()

        msgerror += (
            f"\n\nUKRLP did not return info for the following "
            f"{len(ukrlp_no_info_list)} ukprn(s):\n")
        for ukprn in ukrlp_no_info_list:
            msgerror += f"\t{ukprn}\n"

        function_end_datetime = datetime.today().strftime("%d-%m-%Y %H:%M:%S")
        logging.info(
            f"CreateUkrlp successfully finished on {function_end_datetime}")

        msgout.set(msgin.get_body().decode("utf-8") + msgerror)

    except Exception as e:
        # Unexpected exception
        function_fail_datetime = datetime.today().strftime("%d-%m-%Y %H:%M:%S")
        function_fail_date = datetime.today().strftime("%d.%m.%Y")

        mail_helper.send_message(
            f"Automated data import failed on {function_fail_datetime} at CreateUkrlp"
            + msgin.get_body().decode("utf-8")
            + msgerror,
            f"Data Import {environment} - {function_fail_date} - Failed")

        logging.error(f"CreateUkrlp failed on {function_fail_datetime}")
        logging.error(traceback.format_exc())

        # Raise to Azure
        raise e
def main(msgin: func.QueueMessage, msgout: func.Out[str]):
    # TODO: apw: Ensure that UseLocalTestXMLFile is set to false in
    # local.settings.json before going live.
    # Note: any non-empty value (including the string "false") is treated
    # as truthy here.
    use_local_test_XML_file = os.environ.get('UseLocalTestXMLFile')

    msgerror = ""

    mail_helper = MailHelper()
    environment = os.environ["Environment"]

    dsh = DataSetHelper()

    try:
        logging.info("CreateInst message queue triggered\n")

        function_start_datetime = datetime.today().strftime(
            "%d-%m-%Y %H:%M:%S")
        logging.info(
            f"CreateInst function started on {function_start_datetime}")

        """ DECOMPRESSION - Decompress the compressed HESA XML """
        # The XML blob provided to this function will be gzip compressed.
        # This is a workaround for a limitation discovered in Azure,
        # where Functions written in Python do not get triggered
        # correctly with large blobs. Tests showed this is not a limitation
        # with Functions written in C#.

        blob_helper = BlobHelper()

        storage_container_name = os.environ["AzureStorageHesaContainerName"]
        storage_blob_name = os.environ["AzureStorageHesaBlobName"]

        if use_local_test_XML_file:
            mock_xml_source_file = open(os.environ["LocalTestXMLFile"], "r")
            xml_string = mock_xml_source_file.read()
        else:
            xml_string = blob_helper.get_str_file(storage_container_name,
                                                  storage_blob_name)

        version = dsh.get_latest_version_number()

        """ LOADING - extract data and load JSON Documents """
        logging.info(f"using version number: {version}")
        dsh.update_status("institutions", "in progress")

        inst_docs = InstitutionDocs(xml_string, version)
        inst_docs.create_institution_docs()
        dsh.update_status("institutions", "succeeded")

        function_end_datetime = datetime.today().strftime("%d-%m-%Y %H:%M:%S")
        logging.info(
            f"CreateInst successfully finished on {function_end_datetime}")

        msgout.set(msgin.get_body().decode("utf-8") + msgerror)

    except exceptions.StopEtlPipelineWarningException:
        # A WARNING is raised while the function is running and
        # StopEtlPipelineOnWarning=True. For example, the incoming raw XML
        # is not valid against its XSD.
        error_message = (
            "A WARNING has been encountered while the function is running. "
            "The function will be stopped since StopEtlPipelineOnWarning is "
            "set to TRUE in the Application Settings.")
        logging.error(error_message)

        function_fail_datetime = datetime.today().strftime("%d-%m-%Y %H:%M:%S")
        function_fail_date = datetime.today().strftime("%d.%m.%Y")

        mail_helper.send_message(
            f"Automated data import failed on {function_fail_datetime} at CreateInst"
            + msgin.get_body().decode("utf-8")
            + msgerror,
            f"Data Import {environment} - {function_fail_date} - Failed")

        logging.error(f"CreateInst failed on {function_fail_datetime}")
        dsh.update_status("institutions", "failed")
        raise Exception(error_message)

    except Exception as e:
        # Unexpected exception
        dsh.update_status("institutions", "failed")

        function_fail_datetime = datetime.today().strftime("%d-%m-%Y %H:%M:%S")
        function_fail_date = datetime.today().strftime("%d.%m.%Y")

        mail_helper.send_message(
            f"Automated data import failed on {function_fail_datetime} at CreateInst"
            + msgin.get_body().decode("utf-8")
            + msgerror,
            f"Data Import {environment} - {function_fail_date} - Failed")

        logging.error(f"CreateInst failed on {function_fail_datetime}",
                      exc_info=True)

        # Raise to Azure
        raise e
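# NOTE: the comment above says the HESA XML blob arrives gzip-compressed, so
# BlobHelper.get_str_file presumably downloads and decompresses it. A sketch
# that would sit alongside the write_stream_file sketch earlier; the gzip
# handling and the plain-text fallback (e.g. for the Welsh unis CSV) are
# assumptions for illustration.
import gzip


def get_str_file(self, container_name, blob_name):
    blob_client = self.blob_service_client.get_blob_client(
        container=container_name, blob=blob_name)
    raw = blob_client.download_blob().readall()
    try:
        # HESA XML is uploaded gzip-compressed (see the comment above)
        return gzip.decompress(raw).decode("utf-8")
    except gzip.BadGzipFile:
        # other blobs may be stored uncompressed
        return raw.decode("utf-8")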
def __init__(self):
    self.dsh = DataSetHelper()
def main(msgin: func.QueueMessage):
    msgerror = ""

    mail_helper = MailHelper()
    environment = os.environ["Environment"]

    dsh = DataSetHelper()

    try:
        logging.info("CourseSearchBuilder message queue triggered\n")

        function_start_datetime = datetime.today().strftime("%d-%m-%Y %H:%M:%S")
        logging.info(
            f"CourseSearchBuilder function started on {function_start_datetime}")

        api_key = os.environ["SearchAPIKey"]
        search_url = os.environ["SearchURL"]
        api_version = os.environ["AzureSearchAPIVersion"]

        version = dsh.get_latest_version_number()
        dsh.update_status("search", "in progress")

        search.build_synonyms(search_url, api_key, api_version)
        search.build_index(search_url, api_key, api_version, version)

        courses = utils.get_courses_by_version(version)
        number_of_courses = len(courses)
        logging.info(
            "attempting to load courses to azure search\n"
            f"number_of_courses: {number_of_courses}\n")

        search.load_index(search_url, api_key, api_version, version, courses)
        dsh.update_status("search", "succeeded")

        courses = None

        if dsh.have_all_builds_succeeded():
            build_institutions_json_files()
            build_subjects_json_file()
            build_version_json_file()
            dsh.update_status("root", "succeeded")
        else:
            dsh.update_status("root", "failed")

        function_end_datetime = datetime.today().strftime("%d-%m-%Y %H:%M:%S")
        function_end_date = datetime.today().strftime("%d.%m.%Y")

        mail_helper.send_message(
            f"Automated data import completed on {function_end_datetime}"
            + msgin.get_body().decode("utf-8")
            + msgerror,
            f"Data Import {environment} - {function_end_date} - Completed")

        logging.info(
            f"CourseSearchBuilder successfully finished on {function_end_datetime}")

    except Exception as e:
        # Unexpected exception
        dsh.update_status("search", "failed")
        dsh.update_status("root", "failed")

        function_fail_datetime = datetime.today().strftime("%d-%m-%Y %H:%M:%S")
        function_fail_date = datetime.today().strftime("%d.%m.%Y")

        mail_helper.send_message(
            f"Automated data import failed on {function_fail_datetime} at CourseSearchBuilder"
            + msgin.get_body().decode("utf-8")
            + msgerror,
            f"Data Import {environment} - {function_fail_date} - Failed")

        logging.error(f"CourseSearchBuilder failed on {function_fail_datetime}")
        logging.error(traceback.format_exc())

        # Raise to Azure
        raise e
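# NOTE: utils.get_courses_by_version is not shown in this section. A sketch,
# assuming it mirrors the Cosmos queries used by the build_*_json_file
# functions above; the courses collection-id environment variable name is an
# assumption for illustration.
def get_courses_by_version(version):
    cosmos_db_client = get_cosmos_client()
    collection_link = get_collection_link(
        "AzureCosmosDbDatabaseId", "AzureCosmosDbCoursesCollectionId")
    query = f"SELECT * from c where c.version = {version}"
    options = {"enableCrossPartitionQuery": True}
    return list(cosmos_db_client.QueryItems(collection_link, query, options))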