class DataSetCreator:
    def __init__(self):
        self.dsh = DataSetHelper()

    def load_new_dataset_doc(self):
        dataset_doc = self.get_next_dataset_doc()

        # TODO: apw: Ensure that UseLocalTestXMLFile is set to false in local.settings.json before going live.
        use_local_test_XML_file = os.environ.get("UseLocalTestXMLFile")

        if not use_local_test_XML_file:
            if dataset_doc["version"] != 1:
                if not self.has_enough_time_elapsed_since_last_dataset_created():
                    raise DataSetTooEarlyError

        self.dsh.create_item(dataset_doc)
        logging.info(f"Created new version {dataset_doc['version']} DataSet")

    def get_next_dataset_doc(self):
        next_version_number = self.get_next_dataset_version_number()
        dataset_doc = {}
        dataset_doc["builds"] = get_builds_value()
        dataset_doc["created_at"] = datetime.now(timezone.utc).isoformat()
        dataset_doc["is_published"] = False
        dataset_doc["status"] = "in progress"
        dataset_doc["version"] = next_version_number
        return dataset_doc

    def get_next_dataset_version_number(self):
        if self.get_number_of_dataset_docs() == 0:
            return 1
        return int(self.dsh.get_latest_version_number()) + 1

    def get_number_of_dataset_docs(self):
        query = "SELECT * FROM c"
        data_set_list = self.dsh.query_items(query)
        return len(data_set_list)

    def has_enough_time_elapsed_since_last_dataset_created(self):
        dt_of_latest_dataset_doc = self.get_datetime_of_latest_dataset_doc()
        time_in_minutes_since_latest_dataset_doc = get_time_in_minutes_since_given_datetime(
            dt_of_latest_dataset_doc
        )
        time_in_minutes_to_wait = int(
            os.environ["TimeInMinsToWaitBeforeCreateNewDataSet"]
        )
        if time_in_minutes_to_wait > time_in_minutes_since_latest_dataset_doc:
            return False
        return True

    def get_datetime_of_latest_dataset_doc(self):
        max_version_number = self.dsh.get_latest_version_number()
        query = f"SELECT * FROM c WHERE c.version = {max_version_number}"
        latest_doc = self.dsh.query_items(query)[0]
        return convert_dt_str_to_dt_object(latest_doc["created_at"])
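# Illustrative sketches only: the two date/time helpers below are referenced by
# DataSetCreator but are defined elsewhere in the repo. These minimal versions
# are an assumption, not the repo's actual implementations; they show the
# behaviour the class relies on, namely that "created_at" holds an ISO-8601
# timestamp written with datetime.now(timezone.utc).isoformat() and that the
# age check works in whole minutes.

from datetime import datetime, timezone


def convert_dt_str_to_dt_object(dt_str: str) -> datetime:
    """Parse an ISO-8601 timestamp string into an aware datetime object."""
    return datetime.fromisoformat(dt_str)


def get_time_in_minutes_since_given_datetime(dt: datetime) -> int:
    """Return the number of whole minutes elapsed since the given datetime."""
    return int((datetime.now(timezone.utc) - dt).total_seconds() // 60)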
def main(msgin: func.QueueMessage, msgout: func.Out[str]):
    """Creates the UKRLP lookup tables for later use.

    This Azure Function carries out the following steps:
    * Decompresses the XML HESA DataSet
    * Parses the INSTITUTION data from the DataSet
    * Retrieves enrichment data from the UKRLP API for each institution
    * Creates a lookup item for each institution and writes it to CosmosDB
    * Currently, once completed successfully, this function triggers the Etl
      function by copying the compressed XML passed in to a Blob storage
      container monitored by the Etl function.
    """

    # TODO: apw: Ensure that UseLocalTestXMLFile is set to false in local.settings.json before going live.
    use_local_test_XML_file = os.environ.get("UseLocalTestXMLFile")

    msgerror = ""

    mail_helper = MailHelper()
    environment = os.environ["Environment"]

    dsh = DataSetHelper()

    try:
        logging.info("CreateUkrlp message queue triggered")

        function_start_datetime = datetime.today().strftime("%d-%m-%Y %H:%M:%S")

        logging.info(f"CreateUkrlp function started on {function_start_datetime}")

        blob_helper = BlobHelper()

        storage_container_name = os.environ["AzureStorageHesaContainerName"]
        storage_blob_name = os.environ["AzureStorageHesaBlobName"]

        if use_local_test_XML_file:
            with open(os.environ["LocalTestXMLFile"], "r") as mock_xml_source_file:
                xml_string = mock_xml_source_file.read()
        else:
            xml_string = blob_helper.get_str_file(storage_container_name, storage_blob_name)

        version = dsh.get_latest_version_number()

        storage_container_name = os.environ["AzureStorageWelshUnisContainerName"]
        storage_blob_name = os.environ["AzureStorageWelshUnisBlobName"]

        csv_string = blob_helper.get_str_file(storage_container_name, storage_blob_name)

        # Parse the XML and create the lookups
        logging.info(f"using version number: {version}")
        dsh.update_status("institutions", "in progress")

        lookup_creator = LookupCreator(xml_string, csv_string, version)
        ukrlp_no_info_list = lookup_creator.create_ukrlp_lookups()

        msgerror += (
            f"\n\nUKRLP did not return info for the following "
            f"{len(ukrlp_no_info_list)} ukprn(s):\n"
        )
        for ukprn in ukrlp_no_info_list:
            msgerror += f"\t{ukprn}\n"

        function_end_datetime = datetime.today().strftime("%d-%m-%Y %H:%M:%S")

        logging.info(f"CreateUkrlp successfully finished on {function_end_datetime}")

        msgout.set(msgin.get_body().decode("utf-8") + msgerror)

    except Exception as e:
        # Unexpected exception
        function_fail_datetime = datetime.today().strftime("%d-%m-%Y %H:%M:%S")
        function_fail_date = datetime.today().strftime("%d.%m.%Y")

        mail_helper.send_message(
            f"Automated data import failed on {function_fail_datetime} at CreateUkrlp"
            + msgin.get_body().decode("utf-8")
            + msgerror,
            f"Data Import {environment} - {function_fail_date} - Failed",
        )

        logging.error(f"CreateUkrlp failed on {function_fail_datetime}")
        logging.error(traceback.format_exc())

        # Raise to Azure
        raise e
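# Illustrative local.settings.json fragment (assumed shape; file names and
# values are placeholders) showing the two settings the local-test branch above
# reads. Note that os.environ.get("UseLocalTestXMLFile") returns a string, so
# any non-empty value, including "false", is truthy in the checks above; per
# the TODO comments, remove the setting or leave it empty before going live.
#
# {
#   "Values": {
#     "UseLocalTestXMLFile": "true",
#     "LocalTestXMLFile": "test_data/hesa_dataset.xml"
#   }
# }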
def main(msgin: func.QueueMessage, msgout: func.Out[str]):

    # TODO: apw: Ensure that UseLocalTestXMLFile is set to false in local.settings.json before going live.
    use_local_test_XML_file = os.environ.get("UseLocalTestXMLFile")

    msgerror = ""

    mail_helper = MailHelper()
    environment = os.environ["Environment"]

    dsh = DataSetHelper()

    try:
        logging.info("CreateInst message queue triggered")

        function_start_datetime = datetime.today().strftime("%d-%m-%Y %H:%M:%S")

        logging.info(f"CreateInst function started on {function_start_datetime}")

        """ DECOMPRESSION - Decompress the compressed HESA XML """
        # The XML blob provided to this function will be gzip compressed.
        # This is a workaround for a limitation discovered in Azure,
        # where Functions written in Python do not get triggered
        # correctly with large blobs. Tests showed this is not a limitation
        # with Functions written in C#.

        blob_helper = BlobHelper()

        storage_container_name = os.environ["AzureStorageHesaContainerName"]
        storage_blob_name = os.environ["AzureStorageHesaBlobName"]

        if use_local_test_XML_file:
            with open(os.environ["LocalTestXMLFile"], "r") as mock_xml_source_file:
                xml_string = mock_xml_source_file.read()
        else:
            xml_string = blob_helper.get_str_file(storage_container_name, storage_blob_name)

        version = dsh.get_latest_version_number()

        """ LOADING - extract data and load JSON documents """
        logging.info(f"using version number: {version}")
        dsh.update_status("institutions", "in progress")

        inst_docs = InstitutionDocs(xml_string, version)
        inst_docs.create_institution_docs()
        dsh.update_status("institutions", "succeeded")

        function_end_datetime = datetime.today().strftime("%d-%m-%Y %H:%M:%S")

        logging.info(f"CreateInst successfully finished on {function_end_datetime}")

        msgout.set(msgin.get_body().decode("utf-8") + msgerror)

    except exceptions.StopEtlPipelineWarningException:
        # A WARNING is raised while the function is running and
        # StopEtlPipelineOnWarning=True. For example, the incoming raw XML
        # is not valid against its XSD.
        error_message = (
            "A WARNING has been encountered while the function is running. "
            "The function will be stopped since StopEtlPipelineOnWarning is "
            "set to TRUE in the Application Settings."
        )
        logging.error(error_message)

        function_fail_datetime = datetime.today().strftime("%d-%m-%Y %H:%M:%S")
        function_fail_date = datetime.today().strftime("%d.%m.%Y")

        mail_helper.send_message(
            f"Automated data import failed on {function_fail_datetime} at CreateInst"
            + msgin.get_body().decode("utf-8")
            + msgerror,
            f"Data Import {environment} - {function_fail_date} - Failed",
        )

        logging.error(f"CreateInst failed on {function_fail_datetime}")
        dsh.update_status("institutions", "failed")
        raise Exception(error_message)

    except Exception as e:
        # Unexpected exception
        dsh.update_status("institutions", "failed")

        function_fail_datetime = datetime.today().strftime("%d-%m-%Y %H:%M:%S")
        function_fail_date = datetime.today().strftime("%d.%m.%Y")

        mail_helper.send_message(
            f"Automated data import failed on {function_fail_datetime} at CreateInst"
            + msgin.get_body().decode("utf-8")
            + msgerror,
            f"Data Import {environment} - {function_fail_date} - Failed",
        )

        logging.error(f"CreateInst failed on {function_fail_datetime}", exc_info=True)

        # Raise to Azure
        raise e
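# Minimal sketch of the decompression step described in the comments above.
# It assumes BlobHelper.get_str_file downloads the gzip-compressed blob bytes
# and gunzips them before decoding; the real helper lives elsewhere in the
# repo, and the AzureWebJobsStorage connection string is the standard Azure
# Functions storage setting. Blob access follows the azure-storage-blob v12 API.

import gzip
import os

from azure.storage.blob import BlobServiceClient


def get_decompressed_blob_str(container_name: str, blob_name: str) -> str:
    """Download a gzip-compressed blob and return its contents as a UTF-8 string."""
    service_client = BlobServiceClient.from_connection_string(
        os.environ["AzureWebJobsStorage"]
    )
    blob_client = service_client.get_blob_client(container=container_name, blob=blob_name)
    compressed_bytes = blob_client.download_blob().readall()
    return gzip.decompress(compressed_bytes).decode("utf-8")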
def main(msgin: func.QueueMessage):

    msgerror = ""

    mail_helper = MailHelper()
    environment = os.environ["Environment"]

    dsh = DataSetHelper()

    try:
        logging.info("CourseSearchBuilder message queue triggered")

        function_start_datetime = datetime.today().strftime("%d-%m-%Y %H:%M:%S")

        logging.info(f"CourseSearchBuilder function started on {function_start_datetime}")

        api_key = os.environ["SearchAPIKey"]
        search_url = os.environ["SearchURL"]
        api_version = os.environ["AzureSearchAPIVersion"]

        version = dsh.get_latest_version_number()
        dsh.update_status("search", "in progress")

        search.build_synonyms(search_url, api_key, api_version)
        search.build_index(search_url, api_key, api_version, version)

        courses = utils.get_courses_by_version(version)
        number_of_courses = len(courses)

        logging.info(
            f"attempting to load courses to azure search\n"
            f"number_of_courses: {number_of_courses}\n"
        )

        search.load_index(search_url, api_key, api_version, version, courses)
        dsh.update_status("search", "succeeded")

        courses = None

        if dsh.have_all_builds_succeeded():
            build_institutions_json_files()
            build_subjects_json_file()
            build_version_json_file()
            dsh.update_status("root", "succeeded")
        else:
            dsh.update_status("root", "failed")

        function_end_datetime = datetime.today().strftime("%d-%m-%Y %H:%M:%S")
        function_end_date = datetime.today().strftime("%d.%m.%Y")

        mail_helper.send_message(
            f"Automated data import completed on {function_end_datetime}"
            + msgin.get_body().decode("utf-8")
            + msgerror,
            f"Data Import {environment} - {function_end_date} - Completed",
        )

        logging.info(f"CourseSearchBuilder successfully finished on {function_end_datetime}")

    except Exception as e:
        # Unexpected exception
        dsh.update_status("search", "failed")
        dsh.update_status("root", "failed")

        function_fail_datetime = datetime.today().strftime("%d-%m-%Y %H:%M:%S")
        function_fail_date = datetime.today().strftime("%d.%m.%Y")

        mail_helper.send_message(
            f"Automated data import failed on {function_fail_datetime} at CourseSearchBuilder"
            + msgin.get_body().decode("utf-8")
            + msgerror,
            f"Data Import {environment} - {function_fail_date} - Failed",
        )

        logging.error(f"CourseSearchBuilder failed on {function_fail_datetime}")
        logging.error(traceback.format_exc())

        # Raise to Azure
        raise e
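# Illustrative sketch of the kind of check dsh.have_all_builds_succeeded()
# performs before the root status is flipped to "succeeded" above. The real
# method is on DataSetHelper; the "builds" structure assumed here (one
# {"status": ...} entry per pipeline stage) is an assumption that mirrors the
# statuses written via dsh.update_status(...) throughout the pipeline.

def have_all_builds_succeeded(latest_dataset_doc: dict) -> bool:
    """Return True only if every build stage recorded in the dataset doc has succeeded."""
    builds = latest_dataset_doc.get("builds", {})
    return all(stage.get("status") == "succeeded" for stage in builds.values())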