def create_institution_docs(self):
    """Parse HESA XML and create JSON institution docs in Cosmos DB.

    Every ``INSTITUTION`` element under ``self.root`` is converted with
    ``self.get_institution_doc`` and sent to the collection's
    ``bulkImport`` stored procedure in batches, using ``str(self.version)``
    as the partition key. Any final partial batch is sent after the loop.
    """
    cosmosdb_client = get_cosmos_client()
    collection_link = get_collection_link(
        "AzureCosmosDbDatabaseId", "AzureCosmosDbInstitutionsCollectionId"
    )
    options = {"partitionKey": str(self.version)}
    sproc_link = collection_link + "/sprocs/bulkImport"

    batch_size = 100  # documents handed to the stored procedure per call
    pause_seconds = 2  # NOTE(review): presumably eases request throttling - confirm

    def flush(batch):
        # Single place for the log / execute / log sequence that is needed
        # both for full batches and for the trailing partial batch.
        size = len(batch)
        logging.info(f"Begining execution of stored procedure for {size} documents")
        cosmosdb_client.ExecuteStoredProcedure(sproc_link, [batch], options)
        logging.info(f"Successfully loaded another {size} documents")

    institution_count = 0
    pending_docs = []
    for institution in self.root.iter("INSTITUTION"):
        institution_count += 1
        pending_docs.append(self.get_institution_doc(institution))
        if len(pending_docs) == batch_size:
            flush(pending_docs)
            # Start a fresh list rather than clearing the one just sent.
            pending_docs = []
            time.sleep(pause_seconds)

    if pending_docs:
        flush(pending_docs)

    logging.info(f"Processed {institution_count} institutions")
def build_subjects_json_file():
    """Build the subjects JSON file for the latest dataset version and upload it.

    Queries the subjects collection for documents whose ``version`` matches
    the latest version number, converts each with ``get_subject_entry``,
    sorts the entries by ``english_name`` and writes the UTF-8 encoded,
    4-space indented JSON to blob storage. The container and blob names are
    read from environment variables.
    """
    version = DataSetHelper().get_latest_version_number()
    blob_helper = BlobHelper()

    client = get_cosmos_client()
    subjects_link = get_collection_link(
        "AzureCosmosDbDatabaseId", "AzureCosmosDbSubjectsCollectionId"
    )
    matching_docs = list(
        client.QueryItems(
            subjects_link,
            f"SELECT * from c where c.version = {version}",
            {"enableCrossPartitionQuery": True},
        )
    )

    entries = sorted(
        (get_subject_entry(doc) for doc in matching_docs),
        key=lambda entry: entry["english_name"],
    )
    payload = json.dumps(entries, indent=4).encode('utf-8')

    container_name = os.environ["AzureStorageJSONFilesContainerName"]
    blob_name = os.environ["AzureStorageSubjectsJSONFileBlobName"]
    blob_helper.write_stream_file(container_name, blob_name, payload)
def __init__(self, xml_string, welsh_uni_string, version):
    """Set up Cosmos DB access, the institution whitelist and Welsh rows.

    Args:
        xml_string: stored unchanged on the instance as ``xml_string``.
        welsh_uni_string: CSV text; when truthy it is split into lines and
            its first line is checked with ``validate_column_headers``.
        version: stored unchanged on the instance as ``version``.

    Raises:
        exceptions.StopEtlPipelineErrorException: if the first line of
            ``welsh_uni_string`` fails header validation.
    """
    self.version = version
    self.cosmosdb_client = get_cosmos_client()
    self.xml_string = xml_string
    self.lookups_created = []
    self.ukrlp_no_info_list = []
    self.collection_link = get_collection_link(
        "AzureCosmosDbDatabaseId", "AzureCosmosDbUkRlpCollectionId"
    )

    # The whitelist file is looked up next to this module.
    module_dir = os.path.realpath(
        os.path.join(os.getcwd(), os.path.dirname(__file__))
    )
    whitelist_path = os.path.join(module_dir, "institution_whitelist.txt")
    with open(whitelist_path) as whitelist_file:
        self.institutions_whitelist = [
            line.strip() for line in whitelist_file
        ]

    if not welsh_uni_string:
        self.welsh_uni = []
        return

    csv_rows = welsh_uni_string.splitlines()
    header_row = csv_rows[0]
    if not self.validate_column_headers(header_row):
        logging.error(
            "file headers are incorrect, expecting the following: code, english_label, level, welsh_label"
        )
        raise exceptions.StopEtlPipelineErrorException
    self.welsh_uni = csv_rows
def __init__(self, xml_string):
    """Store the XML text and prepare Cosmos DB access to the UKRLP collection.

    Args:
        xml_string: stored unchanged on the instance as ``xml_string``.
    """
    # Connection details first, in the same order as before.
    self.cosmosdb_client = utils.get_cosmos_client()
    self.collection_link = utils.get_collection_link(
        "AzureCosmosDbDatabaseId",
        "AzureCosmosDbUkRlpCollectionId",
    )

    self.xml_string = xml_string

    # Bookkeeping lists, all starting empty.
    self.lookups_created = []
    self.ukrlp_no_info_list = []
    self.db_entries_list = []
def test_institution_fetcher():
    """Fetch institution "10007857" at version 1 and return the fetcher's result."""
    # The collection link is resolved from Application Settings.
    collection_link = utils.get_collection_link(
        "AzureCosmosDbDatabaseId", "AzureCosmosDbInstitutionsCollectionId"
    )
    fetcher = InstitutionFetcher(utils.get_cosmos_client(), collection_link)
    return fetcher.get_institution("10007857", 1)
def create_course_docs(xml_string):
    """Parse HESA XML passed in and create JSON course docs in Cosmos DB.

    For each ``KISCOURSE`` under each ``INSTITUTION`` element a course
    document is built with ``get_course_doc``, passed through the UKRLP and
    subject enrichers, and written to the courses collection one item at a
    time.

    Args:
        xml_string: the XML dataset as text, parsed with ``ET.fromstring``.
    """
    # TODO Investigate writing docs to CosmosDB in bulk to speed things up.
    client = utils.get_cosmos_client()

    logging.info(
        "adding ukrlp data into memory ahead of building course documents"
    )
    ukrlp_enricher = UkRlpCourseEnricher()

    logging.info(
        "adding subject data into memory ahead of building course documents"
    )
    subject_enricher = SubjectCourseEnricher()

    courses_link = utils.get_collection_link(
        "AzureCosmosDbDatabaseId", "AzureCosmosDbCoursesCollectionId"
    )

    # Parse the dataset and build the shared lookups from its root.
    root = ET.fromstring(xml_string)
    accreditations = Accreditations(root)
    kisaims = KisAims(root)
    locations = Locations(root)

    processed = 0
    for institution_node in root.iter("INSTITUTION"):
        institution_data = xmltodict.parse(ET.tostring(institution_node))[
            "INSTITUTION"
        ]
        ukprn = institution_data["UKPRN"]

        for course_node in institution_node.findall("KISCOURSE"):
            course_data = xmltodict.parse(ET.tostring(course_node))["KISCOURSE"]
            course_doc = get_course_doc(
                accreditations,
                locations,
                get_locids(course_data, ukprn),
                institution_data,
                course_data,
                kisaims,
            )

            # UKRLP enrichment runs before subject enrichment.
            for enricher in (ukrlp_enricher, subject_enricher):
                enricher.enrich_course(course_doc)

            client.CreateItem(courses_link, course_doc)
            processed += 1

    logging.info(f"Processed {processed} courses")
def create_institution_docs(self):
    """Parse HESA XML and create JSON institution docs in Cosmos DB.

    Each ``INSTITUTION`` element under ``self.root`` is converted with
    ``self.get_institution_doc`` and written with its own ``CreateItem``
    call. The number of elements handled is logged at the end.
    """
    # TODO Investigate writing docs to CosmosDB in bulk to speed things up.
    client = get_cosmos_client()
    institutions_link = get_collection_link(
        "AzureCosmosDbDatabaseId", "AzureCosmosDbInstitutionsCollectionId"
    )

    processed = 0
    institution_nodes = self.root.iter("INSTITUTION")
    for processed, node in enumerate(institution_nodes, start=1):
        client.CreateItem(institutions_link, self.get_institution_doc(node))

    logging.info(f"Processed {processed} institutions")
def main(req: func.HttpRequest) -> func.HttpResponse:
    """Implements the REST API endpoint for getting course documents.

    The endpoint implemented is:
        /institutions/{institution_id}/courses/{course_id}/modes/{mode}

    The API is fully documented in a swagger document in the same repo
    as this module.

    Responds with 200 and the course when found, 400 when the parameters
    fail ``valid_course_params``, and 404 when no course is returned. Any
    other exception is logged and re-raised.
    """
    json_headers = {"Content-Type": "application/json"}

    try:
        logging.info("Process a request for a course.")
        logging.info(f"url: {req.url}")
        logging.info(f"params: {req.params}")
        logging.info(f"route_params: {req.route_params}")

        # Route parameters plus the optional "version" query parameter.
        params = dict(req.route_params)
        params["version"] = req.params.get("version", "1")
        logging.info(f"Parameters: {params}")

        # The params are used in DB queries, so do some basic
        # sanitisation of them before going any further.
        if not valid_course_params(params):
            logging.error(f"valid_course_params returned false for {params}")
            bad_request_body = get_http_error_response_json(
                "Bad Request", "Parameter Error", "Invalid parameter passed"
            )
            return func.HttpResponse(
                bad_request_body, headers=json_headers, status_code=400
            )

        logging.info("The parameters look good")

        # Initialise a CourseFetcher
        client = get_cosmos_client()
        collection_link = get_collection_link(
            "AzureCosmosDbDatabaseId", "AzureCosmosDbCoursesCollectionId"
        )
        course_fetcher = CourseFetcher(client, collection_link)

        course = course_fetcher.get_course(**params)
        if not course:
            not_found_body = get_http_error_response_json(
                "Not Found", "course", "Course was not found."
            )
            return func.HttpResponse(
                not_found_body, headers=json_headers, status_code=404
            )

        logging.info(f"Found a course {course}")
        return func.HttpResponse(course, headers=json_headers, status_code=200)

    except Exception as e:
        logging.error(traceback.format_exc())

        # Raise so Azure sends back the HTTP 500
        raise e
def _upload_institution_names(blob_helper, names, blob_name_setting):
    """Turn *names* into sorted institution entries and upload them as JSON.

    Args:
        blob_helper: object whose ``write_stream_file`` performs the upload.
        names: iterable of institution names passed to ``get_inst_entry``.
        blob_name_setting: name of the environment variable holding the
            destination blob name.
    """
    entries = sorted(
        (get_inst_entry(name) for name in names),
        key=lambda entry: entry["order_by_name"],
    )
    encoded_file = json.dumps(entries, indent=4).encode('utf-8')

    storage_container_name = os.environ["AzureStorageJSONFilesContainerName"]
    storage_blob_name = os.environ[blob_name_setting]
    blob_helper.write_stream_file(
        storage_container_name, storage_blob_name, encoded_file
    )


def build_institutions_json_files():
    """Build and upload the English and Welsh institution JSON files.

    Institutions for the latest dataset version are read from Cosmos DB.
    The English file lists every institution whose ``pub_ukprn_name`` is a
    string. The Welsh file uses ``pub_ukprn_welsh_name`` when that is a
    string and otherwise falls back to a string ``pub_ukprn_name``. Both
    files are sorted by ``order_by_name``; the English file is uploaded
    before the Welsh one is built.
    """
    version = DataSetHelper().get_latest_version_number()
    blob_helper = BlobHelper()

    cosmos_db_client = get_cosmos_client()
    collection_link = get_collection_link(
        "AzureCosmosDbDatabaseId", "AzureCosmosDbInstitutionsCollectionId"
    )
    query = f"SELECT * from c where c.version = {version}"
    options = {"enableCrossPartitionQuery": True}
    institution_list = list(
        cosmos_db_client.QueryItems(collection_link, query, options)
    )

    # English file: institutions without a string name are left out.
    english_names = []
    for val in institution_list:
        inst_name = val["institution"]["pub_ukprn_name"]
        if isinstance(inst_name, str):
            english_names.append(inst_name)
    _upload_institution_names(
        blob_helper, english_names, "AzureStorageInstitutionsENJSONFileBlobName"
    )

    # Welsh file: prefer the Welsh name, fall back to the English one.
    welsh_names = []
    for val in institution_list:
        institution = val["institution"]
        inst_name = institution["pub_ukprn_name"]
        inst_welsh_name = institution["pub_ukprn_welsh_name"]
        if isinstance(inst_welsh_name, str):
            welsh_names.append(inst_welsh_name)
        elif isinstance(inst_name, str):
            welsh_names.append(inst_name)
    _upload_institution_names(
        blob_helper, welsh_names, "AzureStorageInstitutionsCYJSONFileBlobName"
    )
def __init__(self):
    """Create the Cosmos DB client and the link to the feedback collection."""
    database_setting = "AzureCosmosDbDatabaseId"
    collection_setting = "AzureCosmosDbFeedbackCollectionId"

    self.cosmosdb_client = utils.get_cosmos_client()
    self.collection_link = utils.get_collection_link(
        database_setting, collection_setting
    )
get_cosmos_client, get_http_error_response_json, sanitise_feedback, ) from SharedCode.exceptions import ValidationError from .feedback_creator import FeedbackCreator from .validators import validate_feedback cosmosdb_uri = os.environ["AzureCosmosDbUri"] cosmosdb_key = os.environ["AzureCosmosDbKey"] cosmosdb_database_id = os.environ["AzureCosmosDbDatabaseId"] cosmosdb_collection_id = os.environ["AzureCosmosDbFeedbackCollectionId"] # Intialise cosmos db client client = get_cosmos_client(cosmosdb_uri, cosmosdb_key) def main(req: func.HttpRequest) -> func.HttpResponse: try: logging.info("PostFeedback http triggered.") function_start_datetime = datetime.today().strftime( "%d-%m-%Y %H:%M:%S") logging.info( f"PostFeedback function started on {function_start_datetime}") try: feedback = req.get_json()
def __init__(self):
    """Log the start of initialisation, then set up Cosmos DB access.

    Stores the client as ``cosmos_client`` and the dataset collection link
    as ``collection_link``.
    """
    logging.info("Init for DataSetHelper")

    database_setting = "AzureCosmosDbDatabaseId"
    collection_setting = "AzureCosmosDbDataSetCollectionId"

    self.cosmos_client = get_cosmos_client()
    self.collection_link = get_collection_link(
        database_setting, collection_setting
    )
def main(req: func.HttpRequest) -> func.HttpResponse:
    """Implements the REST API endpoint for getting an institution document.

    The endpoint implemented is:
        /institutions/{institution_id}/

    The API is documented in a swagger document.

    Responds with 200 and the institution when found, 400 when the
    parameters fail ``valid_institution_params``, and 404 when no
    institution is returned. Any other exception is logged and re-raised.
    """
    json_headers = {"Content-Type": "application/json"}

    try:
        logging.info("Process a request for an institution resource.")
        logging.info(f"url: {req.url}")
        logging.info(f"params: {req.params}")
        logging.info(f"route_params: {req.route_params}")

        # Route parameters plus the optional "version" query parameter.
        params = dict(req.route_params)
        params["version"] = req.params.get("version", "1")
        logging.info(f"Parameters: {params}")

        if not valid_institution_params(params):
            logging.error(
                f"valid_institution_params returned false for {params}")
            bad_request_body = get_http_error_response_json(
                "Bad Request", "Parameter Error", "Invalid parameter passed"
            )
            return func.HttpResponse(
                bad_request_body, headers=json_headers, status_code=400
            )

        logging.info("The parameters look good")

        # Initialise an InstitutionFetcher
        client = get_cosmos_client()
        collection_link = get_collection_link(
            "AzureCosmosDbDatabaseId", "AzureCosmosDbInstitutionsCollectionId"
        )
        institution_fetcher = InstitutionFetcher(client, collection_link)

        institution = institution_fetcher.get_institution(**params)
        if not institution:
            not_found_body = get_http_error_response_json(
                "Not Found", "institution", "Institution was not found."
            )
            return func.HttpResponse(
                not_found_body, headers=json_headers, status_code=404
            )

        logging.info(f"Found a institution {institution}")
        return func.HttpResponse(
            institution, headers=json_headers, status_code=200
        )

    except Exception as e:
        logging.error(traceback.format_exc())

        # Raise so Azure sends back the HTTP 500
        raise e