def prepare_oai_boundwiths(**kwargs):
    """Grab Boundwith Files and Generate Child Lookup."""
    access_id = kwargs.get("AWS_ACCESS_KEY_ID")
    access_secret = kwargs.get("AWS_SECRET_ACCESS_KEY")
    bucket = kwargs.get("BUCKET")
    bw_prefix = kwargs.get("SOURCE_FOLDER")
    lookup_prefix = kwargs.get("DEST_FOLDER")
    bw_keys = [key for key in ast.literal_eval(kwargs.get("S3_KEYS"))
               if key.startswith(bw_prefix)]

    csv_in_mem = io.StringIO()
    lookup_csv = csv.DictWriter(csv_in_mem, fieldnames=["child_id", "parent_id", "parent_xml"])
    lookup_csv.writeheader()

    logging.info("Starting to iterate over S3 Boundwith objects")
    for key in bw_keys:
        logging.info("Loading s3 key %s", key)
        source_obj = process.get_s3_content(bucket, key, access_id, access_secret)
        source_xml = process.add_marc21xml_root_ns(source_obj)
        for record in source_xml.xpath("oai:record/oai:metadata/marc21:record", namespaces=NS):
            boundwith_record_process(record, lookup_csv)

    process.generate_s3_object(
        csv_in_mem.getvalue(),
        bucket,
        lookup_prefix,
        access_id,
        access_secret
    )
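# A minimal sketch (not from this repo) of wiring prepare_oai_boundwiths into an
# Airflow DAG with a PythonOperator. The DAG id, bucket, prefixes, Variable names,
# and the 'list_alma_s3_data' task id are illustrative assumptions; the templated
# XCom value renders as a Python-literal list string, which is what the
# ast.literal_eval call above expects.
from datetime import datetime

from airflow import DAG
from airflow.operators.python_operator import PythonOperator

with DAG("example_boundwith_dag", start_date=datetime(2019, 1, 1),
         schedule_interval=None) as example_dag:
    prepare_boundwiths = PythonOperator(
        task_id="prepare_boundwiths",
        python_callable=prepare_oai_boundwiths,
        op_kwargs={
            "AWS_ACCESS_KEY_ID": "{{ var.value.AWS_ACCESS_KEY_ID }}",
            "AWS_SECRET_ACCESS_KEY": "{{ var.value.AWS_SECRET_ACCESS_KEY }}",
            "BUCKET": "example-bucket",
            "SOURCE_FOLDER": "almasftp/boundwith",
            "DEST_FOLDER": "almasftp/lookup.csv",
            "S3_KEYS": "{{ ti.xcom_pull(task_ids='list_alma_s3_data') }}",
        },
    )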
def dag_write_string_to_s3(string, prefix, **kwargs):
    """Push an in-memory string to S3 under the given prefix."""
    access_id = kwargs.get("access_id")
    access_secret = kwargs.get("access_secret")
    bucket_name = kwargs.get("bucket_name")
    logging.info("Writing to S3 Bucket %s", bucket_name)

    # Name the object after the MD5 hex digest of its contents.
    our_hash = hashlib.md5(string.encode("utf-8")).hexdigest()
    filename = "{}/{}".format(prefix, our_hash)
    process.generate_s3_object(string, bucket_name, filename, access_id, access_secret)
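# A minimal usage sketch for dag_write_string_to_s3; the bucket, prefix, and
# credential values are placeholders, not values from this repo. Because the key
# is the MD5 hex digest of the payload, identical payloads land on the same key.
import hashlib

payload = "<collection/>"
dag_write_string_to_s3(
    payload,
    "transformed/example-dag",
    access_id="example-access-id",
    access_secret="example-access-secret",
    bucket_name="example-bucket",
)
# The object would be written to transformed/example-dag/<md5 of payload>:
expected_key = "transformed/example-dag/" + hashlib.md5(payload.encode("utf-8")).hexdigest()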
def report_s3_schematron(**kwargs):
    """Wrapper function for using S3 Retrieval, Schematron Reporting, and S3 Writer."""
    source_prefix = kwargs.get("source_prefix")
    dest_prefix = kwargs.get("destination_prefix")
    bucket = kwargs.get("bucket")
    schematron_file = kwargs.get("schematron_filename")
    access_id = kwargs.get("access_id")
    access_secret = kwargs.get("access_secret")

    # Create reporting CSV
    csv_in_mem = io.StringIO()
    report_csv = csv.DictWriter(
        csv_in_mem,
        fieldnames=["id", "report", "record", "source_file"])
    report_csv.writeheader()

    # Get Schematron doc & return an lxml.etree.Schematron validator
    schematron_doc = process.get_github_content("tulibraries/aggregator_mdx", schematron_file)
    schematron = isoschematron.Schematron(etree.fromstring(schematron_doc), store_report=True)

    # Iterate through S3 files, validate, & save report to CSV
    total_transform_count = 0
    for s3_key in process.list_s3_content(bucket, access_id, access_secret, source_prefix):
        logging.info("Validating & Reporting On File: %s", s3_key)
        s3_content = process.get_s3_content(bucket, s3_key, access_id, access_secret)
        s3_xml = etree.fromstring(s3_content)
        for record in s3_xml.iterchildren():
            total_transform_count += 1
            record_id = record.get("airflow-record-id")
            logging.info("Ran report on record: %s", record_id)
            schematron.validate(record)
            report_csv.writerow({
                "id": record_id,
                "report": schematron_failed_validation_text(schematron.validation_report),
                "record": identifier_or_full_record(record),
                "source_file": f"https://s3.console.aws.amazon.com/s3/object/{bucket}/{s3_key}"
            })

    report_filename = dest_prefix + "-report.csv"
    logging.info("Records report: https://%s.s3.amazonaws.com/%s", bucket, report_filename)
    logging.info("Total Transform Count: %s", total_transform_count)
    process.generate_s3_object(csv_in_mem.getvalue(), bucket, report_filename, access_id, access_secret)
    return {"transformed": total_transform_count}
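# schematron_failed_validation_text and identifier_or_full_record are helpers
# defined elsewhere in this module. The sketch below shows one plausible shape
# for them, assuming the validation report is SVRL (as lxml.isoschematron
# produces) and that transformed records expose an identifier element; the
# namespaces and element names here are assumptions, not the repo's actual code.
from lxml import etree

SVRL_NS = {"svrl": "http://purl.oclc.org/dsdl/svrl"}

def example_schematron_failed_validation_text(validation_report):
    """Join the svrl:text messages of all failed asserts into a single string."""
    failures = validation_report.xpath(
        "//svrl:failed-assert/svrl:text/text()", namespaces=SVRL_NS)
    return " | ".join(text.strip() for text in failures)

def example_identifier_or_full_record(record):
    """Return the record's identifier if one exists, else the serialized record."""
    identifiers = record.xpath("//*[local-name()='identifier']/text()")
    if identifiers:
        return identifiers[0]
    return etree.tostring(record, encoding="unicode")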
def test_generate_s3_object(self):
    """Test that generate_s3_object writes the given body to the expected bucket and key."""
    bucket = "test_bucket"
    key = "test_key_2"
    body = b"<test>even more content</test>"
    access_id = "test_access_id"
    access_secret = "test_access_secret"

    conn = boto3.client("s3", aws_access_key_id=access_id, aws_secret_access_key=access_secret)
    conn.create_bucket(Bucket=bucket)
    process.generate_s3_object(body, bucket, key, access_id, access_secret)
    test_content_exists = conn.get_object(Bucket=bucket, Key=key)
    test_object_exists = conn.list_objects(Bucket=bucket)

    self.assertEqual(test_content_exists["Body"].read(), body)
    self.assertEqual(test_content_exists["ResponseMetadata"]["HTTPStatusCode"], 200)
    self.assertEqual(test_object_exists["Contents"][0]["Key"], key)
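# For context, a minimal sketch of what process.generate_s3_object is expected to
# do, inferred from the assertions above: write the body to the given bucket/key
# with boto3. The fake credentials suggest the test runs against a mocked S3
# backend (e.g. moto); both the mock assumption and this implementation are
# illustrative, not the repo's actual code.
import boto3

def example_generate_s3_object(body, bucket_name, key, access_id, access_secret):
    """Write body to s3://bucket_name/key using the supplied credentials."""
    client = boto3.client(
        "s3",
        aws_access_key_id=access_id,
        aws_secret_access_key=access_secret,
    )
    client.put_object(Bucket=bucket_name, Key=key, Body=body)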
def prepare_alma_data(**kwargs):
    """Update XML records by injecting parent XML when a record's 001 value
    appears in the lookup's child_id column."""
    access_id = kwargs.get("AWS_ACCESS_KEY_ID")
    access_secret = kwargs.get("AWS_SECRET_ACCESS_KEY")
    bucket = kwargs.get("BUCKET")
    dest_prefix = kwargs.get("DEST_PREFIX")
    lookup_key = kwargs.get("LOOKUP_KEY")
    src_prefix = kwargs.get("SOURCE_PREFIX")
    src_suffix = kwargs.get("SOURCE_SUFFIX")
    s3_keys = ast.literal_eval(kwargs.get("S3_KEYS"))

    # Generate list of S3 keys we want to index
    alma_keys = [key for key in s3_keys
                 if key.startswith(src_prefix) and key.endswith(src_suffix)]

    # Read Boundwith Lookup file into memory, with the child_id column as an array
    csv_data = process.get_s3_content(bucket, lookup_key, access_id, access_secret)
    lookup_csv = pandas.read_csv(io.BytesIO(csv_data), header=0)

    # Process filtered set of keys to untar, ungzip, add MARC21 XML namespaces,
    # & inject parent XML if the record is an identified (via lookup) child record.
    logging.info("Starting to iterate over S3 objects")
    for key in alma_keys:
        logging.info("Loading s3 key %s", key)
        src_obj = process.get_s3_content(bucket, key, access_id, access_secret)
        src_data = process.expand_alma_sftp_tarball(key, src_obj)
        src_xml = process.add_marc21xml_root_ns(src_data)
        for record in src_xml.findall("{http://www.loc.gov/MARC21/slim}record"):
            record_id = process.get_record_001(record)
            parent_txt = lookup_csv.loc[lookup_csv.child_id == int(record_id), "parent_xml"].values
            if len(set(parent_txt)) >= 1:
                logging.info("Child XML record found %s", record_id)
                for parent_node in parent_txt[0].split("||"):
                    try:
                        record.append(etree.fromstring(parent_node))
                    except etree.XMLSyntaxError as error:
                        logging.error("Problem with string syntax:")
                        logging.error(error)
                        logging.error(parent_node)
        dest_key = key.replace(src_suffix, "").replace(src_prefix, dest_prefix + "/alma_bibs__")
        process.generate_s3_object(
            etree.tostring(src_xml),
            bucket,
            dest_key,
            access_id,
            access_secret
        )
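# A small illustration (fabricated data) of the pandas lookup used above: the
# boundwith lookup CSV has child_id, parent_id, and parent_xml columns, and
# multiple parent records are joined in parent_xml with "||", which is why the
# code above splits on that delimiter before appending each parent node.
import io

import pandas

example_lookup = pandas.read_csv(io.StringIO(
    "child_id,parent_id,parent_xml\n"
    '991001,991002,"<record>parent a</record>||<record>parent b</record>"\n'
))
parent_txt = example_lookup.loc[example_lookup.child_id == 991001, "parent_xml"].values
assert parent_txt[0].split("||") == ["<record>parent a</record>", "<record>parent b</record>"]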
def transform_s3_xsl(**kwargs):
    """Transform & Write XML data to S3 using Saxon XSLT Engine."""
    access_id = kwargs.get("access_id")
    access_secret = kwargs.get("access_secret")
    bucket = kwargs.get("bucket")
    dest_prefix = kwargs.get("destination_prefix")
    source_prefix = kwargs.get("source_prefix")
    if kwargs.get("dag"):
        run_id = kwargs.get("dag").dag_id
    else:
        run_id = "no-dag-provided"

    saxon = prepare_saxon_engine()
    transformed = etree.Element("collection")
    transformed.attrib["dag-id"] = run_id
    transformed.attrib["dag-timestamp"] = kwargs.get("timestamp", "no-timestamp-provided")
    xsl = "https://raw.github.com/{repo}/{branch}/{filename}".format(
        repo=kwargs.get("xsl_repository", "tulibraries/aggregator_mdx"),
        branch=kwargs.get("xsl_branch", "master"),
        filename=kwargs.get("xsl_filename"))

    for s3_key in process.list_s3_content(bucket, access_id, access_secret, source_prefix):
        logging.info("Transforming File %s", s3_key)
        s3_content = process.get_s3_content(bucket, s3_key, access_id, access_secret)
        s3_xml = etree.fromstring(s3_content)
        for record in s3_xml.iterchildren():
            record_id = record.get("airflow-record-id")
            logging.info("Transforming Record %s", record_id)
            # Run Saxon on the single record, piping it in on stdin ("-s:-").
            result_str = subprocess.check_output(
                ["java", "-jar", saxon, "-xsl:" + xsl, "-s:-"],
                input=etree.tostring(record))
            result = etree.fromstring(result_str)
            result.attrib["airflow-record-id"] = record_id
            transformed.append(result)
        filename = s3_key.replace(source_prefix, dest_prefix)
        transformed_xml = etree.tostring(transformed)
        process.generate_s3_object(transformed_xml, bucket, filename, access_id, access_secret)
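# For reference, the per-record subprocess call above is equivalent to piping a
# record into Saxon on the command line; the jar path and XSL URL here are
# placeholders, not values from this repo:
#
#   cat record.xml | java -jar saxon-he.jar \
#       -xsl:https://raw.github.com/tulibraries/aggregator_mdx/master/example.xsl \
#       -s:-
#
# A standalone sketch of the same pattern for a single lxml element:
import subprocess

from lxml import etree

def example_transform_record(record, saxon_jar_path, xsl_url):
    """Run one record through Saxon via stdin and return the transformed element."""
    result_bytes = subprocess.check_output(
        ["java", "-jar", saxon_jar_path, "-xsl:" + xsl_url, "-s:-"],
        input=etree.tostring(record),
    )
    return etree.fromstring(result_bytes)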
def filter_s3_schematron(**kwargs):
    """Wrapper function for using S3 Retrieval, Schematron Filtering, and S3 Writer."""
    source_prefix = kwargs.get("source_prefix")
    dest_prefix = kwargs.get("destination_prefix")
    report_prefix = kwargs.get("report_prefix")
    bucket = kwargs.get("bucket")
    schematron_file = kwargs.get("schematron_filename")
    access_id = kwargs.get("access_id")
    access_secret = kwargs.get("access_secret")
    if kwargs.get("dag"):
        run_id = kwargs.get("dag").dag_id
    else:
        run_id = "no-dag-provided"
    if kwargs.get("timestamp"):
        timestamp = kwargs.get("timestamp")
    else:
        timestamp = "no-timestamp-provided"

    # Create invalid records reporting CSV
    csv_in_mem = io.StringIO()
    invalid_csv = csv.DictWriter(
        csv_in_mem,
        fieldnames=["id", "report", "record", "source_file"])
    invalid_csv.writeheader()

    # Get Schematron doc & return an lxml.etree.Schematron validator
    schematron_doc = process.get_github_content("tulibraries/aggregator_mdx", schematron_file)
    schematron = isoschematron.Schematron(etree.fromstring(schematron_doc), store_report=True)

    total_filter_count = 0
    total_record_count = 0
    for s3_key in process.list_s3_content(bucket, access_id, access_secret, source_prefix):
        logging.info("Validating & Filtering File: %s", s3_key)
        s3_content = process.get_s3_content(bucket, s3_key, access_id, access_secret)
        s3_xml = etree.fromstring(s3_content)
        invalid_xml = etree.Element("collection")
        invalid_xml.attrib["dag-id"] = run_id
        invalid_xml.attrib["dag-timestamp"] = timestamp
        filter_count = 0
        record_count = 0
        for record in s3_xml.iterchildren():
            record_count += 1
            total_record_count += 1
            if not schematron.validate(record):
                record_id = record.get("airflow-record-id")
                logging.error("Invalid record found: %s", record_id)
                s3_xml.remove(record)
                filter_count += 1
                invalid_csv.writerow({
                    "id": record_id,
                    "report": schematron_failed_validation_text(schematron.validation_report),
                    "record": identifier_or_full_record(record),
                    "source_file": f"https://s3.console.aws.amazon.com/s3/object/{bucket}/{s3_key}"
                })
        total_filter_count += filter_count
        filename = s3_key.replace(source_prefix, dest_prefix)
        updated_s3_xml = etree.tostring(s3_xml)
        process.generate_s3_object(updated_s3_xml, bucket, filename, access_id, access_secret)
        if filter_count == record_count and record_count != 0:
            logging.warning(
                f"All records filtered from {filename}. record_count: {record_count}"
            )

    invalid_filename = report_prefix + "-invalid.csv"
    logging.info("Total Filter Count: %s", total_filter_count)
    logging.info("Invalid Records report: https://%s.s3.amazonaws.com/%s", bucket, invalid_filename)
    process.generate_s3_object(csv_in_mem.getvalue(), bucket, invalid_filename, access_id, access_secret)
    if total_filter_count == total_record_count and total_record_count != 0:
        raise AirflowException(
            f"All records were filtered out: {total_record_count}")
    return {"filtered": total_filter_count}
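# A minimal sketch of calling filter_s3_schematron directly (e.g. for local
# debugging outside an operator); the bucket, prefixes, schematron path, and
# credentials are placeholders, not values from this repo. When every record is
# filtered out, the AirflowException above fails the task so downstream
# publishing steps do not run with an empty record set.
from airflow.exceptions import AirflowException

try:
    result = filter_s3_schematron(
        source_prefix="transformed/example-dag/",
        destination_prefix="filtered/example-dag/",
        report_prefix="filtered/example-dag",
        bucket="example-bucket",
        schematron_filename="validations/example.sch",
        access_id="example-access-id",
        access_secret="example-access-secret",
        timestamp="2020-01-01T00:00:00",
    )
    print("Filtered record count:", result["filtered"])
except AirflowException as err:
    print("Validation filtered out every record:", err)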