Code Example #1
File: harvest.py  Project: tulibraries/tulflow
    def perform_xml_lookup(oai_record, **kwargs):
        """Parse additions/updates & add boundwiths."""

        if len(cache) == 0:
            logging.info("*** Fetching CSV lookup file from s3 ***")
            access_id = kwargs.get("access_id")
            access_secret = kwargs.get("access_secret")
            bucket = kwargs.get("bucket_name")
            lookup_key = kwargs.get("lookup_key")
            csv_data = process.get_s3_content(bucket, lookup_key, access_id,
                                              access_secret)
            cache["value"] = pandas.read_csv(io.BytesIO(csv_data), header=0)

        lookup_csv = cache["value"]

        for record in oai_record.xpath(".//marc21:record", namespaces=NS):
            record_id = process.get_record_001(record)
            logging.info("Reading in Record %s", record_id)
            parent_txt = lookup_csv.loc[lookup_csv.child_id == int(record_id),
                                        "parent_xml"].values
            if len(set(parent_txt)) >= 1:
                logging.info("Child XML record found %s", record_id)
                for parent_node in parent_txt[0].split("||"):
                    try:
                        record.append(etree.fromstring(parent_node))
                    except etree.XMLSyntaxError as error:
                        logging.error("Problem with string syntax:")
                        logging.error(error)
                        logging.error(parent_node)
        return oai_record
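
The extra indentation and the bare `cache` and `NS` names suggest this is a nested function returned by a factory that keeps the lookup CSV cached between calls. A minimal sketch of such a wrapper, with the factory name and namespace map assumed rather than taken from the excerpt:

# Assumed namespace map; the MARC21 slim URI also appears in Code Example #3.
NS = {"marc21": "http://www.loc.gov/MARC21/slim"}

def perform_xml_lookup_with_cache():
    """Return a lookup callable that fetches the boundwith CSV from S3 only once."""
    cache = {}

    def perform_xml_lookup(oai_record, **kwargs):
        ...  # body as shown in the excerpt above

    return perform_xml_lookup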
Code Example #2
def prepare_oai_boundwiths(**kwargs):
    """Grab Boundwith Files and Generate Child Lookup."""
    access_id = kwargs.get("AWS_ACCESS_KEY_ID")
    access_secret = kwargs.get("AWS_SECRET_ACCESS_KEY")
    bucket = kwargs.get("BUCKET")
    bw_prefix = kwargs.get("SOURCE_FOLDER")
    lookup_prefix = kwargs.get("DEST_FOLDER")

    bw_keys = [key for key in ast.literal_eval(kwargs.get("S3_KEYS")) if key.startswith(bw_prefix)]
    csv_in_mem = io.StringIO()
    lookup_csv = csv.DictWriter(csv_in_mem, fieldnames=["child_id", "parent_id", "parent_xml"])
    lookup_csv.writeheader()

    logging.info("Starting to iterate over S3 Boundwith objects")
    for key in bw_keys:
        logging.info("Loading s3 key %s", key)
        source_obj = process.get_s3_content(bucket, key, access_id, access_secret)
        source_xml = process.add_marc21xml_root_ns(source_obj)
        for record in source_xml.xpath("oai:record/oai:metadata/marc21:record", namespaces=NS):
            boundwith_record_process(record, lookup_csv)
    process.generate_s3_object(
        csv_in_mem.getvalue(),
        bucket,
        lookup_prefix,
        access_id,
        access_secret
    )
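
Note that `prepare_oai_boundwiths` runs `ast.literal_eval` on `S3_KEYS`, so the key list has to arrive as a Python-literal string rather than a list, and only keys starting with `SOURCE_FOLDER` are processed. A hedged example call with made-up bucket, folder, and key names:

prepare_oai_boundwiths(
    AWS_ACCESS_KEY_ID="AKIA...",                 # placeholder credentials
    AWS_SECRET_ACCESS_KEY="...",
    BUCKET="my-dev-bucket",                      # hypothetical bucket
    SOURCE_FOLDER="almasftp/boundwith",          # hypothetical prefixes
    DEST_FOLDER="almasftp/lookup.csv",
    S3_KEYS="['almasftp/boundwith/bw1.xml', 'almasftp/other.xml']",  # string literal
)

The resulting child/parent lookup CSV is written back to S3 under the `DEST_FOLDER` key.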
Code Example #3
def prepare_alma_data(**kwargs):
    """Update XML records by injecting parent xml when record 001 is in lookup child_id column."""
    access_id = kwargs.get("AWS_ACCESS_KEY_ID")
    access_secret = kwargs.get("AWS_SECRET_ACCESS_KEY")
    bucket = kwargs.get("BUCKET")
    dest_prefix = kwargs.get("DEST_PREFIX")
    lookup_key = kwargs.get("LOOKUP_KEY")
    src_prefix = kwargs.get("SOURCE_PREFIX")
    src_suffix = kwargs.get("SOURCE_SUFFIX")
    s3_keys = ast.literal_eval(kwargs.get("S3_KEYS"))

    # Generate list of S3 keys we want to index
    alma_keys = [key for key in s3_keys if key.startswith(src_prefix) and key.endswith(src_suffix)]

    # Read Boundwith Lookup file into Memory, with child_id column as array
    csv_data = process.get_s3_content(bucket, lookup_key, access_id, access_secret)
    lookup_csv = pandas.read_csv(io.BytesIO(csv_data), header=0)

    # Process filtered set of keys to untar, ungzip, add MARC21 XML namespaces,
    # & inject parent XML if the record is an identified (via lookup) child record.
    logging.info("Starting to iterate over S3 objects")
    for key in alma_keys:
        logging.info("Loading s3 key %s", key)
        src_obj = process.get_s3_content(bucket, key, access_id, access_secret)
        src_data = process.expand_alma_sftp_tarball(key, src_obj)
        src_xml = process.add_marc21xml_root_ns(src_data)
        for record in src_xml.findall("{http://www.loc.gov/MARC21/slim}record"):
            record_id = process.get_record_001(record)
            parent_txt = lookup_csv.loc[lookup_csv.child_id == int(record_id), 'parent_xml'].values
            if len(set(parent_txt)) >= 1:
                logging.info("Child XML record found %s", record_id)
                for parent_node in parent_txt[0].split("||"):
                    try:
                        record.append(etree.fromstring(parent_node))
                    except etree.XMLSyntaxError as error:
                        logging.error("Problem with string syntax:")
                        logging.error(error)
                        logging.error(parent_node)
        dest_key = key.replace(src_suffix, "").replace(src_prefix, dest_prefix + "/alma_bibs__")
        process.generate_s3_object(
            etree.tostring(src_xml),
            bucket,
            dest_key,
            access_id,
            access_secret
        )
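
The `dest_key` rewrite is just two string replacements; with hypothetical prefix and suffix values it behaves like this:

src_prefix = "almasftp/"                        # hypothetical values
src_suffix = ".tar.gz"
dest_prefix = "almaoai/run-1"
key = "almasftp/alma_bibs_new_01.xml.tar.gz"

dest_key = key.replace(src_suffix, "").replace(src_prefix, dest_prefix + "/alma_bibs__")
# dest_key == "almaoai/run-1/alma_bibs__alma_bibs_new_01.xml"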
Code Example #4
File: validate.py  Project: tulibraries/tulflow
def report_s3_schematron(**kwargs):
    """Wrapper function for using S3 Retrieval, Schematron Reporting, and S3 Writer."""
    source_prefix = kwargs.get("source_prefix")
    dest_prefix = kwargs.get("destination_prefix")
    bucket = kwargs.get("bucket")
    schematron_file = kwargs.get("schematron_filename")
    access_id = kwargs.get("access_id")
    access_secret = kwargs.get("access_secret")

    # create reporting csv
    csv_in_mem = io.StringIO()
    report_csv = csv.DictWriter(
        csv_in_mem, fieldnames=["id", "report", "record", "source_file"])
    report_csv.writeheader()

    # get schematron doc & return lxml.etree.Schematron validator
    schematron_doc = process.get_github_content("tulibraries/aggregator_mdx",
                                                schematron_file)
    schematron = isoschematron.Schematron(etree.fromstring(schematron_doc),
                                          store_report=True)

    # Iterate through S3 Files, Validate, & Save Report to CSV
    total_transform_count = 0
    for s3_key in process.list_s3_content(bucket, access_id, access_secret,
                                          source_prefix):
        logging.info("Validating & Reporting On File: %s", s3_key)
        s3_content = process.get_s3_content(bucket, s3_key, access_id,
                                            access_secret)
        s3_xml = etree.fromstring(s3_content)
        for record in s3_xml.iterchildren():
            total_transform_count += 1
            record_id = record.get("airflow-record-id")
            logging.info("Ran report on record: %s", record_id)
            schematron.validate(record)
            report_csv.writerow({
                "id": record_id,
                "report": schematron_failed_validation_text(schematron.validation_report),
                "record": identifier_or_full_record(record),
                "source_file": f"https://s3.console.aws.amazon.com/s3/object/{bucket}/{s3_key}"
            })
    report_filename = dest_prefix + "-report.csv"
    logging.info("Records report: https://%s.s3.amazonaws.com/%s", bucket,
                 report_filename)
    logging.info("Total Transform Count: %s", total_transform_count)
    process.generate_s3_object(csv_in_mem.getvalue(), bucket, report_filename,
                               access_id, access_secret)

    return {"transformed": total_transform_count}
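
`report_s3_schematron` leans on lxml's `isoschematron` module: constructing `Schematron(..., store_report=True)` keeps the SVRL output of the most recent `validate()` call in `validation_report`. A standalone sketch with a made-up rule:

from lxml import etree, isoschematron

SCH = b"""<schema xmlns="http://purl.oclc.org/dsdl/schematron">
  <pattern>
    <rule context="record">
      <assert test="title">A record must have a title.</assert>
    </rule>
  </pattern>
</schema>"""

validator = isoschematron.Schematron(etree.fromstring(SCH), store_report=True)
record = etree.fromstring(b"<record><id>1</id></record>")

print(validator.validate(record))   # False: the assert fails
print(etree.tostring(validator.validation_report, pretty_print=True).decode())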
Code Example #5
File: test_process.py  Project: tulibraries/tulflow
 def test_get_s3_content(self):
     bucket = "test_bucket"
     key = "test_key_2"
     access_id = "test_access_id"
     access_secret = "test_access_secret"
     conn = boto3.client("s3",
                         aws_access_key_id=access_id,
                         aws_secret_access_key=access_secret)
     conn.create_bucket(Bucket=bucket)
     conn.put_object(Bucket=bucket, Key=key, Body="test more content")
     test_content_exists = conn.get_object(Bucket=bucket, Key=key)
     test_object_exists = conn.list_objects(Bucket=bucket)
     self.assertEqual(test_content_exists["Body"].read(),
                      b"test more content")
     self.assertEqual(
         test_content_exists["ResponseMetadata"]["HTTPStatusCode"], 200)
     self.assertEqual(test_object_exists["Contents"][0]["Key"], key)
     test_run = process.get_s3_content(bucket, key, access_id,
                                       access_secret)
     self.assertEqual(test_run, b"test more content")
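
The test creates a bucket with throw-away credentials, which only succeeds against a mocked S3 backend; the excerpt does not show it, but the surrounding class is presumably wrapped in something like moto. A hedged sketch of that setup (the class name and decorator are assumptions, and newer moto releases expose `mock_aws` instead):

import unittest

import boto3
from moto import mock_s3            # assumption; the real suite may differ

from tulflow import process

@mock_s3
class TestProcess(unittest.TestCase):   # hypothetical class name
    def test_get_s3_content(self):
        ...  # body as shown in the excerpt above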
Code Example #6
File: transform.py  Project: tulibraries/tulflow
def transform_s3_xsl(**kwargs):
    """Transform & Write XML data to S3 using Saxon XSLT Engine."""
    access_id = kwargs.get("access_id")
    access_secret = kwargs.get("access_secret")
    bucket = kwargs.get("bucket")
    dest_prefix = kwargs.get("destination_prefix")
    source_prefix = kwargs.get("source_prefix")
    if kwargs.get("dag"):
        run_id = kwargs.get("dag").dag_id
    else:
        run_id = "no-dag-provided"

    saxon = prepare_saxon_engine()
    transformed = etree.Element("collection")
    transformed.attrib["dag-id"] = run_id
    transformed.attrib["dag-timestamp"] = kwargs.get("timestamp",
                                                     "no-timestamp-provided")
    xsl = "https://raw.github.com/{repo}/{branch}/{filename}".format(
        repo=kwargs.get("xsl_repository", "tulibraries/aggregator_mdx"),
        branch=kwargs.get("xsl_branch", "master"),
        filename=kwargs.get("xsl_filename"))

    for s3_key in process.list_s3_content(bucket, access_id, access_secret,
                                          source_prefix):
        logging.info("Transforming File %s", s3_key)
        s3_content = process.get_s3_content(bucket, s3_key, access_id,
                                            access_secret)
        s3_xml = etree.fromstring(s3_content)
        for record in s3_xml.iterchildren():
            record_id = record.get("airflow-record-id")
            logging.info("Transforming Record %s", record_id)
            result_str = subprocess.check_output(
                ["java", "-jar", saxon, "-xsl:" + xsl, "-s:-"],
                input=etree.tostring(record))
            result = etree.fromstring(result_str)
            result.attrib["airflow-record-id"] = record_id
            transformed.append(result)
        filename = s3_key.replace(source_prefix, dest_prefix)
        transformed_xml = etree.tostring(transformed)
        process.generate_s3_object(transformed_xml, bucket, filename,
                                   access_id, access_secret)
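
With hypothetical kwargs, the Saxon call assembled here expands to a plain command line that reads each record from stdin (`-s:-`) and fetches the stylesheet from a raw GitHub URL:

saxon = "/tmp/saxon-he.jar"   # assumption: prepare_saxon_engine() returns a jar path
xsl = ("https://raw.github.com/tulibraries/aggregator_mdx/master/"
       "transforms/example.xsl")                # hypothetical xsl_filename

cmd = ["java", "-jar", saxon, "-xsl:" + xsl, "-s:-"]
# subprocess.check_output(cmd, input=etree.tostring(record)) pipes the record in
# on stdin and returns the transformed XML as bytes.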
Code Example #7
File: validate.py  Project: tulibraries/tulflow
def filter_s3_schematron(**kwargs):
    """Wrapper function for using S3 Retrieval, Schematron Filtering, and S3 Writer."""
    source_prefix = kwargs.get("source_prefix")
    dest_prefix = kwargs.get("destination_prefix")
    report_prefix = kwargs.get("report_prefix")
    bucket = kwargs.get("bucket")
    schematron_file = kwargs.get("schematron_filename")
    access_id = kwargs.get("access_id")
    access_secret = kwargs.get("access_secret")
    if kwargs.get("dag"):
        run_id = kwargs.get("dag").dag_id
    else:
        run_id = "no-dag-provided"
    if kwargs.get("timestamp"):
        timestamp = kwargs.get("timestamp")
    else:
        timestamp = "no-timestamp-provided"

    # create invalid records reporting csv
    csv_in_mem = io.StringIO()
    invalid_csv = csv.DictWriter(
        csv_in_mem, fieldnames=["id", "report", "record", "source_file"])
    invalid_csv.writeheader()

    # get schematron doc & return lxml.etree.Schematron validator
    schematron_doc = process.get_github_content("tulibraries/aggregator_mdx",
                                                schematron_file)
    schematron = isoschematron.Schematron(etree.fromstring(schematron_doc),
                                          store_report=True)
    total_filter_count = 0
    total_record_count = 0
    for s3_key in process.list_s3_content(bucket, access_id, access_secret,
                                          source_prefix):
        logging.info("Validating & Filtering File: %s", s3_key)
        s3_content = process.get_s3_content(bucket, s3_key, access_id,
                                            access_secret)
        s3_xml = etree.fromstring(s3_content)
        invalid_xml = etree.Element("collection")
        invalid_xml.attrib["dag-id"] = run_id
        invalid_xml.attrib["dag-timestamp"] = timestamp
        filter_count = 0
        record_count = 0
        for record in s3_xml.iterchildren():
            record_count += 1
            total_record_count += 1
            if not schematron.validate(record):
                record_id = record.get("airflow-record-id")
                logging.error("Invalid record found: %s", record_id)
                s3_xml.remove(record)
                filter_count += 1
                invalid_csv.writerow({
                    "id": record_id,
                    "report": schematron_failed_validation_text(schematron.validation_report),
                    "record": identifier_or_full_record(record),
                    "source_file": f"https://s3.console.aws.amazon.com/s3/object/{bucket}/{s3_key}"
                })
        total_filter_count += filter_count
        filename = s3_key.replace(source_prefix, dest_prefix)
        updated_s3_xml = etree.tostring(s3_xml)
        process.generate_s3_object(updated_s3_xml, bucket, filename, access_id,
                                   access_secret)
        if filter_count == record_count and record_count != 0:
            logging.warning(
                f"All records filtered from {filename}. record_count: {record_count}"
            )

    invalid_filename = report_prefix + "-invalid.csv"
    logging.info("Total Filter Count: %s", total_filter_count)
    logging.info("Invalid Records report: https://%s.s3.amazonaws.com/%s",
                 bucket, invalid_filename)
    process.generate_s3_object(csv_in_mem.getvalue(), bucket, invalid_filename,
                               access_id, access_secret)
    if total_filter_count == total_record_count and total_record_count != 0:
        raise AirflowException(
            f"All records were filtered out: {total_record_count}")
    return {"filtered": total_filter_count}
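
Since `filter_s3_schematron` raises `AirflowException` and inspects `kwargs.get("dag")`, it is presumably wired up as an Airflow task. A hedged sketch of that wiring, with the task id, prefixes, and credential placeholders invented for illustration:

from airflow.operators.python import PythonOperator

from tulflow import validate

filter_task = PythonOperator(
    task_id="filter_schematron",                     # hypothetical task id
    python_callable=validate.filter_s3_schematron,
    op_kwargs={
        "source_prefix": "transformed/",             # hypothetical prefixes
        "destination_prefix": "filtered/",
        "report_prefix": "reports/run-1",
        "bucket": "my-dev-bucket",
        "schematron_filename": "validations/example.sch",
        "access_id": "AKIA...",                      # placeholder credentials
        "access_secret": "...",
    },
    dag=dag,                                         # assumes an existing DAG object
)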