Example 1
def validate_volmets(volume_file, s3_items_by_type, path_prefix):
    """
        Confirm that all paths and hashes in volmets match files in S3.

        Returns 'None' if no errors detected, or else dict of files only in S3 and only in the METS file.
    """
    parsed = parse_xml(volume_file)

    only_in_mets = set()
    only_in_s3 = set()

    for file_type in ('jp2', 'tiff', 'alto', 'casemets'):
        volmets_files = set(
            (path_prefix +
             i.children('mets|FLocat').attr(resolve_namespace('xlink|href')),
             i.attr('CHECKSUM'))
            for i in parsed('mets|fileGrp[USE="%s"] mets|file' %
                            file_type).items())
        s3_files = set(s3_items_by_type[file_type])
        if s3_files != volmets_files:
            only_in_mets |= volmets_files - s3_files
            only_in_s3 |= s3_files - volmets_files

    if only_in_mets or only_in_s3:
        return {
            'only_in_mets': list(only_in_mets),
            'only_in_s3': list(only_in_s3),
        }

    return None
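A hedged usage sketch: judging from the set comprehension above, s3_items_by_type maps each file type to (path, md5) pairs whose paths carry the same prefix that is prepended to the METS hrefs. The variable volmets_xml, the paths, and the checksums below are placeholders, not real data.

s3_items_by_type = {
    'jp2': [('32044031754302_redacted/images/0001.jp2', 'md5-placeholder')],
    'tiff': [],
    'alto': [('32044031754302_redacted/alto/0001.xml.gz', 'md5-placeholder')],
    'casemets': [('32044031754302_redacted/casemets/0001.xml.gz', 'md5-placeholder')],
}
errors = validate_volmets(volmets_xml, s3_items_by_type,
                          path_prefix='32044031754302_redacted/')
if errors:
    print(errors['only_in_mets'], errors['only_in_s3'])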
Example 2
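    # Nested helper excerpted from handle_mets_file (see Example 9 below); it relies on
    # mets_xml, new_file_info, relative_path_prefix, and fptr_elements from the enclosing scope.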
    def fix_file_group(group_name, new_mime_type=None, new_id_prefix=None):
        file_group = mets_xml('mets|fileGrp[USE="%s"]' % group_name)
        for file_el in file_group('mets|file'):
            file_el = parse_xml(file_el)
            flocat_el = file_el('mets|FLocat')
            old_href = flocat_el.attr(resolve_namespace('xlink|href'))
            new_data = new_file_info[old_href.replace(relative_path_prefix,
                                                      '')]

            if new_id_prefix:
                file_el.attr(
                    'ID',
                    file_el.attr('ID').replace(group_name, new_id_prefix))
            if new_mime_type:
                file_el.attr('MIMETYPE', new_mime_type)
            file_el.attr('CHECKSUM', new_data['digest'])
            file_el.attr('SIZE', str(new_data['length']))

            flocat_el.attr(resolve_namespace('xlink|href'),
                           relative_path_prefix + new_data['new_path'])

        if new_id_prefix:
            # fix fileGrp element
            file_group.attr('USE', new_id_prefix)

            # fix <fptr> elements
            for fptr in fptr_elements:
                fileid = fptr.attrib.get('FILEID', '')
                if fileid.startswith(group_name):
                    fptr.attrib['FILEID'] = fileid.replace(
                        group_name, new_id_prefix)
Example 3
def test_serialize_xml_should_not_modify_input_xml(unaltered_alto_xml):
    parsed = parse_xml(unaltered_alto_xml)

    # make a change
    parsed('[ID="b17-15"]').attr('ID', 'replace_me')

    # serialize parsed xml
    new_xml = serialize_xml(parsed)

    # undo the change for comparison
    assert b'replace_me' in new_xml  # make sure modification worked
    new_xml = new_xml.replace(b'replace_me', b'b17-15')

    # serialized xml should be identical
    assert unaltered_alto_xml == new_xml
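parse_xml and serialize_xml are project helpers that are not shown on this page. Below is a minimal sketch of what they might look like, assuming they wrap PyQuery and lxml; the real helpers also resolve namespace prefixes such as mets| and alto|, and their exact output formatting is what the round-trip test above depends on.

from lxml import etree
from pyquery import PyQuery

def parse_xml_sketch(xml):
    # Parse a string or bytes into a PyQuery object for CSS-style selection.
    return PyQuery(xml, parser='xml')

def serialize_xml_sketch(parsed):
    # Serialize the underlying lxml tree back to bytes, keeping the XML declaration.
    return etree.tostring(parsed[0].getroottree(), xml_declaration=True, encoding='utf-8')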
Example 4
def handle_alto_file(volume_file_path, tempdir, storage_name):
    storage, out_path = single_file_setup(volume_file_path, tempdir,
                                          storage_name)

    with storage.open(volume_file_path, "r") as in_file:
        alto_xml = parse_xml(in_file.read())

    filename_el = alto_xml('alto|fileName')
    filename_el.text(filename_el.text().replace('.tif', '.png'))

    pstep_el = alto_xml('alto|processingStepDescription')
    pstep_el.text(pstep_el.text().replace('TIFF', 'PNG'))

    # write out xml
    out_file, out_path = write_xml_gz(alto_xml, out_path)
    return format_new_file_info(volume_file_path, out_path, out_file)
Example 5
def test_versioning(transactional_db, versioned_fixture_name, request):
    # load initial volume_xml/case_xml/page_xml
    versioned_instance = request.getfixturevalue(versioned_fixture_name)
    original_instance = deepcopy(versioned_instance)

    # starts with no history
    assert versioned_instance.history.count() == 0

    # versions are only created once per transaction.
    # since tests run in transactions, run an initial sub-transaction to
    # make sure our next save causes a new version to be created.
    # note that this is not sufficient when using the temporal_tables
    # extension, which additionally requires (transaction=True) as an
    # argument to the pytest.mark.django_db decorator
    with transaction.atomic(using='capdb'):
        versioned_instance.save()

    # make some modifications:
    versioned_instance.s3_key = 'changed'
    parsed = parse_xml(versioned_instance.orig_xml)
    parsed('mets').append("<new_element/>")
    versioned_instance.orig_xml = serialize_xml(parsed)

    # save modified version:
    with transaction.atomic(using='capdb'):
        versioned_instance.save()

    # historical version should now exist:
    previous_version = versioned_instance.history.first()
    assert previous_version

    # current version's sys_period should start where historical version's sys_period ends:
    versioned_instance.refresh_from_db()  # load current sys_period
    assert versioned_instance.sys_period.lower == previous_version.sys_period.upper

    # historical version should have values from before latest save:
    assert previous_version.s3_key == original_instance.s3_key
    assert previous_version.orig_xml == original_instance.orig_xml
Example 6
def get_case_metadata(case_xml):
    parsed = parse_xml(case_xml.replace('\xad', ''))

    # duplicative cases won't have a case section, so rather than using case.caseid we get the volume barcode from the
    # first alto file entry, and the case number from the casebody:
    alto_name = parsed('mets|fileGrp[USE="alto"] mets|FLocat')[0].attrib[
        resolve_namespace('xlink|href')].split('/')[-1]
    volume_barcode = re.match(r'([A-Za-z0-9_]+)_(un)?redacted([0-9_]*)',
                              alto_name).group(1)
    case_number = parsed(
        'mets|fileGrp[USE="casebody"] > mets|file').attr["ID"].split('_')[1]
    case_id = "%s_%s" % (volume_barcode, case_number)

    metadata = {'volume_barcode': volume_barcode, 'case_id': case_id}
    if parsed('duplicative|casebody'):
        first_page = parsed('duplicative|casebody').attr.firstpage
        last_page = parsed('duplicative|casebody').attr.lastpage
        return dict(
            metadata, **{
                'duplicative': True,
                'first_page': first_page,
                'last_page': last_page,
            }), parsed

    citation_entries = parsed('case|case').find('case|citation')
    citations = [{
        'citation_type': cite.attrib['category'],
        'citation_text': cite.text,
        'is_duplicative': False
    } for cite in citation_entries]
    jurisdiction = parsed('case|court').attr('jurisdiction').strip()

    name = parsed('case|name').text()
    name_abbreviation = parsed('case|name').attr('abbreviation')

    first_page = parsed('casebody|casebody').attr.firstpage
    last_page = parsed('casebody|casebody').attr.lastpage

    decision_date_original = parsed('case|decisiondate').text()
    decision_date = parse_decision_date(decision_date_original)

    docket_number = parsed('case|docketnumber').text()

    court = {
        'name_abbreviation': parsed('case|court').attr.abbreviation,
        'name': parsed('case|court').text(),
    }

    # apply manual fixes
    jurisdiction, court['name'], court['name_abbreviation'] = fix_court_tag(
        jurisdiction, court['name'], court['name_abbreviation'])

    judges = [judge.text for judge in parsed('casebody|judges')]
    attorneys = [attorney.text for attorney in parsed('casebody|attorneys')]
    parties = [party.text for party in parsed('casebody|parties')]
    opinions = [{
        'type': opinion.attr('type'),
        'author': opinion('casebody|author').text() or None,
    } for opinion in parsed.items('casebody|opinion')]

    return dict(
        metadata, **{
            'name': name,
            'name_abbreviation': name_abbreviation,
            'jurisdiction': jurisdiction,
            'citations': citations,
            'first_page': first_page,
            'last_page': last_page,
            'decision_date_original': decision_date_original,
            'decision_date': decision_date,
            'court': court,
            'docket_number': docket_number,
            'duplicative': False,
            'judges': judges,
            'attorneys': attorneys,
            'parties': parties,
            'opinions': opinions
        }), parsed
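Hypothetical usage, where case_xml_string holds the raw case METS XML read from storage; the keys accessed below all appear in the dicts returned above.

metadata, parsed = get_case_metadata(case_xml_string)
if metadata['duplicative']:
    print(metadata['volume_barcode'], metadata['first_page'], metadata['last_page'])
else:
    print(metadata['case_id'], metadata['name_abbreviation'], metadata['decision_date'])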
Example 7
def xml_equal(s1, s2, **kwargs):
    e1 = parse_xml(s1)[0]
    e2 = parse_xml(s2)[0]
    return elements_equal(e1, e2, **kwargs)
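elements_equal is not shown here; a plausible recursive comparison over the lxml root elements returned by parse_xml(...)[0] might look like the sketch below. The real helper presumably supports extra options via **kwargs (for example, attributes to ignore), which this sketch omits.

def elements_equal_sketch(e1, e2):
    # Compare tag, text, tail, and attributes, then recurse into children.
    if e1.tag != e2.tag:
        return False
    if (e1.text or '').strip() != (e2.text or '').strip():
        return False
    if (e1.tail or '').strip() != (e2.tail or '').strip():
        return False
    if dict(e1.attrib) != dict(e2.attrib):
        return False
    if len(e1) != len(e2):
        return False
    return all(elements_equal_sketch(c1, c2) for c1, c2 in zip(e1, e2))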
Example 8
def validate_volume(volume_path):
    """
        Perform basic sanity checks on captar archives, and write "ok" or an error into `validation` folder.

        Relative to captar_storage:
            volume_path looks like 'redacted/32044031754302_redacted'
            output file looks like 'validation/redacted/32044031754302_redacted.txt'
    """

    # helpers
    top_level_file_sets = {("METS.md5", "METS.xml.gz"),
                           ("BOXES.xml.gz", "METS.md5", "METS.xml.gz")}

    class ValidationResult(Exception):
        pass

    # check last result
    result_path = str(Path('validation', volume_path).with_suffix('.txt'))
    if captar_storage.exists(result_path):
        last_result = json.loads(captar_storage.contents(result_path))
        if last_result[0] == "ok":
            print("Volume %s already validated; skipping." % volume_path)
            return

    try:

        # load tar file as a storage wrapper and get list of items
        with open_captar_volume(volume_path,
                                raise_on_not_found=False) as volume_storage:
            if not volume_storage:
                raise ValidationResult(
                    "index_missing",
                    "Failed to load index for %s" % volume_path)
            tar_items = set(volume_storage.iter_files_recursive(with_md5=True))

            # volmets_path is path with no slashes ending in METS.xml.gz
            volmets_path = next(
                (item for item in tar_items
                 if item[0].count("/") == 0 and item[0].endswith("METS.xml.gz")
                 ), None)

            # check for missing volmets
            if not volmets_path:
                raise ValidationResult("volmets_missing", volume_path)

            # check md5 of volmets
            md5_path = next(
                (item[0] for item in tar_items
                 if item[0].count("/") == 0 and item[0].endswith(".md5")),
                None)
            if not md5_path:
                raise ValidationResult("md5_missing")
            if volmets_path[1] != volume_storage.contents(md5_path):
                raise ValidationResult("volmets_md5_mismatch")

            # strip .gz so the storage will decompress for us
            volmets_path = volmets_path[0][:-3]

            # check for mismatched files
            orig_xml = volume_storage.contents(volmets_path)
            parsed = parse_xml(orig_xml)
            tar_item_checksum_lookup = dict(tar_items)
            volmets_files = set()
            for i in parsed('mets|file').items():
                file_name = i.children('mets|FLocat').attr(
                    resolve_namespace('xlink|href'))
                checksum = i.attr('CHECKSUM')
                # special case -- because of a processing error, pdf files in volmets don't always have checksums
                # in that case, default the checksum to the actual file checksum so it will match
                if checksum is None and file_name.endswith('.pdf'):
                    checksum = tar_item_checksum_lookup.get(file_name)
                volmets_files.add((file_name, checksum))

            # check that all files in METS are expected
            only_in_mets = volmets_files - tar_items
            if only_in_mets:
                raise ValidationResult("only_in_mets",
                                       [list(i) for i in only_in_mets])

            # check that all files only_in_tar are expected (should be just the volmets and its md5, optionally plus a BOXES file)
            only_in_tar = tuple(
                sorted(item[0].rsplit('_', 1)[-1]
                       for item in tar_items - volmets_files))
            if only_in_tar not in top_level_file_sets:
                raise ValidationResult("only_in_tar", list(only_in_tar))

            # count suffixes
            suffix_counts = defaultdict(int)
            for item in volmets_files:
                suffix_counts[item[0].split('.', 1)[1]] += 1
            color_image_count = suffix_counts['jpg'] or suffix_counts['pdf']
            if color_image_count == 0 or color_image_count != suffix_counts[
                    'tif'] or suffix_counts['xml.gz'] <= color_image_count:
                raise ValidationResult("unexpected_suffix_counts",
                                       suffix_counts)

            raise ValidationResult("ok")

    except ValidationResult as result:
        print(result.args)
        captar_storage.save(result_path,
                            BytesIO(json.dumps(result.args).encode()))
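A hedged follow-up sketch: reading back a result that validate_volume wrote, using the path convention from the docstring. The stored JSON list mirrors ValidationResult.args, so its first element is either "ok" or one of the error codes raised above.

result = json.loads(captar_storage.contents('validation/redacted/32044031754302_redacted.txt'))
if result[0] != 'ok':
    print('validation failed:', result)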
Example 9
def handle_mets_file(volume_file_path,
                     tempdir,
                     storage_name,
                     new_file_info,
                     relative_path_prefix=''):
    storage, out_path = single_file_setup(volume_file_path, tempdir,
                                          storage_name)

    with storage.open(volume_file_path, "r") as in_file:
        mets_xml = parse_xml(in_file.read())

    # add provenance data
    # spacing at start and end of string matters here -- makes sure formatting matches surrounding elements
    mets_xml('mets|amdSec').append("""  <digiprovMD ID="digi004">
      <mdWrap MDTYPE="PREMIS">
        <xmlData>
          <event xmlns="info:lc/xmlns/premis-v2">
            <eventIdentifier>
              <eventIdentifierType>Local</eventIdentifierType>
              <eventIdentifierValue>proc0001</eventIdentifierValue>
            </eventIdentifier>
            <eventType>compression</eventType>
            <eventDateTime>%s</eventDateTime>
            <eventDetail>File compression</eventDetail>
          </event>
          <agent xmlns="info:lc/xmlns/premis-v2">
            <agentIdentifier>
              <agentIdentifierType>Local</agentIdentifierType>
              <agentIdentifierValue>HLSL</agentIdentifierValue>
            </agentIdentifier>
            <agentName>Harvard Law School Library</agentName>
            <agentType>organization</agentType>
          </agent>
        </xmlData>
      </mdWrap>
    </digiprovMD>
  """ % (datetime.utcnow().isoformat().split('.')[0] + 'Z'))

    # update <fileGrp> sections
    fptr_elements = mets_xml('mets|fptr')

    def fix_file_group(group_name, new_mime_type=None, new_id_prefix=None):
        file_group = mets_xml('mets|fileGrp[USE="%s"]' % group_name)
        for file_el in file_group('mets|file'):
            file_el = parse_xml(file_el)
            flocat_el = file_el('mets|FLocat')
            old_href = flocat_el.attr(resolve_namespace('xlink|href'))
            new_data = new_file_info[old_href.replace(relative_path_prefix,
                                                      '')]

            if new_id_prefix:
                file_el.attr(
                    'ID',
                    file_el.attr('ID').replace(group_name, new_id_prefix))
            if new_mime_type:
                file_el.attr('MIMETYPE', new_mime_type)
            file_el.attr('CHECKSUM', new_data['digest'])
            file_el.attr('SIZE', str(new_data['length']))

            flocat_el.attr(resolve_namespace('xlink|href'),
                           relative_path_prefix + new_data['new_path'])

        if new_id_prefix:
            # fix fileGrp element
            file_group.attr('USE', new_id_prefix)

            # fix <fptr> elements
            for fptr in fptr_elements:
                fileid = fptr.attrib.get('FILEID', '')
                if fileid.startswith(group_name):
                    fptr.attrib['FILEID'] = fileid.replace(
                        group_name, new_id_prefix)

    fix_file_group('jp2', 'image/jpg', 'jpg')
    # use this if doing jp2 -> compressed jp2
    # fix_file_group('jp2')
    # use this if compressing tiff -> png
    # fix_file_group('tiff', 'image/png', 'png')
    fix_file_group('alto', 'text/xml+gzip')
    fix_file_group('casemets', 'text/xml+gzip')

    # write out xml
    out_file, out_path = write_xml_gz(mets_xml, out_path)
    return format_new_file_info(volume_file_path, out_path, out_file)