Example #1
def readRootMetsFromIP(tarFile):
    mets_info_entries = [
        member for member in tarFile.getmembers()
        if re.match(mets_entry_pattern, member.name)
    ]
    if len(mets_info_entries) == 1:
        logger.info("Root METS file found in container file")
        root_mets_file_entry = mets_info_entries[0].name
        root_mets_file_entry_base_dir = os.path.dirname(root_mets_file_entry)
        root_mets_content = read_textfile_from_tar(tarFile,
                                                   root_mets_file_entry)
        root_mets = ET.fromstring(bytes(root_mets_content, 'utf-8'))
        return (root_mets, root_mets_file_entry_base_dir)
    return None
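The helper read_textfile_from_tar is not defined in any of these examples. Since every caller re-encodes the result with bytes(..., 'utf-8'), it presumably extracts a named tar member and decodes it to a string; a minimal sketch under that assumption (the function body and the utf-8 default are guesses, not the project's actual implementation):

def read_textfile_from_tar(tar_file, entry_name, encoding='utf-8'):
    # Hypothetical sketch: extract the named member from an open TarFile
    # and decode its bytes to text. extractfile() returns None for
    # non-regular members, so callers are assumed to pass file entries.
    extracted = tar_file.extractfile(entry_name)
    return extracted.read().decode(encoding)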
Example #2
def validate_and_detect_ips(ips_root_dir, user_id, task=None, task_log=None):  # task parameter is provided by huey context
    task_log.log("Loading information packages from user directory: %s" % ips_root_dir)
    # TODO: implement validation and detection of ips
    u = User.objects.get(id=user_id)
    # delete all user records first
    DetectedInformationPackage.objects.filter(user=u).delete()
    # read tar files
    all_files_in_user_dir = list_files_in_dir(ips_root_dir)
    container_files_in_user_dir = [f for f in all_files_in_user_dir if re.match(input_file_filter_regex, f)]
    task_log.log("There are %d container files in the user's directory" % len(container_files_in_user_dir))
    for file_in_user_dir in container_files_in_user_dir:
        task_log.log(html_bold("Container file: %s" % file_in_user_dir))
        mime_str = str(get_mime_type(file_in_user_dir))
        if mime_str not in supported_processing_mime_types:
            task_log.log("File skipped: Mime type not supported: %s ")
        else:
            try:
                object_path = os.path.join(ips_root_dir, file_in_user_dir)
                t = tarfile.open(object_path, 'r')
                mets_info_entries = [member for member in t.getmembers() if re.match(mets_entry_pattern, member.name)]
                if len(mets_info_entries) == 1:
                    task_log.log("Root METS file found in container file")
                    root_mets_file_entry = mets_info_entries[0].name
                    root_mets_file_entry_base_dir = os.path.dirname(root_mets_file_entry)
                    root_mets_content = read_textfile_from_tar(t, root_mets_file_entry)
                    parsed_mets = ET.fromstring(bytes(root_mets_content, 'utf-8'))
                    package_mets_schema_location = get_schema_location_by_regex(parsed_mets, r".*/METS/.*")

                    mets_schema_locations = get_schema_locations(parsed_mets)

                    mets_schema_locations_container_entries = [
                        os.path.join(root_mets_file_entry_base_dir, loc) for loc in mets_schema_locations.values()
                    ]
                    # extract schema files to local user directory
                    extract_container_entries(t, mets_schema_locations_container_entries, "%s/schemas" % ips_root_dir)

                    if package_mets_schema_location:
                        # build the path only when a schema location exists,
                        # otherwise os.path.join would fail on None
                        mets_schema_path = os.path.join(ips_root_dir, extracted_schemas_directory,
                                                        root_mets_file_entry_base_dir, package_mets_schema_location)
                        if not package_mets_schema_location.startswith("schemas/"):
                            task_log.error("Schema files must be included in the 'schemas' directory")
                        else:
                            xmlval = XmlValidation()
                            try:
                                parsed_mets_schema = ET.parse(mets_schema_path)
                                validation_result = xmlval.validate_XML(parsed_mets, parsed_mets_schema)
                                task_log.success("The root METS file is %s" %
                                                 ("valid" if validation_result.valid else "invalid"))
                                if len(validation_result.err) > 0:
                                    task_log.log("Errors: %s" % ", ".join(validation_result.err))
                            except Exception as e:
                                task_log.error("Errors occurred when validating root METS file:")
                                task_log.log("%s" % str(e))
                    else:
                        task_log.warning("Root METS file not validated because no METS schema location was provided")
                    # create record only if OBJID is available
                    if 'OBJID' in parsed_mets.attrib:
                        obj_id = parsed_mets.attrib['OBJID']
                        task_log.log("Object ID: %s" % obj_id)
                        DetectedInformationPackage.objects.create(
                            task_id=task.id,
                            information_package=obj_id,
                            title="First IP",
                            ip_base_dir=root_mets_file_entry_base_dir,
                            ip_filename=file_in_user_dir,
                            user=u,
                            selected=False
                        )  # objects.create() already persists the record
                    else:
                        task_log.log("No object ID defined!")

                t.close()
            except tarfile.TarError:
                print("Error reading tar file: %s" % file_in_user_dir)
    task_log.log("Finished detecting information packages")
    return True
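Example #2 relies on extract_container_entries to copy the referenced schema files out of the archive into the user directory. A plausible sketch, assuming it extracts only the requested members into the target directory and skips names that are not present in the archive (this helper is not shown in the source, so both the behavior and the skipping policy are assumptions):

import tarfile

def extract_container_entries(tar_file, entry_names, target_dir):
    # Hypothetical sketch: extract the requested members of an open
    # TarFile into target_dir, ignoring names the archive does not contain.
    available = set(tar_file.getnames())
    for name in entry_names:
        if name in available:
            tar_file.extract(name, path=target_dir)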
Example #3
def get_basic_metadata(request, file_path):
    user_data_path = os.path.join(ip_data_path, request.user.username)
    vars = environment_variables(request)
    if not vars['selected_ip']:
        return {}
    selected_ip = vars['selected_ip']
    logger.debug(file_path)
    archive_file_path = os.path.join(user_data_path, selected_ip.ip_filename)
    with tarfile.open(archive_file_path, 'r') as t:
        file_content = read_textfile_from_tar(t, file_path)
    tmp_file_path = "/tmp/%s" % randomword(10)
    res_events = []
    try:
        title = ""
        date = ""
        with open(tmp_file_path, 'w') as tmp_file:
            tmp_file.write(file_content)
        if fnmatch.fnmatch(file_path, metadata_file_pattern_ead):
            pead = ParsedEad("/tmp", tmp_file_path)
            dao_elements = pead.get_dao_elements()
            actual = pead.ead_tree.getroot().tag
            unit_titles = []
            unit_dates = []
            for dao_elm in dao_elements:
                unit_titles.append(
                    pead._first_md_val_ancpath(dao_elm, "unittitle"))
                unit_dates.append(
                    pead._first_md_val_ancpath(dao_elm, "unitdate"))
            title = unit_titles[0] if unit_titles else ""
            date = unit_dates[0] if unit_dates else ""
        elif fnmatch.fnmatch(file_path, metadata_file_pattern_premis):
            structure = get_ip_structure(request)
            logical_view = search(structure, "logical_view_data")
            events = search(logical_view, "events")
            for event in events:
                if len(event):
                    res_events.append({
                        'type':
                        event[0]['type'],
                        'datetime':
                        event[0]['datetime'],
                        'agent':
                        event[0]['linking_agent_id']['value']
                    })
            title = "Root PREMIS"
            date = "20.09.2017"

        md_type = ead_md_type if fnmatch.fnmatch(file_path, metadata_file_pattern_ead)  \
            else premis_md_type if fnmatch.fnmatch(file_path, metadata_file_pattern_premis) else "Other"

        return JsonResponse(
            {
                'success': True,
                'type': md_type,
                'title': title,
                'date': date,
                'events': res_events,
                'file_path': file_path
            },
            status=200)
    except Exception as error:
        logger.exception(error)
        return JsonResponse({
            'success': False,
            'error': str(error)
        },
                            status=500)
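Example #3 uses randomword to build a throwaway temp-file name. The helper is not shown; a minimal sketch of what such a function usually looks like (the lowercase alphabet is an assumption):

import random
import string

def randomword(length):
    # Hypothetical sketch: a random lowercase string of the given length,
    # used here only to avoid temp-file name collisions.
    return ''.join(random.choice(string.ascii_lowercase) for _ in range(length))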
Example #4
def get_ip_overview_context(request):
    user_data_path = os.path.join(ip_data_path, request.user.username)
    vars = environment_variables(request)
    if not vars['selected_ip']:
        return {}
    object_path = os.path.join(user_data_path, vars['selected_ip'].ip_filename)
    tarFile = tarfile.open(object_path, 'r')
    overview = {}  # returned empty if no root METS entry is found
    mets_info_entries = [
        member for member in tarFile.getmembers()
        if re.match(mets_entry_pattern, member.name)
    ]
    if len(mets_info_entries) == 1:
        logger.info("Root METS file found in container file")
        root_mets_file_entry = mets_info_entries[0].name
        root_mets_file_entry_base_dir = os.path.dirname(root_mets_file_entry)
        root_mets_content = read_textfile_from_tar(tarFile,
                                                   root_mets_file_entry)
        root_mets = ET.fromstring(bytes(root_mets_content, 'utf-8'))

        all_schemas = []
        for root_structMap in root_mets.iter(
                '{http://www.loc.gov/METS/}structMap'):
            if root_structMap.get('TYPE') == 'PHYSICAL':
                for div in root_structMap.find(
                        '{http://www.loc.gov/METS/}div'):
                    label = div.get('LABEL')
                    if label == 'schemas':
                        schemas = get_schemas_section(
                            div, root_mets, root_mets_file_entry_base_dir)
                        all_schemas += [
                            schema['text'] for schema in schemas['nodes']
                        ]
                        continue


        total_size = 0
        total_number_content_files = 0
        content_mime_types = []
        representations = []
        overview['object_id'] = root_mets.attrib['OBJID']

        ead_info_entries = [
            member for member in tarFile.getmembers()
            if re.match(ead_entry_pattern, member.name)
        ]
        if len(ead_info_entries) == 1:
            logger.info("EAD file found in container file")
            root_ead_file_entry = ead_info_entries[0].name
            root_ead_file_entry_base_dir = os.path.dirname(root_ead_file_entry)
            root_ead_content = read_textfile_from_tar(tarFile,
                                                      root_ead_file_entry)
            root_ead = ET.fromstring(bytes(root_ead_content, 'utf-8'))

            found = [
                element.text for element in root_ead.iter(
                    '{http://ead3.archivists.org/schema/}titleproper')
            ]
            overview['title'] = found[0] if found else "Unknown"
        else:
            overview['title'] = "Unknown. EAD file missing."

        for root_fileGrp in root_mets.iter(
                '{http://www.loc.gov/METS/}fileGrp'):
            if root_fileGrp.get('USE') == 'representations':
                for root_file in root_fileGrp.iter(
                        '{http://www.loc.gov/METS/}file'):
                    FLocat = root_file.find('{http://www.loc.gov/METS/}FLocat')
                    rep_mets_file_entry = FLocat.get(
                        "{http://www.w3.org/1999/xlink}href")
                    # href values are relative ('./...'); strip only the leading dot
                    rep_mets_file_entry = root_mets_file_entry_base_dir + rep_mets_file_entry.lstrip('.')
                    rep_mets_content = read_textfile_from_tar(
                        tarFile, rep_mets_file_entry)
                    rep_mets = ET.fromstring(bytes(rep_mets_content, 'utf-8'))
                    representation = {}
                    representation['identifier'] = rep_mets.get('OBJID')
                    for rep_fileGrp in rep_mets.iter(
                            '{http://www.loc.gov/METS/}fileGrp'):
                        for rep_file in rep_fileGrp.iter(
                                '{http://www.loc.gov/METS/}file'):
                            mimetype = rep_file.get('MIMETYPE')
                            representation['label'] = get_representation_label_for_id(
                                root_mets, root_file.get('ID'))
                            content_mime_types.append(mimetype)
                            total_size += int(rep_file.get('SIZE', 0))
                            total_number_content_files += 1
                    representations.append(representation)
        overview['representations'] = representations
        total_number_representations = len(representations)
        overview['stats'] = {
            "total_size": total_size,
            "total_number_content_files": total_number_content_files,
            "total_number_representations": total_number_representations,
            "schemas": ','.join(all_schemas),
            "content_mime_types": ", ".join(list(set(content_mime_types))),
        }
    return overview
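Example #4 calls get_representation_label_for_id to map a METS file ID back to the LABEL of the structMap div that references it. That helper is not shown; a sketch under that assumption:

def get_representation_label_for_id(mets_root, file_id):
    # Hypothetical sketch: walk every structMap div and return the LABEL
    # of the first div whose fptr references the given file ID.
    ns = '{http://www.loc.gov/METS/}'
    for struct_map in mets_root.iter(ns + 'structMap'):
        for div in struct_map.iter(ns + 'div'):
            for fptr in div.iter(ns + 'fptr'):
                if fptr.get('FILEID') == file_id:
                    return div.get('LABEL')
    return None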
Example #5
def get_representation_section(div, root_mets, tarFile,
                               root_mets_file_entry_base_dir):
    # root_mets may be a (tree, base_dir) tuple as returned by readRootMetsFromIP
    if isinstance(root_mets, tuple):
        e = root_mets[0]
    else:
        e = root_mets
    fileSec = next(e.iter('{http://www.loc.gov/METS/}fileSec'))
    label = div.get('LABEL')
    rep_node = {}
    for fptr in div.iter('{http://www.loc.gov/METS/}fptr'):
        file_name = fileSec_get_file_for_id(fileSec, fptr.get('FILEID'))
        if file_name is not None:
            if file_name.endswith('METS.xml'):
                # file names are relative ('./...'); strip only the leading dot
                rep_mets_file_entry = root_mets_file_entry_base_dir + file_name.lstrip('.')
                rep_mets_content = read_textfile_from_tar(
                    tarFile, rep_mets_file_entry)
                rep_mets = ET.fromstring(bytes(rep_mets_content, 'utf-8'))
                rep_node = {
                    "icon": "fa fa-ibox fa-fw",
                    "text": rep_mets.get('OBJID'),
                    "nodes": []
                }
                for rep_structMap in rep_mets.iter(
                        '{http://www.loc.gov/METS/}structMap'):
                    if rep_structMap.get('TYPE') == 'PHYSICAL':
                        for rep_div in rep_structMap.find(
                                '{http://www.loc.gov/METS/}div'):
                            # rep_div avoids shadowing the div parameter
                            label = rep_div.get('LABEL')
                            if label == 'data':
                                data = get_data_section(
                                    rep_div, rep_mets,
                                    os.path.dirname(rep_mets_file_entry))
                                rep_node['nodes'].append(data)
                                continue
                            if label == 'schemas':
                                schemas = get_schemas_section(
                                    rep_div, rep_mets,
                                    os.path.dirname(rep_mets_file_entry))
                                rep_node['nodes'].append(schemas)
                                continue
                            if label == 'metadata':
                                metadata = get_metadata_section(
                                    rep_div, rep_mets,
                                    os.path.dirname(rep_mets_file_entry))
                                # Read premis and create premis entry
                                for entry in metadata['nodes']:
                                    if entry['text'].endswith('premis.xml'):
                                        premis_file_entry = os.path.dirname(
                                            rep_mets_file_entry
                                        ) + entry['text'].strip(".")
                                        rep_premis_content = read_textfile_from_tar(
                                            tarFile, premis_file_entry)
                                        rep_premis = ET.fromstring(
                                            bytes(rep_premis_content, 'utf-8'))
                                        metadata['premis'] = {
                                            'representations':
                                            getRepresentations(rep_premis),
                                            'events':
                                            getPemisEvents(rep_premis)
                                        }
                                rep_node['nodes'].append(metadata)
                                continue
    return rep_node
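Example #5 resolves fptr references through fileSec_get_file_for_id, which is not shown. Given how its result is used (a relative './...' path ending in a file name), a plausible sketch is a lookup of the FLocat xlink:href for a file ID inside the fileSec (the body below is an assumption, not the project's actual helper):

def fileSec_get_file_for_id(file_sec, file_id):
    # Hypothetical sketch: find the <file> element with the given ID inside
    # the fileSec and return its FLocat xlink:href (a relative './...' path),
    # or None if no matching file exists.
    ns_mets = '{http://www.loc.gov/METS/}'
    ns_xlink = '{http://www.w3.org/1999/xlink}'
    for mets_file in file_sec.iter(ns_mets + 'file'):
        if mets_file.get('ID') == file_id:
            flocat = mets_file.find(ns_mets + 'FLocat')
            if flocat is not None:
                return flocat.get(ns_xlink + 'href')
    return None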