Ejemplo n.º 1
0
def run():
    """Request suggested terms for the escaped Tesseract text from two projects.

    Reads the module-level ``ocrspace_text`` and ``tesseract_text``; only the
    XML-escaped ``tesseract_text`` is actually sent. The API results are not
    returned or stored beyond the local ``r``.
    """
    import html
    doc = ocrspace_text
    doc = etl.escape_xml_text(tesseract_text)

    # Ask each project in turn: first 'floridathes', then 'geothesFlorida'.
    # ``r`` ends up holding the result of the last request.
    for project in ('floridathes', 'geothesFlorida'):
        r = get_suggested_terms_data_harmony_api_result(project=project,
                                                        doc=doc)
Ejemplo n.º 2
0
def manioc_node_writer(bib_vid=None,
                       d_record=None,
                       metadata_prefix=None,
                       output_folder=None,
                       verbosity=0):
    """Write one harvested Manioc OAI record as a raw XML file and a METS file.

    The record is first saved verbatim under
    ``<output_folder><metadata_prefix>/xml/``. Values pulled from its ``dc``
    metadata then fill ``manioc_mets_format_str`` and the result is written to
    ``<output_folder><metadata_prefix>/mets/<bib_vid>/<bib_vid>.mets.xml``.

    Args:
        bib_vid: Identifier of the form ``XXXXXXXXXX_NNNNN`` (10-character
            bibid, an underscore, then the vid).
        d_record: Dict providing ``'node_record'`` (the record element,
            presumably an lxml element) and ``'namespaces'``.
        metadata_prefix: Must be ``'oai_dc'``.
        output_folder: Output root. It is concatenated directly with
            ``metadata_prefix``, so it should end with a path separator.
        verbosity: Values above 0 print progress messages; above 1 also
            prints the serialized record.

    Returns:
        1 when a METS file was written; 0 when the record has no ``dc`` node
        (this happens for 'delete' records). The raw XML file is written in
        both cases.

    Raises:
        ValueError: If a required argument is None, if ``metadata_prefix`` is
            not ``'oai_dc'``, or if ``bib_vid`` lacks the underscore at
            index 10.
    """
    me = 'manioc_node_writer'
    rparams = ['d_record', 'bib_vid', 'output_folder', 'metadata_prefix']
    # Check the argument VALUES. Testing the list of parameter names (as the
    # code did before) is always true, so nothing was ever validated.
    if any(value is None
           for value in (d_record, bib_vid, output_folder, metadata_prefix)):
        raise ValueError("Missing some required params from {}".format(
            repr(rparams)))

    if metadata_prefix != 'oai_dc':
        raise ValueError(
            "{}: Currently only support metadata_prefix of oai_dc, not '{}''".
            format(me, metadata_prefix))

    # Validate the identifier before any folder is created on disk.
    bibid = bib_vid[:10]
    if bib_vid[10:11] != '_':
        raise ValueError("Bad bib_vid format for {}".format(bib_vid))
    vid = bib_vid[11:16]

    def text_of(node, default=''):
        """Return node.text, or default if the node or its text is missing."""
        if node is None or node.text is None:
            return default
        return node.text

    # 20170815 NOTE
    # cannot get shutil to remove older files properly on windows 7...
    # so if needed must remember to remove them by hand before running this.
    output_folder_xml = '{}{}/xml/'.format(output_folder, metadata_prefix)
    os.makedirs(output_folder_xml, exist_ok=True)

    output_folder_mets = '{}{}/mets/'.format(output_folder, metadata_prefix)
    os.makedirs(output_folder_mets, exist_ok=True)

    rights = 'Your rights: '
    d_mets_template = {
        "genre_authority": "Manioc",
        "physical_location_name": "Manioc",
        "physical_location_code": "MANIOC",
        "rights_text": rights,
        # List of extant SobekCM database wordmark codes to derive xml for
        # the mets template, eg for miami merrick it was ['UM','DLOC']
        "list_sobekcm_wordmarks": [],
        # List of extant SobekCM database aggregation codes to derive xml
        # for the mets template
        "list_sobekcm_aggregations": [
            'ALL',
        ],

        #######################################################################
        # Following must be or usually are software derived-supplied
        #######################################################################
        "agent_creator_individual_name": None,
        "agent_creator_individual_note": 'Creation via Manioc OAI  harvest',
        "bibid": None,
        "create_date": None,
        "description": None,
        "harvest_utc_secs_z": None,
        # Consider using this later...
        "header_identifier_text": "",
        "last_mod_date": None,
        "mods_subjects": None,
        "mods_title": None,
        "personal_creator_name": None,
        "personal_creator_role": None,
        "related_url": None,
        "sha1_mets_v1": None,
        "sobekcm_thumbnail_src": "",  # Overwritten below from dc:relation
        "source_id_name": "manioc_OAI_header_identifier_2017",
        "vid": None,
        "xml_dc_ids": "",
        # Just puts wrapping around list_sobekcm_aggregations
        "xml_sobekcm_aggregations": "",
        "xml_sobekcm_subjects": "",  # Derived from OAI record metadata
        # Puts xml wrapping around list_sobekcm_wordmarks
        "xml_sobekcm_wordmarks": "",
    }

    node_record = d_record['node_record']
    namespaces = d_record['namespaces']
    # Note: node_root and n_batch and node_record_count are also in d_record
    # if needed

    node_type = node_record.find(".//{*}dc/{*}type", namespaces=namespaces)
    d_mets_template['genre'] = etl.escape_xml_text(text_of(node_type))

    node_identifier = node_record.find("./{*}header/{*}identifier",
                                       namespaces=namespaces)
    header_identifier_text = text_of(node_identifier)

    # Make the OAI identifier usable as a file name.
    header_identifier_normalized = (header_identifier_text.replace(
        ':', '_').replace('/', '_').replace('.', '-'))

    if verbosity > 0:
        print("using bib={}, vid={}, bib_vid={} to output item with "
              "manioc header_identifier_normalized={}".format(
                  bibid, vid, bib_vid, header_identifier_normalized))

    # The first dc:publisher (if any) names the content source.
    nodes_source = node_record.findall(".//{*}dc/{*}publisher",
                                       namespaces=namespaces)
    d_mets_template['content_source_name'] = (text_of(nodes_source[0])
                                              if nodes_source else '')

    # From node_record, create the bytes to save as the received xml record.
    # Note: with encoding="utf-8", tostring returns encoded bytes, hence the
    # binary write mode below.
    xml_record_str = etree.tostring(node_record,
                                    pretty_print=True,
                                    xml_declaration=True,
                                    encoding="utf-8")
    if verbosity > 1:
        print("{}:Got xml_record_str={}".format(me, xml_record_str))

    filename_xml = output_folder_xml + header_identifier_normalized + '.xml'
    if verbosity > 0:
        print("{}:using output_filename_xml={}".format(me, filename_xml))

    with open(filename_xml, mode='wb') as outfile:
        if verbosity > 0:
            print("{}:Writing filename_xml ='{}'".format(me, filename_xml))
        outfile.write(xml_record_str)

    # Set some variables to potentially output into the METS template
    utc_now = datetime.datetime.utcnow()
    utc_secs_z = utc_now.strftime("%Y-%m-%dT%H:%M:%SZ")
    d_mets_template['utc_secs_z'] = utc_secs_z

    node_mdp = node_record.find(".//{*}dc", namespaces=namespaces)
    if node_mdp is None:
        # This happens for received 'delete' records.
        # Just return to ignore these records pending requirements to
        # process them.
        return 0

    node_creator = node_mdp.find(".//{*}creator", namespaces=namespaces)
    dc_creator = etl.escape_xml_text(text_of(node_creator))

    # node_mdp already IS the dc node, so search for publisher directly below
    # it. The former path ".//{*}dc/{*}publisher" looked for a nested dc
    # element and therefore never matched.
    node_publisher = node_mdp.find(".//{*}publisher", namespaces=namespaces)
    d_mets_template['publisher'] = etl.escape_xml_text(
        text_of(node_publisher))

    # For manioc, they encode the thumbnail in dc:relation
    node_relation = node_mdp.find(".//{*}relation", namespaces=namespaces)
    thumbnail_src = text_of(node_relation)
    # Skip over the beginning "vignette : " (11 characters) expected in this
    # field.
    # NOTE(review): the prefix is dropped without checking that it is really
    # there, and the length threshold is 10 rather than 11 -- kept as found.
    if len(thumbnail_src) >= 10:
        thumbnail_src = thumbnail_src[11:]
    d_mets_template['sobekcm_thumbnail_src'] = thumbnail_src

    node_date = node_mdp.find(".//{*}date", namespaces=namespaces)
    dc_date_orig = text_of(node_date, default='1969-01-01')
    # If we get only a year (we have had some like this), pad it to
    # year-01-01; pad a year-month value to year-month-01.
    if len(dc_date_orig) < 5:
        dc_date_orig += "-01-01"
    elif len(dc_date_orig) < 8:
        dc_date_orig += "-01"
    dc_date = '{}T12:00:00Z'.format(dc_date_orig)

    node_description = node_mdp.find(".//{*}description",
                                     namespaces=namespaces)
    dc_description = ''
    if node_description is not None:
        dc_description = etl.escape_xml_text(text_of(node_description))

    # Manioc has only one dc:identifier, used for the related url, so keep
    # xml_dc_ids in the template in case the server response evolves, but for
    # now just give it a newline value.
    xml_dc_ids = '\n'

    # For manioc, the first dc identifier is the related url of the item
    nodes_identifier = node_mdp.findall(".//{*}identifier",
                                        namespaces=namespaces)
    d_mets_template['related_url'] = (text_of(nodes_identifier[0])
                                      if nodes_identifier else '')

    nodes_language = node_mdp.findall(".//{*}language",
                                      namespaces=namespaces)
    lang_code = (text_of(nodes_language[0], default='eng').lower()
                 if nodes_language else 'eng')
    iso639_2b_code = etl.d_language_639_2b[lang_code]
    d_mets_template['iso639_2b_code'] = iso639_2b_code
    d_mets_template['iso639_2b_text'] = etl.d_langcode_langtext[
        iso639_2b_code]

    # Concatenate the record's rights statements onto our rights text
    rights_text = d_mets_template['rights_text']
    for node_rights in node_mdp.findall(".//{*}rights",
                                        namespaces=namespaces):
        rights_text += '\n' + text_of(node_rights)
    d_mets_template['rights_text'] = etl.escape_xml_text(rights_text)

    # Subjects: each dc:subject may hold several ';'-separated topics
    topic_parts = ['<mods:subject>']
    for node_subject in node_mdp.findall(".//{*}subject",
                                         namespaces=namespaces):
        for subject in text_of(node_subject).split(';'):
            subject = subject.strip()
            if not subject:
                continue
            topic_parts.append('<mods:topic>' +
                               etl.escape_xml_text(subject) +
                               '</mods:topic>\n')
    topic_parts.append('</mods:subject>\n')
    mods_subjects = ''.join(topic_parts)

    title_text = text_of(node_mdp.find(".//{*}title", namespaces=namespaces),
                         default=None)
    dc_title = ('(none)' if title_text is None else
                etl.escape_xml_text(title_text))

    xml_sobekcm_aggregations = ''.join(
        '<sobekcm:Aggregation>{}</sobekcm:Aggregation>'.format(aggregation)
        for aggregation in d_mets_template['list_sobekcm_aggregations'])
    xml_sobekcm_wordmarks = ''.join(
        '<sobekcm:Wordmark>{}</sobekcm:Wordmark>\n'.format(wordmark)
        for wordmark in d_mets_template['list_sobekcm_wordmarks'])

    # Set some template variable values
    d_mets_template['bib_vid'] = bib_vid
    d_mets_template['create_date'] = dc_date
    d_mets_template['last_mod_date'] = utc_secs_z
    d_mets_template['agent_creator_individual_name'] = dc_creator
    d_mets_template['header_identifier_text'] = header_identifier_text
    d_mets_template['mods_subjects'] = mods_subjects
    d_mets_template['mods_title'] = dc_title
    d_mets_template['xml_sobekcm_aggregations'] = xml_sobekcm_aggregations
    d_mets_template['xml_sobekcm_wordmarks'] = xml_sobekcm_wordmarks
    d_mets_template['xml_dc_ids'] = xml_dc_ids
    d_mets_template['description'] = dc_description
    d_mets_template['personal_creator_name'] = dc_creator
    d_mets_template['bibid'] = bibid
    d_mets_template['vid'] = vid
    d_mets_template['sha1_mets_v1'] = ''

    # Create mets_str and write it to the mets.xml output file
    mets_str = manioc_mets_format_str.format(**d_mets_template)
    # Nest the file in a folder named for the bib_vid,
    # because it loads in the sobek builder faster this way
    output_folder_mets_item = output_folder_mets + bib_vid + '/'
    os.makedirs(output_folder_mets_item, exist_ok=True)

    filename_mets = output_folder_mets_item + bib_vid + '.mets.xml'
    if verbosity > 0:
        print("{}:using output_filename_mets={}".format(me, filename_mets))

    with open(filename_mets, mode='w', encoding='utf-8') as outfile:
        outfile.write(mets_str)
    return 1
Ejemplo n.º 3
0
def list_records_to_mets_xml_files(d_run_params,
                                   set_spec='user-genetics-datasets',
                                   verbosity=1):
    """Fetch Zenodo OAI records for a set and write each as raw XML and METS.

    For every ``record`` element in the response, the raw record is saved
    under ``<output_folder>/received/`` and a METS file rendered from
    ``mets_format_str`` is saved under
    ``<output_folder>/mets_output/<bib_vid>/<bib_vid>.mets.xml``.

    Args:
        d_run_params: Dict with ``'output_folder'`` and ``'d_request_zenodo'``;
            the latter is passed to ``response_of_zenodo`` and must carry
            ``'url'`` and ``'curl'`` entries (they are printed).
        set_spec: Dataset name handed to ``response_of_zenodo``.
        verbosity: Values above 0 print an initial progress message.

    Raises:
        KeyError: If a required key is missing from ``d_run_params``.
        Exception: If a record has no ``dc`` node.
    """
    me = 'list_records_to_mets_xml_files'
    if verbosity > 0:
        # This message used to be built (with an undefined ``me``) and then
        # never shown.
        print("{}: getting zenodo records for set_spec='{}'".format(
            me, set_spec))

    output_folder = d_run_params['output_folder']
    mets_output_folder = output_folder + '/mets_output/'

    os.makedirs(mets_output_folder, exist_ok=True)
    d_request = d_run_params['d_request_zenodo']

    response = response_of_zenodo(d_request, dataset_name=set_spec)

    # Show the API response and the auxiliary info for a similar curl command
    print("Got response for url={}, curl={}".format(d_request['url'],
                                                    d_request['curl']))

    # Process the response; encode once and reuse the bytes for parsing
    xml = response.text.encode('utf-8')
    print("Response text len={}".format(len(xml)))

    node_root = etree.fromstring(xml)
    # Drop the default (None-keyed) namespace entry from the prefix map
    d_namespaces = {
        key: value
        for key, value in dict(node_root.nsmap).items() if key is not None
    }

    nodes_record = node_root.findall(".//{*}record", namespaces=d_namespaces)

    print(
        "ListRecords request found root tag name='{}', and {} records".format(
            node_root.tag, len(nodes_record)))

    bib_prefix = "DS"
    vid = "00001"

    # One must now manually find the highest bibint in the DS mets files
    # and add 1
    # 20180626 the highest was 9, and 1 is added in the loop below
    bibint = 9
    received_folder = output_folder + '/received/'
    os.makedirs(received_folder, exist_ok=True)

    # Examine each output record from the OAI command
    for node_record in nodes_record:
        bibint += 1
        bibid = bib_prefix + str(bibint).zfill(8)
        bib_vid = "{}_{}".format(bibid, vid)

        header_identifier = node_record.find("./{*}header/{*}identifier").text

        identifier_normalized = header_identifier.replace(':', '_') + '.xml'
        print(
            "using bib_vid={} to output item with zenodo identifier_normalized={}"
            .format(bib_vid, identifier_normalized))

        # Serialize the input record and save it as received
        record_str = etree.tostring(node_record,
                                    pretty_print=True,
                                    xml_declaration=True)

        filename_received = received_folder + identifier_normalized
        with open(filename_received, 'wb') as outfile:
            print("Writing filename_received ='{}'".format(filename_received))
            outfile.write(record_str)

        # Set some variable to potentially output into the METS template
        utc_now = datetime.datetime.utcnow()
        utc_secs_z = utc_now.strftime("%Y-%m-%dT%H:%M:%SZ")

        # Get basic values from input doc
        node_oaidc = node_record.find(".//{*}dc", namespaces=d_namespaces)
        if node_oaidc is None:
            raise Exception("Cannot find oai_dc:dc node")

        namespaces = {
            key: value
            for key, value in dict(node_oaidc.nsmap).items() if key is not None
        }

        print("Got oai_dc prefix map='{}'\n\n".format(repr(namespaces)))

        node_creator = node_oaidc.find(".//dc:creator", namespaces=namespaces)
        dc_creator = '' if node_creator is None else node_creator.text
        print("Got creator={}".format(dc_creator))

        dc_date_orig = node_oaidc.find("./dc:date", namespaces=namespaces).text
        print("Got dc date orig={}".format(dc_date_orig))
        # Must convert dc_date_orig to valid METS format:
        dc_date = '{}T12:00:00Z'.format(dc_date_orig)
        print("Got dc_date='{}'".format(dc_date))

        node_description = node_oaidc.find(".//{*}description",
                                           namespaces=namespaces)
        # Take the element's text content, stripped and with newlines removed
        str_description = tostring(node_description,
                                   encoding='unicode',
                                   method='text').strip().replace('\n', '')
        # Special doctype needed to handle nbsp... copyright
        xml_dtd = '''<?xml version="1.1" encoding="UTF-8" ?><!DOCTYPE naughtyxml [
            <!ENTITY nbsp "&#0160;">
            <!ENTITY copy "&#0169;">
            ]>'''
        xml_description = '{}<doc>{}</doc>'.format(xml_dtd, str_description)
        print("Got str_description='{}'".format(str_description))
        print("Got xml_description='{}'".format(xml_description))

        # Re-parse the description as a document and join its text nodes to
        # strip any inner markup.
        # See: https://stackoverflow.com/questions/19369901/python-element-tree-extract-text-from-element-stripping-tags#19370075
        tree_description = ET.fromstring(xml_description)
        dc_description = etl.escape_xml_text(''.join(
            tree_description.itertext()))
        print("Using dc_description='{}'".format(dc_description))

        nodes_identifier = node_oaidc.findall(".//{*}identifier")
        # The first identifier was inferred to be the DOI by pure manual
        # inspection! (The unused third-identifier lookup was dropped: it
        # raised IndexError on records with fewer than three identifiers.)
        doi = nodes_identifier[0].text
        related_url = '{}'.format(doi)

        nodes_rights = node_oaidc.findall(".//{*}rights")
        rights_text = 'See:'
        for node_rights in nodes_rights:
            rights_text += ' ' + node_rights.text

        # NOTE(review): subjects, title, rights and creator go into the METS
        # template without XML escaping, unlike the description -- confirm
        # whether the template expects pre-escaped values.
        nodes_subject = node_oaidc.findall(".//{*}subject")
        mods_subjects = ''
        for node_subject in nodes_subject:
            mods_subjects += ('<mods:subject><mods:topic>' +
                              node_subject.text +
                              '</mods:topic></mods:subject>\n')

        dc_title = node_oaidc.find(".//{*}title").text
        dc_type = node_oaidc.find(".//{*}type").text

        sobekcm_aggregations = ['UFDATASETS']
        xml_sobekcm_aggregations = ''
        for aggregation in sobekcm_aggregations:
            xml_sobekcm_aggregations += (
                '<sobekcm:Aggregation>{}</sobekcm:Aggregation>'.format(
                    aggregation))

        # Apply basic input values to METS template variables
        d_var_val = {
            'bib_vid': bib_vid,
            'create_date': dc_date,
            'last_mod_date': utc_secs_z,
            'agent_creator_individual_name': dc_creator,
            'agent_creator_individual_note': 'Creation via zenodo harvest',
            'identifier': header_identifier,
            'mods_subjects': mods_subjects,
            'rights_text': rights_text,
            'utc_secs_z': utc_secs_z,
            'title': dc_title,
            'related_url': related_url,
            'xml_sobekcm_aggregations': xml_sobekcm_aggregations,
            'doi': doi,
            'description': dc_description,
            'creator': dc_creator,
            'bibid': bibid,
            'vid': vid,
            'type_of_resource': dc_type,
            'sha1-mets-v1': '',
            'genre': 'dataset',
            'genre_authority': 'zenodo',
        }

        # Create mets_str and write it
        mets_str = mets_format_str.format(**d_var_val)
        item_output_folder = mets_output_folder + '/' + bib_vid
        os.makedirs(item_output_folder, exist_ok=True)
        filename_mets = item_output_folder + '/' + bib_vid + '.mets.xml'
        with open(filename_mets, 'wb') as outfile:
            print("Writing filename='{}'".format(filename_mets))
            outfile.write(mets_str.encode('utf-8'))
def _apa_style_title(raw_title):
    """Return raw_title normalized toward APA title style.

    Whitespace is collapsed, double quotes are removed, the first word gets
    an upper-case first letter, later words get a lower-case first letter
    unless the rest of the word contains a digit or upper-case letter
    (for example "RNA", "O2"), and trailing periods are dropped.
    """
    # Replace nonbreaking spaces with 'normal' spaces, drop troublesome
    # quotation characters, then split on whitespace (which also removes
    # repeated spaces and empty words).
    words = raw_title.replace('\u00A0', ' ').replace('"', '').split()
    styled = []
    for position, word in enumerate(words):
        nchars = len(word)
        if position == 0:
            styled.append(word[0].upper() + word[1:])
        elif nchars == 1:
            styled.append(word.lower())
        elif (nchars > 2 and not has_digit(word[1:])
              and not has_upper(word[1:])):
            # A second or following title word with no other upper-case
            # characters or digits: lower-case its first letter.
            styled.append(word[0].lower() + word[1:])
        else:
            styled.append(word)
    # Get rid of trailing . in title
    return ' '.join(styled).rstrip('.')


def make_apa_citations(input_folder=None,
                       output_folder=None,
                       input_glob='**/*utf8.txt'):
    """Render tab-separated citation files as HTML tables of APA citations.

    Every file under ``input_folder`` matching ``input_glob`` is read as
    UTF-8 (undecodable bytes are ignored). For each input file an HTML file
    named ``<input file>.html`` is written next to it. Column 0 of each line
    is skipped; the following columns are authors, publication year, title,
    journal, volume, issue, pages and DOI. A table row is produced only for
    lines that reach the title column.

    Args:
        input_folder: Folder searched for input files. Required.
        output_folder: Defaults to ``input_folder``. NOTE(review): it is only
            reported in the log message; output files are always written
            beside their input file -- confirm the intended behavior.
        input_glob: Glob pattern, relative to ``input_folder``.

    Returns:
        None.

    Raises:
        ValueError: If ``input_folder`` is None.
    """
    me = 'make_apa_citations'
    if input_folder is None:
        raise ValueError("input_folder is not given as an argument")
    if output_folder is None:
        output_folder = input_folder
    print("{}: using input_folder={},output_folder={},input_glob={}".format(
        me, input_folder, output_folder, input_glob))

    # URL forms a DOI cell may be prefixed with (compared case-insensitively)
    doi_url_prefixes = ('http://dx.doi.org/', 'https://dx.doi.org/',
                        'http://doi.org/', 'https://doi.org/')
    # Per file from Suzanne 2017050x email with 'problem' in column 1,
    # the citation fields start one column in.
    colskip = 1

    input_file_paths = list(Path(input_folder).glob(input_glob))
    print("Found {} input files".format(len(input_file_paths)))
    for path in input_file_paths:
        # str(path) uses the platform's own separator; the former "{}\{}"
        # format hard-coded a backslash via an invalid escape sequence.
        input_file_name = str(path)
        print("Processing file name={}".format(input_file_name))
        output_file_name = input_file_name + '.html'
        n_file_citations = 0
        with open(output_file_name, encoding="utf-8",
                  mode="w") as output_file:
            print("\nReading input file {}".format(path.name))
            print(
                "<!DOCTYPE html> <html>\n<head><meta charset='UTF-8'></head>\n"
                "<body>\n<h3>APA Citations for Input File {}</h3>\n"
                "<table border=2>\n".format(input_file_name),
                file=output_file)
            # NOTE: save EXCEL file as utf-8 encoded file.
            # NOTE: may use VIM or other tools to change input file encoding
            # to the required utf-8 if not already in utf-8 format
            # :set fileencoding=utf-8
            with open(input_file_name,
                      mode="r",
                      encoding="utf-8",
                      errors="ignore") as input_file:
                for line in input_file:
                    eline = etl.escape_xml_text(line)
                    print("Got line='{}'.,eline='{}'".format(line, eline))
                    n_file_citations += 1
                    parts = line.split('\t')
                    nparts = len(parts)
                    print("Line has {} tab-separated parts".format(nparts))

                    def field(offset):
                        """Return the column at colskip+offset, or ''."""
                        index = colskip + offset
                        return parts[index] if nparts > index else ''

                    # No table row is written unless the title column exists
                    if nparts <= colskip + 2:
                        continue

                    authors = (field(0).replace('"', '').replace(
                        ',;', ',').replace('; ', ', '))
                    pubyear = field(1)
                    title = _apa_style_title(field(2))
                    journal = field(3)
                    volume = field(4)
                    issue = field(5)
                    pages = field(6).rstrip('.')

                    doi = field(7).replace(' ', '').replace('\n', '')
                    for prefix in doi_url_prefixes:
                        if doi.lower().startswith(prefix):
                            doi = doi[len(prefix):]
                            break
                    if doi.upper().startswith('DOI:'):
                        doi = doi[4:]

                    p_volume = '' if volume == '' else ', {}'.format(volume)
                    p_issue = '' if issue == '' else '({})'.format(issue)
                    p_pages = '' if pages == '' else ', {}'.format(pages)
                    # The link previously read "http:/dx.doi.org/" (one
                    # slash short) and embedded the DOI unescaped.
                    p_doi = '' if doi == '' else (
                        ' <a href="http://dx.doi.org/{}"> {}</a>'.format(
                            html_escape(doi), html_escape(doi)))

                    print(
                        "<tr><td>{} ({}). {}. "
                        "<span style='font-style: italic;'>{}{}</span>{}{}.{}\n</td></tr>\n"
                        .format(html_escape(authors), html_escape(pubyear),
                                html_escape(title), html_escape(journal),
                                html_escape(p_volume),
                                html_escape(p_issue), html_escape(p_pages),
                                p_doi),
                        file=output_file)

            print("Produced APA citation output file {} with {} citations.".
                  format(output_file_name, n_file_citations))
            print("</table></body></html>\n", file=output_file)
    return None
    def node_record_process(
            self,
            node_record=None,
            namespaces=None,
            bibid_str='DS12345678',  # format 'XY12345678'
            vid_str='12345',  # format '12345'
            save_xml=False):

        # Caller with node_root can provide overrides for node_record or
        # and namespaces. Eg, before calling, caller can set namespaces  like:
        # namespaces={key:value for key,value in dict(node_root.nsmap).items()
        # if key is not None}
        me = 'node_record_process'

        if self.verbosity > 0:
            print(f"{me}: Using save_xml={save_xml}")

        if node_record is None:
            node_record = self.node_record
        if namespaces is None:
            namespaces = self.namespaces

        #bibid = bib_prefix + str(bibint).zfill(8)
        bib_vid = f"{bibid_str}_{vid_str}"

        node_type = node_record.find(".//{}type", namespaces=namespaces)
        genre = '' if not node_type else node_type.text

        # NOTE: this also appers to be the OAI Server record identifier
        header_identifier = node_record.find("./{*}header/{*}identifier").text

        identifier_normalized = header_identifier.replace(':', '_') + '.xml'
        if self.verbosity > 0:
            print(f"using bib_vid={bib_vid} to output item with zenodo "
                  "identifier_normalized={identifier_normalized}")

        if save_xml == True:
            # Parse the input record and save it to a string
            record_str = etree.tostring(node_record,
                                        pretty_print=True,
                                        xml_declaration=True,
                                        encoding='utf-8')

            save_folder = self.output_folder + '/recieved/'
            os.makedirs(save_folder, exist_ok=True)
            save_file = (save_folder + identifier_normalized)

            with open(save_file, 'wb') as outfile:
                if self.verbosity > 0:
                    print(f"Writing save_file ='{save_file}'")
                outfile.write(record_str)

        # Set some variables to potentially output into the Zenodo METS
        # output template

        utc_now = datetime.datetime.utcnow()
        utc_secs_z = utc_now.strftime("%Y-%m-%dT%H:%M:%SZ")

        #Get basic values from input doc
        node_oaidc = node_record.find(".//{*}dc", namespaces=namespaces)

        if node_oaidc is None:
            raise Exception("Cannot find oai_dc:dc node")

        namespaces_oaidc = {
            key: value
            for key, value in dict(node_oaidc.nsmap).items() if key is not None
        }

        if self.verbosity > 0:
            print(f"Got oai_dc prefix map='{namespaces_oaidc}'\n\n")

        node_creator = node_oaidc.find(".//dc:creator",
                                       namespaces=namespaces_oaidc)

        dc_creator = '' if node_creator is None else node_creator.text

        nodes_creator = node_oaidc.findall(".//dc:creator",
                                           namespaces=namespaces_oaidc)

        # Handle multiple creators
        dc_creators = []
        for node_creator in nodes_creator:
            dc_creators.append(node_creator.text)

        if self.verbosity > 0:
            print(f"Got creators='{dc_creators}'")
            sys.stdout.flush()

        xml_sobekcm_creators = ''
        for creator in dc_creators:
            xml_sobekcm_creators += (f'''
<mods:name type="personal">
  <mods:namePart>{creator}</mods:namePart>
  <mods:role>
    <mods:roleTerm type="text">creator</mods:roleTerm>
  </mods:role>
</mods:name>
                ''')

        dc_date_orig = node_oaidc.find("./dc:date",
                                       namespaces=namespaces_oaidc).text
        if self.verbosity > 0:
            print("Got dc date orig={}".format(dc_date_orig))
        # Must convert dc_date_orig to valid METS format:
        dc_date = '{}T12:00:00Z'.format(dc_date_orig)
        if self.verbosity > 0:
            print("Got dc_date='{}'".format(dc_date))

        node_description = node_oaidc.find(".//{*}description",
                                           namespaces=namespaces_oaidc)

        # Serialize the description node as text only, to remove inner xml.
        # If newlines were simply stripped, text separated by newlines would
        # run together, so replace each newline with a space instead.
        str_description = (tostring(node_description,
                                    encoding='unicode',
                                    method='text').strip().replace('\n', ' '))
        # Special doctype needed to handle nbsp... copyright
        xml_dtd = '''<?xml version="1.1" encoding="UTF-8" ?><!DOCTYPE naughtyxml [
            <!ENTITY nbsp "&#0160;">
            <!ENTITY copy "&#0169;">
            ]>'''
        xml_description = '{}<doc>{}</doc>'.format(xml_dtd, str_description)
        if self.verbosity > 0:
            print("Got str_description='{}'".format(str_description))
        if self.verbosity > 0:
            print("Got xml_description='{}'".format(xml_description))

        # See: https://stackoverflow.com/questions/19369901/python-element-tree-extract-text-from-element-stripping-tags#19370075
        tree_description = ET.fromstring(xml_description)
        dc_description = etl.escape_xml_text(''.join(
            tree_description.itertext()))
        #dc_description = xml_description
        if self.verbosity > 0:
            print("Using dc_description='{}'".format(dc_description))

        nodes_identifier = node_oaidc.findall(".//{*}identifier",
                                              namespaces=namespaces_oaidc)
        #inferred the following indexes by pure manual inspection!
        doi = nodes_identifier[0].text
        zenodo_id = nodes_identifier[2].text
        related_url = '{}'.format(doi)

        #relation_doi = node_oaidc.find(".//{*}relation").text
        nodes_rights = node_oaidc.findall(".//{*}rights",
                                          namespaces=namespaces_oaidc)
        rights_text = 'See:'
        for node_rights in nodes_rights:
            rights_text += ' ' + node_rights.text

        nodes_subject = node_oaidc.findall(".//{*}subject",
                                           namespaces=namespaces_oaidc)
        mods_subjects = ''
        for node_subject in nodes_subject:
            mods_subjects += ('<mods:subject><mods:topic>' +
                              node_subject.text +
                              '</mods:topic></mods:subject>\n')

        dc_title = node_oaidc.find(".//{*}title").text
        dc_type = node_oaidc.find(".//{*}type").text

        sobekcm_aggregations = ['UFDATASETS']
        xml_sobekcm_aggregations = ''
        for aggregation in sobekcm_aggregations:
            xml_sobekcm_aggregations += (
                '<sobekcm:Aggregation>{}</sobekcm:Aggregation>'.format(
                    aggregation))

        # Apply basic input values to METS template variables

        d_var_val = {
            'bib_vid': bib_vid,
            'create_date': dc_date,
            'last_mod_date': utc_secs_z,
            'agent_creator_individual_name': dc_creator,
            'agent_creator_individual_note': 'Creation via zenodo harvest',
            'identifier': header_identifier,
            'mods_subjects': mods_subjects,
            'rights_text': rights_text,
            'utc_secs_z': utc_secs_z,
            'title': dc_title,
            'related_url': related_url,
            'xml_sobekcm_aggregations': xml_sobekcm_aggregations,
            'doi': doi,
            'description': dc_description,
            'xml_sobekcm_creators': xml_sobekcm_creators,
            'bibid': bibid_str,
            'vid': vid_str,
            'type_of_resource': dc_type,
            'sha1-mets-v1': '',
            'genre': 'dataset',
            'genre_authority': 'zenodo',
        }

        # Create mets_str and write it
        mets_str = mets_format_str.format(**d_var_val)

        mets_output_folder = self.output_folder + '/mets_output/'
        item_output_folder = mets_output_folder + '/' + bib_vid
        os.makedirs(item_output_folder, exist_ok=True)
        filename_mets = item_output_folder + '/' + bib_vid + '.mets.xml'

        if self.verbosity > 0:
            print("WRITING METS.XML to {}".format(filename_mets))

        fn = filename_mets
        with open(fn, 'wb') as outfile:
            print("Writing filename='{}'".format(fn))
            outfile.write(mets_str.encode('utf-8'))