def run():
    """Fetch Data Harmony suggested terms for the tesseract OCR text.

    Escapes the module-level tesseract_text for xml, then queries the
    suggested-terms API once per thesaurus project.

    Returns a dict mapping project name -> API result.
    """
    # BUGFIX: the original overwrote doc (ocrspace_text assignment was
    # dead) and discarded the 'floridathes' result by reusing r; results
    # are now collected per project and returned.
    doc = etl.escape_xml_text(tesseract_text)
    results = {}
    for project in ('floridathes', 'geothesFlorida'):
        results[project] = get_suggested_terms_data_harmony_api_result(
            project=project, doc=doc)
    return results
def manioc_node_writer(bib_vid=None,
                       d_record=None,
                       metadata_prefix=None,
                       output_folder=None,
                       verbosity=0):
    """Write one harvested Manioc OAI record as an xml and a METS file.

    Parameters:
    bib_vid -- str like 'XX12345678_00001': 10-char bibid, '_', 5-char vid
    d_record -- dict with keys 'node_record' (lxml element of one OAI
        record) and 'namespaces' (prefix-to-uri map used by xpath finds)
    metadata_prefix -- OAI metadata prefix; only 'oai_dc' is supported
    output_folder -- base folder; '<metadata_prefix>/xml/' and
        '<metadata_prefix>/mets/' subfolders are created under it
    verbosity -- ints > 0 print progress details

    Returns 1 when the record is written, or 0 for records without an
    oai_dc 'dc' node (e.g. OAI 'delete' records), which are skipped.
    Raises ValueError on missing arguments, an unsupported
    metadata_prefix, or a malformed bib_vid.
    """
    me = 'manioc_node_writer'
    # BUGFIX: the original called all() on a list of parameter-NAME
    # strings, which are always truthy, so missing arguments were never
    # detected. Check the actual argument values instead.
    d_required = {
        'd_record': d_record,
        'bib_vid': bib_vid,
        'output_folder': output_folder,
        'metadata_prefix': metadata_prefix,
    }
    missing = [name for name, value in d_required.items() if value is None]
    if missing:
        raise ValueError("Missing some required params from {}".format(
            repr(missing)))
    # Single authoritative metadata_prefix check (the original repeated
    # this check a second time further down via an ok_prefixes list).
    if metadata_prefix != 'oai_dc':
        raise ValueError(
            "{}: Currently only support metadata_prefix of oai_dc, not '{}''".
            format(me, metadata_prefix))
    # 20170815 NOTE
    # cannot get shutil to remove older files properly on windows 7...
    # so if needed must remember to remove them by hand before running this.
    output_folder_xml = '{}{}/xml/'.format(output_folder, metadata_prefix)
    os.makedirs(output_folder_xml, exist_ok=True)
    output_folder_mets = '{}{}/mets/'.format(output_folder, metadata_prefix)
    os.makedirs(output_folder_mets, exist_ok=True)
    rights = 'Your rights: '
    d_mets_template = {
        "genre_authority": "Manioc",
        "physical_location_name": "Manioc",
        "physical_location_code": "MANIOC",
        "rights_text": rights,
        # List of extant SobekCM database wordmark codes to derive xml for
        # mets template, eg for miami merrick it was ['UM','DLOC']
        "list_sobekcm_wordmarks": [],
        # List of extant SobekCM database aggregation codes to derive xml
        # for mets template
        "list_sobekcm_aggregations": [
            'ALL',
        ],
        #######################################################################
        # Following must be or usually are software derived-supplied
        #######################################################################
        "agent_creator_individual_name": None,
        "agent_creator_individual_note": 'Creation via Manioc OAI harvest',
        "bibid": None,
        "create_date": None,
        "description": None,
        "harvest_utc_secs_z": None,  # Consider using this later...
        "header_identifier_text": "",
        "last_mod_date": None,
        "mods_subjects": None,
        "mods_title": None,
        "personal_creator_name": None,
        "personal_creator_role": None,
        "related_url": None,
        "sha1_mets_v1": None,
        "sobekcm_thumbnail_src": "",  # Not used for Manioc... yet...
        "source_id_name": "manioc_OAI_header_identifier_2017",
        "vid": None,
        "xml_dc_ids": "",
        # Just puts wrapping around list_sobekcm_aggregations
        "xml_sobekcm_aggregations": "",
        # Derived from OAI record metadata
        "xml_sobekcm_subjects": "",
        # Puts xml wrapping around list_sobekcm_wordmarks
        "xml_sobekcm_wordmarks": "",
    }
    node_record = d_record['node_record']
    namespaces = d_record['namespaces']
    # Note: node_root and n_batch and node_record_count are also in
    # d_record if needed
    bibid = bib_vid[:10]
    if bib_vid[10:11] != '_':
        raise ValueError("Bad bib_vid format for {}".format(bib_vid))
    vid = bib_vid[11:16]
    node_type = node_record.find(".//{*}dc/{*}type", namespaces=namespaces)
    genre = '' if node_type is None else node_type.text
    genre = etl.escape_xml_text(genre)
    d_mets_template['genre'] = genre
    node_identifier = node_record.find("./{*}header/{*}identifier",
                                       namespaces=namespaces)
    header_identifier_text = ('' if node_identifier is None
                              else node_identifier.text)
    # Normalize the identifier so it is safe to use as a filename
    header_identifier_normalized = (header_identifier_text.replace(
        ':', '_').replace('/', '_').replace('.', '-'))
    if verbosity > 0:
        print("using bib={}, vid={}, bib_vid={} to output item with "
              "manioc header_identifier_normalized={}".format(
                  bibid, vid, bib_vid, header_identifier_normalized))
    nodes_source = node_record.findall(".//{*}dc/{*}publisher",
                                       namespaces=namespaces)
    n = 0 if nodes_source is None else len(nodes_source)
    node_source_text = '' if n == 0 else nodes_source[0].text
    d_mets_template['content_source_name'] = node_source_text
    # From node_record, create the xml record output bytes.
    # Note: the encoding argument is needed to create the output from the
    # lxml internal representation.
    xml_record_str = etree.tostring(node_record,
                                    pretty_print=True,
                                    xml_declaration=True,
                                    encoding="utf-8")
    if verbosity > 1:
        print("{}:Got xml_record_str={}".format(me, xml_record_str))
    filename_xml = output_folder_xml + header_identifier_normalized + '.xml'
    if verbosity > 0:
        print("{}:using output_filename_xml={}".format(me, filename_xml))
    with open(filename_xml, mode='wb') as outfile:
        if verbosity > 0:
            print("{}:Writing filename_xml ='{}'".format(me, filename_xml))
        outfile.write(xml_record_str)
    # Set some variables to potentially output into the METS template
    utc_now = datetime.datetime.utcnow()
    utc_secs_z = utc_now.strftime("%Y-%m-%dT%H:%M:%SZ")
    d_mets_template['utc_secs_z'] = utc_secs_z
    node_mdp = node_record.find(".//{*}dc", namespaces=namespaces)
    if node_mdp is None:
        # This happens for received 'delete' records.
        # Just return to ignore these records pending requirements to
        # process them.
        return 0
    node_creator = node_mdp.find(".//{*}creator", namespaces=namespaces)
    dc_creator = '' if node_creator is None else node_creator.text
    dc_creator = etl.escape_xml_text(dc_creator)
    # NOTE(review): this searches for a nested dc/publisher INSIDE the dc
    # node, unlike nodes_source above which searches the whole record --
    # confirm whether './/{*}publisher' was intended here.
    node_publisher = node_mdp.find(".//{*}dc/{*}publisher",
                                   namespaces=namespaces)
    publisher_text = '' if node_publisher is None else node_publisher.text
    publisher_text = etl.escape_xml_text(publisher_text)
    d_mets_template['publisher'] = publisher_text
    # For manioc, they encode the thumbnail in dc:relation, prefixed by
    # the literal string 'vignette : '.
    node = node_mdp.find(".//{*}relation", namespaces=namespaces)
    node_text = '' if node is None or node.text is None else node.text
    # BUGFIX: strip the 'vignette : ' prefix only when actually present;
    # the original blindly dropped the first 11 chars of any value at
    # least 10 chars long, corrupting unprefixed values (and crashed on
    # an empty relation element, node.text None).
    vignette_prefix = 'vignette : '
    if node_text.startswith(vignette_prefix):
        node_text = node_text[len(vignette_prefix):]
    d_mets_template['sobekcm_thumbnail_src'] = node_text
    node_date = node_mdp.find(".//{*}date", namespaces=namespaces)
    dc_date_orig = '1969-01-01'
    if node_date is not None:
        dc_date_orig = node_date.text
    # If we get only a year (we have had some like this) pad it to
    # year-01-01 to make it valid for this field; a year-month gets -01.
    if len(dc_date_orig) < 5:
        dc_date_orig += "-01-01"
    elif len(dc_date_orig) < 8:
        dc_date_orig += "-01"
    # Convert dc_date_orig to valid METS format
    dc_date = '{}T12:00:00Z'.format(dc_date_orig)
    node_description = node_mdp.find(".//{*}description",
                                     namespaces=namespaces)
    str_description = ''
    if node_description is not None:
        str_description = etl.escape_xml_text(node_description.text)
    dc_description = str_description
    # Manioc has only one dc:identifier, used for related url; keep this
    # template variable in case the server response evolves, but for now
    # just stick in a newline value for the template.
    xml_dc_ids = '\n'
    # For manioc, the first dc identifier is the related url of the item
    nodes = node_mdp.findall(".//{*}identifier", namespaces=namespaces)
    related_url_text = '' if nodes is None or len(
        nodes) == 0 else nodes[0].text
    d_mets_template['related_url'] = related_url_text
    nodes = node_mdp.findall(".//{*}language", namespaces=namespaces)
    lang_code = 'eng' if nodes is None or len(
        nodes) < 1 else nodes[0].text.lower()
    # NOTE(review): an unknown language code raises KeyError here --
    # confirm whether a fallback to 'eng' is wanted.
    iso639_2b_code = etl.d_language_639_2b[lang_code]
    d_mets_template['iso639_2b_code'] = iso639_2b_code
    iso639_2b_text = etl.d_langcode_langtext[iso639_2b_code]
    d_mets_template['iso639_2b_text'] = iso639_2b_text
    nodes_rights = node_mdp.findall(".//{*}rights", namespaces=namespaces)
    # Concatenate any record rights onto our boilerplate rights text
    rights_text = d_mets_template['rights_text']
    for node_rights in nodes_rights:
        rights_text += '\n' + node_rights.text
    rights_text = etl.escape_xml_text(rights_text)
    d_mets_template['rights_text'] = rights_text
    # Subjects: manioc packs multiple ';'-separated subjects per
    # dc:subject element
    nodes_subject = node_mdp.findall(".//{*}subject", namespaces=namespaces)
    mods_subjects = '<mods:subject>'
    for node_subject in nodes_subject:
        for subject in node_subject.text.split(';'):
            subject = subject.strip()
            if len(subject) < 1:
                continue
            mods_subjects += '<mods:topic>' + etl.escape_xml_text(
                subject) + '</mods:topic>\n'
    mods_subjects += '</mods:subject>\n'
    tnode = node_mdp.find(".//{*}title", namespaces=namespaces)
    dc_title = '(none)' if tnode is None else etl.escape_xml_text(tnode.text)
    xml_sobekcm_aggregations = ''
    for aggregation in d_mets_template['list_sobekcm_aggregations']:
        xml_sobekcm_aggregations += (
            '<sobekcm:Aggregation>{}</sobekcm:Aggregation>'.format(aggregation)
        )
    xml_sobekcm_wordmarks = ''
    for wordmark in d_mets_template['list_sobekcm_wordmarks']:
        xml_sobekcm_wordmarks += (
            '<sobekcm:Wordmark>{}</sobekcm:Wordmark>\n'.format(wordmark))
    # Set the template variable values derived above
    d_mets_template['bib_vid'] = bib_vid
    d_mets_template['create_date'] = dc_date
    d_mets_template['last_mod_date'] = utc_secs_z
    d_mets_template['agent_creator_individual_name'] = dc_creator
    d_mets_template['header_identifier_text'] = header_identifier_text
    d_mets_template['mods_subjects'] = mods_subjects
    d_mets_template['mods_title'] = dc_title
    d_mets_template['xml_sobekcm_aggregations'] = xml_sobekcm_aggregations
    d_mets_template['xml_sobekcm_wordmarks'] = xml_sobekcm_wordmarks
    d_mets_template['xml_dc_ids'] = xml_dc_ids
    d_mets_template['description'] = dc_description
    d_mets_template['personal_creator_name'] = dc_creator
    d_mets_template['bibid'] = bibid
    d_mets_template['vid'] = vid
    d_mets_template['sha1_mets_v1'] = ''
    # Create mets_str and write it to the mets.xml output file.
    # Nest the filename in a folder per bib_vid, because it loads in the
    # sobek builder faster this way.
    mets_str = manioc_mets_format_str.format(**d_mets_template)
    output_folder_mets_item = output_folder_mets + bib_vid + '/'
    os.makedirs(output_folder_mets_item, exist_ok=True)
    filename_mets = output_folder_mets_item + bib_vid + '.mets.xml'
    if verbosity > 0:
        print("{}:using output_filename_mets={}".format(me, filename_mets))
    with open(filename_mets, mode='w', encoding='utf-8') as outfile:
        outfile.write(mets_str)
    return 1
def list_records_to_mets_xml_files(d_run_params,
                                   set_spec='user-genetics-datasets',
                                   verbosity=1):
    """Harvest zenodo OAI-PMH ListRecords and write per-record files.

    For each harvested record this writes (a) the received record xml
    under output_folder/received/ and (b) a generated METS file under
    output_folder/mets_output/<bib_vid>/.

    Parameters:
    d_run_params -- dict with 'output_folder' and 'd_request_zenodo'
        (the latter holding at least 'url' and 'curl' keys; the curl
        value is auxiliary info representing the sent request)
    set_spec -- OAI set to harvest
    verbosity -- ints > 0 print progress details
    """
    me = 'list_records_to_mets_xml_files'
    # BUGFIX: 'me' was referenced but never defined in the original,
    # raising NameError on entry; the message is also now actually
    # gated on verbosity as the commented-out original intended.
    if verbosity > 0:
        print("{}: getting zenodo records for set_spec='{}'".format(
            me, set_spec))
    output_folder = d_run_params['output_folder']
    mets_output_folder = output_folder + '/mets_output/'
    os.makedirs(mets_output_folder, exist_ok=True)
    d_request = d_run_params['d_request_zenodo']
    response = response_of_zenodo(d_request, dataset_name=set_spec)
    # Show the API response and the auxiliary info for a similar curl
    # command that represents the sent request
    print("Got response for url={}, curl={}".format(d_request['url'],
                                                    d_request['curl']))
    # Process the response
    xml = response.text.encode('utf-8')
    print("Response text len={}".format(len(xml)))
    node_root = etree.fromstring(xml)
    d_namespaces = {
        key: value
        for key, value in dict(node_root.nsmap).items() if key is not None
    }
    nodes_record = node_root.findall(".//{*}record", namespaces=d_namespaces)
    print(
        "ListRecords request found root tag name='{}', and {} records".format(
            node_root.tag, len(nodes_record)))
    bib_prefix = "DS"
    vid = "00001"
    # One must now manually find the highest bibint in the DS mets files
    # and add 1
    # 20180626 the highest was 9, and 1 is added in the loop below
    start_bibint = bibint = 9
    os.makedirs(output_folder + '/received/', exist_ok=True)
    # Examine each output record from the OAI command
    for node_record in nodes_record:
        bibint += 1
        bibid = bib_prefix + str(bibint).zfill(8)
        bib_vid = "{}_{}".format(bibid, vid)
        header_identifier = node_record.find("./{*}header/{*}identifier").text
        identifier_normalized = header_identifier.replace(':', '_') + '.xml'
        print(
            "using bib_vid={} to output item with zenodo identifier_normalized={}"
            .format(bib_vid, identifier_normalized))
        # Serialize the input record and save it to the received folder
        record_str = etree.tostring(node_record,
                                    pretty_print=True,
                                    xml_declaration=True)
        filename_received = output_folder + '/received/' + identifier_normalized
        with open(filename_received, 'wb') as outfile:
            print("Writing filename_received ='{}'".format(filename_received))
            outfile.write(record_str)
        # Set some variables to potentially output into the METS template
        utc_now = datetime.datetime.utcnow()
        utc_secs_z = utc_now.strftime("%Y-%m-%dT%H:%M:%SZ")
        # Get basic values from input doc
        node_oaidc = node_record.find(".//{*}dc", namespaces=d_namespaces)
        if node_oaidc is None:
            raise Exception("Cannot find oai_dc:dc node")
        namespaces = {
            key: value
            for key, value in dict(node_oaidc.nsmap).items()
            if key is not None
        }
        print("Got oai_dc prefix map='{}'\n\n".format(repr(namespaces)))
        node_creator = node_oaidc.find(".//dc:creator", namespaces=namespaces)
        dc_creator = '' if node_creator is None else node_creator.text
        print("Got creator={}".format(dc_creator))
        dc_date_orig = node_oaidc.find("./dc:date", namespaces=namespaces).text
        print("Got dc date orig={}".format(dc_date_orig))
        # Must convert dc_date_orig to valid METS format:
        dc_date = '{}T12:00:00Z'.format(dc_date_orig)
        print("Got dc_date='{}'".format(dc_date))
        node_description = node_oaidc.find(".//{*}description",
                                           namespaces=namespaces)
        # Make an element tree style tree to invoke pattern to remove
        # inner xml
        str_description = tostring(node_description,
                                   encoding='unicode',
                                   method='text').strip().replace('\n', '')
        # Special doctype needed to handle nbsp and copyright entities
        xml_dtd = ('<?xml version="1.1" encoding="UTF-8" ?>'
                   '<!DOCTYPE naughtyxml [ <!ENTITY nbsp " "> '
                   '<!ENTITY copy "©"> ]>')
        xml_description = '{}<doc>{}</doc>'.format(xml_dtd, str_description)
        print("Got str_description='{}'".format(str_description))
        print("Got xml_description='{}'".format(xml_description))
        # See: https://stackoverflow.com/questions/19369901/python-element-tree-extract-text-from-element-stripping-tags#19370075
        tree_description = ET.fromstring(xml_description)
        dc_description = etl.escape_xml_text(''.join(
            tree_description.itertext()))
        print("Using dc_description='{}'".format(dc_description))
        nodes_identifier = node_oaidc.findall(".//{*}identifier")
        # Inferred the following index by pure manual inspection!
        # The first dc identifier is the doi / related url of the item.
        doi = nodes_identifier[0].text
        related_url = '{}'.format(doi)
        nodes_rights = node_oaidc.findall(".//{*}rights")
        rights_text = 'See:'
        for node_rights in nodes_rights:
            rights_text += ' ' + node_rights.text
        nodes_subject = node_oaidc.findall(".//{*}subject")
        mods_subjects = ''
        for node_subject in nodes_subject:
            mods_subjects += ('<mods:subject><mods:topic>' +
                              node_subject.text +
                              '</mods:topic></mods:subject>\n')
        dc_title = node_oaidc.find(".//{*}title").text
        dc_type = node_oaidc.find(".//{*}type").text
        sobekcm_aggregations = ['UFDATASETS']
        xml_sobekcm_aggregations = ''
        for aggregation in sobekcm_aggregations:
            xml_sobekcm_aggregations += (
                '<sobekcm:Aggregation>{}</sobekcm:Aggregation>'.format(
                    aggregation))
        # Apply basic input values to METS template variables
        d_var_val = {
            'bib_vid': bib_vid,
            'create_date': dc_date,
            'last_mod_date': utc_secs_z,
            'agent_creator_individual_name': dc_creator,
            'agent_creator_individual_note': 'Creation via zenodo harvest',
            'identifier': header_identifier,
            'mods_subjects': mods_subjects,
            'rights_text': rights_text,
            'utc_secs_z': utc_secs_z,
            'title': dc_title,
            'related_url': related_url,
            'xml_sobekcm_aggregations': xml_sobekcm_aggregations,
            'doi': doi,
            'description': dc_description,
            'creator': dc_creator,
            'bibid': bibid,
            'vid': vid,
            'type_of_resource': dc_type,
            'sha1-mets-v1': '',
            'genre': 'dataset',
            'genre_authority': 'zenodo',
        }
        # Create mets_str and write it
        mets_str = mets_format_str.format(**d_var_val)
        item_output_folder = mets_output_folder + '/' + bib_vid
        os.makedirs(item_output_folder, exist_ok=True)
        filename_mets = item_output_folder + '/' + bib_vid + '.mets.xml'
        with open(filename_mets, 'wb') as outfile:
            print("Writing filename='{}'".format(filename_mets))
            outfile.write(mets_str.encode('utf-8'))
def make_apa_citations(input_folder=None,
                       output_folder=None,
                       input_glob='**/*utf8.txt'):
    """Create an html file of APA-style citations per matched input file.

    Each utf-8 input file is expected to hold one tab-separated citation
    per line (columns: skipped col, authors, pubyear, title, journal,
    volume, issue, pages, doi). For every input file an html file named
    '<input name>.html' is written beside it, containing a table of APA
    citations.

    Parameters:
    input_folder -- folder whose files matching input_glob are read;
        required
    output_folder -- only echoed in a progress message; output is
        written next to each input file (defaults to input_folder)
    input_glob -- glob pattern for input files under input_folder

    Returns None. Raises ValueError when input_folder is not given.
    """
    me = 'make_apa_citations'
    if input_folder is None:
        raise ValueError("input_folder is not given as an argument")
    if output_folder is None:
        output_folder = input_folder
    print("{}: using input_folder={},output_folder={},input_glob={}".format(
        me, input_folder, output_folder, input_glob))
    input_file_paths = list(Path(input_folder).glob(input_glob))
    n_input_files = 0
    print("Found {} input files".format(len(input_file_paths)))
    for path in input_file_paths:
        # BUGFIX: the original joined parents[0] and name with a literal
        # backslash, producing broken paths on non-Windows systems;
        # str(path) is the same full path, portably.
        input_file_name = str(path)
        print("Processing file name={}".format(input_file_name))
        n_input_files += 1
        output_file_name = input_file_name + '.html'
        n_file_citations = 0
        with open(output_file_name, encoding="utf-8",
                  mode="w") as output_file:
            print("\nReading input file {}".format(path.name))
            print(
                "<!DOCTYPE html> <html>\n<head><meta charset='UTF-8'></head>\n"
                "<body>\n<h3>APA Citations for Input File {}</h3>\n"
                "<table border=2>\n".format(input_file_name),
                file=output_file)
            # NOTE: save EXCEL file as utf-8 encoded file
            with open(input_file_name,
                      mode="r",
                      encoding="utf-8",
                      errors="ignore") as input_file:
                # NOTE: may use VIM or other tools to change input file
                # encoding to required utf-8 here if not already utf-8
                # :set fileencoding=utf-8
                input_lines = input_file.readlines()
                for line in input_lines:
                    eline = etl.escape_xml_text(line)
                    print("Got line='{}'.,eline='{}'".format(line, eline))
                    n_file_citations += 1
                    parts = line.split('\t')
                    nparts = len(parts)
                    # BUGFIX: the original printed the '{}' placeholder
                    # literally -- the format argument was missing (and
                    # nparts was computed only after the print).
                    print("Line has {} tab-separated parts".format(nparts))
                    authors, pubyear, title, journal, volume, issue, pages, doi = (
                        "", ) * 8
                    # Skip column 1: per file from Suzanne 2017050x email
                    # with 'problem' in column 1
                    colskip = 1
                    index = colskip
                    if nparts > index:
                        authors = (parts[index].replace('"', '').replace(
                            ',;', ',').replace('; ', ', '))
                    index += 1
                    if nparts > index:
                        pubyear = parts[index]
                    index += 1
                    if nparts > index:
                        ### TITLE ###
                        # Replace nonbreaking spaces with 'normal' spaces
                        title = parts[index].replace('\u00A0', ' ')
                        # Remove multiple spaces everywhere. Split with
                        # no arguments adds this service
                        title = ' '.join(title.split())
                        # Remove troublesome quotation characters for APA
                        title = title.replace('"', '')
                        title_words = title.split(' ')
                        # Enforce APA title style: first char of first
                        # word capitalized; lower the first char of later
                        # words unless the rest of the word has
                        # uppercase chars or digits (eg "RNA", "O2")
                        title = ''
                        delim = ''
                        for word in title_words:
                            nchars = len(word)
                            if nchars < 1:
                                continue
                            title += delim
                            if delim == '':
                                title += word[0].upper()
                                if nchars > 1:
                                    title += word[1:]
                            elif nchars == 1:
                                title += word[0].lower()
                            elif (nchars > 2 and not has_digit(word[1:])
                                  and not has_upper(word[1:])):
                                title += word[0].lower()
                                title += word[1:]
                            else:
                                title += word
                            delim = ' '
                        # end for word in title_words
                        # Get rid of trailing '.' in title
                        while title.endswith('.'):
                            title = title[:-1]
                    # end title
                    index += 1
                    if nparts > index:
                        journal = parts[index]
                    index += 1
                    if nparts > index:
                        volume = parts[index]
                    index += 1
                    if nparts > index:
                        issue = parts[index]
                    index += 1
                    if nparts > index:
                        pages = parts[index]
                        while pages.endswith('.'):
                            pages = pages[:-1]
                    index += 1
                    if nparts > index:
                        doi = parts[index].replace(' ', '').replace('\n', '')
                        if doi.startswith('http://dx.doi.org/'):
                            doi = doi[18:]
                        if doi.upper().startswith('DOI:'):
                            doi = doi[4:]
                        p_volume = '' if volume == '' else ', {}'.format(
                            volume)
                        p_issue = '' if issue == '' else '({})'.format(issue)
                        p_pages = '' if pages == '' else ', {}'.format(pages)
                        # BUGFIX: the doi anchor scheme was 'http:/'
                        # (missing a slash) in the original.
                        p_doi = '' if doi == '' else (
                            ' <a href="http://dx.doi.org/{}"> {}</a>'.format(
                                doi, doi))
                        print(
                            "<tr><td>{} ({}). {}. "
                            "<span style='font-style: italic;'>{}{}</span>{}{}.{}\n</td></tr>\n"
                            .format(html_escape(authors),
                                    html_escape(pubyear),
                                    html_escape(title),
                                    html_escape(journal),
                                    html_escape(p_volume),
                                    html_escape(p_issue),
                                    html_escape(p_pages), p_doi),
                            file=output_file)
                    # end nparts > doi index value
                # for line in input_lines
                print("Produced APA citation output file {} with {} citations."
                      .format(output_file_name, n_file_citations))
            print("</table></body></html>\n", file=output_file)
        # with output_file
    return None
def node_record_process(
        self,
        node_record=None,
        namespaces=None,
        bibid_str='DS12345678',  # format 'XY12345678'
        vid_str='12345',  # format '12345'
        save_xml=False):
    """Process one harvested zenodo OAI record into a METS output file.

    Optionally (save_xml=True) also saves the raw received record xml
    under self.output_folder/received/.

    Parameters:
    node_record -- lxml element for one OAI record; defaults to
        self.node_record
    namespaces -- prefix-to-uri map for xpath finds; defaults to
        self.namespaces. A caller with a node_root can derive it via:
        {k: v for k, v in dict(node_root.nsmap).items() if k is not None}
    bibid_str -- 10-char bibid like 'DS12345678'
    vid_str -- 5-char vid like '12345'
    save_xml -- when True, also write the received record xml

    Raises Exception when the record has no oai_dc 'dc' node.
    """
    me = 'node_record_process'
    if self.verbosity > 0:
        print(f"{me}: Using save_xml={save_xml}")
    if node_record is None:
        node_record = self.node_record
    if namespaces is None:
        namespaces = self.namespaces
    bib_vid = f"{bibid_str}_{vid_str}"
    # NOTE: this also appears to be the OAI Server record identifier
    header_identifier = node_record.find("./{*}header/{*}identifier").text
    identifier_normalized = header_identifier.replace(':', '_') + '.xml'
    if self.verbosity > 0:
        # BUGFIX: the second half of this message lacked the f prefix in
        # the original, so the placeholder printed literally.
        print(f"using bib_vid={bib_vid} to output item with zenodo "
              f"identifier_normalized={identifier_normalized}")
    if save_xml:
        # Serialize the input record and save it
        record_str = etree.tostring(node_record,
                                    pretty_print=True,
                                    xml_declaration=True,
                                    encoding='utf-8')
        # BUGFIX: folder name was misspelled 'recieved', inconsistent
        # with the 'received' folder used elsewhere in this module.
        save_folder = self.output_folder + '/received/'
        os.makedirs(save_folder, exist_ok=True)
        save_file = save_folder + identifier_normalized
        with open(save_file, 'wb') as outfile:
            if self.verbosity > 0:
                print(f"Writing save_file ='{save_file}'")
            outfile.write(record_str)
    # Set some variables to potentially output into the Zenodo METS
    # output template
    utc_now = datetime.datetime.utcnow()
    utc_secs_z = utc_now.strftime("%Y-%m-%dT%H:%M:%SZ")
    # Get basic values from input doc
    node_oaidc = node_record.find(".//{*}dc", namespaces=namespaces)
    if node_oaidc is None:
        raise Exception("Cannot find oai_dc:dc node")
    namespaces_oaidc = {
        key: value
        for key, value in dict(node_oaidc.nsmap).items() if key is not None
    }
    if self.verbosity > 0:
        print(f"Got oai_dc prefix map='{namespaces_oaidc}'\n\n")
    # Handle multiple creators; the first creator is also used for the
    # single-valued agent_creator_individual_name template variable.
    nodes_creator = node_oaidc.findall(".//dc:creator",
                                       namespaces=namespaces_oaidc)
    dc_creators = [node_creator.text for node_creator in nodes_creator]
    dc_creator = dc_creators[0] if dc_creators else ''
    if self.verbosity > 0:
        print(f"Got creators='{dc_creators}'")
        sys.stdout.flush()
    xml_sobekcm_creators = ''
    for creator in dc_creators:
        xml_sobekcm_creators += (f'''
<mods:name type="personal">
<mods:namePart>{creator}</mods:namePart>
<mods:role>
<mods:roleTerm type="text">creator</mods:roleTerm>
</mods:role>
</mods:name>
''')
    dc_date_orig = node_oaidc.find("./dc:date",
                                   namespaces=namespaces_oaidc).text
    if self.verbosity > 0:
        print("Got dc date orig={}".format(dc_date_orig))
    # Must convert dc_date_orig to valid METS format:
    dc_date = '{}T12:00:00Z'.format(dc_date_orig)
    if self.verbosity > 0:
        print("Got dc_date='{}'".format(dc_date))
    node_description = node_oaidc.find(".//{*}description",
                                       namespaces=namespaces_oaidc)
    # Make an element tree style tree to invoke pattern to remove inner
    # xml. If we strip newlines, text with newlines runs together, so
    # replace each with a space.
    str_description = (tostring(node_description,
                                encoding='unicode',
                                method='text').strip().replace('\n', ' '))
    # Special doctype needed to handle nbsp and copyright entities
    xml_dtd = ('<?xml version="1.1" encoding="UTF-8" ?>'
               '<!DOCTYPE naughtyxml [ <!ENTITY nbsp " "> '
               '<!ENTITY copy "©"> ]>')
    xml_description = '{}<doc>{}</doc>'.format(xml_dtd, str_description)
    if self.verbosity > 0:
        print("Got str_description='{}'".format(str_description))
        print("Got xml_description='{}'".format(xml_description))
    # See: https://stackoverflow.com/questions/19369901/python-element-tree-extract-text-from-element-stripping-tags#19370075
    tree_description = ET.fromstring(xml_description)
    dc_description = etl.escape_xml_text(''.join(
        tree_description.itertext()))
    if self.verbosity > 0:
        print("Using dc_description='{}'".format(dc_description))
    nodes_identifier = node_oaidc.findall(".//{*}identifier",
                                          namespaces=namespaces_oaidc)
    # Inferred the following index by pure manual inspection!
    # The first dc identifier is the doi / related url of the item.
    doi = nodes_identifier[0].text
    related_url = '{}'.format(doi)
    nodes_rights = node_oaidc.findall(".//{*}rights",
                                      namespaces=namespaces_oaidc)
    rights_text = 'See:'
    for node_rights in nodes_rights:
        rights_text += ' ' + node_rights.text
    nodes_subject = node_oaidc.findall(".//{*}subject",
                                       namespaces=namespaces_oaidc)
    mods_subjects = ''
    for node_subject in nodes_subject:
        mods_subjects += ('<mods:subject><mods:topic>' +
                          node_subject.text +
                          '</mods:topic></mods:subject>\n')
    dc_title = node_oaidc.find(".//{*}title").text
    dc_type = node_oaidc.find(".//{*}type").text
    sobekcm_aggregations = ['UFDATASETS']
    xml_sobekcm_aggregations = ''
    for aggregation in sobekcm_aggregations:
        xml_sobekcm_aggregations += (
            '<sobekcm:Aggregation>{}</sobekcm:Aggregation>'.format(
                aggregation))
    # Apply basic input values to METS template variables
    d_var_val = {
        'bib_vid': bib_vid,
        'create_date': dc_date,
        'last_mod_date': utc_secs_z,
        'agent_creator_individual_name': dc_creator,
        'agent_creator_individual_note': 'Creation via zenodo harvest',
        'identifier': header_identifier,
        'mods_subjects': mods_subjects,
        'rights_text': rights_text,
        'utc_secs_z': utc_secs_z,
        'title': dc_title,
        'related_url': related_url,
        'xml_sobekcm_aggregations': xml_sobekcm_aggregations,
        'doi': doi,
        'description': dc_description,
        'xml_sobekcm_creators': xml_sobekcm_creators,
        'bibid': bibid_str,
        'vid': vid_str,
        'type_of_resource': dc_type,
        'sha1-mets-v1': '',
        'genre': 'dataset',
        'genre_authority': 'zenodo',
    }
    # Create mets_str and write it
    mets_str = mets_format_str.format(**d_var_val)
    mets_output_folder = self.output_folder + '/mets_output/'
    item_output_folder = mets_output_folder + '/' + bib_vid
    os.makedirs(item_output_folder, exist_ok=True)
    filename_mets = item_output_folder + '/' + bib_vid + '.mets.xml'
    if self.verbosity > 0:
        print("WRITING METS.XML to {}".format(filename_mets))
    with open(filename_mets, 'wb') as outfile:
        print("Writing filename='{}'".format(filename_mets))
        outfile.write(mets_str.encode('utf-8'))