def SSDN_MODS(file_in, tn, dprovide, iprovide=None):
    """Map MODS records from an OAI-PMH harvest file to SSDN documents.

    Every record read from ``file_in`` is turned into a ``sourceResource``
    dict and handed to ``assets.build``. A record is skipped when it is
    flagged as deleted, has no metadata, no PURL, no rights statement or no
    title; all but the first two cases are logged as errors.

    Args:
        file_in: Path of the harvested XML file; opened as UTF-8.
        tn: Value passed unchanged to ``assets.thumbnail_service``.
        dprovide: Data provider; used for the CSV logger and as
            ``aggregation.dataProvider``.
        iprovide: Optional intermediate provider forwarded to
            ``assets.build``.

    Returns:
        list: The documents returned by ``assets.build``, in record order.
    """

    def is_deleted(record):
        """Return True if the record carries a repox or OAI-PMH deletion flag."""
        # repox: deleted="true" attribute on the record element itself
        try:
            if record.attrib.get('deleted') == 'true':
                return True
        except AttributeError:
            pass
        # OAI-PMH: status="deleted" on the header; a missing header makes
        # find() return None, which surfaces here as AttributeError
        try:
            if record.find('./{*}header').attrib.get('status') == 'deleted':
                return True
        except AttributeError:
            pass
        return False

    toc_path = './/{http://www.loc.gov/mods/v3}tableOfContents'

    with open(file_in, encoding='utf-8') as data_in:
        logger = assets.CSVLogger('SSDN_MODS', provider=dprovide)
        records = OAIReader(data_in)
        docs = []
        for record in records:
            # Deleted records are skipped outright, as the DC/QDC mappings
            # in this module do.
            if is_deleted(record):
                continue

            if VERBOSE:
                print(record.oai_urn)
            logger.debug(record.oai_urn)
            sourceResource = {}

            if record.metadata is None:
                continue

            # sourceResource.alternative: every title after the first
            titles = record.metadata.titles
            if len(titles) > 1:
                sourceResource['alternative'] = list(titles[1:])

            # sourceResource.collection

            # sourceResource.contributor
            # Names with an explicit role that is not creator. Entries are
            # accumulated so that every contributor is kept, not only the
            # last one visited.
            try:
                for name in record.metadata.names:
                    role = name.role
                    if (role.text not in (None, 'Creator')
                            and role.code not in (None, 'cre')):
                        if name.uri:
                            contributor = {"@id": name.uri, "name": name.text}
                        else:
                            contributor = {"name": name.text}
                        sourceResource.setdefault(
                            'contributor', []).append(contributor)
            except KeyError as err:
                logger.error('sourceResource.contributor: {0}, {1}'.format(
                    err, record.oai_urn))

            # sourceResource.creator
            # Explicit creators first, then names lacking a role text or code.
            name_list = list(record.metadata.get_creators or [])
            name_list.extend(
                name for name in (record.metadata.names or [])
                if name.role.text is None or name.role.code is None)
            sourceResource['creator'] = [{
                "@id": name.uri,
                "name": name.text
            } if name.uri else {
                "name": name.text
            } for name in name_list]

            # sourceResource.date
            if record.metadata.dates:
                date = record.metadata.dates[0].text
                if ' - ' in date:
                    # "YYYY - YYYY" style range: first and last four chars
                    sourceResource['date'] = {
                        "displayDate": date,
                        "begin": date[0:4],
                        "end": date[-4:]
                    }
                else:
                    sourceResource['date'] = {
                        "displayDate": date,
                        "begin": date,
                        "end": date
                    }

            # sourceResource.description: abstracts, then tables of contents
            if record.metadata.abstract:
                sourceResource['description'] = [
                    abstract.text for abstract in record.metadata.abstract
                ]
            toc_texts = [toc.text
                         for toc in record.metadata.iterfind(toc_path)]
            if toc_texts:
                sourceResource.setdefault('description',
                                          []).extend(toc_texts)

            # sourceResource.extent
            if record.metadata.extent:
                sourceResource['extent'] = record.metadata.extent

            # sourceResource.format
            if record.metadata.genre:
                sourceResource['format'] = [{
                    'name': genre.text,
                    '@id': genre.uri
                } if genre.uri else {
                    'name': genre.text
                } for genre in record.metadata.genre]

            # sourceResource.identifier
            try:
                sourceResource['identifier'] = record.metadata.purl[0]
            except IndexError as err:
                logger.error('sourceResource.identifier: {0}, {1}'.format(
                    err, record.oai_urn))
                continue

            # sourceResource.language
            try:
                if record.metadata.language:
                    sourceResource['language'] = [{
                        "name": lang.text,
                        "iso_639_3": lang.code
                    } for lang in record.metadata.language]
            except AttributeError as err:
                logger.error('sourceResource.language: {0}, {1}'.format(
                    err, record.oai_urn))

            # sourceResource.place : sourceResource['spatial']
            # Iterate the element directly; getchildren() is deprecated.
            # The last subject with a geographic child wins.
            for subject in record.metadata.subjects:
                for child in subject.elem:
                    if 'eographic' in child.tag:
                        sourceResource['spatial'] = {"name": subject.text}

            # sourceResource.publisher
            if record.metadata.publisher:
                sourceResource['publisher'] = record.metadata.publisher

            # sourceResource.relation
            # sourceResource.isReplacedBy
            # sourceResource.replaces

            # sourceResource.rights
            if record.metadata.rights:
                # slicing isn't ideal here since it depends on element order
                sourceResource['rights'] = [{
                    "@id": rights.text
                } if "http://rightsstatements.org" in rights.text else {
                    "text": rights.text
                } for rights in record.metadata.rights[:2]]
            else:
                logger.error('No sourceResource.rights - {0}'.format(
                    record.oai_urn))
                continue

            # sourceResource.subject
            # One entry is appended per non-geographic child element.
            try:
                if record.metadata.subjects:
                    sourceResource['subject'] = []
                    for subject in record.metadata.subjects:
                        for child in subject.elem:
                            if 'eographic' not in child.tag:
                                sourceResource['subject'].append(
                                    {"name": subject.text})
            except (TypeError, IndexError) as err:
                logger.error('sourceResource.subject: {0}, {1}'.format(
                    err, record.oai_urn))

            # sourceResource.title
            if record.metadata.titles:
                sourceResource['title'] = [
                    '{}'.format(record.metadata.titles[0])
                ]
            else:
                logger.error('No sourceResource.title: {0}'.format(
                    record.oai_urn))
                continue

            # sourceResource.type
            sourceResource['type'] = record.metadata.type_of_resource

            # aggregation.dataProvider
            data_provider = dprovide

            # aggregation.intermediateProvider

            # aggregation.isShownAt

            # aggregation.preview
            # Fall back to a PID derived from the OAI URN when MODS has none.
            pid = record.metadata.pid
            if pid is None:
                pid = record.oai_urn.split(':')[-1].replace('_', ':')
            preview = assets.thumbnail_service(pid, tn)

            # aggregation.provider

            # build record
            try:
                if record.metadata.purl[0]:
                    doc = assets.build(record.oai_urn, sourceResource,
                                       data_provider, record.metadata.purl[0],
                                       preview, iprovide)
                    docs.append(doc)
            except UnboundLocalError:
                logger.error('No aggregation.isShownAt - {0}'.format(
                    record.oai_urn))
                continue
        return docs
def SSDN_QDC(file_in, tn, dprovide, iprovide=None):
    """Map Qualified Dublin Core records from a harvest file to SSDN documents.

    Every record read from ``file_in`` is turned into a ``sourceResource``
    dict and handed to ``assets.build``. A record is skipped when it is
    flagged as deleted, or (with an error logged) when it has no rights
    statement, no title, or no ``dc:identifier`` containing ``http``.

    Args:
        file_in: Path of the harvested XML file; opened as UTF-8.
        tn: Value passed unchanged to ``assets.thumbnail_service``.
        dprovide: Data provider; used for the CSV logger and as
            ``aggregation.dataProvider``.
        iprovide: Optional intermediate provider forwarded to
            ``assets.build``.

    Returns:
        list: The documents returned by ``assets.build``, in record order.
    """

    def is_deleted(record):
        """Return True if the record carries a repox or OAI-PMH deletion flag."""
        # repox: deleted="true" attribute on the record element itself
        try:
            if record.attrib.get('deleted') == 'true':
                return True
        except AttributeError:
            pass
        # OAI-PMH: status="deleted" on the header; a missing header makes
        # find() return None, which surfaces here as AttributeError
        try:
            if record.find('./{*}header').attrib.get('status') == 'deleted':
                return True
        except AttributeError:
            pass
        return False

    # Compiled once instead of once per record.
    rights_uri = re.compile('http://rightsstatements')
    # str.rstrip() strips a *set of characters*, not a suffix, and so eats
    # trailing letters of the name itself; remove the marker with a regex.
    contributor_marker = re.compile(r'\(\s*[Cc]ontributor\s*\)')

    # Date elements in order of preference; the first one present is used.
    date_paths = (
        './/{0}created'.format(dcterms),
        './/{0}issued'.format(dcterms),
        './/{0}date'.format(dcterms),
        './/{0}date'.format(dc),
        './/{0}available'.format(dcterms),
        './/{0}dateAccepted'.format(dcterms),
        './/{0}dateCopyrighted'.format(dcterms),
        './/{0}dateSubmitted'.format(dcterms),
    )

    with open(file_in, encoding='utf-8') as data_in:
        logger = assets.CSVLogger('SSDN_QDC', provider=dprovide)
        records = OAIReader(data_in)
        docs = []
        for record in records:
            if is_deleted(record):
                continue

            oai_id = record.oai_urn

            if VERBOSE:
                print(oai_id)
            logger.debug(oai_id)
            sourceResource = {}
            get_element = record.metadata.get_element

            # sourceResource.alternative
            alt_title = get_element('.//{0}alternative'.format(dcterms))
            if alt_title:
                sourceResource['alternative'] = alt_title

            # sourceResource.collection
            collection = get_element('.//{0}isPartOf'.format(dcterms))
            if collection:
                sourceResource['collection'] = collection

            # sourceResource.contributor
            if get_element('.//{0}contributor'.format(dc)):
                sourceResource['contributor'] = [{
                    "name": name
                } for name in get_element('.//{0}contributor'.format(dc),
                                          delimiter=';')]

            # sourceResource.creator
            if get_element('.//{0}creator'.format(dc)):
                sourceResource['creator'] = []
                for name in get_element('.//{0}creator'.format(dc),
                                        delimiter=';'):
                    # creators tagged "( Contributor )" / "( contributor )"
                    # are really contributors
                    if "ontributor )" in name:
                        sourceResource.setdefault('contributor', []).append({
                            "name":
                            contributor_marker.sub('', name).strip(" ")
                        })
                    elif len(name) > 0:
                        sourceResource['creator'].append(
                            {"name": name.strip(" ")})

            # sourceResource.date
            date = None
            for date_path in date_paths:
                date = get_element(date_path)
                if date is not None:
                    break
            if date is not None:
                sourceResource['date'] = {
                    "begin": date[0],
                    "end": date[0],
                    "displayDate": date[0]
                }

            # sourceResource.description: dc:description then dcterms:abstract
            description = []
            for description_path in ('.//{0}description'.format(dc),
                                     './/{0}abstract'.format(dcterms)):
                items = get_element(description_path)
                if items is not None:
                    description.extend(items)
            if description:
                sourceResource['description'] = description

            # sourceResource.extent
            if get_element('.//{0}extent'.format(dcterms)):
                sourceResource['extent'] = get_element(
                    './/{0}extent'.format(dcterms), delimiter=';')

            # sourceResource.format
            # IANA media types are not formats here and are dropped.
            # TODO: file_format kicked out of SR.genre
            if get_element('.//{0}medium'.format(dcterms)):
                formats = [{
                    'name': element.strip(' ')
                } for element in get_element('.//{0}medium'.format(dcterms),
                                             delimiter=';')
                           if element.lower() not in IANA_type_list
                           and len(element) > 0]
                if formats:
                    sourceResource['format'] = formats

            # sourceResource.genre

            # sourceResource.identifier
            sourceResource['identifier'] = oai_id

            # sourceResource.language
            # Values longer than three characters are treated as names,
            # shorter ones as ISO 639-3 codes.
            if get_element('.//{0}language'.format(dc)):
                sourceResource['language'] = []
                for element in get_element('.//{0}language'.format(dc),
                                           delimiter=';'):
                    if len(element) > 3:
                        sourceResource['language'].append({"name": element})
                    else:
                        sourceResource['language'].append(
                            {"iso_639_3": element})

            # sourceResource.place : sourceResource['spatial']
            # Purely numeric values are skipped; every remaining place is
            # kept rather than only the last one.
            if get_element('.//{0}spatial'.format(dcterms)):
                places = []
                for place in get_element('.//{0}spatial'.format(dcterms),
                                         delimiter=';'):
                    try:
                        float(place)
                    except ValueError:
                        places.append(place)
                if places:
                    sourceResource['spatial'] = places

            # sourceResource.publisher
            publisher = get_element('.//{0}publisher'.format(dc))
            if publisher:
                sourceResource['publisher'] = publisher

            # sourceResource.relation
            # sourceResource.isReplacedBy
            # sourceResource.replaces

            # sourceResource.rights
            # A rightsstatements.org URI wins; otherwise the last free-text
            # statement is used.
            rights = get_element('.//{0}rights'.format(dc))
            if not rights:
                logger.error('No sourceResource.rights - {0}'.format(oai_id))
                continue
            for rights_statement in rights:
                if rights_uri.search(rights_statement):
                    sourceResource['rights'] = [{
                        "@id": rights_statement.strip()
                    }]
                    break
                sourceResource['rights'] = [{
                    "text": rights_statement.strip()
                }]

            # sourceResource.subject
            if get_element('.//{0}subject'.format(dc)):
                sourceResource['subject'] = [{
                    "name": name
                } for name in get_element('.//{0}subject'.format(dc),
                                          delimiter=';')]

            # sourceResource.temporal
            temporal = get_element('.//{0}temporal'.format(dcterms))
            if temporal:
                sourceResource['temporal'] = temporal

            # sourceResource.title
            title = get_element('.//{0}title'.format(dc))
            if title is not None:
                sourceResource['title'] = title
            else:
                logger.error('No sourceResource.title - {0}'.format(oai_id))
                continue

            # sourceResource.type
            if get_element('.//{0}type'.format(dc)):
                sourceResource['type'] = get_element(
                    './/{0}type'.format(dc), delimiter=';')

            # webResource.fileFormat

            # aggregation.dataProvider
            data_provider = dprovide

            # aggregation.intermediateProvider

            # aggregation.isShownAt / aggregation.preview
            # Reset for every record so that a record without an http
            # identifier cannot inherit the previous record's URL.
            is_shown_at = None
            preview = None
            for identifier in (
                    get_element('.//{0}identifier'.format(dc)) or []):
                if 'http' in identifier:
                    is_shown_at = identifier
                    preview = assets.thumbnail_service(identifier, tn)

            # aggregation.provider

            # build record
            if not is_shown_at:
                logger.error('No aggregation.isShownAt - {0}'.format(oai_id))
                continue
            doc = assets.build(oai_id, sourceResource, data_provider,
                               is_shown_at, preview, iprovide)
            docs.append(doc)
        return docs
def SSDN_DC(file_in, tn, dprovide, iprovide=None):
    """Map simple Dublin Core records from a harvest file to SSDN documents.

    Every record read from ``file_in`` is turned into a ``sourceResource``
    dict and handed to ``assets.build``. A record is skipped when it is
    flagged as deleted, or (with an error logged) when it has no rights
    statement, no title, or no ``dc:identifier`` containing ``http``.

    Args:
        file_in: Path of the harvested XML file; opened as UTF-8.
        tn: Value passed unchanged to ``assets.thumbnail_service``.
        dprovide: Data provider; used for the CSV logger and as
            ``aggregation.dataProvider``.
        iprovide: Optional intermediate provider forwarded to
            ``assets.build``.

    Returns:
        list: The documents returned by ``assets.build``, in record order.
    """
    # Patterns are compiled once rather than per call / per record.
    mark_up_re = re.compile('<.*?>')
    rights_uri = re.compile('http://rightsstatements')
    lcsh_suffix = re.compile(r"\( lcsh \)$")
    # str.rstrip() strips a *set of characters*, not a suffix, and so eats
    # trailing letters of the name itself; remove the marker with a regex.
    contributor_marker = re.compile(r'\(\s*[Cc]ontributor\s*\)')

    def clean_mark_up(text):
        """Remove ``<...>`` tags and turn newlines into spaces."""
        return mark_up_re.sub('', text).replace('\n', ' ')

    def is_deleted(record):
        """Return True if the record carries a repox or OAI-PMH deletion flag."""
        # repox: deleted="true" attribute on the record element itself
        try:
            if record.attrib.get('deleted') == 'true':
                return True
        except AttributeError:
            pass
        # OAI-PMH: status="deleted" on the header; a missing header makes
        # find() return None, which surfaces here as AttributeError
        try:
            if record.find('./{*}header').attrib.get('status') == 'deleted':
                return True
        except AttributeError:
            pass
        return False

    with open(file_in, encoding='utf-8') as data_in:
        logger = assets.CSVLogger('SSDN_DC', provider=dprovide)
        records = OAIReader(data_in)
        docs = []
        for record in records:
            if is_deleted(record):
                continue

            oai_id = record.oai_urn

            if VERBOSE:
                print(oai_id)
            logger.debug(oai_id)
            sourceResource = {}
            get_element = record.metadata.get_element

            # sourceResource.alternative

            # sourceResource.collection
            collection = get_element('.//{0}relation'.format(dc))
            if collection:
                sourceResource['collection'] = collection

            # sourceResource.contributor
            if get_element('.//{0}contributor'.format(dc)):
                sourceResource['contributor'] = [{
                    "name": name
                } for name in get_element('.//{0}contributor'.format(dc),
                                          delimiter=';')]

            # sourceResource.creator
            if get_element('.//{0}creator'.format(dc)):
                sourceResource['creator'] = []
                for name in get_element('.//{0}creator'.format(dc),
                                        delimiter=';'):
                    # creators tagged "( Contributor )" / "( contributor )"
                    # are really contributors
                    if "ontributor )" in name:
                        sourceResource.setdefault('contributor', []).append({
                            "name":
                            contributor_marker.sub('', name).strip(" ")
                        })
                    elif len(name) > 0:
                        sourceResource['creator'].append(
                            {"name": name.strip(" ")})

            # sourceResource.date
            date = get_element('.//{0}date'.format(dc))
            if date:
                try:
                    # dateparser.parse() returns None for unparseable input,
                    # which raises AttributeError on .date()
                    d = dateparser.parse(
                        date[0], languages=['en']).date().isoformat()
                    sourceResource['date'] = {
                        "begin": d,
                        "end": d,
                        "displayDate": d
                    }
                except AttributeError as err:
                    logger.warning('sourceResource.date: {0}, {1}'.format(
                        err, record.oai_urn))
                    # NOTE(review): the fallback stores a bare string while
                    # the success path stores a dict - confirm consumers
                    # accept both shapes.
                    sourceResource['date'] = date[0]

            # sourceResource.description
            if get_element('.//{0}description'.format(dc)):
                sourceResource['description'] = [
                    clean_mark_up(desc)
                    for desc in get_element('.//{0}description'.format(dc),
                                            delimiter=';')
                ]

            # sourceResource.extent

            # sourceResource.format
            formats = get_element('.//{0}format'.format(dc))
            if formats:
                sourceResource['format'] = formats

            # sourceResource.genre

            # sourceResource.identifier
            sourceResource['identifier'] = oai_id

            # sourceResource.language
            if get_element('.//{0}language'.format(dc)):
                sourceResource['language'] = list(
                    get_element('.//{0}language'.format(dc), delimiter=';'))

            # sourceResource.place : sourceResource['spatial']
            coverage = get_element('.//{0}coverage'.format(dc))
            if coverage:
                sourceResource['spatial'] = [{
                    'name': place
                } for place in coverage]

            # sourceResource.publisher
            publisher = get_element('.//{0}publisher'.format(dc))
            if publisher:
                sourceResource['publisher'] = publisher

            # sourceResource.relation
            # sourceResource.isReplacedBy
            # sourceResource.replaces

            # sourceResource.rights
            # A rightsstatements.org URI wins; otherwise the last free-text
            # statement is used.
            rights = get_element('.//{0}rights'.format(dc))
            if not rights:
                logger.error('No sourceResource.rights - {0}'.format(oai_id))
                continue
            for rights_statement in rights:
                if rights_uri.search(rights_statement):
                    sourceResource['rights'] = [{
                        "@id": rights_statement.strip()
                    }]
                    break
                sourceResource['rights'] = [{
                    "text": rights_statement.strip()
                }]

            # sourceResource.subject
            if get_element('.//{0}subject'.format(dc)):
                sourceResource['subject'] = []
                for term in get_element('.//{0}subject'.format(dc),
                                        delimiter=';'):
                    term = lcsh_suffix.sub('', term)
                    if len(term) > 0:
                        sourceResource['subject'].append(
                            {"name": term.strip(". ")})

            # sourceResource.temporal

            # sourceResource.title
            title = get_element('.//{0}title'.format(dc))
            if title is not None:
                sourceResource['title'] = title
            else:
                logger.error('No sourceResource.title - {0}'.format(oai_id))
                continue

            # sourceResource.type
            if get_element('.//{0}type'.format(dc)):
                sourceResource['type'] = get_element(
                    './/{0}type'.format(dc), delimiter=';')

            # webResource.fileFormat
            # TODO: file_format kicked out of SR.genre

            # aggregation.dataProvider
            data_provider = dprovide

            # aggregation.intermediateProvider

            # aggregation.isShownAt
            # Reset for every record so that a record without an http
            # identifier cannot inherit the previous record's URL.
            is_shown_at = None
            for identifier in (
                    get_element('.//{0}identifier'.format(dc)) or []):
                if 'http' in identifier:
                    is_shown_at = identifier

            # aggregation.preview
            preview = None
            try:
                preview = assets.thumbnail_service(record, tn)
            except (TypeError, UnboundLocalError) as err:
                logger.warning('aggregation.preview: {0} - {1}'.format(
                    err, oai_id))

            # aggregation.provider

            # build record
            if not is_shown_at:
                logger.error('No aggregation.isShownAt - {0}'.format(oai_id))
                continue
            doc = assets.build(oai_id, sourceResource, data_provider,
                               is_shown_at, preview, iprovide)
            docs.append(doc)
        return docs
def FlMem(file_in, tn, dprovide, iprovide=None):
    """Map Florida Memory Dublin Core records to SSDN documents.

    Every record read from ``file_in`` is turned into a ``sourceResource``
    dict and handed to ``assets.build``. Identifier URLs are rewritten to
    the ``www.floridamemory.com`` host over https, and the rights statement
    is currently hard-coded. A record is skipped when it is flagged as
    deleted, or (with an error logged) when it has no title or no
    ``dc:identifier`` containing ``http``.

    Args:
        file_in: Path of the harvested XML file; opened as UTF-8.
        tn: Value passed unchanged to ``assets.thumbnail_service``.
        dprovide: Data provider; used for the CSV logger and as
            ``aggregation.dataProvider``.
        iprovide: Optional intermediate provider forwarded to
            ``assets.build``.

    Returns:
        list: The documents returned by ``assets.build``, in record order.
    """

    def is_deleted(record):
        """Return True if the record carries a repox or OAI-PMH deletion flag."""
        # repox: deleted="true" attribute on the record element itself
        try:
            if record.attrib.get('deleted') == 'true':
                return True
        except AttributeError:
            pass
        # OAI-PMH: status="deleted" on the header; a missing header makes
        # find() return None, which surfaces here as AttributeError
        try:
            if record.find('./{*}header').attrib.get('status') == 'deleted':
                return True
        except AttributeError:
            pass
        return False

    # Compiled once instead of once per subject term.
    lcsh_suffix = re.compile(r"\( lcsh \)$")
    # str.rstrip() strips a *set of characters*, not a suffix, and so eats
    # trailing letters of the name itself; remove the marker with a regex.
    contributor_marker = re.compile(r'\(\s*[Cc]ontributor\s*\)')

    with open(file_in, encoding='utf-8') as data_in:
        logger = assets.CSVLogger('FlMem', provider=dprovide)
        records = OAIReader(data_in)
        docs = []
        for record in records:
            if is_deleted(record):
                continue

            oai_id = record.oai_urn

            if VERBOSE:
                print(oai_id)
            logger.debug(oai_id)
            sourceResource = {}
            get_element = record.metadata.get_element

            # sourceResource.alternative

            # sourceResource.collection
            source = get_element('.//{0}source'.format(dc))
            if source:
                sourceResource['collection'] = {'name': source[0]}

            # sourceResource.contributor
            if get_element('.//{0}contributor'.format(dc)):
                sourceResource['contributor'] = [{
                    "name": name
                } for name in get_element('.//{0}contributor'.format(dc),
                                          delimiter=';')]

            # sourceResource.creator
            if get_element('.//{0}creator'.format(dc)):
                sourceResource['creator'] = []
                for name in get_element('.//{0}creator'.format(dc),
                                        delimiter=';'):
                    # creators tagged "( Contributor )" / "( contributor )"
                    # are really contributors
                    if "ontributor )" in name:
                        sourceResource.setdefault('contributor', []).append({
                            "name":
                            contributor_marker.sub('', name).strip(" ")
                        })
                    elif len(name) > 0:
                        sourceResource['creator'].append(
                            {"name": name.strip(" ")})

            # sourceResource.date
            date = get_element('.//{0}date'.format(dc))
            if date:
                sourceResource['date'] = {
                    "begin": date[0],
                    "end": date[0],
                    "displayDate": date[0]
                }

            # sourceResource.description
            if get_element('.//{0}description'.format(dc)):
                sourceResource['description'] = get_element(
                    './/{0}description'.format(dc), delimiter=';')

            # sourceResource.extent

            # sourceResource.format
            formats = get_element('.//{0}format'.format(dc))
            if formats:
                sourceResource['format'] = formats

            # sourceResource.genre

            # sourceResource.identifier
            # is_shown_at is reset for every record so that a record without
            # an http identifier cannot inherit the previous record's URL.
            is_shown_at = None
            for identifier in (
                    get_element('.//{0}identifier'.format(dc)) or []):
                if 'http' in identifier:
                    # swap the host part for the public site and force https
                    is_shown_at = identifier.replace(
                        identifier.split('/')[2], 'www.floridamemory.com')
                    is_shown_at = is_shown_at.replace('http:', 'https:')
            sourceResource['identifier'] = oai_id.replace(
                oai_id.split(':')[1], 'www.floridamemory.com')

            # sourceResource.language
            if get_element('.//{0}language'.format(dc)):
                sourceResource['language'] = [
                    assets.iso639_2code(lang.split('-')[0])
                    for lang in get_element('.//{0}language'.format(dc),
                                            delimiter=';')
                ]

            # sourceResource.place : sourceResource['spatial']
            coverage = get_element('.//{0}coverage'.format(dc))
            if coverage:
                sourceResource['spatial'] = [{
                    'name': place
                } for place in coverage]

            # sourceResource.publisher
            publisher = get_element('.//{0}publisher'.format(dc))
            if publisher:
                sourceResource['publisher'] = publisher

            # sourceResource.relation
            # sourceResource.isReplacedBy
            # sourceResource.replaces

            # sourceResource.rights
            # TODO: hard-coding is only temporary
            sourceResource['rights'] = {
                '@id': 'http://rightsstatements.org/vocab/NoC-US/1.0/'
            }
            # rights = record.metadata.get_element('.//{0}rights'.format(dc))
            # if rights:
            #     sourceResource['rights'] = [{'text': rights[0]}]
            # else:
            #     logger.error('No sourceResource.rights - {0}'.format(oai_id))
            #     # continue  # TODO renable for prod
            #     pass  # local test

            # sourceResource.subject
            if get_element('.//{0}subject'.format(dc)):
                sourceResource['subject'] = []
                for term in get_element('.//{0}subject'.format(dc),
                                        delimiter=';'):
                    term = lcsh_suffix.sub('', term)
                    if len(term) > 0:
                        sourceResource['subject'].append(
                            {"name": term.strip(". ")})

            # sourceResource.title
            title = get_element('.//{0}title'.format(dc))
            if title:
                sourceResource['title'] = title
            else:
                # message names the element that is actually missing
                logger.error('No sourceResource.title - {0}'.format(oai_id))
                continue

            # sourceResource.temporal (read from the same dc:coverage
            # element as spatial)
            temporal = get_element('.//{0}coverage'.format(dc))
            if temporal:
                sourceResource['temporal'] = temporal

            # sourceResource.type: dc:type values, then dc:format values
            if get_element('.//{0}type'.format(dc)):
                sourceResource['type'] = get_element(
                    './/{0}type'.format(dc), delimiter=';')
            type_formats = get_element('.//{0}format'.format(dc))
            if type_formats:
                if 'type' in sourceResource:
                    sourceResource['type'] = (sourceResource['type'] +
                                              type_formats)
                else:
                    sourceResource['type'] = get_element(
                        './/{0}format'.format(dc), delimiter=';')

            # webResource.fileFormat

            # aggregation.dataProvider
            data_provider = dprovide

            # aggregation.intermediateProvider

            # aggregation.isShownAt
            if not is_shown_at:
                logger.error('No aggregation.isShownAt - {0}'.format(oai_id))
                continue

            # aggregation.preview
            preview = None
            try:
                preview = assets.thumbnail_service(is_shown_at, tn)
            except UnboundLocalError as err:
                logger.error('aggregation.preview: {0} - {1}'.format(
                    err, oai_id))

            # aggregation.provider

            # build record
            doc = assets.build(oai_id, sourceResource, data_provider,
                               is_shown_at, preview, iprovide)
            docs.append(doc)
        return docs
def FlaLD_MODS(file_in, tn, dprovide, iprovide=None):
    """Map FlaLD MODS records from a harvest file to SSDN documents.

    Every record read from ``file_in`` is turned into a ``sourceResource``
    dict and handed to ``assets.build``. Records whose IID starts with one
    of a few known FSU partner prefixes get that partner as data provider
    and Florida State University Libraries as intermediate provider. A
    record is skipped when it is flagged as deleted, has no metadata, no
    PURL, no rights statement or no title, or when the geographic-code
    lookup raises ``TypeError``; all but the first two cases are logged as
    errors.

    Args:
        file_in: Path of the harvested XML file; opened as UTF-8.
        tn: Value passed unchanged to ``assets.thumbnail_service``.
        dprovide: Default data provider; also used for the CSV logger.
        iprovide: Optional default intermediate provider forwarded to
            ``assets.build``.

    Returns:
        list: The documents returned by ``assets.build``, in record order.
    """

    def is_deleted(record):
        """Return True if the record carries a repox or OAI-PMH deletion flag."""
        # repox: deleted="true" attribute on the record element itself
        try:
            if record.attrib.get('deleted') == 'true':
                return True
        except AttributeError:
            pass
        # OAI-PMH: status="deleted" on the header; a missing header makes
        # find() return None, which surfaces here as AttributeError
        try:
            if record.find('./{*}header').attrib.get('status') == 'deleted':
                return True
        except AttributeError:
            pass
        return False

    # IID prefix -> data provider for collections hosted by FSU on behalf
    # of a partner; checked in this order.
    fsu_partners = (
        ('FSU_FBCTLH', 'First Baptist Church of Tallahassee'),
        ('FSU_LeonHigh', 'Leon High School, Tallahassee, Florida'),
        ('FSU_Godby', 'Godby High School, Tallahassee, Florida'),
        ('FSU_HHHS', 'Havana History & Heritage Society, Havana, Florida'),
        # ('FSU_Ringling', 'John and Mable Ringling Museum of Art'),
    )
    tgn_attribution = (
        "This record contains information from Thesaurus of Geographic "
        "Names (TGN) which is made available under the ODC Attribution "
        "License.")

    with open(file_in, encoding='utf-8') as data_in:
        logger = assets.CSVLogger('FlaLD_MODS', provider=dprovide)
        records = OAIReader(data_in)
        docs = []
        for record in records:
            # Deleted records are skipped outright, as the DC mappings in
            # this module do.
            if is_deleted(record):
                continue

            if VERBOSE:
                print(record.oai_urn)
            logger.debug(record.oai_urn)
            sourceResource = {}

            if record.metadata is None:
                continue

            # sourceResource.alternative: every title after the first
            titles = record.metadata.titles
            if len(titles) > 1:
                sourceResource['alternative'] = list(titles[1:])

            # sourceResource.collection
            if record.metadata.collection:
                collection = record.metadata.collection
                sourceResource['collection'] = {}
                if collection.title:
                    sourceResource['collection']['name'] = collection.title
                if collection.location:
                    sourceResource['collection']['host'] = collection.location
                if collection.url:
                    sourceResource['collection']['_:id'] = collection.url

            # sourceResource.contributor
            # A name is a contributor only when neither its role text nor
            # its role code marks it as creator (the former `or` was true
            # for almost every name). Entries are accumulated so every
            # contributor is kept, not only the last one visited.
            try:
                for name in record.metadata.names:
                    if name.role.text != 'Creator' and name.role.code != 'cre':
                        if name.uri:
                            contributor = {"@id": name.uri, "name": name.text}
                        else:
                            contributor = {"name": name.text}
                        sourceResource.setdefault(
                            'contributor', []).append(contributor)
            except KeyError as err:
                logger.error('sourceResource.contributor: {0}, {1}'.format(
                    err, record.oai_urn))

            # sourceResource.creator
            if record.metadata.get_creators:
                sourceResource['creator'] = [{
                    "@id": name.uri,
                    "name": name.text
                } if name.uri else {
                    "name": name.text
                } for name in record.metadata.get_creators]

            # sourceResource.date
            if record.metadata.dates:
                date = record.metadata.dates[0].text
                if ' - ' in date:
                    # "YYYY - YYYY" style range: first and last four chars
                    sourceResource['date'] = {
                        "displayDate": date,
                        "begin": date[0:4],
                        "end": date[-4:]
                    }
                else:
                    sourceResource['date'] = {
                        "displayDate": date,
                        "begin": date,
                        "end": date
                    }

            # sourceResource.description
            if record.metadata.abstract:
                sourceResource['description'] = [
                    abstract.text for abstract in record.metadata.abstract
                ]

            # sourceResource.extent
            if record.metadata.extent:
                sourceResource['extent'] = record.metadata.extent

            # sourceResource.format
            if record.metadata.form:
                sourceResource['format'] = record.metadata.form

            # sourceResource.genre
            if record.metadata.genre:
                sourceResource['genre'] = [{
                    'name': genre.text,
                    '@id': genre.uri
                } if genre.uri else {
                    'name': genre.text
                } for genre in record.metadata.genre]

            # sourceResource.identifier
            try:
                sourceResource['identifier'] = record.metadata.purl[0]
            except IndexError as err:
                logger.error('sourceResource.identifier: {0}, {1}'.format(
                    err, record.oai_urn))
                continue

            # sourceResource.language
            try:
                if record.metadata.language:
                    sourceResource['language'] = [{
                        "name": lang.text,
                        "iso_639_3": lang.code
                    } for lang in record.metadata.language]
            except AttributeError as err:
                logger.error('sourceResource.language: {0}, {1}'.format(
                    err, record.oai_urn))

            # sourceResource.place : sourceResource['spatial']
            # Each geographic code is resolved through assets.tgn_cache; a
            # TypeError during the lookup drops the whole record.
            try:
                if record.metadata.geographic_code and len(
                        record.metadata.geographic_code) > 0:
                    sourceResource['spatial'] = []
                    for geo_code in record.metadata.geographic_code:
                        _code, lat, lng, label = assets.tgn_cache(
                            geo_code.strip())
                        sourceResource['spatial'].append({
                            "lat": lat,
                            "long": lng,
                            "name": label,
                            "_:attribution": tgn_attribution
                        })
            except TypeError as err:
                logger.error('sourceResource.spatial: {0}, {1}'.format(
                    err, record.oai_urn))
                continue

            # sourceResource.publisher
            if record.metadata.publisher:
                sourceResource['publisher'] = record.metadata.publisher

            # sourceResource.relation
            # sourceResource.isReplacedBy
            # sourceResource.replaces

            # sourceResource.rights
            if record.metadata.rights:
                sourceResource['rights'] = [{
                    "@id": rights.uri
                } if rights.uri else {
                    "text": rights.text
                } for rights in record.metadata.rights]
            else:
                logger.error('No sourceResource.rights - {0}'.format(
                    record.oai_urn))
                continue

            # sourceResource.subject
            try:
                if record.metadata.subjects:
                    sourceResource['subject'] = [{
                        "@id": subject.uri,
                        "name": subject.text
                    } if subject.uri is not None else {
                        "name": subject.text
                    } for subject in record.metadata.subjects]
            except (TypeError, IndexError) as err:
                logger.error('sourceResource.subject: {0}, {1}'.format(
                    err, record.oai_urn))

            # sourceResource.title
            if record.metadata.titles:
                sourceResource['title'] = [
                    '{}'.format(record.metadata.titles[0])
                ]
            else:
                logger.error('No sourceResource.title: {0}'.format(
                    record.oai_urn))
                continue

            # sourceResource.type
            sourceResource['type'] = record.metadata.type_of_resource

            # aggregation.dataProvider / aggregation.intermediateProvider
            # Both are decided per record; the iprovide argument is never
            # reassigned, so a partner match cannot leak to later records.
            data_provider = dprovide
            intermediate_provider = iprovide
            iid = record.metadata.iid or ''
            for prefix, partner in fsu_partners:
                if iid.startswith(prefix):
                    data_provider = partner
                    intermediate_provider = (
                        'Florida State University Libraries')
                    break

            # aggregation.isShownAt

            # aggregation.preview
            # Fall back to a PID derived from the OAI URN when MODS has none.
            pid = record.metadata.pid
            if pid is None:
                pid = record.oai_urn.split(':')[-1].replace('_', ':')
            preview = assets.thumbnail_service(pid, tn)

            # aggregation.provider

            # build record
            try:
                if record.metadata.purl[0]:
                    doc = assets.build(record.oai_urn, sourceResource,
                                       data_provider, record.metadata.purl[0],
                                       preview, intermediate_provider)
                    docs.append(doc)
            except UnboundLocalError:
                logger.error('No aggregation.isShownAt - {0}'.format(
                    record.oai_urn))
                continue
        return docs
def FlaLD_DC(file_in, tn, dprovide, iprovide=None):
    """Map FlaLD Dublin Core (dPanther) records to SSDN documents.

    Every record read from ``file_in`` is turned into a ``sourceResource``
    dict and handed to ``assets.build``. The identifier, which is also used
    as ``isShownAt``, is the first dPanther PURL among the ``dc:identifier``
    values, or failing that the last plain dPanther URL. A record is
    skipped when it is flagged as deleted, or (with an error logged) when
    it has no dPanther identifier, no rights statement or no title.

    Args:
        file_in: Path of the harvested XML file; opened as UTF-8.
        tn: Value passed unchanged to ``assets.thumbnail_service``.
        dprovide: Data provider; used for the CSV logger and as
            ``aggregation.dataProvider``.
        iprovide: Optional intermediate provider forwarded to
            ``assets.build``.

    Returns:
        list: The documents returned by ``assets.build``, in record order.
    """

    def is_deleted(record):
        """Return True if the record carries a repox or OAI-PMH deletion flag."""
        # repox: deleted="true" attribute on the record element itself
        try:
            if record.attrib.get('deleted') == 'true':
                return True
        except AttributeError:
            pass
        # OAI-PMH: status="deleted" on the header; a missing header makes
        # find() return None, which surfaces here as AttributeError
        try:
            if record.find('./{*}header').attrib.get('status') == 'deleted':
                return True
        except AttributeError:
            pass
        return False

    # Patterns are compiled once rather than once per record.
    dPantherPURL = re.compile(
        'http://dpanther.fiu.edu/dpService/dpPurlService')
    dPantherURL = re.compile('http://dpanther')
    lcsh_suffix = re.compile(r"\( lcsh \)$")
    # str.rstrip() strips a *set of characters*, not a suffix, and so eats
    # trailing letters of the name itself; remove the marker with a regex.
    contributor_marker = re.compile(r'\(\s*[Cc]ontributor\s*\)')

    with open(file_in, encoding='utf-8') as data_in:
        logger = assets.CSVLogger('FlaLD_DC', provider=dprovide)
        records = OAIReader(data_in)
        docs = []
        for record in records:
            if is_deleted(record):
                continue

            oai_id = record.oai_urn

            if VERBOSE:
                print(oai_id)
            logger.debug(oai_id)
            sourceResource = {}
            get_element = record.metadata.get_element

            # sourceResource.alternative

            # sourceResource.collection

            # sourceResource.contributor
            if get_element('.//{0}contributor'.format(dc)):
                sourceResource['contributor'] = [{
                    "name": name
                } for name in get_element('.//{0}contributor'.format(dc),
                                          delimiter=';')]

            # sourceResource.creator
            if get_element('.//{0}creator'.format(dc)):
                sourceResource['creator'] = []
                for name in get_element('.//{0}creator'.format(dc),
                                        delimiter=';'):
                    # creators tagged "( Contributor )" / "( contributor )"
                    # are really contributors
                    if "ontributor )" in name:
                        sourceResource.setdefault('contributor', []).append({
                            "name":
                            contributor_marker.sub('', name).strip(" ")
                        })
                    elif len(name) > 0:
                        sourceResource['creator'].append(
                            {"name": name.strip(" ")})

            # sourceResource.date
            date = get_element('.//{0}date'.format(dc))
            if date:
                sourceResource['date'] = {
                    "begin": date[0],
                    "end": date[0],
                    "displayDate": date[0]
                }

            # sourceResource.description
            if get_element('.//{0}description'.format(dc)):
                sourceResource['description'] = get_element(
                    './/{0}description'.format(dc), delimiter=';')

            # sourceResource.extent

            # sourceResource.format
            formats = get_element('.//{0}format'.format(dc))
            if formats:
                sourceResource['format'] = formats

            # sourceResource.genre

            # sourceResource.identifier
            # The first PURL ends the search; a plain dPanther URL is kept
            # as a fallback (with a warning) while the search goes on.
            is_shown_at = None
            for candidate in (
                    get_element('.//{0}identifier'.format(dc)) or []):
                if dPantherPURL.search(candidate):
                    is_shown_at = candidate
                    break
                if dPantherURL.search(candidate):
                    is_shown_at = candidate
                    logger.warning(
                        'sourceResource.identifier: {0} - {1}'.format(
                            'Not a PURL', oai_id))
            if is_shown_at is None:
                # covers both a missing dc:identifier and one without any
                # dPanther URL, which used to raise an uncaught KeyError
                logger.error('sourceResource.identifier: {0} - {1}'.format(
                    'No dPanther identifier', oai_id))
                continue
            sourceResource['identifier'] = is_shown_at

            # sourceResource.language
            # Values longer than three characters are treated as names,
            # shorter ones as ISO 639-3 codes.
            if get_element('.//{0}language'.format(dc)):
                sourceResource['language'] = []
                for element in get_element('.//{0}language'.format(dc),
                                           delimiter=';'):
                    if len(element) > 3:
                        sourceResource['language'].append({"name": element})
                    else:
                        sourceResource['language'].append(
                            {"iso_639_3": element})

            # sourceResource.place : sourceResource['spatial']
            coverage = get_element('.//{0}coverage'.format(dc))
            if coverage:
                sourceResource['spatial'] = [{
                    'name': place
                } for place in coverage]

            # sourceResource.publisher
            publisher = get_element('.//{0}publisher'.format(dc))
            if publisher:
                sourceResource['publisher'] = publisher

            # sourceResource.relation
            # sourceResource.isReplacedBy
            # sourceResource.replaces

            # sourceResource.rights
            rights = get_element('.//{0}rights'.format(dc))
            if rights:
                sourceResource['rights'] = [{'text': rights[0]}]
            else:
                logger.error('No sourceResource.rights - {0}'.format(oai_id))
                continue

            # sourceResource.subject
            if get_element('.//{0}subject'.format(dc)):
                sourceResource['subject'] = []
                for term in get_element('.//{0}subject'.format(dc),
                                        delimiter=';'):
                    term = lcsh_suffix.sub('', term)
                    if len(term) > 0:
                        sourceResource['subject'].append(
                            {"name": term.strip(" ")})

            # sourceResource.title
            title = get_element('.//{0}title'.format(dc))
            if title:
                sourceResource['title'] = title
            else:
                # message names the element that is actually missing
                logger.error('No sourceResource.title - {0}'.format(oai_id))
                continue

            # sourceResource.type
            if get_element('.//{0}type'.format(dc)):
                sourceResource['type'] = get_element(
                    './/{0}type'.format(dc), delimiter=';')

            # webResource.fileFormat

            # aggregation.dataProvider
            data_provider = dprovide

            # aggregation.intermediateProvider

            # aggregation.isShownAt

            # aggregation.preview
            preview = None
            try:
                preview = assets.thumbnail_service(record, tn)
            except (TypeError, UnboundLocalError) as err:
                logger.warning('aggregation.preview: {0} - {1}'.format(
                    err, oai_id))

            # aggregation.provider

            # build record
            doc = assets.build(oai_id, sourceResource, data_provider,
                               is_shown_at, preview, iprovide)
            docs.append(doc)
        return docs