def biblfull_xmletree_to_metajson(biblfull, laboratories, projects, source):
    """TEI biblFull xmletree -> MetaJSON Document.

    Extracts the main TEI header statements and maps languages and titles
    onto a new Document. Returns None when *biblfull* is None.

    NOTE: this file defines biblfull_xmletree_to_metajson twice; at import
    time the later definition shadows this one.
    """
    if biblfull is None:
        return None
    document = Document()
    # titleStmt
    tei_titlestmt = biblfull.find(xmletree.prefixtag("tei", "titleStmt"))
    # editionStmt (extracted but not yet mapped)
    tei_editionstmt = biblfull.find(xmletree.prefixtag("tei", "editionStmt"))
    # extent (extracted but not yet mapped)
    tei_extent = biblfull.find(xmletree.prefixtag("tei", "extent"))
    # publicationStmt (extracted but not yet mapped)
    tei_publicationstmt = biblfull.find(
        xmletree.prefixtag("tei", "publicationStmt"))
    # seriesStmt (extracted but not yet mapped)
    tei_seriesstmt = biblfull.find(xmletree.prefixtag("tei", "seriesStmt"))
    # notesStmt (extracted but not yet mapped)
    tei_notesstmt = biblfull.find(xmletree.prefixtag("tei", "notesStmt"))
    # sourceDesc (extracted but not yet mapped)
    tei_sourcedescs = biblfull.findall(xmletree.prefixtag("tei", "sourceDesc"))
    # profileDesc: Element.find() returns None when the child is missing,
    # so guard every step instead of crashing with AttributeError.
    tei_languages = []
    tei_keywords = []
    tei_classcodes = []
    tei_profiledesc = biblfull.find(xmletree.prefixtag("tei", "profileDesc"))
    if tei_profiledesc is not None:
        tei_langusage = tei_profiledesc.find(
            xmletree.prefixtag("tei", "langUsage"))
        if tei_langusage is not None:
            tei_languages = tei_langusage.findall(
                xmletree.prefixtag("tei", "language"))
        tei_textclass = tei_profiledesc.find(
            xmletree.prefixtag("tei", "textClass"))
        if tei_textclass is not None:
            tei_keywords = tei_textclass.findall(
                xmletree.prefixtag("tei", "keywords"))
            tei_classcodes = tei_textclass.findall(
                xmletree.prefixtag("tei", "classCode"))
    # language: collect @ident values; first one becomes the document language
    doc_language = None
    if tei_languages:
        languages = [tei_language.get("ident")
                     for tei_language in tei_languages]
        if languages:
            document["languages"] = languages
            doc_language = languages[0]
    # title
    document.update(get_tei_titles_to_metason(tei_titlestmt, doc_language))
    metajson_service.pretty_print_document(document)
    metajson_service.print_document(document)
    return document
def ddi_xmletree_to_metajson(ddi_root, source, rec_id_prefix):
    """DDI xmletree -> MetaJSON Document.

    Maps stdyDscr/citation/titlStmt/titl onto document["title"].
    Returns None when *ddi_root* is None.
    """
    if ddi_root is None:
        return None
    document = Document()
    document["rec_type"] = constants.DOC_TYPE_DATASETQUALI
    if source:
        document["rec_source"] = source
    # stdyDscr/citation/titlStmt/titl -> title.
    # Element.find() returns None for a missing child, so each level of the
    # path is guarded instead of chaining .find() calls on a possible None.
    ddi_stdydscr = ddi_root.find(xmletree.prefixtag("ddi", "stdyDscr"))
    if ddi_stdydscr is not None:
        ddi_citation = ddi_stdydscr.find(
            xmletree.prefixtag("ddi", "citation"))
        if ddi_citation is not None:
            ddi_titlstmt = ddi_citation.find(
                xmletree.prefixtag("ddi", "titlStmt"))
            if ddi_titlstmt is not None:
                ddi_titl = ddi_titlstmt.find(
                    xmletree.prefixtag("ddi", "titl"))
                if ddi_titl is not None:
                    document["title"] = ddi_titl.text
    return document
def load_dict(meta_dict):
    """Wrap a raw dict in the record class named by its "rec_class" field.

    Falls back to Common when "rec_class" is missing or its value is unknown
    (the unknown case is logged).
    """
    if "rec_class" not in meta_dict:
        return Common(meta_dict)
    # Dispatch table instead of a long if/elif chain.
    rec_classes = {
        "Document": Document,
        "Person": Person,
        "Orgunit": Orgunit,
        "Project": Project,
        "Event": Event,
        "Family": Family,
        "Field": Field,
        "Resource": Resource,
        "Target": Target,
        "Type": Type,
        "Collection": Collection,
    }
    rec_class = meta_dict["rec_class"]
    cls = rec_classes.get(rec_class)
    if cls is not None:
        return cls(meta_dict)
    logging.debug(jsonbson.dumps_bson(meta_dict))
    # Bug fix: the placeholder was "{O}" (letter O), which made str.format()
    # raise KeyError: 'O' instead of logging the unknown value.
    logging.warning("Unknown rec_class: {}".format(rec_class))
    return Common(meta_dict)
def tikadict_to_metajson(tikadict, source):
    """Convert an Apache Tika metadata dict into a MetaJSON Document
    carrying a single Resource."""
    document = Document()
    resource = Resource()
    if source:
        document["source"] = source
    # Document-level Tika properties -> Document fields.
    # dcterms:created -> document date_issued ou resource rec_created_date ?
    document_fields = (
        ("dc:title", "title"),
        ("dcterms:created", "date_issued"),
        ("xmpTPg:NPages", "extent_pages"),
    )
    for tika_key, doc_key in document_fields:
        if tika_key in tikadict:
            document[doc_key] = tikadict[tika_key]
    # File-level Tika properties -> Resource fields.
    # dcterms:modified -> resource rec_modified_date ?
    resource_fields = (
        ("dcterms:modified", "rec_modified_date"),
        ("producer", "processing_software_name"),
        ("Content-Type", "format_mimetype"),
    )
    for tika_key, res_key in resource_fields:
        if tika_key in tikadict:
            resource[res_key] = tikadict[tika_key]
    document["resources"] = [resource]
    logging.debug(document)
    return document
def biblfull_xmletree_to_metajson(biblfull, laboratories, projects, source):
    """TEI biblFull xmletree -> MetaJSON Document.

    Extracts the main TEI header statements and maps languages and titles
    onto a new Document. Returns None when *biblfull* is None.

    NOTE: this file defines biblfull_xmletree_to_metajson twice; at import
    time this later definition shadows the earlier one.
    """
    if biblfull is None:
        return None
    document = Document()
    # titleStmt
    tei_titlestmt = biblfull.find(xmletree.prefixtag("tei", "titleStmt"))
    # editionStmt (extracted but not yet mapped)
    tei_editionstmt = biblfull.find(xmletree.prefixtag("tei", "editionStmt"))
    # extent (extracted but not yet mapped)
    tei_extent = biblfull.find(xmletree.prefixtag("tei", "extent"))
    # publicationStmt (extracted but not yet mapped)
    tei_publicationstmt = biblfull.find(
        xmletree.prefixtag("tei", "publicationStmt"))
    # seriesStmt (extracted but not yet mapped)
    tei_seriesstmt = biblfull.find(xmletree.prefixtag("tei", "seriesStmt"))
    # notesStmt (extracted but not yet mapped)
    tei_notesstmt = biblfull.find(xmletree.prefixtag("tei", "notesStmt"))
    # sourceDesc (extracted but not yet mapped)
    tei_sourcedescs = biblfull.findall(xmletree.prefixtag("tei", "sourceDesc"))
    # profileDesc: Element.find() returns None when the child is missing,
    # so guard every step instead of crashing with AttributeError.
    tei_languages = []
    tei_keywords = []
    tei_classcodes = []
    tei_profiledesc = biblfull.find(xmletree.prefixtag("tei", "profileDesc"))
    if tei_profiledesc is not None:
        tei_langusage = tei_profiledesc.find(
            xmletree.prefixtag("tei", "langUsage"))
        if tei_langusage is not None:
            tei_languages = tei_langusage.findall(
                xmletree.prefixtag("tei", "language"))
        tei_textclass = tei_profiledesc.find(
            xmletree.prefixtag("tei", "textClass"))
        if tei_textclass is not None:
            tei_keywords = tei_textclass.findall(
                xmletree.prefixtag("tei", "keywords"))
            tei_classcodes = tei_textclass.findall(
                xmletree.prefixtag("tei", "classCode"))
    # language: collect @ident values; first one becomes the document language
    doc_language = None
    if tei_languages:
        languages = [tei_language.get("ident")
                     for tei_language in tei_languages]
        if languages:
            document["languages"] = languages
            doc_language = languages[0]
    # title
    document.update(get_tei_titles_to_metason(tei_titlestmt, doc_language))
    metajson_service.pretty_print_document(document)
    metajson_service.print_document(document)
    return document
def hyphe_webentity_to_metajson(webentity, source):
    """Convert a Hyphe web entity dict into a MetaJSON Document.

    Core identifiers and the title go on the Document itself; the raw Hyphe
    crawl/index state is copied verbatim under the "hyphe" sub-document.
    """
    document = Document()
    # Static record type for every Hyphe web entity
    document["rec_type"] = "WebEntity"
    if source:
        document["source"] = source
    # MetaJSON core fields
    document["rec_id"] = webentity["id"]
    # creation_date -> rec_created_date
    #document["rec_created_date"] = date_service.parse_timestamp(webentity["creation_date"])
    # last_modification_date -> rec_modified_date
    #document["rec_modified_date"] = date_service.parse_timestamp(webentity["last_modification_date"])
    document["title"] = webentity["name"]
    # Hyphe-specific fields, copied one-to-one
    hyphe = {"webentity_id": webentity["id"]}
    for field in ("crawling_status", "indexing_status", "status",
                  "lru_prefixes", "startpages"):
        hyphe[field] = webentity[field]
    # todo : tags to metajson specific
    #if webentity["tags"]:
    #    hyphe["tags"] = webentity["tags"]
    document["hyphe"] = hyphe
    return document
def mets_xmletree_to_metajson(mets, source, rec_id_prefix):
    """METS xmletree -> MetaJSON Document.

    Currently only the descriptive metadata sections (dmdSec) are converted;
    metsHdr, techMD, sourceMD, digiprovMD, fileGrp and structMap remain TODO.
    """
    document = Document()
    # source
    if source:
        document["rec_source"] = source
    # dmdSec -> dmds
    document["dmds"] = extract_dmdsecs(mets)
    return document
def openurl_xmletree_to_metajson_list(openurl_response, source, rec_id_prefix, only_first_record):
    """OpenURL resolver XML response -> list of MetaJSON Documents.

    Walks results/result, maps citation issn/eissn to identifiers and each
    linkGroup to a remote Resource. Stops after the first result when
    only_first_record is truthy. rec_id_prefix is currently unused.

    NOTE(review): this file defines openurl_xmletree_to_metajson_list twice;
    at import time the later definition shadows the earlier one.
    """
    documents = []
    if openurl_response is not None:
        #logging.debug(type(openurl_response))
        #logging.debug(openurl_response)
        # results
        openurl_results = openurl_response.find(
            xmletree.prefixtag("ssopenurl", "results"))
        if openurl_results is not None:
            # result
            openurl_result_list = openurl_results.findall(
                xmletree.prefixtag("ssopenurl", "result"))
            if openurl_result_list:
                for openurl_result in openurl_result_list:
                    document = Document()
                    if source:
                        document["source"] = source
                    # citation: carries the journal identifiers
                    openurl_citation = openurl_result.find(
                        xmletree.prefixtag("ssopenurl", "citation"))
                    if openurl_citation is not None:
                        # issn
                        openurl_issn = openurl_citation.find(
                            xmletree.prefixtag("ssopenurl", "issn"))
                        if openurl_issn is not None:
                            identifier_issn = Identifier()
                            identifier_issn["id_type"] = "issn"
                            identifier_issn["value"] = openurl_issn.text
                            document.add_item_to_key(identifier_issn,
                                                     "identifiers")
                        # eissn
                        openurl_eissn = openurl_citation.find(
                            xmletree.prefixtag("ssopenurl", "eissn"))
                        if openurl_eissn is not None:
                            identifier_eissn = Identifier()
                            identifier_eissn["id_type"] = "eissn"
                            identifier_eissn["value"] = openurl_eissn.text
                            document.add_item_to_key(identifier_eissn,
                                                     "identifiers")
                    # linkGroups: each linkGroup may become one Resource
                    openurl_linkgroups = openurl_result.find(
                        xmletree.prefixtag("ssopenurl", "linkGroups"))
                    if openurl_linkgroups is not None:
                        # linkGroup
                        openurl_linkgroup_list = openurl_linkgroups.findall(
                            xmletree.prefixtag("ssopenurl", "linkGroup"))
                        if openurl_linkgroup_list is not None:
                            for openurl_linkgroup in openurl_linkgroup_list:
                                service_name = None
                                institution_name = None
                                period_begin = None
                                period_end = None
                                url = None
                                # holdingData: provider / database / coverage
                                openurl_holdingdata = openurl_linkgroup.find(
                                    xmletree.prefixtag("ssopenurl", "holdingData"))
                                if openurl_holdingdata is not None:
                                    # institution_name
                                    openurl_providername = openurl_holdingdata.find(
                                        xmletree.prefixtag(
                                            "ssopenurl", "providerName"))
                                    if openurl_providername is not None:
                                        institution_name = openurl_providername.text
                                    # service_name
                                    openurl_databasename = openurl_holdingdata.find(
                                        xmletree.prefixtag(
                                            "ssopenurl", "databaseName"))
                                    if openurl_databasename is not None:
                                        service_name = openurl_databasename.text
                                    # normalizedData: coverage period
                                    openurl_normalizeddata = openurl_holdingdata.find(
                                        xmletree.prefixtag(
                                            "ssopenurl", "normalizedData"))
                                    if openurl_normalizeddata is not None:
                                        # startDate
                                        openurl_startdate = openurl_normalizeddata.find(
                                            xmletree.prefixtag(
                                                "ssopenurl", "startDate"))
                                        if openurl_startdate is not None:
                                            period_begin = openurl_startdate.text
                                        # endDate
                                        openurl_enddate = openurl_normalizeddata.find(
                                            xmletree.prefixtag(
                                                "ssopenurl", "endDate"))
                                        if openurl_enddate is not None:
                                            period_end = openurl_enddate.text
                                # url: last "journal" or "source" typed url wins
                                openurl_url_list = openurl_linkgroup.findall(
                                    xmletree.prefixtag("ssopenurl", "url"))
                                if openurl_url_list is not None:
                                    for openurl_url in openurl_url_list:
                                        if openurl_url.get(
                                                "type") == "journal":
                                            url = openurl_url.text
                                        elif openurl_url.get(
                                                "type") == "source":
                                            url = openurl_url.text
                                # only linkGroups that yielded a url become a Resource
                                if url:
                                    resource = Resource()
                                    resource["rec_type"] = "ResourceRemote"
                                    resource["rec_state"] = "published"
                                    resource["relation_type"] = "eResource"
                                    resource[
                                        "version_type"] = "publishedVersion"
                                    resource["access_rights"] = "closedAccess"
                                    resource["format_mimetype"] = "text/html"
                                    resource["url"] = url
                                    if service_name:
                                        resource["service_name"] = service_name
                                    if institution_name:
                                        resource[
                                            "institution_name"] = institution_name
                                    if period_begin:
                                        resource["period_begin"] = period_begin
                                    if period_end:
                                        resource["period_end"] = period_end
                                    document.add_item_to_key(
                                        resource, "resources")
                    documents.append(document)
                    if only_first_record:
                        break
    #logging.debug(jsonbson.dumps_json(documents))
    return documents
def csv_dict_reader_to_metasjon(csv_row, input_format, source, rec_id_prefix):
    """Convert one csv.DictReader row into a MetaJSON Document.

    Supports two input formats: FORMAT_CSV_SITPOL (web-entity rows with
    ";"-separated multi-valued cells) and FORMAT_CSV_METAJSON (qualitative
    dataset survey rows with French column headers). Unknown formats are
    logged as errors and yield a mostly empty Document.
    rec_id_prefix is currently unused.
    """
    document = Document()
    if source:
        document["rec_source"] = source
    if input_format == constants.FORMAT_CSV_SITPOL:
        #logging.debug("csv_dict_reader_to_metasjon type(csv_row): {}".format(type(csv_row)))
        #print csv_row
        document["title"] = csv_row["title"]
        # ";"-separated multi-valued cells -> lists (blank items dropped)
        classifications_sitpol = [x.strip() for x in csv_row["classifications_sitpol"].split(";") if x.strip()]
        if classifications_sitpol:
            document["classifications_sitpol"] = classifications_sitpol
        classifications_ddc = [x.strip() for x in csv_row["classifications_ddc"].split(";") if x.strip()]
        if classifications_ddc:
            document["classifications_ddc"] = classifications_ddc
        # publisher creators from the "creators@role=pbl" column
        formatted_names = [x.strip() for x in csv_row["creators@role=pbl"].split(";") if x.strip()]
        if formatted_names:
            #logging.debug("formatted_names: {}".format(formatted_names))
            creators = []
            for formatted_name in formatted_names:
                if formatted_name:
                    creator = creator_service.formatted_name_to_creator(formatted_name, None, "pbl")
                    if creator:
                        creators.append(creator)
            if creators:
                document["creators"] = creators
        # access date is "now": the row is assumed checked at import time
        document["date_last_accessed"] = datetime.now().isoformat()
        document["descriptions"] = [{"language":"fr", "value":csv_row["descriptions@lang=fr"]}]
        # keywords grouped by language code
        keywords_fr = [x.strip() for x in csv_row["keywords@lang=fr"].split(";") if x.strip()]
        keywords_en = [x.strip() for x in csv_row["keywords@lang=en"].split(";") if x.strip()]
        keywords = {}
        if keywords_fr:
            keywords["fr"] = keywords_fr
        if keywords_en:
            keywords["en"] = keywords_en
        if keywords:
            document["keywords"] = keywords
        document["languages"] = [x.strip() for x in csv_row["languages"].split(";") if x.strip()]
        note = csv_row["notes@lang=fr"]
        if note:
            document["notes"] = note
        document["publication_countries"] = [x.strip() for x in csv_row["publication_countries"].split(";") if x.strip()]
        if "rec_created_user" in csv_row:
            document["rec_created_user"] = csv_row["rec_created_user"]
        document["rec_type_cerimes"] = csv_row["rec_type_cerimes"]
        specific_agents = [x.strip() for x in csv_row["specific_agents"].split(";") if x.strip()]
        if specific_agents:
            document["specific_agents"] = specific_agents
        document["specific_actor_type"] = csv_row["specific_actor_type"]
        document["target_audiences_cerimes"] = csv_row["target_audiences_cerimes"]
        document["url"] = csv_row["url"]
        document["rec_type"] = constants.DOC_TYPE_WEBENTITY
        document["webentity_type"] = csv_row["webentity_type"]
    elif input_format == constants.FORMAT_CSV_METAJSON:
        document["rec_type"] = "DatasetQuali"
        creators = []
        if "Laboratoire d'inventaire" in csv_row:
            creators.append(creator_service.formatted_name_to_creator(csv_row["Laboratoire d'inventaire"], constants.REC_CLASS_ORGUNIT, "dpt"))
        document["title"] = csv_row["Titre de l'enquete"]
        # subjects are newline-separated in this format
        if "Sujet(s) de l'enquete" in csv_row:
            document["keywords"] = [x.strip() for x in csv_row["Sujet(s) de l'enquete"].split("\n") if x.strip()]
        if "Nom Auteur 1" in csv_row:
            # NOTE(review): name_familly / name_given / affiliation are read
            # from the row but never added to creators or the document —
            # looks like an unfinished author mapping; confirm with the author.
            name_familly = csv_row["Nom Auteur 1"]
            name_given = affiliation = ""
            if "Prenom Auteur 1" in csv_row:
                name_given = csv_row["Prenom Auteur 1"]
            if "Affiliation Auteur 1" in csv_row:
                affiliation = csv_row["Affiliation Auteur 1"]
        document["creators"] = creators
    else:
        logging.error("Unknown input_format: {}".format(input_format))
    logging.info(jsonbson.dumps_json(document, True))
    return document
def ris_txt_lines_to_metajson_list(txt_lines, source, rec_id_prefix, only_first_record):
    """Generator: parse RIS-formatted text lines, yielding one MetaJSON
    Document per record.

    A record starts at RIS_KEY_BEGIN (carrying the RIS document type) and is
    yielded at RIS_KEY_END. Values ending in "/" (except for Y1/PY date tags)
    are treated as continued on the next physical line.
    rec_id_prefix and only_first_record are currently unused here.
    """
    document = None
    ris_type = None
    rec_type = None
    is_part_of_rec_type = None
    # continuation state for multi-line values
    previous_key = None
    previous_value = None
    for line in txt_lines:
        if line:
            line = line.rstrip('\r\n')
            #logging.debug("line: {}".format(line))
            # multi line management
            if previous_key:
                # this line continues the previous tag's value
                key = previous_key
                value = previous_value + line
                previous_key = None
                previous_value = None
            else:
                # RIS layout: 2-char tag, then "  - ", value from column 6
                key = line[:2].strip()
                value = line[6:].strip()
                if value.endswith("/") and key not in ["Y1", "PY"]:
                    #logging.debug("multi line")
                    previous_key = key
                    previous_value = value.rstrip('/')
                    continue
            if key is None or len(key) == 0:
                # empty line -> continue
                #logging.debug("empty line")
                continue
            elif key == RIS_KEY_BEGIN:
                # record begin with document type -> create document
                # init
                document = Document()
                is_part_of_rec_type = None
                if source:
                    document["rec_source"] = source
                ris_type = value
                rec_type = ris_document_type_to_metajson_document_type[
                    ris_type]
                document["rec_type"] = rec_type
                # some types imply a container document (e.g. article in journal)
                if ris_type in ris_document_type_to_metajson_document_is_part_of_type:
                    is_part_of_rec_type = ris_document_type_to_metajson_document_is_part_of_type[
                        ris_type]
                    is_part_of = Document()
                    is_part_of["rec_type"] = is_part_of_rec_type
                    document["is_part_ofs"] = [is_part_of]
            elif key == RIS_KEY_END:
                # record end -> return the result
                # verify the is_part_ofs[0]["title"]: promote the abbreviated
                # title when no full container title was provided
                if "is_part_ofs" in document and "title" not in document[
                        "is_part_ofs"][0] and "title_abbreviateds" in document[
                            "is_part_ofs"][0]:
                    document["is_part_ofs"][0]["title"] = document[
                        "is_part_ofs"][0]["title_abbreviateds"][0]["title"]
                    del document["is_part_ofs"][0]["title_abbreviateds"]
                logging.info("# RIS type: {}".format(ris_type))
                metajson_service.pretty_print_document(document)
                yield document
            else:
                # process key value
                #logging.debug("key: {}; value: {}".format(key, value))
                if key == "ID":
                    document["rec_id"] = value
                elif key in [
                        "T1", "TI", "CT"
                ] or (key == "BT" and ris_type in [RIS_TYPE_BOOK,
                                                   RIS_TYPE_UNPB]):
                    # Title Primary -> title
                    document["title"] = value
                elif key in [
                        "JF", "JO"
                ] or (key == "BT" and ris_type not in [RIS_TYPE_BOOK,
                                                       RIS_TYPE_UNPB]):
                    # Title Secondary -> is_part_of["title"]
                    document.add_is_part_of_title(value)
                elif key in ["JA", "J1", "J2", "T2"]:
                    # Title Secondary -> is_part_of["title_abbreviateds"][0]["title"]
                    document.add_is_part_of_title_abbreviated(value)
                elif key == "T3":
                    # Title Series
                    document.add_series_title(value)
                elif key in ["A1", "AU"]:
                    # primary authors
                    document.add_creator(
                        creator_service.formatted_name_to_creator(
                            value, None, "aut"))
                elif key in ["A2", "ED"]:
                    # editors: attached to the container when one exists
                    if is_part_of_rec_type:
                        document.add_is_part_of_creator(
                            creator_service.formatted_name_to_creator(
                                value, None, "edt"))
                    else:
                        document.add_creator(
                            creator_service.formatted_name_to_creator(
                                value, None, "edt"))
                elif key == "A3":
                    # series authors
                    document.add_series_creator(
                        creator_service.formatted_name_to_creator(
                            value, None, "aut"))
                elif key == "A4":
                    # subsidiary authors -> contributors
                    document.add_creator(
                        creator_service.formatted_name_to_creator(
                            value, None, "ctb"))
                elif key in ["PY", "Y1", "DA"]:
                    index_slash = value.find("/")
                    if index_slash != -1:
                        # YYYY/MM/DD/other info (like season)
                        # todo
                        document["date_issued"] = value.strip("/")
                    else:
                        document["date_issued"] = value
                elif key == "SP":
                    document["part_page_begin"] = value
                elif key == "EP":
                    document["part_page_end"] = value
                elif key == "VL":
                    document["part_volume"] = value
                elif key in ["IS", "CP"]:
                    document["part_issue"] = value
                elif key in ["AB", "N2"]:
                    # abstract, language undetermined
                    document["descriptions"] = [{
                        "language": "und",
                        "value": value
                    }]
                elif key == "N1":
                    document["notes"] = [{"language": "und", "value": value}]
                elif key == "PB":
                    document.add_item_to_key(value, "publishers")
                elif key == "CY":
                    document.add_item_to_key(value, "publication_places")
                elif key == "RP":
                    document["publication_status"] = value
                elif key == "ET":
                    document["edition"] = value
                elif key == "UR":
                    resource = Resource()
                    resource["url"] = value
                    document.add_item_to_key(resource, "resources")
                elif key == "AN":
                    # Accession Number
                    identifier = metajson_service.create_identifier(
                        "accessionnumber", value)
                    document.add_identifier(identifier)
                elif key == "CN":
                    # Call Number
                    identifier = metajson_service.create_identifier(
                        "callnumber", value)
                    document.add_identifier(identifier)
                elif key == "DO":
                    # DOI
                    identifier = metajson_service.create_identifier(
                        "doi", value)
                    document.add_identifier(identifier)
                elif key == "SN":
                    # ISBN or ISSN ? decided from the record type
                    id_type = None
                    if rec_type in [
                            constants.DOC_TYPE_JOURNALARTICLE,
                            constants.DOC_TYPE_MAGAZINEARTICLE,
                            constants.DOC_TYPE_NEWSPAPERARTICLE,
                            constants.DOC_TYPE_JOURNAL
                    ]:
                        id_type = "issn"
                    else:
                        id_type = "isbn"
                    identifier = metajson_service.create_identifier(
                        id_type, value)
                    # the serial identifier belongs to the container when one exists
                    if is_part_of_rec_type is None:
                        document.add_identifier(identifier)
                    else:
                        document["is_part_ofs"][0].add_identifier(identifier)
                elif key == "CA":
                    document["caption"] = value
                elif key == "DB":
                    # Name of Database -> rec_source ?
                    document["rec_source"] = value
                elif key == "DP":
                    # Database Provider -> rec_source ?
                    document["rec_source"] = value
                elif key == "KW":
                    # keywords accumulate under the "und" language bucket
                    if "keywords" not in document:
                        document["keywords"] = {"und": []}
                    document["keywords"]["und"].append(value)
                else:
                    logging.debug("Not managed key: {} with value: {}".format(
                        key, value))
def openurl_xmletree_to_metajson_list(openurl_response, source, rec_id_prefix, only_first_record):
    """OpenURL resolver XML response -> list of MetaJSON Documents.

    Walks results/result, maps citation issn/eissn to identifiers and each
    linkGroup to a remote Resource. Stops after the first result when
    only_first_record is truthy. rec_id_prefix is currently unused.

    NOTE(review): this file defines openurl_xmletree_to_metajson_list twice;
    at import time this later definition shadows the earlier one.
    """
    documents = []
    if openurl_response is not None:
        #logging.debug(type(openurl_response))
        #logging.debug(openurl_response)
        # results
        openurl_results = openurl_response.find(xmletree.prefixtag("ssopenurl", "results"))
        if openurl_results is not None:
            # result
            openurl_result_list = openurl_results.findall(xmletree.prefixtag("ssopenurl", "result"))
            if openurl_result_list:
                for openurl_result in openurl_result_list:
                    document = Document()
                    if source:
                        document["source"] = source
                    # citation: carries the journal identifiers
                    openurl_citation = openurl_result.find(xmletree.prefixtag("ssopenurl", "citation"))
                    if openurl_citation is not None:
                        # issn
                        openurl_issn = openurl_citation.find(xmletree.prefixtag("ssopenurl", "issn"))
                        if openurl_issn is not None:
                            identifier_issn = Identifier()
                            identifier_issn["id_type"] = "issn"
                            identifier_issn["value"] = openurl_issn.text
                            document.add_item_to_key(identifier_issn, "identifiers")
                        # eissn
                        openurl_eissn = openurl_citation.find(xmletree.prefixtag("ssopenurl", "eissn"))
                        if openurl_eissn is not None:
                            identifier_eissn = Identifier()
                            identifier_eissn["id_type"] = "eissn"
                            identifier_eissn["value"] = openurl_eissn.text
                            document.add_item_to_key(identifier_eissn, "identifiers")
                    # linkGroups: each linkGroup may become one Resource
                    openurl_linkgroups = openurl_result.find(xmletree.prefixtag("ssopenurl", "linkGroups"))
                    if openurl_linkgroups is not None:
                        # linkGroup
                        openurl_linkgroup_list = openurl_linkgroups.findall(xmletree.prefixtag("ssopenurl", "linkGroup"))
                        if openurl_linkgroup_list is not None:
                            for openurl_linkgroup in openurl_linkgroup_list:
                                service_name = None
                                institution_name = None
                                period_begin = None
                                period_end = None
                                url = None
                                # holdingData: provider / database / coverage
                                openurl_holdingdata = openurl_linkgroup.find(xmletree.prefixtag("ssopenurl", "holdingData"))
                                if openurl_holdingdata is not None:
                                    # institution_name
                                    openurl_providername = openurl_holdingdata.find(xmletree.prefixtag("ssopenurl", "providerName"))
                                    if openurl_providername is not None:
                                        institution_name = openurl_providername.text
                                    # service_name
                                    openurl_databasename = openurl_holdingdata.find(xmletree.prefixtag("ssopenurl", "databaseName"))
                                    if openurl_databasename is not None:
                                        service_name = openurl_databasename.text
                                    # normalizedData: coverage period
                                    openurl_normalizeddata = openurl_holdingdata.find(xmletree.prefixtag("ssopenurl", "normalizedData"))
                                    if openurl_normalizeddata is not None:
                                        # startDate
                                        openurl_startdate = openurl_normalizeddata.find(xmletree.prefixtag("ssopenurl", "startDate"))
                                        if openurl_startdate is not None:
                                            period_begin = openurl_startdate.text
                                        # endDate
                                        openurl_enddate = openurl_normalizeddata.find(xmletree.prefixtag("ssopenurl", "endDate"))
                                        if openurl_enddate is not None:
                                            period_end = openurl_enddate.text
                                # url: last "journal" or "source" typed url wins
                                openurl_url_list = openurl_linkgroup.findall(xmletree.prefixtag("ssopenurl", "url"))
                                if openurl_url_list is not None:
                                    for openurl_url in openurl_url_list:
                                        if openurl_url.get("type") == "journal":
                                            url = openurl_url.text
                                        elif openurl_url.get("type") == "source":
                                            url = openurl_url.text
                                # only linkGroups that yielded a url become a Resource
                                if url:
                                    resource = Resource()
                                    resource["rec_type"] = "ResourceRemote"
                                    resource["rec_state"] = "published"
                                    resource["relation_type"] = "eResource"
                                    resource["version_type"] = "publishedVersion"
                                    resource["access_rights"] = "closedAccess"
                                    resource["format_mimetype"] = "text/html"
                                    resource["url"] = url
                                    if service_name:
                                        resource["service_name"] = service_name
                                    if institution_name:
                                        resource["institution_name"] = institution_name
                                    if period_begin:
                                        resource["period_begin"] = period_begin
                                    if period_end:
                                        resource["period_end"] = period_end
                                    document.add_item_to_key(resource, "resources")
                    documents.append(document)
                    if only_first_record:
                        break
    #logging.debug(jsonbson.dumps_json(documents))
    return documents
def endnotexml_record_to_metajson(record, source, rec_id_prefix): document = Document() # TODO # translated_creators: /contributors/translated-authors/author/style # auth_address: /auth-address/style # label: /label/style # custom1 # Extract endnote properties rec_id = record.find("rec-number").text endnote_type = record.find("ref-type").text rec_type = endnote_record_type_to_metajson_document_type[endnote_type] primary_creators = extract_creators(None, "aut", record, "./contributors/authors/author/style") secondary_creators = extract_creators( None, "pbd", record, "./contributors/secondary-authors/author/style") if endnote_type in [TYPE_BOOK, TYPE_BOOK_SECTION]: tertiary_creators = extract_creators( None, "pbd", record, "./contributors/tertiary-authors/author/style") elif endnote_type == TYPE_THESIS: tertiary_creators = extract_creators( None, "ths", record, "./contributors/tertiary-authors/author/style") elif endnote_type == TYPE_FILM_OR_BROADCAST: tertiary_creators = extract_creators( None, "pro", record, "./contributors/tertiary-authors/author/style") if endnote_type in [TYPE_BOOK, TYPE_BOOK_SECTION]: subsidiary_creators = extract_creators( None, "trl", record, "./contributors/subsidiary-authors/author/style") elif endnote_type == TYPE_FILM_OR_BROADCAST: subsidiary_creators = extract_creators( None, "act", record, "./contributors/subsidiary-authors/author/style") translated_creators = extract_creators( None, "trl", record, "./contributors/translated-authors/author/style") auth_address = extract_text(record, "./auth-address/style") title = extract_text(record, "./titles/title/style") title_secondary = extract_text(record, "./titles/secondary-title/style") title_tertiary = extract_text(record, "./titles/tertiary-title/style") title_alternative = extract_text(record, "./titles/alt-title/style") title_abbreviated = extract_text(record, "./titles/short-title/style") title_translated = extract_text(record, "./titles/translated-title/style") pages = 
extract_text(record, "./pages/style") part_volume = extract_text(record, "./volume/style") part_number = extract_text(record, "./number/style") extent_volumes = extract_text(record, "./num-vols/style") edition = extract_text(record, "./edition/style") part_section = extract_text(record, "./section/style") reprint_edition = extract_text(record, "./reprint-edition/style") keywords = extract_text(record, "./keywords/keyword/style") date_year = extract_text(record, "./dates/year/style") date_pub = extract_text(record, "./dates/pub-dates/date/style") publication_places_formatted = extract_text(record, "./pub-location/style") publishers_formatted = extract_text(record, "./publisher/style") orig_pub = extract_text(record, "./orig-pub/style") isbn_or_issn = extract_text(record, "./isbn/style") accessionnumber = extract_text(record, "./accession-num/style") callnumber = extract_text(record, "./call-num/style") if endnote_type == TYPE_WEB_PAGE: abstract = extract_text(record, "./pages/style") else: abstract = extract_text(record, "./abstract/style") label = extract_text(record, "./label/style") caption = extract_text(record, "./caption/style") note = extract_text(record, "./notes/style") reviewed_item = extract_text(record, "./reviewed-item/style") rec_type_description = extract_text(record, "./work-type/style") url = extract_text(record, "./urls/related-urls/url/style") custom1 = extract_text(record, "./custom1/style") custom2 = extract_text(record, "./custom2/style") custom3 = extract_text(record, "./custom3/style") custom4 = extract_text(record, "./custom4/style") custom5 = extract_text(record, "./custom5/style") custom6 = extract_text(record, "./custom6/style") custom7 = extract_text(record, "./custom7/style") doi = extract_text(record, "./electronic-resource-num/style") remote_database_name = extract_text(record, "./remote-database-name/style") remote_database_provider = extract_text( record, "./remote-database-provider/style") research_notes = extract_text(record, 
"./research-notes/style") language = extract_text(record, "./language/style") access_date = extract_text(record, "./access-date/style") # rec_id, rec_source document["rec_id"] = rec_id if source: document["rec_source"] = source # publishers_formatted, publication_places_formatted publishers = None publication_places = None if publishers_formatted: publishers = publishers_formatted.split("\r") if publication_places_formatted: publication_places = publication_places_formatted.split("\r") # type, is_part_of.type, is_part_of.is_part_of.type try: is_part_of_type = endnote_record_type_to_metajson_document_is_part_of_type[ endnote_type] except: is_part_of_type = None is_part_of_is_part_of_type = None if title_secondary is not None: if endnote_type == TYPE_FIGURE: # how to determine the is_part_of type ? # if there is a volume or an issue number, it's a JournalArticle, else it's a Book or BookPart if part_volume is not None or part_number is not None: is_part_of_type = constants.DOC_TYPE_JOURNALARTICLE is_part_of_is_part_of_type = constants.DOC_TYPE_JOURNAL else: if title_translated is not None: is_part_of_type = constants.DOC_TYPE_BOOKPART is_part_of_is_part_of_type = constants.DOC_TYPE_BOOK else: is_part_of_type = constants.DOC_TYPE_BOOK elif endnote_type == TYPE_FILM_OR_BROADCAST: rec_type = constants.DOC_TYPE_VIDEOPART is_part_of_type = constants.DOC_TYPE_VIDEORECORDING document["rec_type"] = rec_type document.set_key_if_not_none("rec_type_description", rec_type_description) # issn or isbn ? 
if is_part_of_type in [ constants.DOC_TYPE_JOURNAL, constants.DOC_TYPE_NEWSPAPER, constants.DOC_TYPE_MAGAZINE ]: isbn_or_issn_type = "issn" else: isbn_or_issn_type = "isbn" # is_part_of, is_part_of.is_part_of if is_part_of_type is not None and title_secondary: is_part_of = Document() is_part_of.set_key_if_not_none("rec_type", is_part_of_type) is_part_of.set_key_if_not_none("title", title_secondary) if is_part_of_is_part_of_type is not None: # is_part_of in case of is_part_of.is_part_of # creators with role aut is_part_of.add_creators( creator_service.change_contibutors_role( secondary_creators, "aut")) # is_part_of.is_part_of is_part_of_is_part_of = Document() is_part_of_is_part_of.set_key_if_not_none( "rec_type", is_part_of_is_part_of_type) is_part_of_is_part_of.set_key_if_not_none("title", title_translated) # creators with role pbd is_part_of_is_part_of.add_creators( creator_service.change_contibutors_role( translated_creators, "pbd")) #is_part_of_is_part_of.set_key_if_not_none("date_issued",date_year) is_part_of_is_part_of.set_key_if_not_none("publishers", publishers) is_part_of_is_part_of.set_key_if_not_none("publication_places", publication_places) if isbn_or_issn: is_part_of["identifiers"] = [ metajson_service.create_identifier(isbn_or_issn_type, isbn_or_issn) ] is_part_of.add_items_to_key([is_part_of_is_part_of], "is_part_ofs") else: # is_part_of in case of no is_part_of.is_part_of # creators with role edt is_part_of.add_creators(secondary_creators) #is_part_of.set_key_if_not_none("date_issued",date_year) is_part_of.set_key_if_not_none("publishers", publishers) is_part_of.set_key_if_not_none("publication_places", publication_places) if isbn_or_issn: is_part_of["identifiers"] = [ metajson_service.create_identifier(isbn_or_issn_type, isbn_or_issn) ] if "title" in is_part_of and is_part_of["title"]: document.add_items_to_key([is_part_of], "is_part_ofs") else: if isbn_or_issn: document["identifiers"] = [ metajson_service.create_identifier(isbn_or_issn_type, 
isbn_or_issn) ] if publishers: if endnote_type == TYPE_THESIS: document.add_creators([ creator_service.formatted_name_to_creator( publishers[0], "orgunit", "dgg") ]) elif endnote_type == TYPE_FILM_OR_BROADCAST: document.add_creators([ creator_service.formatted_name_to_creator( publishers[0], "orgunit", "dst") ]) else: document.set_key_if_not_none("publishers", publishers) document.set_key_if_not_none("publication_places", publication_places) # seriess[] if endnote_type in [TYPE_BOOK, TYPE_BOOK_SECTION]: series = Document() series["rec_type"] = constants.DOC_TYPE_SERIES if endnote_type == TYPE_BOOK and title_secondary: series.set_key_if_not_none("title", title_secondary) series.add_creators(secondary_creators) series.set_key_if_not_none("part_volume", part_number) if endnote_type == TYPE_BOOK_SECTION and title_tertiary: series.set_key_if_not_none("title", title_tertiary) series.add_creators(tertiary_creators) series.set_key_if_not_none("part_volume", part_number) if "title" in series and len(series) > 2: document.add_items_to_key([series], "seriess") # originals[] if (reprint_edition or title_translated or orig_pub) and endnote_type in [ TYPE_BOOK, TYPE_BOOK_SECTION, TYPE_JOURNAL_ARTICLE, TYPE_FILM_OR_BROADCAST ]: original_title = None original_is_part_of = None if reprint_edition: original_title = reprint_edition if title_translated and is_part_of_is_part_of_type is None: original_title = title_translated if orig_pub: if is_part_of_type is not None: original_is_part_of = Document() original_is_part_of["rec_type"] = is_part_of_type original_is_part_of["title"] = orig_pub else: original_title = orig_pub if original_title: original = Document() original["rec_type"] = rec_type original.set_key_if_not_none("title", original_title) original.add_item_to_key(original_is_part_of, "is_part_ofs") document.add_item_to_key(original, "originals") # is_review_ofs[] if reviewed_item and endnote_type in [ TYPE_BOOK_SECTION, TYPE_JOURNAL_ARTICLE ]: is_review_ofs = Document() 
is_review_ofs.set_key_if_not_none("title", reviewed_item) is_review_ofs.set_key_if_not_none("rec_type", "Book") document.add_items_to_key([is_review_ofs], "is_review_ofs") # descriptions[0].value if abstract: document["descriptions"] = [{"value": abstract, "language": "und"}] # archive if endnote_type == TYPE_FIGURE and remote_database_provider: archive = Document() archive["title"] = remote_database_provider document.add_items_to_key([archive], "archive") # caption document.set_key_if_not_none("caption", caption) # creators[] document.add_creators(primary_creators) if endnote_type in [TYPE_BOOK, TYPE_THESIS, TYPE_FILM_OR_BROADCAST]: document.add_creators(tertiary_creators) if endnote_type in [TYPE_BOOK, TYPE_BOOK_SECTION, TYPE_FILM_OR_BROADCAST]: document.add_creators(subsidiary_creators) if custom4: document.add_creators( endnote_authors_to_creators(custom4, "person", "rev")) if endnote_type == TYPE_FIGURE and remote_database_name: document.add_creators( endnote_authors_to_creators(remote_database_name, None, "cph")) # edition if endnote_type in [ TYPE_BOOK, TYPE_BOOK_SECTION, TYPE_FILM_OR_BROADCAST, TYPE_WEB_PAGE ] and edition: document["edition"] = edition # extent_pages, extent_volumes if endnote_type in [TYPE_BOOK, TYPE_THESIS] and pages: document["extent_pages"] = pages.replace("p.", "").strip() if endnote_type in [TYPE_BOOK, TYPE_BOOK_SECTION] and extent_volumes: document["extent_volumes"] = extent_volumes # date_issued, date_issued_first if date_year: date_issued = "" date_issued_first = "" orig_index_begin = date_year.find("[") orig_index_end = date_year.find("]") if orig_index_begin != -1 and orig_index_end != -1: date_issued_first = date_year[orig_index_begin + 1:orig_index_end] date_issued = date_year.replace("[" + date_issued_first + "]", "").strip() else: date_issued = date_year.strip() if "is_part_ofs" in document: if document["is_part_ofs"][0][ "rec_type"] == constants.DOC_TYPE_BOOK: document["is_part_ofs"][0].set_key_if_not_none( "date_issued", 
date_issued) document["is_part_ofs"][0].set_key_if_not_none( "date_issued_first", date_issued_first) elif "is_part_ofs" in document["is_part_ofs"][0] and document[ "is_part_ofs"][0]["is_part_ofs"][0][ "rec_type"] == constants.DOC_TYPE_BOOK: document["is_part_ofs"][0]["is_part_ofs"][ 0].set_key_if_not_none("date_issued", date_issued) document["is_part_ofs"][0]["is_part_ofs"][ 0].set_key_if_not_none("date_issued_first", date_issued_first) else: document.set_key_if_not_none("date_issued_first", date_issued_first) if rec_type in unpublished_types: document.set_key_if_not_none("date_created", date_issued) else: document.set_key_if_not_none("date_issued", date_issued) else: document.set_key_if_not_none("date_issued_first", date_issued_first) if rec_type in unpublished_types: document.set_key_if_not_none("date_created", date_issued) else: document.set_key_if_not_none("date_issued", date_issued) # identifiers[] identifiers = [] if accessionnumber: identifiers.append( metajson_service.create_identifier("accessionnumber", accessionnumber)) if callnumber: identifiers.append( metajson_service.create_identifier("callnumber", callnumber)) if doi: identifiers.append(metajson_service.create_identifier("doi", doi)) if identifiers: document["identifiers"] = identifiers # language if language: rfc5646 = language_service.convert_unknown_format_to_rfc5646(language) if rfc5646: document["languages"] = [rfc5646] # note if endnote_import_note and note: document.set_key_with_value_type_in_list("notes", note, "general") if endnote_import_research_note and research_notes: document.set_key_with_value_type_in_list("notes", research_notes, "user") # part_page_begin & part_page_end if endnote_type in [TYPE_BOOK_SECTION, TYPE_FIGURE, TYPE_JOURNAL_ARTICLE ] and pages: hyphen_index = pages.find("-") if hyphen_index == -1: document["part_page_begin"] = pages.replace("p.", "").strip() else: document["part_page_begin"] = pages[:hyphen_index].replace( "p.", "").strip() document["part_page_end"] = 
pages[hyphen_index + 1:].replace( "p.", "").strip() if endnote_type in [TYPE_JOURNAL_ARTICLE]: document.set_key_if_not_none("part_issue", part_number) elif endnote_type in [TYPE_FIGURE]: document.set_key_if_not_none("part_number", part_number) document.set_key_if_not_none("part_section", part_section) document.set_key_if_not_none("part_volume", part_volume) # resources[0] if url is not None: resource = Resource() resource["rec_type"] = "ResourceRemote" resource.set_key_if_not_none("url", url) if endnote_type == TYPE_WEB_PAGE: resource.set_key_if_not_none("date_last_accessed", part_number) else: resource.set_key_if_not_none("date_last_accessed", access_date) document["resources"] = [resource] # subjects[] if endnote_import_keywords and keywords: for keyword in keywords.split(): document.set_key_with_value_type_in_list("subjects", keyword, "topic") # title, title_alternative, title_abbreviated, title_translated document["title"] = title if title_alternative: document["title_alternatives"] = [{"title": title_alternative}] if title_abbreviated: document["title_abbreviateds"] = [{"title": title_abbreviated}] #logging.debug("# endnote_type: {}".format(endnote_type)) metajson_service.pretty_print_document(document) return document
def endnotexml_record_to_metajson(record, source, rec_id_prefix):
    """EndNote XML record xmletree -> MetaJSON Document.

    Args:
        record: xmletree element for one EndNote <record>.
        source: value stored as document["rec_source"] when truthy.
        rec_id_prefix: currently unused (kept for signature compatibility
            with the other *_to_metajson converters in this module).

    Returns:
        A populated Document.
    """
    document = Document()

    # TODO
    # translated_creators: /contributors/translated-authors/author/style
    # auth_address: /auth-address/style
    # label: /label/style
    # custom1

    # Extract endnote properties
    rec_id = record.find("rec-number").text
    endnote_type = record.find("ref-type").text
    rec_type = endnote_record_type_to_metajson_document_type[endnote_type]

    primary_creators = extract_creators(None, "aut", record, "./contributors/authors/author/style")
    secondary_creators = extract_creators(None, "pbd", record, "./contributors/secondary-authors/author/style")

    # Tertiary / subsidiary creator roles depend on the EndNote type.
    # Defensive: default to None so an unexpected type can never leave
    # these names unbound for the guarded add_creators() calls below.
    tertiary_creators = None
    subsidiary_creators = None
    if endnote_type in [TYPE_BOOK, TYPE_BOOK_SECTION]:
        tertiary_creators = extract_creators(None, "pbd", record, "./contributors/tertiary-authors/author/style")
    elif endnote_type == TYPE_THESIS:
        tertiary_creators = extract_creators(None, "ths", record, "./contributors/tertiary-authors/author/style")
    elif endnote_type == TYPE_FILM_OR_BROADCAST:
        tertiary_creators = extract_creators(None, "pro", record, "./contributors/tertiary-authors/author/style")
    if endnote_type in [TYPE_BOOK, TYPE_BOOK_SECTION]:
        subsidiary_creators = extract_creators(None, "trl", record, "./contributors/subsidiary-authors/author/style")
    elif endnote_type == TYPE_FILM_OR_BROADCAST:
        subsidiary_creators = extract_creators(None, "act", record, "./contributors/subsidiary-authors/author/style")
    translated_creators = extract_creators(None, "trl", record, "./contributors/translated-authors/author/style")

    auth_address = extract_text(record, "./auth-address/style")
    title = extract_text(record, "./titles/title/style")
    title_secondary = extract_text(record, "./titles/secondary-title/style")
    title_tertiary = extract_text(record, "./titles/tertiary-title/style")
    title_alternative = extract_text(record, "./titles/alt-title/style")
    title_abbreviated = extract_text(record, "./titles/short-title/style")
    title_translated = extract_text(record, "./titles/translated-title/style")
    pages = extract_text(record, "./pages/style")
    part_volume = extract_text(record, "./volume/style")
    part_number = extract_text(record, "./number/style")
    extent_volumes = extract_text(record, "./num-vols/style")
    edition = extract_text(record, "./edition/style")
    part_section = extract_text(record, "./section/style")
    reprint_edition = extract_text(record, "./reprint-edition/style")
    keywords = extract_text(record, "./keywords/keyword/style")
    date_year = extract_text(record, "./dates/year/style")
    date_pub = extract_text(record, "./dates/pub-dates/date/style")
    publication_places_formatted = extract_text(record, "./pub-location/style")
    publishers_formatted = extract_text(record, "./publisher/style")
    orig_pub = extract_text(record, "./orig-pub/style")
    isbn_or_issn = extract_text(record, "./isbn/style")
    accessionnumber = extract_text(record, "./accession-num/style")
    callnumber = extract_text(record, "./call-num/style")
    # EndNote "Web Page" records store the abstract in the pages field.
    if endnote_type == TYPE_WEB_PAGE:
        abstract = extract_text(record, "./pages/style")
    else:
        abstract = extract_text(record, "./abstract/style")
    label = extract_text(record, "./label/style")
    caption = extract_text(record, "./caption/style")
    note = extract_text(record, "./notes/style")
    reviewed_item = extract_text(record, "./reviewed-item/style")
    rec_type_description = extract_text(record, "./work-type/style")
    url = extract_text(record, "./urls/related-urls/url/style")
    custom1 = extract_text(record, "./custom1/style")
    custom2 = extract_text(record, "./custom2/style")
    custom3 = extract_text(record, "./custom3/style")
    custom4 = extract_text(record, "./custom4/style")
    custom5 = extract_text(record, "./custom5/style")
    custom6 = extract_text(record, "./custom6/style")
    custom7 = extract_text(record, "./custom7/style")
    doi = extract_text(record, "./electronic-resource-num/style")
    remote_database_name = extract_text(record, "./remote-database-name/style")
    remote_database_provider = extract_text(record, "./remote-database-provider/style")
    research_notes = extract_text(record, "./research-notes/style")
    language = extract_text(record, "./language/style")
    access_date = extract_text(record, "./access-date/style")

    # rec_id, rec_source
    document["rec_id"] = rec_id
    if source:
        document["rec_source"] = source

    # publishers_formatted, publication_places_formatted
    # EndNote separates multiple values with carriage returns.
    publishers = None
    publication_places = None
    if publishers_formatted:
        publishers = publishers_formatted.split("\r")
    if publication_places_formatted:
        publication_places = publication_places_formatted.split("\r")

    # type, is_part_of.type, is_part_of.is_part_of.type
    # FIX: was a bare "except:" — only a missing mapping (KeyError) is expected.
    try:
        is_part_of_type = endnote_record_type_to_metajson_document_is_part_of_type[endnote_type]
    except KeyError:
        is_part_of_type = None
    is_part_of_is_part_of_type = None
    if title_secondary is not None:
        if endnote_type == TYPE_FIGURE:
            # how to determine the is_part_of type ?
            # if there is a volume or an issue number, it's a JournalArticle, else it's a Book or BookPart
            if part_volume is not None or part_number is not None:
                is_part_of_type = constants.DOC_TYPE_JOURNALARTICLE
                is_part_of_is_part_of_type = constants.DOC_TYPE_JOURNAL
            else:
                if title_translated is not None:
                    is_part_of_type = constants.DOC_TYPE_BOOKPART
                    is_part_of_is_part_of_type = constants.DOC_TYPE_BOOK
                else:
                    is_part_of_type = constants.DOC_TYPE_BOOK
        elif endnote_type == TYPE_FILM_OR_BROADCAST:
            rec_type = constants.DOC_TYPE_VIDEOPART
            is_part_of_type = constants.DOC_TYPE_VIDEORECORDING

    document["rec_type"] = rec_type
    document.set_key_if_not_none("rec_type_description", rec_type_description)

    # issn or isbn ?
    if is_part_of_type in [constants.DOC_TYPE_JOURNAL, constants.DOC_TYPE_NEWSPAPER, constants.DOC_TYPE_MAGAZINE]:
        isbn_or_issn_type = "issn"
    else:
        isbn_or_issn_type = "isbn"

    # is_part_of, is_part_of.is_part_of
    if is_part_of_type is not None and title_secondary:
        is_part_of = Document()
        is_part_of.set_key_if_not_none("rec_type", is_part_of_type)
        is_part_of.set_key_if_not_none("title", title_secondary)
        if is_part_of_is_part_of_type is not None:
            # is_part_of in case of is_part_of.is_part_of
            # creators with role aut
            is_part_of.add_creators(creator_service.change_contibutors_role(secondary_creators, "aut"))
            # is_part_of.is_part_of
            is_part_of_is_part_of = Document()
            is_part_of_is_part_of.set_key_if_not_none("rec_type", is_part_of_is_part_of_type)
            is_part_of_is_part_of.set_key_if_not_none("title", title_translated)
            # creators with role pbd
            is_part_of_is_part_of.add_creators(creator_service.change_contibutors_role(translated_creators, "pbd"))
            #is_part_of_is_part_of.set_key_if_not_none("date_issued",date_year)
            is_part_of_is_part_of.set_key_if_not_none("publishers", publishers)
            is_part_of_is_part_of.set_key_if_not_none("publication_places", publication_places)
            if isbn_or_issn:
                is_part_of["identifiers"] = [metajson_service.create_identifier(isbn_or_issn_type, isbn_or_issn)]
            is_part_of.add_items_to_key([is_part_of_is_part_of], "is_part_ofs")
        else:
            # is_part_of in case of no is_part_of.is_part_of
            # creators with role edt
            is_part_of.add_creators(secondary_creators)
            #is_part_of.set_key_if_not_none("date_issued",date_year)
            is_part_of.set_key_if_not_none("publishers", publishers)
            is_part_of.set_key_if_not_none("publication_places", publication_places)
            if isbn_or_issn:
                is_part_of["identifiers"] = [metajson_service.create_identifier(isbn_or_issn_type, isbn_or_issn)]
        if "title" in is_part_of and is_part_of["title"]:
            document.add_items_to_key([is_part_of], "is_part_ofs")
    else:
        if isbn_or_issn:
            document["identifiers"] = [metajson_service.create_identifier(isbn_or_issn_type, isbn_or_issn)]
        if publishers:
            if endnote_type == TYPE_THESIS:
                # thesis: the "publisher" is really the degree-granting institution
                document.add_creators([creator_service.formatted_name_to_creator(publishers[0], "orgunit", "dgg")])
            elif endnote_type == TYPE_FILM_OR_BROADCAST:
                # film: the "publisher" is really the distributor
                document.add_creators([creator_service.formatted_name_to_creator(publishers[0], "orgunit", "dst")])
            else:
                document.set_key_if_not_none("publishers", publishers)
                document.set_key_if_not_none("publication_places", publication_places)

    # seriess[]
    if endnote_type in [TYPE_BOOK, TYPE_BOOK_SECTION]:
        series = Document()
        series["rec_type"] = constants.DOC_TYPE_SERIES
        if endnote_type == TYPE_BOOK and title_secondary:
            series.set_key_if_not_none("title", title_secondary)
            series.add_creators(secondary_creators)
            series.set_key_if_not_none("part_volume", part_number)
        if endnote_type == TYPE_BOOK_SECTION and title_tertiary:
            series.set_key_if_not_none("title", title_tertiary)
            series.add_creators(tertiary_creators)
            series.set_key_if_not_none("part_volume", part_number)
        # > 2 entries means something beyond rec_type was actually filled in
        if "title" in series and len(series) > 2:
            document.add_items_to_key([series], "seriess")

    # originals[]
    if (reprint_edition or title_translated or orig_pub) and endnote_type in [TYPE_BOOK, TYPE_BOOK_SECTION, TYPE_JOURNAL_ARTICLE, TYPE_FILM_OR_BROADCAST]:
        original_title = None
        original_is_part_of = None
        if reprint_edition:
            original_title = reprint_edition
        if title_translated and is_part_of_is_part_of_type is None:
            original_title = title_translated
        if orig_pub:
            if is_part_of_type is not None:
                original_is_part_of = Document()
                original_is_part_of["rec_type"] = is_part_of_type
                original_is_part_of["title"] = orig_pub
            else:
                original_title = orig_pub
        if original_title:
            original = Document()
            original["rec_type"] = rec_type
            original.set_key_if_not_none("title", original_title)
            original.add_item_to_key(original_is_part_of, "is_part_ofs")
            document.add_item_to_key(original, "originals")

    # is_review_ofs[]
    if reviewed_item and endnote_type in [TYPE_BOOK_SECTION, TYPE_JOURNAL_ARTICLE]:
        is_review_ofs = Document()
        is_review_ofs.set_key_if_not_none("title", reviewed_item)
        is_review_ofs.set_key_if_not_none("rec_type", "Book")
        document.add_items_to_key([is_review_ofs], "is_review_ofs")

    # descriptions[0].value
    if abstract:
        document["descriptions"] = [{"value": abstract, "language": "und"}]

    # archive
    # NOTE(review): key name "archive" is singular while other list keys are
    # plural ("seriess", "originals") — confirm against the MetaJSON schema.
    if endnote_type == TYPE_FIGURE and remote_database_provider:
        archive = Document()
        archive["title"] = remote_database_provider
        document.add_items_to_key([archive], "archive")

    # caption
    document.set_key_if_not_none("caption", caption)

    # creators[]
    document.add_creators(primary_creators)
    if endnote_type in [TYPE_BOOK, TYPE_THESIS, TYPE_FILM_OR_BROADCAST]:
        document.add_creators(tertiary_creators)
    if endnote_type in [TYPE_BOOK, TYPE_BOOK_SECTION, TYPE_FILM_OR_BROADCAST]:
        document.add_creators(subsidiary_creators)
    if custom4:
        document.add_creators(endnote_authors_to_creators(custom4, "person", "rev"))
    if endnote_type == TYPE_FIGURE and remote_database_name:
        document.add_creators(endnote_authors_to_creators(remote_database_name, None, "cph"))

    # edition
    if endnote_type in [TYPE_BOOK, TYPE_BOOK_SECTION, TYPE_FILM_OR_BROADCAST, TYPE_WEB_PAGE] and edition:
        document["edition"] = edition

    # extent_pages, extent_volumes
    if endnote_type in [TYPE_BOOK, TYPE_THESIS] and pages:
        document["extent_pages"] = pages.replace("p.", "").strip()
    if endnote_type in [TYPE_BOOK, TYPE_BOOK_SECTION] and extent_volumes:
        document["extent_volumes"] = extent_volumes

    # date_issued, date_issued_first
    # A bracketed year like "2001 [1972]" means: issued 2001, first issued 1972.
    if date_year:
        date_issued = ""
        date_issued_first = ""
        orig_index_begin = date_year.find("[")
        orig_index_end = date_year.find("]")
        if orig_index_begin != -1 and orig_index_end != -1:
            date_issued_first = date_year[orig_index_begin + 1:orig_index_end]
            date_issued = date_year.replace("[" + date_issued_first + "]", "").strip()
        else:
            date_issued = date_year.strip()
        if "is_part_ofs" in document:
            # A Book parent (or grand-parent) carries the dates instead of the part.
            if document["is_part_ofs"][0]["rec_type"] == constants.DOC_TYPE_BOOK:
                document["is_part_ofs"][0].set_key_if_not_none("date_issued", date_issued)
                document["is_part_ofs"][0].set_key_if_not_none("date_issued_first", date_issued_first)
            elif "is_part_ofs" in document["is_part_ofs"][0] and document["is_part_ofs"][0]["is_part_ofs"][0]["rec_type"] == constants.DOC_TYPE_BOOK:
                document["is_part_ofs"][0]["is_part_ofs"][0].set_key_if_not_none("date_issued", date_issued)
                document["is_part_ofs"][0]["is_part_ofs"][0].set_key_if_not_none("date_issued_first", date_issued_first)
            else:
                document.set_key_if_not_none("date_issued_first", date_issued_first)
                if rec_type in unpublished_types:
                    document.set_key_if_not_none("date_created", date_issued)
                else:
                    document.set_key_if_not_none("date_issued", date_issued)
        else:
            document.set_key_if_not_none("date_issued_first", date_issued_first)
            if rec_type in unpublished_types:
                document.set_key_if_not_none("date_created", date_issued)
            else:
                document.set_key_if_not_none("date_issued", date_issued)

    # identifiers[]
    identifiers = []
    if accessionnumber:
        identifiers.append(metajson_service.create_identifier("accessionnumber", accessionnumber))
    if callnumber:
        identifiers.append(metajson_service.create_identifier("callnumber", callnumber))
    if doi:
        identifiers.append(metajson_service.create_identifier("doi", doi))
    if identifiers:
        document["identifiers"] = identifiers

    # language
    if language:
        rfc5646 = language_service.convert_unknown_format_to_rfc5646(language)
        if rfc5646:
            document["languages"] = [rfc5646]

    # note
    if endnote_import_note and note:
        document.set_key_with_value_type_in_list("notes", note, "general")
    if endnote_import_research_note and research_notes:
        document.set_key_with_value_type_in_list("notes", research_notes, "user")

    # part_page_begin & part_page_end  (e.g. "p. 12-34" -> "12", "34")
    if endnote_type in [TYPE_BOOK_SECTION, TYPE_FIGURE, TYPE_JOURNAL_ARTICLE] and pages:
        hyphen_index = pages.find("-")
        if hyphen_index == -1:
            document["part_page_begin"] = pages.replace("p.", "").strip()
        else:
            document["part_page_begin"] = pages[:hyphen_index].replace("p.", "").strip()
            document["part_page_end"] = pages[hyphen_index + 1:].replace("p.", "").strip()

    if endnote_type in [TYPE_JOURNAL_ARTICLE]:
        document.set_key_if_not_none("part_issue", part_number)
    elif endnote_type in [TYPE_FIGURE]:
        document.set_key_if_not_none("part_number", part_number)
    document.set_key_if_not_none("part_section", part_section)
    document.set_key_if_not_none("part_volume", part_volume)

    # resources[0]
    if url is not None:
        resource = Resource()
        resource["rec_type"] = "ResourceRemote"
        resource.set_key_if_not_none("url", url)
        if endnote_type == TYPE_WEB_PAGE:
            # web pages store the access date in the number field
            resource.set_key_if_not_none("date_last_accessed", part_number)
        else:
            resource.set_key_if_not_none("date_last_accessed", access_date)
        document["resources"] = [resource]

    # subjects[]
    if endnote_import_keywords and keywords:
        for keyword in keywords.split():
            document.set_key_with_value_type_in_list("subjects", keyword, "topic")

    # title, title_alternative, title_abbreviated, title_translated
    document["title"] = title
    if title_alternative:
        document["title_alternatives"] = [{"title": title_alternative}]
    if title_abbreviated:
        document["title_abbreviateds"] = [{"title": title_abbreviated}]

    #logging.debug("# endnote_type: {}".format(endnote_type))
    metajson_service.pretty_print_document(document)
    return document
def ris_txt_lines_to_metajson_list(txt_lines, source, rec_id_prefix, only_first_record):
    """RIS text lines -> generator of MetaJSON Documents.

    Parses RIS tagged lines ("XX  - value"), building one Document per
    TY..ER record and yielding it when the end tag is reached.

    Args:
        txt_lines: iterable of raw RIS lines.
        source: stored as document["rec_source"] when truthy (may later be
            overwritten by DB/DP tags).
        rec_id_prefix: unused here — NOTE(review): other converters take the
            same parameter; confirm whether it should prefix rec_id.
        only_first_record: unused here — NOTE(review): the generator yields
            every record regardless; confirm intended semantics.

    Yields:
        Document per complete RIS record.
    """
    document = None
    ris_type = None
    rec_type = None
    is_part_of_rec_type = None
    # carry-over state for values continued across physical lines
    previous_key = None
    previous_value = None
    for line in txt_lines:
        if line:
            line = line.rstrip('\r\n')
            #logging.debug("line: {}".format(line))
            # multi line management
            if previous_key:
                # this line is the continuation of the previous tag's value
                key = previous_key
                value = previous_value + line
                previous_key = None
                previous_value = None
            else:
                # "XX  - value": tag in cols 0-1, value from col 6
                key = line[:2].strip()
                value = line[6:].strip()
                # a trailing "/" marks a continued value (except in date tags,
                # where "/" is the YYYY/MM/DD separator)
                if value.endswith("/") and key not in ["Y1", "PY"]:
                    #logging.debug("multi line")
                    previous_key = key
                    previous_value = value.rstrip('/')
                    continue
            if key is None or len(key) == 0:
                # empty line -> continue
                #logging.debug("empty line")
                continue
            elif key == RIS_KEY_BEGIN:
                # record begin with document type -> create document
                # init
                document = Document()
                is_part_of_rec_type = None
                if source:
                    document["rec_source"] = source
                ris_type = value
                rec_type = ris_document_type_to_metajson_document_type[ris_type]
                document["rec_type"] = rec_type
                if ris_type in ris_document_type_to_metajson_document_is_part_of_type:
                    is_part_of_rec_type = ris_document_type_to_metajson_document_is_part_of_type[ris_type]
                    is_part_of = Document()
                    is_part_of["rec_type"] = is_part_of_rec_type
                    document["is_part_ofs"] = [is_part_of]
            elif key == RIS_KEY_END:
                # record end -> return the result
                # verify the is_part_ofs[0]["title"]: promote the abbreviated
                # title when no full parent title was provided
                if "is_part_ofs" in document and "title" not in document["is_part_ofs"][0] and "title_abbreviateds" in document["is_part_ofs"][0]:
                    document["is_part_ofs"][0]["title"] = document["is_part_ofs"][0]["title_abbreviateds"][0]["title"]
                    del document["is_part_ofs"][0]["title_abbreviateds"]
                logging.info("# RIS type: {}".format(ris_type))
                metajson_service.pretty_print_document(document)
                yield document
            else:
                # process key value
                #logging.debug("key: {}; value: {}".format(key, value))
                if key == "ID":
                    document["rec_id"] = value
                elif key in ["T1", "TI", "CT"] or (key == "BT" and ris_type in [RIS_TYPE_BOOK, RIS_TYPE_UNPB]):
                    # Title Primary -> title
                    document["title"] = value
                elif key in ["JF", "JO"] or (key == "BT" and ris_type not in [RIS_TYPE_BOOK, RIS_TYPE_UNPB]):
                    # Title Secondary -> is_part_of["title"]
                    document.add_is_part_of_title(value)
                elif key in ["JA", "J1", "J2", "T2"]:
                    # Title Secondary -> is_part_of["title_abbreviateds"][0]["title"]
                    document.add_is_part_of_title_abbreviated(value)
                elif key == "T3":
                    # Title Series
                    document.add_series_title(value)
                elif key in ["A1", "AU"]:
                    # primary authors
                    document.add_creator(creator_service.formatted_name_to_creator(value, None, "aut"))
                elif key in ["A2", "ED"]:
                    # editors: attached to the parent when one exists
                    if is_part_of_rec_type:
                        document.add_is_part_of_creator(creator_service.formatted_name_to_creator(value, None, "edt"))
                    else:
                        document.add_creator(creator_service.formatted_name_to_creator(value, None, "edt"))
                elif key == "A3":
                    # series authors
                    document.add_series_creator(creator_service.formatted_name_to_creator(value, None, "aut"))
                elif key == "A4":
                    # subsidiary authors -> contributors
                    document.add_creator(creator_service.formatted_name_to_creator(value, None, "ctb"))
                elif key in ["PY", "Y1", "DA"]:
                    index_slash = value.find("/")
                    if index_slash != -1:
                        # YYYY/MM/DD/other info (like season)
                        # todo
                        document["date_issued"] = value.strip("/")
                    else:
                        document["date_issued"] = value
                elif key == "SP":
                    document["part_page_begin"] = value
                elif key == "EP":
                    document["part_page_end"] = value
                elif key == "VL":
                    document["part_volume"] = value
                elif key in ["IS", "CP"]:
                    document["part_issue"] = value
                elif key in ["AB", "N2"]:
                    # abstract
                    document["descriptions"] = [{"language": "und", "value": value}]
                elif key == "N1":
                    # notes
                    document["notes"] = [{"language": "und", "value": value}]
                elif key == "PB":
                    document.add_item_to_key(value, "publishers")
                elif key == "CY":
                    document.add_item_to_key(value, "publication_places")
                elif key == "RP":
                    # reprint status
                    document["publication_status"] = value
                elif key == "ET":
                    document["edition"] = value
                elif key == "UR":
                    resource = Resource()
                    resource["url"] = value
                    document.add_item_to_key(resource, "resources")
                elif key == "AN":
                    # Accession Number
                    identifier = metajson_service.create_identifier("accessionnumber", value)
                    document.add_identifier(identifier)
                elif key == "CN":
                    # Call Number
                    identifier = metajson_service.create_identifier("callnumber", value)
                    document.add_identifier(identifier)
                elif key == "DO":
                    # DOI
                    identifier = metajson_service.create_identifier("doi", value)
                    document.add_identifier(identifier)
                elif key == "SN":
                    # ISBN or ISSN ? serial-like types get an ISSN
                    id_type = None
                    if rec_type in [constants.DOC_TYPE_JOURNALARTICLE, constants.DOC_TYPE_MAGAZINEARTICLE, constants.DOC_TYPE_NEWSPAPERARTICLE, constants.DOC_TYPE_JOURNAL]:
                        id_type = "issn"
                    else:
                        id_type = "isbn"
                    identifier = metajson_service.create_identifier(id_type, value)
                    # serial numbers identify the parent when one exists
                    if is_part_of_rec_type is None:
                        document.add_identifier(identifier)
                    else:
                        document["is_part_ofs"][0].add_identifier(identifier)
                elif key == "CA":
                    document["caption"] = value
                elif key == "DB":
                    # Name of Database -> rec_source ?
                    document["rec_source"] = value
                elif key == "DP":
                    # Database Provider -> rec_source ?
                    document["rec_source"] = value
                elif key == "KW":
                    if "keywords" not in document:
                        document["keywords"] = {"und": []}
                    document["keywords"]["und"].append(value)
                else:
                    logging.debug("Not managed key: {} with value: {}".format(key, value))
def unixref_record_to_metajson(record, source, rec_id_prefix):
    """CrossRef UNIXREF record xmletree -> MetaJSON Document.

    Extracts the journal metadata, issue and article elements and maps them
    to a "JournalArticle" Document with a "Journal" parent in is_part_ofs.

    Args:
        record: xmletree element for one unixref <doi_record>/record.
        source: stored as document["rec_source"] when truthy.
        rec_id_prefix: currently unused (kept for signature compatibility
            with the other *_to_metajson converters in this module).

    Returns:
        A populated Document.
    """
    # FIX: all optional fields default to None so that a record missing
    # journal_metadata / journal_issue / journal_article can no longer
    # raise NameError when the fields are tested below.
    journal_title = None
    journal_title_abbreviated = None
    journal_issn = None
    journal_eissn = None
    part_issue = None
    part_volume = None
    titles = None
    year = None
    month = None
    day = None
    part_page_begin = None
    part_page_end = None
    doi = None
    resource_url = None

    journal_metadata = record.find("./crossref/journal/journal_metadata")
    if journal_metadata is not None:
        journal_title = journal_metadata.find("full_title")
        journal_title_abbreviated = journal_metadata.find("abbrev_title")
        # media_type distinguishes the print ISSN from the electronic one
        for issn in journal_metadata.findall("issn"):
            if issn.get("media_type") == "print":
                journal_issn = issn
            elif issn.get("media_type") == "electronic":
                journal_eissn = issn

    journal_issue = record.find("./crossref/journal/journal_issue")
    if journal_issue is not None:
        part_issue = journal_issue.find("issue")
        part_volume = journal_issue.find("journal_volume/volume")

    journal_article = record.find("./crossref/journal/journal_article")
    if journal_article is not None:
        titles = journal_article.findall("titles/title")
        # todo: creators — unixref contributors are not mapped yet
        year = journal_article.find("publication_date/year")
        month = journal_article.find("publication_date/month")
        day = journal_article.find("publication_date/day")
        part_page_begin = journal_article.find("pages/first_page")
        part_page_end = journal_article.find("pages/last_page")
        doi = journal_article.find("doi_data/doi")
        resource_url = journal_article.find("doi_data/resource")

    document = Document()
    # todo: how to find the type ?
    document["rec_type"] = "JournalArticle"

    if doi is not None:
        document.set_key_with_value_type_in_list("identifiers", doi.text, "doi")
    if resource_url is not None:
        resource = Resource()
        resource["url"] = resource_url.text
        document["resources"] = [resource]
    if source:
        document["rec_source"] = source
    if titles is not None and len(titles) > 0:
        document["title"] = titles[0].text

    # date_issued: "YYYY", "YYYY-MM" or "YYYY-MM-DD", zero-padding M and D.
    # FIX: guarded on year, which was previously dereferenced unconditionally.
    if year is not None:
        date_issued = year.text
        if month is not None:
            month_text = month.text
            if len(month_text) == 1:
                month_text = "0" + month_text
            date_issued += "-" + month_text
        if day is not None:
            day_text = day.text
            if len(day_text) == 1:
                day_text = "0" + day_text
            date_issued += "-" + day_text
        document["date_issued"] = date_issued

    if part_issue is not None:
        document["part_issue"] = part_issue.text
    if part_volume is not None:
        document["part_volume"] = part_volume.text
    if part_page_begin is not None:
        document["part_page_begin"] = part_page_begin.text
    if part_page_end is not None:
        document["part_page_end"] = part_page_end.text

    # parent journal
    is_part_of = Document()
    # todo: how to find the type ?
    is_part_of["rec_type"] = "Journal"
    if journal_title is not None:
        is_part_of["title"] = journal_title.text
    if journal_title_abbreviated is not None:
        is_part_of["title_abbreviateds"] = [{"title": journal_title_abbreviated.text}]
    if journal_issn is not None:
        is_part_of.set_key_with_value_type_in_list("identifiers", journal_issn.text, "issn")
    if journal_eissn is not None:
        is_part_of.set_key_with_value_type_in_list("identifiers", journal_eissn.text, "eissn")
    document["is_part_ofs"] = [is_part_of]
    return document
def summonjson_document_to_metajson(sum_doc, source, rec_id_prefix):
    """Convert a Summon API JSON document into a MetaJSON Document.

    Args:
        sum_doc: dict parsed from the Summon JSON response; must contain
            "ID" and "ContentType" (each a non-empty list).
        source: record source identifier stored as "rec_source" (if truthy).
        rec_id_prefix: unused here; kept for signature parity with the other
            *_to_metajson converters in this module.

    Returns:
        A populated Document.
    """
    document = Document()

    # Extract Summon properties
    rec_id = sum_doc["ID"][0].replace("FETCH-", "")
    sum_type = sum_doc["ContentType"][0]
    rec_type = summon_document_type_to_metajson_document_type[sum_type]

    # rec_id, rec_source, rec_type
    document["rec_id"] = rec_id
    if source:
        document["rec_source"] = source
    document["rec_type"] = rec_type

    # languages: convert Summon's English language names to RFC 5646 tags;
    # the first one becomes the main language used for languageValues below.
    main_language = None
    if "Language" in sum_doc:
        languages = []
        for sum_lang in sum_doc["Language"]:
            lang = language_service.convert_english_to_rfc5646(sum_lang)
            if lang:
                languages.append(lang)
        if languages:
            main_language = languages[0]
            document["languages"] = languages

    # extract summon properties
    creators = extract_creators(sum_doc)
    copyright_statement = extract_value(sum_doc, "Copyright")
    database_id = extract_value(sum_doc, "DBID")
    #logging.debug("DBID: {}".format(database_id))
    database_xml = extract_dict_value(sum_doc, "Database_xml")
    #logging.debug("database_xml: {}".format(database_xml))
    date_issued = extract_date_issued(sum_doc)
    degree = extract_value(sum_doc, "DissertationDegree")
    descriptions = extract_convert_languageValues(sum_doc, "Abstract", main_language)
    edition = extract_value(sum_doc, "Edition")
    extent_pages = extract_value(sum_doc, "PageCount")
    genre = extract_value(sum_doc, "Genre")
    is_part_of_edition = extract_value(sum_doc, "PublicationEdition")
    is_part_of_title = extract_value(sum_doc, "PublicationTitle")
    is_part_of_title_sub = extract_value(sum_doc, "PublicationSubtitle")
    notes = extract_convert_languageValues(sum_doc, "Notes", main_language)
    part_issue = extract_value(sum_doc, "Issue")
    part_page_end = extract_value(sum_doc, "EndPage")
    part_page_begin = extract_value(sum_doc, "StartPage")
    part_volume = extract_value(sum_doc, "Volume")
    peer_reviewed = extract_boolean_value(sum_doc, "IsPeerReviewed")
    publisher = extract_value(sum_doc, "Publisher")
    publication_place = extract_value(sum_doc, "PublicationPlace")
    scholarly = extract_boolean_value(sum_doc, "IsScholarly")
    series_title = extract_value(sum_doc, "PublicationSeriesTitle")
    subject_keywords = extract_value(sum_doc, "Keywords", True)
    subject_agents = convert_creators(sum_doc, "RelatedPersons", None, "person", None)
    subject_topics = extract_value(sum_doc, "SubjectTerms", True)
    table_of_contents = extract_convert_languageValues(sum_doc, "TableOfContents", main_language)
    title = string.strip_html_tags(extract_value(sum_doc, "Title"))
    title_sub = string.strip_html_tags(extract_value(sum_doc, "Subtitle"))

    # identifiers: issn/eissn/isbn belong to the container (is_part_of),
    # everything else to the item itself.
    has_isbn = False
    has_eissn = False
    identifiers_item = []
    is_part_of_identifiers = []
    for sum_key in summon_identifier_type_to_metajson_identifier_type:
        if sum_key in sum_doc:
            for id_value in sum_doc[sum_key]:
                id_type = summon_identifier_type_to_metajson_identifier_type[sum_key]
                if id_type == "issn":
                    is_part_of_identifiers.append(metajson_service.create_identifier(id_type, id_value))
                elif id_type == "eissn":
                    has_eissn = True
                    is_part_of_identifiers.append(metajson_service.create_identifier(id_type, id_value))
                elif id_type == "isbn":
                    has_isbn = True
                    is_part_of_identifiers.append(metajson_service.create_identifier(id_type, id_value))
                else:
                    identifiers_item.append(metajson_service.create_identifier(id_type, id_value))

    # is_part_of_type determination
    is_part_of_type = None
    if sum_type in summon_document_type_to_metajson_document_is_part_of_type:
        is_part_of_type = summon_document_type_to_metajson_document_is_part_of_type[sum_type]
    elif is_part_of_title and is_part_of_title != title and rec_type not in ["Book", "Journal", "Magazine", "Newspaper", "Periodical"]:
        if has_isbn:
            is_part_of_type = "Book"
        elif has_eissn:
            is_part_of_type = "Journal"
        elif is_part_of_title.lower().find("conference") != -1:
            is_part_of_type = "Book"
        # BUG FIX: str.find() returns -1 when absent (truthy) and 0 at
        # position zero (falsy), so the original bare "find(...) or
        # find(...)" test was effectively inverted; compare to -1 instead.
        elif is_part_of_title.lower().find("review") != -1 or is_part_of_title.lower().find("journal") != -1:
            is_part_of_type = "Journal"
        elif rec_type == "Dataset":
            is_part_of_type = "Periodical"
        else:
            logging.debug("unknown is_part_of_type for rec_type: {}".format(rec_type))

    # is_part_of: when a container type was determined, the publication-level
    # fields go on the container; otherwise they stay on the item itself.
    if is_part_of_type:
        is_part_of = Document()
        is_part_of.set_key_if_not_none("rec_type", is_part_of_type)
        is_part_of.set_key_if_not_none("edition", is_part_of_edition)
        is_part_of.add_items_to_key(is_part_of_identifiers, "identifiers")
        is_part_of.set_key_if_not_none("peer_reviewed", peer_reviewed)
        is_part_of.set_key_if_not_none("publishers", [publisher])
        is_part_of.set_key_if_not_none("publication_places", [publication_place])
        is_part_of.set_key_if_not_none("title", is_part_of_title)
        is_part_of.set_key_if_not_none("title_sub", is_part_of_title_sub)
        document.add_items_to_key(identifiers_item, "identifiers")
        document.add_items_to_key([is_part_of], "is_part_ofs")
    else:
        document.set_key_if_not_none("peer_reviewed", peer_reviewed)
        document.set_key_if_not_none("publishers", [publisher])
        document.set_key_if_not_none("publication_places", [publication_place])
        document.add_items_to_key(is_part_of_identifiers, "identifiers")
        document.add_items_to_key(identifiers_item, "identifiers")

    # series
    if series_title:
        series = Document()
        series["rec_type"] = constants.DOC_TYPE_SERIES
        series.set_key_if_not_none("title", series_title)
        document.add_items_to_key([series], "seriess")

    # classifications
    extract_convert_add_classifications(sum_doc, document, "DEWEY", "ddc")
    extract_convert_add_classifications(sum_doc, document, "Discipline", "discipline")
    extract_convert_add_classifications(sum_doc, document, "NAICS", "NAICS")

    # set properties
    document.set_key_if_not_none("creators", creators)
    document.set_key_if_not_none("copyright_statement", copyright_statement)
    document.set_key_if_not_none("date_issued", date_issued)
    document.set_key_if_not_none("degree", degree)
    document.set_key_if_not_none("descriptions", descriptions)
    document.set_key_if_not_none("edition", edition)
    document.set_key_if_not_none("extent_pages", extent_pages)
    document.set_key_if_not_none("genre", genre)
    document.set_key_if_not_none("notes", notes)
    document.set_key_if_not_none("part_issue", part_issue)
    document.set_key_if_not_none("part_page_end", part_page_end)
    document.set_key_if_not_none("part_page_begin", part_page_begin)
    document.set_key_if_not_none("part_volume", part_volume)
    document.set_key_if_not_none("scholarly", scholarly)
    document.set_key_if_not_none("table_of_contents", table_of_contents)
    document.set_key_if_not_none("title", title)
    document.set_key_if_not_none("title_sub", title_sub)

    # keywords, subjects: keywords are grouped under the main language
    # (possibly None when no Language field was present).
    subjects = []
    keywords = {main_language: []}
    if subject_keywords:
        keywords[main_language].extend(subject_keywords)
    if subject_topics:
        keywords[main_language].extend(subject_topics)
    if subject_agents:
        subject = Subject()
        subject["agents"] = subject_agents
        subjects.append(subject)
    if subjects:
        document["subjects"] = subjects
    if keywords[main_language]:
        document["keywords"] = keywords

    logging.info("# Summon ContentType: {}".format(sum_type))
    metajson_service.pretty_print_document(document)
    return document