def ddi_xmletree_to_metajson(ddi_root, source, rec_id_prefix):
    """DDI xmletree -> MetaJSON Document

    Only the study title (stdyDscr/citation/titlStmt/titl) is extracted.

    :param ddi_root: root element of the DDI tree, or None
    :param source: stored under "rec_source" when truthy
    :param rec_id_prefix: not used by this function
    :return: a Document with "rec_type" set (plus "rec_source" and "title"
        when available), or None when ddi_root is None
    """
    if ddi_root is None:
        return None

    document = Document()
    document["rec_type"] = constants.DOC_TYPE_DATASETQUALI
    if source:
        document["rec_source"] = source

    # stdyDscr/citation/titlStmt/titl
    # Descend one level at a time: any intermediate element may be absent,
    # and calling find() on None would raise AttributeError.
    ddi_titl = ddi_root
    for tag in ("stdyDscr", "citation", "titlStmt", "titl"):
        ddi_titl = ddi_titl.find(xmletree.prefixtag("ddi", tag))
        if ddi_titl is None:
            break
    if ddi_titl is not None:
        document["title"] = ddi_titl.text

    return document
def get_rml_phones(rml):
    """phone -> phones

    Turn every rml:phone child of *rml* into an entry built by
    metajson_service.create_phone.

    :param rml: element searched with findall() for rml:phone children
    :return: dict with a "phones" list when at least one truthy entry was
        created, otherwise an empty dict
    """
    result = {}
    phone_elements = rml.findall(xmletree.prefixtag("rml", "phone"))
    if phone_elements is None:
        return result

    collected = []
    for phone_element in phone_elements:
        if phone_element is None:
            continue
        # @preferred -> preferred
        is_preferred = xmletree.get_element_attribute_as_boolean(phone_element, "preferred")
        # @relationType -> relation_type
        relation = phone_element.get("relationType")
        # @type -> phone_type
        kind = phone_element.get("type")
        # @visible -> visible
        is_visible = xmletree.get_element_attribute_as_boolean(phone_element, "visible")
        # formatted -> formatted
        formatted_text = xmletree.get_element_text(
            phone_element.find(xmletree.prefixtag("rml", "formatted")))

        entry = metajson_service.create_phone(formatted_text, kind, is_preferred, relation, is_visible)
        if entry:
            collected.append(entry)

    if collected:
        result["phones"] = collected
    return result
def get_rml_research_coverages(rml):
    """researchCoverage -> research_coverage_classifications & research_coverage_keywords

    Elements with type="keyword" are grouped by their xml:lang attribute;
    every other element becomes a {"term_id": text} classification grouped
    by its "authority" attribute ("undetermined" when absent).

    :param rml: element searched with findall() for rml:researchCoverage
    :return: dict that may hold "research_coverage_classifications"
        (authority -> list of classifications) and
        "research_coverage_keywords" (language -> list of keywords)
    """
    result = {}
    rml_rcs = rml.findall(xmletree.prefixtag("rml", "researchCoverage"))
    if rml_rcs is None:
        return result

    rc_classifications_dict = {}
    rc_keywords = {}
    for rml_rc in rml_rcs:
        # An empty element has text None; skip it rather than fail on
        # None.strip().
        if rml_rc is None or rml_rc.text is None:
            continue
        value = rml_rc.text.strip()

        if rml_rc.get("type") == "keyword":
            language = rml_rc.get(xmletree.prefixtag("xml", "lang"))
            # Keywords without xml:lang are not recorded (unchanged behavior).
            if language is not None:
                rc_keywords.setdefault(language, []).append(value)
        else:
            authority = rml_rc.get("authority")
            if authority is None:
                authority = "undetermined"
            rc_classifications_dict.setdefault(authority, []).append({"term_id": value})

    if rc_classifications_dict:
        result["research_coverage_classifications"] = rc_classifications_dict
    if rc_keywords:
        result["research_coverage_keywords"] = rc_keywords
    return result
def metajson_to_oai_dc_xmletree(document, with_schema_location=True):
    """MetaJSON Document -> oai_dc xmletree

    Emits dc:title plus one dc:creator or dc:contributor per creator.

    :param document: MetaJSON document; must contain "rec_id"
    :param with_schema_location: when true, set xsi:schemaLocation on the root
    :return: tuple (rec_id, oai_dc root element)
    """
    record_id = document["rec_id"]

    xmletree.register_namespaces()

    # oai_dc root
    root = ET.Element(xmletree.prefixtag("oai_dc", "oai_dc"))
    if with_schema_location:
        root.set(
            xmletree.prefixtag("xsi", "schemaLocation"),
            constants.xmlns_map["oai_dc"] + " " + constants.xmlns_schema_map["oai_dc"])

    # title
    if "title" in document:
        title_element = ET.SubElement(root, xmletree.prefixtag("dc", "title"))
        title_element.text = document["title"]

    # creators
    if "creators" in document and document["creators"]:
        for creator in document["creators"]:
            # dc:contributor unless the first role is mapped to the
            # "creator" role type.
            tag_name = "contributor"
            if "roles" in creator and creator["roles"] and creator["roles"][0]:
                first_role = creator["roles"][0]
                if (first_role in creator_service.role_type
                        and creator_service.role_type[first_role] == creator_service.role_type_creator):
                    tag_name = "creator"
            name_element = ET.SubElement(root, xmletree.prefixtag("dc", tag_name))
            name_element.text = creator.formatted_name(metajson.STYLE_FAMILY_COMMA_GIVEN)

    return (record_id, root)
def get_tei_titles_to_metason(tei_element, doc_language):
    """tei:title -> title, title_sub, title_translateds, title_abbreviateds

    Dispatches on the @type attribute of each tei:title child:
      * no type: "title" when xml:lang equals doc_language, otherwise the
        title of the translated entry for that language
      * "main":  "title"
      * "alt":   title of the translated entry for that language
      * "sub":   "title_sub" when xml:lang equals doc_language, otherwise
        the title_sub of the translated entry for that language
      * "short": appended to the abbreviated titles

    :param tei_element: element searched with findall() for tei:title
    :param doc_language: language of the document, compared with xml:lang
    :return: dict with the keys listed above that could be filled
    """
    result = {}
    title_translated_dict = {}
    title_abbreviated_list = []

    def translated_for(language):
        # One translated entry per language, created on first use.
        return title_translated_dict.setdefault(language, {"language": language})

    for tei_title in tei_element.findall(xmletree.prefixtag("tei", "title")):
        # A title without direct text (empty, or made only of child
        # elements) has text None; skip it rather than fail on None.strip().
        if tei_title.text is None:
            continue
        title_type = tei_title.get("type")
        title_text = tei_title.text.strip()
        title_lang = tei_title.get(xmletree.prefixtag("xml", "lang"))

        if title_type is None:
            if title_lang == doc_language:
                result["title"] = title_text
            else:
                translated_for(title_lang)["title"] = title_text
        elif title_type == "main":
            result["title"] = title_text
        elif title_type == "alt":
            translated_for(title_lang)["title"] = title_text
        elif title_type == "sub":
            if title_lang == doc_language:
                result["title_sub"] = title_text
            else:
                translated_for(title_lang)["title_sub"] = title_text
        elif title_type == "short":
            title_abbreviated_list.append({"language": title_lang, "title": title_text})

    if title_translated_dict:
        # Materialize the values: on Python 3 dict.values() is a live view,
        # not a list.
        result["title_translateds"] = list(title_translated_dict.values())
    if title_abbreviated_list:
        result["title_abbreviateds"] = title_abbreviated_list
    return result
def tei_xmletree_to_metajson_list(tei_root, source, rec_id_prefix, only_first_record):
    """TEI xmletree -> MetaJSON Document list

    Generator: yields one converted document per
    TEI/text/body/listBibl/biblFull element. Organisations found under
    TEI/text/back/div are converted first and handed to every conversion.

    :param tei_root: root TEI element, or None (nothing is yielded)
    :param source: passed through to biblfull_xmletree_to_metajson
    :param rec_id_prefix: not used by this function
    :param only_first_record: when true, stop after the first document
    :return: generator of converted documents
    """
    if tei_root is None:
        return

    # TEI/text
    tei_text = tei_root.find(xmletree.prefixtag("tei", "text"))
    if tei_text is None:
        return

    laboratories = []
    projects = []
    # TEI/text/back (optional)
    tei_back = tei_text.find(xmletree.prefixtag("tei", "back"))
    if tei_back is not None:
        # TEI/text/back/div
        for tei_back_div in tei_back.findall(xmletree.prefixtag("tei", "div")):
            div_type = tei_back_div.get("type")
            if div_type == "laboratories":
                for org in tei_back_div.findall(xmletree.prefixtag("tei", "org")):
                    laboratories.append(org_laboratory_to_metajson(org))
            elif div_type == "projects":
                # Projects go into their own list; they were previously
                # appended to laboratories, leaving projects always empty.
                for org in tei_back_div.findall(xmletree.prefixtag("tei", "org")):
                    projects.append(org_project_to_metajson(org))

    # TEI/text/body
    tei_body = tei_text.find(xmletree.prefixtag("tei", "body"))
    if tei_body is None:
        return
    # TEI/text/body/listBibl
    tei_body_listbibl = tei_body.find(xmletree.prefixtag("tei", "listBibl"))
    if tei_body_listbibl is None:
        return

    # TEI/text/body/listBibl/biblFull
    for biblfull in tei_body_listbibl.findall(xmletree.prefixtag("tei", "biblFull")):
        yield biblfull_xmletree_to_metajson(biblfull, laboratories, projects, source)
        if only_first_record:
            return
def get_rml_identifiers(rml):
    """identifier -> identifiers

    An rml:identifier without @type, or with type "hdl", supplies "rec_id"
    (the last such element wins). Every other identifier is built with
    metajson_service.create_identifier.

    :param rml: element searched with findall() for rml:identifier
    :return: dict that may hold "identifiers" (list) and "rec_id"
    """
    result = {}
    identifier_elements = rml.findall(xmletree.prefixtag("rml", "identifier"))
    if identifier_elements is None:
        return result

    collected = []
    record_id = None
    for identifier_element in identifier_elements:
        if identifier_element is None:
            continue
        # @type -> id_type
        kind = identifier_element.get("type")
        # value
        text = xmletree.get_element_text(identifier_element)

        if kind is None or kind == "hdl":
            # rec_id
            record_id = text
            continue

        # identifier
        entry = metajson_service.create_identifier(kind, text)
        if entry is not None:
            collected.append(entry)

    if collected:
        result["identifiers"] = collected
    if record_id:
        result["rec_id"] = record_id
    return result
def get_rml_emails(rml):
    """email -> emails

    Turn every rml:email child of *rml* into an entry built by
    metajson_service.create_email.

    :param rml: element searched with findall() for rml:email children
    :return: dict with an "emails" list when at least one truthy entry was
        created, otherwise an empty dict
    """
    result = {}
    email_elements = rml.findall(xmletree.prefixtag("rml", "email"))
    if email_elements is None:
        return result

    collected = []
    for email_element in email_elements:
        if email_element is None:
            continue
        # @preferred -> preferred
        is_preferred = xmletree.get_element_attribute_as_boolean(email_element, "preferred")
        # @relationType -> relation_type
        relation = email_element.get("relationType")
        # @visible -> visible
        is_visible = xmletree.get_element_attribute_as_boolean(email_element, "visible")
        # value
        address = xmletree.get_element_text(email_element)

        entry = metajson_service.create_email(address, is_preferred, relation, is_visible)
        if entry:
            collected.append(entry)

    if collected:
        result["emails"] = collected
    return result
def get_rml_instant_messages(rml):
    """instantMessage -> instant_messages

    Turn every rml:instantMessage child of *rml* into an entry built by
    metajson_service.create_instant_message.

    :param rml: element searched with findall() for rml:instantMessage
    :return: dict with an "instant_messages" list when at least one truthy
        entry was created, otherwise an empty dict
    """
    result = {}
    im_elements = rml.findall(xmletree.prefixtag("rml", "instantMessage"))
    if im_elements is None:
        return result

    collected = []
    for im_element in im_elements:
        if im_element is None:
            continue
        # @preferred -> preferred
        is_preferred = xmletree.get_element_attribute_as_boolean(im_element, "preferred")
        # @relationType -> relation_type
        relation = im_element.get("relationType")
        # @service -> service
        service_name = im_element.get("service")
        # @visible -> visible
        is_visible = xmletree.get_element_attribute_as_boolean(im_element, "visible")
        # value
        handle = xmletree.get_element_text(im_element)

        entry = metajson_service.create_instant_message(
            handle, service_name, is_preferred, relation, is_visible)
        if entry:
            collected.append(entry)

    if collected:
        result["instant_messages"] = collected
    return result
def get_rml_language_capabilities(rml):
    """languageCapability -> language_capabilities

    For each rml:languageCapability child, read the language (converted
    with language_service.convert_unknown_format_to_rfc5646), the
    motherTong flag and the four oral/text input/output values, then build
    the entry with metajson_service.create_language_capability.

    :param rml: element searched with findall() for rml:languageCapability
    :return: dict with a "language_capabilities" list when at least one
        non-None entry was created, otherwise an empty dict
    """
    result = {}
    capability_elements = rml.findall(xmletree.prefixtag("rml", "languageCapability"))
    if capability_elements is None:
        return result

    collected = []
    for capability_element in capability_elements:
        if capability_element is None:
            continue
        # language -> language
        language = language_service.convert_unknown_format_to_rfc5646(
            get_rml_element_text(capability_element, "language"))
        # motherTong -> mother_tong
        mother_tong = get_rml_element_text_as_boolean(capability_element, "motherTong")
        # oralInput, oralOutput, textInput, textOutput (read in this order)
        skill_levels = [
            get_rml_element_text(capability_element, tag)
            for tag in ("oralInput", "oralOutput", "textInput", "textOutput")
        ]

        entry = metajson_service.create_language_capability(language, mother_tong, *skill_levels)
        if entry is not None:
            collected.append(entry)

    if collected:
        result["language_capabilities"] = collected
    return result
def get_rml_relationships(rml):
    """relationship -> relationships

    Each rml:relationship child becomes a creator-like entry whose agent is
    derived from the name (a bare Person agent when no creator could be
    built), completed with identifiers, relation_type and descriptions.

    :param rml: element searched with findall() for rml:relationship
    :return: dict with a "relationships" list when any entry was built,
        otherwise an empty dict
    """
    result = {}
    relationship_elements = rml.findall(xmletree.prefixtag("rml", "relationship"))
    if relationship_elements is None:
        return result

    collected = []
    for relationship_element in relationship_elements:
        if relationship_element is None:
            continue
        # name -> agent.name
        agent_name = get_rml_element_text(relationship_element, "name")
        entry = creator_service.formatted_name_to_creator(
            agent_name, constants.REC_CLASS_PERSON, None)
        if entry is None:
            entry = {"agent": Person()}

        # identifier -> agent.rec_id & agent.identifiers
        entry["agent"].update(get_rml_identifiers(relationship_element))
        # relationType -> relation_type
        entry.update(get_rml_element_text_and_set_key(
            relationship_element, "relationType", "relation_type"))
        # descriptions -> descriptions
        entry.update(get_rml_textlangs_and_set_key(
            relationship_element, "description", "descriptions"))

        collected.append(entry)

    if collected:
        result["relationships"] = collected
    return result
def get_rml_textlangs_as_list(rml, element):
    """@xml:lang -> language, text -> value

    :param rml: element searched with findall()
    :param element: local name of the rml child elements to read
    :return: list of {"value": ..., "language": ...} dicts ("language" only
        when xml:lang is present) for the children that have text, or None
        when there is no such child
    """
    nodes = rml.findall(xmletree.prefixtag("rml", element))
    if nodes is None:
        return None

    entries = []
    for node in nodes:
        if node is None or node.text is None:
            continue
        lang = node.get(xmletree.prefixtag("xml", "lang"))
        entry = {"value": node.text.strip()}
        if lang is not None:
            entry["language"] = lang.strip()
        entries.append(entry)

    return entries if entries else None
def get_mods_textlangs_as_list(rml, element):
    """@xml:lang -> language, text -> value

    :param rml: element searched with findall()
    :param element: local name of the mods child elements to read
    :return: list of {"value": ..., "language": ...} dicts ("language" only
        when xml:lang is present) for the children that have text, or None
        when there is no such child
    """
    nodes = rml.findall(xmletree.prefixtag("mods", element))
    if nodes is None:
        return None

    def to_entry(node):
        # Read xml:lang first, then the stripped text.
        lang = node.get(xmletree.prefixtag("xml", "lang"))
        entry = {"value": node.text.strip()}
        if lang is not None:
            entry["language"] = lang.strip()
        return entry

    entries = [
        to_entry(node)
        for node in nodes
        if node is not None and node.text is not None
    ]
    return entries or None
def get_rml_call(rml):
    """call -> call

    Reads the first rml:call child: its optional funding (agent name,
    identifiers, programme, scheme, contribution) and its identifier, title
    and year.

    :param rml: element searched with find() for rml:call
    :return: dict with a "call" entry when the resulting Call is truthy,
        otherwise an empty dict
    """
    result = {}
    call_element = rml.find(xmletree.prefixtag("rml", "call"))
    if call_element is None:
        return result

    call = Call()

    # funding -> funding
    funding_element = call_element.find(xmletree.prefixtag("rml", "funding"))
    if funding_element is not None:
        # name -> agent.name
        funder_name = get_rml_element_text(funding_element, "name")
        funding = creator_service.formatted_name_to_creator(
            funder_name, constants.REC_CLASS_ORGUNIT, None)
        if funding is None:
            funding = Creator()
            funding["agent"] = Orgunit()

        # identifier -> agent.rec_id & agent.identifiers
        funding["agent"].update(get_rml_identifiers(funding_element))
        # programme -> programme, scheme -> scheme
        for tag in ("programme", "scheme"):
            funding.update(get_rml_element_text_and_set_key(funding_element, tag, tag))
        # contribution -> contribution
        funding.update(get_rml_money_and_set_key(funding_element, "contribution", "contribution"))

        if funding:
            call["funding"] = funding

    # identifier -> rec_id, title -> title, year -> date_issued
    for tag, key in (("identifier", "rec_id"), ("title", "title"), ("year", "date_issued")):
        call.update(get_rml_element_text_and_set_key(call_element, tag, key))

    if call:
        result["call"] = call
    return result
def get_mods_elements_text(rml, element):
    """Collect the text of every mods:<element> child of *rml*.

    :param rml: element searched with findall()
    :param element: local name of the mods child elements to read
    :return: list with xmletree.get_element_text() of each child, or None
        when there is no such child
    """
    nodes = rml.findall(xmletree.prefixtag("mods", element))
    if nodes is None:
        return None

    texts = [xmletree.get_element_text(node) for node in nodes if node is not None]
    return texts if texts else None
def get_rml_elements_text(rml, element):
    """Collect the text of every rml:<element> child of *rml*.

    :param rml: element searched with findall()
    :param element: local name of the rml child elements to read
    :return: list with xmletree.get_element_text() of each child, or None
        when there is no such child
    """
    nodes = rml.findall(xmletree.prefixtag("rml", element))
    if nodes is not None:
        texts = []
        for node in nodes:
            if node is None:
                continue
            texts.append(xmletree.get_element_text(node))
        if texts:
            return texts
    return None
def ddi_xmletree_to_metajson(ddi_root, source, rec_id_prefix):
    """DDI xmletree -> MetaJSON Document

    Only the study title (stdyDscr/citation/titlStmt/titl) is extracted.

    :param ddi_root: root element of the DDI tree, or None
    :param source: stored under "rec_source" when truthy
    :param rec_id_prefix: not used by this function
    :return: a Document with "rec_type" set (plus "rec_source" and "title"
        when available), or None when ddi_root is None
    """
    if ddi_root is None:
        return None

    document = Document()
    document["rec_type"] = constants.DOC_TYPE_DATASETQUALI
    if source:
        document["rec_source"] = source

    # stdyDscr/citation/titlStmt/titl
    # Each level is optional: stop at the first missing element instead of
    # calling find() on None (AttributeError).
    current = ddi_root
    for tag in ("stdyDscr", "citation", "titlStmt", "titl"):
        current = current.find(xmletree.prefixtag("ddi", tag))
        if current is None:
            return document

    document["title"] = current.text
    return document
def get_rml_addresses(rml):
    """address -> addresses

    Turn every rml:address child of *rml* into an entry built by
    metajson_service.create_address.

    :param rml: element searched with findall() for rml:address children
    :return: dict with an "addresses" list when at least one truthy entry
        was created, otherwise an empty dict
    """
    def child_text(parent, tag):
        # Text of the first rml:<tag> child of parent.
        return xmletree.get_element_text(parent.find(xmletree.prefixtag("rml", tag)))

    result = {}
    address_elements = rml.findall(xmletree.prefixtag("rml", "address"))
    if address_elements is None:
        return result

    collected = []
    for address_element in address_elements:
        if address_element is None:
            continue
        # country -> country
        country = child_text(address_element, "country")
        # localityCityTown -> locality_city_town
        locality_city_town = child_text(address_element, "localityCityTown")
        # postCode -> post_code
        post_code = child_text(address_element, "postCode")
        # @preferred -> preferred
        is_preferred = xmletree.get_element_attribute_as_boolean(address_element, "preferred")
        # @relationType -> relation_type
        relation = address_element.get("relationType")
        # street -> street
        street = child_text(address_element, "street")
        # @visible -> visible
        is_visible = xmletree.get_element_attribute_as_boolean(address_element, "visible")

        # address -> addresses[i]
        entry = metajson_service.create_address(
            street, post_code, locality_city_town, country, is_preferred, relation, is_visible)
        if entry:
            collected.append(entry)

    if collected:
        result["addresses"] = collected
    return result
def get_rml_ongoing_researches(rml):
    """ongoingResearch -> ongoing_researches

    Groups the stripped text of every rml:description found under the
    rml:ongoingResearch children by its xml:lang attribute. Descriptions
    without text or without xml:lang are ignored.

    :param rml: element searched with findall() for rml:ongoingResearch
    :return: dict with "ongoing_researches" (language -> list of texts)
        when anything was collected, otherwise an empty dict
    """
    result = {}
    research_elements = rml.findall(xmletree.prefixtag("rml", "ongoingResearch"))
    if research_elements is None:
        return result

    by_language = {}
    for research_element in research_elements:
        description_elements = research_element.findall(xmletree.prefixtag("rml", "description"))
        if description_elements is None:
            continue
        for description_element in description_elements:
            if description_element is None or description_element.text is None:
                continue
            text = description_element.text.strip()
            lang = description_element.get(xmletree.prefixtag("xml", "lang"))
            if lang is None:
                continue
            if lang not in by_language:
                by_language[lang] = [text]
            else:
                by_language[lang].append(text)

    if by_language:
        result["ongoing_researches"] = by_language
    return result
def extract_dmdsecs(mets):
    """Extract the descriptive metadata sections (mets:dmdSec) of a METS tree.

    For each dmdSec a Warpper is filled with:
      * "rec_id":       the ID attribute
      * "rec_id_group": the GROUPID attribute
      * "meta_type":    the MDTYPE attribute of its mets:mdWrap (None when
        the section has no mdWrap)
      * "records":      convert_xmldata() of every child of mdWrap/xmlData

    :param mets: element searched with findall() for mets:dmdSec
    :return: list of Warpper, or None when there is no dmdSec
    """
    dmdsecs = mets.findall(xmletree.prefixtag("mets", "dmdSec"))
    if not dmdsecs:
        return None

    warppers = []
    for dmdsec in dmdsecs:
        warpper = Warpper()
        warpper['rec_id'] = dmdsec.get("ID")
        warpper['rec_id_group'] = dmdsec.get("GROUPID")
        warpper['meta_type'] = None
        warpper['records'] = []

        # A dmdSec is not guaranteed to carry an mdWrap child; without this
        # guard mdwrap.get() raised AttributeError on None.
        mdwrap = dmdsec.find(xmletree.prefixtag("mets", "mdWrap"))
        if mdwrap is not None:
            warpper['meta_type'] = mdwrap.get("MDTYPE")
            xmldatas = mdwrap.findall(xmletree.prefixtag("mets", "xmlData/*"))
            if xmldatas is not None:
                for xmldata in xmldatas:
                    document = convert_xmldata(xmldata, warpper['meta_type'])
                    warpper['records'].append(document)

        warppers.append(warpper)
    return warppers
def get_rml_images(rml, role):
    """image -> resources

    Each rml:image child becomes a remote resource built by
    metajson_service.create_resource_remote(url, None, role).

    :param rml: element searched with findall() for rml:image children
    :param role: passed as third argument to create_resource_remote
    :return: dict with a "resources" list when at least one non-None
        resource was created, otherwise an empty dict
    """
    result = {}
    image_elements = rml.findall(xmletree.prefixtag("rml", "image"))
    if image_elements is None:
        return result

    collected = []
    for image_element in image_elements:
        if image_element is None:
            continue
        image_url = xmletree.get_element_text(image_element)
        entry = metajson_service.create_resource_remote(image_url, None, role)
        if entry is not None:
            collected.append(entry)

    if collected:
        result["resources"] = collected
    return result
def get_rml_money_and_set_key(rml, element, key):
    """element -> key

    Reads the first rml:<element> child as an amount of money: its
    "currency" attribute and its text (stored under "value").

    :param rml: element searched with find()
    :param element: local name of the rml child element to read
    :param key: key under which the money dict is returned
    :return: {key: money} when the element exists, otherwise an empty dict
    """
    money_element = rml.find(xmletree.prefixtag("rml", element))
    if money_element is None:
        return {}

    amount = {}
    # currency -> currency
    amount.update(xmletree.get_element_attribute_and_set_key(money_element, "currency", "currency"))
    # text -> value ("value" is always set, so the dict is never empty)
    amount["value"] = xmletree.get_element_text(money_element)
    return {key: amount}
def get_rml_teachings(rml):
    """teaching -> teachings

    Each rml:teaching child becomes a dict with date_begin, date_end,
    descriptions, level and title, plus a single "dgg" creator when the
    creator's agent has a name, a rec_id or identifiers.

    :param rml: element searched with findall() for rml:teaching children
    :return: dict with a "teachings" list when any rml:teaching exists,
        otherwise an empty dict
    """
    result = {}
    rml_teachings = rml.findall(xmletree.prefixtag("rml", "teaching"))
    if rml_teachings is None:
        return result

    teachings = []
    for rml_teaching in rml_teachings:
        if rml_teaching is None:
            continue
        teaching = {}
        # dateBegin -> date_begin
        teaching.update(get_rml_element_text_and_set_key(rml_teaching, "dateBegin", "date_begin"))
        # dateEnd -> date_end
        teaching.update(get_rml_element_text_and_set_key(rml_teaching, "dateEnd", "date_end"))
        # description -> descriptions[i]
        teaching.update(get_rml_textlangs_and_set_key(rml_teaching, "description", "descriptions"))
        # level -> level
        teaching.update(get_rml_element_text_and_set_key(rml_teaching, "level", "level"))
        # title -> title
        teaching.update(get_rml_element_text_and_set_key(rml_teaching, "title", "title"))

        # creators
        # name -> creators[0].agent.name
        name = get_rml_element_text(rml_teaching, "name")
        creator = creator_service.formatted_name_to_creator(name, constants.REC_CLASS_ORGUNIT, "dgg")
        if creator is None:
            creator = Creator()
            creator["agent"] = Orgunit()
            # roles is a list (as in get_rml_degrees); a bare string would
            # make creator["roles"][0] yield "d" instead of "dgg".
            creator["roles"] = ["dgg"]

        # identifiers -> creators[0].agent.rec_id or creators[0].agent.identifiers
        creator["agent"].update(get_rml_identifiers(rml_teaching))

        agent = creator["agent"]
        if "name" in agent or "rec_id" in agent or "identifiers" in agent:
            teaching["creators"] = [creator]

        teachings.append(teaching)

    if teachings:
        result["teachings"] = teachings
    return result
def get_rml_degrees(rml):
    """degree -> degrees

    Each rml:degree child becomes a dict with date_begin, date_end,
    descriptions, level and title, plus a single "dgg" creator when the
    creator's agent has a name, a rec_id or identifiers.

    :param rml: element searched with findall() for rml:degree children
    :return: dict with a "degrees" list when any rml:degree exists,
        otherwise an empty dict
    """
    result = {}
    degree_elements = rml.findall(xmletree.prefixtag("rml", "degree"))
    if degree_elements is None:
        return result

    collected = []
    for degree_element in degree_elements:
        if degree_element is None:
            continue

        entry = {}
        # dateBegin -> date_begin, dateEnd -> date_end
        for tag, key in (("dateBegin", "date_begin"), ("dateEnd", "date_end")):
            entry.update(get_rml_element_text_and_set_key(degree_element, tag, key))
        # description -> descriptions
        entry.update(get_rml_textlangs_and_set_key(degree_element, "description", "descriptions"))
        # level -> level, title -> title
        for tag in ("level", "title"):
            entry.update(get_rml_element_text_and_set_key(degree_element, tag, tag))

        # creators
        # name -> creators[0].agent.name
        granting_name = get_rml_element_text(degree_element, "name")
        grantor = creator_service.formatted_name_to_creator(
            granting_name, constants.REC_CLASS_ORGUNIT, "dgg")
        if grantor is None:
            grantor = Creator()
            grantor["agent"] = Orgunit()
            grantor["roles"] = ["dgg"]

        # identifiers -> creators[0].agent.rec_id or creators[0].agent.identifiers
        grantor["agent"].update(get_rml_identifiers(degree_element))

        if ("name" in grantor["agent"]
                or "rec_id" in grantor["agent"]
                or "identifiers" in grantor["agent"]):
            entry["creators"] = [grantor]

        collected.append(entry)

    if collected:
        result["degrees"] = collected
    return result
def get_rml_participants(rml):
    """participant -> creators

    Each rml:participant child with a name becomes a creator built by
    creator_service.formatted_name_to_creator; the record class is the
    title-cased @entityType attribute when present.

    :param rml: element searched with findall() for rml:participant
    :return: dict with a "creators" list when at least one truthy creator
        was built, otherwise an empty dict
    """
    result = {}
    participant_elements = rml.findall(xmletree.prefixtag("rml", "participant"))
    if participant_elements is None:
        return result

    collected = []
    for participant_element in participant_elements:
        if participant_element is None:
            continue
        participant_name = get_rml_element_text(participant_element, "name")
        if not participant_name:
            continue

        record_class = xmletree.get_element_attribute(participant_element, "entityType")
        if record_class:
            record_class = record_class.title()

        entry = creator_service.formatted_name_to_creator(participant_name, record_class, None)
        if entry:
            collected.append(entry)

    if collected:
        result["creators"] = collected
    return result
def get_rml_turnovers(rml):
    """turnover -> turnovers

    Each rml:turnover child becomes a dict with its "currency" and "year"
    attributes (as returned by xmletree.get_element_attribute_and_set_key)
    and its text under "value".

    :param rml: element searched with findall() for rml:turnover children
    :return: dict with a "turnovers" list when any rml:turnover exists,
        otherwise an empty dict
    """
    result = {}
    turnover_elements = rml.findall(xmletree.prefixtag("rml", "turnover"))
    if turnover_elements is None:
        return result

    collected = []
    for turnover_element in turnover_elements:
        if turnover_element is None:
            continue
        entry = {}
        for attribute in ("currency", "year"):
            entry.update(
                xmletree.get_element_attribute_and_set_key(turnover_element, attribute, attribute))
        # "value" is always set, so the entry is never empty.
        entry["value"] = xmletree.get_element_text(turnover_element)
        collected.append(entry)

    if collected:
        result["turnovers"] = collected
    return result
def get_rml_uris(rml):
    """uri -> urls

    Turn every rml:uri child of *rml* into an entry built by
    metajson_service.create_url.

    :param rml: element searched with findall() for rml:uri children
    :return: dict with a "urls" list when at least one truthy entry was
        created, otherwise an empty dict
    """
    result = {}
    uri_elements = rml.findall(xmletree.prefixtag("rml", "uri"))
    if uri_elements is None:
        return result

    collected = []
    for uri_element in uri_elements:
        if uri_element is None:
            continue
        # @preferred -> preferred
        is_preferred = xmletree.get_element_attribute_as_boolean(uri_element, "preferred")
        # @relationType -> relation_type
        relation = uri_element.get("relationType")
        # @visible -> visible
        is_visible = xmletree.get_element_attribute_as_boolean(uri_element, "visible")
        # value
        address = xmletree.get_element_text(uri_element)

        entry = metajson_service.create_url(address, is_preferred, relation, None, None, is_visible)
        if entry:
            collected.append(entry)

    if collected:
        result["urls"] = collected
    return result
def get_rml_headcounts(rml):
    """headcount -> headcounts

    Each rml:headcount child with text becomes {"value": text}, plus
    "year" when the @year attribute is present. Both are stripped.

    :param rml: element searched with findall() for rml:headcount children
    :return: dict with a "headcounts" list when at least one headcount has
        text, otherwise an empty dict
    """
    result = {}
    rml_headcounts = rml.findall(xmletree.prefixtag("rml", "headcount"))
    if rml_headcounts is None:
        return result

    headcounts = []
    for rml_headcount in rml_headcounts:
        # An empty element has text None; skip it rather than fail on
        # None.strip().
        if rml_headcount is None or rml_headcount.text is None:
            continue
        # @year -> year
        year = rml_headcount.get("year")
        # value
        headcount = {"value": rml_headcount.text.strip()}
        if year is not None:
            headcount["year"] = year.strip()
        headcounts.append(headcount)

    if headcounts:
        result["headcounts"] = headcounts
    return result
def tei_xmletree_to_metajson_list(tei_root, source, rec_id_prefix, only_first_record):
    """TEI xmletree -> MetaJSON Document list

    Generator: yields one converted document per
    TEI/text/body/listBibl/biblFull element. Organisations found under
    TEI/text/back/div are converted first and handed to every conversion.

    :param tei_root: root TEI element, or None (nothing is yielded)
    :param source: passed through to biblfull_xmletree_to_metajson
    :param rec_id_prefix: not used by this function
    :param only_first_record: when true, stop after the first document
    :return: generator of converted documents
    """
    if tei_root is None:
        return

    # TEI/text
    tei_text = tei_root.find(xmletree.prefixtag("tei", "text"))
    if tei_text is None:
        return

    laboratories = []
    projects = []
    # TEI/text/back (optional)
    tei_back = tei_text.find(xmletree.prefixtag("tei", "back"))
    if tei_back is not None:
        # TEI/text/back/div
        for tei_back_div in tei_back.findall(xmletree.prefixtag("tei", "div")):
            div_type = tei_back_div.get("type")
            if div_type == "laboratories":
                for org in tei_back_div.findall(xmletree.prefixtag("tei", "org")):
                    laboratories.append(org_laboratory_to_metajson(org))
            elif div_type == "projects":
                # Projects go into their own list; they were previously
                # appended to laboratories, leaving projects always empty.
                for org in tei_back_div.findall(xmletree.prefixtag("tei", "org")):
                    projects.append(org_project_to_metajson(org))

    # TEI/text/body
    tei_body = tei_text.find(xmletree.prefixtag("tei", "body"))
    if tei_body is None:
        return
    # TEI/text/body/listBibl
    tei_body_listbibl = tei_body.find(xmletree.prefixtag("tei", "listBibl"))
    if tei_body_listbibl is None:
        return

    # TEI/text/body/listBibl/biblFull
    for biblfull in tei_body_listbibl.findall(xmletree.prefixtag("tei", "biblFull")):
        yield biblfull_xmletree_to_metajson(biblfull, laboratories, projects, source)
        if only_first_record:
            return
def get_rml_affiliations(rml):
    """affiliation -> affiliations

    Turn every rml:affiliation child of *rml* into an entry built by
    metajson_service.create_affiliation.

    :param rml: element searched with findall() for rml:affiliation
    :return: dict with an "affiliations" list when at least one non-None
        entry was created, otherwise an empty dict
    """
    result = {}
    affiliation_elements = rml.findall(xmletree.prefixtag("rml", "affiliation"))
    if affiliation_elements is None:
        return result

    collected = []
    for affiliation_element in affiliation_elements:
        if affiliation_element is None:
            continue
        # dateBegin -> date_begin
        began = get_rml_element_text(affiliation_element, "dateBegin")
        # dateEnd -> date_end
        ended = get_rml_element_text(affiliation_element, "dateEnd")
        # description -> descriptions
        descriptions = get_rml_textlangs_as_list(affiliation_element, "description")
        # identifier -> agent.rec_id (None when missing or falsy)
        agent_ids = get_rml_identifiers(affiliation_element)
        agent_rec_id = agent_ids["rec_id"] if "rec_id" in agent_ids and agent_ids["rec_id"] else None
        # name -> agent.name
        agent_name = get_rml_element_text(affiliation_element, "name")
        # @preferred -> preferred
        is_preferred = xmletree.get_element_attribute_as_boolean(affiliation_element, "preferred")
        # relationType -> role
        role = get_rml_element_text(affiliation_element, "relationType")

        entry = metajson_service.create_affiliation(
            agent_rec_id, agent_name, role, began, ended, is_preferred, descriptions)
        if entry is not None:
            collected.append(entry)

    if collected:
        result["affiliations"] = collected
    return result
def openurl_xmletree_to_metajson_list(openurl_response, source, rec_id_prefix, only_first_record):
    """OpenURL (ssopenurl) response xmletree -> MetaJSON Document list

    For every results/result element a Document is built with:
      * "identifiers": the citation's issn and eissn, when present
      * "resources":   one remote resource per linkGroup that exposes a url
        of type "journal" or "source", completed with the holding data
        (provider name, database name, start and end dates)

    :param openurl_response: root element of the response, or None
    :param source: stored under "source" when truthy
    :param rec_id_prefix: not used by this function
    :param only_first_record: when true, stop after the first result
    :return: list of Document (empty when nothing could be read)
    """
    documents = []
    if openurl_response is not None:
        #logging.debug(type(openurl_response))
        #logging.debug(openurl_response)
        # results
        openurl_results = openurl_response.find(xmletree.prefixtag("ssopenurl", "results"))
        if openurl_results is not None:
            # result
            openurl_result_list = openurl_results.findall(xmletree.prefixtag("ssopenurl", "result"))
            if openurl_result_list:
                for openurl_result in openurl_result_list:
                    document = Document()
                    # NOTE(review): the DDI crosswalk in this file stores the
                    # source under "rec_source"; confirm "source" is intended.
                    if source:
                        document["source"] = source
                    # citation
                    openurl_citation = openurl_result.find(xmletree.prefixtag("ssopenurl", "citation"))
                    if openurl_citation is not None:
                        # issn
                        openurl_issn = openurl_citation.find(xmletree.prefixtag("ssopenurl", "issn"))
                        if openurl_issn is not None:
                            identifier_issn = Identifier()
                            identifier_issn["id_type"] = "issn"
                            identifier_issn["value"] = openurl_issn.text
                            document.add_item_to_key(identifier_issn, "identifiers")
                        # eissn
                        openurl_eissn = openurl_citation.find(xmletree.prefixtag("ssopenurl", "eissn"))
                        if openurl_eissn is not None:
                            identifier_eissn = Identifier()
                            identifier_eissn["id_type"] = "eissn"
                            identifier_eissn["value"] = openurl_eissn.text
                            document.add_item_to_key(identifier_eissn, "identifiers")
                    # linkGroups
                    openurl_linkgroups = openurl_result.find(xmletree.prefixtag("ssopenurl", "linkGroups"))
                    if openurl_linkgroups is not None:
                        # linkGroup
                        openurl_linkgroup_list = openurl_linkgroups.findall(xmletree.prefixtag("ssopenurl", "linkGroup"))
                        if openurl_linkgroup_list is not None:
                            for openurl_linkgroup in openurl_linkgroup_list:
                                # Reset per link group so values never leak
                                # from one group into the next.
                                service_name = None
                                institution_name = None
                                period_begin = None
                                period_end = None
                                url = None
                                # holdingData
                                openurl_holdingdata = openurl_linkgroup.find(xmletree.prefixtag("ssopenurl", "holdingData"))
                                if openurl_holdingdata is not None:
                                    # institution_name
                                    openurl_providername = openurl_holdingdata.find(xmletree.prefixtag("ssopenurl", "providerName"))
                                    if openurl_providername is not None:
                                        institution_name = openurl_providername.text
                                    # service_name
                                    openurl_databasename = openurl_holdingdata.find(xmletree.prefixtag("ssopenurl", "databaseName"))
                                    if openurl_databasename is not None:
                                        service_name = openurl_databasename.text
                                    # normalizedData
                                    openurl_normalizeddata = openurl_holdingdata.find(xmletree.prefixtag("ssopenurl", "normalizedData"))
                                    if openurl_normalizeddata is not None:
                                        # startDate
                                        openurl_startdate = openurl_normalizeddata.find(xmletree.prefixtag("ssopenurl", "startDate"))
                                        if openurl_startdate is not None:
                                            period_begin = openurl_startdate.text
                                        # endDate
                                        openurl_enddate = openurl_normalizeddata.find(xmletree.prefixtag("ssopenurl", "endDate"))
                                        if openurl_enddate is not None:
                                            period_end = openurl_enddate.text
                                # url: the last url of type "journal" or
                                # "source" wins; other types are ignored
                                openurl_url_list = openurl_linkgroup.findall(xmletree.prefixtag("ssopenurl", "url"))
                                if openurl_url_list is not None:
                                    for openurl_url in openurl_url_list:
                                        if openurl_url.get("type") == "journal":
                                            url = openurl_url.text
                                        elif openurl_url.get("type") == "source":
                                            url = openurl_url.text
                                # NOTE(review): nesting reconstructed from a
                                # whitespace-collapsed source; the resource is
                                # assumed to be built once per link group,
                                # after the url loop - confirm.
                                if url:
                                    resource = Resource()
                                    resource["rec_type"] = "ResourceRemote"
                                    resource["rec_state"] = "published"
                                    resource["relation_type"] = "eResource"
                                    resource["version_type"] = "publishedVersion"
                                    resource["access_rights"] = "closedAccess"
                                    resource["format_mimetype"] = "text/html"
                                    resource["url"] = url
                                    if service_name:
                                        resource["service_name"] = service_name
                                    if institution_name:
                                        resource["institution_name"] = institution_name
                                    if period_begin:
                                        resource["period_begin"] = period_begin
                                    if period_end:
                                        resource["period_end"] = period_end
                                    document.add_item_to_key(resource, "resources")
                    documents.append(document)
                    if only_first_record:
                        break
    #logging.debug(jsonbson.dumps_json(documents))
    return documents
def didl_xmletree_to_metajson(root_item, source, rec_id_prefix):
    """ DIDL Item xmletree -> MetaJSON Document

    Walks the didl Item children of root_item. An Item typed
    descriptiveMetadata supplies the Document (converted from its embedded
    MODS record by mods_crosswalk); an Item typed objectFile contributes one
    remote resource. Returns the Document with its "resources" set when both
    exist, the bare Document when there are no resources, or None when no
    MODS metadata was found.
    """
    document = None
    resources = []

    for item in root_item.findall(xmletree.prefixtag("didl", "Item")):
        # Descriptor/Statement -> item types and modification date
        item_types = []
        item_date_modified = None
        for descriptor in item.findall(xmletree.prefixtag("didl", "Descriptor")):
            for statement in descriptor.findall(xmletree.prefixtag("didl", "Statement")):
                type_node = statement.find(xmletree.prefixtag("rdf", "type"))
                if type_node is not None:
                    item_types.append(type_node.text)
                modified_node = statement.find(xmletree.prefixtag("dcterms", "modified"))
                if modified_node is not None:
                    item_date_modified = modified_node.text

        if "info:eu-repo/semantics/descriptiveMetadata" in item_types:
            # metadata: Component/Resource/mods:mods -> document
            component = item.find(xmletree.prefixtag("didl", "Component"))
            if component is None:
                continue
            metadata_resource = component.find(xmletree.prefixtag("didl", "Resource"))
            if metadata_resource is None:
                continue
            mods = metadata_resource.find(xmletree.prefixtag("mods", "mods"))
            if mods is None:
                continue
            document = mods_crosswalk.mods_xmletree_to_metajson(mods, source, rec_id_prefix)
            if item_date_modified:
                document["rec_modified_date"] = item_date_modified
            continue

        if "info:eu-repo/semantics/objectFile" not in item_types:
            continue

        # resource: Component/Resource@ref and @mimeType -> remote resource
        if "info:eu-repo/semantics/publishedVersion" in item_types:
            relation_version = "publishedVersion"
        elif "info:eu-repo/semantics/authorVersion" in item_types:
            relation_version = "authorVersion"
        else:
            relation_version = None

        url = None
        format_mimetype = None
        component = item.find(xmletree.prefixtag("didl", "Component"))
        if component is not None:
            file_resource = component.find(xmletree.prefixtag("didl", "Resource"))
            if file_resource is not None:
                url = file_resource.get("ref")
                format_mimetype = file_resource.get("mimeType")

        resources.append(
            metajson_service.create_resource_remote(
                url,
                None,  # date_last_accessed
                "publication",  # relation_type
                relation_version,
                "openAccess",  # access_rights
                "published",  # rec_state
                format_mimetype,
                None,  # rec_created_date
                item_date_modified or None,  # rec_modified_date
            )
        )

    if document and resources:
        document["resources"] = resources
    return document
def didl_xmletree_to_metajson_list(didl_root, source, rec_id_prefix, only_first_record):
    """ DIDL root xmletree -> generator of MetaJSON Documents

    Yields the result of didl_xmletree_to_metajson() for each didl Item
    returned by didl_root.findall(), in document order. Nothing is yielded
    when didl_root is None. A yielded value may be None, because
    didl_xmletree_to_metajson() returns None for an Item without MODS
    metadata.

    When only_first_record is true the generator stops after the first Item,
    as the openurl list converter does; otherwise every Item is converted.
    """
    if didl_root is None:
        return

    for item in didl_root.findall(xmletree.prefixtag("didl", "Item")):
        yield didl_xmletree_to_metajson(item, source, rec_id_prefix)
        if only_first_record:
            # Honour the flag instead of silently converting every Item.
            return
def openurl_xmletree_to_metajson_list(openurl_response, source, rec_id_prefix, only_first_record):
    """ ssopenurl response xmletree -> list of MetaJSON Documents

    Each results/result element becomes one Document: citation/issn and
    citation/eissn become identifiers, and each linkGroup with a "journal" or
    "source" url becomes a remote Resource described by its holdingData.
    An empty list is returned for a None response or a response without a
    results element. Only the first result is converted when
    only_first_record is true. rec_id_prefix is not used by this function.
    """
    documents = []
    if openurl_response is None:
        return documents

    prefix = "ssopenurl"

    def child_text(parent, name):
        # Text of the named child, or None when the child is absent.
        child = parent.find(xmletree.prefixtag(prefix, name))
        return None if child is None else child.text

    openurl_results = openurl_response.find(xmletree.prefixtag(prefix, "results"))
    if openurl_results is None:
        return documents

    for openurl_result in openurl_results.findall(xmletree.prefixtag(prefix, "result")):
        document = Document()
        if source:
            document["source"] = source

        # citation -> identifiers (issn first, then eissn)
        openurl_citation = openurl_result.find(xmletree.prefixtag(prefix, "citation"))
        if openurl_citation is not None:
            issn_node = openurl_citation.find(xmletree.prefixtag(prefix, "issn"))
            if issn_node is not None:
                issn = Identifier()
                issn["id_type"] = "issn"
                issn["value"] = issn_node.text
                document.add_item_to_key(issn, "identifiers")

            eissn_node = openurl_citation.find(xmletree.prefixtag(prefix, "eissn"))
            if eissn_node is not None:
                eissn = Identifier()
                eissn["id_type"] = "eissn"
                eissn["value"] = eissn_node.text
                document.add_item_to_key(eissn, "identifiers")

        # linkGroups/linkGroup -> resources
        openurl_linkgroups = openurl_result.find(xmletree.prefixtag(prefix, "linkGroups"))
        linkgroups = []
        if openurl_linkgroups is not None:
            linkgroups = openurl_linkgroups.findall(xmletree.prefixtag(prefix, "linkGroup"))

        for linkgroup in linkgroups:
            holding = {
                "institution_name": None,
                "service_name": None,
                "period_begin": None,
                "period_end": None,
            }

            # holdingData -> provider, database and normalized coverage dates
            holdingdata = linkgroup.find(xmletree.prefixtag(prefix, "holdingData"))
            if holdingdata is not None:
                holding["institution_name"] = child_text(holdingdata, "providerName")
                holding["service_name"] = child_text(holdingdata, "databaseName")
                normalizeddata = holdingdata.find(xmletree.prefixtag(prefix, "normalizedData"))
                if normalizeddata is not None:
                    holding["period_begin"] = child_text(normalizeddata, "startDate")
                    holding["period_end"] = child_text(normalizeddata, "endDate")

            # url: keep the text of the last url typed "journal" or "source"
            url = None
            for openurl_url in linkgroup.findall(xmletree.prefixtag(prefix, "url")):
                url_type = openurl_url.get("type")
                if url_type == "journal" or url_type == "source":
                    url = openurl_url.text

            if url:
                resource = Resource()
                resource["rec_type"] = "ResourceRemote"
                resource["rec_state"] = "published"
                resource["relation_type"] = "eResource"
                resource["version_type"] = "publishedVersion"
                resource["access_rights"] = "closedAccess"
                resource["format_mimetype"] = "text/html"
                resource["url"] = url
                # Empty holding values are left out of the resource.
                for key in ("service_name", "institution_name", "period_begin", "period_end"):
                    if holding[key]:
                        resource[key] = holding[key]
                document.add_item_to_key(resource, "resources")

        documents.append(document)
        if only_first_record:
            break

    return documents
def didl_xmletree_to_metajson(root_item, source, rec_id_prefix):
    """ DIDL Item xmletree -> MetaJSON Document

    Each didl Item under root_item is classified by the rdf:type values in
    its Descriptor/Statement elements. A descriptiveMetadata Item provides
    the Document through mods_crosswalk; an objectFile Item provides one
    remote resource. The resources are attached to the Document only when
    both are present. Returns None when no MODS record was converted.
    """
    descriptive_metadata = 'info:eu-repo/semantics/descriptiveMetadata'
    object_file = 'info:eu-repo/semantics/objectFile'
    published_version = 'info:eu-repo/semantics/publishedVersion'
    author_version = 'info:eu-repo/semantics/authorVersion'

    def find_component_resource(didl_item):
        # Component/Resource of an Item, or None when either level is missing.
        component = didl_item.find(xmletree.prefixtag("didl", "Component"))
        if component is None:
            return None
        return component.find(xmletree.prefixtag("didl", "Resource"))

    document = None
    resources = []

    for item in root_item.findall(xmletree.prefixtag("didl", "Item")):
        # item types and last modification date
        item_types = []
        item_date_modified = None
        for descriptor in item.findall(xmletree.prefixtag("didl", "Descriptor")):
            for statement in descriptor.findall(xmletree.prefixtag("didl", "Statement")):
                rdf_type = statement.find(xmletree.prefixtag("rdf", "type"))
                if rdf_type is not None:
                    item_types.append(rdf_type.text)
                dcterms_modified = statement.find(xmletree.prefixtag("dcterms", "modified"))
                if dcterms_modified is not None:
                    item_date_modified = dcterms_modified.text

        if descriptive_metadata in item_types:
            # metadata
            metadata_holder = find_component_resource(item)
            mods = None
            if metadata_holder is not None:
                mods = metadata_holder.find(xmletree.prefixtag("mods", "mods"))
            if mods is not None:
                document = mods_crosswalk.mods_xmletree_to_metajson(mods, source, rec_id_prefix)
                if item_date_modified:
                    document["rec_modified_date"] = item_date_modified

        elif object_file in item_types:
            # resource
            relation_version = None
            if published_version in item_types:
                relation_version = "publishedVersion"
            elif author_version in item_types:
                relation_version = "authorVersion"

            rec_modified_date = item_date_modified if item_date_modified else None

            url = None
            format_mimetype = None
            file_holder = find_component_resource(item)
            if file_holder is not None:
                url = file_holder.get("ref")
                format_mimetype = file_holder.get("mimeType")

            # Positional order: url, date_last_accessed, relation_type,
            # relation_version, access_rights, rec_state, format_mimetype,
            # rec_created_date, rec_modified_date.
            resource = metajson_service.create_resource_remote(
                url, None, "publication", relation_version, "openAccess",
                "published", format_mimetype, None, rec_modified_date)
            resources.append(resource)

    if document and resources:
        document["resources"] = resources

    return document
def biblfull_xmletree_to_metajson(biblfull, laboratories, projects, source):
    """ biblFull xmletree -> MetaJSON Document

    Returns None when biblfull is None. Otherwise builds a Document holding
    the languages declared in profileDesc/langUsage/language@ident and the
    titles produced by get_tei_titles_to_metason() from titleStmt, using the
    first declared language as the document language. The document is also
    printed through metajson_service before being returned.

    A biblFull without profileDesc or langUsage is accepted and simply yields
    no "languages" key. laboratories, projects and source are accepted but
    not used here.
    """
    if biblfull is None:
        return None

    document = Document()

    # titleStmt
    tei_titlestmt = biblfull.find(xmletree.prefixtag("tei", "titleStmt"))

    # NOTE: editionStmt, extent, publicationStmt, seriesStmt, notesStmt,
    # sourceDesc and profileDesc/textClass (keywords, classCode) are not
    # mapped to the document yet.

    # profileDesc/langUsage/language; each level may be missing
    tei_languages = []
    tei_profiledesc = biblfull.find(xmletree.prefixtag("tei", "profileDesc"))
    if tei_profiledesc is not None:
        tei_langusage = tei_profiledesc.find(xmletree.prefixtag("tei", "langUsage"))
        if tei_langusage is not None:
            tei_languages = tei_langusage.findall(xmletree.prefixtag("tei", "language"))

    # language
    doc_language = None
    languages = [tei_language.get("ident") for tei_language in tei_languages]
    if languages:
        document["languages"] = languages
        doc_language = languages[0]

    # title
    document.update(get_tei_titles_to_metason(tei_titlestmt, doc_language))

    # Output kept from the original implementation.
    metajson_service.pretty_print_document(document)
    metajson_service.print_document(document)
    return document
def get_rml_self_archiving_policy(rml):
    """ ckbData/romeoPublisher -> self_archiving_policy

    Returns {"self_archiving_policy": {...}} built from the romeoPublisher
    element found under ckbData, or an empty dict when ckbData or
    romeoPublisher is missing or nothing could be extracted.
    """
    def rml_tag(name):
        return xmletree.prefixtag("rml", name)

    result = {}

    ckbdata = rml.find(rml_tag("ckbData"))
    if ckbdata is None:
        return result

    romeo = ckbdata.find(rml_tag("romeoPublisher"))
    if romeo is None:
        return result

    policy = {}

    # Publisher details (alias, homeurl, id, name) are deliberately not
    # repeated inside the policy.

    # conditions/condition -> conditions[]
    conditions_node = romeo.find(rml_tag("conditions"))
    if conditions_node is not None:
        condition_texts = [
            xmletree.get_element_text(condition_node)
            for condition_node in conditions_node.findall(rml_tag("condition"))
        ]
        conditions = [text for text in condition_texts if text]
        if conditions:
            policy["conditions"] = conditions

    # copyright -> copyright
    policy.update(get_rml_element_text_and_set_key(romeo, "copyright", "copyright"))

    # copyrightlinks/copyrightlink -> copyright_urls[]
    links_node = romeo.find(rml_tag("copyrightlinks"))
    if links_node is not None:
        copyright_urls = []
        for link_node in links_node.findall(rml_tag("copyrightlink")):
            link_text = xmletree.get_element_text(link_node.find(rml_tag("copyrightlinktext")))
            link_url = xmletree.get_element_text(link_node.find(rml_tag("copyrightlinkurl")))
            copyright_urls.append(
                metajson_service.create_url(link_url, None, None, link_text, None, None))
        if copyright_urls:
            policy["copyright_urls"] = copyright_urls

    # paidaccess -> paid_access (paidaccessnotes is not mapped)
    paidaccess_node = romeo.find(rml_tag("paidaccess"))
    if paidaccess_node is not None:
        paid_access = {}
        # paidaccessname -> label
        paid_access.update(get_rml_element_text_and_set_key(paidaccess_node, "paidaccessname", "label"))
        # paidaccessurl -> url
        paid_access.update(get_rml_element_text_and_set_key(paidaccess_node, "paidaccessurl", "url"))
        # Stored even when empty.
        policy["paid_access"] = paid_access

    # postprints -> postprint
    postprints_node = romeo.find(rml_tag("postprints"))
    if postprints_node is not None:
        postprint = {}
        # postarchiving -> possibility
        postprint.update(get_rml_element_text_and_set_key(postprints_node, "postarchiving", "possibility"))
        # postrestrictions -> restrictions
        postprint.update(get_rml_textlangs_and_set_key(postprints_node, "postrestrictions", "restrictions"))
        policy["postprint"] = postprint

    # preprints -> preprint
    preprints_node = romeo.find(rml_tag("preprints"))
    if preprints_node is not None:
        preprint = {}
        # prearchiving -> possibility
        preprint.update(get_rml_element_text_and_set_key(preprints_node, "prearchiving", "possibility"))
        # prerestrictions -> pre_restrictions
        # NOTE(review): postprint stores its restrictions under
        # "restrictions"; confirm whether "pre_restrictions" is intended.
        preprint.update(get_rml_textlangs_and_set_key(preprints_node, "prerestrictions", "pre_restrictions"))
        policy["preprint"] = preprint

    # romeocolour -> romeo_color
    policy.update(get_rml_element_text_and_set_key(romeo, "romeocolour", "romeo_color"))

    if policy:
        result["self_archiving_policy"] = policy
    return result
def get_mods_element_text_as_boolean(rml, element):
    """ mods child element text -> boolean

    Looks up the child named by element (in the "mods" namespace prefix)
    under the given parent element, which is passed as rml, and hands it to
    xmletree.get_element_text_as_boolean().
    """
    return xmletree.get_element_text_as_boolean(
        rml.find(xmletree.prefixtag("mods", element)))