def add_to_collections(self, coll, data_provider=None):
    """Store an id-stamped copy of ``coll`` in ``self.collections``.

    The copy gains four fields: ``_id`` (the value returned by
    ``couch_id_builder`` for "<data provider>--<title>", with every space
    replaced by a double underscore), ``id`` (the MD5 hex digest of
    ``_id``), ``@id`` (the DPLA collections API URL for ``id``) and
    ``ingestType``. The copy is stored under its title; ``coll`` itself
    is left unmodified.

    ``data_provider`` falls back to ``self.contributor["name"]`` when it
    is None.
    """
    provider_name = data_provider
    if provider_name is None:
        provider_name = self.contributor["name"]

    id_source = "%s--%s" % (provider_name, coll["title"])
    # Spaces are swapped for double underscores in the built id.
    couch_id = couch_id_builder(self.provider, id_source).replace(" ", "__")
    digest = hashlib.md5(couch_id.encode("utf-8")).hexdigest()

    entry = coll.copy()
    entry.update({
        "_id": couch_id,
        "id": digest,
        "@id": "http://dp.la/api/collections/" + digest,
        "ingestType": "collection",
    })

    # A dict description carrying a "dc" key is replaced by the value
    # found at description/dc/description.
    description = entry.get("description")
    if description and isinstance(description, dict) and "dc" in description:
        entry["description"] = getprop(entry, "description/dc/description")

    self.collections[entry["title"]] = entry
def fetch_sets(self):
    """Collect the sets to harvest.

    The set ids are taken from ``self.sets`` when that is non-empty.
    Otherwise they are requested from ``self.get_sets_url`` and read from
    ``response/uuids/uuid`` in the extracted content.

    Returns an (error, sets) tuple in which ``sets`` maps every set id to
    ``{"id": <set id>}``. When the request or the extraction fails, that
    error is returned together with an empty dict.
    """
    if self.sets:
        ids = self.sets
    else:
        url = self.get_sets_url
        error, content = self.request_content_from(url)
        if error is not None:
            return error, {}

        error, content = self.extract_content(content, url)
        if error is not None:
            return error, {}

        ids = list(getprop(content, "response/uuids/uuid"))

    sets = dict((set_id, {"id": set_id}) for set_id in ids)

    error = None
    if not sets:
        error = "Error, no sets from URL %s" % url
    return error, sets
def add_to_collections(self, coll, data_provider=None):
    """Store an id-stamped copy of ``coll`` in ``self.collections``.

    The copy gains four fields: ``_id`` (the value returned by
    ``couch_id_builder`` for "<data provider>--<title>", with every space
    replaced by a double underscore), ``id`` (the MD5 hex digest of
    ``_id``), ``@id`` (the DPLA collections API URL for ``id``) and
    ``ingestType``. The copy is stored under its title; ``coll`` itself
    is left unmodified.

    ``data_provider`` falls back to ``self.contributor["name"]`` when it
    is None.
    """
    provider_name = data_provider
    if provider_name is None:
        provider_name = self.contributor["name"]

    id_source = "%s--%s" % (provider_name, coll["title"])
    # Spaces are swapped for double underscores in the built id.
    couch_id = couch_id_builder(self.provider, id_source).replace(" ", "__")
    digest = hashlib.md5(couch_id.encode("utf-8")).hexdigest()

    entry = coll.copy()
    entry.update({
        "_id": couch_id,
        "id": digest,
        "@id": "http://dp.la/api/collections/" + digest,
        "ingestType": "collection",
    })

    # A dict description carrying a "dc" key is replaced by the value
    # found at description/dc/description.
    description = entry.get("description")
    if description and isinstance(description, dict) and "dc" in description:
        entry["description"] = getprop(entry, "description/dc/description")

    self.collections[entry["title"]] = entry
def extract_records(self, file_path):
    """Yield one (errors, records) pair for the XML file at ``file_path``.

    The "archival-description" element of the extracted content becomes
    the record. Its ``_id`` is set from its "arc-id" value, and the first
    hierarchy item whose level of description is "record group" or
    "collection" supplies the collection id and title (a placeholder id
    and an empty title are used when there is none).

    ``errors`` holds the extraction error, or a single message when
    building the record failed; in both cases ``records`` is empty.
    Otherwise ``records`` holds the one record.
    """
    errors = []
    records = []

    error, content = self.extract_xml_content(file_path)
    if error is not None:
        errors.append(error)
    else:
        record = content.get("archival-description")
        try:
            record["_id"] = record["arc-id"]

            hier_items = getprop(record, "hierarchy/hierarchy-item")
            # Placeholder collection id and title
            coll_id = "placeholder"
            coll_title = ""
            # Check if record has a collection
            for hitem in iterify(hier_items):
                level = hitem["hierarchy-item-lod"].lower()
                if level in ("record group", "collection"):
                    coll_id = hitem["hierarchy-item-id"]
                    coll_title = hitem["hierarchy-item-title"]
                    break

            self.create_collection_record(coll_id, coll_title)
            self.add_collection_to_item_record(coll_id, record)
            records.append(record)
        except Exception:
            # Only ordinary errors are turned into an error entry, so that
            # KeyboardInterrupt and SystemExit still propagate. The message
            # text is kept unchanged even though the failure may be
            # something other than a missing "arc-id".
            errors.append("Could not find 'arc_id' in file %s" % file_path)

    yield errors, records
def mdl_extract_records(self, content):
    """Pull the records out of one MDL response.

    ``self.total_records`` is filled from "numFound" while it is still
    unset. Every entry under "docs" gets its ``_id`` from its "record_id"
    value and is passed to ``self.get_collection_for_record``.

    Returns an (error, records) tuple: ``(None, records)`` when "docs" is
    non-empty, otherwise an error message and an empty list.
    """
    if not self.total_records:
        self.total_records = getprop(content, "numFound")

    docs = getprop(content, "docs")
    if not docs:
        return ("No records found in MDL content: %s" % content), []

    docs = iterify(docs)
    for doc in docs:
        doc["_id"] = getprop(doc, "record_id")
        self.get_collection_for_record(doc)
    return None, docs
def get_collection_for_record(self, record):
    """Look up the stored collection(s) for ``record``.

    The collection is read from "record/sourceResource/collection" and
    the data provider from "record/sourceResource/dataProvider". For each
    non-empty title of the collection, an entry is first added through
    ``self.add_to_collections`` when ``self.collections`` has none under
    that title.

    Returns None when the record has no collection or the collection has
    no title, the single ``self.collections`` entry when exactly one was
    gathered, and otherwise the list of gathered entries.
    """
    coll = getprop(record, "record/sourceResource/collection")
    data_provider = getprop(record, "record/sourceResource/dataProvider")
    if not coll:
        return None

    coll_title = getprop(coll, "title")
    if not coll_title:
        return None

    matched = []
    for title in [t for t in iterify(coll_title) if t]:
        if title not in self.collections:
            self.add_to_collections(coll, data_provider)
        matched.append(self.collections[title])

    return matched[0] if len(matched) == 1 else matched
def primo_extract_records(self, content):
    """Pull the records out of one Primo response.

    ``self.total_records`` is filled from the DOCSET's TOTALHITS value
    while it is still unset. Every DOC entry gets its ``_id`` from
    "PrimoNMBib/record/control/recordid".

    Returns an (error, records) tuple: ``(None, records)`` when DOC
    entries are present, otherwise the response's error message (or a
    generic one when it has none) and an empty list.
    """
    if not self.total_records:
        self.total_records = getprop(
            content,
            "sear:SEGMENTS/sear:JAGROOT/sear:RESULT/sear:DOCSET/TOTALHITS")

    docs = getprop(
        content,
        "sear:SEGMENTS/sear:JAGROOT/sear:RESULT/sear:DOCSET/sear:DOC")
    if docs:
        docs = iterify(docs)
        for doc in docs:
            doc["_id"] = getprop(doc, "PrimoNMBib/record/control/recordid")
        return None, docs

    # Elements in the error response are not namespaced
    message = getprop(content, "SEGMENTS/JAGROOT/RESULT/ERROR/MESSAGE")
    if not message:
        message = "No records found in Primo content: %s" % content
    return message, []
def primo_extract_records(self, content):
    """Pull the records out of one Primo response.

    ``self.total_records`` is filled from the DOCSET's TOTALHITS value
    while it is still unset. Every DOC entry gets its ``_id`` from
    "PrimoNMBib/record/control/recordid".

    Returns an (error, records) tuple: ``(None, records)`` when DOC
    entries are present, otherwise the response's error message (or a
    generic one when it has none) and an empty list.
    """
    if not self.total_records:
        self.total_records = getprop(
            content,
            "sear:SEGMENTS/sear:JAGROOT/sear:RESULT/sear:DOCSET/TOTALHITS")

    docs = getprop(
        content,
        "sear:SEGMENTS/sear:JAGROOT/sear:RESULT/sear:DOCSET/sear:DOC")
    if docs:
        docs = iterify(docs)
        for doc in docs:
            doc["_id"] = getprop(doc, "PrimoNMBib/record/control/recordid")
        return None, docs

    # Elements in the error response are not namespaced
    message = getprop(content, "SEGMENTS/JAGROOT/RESULT/ERROR/MESSAGE")
    if not message:
        message = "No records found in Primo content: %s" % content
    return message, []
def request_records(self, content, set_id):
    """Yield (error, records, request_more) tuples for one result page.

    ``self.endpoint_url_params["page"]`` is incremented, then reset to 1
    when the page's "request/totalPages" equals its "request/page"
    (``request_more`` is False in that case).

    For every capture under "response/capture" the record is requested
    from ``self.get_records_url`` formatted with the capture's uuid. On
    success the "response/mods" value is tagged with the capture's uuid,
    image/link fields and the rights statement values, then appended to
    the shared ``records`` list. On failure the error is yielded at once
    with the records gathered so far, and a progress line is printed once
    the generator is resumed.

    A final tuple is always yielded, carrying the error status of the
    last capture processed (None when there were no captures).

    ``set_id`` is not used by this method.
    """
    self.endpoint_url_params["page"] += 1
    error = None

    last_page = getprop(content, "request/totalPages")
    this_page = getprop(content, "request/page")
    request_more = last_page != this_page
    if not request_more:
        # Reset the page for the next collection
        self.endpoint_url_params["page"] = 1

    records = []
    captures = getprop(content, "response/capture")
    for position, capture in enumerate(iterify(captures), 1):
        uuid = capture["uuid"]
        record_url = self.get_records_url.format(uuid)

        error, payload = self.request_content_from(record_url)
        if error is None:
            error, payload = self.extract_content(payload, record_url)

        if error is not None:
            yield error, records, request_more
            print("Error %s, " % error +
                  "but fetched %s of %s records from page %s of %s" %
                  (position, len(captures), this_page, last_page))
            continue

        record = getprop(payload, "response/mods")
        record["_id"] = uuid
        record["tmp_image_id"] = capture.get("imageID")
        record["tmp_item_link"] = capture.get("itemLink")
        record["tmp_high_res_link"] = capture.get("highResLink")
        record["tmp_rights_statement"] = getprop(
            payload, "response/rightsStatement")
        record["rightsStatementURI"] = getprop(
            payload, "response/rightsStatementURI")
        records.append(record)

    yield error, records, request_more