コード例 #1
0
ファイル: mdl_api_fetcher.py プロジェクト: dpla/ingestion
    def add_to_collections(self, coll, data_provider=None):
        """Store an annotated copy of ``coll`` in ``self.collections``.

        The copy gains the keys ``_id``, ``id``, ``@id`` and ``ingestType``
        and is stored under its ``title``.

        coll          -- collection mapping; must contain a "title" key
        data_provider -- name used when building the id; falls back to
                         self.contributor["name"] when None
        """
        if data_provider is None:
            data_provider = self.contributor["name"]

        couch_id_str = "%s--%s" % (data_provider, coll["title"])
        # Each space in the built id is replaced by a double underscore.
        _id = couch_id_builder(self.provider, couch_id_str).replace(" ", "__")
        hashed_id = hashlib.md5(_id.encode("utf-8")).hexdigest()

        coll_to_update = coll.copy()
        coll_to_update.update({
            "_id": _id,
            "id": hashed_id,
            "@id": "http://dp.la/api/collections/" + hashed_id,
            "ingestType": "collection"
        })

        # A dict description carrying a "dc" entry is collapsed to the value
        # found at description/dc/description.
        desc = coll_to_update.get("description")
        if isinstance(desc, dict) and "dc" in desc:
            coll_to_update["description"] = getprop(
                coll_to_update, "description/dc/description")

        self.collections[coll_to_update["title"]] = coll_to_update
コード例 #2
0
ファイル: nypl_fetcher.py プロジェクト: dpla/ingestion
    def fetch_sets(self):
        """Collect the sets to fetch, keyed by set id.

        When self.sets is truthy its entries are used as the set ids.
        Otherwise the ids are read from the "response/uuids/uuid" property
        of the content requested from self.get_sets_url.

        Returns an (error, sets) tuple; every value in sets has the form
        {"id": set_id}.
        """
        if self.sets:
            set_ids = self.sets
        else:
            url = self.get_sets_url
            error, content = self.request_content_from(url)
            if error is None:
                error, content = self.extract_content(content, url)
            if error is not None:
                # Either the request or the extraction failed
                return error, {}
            set_ids = list(getprop(content, "response/uuids/uuid"))

        sets = {set_id: {"id": set_id} for set_id in set_ids}
        if sets:
            return None, sets
        return "Error, no sets from URL %s" % url, sets
コード例 #3
0
ファイル: mdl_api_fetcher.py プロジェクト: mlhale7/ingestion
    def add_to_collections(self, coll, data_provider=None):
        """Add a collection to self.collections under its title.

        A copy of ``coll`` is stored, extended with a CouchDB-style ``_id``,
        the MD5 hex digest of that id as ``id``, an ``@id`` URL built from
        the digest, and ``ingestType`` set to "collection".

        When data_provider is None, self.contributor["name"] is used.
        """
        provider_name = data_provider
        if provider_name is None:
            provider_name = self.contributor["name"]

        raw_id = couch_id_builder(
            self.provider, "%s--%s" % (provider_name, coll["title"]))
        # Splitting on single spaces and re-joining turns every space into
        # a double underscore.
        couch_id = "__".join(raw_id.split(" "))
        digest = hashlib.md5(couch_id.encode("utf-8")).hexdigest()

        extra_fields = {
            "_id": couch_id,
            "id": digest,
            "@id": "http://dp.la/api/collections/" + digest,
            "ingestType": "collection"
        }
        entry = coll.copy()
        entry.update(extra_fields)

        description = entry.get("description")
        if isinstance(description, dict) and "dc" in description:
            # Replace the nested description with the value found at
            # description/dc/description
            entry["description"] = getprop(
                entry, "description/dc/description")

        self.collections[entry["title"]] = entry
コード例 #4
0
ファイル: nara_fetcher.py プロジェクト: mlhale7/ingestion
    def extract_records(self, file_path):
        """Yield a single (errors, records) pair for the file at file_path.

        The "archival-description" entry of the extracted content is used as
        the record. Its "_id" is set from "arc-id", and it is linked to the
        first hierarchy item whose level of description is "record group" or
        "collection"; when there is none, the placeholder collection id
        "placeholder" with an empty title is used.

        Any ordinary exception raised while building the record is reported
        in ``errors`` instead of propagating. The reported message always
        names a missing arc id, even if the underlying failure was something
        else. Interpreter-level exits (SystemExit, KeyboardInterrupt) are
        not swallowed.
        """
        errors = []
        records = []

        error, content = self.extract_xml_content(file_path)
        if error is None:
            record = content.get("archival-description")

            try:
                record["_id"] = record["arc-id"]

                hier_items = getprop(record, "hierarchy/hierarchy-item")
                # Placeholder collection id and title
                coll_id = "placeholder"
                coll_title = ""
                # Check if record has a collection
                for hitem in iterify(hier_items):
                    if hitem["hierarchy-item-lod"].lower() in ("record group",
                                                               "collection"):
                        coll_id = hitem["hierarchy-item-id"]
                        coll_title = hitem["hierarchy-item-title"]
                        break
                self.create_collection_record(coll_id, coll_title)
                self.add_collection_to_item_record(coll_id, record)
                records.append(record)
            except Exception:
                # Deliberately broad: a malformed record is reported as an
                # error rather than aborting the fetch. "except Exception"
                # (not a bare "except") lets SystemExit and
                # KeyboardInterrupt through.
                errors.append("Could not find 'arc_id' in file %s" % file_path)
        else:
            errors.append(error)

        yield errors, records
コード例 #5
0
ファイル: mdl_api_fetcher.py プロジェクト: dpla/ingestion
    def mdl_extract_records(self, content):
        """Pull the records out of an MDL response.

        Sets self.total_records from the "numFound" property the first time
        it is unset/falsy. Each record found under "docs" gets its "_id" set
        from its "record_id" property and is passed to
        self.get_collection_for_record.

        Returns an (error, records) tuple; error is None when records were
        found, otherwise records is an empty list.
        """
        if not self.total_records:
            self.total_records = getprop(content, "numFound")

        docs = getprop(content, "docs")
        if not docs:
            return "No records found in MDL content: %s" % content, []

        docs = iterify(docs)
        for doc in docs:
            doc["_id"] = getprop(doc, "record_id")
            self.get_collection_for_record(doc)
        return None, docs
コード例 #6
0
ファイル: mdl_api_fetcher.py プロジェクト: mlhale7/ingestion
    def get_collection_for_record(self, record):
        """Look up (registering if needed) the collections of a record.

        Reads "record/sourceResource/collection" and
        "record/sourceResource/dataProvider" from the record. Every
        non-empty collection title missing from self.collections triggers
        self.add_to_collections before the entry is looked up.

        Returns None when the record has no collection or no title, the
        single collection entry when exactly one title matched, and a list
        of entries otherwise.
        """
        coll = getprop(record, "record/sourceResource/collection")
        data_provider = getprop(record, "record/sourceResource/dataProvider")
        if not coll:
            return None

        coll_title = getprop(coll, "title")
        if not coll_title:
            return None

        matched = []
        for title in iterify(coll_title):
            if not title:
                # Skip empty titles
                continue
            if title not in self.collections:
                self.add_to_collections(coll, data_provider)
            matched.append(self.collections[title])

        return matched[0] if len(matched) == 1 else matched
コード例 #7
0
ファイル: mdl_api_fetcher.py プロジェクト: dpla/ingestion
    def get_collection_for_record(self, record):
        """Return the known collection entry (or entries) for a record.

        The collection and data provider are read from the record's
        "record/sourceResource" properties. Titles not yet present in
        self.collections are added through self.add_to_collections.

        Result: None if there is no collection or it has no title; one
        entry if a single title was resolved; a list of entries in every
        other case.
        """
        coll = getprop(record, "record/sourceResource/collection")
        data_provider = getprop(record, "record/sourceResource/dataProvider")

        titles = getprop(coll, "title") if coll else None
        if not titles:
            return None

        found = []
        # filter(None, ...) drops falsy titles
        for title in filter(None, iterify(titles)):
            if title not in self.collections:
                self.add_to_collections(coll, data_provider)
            found.append(self.collections[title])

        if len(found) != 1:
            return found
        return found[0]
コード例 #8
0
ファイル: primo_fetcher.py プロジェクト: dpla/ingestion
    def primo_extract_records(self, content):
        """Pull the records out of a Primo search response.

        Sets self.total_records from the TOTALHITS property the first time
        it is unset/falsy. Each record found gets its "_id" set from
        "PrimoNMBib/record/control/recordid".

        Returns an (error, records) tuple. When no records are found,
        records is an empty list and error is the message carried by the
        response, or a generic message if there is none.
        """
        if not self.total_records:
            self.total_records = getprop(
                content,
                "sear:SEGMENTS/sear:JAGROOT/sear:RESULT"
                "/sear:DOCSET/TOTALHITS")

        docs = getprop(
            content,
            "sear:SEGMENTS/sear:JAGROOT/sear:RESULT"
            "/sear:DOCSET/sear:DOC")
        if docs:
            docs = iterify(docs)
            for doc in docs:
                doc["_id"] = getprop(
                    doc, "PrimoNMBib/record/control/recordid")
            return None, docs

        # Elements in the error response are not namespaced
        message = getprop(content, "SEGMENTS/JAGROOT/RESULT/ERROR/MESSAGE")
        if not message:
            message = "No records found in Primo content: %s" % content
        return message, []
コード例 #9
0
    def primo_extract_records(self, content):
        """Extract records from Primo content.

        The total hit count is stored in self.total_records if that
        attribute is still falsy. Records are read from the sear:DOC
        property and tagged with an "_id" taken from their
        "PrimoNMBib/record/control/recordid" property.

        Returns (error, records): (None, records) on success, otherwise
        (message, []).
        """
        total_hits_path = ("sear:SEGMENTS/sear:JAGROOT/sear:RESULT"
                           "/sear:DOCSET/TOTALHITS")
        records_path = ("sear:SEGMENTS/sear:JAGROOT/sear:RESULT"
                        "/sear:DOCSET/sear:DOC")

        if not self.total_records:
            self.total_records = getprop(content, total_hits_path)

        found = getprop(content, records_path)
        if not found:
            # Elements in the error response are not namespaced
            error = (getprop(content, "SEGMENTS/JAGROOT/RESULT/ERROR/MESSAGE")
                     or "No records found in Primo content: %s" % content)
            return error, []

        found = iterify(found)
        for entry in found:
            entry["_id"] = getprop(entry,
                                   "PrimoNMBib/record/control/recordid")
        return None, found
コード例 #10
0
ファイル: nypl_fetcher.py プロジェクト: dpla/ingestion
    def request_records(self, content, set_id):
        """Generator that fetches the full record for each capture in a page.

        content -- one page of results; "request/totalPages",
                   "request/page" and "response/capture" are read from it
        set_id  -- not used in this method body

        Yields (error, records, request_more) tuples:
          * once for every item whose request or extraction failed, with
            the records accumulated so far (the same list object is reused
            across yields)
          * once more after the loop; error is then the result for the last
            item processed, or None if there were no items

        request_more is True while the current page differs from the total
        page count.

        NOTE(review): len(items) in the progress message assumes
        "response/capture" is a sized collection of items; if it can be a
        single mapping or None the count would be wrong or raise -- confirm
        against the API responses.
        """
        # Advance the page counter for the next request
        self.endpoint_url_params["page"] += 1
        error = None
        total_pages = getprop(content, "request/totalPages")
        current_page = getprop(content, "request/page")
        request_more = total_pages != current_page
        if not request_more:
            # Reset the page for the next collection
            self.endpoint_url_params["page"] = 1

        records = []
        items = getprop(content, "response/capture")
        count = 0

        for item in iterify(items):
            count += 1
            record_url = self.get_records_url.format(item["uuid"])
            # From here on "content" is rebound to the per-item response;
            # the page-level values needed later were read above.
            error, content = self.request_content_from(record_url)
            if error is None:
                error, content = self.extract_content(content, record_url)

            if error is None:
                record = getprop(content, "response/mods")
                record["_id"] = item["uuid"]
                # Carry selected fields of the capture item over to the record
                record["tmp_image_id"] = item.get("imageID")
                record["tmp_item_link"] = item.get("itemLink")
                record["tmp_high_res_link"] = item.get("highResLink")
                record["tmp_rights_statement"] = \
                        getprop(content, "response/rightsStatement")
                record["rightsStatementURI"] = \
                        getprop(content, "response/rightsStatementURI")
                records.append(record)

            if error is not None:
                # Report the failure but keep going with the remaining
                # items. The message below is printed only once the consumer
                # resumes the generator after this yield.
                yield error, records, request_more
                print "Error %s, " % error +\
                      "but fetched %s of %s records from page %s of %s" % \
                      (count, len(items), current_page, total_pages)

        yield error, records, request_more