Example #1
0
 def test_create_and_delete_document(self):
     """Round-trip a HarvestedDocument: save it, check it, remove it.

     The document is built from the first example file, saved, and its
     ``url`` and ``content`` are compared with the values it was built
     from. It is then passed to ``delete_commit``; afterwards fetching it
     by id via ``HarvestedDocument.get`` is expected to raise.
     """
     source_url = self.gemini_example.url_for(0)
     raw_content = self.gemini_example.get_from_url(source_url)
     doc = HarvestedDocument(url=source_url, content=raw_content)
     doc.save()
     # Capture the id before deleting; it is needed for the final lookup.
     saved_id = doc.id
     for attr_name, expected in (("url", source_url), ("content", raw_content)):
         self.assert_equal(getattr(doc, attr_name), expected)
     self.delete_commit(doc)
     self.assert_raises(Exception, HarvestedDocument.get, saved_id)
Example #2
0
 def assert_read_values(self, example_index, expect_values):
     """Check the values read from one example document.

     Fetches the example selected by ``example_index``, wraps its content
     in a HarvestedDocument (``save()`` is not called here) and hands the
     result of ``read_values()`` together with ``expect_values`` to
     ``assert_gemini_values``.
     """
     example_url = self.gemini_example.url_for(file_index=example_index)
     harvested = HarvestedDocument(
         url=example_url,
         content=self.gemini_example.get_from_url(example_url),
     )
     self.assert_gemini_values(harvested.read_values(), expect_values)
Example #3
0
    def write_package_from_gemini_string(self, content):
        """Create or update a Package based on some content that has
        come from a URL.

        Also store the raw content as a HarvestedDocument (with
        references to its source and its package).

        :param content: GEMINI document content; it is parsed with
            ``GeminiDocument`` and stored verbatim on the new
            HarvestedDocument.
        :returns: the created or updated package, or ``None`` when a
            previously harvested document with the same GUID already
            yields identical values (nothing to do).
        :raises Exception: if more than one harvested document exists for
            the document's GUID.
        :raises HarvesterError: if the GUID was previously harvested by a
            different source than this job's source.
        """
        gemini_values = GeminiDocument(content).read_values()
        gemini_guid = gemini_values["guid"]

        # Look for previously harvested document matching Gemini GUID
        harvested_documents = HarvestedDocument.filter(guid=gemini_guid).all()
        package = None
        if len(harvested_documents) > 1:
            # A programming error; should never happen
            raise Exception("More than one harvested document GUID %s" % gemini_guid)
        elif len(harvested_documents) == 1:
            # we've previously harvested this (i.e. it's an update)
            harvested_doc = harvested_documents[0]
            if harvested_doc.source.id != self.job.source.id:
                # A 'user' error: there are two or more sources
                # pointing to the same harvested document.
                # Report the GUID itself, as the message says, rather
                # than the id of the current source.
                raise HarvesterError("Another source is using metadata GUID %s" % gemini_guid)
            # XXX Not strictly true - we need to check the title, package resources etc
            if harvested_doc.read_values() == gemini_values:
                # nothing's changed
                return None
            package = harvested_doc.package

        extras = {"publisher": int(self.job.source.publisher_ref or 0), "INSPIRE": "True"}
        # Just add some of the metadata as extras, not the whole lot
        for name in ("bbox-east-long", "bbox-north-lat", "bbox-south-lat", "bbox-west-long", "abstract", "guid"):
            extras[name] = gemini_values[name]
        package_data = {"name": str(gemini_guid), "title": gemini_values["title"], "extras": extras}

        # Only the first resource locator is used; a missing/empty list or
        # a missing/empty "url" both mean "no resources".
        locators = gemini_values.get("resource-locator")
        resource_locator = (locators[0].get("url") if locators else None) or ""
        if resource_locator:
            api_base = config.get("ckan.api_url", "/").rstrip("/")
            package_data["resources"] = [
                {"url": resource_locator, "description": "Resource locator", "format": "Unverified"},
                {
                    "url": "%s/api/2/rest/harvesteddocument/%s/xml/%s.xml" % (api_base, gemini_guid, gemini_guid),
                    "description": "Source GEMINI 2 document",
                    "format": "XML",
                },
                {
                    "url": "%s/api/2/rest/harvesteddocument/%s/html/%s.html" % (api_base, gemini_guid, gemini_guid),
                    "description": "Formatted GEMINI 2 document",
                    "format": "HTML",
                },
            ]

        if package is None:
            # Create new package from data.
            package = self._create_package_from_data(package_data)
        else:
            package = self._update_package_from_data(package, package_data)

        # NOTE(review): on the update path this saves a second
        # HarvestedDocument with the same GUID while the earlier one is not
        # touched in this method. Unless save() or the update helper
        # replaces the old record, the next harvest of this GUID would hit
        # the "More than one harvested document" check above - confirm.
        harvested_doc = HarvestedDocument(content=content, guid=gemini_guid, package=package, source=self.job.source)
        harvested_doc.save()
        return package