def test_create_and_delete_document(self):
    """Save a HarvestedDocument, check its fields, then delete it.

    After deletion, looking the document up by its old id must raise.
    """
    source_url = self.gemini_example.url_for(0)
    raw_content = self.gemini_example.get_from_url(source_url)

    stored = HarvestedDocument(url=source_url, content=raw_content)
    stored.save()
    # Capture the id now; it is needed for the lookup after deletion.
    stored_id = stored.id

    for field, expected in (("url", source_url), ("content", raw_content)):
        self.assert_equal(getattr(stored, field), expected)

    self.delete_commit(stored)
    self.assert_raises(Exception, HarvestedDocument.get, stored_id)
def assert_read_values(self, example_index, expect_values):
    """Check the values read from one of the example GEMINI documents.

    Fetches the example file selected by ``example_index``, wraps it in a
    HarvestedDocument, and compares the result of ``read_values()`` with
    ``expect_values`` via ``assert_gemini_values``.
    """
    source_url = self.gemini_example.url_for(file_index=example_index)
    raw_content = self.gemini_example.get_from_url(source_url)
    harvested = HarvestedDocument(url=source_url, content=raw_content)
    self.assert_gemini_values(harvested.read_values(), expect_values)
def write_package_from_gemini_string(self, content):
    """Create or update a Package based on some content that has come from a URL.

    Also store the raw content as a HarvestedDocument (with references to
    its source and its package).

    :param content: the GEMINI document content to harvest.
    :returns: the created or updated package, or None when a previously
        harvested document with the same GUID reads back the same values
        (nothing has changed, so nothing is written).
    :raises Exception: if more than one HarvestedDocument already exists
        for the document's GUID.
    :raises HarvesterError: if the GUID was previously harvested by a
        source other than the current job's source.
    """
    gemini_values = GeminiDocument(content).read_values()
    gemini_guid = gemini_values["guid"]

    # Look for previously harvested document matching Gemini GUID
    harvested_documents = HarvestedDocument.filter(guid=gemini_guid).all()
    if len(harvested_documents) > 1:
        # A programming error; should never happen
        raise Exception("More than one harvested document GUID %s" % gemini_guid)

    harvested_doc = None
    package = None
    if harvested_documents:
        # we've previously harvested this (i.e. it's an update)
        harvested_doc = harvested_documents[0]
        if harvested_doc.source.id != self.job.source.id:
            # A 'user' error: there are two or more sources
            # pointing to the same harvested document.
            # Report the GUID itself (the message previously interpolated
            # the source id, which is not what the text says).
            raise HarvesterError("Another source is using metadata GUID %s" % gemini_guid)
        # XXX Not strictly true - we need to check the title, package resources etc
        if harvested_doc.read_values() == gemini_values:
            # nothing's changed
            return None
        package = harvested_doc.package

    extras = {"publisher": int(self.job.source.publisher_ref or 0), "INSPIRE": "True"}
    # Just add some of the metadata as extras, not the whole lot
    for name in ("bbox-east-long", "bbox-north-lat", "bbox-south-lat", "bbox-west-long", "abstract", "guid"):
        extras[name] = gemini_values[name]

    package_data = {"name": str(gemini_guid), "title": gemini_values["title"], "extras": extras}

    # Only the first resource locator (if any) is used.
    locators = gemini_values.get("resource-locator", [])
    resource_locator = (locators[0].get("url") or "") if locators else ""
    if resource_locator:
        api_base = config.get("ckan.api_url", "/").rstrip("/")
        document_base = "%s/api/2/rest/harvesteddocument/%s" % (api_base, gemini_guid)
        package_data["resources"] = [
            {"url": resource_locator, "description": "Resource locator", "format": "Unverified"},
            {
                "url": "%s/xml/%s.xml" % (document_base, gemini_guid),
                "description": "Source GEMINI 2 document",
                "format": "XML",
            },
            {
                "url": "%s/html/%s.html" % (document_base, gemini_guid),
                "description": "Formatted GEMINI 2 document",
                "format": "HTML",
            },
        ]

    if package is None:
        # Create new package from data.
        package = self._create_package_from_data(package_data)
    else:
        package = self._update_package_from_data(package, package_data)

    if harvested_doc is None:
        harvested_doc = HarvestedDocument(
            content=content, guid=gemini_guid, package=package, source=self.job.source
        )
    else:
        # Update the existing record in place. Saving a second document with
        # the same GUID would make the "more than one" check above fail on
        # the next harvest of this document.
        harvested_doc.content = content
        harvested_doc.package = package
    harvested_doc.save()
    return package