def test_parse_without_pagination(self):

    data = '''<?xml version="1.0" encoding="utf-8" ?>
    <rdf:RDF
     xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
     xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#">
     <rdfs:SomeClass rdf:about="http://example.org">
      <rdfs:label>Some label</rdfs:label>
     </rdfs:SomeClass>
    </rdf:RDF>
    '''

    p = RDFParser()

    p.parse(data)

    eq_(p.next_page(), None)
def test_parse_pagination_last_page(self):

    data = '''<?xml version="1.0" encoding="utf-8" ?>
    <rdf:RDF
     xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
     xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#"
     xmlns:hydra="http://www.w3.org/ns/hydra/core#">
     <hydra:PagedCollection rdf:about="http://example.com/catalog.xml?page=3">
      <hydra:totalItems rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">245</hydra:totalItems>
      <hydra:lastPage>http://example.com/catalog.xml?page=3</hydra:lastPage>
      <hydra:itemsPerPage rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">100</hydra:itemsPerPage>
      <hydra:firstPage>http://example.com/catalog.xml?page=1</hydra:firstPage>
      <hydra:previousPage>http://example.com/catalog.xml?page=2</hydra:previousPage>
     </hydra:PagedCollection>
    </rdf:RDF>
    '''

    p = RDFParser()

    p.parse(data)

    eq_(p.next_page(), None)
def test_parse_pagination_next_page(self):

    data = '''<?xml version="1.0" encoding="utf-8" ?>
    <rdf:RDF
     xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
     xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#"
     xmlns:hydra="http://www.w3.org/ns/hydra/core#">
     <hydra:PagedCollection rdf:about="http://example.com/catalog.xml?page=1">
      <hydra:totalItems rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">245</hydra:totalItems>
      <hydra:lastPage>http://example.com/catalog.xml?page=3</hydra:lastPage>
      <hydra:itemsPerPage rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">100</hydra:itemsPerPage>
      <hydra:nextPage>http://example.com/catalog.xml?page=2</hydra:nextPage>
      <hydra:firstPage>http://example.com/catalog.xml?page=1</hydra:firstPage>
     </hydra:PagedCollection>
    </rdf:RDF>
    '''

    p = RDFParser()

    p.parse(data)

    assert p.next_page() == 'http://example.com/catalog.xml?page=2'
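
# A minimal sketch of how a parser can resolve the next page from a
# hydra:PagedCollection, which is what the tests above exercise. The real
# RDFParser.next_page() lives in ckanext-dcat; the rdflib traversal below is
# an illustrative assumption about its behaviour, not the actual
# implementation.
from rdflib import Graph, Namespace
from rdflib.namespace import RDF

HYDRA = Namespace('http://www.w3.org/ns/hydra/core#')

def next_page(graph):
    # Return the hydra:nextPage of the first hydra:PagedCollection found,
    # or None when pagination is absent or this is the last page.
    for collection in graph.subjects(RDF.type, HYDRA.PagedCollection):
        value = graph.value(collection, HYDRA.nextPage)
        if value:
            return str(value)
    return None

# Example, using the paginated catalog from the test above:
# g = Graph()
# g.parse(data=data, format='xml')
# next_page(g)  # -> 'http://example.com/catalog.xml?page=2'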
def gather_stage(self, harvest_job):

    log.debug('In DCATRDFHarvester gather_stage')

    rdf_format = None
    if harvest_job.source.config:
        rdf_format = json.loads(harvest_job.source.config).get("rdf_format")

    # Get file contents of first page
    next_page_url = harvest_job.source.url

    guids_in_source = []
    object_ids = []
    last_content_hash = None

    while next_page_url:

        for harvester in p.PluginImplementations(IDCATRDFHarvester):
            next_page_url, before_download_errors = harvester.before_download(
                next_page_url, harvest_job)

            for error_msg in before_download_errors:
                self._save_gather_error(error_msg, harvest_job)

            if not next_page_url:
                return []

        content, rdf_format = self._get_content_and_type(
            next_page_url, harvest_job, 1, content_type=rdf_format)

        # Hash the page so we can detect endpoints that ignore the page
        # parameter and serve the same content for every page
        content_hash = hashlib.md5()
        if content:
            content_hash.update(content)

        if last_content_hash:
            if content_hash.digest() == last_content_hash.digest():
                log.warning('Remote content was the same even when using a '
                            'paginated URL, skipping')
                break
        else:
            last_content_hash = content_hash

        # TODO: store content?
        for harvester in p.PluginImplementations(IDCATRDFHarvester):
            content, after_download_errors = harvester.after_download(
                content, harvest_job)

            for error_msg in after_download_errors:
                self._save_gather_error(error_msg, harvest_job)

        if not content:
            return []

        # TODO: profiles conf
        parser = RDFParser()
        try:
            parser.parse(content, _format=rdf_format)
        except RDFParserException as e:
            self._save_gather_error(
                'Error parsing the RDF file: {0}'.format(e), harvest_job)
            return []

        for dataset in parser.datasets():
            if not dataset.get('name'):
                dataset['name'] = self._gen_new_name(dataset['title'])

            # Unless already set by the parser, get the owner organization (if any)
            # from the harvest source dataset
            if not dataset.get('owner_org'):
                source_dataset = model.Package.get(harvest_job.source.id)
                if source_dataset.owner_org:
                    dataset['owner_org'] = source_dataset.owner_org

            # Try to get a unique identifier for the harvested dataset
            guid = self._get_guid(dataset)

            if not guid:
                self._save_gather_error(
                    'Could not get a unique identifier for dataset: {0}'.format(dataset),
                    harvest_job)
                continue

            dataset['extras'].append({'key': 'guid', 'value': guid})
            guids_in_source.append(guid)

            obj = HarvestObject(guid=guid, job=harvest_job,
                                content=json.dumps(dataset))

            obj.save()
            object_ids.append(obj.id)

        # get the next page
        next_page_url = parser.next_page()

    # Check if some datasets need to be deleted
    object_ids_to_delete = self._mark_datasets_for_deletion(
        guids_in_source, harvest_job)

    object_ids.extend(object_ids_to_delete)

    return object_ids
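
# A self-contained sketch of the duplicate-content guard used above: some
# endpoints ignore the page parameter and return the same payload for every
# page value, so each page's MD5 digest is compared against the first one and
# pagination stops on a repeat. fetch_page() is an illustrative stand-in,
# not part of ckanext-dcat.
import hashlib

def fetch_all_pages(fetch_page, first_url):
    pages = []
    first_digest = None
    url = first_url
    while url:
        content, url = fetch_page(url)  # returns (content, next_url or None)
        digest = hashlib.md5(content.encode('utf8')).digest()
        if first_digest is None:
            first_digest = digest
        elif digest == first_digest:
            # The endpoint is serving the first page again; stop paginating
            break
        pages.append(content)
    return pages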
def gather_stage(self, harvest_job):

    log.debug('In DCATRDFHarvester gather_stage')

    rdf_format = None
    if harvest_job.source.config:
        rdf_format = json.loads(harvest_job.source.config).get("rdf_format")

    # Get file contents of first page
    next_page_url = harvest_job.source.url

    guids_in_source = []
    object_ids = []
    last_content_hash = None
    self._names_taken = []

    while next_page_url:

        for harvester in p.PluginImplementations(IDCATRDFHarvester):
            next_page_url, before_download_errors = harvester.before_download(
                next_page_url, harvest_job)

            for error_msg in before_download_errors:
                self._save_gather_error(error_msg, harvest_job)

            if not next_page_url:
                return []

        content, rdf_format = self._get_content_and_type(
            next_page_url, harvest_job, 1, content_type=rdf_format)

        content_hash = hashlib.md5()
        if content:
            if six.PY2:
                content_hash.update(content)
            else:
                content_hash.update(content.encode('utf8'))

        if last_content_hash:
            if content_hash.digest() == last_content_hash.digest():
                log.warning('Remote content was the same even when using a '
                            'paginated URL, skipping')
                break
        else:
            last_content_hash = content_hash

        # TODO: store content?
        for harvester in p.PluginImplementations(IDCATRDFHarvester):
            content, after_download_errors = harvester.after_download(
                content, harvest_job)

            for error_msg in after_download_errors:
                self._save_gather_error(error_msg, harvest_job)

        if not content:
            return []

        # TODO: profiles conf
        parser = RDFParser()
        try:
            parser.parse(content, _format=rdf_format)
        except RDFParserException as e:
            self._save_gather_error(
                'Error parsing the RDF file: {0}'.format(e), harvest_job)
            return []

        for harvester in p.PluginImplementations(IDCATRDFHarvester):
            parser, after_parsing_errors = harvester.after_parsing(
                parser, harvest_job)

            for error_msg in after_parsing_errors:
                self._save_gather_error(error_msg, harvest_job)

        if not parser:
            return []

        try:
            source_dataset = model.Package.get(harvest_job.source.id)

            for dataset in parser.datasets():

                if not dataset.get('name'):
                    dataset['name'] = self._gen_new_name(dataset['title'])

                # De-duplicate names within this job by appending a
                # numeric suffix
                if dataset['name'] in self._names_taken:
                    suffix = len([i for i in self._names_taken
                                  if i.startswith(dataset['name'] + '-')]) + 1
                    dataset['name'] = '{}-{}'.format(dataset['name'], suffix)
                self._names_taken.append(dataset['name'])

                # Unless already set by the parser, get the owner organization (if any)
                # from the harvest source dataset
                if not dataset.get('owner_org'):
                    if source_dataset.owner_org:
                        dataset['owner_org'] = source_dataset.owner_org

                # Try to get a unique identifier for the harvested dataset
                guid = self._get_guid(dataset, source_url=source_dataset.url)

                if not guid:
                    self._save_gather_error(
                        'Could not get a unique identifier for dataset: {0}'.format(dataset),
                        harvest_job)
                    continue

                dataset['extras'].append({'key': 'guid', 'value': guid})
                guids_in_source.append(guid)

                obj = HarvestObject(guid=guid, job=harvest_job,
                                    content=json.dumps(dataset))

                obj.save()
                object_ids.append(obj.id)
        except Exception as e:
            self._save_gather_error(
                'Error when processing dataset: %r / %s' % (e, traceback.format_exc()),
                harvest_job)
            return []

        # get the next page
        next_page_url = parser.next_page()

    # Check if some datasets need to be deleted
    object_ids_to_delete = self._mark_datasets_for_deletion(
        guids_in_source, harvest_job)

    object_ids.extend(object_ids_to_delete)

    return object_ids
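
# A minimal plugin sketch for the IDCATRDFHarvester hooks that gather_stage()
# calls above. The (value, errors) return shapes are taken from the call
# sites in the code; the class itself, its no-op bodies, and the interface
# import path are illustrative assumptions following the usual CKAN plugin
# pattern.
import ckan.plugins as p
from ckanext.dcat.interfaces import IDCATRDFHarvester

class ExampleDCATHarvesterPlugin(p.SingletonPlugin):
    p.implements(IDCATRDFHarvester, inherit=True)

    def before_download(self, url, harvest_job):
        # Return a (possibly rewritten) URL plus a list of error messages;
        # returning a falsy URL aborts the whole gather stage.
        return url, []

    def after_download(self, content, harvest_job):
        # Inspect or transform the downloaded content; returning falsy
        # content aborts the gather stage.
        return content, []

    def after_parsing(self, rdf_parser, harvest_job):
        # Post-process the populated RDFParser (e.g. drop unwanted
        # datasets); returning a falsy parser aborts the gather stage.
        return rdf_parser, []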
def gather_stage(self, harvest_job):

    log.debug('In DCATRDFHarvester gather_stage')

    rdf_format = None
    if harvest_job.source.config:
        rdf_format = json.loads(harvest_job.source.config).get("rdf_format")

    # Get file contents of first page
    next_page_url = harvest_job.source.url

    guids_in_source = []
    object_ids = []

    while next_page_url:

        for harvester in p.PluginImplementations(IDCATRDFHarvester):
            next_page_url, before_download_errors = harvester.before_download(
                next_page_url, harvest_job)

            for error_msg in before_download_errors:
                self._save_gather_error(error_msg, harvest_job)

            if not next_page_url:
                return []

        content, rdf_format = self._get_content_and_type(
            next_page_url, harvest_job, 1, content_type=rdf_format)

        # TODO: store content?
        for harvester in p.PluginImplementations(IDCATRDFHarvester):
            content, after_download_errors = harvester.after_download(
                content, harvest_job)

            for error_msg in after_download_errors:
                self._save_gather_error(error_msg, harvest_job)

        if not content:
            return []

        # TODO: profiles conf
        parser = RDFParser()
        try:
            parser.parse(content, _format=rdf_format)
        except RDFParserException as e:
            self._save_gather_error(
                'Error parsing the RDF file: {0}'.format(e), harvest_job)
            return []

        for dataset in parser.datasets():
            if not dataset.get('name'):
                dataset['name'] = self._gen_new_name(dataset['title'])

            # Unless already set by the parser, get the owner organization (if any)
            # from the harvest source dataset
            if not dataset.get('owner_org'):
                source_dataset = model.Package.get(harvest_job.source.id)
                if source_dataset.owner_org:
                    dataset['owner_org'] = source_dataset.owner_org

            # Try to get a unique identifier for the harvested dataset
            guid = self._get_guid(dataset)

            if not guid:
                self._save_gather_error(
                    'Could not get a unique identifier for dataset: {0}'.format(dataset),
                    harvest_job)
                continue

            dataset['extras'].append({'key': 'guid', 'value': guid})
            guids_in_source.append(guid)

            obj = HarvestObject(guid=guid, job=harvest_job,
                                content=json.dumps(dataset))

            obj.save()
            object_ids.append(obj.id)

        # get the next page
        next_page_url = parser.next_page()

    # Check if some datasets need to be deleted
    object_ids_to_delete = self._mark_datasets_for_deletion(
        guids_in_source, harvest_job)

    object_ids.extend(object_ids_to_delete)

    return object_ids