def process_resources(self, dataset, data, formats):
    dataset_id = data["datasetid"]
    ods_metadata = data["metas"]
    description = self.description_from_fields(data['fields'])
    for format in formats:
        label, udata_format, mime = self.FORMATS[format]
        resource = Resource(
            title='Export au format {0}'.format(label),
            description=description,
            filetype='remote',
            url=self._get_download_url(dataset_id, format),
            format=udata_format,
            mime=mime)
        resource.modified = ods_metadata["modified"]
        dataset.resources.append(resource)
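# A note on the FORMATS lookup used above: each export format key is expected
# to resolve to a (label, udata format, MIME type) triple. The mapping below
# is only a hypothetical sketch of that shape, for illustration; the real
# table is defined on the harvester class and may differ.
FORMATS = {
    'csv': ('CSV', 'csv', 'text/csv'),
    'json': ('JSON', 'json', 'application/json'),
    'geojson': ('GeoJSON', 'geojson', 'application/vnd.geo+json'),
}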
def process(self, item):
    '''Generate a random dataset from a fake identifier'''
    # Get or create a harvested dataset with this identifier.
    # Harvest metadata are already filled on creation.
    dataset = self.get_dataset(item.remote_id)
    # Here comes your implementation. You should:
    # - fetch the remote dataset (if necessary)
    # - validate the fetched payload
    # - map its content to the dataset fields
    # - store extra significant data in the `extra` attribute
    # - map resources data
    dataset.title = faker.sentence()
    dataset.description = faker.text()
    dataset.tags = list(set(faker.words(nb=faker.pyint())))

    # Resources
    for i in range(faker.pyint()):
        dataset.resources.append(Resource(
            title=faker.sentence(),
            description=faker.text(),
            url=faker.url(),
            filetype='remote',
            mime=faker.mime_type(category='text'),
            format=faker.file_extension(category='text'),
            filesize=faker.pyint()))
    return dataset
def process(self, item):
    kwargs = item.kwargs
    item.remote_id = kwargs['remote_id']
    dataset = self.get_dataset(item.remote_id)
    dataset.title = kwargs['title']
    dataset.tags = kwargs.get('tags', '').split(',')
    dataset.private = True
    dataset.frequency = kwargs.get('dataset_frequency', 'annual')
    license = kwargs.get('dataset_license', 'cc-zero')
    dataset.license = License.objects.get(id=license)

    resources = kwargs['resources']
    description = u"Ce jeu de données contient: <br>"
    for resource in resources:
        description += resource['title'] + "<br>"
    dataset.description = description

    # Force recreation of all resources
    dataset.resources = []
    for resource in resources:
        new_resource = Resource(title=resource['title'],
                                url=resource['link'],
                                filetype=resource['filetype'],
                                format=resource['format'],
                                description=resource.get('description', ''))
        dataset.resources.append(new_resource)
    return dataset
def process(self, item):
    dataset = self.get_dataset(item.remote_id)
    # Here comes your implementation. You should:
    # - fetch the remote dataset (if necessary)
    # - validate the fetched payload
    # - map its content to the dataset fields
    # - store extra significant data in the `extra` attribute
    # - map resources data
    kwargs = item.kwargs
    dataset.title = kwargs['title']
    dataset.tags = ["statec-harvesting"]

    resources = kwargs['resources']
    description = u"Ce jeu de données contient: <br>"
    for resource in resources:
        description += resource['title'] + "<br>"
    description += "<br>---------------------------------------"
    description += """<br> Automatically synched from portail statistique (category %s)""" % dataset.title
    dataset.description = description

    # Force recreation of all resources
    dataset.resources = []
    for resource in resources:
        url = resource['link']
        url = url.replace('tableView', 'download')
        params = {
            'IF_DOWNLOADFORMAT': 'csv',
            'IF_DOWNLOAD_ALL_ITEMS': 'yes'
        }
        url_parts = list(urlparse.urlparse(url))
        query = dict(urlparse.parse_qsl(url_parts[4]))
        query.update(params)
        url_parts[4] = urlencode(query)
        download_url = urlparse.urlunparse(url_parts)
        new_resource = Resource(title=resource['title'],
                                url=download_url,
                                filetype='remote',
                                format='csv')
        if len(filter(lambda d: d['title'] in [resource['title']]
                      and d['url'] in [download_url],
                      dataset.resources)) == 0:  # noqa
            dataset.resources.append(new_resource)
    return dataset
def process(self, item):
    dataset = self.get_dataset(item.remote_id)
    # Here comes your implementation. You should:
    # - fetch the remote dataset (if necessary)
    # - validate the fetched payload
    # - map its content to the dataset fields
    # - store extra significant data in the `extra` attribute
    # - map resources data
    kwargs = item.kwargs
    item = kwargs['item']
    dataset.title = item['title']
    dataset.license = License.guess('cc-by')
    dataset.tags = ["snig.dgterritorio.gov.pt"]
    dataset.description = item['description']
    if item.get('date'):
        dataset.created_at = item['date']
    for keyword in item.get('keywords', []):
        dataset.tags.append(keyword)

    # Force recreation of all resources
    dataset.resources = []
    for resource in item.get('resources', []):
        parsed = urlparse.urlparse(resource['url'])
        try:
            format = str(urlparse.parse_qs(parsed.query)['service'][0])
        except KeyError:
            format = resource['url'].split('.')[-1]
        new_resource = Resource(title=item['title'],
                                url=resource['url'],
                                filetype='remote',
                                format=format)
        dataset.resources.append(new_resource)
    dataset.extras['harvest:name'] = self.source.name
    return dataset
def process(self, item):
    dataset = self.get_dataset(item.remote_id)
    # Here comes your implementation. You should:
    # - fetch the remote dataset (if necessary)
    # - validate the fetched payload
    # - map its content to the dataset fields
    # - store extra significant data in the `extra` attribute
    # - map resources data
    kwargs = item.kwargs
    dataset.title = kwargs['title']
    dataset.license = License.guess('cc-by')
    dataset.tags = ["apambiente.pt"]
    item = kwargs['item']
    dataset.description = item['summary']
    if kwargs['date']:
        dataset.created_at = kwargs['date']

    # Force recreation of all resources
    dataset.resources = []
    for resource in item['links']:
        url = resource['href'].replace('\\', '').replace(' ', '%20')
        type = resource['type']
        if type == 'details':
            dataset.description += "<br>"
            dataset.description += "<br>Mais detalhes : <a href=\"%s\" target=\"_blank\">%s</a>" % (
                url, dataset.title)
        if type == 'open':
            url_parts = list(urlparse.urlparse(url))
            parts = url_parts[2].split('.')
            format = parts[-1] if len(parts) > 1 else 'wms'
            new_resource = Resource(title=dataset.title,
                                    url=url,
                                    filetype='remote',
                                    format=format.lower())
            dataset.resources.append(new_resource)
    return dataset
def process(self, item):
    dataset = self.get_dataset(item.remote_id)
    # Here comes your implementation. You should:
    # - fetch the remote dataset (if necessary)
    # - validate the fetched payload
    # - map its content to the dataset fields
    # - store extra significant data in the `extra` attribute
    # - map resources data
    kwargs = item.kwargs
    dataset.title = kwargs['title']
    dataset.license = License.guess('cc-by')
    dataset.tags = ["apambiente.pt"]
    item = kwargs['item']
    dataset.description = item.get('description')
    if kwargs['date']:
        dataset.created_at = kwargs['date']

    # Force recreation of all resources
    dataset.resources = []
    url = item.get('url')
    if item.get('type') == "liveData":
        type = "wms"
    else:
        type = url.split('.')[-1].lower()
        if len(type) > 3:
            type = "wms"
    new_resource = Resource(title=dataset.title,
                            url=url,
                            filetype='remote',
                            format=type)
    dataset.resources.append(new_resource)
    return dataset
def process(self, item):
    # dataset_id is found in item.remote_id
    dataset_id = item.remote_id
    # we prefix the id with 'instn' to avoid clashing with
    # the same id coming from another source
    dataset = self.get_dataset('instn-%s' % item.remote_id)
    # We saved the theme name as a tag and the dataset name
    # as the title
    dataset.title = item.kwargs['dataset_name']
    dataset.tags = [item.kwargs['theme_name']]
    # We empty the existing resources (= the different "files" for this dataset).
    # There will probably be only one here.
    dataset.resources = []
    # Getting the xls file url
    dataset_request = requests.get(
        'http://beta.ins.tn/fr/node/get/nojs/%s' % dataset_id)
    dataset_soup = BeautifulSoup(dataset_request.content, 'html.parser')
    dataset_xls = dataset_soup.find_all('li', {"class": "data"})
    for xls in dataset_xls:
        url = xls.find('a').attrs['href']
        # We have the url, let's do a HEAD request
        # to get the file size without downloading the whole file
        size_request = requests.head(url)
        file_size = size_request.headers['Content-length']
        # We're good, let's add the file
        # to the dataset resources
        dataset.resources.append(Resource(
            title=dataset.title,
            url=url,
            filetype='remote',
            mime='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
            format='xlsx',
            filesize=file_size))
    # at the end, always return the dataset
    return dataset
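# The loop above assumes the server always answers the HEAD request with a
# Content-Length header. A more defensive variant (a sketch, not part of the
# original harvester) falls back to None when the header is missing:
def head_content_length(url):
    """Return the remote file size in bytes as a string, or None if unknown."""
    size_request = requests.head(url)
    return size_request.headers.get('Content-length')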
def process(self, item):
    response = self.get_action('package_show', id=item.remote_id)
    data = self.validate(response['result'], schema)

    # Fix the remote_id: use the real ID instead of the unstable name
    item.remote_id = data['id']

    # Skip if no resource
    if not len(data.get('resources', [])):
        msg = 'Dataset {0} has no record'.format(item.remote_id)
        raise HarvestSkipException(msg)

    dataset = self.get_dataset(item.remote_id)

    # Core attributes
    if not dataset.slug:
        dataset.slug = data['name']
    dataset.title = data['title']
    dataset.description = data['notes']
    dataset.license = License.objects(id=data['license_id']).first()
    # dataset.license = license or License.objects.get(id='notspecified')
    dataset.tags = [t['name'] for t in data['tags'] if t['name']]

    dataset.created_at = data['metadata_created']
    dataset.last_modified = data['metadata_modified']

    dataset.extras['ckan:name'] = data['name']

    temporal_start, temporal_end = None, None
    spatial_geom = None

    for extra in data['extras']:
        # GeoJSON representation (Polygon or Point)
        if extra['key'] == 'spatial':
            spatial_geom = json.loads(extra['value'])
        # Textual representation of the extent / location
        elif extra['key'] == 'spatial-text':
            log.debug('spatial-text value not handled')
            print 'spatial-text', extra['value']
        # Linked Data URI representing the place name
        elif extra['key'] == 'spatial-uri':
            log.debug('spatial-uri value not handled')
            print 'spatial-uri', extra['value']
        # Update frequency
        elif extra['key'] == 'frequency':
            print 'frequency', extra['value']
        # Temporal coverage start
        elif extra['key'] == 'temporal_start':
            print 'temporal_start', extra['value']
            temporal_start = daterange_start(extra['value'])
            continue
        # Temporal coverage end
        elif extra['key'] == 'temporal_end':
            print 'temporal_end', extra['value']
            temporal_end = daterange_end(extra['value'])
            continue
        # else:
        #     print extra['key'], extra['value']
        dataset.extras[extra['key']] = extra['value']

    if spatial_geom:
        dataset.spatial = SpatialCoverage()
        if spatial_geom['type'] == 'Polygon':
            coordinates = [spatial_geom['coordinates']]
        elif spatial_geom['type'] == 'MultiPolygon':
            coordinates = spatial_geom['coordinates']
        else:
            raise HarvestException('Unsupported spatial geometry')
        dataset.spatial.geom = {
            'type': 'MultiPolygon',
            'coordinates': coordinates
        }

    if temporal_start and temporal_end:
        dataset.temporal_coverage = db.DateRange(
            start=temporal_start,
            end=temporal_end,
        )

    # Remote URL
    if data.get('url'):
        dataset.extras['remote_url'] = data['url']

    # Resources
    for res in data['resources']:
        if res['resource_type'] not in ALLOWED_RESOURCE_TYPES:
            continue
        try:
            resource = get_by(dataset.resources, 'id', UUID(res['id']))
        except Exception:
            log.error('Unable to parse resource ID %s', res['id'])
            continue
        if not resource:
            resource = Resource(id=res['id'])
            dataset.resources.append(resource)
        resource.title = res.get('name', '') or ''
        resource.description = res.get('description')
        resource.url = res['url']
        resource.filetype = ('api' if res['resource_type'] == 'api'
                             else 'remote')
        resource.format = res.get('format')
        resource.mime = res.get('mimetype')
        resource.hash = res.get('hash')
        resource.created = res['created']
        resource.modified = res['last_modified']
        resource.published = resource.published or resource.created
    return dataset
def get_resource(self, dataset, url):
    resource = get_by(dataset.resources, 'url', url)
    if not resource:
        return True, Resource(url=url)
    return False, resource
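# A minimal usage sketch for get_resource() above (hypothetical caller, not
# part of the original backend): the boolean tells the caller whether the
# returned Resource is new and still has to be attached to the dataset.
def upsert_resource(self, dataset, url, title):
    created, resource = self.get_resource(dataset, url)
    resource.title = title
    resource.filetype = 'remote'
    if created:
        dataset.resources.append(resource)
    return resource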
def process(self, item):
    dataset = self.get_dataset(item.remote_id)
    # Here comes your implementation. You should:
    # - fetch the remote dataset (if necessary)
    # - validate the fetched payload
    # - map its content to the dataset fields
    # - store extra significant data in the `extra` attribute
    # - map resources data

    # check if this is a new dataset and give it a title
    if dataset.title is None:
        dataset.title = ''

    # Create the new list of tags and make sure the list has only unique tags
    tags = []
    for tag in dataset.tags:
        tags.append(tag)
    tags.append("statec-harvesting")
    tags = list(set(tags))
    dataset.tags = tags

    # return the gathered resources of the items
    # or return the updated list of all the resources of the given dataset
    resources = self.__update_resources(item, dataset)

    if dataset.title == '':
        dataset.title = item.kwargs['title']

    # Rebuild the dataset description
    description = u"This dataset includes the following resource(s): <br>"
    for resource in resources:
        description += resource.title + "<br>"
    description += "<br>---------------------------------------"
    description += """<br> Automatically synched from portail statistique (category %s)""" % dataset.title
    dataset.description = description

    # Force recreation of all resources
    dataset.resources = []
    for resource in resources:
        url = resource.url
        download_url = url
        # check if the resource format is csv and handle the link creation accordingly
        if resource.format == 'csv':
            url = url.replace('tableView', 'download')
            params = {
                'IF_DOWNLOADFORMAT': 'csv',
                'IF_DOWNLOAD_ALL_ITEMS': 'yes'
            }
            url_parts = list(urlparse.urlparse(url))
            query = dict(urlparse.parse_qsl(url_parts[4]))
            query.update(params)
            url_parts[4] = urlencode(query)
            download_url = urlparse.urlunparse(url_parts)
        # The newly created resource
        new_resource = Resource(
            title=resource.title,
            description=resource.title,
            url=download_url,
            filetype='remote',
            format=resource.format
        )
        dataset.resources.append(new_resource)
    return dataset
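# Worked example of the CSV link rewriting above, using a made-up portal URL
# (the parameter order produced by urlencode() may differ):
#
#   before: http://example.org/statec/tableView/view.asp?id=123
#   after:  http://example.org/statec/download/view.asp?id=123&IF_DOWNLOADFORMAT=csv&IF_DOWNLOAD_ALL_ITEMS=yes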
def process(self, item):
    response = self.get_action('package_show', id=item.remote_id)
    data = self.validate(response['result'], schema)

    # Fix the remote_id: use the real ID instead of the unstable name
    item.remote_id = data['id']

    # Skip if no resource
    if not len(data.get('resources', [])):
        msg = 'Dataset {0} has no record'.format(item.remote_id)
        raise HarvestSkipException(msg)

    dataset = self.get_dataset(item.remote_id)

    # Core attributes
    if not dataset.slug:
        dataset.slug = data['name']
    dataset.title = data['title']
    dataset.description = data['notes']
    dataset.license = License.objects(id=data['license_id']).first()
    # dataset.license = license or License.objects.get(id='notspecified')
    dataset.tags = [t['name'] for t in data['tags']]

    dataset.created_at = data['metadata_created']
    dataset.last_modified = data['metadata_modified']

    dataset.extras['ckan:name'] = data['name']

    temporal_start, temporal_end = None, None
    spatial_geom = None

    for extra in data['extras']:
        # GeoJSON representation (Polygon or Point)
        if extra['key'] == 'spatial':
            spatial_geom = json.loads(extra['value'])
        # Textual representation of the extent / location
        elif extra['key'] == 'spatial-text':
            log.debug('spatial-text value not handled')
            print 'spatial-text', extra['value']
        # Linked Data URI representing the place name
        elif extra['key'] == 'spatial-uri':
            log.debug('spatial-uri value not handled')
            print 'spatial-uri', extra['value']
        # Update frequency
        elif extra['key'] == 'frequency':
            print 'frequency', extra['value']
        # Temporal coverage start
        elif extra['key'] == 'temporal_start':
            print 'temporal_start', extra['value']
            temporal_start = daterange_start(extra['value'])
            continue
        # Temporal coverage end
        elif extra['key'] == 'temporal_end':
            print 'temporal_end', extra['value']
            temporal_end = daterange_end(extra['value'])
            continue
        # else:
        #     print extra['key'], extra['value']
        dataset.extras[extra['key']] = extra['value']

    if spatial_geom:
        dataset.spatial = SpatialCoverage()
        if spatial_geom['type'] == 'Polygon':
            coordinates = [spatial_geom['coordinates']]
        elif spatial_geom['type'] == 'MultiPolygon':
            coordinates = spatial_geom['coordinates']
        else:
            raise HarvestException('Unsupported spatial geometry')
        dataset.spatial.geom = {
            'type': 'MultiPolygon',
            'coordinates': coordinates
        }

    if temporal_start and temporal_end:
        dataset.temporal_coverage = db.DateRange(
            start=temporal_start,
            end=temporal_end,
        )

    # Remote URL
    if data.get('url'):
        dataset.extras['remote_url'] = data['url']

    # Resources
    for res in data['resources']:
        if res['resource_type'] not in ALLOWED_RESOURCE_TYPES:
            continue
        try:
            resource = get_by(dataset.resources, 'id', UUID(res['id']))
        except Exception:
            log.error('Unable to parse resource ID %s', res['id'])
            continue
        if not resource:
            resource = Resource(id=res['id'])
            dataset.resources.append(resource)
        resource.title = res.get('name', '') or ''
        resource.description = res.get('description')
        resource.url = res['url']
        resource.filetype = ('api' if res['resource_type'] == 'api'
                             else 'remote')
        resource.format = res.get('format')
        resource.mime = res.get('mimetype')
        resource.hash = res.get('hash')
        resource.created = res['created']
        resource.modified = res['last_modified']
        resource.published = resource.published or resource.created
    return dataset
def remote_datasets(self):
    response = self.get('package_list')
    for name in response['result']:
        details = self.get('package_show', {'id': name})['result']
        dataset = self.get_harvested(Dataset, details['id'])

        # Core attributes
        dataset.slug = details['name']
        dataset.title = details['title']
        dataset.description = details.get('notes', 'No description')
        dataset.license = License.objects(
            id=details['license_id']).first() or License.objects.get(
                id='notspecified')
        dataset.tags = [tag['name'].lower() for tag in details['tags']]

        dataset.frequency = self.map('frequency', details) or 'unknown'
        dataset.created_at = parse(details['metadata_created'])
        dataset.last_modified = parse(details['metadata_modified'])

        if any_field(details, 'territorial_coverage',
                     'territorial_coverage_granularity'):
            coverage = TerritorialCoverage(
                codes=[
                    code.strip()
                    for code in details.get('territorial_coverage', '').split(',')
                    if code.strip()
                ],
                granularity=self.map('territorial_coverage_granularity',
                                     details),
            )
            dataset.extras['territorial_coverage'] = coverage
            try:
                dataset.spatial = territorial_to_spatial(dataset)
            except Exception as e:
                print 'Error while processing spatial coverage for {0}:'.format(dataset.title), e

        if all_field(details, 'temporal_coverage_from', 'temporal_coverage_to'):
            try:
                dataset.temporal_coverage = db.DateRange(
                    start=daterange_start(details.get('temporal_coverage_from')),
                    end=daterange_end(details.get('temporal_coverage_to')),
                )
            except:
                log.error('Unable to parse temporal coverage for dataset %s',
                          details['id'])

        # Organization
        if details.get('organization'):
            dataset.organization = self.get_harvested(
                Organization, details['organization']['id'], False)
        else:
            # Need to fetch user from roles
            roles = self.get('roles_show',
                             {'domain_object': name})['result']['roles']
            for role in roles:
                if role['role'] == 'admin' and role['context'] == 'Package':
                    dataset.owner = self.get_harvested(User, role['user_id'])
                    break

        # Supplier
        if details.get('supplier_id'):
            dataset.supplier = self.get_harvested(Organization,
                                                  details['supplier_id'],
                                                  False)

        # Remote URL
        if details.get('url'):
            dataset.extras['remote_url'] = details['url']

        # Extras
        if 'extras' in details:
            extra_mapping = self.harvester.mapping.get('from_extras', {})
            for extra in details['extras']:
                if extra['key'] in self.harvester.mapping:
                    value = self.harvester.mapping[extra['key']].get(
                        extra['value'])
                else:
                    value = extra['value']
                if extra['key'] in extra_mapping:
                    setattr(dataset, extra_mapping[extra['key']], value)
                else:
                    dataset.extras[extra['key']] = value

        # Resources
        for res in details['resources']:
            try:
                resource = get_by(dataset.resources, 'id', UUID(res['id']))
            except:
                log.error('Unable to parse resource %s', res['id'])
                continue
            if not resource:
                resource = Resource(id=res['id'])
                dataset.resources.append(resource)
            resource.title = res.get('name', '') or ''
            resource.url = res['url']
            resource.description = res.get('description')
            resource.format = res.get('format')
            resource.hash = res.get('hash')
            resource.created = parse(res['created'])
            resource.modified = parse(res['revision_timestamp'])
            resource.published = resource.published or resource.created

        yield dataset

        if dataset.id:
            followers = self.get('dataset_follower_list',
                                 {'id': name})['result']
            for follower in followers:
                user = self.get_harvested(User, follower['id'], False)
                if user:
                    follow, created = FollowDataset.objects.get_or_create(
                        follower=user, following=dataset)
def on_form_valid(self, form):
    resource = Resource()
    form.populate_obj(resource)
    self.dataset.add_community_resource(resource)
    return redirect(url_for('datasets.show', dataset=self.dataset))
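# form.populate_obj(resource) above copies each form field's data onto the
# matching attribute of the freshly created Resource, roughly equivalent to
# the sketch below (field names are illustrative, not the actual form):
#
#   resource.title = form.title.data
#   resource.url = form.url.data
#   resource.description = form.description.data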
def process(self, item):
    response = self.get(item.remote_id)
    encoding = chardet.detect(response.content)['encoding']
    xml = self.parse_xml(response.content.decode(encoding))
    metadata = xml['metadata']

    # Resolve the remote id from the metadata
    item.remote_id = metadata['id']
    dataset = self.get_dataset(metadata['id'])

    dataset.title = metadata['title']
    dataset.frequency = FREQUENCIES.get(metadata['frequency'], 'unknown')
    dataset.description = metadata['notes']
    dataset.private = metadata['private']
    dataset.tags = sorted(set(metadata['tags']))

    if metadata.get('license_id'):
        dataset.license = License.objects.get(id=metadata['license_id'])

    if (metadata.get('temporal_coverage_from')
            and metadata.get('temporal_coverage_to')):
        dataset.temporal_coverage = db.DateRange(
            start=metadata['temporal_coverage_from'],
            end=metadata['temporal_coverage_to'])

    if (metadata.get('territorial_coverage_code')
            or metadata.get('territorial_coverage_granularity')):
        dataset.spatial = SpatialCoverage()
        if metadata.get('territorial_coverage_granularity'):
            dataset.spatial.granularity = GRANULARITIES.get(
                metadata['territorial_coverage_granularity'])
        if metadata.get('territorial_coverage_code'):
            dataset.spatial.zones = [
                ZONES[metadata['territorial_coverage_code']]
            ]

    dataset.resources = []
    cle = get_by(metadata['resources'], 'format', 'cle')
    for row in metadata['resources']:
        if row['format'] == 'cle':
            continue
        resource = Resource(
            title=row['name'],
            description=(row['description'] + '\n\n' + SSL_COMMENT).strip(),
            filetype='remote',
            url=row['url'],
            format=row['format'])
        if resource.format == 'csv' and cle:
            resource.checksum = Checksum(type='sha256',
                                         value=self.get(cle['url']).text)
        if row.get('last_modified'):
            resource.modified = row['last_modified']
        dataset.resources.append(resource)

    if metadata.get('author'):
        dataset.extras['author'] = metadata['author']
    if metadata.get('author_email'):
        dataset.extras['author_email'] = metadata['author_email']
    if metadata.get('maintainer'):
        dataset.extras['maintainer'] = metadata['maintainer']
    if metadata.get('maintainer_email'):
        dataset.extras['maintainer_email'] = metadata['maintainer_email']
    for extra in metadata['extras']:
        dataset.extras[extra['key']] = extra['value']

    return dataset
def process(self, item):
    '''Return the INE datasets'''
    dataset = self.get_dataset(item.remote_id)
    # get remote data for dataset
    req = requests.get("https://www.ine.pt/ine/xml_indic.jsp",
                       params={
                           'varcd': item.remote_id,
                           'lang': 'PT',
                           'opc': '1'
                       },
                       headers={'charset': 'utf8'})
    returnedData = req.content
    print 'Get metadata for %s' % (item.remote_id)

    keywordSet = set()
    dataset.license = License.guess('cc-by')
    dataset.resources = []

    doc = minidom.parseString(returnedData)
    properties = doc.getElementsByTagName('indicator')
    # go through the API dataset information
    for propNode in properties:
        for childNode in propNode.childNodes:
            # print childNode
            fc = childNode.firstChild
            if fc:
                if childNode.nodeName == 'keywords':
                    for obj in childNode.childNodes:
                        # INE needs to create a proper xml file...
                        valueData = obj.nodeValue
                        # need to ignore the ',' nodes
                        if obj.nodeValue != ',':
                            # need to ignore the last "," usually after the INE value
                            if valueData[-1:] == ',':
                                valueData = valueData[:-1]
                            # this removes redundant keywords that sometimes
                            # show with different cases (lower and upper)
                            keywordSet.add(valueData.lower())
                    dataset.tags = list(keywordSet)
                    dataset.tags.append('ine.pt')
                    dataset.frequency = 'unknown'
                elif childNode.nodeName == 'title':
                    dataset.title = fc.nodeValue
                elif childNode.nodeName == 'description':
                    dataset.description = fc.nodeValue
                elif childNode.nodeName == 'html':
                    for obj in childNode.childNodes:
                        if obj.nodeName == 'bdd_url':
                            dataset.description += "\n " + obj.firstChild.nodeValue
                elif childNode.nodeName == 'json':
                    for obj in childNode.childNodes:
                        if obj.nodeName == 'json_dataset':
                            dataset.resources.append(Resource(
                                title='Dataset json url',
                                description='Dataset em formato json',
                                url=obj.firstChild.nodeValue,
                                filetype='remote',
                                format='json'))
                        elif obj.nodeName == 'json_metainfo':
                            dataset.resources.append(Resource(
                                title='Json metainfo url',
                                description='Metainfo em formato json',
                                url=obj.firstChild.nodeValue,
                                filetype='remote',
                                format='json'))
    return dataset
def process(self, item):
    response = self.get_action('package_show', id=item.remote_id)
    data = self.validate(response['result'], self.schema)

    if type(data) == list:
        data = data[0]

    # Fix the remote_id: use the real ID instead of the unstable name
    item.remote_id = data['id']

    # Skip if no resource
    if not len(data.get('resources', [])):
        msg = 'Dataset {0} has no record'.format(item.remote_id)
        raise HarvestSkipException(msg)

    dataset = self.get_dataset(item.remote_id)

    # Core attributes
    if not dataset.slug:
        dataset.slug = data['name']
    dataset.title = data['title']
    dataset.description = parse_html(data['notes'])

    # Detect the organization
    organization_acronym = data['organization']['name']
    orgObj = Organization.objects(acronym=organization_acronym).first()
    if orgObj:
        # print 'Found %s' % orgObj.acronym
        dataset.organization = orgObj
    else:
        orgObj = Organization()
        orgObj.acronym = organization_acronym
        orgObj.name = data['organization']['title']
        orgObj.description = data['organization']['description']
        orgObj.save()
        # print 'Created %s' % orgObj.acronym
        dataset.organization = orgObj

    # Detect license
    default_license = self.harvest_config.get('license', License.default())
    dataset.license = License.guess(data['license_id'],
                                    data['license_title'],
                                    default=default_license)
    dataset.tags = [t['name'] for t in data['tags'] if t['name']]
    dataset.tags.append(urlparse(self.source.url).hostname)

    dataset.created_at = data['metadata_created']
    dataset.last_modified = data['metadata_modified']

    dataset.frequency = 'unknown'
    dataset.extras['ckan:name'] = data['name']

    temporal_start, temporal_end = None, None
    spatial_geom = None

    for extra in data['extras']:
        # GeoJSON representation (Polygon or Point)
        if extra['key'] == 'spatial':
            spatial_geom = json.loads(extra['value'])
        # Textual representation of the extent / location
        elif extra['key'] == 'spatial-text':
            log.debug('spatial-text value not handled')
        # Linked Data URI representing the place name
        elif extra['key'] == 'spatial-uri':
            log.debug('spatial-uri value not handled')
        # Update frequency
        elif extra['key'] == 'frequency':
            print 'frequency', extra['value']
        # Temporal coverage start
        elif extra['key'] == 'temporal_start':
            temporal_start = daterange_start(extra['value'])
            continue
        # Temporal coverage end
        elif extra['key'] == 'temporal_end':
            temporal_end = daterange_end(extra['value'])
            continue
        dataset.extras[extra['key']] = extra['value']

    # We don't want spatial coverage from the source;
    # only geozones explicitly configured on the harvester are used
    if self.harvest_config.get('geozones', False):
        dataset.spatial = SpatialCoverage()
        dataset.spatial.zones = []
        for zone in self.harvest_config.get('geozones'):
            geo_zone = GeoZone.objects.get(id=zone)
            dataset.spatial.zones.append(geo_zone)

    # if spatial_geom:
    #     dataset.spatial = SpatialCoverage()
    #     if spatial_geom['type'] == 'Polygon':
    #         coordinates = [spatial_geom['coordinates']]
    #     elif spatial_geom['type'] == 'MultiPolygon':
    #         coordinates = spatial_geom['coordinates']
    #     else:
    #         HarvestException('Unsupported spatial geometry')
    #     dataset.spatial.geom = {
    #         'type': 'MultiPolygon',
    #         'coordinates': coordinates
    #     }

    if temporal_start and temporal_end:
        dataset.temporal_coverage = db.DateRange(
            start=temporal_start,
            end=temporal_end,
        )

    # Remote URL
    if data.get('url'):
        try:
            url = uris.validate(data['url'])
        except uris.ValidationError:
            dataset.extras['remote_url'] = self.dataset_url(data['name'])
            dataset.extras['ckan:source'] = data['url']
        else:
            dataset.extras['remote_url'] = url

    dataset.extras['harvest:name'] = self.source.name

    current_resources = [str(resource.id) for resource in dataset.resources]
    fetched_resources = []

    # Resources
    for res in data['resources']:
        if res['resource_type'] not in ALLOWED_RESOURCE_TYPES:
            continue

        # Ignore invalid resources
        try:
            url = uris.validate(res['url'])
        except uris.ValidationError:
            continue

        try:
            resource = get_by(dataset.resources, 'id', UUID(res['id']))
        except Exception:
            log.error('Unable to parse resource ID %s', res['id'])
            continue

        fetched_resources.append(str(res['id']))
        if not resource:
            resource = Resource(id=res['id'])
            dataset.resources.append(resource)
        resource.title = res.get('name', '') or ''
        resource.description = parse_html(res.get('description'))
        resource.url = res['url']
        resource.filetype = 'remote'
        resource.format = res.get('format')
        resource.mime = res.get('mimetype')
        resource.hash = res.get('hash')
        resource.created = res['created']
        resource.modified = res['last_modified']
        resource.published = resource.published or resource.created

    # Clean up old resources removed from source
    for resource_id in current_resources:
        if resource_id not in fetched_resources:
            try:
                resource = get_by(dataset.resources, 'id', UUID(resource_id))
            except Exception:
                log.error('Unable to parse resource ID %s', resource_id)
                continue
            else:
                if resource and not self.dryrun:
                    dataset.resources.remove(resource)

    return dataset
def process(self, item):
    '''Return the DadosGov datasets with the corresponding original and xml file'''
    global REPORT_FILE_PATH, DADOSGOVPATH, DOWNLOADFILEPATH, DADOSGOVURL
    reload(sys)
    sys.setdefaultencoding('utf8')

    # Get or create a harvested dataset with this identifier.
    dataset = self.get_dataset(item.remote_id)
    # get the organization object, no check necessary, it should always exist
    orgObj = Organization.objects(id=item.kwargs['orgId']).first()

    print '------------------------------------'
    print 'Processing %s (%s)' % (dataset.title, item.remote_id)
    # print item.kwargs
    # print '--'

    # set additional vars
    dataset.tags = ['migrado']
    dataset.extras = {}
    dataset.organization = orgObj.id
    dataset.license = License.guess('cc-by')
    dataset.resources = []

    # *********************************************
    # go through the DB dataset information
    dataset.created_at = item.kwargs['createdOn']
    dataset.extras['links'] = item.kwargs['serviceUrl']

    # ********************************************************
    # ********************************************************
    req = requests.get(
        "http://%s/v1/%s/TableMetadata" % (DADOSGOVURL,
                                           item.kwargs['orgAcronym']),
        params={'$filter': "partitionkey eq '%s'" % item.remote_id},
        headers={'charset': 'utf8'})
    xmlRootData = req.content

    propertiesDoc = minidom.parseString(xmlRootData)
    propertiesStuff = propertiesDoc.getElementsByTagName('content')
    propEl = propertiesDoc.getElementsByTagNameNS('*', 'properties')

    if propEl:
        propertiesElements = propEl[0].childNodes
        # go through the API dataset information
        for propEl in propertiesElements:
            if propEl.nodeType == Node.ELEMENT_NODE:
                fc = propEl.firstChild
                if fc:
                    if propEl.nodeName == 'd:category':
                        dataset.tags.append(fc.nodeValue)
                    elif propEl.nodeName == 'd:keywords':
                        dataset.tags.extend([
                            currentTag.strip()
                            for currentTag in fc.nodeValue.split(',')
                        ])
                    # elif propEl.nodeName == 'd:PartitionKey':
                    #     dataset.slug = fc.nodeValue
                    elif propEl.nodeName == 'd:nameexternal':
                        dataset.title = fc.nodeValue
                    elif propEl.nodeName == 'd:description':
                        dataset.description = fc.nodeValue
                    elif propEl.nodeName == 'd:contact':
                        dataset.extras['contact'] = fc.nodeValue
                    elif propEl.nodeName == 'd:links' and fc.nodeValue:
                        dataset.extras['links'] = '%s, %s' % (
                            dataset.extras['links'], fc.nodeValue)

        # ********************************************************
        env = current_app.config.get('MIGRATION_URL')
        if env:
            fixedUrl = env
        else:
            fixedUrl = url_for('site.home', _external=True)
        fixedUrl = '%s/s/%s' % (fixedUrl[:fixedUrl.rfind('/', 0, -1)],
                                DADOSGOVPATH)

        # empty previous dataset resources
        dataset.resources = []
        # separate filename from extension
        filename = os.path.splitext(item.kwargs['filePath'])

        # ********************************************************
        # get xml by api and set the dataset resource field:
        # filenameXml = '%s.xml' % (filename[0])
        filenameXml = '%s.xml' % (item.remote_id)
        u = urllib2.urlopen("http://%s/v1/%s/%s" % (
            DADOSGOVURL, item.kwargs['orgAcronym'], item.remote_id))
        # create/open the local file to be written
        with open('%s/%s' % (DOWNLOADFILEPATH, filenameXml), 'wb') as f:
            # write file data
            f.write(u.read())
        # get file size info
        meta = u.info()
        fileSize = int(meta.getheaders("Content-Length")[0])
        fullPath = '%s/%s' % (fixedUrl, filenameXml)
        print fullPath
        # set the resource data for the dataset
        dataset.resources.append(
            Resource(title=dataset.title,
                     description='Dados em formato xml',
                     url=fullPath,
                     mime='text/xml',
                     format='xml',
                     filesize=fileSize,
                     created_at=item.kwargs['createdOn']))

        # ********************************************************
        # ********************************************************
        # get json by api and set the dataset resource field:
        filenameJson = '%s.json' % (item.remote_id)
        u = urllib2.urlopen("http://%s/v1/%s/%s?format=json" % (
            DADOSGOVURL, item.kwargs['orgAcronym'], item.remote_id))
        # create/open the local file to be written
        with open('%s/%s' % (DOWNLOADFILEPATH, filenameJson), 'wb') as f:
            # write file data
            f.write(u.read())
        # get file size info
        meta = u.info()
        fileSize = int(meta.getheaders("Content-Length")[0])
        fullPath = '%s/%s' % (fixedUrl, filenameJson)
        print fullPath
        # set the resource data for the dataset
        dataset.resources.append(
            Resource(title=dataset.title,
                     description='Dados em formato json',
                     url=fullPath,
                     mime='application/json',
                     format='json',
                     filesize=fileSize,
                     created_at=item.kwargs['createdOn']))

        # ********************************************************
        # ********************************************************
        # get original files using static path and ftp and set the dataset resource field
        if item.kwargs['filePath']:
            try:
                # https://dadosgovstorage.blob.core.windows.net/datasetsfiles/Acesso%20a%20Consultas%20M%C3%A9dicas%20pela%20Popula%C3%A7%C3%A3o%20Inscrita_636046701023924396.xlsx
                print '-- ** filePath ** --> %s' % item.kwargs['filePath']
                try:
                    urlSafe = urllib.quote(item.kwargs['filePath'])
                    print "https://dadosgovstorage.blob.core.windows.net/datasetsfiles/%s" % (urlSafe)
                    u = urllib2.urlopen(
                        "https://dadosgovstorage.blob.core.windows.net/datasetsfiles/%s" % (urlSafe))
                    # create/open the local file to be written
                    with open('%s/%s%s' % (DOWNLOADFILEPATH, item.remote_id,
                                           filename[1]), 'wb') as f:
                        # write file data
                        f.write(u.read())
                    # get file size info
                    meta = u.info()
                    fileSize = int(meta.getheaders("Content-Length")[0])
                    fullPath = '%s/%s%s' % (fixedUrl, item.remote_id,
                                            filename[1])
                    print fullPath
                    # set the resource data for the dataset
                    dataset.resources.append(
                        Resource(title=dataset.title,
                                 description='Ficheiro original (%s)' % (
                                     item.kwargs['filePath']),
                                 url=fullPath,
                                 mime='application/vnd.ms-excel',
                                 format=filename[1][1:],
                                 filesize=fileSize,
                                 created_at=item.kwargs['createdOn']))
                except KeyError:
                    print '************ Error ************'
                    print traceback.format_exc()
                    print '*******************************'
            # file not found exception
            except IOError as ex:
                print 'Original file not found:'
                print ex

        # ********************************************************
        print '--'
        print 'Returning %s' % dataset.title
        print '------------------------------------'

        with open(REPORT_FILE_PATH, 'a') as csvResFile:
            writer = csv.writer(csvResFile,
                                delimiter=chr(9),
                                quotechar=chr(34),
                                quoting=csv.QUOTE_MINIMAL)
            writer.writerow([
                item.remote_id, dataset.title, orgObj.name,
                json.dumps(dataset.tags, ensure_ascii=False),
                item.kwargs['filePath'], filenameXml, '', '[]'
            ])

        # update the number of datasets associated with this organization
        orgObj.metrics['datasets'] += 1
        orgObj.save()
        return dataset

    print 'No data returned from the API for the dataset %s' % (item.remote_id)
    with open(REPORT_FILE_PATH, 'a') as csvResFile:
        writer = csv.writer(csvResFile,
                            delimiter=chr(9),
                            quotechar=chr(34),
                            quoting=csv.QUOTE_MINIMAL)
        writer.writerow([
            item.remote_id, '', '', '', item.kwargs['filePath'], '', '', '[]'
        ])
    return None
def process(self, item):
    response = self.get_action('package_show', id=item.remote_id)
    data = self.validate(response['result'], self.schema)

    if type(data) == list:
        data = data[0]

    # Fix the remote_id: use the real ID instead of the unstable name
    item.remote_id = data['id']

    # Skip if no resource
    if not len(data.get('resources', [])):
        msg = 'Dataset {0} has no record'.format(item.remote_id)
        raise HarvestSkipException(msg)

    dataset = self.get_dataset(item.remote_id)

    # Core attributes
    if not dataset.slug:
        dataset.slug = data['name']
    dataset.title = data['title']
    dataset.description = parse_html(data['notes'])

    # Detect license
    default_license = dataset.license or License.default()
    dataset.license = License.guess(data['license_id'],
                                    data['license_title'],
                                    default=default_license)
    dataset.tags = [t['name'] for t in data['tags'] if t['name']]

    dataset.created_at = data['metadata_created']
    dataset.last_modified = data['metadata_modified']

    dataset.extras['ckan:name'] = data['name']

    temporal_start, temporal_end = None, None
    spatial_geom, spatial_zone = None, None

    for extra in data['extras']:
        key = extra['key']
        value = extra['value']
        if value is None or (isinstance(value, str) and not value.strip()):
            # Skip empty extras
            continue
        elif key == 'spatial':
            # GeoJSON representation (Polygon or Point)
            spatial_geom = json.loads(value)
        elif key == 'spatial-text':
            # Textual representation of the extent / location
            qs = GeoZone.objects(db.Q(name=value) | db.Q(slug=value))
            qs = qs.valid_at(datetime.now())
            if qs.count() == 1:
                spatial_zone = qs.first()
            else:
                dataset.extras['ckan:spatial-text'] = value
                log.debug('spatial-text value not handled: %s', value)
        elif key == 'spatial-uri':
            # Linked Data URI representing the place name
            dataset.extras['ckan:spatial-uri'] = value
            log.debug('spatial-uri value not handled: %s', value)
        elif key == 'frequency':
            # Update frequency
            freq = frequency_from_rdf(value)
            if freq:
                dataset.frequency = freq
            elif value in UPDATE_FREQUENCIES:
                dataset.frequency = value
            else:
                dataset.extras['ckan:frequency'] = value
                log.debug('frequency value not handled: %s', value)
        # Temporal coverage start
        elif key == 'temporal_start':
            temporal_start = daterange_start(value)
        # Temporal coverage end
        elif key == 'temporal_end':
            temporal_end = daterange_end(value)
        else:
            dataset.extras[extra['key']] = value

    if spatial_geom or spatial_zone:
        dataset.spatial = SpatialCoverage()

    if spatial_zone:
        dataset.spatial.zones = [spatial_zone]

    if spatial_geom:
        if spatial_geom['type'] == 'Polygon':
            coordinates = [spatial_geom['coordinates']]
        elif spatial_geom['type'] == 'MultiPolygon':
            coordinates = spatial_geom['coordinates']
        else:
            raise HarvestException('Unsupported spatial geometry')
        dataset.spatial.geom = {
            'type': 'MultiPolygon',
            'coordinates': coordinates
        }

    if temporal_start and temporal_end:
        dataset.temporal_coverage = db.DateRange(
            start=temporal_start,
            end=temporal_end,
        )

    # Remote URL
    dataset.extras['remote_url'] = self.dataset_url(data['name'])
    if data.get('url'):
        try:
            url = uris.validate(data['url'])
        except uris.ValidationError:
            dataset.extras['ckan:source'] = data['url']
        else:
            # use declared `url` as `remote_url` if any
            dataset.extras['remote_url'] = url

    # Resources
    for res in data['resources']:
        if res['resource_type'] not in ALLOWED_RESOURCE_TYPES:
            continue
        try:
            resource = get_by(dataset.resources, 'id', UUID(res['id']))
        except Exception:
            log.error('Unable to parse resource ID %s', res['id'])
            continue
        if not resource:
            resource = Resource(id=res['id'])
            dataset.resources.append(resource)
        resource.title = res.get('name', '') or ''
        resource.description = parse_html(res.get('description'))
        resource.url = res['url']
        resource.filetype = 'remote'
        resource.format = res.get('format')
        resource.mime = res.get('mimetype')
        resource.hash = res.get('hash')
        resource.created = res['created']
        resource.modified = res['last_modified']
        resource.published = resource.published or resource.created
    return dataset