def process(self, item):
        '''Generate a random dataset from a fake identifier'''
        # Get or create a harvested dataset with this identifier.
        # Harvest metadata are already filled on creation.
        dataset = self.get_dataset(item.remote_id)

        # Here comes your implementation. You should:
        # - fetch the remote dataset (if necessary)
        # - validate the fetched payload
        # - map its content to the dataset fields
        # - store extra significant data in the `extras` attribute
        # - map resources data

        dataset.title = faker.sentence()
        dataset.description = faker.text()
        dataset.tags = list(set(faker.words(nb=faker.pyint())))

        # Resources
        for i in range(faker.pyint()):
            dataset.resources.append(
                Resource(title=faker.sentence(),
                         description=faker.text(),
                         url=faker.url(),
                         filetype='remote',
                         mime=faker.mime_type(category='text'),
                         format=faker.file_extension(category='text'),
                         filesize=faker.pyint()))

        return dataset
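The faker-based template above only fakes the checklist in its comment. Fleshed out against a real source, the fetch/validate/map steps might look roughly like this; the endpoint URL and the validate_payload helper are hypothetical, invented for illustration:

import requests

def process(self, item):
    dataset = self.get_dataset(item.remote_id)
    # Fetch the remote dataset (hypothetical JSON endpoint)
    response = requests.get('https://example.org/api/datasets/%s' % item.remote_id)
    response.raise_for_status()
    payload = response.json()
    # Validate the fetched payload (validate_payload is an assumed helper)
    payload = self.validate_payload(payload)
    # Map its content to the dataset fields
    dataset.title = payload['title']
    dataset.description = payload['description']
    # Store extra significant data in the `extras` attribute
    dataset.extras['remote_url'] = payload.get('url')
    return dataset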
Example 2
    def process(self, item):
        kwargs = item.kwargs
        item.remote_id = kwargs['remote_id']
        dataset = self.get_dataset(item.remote_id)
        dataset.title = kwargs['title']
        dataset.tags = kwargs.get('tags', '').split(',')
        dataset.private = True
        dataset.frequency = kwargs.get('dataset_frequency', 'annual')
        license = kwargs.get('dataset_license', 'cc-zero')
        dataset.license = License.objects.get(id=license)
        resources = kwargs['resources']

        description = u"Ce jeu de données contient: <br>"
        for resource in resources:
            description += resource['title'] + "<br>"

        dataset.description = description

        # Force recreation of all resources
        dataset.resources = []
        for resource in resources:
            new_resource = Resource(title=resource['title'],
                                    url=resource['link'],
                                    filetype=resource['filetype'],
                                    format=resource['format'],
                                    description=resource.get(
                                        'description', ''))
            dataset.resources.append(new_resource)
        return dataset
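One edge case in the example above: ''.split(',') returns [''], so when the tags kwarg is absent the dataset ends up with a single empty tag. A small guard, if that matters:

raw_tags = kwargs.get('tags', '')
dataset.tags = [t.strip() for t in raw_tags.split(',') if t.strip()]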
Example 3
    def process(self, item):
        dataset = self.get_dataset(item.remote_id)

        # Here comes your implementation. You should:
        # - fetch the remote dataset (if necessary)
        # - validate the fetched payload
        # - map its content to the dataset fields
        # - store extra significant data in the `extras` attribute
        # - map resources data

        kwargs = item.kwargs
        dataset.title = kwargs['title']
        dataset.tags = ["statec-harvesting"]
        resources = kwargs['resources']

        description = u"Ce jeu de données contient: <br>"
        for resource in resources:
            description += resource['title'] + "<br>"
        description += "<br>---------------------------------------"
        description += """<br> Automatically synched from
                    portail statistique (category %s)""" % dataset.title

        dataset.description = description

        # Force recreation of all resources
        dataset.resources = []
        for resource in resources:
            url = resource['link']
            url = url.replace('tableView', 'download')
            params = {
                'IF_DOWNLOADFORMAT': 'csv',
                'IF_DOWNLOAD_ALL_ITEMS': 'yes'
            }

            url_parts = list(urlparse.urlparse(url))
            query = dict(urlparse.parse_qsl(url_parts[4]))
            query.update(params)
            url_parts[4] = urlencode(query)
            download_url = urlparse.urlunparse(url_parts)

            new_resource = Resource(title=resource['title'],
                                    url=download_url,
                                    filetype='remote',
                                    format='csv')
            already_there = any(
                d['title'] == resource['title'] and d['url'] == download_url
                for d in dataset.resources)
            if not already_there:
                dataset.resources.append(new_resource)

        return dataset
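The URL rewriting above (merging fixed download parameters into whatever query string the link already carries) is self-contained enough to pull into a helper. A sketch, written against Python 3's urllib.parse since the example relies on the Python 2 urlparse module:

from urllib.parse import urlparse, urlunparse, parse_qsl, urlencode

def with_query_params(url, extra_params):
    # Merge extra_params into the existing query string (overriding
    # duplicate keys) and rebuild the URL.
    parts = list(urlparse(url))
    parts[4] = urlencode(dict(parse_qsl(parts[4]), **extra_params))
    return urlunparse(parts)

# Usage, mirroring the example:
# download_url = with_query_params(url.replace('tableView', 'download'),
#                                  {'IF_DOWNLOADFORMAT': 'csv',
#                                   'IF_DOWNLOAD_ALL_ITEMS': 'yes'})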
Example 4
 def process_resources(self, dataset, data, formats):
     dataset_id = data["datasetid"]
     ods_metadata = data["metas"]
     description = self.description_from_fields(data['fields'])
     for format in formats:
         label, udata_format, mime = self.FORMATS[format]
         resource = Resource(title='Export au format {0}'.format(label),
                             description=description,
                             filetype='remote',
                             url=self._get_download_url(dataset_id, format),
                             format=udata_format,
                             mime=mime)
         resource.modified = ods_metadata["modified"]
         dataset.resources.append(resource)
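self.FORMATS here is presumably a class-level mapping from the ODS export format key to a (label, udata format, mime type) triple; something along these lines, with hypothetical values:

FORMATS = {
    'csv': ('CSV', 'csv', 'text/csv'),
    'json': ('JSON', 'json', 'application/json'),
    'xls': ('Excel', 'xls', 'application/vnd.ms-excel'),
}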
Example 5
    def process(self, item):
        dataset = self.get_dataset(item.remote_id)
        # Here comes your implementation. You should:
        # - fetch the remote dataset (if necessary)
        # - validate the fetched payload
        # - map its content to the dataset fields
        # - store extra significant data in the `extras` attribute
        # - map resources data

        kwargs = item.kwargs
        item = kwargs['item']

        dataset.title = item['title']
        dataset.license = License.guess('cc-by')
        dataset.tags = ["snig.dgterritorio.gov.pt"]
        dataset.description = item['description']

        if item.get('date'):
            dataset.created_at = item['date']

        for keyword in item.get('keywords', []):
            dataset.tags.append(keyword)

        # Force recreation of all resources
        dataset.resources = []

        for resource in item.get("resources"):

            parsed = urlparse.urlparse(resource['url'])
            try:
                format = str(urlparse.parse_qs(parsed.query)['service'][0])
            except KeyError:
                format = resource['url'].split('.')[-1]

            new_resource = Resource(title=item['title'],
                                    url=resource['url'],
                                    filetype='remote',
                                    format=format)

            dataset.resources.append(new_resource)

        dataset.extras['harvest:name'] = self.source.name

        return dataset
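The format detection above first looks for an OGC-style 'service' query parameter (WMS/WFS links) and falls back to the file extension. Distilled into a helper, written for Python 3's urllib.parse (a sketch):

from urllib.parse import urlparse, parse_qs

def guess_format(url):
    # Prefer an OGC-style `service` query parameter (WMS/WFS links),
    # fall back to the file extension.
    query = parse_qs(urlparse(url).query)
    if 'service' in query:
        return str(query['service'][0])
    return url.split('.')[-1]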
Example 6
    def process(self, item):
        dataset = self.get_dataset(item.remote_id)
        # Here comes your implementation. You should:
        # - fetch the remote dataset (if necessary)
        # - validate the fetched payload
        # - map its content to the dataset fields
        # - store extra significant data in the `extras` attribute
        # - map resources data

        kwargs = item.kwargs
        dataset.title = kwargs['title']
        dataset.license = License.guess('cc-by')
        dataset.tags = ["apambiente.pt"]
        item = kwargs['item']

        dataset.description = item['summary']

        if kwargs['date']:
            dataset.created_at = kwargs['date']

        # Force recreation of all resources
        dataset.resources = []
        for resource in item['links']:
            url = resource['href'].replace('\\', '').replace(' ', '%20')
            type = resource['type']

            if type == 'details':
                dataset.description += "<br>"
                dataset.description += "<br>Mais detalhes : <a href=\"%s\" target=\"_blank\">%s</a>" % (
                    url, dataset.title)

            if type == 'open':
                url_parts = list(urlparse.urlparse(url))
                parts = url_parts[2].split('.')
                format = parts[-1] if len(parts) > 1 else 'wms'
                new_resource = Resource(title=dataset.title,
                                        url=url,
                                        filetype='remote',
                                        format=format.lower())
                dataset.resources.append(new_resource)

        return dataset
Example 7
    def process(self, item):
        dataset = self.get_dataset(item.remote_id)
        # Here comes your implementation. You should:
        # - fetch the remote dataset (if necessary)
        # - validate the fetched payload
        # - map its content to the dataset fields
        # - store extra significant data in the `extras` attribute
        # - map resources data

        kwargs = item.kwargs
        dataset.title = kwargs['title']
        dataset.license = License.guess('cc-by')
        dataset.tags = ["apambiente.pt"]
        item = kwargs['item']

        dataset.description = item.get('description')

        if kwargs['date']:
            dataset.created_at = kwargs['date']

        # Force recreation of all resources
        dataset.resources = []

        url = item.get('url')

        if item.get('type') == "liveData":
            type = "wms"
        else:
            type = url.split('.')[-1].lower()
            if len(type) > 3:
                type = "wms"

        new_resource = Resource(title=dataset.title,
                                url=url,
                                filetype='remote',
                                format=type)
        dataset.resources.append(new_resource)

        return dataset
Example 8
 def process(self, item):
     # dataset_id is found in item.remote_id
     dataset_id = item.remote_id
     # we want to avoid a same id from another
     # source so we add 'instn' in front of it
     dataset = self.get_dataset('instn-%s' % item.remote_id)
     # We saved the theme name as a tag and the dataset name
     # as the title
     dataset.title = item.kwargs['dataset_name']
     dataset.tags = [item.kwargs['theme_name']]
     # We empty the existing resources (= the different "files" for this dataset)
     # Here we will have only one probably
     dataset.resources = []
     # Getting the xls file url
     dataset_request = requests.get(
         'http://beta.ins.tn/fr/node/get/nojs/%s' % dataset_id)
     dataset_soup = BeautifulSoup(dataset_request.content, 'html.parser')
     dataset_xls = dataset_soup.find_all('li', {"class": "data"})
     for xls in dataset_xls:
         url = xls.find('a').attrs['href']
         # We have the url, let's do a HEAD request
         # to get the file size without downloading the whole file
         size_request = requests.head(url)
         file_size = size_request.headers['Content-length']
         # We're good, let's add the file
         # to the dataset resources
         dataset.resources.append(
             Resource(
                 title=dataset.title,
                 url=url,
                 filetype='remote',
                 mime=
                 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
                 format='xlsx',
                 filesize=file_size))
     # at the end, always return the dataset
     return dataset
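One caveat with the HEAD request above: not every server sends Content-Length, so the header lookup can raise KeyError. A defensive variant (requests header access is case-insensitive, so 'Content-length' and 'Content-Length' are equivalent):

size_request = requests.head(url, allow_redirects=True)
file_size = size_request.headers.get('Content-Length')  # None when absent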
Example 9
 def get_resource(self, dataset, url):
     resource = get_by(dataset.resources, 'url', url)
     if not resource:
         return True, Resource(url=url)
     return False, resource
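The boolean in the returned pair tells the caller whether the resource still has to be attached to the dataset. A hypothetical call site:

created, resource = self.get_resource(dataset, url)
resource.title = title  # fill or refresh fields either way
if created:
    dataset.resources.append(resource)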
Example 10
    def process(self, item):
        response = self.get_action('package_show', id=item.remote_id)
        data = self.validate(response['result'], schema)

        # Fix the remote_id: use the real ID instead of the unstable name
        item.remote_id = data['id']

        # Skip if no resource
        if not len(data.get('resources', [])):
            msg = 'Dataset {0} has no record'.format(item.remote_id)
            raise HarvestSkipException(msg)

        dataset = self.get_dataset(item.remote_id)

        # Core attributes
        if not dataset.slug:
            dataset.slug = data['name']
        dataset.title = data['title']
        dataset.description = data['notes']
        dataset.license = License.objects(id=data['license_id']).first()
        # dataset.license = license or License.objects.get(id='notspecified')
        dataset.tags = [t['name'] for t in data['tags'] if t['name']]

        dataset.created_at = data['metadata_created']
        dataset.last_modified = data['metadata_modified']

        dataset.extras['ckan:name'] = data['name']

        temporal_start, temporal_end = None, None
        spatial_geom = None

        for extra in data['extras']:
            # GeoJSON representation (Polygon or Point)
            if extra['key'] == 'spatial':
                spatial_geom = json.loads(extra['value'])
            #  Textual representation of the extent / location
            elif extra['key'] == 'spatial-text':
                log.debug('spatial-text value not handled')
                print 'spatial-text', extra['value']
            # Linked Data URI representing the place name
            elif extra['key'] == 'spatial-uri':
                log.debug('spatial-uri value not handled')
                print 'spatial-uri', extra['value']
            # Update frequency
            elif extra['key'] == 'frequency':
                print 'frequency', extra['value']
            # Temporal coverage start
            elif extra['key'] == 'temporal_start':
                print 'temporal_start', extra['value']
                temporal_start = daterange_start(extra['value'])
                continue
            # Temporal coverage end
            elif extra['key'] == 'temporal_end':
                print 'temporal_end', extra['value']
                temporal_end = daterange_end(extra['value'])
                continue
            # else:
            #     print extra['key'], extra['value']
            dataset.extras[extra['key']] = extra['value']

        if spatial_geom:
            dataset.spatial = SpatialCoverage()
            if spatial_geom['type'] == 'Polygon':
                coordinates = [spatial_geom['coordinates']]
            elif spatial_geom['type'] == 'MultiPolygon':
                coordinates = spatial_geom['coordinates']
            else:
                raise HarvestException('Unsupported spatial geometry')
            dataset.spatial.geom = {
                'type': 'MultiPolygon',
                'coordinates': coordinates
            }

        if temporal_start and temporal_end:
            dataset.temporal_coverage = db.DateRange(
                start=temporal_start,
                end=temporal_end,
            )

        # Remote URL
        if data.get('url'):
            dataset.extras['remote_url'] = data['url']

        # Resources
        for res in data['resources']:
            if res['resource_type'] not in ALLOWED_RESOURCE_TYPES:
                continue
            try:
                resource = get_by(dataset.resources, 'id', UUID(res['id']))
            except Exception:
                log.error('Unable to parse resource ID %s', res['id'])
                continue
            if not resource:
                resource = Resource(id=res['id'])
                dataset.resources.append(resource)
            resource.title = res.get('name', '') or ''
            resource.description = res.get('description')
            resource.url = res['url']
            resource.filetype = ('api' if res['resource_type'] == 'api'
                                 else 'remote')
            resource.format = res.get('format')
            resource.mime = res.get('mimetype')
            resource.hash = res.get('hash')
            resource.created = res['created']
            resource.modified = res['last_modified']
            resource.published = resource.published or resource.created

        return dataset
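get_by does a linear scan for the first element whose attribute matches; if the helper is unfamiliar, its semantics amount to the following sketch (not udata's actual source):

def get_by(collection, attr, value):
    # Return the first item whose `attr` equals `value`, else None.
    for obj in collection:
        if getattr(obj, attr, None) == value:
            return obj
    return None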
Example 11
    def process(self, item):
        dataset = self.get_dataset(item.remote_id)

        # Here comes your implementation. You should:
        # - fetch the remote dataset (if necessary)
        # - validate the fetched payload
        # - map its content to the dataset fields
        # - store extra significant data in the `extras` attribute
        # - map resources data

        # Check whether this is a new dataset and give it a placeholder title
        if dataset.title is None:
            dataset.title = ''

        # Build the new tag list and make sure it only contains unique tags
        tags = list(dataset.tags)
        tags.append("statec-harvesting")
        dataset.tags = list(set(tags))

        # return the gathered resources of the items
        # or return the updated list of all the resources of the given dataset
        resources = self.__update_resources(item, dataset)

        if dataset.title == '':
            dataset.title = item.kwargs['title']

        # Rebuild the dataset description
        description = u"This dataset includes the following resource(s): <br>"
        for resource in resources:
            description += resource.title + "<br>"
        description += "<br>---------------------------------------"
        description += """<br> Automatically synched from
                    portail statistique (category %s)""" % dataset.title

        dataset.description = description

        # Force recreation of all resources
        dataset.resources = []
        for resource in resources:
            url = resource.url
            download_url = url

            # check if the resource format is csv and handle the link creation accordingly
            if resource.format == 'csv':
                url = url.replace('tableView', 'download')
                params = {
                    'IF_DOWNLOADFORMAT': 'csv',
                    'IF_DOWNLOAD_ALL_ITEMS': 'yes'
                }

                url_parts = list(urlparse.urlparse(url))
                query = dict(urlparse.parse_qsl(url_parts[4]))
                query.update(params)
                url_parts[4] = urlencode(query)
                download_url = urlparse.urlunparse(url_parts)

            # The newly created resource
            new_resource = Resource(
                title=resource.title,
                description=resource.title,
                url=download_url,
                filetype='remote',
                format=resource.format
            )

            dataset.resources.append(new_resource)

        return dataset
Example 12
    def remote_datasets(self):
        response = self.get('package_list')
        for name in response['result']:
            details = self.get('package_show', {'id': name})['result']
            dataset = self.get_harvested(Dataset, details['id'])

            # Core attributes
            dataset.slug = details['name']
            dataset.title = details['title']
            dataset.description = details.get('notes', 'No description')
            dataset.license = License.objects(
                id=details['license_id']).first() or License.objects.get(
                    id='notspecified')
            dataset.tags = [tag['name'].lower() for tag in details['tags']]

            dataset.frequency = self.map('frequency', details) or 'unknown'
            dataset.created_at = parse(details['metadata_created'])
            dataset.last_modified = parse(details['metadata_modified'])

            if any_field(details, 'territorial_coverage',
                         'territorial_coverage_granularity'):
                coverage = TerritorialCoverage(
                    codes=[
                        code.strip() for code in details.get(
                            'territorial_coverage', '').split(',')
                        if code.strip()
                    ],
                    granularity=self.map('territorial_coverage_granularity',
                                         details),
                )
                dataset.extras['territorial_coverage'] = coverage
                try:
                    dataset.spatial = territorial_to_spatial(dataset)
                except Exception as e:
                    print 'Error while processing spatial coverage for {0}:'.format(
                        dataset.title), e

            if all_field(details, 'temporal_coverage_from',
                         'temporal_coverage_to'):
                try:
                    dataset.temporal_coverage = db.DateRange(
                        start=daterange_start(
                            details.get('temporal_coverage_from')),
                        end=daterange_end(details.get('temporal_coverage_to')),
                    )
                except Exception:
                    log.error(
                        'Unable to parse temporal coverage for dataset %s',
                        details['id'])

            # Organization
            if details.get('organization'):
                dataset.organization = self.get_harvested(
                    Organization, details['organization']['id'], False)
            else:
                # Need to fetch user from roles
                roles = self.get('roles_show',
                                 {'domain_object': name})['result']['roles']
                for role in roles:
                    if role['role'] == 'admin' and role['context'] == 'Package':
                        dataset.owner = self.get_harvested(
                            User, role['user_id'])
                        break

            # Supplier
            if details.get('supplier_id'):
                dataset.supplier = self.get_harvested(Organization,
                                                      details['supplier_id'],
                                                      False)

            # Remote URL
            if details.get('url'):
                dataset.extras['remote_url'] = details['url']

            # Extras
            if 'extras' in details:
                extra_mapping = self.harvester.mapping.get('from_extras', {})
                for extra in details['extras']:
                    if extra['key'] in self.harvester.mapping:
                        value = self.harvester.mapping[extra['key']].get(
                            extra['value'])
                    else:
                        value = extra['value']
                    if extra['key'] in extra_mapping:
                        setattr(dataset, extra_mapping[extra['key']], value)
                    else:
                        dataset.extras[extra['key']] = value

            # Resources
            for res in details['resources']:
                try:
                    resource = get_by(dataset.resources, 'id', UUID(res['id']))
                except:
                    log.error('Unable to parse resource %s', res['id'])
                    continue
                if not resource:
                    resource = Resource(id=res['id'])
                    dataset.resources.append(resource)
                resource.title = res.get('name', '') or ''
                resource.url = res['url']
                resource.description = res.get('description')
                resource.format = res.get('format')
                resource.hash = res.get('hash')
                resource.created = parse(res['created'])
                resource.modified = parse(res['revision_timestamp'])
                resource.published = resource.published or resource.created
            yield dataset

            if dataset.id:
                followers = self.get('dataset_follower_list',
                                     {'id': name})['result']
                for follower in followers:
                    user = self.get_harvested(User, follower['id'], False)
                    if user:
                        follow, created = FollowDataset.objects.get_or_create(
                            follower=user, following=dataset)
Example 13
    def process(self, item):
        '''Return the INE datasets'''

        dataset = self.get_dataset(item.remote_id)

        # get remote data for dataset
        req = requests.get("https://www.ine.pt/ine/xml_indic.jsp",
                           params={
                               'varcd': item.remote_id,
                               'lang': 'PT',
                               'opc': '1'
                           },
                           headers={'charset': 'utf8'})

        returnedData = req.content
        print 'Getting metadata for %s' % item.remote_id

        keywordSet = set()
        dataset.license = License.guess('cc-by')
        dataset.resources = []
        doc = minidom.parseString(returnedData)
        properties = doc.getElementsByTagName('indicator')
        # go through the API dataset information
        for propNode in properties:
            for childNode in propNode.childNodes:
                # print childNode
                fc = childNode.firstChild
                if fc:
                    if childNode.nodeName == 'keywords':
                        for obj in childNode.childNodes:
                            # INE needs to create a proper xml file...
                            valueData = obj.nodeValue
                            # need to ignore the ',' nodes
                            if obj.nodeValue != ',':
                                # need to ignore the last "," usually after the INE value
                                if valueData[-1:] == ',':
                                    valueData = valueData[:-1]
                                # this removes redundant keywords that sometimes show with different cases (lower and upper)
                                keywordSet.add(valueData.lower())

                        dataset.tags = list(keywordSet)
                        dataset.tags.append('ine.pt')
                        dataset.frequency = 'unknown'

                    elif childNode.nodeName == 'title':
                        dataset.title = fc.nodeValue

                    elif childNode.nodeName == 'description':
                        dataset.description = fc.nodeValue

                    elif childNode.nodeName == 'html':
                        for obj in childNode.childNodes:
                            if obj.nodeName == 'bdd_url':
                                dataset.description += "\n " + obj.firstChild.nodeValue

                    elif childNode.nodeName == 'json':
                        for obj in childNode.childNodes:
                            if obj.nodeName == 'json_dataset':
                                dataset.resources.append(
                                    Resource(
                                        title='Dataset json url',
                                        description='Dataset em formato json',
                                        url=obj.firstChild.nodeValue,
                                        filetype='remote',
                                        format='json'))
                            elif obj.nodeName == 'json_metainfo':
                                dataset.resources.append(
                                    Resource(
                                        title='Json metainfo url',
                                        description='Metainfo em formato json',
                                        url=obj.firstChild.nodeValue,
                                        filetype='remote',
                                        format='json'))
        return dataset
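The minidom walk above is shaped by quirks of the INE feed (bare ',' text nodes between keywords, trailing commas). For comparison, the keyword extraction could be written with ElementTree, assuming the same <indicator>/<keywords> layout; a sketch:

import xml.etree.ElementTree as ET

def extract_keywords(xml_bytes):
    keywords = set()
    root = ET.fromstring(xml_bytes)
    for node in root.iter('keywords'):
        # itertext() flattens the mixed text/element content
        for chunk in ''.join(node.itertext()).split(','):
            chunk = chunk.strip()
            if chunk:
                keywords.add(chunk.lower())
    return keywords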
Example 14
    def process(self, item):
        response = self.get(item.remote_id)
        encoding = chardet.detect(response.content)['encoding']
        xml = self.parse_xml(response.content.decode(encoding))
        metadata = xml['metadata']

        # Resolve and remote id from metadata
        item.remote_id = metadata['id']
        dataset = self.get_dataset(metadata['id'])

        dataset.title = metadata['title']
        dataset.frequency = FREQUENCIES.get(metadata['frequency'], 'unknown')
        dataset.description = metadata['notes']
        dataset.private = metadata['private']
        dataset.tags = sorted(set(metadata['tags']))

        if metadata.get('license_id'):
            dataset.license = License.objects.get(id=metadata['license_id'])

        if (metadata.get('temporal_coverage_from')
                and metadata.get('temporal_coverage_to')):
            dataset.temporal_coverage = db.DateRange(
                start=metadata['temporal_coverage_from'],
                end=metadata['temporal_coverage_to'])

        if (metadata.get('territorial_coverage_code')
                or metadata.get('territorial_coverage_granularity')):
            dataset.spatial = SpatialCoverage()

            if metadata.get('territorial_coverage_granularity'):
                dataset.spatial.granularity = GRANULARITIES.get(
                    metadata['territorial_coverage_granularity'])

            if metadata.get('territorial_coverage_code'):
                dataset.spatial.zones = [
                    ZONES[metadata['territorial_coverage_code']]
                ]

        dataset.resources = []
        cle = get_by(metadata['resources'], 'format', 'cle')
        for row in metadata['resources']:
            if row['format'] == 'cle':
                continue
            else:
                resource = Resource(title=row['name'],
                                    description=(row['description'] + '\n\n' +
                                                 SSL_COMMENT).strip(),
                                    filetype='remote',
                                    url=row['url'],
                                    format=row['format'])
                if resource.format == 'csv' and cle:
                    resource.checksum = Checksum(type='sha256',
                                                 value=self.get(
                                                     cle['url']).text)
                if row.get('last_modified'):
                    resource.modified = row['last_modified']
                dataset.resources.append(resource)

        if metadata.get('author'):
            dataset.extras['author'] = metadata['author']
        if metadata.get('author_email'):
            dataset.extras['author_email'] = metadata['author_email']
        if metadata.get('maintainer'):
            dataset.extras['maintainer'] = metadata['maintainer']
        if metadata.get('maintainer_email'):
            dataset.extras['maintainer_email'] = metadata['maintainer_email']
        for extra in metadata['extras']:
            dataset.extras[extra['key']] = extra['value']

        return dataset
Example 15
 def on_form_valid(self, form):
     resource = Resource()
     form.populate_obj(resource)
     self.dataset.add_community_resource(resource)
     return redirect(url_for('datasets.show', dataset=self.dataset))
Example 16
    def process(self, item):
        response = self.get_action('package_show', id=item.remote_id)
        data = self.validate(response['result'], self.schema)

        if isinstance(data, list):
            data = data[0]

        # Fix the remote_id: use the real ID instead of the unstable name
        item.remote_id = data['id']

        # Skip if no resource
        if not len(data.get('resources', [])):
            msg = 'Dataset {0} has no record'.format(item.remote_id)
            raise HarvestSkipException(msg)

        dataset = self.get_dataset(item.remote_id)

        # Core attributes
        if not dataset.slug:
            dataset.slug = data['name']
        dataset.title = data['title']
        dataset.description = parse_html(data['notes'])

        # Detect Org
        organization_acronym = data['organization']['name']
        orgObj = Organization.objects(acronym=organization_acronym).first()
        if orgObj:
            #print 'Found %s' % orgObj.acronym
            dataset.organization = orgObj
        else:
            orgObj = Organization()
            orgObj.acronym = organization_acronym
            orgObj.name = data['organization']['title']
            orgObj.description = data['organization']['description']
            orgObj.save()
            #print 'Created %s' % orgObj.acronym

            dataset.organization = orgObj

        # Detect license
        default_license = self.harvest_config.get('license', License.default())
        dataset.license = License.guess(data['license_id'],
                                        data['license_title'],
                                        default=default_license)

        dataset.tags = [t['name'] for t in data['tags'] if t['name']]

        dataset.tags.append(urlparse(self.source.url).hostname)

        dataset.created_at = data['metadata_created']
        dataset.last_modified = data['metadata_modified']

        dataset.frequency = 'unknown'
        dataset.extras['ckan:name'] = data['name']

        temporal_start, temporal_end = None, None
        spatial_geom = None

        for extra in data['extras']:
            # GeoJSON representation (Polygon or Point)
            if extra['key'] == 'spatial':
                spatial_geom = json.loads(extra['value'])
            #  Textual representation of the extent / location
            elif extra['key'] == 'spatial-text':
                log.debug('spatial-text value not handled')
            # Linked Data URI representing the place name
            elif extra['key'] == 'spatial-uri':
                log.debug('spatial-uri value not handled')
            # Update frequency
            elif extra['key'] == 'frequency':
                print 'frequency', extra['value']
            # Temporal coverage start
            elif extra['key'] == 'temporal_start':
                temporal_start = daterange_start(extra['value'])
                continue
            # Temporal coverage end
            elif extra['key'] == 'temporal_end':
                temporal_end = daterange_end(extra['value'])
                continue
            dataset.extras[extra['key']] = extra['value']

        # We don't want spatial to be added on harvester
        if self.harvest_config.get('geozones', False):
            dataset.spatial = SpatialCoverage()
            dataset.spatial.zones = []
            for zone in self.harvest_config.get('geozones'):
                geo_zone = GeoZone.objects.get(id=zone)
                dataset.spatial.zones.append(geo_zone)
        #
        # if spatial_geom:
        #     dataset.spatial = SpatialCoverage()
        #     if spatial_geom['type'] == 'Polygon':
        #         coordinates = [spatial_geom['coordinates']]
        #     elif spatial_geom['type'] == 'MultiPolygon':
        #         coordinates = spatial_geom['coordinates']
        #     else:
        #         HarvestException('Unsupported spatial geometry')
        #     dataset.spatial.geom = {
        #         'type': 'MultiPolygon',
        #         'coordinates': coordinates
        #     }

        if temporal_start and temporal_end:
            dataset.temporal_coverage = db.DateRange(
                start=temporal_start,
                end=temporal_end,
            )

        # Remote URL
        if data.get('url'):
            try:
                url = uris.validate(data['url'])
            except uris.ValidationError:
                dataset.extras['remote_url'] = self.dataset_url(data['name'])
                dataset.extras['ckan:source'] = data['url']
            else:
                dataset.extras['remote_url'] = url

        dataset.extras['harvest:name'] = self.source.name

        current_resources = [
            str(resource.id) for resource in dataset.resources
        ]
        fetched_resources = []
        # Resources
        for res in data['resources']:
            if res['resource_type'] not in ALLOWED_RESOURCE_TYPES:
                continue

            # Ignore invalid resources
            try:
                url = uris.validate(res['url'])
            except uris.ValidationError:
                continue

            try:
                resource = get_by(dataset.resources, 'id', UUID(res['id']))
            except Exception:
                log.error('Unable to parse resource ID %s', res['id'])
                continue

            fetched_resources.append(str(res['id']))
            if not resource:
                resource = Resource(id=res['id'])
                dataset.resources.append(resource)
            resource.title = res.get('name', '') or ''
            resource.description = parse_html(res.get('description'))
            resource.url = res['url']
            resource.filetype = 'remote'
            resource.format = res.get('format')
            resource.mime = res.get('mimetype')
            resource.hash = res.get('hash')
            resource.created = res['created']
            resource.modified = res['last_modified']
            resource.published = resource.published or resource.created

        # Clean up old resources removed from source
        for resource_id in current_resources:
            if resource_id not in fetched_resources:
                try:
                    resource = get_by(dataset.resources, 'id',
                                      UUID(resource_id))
                except Exception:
                    log.error('Unable to parse resource ID %s', resource_id)
                    continue
                else:
                    if resource and not self.dryrun:
                        dataset.resources.remove(resource)

        return dataset
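The current_resources / fetched_resources bookkeeping above is a plain set difference: anything known locally but absent from the source gets dropped. Distilled, keeping the example's get_by/UUID round-trip but omitting its error handling (a sketch):

stale_ids = set(current_resources) - set(fetched_resources)
for resource_id in stale_ids:
    resource = get_by(dataset.resources, 'id', UUID(resource_id))
    if resource and not self.dryrun:
        dataset.resources.remove(resource)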
Example 17
    def process(self, item):
        '''Return the DadosGov datasets with the corresponding original and xml file'''
        global REPORT_FILE_PATH, DADOSGOVPATH, DOWNLOADFILEPATH, DADOSGOVURL
        reload(sys)
        sys.setdefaultencoding('utf8')

        # Get or create a harvested dataset with this identifier.
        dataset = self.get_dataset(item.remote_id)
        # get the organization object, no check necessary, it should always exist
        orgObj = Organization.objects(id=item.kwargs['orgId']).first()

        print '------------------------------------'
        print 'Processing %s (%s)' % (dataset.title, item.remote_id)
        # print item.kwargs
        # print '--'

        # set additional vars
        dataset.tags = ['migrado']
        dataset.extras = {}
        dataset.organization = orgObj.id
        dataset.license = License.guess('cc-by')
        dataset.resources = []

        # *********************************************
        # go through the DB dataset information
        dataset.created_at = item.kwargs['createdOn']
        dataset.extras['links'] = item.kwargs['serviceUrl']
        # ********************************************************

        # ********************************************************
        req = requests.get(
            "http://%s/v1/%s/TableMetadata" %
            (DADOSGOVURL, item.kwargs['orgAcronym']),
            params={'$filter': "partitionkey eq '%s'" % item.remote_id},
            headers={'charset': 'utf8'})

        xmlRootData = req.content

        propertiesDoc = minidom.parseString(xmlRootData)
        propertiesStuff = propertiesDoc.getElementsByTagName('content')
        propEl = propertiesDoc.getElementsByTagNameNS('*', 'properties')
        if propEl:
            propertiesElements = propEl[0].childNodes

            # go through the API dataset information
            for propEl in propertiesElements:
                if propEl.nodeType == Node.ELEMENT_NODE:
                    fc = propEl.firstChild

                    if fc:
                        if propEl.nodeName == 'd:category':
                            dataset.tags.append(fc.nodeValue)

                        elif propEl.nodeName == 'd:keywords':
                            dataset.tags.extend([
                                currentTag.strip()
                                for currentTag in fc.nodeValue.split(',')
                            ])

                        # elif propEl.nodeName == 'd:PartitionKey':
                        #     dataset.slug = fc.nodeValue

                        elif propEl.nodeName == 'd:nameexternal':
                            dataset.title = fc.nodeValue

                        elif propEl.nodeName == 'd:description':
                            dataset.description = fc.nodeValue

                        elif propEl.nodeName == 'd:contact':
                            dataset.extras['contact'] = fc.nodeValue

                        elif propEl.nodeName == 'd:links' and fc.nodeValue:
                            dataset.extras['links'] = '%s, %s' % (
                                dataset.extras['links'], fc.nodeValue)
            # ********************************************************

            env = current_app.config.get('MIGRATION_URL')
            if env:
                fixedUrl = env
            else:
                fixedUrl = url_for('site.home', _external=True)

            fixedUrl = '%s/s/%s' % (fixedUrl[:fixedUrl.rfind('/', 0, -1)],
                                    DADOSGOVPATH)
            # empty previous dataset resources
            dataset.resources = []

            # separate filename from extension
            filename = os.path.splitext(item.kwargs['filePath'])

            # ********************************************************
            # get xml by api and set the dataset resource field:

            # filenameXml = '%s.xml' % (filename[0])
            filenameXml = '%s.xml' % (item.remote_id)
            u = urllib2.urlopen(
                "http://%s/v1/%s/%s" %
                (DADOSGOVURL, item.kwargs['orgAcronym'], item.remote_id))
            # create/open the local file to be written
            with open('%s/%s' % (DOWNLOADFILEPATH, filenameXml), 'wb') as f:
                # write file data
                f.write(u.read())

                # get file size info
                meta = u.info()
                fileSize = int(meta.getheaders("Content-Length")[0])
                fullPath = '%s/%s' % (fixedUrl, filenameXml)
                print fullPath

                # set the resource data for the dataset
                dataset.resources.append(
                    Resource(title=dataset.title,
                             description='Dados em formato xml',
                             url=fullPath,
                             mime='text/xml',
                             format='xml',
                             filesize=fileSize,
                             created_at=item.kwargs['createdOn']))
            # ********************************************************

            # ********************************************************
            # get json by api and set the dataset resource field:

            filenameJson = '%s.json' % (item.remote_id)
            u = urllib2.urlopen(
                "http://%s/v1/%s/%s?format=json" %
                (DADOSGOVURL, item.kwargs['orgAcronym'], item.remote_id))
            # create/open the local file to be written
            with open('%s/%s' % (DOWNLOADFILEPATH, filenameJson), 'wb') as f:
                # write file data
                f.write(u.read())

                # get file size info
                meta = u.info()
                fileSize = int(meta.getheaders("Content-Length")[0])
                fullPath = '%s/%s' % (fixedUrl, filenameJson)
                print fullPath

                # set the resource data for the dataset
                dataset.resources.append(
                    Resource(title=dataset.title,
                             description='Dados em formato json',
                             url=fullPath,
                             mime='application/json',
                             format='json',
                             filesize=fileSize,
                             created_at=item.kwargs['createdOn']))
            # ********************************************************

            # ********************************************************
            # get original files using static path and ftp and set the dataset resource field

            if item.kwargs['filePath']:
                try:
                    # https://dadosgovstorage.blob.core.windows.net/datasetsfiles/Acesso%20a%20Consultas%20M%C3%A9dicas%20pela%20Popula%C3%A7%C3%A3o%20Inscrita_636046701023924396.xlsx
                    print '-- ** filePath ** --> %s' % item.kwargs['filePath']
                    try:
                        urlSafe = urllib.quote(item.kwargs['filePath'])
                        print "https://dadosgovstorage.blob.core.windows.net/datasetsfiles/%s" % (
                            urlSafe)
                        u = urllib2.urlopen(
                            "https://dadosgovstorage.blob.core.windows.net/datasetsfiles/%s"
                            % (urlSafe))

                        # create/open the local file to be written
                        with open(
                                '%s/%s%s' %
                            (DOWNLOADFILEPATH, item.remote_id, filename[1]),
                                'wb') as f:
                            # write file data
                            f.write(u.read())

                            # get file size info
                            meta = u.info()
                            fileSize = int(
                                meta.getheaders("Content-Length")[0])
                            fullPath = '%s/%s%s' % (fixedUrl, item.remote_id,
                                                    filename[1])
                            print fullPath

                            # set the resource data for the dataset
                            dataset.resources.append(
                                Resource(title=dataset.title,
                                         description='Ficheiro original (%s)' %
                                         (item.kwargs['filePath']),
                                         url=fullPath,
                                         mime='application/vnd.ms-excel',
                                         format=filename[1][1:],
                                         filesize=fileSize,
                                         created_at=item.kwargs['createdOn']))
                    except KeyError:
                        print '************ Error ************'
                        print traceback.format_exc()
                        print '*******************************'

                # file not found exception
                except IOError as ex:
                    print 'Original file not found:'
                    print ex

            # ********************************************************

            print '--'
            print 'Returning %s' % dataset.title
            print '------------------------------------'
            with open(REPORT_FILE_PATH, 'a') as csvResFile:
                writer = csv.writer(csvResFile,
                                    delimiter='\t',
                                    quotechar='"',
                                    quoting=csv.QUOTE_MINIMAL)
                writer.writerow([
                    item.remote_id, dataset.title, orgObj.name,
                    json.dumps(dataset.tags, ensure_ascii=False),
                    item.kwargs['filePath'], filenameXml, '', '[]'
                ])

            # update the number of datasets associated with this organization
            orgObj.metrics['datasets'] += 1
            orgObj.save()

            return dataset

        print 'No data returned from the API for the dataset %s' % (
            item.remote_id)
        with open(REPORT_FILE_PATH, 'a') as csvResFile:
            writer = csv.writer(csvResFile,
                                delimiter='\t',
                                quotechar='"',
                                quoting=csv.QUOTE_MINIMAL)
            writer.writerow([
                item.remote_id, '', '', '', item.kwargs['filePath'], '', '',
                '[]'
            ])

        return None
Example 18
    def process(self, item):
        response = self.get_action('package_show', id=item.remote_id)
        data = self.validate(response['result'], self.schema)

        if isinstance(data, list):
            data = data[0]

        # Fix the remote_id: use the real ID instead of the unstable name
        item.remote_id = data['id']

        # Skip if no resource
        if not len(data.get('resources', [])):
            msg = 'Dataset {0} has no record'.format(item.remote_id)
            raise HarvestSkipException(msg)

        dataset = self.get_dataset(item.remote_id)

        # Core attributes
        if not dataset.slug:
            dataset.slug = data['name']
        dataset.title = data['title']
        dataset.description = parse_html(data['notes'])

        # Detect license
        default_license = dataset.license or License.default()
        dataset.license = License.guess(data['license_id'],
                                        data['license_title'],
                                        default=default_license)

        dataset.tags = [t['name'] for t in data['tags'] if t['name']]

        dataset.created_at = data['metadata_created']
        dataset.last_modified = data['metadata_modified']

        dataset.extras['ckan:name'] = data['name']

        temporal_start, temporal_end = None, None
        spatial_geom, spatial_zone = None, None

        for extra in data['extras']:
            key = extra['key']
            value = extra['value']
            if value is None or (isinstance(value, str) and not value.strip()):
                # Skip empty extras
                continue
            elif key == 'spatial':
                # GeoJSON representation (Polygon or Point)
                spatial_geom = json.loads(value)
            elif key == 'spatial-text':
                # Textual representation of the extent / location
                qs = GeoZone.objects(db.Q(name=value) | db.Q(slug=value))
                qs = qs.valid_at(datetime.now())
                if qs.count() == 1:
                    spatial_zone = qs.first()
                else:
                    dataset.extras['ckan:spatial-text'] = value
                    log.debug('spatial-text value not handled: %s', value)
            elif key == 'spatial-uri':
                # Linked Data URI representing the place name
                dataset.extras['ckan:spatial-uri'] = value
                log.debug('spatial-uri value not handled: %s', value)
            elif key == 'frequency':
                # Update frequency
                freq = frequency_from_rdf(value)
                if freq:
                    dataset.frequency = freq
                elif value in UPDATE_FREQUENCIES:
                    dataset.frequency = value
                else:
                    dataset.extras['ckan:frequency'] = value
                    log.debug('frequency value not handled: %s', value)
            # Temporal coverage start
            elif key == 'temporal_start':
                temporal_start = daterange_start(value)
            # Temporal coverage end
            elif key == 'temporal_end':
                temporal_end = daterange_end(value)
            else:
                dataset.extras[extra['key']] = value

        if spatial_geom or spatial_zone:
            dataset.spatial = SpatialCoverage()

        if spatial_zone:
            dataset.spatial.zones = [spatial_zone]

        if spatial_geom:
            if spatial_geom['type'] == 'Polygon':
                coordinates = [spatial_geom['coordinates']]
            elif spatial_geom['type'] == 'MultiPolygon':
                coordinates = spatial_geom['coordinates']
            else:
                raise HarvestException('Unsupported spatial geometry')
            dataset.spatial.geom = {
                'type': 'MultiPolygon',
                'coordinates': coordinates
            }

        if temporal_start and temporal_end:
            dataset.temporal_coverage = db.DateRange(
                start=temporal_start,
                end=temporal_end,
            )

        # Remote URL
        dataset.extras['remote_url'] = self.dataset_url(data['name'])
        if data.get('url'):
            try:
                url = uris.validate(data['url'])
            except uris.ValidationError:
                dataset.extras['ckan:source'] = data['url']
            else:
                # use declared `url` as `remote_url` if any
                dataset.extras['remote_url'] = url

        # Resources
        for res in data['resources']:
            if res['resource_type'] not in ALLOWED_RESOURCE_TYPES:
                continue
            try:
                resource = get_by(dataset.resources, 'id', UUID(res['id']))
            except Exception:
                log.error('Unable to parse resource ID %s', res['id'])
                continue
            if not resource:
                resource = Resource(id=res['id'])
                dataset.resources.append(resource)
            resource.title = res.get('name', '') or ''
            resource.description = parse_html(res.get('description'))
            resource.url = res['url']
            resource.filetype = 'remote'
            resource.format = res.get('format')
            resource.mime = res.get('mimetype')
            resource.hash = res.get('hash')
            resource.created = res['created']
            resource.modified = res['last_modified']
            resource.published = resource.published or resource.created

        return dataset