Example #1
0
def load(filename, drop=False):
    '''
    Load a GeoZones bundle into the database.

    The bundle is an xz-compressed tar archive containing ``levels.json``
    and ``zones.json``. It is extracted into a fresh temporary directory
    which is always removed afterwards, even when loading fails part-way.

    :param filename: local path of the bundle, or a URL (any value starting
        with ``http`` is downloaded into the temporary directory first).
    :param drop: when true, drop the existing ``GeoLevel`` and ``GeoZone``
        collections before inserting the new content.
    '''
    tmp = tempfile.mkdtemp()
    try:
        if filename.startswith('http'):
            log.info('Downloading GeoZones bundle: %s', filename)
            filename, _ = urlretrieve(filename, join(tmp, 'geozones.tar.xz'))

        log.info('Extracting GeoZones bundle')
        # NOTE(review): extractall() trusts the member paths stored in the
        # archive; only load bundles coming from a trusted source.
        with contextlib.closing(lzma.LZMAFile(filename)) as xz:
            with tarfile.open(fileobj=xz) as f:
                f.extractall(tmp)

        log.info('Loading GeoZones levels')

        if drop:
            log.info('Dropping existing levels')
            GeoLevel.drop_collection()

        log.info('Loading levels.json')
        with open(join(tmp, 'levels.json')) as fp:
            levels = json.load(fp)

        total = 0
        for level in levels:
            GeoLevel.objects.create(id=level['id'], name=level['label'],
                                    parents=level['parents'])
            total += 1
        log.info('Loaded {0} levels'.format(total))

        if drop:
            log.info('Dropping existing spatial zones')
            GeoZone.drop_collection()

        log.info('Loading zones.json')
        with open(join(tmp, 'zones.json')) as fp:
            geozones = json.load(fp)

        total = 0
        for zone in geozones['features']:
            props = zone['properties']
            GeoZone.objects.create(
                id=zone['id'],
                level=props['level'],
                code=props['code'],
                name=props['name'],
                keys=props['keys'],
                parents=props['parents'],
                # Population and area are optional in the bundle.
                population=props.get('population'),
                area=props.get('area'),
                geom=zone['geometry']
            )
            total += 1

        log.info('Loaded {0} zones'.format(total))
    finally:
        # Never leave the extracted bundle behind, whatever happened above.
        log.info('Cleaning temporary working directory')
        shutil.rmtree(tmp)
Example #2
0
def load(filename, drop=False):
    '''
    Load a GeoZones bundle into the database.

    The bundle is an xz-compressed tar archive extracted into ``tmp.root``.
    ``levels.msgpack`` and ``zones.msgpack`` are streamed from it, inserted
    as ``GeoLevel`` / ``GeoZone`` documents and deleted once consumed.

    :param filename: local path of the bundle, or a URL (any value starting
        with ``http`` is downloaded first).
    :param drop: when true, drop the existing ``GeoLevel`` and ``GeoZone``
        collections before inserting the new content.
    '''
    if filename.startswith('http'):
        log.info('Downloading GeoZones bundle: %s', filename)
        filename, _ = urlretrieve(filename, tmp.path('geozones.tar.xz'))

    log.info('Extracting GeoZones bundle')
    with contextlib.closing(lzma.LZMAFile(filename)) as xz:
        with tarfile.open(fileobj=xz) as f:
            f.extractall(tmp.root)

    log.info('Loading GeoZones levels')

    if drop:
        log.info('Dropping existing levels')
        GeoLevel.drop_collection()

    log.info('Loading levels.msgpack')
    levels_filepath = tmp.path('levels.msgpack')
    # Start at 0 so an empty stream is reported instead of raising on an
    # unbound loop variable.
    total = 0
    # msgpack is a binary format: the file must be read as bytes.
    with open(levels_filepath, 'rb') as fp:
        unpacker = msgpack.Unpacker(fp, encoding=str('utf-8'))
        for total, level in enumerate(unpacker, start=1):
            GeoLevel.objects.create(
                id=level['id'],
                name=level['label'],
                parents=level['parents']
            )
    os.remove(levels_filepath)
    log.info('Loaded {total} levels'.format(total=total))

    if drop:
        log.info('Dropping existing spatial zones')
        GeoZone.drop_collection()

    log.info('Loading zones.msgpack')
    zones_filepath = tmp.path('zones.msgpack')
    total = 0
    with open(zones_filepath, 'rb') as fp:
        unpacker = msgpack.Unpacker(fp, encoding=str('utf-8'))
        for total, geozone in enumerate(unpacker, start=1):
            GeoZone.objects.create(
                id=geozone['_id'],
                level=geozone['level'],
                code=geozone['code'],
                name=geozone['name'],
                keys=geozone.get('keys'),
                parents=geozone.get('parents'),
                population=geozone.get('population'),
                dbpedia=geozone.get('dbpedia'),
                # Prefer the flag, fall back to the blazon.
                logo=geozone.get('flag') or geozone.get('blazon'),
                wikipedia=geozone.get('wikipedia'),
                area=geozone.get('area'),
                geom=geozone['geom']
            )
    os.remove(zones_filepath)
    log.info('Loaded {total} zones'.format(total=total))

    shutil.rmtree(tmp.path('translations'))  # Not in use for now.
def load(filename, drop=False):
    '''
    Load a GeoZones bundle into the database.

    The bundle is an xz-compressed tar archive extracted into ``tmp.root``.
    ``levels.msgpack`` and ``zones.msgpack`` are streamed from it, inserted
    as ``GeoLevel`` / ``GeoZone`` documents and deleted once consumed.

    :param filename: local path of the bundle, or a URL (any value starting
        with ``http`` is downloaded first).
    :param drop: when true, drop the existing ``GeoLevel`` and ``GeoZone``
        collections before inserting the new content.
    '''
    if filename.startswith('http'):
        log.info('Downloading GeoZones bundle: %s', filename)
        filename, _ = urlretrieve(filename, tmp.path('geozones.tar.xz'))

    log.info('Extracting GeoZones bundle')
    with contextlib.closing(lzma.LZMAFile(filename)) as xz:
        with tarfile.open(fileobj=xz) as f:
            f.extractall(tmp.root)

    log.info('Loading GeoZones levels')

    if drop:
        log.info('Dropping existing levels')
        GeoLevel.drop_collection()

    log.info('Loading levels.msgpack')
    levels_filepath = tmp.path('levels.msgpack')
    # Start at 0 so an empty stream is reported instead of raising on an
    # unbound loop variable.
    total = 0
    # msgpack is a binary format: the file must be read as bytes.
    with open(levels_filepath, 'rb') as fp:
        unpacker = msgpack.Unpacker(fp, encoding=str('utf-8'))
        for total, level in enumerate(unpacker, start=1):
            GeoLevel.objects.create(id=level['id'],
                                    name=level['label'],
                                    parents=level['parents'],
                                    admin_level=level.get('admin_level'))
    os.remove(levels_filepath)
    log.info('Loaded {total} levels'.format(total=total))

    if drop:
        log.info('Dropping existing spatial zones')
        GeoZone.drop_collection()

    log.info('Loading zones.msgpack')
    zones_filepath = tmp.path('zones.msgpack')
    total = 0
    with open(zones_filepath, 'rb') as fp:
        unpacker = msgpack.Unpacker(fp, encoding=str('utf-8'))
        for total, geozone in enumerate(unpacker, start=1):
            GeoZone.objects.create(id=geozone['_id'],
                                   level=geozone['level'],
                                   code=geozone['code'],
                                   name=geozone['name'],
                                   keys=geozone.get('keys'),
                                   parents=geozone.get('parents'),
                                   population=geozone.get('population'),
                                   dbpedia=geozone.get('dbpedia'),
                                   # Prefer the flag, fall back to the blazon.
                                   logo=geozone.get('flag')
                                   or geozone.get('blazon'),
                                   wikipedia=geozone.get('wikipedia'),
                                   area=geozone.get('area'),
                                   geom=geozone['geom'])
    os.remove(zones_filepath)
    log.info('Loaded {total} zones'.format(total=total))

    shutil.rmtree(tmp.path('translations'))  # Not in use for now.
Example #4
0
def attach_zone(insee_code, organization_id_or_slug, level="fr/town"):
    """Attach a zone restricted to level for a given organization.

    Looks up the ``GeoZone`` whose ``code`` is ``insee_code`` at the given
    ``level`` and stores its id in the organization's ``zone`` field.

    :param insee_code: code of the zone to attach.
    :param organization_id_or_slug: identifier or slug of the organization.
    :param level: zone level the lookup is restricted to.

    When the organization or the zone cannot be found, an error is logged
    and nothing is saved.
    """
    organization = Organization.objects.get_by_id_or_slug(organization_id_or_slug)
    if organization is None:
        # NOTE(review): assumes the lookup returns None when nothing
        # matches; harmless if it raises instead.
        log.error("No organization found for %s", organization_id_or_slug)
        return
    geozone = GeoZone.objects(code=insee_code, level=level).first()
    if geozone is None:
        # first() yields None on an empty result: fail with a clear message
        # instead of an AttributeError while formatting the log line.
        log.error("No zone found for code %s at level %s", insee_code, level)
        return
    log.info("Attaching {organization} with {geozone.name}".format(organization=organization, geozone=geozone))
    organization.zone = geozone.id
    organization.save()
    log.info("Done")
def attach_zone(insee_code, organization_id_or_slug, level='fr/town'):
    '''
    Attach a zone restricted to level for a given organization.

    Looks up the ``GeoZone`` whose ``code`` is ``insee_code`` at the given
    ``level`` and stores its id in the organization's ``zone`` field.

    :param insee_code: code of the zone to attach.
    :param organization_id_or_slug: identifier or slug of the organization.
    :param level: zone level the lookup is restricted to.

    When the organization or the zone cannot be found, an error is logged
    and nothing is saved.
    '''
    organization = Organization.objects.get_by_id_or_slug(
        organization_id_or_slug)
    if organization is None:
        # NOTE(review): assumes the lookup returns None when nothing
        # matches; harmless if it raises instead.
        log.error('No organization found for %s', organization_id_or_slug)
        return
    geozone = GeoZone.objects(code=insee_code, level=level).first()
    if geozone is None:
        # first() yields None on an empty result: fail with a clear message
        # instead of an AttributeError while formatting the log line.
        log.error('No zone found for code %s at level %s', insee_code, level)
        return
    log.info('Attaching {organization} with {geozone.name}'.format(
        organization=organization, geozone=geozone))
    organization.zone = geozone.id
    organization.save()
    log.info('Done')
Example #6
0
def check_for_territories(query):
    """
    Return a queryset of ``fr/town`` zones matching ``query``.

    An empty queryset is returned when the query is missing, shorter than
    4 characters, or when ``ACTIVATE_TERRITORIES`` is not enabled.
    Results are ordered by decreasing population, then decreasing area.
    """
    if not query or len(query) < 4:
        return GeoZone.objects.none()
    if not current_app.config.get('ACTIVATE_TERRITORIES'):
        return GeoZone.objects.none()

    towns = GeoZone.objects(level='fr/town')
    looks_like_code = len(query) == 5 and query.isdigit()
    if looks_like_code:
        # Five digits: match both INSEE and postal codes.
        criteria = db.Q(code=query) | db.Q(keys__postal__contains=query)
    else:
        # Otherwise match names starting with, or equal to, the query.
        criteria = db.Q(name__istartswith=query) | db.Q(name__iexact=query)

    # Biggest territories first.
    return towns(criteria).order_by('-population', '-area')
Example #7
0
def sitemap_urls():
    """
    Yield one sitemap entry per zone of every handled level.

    Nothing is yielded unless ``ACTIVATE_TERRITORIES`` is enabled. Each
    entry is a ``(endpoint, values, lastmod, changefreq, priority)`` tuple.
    """
    if not current_app.config.get('ACTIVATE_TERRITORIES'):
        return

    wanted_fields = ('id', 'code', 'validity', 'slug')
    for level in current_app.config.get('HANDLED_LEVELS'):
        for zone in GeoZone.objects(level=level).only(*wanted_fields):
            # Remove 'fr:' manually from the level.
            entry = dict_to_namedtuple('Territory', {
                'level_name': level[3:],
                'id': zone.id,
                'code': zone.code,
                'slug': zone.slug,
                'validity': zone.validity
            })
            values = {'territory': entry}
            yield ('territories.territory', values, None, 'weekly', 0.5)
Example #8
0
def render_home():
    """
    Render the territories home page.

    Aborts with a 404 when ``ACTIVATE_TERRITORIES`` is not enabled.
    Otherwise lists the currently valid zones of the last handled level,
    sorted by their accent-stripped name.
    """
    if not current_app.config.get('ACTIVATE_TERRITORIES'):
        return abort(404)

    def ascii_name(zone):
        # Decompose accents then drop non-ASCII bytes so sorting ignores them.
        decomposed = unicodedata.normalize('NFD', zone.name)
        return decomposed.encode('ascii', 'ignore')

    top_level = current_app.config['HANDLED_LEVELS'][-1]
    valid_zones = GeoZone.objects(level=top_level).valid_at(date.today())
    regions = sorted(valid_zones, key=ascii_name)

    geojson = {
        'type': 'FeatureCollection',
        'features': [region.toGeoJSON() for region in regions]
    }
    return theme.render('territories/home.html',
                        geojson=geojson, regions=regions)
Example #9
0
def check_for_territories(query):
    """
    Look up the territories matching ``query`` across the handled levels.

    Returns an empty list when the query is empty, when
    ``ACTIVATE_TERRITORIES`` is disabled or when no level can match;
    otherwise a geozone queryset ordered by population and area
    (biggest first).
    """
    if not query or not current_app.config.get('ACTIVATE_TERRITORIES'):
        return []

    needle = query.lower()
    only_digits = needle.isdigit()
    size = len(needle)
    corsican = ('2a', '2b')

    combined = db.Q()
    for level in current_app.config.get('HANDLED_LEVELS'):
        if level == 'country':
            continue  # Level not fully handled yet.

        is_county = level == 'fr:departement'
        if is_county and size == 2 and (only_digits or needle in corsican):
            # Counties + Corsica.
            criteria = db.Q(code=needle)
        elif is_county and size == 3 and only_digits:
            # French DROM-COM.
            criteria = db.Q(code=needle)
        elif (level == 'fr:commune' and size == 5 and
                (only_digits or needle.startswith(corsican))):
            # INSEE code then postal codes with Corsica exceptions.
            criteria = (db.Q(code=needle) |
                        db.Q(keys__postal__contains=needle))
        elif size >= 4:
            # Check names starting with query or exact match.
            criteria = (db.Q(name__istartswith=needle) |
                        db.Q(name__iexact=needle))
        else:
            continue

        # Restrict the criteria to this level and add it to the meta query.
        combined |= db.Q(level=level) & criteria

    if combined.empty:
        return []

    # Sort matching results by population and area.
    return GeoZone.objects(combined).order_by('-population', '-area')
Example #10
0
    def serialize(cls, dataset):
        """
        Build the plain-dict search document describing ``dataset``.

        The document carries the dataset core fields, a summary of its
        organization (or the id of its owner), its extras with datetimes
        converted through ``to_iso_datetime`` and, when available, the
        temporal coverage bounds and the covered geozones.
        """
        owner = None
        organization = None

        if dataset.organization:
            org = Organization.objects(id=dataset.organization.id).first()
            organization = dict(
                id=str(org.id),
                name=org.name,
                public_service=1 if org.public_service else 0,
                followers=org.metrics.get('followers', 0),
            )
        elif dataset.owner:
            owner = User.objects(id=dataset.owner.id).first()

        metrics = dataset.metrics
        resources = dataset.resources
        document = {
            'id': str(dataset.id),
            'title': dataset.title,
            'description': dataset.description,
            'acronym': dataset.acronym or None,
            'url': dataset.display_url,
            'tags': dataset.tags,
            'license': getattr(dataset.license, 'id', None),
            'badges': [badge.kind for badge in dataset.badges],
            'frequency': dataset.frequency,
            'created_at': to_iso_datetime(dataset.created_at),
            'views': metrics.get('views', 0),
            'followers': metrics.get('followers', 0),
            'reuses': metrics.get('reuses', 0),
            'featured': 1 if dataset.featured else 0,
            'resources_count': len(resources),
            'organization': organization,
            'owner': str(owner.id) if owner else None,
            'format': [res.format.lower() for res in resources if res.format],
            'schema': [
                res.schema.get('name') for res in resources if res.schema
            ],
            # Datetime extras are serialized, everything else is kept as is.
            'extras': {
                key: (to_iso_datetime(value)
                      if isinstance(value, datetime.datetime) else value)
                for key, value in dataset.extras.items()
            },
        }

        coverage = dataset.temporal_coverage
        if coverage is not None and coverage.start and coverage.end:
            start = to_iso_datetime(coverage.start)
            end = to_iso_datetime(coverage.end)
            document['temporal_coverage_start'] = start
            document['temporal_coverage_end'] = end

        spatial = dataset.spatial
        if spatial is None:
            return document

        # Index precise zone labels and parents zone identifiers
        # to allow fast filtering.
        wanted_ids = [z.id for z in spatial.zones]
        parents = set()
        geozones = []
        # Tracked while iterating but not stored in the document.
        coverage_level = ADMIN_LEVEL_MAX
        for zone in GeoZone.objects(id__in=wanted_ids).exclude('geom'):
            geozones.append(
                dict(id=zone.id, name=zone.name, keys=zone.keys_values))
            parents |= set(zone.parents)
            coverage_level = min(coverage_level, admin_levels[zone.level])

        geozones.extend([{'id': parent} for parent in parents])
        document['geozones'] = geozones
        document['granularity'] = spatial.granularity
        return document
Example #11
0
    def process(self, item):
        """
        Fetch a remote package and map it onto a local dataset.

        The package is fetched with the ``package_show`` action using
        ``item.remote_id`` and validated against ``self.schema``; its
        fields, extras and resources are then copied onto the dataset
        returned by ``self.get_dataset``.

        :param item: harvest item; its ``remote_id`` is replaced by the
            ``id`` found in the fetched package.
        :returns: the populated dataset (not saved by this method).
        :raises HarvestSkipException: when the package has no resources.
        :raises HarvestException: when the ``spatial`` extra holds a
            geometry that is neither a Polygon nor a MultiPolygon.
        """
        response = self.get_action('package_show', id=item.remote_id)
        data = self.validate(response['result'], self.schema)

        # The validated payload may come back as a list: keep the first one.
        if type(data) == list:
            data = data[0]

        # Fix the remote_id: use real ID instead of not stable name
        item.remote_id = data['id']

        # Skip if no resource
        if not len(data.get('resources', [])):
            msg = 'Dataset {0} has no record'.format(item.remote_id)
            raise HarvestSkipException(msg)

        dataset = self.get_dataset(item.remote_id)

        # Core attributes
        if not dataset.slug:
            dataset.slug = data['name']
        dataset.title = data['title']
        dataset.description = parse_html(data['notes'])

        # Detect license
        # The current license (or the default one) is the fallback value.
        default_license = dataset.license or License.default()
        dataset.license = License.guess(data['license_id'],
                                        data['license_title'],
                                        default=default_license)

        dataset.tags = [t['name'] for t in data['tags'] if t['name']]

        dataset.created_at = data['metadata_created']
        dataset.last_modified = data['metadata_modified']

        dataset.extras['ckan:name'] = data['name']

        # Collected while scanning the extras, applied once the scan is done.
        temporal_start, temporal_end = None, None
        spatial_geom, spatial_zone = None, None

        for extra in data['extras']:
            key = extra['key']
            value = extra['value']
            if value is None or (isinstance(value, str) and not value.strip()):
                # Skip empty extras
                continue
            elif key == 'spatial':
                # GeoJSON representation (Polygon or Point)
                spatial_geom = json.loads(value)
            elif key == 'spatial-text':
                # Textual representation of the extent / location
                qs = GeoZone.objects(db.Q(name=value) | db.Q(slug=value))
                qs = qs.valid_at(datetime.now())
                # Only an unambiguous match is used as the covered zone.
                if qs.count() == 1:
                    spatial_zone = qs.first()
                else:
                    dataset.extras['ckan:spatial-text'] = value
                    log.debug('spatial-text value not handled: %s', value)
            elif key == 'spatial-uri':
                # Linked Data URI representing the place name
                dataset.extras['ckan:spatial-uri'] = value
                log.debug('spatial-uri value not handled: %s', value)
            elif key == 'frequency':
                # Update frequency
                freq = frequency_from_rdf(value)
                if freq:
                    dataset.frequency = freq
                elif value in UPDATE_FREQUENCIES:
                    dataset.frequency = value
                else:
                    # Unknown frequency: keep the raw value as an extra.
                    dataset.extras['ckan:frequency'] = value
                    log.debug('frequency value not handled: %s', value)
            # Temporal coverage start
            elif key == 'temporal_start':
                temporal_start = daterange_start(value)
            # Temporal coverage end
            elif key == 'temporal_end':
                temporal_end = daterange_end(value)
            else:
                # Any other extra is copied under its original key.
                dataset.extras[extra['key']] = value

        # A fresh coverage replaces the previous one when anything was found.
        if spatial_geom or spatial_zone:
            dataset.spatial = SpatialCoverage()

        if spatial_zone:
            dataset.spatial.zones = [spatial_zone]

        if spatial_geom:
            # The geometry is always stored as a MultiPolygon: a single
            # Polygon is wrapped into a one-element list.
            if spatial_geom['type'] == 'Polygon':
                coordinates = [spatial_geom['coordinates']]
            elif spatial_geom['type'] == 'MultiPolygon':
                coordinates = spatial_geom['coordinates']
            else:
                raise HarvestException('Unsupported spatial geometry')
            dataset.spatial.geom = {
                'type': 'MultiPolygon',
                'coordinates': coordinates
            }

        # The temporal coverage is only set when both bounds are known.
        if temporal_start and temporal_end:
            dataset.temporal_coverage = db.DateRange(
                start=temporal_start,
                end=temporal_end,
            )

        # Remote URL
        dataset.extras['remote_url'] = self.dataset_url(data['name'])
        if data.get('url'):
            try:
                url = uris.validate(data['url'])
            except uris.ValidationError:
                # Not a valid URL: keep the raw value as the declared source.
                dataset.extras['ckan:source'] = data['url']
            else:
                # use declared `url` as `remote_url` if any
                dataset.extras['remote_url'] = url

        # Resources
        for res in data['resources']:
            if res['resource_type'] not in ALLOWED_RESOURCE_TYPES:
                continue
            try:
                resource = get_by(dataset.resources, 'id', UUID(res['id']))
            except Exception:
                log.error('Unable to parse resource ID %s', res['id'])
                continue
            # No resource with that ID yet: create and attach a new one.
            if not resource:
                resource = Resource(id=res['id'])
                dataset.resources.append(resource)
            resource.title = res.get('name', '') or ''
            resource.description = parse_html(res.get('description'))
            resource.url = res['url']
            resource.filetype = 'remote'
            resource.format = res.get('format')
            resource.mime = res.get('mimetype')
            resource.hash = res.get('hash')
            resource.created = res['created']
            resource.modified = res['last_modified']
            # Keep an existing publication date, else use the creation date.
            resource.published = resource.published or resource.created

        return dataset
Example #12
0
def load(filename, drop=False):
    '''
    Load a GeoZones bundle into the database.

    The bundle is an xz-compressed tar archive extracted into ``tmp.root``.
    ``levels.msgpack`` and ``zones.msgpack`` are streamed from it, inserted
    as ``GeoLevel`` / ``GeoZone`` documents and deleted once consumed.
    Zones failing validation are logged and skipped; the logged total only
    counts the zones actually created.

    :param filename: local path of the bundle, or a URL (any value starting
        with ``http`` is downloaded first).
    :param drop: when true, drop the existing ``GeoLevel`` and ``GeoZone``
        collections before inserting the new content.
    '''
    if filename.startswith('http'):
        log.info('Downloading GeoZones bundle: %s', filename)
        filename, _ = urlretrieve(filename, tmp.path('geozones.tar.xz'))

    log.info('Extracting GeoZones bundle')
    with contextlib.closing(lzma.LZMAFile(filename)) as xz:
        with tarfile.open(fileobj=xz) as f:
            f.extractall(tmp.root)

    log.info('Loading GeoZones levels')

    if drop:
        log.info('Dropping existing levels')
        GeoLevel.drop_collection()

    log.info('Loading levels.msgpack')
    levels_filepath = tmp.path('levels.msgpack')
    # Start at 0 so an empty stream is reported instead of raising on an
    # unbound loop variable.
    total = 0
    # msgpack is a binary format: the file must be read as bytes.
    with open(levels_filepath, 'rb') as fp:
        unpacker = msgpack.Unpacker(fp, encoding=str('utf-8'))
        for total, level in enumerate(unpacker, start=1):
            GeoLevel.objects.create(
                id=level['id'],
                name=level['label'],
                parents=level['parents'],
                admin_level=level.get('admin_level')
            )
    os.remove(levels_filepath)
    log.info('Loaded {total} levels'.format(total=total))

    if drop:
        log.info('Dropping existing spatial zones')
        GeoZone.drop_collection()

    log.info('Loading zones.msgpack')
    zones_filepath = tmp.path('zones.msgpack')
    total = 0
    with open(zones_filepath, 'rb') as fp:
        unpacker = msgpack.Unpacker(fp, encoding=str('utf-8'))
        # Skip headers. The builtin next() works on Python 2 and 3 and the
        # default keeps an empty stream from raising StopIteration.
        next(unpacker, None)
        for geozone in unpacker:
            geom = geozone.get('geom')
            # A missing geometry or an empty collection is stored as None.
            if not geom or (geom['type'] == 'GeometryCollection' and
                            not geom['geometries']):
                geom = None
            params = {
                'id': geozone['_id'],
                'slug': slugify.slugify(geozone['name'], separator='-'),
                'level': geozone['level'],
                'code': geozone['code'],
                'name': geozone['name'],
                'keys': geozone.get('keys'),
                'parents': geozone.get('parents'),
                'ancestors': geozone.get('ancestors'),
                'successors': geozone.get('successors'),
                'validity': geozone.get('validity'),
                'population': geozone.get('population'),
                'dbpedia': geozone.get('dbpedia'),
                'flag': geozone.get('flag'),
                'blazon': geozone.get('blazon'),
                'wikipedia': geozone.get('wikipedia'),
                'area': geozone.get('area'),
                'geom': geom
            }
            try:
                GeoZone.objects.create(**params)
            except errors.ValidationError as e:
                log.warning('Validation error (%s) for %s with %s',
                            e, geozone, params)
                continue
            # Only zones that were really created are counted.
            total += 1
    os.remove(zones_filepath)
    log.info('Loaded {total} zones'.format(total=total))

    shutil.rmtree(tmp.path('translations'))  # Not in use for now.
Example #13
0
    def serialize(cls, dataset):
        """
        Build the plain-dict search document describing ``dataset``.

        Besides the dataset core fields, the document holds the completion
        entry (``dataset_suggest``) whose weight is derived from the
        temporal weight, the spatial weight and the featured flag.
        """
        organization = owner = image_url = None
        temporal_weight = DEFAULT_TEMPORAL_WEIGHT
        spatial_weight = DEFAULT_SPATIAL_WEIGHT
        timestamp_format = '%Y-%m-%dT%H:%M:%S'

        if dataset.organization:
            organization = Organization.objects(
                id=dataset.organization.id).first()
            image_url = organization.logo(40, external=True)
        elif dataset.owner:
            owner = User.objects(id=dataset.owner.id).first()
            image_url = owner.avatar(40, external=True)

        certified = organization and organization.certified

        # Completion entry: title tokens plus the dataset id as inputs.
        suggestion = {
            'input': cls.completer_tokenize(dataset.title) + [str(dataset.id)],
            'output': dataset.title,
            'payload': {
                'id': str(dataset.id),
                'slug': dataset.slug,
                'acronym': dataset.acronym,
                'image_url': image_url,
            },
        }

        document = {
            'title': dataset.title,
            'description': dataset.description,
            'license': getattr(dataset.license, 'id', None),
            'tags': dataset.tags,
            'badges': [badge.kind for badge in dataset.badges],
            'tag_suggest': dataset.tags,
            'resources': [
                dict(title=res.title, description=res.description,
                     format=res.format, type=res.type)
                for res in dataset.resources
            ],
            'format_suggest': [
                res.format.lower() for res in dataset.resources if res.format
            ],
            'frequency': dataset.frequency,
            'organization': str(organization.id) if organization else None,
            'owner': str(owner.id) if owner else None,
            'dataset_suggest': suggestion,
            'created': dataset.created_at.strftime(timestamp_format),
            'last_modified': dataset.last_modified.strftime(timestamp_format),
            'metrics': dataset.metrics,
            'featured': dataset.featured,
            'from_certified': certified,
        }

        coverage = dataset.temporal_coverage
        if coverage is not None and coverage.start and coverage.end:
            start = coverage.start.toordinal()
            end = coverage.end.toordinal()
            # Covered span in years, capped to the maximum weight.
            temporal_weight = min((end - start) / 365, MAX_TEMPORAL_WEIGHT)
            document['temporal_coverage'] = {'start': start, 'end': end}
            document['temporal_weight'] = temporal_weight

        spatial = dataset.spatial
        if spatial is not None:
            # Index precise zone labels and parents zone identifiers
            # to allow fast filtering.
            wanted_ids = [z.id for z in spatial.zones]
            parents = set()
            geozones = []
            coverage_level = ADMIN_LEVEL_MAX
            for zone in GeoZone.objects(id__in=wanted_ids).exclude('geom'):
                geozones.append(
                    dict(id=zone.id, name=zone.name, keys=zone.keys_values))
                parents |= set(zone.parents)
                coverage_level = min(coverage_level, admin_levels[zone.level])

            geozones.extend([{'id': parent} for parent in parents])

            spatial_weight = ADMIN_LEVEL_MAX / coverage_level
            document['geozones'] = geozones
            document['granularity'] = spatial.granularity
            document['spatial_weight'] = spatial_weight

        suggestion['weight'] = cls.get_suggest_weight(
            temporal_weight, spatial_weight, dataset.featured)

        if dataset.acronym:
            suggestion['input'].append(dataset.acronym)

        return document
Example #14
0
def zone_labelizer(value):
    """
    Resolve a zone identifier into its ``GeoZone`` for display.

    Empty or non-string values are returned untouched; a string that
    matches no zone is returned as is.
    """
    if not value or not isinstance(value, basestring):
        return value
    zone = GeoZone.objects(id=value).first()
    return zone or value
Example #15
0
    def serialize(cls, dataset):
        """
        Build the plain-dict search document describing ``dataset``.

        The document holds the dataset core fields, the completion entry
        (``dataset_suggest``) and, when available, the temporal coverage as
        ordinals and the covered geozones with their parents.
        """
        organization = dataset.organization
        owner = dataset.owner
        timestamp_format = '%Y-%m-%dT%H:%M:%S'

        org_id = None if organization is None else str(organization.id)

        # The organization logo wins over the owner avatar.
        image_url = None
        if organization:
            image_url = organization.logo(40)
        elif owner:
            image_url = owner.avatar(40)

        license = dataset.license
        document = {
            'title': dataset.title,
            'description': dataset.description,
            'license': None if license is None else license.id,
            'tags': dataset.tags,
            'badges': [badge.kind for badge in dataset.badges],
            'tag_suggest': dataset.tags,
            'resources': [
                dict(title=res.title, description=res.description,
                     format=res.format)
                for res in dataset.resources
            ],
            'format_suggest': [
                res.format.lower() for res in dataset.resources if res.format
            ],
            'frequency': dataset.frequency,
            'organization': org_id,
            'owner': str(owner.id) if owner else None,
            'dataset_suggest': {
                'input': cls.completer_tokenize(dataset.title) + [dataset.id],
                'output': dataset.title,
                'payload': {
                    'id': str(dataset.id),
                    'slug': dataset.slug,
                    'image_url': image_url,
                },
            },
            'created': dataset.created_at.strftime(timestamp_format),
            'last_modified': dataset.last_modified.strftime(timestamp_format),
            'metrics': dataset.metrics,
            'extras': dataset.extras,
            'featured': dataset.featured,
        }

        coverage = dataset.temporal_coverage
        if coverage is not None and coverage.start and coverage.end:
            document['temporal_coverage'] = {
                'start': coverage.start.toordinal(),
                'end': coverage.end.toordinal(),
            }

        spatial = dataset.spatial
        if spatial is None:
            return document

        # Index precise zone labels and parents zone identifiers
        # to allow fast filtering.
        wanted_ids = [z.id for z in spatial.zones]
        parents = set()
        geozones = []
        for zone in GeoZone.objects(id__in=wanted_ids):
            geozones.append(
                dict(id=zone.id, name=zone.name, keys=zone.keys_values))
            parents |= set(zone.parents)

        geozones.extend([{'id': parent} for parent in parents])

        document['geozones'] = geozones
        document['granularity'] = spatial.granularity
        return document
Example #16
0
def migrate_zones_ids():
    """Migrate zones from old to new ids in datasets.

    Should only be run once with the new version of geozones w/ geohisto.

    Old ``fr/<kind>/<code>`` identifiers are replaced by the zone found for
    the same code at the matching new level, and the old
    ``country-subset/fr/<kind>`` ones by the DROM / DROM-COM zones. Zones
    that cannot be resolved are left out of the updated coverage.
    """
    stats = dict.fromkeys(
        ('datasets', 'zones', 'towns', 'counties', 'regions', 'drom',
         'dromcom'), 0)
    drom_zone = GeoZone.objects(id='country-subset:fr:drom').first()
    dromcom_zone = GeoZone.objects(id='country-subset:fr:dromcom').first()

    for dataset in Dataset.objects.all():
        if not (dataset.spatial and dataset.spatial.zones):
            continue
        stats['datasets'] += 1
        migrated = []
        for zone in dataset.spatial.zones:
            old_id = zone.id
            if old_id.startswith('fr/'):
                stats['zones'] += 1
                _country, kind, code = old_id.split('/')
                code = code.upper()  # Corsica 2a/b case.
                if kind == 'town':
                    stats['towns'] += 1
                    towns = GeoZone.objects(code=code, level='fr:commune')
                    migrated.append(towns.valid_at(date.today()).first())
                elif kind == 'county':
                    stats['counties'] += 1
                    counties = GeoZone.objects(code=code,
                                               level='fr:departement')
                    migrated.append(counties.valid_at(date.today()).first())
                elif kind == 'region':
                    stats['regions'] += 1
                    # Only link to pre-2016 regions which kept the same id.
                    regions = GeoZone.objects(code=code, level='fr:region')
                    migrated.append(regions.first())
                else:
                    migrated.append(zone)
            elif old_id.startswith('country-subset/fr'):
                stats['zones'] += 1
                _subset, _country, kind = old_id.split('/')
                # Any other subset kind is not carried over.
                if kind == 'dom':
                    stats['drom'] += 1
                    migrated.append(drom_zone)
                elif kind == 'domtom':
                    stats['dromcom'] += 1
                    migrated.append(dromcom_zone)
            else:
                migrated.append(zone)
        # Unresolved lookups (None) are filtered out here.
        coverage = SpatialCoverage(zones=[z.id for z in migrated if z])
        dataset.update(spatial=coverage)

    print('{} datasets and {} zones affected.'.format(stats['datasets'],
                                                      stats['zones']))
    print('{} town zones, {} county zones and {} region zones updated.'.format(
        stats['towns'], stats['counties'], stats['regions']))
    print('{} DROM zones, {} DROM-COM zones updated.'.format(
        stats['drom'], stats['dromcom']))
    log.info('Done')
Example #17
0
def sitemap_urls():
    """Yield one sitemap entry per ``fr/town`` zone.

    Nothing is yielded unless the ``ACTIVATE_TERRITORIES`` setting is truthy.
    Each entry is a 5-tuple: the ``territories.territory`` endpoint, its URL
    values, ``None``, ``"weekly"`` and ``0.5`` (presumably last modification,
    change frequency and priority -- confirm against the sitemap consumer).
    """
    if not current_app.config.get('ACTIVATE_TERRITORIES'):
        return
    # Only the ``code`` field is loaded; the item yielded by the queryset is
    # passed as-is as the ``territory`` URL value.
    towns = GeoZone.objects(level='fr/town').only('code')
    for town in towns:
        url_values = {'territory': town}
        yield ('territories.territory', url_values, None, "weekly", 0.5)
Example #18
0
def sitemap_urls():
    """Yield one sitemap entry per ``fr/town`` zone.

    Each entry is a 5-tuple: the ``territories.territory`` endpoint, its URL
    values, ``None``, ``"weekly"`` and ``0.5`` (presumably last modification,
    change frequency and priority -- confirm against the sitemap consumer).
    """
    # Only the ``code`` field is loaded; the item yielded by the queryset is
    # passed as-is as the ``territory`` URL value.
    towns = GeoZone.objects(level='fr/town').only('code')
    for town in towns:
        entry = ('territories.territory', {'territory': town},
                 None, "weekly", 0.5)
        yield entry
Example #19
0
def zone_labelizer(value):
    """Resolve a zone identifier to its ``GeoZone`` when possible.

    Falsy values and non-string values are returned unchanged. A non-empty
    string is looked up by id; the string itself is returned when no zone
    matches.
    """
    # Falsy check first so that empty values never reach the type check.
    if not value or not isinstance(value, basestring):
        return value
    zone = GeoZone.objects(id=value).first()
    return zone or value
Example #20
0
def migrate_zones_ids():
    """Migrate zones from old to new ids in datasets.

    Should only be run once with the new version of geozones w/ geohisto.

    For every dataset having at least one zone, each legacy ``/``-separated
    zone id is mapped to its new counterpart:

    - ``fr/town``, ``fr/county``, ``fr/region`` and ``fr/epci`` ids are
      looked up by (upper-cased) code in the matching ``fr:*`` level;
    - ``country-subset/fr/dom`` and ``country-subset/fr/domtom`` become the
      DROM and DROM-COM zones;
    - ``country/*`` and ``country-group/*`` ids only get their separator
      changed from ``/`` to ``:``.

    Zones of an unrecognised kind are kept untouched. Lookups returning
    nothing are dropped from the dataset. A summary of the processed zones
    is logged at the end.
    """
    counter = Counter()
    # Validity date used to select the current towns and counties.
    today = date.today()
    drom_zone = GeoZone.objects(id='country-subset:fr:drom').first()
    dromcom_zone = GeoZone.objects(id='country-subset:fr:dromcom').first()
    # Iter over datasets with zones
    for dataset in Dataset.objects(spatial__zones__gt=[]):
        counter['datasets'] += 1
        new_zones = []
        for zone in dataset.spatial.zones:
            if zone.id.startswith('fr/'):
                counter['zones'] += 1
                country, kind, zone_id = zone.id.split('/')
                zone_id = zone_id.upper()  # Corsica 2a/b case.
                if kind == 'town':
                    counter['towns'] += 1
                    new_zones.append(
                        GeoZone.objects(code=zone_id,
                                        level='fr:commune').valid_at(
                                            today).first())
                elif kind == 'county':
                    counter['counties'] += 1
                    new_zones.append(
                        GeoZone.objects(code=zone_id,
                                        level='fr:departement').valid_at(
                                            today).first())
                elif kind == 'region':
                    counter['regions'] += 1
                    # Only link to pre-2016 regions which kept the same id.
                    new_zones.append(
                        GeoZone.objects(code=zone_id,
                                        level='fr:region').first())
                elif kind == 'epci':
                    counter['epcis'] += 1
                    # EPCIs are resolved as they were when the dataset
                    # was created, not as they are today.
                    new_zones.append(
                        GeoZone.objects(code=zone_id,
                                        level='fr:epci').valid_at(
                                            dataset.created_at.date()).first())
                else:
                    new_zones.append(zone)
            elif zone.id.startswith('country-subset/fr'):
                counter['zones'] += 1
                subset, country, kind = zone.id.split('/')
                if kind == 'dom':
                    counter['drom'] += 1
                    new_zones.append(drom_zone)
                elif kind == 'domtom':
                    counter['dromcom'] += 1
                    new_zones.append(dromcom_zone)
                else:
                    # Keep unknown subsets as-is (same fallback as the
                    # ``fr/`` branch) instead of silently dropping them.
                    new_zones.append(zone)
            elif zone.id.startswith('country/'):
                counter['zones'] += 1
                counter['countries'] += 1
                new_zones.append(zone.id.replace('/', ':'))
            elif zone.id.startswith('country-group/'):
                counter['zones'] += 1
                counter['countrygroups'] += 1
                new_zones.append(zone.id.replace('/', ':'))
            else:
                new_zones.append(zone)
        # ``new_zones`` mixes zone objects, plain id strings and ``None``
        # (failed lookups): skip the falsy entries and normalise to ids.
        dataset.update(spatial=SpatialCoverage(
            granularity=dataset.spatial.granularity,
            zones=[getattr(z, 'id', z) for z in new_zones if z]))
    # ``vformat`` is given the Counter itself as the mapping, so counters
    # that were never incremented are rendered as 0.
    log.info(Formatter().vformat(
        '''Summary
    Processed {zones} zones in {datasets} datasets:
    - {countrygroups} country groups (World/UE)
    - {countries} countries
    - France:
        - {regions} regions
        - {counties} counties
        - {epcis} EPCIs
        - {towns} towns
        - {drom} DROM
        - {dromcom} DROM-COM
    ''', (), counter))
    log.info('Done')
Example #21
0
def sitemap_urls():
    """Yield one sitemap entry per ``fr/town`` zone.

    The generator is empty unless the ``ACTIVATE_TERRITORIES`` setting is
    truthy. Each entry is a 5-tuple: the ``territories.territory`` endpoint,
    its URL values, ``None``, ``"weekly"`` and ``0.5`` (presumably last
    modification, change frequency and priority -- confirm against the
    sitemap consumer).
    """
    territories_enabled = current_app.config.get('ACTIVATE_TERRITORIES')
    if not territories_enabled:
        return
    # Only the ``code`` field is loaded; the item yielded by the queryset is
    # passed as-is as the ``territory`` URL value.
    for town in GeoZone.objects(level='fr/town').only('code'):
        yield (
            'territories.territory',
            {'territory': town},
            None,
            "weekly",
            0.5,
        )
Example #22
0
    def serialize(cls, dataset):
        """Return the document (a plain dict) describing ``dataset``.

        The organization (or, failing that, the owner) is reloaded by id to
        get a 40px image URL. The temporal and spatial weights start from
        the ``DEFAULT_*`` constants and are refined when the dataset has a
        complete temporal coverage or a spatial coverage. Both weights feed
        the suggestion weight computed by ``cls.get_suggest_weight``.
        """
        date_format = '%Y-%m-%dT%H:%M:%S'
        spatial_weight = DEFAULT_SPATIAL_WEIGHT
        temporal_weight = DEFAULT_TEMPORAL_WEIGHT
        organization = owner = image_url = None

        # The organization takes precedence over the owner for the image.
        if dataset.organization:
            organization = Organization.objects(
                id=dataset.organization.id).first()
            image_url = organization.logo(40, external=True)
        elif dataset.owner:
            owner = User.objects(id=dataset.owner.id).first()
            image_url = owner.avatar(40, external=True)

        from_certified = organization and organization.certified

        document = {
            'title': dataset.title,
            'description': dataset.description,
            'license': getattr(dataset.license, 'id', None),
            'tags': dataset.tags,
            'badges': [badge.kind for badge in dataset.badges],
            'tag_suggest': dataset.tags,
            'resources': [
                {
                    'title': resource.title,
                    'description': resource.description,
                    'format': resource.format,
                }
                for resource in dataset.resources
            ],
            'format_suggest': [
                resource.format.lower()
                for resource in dataset.resources
                if resource.format
            ],
            'frequency': dataset.frequency,
            'organization': str(organization.id) if organization else None,
            'owner': str(owner.id) if owner else None,
            'dataset_suggest': {
                'input': cls.completer_tokenize(dataset.title) + [dataset.id],
                'output': dataset.title,
                'payload': {
                    'id': str(dataset.id),
                    'slug': dataset.slug,
                    'acronym': dataset.acronym,
                    'image_url': image_url,
                },
            },
            'created': dataset.created_at.strftime(date_format),
            'last_modified': dataset.last_modified.strftime(date_format),
            'metrics': dataset.metrics,
            'featured': dataset.featured,
            'from_certified': from_certified,
        }

        # Temporal coverage is only indexed when both bounds are set.
        coverage = dataset.temporal_coverage
        if coverage is not None and coverage.start and coverage.end:
            start = coverage.start.toordinal()
            end = coverage.end.toordinal()
            # Covered span expressed in 365-day years, capped.
            temporal_weight = min((end - start) / 365, MAX_TEMPORAL_WEIGHT)
            document['temporal_coverage'] = {'start': start, 'end': end}
            document['temporal_weight'] = temporal_weight

        spatial = dataset.spatial
        if spatial is not None:
            # Index precise zone labels and parents zone identifiers
            # to allow fast filtering.
            zone_ids = [z.id for z in spatial.zones]
            parents = set()
            geozones = []
            coverage_level = ADMIN_LEVEL_MAX
            for zone in GeoZone.objects(id__in=zone_ids).exclude('geom'):
                geozones.append({
                    'id': zone.id,
                    'name': zone.name,
                    'keys': zone.keys_values,
                })
                parents |= set(zone.parents)
                # Keep the lowest admin level found among the zones.
                coverage_level = min(coverage_level, admin_levels[zone.level])

            # Parents are indexed by id only.
            geozones += [{'id': parent} for parent in parents]

            spatial_weight = ADMIN_LEVEL_MAX / coverage_level
            document['geozones'] = geozones
            document['granularity'] = spatial.granularity
            document['spatial_weight'] = spatial_weight

        suggest = document['dataset_suggest']
        suggest['weight'] = cls.get_suggest_weight(
            temporal_weight, spatial_weight, dataset.featured)

        if dataset.acronym:
            suggest['input'].append(dataset.acronym)

        return document
Example #23
0
    def serialize(cls, dataset):
        """Return the document (a plain dict) describing ``dataset``.

        The image URL comes from the organization logo when the dataset has
        an organization, else from the owner avatar, else it is ``None``.
        ``temporal_coverage`` is only present when both bounds are set, and
        ``geozones`` / ``granularity`` only when the dataset has a spatial
        coverage.
        """
        date_format = '%Y-%m-%dT%H:%M:%S'

        organization = dataset.organization
        org_id = None if organization is None else str(organization.id)

        # The organization takes precedence over the owner for the image.
        image_url = None
        if organization:
            image_url = organization.logo(40)
        elif dataset.owner:
            image_url = dataset.owner.avatar(40)

        license_id = None
        if dataset.license is not None:
            license_id = dataset.license.id

        document = {
            'title': dataset.title,
            'description': dataset.description,
            'license': license_id,
            'tags': dataset.tags,
            'badges': [badge.kind for badge in dataset.badges],
            'tag_suggest': dataset.tags,
            'resources': [
                {
                    'title': resource.title,
                    'description': resource.description,
                    'format': resource.format,
                }
                for resource in dataset.resources
            ],
            'format_suggest': [
                resource.format.lower()
                for resource in dataset.resources
                if resource.format
            ],
            'frequency': dataset.frequency,
            'organization': org_id,
            'owner': str(dataset.owner.id) if dataset.owner else None,
            'dataset_suggest': {
                'input': cls.completer_tokenize(dataset.title) + [dataset.id],
                'output': dataset.title,
                'payload': {
                    'id': str(dataset.id),
                    'slug': dataset.slug,
                    'image_url': image_url,
                },
            },
            'created': dataset.created_at.strftime(date_format),
            'last_modified': dataset.last_modified.strftime(date_format),
            'metrics': dataset.metrics,
            'extras': dataset.extras,
            'featured': dataset.featured,
        }

        # Temporal coverage is only indexed when both bounds are set.
        coverage = dataset.temporal_coverage
        if coverage is not None and coverage.start and coverage.end:
            document['temporal_coverage'] = {
                'start': coverage.start.toordinal(),
                'end': coverage.end.toordinal(),
            }

        spatial = dataset.spatial
        if spatial is not None:
            # Index precise zone labels and parents zone identifiers
            # to allow fast filtering.
            zone_ids = [z.id for z in spatial.zones]
            parents = set()
            geozones = []
            for zone in GeoZone.objects(id__in=zone_ids):
                geozones.append({
                    'id': zone.id,
                    'name': zone.name,
                    'keys': zone.keys_values,
                })
                parents |= set(zone.parents)

            # Parents are indexed by id only.
            geozones += [{'id': parent} for parent in parents]

            document['geozones'] = geozones
            document['granularity'] = spatial.granularity

        return document
Example #24
0
    def serialize(cls, dataset):
        """Return the document (a plain dict) describing ``dataset``.

        The supplier id is only reported when it differs from the
        organization id. The image URL comes from the organization logo
        when the dataset has an organization, else from the owner avatar,
        else it is ``None``. ``temporal_coverage`` is only present when both
        bounds are set, and ``geozones`` / ``granularity`` only when the
        dataset has a spatial coverage.
        """
        date_format = "%Y-%m-%dT%H:%M:%S"

        organization = dataset.organization
        org_id = None if organization is None else str(organization.id)

        supplier_id = None
        if dataset.supplier is not None:
            supplier_id = str(dataset.supplier.id)
        # A supplier identical to the organization is not reported.
        if supplier_id == org_id:
            supplier_id = None

        # The organization takes precedence over the owner for the image.
        image_url = None
        if organization:
            image_url = organization.logo(40)
        elif dataset.owner:
            image_url = dataset.owner.avatar(40)

        license_id = None
        if dataset.license is not None:
            license_id = dataset.license.id

        document = {
            "title": dataset.title,
            "description": dataset.description,
            "license": license_id,
            "tags": dataset.tags,
            "badges": [badge.kind for badge in dataset.badges],
            "tag_suggest": dataset.tags,
            "resources": [
                {
                    "title": resource.title,
                    "description": resource.description,
                    "format": resource.format,
                }
                for resource in dataset.resources
            ],
            "format_suggest": [
                resource.format.lower()
                for resource in dataset.resources
                if resource.format
            ],
            "frequency": dataset.frequency,
            "organization": org_id,
            "owner": str(dataset.owner.id) if dataset.owner else None,
            "supplier": supplier_id,
            "dataset_suggest": {
                "input": cls.completer_tokenize(dataset.title) + [dataset.id],
                "output": dataset.title,
                "payload": {
                    "id": str(dataset.id),
                    "slug": dataset.slug,
                    "image_url": image_url,
                },
            },
            "created": dataset.created_at.strftime(date_format),
            "last_modified": dataset.last_modified.strftime(date_format),
            "metrics": dataset.metrics,
            "extras": dataset.extras,
            "featured": dataset.featured,
        }

        # Temporal coverage is only indexed when both bounds are set.
        coverage = dataset.temporal_coverage
        if coverage is not None and coverage.start and coverage.end:
            document["temporal_coverage"] = {
                "start": coverage.start.toordinal(),
                "end": coverage.end.toordinal(),
            }

        spatial = dataset.spatial
        if spatial is not None:
            # Index precise zone labels and parents zone identifiers
            # to allow fast filtering.
            zone_ids = [z.id for z in spatial.zones]
            parents = set()
            geozones = []
            for zone in GeoZone.objects(id__in=zone_ids):
                geozones.append({
                    "id": zone.id,
                    "name": zone.name,
                    "code": zone.code,
                })
                parents |= set(zone.parents)

            # Parents are indexed by id only.
            geozones += [{"id": parent} for parent in parents]

            document["geozones"] = geozones
            document["granularity"] = spatial.granularity

        return document
Example #25
0
def zone_labelizer(value):
    """Resolve ``value`` to a ``GeoZone``.

    A string is treated as a zone id and looked up with
    ``GeoZone.objects(id=value).first()``; a ``GeoZone`` instance is
    returned unchanged.
    """
    if isinstance(value, str):
        return GeoZone.objects(id=value).first()
    elif isinstance(value, GeoZone):
        return value