Ejemplo n.º 1
0
 def query_token(self, token, token_type_hint, client):
     '''
     Find the token issued to `client` matching the `token` value.

     `token_type_hint` is provided by the requesting client, so it is only
     honoured when it names one of the two known token fields. Any other
     value (missing, empty or unsupported) falls back on a lookup against
     both the access and the refresh token instead of being used
     as a raw query field name.

     Returns the first matching token, `None` if there is none.
     '''
     qs = OAuth2Token.objects(client=client)
     if token_type_hint in ('access_token', 'refresh_token'):
         # Safe: the hint is one of the whitelisted field names
         qs = qs(**{token_type_hint: token})
     else:
         qs = qs(db.Q(access_token=token) | db.Q(refresh_token=token))
     return qs.first()
Ejemplo n.º 2
0
    def guess_one(cls, text):
        '''
        Try to guess license from a string.

        Try to exact match on identifier, slug, URL or alternate URLs,
        then fall back on edit distance ranking (after slugification),
        first against the slugs and then against the alternate titles.

        Returns the guessed license, or `None` when `text` is empty,
        when nothing matches or when the fuzzy match is ambiguous.
        '''
        if not text:
            return
        qs = cls.objects
        text = text.strip().lower()  # Stored identifiers are lower case
        slug = cls.slug.slugify(text)  # Use slug as it normalizes the string
        license = qs(
            db.Q(id__iexact=text) | db.Q(slug=slug) | db.Q(url__iexact=text)
            | db.Q(alternate_urls__iexact=text)).first()
        if license is None:
            # Try to single match `slug` with a low Damerau-Levenshtein
            # distance
            computed = ((lic, rdlevenshtein(lic.slug, slug))
                        for lic in cls.objects)
            candidates = [lic for lic, d in computed if d <= MAX_DISTANCE]
            # If there is more than one match, we cannot determine
            # which one is closer to safely choose between candidates
            if len(candidates) == 1:
                license = candidates[0]
        if license is None:
            # Try to single match `alternate_titles` with a low
            # Damerau-Levenshtein distance
            computed = ((lic, rdlevenshtein(cls.slug.slugify(t), slug))
                        for lic in cls.objects for t in lic.alternate_titles)
            candidates = [lic for lic, d in computed if d <= MAX_DISTANCE]
            # A license is listed once per matching alternate title:
            # deduplicate so that several close titles of the same license
            # still count as a single, unambiguous match
            if len(set(candidates)) == 1:
                license = candidates[0]
        return license
Ejemplo n.º 3
0
    def get(self):
        '''
        List the site activity, newest first, as a paginated result.

        The listing can be narrowed to an organization (either as
        `organization` or as `related_to`) and/or to an acting user.
        '''
        args = activity_parser.parse_args()
        activities = Activity.objects

        if args['organization']:
            org = args['organization']
            activities = activities(
                db.Q(organization=org) | db.Q(related_to=org))

        if args['user']:
            activities = activities(actor=args['user'])

        page = activities.order_by('-created_at').paginate(
            args['page'], args['page_size'])

        def is_resolvable(activity):
            # Accessing `related_to` raises when the referenced object
            # is gone, e.g. manually deleted in DB without a proper purge.
            # The error is logged (visible in sentry, silent for the user).
            try:
                activity.related_to
            except DoesNotExist as err:
                log.error(err, exc_info=True)
                return False
            return True

        # Always return a result, even an incomplete one
        page.queryset.items = [
            activity for activity in page.queryset.items
            if is_resolvable(activity)]

        return page
Ejemplo n.º 4
0
 def valid_at(self, at):
     '''
     Restrict this QuerySet to the zones valid at the `at` date.

     Zones without validity, or with neither start nor end, always match.
     Otherwise `at` needs to be on or after the start (when there is one)
     and strictly before the end (when there is one).
     '''
     unbounded = db.Q(validity=None) | db.Q(validity__start=None,
                                            validity__end=None)
     bounded = db.Q(validity__end__gt=at, validity__start__lte=at)
     start_only = db.Q(validity__start__lte=at, validity__end=None)
     end_only = db.Q(validity__start=None, validity__end__gt=at)
     return self(unbounded | bounded | start_only | end_only)
Ejemplo n.º 5
0
 def query_token(self, token, token_type_hint, client):
     '''
     Find the token of `client` matching the `token` value.

     A known `token_type_hint` restricts the lookup to that token field,
     any other hint searches both the access and the refresh tokens.
     Returns the first match.
     '''
     client_tokens = OAuth2Token.objects(client=client)
     if token_type_hint == 'access_token':
         matching = client_tokens.filter(access_token=token)
     elif token_type_hint == 'refresh_token':
         matching = client_tokens.filter(refresh_token=token)
     else:
         either = db.Q(access_token=token) | db.Q(refresh_token=token)
         matching = client_tokens(either)
     return matching.first()
Ejemplo n.º 6
0
 def from_organizations(self, user, *organizations):
     '''
     Restrict this QuerySet to the objects whose `subject` is either a
     visible dataset of `user` or of one of the `organizations`,
     or a reuse owned by any of them.
     '''
     from udata.models import Dataset, Reuse  # Circular imports.

     # Collect the subjects first, in the same order as they are queried
     subjects = list(Dataset.objects(owner=user).visible())
     for org in organizations:
         subjects.extend(Dataset.objects(organization=org).visible())
     subjects.extend(Reuse.objects.owned_by(user.id, *organizations))

     # Then OR them all together into a single query
     query = db.Q()
     for subject in subjects:
         query = query | db.Q(subject=subject)
     return self(query)
Ejemplo n.º 7
0
def check_for_territories(query):
    """
    Return a queryset of `fr/town` geozones matching the `query`.

    The queryset is empty when the query is shorter than 4 characters
    or when the `ACTIVATE_TERRITORIES` setting is off.
    Results are sorted by population then area, biggest first.
    """
    too_short = not query or len(query) < 4
    if too_short or not current_app.config.get('ACTIVATE_TERRITORIES'):
        return GeoZone.objects.none()

    looks_like_code = len(query) == 5 and query.isdigit()
    if looks_like_code:
        # A 5 digits query is matched against both INSEE and postal codes.
        criteria = db.Q(code=query) | db.Q(keys__postal__contains=query)
    else:
        # Anything else is a name: starting with the query or exact match.
        criteria = db.Q(name__istartswith=query) | db.Q(name__iexact=query)

    towns = GeoZone.objects(level='fr/town')
    return towns(criteria).order_by('-population', '-area')
Ejemplo n.º 8
0
Archivo: api.py Proyecto: taniki/udata
    def get(self):
        '''
        List the site activity, newest first, as a paginated result.

        The listing can be narrowed to an organization (either as
        `organization` or as `related_to`) and/or to an acting user.
        '''
        args = activity_parser.parse_args()
        activities = Activity.objects

        org = args['organization']
        if org:
            activities = activities(
                db.Q(organization=org) | db.Q(related_to=org))

        actor = args['user']
        if actor:
            activities = activities(actor=actor)

        newest_first = activities.order_by('-created_at')
        return newest_first.paginate(args['page'], args['page_size'])
Ejemplo n.º 9
0
 def to_python(self, value):
     '''
     Resolve `value` into a model object, a redirect or a not found error.

     The object is looked up by slug first (raw then quoted form)
     and by `id` as a fallback.
     A `NotFound` raised by the `id` lookup is returned, not raised.
     '''
     try:
         quoted = self.quote(value)
         query = db.Q(slug=value) | db.Q(slug=quoted)
         obj = self.model.objects(query).get()
     except (InvalidQueryError, self.model.DoesNotExist):
         # If the model doesn't have a slug or matching slug doesn't exist.
         obj = None
     else:
         # The stored slug differs from the requested value,
         # so the match came from the quoted form: redirect to it.
         if obj.slug != value:
             return LazyRedirect(quoted)
     try:
         # No object found by slug: fall back on a lookup by identifier.
         return obj or self.model.objects.get_or_404(id=value)
     except NotFound as e:
         if self.has_redirected_slug:
             # NOTE(review): presumably `latest` gives the current slug
             # of a renamed object - confirm against the slug field.
             latest = self.model.slug.latest(value)
             if latest:
                 return LazyRedirect(latest)
         # The exception is handed back to the caller instead of raised.
         return e
Ejemplo n.º 10
0
    def guess_one(cls, text):
        '''
        Guess a single license from a free-form string.

        Strategies are tried in turn until one of them yields a license:
        exact match (identifier, slug, URL, alternate URLs),
        URL match ignoring the scheme and the trailing slash,
        then low Damerau-Levenshtein distance on the slug,
        the title and at last the alternate titles.
        '''
        if not text:
            return
        licenses = cls.objects
        text = text.strip().lower()  # Stored identifiers are lower case
        slug = cls.slug.slugify(text)  # Slugifying normalizes the string
        exact = (
            db.Q(id__iexact=text) | db.Q(slug=slug) | db.Q(url__iexact=text)
            | db.Q(alternate_urls__iexact=text))
        found = licenses(exact).first()

        if found is None:
            # When dealing with an URL, retry without the scheme
            # and without the trailing slash
            try:
                url = validate_url(text)
            except ValidationError:
                pass
            else:
                parsed = urlparse(url)
                path = parsed.path.rstrip('/')
                needle = f'{parsed.netloc}{path}'
                found = licenses(
                    db.Q(url__icontains=needle)
                    | db.Q(alternate_urls__contains=needle)).first()

        if found is None:
            # Licenses having a `slug` close enough to the slugified text
            close = [
                candidate for candidate in cls.objects
                if rdlevenshtein(candidate.slug, slug) <= MAX_DISTANCE]
            # With more than one match there is no safe way to tell
            # which candidate is the closest one
            if len(close) == 1:
                found = close[0]

        if found is None:
            # Licenses having a `title` close enough to the text
            close = [
                candidate for candidate in cls.objects
                if rdlevenshtein(candidate.title.lower(), text)
                <= MAX_DISTANCE]
            # With more than one match there is no safe way to tell
            # which candidate is the closest one
            if len(close) == 1:
                found = close[0]

        if found is None:
            # Licenses having one of their `alternate_titles` close enough,
            # listed once per matching title
            close = [
                candidate
                for candidate in cls.objects
                for title in candidate.alternate_titles
                if rdlevenshtein(cls.slug.slugify(title), slug)
                <= MAX_DISTANCE]
            # With more than one matching license there is no safe way
            # to tell which candidate is the closest one
            if len(set(close)) == 1:
                found = close[0]
        return found
Ejemplo n.º 11
0
def check_for_territories(query):
    """
    Build the geozone queryset of the territories matching the `query`.

    Matches are ordered by decreasing population then decreasing area.
    An empty list is returned when the query is empty, when the
    `ACTIVATE_TERRITORIES` setting is off or when no handled level
    applies to the query.
    """
    if not query or not current_app.config.get('ACTIVATE_TERRITORIES'):
        return []

    query = query.lower()
    length = len(query)
    numeric = query.isdigit()
    corsican = query.startswith(('2a', '2b'))
    matchers = db.Q()

    for level in current_app.config.get('HANDLED_LEVELS'):
        if level == 'country':
            continue  # Level not fully handled yet.
        is_county = level == 'fr:departement'
        if length == 2 and is_county and (numeric or query in ('2a', '2b')):
            # Counties + Corsica.
            criteria = db.Q(code=query)
        elif length == 3 and is_county and numeric:
            # French DROM-COM.
            criteria = db.Q(code=query)
        elif length == 5 and level == 'fr:commune' and (numeric or corsican):
            # INSEE code then postal codes with Corsica exceptions.
            criteria = db.Q(code=query) | db.Q(keys__postal__contains=query)
        elif length >= 4:
            # Names starting with the query or exactly matching it.
            criteria = db.Q(name__istartswith=query) | db.Q(name__iexact=query)
        else:
            continue

        # Accumulate the per-level conditions into a single meta Q object.
        matchers |= db.Q(level=level) & criteria

    if matchers.empty:
        return []

    return GeoZone.objects(matchers).order_by('-population', '-area')
Ejemplo n.º 12
0
 def hidden(self):
     '''
     Restrict this QuerySet to the objects being private,
     having no dataset or having a deletion date.
     '''
     is_private = db.Q(private=True)
     has_no_dataset = db.Q(datasets__0__exists=False)
     is_deleted = db.Q(deleted__ne=None)
     return self(is_private | has_no_dataset | is_deleted)
Ejemplo n.º 13
0
 def get_value(self):
     '''
     Count the visible datasets having the target
     as organization or as supplier.
     '''
     target = self.target
     involved = db.Q(organization=target) | db.Q(supplier=target)
     return Dataset.objects(involved).visible().count()
Ejemplo n.º 14
0
 def filter_activities(self, qs):
     '''
     Restrict `qs` to the activities having this organization
     either as `organization` or as `related_to`.
     '''
     org = self.organization
     return qs(db.Q(organization=org) | db.Q(related_to=org))
Ejemplo n.º 15
0
 def valid_at(self, valid_date):
     '''
     Restrict this QuerySet to the zones valid at `valid_date`:
     either started on or before it and ending after it,
     or without any validity.
     '''
     without_validity = db.Q(validity=None)
     within_validity = db.Q(validity__end__gt=valid_date,
                            validity__start__lte=valid_date)
     return self(within_validity | without_validity)
Ejemplo n.º 16
0
    def process(self, item):
        '''
        Build or update the dataset matching a harvested CKAN item.

        The remote package is fetched with the `package_show` action and
        validated against `self.schema`, then its core attributes, license,
        tags, extras, spatial and temporal coverage and resources
        are mapped onto the dataset.

        Returns the dataset. No save is performed in this method.

        Raises `HarvestSkipException` when the package has no resource
        and `HarvestException` when the spatial geometry is neither
        a `Polygon` nor a `MultiPolygon`.
        '''
        response = self.get_action('package_show', id=item.remote_id)
        data = self.validate(response['result'], self.schema)

        # The validated result may be a list: only the first entry is used
        if type(data) == list:
            data = data[0]

        # Fix the remote_id: use real ID instead of not stable name
        item.remote_id = data['id']

        # Skip if no resource
        if not len(data.get('resources', [])):
            msg = 'Dataset {0} has no record'.format(item.remote_id)
            raise HarvestSkipException(msg)

        dataset = self.get_dataset(item.remote_id)

        # Core attributes
        # An already defined slug is kept as is
        if not dataset.slug:
            dataset.slug = data['name']
        dataset.title = data['title']
        dataset.description = parse_html(data['notes'])

        # Detect license
        # The current license (or the default one) is the fallback
        default_license = dataset.license or License.default()
        dataset.license = License.guess(data['license_id'],
                                        data['license_title'],
                                        default=default_license)

        # Tags with an empty name are dropped
        dataset.tags = [t['name'] for t in data['tags'] if t['name']]

        dataset.created_at = data['metadata_created']
        dataset.last_modified = data['metadata_modified']

        dataset.extras['ckan:name'] = data['name']

        # Collected while iterating over the extras and applied afterward
        temporal_start, temporal_end = None, None
        spatial_geom, spatial_zone = None, None

        for extra in data['extras']:
            key = extra['key']
            value = extra['value']
            if value is None or (isinstance(value, str) and not value.strip()):
                # Skip empty extras
                continue
            elif key == 'spatial':
                # GeoJSON representation
                # (only Polygon and MultiPolygon are accepted below)
                spatial_geom = json.loads(value)
            elif key == 'spatial-text':
                # Textual representation of the extent / location
                qs = GeoZone.objects(db.Q(name=value) | db.Q(slug=value))
                qs = qs.valid_at(datetime.now())
                # Only an unambiguous match is used as spatial zone
                if qs.count() == 1:
                    spatial_zone = qs.first()
                else:
                    dataset.extras['ckan:spatial-text'] = value
                    log.debug('spatial-text value not handled: %s', value)
            elif key == 'spatial-uri':
                # Linked Data URI representing the place name
                dataset.extras['ckan:spatial-uri'] = value
                log.debug('spatial-uri value not handled: %s', value)
            elif key == 'frequency':
                # Update frequency
                freq = frequency_from_rdf(value)
                if freq:
                    dataset.frequency = freq
                elif value in UPDATE_FREQUENCIES:
                    dataset.frequency = value
                else:
                    # Unknown frequencies are kept as a raw extra
                    dataset.extras['ckan:frequency'] = value
                    log.debug('frequency value not handled: %s', value)
            # Temporal coverage start
            elif key == 'temporal_start':
                temporal_start = daterange_start(value)
            # Temporal coverage end
            elif key == 'temporal_end':
                temporal_end = daterange_end(value)
            else:
                # Any other extra is stored as is, under its original key
                dataset.extras[extra['key']] = value

        # A new spatial coverage replaces the existing one, if any
        if spatial_geom or spatial_zone:
            dataset.spatial = SpatialCoverage()

        if spatial_zone:
            dataset.spatial.zones = [spatial_zone]

        if spatial_geom:
            # The geometry is always stored as a MultiPolygon:
            # a single Polygon is wrapped into a one-element list
            if spatial_geom['type'] == 'Polygon':
                coordinates = [spatial_geom['coordinates']]
            elif spatial_geom['type'] == 'MultiPolygon':
                coordinates = spatial_geom['coordinates']
            else:
                raise HarvestException('Unsupported spatial geometry')
            dataset.spatial.geom = {
                'type': 'MultiPolygon',
                'coordinates': coordinates
            }

        # The temporal coverage is only set when both bounds are known
        if temporal_start and temporal_end:
            dataset.temporal_coverage = db.DateRange(
                start=temporal_start,
                end=temporal_end,
            )

        # Remote URL
        dataset.extras['remote_url'] = self.dataset_url(data['name'])
        if data.get('url'):
            try:
                url = uris.validate(data['url'])
            except uris.ValidationError:
                # An invalid declared URL is kept as a raw extra
                dataset.extras['ckan:source'] = data['url']
            else:
                # use declared `url` as `remote_url` if any
                dataset.extras['remote_url'] = url

        # Resources
        for res in data['resources']:
            if res['resource_type'] not in ALLOWED_RESOURCE_TYPES:
                continue
            try:
                resource = get_by(dataset.resources, 'id', UUID(res['id']))
            except Exception:
                # The resource is skipped when its ID can not be handled
                log.error('Unable to parse resource ID %s', res['id'])
                continue
            # Unknown resources are created and attached to the dataset
            if not resource:
                resource = Resource(id=res['id'])
                dataset.resources.append(resource)
            resource.title = res.get('name', '') or ''
            resource.description = parse_html(res.get('description'))
            resource.url = res['url']
            resource.filetype = 'remote'
            resource.format = res.get('format')
            resource.mime = res.get('mimetype')
            resource.hash = res.get('hash')
            resource.created = res['created']
            resource.modified = res['last_modified']
            # An existing publication date is kept
            resource.published = resource.published or resource.created

        return dataset
Ejemplo n.º 17
0
 def hidden(self):
     '''
     Restrict this QuerySet to the objects being private, having
     no resource, having a deletion date or having an archival date.
     '''
     is_private = db.Q(private=True)
     has_no_resource = db.Q(resources__0__exists=False)
     is_deleted = db.Q(deleted__ne=None)
     is_archived = db.Q(archived__ne=None)
     return self(is_private | has_no_resource | is_deleted | is_archived)
Ejemplo n.º 18
0
 def owned_by(self, *owners):
     '''
     Restrict this QuerySet to the objects having any of the `owners`
     either as `owner` or as `organization`.
     '''
     ownership = db.Q()
     for candidate in owners:
         as_owner = db.Q(owner=candidate)
         as_organization = db.Q(organization=candidate)
         ownership = ownership | (as_owner | as_organization)
     return self(ownership)