Example #1
0
def error(msg, details=None):
    '''
    Echo an error line, optionally followed by a block of details.

    :param msg: the message displayed after the ``KO`` marker
    :param details: optional extra content appended on a new line
    '''
    headline = '{0} {1}'.format(red(KO), white(safe_unicode(msg)))
    output = safe_unicode(headline)
    if details:
        # The separator is a byte string, so both parts are presumably byte
        # strings once through safe_unicode -- confirm against its definition.
        output = b'\n'.join((output, safe_unicode(details)))
    echo(format_multiline(output))
Example #2
0
    def pre_validate(self, form):
        '''
        Check the submitted filters and features against the chosen backend.

        Nothing is checked when the field holds no data.

        :param form: the form passed to ``get_backend`` to resolve the backend
        :raises validators.ValidationError: on a filter missing ``key`` or
            ``value``, an unknown filter key, a filter value of the wrong
            type, a non-boolean feature or an unknown feature
        '''
        if not self.data:
            return
        backend = self.get_backend(form)

        # Validate filters
        for f in (self.data.get('filters') or []):
            if not ('key' in f and 'value' in f):
                msg = 'A field should have both key and value properties'
                raise validators.ValidationError(msg)

            specs = self.get_filter_specs(backend, f['key'])
            if not specs:
                msg = 'Unknown filter key "{0}" for "{1}" backend'
                raise validators.ValidationError(
                    msg.format(f['key'], backend.name))

            # Normalise string values in place before the type check
            if isinstance(f['value'], basestring):
                f['value'] = safe_unicode(f['value'])  # Fix encoding error

            if not isinstance(f['value'], specs.type):
                # Message previously read "should of type" (missing verb)
                msg = '"{0}" filter should be of type "{1}"'
                raise validators.ValidationError(
                    msg.format(specs.key, specs.type.__name__))

        # Validate features
        for key, value in (self.data.get('features') or {}).items():
            if not isinstance(value, bool):
                msg = 'A feature should be a boolean'
                raise validators.ValidationError(msg)
            if not self.get_feature_specs(backend, key):
                msg = 'Unknown feature "{0}" for "{1}" backend'
                raise validators.ValidationError(
                    msg.format(key, backend.name))
Example #3
0
 def format(self, record):
     '''
     Format a log record, decorating it first when ``IS_TTY`` is true.

     The decoration rewrites ``record.msg`` (multi-line layout plus the
     level prefix) and passes every argument that is not a ``NO_CAST``
     instance through ``safe_unicode``.
     '''
     if IS_TTY:
         record.msg = format_multiline(record.msg)
         prefix = self._prefix(record)
         record.msg = b' '.join((prefix, record.msg))
         casted = []
         for arg in record.args:
             if isinstance(arg, NO_CAST):
                 casted.append(arg)
             else:
                 casted.append(safe_unicode(arg))
         record.args = tuple(casted)
     return super(CliFormatter, self).format(record)
Example #4
0
    def perform_initialization(self):
        '''
        Create the job, run the backend initialization and count the items.

        In dry-run mode the job is instantiated directly instead of going
        through ``HarvestJob.objects.create`` and is not saved here.

        :returns: the number of items on the job, or ``None`` when the
            initialization failed
        '''
        log.debug('Initializing backend')
        if self.dryrun:
            build_job = HarvestJob
        else:
            build_job = HarvestJob.objects.create
        self.job = build_job(status='initializing',
                             started=datetime.now(),
                             source=self.source)

        before_harvest_job.send(self)

        try:
            self.initialize()
            self.job.status = 'initialized'
            if not self.dryrun:
                self.job.save()
        except HarvestValidationError as invalid:
            # Validation failures are logged at info level, without traceback
            log.info('Initialization failed for "%s" (%s)',
                     safe_unicode(self.source.name), self.source.backend)
            failure = HarvestError(message=safe_unicode(invalid))
            self.job.errors.append(failure)
            self.job.status = 'failed'
            self.end()
            return
        except Exception as unexpected:
            # Any other failure is logged with its traceback
            self.job.status = 'failed'
            failure = HarvestError(message=safe_unicode(unexpected))
            self.job.errors.append(failure)
            self.end()
            template = 'Initialization failed for "{0.name}" ({0.backend})'
            log.exception(template.format(self.source))
            return

        # Truncate the queue when a maximum number of items is set
        if self.max_items:
            self.job.items = self.job.items[:self.max_items]

        if self.job.items:
            log.debug('Queued %s items', len(self.job.items))

        return len(self.job.items)
Example #5
0
def resource_from_rdf(graph_or_distrib, dataset=None):
    '''
    Build or update a Resource from a DCAT distribution.

    :param graph_or_distrib: either an ``RdfResource`` used directly as the
        distribution, or a graph in which the node typed
        ``DCAT.Distribution`` is looked up
    :param dataset: optional dataset; when given, its resource having the
        same URL is updated in place, otherwise the new resource is
        appended to it
    :returns: the created or updated resource
    '''
    if isinstance(graph_or_distrib, RdfResource):
        distrib = graph_or_distrib
    else:
        node = graph_or_distrib.value(predicate=RDF.type,
                                      object=DCAT.Distribution)
        distrib = graph_or_distrib.resource(node)

    # The download URL takes precedence over the access URL
    download_url = url_from_rdf(distrib, DCAT.downloadURL)
    access_url = url_from_rdf(distrib, DCAT.accessURL)
    url = safe_unicode(download_url or access_url)

    if dataset:
        resource = get_by(dataset.resources, 'url', url)
    if not dataset or not resource:
        resource = Resource()
        if dataset:
            dataset.resources.append(resource)
    resource.title = title_from_rdf(distrib, url)
    resource.url = url
    resource.description = sanitize_html(distrib.value(DCT.description))
    resource.filesize = rdf_value(distrib, DCAT.bytesSize)
    resource.mime = rdf_value(distrib, DCAT.mediaType)
    fmt = rdf_value(distrib, DCT.term('format'))
    if fmt:
        resource.format = fmt.lower()
    checksum = distrib.value(SPDX.checksum)
    if checksum:
        # A checksum node may come without an algorithm: skip the checksum
        # instead of failing on a missing node.
        algorithm_node = checksum.value(SPDX.algorithm)
        algorithm = None
        if algorithm_node is not None:
            algorithm = CHECKSUM_ALGORITHMS.get(algorithm_node.identifier)
        # Only algorithms known to CHECKSUM_ALGORITHMS are kept
        if algorithm:
            resource.checksum = Checksum()
            resource.checksum.value = rdf_value(checksum, SPDX.checksumValue)
            resource.checksum.type = algorithm

    # Existing dates are passed as the third argument to rdf_value
    # (presumably a fallback default -- confirm against rdf_value)
    resource.published = rdf_value(distrib, DCT.issued, resource.published)
    resource.modified = rdf_value(distrib, DCT.modified, resource.modified)

    identifier = rdf_value(distrib, DCT.identifier)
    if identifier:
        resource.extras['dct:identifier'] = identifier

    if isinstance(distrib.identifier, URIRef):
        resource.extras['uri'] = distrib.identifier.toPython()

    return resource
Example #6
0
def resource_from_rdf(graph_or_distrib, dataset=None):
    '''
    Build or update a Resource from a DCAT distribution.

    :param graph_or_distrib: either an ``RdfResource`` used directly as the
        distribution, or a graph in which the node typed
        ``DCAT.Distribution`` is looked up
    :param dataset: optional dataset; when given, its resource having the
        same URL is updated in place, otherwise the new resource is
        appended to it
    :returns: the created or updated resource
    '''
    if isinstance(graph_or_distrib, RdfResource):
        distrib = graph_or_distrib
    else:
        node = graph_or_distrib.value(predicate=RDF.type,
                                      object=DCAT.Distribution)
        distrib = graph_or_distrib.resource(node)

    # The download URL takes precedence over the access URL
    download_url = url_from_rdf(distrib, DCAT.downloadURL)
    access_url = url_from_rdf(distrib, DCAT.accessURL)
    url = safe_unicode(download_url or access_url)

    if dataset:
        resource = get_by(dataset.resources, 'url', url)
    if not dataset or not resource:
        resource = Resource()
        if dataset:
            dataset.resources.append(resource)
    resource.title = title_from_rdf(distrib, url)
    resource.url = url
    resource.description = sanitize_html(distrib.value(DCT.description))
    resource.filesize = rdf_value(distrib, DCAT.bytesSize)
    resource.mime = rdf_value(distrib, DCAT.mediaType)
    fmt = rdf_value(distrib, DCT.term('format'))
    if fmt:
        resource.format = fmt.lower()
    checksum = distrib.value(SPDX.checksum)
    if checksum:
        # A checksum node may come without an algorithm: skip the checksum
        # instead of failing on a missing node.
        algorithm_node = checksum.value(SPDX.algorithm)
        algorithm = None
        if algorithm_node is not None:
            algorithm = CHECKSUM_ALGORITHMS.get(algorithm_node.identifier)
        # Only algorithms known to CHECKSUM_ALGORITHMS are kept
        if algorithm:
            resource.checksum = Checksum()
            resource.checksum.value = rdf_value(checksum, SPDX.checksumValue)
            resource.checksum.type = algorithm

    # Existing dates are passed as the third argument to rdf_value
    # (presumably a fallback default -- confirm against rdf_value)
    resource.published = rdf_value(distrib, DCT.issued, resource.published)
    resource.modified = rdf_value(distrib, DCT.modified, resource.modified)

    identifier = rdf_value(distrib, DCT.identifier)
    if identifier:
        resource.extras['dct:identifier'] = identifier

    if isinstance(distrib.identifier, URIRef):
        resource.extras['uri'] = distrib.identifier.toPython()

    return resource
Example #7
0
def extract_name_from_path(path):
    """Return a readable name from a URL path.

    Useful to log requests on Piwik with categories tree structure.
    See: http://piwik.org/faq/how-to/#faq_62

    Everything from the first ``?`` onwards is ignored, so the path may
    carry no query string at all or one that itself contains ``?``.
    The first two path segments (api/version) are dropped; an
    ``IndexError`` is raised when nothing is left after them.
    """
    # partition never fails to unpack, unlike a two-target split('?')
    base_path = path.partition('?')[0]
    infos = base_path.strip('/').split('/')[2:]  # Removes api/version.
    category = infos[0].title()
    if len(infos) > 1:  # This is an object.
        name = '{category} / {name}'.format(
            category=category,
            name=infos[1].replace('-', ' ').title()
        )
    else:  # This is a collection.
        name = '{category}'.format(category=category)
    return safe_unicode(name)
Example #8
0
def extract_name_from_path(path):
    """Return a readable name from a URL path.

    Useful to log requests on Piwik with categories tree structure.
    See: http://piwik.org/faq/how-to/#faq_62

    Everything from the first ``?`` onwards is ignored, so the path may
    carry no query string at all or one that itself contains ``?``.
    The first two path segments (api/version) are dropped; an
    ``IndexError`` is raised when nothing is left after them.
    """
    # partition never fails to unpack, unlike a two-target split('?')
    base_path = path.partition('?')[0]
    infos = base_path.strip('/').split('/')[2:]  # Removes api/version.
    category = infos[0].title()
    if len(infos) > 1:  # This is an object.
        name = '{category} / {name}'.format(
            category=category,
            name=infos[1].replace('-', ' ').title()
        )
    else:  # This is a collection.
        name = '{category}'.format(category=category)
    return safe_unicode(name)
Example #9
0
    def process_item(self, item):
        '''
        Process a single harvest item and record the outcome on it.

        The item gets its start and end times set and ends up with a
        ``done``, ``skipped`` or ``failed`` status. Unless in dry-run mode,
        the job is saved before and after the processing.

        :param item: the harvest item to process
        '''
        log.debug('Processing: %s', item.remote_id)
        item.status = 'started'
        item.started = datetime.now()
        if not self.dryrun:
            self.job.save()

        try:
            dataset = self.process(item)

            # Keep track of where the dataset comes from
            dataset.extras['harvest:source_id'] = str(self.source.id)
            dataset.extras['harvest:remote_id'] = item.remote_id
            dataset.extras['harvest:domain'] = self.source.domain
            dataset.extras['harvest:last_update'] = datetime.now().isoformat()

            # A dataset seen again is no longer archived
            if dataset.extras.get('harvest:archived_at'):
                dataset.extras.pop('harvest:archived_at')
                dataset.extras.pop('harvest:archived')
                dataset.archived = None

            # TODO permissions checking
            # Datasets without any owner inherit the one of the source
            if not (dataset.organization or dataset.owner):
                if self.source.organization:
                    dataset.organization = self.source.organization
                elif self.source.owner:
                    dataset.owner = self.source.owner

            # TODO: Apply editable mappings

            # A dry run only validates, nothing is saved
            if not self.dryrun:
                dataset.save()
            else:
                dataset.validate()
            item.dataset = dataset
            item.status = 'done'
        except HarvestSkipException as skipped:
            log.info('Skipped item %s : %s',
                     item.remote_id, safe_unicode(skipped))
            item.status = 'skipped'
            item.errors.append(HarvestError(message=safe_unicode(skipped)))
        except HarvestValidationError as invalid:
            log.info('Error validating item %s : %s',
                     item.remote_id, safe_unicode(invalid))
            item.status = 'failed'
            item.errors.append(HarvestError(message=safe_unicode(invalid)))
        except Exception as unexpected:
            # Unexpected failures also keep the traceback as error details
            log.exception('Error while processing %s : %s',
                          item.remote_id, safe_unicode(unexpected))
            failure = HarvestError(message=safe_unicode(unexpected),
                                   details=traceback.format_exc())
            item.errors.append(failure)
            item.status = 'failed'

        item.ended = datetime.now()
        if not self.dryrun:
            self.job.save()
Example #10
0
    def process_item(self, item):
        '''
        Process a single harvest item and record the outcome on it.

        The item gets its start and end times set and ends up with a
        ``done``, ``skipped`` or ``failed`` status. Unless in dry-run mode,
        the job is saved before and after the processing.

        :param item: the harvest item to process
        '''
        log.debug('Processing: %s', item.remote_id)
        item.status = 'started'
        item.started = datetime.now()
        if not self.dryrun:
            self.job.save()

        try:
            dataset = self.process(item)

            # Keep track of where the dataset comes from
            dataset.extras['harvest:source_id'] = str(self.source.id)
            dataset.extras['harvest:remote_id'] = item.remote_id
            dataset.extras['harvest:domain'] = self.source.domain
            dataset.extras['harvest:last_update'] = datetime.now().isoformat()

            # TODO permissions checking
            # Datasets without any owner inherit the one of the source
            if not (dataset.organization or dataset.owner):
                if self.source.organization:
                    dataset.organization = self.source.organization
                elif self.source.owner:
                    dataset.owner = self.source.owner

            # TODO: Apply editable mappings

            # A dry run only validates, nothing is saved
            if not self.dryrun:
                dataset.last_modified = datetime.now()
                dataset.save()
            else:
                dataset.validate()
            item.dataset = dataset
            item.status = 'done'
        except HarvestSkipException as skipped:
            log.info('Skipped item %s : %s',
                     item.remote_id, safe_unicode(skipped))
            item.status = 'skipped'
            item.errors.append(HarvestError(message=safe_unicode(skipped)))
        except HarvestValidationError as invalid:
            log.info('Error validating item %s : %s',
                     item.remote_id, safe_unicode(invalid))
            item.status = 'failed'
            item.errors.append(HarvestError(message=safe_unicode(invalid)))
        except Exception as unexpected:
            # Unexpected failures also keep the traceback as error details
            log.exception('Error while processing %s : %s',
                          item.remote_id, safe_unicode(unexpected))
            failure = HarvestError(message=safe_unicode(unexpected),
                                   details=traceback.format_exc())
            item.errors.append(failure)
            item.status = 'failed'

        item.ended = datetime.now()
        if not self.dryrun:
            self.job.save()
Example #11
0
    def validate(self, data, schema):
        '''Validate some data against a schema.

        :param data: an object to validate
        :param schema: a Voluptuous schema to validate against
        :returns: whatever calling the schema on the data returns
        :raises HarvestValidationError: with one line per reported error
            when the schema raises ``MultipleInvalid``
        '''
        try:
            return schema(data)
        except MultipleInvalid as invalid:
            errors = []
            for error in invalid.errors:
                if not error.path:
                    errors.append(str(error))
                    continue

                field = '.'.join(str(step) for step in error.path)

                # Walk the data along the error path to find the faulty
                # value. NOTE(review): the path list is consumed in place,
                # before the error is rendered below, which may change the
                # rendered text -- confirm before walking a copy instead.
                remaining = error.path
                value = data
                while remaining:
                    attr = remaining.pop(0)
                    try:
                        if isinstance(value, (list, tuple)):
                            attr = int(attr)
                        value = value[attr]
                    except Exception:
                        value = None

                txt = safe_unicode(error)
                txt = txt.replace('for dictionary value', '').strip()
                # The value is not displayed for a missing required field
                if isinstance(error, RequiredFieldInvalid):
                    template = '[{0}] {1}'
                else:
                    template = '[{0}] {1}: {2}'
                try:
                    line = template.format(field, txt, str(value))
                except Exception:
                    line = '[{0}] {1}'.format(field, txt)
                errors.append(line)

            report = '\n- '.join(['Validation error:'] + errors)
            raise HarvestValidationError(report)
Example #12
0
    def validate(self, data, schema):
        '''Validate some data against a schema.

        :param data: an object to validate
        :param schema: a Voluptuous schema to validate against
        :returns: whatever calling the schema on the data returns
        :raises HarvestValidationError: with one line per reported error
            when the schema raises ``MultipleInvalid``
        '''
        try:
            return schema(data)
        except MultipleInvalid as invalid:
            errors = []
            for error in invalid.errors:
                if not error.path:
                    errors.append(str(error))
                    continue

                field = '.'.join(str(step) for step in error.path)

                # Walk the data along the error path to find the faulty
                # value. NOTE(review): the path list is consumed in place,
                # before the error is rendered below, which may change the
                # rendered text -- confirm before walking a copy instead.
                remaining = error.path
                value = data
                while remaining:
                    attr = remaining.pop(0)
                    try:
                        if isinstance(value, (list, tuple)):
                            attr = int(attr)
                        value = value[attr]
                    except Exception:
                        value = None

                txt = safe_unicode(error)
                txt = txt.replace('for dictionary value', '').strip()
                # The value is not displayed for a missing required field
                if isinstance(error, RequiredFieldInvalid):
                    template = '[{0}] {1}'
                else:
                    template = '[{0}] {1}: {2}'
                try:
                    line = template.format(field, txt, str(value))
                except Exception:
                    line = '[{0}] {1}'.format(field, txt)
                errors.append(line)

            report = '\n- '.join(['Validation error:'] + errors)
            raise HarvestValidationError(report)
Example #13
0
 def default_labelizer(self, value):
     '''
     Return the label of a value as given by ``safe_unicode``.

     A value that is not an instance of ``self.model`` is validated,
     converted with ``self.model_field.to_mongo`` and used as the id to
     fetch the matching object first.
     '''
     if isinstance(value, self.model):
         return safe_unicode(value)
     self.validate_parameter(value)
     pk = self.model_field.to_mongo(value)
     return safe_unicode(self.model.objects.get(id=pk))
Example #14
0
 def test_bytes_stays_bytes(self):
     '''A byte string is expected to come back equal to itself.'''
     result = safe_unicode(b'xxx')
     assert result == b'xxx'
Example #15
0
 def test_unicode_is_encoded(self):
     '''A text string is expected to come back as its UTF-8 encoding.'''
     result = safe_unicode('ééé')
     assert result == 'ééé'.encode('utf8')
Example #16
0
 def test_object_to_string(self):
     '''An empty dict is expected to come back as the bytes ``{}``.'''
     result = safe_unicode({})
     assert result == b'{}'
Example #17
0
 def test_object_to_string(self):
     '''An empty dict is expected to come back as the text ``{}``.'''
     result = safe_unicode({})
     assert result == '{}'
Example #18
0
 def test_unicode_stays_unicode(self):
     '''A text string is expected to come back equal to itself.'''
     result = safe_unicode('ééé')
     assert result == 'ééé'
Example #19
0
 def default_labelizer(self, value):
     '''Return the value passed through ``safe_unicode`` then ``clean_string``.'''
     label = safe_unicode(value)
     return clean_string(label)
Example #20
0
def success(msg):
    '''Echo a success line: the ``OK`` marker followed by the message.'''
    marker = green(OK)
    text = white(safe_unicode(msg))
    echo('{0} {1}'.format(marker, text))
Example #21
0
def header(msg):
    '''Echo a header: the message surrounded by two ``HEADER`` markers.'''
    opening = yellow(HEADER)
    title = white(safe_unicode(msg))
    closing = yellow(HEADER)
    echo(' '.join([opening, title, closing]))
Example #22
0
 def _prefix(self, record):
     '''
     Return the prefix displayed before a record message.

     Levels listed in ``LEVELS_PREFIX`` use their dedicated prefix; any
     other level falls back to its upper-cased name followed by a colon,
     wrapped by the ``LEVEL_COLORS`` entry for that level (``white`` when
     there is none).
     '''
     level = record.levelno
     if level not in LEVELS_PREFIX:
         colorize = LEVEL_COLORS.get(level, white)
         label = colorize(record.levelname.upper())
         return safe_unicode('{0}:'.format(label))
     return safe_unicode(LEVELS_PREFIX[level])
Example #23
0
 def test_bytes_stays_bytes(self):
     '''A byte string is expected to come back equal to itself.'''
     result = safe_unicode(b'xxx')
     assert result == b'xxx'
Example #24
0
 def test_unicode_is_encoded(self):
     '''A text string is expected to come back as its UTF-8 encoding.'''
     result = safe_unicode('ééé')
     assert result == 'ééé'.encode('utf8')
Example #25
0
def format_multiline(string):
    '''
    Decorate a multi-line text with a left-hand guide.

    Every newline is followed by a ``│ `` guide, then the last ``│`` found
    in the result is swapped for ``└``.

    :param string: the text to decorate, passed through ``safe_unicode``
    :returns: the decorated text, passed through ``safe_unicode``
    '''
    string = safe_unicode(string)
    # b'\n\xe2\x94\x82 ' is '\n│ ' encoded as UTF-8. Bytes literals only
    # accept ASCII characters on Python 3, hence the escapes
    # (assumes this source file is UTF-8 encoded).
    string = string.replace(b'\n', b'\n\xe2\x94\x82 ')
    return safe_unicode(replace_last(string, '│', '└'))
Example #26
0
 def test_bytes_is_decoded(self):
     '''A byte string is expected to come back as the matching text.'''
     result = safe_unicode(b'xxx')
     assert result == 'xxx'
Example #27
0
def replace_last(string, char, replacement):
    '''
    Replace the last occurrence of ``char`` in ``string``.

    All three arguments are passed through ``safe_unicode`` first. When
    ``char`` is not found, the converted string is returned as is.
    '''
    needle = safe_unicode(char)
    substitute = safe_unicode(replacement)
    pieces = safe_unicode(string).rsplit(needle, 1)
    return substitute.join(pieces)
Example #28
0
 def test_unicode_to_string(self):
     '''An exception carrying non-ASCII text is expected to give that text.'''
     error = ValueError('é')
     assert safe_unicode(error) == 'é'
Example #29
0
 def test_unicode_to_string(self):
     '''An exception carrying non-ASCII text is expected to give its UTF-8 bytes.'''
     error = ValueError('é')
     assert safe_unicode(error) == 'é'.encode('utf8')