def error(msg, details=None):
    '''Display an error message with optional details'''
    line = safe_unicode('{0} {1}'.format(red(KO), white(safe_unicode(msg))))
    if details:
        line = b'\n'.join((line, safe_unicode(details)))
    echo(format_multiline(line))
def pre_validate(self, form):
    '''Validate submitted filters and features against the backend specs.

    :param form: the form from which the backend is resolved
    :raises validators.ValidationError: on the first malformed filter,
        unknown filter key, mistyped filter value, non-boolean feature
        value or unknown feature key.
    '''
    if not self.data:
        return
    backend = self.get_backend(form)
    # Validate filters
    for f in (self.data.get('filters') or []):
        if not ('key' in f and 'value' in f):
            msg = 'A field should have both key and value properties'
            raise validators.ValidationError(msg)
        specs = self.get_filter_specs(backend, f['key'])
        if not specs:
            msg = 'Unknown filter key "{0}" for "{1}" backend'
            msg = msg.format(f['key'], backend.name)
            raise validators.ValidationError(msg)
        if isinstance(f['value'], basestring):
            f['value'] = safe_unicode(f['value'])  # Fix encoding error
        if not isinstance(f['value'], specs.type):
            # Fixed grammar: message previously read '... should of type ...'
            msg = '"{0}" filter should be of type "{1}"'
            msg = msg.format(specs.key, specs.type.__name__)
            raise validators.ValidationError(msg)
    # Validate features
    for key, value in (self.data.get('features') or {}).items():
        if not isinstance(value, bool):
            msg = 'A feature should be a boolean'
            raise validators.ValidationError(msg)
        if not self.get_feature_specs(backend, key):
            msg = 'Unknown feature "{0}" for "{1}" backend'
            msg = msg.format(key, backend.name)
            raise validators.ValidationError(msg)
def format(self, record):
    '''Format a log record, decorating it with a level prefix on a TTY.'''
    if not IS_TTY:
        return super(CliFormatter, self).format(record)
    record.msg = b' '.join((self._prefix(record),
                            format_multiline(record.msg)))
    record.args = tuple(
        arg if isinstance(arg, NO_CAST) else safe_unicode(arg)
        for arg in record.args
    )
    return super(CliFormatter, self).format(record)
def perform_initialization(self):
    '''Initialize the harvesting for a given job'''
    log.debug('Initializing backend')
    # In dryrun mode the job lives in memory only; otherwise it is
    # persisted immediately so its status is visible while running.
    factory = HarvestJob if self.dryrun else HarvestJob.objects.create
    self.job = factory(status='initializing',
                       started=datetime.now(),
                       source=self.source)
    before_harvest_job.send(self)
    try:
        self.initialize()
        self.job.status = 'initialized'
        if not self.dryrun:
            self.job.save()
    except HarvestValidationError as e:
        # Expected validation failure: record the error and close the job.
        log.info('Initialization failed for "%s" (%s)',
                 safe_unicode(self.source.name), self.source.backend)
        error = HarvestError(message=safe_unicode(e))
        self.job.errors.append(error)
        self.job.status = 'failed'
        self.end()
        return
    except Exception as e:
        # Unexpected failure: record it and log the full traceback.
        self.job.status = 'failed'
        error = HarvestError(message=safe_unicode(e))
        self.job.errors.append(error)
        self.end()
        msg = 'Initialization failed for "{0.name}" ({0.backend})'
        log.exception(msg.format(self.source))
        return
    # Optionally cap the number of items to harvest (e.g. previews/tests).
    if self.max_items:
        self.job.items = self.job.items[:self.max_items]
    if self.job.items:
        log.debug('Queued %s items', len(self.job.items))
    return len(self.job.items)
def resource_from_rdf(graph_or_distrib, dataset=None):
    '''
    Map a DCAT/RDF distribution to a Resource domain model.

    Accepts either a `dcat:Distribution` RDF resource or a graph from
    which the first distribution node is extracted. When `dataset` is
    given, an existing resource is matched by URL and updated in place;
    otherwise (or when no match exists) a new Resource is created.
    '''
    if isinstance(graph_or_distrib, RdfResource):
        distrib = graph_or_distrib
    else:
        node = graph_or_distrib.value(predicate=RDF.type,
                                      object=DCAT.Distribution)
        distrib = graph_or_distrib.resource(node)
    # dcat:downloadURL takes precedence over dcat:accessURL
    download_url = url_from_rdf(distrib, DCAT.downloadURL)
    access_url = url_from_rdf(distrib, DCAT.accessURL)
    url = safe_unicode(download_url or access_url)
    # Match an existing resource by URL, otherwise create (and attach) one
    if dataset:
        resource = get_by(dataset.resources, 'url', url)
    if not dataset or not resource:
        resource = Resource()
        if dataset:
            dataset.resources.append(resource)
    resource.title = title_from_rdf(distrib, url)
    resource.url = url
    resource.description = sanitize_html(distrib.value(DCT.description))
    resource.filesize = rdf_value(distrib, DCAT.bytesSize)
    resource.mime = rdf_value(distrib, DCAT.mediaType)
    # DCT.term('format') because `format` clashes with a Python builtin
    fmt = rdf_value(distrib, DCT.term('format'))
    if fmt:
        resource.format = fmt.lower()
    # Optional spdx:checksum, kept only for known algorithms
    checksum = distrib.value(SPDX.checksum)
    if checksum:
        algorithm = checksum.value(SPDX.algorithm).identifier
        algorithm = CHECKSUM_ALGORITHMS.get(algorithm)
        if algorithm:
            resource.checksum = Checksum()
            resource.checksum.value = rdf_value(checksum, SPDX.checksumValue)
            resource.checksum.type = algorithm
    # Keep the existing values as defaults when dates are missing
    resource.published = rdf_value(distrib, DCT.issued, resource.published)
    resource.modified = rdf_value(distrib, DCT.modified, resource.modified)
    identifier = rdf_value(distrib, DCT.identifier)
    if identifier:
        resource.extras['dct:identifier'] = identifier
    if isinstance(distrib.identifier, URIRef):
        resource.extras['uri'] = distrib.identifier.toPython()
    return resource
def resource_from_rdf(graph_or_distrib, dataset=None):
    '''Build or update a Resource domain object from a DCAT distribution.'''
    # Either a distribution was given directly, or it is extracted
    # from the provided graph.
    if isinstance(graph_or_distrib, RdfResource):
        distrib = graph_or_distrib
    else:
        node = graph_or_distrib.value(predicate=RDF.type,
                                      object=DCAT.Distribution)
        distrib = graph_or_distrib.resource(node)

    # dcat:downloadURL wins over dcat:accessURL when both are present.
    primary = url_from_rdf(distrib, DCAT.downloadURL)
    fallback = url_from_rdf(distrib, DCAT.accessURL)
    url = safe_unicode(primary or fallback)

    # Reuse the dataset resource matching this URL, or create a fresh one.
    resource = get_by(dataset.resources, 'url', url) if dataset else None
    if not resource:
        resource = Resource()
        if dataset:
            dataset.resources.append(resource)

    resource.title = title_from_rdf(distrib, url)
    resource.url = url
    resource.description = sanitize_html(distrib.value(DCT.description))
    resource.filesize = rdf_value(distrib, DCAT.bytesSize)
    resource.mime = rdf_value(distrib, DCAT.mediaType)

    # `format` is a Python builtin, hence the DCT.term() indirection.
    fmt = rdf_value(distrib, DCT.term('format'))
    if fmt:
        resource.format = fmt.lower()

    # Only keep checksums computed with a known algorithm.
    checksum = distrib.value(SPDX.checksum)
    if checksum:
        algo = CHECKSUM_ALGORITHMS.get(
            checksum.value(SPDX.algorithm).identifier)
        if algo:
            resource.checksum = Checksum()
            resource.checksum.value = rdf_value(checksum, SPDX.checksumValue)
            resource.checksum.type = algo

    # Fall back on the current values when dates are absent from the RDF.
    resource.published = rdf_value(distrib, DCT.issued, resource.published)
    resource.modified = rdf_value(distrib, DCT.modified, resource.modified)

    identifier = rdf_value(distrib, DCT.identifier)
    if identifier:
        resource.extras['dct:identifier'] = identifier
    if isinstance(distrib.identifier, URIRef):
        resource.extras['uri'] = distrib.identifier.toPython()
    return resource
def extract_name_from_path(path):
    """Return a readable name from a URL path.

    Useful to log requests on Piwik with categories tree structure.
    See: http://piwik.org/faq/how-to/#faq_62
    """
    # `partition` is safe whether zero, one or several `?` are present,
    # whereas unpacking `path.split('?')` raised ValueError for any path
    # without exactly one `?`.
    base_path, _, query_string = path.partition('?')
    infos = base_path.strip('/').split('/')[2:]  # Removes api/version.
    if len(infos) > 1:  # This is an object.
        name = '{category} / {name}'.format(category=infos[0].title(), name=infos[1].replace('-', ' ').title())
    else:  # This is a collection.
        name = '{category}'.format(category=infos[0].title())
    return safe_unicode(name)
def extract_name_from_path(path):
    """Return a readable name from a URL path.

    Useful to log requests on Piwik with categories tree structure.
    See: http://piwik.org/faq/how-to/#faq_62
    """
    # Unpacking `path.split('?')` raised ValueError on paths without
    # exactly one `?`; `partition` handles all cases gracefully.
    base_path, _, query_string = path.partition('?')
    infos = base_path.strip('/').split('/')[2:]  # Removes api/version.
    if len(infos) > 1:  # This is an object.
        name = '{category} / {name}'.format(
            category=infos[0].title(),
            name=infos[1].replace('-', ' ').title()
        )
    else:  # This is a collection.
        name = '{category}'.format(category=infos[0].title())
    return safe_unicode(name)
def process_item(self, item):
    '''Process a single harvested item, recording its outcome on the job.'''
    log.debug('Processing: %s', item.remote_id)
    item.status = 'started'
    item.started = datetime.now()
    # Persist progress unless running a dryrun (job is memory-only then).
    if not self.dryrun:
        self.job.save()
    try:
        dataset = self.process(item)
        # Track harvesting provenance on the dataset extras.
        dataset.extras['harvest:source_id'] = str(self.source.id)
        dataset.extras['harvest:remote_id'] = item.remote_id
        dataset.extras['harvest:domain'] = self.source.domain
        dataset.extras['harvest:last_update'] = datetime.now().isoformat()

        # unset archived status if needed
        if dataset.extras.get('harvest:archived_at'):
            dataset.extras.pop('harvest:archived_at')
            dataset.extras.pop('harvest:archived')
            dataset.archived = None

        # TODO permissions checking
        # Default ownership to the source's when the dataset has none.
        if not dataset.organization and not dataset.owner:
            if self.source.organization:
                dataset.organization = self.source.organization
            elif self.source.owner:
                dataset.owner = self.source.owner

        # TODO: Apply editable mappings

        if self.dryrun:
            dataset.validate()
        else:
            dataset.save()
        item.dataset = dataset
        item.status = 'done'
    except HarvestSkipException as e:
        # Deliberate skip requested by the backend: not a failure.
        log.info('Skipped item %s : %s', item.remote_id, safe_unicode(e))
        item.status = 'skipped'
        item.errors.append(HarvestError(message=safe_unicode(e)))
    except HarvestValidationError as e:
        log.info('Error validating item %s : %s',
                 item.remote_id, safe_unicode(e))
        item.status = 'failed'
        item.errors.append(HarvestError(message=safe_unicode(e)))
    except Exception as e:
        # Unexpected failure: keep the full traceback in the error details.
        log.exception('Error while processing %s : %s',
                      item.remote_id, safe_unicode(e))
        error = HarvestError(message=safe_unicode(e),
                             details=traceback.format_exc())
        item.errors.append(error)
        item.status = 'failed'

    item.ended = datetime.now()
    if not self.dryrun:
        self.job.save()
def process_item(self, item):
    '''Process a single harvested item, recording its outcome on the job.'''
    log.debug('Processing: %s', item.remote_id)
    item.status = 'started'
    item.started = datetime.now()
    # Persist progress unless running a dryrun (job is memory-only then).
    if not self.dryrun:
        self.job.save()
    try:
        dataset = self.process(item)
        # Track harvesting provenance on the dataset extras.
        dataset.extras['harvest:source_id'] = str(self.source.id)
        dataset.extras['harvest:remote_id'] = item.remote_id
        dataset.extras['harvest:domain'] = self.source.domain
        dataset.extras['harvest:last_update'] = datetime.now().isoformat()

        # TODO permissions checking
        # Default ownership to the source's when the dataset has none.
        if not dataset.organization and not dataset.owner:
            if self.source.organization:
                dataset.organization = self.source.organization
            elif self.source.owner:
                dataset.owner = self.source.owner

        # TODO: Apply editable mappings

        if self.dryrun:
            dataset.validate()
        else:
            dataset.last_modified = datetime.now()
            dataset.save()
        item.dataset = dataset
        item.status = 'done'
    except HarvestSkipException as e:
        # Deliberate skip requested by the backend: not a failure.
        log.info('Skipped item %s : %s', item.remote_id, safe_unicode(e))
        item.status = 'skipped'
        item.errors.append(HarvestError(message=safe_unicode(e)))
    except HarvestValidationError as e:
        log.info('Error validating item %s : %s',
                 item.remote_id, safe_unicode(e))
        item.status = 'failed'
        item.errors.append(HarvestError(message=safe_unicode(e)))
    except Exception as e:
        # Unexpected failure: keep the full traceback in the error details.
        log.exception('Error while processing %s : %s',
                      item.remote_id, safe_unicode(e))
        error = HarvestError(message=safe_unicode(e),
                             details=traceback.format_exc())
        item.errors.append(error)
        item.status = 'failed'

    item.ended = datetime.now()
    if not self.dryrun:
        self.job.save()
def validate(self, data, schema):
    '''Perform a data validation against a given schema.

    :param data: an object to validate
    :param schema: a Voluptous schema to validate against
    :raises HarvestValidationError: aggregating every schema error
    '''
    try:
        return schema(data)
    except MultipleInvalid as ie:
        errors = []
        for error in ie.errors:
            if error.path:
                field = '.'.join(str(p) for p in error.path)
                # Work on a copy: popping from `error.path` itself would
                # destructively mutate the voluptuous error object.
                path = list(error.path)
                value = data
                # Walk down to the offending value for a helpful message.
                while path:
                    attr = path.pop(0)
                    try:
                        if isinstance(value, (list, tuple)):
                            attr = int(attr)
                        value = value[attr]
                    except Exception:
                        value = None
                txt = safe_unicode(error).replace('for dictionary value', '')
                txt = txt.strip()
                if isinstance(error, RequiredFieldInvalid):
                    msg = '[{0}] {1}'
                else:
                    msg = '[{0}] {1}: {2}'
                try:
                    msg = msg.format(field, txt, str(value))
                except Exception:
                    # Fall back when the value itself cannot be rendered.
                    msg = '[{0}] {1}'.format(field, txt)
            else:
                msg = str(error)
            errors.append(msg)
        msg = '\n- '.join(['Validation error:'] + errors)
        raise HarvestValidationError(msg)
def validate(self, data, schema):
    '''Perform a data validation against a given schema.

    :param data: an object to validate
    :param schema: a Voluptous schema to validate against
    :raises HarvestValidationError: aggregating every schema error
    '''
    try:
        return schema(data)
    except MultipleInvalid as ie:
        errors = []
        for error in ie.errors:
            if error.path:
                field = '.'.join(str(p) for p in error.path)
                # Copy before popping: mutating `error.path` in place
                # would corrupt the voluptuous error object.
                path = list(error.path)
                value = data
                # Follow the error path to locate the offending value.
                while path:
                    attr = path.pop(0)
                    try:
                        if isinstance(value, (list, tuple)):
                            attr = int(attr)
                        value = value[attr]
                    except Exception:
                        value = None
                txt = safe_unicode(error).replace('for dictionary value', '')
                txt = txt.strip()
                if isinstance(error, RequiredFieldInvalid):
                    msg = '[{0}] {1}'
                else:
                    msg = '[{0}] {1}: {2}'
                try:
                    msg = msg.format(field, txt, str(value))
                except Exception:
                    # Fall back when the value itself cannot be rendered.
                    msg = '[{0}] {1}'.format(field, txt)
            else:
                msg = str(error)
            errors.append(msg)
        msg = '\n- '.join(['Validation error:'] + errors)
        raise HarvestValidationError(msg)
def default_labelizer(self, value):
    '''Resolve a raw value into a model instance and label it.'''
    if not isinstance(value, self.model):
        self.validate_parameter(value)
        # Avoids shadowing the `id` builtin by inlining the lookup key.
        value = self.model.objects.get(id=self.model_field.to_mongo(value))
    return safe_unicode(value)
def test_bytes_stays_bytes(self):
    payload = b'xxx'
    assert safe_unicode(payload) == payload
def test_unicode_is_encoded(self):
    text = 'ééé'
    assert safe_unicode(text) == text.encode('utf8')
def test_object_to_string(self):
    assert b'{}' == safe_unicode({})
def test_object_to_string(self):
    assert '{}' == safe_unicode({})
def test_unicode_stays_unicode(self):
    text = 'ééé'
    assert safe_unicode(text) == text
def default_labelizer(self, value):
    '''Label a value as a cleaned-up unicode string.'''
    label = safe_unicode(value)
    return clean_string(label)
def success(msg):
    '''Display a success message'''
    formatted = '{0} {1}'.format(green(OK), white(safe_unicode(msg)))
    echo(formatted)
def header(msg):
    '''Display an header'''
    decorated = (yellow(HEADER), white(safe_unicode(msg)), yellow(HEADER))
    echo(' '.join(decorated))
def _prefix(self, record):
    '''Return the unicode log-level prefix matching a record.'''
    try:
        # Known levels carry a dedicated symbol.
        return safe_unicode(LEVELS_PREFIX[record.levelno])
    except KeyError:
        # Otherwise fall back on a colored level name.
        color = LEVEL_COLORS.get(record.levelno, white)
        return safe_unicode('{0}:'.format(color(record.levelname.upper())))
def test_bytes_stays_bytes(self):
    assert b'xxx' == safe_unicode(b'xxx')
def test_unicode_is_encoded(self):
    expected = 'ééé'.encode('utf8')
    assert safe_unicode('ééé') == expected
def format_multiline(string):
    # Draw a vertical rule in front of every continuation line,
    # closing the rule with a corner glyph on the last one.
    string = safe_unicode(string)
    string = string.replace(b'\n', b'\n│ ')
    return safe_unicode(replace_last(string, '│', '└'))
def test_bytes_is_decoded(self):
    assert 'xxx' == safe_unicode(b'xxx')
def replace_last(string, char, replacement):
    '''Replace the last occurrence of `char` in `string`.'''
    string = safe_unicode(string)
    char = safe_unicode(char)
    replacement = safe_unicode(replacement)
    # Splitting once from the right isolates the final occurrence.
    head_and_tail = string.rsplit(char, 1)
    return replacement.join(head_and_tail)
def test_unicode_to_string(self):
    assert 'é' == safe_unicode(ValueError('é'))
def test_unicode_to_string(self):
    expected = 'é'.encode('utf8')
    assert safe_unicode(ValueError('é')) == expected