Example No. 1
def datapackage(source, **options):
    errors = []
    tables = []

    # Prepare datapackage
    datapackage = DataPackage(source, **options)
    for exception in datapackage.iter_errors():
        # Error message should contain datapackage source (often it's path)
        message = spec['errors']['datapackage-error']['message']
        message = message.format(
            error_message='{problem} [{source}]'.format(
                problem=str(exception).splitlines()[0],
                source=str(source)))
        errors.append({
            'code': 'datapackage-error',
            'message': message,
            'row-number': None,
            'column-number': None,
        })

    # Add tables
    if not errors:
        for resource in datapackage.resources:
            path = resource.remote_data_path or resource.local_data_path
            tables.append({
                'source': path,
                'stream': Stream(path, headers=1),
                'schema': Schema(resource.descriptor['schema']),
                'extra': {
                    'datapackage': str(source),
                },
            })

    return errors, tables
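A minimal usage sketch for this preset, assuming the surrounding goodtables-style module provides the imports used above (spec, DataPackage, Stream, Schema) and that 'datapackage.json' is a hypothetical local descriptor:

errors, tables = datapackage('datapackage.json')  # hypothetical descriptor path
for error in errors:
    print(error['code'], error['message'])
for table in tables:
    print('will inspect', table['source'])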
Example No. 2
 def get(self):
   url = self.request.get('url', None)
   if url:
     # Normalise the package URL so it always ends with a slash
     if not re.match(".*?/$", url):
       url = "%s/" % url
     data = memcache.get('metadata-%s' % url)
     self.response.headers['Content-Type'] = 'application/json; charset=utf-8;'
     self.response.headers['Access-Control-Allow-Origin'] = '*'
     if not data:
       # Serialise the descriptor once, cache it for 300 seconds, then return it
       d = DataPackage(url)
       descriptor_json = json.dumps(d.get_descriptor())
       memcache.add('metadata-%s' % url, descriptor_json, 300)
       self.response.write(descriptor_json)
     else:
       self.response.write(data)
   else:
     self.response.write("Error: please specify URL")
Example No. 3
def test_pull_datapackage(storage, descriptor):

    # Prepare and call
    storage.buckets = ['data___data']
    storage.describe.return_value = (
        {'fields': [
            {'name': 'id', 'type': 'integer'},
            {'name': 'city', 'type': 'string'}]})
    storage.read.return_value = [
        (1, 'London'),
        (2, 'Paris'),
    ]
    module.pull_datapackage(descriptor=descriptor, name='name', backend='backend')

    # Assert pulled datapackage
    dp = DataPackage(descriptor)
    assert dp.descriptor == helpers.expand_data_package_descriptor({
        'name': 'name',
        'resources': [{
            'path': ['data.csv'],
            'name': 'data',
            'schema': {
                'fields': [
                    {'name': 'id', 'type': 'integer'},
                    {'name': 'city', 'type': 'string'},
                ],
            },
        }],
    })
Example No. 4
def get_fiscal_datapackage(skip_validation=False, source=None):
    """Create the master fiscal datapackage from parts."""

    with open(FISCAL_METADATA_FILE) as stream:
        fiscal_datapackage = yaml.load(stream.read())

    if source:
        datapackage = source
        datapackage['name'] = slugify(os.getcwd().lstrip(DATA_DIR)).lower()
    else:
        datapackage = fiscal_datapackage

    with open(FISCAL_SCHEMA_FILE) as stream:
        schema = yaml.load(stream.read())
        datapackage['resources'][0]['schema'] = schema
        datapackage['resources'][0].update(mediatype='text/csv')
        datapackage['resources'] = [datapackage['resources'][0]]

        # TODO: Update the resource properties in the fiscal data-package

    with open(FISCAL_MODEL_FILE) as stream:
        datapackage['model'] = yaml.load(stream.read())

    if not skip_validation:
        DataPackage(datapackage, schema='fiscal').validate()

    return datapackage
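A quick usage sketch; it assumes the FISCAL_* constants point at the project's YAML parts on disk:

descriptor = get_fiscal_datapackage(skip_validation=True)
print(descriptor['resources'][0]['mediatype'])  # 'text/csv', set above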
Example No. 5
def get(location):
    """
    Helper function to retrieve data from a data package located at the
    provided location.
    """
    datapkg = DataPackage(location)
    return datapkg.data
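Usage is a one-liner; the URL below is only an example, and the shape of the return value is whatever DataPackage.data exposes in the installed library version:

data = get('http://example.org/my-package/datapackage.json')  # example URL
print(data)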
Example No. 6
def assemble(metadata_file):
    """Assemble a data-package from its descriptor parts."""

    def read(file):
        with open(file) as yaml:
            return load(yaml.read())

    def add_name(info):
        info['name'] = slugify(info['title'], separator='_')
        return info

    def get_files(filetype):
        filename = metadata_file.replace('metadata', filetype)
        folder = dirname(metadata_file)
        schema_files_pattern = join(folder, filename)
        return glob(schema_files_pattern)

    descriptor = add_name(read(metadata_file))
    resources = [add_name(read(file)) for file in get_files('resource')]
    model = get_files('model')

    descriptor['resources'] = resources
    if model and len(model) == 1:
        descriptor['model'] = model.pop()

    return DataPackage(descriptor)
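A usage sketch for the assembler; the path is made up and only illustrates the expected 'metadata' naming convention:

package = assemble('descriptors/metadata.yaml')  # made-up path
print(package.descriptor['name'])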
Example No. 7
def split_resource_per_year(in_file, out_file):
    dp = Package(in_file)

    if not _is_valid(dp):
        logging.warning(
            'DataPackage doesn\'t comply with our prerequisites. Ignoring it.'
        )
    else:
        resource = dp.resources[0]
        resources_per_year = _split_rows_per_year(resource)

        if len(resources_per_year.keys()) <= 1:
            logging.info(
                'Skipping creation of resources per year,'
                ' as there is only data for a single fiscal year'
            )
        else:
            default_resource_descriptor = _clean_resource_descriptor(resource)

            for year, resource_data in sorted(resources_per_year.items(), reverse=True):  # noqa: E501
                # Make sure all rows are written to the filesystem
                resource_data['fp'].flush()

                descriptor = copy.deepcopy(default_resource_descriptor)
                descriptor.update({
                    'name': str(year),
                    'path': resource_data['fp'].name,
                    'count_of_rows': resource_data['count_of_rows'],
                    'profile': 'tabular-data-resource',
                })
                dp.descriptor['resources'].append(descriptor)

                logging.info(
                    'Created resource for year %s (%d rows)',
                    year,
                    resource_data['count_of_rows']
                )

    # FIXME: Remove this whenever we're able to create and save datapackages
    # without safe resources.
    # See https://github.com/frictionlessdata/datapackage-py/issues/171
    with mock.patch('datapackage.helpers.is_safe_path', return_value=True):
        dp.commit()
        dp.save(out_file)

    return dp
Example No. 8
def load_data(pkgdir, engine):
    dpo = DataPackage(pkgdir)
    schema = dpo.resources[0].schema
    # Resolve the resource path relative to the package directory
    csvpath = os.path.join(pkgdir, dpo.resources[0].path)
    with open(csvpath) as csvfile:
        data = list(csv.DictReader(csvfile))
    table = SchemaTable(engine, 'table', schema)
    table.create()
    table.load_iter(data)
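A sketch of how this loader might be driven, assuming SchemaTable writes through a SQLAlchemy engine; the SQLite URL and package directory are placeholders:

from sqlalchemy import create_engine

engine = create_engine('sqlite:///example.db')  # placeholder database
load_data('my-datapackage/', engine)            # placeholder package directory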
Example No. 9
 def __init__(self, datapackage_dir, with_dependencies):
     self.datapackage_dir = datapackage_dir
     self.with_dependencies = with_dependencies
     datapackage_descriptor_file = os.path.join(datapackage_dir,
                                                "datapackage.json")
     with open(datapackage_descriptor_file) as f:
         descriptor = json.load(f)
     self.fix_descriptor(descriptor)
     self.datapackage = DataPackage(descriptor,
                                    default_base_path=self.datapackage_dir)
Example No. 10
 def _get_load_resources(self):
     if not hasattr(self, "_load_resources"):
         self._load_resources = []
         for load_resource in self._parameters["load-resources"]:
             if os.path.exists(load_resource["url"]):
                 datapackage = DataPackage(load_resource["url"])
                 for resource in datapackage.resources:
                     if resource.descriptor["name"] == load_resource["resource"]:
                         self._load_resources.append(resource)
     return self._load_resources
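For reference, the 'load-resources' parameter this helper iterates over would look roughly like this; all values are illustrative:

parameters = {
    'load-resources': [
        {'url': 'path/to/datapackage.json',  # illustrative descriptor path
         'resource': 'my-resource'},         # illustrative resource name
    ],
}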
Example No. 11
 def __new__(mcls, name, bases, attrs):
     cls = super(BaseMeta, mcls).__new__(mcls, name, bases, attrs)
     datapackage = attrs.get('__datapackage__')
     if datapackage:
         if isinstance(datapackage, basestring):
             datapackage = DataPackage(unicode(datapackage))
         resource_name = unicode(attrs.get('__resource__'))
         metadata = attrs.get('__metadata__', metadata_)
         mapper(cls, datapackage, resource_name, metadata)
         cls.__queryset__ = SQLAlchemyQuerySet
     return cls
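A sketch of a model class driven by this metaclass, written in the same Python 2 style; the descriptor path and resource name are made up for illustration:

class City(object):
    __metaclass__ = BaseMeta                # hypothetical: attach the metaclass directly
    __datapackage__ = 'datapackage.json'    # made-up descriptor path
    __resource__ = 'cities'                 # made-up resource name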
Example No. 12
    def _load_dp(self, path):
        dppath = join(path, 'datapackage.json')

        # do we need to do this or is it done in datapackage library?
        if not exists(dppath):
            raise DpmException(
                'No Data Package found at %s. Did not find datapackage.json at %s'
                % (path, dppath))

        dp = DataPackage(dppath)
        return dp
Example No. 13
def import_package(storage, descriptor):
    """Import Data Package to storage.

    Parameters
    ----------
    storage: object
        Storage object.
    descriptor: str
        Path to descriptor.

    """

    # Init maps
    tables = []
    schemas = []
    datamap = {}
    mapping = {}

    # Init model
    model = DataPackage(descriptor)

    # Collect tables/schemas/data
    for resource in model.resources:
        name = resource.metadata.get('name', None)
        table = _convert_path(resource.metadata['path'], name)
        schema = resource.metadata['schema']
        data = resource.iter()
        tables.append(table)
        schemas.append(schema)
        datamap[table] = data
        if name is not None:
            mapping[name] = table
    schemas = _convert_schemas(mapping, schemas)

    # Create tables
    for table in tables:
        if storage.check(table):
            storage.delete(table)
    storage.create(tables, schemas)

    # Write data to tables
    for table in storage.tables:
        storage.write(table, datamap[table])
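A minimal driving sketch with a mocked storage backend, since the real storage object is not shown here; the descriptor path is a placeholder:

from unittest import mock

storage = mock.MagicMock()          # stand-in for a real storage backend
storage.check.return_value = False  # pretend no tables exist yet
storage.tables = []
import_package(storage, 'datapackage.json')  # placeholder descriptor path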
Example No. 14
def create_offline_client(paths, cachedir='.cached'):
    '''Establish an offline client for more up-to-date assessments than those published.'''
    import pandas as pd
    from datapackage import DataPackage
    all_pkgs = {}
    for path in paths:
        pkg = DataPackage(path)
        for resource in map(format_patch, pkg.resources):
            if resource.name not in all_pkgs:
                all_pkgs[resource.name] = {
                    'schema': resource.descriptor['schema'],
                    'data': []
                }
            #
            try:
                all_pkgs[resource.name]['data'] += resource.read(keyed=True)
            except Exception as e:
                print(
                    f"datapackage exception while reading from table: '{resource.name}'"
                )
                print(e.errors)
                raise e
    #
    joined_pkgs = {}
    for resource_name, resource in all_pkgs.items():
        if resource['data']:
            data = pd.DataFrame(resource['data'])
        else:
            data = pd.DataFrame([],
                                columns=[
                                    field['name']
                                    for field in resource['schema']['fields']
                                ])
        #
        for field in resource['schema']['fields']:
            if field['type'] == 'datetime':
                data[field['name']] = pd.to_datetime(data[field['name']],
                                                     utc=True)
            elif field['type'] in {'array', 'object'}:
                data[field['name']] = data[field['name']].apply(json.dumps)
        joined_pkgs[resource_name] = dict(resource, data=data)
    return DerivaCompatPkg(joined_pkgs, cachedir=cachedir)
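Usage sketch with placeholder descriptor paths:

client = create_offline_client(['pkg-a/datapackage.json',
                                'pkg-b/datapackage.json'])  # placeholder paths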
Example No. 15
def datavalidate(filepath, print_json):
    """
    Validate CSV file data, given its path, and print a validation report. If the
    file is a resource of the data package in the current directory, the schema from
    datapackage.json is used for validation; otherwise the schema is inferred
    automatically. If no file path is given, validate the data of all resources in
    datapackage.json.
    """
    inspector = goodtables.Inspector(infer_schema=True)

    if exists('datapackage.json'):
        dp = DataPackage('datapackage.json')
    else:
        dp = None

    if not filepath and not dp:
        echo(
            '[ERROR] please provide csv file path or run command inside a datapackage dir.'
        )
        sys.exit(1)

    if filepath:
        schema = None
        if dp:
            # Try to find schema in the datapackage.json
            for resource in dp.resources:
                if resource.local_data_path == abspath(filepath):
                    schema = resource.descriptor.get('schema')
                    break

        report = inspector.inspect(filepath, schema=schema)
    else:
        # Validate whole datapackage
        dprclient.validate_metadata(dp)
        report = dprclient.validate_data(dp)

    dprclient.print_inspection_report(report, print_json)
    if not report['valid']:
        sys.exit(1)
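Assuming this is exposed as a plain function (any click decorators are not shown here), a direct call would look like:

datavalidate('data/my-file.csv', print_json=False)  # placeholder CSV path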
Example No. 16
    def __init__(self, dp_url):
        self.datapackage = DataPackage(dp_url)
        self.resource = self.datapackage.resources[0]
        descriptor = self.resource.descriptor
        self.type_name = descriptor['name']

        self._schema = descriptor['schema']
        fields = self._schema['fields']

        try:
            self.keys = self._schema['primaryKey']
        except KeyError:
            logger.exception('Failed to load %s', dp_url)
            raise
        if isinstance(self.keys, str):
            self.keys = [self.keys]

        self.date_fields = {}
        self.range_structure = {}

        for field in fields:
            if field.get("es:time-range"):
                self.date_fields[field["es:time-range"]] = field["name"]

        try:
            self.scoring_column = next(iter(
                filter(lambda f: 'es:score-column' in f, fields),
            ))['name']
        except StopIteration:
            self.scoring_column = '<none>'
        self._mapping_generator = MappingGenerator()
        try:
            self.mapping, self.search_fields = self.build_mapping(self._schema)
        except: #noqa
            logger.exception('Failed to load %s', dp_url)
            raise
Example No. 17
 def simpsons_datapackage(self, simpsons_descriptor_path):
     datapackage = DataPackage(descriptor=simpsons_descriptor_path)
     for r in datapackage.resources:
         sanitize_resource_schema(r)
     return datapackage
Example No. 18
 def __init__(self, datapackage, databackend):
     if isinstance(datapackage, basestring):
         datapackage = DataPackage(unicode(datapackage))
     self.datapackage = datapackage
     self.models_maker = ModelsMaker(datapackage, backend=databackend)
     self._resources = {}
Example No. 19
def test_support_criteria_parser():
    dp = DataPackage({
        "name": "support-criteria",
        "resources": [{
            "name": "criteria",
            "dialect": {
                "delimiter": ",",
                "doubleQuote": True,
                "lineTerminator": "\r\n",
                "quoteChar": '"',
                "skipInitialSpace": False
            },
            "encoding": "utf-8",
            "format": "csv",
            "path": "tests/support/criteria.csv",
            "schema": {
                "fields": [
                    # the original support-criteria fields
                    {
                        "format": "%Y-%m-%d",
                        "name": "date",
                        "type": "date"
                    },
                    {
                        "name": "title",
                        "type": "string"
                    },
                    {
                        "name": "paper_type",
                        "type": "string"
                    },
                    {
                        "name": "office",
                        "type": "string"
                    },
                    {
                        "format": "uri",
                        "name": "pdf_url",
                        "type": "string"
                    },
                    # the expected data from the parser
                    {
                        "name": "expected_purpose",
                        "type": "string"
                    }
                ]
            }
        }]
    })
    i = 0
    num_parsed = 0
    num_expected_purposes = 0
    for i, row in enumerate(dp.resources[0].iter(keyed=True)):
        parsed_row = parse_row(row)
        if len(parsed_row["purpose"]
               ) > 0 and parsed_row["purpose"] != row["title"]:
            num_parsed += 1
        row_expected_purpose = row["expected_purpose"] if row[
            "expected_purpose"] else ""
        if len(row_expected_purpose) > 0:
            num_expected_purposes += 1
            assert parsed_row["purpose"] == row_expected_purpose, "{}".format({
                "i":
                i,
                "row":
                row,
                "parsed_row":
                parsed_row,
                "expected_purpose":
                row_expected_purpose
            })
    assert i == 358
    assert num_expected_purposes > 20, "not enough purposes were checked, might be something wrong with the criteria.csv file"
    assert num_parsed > 190, "not enough parsed rows"
Example No. 20
class FDPLoader(object):
    """
    Utility class for loading FDPs to the DB
    """
    def __init__(self, engine=None):
        if engine is None:
            self.engine = get_engine()
        else:
            self.engine = engine
        self.package = None
        self.model = None
        self.model_name = None
        self.dpo = None
        self.datapackage_name = None
        self.fullname = None
        self.registry = ModelRegistry()
        self.last_package_descriptor = None
        self.last_loading_success = None
        self.callback = noop

    def check_hashes(self, resource):
        logging.info('Checking hashes of currently loaded data')

        current_schema_hash = self.last_package_descriptor\
                                .get('resources', ({},))[0]\
                                .get('_schema_hash')
        logging.info('Loaded resource descriptor hash is %s',
                     current_schema_hash)

        new_schema_hash = dict((k, v) for k, v in resource.descriptor.items()
                               if not k.startswith('_'))
        new_schema_hash['_model'] = self.dpo.descriptor.get('model', {})
        new_schema_hash = json.dumps(new_schema_hash,
                                     sort_keys=True,
                                     ensure_ascii=True)
        new_schema_hash = new_schema_hash.encode('ascii')
        new_schema_hash = hashlib.md5(new_schema_hash).hexdigest()
        logging.info('Loading resource descriptor hash is %s', new_schema_hash)

        current_data_hash = self.last_package_descriptor \
            .get('resources', ({},))[0] \
            .get('_data_hash')
        logging.info('Loaded resource data hash is %s', current_data_hash)

        new_data_hash = None
        remote_url = resource.source
        if remote_url and remote_url.startswith('http'):
            response = requests.head(remote_url)
            new_data_hash = response.headers.get('etag')
        logging.info('Loading resource data hash is %s', new_data_hash)

        resource.descriptor['_schema_hash'] = new_schema_hash
        resource.descriptor['_data_hash'] = new_data_hash

        ret = (current_schema_hash != new_schema_hash) or\
              (current_data_hash != new_data_hash) or\
              (not self.last_loading_success)

        if ret:
            logging.info('Looks like stuff changed, loading data')
        else:
            logging.info(
                'Looks like nothing major changed, skipping data load')

        return ret

    def status_update(self, **kwargs):
        if self.model_name is not None:
            try:
                _name, _, _package, _model, _dataset, \
                _author, _loading_status, _loaded = \
                    self.registry.get_raw(self.model_name)
                if self.last_package_descriptor is None:
                    self.last_package_descriptor = _package
                if self.last_loading_success is None:
                    self.last_loading_success = _loading_status == STATUS_DONE
            except KeyError:
                _name = self.model_name
                _package = {}
                _model = {}
                _dataset = ''
                _author = ''
                _loading_status = None
                _loaded = False
                self.last_package_descriptor = {}
                self.last_loading_success = False

            if self.model is not None:
                _model = self.model
            if self.dpo is not None:
                _package = self.dpo.descriptor
            if self.datapackage_name is not None:
                _dataset = self.datapackage_name
            if self.fullname is not None:
                _author = self.fullname
            status = kwargs.get('status')
            if status is not None:
                _loading_status = status
                _loaded = status == STATUS_DONE
            self.registry.save_model(_name, self.package, _package, _model,
                                     _dataset, _author, _loading_status,
                                     _loaded)
        self.callback(**kwargs)

    def load_fdp_to_db(self, package, callback=noop):
        """
        Load an FDP to the database, create a babbage model and save it as well
        :param package: URL for the datapackage.json
        :param callback: callback to use to send progress updates
        """

        self.callback = callback
        self.package = package

        # Load and validate the datapackage
        self.status_update(status=STATUS_LOADING_DATAPACKAGE)
        self.dpo = DataPackage(package)
        self.status_update(status=STATUS_VALIDATING_DATAPACKAGE)
        self.dpo.validate()
        self.status_update(status=STATUS_LOADING_RESOURCE)
        resource = self.dpo.resources[0]
        schema = resource.descriptor['schema']

        # Use the cube manager to get the table name
        self.datapackage_name = self.dpo.descriptor['name']
        datapackage_owner = self.dpo.descriptor['owner']
        datapackage_author = self.dpo.descriptor['author']

        # Get the full name from the author field, and rewrite it without the email
        self.fullname, email_addr = email.utils.parseaddr(datapackage_author)
        email_addr = email_addr.split('@')[0] + '@not.shown'
        self.dpo.descriptor['author'] = '{0} <{1}>'.format(
            self.fullname, email_addr)
        self.dpo.descriptor.setdefault('private', True)

        self.model_name = "{0}:{1}".format(datapackage_owner,
                                           self.datapackage_name)
        table_name = table_name_for_package(datapackage_owner,
                                            self.datapackage_name)

        try:
            all_fields = set()
            field_translation = {}
            # Process schema - slugify field names
            for field in schema['fields']:
                name = database_name(field['name'], all_fields)
                all_fields.add(name)
                translated_field = {'name': name, 'type': field['type']}
                field_translation[field['name']] = translated_field

            storage_schema = {
                'fields': [{
                    'type': f['type'],
                    'name': field_translation[f['name']]['name'],
                    'format': f.get('format', 'default')
                } for f in schema['fields']],
                # Babbage likes just one primary key
                'primaryKey': '_id'
            }

            # Add Primary key to schema
            storage_schema['fields'].insert(0, {
                'name': '_id',
                'type': 'integer'
            })

            # Create Babbage Model
            self.status_update(status=STATUS_CREATING_BABBAGE_MODEL)
            self.model = fdp_to_model(self.dpo, table_name, resource,
                                      field_translation)

            if self.check_hashes(resource):
                # Create indexes
                indexes = set()
                primary_keys = schema.get('primaryKey', [])
                dimensions = self.dpo.descriptor.get('model', {}).get('dimensions', {})
                for dim in dimensions.values():
                    attributes = dim.get('attributes', {})
                    for attribute in attributes.values():
                        source = attribute.get('source')
                        if source in primary_keys:
                            indexes.add((field_translation[source]['name'], ))
                        labelfor = attribute.get('labelfor')
                        if labelfor is not None:
                            labelfor = attributes.get(labelfor, {})
                            labelfor_source = labelfor.get('source')
                            if labelfor_source in primary_keys:
                                indexes.add((
                                    field_translation[labelfor_source]['name'],
                                    field_translation[source]['name'],
                                ))
                indexes = list(indexes)
                logging.error('INDEXES: %r', indexes)
                #
                # if dim['label'] in primary_keys:
                #     key_field = dim['attributes'][dim['key_attribute']]['label']
                #     key_field = field_translation[key_field]['name']
                #     indexes.append((key_field,))
                #
                #     label_field = dim['attributes'].get(dim.get('label_attribute'), {}).get('label')
                #     if label_field is not None:
                #         label_field = field_translation[label_field]['name']
                #         if label_field != key_field:
                #             indexes.append((key_field, label_field))

                # Load 1st resource data into DB
                # We use the prefix name so that JTS-SQL doesn't load all table data into memory
                storage = Storage(self.engine, prefix=table_name)
                faux_table_name = ''
                if faux_table_name in storage.buckets:
                    self.status_update(status=STATUS_DELETING_TABLE)
                    storage.delete(faux_table_name)
                self.status_update(status=STATUS_CREATING_TABLE)
                indexes_fields = None
                if indexes:
                    indexes_fields = [indexes]
                storage.create(faux_table_name,
                               storage_schema,
                               indexes_fields=indexes_fields)

                self.status_update(status=STATUS_LOADING_DATA_READY)
                row_processor = RowProcessor(resource.iter(keyed=True),
                                             self.status_update, schema,
                                             self.dpo.descriptor)
                storage.write(faux_table_name, row_processor.iter())

                cache = get_os_cache()
                if cache is not None:
                    logging.info('Clearing cache for context=%s',
                                 self.model_name)
                    cache.clear(self.model_name)

            response = {
                'model_name': self.model_name,
                'babbage_model': self.model,
                'package': self.dpo.descriptor
            }
            self.status_update(status=STATUS_DONE, data=response)

        except Exception as e:
            logging.exception('LOADING FAILED')
            self.status_update(status=STATUS_FAIL,
                               error=traceback.format_exc())
            return False

        return True
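A minimal driving sketch for the loader class above; the package URL is a placeholder and the default engine and callback are used:

loader = FDPLoader()  # uses the engine returned by get_engine()
ok = loader.load_fdp_to_db('http://example.org/datapackage.json')  # placeholder URL
print('loaded' if ok else 'failed')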
Example No. 21
 def simpsons_broken_datapackage(self, simpsons_broken_descriptor_path):
     return DataPackage(descriptor=simpsons_broken_descriptor_path)
Example No. 22
    def load_fdp_to_db(package, engine=None, callback=None):
        """
        Load an FDP to the database, create a babbage model and save it as well
        :param package: URL for the datapackage.json
        """

        # Load and validate the datapackage
        if engine is None:
            engine = get_engine()
        if callback is None:
            callback = noop
        callback(status=STATUS_LOADING_DATAPACKAGE)
        dpo = DataPackage(package, schema='fiscal')
        callback(status=STATUS_VALIDATING_DATAPACKAGE)
        dpo.validate()
        callback(status=STATUS_LOADING_RESOURCE)
        resource = dpo.resources[0]
        schema = resource.metadata['schema']

        # Use the cube manager to get the table name
        registry = ModelRegistry()
        datapackage_name = dpo.metadata['name']
        datapackage_owner = dpo.metadata['owner']
        datapackage_author = dpo.metadata['author']

        # Get the full name from the author field, and rewrite it without the email
        fullname, email_addr = email.utils.parseaddr(datapackage_author)
        email_addr = email_addr.split('@')[0] + '@not.shown'
        dpo.metadata['author'] = '{0} <{1}>'.format(fullname, email_addr)

        model_name = "{0}:{1}".format(datapackage_owner, datapackage_name)
        table_name = registry.table_name_for_package(datapackage_owner, datapackage_name)

        all_fields = set()
        field_translation = {}
        field_order = []
        # Process schema - slugify field names
        for field in schema['fields']:
            name = database_name(field['name'], all_fields)
            all_fields.add(name)
            translated_field = {
                'name': name,
                'type': field['type']
            }
            field_translation[field['name']] = translated_field
            field_order.append(field['name'])

        storage_schema = {
            'fields': [
                {
                    'type': f['type'],
                    'name': field_translation[f['name']]['name'],
                    'format': f.get('format', 'default')
                }
                for f in schema['fields']
                ],
            # Babbage likes just one primary key
            'primaryKey': '_id'
        }

        # Add Primary key to schema
        storage_schema['fields'].insert(0, {
            'name': '_id',
            'type': 'integer'
        })

        # Load 1st resource data into DB
        storage = Storage(engine)
        if storage.check(table_name):
            callback(status=STATUS_DELETING_TABLE)
            storage.delete(table_name)
        callback(status=STATUS_CREATING_TABLE)
        storage.create(table_name, storage_schema)
        callback(status=STATUS_LOADING_DATA_READY)
        storage.write(table_name, _translator_iterator(resource.iter(), field_order, callback))

        # Create Babbage Model
        callback(status=STATUS_CREATING_BABBAGE_MODEL)
        model = fdp_to_model(dpo, table_name, resource, field_translation)
        callback(status=STATUS_SAVING_METADATA)
        registry.save_model(model_name, package, dpo.metadata,
                            model, datapackage_name, fullname)
        return model_name, dpo.metadata, model
Example No. 23
    pipeline.register_processor('schema', options={'schema': schema})
    valid, report = pipeline.run()
    return valid, report


def validate_schema(package):
    try:
        package.validate()
        return []
    except (ValidationError, InvalidSchemaError, SchemaValidationError):
        for error in package.iter_errors():
            yield error.message


if __name__ == '__main__':
    package_ = DataPackage(SOURCE_SCHEMA)
    encoding_ = detect_encoding(SOURCE_DATA)
    errors_ = list(validate_schema(package_))

    source_df = read_excel(SOURCE_DATA, header=4, skiprows=range(0, 3))
    comment_lines = source_df.index[COMMENT_LINES]
    for i in COMMENT_LINES:
        print(list(source_df.iloc[i]))
    source_df = source_df.drop(comment_lines)
    source_csv = SOURCE_DATA.replace('xls', 'csv')
    source_df.to_csv(source_csv)

    odo(source_df, source_csv)

    if errors_:
        for message in errors_:
Example No. 24
def test_assemble_fiscal_datapackage_returns_a_valid_fiscal_descriptor():
    datapackage = assemble_fiscal_datapackage()
    # validate() raises an exception if validation fails
    assert DataPackage(datapackage, schema='fiscal').validate() is None
Example No. 25
    def load_fdp_to_db(package, engine=None, callback=None):
        """
        Load an FDP to the database, create a babbage model and save it as well
        :param package: URL for the datapackage.json
        :param engine: DB engine
        :param callback: callback to use to send progress updates
        """

        # Load and validate the datapackage
        if engine is None:
            engine = get_engine()
        if callback is None:
            callback = noop
        callback(status=STATUS_LOADING_DATAPACKAGE)
        dpo = DataPackage(package, schema='fiscal')
        callback(status=STATUS_VALIDATING_DATAPACKAGE)
        dpo.validate()
        callback(status=STATUS_LOADING_RESOURCE)
        resource = dpo.resources[0]
        schema = resource.descriptor['schema']

        # Use the cube manager to get the table name
        registry = ModelRegistry()
        datapackage_name = dpo.descriptor['name']
        datapackage_owner = dpo.descriptor['owner']
        datapackage_author = dpo.descriptor['author']

        # Get the full name from the author field, and rewrite it without the email
        fullname, email_addr = email.utils.parseaddr(datapackage_author)
        email_addr = email_addr.split('@')[0] + '@not.shown'
        dpo.descriptor['author'] = '{0} <{1}>'.format(fullname, email_addr)
        dpo.descriptor.setdefault('private', True)

        # Measure factors
        measures = dpo.descriptor.get('model', {}).get('measures', {})
        factors = {}
        for _, measure in measures.items():
            factor = measure.get('factor', 1)
            if factor != 1:
                factors[measure.get('source')] = factor

        model_name = "{0}:{1}".format(datapackage_owner, datapackage_name)
        table_name = table_name_for_package(datapackage_owner, datapackage_name)

        all_fields = set()
        field_translation = {}
        field_order = []
        # Process schema - slugify field names
        for field in schema['fields']:
            name = database_name(field['name'], all_fields)
            all_fields.add(name)
            translated_field = {
                'name': name,
                'type': field['type']
            }
            field_translation[field['name']] = translated_field
            field_order.append(field['name'])

        storage_schema = {
            'fields': [
                {
                    'type': f['type'],
                    'name': field_translation[f['name']]['name'],
                    'format': f.get('format', 'default')
                }
                for f in schema['fields']
                ],
            # Babbage likes just one primary key
            'primaryKey': '_id'
        }

        # Add Primary key to schema
        storage_schema['fields'].insert(0, {
            'name': '_id',
            'type': 'integer'
        })

        # Create Babbage Model
        callback(status=STATUS_CREATING_BABBAGE_MODEL)
        model = fdp_to_model(dpo, table_name, resource, field_translation)

        # Create indexes
        indexes = []
        primary_keys = resource.descriptor['schema'].get('primaryKey', [])
        for dim in model['dimensions'].values():
            if dim['label'] in primary_keys:
                key_field = dim['attributes'][dim['key_attribute']]['label']
                key_field = field_translation[key_field]['name']
                indexes.append((key_field,))

                label_field = dim['attributes'].get(dim.get('label_attribute'), {}).get('label')
                if label_field is not None:
                    label_field = field_translation[label_field]['name']
                    if label_field != key_field:
                        indexes.append((key_field, label_field))


        # Load 1st resource data into DB
        storage = Storage(engine)
        if storage.check(table_name):
            callback(status=STATUS_DELETING_TABLE)
            storage.delete(table_name)
        callback(status=STATUS_CREATING_TABLE)
        storage.create(table_name, storage_schema, indexes)

        callback(status=STATUS_LOADING_DATA_READY)
        storage.write(table_name, _translator_iterator(resource.iter(), field_order, factors, callback))

        callback(status=STATUS_SAVING_METADATA)
        registry.save_model(model_name, package, dpo.descriptor,
                            model, datapackage_name, fullname)
        return model_name, dpo.descriptor, model