Example #1
def test_package_groups_save_to_sql_merge_groups():
    package = Package('data/datapackage-groups/datapackage.json')

    # Save to storage
    engine = sqlalchemy.create_engine('sqlite://')
    storage = Storage.connect('sql', engine=engine)
    package.save(storage=storage, merge_groups=True)

    # Check storage
    storage = Storage.connect('sql', engine=engine)
    assert storage.buckets == ['cars']
    assert storage.describe('cars') == {
        'fields': [
            {
                'name': 'name',
                'type': 'string'
            },
            {
                'name': 'value',
                'type': 'integer'
            },
        ],
    }
    assert storage.read('cars') == [
        ['bmw', 2016],
        ['tesla', 2016],
        ['nissan', 2016],
        ['bmw', 2017],
        ['tesla', 2017],
        ['nissan', 2017],
        ['bmw', 2018],
        ['tesla', 2018],
        ['nissan', 2018],
    ]
Example #2
def test_package_groups_save_to_sql():
    package = Package('data/datapackage-groups/datapackage.json')

    # Save to storage
    engine = sqlalchemy.create_engine('sqlite://')
    storage = Storage.connect('sql', engine=engine)
    package.save(storage=storage)

    # Check storage
    storage = Storage.connect('sql', engine=engine)
    assert storage.buckets == ['cars_2016', 'cars_2017', 'cars_2018']
    for year in [2016, 2017, 2018]:
        assert storage.describe('cars_%s' % year) == {
            'fields': [
                {
                    'name': 'name',
                    'type': 'string'
                },
                {
                    'name': 'value',
                    'type': 'integer'
                },
            ],
        }
        assert storage.read('cars_%s' % year) == [
            ['bmw', year],
            ['tesla', year],
            ['nissan', year],
        ]
Example #3
    def save(self, target=None, storage=None, **options):
        """https://github.com/frictionlessdata/datapackage-py#package
        """

        # Save package to storage
        if storage is not None:
            if not isinstance(storage, Storage):
                storage = Storage.connect(storage, **options)
            buckets = []
            schemas = []
            for resource in self.resources:
                if resource.tabular:
                    resource.infer()
                    buckets.append(_slugify_resource_name(resource.name))
                    schemas.append(resource.schema.descriptor)
            schemas = list(map(_slugify_foreign_key, schemas))
            storage.create(buckets, schemas, force=True)
            for bucket in storage.buckets:
                resource = self.resources[buckets.index(bucket)]
                storage.write(bucket, resource.iter())

        # Save descriptor to json
        elif str(target).endswith('.json'):
            mode = 'w'
            encoding = 'utf-8'
            if six.PY2:
                mode = 'wb'
                encoding = None
            helpers.ensure_dir(target)
            with io.open(target, mode=mode, encoding=encoding) as file:
                json.dump(self.__current_descriptor, file, indent=4)

        # Save package to zip
        else:
            try:
                with zipfile.ZipFile(target, 'w') as z:
                    descriptor = json.loads(
                        json.dumps(self.__current_descriptor))
                    for index, resource in enumerate(self.resources):
                        if not resource.name:
                            continue
                        if not resource.local:
                            continue
                        path = os.path.abspath(resource.source)
                        basename = resource.descriptor.get('name')
                        resource_format = resource.descriptor.get('format')
                        if resource_format:
                            basename = '.'.join(
                                [basename, resource_format.lower()])
                        path_inside_dp = os.path.join('data', basename)
                        z.write(path, path_inside_dp)
                        descriptor['resources'][index]['path'] = path_inside_dp
                    z.writestr('datapackage.json', json.dumps(descriptor))
            except (IOError, zipfile.BadZipfile,
                    zipfile.LargeZipFile) as exception:
                six.raise_from(exceptions.DataPackageException(exception),
                               exception)

        return True
Example #4
    def save(self, target=None, storage=None, **options):
        """https://github.com/frictionlessdata/datapackage-py#package
        """

        # Save package to storage
        if storage is not None:
            if not isinstance(storage, Storage):
                storage = Storage.connect(storage, **options)
            buckets = []
            schemas = []
            for resource in self.resources:
                if resource.tabular:
                    resource.infer()
                    buckets.append(_slugify_resource_name(resource.name))
                    schemas.append(resource.schema.descriptor)
            schemas = list(map(_slugify_foreign_key, schemas))
            storage.create(buckets, schemas, force=True)
            for bucket in storage.buckets:
                resource = self.resources[buckets.index(bucket)]
                storage.write(bucket, resource.iter())

        # Save descriptor to json
        elif str(target).endswith('.json'):
            mode = 'w'
            encoding = 'utf-8'
            if six.PY2:
                mode = 'wb'
                encoding = None
            helpers.ensure_dir(target)
            with io.open(target, mode=mode, encoding=encoding) as file:
                json.dump(self.__current_descriptor, file, indent=4)

        # Save package to zip
        else:
            try:
                with zipfile.ZipFile(target, 'w') as z:
                    descriptor = json.loads(json.dumps(self.__current_descriptor))
                    for index, resource in enumerate(self.resources):
                        if not resource.name:
                            continue
                        if not resource.local:
                            continue
                        path = os.path.abspath(resource.source)
                        basename = resource.descriptor.get('name')
                        resource_format = resource.descriptor.get('format')
                        if resource_format:
                            basename = '.'.join([basename, resource_format.lower()])
                        path_inside_dp = os.path.join('data', basename)
                        z.write(path, path_inside_dp)
                        descriptor['resources'][index]['path'] = path_inside_dp
                    z.writestr('datapackage.json', json.dumps(descriptor))
            except (IOError, zipfile.BadZipfile, zipfile.LargeZipFile) as exception:
                six.raise_from(exceptions.DataPackageException(exception), exception)

        return True
Example #5
def test_table_sql(name, resource):

    # Storage
    engine = create_engine('sqlite:///')
    storage = Storage.connect('sql', engine=engine)

    # Save
    table = Table(resource['data'], schema=resource['schema'])
    table.save('table', storage=storage)

    # Load
    table = Table('table', schema=resource['schema'], storage=storage)
    assert table.read() == cast(resource)['data']
Example #6
def test_table_pandas(name):

    # Storage
    storage = Storage.connect('pandas')

    # Save
    package = Package('data/packages/%s/datapackage.json' % name)
    package.save(storage=storage)

    # Load
    package = Package(storage=storage)
    assert package.resources
    for resource in package.resources:
        assert resource.read()
Example #7
def test_table_sql(name):

    # Storage
    engine = create_engine('sqlite:///')
    storage = Storage.connect('sql', engine=engine)

    # Save
    package = Package('data/packages/%s/datapackage.json' % name)
    package.save(storage=storage)

    # Load
    package = Package(storage=storage)
    assert package.resources
    for resource in package.resources:
        assert resource.read()
Example #8
    def __init__(
            self,
            descriptor={},
            base_path=None,
            strict=False,
            unsafe=False,
            storage=None,
            # Internal
            package=None,
            **options):

        # Get base path
        if base_path is None:
            base_path = helpers.get_descriptor_base_path(descriptor)

        # Instantiate storage
        if storage and not isinstance(storage, Storage):
            storage = Storage.connect(storage, **options)

        # Process descriptor
        descriptor = helpers.retrieve_descriptor(descriptor)
        descriptor = helpers.dereference_resource_descriptor(
            descriptor, base_path)

        # Handle deprecated resource.path.url
        if descriptor.get('url'):
            warnings.warn(
                'Resource property "url: <url>" is deprecated. '
                'Please use "path: <url>" instead.', UserWarning)
            descriptor['path'] = descriptor['url']
            del descriptor['url']

        # Set attributes
        self.__current_descriptor = deepcopy(descriptor)
        self.__next_descriptor = deepcopy(descriptor)
        self.__base_path = base_path
        self.__package = package
        self.__storage = storage
        self.__relations = None
        self.__strict = strict
        self.__unsafe = unsafe
        self.__table = None
        self.__errors = []
        self.__table_options = options

        # Build resource
        self.__build()
Example #9
def test_table_bigquery(name, resource):

    # Storage
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '.credentials.json'
    credentials = GoogleCredentials.get_application_default()
    service = build('bigquery', 'v2', credentials=credentials)
    project = json.load(io.open('.credentials.json', encoding='utf-8'))['project_id']
    dataset = 'resource'
    prefix = '%s_' % uuid.uuid4().hex
    storage = Storage.connect('bigquery',
        service=service, project=project, dataset=dataset, prefix=prefix)

    # Save
    table = Table(resource['data'], schema=resource['schema'])
    table.save('table', storage=storage)

    # Load
    table = Table('table', schema=resource['schema'], storage=storage)
    assert table.read() == cast(resource)['data']

    # Clean
    storage.delete()
Example #10
def test_table_bigquery(name):

    # Storage
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '.credentials.json'
    credentials = GoogleCredentials.get_application_default()
    service = build('bigquery', 'v2', credentials=credentials)
    project = json.load(io.open('.credentials.json', encoding='utf-8'))['project_id']
    dataset = 'package'
    prefix = '%s_' % uuid.uuid4().hex
    storage = Storage.connect('bigquery',
        service=service, project=project, dataset=dataset, prefix=prefix)

    # Save
    package = Package('data/packages/%s/datapackage.json' % name)
    package.save(storage=storage)

    # Load
    package = Package(storage=storage)
    assert package.resources
    for resource in package.resources:
        assert resource.read()

    # Clean
    storage.delete()
Example #11
    download_ftp_file(TEMPORARY_FOLDER, DOWNLOAD_URL)

# unpack the zipped file

with ZipFile(os.path.join(TEMPORARY_FOLDER, IBGE_FILE_NAME)) as pacote:
    with io.BytesIO(pacote.read('RELATORIO_DTB_BRASIL_MUNICIPIO.xls')) as f:
        df = pd.read_excel(f,
                           dtype={
                               'Nome_UF': 'category',
                               'Nome_Mesorregião': 'category',
                               'Nome_Microrregião': 'category'
                           })

# get the state (UF) codes as the IBGE DTB file does not contain them

geographic = Storage.connect('pandas')
package = Package(os.path.join(OUTPUT_FOLDER, 'datapackage.json'))
package.save(storage=geographic)

# adjust column names and types
uf = geographic['uf'].rename(columns={'code': 'UF', 'abbr': 'Sigla_UF'})
uf.drop('name', axis=1, inplace=True)
uf['Sigla_UF'] = uf['Sigla_UF'].astype('category')

# merge back into the IBGE DTB data
df = df.merge(uf)

# clean and store auxiliary data

df.drop([
    'UF', 'Nome_UF', 'Mesorregião Geográfica', 'Nome_Mesorregião',
Example #12
with tqdm(total=len(codes)) as pbar:
    print(f'Crawling candidate URLs for {len(codes)} cities...')
    for chunk in in_chunks(codes, MAX_SIMULTANEOUS):
        results = pool.map(partial(verify_city_links, candidates), chunk)
        for result in results:
            for verified_link in result:
                goodlinks = goodlinks.append(verified_link, ignore_index=True)
        pbar.update(MAX_SIMULTANEOUS)

# record validated links

# read schema
package = Package(os.path.join(OUTPUT_FOLDER, 'datapackage.json'))
r = package.get_resource('brazilian-municipality-and-state-websites')
valid_data = Storage.connect('pandas')
package.save(storage=valid_data)
df = valid_data[r.name.replace('-', '_')]  # bucket names use _ instead of -

# prepare column names
goodlinks.rename(columns={
    'uf': 'state_code',
    'code': 'municipality_code',
    'name': 'municipality',
    'link_type': 'branch',
    'link': 'url',
    'last_checked': 'last-verified-auto'
}, inplace=True)

# map values
Example #13
    def __init__(
            self,
            descriptor=None,
            base_path=None,
            strict=False,
            storage=None,
            # Deprecated
            schema=None,
            default_base_path=None,
            **options):

        # Handle deprecated schema argument
        if schema is not None:
            warnings.warn(
                'Argument "schema" is deprecated. '
                'Please use "descriptor.profile" property.', UserWarning)
            if isinstance(schema, six.string_types):
                if schema in ['base', 'default']:
                    schema = 'data-package'
                elif schema == 'tabular':
                    schema = 'tabular-data-package'
                elif schema == 'fiscal':
                    schema = 'fiscal-data-package'
                descriptor['profile'] = schema

        # Handle deprecated default_base_path argument
        if default_base_path is not None:
            warnings.warn(
                'Argument "default_base_path" is deprecated. '
                'Please use "base_path" argument.', UserWarning)
            base_path = default_base_path

        # Extract from zip
        tempdir, descriptor = _extract_zip_if_possible(descriptor)
        if tempdir:
            self.__tempdir = tempdir

        # Get base path
        if base_path is None:
            base_path = helpers.get_descriptor_base_path(descriptor)

        # Instantiate storage
        if storage and not isinstance(storage, Storage):
            storage = Storage.connect(storage, **options)

        # Get descriptor from storage
        if storage and not descriptor:
            descriptor = {'resources': []}
            for bucket in storage.buckets:
                descriptor['resources'].append({'path': bucket})

        # Process descriptor
        descriptor = helpers.retrieve_descriptor(descriptor)
        descriptor = helpers.dereference_package_descriptor(
            descriptor, base_path)

        # Handle deprecated resource.path/url
        for resource in descriptor.get('resources', []):
            url = resource.pop('url', None)
            if url is not None:
                warnings.warn(
                    'Resource property "url: <url>" is deprecated. '
                    'Please use "path: [url]" instead (as array).',
                    UserWarning)
                resource['path'] = [url]

        # Set attributes
        self.__current_descriptor = deepcopy(descriptor)
        self.__next_descriptor = deepcopy(descriptor)
        self.__base_path = base_path
        self.__storage = storage
        self.__strict = strict
        self.__resources = []
        self.__errors = []

        # Build package
        self.__build()
Example #14
    def save(self, target=None, storage=None, merge_groups=False, **options):
        """Saves this data package

        It saves the package to storage if the `storage` argument is
        passed, saves the package's descriptor to a JSON file if the
        `target` argument ends with `.json`, and saves the package to a
        zip file otherwise.

        # Example

        It creates a zip file at `target` with the contents of this
        Data Package and its resources. Every resource whose contents
        live in the local filesystem will be copied into the zip file.
        Consider the following Data Package descriptor:

        ```json
        {
            "name": "gdp",
            "resources": [
                {"name": "local", "format": "CSV", "path": "data.csv"},
                {"name": "inline", "data": [4, 8, 15, 16, 23, 42]},
                {"name": "remote", "url": "http://someplace.com/data.csv"}
            ]
        }
        ```

        The final structure of the zip file will be:

        ```
        ./datapackage.json
        ./data/local.csv
        ```

        The contents of `datapackage.json` are the same as those returned
        by `datapackage.descriptor`. The resources' file names are
        generated from their `name` and `format` fields if they exist.
        If a resource has no `name`, `resource-X` is used, where `X` is
        the index of the resource in the `resources` list (starting at
        zero). If a resource has a `format`, it is lowercased and
        appended to the `name`, producing "`name.format`".

        # Arguments
            target (string/filelike):
                the file path or a file-like object where
                the contents of this Data Package will be saved into.
            storage (str/tableschema.Storage):
                storage name like `sql` or storage instance
            merge_groups (bool):
                save all of a group's tabular resources into one bucket
                if a storage is provided (for example into one SQL table).
                Read more about [Group](#group).
            options (dict):
                storage options to use for storage creation

        # Raises
            DataPackageException: raised if there was an error writing the package

        # Returns
            bool/Storage: on success, returns `True` or a `Storage` instance

        """

        # Save package to storage
        if storage is not None:
            if not isinstance(storage, Storage):
                storage = Storage.connect(storage, **options)
            buckets = []
            schemas = []
            sources = []
            group_names = []
            for resource in self.resources:
                if not resource.tabular:
                    continue
                if merge_groups and resource.group:
                    if resource.group in group_names:
                        continue
                    group = self.get_group(resource.group)
                    name = group.name
                    schema = group.schema
                    source = group.iter
                    group_names.append(name)
                else:
                    resource.infer()
                    name = resource.name
                    schema = resource.schema
                    source = resource.iter
                buckets.append(_slugify_resource_name(name))
                schemas.append(schema.descriptor)
                sources.append(source)
            schemas = list(map(_slugify_foreign_key, schemas))
            storage.create(buckets, schemas, force=True)
            for bucket in storage.buckets:
                source = sources[buckets.index(bucket)]
                storage.write(bucket, source())
            return storage

        # Save descriptor to json
        elif str(target).endswith('.json'):
            mode = 'w'
            encoding = 'utf-8'
            if six.PY2:
                mode = 'wb'
                encoding = None
            helpers.ensure_dir(target)
            with io.open(target, mode=mode, encoding=encoding) as file:
                json.dump(self.__current_descriptor, file, indent=4)

        # Save package to zip
        else:
            try:
                with zipfile.ZipFile(target, 'w') as z:
                    descriptor = json.loads(
                        json.dumps(self.__current_descriptor))
                    for index, resource in enumerate(self.resources):
                        if not resource.name:
                            continue
                        if not resource.local:
                            continue
                        path = os.path.abspath(resource.source)
                        basename = resource.descriptor.get('name')
                        resource_format = resource.descriptor.get('format')
                        if resource_format:
                            basename = '.'.join(
                                [basename, resource_format.lower()])
                        path_inside_dp = os.path.join('data', basename)
                        z.write(path, path_inside_dp)
                        descriptor['resources'][index]['path'] = path_inside_dp
                    z.writestr('datapackage.json', json.dumps(descriptor))
            except (IOError, zipfile.BadZipfile,
                    zipfile.LargeZipFile) as exception:
                six.raise_from(exceptions.DataPackageException(exception),
                               exception)

        return True
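The docstring above describes three dispatch paths for `save`. A minimal usage sketch of all three, assuming a local `datapackage.json` and an installed SQL storage plugin; the file names and the SQLite engine are illustrative, not from the source:

# Hypothetical usage sketch for the three save modes above
import sqlalchemy
from datapackage import Package

package = Package('datapackage.json')

# 1. Target ends with '.json': only the descriptor is written
package.save('build/datapackage.json')

# 2. A storage name (plus options) or instance: tabular resources
#    become buckets and the Storage instance is returned
engine = sqlalchemy.create_engine('sqlite://')
storage = package.save(storage='sql', engine=engine)

# 3. Any other target: the package and its local resources are zipped
package.save('package.zip')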
Example #15
    def __init__(self, descriptor=None, base_path=None, strict=False, storage=None,
                 # Deprecated
                 schema=None, default_base_path=None, **options):
        """https://github.com/frictionlessdata/datapackage-py#package
        """

        # Handle deprecated schema argument
        if schema is not None:
            warnings.warn(
                'Argument "schema" is deprecated. '
                'Please use "descriptor.profile" property.',
                UserWarning)
            if isinstance(schema, six.string_types):
                if schema in ['base', 'default']:
                    schema = 'data-package'
                elif schema == 'tabular':
                    schema = 'tabular-data-package'
                elif schema == 'fiscal':
                    schema = 'fiscal-data-package'
                descriptor['profile'] = schema

        # Handle deprecated default_base_path argument
        if default_base_path is not None:
            warnings.warn(
                'Argument "default_base_path" is deprecated. '
                'Please use "base_path" argument.',
                UserWarning)
            base_path = default_base_path

        # Extract from zip
        tempdir, descriptor = _extract_zip_if_possible(descriptor)
        if tempdir:
            self.__tempdir = tempdir

        # Get base path
        if base_path is None:
            base_path = helpers.get_descriptor_base_path(descriptor)

        # Instantiate storage
        if storage and not isinstance(storage, Storage):
            storage = Storage.connect(storage, **options)

        # Get descriptor from storage
        if storage and not descriptor:
            descriptor = {'resources': []}
            for bucket in storage.buckets:
                descriptor['resources'].append({'path': bucket})

        # Process descriptor
        descriptor = helpers.retrieve_descriptor(descriptor)
        descriptor = helpers.dereference_package_descriptor(descriptor, base_path)

        # Handle deprecated resource.path/url
        for resource in descriptor.get('resources', []):
            url = resource.pop('url', None)
            if url is not None:
                warnings.warn(
                    'Resource property "url: <url>" is deprecated. '
                    'Please use "path: [url]" instead (as array).',
                    UserWarning)
                resource['path'] = [url]

        # Set attributes
        self.__current_descriptor = deepcopy(descriptor)
        self.__next_descriptor = deepcopy(descriptor)
        self.__base_path = base_path
        self.__storage = storage
        self.__strict = strict
        self.__resources = []
        self.__errors = []

        # Build package
        self.__build()
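Complementing the constructor above: when `storage` is given without a descriptor, `__init__` synthesizes one resource per storage bucket, so a saved package can be loaded back, as Examples #6 and #7 do. A minimal round-trip sketch, with the SQLite engine and file name again illustrative:

# Hypothetical round-trip sketch based on the __init__ above
import sqlalchemy
from datapackage import Package

engine = sqlalchemy.create_engine('sqlite://')
Package('datapackage.json').save(storage='sql', engine=engine)

# Rebuild a package from the storage buckets alone
loaded = Package(storage='sql', engine=engine)
for resource in loaded.resources:
    print(resource.name, resource.read())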