def test_package_groups_save_to_sql_merge_groups():
    package = Package('data/datapackage-groups/datapackage.json')

    # Save to storage
    engine = sqlalchemy.create_engine('sqlite://')
    storage = Storage.connect('sql', engine=engine)
    package.save(storage=storage, merge_groups=True)

    # Check storage
    storage = Storage.connect('sql', engine=engine)
    assert storage.buckets == ['cars']
    assert storage.describe('cars') == {
        'fields': [
            {'name': 'name', 'type': 'string'},
            {'name': 'value', 'type': 'integer'},
        ],
    }
    assert storage.read('cars') == [
        ['bmw', 2016], ['tesla', 2016], ['nissan', 2016],
        ['bmw', 2017], ['tesla', 2017], ['nissan', 2017],
        ['bmw', 2018], ['tesla', 2018], ['nissan', 2018],
    ]
def test_package_groups_save_to_sql():
    package = Package('data/datapackage-groups/datapackage.json')

    # Save to storage
    engine = sqlalchemy.create_engine('sqlite://')
    storage = Storage.connect('sql', engine=engine)
    package.save(storage=storage)

    # Check storage
    storage = Storage.connect('sql', engine=engine)
    assert storage.buckets == ['cars_2016', 'cars_2017', 'cars_2018']
    for year in [2016, 2017, 2018]:
        assert storage.describe('cars_%s' % year) == {
            'fields': [
                {'name': 'name', 'type': 'string'},
                {'name': 'value', 'type': 'integer'},
            ],
        }
        assert storage.read('cars_%s' % year) == [
            ['bmw', year], ['tesla', year], ['nissan', year],
        ]
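# The two tests above assume a fixture at 'data/datapackage-groups/datapackage.json'
# in which three yearly resources share one group. A minimal sketch of that
# descriptor, inferred from the assertions; resource names and file paths are
# assumptions, not the actual fixture contents:
FIXTURE_DESCRIPTOR = {
    'name': 'datapackage-groups',
    'resources': [
        {
            'name': 'cars-%s' % year,      # slugified to the 'cars_<year>' buckets
            'group': 'cars',               # merged into one 'cars' bucket with merge_groups=True
            'path': 'cars-%s.csv' % year,  # assumed file layout
            'schema': {
                'fields': [
                    {'name': 'name', 'type': 'string'},
                    {'name': 'value', 'type': 'integer'},
                ],
            },
        }
        for year in [2016, 2017, 2018]
    ],
}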
def save(self, target=None, storage=None, **options):
    """https://github.com/frictionlessdata/datapackage-py#package
    """

    # Save package to storage
    if storage is not None:
        if not isinstance(storage, Storage):
            storage = Storage.connect(storage, **options)
        buckets = []
        schemas = []
        for resource in self.resources:
            if resource.tabular:
                resource.infer()
                buckets.append(_slugify_resource_name(resource.name))
                schemas.append(resource.schema.descriptor)
        schemas = list(map(_slugify_foreign_key, schemas))
        storage.create(buckets, schemas, force=True)
        for bucket in storage.buckets:
            resource = self.resources[buckets.index(bucket)]
            storage.write(bucket, resource.iter())

    # Save descriptor to json
    elif str(target).endswith('.json'):
        mode = 'w'
        encoding = 'utf-8'
        if six.PY2:
            mode = 'wb'
            encoding = None
        helpers.ensure_dir(target)
        with io.open(target, mode=mode, encoding=encoding) as file:
            json.dump(self.__current_descriptor, file, indent=4)

    # Save package to zip
    else:
        try:
            with zipfile.ZipFile(target, 'w') as z:
                descriptor = json.loads(json.dumps(self.__current_descriptor))
                for index, resource in enumerate(self.resources):
                    if not resource.name:
                        continue
                    if not resource.local:
                        continue
                    path = os.path.abspath(resource.source)
                    basename = resource.descriptor.get('name')
                    resource_format = resource.descriptor.get('format')
                    if resource_format:
                        basename = '.'.join([basename, resource_format.lower()])
                    path_inside_dp = os.path.join('data', basename)
                    z.write(path, path_inside_dp)
                    descriptor['resources'][index]['path'] = path_inside_dp
                z.writestr('datapackage.json', json.dumps(descriptor))
        except (IOError, zipfile.BadZipfile, zipfile.LargeZipFile) as exception:
            six.raise_from(exceptions.DataPackageException(exception), exception)

    return True
def test_table_sql(name, resource):

    # Storage
    engine = create_engine('sqlite:///')
    storage = Storage.connect('sql', engine=engine)

    # Save
    table = Table(resource['data'], schema=resource['schema'])
    table.save('table', storage=storage)

    # Load
    table = Table('table', schema=resource['schema'], storage=storage)
    assert table.read() == cast(resource)['data']
def test_table_pandas(name):

    # Storage
    storage = Storage.connect('pandas')

    # Save
    package = Package('data/packages/%s/datapackage.json' % name)
    package.save(storage=storage)

    # Load
    package = Package(storage=storage)
    assert package.resources
    for resource in package.resources:
        assert resource.read()
def test_table_sql(name):

    # Storage
    engine = create_engine('sqlite:///')
    storage = Storage.connect('sql', engine=engine)

    # Save
    package = Package('data/packages/%s/datapackage.json' % name)
    package.save(storage=storage)

    # Load
    package = Package(storage=storage)
    assert package.resources
    for resource in package.resources:
        assert resource.read()
def __init__(self,
             descriptor={},
             base_path=None,
             strict=False,
             unsafe=False,
             storage=None,
             # Internal
             package=None,
             **options):

    # Get base path
    if base_path is None:
        base_path = helpers.get_descriptor_base_path(descriptor)

    # Instantiate storage
    if storage and not isinstance(storage, Storage):
        storage = Storage.connect(storage, **options)

    # Process descriptor
    descriptor = helpers.retrieve_descriptor(descriptor)
    descriptor = helpers.dereference_resource_descriptor(descriptor, base_path)

    # Handle deprecated resource.path.url
    if descriptor.get('url'):
        warnings.warn(
            'Resource property "url: <url>" is deprecated. '
            'Please use "path: <url>" instead.',
            UserWarning)
        descriptor['path'] = descriptor['url']
        del descriptor['url']

    # Set attributes
    self.__current_descriptor = deepcopy(descriptor)
    self.__next_descriptor = deepcopy(descriptor)
    self.__base_path = base_path
    self.__package = package
    self.__storage = storage
    self.__relations = None
    self.__strict = strict
    self.__unsafe = unsafe
    self.__table = None
    self.__errors = []
    self.__table_options = options

    # Build resource
    self.__build()
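# A small usage sketch for the constructor above, showing the deprecated
# 'url' property being migrated to 'path'; the descriptor values are
# hypothetical, made up for illustration:
import warnings

from datapackage import Resource

descriptor = {'name': 'example', 'url': 'http://example.com/data.csv'}

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter('always')
    resource = Resource(descriptor)

assert resource.descriptor['path'] == 'http://example.com/data.csv'
assert any(w.category is UserWarning for w in caught)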
def test_table_bigquery(name, resource):

    # Storage
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '.credentials.json'
    credentials = GoogleCredentials.get_application_default()
    service = build('bigquery', 'v2', credentials=credentials)
    project = json.load(io.open('.credentials.json', encoding='utf-8'))['project_id']
    dataset = 'resource'
    prefix = '%s_' % uuid.uuid4().hex
    storage = Storage.connect(
        'bigquery', service=service, project=project, dataset=dataset, prefix=prefix)

    # Save
    table = Table(resource['data'], schema=resource['schema'])
    table.save('table', storage=storage)

    # Load
    table = Table('table', schema=resource['schema'], storage=storage)
    assert table.read() == cast(resource)['data']

    # Clean
    storage.delete()
def test_table_bigquery(name):

    # Storage
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '.credentials.json'
    credentials = GoogleCredentials.get_application_default()
    service = build('bigquery', 'v2', credentials=credentials)
    project = json.load(io.open('.credentials.json', encoding='utf-8'))['project_id']
    dataset = 'package'
    prefix = '%s_' % uuid.uuid4().hex
    storage = Storage.connect(
        'bigquery', service=service, project=project, dataset=dataset, prefix=prefix)

    # Save
    package = Package('data/packages/%s/datapackage.json' % name)
    package.save(storage=storage)

    # Load
    package = Package(storage=storage)
    assert package.resources
    for resource in package.resources:
        assert resource.read()

    # Clean
    storage.delete()
download_ftp_file(TEMPORARY_FOLDER, DOWNLOAD_URL)

# unpack the zipped file
with ZipFile(os.path.join(TEMPORARY_FOLDER, IBGE_FILE_NAME)) as pacote:
    with io.BytesIO(pacote.read('RELATORIO_DTB_BRASIL_MUNICIPIO.xls')) as f:
        df = pd.read_excel(f, dtype={
            'Nome_UF': 'category',
            'Nome_Mesorregião': 'category',
            'Nome_Microrregião': 'category'
        })

# get the state (UF) codes as the IBGE DTB file does not contain them
geographic = Storage.connect('pandas')
package = Package(os.path.join(OUTPUT_FOLDER, 'datapackage.json'))
package.save(storage=geographic)

# adjust column names and types
uf = geographic['uf'].rename(columns={'code': 'UF', 'abbr': 'Sigla_UF'})
uf.drop('name', axis=1, inplace=True)
uf['Sigla_UF'] = uf['Sigla_UF'].astype('category')

# merge back into the IBGE DTB data
df = df.merge(uf)

# clean and store auxiliary data
df.drop([
    'UF',
    'Nome_UF',
    'Mesorregião Geográfica',
    'Nome_Mesorregião',
with tqdm(total=len(codes)) as pbar:
    print(f'Crawling candidate URLs for {len(codes)} cities...')
    for chunk in in_chunks(codes, MAX_SIMULTANEOUS):
        results = pool.map(partial(verify_city_links, candidates), chunk)
        for result in results:
            for verified_link in result:
                goodlinks = goodlinks.append(verified_link, ignore_index=True)
        pbar.update(MAX_SIMULTANEOUS)

# record validated

# read schema
package = Package(os.path.join(OUTPUT_FOLDER, 'datapackage.json'))
r = package.get_resource('brazilian-municipality-and-state-websites')
valid_data = Storage.connect('pandas')
package.save(storage=valid_data)
df = valid_data[r.name.replace('-', '_')]  # bucket names use _ instead of -

# prepare column names
goodlinks.rename(columns={
    'uf': 'state_code',
    'code': 'municipality_code',
    'name': 'municipality',
    'link_type': 'branch',
    'link': 'url',
    'last_checked': 'last-verified-auto'
}, inplace=True)

# map values
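# The crawl loop above relies on an 'in_chunks' helper defined elsewhere in
# the script. A minimal sketch consistent with how it is called here (the
# real helper may differ):
def in_chunks(sequence, size):
    # Yield successive fixed-size slices of the sequence
    for start in range(0, len(sequence), size):
        yield sequence[start:start + size]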
def save(self, target=None, storage=None, merge_groups=False, **options):
    """Saves this data package

    It saves the package to storage if the `storage` argument is passed,
    saves this data package's descriptor to a json file if the `target`
    argument ends with `.json`, or saves this data package to a zip file
    otherwise.

    # Example

    It creates a zip file at ``target`` with the contents of this
    Data Package and its resources. Every resource whose content lives in
    the local filesystem will be copied into the zip file.
    Consider the following Data Package descriptor:

    ```json
    {
        "name": "gdp",
        "resources": [
            {"name": "local", "format": "CSV", "path": "data.csv"},
            {"name": "inline", "data": [4, 8, 15, 16, 23, 42]},
            {"name": "remote", "url": "http://someplace.com/data.csv"}
        ]
    }
    ```

    The final structure of the zip file will be:

    ```
    ./datapackage.json
    ./data/local.csv
    ```

    With the contents of `datapackage.json` being the same as returned by
    `datapackage.descriptor`. The resources' file names are generated based
    on their `name` and `format` fields if they exist.
    If the resource has no `name`, `resource-X` is used, where `X` is the
    index of the resource in the `resources` list (starting at zero).
    If the resource has `format`, it'll be lowercased and appended to the
    `name`, becoming "`name.format`".

    # Arguments
        target (string/filelike):
            the file path or a file-like object where
            the contents of this Data Package will be saved into.
        storage (str/tableschema.Storage):
            storage name like `sql` or storage instance
        merge_groups (bool):
            save all the group's tabular resources into one bucket
            if a storage is provided (for example into one SQL table).
            Read more about [Group](#group).
        options (dict):
            storage options to use for storage creation

    # Raises
        DataPackageException: raises if there was some error writing the package

    # Returns
        bool/Storage: on success return true or a `Storage` instance
    """

    # Save package to storage
    if storage is not None:
        if not isinstance(storage, Storage):
            storage = Storage.connect(storage, **options)
        buckets = []
        schemas = []
        sources = []
        group_names = []
        for resource in self.resources:
            if not resource.tabular:
                continue
            if merge_groups and resource.group:
                if resource.group in group_names:
                    continue
                group = self.get_group(resource.group)
                name = group.name
                schema = group.schema
                source = group.iter
                group_names.append(name)
            else:
                resource.infer()
                name = resource.name
                schema = resource.schema
                source = resource.iter
            buckets.append(_slugify_resource_name(name))
            schemas.append(schema.descriptor)
            sources.append(source)
        schemas = list(map(_slugify_foreign_key, schemas))
        storage.create(buckets, schemas, force=True)
        for bucket in storage.buckets:
            source = sources[buckets.index(bucket)]
            storage.write(bucket, source())
        return storage

    # Save descriptor to json
    elif str(target).endswith('.json'):
        mode = 'w'
        encoding = 'utf-8'
        if six.PY2:
            mode = 'wb'
            encoding = None
        helpers.ensure_dir(target)
        with io.open(target, mode=mode, encoding=encoding) as file:
            json.dump(self.__current_descriptor, file, indent=4)

    # Save package to zip
    else:
        try:
            with zipfile.ZipFile(target, 'w') as z:
                descriptor = json.loads(json.dumps(self.__current_descriptor))
                for index, resource in enumerate(self.resources):
                    if not resource.name:
                        continue
                    if not resource.local:
                        continue
                    path = os.path.abspath(resource.source)
                    basename = resource.descriptor.get('name')
                    resource_format = resource.descriptor.get('format')
                    if resource_format:
                        basename = '.'.join([basename, resource_format.lower()])
                    path_inside_dp = os.path.join('data', basename)
                    z.write(path, path_inside_dp)
                    descriptor['resources'][index]['path'] = path_inside_dp
                z.writestr('datapackage.json', json.dumps(descriptor))
        except (IOError, zipfile.BadZipfile, zipfile.LargeZipFile) as exception:
            six.raise_from(exceptions.DataPackageException(exception), exception)

    return True
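# A minimal usage sketch for the merge-groups path of save() above, reusing
# the fixture from the group tests at the top of this section; any descriptor
# whose tabular resources share a 'group' property behaves the same way:
import sqlalchemy
from tableschema import Storage

from datapackage import Package

engine = sqlalchemy.create_engine('sqlite://')
package = Package('data/datapackage-groups/datapackage.json')
storage = package.save(storage=Storage.connect('sql', engine=engine),
                       merge_groups=True)  # returns the Storage instance
print(storage.buckets)  # ['cars'] - one merged bucket per group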
def __init__(self, descriptor=None, base_path=None, strict=False, storage=None,
             # Deprecated
             schema=None, default_base_path=None, **options):
    """https://github.com/frictionlessdata/datapackage-py#package
    """

    # Handle deprecated schema argument
    if schema is not None:
        warnings.warn(
            'Argument "schema" is deprecated. '
            'Please use "descriptor.profile" property.',
            UserWarning)
        if isinstance(schema, six.string_types):
            if schema in ['base', 'default']:
                schema = 'data-package'
            elif schema == 'tabular':
                schema = 'tabular-data-package'
            elif schema == 'fiscal':
                schema = 'fiscal-data-package'
            descriptor['profile'] = schema

    # Handle deprecated default_base_path argument
    if default_base_path is not None:
        warnings.warn(
            'Argument "default_base_path" is deprecated. '
            'Please use "base_path" argument.',
            UserWarning)
        base_path = default_base_path

    # Extract from zip
    tempdir, descriptor = _extract_zip_if_possible(descriptor)
    if tempdir:
        self.__tempdir = tempdir

    # Get base path
    if base_path is None:
        base_path = helpers.get_descriptor_base_path(descriptor)

    # Instantiate storage
    if storage and not isinstance(storage, Storage):
        storage = Storage.connect(storage, **options)

    # Get descriptor from storage
    if storage and not descriptor:
        descriptor = {'resources': []}
        for bucket in storage.buckets:
            descriptor['resources'].append({'path': bucket})

    # Process descriptor
    descriptor = helpers.retrieve_descriptor(descriptor)
    descriptor = helpers.dereference_package_descriptor(descriptor, base_path)

    # Handle deprecated resource.path/url
    for resource in descriptor.get('resources', []):
        url = resource.pop('url', None)
        if url is not None:
            warnings.warn(
                'Resource property "url: <url>" is deprecated. '
                'Please use "path: [url]" instead (as array).',
                UserWarning)
            resource['path'] = [url]

    # Set attributes
    self.__current_descriptor = deepcopy(descriptor)
    self.__next_descriptor = deepcopy(descriptor)
    self.__base_path = base_path
    self.__storage = storage
    self.__strict = strict
    self.__resources = []
    self.__errors = []

    # Build package
    self.__build()
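# A round-trip sketch of the storage branch in __init__ above: when 'storage'
# is passed without a descriptor, a descriptor with one resource per bucket
# is synthesized, mirroring the pandas/sql tests earlier in this section
# ('data/datapackage.json' is a placeholder path):
from tableschema import Storage

from datapackage import Package

storage = Storage.connect('pandas')
Package('data/datapackage.json').save(storage=storage)

package = Package(storage=storage)  # descriptor synthesized from buckets
for resource in package.resources:
    print(resource.name, len(resource.read()))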