def chinook_summaries(input_path, output_path='/pfs/out'):
    dp = datapackage.DataPackage(
        descriptor=os.path.join(input_path, 'datapackage.json'))
    dp_out = datapackage.DataPackage()
    dp_out.descriptor['name'] = 'chinook-summary'
    dp_out.descriptor['title'] = 'chinook-summary'
    dp_out.descriptor['description'] = 'Summary Stats from Chinook DB'
    dp_out.descriptor['x-visibility'] = 'PRIVATE'
    dp_out.descriptor['licenses'] = [{'name': 'Other'}]
    dp_out.descriptor['resources'] = []
    for r in dp.descriptor['resources']:
        print('Processing {} with format {}'.format(r['path'], r['format']))
        if r.get('format', '') == 'csv':
            print('Attempting stats for {} with format {}'.format(
                r['path'], r['format']))
            df = pd.read_csv(os.path.join(dp.base_path, r['path']))
            try:
                stats = df.describe()
                plot_name = os.path.basename('{}.txt'.format(r['name']))
                with open(os.path.join(output_path, plot_name), 'w') as f:
                    f.write(str(stats))
                dp_out.descriptor['resources'].append({
                    'name': plot_name,
                    'path': plot_name
                })
                print('Done generating stats for {} with format {}'.format(
                    r['path'], r['format']))
            except Exception:
                print('Failed to generate stats for {} with format {}'.format(
                    r['path'], r['format']))
    with open(os.path.join(output_path, 'datapackage.json'), 'w') as f:
        f.write(dp_out.to_json())
def towns_and_counties():
    towns = datapackage.DataPackage(
        'https://raw.githubusercontent.com/CT-Data-Collaborative/ct-town-list/master/datapackage.json'
    )
    counties = datapackage.DataPackage(
        'https://raw.githubusercontent.com/CT-Data-Collaborative/ct-county-list/master/datapackage.json'
    )
    town_name_list = [t['Town'] for t in towns.resources[0].data]
    county_name_list = [c['County'] for c in counties.resources[0].data]
    return town_name_list + county_name_list
def test_datapackage_only_requires_some_fields_to_be_valid(self):
    invalid_datapackage = datapackage.DataPackage({})
    valid_datapackage = datapackage.DataPackage({
        'name': 'gdp',
        'resources': [{
            'name': 'the-resource',
            'path': 'http://example.com/some-data.csv'
        }]
    })

    converter.datapackage_to_dataset(valid_datapackage)

    nose.tools.assert_raises(KeyError,
                             converter.datapackage_to_dataset,
                             invalid_datapackage)
def handle_datapackage(self, datapackage, parameters, stats):
    '''Create or update a ckan dataset from datapackage and parameters'''
    # core dataset properties
    dataset = {
        'title': '',
        'version': '',
        'state': 'active',
        'url': '',
        'notes': '',
        'license_id': '',
        'author': '',
        'author_email': '',
        'maintainer': '',
        'maintainer_email': '',
        'owner_org': None,
        'private': False
    }

    dp = datapackage_lib.DataPackage(datapackage)
    dataset.update(converter.datapackage_to_dataset(dp))

    self.__dataset_resources = dataset.get('resources', [])
    if self.__dataset_resources:
        del dataset['resources']

    # Merge dataset-properties from parameters into dataset.
    dataset_props_from_params = parameters.get('dataset-properties')
    if dataset_props_from_params:
        dataset.update(dataset_props_from_params)

    package_create_url = '{}/package_create'.format(self.__base_endpoint)
    response = make_ckan_request(package_create_url,
                                 method='POST',
                                 json=dataset,
                                 api_key=self.__ckan_api_key)

    ckan_error = get_ckan_error(response)
    if ckan_error \
            and parameters.get('overwrite_existing') \
            and 'That URL is already in use.' in ckan_error.get('name', []):
        package_update_url = \
            '{}/package_update'.format(self.__base_endpoint)
        log.info('CKAN dataset with url already exists. '
                 'Attempting package_update.')
        response = make_ckan_request(package_update_url,
                                     method='POST',
                                     json=dataset,
                                     api_key=self.__ckan_api_key)
        ckan_error = get_ckan_error(response)

    if ckan_error:
        log.exception('CKAN returned an error: ' + json.dumps(ckan_error))
        raise Exception

    if response['success']:
        self.__dataset_id = response['result']['id']
def test_base_path_is_set_to_base_url_when_datapackage_is_in_url(self):
    base_url = 'http://someplace.com/data'
    url = '{base_url}/datapackage.json'.format(base_url=base_url)
    body = '{}'
    httpretty.register_uri(httpretty.GET, url, body=body)

    dp = datapackage.DataPackage(url)

    assert dp.base_path == base_url
def test_descriptor_apply_defaults_resource_tabular_schema(self):
    descriptor = {
        'resources': [{
            'name': 'name',
            'data': 'data',
            'profile': 'tabular-data-resource',
            'schema': {
                'fields': [{'name': 'name'}],
            }
        }],
    }
    dp = datapackage.DataPackage(descriptor)
    assert dp.descriptor == {
        'profile': 'data-package',
        'resources': [{
            'name': 'name',
            'data': 'data',
            'profile': 'tabular-data-resource',
            'encoding': 'utf-8',
            'schema': {
                'fields': [
                    {'name': 'name', 'type': 'string', 'format': 'default'}],
                'missingValues': [''],
            }
        }],
    }
def test_attributes_can_be_set(self):
    descriptor = {
        'profile': 'data-package',
    }
    dp = datapackage.DataPackage(descriptor)
    dp.descriptor['title'] = 'bar'
    assert dp.to_dict() == {'profile': 'data-package', 'title': 'bar'}
def setUp(self):
    # GIVEN datapackage with one resource
    invalid_dp = datapackage.DataPackage({
        "name": "some-datapackage",
        "resources": [
            {
                "path": "invalid.csv",
                "schema": {
                    "fields": [
                        {
                            "name": "Price",
                            "type": "number"
                        },
                        {
                            # Column 2 name should be "Year"
                            "name": "Year",
                            "type": "date",
                            "format": "%Y"
                        }
                    ]
                }
            }
        ]
    })
    patch('dpm.main.exists', lambda *a: True).start()
    patch('dpm.main.DataPackage', lambda *a: invalid_dp).start()

    # AND a resource file whose header does not match the schema
    # (column 2 name should be 'Year')
    with open('invalid.csv', 'w') as f:
        f.write(
            'Price,Ugh\n'  # 'Ugh' != 'Year'
            '1,1980'
        )
def test_attributes_arent_immutable(self):
    metadata = {
        'keywords': [],
    }
    dp = datapackage.DataPackage(metadata)
    dp.metadata['keywords'].append('foo')
    assert dp.to_dict() == {'keywords': ['foo']}
def setUp(self):
    # GIVEN a datapackage that can be treated as valid by the dpm
    self.valid_dp = datapackage.DataPackage(
        {
            "name": "some-datapackage",
            "resources": [{
                "name": "some-resource",
                "path": "./data/some_data.csv",
            }]
        },
        default_base_path='.')
    patch('dpm.client.DataPackage', lambda *a: self.valid_dp).start()
    patch('dpm.client.exists', lambda *a: True).start()

    # AND a registry server that accepts any user
    responses.add(responses.POST, 'https://example.com/api/auth/token',
                  json={'token': 'blabla'}, status=200)
    # AND a registry server that accepts deletion of any datapackage
    responses.add(responses.DELETE,
                  'https://example.com/api/package/user/some-datapackage',
                  json={'message': 'OK'}, status=200)
    # AND a registry server that accepts purging of any datapackage
    responses.add(
        responses.DELETE,
        'https://example.com/api/package/user/some-datapackage/purge',
        json={'message': 'OK'}, status=200)
def create(reader, out_dir, log_level=None):
    logger = get_logger("createdp.create", level=log_level)
    top_level_dict = reader.conf[config.TOP_LEVEL]
    try:
        name = top_level_dict["name"]
    except KeyError:
        raise ValueError("'name' is a required property")
    if not NAME_PATTERN.match(name):
        raise ValueError("invalid name: %r" % (name,))
    dp = datapackage.DataPackage()
    for k, v in top_level_dict.items():
        dp.descriptor[k] = v
    dp.descriptor['resources'] = []
    mkdir_p(out_dir)
    logger.info("writing to '%s'", out_dir)
    for a in "objects", "links":
        out_bn = "%s.csv" % a
        out_fn = os.path.join(out_dir, out_bn)
        df = getattr(reader, a)
        df.to_csv(out_fn, index=False, quoting=csv.QUOTE_NONE)
        if a == "objects":
            name = cmso.OBJECTS_TABLE
            infer_kwargs = {"primary_key": cmso.OBJECT_ID}
        else:
            name = cmso.LINKS_TABLE
            infer_kwargs = {}
        schema = infer_from_df(df, **infer_kwargs)
        if a == "links":
            schema['foreignKeys'] = FOREIGN_KEYS
        res = {"name": name, "path": out_bn, "schema": schema}
        dp.descriptor['resources'].append(res)
    with open(os.path.join(out_dir, 'dp.json'), 'w') as f:
        f.write(to_json(dp) + '\n')
    return dp
def load_datapackage_file(datapackage_path):
    dp = datapackage.DataPackage(datapackage_path)
    dpdict = dp.descriptor
    # First we need to see what sort of author object we have. CTData only
    # publishes one author per dataset, but we want to support valid
    # datapackage.json files which allow for an array of them
    try:
        author_name = dpdict['author']['name']
        author = dpdict['author']
    except TypeError as e:
        author = dpdict['author'][0]
    try:
        upload_object = {'name': dpdict['name'],
                         'title': dpdict['title'],
                         'maintainer': author['name'],
                         'maintainer_email': author['email'],
                         'owner_org': dpdict['sources'][0]['name']}
    except KeyError as e:
        raise e
    try:
        dp.validate()
    except datapackage.exceptions.ValidationError as e:
        if e.instance == dpdict['author']:
            pass
        else:
            raise e
    try:
        upload_object['extras'] = get_extras_object(dpdict)
    except (KeyError, Exception) as e:
        raise e
    return dpdict, upload_object
def test_attributes_arent_immutable(self):
    descriptor = {
        'keywords': [],
    }
    dp = datapackage.DataPackage(descriptor)
    dp.descriptor['keywords'].append('foo')
    assert dp.to_dict() == {'keywords': ['foo']}
def initialize_datapackage(self, config):
    """Create a datapackage or return the existing one along with its path"""
    datapkg_file_path = config.get('datapackage_file', '')
    if not datapkg_file_path or not os.path.isabs(datapkg_file_path):
        datapkg_file_path = os.path.join(self.workspace_path,
                                         'datapackage.json')
    datapkg_file_path = os.path.abspath(datapkg_file_path)
    if not os.path.exists(datapkg_file_path):
        with io.open(datapkg_file_path, mode='w+',
                     encoding='utf-8') as new_datapkg:
            default_datapkg = utilities.get_default_datapackage()
            for resource in default_datapkg.resources:
                resource_path = config.get(resource.descriptor['name'],
                                           resource.descriptor['path'])
                resource.descriptor['path'] = os.path.join(
                    config['data_dir'], resource_path)
            json_datapkg = json.dumps(default_datapkg.to_dict(), indent=4)
            new_datapkg.write(compat.str(json_datapkg))
            print(('A new "datapackage.json" file has been created at {0}. '
                   'Please review and update it.'.format(datapkg_file_path)))
        return default_datapkg
    else:
        datapackage_check = DataPackageChecker(config)
        datapackage_check.run()
        return datapackage.DataPackage(datapkg_file_path)
def validate(self, dp):
    if isinstance(dp, datapackage.DataPackage) and not is_tabular(dp):
        raise ValueError("data package must be a tabular data package")
    else:
        dp = datapackage.DataPackage(dp, schema="tabular")
    dp.validate()
    self.logger.debug("valid tabular data package")
    if len(dp.resources) < 2:
        self.__error("data package must have at least two resources")
    res_map = dict((_.descriptor['name'], _) for _ in dp.resources)
    try:
        objects = res_map[cmso.OBJECTS_TABLE]
    except KeyError:
        self.__error("objects table not found")
    else:
        self.validate_objects(objects.descriptor)
    try:
        links = res_map[cmso.LINKS_TABLE]
    except KeyError:
        self.__error("links table not found")
    else:
        self.validate_links(links.descriptor)
    try:
        tracks = res_map[cmso.TRACKS_TABLE]
    except KeyError:
        pass
    else:
        self.validate_tracks(tracks.descriptor)
    return dp
def test_should_raise_if_zipfile_raised_LargeZipFile(
        self, zipfile_mock, tmpfile):
    zipfile_mock.side_effect = zipfile.LargeZipFile()
    dp = datapackage.DataPackage({}, {})

    with pytest.raises(datapackage.exceptions.DataPackageException):
        dp.save(tmpfile)
def test_attributes_can_be_set(self):
    metadata = {
        'name': 'foo',
    }
    dp = datapackage.DataPackage(metadata)
    dp.metadata['title'] = 'bar'
    assert dp.to_dict() == {'name': 'foo', 'title': 'bar'}
def __init__(self, config, **kwargs):
    self.config = config
    self.remotes = self.config['remotes']
    self.branch = self.config['branch']
    self.data_dir = self.config['data_dir']
    self.result_file = os.path.join(self.data_dir,
                                    self.config['result_file'])
    self.run_file = os.path.join(self.data_dir, self.config['run_file'])
    self.source_file = os.path.join(self.data_dir,
                                    self.config['source_file'])
    self.performance_file = os.path.join(self.data_dir,
                                         self.config['performance_file'])
    self.publisher_file = os.path.join(self.data_dir,
                                       self.config['publisher_file'])
    self.cache_dir = self.config['cache_dir']
    self.data_key = self.config['goodtables']['arguments']['batch'][
        'data_key']
    datapkg_file_path = self.config.get('datapackage_file',
                                        'datapackage.json')
    if not os.path.isabs(datapkg_file_path):
        datapkg_file_path = os.path.join(os.path.dirname(self.data_dir),
                                         datapkg_file_path)
    try:
        self.datapackage = datapackage.DataPackage(datapkg_file_path)
    except datapackage.exceptions.DataPackageException as e:
        raise ValueError(
            ('A datapackage couldn\'t be created because of the '
             'following error: "{0}". Make sure the file is not '
             'empty and use "dq init" command.').format(e))
    self.all_scores = []
def test_init_accepts_filelike_object(self):
    metadata = {
        'foo': 'bar',
    }
    filelike_metadata = six.StringIO(json.dumps(metadata))
    dp = datapackage.DataPackage(filelike_metadata)
    assert dp.metadata == metadata
def validate(resource=None):
    datapackage.validate('datapackage/gtex-v8-datapackage.json')
    gtex_package = datapackage.DataPackage(
        'datapackage/gtex-v8-datapackage.json', strict=True)
    if resource != 'all':
        r = gtex_package.get_resource(resource)
        print(r.name)
        try:
            t = r.read()
        except CastError as ce:
            print('Hit cast error')
            for err in ce.errors:
                print(err)
            print(ce)
        except Exception as inst:
            print('Hit generic exception')
            print(type(inst))
            print(inst.args)
            print(inst)
    else:
        for r in gtex_package.resources:
            print(r.name)
            try:
                t = r.read()
            except CastError as ce:
                print('Hit cast error')
                print(ce.errors)
                print(ce)
            except Exception as inst:
                print('Hit generic exception')
                print(type(inst))
                print(inst.args)
                print(inst)
def test_schema_gets_from_registry_if_available(self, registry_class_mock):
    schema = {'foo': 'bar'}
    registry_mock = mock.MagicMock()
    registry_mock.get.return_value = schema
    registry_class_mock.return_value = registry_mock

    assert datapackage.DataPackage().schema.to_dict() == schema
def __init__(self, descriptor_file):
    self._datapackage = datapackage.DataPackage(descriptor_file)
    self.__descriptor_file = descriptor_file
    self.__base_path = os.path.dirname(
        os.path.abspath(self.__descriptor_file))

    # Index resources by name
    self.__resources = {r.descriptor['name']: r
                        for r in self._datapackage.resources}
    self.__tabular_resources = {
        k: sanitize_resource_schema(r)
        for (k, r) in self.__resources.items()
        if type(r) is TabularResource and
        r.descriptor['path'].startswith('data')}
    self.__invalid_schemas = []  # Resource names with invalid schemas

    # All formats
    self.raw_data = LazyLoadedDict.from_keys(self.__resources.keys(),
                                             self._load_raw_data,
                                             'bytes')

    # Tabular formats
    self.tables = LazyLoadedDict.from_keys(self.__tabular_resources.keys(),
                                           self._load_table,
                                           type_hint='list of rows')
    self.dataframes = LazyLoadedDict.from_keys(
        self.__tabular_resources.keys(),
        self._load_dataframe,
        type_hint='pandas.DataFrame')
def test_init_accepts_filelike_object(self):
    descriptor = {
        'profile': 'data-package',
    }
    filelike_descriptor = six.StringIO(json.dumps(descriptor))
    dp = datapackage.DataPackage(filelike_descriptor)
    assert dp.descriptor == descriptor
def test_open_resource_url(self, mocklib_urlopen):
    dpkg = datapackage.DataPackage("tests/test.dpkg_url/")
    list(
        dpkg.data
    )  # Force the iteration over the iterable returned from data property.
    mocklib_urlopen.assert_called_once_with(
        'http://example.com/country-codes.csv')
def test_descriptor_apply_defaults_resource_tabular_dialect(self):
    descriptor = {
        'resources': [{
            'name': 'name',
            'data': 'data',
            'profile': 'tabular-data-resource',
            'dialect': {
                'delimiter': 'custom',
            }
        }],
    }
    dp = datapackage.DataPackage(descriptor)
    assert dp.descriptor == {
        'profile': 'data-package',
        'resources': [{
            'name': 'name',
            'data': 'data',
            'profile': 'tabular-data-resource',
            'encoding': 'utf-8',
            'dialect': {
                'delimiter': 'custom',
                'doubleQuote': True,
                'lineTerminator': '\r\n',
                'quoteChar': '"',
                'escapeChar': '\\',
                'skipInitialSpace': True,
                'header': True,
                'caseSensitiveHeader': False,
            }
        }],
    }
def test_open_resource_local(self):
    dpkg = datapackage.DataPackage("tests/test.dpkg_local/")
    with mocklib.patch('io.open') as mocklib_open:
        list(
            dpkg.data
        )  # Force the iteration over the iterable returned from data property.
        mocklib_open.assert_called_once()
def test_init_raises_if_path_is_a_bad_json(self):
    bad_json = test_helpers.fixture_path('bad_json.json')
    with pytest.raises(datapackage.exceptions.DataPackageException) as excinfo:
        datapackage.DataPackage(bad_json)
    message = str(excinfo.value)
    assert 'Unable to parse JSON' in message
    assert 'line 2 column 5 (char 6)' in message
def test_open_resource_encoding(self):
    dpkg = datapackage.DataPackage("tests/test.dpkg_local/")
    rows = list(
        dpkg.data
    )  # Force the iteration over the iterable returned from data property.
    # And make sure we were able to get some utf-8 data out of there
    assert 'Alg\xe9rie' == rows[2]['name_fr']
def __call__(self):
    url = self.parameters['url']
    dep_prefix = 'dependency://'
    if url.startswith(dep_prefix):
        dependency = url[len(dep_prefix):].strip()
        url = get_dependency_datapackage_url(dependency)
        assert url is not None, \
            "Failed to fetch output datapackage for dependency '%s'" % dependency
    resource = self.parameters['resource']
    stream = self.parameters.get('stream', True)
    name_matcher = \
        ResourceMatcher(resource) if isinstance(resource, str) else None
    resource_index = resource if isinstance(resource, int) else None

    selected_resources = []
    found = False
    dp = datapackage.DataPackage(url)
    dp = self.process_datapackage(dp)
    for i, orig_res in enumerate(dp.resources):
        if resource_index == i or \
                (name_matcher is not None and
                 name_matcher.match(orig_res.descriptor.get('name'))):
            found = True
            orig_res.descriptor[PROP_STREAMED_FROM] = orig_res.source
            self.dp['resources'].append(orig_res.descriptor)
            if tabular(orig_res.descriptor) and stream:
                orig_res.descriptor[PROP_STREAMING] = True
                selected_resources.append(orig_res.iter(keyed=True))
            else:
                orig_res.descriptor[PROP_STREAMING] = False
    assert found, \
        "Failed to find resource with index or name matching %r" % resource
    spew(self.dp, itertools.chain(self.res_iter, selected_resources))
def test_works_with_resources_with_relative_paths(self, tmpfile):
    path = test_helpers.fixture_path(
        'datapackage_with_foo.txt_resource.json')
    dp = datapackage.DataPackage(path)

    dp.save(tmpfile)

    with zipfile.ZipFile(tmpfile, 'r') as z:
        assert len(z.filelist) == 2