# Imports required by the test excerpts below
import io
import json

import pandas as pd
import pytest
from tabulator import Stream

from jsontableschema_pandas import Storage


def test_restore_schema_with_primary_key():
    data = [
        ('a',),
        ('b',),
    ]
    index = pd.Index([1, 2], name='key')
    df = pd.DataFrame(data, columns=('value',), index=index)
    storage = Storage(dataframes={'data': df})

    assert list(storage.read('data')) == [[1, 'a'], [2, 'b']]
    assert storage.describe('data') == {
        'primaryKey': 'key',
        'fields': [
            {'name': 'key', 'type': 'integer',
             'constraints': {'required': True}},
            {'name': 'value', 'type': 'string',
             'constraints': {'required': True}},
        ]
    }
def test_init_tables():
    data = [
        (1, 'a'),
        (2, 'b'),
    ]
    df = pd.DataFrame(data, columns=('key', 'value'))
    storage = Storage(dataframes={'data': df})

    assert list(storage.read('data')) == [[1, 'a'], [2, 'b']]
    assert storage.describe('data') == {
        'fields': [
            {'name': 'key', 'type': 'integer',
             'constraints': {'required': True}},
            {'name': 'value', 'type': 'string',
             'constraints': {'required': True}},
        ]
    }
def test_multiple_writes():
    index = pd.Index([1, 2], name='key')
    df = pd.DataFrame([('a',), ('b',)], columns=('value',), index=index)
    storage = Storage(dataframes={'data': df})
    storage.write('data', [(2, 'x'), (3, 'y')])

    assert list(storage.read('data')) == [
        [1, 'a'],
        [2, 'b'],
        [2, 'x'],
        [3, 'y'],
    ]
def test_table_without_primary_key():
    schema = {
        'fields': [
            {'name': 'a', 'type': 'integer'},
            {'name': 'b', 'type': 'string'},
        ]
    }
    data = [[1, 'x'], [2, 'y']]
    storage = Storage()
    storage.create('data', schema)
    storage.write('data', data)

    assert list(storage.read('data')) == data
class QueryResults(object):
    """Query results

    Class for accessing and working with the results of a query.

    Attributes
    ----------
    raw_data : str
        Query results as raw SPARQL JSON data
    table : list of rows
        Query results as a `list` of rows.
        Each row is a mapping of field names to their respective values.
    dataframe : `pandas.DataFrame`
        Query results as a `DataFrame`.
    """

    def __init__(self, raw):
        self.raw_data = raw
        self._schema = table_schema.infer_table_schema(raw)
        self._table = None
        self.__storage = None

    def __repr__(self):
        return '{}({})'.format(self.__class__.__name__,
                               repr(self.raw_data))

    def __str__(self):
        return str(self.raw_data)

    def describe(self):
        return self._schema

    @property
    def table(self):
        """Build and cache a table from query results"""
        if self._table is None:
            schema_obj = Schema(self._schema)
            table = []
            if 'results' in self.raw_data:
                field_names = [field.name for field in schema_obj.fields]
                result_vars = self.raw_data['head']['vars']
                for binding in self.raw_data['results']['bindings']:
                    rdf_terms = table_schema.order_terms_in_binding(
                        result_vars, binding)
                    values = []
                    for rdf_term in rdf_terms:
                        if rdf_term is not None:
                            values.append(rdf_term['value'])
                        else:
                            values.append(None)
                    table_row = schema_obj.cast_row(values)
                    table.append(OrderedDict(zip(field_names, table_row)))
            elif 'boolean' in self.raw_data:
                # Results of an ASK query
                table = [{'boolean': self.raw_data['boolean']}]
            self._table = table
        return self._table

    @property
    def dataframe(self):
        """Build and cache a dataframe from query results"""
        try:
            from jsontableschema_pandas import Storage
        except ImportError:
            raise RuntimeError('To enable dataframe support, '
                               'run \'pip install datadotworld[PANDAS]\'')

        if self.__storage is None:
            self.__storage = Storage()
            self.__storage.create('results', self._schema)
            row_values = [row.values() for row in self.table]
            self.__storage.write('results', row_values)

        return self.__storage['results']
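# A hedged usage sketch for QueryResults (not part of the library). The
# SPARQL JSON payload, variable name, and value below are invented for
# illustration only; real payloads come from the query API.
def _example_query_results_usage():
    raw = {
        'head': {'vars': ['name']},
        'results': {'bindings': [
            {'name': {'type': 'literal', 'value': 'alice'}},
        ]},
    }
    results = QueryResults(raw)
    schema = results.describe()  # inferred Table Schema descriptor (dict)
    rows = results.table         # list of OrderedDict rows, schema-cast
    df = results.dataframe       # pandas.DataFrame; needs datadotworld[PANDAS]
    return schema, rows, df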
class LocalDataset(object):
    """Dataset saved in the local file system

    .. note:: Datasets are packaged for local access in the form of
        Datapackage.
        See specs at http://specs.frictionlessdata.io/data-package/

    Parameters
    ----------
    descriptor_file : str or file-like object
        Path or handle for the descriptor of the dataset (datapackage.json)

    Attributes
    ----------
    raw_data : dict of bytes
        Mapping of resource names to their content (raw bytes) for all
        types of data contained in the dataset.
    tables : dict of tables
        Mapping of resource names to their data table for all *tabular*
        data contained in the dataset.
        A table is a `list` of rows, where each row is a mapping of field
        names to their respective values.
    dataframes : dict of `pandas.DataFrame`
        Mapping of resource names to their `DataFrame` representation for
        all *tabular* data contained in the dataset.
    """

    def __init__(self, descriptor_file):
        self._datapackage = datapackage.DataPackage(descriptor_file)
        self.__descriptor_file = descriptor_file
        self.__base_path = os.path.dirname(
            os.path.abspath(self.__descriptor_file))

        # Index resources by name
        self.__resources = {r.descriptor['name']: r
                            for r in self._datapackage.resources}
        self.__tabular_resources = {k: sanitize_resource_schema(r)
                                    for (k, r) in self.__resources.items()
                                    if type(r) is TabularResource}
        self.__invalid_schemas = []  # Resource names with invalid schemas
        self.__storage = None  # Created lazily by __initialize_storage

        # All formats
        self.raw_data = LazyLoadedDict.from_keys(self.__resources.keys(),
                                                 self._load_raw_data,
                                                 'bytes')

        # Tabular formats
        self.tables = LazyLoadedDict.from_keys(
            self.__tabular_resources.keys(),
            self._load_table,
            type_hint='list of rows')
        self.dataframes = LazyLoadedDict.from_keys(
            self.__tabular_resources.keys(),
            self._load_dataframe,
            type_hint='pandas.DataFrame')

    def describe(self, resource=None):
        """Describe dataset or resource within dataset

        Parameters
        ----------
        resource : str, optional
            The name of a specific resource (i.e. file or table) contained
            in the dataset. If ``resource`` is None, this method will
            describe the dataset itself.

        Returns
        -------
        dict
            The descriptor of the dataset or of a specific resource, if
            ``resource`` is specified in the call.
        """
        if resource is None:
            # Show simpler descriptor, omitting schema definitions
            simple_descriptor = copy.deepcopy(self._datapackage.descriptor)
            for resource in simple_descriptor['resources']:
                resource.pop('schema', None)
            return simple_descriptor
        else:
            return self.__resources[resource].descriptor

    @memoized(key_mapper=lambda self, resource_name: resource_name)
    def _load_raw_data(self, resource_name):
        """Extract raw data from resource"""
        # Instantiating the resource again as a simple `Resource` ensures
        # that ``data`` will be returned as bytes.
        upcast_resource = datapackage.Resource(
            self.__resources[resource_name].descriptor,
            default_base_path=self.__base_path)
        return upcast_resource.data

    @memoized(key_mapper=lambda self, resource_name: resource_name)
    def _load_table(self, resource_name):
        """Build table structure from resource data"""
        tabular_resource = self.__tabular_resources[resource_name]
        try:
            # Sorting fields in the same order as they appear in the schema
            # is necessary for tables to be converted into pandas.DataFrame
            fields = []
            if 'schema' in tabular_resource.descriptor:
                fields = [f['name'] for f in
                          tabular_resource.descriptor['schema']['fields']]
            elif len(tabular_resource.data) > 0:
                fields = tabular_resource.data[0].keys()
            return [order_columns_in_row(fields, row)
                    for row in tabular_resource.data]
        except (SchemaValidationError, ValueError, TypeError) as e:
            warnings.warn(
                'Unable to set column types automatically using {} schema. '
                'Data types may need to be adjusted manually. '
                'Error: {}'.format(resource_name, e))
            self.__invalid_schemas.append(resource_name)
            file_format = tabular_resource.descriptor['format']
            with Stream(six.BytesIO(self.raw_data[resource_name]),
                        format=file_format, headers=1,
                        scheme='stream', encoding='utf-8') as stream:
                return [OrderedDict(zip(stream.headers, row))
                        for row in stream.iter()]

    @memoized(key_mapper=lambda self, resource_name: resource_name)
    def _load_dataframe(self, resource_name):
        """Build pandas.DataFrame from resource data

        Lazy load any optional dependencies in order to allow users to use
        the package without installing pandas if they so wish.
        """
        self.__initialize_storage()

        rows = self.tables[resource_name]
        if (resource_name in self.__storage.buckets and
                resource_name not in self.__invalid_schemas):
            if self.__storage[resource_name].size == 0:
                row_values = [row.values() for row in rows]
                self.__storage.write(resource_name, row_values)
            return self.__storage[resource_name]
        else:
            try:
                import pandas
            except ImportError:
                raise RuntimeError('To enable dataframe support, '
                                   'run \'pip install datadotworld[PANDAS]\'')
            return pandas.DataFrame(rows)

    def __initialize_storage(self):
        try:
            from jsontableschema_pandas import Storage, mappers
            patch_jsontableschema_pandas(mappers)
        except ImportError:
            raise RuntimeError('To enable dataframe support, '
                               'run \'pip install datadotworld[PANDAS]\'')

        # Initialize storage if needed
        if self.__storage is None:
            self.__storage = Storage()
            for (k, r) in self.__tabular_resources.items():
                if 'schema' in r.descriptor:
                    try:
                        self.__storage.create(k, r.descriptor['schema'])
                    except SchemaValidationError:
                        # Record the resource *name*, matching the lookups
                        # performed in _load_table and _load_dataframe
                        self.__invalid_schemas.append(k)

    def __repr__(self):
        return '{}({})'.format(self.__class__.__name__,
                               repr(self.__descriptor_file))

    def __eq__(self, other):
        return self._datapackage.descriptor == other._datapackage.descriptor
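# A hedged usage sketch for LocalDataset (not part of the library). The
# descriptor path and resource name ('fixtures/datapackage.json', 'sales')
# are hypothetical; any datapackage.json with a tabular resource would do.
def _example_local_dataset_usage():
    dataset = LocalDataset('fixtures/datapackage.json')
    dataset.describe()                # dataset descriptor, schemas omitted
    dataset.describe('sales')         # full descriptor for one resource
    raw = dataset.raw_data['sales']   # raw bytes, loaded on first access
    rows = dataset.tables['sales']    # list of OrderedDict rows
    df = dataset.dataframes['sales']  # pandas.DataFrame; needs PANDAS extra
    return raw, rows, df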
def test_storage():
    # Get resources
    articles_descriptor = json.load(
        io.open('data/articles.json', encoding='utf-8'))
    comments_descriptor = json.load(
        io.open('data/comments.json', encoding='utf-8'))
    articles_rows = Stream('data/articles.csv', headers=1).open().read()
    comments_rows = Stream('data/comments.csv', headers=1).open().read()

    # Storage
    storage = Storage()

    # Create buckets
    storage.create('articles', articles_descriptor)
    storage.create('comments', comments_descriptor)

    # Assert rows (buckets are empty before any writes)
    assert storage['articles'].shape == (0, 0)
    assert storage['comments'].shape == (0, 0)

    # Write rows
    storage.write('articles', articles_rows)
    storage.write('comments', comments_rows)

    # Assert rows
    assert storage['articles'].shape == (2, 11)
    assert storage['comments'].shape == (1, 1)

    # Create existent bucket
    with pytest.raises(RuntimeError):
        storage.create('articles', articles_descriptor)

    # Assert representation
    assert repr(storage).startswith('Storage')

    # Assert buckets
    assert storage.buckets == ['articles', 'comments']

    # Assert descriptors (from cache)
    assert storage.describe('articles') == articles_descriptor
    assert storage.describe('comments') == comments_descriptor

    # Assert rows
    assert list(storage.read('articles')) == sync_rows(articles_descriptor,
                                                       articles_rows)
    assert list(storage.read('comments')) == sync_rows(comments_descriptor,
                                                       comments_rows)

    # Describe bucket
    storage.describe('articles', articles_descriptor)

    # Assert descriptor
    assert storage.describe('articles') == articles_descriptor

    # Delete buckets
    storage.delete()

    # Delete non-existent bucket
    with pytest.raises(RuntimeError):
        storage.delete('articles')
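# The test above calls a sync_rows helper that is not shown in this excerpt.
# A minimal sketch of what it presumably does, assuming the jsontableschema
# package's Schema.cast_row (the same API used by QueryResults.table above):
# cast each raw row against the descriptor so the values compare equal to
# what Storage.read returns.
import jsontableschema


def sync_rows(descriptor, rows):
    schema = jsontableschema.Schema(descriptor)
    return [schema.cast_row(row) for row in rows]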
def test_read_missing_table():
    storage = Storage()
    with pytest.raises(RuntimeError) as excinfo:
        list(storage.read('data'))
    assert str(excinfo.value) == 'Bucket "data" doesn\'t exist.'