Example #1
import pandas as pd
from jsontableschema_pandas import Storage


def test_restore_schema_with_primary_key():
    data = [
        ('a', ),
        ('b', ),
    ]
    index = pd.Index([1, 2], name='key')
    df = pd.DataFrame(data, columns=('value', ), index=index)
    storage = Storage(dataframes={'data': df})
    assert list(storage.read('data')) == [[1, 'a'], [2, 'b']]
    assert storage.describe('data') == {
        'primaryKey': 'key',
        'fields': [
            {
                'name': 'key',
                'type': 'integer',
                'constraints': {
                    'required': True
                }
            },
            {
                'name': 'value',
                'type': 'string',
                'constraints': {
                    'required': True
                }
            },
        ]
    }
Example #2
import pandas as pd
from jsontableschema_pandas import Storage


def test_init_tables():
    data = [
        (1, 'a'),
        (2, 'b'),
    ]
    df = pd.DataFrame(data, columns=('key', 'value'))
    storage = Storage(dataframes={'data': df})
    assert list(storage.read('data')) == [[1, 'a'], [2, 'b']]
    assert storage.describe('data') == {
        'fields': [
            {
                'name': 'key',
                'type': 'integer',
                'constraints': {
                    'required': True
                }
            },
            {
                'name': 'value',
                'type': 'string',
                'constraints': {
                    'required': True
                }
            },
        ]
    }
Example #3
import pandas as pd
from jsontableschema_pandas import Storage


def test_multiple_writes():
    index = pd.Index([1, 2], name='key')
    df = pd.DataFrame([('a', ), ('b', )], columns=('value', ), index=index)
    storage = Storage(dataframes={'data': df})
    storage.write('data', [(2, 'x'), (3, 'y')])
    assert list(storage.read('data')) == [
        [1, 'a'],
        [2, 'b'],
        [2, 'x'],
        [3, 'y'],
    ]
Example #4
    @property
    def dataframe(self):
        """Build and cache a dataframe from query results"""
        try:
            from jsontableschema_pandas import Storage
        except ImportError:
            raise RuntimeError('To enable dataframe support, '
                               'run \'pip install datadotworld[PANDAS]\'')

        if self.__storage is None:
            self.__storage = Storage()
            self.__storage.create('results', self._schema)

            row_values = [row.values() for row in self.table]
            self.__storage.write('results', row_values)

        return self.__storage['results']
Example #5
    def __initialize_storage(self):
        try:
            from jsontableschema_pandas import Storage, mappers
            patch_jsontableschema_pandas(mappers)
        except ImportError:
            raise RuntimeError('To enable dataframe support, '
                               'run \'pip install datadotworld[PANDAS]\'')

        # Initialize storage if needed. Name mangling stores the attribute
        # as _LocalDataset__storage, so test for the mangled name; checking
        # '__storage' never passes and storage is rebuilt on every call.
        if not hasattr(self, '_LocalDataset__storage'):
            self.__storage = Storage()
            for (k, r) in self.__tabular_resources.items():
                if 'schema' in r.descriptor:
                    try:
                        self.__storage.create(k, r.descriptor['schema'])
                    except SchemaValidationError:
                        # Record the resource name, not the schema itself,
                        # so later membership checks against names work
                        self.__invalid_schemas.append(k)
Example #6
from jsontableschema_pandas import Storage


def test_table_without_primary_key():
    schema = {
        'fields': [
            {
                'name': 'a',
                'type': 'integer'
            },
            {
                'name': 'b',
                'type': 'string'
            },
        ]
    }
    data = [[1, 'x'], [2, 'y']]

    storage = Storage()
    storage.create('data', schema)
    storage.write('data', data)
    assert list(storage.read('data')) == data
Example #7
# Imports assumed from the surrounding datadotworld module:
from collections import OrderedDict

from jsontableschema import Schema

from datadotworld.models import table_schema


class QueryResults(object):
    """Query results

    Class for accessing and working with the results of a query.

    Attributes
    ----------
    raw_data : str
        Query results as raw SPARQL JSON data
    table : list of rows
        Query results as a `list` of rows.
        Each row is a mapping of field names to their respective values.
    dataframe : `pandas.DataFrame`
        Query results as a `DataFrame`.
    """
    def __init__(self, raw):
        self.raw_data = raw

        self._schema = table_schema.infer_table_schema(raw)

        self._table = None
        self.__storage = None

    def __repr__(self):
        return '{}({})'.format(self.__class__.__name__, repr(self.raw_data))

    def __str__(self):
        return str(self.raw_data)

    def describe(self):
        return self._schema

    @property
    def table(self):
        """Build and cache a table from query results"""
        if self._table is None:
            schema_obj = Schema(self._schema)

            table = []
            if 'results' in self.raw_data:
                field_names = [field.name for field in schema_obj.fields]
                result_vars = self.raw_data['head']['vars']

                for binding in self.raw_data['results']['bindings']:
                    rdf_terms = table_schema.order_terms_in_binding(
                        result_vars, binding)

                    values = []
                    for rdf_term in rdf_terms:
                        if rdf_term is not None:
                            values.append(rdf_term['value'])
                        else:
                            values.append(None)

                    table_row = schema_obj.cast_row(values)
                    table.append(OrderedDict(zip(field_names, table_row)))
            elif 'boolean' in self.raw_data:
                # Results of an ASK query
                table = [{'boolean': self.raw_data['boolean']}]

            self._table = table

        return self._table

    @property
    def dataframe(self):
        """Build and cache a dataframe from query results"""
        try:
            from jsontableschema_pandas import Storage
        except ImportError:
            raise RuntimeError('To enable dataframe support, '
                               'run \'pip install datadotworld[PANDAS]\'')

        if self.__storage is None:
            self.__storage = Storage()
            self.__storage.create('results', self._schema)

            row_values = [row.values() for row in self.table]
            self.__storage.write('results', row_values)

        return self.__storage['results']
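Below is a minimal usage sketch for QueryResults. The raw SPARQL JSON payload is hypothetical, and it assumes table_schema.infer_table_schema can derive a Table Schema from the 'head'/'results' structure shown.

raw = {
    'head': {'vars': ['name', 'age']},
    'results': {'bindings': [
        {'name': {'type': 'literal', 'value': 'ada'},
         'age': {'type': 'literal', 'value': '36'}},
    ]},
}
results = QueryResults(raw)  # hypothetical payload, see note above
print(results.table)         # list of OrderedDict rows, cast via the schema
print(results.dataframe)     # requires 'pip install datadotworld[PANDAS]'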
Example #8
class LocalDataset(object):
    """Dataset saved in the local file system

    .. note:: Datasets are packaged for local access in the form of
              Datapackage.
              See specs at http://specs.frictionlessdata.io/data-package/

    Parameters
    ----------
    descriptor_file : str or file-like object
        Path or handle for the descriptor of the dataset (datapackage.json)

    Attributes
    ----------
    raw_data : dict of bytes
        Mapping of resource names to their content (raw bytes) for all types
        of data contained in the dataset.
    tables : dict of tables
        Mapping of resource names to their data table for all *tabular* data
        contained in the dataset.
        A table is a `list` of rows, where each row is a mapping of field
        names to their respective values.
    dataframes : dict of `pandas.DataFrame`
        Mapping of resource names to their `DataFrame` representation for all
        *tabular* data contained in the dataset.

    """
    def __init__(self, descriptor_file):

        self._datapackage = datapackage.DataPackage(descriptor_file)

        self.__descriptor_file = descriptor_file
        self.__base_path = os.path.dirname(
            os.path.abspath(self.__descriptor_file))

        # Index resources by name
        self.__resources = {
            r.descriptor['name']: r
            for r in self._datapackage.resources
        }
        self.__tabular_resources = {
            k: sanitize_resource_schema(r)
            for (k, r) in self.__resources.items()
            if type(r) is TabularResource
        }
        self.__invalid_schemas = []  # Resource names with invalid schemas

        # All formats
        self.raw_data = LazyLoadedDict.from_keys(self.__resources.keys(),
                                                 self._load_raw_data, 'bytes')

        # Tabular formats
        self.tables = LazyLoadedDict.from_keys(self.__tabular_resources.keys(),
                                               self._load_table,
                                               type_hint='list of rows')
        self.dataframes = LazyLoadedDict.from_keys(
            self.__tabular_resources.keys(),
            self._load_dataframe,
            type_hint='pandas.DataFrame')

    def describe(self, resource=None):
        """Describe dataset or resource within dataset

        Parameters
        ----------
        resource : str, optional
            The name of a specific resource (i.e. file or table) contained in
            the dataset. If ``resource`` is None, this method will describe
            the dataset itself.

        Returns
        -------
        dict
            The descriptor of the dataset or of a specific resource, if
            ``resource`` is specified in the call.
        """
        if resource is None:
            # Show simpler descriptor, omitting schema definitions
            simple_descriptor = copy.deepcopy(self._datapackage.descriptor)
            for resource in simple_descriptor['resources']:
                resource.pop('schema', None)
            return simple_descriptor
        else:
            return self.__resources[resource].descriptor

    @memoized(key_mapper=lambda self, resource_name: resource_name)
    def _load_raw_data(self, resource_name):
        """Extract raw data from resource"""
        # Instantiating the resource again as a simple `Resource` ensures that
        # ``data`` will be returned as bytes.
        upcast_resource = datapackage.Resource(
            self.__resources[resource_name].descriptor,
            default_base_path=self.__base_path)
        return upcast_resource.data

    @memoized(key_mapper=lambda self, resource_name: resource_name)
    def _load_table(self, resource_name):
        """Build table structure from resource data"""
        tabular_resource = self.__tabular_resources[resource_name]

        try:
            # Sorting fields in the same order as they appear in the schema
            # is necessary for tables to be converted into pandas.DataFrame
            fields = []
            if 'schema' in tabular_resource.descriptor:
                fields = [
                    f['name']
                    for f in tabular_resource.descriptor['schema']['fields']
                ]
            elif len(tabular_resource.data) > 0:
                fields = tabular_resource.data[0].keys()

            return [
                order_columns_in_row(fields, row)
                for row in tabular_resource.data
            ]
        except (SchemaValidationError, ValueError, TypeError) as e:
            warnings.warn(
                'Unable to set column types automatically using {} schema. '
                'Data types may need to be adjusted manually. '
                'Error: {}'.format(resource_name, e))
            self.__invalid_schemas.append(resource_name)
            file_format = tabular_resource.descriptor['format']
            with Stream(six.BytesIO(self.raw_data[resource_name]),
                        format=file_format,
                        headers=1,
                        scheme='stream',
                        encoding='utf-8') as stream:
                return [
                    OrderedDict(zip(stream.headers, row))
                    for row in stream.iter()
                ]

    @memoized(key_mapper=lambda self, resource_name: resource_name)
    def _load_dataframe(self, resource_name):
        """Build pandas.DataFrame from resource data

        Lazily load any optional dependencies in order to allow users to
        use the package without installing pandas if they so wish.
        """
        self.__initialize_storage()

        rows = self.tables[resource_name]
        if (resource_name in self.__storage.buckets
                and resource_name not in self.__invalid_schemas):
            if self.__storage[resource_name].size == 0:
                row_values = [row.values() for row in rows]
                self.__storage.write(resource_name, row_values)
            return self.__storage[resource_name]
        else:
            try:
                import pandas
            except ImportError:
                raise RuntimeError('To enable dataframe support, '
                                   'run \'pip install datadotworld[PANDAS]\'')
            return pandas.DataFrame(rows)

    def __initialize_storage(self):
        try:
            from jsontableschema_pandas import Storage, mappers
            patch_jsontableschema_pandas(mappers)
        except ImportError:
            raise RuntimeError('To enable dataframe support, '
                               'run \'pip install datadotworld[PANDAS]\'')

        # Initialize storage if needed. Name mangling stores the attribute
        # as _LocalDataset__storage, so test for the mangled name; checking
        # '__storage' never passes and storage is rebuilt on every call.
        if not hasattr(self, '_LocalDataset__storage'):
            self.__storage = Storage()
            for (k, r) in self.__tabular_resources.items():
                if 'schema' in r.descriptor:
                    try:
                        self.__storage.create(k, r.descriptor['schema'])
                    except SchemaValidationError:
                        # Record the resource name, not the schema itself,
                        # so later membership checks against names work
                        self.__invalid_schemas.append(k)

    def __repr__(self):
        return '{}({})'.format(self.__class__.__name__,
                               repr(self.__descriptor_file))

    def __eq__(self, other):
        return self._datapackage.descriptor == other._datapackage.descriptor
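For context, here is a minimal usage sketch for LocalDataset; the descriptor path and the 'sales' resource name are hypothetical, and the file is assumed to follow the Data Package spec referenced above.

dataset = LocalDataset('/path/to/datapackage.json')  # hypothetical path
print(dataset.describe())           # dataset descriptor, schemas omitted
rows = dataset.tables['sales']      # hypothetical tabular resource name
df = dataset.dataframes['sales']    # requires 'pip install datadotworld[PANDAS]'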
Example #9
import io
import json

import pytest
from tabulator import Stream

from jsontableschema_pandas import Storage


def test_storage():

    # Get resources
    articles_descriptor = json.load(
        io.open('data/articles.json', encoding='utf-8'))
    comments_descriptor = json.load(
        io.open('data/comments.json', encoding='utf-8'))
    articles_rows = Stream('data/articles.csv', headers=1).open().read()
    comments_rows = Stream('data/comments.csv', headers=1).open().read()

    # Storage
    storage = Storage()

    # Create buckets
    storage.create('articles', articles_descriptor)
    storage.create('comments', comments_descriptor)

    # Assert rows
    assert storage['articles'].shape == (0, 0)
    assert storage['comments'].shape == (0, 0)

    # Write rows
    storage.write('articles', articles_rows)
    storage.write('comments', comments_rows)

    # Assert rows
    assert storage['articles'].shape == (2, 11)
    assert storage['comments'].shape == (1, 1)

    # Create existent bucket
    with pytest.raises(RuntimeError):
        storage.create('articles', articles_descriptor)

    # Assert representation
    assert repr(storage).startswith('Storage')

    # Assert buckets
    assert storage.buckets == ['articles', 'comments']

    # Assert descriptors (from cache)
    assert storage.describe('articles') == articles_descriptor
    assert storage.describe('comments') == comments_descriptor

    # Assert rows (sync_rows is a casting helper defined elsewhere in the
    # test module)
    assert list(storage.read('articles')) == sync_rows(articles_descriptor,
                                                       articles_rows)
    assert list(storage.read('comments')) == sync_rows(comments_descriptor,
                                                       comments_rows)

    # Re-describe bucket, passing the descriptor explicitly
    storage.describe('articles', articles_descriptor)

    # Assert descriptor
    assert storage.describe('articles') == articles_descriptor

    # Delete buckets
    storage.delete()

    # Delete non existent bucket
    with pytest.raises(RuntimeError):
        storage.delete('articles')
Example #10
import pytest

from jsontableschema_pandas import Storage


def test_read_missing_table():
    storage = Storage()
    with pytest.raises(RuntimeError) as excinfo:
        list(storage.read('data'))
    assert str(excinfo.value) == 'Bucket "data" doesn\'t exist.'