Example #1
def read_list_from_csv(
    url: str,
    headers: Union[int, List[int], List[str], None] = None,
    dict_form: bool = False,
    **kwargs: Any,
) -> List[Union[Dict, List]]:
    """Read a list of rows in dict or list form from a csv. The headers argument is either a row
       number or list of row numbers (in case of multi-line headers) to be considered as headers
       (rows start counting at 1), or the actual headers, defined as a list of strings. If not set,
       all rows will be treated as containing values.

    Args:
        url (str): URL or path to read from
        headers (Union[int, List[int], List[str], None]): Row number(s) of headers or a list of header strings. Defaults to None.
        dict_form (bool): Return dict (requires headers parameter) or list for each row. Defaults to False (list)
        **kwargs: Other arguments to pass to Tabulator Stream

    Returns:
        List[Union[Dict, List]]: List of rows in dict or list form

    """
    if dict_form and headers is None:
        raise ValueError("If dict_form is True, headers must not be None!")
    stream = Stream(url, headers=headers, **kwargs)
    stream.open()
    result = stream.read(keyed=dict_form)
    stream.close()
    return result
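For illustration, a minimal usage sketch of the function above (data.csv is a hypothetical file whose first row holds the column names):

rows = read_list_from_csv('data.csv', headers=1)
# header row consumed, remaining rows as lists, e.g. [['1', 'english'], ...]
keyed_rows = read_list_from_csv('data.csv', headers=1, dict_form=True)
# same rows keyed by header, e.g. [{'id': '1', 'name': 'english'}, ...]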
Example #2
def test_stream_reset_on_close_issue_190():
    source = [['1', 'english'], ['2', '中国人']]
    stream = Stream(source)
    stream.open()
    assert stream.read(limit=1) == [['1', 'english']]
    stream.open()
    assert stream.read(limit=1) == [['1', 'english']]
    stream.close()
Example #3
def _csv(path, encoding):
    _s = Stream(path, format='csv', encoding=encoding)
    try:
        _s.open()
        _s.close()
        return 'csv'
    except (FormatError, UnicodeDecodeError, FileNotFoundError, BadZipFile):
        return None
Example #4
def _xls():
    _s = Stream(path, format='xls', encoding=encoding)
    try:
        _s.open()
        _s.close()
        return 'xls'
    except (FormatError, BadZipFile, ValueError, XLRDError, FileNotFoundError, NotImplementedError):
        return None
Example #5
def _ods():
    _s = Stream(path, format='ods', encoding=encoding)
    try:
        _s.open()
        _s.close()
        return True
    except (FormatError, OSError, BadZipFile, FileNotFoundError, TypeError):
        return False
Example #6
def _xlsx():
    _s = Stream(path, format='xlsx', encoding=encoding)
    try:
        _s.open()
        _s.close()
        return 'xlsx'
    except ValueError:
        return 'xlsx'
    except (FormatError, BadZipFile, OSError, FileNotFoundError, KeyError):
        return None
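The four probes above share one trial-open pattern: open a Stream with a candidate format and treat a parse failure as "not this format". As a hedged sketch, a hypothetical dispatcher could chain them (assuming the nested helpers are defined inside it so they close over path and encoding):

def detect_format(path, encoding):
    # Hypothetical wrapper over the probes above; the first
    # trial-open that succeeds decides the format
    if _csv(path, encoding):
        return 'csv'
    for probe, fmt in ((_xlsx, 'xlsx'), (_xls, 'xls'), (_ods, 'ods')):
        if probe():
            return fmt
    return None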
Example #7
def test_stream_local_csv_zip_multiple_open():
    # That's how `tableschema.iter()` acts
    stream = Stream('data/table.csv.zip')
    stream.open()
    assert stream.headers is None
    assert stream.read() == [['id', 'name'], ['1', 'english'], ['2', '中国人']]
    stream.close()
    stream.open()
    assert stream.headers is None
    assert stream.read() == [['id', 'name'], ['1', 'english'], ['2', '中国人']]
    stream.close()
Example #8
def write_list_to_csv(list_of_rows, filepath, headers=None):
    # type: (List[Union[DictUpperBound, List]], str, Optional[List[str]]) -> None
    """Write a list of rows in dict or list form to a csv.

    Args:
        list_of_rows (List[Union[DictUpperBound, List]]): List of rows in dict or list form
        filepath (str): Path to write to
        headers (Optional[List[str]]): Headers to write. Defaults to None.

    Returns:
        None

    """
    stream = Stream(list_of_rows, headers=headers)
    stream.open()
    stream.save(filepath, format='csv')
    stream.close()
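A quick sketch of calling the writer above (rows.csv is a hypothetical target; dict rows need explicit headers so the column order is deterministic):

rows = [{'id': '1', 'name': 'english'}, {'id': '2', 'name': '中国人'}]
write_list_to_csv(rows, 'rows.csv', headers=['id', 'name'])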
Example #9
def read_list_from_csv(filepath, dict_form=False, headers=None):
    # type: (str, bool, Optional[int]) -> List[Union[Dict, List]]
    """Read a list of rows in dict or list form from a csv.

    Args:
        filepath (str): Path to read from
        dict_form (bool): Return in dict form. Defaults to False.
        headers (Optional[int]): Row number of headers. Defaults to None.

    Returns:
        List[Union[Dict, List]]: List of rows in dict or list form

    """
    stream = Stream(filepath, headers=headers)
    stream.open()
    result = stream.read(keyed=dict_form)
    stream.close()
    return result
Example #10
def write_list_to_csv(
    filepath: str,
    list_of_rows: List[Union[DictUpperBound, List]],
    headers: Union[int, List[int], List[str], None] = None,
) -> None:
    """Write a list of rows in dict or list form to a csv. (The headers argument is either a row
       number or list of row numbers (in case of multi-line headers) to be considered as headers
       (rows start counting at 1), or the actual headers defined a list of strings. If not set,
       all rows will be treated as containing values.)

    Args:
        filepath (str): Path to write to
        list_of_rows (List[Union[DictUpperBound, List]]): List of rows in dict or list form
        headers (Union[int, List[int], List[str], None]): Headers to write. Defaults to None.

    Returns:
        None

    """
    stream = Stream(list_of_rows, headers=headers)
    stream.open()
    stream.save(filepath, format="csv")
    stream.close()
Example #11
    def create_datastore(self,
                         schema=None,
                         primary_key=None,
                         delete_first=0,
                         path=None):
        # type: (Optional[List[dict]], Optional[str], Optional[int], Optional[str]) -> None
        """For csvs, create a resource in the HDX datastore which enables data preview in HDX. If no schema is provided
        all fields are assumed to be text. If path is not supplied, the file is first downloaded from HDX.

        Args:
            schema (List[dict]): List of fields and types of form {'id': 'FIELD', 'type': 'TYPE'}. Defaults to None.
            primary_key (Optional[str]): Primary key of schema. Defaults to None.
            delete_first (int): Delete datastore before creation. 0 = No, 1 = Yes, 2 = If no primary key. Defaults to 0.
            path (Optional[str]): Local path to file that was uploaded. Defaults to None.

        Returns:
            None
        """
        if delete_first == 0:
            pass
        elif delete_first == 1:
            self.delete_datastore()
        elif delete_first == 2:
            if primary_key is None:
                self.delete_datastore()
        else:
            raise HDXError(
                'delete_first must be 0, 1 or 2! (0 = No, 1 = Yes, 2 = Delete if no primary key)'
            )
        if path is None:
            # Download the resource
            url, path = self.download()
            delete_after_download = True
        else:
            url = self.data.get('url', None)
            if not url:
                raise HDXError('No URL to download!')
            delete_after_download = False

        zip_path = None
        stream = None
        try:
            extension = splitext(path)[1]
            if extension.lower() == '.zip':
                zip_file = zipfile.ZipFile(path)
                filename = zip_file.namelist()[0]
                tempdir = dirname(abspath(path))
                zip_file.extract(filename, tempdir)
                zip_path = path
                path = join(tempdir, filename)

            def convert_to_text(extended_rows):
                for number, headers, row in extended_rows:
                    for i, val in enumerate(row):
                        row[i] = str(val)
                    yield (number, headers, row)

            tabulator.config.BYTES_SAMPLE_SIZE = 1000000
            stream = Stream(path, headers=1, post_parse=[convert_to_text])
            stream.open()
            if schema is None:
                schema = list()
                for fieldname in stream.headers:
                    schema.append({'id': fieldname, 'type': 'text'})
            data = {
                'resource_id': self.data['id'],
                'force': True,
                'fields': schema,
                'primary_key': primary_key
            }
            self._write_to_hdx('datastore_create', data, 'id')
            if primary_key is None:
                method = 'insert'
            else:
                method = 'upsert'
            logger.debug('Uploading data from %s to datastore' % url)
            offset = 0
            chunksize = 100
            rowset = stream.read(keyed=True, limit=chunksize)
            while len(rowset) != 0:
                data = {
                    'resource_id': self.data['id'],
                    'force': True,
                    'method': method,
                    'records': rowset
                }
                self._write_to_hdx('datastore_upsert', data, 'id')
                rowset = stream.read(keyed=True, limit=chunksize)
                logger.debug('Uploading: %s' % offset)
                offset += chunksize
        except Exception as e:
            six.raise_from(HDXError('Upload to datastore of %s failed!' % url),
                           e)
        finally:
            if stream:
                stream.close()
            if delete_after_download:
                unlink(path)
                if zip_path:
                    unlink(zip_path)
            else:
                if zip_path:
                    unlink(path)  # i.e. we keep the zip but remove the extracted file
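A hypothetical invocation of the method above, assuming resource is an hdx Resource object whose file is a csv:

# Infer a text-only schema from the csv headers; delete_first=2
# recreates the datastore only when there is no primary key
resource.create_datastore(delete_first=2)

# Or supply an explicit schema and primary key, which switches the
# upload method from 'insert' to 'upsert' as in the code above
resource.create_datastore(
    schema=[{'id': 'iso3', 'type': 'text'},
            {'id': 'population', 'type': 'int'}],
    primary_key='iso3')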
Example #12
class Table(object):

    # Public

    def __init__(self,
                 source,
                 schema=None,
                 strict=False,
                 post_cast=[],
                 storage=None,
                 **options):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """

        # Set attributes
        self.__source = source
        self.__stream = None
        self.__schema = None
        self.__headers = None
        self.__storage = None
        self.__post_cast = copy(post_cast)

        # Schema
        if isinstance(schema, Schema):
            self.__schema = schema
        elif schema is not None:
            self.__schema = Schema(schema)

        # Stream (tabulator)
        if storage is None:
            options.setdefault('headers', 1)
            self.__stream = Stream(source, **options)

        # Stream (storage)
        else:
            if not isinstance(storage, Storage):
                storage = Storage.connect(storage, **options)
            if self.__schema:
                storage.describe(source, self.__schema.descriptor)
            headers = Schema(storage.describe(source)).field_names
            self.__stream = Stream(partial(storage.iter, source),
                                   headers=headers)
            self.__storage = storage

    @property
    def headers(self):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """
        return self.__headers

    @property
    def schema(self):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """
        return self.__schema

    def iter(self,
             keyed=False,
             extended=False,
             cast=True,
             relations=False,
             foreign_keys_values=False):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """

        # Prepare unique checks
        if cast:
            unique_fields_cache = {}
            if self.schema:
                unique_fields_cache = _create_unique_fields_cache(self.schema)
        # Prepare relation checks
        if relations and not foreign_keys_values:
            # we have to test relations but the index has not been precomputed
            # prepare the index to boost validation process
            foreign_keys_values = self.index_foreign_keys_values(relations)

        # Open/iterate stream
        self.__stream.open()
        iterator = self.__stream.iter(extended=True)
        iterator = self.__apply_processors(iterator, cast=cast)
        for row_number, headers, row in iterator:

            # Get headers
            if not self.__headers:
                self.__headers = headers

            # Check headers
            if cast:
                if self.schema and self.headers:
                    if self.headers != self.schema.field_names:
                        self.__stream.close()
                        message = 'Table headers don\'t match schema field names'
                        raise exceptions.CastError(message)

            # Check unique
            if cast:
                for indexes, cache in unique_fields_cache.items():
                    values = tuple(value for i, value in enumerate(row)
                                   if i in indexes)
                    if not all(map(lambda value: value is None, values)):
                        if values in cache['data']:
                            self.__stream.close()
                            message = 'Field(s) "%s" duplicates in row "%s"'
                            message = message % (cache['name'], row_number)
                            raise exceptions.CastError(message)
                        cache['data'].add(values)

            # Resolve relations
            if relations:
                if self.schema:
                    row_with_relations = dict(zip(headers, copy(row)))
                    for foreign_key in self.schema.foreign_keys:
                        refValue = _resolve_relations(row, headers,
                                                      foreign_keys_values,
                                                      foreign_key)
                        if refValue is None:
                            self.__stream.close()
                            keyed_row = OrderedDict(zip(headers, row))
                            # local values of the FK
                            local_values = tuple(
                                keyed_row[f] for f in foreign_key['fields'])
                            message = 'Foreign key "%s" violation in row "%s": %s not found in %s'
                            message = message % (
                                foreign_key['fields'], row_number,
                                local_values,
                                foreign_key['reference']['resource'])
                            raise exceptions.RelationError(message)
                        elif type(refValue) is dict:
                            for field in foreign_key['fields']:
                                if type(row_with_relations[field]) is not dict:
                                    # no previous refValues injected on this field
                                    row_with_relations[field] = refValue
                                else:
                                    # already one ref, merging
                                    row_with_relations[field].update(refValue)
                        else:
                            # case when all original values of the FK are empty
                            # refValue == row, there is nothing to do
                            # an empty dict might be a better return value for this case?
                            pass

                    #  mutate row now that we are done, in the right order
                    row = [row_with_relations[f] for f in headers]

            # Form row
            if extended:
                yield (row_number, headers, row)
            elif keyed:
                yield dict(zip(headers, row))
            else:
                yield row

        # Close stream
        self.__stream.close()

    def read(self,
             keyed=False,
             extended=False,
             cast=True,
             relations=False,
             limit=None,
             foreign_keys_values=False):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """
        result = []
        rows = self.iter(keyed=keyed,
                         extended=extended,
                         cast=cast,
                         relations=relations,
                         foreign_keys_values=foreign_keys_values)
        for count, row in enumerate(rows, start=1):
            result.append(row)
            if count == limit:
                break
        return result

    def infer(self, limit=100, confidence=0.75):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """
        if self.__schema is None or self.__headers is None:

            # Infer (tabulator)
            if not self.__storage:
                with self.__stream as stream:
                    if self.__schema is None:
                        self.__schema = Schema()
                        self.__schema.infer(stream.sample[:limit],
                                            headers=stream.headers,
                                            confidence=confidence)
                    if self.__headers is None:
                        self.__headers = stream.headers

            # Infer (storage)
            else:
                descriptor = self.__storage.describe(self.__source)
                if self.__schema is None:
                    self.__schema = Schema(descriptor)
                if self.__headers is None:
                    self.__headers = self.__schema.field_names

        return self.__schema.descriptor

    def save(self, target, storage=None, **options):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """

        # Save (tabulator)
        if storage is None:
            with Stream(self.iter, headers=self.__schema.headers) as stream:
                stream.save(target, **options)
            return True

        # Save (storage)
        else:
            if not isinstance(storage, Storage):
                storage = Storage.connect(storage, **options)
            storage.create(target, self.__schema.descriptor, force=True)
            storage.write(target, self.iter(cast=False))
            return storage

    def index_foreign_keys_values(self, relations):
        # we don't need to load the complete reference table to test relations
        # we can lower payload AND optimize testing foreign keys
        # by preparing the right index based on the foreign key definition
        # foreign_keys are sets of tuples of all possible values in the foreign table
        # foreign keys =
        # [reference] [foreign_keys tuple] = { (foreign_keys_values, ) : one_keyedrow, ... }
        foreign_keys = defaultdict(dict)
        if self.schema:
            for fk in self.schema.foreign_keys:
                # load relation data
                relation = fk['reference']['resource']

                # create a set of foreign keys
                # to optimize we prepare index of existing values
                # this index should use reference + foreign_keys as key
                # cause many foreign keys may use the same reference
                foreign_keys[relation][tuple(fk['reference']['fields'])] = {}
                for row in relations[relation]:
                    key = tuple([
                        row[foreign_field]
                        for foreign_field in fk['reference']['fields']
                    ])
                    # here we should choose to pick the first or nth row which matches
                    # previous implementation picked the first, so be it
                    if key not in foreign_keys[relation][tuple(
                            fk['reference']['fields'])]:
                        foreign_keys[relation][tuple(
                            fk['reference']['fields'])][key] = row
        return foreign_keys

    # Private

    def __apply_processors(self, iterator, cast=True):

        # Apply processors to iterator
        def builtin_processor(extended_rows):
            for row_number, headers, row in extended_rows:
                if self.__schema and cast:
                    row = self.__schema.cast_row(row)
                yield (row_number, headers, row)

        processors = [builtin_processor] + self.__post_cast
        for processor in processors:
            iterator = processor(iterator)

        return iterator
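A short usage sketch for this Table class (data.csv and the inline schema are hypothetical; rows come back cast to the schema's types):

schema = {'fields': [{'name': 'id', 'type': 'integer'},
                     {'name': 'name', 'type': 'string'}]}
table = Table('data.csv', schema=schema)
for row in table.iter(keyed=True):
    print(row)  # e.g. {'id': 1, 'name': 'english'} with id cast to int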
Example #13
class Table(object):

    # Public

    def __init__(self,
                 source,
                 schema=None,
                 strict=False,
                 post_cast=[],
                 storage=None,
                 **options):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """

        # Set attributes
        self.__source = source
        self.__stream = None
        self.__schema = None
        self.__headers = None
        self.__storage = None
        self.__post_cast = copy(post_cast)

        # Schema
        if isinstance(schema, Schema):
            self.__schema = schema
        elif schema is not None:
            self.__schema = Schema(schema)

        # Stream (tabulator)
        if storage is None:
            options.setdefault('headers', 1)
            self.__stream = Stream(source, **options)

        # Stream (storage)
        else:
            if not isinstance(storage, Storage):
                storage = Storage.connect(storage, **options)
            if self.__schema:
                storage.describe(source, self.__schema.descriptor)
            headers = Schema(storage.describe(source)).field_names
            self.__stream = Stream(partial(storage.iter, source),
                                   headers=headers)
            self.__storage = storage

    @property
    def headers(self):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """
        return self.__headers

    @property
    def schema(self):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """
        return self.__schema

    def iter(self, keyed=False, extended=False, cast=True, relations=False):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """

        # Prepare unique checks
        if cast:
            unique_fields_cache = {}
            if self.schema:
                unique_fields_cache = _create_unique_fields_cache(self.schema)

        # Open/iterate stream
        self.__stream.open()
        iterator = self.__stream.iter(extended=True)
        iterator = self.__apply_processors(iterator, cast=cast)
        for row_number, headers, row in iterator:

            # Get headers
            if not self.__headers:
                self.__headers = headers

            # Check headers
            if cast:
                if self.schema and self.headers:
                    if self.headers != self.schema.field_names:
                        self.__stream.close()
                        message = 'Table headers don\'t match schema field names'
                        raise exceptions.CastError(message)

            # Check unique
            if cast:
                for indexes, cache in unique_fields_cache.items():
                    values = tuple(value for i, value in enumerate(row)
                                   if i in indexes)
                    if not all(map(lambda value: value is None, values)):
                        if values in cache['data']:
                            self.__stream.close()
                            message = 'Field(s) "%s" duplicates in row "%s"'
                            message = message % (cache['name'], row_number)
                            raise exceptions.CastError(message)
                        cache['data'].add(values)

            # Resolve relations
            if relations:
                if self.schema:
                    for foreign_key in self.schema.foreign_keys:
                        row = _resolve_relations(row, headers, relations,
                                                 foreign_key)
                        if row is None:
                            self.__stream.close()
                            message = 'Foreign key "%s" violation in row "%s"'
                            message = message % (foreign_key['fields'],
                                                 row_number)
                            raise exceptions.RelationError(message)

            # Form row
            if extended:
                yield (row_number, headers, row)
            elif keyed:
                yield dict(zip(headers, row))
            else:
                yield row

        # Close stream
        self.__stream.close()

    def read(self,
             keyed=False,
             extended=False,
             cast=True,
             relations=False,
             limit=None):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """
        result = []
        rows = self.iter(keyed=keyed,
                         extended=extended,
                         cast=cast,
                         relations=relations)
        for count, row in enumerate(rows, start=1):
            result.append(row)
            if count == limit:
                break
        return result

    def infer(self, limit=100, confidence=0.75):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """
        if self.__schema is None or self.__headers is None:

            # Infer (tabulator)
            if not self.__storage:
                with self.__stream as stream:
                    if self.__schema is None:
                        self.__schema = Schema()
                        self.__schema.infer(stream.sample[:limit],
                                            headers=stream.headers,
                                            confidence=confidence)
                    if self.__headers is None:
                        self.__headers = stream.headers

            # Infer (storage)
            else:
                descriptor = self.__storage.describe(self.__source)
                if self.__schema is None:
                    self.__schema = Schema(descriptor)
                if self.__headers is None:
                    self.__headers = self.__schema.field_names

        return self.__schema.descriptor

    def save(self, target, storage=None, **options):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """

        # Save (tabulator)
        if storage is None:
            with Stream(self.iter, headers=self.__schema.headers) as stream:
                stream.save(target, **options)
            return True

        # Save (storage)
        else:
            if not isinstance(storage, Storage):
                storage = Storage.connect(storage, **options)
            storage.create(target, self.__schema.descriptor, force=True)
            storage.write(target, self.iter(cast=False))
            return storage

    # Private

    def __apply_processors(self, iterator, cast=True):

        # Apply processors to iterator
        def builtin_processor(extended_rows):
            for row_number, headers, row in extended_rows:
                if self.__schema and cast:
                    row = self.__schema.cast_row(row)
                yield (row_number, headers, row)

        processors = [builtin_processor] + self.__post_cast
        for processor in processors:
            iterator = processor(iterator)

        return iterator
Example #14
def spreadsheet_file_format(path, encoding):  # noqa: C901
    encoding = encoding or 'utf-8'
    _s = Stream(path, encoding=encoding)
    _s.open()
    _s.close()
    return _s.format if _s.format != 'inline' else None
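For example (hypothetical path; tabulator infers the format from the source):

fmt = spreadsheet_file_format('data/table.xlsx', None)  # encoding falls back to 'utf-8'
# fmt is e.g. 'xlsx', or None if the source was treated as inline data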
Example #15
class Table(object):
    """Table representation

    # Arguments
      source (str/list[]): data source one of:
        - local file (path)
        - remote file (url)
        - array of arrays representing the rows
      schema (any): data schema in all forms supported by `Schema` class
      strict (bool): strictness option to pass to `Schema` constructor
      post_cast (function[]): list of post cast processors
      storage (None): storage name like `sql` or `bigquery`
      options (dict): `tabulator` or storage's options

    # Raises
      TableSchemaException: raises on any error

    """

    # Public

    def __init__(self,
                 source,
                 schema=None,
                 strict=False,
                 post_cast=[],
                 storage=None,
                 **options):

        # Set attributes
        self.__source = source
        self.__stream = None
        self.__schema = None
        self.__headers = None
        self.__storage = None
        self.__post_cast = copy(post_cast)

        # Schema
        if isinstance(schema, Schema):
            self.__schema = schema
        elif schema is not None:
            self.__schema = Schema(schema)

        # Stream (tabulator)
        if storage is None:
            options.setdefault('headers', 1)
            self.__stream = Stream(source, **options)

        # Stream (storage)
        else:
            if not isinstance(storage, Storage):
                storage = Storage.connect(storage, **options)
            if self.__schema:
                storage.describe(source, self.__schema.descriptor)
            headers = Schema(storage.describe(source)).field_names
            self.__stream = Stream(partial(storage.iter, source),
                                   headers=headers)
            self.__storage = storage

    @property
    def headers(self):
        """Table's headers is available

        # Returns
            str[]: headers

        """
        return self.__headers

    @property
    def schema(self):
        """Returns schema class instance if available

        # Returns
            Schema: schema

        """
        return self.__schema

    @property
    def size(self):
        """Table's size in BYTES if it's available

        It is available once the table has been read, e.g. via `table.read`; until then it returns `None`.
        In the middle of an iteration it returns the size of the already read contents.

        # Returns
            int/None: size in BYTES

        """
        if self.__stream:
            return self.__stream.size

    @property
    def hash(self):
        """Table's SHA256 hash if it's available.

        It is available once the table has been read, e.g. via `table.read`; until then it returns `None`.
        In the middle of an iteration it returns the hash of the already read contents.

        # Returns
            str/None: SHA256 hash

        """
        if self.__stream:
            return self.__stream.hash

    def iter(self,
             keyed=False,
             extended=False,
             cast=True,
             integrity=False,
             relations=False,
             foreign_keys_values=False,
             exc_handler=None):
        """Iterates through the table data and emits rows cast based on table schema.

        # Arguments

            keyed (bool):
                yield keyed rows in a form of `{header1\\: value1, header2\\: value2}`
                (default is false; the form of rows is `[value1, value2]`)

            extended (bool):
                yield extended rows in a form of `[rowNumber, [header1, header2], [value1, value2]]`
                (default is false; the form of rows is `[value1, value2]`)

            cast (bool):
                disable data casting if false
                (default is true)

            integrity (dict):
                dictionary in a form of `{'size'\\: <bytes>, 'hash'\\: '<sha256>'}`
                to check integrity of the table when it's read completely.
                Both keys are optional.

            relations (dict):
                dictionary of foreign key references in a form
                of `{resource1\\: [{field1\\: value1, field2\\: value2}, ...], ...}`.
                If provided, foreign key fields will be checked and resolved
                to one of their references (/!\\ one-to-many fk are not completely resolved).

            foreign_keys_values (dict):
                three-level dictionary of foreign key references optimized
                to speed up validation process in a form of
                `{resource1\\: {(fk_field1, fk_field2)\\: {(value1, value2)\\: {one_keyedrow}, ... }}}`.
                If not provided but relations is true, it will be created
                before the validation process by the *index_foreign_keys_values* method

            exc_handler (func):
                optional custom exception handler callable.
                Can be used to defer raising errors (i.e. "fail late"), e.g.
                for data validation purposes. Must support the signature below

        # Custom exception handler

        ```python
        def exc_handler(exc, row_number=None, row_data=None, error_data=None):
            '''Custom exception handler (example)

            # Arguments:
                exc(Exception):
                    Deferred exception instance
                row_number(int):
                    Data row number that triggers exception exc
                row_data(OrderedDict):
                    Invalid data row source data
                error_data(OrderedDict):
                    Data row source data field subset responsible for the error, if
                    applicable (e.g. invalid primary or foreign key fields). May be
                    identical to row_data.
            '''
            # ...
        ```

        # Raises
            TableSchemaException: base class of any error
            CastError: data cast error
            IntegrityError: integrity checking error
            UniqueKeyError: unique key constraint violation
            UnresolvedFKError: unresolved foreign key reference error

        # Returns
            Iterator[list]: yields rows

        """
        # TODO: Use helpers.default_exc_handler instead. Prerequisite: Use
        # stream context manager to make sure the stream gets properly closed
        # in all situations, see comment below.
        if exc_handler is None:
            stream = self.__stream

            def exc_handler(exc, *args, **kwargs):
                stream.close()
                raise exc

        # Prepare unique checks
        if cast:
            unique_fields_cache = {}
            if self.schema:
                unique_fields_cache = _create_unique_fields_cache(self.schema)
        # Prepare relation checks
        if relations and not foreign_keys_values:
            # we have to test relations but the index has not been precomputed
            # prepare the index to boost validation process
            foreign_keys_values = self.index_foreign_keys_values(relations)

        # Open/iterate stream
        # TODO: Use context manager instead to make sure stream gets closed in
        # case of exceptions. Leaving that in for now for the sake of a smaller
        # diff.
        self.__stream.open()
        iterator = self.__stream.iter(extended=True)
        iterator = self.__apply_processors(iterator,
                                           cast=cast,
                                           exc_handler=exc_handler)
        for row_number, headers, row in iterator:

            # Get headers
            if not self.__headers:
                self.__headers = headers

            # Check headers
            if cast:
                if self.schema and self.headers:
                    if self.headers != self.schema.field_names:
                        message = ('Table headers (%r) don\'t match '
                                   'schema field names (%r) in row %s' %
                                   (self.headers, self.schema.field_names,
                                    row_number))
                        keyed_row = OrderedDict(zip(headers, row))
                        exc_handler(exceptions.CastError(message),
                                    row_number=row_number,
                                    row_data=keyed_row,
                                    error_data=keyed_row)
                        continue

            # Check unique
            if cast:
                for indexes, cache in unique_fields_cache.items():
                    keyed_values = OrderedDict((headers[i], value)
                                               for i, value in enumerate(row)
                                               if i in indexes)
                    values = tuple(keyed_values.values())
                    if not all(map(lambda value: value is None, values)):
                        if values in cache['data']:
                            message = ('Field(s) "%s" duplicates in row "%s" '
                                       'for values %r' %
                                       (cache['name'], row_number, values))
                            exc_handler(exceptions.UniqueKeyError(message),
                                        row_number=row_number,
                                        row_data=OrderedDict(zip(headers,
                                                                 row)),
                                        error_data=keyed_values)
                        cache['data'].add(values)

            # Resolve relations
            if relations:
                if self.schema:
                    row_with_relations = dict(zip(headers, copy(row)))
                    for foreign_key in self.schema.foreign_keys:
                        refValue = _resolve_relations(row, headers,
                                                      foreign_keys_values,
                                                      foreign_key)
                        if refValue is None:
                            keyed_row = OrderedDict(zip(headers, row))
                            # local values of the FK
                            local_keyed_values = {
                                key: keyed_row[key]
                                for key in foreign_key['fields']
                            }
                            local_values = tuple(local_keyed_values.values())
                            message = (
                                'Foreign key "%s" violation in row "%s": '
                                '%s not found in %s' %
                                (foreign_key['fields'], row_number,
                                 local_values,
                                 foreign_key['reference']['resource']))
                            exc_handler(exceptions.UnresolvedFKError(message),
                                        row_number=row_number,
                                        row_data=keyed_row,
                                        error_data=local_keyed_values)
                            # If we reach this point we don't fail-early
                            # i.e. no exception has been raised. As the
                            # reference can't be resolved, use empty dict
                            # as the "unresolved result".
                            for field in foreign_key['fields']:
                                if not isinstance(row_with_relations[field],
                                                  dict):
                                    row_with_relations[field] = {}
                        elif type(refValue) is dict:
                            # Substitute resolved referenced object for
                            # original referencing field value.
                            # For a composite foreign key, this substitutes
                            # each part of the composite key with the
                            # referenced object.
                            for field in foreign_key['fields']:
                                if type(row_with_relations[field]) is not dict:
                                    # no previous refValues injected on this field
                                    row_with_relations[field] = refValue
                                else:
                                    # already one ref, merging
                                    row_with_relations[field].update(refValue)
                        else:
                            # case when all original values of the FK are empty
                            # refValue == row, there is nothing to do
                            # an empty dict might be a better return value for this case?
                            pass

                    #  mutate row now that we are done, in the right order
                    row = [row_with_relations[f] for f in headers]

            # Form row
            if extended:
                yield (row_number, headers, row)
            elif keyed:
                yield dict(zip(headers, row))
            else:
                yield row

        # Check integrity
        if integrity:
            violations = []
            size = integrity.get('size')
            hash = integrity.get('hash')
            if size and size != self.__stream.size:
                violations.append('size "%s"' % self.__stream.size)
            if hash and hash != self.__stream.hash:
                violations.append('hash "%s"' % self.__stream.hash)
            if violations:
                message = 'Calculated %s differ(s) from declared value(s)'
                raise exceptions.IntegrityError(message %
                                                ' and '.join(violations))

        # Close stream
        self.__stream.close()

    def read(self,
             keyed=False,
             extended=False,
             cast=True,
             limit=None,
             integrity=False,
             relations=False,
             foreign_keys_values=False,
             exc_handler=None):
        """Read the whole table and return as array of rows

        > It has the same API as `table.iter` except for

        # Arguments
            limit (int): limit count of rows to read and return

        # Returns
            list[]: returns rows

        """
        result = []
        rows = self.iter(keyed=keyed,
                         extended=extended,
                         cast=cast,
                         integrity=integrity,
                         relations=relations,
                         foreign_keys_values=foreign_keys_values,
                         exc_handler=exc_handler)
        for count, row in enumerate(rows, start=1):
            result.append(row)
            if count == limit:
                break
        return result

    def infer(self, limit=100, confidence=0.75):
        """Infer a schema for the table.

        It will infer and set Table Schema to `table.schema` based on table data.

        # Arguments
            limit (int): limit rows sample size
            confidence (float):
                how many casting errors are allowed
                (as a ratio, between 0 and 1)

        # Returns
            dict: Table Schema descriptor

        """
        if self.__schema is None or self.__headers is None:

            # Infer (tabulator)
            if not self.__storage:
                with self.__stream as stream:
                    if self.__schema is None:
                        self.__schema = Schema()
                        self.__schema.infer(stream.sample[:limit],
                                            headers=stream.headers,
                                            confidence=confidence)
                    if self.__headers is None:
                        self.__headers = stream.headers

            # Infer (storage)
            else:
                descriptor = self.__storage.describe(self.__source)
                if self.__schema is None:
                    self.__schema = Schema(descriptor)
                if self.__headers is None:
                    self.__headers = self.__schema.field_names

        return self.__schema.descriptor

    def save(self, target, storage=None, **options):
        """Save data source to file locally in CSV format with `,` (comma) delimiter

        > To save schema use `table.schema.save()`

        # Arguments
            target (str): saving target (e.g. file path)
            storage (None/str): storage name like `sql` or `bigquery`
            options (dict): `tabulator` or storage options

        # Raises
            TableSchemaException: raises an error if there is saving problem

        # Returns
            True/Storage: returns true or storage instance

        """

        # Save (tabulator)
        if storage is None:
            with Stream(self.iter, headers=self.__schema.headers) as stream:
                stream.save(target, **options)
            return True

        # Save (storage)
        else:
            if not isinstance(storage, Storage):
                storage = Storage.connect(storage, **options)
            storage.create(target, self.__schema.descriptor, force=True)
            storage.write(target, self.iter(cast=False))
            return storage

    def index_foreign_keys_values(self, relations):
        """Creates a three-level dictionary of foreign key references

        We create them optimized to speed up validation process in a form of
        `{resource1: {(fk_field1, fk_field2): {(value1, value2): {one_keyedrow}, ... }}}`.

        For each foreign key of the schema it will iterate through the corresponding
        `relations['resource']` to create an index (i.e. a dict) of existing values
        for the foreign fields and store one keyed row for each value combination.

        The optimization relies on the indexation of possible values for one foreign key
        in a hashmap to later speed up resolution.

        This method is public to allow creating the index once and applying it
        to multiple tables sharing the same schema
        (typically [grouped resources in datapackage](https://github.com/frictionlessdata/datapackage-py#group))

        # Notes

        - the second key of the output is a tuple of the foreign fields,
            a proxy identifier of the foreign key
        - the same relation resource can be indexed multiple times
            as a schema can contain more than one foreign key
            pointing to the same resource

        # Arguments
            relations (dict):
                dict of foreign key references in a form of
                `{resource1\\: [{field1\\: value1, field2\\: value2}, ...], ...}`.
                It must contain all resources pointed in the foreign keys schema definition.

        # Returns
            dict:
                returns a three-level dictionary of foreign key references
                optimized to speed up validation process in a form of
                `{resource1\\: {(fk_field1, fk_field2)\\: {(value1, value2)\\: {one_keyedrow}, ... }}})`

        """

        # we don't need to load the complete reference table to test relations
        # we can lower payload AND optimize testing foreign keys
        # by preparing the right index based on the foreign key definition
        # foreign_keys are sets of tuples of all possible values in the foreign table
        # foreign keys =
        # [reference] [foreign_keys tuple] = { (foreign_keys_values, ) : one_keyedrow, ... }
        foreign_keys = defaultdict(dict)
        if self.schema:
            for fk in self.schema.foreign_keys:
                # load relation data
                relation = fk['reference']['resource']

                # create a set of foreign keys
                # to optimize we prepare index of existing values
                # this index should use reference + foreign_keys as key
                # cause many foreign keys may use the same reference
                foreign_keys[relation][tuple(fk['reference']['fields'])] = {}
                for row in relations[relation]:
                    key = tuple([
                        row[foreign_field]
                        for foreign_field in fk['reference']['fields']
                    ])
                    # here we should choose to pick the first or nth row which matches
                    # previous implementation picked the first, so be it
                    if key not in foreign_keys[relation][tuple(
                            fk['reference']['fields'])]:
                        foreign_keys[relation][tuple(
                            fk['reference']['fields'])][key] = row
        return foreign_keys

    # Private

    def __apply_processors(self, iterator, cast=True, exc_handler=None):

        # Apply processors to iterator
        def builtin_processor(extended_rows):
            for row_number, headers, row in extended_rows:
                if self.__schema and cast:
                    row = self.__schema.cast_row(row,
                                                 row_number=row_number,
                                                 exc_handler=exc_handler)
                yield (row_number, headers, row)

        processors = [builtin_processor] + self.__post_cast
        for processor in processors:
            iterator = processor(iterator)

        return iterator
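To make the exc_handler contract above concrete, here is a hedged sketch that defers cast errors instead of failing on the first bad row (data.csv and the schema are hypothetical):

errors = []

def collecting_handler(exc, row_number=None, row_data=None, error_data=None):
    # Record the problem instead of raising, so iteration continues
    errors.append((row_number, str(exc)))

table = Table('data.csv', schema={'fields': [{'name': 'id', 'type': 'integer'},
                                             {'name': 'name', 'type': 'string'}]})
rows = table.read(keyed=True, exc_handler=collecting_handler)
print('%d rows read, %d cast errors deferred' % (len(rows), len(errors)))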
Example #16
class Table(object):

    # Public

    def __init__(self, source, schema=None, strict=False,
                 post_cast=[], storage=None, **options):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """

        # Set attributes
        self.__source = source
        self.__stream = None
        self.__schema = None
        self.__headers = None
        self.__storage = None
        self.__post_cast = copy(post_cast)

        # Schema
        if schema is not None:
            self.__schema = Schema(schema)

        # Stream (tabulator)
        if storage is None:
            options.setdefault('headers', 1)
            self.__stream = Stream(source, **options)

        # Stream (storage)
        else:
            if not isinstance(storage, Storage):
                storage = Storage.connect(storage, **options)
            if self.__schema:
                storage.describe(source, self.__schema.descriptor)
            headers = Schema(storage.describe(source)).field_names
            self.__stream = Stream(partial(storage.iter, source), headers=headers)
            self.__storage = storage

    @property
    def headers(self):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """
        return self.__headers

    @property
    def schema(self):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """
        return self.__schema

    def iter(self, keyed=False, extended=False, cast=True, relations=False):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """

        # Prepare unique checks
        if cast:
            unique_fields_cache = {}
            if self.schema:
                unique_fields_cache = _create_unique_fields_cache(self.schema)

        # Open/iterate stream
        self.__stream.open()
        iterator = self.__stream.iter(extended=True)
        iterator = self.__apply_processors(iterator, cast=cast)
        for row_number, headers, row in iterator:

            # Get headers
            if not self.__headers:
                self.__headers = headers

            # Check headers
            if cast:
                if self.schema and self.headers:
                    if self.headers != self.schema.field_names:
                        self.__stream.close()
                        message = 'Table headers don\'t match schema field names'
                        raise exceptions.CastError(message)

            # Check unique
            if cast:
                for indexes, cache in unique_fields_cache.items():
                    values = tuple(value for i, value in enumerate(row) if i in indexes)
                    if not all(map(lambda value: value is None, values)):
                        if values in cache['data']:
                            self.__stream.close()
                            message = 'Field(s) "%s" duplicates in row "%s"'
                            message = message % (cache['name'], row_number)
                            raise exceptions.CastError(message)
                        cache['data'].add(values)

            # Resolve relations
            if relations:
                if self.schema:
                    for foreign_key in self.schema.foreign_keys:
                        row = _resolve_relations(row, headers, relations, foreign_key)
                        if row is None:
                            self.__stream.close()
                            message = 'Foreign key "%s" violation in row "%s"'
                            message = message % (foreign_key['fields'], row_number)
                            raise exceptions.RelationError(message)

            # Form row
            if extended:
                yield (row_number, headers, row)
            elif keyed:
                yield dict(zip(headers, row))
            else:
                yield row

        # Close stream
        self.__stream.close()

    def read(self, keyed=False, extended=False, cast=True, relations=False, limit=None):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """
        result = []
        rows = self.iter(keyed=keyed, extended=extended, cast=cast, relations=relations)
        for count, row in enumerate(rows, start=1):
            result.append(row)
            if count == limit:
                break
        return result

    def infer(self, limit=100):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """
        if self.__schema is None or self.__headers is None:

            # Infer (tabulator)
            if not self.__storage:
                with self.__stream as stream:
                    if self.__schema is None:
                        self.__schema = Schema()
                        self.__schema.infer(stream.sample[:limit], headers=stream.headers)
                    if self.__headers is None:
                        self.__headers = stream.headers

            # Infer (storage)
            else:
                descriptor = self.__storage.describe(self.__source)
                if self.__schema is None:
                    self.__schema = Schema(descriptor)
                if self.__headers is None:
                    self.__headers = self.__schema.field_names

        return self.__schema.descriptor

    def save(self, target, storage=None, **options):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """

        # Save (tabulator)
        if storage is None:
            with Stream(self.iter, headers=self.__schema.headers) as stream:
                stream.save(target, **options)
            return True

        # Save (storage)
        else:
            if not isinstance(storage, Storage):
                storage = Storage.connect(storage, **options)
            storage.create(target, self.__schema.descriptor, force=True)
            storage.write(target, self.iter(cast=False))
            return storage

    # Private

    def __apply_processors(self, iterator, cast=True):

        # Apply processors to iterator
        def builtin_processor(extended_rows):
            for row_number, headers, row in extended_rows:
                if self.__schema and cast:
                    row = self.__schema.cast_row(row)
                yield (row_number, headers, row)
        processors = [builtin_processor] + self.__post_cast
        for processor in processors:
            iterator = processor(iterator)

        return iterator