Example #1
0
    def __init__(self, source, schema=None, strict=False,
                 post_cast=[], storage=None, **options):
        """Prepare the schema and the row stream for ``source``.

        Args:
            source: data source given to ``Stream``; when ``storage`` is
                used, the name passed to the storage's ``describe``/``iter``.
            schema: optional descriptor wrapped into a ``Schema``.
            strict: accepted but not used by this constructor.
            post_cast: processors stored on the instance as a copy.
            storage: ``Storage`` instance, or a value for ``Storage.connect``.
            **options: forwarded to ``Stream`` (``headers`` defaults to 1)
                or, with a non-``Storage`` ``storage``, to ``Storage.connect``.
        """

        # Plain state first; stream and storage are filled in below
        self.__source = source
        self.__headers = None
        self.__post_cast = copy(post_cast)
        self.__schema = Schema(schema) if schema is not None else None
        self.__storage = None
        self.__stream = None

        if storage is not None:
            # Rows come from a storage backend
            backend = storage
            if not isinstance(backend, Storage):
                backend = Storage.connect(backend, **options)
            if self.__schema:
                backend.describe(source, self.__schema.descriptor)
            field_names = Schema(backend.describe(source)).field_names
            self.__stream = Stream(partial(backend.iter, source), headers=field_names)
            self.__storage = backend
        else:
            # Rows come straight from tabulator
            options.setdefault('headers', 1)
            self.__stream = Stream(source, **options)
Example #2
0
def test_stream_skip_rows_with_headers_example_from_readme():
    """The '#comment' row is skipped, so the header is the row after it."""
    rows = [['#comment'], ['name', 'order'], ['John', 1], ['Alex', 2]]
    expected_headers = ['name', 'order']
    expected_rows = [['John', 1], ['Alex', 2]]
    with Stream(rows, skip_rows=['#'], headers=1) as stream:
        assert stream.headers == expected_headers
        assert stream.read() == expected_rows
Example #3
0
def test_stream_skip_rows_excel_empty_column():
    """Read the skip-rows.xlsx fixture with ``headers=1`` and ``skip_rows=['']``."""
    path = 'data/special/skip-rows.xlsx'
    expected = [['A', 'B'], [8, 9]]
    with Stream(path, headers=1, skip_rows=['']) as stream:
        rows = stream.read()
    assert rows == expected
Example #4
0
def test_stream_bytes_sample_size():
    """The reported encoding changes when the byte sample is cut to 10 bytes."""
    path = 'data/special/latin1.csv'

    # Default sampling
    with Stream(path) as default_stream:
        assert default_stream.encoding == 'cp1252'

    # No row sample and only 10 bytes to look at
    limited = Stream(path, sample_size=0, bytes_sample_size=10)
    with limited as stream:
        assert stream.encoding == 'utf-8'
Example #5
0
def test_stream_html_content_with_allow_html():
    """With ``allow_html=True`` the HTML page opens and yields a truthy stream."""
    # Link to html file containing information about csv file
    url = 'https://github.com/frictionlessdata/tabulator-py/blob/master/data/table.csv'
    html_stream = Stream(url, allow_html=True)
    with html_stream as stream:
        assert stream
Example #6
0
def test_stream_encoding_utf_16():
    """UTF-16 bytes with a BOM are detected and decoded back to the text."""
    # Bytes encoded as UTF-16 with BOM in platform order is detected
    text = u'en,English\nja,日本語'
    buffer = io.BytesIO(text.encode('utf-16'))
    expected = [[u'en', u'English'], [u'ja', u'日本語']]
    with Stream(buffer, format='csv') as stream:
        assert stream.encoding == 'utf-16'
        assert stream.read() == expected
def test_stream_http_error():
    """Opening 'http://github.com/bad_path.csv' is expected to raise HTTPError."""
    broken = Stream('http://github.com/bad_path.csv')
    with pytest.raises(exceptions.HTTPError):
        broken.open()
Example #8
0
def test_stream_bad_options_warning():
    """An unknown option produces a UserWarning that names the option."""
    options = dict(scheme='text', format='csv', bad_option=True)

    # First open happens outside of the warning capture
    Stream('', **options).open()

    # Second open is the one whose warning is recorded and inspected
    with pytest.warns(UserWarning) as record:
        Stream('', **options).open()
    first_warning = record[0]
    assert 'bad_option' in str(first_warning.message.args[0])
def test_stream_format_error():
    """An unknown format raises FormatError mentioning the format name."""
    unsupported = Stream('', format='bad_format')
    with pytest.raises(exceptions.FormatError) as excinfo:
        unsupported.open()
    message = str(excinfo.value)
    assert 'bad_format' in message
def test_stream_io_error():
    """Opening 'bad_path.csv' raises IOError mentioning the path."""
    missing = Stream('bad_path.csv')
    with pytest.raises(exceptions.IOError) as excinfo:
        missing.open()
    message = str(excinfo.value)
    assert 'bad_path.csv' in message
def test_stream_scheme_error():
    """An unknown scheme raises SchemeError mentioning the scheme name."""
    unsupported = Stream('', scheme='bad_scheme')
    with pytest.raises(exceptions.SchemeError) as excinfo:
        unsupported.open()
    message = str(excinfo.value)
    assert 'bad_scheme' in message
def test_stream_format_error_html():
    """Opening the table.csv.html fixture as CSV raises FormatError."""
    html_as_csv = Stream('data/special/table.csv.html', format='csv')
    with pytest.raises(exceptions.FormatError):
        html_as_csv.open()
def test_stream_source_error_data():
    """Opening and reading the JSON text '[1,2]' raises SourceError."""
    json_text = '[1,2]'
    stream = Stream(json_text, scheme='text', format='json')
    # Either the open or the read may be the call that raises
    with pytest.raises(exceptions.SourceError):
        stream.open()
        stream.read()
Example #14
0
def test_stream_source_error_data():
    """Opening and reading the JSON text '[1,2]' raises SourceError."""
    json_text = '[1,2]'
    stream = Stream(json_text, scheme='text', format='json')
    # Either the open or the read may be the call that raises
    with pytest.raises(exceptions.SourceError):
        stream.open()
        stream.read()
def test_stream_gsheet_bad_url():
    """Opening a bad Google Sheets URL is expected to raise HTTPError."""
    bad_sheet = Stream('https://docs.google.com/spreadsheets/d/bad')
    with pytest.raises(exceptions.HTTPError):
        bad_sheet.open()
Example #16
0
def test_stream_scheme_error():
    """An unknown scheme raises SchemeError mentioning the scheme name."""
    unsupported = Stream('', scheme='bad_scheme')
    with pytest.raises(exceptions.SchemeError) as excinfo:
        unsupported.open()
    message = str(excinfo.value)
    assert 'bad_scheme' in message
Example #17
0
def test_stream_compression_error_gz():
    """Opening inline CSV text that mentions a ``.gz`` file name must not fail."""
    # The id cell is a literal ``1``. Writing it with a leading backslash
    # would be parsed as an octal escape and embed chr(1) in the data.
    source = 'id,filename\n1,dump.tar.gz'
    stream = Stream(source, scheme='text', format='csv')
    stream.open()
    # Release the stream once the open succeeded; nothing else is checked here.
    stream.close()
Example #18
0
def test_stream_http_error():
    """Opening 'http://github.com/bad_path.csv' is expected to raise HTTPError."""
    broken = Stream('http://github.com/bad_path.csv')
    with pytest.raises(exceptions.HTTPError):
        broken.open()
Example #19
0
def test_stream_compression_error_zip():
    """Opening inline CSV text that mentions a ``.zip`` file name must not fail."""
    source = 'id,filename\n1,archive.zip'
    stream = Stream(source, scheme='text', format='csv')
    stream.open()
    # Release the stream once the open succeeded; nothing else is checked here.
    stream.close()
Example #20
0
def test_stream_encoding_explicit_latin1():
    """``encoding='latin1'`` is reported as 'iso8859-1' and decodes the fixture."""
    path = 'data/special/latin1.csv'
    expected = [['id', 'name'], ['1', 'english'], ['2', '©']]
    with Stream(path, encoding='latin1') as stream:
        assert stream.encoding == 'iso8859-1'
        assert stream.read() == expected
Example #21
0
def test_stream_scheme_file():
    """The local path 'data/table.csv' is reported with the 'file' scheme."""
    local = Stream('data/table.csv')
    with local as stream:
        detected = stream.scheme
        assert detected == 'file'
Example #22
0
def test_stream_html_content():
    """Without ``allow_html`` the HTML page raises FormatError mentioning HTML."""
    # Link to html file containing information about csv file
    url = 'https://github.com/frictionlessdata/tabulator-py/blob/master/data/table.csv'
    with pytest.raises(exceptions.FormatError) as excinfo:
        Stream(url).open()
    message = str(excinfo.value)
    assert 'HTML' in message
Example #23
0
def test_stream_scheme_https():
    """A source built from ``BASE_URL`` is reported with the 'https' scheme."""
    url = BASE_URL % 'data/table.csv'
    with Stream(url) as stream:
        detected = stream.scheme
        assert detected == 'https'
Example #24
0
def test_stream_sample():
    """With ``headers=1`` the sample holds the rows after the header row."""
    data = [['id', 'name'], ['1', 'english'], ['2', '中国人']]
    expected_headers = ['id', 'name']
    expected_sample = [['1', 'english'], ['2', '中国人']]
    with Stream(data, headers=1) as stream:
        assert stream.headers == expected_headers
        assert stream.sample == expected_sample
Example #25
0
def test_stream_scheme_stream():
    """A binary file object source is reported with the 'stream' scheme."""
    # Own the file handle here so it is always closed when the test ends,
    # whatever Stream itself does with it.
    with io.open('data/table.csv', mode='rb') as handle:
        with Stream(handle, format='csv') as stream:
            assert stream.scheme == 'stream'
Example #26
0
def test_stream_skip_rows():
    """Read the skip-rows.csv fixture with ``skip_rows=['#', 5]``."""
    path = 'data/special/skip-rows.csv'
    expected = [['id', 'name'], ['1', 'english']]
    with Stream(path, skip_rows=['#', 5]) as stream:
        rows = stream.read()
    assert rows == expected
Example #27
0
def test_stream_scheme_text():
    """A 'text://' source is reported with the 'text' scheme."""
    inline = Stream('text://a\nb', format='csv')
    with inline as stream:
        detected = stream.scheme
        assert detected == 'text'
Example #28
0
def test_stream_skip_rows_with_headers():
    """Read the skip-rows.csv fixture with ``headers=1`` and ``skip_rows=['#']``."""
    path = 'data/special/skip-rows.csv'
    expected_headers = ['id', 'name']
    expected_rows = [['1', 'english'], ['2', '中国人']]
    with Stream(path, skip_rows=['#'], headers=1) as stream:
        assert stream.headers == expected_headers
        assert stream.read() == expected_rows
Example #29
0
def test_stream_format_ndjson():
    """The 'data/table.ndjson' source is reported with the 'ndjson' format."""
    ndjson_file = Stream('data/table.ndjson')
    with ndjson_file as stream:
        detected = stream.format
        assert detected == 'ndjson'
Example #30
0
def test_stream_json_property():
    """``property='root'`` reads the rows stored under the "root" key."""
    payload = '{"root": [["value1", "value2"], ["value3", "value4"]]}'
    expected = [['value1', 'value2'], ['value3', 'value4']]
    stream_options = dict(scheme='text', format='json', property='root')
    with Stream(payload, **stream_options) as stream:
        assert stream.read() == expected
Example #31
0
def test_stream_format_ods():
    """The 'data/table.ods' source is reported with the 'ods' format."""
    ods_file = Stream('data/table.ods')
    with ods_file as stream:
        detected = stream.format
        assert detected == 'ods'
Example #32
0
def test_stream_format_error_html():
    """Opening the table.csv.html fixture as CSV raises FormatError."""
    html_as_csv = Stream('data/special/table.csv.html', format='csv')
    with pytest.raises(exceptions.FormatError):
        html_as_csv.open()
Example #33
0
def test_stream_format_tsv():
    """The 'data/table.tsv' source is reported with the 'tsv' format."""
    tsv_file = Stream('data/table.tsv')
    with tsv_file as stream:
        detected = stream.format
        assert detected == 'tsv'
Example #34
0
def test_stream_format_error():
    """An unknown format raises FormatError mentioning the format name."""
    unsupported = Stream('', format='bad_format')
    with pytest.raises(exceptions.FormatError) as excinfo:
        unsupported.open()
    message = str(excinfo.value)
    assert 'bad_format' in message
Example #35
0
def test_stream_format_xlsx():
    """The 'data/table.xlsx' source is reported with the 'xlsx' format."""
    xlsx_file = Stream('data/table.xlsx')
    with xlsx_file as stream:
        detected = stream.format
        assert detected == 'xlsx'
Example #36
0
def test_stream_io_error():
    """Opening 'bad_path.csv' raises IOError mentioning the path."""
    missing = Stream('bad_path.csv')
    with pytest.raises(exceptions.IOError) as excinfo:
        missing.open()
    message = str(excinfo.value)
    assert 'bad_path.csv' in message
Example #37
0
def test_stream_encoding_explicit_utf8():
    """``encoding='utf-8'`` is reported unchanged and decodes the fixture."""
    path = 'data/table.csv'
    expected = [
        ['id', 'name'],
        ['1', 'english'],
        ['2', '中国人'],
    ]
    with Stream(path, encoding='utf-8') as stream:
        assert stream.encoding == 'utf-8'
        assert stream.read() == expected
Example #38
0
def test_stream_read_closed():
    """Reading before opening raises an error that points to ``stream.open()``."""
    unopened = Stream('data/table.csv')
    with pytest.raises(exceptions.TabulatorException) as excinfo:
        unopened.read()
    message = str(excinfo.value)
    assert 'stream.open()' in message
Example #39
0
class Table(object):
    """Rows of a tabular source, optionally cast and checked against a schema.

    Rows are read through a tabulator ``Stream``. When a ``storage`` is
    given, that stream is built on top of ``storage.iter`` instead of on
    the source itself.

    Project documentation: https://github.com/frictionlessdata/tableschema-py
    """

    # Public

    def __init__(self, source, schema=None, strict=False,
                 post_cast=[], storage=None, **options):
        """Prepare the schema and the row stream for ``source``.

        Args:
            source: data source given to ``Stream``; when ``storage`` is
                used, the name passed to the storage's ``describe``/``iter``.
            schema: optional descriptor wrapped into a ``Schema``.
            strict: accepted but not used by this constructor.
            post_cast: processors applied, in order, after the built-in
                cast processor. Each one is called with an iterator of
                ``(row_number, headers, row)`` and returns a new iterator.
            storage: ``Storage`` instance, or a value for ``Storage.connect``.
            **options: forwarded to ``Stream`` (``headers`` defaults to 1)
                or, with a non-``Storage`` ``storage``, to ``Storage.connect``.
        """

        # Set attributes
        self.__source = source
        self.__stream = None
        self.__schema = None
        self.__headers = None
        self.__storage = None
        # Copied, so neither the caller's list nor the shared default is aliased
        self.__post_cast = copy(post_cast)

        # Schema
        if schema is not None:
            self.__schema = Schema(schema)

        # Stream (tabulator)
        if storage is None:
            options.setdefault('headers', 1)
            self.__stream = Stream(source,  **options)

        # Stream (storage)
        else:
            if not isinstance(storage, Storage):
                storage = Storage.connect(storage, **options)
            if self.__schema:
                storage.describe(source, self.__schema.descriptor)
            headers = Schema(storage.describe(source)).field_names
            self.__stream = Stream(partial(storage.iter, source), headers=headers)
            self.__storage = storage

    @property
    def headers(self):
        """Headers seen so far; ``None`` until ``iter`` or ``infer`` set them."""
        return self.__headers

    @property
    def schema(self):
        """The ``Schema`` given at construction or inferred, else ``None``."""
        return self.__schema

    def iter(self, keyed=False, extended=False, cast=True, relations=False):
        """Yield the rows of the table one by one.

        The stream is opened when iteration starts and is always closed
        again: on exhaustion, on any error, and when the generator is
        closed before it is exhausted.

        Args:
            keyed: yield ``dict(zip(headers, row))`` instead of the row.
            extended: yield ``(row_number, headers, row)``; takes
                precedence over ``keyed``.
            cast: cast rows with the schema (when there is one) and check
                headers and unique constraints.
            relations: when truthy and a schema is set, passed to
                ``_resolve_relations`` for every foreign key of the schema.

        Raises:
            exceptions.CastError: headers differ from the schema field
                names, or a unique constraint is violated.
            exceptions.RelationError: a foreign key could not be resolved.
        """

        # Prepare unique checks
        unique_fields_cache = {}
        if cast and self.schema:
            unique_fields_cache = _create_unique_fields_cache(self.schema)

        # Open/iterate stream
        self.__stream.open()
        try:
            iterator = self.__stream.iter(extended=True)
            iterator = self.__apply_processors(iterator, cast=cast)
            for row_number, headers, row in iterator:

                # Get headers
                if not self.__headers:
                    self.__headers = headers

                # Check headers
                if cast:
                    if self.schema and self.headers:
                        if self.headers != self.schema.field_names:
                            message = 'Table headers don\'t match schema field names'
                            raise exceptions.CastError(message)

                # Check unique
                if cast:
                    for indexes, cache in unique_fields_cache.items():
                        values = tuple(value for i, value in enumerate(row) if i in indexes)
                        # Rows where every constrained cell is None are not checked
                        if any(value is not None for value in values):
                            if values in cache['data']:
                                message = 'Field(s) "%s" duplicates in row "%s"'
                                message = message % (cache['name'], row_number)
                                raise exceptions.CastError(message)
                            cache['data'].add(values)

                # Resolve relations
                if relations:
                    if self.schema:
                        for foreign_key in self.schema.foreign_keys:
                            row = _resolve_relations(row, headers, relations, foreign_key)
                            if row is None:
                                message = 'Foreign key "%s" violation in row "%s"'
                                message = message % (foreign_key['fields'], row_number)
                                raise exceptions.RelationError(message)

                # Form row
                if extended:
                    yield (row_number, headers, row)
                elif keyed:
                    yield dict(zip(headers, row))
                else:
                    yield row

        # Close stream (also on errors and on early generator close)
        finally:
            self.__stream.close()

    def read(self, keyed=False, extended=False, cast=True, relations=False, limit=None):
        """Return the rows produced by ``iter`` as a list.

        Args:
            keyed, extended, cast, relations: passed through to ``iter``.
            limit: stop after this many rows. Values that never equal a
                row count, such as ``None`` or ``0``, read everything.

        Returns:
            list: the collected rows, in the shape selected by
            ``keyed``/``extended``.
        """
        result = []
        rows = self.iter(keyed=keyed, extended=extended, cast=cast, relations=relations)
        try:
            for count, row in enumerate(rows, start=1):
                result.append(row)
                if count == limit:
                    break
        finally:
            # Finish the generator right away so the stream is closed even
            # when the loop stopped at ``limit``
            rows.close()
        return result

    def infer(self, limit=100):
        """Fill in a missing schema and/or headers and return the descriptor.

        Without a storage the stream is opened in a ``with`` block, the
        schema is inferred from at most ``limit`` sample rows and the
        headers are taken from the stream. With a storage both come from
        ``storage.describe``. Values that are already set are kept.

        Args:
            limit: maximum number of sample rows used for inference.

        Returns:
            The descriptor of the table's schema.
        """
        if self.__schema is None or self.__headers is None:

            # Infer (tabulator)
            if not self.__storage:
                with self.__stream as stream:
                    if self.__schema is None:
                        self.__schema = Schema()
                        self.__schema.infer(stream.sample[:limit], headers=stream.headers)
                    if self.__headers is None:
                        self.__headers = stream.headers

            # Infer (storage)
            else:
                descriptor = self.__storage.describe(self.__source)
                if self.__schema is None:
                    self.__schema = Schema(descriptor)
                if self.__headers is None:
                    self.__headers = self.__schema.field_names

        return self.__schema.descriptor

    def save(self, target, storage=None, **options):
        """Write the table to ``target``.

        The schema is read in both modes, so it must already be set
        (given at construction or produced by ``infer``).

        Args:
            target: destination passed to ``Stream.save`` or, with a
                storage, the name passed to ``create``/``write``.
            storage: ``Storage`` instance, or a value for ``Storage.connect``.
            **options: forwarded to ``Stream.save`` or ``Storage.connect``.

        Returns:
            ``True`` when saved through tabulator, otherwise the storage
            instance that was written to.
        """

        # Save (tabulator)
        if storage is None:
            with Stream(self.iter, headers=self.__schema.headers) as stream:
                stream.save(target, **options)
            return True

        # Save (storage)
        else:
            if not isinstance(storage, Storage):
                storage = Storage.connect(storage, **options)
            storage.create(target, self.__schema.descriptor, force=True)
            # Rows are handed over uncast
            storage.write(target, self.iter(cast=False))
            return storage

    # Private

    def __apply_processors(self, iterator, cast=True):
        """Wrap ``iterator`` with the cast processor and the post-cast ones."""

        # Apply processors to iterator
        def builtin_processor(extended_rows):
            for row_number, headers, row in extended_rows:
                if self.__schema and cast:
                    row = self.__schema.cast_row(row)
                yield (row_number, headers, row)
        processors = [builtin_processor] + self.__post_cast
        for processor in processors:
            iterator = processor(iterator)

        return iterator