def test_storage_restore_schema_with_primary_key():
    data = [
        ('a',),
        ('b',),
    ]
    index = pd.Index([1, 2], name='key')
    df = pd.DataFrame(data, columns=('value',), index=index)
    storage = Storage(dataframes={'data': df})
    assert list(storage.read('data')) == [[1, 'a'], [2, 'b']]
    assert storage.describe('data') == {
        'primaryKey': 'key',
        'fields': [
            {'name': 'key', 'type': 'integer', 'constraints': {'required': True}},
            {'name': 'value', 'type': 'string'},
        ]
    }
def test_storage_multiple_writes():
    index = pd.Index([1, 2], name='key')
    df = pd.DataFrame([('a',), ('b',)], columns=('value',), index=index)
    storage = Storage(dataframes={'data': df})
    storage.write('data', [(2, 'x'), (3, 'y')])
    assert list(storage.read('data')) == [
        [1, 'a'],
        [2, 'b'],
        [2, 'x'],
        [3, 'y'],
    ]
def test_storage_init_tables():
    data = [
        (1, 'a'),
        (2, 'b'),
    ]
    df = pd.DataFrame(data, columns=('key', 'value'))
    storage = Storage(dataframes={'data': df})
    assert list(storage.read('data')) == [[1, 'a'], [2, 'b']]
    assert storage.describe('data') == {
        'fields': [
            {'name': 'key', 'type': 'integer'},
            {'name': 'value', 'type': 'string'},
        ]
    }
def test_storage_composite_primary_key():
    schema = {
        'fields': [
            {'name': 'field1', 'type': 'string'},
            {'name': 'field2', 'type': 'string'},
            {'name': 'field3', 'type': 'string'},
        ],
        'primaryKey': ['field1', 'field2'],
    }
    data = [['value1', 'value2', 'value3']]
    storage = Storage()
    storage.create('bucket', schema)
    storage.write('bucket', data)
    assert storage['bucket'].to_dict() == {
        'field3': {('value1', 'value2'): 'value3'}
    }
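# The expected to_dict() output in the test above implies that a composite
# primaryKey is mapped onto a pandas MultiIndex. A minimal sketch of the
# equivalent DataFrame, assuming that mapping; the construction below is
# illustrative only and is not the storage implementation itself:
import pandas as pd

index = pd.MultiIndex.from_tuples([('value1', 'value2')], names=['field1', 'field2'])
df = pd.DataFrame({'field3': ['value3']}, index=index)
assert df.to_dict() == {'field3': {('value1', 'value2'): 'value3'}}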
def test_storage_table_without_primary_key():
    schema = {
        'fields': [
            {'name': 'a', 'type': 'integer'},
            {'name': 'b', 'type': 'string'},
        ]
    }
    data = [[1, 'x'], [2, 'y']]
    storage = Storage()
    storage.create('data', schema)
    storage.write('data', data)
    assert list(storage.read('data')) == data
def import_manifests(source_files):
    """Loop through the source files and stream them into a dataframe.

    The dataframe is then converted into a list of manifest dicts.
    """
    # Set up the storage functions for pandas dataframes
    storage = Storage()
    storage.create('data', {
        'primaryKey': 'name',
        'fields': [
            {'name': 'name', 'type': 'string'},
            {'name': 'metapath', 'type': 'string'},
            {'name': 'namespace', 'type': 'string'},
            {'name': 'title', 'type': 'string'},
            {'name': 'id', 'type': 'string'},
            {'name': '_id', 'type': 'string'},
            {'name': 'description', 'type': 'string'},
            {'name': 'version', 'type': 'string'},
            {'name': 'shortTitle', 'type': 'string'},
            {'name': 'label', 'type': 'string'},
            {'name': 'notes', 'type': 'string'},
            {'name': 'keywords', 'type': 'string'},
            {'name': 'image', 'type': 'string'},
            {'name': 'publisher', 'type': 'string'},
            {'name': 'webpage', 'type': 'string'},
            {'name': 'authors', 'type': 'string'},
            {'name': 'date', 'type': 'string'},
            {'name': 'edition', 'type': 'string'},
            {'name': 'contentType', 'type': 'string'},
            {'name': 'country', 'type': 'string'},
            {'name': 'language', 'type': 'string'},
            {'name': 'citation', 'type': 'string'}
        ]
    })
    path = os.path.join('app', current_app.config['UPLOAD_FOLDER'])
    error_list = []
    expected_headers = [
        'name', 'metapath', 'namespace', 'title', 'id', '_id', 'description',
        'version', 'shortTitle', 'label', 'notes', 'keywords', 'image',
        'publisher', 'webpage', 'authors', 'date', 'edition', 'contentType',
        'country', 'language', 'citation'
    ]
    for item in source_files:
        if item.endswith('.xlsx') or item.endswith('.xls'):
            options = {'format': 'xlsx', 'sheet': 1, 'headers': 1}
        else:
            options = {'headers': 1}
        filepath = os.path.join(path, item)
        # Make sure the table headings match the Sources schema
        with Stream(filepath, **options) as stream:
            if stream.headers != expected_headers:
                col_order = ', '.join(expected_headers)
                error_list.append('Error: The table headings in ' + item +
                                  ' do not match the Sources schema. Please use the headings ' +
                                  col_order + ' in that order.')
        # Stream the tabular data into the storage bucket
        with Stream(filepath, **options) as stream:
            try:
                storage.write('data', stream)
            except Exception:
                error_list.append('Error: Could not stream tabular data.')
        os.remove(filepath)
    # Convert the dataframe rows to manifest dicts and validate them
    manifests = []
    data_dict = storage['data'].to_dict('index')
    for key, values in data_dict.items():
        properties = {k: v for k, v in values.items() if v is not None}
        properties = {k: v.replace('\\n', '\n') for k, v in properties.items()}
        properties['name'] = key
        properties['namespace'] = 'we1sv2.0'
        properties['metapath'] = 'Sources'
        if validate_manifest(properties) is True:
            manifests.append(properties)
        else:
            error_list.append('Could not produce a valid manifest for <code>' + key + '</code>.')
    # Now we're ready to insert into the database
    for manifest in manifests:
        db_errors = create_record(manifest)
        error_list = error_list + db_errors
    return manifests, error_list
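# A minimal sketch of how import_manifests() might be called from a Flask upload
# view. The blueprint, route, and 'source_files' form field name are assumptions
# for illustration; only the (manifests, error_list) return contract comes from
# the function above.
import os
from flask import Blueprint, current_app, flash, request
from werkzeug.utils import secure_filename

corpus = Blueprint('corpus', __name__)  # hypothetical blueprint

@corpus.route('/import-sources', methods=['POST'])
def import_sources():
    # Save the uploaded spreadsheets into the configured upload folder
    path = os.path.join('app', current_app.config['UPLOAD_FOLDER'])
    source_files = []
    for f in request.files.getlist('source_files'):
        filename = secure_filename(f.filename)
        f.save(os.path.join(path, filename))
        source_files.append(filename)
    # Convert the uploads into validated manifests and collect any errors
    manifests, errors = import_manifests(source_files)
    for message in errors:
        flash(message)
    return {'imported': len(manifests), 'errors': errors}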
def test_storage_read_missing_table():
    storage = Storage()
    with pytest.raises(tableschema.exceptions.StorageError) as excinfo:
        list(storage.read('data'))
    assert str(excinfo.value) == 'Bucket "data" doesn\'t exist.'
def test_storage():

    # Create storage
    storage = Storage()

    # Delete buckets
    storage.delete()

    # Create buckets
    storage.create(['articles', 'comments'], [ARTICLES['schema'], COMMENTS['schema']])
    storage.create('comments', COMMENTS['schema'], force=True)
    storage.create('temporal', TEMPORAL['schema'])
    storage.create('location', LOCATION['schema'])
    storage.create('compound', COMPOUND['schema'])

    # Write data
    storage.write('articles', ARTICLES['data'])
    storage.write('comments', COMMENTS['data'])
    storage.write('temporal', TEMPORAL['data'])
    storage.write('location', LOCATION['data'])
    storage.write('compound', COMPOUND['data'])

    # Create new storage to use reflection only
    dataframes = OrderedDict()
    dataframes['articles'] = storage['articles']
    dataframes['comments'] = storage['comments']
    dataframes['temporal'] = storage['temporal']
    dataframes['location'] = storage['location']
    dataframes['compound'] = storage['compound']
    storage = Storage(dataframes=dataframes)

    # Create existent bucket
    with pytest.raises(tableschema.exceptions.StorageError):
        storage.create('articles', ARTICLES['schema'])

    # Assert buckets
    assert storage.buckets == ['articles', 'comments', 'compound', 'location', 'temporal']

    # Assert schemas
    assert storage.describe('articles') == {
        'fields': [
            {'name': 'id', 'type': 'integer', 'constraints': {'required': True}},
            {'name': 'parent', 'type': 'number'},  # type downgrade
            {'name': 'name', 'type': 'string'},
            {'name': 'current', 'type': 'boolean'},
            {'name': 'rating', 'type': 'number'},
        ],
        'primaryKey': 'id',
    }
    assert storage.describe('comments') == {
        'fields': [
            {'name': 'entry_id', 'type': 'integer', 'constraints': {'required': True}},
            {'name': 'comment', 'type': 'string'},
            {'name': 'note', 'type': 'string'},  # type downgrade
        ],
        'primaryKey': 'entry_id',
    }
    assert storage.describe('temporal') == {
        'fields': [
            {'name': 'date', 'type': 'date'},
            {'name': 'date_year', 'type': 'date'},  # format removal
            {'name': 'datetime', 'type': 'datetime'},
            {'name': 'duration', 'type': 'duration'},
            {'name': 'time', 'type': 'time'},
            {'name': 'year', 'type': 'integer'},  # type downgrade
            {'name': 'yearmonth', 'type': 'array'},  # type downgrade
        ],
    }
    assert storage.describe('location') == {
        'fields': [
            {'name': 'location', 'type': 'object'},  # type downgrade
            {'name': 'geopoint', 'type': 'array'},  # type downgrade
        ],
    }
    assert storage.describe('compound') == COMPOUND['schema']

    assert storage.read('articles') == cast(ARTICLES)['data']
    assert storage.read('comments') == cast(COMMENTS)['data']
    assert storage.read('temporal') == cast(TEMPORAL, wrap={'yearmonth': list})['data']
    assert storage.read('location') == cast(LOCATION, wrap_each={'geopoint': Decimal})['data']
    assert storage.read('compound') == cast(COMPOUND)['data']

    # Assert data with forced schema
    storage.describe('compound', COMPOUND['schema'])
    assert storage.read('compound') == cast(COMPOUND)['data']

    # Delete non existent bucket
    with pytest.raises(tableschema.exceptions.StorageError):
        storage.delete('non_existent')

    # Delete buckets
    storage.delete()
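# test_storage() relies on module-level fixtures (ARTICLES, COMMENTS, TEMPORAL,
# LOCATION, COMPOUND) and a cast() helper defined elsewhere in the test module.
# A minimal sketch of such a helper, assuming it applies Table Schema field
# casting to each fixture row and optionally wraps selected values; the fixture
# shape and the wrap/wrap_each keywords are assumptions inferred from how they
# are used in the asserts above:
from copy import deepcopy
import tableschema

def cast(resource, wrap=None, wrap_each=None):
    """Return a copy of the fixture with every value cast to its schema type."""
    wrap = wrap or {}
    wrap_each = wrap_each or {}
    resource = deepcopy(resource)
    schema = tableschema.Schema(resource['schema'])
    for row in resource['data']:
        for index, field in enumerate(schema.fields):
            value = field.cast_value(row[index])
            if field.name in wrap:
                # e.g. wrap a yearmonth tuple as a list
                value = wrap[field.name](value)
            if field.name in wrap_each:
                # e.g. wrap each geopoint coordinate as a Decimal
                value = [wrap_each[field.name](item) for item in value]
            row[index] = value
    return resource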