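The snippets on this page are shown without their module-level imports. A minimal, assumed preamble for the test examples follows (the Storage class appears to come from the tableschema-pandas package; the import_manifests example further down additionally relies on os, Flask's current_app, tabulator's Stream, and project-local helpers such as validate_manifest and create_record, which are not reproduced here):

# Assumed imports for the test snippets below; module names are inferred
# from how the objects are used and are not confirmed by the source.
from collections import OrderedDict
from decimal import Decimal

import pandas as pd
import pytest
import tableschema
from tableschema_pandas import Storage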
def test_storage_multiple_writes():
    index = pd.Index([1, 2], name='key')
    df = pd.DataFrame([('a',), ('b',)], columns=('value',), index=index)
    storage = Storage(dataframes={'data': df})
    storage.write('data', [(2, 'x'), (3, 'y')])
    assert list(storage.read('data')) == [
        [1, 'a'],
        [2, 'b'],
        [2, 'x'],
        [3, 'y'],
    ]
def test_storage_init_tables():
    data = [
        (1, 'a'),
        (2, 'b'),
    ]
    df = pd.DataFrame(data, columns=('key', 'value'))
    storage = Storage(dataframes={'data': df})
    assert list(storage.read('data')) == [[1, 'a'], [2, 'b']]
    assert storage.describe('data') == {
        'fields': [
            {'name': 'key', 'type': 'integer'},
            {'name': 'value', 'type': 'string'},
        ]
    }
def test_storage_restore_schema_with_primary_key():
    data = [
        ('a',),
        ('b',),
    ]
    index = pd.Index([1, 2], name='key')
    df = pd.DataFrame(data, columns=('value',), index=index)
    storage = Storage(dataframes={'data': df})
    assert list(storage.read('data')) == [[1, 'a'], [2, 'b']]
    assert storage.describe('data') == {
        'primaryKey': 'key',
        'fields': [
            {'name': 'key', 'type': 'integer', 'constraints': {'required': True}},
            {'name': 'value', 'type': 'string'},
        ]
    }
def test_storage_composite_primary_key():
    schema = {
        'fields': [
            {'name': 'field1', 'type': 'string'},
            {'name': 'field2', 'type': 'string'},
            {'name': 'field3', 'type': 'string'},
        ],
        'primaryKey': ['field1', 'field2'],
    }
    data = [['value1', 'value2', 'value3']]
    storage = Storage()
    storage.create('bucket', schema)
    storage.write('bucket', data)
    assert storage['bucket'].to_dict() == {
        'field3': {('value1', 'value2'): 'value3'},
    }
def test_storage_table_without_primary_key():
    schema = {
        'fields': [
            {'name': 'a', 'type': 'integer'},
            {'name': 'b', 'type': 'string'},
        ]
    }
    data = [[1, 'x'], [2, 'y']]

    storage = Storage()
    storage.create('data', schema)
    storage.write('data', data)
    assert list(storage.read('data')) == data
def import_manifests(source_files):
    """Loop through the source files and stream them into a dataframe.

    The dataframe is converted to a list of manifest dicts.
    """
    # Set up the storage functions for pandas dataframes
    storage = Storage()
    storage.create('data', {
        'primaryKey': 'name',
        'fields': [
            {'name': 'name', 'type': 'string'},
            {'name': 'metapath', 'type': 'string'},
            {'name': 'namespace', 'type': 'string'},
            {'name': 'title', 'type': 'string'},
            {'name': 'id', 'type': 'string'},
            {'name': '_id', 'type': 'string'},
            {'name': 'description', 'type': 'string'},
            {'name': 'version', 'type': 'string'},
            {'name': 'shortTitle', 'type': 'string'},
            {'name': 'label', 'type': 'string'},
            {'name': 'notes', 'type': 'string'},
            {'name': 'keywords', 'type': 'string'},
            {'name': 'image', 'type': 'string'},
            {'name': 'publisher', 'type': 'string'},
            {'name': 'webpage', 'type': 'string'},
            {'name': 'authors', 'type': 'string'},
            {'name': 'date', 'type': 'string'},
            {'name': 'edition', 'type': 'string'},
            {'name': 'contentType', 'type': 'string'},
            {'name': 'country', 'type': 'string'},
            {'name': 'language', 'type': 'string'},
            {'name': 'citation', 'type': 'string'}
        ]
    })
    path = os.path.join('app', current_app.config['UPLOAD_FOLDER'])
    error_list = []
    print('source_files')
    print(source_files)
    for item in source_files:
        if item.endswith('.xlsx') or item.endswith('.xls'):
            options = {'format': 'xlsx', 'sheet': 1, 'headers': 1}
        else:
            options = {'headers': 1}
        filepath = os.path.join(path, item)
        with Stream(filepath, **options) as stream:
            # Check that the table headings match the Sources schema
            expected_headers = ['name', 'metapath', 'namespace', 'title',
                                'id', '_id', 'description', 'version',
                                'shortTitle', 'label', 'notes', 'keywords',
                                'image', 'publisher', 'webpage', 'authors',
                                'date', 'edition', 'contentType', 'country',
                                'language', 'citation']
            if stream.headers != expected_headers:
                col_order = ', '.join(expected_headers)
                error_list.append('Error: The table headings in ' + item +
                                  ' do not match the Sources schema. Please use the headings '
                                  + col_order + ' in that order.')
        with Stream(filepath, **options) as stream:
            try:
                storage.write('data', stream)
            except Exception:
                error_list.append('Error: Could not stream tabular data.')
    os.remove(filepath)
    manifests = []
    properties = {}
    data_dict = storage['data'].to_dict('index')
    print(data_dict)
    for key, values in data_dict.items():
        properties = {k: v for k, v in values.items() if v is not None}
        properties = {k: v.replace('\\n', '\n') for k, v in properties.items()}
        properties['name'] = key
        properties['namespace'] = 'we1sv2.0'
        properties['metapath'] = 'Sources'
        if validate_manifest(properties) is True:
            manifests.append(properties)
        else:
            error_list.append('Could not produce a valid manifest for <code>' + key + '</code>.')
    # Now we're ready to insert into the database
    print(manifests)
    for manifest in manifests:
        db_errors = create_record(manifest)
        error_list = error_list + db_errors
    return manifests, error_list
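A hypothetical call site for import_manifests; the file name and surrounding code are illustrative only, and the uploaded files must already be saved in the app's UPLOAD_FOLDER before the call:

# Hypothetical usage sketch; requires an active Flask application context
# because import_manifests reads current_app.config['UPLOAD_FOLDER'].
with app.app_context():
    manifests, errors = import_manifests(['sources.xlsx'])
    for message in errors:
        print(message)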
def test_storage_read_missing_table():
    storage = Storage()
    with pytest.raises(tableschema.exceptions.StorageError) as excinfo:
        list(storage.read('data'))
    assert str(excinfo.value) == 'Bucket "data" doesn\'t exist.'
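The test_storage function below exercises fixtures (ARTICLES, COMMENTS, TEMPORAL, LOCATION, COMPOUND) and a cast helper that are defined elsewhere in the test module and are not shown on this page. A minimal sketch of the expected fixture shape, with hypothetical values, inferred from how the test reads it:

# Hypothetical fixture sketch; only the shape is inferred from the test.
# The actual values live in the test module's fixture definitions.
ARTICLES = {
    'schema': {
        'fields': [
            {'name': 'id', 'type': 'integer', 'constraints': {'required': True}},
            {'name': 'parent', 'type': 'integer'},
            {'name': 'name', 'type': 'string'},
            {'name': 'current', 'type': 'boolean'},
            {'name': 'rating', 'type': 'number'},
        ],
        'primaryKey': 'id',
    },
    'data': [
        [1, None, 'first', True, 0.5],
        [2, 1, 'second', False, 0.7],
    ],
}

The cast helper presumably converts these raw rows into the typed Python values the storage yields on read, which is why the assertions compare against cast(ARTICLES)['data'] rather than ARTICLES['data'] directly.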
def test_storage():

    # Create storage
    storage = Storage()

    # Delete buckets
    storage.delete()

    # Create buckets
    storage.create(['articles', 'comments'], [ARTICLES['schema'], COMMENTS['schema']])
    storage.create('comments', COMMENTS['schema'], force=True)
    storage.create('temporal', TEMPORAL['schema'])
    storage.create('location', LOCATION['schema'])
    storage.create('compound', COMPOUND['schema'])

    # Write data
    storage.write('articles', ARTICLES['data'])
    storage.write('comments', COMMENTS['data'])
    storage.write('temporal', TEMPORAL['data'])
    storage.write('location', LOCATION['data'])
    storage.write('compound', COMPOUND['data'])

    # Create new storage to use reflection only
    dataframes = OrderedDict()
    dataframes['articles'] = storage['articles']
    dataframes['comments'] = storage['comments']
    dataframes['temporal'] = storage['temporal']
    dataframes['location'] = storage['location']
    dataframes['compound'] = storage['compound']
    storage = Storage(dataframes=dataframes)

    # Create existent bucket
    with pytest.raises(tableschema.exceptions.StorageError):
        storage.create('articles', ARTICLES['schema'])

    # Assert buckets
    assert storage.buckets == ['articles', 'comments', 'compound', 'location', 'temporal']

    # Assert schemas
    assert storage.describe('articles') == {
        'fields': [
            {'name': 'id', 'type': 'integer', 'constraints': {'required': True}},
            {'name': 'parent', 'type': 'number'}, # type downgrade
            {'name': 'name', 'type': 'string'},
            {'name': 'current', 'type': 'boolean'},
            {'name': 'rating', 'type': 'number'},
        ],
        'primaryKey': 'id',
    }
    assert storage.describe('comments') == {
        'fields': [
            {'name': 'entry_id', 'type': 'integer', 'constraints': {'required': True}},
            {'name': 'comment', 'type': 'string'},
            {'name': 'note', 'type': 'string'}, # type downgrade
        ],
        'primaryKey': 'entry_id',
    }
    assert storage.describe('temporal') == {
        'fields': [
            {'name': 'date', 'type': 'date'},
            {'name': 'date_year', 'type': 'date'}, # format removal
            {'name': 'datetime', 'type': 'datetime'},
            {'name': 'duration', 'type': 'duration'},
            {'name': 'time', 'type': 'time'},
            {'name': 'year', 'type': 'integer'}, # type downgrade
            {'name': 'yearmonth', 'type': 'array'}, # type downgrade
        ],
    }
    assert storage.describe('location') == {
        'fields': [
            {'name': 'location', 'type': 'object'}, # type downgrade
            {'name': 'geopoint', 'type': 'array'}, # type downgrade
        ],
    }
    assert storage.describe('compound') == COMPOUND['schema']

    assert storage.read('articles') == cast(ARTICLES)['data']
    assert storage.read('comments') == cast(COMMENTS)['data']
    assert storage.read('temporal') == cast(TEMPORAL, wrap={'yearmonth': list})['data']
    assert storage.read('location') == cast(LOCATION, wrap_each={'geopoint': Decimal})['data']
    assert storage.read('compound') == cast(COMPOUND)['data']

    # Assert data with forced schema
    storage.describe('compound', COMPOUND['schema'])
    assert storage.read('compound') == cast(COMPOUND)['data']

    # Delete non existent bucket
    with pytest.raises(tableschema.exceptions.StorageError):
        storage.delete('non_existent')

    # Delete buckets
    storage.delete()