Example #1
0
def test_extract_url():
    """Extract a source from ``CSV_URL`` into a newly created package.

    The package lives in a collection opened with the ``'file'`` store type,
    rooted at the directory returned by ``mkdtemp()``. The test passes when
    ``extract.from_url`` returns a source whose path mentions the CSV name.
    """
    storage_dir = mkdtemp()
    collection = open_collection('test', 'file', path=storage_dir)
    pkg = collection.create()

    source = extract.from_url(pkg, CSV_URL)

    assert source is not None, source
    assert 'barnet-2009.csv' in source.path, source
Example #2
0
def test_extract_url():
    """Check that ``extract.from_url`` yields a source named after the CSV.

    A collection is opened with the ``'file'`` store type on the directory
    returned by ``mkdtemp()``; one package is created in it and used as the
    extraction target for ``CSV_URL``.
    """
    # Directory handed to the 'file' store for this run.
    root = mkdtemp()
    target = open_collection('test', 'file', path=root).create()

    extracted = extract.from_url(target, CSV_URL)

    assert extracted is not None, extracted
    assert 'barnet-2009.csv' in extracted.path, extracted
Example #3
0
def test_basic_api():
    """Exercise iterate, create and get on a collection opened as ``'s3'``.

    Expects the collection to iterate as empty at the start, to contain
    exactly one package after ``create``, and to return a package equal to
    the created one from ``get``.
    """
    coll = open_collection('test', 's3', bucket_name='test.mapthemoney.org')
    count_before = len(list(coll))
    assert not count_before, count_before

    pkg = coll.create(manifest={'test': 'value'})
    count_after = len(list(coll))
    assert count_after == 1, count_after
    assert pkg.id is not None, pkg.id

    # The manifest passed to create() must be readable from the package.
    assert pkg.manifest['test'] == 'value'

    fetched = coll.get(pkg.id)
    assert fetched == pkg, fetched
Example #4
0
def test_basic_api():
    """Verify the create / iterate / get round trip of a collection.

    The collection is opened with the ``'s3'`` store type. It must iterate
    as empty before a package is created and hold one package afterwards;
    ``get`` with the new package's id must compare equal to that package.
    """
    bucket = 'test.mapthemoney.org'
    collection = open_collection('test', 's3', bucket_name=bucket)

    initial_size = len(list(collection))
    assert not initial_size, initial_size

    created = collection.create(manifest={'test': 'value'})

    size_after_create = len(list(collection))
    assert size_after_create == 1, size_after_create
    assert created.id is not None, created.id
    assert created.manifest['test'] == 'value'

    looked_up = collection.get(created.id)
    assert looked_up == created, looked_up
Example #5
0
 def create_app(self):
     """Create the web app with test settings and install a test collection.

     After the app is built, ``data_manager._coll`` is replaced with a
     collection opened with the ``'file'`` store type on a directory obtained
     from ``tempfile.mkdtemp()``.

     Returns the object produced by ``create_web_app``.
     """
     settings = {
         'DEBUG': True,
         'TESTING': True,
         'SITE_TITLE': 'SpenDB',
         'SQLALCHEMY_DATABASE_URI': 'sqlite:///:memory:',
         'PRESERVE_CONTEXT_ON_EXCEPTION': False,
         'CELERY_ALWAYS_EAGER': True
     }
     app = create_web_app(**settings)

     # Override the data manager's cached collection directly.
     scratch_dir = tempfile.mkdtemp()
     data_manager._coll = open_collection('test', 'file', path=scratch_dir)
     return app
Example #6
0
 def collection(self):
     """Return the ``'datasets'`` collection, opening it on first use.

     Returns None when ``self.configured`` is falsy. Otherwise the collection
     cached in ``self._coll`` is returned; if none is cached yet, one is
     opened with the ``'s3'`` store type using the ``AWS_KEY_ID``,
     ``AWS_SECRET`` and ``AWS_DATA_BUCKET`` entries of ``self.app.config``.
     """
     if not self.configured:
         return None
     if self._coll is not None:
         return self._coll

     config = self.app.config
     self._coll = open_collection(
         'datasets', 's3',
         aws_key_id=config.get('AWS_KEY_ID'),
         aws_secret=config.get('AWS_SECRET'),
         bucket_name=config.get('AWS_DATA_BUCKET'))
     return self._coll
Example #7
0
def test_extract_file():
    """Extract ``CSV_FIXTURE`` into a new package and inspect its contents.

    After ``extract.from_file`` the package must list exactly one ``Source``
    and no ``Table``, and the returned source's path must mention the CSV
    file name.
    """
    coll = open_collection('test', 's3', bucket_name='test.mapthemoney.org')
    pkg = coll.create()
    source = extract.from_file(pkg, CSV_FIXTURE)
    assert source is not None, source

    found_sources = list(pkg.all(Source))
    assert len(found_sources) == 1, found_sources

    # No Table is expected to exist right after extraction.
    tables = list(pkg.all(Table))
    assert not tables, tables

    assert 'barnet-2009.csv' in source.path, source
Example #8
0
def test_extract_file():
    """Check the package contents after extracting ``CSV_FIXTURE``.

    The package is created in a collection opened with the ``'s3'`` store
    type. One ``Source`` and zero ``Table`` entries are expected, and the
    source path must contain ``'barnet-2009.csv'``.
    """
    bucket = 'test.mapthemoney.org'
    target = open_collection('test', 's3', bucket_name=bucket).create()

    extracted = extract.from_file(target, CSV_FIXTURE)
    assert extracted is not None, extracted

    source_list = list(target.all(Source))
    assert len(source_list) == 1, source_list

    table_list = list(target.all(Table))
    assert not table_list, table_list

    assert 'barnet-2009.csv' in extracted.path, extracted
Example #9
0
 def create_app(self):
     """Build the web app for tests and point the data manager at a test store.

     The app is created through ``create_web_app`` with the test settings
     below. ``data_manager._coll`` is then overwritten with a collection
     opened with the ``'file'`` store type on a ``tempfile.mkdtemp()``
     directory.

     Returns the app created by ``create_web_app``.
     """
     web_app = create_web_app(
         DEBUG=True,
         TESTING=True,
         SITE_TITLE='SpenDB',
         SQLALCHEMY_DATABASE_URI='sqlite:///:memory:',
         PRESERVE_CONTEXT_ON_EXCEPTION=False,
         CELERY_ALWAYS_EAGER=True)

     # Replace whatever collection the data manager had cached.
     storage_root = tempfile.mkdtemp()
     data_manager._coll = open_collection('test', 'file', path=storage_root)
     return web_app
Example #10
0
 def collection(self):
     """Return the ``'datasets'`` collection, opening it on first access.

     The store type comes from the ``STORAGE_TYPE`` entry of
     ``self.app.config`` (default ``'s3'``):

     * ``'s3'`` passes ``AWS_KEY_ID``, ``AWS_SECRET`` and ``AWS_DATA_BUCKET``
       from the config to ``open_collection``.
     * ``'file'`` passes ``STORAGE_PATH`` (default ``'data'``) as ``path``.

     Returns:
         None when ``self.configured`` is falsy, otherwise the collection
         cached in ``self._coll``.

     Raises:
         ValueError: if ``STORAGE_TYPE`` is neither ``'s3'`` nor ``'file'``.
     """
     if not self.configured:
         return
     if self._coll is None:
         env = self.app.config
         storage_type = env.get('STORAGE_TYPE', 's3')
         if storage_type == 's3':
             args = {
                 'aws_key_id': env.get('AWS_KEY_ID'),
                 'aws_secret': env.get('AWS_SECRET'),
                 'bucket_name': env.get('AWS_DATA_BUCKET')
             }
         elif storage_type == 'file':
             args = {
                 'path': env.get('STORAGE_PATH', 'data')
             }
         else:
             # Without this branch ``args`` stays unbound and the call below
             # fails with an UnboundLocalError that hides the config mistake.
             raise ValueError(
                 'Unsupported STORAGE_TYPE: %r' % (storage_type,))
         self._coll = open_collection('datasets', storage_type, **args)
     return self._coll
Example #11
0
def test_parse_with_dates():
    """Run a ``table_extract`` pipeline over ``GPC_FIXTURE`` and check dates.

    After processing, the package must hold a single ``Table`` named
    ``'table.json'`` with 23 records, and the first record's
    ``transaction_date`` must be a ``date`` instance.
    """
    pipeline_config = {'process': {'table': {'operator': 'table_extract'}}}

    coll = open_collection('test', 's3', bucket_name='test.mapthemoney.org')
    pkg = coll.create()
    extract.from_file(pkg, GPC_FIXTURE)
    Pipeline(coll, 'foo', pipeline_config).process_package(pkg)

    tables = list(pkg.all(Table))
    assert len(tables) == 1, tables
    table = tables[0]
    assert table.name == 'table.json'

    records = list(table.records())
    assert len(records) == 23, len(records)
    assert isinstance(records[0]['transaction_date'], date)
Example #12
0
def test_parse_with_dates():
    """Check that the ``table_extract`` operator produces ``date`` values.

    ``GPC_FIXTURE`` is extracted into a new package, which is then run
    through a one-step pipeline. The resulting single table must be named
    ``'table.json'``, contain 23 records, and expose ``transaction_date`` of
    the first record as a ``date``.
    """
    bucket = 'test.mapthemoney.org'
    collection = open_collection('test', 's3', bucket_name=bucket)
    target = collection.create()
    extract.from_file(target, GPC_FIXTURE)

    # Single processing step called 'table' using the table_extract operator.
    steps = {'table': {'operator': 'table_extract'}}
    runner = Pipeline(collection, 'foo', {'process': steps})
    runner.process_package(target)

    produced = list(target.all(Table))
    assert len(produced) == 1, produced
    first_table = produced[0]
    assert first_table.name == 'table.json'

    rows = list(first_table.records())
    assert len(rows) == 23, len(rows)
    assert isinstance(rows[0]['transaction_date'], date)
Example #13
0
        raise click.ClickException("Cannot parse pipeline: %s" % e)
    if 'config' not in config:
        config['config'] = {}

    collections = ctx.pop('collections', [])
    config['config'].update(ctx)
    config['config']['threads'] = ctx.pop('threads', None)

    collection_configs = config['config'].pop('collections', {})
    if not len(collections):
        collections = collection_configs.keys()
    collections = [c for c in collections if c in collection_configs]

    for cname in collections:
        cconfig = collection_configs.get(cname)
        coll = open_collection(cname, cconfig.pop('type'), **cconfig)
        try:
            pipeline = Pipeline(coll, fh.name, config=config)
            getattr(pipeline, operation)()
        except LoadKitException, de:
            raise click.ClickException(unicode(de))


@click.group()
@click.option('-c',
              '--collections',
              default=None,
              nargs=-1,
              help='The configured collection name to use.')
@click.option('-t',
              '--threads',
Example #14
0
def test_open_collection():
    """Check that opening an ``'s3'`` collection wires up an ``S3Store``.

    The collection's store must be an ``S3Store`` instance whose bucket is
    named after the ``bucket_name`` argument.
    """
    from archivekit import open_collection

    opened = open_collection('test', 's3', bucket_name='foo')
    store = opened.store
    assert isinstance(store, S3Store), store
    assert store.bucket.name == 'foo', store.bucket
Example #15
0
File: cli.py Project: 01-/loadkit
        raise click.ClickException("Cannot parse pipeline: %s" % e)
    if 'config' not in config:
        config['config'] = {}

    collections = ctx.pop('collections', [])
    config['config'].update(ctx)
    config['config']['threads'] = ctx.pop('threads', None)

    collection_configs = config['config'].pop('collections', {})
    if not len(collections):
        collections = collection_configs.keys()
    collections = [c for c in collections if c in collection_configs]

    for cname in collections:
        cconfig = collection_configs.get(cname)
        coll = open_collection(cname, cconfig.pop('type'), **cconfig)
        try:
            pipeline = Pipeline(coll, fh.name, config=config)
            getattr(pipeline, operation)()
        except LoadKitException, de:
            raise click.ClickException(unicode(de))


@click.group()
@click.option('-c', '--collections', default=None, nargs=-1,
              help='The configured collection name to use.')
@click.option('-t', '--threads', default=None, type=int,
              help='Number of threads to process data')
@click.option('-d', '--debug', default=False, is_flag=True,
              help='Verbose output for debugging')
@click.pass_context