def test_extract_url():
    """Fetching a remote CSV into a fresh file-backed package records its path."""
    collection = open_collection('test', 'file', path=mkdtemp())
    pkg = collection.create()
    source = extract.from_url(pkg, CSV_URL)
    assert source is not None, source
    assert 'barnet-2009.csv' in source.path, source
def test_basic_api():
    """Basic collection round-trip: create a package, then look it up by id."""
    coll = open_collection('test', 's3', bucket_name='test.mapthemoney.org')
    assert len(list(coll)) == 0, len(list(coll))
    pkg = coll.create(manifest={'test': 'value'})
    assert len(list(coll)) == 1, len(list(coll))
    assert pkg.id is not None, pkg.id
    assert pkg.manifest['test'] == 'value'
    assert coll.get(pkg.id) == pkg, coll.get(pkg.id)
def create_app(self):
    """Build a SpenDB web app configured for testing (in-memory DB, eager Celery)."""
    settings = {
        'DEBUG': True,
        'TESTING': True,
        'SITE_TITLE': 'SpenDB',
        'SQLALCHEMY_DATABASE_URI': 'sqlite:///:memory:',
        'PRESERVE_CONTEXT_ON_EXCEPTION': False,
        'CELERY_ALWAYS_EAGER': True
    }
    app = create_web_app(**settings)
    # Point the shared data manager at a throwaway on-disk collection.
    data_manager._coll = open_collection('test', 'file',
                                         path=tempfile.mkdtemp())
    return app
def collection(self):
    """Lazily open and cache the S3-backed dataset collection.

    Returns ``None`` while the manager is unconfigured; otherwise opens
    the collection on first access and reuses it afterwards.
    """
    if not self.configured:
        return
    if self._coll is None:
        conf = self.app.config
        s3_args = dict(aws_key_id=conf.get('AWS_KEY_ID'),
                       aws_secret=conf.get('AWS_SECRET'),
                       bucket_name=conf.get('AWS_DATA_BUCKET'))
        self._coll = open_collection('datasets', 's3', **s3_args)
    return self._coll
def test_extract_file():
    """Extracting a local CSV yields exactly one source artifact and no tables."""
    coll = open_collection('test', 's3', bucket_name='test.mapthemoney.org')
    pkg = coll.create()
    src = extract.from_file(pkg, CSV_FIXTURE)
    assert src is not None, src
    found_sources = list(pkg.all(Source))
    assert len(found_sources) == 1, found_sources
    found_tables = list(pkg.all(Table))
    assert len(found_tables) == 0, found_tables
    assert 'barnet-2009.csv' in src.path, src
def create_app(self):
    """Construct the SpenDB web app wired for the test suite."""
    app = create_web_app(
        DEBUG=True,
        TESTING=True,
        SITE_TITLE='SpenDB',
        SQLALCHEMY_DATABASE_URI='sqlite:///:memory:',
        PRESERVE_CONTEXT_ON_EXCEPTION=False,
        CELERY_ALWAYS_EAGER=True)
    # Swap the global collection for a temporary file-based one.
    data_manager._coll = open_collection('test', 'file',
                                         path=tempfile.mkdtemp())
    return app
def collection(self):
    """Return the dataset collection, opening it on first access.

    The storage backend is chosen via the ``STORAGE_TYPE`` config
    setting (``'s3'`` by default, or ``'file'`` for local disk).
    Returns ``None`` while the manager is unconfigured; the opened
    collection is cached on ``self._coll``.

    Raises:
        ValueError: if ``STORAGE_TYPE`` names an unknown backend.
    """
    if not self.configured:
        return
    if self._coll is None:
        env = self.app.config
        storage_type = env.get('STORAGE_TYPE', 's3')
        if storage_type == 's3':
            args = {
                'aws_key_id': env.get('AWS_KEY_ID'),
                'aws_secret': env.get('AWS_SECRET'),
                'bucket_name': env.get('AWS_DATA_BUCKET')
            }
        elif storage_type == 'file':
            args = {
                'path': env.get('STORAGE_PATH', 'data')
            }
        else:
            # Previously an unrecognized backend fell through with `args`
            # unbound and crashed with a NameError; fail loudly instead.
            raise ValueError('Unknown STORAGE_TYPE: %r' % storage_type)
        self._coll = open_collection('datasets', storage_type, **args)
    return self._coll
def test_parse_with_dates():
    """Running the table_extract operator yields one table with typed dates."""
    coll = open_collection('test', 's3', bucket_name='test.mapthemoney.org')
    pkg = coll.create()
    extract.from_file(pkg, GPC_FIXTURE)
    config = {'process': {'table': {'operator': 'table_extract'}}}
    Pipeline(coll, 'foo', config).process_package(pkg)
    tables = list(pkg.all(Table))
    assert len(tables) == 1, tables
    table = tables[0]
    assert table.name == 'table.json'
    rows = list(table.records())
    assert len(rows) == 23, len(rows)
    assert isinstance(rows[0]['transaction_date'], date)
def test_parse_with_dates():
    """table_extract should produce a single table artifact with parsed dates."""
    index = open_collection('test', 's3', bucket_name='test.mapthemoney.org')
    package = index.create()
    extract.from_file(package, GPC_FIXTURE)
    operator_conf = {'operator': 'table_extract'}
    pipeline = Pipeline(index, 'foo', {'process': {'table': operator_conf}})
    pipeline.process_package(package)
    found = list(package.all(Table))
    assert len(found) == 1, found
    first = found[0]
    assert first.name == 'table.json'
    records = list(first.records())
    assert len(records) == 23, len(records)
    assert isinstance(records[0]['transaction_date'], date)
# NOTE(review): incomplete fragment of a click-based CLI runner — it begins
# mid-function (this `raise` sits inside an exception handler whose `try`
# is outside this chunk) and it is cut off inside an unterminated
# `@click.option('-t', '--threads', ...)` call, so the code is left
# byte-identical.  What the visible part does: merges CLI context values
# into the pipeline config, resolves which configured collections to run,
# opens each with open_collection(), and dispatches `operation` on a
# Pipeline, converting LoadKitException into a click.ClickException.
# NOTE(review): `except LoadKitException, de:` and `unicode(...)` are
# Python 2-only syntax; `nargs=-1` is not valid for @click.option
# (variadic nargs is an @click.argument feature; options want
# `multiple=True`) — TODO confirm against the click version in use.
raise click.ClickException("Cannot parse pipeline: %s" % e) if 'config' not in config: config['config'] = {} collections = ctx.pop('collections', []) config['config'].update(ctx) config['config']['threads'] = ctx.pop('threads', None) collection_configs = config['config'].pop('collections', {}) if not len(collections): collections = collection_configs.keys() collections = [c for c in collections if c in collection_configs] for cname in collections: cconfig = collection_configs.get(cname) coll = open_collection(cname, cconfig.pop('type'), **cconfig) try: pipeline = Pipeline(coll, fh.name, config=config) getattr(pipeline, operation)() except LoadKitException, de: raise click.ClickException(unicode(de)) @click.group() @click.option('-c', '--collections', default=None, nargs=-1, help='The configured collection name to use.') @click.option('-t', '--threads',
def test_open_collection():
    """open_collection('s3') builds an S3Store bound to the requested bucket."""
    from archivekit import open_collection
    coll = open_collection('test', 's3', bucket_name='foo')
    store = coll.store
    assert isinstance(store, S3Store), store
    assert store.bucket.name == 'foo', store.bucket
# NOTE(review): incomplete fragment of a click-based CLI runner — it begins
# mid-function (this `raise` sits inside an exception handler whose `try`
# is outside this chunk) and it ends on a dangling decorator chain with no
# decorated function in view, so the code is left byte-identical.  The
# visible part merges CLI context into the pipeline config, resolves the
# configured collections to operate on, opens each via open_collection(),
# and dispatches `operation` on a Pipeline, wrapping LoadKitException in a
# click.ClickException.
# NOTE(review): `except LoadKitException, de:` and `unicode(...)` are
# Python 2-only syntax; `nargs=-1` is not valid for @click.option
# (variadic nargs is an @click.argument feature; options want
# `multiple=True`) — TODO confirm against the click version in use.
raise click.ClickException("Cannot parse pipeline: %s" % e) if 'config' not in config: config['config'] = {} collections = ctx.pop('collections', []) config['config'].update(ctx) config['config']['threads'] = ctx.pop('threads', None) collection_configs = config['config'].pop('collections', {}) if not len(collections): collections = collection_configs.keys() collections = [c for c in collections if c in collection_configs] for cname in collections: cconfig = collection_configs.get(cname) coll = open_collection(cname, cconfig.pop('type'), **cconfig) try: pipeline = Pipeline(coll, fh.name, config=config) getattr(pipeline, operation)() except LoadKitException, de: raise click.ClickException(unicode(de)) @click.group() @click.option('-c', '--collections', default=None, nargs=-1, help='The configured collection name to use.') @click.option('-t', '--threads', default=None, type=int, help='Number of threads to process data') @click.option('-d', '--debug', default=False, is_flag=True, help='Verbose output for debugging') @click.pass_context