def test_extract_url():
    """Check that extracting from ``CSV_URL`` yields a usable source.

    Opens the 'test' collection with the 's3' store, creates a fresh
    package and verifies that the extracted source exists and that its
    path contains the expected CSV file name.
    """
    collection = open_collection(
        'test', 's3', bucket_name='test.mapthemoney.org')
    pkg = collection.create()

    source = extract.from_url(pkg, CSV_URL)

    assert source is not None, source
    assert 'barnet-2009.csv' in source.path, source
def test_parse_with_dates():
    """Check that the GPC fixture is turned into a table with parsed dates.

    The fixture is extracted into a new package, transformed into an
    artifact named 'table', and its records are inspected: 23 records are
    expected and the first record's 'transaction_date' must be a ``date``.
    """
    collection = open_collection(
        'test', 's3', bucket_name='test.mapthemoney.org')
    pkg = collection.create()
    source = extract.from_file(pkg, GPC_FIXTURE)

    table = transform.to_table(source, 'table')
    assert table.name == 'table'

    # Materialise the records so they can be counted and indexed.
    records = list(table.records())
    assert len(records) == 23, len(records)

    first = records[0]
    assert isinstance(first['transaction_date'], date)
def store_contract_text(barndir, txtdir):
    """Write the extracted text of every known contract to ``txtdir``.

    Opens the 'knowncontracts' file collection stored at ``barndir`` and,
    for each package in it, picks an extractor based on the lower-cased
    suffix of the source name, runs it over the source data and writes the
    result to ``<txtdir>/<package id>.txt`` as UTF-8.

    Returns the opened collection.
    """
    collection = barn.open_collection('knowncontracts', 'file', path=barndir)

    for package in collection:
        source = package.source
        # Everything after the last '.' in the source name, lower-cased.
        suffix = source.name.rsplit('.')[-1].lower()

        # NOTE(review): the original comment here said this should fail
        # fast on an unknown file type, but the lookup falls back to
        # `donothing` instead of raising -- confirm which is intended.
        extract_text = extractors.get(suffix, donothing)
        contents = extract_text(source.data())

        target = '%s/%s.txt' % (txtdir, package.id)
        # errors='ignore': characters that cannot be encoded are dropped.
        with codecs.open(target, 'w', 'utf-8', errors='ignore') as handle:
            handle.write(contents)

    return collection
def test_basic_api():
    """Exercise the create / iterate / get operations of a collection.

    Starting from a collection that is expected to be empty, a package is
    created with a manifest; the test then checks that the collection
    lists exactly one package, that the package has an id and the given
    manifest value, and that looking it up by id returns an equal package.
    """
    collection = open_collection(
        'test', 's3', bucket_name='test.mapthemoney.org')

    # Nothing should be listed before the first package is created.
    assert len(list(collection)) == 0, len(list(collection))

    pkg = collection.create(manifest={'test': 'value'})

    assert len(list(collection)) == 1, len(list(collection))
    assert pkg.id is not None, pkg.id
    assert pkg.manifest['test'] == 'value'
    assert collection.get(pkg.id) == pkg, collection.get(pkg.id)
def test_extract_file():
    """Check that extracting a local CSV fixture registers one source.

    After ``extract.from_file`` the package must contain exactly one
    ``Source`` and no ``Artifact``, and the returned source's path must
    contain the fixture's file name.
    """
    collection = open_collection(
        'test', 's3', bucket_name='test.mapthemoney.org')
    pkg = collection.create()

    source = extract.from_file(pkg, CSV_FIXTURE)
    assert source is not None, source

    # Extraction alone should add a source but produce no artifacts yet.
    found_sources = list(pkg.all(Source))
    assert len(found_sources) == 1, found_sources

    found_artifacts = list(pkg.all(Artifact))
    assert not found_artifacts, found_artifacts

    assert 'barnet-2009.csv' in source.path, source
def store_known_contracts(args):
    """Ingest known contract documents into the 'knowncontracts' collection.

    Two groups of inputs are ingested into the file collection stored at
    ``args.barndir``:

    1. every path matched by ``<pattern>/*`` for each pattern in
       ``args.include_text_directory``;
    2. every URL yielded by ``known_contracts(args.sheetid)``, after it
       has been passed through ``preprocess_url``. URLs for which
       ``preprocess_url`` returns ``None`` are skipped.

    Progress is reported on stdout. Nothing is returned.
    """
    coll = barn.open_collection('knowncontracts', 'file', path=args.barndir)

    # XXX warning: special-handling
    for pattern in args.include_text_directory:
        for path in glob.glob(pattern + '/*'):
            print('ingesting %s' % path)
            coll.ingest(path)

    for raw_url in known_contracts(args.sheetid):
        url = preprocess_url(raw_url)
        if url is None:
            # Report the URL as it was yielded: the preprocessed value is
            # None on this branch, so printing it would hide what was skipped.
            print('skipping %s' % raw_url)
            continue
        print('ingesting %s' % url)
        coll.ingest(url)

    print('done')
def test_open_collection():
    """Check that opening an 's3' collection wires up an ``S3Store``.

    The collection's store must be an ``S3Store`` instance whose bucket
    carries the name passed as ``bucket_name``.
    """
    from barn import open_collection

    collection = open_collection('test', 's3', bucket_name='foo')

    assert isinstance(collection.store, S3Store), collection.store
    assert collection.store.bucket.name == 'foo', collection.store.bucket