def _sorter(rows, key_calc, reverse, batch_size):
    db = KVFile()
    db.insert(
        ((key_calc(row) + "{:08x}".format(row_num), row)
         for row_num, row in enumerate(rows)),
        batch_size=batch_size)
    for _, value in db.items(reverse=reverse):
        yield value
def _sorter(rows, key_calc, reverse, batch_size):
    db = KVFile()

    def process(rows):
        for row_num, row in enumerate(rows):
            key = key_calc(row) + '{:08x}'.format(row_num)
            yield (key, row)

    db.insert(process(rows), batch_size=batch_size)
    for _, value in db.items(reverse=reverse):
        yield value
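# A minimal usage sketch for the _sorter helper above (illustration only, not
# part of the original sources): it assumes rows are dicts and key_calc returns
# a string key; the '{:08x}'.format(row_num) suffix keeps rows that share a key
# in their original order.
rows = [{'name': 'b'}, {'name': 'a', 'id': 1}, {'name': 'a', 'id': 2}]
sorted_rows = list(_sorter(rows,
                           key_calc=lambda row: row['name'],
                           reverse=False,
                           batch_size=1000))
# sorted_rows == [{'name': 'a', 'id': 1}, {'name': 'a', 'id': 2}, {'name': 'b'}]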
def test_filename():
    from kvfile import KVFile, db_kind
    filename = 'bla.filename.' + db_kind + '.db'
    kv1 = KVFile(filename=filename)
    kv1.insert(((str(i), ':{}'.format(i)) for i in range(50000)))
    del kv1
    kv = KVFile(filename=filename)
    assert len(list(kv.keys())) == 50000
    assert len(list(kv.items())) == 50000
    assert kv.get('49999') == ':49999'
def test_default():
    from kvfile import KVFile
    kv = KVFile()
    kv.set('aaaa', 5)
    assert kv.get('aaaa') == 5
    assert kv.get('bbbb', default=6) == 6
    with pytest.raises(KeyError):
        kv.get('bbbb')
def func(package):
    source_, target_name_, target_path_ = source, target_name, target_path
    if source_ is None:
        source_ = package.pkg.descriptor['resources'][0]['name']
    if target_name_ is None:
        target_name_ = source_ + '_copy'
    if target_path is None:
        target_path_ = target_name_ + '.csv'

    def traverse_resources(resources):
        for res in resources:
            yield res
            if res['name'] == source_:
                res = copy.deepcopy(res)
                res['name'] = target_name_
                res['path'] = target_path_
                yield res

    descriptor = package.pkg.descriptor
    descriptor['resources'] = list(traverse_resources(descriptor['resources']))
    yield package.pkg

    for resource in package:
        if resource.res.name == source_:
            db = KVFile()
            yield saver(resource, db, batch_size)
            yield loader(db)
        else:
            yield resource
def get_all_existing_ids(connection_string, db_table, key_fields, db_status_fields):
    db_fields = key_fields + db_status_fields
    stmt = ' '.join(['select', ','.join(db_fields), 'from', db_table])
    engine = create_engine(connection_string)
    ret = KVFile()
    try:
        rows = engine.execute(stmt)
        for row in rows:
            rec = dict(zip(db_fields, row))
            existing_id = dict(
                (k, v) for k, v in rec.items() if k in db_status_fields)
            key = calc_key(rec, key_fields)
            ret.set(key, existing_id)
    except ProgrammingError:
        print('WARNING: Failed to fetch existing keys')
    except OperationalError:
        print('WARNING: Failed to fetch existing keys')
    return ret
def test_insert_generator():
    from kvfile import KVFile
    kv = KVFile()
    data = [(str(i), ':{}'.format(i)) for i in range(50)]
    expected_data = []
    for key, value in kv.insert_generator(data):
        expected_data.append((key, value))
    assert data == expected_data
    assert len(list(kv.keys())) == 50
    assert len(list(kv.items())) == 50
    assert kv.get('49') == ':49'
def test_sanity():
    from kvfile import KVFile
    kv = KVFile()
    data = dict(
        s='value',
        i=123,
        d=datetime.datetime.fromtimestamp(12325),
        n=decimal.Decimal('1234.56'),
        ss=set(range(10)),
        o=dict(d=decimal.Decimal('1234.58'),
               n=datetime.datetime.fromtimestamp(12325)))
    for k, v in data.items():
        kv.set(k, v)
    for k, v in data.items():
        assert kv.get(k) == v
    assert list(kv.keys()) == sorted(data.keys())
    assert list(kv.items()) == sorted(data.items())
    assert list(kv.keys(reverse=True)) == sorted(data.keys(), reverse=True)
    assert list(kv.items(reverse=True)) == sorted(data.items(), reverse=True)
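# A small sketch (illustration only, grounded in the assertions of test_sanity
# above): KVFile.keys()/items() iterate in ascending key order, and
# reverse=True flips that order.
from kvfile import KVFile

kv = KVFile()
for name in ('banana', 'apple', 'cherry'):
    kv.set(name, len(name))

assert list(kv.keys()) == ['apple', 'banana', 'cherry']
assert list(kv.items(reverse=True))[0] == ('cherry', 6)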
def func(package):
    source_, target_name_, target_path_ = source, target_name, target_path
    if source_ is None:
        source_ = package.pkg.descriptor['resources'][0]['name']
    if target_name_ is None:
        target_name_ = source_ + '_copy'
    if target_path is None:
        target_path_ = target_name_ + '.csv'

    def traverse_resources(resources):
        new_res_list = []
        for res in resources:
            yield res
            if res['name'] == source_:
                res = copy.deepcopy(res)
                res['name'] = target_name_
                res['path'] = target_path_
                if duplicate_to_end:
                    new_res_list.append(res)
                else:
                    yield res
        for res in new_res_list:
            yield res

    descriptor = package.pkg.descriptor
    descriptor['resources'] = list(
        traverse_resources(descriptor['resources']))
    yield package.pkg

    dbs = []
    for resource in package:
        if resource.res.name == source_:
            db = KVFile()
            yield saver(resource, db, batch_size)
            if duplicate_to_end:
                dbs.append(db)
            else:
                yield loader(db)
        else:
            yield resource
    for db in dbs:
        yield loader(db)
def flow(parameters, *_):
    stats = defaultdict(int)
    kv = KVFile()
    last_id = None
    load_from = parameters.get("load_from", parameters.get('dump_to_path'))
    if load_from and os.path.exists(os.path.join(load_from, "datapackage.json")):
        logging.info("Loading from last load_from_db package: " +
                     os.path.join(load_from, "datapackage.json"))
        row = None
        for resource in Flow(
                load(os.path.join(load_from, "datapackage.json"),
                     limit_rows=parameters.get("limit_rows"),
                     resources="db_data")).datastream().res_iter:
            for row in resource:
                stats['loaded from package'] += 1
                last_id = row['__id']
                kv.set("{:0>12}".format(last_id), row)
                if last_id % 10000 == 0:
                    logging.info("Loaded id: %s" % last_id)
        all_data_keys = set(row.keys()) if row else set()
    else:
        all_data_keys = set()
    logging.info('num rows loaded from package: %s' % stats['loaded from package'])
    engine = create_engine(
        "postgresql://{username}:{password}@{host}:5432/reports?sslmode=verify-ca&sslrootcert={sslrootcert}&sslcert={sslcert}&sslkey={sslkey}"
        .format(**config.db_settings))
    engine.update_execution_options(stream_results=True)
    if parameters.get("where"):
        logging.info("Loading from DB, with where clause: " + parameters["where"])
        where = " where " + parameters["where"]
    elif last_id:
        logging.info("Loading from DB, starting at id %s" % last_id)
        where = " where id > %s" % last_id
    else:
        logging.info("Loading all records from DB")
        where = ""
    for id, created, data in engine.execute(
            "select id, created, data from reports%s order by id" % where):
        if parameters.get("filter_db_row_callback"):
            id, created, data = parameters["filter_db_row_callback"](id, created, data)
        if not data or not isinstance(data, dict):
            stats['invalid data'] += 1
            continue
        stats['loaded from db'] += 1
        last_id = id
        row = {
            "__id": id,
            "__created": created,
        }
        for k, v in data.items():
            all_data_keys.add(k)
            row[k] = v
        kv.set("{:0>12}".format(id), row)
        if id % 100000 == 0:
            logging.info("Loaded id: %s" % id)
        if parameters.get("limit_rows") and stats['loaded from db'] > parameters["limit_rows"]:
            break
    logging.info('DB rows with invalid data: %s' % stats['invalid data'])
    logging.info("last_id = %s" % last_id)
    logging.info('num rows loaded from db: %s' % stats['loaded from db'])

    def _yield_from_kv():
        for _, row in kv.items():
            yield {
                "__id": row["__id"],
                "__created": row["__created"],
                **{k: json.dumps(row.get(k))
                   for k in all_data_keys
                   if k not in ["__id", "__created"]}
            }

    flow_args = [
        _yield_from_kv(),
        update_resource(
            -1, name="db_data", path="db_data.csv",
            schema={
                "fields": [
                    {"name": "__id", "type": "integer"},
                    {"name": "__created", "type": "datetime"},
                    *[{"name": k, "type": "string"}
                      for k in all_data_keys
                      if k not in ["__id", "__created"]]
                ]
            },
            **{"dpp:streaming": True}),
    ]
    if parameters.get("dump_to_path"):
        flow_args += [dump_to_path(parameters['dump_to_path'])]
    return Flow(*flow_args)
def join_aux(source_name, source_key, source_delete,  # noqa: C901
             target_name, target_key, fields, full, mode):
    deduplication = target_key is None
    fields = fix_fields(fields)
    source_key = KeyCalc(source_key)
    target_key = KeyCalc(target_key) if target_key is not None else target_key

    # We will store db keys as boolean flags:
    # - False -> inserted/not used
    # - True -> inserted/used
    db_keys_usage = KVFile()

    db = KVFile()

    # Mode of join operation
    if full is not None:
        warnings.warn(
            'For the `join` processor the `full=True` flag is deprecated. '
            'Please use the "mode" parameter instead.',
            UserWarning)
        mode = 'half-outer' if full else 'inner'
    assert mode in ['inner', 'half-outer', 'full-outer']

    # Indexes the source data
    def indexer(resource):
        for row_number, row in enumerate(resource, start=1):
            key = source_key(row, row_number)
            try:
                current = db.get(key)
            except KeyError:
                current = {}
            for field, spec in fields.items():
                name = spec['name']
                curr = current.get(field)
                agg = spec['aggregate']
                if agg != 'count':
                    new = row.get(name)
                else:
                    new = ''
                if new is not None:
                    current[field] = AGGREGATORS[agg].func(curr, new)
                elif field not in current:
                    current[field] = None
            if mode == 'full-outer':
                for field in source_key.key_list:
                    current[field] = row.get(field)
            db.set(key, current)
            db_keys_usage.set(key, False)
            yield row

    # Generates the joined data
    def process_target(resource):
        if deduplication:
            # just empty the iterable
            collections.deque(indexer(resource), maxlen=0)
            for key, value in db.items():
                row = dict((f, None) for f in fields.keys())
                row.update(
                    dict((k, AGGREGATORS[fields[k]['aggregate']].finaliser(v))
                         for k, v in value.items()))
                yield row
        else:
            for row_number, row in enumerate(resource, start=1):
                key = target_key(row, row_number)
                try:
                    extra = create_extra_by_key(key)
                    db_keys_usage.set(key, True)
                except KeyError:
                    if mode == 'inner':
                        continue
                    extra = dict((k, row.get(k)) for k in fields.keys())
                row.update(extra)
                yield row
            if mode == 'full-outer':
                for key, value in db_keys_usage.items():
                    if value is False:
                        extra = create_extra_by_key(key)
                        yield extra

    # Creates extra by key
    def create_extra_by_key(key):
        extra = db.get(key)
        extra.update(
            dict((k, AGGREGATORS[fields[k]['aggregate']].finaliser(v))
                 for k, v in extra.items() if k in fields))
        return extra

    # Yields the new resources
    def new_resource_iterator(resource_iterator):
        has_index = False
        for resource in resource_iterator:
            name = resource.res.name
            if name == source_name:
                has_index = True
                if source_delete:
                    # just empty the iterable
                    collections.deque(indexer(resource), maxlen=0)
                else:
                    yield indexer(resource)
                if deduplication:
                    yield process_target(resource)
            elif name == target_name:
                assert has_index
                yield process_target(resource)
            else:
                yield resource

    # Updates / creates the target resource descriptor
    def process_target_resource(source_spec, resource):
        target_fields = \
            resource.setdefault('schema', {}).setdefault('fields', [])
        for name, spec in fields.items():
            agg = spec['aggregate']
            data_type = AGGREGATORS[agg].dataType
            copy_properties = AGGREGATORS[agg].copyProperties
            to_copy = {}
            if data_type is None:
                try:
                    source_field = \
                        next(filter(lambda f: f['name'] == spec['name'],
                                    source_spec['schema']['fields']))
                except StopIteration:
                    raise KeyError(
                        'Failed to find field with name %s in resource %s' %
                        (spec['name'], source_spec['name']))
                if copy_properties:
                    to_copy = copy.deepcopy(source_field)
                data_type = source_field['type']
            try:
                existing_field = next(
                    iter(filter(lambda f: f['name'] == name, target_fields)))
                assert existing_field['type'] == data_type, \
                    'Reusing %s but with different data types: %s != %s' % (
                        name, existing_field['type'], data_type)
            except StopIteration:
                to_copy.update({'name': name, 'type': data_type})
                target_fields.append(to_copy)
        return resource

    # Updates the datapackage descriptor based on parameters
    def process_datapackage(datapackage):
        new_resources = []
        source_spec = None
        resource_names = [
            resource['name'] for resource in datapackage['resources']
        ]
        assert source_name in resource_names, \
            'Source resource ({}) not found package (target={}, found: {})'\
            .format(source_name, target_name, resource_names)
        assert target_name in resource_names, \
            'Target resource ({}) not found package (source={}, found: {})'\
            .format(target_name, source_name, resource_names)
        for resource in datapackage['resources']:
            if resource['name'] == source_name:
                nonlocal fields
                source_spec = resource
                schema_fields = source_spec.get('schema', {}).get('fields', [])
                expand_fields(fields, schema_fields)
                fields = order_fields(fields, schema_fields)
                if not source_delete:
                    new_resources.append(resource)
                if deduplication:
                    resource = process_target_resource(
                        source_spec, {
                            'name': target_name,
                            'path': os.path.join('data', target_name + '.csv')
                        })
                    new_resources.append(resource)
            elif resource['name'] == target_name:
                assert isinstance(source_spec, dict),\
                    'Source resource ({}) must appear before target resource ({}), found: {}'\
                    .format(source_name, target_name, resource_names)
                resource = process_target_resource(source_spec, resource)
                new_resources.append(resource)
            else:
                new_resources.append(resource)
        datapackage['resources'] = new_resources

    def func(package: PackageWrapper):
        process_datapackage(package.pkg.descriptor)
        yield package.pkg
        yield from new_resource_iterator(package)

    return func
import os
import requests
import base64
import math
from io import BytesIO

from PIL import Image
from kvfile import KVFile

_cache = KVFile(filename='_cache_airtable')

override = set([])

for key in override:
    try:
        _cache.get(key)
        print('got', key)
        _cache.delete(key)
        print('deleted', key)
    except:
        print('no such key', key)
        pass


def fetch_airtable(kind, rid=None, view='Grid%20view'):
    API_KEY = os.environ.get('AIRTABLE_API_KEY')
    key = '%s/%s' % (kind, rid)
    try:
        return _cache.get(key)
    except (KeyError, AssertionError):
        HEADERS = {'Authorization': 'Bearer ' + API_KEY}
        URL = 'https://api.airtable.com/v0/appVBVIwOAu4okunl/' + kind
        if rid:
def join_aux(source_name, source_key, source_delete,  # noqa: C901
             target_name, target_key, fields, full):
    deduplication = target_key is None
    fields = fix_fields(fields)
    source_key = KeyCalc(source_key)
    target_key = KeyCalc(target_key) if target_key is not None else target_key
    db = KVFile()

    # Indexes the source data
    def indexer(resource):
        for row in resource:
            key = source_key(row)
            try:
                current = db.get(key)
            except KeyError:
                current = {}
            for field, spec in fields.items():
                name = spec['name']
                curr = current.get(field)
                agg = spec['aggregate']
                if agg != 'count':
                    new = row.get(name)
                else:
                    new = ''
                if new is not None:
                    current[field] = AGGREGATORS[agg].func(curr, new)
                elif field not in current:
                    current[field] = None
            db.set(key, current)
            yield row

    # Generates the joined data
    def process_target(resource):
        if deduplication:
            # just empty the iterable
            collections.deque(indexer(resource), maxlen=0)
            for key, value in db.items():
                row = dict(
                    (f, None) for f in fields.keys()
                )
                row.update(dict(
                    (k, AGGREGATORS[fields[k]['aggregate']].finaliser(v))
                    for k, v in value.items()
                ))
                yield row
        else:
            for row in resource:
                key = target_key(row)
                try:
                    extra = db.get(key)
                    extra = dict(
                        (k, AGGREGATORS[fields[k]['aggregate']].finaliser(v))
                        for k, v in extra.items()
                    )
                except KeyError:
                    if not full:
                        continue
                    extra = dict(
                        (k, row.get(k)) for k in fields.keys()
                    )
                row.update(extra)
                yield row

    # Yields the new resources
    def new_resource_iterator(resource_iterator):
        has_index = False
        for resource in resource_iterator:
            name = resource.res.name
            if name == source_name:
                has_index = True
                if source_delete:
                    # just empty the iterable
                    collections.deque(indexer(resource), maxlen=0)
                else:
                    yield indexer(resource)
                if deduplication:
                    yield process_target(resource)
            elif name == target_name:
                assert has_index
                yield process_target(resource)
            else:
                yield resource

    # Updates / creates the target resource descriptor
    def process_target_resource(source_spec, resource):
        target_fields = \
            resource.setdefault('schema', {}).setdefault('fields', [])
        added_fields = sorted(fields.keys())
        for field in added_fields:
            spec = fields[field]
            agg = spec['aggregate']
            data_type = AGGREGATORS[agg].dataType
            copy_properties = AGGREGATORS[agg].copyProperties
            to_copy = {}
            if data_type is None:
                try:
                    source_field = \
                        next(filter(lambda f, spec_=spec: f['name'] == spec_['name'],
                                    source_spec['schema']['fields']))
                except StopIteration:
                    raise KeyError('Failed to find field with name %s in resource %s' %
                                   (spec['name'], source_spec['name']))
                if copy_properties:
                    to_copy = copy.deepcopy(source_field)
                data_type = source_field['type']
            try:
                existing_field = next(iter(filter(
                    lambda f: f['name'] == field,
                    target_fields)))
                assert existing_field['type'] == data_type, \
                    'Reusing %s but with different data types: %s != %s' % (
                        field, existing_field['type'], data_type)
            except StopIteration:
                to_copy.update({
                    'name': field,
                    'type': data_type
                })
                target_fields.append(to_copy)
        return resource

    # Updates the datapackage descriptor based on parameters
    def process_datapackage(datapackage):
        new_resources = []
        source_spec = None
        for resource in datapackage['resources']:
            if resource['name'] == source_name:
                source_spec = resource
                if not source_delete:
                    new_resources.append(resource)
                if deduplication:
                    resource = process_target_resource(
                        source_spec, {
                            'name': target_name,
                            'path': os.path.join('data', target_name + '.csv')
                        })
                    new_resources.append(resource)
            elif resource['name'] == target_name:
                assert isinstance(source_spec, dict), \
                    "Source resource must appear before target resource"
                resource = process_target_resource(source_spec, resource)
                new_resources.append(resource)
            else:
                new_resources.append(resource)
        datapackage['resources'] = new_resources

    def func(package: PackageWrapper):
        process_datapackage(package.pkg.descriptor)
        yield package.pkg
        yield from new_resource_iterator(package)

    return func
import logging, requests, os
from knesset_data.protocols.committee import CommitteeMeetingProtocol
import hashlib, json
from kvfile import KVFile
from dataflows import Flow, load
import csv
from fuzzywuzzy import fuzz
import traceback


BASE_HASH_OBJ = hashlib.md5()
with open('../people/committee_meeting_speaker_stats.py', 'rb') as f:
    BASE_HASH_OBJ.update(f.read())

speaker_stats_kv = KVFile()
mk_individual_factions = {}
mk_individual_names = {}


def speaker_stats_resource():
    for k, row in speaker_stats_kv.items():
        # logging.info(row)
        row['CommitteeSessionID'], row['parts_crc32c'], row['part_index'] = k.split('-')
        yield row


def add_speaker_stats_row(row):
    key = '{}-{}-{}'.format(row['CommitteeSessionID'], row['parts_crc32c'], row['part_index'])
def _get_resource(self, last_update_resource=None):
    last_kvfile, last_update, key_fields, incremental_field = None, None, None, None
    if last_update_resource is not None:
        last_kvfile = KVFile()
        key_fields = self._parameters.get('incremental-field-key',
                                          [self._primary_key_field_name])
        incremental_field = self._parameters['incremental-field']
        for row in last_update_resource:
            key = '-'.join([str(row[k]) for k in key_fields])
            try:
                last_row = last_kvfile.get(key)
            except KeyError:
                last_row = None
            if not last_row or last_row[incremental_field] < row[incremental_field]:
                last_kvfile.set(key, dict(row))
            if not last_update or last_update < row[incremental_field]:
                last_update = row[incremental_field]
    if last_update:
        logging.info('last_update={}'.format(last_update))
    resources_yielded = 0
    with utils.temp_loglevel():
        logging.info("Loading dataservice resource from service {} method {}".format(
            self._parameters["service-name"], self._parameters["method-name"]))
        # with process_metrics('dataservice_collection_row',
        #                      {'service_name': self._parameters['service-name'],
        #                       'method_name': self._parameters['method-name']}) as send_process_metrics:
        if last_update:
            if self._parameters.get('incremental-field-type') == 'integer':
                last_update_str = last_update
            else:
                last_update_str = (last_update - datetime.timedelta(days=1)).strftime('%Y-%m-%d')
            since_last_update = (self._parameters['incremental-field'],
                                 last_update_str,
                                 self._parameters.get('incremental-field-type', 'datetime'))
        else:
            since_last_update = None
        for dataservice_object in self.dataservice_class.get_all(
                since_last_update=since_last_update):
            row = self._filter_dataservice_object(dataservice_object)
            if os.environ.get("OVERRIDE_DATASERVICE_COLLECTION_LIMIT_ITEMS", ""):
                if int(os.environ.get("OVERRIDE_DATASERVICE_COLLECTION_LIMIT_ITEMS", "")) < resources_yielded:
                    return
            for k in row:
                for field in self._schema["fields"]:
                    if field["name"] == k:
                        if field["type"] == "integer" and row[k] is not None:
                            row[k] = int(row[k])
            if last_update:
                key = '-'.join([str(row[k]) for k in key_fields])
                last_kvfile.set(key, dict(row))
            else:
                resources_yielded += 1
                yield row
            # send_process_metrics()
            if resources_yielded > 0 and resources_yielded % 10000 == 0:
                logging.info("Loaded {} dataservice objects".format(resources_yielded))
        if last_update:
            for key, row in last_kvfile.items():
                resources_yielded += 1
                yield row
                if resources_yielded % 10000 == 0:
                    logging.info("Loaded {} dataservice objects".format(resources_yielded))
def test_insert():
    from kvfile import KVFile
    kv = KVFile()
    kv.insert(((str(i), ':{}'.format(i)) for i in range(50000)))
    assert len(list(kv.keys())) == 50000
    assert len(list(kv.items())) == 50000
    assert kv.get('49999') == ':49999'
    kv.insert(((str(i), ':{}'.format(i)) for i in range(50000, 100000)), batch_size=40000)
    assert len(list(kv.items())) == 100000
    kv.insert(((str(i), ':{}'.format(i)) for i in range(100000, 100002)), batch_size=1)
    kv.insert(((str(i), ':{}'.format(i)) for i in range(100002, 100005)), batch_size=0)
    assert len(list(kv.items())) == 100005