Example #1
def _sorter(rows, key_calc, reverse, batch_size):
    db = KVFile()
    db.insert(((key_calc(row) + "{:08x}".format(row_num), row)
               for row_num, row in enumerate(rows)),
              batch_size=batch_size)

    for _, value in db.items(reverse=reverse):
        yield value
Example #2
def _sorter(rows, key_calc, reverse, batch_size):
    db = KVFile()

    def process(rows):
        for row_num, row in enumerate(rows):
            key = key_calc(row) + '{:08x}'.format(row_num)
            yield (key, row)

    db.insert(process(rows), batch_size=batch_size)
    for _, value in db.items(reverse=reverse):
        yield value
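Both variants above append a zero-padded hexadecimal row number to the caller's key, so rows whose keys collide still get unique KVFile keys and keep their original relative order; iterating db.items() then yields the rows in key order. A minimal usage sketch under those assumptions (the sample rows and key function below are made up, and _sorter is the helper defined above, with KVFile already imported for it):

people = [
    {'name': 'dana', 'age': 30},
    {'name': 'avi', 'age': 25},
    {'name': 'noa', 'age': 30},
]
# sort by a fixed-width string key; ties (age 30) keep their input order
by_age = list(_sorter(people,
                      key_calc=lambda row: '{:03d}'.format(row['age']),
                      reverse=False,
                      batch_size=1000))
# -> avi (25), then dana and noa (both 30) in their original order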
Example #3
def test_filename():
    from kvfile import KVFile, db_kind
    filename = 'bla.filename.' + db_kind + '.db'
    kv1 = KVFile(filename=filename)
    kv1.insert(((str(i), ':{}'.format(i)) for i in range(50000)))
    del kv1

    kv = KVFile(filename=filename)
    assert len(list(kv.keys())) == 50000
    assert len(list(kv.items())) == 50000
    assert kv.get('49999') == ':49999'
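The test above relies on KVFile(filename=...) writing to the named file and on a second instance opened on the same path seeing the stored data; embedding db_kind in the name presumably keeps files produced by different storage backends apart. A small sketch of the same round trip (the path below is illustrative and assumed not to exist yet, as in the test):

from kvfile import KVFile, db_kind

path = 'example_cache.' + db_kind + '.db'   # illustrative, fresh file name
kv = KVFile(filename=path)
kv.insert([('greeting', 'hello'), ('answer', 42)])
del kv                                      # drop the first handle

kv = KVFile(filename=path)                  # reopen the same file
assert kv.get('greeting') == 'hello'
assert kv.get('answer') == 42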
Example #4
def test_default():
    import pytest
    from kvfile import KVFile
    kv = KVFile()
    kv.set('aaaa', 5)
    assert kv.get('aaaa') == 5
    assert kv.get('bbbb', default=6) == 6
    with pytest.raises(KeyError):
        kv.get('bbbb')
Example #5
    def func(package):
        source_, target_name_, target_path_ = source, target_name, target_path
        if source_ is None:
            source_ = package.pkg.descriptor['resources'][0]['name']
        if target_name_ is None:
            target_name_ = source_ + '_copy'
        if target_path is None:
            target_path_ = target_name_ + '.csv'

        def traverse_resources(resources):
            for res in resources:
                yield res
                if res['name'] == source_:
                    res = copy.deepcopy(res)
                    res['name'] = target_name_
                    res['path'] = target_path_
                    yield res

        descriptor = package.pkg.descriptor
        descriptor['resources'] = list(traverse_resources(descriptor['resources']))
        yield package.pkg

        for resource in package:
            if resource.res.name == source_:
                db = KVFile()
                yield saver(resource, db, batch_size)
                yield loader(db)
            else:
                yield resource
Example #6
def get_all_existing_ids(connection_string, db_table, key_fields,
                         db_status_fields):
    db_fields = key_fields + db_status_fields
    stmt = ' '.join(['select', ','.join(db_fields), 'from', db_table])
    engine = create_engine(connection_string)
    ret = KVFile()
    try:
        rows = engine.execute(stmt)
        for row in rows:
            rec = dict(zip(db_fields, row))
            existing_id = dict(
                (k, v) for k, v in rec.items() if k in db_status_fields)
            key = calc_key(rec, key_fields)
            ret.set(key, existing_id)
    except ProgrammingError:
        print('WARNING: Failed to fetch existing keys')
    except OperationalError:
        print('WARNING: Failed to fetch existing keys')
    return ret
Example #7
def test_insert_generator():
    from kvfile import KVFile
    kv = KVFile()
    data = [(str(i), ':{}'.format(i)) for i in range(50)]
    expected_data = []
    for key, value in kv.insert_generator(data):
        expected_data.append((key, value))
    assert data == expected_data
    assert len(list(kv.keys())) == 50
    assert len(list(kv.items())) == 50
    assert kv.get('49') == ':49'
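insert_generator stores the (key, value) pairs it is given while yielding them back unchanged, so a stream can be persisted and consumed in a single pass instead of being materialised first. A minimal sketch under that reading of the test above (the sample rows are illustrative):

from kvfile import KVFile

kv = KVFile()
rows = [('{:04d}'.format(i), {'square': i * i}) for i in range(10)]

for key, value in kv.insert_generator(rows):
    # the pairs stream through here and end up stored in kv (see below)
    print(key, value['square'])

assert kv.get('0003') == {'square': 9}
assert len(list(kv.items())) == 10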
Example #8
def test_sanity():
    import datetime
    import decimal
    from kvfile import KVFile

    kv = KVFile()

    data = dict(s='value',
                i=123,
                d=datetime.datetime.fromtimestamp(12325),
                n=decimal.Decimal('1234.56'),
                ss=set(range(10)),
                o=dict(d=decimal.Decimal('1234.58'),
                       n=datetime.datetime.fromtimestamp(12325)))

    for k, v in data.items():
        kv.set(k, v)

    for k, v in data.items():
        assert kv.get(k) == v

    assert list(kv.keys()) == sorted(data.keys())
    assert list(kv.items()) == sorted(data.items())

    assert list(kv.keys(reverse=True)) == sorted(data.keys(), reverse=True)
    assert list(kv.items(reverse=True)) == sorted(data.items(), reverse=True)
Example #9
    def func(package):
        source_, target_name_, target_path_ = source, target_name, target_path
        if source_ is None:
            source_ = package.pkg.descriptor['resources'][0]['name']
        if target_name_ is None:
            target_name_ = source_ + '_copy'
        if target_path is None:
            target_path_ = target_name_ + '.csv'

        def traverse_resources(resources):
            new_res_list = []
            for res in resources:
                yield res
                if res['name'] == source_:
                    res = copy.deepcopy(res)
                    res['name'] = target_name_
                    res['path'] = target_path_
                    if duplicate_to_end:
                        new_res_list.append(res)
                    else:
                        yield res
            for res in new_res_list:
                yield res

        descriptor = package.pkg.descriptor
        descriptor['resources'] = list(
            traverse_resources(descriptor['resources']))
        yield package.pkg

        dbs = []
        for resource in package:
            if resource.res.name == source_:
                db = KVFile()
                yield saver(resource, db, batch_size)
                if duplicate_to_end:
                    dbs.append(db)
                else:
                    yield loader(db)
            else:
                yield resource
        for db in dbs:
            yield loader(db)
Example #10
def flow(parameters, *_):
    stats = defaultdict(int)
    kv = KVFile()
    last_id = None
    load_from = parameters.get("load_from", parameters.get('dump_to_path'))
    if load_from and os.path.exists(os.path.join(load_from,
                                                 "datapackage.json")):
        logging.info("Loading from last load_from_db package: " +
                     os.path.join(load_from, "datapackage.json"))
        row = None
        for resource in Flow(
                load(os.path.join(load_from, "datapackage.json"),
                     limit_rows=parameters.get("limit_rows"),
                     resources="db_data")).datastream().res_iter:
            for row in resource:
                stats['loaded from package'] += 1
                last_id = row['__id']
                kv.set("{:0>12}".format(last_id), row)
                if last_id % 10000 == 0:
                    logging.info("Loaded id: %s" % last_id)
        all_data_keys = set(row.keys()) if row else set()
    else:
        all_data_keys = set()
    logging.info('num rows loaded from package: %s' %
                 stats['loaded from package'])
    engine = create_engine(
        "postgresql://{username}:{password}@{host}:5432/reports?sslmode=verify-ca&sslrootcert={sslrootcert}&sslcert={sslcert}&sslkey={sslkey}"
        .format(**config.db_settings))
    engine.update_execution_options(stream_results=True)
    if parameters.get("where"):
        logging.info("Loading from DB, with where clause: " +
                     parameters["where"])
        where = " where " + parameters["where"]
    elif last_id:
        logging.info("Loading from DB, starting at id %s" % last_id)
        where = " where id > %s" % last_id
    else:
        logging.info("Loading all records from DB")
        where = ""
    for id, created, data in engine.execute(
            "select id, created, data from reports%s order by id" % where):
        if parameters.get("filter_db_row_callback"):
            id, created, data = parameters["filter_db_row_callback"](id,
                                                                     created,
                                                                     data)
        if not data or not isinstance(data, dict):
            stats['invalid data'] += 1
            continue
        stats['loaded from db'] += 1
        last_id = id
        row = {
            "__id": id,
            "__created": created,
        }
        for k, v in data.items():
            all_data_keys.add(k)
            row[k] = v
        kv.set("{:0>12}".format(id), row)
        if id % 100000 == 0:
            logging.info("Loaded id: %s" % id)
        if parameters.get("limit_rows") and stats[
                'loaded from db'] > parameters["limit_rows"]:
            break
    logging.info('DB rows with invalid data: %s' % stats['invalid data'])
    logging.info("last_id = %s" % last_id)
    logging.info('num rows loaded from db: %s' % stats['loaded from db'])

    def _yield_from_kv():
        for _, row in kv.items():
            yield {
                "__id": row["__id"],
                "__created": row["__created"],
                **{
                    k: json.dumps(row.get(k))
                    for k in all_data_keys if k not in ["__id", "__created"]
                }
            }

    flow_args = [
        _yield_from_kv(),
        update_resource(
            -1,
            name="db_data",
            path="db_data.csv",
            schema={
                "fields": [{
                    "name": "__id",
                    "type": "integer"
                }, {
                    "name": "__created",
                    "type": "datetime"
                }, *[{
                    "name": k,
                    "type": "string"
                } for k in all_data_keys if k not in ["__id", "__created"]]]
            },
            **{"dpp:streaming": True}),
    ]
    if parameters.get("dump_to_path"):
        flow_args += [dump_to_path(parameters['dump_to_path'])]
    return Flow(*flow_args)
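The flow above stores each row under "{:0>12}".format(id) rather than the raw integer id: KVFile keys are strings and items() returns them in string sort order (as the sanity test in Example #8 shows), so zero-padding keeps lexicographic order identical to numeric order. A small illustration (the rows are made up):

from kvfile import KVFile

kv = KVFile()
for i in (2, 10, 1):
    kv.set('{:0>12}'.format(i), {'__id': i})

# padded keys come back in numeric order...
assert [row['__id'] for _, row in kv.items()] == [1, 2, 10]
# ...whereas unpadded string keys would not ('10' sorts before '2')
assert sorted(str(i) for i in (2, 10, 1)) == ['1', '10', '2']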
Example #11
def join_aux(
        source_name,
        source_key,
        source_delete,  # noqa: C901
        target_name,
        target_key,
        fields,
        full,
        mode):

    deduplication = target_key is None
    fields = fix_fields(fields)
    source_key = KeyCalc(source_key)
    target_key = KeyCalc(target_key) if target_key is not None else target_key
    # We will store db keys as boolean flags:
    # - False -> inserted/not used
    # - True -> inserted/used
    db_keys_usage = KVFile()
    db = KVFile()

    # Mode of join operation
    if full is not None:
        warnings.warn(
            'For the `join` processor the `full=True` flag is deprecated. '
            'Please use the "mode" parameter instead.', UserWarning)
        mode = 'half-outer' if full else 'inner'
    assert mode in ['inner', 'half-outer', 'full-outer']

    # Indexes the source data
    def indexer(resource):
        for row_number, row in enumerate(resource, start=1):
            key = source_key(row, row_number)
            try:
                current = db.get(key)
            except KeyError:
                current = {}
            for field, spec in fields.items():
                name = spec['name']
                curr = current.get(field)
                agg = spec['aggregate']
                if agg != 'count':
                    new = row.get(name)
                else:
                    new = ''
                if new is not None:
                    current[field] = AGGREGATORS[agg].func(curr, new)
                elif field not in current:
                    current[field] = None
            if mode == 'full-outer':
                for field in source_key.key_list:
                    current[field] = row.get(field)
            db.set(key, current)
            db_keys_usage.set(key, False)
            yield row

    # Generates the joined data
    def process_target(resource):
        if deduplication:
            # just empty the iterable
            collections.deque(indexer(resource), maxlen=0)
            for key, value in db.items():
                row = dict((f, None) for f in fields.keys())
                row.update(
                    dict((k, AGGREGATORS[fields[k]['aggregate']].finaliser(v))
                         for k, v in value.items()))
                yield row
        else:
            for row_number, row in enumerate(resource, start=1):
                key = target_key(row, row_number)
                try:
                    extra = create_extra_by_key(key)
                    db_keys_usage.set(key, True)
                except KeyError:
                    if mode == 'inner':
                        continue
                    extra = dict((k, row.get(k)) for k in fields.keys())
                row.update(extra)
                yield row
            if mode == 'full-outer':
                for key, value in db_keys_usage.items():
                    if value is False:
                        extra = create_extra_by_key(key)
                        yield extra

    # Creates extra by key
    def create_extra_by_key(key):
        extra = db.get(key)
        extra.update(
            dict((k, AGGREGATORS[fields[k]['aggregate']].finaliser(v))
                 for k, v in extra.items() if k in fields))
        return extra

    # Yields the new resources
    def new_resource_iterator(resource_iterator):
        has_index = False
        for resource in resource_iterator:
            name = resource.res.name
            if name == source_name:
                has_index = True
                if source_delete:
                    # just empty the iterable
                    collections.deque(indexer(resource), maxlen=0)
                else:
                    yield indexer(resource)
                if deduplication:
                    yield process_target(resource)
            elif name == target_name:
                assert has_index
                yield process_target(resource)
            else:
                yield resource

    # Updates / creates the target resource descriptor
    def process_target_resource(source_spec, resource):
        target_fields = \
            resource.setdefault('schema', {}).setdefault('fields', [])
        for name, spec in fields.items():
            agg = spec['aggregate']
            data_type = AGGREGATORS[agg].dataType
            copy_properties = AGGREGATORS[agg].copyProperties
            to_copy = {}
            if data_type is None:
                try:
                    source_field = \
                        next(filter(lambda f: f['name'] == spec['name'],
                                    source_spec['schema']['fields']))
                except StopIteration:
                    raise KeyError(
                        'Failed to find field with name %s in resource %s' %
                        (spec['name'], source_spec['name']))
                if copy_properties:
                    to_copy = copy.deepcopy(source_field)
                data_type = source_field['type']
            try:
                existing_field = next(
                    iter(filter(lambda f: f['name'] == name, target_fields)))
                assert existing_field['type'] == data_type, \
                    'Reusing %s but with different data types: %s != %s' % (name, existing_field['type'], data_type)
            except StopIteration:
                to_copy.update({'name': name, 'type': data_type})
                target_fields.append(to_copy)
        return resource

    # Updates the datapackage descriptor based on parameters
    def process_datapackage(datapackage):

        new_resources = []
        source_spec = None

        resource_names = [
            resource['name'] for resource in datapackage['resources']
        ]
        assert source_name in resource_names, \
            'Source resource ({}) not found package (target={}, found: {})'\
            .format(source_name, target_name, resource_names)
        assert target_name in resource_names, \
            'Target resource ({}) not found package (source={}, found: {})'\
            .format(target_name, source_name, resource_names)

        for resource in datapackage['resources']:

            if resource['name'] == source_name:
                nonlocal fields
                source_spec = resource
                schema_fields = source_spec.get('schema', {}).get('fields', [])
                expand_fields(fields, schema_fields)
                fields = order_fields(fields, schema_fields)
                if not source_delete:
                    new_resources.append(resource)
                if deduplication:
                    resource = process_target_resource(
                        source_spec, {
                            'name': target_name,
                            'path': os.path.join('data', target_name + '.csv')
                        })
                    new_resources.append(resource)

            elif resource['name'] == target_name:
                assert isinstance(source_spec, dict),\
                       'Source resource ({}) must appear before target resource ({}), found: {}'\
                       .format(source_name, target_name, resource_names)
                resource = process_target_resource(source_spec, resource)
                new_resources.append(resource)

            else:
                new_resources.append(resource)

        datapackage['resources'] = new_resources

    def func(package: PackageWrapper):
        process_datapackage(package.pkg.descriptor)
        yield package.pkg
        yield from new_resource_iterator(package)

    return func
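Stripped of the dataflows plumbing, the indexer above treats KVFile as an on-disk dictionary: look the key up, fall back to an empty record on KeyError, fold the new row in, and write it back with set. A distilled sketch of that pattern, not of the dataflows join API itself (the sample rows and the aggregation are illustrative):

from kvfile import KVFile

db = KVFile()
rows = [
    {'city': 'haifa', 'sales': 10},
    {'city': 'eilat', 'sales': 7},
    {'city': 'haifa', 'sales': 5},
]
for row in rows:
    key = row['city']
    try:
        current = db.get(key)
    except KeyError:
        current = {'total': 0}
    current['total'] += row['sales']
    db.set(key, current)

assert db.get('haifa') == {'total': 15}
assert [key for key, _ in db.items()] == ['eilat', 'haifa']   # sorted by key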
Example #12
import os
import requests
import base64
import math
from io import BytesIO

from PIL import Image
from kvfile import KVFile

_cache = KVFile(filename='_cache_airtable')
override = set([])
for key in override:
    try:
        _cache.get(key)
        print('got', key)
        _cache.delete(key)
        print('deleted', key)
    except:
        print('no such key', key)
        pass


def fetch_airtable(kind, rid=None, view='Grid%20view'):
    API_KEY = os.environ.get('AIRTABLE_API_KEY')
    key = '%s/%s' % (kind, rid)
    try:
        return _cache.get(key)
    except (KeyError, AssertionError):
        HEADERS = {'Authorization': 'Bearer ' + API_KEY}
        URL = 'https://api.airtable.com/v0/appVBVIwOAu4okunl/' + kind
        if rid:
Example #13
def join_aux(source_name, source_key, source_delete,  # noqa: C901
             target_name, target_key, fields, full):

    deduplication = target_key is None
    fields = fix_fields(fields)
    source_key = KeyCalc(source_key)
    target_key = KeyCalc(target_key) if target_key is not None else target_key
    db = KVFile()

    # Indexes the source data
    def indexer(resource):
        for row in resource:
            key = source_key(row)
            try:
                current = db.get(key)
            except KeyError:
                current = {}
            for field, spec in fields.items():
                name = spec['name']
                curr = current.get(field)
                agg = spec['aggregate']
                if agg != 'count':
                    new = row.get(name)
                else:
                    new = ''
                if new is not None:
                    current[field] = AGGREGATORS[agg].func(curr, new)
                elif field not in current:
                    current[field] = None
            db.set(key, current)
            yield row

    # Generates the joined data
    def process_target(resource):
        if deduplication:
            # just empty the iterable
            collections.deque(indexer(resource), maxlen=0)
            for key, value in db.items():
                row = dict(
                    (f, None) for f in fields.keys()
                )
                row.update(dict(
                    (k, AGGREGATORS[fields[k]['aggregate']].finaliser(v))
                    for k, v in value.items()
                ))
                yield row
        else:
            for row in resource:
                key = target_key(row)
                try:
                    extra = db.get(key)
                    extra = dict(
                        (k, AGGREGATORS[fields[k]['aggregate']].finaliser(v))
                        for k, v in extra.items()
                    )
                except KeyError:
                    if not full:
                        continue
                    extra = dict(
                        (k, row.get(k))
                        for k in fields.keys()
                    )
                row.update(extra)
                yield row

    # Yields the new resources
    def new_resource_iterator(resource_iterator):
        has_index = False
        for resource in resource_iterator:
            name = resource.res.name
            if name == source_name:
                has_index = True
                if source_delete:
                    # just empty the iterable
                    collections.deque(indexer(resource), maxlen=0)
                else:
                    yield indexer(resource)
                if deduplication:
                    yield process_target(resource)
            elif name == target_name:
                assert has_index
                yield process_target(resource)
            else:
                yield resource

    # Updates / creates the target resource descriptor
    def process_target_resource(source_spec, resource):
        target_fields = \
            resource.setdefault('schema', {}).setdefault('fields', [])
        added_fields = sorted(fields.keys())
        for field in added_fields:
            spec = fields[field]
            agg = spec['aggregate']
            data_type = AGGREGATORS[agg].dataType
            copy_properties = AGGREGATORS[agg].copyProperties
            to_copy = {}
            if data_type is None:
                try:
                    source_field = \
                        next(filter(lambda f, spec_=spec:
                                    f['name'] == spec_['name'],
                                    source_spec['schema']['fields']))
                except StopIteration:
                    raise KeyError('Failed to find field with name %s in resource %s' %
                                   (spec['name'], source_spec['name']))
                if copy_properties:
                    to_copy = copy.deepcopy(source_field)
                data_type = source_field['type']
            try:
                existing_field = next(iter(filter(
                    lambda f: f['name'] == field,
                    target_fields)))
                assert existing_field['type'] == data_type, \
                    'Reusing %s but with different data types: %s != %s' % (field, existing_field['type'], data_type)
            except StopIteration:
                to_copy.update({
                    'name': field,
                    'type': data_type
                })
                target_fields.append(to_copy)
        return resource

    # Updates the datapackage descriptor based on parameters
    def process_datapackage(datapackage):

        new_resources = []
        source_spec = None

        for resource in datapackage['resources']:

            if resource['name'] == source_name:
                source_spec = resource
                if not source_delete:
                    new_resources.append(resource)
                if deduplication:
                    resource = process_target_resource(
                        source_spec,
                        {
                            'name': target_name,
                            'path': os.path.join('data', target_name + '.csv')
                        })
                    new_resources.append(resource)

            elif resource['name'] == target_name:
                assert isinstance(source_spec, dict), \
                    "Source resource must appear before target resource"
                resource = process_target_resource(source_spec, resource)
                new_resources.append(resource)

            else:
                new_resources.append(resource)

        datapackage['resources'] = new_resources

    def func(package: PackageWrapper):
        process_datapackage(package.pkg.descriptor)
        yield package.pkg
        yield from new_resource_iterator(package)

    return func
Example #14
import logging, requests, os
from knesset_data.protocols.committee import CommitteeMeetingProtocol
import hashlib, json
from kvfile import KVFile
from dataflows import Flow, load
import csv
from fuzzywuzzy import fuzz
import traceback


BASE_HASH_OBJ = hashlib.md5()
with open('../people/committee_meeting_speaker_stats.py', 'rb') as f:
    BASE_HASH_OBJ.update(f.read())


speaker_stats_kv = KVFile()


mk_individual_factions = {}
mk_individual_names = {}


def speaker_stats_resource():
    for k, row in speaker_stats_kv.items():
        # logging.info(row)
        row['CommitteeSessionID'], row['parts_crc32c'], row['part_index'] = k.split('-')
        yield row


def add_speaker_stats_row(row):
    key = '{}-{}-{}'.format(row['CommitteeSessionID'], row['parts_crc32c'], row['part_index'])
Example #15
def _get_resource(self, last_update_resource=None):
    last_kvfile, last_update, key_fields, incremental_field = None, None, None, None
    if last_update_resource is not None:
        last_kvfile = KVFile()
        key_fields = self._parameters.get('incremental-field-key',
                                          [self._primary_key_field_name])
        incremental_field = self._parameters['incremental-field']
        for row in last_update_resource:
            key = '-'.join([str(row[k]) for k in key_fields])
            try:
                last_row = last_kvfile.get(key)
            except KeyError:
                last_row = None
            if not last_row or last_row[incremental_field] < row[
                    incremental_field]:
                last_kvfile.set(key, dict(row))
                if not last_update or last_update < row[incremental_field]:
                    last_update = row[incremental_field]
        if last_update:
            logging.info('last_update={}'.format(last_update))
    resources_yielded = 0
    with utils.temp_loglevel():
        logging.info(
            "Loading dataservice resource from service {} method {}".
            format(self._parameters["service-name"],
                   self._parameters["method-name"]))
        # with process_metrics('dataservice_collection_row',
        #                      {'service_name': self._parameters['service-name'],
        #                       'method_name': self._parameters['method-name']}) as send_process_metrics:
        if last_update:
            if self._parameters.get('incremental-field-type') == 'integer':
                last_update_str = last_update
            else:
                last_update_str = (
                    last_update -
                    datetime.timedelta(days=1)).strftime('%Y-%m-%d')
            since_last_update = (self._parameters['incremental-field'],
                                 last_update_str,
                                 self._parameters.get(
                                     'incremental-field-type', 'datetime'))
        else:
            since_last_update = None
        for dataservice_object in self.dataservice_class.get_all(
                since_last_update=since_last_update):
            row = self._filter_dataservice_object(dataservice_object)
            if os.environ.get(
                    "OVERRIDE_DATASERVICE_COLLECTION_LIMIT_ITEMS", ""):
                if int(
                        os.environ.get(
                            "OVERRIDE_DATASERVICE_COLLECTION_LIMIT_ITEMS",
                            "")) < resources_yielded:
                    return
            for k in row:
                for field in self._schema["fields"]:
                    if field["name"] == k:
                        if field["type"] == "integer" and row[
                                k] is not None:
                            row[k] = int(row[k])
            if last_update:
                key = '-'.join([str(row[k]) for k in key_fields])
                last_kvfile.set(key, dict(row))
            else:
                resources_yielded += 1
                yield row
            # send_process_metrics()
            if resources_yielded > 0 and resources_yielded % 10000 == 0:
                logging.info("Loaded {} dataservice objects".format(
                    resources_yielded))
        if last_update:
            for key, row in last_kvfile.items():
                resources_yielded += 1
                yield row
                if resources_yielded % 10000 == 0:
                    logging.info("Loaded {} dataservice objects".format(
                        resources_yielded))
Example #16
def test_insert():
    from kvfile import KVFile
    kv = KVFile()
    kv.insert(((str(i), ':{}'.format(i)) for i in range(50000)))
    assert len(list(kv.keys())) == 50000
    assert len(list(kv.items())) == 50000
    assert kv.get('49999') == ':49999'

    kv.insert(((str(i), ':{}'.format(i)) for i in range(50000, 100000)),
              batch_size=40000)
    assert len(list(kv.items())) == 100000

    kv.insert(((str(i), ':{}'.format(i)) for i in range(100000, 100002)),
              batch_size=1)
    kv.insert(((str(i), ':{}'.format(i)) for i in range(100002, 100005)),
              batch_size=0)
    assert len(list(kv.items())) == 100005