Example #1
def test_default():
    import pytest
    from kvfile import KVFile
    kv = KVFile()
    kv.set('aaaa', 5)
    assert kv.get('aaaa') == 5
    # get() returns the default for a missing key...
    assert kv.get('bbbb', default=6) == 6
    # ...and raises KeyError when no default is given
    with pytest.raises(KeyError):
        kv.get('bbbb')
Example #2
def get_all_existing_ids(connection_string, db_table, key_fields,
                         db_status_fields):
    from kvfile import KVFile
    from sqlalchemy import create_engine
    from sqlalchemy.exc import OperationalError, ProgrammingError

    db_fields = key_fields + db_status_fields
    stmt = ' '.join(['select', ','.join(db_fields), 'from', db_table])
    engine = create_engine(connection_string)
    ret = KVFile()
    try:
        rows = engine.execute(stmt)
        for row in rows:
            rec = dict(zip(db_fields, row))
            existing_id = dict(
                (k, v) for k, v in rec.items() if k in db_status_fields)
            # calc_key (defined elsewhere in the module) builds the lookup
            # key from the record's key fields
            key = calc_key(rec, key_fields)
            ret.set(key, existing_id)
    except (ProgrammingError, OperationalError):
        print('WARNING: Failed to fetch existing keys')
    return ret
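
A minimal usage sketch (the connection string, table and field names are hypothetical, and calc_key is assumed to build the same composite key used when reading back):

existing = get_all_existing_ids(
    'postgresql://user:pass@localhost:5432/mydb',  # hypothetical DSN
    'my_table', ['id'], ['status'])
try:
    status_fields = existing.get('42')  # key as produced by calc_key
except KeyError:
    status_fields = None  # row not present in the table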
Example #3
def test_sanity():
    import datetime
    import decimal
    from kvfile import KVFile

    kv = KVFile()

    # values of several types, to verify they survive the serialization
    # round trip
    data = dict(
        s='value',
        i=123,
        d=datetime.datetime.fromtimestamp(12325),
        n=decimal.Decimal('1234.56'),
        ss=set(range(10)),
        o=dict(d=decimal.Decimal('1234.58'), n=datetime.datetime.fromtimestamp(12325))
    )

    for k, v in data.items():
        kv.set(k, v)

    for k, v in data.items():
        assert kv.get(k) == v

    assert sorted(kv.keys()) == sorted(data.keys())
    assert sorted(kv.items()) == sorted(data.items())
Example #4
import os
import json
import logging
from collections import defaultdict

from kvfile import KVFile
from sqlalchemy import create_engine
from dataflows import Flow, load, update_resource, dump_to_path

# `config` is a project-level module providing db_settings


def flow(parameters, *_):
    stats = defaultdict(int)
    kv = KVFile()
    last_id = None
    load_from = parameters.get("load_from", parameters.get('dump_to_path'))
    if load_from and os.path.exists(os.path.join(load_from,
                                                 "datapackage.json")):
        logging.info("Loading from last load_from_db package: " +
                     os.path.join(load_from, "datapackage.json"))
        row = None
        for resource in Flow(
                load(os.path.join(load_from, "datapackage.json"),
                     limit_rows=parameters.get("limit_rows"),
                     resources="db_data")).datastream().res_iter:
            for row in resource:
                stats['loaded from package'] += 1
                last_id = row['__id']
                kv.set("{:0>12}".format(last_id), row)
                if last_id % 10000 == 0:
                    logging.info("Loaded id: %s" % last_id)
        all_data_keys = set(row.keys()) if row else set()
    else:
        all_data_keys = set()
    logging.info('num rows loaded from package: %s' %
                 stats['loaded from package'])
    engine = create_engine(
        "postgresql://{username}:{password}@{host}:5432/reports?sslmode=verify-ca&sslrootcert={sslrootcert}&sslcert={sslcert}&sslkey={sslkey}"
        .format(**config.db_settings))
    engine.update_execution_options(stream_results=True)
    if parameters.get("where"):
        logging.info("Loading from DB, with where clause: " +
                     parameters["where"])
        where = " where " + parameters["where"]
    elif last_id:
        logging.info("Loading from DB, starting at id %s" % last_id)
        where = " where id > %s" % last_id
    else:
        logging.info("Loading all records from DB")
        where = ""
    for id, created, data in engine.execute(
            "select id, created, data from reports%s order by id" % where):
        if parameters.get("filter_db_row_callback"):
            id, created, data = parameters["filter_db_row_callback"](id,
                                                                     created,
                                                                     data)
        if not data or not isinstance(data, dict):
            stats['invalid data'] += 1
            continue
        stats['loaded from db'] += 1
        last_id = id
        row = {
            "__id": id,
            "__created": created,
        }
        for k, v in data.items():
            all_data_keys.add(k)
            row[k] = v
        kv.set("{:0>12}".format(id), row)
        if id % 100000 == 0:
            logging.info("Loaded id: %s" % id)
        if parameters.get("limit_rows") and stats[
                'loaded from db'] > parameters["limit_rows"]:
            break
    logging.info('DB rows with invalid data: %s' % stats['invalid data'])
    logging.info("last_id = %s" % last_id)
    logging.info('num rows loaded from db: %s' % stats['loaded from db'])

    def _yield_from_kv():
        for _, row in kv.items():
            yield {
                "__id": row["__id"],
                "__created": row["__created"],
                **{
                    k: json.dumps(row.get(k))
                    for k in all_data_keys if k not in ["__id", "__created"]
                }
            }

    flow_args = [
        _yield_from_kv(),
        update_resource(
            -1,
            name="db_data",
            path="db_data.csv",
            schema={
                "fields": [{
                    "name": "__id",
                    "type": "integer"
                }, {
                    "name": "__created",
                    "type": "datetime"
                }, *[{
                    "name": k,
                    "type": "string"
                } for k in all_data_keys if k not in ["__id", "__created"]]]
            },
            **{"dpp:streaming": True}),
    ]
    if parameters.get("dump_to_path"):
        flow_args += [dump_to_path(parameters['dump_to_path'])]
    return Flow(*flow_args)
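
A note on the "{:0>12}".format(id) keys used above: KVFile keys are strings, and the example pads ids to a fixed width so that string-sorted key order matches numeric id order. A quick sketch of the difference:

# unpadded numeric ids sort lexicographically, i.e. wrongly
assert sorted(['2', '10']) == ['10', '2']
# zero-padding to a fixed width restores numeric order
assert sorted('{:0>12}'.format(i) for i in (10, 2)) == [
    '000000000002', '000000000010']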
Example #5
def _get_resource(self, last_update_resource=None):
    # Method excerpt: assumes module-level imports of os, datetime and
    # logging, `from kvfile import KVFile`, and a project `utils` module
    # providing temp_loglevel()
    last_kvfile, last_update, key_fields, incremental_field = None, None, None, None
    if last_update_resource is not None:
        # dedup the previous run's rows, keeping only the newest row per
        # key and tracking the highest incremental-field value seen
        last_kvfile = KVFile()
        key_fields = self._parameters.get('incremental-field-key',
                                          [self._primary_key_field_name])
        incremental_field = self._parameters['incremental-field']
        for row in last_update_resource:
            key = '-'.join([str(row[k]) for k in key_fields])
            try:
                last_row = last_kvfile.get(key)
            except KeyError:
                last_row = None
            if not last_row or last_row[incremental_field] < row[incremental_field]:
                last_kvfile.set(key, dict(row))
                if not last_update or last_update < row[incremental_field]:
                    last_update = row[incremental_field]
        if last_update:
            logging.info('last_update={}'.format(last_update))
    resources_yielded = 0
    with utils.temp_loglevel():
        logging.info(
            "Loading dataservice resource from service {} method {}".format(
                self._parameters["service-name"],
                self._parameters["method-name"]))
        # with process_metrics('dataservice_collection_row',
        #                      {'service_name': self._parameters['service-name'],
        #                       'method_name': self._parameters['method-name']}) as send_process_metrics:
        if last_update:
            if self._parameters.get('incremental-field-type') == 'integer':
                last_update_str = last_update
            else:
                # start one day before the last seen update (the value is
                # truncated to a date)
                last_update_str = (
                    last_update - datetime.timedelta(days=1)).strftime('%Y-%m-%d')
            since_last_update = (self._parameters['incremental-field'],
                                 last_update_str,
                                 self._parameters.get(
                                     'incremental-field-type', 'datetime'))
        else:
            since_last_update = None
        for dataservice_object in self.dataservice_class.get_all(
                since_last_update=since_last_update):
            row = self._filter_dataservice_object(dataservice_object)
            limit_items = os.environ.get(
                "OVERRIDE_DATASERVICE_COLLECTION_LIMIT_ITEMS", "")
            if limit_items and int(limit_items) < resources_yielded:
                return
            # coerce integer fields according to the schema
            for k in row:
                for field in self._schema["fields"]:
                    if (field["name"] == k and field["type"] == "integer"
                            and row[k] is not None):
                        row[k] = int(row[k])
            if last_update:
                # incremental run: merge into the deduped store instead
                # of yielding directly
                key = '-'.join([str(row[k]) for k in key_fields])
                last_kvfile.set(key, dict(row))
            else:
                resources_yielded += 1
                yield row
            # send_process_metrics()
            if resources_yielded > 0 and resources_yielded % 10000 == 0:
                logging.info("Loaded {} dataservice objects".format(
                    resources_yielded))
        if last_update:
            # incremental run: yield the merged rows (old plus new)
            for key, row in last_kvfile.items():
                resources_yielded += 1
                yield row
                if resources_yielded % 10000 == 0:
                    logging.info("Loaded {} dataservice objects".format(
                        resources_yielded))
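
The incremental merge above, reduced to a standalone sketch (rows, id and updated_at are hypothetical stand-ins for the resource and its key/incremental fields):

from kvfile import KVFile

kv = KVFile()
for row in rows:
    key = str(row['id'])
    try:
        prev = kv.get(key)
    except KeyError:
        prev = None
    # keep only the newest version of each row, keyed by id
    if prev is None or prev['updated_at'] < row['updated_at']:
        kv.set(key, dict(row))
# kv now holds one (newest) record per key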