コード例 #1
0
ファイル: test_common.py プロジェクト: whalyapp/tap-mongodb
    def test_no_change(self):
        """Check the 'changed' flag of row_to_schema across repeated calls.

        The same schema dict is reused for every call, so each step builds
        on whatever the previous call left in it.
        """
        schema = {"type": "object", "properties": {}}

        plain_row = {
            "a_str": "hello",
            "a_list": ["foo", "bar", 1, 2, {"name": "nick"}],
            "an_object": {
                "a_nested_str": "baz",
                "a_nested_list": [1, 2, "hi"],
            },
        }
        timestamp_row = {
            "a_str": "hello",
            "a_date": bson.timestamp.Timestamp(1565897157, 1),
        }

        # Each step: (row fed to row_to_schema, assertion applied to the flag).
        steps = [
            # the first row is expected to leave the schema untouched
            (plain_row, self.assertFalse),
            # another row that looks the same keeps changed false
            (plain_row, self.assertFalse),
            # a different looking row makes the schema change
            (timestamp_row, self.assertTrue),
            # the same (different) row again sets changed back to false
            (timestamp_row, self.assertFalse),
        ]
        for current_row, check in steps:
            check(common.row_to_schema(schema, current_row))
コード例 #2
0
ファイル: test_common.py プロジェクト: whalyapp/tap-mongodb
    def test_decimal_and_date(self):
        """A field seen as a timestamp and then as a Decimal128 should list both."""
        timestamp_row = {"a_field": bson.timestamp.Timestamp(1565897157, 1)}
        decimal128_row = {"a_field": bson.Decimal128(decimal.Decimal('1.34'))}

        schema = {"type": "object", "properties": {}}

        # Feed both rows into the same schema, timestamp first.
        date_flag = common.row_to_schema(schema, timestamp_row)
        decimal_flag = common.row_to_schema(schema, decimal128_row)

        date_time_entry = {"type": "string", "format": "date-time"}
        decimal_entry = {"type": "number",
                         "multipleOf": decimal.Decimal('1e-34')}
        expected = {
            "type": "object",
            "properties": {
                "a_field": {"anyOf": [date_time_entry, decimal_entry, {}]},
            },
        }

        self.assertTrue(date_flag)
        self.assertTrue(decimal_flag)
        self.assertEqual(expected, schema)
コード例 #3
0
ファイル: test_common.py プロジェクト: whalyapp/tap-mongodb
    def test_nested_data(self):
        """A timestamp inside a nested object should yield a nested object schema."""
        nested_row = {"foo": {"a_field": bson.timestamp.Timestamp(1565897157, 1)}}
        schema = {"type": "object", "properties": {}}

        was_changed = common.row_to_schema(schema, nested_row)

        # Expected schema, built inside-out: leaf field, wrapping object, root.
        leaf_field = {"anyOf": [{"type": "string", "format": "date-time"}, {}]}
        inner_object = {
            "type": "object",
            "properties": {"a_field": leaf_field},
        }
        expected = {
            "type": "object",
            "properties": {"foo": {"anyOf": [inner_object, {}]}},
        }

        self.assertTrue(was_changed)
        self.assertEqual(expected, schema)
コード例 #4
0
ファイル: oplog.py プロジェクト: psschroeter/tap-mongodb
def write_schema(schema, row, stream):
    """Fold ``row`` into ``schema`` and emit a SCHEMA message if it changed.

    Also updates the per-stream counters in ``common``: SCHEMA_COUNT is
    incremented for each emitted message, and SCHEMA_TIMES accumulates the
    wall-clock time spent in this function.
    """
    started_at = time.time()

    schema_changed = common.row_to_schema(schema, row)
    if schema_changed:
        destination = common.calculate_destination_stream_name(stream)
        message = singer.SchemaMessage(stream=destination,
                                       schema=schema,
                                       key_properties=['_id'])
        singer.write_message(message)
        common.SCHEMA_COUNT[stream['tap_stream_id']] += 1

    # Timing is recorded whether or not a message was written.
    common.SCHEMA_TIMES[stream['tap_stream_id']] += time.time() - started_at
コード例 #5
0
ファイル: __init__.py プロジェクト: psschroeter/tap-mongodb
def get_collection_schema_from_rows(collection, config):
    """Build a schema dict by sampling rows of ``collection``.

    Reads up to ``config['discovery_row_limit']`` rows (default 1000) in
    descending ``_id`` order and folds each one into the schema through
    ``common.row_to_schema``. A row that raises is logged and skipped.

    Returns the resulting schema dict.
    """
    LOGGER.info('Getting schema from collection data: %s.%s', collection.database.name, collection.name)

    inferred = {
        'type': 'object',
        'properties': {'_id': {'type': ['null', 'string']}},
    }

    row_limit = config.get('discovery_row_limit', 1000)
    sampled_rows = collection.find().sort('_id', -1).limit(row_limit)
    with sampled_rows as cursor:
        for document in cursor:
            try:
                common.row_to_schema(inferred, document)
            except Exception as err:
                # Best effort: one bad row must not abort discovery.
                LOGGER.critical(err)

    return inferred
コード例 #6
0
ファイル: test_common.py プロジェクト: whalyapp/tap-mongodb
    def test_float_then_float(self):
        """A second float for the same field should not change the schema again."""
        first_row = {"a_field": 1.34}
        second_row = {"a_field": 1.34}

        schema = {"type": "object", "properties": {}}

        first_flag = common.row_to_schema(schema, first_row)
        second_flag = common.row_to_schema(schema, second_row)

        number_field = {"anyOf": [{"type": "number"}, {}]}
        expected = {
            "type": "object",
            "properties": {"a_field": number_field},
        }

        self.assertTrue(first_flag)
        self.assertFalse(second_flag)
        self.assertEqual(expected, schema)
コード例 #7
0
ファイル: test_common.py プロジェクト: whalyapp/tap-mongodb
    def test_decimal_then_decimal(self):
        """A second Decimal128 for the same field should not change the schema again."""
        first_row = {"a_field": bson.Decimal128(decimal.Decimal('1.34'))}
        second_row = {"a_field": bson.Decimal128(decimal.Decimal('1.34'))}

        schema = {"type": "object", "properties": {}}

        first_flag = common.row_to_schema(schema, first_row)
        second_flag = common.row_to_schema(schema, second_row)

        decimal_entry = {"type": "number",
                         "multipleOf": decimal.Decimal('1e-34')}
        expected = {
            "type": "object",
            "properties": {"a_field": {"anyOf": [decimal_entry, {}]}},
        }

        self.assertTrue(first_flag)
        self.assertFalse(second_flag)
        self.assertEqual(expected, schema)
コード例 #8
0
ファイル: test_common.py プロジェクト: whalyapp/tap-mongodb
    def test_decimal_then_float(self):
        """A float after a Decimal128 should relax the field to a plain number."""
        decimal128_row = {"a_field": bson.Decimal128(decimal.Decimal('1.34'))}
        plain_float_row = {"a_field": 1.34}

        schema = {"type": "object", "properties": {}}

        decimal_flag = common.row_to_schema(schema, decimal128_row)
        float_flag = common.row_to_schema(schema, plain_float_row)

        # No "multipleOf" constraint is expected once a float has been seen.
        number_field = {"anyOf": [{"type": "number"}, {}]}
        expected = {
            "type": "object",
            "properties": {"a_field": number_field},
        }

        self.assertTrue(decimal_flag)
        self.assertTrue(float_flag)

        self.assertEqual(expected, schema)
コード例 #9
0
ファイル: test_common.py プロジェクト: whalyapp/tap-mongodb
    def test_simple_float(self):
        """A single float field should produce a plain 'number' schema entry."""
        float_row = {"a_float": 1.34}
        schema = {"type": "object", "properties": {}}

        was_changed = common.row_to_schema(schema, float_row)

        number_field = {"anyOf": [{"type": "number"}, {}]}
        expected = {
            "type": "object",
            "properties": {"a_float": number_field},
        }

        self.assertTrue(was_changed)
        self.assertEqual(expected, schema)
コード例 #10
0
ファイル: test_common.py プロジェクト: whalyapp/tap-mongodb
    def test_simple_decimal(self):
        """A single Decimal128 field should produce a 'number' entry with multipleOf."""
        decimal128_row = {"a_decimal": bson.Decimal128(decimal.Decimal('1.34'))}
        schema = {"type": "object", "properties": {}}

        was_changed = common.row_to_schema(schema, decimal128_row)

        decimal_entry = {"type": "number",
                         "multipleOf": decimal.Decimal('1e-34')}
        expected = {
            "type": "object",
            "properties": {"a_decimal": {"anyOf": [decimal_entry, {}]}},
        }

        self.assertTrue(was_changed)
        self.assertEqual(expected, schema)
コード例 #11
0
ファイル: test_common.py プロジェクト: whalyapp/tap-mongodb
    def test_array_multiple_types(self):
        """An array mixing a timestamp and a Decimal128 should list both item types."""
        mixed_values = [
            bson.timestamp.Timestamp(1565897157, 1),
            bson.Decimal128(decimal.Decimal('1.34')),
        ]
        array_row = {"foo": mixed_values}
        schema = {"type": "object", "properties": {}}

        was_changed = common.row_to_schema(schema, array_row)

        # Expected schema, built inside-out: item types, array, root.
        date_time_entry = {"type": "string", "format": "date-time"}
        decimal_entry = {"type": "number",
                         "multipleOf": decimal.Decimal('1e-34')}
        array_entry = {
            "type": "array",
            "items": {"anyOf": [date_time_entry, decimal_entry, {}]},
        }
        expected = {
            "type": "object",
            "properties": {"foo": {"anyOf": [array_entry, {}]}},
        }

        self.assertTrue(was_changed)
        self.assertEqual(expected, schema)
コード例 #12
0
ファイル: full_table.py プロジェクト: psschroeter/tap-mongodb
def sync_collection(client, stream, state, projection):
    """Run a full-table sync of one collection, emitting Singer messages.

    Steps, in order:
      1. Decide the table version: reuse the bookmarked one when a
         'last_id_fetched' bookmark exists (previous run was interrupted),
         otherwise generate a new millisecond timestamp.
      2. Determine the upper ``_id`` bound (bookmarked 'max_id_value' or
         ``get_max_id_value(collection)``) and bookmark it.
      3. Query ``_id <= max_id_value`` (and ``>= last_id_fetched`` when
         resuming) in ascending ``_id`` order. For each row, update the
         schema, write a RECORD message and bookmark the row's ``_id``.
      4. Clear the resume bookmarks, mark 'initial_full_table_complete' and
         emit ACTIVATE_VERSION.

    Args:
        client: indexed by database name to obtain a database, which is in
            turn indexed by ``stream['stream']`` to obtain the collection.
        stream: catalog entry dict; reads 'tap_stream_id', 'metadata',
            'stream' and 'schema'.
        state: Singer state dict; bookmarks are read from and written to it.
        projection: passed as the second argument to ``collection.find``;
            only logged when truthy.

    Returns:
        None. Results are emitted through ``singer.write_message``.
    """
    tap_stream_id = stream['tap_stream_id']
    LOGGER.info('Starting full table sync for %s', tap_stream_id)

    md_map = metadata.to_map(stream['metadata'])
    database_name = metadata.get(md_map, (), 'database-name')

    db = client[database_name]
    collection = db[stream['stream']]

    #before writing the table version to state, check if we had one to begin with
    first_run = singer.get_bookmark(state, stream['tap_stream_id'],
                                    'version') is None

    # last run was interrupted if there is a last_id_fetched bookmark
    was_interrupted = singer.get_bookmark(state, stream['tap_stream_id'],
                                          'last_id_fetched') is not None

    #pick a new table version if last run wasn't interrupted
    if was_interrupted:
        stream_version = singer.get_bookmark(state, stream['tap_stream_id'],
                                             'version')
    else:
        # current time in milliseconds
        stream_version = int(time.time() * 1000)

    state = singer.write_bookmark(state, stream['tap_stream_id'], 'version',
                                  stream_version)
    # a deep copy of the state is emitted, not the live dict
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    # Built once; emitted up front on a first run and again at the end.
    activate_version_message = singer.ActivateVersionMessage(
        stream=common.calculate_destination_stream_name(stream),
        version=stream_version)

    # For the initial replication, emit an ACTIVATE_VERSION message
    # at the beginning so the records show up right away.
    if first_run:
        singer.write_message(activate_version_message)

    if singer.get_bookmark(state, stream['tap_stream_id'], 'max_id_value'):
        # There is a bookmark
        max_id_value = singer.get_bookmark(state, stream['tap_stream_id'],
                                           'max_id_value')
        max_id_type = singer.get_bookmark(state, stream['tap_stream_id'],
                                          'max_id_type')
        # bookmarks store the value as a string plus its class name
        max_id_value = common.string_to_class(max_id_value, max_id_type)
    else:
        max_id_value = get_max_id_value(collection)

    last_id_fetched = singer.get_bookmark(state, stream['tap_stream_id'],
                                          'last_id_fetched')

    if max_id_value:
        # Write the bookmark if max_id_value is defined
        state = singer.write_bookmark(
            state, stream['tap_stream_id'], 'max_id_value',
            common.class_to_string(max_id_value,
                                   max_id_value.__class__.__name__))
        state = singer.write_bookmark(state, stream['tap_stream_id'],
                                      'max_id_type',
                                      max_id_value.__class__.__name__)

    # Upper bound is always applied, even when max_id_value is falsy.
    find_filter = {'$lte': max_id_value}
    if last_id_fetched:
        last_id_fetched_type = singer.get_bookmark(state,
                                                   stream['tap_stream_id'],
                                                   'last_id_fetched_type')
        # $gte is inclusive, so the last bookmarked row is fetched again
        find_filter['$gte'] = common.string_to_class(last_id_fetched,
                                                     last_id_fetched_type)

    query_message = 'Querying {} with:\n\tFind Parameters: {}'.format(
        stream['tap_stream_id'], find_filter)
    if projection:
        query_message += '\n\tProjection: {}'.format(projection)
    # pylint: disable=logging-format-interpolation
    LOGGER.info(query_message)

    with collection.find({'_id': find_filter},
                         projection,
                         sort=[("_id", pymongo.ASCENDING)]) as cursor:
        rows_saved = 0
        time_extracted = utils.now()
        start_time = time.time()

        # Fall back to an empty object schema when the stream's is falsy.
        schema = stream['schema'] or {"type": "object", "properties": {}}
        for row in cursor:
            rows_saved += 1

            # Update the schema with this row; emit SCHEMA if it changed.
            schema_build_start_time = time.time()
            if common.row_to_schema(schema, row):
                singer.write_message(
                    singer.SchemaMessage(
                        stream=common.calculate_destination_stream_name(
                            stream),
                        schema=schema,
                        key_properties=['_id']))
                common.SCHEMA_COUNT[stream['tap_stream_id']] += 1
            common.SCHEMA_TIMES[stream['tap_stream_id']] += time.time(
            ) - schema_build_start_time

            record_message = common.row_to_singer_record(
                stream, row, stream_version, time_extracted)

            singer.write_message(record_message)

            # Track progress so an interrupted run can resume from this _id.
            state = singer.write_bookmark(
                state, stream['tap_stream_id'], 'last_id_fetched',
                common.class_to_string(row['_id'],
                                       row['_id'].__class__.__name__))
            state = singer.write_bookmark(state, stream['tap_stream_id'],
                                          'last_id_fetched_type',
                                          row['_id'].__class__.__name__)

            # STATE is only emitted every UPDATE_BOOKMARK_PERIOD rows.
            if rows_saved % common.UPDATE_BOOKMARK_PERIOD == 0:
                singer.write_message(
                    singer.StateMessage(value=copy.deepcopy(state)))

        common.COUNTS[tap_stream_id] += rows_saved
        common.TIMES[tap_stream_id] += time.time() - start_time

    # clear max pk value and last pk fetched upon successful sync
    singer.clear_bookmark(state, stream['tap_stream_id'], 'max_id_value')
    singer.clear_bookmark(state, stream['tap_stream_id'], 'max_id_type')
    singer.clear_bookmark(state, stream['tap_stream_id'], 'last_id_fetched')
    singer.clear_bookmark(state, stream['tap_stream_id'],
                          'last_id_fetched_type')

    state = singer.write_bookmark(state, stream['tap_stream_id'],
                                  'initial_full_table_complete', True)

    # NOTE(review): no STATE message is emitted here after the final bookmark
    # changes — presumably the caller writes the state; confirm.
    singer.write_message(activate_version_message)

    LOGGER.info('Syncd {} records for {}'.format(rows_saved, tap_stream_id))
コード例 #13
0
def sync_collection(client, stream, state, projection):
    """Run an incremental (replication-key based) sync of one collection.

    Steps, in order:
      1. Reuse the bookmarked table version, or create a new millisecond
         timestamp on the first run, and bookmark it.
      2. Build a filter ``replication_key >= bookmarked value`` when a
         'replication_key_value' bookmark exists; otherwise query everything.
      3. Iterate the rows sorted ascending by the replication key. For each
         row, update the schema, write a RECORD message and call
         ``update_bookmark``.
      4. Emit ACTIVATE_VERSION and log the number of rows.

    Args:
        client: indexed by database name, then by ``stream['stream']``, to
            obtain the collection.
        stream: catalog entry dict; reads 'tap_stream_id', 'metadata' and
            'stream'.
        state: Singer state dict; bookmarks are read from and written to it.
        projection: passed as the second argument to ``collection.find``;
            only logged when truthy.

    Returns:
        None. Results are emitted through ``singer.write_message``.
    """
    tap_stream_id = stream['tap_stream_id']
    LOGGER.info('Starting incremental sync for %s', tap_stream_id)

    # metadata entry stored under the empty-tuple breadcrumb
    stream_metadata = metadata.to_map(stream['metadata']).get(())
    collection = client[stream_metadata['database-name']][stream['stream']]

    #before writing the table version to state, check if we had one to begin with
    first_run = singer.get_bookmark(state, stream['tap_stream_id'], 'version') is None

    # pick a new table version only on the first run; otherwise reuse the bookmarked one
    if first_run:
        # current time in milliseconds
        stream_version = int(time.time() * 1000)
    else:
        stream_version = singer.get_bookmark(state, stream['tap_stream_id'], 'version')

    state = singer.write_bookmark(state,
                                  stream['tap_stream_id'],
                                  'version',
                                  stream_version)

    # Built once; emitted up front on a first run and again at the end.
    activate_version_message = singer.ActivateVersionMessage(
        stream=common.calculate_destination_stream_name(stream),
        version=stream_version
    )


    # For the initial replication, emit an ACTIVATE_VERSION message
    # at the beginning so the records show up right away.
    if first_run:
        singer.write_message(activate_version_message)

    # get replication key, and bookmarked value/type
    stream_state = state.get('bookmarks', {}).get(tap_stream_id, {})

    replication_key_name = stream_metadata.get('replication-key')
    replication_key_value_bookmark = stream_state.get('replication_key_value')

    # write state message
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    # create query
    find_filter = {}
    if replication_key_value_bookmark:
        find_filter[replication_key_name] = {}
        # $gte is inclusive, so rows equal to the bookmark are fetched again
        find_filter[replication_key_name]['$gte'] = \
            common.string_to_class(replication_key_value_bookmark,
                                   stream_state.get('replication_key_type'))

    # log query
    query_message = 'Querying {} with:\n\tFind Parameters: {}'.format(tap_stream_id, find_filter)
    if projection:
        query_message += '\n\tProjection: {}'.format(projection)
    LOGGER.info(query_message)


    # query collection
    # The schema always starts empty here; stream['schema'] is not consulted.
    schema = {"type": "object", "properties": {}}
    with collection.find(find_filter,
                         projection,
                         sort=[(replication_key_name, pymongo.ASCENDING)]) as cursor:
        rows_saved = 0
        time_extracted = utils.now()
        start_time = time.time()

        for row in cursor:
            # Update the schema with this row; emit SCHEMA if it changed.
            schema_build_start_time = time.time()
            if common.row_to_schema(schema, row):
                singer.write_message(singer.SchemaMessage(
                    stream=common.calculate_destination_stream_name(stream),
                    schema=schema,
                    key_properties=['_id']))
                common.SCHEMA_COUNT[tap_stream_id] += 1
            common.SCHEMA_TIMES[tap_stream_id] += time.time() - schema_build_start_time


            record_message = common.row_to_singer_record(stream,
                                                         row,
                                                         stream_version,
                                                         time_extracted)

            # NOTE(review): commented-out code below; consider deleting it if
            # it is no longer needed.
            # gen_schema = common.row_to_schema_message(schema, record_message.record, row)
            # if DeepDiff(schema, gen_schema, ignore_order=True) != {}:
            #   emit gen_schema
            #   schema = gen_schema
            singer.write_message(record_message)
            rows_saved += 1

            # presumably records the row's replication-key value in `state`;
            # update_bookmark is defined outside this block — confirm.
            update_bookmark(row, state, tap_stream_id, replication_key_name)

            # STATE is only emitted every UPDATE_BOOKMARK_PERIOD rows.
            if rows_saved % common.UPDATE_BOOKMARK_PERIOD == 0:
                singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))


        common.COUNTS[tap_stream_id] += rows_saved
        common.TIMES[tap_stream_id] += time.time()-start_time

    singer.write_message(activate_version_message)

    LOGGER.info('Synced %s records for %s', rows_saved, tap_stream_id)
コード例 #14
0
ファイル: test_common.py プロジェクト: whalyapp/tap-mongodb
    def test_array_nested(self):
        """An array holding a nested array and an object gets a combined schema.

        The first row carries timestamp/Decimal128 values nested in a list and
        in an object; the second row has the same shape with plain strings and
        is expected not to change the schema. The transformed second row is
        then validated against the generated schema.
        """
        row = {
            "foo": [
                [
                    bson.timestamp.Timestamp(1565897157, 1),
                    bson.Decimal128(decimal.Decimal('1.34'))
                ],
                {
                    "bar": bson.timestamp.Timestamp(1565897157, 1),
                    "bat": bson.Decimal128(decimal.Decimal('1.34'))
                }
            ]
        }
        row_2 = {
            "bar": "1",
            "foo": [
                ["bob", "roger"],
                {
                    "bar": "bob",
                    "bat": "roger"
                }
            ]
        }
        schema = {"type": "object", "properties": {}}
        changed = common.row_to_schema(schema, row)
        changed_2 = common.row_to_schema(schema, row_2)

        # Expected schema, built from its two leaf types outwards.
        date_time_entry = {"type": "string", "format": "date-time"}
        decimal_entry = {"type": "number",
                         "multipleOf": decimal.Decimal('1e-34')}
        nested_array = {
            "type": "array",
            "items": {"anyOf": [date_time_entry, decimal_entry, {}]}
        }
        nested_object = {
            "type": "object",
            "properties": {
                "bar": {"anyOf": [date_time_entry, {}]},
                "bat": {"anyOf": [decimal_entry, {}]}
            }
        }
        expected = {
            "type": "object",
            "properties": {
                "foo": {
                    "anyOf": [
                        {
                            "type": "array",
                            "items": {"anyOf": [nested_array, nested_object, {}]}
                        },
                        {}
                    ]
                }
            }
        }

        # Drop MinKey/MaxKey values before transforming the row.
        excluded_types = (bson.min_key.MinKey, bson.max_key.MaxKey)
        singer_row = {k: common.transform_value(v, [k]) for k, v in row_2.items()
                      if not isinstance(v, excluded_types)}

        # Raise the precision only for the validation call. Assigning to
        # decimal.getcontext().prec would change the thread's context for good
        # and leak into every test that runs afterwards.
        # NOTE(review): the high precision is presumably for the
        # Decimal('1e-34') multipleOf check — confirm.
        with decimal.localcontext() as ctx:
            ctx.prec = 100000
            validate(instance=singer_row, schema=schema)

        self.assertTrue(changed)
        self.assertFalse(changed_2)
        self.assertEqual(expected, schema)