Code Example #1
    def test_round_trip(self):
        record_message = singer.RecordMessage(record={'name': 'foo'},
                                              stream='users')

        schema_message = singer.SchemaMessage(stream='users',
                                              key_properties=['name'],
                                              schema={
                                                  'type': 'object',
                                                  'properties': {
                                                      'name': {
                                                          'type': 'string'
                                                      }
                                                  }
                                              })

        state_message = singer.StateMessage(value={'seq': 1})

        self.assertEqual(
            record_message,
            singer.parse_message(singer.format_message(record_message)))
        self.assertEqual(
            schema_message,
            singer.parse_message(singer.format_message(schema_message)))
        self.assertEqual(
            state_message,
            singer.parse_message(singer.format_message(state_message)))
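For reference, format_message serializes each message to a single-line JSON string and parse_message reverses it, which is exactly what the round trip above exercises. A minimal sketch of the STATE case (key order in the output may vary):

import singer

state_message = singer.StateMessage(value={'seq': 1})
wire = singer.format_message(state_message)
# wire is a JSON string such as '{"type": "STATE", "value": {"seq": 1}}'
assert singer.parse_message(wire) == state_message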
Code Example #2
def sync(config, state, catalog):
    for catalog_entry in catalog.streams:
        catalog_metadata = metadata.to_map(catalog_entry.metadata)
        replication_key = catalog_metadata.get((), {}).get('replication-key')

        state = singer.set_currently_syncing(state,
                                             catalog_entry.tap_stream_id)
        _emit(singer.StateMessage(value=state))
        if catalog_entry.is_view:
            key_properties = catalog_metadata.get((), {}).get(
                'view-key-properties', [])
        else:
            key_properties = catalog_metadata.get((), {}).get(
                'table-key-properties', [])

        _emit(
            singer.SchemaMessage(stream=catalog_entry.stream,
                                 schema=catalog_entry.schema.to_dict(),
                                 key_properties=key_properties,
                                 bookmark_properties=replication_key))
        with metrics.job_timer("sync_table") as timer:
            timer.tags["schema"] = catalog_entry.database
            timer.tags["table"] = catalog_entry.table
            _sync_table(config, state, catalog_entry)
    state = singer.set_currently_syncing(state, None)
    _emit(singer.StateMessage(value=state))
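The catalog_metadata.get((), {}) lookups above read stream-level metadata: metadata.to_map keys each entry by its breadcrumb tuple, and the empty tuple () addresses the stream itself rather than any individual property. A minimal sketch, using a hypothetical metadata list:

from singer import metadata

raw = [{'breadcrumb': [], 'metadata': {'replication-key': 'updated_at',
                                       'table-key-properties': ['id']}}]
md_map = metadata.to_map(raw)
assert md_map.get((), {}).get('replication-key') == 'updated_at'
assert metadata.get(md_map, (), 'table-key-properties') == ['id']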
Code Example #3
def generate_messages(conn, db_schema, catalog, state):
    catalog = resolve.resolve_catalog(discover_catalog(conn, db_schema),
                                      catalog, state)

    for catalog_entry in catalog.streams:
        state = singer.set_currently_syncing(state,
                                             catalog_entry.tap_stream_id)
        catalog_md = metadata.to_map(catalog_entry.metadata)

        if catalog_md.get((), {}).get('is-view'):
            key_properties = catalog_md.get((), {}).get('view-key-properties')
        else:
            key_properties = catalog_md.get((), {}).get('table-key-properties')
        bookmark_properties = catalog_md.get((), {}).get('replication-key')

        # Emit a state message to indicate that we've started this stream
        yield singer.StateMessage(value=copy.deepcopy(state))

        # Emit a SCHEMA message before we sync any records
        yield singer.SchemaMessage(stream=catalog_entry.stream,
                                   schema=catalog_entry.schema.to_dict(),
                                   key_properties=key_properties,
                                   bookmark_properties=bookmark_properties)

        # Emit a RECORD message for each record in the result set
        with metrics.job_timer('sync_table') as timer:
            timer.tags['database'] = catalog_entry.database
            timer.tags['table'] = catalog_entry.table
            for message in sync_table(conn, catalog_entry, state):
                yield message

    # If we get here, we've finished processing all the streams, so clear
    # currently_syncing from the state and emit a state message.
    state = singer.set_currently_syncing(state, None)
    yield singer.StateMessage(value=copy.deepcopy(state))
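A tap's entry point typically just drains a generator like this and serializes every message to stdout; a minimal sketch, assuming the generate_messages defined above (the do_sync name is illustrative):

def do_sync(conn, db_schema, catalog, state):
    # Emit each SCHEMA/RECORD/STATE message as a line of JSON on stdout
    for message in generate_messages(conn, db_schema, catalog, state):
        singer.write_message(message)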
Code Example #4
# Note: the mutable default ([]) is a Python pitfall; Code Example #7
# shows the conventional None-default rewrite of this function.
def write_schema_message(catalog_entry, bookmark_properties=[]):
    key_properties = common.get_key_properties(catalog_entry)

    singer.write_message(
        singer.SchemaMessage(stream=catalog_entry.stream,
                             schema=catalog_entry.schema.to_dict(),
                             key_properties=key_properties,
                             bookmark_properties=bookmark_properties))
Code Example #5
File: oplog.py Project: psschroeter/tap-mongodb
def write_schema(schema, row, stream):
    schema_build_start_time = time.time()
    if common.row_to_schema(schema, row):
        singer.write_message(singer.SchemaMessage(
            stream=common.calculate_destination_stream_name(stream),
            schema=schema,
            key_properties=['_id']))
        common.SCHEMA_COUNT[stream['tap_stream_id']] += 1
    common.SCHEMA_TIMES[stream['tap_stream_id']] += time.time() - schema_build_start_time
Code Example #6
    def test_parse_message_schema_good(self):
        message = singer.parse_message('{"type": "SCHEMA", "stream": "users", "schema": {"type": "object", "properties": {"name": {"type": "string"}}}, "key_properties": ["name"]}')  # nopep8
        self.assertEqual(
            message,
            singer.SchemaMessage(
                stream='users',
                key_properties=['name'],
                schema={'type': 'object',
                        'properties': {
                            'name': {'type': 'string'}}}))
Code Example #7
def write_schema_message(catalog_entry, bookmark_properties=None):
    if bookmark_properties is None:
        bookmark_properties = []

    key_properties = get_key_properties(catalog_entry)

    singer.write_message(
        singer.SchemaMessage(stream=catalog_entry.stream,
                             schema=catalog_entry.schema.to_dict(),
                             key_properties=key_properties,
                             bookmark_properties=bookmark_properties))
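Code Examples #4 and #7 differ only in the default for bookmark_properties; the None default in #7 sidesteps Python's shared-mutable-default pitfall. A minimal self-contained illustration:

def bad(items=[]):      # one list, created at def time, shared by every call
    items.append(1)
    return items

def good(items=None):   # a fresh list on each call
    if items is None:
        items = []
    items.append(1)
    return items

assert bad() == [1] and bad() == [1, 1]    # state leaks across calls
assert good() == [1] and good() == [1]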
Code Example #8
File: common.py Project: kchan-varicent/tap-oracle
def send_schema_message(stream, bookmark_properties):
    s_md = metadata.to_map(stream.metadata)
    if s_md.get((), {}).get('is-view'):
        key_properties = s_md.get((), {}).get('view-key-properties')
    else:
        key_properties = s_md.get((), {}).get('table-key-properties')

    schema_message = singer.SchemaMessage(stream=(stream.tap_stream_id or stream.stream),
                                          schema=stream.schema.to_dict(),
                                          key_properties=key_properties,
                                          bookmark_properties=bookmark_properties)
    singer.write_message(schema_message)
Code Example #9
def sync_stream(config, state, stream):
    table_name = stream['tap_stream_id']

    md_map = metadata.to_map(stream['metadata'])
    replication_method = metadata.get(md_map, (), 'replication-method')
    key_properties = metadata.get(md_map, (), 'table-key-properties')

    # write state message with currently_syncing bookmark
    state = clear_state_on_replication_change(stream, state)
    state = singer.set_currently_syncing(state, table_name)
    singer.write_state(state)

    singer.write_message(
        singer.SchemaMessage(stream=table_name,
                             schema=stream['schema'],
                             key_properties=key_properties))

    rows_saved = 0
    if replication_method == 'FULL_TABLE':
        LOGGER.info("Syncing full table for stream: %s", table_name)
        rows_saved += sync_full_table(config, state, stream)
    elif replication_method == 'LOG_BASED':
        LOGGER.info("Syncing log based for stream: %s", table_name)

        if has_stream_aged_out(config, state, stream):
            LOGGER.info("Clearing state because stream has aged out")
            state.get('bookmarks', {}).pop(table_name)

        # TODO Check to see if latest stream ARN has changed and wipe state if so

        if not singer.get_bookmark(state, table_name,
                                   'initial_full_table_complete'):
            msg = 'Must complete full table sync before replicating from dynamodb streams for %s'
            LOGGER.info(msg, table_name)

            # Only mark the latest sequence numbers in the Dynamo streams on the
            # first sync, so the tap has a starting point after the full table sync
            if not singer.get_bookmark(state, table_name, 'version'):
                latest_sequence_numbers = get_latest_seq_numbers(
                    config, stream)
                state = singer.write_bookmark(state, table_name,
                                              'shard_seq_numbers',
                                              latest_sequence_numbers)

            rows_saved += sync_full_table(config, state, stream)

        rows_saved += sync_log_based(config, state, stream)
    else:
        LOGGER.info('Unknown replication method: %s for stream: %s',
                    replication_method, table_name)

    return rows_saved
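The bookmark helpers used above are thin wrappers around a nested dict inside state; a minimal sketch of their behavior (hypothetical stream name):

import singer

state = {}
state = singer.write_bookmark(state, 'my_table', 'version', 42)
assert state == {'bookmarks': {'my_table': {'version': 42}}}
assert singer.get_bookmark(state, 'my_table', 'version') == 42
assert singer.get_bookmark(state, 'my_table', 'missing') is None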
Code Example #10
File: sync.py Project: onemedical/tap-dynamodb
def sync_stream(config, state, stream):
    table_name = stream['tap_stream_id']

    md_map = metadata.to_map(stream['metadata'])
    replication_method = metadata.get(md_map, (), 'replication-method')
    key_properties = metadata.get(md_map, (), 'table-key-properties')

    # write state message with currently_syncing bookmark
    state = clear_state_on_replication_change(stream, state)
    state = singer.set_currently_syncing(state, table_name)
    singer.write_state(state)

    singer.write_message(
        singer.SchemaMessage(stream=table_name,
                             schema=stream['schema'],
                             key_properties=key_properties))

    rows_saved = 0
    if replication_method == 'FULL_TABLE':
        LOGGER.info("Syncing full table for stream: %s", table_name)
        rows_saved += full_table.sync(config, state, stream)
    elif replication_method == "QUERY":
        LOGGER.info(f"Syncing via a query for stream {table_name}")
        rows_saved += sync_query(config, state, stream)
    elif replication_method == 'LOG_BASED':
        LOGGER.info("Syncing log based for stream: %s", table_name)

        if log_based.has_stream_aged_out(state, table_name):
            LOGGER.info("Clearing state because stream has aged out")
            state.get('bookmarks', {}).pop(table_name)

        if not singer.get_bookmark(state, table_name,
                                   'initial_full_table_complete'):
            msg = 'Must complete full table sync before replicating from dynamodb streams for %s'
            LOGGER.info(msg, table_name)

            state = log_based.get_initial_bookmarks(config, state, table_name)
            singer.write_state(state)

            rows_saved += full_table.sync(config, state, stream)

        rows_saved += log_based.sync(config, state, stream)
    else:
        LOGGER.info('Unknown replication method: %s for stream: %s',
                    replication_method, table_name)

    state = singer.write_bookmark(state, table_name, 'success_timestamp',
                                  singer.utils.strftime(singer.utils.now()))
    singer.write_state(state)

    return rows_saved
Code Example #11
File: __init__.py Project: uhlenbrock/tap-shippo
def sync_endpoint(url, state):
    '''Syncs the url and paginates through until there are no more "next"
    urls. Yields schema, record, and state messages. Modifies state by
    setting the NEXT field every time we get a next url from Shippo. This
    allows us to resume paginating if we're terminated.

    '''
    stream = parse_stream_from_url(url)
    yield singer.SchemaMessage(stream=stream,
                               schema=load_schema(stream),
                               key_properties=["object_id"])

    if LAST_START_DATE in state:
        start = pendulum.parse(state[LAST_START_DATE]).subtract(days=2)
    else:
        start = pendulum.parse(CONFIG[START_DATE])
    # The Shippo API does not return data from long ago, so we only try to
    # replicate the last 60 days
    sixty_days_ago = pendulum.now().subtract(days=60)
    bounded_start = max(start, sixty_days_ago)
    LOGGER.info("Replicating all %s from %s", stream, bounded_start)

    rows_read = 0
    rows_written = 0
    finished = False
    with metrics.record_counter(parse_stream_from_url(url)) as counter:
        while url and not finished:
            state[NEXT] = url
            yield singer.StateMessage(value=state)

            data = request(url)

            for row in data['results']:
                counter.increment()
                rows_read += 1
                updated = pendulum.parse(row[OBJECT_UPDATED])
                if updated >= bounded_start:
                    row = fix_extra_map(row)
                    yield singer.RecordMessage(stream=stream, record=row)
                    rows_written += 1
                else:
                    finished = True
                    break

            url = data.get(NEXT)

    if rows_read:
        LOGGER.info("Done syncing %s. Read %d records, wrote %d (%.2f%%)",
                    stream, rows_read, rows_written,
                    100.0 * rows_written / float(rows_read))
Code Example #12
File: __init__.py Project: lanroth/tap-mysql
def generate_messages(con, catalog, state):
    catalog = resolve_catalog(con, catalog, state)

    for catalog_entry in catalog.streams:
        state = singer.set_currently_syncing(state, catalog_entry.tap_stream_id)

        # Emit a state message to indicate that we've started this stream
        yield singer.StateMessage(value=copy.deepcopy(state))

        md_map = metadata.to_map(catalog_entry.metadata)

        replication_method = md_map.get((), {}).get('replication-method')
        replication_key = singer.get_bookmark(state,
                                              catalog_entry.tap_stream_id,
                                              'replication_key')

        if catalog_entry.is_view:
            key_properties = md_map.get((), {}).get('view-key-properties')
        else:
            key_properties = md_map.get((), {}).get('table-key-properties')

        # Emit a SCHEMA message before we sync any records
        yield singer.SchemaMessage(
            stream=catalog_entry.stream,
            schema=catalog_entry.schema.to_dict(),
            key_properties=key_properties,
            bookmark_properties=replication_key
        )

        with metrics.job_timer('sync_table') as timer:
            timer.tags['database'] = catalog_entry.database
            timer.tags['table'] = catalog_entry.table

            log_engine(con, catalog_entry)

            if replication_method == 'INCREMENTAL':
                for message in incremental.sync_table(con, catalog_entry, state):
                    yield message
            elif replication_method == 'FULL_TABLE':
                for message in full_table.sync_table(con, catalog_entry, state):
                    yield message
            else:
                raise Exception("only INCREMENTAL and FULL TABLE replication methods are supported")

    # if we get here, we've finished processing all the streams, so clear
    # currently_syncing from the state and emit a state message.
    state = singer.set_currently_syncing(state, None)
    yield singer.StateMessage(value=copy.deepcopy(state))
Code Example #13
File: __init__.py Project: flash716/tap-quickbase
def sync_table(conn, catalog_entry, state):
    LOGGER.info("Beginning sync for {}.{} table.".format(
        catalog_entry.database, catalog_entry.table
    ))

    entity = catalog_entry.tap_stream_id
    if not entity:
        return

    # tell singer about the structure of this schema
    yield singer.SchemaMessage(
        stream=entity,
        schema=catalog_entry.schema.to_dict(),
        key_properties=catalog_entry.key_properties
    )

    start = get_start(entity)
    formatted_start = dateutil.parser.parse(start).strftime(DATETIME_FMT)
    params = {
        'start': formatted_start,
    }

    with metrics.record_counter(None) as counter:
        counter.tags['database'] = catalog_entry.database
        counter.tags['table'] = catalog_entry.table

        for rows_saved, row in enumerate(gen_request(conn, catalog_entry, params)):
            counter.increment()
            transform_data(row, catalog_entry.schema)
            yield singer.RecordMessage(
                stream=catalog_entry.stream,
                record=row
            )
            state = singer.write_bookmark(
                state,
                catalog_entry.tap_stream_id,
                'last_record',
                row['datemodified']
            )
            if rows_saved % 1000 == 0:
                yield singer.StateMessage(value=copy.deepcopy(state))

    yield singer.StateMessage(value=copy.deepcopy(state))
Code Example #14
File: __init__.py Project: flash716/tap-quickbase
def generate_messages(conn, catalog, state):
    for catalog_entry in catalog.streams:

        # Emit a state message to indicate that we've started this stream
        yield singer.StateMessage(value=copy.deepcopy(state))

        # Emit a SCHEMA message before we sync any records
        yield singer.SchemaMessage(
            stream=catalog_entry.stream,
            schema=catalog_entry.schema.to_dict(),
            key_properties=catalog_entry.key_properties
        )

        # Emit a RECORD message for each record in the result set
        with metrics.job_timer('sync_table') as timer:
            timer.tags['database'] = catalog_entry.database
            timer.tags['table'] = catalog_entry.table
            for message in sync_table(conn, catalog_entry, state):
                yield message

        # Emit a state message
        yield singer.StateMessage(value=copy.deepcopy(state))
Code Example #15
def overloaded_parse_message(msg):
    """Parse a message string into a Message object."""

    # Parse numbers as Decimals (use_decimal=True) so that numeric
    # values are not exposed to lossy float conversions on the way in.
    obj = simplejson.loads(msg, use_decimal=True)
    msg_type = _required_key(obj, 'type')

    if msg_type == 'RECORD':
        time_extracted = obj.get('time_extracted')
        if time_extracted:
            try:
                time_extracted = ciso8601.parse_datetime(time_extracted)
            except Exception:
                time_extracted = None
        return singer.RecordMessage(stream=_required_key(obj, 'stream'),
                                    record=_required_key(obj, 'record'),
                                    version=obj.get('version'),
                                    time_extracted=time_extracted)

    if msg_type == 'SCHEMA':
        return singer.SchemaMessage(
            stream=_required_key(obj, 'stream'),
            schema=_required_key(obj, 'schema'),
            key_properties=_required_key(obj, 'key_properties'),
            bookmark_properties=obj.get('bookmark_properties'))

    if msg_type == 'STATE':
        return singer.StateMessage(value=_required_key(obj, 'value'))

    if msg_type == 'ACTIVATE_VERSION':
        return singer.ActivateVersionMessage(
            stream=_required_key(obj, 'stream'),
            version=_required_key(obj, 'version'))
    return None
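A usage sketch for the parser above, fed a SCHEMA line in the same shape as Code Example #6:

line = ('{"type": "SCHEMA", "stream": "users", '
        '"schema": {"type": "object", "properties": {"name": {"type": "string"}}}, '
        '"key_properties": ["name"]}')
message = overloaded_parse_message(line)
assert isinstance(message, singer.SchemaMessage)
assert message.key_properties == ['name']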
Code Example #16
File: __init__.py Project: flash716/tap-quickbase-1
def generate_messages(conn, catalog, state):
    for catalog_entry in catalog.streams:

        if not catalog_entry.is_selected():
            continue

        # Emit a SCHEMA message before we sync any records
        yield singer.SchemaMessage(
            stream=catalog_entry.stream,
            schema=catalog_entry.schema.to_dict(),
            key_properties=catalog_entry.key_properties,
            bookmark_properties=[REPLICATION_KEY]
        )

        # Emit a RECORD message for each record in the result set
        with metrics.job_timer('sync_table') as timer:
            timer.tags['database'] = catalog_entry.database
            timer.tags['table'] = catalog_entry.table
            for message in sync_table(conn, catalog_entry, state):
                yield message

        # Emit a state message
        yield singer.StateMessage(value=copy.deepcopy(state))
Code Example #17
def generate_schema_message(catalog_entry, key_properties,
                            bookmark_properties):
    return singer.SchemaMessage(stream=catalog_entry.stream,
                                schema=catalog_entry.schema.to_dict(),
                                key_properties=key_properties,
                                bookmark_properties=bookmark_properties)
Code Example #18
File: full_table.py Project: psschroeter/tap-mongodb
def sync_collection(client, stream, state, projection):
    tap_stream_id = stream['tap_stream_id']
    LOGGER.info('Starting full table sync for %s', tap_stream_id)

    md_map = metadata.to_map(stream['metadata'])
    database_name = metadata.get(md_map, (), 'database-name')

    db = client[database_name]
    collection = db[stream['stream']]

    # Before writing the table version to state, check whether we had one to begin with
    first_run = singer.get_bookmark(state, stream['tap_stream_id'],
                                    'version') is None

    # last run was interrupted if there is a last_id_fetched bookmark
    was_interrupted = singer.get_bookmark(state, stream['tap_stream_id'],
                                          'last_id_fetched') is not None

    # Pick a new table version if the last run wasn't interrupted
    if was_interrupted:
        stream_version = singer.get_bookmark(state, stream['tap_stream_id'],
                                             'version')
    else:
        stream_version = int(time.time() * 1000)

    state = singer.write_bookmark(state, stream['tap_stream_id'], 'version',
                                  stream_version)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    activate_version_message = singer.ActivateVersionMessage(
        stream=common.calculate_destination_stream_name(stream),
        version=stream_version)

    # For the initial replication, emit an ACTIVATE_VERSION message
    # at the beginning so the records show up right away.
    if first_run:
        singer.write_message(activate_version_message)

    if singer.get_bookmark(state, stream['tap_stream_id'], 'max_id_value'):
        # There is a bookmark
        max_id_value = singer.get_bookmark(state, stream['tap_stream_id'],
                                           'max_id_value')
        max_id_type = singer.get_bookmark(state, stream['tap_stream_id'],
                                          'max_id_type')
        max_id_value = common.string_to_class(max_id_value, max_id_type)
    else:
        max_id_value = get_max_id_value(collection)

    last_id_fetched = singer.get_bookmark(state, stream['tap_stream_id'],
                                          'last_id_fetched')

    if max_id_value:
        # Write the bookmark if max_id_value is defined
        state = singer.write_bookmark(
            state, stream['tap_stream_id'], 'max_id_value',
            common.class_to_string(max_id_value,
                                   max_id_value.__class__.__name__))
        state = singer.write_bookmark(state, stream['tap_stream_id'],
                                      'max_id_type',
                                      max_id_value.__class__.__name__)

    find_filter = {'$lte': max_id_value}
    if last_id_fetched:
        last_id_fetched_type = singer.get_bookmark(state,
                                                   stream['tap_stream_id'],
                                                   'last_id_fetched_type')
        find_filter['$gte'] = common.string_to_class(last_id_fetched,
                                                     last_id_fetched_type)

    query_message = 'Querying {} with:\n\tFind Parameters: {}'.format(
        stream['tap_stream_id'], find_filter)
    if projection:
        query_message += '\n\tProjection: {}'.format(projection)
    # pylint: disable=logging-format-interpolation
    LOGGER.info(query_message)

    with collection.find({'_id': find_filter},
                         projection,
                         sort=[("_id", pymongo.ASCENDING)]) as cursor:
        rows_saved = 0
        time_extracted = utils.now()
        start_time = time.time()

        schema = stream['schema'] or {"type": "object", "properties": {}}
        for row in cursor:
            rows_saved += 1

            schema_build_start_time = time.time()
            if common.row_to_schema(schema, row):
                singer.write_message(
                    singer.SchemaMessage(
                        stream=common.calculate_destination_stream_name(
                            stream),
                        schema=schema,
                        key_properties=['_id']))
                common.SCHEMA_COUNT[stream['tap_stream_id']] += 1
            common.SCHEMA_TIMES[stream['tap_stream_id']] += (
                time.time() - schema_build_start_time)

            record_message = common.row_to_singer_record(
                stream, row, stream_version, time_extracted)

            singer.write_message(record_message)

            state = singer.write_bookmark(
                state, stream['tap_stream_id'], 'last_id_fetched',
                common.class_to_string(row['_id'],
                                       row['_id'].__class__.__name__))
            state = singer.write_bookmark(state, stream['tap_stream_id'],
                                          'last_id_fetched_type',
                                          row['_id'].__class__.__name__)

            if rows_saved % common.UPDATE_BOOKMARK_PERIOD == 0:
                singer.write_message(
                    singer.StateMessage(value=copy.deepcopy(state)))

        common.COUNTS[tap_stream_id] += rows_saved
        common.TIMES[tap_stream_id] += time.time() - start_time

    # clear max pk value and last pk fetched upon successful sync
    singer.clear_bookmark(state, stream['tap_stream_id'], 'max_id_value')
    singer.clear_bookmark(state, stream['tap_stream_id'], 'max_id_type')
    singer.clear_bookmark(state, stream['tap_stream_id'], 'last_id_fetched')
    singer.clear_bookmark(state, stream['tap_stream_id'],
                          'last_id_fetched_type')

    state = singer.write_bookmark(state, stream['tap_stream_id'],
                                  'initial_full_table_complete', True)

    singer.write_message(activate_version_message)

    LOGGER.info('Synced %s records for %s', rows_saved, tap_stream_id)
Code Example #19
File: __init__.py Project: flopotok/tap-mongodb
def write_schema_message(stream):
    singer.write_message(
        singer.SchemaMessage(
            stream=common.calculate_destination_stream_name(stream),
            schema=stream['schema'],
            key_properties=['_id']))
Code Example #20
def sync_collection(client, stream, state, projection):
    tap_stream_id = stream['tap_stream_id']
    LOGGER.info('Starting incremental sync for %s', tap_stream_id)

    stream_metadata = metadata.to_map(stream['metadata']).get(())
    collection = client[stream_metadata['database-name']][stream['stream']]

    # Before writing the table version to state, check whether we had one to begin with
    first_run = singer.get_bookmark(state, stream['tap_stream_id'], 'version') is None

    # Pick a new table version on the first run; otherwise reuse the bookmarked one
    if first_run:
        stream_version = int(time.time() * 1000)
    else:
        stream_version = singer.get_bookmark(state, stream['tap_stream_id'], 'version')

    state = singer.write_bookmark(state,
                                  stream['tap_stream_id'],
                                  'version',
                                  stream_version)

    activate_version_message = singer.ActivateVersionMessage(
        stream=common.calculate_destination_stream_name(stream),
        version=stream_version
    )


    # For the initial replication, emit an ACTIVATE_VERSION message
    # at the beginning so the records show up right away.
    if first_run:
        singer.write_message(activate_version_message)

    # get replication key, and bookmarked value/type
    stream_state = state.get('bookmarks', {}).get(tap_stream_id, {})

    replication_key_name = stream_metadata.get('replication-key')
    replication_key_value_bookmark = stream_state.get('replication_key_value')

    # write state message
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    # create query
    find_filter = {}
    if replication_key_value_bookmark:
        find_filter[replication_key_name] = {}
        find_filter[replication_key_name]['$gte'] = \
            common.string_to_class(replication_key_value_bookmark,
                                   stream_state.get('replication_key_type'))

    # log query
    query_message = 'Querying {} with:\n\tFind Parameters: {}'.format(tap_stream_id, find_filter)
    if projection:
        query_message += '\n\tProjection: {}'.format(projection)
    LOGGER.info(query_message)


    # query collection
    schema = {"type": "object", "properties": {}}
    with collection.find(find_filter,
                         projection,
                         sort=[(replication_key_name, pymongo.ASCENDING)]) as cursor:
        rows_saved = 0
        time_extracted = utils.now()
        start_time = time.time()

        for row in cursor:
            schema_build_start_time = time.time()
            if common.row_to_schema(schema, row):
                singer.write_message(singer.SchemaMessage(
                    stream=common.calculate_destination_stream_name(stream),
                    schema=schema,
                    key_properties=['_id']))
                common.SCHEMA_COUNT[tap_stream_id] += 1
            common.SCHEMA_TIMES[tap_stream_id] += time.time() - schema_build_start_time


            record_message = common.row_to_singer_record(stream,
                                                         row,
                                                         stream_version,
                                                         time_extracted)

            # gen_schema = common.row_to_schema_message(schema, record_message.record, row)
            # if DeepDiff(schema, gen_schema, ignore_order=True) != {}:
            #   emit gen_schema
            #   schema = gen_schema
            singer.write_message(record_message)
            rows_saved += 1

            update_bookmark(row, state, tap_stream_id, replication_key_name)

            if rows_saved % common.UPDATE_BOOKMARK_PERIOD == 0:
                singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))


        common.COUNTS[tap_stream_id] += rows_saved
        common.TIMES[tap_stream_id] += time.time()-start_time

    singer.write_message(activate_version_message)

    LOGGER.info('Synced %s records for %s', rows_saved, tap_stream_id)
Code Example #21
def sync_endpoint(initial_url, state):
    '''Syncs the url and paginates through until there are no more "next"
    urls. Yields schema, record, and state messages. Modifies state by
    setting the NEXT field every time we get a next url from Shippo. This
    allows us to resume paginating if we're terminated.

    '''
    stream = parse_stream_from_url(initial_url)
    yield singer.SchemaMessage(stream=stream,
                               schema=load_schema(stream),
                               key_properties=["object_id"])

    # The Shippo API does not return data from long ago, so we only try to
    # replicate the last 60 days
    # Some streams allow us to page by date, so we can request historical data for them
    sliding_window_key = SLIDING_WINDOW_STREAMS.get(stream)
    if sliding_window_key:
        bounded_start = get_start(state)
        sliding_query_start = bounded_start
        sliding_query_end = bounded_start.add(days=SLIDING_WINDOW_DAYS)
        # %H (24-hour clock) keeps the formatted timestamps valid ISO 8601
        url = initial_url.format(
            sliding_window_key,
            sliding_query_start.strftime("%Y-%m-%dT%H:%M:%SZ"),
            sliding_query_end.strftime("%Y-%m-%dT%H:%M:%SZ"))
    else:
        bounded_start = max(get_start(state), pendulum.now().subtract(days=60))
        url = initial_url
    LOGGER.info("Replicating all %s from %s", stream, bounded_start)

    rows_read = 0
    rows_written = 0

    with metrics.record_counter(parse_stream_from_url(url)) as counter:
        endpoint_start = pendulum.now()
        while url:
            state[NEXT] = url
            yield singer.StateMessage(value=state)

            data = request(url)

            for row in data['results']:
                counter.increment()
                rows_read += 1
                updated = pendulum.parse(row[OBJECT_UPDATED])
                if updated >= bounded_start:
                    row = fix_extra_map(row)
                    yield singer.RecordMessage(stream=stream, record=row)
                    rows_written += 1

            if data.get(NEXT):
                url = data.get(NEXT)
            elif sliding_window_key and sliding_query_end < endpoint_start:
                sliding_query_start = sliding_query_end
                sliding_query_end = sliding_query_start.add(
                    days=SLIDING_WINDOW_DAYS)
                url = initial_url.format(
                    sliding_window_key,
                    sliding_query_start.strftime("%Y-%m-%dT%H:%M:%SZ"),
                    sliding_query_end.strftime("%Y-%m-%dT%H:%M:%SZ"))
            else:
                url = None

    if rows_read:
        LOGGER.info("Done syncing %s. Read %d records, wrote %d (%.2f%%)",
                    stream, rows_read, rows_written,
                    100.0 * rows_written / float(rows_read))
Code Example #22
File: __init__.py Project: checkr/tap-mongodb
def write_schema_message(stream):
    singer.write_message(
        singer.SchemaMessage(stream=stream['tap_stream_id'],
                             schema=stream['schema'],
                             key_properties=['_id']))
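Taken together, the examples follow one contract: emit a stream's SCHEMA message before any of its RECORD messages, with STATE messages interleaved as checkpoints. A minimal end-to-end sketch:

import singer

singer.write_message(singer.SchemaMessage(
    stream='users',
    schema={'type': 'object',
            'properties': {'id': {'type': 'string'}}},
    key_properties=['id']))
singer.write_message(singer.RecordMessage(stream='users', record={'id': '1'}))
singer.write_message(singer.StateMessage(value={'users': '1'}))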