Example #1
def send_schema_message(stream, bookmark_properties):
    s_md = metadata.to_map(stream.metadata)
    if s_md.get((), {}).get('is-view'):
        key_properties = s_md.get((), {}).get('view-key-properties')
    else:
        key_properties = s_md.get((), {}).get('table-key-properties')

    schema_message = singer.SchemaMessage(stream=(stream.tap_stream_id or stream.stream),
                                          schema=stream.schema.to_dict(),
                                          key_properties=key_properties,
                                          bookmark_properties=bookmark_properties)
    singer.write_message(schema_message)
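
For context, here is a minimal sketch (assuming the singer-python metadata helpers) of the breadcrumb map that s_md.get((), {}) reads from; the raw metadata below is hypothetical, and stream-level entries such as 'is-view' live under the empty-tuple key:

from singer import metadata

# Hypothetical catalog metadata in list form; to_map() re-keys it by breadcrumb
# tuple, so stream-level properties sit under the () key.
raw_metadata = [
    {"breadcrumb": [], "metadata": {"is-view": True, "view-key-properties": ["id"]}},
    {"breadcrumb": ["properties", "id"], "metadata": {"inclusion": "automatic"}},
]
md_map = metadata.to_map(raw_metadata)
print(md_map.get((), {}).get("view-key-properties"))  # ['id']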
Example #2
def resume_syncing_bulk_query(sf, catalog_entry, job_id, state, counter):
    bulk = Bulk(sf)
    current_bookmark = singer.get_bookmark(
        state, catalog_entry['tap_stream_id'],
        'JobHighestBookmarkSeen') or sf.get_start_date(state, catalog_entry)
    current_bookmark = singer_utils.strptime_with_tz(current_bookmark)
    batch_ids = singer.get_bookmark(state, catalog_entry['tap_stream_id'],
                                    'BatchIDs')

    start_time = singer_utils.now()
    stream = catalog_entry['stream']
    stream_alias = catalog_entry.get('stream_alias')
    catalog_metadata = metadata.to_map(catalog_entry.get('metadata'))
    replication_key = catalog_metadata.get((), {}).get('replication-key')
    stream_version = get_stream_version(catalog_entry, state)
    schema = catalog_entry['schema']

    if not bulk.job_exists(job_id):
        LOGGER.info(
            "Found stored Job ID that no longer exists, resetting bookmark and removing JobID from state."
        )
        return counter

    # Iterate over the remaining batches, removing them once they are synced
    for batch_id in batch_ids[:]:
        with Transformer(pre_hook=transform_bulk_data_hook) as transformer:
            for rec in bulk.get_batch_results(job_id, batch_id, catalog_entry):
                counter.increment()
                rec = transformer.transform(rec, schema)
                rec = fix_record_anytype(rec, schema)
                singer.write_message(
                    singer.RecordMessage(stream=(stream_alias or stream),
                                         record=rec,
                                         version=stream_version,
                                         time_extracted=start_time))

                # Update bookmark if necessary
                replication_key_value = replication_key and singer_utils.strptime_with_tz(
                    rec[replication_key])
                if replication_key_value and replication_key_value <= start_time and replication_key_value > current_bookmark:
                    current_bookmark = singer_utils.strptime_with_tz(
                        rec[replication_key])

        state = singer.write_bookmark(state, catalog_entry['tap_stream_id'],
                                      'JobHighestBookmarkSeen',
                                      singer_utils.strftime(current_bookmark))
        batch_ids.remove(batch_id)
        LOGGER.info("Finished syncing batch %s. Removing batch from state.",
                    batch_id)
        LOGGER.info("Batches to go: %d", len(batch_ids))
        singer.write_state(state)

    return counter
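
A rough illustration (not taken from the tap itself) of the state shape this resume path expects; singer.get_bookmark(state, tap_stream_id, key) simply reads state['bookmarks'][tap_stream_id][key], and the stream name and batch IDs below are made up:

# Hypothetical Salesforce bulk-query state: JobHighestBookmarkSeen and BatchIDs
# are the two bookmarks read by resume_syncing_bulk_query above.
state = {
    "bookmarks": {
        "Account": {
            "JobHighestBookmarkSeen": "2021-01-01T00:00:00.000000Z",
            "BatchIDs": ["751x000000000A1", "751x000000000A2"],
        }
    }
}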
Example #3
    def sync(self):
        """
        Perform sync action
        These steps are the same for all streams
        Differences between streams are implemented by overriding .do_sync() method
        """
        if not self.KEEP_IDS and not self.include_stream:
            LOGGER.info('Skipping stream %s - excluded in catalog',
                        self.STREAM_NAME)
            return

        new_bookmark_date = self.bookmark_date = self.starting_bookmark_date()
        # Amazon doesn't guarantee that all orders created after the CreatedAfter date you specify will be returned

        # Will be set to false if we stop early due to reaching the end of a batch
        # to tell the runner to continue with the next batch
        all_done = True

        singer.write_schema(self.STREAM_NAME, self.schema, self.key_properties)
        rows = self.request_list()
        self.ids = []
        with singer.metrics.Counter('record_count',
                                    {'endpoint': self.STREAM_NAME}) as counter:
            for row in rows:
                row_as_dict = self.row_to_dict(row)
                if self.KEEP_IDS:
                    self.ids.append(row_as_dict[self.ID_FIELD])
                self.remove_excluded_fields(row_as_dict)
                message = singer.RecordMessage(
                    stream=self.STREAM_NAME,
                    record=row_as_dict,
                    time_extracted=singer.utils.now())
                if self.include_stream:
                    singer.write_message(message)
                if self.BOOKMARK_FIELD:
                    new_bookmark_date = max(new_bookmark_date,
                                            row_as_dict[self.BOOKMARK_FIELD])
                counter.increment()

                # Stop if we've done enough for one batch
                if self.BATCH_SIZE and counter.value >= self.BATCH_SIZE:
                    # Sync action stopped due to end of batch - so probably more rows
                    # Note that there is a 1/BATCH_SIZE chance that the end of a
                    # batch is exactly the end of the whole process. In that case
                    # the runner will make one more .sync request, for one more (empty) batch
                    all_done = False
                    break

        if self.BOOKMARK_FIELD:
            singer.write_bookmark(self.state, self.STREAM_NAME,
                                  self.BOOKMARK_FIELD, new_bookmark_date)

        return all_done
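
A hypothetical runner loop consistent with the comments above: .sync() returns False when it stops early at a batch boundary (and True, or None for a skipped stream, when there is nothing left), so the caller keeps calling it until a batch comes back complete.

# Sketch only; assumes a `stream` object exposing the sync() method shown above.
all_done = False
while not all_done:
    all_done = stream.sync() is not False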
Example #4
def do_sync_incremental(mssql_conn, config, catalog_entry, state, columns):
    mssql_conn = MSSQLConnection(config)
    md_map = metadata.to_map(catalog_entry.metadata)
    stream_version = common.get_stream_version(catalog_entry.tap_stream_id,
                                               state)
    replication_key = md_map.get((), {}).get("replication-key")
    write_schema_message(catalog_entry=catalog_entry,
                         bookmark_properties=[replication_key])
    LOGGER.info("Schema written")
    incremental.sync_table(mssql_conn, config, catalog_entry, state, columns)

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
Example #5
def sync_binlog_stream(mysql_conn: MySQLConnection, config: Dict,
                       binlog_streams_map: Dict[str,
                                                Any], state: Dict) -> None:
    """
    Captures the binlog events created between the position stored in the state and the current master position,
    and creates Singer messages to be flushed to stdout
    Args:
        mysql_conn: mysql connection instance
        config: tap config
        binlog_streams_map: tables to stream using binlog
        state: the current state
    """
    for tap_stream_id in binlog_streams_map:
        common.whitelist_bookmark_keys(BOOKMARK_KEYS, tap_stream_id, state)

    log_file = log_pos = gtid = None

    if config['use_gtid']:
        gtid = calculate_gtid_bookmark(mysql_conn, binlog_streams_map, state,
                                       config['engine'])
    else:
        log_file, log_pos = calculate_bookmark(mysql_conn, binlog_streams_map,
                                               state)

    reader = None

    try:
        reader = create_binlog_stream_reader(config, log_file, log_pos, gtid)

        end_log_file, end_log_pos = fetch_current_log_file_and_pos(mysql_conn)
        LOGGER.info('Current Master binlog file and pos: %s %s', end_log_file,
                    end_log_pos)

        _run_binlog_sync(mysql_conn, reader, binlog_streams_map, state, config,
                         end_log_file, end_log_pos)

    except pymysql.err.OperationalError as ex:
        if ex.args[0] == 1236:
            LOGGER.error(
                'Cannot resume logical replication from the given GTID %s! This GTID might date back to before '
                'the new primary was set up; connect to the old primary and consume all binlog events to get '
                'a newer GTID, then switch back.', gtid)

        raise

    finally:
        # BinLogStreamReader doesn't implement the `with` methods
        # So, try/finally will close the chain from the top
        if reader:
            reader.close()

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
Example #6
def sync_table_file(config: Dict, s3_path: str, table_spec: Dict, stream: Dict) -> int:
    """
    Sync a given CSV file found in S3
    :param config: tap configuration
    :param s3_path: file path given by S3
    :param table_spec: table specs
    :param stream: Stream data
    :return: number of streamed records
    """
    LOGGER.info('Syncing file "%s".', s3_path)

    bucket = config['bucket']
    sync_one_one = config.get('sync_one_one', "True")
    if sync_one_one in (True, "True", "true"):
        sync_one_one = True
    elif sync_one_one in (False, "False", "false"):
        sync_one_one = False
    else:
        raise Exception("Don't understand sync_one_one param in config, must be boolean")
    table_name = table_spec['table_name']
    s3_file_handle, tags = s3.get_file_handle_custom(config, s3_path)
    # We observed data whose field size exceeded the default maximum of
    # 131072. We believe the primary consequence of the following setting
    # is that a malformed, wide CSV would potentially parse into a single
    # large field rather than giving this error, but we also think the
    # chances of that are very small and at any rate the source data would
    # need to be fixed. The other consequence of this could be larger
    # memory consumption but that's acceptable as well.
    csv.field_size_limit(sys.maxsize)
    iterator = get_row_iterator(s3_file_handle._raw_stream, table_spec)  # pylint:disable=protected-access

    records_synced = 0
    for row in iterator:
        if not sync_one_one:
            custom_columns = {
                s3.SDC_SOURCE_BUCKET_COLUMN: bucket,
                s3.SDC_SOURCE_FILE_COLUMN: s3_path,

                # index zero, +1 for header row
                s3.SDC_SOURCE_LINENO_COLUMN: records_synced + 2
            }
            rec = {**row, **custom_columns}
            with Transformer() as transformer:
                to_write = transformer.transform(rec, stream['schema'], metadata.to_map(stream['metadata']))
            write_record(table_name, to_write)
        if sync_one_one:
            write_message(
                OneOneMessage(table_name, row, TagSet=tags, sync_one_one=sync_one_one, _sdc_source_file=s3_path))

        records_synced += 1

    return records_synced
Example #7
def sync_stream(config, state, stream):
    table_name = stream['tap_stream_id']

    md_map = metadata.to_map(stream['metadata'])
    replication_method = metadata.get(md_map, (), 'replication-method')
    key_properties = metadata.get(md_map, (), 'table-key-properties')

    # write state message with currently_syncing bookmark
    state = clear_state_on_replication_change(stream, state)
    state = singer.set_currently_syncing(state, table_name)
    singer.write_state(state)

    singer.write_message(
        singer.SchemaMessage(stream=table_name,
                             schema=stream['schema'],
                             key_properties=key_properties))

    rows_saved = 0
    if replication_method == 'FULL_TABLE':
        LOGGER.info("Syncing full table for stream: %s", table_name)
        rows_saved += sync_full_table(config, state, stream)
    elif replication_method == 'LOG_BASED':
        LOGGER.info("Syncing log based for stream: %s", table_name)

        if has_stream_aged_out(config, state, stream):
            LOGGER.info("Clearing state because stream has aged out")
            state.get('bookmarks', {}).pop(table_name)

        # TODO Check to see if latest stream ARN has changed and wipe state if so

        if not singer.get_bookmark(state, table_name,
                                   'initial_full_table_complete'):
            msg = 'Must complete full table sync before replicating from dynamodb streams for %s'
            LOGGER.info(msg, table_name)

            # only mark latest sequence numbers in dynamo streams on first sync so
            # tap has a starting point after the full table sync
            if not singer.get_bookmark(state, table_name, 'version'):
                latest_sequence_numbers = get_latest_seq_numbers(
                    config, stream)
                state = singer.write_bookmark(state, table_name,
                                              'shard_seq_numbers',
                                              latest_sequence_numbers)

            rows_saved += sync_full_table(config, state, stream)

        rows_saved += sync_log_based(config, state, stream)
    else:
        LOGGER.info('Unknown replication method: %s for stream: %s',
                    replication_method, table_name)

    return rows_saved
Example #8
def sync_view(conn_info, stream, state, desired_columns, md_map):
    time_extracted = utils.now()

    #before writing the table version to state, check if we had one to begin with
    first_run = singer.get_bookmark(state, stream.tap_stream_id,
                                    'version') is None
    nascent_stream_version = int(time.time() * 1000)

    state = singer.write_bookmark(state, stream.tap_stream_id, 'version',
                                  nascent_stream_version)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    schema_name = md_map.get(()).get('schema-name')

    escaped_columns = map(post_db.prepare_columns_sql, desired_columns)

    activate_version_message = singer.ActivateVersionMessage(
        stream=stream.stream, version=nascent_stream_version)

    if first_run:
        singer.write_message(activate_version_message)

    with metrics.record_counter(None) as counter:
        with post_db.open_connection(conn_info) as conn:
            with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
                select_sql = 'SELECT {} FROM {}'.format(
                    ','.join(escaped_columns),
                    post_db.fully_qualified_table_name(schema_name,
                                                       stream.table))

                LOGGER.info("select %s", select_sql)
                cur.execute(select_sql)

                rows_saved = 0
                rec = cur.fetchone()
                while rec is not None:
                    record_message = post_db.selected_row_to_singer_message(
                        stream, rec, nascent_stream_version, desired_columns,
                        time_extracted, md_map)
                    singer.write_message(record_message)
                    rows_saved = rows_saved + 1
                    if rows_saved % UPDATE_BOOKMARK_PERIOD == 0:
                        singer.write_message(
                            singer.StateMessage(value=copy.deepcopy(state)))

                    counter.increment()
                    rec = cur.fetchone()

    #always send the activate version whether first run or subsequent
    singer.write_message(activate_version_message)

    return state
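
For reference, the ACTIVATE_VERSION message written above serializes to a single JSON line per the Singer spec; a minimal sketch with a hypothetical stream name and version value:

import singer

# Emits: {"type": "ACTIVATE_VERSION", "stream": "my_view", "version": 1625097600000}
singer.write_message(
    singer.ActivateVersionMessage(stream="my_view", version=1625097600000))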
Example #9
def sync_traditional_stream(conn_config, stream, state, sync_method, end_lsn):
    LOGGER.info("Beginning sync of stream(%s) with sync method(%s)",
                stream['tap_stream_id'], sync_method)
    md_map = metadata.to_map(stream['metadata'])
    conn_config['dbname'] = md_map.get(()).get('database-name')
    desired_columns = [
        c for c in stream['schema']['properties'].keys()
        if sync_common.should_sync_column(md_map, c)
    ]
    desired_columns.sort()

    if len(desired_columns) == 0:
        LOGGER.warning(
            'There are no columns selected for stream %s, skipping it',
            stream['tap_stream_id'])
        return state

    register_type_adapters(conn_config)

    if sync_method == 'full':
        state = singer.set_currently_syncing(state, stream['tap_stream_id'])
        state = do_sync_full_table(conn_config, stream, state, desired_columns,
                                   md_map)
    elif sync_method == 'incremental':
        state = singer.set_currently_syncing(state, stream['tap_stream_id'])
        state = do_sync_incremental(conn_config, stream, state,
                                    desired_columns, md_map)
    elif sync_method == 'logical_initial':
        state = singer.set_currently_syncing(state, stream['tap_stream_id'])
        LOGGER.info("Performing initial full table sync")
        state = singer.write_bookmark(state, stream['tap_stream_id'], 'lsn',
                                      end_lsn)

        sync_common.send_schema_message(stream, [])
        state = full_table.sync_table(conn_config, stream, state,
                                      desired_columns, md_map)
        state = singer.write_bookmark(state, stream['tap_stream_id'], 'xmin',
                                      None)
    elif sync_method == 'logical_initial_interrupted':
        state = singer.set_currently_syncing(state, stream['tap_stream_id'])
        LOGGER.info(
            "Initial stage of full table sync was interrupted. resuming...")
        sync_common.send_schema_message(stream, [])
        state = full_table.sync_table(conn_config, stream, state,
                                      desired_columns, md_map)
    else:
        raise Exception("unknown sync method {} for stream {}".format(
            sync_method, stream['tap_stream_id']))

    state = singer.set_currently_syncing(state, None)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
    return state
Example #10
def sync_table(mysql_conn, catalog_entry, state, columns):
    common.whitelist_bookmark_keys(BOOKMARK_KEYS, catalog_entry.tap_stream_id, state)

    catalog_metadata = metadata.to_map(catalog_entry.metadata)
    stream_metadata = catalog_metadata.get((), {})

    replication_key_metadata = stream_metadata.get("replication-key")
    replication_key_state = singer.get_bookmark(
        state, catalog_entry.tap_stream_id, "replication_key"
    )

    replication_key_value = None

    if replication_key_metadata == replication_key_state:
        replication_key_value = singer.get_bookmark(
            state, catalog_entry.tap_stream_id, "replication_key_value"
        )
    else:
        state = singer.write_bookmark(
            state, catalog_entry.tap_stream_id, "replication_key", replication_key_metadata
        )
        state = singer.clear_bookmark(state, catalog_entry.tap_stream_id, "replication_key_value")

    stream_version = common.get_stream_version(catalog_entry.tap_stream_id, state)
    state = singer.write_bookmark(state, catalog_entry.tap_stream_id, "version", stream_version)

    activate_version_message = singer.ActivateVersionMessage(
        stream=catalog_entry.stream, version=stream_version
    )

    singer.write_message(activate_version_message)

    with connect_with_backoff(mysql_conn) as open_conn:
        with open_conn.cursor() as cur:
            select_sql = common.generate_select_sql(catalog_entry, columns)
            params = {}

            if replication_key_value is not None:
                if catalog_entry.schema.properties[replication_key_metadata].format == "date-time":
                    replication_key_value = pendulum.parse(replication_key_value)

                select_sql += " WHERE `{}` >= %(replication_key_value)s ORDER BY `{}` ASC".format(
                    replication_key_metadata, replication_key_metadata
                )

                params["replication_key_value"] = replication_key_value
            elif replication_key_metadata is not None:
                select_sql += " ORDER BY `{}` ASC".format(replication_key_metadata)

            common.sync_query(
                cur, catalog_entry, state, select_sql, columns, stream_version, params
            )
Example #11
def sync_view(conn_info, stream, state, desired_columns, md_map):
    time_extracted = utils.now()

    # before writing the table version to state, check if we had one to begin with
    first_run = singer.get_bookmark(state, stream['tap_stream_id'],
                                    'version') is None
    nascent_stream_version = int(time.time() * 1000)

    state = singer.write_bookmark(state, stream['tap_stream_id'], 'version',
                                  nascent_stream_version)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    schema_name = md_map.get(()).get('schema-name')

    escaped_columns = map(post_db.prepare_columns_sql, desired_columns)

    activate_version_message = singer.ActivateVersionMessage(
        stream=post_db.calculate_destination_stream_name(stream, md_map),
        version=nascent_stream_version)

    if first_run:
        singer.write_message(activate_version_message)

    with metrics.record_counter(None) as counter:
        with post_db.open_connection(conn_info) as conn:
            with conn.cursor(cursor_factory=psycopg2.extras.DictCursor,
                             name='stitch_cursor') as cur:
                cur.itersize = post_db.CURSOR_ITER_SIZE
                select_sql = f"SELECT {','.join(escaped_columns)} FROM " \
                             f"{post_db.fully_qualified_table_name(schema_name,stream['table_name'])}"

                LOGGER.info("select %s with itersize %s", select_sql,
                            cur.itersize)
                cur.execute(select_sql)

                rows_saved = 0
                for rec in cur:
                    record_message = post_db.selected_row_to_singer_message(
                        stream, rec, nascent_stream_version, desired_columns,
                        time_extracted, md_map)
                    singer.write_message(record_message)
                    rows_saved += 1
                    if rows_saved % UPDATE_BOOKMARK_PERIOD == 0:
                        singer.write_message(
                            singer.StateMessage(value=copy.deepcopy(state)))

                    counter.increment()

    # always send the activate version whether first run or subsequent
    singer.write_message(activate_version_message)

    return state
Example #12
def write_schema_message(catalog_entry, bookmark_properties=None):
    if bookmark_properties is None:
        bookmark_properties = []

    key_properties = get_key_properties(catalog_entry)

    singer.write_message(
        singer.SchemaMessage(
            stream=catalog_entry.stream,
            schema=catalog_entry.schema.to_dict(),
            key_properties=key_properties,
            bookmark_properties=bookmark_properties,
        ))
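
A minimal sketch of the SCHEMA message this helper emits, using a hypothetical catalog entry; the serialized line carries the stream name, JSON schema, key properties, and bookmark properties:

import singer
from singer.schema import Schema

# Hypothetical stand-in for catalog_entry/get_key_properties(); shows the shape of
# the message written by write_schema_message above.
schema = Schema.from_dict({
    "properties": {"id": {"type": "integer"},
                   "updated_at": {"type": "string", "format": "date-time"}}})
singer.write_message(singer.SchemaMessage(stream="orders",
                                          schema=schema.to_dict(),
                                          key_properties=["id"],
                                          bookmark_properties=["updated_at"]))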
Example #13
def do_sync(state):
    '''Main function for syncing'''
    LOGGER.info("Starting sync")
    urls = get_starting_urls(state)
    LOGGER.info('I will sync urls in this order: %s', urls)
    for url in urls:
        for msg in sync_endpoint(url, state):
            singer.write_message(msg)
    state[NEXT] = None
    state[LAST_START_DATE] = state[THIS_START_DATE]
    state[THIS_START_DATE] = None
    singer.write_state(state)
    LOGGER.info("Sync completed")
Example #14
def sync_stream(config, state, stream):
    table_name = stream['tap_stream_id']

    md_map = metadata.to_map(stream['metadata'])
    replication_method = metadata.get(md_map, (), 'replication-method')
    key_properties = metadata.get(md_map, (), 'table-key-properties')

    # write state message with currently_syncing bookmark
    state = clear_state_on_replication_change(stream, state)
    state = singer.set_currently_syncing(state, table_name)
    singer.write_state(state)

    singer.write_message(
        singer.SchemaMessage(stream=table_name,
                             schema=stream['schema'],
                             key_properties=key_properties))

    rows_saved = 0
    if replication_method == 'FULL_TABLE':
        LOGGER.info("Syncing full table for stream: %s", table_name)
        rows_saved += full_table.sync(config, state, stream)
    elif replication_method == "QUERY":
        LOGGER.info(f"Syncing via a query for stream {table_name}")
        rows_saved += sync_query(config, state, stream)
    elif replication_method == 'LOG_BASED':
        LOGGER.info("Syncing log based for stream: %s", table_name)

        if log_based.has_stream_aged_out(state, table_name):
            LOGGER.info("Clearing state because stream has aged out")
            state.get('bookmarks', {}).pop(table_name)

        if not singer.get_bookmark(state, table_name,
                                   'initial_full_table_complete'):
            msg = 'Must complete full table sync before replicating from dynamodb streams for %s'
            LOGGER.info(msg, table_name)

            state = log_based.get_initial_bookmarks(config, state, table_name)
            singer.write_state(state)

            rows_saved += full_table.sync(config, state, stream)

        rows_saved += log_based.sync(config, state, stream)
    else:
        LOGGER.info('Unknown replication method: %s for stream: %s',
                    replication_method, table_name)

    state = singer.write_bookmark(state, table_name, 'success_timestamp',
                                  singer.utils.strftime(singer.utils.now()))
    singer.write_state(state)

    return rows_saved
Example #15
def sync_table(mysql_conn, catalog_entry, state, columns, stream_version):
    common.whitelist_bookmark_keys(generate_bookmark_keys(catalog_entry),
                                   catalog_entry.tap_stream_id, state)

    bookmark = state.get('bookmarks', {}).get(catalog_entry.tap_stream_id, {})
    version_exists = True if 'version' in bookmark else False

    initial_full_table_complete = singer.get_bookmark(
        state, catalog_entry.tap_stream_id, 'initial_full_table_complete')

    state_version = singer.get_bookmark(state, catalog_entry.tap_stream_id,
                                        'version')

    activate_version_message = singer.ActivateVersionMessage(
        stream=catalog_entry.stream, version=stream_version)

    # For the initial replication, emit an ACTIVATE_VERSION message
    # at the beginning so the records show up right away.
    if not initial_full_table_complete and not (version_exists
                                                and state_version is None):
        singer.write_message(activate_version_message)

    perform_resumable_sync = sync_is_resumable(mysql_conn, catalog_entry)

    pk_clause = ""

    with connect_with_backoff(mysql_conn) as open_conn:
        with open_conn.cursor() as cur:
            select_sql = common.generate_select_sql(catalog_entry, columns)

            if perform_resumable_sync:
                LOGGER.info(
                    "Full table sync is resumable based on primary key definition, will replicate incrementally"
                )

                state = update_incremental_full_table_state(
                    catalog_entry, state, cur)
                pk_clause = generate_pk_clause(catalog_entry, state)

            select_sql += pk_clause
            params = {}

            common.sync_query(cur, catalog_entry, state, select_sql, columns,
                              stream_version, params)

    # clear max pk value and last pk fetched upon successful sync
    singer.clear_bookmark(state, catalog_entry.tap_stream_id, 'max_pk_values')
    singer.clear_bookmark(state, catalog_entry.tap_stream_id,
                          'last_pk_fetched')

    singer.write_message(activate_version_message)
Example #16
def sync_table(conn_info, stream, state, desired_columns, md_map):
    start_lsn = get_bookmark(state, stream.tap_stream_id, 'lsn')
    end_lsn = fetch_current_lsn(conn_info)
    time_extracted = utils.now()

    with post_db.open_connection(conn_info, True) as conn:
        with conn.cursor() as cur:
            LOGGER.info("Starting Logical Replication: %s(%s) -> %s",
                        start_lsn, start_lsn, end_lsn)
            try:
                cur.start_replication(slot_name='stitch',
                                      decode=True,
                                      start_lsn=start_lsn)
            except psycopg2.ProgrammingError:
                raise Exception(
                    "unable to start replication with logical replication slot 'stitch'"
                )

            cur.send_feedback(flush_lsn=start_lsn)
            keepalive_interval = 10.0
            rows_saved = 0
            while True:
                msg = cur.read_message()
                if msg:
                    skip_first_change = singer.get_bookmark(
                        state, stream.tap_stream_id,
                        'initial_logical_replication_complete'
                    ) and rows_saved == 0
                    state = consume_message(stream, state, msg, time_extracted,
                                            md_map, conn_info,
                                            skip_first_change)

                    rows_saved = rows_saved + 1
                    if rows_saved % UPDATE_BOOKMARK_PERIOD == 0:
                        singer.write_message(
                            singer.StateMessage(value=copy.deepcopy(state)))

                else:
                    now = datetime.datetime.now()
                    timeout = keepalive_interval - (
                        now - cur.io_timestamp).total_seconds()
                    try:
                        sel = select([cur], [], [], max(0, timeout))
                        if not any(sel):
                            break
                    except InterruptedError:
                        pass  # recalculate timeout and continue

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
    return state
Example #17
def sync_shard(shard, seq_number_bookmarks, streams_client, stream_arn,
               projection, deserializer, table_name, stream_version, state):
    seq_number = seq_number_bookmarks.get(shard['ShardId'])

    rows_synced = 0

    for record in get_shard_records(streams_client, stream_arn, shard,
                                    seq_number):
        if record['eventName'] == 'REMOVE':
            record_message = deserializer.deserialize_item(
                record['dynamodb']['Keys'])
            record_message[SDC_DELETED_AT] = singer.utils.strftime(
                record['dynamodb']['ApproximateCreationDateTime'])
        else:
            record_message = deserializer.deserialize_item(
                record['dynamodb'].get('NewImage'))
            if record_message is None:
                LOGGER.fatal(
                    'Dynamo stream view type must be either "NEW_IMAGE" or "NEW_AND_OLD_IMAGES"'
                )
                raise RuntimeError(
                    'Dynamo stream view type must be either "NEW_IMAGE" or "NEW_AND_OLD_IMAGES"'
                )
            if projection is not None and projection != '':
                try:
                    record_message = deserializer.apply_projection(
                        record_message, projection)
                except Exception:
                    LOGGER.fatal("Projection failed to apply: %s", projection)
                    raise RuntimeError(
                        'Projection failed to apply: {}'.format(projection))

        record_message = singer.RecordMessage(stream=table_name,
                                              record=record_message,
                                              version=stream_version)
        singer.write_message(record_message)

        rows_synced += 1

        seq_number_bookmarks[
            shard['ShardId']] = record['dynamodb']['SequenceNumber']
        state = singer.write_bookmark(state, table_name, 'shard_seq_numbers',
                                      seq_number_bookmarks)

        # Every 100 rows write the state
        if rows_synced % 100 == 0:
            singer.write_state(state)

    singer.write_state(state)
    return rows_synced
Example #18
    def handle_line(self, line):
        '''Takes a raw line from stdin and transforms it'''
        try:
            message = singer.parse_message(line)

            if not message:
                raise TransformFieldException('Unknown message type')
        except Exception as exc:
            raise TransformFieldException(
                'Failed to process incoming message: {}\n{}'.format(line, exc))

        LOGGER.debug(message)

        # If we got a Schema, set the schema and key properties for this
        # stream. Flush the batch, if there is one, in case the schema is
        # different
        if isinstance(message, singer.SchemaMessage):
            self.flush()

            self.stream_meta[message.stream] = StreamMeta(
                message.schema,
                message.key_properties,
                message.bookmark_properties)

            # Write the transformed message
            singer.write_message(message)

        elif isinstance(message, (singer.RecordMessage, singer.ActivateVersionMessage)):
            if self.messages and (
                    message.stream != self.messages[0].stream or
                    message.version != self.messages[0].version):
                self.flush()
            self.messages.append(message)
            self.buffer_size_bytes += len(line)

            num_bytes = self.buffer_size_bytes
            num_messages = len(self.messages)
            num_seconds = time.time() - self.time_last_batch_sent

            enough_bytes = num_bytes >= self.max_batch_bytes
            enough_messages = num_messages >= self.max_batch_records
            enough_time = num_seconds >= self.batch_delay_seconds
            if enough_bytes or enough_messages or enough_time:
                LOGGER.debug('Flushing %d bytes, %d messages, after %.2f seconds',
                             num_bytes, num_messages, num_seconds)
                self.flush()

        elif isinstance(message, singer.StateMessage):
            self.state = message.value
Example #19
def sync_view(conn_config, stream, state, desired_columns):
    connection = orc_db.open_connection(conn_config)
    connection.outputtypehandler = common.OutputTypeHandler

    cur = connection.cursor()
    cur.execute("ALTER SESSION SET TIME_ZONE = '00:00'")
    cur.execute(
        """ALTER SESSION SET NLS_DATE_FORMAT = 'YYYY-MM-DD"T"HH24:MI:SS."00+00:00"'"""
    )
    cur.execute(
        """ALTER SESSION SET NLS_TIMESTAMP_FORMAT='YYYY-MM-DD"T"HH24:MI:SSXFF"+00:00"'"""
    )
    cur.execute(
        """ALTER SESSION SET NLS_TIMESTAMP_TZ_FORMAT  = 'YYYY-MM-DD"T"HH24:MI:SS.FFTZH:TZM'"""
    )
    time_extracted = utils.now()

    #before writing the table version to state, check if we had one to begin with
    first_run = singer.get_bookmark(state, stream.tap_stream_id,
                                    'version') is None

    #pick a new table version
    nascent_stream_version = int(time.time() * 1000)
    state = singer.write_bookmark(state, stream.tap_stream_id, 'version',
                                  nascent_stream_version)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    # cur = connection.cursor()
    md = metadata.to_map(stream.metadata)
    schema_name = md.get(()).get('schema-name')

    escaped_columns = map(lambda c: common.prepare_columns_sql(stream, c),
                          desired_columns)
    escaped_schema = schema_name
    escaped_table = stream.table
    activate_version_message = singer.ActivateVersionMessage(
        stream=stream.tap_stream_id, version=nascent_stream_version)

    if first_run:
        singer.write_message(activate_version_message)

    with metrics.record_counter(None) as counter:
        select_sql = 'SELECT {} FROM {}.{}'.format(','.join(escaped_columns),
                                                   escaped_schema,
                                                   escaped_table)

        LOGGER.info("select %s", select_sql)
        for row in cur.execute(select_sql):
            record_message = common.row_to_singer_message(
                stream, row, nascent_stream_version, desired_columns,
                time_extracted)
            singer.write_message(record_message)
            counter.increment()

    #always send the activate version whether first run or subsequent
    singer.write_message(activate_version_message)
    cur.close()
    connection.close()
    return state
Example #20
def sync_non_binlog_streams(mssql_conn, non_binlog_catalog, config, state):
    mssql_conn = MSSQLConnection(config)

    for catalog_entry in non_binlog_catalog.streams:
        columns = list(catalog_entry.schema.properties.keys())

        if not columns:
            LOGGER.warning(
                "There are no columns selected for stream %s, skipping it.", catalog_entry.stream
            )
            continue

        state = singer.set_currently_syncing(state, catalog_entry.tap_stream_id)

        # Emit a state message to indicate that we've started this stream
        singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

        md_map = metadata.to_map(catalog_entry.metadata)
        replication_method = md_map.get((), {}).get("replication-method")
        replication_key = md_map.get((), {}).get("replication-key")
        primary_keys = md_map.get((), {}).get("table-key-properties")
        LOGGER.info(f"Table {catalog_entry.table} proposes {replication_method} sync")
        if replication_method == "INCREMENTAL" and not replication_key:
            LOGGER.info(
                f"No replication key for {catalog_entry.table}, using full table replication"
            )
            replication_method = "FULL_TABLE"
        if replication_method == "INCREMENTAL" and not primary_keys:
            LOGGER.info(f"No primary key for {catalog_entry.table}, using full table replication")
            replication_method = "FULL_TABLE"
        LOGGER.info(f"Table {catalog_entry.table} will use {replication_method} sync")

        database_name = common.get_database_name(catalog_entry)

        with metrics.job_timer("sync_table") as timer:
            timer.tags["database"] = database_name
            timer.tags["table"] = catalog_entry.table

            if replication_method == "INCREMENTAL":
                LOGGER.info(f"syncing {catalog_entry.table} incrementally")
                do_sync_incremental(mssql_conn, config, catalog_entry, state, columns)
            elif replication_method == "FULL_TABLE":
                LOGGER.info(f"syncing {catalog_entry.table} full table")
                do_sync_full_table(mssql_conn, config, catalog_entry, state, columns)
            else:
                raise Exception("only INCREMENTAL and FULL TABLE replication methods are supported")

    state = singer.set_currently_syncing(state, None)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
Example #21
def do_sync_incremental(mysql_conn, catalog_entry, state, columns):
    LOGGER.info("Stream %s is using incremental replication", catalog_entry.stream)

    md_map = metadata.to_map(catalog_entry.metadata)
    replication_key = md_map.get((), {}).get('replication-key')

    if not replication_key:
        raise Exception("Cannot use INCREMENTAL replication for table ({}) without a replication key.".format(catalog_entry.stream))

    write_schema_message(catalog_entry=catalog_entry,
                         bookmark_properties=[replication_key])

    incremental.sync_table(mysql_conn, catalog_entry, state, columns)

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
Example #22
def do_sync(conn_config, catalog, default_replication_method, state):
    streams = list(filter(is_selected_via_metadata, catalog.streams))
    streams.sort(key=lambda s: s.tap_stream_id)

    currently_syncing = singer.get_currently_syncing(state)

    if currently_syncing:
        streams = dropwhile(lambda s: s.tap_stream_id != currently_syncing,
                            streams)

    for stream in streams:
        md_map = metadata.to_map(stream.metadata)
        conn_config['dbname'] = md_map.get(()).get('database-name')
        state = singer.set_currently_syncing(state, stream.tap_stream_id)

        desired_columns = [
            c for c in stream.schema.properties.keys()
            if should_sync_column(md_map, c)
        ]
        desired_columns.sort()

        if len(desired_columns) == 0:
            LOGGER.warning(
                'There are no columns selected for stream %s, skipping it',
                stream.tap_stream_id)
            continue

        replication_method = md_map.get(
            (), {}).get('replication-method', default_replication_method)
        if replication_method == 'LOG_BASED' and md_map.get(
            (), {}).get('is-view'):
            LOGGER.warning(
                'Logical Replication is NOT supported for views. skipping stream %s',
                stream.tap_stream_id)
            continue

        if replication_method == 'LOG_BASED':
            state = do_sync_logical_replication(conn_config, stream, state,
                                                desired_columns, md_map)
        elif replication_method == 'FULL_TABLE':
            state = do_sync_full_table(conn_config, stream, state,
                                       desired_columns, md_map)
        else:
            raise Exception(
                "only LOG_BASED and FULL_TABLE are supported right now :)")

        state = singer.set_currently_syncing(state, None)
        singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
Example #23
def sync_non_oplog_streams(client, streams, state):
    for stream in streams:
        md_map = metadata.to_map(stream['metadata'])
        stream_metadata = md_map.get(())
        select_clause = stream_metadata.get('custom-select-clause')

        if not select_clause:
            LOGGER.warning(
                'There are no columns selected for stream %s, skipping it.',
                stream['tap_stream_id'])
            continue

        columns = [c.strip(' ') for c in select_clause.split(',')]
        columns.append('_id')

        state = singer.set_currently_syncing(state, stream['tap_stream_id'])

        # Emit a state message to indicate that we've started this stream
        singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

        replication_method = stream_metadata.get('replication-method')

        database_name = get_database_name(stream)

        with metrics.job_timer('sync_table') as timer:
            timer.tags['database'] = database_name
            timer.tags['table'] = stream['table_name']

            if replication_method == 'LOG_BASED':
                do_sync_historical_oplog(client, stream, state, columns)
            elif replication_method == 'FULL_TABLE':
                write_schema_message(stream)
                stream_version = common.get_stream_version(
                    stream['tap_stream_id'], state)
                full_table.sync_table(client, stream, state, stream_version,
                                      columns)

                state = singer.write_bookmark(state, stream['tap_stream_id'],
                                              'initial_full_table_complete',
                                              True)
            else:
                raise Exception(
                    f"only LOG_BASED and FULL TABLE replication methods are supported (you passed {replication_method})"
                )

    state = singer.set_currently_syncing(state, None)

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
Example #24
def sync_non_binlog_streams(mysql_conn,
                            non_binlog_catalog,
                            config,
                            state,
                            original_state_file=''):
    for catalog_entry in non_binlog_catalog.streams:
        columns = list(catalog_entry.schema.properties.keys())

        if not columns:
            LOGGER.warning(
                'There are no columns selected for stream %s, skipping it.',
                catalog_entry.stream)
            continue

        state = singer.set_currently_syncing(state,
                                             catalog_entry.tap_stream_id)

        # Emit a state message to indicate that we've started this stream
        singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

        md_map = metadata.to_map(catalog_entry.metadata)

        replication_method = md_map.get((), {}).get('replication-method')

        database_name = common.get_database_name(catalog_entry)

        with metrics.job_timer('sync_table') as timer:
            timer.tags['database'] = database_name
            timer.tags['table'] = catalog_entry.table

            log_engine(mysql_conn, catalog_entry)

            if replication_method == 'INCREMENTAL':
                do_sync_incremental(mysql_conn, catalog_entry, state, columns,
                                    original_state_file)
            elif replication_method == 'LOG_BASED':
                do_sync_historical_binlog(mysql_conn, config, catalog_entry,
                                          state, columns)
            elif replication_method == 'FULL_TABLE':
                do_sync_full_table(mysql_conn, config, catalog_entry, state,
                                   columns)
            else:
                raise Exception(
                    "only INCREMENTAL, LOG_BASED, and FULL TABLE replication methods are supported"
                )

    state = singer.set_currently_syncing(state, None)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
Example #25
def consume_message(streams, state, msg, time_extracted, conn_info):
    payload = json.loads(msg.payload)
    lsn = msg.data_start

    streams_lookup = {}
    for s in streams:
        streams_lookup[s['tap_stream_id']] = s

    for c in payload['change']:
        tap_stream_id = post_db.compute_tap_stream_id(conn_info['dbname'], c['schema'], c['table'])
        if streams_lookup.get(tap_stream_id) is None:
            continue

        target_stream = streams_lookup[tap_stream_id]
        stream_version = get_stream_version(target_stream['tap_stream_id'], state)
        stream_md_map = metadata.to_map(target_stream['metadata'])

        if c['kind'] == 'insert':
            col_vals = c['columnvalues'] + [None]
            col_names = c['columnnames'] + ['_sdc_deleted_at']
            record_message = row_to_singer_message(target_stream, col_vals, stream_version, col_names, time_extracted, stream_md_map, conn_info)
        elif c['kind'] == 'update':
            col_vals = c['columnvalues'] + [None]
            col_names = c['columnnames'] + ['_sdc_deleted_at']
            record_message = row_to_singer_message(target_stream, col_vals, stream_version, col_names, time_extracted, stream_md_map, conn_info)
        elif c['kind'] == 'delete':
            col_names = c['oldkeys']['keynames'] + ['_sdc_deleted_at']
            col_vals = c['oldkeys']['keyvalues']  + [singer.utils.strftime(time_extracted)]
            record_message = row_to_singer_message(target_stream, col_vals, stream_version, col_names, time_extracted, stream_md_map, conn_info)
        else:
            raise Exception("unrecognized replication operation: {}".format(c['kind']))

        sync_common.send_schema_message(target_stream, ['lsn'])

        singer.write_message(record_message)
        state = singer.write_bookmark(state,
                                      target_stream['tap_stream_id'],
                                      'lsn',
                                      lsn)
        LOGGER.debug("sending feedback to server with NO flush_lsn. just a keep-alive")
        msg.cursor.send_feedback()

    LOGGER.debug("sending feedback to server. flush_lsn = %s", msg.data_start)
    msg.cursor.send_feedback(flush_lsn=msg.data_start)


    return state
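
For orientation, a rough sketch (assuming wal2json-style logical decoding output) of the payload shape that json.loads(msg.payload) yields; schema, table, and column names below are made up:

# Hypothetical decoded payload: each entry in "change" carries the operation kind
# plus parallel columnnames/columnvalues lists (or oldkeys for deletes).
payload = {
    "change": [
        {"kind": "insert", "schema": "public", "table": "orders",
         "columnnames": ["id", "status"], "columnvalues": [1, "new"]},
        {"kind": "delete", "schema": "public", "table": "orders",
         "oldkeys": {"keynames": ["id"], "keyvalues": [1]}},
    ]
}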
Example #26
def sync_table(snowflake_conn, catalog_entry, state, columns):
    """Sync table incrementally"""
    common.whitelist_bookmark_keys(BOOKMARK_KEYS, catalog_entry.tap_stream_id,
                                   state)

    catalog_metadata = metadata.to_map(catalog_entry.metadata)
    stream_metadata = catalog_metadata.get((), {})

    replication_key_metadata = stream_metadata.get('replication-key')
    replication_key_state = singer.get_bookmark(state,
                                                catalog_entry.tap_stream_id,
                                                'replication_key')

    replication_key_value = None

    if replication_key_metadata == replication_key_state:
        replication_key_value = singer.get_bookmark(
            state, catalog_entry.tap_stream_id, 'replication_key_value')
    else:
        state = singer.write_bookmark(state, catalog_entry.tap_stream_id,
                                      'replication_key',
                                      replication_key_metadata)
        state = singer.clear_bookmark(state, catalog_entry.tap_stream_id,
                                      'replication_key_value')

    stream_version = common.get_stream_version(catalog_entry.tap_stream_id,
                                               state)
    state = singer.write_bookmark(state, catalog_entry.tap_stream_id,
                                  'version', stream_version)

    activate_version_message = singer.ActivateVersionMessage(
        stream=catalog_entry.stream, version=stream_version)

    singer.write_message(activate_version_message)

    with snowflake_conn.connect_with_backoff() as open_conn:
        with open_conn.cursor() as cur:
            if replication_key_value is not None:
                if catalog_entry.schema.properties[
                        replication_key_metadata].format == 'date-time':
                    replication_key_value = pendulum.parse(
                        replication_key_value)
            select_sql = common.generate_sql_query(
                catalog_entry, columns, bookmark_value=replication_key_value)
            params = {}
            common.sync_query(cur, catalog_entry, state, select_sql, columns,
                              stream_version, params)
Example #27
def do_sync_full_table(mysql_conn, catalog_entry, state, columns):
    LOGGER.info("Stream %s is using full table replication", catalog_entry.stream)

    write_schema_message(catalog_entry)

    stream_version = common.get_stream_version(catalog_entry.tap_stream_id, state)

    full_table.sync_table(mysql_conn, catalog_entry, state, columns, stream_version)

    # Prefer initial_full_table_complete going forward
    singer.clear_bookmark(state, catalog_entry.tap_stream_id, "version")

    state = singer.write_bookmark(
        state, catalog_entry.tap_stream_id, "initial_full_table_complete", True
    )

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
Example #28
def sync_traditional_stream(client: MongoClient, stream: Dict, state: Dict):
    """
    Sync given stream
    Args:
        client: MongoDb client instance
        stream: stream to sync
        state: state
    """
    tap_stream_id = stream['tap_stream_id']

    common.COUNTS[tap_stream_id] = 0
    common.TIMES[tap_stream_id] = 0
    common.SCHEMA_COUNT[tap_stream_id] = 0
    common.SCHEMA_TIMES[tap_stream_id] = 0

    md_map = metadata.to_map(stream['metadata'])
    replication_method = metadata.get(md_map, (), 'replication-method')

    if replication_method not in {INCREMENTAL_METHOD, FULL_TABLE_METHOD}:
        raise InvalidReplicationMethodException(replication_method,
                                                'replication method needs to be either FULL_TABLE or INCREMENTAL')

    database_name = metadata.get(md_map, (), 'database-name')

    # Emit a state message to indicate that we've started this stream
    state = clear_state_on_replication_change(stream, state)
    state = singer.set_currently_syncing(state, stream['tap_stream_id'])
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    write_schema_message(stream)
    common.SCHEMA_COUNT[tap_stream_id] += 1

    with metrics.job_timer('sync_table') as timer:
        timer.tags['database'] = database_name
        timer.tags['table'] = stream['table_name']

        collection = client[database_name][stream["table_name"]]

        if replication_method == 'FULL_TABLE':
            full_table.sync_collection(collection, stream, state)
        else:
            incremental.sync_collection(collection, stream, state)

    state = singer.set_currently_syncing(state, None)

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
Example #29
def sync_binlog_stream(mysql_conn, config, binlog_streams, state):
    binlog_streams_map = generate_streams_map(binlog_streams)

    for tap_stream_id, _ in binlog_streams_map.items():
        common.whitelist_bookmark_keys(BOOKMARK_KEYS, tap_stream_id, state)

    log_file, log_pos = calculate_bookmark(mysql_conn, binlog_streams_map,
                                           state)

    verify_log_file_exists(mysql_conn, log_file, log_pos)

    if config.get('server_id'):
        server_id = int(config.get('server_id'))
        LOGGER.info("Using provided server_id=%s", server_id)
    else:
        server_id = fetch_server_id(mysql_conn)
        LOGGER.info("No server_id provided, will use global server_id=%s",
                    server_id)

    connection_wrapper = make_connection_wrapper(config)
    reader = None

    try:
        slave_uuid = f"bi-reader-%04x" % random.getrandbits(64)

        reader = BinLogStreamReader(
            connection_settings={},
            server_id=server_id,
            slave_uuid=slave_uuid,
            log_file=log_file,
            log_pos=log_pos,
            resume_stream=True,
            only_events=[
                RotateEvent, WriteRowsEvent, UpdateRowsEvent, DeleteRowsEvent
            ],
            pymysql_wrapper=connection_wrapper,
        )
        LOGGER.info("Starting binlog replication with log_file=%s, log_pos=%s",
                    log_file, log_pos)
        _run_binlog_sync(mysql_conn, reader, binlog_streams_map, state, config)
    finally:
        # BinLogStreamReader doesn't implement the `with` methods
        # So, try/finally will close the chain from the top
        if reader:
            reader.close()

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
Example #30
    def for_each_invoice(invoice, time_extracted, stream_version=None):
        def map_invoice_message(message):
            message['invoice_id'] = invoice['id']
            return message

        def map_invoice_payment(payment):
            payment['invoice_id'] = invoice['id']
            payment['payment_gateway_id'] = payment['payment_gateway']['id']
            payment['payment_gateway_name'] = payment['payment_gateway'][
                'name']
            return payment

        # Sync invoice messages
        sync_endpoint("invoice_messages",
                      endpoint=("invoices/{}/messages".format(invoice['id'])),
                      path="invoice_messages",
                      with_updated_since=False,
                      map_handler=map_invoice_message,
                      stream_version=stream_version)

        # Sync invoice payments
        sync_endpoint("invoice_payments",
                      endpoint=("invoices/{}/payments".format(invoice['id'])),
                      path="invoice_payments",
                      with_updated_since=False,
                      map_handler=map_invoice_payment,
                      date_fields=["send_reminder_on"],
                      stream_version=stream_version)

        # Extract all invoice_line_items
        line_items_schema = load_and_write_schema("invoice_line_items")
        with Transformer() as transformer:
            for line_item in invoice['line_items']:
                line_item['invoice_id'] = invoice['id']
                if line_item['project'] is not None:
                    line_item['project_id'] = line_item['project']['id']
                else:
                    line_item['project_id'] = None
                line_item = transformer.transform(line_item, line_items_schema)

                new_record = singer.RecordMessage(
                    stream="invoice_line_items",
                    record=line_item,
                    version=stream_version,
                    time_extracted=time_extracted)
                singer.write_message(new_record)