def consume_message(streams, state, msg, time_extracted, conn_info):
    """Emit a Singer RECORD for every change in a wal2json message, bookmark
    the LSN per stream and acknowledge the message back to the server."""
    payload = json.loads(msg.payload)
    lsn = msg.data_start

    streams_lookup = {}
    for s in streams:
        streams_lookup[s['tap_stream_id']] = s

    for c in payload['change']:
        tap_stream_id = post_db.compute_tap_stream_id(conn_info['dbname'],
                                                      c['schema'], c['table'])
        if streams_lookup.get(tap_stream_id) is None:
            continue

        target_stream = streams_lookup[tap_stream_id]
        stream_version = get_stream_version(target_stream['tap_stream_id'],
                                            state)
        stream_md_map = metadata.to_map(target_stream['metadata'])

        if c['kind'] == 'insert':
            col_vals = c['columnvalues'] + [None]
            col_names = c['columnnames'] + ['_sdc_deleted_at']
            record_message = row_to_singer_message(target_stream, col_vals,
                                                   stream_version, col_names,
                                                   time_extracted,
                                                   stream_md_map, conn_info)
        elif c['kind'] == 'update':
            col_vals = c['columnvalues'] + [None]
            col_names = c['columnnames'] + ['_sdc_deleted_at']
            record_message = row_to_singer_message(target_stream, col_vals,
                                                   stream_version, col_names,
                                                   time_extracted,
                                                   stream_md_map, conn_info)
        elif c['kind'] == 'delete':
            col_names = c['oldkeys']['keynames'] + ['_sdc_deleted_at']
            col_vals = c['oldkeys']['keyvalues'] + [
                singer.utils.strftime(time_extracted)
            ]
            record_message = row_to_singer_message(target_stream, col_vals,
                                                   stream_version, col_names,
                                                   time_extracted,
                                                   stream_md_map, conn_info)
        else:
            raise Exception("unrecognized replication operation: {}".format(
                c['kind']))

        sync_common.send_schema_message(target_stream, ['lsn'])

        singer.write_message(record_message)
        state = singer.write_bookmark(state, target_stream['tap_stream_id'],
                                      'lsn', lsn)
        LOGGER.debug(
            "sending feedback to server with NO flush_lsn. just a keep-alive")
        msg.cursor.send_feedback()

    LOGGER.debug("sending feedback to server. flush_lsn = %s", msg.data_start)
    msg.cursor.send_feedback(flush_lsn=msg.data_start)

    return state
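
# Illustrative sketch, not part of the tap: the payload shape the function
# above expects after json.loads. Only the keys it reads are shown ('change',
# 'kind', 'schema', 'table', 'columnnames', 'columnvalues', 'oldkeys'); the
# table and values are hypothetical.
example_payload = {
    'change': [
        {'kind': 'insert',
         'schema': 'public',
         'table': 'orders',
         'columnnames': ['id', 'status'],
         'columnvalues': [1, 'new']},
        {'kind': 'delete',
         'schema': 'public',
         'table': 'orders',
         'oldkeys': {'keynames': ['id'], 'keyvalues': [1]}},
    ]
}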
Example #2
def consume_message_format_2(payload, conn_info, streams_lookup, state, time_extracted, lsn):
    """Translate a single wal2json format-2 message into at most one Singer
    RECORD, yielding it (or None) to match the generator API of format 1."""
    ## Action Types:
    # I = Insert
    # U = Update
    # D = Delete
    # B = Begin Transaction
    # C = Commit Transaction
    # M = Message
    # T = Truncate
    action = payload['action']
    if action not in ['U', 'I', 'D']:
        LOGGER.debug("Skipping message of type %s", action)
        yield None
    else:
        tap_stream_id = post_db.compute_tap_stream_id(conn_info['dbname'], payload['schema'], payload['table'])
        if streams_lookup.get(tap_stream_id) is None:
            yield None
        else:
            target_stream = streams_lookup[tap_stream_id]
            stream_version = get_stream_version(target_stream['tap_stream_id'], state)
            stream_md_map = metadata.to_map(target_stream['metadata'])

            desired_columns = [col for col in target_stream['schema']['properties'].keys() if sync_common.should_sync_column(stream_md_map, col)]

            col_names = []
            col_vals = []
            if payload['action'] in ['I', 'U']:
                for column in payload['columns']:
                    if column['name'] in set(desired_columns):
                        col_names.append(column['name'])
                        col_vals.append(column['value'])

                col_names = col_names + ['_sdc_deleted_at']
                col_vals = col_vals + [None]

                if conn_info.get('debug_lsn'):
                    col_names = col_names + ['_sdc_lsn']
                    col_vals = col_vals + [str(lsn)]

            elif payload['action'] == 'D':
                for column in payload['identity']:
                    if column['name'] in set(desired_columns):
                        col_names.append(column['name'])
                        col_vals.append(column['value'])

                col_names = col_names + ['_sdc_deleted_at']
                col_vals = col_vals + [singer.utils.strftime(singer.utils.strptime_to_utc(payload['timestamp']))]

                if conn_info.get('debug_lsn'):
                    col_vals = col_vals + [str(lsn)]
                    col_names = col_names + ['_sdc_lsn']

            # Yield 1 record to match the API of V1
            yield row_to_singer_message(target_stream, col_vals, stream_version, col_names, time_extracted, stream_md_map, conn_info)

            state = singer.write_bookmark(state,
                                          target_stream['tap_stream_id'],
                                          'lsn',
                                          lsn)
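
# Illustrative sketch, assumed shape only: decoded format-2 messages as the
# generator above reads them. Keys mirror the ones it touches ('action',
# 'schema', 'table', 'columns', 'identity', 'timestamp'); the table, columns
# and timestamp are hypothetical.
insert_payload = {
    'action': 'I',
    'schema': 'public',
    'table': 'orders',
    'columns': [{'name': 'id', 'value': 1},
                {'name': 'status', 'value': 'new'}],
}
delete_payload = {
    'action': 'D',
    'schema': 'public',
    'table': 'orders',
    'identity': [{'name': 'id', 'value': 1}],
    'timestamp': '2021-01-01 00:00:00.000000+00',
}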
Example #3
def discover_columns(connection, table_info):
    """
    Generates more info about columns of the given table
    """
    entries = []
    for schema_name in table_info.keys():
        for table_name in table_info[schema_name].keys():

            mdata = {}
            columns = table_info[schema_name][table_name]['columns']
            table_pks = [col_name for col_name, col_info in columns.items() if col_info.is_primary_key]
            with connection.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
                cur.execute(" SELECT current_database()")
                database_name = cur.fetchone()[0]

            metadata.write(mdata, (), 'table-key-properties', table_pks)
            metadata.write(mdata, (), 'schema-name', schema_name)
            metadata.write(mdata, (), 'database-name', database_name)
            metadata.write(mdata, (), 'row-count', table_info[schema_name][table_name]['row_count'])
            metadata.write(mdata, (), 'is-view', table_info[schema_name][table_name].get('is_view'))

            column_schemas = {col_name: schema_for_column(col_info) for col_name, col_info in columns.items()}

            schema = {'type': 'object',
                      'properties': column_schemas,
                      'definitions': {}}

            schema = include_array_schemas(columns, schema)

            for c_name in column_schemas.keys():
                mdata = write_sql_data_type_md(mdata, columns[c_name])

                if column_schemas[c_name].get('type') is None:
                    mdata = metadata.write(mdata, ('properties', c_name), 'inclusion', 'unsupported')
                    mdata = metadata.write(mdata, ('properties', c_name), 'selected-by-default', False)
                elif table_info[schema_name][table_name]['columns'][c_name].is_primary_key:
                    mdata = metadata.write(mdata, ('properties', c_name), 'inclusion', 'automatic')
                    mdata = metadata.write(mdata, ('properties', c_name), 'selected-by-default', True)
                else:
                    mdata = metadata.write(mdata, ('properties', c_name), 'inclusion', 'available')
                    mdata = metadata.write(mdata, ('properties', c_name), 'selected-by-default', True)

            entry = {'table_name': table_name,
                     'stream': table_name,
                     'metadata': metadata.to_list(mdata),
                     'tap_stream_id': post_db.compute_tap_stream_id(schema_name, table_name),
                     'schema': schema}

            entries.append(entry)

    return entries
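
# Illustrative sketch of the table_info structure the function above walks,
# inferred from its lookups: schema name -> table name -> column/row details.
# ColumnInfo is a hypothetical stand-in for the per-column record the tap
# builds (the real one also carries the type details schema_for_column reads).
from collections import namedtuple

ColumnInfo = namedtuple('ColumnInfo', ['is_primary_key'])

example_table_info = {
    'public': {
        'orders': {
            'columns': {'id': ColumnInfo(is_primary_key=True)},
            'row_count': 42,
            'is_view': False,
        }
    }
}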
Example #4
def consume_message(streams, state, msg, time_extracted, conn_info, end_lsn):
    """Emit Singer RECORDs for the changes in a wal2json message, restricted
    to columns selected for sync; bookmark and acknowledge the LSN, refusing
    to flush past end_lsn."""
    payload = json.loads(msg.payload)
    lsn = msg.data_start

    streams_lookup = {}
    for s in streams:
        streams_lookup[s['tap_stream_id']] = s

    for c in payload['change']:
        tap_stream_id = post_db.compute_tap_stream_id(conn_info['dbname'],
                                                      c['schema'], c['table'])
        if streams_lookup.get(tap_stream_id) is None:
            continue

        target_stream = streams_lookup[tap_stream_id]
        stream_version = get_stream_version(target_stream['tap_stream_id'],
                                            state)
        stream_md_map = metadata.to_map(target_stream['metadata'])

        desired_columns = [
            col for col in target_stream['schema']['properties'].keys()
            if sync_common.should_sync_column(stream_md_map, col)
        ]

        if c['kind'] == 'insert':
            col_names = []
            col_vals = []
            for idx, col in enumerate(c['columnnames']):
                if col in set(desired_columns):
                    col_names.append(col)
                    col_vals.append(c['columnvalues'][idx])

            col_names = col_names + ['_sdc_deleted_at']
            col_vals = col_vals + [None]
            if conn_info.get('debug_lsn'):
                col_names = col_names + ['_sdc_lsn']
                col_vals = col_vals + [str(lsn)]
            record_message = row_to_singer_message(target_stream, col_vals,
                                                   stream_version, col_names,
                                                   time_extracted,
                                                   stream_md_map, conn_info)

        elif c['kind'] == 'update':
            col_names = []
            col_vals = []
            for idx, col in enumerate(c['columnnames']):
                if col in set(desired_columns):
                    col_names.append(col)
                    col_vals.append(c['columnvalues'][idx])

            col_names = col_names + ['_sdc_deleted_at']
            col_vals = col_vals + [None]

            if conn_info.get('debug_lsn'):
                col_vals = col_vals + [str(lsn)]
                col_names = col_names + ['_sdc_lsn']
            record_message = row_to_singer_message(target_stream, col_vals,
                                                   stream_version, col_names,
                                                   time_extracted,
                                                   stream_md_map, conn_info)

        elif c['kind'] == 'delete':
            col_names = []
            col_vals = []
            for idx, col in enumerate(c['oldkeys']['keynames']):
                if col in set(desired_columns):
                    col_names.append(col)
                    col_vals.append(c['oldkeys']['keyvalues'][idx])

            col_names = col_names + ['_sdc_deleted_at']
            col_vals = col_vals + [singer.utils.strftime(time_extracted)]
            if conn_info.get('debug_lsn'):
                col_vals = col_vals + [str(lsn)]
                col_names = col_names + ['_sdc_lsn']
            record_message = row_to_singer_message(target_stream, col_vals,
                                                   stream_version, col_names,
                                                   time_extracted,
                                                   stream_md_map, conn_info)

        else:
            raise Exception("unrecognized replication operation: {}".format(
                c['kind']))

        singer.write_message(record_message)
        state = singer.write_bookmark(state, target_stream['tap_stream_id'],
                                      'lsn', lsn)
        LOGGER.debug(
            "sending feedback to server with NO flush_lsn. just a keep-alive")
        msg.cursor.send_feedback()

    LOGGER.debug("sending feedback to server. flush_lsn = %s", msg.data_start)
    if msg.data_start > end_lsn:
        raise Exception(
            "incorrectly attempting to flush an lsn({}) > end_lsn({})".format(
                msg.data_start, end_lsn))

    msg.cursor.send_feedback(flush_lsn=msg.data_start)

    return state
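
# A minimal, hypothetical sketch of how a consume_message like the one above
# might be driven with psycopg2's logical replication API. The DSN, slot name
# and the way end_lsn is obtained are assumptions, not part of the tap.
import psycopg2
import psycopg2.extras
import singer


def replication_loop(streams, state, conn_info, end_lsn, dsn, slot_name):
    conn = psycopg2.connect(
        dsn, connection_factory=psycopg2.extras.LogicalReplicationConnection)
    with conn.cursor() as cur:
        cur.start_replication(slot_name=slot_name, decode=True)

        def on_message(msg):
            # msg carries .payload, .data_start and .cursor, which is all
            # consume_message uses
            nonlocal state
            state = consume_message(streams, state, msg,
                                    singer.utils.now(), conn_info, end_lsn)

        cur.consume_stream(on_message)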
Example #5
def consume_message(streams, state, msg, time_extracted, conn_info):
    """Handle one wal2json message: refresh the stream schema when new
    columns appear in the payload, then emit a Singer RECORD and bookmark
    the LSN."""
    # Strip leading comma generated by write-in-chunks and parse valid JSON
    try:
        payload = json.loads(msg.payload.lstrip(','))
    except Exception:
        return state

    lsn = msg.data_start

    streams_lookup = {s['tap_stream_id']: s for s in streams}

    tap_stream_id = post_db.compute_tap_stream_id(payload['schema'],
                                                  payload['table'])
    if streams_lookup.get(tap_stream_id) is None:
        return state

    target_stream = streams_lookup[tap_stream_id]

    if payload['kind'] not in {'insert', 'update', 'delete'}:
        raise UnsupportedPayloadKindError(
            f"unrecognized replication operation: {payload['kind']}")

    # Get the additional fields in payload that are not in schema properties:
    # only inserts and updates have the list of columns, which can be used to detect any difference in columns
    diff = set()
    if payload['kind'] in {'insert', 'update'}:
        diff = set(payload['columnnames']).difference(
            target_stream['schema']['properties'].keys())

    # if there are new columns in the payload that are not in the schema properties, then refresh the stream schema
    if diff:
        LOGGER.info(
            'Detected new columns "%s", refreshing schema of stream %s', diff,
            target_stream['stream'])
        # encountered a column that is not in the schema
        # refresh the stream schema and metadata by running discovery
        refresh_streams_schema(conn_info, [target_stream])

        # add the automatic properties back to the stream
        add_automatic_properties(target_stream,
                                 conn_info.get('debug_lsn', False))

        # publish new schema
        sync_common.send_schema_message(target_stream, ['lsn'])

    stream_version = get_stream_version(target_stream['tap_stream_id'], state)
    stream_md_map = metadata.to_map(target_stream['metadata'])

    desired_columns = {
        c
        for c in target_stream['schema']['properties'].keys()
        if sync_common.should_sync_column(stream_md_map, c)
    }

    if payload['kind'] in {'insert', 'update'}:
        col_names = []
        col_vals = []

        for idx, col in enumerate(payload['columnnames']):
            if col in desired_columns:
                col_names.append(col)
                col_vals.append(payload['columnvalues'][idx])

        col_names.append('_sdc_deleted_at')
        col_vals.append(None)

        if conn_info.get('debug_lsn'):
            col_names.append('_sdc_lsn')
            col_vals.append(str(lsn))

        record_message = row_to_singer_message(target_stream, col_vals,
                                               stream_version, col_names,
                                               time_extracted, stream_md_map,
                                               conn_info)

    elif payload['kind'] == 'delete':
        col_names = []
        col_vals = []
        for idx, col in enumerate(payload['oldkeys']['keynames']):
            if col in desired_columns:
                col_names.append(col)
                col_vals.append(payload['oldkeys']['keyvalues'][idx])

        col_names.append('_sdc_deleted_at')
        col_vals.append(singer.utils.strftime(time_extracted))

        if conn_info.get('debug_lsn'):
            col_names.append('_sdc_lsn')
            col_vals.append(str(lsn))

        record_message = row_to_singer_message(target_stream, col_vals,
                                               stream_version, col_names,
                                               time_extracted, stream_md_map,
                                               conn_info)

    singer.write_message(record_message)
    state = singer.write_bookmark(state, target_stream['tap_stream_id'], 'lsn',
                                  lsn)

    return state
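
# Hedged illustration of the lstrip(',') above: with wal2json's
# write-in-chunks option (as the comment in the function suggests), a message
# may arrive as a fragment prefixed by the separator of the surrounding JSON
# document, so the tap strips it and skips anything that still fails to
# parse. The sample chunk below is made up.
import json

chunk = ',{"kind": "insert", "schema": "public", "table": "orders"}'
payload = json.loads(chunk.lstrip(','))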
Example #6
def consume_message_format_1(payload, conn_info, streams_lookup, state,
                             time_extracted, lsn):
    """Yield a Singer RECORD for each change in a wal2json format-1 payload
    and bookmark the LSN for every matched stream."""
    for c in payload['change']:
        tap_stream_id = post_db.compute_tap_stream_id(conn_info['dbname'],
                                                      c['schema'], c['table'])
        if streams_lookup.get(tap_stream_id) is None:
            continue

        target_stream = streams_lookup[tap_stream_id]
        stream_version = get_stream_version(target_stream['tap_stream_id'],
                                            state)
        stream_md_map = metadata.to_map(target_stream['metadata'])

        desired_columns = [
            col for col in target_stream['schema']['properties'].keys()
            if sync_common.should_sync_column(stream_md_map, col)
        ]

        if c['kind'] == 'insert':
            col_names = []
            col_vals = []
            for idx, col in enumerate(c['columnnames']):
                if col in set(desired_columns):
                    col_names.append(col)
                    col_vals.append(c['columnvalues'][idx])

            col_names = col_names + ['_sdc_deleted_at']
            col_vals = col_vals + [None]
            if conn_info.get('debug_lsn'):
                col_names = col_names + ['_sdc_lsn']
                col_vals = col_vals + [str(lsn)]
            record_message = row_to_singer_message(target_stream, col_vals,
                                                   stream_version, col_names,
                                                   time_extracted,
                                                   stream_md_map, conn_info)

        elif c['kind'] == 'update':
            col_names = []
            col_vals = []
            for idx, col in enumerate(c['columnnames']):
                if col in set(desired_columns):
                    col_names.append(col)
                    col_vals.append(c['columnvalues'][idx])

            col_names = col_names + ['_sdc_deleted_at']
            col_vals = col_vals + [None]

            if conn_info.get('debug_lsn'):
                col_vals = col_vals + [str(lsn)]
                col_names = col_names + ['_sdc_lsn']
            record_message = row_to_singer_message(target_stream, col_vals,
                                                   stream_version, col_names,
                                                   time_extracted,
                                                   stream_md_map, conn_info)

        elif c['kind'] == 'delete':
            col_names = []
            col_vals = []
            for idx, col in enumerate(c['oldkeys']['keynames']):
                if col in set(desired_columns):
                    col_names.append(col)
                    col_vals.append(c['oldkeys']['keyvalues'][idx])

            col_names = col_names + ['_sdc_deleted_at']
            col_vals = col_vals + [singer.utils.strftime(time_extracted)]
            if conn_info.get('debug_lsn'):
                col_vals = col_vals + [str(lsn)]
                col_names = col_names + ['_sdc_lsn']
            record_message = row_to_singer_message(target_stream, col_vals,
                                                   stream_version, col_names,
                                                   time_extracted,
                                                   stream_md_map, conn_info)

        else:
            raise Exception("unrecognized replication operation: {}".format(
                c['kind']))

        yield record_message
        state = singer.write_bookmark(state, target_stream['tap_stream_id'],
                                      'lsn', lsn)
Example #7
def consume_message(streams, state, msg, time_extracted, conn_info, end_lsn):
    """Handle one wal2json message and bookmark the LSN; unlike the original
    tap, flushing the WAL back to the server is deferred to the next run."""
    # Strip leading comma generated by write-in-chunks and parse valid JSON
    try:
        payload = json.loads(msg.payload.lstrip(','))
    except Exception:
        return state

    lsn = msg.data_start

    streams_lookup = {}
    for s in streams:
        streams_lookup[s['tap_stream_id']] = s

    tap_stream_id = post_db.compute_tap_stream_id(payload['schema'],
                                                  payload['table'])
    if streams_lookup.get(tap_stream_id) is None:
        return state

    target_stream = streams_lookup[tap_stream_id]
    stream_version = get_stream_version(target_stream['tap_stream_id'], state)
    stream_md_map = metadata.to_map(target_stream['metadata'])

    desired_columns = [
        c for c in target_stream['schema']['properties'].keys()
        if sync_common.should_sync_column(stream_md_map, c)
    ]

    if payload['kind'] == 'insert':
        col_names = []
        col_vals = []
        for idx, col in enumerate(payload['columnnames']):
            if col in set(desired_columns):
                col_names.append(col)
                col_vals.append(payload['columnvalues'][idx])

        col_names = col_names + ['_sdc_deleted_at']
        col_vals = col_vals + [None]
        if conn_info.get('debug_lsn'):
            col_names = col_names + ['_sdc_lsn']
            col_vals = col_vals + [str(lsn)]
        record_message = row_to_singer_message(target_stream, col_vals,
                                               stream_version, col_names,
                                               time_extracted, stream_md_map,
                                               conn_info)

    elif payload['kind'] == 'update':
        col_names = []
        col_vals = []
        for idx, col in enumerate(payload['columnnames']):
            if col in set(desired_columns):
                col_names.append(col)
                col_vals.append(payload['columnvalues'][idx])

        col_names = col_names + ['_sdc_deleted_at']
        col_vals = col_vals + [None]

        if conn_info.get('debug_lsn'):
            col_vals = col_vals + [str(lsn)]
            col_names = col_names + ['_sdc_lsn']
        record_message = row_to_singer_message(target_stream, col_vals,
                                               stream_version, col_names,
                                               time_extracted, stream_md_map,
                                               conn_info)

    elif payload['kind'] == 'delete':
        col_names = []
        col_vals = []
        for idx, col in enumerate(payload['oldkeys']['keynames']):
            if col in set(desired_columns):
                col_names.append(col)
                col_vals.append(payload['oldkeys']['keyvalues'][idx])

        col_names = col_names + ['_sdc_deleted_at']
        col_vals = col_vals + [singer.utils.strftime(time_extracted)]
        if conn_info.get('debug_lsn'):
            col_vals = col_vals + [str(lsn)]
            col_names = col_names + ['_sdc_lsn']
        record_message = row_to_singer_message(target_stream, col_vals,
                                               stream_version, col_names,
                                               time_extracted, stream_md_map,
                                               conn_info)

    else:
        raise Exception("unrecognized replication operation: {}".format(
            payload['kind']))

    singer.write_message(record_message)
    state = singer.write_bookmark(state, target_stream['tap_stream_id'], 'lsn',
                                  lsn)

    # Below is the behaviour of the original tap-postgres: flush the source server WAL to the latest lsn received in the current run.
    # The Pipelinewise version flushes only at the start of the next run, to ensure the data has been committed on the destination server.
    # if msg.data_start > end_lsn:
    #     raise Exception("incorrectly attempting to flush an lsn({}) > end_lsn({})".format(msg.data_start, end_lsn))
    # LOGGER.info("Confirming write up to {}, flush to {}".format(int_to_lsn(msg.data_start), int_to_lsn(msg.data_start)))
    # msg.cursor.send_feedback(write_lsn=msg.data_start, flush_lsn=msg.data_start, reply=True)

    return state
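
# Hypothetical sketch of the deferred flush described in the comment above:
# at the start of the next run, the previously committed LSN could be read
# back from the Singer bookmark and confirmed to the server. send_feedback is
# psycopg2's replication feedback call; the helper name and the assumption
# that the bookmark holds the safely committed LSN are mine, not the tap's.
def flush_previous_run(cursor, state, tap_stream_id):
    committed_lsn = state.get('bookmarks', {}).get(tap_stream_id, {}).get('lsn')
    if committed_lsn:
        cursor.send_feedback(write_lsn=committed_lsn,
                             flush_lsn=committed_lsn,
                             reply=True)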
Example #8
def discover_columns(connection, table_info):
    """Build a catalog entry with schema and metadata for each table in
    table_info."""
    entries = []
    for schema_name in table_info.keys():
        for table_name in table_info[schema_name].keys():

            mdata = {}
            columns = table_info[schema_name][table_name]["columns"]
            table_pks = [
                col_name for col_name, col_info in columns.items()
                if col_info.is_primary_key
            ]
            with connection.cursor(
                    cursor_factory=psycopg2.extras.DictCursor) as cur:
                cur.execute(" SELECT current_database()")
                database_name = cur.fetchone()[0]

            metadata.write(mdata, (), "table-key-properties", table_pks)
            metadata.write(mdata, (), "schema-name", schema_name)
            metadata.write(mdata, (), "database-name", database_name)
            metadata.write(mdata, (), "row-count",
                           table_info[schema_name][table_name]["row_count"])
            metadata.write(mdata, (), "is-view",
                           table_info[schema_name][table_name].get("is_view"))

            column_schemas = {
                col_name: schema_for_column(col_info)
                for col_name, col_info in columns.items()
            }

            schema = {
                "type": "object",
                "properties": column_schemas,
                "definitions": {}
            }

            schema = include_array_schemas(columns, schema)

            for c_name in column_schemas.keys():
                mdata = write_sql_data_type_md(mdata, columns[c_name])

                if column_schemas[c_name].get("type") is None:
                    mdata = metadata.write(mdata, ("properties", c_name),
                                           "inclusion", "unsupported")
                    mdata = metadata.write(mdata, ("properties", c_name),
                                           "selected-by-default", False)
                elif table_info[schema_name][table_name]["columns"][
                        c_name].is_primary_key:
                    mdata = metadata.write(mdata, ("properties", c_name),
                                           "inclusion", "automatic")
                    mdata = metadata.write(mdata, ("properties", c_name),
                                           "selected-by-default", True)
                else:
                    mdata = metadata.write(mdata, ("properties", c_name),
                                           "inclusion", "available")
                    mdata = metadata.write(mdata, ("properties", c_name),
                                           "selected-by-default", True)

            entry = {
                "table_name": table_name,
                "stream": table_name,
                "metadata": metadata.to_list(mdata),
                "tap_stream_id": post_db.compute_tap_stream_id(
                    database_name, schema_name, table_name),
                "schema": schema,
            }

            entries.append(entry)

    return entries
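
# Illustrative sketch (hypothetical database, table and column, with the
# column schema abbreviated) of one catalog entry the function above returns:
# a Singer stream entry whose metadata list comes from metadata.to_list.
example_entry = {
    'table_name': 'orders',
    'stream': 'orders',
    'tap_stream_id': 'example_db-public-orders',  # assumed id format
    'schema': {'type': 'object',
               'properties': {'id': {'type': ['integer']}},
               'definitions': {}},
    'metadata': [
        {'breadcrumb': (),
         'metadata': {'table-key-properties': ['id'],
                      'schema-name': 'public',
                      'database-name': 'example_db',
                      'row-count': 42,
                      'is-view': False}},
        {'breadcrumb': ('properties', 'id'),
         'metadata': {'inclusion': 'automatic',
                      'selected-by-default': True}},
    ],
}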