Example 1
    def test_catalog(self):
        conn_config = get_test_connection_config()
        streams = tap_postgres.do_discovery(conn_config)
        chicken_streams = [
            s for s in streams
            if s["tap_stream_id"] == "postgres-public-CHICKEN TIMES"
        ]

        self.assertEqual(len(chicken_streams), 1)
        stream_dict = chicken_streams[0]
        stream_dict.get("metadata").sort(key=lambda md: md["breadcrumb"])

        self.assertEqual(
            metadata.to_map(stream_dict.get("metadata")),
            {
                (): {
                    "is-view": False,
                    "table-key-properties": [],
                    "row-count": 0,
                    "schema-name": "public",
                    "database-name": "postgres",
                },
                ("properties", "bytea_col"): {
                    "sql-datatype": "bytea",
                    "selected-by-default": False,
                    "inclusion": "unsupported",
                },
                ("properties", "bit_string_col"): {
                    "sql-datatype": "bit(5)",
                    "selected-by-default": False,
                    "inclusion": "unsupported",
                },
                ("properties", "line_col"): {
                    "sql-datatype": "line",
                    "selected-by-default": False,
                    "inclusion": "unsupported",
                },
                ("properties", "xml_col"): {
                    "sql-datatype": "xml",
                    "selected-by-default": False,
                    "inclusion": "unsupported",
                },
                ("properties", "circle_col"): {
                    "sql-datatype": "circle",
                    "selected-by-default": False,
                    "inclusion": "unsupported",
                },
                ("properties", "polygon_col"): {
                    "sql-datatype": "polygon",
                    "selected-by-default": False,
                    "inclusion": "unsupported",
                },
                ("properties", "box_col"): {
                    "sql-datatype": "box",
                    "selected-by-default": False,
                    "inclusion": "unsupported",
                },
                ("properties", "lseg_col"): {
                    "sql-datatype": "lseg",
                    "selected-by-default": False,
                    "inclusion": "unsupported",
                },
                ("properties", "composite_col"): {
                    "sql-datatype": "person_composite",
                    "selected-by-default": False,
                    "inclusion": "unsupported",
                },
                ("properties", "point_col"): {
                    "sql-datatype": "point",
                    "selected-by-default": False,
                    "inclusion": "unsupported",
                },
            },
        )
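The assertion above compares against the breadcrumb-keyed dictionary that singer-python's metadata.to_map() builds from the catalog's list-of-entries form. A minimal sketch of that round trip, using only the singer.metadata helpers these snippets already rely on:

from singer import metadata

# The catalog stores metadata as a list of {'breadcrumb': ..., 'metadata': ...}
# entries; to_map() re-keys it by breadcrumb tuple (the form asserted above),
# and to_list() is the inverse.
raw = [
    {'breadcrumb': [], 'metadata': {'schema-name': 'public', 'is-view': False}},
    {'breadcrumb': ['properties', 'bytea_col'],
     'metadata': {'sql-datatype': 'bytea', 'inclusion': 'unsupported'}},
]

md_map = metadata.to_map(raw)
assert md_map[()]['schema-name'] == 'public'
assert md_map[('properties', 'bytea_col')]['inclusion'] == 'unsupported'
assert {tuple(e['breadcrumb']) for e in metadata.to_list(md_map)} == set(md_map)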
Example 2
def sync_log_based(config, state, stream):
    table_name = stream['tap_stream_id']

    client = dynamodb.get_client(config)
    streams_client = dynamodb.get_stream_client(config)

    md_map = metadata.to_map(stream['metadata'])
    projection = metadata.get(md_map, (), 'tap-mongodb.projection')
    if projection is not None:
        projection = [x.strip().split('.') for x in projection.split(',')]

    # Write activate version message
    stream_version = singer.get_bookmark(state, table_name, 'version')
    singer.write_version(table_name, stream_version)

    table = client.describe_table(TableName=table_name)['Table']
    stream_arn = table['LatestStreamArn']
    seq_number_bookmarks = singer.get_bookmark(state, table_name,
                                               'shard_seq_numbers')
    if not seq_number_bookmarks:
        seq_number_bookmarks = {}

    deserializer = deserialize.Deserializer()

    rows_saved = 0

    for shard in get_shards(streams_client, stream_arn):
        # check for bookmark
        seq_number = seq_number_bookmarks.get(shard['ShardId'])
        if seq_number:
            iterator_type = 'AFTER_SEQUENCE_NUMBER'
        else:
            iterator_type = 'TRIM_HORIZON'

        for record in get_shard_records(streams_client, stream_arn, shard,
                                        iterator_type, seq_number):
            if record['eventName'] == 'REMOVE':
                record_message = deserializer.deserialize_item(
                    record['dynamodb']['Keys'])
                record_message[SDC_DELETED_AT] = singer.utils.strftime(
                    record['dynamodb']['ApproximateCreationDateTime'])
            else:
                record_message = deserializer.deserialize_item(
                    record['dynamodb'].get('NewImage'))
                if record_message is None:
                    LOGGER.fatal(
                        'Dynamo stream view type must be either "NEW_IMAGE" or "NEW_AND_OLD_IMAGES"'
                    )
                    raise RuntimeError(
                        'Dynamo stream view type must be either "NEW_IMAGE" or "NEW_AND_OLD_IMAGES"'
                    )
                if projection is not None:
                    try:
                        record_message = deserializer.apply_projection(
                            record_message, projection)
                    except Exception:
                        LOGGER.fatal(
                            "Projection failed to apply: %s",
                            metadata.get(md_map, (), 'tap-mongodb.projection'))
                        raise RuntimeError(
                            'Projection failed to apply: {}'.format(
                                metadata.get(md_map, (),
                                             'tap-mongodb.projection')))

            record_message = singer.RecordMessage(stream=table_name,
                                                  record=record_message,
                                                  version=stream_version)
            singer.write_message(record_message)

            rows_saved += 1

            seq_number_bookmarks[
                shard['ShardId']] = record['dynamodb']['SequenceNumber']
            state = singer.write_bookmark(state, table_name,
                                          'shard_seq_numbers',
                                          seq_number_bookmarks)

            if rows_saved % WRITE_STATE_PERIOD == 0:
                singer.write_state(state)

        # If the shard we just finished syncing is closed (i.e. has an
        # EndingSequenceNumber), pop it off
        if shard['SequenceNumberRange'].get('EndingSequenceNumber'):
            # Must check if the bookmark exists because if a shard has 0
            # records we will never set a bookmark for the shard
            if seq_number_bookmarks.get(shard['ShardId']):
                seq_number_bookmarks.pop(shard['ShardId'])
                state = singer.write_bookmark(state, table_name,
                                              'shard_seq_numbers',
                                              seq_number_bookmarks)

        singer.write_state(state)

    return rows_saved
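get_shards() and get_shard_records() are tap helpers that are not shown here. The sketch below is an assumption about their shape, written directly against the boto3 DynamoDB Streams client (describe_stream, get_shard_iterator, get_records); it is not the tap's actual implementation.

def get_shards_sketch(streams_client, stream_arn):
    # Page through describe_stream until LastEvaluatedShardId is exhausted.
    params = {'StreamArn': stream_arn}
    while True:
        description = streams_client.describe_stream(**params)['StreamDescription']
        for shard in description['Shards']:
            yield shard
        last_shard_id = description.get('LastEvaluatedShardId')
        if not last_shard_id:
            return
        params['ExclusiveStartShardId'] = last_shard_id


def get_shard_records_sketch(streams_client, stream_arn, shard,
                             iterator_type, seq_number=None):
    # Walk a single shard with get_records; stop once a page comes back empty.
    kwargs = {
        'StreamArn': stream_arn,
        'ShardId': shard['ShardId'],
        'ShardIteratorType': iterator_type,
    }
    if seq_number:
        # Only AT_/AFTER_SEQUENCE_NUMBER iterator types accept a SequenceNumber.
        kwargs['SequenceNumber'] = seq_number
    iterator = streams_client.get_shard_iterator(**kwargs)['ShardIterator']
    while iterator:
        response = streams_client.get_records(ShardIterator=iterator)
        records = response.get('Records', [])
        if not records:
            break
        for record in records:
            yield record
        iterator = response.get('NextShardIterator')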
Example 3
def sync_query(cursor, catalog_entry, state, select_sql, columns,
               stream_version, params):
    replication_key = singer.get_bookmark(state, catalog_entry.tap_stream_id,
                                          "replication_key")

    # query_string = cursor.mogrify(select_sql, params)

    time_extracted = utils.now()
    cursor.execute(select_sql, params)

    row = cursor.fetchone()
    rows_saved = 0

    database_name = get_database_name(catalog_entry)

    with metrics.record_counter(None) as counter:
        counter.tags["database"] = database_name
        counter.tags["table"] = catalog_entry.table

        while row:
            counter.increment()
            rows_saved += 1
            record_message = row_to_singer_record(catalog_entry,
                                                  stream_version, row, columns,
                                                  time_extracted)
            singer.write_message(record_message)
            md_map = metadata.to_map(catalog_entry.metadata)
            stream_metadata = md_map.get((), {})
            replication_method = stream_metadata.get("replication-method")

            # Get the replication-keys-coalesce definition, if one exists.
            replication_keys_coalesce = stream_metadata.get(
                "replication-keys-coalesce")

            if replication_method in {"FULL_TABLE", "LOG_BASED"}:
                key_properties = get_key_properties(catalog_entry)

                max_pk_values = singer.get_bookmark(
                    state, catalog_entry.tap_stream_id, "max_pk_values")

                if max_pk_values:
                    last_pk_fetched = {
                        k: v
                        for k, v in record_message.record.items()
                        if k in key_properties
                    }

                    state = singer.write_bookmark(state,
                                                  catalog_entry.tap_stream_id,
                                                  "last_pk_fetched",
                                                  last_pk_fetched)

            elif replication_method == "INCREMENTAL":
                if replication_key is not None:
                    state = singer.write_bookmark(state,
                                                  catalog_entry.tap_stream_id,
                                                  "replication_key",
                                                  replication_key)

                    # If replication_keys_coalesce is defined, use the first
                    # value that is not null.
                    if replication_keys_coalesce is not None:
                        for replication_key_current in replication_keys_coalesce:
                            replication_key_for_value = replication_key_current
                            if record_message.record[
                                    replication_key_current] is not None:
                                break
                    else:
                        replication_key_for_value = replication_key

                    state = singer.write_bookmark(
                        state,
                        catalog_entry.tap_stream_id,
                        "replication_key_value",
                        record_message.record[replication_key_for_value],
                    )

            # On a FULL_TABLE, do not update the state until the whole table has completed.
            if rows_saved % 1000 == 0 and replication_method != "FULL_TABLE":
                singer.write_message(
                    singer.StateMessage(value=copy.deepcopy(state)))

            row = cursor.fetchone()

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
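The coalesce branch above picks the first candidate key whose value is not null, falls back to the last candidate when every value is null, and uses the plain replication key when no coalesce list is configured. A hypothetical standalone restatement of that selection logic, for clarity:

def pick_replication_key(record, replication_key, replication_keys_coalesce=None):
    # Hypothetical helper mirroring the branch above: first non-null candidate
    # wins; if all are null, the last candidate is used; with no coalesce list,
    # the plain replication key is used.
    if not replication_keys_coalesce:
        return replication_key
    chosen = replication_key
    for candidate in replication_keys_coalesce:
        chosen = candidate
        if record.get(candidate) is not None:
            break
    return chosen


# pick_replication_key({'updated_at': None, 'created_at': '2021-01-01'},
#                      'updated_at', ['updated_at', 'created_at'])
# -> 'created_at'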
Example 4
def get_non_binlog_streams(mysql_conn, catalog, config, state):
    '''Returns the Catalog of data we're going to sync for all SELECT-based
    streams (i.e. INCREMENTAL, FULL_TABLE, and LOG_BASED that require a historical
    sync). LOG_BASED streams that require a historical sync are inferred from lack
    of any state.

    Using the Catalog provided from the input file, this function will return a
    Catalog representing exactly which tables and columns that will be emitted
    by SELECT-based syncs. This is achieved by comparing the input Catalog to a
    freshly discovered Catalog to determine the resulting Catalog.

    The resulting Catalog will include any streams marked as "selected"
    that currently exist in the database. Columns marked as "selected" and
    those labeled "automatic" (e.g. primary keys and replication keys) will
    be included. Streams will be prioritized in the following order:
      1. currently_syncing if it is SELECT-based
      2. any streams that do not have state
      3. any streams that do not have a replication method of LOG_BASED

    '''
    discovered = discover_catalog(mysql_conn, config)

    # Filter catalog to include only selected streams
    selected_streams = list(
        filter(lambda s: common.stream_is_selected(s), catalog.streams))
    streams_with_state = []
    streams_without_state = []

    for stream in selected_streams:
        stream_metadata = metadata.to_map(stream.metadata)
        replication_method = stream_metadata.get((),
                                                 {}).get('replication-method')
        stream_state = state.get('bookmarks', {}).get(stream.tap_stream_id)

        if not stream_state:
            if replication_method == 'LOG_BASED':
                LOGGER.info(
                    "LOG_BASED stream %s requires full historical sync",
                    stream.tap_stream_id)

            streams_without_state.append(stream)
        elif stream_state and replication_method == 'LOG_BASED' and binlog_stream_requires_historical(
                stream, state):
            is_view = common.get_is_view(stream)

            if is_view:
                raise Exception(
                    "Unable to replicate stream({}) with binlog because it is a view."
                    .format(stream.stream))

            LOGGER.info("LOG_BASED stream %s will resume its historical sync",
                        stream.tap_stream_id)

            streams_with_state.append(stream)
        elif stream_state and replication_method != 'LOG_BASED':
            streams_with_state.append(stream)

    # If the state says we were in the middle of processing a stream, skip
    # to that stream. Then process streams without prior state and finally
    # move onto streams with state (i.e. have been synced in the past)
    currently_syncing = singer.get_currently_syncing(state)

    # prioritize streams that have not been processed
    ordered_streams = streams_without_state + streams_with_state

    if currently_syncing:
        currently_syncing_stream = list(
            filter(
                lambda s: s.tap_stream_id == currently_syncing and
                is_valid_currently_syncing_stream(s, state),
                streams_with_state))

        non_currently_syncing_streams = list(
            filter(lambda s: s.tap_stream_id != currently_syncing,
                   ordered_streams))

        streams_to_sync = currently_syncing_stream + non_currently_syncing_streams
    else:
        # prioritize streams that have not been processed
        streams_to_sync = ordered_streams

    return resolve_catalog(discovered, streams_to_sync)
Example 5
def sync_records(sf, catalog_entry, state, counter):
    chunked_bookmark = singer_utils.strptime_with_tz(
        sf.get_start_date(state, catalog_entry))
    stream = catalog_entry['stream']
    schema = catalog_entry['schema']
    stream_alias = catalog_entry.get('stream_alias')
    catalog_metadata = metadata.to_map(catalog_entry['metadata'])
    replication_key = catalog_metadata.get((), {}).get('replication-key')
    stream_version = get_stream_version(catalog_entry, state)
    activate_version_message = singer.ActivateVersionMessage(
        stream=(stream_alias or stream), version=stream_version)

    start_time = singer_utils.now()

    LOGGER.info('Syncing Salesforce data for stream %s', stream)

    for rec in sf.query(catalog_entry, state):
        counter.increment()
        with Transformer(pre_hook=transform_bulk_data_hook) as transformer:
            rec = transformer.transform(rec, schema)
        rec = fix_record_anytype(rec, schema)
        singer.write_message(
            singer.RecordMessage(stream=(stream_alias or stream),
                                 record=rec,
                                 version=stream_version,
                                 time_extracted=start_time))

        replication_key_value = replication_key and singer_utils.strptime_with_tz(
            rec[replication_key])

        if sf.pk_chunking:
            if replication_key_value and replication_key_value <= start_time and replication_key_value > chunked_bookmark:
                # Replace the highest seen bookmark and save the state in case we need to resume later
                chunked_bookmark = singer_utils.strptime_with_tz(
                    rec[replication_key])
                state = singer.write_bookmark(
                    state, catalog_entry['tap_stream_id'],
                    'JobHighestBookmarkSeen',
                    singer_utils.strftime(chunked_bookmark))
                singer.write_state(state)
        # Before writing a bookmark, make sure Salesforce has not given us a
        # record with one outside our range
        elif replication_key_value and replication_key_value <= start_time:
            state = singer.write_bookmark(state,
                                          catalog_entry['tap_stream_id'],
                                          replication_key,
                                          rec[replication_key])
            singer.write_state(state)

    # Tables with no replication_key will send an
    # activate_version message for the next sync
    if not replication_key:
        singer.write_message(activate_version_message)
        state = singer.write_bookmark(state, catalog_entry['tap_stream_id'],
                                      'version', None)

    # If pk_chunking is set, only write a bookmark at the end
    if sf.pk_chunking:
        # Write a bookmark with the highest value we've seen
        state = singer.write_bookmark(state, catalog_entry['tap_stream_id'],
                                      replication_key,
                                      singer_utils.strftime(chunked_bookmark))
Example 6
def sync_stream(stream_name):
    """
    Sync each stream, looking for newly created records. Updates are captured by events stream.
    """
    LOGGER.info("Started syncing stream %s", stream_name)

    stream_metadata = metadata.to_map(
        Context.get_catalog_entry(stream_name)['metadata'])
    stream_field_whitelist = json.loads(
        Context.config.get('whitelist_map', '{}')).get(stream_name)

    extraction_time = singer.utils.now()
    replication_key = metadata.get(stream_metadata, (),
                                   'valid-replication-keys')[0]
    # Invoice Items bookmarks on `date`, but queries on `created`
    filter_key = 'created' if stream_name == 'invoice_items' else replication_key
    stream_bookmark = singer.get_bookmark(Context.state, stream_name, replication_key) or \
        int(utils.strptime_to_utc(Context.config["start_date"]).timestamp())
    bookmark = stream_bookmark

    # if this stream has a sub_stream, compare the bookmark
    sub_stream_name = SUB_STREAMS.get(stream_name)

    # If there is a sub-stream and it's selected, get its bookmark (or the start date if no bookmark)
    should_sync_sub_stream = sub_stream_name and Context.is_selected(
        sub_stream_name)
    if should_sync_sub_stream:
        sub_stream_bookmark = singer.get_bookmark(Context.state, sub_stream_name, replication_key) \
            or int(utils.strptime_to_utc(Context.config["start_date"]).timestamp())

        # if there is a sub stream, set bookmark to sub stream's bookmark
        # since we know it must be earlier than the stream's bookmark
        if sub_stream_bookmark != stream_bookmark:
            bookmark = sub_stream_bookmark
    else:
        sub_stream_bookmark = None

    with Transformer(
            singer.UNIX_SECONDS_INTEGER_DATETIME_PARSING) as transformer:
        end_time = dt_to_epoch(utils.now())
        window_size = int(
            Context.config.get('date_window_size', DEFAULT_DATE_WINDOW_SIZE))
        if DEFAULT_DATE_WINDOW_SIZE != window_size:
            LOGGER.info('Using non-default date window size of %d',
                        window_size)
        start_window = bookmark

        # NB: Immutable streams are never synced for updates. We've
        # observed a short lag period between when records are created and
        # when they are available via the API, so these streams will need
        # a short lookback window.
        if stream_name in IMMUTABLE_STREAMS:
            # pylint:disable=fixme
            # TODO: This may be an issue for other streams' created_at
            # entries, but to keep the surface small, doing this only for
            # immutable streams at first to confirm the suspicion.
            start_window -= IMMUTABLE_STREAM_LOOKBACK

        # NB: We observed records coming through newest->oldest and so
        # date-windowing was added and the tap only bookmarks after it has
        # gotten through a date window
        while start_window < end_time:
            stop_window = dt_to_epoch(
                epoch_to_dt(start_window) + timedelta(days=window_size))
            # cut off the last window at the end time
            if stop_window > end_time:
                stop_window = end_time

            for stream_obj in paginate(
                    STREAM_SDK_OBJECTS[stream_name]['sdk_object'], filter_key,
                    start_window, stop_window):

                # get the replication key value from the object
                rec = unwrap_data_objects(stream_obj.to_dict_recursive())
                rec = reduce_foreign_keys(rec, stream_name)
                stream_obj_created = rec[replication_key]
                rec['updated'] = stream_obj_created

                # sync stream if object is greater than or equal to the bookmark
                if stream_obj_created >= stream_bookmark:
                    rec = transformer.transform(
                        rec,
                        Context.get_catalog_entry(stream_name)['schema'],
                        stream_metadata)

                    # At this point, the record has been transformed and so
                    # any de-selected fields have been pruned. Now, prune off
                    # any fields that aren't present in the whitelist.
                    if stream_field_whitelist:
                        rec = apply_whitelist(rec, stream_field_whitelist)

                    singer.write_record(stream_name,
                                        rec,
                                        time_extracted=extraction_time)

                    Context.new_counts[stream_name] += 1

                # sync sub streams if it's selected and the parent object
                # is greater than its bookmark
                if should_sync_sub_stream and stream_obj_created > sub_stream_bookmark:
                    sync_sub_stream(sub_stream_name, stream_obj)

            # Update stream/sub-stream bookmarks to the stop window
            if stop_window > stream_bookmark:
                stream_bookmark = stop_window
                singer.write_bookmark(Context.state, stream_name,
                                      replication_key, stream_bookmark)

            # the sub stream bookmarks on its parent
            if should_sync_sub_stream and stop_window > sub_stream_bookmark:
                sub_stream_bookmark = stop_window
                singer.write_bookmark(Context.state, sub_stream_name,
                                      replication_key, sub_stream_bookmark)

            singer.write_state(Context.state)

            # update window for next iteration
            start_window = stop_window

    singer.write_state(Context.state)
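dt_to_epoch() and epoch_to_dt() are not shown; the sketch below assumes they simply convert between UTC datetimes and integer epoch seconds, which is what the integer bookmarks and the timedelta-based windowing above imply.

from datetime import datetime, timedelta, timezone


def dt_to_epoch_sketch(dt):
    # Assumed behaviour: timezone-aware datetime -> integer epoch seconds.
    return int(dt.timestamp())


def epoch_to_dt_sketch(epoch_ts):
    # Assumed behaviour: epoch seconds -> UTC datetime.
    return datetime.fromtimestamp(epoch_ts, tz=timezone.utc)


# The loop above then amounts to stepping a [start, stop) window forward by
# `window_size` days until it reaches "now":
window_start = dt_to_epoch_sketch(datetime(2020, 1, 1, tzinfo=timezone.utc))
window_stop = dt_to_epoch_sketch(epoch_to_dt_sketch(window_start) + timedelta(days=30))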
Example 7
def generate_messages(con, config, catalog, state):
    catalog = resolve_catalog(con, catalog, state)

    for catalog_entry in catalog.streams:
        columns = list(catalog_entry.schema.properties.keys())

        if not columns:
            LOGGER.warning(
                'There are no columns selected for stream %s, skipping it.',
                catalog_entry.stream)
            continue

        state = singer.set_currently_syncing(state,
                                             catalog_entry.tap_stream_id)

        # Emit a state message to indicate that we've started this stream
        yield singer.StateMessage(value=copy.deepcopy(state))

        md_map = metadata.to_map(catalog_entry.metadata)

        replication_method = md_map.get((), {}).get('replication-method')
        replication_key = md_map.get((), {}).get('replication-key')

        if catalog_entry.is_view:
            key_properties = md_map.get((), {}).get('view-key-properties')
        else:
            key_properties = md_map.get((), {}).get('table-key-properties')

        with metrics.job_timer('sync_table') as timer:
            timer.tags['database'] = catalog_entry.database
            timer.tags['table'] = catalog_entry.table

            log_engine(con, catalog_entry)

            if replication_method == 'INCREMENTAL':
                LOGGER.info("Stream %s is using incremental replication",
                            catalog_entry.stream)

                yield generate_schema_message(catalog_entry, key_properties,
                                              [replication_key])

                for message in incremental.sync_table(con, catalog_entry,
                                                      state, columns):
                    yield message
            elif replication_method == 'LOG_BASED':
                if catalog_entry.is_view:
                    raise Exception(
                        "Unable to replicate stream({}) with binlog because it is a view."
                        .format(catalog_entry.stream))

                LOGGER.info("Stream %s is using binlog replication",
                            catalog_entry.stream)

                log_file = singer.get_bookmark(state,
                                               catalog_entry.tap_stream_id,
                                               'log_file')

                log_pos = singer.get_bookmark(state,
                                              catalog_entry.tap_stream_id,
                                              'log_pos')

                yield generate_schema_message(catalog_entry, key_properties,
                                              [])

                if log_file and log_pos:
                    columns = binlog.add_automatic_properties(
                        catalog_entry, columns)

                    for message in binlog.sync_table(con, config,
                                                     catalog_entry, state,
                                                     columns):
                        yield message
                else:
                    LOGGER.info("Performing initial full table sync")

                    log_file, log_pos = binlog.fetch_current_log_file_and_pos(
                        con)

                    stream_version = common.get_stream_version(
                        catalog_entry.tap_stream_id, state)

                    state = singer.write_bookmark(state,
                                                  catalog_entry.tap_stream_id,
                                                  'version', stream_version)

                    for message in full_table.sync_table(
                            con, catalog_entry, state, columns,
                            stream_version):
                        yield message

                    state = singer.write_bookmark(state,
                                                  catalog_entry.tap_stream_id,
                                                  'log_file', log_file)

                    state = singer.write_bookmark(state,
                                                  catalog_entry.tap_stream_id,
                                                  'log_pos', log_pos)

                    yield singer.StateMessage(value=copy.deepcopy(state))
            elif replication_method == 'FULL_TABLE':
                LOGGER.info("Stream %s is using full table replication",
                            catalog_entry.stream)

                yield generate_schema_message(catalog_entry, key_properties,
                                              [])

                stream_version = common.get_stream_version(
                    catalog_entry.tap_stream_id, state)

                for message in full_table.sync_table(con, catalog_entry, state,
                                                     columns, stream_version):
                    yield message

                # Prefer initial_full_table_complete going forward
                singer.clear_bookmark(state, catalog_entry.tap_stream_id,
                                      'version')

                state = singer.write_bookmark(state,
                                              catalog_entry.tap_stream_id,
                                              'initial_full_table_complete',
                                              True)

                yield singer.StateMessage(value=copy.deepcopy(state))
            else:
                raise Exception(
                    "only INCREMENTAL, LOG_BASED, and FULL_TABLE replication methods are supported"
                )

    # if we get here, we've finished processing all the streams, so clear
    # currently_syncing from the state and emit a state message.
    state = singer.set_currently_syncing(state, None)
    yield singer.StateMessage(value=copy.deepcopy(state))
Example 8
    def test_catalog(self):
        singer.write_message = singer_write_message

        with get_test_connection() as conn:
            conn.autocommit = True

            catalog = tap_oracle.do_discovery(get_test_conn_config(), [])
            chicken_stream = [
                s for s in catalog.streams if s.table == 'CHICKEN'
            ][0]
            mdata = metadata.to_map(chicken_stream.metadata)

            self.assertEqual(
                mdata, {
                    ('properties', 'AGE'): {
                        'inclusion': 'available',
                        'selected-by-default': True,
                        'sql-datatype': 'NUMBER'
                    },
                    (): {
                        'is-view': False,
                        'row-count': 0,
                        'table-key-properties': [],
                        'schema-name': 'ROOT',
                        'database-name': 'ORCL'
                    },
                    ('properties', 'INTERVAL_COLUMN'): {
                        'inclusion': 'unsupported',
                        'selected-by-default': False,
                        'sql-datatype': 'INTERVAL DAY(2) TO SECOND(6)'
                    }
                })

            chicken_stream = select_all_of_stream(chicken_stream)

            chicken_stream = set_replication_method_for_stream(
                chicken_stream, 'FULL_TABLE')
            cur = conn.cursor()

            cur.execute("""
               INSERT INTO CHICKEN (AGE, INTERVAL_COLUMN) values (3,
                   TIMESTAMP '2001-09-04 17:00:00.000000' - TIMESTAMP '2001-09-03 17:00:00.000000'
               )""")

            state = {}
            tap_oracle.do_sync(get_test_conn_config(), catalog, None, state)

            # messages: Schema, State, ActivateVersion, Record, ActivateVersion, State
            self.assertEqual(6, len(CAUGHT_MESSAGES))
            self.assertTrue(
                isinstance(CAUGHT_MESSAGES[0], singer.SchemaMessage))

            self.assertEqual([], CAUGHT_MESSAGES[0].key_properties)
            self.assertTrue(isinstance(CAUGHT_MESSAGES[1],
                                       singer.StateMessage))
            self.assertTrue(
                isinstance(CAUGHT_MESSAGES[2], singer.ActivateVersionMessage))
            self.assertTrue(
                isinstance(CAUGHT_MESSAGES[3], singer.RecordMessage))
            self.assertEqual({'AGE': 3}, CAUGHT_MESSAGES[3].record)
            self.assertTrue(
                isinstance(CAUGHT_MESSAGES[4], singer.ActivateVersionMessage))
            self.assertTrue(isinstance(CAUGHT_MESSAGES[5],
                                       singer.StateMessage))
Example 9
def is_selected(stream):
    table_md = metadata.to_map(stream.metadata).get((), {})

    return table_md.get('selected') or stream.is_selected()
Example 10
def resolve_catalog(con, catalog, state):
    '''Returns the Catalog of data we're going to sync.

    Takes the Catalog we read from the input file and turns it into a
    Catalog representing exactly which tables and columns we're going to
    emit in this process. Compares the input Catalog to a freshly
    discovered Catalog to determine the resulting Catalog. Returns a new
    instance. The result may differ from the input Catalog in the
    following ways:

      * It will only include streams marked as "selected".
      * We will remove any streams and columns that were selected but do
        not actually exist in the database right now.
      * If the state has a currently_syncing, we will skip to that stream and
        drop all streams appearing before it in the catalog.
      * We will add any columns that were not selected but should be
        automatically included. For example, primary key columns and
        columns used as replication keys.

    '''
    discovered = discover_catalog(con)

    # Filter catalog to include only selected streams
    streams = list(filter(lambda stream: is_selected(stream), catalog.streams))

    # If the state says we were in the middle of processing a stream, skip
    # to that stream.
    currently_syncing = singer.get_currently_syncing(state)
    if currently_syncing:
        streams = dropwhile(lambda s: s.tap_stream_id != currently_syncing,
                            streams)

    result = Catalog(streams=[])

    # Iterate over the streams in the input catalog and match each one up
    # with the same stream in the discovered catalog.
    for catalog_entry in streams:
        catalog_metadata = metadata.to_map(catalog_entry.metadata)
        replication_key = catalog_metadata.get((), {}).get('replication-key')

        discovered_table = discovered.get_stream(catalog_entry.tap_stream_id)
        if not discovered_table:
            LOGGER.warning(
                'Database %s table %s was selected but does not exist',
                catalog_entry.database, catalog_entry.table)
            continue
        selected = set([
            k for k, v in catalog_entry.schema.properties.items()
            if v.selected or k == replication_key
        ])

        # These are the columns we need to select
        columns = desired_columns(selected, discovered_table.schema)

        result.streams.append(
            CatalogEntry(tap_stream_id=catalog_entry.tap_stream_id,
                         metadata=catalog_entry.metadata,
                         stream=catalog_entry.stream,
                         database=catalog_entry.database,
                         table=catalog_entry.table,
                         is_view=catalog_entry.is_view,
                         schema=Schema(
                             type='object',
                             properties={
                                 col: discovered_table.schema.properties[col]
                                 for col in columns
                             })))

    return result
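desired_columns() is referenced but not shown. Per the docstring, it keeps the selected columns that still exist in the freshly discovered schema and always adds automatically included ones. A hypothetical version, assuming the discovered Schema properties carry singer's usual inclusion values ('automatic', 'available', 'unsupported'):

def desired_columns_sketch(selected, table_schema):
    # Hypothetical version of desired_columns(): intersect the selection with
    # the discovered schema, and always include 'automatic' columns
    # (primary keys, replication keys), as the docstring above describes.
    available = set()
    automatic = set()
    for column, column_schema in table_schema.properties.items():
        if column_schema.inclusion == 'automatic':
            automatic.add(column)
        elif column_schema.inclusion == 'available':
            available.add(column)
    return (selected & available) | automatic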
Example 11
def get_database_name(catalog_entry):
    md_map = metadata.to_map(catalog_entry.metadata)

    return md_map.get((), {}).get("database-name")
Example 12
def get_is_view(catalog_entry):
    md_map = metadata.to_map(catalog_entry.metadata)

    return md_map.get((), {}).get("is-view")
Example 13
def stream_is_selected(stream):
    md_map = metadata.to_map(stream.metadata)
    selected_md = metadata.get(md_map, (), "selected")

    return selected_md
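A minimal usage example: the 'selected' flag lives on the stream's top-level (empty) breadcrumb, and SimpleNamespace stands in for a catalog entry here.

from types import SimpleNamespace

stream = SimpleNamespace(metadata=[
    {'breadcrumb': [], 'metadata': {'selected': True,
                                    'replication-method': 'FULL_TABLE'}},
    {'breadcrumb': ['properties', 'id'], 'metadata': {'inclusion': 'automatic'}},
])
assert stream_is_selected(stream) is True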
Example 14
    def do_test(self, conn_id):
        # Select our catalogs
        our_catalogs = [
            c for c in self.found_catalogs
            if c.get('tap_stream_id') in self.expected_sync_streams()
        ]
        for c in our_catalogs:
            c_annotated = menagerie.get_annotated_schema(
                conn_id, c['stream_id'])
            c_metadata = metadata.to_map(c_annotated['metadata'])
            connections.select_catalog_and_fields_via_metadata(
                conn_id, c, c_annotated, [], [])

        # Clear state before our run
        menagerie.set_state(conn_id, {})

        # Run a sync job using orchestrator
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # Verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # Verify actual rows were synced
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks())
        replicated_row_count = reduce(lambda accum, c: accum + c,
                                      record_count_by_stream.values())
        self.assertGreater(replicated_row_count,
                           0,
                           msg="failed to replicate any data: {}".format(
                               record_count_by_stream))
        print("total replicated row count: {}".format(replicated_row_count))

        # Ensure all records have a value for PK(s)
        records = runner.get_records_from_target_output()
        for stream in self.expected_sync_streams():
            messages = records.get(stream, {}).get('messages', [])
            for m in messages:
                pk_set = self.expected_pks()[stream]
                for pk in pk_set:
                    self.assertIsNotNone(m.get('data', {}).get(pk),
                                         msg="oh no! {}".format(m))

        satisfaction_ratings_bookmark = "2020-03-05T14:14:42Z"

        state = menagerie.get_state(conn_id)
        state['bookmarks']['satisfaction_ratings'][
            'updated_at'] = satisfaction_ratings_bookmark
        menagerie.set_state(conn_id, state)

        # Create a new record
        creds = {
            "email": "*****@*****.**",
            "subdomain": self.get_properties()['subdomain'],
            "token": os.getenv('TAP_ZENDESK_API_TOKEN')
        }

        self.client = Zenpy(**creds)

        # Create some new objects
        group_name = str(uuid.uuid4())
        group = Group(name=group_name)
        self.created_group = self.client.groups.create(group)

        org_name = str(uuid.uuid4())
        org = Organization(name=org_name)
        self.created_org = self.client.organizations.create(org)

        user = User(name="John Doe",
                    email="{}@mailinator.com".format(uuid.uuid4()))
        self.created_user = self.client.users.create(user)

        # Sleeping 1 minute to validate lookback behavior needed in tap
        # We've observed a delay between when users are created and when
        # they're available through the API
        print("sleeping for 60 seconds")
        time.sleep(60)

        # Run another Sync
        sync_job_name = runner.run_sync_mode(self, conn_id)
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # Check both sets of records and make sure we have our new rows
        records = runner.get_records_from_target_output()
        messages = records.get('groups', {}).get('messages', [])
        new_record = [
            r for r in messages if r['data']['id'] == self.created_group.id
        ]
        self.assertTrue(any(new_record))
        self.assertEqual(len(messages),
                         2,
                         msg="Sync'd incorrect count of messages: {}".format(
                             len(messages)))

        messages = records.get('organizations', {}).get('messages', [])

        new_record = [
            r for r in messages if r['data']['id'] == self.created_org.id
        ]
        self.assertTrue(any(new_record))
        self.assertEqual(len(messages),
                         2,
                         msg="Sync'd incorrect count of messages: {}".format(
                             len(messages)))

        messages = records.get('users', {}).get('messages', [])
        new_record = [
            r for r in messages if r['data']['id'] == self.created_user.id
        ]
        self.assertTrue(any(new_record))
        # NB: GreaterEqual because we suspect Zendesk updates users in the backend
        # >= 1 because we're no longer inclusive of the last replicated user record. The lookback will control this going forward.
        # If we get the user we wanted and then some, this assertion should succeed
        self.assertGreaterEqual(
            len(messages),
            1,
            msg="Sync'd incorrect count of messages: {}".format(len(messages)))

        messages = records.get('satisfaction_ratings', {}).get('messages', [])
        new_record = [
            r for r in messages
            if r['data']['id'] in [364471784994, 364465631433, 364465212373]
        ]
        self.assertTrue(any(new_record))
        self.assertGreaterEqual(
            len(messages),
            3,
            msg="Sync'd incorrect count of messages: {}".format(len(messages)))
        for message in messages:
            self.assertGreaterEqual(
                utils.strptime_to_utc(
                    message.get('data', {}).get('updated_at', '')),
                utils.strptime_to_utc(satisfaction_ratings_bookmark))
Example 15
def calculate_destination_stream_name(stream):
    s_md = metadata.to_map(stream['metadata'])
    if INCLUDE_SCHEMAS_IN_DESTINATION_STREAM_NAME:
        return "{}_{}".format(s_md.get((), {}).get('database-name'), stream['stream'])

    return stream['stream']
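A quick illustration of the two naming modes, with made-up stream and database names; INCLUDE_SCHEMAS_IN_DESTINATION_STREAM_NAME is assumed to be a module-level flag derived from config.

# Illustrative values only.
stream = {
    'stream': 'orders',
    'metadata': [{'breadcrumb': [], 'metadata': {'database-name': 'shop'}}],
}
# INCLUDE_SCHEMAS_IN_DESTINATION_STREAM_NAME falsy  -> 'orders'
# INCLUDE_SCHEMAS_IN_DESTINATION_STREAM_NAME truthy -> 'shop_orders'
calculate_destination_stream_name(stream)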
Example 16
def sync_tables(conn_config, streams, state, end_scn):
    connection = orc_db.open_connection(conn_config)
    if not verify_db_supplemental_log_level(connection):
        for stream in streams:
            if not verify_table_supplemental_log_level(stream, connection):
                raise Exception("""
      Unable to replicate with logminer for stream({}) because supplmental_log_data is not set to 'ALL' for either the table or the database.
      Please run: ALTER DATABASE ADD SUPPLEMENTAL LOG DATA (ALL) COLUMNS;
            """.format(stream.tap_stream_id))

    cur = connection.cursor()
    cur.execute("ALTER SESSION SET TIME_ZONE = '00:00'")
    cur.execute(
        """ALTER SESSION SET NLS_DATE_FORMAT = 'YYYY-MM-DD"T"HH24:MI:SS."00+00:00"'"""
    )
    cur.execute(
        """ALTER SESSION SET NLS_TIMESTAMP_FORMAT='YYYY-MM-DD"T"HH24:MI:SSXFF"+00:00"'"""
    )
    cur.execute(
        """ALTER SESSION SET NLS_TIMESTAMP_TZ_FORMAT  = 'YYYY-MM-DD"T"HH24:MI:SS.FFTZH:TZM'"""
    )

    start_scn = min(
        [get_bookmark(state, s.tap_stream_id, 'scn') for s in streams])
    time_extracted = utils.now()

    start_logmnr_sql = """BEGIN
                         DBMS_LOGMNR.START_LOGMNR(
                                 startScn => {},
                                 endScn => {},
                                 OPTIONS => DBMS_LOGMNR.DICT_FROM_ONLINE_CATALOG +
                                            DBMS_LOGMNR.COMMITTED_DATA_ONLY +
                                            DBMS_LOGMNR.CONTINUOUS_MINE);
                         END;""".format(start_scn, end_scn)

    LOGGER.info("Starting LogMiner for %s: %s -> %s",
                list(map(lambda s: s.tap_stream_id, streams)), start_scn,
                end_scn)
    LOGGER.info("%s", start_logmnr_sql)
    cur.execute(start_logmnr_sql)

    # mine changes
    for stream in streams:
        md_map = metadata.to_map(stream.metadata)
        desired_columns = [
            c for c in stream.schema.properties.keys()
            if common.should_sync_column(md_map, c)
        ]
        redo_value_sql_clause = ",\n ".join([
            """DBMS_LOGMNR.MINE_VALUE(REDO_VALUE, :{})""".format(idx + 1)
            for idx, c in enumerate(desired_columns)
        ])
        undo_value_sql_clause = ",\n ".join([
            """DBMS_LOGMNR.MINE_VALUE(UNDO_VALUE, :{})""".format(idx + 1)
            for idx, c in enumerate(desired_columns)
        ])

        schema_name = md_map.get(()).get('schema-name')
        stream_version = get_stream_version(stream.tap_stream_id, state)
        mine_sql = """
      SELECT OPERATION, SQL_REDO, SCN, CSCN, COMMIT_TIMESTAMP,  {}, {} from v$logmnr_contents where table_name = :table_name AND operation in ('INSERT', 'UPDATE', 'DELETE')
      """.format(redo_value_sql_clause, undo_value_sql_clause)
        binds = [orc_db.fully_qualified_column_name(schema_name, stream.table, c) for c in desired_columns] + \
                [orc_db.fully_qualified_column_name(schema_name, stream.table, c) for c in desired_columns] + \
                [stream.table]

        rows_saved = 0
        columns_for_record = desired_columns + ['scn', '_sdc_deleted_at']
        with metrics.record_counter(None) as counter:
            LOGGER.info("Examing log for table %s", stream.tap_stream_id)
            common.send_schema_message(stream, ['lsn'])
            for op, redo, scn, cscn, commit_ts, *col_vals in cur.execute(
                    mine_sql, binds):
                redo_vals = col_vals[0:len(desired_columns)]
                undo_vals = col_vals[len(desired_columns):]
                if op == 'INSERT' or op == 'UPDATE':
                    redo_vals += [cscn, None]
                    record_message = row_to_singer_message(
                        stream, redo_vals, stream_version, columns_for_record,
                        time_extracted)
                elif op == 'DELETE':
                    undo_vals += [
                        cscn,
                        singer.utils.strftime(
                            commit_ts.replace(tzinfo=pytz.UTC))
                    ]
                    record_message = row_to_singer_message(
                        stream, undo_vals, stream_version, columns_for_record,
                        time_extracted)
                else:
                    raise Exception(
                        "unrecognized logminer operation: {}".format(op))

                singer.write_message(record_message)
                rows_saved = rows_saved + 1
                counter.increment()
                state = singer.write_bookmark(state, stream.tap_stream_id,
                                              'scn', int(cscn))

                if rows_saved % UPDATE_BOOKMARK_PERIOD == 0:
                    singer.write_message(
                        singer.StateMessage(value=copy.deepcopy(state)))

    for s in streams:
        LOGGER.info("updating bookmark for stream %s to end_lsn %s",
                    s.tap_stream_id, end_scn)
        state = singer.write_bookmark(state, s.tap_stream_id, 'scn', end_scn)
        singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    cur.close()
    connection.close()
    return state
Example 17
def sync(client, config, catalog, state):
    if 'start_date' in config:
        start_date = config['start_date']

    # Get selected_streams from catalog, based on state last_stream
    #   last_stream = Previous currently synced stream, if the load was interrupted
    last_stream = singer.get_currently_syncing(state)
    LOGGER.info('last/currently syncing stream: {}'.format(last_stream))
    selected_streams = []
    for stream in catalog.get_selected_streams(state):
        selected_streams.append(stream.stream)
    LOGGER.info('selected_streams: {}'.format(selected_streams))

    if not selected_streams:
        return

    # Get current datetime (now_dt_str) for query parameters
    now_dt_str = utils.now().strftime('%Y-%m-%d')

    # Loop through selected_streams
    for stream_name in selected_streams:
        LOGGER.info('STARTED Syncing: {}'.format(stream_name))
        update_currently_syncing(state, stream_name)
        write_schema(catalog, stream_name)
        endpoint_config = STREAMS[stream_name]
        bookmark_field = next(iter(endpoint_config.get('replication_keys', [])), None)
        body_params = endpoint_config.get('body', {})
        endpoint_total = 0
        # Initialize body
        body = endpoint_config.get('body', {})
        # Loop through sites from config site_urls
        site_list = []
        if 'site_urls' in config:
            site_list = config['site_urls'].replace(" ", "").split(",")
        for site in site_list:
            LOGGER.info('STARTED Syncing: {}, Site: {}'.format(stream_name, site))
            site_total = 0
            site_encoded = quote(site, safe='')
            path = endpoint_config.get('path').format(site_encoded)

            # Set dimension_list for performance_reports
            if stream_name == 'performance_report_custom':
                dimensions_list = []
                # Create dimensions_list from catalog breadcrumb
                stream = catalog.get_stream(stream_name)
                mdata = metadata.to_map(stream.metadata)
                dimensions_all = ['date', 'country', 'device', 'page', 'query']
                for dim in dimensions_all:
                    if singer.metadata.get(mdata, ('properties', dim), 'selected'):
                        # metadata is selected for the dimension
                        dimensions_list.append(dim)
                body_params['dimensions'] = dimensions_list
            dimensions_list = body_params.get('dimensions')

            # loop through each sub type
            sub_types = endpoint_config.get('sub_types', ['self'])
            for sub_type in sub_types:
                if stream_name.startswith('performance_report'):
                    reports_dttm_str = get_bookmark(
                        state,
                        stream_name,
                        site,
                        sub_type,
                        start_date)
                    reports_dt_str = transform_datetime(reports_dttm_str)[:10]
                    body = {
                        'searchType': sub_type,
                        'startDate': reports_dt_str,
                        'endDate': now_dt_str,
                        **body_params
                    }

                LOGGER.info('START Syncing Stream: {}, Site: {}, Type: {}'.format(
                    stream_name, site, sub_type))
                total_records = sync_endpoint(
                    client=client,
                    catalog=catalog,
                    state=state,
                    start_date=start_date,
                    stream_name=stream_name,
                    site=site,
                    sub_type=sub_type,
                    dimensions_list=dimensions_list,
                    path=path,
                    endpoint_config=endpoint_config,
                    api_method=endpoint_config.get('api_method', 'GET'),
                    pagination=endpoint_config.get('pagination', 'none'),
                    static_params=endpoint_config.get('params', {}),
                    bookmark_field=bookmark_field,
                    bookmark_type=endpoint_config.get('bookmark_type'),
                    data_key=endpoint_config.get('data_key', None),
                    body_params=body,
                    id_fields=endpoint_config.get('key_properties'))

                endpoint_total = endpoint_total + total_records
                site_total = site_total + total_records
                LOGGER.info('FINISHED Syncing Stream: {}, Site: {}, Type: {}'.format(
                    stream_name, site, sub_type))
                LOGGER.info('  Records Synced for Type: {}'.format(total_records))
            LOGGER.info('FINISHED Syncing Stream: {}, Site: {}'.format(stream_name, site))
            LOGGER.info('  Records Synced for Site: {}'.format(site_total))
        LOGGER.info('FINISHED Syncing Stream: {}'.format(stream_name))
        LOGGER.info('  Records Synced for Stream: {}'.format(endpoint_total))
        update_currently_syncing(state, None)
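The loop above reads a handful of keys off each STREAMS entry. Below is a hypothetical entry shaped to match those lookups; the values are illustrative assumptions, not the tap's real configuration.

# Hypothetical STREAMS entry; keys mirror the lookups above, values are guesses.
STREAMS_EXAMPLE = {
    'performance_report_custom': {
        'path': 'sites/{}/searchAnalytics/query',  # formatted with the URL-encoded site
        'api_method': 'POST',
        'replication_keys': ['date'],
        'bookmark_type': 'datetime',
        'key_properties': ['site_url', 'search_type', 'date'],
        'sub_types': ['web', 'image', 'video'],
        'body': {'aggregationType': 'auto'},
        'data_key': 'rows',
        'pagination': 'body',
        'params': {},
    },
}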
Example 18
def discover_catalog(snowflake_conn, config, select_all=False):
    """Returns a Catalog describing the structure of the database."""
    tables = config.get('tables').split(',')
    sql_columns = get_table_columns(snowflake_conn, tables)
    config_meta = config_meta_parser(config)

    table_info = {}
    columns = []
    for sql_col in sql_columns:
        catalog = sql_col['TABLE_CATALOG']
        schema = sql_col['TABLE_SCHEMA']
        table_name = sql_col['TABLE_NAME']

        if catalog not in table_info:
            table_info[catalog] = {}

        if schema not in table_info[catalog]:
            table_info[catalog][schema] = {}

        table_info[catalog][schema][table_name] = {
            'row_count': sql_col.get('ROW_COUNT'),
            'is_view': sql_col.get('TABLE_TYPE') == 'VIEW'
        }

        columns.append(
            Column(
                table_catalog=catalog,
                table_schema=schema,
                table_name=table_name,
                column_name=sql_col['COLUMN_NAME'],
                data_type=sql_col['DATA_TYPE'],
                character_maximum_length=sql_col['CHARACTER_MAXIMUM_LENGTH'],
                numeric_precision=sql_col['NUMERIC_PRECISION'],
                numeric_scale=sql_col['NUMERIC_SCALE']))

    entries = []
    for (k, cols) in itertools.groupby(
            columns, lambda c:
        (c.table_catalog, c.table_schema, c.table_name)):
        cols = list(cols)
        (table_catalog, table_schema, table_name) = k
        schema = Schema(
            type='object',
            properties={c.column_name: schema_for_column(c)
                        for c in cols})
        md = create_column_metadata(cols, select_all)
        md_map = metadata.to_map(md)

        md_map = metadata.write(md_map, (), 'database-name', table_catalog)
        md_map = metadata.write(md_map, (), 'schema-name', table_schema)

        if (table_catalog in table_info
                and table_schema in table_info[table_catalog]
                and table_name in table_info[table_catalog][table_schema]):
            # Row count of views returns NULL; default it to 0 so the value is always an integer
            row_count = table_info[table_catalog][table_schema][
                table_name].get('row_count', 0) or 0
            is_view = table_info[table_catalog][table_schema][table_name][
                'is_view']
            md_map = metadata.write(md_map, (), 'row-count', row_count)
            md_map = metadata.write(md_map, (), 'is-view', is_view)
            # if select_all is True, default replication-method to FULL_TABLE; it will be
            # overridden if the user defined INCREMENTAL in the config metadata
            if select_all:
                md_map = metadata.write(md_map, (), 'replication-method',
                                        'FULL_TABLE')

            # check config to see if there was optional rolling-lookback defined, inject into catalog if so
            full_table_name = f'{table_catalog}-{table_schema}-{table_name}'.upper()
            rolling = config.get('rolling-lookback')
            if rolling and full_table_name in rolling:
                rolling_table_meta = rolling.get(full_table_name)
                md_map = metadata.write(md_map, (), 'rolling-lookback',
                                        rolling_table_meta)

            # check config to see if there was optional metadata defined already
            full_table_name = f'{table_catalog}.{table_schema}.{table_name}'.upper()
            if config_meta and full_table_name in config_meta:
                table_meta = config_meta.get(full_table_name)
                for meta_key, meta_value in table_meta.items():
                    md_map = metadata.write(md_map, (), meta_key, meta_value)

            entry = CatalogEntry(table=table_name,
                                 stream=table_name,
                                 metadata=metadata.to_list(md_map),
                                 tap_stream_id=common.generate_tap_stream_id(
                                     table_catalog, table_schema, table_name),
                                 schema=schema)

            entries.append(entry)

    return Catalog(entries)
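The optional config hooks read above ('tables', 'rolling-lookback', and the per-table metadata produced by config_meta_parser()) imply a config roughly shaped like the sketch below; the identifiers and values are assumptions for illustration only.

# Hypothetical config, shaped to match how discover_catalog() consumes it.
CONFIG_EXAMPLE = {
    'tables': 'ANALYTICS.PUBLIC.ORDERS,ANALYTICS.PUBLIC.CUSTOMERS',
    # keyed '<CATALOG>-<SCHEMA>-<TABLE>' (upper-cased); the value is written
    # verbatim into the stream's 'rolling-lookback' metadata
    'rolling-lookback': {'ANALYTICS-PUBLIC-ORDERS': {'days': 30}},
}

# And the kind of mapping config_meta_parser() is assumed to return, keyed
# '<CATALOG>.<SCHEMA>.<TABLE>' (upper-cased): per-table metadata overrides,
# e.g. switching a stream to INCREMENTAL replication.
CONFIG_META_EXAMPLE = {
    'ANALYTICS.PUBLIC.ORDERS': {
        'replication-method': 'INCREMENTAL',
        'replication-key': 'UPDATED_AT',
    },
}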
Example 19
def sync_table(conn_config, stream, state, desired_columns):
    connection = orc_db.open_connection(conn_config)
    connection.outputtypehandler = common.OutputTypeHandler

    cur = connection.cursor()
    cur.execute("ALTER SESSION SET TIME_ZONE = '00:00'")
    cur.execute(
        """ALTER SESSION SET NLS_DATE_FORMAT = 'YYYY-MM-DD"T"HH24:MI:SS."00+00:00"'"""
    )
    cur.execute(
        """ALTER SESSION SET NLS_TIMESTAMP_FORMAT='YYYY-MM-DD"T"HH24:MI:SSXFF"+00:00"'"""
    )
    cur.execute(
        """ALTER SESSION SET NLS_TIMESTAMP_TZ_FORMAT  = 'YYYY-MM-DD"T"HH24:MI:SS.FFTZH:TZM'"""
    )
    time_extracted = utils.now()

    # before writing the table version to state, check if we had one to begin with
    first_run = singer.get_bookmark(state, stream.tap_stream_id,
                                    'version') is None

    # pick a new table version IFF we do not have an ORA_ROWSCN in our state
    # the presence of an ORA_ROWSCN indicates that we were interrupted last time through
    if singer.get_bookmark(state, stream.tap_stream_id, 'ORA_ROWSCN') is None:
        nascent_stream_version = int(time.time() * 1000)
    else:
        nascent_stream_version = singer.get_bookmark(state,
                                                     stream.tap_stream_id,
                                                     'version')

    state = singer.write_bookmark(state, stream.tap_stream_id, 'version',
                                  nascent_stream_version)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    md = metadata.to_map(stream.metadata)
    schema_name = md.get(()).get('schema-name')

    escaped_columns = map(lambda c: common.prepare_columns_sql(stream, c),
                          desired_columns)
    escaped_schema = schema_name
    escaped_table = stream.table
    activate_version_message = singer.ActivateVersionMessage(
        stream=stream.tap_stream_id, version=nascent_stream_version)

    if first_run:
        singer.write_message(activate_version_message)

    with metrics.record_counter(None) as counter:
        ora_rowscn = singer.get_bookmark(state, stream.tap_stream_id,
                                         'ORA_ROWSCN')
        if ora_rowscn:
            LOGGER.info(
                "Resuming Full Table replication %s from ORA_ROWSCN %s",
                nascent_stream_version, ora_rowscn)
            select_sql = """SELECT {}, ORA_ROWSCN
                                FROM {}.{}
                               WHERE ORA_ROWSCN >= {}
                               ORDER BY ORA_ROWSCN ASC
                                """.format(','.join(escaped_columns),
                                           escaped_schema, escaped_table,
                                           ora_rowscn)
        else:
            select_sql = """SELECT {}, ORA_ROWSCN
                                FROM {}.{}
                               ORDER BY ORA_ROWSCN ASC""".format(
                ','.join(escaped_columns), escaped_schema, escaped_table)

        rows_saved = 0
        LOGGER.info("select %s", select_sql)
        for row in cur.execute(select_sql):
            ora_rowscn = row[-1]
            row = row[:-1]
            record_message = common.row_to_singer_message(
                stream, row, nascent_stream_version, desired_columns,
                time_extracted)

            singer.write_message(record_message)
            state = singer.write_bookmark(state, stream.tap_stream_id,
                                          'ORA_ROWSCN', ora_rowscn)
            rows_saved = rows_saved + 1
            if rows_saved % UPDATE_BOOKMARK_PERIOD == 0:
                singer.write_message(
                    singer.StateMessage(value=copy.deepcopy(state)))

            counter.increment()

    state = singer.write_bookmark(state, stream.tap_stream_id, 'ORA_ROWSCN',
                                  None)
    # always send the activate version whether first run or subsequent
    singer.write_message(activate_version_message)
    cur.close()
    connection.close()
    return state
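
A sketch of the bookmark shape this resumable full-table sync reads and writes; the stream id and values below are made up, only the bookmark keys ('version', 'ORA_ROWSCN') come from the code above:

import singer

# Hypothetical stream id and values; only the bookmark keys mirror the code above.
state = {}
state = singer.write_bookmark(state, 'ORCL-HR-EMPLOYEES', 'version', 1700000000000)
state = singer.write_bookmark(state, 'ORCL-HR-EMPLOYEES', 'ORA_ROWSCN', 123456789)
# state is now {'bookmarks': {'ORCL-HR-EMPLOYEES': {'version': 1700000000000,
#                                                   'ORA_ROWSCN': 123456789}}}
# A non-None ORA_ROWSCN on the next run means the previous version is kept and
# the SELECT resumes with WHERE ORA_ROWSCN >= 123456789.
assert singer.get_bookmark(state, 'ORCL-HR-EMPLOYEES', 'ORA_ROWSCN') == 123456789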
Esempio n. 20
0
@classmethod
def is_selected(cls, stream_name):
    stream = cls.get_catalog_entry(stream_name)
    stream_metadata = metadata.to_map(stream['metadata'])
    return metadata.get(stream_metadata, (), 'selected')
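
A small usage sketch of what this helper inspects: table-level selection lives under the empty breadcrumb () in the stream's metadata (the catalog entry below is illustrative):

from singer import metadata

# Illustrative catalog entry, not from a real tap run.
stream = {
    'metadata': [
        {'breadcrumb': [], 'metadata': {'selected': True}},
        {'breadcrumb': ['properties', 'id'], 'metadata': {'inclusion': 'automatic'}},
    ]
}
stream_metadata = metadata.to_map(stream['metadata'])
print(metadata.get(stream_metadata, (), 'selected'))  # True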
Esempio n. 21
0
def discover_catalog(mysql_conn, config):
    '''Returns a Catalog describing the structure of the database.'''

    filter_dbs_config = config.get('filter_dbs')

    if filter_dbs_config:
        filter_dbs_clause = ",".join(
            ["'{}'".format(db) for db in filter_dbs_config.split(",")])

        table_schema_clause = "WHERE table_schema IN ({})".format(
            filter_dbs_clause)
    else:
        table_schema_clause = """
        WHERE table_schema NOT IN (
        'information_schema',
        'performance_schema',
        'mysql',
        'sys'
        )"""

    with connect_with_backoff(mysql_conn) as open_conn:
        with open_conn.cursor() as cur:
            cur.execute("""
            SELECT table_schema,
                   table_name,
                   table_type,
                   table_rows
                FROM information_schema.tables
                {}
            """.format(table_schema_clause))

            table_info = {}

            for (db, table, table_type, rows) in cur.fetchall():
                if db not in table_info:
                    table_info[db] = {}

                table_info[db][table] = {
                    'row_count': rows,
                    'is_view': table_type == 'VIEW'
                }

            cur.execute("""
                SELECT table_schema,
                       table_name,
                       column_name,
                       data_type,
                       character_maximum_length,
                       numeric_precision,
                       numeric_scale,
                       column_type,
                       column_key
                    FROM information_schema.columns
                    {}
                    ORDER BY table_schema, table_name
            """.format(table_schema_clause))

            columns = []
            rec = cur.fetchone()
            while rec is not None:
                columns.append(Column(*rec))
                rec = cur.fetchone()

            entries = []
            for (k, cols) in itertools.groupby(
                    columns, lambda c: (c.table_schema, c.table_name)):
                cols = list(cols)
                (table_schema, table_name) = k
                schema = Schema(type='object',
                                properties={
                                    c.column_name: schema_for_column(c)
                                    for c in cols
                                })
                md = create_column_metadata(cols)
                md_map = metadata.to_map(md)

                md_map = metadata.write(md_map, (), 'database-name',
                                        table_schema)

                is_view = table_info[table_schema][table_name]['is_view']

                if table_schema in table_info and table_name in table_info[
                        table_schema]:
                    row_count = table_info[table_schema][table_name].get(
                        'row_count')

                    if row_count is not None:
                        md_map = metadata.write(md_map, (), 'row-count',
                                                row_count)

                    md_map = metadata.write(md_map, (), 'is-view', is_view)

                column_is_key_prop = lambda c, s: (
                    c.column_key == 'PRI' and s.properties[
                        c.column_name].inclusion != 'unsupported')

                key_properties = [
                    c.column_name for c in cols
                    if column_is_key_prop(c, schema)
                ]

                if not is_view:
                    md_map = metadata.write(md_map, (), 'table-key-properties',
                                            key_properties)

                entry = CatalogEntry(
                    table=table_name,
                    stream=table_name,
                    metadata=metadata.to_list(md_map),
                    tap_stream_id=common.generate_tap_stream_id(
                        table_schema, table_name),
                    schema=schema)

                entries.append(entry)

    return Catalog(entries)
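
A hypothetical driver for this discovery path, assuming the connection and config objects come from the tap's entry point; it simply serialises the discovered catalog the way a --discover run usually does:

# Hypothetical wrapper; Catalog.dump() writes the JSON catalog to stdout.
def do_discover(mysql_conn, config):
    catalog = discover_catalog(mysql_conn, config)
    catalog.dump()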
Esempio n. 22
0
def sync_sub_stream(sub_stream_name, parent_obj, updates=False):
    """
    Given a parent object, retrieve its values for the specified substream.
    """
    extraction_time = singer.utils.now()

    if sub_stream_name == "invoice_line_items":
        object_list = parent_obj.lines
    elif sub_stream_name == "subscription_items":
        # parent_obj.items collides with dict.items(), so fetch the value with .get("items") instead
        object_list = parent_obj.get("items")
    elif sub_stream_name == "payout_transactions":
        payout_id = parent_obj['id']
        acct_id = Context.config.get('account_id')
        # Balance transaction history with a payout id param
        # provides the link of transactions to payouts
        if 'automatic' in parent_obj and parent_obj['automatic']:
            object_list = stripe.BalanceTransaction.list(
                limit=100, stripe_account=acct_id, payout=payout_id)
        else:
            # According to the API docs balance history is only available
            # for automatic stripe payouts.
            # https://stripe.com/docs/api/balance/balance_history#balance_history-payout
            LOGGER.info(
                'Skipping retrieval of balance history for manual payout %s',
                payout_id)
            return
    else:
        raise Exception(
            "Attempted to sync substream that is not implemented: {}".format(
                sub_stream_name))

    substream_count = 0
    expected_count = None
    # The following code arose because we encountered a bug in the API
    # whereby we enter an infinite loop based on what appears to be bad
    # API behavior on Stripe's end, which is [acknowledged by their
    # team][1]
    #
    # [1]: https://github.com/stripe/stripe-python/issues/567#issuecomment-490957400
    #
    # Our workaround is to rely on the `total_count` of the object_list if
    # we have it (in the case of the affected sub stream,
    # `invoice_line_items`, it has that attribute. Presumably they all
    # have it but the following code was written out of an abundance of
    # caution.) to track whether we've emitted more records than the API
    # told us it had. This may be brittle but we'll have to prove that out
    # in the wild. To make it as robust as possible we're currently
    # restricting it to the `invoice_line_items` substream only. If it
    # were to prove useful elsewhere we will need to increase the
    # complexity of the ValueError generated in the event of an infinite
    # loop to emit other urls.
    if sub_stream_name == 'invoice_line_items' and hasattr(
            object_list, 'total_count'):
        LOGGER.debug(
            "Will verify substream sync using the object_list's total_count.")
        expected_count = object_list.total_count
    else:
        LOGGER.debug(("Will not verify substream sync because object_list "
                      "has no total_count attribute or is not "
                      "invoice_line_items substream."))

    with Transformer(
            singer.UNIX_SECONDS_INTEGER_DATETIME_PARSING) as transformer:
        iterator = get_object_list_iterator(object_list)
        for sub_stream_obj in iterator:
            if expected_count:
                substream_count += 1
                if (expected_count + INITIAL_SUB_STREAM_OBJECT_LIST_LENGTH
                    ) < substream_count:
                    # If we detect that the total records are greater than
                    # the first page length (10) plus the expected total,
                    # we can confidently say we are in an infinite loop.
                    raise ValueError((
                        "Infinite loop detected. Please contact Stripe "
                        "support with the following curl request: `curl "
                        "-v -H 'Stripe-Account: <redacted>' -H "
                        "'Stripe-Version: {}' -u '<redacted>:' -G "
                        "--data-urlencode 'limit=100' "
                        "https://api.stripe.com/v1/invoices/{}/lines`. "
                        "You can reference the following Github issue "
                        "in your conversation with Stripe support: "
                        "https://github.com/stripe/stripe-python/issues/567#issuecomment-490957400"
                    ).format(stripe.api_version, parent_obj.id))
            obj_ad_dict = sub_stream_obj.to_dict_recursive()

            if sub_stream_name == "invoice_line_items":
                # Synthetic addition of a key to the record we sync
                obj_ad_dict["invoice"] = parent_obj.id
            elif sub_stream_name == "payout_transactions":
                # payout_transactions is a join table
                obj_ad_dict = {
                    "id": obj_ad_dict['id'],
                    "payout_id": parent_obj['id']
                }

            rec = transformer.transform(
                unwrap_data_objects(obj_ad_dict),
                Context.get_catalog_entry(sub_stream_name)['schema'],
                metadata.to_map(
                    Context.get_catalog_entry(sub_stream_name)['metadata']))
            # NB: Older structures (such as invoice_line_items) may not have had their ID present.
            #     Skip these if they don't match the structure we expect.
            if "id" in rec:
                singer.write_record(sub_stream_name,
                                    rec,
                                    time_extracted=extraction_time)
            if updates:
                Context.updated_counts[sub_stream_name] += 1
            else:
                Context.new_counts[sub_stream_name] += 1
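
A sketch of the two record shapes synthesised above and of the infinite-loop guard's arithmetic; the ids and counts below are made up:

# Hypothetical records: invoice_line_items gains a synthetic 'invoice' foreign
# key, while payout_transactions is reduced to a pure join record.
invoice_line_item = {"id": "il_123", "amount": 5000, "invoice": "in_456"}
payout_transaction = {"id": "txn_789", "payout_id": "po_012"}

# The guard trips once more records have been emitted than the reported total
# plus the first page length (INITIAL_SUB_STREAM_OBJECT_LIST_LENGTH, 10 per the
# comment above); this is the condition under which the loop raises ValueError.
expected_count = 25
INITIAL_SUB_STREAM_OBJECT_LIST_LENGTH = 10
substream_count = 36
assert (expected_count + INITIAL_SUB_STREAM_OBJECT_LIST_LENGTH) < substream_count
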
def consume_message(streams, state, msg, time_extracted, conn_info):
    # Strip leading comma generated by write-in-chunks and parse valid JSON
    try:
        payload = json.loads(msg.payload.lstrip(','))
    except Exception:
        return state

    lsn = msg.data_start

    streams_lookup = {s['tap_stream_id']: s for s in streams}

    tap_stream_id = post_db.compute_tap_stream_id(payload['schema'],
                                                  payload['table'])
    if streams_lookup.get(tap_stream_id) is None:
        return state

    target_stream = streams_lookup[tap_stream_id]

    if payload['kind'] not in {'insert', 'update', 'delete'}:
        raise UnsupportedPayloadKindError(
            "unrecognized replication operation: {}".format(payload['kind']))

    # Get the additional fields in the payload that are not in the schema properties:
    # only inserts and updates carry the list of columns, which can be used to detect any difference in columns
    diff = set()
    if payload['kind'] in {'insert', 'update'}:
        diff = set(payload['columnnames']).difference(
            target_stream['schema']['properties'].keys())

    # if there are new columns in the payload that are not in the schema properties, refresh the stream schema
    if diff:
        LOGGER.info(
            'Detected new columns "%s", refreshing schema of stream %s', diff,
            target_stream['stream'])
        # encountered a column that is not in the schema
        # refresh the stream schema and metadata by running discovery
        refresh_streams_schema(conn_info, [target_stream])

        # add the automatic properties back to the stream
        add_automatic_properties(target_stream,
                                 conn_info.get('debug_lsn', False))

        # publish new schema
        sync_common.send_schema_message(target_stream, ['lsn'])

    stream_version = get_stream_version(target_stream['tap_stream_id'], state)
    stream_md_map = metadata.to_map(target_stream['metadata'])

    desired_columns = {
        c
        for c in target_stream['schema']['properties'].keys()
        if sync_common.should_sync_column(stream_md_map, c)
    }

    if payload['kind'] in {'insert', 'update'}:
        col_names = []
        col_vals = []

        for idx, col in enumerate(payload['columnnames']):
            if col in desired_columns:
                col_names.append(col)
                col_vals.append(payload['columnvalues'][idx])

        col_names.append('_sdc_deleted_at')
        col_vals.append(None)

        if conn_info.get('debug_lsn'):
            col_names.append('_sdc_lsn')
            col_vals.append(str(lsn))

        record_message = row_to_singer_message(target_stream, col_vals,
                                               stream_version, col_names,
                                               time_extracted, stream_md_map,
                                               conn_info)

    elif payload['kind'] == 'delete':
        col_names = []
        col_vals = []
        for idx, col in enumerate(payload['oldkeys']['keynames']):
            if col in desired_columns:
                col_names.append(col)
                col_vals.append(payload['oldkeys']['keyvalues'][idx])

        col_names.append('_sdc_deleted_at')
        col_vals.append(singer.utils.strftime(time_extracted))

        if conn_info.get('debug_lsn'):
            col_names.append('_sdc_lsn')
            col_vals.append(str(lsn))

        record_message = row_to_singer_message(target_stream, col_vals,
                                               stream_version, col_names,
                                               time_extracted, stream_md_map,
                                               conn_info)

    singer.write_message(record_message)
    state = singer.write_bookmark(state, target_stream['tap_stream_id'], 'lsn',
                                  lsn)

    return state
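
For reference, illustrative change payloads of the shape consume_message parses (wal2json-style; all schema, table, and value details below are made up):

# Inserts/updates carry columnnames/columnvalues; deletes carry only the old
# key values under 'oldkeys'. Everything here is hypothetical.
insert_payload = {
    "kind": "insert",
    "schema": "public",
    "table": "orders",
    "columnnames": ["id", "status"],
    "columnvalues": [42, "shipped"],
}
delete_payload = {
    "kind": "delete",
    "schema": "public",
    "table": "orders",
    "oldkeys": {"keynames": ["id"], "keyvalues": [42]},
}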