Example #1
def sync_table(connection, catalog_entry, state, columns, stream_version):
    common.whitelist_bookmark_keys(BOOKMARK_KEYS, catalog_entry.tap_stream_id,
                                   state)

    bookmark = state.get('bookmarks', {}).get(catalog_entry.tap_stream_id, {})
    version_exists = 'version' in bookmark

    initial_full_table_complete = singer.get_bookmark(
        state, catalog_entry.tap_stream_id, 'initial_full_table_complete')

    state_version = singer.get_bookmark(state, catalog_entry.tap_stream_id,
                                        'version')

    activate_version_message = singer.ActivateVersionMessage(
        stream=catalog_entry.stream, version=stream_version)

    # For the initial replication, emit an ACTIVATE_VERSION message
    # at the beginning so the records show up right away.
    if not initial_full_table_complete and not (version_exists
                                                and state_version is None):
        yield activate_version_message

    with connection.cursor() as cursor:
        select_sql = common.generate_select_sql(catalog_entry, columns)

        params = {}

        for message in common.sync_query(cursor, catalog_entry, state,
                                         select_sql, columns, stream_version,
                                         params):
            yield message

    yield activate_version_message
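
A minimal caller sketch (hedged; the connection, catalog_entry, state,
columns and stream_version names are assumed from the surrounding context),
since this variant is a generator and someone has to drain it:

for message in sync_table(connection, catalog_entry, state, columns,
                          stream_version):
    singer.write_message(message)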
Example #2
def sync_table(snowflake_conn, catalog_entry, state, columns):
    """Sync table incrementally"""
    common.whitelist_bookmark_keys(BOOKMARK_KEYS, catalog_entry.tap_stream_id,
                                   state)

    catalog_metadata = metadata.to_map(catalog_entry.metadata)
    stream_metadata = catalog_metadata.get((), {})

    replication_key_metadata = stream_metadata.get('replication-key')
    replication_key_state = singer.get_bookmark(state,
                                                catalog_entry.tap_stream_id,
                                                'replication_key')

    replication_key_value = None

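    # Reuse the saved bookmark value only while the replication key in the
    # catalog still matches the one recorded in state; otherwise reset it.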
    if replication_key_metadata == replication_key_state:
        replication_key_value = singer.get_bookmark(
            state, catalog_entry.tap_stream_id, 'replication_key_value')
    else:
        state = singer.write_bookmark(state, catalog_entry.tap_stream_id,
                                      'replication_key',
                                      replication_key_metadata)
        state = singer.clear_bookmark(state, catalog_entry.tap_stream_id,
                                      'replication_key_value')

    stream_version = common.get_stream_version(catalog_entry.tap_stream_id,
                                               state)
    state = singer.write_bookmark(state, catalog_entry.tap_stream_id,
                                  'version', stream_version)

    activate_version_message = singer.ActivateVersionMessage(
        stream=catalog_entry.stream, version=stream_version)

    singer.write_message(activate_version_message)

    with snowflake_conn.connect_with_backoff() as open_conn:
        with open_conn.cursor() as cur:
            select_sql = common.generate_select_sql(catalog_entry, columns)
            params = {}

            if replication_key_value is not None:
                if catalog_entry.schema.properties[
                        replication_key_metadata].format == 'date-time':
                    replication_key_value = pendulum.parse(
                        replication_key_value)

                # pylint: disable=duplicate-string-formatting-argument
                select_sql += ' WHERE "{}" >= \'{}\' ORDER BY "{}" ASC'.format(
                    replication_key_metadata, replication_key_value,
                    replication_key_metadata)

            elif replication_key_metadata is not None:
                select_sql += ' ORDER BY "{}" ASC'.format(
                    replication_key_metadata)

            common.sync_query(cur, catalog_entry, state, select_sql, columns,
                              stream_version, params)
Example #3
def sync_file(bucket, s3_path, stream, version=None):
    LOGGER.info('Syncing file "%s".', s3_path)

    table_name = stream['stream']

    s3_file_handle = s3.get_file_handle(bucket, s3_path)
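    # fastavro needs a raw byte stream, so this reaches into the S3
    # response object's underlying (private) stream attribute.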
    iterator = fastavro.reader(s3_file_handle._raw_stream)
    mdata = metadata.to_map(stream['metadata'])
    schema = generate_schema_from_avro(iterator.schema)

    key_properties = metadata.get(mdata, (), 'table-key-properties')
    singer.write_schema(table_name, schema, key_properties)

    # Activate a version so we execute a full table sync
    if version is not None:
        LOGGER.info('Sending Activate Version Message with version %d',
                    version)
        message = singer.ActivateVersionMessage(stream=table_name,
                                                version=version)
        singer.write_message(message)

    records_synced = 0
    with Transformer() as transformer:
        for row in iterator:
            to_write = transformer.filter_data_by_metadata(row, mdata)
            singer.write_message(
                singer.RecordMessage(table_name, to_write, version=version))
            records_synced += 1

    return records_synced
Example #4
def sync_table(mysql_conn, catalog_entry, state, columns, stream_version):
    common.whitelist_bookmark_keys(generate_bookmark_keys(catalog_entry),
                                   catalog_entry.tap_stream_id, state)

    bookmark = state.get('bookmarks', {}).get(catalog_entry.tap_stream_id, {})
    version_exists = 'version' in bookmark

    initial_full_table_complete = singer.get_bookmark(
        state, catalog_entry.tap_stream_id, 'initial_full_table_complete')

    state_version = singer.get_bookmark(state, catalog_entry.tap_stream_id,
                                        'version')

    activate_version_message = singer.ActivateVersionMessage(
        stream=catalog_entry.stream, version=stream_version)

    # For the initial replication, emit an ACTIVATE_VERSION message
    # at the beginning so the records show up right away.
    if not initial_full_table_complete and not (version_exists
                                                and state_version is None):
        singer.write_message(activate_version_message)

    perform_resumable_sync = sync_is_resumable(mysql_conn, catalog_entry)

    pk_clause = ""

    with connect_with_backoff(mysql_conn) as open_conn:
        with open_conn.cursor() as cur:
            select_sql = common.generate_select_sql(catalog_entry, columns)

            if perform_resumable_sync:
                LOGGER.info(
                    "Full table sync is resumable based on primary key definition, will replicate incrementally"
                )

                state = update_incremental_full_table_state(
                    catalog_entry, state, cur)
                pk_clause = generate_pk_clause(catalog_entry, state)

            select_sql += pk_clause

            try:
                select_sql = _create_temp_table(mysql_conn, catalog_entry,
                                                columns, pk_clause)
            except Exception as ex:
                LOGGER.warning("creating temp table failed: %s", str(ex))

            params = {}

            common.sync_query(cur, catalog_entry, state, select_sql, columns,
                              stream_version, params)

    # clear max pk value and last pk fetched upon successful sync
    singer.clear_bookmark(state, catalog_entry.tap_stream_id, 'max_pk_values')
    singer.clear_bookmark(state, catalog_entry.tap_stream_id,
                          'last_pk_fetched')

    singer.write_message(activate_version_message)
Example #5
def sync_view(conn_config, stream, state, desired_columns):
    connection = orc_db.open_connection(conn_config)
    connection.outputtypehandler = common.OutputTypeHandler

    cur = connection.cursor()
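    # Pin the session to UTC and ISO 8601-style NLS formats so date and
    # timestamp values serialize consistently into Singer records.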
    cur.execute("ALTER SESSION SET TIME_ZONE = '00:00'")
    cur.execute(
        """ALTER SESSION SET NLS_DATE_FORMAT = 'YYYY-MM-DD"T"HH24:MI:SS."00+00:00"'"""
    )
    cur.execute(
        """ALTER SESSION SET NLS_TIMESTAMP_FORMAT='YYYY-MM-DD"T"HH24:MI:SSXFF"+00:00"'"""
    )
    cur.execute(
        """ALTER SESSION SET NLS_TIMESTAMP_TZ_FORMAT  = 'YYYY-MM-DD"T"HH24:MI:SS.FFTZH:TZM'"""
    )
    time_extracted = utils.now()

    #before writing the table version to state, check if we had one to begin with
    first_run = singer.get_bookmark(state, stream.tap_stream_id,
                                    'version') is None

    #pick a new table version
    nascent_stream_version = int(time.time() * 1000)
    state = singer.write_bookmark(state, stream.tap_stream_id, 'version',
                                  nascent_stream_version)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    md = metadata.to_map(stream.metadata)
    schema_name = md.get(()).get('schema-name')

    escaped_columns = map(lambda c: common.prepare_columns_sql(stream, c),
                          desired_columns)
    escaped_schema = schema_name
    escaped_table = stream.table
    activate_version_message = singer.ActivateVersionMessage(
        stream=stream.tap_stream_id, version=nascent_stream_version)

    if first_run:
        singer.write_message(activate_version_message)

    with metrics.record_counter(None) as counter:
        select_sql = 'SELECT {} FROM {}.{}'.format(','.join(escaped_columns),
                                                   escaped_schema,
                                                   escaped_table)

        LOGGER.info("select %s", select_sql)
        for row in cur.execute(select_sql):
            record_message = common.row_to_singer_message(
                stream, row, nascent_stream_version, desired_columns,
                time_extracted)
            singer.write_message(record_message)
            counter.increment()

    #always send the activate version whether first run or subsequent
    singer.write_message(activate_version_message)
    cur.close()
    connection.close()
    return state
Example #6
def sync_stream(kafka_config, stream, state):
    consumer = KafkaConsumer(
        kafka_config['topic'],
        group_id=kafka_config['group_id'],
        enable_auto_commit=False,
        consumer_timeout_ms=kafka_config.get('consumer_timeout_ms', 10000),
        auto_offset_reset='earliest',
        value_deserializer=lambda m: json.loads(m.decode('ascii')),
        bootstrap_servers=kafka_config['bootstrap_servers'])

    send_schema_message(stream)
    stream_version = singer.get_bookmark(state, stream['tap_stream_id'],
                                         'version')
    if stream_version is None:
        stream_version = int(time.time() * 1000)

    state = singer.write_bookmark(state, stream['tap_stream_id'], 'version',
                                  stream_version)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
    activate_version_message = singer.ActivateVersionMessage(
        stream=stream['tap_stream_id'], version=stream_version)

    singer.write_message(activate_version_message)

    time_extracted = utils.now()
    rows_saved = 0
    for message in consumer:
        LOGGER.info("%s:%s:%s: key=%s value=%s", message.topic,
                    message.partition, message.offset, message.key,
                    message.value)
        record = singer.RecordMessage(stream=stream['tap_stream_id'],
                                      record=message.value,
                                      time_extracted=time_extracted)

        [valid, error] = validate_record(stream['schema'], record)
        rows_saved = rows_saved + 1

        if valid:
            singer.write_message(record)
        elif kafka_config.get('reject_topic'):
            send_reject_message(kafka_config, record, error)
        else:
            raise Exception(
                "record failed validation and no reject_topic was specified")

        state = singer.write_bookmark(state, stream['tap_stream_id'], 'offset',
                                      message.offset)

        #commit offsets because we processed the message
        tp = TopicPartition(message.topic, message.partition)
        consumer.commit({tp: OffsetAndMetadata(message.offset + 1, None)})

        if rows_saved % UPDATE_BOOKMARK_PERIOD == 0:
            singer.write_message(
                singer.StateMessage(value=copy.deepcopy(state)))

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
Example #7
def sync_table(mysql_conn,
               catalog_entry,
               state,
               columns,
               original_state_file=''):
    common.whitelist_bookmark_keys(BOOKMARK_KEYS, catalog_entry.tap_stream_id,
                                   state)

    catalog_metadata = metadata.to_map(catalog_entry.metadata)
    stream_metadata = catalog_metadata.get((), {})

    replication_key_metadata = stream_metadata.get('replication-key')
    replication_key_state = singer.get_bookmark(state,
                                                catalog_entry.tap_stream_id,
                                                'replication_key')

    replication_key_value = None

    if replication_key_metadata == replication_key_state:
        replication_key_value = singer.get_bookmark(
            state, catalog_entry.tap_stream_id, 'replication_key_value')
    else:
        state = singer.write_bookmark(state, catalog_entry.tap_stream_id,
                                      'replication_key',
                                      replication_key_metadata)
        state = singer.clear_bookmark(state, catalog_entry.tap_stream_id,
                                      'replication_key_value')

    stream_version = common.get_stream_version(catalog_entry.tap_stream_id,
                                               state)
    state = singer.write_bookmark(state, catalog_entry.tap_stream_id,
                                  'version', stream_version)

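    # The destination stream name is prefixed with the database name,
    # e.g. to keep identically named tables from different databases apart.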
    activate_version_message = singer.ActivateVersionMessage(
        stream='%s_%s' %
        (common.get_database_name(catalog_entry), catalog_entry.stream),
        version=stream_version)

    singer.write_message(activate_version_message)

    with connect_with_backoff(mysql_conn) as open_conn:
        with open_conn.cursor() as cur:
            select_sql = common.generate_select_sql(catalog_entry, columns)
            params = {}

            if replication_key_value is not None:
                if catalog_entry.schema.properties[
                        replication_key_metadata].format == 'date-time':
                    replication_key_value = pendulum.parse(
                        replication_key_value)

                select_sql += ' WHERE `{}` >= %(replication_key_value)s ORDER BY `{}` ASC'.format(
                    replication_key_metadata, replication_key_metadata)

                params['replication_key_value'] = replication_key_value

            common.sync_query(cur, catalog_entry, state, select_sql, columns,
                              stream_version, params, original_state_file)
Example #8
def sync_table(mssql_conn, config, catalog_entry, state, columns):
    mssql_conn = MSSQLConnection(config)
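    # Note: the connection passed in is discarded in favor of a fresh
    # MSSQLConnection built from config.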
    common.whitelist_bookmark_keys(BOOKMARK_KEYS, catalog_entry.tap_stream_id,
                                   state)

    catalog_metadata = metadata.to_map(catalog_entry.metadata)
    stream_metadata = catalog_metadata.get((), {})

    replication_key_metadata = stream_metadata.get("replication-key")
    replication_key_state = singer.get_bookmark(state,
                                                catalog_entry.tap_stream_id,
                                                "replication_key")

    replication_key_value = None

    if replication_key_metadata == replication_key_state:
        replication_key_value = singer.get_bookmark(
            state, catalog_entry.tap_stream_id, "replication_key_value")
    else:
        state = singer.write_bookmark(state, catalog_entry.tap_stream_id,
                                      "replication_key",
                                      replication_key_metadata)
        state = singer.clear_bookmark(state, catalog_entry.tap_stream_id,
                                      "replication_key_value")

    stream_version = common.get_stream_version(catalog_entry.tap_stream_id,
                                               state)
    state = singer.write_bookmark(state, catalog_entry.tap_stream_id,
                                  "version", stream_version)

    activate_version_message = singer.ActivateVersionMessage(
        stream=catalog_entry.stream, version=stream_version)

    singer.write_message(activate_version_message)
    LOGGER.info("Beginning SQL")
    with connect_with_backoff(mssql_conn) as open_conn:
        with open_conn.cursor() as cur:
            select_sql = common.generate_select_sql(catalog_entry, columns)
            params = {}

            if replication_key_value is not None:
                if catalog_entry.schema.properties[
                        replication_key_metadata].format == "date-time":
                    replication_key_value = pendulum.parse(
                        replication_key_value)

                select_sql += " WHERE \"{}\" >= %(replication_key_value)s ORDER BY \"{}\" ASC".format(
                    replication_key_metadata, replication_key_metadata)

                params["replication_key_value"] = replication_key_value
            elif replication_key_metadata is not None:
                select_sql += " ORDER BY \"{}\" ASC".format(
                    replication_key_metadata)

            common.sync_query(cur, catalog_entry, state, select_sql, columns,
                              stream_version, params)
Example #9
def sync_view(conn_info, stream, state, desired_columns, md_map):
    time_extracted = utils.now()

    # before writing the table version to state, check if we had one to begin with
    first_run = singer.get_bookmark(state, stream["tap_stream_id"],
                                    "version") is None
    nascent_stream_version = int(time.time() * 1000)

    state = singer.write_bookmark(state, stream["tap_stream_id"], "version",
                                  nascent_stream_version)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    schema_name = md_map.get(()).get("schema-name")

    escaped_columns = map(post_db.prepare_columns_sql, desired_columns)

    activate_version_message = singer.ActivateVersionMessage(
        stream=post_db.calculate_destination_stream_name(stream, md_map),
        version=nascent_stream_version,
    )

    if first_run:
        singer.write_message(activate_version_message)

    with metrics.record_counter(None) as counter:
        with post_db.open_connection(conn_info) as conn:
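            # A named cursor is server-side in psycopg2: rows stream in
            # batches of itersize rather than being fetched all at once.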
            with conn.cursor(cursor_factory=psycopg2.extras.DictCursor,
                             name="stitch_cursor") as cur:
                cur.itersize = post_db.cursor_iter_size
                select_sql = "SELECT {} FROM {}".format(
                    ",".join(escaped_columns),
                    post_db.fully_qualified_table_name(schema_name,
                                                       stream["table_name"]),
                )

                LOGGER.info("select %s with itersize %s", select_sql,
                            cur.itersize)
                cur.execute(select_sql)

                rows_saved = 0
                for rec in cur:
                    record_message = post_db.selected_row_to_singer_message(
                        stream, rec, nascent_stream_version, desired_columns,
                        time_extracted, md_map)
                    singer.write_message(record_message)
                    rows_saved = rows_saved + 1
                    if rows_saved % conn_info["emit_state_every_n_rows"] == 0:
                        singer.write_message(
                            singer.StateMessage(value=copy.deepcopy(state)))

                    counter.increment()

    # always send the activate version whether first run or subsequent
    singer.write_message(activate_version_message)

    return state
Example #10
def sync(config, state, catalog):
    """ Sync data from tap source """
    # Loop over selected streams in catalog

    for stream in catalog.get_selected_streams(state):
        LOGGER.info('Syncing stream: %s', stream.tap_stream_id)

        bookmark_column = stream.replication_key
        is_sorted = True  # TODO: indicate whether data is sorted ascending on bookmark value
        activate_version_ind = True  # full table replication of all streams

        singer.write_schema(
            stream_name=stream.tap_stream_id,
            schema=stream.schema.to_dict(),
            key_properties=stream.key_properties
        )

        columns = list(stream.schema.to_dict().get('properties').keys())

        stream_name = stream.tap_stream_id
        max_bookmark = None
        total_records = 0
        for row in tap_data(config, stream.tap_stream_id, columns):
            # TODO: place type conversions or transformations here

            # write one or more rows to the stream:
            singer.write_records(stream.tap_stream_id, [row])
            total_records += 1
            if bookmark_column:
                if is_sorted:
                    # update bookmark to latest value
                    singer.write_state({stream.tap_stream_id: row[bookmark_column]})
                else:
                    # if data unsorted, save max value until end of writes
                    if max_bookmark is None:
                        max_bookmark = row[bookmark_column]
                    else:
                        max_bookmark = max(max_bookmark, row[bookmark_column])
        if bookmark_column and not is_sorted:
            singer.write_state({stream.tap_stream_id: max_bookmark})

        if activate_version_ind:
            activate_version = max_bookmark
            activate_version_message = singer.ActivateVersionMessage(
                stream=stream_name,
                version=activate_version)
        else:
            activate_version = None

        if total_records > 0:
            # End of Stream: Send Activate Version (if needed)
            if activate_version_ind:
                singer.write_message(activate_version_message)
        else:
            LOGGER.warning('NO NEW DATA FOR STREAM: %s', stream_name)

        LOGGER.info('Synced: %s, total_records: %s', stream_name,
                    total_records)
        LOGGER.info('FINISHED Syncing: %s', stream_name)

    return
Example #11
def sync_table(connection, catalog_entry, state, columns):
    common.whitelist_bookmark_keys(BOOKMARK_KEYS, catalog_entry.tap_stream_id,
                                   state)

    catalog_metadata = metadata.to_map(catalog_entry.metadata)
    stream_metadata = catalog_metadata.get((), {})

    replication_key_metadata = stream_metadata.get('replication-key')
    replication_key_state = singer.get_bookmark(state,
                                                catalog_entry.tap_stream_id,
                                                'replication_key')

    replication_key = replication_key_state or replication_key_metadata
    replication_key_value = None

    if replication_key_metadata == replication_key_state:
        replication_key_value = singer.get_bookmark(
            state, catalog_entry.tap_stream_id, 'replication_key_value')
    else:
        state = singer.write_bookmark(state, catalog_entry.tap_stream_id,
                                      'replication_key', replication_key)
        state = singer.clear_bookmark(state, catalog_entry.tap_stream_id,
                                      'replication_key_value')

    stream_version = common.get_stream_version(catalog_entry.tap_stream_id,
                                               state)
    state = singer.write_bookmark(state, catalog_entry.tap_stream_id,
                                  'version', stream_version)

    yield singer.ActivateVersionMessage(stream=catalog_entry.stream,
                                        version=stream_version)

    with connection.cursor() as cursor:
        select_sql = common.generate_select_sql(catalog_entry, columns)
        params = {}

        if replication_key_value is not None:
            if catalog_entry.schema.properties[
                    replication_key].format == 'date-time':
                replication_key_value = pendulum.parse(replication_key_value)

            select_sql += ' WHERE `{}` >= %(replication_key_value)s ORDER BY `{}` ASC'.format(
                replication_key, replication_key)

            params['replication_key_value'] = replication_key_value
        elif replication_key is not None:
            select_sql += ' ORDER BY `{}` ASC'.format(replication_key)

        for message in common.sync_query(cursor, catalog_entry, state,
                                         select_sql, columns, stream_version,
                                         params):
            yield message
Example #12
def sync_view(conn_info, stream, state, desired_columns, md_map):
    time_extracted = utils.now()

    # before writing the table version to state, check if we had one to begin with
    first_run = singer.get_bookmark(state, stream['tap_stream_id'],
                                    'version') is None
    nascent_stream_version = int(time.time() * 1000)

    state = singer.write_bookmark(state, stream['tap_stream_id'], 'version',
                                  nascent_stream_version)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    schema_name = md_map.get(()).get('schema-name')

    escaped_columns = map(post_db.prepare_columns_sql, desired_columns)

    activate_version_message = singer.ActivateVersionMessage(
        stream=post_db.calculate_destination_stream_name(stream, md_map),
        version=nascent_stream_version)

    if first_run:
        singer.write_message(activate_version_message)

    with metrics.record_counter(None) as counter:
        with post_db.open_connection(conn_info) as conn:
            with conn.cursor(cursor_factory=psycopg2.extras.DictCursor,
                             name='stitch_cursor') as cur:
                cur.itersize = post_db.CURSOR_ITER_SIZE
                select_sql = f"SELECT {','.join(escaped_columns)} FROM " \
                             f"{post_db.fully_qualified_table_name(schema_name,stream['table_name'])}"

                LOGGER.info("select %s with itersize %s", select_sql,
                            cur.itersize)
                cur.execute(select_sql)

                rows_saved = 0
                for rec in cur:
                    record_message = post_db.selected_row_to_singer_message(
                        stream, rec, nascent_stream_version, desired_columns,
                        time_extracted, md_map)
                    singer.write_message(record_message)
                    rows_saved += 1
                    if rows_saved % UPDATE_BOOKMARK_PERIOD == 0:
                        singer.write_message(
                            singer.StateMessage(value=copy.deepcopy(state)))

                    counter.increment()

    # always send the activate version whether first run or subsequent
    singer.write_message(activate_version_message)

    return state
Example #13
def sync_view(conn_info, stream, state, desired_columns, md_map):
    time_extracted = utils.now()

    #before writing the table version to state, check if we had one to begin with
    first_run = singer.get_bookmark(state, stream.tap_stream_id,
                                    'version') is None
    nascent_stream_version = int(time.time() * 1000)

    state = singer.write_bookmark(state, stream.tap_stream_id, 'version',
                                  nascent_stream_version)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    schema_name = md_map.get(()).get('schema-name')

    escaped_columns = map(post_db.prepare_columns_sql, desired_columns)

    activate_version_message = singer.ActivateVersionMessage(
        stream=stream.stream, version=nascent_stream_version)

    if first_run:
        singer.write_message(activate_version_message)

    with metrics.record_counter(None) as counter:
        with post_db.open_connection(conn_info) as conn:
            with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
                select_sql = 'SELECT {} FROM {}'.format(
                    ','.join(escaped_columns),
                    post_db.fully_qualified_table_name(schema_name,
                                                       stream.table))

                LOGGER.info("select %s", select_sql)
                cur.execute(select_sql)

                rows_saved = 0
                rec = cur.fetchone()
                while rec is not None:
                    record_message = post_db.selected_row_to_singer_message(
                        stream, rec, nascent_stream_version, desired_columns,
                        time_extracted, md_map)
                    singer.write_message(record_message)
                    rows_saved = rows_saved + 1
                    if rows_saved % UPDATE_BOOKMARK_PERIOD == 0:
                        singer.write_message(
                            singer.StateMessage(value=copy.deepcopy(state)))

                    counter.increment()
                    rec = cur.fetchone()

    #always send the activate version whether first run or subsequent
    singer.write_message(activate_version_message)

    return state
Example #14
def sync_table(connection, catalog_entry, state):
    columns = common.generate_column_list(catalog_entry)

    if not columns:
        LOGGER.warning(
            'There are no columns selected for table %s, skipping it',
            catalog_entry.table)
        return

    bookmark_is_empty = state.get('bookmarks', {}).get(
        catalog_entry.tap_stream_id) is None

    stream_version = common.get_stream_version(catalog_entry.tap_stream_id, state)
    state = singer.write_bookmark(state,
                                  catalog_entry.tap_stream_id,
                                  'version',
                                  stream_version)

    activate_version_message = singer.ActivateVersionMessage(
        stream=catalog_entry.stream,
        version=stream_version
    )

    # If there is no bookmark at all for this stream, assume it is the
    # very first replication. Emit an ACTIVATE_VERSION message at the
    # beginning so the records show up right away.
    if bookmark_is_empty:
        yield activate_version_message

    with connection.cursor() as cursor:
        select_sql = common.generate_select_sql(catalog_entry, columns)

        params = {}

        for message in common.sync_query(cursor,
                                         catalog_entry,
                                         state,
                                         select_sql,
                                         columns,
                                         stream_version,
                                         params):
            yield message

    # Clear the stream's version from the state so that subsequent invocations will
    # emit a distinct stream version.
    state = singer.write_bookmark(state, catalog_entry.tap_stream_id, 'version', None)

    yield activate_version_message
    yield singer.StateMessage(value=copy.deepcopy(state))
Example #15
def sync_table(connection, catalog_entry, state):
    columns = common.generate_column_list(catalog_entry)

    if not columns:
        LOGGER.warning(
            'There are no columns selected for table %s, skipping it',
            catalog_entry.table)
        return

    replication_key_value = singer.get_bookmark(state,
                                                catalog_entry.tap_stream_id,
                                                'replication_key_value')

    replication_key = singer.get_bookmark(state, catalog_entry.tap_stream_id,
                                          'replication_key')

    stream_version = common.get_stream_version(catalog_entry.tap_stream_id,
                                               state)
    state = singer.write_bookmark(state, catalog_entry.tap_stream_id,
                                  'version', stream_version)

    yield singer.ActivateVersionMessage(stream=catalog_entry.stream,
                                        version=stream_version)

    with connection.cursor() as cursor:
        select_sql = common.generate_select_sql(catalog_entry, columns)
        params = {}

        if replication_key_value is not None:
            if catalog_entry.schema.properties[
                    replication_key].format == 'date-time':
                replication_key_value = pendulum.parse(replication_key_value)

            select_sql += ' WHERE `{}` >= %(replication_key_value)s ORDER BY `{}` ASC'.format(
                replication_key, replication_key)

            params['replication_key_value'] = replication_key_value
        elif replication_key is not None:
            select_sql += ' ORDER BY `{}` ASC'.format(replication_key)

        for message in common.sync_query(cursor, catalog_entry, state,
                                         select_sql, columns, stream_version,
                                         params):
            yield message
Example #16
def sync_table(mysql_conn, config, catalog_entry, state, columns,
               stream_version):
    common.whitelist_bookmark_keys(generate_bookmark_keys(catalog_entry),
                                   catalog_entry.tap_stream_id, state)

    bookmark = state.get('bookmarks', {}).get(catalog_entry.tap_stream_id, {})
    version_exists = 'version' in bookmark

    initial_full_table_complete = singer.get_bookmark(
        state, catalog_entry.tap_stream_id, 'initial_full_table_complete')

    state_version = singer.get_bookmark(state, catalog_entry.tap_stream_id,
                                        'version')

    activate_version_message = singer.ActivateVersionMessage(
        stream='%s_%s' %
        (common.get_database_name(catalog_entry), catalog_entry.stream),
        version=stream_version)

    # For the initial replication, emit an ACTIVATE_VERSION message
    # at the beginning so the records show up right away.
    if not initial_full_table_complete and not (version_exists
                                                and state_version is None):
        singer.write_message(activate_version_message)

    with connect_with_backoff(mysql_conn) as open_conn:
        with open_conn.cursor() as cur:
            select_sql = common.generate_select_sql(catalog_entry, columns)
            params = {}

            common.sync_query(cur, catalog_entry, state, select_sql, columns,
                              stream_version, params)

    # clear max pk value and last pk fetched upon successful sync
    singer.clear_bookmark(state, catalog_entry.tap_stream_id, 'max_pk_values')
    singer.clear_bookmark(state, catalog_entry.tap_stream_id,
                          'last_pk_fetched')

    singer.write_message(activate_version_message)
Example #17
def sync_stream(bucket, state, stream, manifests):
    table_name = stream['stream']
    LOGGER.info('Syncing table "%s".', table_name)

    table_manifests, should_create_new_version = filter_manifests_to_sync(manifests,
                                                                          table_name,
                                                                          state)

    files = get_files_to_sync(table_manifests, table_name, state, bucket)

    records_streamed = 0

    version = singer.get_bookmark(state, table_name, 'version')

    if should_create_new_version:
        # Set version so it can be used for an activate version message
        version = int(time.time() * 1000)

        LOGGER.info('Detected full sync for stream table name %s, setting version to %d',
                    table_name,
                    version)
        state = singer.write_bookmark(state, table_name, 'version', version)
        singer.write_state(state)

    for s3_file_path in files:
        file_records_streamed = sync_file(bucket, s3_file_path, stream, version)
        records_streamed += file_records_streamed
        LOGGER.info('Wrote %d records for file %s', file_records_streamed, s3_file_path)

        # Finished syncing a file, write a bookmark
        state = singer.write_bookmark(state, table_name, 'file', s3_file_path)
        singer.write_state(state)

    if records_streamed > 0:
        LOGGER.info('Sending activate version message %d', version)
        message = singer.ActivateVersionMessage(stream=table_name, version=version)
        singer.write_message(message)

    LOGGER.info('Wrote %s records for table "%s".', records_streamed, table_name)
    return records_streamed
Example #18
def sync_endpoint(catalog_entry, schema, mdata, date_fields=None):
    singer.write_schema(catalog_entry.tap_stream_id,
                        schema,
                        [PRIMARY_KEY],
                        bookmark_properties=[REPLICATION_KEY])

    start = get_start(catalog_entry.tap_stream_id)
    url = get_url(catalog_entry.tap_stream_id)
    data = request(url)[catalog_entry.tap_stream_id]
    time_extracted = utils.now()

    stream_version = get_stream_version(catalog_entry.tap_stream_id)
    activate_version_message = singer.ActivateVersionMessage(
        stream=catalog_entry.stream,
        version=stream_version
    )

    for row in data:
        with Transformer() as transformer:
            rec = transformer.transform(row, schema, mdata)
            append_times_to_dates(rec, date_fields)

            try:
                updated_at = rec[REPLICATION_KEY]
            except KeyError:
                updated_at = start
            
            if updated_at >= start:
                new_record = singer.RecordMessage(
                    stream=catalog_entry.stream,
                    record=rec,
                    version=stream_version,
                    time_extracted=time_extracted)
                singer.write_message(new_record)
 
                utils.update_state(STATE, catalog_entry.tap_stream_id, updated_at)

    singer.write_state(STATE)
    singer.write_message(activate_version_message)
Example #19
def overloaded_parse_message(msg):
    """Parse a message string into a Message object."""

    # Unlike the stock singer parser, this overload parses with
    # use_decimal=True so numeric fields keep their precision instead
    # of being converted to potentially lossy floats.
    obj = simplejson.loads(msg, use_decimal=True)
    msg_type = _required_key(obj, 'type')

    if msg_type == 'RECORD':
        time_extracted = obj.get('time_extracted')
        if time_extracted:
            try:
                time_extracted = ciso8601.parse_datetime(time_extracted)
            except Exception:
                time_extracted = None
        return singer.RecordMessage(stream=_required_key(obj, 'stream'),
                                    record=_required_key(obj, 'record'),
                                    version=obj.get('version'),
                                    time_extracted=time_extracted)

    if msg_type == 'SCHEMA':
        return singer.SchemaMessage(
            stream=_required_key(obj, 'stream'),
            schema=_required_key(obj, 'schema'),
            key_properties=_required_key(obj, 'key_properties'),
            bookmark_properties=obj.get('bookmark_properties'))

    if msg_type == 'STATE':
        return singer.StateMessage(value=_required_key(obj, 'value'))

    if msg_type == 'ACTIVATE_VERSION':
        return singer.ActivateVersionMessage(
            stream=_required_key(obj, 'stream'),
            version=_required_key(obj, 'version'))
    return None
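
A minimal round-trip sketch (the stream name and version here are made up;
it assumes the singer message classes shown above):

msg = overloaded_parse_message(
    '{"type": "ACTIVATE_VERSION", "stream": "users", "version": 1}')
assert msg.stream == 'users' and msg.version == 1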
Example #20
def sync_stream(config, consumer, stream, state):
    write_schema(stream.tap_stream_id, stream.schema.to_dict(), stream.key_properties or [])
    stream_version = singer.get_bookmark(state, stream.tap_stream_id, "version")
    if stream_version is None:
        stream_version = int(time.time() * 1000)

    state = singer.write_bookmark(state, stream.tap_stream_id, "version", stream_version)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
    activate_version_message = singer.ActivateVersionMessage(stream=stream.tap_stream_id, version=stream_version)
    singer.write_message(activate_version_message)

    validator = Draft4Validator(stream.schema.to_dict(), format_checker=FormatChecker())
    time_extracted = utils.now()
    rows_saved = 0

    # Assign all the partitions for the topic to this consumer
    topic_partitions = [TopicPartition(config['topic'], partition_id) for partition_id in consumer.partitions_for_topic(config['topic'])]
    consumer.assign(topic_partitions)

    # Seek each partition to its value from the STATE, or to the beginning otherwise
    offsets = singer.get_offset(state, stream.tap_stream_id, {})
    for topic_partition in topic_partitions:
        if str(topic_partition.partition) in offsets:
            consumer.seek(topic_partition, offsets[str(topic_partition.partition)])
        else:
            consumer.seek_to_beginning(topic_partition)

    for message in consumer:
        record = singer.RecordMessage(stream=stream.tap_stream_id, record=message.value, time_extracted=time_extracted)
        validator.validate(record.record)
        singer.write_message(record)

        state = singer.set_offset(state, stream.tap_stream_id, message.partition, message.offset)
        rows_saved = rows_saved + 1
        if rows_saved % UPDATE_BOOKMARK_PERIOD == 0:
            singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
Example #21
def sync_table(conn_info, stream, state, desired_columns, md_map):
    time_extracted = utils.now()

    #before writing the table version to state, check if we had one to begin with
    first_run = singer.get_bookmark(state, stream.tap_stream_id,
                                    'version') is None

    #pick a new table version IFF we do not have an xmin in our state
    #the presence of an xmin indicates that we were interrupted last time through
    if singer.get_bookmark(state, stream.tap_stream_id, 'xmin') is None:
        nascent_stream_version = int(time.time() * 1000)
    else:
        nascent_stream_version = singer.get_bookmark(state,
                                                     stream.tap_stream_id,
                                                     'version')

    state = singer.write_bookmark(state, stream.tap_stream_id, 'version',
                                  nascent_stream_version)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    schema_name = md_map.get(()).get('schema-name')

    escaped_columns = map(post_db.prepare_columns_sql, desired_columns)

    activate_version_message = singer.ActivateVersionMessage(
        stream=stream.stream, version=nascent_stream_version)

    if first_run:
        singer.write_message(activate_version_message)

    with metrics.record_counter(None) as counter:
        with post_db.open_connection(conn_info) as conn:
            with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
                xmin = singer.get_bookmark(state, stream.tap_stream_id, 'xmin')
                if xmin:
                    LOGGER.info(
                        "Resuming Full Table replication %s from xmin %s",
                        nascent_stream_version, xmin)
                    select_sql = """SELECT {}, xmin::text::bigint
                                      FROM {} where age(xmin::xid) < age('{}'::xid)
                                     ORDER BY xmin::text ASC""".format(
                        ','.join(escaped_columns),
                        post_db.fully_qualified_table_name(
                            schema_name, stream.table), xmin)
                else:
                    LOGGER.info("Beginning new Full Table replication %s",
                                nascent_stream_version)
                    select_sql = """SELECT {}, xmin::text::bigint
                                      FROM {}
                                     ORDER BY xmin::text ASC""".format(
                        ','.join(escaped_columns),
                        post_db.fully_qualified_table_name(
                            schema_name, stream.table))

                LOGGER.info("select %s", select_sql)
                cur.execute(select_sql)

                rows_saved = 0
                rec = cur.fetchone()
                while rec is not None:
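                    # xmin was selected as the final column; capture it for
                    # the resumption bookmark, then drop it from the row.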
                    xmin = rec['xmin']
                    rec = rec[:-1]
                    record_message = post_db.selected_row_to_singer_message(
                        stream, rec, nascent_stream_version, desired_columns,
                        time_extracted, md_map)
                    singer.write_message(record_message)
                    state = singer.write_bookmark(state, stream.tap_stream_id,
                                                  'xmin', xmin)
                    rows_saved = rows_saved + 1
                    if rows_saved % UPDATE_BOOKMARK_PERIOD == 0:
                        singer.write_message(
                            singer.StateMessage(value=copy.deepcopy(state)))

                    counter.increment()
                    rec = cur.fetchone()

    #once we have completed the full table replication, discard the xmin bookmark.
    #the xmin bookmark only comes into play when a full table replication is interrupted
    state = singer.write_bookmark(state, stream.tap_stream_id, 'xmin', None)

    #always send the activate version whether first run or subsequent
    singer.write_message(activate_version_message)

    return state
Example #22
def sync_table(conn_config, stream, state, desired_columns):
    connection = orc_db.open_connection(conn_config)
    connection.outputtypehandler = common.OutputTypeHandler

    cur = connection.cursor()
    cur.execute("ALTER SESSION SET TIME_ZONE = '00:00'")
    cur.execute(
        """ALTER SESSION SET NLS_DATE_FORMAT = 'YYYY-MM-DD"T"HH24:MI:SS."00+00:00"'"""
    )
    cur.execute(
        """ALTER SESSION SET NLS_TIMESTAMP_FORMAT='YYYY-MM-DD"T"HH24:MI:SSXFF"+00:00"'"""
    )
    cur.execute(
        """ALTER SESSION SET NLS_TIMESTAMP_TZ_FORMAT  = 'YYYY-MM-DD"T"HH24:MI:SS.FFTZH:TZM'"""
    )
    time_extracted = utils.now()

    stream_version = singer.get_bookmark(state, stream.tap_stream_id,
                                         'version')
    # If there was no bookmark for stream_version, it is the first time
    # this table is being sync'd, so get a new version, write to
    # state
    if stream_version is None:
        stream_version = int(time.time() * 1000)
        state = singer.write_bookmark(state, stream.tap_stream_id, 'version',
                                      stream_version)
        singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    activate_version_message = singer.ActivateVersionMessage(
        stream=stream.stream, version=stream_version)
    singer.write_message(activate_version_message)

    md = metadata.to_map(stream.metadata)
    schema_name = md.get(()).get('schema-name')

    escaped_columns = map(lambda c: common.prepare_columns_sql(stream, c),
                          desired_columns)
    escaped_schema = schema_name
    escaped_table = stream.table

    replication_key = md.get((), {}).get('replication-key')
    #escaped_replication_key = common.prepare_columns_sql(stream, replication_key)
    replication_key_value = singer.get_bookmark(state, stream.tap_stream_id,
                                                'replication_key_value')
    replication_key_sql_datatype = md.get(
        ('properties', replication_key)).get('sql-datatype')

    with metrics.record_counter(None) as counter:
        if replication_key_value:
            LOGGER.info("Resuming Incremental replication from %s = %s",
                        replication_key, replication_key_value)
            casted_where_clause_arg = common.prepare_where_clause_arg(
                replication_key_value, replication_key_sql_datatype)

            select_sql = """SELECT {}
                                FROM {}.{}
                               WHERE {} >= {}
                               ORDER BY {} ASC
                                """.format(','.join(escaped_columns),
                                           escaped_schema, escaped_table,
                                           replication_key,
                                           casted_where_clause_arg,
                                           replication_key)
        else:
            select_sql = """SELECT {}
                                FROM {}.{}
                               ORDER BY {} ASC
                               """.format(','.join(escaped_columns),
                                          escaped_schema, escaped_table,
                                          replication_key)

        rows_saved = 0
        LOGGER.info("select %s", select_sql)
        for row in cur.execute(select_sql):
            record_message = common.row_to_singer_message(
                stream, row, stream_version, desired_columns, time_extracted)

            singer.write_message(record_message)
            rows_saved = rows_saved + 1

            # Picking a replication_key with NULL values will result in it
            # ALWAYS being synced, which is not great; even worse would be
            # allowing the NULL value to enter into the state.
            if record_message.record[replication_key] is not None:
                state = singer.write_bookmark(
                    state, stream.tap_stream_id, 'replication_key_value',
                    record_message.record[replication_key])

            if rows_saved % UPDATE_BOOKMARK_PERIOD == 0:
                singer.write_message(
                    singer.StateMessage(value=copy.deepcopy(state)))

            counter.increment()

    cur.close()
    connection.close()
    return state
Example #23
def sync_table(connection, catalog_entry, state):
    columns = list(catalog_entry.schema.properties.keys())
    start_date = CONFIG.get('start_date')
    formatted_start_date = None

    if not columns:
        LOGGER.warning(
            'There are no columns selected for table {}, skipping it'.format(
                catalog_entry.table))
        return

    tap_stream_id = catalog_entry.tap_stream_id
    LOGGER.info('Beginning sync for {} table'.format(tap_stream_id))
    with connection.cursor() as cursor:
        schema, table = catalog_entry.table.split('.')
        select = 'SELECT {} FROM {}.{}'.format(
            ','.join('"{}"'.format(c) for c in columns), '"{}"'.format(schema),
            '"{}"'.format(table))
        params = {}

        if start_date is not None:
            formatted_start_date = datetime.datetime.strptime(
                start_date, '%Y-%m-%dT%H:%M:%SZ').replace(tzinfo=pytz.UTC)

        replication_key = metadata.to_map(catalog_entry.metadata).get(
            (), {}).get('replication-key')
        replication_key_value = None
        bookmark_is_empty = state.get('bookmarks',
                                      {}).get(tap_stream_id) is None
        stream_version = get_stream_version(tap_stream_id, state)
        state = singer.write_bookmark(state, tap_stream_id, 'version',
                                      stream_version)
        activate_version_message = singer.ActivateVersionMessage(
            stream=catalog_entry.stream, version=stream_version)

        # If there's a replication key, we want to emit an ACTIVATE_VERSION
        # message at the beginning so the records show up right away. If
        # there's no bookmark at all for this stream, assume it's the very
        # first replication. That is, clients have never seen rows for this
        # stream before, so they can immediately acknowledge the present
        # version.
        if replication_key or bookmark_is_empty:
            yield activate_version_message

        if replication_key:
            replication_key_value = singer.get_bookmark(
                state, tap_stream_id, 'replication_key_value')
            if replication_key_value is None and formatted_start_date:
                replication_key_value = formatted_start_date.isoformat()

        if replication_key_value is not None:
            entry_schema = catalog_entry.schema

            if entry_schema.properties[replication_key].format == 'date-time':
                replication_key_value = pendulum.parse(replication_key_value)

            select += ' WHERE {} >= %(replication_key_value)s ORDER BY {} ' \
                      'ASC'.format(replication_key, replication_key)
            params['replication_key_value'] = replication_key_value

        elif replication_key is not None:
            select += ' ORDER BY {} ASC'.format(replication_key)

        time_extracted = utils.now()
        query_string = cursor.mogrify(select, params)
        LOGGER.info('Running {}'.format(query_string))
        cursor.execute(select, params)
        row = cursor.fetchone()
        rows_saved = 0

        with metrics.record_counter(None) as counter:
            counter.tags['database'] = catalog_entry.database
            counter.tags['table'] = catalog_entry.table
            while row:
                counter.increment()
                rows_saved += 1
                record_message = row_to_record(catalog_entry, stream_version,
                                               row, columns, time_extracted)
                yield record_message

                if replication_key is not None:
                    state = singer.write_bookmark(
                        state, tap_stream_id, 'replication_key_value',
                        record_message.record[replication_key])
                if rows_saved % 1000 == 0:
                    yield singer.StateMessage(value=copy.deepcopy(state))
                row = cursor.fetchone()

        if not replication_key:
            yield activate_version_message
            state = singer.write_bookmark(state, catalog_entry.tap_stream_id,
                                          'version', None)

        yield singer.StateMessage(value=copy.deepcopy(state))
Example #24
def sync_records(sf, catalog_entry, state, counter):
    chunked_bookmark = singer_utils.strptime_with_tz(
        sf.get_start_date(state, catalog_entry))
    stream = catalog_entry['stream']
    schema = catalog_entry['schema']
    stream_alias = catalog_entry.get('stream_alias')
    replication_key = catalog_entry.get('replication_key')
    stream_version = get_stream_version(catalog_entry, state)
    activate_version_message = singer.ActivateVersionMessage(
        stream=(stream_alias or stream), version=stream_version)

    start_time = singer_utils.now()

    LOGGER.info('Syncing Salesforce data for stream %s', stream)
    with Transformer(pre_hook=transform_bulk_data_hook) as transformer:
        for rec in sf.query(catalog_entry, state):
            counter.increment()
            rec = transformer.transform(rec, schema)
            rec = fix_record_anytype(rec, schema)
            singer.write_message(
                singer.RecordMessage(stream=(stream_alias or stream),
                                     record=rec,
                                     version=stream_version,
                                     time_extracted=start_time))

            replication_key_value = replication_key and singer_utils.strptime_with_tz(
                rec[replication_key])

            if sf.pk_chunking:
                if replication_key_value and replication_key_value <= start_time and replication_key_value > chunked_bookmark:
                    # Replace the highest seen bookmark and save the state in case we need to resume later
                    chunked_bookmark = singer_utils.strptime_with_tz(
                        rec[replication_key])
                    state = singer.write_bookmark(
                        state, catalog_entry['tap_stream_id'],
                        'JobHighestBookmarkSeen',
                        singer_utils.strftime(chunked_bookmark))
                    singer.write_state(state)
            # Before writing a bookmark, make sure Salesforce has not given us a
            # record with one outside our range
            elif replication_key_value and replication_key_value <= start_time:
                state = singer.write_bookmark(state,
                                              catalog_entry['tap_stream_id'],
                                              replication_key,
                                              rec[replication_key])
                singer.write_state(state)

        # Tables with no replication_key will send an
        # activate_version message for the next sync
        if not replication_key:
            singer.write_message(activate_version_message)
            state = singer.write_bookmark(state,
                                          catalog_entry['tap_stream_id'],
                                          'version', None)

        # If pk_chunking is set, only write a bookmark at the end
        if sf.pk_chunking:
            # Write a bookmark with the highest value we've seen
            state = singer.write_bookmark(
                state, catalog_entry['tap_stream_id'], replication_key,
                singer_utils.strftime(chunked_bookmark))
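A hedged sketch of the pk-chunking bookmark rule in Example #24: while records stream in (possibly out of order across chunks), keep the highest replication-key value that is still within the sync window, and persist it once at the end. datetime stands in for singer_utils here, and the SystemModstamp field name is illustrative:

from datetime import datetime, timezone

def highest_bookmark(records, replication_key, start_time):
    # Track the highest replication-key value seen that is still
    # within range (not newer than the sync's start time).
    highest = None
    for rec in records:
        value = datetime.fromisoformat(rec[replication_key])
        if value <= start_time and (highest is None or value > highest):
            highest = value
    return highest

records = [{'SystemModstamp': '2021-06-01T10:00:00+00:00'},
           {'SystemModstamp': '2021-06-01T09:00:00+00:00'}]
now = datetime(2021, 6, 2, tzinfo=timezone.utc)
print(highest_bookmark(records, 'SystemModstamp', now))  # 2021-06-01 10:00:00+00:00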
Example #25
def sync_table(conn_info, stream, state, desired_columns, md_map):
    time_extracted = utils.now()

    stream_version = singer.get_bookmark(state, stream['tap_stream_id'],
                                         'version')
    if stream_version is None:
        stream_version = int(time.time() * 1000)

    state = singer.write_bookmark(state, stream['tap_stream_id'], 'version',
                                  stream_version)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    schema_name = md_map.get(()).get('schema-name')

    escaped_columns = map(
        partial(post_db.prepare_columns_for_select_sql, md_map=md_map),
        desired_columns)

    activate_version_message = singer.ActivateVersionMessage(
        stream=post_db.calculate_destination_stream_name(stream, md_map),
        version=stream_version)

    singer.write_message(activate_version_message)

    replication_key = md_map.get((), {}).get('replication-key')
    replication_key_value = singer.get_bookmark(state, stream['tap_stream_id'],
                                                'replication_key_value')
    replication_key_sql_datatype = md_map.get(
        ('properties', replication_key)).get('sql-datatype')

    hstore_available = post_db.hstore_available(conn_info)
    with metrics.record_counter(None) as counter:
        with post_db.open_connection(conn_info) as conn:

            # Client side character encoding defaults to the value in postgresql.conf under client_encoding.
            # The server / db can also have its own configured encoding.
            with conn.cursor() as cur:
                cur.execute("show server_encoding")
                LOGGER.info("Current Server Encoding: %s", cur.fetchone()[0])
                cur.execute("show client_encoding")
                LOGGER.info("Current Client Encoding: %s", cur.fetchone()[0])

            if hstore_available:
                LOGGER.info("hstore is available")
                psycopg2.extras.register_hstore(conn)
            else:
                LOGGER.info("hstore is UNavailable")

            with conn.cursor(cursor_factory=psycopg2.extras.DictCursor,
                             name='pipelinewise') as cur:
                cur.itersize = post_db.CURSOR_ITER_SIZE
                LOGGER.info("Beginning new incremental replication sync %s",
                            stream_version)
                if replication_key_value:
                    select_sql = """SELECT {}
                                    FROM {}
                                    WHERE {} >= '{}'::{}
                                    ORDER BY {} ASC""".format(
                        ','.join(escaped_columns),
                        post_db.fully_qualified_table_name(
                            schema_name, stream['table_name']),
                        post_db.prepare_columns_sql(replication_key),
                        replication_key_value, replication_key_sql_datatype,
                        post_db.prepare_columns_sql(replication_key))
                else:
                    # no bookmarked replication_key_value yet: select everything
                    select_sql = """SELECT {}
                                    FROM {}
                                    ORDER BY {} ASC""".format(
                        ','.join(escaped_columns),
                        post_db.fully_qualified_table_name(
                            schema_name, stream['table_name']),
                        post_db.prepare_columns_sql(replication_key))

                LOGGER.info('select statement: %s with itersize %s',
                            select_sql, cur.itersize)
                cur.execute(select_sql)

                rows_saved = 0

                for rec in cur:
                    record_message = post_db.selected_row_to_singer_message(
                        stream, rec, stream_version, desired_columns,
                        time_extracted, md_map)

                    singer.write_message(record_message)
                    rows_saved = rows_saved + 1

                    # Picking a replication_key with NULL values will result in it ALWAYS being synced, which is not great;
                    # even worse would be allowing the NULL value to enter the state.
                    if record_message.record[replication_key] is not None:
                        state = singer.write_bookmark(
                            state, stream['tap_stream_id'],
                            'replication_key_value',
                            record_message.record[replication_key])

                    if rows_saved % UPDATE_BOOKMARK_PERIOD == 0:
                        singer.write_message(
                            singer.StateMessage(value=copy.deepcopy(state)))

                    counter.increment()

    return state
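Example #25 interpolates the bookmark value directly into the SQL string. A sketch of the same query built with psycopg2's sql module (assuming psycopg2 >= 2.8 for the two-part Identifier) and a bound parameter instead; the table and column names below are placeholders:

from psycopg2 import sql

def build_incremental_select(columns, schema_name, table_name, replication_key):
    # Identifiers are quoted with sql.Identifier; the bookmark value is
    # left as a %s placeholder to be bound at execute() time.
    return sql.SQL('SELECT {cols} FROM {tbl} WHERE {rk} >= %s ORDER BY {rk} ASC').format(
        cols=sql.SQL(', ').join(sql.Identifier(c) for c in columns),
        tbl=sql.Identifier(schema_name, table_name),
        rk=sql.Identifier(replication_key))

# cur.execute(build_incremental_select(['id', 'updated_at'],
#                                      'public', 'orders', 'updated_at'),
#             (replication_key_value,))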
Example #26
def sync_collection(client, stream, state, projection):
    tap_stream_id = stream['tap_stream_id']
    LOGGER.info('Starting full table sync for %s', tap_stream_id)

    md_map = metadata.to_map(stream['metadata'])
    database_name = metadata.get(md_map, (), 'database-name')

    db = client[database_name]
    collection = db[stream['stream']]

    # before writing the table version to state, check if we had one to begin with
    first_run = singer.get_bookmark(state, stream['tap_stream_id'],
                                    'version') is None

    # last run was interrupted if there is a last_id_fetched bookmark
    was_interrupted = singer.get_bookmark(state, stream['tap_stream_id'],
                                          'last_id_fetched') is not None

    # pick a new table version if the last run wasn't interrupted
    if was_interrupted:
        stream_version = singer.get_bookmark(state, stream['tap_stream_id'],
                                             'version')
    else:
        stream_version = int(time.time() * 1000)

    state = singer.write_bookmark(state, stream['tap_stream_id'], 'version',
                                  stream_version)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    activate_version_message = singer.ActivateVersionMessage(
        stream=common.calculate_destination_stream_name(stream),
        version=stream_version)

    # For the initial replication, emit an ACTIVATE_VERSION message
    # at the beginning so the records show up right away.
    if first_run:
        singer.write_message(activate_version_message)

    if singer.get_bookmark(state, stream['tap_stream_id'], 'max_id_value'):
        # There is a bookmark
        max_id_value = singer.get_bookmark(state, stream['tap_stream_id'],
                                           'max_id_value')
        max_id_type = singer.get_bookmark(state, stream['tap_stream_id'],
                                          'max_id_type')
        max_id_value = common.string_to_class(max_id_value, max_id_type)
    else:
        max_id_value = get_max_id_value(collection)

    last_id_fetched = singer.get_bookmark(state, stream['tap_stream_id'],
                                          'last_id_fetched')

    if max_id_value:
        # Write the bookmark if max_id_value is defined
        state = singer.write_bookmark(
            state, stream['tap_stream_id'], 'max_id_value',
            common.class_to_string(max_id_value,
                                   max_id_value.__class__.__name__))
        state = singer.write_bookmark(state, stream['tap_stream_id'],
                                      'max_id_type',
                                      max_id_value.__class__.__name__)

    find_filter = {'$lte': max_id_value}
    if last_id_fetched:
        last_id_fetched_type = singer.get_bookmark(state,
                                                   stream['tap_stream_id'],
                                                   'last_id_fetched_type')
        find_filter['$gte'] = common.string_to_class(last_id_fetched,
                                                     last_id_fetched_type)

    query_message = 'Querying {} with:\n\tFind Parameters: {}'.format(
        stream['tap_stream_id'], find_filter)
    if projection:
        query_message += '\n\tProjection: {}'.format(projection)
    # pylint: disable=logging-format-interpolation
    LOGGER.info(query_message)

    with collection.find({'_id': find_filter},
                         projection,
                         sort=[("_id", pymongo.ASCENDING)]) as cursor:
        rows_saved = 0
        time_extracted = utils.now()
        start_time = time.time()

        schema = stream['schema'] or {"type": "object", "properties": {}}
        for row in cursor:
            rows_saved += 1

            schema_build_start_time = time.time()
            if common.row_to_schema(schema, row):
                singer.write_message(
                    singer.SchemaMessage(
                        stream=common.calculate_destination_stream_name(
                            stream),
                        schema=schema,
                        key_properties=['_id']))
                common.SCHEMA_COUNT[stream['tap_stream_id']] += 1
            common.SCHEMA_TIMES[stream['tap_stream_id']] += time.time(
            ) - schema_build_start_time

            record_message = common.row_to_singer_record(
                stream, row, stream_version, time_extracted)

            singer.write_message(record_message)

            state = singer.write_bookmark(
                state, stream['tap_stream_id'], 'last_id_fetched',
                common.class_to_string(row['_id'],
                                       row['_id'].__class__.__name__))
            state = singer.write_bookmark(state, stream['tap_stream_id'],
                                          'last_id_fetched_type',
                                          row['_id'].__class__.__name__)

            if rows_saved % common.UPDATE_BOOKMARK_PERIOD == 0:
                singer.write_message(
                    singer.StateMessage(value=copy.deepcopy(state)))

        common.COUNTS[tap_stream_id] += rows_saved
        common.TIMES[tap_stream_id] += time.time() - start_time

    # clear max pk value and last pk fetched upon successful sync
    singer.clear_bookmark(state, stream['tap_stream_id'], 'max_id_value')
    singer.clear_bookmark(state, stream['tap_stream_id'], 'max_id_type')
    singer.clear_bookmark(state, stream['tap_stream_id'], 'last_id_fetched')
    singer.clear_bookmark(state, stream['tap_stream_id'],
                          'last_id_fetched_type')

    state = singer.write_bookmark(state, stream['tap_stream_id'],
                                  'initial_full_table_complete', True)

    singer.write_message(activate_version_message)

    LOGGER.info('Synced %s records for %s', rows_saved, tap_stream_id)
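A minimal sketch of the resumable _id-range filter Example #26 builds from its bookmarks: max_id_value bounds the sync to rows that existed when it started, and last_id_fetched lets an interrupted run pick up where it left off:

def build_id_filter(max_id_value, last_id_fetched=None):
    # Full-table sync pages through _id ascending; an interrupted run
    # resumes from the last _id fetched, bounded by the max _id captured
    # at the start so newly inserted rows are left for the next sync.
    id_filter = {'$lte': max_id_value}
    if last_id_fetched is not None:
        id_filter['$gte'] = last_id_fetched
    return {'_id': id_filter}

print(build_id_filter(100))       # {'_id': {'$lte': 100}}
print(build_id_filter(100, 42))   # {'_id': {'$lte': 100, '$gte': 42}}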
Example #27
def do_sync(sf, catalog, state):
    starting_stream = state.get("current_stream")

    if starting_stream:
        LOGGER.info("Resuming sync from %s", starting_stream)
    else:
        LOGGER.info("Starting sync")

    for catalog_entry in catalog["streams"]:
        stream_version = get_stream_version(catalog_entry, state)
        stream = catalog_entry['stream']
        stream_alias = catalog_entry.get('stream_alias')
        stream_name = catalog_entry["tap_stream_id"]
        activate_version_message = singer.ActivateVersionMessage(
            stream=(stream_alias or stream), version=stream_version)

        mdata = metadata.to_map(catalog_entry['metadata'])
        replication_key = mdata.get((), {}).get('replication-key')

        if not stream_is_selected(mdata):
            LOGGER.info("%s: Skipping - not selected", stream_name)
            continue

        if starting_stream:
            if starting_stream == stream_name:
                LOGGER.info("%s: Resuming", stream_name)
                starting_stream = None
            else:
                LOGGER.info("%s: Skipping - already synced", stream_name)
                continue
        else:
            LOGGER.info("%s: Starting", stream_name)

        state["current_stream"] = stream_name
        singer.write_state(state)
        key_properties = mdata.get((), {}).get('table-key-properties')
        singer.write_schema(
            stream,
            catalog_entry['schema'],
            key_properties,
            replication_key,
            stream_alias)

        job_id = singer.get_bookmark(state, catalog_entry['tap_stream_id'], 'JobID')
        if job_id:
            with metrics.record_counter(stream) as counter:
                LOGGER.info("Found JobID from previous Bulk Query. Resuming sync for job: %s", job_id)
                # Resuming a sync should clear out the remaining state once finished
                counter = resume_syncing_bulk_query(sf, catalog_entry, job_id, state, counter)
                LOGGER.info("%s: Completed sync (%s rows)", stream_name, counter.value)
                # Remove Job info from state once we complete this resumed query. One of a few cases could have occurred:
                # 1. The job succeeded, in which case make JobHighestBookmarkSeen the new bookmark
                # 2. The job partially completed, in which case make JobHighestBookmarkSeen the new bookmark, or
                #    existing bookmark if no bookmark exists for the Job.
                # 3. The job completely failed, in which case maintain the existing bookmark, or None if no bookmark
                state.get('bookmarks', {}).get(catalog_entry['tap_stream_id'], {}).pop('JobID', None)
                state.get('bookmarks', {}).get(catalog_entry['tap_stream_id'], {}).pop('BatchIDs', None)
                bookmark = state.get('bookmarks', {}).get(catalog_entry['tap_stream_id'], {}) \
                                                     .pop('JobHighestBookmarkSeen', None)
                existing_bookmark = state.get('bookmarks', {}).get(catalog_entry['tap_stream_id'], {}) \
                                                              .pop(replication_key, None)
                state = singer.write_bookmark(
                    state,
                    catalog_entry['tap_stream_id'],
                    replication_key,
                    bookmark or existing_bookmark) # If job is removed, reset to existing bookmark or None
                singer.write_state(state)
        else:
            # Tables with a replication_key or an empty bookmark will emit an
            # activate_version at the beginning of their sync
            bookmark_is_empty = state.get('bookmarks', {}).get(
                catalog_entry['tap_stream_id']) is None

            if replication_key or bookmark_is_empty:
                singer.write_message(activate_version_message)
                state = singer.write_bookmark(state,
                                              catalog_entry['tap_stream_id'],
                                              'version',
                                              stream_version)
            counter = sync_stream(sf, catalog_entry, state)
            LOGGER.info("%s: Completed sync (%s rows)", stream_name, counter.value)

    state["current_stream"] = None
    singer.write_state(state)
    LOGGER.info("Finished sync")
Example #28
def sync_collection(
    collection: Collection,
    stream: Dict,
    state: Optional[Dict],
) -> None:
    """
    Syncs the stream records incrementally
    Args:
        collection: MongoDB collection instance
        stream: stream dictionary
        state: state dictionary if exists
    """
    LOGGER.info('Starting incremental sync for %s', stream['tap_stream_id'])

    # before writing the table version to state, check if we had one to begin with
    first_run = singer.get_bookmark(state, stream['tap_stream_id'],
                                    'version') is None

    # pick a new table version if last run wasn't interrupted
    if first_run:
        stream_version = int(time.time() * 1000)
    else:
        stream_version = singer.get_bookmark(state, stream['tap_stream_id'],
                                             'version')

    state = singer.write_bookmark(state, stream['tap_stream_id'], 'version',
                                  stream_version)

    activate_version_message = singer.ActivateVersionMessage(
        stream=common.calculate_destination_stream_name(stream),
        version=stream_version)

    # For the initial replication, emit an ACTIVATE_VERSION message
    # at the beginning so the records show up right away.
    if first_run:
        singer.write_message(activate_version_message)

    # get replication key, and bookmarked value/type
    stream_state = state.get('bookmarks', {}).get(stream['tap_stream_id'], {})

    replication_key_name = metadata.to_map(stream['metadata']).get(
        ()).get('replication-key')

    # write state message
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    # create query
    find_filter = {}

    if stream_state.get('replication_key_value'):
        find_filter[replication_key_name] = {}
        find_filter[replication_key_name]['$gt'] = common.string_to_class(
            stream_state.get('replication_key_value'),
            stream_state.get('replication_key_type'))

    # log query
    LOGGER.info('Querying %s with: %s', stream['tap_stream_id'],
                dict(find=find_filter))

    with collection.find(find_filter,
                         sort=[(replication_key_name, pymongo.ASCENDING)
                               ]) as cursor:
        rows_saved = 0
        start_time = time.time()

        for row in cursor:

            singer.write_message(
                common.row_to_singer_record(stream=stream,
                                            row=row,
                                            time_extracted=utils.now(),
                                            time_deleted=None,
                                            version=stream_version))
            rows_saved += 1

            update_bookmark(row, state, stream['tap_stream_id'],
                            replication_key_name)

            if rows_saved % common.UPDATE_BOOKMARK_PERIOD == 0:
                singer.write_message(
                    singer.StateMessage(value=copy.deepcopy(state)))

        common.COUNTS[stream['tap_stream_id']] += rows_saved
        common.TIMES[stream['tap_stream_id']] += time.time() - start_time

    singer.write_message(activate_version_message)

    LOGGER.info('Synced %s records for %s', rows_saved, stream['tap_stream_id'])
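update_bookmark is called above but not defined in this excerpt. A hypothetical reconstruction, assuming it stores the replication-key value together with its type name so string_to_class can rebuild the typed filter on the next run:

def update_bookmark(row, state, tap_stream_id, replication_key_name):
    # Hypothetical sketch: persist the replication-key value and its type
    # name under the stream's bookmark so the next run can rebuild the
    # $gt / $gte filter with the correct type.
    value = row.get(replication_key_name)
    if value is None:
        return state
    bookmark = state.setdefault('bookmarks', {}).setdefault(tap_stream_id, {})
    bookmark['replication_key_name'] = replication_key_name
    bookmark['replication_key_value'] = str(value)
    bookmark['replication_key_type'] = value.__class__.__name__
    return state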
Example #29
def sync_collection(client, stream, state, stream_projection):
    tap_stream_id = stream['tap_stream_id']
    LOGGER.info('Starting oplog sync for %s', tap_stream_id)

    md_map = metadata.to_map(stream['metadata'])
    database_name = metadata.get(md_map, (), 'database-name')
    collection_name = stream.get("table_name")
    stream_state = state.get('bookmarks', {}).get(tap_stream_id)

    oplog_ts = timestamp.Timestamp(stream_state['oplog_ts_time'],
                                   stream_state['oplog_ts_inc'])

    # Write activate version message
    version = common.get_stream_version(tap_stream_id, state)
    activate_version_message = singer.ActivateVersionMessage(
        stream=common.calculate_destination_stream_name(stream),
        version=version)
    singer.write_message(activate_version_message)

    time_extracted = utils.now()
    rows_saved = 0
    start_time = time.time()

    oplog_query = {'ts': {'$gte': oplog_ts}}

    projection = transform_projection(stream_projection)

    oplog_replay = stream_projection is None

    LOGGER.info(
        'Querying %s with:\n\tFind Parameters: %s\n\tProjection: %s\n\toplog_replay: %s',
        tap_stream_id, oplog_query, projection, oplog_replay)

    update_buffer = set()

    # Consider adding oplog_replay, but this would require removing the projection.
    # The default behavior is a non-tailable cursor, but we might want a tailable one
    # regardless of whether it's long-lived or not.
    with client.local.oplog.rs.find(oplog_query,
                                    projection,
                                    sort=[('$natural', pymongo.ASCENDING)],
                                    oplog_replay=oplog_replay) as cursor:
        for row in cursor:
            # assertions that mongo is respecting the ts query and sort order
            if row.get('ts') and row.get('ts') < oplog_ts:
                raise common.MongoAssertionException(
                    "Mongo is not honoring the query param")
            if row.get('ts') and row.get('ts') < timestamp.Timestamp(
                    stream_state['oplog_ts_time'],
                    stream_state['oplog_ts_inc']):
                raise common.MongoAssertionException(
                    "Mongo is not honoring the sort ascending param")

            if row.get('ns') != '{}.{}'.format(database_name, collection_name):
                if row.get('ts'):
                    state = update_bookmarks(state, tap_stream_id, row['ts'])
                continue

            row_op = row['op']
            if row_op == 'i':

                record_message = common.row_to_singer_record(
                    stream, row['o'], version, time_extracted)
                singer.write_message(record_message)

                rows_saved += 1

            elif row_op == 'u':
                update_buffer.add(row['o2']['_id'])

            elif row_op == 'd':

                # remove update from buffer if that document has been deleted
                if row['o']['_id'] in update_buffer:
                    update_buffer.remove(row['o']['_id'])

                # Delete ops only contain the _id of the row deleted
                row['o'][SDC_DELETED_AT] = row['ts']

                record_message = common.row_to_singer_record(
                    stream, row['o'], version, time_extracted)
                singer.write_message(record_message)

                rows_saved += 1

            state = update_bookmarks(state, tap_stream_id, row['ts'])

            # flush buffer if it has filled up
            if len(update_buffer) >= MAX_UPDATE_BUFFER_LENGTH:
                for buffered_row in flush_buffer(client, update_buffer,
                                                 stream_projection,
                                                 database_name,
                                                 collection_name):
                    record_message = common.row_to_singer_record(
                        stream, buffered_row, version, time_extracted)
                    singer.write_message(record_message)

                    rows_saved += 1
                update_buffer = set()

            # write state every UPDATE_BOOKMARK_PERIOD messages
            if rows_saved % common.UPDATE_BOOKMARK_PERIOD == 0:
                # flush buffer before writing state
                for buffered_row in flush_buffer(client, update_buffer,
                                                 stream_projection,
                                                 database_name,
                                                 collection_name):
                    record_message = common.row_to_singer_record(
                        stream, buffered_row, version, time_extracted)
                    singer.write_message(record_message)

                    rows_saved += 1
                update_buffer = set()

                # write state
                singer.write_message(
                    singer.StateMessage(value=copy.deepcopy(state)))

        # flush buffer if finished with oplog
        for buffered_row in flush_buffer(client, update_buffer,
                                         stream_projection, database_name,
                                         collection_name):
            record_message = common.row_to_singer_record(
                stream, buffered_row, version, time_extracted)

            singer.write_message(record_message)
            rows_saved += 1

    common.COUNTS[tap_stream_id] += rows_saved
    common.TIMES[tap_stream_id] += time.time() - start_time
    LOGGER.info('Synced %s records for %s', rows_saved, tap_stream_id)
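flush_buffer is used above but not shown. A hypothetical reconstruction, assuming it re-fetches the current version of every buffered updated document in a single $in query:

def flush_buffer(client, update_buffer, stream_projection,
                 database_name, collection_name):
    # Hypothetical sketch: the oplog only records the _id for updates, so
    # fetch the current state of each buffered document in one round trip.
    if not update_buffer:
        return
    collection = client[database_name][collection_name]
    query = {'_id': {'$in': list(update_buffer)}}
    for document in collection.find(query, stream_projection):
        yield document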
Example #30
def sync_collection(client, stream, state, projection):
    tap_stream_id = stream['tap_stream_id']
    LOGGER.info('Starting incremental sync for %s', tap_stream_id)

    stream_metadata = metadata.to_map(stream['metadata']).get(())
    collection = client[stream_metadata['database-name']][stream['stream']]

    # before writing the table version to state, check if we had one to begin with
    first_run = singer.get_bookmark(state, stream['tap_stream_id'], 'version') is None

    # pick a new table version if the last run wasn't interrupted
    if first_run:
        stream_version = int(time.time() * 1000)
    else:
        stream_version = singer.get_bookmark(state, stream['tap_stream_id'], 'version')

    state = singer.write_bookmark(state,
                                  stream['tap_stream_id'],
                                  'version',
                                  stream_version)

    activate_version_message = singer.ActivateVersionMessage(
        stream=common.calculate_destination_stream_name(stream),
        version=stream_version
    )


    # For the initial replication, emit an ACTIVATE_VERSION message
    # at the beginning so the records show up right away.
    if first_run:
        singer.write_message(activate_version_message)

    # get replication key, and bookmarked value/type
    stream_state = state.get('bookmarks', {}).get(tap_stream_id, {})

    replication_key_name = stream_metadata.get('replication-key')
    replication_key_value_bookmark = stream_state.get('replication_key_value')

    # write state message
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    # create query
    find_filter = {}
    if replication_key_value_bookmark:
        find_filter[replication_key_name] = {}
        find_filter[replication_key_name]['$gte'] = \
            common.string_to_class(replication_key_value_bookmark,
                                   stream_state.get('replication_key_type'))

    # log query
    query_message = 'Querying {} with:\n\tFind Parameters: {}'.format(tap_stream_id, find_filter)
    if projection:
        query_message += '\n\tProjection: {}'.format(projection)
    LOGGER.info(query_message)


    # query collection
    schema = {"type": "object", "properties": {}}
    with collection.find(find_filter,
                         projection,
                         sort=[(replication_key_name, pymongo.ASCENDING)]) as cursor:
        rows_saved = 0
        time_extracted = utils.now()
        start_time = time.time()

        for row in cursor:
            schema_build_start_time = time.time()
            if common.row_to_schema(schema, row):
                singer.write_message(singer.SchemaMessage(
                    stream=common.calculate_destination_stream_name(stream),
                    schema=schema,
                    key_properties=['_id']))
                common.SCHEMA_COUNT[tap_stream_id] += 1
            common.SCHEMA_TIMES[tap_stream_id] += time.time() - schema_build_start_time


            record_message = common.row_to_singer_record(stream,
                                                         row,
                                                         stream_version,
                                                         time_extracted)

            # gen_schema = common.row_to_schema_message(schema, record_message.record, row)
            # if DeepDiff(schema, gen_schema, ignore_order=True) != {}:
            #   emit gen_schema
            #   schema = gen_schema
            singer.write_message(record_message)
            rows_saved += 1

            update_bookmark(row, state, tap_stream_id, replication_key_name)

            if rows_saved % common.UPDATE_BOOKMARK_PERIOD == 0:
                singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))


        common.COUNTS[tap_stream_id] += rows_saved
        common.TIMES[tap_stream_id] += time.time()-start_time

    singer.write_message(activate_version_message)

    LOGGER.info('Synced %s records for %s', rows_saved, tap_stream_id)
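class_to_string and string_to_class appear throughout these Mongo examples but are not shown. A minimal sketch of the round-trip, assuming only a few common types; the real helpers would also need to cover BSON types such as ObjectId and Timestamp:

import uuid
from datetime import datetime

def class_to_string(value, type_name):
    # Serialize a bookmark value to a string; its type name is stored
    # alongside it in state (see the *_type bookmarks above).
    return str(value)

def string_to_class(value, type_name):
    # Rebuild the typed value from its string form and recorded type name.
    converters = {
        'int': int,
        'float': float,
        'str': str,
        'UUID': uuid.UUID,
        'datetime': datetime.fromisoformat,
    }
    return converters[type_name](value)

raw = class_to_string(42, 'int')
assert string_to_class(raw, 'int') == 42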