def sync_contact_lists(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    mdata = metadata.to_map(catalog.get('metadata'))
    schema = load_schema("contact_lists")
    bookmark_key = 'updatedAt'
    singer.write_schema("contact_lists", schema, ["listId"],
                        [bookmark_key], catalog.get('stream_alias'))

    start = get_start(STATE, "contact_lists", bookmark_key)
    max_bk_value = start

    LOGGER.info("sync_contact_lists from %s", start)

    url = get_url("contact_lists")
    params = {'count': 250}
    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
        for row in gen_request(STATE, 'contact_lists', url, params, "lists",
                               "has-more", ["offset"], ["offset"]):
            record = bumble_bee.transform(lift_properties_and_versions(row), schema, mdata)

            if record[bookmark_key] >= start:
                singer.write_record("contact_lists", record,
                                    catalog.get('stream_alias'),
                                    time_extracted=utils.now())
                if record[bookmark_key] >= max_bk_value:
                    max_bk_value = record[bookmark_key]

    STATE = singer.write_bookmark(STATE, 'contact_lists', bookmark_key, max_bk_value)
    singer.write_state(STATE)

    return STATE
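
# The sync functions in this file lean on a get_start() helper that is not
# shown here. A minimal sketch of what it plausibly does, assuming a
# module-level CONFIG dict with a 'start_date' key (both names are
# assumptions, not confirmed by this file):
def get_start(STATE, tap_stream_id, bookmark_key):
    # Return the saved bookmark for this stream, falling back to the
    # configured start date on the first-ever sync.
    current_bookmark = singer.get_bookmark(STATE, tap_stream_id, bookmark_key)
    if current_bookmark is None:
        return CONFIG['start_date']  # assumed config key
    return current_bookmark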
def sync_forms(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    mdata = metadata.to_map(catalog.get('metadata'))
    schema = load_schema("forms")
    bookmark_key = 'updatedAt'

    singer.write_schema("forms", schema, ["guid"],
                        [bookmark_key], catalog.get('stream_alias'))
    start = get_start(STATE, "forms", bookmark_key)
    max_bk_value = start

    LOGGER.info("sync_forms from %s", start)

    data = request(get_url("forms")).json()
    time_extracted = utils.now()

    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
        for row in data:
            record = bumble_bee.transform(lift_properties_and_versions(row), schema, mdata)

            if record[bookmark_key] >= start:
                singer.write_record("forms", record,
                                    catalog.get('stream_alias'),
                                    time_extracted=time_extracted)
                if record[bookmark_key] >= max_bk_value:
                    max_bk_value = record[bookmark_key]

    STATE = singer.write_bookmark(STATE, 'forms', bookmark_key, max_bk_value)
    singer.write_state(STATE)

    return STATE
def sync_owners(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    mdata = metadata.to_map(catalog.get('metadata'))
    schema = load_schema("owners")
    bookmark_key = 'updatedAt'

    singer.write_schema("owners", schema, ["ownerId"],
                        [bookmark_key], catalog.get('stream_alias'))
    start = get_start(STATE, "owners", bookmark_key)
    max_bk_value = start

    LOGGER.info("sync_owners from %s", start)

    params = {}
    if CONFIG.get('include_inactives'):
        params['includeInactives'] = "true"
    data = request(get_url("owners"), params).json()

    time_extracted = utils.now()

    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
        for row in data:
            record = bumble_bee.transform(row, schema, mdata)
            if record[bookmark_key] >= max_bk_value:
                max_bk_value = record[bookmark_key]

            if record[bookmark_key] >= start:
                singer.write_record("owners", record,
                                    catalog.get('stream_alias'),
                                    time_extracted=time_extracted)

    STATE = singer.write_bookmark(STATE, 'owners', bookmark_key, max_bk_value)
    singer.write_state(STATE)

    return STATE
def get_streams_to_sync(streams, state):
    # get selected streams
    selected_streams = [s for s in streams if is_stream_selected(s)]

    # prioritize streams that have not been processed
    streams_with_state = []
    streams_without_state = []
    for stream in selected_streams:
        if state.get('bookmarks', {}).get(stream['tap_stream_id']):
            streams_with_state.append(stream)
        else:
            streams_without_state.append(stream)

    ordered_streams = streams_without_state + streams_with_state

    # If the state says we were in the middle of processing a stream, skip
    # to that stream. Then process streams without prior state and finally
    # move onto streams with state (i.e. have been synced in the past)
    currently_syncing = singer.get_currently_syncing(state)
    if currently_syncing:
        currently_syncing_stream = list(filter(
            lambda s: s['tap_stream_id'] == currently_syncing,
            ordered_streams))
        non_currently_syncing_streams = list(filter(
            lambda s: s['tap_stream_id'] != currently_syncing,
            ordered_streams))
        streams_to_sync = currently_syncing_stream + non_currently_syncing_streams
    else:
        streams_to_sync = ordered_streams

    return streams_to_sync
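
# get_streams_to_sync() above (and get_non_oplog_streams() below) filter on an
# is_stream_selected() predicate defined elsewhere. A minimal sketch, assuming
# the usual Singer convention of a 'selected' flag in the stream's
# empty-breadcrumb metadata entry:
def is_stream_selected(stream):
    # Look up the top-level metadata entry and read its 'selected' flag.
    stream_metadata = metadata.to_map(stream['metadata'])
    return stream_metadata.get((), {}).get('selected', False)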
def _sync_contacts_by_company(STATE, ctx, company_id):
    schema = load_schema(CONTACTS_BY_COMPANY)
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    mdata = metadata.to_map(catalog.get('metadata'))
    url = get_url("contacts_by_company", company_id=company_id)
    path = 'vids'

    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
        with metrics.record_counter(CONTACTS_BY_COMPANY) as counter:
            data = request(url, default_contacts_by_company_params).json()

            if data.get(path) is None:
                raise RuntimeError(
                    "Unexpected API response: {} not in {}".format(path, data.keys()))

            for row in data[path]:
                counter.increment()
                record = {'company-id': company_id, 'contact-id': row}
                record = bumble_bee.transform(
                    lift_properties_and_versions(record), schema, mdata)
                singer.write_record("contacts_by_company", record,
                                    time_extracted=utils.now())

    return STATE
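
# Several HubSpot syncs wrap rows in lift_properties_and_versions() before
# transforming them. A plausible sketch of that helper, assuming it hoists
# each nested 'properties' entry to a top-level 'property_<name>' field and
# collects any version history (the exact behavior is not shown in this file):
def lift_properties_and_versions(record):
    for key, value in record.get('properties', {}).items():
        # Hoist the property to a prefixed top-level key.
        record['property_{}'.format(key)] = value
        versions = value.get('versions')
        if versions:
            # Accumulate version history under a single top-level list.
            record.setdefault('properties_versions', []).extend(versions)
    return record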
def do_sync(conn_config, catalog, default_replication_method, state):
    currently_syncing = singer.get_currently_syncing(state)
    streams = list(filter(is_selected_via_metadata, catalog['streams']))
    streams.sort(key=lambda s: s['tap_stream_id'])
    LOGGER.info("Selected streams: %s ", list(map(lambda s: s['tap_stream_id'], streams)))

    if any_logical_streams(streams, default_replication_method):
        LOGGER.info("Use of logical replication requires fetching an lsn...")
        end_lsn = logical_replication.fetch_current_lsn(conn_config)
        LOGGER.info("End LSN: %s ", end_lsn)
    else:
        end_lsn = None

    sync_method_lookup, traditional_streams, logical_streams = \
        sync_method_for_streams(streams, state, default_replication_method)

    if currently_syncing:
        LOGGER.info("found currently_syncing: %s", currently_syncing)
        currently_syncing_stream = list(filter(
            lambda s: s['tap_stream_id'] == currently_syncing, traditional_streams))
        # list(filter(...)) is never None, so test for an empty list instead
        if not currently_syncing_stream:
            LOGGER.warning("unable to locate currently_syncing(%s) amongst selected traditional streams(%s). will ignore",
                           currently_syncing,
                           list(map(lambda s: s['tap_stream_id'], traditional_streams)))
        other_streams = list(filter(
            lambda s: s['tap_stream_id'] != currently_syncing, traditional_streams))
        traditional_streams = currently_syncing_stream + other_streams
    else:
        LOGGER.info("No currently_syncing found")

    for stream in traditional_streams:
        state = sync_traditional_stream(conn_config, stream, state,
                                        sync_method_lookup[stream['tap_stream_id']], end_lsn)

    logical_streams.sort(key=lambda s: metadata.to_map(s['metadata']).get(()).get('database-name'))
    for dbname, streams in itertools.groupby(
            logical_streams,
            lambda s: metadata.to_map(s['metadata']).get(()).get('database-name')):
        conn_config['dbname'] = dbname
        state = sync_logical_streams(conn_config, list(streams), state, end_lsn)

    return state
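
# The database-flavored do_sync() variants gate LSN/SCN fetching on
# any_logical_streams(). A minimal sketch, assuming replication-method
# resolution mirrors the per-stream metadata lookup used elsewhere in this
# file (dict-style streams; the object-style variants would use s.metadata):
def any_logical_streams(streams, default_replication_method):
    # True if at least one selected stream resolves to LOG_BASED replication.
    for stream in streams:
        stream_metadata = metadata.to_map(stream['metadata'])
        replication_method = stream_metadata.get((), {}).get(
            'replication-method', default_replication_method)
        if replication_method == 'LOG_BASED':
            return True
    return False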
def do_sync(conn_config, catalog, default_replication_method, state):
    currently_syncing = singer.get_currently_syncing(state)
    streams = list(filter(is_selected_via_metadata, catalog.streams))
    streams.sort(key=lambda s: s.tap_stream_id)
    LOGGER.info("Selected streams: %s ", list(map(lambda s: s.tap_stream_id, streams)))

    if any_logical_streams(streams, default_replication_method):
        LOGGER.info("Use of log_miner requires fetching current scn...")
        end_scn = log_miner.fetch_current_scn(conn_config)
        LOGGER.info("End SCN: %s ", end_scn)
    else:
        end_scn = None

    sync_method_lookup, traditional_streams, logical_streams = \
        sync_method_for_streams(streams, state, default_replication_method)

    if currently_syncing:
        LOGGER.info("found currently_syncing: %s", currently_syncing)
        currently_syncing_stream = list(filter(
            lambda s: s.tap_stream_id == currently_syncing, traditional_streams))
        # list(filter(...)) is never None, so test for an empty list instead
        if not currently_syncing_stream:
            LOGGER.warning("unable to locate currently_syncing(%s) amongst selected traditional streams(%s). will ignore",
                           currently_syncing,
                           list(map(lambda s: s.tap_stream_id, traditional_streams)))
        else:
            other_streams = list(filter(
                lambda s: s.tap_stream_id != currently_syncing, traditional_streams))
            traditional_streams = currently_syncing_stream + other_streams
    else:
        LOGGER.info("No currently_syncing found")

    for stream in traditional_streams:
        state = sync_traditional_stream(conn_config, stream, state,
                                        sync_method_lookup[stream.tap_stream_id], end_scn)

    state = sync_log_miner_streams(conn_config, list(logical_streams), state, end_scn)
    return state
def sync_deals(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    mdata = metadata.to_map(catalog.get('metadata'))
    bookmark_key = 'hs_lastmodifieddate'
    start = utils.strptime_with_tz(get_start(STATE, "deals", bookmark_key))
    max_bk_value = start
    LOGGER.info("sync_deals from %s", start)
    most_recent_modified_time = start
    params = {'count': 250,
              'includeAssociations': False,
              'properties': []}

    schema = load_schema("deals")
    singer.write_schema("deals", schema, ["dealId"],
                        [bookmark_key], catalog.get('stream_alias'))

    # Check if we should include associations
    for key in mdata.keys():
        if 'associations' in key:
            assoc_mdata = mdata.get(key)
            if assoc_mdata.get('selected') is True:
                params['includeAssociations'] = True

    # Append all the properties fields for deals to the request
    additional_properties = schema.get("properties").get("properties").get("properties")
    for key in additional_properties.keys():
        params['properties'].append(key)

    url = get_url('deals_all')
    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
        for row in gen_request(STATE, 'deals', url, params, 'deals',
                               "hasMore", ["offset"], ["offset"]):
            row_properties = row['properties']
            modified_time = None
            if bookmark_key in row_properties:
                # Hubspot returns timestamps in millis
                timestamp_millis = row_properties[bookmark_key]['timestamp'] / 1000.0
                modified_time = datetime.datetime.fromtimestamp(timestamp_millis,
                                                                datetime.timezone.utc)
            elif 'createdate' in row_properties:
                # Hubspot returns timestamps in millis
                timestamp_millis = row_properties['createdate']['timestamp'] / 1000.0
                modified_time = datetime.datetime.fromtimestamp(timestamp_millis,
                                                                datetime.timezone.utc)

            if modified_time and modified_time >= max_bk_value:
                max_bk_value = modified_time

            if not modified_time or modified_time >= start:
                record = bumble_bee.transform(row, schema, mdata)
                singer.write_record("deals", record,
                                    catalog.get('stream_alias'),
                                    time_extracted=utils.now())

    STATE = singer.write_bookmark(STATE, 'deals', bookmark_key, utils.strftime(max_bk_value))
    singer.write_state(STATE)
    return STATE
def do_sync(conn_config, catalog, default_replication_method, state):
    streams = list(filter(is_selected_via_metadata, catalog.streams))
    streams.sort(key=lambda s: s.tap_stream_id)

    currently_syncing = singer.get_currently_syncing(state)
    if currently_syncing:
        streams = dropwhile(lambda s: s.tap_stream_id != currently_syncing, streams)

    for stream in streams:
        md_map = metadata.to_map(stream.metadata)
        conn_config['dbname'] = md_map.get(()).get('database-name')
        state = singer.set_currently_syncing(state, stream.tap_stream_id)

        desired_columns = [c for c in stream.schema.properties.keys()
                           if should_sync_column(md_map, c)]
        desired_columns.sort()

        if len(desired_columns) == 0:
            LOGGER.warning('There are no columns selected for stream %s, skipping it',
                           stream.tap_stream_id)
            continue

        replication_method = md_map.get((), {}).get('replication-method',
                                                    default_replication_method)
        if replication_method not in {'LOG_BASED', 'FULL_TABLE', 'INCREMENTAL'}:
            raise Exception("Unrecognized replication_method {}".format(replication_method))

        replication_key = md_map.get((), {}).get('replication-key')
        state = clear_state_on_replication_change(state, stream.tap_stream_id,
                                                  replication_key, replication_method)

        if replication_method == 'LOG_BASED' and md_map.get((), {}).get('is-view'):
            LOGGER.warning('Logical Replication is NOT supported for views. skipping stream %s',
                           stream.tap_stream_id)
            continue

        if replication_method == 'LOG_BASED':
            state = do_sync_logical_replication(conn_config, stream, state,
                                                desired_columns, md_map)
        elif replication_method == 'FULL_TABLE':
            state = do_sync_full_table(conn_config, stream, state,
                                       desired_columns, md_map)
        elif replication_method == 'INCREMENTAL':
            state = do_sync_incremental(conn_config, stream, state,
                                        desired_columns, md_map)

    state = singer.set_currently_syncing(state, None)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
def do_sync(conn_config, catalog, default_replication_method, state, state_file=None):
    """
    Orchestrates sync of all streams
    """
    currently_syncing = singer.get_currently_syncing(state)
    streams = list(filter(is_selected_via_metadata, catalog['streams']))
    streams.sort(key=lambda s: s['tap_stream_id'])
    LOGGER.info("Selected streams: %s ", [s['tap_stream_id'] for s in streams])

    if any_logical_streams(streams, default_replication_method):
        # Use of logical replication requires fetching an lsn
        end_lsn = logical_replication.fetch_current_lsn(conn_config)
        LOGGER.debug("end_lsn = %s ", end_lsn)
    else:
        end_lsn = None

    refresh_streams_schema(conn_config, streams)

    sync_method_lookup, traditional_streams, logical_streams = \
        sync_method_for_streams(streams, state, default_replication_method)

    if currently_syncing:
        LOGGER.debug("Found currently_syncing: %s", currently_syncing)

        currently_syncing_stream = list(
            filter(lambda s: s['tap_stream_id'] == currently_syncing, traditional_streams))

        if not currently_syncing_stream:
            LOGGER.warning("unable to locate currently_syncing(%s) amongst selected traditional streams(%s). "
                           "Will ignore",
                           currently_syncing,
                           {s['tap_stream_id'] for s in traditional_streams})

        other_streams = list(
            filter(lambda s: s['tap_stream_id'] != currently_syncing, traditional_streams))
        traditional_streams = currently_syncing_stream + other_streams
    else:
        LOGGER.info("No streams marked as currently_syncing in state file")

    for stream in traditional_streams:
        state = sync_traditional_stream(conn_config,
                                        stream,
                                        state,
                                        sync_method_lookup[stream['tap_stream_id']],
                                        end_lsn)

    logical_streams.sort(key=lambda s: metadata.to_map(s['metadata']).get(()).get('database-name'))
    for dbname, streams in itertools.groupby(
            logical_streams,
            lambda s: metadata.to_map(s['metadata']).get(()).get('database-name')):
        conn_config['dbname'] = dbname
        state = sync_logical_streams(conn_config, list(streams), state, end_lsn, state_file)

    return state
def sync(client, config, catalog, state):
    # Get selected_streams from catalog, based on state last_stream
    # last_stream = Previous currently synced stream, if the load was interrupted
    last_stream = singer.get_currently_syncing(state)
    LOGGER.info('last/currently syncing stream: {}'.format(last_stream))
    selected_streams = []
    for stream in catalog.get_selected_streams(state):
        selected_streams.append(stream.stream)
    LOGGER.info('selected_streams: {}'.format(selected_streams))

    if not selected_streams:
        return

    # Get the streams to sync (based on dependencies)
    sync_streams = []
    flat_streams = flatten_streams()
    # Loop thru all streams
    for stream_name, stream_metadata in flat_streams.items():
        # If stream has a parent_stream, then it is a child stream
        parent_stream = stream_metadata.get('parent_stream')
        grandparent_stream = stream_metadata.get('grandparent_stream')
        great_grandparent_stream = stream_metadata.get('great_grandparent_stream')
        if stream_name in selected_streams:
            LOGGER.info('stream: {}, parent: {}, grandparent: {}, great_grandparent: {}'.format(
                stream_name, parent_stream, grandparent_stream, great_grandparent_stream))
            if stream_name not in sync_streams:
                sync_streams.append(stream_name)
            if parent_stream and parent_stream not in sync_streams:
                sync_streams.append(parent_stream)
            if grandparent_stream and grandparent_stream not in sync_streams:
                sync_streams.append(grandparent_stream)
            if great_grandparent_stream and great_grandparent_stream not in sync_streams:
                sync_streams.append(great_grandparent_stream)
    LOGGER.info('Sync Streams: {}'.format(sync_streams))

    # Loop through endpoints in selected_streams
    for stream_name, endpoint_config in STREAMS.items():
        if stream_name in sync_streams:
            LOGGER.info('START Syncing: {}'.format(stream_name))
            write_schema(catalog, stream_name)
            update_currently_syncing(state, stream_name)
            total_records = sync_endpoint(client=client,
                                          config=config,
                                          catalog=catalog,
                                          state=state,
                                          stream_name=stream_name,
                                          endpoint_config=endpoint_config,
                                          sync_streams=sync_streams,
                                          selected_streams=selected_streams)
            update_currently_syncing(state, None)
            LOGGER.info('FINISHED Syncing: {}, total_records: {}'.format(
                stream_name, total_records))
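
# Several sync() orchestrators above and below checkpoint their progress via
# update_currently_syncing(). A minimal sketch, assuming the common tap
# pattern of clearing the key when stream_name is None and emitting state
# immediately:
def update_currently_syncing(state, stream_name):
    # Passing None marks the sync as finished; otherwise record the stream.
    if (stream_name is None) and ('currently_syncing' in state):
        del state['currently_syncing']
    else:
        singer.set_currently_syncing(state, stream_name)
    singer.write_state(state)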
def sync_deal_histories(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    mdata = metadata.to_map(catalog.get('metadata'))
    bookmark_key = 'hs_lastmodifieddate'
    start = utils.strptime_with_tz(get_start(STATE, "deal_histories", bookmark_key))
    max_bk_value = start
    LOGGER.info("sync_deal_histories from %s", start)
    most_recent_modified_time = start
    params = {'count': 250,
              'includeAssociations': False,
              'properties': []}

    schema = load_schema("deal_histories")
    singer.write_schema("deal_histories", schema, ["dealId"],
                        [bookmark_key], catalog.get('stream_alias'))

    # Check if we should include associations
    for key in mdata.keys():
        if 'associations' in key:
            assoc_mdata = mdata.get(key)
            if assoc_mdata.get('selected') is True:
                params['includeAssociations'] = True

    # if mdata.get(('properties', 'properties'), {}).get('selected') or has_selected_custom_field(mdata):
    # On 2/12/20, hubspot added a lot of additional properties for
    # deals, and appending all of them to requests ended up leading to
    # 414 (url-too-long) errors. Hubspot recommended we use the
    # `includeAllProperties` and `allpropertiesFetchMode` params
    # instead.
    # params['includeAllProperties'] = True
    # params['allPropertiesFetchMode'] = 'latest_version'

    url = get_url('deals_all')
    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
        for row in gen_request(STATE, 'deal_histories', url, params, 'deals',
                               "hasMore", ["offset"], ["offset"]):
            row_properties = row['properties']
            modified_time = None
            if bookmark_key in row_properties:
                # Hubspot returns timestamps in millis
                timestamp_millis = row_properties[bookmark_key]['timestamp'] / 1000.0
                modified_time = datetime.datetime.fromtimestamp(timestamp_millis,
                                                                datetime.timezone.utc)
            elif 'createdate' in row_properties:
                # Hubspot returns timestamps in millis
                timestamp_millis = row_properties['createdate']['timestamp'] / 1000.0
                modified_time = datetime.datetime.fromtimestamp(timestamp_millis,
                                                                datetime.timezone.utc)

            if modified_time and modified_time >= max_bk_value:
                max_bk_value = modified_time

            if not modified_time or modified_time >= start:
                record = request(
                    url=get_url("deals_detail", deal_id=row['dealId']),
                    params={"includePropertyVersions": "true"}
                ).json()
                record = bumble_bee.transform(lift_properties_and_versions(record), schema, mdata)
                singer.write_record("deal_histories", record,
                                    catalog.get('stream_alias'),
                                    time_extracted=utils.now())

    STATE = singer.write_bookmark(STATE, 'deal_histories', bookmark_key,
                                  utils.strftime(max_bk_value))
    singer.write_state(STATE)
    return STATE
def sync(client, config, catalog, state):
    if 'start_date' in config:
        start_date = config['start_date']

    # Get selected_streams from catalog, based on state last_stream
    # last_stream = Previous currently synced stream, if the load was interrupted
    last_stream = singer.get_currently_syncing(state)
    LOGGER.info('last/currently syncing stream: {}'.format(last_stream))

    selected_streams = []
    flat_streams = flatten_streams()
    for stream in catalog.get_selected_streams(state):
        selected_streams.append(stream.stream)
        parent_stream = flat_streams.get(stream.stream, {}).get('parent_stream')
        if parent_stream:
            if parent_stream not in selected_streams:
                selected_streams.append(parent_stream)
    LOGGER.info('selected_streams: {}'.format(selected_streams))

    if not selected_streams:
        return

    # Loop through selected_streams
    for stream_name, endpoint_config in STREAMS.items():
        if stream_name in selected_streams:
            LOGGER.info('START Syncing: {}'.format(stream_name))
            update_currently_syncing(state, stream_name)
            path = endpoint_config.get('path', stream_name)
            bookmark_field = next(iter(endpoint_config.get('replication_keys', [])), None)
            replication_ind = endpoint_config.get('replication_ind', True)
            if replication_ind:
                selected_fields = get_selected_fields(catalog, stream_name)
                LOGGER.info('Stream: {}, selected_fields: {}'.format(stream_name, selected_fields))
                write_schema(catalog, stream_name)
            else:
                selected_fields = None
            total_records = sync_endpoint(
                client=client,
                catalog=catalog,
                state=state,
                start_date=start_date,
                stream_name=stream_name,
                path=path,
                endpoint_config=endpoint_config,
                static_params=endpoint_config.get('params', {}),
                bookmark_query_field=endpoint_config.get('bookmark_query_field', None),
                bookmark_field=bookmark_field,
                bookmark_type=endpoint_config.get('bookmark_type', None),
                data_key=endpoint_config.get('data_key', stream_name),
                id_fields=endpoint_config.get('key_properties'),
                selected_streams=selected_streams,
                replication_ind=replication_ind)
            update_currently_syncing(state, None)
            LOGGER.info('FINISHED Syncing: {}, total_records: {}'.format(
                stream_name, total_records))
def get_streams(snowflake_conn, catalog, config, state):
    """Returns the Catalog of data we're going to sync for all SELECT-based
    streams (i.e. INCREMENTAL and FULL_TABLE that require a historical sync).

    Using the Catalog provided from the input file, this function will return a
    Catalog representing exactly which tables and columns will be emitted by
    SELECT-based syncs. This is achieved by comparing the input Catalog to a
    freshly discovered Catalog to determine the resulting Catalog.

    The resulting Catalog will include any streams marked as "selected" that
    currently exist in the database. Columns marked as "selected" and those
    labeled "automatic" (e.g. primary keys and replication keys) will be
    included. Streams will be prioritized in the following order:
      1. currently_syncing if it is SELECT-based
      2. any streams that do not have state
      3. any streams that do not have a replication method of LOG_BASED
    """
    discovered = discover_catalog(snowflake_conn, config)

    # Filter catalog to include only selected streams
    # pylint: disable=unnecessary-lambda
    selected_streams = list(filter(lambda s: common.stream_is_selected(s), catalog.streams))
    streams_with_state = []
    streams_without_state = []

    for stream in selected_streams:
        stream_state = state.get('bookmarks', {}).get(stream.tap_stream_id)

        if not stream_state:
            streams_without_state.append(stream)
        else:
            streams_with_state.append(stream)

    # If the state says we were in the middle of processing a stream, skip
    # to that stream. Then process streams without prior state and finally
    # move onto streams with state (i.e. have been synced in the past)
    currently_syncing = singer.get_currently_syncing(state)

    # prioritize streams that have not been processed
    ordered_streams = streams_without_state + streams_with_state

    if currently_syncing:
        currently_syncing_stream = list(
            filter(lambda s: s.tap_stream_id == currently_syncing, streams_with_state))
        non_currently_syncing_streams = list(
            filter(lambda s: s.tap_stream_id != currently_syncing, ordered_streams))
        streams_to_sync = currently_syncing_stream + non_currently_syncing_streams
    else:
        # prioritize streams that have not been processed
        streams_to_sync = ordered_streams

    return resolve_catalog(discovered, streams_to_sync)
def sync_companies(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    mdata = metadata.to_map(catalog.get('metadata'))
    bumble_bee = Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING)
    bookmark_key = 'hs_lastmodifieddate'
    start = utils.strptime_to_utc(get_start(STATE, "companies", bookmark_key))
    LOGGER.info("sync_companies from %s", start)
    schema = load_schema('companies')
    singer.write_schema("companies", schema, ["companyId"],
                        [bookmark_key], catalog.get('stream_alias'))

    # Because this stream doesn't query by `lastUpdated`, it cycles
    # through the data set every time. The issue with this is that there
    # is a race condition by which records may be updated between the
    # start of this table's sync and the end, causing some updates to not
    # be captured. In order to combat this, we must store the current
    # sync's start in the state and not move the bookmark past this value.
    current_sync_start = get_current_sync_start(STATE, "companies") or utils.now()
    STATE = write_current_sync_start(STATE, "companies", current_sync_start)
    singer.write_state(STATE)

    url = get_url("companies_all")
    max_bk_value = start
    if CONTACTS_BY_COMPANY in ctx.selected_stream_ids:
        contacts_by_company_schema = load_schema(CONTACTS_BY_COMPANY)
        singer.write_schema("contacts_by_company", contacts_by_company_schema, ["contact-id"])

    with bumble_bee:
        for row in gen_request(STATE, 'companies', url, default_company_params,
                               'companies', 'has-more', ['offset'], ['offset']):
            row_properties = row['properties']
            modified_time = None
            if bookmark_key in row_properties:
                # Hubspot returns timestamps in millis
                timestamp_millis = row_properties[bookmark_key]['timestamp'] / 1000.0
                modified_time = datetime.datetime.fromtimestamp(timestamp_millis,
                                                                datetime.timezone.utc)
            elif 'createdate' in row_properties:
                # Hubspot returns timestamps in millis
                timestamp_millis = row_properties['createdate']['timestamp'] / 1000.0
                modified_time = datetime.datetime.fromtimestamp(timestamp_millis,
                                                                datetime.timezone.utc)

            if modified_time and modified_time >= max_bk_value:
                max_bk_value = modified_time

            if not modified_time or modified_time >= start:
                record = request(
                    url=get_url("companies_detail", company_id=row['companyId']),
                    params={"includePropertyVersions": "true"}
                ).json()
                record = bumble_bee.transform(lift_properties_and_versions(record), schema, mdata)
                singer.write_record("companies", record,
                                    catalog.get('stream_alias'),
                                    time_extracted=utils.now())

    # Don't bookmark past the start of this sync to account for updated records during the sync.
    new_bookmark = min(max_bk_value, current_sync_start)
    STATE = singer.write_bookmark(STATE, 'companies', bookmark_key, utils.strftime(new_bookmark))
    STATE = write_current_sync_start(STATE, 'companies', None)
    singer.write_state(STATE)
    return STATE
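
# sync_companies() above (and one sync_engagements() variant below) persist
# the wall-clock start of the current sync so the bookmark can be capped at
# it. A minimal sketch of the two helpers, assuming they keep an ISO-8601
# timestamp under a 'current_sync_start' bookmark key (the key name is an
# assumption):
def get_current_sync_start(STATE, tap_stream_id):
    current_sync_start = singer.get_bookmark(STATE, tap_stream_id, 'current_sync_start')
    if current_sync_start is None:
        return None
    return utils.strptime_to_utc(current_sync_start)

def write_current_sync_start(STATE, tap_stream_id, start):
    # Serialize the datetime (or clear the key by writing None when done).
    value = start
    if start is not None:
        value = utils.strftime(start)
    return singer.write_bookmark(STATE, tap_stream_id, 'current_sync_start', value)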
def sync_companies(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    mdata = metadata.to_map(catalog.get('metadata'))
    bumble_bee = Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING)
    bookmark_key = 'hs_lastmodifieddate'
    start = utils.strptime_with_tz(get_start(STATE, "companies", bookmark_key))
    LOGGER.info("sync_companies from %s", start)
    schema = load_schema('companies')
    singer.write_schema("companies", schema, ["companyId"],
                        [bookmark_key], catalog.get('stream_alias'))

    url = get_url("companies_all")
    max_bk_value = start
    if CONTACTS_BY_COMPANY in ctx.selected_stream_ids:
        contacts_by_company_schema = load_schema(CONTACTS_BY_COMPANY)
        singer.write_schema("contacts_by_company", contacts_by_company_schema,
                            ["company-id", "contact-id"])

    with bumble_bee:
        for row in gen_request(STATE, 'companies', url, default_company_params,
                               'companies', 'has-more', ['offset'], ['offset']):
            row_properties = row['properties']
            modified_time = None
            if bookmark_key in row_properties:
                # Hubspot returns timestamps in millis
                timestamp_millis = row_properties[bookmark_key]['timestamp'] / 1000.0
                modified_time = datetime.datetime.fromtimestamp(timestamp_millis,
                                                                datetime.timezone.utc)
            elif 'createdate' in row_properties:
                # Hubspot returns timestamps in millis
                timestamp_millis = row_properties['createdate']['timestamp'] / 1000.0
                modified_time = datetime.datetime.fromtimestamp(timestamp_millis,
                                                                datetime.timezone.utc)

            if modified_time and modified_time >= max_bk_value:
                max_bk_value = modified_time

            if not modified_time or modified_time >= start:
                record = request(get_url("companies_detail",
                                         company_id=row['companyId'])).json()
                record = bumble_bee.transform(record, schema, mdata)
                singer.write_record("companies", record,
                                    catalog.get('stream_alias'),
                                    time_extracted=utils.now())
                if CONTACTS_BY_COMPANY in ctx.selected_stream_ids:
                    # Pass ctx through to match _sync_contacts_by_company's signature above
                    STATE = _sync_contacts_by_company(STATE, ctx, record['companyId'])

    STATE = singer.write_bookmark(STATE, 'companies', bookmark_key,
                                  utils.strftime(max_bk_value))
    singer.write_state(STATE)
    return STATE
def get_streams_to_sync(streams, state):
    target_stream = singer.get_currently_syncing(state)
    result = streams

    if target_stream:
        result = list(itertools.dropwhile(
            lambda x: x.tap_stream_id != target_stream, streams))

    if not result:
        raise Exception('Unknown stream {} in state'.format(target_stream))

    return result
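
# Illustration of the dropwhile-based resume above, using hypothetical stream
# ids (not from any tap in this file): with state pointing at 'b', everything
# before 'b' is skipped and the remainder of the ordered list is kept.
import itertools
from collections import namedtuple

_Stream = namedtuple('Stream', ['tap_stream_id'])
_streams = [_Stream('a'), _Stream('b'), _Stream('c')]
_resumed = list(itertools.dropwhile(lambda x: x.tap_stream_id != 'b', _streams))
assert [s.tap_stream_id for s in _resumed] == ['b', 'c']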
def sync_engagements(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    mdata = metadata.to_map(catalog.get('metadata'))
    schema = load_schema("engagements")
    bookmark_key = 'lastUpdated'
    singer.write_schema("engagements", schema, ["engagement_id"],
                        [bookmark_key], catalog.get('stream_alias'))
    start = get_start(STATE, "engagements", bookmark_key)

    # Because this stream doesn't query by `lastUpdated`, it cycles
    # through the data set every time. The issue with this is that there
    # is a race condition by which records may be updated between the
    # start of this table's sync and the end, causing some updates to not
    # be captured. In order to combat this, we must store the current
    # sync's start in the state and not move the bookmark past this value.
    current_sync_start = get_current_sync_start(STATE, "engagements") or utils.now()
    STATE = write_current_sync_start(STATE, "engagements", current_sync_start)
    singer.write_state(STATE)

    max_bk_value = start
    LOGGER.info("sync_engagements from %s", start)

    STATE = singer.write_bookmark(STATE, 'engagements', bookmark_key, start)
    singer.write_state(STATE)

    url = get_url("engagements_all")
    params = {'limit': 250}
    top_level_key = "results"
    engagements = gen_request(STATE, 'engagements', url, params, top_level_key,
                              "hasMore", ["offset"], ["offset"])

    time_extracted = utils.now()

    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
        for engagement in engagements:
            record = bumble_bee.transform(lift_properties_and_versions(engagement), schema, mdata)
            if record['engagement'][bookmark_key] >= start:
                # hoist PK and bookmark field to top-level record
                record['engagement_id'] = record['engagement']['id']
                record[bookmark_key] = record['engagement'][bookmark_key]
                singer.write_record("engagements", record,
                                    catalog.get('stream_alias'),
                                    time_extracted=time_extracted)
                if record['engagement'][bookmark_key] >= max_bk_value:
                    max_bk_value = record['engagement'][bookmark_key]

    # Don't bookmark past the start of this sync to account for updated records during the sync.
    new_bookmark = min(utils.strptime_to_utc(max_bk_value), current_sync_start)
    STATE = singer.write_bookmark(STATE, 'engagements', bookmark_key, utils.strftime(new_bookmark))
    STATE = write_current_sync_start(STATE, 'engagements', None)
    singer.write_state(STATE)
    return STATE
def sync_subscription_changes(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    STATE = sync_entity_chunked(STATE, catalog, "subscription_changes",
                                ["timestamp", "portalId", "recipient"],
                                "timeline")
    return STATE
def get_streams_to_sync(streams, state):
    '''Get the streams to sync'''
    current_stream = singer.get_currently_syncing(state)
    result = streams
    if current_stream:
        result = list(itertools.dropwhile(
            lambda x: x.tap_stream_id != current_stream, streams))
    if not result:
        raise Exception("Unknown stream {} in state".format(current_stream))
    return result
def get_non_oplog_streams(client, streams, state):
    selected_streams = list(filter(lambda s: is_stream_selected(s), streams))
    streams_with_state = []
    streams_without_state = []

    for stream in selected_streams:
        stream_metadata = metadata.to_map(stream['metadata'])
        replication_method = stream_metadata.get((), {}).get('replication-method')
        state = singer.write_bookmark(state, stream['tap_stream_id'],
                                      'replication_method', replication_method)
        stream_state = state.get('bookmarks', {}).get(stream['tap_stream_id'])

        if not stream_state:
            if replication_method == 'LOG_BASED':
                LOGGER.info("LOG_BASED stream %s requires full historical sync",
                            stream['tap_stream_id'])
            streams_without_state.append(stream)
        elif stream_state and replication_method == 'LOG_BASED' \
                and oplog_stream_requires_historical(stream, state):
            LOGGER.info("LOG_BASED stream %s will resume its historical sync",
                        stream['tap_stream_id'])
            streams_with_state.append(stream)
        elif stream_state and replication_method != 'LOG_BASED':
            streams_with_state.append(stream)

    # If the state says we were in the middle of processing a stream, skip
    # to that stream. Then process streams without prior state and finally
    # move onto streams with state (i.e. have been synced in the past)
    currently_syncing = singer.get_currently_syncing(state)

    # prioritize streams that have not been processed
    ordered_streams = streams_without_state + streams_with_state

    if currently_syncing:
        currently_syncing_stream = list(filter(
            lambda s: s['tap_stream_id'] == currently_syncing
            and is_valid_currently_syncing_stream(s, state),
            streams_with_state))
        non_currently_syncing_streams = list(filter(
            lambda s: s['tap_stream_id'] != currently_syncing, ordered_streams))
        streams_to_sync = currently_syncing_stream + non_currently_syncing_streams
    else:
        # prioritize streams that have not been processed
        streams_to_sync = ordered_streams

    return (streams_to_sync, state)
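
# get_non_oplog_streams() relies on is_valid_currently_syncing_stream() to
# decide whether an interrupted stream is a valid resume target. A plausible
# sketch, assuming a LOG_BASED stream only qualifies while its historical
# (full-table) phase is still incomplete; oplog_stream_requires_historical()
# is defined elsewhere in the tap:
def is_valid_currently_syncing_stream(selected_stream, state):
    stream_metadata = metadata.to_map(selected_stream['metadata'])
    replication_method = stream_metadata.get((), {}).get('replication-method')
    if replication_method != 'LOG_BASED':
        return True
    # LOG_BASED streams resume only if their historical sync is unfinished.
    return oplog_stream_requires_historical(selected_stream, state)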
def do_sync(catalog, sdk_client, state):
    # Get selected_streams from catalog, based on state last_stream
    # last_stream = Previous currently synced stream, if the load was interrupted
    last_stream = singer.get_currently_syncing(state)
    LOGGER.info('last/currently syncing stream: {}'.format(last_stream))

    for stream in catalog.get_selected_streams(state):
        stream_name = stream.stream
        stream_metadata = metadata.to_map(stream.metadata)
        LOGGER.info('Syncing stream %s ...', stream_name)
        sync_stream(stream_name, stream_metadata, sdk_client)
def sync(client, config, catalog, state):
    start_date = config.get('start_date')[:10]
    period_types = config.get('period_types', 'FiscalQuarter')

    # Get selected_streams from catalog, based on state last_stream
    # last_stream = Previous currently synced stream, if the load was interrupted
    last_stream = singer.get_currently_syncing(state)
    LOGGER.info('last/currently syncing stream: {}'.format(last_stream))

    selected_streams = []
    selected_streams_by_name = {}
    for stream in catalog.get_selected_streams(state):
        selected_streams.append(stream.stream)
        selected_streams_by_name[stream.stream] = stream
    LOGGER.info('selected_streams: {}'.format(selected_streams))

    if not selected_streams:
        return

    # Loop through endpoints in selected_streams
    for stream_name, endpoint_config in STREAMS.items():
        if stream_name in selected_streams:
            LOGGER.info('START Syncing: {}'.format(stream_name))
            stream = selected_streams_by_name[stream_name]
            bookmark_field = next(iter(endpoint_config.get('replication_keys', [])), None)
            id_fields = endpoint_config.get('key_properties')
            singer_ops.write_schema(catalog, stream_name)
            total_records = 0
            last_date = singer_ops.get_bookmark(state, stream_name, start_date)

            # Request is made using the current date + 1 as the end period.
            req_state = singer_ops.get_request_state(client=client,
                                                     stream_name=stream_name,
                                                     start_date=start_date,
                                                     last_date=last_date,
                                                     end_date=datetime.now(),
                                                     state=state,
                                                     bookmark_field=bookmark_field,
                                                     id_fields=id_fields,
                                                     period_types=period_types,
                                                     stream=stream,
                                                     catalog=catalog)

            # Main sync routine
            total_records = __sync_endpoint(req_state)

            LOGGER.info('FINISHED Syncing: {}, total_records: {}'.format(
                stream_name, total_records))

    LOGGER.info('sync.py: sync complete')
def sync_deal_pipelines(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    schema = load_schema('deal_pipelines')
    singer.write_schema('deal_pipelines', schema, ['pipelineId'],
                        catalog.get('stream_alias'))
    LOGGER.info('sync_deal_pipelines')
    data = request(get_url('deal_pipelines')).json()
    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
        for row in data:
            record = bumble_bee.transform(row, schema)
            singer.write_record("deal_pipelines", record,
                                catalog.get('stream_alias'),
                                time_extracted=utils.now())
    singer.write_state(STATE)
    return STATE
def sync(client, config, catalog, state):
    start_date = config.get('start_date')
    git_owner = config.get('git_owner')
    git_repository_list = config['git_repositories'].replace(" ", "").split(",")

    # Get selected_streams from catalog, based on state last_stream
    # last_stream = Previous currently synced stream, if the load was interrupted
    last_stream = singer.get_currently_syncing(state)
    LOGGER.info('last/currently syncing stream: %s', last_stream)

    selected_streams = []
    for stream in catalog.get_selected_streams(state):
        selected_streams.append(stream.stream)
    LOGGER.info('selected_streams: %s', selected_streams)

    if not selected_streams:
        return

    # Loop through selected_streams
    for stream_name, endpoint_config in STREAMS.items():
        if stream_name in selected_streams:
            for git_repository in git_repository_list:
                LOGGER.info('START Syncing Repository: %s, Stream: %s',
                            git_repository, stream_name)
                update_currently_syncing(state, stream_name)
                search_path = endpoint_config.get('search_path', stream_name) \
                    .replace('[GIT_OWNER]', git_owner) \
                    .replace('[GIT_REPOSITORY]', git_repository)
                bookmark_field = next(iter(endpoint_config.get('replication_keys', [])), None)
                total_records = sync_endpoint(
                    client=client,
                    catalog=catalog,
                    state=state,
                    start_date=start_date,
                    stream_name=stream_name,
                    search_path=search_path,
                    endpoint_config=endpoint_config,
                    git_owner=git_owner,
                    git_repository=git_repository,
                    bookmark_query_field=endpoint_config.get('bookmark_query_field', None),
                    bookmark_field=bookmark_field,
                    data_key=endpoint_config.get('data_key', stream_name),
                    id_fields=endpoint_config.get('key_properties'),
                    selected_streams=selected_streams)
                update_currently_syncing(state, None)
                LOGGER.info('FINISHED Syncing Repository: %s, Stream: %s, total_records: %s',
                            git_repository, stream_name, total_records)
def sync_engagements(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    mdata = metadata.to_map(catalog.get('metadata'))
    schema = load_schema("engagements")
    bookmark_key = 'lastUpdated'
    singer.write_schema("engagements", schema, ["engagement_id"],
                        [bookmark_key], catalog.get('stream_alias'))
    start = get_start(STATE, "engagements", bookmark_key)

    # Because this stream doesn't query by `lastUpdated`, it cycles
    # through the data set every time. The issue with this is that there
    # is a race condition by which records may be updated between the
    # start of this table's sync and the end, causing some updates to not
    # be captured. In order to combat this, we must save a lookback window
    # that handles the duration of time that this stream was last syncing,
    # and look back by that amount on the next sync.
    last_sync_duration = get_previous_time_window(STATE, "engagements")
    current_sync_start = utils.now()
    if has_bookmark(STATE, "engagements", bookmark_key) and \
            last_sync_duration is not None:
        LOGGER.info(("Last sync of engagements lasted {} seconds. Adjusting bookmark by this "
                     "amount to account for race conditions with record updates.").format(last_sync_duration))
        start = utils.strptime_to_utc(start) - datetime.timedelta(seconds=last_sync_duration)
        start = utils.strftime(start)

    max_bk_value = start
    LOGGER.info("sync_engagements from %s", start)

    STATE = singer.write_bookmark(STATE, 'engagements', bookmark_key, start)
    singer.write_state(STATE)

    url = get_url("engagements_all")
    params = {'limit': 250}
    top_level_key = "results"
    engagements = gen_request(STATE, 'engagements', url, params, top_level_key,
                              "hasMore", ["offset"], ["offset"])

    time_extracted = utils.now()

    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
        for engagement in engagements:
            record = bumble_bee.transform(engagement, schema, mdata)
            if record['engagement'][bookmark_key] >= start:
                # hoist PK and bookmark field to top-level record
                record['engagement_id'] = record['engagement']['id']
                record[bookmark_key] = record['engagement'][bookmark_key]
                singer.write_record("engagements", record,
                                    catalog.get('stream_alias'),
                                    time_extracted=time_extracted)
                if record['engagement'][bookmark_key] >= max_bk_value:
                    max_bk_value = record['engagement'][bookmark_key]

    STATE = singer.write_bookmark(STATE, 'engagements', bookmark_key, max_bk_value)
    # Write duration for next sync's lookback window
    STATE = write_stream_duration(STATE, 'engagements', current_sync_start, utils.now())
    singer.write_state(STATE)
    return STATE
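
# The lookback-window variant of sync_engagements() above records how long
# the previous sync ran so the next run can rewind its bookmark by that
# amount. A minimal sketch of the two helpers, assuming the duration is
# stored in seconds under a 'last_sync_duration' bookmark key (the key name
# is an assumption):
def write_stream_duration(state, tap_stream_id, start, end):
    # Persist the elapsed wall-clock time of this sync, in seconds.
    duration_seconds = (end - start).total_seconds()
    return singer.write_bookmark(state, tap_stream_id, 'last_sync_duration', duration_seconds)

def get_previous_time_window(state, tap_stream_id):
    # None on the first sync, when no duration has been recorded yet.
    return singer.get_bookmark(state, tap_stream_id, 'last_sync_duration')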
def get_streams_to_sync(streams, state):
    target_stream = singer.get_currently_syncing(state)
    result = streams

    if target_stream:
        skipped = list(itertools.takewhile(
            lambda x: x.tap_stream_id != target_stream, streams))
        rest = list(itertools.dropwhile(
            lambda x: x.tap_stream_id != target_stream, streams))
        result = rest + skipped  # Move skipped streams to end

    if not result:
        raise Exception("Unknown stream {} in state".format(target_stream))

    return result
def get_streams_to_sync(streams, state):
    '''Get the streams to sync'''
    current_stream = singer.get_currently_syncing(state)
    result = streams
    if current_stream:
        # Iterate over a snapshot of the keys: popping entries while
        # iterating result.keys() directly raises a RuntimeError.
        for key in list(result.keys()):
            if result[key].tap_stream_id != current_stream:
                result.pop(key, None)
    if not result:
        raise Exception("Unknown stream {} in state".format(current_stream))
    return result
def sync_contacts(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    bookmark_key = "versionTimestamp"
    start = utils.strptime_with_tz(get_start(STATE, "contacts", bookmark_key))
    LOGGER.info("sync_contacts from %s", start)

    max_bk_value = start
    schema = load_schema("contacts")

    singer.write_schema("contacts", schema, ["vid"],
                        [bookmark_key], catalog.get("stream_alias"))

    url = get_url("contacts_all")

    vids = []
    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
        for row in gen_request(STATE, "contacts", url, default_contact_params,
                               "contacts", "has-more", ["vid-offset"], ["vidOffset"]):
            modified_time = None
            if bookmark_key in row:
                modified_time = utils.strptime_with_tz(
                    _transform_datetime(  # pylint: disable=protected-access
                        row[bookmark_key], UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING))

            if not modified_time or modified_time >= start:
                vids.append(row["vid"])

            if modified_time and modified_time >= max_bk_value:
                max_bk_value = modified_time

            if len(vids) == 100:
                _sync_contact_vids(catalog, vids, schema, bumble_bee)
                vids = []

        _sync_contact_vids(catalog, vids, schema, bumble_bee)

    STATE = singer.write_bookmark(STATE, "contacts", bookmark_key,
                                  utils.strftime(max_bk_value))
    singer.write_state(STATE)
    return STATE
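
# sync_contacts() batches up to 100 vids and flushes them through
# _sync_contact_vids(). A plausible sketch of that helper, assuming a bulk
# contacts-detail endpoint keyed by vid (the endpoint name and params here
# are assumptions, not confirmed by this file):
def _sync_contact_vids(catalog, vids, schema, bumble_bee):
    # Nothing to flush on an empty trailing batch.
    if len(vids) == 0:
        return
    data = request(get_url("contacts_detail"),
                   params={'vid': vids, 'showListMemberships': True}).json()
    time_extracted = utils.now()
    mdata = metadata.to_map(catalog.get('metadata'))
    for record in data.values():
        record = bumble_bee.transform(record, schema, mdata)
        singer.write_record("contacts", record, catalog.get('stream_alias'),
                            time_extracted=time_extracted)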
def sync_campaigns(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    schema = load_schema("campaigns")
    singer.write_schema("campaigns", schema, ["id"], catalog.get('stream_alias'))
    LOGGER.info("sync_campaigns(NO bookmarks)")
    url = get_url("campaigns_all")
    params = {'limit': 500}

    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
        for row in gen_request(STATE, 'campaigns', url, params, "campaigns",
                               "hasMore", ["offset"], ["offset"]):
            record = request(get_url("campaigns_detail", campaign_id=row['id'])).json()
            record = bumble_bee.transform(record, schema)
            singer.write_record("campaigns", record,
                                catalog.get('stream_alias'),
                                time_extracted=utils.now())

    return STATE