Example #1
def sync_contact_lists(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    mdata = metadata.to_map(catalog.get('metadata'))
    schema = load_schema("contact_lists")
    bookmark_key = 'updatedAt'
    singer.write_schema("contact_lists", schema, ["listId"], [bookmark_key],
                        catalog.get('stream_alias'))

    start = get_start(STATE, "contact_lists", bookmark_key)
    max_bk_value = start

    LOGGER.info("sync_contact_lists from %s", start)

    url = get_url("contact_lists")
    params = {'count': 250}
    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
        for row in gen_request(STATE, 'contact_lists', url, params, "lists",
                               "has-more", ["offset"], ["offset"]):
            record = bumble_bee.transform(lift_properties_and_versions(row),
                                          schema, mdata)

            if record[bookmark_key] >= start:
                singer.write_record("contact_lists",
                                    record,
                                    catalog.get('stream_alias'),
                                    time_extracted=utils.now())
            if record[bookmark_key] >= max_bk_value:
                max_bk_value = record[bookmark_key]

    STATE = singer.write_bookmark(STATE, 'contact_lists', bookmark_key,
                                  max_bk_value)
    singer.write_state(STATE)

    return STATE
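Note: gen_request is not shown in this listing. Below is a minimal, hypothetical sketch of the offset-based pagination it appears to implement (a "has-more" flag plus offset keys); the parameter names are assumptions, and the real helper also records the offset in the Singer state between pages, which this sketch omits.

import requests

def gen_request_sketch(url, params, path, more_key, offset_keys, offset_targets):
    """Yield rows from an offset-paginated endpoint (illustrative only)."""
    params = dict(params)
    while True:
        data = requests.get(url, params=params).json()
        for row in data[path]:
            yield row
        if not data.get(more_key):
            break
        # Copy the offset value(s) from the response into the next request.
        for source, target in zip(offset_keys, offset_targets):
            params[target] = data[source]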
Example #2
def sync_forms(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    mdata = metadata.to_map(catalog.get('metadata'))
    schema = load_schema("forms")
    bookmark_key = 'updatedAt'

    singer.write_schema("forms", schema, ["guid"], [bookmark_key],
                        catalog.get('stream_alias'))
    start = get_start(STATE, "forms", bookmark_key)
    max_bk_value = start

    LOGGER.info("sync_forms from %s", start)

    data = request(get_url("forms")).json()
    time_extracted = utils.now()

    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
        for row in data:
            record = bumble_bee.transform(lift_properties_and_versions(row),
                                          schema, mdata)

            if record[bookmark_key] >= start:
                singer.write_record("forms",
                                    record,
                                    catalog.get('stream_alias'),
                                    time_extracted=time_extracted)
            if record[bookmark_key] >= max_bk_value:
                max_bk_value = record[bookmark_key]

    STATE = singer.write_bookmark(STATE, 'forms', bookmark_key, max_bk_value)
    singer.write_state(STATE)

    return STATE
Example #3
def sync_owners(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    mdata = metadata.to_map(catalog.get('metadata'))
    schema = load_schema("owners")
    bookmark_key = 'updatedAt'

    singer.write_schema("owners", schema, ["ownerId"], [bookmark_key],
                        catalog.get('stream_alias'))
    start = get_start(STATE, "owners", bookmark_key)
    max_bk_value = start

    LOGGER.info("sync_owners from %s", start)

    params = {}
    if CONFIG.get('include_inactives'):
        params['includeInactives'] = "true"
    data = request(get_url("owners"), params).json()

    time_extracted = utils.now()

    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
        for row in data:
            record = bumble_bee.transform(row, schema, mdata)
            if record[bookmark_key] >= max_bk_value:
                max_bk_value = record[bookmark_key]

            if record[bookmark_key] >= start:
                singer.write_record("owners",
                                    record,
                                    catalog.get('stream_alias'),
                                    time_extracted=time_extracted)

    STATE = singer.write_bookmark(STATE, 'owners', bookmark_key, max_bk_value)
    singer.write_state(STATE)
    return STATE
Example #4
def get_streams_to_sync(streams, state):

    # get selected streams
    selected_streams = [s for s in streams if is_stream_selected(s)]
    # prioritize streams that have not been processed
    streams_with_state = []
    streams_without_state = []
    for stream in selected_streams:
        if state.get('bookmarks', {}).get(stream['tap_stream_id']):
            streams_with_state.append(stream)
        else:
            streams_without_state.append(stream)

    ordered_streams = streams_without_state + streams_with_state

    # If the state says we were in the middle of processing a stream, skip
    # to that stream. Then process streams without prior state and finally
    # move on to streams with state (i.e. streams that have been synced in the past).
    currently_syncing = singer.get_currently_syncing(state)
    if currently_syncing:
        currently_syncing_stream = list(filter(
            lambda s: s['tap_stream_id'] == currently_syncing,
            ordered_streams))
        non_currently_syncing_streams = list(filter(lambda s: s['tap_stream_id']
                                                    != currently_syncing,
                                                    ordered_streams))

        streams_to_sync = currently_syncing_stream + non_currently_syncing_streams
    else:
        streams_to_sync = ordered_streams

    return streams_to_sync
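For reference, a self-contained sketch of the same prioritization with made-up streams and state (it assumes every stream is selected):

streams = [
    {'tap_stream_id': 'orders'},     # has a bookmark    -> synced before
    {'tap_stream_id': 'customers'},  # no bookmark       -> never synced
    {'tap_stream_id': 'invoices'},   # currently_syncing -> interrupted
]
state = {
    'bookmarks': {'orders': {'updated_at': '2021-01-01T00:00:00Z'}},
    'currently_syncing': 'invoices',
}

with_state = [s for s in streams if state['bookmarks'].get(s['tap_stream_id'])]
without_state = [s for s in streams if not state['bookmarks'].get(s['tap_stream_id'])]
ordered = without_state + with_state

current = state['currently_syncing']
ordered = ([s for s in ordered if s['tap_stream_id'] == current]
           + [s for s in ordered if s['tap_stream_id'] != current])

print([s['tap_stream_id'] for s in ordered])
# -> ['invoices', 'customers', 'orders']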
Example #5
def _sync_contacts_by_company(STATE, ctx, company_id):
    schema = load_schema(CONTACTS_BY_COMPANY)
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    mdata = metadata.to_map(catalog.get('metadata'))
    url = get_url("contacts_by_company", company_id=company_id)
    path = 'vids'
    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
        with metrics.record_counter(CONTACTS_BY_COMPANY) as counter:
            data = request(url, default_contacts_by_company_params).json()

            if data.get(path) is None:
                raise RuntimeError(
                    "Unexpected API response: {} not in {}".format(
                        path, data.keys()))

            for row in data[path]:
                counter.increment()
                record = {'company-id': company_id, 'contact-id': row}
                record = bumble_bee.transform(
                    lift_properties_and_versions(record), schema, mdata)
                singer.write_record("contacts_by_company",
                                    record,
                                    time_extracted=utils.now())

    return STATE
Example #6
def do_sync(conn_config, catalog, default_replication_method, state):
    currently_syncing = singer.get_currently_syncing(state)
    streams = list(filter(is_selected_via_metadata, catalog['streams']))
    streams.sort(key=lambda s: s['tap_stream_id'])
    LOGGER.info("Selected streams: %s ", list(map(lambda s: s['tap_stream_id'], streams)))
    if any_logical_streams(streams, default_replication_method):
        LOGGER.info("Use of logical replication requires fetching an lsn...")
        end_lsn = logical_replication.fetch_current_lsn(conn_config)
        LOGGER.info("End LSN: %s ", end_lsn)
    else:
        end_lsn = None

    sync_method_lookup, traditional_streams, logical_streams = sync_method_for_streams(streams, state, default_replication_method)

    if currently_syncing:
        LOGGER.info("found currently_syncing: %s", currently_syncing)
        currently_syncing_stream = list(filter(lambda s: s['tap_stream_id'] == currently_syncing, traditional_streams))
        if not currently_syncing_stream:
            LOGGER.warning("unable to locate currently_syncing(%s) amongst selected traditional streams(%s). will ignore", currently_syncing, list(map(lambda s: s['tap_stream_id'], traditional_streams)))
        other_streams = list(filter(lambda s: s['tap_stream_id'] != currently_syncing, traditional_streams))
        traditional_streams = currently_syncing_stream + other_streams
    else:
        LOGGER.info("No currently_syncing found")

    for stream in traditional_streams:
        state = sync_traditional_stream(conn_config, stream, state, sync_method_lookup[stream['tap_stream_id']], end_lsn)

    logical_streams.sort(key=lambda s: metadata.to_map(s['metadata']).get(()).get('database-name'))
    for dbname, streams in itertools.groupby(logical_streams, lambda s: metadata.to_map(s['metadata']).get(()).get('database-name')):
        conn_config['dbname'] = dbname
        state = sync_logical_streams(conn_config, list(streams), state, end_lsn)
    return state
Example #7
def do_sync(conn_config, catalog, default_replication_method, state):
   currently_syncing = singer.get_currently_syncing(state)
   streams = list(filter(is_selected_via_metadata, catalog.streams))
   streams.sort(key=lambda s: s.tap_stream_id)
   LOGGER.info("Selected streams: %s ", list(map(lambda s: s.tap_stream_id, streams)))

   if any_logical_streams(streams, default_replication_method):
      LOGGER.info("Use of log_miner requires fetching current scn...")
      end_scn = log_miner.fetch_current_scn(conn_config)
      LOGGER.info("End SCN: %s ", end_scn)
   else:
      end_scn = None

   sync_method_lookup, traditional_streams, logical_streams = sync_method_for_streams(streams, state, default_replication_method)

   if currently_syncing:
      LOGGER.info("found currently_syncing: %s", currently_syncing)
      currently_syncing_stream = list(filter(lambda s: s.tap_stream_id == currently_syncing, traditional_streams))
      if not currently_syncing_stream:
         LOGGER.warning("unable to locate currently_syncing(%s) amongst selected traditional streams(%s). will ignore", currently_syncing, list(map(lambda s: s.tap_stream_id, traditional_streams)))
      else:
         other_streams = list(filter(lambda s: s.tap_stream_id != currently_syncing, traditional_streams))
         traditional_streams = currently_syncing_stream + other_streams
   else:
      LOGGER.info("No currently_syncing found")

   for stream in traditional_streams:
      state = sync_traditional_stream(conn_config, stream, state, sync_method_lookup[stream.tap_stream_id], end_scn)

   state = sync_log_miner_streams(conn_config, list(logical_streams), state, end_scn)
   return state
Example #8
def sync_deals(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    mdata = metadata.to_map(catalog.get('metadata'))
    bookmark_key = 'hs_lastmodifieddate'
    start = utils.strptime_with_tz(get_start(STATE, "deals", bookmark_key))
    max_bk_value = start
    LOGGER.info("sync_deals from %s", start)
    most_recent_modified_time = start
    params = {'count': 250, 'includeAssociations': False, 'properties': []}

    schema = load_schema("deals")
    singer.write_schema("deals", schema, ["dealId"], [bookmark_key],
                        catalog.get('stream_alias'))

    # Check if we should include associations
    for key in mdata.keys():
        if 'associations' in key:
            assoc_mdata = mdata.get(key)
            if assoc_mdata.get('selected') is True:
                params['includeAssociations'] = True

    # Append all the properties fields for deals to the request
    additional_properties = schema.get("properties").get("properties").get(
        "properties")
    for key in additional_properties.keys():
        params['properties'].append(key)

    url = get_url('deals_all')
    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
        for row in gen_request(STATE, 'deals', url, params, 'deals', "hasMore",
                               ["offset"], ["offset"]):
            row_properties = row['properties']
            modified_time = None
            if bookmark_key in row_properties:
                # Hubspot returns timestamps in millis
                timestamp_millis = row_properties[bookmark_key][
                    'timestamp'] / 1000.0
                modified_time = datetime.datetime.fromtimestamp(
                    timestamp_millis, datetime.timezone.utc)
            elif 'createdate' in row_properties:
                # Hubspot returns timestamps in millis
                timestamp_millis = row_properties['createdate'][
                    'timestamp'] / 1000.0
                modified_time = datetime.datetime.fromtimestamp(
                    timestamp_millis, datetime.timezone.utc)
            if modified_time and modified_time >= max_bk_value:
                max_bk_value = modified_time

            if not modified_time or modified_time >= start:
                record = bumble_bee.transform(row, schema, mdata)
                singer.write_record("deals",
                                    record,
                                    catalog.get('stream_alias'),
                                    time_extracted=utils.now())

    STATE = singer.write_bookmark(STATE, 'deals', bookmark_key,
                                  utils.strftime(max_bk_value))
    singer.write_state(STATE)
    return STATE
Example #9
def do_sync(conn_config, catalog, default_replication_method, state):
    streams = list(filter(is_selected_via_metadata, catalog.streams))
    streams.sort(key=lambda s: s.tap_stream_id)

    currently_syncing = singer.get_currently_syncing(state)

    if currently_syncing:
        streams = dropwhile(lambda s: s.tap_stream_id != currently_syncing,
                            streams)

    for stream in streams:
        md_map = metadata.to_map(stream.metadata)
        conn_config['dbname'] = md_map.get(()).get('database-name')
        state = singer.set_currently_syncing(state, stream.tap_stream_id)

        desired_columns = [
            c for c in stream.schema.properties.keys()
            if should_sync_column(md_map, c)
        ]
        desired_columns.sort()

        if len(desired_columns) == 0:
            LOGGER.warning(
                'There are no columns selected for stream %s, skipping it',
                stream.tap_stream_id)
            continue

        replication_method = md_map.get(
            (), {}).get('replication-method', default_replication_method)
        if replication_method not in set(
            ['LOG_BASED', 'FULL_TABLE', 'INCREMENTAL']):
            raise Exception("Unrecognized replication_method {}".format(
                replication_method))

        replication_key = md_map.get((), {}).get('replication-key')

        state = clear_state_on_replication_change(state, stream.tap_stream_id,
                                                  replication_key,
                                                  replication_method)

        if replication_method == 'LOG_BASED' and md_map.get(
            (), {}).get('is-view'):
            LOGGER.warning(
                'Logical Replication is NOT supported for views. skipping stream %s',
                stream.tap_stream_id)
            continue

        if replication_method == 'LOG_BASED':
            state = do_sync_logical_replication(conn_config, stream, state,
                                                desired_columns, md_map)
        elif replication_method == 'FULL_TABLE':
            state = do_sync_full_table(conn_config, stream, state,
                                       desired_columns, md_map)
        elif replication_method == 'INCREMENTAL':
            state = do_sync_incremental(conn_config, stream, state,
                                        desired_columns, md_map)

        state = singer.set_currently_syncing(state, None)
        singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
Example #10
def do_sync(conn_config,
            catalog,
            default_replication_method,
            state,
            state_file=None):
    """
    Orchestrates sync of all streams
    """
    currently_syncing = singer.get_currently_syncing(state)
    streams = list(filter(is_selected_via_metadata, catalog['streams']))
    streams.sort(key=lambda s: s['tap_stream_id'])
    LOGGER.info("Selected streams: %s ", [s['tap_stream_id'] for s in streams])
    if any_logical_streams(streams, default_replication_method):
        # Use of logical replication requires fetching an lsn
        end_lsn = logical_replication.fetch_current_lsn(conn_config)
        LOGGER.debug("end_lsn = %s ", end_lsn)
    else:
        end_lsn = None

    refresh_streams_schema(conn_config, streams)

    sync_method_lookup, traditional_streams, logical_streams = \
        sync_method_for_streams(streams, state, default_replication_method)

    if currently_syncing:
        LOGGER.debug("Found currently_syncing: %s", currently_syncing)

        currently_syncing_stream = list(
            filter(lambda s: s['tap_stream_id'] == currently_syncing,
                   traditional_streams))

        if not currently_syncing_stream:
            LOGGER.warning(
                "unable to locate currently_syncing(%s) amongst selected traditional streams(%s). "
                "Will ignore", currently_syncing,
                {s['tap_stream_id']
                 for s in traditional_streams})

        other_streams = list(
            filter(lambda s: s['tap_stream_id'] != currently_syncing,
                   traditional_streams))
        traditional_streams = currently_syncing_stream + other_streams
    else:
        LOGGER.info("No streams marked as currently_syncing in state file")

    for stream in traditional_streams:
        state = sync_traditional_stream(
            conn_config, stream, state,
            sync_method_lookup[stream['tap_stream_id']], end_lsn)

    logical_streams.sort(key=lambda s: metadata.to_map(s['metadata']).get(
        ()).get('database-name'))
    for dbname, streams in itertools.groupby(
            logical_streams, lambda s: metadata.to_map(s['metadata']).get(
                ()).get('database-name')):
        conn_config['dbname'] = dbname
        state = sync_logical_streams(conn_config, list(streams), state,
                                     end_lsn, state_file)
    return state
Example #11
def sync(client, config, catalog, state):
    # Get selected_streams from catalog, based on state last_stream
    #   last_stream = Previous currently synced stream, if the load was interrupted
    last_stream = singer.get_currently_syncing(state)
    LOGGER.info('last/currently syncing stream: {}'.format(last_stream))
    selected_streams = []
    for stream in catalog.get_selected_streams(state):
        selected_streams.append(stream.stream)
    LOGGER.info('selected_streams: {}'.format(selected_streams))
    if not selected_streams:
        return

    # Get the streams to sync (based on dependencies)
    sync_streams = []
    flat_streams = flatten_streams()
    # Loop through all streams
    for stream_name, stream_metadata in flat_streams.items():
        # If stream has a parent_stream, then it is a child stream
        parent_stream = stream_metadata.get('parent_stream')
        grandparent_stream = stream_metadata.get('grandparent_stream')
        great_grandparent_stream = stream_metadata.get(
            'great_grandparent_stream')

        if stream_name in selected_streams:
            LOGGER.info(
                'stream: {}, parent: {}, grandparent: {}, great_grandparent: {}'
                .format(stream_name, parent_stream, grandparent_stream,
                        great_grandparent_stream))
            if stream_name not in sync_streams:
                sync_streams.append(stream_name)
            if parent_stream and parent_stream not in sync_streams:
                sync_streams.append(parent_stream)
            if grandparent_stream and grandparent_stream not in sync_streams:
                sync_streams.append(grandparent_stream)
            if great_grandparent_stream and great_grandparent_stream not in sync_streams:
                sync_streams.append(great_grandparent_stream)
    LOGGER.info('Sync Streams: {}'.format(sync_streams))

    # Loop through selected_streams
    # Loop through endpoints in selected_streams
    for stream_name, endpoint_config in STREAMS.items():
        if stream_name in sync_streams:
            LOGGER.info('START Syncing: {}'.format(stream_name))
            write_schema(catalog, stream_name)
            update_currently_syncing(state, stream_name)

            total_records = sync_endpoint(client=client,
                                          config=config,
                                          catalog=catalog,
                                          state=state,
                                          stream_name=stream_name,
                                          endpoint_config=endpoint_config,
                                          sync_streams=sync_streams,
                                          selected_streams=selected_streams)

            update_currently_syncing(state, None)
            LOGGER.info('FINISHED Syncing: {}, total_records: {}'.format(
                stream_name, total_records))
Example #12
def sync_deal_histories(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    mdata = metadata.to_map(catalog.get('metadata'))
    bookmark_key = 'hs_lastmodifieddate'
    start = utils.strptime_with_tz(get_start(STATE, "deal_histories", bookmark_key))
    max_bk_value = start
    LOGGER.info("sync_deal_histories from %s", start)
    most_recent_modified_time = start
    params = {'count': 250,
              'includeAssociations': False,
              'properties': []}

    schema = load_schema("deal_histories")
    singer.write_schema("deal_histories", schema, ["dealId"], [bookmark_key], catalog.get('stream_alias'))

    # Check if we should include associations
    for key in mdata.keys():
        if 'associations' in key:
            assoc_mdata = mdata.get(key)
            if assoc_mdata.get('selected') is True:
                params['includeAssociations'] = True

    # if mdata.get(('properties', 'properties'), {}).get('selected') or has_selected_custom_field(mdata):
        # On 2/12/20, hubspot added a lot of additional properties for
        # deals, and appending all of them to requests ended up leading to
        # 414 (url-too-long) errors. Hubspot recommended we use the
        # `includeAllProperties` and `allpropertiesFetchMode` params
        # instead.
        # params['includeAllProperties'] = True
        # params['allPropertiesFetchMode'] = 'latest_version'

    url = get_url('deals_all')
    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
        for row in gen_request(STATE, 'deal_histories', url, params, 'deals', "hasMore", ["offset"], ["offset"]):
            row_properties = row['properties']
            modified_time = None
            if bookmark_key in row_properties:
                # Hubspot returns timestamps in millis
                timestamp_millis = row_properties[bookmark_key]['timestamp'] / 1000.0
                modified_time = datetime.datetime.fromtimestamp(timestamp_millis, datetime.timezone.utc)
            elif 'createdate' in row_properties:
                # Hubspot returns timestamps in millis
                timestamp_millis = row_properties['createdate']['timestamp'] / 1000.0
                modified_time = datetime.datetime.fromtimestamp(timestamp_millis, datetime.timezone.utc)
            if modified_time and modified_time >= max_bk_value:
                max_bk_value = modified_time

            if not modified_time or modified_time >= start:
                record = request(
                    url=get_url("deals_detail", deal_id=row['dealId']),
                    params={"includePropertyVersions": "true"}
                ).json()
                record = bumble_bee.transform(lift_properties_and_versions(record), schema, mdata)
                singer.write_record("deal_histories", record, catalog.get('stream_alias'), time_extracted=utils.now())

    STATE = singer.write_bookmark(STATE, 'deal_histories', bookmark_key, utils.strftime(max_bk_value))
    singer.write_state(STATE)
    return STATE
Example #13
def sync(client, config, catalog, state):
    start_date = config.get('start_date')

    # Get selected_streams from catalog, based on state last_stream
    #   last_stream = Previous currently synced stream, if the load was interrupted
    last_stream = singer.get_currently_syncing(state)
    LOGGER.info('last/currently syncing stream: {}'.format(last_stream))
    selected_streams = []

    flat_streams = flatten_streams()

    for stream in catalog.get_selected_streams(state):
        selected_streams.append(stream.stream)
        parent_stream = flat_streams.get(stream.stream, {}).get('parent_stream')
        if parent_stream:
            if parent_stream not in selected_streams:
                selected_streams.append(parent_stream)
    LOGGER.info('selected_streams: {}'.format(selected_streams))

    if not selected_streams:
        return

    # Loop through selected_streams
    for stream_name, endpoint_config in STREAMS.items():
        if stream_name in selected_streams:
            LOGGER.info('START Syncing: {}'.format(stream_name))
            update_currently_syncing(state, stream_name)
            path = endpoint_config.get('path', stream_name)
            bookmark_field = next(iter(endpoint_config.get('replication_keys', [])), None)
            replication_ind = endpoint_config.get('replication_ind', True)
            if replication_ind:
                selected_fields = get_selected_fields(catalog, stream_name)
                LOGGER.info('Stream: {}, selected_fields: {}'.format(stream_name, selected_fields))
                write_schema(catalog, stream_name)
            else:
                selected_fields = None
            total_records = sync_endpoint(
                client=client,
                catalog=catalog,
                state=state,
                start_date=start_date,
                stream_name=stream_name,
                path=path,
                endpoint_config=endpoint_config,
                static_params=endpoint_config.get('params', {}),
                bookmark_query_field=endpoint_config.get('bookmark_query_field', None),
                bookmark_field=bookmark_field,
                bookmark_type=endpoint_config.get('bookmark_type', None),
                data_key=endpoint_config.get('data_key', stream_name),
                id_fields=endpoint_config.get('key_properties'),
                selected_streams=selected_streams,
                replication_ind=replication_ind)

            update_currently_syncing(state, None)
            LOGGER.info('FINISHED Syncing: {}, total_records: {}'.format(
                stream_name,
                total_records))
Example #14
def get_streams(snowflake_conn, catalog, config, state):
    """Returns the Catalog of data we're going to sync for all SELECT-based
    streams (i.e. INCREMENTAL and FULL_TABLE that require a historical
    sync).

    Using the Catalog provided from the input file, this function will return a
    Catalog representing exactly which tables and columns that will be emitted
    by SELECT-based syncs. This is achieved by comparing the input Catalog to a
    freshly discovered Catalog to determine the resulting Catalog.

    The resulting Catalog will include any streams marked as "selected" that
    currently exist in the database. Columns marked as "selected" and those
    labeled "automatic" (e.g. primary keys and replication keys) will be
    included. Streams will be prioritized in the following order:
      1. currently_syncing if it is SELECT-based
      2. any streams that do not have state
      3. any streams that do not have a replication method of LOG_BASED
    """
    discovered = discover_catalog(snowflake_conn, config)

    # Filter catalog to include only selected streams
    # pylint: disable=unnecessary-lambda
    selected_streams = list(
        filter(lambda s: common.stream_is_selected(s), catalog.streams))
    streams_with_state = []
    streams_without_state = []

    for stream in selected_streams:
        stream_state = state.get('bookmarks', {}).get(stream.tap_stream_id)

        if not stream_state:
            streams_without_state.append(stream)
        else:
            streams_with_state.append(stream)

    # If the state says we were in the middle of processing a stream, skip
    # to that stream. Then process streams without prior state and finally
    # move on to streams with state (i.e. streams that have been synced in the past).
    currently_syncing = singer.get_currently_syncing(state)

    # prioritize streams that have not been processed
    ordered_streams = streams_without_state + streams_with_state

    if currently_syncing:
        currently_syncing_stream = list(
            filter(lambda s: s.tap_stream_id == currently_syncing,
                   streams_with_state))

        non_currently_syncing_streams = list(
            filter(lambda s: s.tap_stream_id != currently_syncing,
                   ordered_streams))

        streams_to_sync = currently_syncing_stream + non_currently_syncing_streams
    else:
        # prioritize streams that have not been processed
        streams_to_sync = ordered_streams

    return resolve_catalog(discovered, streams_to_sync)
Example #15
def sync_companies(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    mdata = metadata.to_map(catalog.get('metadata'))
    bumble_bee = Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING)
    bookmark_key = 'hs_lastmodifieddate'
    start = utils.strptime_to_utc(get_start(STATE, "companies", bookmark_key))
    LOGGER.info("sync_companies from %s", start)
    schema = load_schema('companies')
    singer.write_schema("companies", schema, ["companyId"], [bookmark_key], catalog.get('stream_alias'))

    # Because this stream doesn't query by `lastUpdated`, it cycles
    # through the entire data set on every run. This creates a race
    # condition: records may be updated between the start of this
    # table's sync and its end, causing some updates to be missed. To
    # combat this, we store the current sync's start in the state and
    # never move the bookmark past that value.
    current_sync_start = get_current_sync_start(STATE, "companies") or utils.now()
    STATE = write_current_sync_start(STATE, "companies", current_sync_start)
    singer.write_state(STATE)

    url = get_url("companies_all")
    max_bk_value = start
    if CONTACTS_BY_COMPANY in ctx.selected_stream_ids:
        contacts_by_company_schema = load_schema(CONTACTS_BY_COMPANY)
        singer.write_schema("contacts_by_company", contacts_by_company_schema, ["contact-id"])

    with bumble_bee:
        for row in gen_request(STATE, 'companies', url, default_company_params, 'companies', 'has-more', ['offset'],
                               ['offset']):
            row_properties = row['properties']
            modified_time = None
            if bookmark_key in row_properties:
                # Hubspot returns timestamps in millis
                timestamp_millis = row_properties[bookmark_key]['timestamp'] / 1000.0
                modified_time = datetime.datetime.fromtimestamp(timestamp_millis, datetime.timezone.utc)
            elif 'createdate' in row_properties:
                # Hubspot returns timestamps in millis
                timestamp_millis = row_properties['createdate']['timestamp'] / 1000.0
                modified_time = datetime.datetime.fromtimestamp(timestamp_millis, datetime.timezone.utc)

            if modified_time and modified_time >= max_bk_value:
                max_bk_value = modified_time

            if not modified_time or modified_time >= start:
                record = request(
                    url=get_url("companies_detail", company_id=row['companyId']),
                    params={"includePropertyVersions": "true"}
                ).json()
                record = bumble_bee.transform(lift_properties_and_versions(record), schema, mdata)
                singer.write_record("companies", record, catalog.get('stream_alias'), time_extracted=utils.now())

    # Don't bookmark past the start of this sync to account for updated records during the sync.
    new_bookmark = min(max_bk_value, current_sync_start)
    STATE = singer.write_bookmark(STATE, 'companies', bookmark_key, utils.strftime(new_bookmark))
    STATE = write_current_sync_start(STATE, 'companies', None)
    singer.write_state(STATE)
    return STATE
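The helpers get_current_sync_start and write_current_sync_start are not part of this listing. A minimal sketch of what they could look like, assuming they keep a single ISO-8601 timestamp per stream in the Singer state:

import singer
from singer import utils

def get_current_sync_start(state, tap_stream_id):
    # Hypothetical: read the stored sync start, if any, as an aware datetime.
    value = singer.get_bookmark(state, tap_stream_id, 'current_sync_start')
    return utils.strptime_to_utc(value) if value else None

def write_current_sync_start(state, tap_stream_id, start):
    # Hypothetical: store (or clear) the sync start as an ISO-8601 string.
    value = utils.strftime(start) if start is not None else None
    return singer.write_bookmark(state, tap_stream_id, 'current_sync_start', value)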
Example #16
def sync_companies(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    mdata = metadata.to_map(catalog.get('metadata'))
    bumble_bee = Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING)
    bookmark_key = 'hs_lastmodifieddate'
    start = utils.strptime_with_tz(get_start(STATE, "companies", bookmark_key))
    LOGGER.info("sync_companies from %s", start)
    schema = load_schema('companies')
    singer.write_schema("companies", schema, ["companyId"], [bookmark_key],
                        catalog.get('stream_alias'))

    url = get_url("companies_all")
    max_bk_value = start
    if CONTACTS_BY_COMPANY in ctx.selected_stream_ids:
        contacts_by_company_schema = load_schema(CONTACTS_BY_COMPANY)
        singer.write_schema("contacts_by_company", contacts_by_company_schema,
                            ["company-id", "contact-id"])

    with bumble_bee:
        for row in gen_request(STATE, 'companies', url, default_company_params,
                               'companies', 'has-more', ['offset'],
                               ['offset']):
            row_properties = row['properties']
            modified_time = None
            if bookmark_key in row_properties:
                # Hubspot returns timestamps in millis
                timestamp_millis = row_properties[bookmark_key][
                    'timestamp'] / 1000.0
                modified_time = datetime.datetime.fromtimestamp(
                    timestamp_millis, datetime.timezone.utc)
            elif 'createdate' in row_properties:
                # Hubspot returns timestamps in millis
                timestamp_millis = row_properties['createdate'][
                    'timestamp'] / 1000.0
                modified_time = datetime.datetime.fromtimestamp(
                    timestamp_millis, datetime.timezone.utc)

            if modified_time and modified_time >= max_bk_value:
                max_bk_value = modified_time

            if not modified_time or modified_time >= start:
                record = request(
                    get_url("companies_detail",
                            company_id=row['companyId'])).json()
                record = bumble_bee.transform(record, schema, mdata)
                singer.write_record("companies",
                                    record,
                                    catalog.get('stream_alias'),
                                    time_extracted=utils.now())
                if CONTACTS_BY_COMPANY in ctx.selected_stream_ids:
                    STATE = _sync_contacts_by_company(STATE,
                                                      record['companyId'])

    STATE = singer.write_bookmark(STATE, 'companies', bookmark_key,
                                  utils.strftime(max_bk_value))
    singer.write_state(STATE)
    return STATE
Example #17
def get_streams_to_sync(streams, state):
    target_stream = singer.get_currently_syncing(state)
    result = streams
    if target_stream:
        result = list(itertools.dropwhile(
            lambda x: x.tap_stream_id != target_stream, streams))
    if not result:
        raise Exception('Unknown stream {} in state'.format(target_stream))
    return result
Example #18
def sync_engagements(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    mdata = metadata.to_map(catalog.get('metadata'))
    schema = load_schema("engagements")
    bookmark_key = 'lastUpdated'
    singer.write_schema("engagements", schema, ["engagement_id"],
                        [bookmark_key], catalog.get('stream_alias'))
    start = get_start(STATE, "engagements", bookmark_key)

    # Because this stream doesn't query by `lastUpdated`, it cycles
    # through the entire data set on every run. This creates a race
    # condition: records may be updated between the start of this
    # table's sync and its end, causing some updates to be missed. To
    # combat this, we store the current sync's start in the state and
    # never move the bookmark past that value.
    current_sync_start = get_current_sync_start(STATE,
                                                "engagements") or utils.now()
    STATE = write_current_sync_start(STATE, "engagements", current_sync_start)
    singer.write_state(STATE)

    max_bk_value = start
    LOGGER.info("sync_engagements from %s", start)

    STATE = singer.write_bookmark(STATE, 'engagements', bookmark_key, start)
    singer.write_state(STATE)

    url = get_url("engagements_all")
    params = {'limit': 250}
    top_level_key = "results"
    engagements = gen_request(STATE, 'engagements', url, params, top_level_key,
                              "hasMore", ["offset"], ["offset"])

    time_extracted = utils.now()

    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
        for engagement in engagements:
            record = bumble_bee.transform(
                lift_properties_and_versions(engagement), schema, mdata)
            if record['engagement'][bookmark_key] >= start:
                # hoist PK and bookmark field to top-level record
                record['engagement_id'] = record['engagement']['id']
                record[bookmark_key] = record['engagement'][bookmark_key]
                singer.write_record("engagements",
                                    record,
                                    catalog.get('stream_alias'),
                                    time_extracted=time_extracted)
                if record['engagement'][bookmark_key] >= max_bk_value:
                    max_bk_value = record['engagement'][bookmark_key]

    # Don't bookmark past the start of this sync to account for updated records during the sync.
    new_bookmark = min(utils.strptime_to_utc(max_bk_value), current_sync_start)
    STATE = singer.write_bookmark(STATE, 'engagements', bookmark_key,
                                  utils.strftime(new_bookmark))
    STATE = write_current_sync_start(STATE, 'engagements', None)
    singer.write_state(STATE)
    return STATE
Example #19
def sync_subscription_changes(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    STATE = sync_entity_chunked(
        STATE,
        catalog,
        "subscription_changes",
        ["timestamp", "portalId", "recipient"],
        "timeline",
    )
    return STATE
Example #20
def get_streams_to_sync(streams, state):
    '''Get the streams to sync'''
    current_stream = singer.get_currently_syncing(state)
    result = streams
    if current_stream:
        result = list(itertools.dropwhile(
            lambda x: x.tap_stream_id != current_stream, streams))
    if not result:
        raise Exception("Unknown stream {} in state".format(current_stream))
    return result
Example #21
def get_non_oplog_streams(client, streams, state):
    selected_streams = list(filter(lambda s: is_stream_selected(s), streams))

    streams_with_state = []
    streams_without_state = []

    for stream in selected_streams:
        stream_metadata = metadata.to_map(stream['metadata'])
        replication_method = stream_metadata.get((),
                                                 {}).get('replication-method')
        state = singer.write_bookmark(state, stream['tap_stream_id'],
                                      'replication_method', replication_method)

        stream_state = state.get('bookmarks', {}).get(stream['tap_stream_id'])

        if not stream_state:
            if replication_method == 'LOG_BASED':
                LOGGER.info(
                    "LOG_BASED stream %s requires full historical sync",
                    stream['tap_stream_id'])

            streams_without_state.append(stream)
        elif stream_state and replication_method == 'LOG_BASED' and oplog_stream_requires_historical(
                stream, state):
            LOGGER.info("LOG_BASED stream %s will resume its historical sync",
                        stream['tap_stream_id'])
            streams_with_state.append(stream)
        elif stream_state and replication_method != 'LOG_BASED':
            streams_with_state.append(stream)

    # If the state says we were in the middle of processing a stream, skip
    # to that stream. Then process streams without prior state and finally
    # move on to streams with state (i.e. streams that have been synced in the past).
    currently_syncing = singer.get_currently_syncing(state)

    # prioritize streams that have not been processed
    ordered_streams = streams_without_state + streams_with_state

    if currently_syncing:
        currently_syncing_stream = list(
            filter(
                lambda s: s['tap_stream_id'] == currently_syncing and
                is_valid_currently_syncing_stream(s, state),
                streams_with_state))

        non_currently_syncing_streams = list(
            filter(lambda s: s['tap_stream_id'] != currently_syncing,
                   ordered_streams))

        streams_to_sync = currently_syncing_stream + non_currently_syncing_streams
    else:
        # prioritize streams that have not been processed
        streams_to_sync = ordered_streams

    return (streams_to_sync, state)
Example #22
def do_sync(catalog, sdk_client, state):
    # Get selected_streams from catalog, based on state last_stream
    #   last_stream = Previous currently synced stream, if the load was interrupted
    last_stream = singer.get_currently_syncing(state)
    LOGGER.info('last/currently syncing stream: {}'.format(last_stream))
    for stream in catalog.get_selected_streams(state):
        stream_name = stream.stream
        stream_metadata = metadata.to_map(stream.metadata)

        LOGGER.info('Syncing stream %s ...', stream_name)
        sync_stream(stream_name, stream_metadata, sdk_client)
Example #23
def sync(client, config, catalog, state):
    start_date = config.get('start_date')[:10]
    period_types = config.get('period_types', 'FiscalQuarter')

    # Get selected_streams from catalog, based on state last_stream
    #   last_stream = Previous currently synced stream, if the load was interrupted
    last_stream = singer.get_currently_syncing(state)
    LOGGER.info('last/currently syncing stream: {}'.format(last_stream))
    selected_streams = []
    selected_streams_by_name = {}
    for stream in catalog.get_selected_streams(state):
        selected_streams.append(stream.stream)
        selected_streams_by_name[stream.stream] = stream

    LOGGER.info('selected_streams: {}'.format(selected_streams))

    if not selected_streams:
        return

    # Loop through endpoints in selected_streams
    for stream_name, endpoint_config in STREAMS.items():
        if stream_name in selected_streams:
            LOGGER.info('START Syncing: {}'.format(stream_name))
            stream = selected_streams_by_name[stream_name]

            bookmark_field = next(
                iter(endpoint_config.get('replication_keys', [])), None)
            id_fields = endpoint_config.get('key_properties')
            singer_ops.write_schema(catalog, stream_name)
            total_records = 0

            last_date = singer_ops.get_bookmark(state, stream_name, start_date)

            # The request is made using the current date + 1 as the end period.
            req_state = singer_ops.get_request_state(
                client=client,
                stream_name=stream_name,
                start_date=start_date,
                last_date=last_date,
                end_date=datetime.now(),
                state=state,
                bookmark_field=bookmark_field,
                id_fields=id_fields,
                period_types=period_types,
                stream=stream,
                catalog=catalog)

            # Main sync routine
            total_records = __sync_endpoint(req_state)

            LOGGER.info('FINISHED Syncing: {}, total_records: {}'.format(
                stream_name, total_records))

    LOGGER.info('sync.py: sync complete')
Example #24
def sync_deal_pipelines(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    schema = load_schema('deal_pipelines')
    singer.write_schema('deal_pipelines', schema, ['pipelineId'], catalog.get('stream_alias'))
    LOGGER.info('sync_deal_pipelines')
    data = request(get_url('deal_pipelines')).json()
    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
        for row in data:
            record = bumble_bee.transform(row, schema)
            singer.write_record("deal_pipelines", record, catalog.get('stream_alias'), time_extracted=utils.now())
    singer.write_state(STATE)
    return STATE
Example #25
def sync(client, config, catalog, state):
    start_date = config.get('start_date')
    git_owner = config.get('git_owner')
    git_repository_list = config['git_repositories'].replace(" ",
                                                             "").split(",")

    # Get selected_streams from catalog, based on state last_stream
    #   last_stream = Previous currently synced stream, if the load was interrupted
    last_stream = singer.get_currently_syncing(state)
    LOGGER.info('last/currently syncing stream: %s', last_stream)
    selected_streams = []
    for stream in catalog.get_selected_streams(state):
        selected_streams.append(stream.stream)
    LOGGER.info('selected_streams: %s', selected_streams)

    if not selected_streams:
        return

    # Loop through selected_streams
    for stream_name, endpoint_config in STREAMS.items():
        if stream_name in selected_streams:
            for git_repository in git_repository_list:
                LOGGER.info('START Syncing Repository: %s, Stream: %s',
                            git_repository, stream_name)
                update_currently_syncing(state, stream_name)
                search_path = endpoint_config.get(
                    'search_path', stream_name).replace(
                        '[GIT_OWNER]',
                        git_owner).replace('[GIT_REPOSITORY]', git_repository)
                bookmark_field = next(
                    iter(endpoint_config.get('replication_keys', [])), None)
                total_records = sync_endpoint(
                    client=client,
                    catalog=catalog,
                    state=state,
                    start_date=start_date,
                    stream_name=stream_name,
                    search_path=search_path,
                    endpoint_config=endpoint_config,
                    git_owner=git_owner,
                    git_repository=git_repository,
                    bookmark_query_field=endpoint_config.get(
                        'bookmark_query_field', None),
                    bookmark_field=bookmark_field,
                    data_key=endpoint_config.get('data_key', stream_name),
                    id_fields=endpoint_config.get('key_properties'),
                    selected_streams=selected_streams)

                update_currently_syncing(state, None)
                LOGGER.info(
                    'FINISHED Syncing Repository: %s, Stream: %s, total_records: %s',
                    git_repository, stream_name, total_records)
Example #26
def sync_engagements(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    mdata = metadata.to_map(catalog.get('metadata'))
    schema = load_schema("engagements")
    bookmark_key = 'lastUpdated'
    singer.write_schema("engagements", schema, ["engagement_id"], [bookmark_key], catalog.get('stream_alias'))
    start = get_start(STATE, "engagements", bookmark_key)

    # Because this stream doesn't query by `lastUpdated`, it cycles
    # through the entire data set on every run. This creates a race
    # condition: records may be updated between the start of this
    # table's sync and its end, causing some updates to be missed. To
    # combat this, we save a lookback window covering the duration of
    # the previous sync of this stream, and look back by that amount
    # on the next sync.
    last_sync_duration = get_previous_time_window(STATE, "engagements")
    current_sync_start = utils.now()
    if has_bookmark(STATE, "engagements", bookmark_key) and \
       last_sync_duration is not None:
        LOGGER.info(("Last sync of engagements lasted {} seconds. Adjusting bookmark by this "
                     "amount to account for race conditions with record updates.").format(last_sync_duration))
        start = utils.strptime_to_utc(start) - datetime.timedelta(seconds=last_sync_duration)
        start = utils.strftime(start)
    max_bk_value = start
    LOGGER.info("sync_engagements from %s", start)

    STATE = singer.write_bookmark(STATE, 'engagements', bookmark_key, start)
    singer.write_state(STATE)

    url = get_url("engagements_all")
    params = {'limit': 250}
    top_level_key = "results"
    engagements = gen_request(STATE, 'engagements', url, params, top_level_key, "hasMore", ["offset"], ["offset"])

    time_extracted = utils.now()

    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
        for engagement in engagements:
            record = bumble_bee.transform(engagement, schema, mdata)
            if record['engagement'][bookmark_key] >= start:
                # hoist PK and bookmark field to top-level record
                record['engagement_id'] = record['engagement']['id']
                record[bookmark_key] = record['engagement'][bookmark_key]
                singer.write_record("engagements", record, catalog.get('stream_alias'), time_extracted=time_extracted)
                if record['engagement'][bookmark_key] >= max_bk_value:
                    max_bk_value = record['engagement'][bookmark_key]

    STATE = singer.write_bookmark(STATE, 'engagements', bookmark_key, max_bk_value)
    # Write duration for next sync's lookback window
    STATE = write_stream_duration(STATE, 'engagements', current_sync_start, utils.now())
    singer.write_state(STATE)
    return STATE
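The lookback helpers used above (write_stream_duration and get_previous_time_window) are not shown in this listing either. A minimal sketch under the assumption that the previous sync's duration is stored in seconds next to the stream's bookmarks:

import singer

def write_stream_duration(state, tap_stream_id, start, end):
    # Hypothetical: persist how long the last sync of this stream took.
    duration = (end - start).total_seconds()
    return singer.write_bookmark(state, tap_stream_id, 'last_sync_duration', duration)

def get_previous_time_window(state, tap_stream_id):
    # Hypothetical: return the stored duration in seconds, or None.
    return singer.get_bookmark(state, tap_stream_id, 'last_sync_duration')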
Example #27
def get_streams_to_sync(streams, state):
    target_stream = singer.get_currently_syncing(state)
    result = streams
    if target_stream:
        skipped = list(
            itertools.takewhile(lambda x: x.tap_stream_id != target_stream,
                                streams))
        rest = list(
            itertools.dropwhile(lambda x: x.tap_stream_id != target_stream,
                                streams))
        result = rest + skipped  # Move skipped streams to end
    if not result:
        raise Exception("Unknown stream {} in state".format(target_stream))
    return result
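The takewhile/dropwhile pair effectively rotates the list so the interrupted stream comes first while the streams before it are still synced afterwards. A small standalone illustration (stream names are made up):

import itertools

streams = ['accounts', 'contacts', 'deals', 'owners']
target = 'deals'

skipped = list(itertools.takewhile(lambda s: s != target, streams))
rest = list(itertools.dropwhile(lambda s: s != target, streams))

print(rest + skipped)  # -> ['deals', 'owners', 'accounts', 'contacts']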
Example #28
def get_streams_to_sync(streams, state):
    '''Get the streams to sync'''
    current_stream = singer.get_currently_syncing(state)
    result = streams

    if current_stream:
        for key in list(result.keys()):
            if result[key].tap_stream_id != current_stream:
                result.pop(key, None)

    if not result:
        raise Exception("Unknown stream {} in state".format(current_stream))

    return result
Example #29
def sync_contacts(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    bookmark_key = "versionTimestamp"
    start = utils.strptime_with_tz(get_start(STATE, "contacts", bookmark_key))
    LOGGER.info("sync_contacts from %s", start)

    max_bk_value = start
    schema = load_schema("contacts")

    singer.write_schema("contacts", schema, ["vid"], [bookmark_key],
                        catalog.get("stream_alias"))

    url = get_url("contacts_all")

    vids = []
    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
        for row in gen_request(
                STATE,
                "contacts",
                url,
                default_contact_params,
                "contacts",
                "has-more",
            ["vid-offset"],
            ["vidOffset"],
        ):
            modified_time = None
            if bookmark_key in row:
                modified_time = utils.strptime_with_tz(
                    _transform_datetime(  # pylint: disable=protected-access
                        row[bookmark_key],
                        UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING))

            if not modified_time or modified_time >= start:
                vids.append(row["vid"])

            if modified_time and modified_time >= max_bk_value:
                max_bk_value = modified_time

            if len(vids) == 100:
                _sync_contact_vids(catalog, vids, schema, bumble_bee)
                vids = []

        _sync_contact_vids(catalog, vids, schema, bumble_bee)

    STATE = singer.write_bookmark(STATE, "contacts", bookmark_key,
                                  utils.strftime(max_bk_value))
    singer.write_state(STATE)
    return STATE
Example #30
def sync_campaigns(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    schema = load_schema("campaigns")
    singer.write_schema("campaigns", schema, ["id"], catalog.get('stream_alias'))
    LOGGER.info("sync_campaigns(NO bookmarks)")
    url = get_url("campaigns_all")
    params = {'limit': 500}

    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
        for row in gen_request(STATE, 'campaigns', url, params, "campaigns", "hasMore", ["offset"], ["offset"]):
            record = request(get_url("campaigns_detail", campaign_id=row['id'])).json()
            record = bumble_bee.transform(record, schema)
            singer.write_record("campaigns", record, catalog.get('stream_alias'), time_extracted=utils.now())

    return STATE