def gen_request(STATE, tap_stream_id, url, params, path, more_key, offset_keys, offset_targets):
    if len(offset_keys) != len(offset_targets):
        raise ValueError("Number of offset_keys must match number of offset_targets")

    if singer.get_offset(STATE, tap_stream_id):
        params.update(singer.get_offset(STATE, tap_stream_id))

    with metrics.record_counter(tap_stream_id) as counter:
        while True:
            data = request(url, params).json()

            for row in data[path]:
                counter.increment()
                yield row

            if not data.get(more_key, False):
                break

            STATE = singer.clear_offset(STATE, tap_stream_id)
            for key, target in zip(offset_keys, offset_targets):
                if key in data:
                    params[target] = data[key]
                    STATE = singer.set_offset(STATE, tap_stream_id, target, data[key])
            singer.write_state(STATE)

    STATE = singer.clear_offset(STATE, tap_stream_id)
    singer.write_state(STATE)

def gen_request(STATE, tap_stream_id, url, params, path, more_key, offset_keys, offset_targets, v3_fields=None):
    if len(offset_keys) != len(offset_targets):
        raise ValueError("Number of offset_keys must match number of offset_targets")

    if singer.get_offset(STATE, tap_stream_id):
        params.update(singer.get_offset(STATE, tap_stream_id))

    with metrics.record_counter(tap_stream_id) as counter:
        while True:
            data = request(url, params).json()

            if v3_fields:
                # Use a dedicated variable for the v3 search endpoint so the v1
                # pagination URL is not overwritten for subsequent iterations
                v3_url = get_url('deals_v3_search')
                body = {"properties": v3_fields, "limit": 100}
                v3_data = post_search_endpoint(v3_url, body)

                additional_fields = {}
                for item in v3_data.json()['results']:
                    # We nest each value under 'value' in order to match the
                    # schema and the shape of other fields in 'properties'
                    additional_fields[int(item['id'])] = {
                        key: {'value': value}
                        for key, value in item['properties'].items()
                        if 'hs_date_entered' in key or 'hs_date_exited' in key
                    }

                for item in data[path]:
                    # Default to an empty dict so deals missing from the v3
                    # response do not raise a TypeError when unpacking
                    item['properties'] = {
                        **item['properties'],
                        **additional_fields.get(item['dealId'], {})
                    }

            for row in data[path]:
                counter.increment()
                yield row

            if not data.get(more_key, False):
                break

            STATE = singer.clear_offset(STATE, tap_stream_id)
            for key, target in zip(offset_keys, offset_targets):
                if key in data:
                    params[target] = data[key]
                    STATE = singer.set_offset(STATE, tap_stream_id, target, data[key])
            singer.write_state(STATE)

    STATE = singer.clear_offset(STATE, tap_stream_id)
    singer.write_state(STATE)

def gen_request(STATE, tap_stream_id, url, params, path, more_key, offset_keys, offset_targets, v3_fields=None):
    if len(offset_keys) != len(offset_targets):
        raise ValueError("Number of offset_keys must match number of offset_targets")

    if singer.get_offset(STATE, tap_stream_id):
        params.update(singer.get_offset(STATE, tap_stream_id))

    with metrics.record_counter(tap_stream_id) as counter:
        while True:
            data = request(url, params).json()

            if data.get(path) is None:
                raise RuntimeError("Unexpected API response: {} not in {}".format(path, data.keys()))

            if v3_fields:
                v3_data = get_v3_deals(v3_fields, data[path])

                # The shape of v3_data is different than the V1 response,
                # so we transform v3 to look like v1
                transformed_v3_data = process_v3_deals_records(v3_data)
                merge_responses(data[path], transformed_v3_data)

            for row in data[path]:
                counter.increment()
                yield row

            if not data.get(more_key, False):
                break

            STATE = singer.clear_offset(STATE, tap_stream_id)
            for key, target in zip(offset_keys, offset_targets):
                if key in data:
                    params[target] = data[key]
                    STATE = singer.set_offset(STATE, tap_stream_id, target, data[key])
            singer.write_state(STATE)

    STATE = singer.clear_offset(STATE, tap_stream_id)
    singer.write_state(STATE)

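# get_v3_deals, process_v3_deals_records, and merge_responses are not defined in
# this section. The sketch below is a minimal guess at their behaviour, inferred
# from the inline variant above (the version that calls post_search_endpoint
# directly); names, signatures, and the 'deals_v3_search' URL key are assumptions,
# not the tap's confirmed implementation.
def get_v3_deals(v3_fields, v1_rows):
    # Fetch the v3 representation of the deals. The inline version above searches
    # with a flat limit rather than filtering by the ids in v1_rows, so the
    # parameter is kept only for parity with the call site.
    v3_url = get_url('deals_v3_search')  # assumed URL key
    body = {"properties": v3_fields, "limit": 100}
    return post_search_endpoint(v3_url, body).json()['results']

def process_v3_deals_records(v3_data):
    # Reshape v3 records to look like v1: key by integer deal id and nest each
    # property value under 'value', keeping only hs_date_entered/hs_date_exited.
    return {
        int(record['id']): {
            key: {'value': value}
            for key, value in record['properties'].items()
            if 'hs_date_entered' in key or 'hs_date_exited' in key
        }
        for record in v3_data
    }

def merge_responses(v1_rows, transformed_v3_data):
    # Merge the reshaped v3 properties into the v1 rows in place, matching on dealId.
    for row in v1_rows:
        row['properties'] = {
            **row['properties'],
            **transformed_v3_data.get(row['dealId'], {})
        }
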
def sync_entity_chunked(STATE, catalog, entity_name, key_properties, path):
    schema = load_schema(entity_name)
    bookmark_key = 'startTimestamp'

    singer.write_schema(entity_name, schema, key_properties, [bookmark_key], catalog.get('stream_alias'))

    start = get_start(STATE, entity_name, bookmark_key)
    LOGGER.info("sync_%s from %s", entity_name, start)

    now = datetime.datetime.utcnow().replace(tzinfo=pytz.UTC)
    now_ts = int(now.timestamp() * 1000)

    start_ts = int(utils.strptime_with_tz(start).timestamp() * 1000)
    url = get_url(entity_name)

    mdata = metadata.to_map(catalog.get('metadata'))

    with metrics.record_counter(entity_name) as counter:
        while start_ts < now_ts:
            end_ts = start_ts + CHUNK_SIZES[entity_name]
            params = {
                'startTimestamp': start_ts,
                'endTimestamp': end_ts,
                'limit': 1000,
            }
            with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
                while True:
                    our_offset = singer.get_offset(STATE, entity_name)
                    if our_offset and our_offset.get('offset') is not None:
                        params[StateFields.offset] = our_offset.get('offset')

                    data = request(url, params).json()
                    time_extracted = utils.now()

                    for row in data[path]:
                        counter.increment()
                        record = bumble_bee.transform(row, schema, mdata)
                        singer.write_record(entity_name,
                                            record,
                                            catalog.get('stream_alias'),
                                            time_extracted=time_extracted)
                    if data.get('hasMore'):
                        STATE = singer.set_offset(STATE, entity_name, 'offset', data['offset'])
                        singer.write_state(STATE)
                    else:
                        STATE = singer.clear_offset(STATE, entity_name)
                        singer.write_state(STATE)
                        break
            STATE = singer.write_bookmark(STATE, entity_name, 'startTimestamp', utils.strftime(datetime.datetime.fromtimestamp((start_ts / 1000), datetime.timezone.utc)))  # pylint: disable=line-too-long
            singer.write_state(STATE)
            start_ts = end_ts

    STATE = singer.clear_offset(STATE, entity_name)
    singer.write_state(STATE)
    return STATE

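# CHUNK_SIZES is referenced above but not defined in this section. A plausible
# shape is a per-stream window size in milliseconds; the values below are
# illustrative assumptions, not confirmed defaults.
CHUNK_SIZES = {
    "email_events": 1000 * 60 * 60 * 24,          # one day per request window
    "subscription_changes": 1000 * 60 * 60 * 24,  # one day per request window
}
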
def gen_request(state: State, tap_stream_id, url, params, path, more_key, offset_keys, offset_targets, v3_fields=None):
    if len(offset_keys) != len(offset_targets):
        raise ValueError("Number of offset_keys must match number of offset_targets")

    offset = singer.get_offset(state, tap_stream_id)
    logger.debug(f'offset={offset}')
    if offset:
        params.update(offset)

    with metrics.record_counter(tap_stream_id) as counter:
        while True:
            data = request(url, params).json()

            rows = data.pop(path, None)
            if rows is None:
                raise RuntimeError("Unexpected API response: {} not in {}".format(path, data.keys()))
            logger.debug(f'Got resp: {len(rows)} {tap_stream_id}, {data}')

            if v3_fields:
                v3_data = get_v3_deals(v3_fields, rows)

                # The shape of v3_data is different than the V1 response,
                # so we transform v3 to look like v1
                transformed_v3_data = process_v3_deals_records(v3_data)
                merge_responses(rows, transformed_v3_data)

            for row in rows:
                counter.increment()
                yield row

            if not data.get(more_key, False):
                logger.debug('No more pages, breaking')
                break

            state = singer.clear_offset(state, tap_stream_id)
            for key, target in zip(offset_keys, offset_targets):
                if key in data:
                    params[target] = data[key]
                    state = singer.set_offset(state, tap_stream_id, target, data[key])
            singer.write_state(state)

    state = singer.clear_offset(state, tap_stream_id)
    singer.write_state(state)

def sync_stream(config, consumer, stream, state):
    write_schema(stream.tap_stream_id, stream.schema.to_dict(), stream.key_properties or [])

    stream_version = singer.get_bookmark(state, stream.tap_stream_id, "version")
    if stream_version is None:
        stream_version = int(time.time() * 1000)

    state = singer.write_bookmark(state, stream.tap_stream_id, "version", stream_version)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    activate_version_message = singer.ActivateVersionMessage(stream=stream.tap_stream_id,
                                                             version=stream_version)
    singer.write_message(activate_version_message)

    validator = Draft4Validator(stream.schema.to_dict(), format_checker=FormatChecker())

    time_extracted = utils.now()
    rows_saved = 0

    # Assign all the partitions for the topic to this consumer
    topic_partitions = [TopicPartition(config['topic'], partition_id)
                        for partition_id in consumer.partitions_for_topic(config['topic'])]
    consumer.assign(topic_partitions)

    # Seek each partition to its offset from the STATE, or to the beginning otherwise
    offsets = singer.get_offset(state, stream.tap_stream_id, {})
    for topic_partition in topic_partitions:
        if str(topic_partition.partition) in offsets:
            consumer.seek(topic_partition, offsets[str(topic_partition.partition)])
        else:
            consumer.seek_to_beginning(topic_partition)

    for message in consumer:
        record = singer.RecordMessage(stream=stream.tap_stream_id,
                                      record=message.value,
                                      time_extracted=time_extracted)
        validator.validate(record.record)
        singer.write_message(record)
        state = singer.set_offset(state, stream.tap_stream_id, message.partition, message.offset)

        rows_saved = rows_saved + 1
        if rows_saved % UPDATE_BOOKMARK_PERIOD == 0:
            singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

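# A minimal driver sketch for sync_stream above, assuming kafka-python's
# KafkaConsumer and a singer Catalog. Only config['topic'] is referenced above;
# the 'bootstrap_servers' and 'group_id' keys, the JSON deserializer, and the
# idle timeout are assumptions for illustration.
import json

from kafka import KafkaConsumer

def do_sync(config, catalog, state):
    for stream in catalog.streams:
        # Construct the consumer without subscribing; sync_stream calls
        # consumer.assign() itself and seeks using the saved offsets.
        consumer = KafkaConsumer(
            bootstrap_servers=config['bootstrap_servers'],
            group_id=config.get('group_id'),
            value_deserializer=lambda v: json.loads(v.decode('utf-8')),
            consumer_timeout_ms=10000)  # end the message loop when the topic goes idle
        sync_stream(config, consumer, stream, state)
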
def sync_entity_chunked(STATE, catalog, entity_name, key_properties, path):
    if "schema" in catalog:
        schema = catalog["schema"]
    else:
        schema = load_schema(entity_name)

    bookmark_key = 'startTimestamp'

    singer.write_schema(entity_name, schema, key_properties, [bookmark_key], catalog.get('stream_alias'))

    start = get_start(STATE, entity_name, bookmark_key)
    LOGGER.info("sync_%s from %s", entity_name, start)

    now = datetime.datetime.utcnow().replace(tzinfo=pytz.UTC)
    now_ts = int(now.timestamp() * 1000)

    start_ts = int(utils.strptime_with_tz(start).timestamp() * 1000)
    url = get_url(entity_name)

    mdata = metadata.to_map(catalog.get('metadata'))

    if entity_name == 'email_events':
        window_size = int(CONFIG['email_chunk_size'])
    elif entity_name == 'subscription_changes':
        window_size = int(CONFIG['subscription_chunk_size'])

    with metrics.record_counter(entity_name) as counter:
        while start_ts < now_ts:
            end_ts = start_ts + window_size
            params = {
                'startTimestamp': start_ts,
                'endTimestamp': end_ts,
                'limit': 1000,
            }
            with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
                while True:
                    our_offset = singer.get_offset(STATE, entity_name)
                    if our_offset and our_offset.get('offset') is not None:
                        params[StateFields.offset] = our_offset.get('offset')

                    data = request(url, params).json()
                    time_extracted = utils.now()

                    if data.get(path) is None:
                        raise RuntimeError("Unexpected API response: {} not in {}".format(path, data.keys()))

                    for row in data[path]:
                        counter.increment()
                        record = bumble_bee.transform(lift_properties_and_versions(row), schema, mdata)
                        singer.write_record(entity_name,
                                            record,
                                            catalog.get('stream_alias'),
                                            time_extracted=time_extracted)
                    if data.get('hasMore'):
                        STATE = singer.set_offset(STATE, entity_name, 'offset', data['offset'])
                        singer.write_state(STATE)
                    else:
                        STATE = singer.clear_offset(STATE, entity_name)
                        singer.write_state(STATE)
                        break
            STATE = singer.write_bookmark(STATE, entity_name, 'startTimestamp', utils.strftime(datetime.datetime.fromtimestamp((start_ts / 1000), datetime.timezone.utc)))  # pylint: disable=line-too-long
            singer.write_state(STATE)
            start_ts = end_ts

    STATE = singer.clear_offset(STATE, entity_name)
    singer.write_state(STATE)
    return STATE

def sync_entity_chunked(state: State, entity_name, key_properties, path):
    schema = load_schema(entity_name)
    bookmark_key = 'startTimestamp'

    singer.write_schema('hubspot_' + entity_name, schema, key_properties, [bookmark_key])

    start = get_start(state, entity_name, bookmark_key)
    logger.info("sync_%s from %s", entity_name, start)

    now = datetime.datetime.utcnow().replace(tzinfo=pytz.UTC)
    now_ts = int(now.timestamp() * 1000)

    start_ts = int(utils.strptime_with_tz(start).timestamp() * 1000)
    url = get_url(entity_name)

    if entity_name == 'email_events':
        window_size = int(CONFIG['email_chunk_size'])
    elif entity_name == 'subscription_changes':
        window_size = int(CONFIG['subscription_chunk_size'])

    with metrics.record_counter(entity_name) as counter:
        while start_ts < now_ts:
            end_ts = start_ts + window_size
            params = {
                'startTimestamp': start_ts,
                'endTimestamp': end_ts,
                'limit': 1000,
            }
            while True:
                our_offset = singer.get_offset(state, entity_name)
                if our_offset and our_offset.get('offset') is not None:
                    params[StateFields.offset] = our_offset.get('offset')

                data = request(url, params).json()

                if data.get(path) is None:
                    raise RuntimeError("Unexpected API response: {} not in {}".format(path, data.keys()))

                for row in data[path]:
                    counter.increment()
                    record = build_record(row, schema)
                    write_record(entity_name, record)

                if data.get('hasMore'):
                    state = singer.set_offset(state, entity_name, 'offset', data['offset'])
                    singer.write_state(state)
                else:
                    state = singer.clear_offset(state, entity_name)
                    singer.write_state(state)
                    break
            state = singer.write_bookmark(state, "hubspot_" + entity_name, 'startTimestamp', utils.strftime(datetime.datetime.fromtimestamp((start_ts / 1000), datetime.timezone.utc)))  # pylint: disable=line-too-long
            singer.write_state(state)
            start_ts = end_ts

    state = singer.clear_offset(state, entity_name)
    singer.write_state(state)
    return state

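# build_record and write_record are referenced above but not defined in this
# section. A minimal sketch under the assumption that they wrap the singer
# Transformer and singer.write_record; the 'hubspot_' stream prefix mirrors the
# write_schema call above, everything else is an assumption.
def build_record(row, schema):
    # Coerce the raw API row to the stream schema (unix-millisecond datetimes, etc.)
    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as transformer:
        return transformer.transform(row, schema)

def write_record(entity_name, record):
    # Emit the record on the same stream name the schema was written under
    singer.write_record('hubspot_' + entity_name, record, time_extracted=utils.now())
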
def gen_request_v3(STATE, tap_stream_id, url, params, path, custom_properties_chunks=[], associations=[]):
    # Looping on all the pages until we reach the final one
    while True:
        if singer.get_offset(STATE, tap_stream_id):
            params.update(singer.get_offset(STATE, tap_stream_id))

        # We are doing 1 API call per properties chunk.
        # Each call will return the same deal page but with a subset of the deal custom properties each time
        results = []
        for property_chunk in custom_properties_chunks:
            if property_chunk:
                params['properties'] = ",".join(property_chunk)
                data = request(url, params).json()
                results.append(data[path])

        # Resetting the "properties" param
        del params["properties"]

        # Making an API call to fetch the associations for a given page of deals
        params['associations'] = ",".join(associations)
        data = request(url, params).json()
        results.append(data[path])
        del params["associations"]

        # We use a dict to merge together all the properties for a given deal.
        # We use the dealId as the dict key
        records_map = {}
        for result in results:
            for record in result:
                # We have to init the dict entry if it doesn't exist
                if record["id"] not in records_map:
                    records_map[record["id"]] = {"properties": {}}

                merged_properties = {
                    **records_map[record["id"]]["properties"],
                    **record["properties"]
                }
                records_map[record["id"]]["properties"] = merged_properties

                # If we reached the deal record that contains the association info, we take it
                if "associations" in record:
                    records_map[record["id"]]["associations"] = record["associations"]

                # Keep the deal id in the "dealId" field to preserve the table
                # primary key that was "dealId" before the switch to the v3 API for deals
                records_map[record["id"]]["dealId"] = record["id"]

        with metrics.record_counter(tap_stream_id) as counter:
            for value in records_map.values():
                counter.increment()
                yield value

        # This is the paging break signal from HubSpot
        if "paging" not in data:
            break

        # We update the paging parameter and go to the next page / loop.
        # "after" is the paging offset
        after = data["paging"]["next"]["after"]
        STATE = singer.set_offset(STATE, tap_stream_id, "after", after)
        singer.write_state(STATE)

    # We clear the offset
    STATE = singer.clear_offset(STATE, tap_stream_id)
    singer.write_state(STATE)
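
# An illustrative call site for gen_request_v3 above. The property-chunking
# helper, the 'deals_v3' URL key, the 'results' path, the association names, and
# get_deal_property_names are all hypothetical placeholders, not confirmed values.
def chunk_properties(property_names, chunk_size=100):
    # Split the property list so each request's query string stays a manageable size
    return [property_names[i:i + chunk_size]
            for i in range(0, len(property_names), chunk_size)]

def sync_deals_v3(STATE):
    url = get_url('deals_v3')  # assumed URL key
    params = {'limit': 100}
    property_chunks = chunk_properties(get_deal_property_names())  # hypothetical helper
    for record in gen_request_v3(STATE, 'deals', url, params, 'results',
                                 custom_properties_chunks=property_chunks,
                                 associations=['companies', 'contacts']):
        singer.write_record('deals', record)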