Example #1
def test_incremental(self, log):
    with metrics.record_counter(endpoint='users') as counter:
        counter.increment(1)
        counter._ready_to_log = lambda: True
        counter.increment(2)
        counter._ready_to_log = lambda: False
        counter.increment(5)
    self.assertEqual(
        [metrics.Point('counter', 'record_count', 3, {'endpoint': 'users'}),
         metrics.Point('counter', 'record_count', 5, {'endpoint': 'users'})],
        logged_points(log))
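
The test above exercises singer-python's metrics.record_counter: increments accumulate on the counter, a metric point is flushed whenever the counter reports it is ready to log, and one final point is emitted when the context manager exits, which is why the expected values are 3 and 5. As a rough, hypothetical sketch of the everyday pattern the remaining examples build on (the function name and the records iterable are placeholders, not part of the original):

import singer
from singer import metrics


def sync_users(records):
    # Hypothetical sync loop; `records` stands in for rows fetched from an API.
    with metrics.record_counter(endpoint='users') as counter:
        for record in records:
            singer.write_record('users', record)
            counter.increment()  # defaults to incrementing by 1
    # Exiting the `with` block logs a final 'record_count' point tagged {'endpoint': 'users'}.
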
Example #2
def sync_generic_basic_endpoint(sdk_client, stream, stream_metadata):
    discovered_schema = load_schema(stream)
    field_list = get_field_list(discovered_schema, stream, stream_metadata)

    discovered_schema['properties']['_sdc_customer_id'] = {
        'description': 'Profile ID',
        'type': 'string',
        'field': "customer_id"
    }
    primary_keys = GENERIC_ENDPOINT_MAPPINGS[stream]['primary_keys']
    write_schema(stream, discovered_schema, primary_keys)

    LOGGER.info("Syncing %s for customer %s", stream,
                sdk_client.client_customer_id)

    start_index = 0
    selector = {
        'fields': field_list,
        'paging': {
            'startIndex': str(start_index),
            'numberResults': str(PAGE_SIZE)
        }
    }

    while True:
        page = get_page(sdk_client, selector, stream, start_index)
        if page['totalNumEntries'] > GOOGLE_MAX_START_INDEX:
            raise Exception("Too many %s (%s > %s) for customer %s", stream,
                            GOOGLE_MAX_START_INDEX, page['totalNumEntries'],
                            sdk_client.client_customer_id)

        if 'entries' in page:
            with metrics.record_counter(stream) as counter:
                time_extracted = utils.now()

                for entry in page['entries']:
                    obj = suds_to_dict(entry)
                    obj['_sdc_customer_id'] = sdk_client.client_customer_id
                    with Transformer(
                            singer.UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING
                    ) as bumble_bee:
                        bumble_bee.pre_hook = transform_pre_hook
                        record = bumble_bee.transform(obj, discovered_schema)

                        singer.write_record(stream,
                                            record,
                                            time_extracted=time_extracted)
                        counter.increment()

        start_index += PAGE_SIZE
        if start_index > int(page['totalNumEntries']):
            break
    LOGGER.info("Done syncing %s for customer_id %s", stream,
                sdk_client.client_customer_id)
Example #3
def process_records(
        catalog,  #pylint: disable=too-many-branches
        stream_name,
        records,
        time_extracted,
        bookmark_field=None,
        max_bookmark_value=None,
        last_datetime=None,
        parent=None,
        parent_id=None):
    stream = catalog.get_stream(stream_name)
    schema = stream.schema.to_dict()
    stream_metadata = metadata.to_map(stream.metadata)

    with metrics.record_counter(stream_name) as counter:
        for record in records:
            # If child object, add parent_id to record
            if parent_id and parent:
                record[parent + '_id'] = parent_id

            # Transform record for Singer.io
            with Transformer() as transformer:
                try:
                    transformed_record = transformer.transform(
                        record, schema, stream_metadata)
                except Exception as err:
                    LOGGER.error('Transformer Error: {}'.format(err))
                    LOGGER.error('Stream: {}, record: {}'.format(
                        stream_name, record))
                    raise err

                # Reset max_bookmark_value to new value if higher
                if transformed_record.get(bookmark_field):
                    if max_bookmark_value is None or \
                        transformed_record[bookmark_field] > transform_datetime(max_bookmark_value):
                        max_bookmark_value = transformed_record[bookmark_field]

                if bookmark_field and (bookmark_field in transformed_record):
                    last_dttm = transform_datetime(last_datetime)
                    bookmark_dttm = transform_datetime(
                        transformed_record[bookmark_field])
                    # Keep only records whose bookmark is after the last_datetime
                    if bookmark_dttm:
                        if bookmark_dttm >= last_dttm:
                            write_record(stream_name, transformed_record, \
                                time_extracted=time_extracted)
                            counter.increment()
                else:
                    write_record(stream_name,
                                 transformed_record,
                                 time_extracted=time_extracted)
                    counter.increment()

        return max_bookmark_value, counter.value
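
Examples #14, #16, #28, and #31 below repeat this same bookmark-handling shape with small variations (integer bookmarks, different datetime helpers). A condensed sketch of the shared pattern, assuming datetime-string bookmarks and using only singer-python helpers; the function name is illustrative, not a drop-in replacement for any function shown here:

import singer
from singer import metrics, Transformer
from singer.utils import strptime_to_utc


def emit_with_bookmark(stream_name, records, schema, stream_metadata,
                       bookmark_field, last_datetime, time_extracted):
    max_bookmark_value = None
    with metrics.record_counter(stream_name) as counter:
        for record in records:
            with Transformer() as transformer:
                rec = transformer.transform(record, schema, stream_metadata)

            bookmark = rec.get(bookmark_field)
            # Track the highest bookmark seen so the caller can persist it in state.
            if bookmark and (max_bookmark_value is None or
                             strptime_to_utc(bookmark) > strptime_to_utc(max_bookmark_value)):
                max_bookmark_value = bookmark

            # Emit only records at or after the last saved bookmark.
            if bookmark is None or strptime_to_utc(bookmark) >= strptime_to_utc(last_datetime):
                singer.write_record(stream_name, rec, time_extracted=time_extracted)
                counter.increment()

        # counter.value is read before the context manager exits, as in the examples above.
        return max_bookmark_value, counter.value
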
Example #4
def sync_offer_stages(offer):
    LOGGER.info("-----/ Syncing offer_stages")
    offer_id = int(offer["id"])
    if "stages" in offer and len(offer["stages"]) > 0:
        with metrics.record_counter("offer_stages") as counter:
            offer_stages = [
                ofr.filter_stage(stage, offer_id) for stage in offer["stages"]
            ]
            for stage in offer_stages:
                counter.increment()
                singer.write_record("offer_stages", stage)
Example #5
def sync_entity_chunked(STATE, catalog, entity_name, key_properties, path):
    schema = load_schema(entity_name)
    bookmark_key = 'startTimestamp'

    singer.write_schema(entity_name, schema, key_properties, [bookmark_key], catalog.get('stream_alias'))

    start = get_start(STATE, entity_name, bookmark_key)
    LOGGER.info("sync_%s from %s", entity_name, start)

    now = datetime.datetime.utcnow().replace(tzinfo=pytz.UTC)
    now_ts = int(now.timestamp() * 1000)

    start_ts = int(utils.strptime_with_tz(start).timestamp() * 1000)
    url = get_url(entity_name)

    mdata = metadata.to_map(catalog.get('metadata'))
    with metrics.record_counter(entity_name) as counter:
        while start_ts < now_ts:
            end_ts = start_ts + CHUNK_SIZES[entity_name]
            params = {
                'startTimestamp': start_ts,
                'endTimestamp': end_ts,
                'limit': 1000,
            }
            with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
                while True:
                    our_offset = singer.get_offset(STATE, entity_name)
                    if our_offset and our_offset.get('offset') is not None:
                        params[StateFields.offset] = our_offset.get('offset')

                    data = request(url, params).json()
                    time_extracted = utils.now()

                    for row in data[path]:
                        counter.increment()
                        record = bumble_bee.transform(row, schema, mdata)
                        singer.write_record(entity_name,
                                            record,
                                            catalog.get('stream_alias'),
                                            time_extracted=time_extracted)
                    if data.get('hasMore'):
                        STATE = singer.set_offset(STATE, entity_name, 'offset', data['offset'])
                        singer.write_state(STATE)
                    else:
                        STATE = singer.clear_offset(STATE, entity_name)
                        singer.write_state(STATE)
                        break
            STATE = singer.write_bookmark(STATE, entity_name, 'startTimestamp', utils.strftime(datetime.datetime.fromtimestamp((start_ts / 1000), datetime.timezone.utc ))) # pylint: disable=line-too-long
            singer.write_state(STATE)
            start_ts = end_ts

    STATE = singer.clear_offset(STATE, entity_name)
    singer.write_state(STATE)
    return STATE
Example #6
def sync_candidate_tags(candidate):
    LOGGER.info("-----/ Syncing candidate_tags")
    candidate_id = int(candidate["id"])
    if "tags" in candidate and len(candidate["tags"]) > 0:
        with metrics.record_counter("candidate_tags") as counter:
            candidate_tags = [
                cand.filter_tag(tag, candidate_id) for tag in candidate["tags"]
            ]
            for tag in candidate_tags:
                counter.increment()
                singer.write_record("candidate_tags", tag)
Example #7
    async def process_lines(self, lines, loop=None):

        loop = loop or asyncio.get_event_loop()
        api = self._api_client

        schemas = {}
        active_versions = {}

        queues = {}
        consumers = {}

        logger.info('Checking network connectivity')
        api.connection_check()  # Fail fast

        logger.info('Ensuring dataset exists and is in good state')
        await self._fix_dataset()

        with metrics.record_counter() as counter:
            for line in lines:
                try:
                    msg = singer.parse_message(line)
                except (json.JSONDecodeError, simplejson.JSONDecodeError) as e:
                    raise UnparseableMessageError(line, str(e))

                if isinstance(msg, singer.RecordMessage):
                    await self._handle_record_msg(msg, schemas,
                                                  active_versions, loop,
                                                  queues, consumers)
                    counter.increment()
                    logger.debug('Line #{} in {} queued for upload'.format(
                        counter.value, msg.stream))
                elif isinstance(msg, singer.SchemaMessage):
                    logger.info('Schema found for {}'.format(msg.stream))
                    schemas[msg.stream] = await self._handle_schema_msg(msg)
                elif isinstance(msg, singer.StateMessage):
                    logger.info('State message found: {}'.format(msg.value))
                    state = await self._handle_state_msg(
                        msg, queues, consumers)
                    queues = {}
                    yield state
                elif isinstance(msg, singer.ActivateVersionMessage):
                    logger.info('Version message found: {}/{}'.format(
                        msg.stream, msg.version))

                    current_version = active_versions.get(msg.stream)
                    active_version = await self._handle_active_version_msg(
                        msg, current_version, api)
                    active_versions[msg.stream] = active_version
                else:
                    logger.warning('Unrecognized message ({})'.format(msg))

        await TargetDataDotWorld._drain_queues(queues, consumers)
        self._api_client.sync(self.config['dataset_owner'],
                              self.config['dataset_id'])
Example #8
def output_responses(stream_id, config: dict, state: dict) -> dict:
    """ Query and output the api for individual responses """

    while True:
        previous_state_end_datetime = state.get('bookmarks', {}).get(
            stream_id, {}).get('last_record', None)

        # Start where the previous run left off, or, if this is the first run, use the start_date from the config.
        start_datetime = arrow.get(previous_state_end_datetime
                                   or config.get('start_date'))

        # request data from the api in blocks of a month
        end_datetime = start_datetime.shift(months=1)

        # Fetch data from api
        params = {
            "format": "json",
            "project": config["project_id"],
            "startDate": start_datetime.isoformat(),
            "endDate": end_datetime.isoformat(),
        }

        res_json = request('/responses/',
                           params=params,
                           auth=HTTPBasicAuth(config["api_key"], None),
                           user_agent=config.get('user_agent', None)).json()

        # Output items
        bookmark = start_datetime
        with record_counter(endpoint=stream_id) as counter:
            for record in res_json['responses']:
                write_record(stream_id, record)
                counter.increment()
                bookmark = max([arrow.get(record['created']), bookmark])

        # If the requested window ends before the current timestamp, advance
        # the bookmark to the requested end_datetime, since no new responses
        # will come in for past times.
        if end_datetime < arrow.utcnow():
            bookmark = end_datetime

        # Update and export state
        if 'bookmarks' not in state:
            state['bookmarks'] = {}
        state['bookmarks'][stream_id] = {'last_record': bookmark.isoformat()}

        write_state(state)

        # Stop once we have requested past the current timestamp;
        # there won't be anything more.
        if end_datetime > arrow.utcnow():
            break

    return state
Example #9
def sync_view(conn_info, stream, state, desired_columns, md_map):
    time_extracted = utils.now()

    #before writing the table version to state, check if we had one to begin with
    first_run = singer.get_bookmark(state, stream['tap_stream_id'],
                                    'version') is None
    nascent_stream_version = int(time.time() * 1000)

    state = singer.write_bookmark(state, stream['tap_stream_id'], 'version',
                                  nascent_stream_version)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    schema_name = md_map.get(()).get('schema-name')

    escaped_columns = map(post_db.prepare_columns_sql, desired_columns)

    activate_version_message = singer.ActivateVersionMessage(
        stream=post_db.calculate_destination_stream_name(stream, md_map),
        version=nascent_stream_version)

    if first_run:
        singer.write_message(activate_version_message)

    with metrics.record_counter(None) as counter:
        with post_db.open_connection(conn_info) as conn:
            with conn.cursor(cursor_factory=psycopg2.extras.DictCursor,
                             name='stitch_cursor') as cur:
                cur.itersize = post_db.cursor_iter_size
                select_sql = 'SELECT {} FROM {}'.format(
                    ','.join(escaped_columns),
                    post_db.fully_qualified_table_name(schema_name,
                                                       stream['table_name']))

                LOGGER.info("select %s with itersize %s", select_sql,
                            cur.itersize)
                cur.execute(select_sql)

                rows_saved = 0
                for rec in cur:
                    record_message = post_db.selected_row_to_singer_message(
                        stream, rec, nascent_stream_version, desired_columns,
                        time_extracted, md_map)
                    singer.write_message(record_message)
                    rows_saved = rows_saved + 1
                    if rows_saved % UPDATE_BOOKMARK_PERIOD == 0:
                        singer.write_message(
                            singer.StateMessage(value=copy.deepcopy(state)))

                    counter.increment()

    #always send the activate version whether first run or subsequent
    singer.write_message(activate_version_message)

    return state
Example #10
def sync_candidate_sources(candidate):
    LOGGER.info("-----/ Syncing candidate_sources")
    candidate_id = int(candidate["id"])
    if "sources" in candidate and len(candidate["sources"]) > 0:
        with metrics.record_counter("candidate_sources") as counter:
            candidate_sources = [
                cand.filter_source(source, candidate_id)
                for source in candidate["sources"]
            ]
            for source in candidate_sources:
                counter.increment()
                singer.write_record("candidate_sources", source)
Example #11
def sync_candidate_placements(candidate):
    LOGGER.info("-----/ Syncing candidate_placements")
    candidate_id = int(candidate["id"])
    if "placements" in candidate and len(candidate["placements"]) > 0:
        with metrics.record_counter("candidate_placements") as counter:
            candidate_placements = [
                cand.filter_placement(placement, candidate_id)
                for placement in candidate["placements"]
            ]
            for placement in candidate_placements:
                counter.increment()
                singer.write_record("candidate_placements", placement)
Example #12
def persist_records(catalog, stream_id, records):
    stream = catalog.get_stream(stream_id)
    schema = stream.schema.to_dict()
    stream_metadata = metadata.to_map(stream.metadata)
    with metrics.record_counter(stream_id) as counter:
        for record in records:
            with Transformer(
                    integer_datetime_fmt=UNIX_SECONDS_INTEGER_DATETIME_PARSING
            ) as transformer:
                record = transformer.transform(record, schema, stream_metadata)
            singer.write_record(stream_id, record)
            counter.increment()
Example #13
def process_records(catalog, stream_name, records):
    if records:
        stream = catalog.get_stream(stream_name)
        schema = stream.schema.to_dict()
        stream_metadata = metadata.to_map(stream.metadata)
        with metrics.record_counter(stream_name) as counter:
            for record in records:
                with Transformer() as transformer:
                    record = transformer.transform(record, schema,
                                                   stream_metadata)
                singer.write_record(stream_name, record)
                counter.increment()
Example #14
def process_records(
        catalog,  #pylint: disable=too-many-branches
        stream_name,
        records,
        time_extracted,
        bookmark_field=None,
        bookmark_type=None,
        max_bookmark_value=None,
        last_datetime=None,
        last_integer=None):
    stream = catalog.get_stream(stream_name)
    schema = stream.schema.to_dict()
    stream_metadata = metadata.to_map(stream.metadata)

    with metrics.record_counter(stream_name) as counter:
        for record in records:
            # Transform record for Singer.io
            with Transformer() as transformer:
                transformed_record = transformer.transform(
                    record, schema, stream_metadata)
                # Reset max_bookmark_value to new value if higher
                if bookmark_field and (bookmark_field in transformed_record):
                    if (max_bookmark_value is None) or \
                        (transformed_record[bookmark_field] > transform_datetime(max_bookmark_value)):
                        max_bookmark_value = transformed_record[bookmark_field]

                if bookmark_field and (bookmark_field in transformed_record):
                    if bookmark_type == 'integer':
                        # Keep only records whose bookmark is after the last_integer
                        if transformed_record[bookmark_field] >= last_integer:
                            write_record(stream_name, transformed_record, \
                                time_extracted=time_extracted)
                            counter.increment()
                    elif bookmark_type == 'datetime':
                        last_dttm = transform_datetime(last_datetime)
                        bookmark_dttm = transform_datetime(
                            transformed_record[bookmark_field])
                        # Keep only records whose bookmark is after the last_datetime
                        if bookmark_dttm >= last_dttm:
                            # LOGGER.info('record1: {}'.format(record)) # TESTING, comment out
                            write_record(stream_name, transformed_record, \
                                time_extracted=time_extracted)
                            counter.increment()
                else:
                    # LOGGER.info('record2: {}'.format(record)) # TESTING, comment out
                    write_record(stream_name,
                                 transformed_record,
                                 time_extracted=time_extracted)
                    counter.increment()

        LOGGER.info('Stream: {}, Processed {} records'.format(
            stream_name, counter.value))
        return max_bookmark_value
Example #15
def persist_records(catalog, stream_id, records):
    if records:  # check for empty array
        stream = catalog.get_stream(stream_id)
        schema = stream.schema.to_dict()
        stream_metadata = metadata.to_map(stream.metadata)
        with metrics.record_counter(stream_id) as counter:
            for record in records:
                with Transformer() as transformer:
                    record = transformer.transform(record, schema,
                                                   stream_metadata)
                singer.write_record(stream_id, record)
                counter.increment()
Example #16
def process_records(catalog, #pylint: disable=too-many-branches
                    stream_name,
                    records,
                    time_extracted,
                    bookmark_field=None,
                    bookmark_type=None,
                    max_bookmark_value=None,
                    last_datetime=None,
                    last_integer=None,
                    parent=None,
                    parent_id=None):
    stream = catalog.get_stream(stream_name)
    schema = stream.schema.to_dict()
    stream_metadata = metadata.to_map(stream.metadata)

    with metrics.record_counter(stream_name) as counter:
        for record in records:
            # If child object, add parent_id to record
            if parent_id and parent:
                record[parent + '_id'] = parent_id

            # Transform record for Singer.io
            with Transformer() as transformer:
                transformed_record = transformer.transform(
                    record, schema, stream_metadata)

                # Reset max_bookmark_value to new value if higher
                if bookmark_field and (bookmark_field in transformed_record):
                    if (max_bookmark_value is None) or \
                        (transformed_record[bookmark_field] > max_bookmark_value):
                        max_bookmark_value = transformed_record[bookmark_field]

                if bookmark_field:
                    if bookmark_field in transformed_record:
                        if bookmark_type == 'integer':
                            # Keep only records whose bookmark is after the last_integer
                            if transformed_record[bookmark_field] >= last_integer:
                                write_record(stream_name, transformed_record, time_extracted=time_extracted)
                                counter.increment()
                        elif bookmark_type == 'datetime':
                            last_dttm = transformer._transform_datetime(last_datetime)
                            bookmark_dttm = transformer._transform_datetime(transformed_record[bookmark_field])
                            # Keep only records whose bookmark is after the last_datetime
                            if bookmark_dttm >= last_dttm:
                                write_record(stream_name, transformed_record, time_extracted=time_extracted)
                                counter.increment()
                else:
                    write_record(stream_name, transformed_record, time_extracted=time_extracted)
                    counter.increment()

        return max_bookmark_value, counter.value
Example #17
def sync_campaign_ids_endpoint(sdk_client,
                               campaign_ids,
                               stream_schema,
                               stream,
                               stream_metadata):
    discovered_schema = load_schema(stream)

    field_list = get_field_list(discovered_schema, stream, stream_metadata)
    discovered_schema['properties']['_sdc_customer_id'] = {
        'description': 'Profile ID',
        'type': 'string',
        'field': "customer_id"
    }
    primary_keys = GENERIC_ENDPOINT_MAPPINGS[stream]['primary_keys']
    write_schema(stream, discovered_schema, primary_keys)

    LOGGER.info("Syncing %s for customer %s", stream, sdk_client.client_customer_id)

    for safe_selector in get_campaign_ids_safe_selectors(
            sdk_client,
            campaign_ids,
            stream):
        start_index = 0
        while True:
            page = get_campaign_ids_filtered_page(sdk_client,
                                                  field_list,
                                                  safe_selector,
                                                  stream,
                                                  start_index)
            if page['totalNumEntries'] > GOOGLE_MAX_RESULTSET_SIZE:
                raise Exception("Too many {} ({} > {}) for customer {}, campaigns {}".format(
                    stream,
                    page['totalNumEntries'],
                    GOOGLE_MAX_RESULTSET_SIZE,
                    sdk_client.client_customer_id,
                    campaign_ids))
            if 'entries' in page:
                with metrics.record_counter(stream) as counter:
                    for entry in page['entries']:
                        obj = suds_to_dict(entry)
                        obj['_sdc_customer_id'] = sdk_client.client_customer_id
                        with Transformer(singer.UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee: #pylint: disable=line-too-long
                            bumble_bee.pre_hook = transform_pre_hook
                            record = bumble_bee.transform(obj, discovered_schema)

                            singer.write_record(stream, record)
                            counter.increment()

            start_index += PAGE_SIZE
            if start_index > int(page['totalNumEntries']):
                break
    LOGGER.info("Done syncing %s for customer_id %s", stream, sdk_client.client_customer_id)
Example #18
def sync_rows(filename,
              STATE,
              tap_stream_id,
              key_properties=[],
              auth_method=None,
              max_page=None):
    schema = load_schema(tap_stream_id)
    singer.write_schema(tap_stream_id, schema, key_properties)

    bookmark_type = get_bookmark_type()

    start = get_start(STATE, tap_stream_id, "last_update")
    end = get_end()

    pretty_start = start
    pretty_end = end
    if bookmark_type == "timestamp":
        pretty_start = str(start) + " (" + str(
            datetime.datetime.fromtimestamp(start)) + ")"
        if end is not None:
            pretty_end = str(end) + " (" + str(
                datetime.datetime.fromtimestamp(end)) + ")"

    LOGGER.info(
        "Stream %s has a %s bookmark set, starting at %s and ending at %s. The URL format is assumed to contain those params; the exact behavior depends on the data source API's spec. Records outside the boundary are not filtered out, so every record received will be written out."
        % (tap_stream_id, bookmark_type, pretty_start, pretty_end))

    last_update = start
    etl_tstamp = int(time.time())
    with metrics.record_counter(tap_stream_id) as counter:
        data = read_csv_as_dict(filename,
                                skip=CONFIG.get("skip"),
                                lower=True,
                                replace_special="_",
                                snake_case=True)

        LOGGER.info("Read %d records from CSV. ETL timestamp %d" %
                    (len(data), etl_tstamp))
        for row in data:
            counter.increment()
            row = get_record(row, CONFIG.get("record_level"))
            row = filter_result(row, schema)
            if "_etl_tstamp" in schema["properties"].keys():
                row["_etl_tstamp"] = etl_tstamp
            last_update = get_last_update(row, last_update)
            singer.write_record(tap_stream_id, row)

    STATE = singer.write_bookmark(STATE, tap_stream_id, 'last_update',
                                  last_update)

    singer.write_state(STATE)
    return STATE
Example #19
    def print_counts(cls):
        # Separate loops for formatting.
        for stream_name, stream_count in Context.new_counts.items():
            with metrics.record_counter(stream_name) as counter:
                updates_count = Context.updated_counts[stream_name]
                total_replicated = stream_count + updates_count
                counter.increment(total_replicated)

        LOGGER.info('------------------')
        for stream_name, stream_count in Context.new_counts.items():
            LOGGER.info('%s: %d new, %d updates', stream_name, stream_count,
                        Context.updated_counts[stream_name])
        LOGGER.info('------------------')
Example #20
def write_page(self, page):
    stream = Context.get_catalog_entry(self.tap_stream_id)
    stream_metadata = metadata.to_map(stream.metadata)
    extraction_time = singer.utils.now()
    for rec in page:
        with Transformer() as transformer:
            rec = transformer.transform(rec, stream.schema.to_dict(),
                                        stream_metadata)
        singer.write_record(self.tap_stream_id,
                            rec,
                            time_extracted=extraction_time)
    with metrics.record_counter(self.tap_stream_id) as counter:
        counter.increment(len(page))
Example #21
def sync_events():
    schema = load_schema("events")
    singer.write_schema("events", schema, [])

    for export_bundle in request_export_bundles():
        with metrics.record_counter("events") as counter:
            for event in download_events(export_bundle['Id']):
                transform_event(event)
                counter.increment()
                singer.write_record("events", event)
            stop_timestamp = datetime.datetime.utcfromtimestamp(export_bundle['Stop'])
            utils.update_state(STATE, "events", stop_timestamp)
            singer.write_state(STATE)
Example #22
def gen_request(STATE,
                tap_stream_id,
                url,
                params,
                path,
                more_key,
                offset_keys,
                offset_targets,
                v3_fields=None):
    if len(offset_keys) != len(offset_targets):
        raise ValueError(
            "Number of offset_keys must match number of offset_targets")

    if singer.get_offset(STATE, tap_stream_id):
        params.update(singer.get_offset(STATE, tap_stream_id))

    with metrics.record_counter(tap_stream_id) as counter:
        while True:
            data = request(url, params).json()

            if data.get(path) is None:
                raise RuntimeError(
                    "Unexpected API response: {} not in {}".format(
                        path, data.keys()))

            if v3_fields:
                v3_data = get_v3_deals(v3_fields, data[path])

                # The shape of v3_data is different than the V1 response,
                # so we transform v3 to look like v1
                transformed_v3_data = process_v3_deals_records(v3_data)
                merge_responses(data[path], transformed_v3_data)

            for row in data[path]:
                counter.increment()
                yield row

            if not data.get(more_key, False):
                break

            STATE = singer.clear_offset(STATE, tap_stream_id)
            for key, target in zip(offset_keys, offset_targets):
                if key in data:
                    params[target] = data[key]
                    STATE = singer.set_offset(STATE, tap_stream_id, target,
                                              data[key])

            singer.write_state(STATE)

    STATE = singer.clear_offset(STATE, tap_stream_id)
    singer.write_state(STATE)
Example #24
def sync_stream(sf, catalog_entry, state):
    stream = catalog_entry['stream']

    with metrics.record_counter(stream) as counter:
        try:
            sync_records(sf, catalog_entry, state, counter)
            singer.write_state(state)
        except RequestException as ex:
            raise Exception("Error syncing {}: {} Response: {}".format(
                stream, ex, ex.response.text))
        except Exception as ex:
            raise Exception("Error syncing {}: {}".format(stream, ex)) from ex

        return counter
Example #25
def sync_stream(sf, catalog_entry, state, state_msg_threshold):
    stream = catalog_entry['stream']

    with metrics.record_counter(stream) as counter:
        try:
            sync_records(sf, catalog_entry, state, counter,
                         state_msg_threshold)
            # Write the state generated for the last record returned by sf.query
            singer.write_state(state)
        except RequestException as ex:
            raise Exception("Error syncing {}: {} Response: {}".format(
                stream, ex, ex.response.text))
        except Exception as ex:
            raise Exception("Error syncing {}: {}".format(stream, ex)) from ex
Example #26
def sync_endpoint(url, state):
    '''Syncs the url and paginates through until there are no more "next"
    urls. Yields schema, record, and state messages. Modifies state by
    setting the NEXT field every time we get a next url from Shippo. This
    allows us to resume paginating if we're terminated.

    '''
    stream = parse_stream_from_url(url)
    yield singer.SchemaMessage(stream=stream,
                               schema=load_schema(stream),
                               key_properties=["object_id"])

    if LAST_START_DATE in state:
        start = pendulum.parse(state[LAST_START_DATE]).subtract(days=2)
    else:
        start = pendulum.parse(CONFIG[START_DATE])
    # The Shippo API does not return data from long ago, so we only try to
    # replicate the last 60 days
    sixty_days_ago = pendulum.now().subtract(days=60)
    bounded_start = max(start, sixty_days_ago)
    LOGGER.info("Replicating all %s from %s", stream, bounded_start)

    rows_read = 0
    rows_written = 0
    finished = False
    with metrics.record_counter(parse_stream_from_url(url)) as counter:
        while url and not finished:
            state[NEXT] = url
            yield singer.StateMessage(value=state)

            data = request(url)

            for row in data['results']:
                counter.increment()
                rows_read += 1
                updated = pendulum.parse(row[OBJECT_UPDATED])
                if updated >= bounded_start:
                    row = fix_extra_map(row)
                    yield singer.RecordMessage(stream=stream, record=row)
                    rows_written += 1
                else:
                    finished = True
                    break

            url = data.get(NEXT)

    if rows_read:
        LOGGER.info("Done syncing %s. Read %d records, wrote %d (%.2f%%)",
                    stream, rows_read, rows_written,
                    100.0 * rows_written / float(rows_read))
Example #27
def get_all_projects(schemas, repo_path, state, mdata):
    bookmark_value = get_bookmark(state, repo_path, "projects", "since")
    if bookmark_value:
        bookmark_time = singer.utils.strptime_to_utc(bookmark_value)
    else:
        bookmark_time = 0

    with metrics.record_counter('projects') as counter:
        #pylint: disable=too-many-nested-blocks
        for response in authed_get_all_pages(
                'projects',
                'https://api.github.com/repos/{}/projects?sort=created_at&direction=desc'.format(repo_path),
                { 'Accept': 'application/vnd.github.inertia-preview+json' }
        ):
            projects = response.json()
            extraction_time = singer.utils.now()
            for r in projects:
                r['_sdc_repository'] = repo_path

                # skip records that haven't been updated since the last run;
                # the GitHub API doesn't currently allow a ?since param for projects,
                # so once we find the first piece of old data we can return,
                # thanks to the descending sort
                if bookmark_time and singer.utils.strptime_to_utc(r.get('updated_at')) < bookmark_time:
                    return state

                # transform and write project record
                with singer.Transformer() as transformer:
                    rec = transformer.transform(r, schemas, metadata=metadata.to_map(mdata))
                singer.write_record('projects', rec, time_extracted=extraction_time)
                singer.write_bookmark(state, repo_path, 'projects', {'since': singer.utils.strftime(extraction_time)})
                counter.increment()

                project_id = r.get('id')

                # sync project_columns if that schema is present (only there if selected)
                if schemas.get('project_columns'):
                    for project_column_rec in get_all_project_columns(project_id, schemas['project_columns'], repo_path, state, mdata):
                        singer.write_record('project_columns', project_column_rec, time_extracted=extraction_time)
                        singer.write_bookmark(state, repo_path, 'project_columns', {'since': singer.utils.strftime(extraction_time)})

                        # sync project_cards if that schema is present (only there if selected)
                        if schemas.get('project_cards'):
                            column_id = project_column_rec['id']
                            for project_card_rec in get_all_project_cards(column_id, schemas['project_cards'], repo_path, state, mdata):
                                singer.write_record('project_cards', project_card_rec, time_extracted=extraction_time)
                                singer.write_bookmark(state, repo_path, 'project_cards', {'since': singer.utils.strftime(extraction_time)})
    return state
Example #28
def process_records(
        catalog,  #pylint: disable=too-many-branches
        stream_name,
        records,
        time_extracted,
        bookmark_field=None,
        max_bookmark_value=None,
        last_datetime=None):
    stream = catalog.get_stream(stream_name)
    schema = stream.schema.to_dict()
    stream_metadata = metadata.to_map(stream.metadata)

    with metrics.record_counter(stream_name) as counter:
        for record in records:
            # Transform record for Singer.io
            # LOGGER.info('Process record = {}'.format(record)) # COMMENT OUT
            with Transformer() as transformer:
                try:
                    transformed_record = transformer.transform(
                        record, schema, stream_metadata)
                except Exception as err:
                    LOGGER.error('Error: {}'.format(err))
                    LOGGER.error('Error record: {}'.format(
                        json.dumps(record, sort_keys=True, indent=2)))
                    LOGGER.error(' for schema: {}'.format(
                        json.dumps(schema, sort_keys=True, indent=2)))
                    raise err

                # Reset max_bookmark_value to new value if higher
                if transformed_record.get(bookmark_field):
                    if max_bookmark_value is None or \
                        transformed_record[bookmark_field] > transform_datetime(max_bookmark_value):
                        max_bookmark_value = transformed_record[bookmark_field]

                if bookmark_field and (bookmark_field in transformed_record):
                    last_dttm = transform_datetime(last_datetime)
                    bookmark_dttm = transform_datetime(
                        transformed_record[bookmark_field])
                    # Keep only records whose bookmark is after the last_datetime
                    if bookmark_dttm >= last_dttm:
                        write_record(stream_name, transformed_record, \
                            time_extracted=time_extracted)
                        counter.increment()
                else:
                    write_record(stream_name,
                                 transformed_record,
                                 time_extracted=time_extracted)
                    counter.increment()

        return max_bookmark_value, counter.value
Example #29
def sync_stream(sf, catalog_entry, state):
    stream = catalog_entry['stream']

    with metrics.record_counter(stream) as counter:
        try:
            sync_records(sf, catalog_entry, state, counter)
            singer.write_state(state)
        except TapSalesforceException as ex:
            raise type(ex)("Error syncing {}: {}".format(stream, ex))
        except Exception as ex:
            raise Exception("Unexpected error syncing {}: {}".format(
                stream, ex)) from ex

        return counter
Example #30
def process_records(stream, mdata, max_modified, records):
    schema = stream.schema.to_dict()
    with metrics.record_counter(stream.tap_stream_id) as counter:
        for record in records:
            if record['Modified'] > max_modified:
                max_modified = record['Modified']

            with Transformer() as transformer:
                record = transformer.transform(record,
                                               schema,
                                               mdata)
            singer.write_record(stream.tap_stream_id, record)
            counter.increment()
        return max_modified
Example #31
def process_records(
        catalog,  #pylint: disable=too-many-branches
        stream_name,
        records,
        time_extracted,
        bookmark_field=None,
        max_bookmark_value=None,
        last_datetime=None,
        parent=None,
        parent_id=None):
    stream = catalog.get_stream(stream_name)
    schema = stream.schema.to_dict()
    stream_metadata = metadata.to_map(stream.metadata)

    with metrics.record_counter(stream_name) as counter:
        for record in records:
            # If child object, add parent_id to record
            if parent_id and parent:
                record[parent + '_id'] = parent_id

            # Transform record for Singer.io
            with Transformer(integer_datetime_fmt=UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) \
                as transformer:
                transformed_record = transformer.transform(
                    record, schema, stream_metadata)

                # Reset max_bookmark_value to new value if higher
                if bookmark_field and (bookmark_field in transformed_record):
                    if max_bookmark_value is None or \
                        strptime_to_utc(transformed_record[bookmark_field]) > strptime_to_utc(max_bookmark_value):
                        max_bookmark_value = transformed_record[bookmark_field]

                if bookmark_field and (bookmark_field in transformed_record):
                    last_dttm = strptime_to_utc(last_datetime)
                    bookmark_dttm = strptime_to_utc(
                        transformed_record[bookmark_field])
                    # Keep only records whose bookmark is after the last_datetime
                    if bookmark_dttm >= last_dttm:
                        write_record(stream_name,
                                     transformed_record,
                                     time_extracted=time_extracted)
                        counter.increment()
                else:
                    write_record(stream_name,
                                 transformed_record,
                                 time_extracted=time_extracted)
                    counter.increment()

        return max_bookmark_value, counter.value