Example 1
def save_state(state):
    if not state:
        return

    LOGGER.info('Updating state.')

    singer.write_state(state)
Example 2
def sync_events():
    schema = load_schema("events")
    singer.write_schema("events", schema, [])

    for export_bundle in request_export_bundles():
        with metrics.record_counter("events") as counter:
            for event in download_events(export_bundle['Id']):
                transform_event(event)
                counter.increment()
                singer.write_record("events", event)
            stop_timestamp = datetime.datetime.utcfromtimestamp(export_bundle['Stop'])
            utils.update_state(STATE, "events", stop_timestamp)
            singer.write_state(STATE)
Example 3
def update_currently_syncing(state, stream_name):
    if (stream_name is None) and ('currently_syncing' in state):
        del state['currently_syncing']
    else:
        singer.set_currently_syncing(state, stream_name)
    singer.write_state(state)
Example 4
def sync_partners():
    schema = load_schema("raw_data/partners")
    singer.write_schema("partners_report", schema, [])
    fieldnames = (
        "agency_pmd_af_prt",
        "media_source_pid",
        "campaign",
        "impressions",
        "clicks",
        "ctr",
        "installs",
        "conversion_rate",
        "sessions",
        "loyal_users",
        "loyal_users_Installs",
        "total_revenue",
        "total_cost",
        "roi",
        "arpu",
        "average_ecpi",
        "af_content_view_unique_users",
        "af_content_view_event_counter",
        "af_content_view_sales_in_usd",
        "app_confirmed_sms_unique_users",
        "app_confirmed_sms_event_counter",
        "app_confirmed_sms_sales_in_usd",
        "app_facial_image_unique_users",
        "app_facial_image_event_counter",
        "app_facial_image_sales_in_usd",
        "app_loginpage_unique_users",
        "app_loginpage_event_counter",
        "app_loginpage_sales_in_usd",
        "app_onboard_success_unique_users",
        "app_onboard_success_event_counter",
        "app_onboard_success_sales_in_usd",
        "app_open_unique_users",
        "app_open_event_counter",
        "app_open_sales_in_usd",
        "app_passcode_1_unique_users",
        "app_passcode_1_event_counter",
        "app_passcode_1_sales_in_usd",
        "app_passcode_2_unique_users",
        "app_passcode_2_event_counter",
        "app_passcode_2_sales_in_usd",
        "app_phone_number_add_unique_users",
        "app_phone_number_add_event_counter",
        "app_phone_number_add_sales_in_usd",
        "app_registered_success_unique_users",
        "app_registered_success_event_counter",
        "app_registered_success_sales_in_usd",
        "app_waiting_sms_code_unique_users",
        "app_waiting_sms_code_event_counter",
        "app_waiting_sms_code_sales_in_usd",
        "emotion_validation_unique_users",
        "emotion_validation_event_counter",
        "emotion_validation_sales_in_usd"
    )

    from_datetime = get_start("partners")
    to_datetime = get_stop(from_datetime, datetime.datetime.now())

    if to_datetime < from_datetime:
        LOGGER.error("to_datetime (%s) is less than from_endtime (%s).", to_datetime, from_datetime)
        return

    params = dict()
    params["from"] = from_datetime.strftime("%Y-%m-%d %H:%M")
    params["to"] = to_datetime.strftime("%Y-%m-%d %H:%M")
    params["api_token"] = CONFIG["api_token"]

    url = get_url("partners", app_id=CONFIG["app_id"])
    request_data = request(url, params)

    csv_data = RequestToCsvAdapter(request_data)
    reader = csv.DictReader(csv_data, fieldnames)

    next(reader)  # Skip the heading row

    bookmark = from_datetime
    for i, row in enumerate(reader):
        record = xform(row, schema)
        singer.write_record("parthners", record)
        # AppsFlyer returns records in order of most recent first.
        if utils.strptime(record["attributed_touch_time"]) > bookmark:
            bookmark = utils.strptime(record["attributed_touch_time"])

    # Write out state
    utils.update_state(STATE, "parthners", bookmark)
    singer.write_state(STATE)
Example 5
def update_current_stream(state, stream_name=None):
    set_currently_syncing(state, stream_name)
    singer.write_state(state)
Example 6
def write_bookmark(state, stream, value):
    if 'bookmarks' not in state:
        state['bookmarks'] = {}
    state['bookmarks'][stream] = value
    LOGGER.info('Write state for stream: {}, value: {}'.format(stream, value))
    singer.write_state(state)
Example 7
    def write_state(self):
        return singer.write_state(self.state)
Example 8
    def do_sync(self):
        logger.debug('Starting sync')

        # resuming when currently_syncing within state
        resume_from_stream = False
        if self.state and 'currently_syncing' in self.state:
            resume_from_stream = self.state['currently_syncing']

        for stream in self.streams:
            stream.tap = self

            if resume_from_stream:
                if stream.schema == resume_from_stream:
                    logger.info('Resuming from {}'.format(resume_from_stream))
                    resume_from_stream = False
                else:
                    logger.info('Skipping stream {} as resuming from {}'.format(stream.schema, resume_from_stream))
                    continue

            # stream state, from state/bookmark or start_date
            stream.set_initial_state(self.state, self.config['start_date'])

            # currently syncing
            if stream.state_field:
                set_currently_syncing(self.state, stream.schema)
                self.state = singer.write_bookmark(self.state, stream.schema, stream.state_field, str(stream.initial_state))
                singer.write_state(self.state)

            # schema
            stream.write_schema()

            # paginate
            while stream.has_data():

                with singer.metrics.http_request_timer(stream.schema) as timer:
                    try:
                        response = self.execute_stream_request(stream)
                    except (ConnectionError, RequestException):
                        raise
                    timer.tags[singer.metrics.Tag.http_status_code] = response.status_code

                self.validate_response(response)
                self.rate_throttling(response)
                stream.paginate(response)

                # records with metrics
                with singer.metrics.record_counter(stream.schema) as counter:
                    with singer.Transformer(singer.NO_INTEGER_DATETIME_PARSING) as optimus_prime:
                        for row in self.iterate_response(response):
                            row = stream.process_row(row)

                            if not row: # in case of a non-empty response with an empty element
                                continue
                            row = optimus_prime.transform(row, stream.get_schema())
                            if stream.write_record(row):
                                counter.increment()
                            stream.update_state(row)

            # update state / bookmarking only when supported by stream
            if stream.state_field:
                self.state = singer.write_bookmark(self.state, stream.schema, stream.state_field,
                                                   str(stream.earliest_state))
            singer.write_state(self.state)

        # clear currently_syncing
        try:
            del self.state['currently_syncing']
        except KeyError:
            pass
        singer.write_state(self.state)
Example 9
    def test_write_state(self):
        singer.write_state({"foo": 1})
Example 10
def sync_event_updates(stream_name):
    '''
    Get updates via events endpoint

    look at 'events update' bookmark and pull events after that
    '''
    LOGGER.info("Started syncing event based updates")

    bookmark_value = singer.get_bookmark(Context.state,
                                         stream_name + '_events',
                                         'updates_created') or \
                     int(utils.strptime_to_utc(Context.config["start_date"]).timestamp())
    max_created = bookmark_value
    date_window_start = max_created
    date_window_end = max_created + 604800  # Number of seconds in a week

    stop_paging = False

    # Create a map relating event object ids to timestamps
    updated_object_timestamps = {}

    while not stop_paging:
        extraction_time = singer.utils.now()

        response = STREAM_SDK_OBJECTS['events']['sdk_object'].list(
            **{
                "limit": 100,
                "type": STREAM_TO_TYPE_FILTER[stream_name]['type'],
                "stripe_account": Context.config.get('account_id'),
                # None passed to starting_after appears to retrieve
                # all of them so this should always be safe.
                "created[gte]": date_window_start,
                "created[lt]": date_window_end,
            })

        # Stop paging once a window returns no results and its end has passed the current time
        if not len(response) and date_window_end > extraction_time.timestamp():  # pylint: disable=len-as-condition
            stop_paging = True

        for events_obj in response.auto_paging_iter():
            event_resource_obj = events_obj.data.object
            sub_stream_name = SUB_STREAMS.get(stream_name)

            # Check whether we should sync the event based on its created time
            if not should_sync_event(
                    events_obj, STREAM_TO_TYPE_FILTER[stream_name]['object'],
                    updated_object_timestamps):
                continue

            # Syncing an event as it's the first time we've seen it or it's the most recent version
            with Transformer(singer.UNIX_SECONDS_INTEGER_DATETIME_PARSING
                             ) as transformer:
                event_resource_metadata = metadata.to_map(
                    Context.get_catalog_entry(stream_name)['metadata'])

                # Filter out line items with null ids
                if isinstance(
                        events_obj.get('data').get('object'), stripe.Invoice):
                    invoice_obj = events_obj.get('data', {}).get('object', {})
                    line_items = invoice_obj.get('lines', {}).get('data')

                    if line_items:
                        filtered_line_items = [
                            line_item for line_item in line_items
                            if line_item.get('id')
                        ]

                        invoice_obj['lines']['data'] = filtered_line_items

                rec = unwrap_data_objects(
                    event_resource_obj.to_dict_recursive())
                rec = reduce_foreign_keys(rec, stream_name)
                rec["updated"] = events_obj.created
                rec = transformer.transform(
                    rec,
                    Context.get_catalog_entry(stream_name)['schema'],
                    event_resource_metadata)

                if events_obj.created >= bookmark_value:
                    if rec.get('id') is not None:
                        singer.write_record(stream_name,
                                            rec,
                                            time_extracted=extraction_time)
                        Context.updated_counts[stream_name] += 1

                        # Delete events should be synced but not their subobjects
                        if events_obj.get('type', '').endswith('.deleted'):
                            continue

                        if sub_stream_name and Context.is_selected(
                                sub_stream_name):
                            if event_resource_obj:
                                sync_sub_stream(sub_stream_name,
                                                event_resource_obj,
                                                updates=True)
            if events_obj.created > max_created:
                max_created = events_obj.created

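        # Advance the one-week window and persist the bookmark before requesting the next window of events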
        date_window_start = date_window_end
        date_window_end = date_window_end + 604800
        singer.write_bookmark(Context.state, stream_name + '_events',
                              'updates_created', max_created)
        singer.write_state(Context.state)

    singer.write_state(Context.state)
Example 11
def sync_stream(stream_name):
    """
    Sync each stream, looking for newly created records. Updates are captured by events stream.
    """
    LOGGER.info("Started syncing stream %s", stream_name)

    stream_metadata = metadata.to_map(
        Context.get_catalog_entry(stream_name)['metadata'])
    stream_field_whitelist = json.loads(
        Context.config.get('whitelist_map', '{}')).get(stream_name)

    extraction_time = singer.utils.now()
    replication_key = metadata.get(stream_metadata, (),
                                   'valid-replication-keys')[0]
    # Invoice Items bookmarks on `date`, but queries on `created`
    filter_key = 'created' if stream_name == 'invoice_items' else replication_key
    stream_bookmark = singer.get_bookmark(Context.state, stream_name, replication_key) or \
        int(utils.strptime_to_utc(Context.config["start_date"]).timestamp())
    bookmark = stream_bookmark

    # if this stream has a sub_stream, compare the bookmark
    sub_stream_name = SUB_STREAMS.get(stream_name)

    # If there is a sub-stream and it's selected, get its bookmark (or the start date if no bookmark)
    should_sync_sub_stream = sub_stream_name and Context.is_selected(
        sub_stream_name)
    if should_sync_sub_stream:
        sub_stream_bookmark = singer.get_bookmark(Context.state, sub_stream_name, replication_key) \
            or int(utils.strptime_to_utc(Context.config["start_date"]).timestamp())

        # if there is a sub stream, set bookmark to sub stream's bookmark
        # since we know it must be earlier than the stream's bookmark
        if sub_stream_bookmark != stream_bookmark:
            bookmark = sub_stream_bookmark
    else:
        sub_stream_bookmark = None

    with Transformer(
            singer.UNIX_SECONDS_INTEGER_DATETIME_PARSING) as transformer:
        end_time = dt_to_epoch(utils.now())
        window_size = int(
            Context.config.get('date_window_size', DEFAULT_DATE_WINDOW_SIZE))
        if DEFAULT_DATE_WINDOW_SIZE != window_size:
            LOGGER.info('Using non-default date window size of %d',
                        window_size)
        start_window = bookmark

        # NB: We observed records coming through newest->oldest and so
        # date-windowing was added and the tap only bookmarks after it has
        # gotten through a date window
        while start_window < end_time:
            stop_window = dt_to_epoch(
                epoch_to_dt(start_window) + timedelta(days=window_size))
            # cut off the last window at the end time
            if stop_window > end_time:
                stop_window = end_time

            for stream_obj in paginate(
                    STREAM_SDK_OBJECTS[stream_name]['sdk_object'], filter_key,
                    start_window, stop_window):

                # get the replication key value from the object
                rec = unwrap_data_objects(stream_obj.to_dict_recursive())
                rec = reduce_foreign_keys(rec, stream_name)
                stream_obj_created = rec[replication_key]
                rec['updated'] = stream_obj_created

                # sync stream if object is greater than the bookmark
                if stream_obj_created > stream_bookmark:
                    rec = transformer.transform(
                        rec,
                        Context.get_catalog_entry(stream_name)['schema'],
                        stream_metadata)

                    # At this point, the record has been transformed and so
                    # any de-selected fields have been pruned. Now, prune off
                    # any fields that aren't present in the whitelist.
                    if stream_field_whitelist:
                        rec = apply_whitelist(rec, stream_field_whitelist)

                    singer.write_record(stream_name,
                                        rec,
                                        time_extracted=extraction_time)

                    Context.new_counts[stream_name] += 1

                # sync sub streams if it's selected and the parent object
                # is greater than its bookmark
                if should_sync_sub_stream and stream_obj_created > sub_stream_bookmark:
                    sync_sub_stream(sub_stream_name, stream_obj)

            # Update stream/sub-streams bookmarks as stop window
            if stop_window > stream_bookmark:
                stream_bookmark = stop_window
                singer.write_bookmark(Context.state, stream_name,
                                      replication_key, stream_bookmark)

            # the sub stream bookmarks on its parent
            if should_sync_sub_stream and stop_window > sub_stream_bookmark:
                sub_stream_bookmark = stop_window
                singer.write_bookmark(Context.state, sub_stream_name,
                                      replication_key, sub_stream_bookmark)

            singer.write_state(Context.state)

            # update window for next iteration
            start_window = stop_window

    singer.write_state(Context.state)
Example 12
def sync_endpoint(schema_name,
                  endpoint=None,
                  path=None,
                  special_field_name=None,
                  special_field_value=None,
                  keys=None,
                  object_to_id=None,
                  parameter_for_updated=None):
    schema = load_schema(schema_name)
    bookmark_property = 'updated_at'
    LOGGER.info('Loading ' + schema_name)
    if keys is None:
        keys = ['id']
    singer.write_schema(schema_name,
                        schema,
                        keys,
                        bookmark_properties=[bookmark_property])

    start = get_start(schema_name)
    start_dt = datetime.datetime.strptime(start, '%Y-%m-%dT%H:%M:%S.%fZ')
    updated_since = start_dt.strftime("%Y%m%dT%H%M%S")
    LOGGER.info('updated_since ' + updated_since)
    with Transformer() as transformer:
        url = get_url(endpoint or schema_name)
        url = endpoint or url
        if parameter_for_updated is not None:
            url = url + '?' + parameter_for_updated + '=' + updated_since
        response = request(url, None)
        LOGGER.info('URL: ' + url)
        if schema_name == 'project_financials':
            response = [response]

        time_extracted = utils.now()

        for row in response:

            if special_field_name is not None:
                row[special_field_name] = special_field_value

            if object_to_id is not None:
                for key in object_to_id:
                    if row[key] is not None:
                        row[key + '_id'] = row[key]['id']
                    else:
                        row[key + '_id'] = None

            item = transformer.transform(row, schema)

            if bookmark_property not in item:
                item[bookmark_property] = \
                    datetime.datetime.now().strftime('%Y-%m-%d') \
                    + 'T00:00:00.00Z'

            if datetime.datetime.strptime(item[bookmark_property],
                                          '%Y-%m-%dT%H:%M:%S.%fZ') >= start_dt:
                singer.write_record(schema_name,
                                    item,
                                    time_extracted=time_extracted)

                utils.update_state(STATE, schema_name, item[bookmark_property])
    singer.write_state(STATE)
Example 13
def sync_rate_cards(  # pylint: disable=too-many-arguments
    schema_name,
    endpoint=None,
    path=None,
    special_field_name=None,
    special_field_value=None,
    date_fields=None,
    with_updated_since=True,
    for_each_handler=None,
    map_handler=None,
    object_to_id=None,
):
    schema = load_schema(schema_name)
    bookmark_property = 'updated_at'
    LOGGER.info('Loading ' + schema_name)
    singer.write_schema(schema_name,
                        schema, ['id'],
                        bookmark_properties=[bookmark_property])

    start = get_start(schema_name)

    with Transformer() as transformer:
        url = get_url(endpoint or schema_name)
        url = endpoint or url
        response = request(url, None)

        time_extracted = utils.now()

        for row in response:
            if map_handler is not None:
                row = map_handler(row)

            if object_to_id is not None:
                for key in object_to_id:
                    if row[key] is not None:
                        row[key + '_id'] = row[key]['id']
                    else:
                        row[key + '_id'] = None

            item = transformer.transform(row, schema)
            if bookmark_property not in item:
                item[bookmark_property] = \
                    datetime.datetime.now().strftime('%Y-%m-%d') \
                    + 'T00:00:00Z'

            # sync the rates that belong to this rate card

            sync_endpoint(
                'rate_cards_rates',
                BASE_API_URL + 'rate_cards/' + str(row['id']) + '/rates',
                None,
                'rate_card_id',
                str(row['id']),
                ['rate_card_id', 'role'],
            )

            singer.write_record(schema_name,
                                item,
                                time_extracted=time_extracted)

            # take any additional actions required for the currently loaded endpoint

            utils.update_state(STATE, schema_name, item[bookmark_property])
    singer.write_state(STATE)
Example 14
def sync_project(  # pylint: disable=too-many-arguments
    schema_name,
    endpoint=None,
    path=None,
    special_field_name=None,
    special_field_value=None,
    date_fields=None,
    with_updated_since=True,
    for_each_handler=None,
    map_handler=None,
    object_to_id=None,
):
    schema = load_schema(schema_name)
    bookmark_property = 'updated_at'
    LOGGER.info('Loading ' + schema_name)
    singer.write_schema(schema_name,
                        schema, ['id'],
                        bookmark_properties=[bookmark_property])

    start = get_start(schema_name)

    with Transformer() as transformer:
        url = get_url(endpoint or schema_name)
        url = endpoint or url

        response = request(url, None)

        for row in response:

            item = transformer.transform(row, schema)

            time_extracted = utils.now()

            # sync the project's related sub-endpoints

            sync_endpoint(
                'expense_items',
                BASE_API_URL + 'projects/' + str(row['id']) + '/expense_items',
                None, 'project_id', str(row['id']))
            sync_endpoint(
                'invoices',
                BASE_API_URL + 'projects/' + str(row['id']) + '/invoices',
                None, 'project_id', str(row['id']))
            sync_endpoint(
                'milestones',
                BASE_API_URL + 'projects/' + str(row['id']) + '/milestones',
                None, 'project_id', str(row['id']))
            sync_endpoint(
                'project_team',
                BASE_API_URL + 'projects/' + str(row['id']) + '/team',
                None,
                'project_id',
                str(row['id']),
                ['person_id', 'project_id'],
            )
            sync_endpoint(
                'sprints',
                BASE_API_URL + 'projects/' + str(row['id']) + '/sprints', None,
                'project_id', str(row['id']))
            sync_endpoint(
                'workflow_columns', BASE_API_URL + 'projects/' +
                str(row['id']) + '/workflow_columns', None, 'project_id',
                str(row['id']))
            sync_endpoint(
                'project_financials',
                BASE_API_URL + 'projects/' + str(row['id']) + '/financials',
                None,
                None,
                None,
                ['project_id'],
            )

            if bookmark_property in item and item[bookmark_property] \
                >= start:
                singer.write_record(schema_name,
                                    item,
                                    time_extracted=time_extracted)

                utils.update_state(STATE, schema_name, item[bookmark_property])
    singer.write_state(STATE)
Example 15
def sync_allocations(
    schema_name,
    endpoint=None,
    path=None,
    special_field_name=None,
    special_field_value=None,
    keys=None,
    object_to_id=None,
):
    schema = load_schema(schema_name)
    bookmark_property = 'updated_at'
    LOGGER.info('Loading ' + schema_name)
    if keys is None:
        keys = ['id']
    singer.write_schema(schema_name,
                        schema,
                        keys,
                        bookmark_properties=[bookmark_property])

    start = get_start(schema_name)
    weekDays = [
        'monday',
        'tuesday',
        'wednesday',
        'thursday',
        'friday',
        'saturday',
        'sunday',
    ]
    with Transformer() as transformer:
        url = get_url(endpoint or schema_name)
        url = endpoint or url

        response = request(url, None)

        time_extracted = utils.now()

        for row in response:

            # add logic here

            date = datetime.datetime.strptime(row['start_date'], '%Y-%m-%d')
            LOGGER.info("Project" + str(row['project']) + "-" +
                        str(row['person']))
            end_date = datetime.datetime.strptime(row['end_date'], '%Y-%m-%d')

            newRow = {}
            #LOGGER.info("ID:"  + str(row['id']))
            #LOGGER.info("Date :  "  + date.strftime('%Y%m%d'))

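            # Expand the allocation into one record per day between start_date and end_date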
            while date <= end_date:
                #LOGGER.info('Date :  ' + str(date.weekday()) + 'weekday'
                #             + weekDays[date.weekday()])
                #LOGGER.info(row['project'])
                #LOGGER.info(row[weekDays[date.weekday()]])
                #LOGGER.info(str(date.strftime('%Y-%m-%d')))
                #if row['id'] = 72051:
                #    LOGGER.info(row['project'])
                #    LOGGER.info(row['person'])
                #    LOGGER.info(str(date.strftime('%Y-%m-%d')))
                #    LOGGER.info(str(end_date.strftime('%Y-%m-%d')))

                newRow['allocation'] = row[weekDays[date.weekday()]]
                if newRow['allocation'] <= 0:
                    date = date + timedelta(days=1)
                    continue
                newRow['project'] = row['project']
                newRow['non_project_time'] = row['non_project_time']
                newRow['connected_project'] = row['connected_project']
                newRow['person'] = row['person']
                newRow['project'] = row['project']
                newRow['date'] = date.strftime('%Y-%m-%d')
                newRow['notes'] = row['notes']
                newRow['created_by'] = row['created_by']
                newRow['updated_by'] = row['updated_by']
                newRow['created_at'] = row['created_at']
                newRow['updated_at'] = row['updated_at']
                newRow['id'] = str(row['id']) \
                    + str(date.strftime('%Y%m%d'))

                date = date + timedelta(days=1)

                item = transformer.transform(newRow, schema)

                if bookmark_property not in item:
                    item[bookmark_property] = \
                        datetime.datetime.now().strftime('%Y-%m-%d') \
                        + 'T00:00:00Z'

                # Every expanded day record is written and the bookmark
                # updated with its updated_at value.
                singer.write_record(schema_name,
                                    item,
                                    time_extracted=time_extracted)

                utils.update_state(STATE, schema_name,
                                   item[bookmark_property])
        singer.write_state(STATE)
Example 16
def sync_query(config, state, stream):
    table_name = stream['tap_stream_id']

    #before writing the table version to state, check if we had one to begin with
    first_run = singer.get_bookmark(state, table_name, 'version') is None

    # last run was interrupted if there is a last_evaluated_key bookmark
    was_interrupted = singer.get_bookmark(state, table_name,
                                          'last_evaluated_key') is not None

    #pick a new table version if last run wasn't interrupted
    if was_interrupted:
        stream_version = singer.get_bookmark(state, table_name, 'version')
    else:
        stream_version = int(time.time() * 1000)

    state = singer.write_bookmark(state, table_name, 'version', stream_version)
    singer.write_state(state)

    # For the initial replication, emit an ACTIVATE_VERSION message
    # at the beginning so the records show up right away.
    if first_run:
        singer.write_version(table_name, stream_version)

    mdata = metadata.to_map(stream['metadata'])
    index_name = metadata.get(mdata, (), "IndexName")
    key_condition_expression = metadata.get(mdata, (),
                                            "KeyConditionExpression")
    expression_attribute_values = metadata.get(mdata, (),
                                               "ExpressionAttributeValues")

    rows_saved = 0

    deserializer = Deserializer()
    for result in query_table(table_name, index_name, key_condition_expression,
                              expression_attribute_values, config):
        for item in result.get('Items', []):
            rows_saved += 1
            # TODO: Do we actually have to put the item we retrieve from
            # dynamo into a map before we can deserialize?
            record = deserializer.deserialize_item(item)
            record_message = singer.RecordMessage(stream=table_name,
                                                  record=record,
                                                  version=stream_version)

            singer.write_message(record_message)
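        # LastEvaluatedKey is DynamoDB's pagination token; bookmark it so an interrupted sync can resume where it left off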
        if result.get('LastEvaluatedKey'):
            state = singer.write_bookmark(state, table_name,
                                          'last_evaluated_key',
                                          result.get('LastEvaluatedKey'))
            singer.write_state(state)

    state = singer.clear_bookmark(state, table_name, 'last_evaluated_key')

    state = singer.write_bookmark(state, table_name,
                                  'initial_full_table_complete', True)

    singer.write_state(state)

    singer.write_version(table_name, stream_version)

    return rows_saved
Example 17
    def write_state_message(self):
        '''Writes a Singer state message.'''
        return singer.write_state(self.state)
Example 18
def sync_tickets():
    bookmark_property = 'updated_at'

    singer.write_schema("tickets",
                        utils.load_schema("tickets"),
                        ["id"],
                        bookmark_properties=[bookmark_property])

    singer.write_schema("conversations",
                        utils.load_schema("conversations"),
                        ["id"],
                        bookmark_properties=[bookmark_property])

    singer.write_schema("satisfaction_ratings",
                        utils.load_schema("satisfaction_ratings"),
                        ["id"],
                        bookmark_properties=[bookmark_property])

    singer.write_schema("time_entries",
                        utils.load_schema("time_entries"),
                        ["id"],
                        bookmark_properties=[bookmark_property])

    start = get_start("tickets")
    params = {
        'updated_since': start,
        'order_by': bookmark_property,
        'order_type': "asc",
        'include': "requester,company,stats"
    }
    for i, row in enumerate(gen_request(get_url("tickets"), params)):
        logger.info("Ticket {}: Syncing".format(row['id']))
        row.pop('attachments', None)
        row['custom_fields'] = transform_dict(row['custom_fields'], force_str=True)

        # get all sub-entities and save them
        logger.info("Ticket {}: Syncing conversations".format(row['id']))

        try:
            for subrow in gen_request(get_url("sub_ticket", id=row['id'], entity="conversations")):
                subrow.pop("attachments", None)
                subrow.pop("body", None)
                if subrow[bookmark_property] >= start:
                    singer.write_record("conversations", subrow, time_extracted=singer.utils.now())
        except HTTPError as e:
            if e.response.status_code == 403:
                logger.info('Invalid ticket ID requested from Freshdesk {0}'.format(row['id']))
            else:
                raise

        try:
            logger.info("Ticket {}: Syncing satisfaction ratings".format(row['id']))
            for subrow in gen_request(get_url("sub_ticket", id=row['id'], entity="satisfaction_ratings")):
                subrow['ratings'] = transform_dict(subrow['ratings'], key_key="question")
                if subrow[bookmark_property] >= start:
                    singer.write_record("satisfaction_ratings", subrow, time_extracted=singer.utils.now())
        except HTTPError as e:
            if e.response.status_code == 403:
                logger.info("The Surveys feature is unavailable. Skipping the satisfaction_ratings stream.")
            else:
                raise

        try:
            logger.info("Ticket {}: Syncing time entries".format(row['id']))
            for subrow in gen_request(get_url("sub_ticket", id=row['id'], entity="time_entries")):
                if subrow[bookmark_property] >= start:
                    singer.write_record("time_entries", subrow, time_extracted=singer.utils.now())

        except HTTPError as e:
            if e.response.status_code == 403:
                logger.info("The Timesheets feature is unavailable. Skipping the time_entries stream.")
            else:
                raise

        utils.update_state(STATE, "tickets", row[bookmark_property])
        singer.write_record("tickets", row, time_extracted=singer.utils.now())
        singer.write_state(STATE)
Example 19
def sync_tickets_by_filter(bookmark_property, predefined_filter=None):
    endpoint = "tickets"

    state_entity = endpoint
    if predefined_filter:
        state_entity = state_entity + "_" + predefined_filter

    start = get_start(state_entity)

    params = {
        'updated_since': start,
        'order_by': bookmark_property,
        'order_type': "asc",
        'include': "requester,company,stats"
    }

    if predefined_filter:
        logger.info("Syncing tickets with filter {}".format(predefined_filter))
        params['filter'] = predefined_filter

    for i, row in enumerate(gen_request(get_url(endpoint), params)):
        logger.info("Ticket {}: Syncing".format(row['id']))
        row.pop('attachments', None)
        row['custom_fields'] = transform_dict(row['custom_fields'],
                                              force_str=True)

        # get all sub-entities and save them
        logger.info("Ticket {}: Syncing conversations".format(row['id']))

        try:
            for subrow in gen_request(
                    get_url("sub_ticket", id=row['id'],
                            entity="conversations")):
                subrow.pop("attachments", None)
                subrow.pop("body", None)
                if subrow[bookmark_property] >= start:
                    singer.write_record("conversations",
                                        subrow,
                                        time_extracted=singer.utils.now())
        except HTTPError as e:
            if e.response.status_code == 403:
                logger.info(
                    'Invalid ticket ID requested from Freshdesk {0}'.format(
                        row['id']))
            else:
                raise

        try:
            logger.info("Ticket {}: Syncing satisfaction ratings".format(
                row['id']))
            for subrow in gen_request(
                    get_url("sub_ticket",
                            id=row['id'],
                            entity="satisfaction_ratings")):
                subrow['ratings'] = transform_dict(subrow['ratings'],
                                                   key_key="question")
                if subrow[bookmark_property] >= start:
                    singer.write_record("satisfaction_ratings",
                                        subrow,
                                        time_extracted=singer.utils.now())
        except HTTPError as e:
            if e.response.status_code == 403:
                logger.info(
                    "The Surveys feature is unavailable. Skipping the satisfaction_ratings stream."
                )
            else:
                raise

        try:
            logger.info("Ticket {}: Syncing time entries".format(row['id']))
            for subrow in gen_request(
                    get_url("sub_ticket", id=row['id'],
                            entity="time_entries")):
                if subrow[bookmark_property] >= start:
                    singer.write_record("time_entries",
                                        subrow,
                                        time_extracted=singer.utils.now())

        except HTTPError as e:
            if e.response.status_code == 403:
                logger.info(
                    "The Timesheets feature is unavailable. Skipping the time_entries stream."
                )
            elif e.response.status_code == 404:
                # 404 is being returned for deleted tickets and spam
                logger.info(
                    "Could not retrieve time entries for ticket id {}. This may be caused by tickets "
                    "marked as spam or deleted.".format(row['id']))
            else:
                raise

        utils.update_state(STATE, state_entity, row[bookmark_property])
        singer.write_record(endpoint, row, time_extracted=singer.utils.now())
        singer.write_state(STATE)
Example 20
def sync_report_for_day(stream_name, stream_schema, sdk_client, start,
                        field_list):  # pylint: disable=too-many-locals
    report_downloader = sdk_client.GetReportDownloader(version=VERSION)
    customer_id = sdk_client.client_customer_id
    report = {
        'reportName': 'Seems this is required',
        'dateRangeType': 'CUSTOM_DATE',
        'reportType': stream_name,
        'downloadFormat': 'CSV',
        'selector': {
            'fields': field_list,
            'dateRange': {
                'min': start.strftime('%Y%m%d'),
                'max': start.strftime('%Y%m%d')
            }
        }
    }

    # Fetch the report as a csv string
    with metrics.http_request_timer(stream_name):
        result = attempt_download_report(report_downloader, report)

    headers, csv_reader = parse_csv_stream(result)
    with metrics.record_counter(stream_name) as counter:
        time_extracted = utils.now()

        with Transformer(singer.UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING
                         ) as bumble_bee:
            for row in csv_reader:
                obj = dict(
                    zip(get_xml_attribute_headers(stream_schema, headers),
                        row))
                obj['_sdc_customer_id'] = customer_id
                obj['_sdc_report_datetime'] = REPORT_RUN_DATETIME

                bumble_bee.pre_hook = transform_pre_hook
                obj = bumble_bee.transform(obj, stream_schema)

                singer.write_record(stream_name,
                                    obj,
                                    time_extracted=time_extracted)
                counter.increment()

        if start > get_start_for_stream(sdk_client.client_customer_id,
                                        stream_name):
            LOGGER.info(
                'updating bookmark: %s > %s', start,
                get_start_for_stream(sdk_client.client_customer_id,
                                     stream_name))
            bookmarks.write_bookmark(
                STATE,
                state_key_name(sdk_client.client_customer_id, stream_name),
                'date', start.strftime(utils.DATETIME_FMT))
            singer.write_state(STATE)
        else:
            LOGGER.info(
                'not updating bookmark: %s <= %s', start,
                get_start_for_stream(sdk_client.client_customer_id,
                                     stream_name))

        LOGGER.info(
            "Done syncing %s records for the %s report for customer_id %s on %s",
            counter.value, stream_name, customer_id, start)
Example 21
    def write_state(self):
        singer.write_state(self.state)
Example 22
    def sync(self, start_date):
        for page in self.client.get(self.endpoint, params={}):
            for rec in page:
                yield rec
            singer.write_state(self.state)
Example 23
    def update_bookmark(self, last_updated):
        singer.bookmarks.write_bookmark(
            self.state, self.stream,
            self.stream_metadata.get('replication-key'),
            safe_to_iso8601(last_updated))
        singer.write_state(self.state)
Example 24
    def sync(self, state):
        bookmark = self.get_bookmark(state)
        tickets = self.client.tickets.incremental(start_time=bookmark)

        audits_stream = TicketAudits(self.client)
        metrics_stream = TicketMetrics(self.client)
        comments_stream = TicketComments(self.client)

        def emit_sub_stream_metrics(sub_stream):
            if sub_stream.is_selected():
                singer.metrics.log(
                    LOGGER,
                    Point(metric_type='counter',
                          metric=singer.metrics.Metric.record_count,
                          value=sub_stream.count,
                          tags={'endpoint': sub_stream.stream.tap_stream_id}))
                sub_stream.count = 0

        if audits_stream.is_selected():
            LOGGER.info("Syncing ticket_audits per ticket...")

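        # Buffer each ticket (plus any selected sub-stream records) and flush, writing state, whenever the buffer signals it is time to yield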
        for ticket in tickets:
            zendesk_metrics.capture('ticket')
            generated_timestamp_dt = datetime.datetime.utcfromtimestamp(
                ticket.generated_timestamp).replace(tzinfo=pytz.UTC)
            self.update_bookmark(state, utils.strftime(generated_timestamp_dt))

            ticket_dict = ticket.to_dict()
            ticket_dict.pop(
                'fields'
            )  # NB: Fields is a duplicate of custom_fields, remove before emitting
            should_yield = self._buffer_record((self.stream, ticket_dict))

            if audits_stream.is_selected():
                try:
                    for audit in audits_stream.sync(ticket_dict["id"]):
                        zendesk_metrics.capture('ticket_audit')
                        self._buffer_record(audit)
                except RecordNotFoundException:
                    LOGGER.warning("Unable to retrieve audits for ticket (ID: %s), " \
                    "the Zendesk API returned a RecordNotFound error", ticket_dict["id"])

            if metrics_stream.is_selected():
                try:
                    for metric in metrics_stream.sync(ticket_dict["id"]):
                        zendesk_metrics.capture('ticket_metric')
                        self._buffer_record(metric)
                except RecordNotFoundException:
                    LOGGER.warning("Unable to retrieve metrics for ticket (ID: %s), " \
                    "the Zendesk API returned a RecordNotFound error", ticket_dict["id"])

            if comments_stream.is_selected():
                try:
                    # add ticket_id to ticket_comment so the comment can
                    # be linked back to its corresponding ticket
                    for comment in comments_stream.sync(ticket_dict["id"]):
                        zendesk_metrics.capture('ticket_comment')
                        comment[1].ticket_id = ticket_dict["id"]
                        self._buffer_record(comment)
                except RecordNotFoundException:
                    LOGGER.warning("Unable to retrieve comments for ticket (ID: %s), " \
                    "the Zendesk API returned a RecordNotFound error", ticket_dict["id"])

            if should_yield:
                for rec in self._empty_buffer():
                    yield rec
                emit_sub_stream_metrics(audits_stream)
                emit_sub_stream_metrics(metrics_stream)
                emit_sub_stream_metrics(comments_stream)
                singer.write_state(state)

        for rec in self._empty_buffer():
            yield rec
        emit_sub_stream_metrics(audits_stream)
        emit_sub_stream_metrics(metrics_stream)
        emit_sub_stream_metrics(comments_stream)
        singer.write_state(state)
Example 25
def sync_transactions():
    schema = load_schema("transactions")

    singer.write_schema("transactions",
                        schema, ["id"],
                        bookmark_properties=['created_at'])

    latest_updated_at = utils.strptime_to_utc(
        STATE.get('latest_updated_at', DEFAULT_TIMESTAMP))

    run_maximum_updated_at = latest_updated_at

    latest_disbursement_date = utils.strptime_to_utc(
        STATE.get('latest_disbursement_date', DEFAULT_TIMESTAMP))

    run_maximum_disbursement_date = latest_disbursement_date

    latest_start_date = utils.strptime_to_utc(get_start("transactions"))

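    # Search a window starting TRAILING_DAYS before the bookmark so earlier-created transactions are re-checked for updated_at and disbursement changes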
    period_start = latest_start_date - TRAILING_DAYS

    period_end = utils.now()

    logger.info("transactions: Syncing from {}".format(period_start))

    logger.info(
        "transactions: latest_updated_at from {}, disbursement_date from {}".
        format(latest_updated_at, latest_disbursement_date))

    logger.info(
        "transactions: latest_start_date from {}".format(latest_start_date))

    # increment through each day (20k results max from api)
    for start, end in daterange(period_start, period_end):

        end = min(end, period_end)

        data = braintree.Transaction.search(
            braintree.TransactionSearch.created_at.between(start, end))
        time_extracted = utils.now()

        logger.info("transactions: Fetched {} records from {} - {}".format(
            data.maximum_size, start, end))

        row_written_count = 0
        row_skipped_count = 0

        for row in data:
            # Ensure updated_at consistency
            if not getattr(row, 'updated_at'):
                row.updated_at = row.created_at

            transformed = transform_row(row, schema)
            updated_at = to_utc(row.updated_at)

            # if disbursement is successful, get disbursement date
            # set disbursement datetime to min if not found

            if row.disbursement_details is None:
                disbursement_date = datetime.min

            else:
                if row.disbursement_details.disbursement_date is None:
                    row.disbursement_details.disbursement_date = datetime.min

                disbursement_date = to_utc(
                    datetime.combine(
                        row.disbursement_details.disbursement_date,
                        datetime.min.time()))

            # Is this more recent than our past stored value of update_at?
            # Is this more recent than our past stored value of disbursement_date?
            # Use >= for updated_at due to non monotonic updated_at values
            # Use > for disbursement_date - confirming all transactions disbursed
            # at the same time
            # Update our high water mark for updated_at and disbursement_date
            # in this run
            if (updated_at >= latest_updated_at) or (disbursement_date >=
                                                     latest_disbursement_date):

                if updated_at > run_maximum_updated_at:
                    run_maximum_updated_at = updated_at

                if disbursement_date > run_maximum_disbursement_date:
                    run_maximum_disbursement_date = disbursement_date

                singer.write_record("transactions",
                                    transformed,
                                    time_extracted=time_extracted)
                row_written_count += 1

            else:

                row_skipped_count += 1

        logger.info("transactions: Written {} records from {} - {}".format(
            row_written_count, start, end))

        logger.info("transactions: Skipped {} records from {} - {}".format(
            row_skipped_count, start, end))

    # End day loop
    logger.info("transactions: Complete. Last updated record: {}".format(
        run_maximum_updated_at))

    logger.info("transactions: Complete. Last disbursement date: {}".format(
        run_maximum_disbursement_date))

    latest_updated_at = run_maximum_updated_at

    latest_disbursement_date = run_maximum_disbursement_date

    STATE['latest_updated_at'] = utils.strftime(latest_updated_at)

    STATE['latest_disbursement_date'] = utils.strftime(
        latest_disbursement_date)

    utils.update_state(STATE, "transactions", utils.strftime(end))

    singer.write_state(STATE)
Example 26
def do_sync(client, config, catalog, state):
    """
    Translate metadata into a set of metrics and dimensions and call out
    to sync to generate the required reports.
    """
    selected_streams = catalog.get_selected_streams(state)
    for stream in selected_streams:
        # Transform state for this report to new format before proceeding
        state = clean_state_for_report(config, state, stream.tap_stream_id)

        state = singer.set_currently_syncing(state, stream.tap_stream_id)
        singer.write_state(state)

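        # Build the metric and dimension lists from fields marked automatic, selected, or selected-by-default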
        metrics = []
        dimensions = []
        mdata = metadata.to_map(stream.metadata)
        for field_path, field_mdata in mdata.items():
            if field_path == tuple():
                continue
            if field_mdata.get('inclusion') == 'unsupported':
                continue
            _, field_name = field_path
            if field_mdata.get('inclusion') == 'automatic' or \
               field_mdata.get('selected') or \
               (field_mdata.get('selected-by-default') and field_mdata.get('selected') is None):
                if field_mdata.get('behavior') == 'METRIC':
                    metrics.append(field_name)
                elif field_mdata.get('behavior') == 'DIMENSION':
                    dimensions.append(field_name)

        view_ids = get_view_ids(config)

        # NB: Resume from previous view for this report, dropping all
        # views before it to keep streams moving forward
        current_view = state.get('currently_syncing_view')
        if current_view:
            if current_view in view_ids:
                view_not_current = functools.partial(lambda cv, v: v != cv,
                                                     current_view)
                view_ids = list(itertools.dropwhile(view_not_current,
                                                    view_ids))
            else:
                state.pop('currently_syncing_view', None)

        reports_per_view = [{
            "profile_id": view_id,
            "name": stream.stream,
            "id": stream.tap_stream_id,
            "metrics": metrics,
            "dimensions": dimensions
        } for view_id in view_ids]

        end_date = get_end_date(config)

        schema = stream.schema.to_dict()

        singer.write_schema(stream.stream, schema, stream.key_properties)

        for report in reports_per_view:
            state['currently_syncing_view'] = report['profile_id']
            singer.write_state(state)

            is_historical_sync, start_date = get_start_date(
                config, report['profile_id'], state, report['id'])

            sync_report(client, schema, report, start_date, end_date, state,
                        is_historical_sync)
        state.pop('currently_syncing_view', None)
        singer.write_state(state)
    state = singer.set_currently_syncing(state, None)
    singer.write_state(state)
Example 27
def write_bookmark(state, stream_name, value):
    if 'bookmarks' not in state:
        state['bookmarks'] = {}
    state['bookmarks'][stream_name] = value
    singer.write_state(state)
Example 28
def sync_records(sf, catalog_entry, state, counter):
    chunked_bookmark = singer_utils.strptime_with_tz(
        sf.get_start_date(state, catalog_entry))
    stream = catalog_entry["stream"]
    schema = catalog_entry["schema"]
    stream_alias = catalog_entry.get("stream_alias")
    catalog_metadata = metadata.to_map(catalog_entry["metadata"])
    replication_key = catalog_metadata.get((),
                                           {}).get("valid-replication-keys")[0]
    stream_version = get_stream_version(catalog_entry, state)
    activate_version_message = singer.ActivateVersionMessage(
        stream=(stream_alias or stream), version=stream_version)

    start_time = singer_utils.now()

    LOGGER.info("Syncing Salesforce data for stream %s", stream)

    for rec in sf.query(catalog_entry, state):
        replication_key_value = replication_key and singer_utils.strptime_with_tz(
            rec[replication_key])
        if replication_key_value and replication_key_value <= chunked_bookmark:
            continue
        counter.increment()
        with Transformer(pre_hook=transform_bulk_data_hook) as transformer:
            rec = transformer.transform(rec, schema)
        rec = fix_record_anytype(rec, schema)
        singer.write_message(
            singer.RecordMessage(
                stream=(stream_alias or stream),
                record=rec,
                version=stream_version,
                time_extracted=start_time,
            ))

        if sf.pk_chunking:
            if (replication_key_value and replication_key_value <= start_time
                    and replication_key_value > chunked_bookmark):
                # Replace the highest seen bookmark and save the state in case we need to resume later
                chunked_bookmark = singer_utils.strptime_with_tz(
                    rec[replication_key])
                state = singer.write_bookmark(
                    state,
                    catalog_entry["tap_stream_id"],
                    "JobHighestBookmarkSeen",
                    singer_utils.strftime(chunked_bookmark),
                )
                singer.write_state(state)
        # Before writing a bookmark, make sure Salesforce has not given us a
        # record with one outside our range
        elif replication_key_value and replication_key_value <= start_time:
            state = singer.write_bookmark(
                state,
                catalog_entry["tap_stream_id"],
                replication_key,
                rec[replication_key],
            )
            singer.write_state(state)

        # Tables with no replication_key will send an
        # activate_version message for the next sync
    if not replication_key:
        singer.write_message(activate_version_message)
        state = singer.write_bookmark(state, catalog_entry["tap_stream_id"],
                                      "version", None)

    # If pk_chunking is set, only write a bookmark at the end
    if sf.pk_chunking:
        # Write a bookmark with the highest value we've seen
        state = singer.write_bookmark(
            state,
            catalog_entry["tap_stream_id"],
            replication_key,
            singer_utils.strftime(chunked_bookmark),
        )
Example 29
def sync_in_app_events():

    schema = load_schema("raw_data/in_app_events")
    singer.write_schema("in_app_events", schema, [
        "event_time",
        "event_name",
        "appsflyer_id"
    ])

    # This order matters
    fieldnames = (
        "attributed_touch_type",
        "attributed_touch_time",
        "install_time",
        "event_time",
        "event_name",
        "event_value",
        "event_revenue",
        "event_revenue_currency",
        "event_revenue_usd",
        "event_source",
        "is_receipt_validated",
        "af_prt",
        "media_source",
        "af_channel",
        "af_keywords",
        "campaign",
        "af_c_id",
        "af_adset",
        "af_adset_id",
        "af_ad",
        "af_ad_id",
        "af_ad_type",
        "af_siteid",
        "af_sub_siteid",
        "af_sub1",
        "af_sub2",
        "af_sub3",
        "af_sub4",
        "af_sub5",
        "af_cost_model",
        "af_cost_value",
        "af_cost_currency",
        "contributor1_af_prt",
        "contributor1_media_source",
        "contributor1_campaign",
        "contributor1_touch_type",
        "contributor1_touch_time",
        "contributor2_af_prt",
        "contributor2_media_source",
        "contributor2_campaign",
        "contributor2_touch_type",
        "contributor2_touch_time",
        "contributor3_af_prt",
        "contributor3_media_source",
        "contributor3_campaign",
        "contributor3_touch_type",
        "contributor3_touch_time",
        "region",
        "country_code",
        "state",
        "city",
        "postal_code",
        "dma",
        "ip",
        "wifi",
        "operator",
        "carrier",
        "language",
        "appsflyer_id",
        "advertising_id",
        "idfa",
        "android_id",
        "customer_user_id",
        "imei",
        "idfv",
        "platform",
        "device_type",
        "os_version",
        "app_version",
        "sdk_version",
        "app_id",
        "app_name",
        "bundle_id",
        "is_retargeting",
        "retargeting_conversion_type",
        "af_attribution_lookback",
        "af_reengagement_window",
        "is_primary_attribution",
        "user_agent",
        "http_referrer",
        "original_url",
    )

    stop_time = datetime.datetime.now()
    from_datetime = get_start("in_app_events")
    to_datetime = get_stop(from_datetime, stop_time, 10)

    while to_datetime <= stop_time:
        LOGGER.info("Syncing data from %s to %s", from_datetime, to_datetime)
        params = dict()
        params["from"] = from_datetime.strftime("%Y-%m-%d %H:%M")
        params["to"] = to_datetime.strftime("%Y-%m-%d %H:%M")
        params["api_token"] = CONFIG["api_token"]

        url = get_url("in_app_events", app_id=CONFIG["app_id"])
        request_data = request(url, params)

        csv_data = RequestToCsvAdapter(request_data)
        reader = csv.DictReader(csv_data, fieldnames)

        next(reader)  # Skip the header row

        bookmark = from_datetime
        for row in reader:
            record = xform(row, schema)
            singer.write_record("in_app_events", record)
            # AppsFlyer returns records in order of most recent first.
            if utils.strptime(record["event_time"]) > bookmark:
                bookmark = utils.strptime(record["event_time"])

        # Write out state
        utils.update_state(STATE, "in_app_events", bookmark)
        singer.write_state(STATE)

        # Move the timings forward
        from_datetime = to_datetime
        to_datetime = get_stop(from_datetime, stop_time, 10)
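
RequestToCsvAdapter is used above but not defined in this example. A plausible sketch, assuming request() returns a streaming requests.Response and the adapter simply exposes its decoded lines to csv.DictReader (the class name comes from the call above; the body is an assumption):

class RequestToCsvAdapter:
    """Assumed adapter: iterate a requests.Response as decoded text lines
    so it can be handed straight to csv.DictReader."""
    def __init__(self, response):
        self._lines = response.iter_lines()

    def __iter__(self):
        return self

    def __next__(self):
        return next(self._lines).decode("utf-8")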
Example #30
def resume_syncing_bulk_query(sf, catalog_entry, job_id, state, counter):
    bulk = Bulk(sf)
    current_bookmark = singer.get_bookmark(
        state, catalog_entry["tap_stream_id"],
        "JobHighestBookmarkSeen") or sf.get_start_date(state, catalog_entry)
    current_bookmark = singer_utils.strptime_with_tz(current_bookmark)
    batch_ids = singer.get_bookmark(state, catalog_entry["tap_stream_id"],
                                    "BatchIDs")

    start_time = singer_utils.now()
    stream = catalog_entry["stream"]
    stream_alias = catalog_entry.get("stream_alias")
    catalog_metadata = metadata.to_map(catalog_entry.get("metadata"))
    replication_key = (catalog_metadata.get((), {})
                       .get("valid-replication-keys") or [None])[0]
    stream_version = get_stream_version(catalog_entry, state)
    schema = catalog_entry["schema"]

    if not bulk.job_exists(job_id):
        LOGGER.info(
            "Found stored Job ID that no longer exists, resetting bookmark and removing JobID from state."
        )
        return counter

    # Iterate over the remaining batches, removing them once they are synced
    for batch_id in batch_ids[:]:
        with Transformer(pre_hook=transform_bulk_data_hook) as transformer:
            for rec in bulk.get_batch_results(job_id, batch_id, catalog_entry):
                counter.increment()
                rec = transformer.transform(rec, schema)
                rec = fix_record_anytype(rec, schema)
                singer.write_message(
                    singer.RecordMessage(
                        stream=(stream_alias or stream),
                        record=rec,
                        version=stream_version,
                        time_extracted=start_time,
                    ))

                # Update bookmark if necessary
                replication_key_value = (replication_key
                                         and singer_utils.strptime_with_tz(
                                             rec[replication_key]))
                if (replication_key_value
                        and replication_key_value <= start_time
                        and replication_key_value > current_bookmark):
                    current_bookmark = singer_utils.strptime_with_tz(
                        rec[replication_key])

        state = singer.write_bookmark(
            state,
            catalog_entry["tap_stream_id"],
            "JobHighestBookmarkSeen",
            singer_utils.strftime(current_bookmark),
        )
        batch_ids.remove(batch_id)
        LOGGER.info("Finished syncing batch %s. Removing batch from state.",
                    batch_id)
        LOGGER.info("Batches to go: %d", len(batch_ids))
        singer.write_state(state)

    return counter
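
For orientation, the resumable state this function reads via singer.get_bookmark might look like the sketch below; the bookmark keys come from the code above, while the stream name, IDs, and timestamp are purely illustrative:

# Illustrative state shape only; real Job/Batch IDs come from the Salesforce Bulk API.
state = {
    "bookmarks": {
        "Account": {
            "JobID": "7501r00000ABCdeAAF",
            "BatchIDs": ["7511r00000XYZabAAF", "7511r00000XYZacAAF"],
            "JobHighestBookmarkSeen": "2021-03-01T00:00:00.000000Z",
        }
    }
}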
Example #31
def sync_entity_chunked(STATE, catalog, entity_name, key_properties, path):
    schema = load_schema(entity_name)
    bookmark_key = 'startTimestamp'

    singer.write_schema(entity_name, schema, key_properties, [bookmark_key], catalog.get('stream_alias'))

    start = get_start(STATE, entity_name, bookmark_key)
    LOGGER.info("sync_%s from %s", entity_name, start)

    now = datetime.datetime.utcnow().replace(tzinfo=pytz.UTC)
    now_ts = int(now.timestamp() * 1000)

    start_ts = int(utils.strptime_with_tz(start).timestamp() * 1000)
    url = get_url(entity_name)

    mdata = metadata.to_map(catalog.get('metadata'))

    if entity_name == 'email_events':
        window_size = int(CONFIG['email_chunk_size'])
    elif entity_name == 'subscription_changes':
        window_size = int(CONFIG['subscription_chunk_size'])

    with metrics.record_counter(entity_name) as counter:
        while start_ts < now_ts:
            end_ts = start_ts + window_size
            params = {
                'startTimestamp': start_ts,
                'endTimestamp': end_ts,
                'limit': 1000,
            }
            with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
                while True:
                    our_offset = singer.get_offset(STATE, entity_name)
                    if our_offset and our_offset.get('offset') is not None:
                        params[StateFields.offset] = our_offset.get('offset')

                    data = request(url, params).json()
                    time_extracted = utils.now()

                    for row in data[path]:
                        counter.increment()
                        record = bumble_bee.transform(lift_properties_and_versions(row), schema, mdata)
                        singer.write_record(entity_name,
                                            record,
                                            catalog.get('stream_alias'),
                                            time_extracted=time_extracted)
                    if data.get('hasMore'):
                        STATE = singer.set_offset(STATE, entity_name, 'offset', data['offset'])
                        singer.write_state(STATE)
                    else:
                        STATE = singer.clear_offset(STATE, entity_name)
                        singer.write_state(STATE)
                        break
            STATE = singer.write_bookmark(STATE, entity_name, 'startTimestamp', utils.strftime(
                datetime.datetime.fromtimestamp((start_ts / 1000),
                                                datetime.timezone.utc)))  # pylint: disable=line-too-long
            singer.write_state(STATE)
            start_ts = end_ts

    STATE = singer.clear_offset(STATE, entity_name)
    singer.write_state(STATE)
    return STATE
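
get_start is assumed here to return the stored bookmark for the entity, falling back to the configured start date when none exists. A minimal sketch under that assumption (not necessarily the tap's exact implementation):

def get_start(state, entity_name, bookmark_key):
    # Assumed helper: prefer the saved bookmark, otherwise fall back to the
    # configured start_date.
    current_bookmark = singer.get_bookmark(state, entity_name, bookmark_key)
    if current_bookmark is None:
        return CONFIG["start_date"]
    return current_bookmark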
Example #32
def sync(config, state, catalog):
    errors_encountered = False

    selected_stream_ids = get_selected_streams(catalog)

    client = GAClient(config)

    # Loop over streams in catalog
    for stream in catalog['streams']:
        stream_id = stream['tap_stream_id']
        stream_schema = stream['schema']
        if state and stream_id in state:
            client.start_date = state[stream_id]
        stream_metadata = metadata.to_map(stream['metadata'])
        key_properties = update_key_properties(stream_schema, stream_metadata)

        if stream_id in selected_stream_ids:
            LOGGER.info('Syncing stream: ' + stream_id)

            try:
                singer.write_schema(stream_id, stream_schema, key_properties)
                report_definition = ReportsHelper.get_report_definition(stream)
                for page, date in client.process_stream(report_definition):
                    singer.write_records(stream_id, page)
                    if date is not None:  # we need to update all dates that are not "golden", even if it's the start date
                        singer.write_state({stream_id: date})
            except TapGaInvalidArgumentError as e:
                errors_encountered = True
                LOGGER.error(
                    "Skipping stream: '{}' due to invalid report definition.".
                    format(stream_id))
                LOGGER.debug("Error: '{}'.".format(e))
            except TapGaRateLimitError as e:
                errors_encountered = True
                LOGGER.error(
                    "Skipping stream: '{}' due to Rate Limit Errors.".format(
                        stream_id))
                LOGGER.debug("Error: '{}'.".format(e))
            except TapGaQuotaExceededError as e:
                errors_encountered = True
                LOGGER.error(
                    "Skipping stream: '{}' due to Quota Exceeded Errors.".
                    format(stream_id))
                LOGGER.debug("Error: '{}'.".format(e))
            except TapGaAuthenticationError as e:
                LOGGER.error(
                    "Stopping execution while processing '{}' due to Authentication Errors."
                    .format(stream_id))
                LOGGER.debug("Error: '{}'.".format(e))
                sys.exit(1)
            except TapGaUnknownError as e:
                LOGGER.error(
                    "Stopping execution while processing '{}' due to Unknown Errors."
                    .format(stream_id))
                LOGGER.debug("Error: '{}'.".format(e))
                sys.exit(1)
        else:
            LOGGER.info('Skipping unselected stream: ' + stream_id)

    # If we encountered errors, exit with 1
    if errors_encountered:
        sys.exit(1)

    return
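
get_selected_streams is not shown in this example. A plausible sketch based on standard Singer metadata, assuming streams are marked with a top-level "selected" flag:

def get_selected_streams(catalog):
    # Assumed helper: collect the tap_stream_id of every stream whose
    # top-level metadata entry is marked as selected.
    selected_stream_ids = []
    for stream in catalog["streams"]:
        mdata = metadata.to_map(stream["metadata"])
        if mdata.get((), {}).get("selected"):
            selected_stream_ids.append(stream["tap_stream_id"])
    return selected_stream_ids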