Example #1
def sync_stream(config, state, table_spec, stream):
    table_name = table_spec['table_name']
    bookmark = singer.get_bookmark(state, table_name, 'modified_since')
    modified_since = singer_utils.strptime_with_tz(
        bookmark or '1990-01-01T00:00:00Z')

    LOGGER.info('Syncing table "%s".', table_name)
    LOGGER.info('Getting files modified since %s.', modified_since)

    s3_files = s3.get_input_files_for_table(
        config, table_spec, modified_since)

    records_streamed = 0

    # The original implementation sorted by 'modified_since' so that the modified_since bookmark
    # makes sense. We sort by 'key' instead because we import multiple part files generated by
    # Spark whose names are in incremental order.
    # This means we can't sync S3 buckets larger than we can sort in memory, which is
    # suboptimal. If we could bookmark on anything else, we could simply sync files as
    # we see them.
    for s3_file in sorted(s3_files, key=lambda item: item['key']):
        records_streamed += sync_table_file(
            config, s3_file['key'], table_spec, stream)

        state = singer.write_bookmark(
            state, table_name, 'modified_since', s3_file['last_modified'].isoformat())
        singer.write_state(state)

    if s3.skipped_files_count:
        LOGGER.warn("%s files got skipped during the last sync.",
                    s3.skipped_files_count)

    LOGGER.info('Wrote %s records for table "%s".',
                records_streamed, table_name)

    return records_streamed
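The examples in this collection all follow the same singer-python bookmark cycle seen above: read the previous bookmark from state, emit records, then persist the highest value seen with singer.write_bookmark and emit it with singer.write_state. Below is a minimal sketch of that cycle, assuming only that the singer-python package is installed; the stream name 'my_table' and the timestamp values are illustrative.

import singer

# Start from an empty state; a real tap receives this via --state.
state = {}

# get_bookmark returns the stored value, or the supplied default on a first run.
modified_since = singer.get_bookmark(state, 'my_table', 'modified_since',
                                     '1990-01-01T00:00:00Z')
print(modified_since)  # '1990-01-01T00:00:00Z' on the first run

# ... sync records here, tracking the highest 'last_modified' value seen ...
max_seen = '2024-01-01T00:00:00Z'  # illustrative value

# write_bookmark nests the value under state['bookmarks'][stream][key]
# and returns the updated dict.
state = singer.write_bookmark(state, 'my_table', 'modified_since', max_seen)
assert state == {'bookmarks': {'my_table': {'modified_since': max_seen}}}

# write_state emits a STATE message on stdout for the target to persist.
singer.write_state(state)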
Example #2
def do_sync_incremental(conn_config, stream, state, desired_columns, md_map):
    replication_key = md_map.get((), {}).get('replication-key')
    LOGGER.info(
        "Stream %s is using incremental replication with replication key %s",
        stream['tap_stream_id'], replication_key)

    stream_state = state.get('bookmarks', {}).get(stream['tap_stream_id'], {})
    illegal_bk_keys = set(stream_state.keys()).difference(
        set([
            'replication_key', 'replication_key_value', 'version',
            'last_replication_method'
        ]))
    if len(illegal_bk_keys) != 0:
        raise Exception(
            "invalid keys found in state: {}".format(illegal_bk_keys))

    state = singer.write_bookmark(state, stream['tap_stream_id'],
                                  'replication_key', replication_key)

    sync_common.send_schema_message(stream, [replication_key])
    state = incremental.sync_table(conn_config, stream, state, desired_columns,
                                   md_map)

    return state
Example #3
def sync_owners(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    mdata = metadata.to_map(catalog.get('metadata'))
    schema = load_schema("owners")
    bookmark_key = 'updatedAt'

    singer.write_schema("owners", schema, ["ownerId"], [bookmark_key],
                        catalog.get('stream_alias'))
    start = get_start(STATE, "owners", bookmark_key)
    max_bk_value = start

    LOGGER.info("sync_owners from %s", start)

    params = {}
    if CONFIG.get('include_inactives'):
        params['includeInactives'] = "true"
    data = request(get_url("owners"), params).json()

    time_extracted = utils.now()

    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
        for row in data:
            record = bumble_bee.transform(lift_properties_and_versions(row),
                                          schema, mdata)
            if record[bookmark_key] >= max_bk_value:
                max_bk_value = record[bookmark_key]

            if record[bookmark_key] >= start:
                singer.write_record("owners",
                                    record,
                                    catalog.get('stream_alias'),
                                    time_extracted=time_extracted)

    STATE = singer.write_bookmark(STATE, 'owners', bookmark_key, max_bk_value)
    singer.write_state(STATE)
    return STATE
Example #4
    def sync(self):
        start_position = 1
        max_results = int(self.config.get('max_results', '200'))

        bookmark = singer.get_bookmark(self.state, self.stream_name, 'LastUpdatedTime', self.config.get('start_date'))

        while True:
            query = query_builder.build_query(self.table_name, bookmark, start_position, max_results, additional_where=self.additional_where)

        resp = self.client.get(self.endpoint, params={"query": query}).get('QueryResponse', {})

            results = resp.get(self.table_name, [])
            for rec in results:
                yield rec

            if results:
                self.state = singer.write_bookmark(self.state, self.stream_name, 'LastUpdatedTime', rec.get('MetaData').get('LastUpdatedTime'))
                singer.write_state(self.state)

            if len(results) < max_results:
                break
            start_position += max_results

        singer.write_state(self.state)
Example #5
def clear_state_on_replication_change(state: Dict, tap_stream_id: str,
                                      replication_key: str,
                                      replication_method: str) -> Dict:
    """
    Update state if replication method change is detected
    Returns: new state dictionary
    """
    # user changed replication, nuke state
    last_replication_method = singer.get_bookmark(state, tap_stream_id,
                                                  'last_replication_method')
    if last_replication_method is not None and (replication_method !=
                                                last_replication_method):
        state = singer.reset_stream(state, tap_stream_id)

    # key changed
    if replication_method == 'INCREMENTAL' and \
            replication_key != singer.get_bookmark(state, tap_stream_id, 'replication_key'):
        state = singer.reset_stream(state, tap_stream_id)

    state = singer.write_bookmark(state, tap_stream_id,
                                  'last_replication_method',
                                  replication_method)

    return state
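A minimal usage sketch of the helper above, assuming singer-python is installed and clear_state_on_replication_change is defined as shown; the stream id 'my_stream' and the bookmark values are illustrative. Switching the replication method from FULL_TABLE to INCREMENTAL resets the stream's bookmarks before the new method is recorded:

state = {'bookmarks': {'my_stream': {'version': 1700000000000,
                                     'last_replication_method': 'FULL_TABLE'}}}

# The replication method changed, so singer.reset_stream() empties the
# stream's bookmark dict before 'last_replication_method' is rewritten.
state = clear_state_on_replication_change(state, 'my_stream',
                                          replication_key='updated_at',
                                          replication_method='INCREMENTAL')

assert state['bookmarks']['my_stream'] == {
    'last_replication_method': 'INCREMENTAL'}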
Example #6
def sync_forms(state: State):
    schema = load_schema("forms")
    bookmark_key = 'updatedAt'

    singer.write_schema("hubspot_forms", schema, ["guid"], [bookmark_key])
    start = get_start(state, "forms", bookmark_key)
    max_bk_value = start

    logger.info("sync_forms from %s", start)

    data = request(get_url("forms")).json()

    for row in data:
        record = build_record(row, schema)
        if record[bookmark_key] >= start:
            write_record('hubspot_forms', record)
        if record[bookmark_key] >= max_bk_value:
            max_bk_value = record[bookmark_key]

    state = singer.write_bookmark(state, 'hubspot_forms', bookmark_key,
                                  max_bk_value)
    singer.write_state(state)

    return state
Example #7
    def sync(self, state, stream_schema, stream_metadata, transformer):

        # Bookmark is in timezone UTC
        start_time = self._get_start_time(state, RAW_BOOKMARK_DATE_FORMAT)
        end_time = self._get_end_time(RAW_BOOKMARK_DATE_FORMAT)

        for record in self.client.get_raw_data(
                self.report_name, self.report_version, start_time, end_time,
                RAW_INSTALL_N_IN_APP_FIELDNAMES):

            transformed_record = transformer.transform(xform(record),
                                                       stream_schema,
                                                       stream_metadata)
            singer.write_record(self.tap_stream_id,
                                transformed_record,
                                time_extracted=end_time)

        # Convert to bookmark format
        end_time_str = datetime.strftime(end_time, RAW_BOOKMARK_DATE_FORMAT)
        state = singer.write_bookmark(state, self.tap_stream_id,
                                      self.replication_key, end_time_str)
        singer.write_state(state)

        return state
Example #8
def build_state(raw_state, catalog):
    state = {}

    for catalog_entry in catalog['streams']:
        tap_stream_id = catalog_entry['tap_stream_id']
        catalog_metadata = metadata.to_map(catalog_entry['metadata'])
        replication_method = catalog_metadata.get((),
                                                  {}).get('replication-method')

        version = singer.get_bookmark(raw_state, tap_stream_id, 'version')

        # Preserve state that deals with resuming an incomplete bulk job
        if singer.get_bookmark(raw_state, tap_stream_id, 'JobID'):
            job_id = singer.get_bookmark(raw_state, tap_stream_id, 'JobID')
            batches = singer.get_bookmark(raw_state, tap_stream_id, 'BatchIDs')
            current_bookmark = singer.get_bookmark(raw_state, tap_stream_id,
                                                   'JobHighestBookmarkSeen')
            state = singer.write_bookmark(state, tap_stream_id, 'JobID',
                                          job_id)
            state = singer.write_bookmark(state, tap_stream_id, 'BatchIDs',
                                          batches)
            state = singer.write_bookmark(state, tap_stream_id,
                                          'JobHighestBookmarkSeen',
                                          current_bookmark)

        if replication_method == 'INCREMENTAL':
            replication_key = catalog_metadata.get((),
                                                   {}).get('replication-key')
            replication_key_value = singer.get_bookmark(
                raw_state, tap_stream_id, replication_key)
            if version is not None:
                state = singer.write_bookmark(state, tap_stream_id, 'version',
                                              version)
            if replication_key_value is not None:
                state = singer.write_bookmark(state, tap_stream_id,
                                              replication_key,
                                              replication_key_value)
        elif replication_method == 'FULL_TABLE' and version is None:
            state = singer.write_bookmark(state, tap_stream_id, 'version',
                                          version)

    return state
Example #9
def build_state(raw_state, catalog):
    state = {}
    replication_method = "INCREMENTAL"

    for catalog_entry in catalog["streams"]:
        tap_stream_id = catalog_entry["tap_stream_id"]
        catalog_metadata = metadata.to_map(catalog_entry["metadata"])

        version = singer.get_bookmark(raw_state, tap_stream_id, "version")

        # Preserve state that deals with resuming an incomplete bulk job
        if singer.get_bookmark(raw_state, tap_stream_id, "JobID"):
            job_id = singer.get_bookmark(raw_state, tap_stream_id, "JobID")
            batches = singer.get_bookmark(raw_state, tap_stream_id, "BatchIDs")
            current_bookmark = singer.get_bookmark(raw_state, tap_stream_id,
                                                   "JobHighestBookmarkSeen")
            state = singer.write_bookmark(state, tap_stream_id, "JobID",
                                          job_id)
            state = singer.write_bookmark(state, tap_stream_id, "BatchIDs",
                                          batches)
            state = singer.write_bookmark(state, tap_stream_id,
                                          "JobHighestBookmarkSeen",
                                          current_bookmark)

        if replication_method == "INCREMENTAL":

            replication_key = catalog_metadata.get(
                (), {}).get("valid-replication-keys")[0]
            replication_key_value = singer.get_bookmark(
                raw_state, tap_stream_id, replication_key)
            if version is not None:
                state = singer.write_bookmark(state, tap_stream_id, "version",
                                              version)
            if replication_key_value is not None:
                state = singer.write_bookmark(state, tap_stream_id,
                                              replication_key,
                                              replication_key_value)
        elif replication_method == "FULL_TABLE" and version is None:
            state = singer.write_bookmark(state, tap_stream_id, "version",
                                          version)

    return state
Example #10
def sync_engagements(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    mdata = metadata.to_map(catalog.get('metadata'))
    schema = load_schema("engagements")
    bookmark_key = 'lastUpdated'
    singer.write_schema("engagements", schema, ["engagement_id"],
                        [bookmark_key], catalog.get('stream_alias'))
    start = get_start(STATE, "engagements", bookmark_key)

    # Because this stream doesn't query by `lastUpdated`, it cycles
    # through the data set every time. The issue with this is that there
    # is a race condition: records may be updated between the start of
    # this table's sync and the end, causing some updates to be missed.
    # To combat this, we save a lookback window covering how long this
    # stream's last sync took, and look back by that amount on the next
    # sync.
    last_sync_duration = get_previous_time_window(STATE, "engagements")
    current_sync_start = utils.now()
    if has_bookmark(STATE, "engagements", bookmark_key) and \
       last_sync_duration is not None:
        LOGGER.info((
            "Last sync of engagements lasted {} seconds. Adjusting bookmark by this "
            "amount to account for race conditions with record updates."
        ).format(last_sync_duration))
        start = utils.strptime_to_utc(start) - datetime.timedelta(
            seconds=last_sync_duration)
        start = utils.strftime(start)
    max_bk_value = start
    LOGGER.info("sync_engagements from %s", start)

    STATE = singer.write_bookmark(STATE, 'engagements', bookmark_key, start)
    singer.write_state(STATE)

    url = get_url("engagements_all")
    params = {'limit': 250}
    top_level_key = "results"
    engagements = gen_request(STATE, 'engagements', url, params, top_level_key,
                              "hasMore", ["offset"], ["offset"])

    time_extracted = utils.now()

    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
        for engagement in engagements:
            record = bumble_bee.transform(engagement, schema, mdata)
            if record['engagement'][bookmark_key] >= start:
                # hoist PK and bookmark field to top-level record
                record['engagement_id'] = record['engagement']['id']
                record[bookmark_key] = record['engagement'][bookmark_key]
                singer.write_record("engagements",
                                    record,
                                    catalog.get('stream_alias'),
                                    time_extracted=time_extracted)
                if record['engagement'][bookmark_key] >= max_bk_value:
                    max_bk_value = record['engagement'][bookmark_key]

    STATE = singer.write_bookmark(STATE, 'engagements', bookmark_key,
                                  max_bk_value)
    # Write duration for next sync's lookback window
    STATE = write_stream_duration(STATE, 'engagements', current_sync_start,
                                  utils.now())
    singer.write_state(STATE)
    return STATE
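The lookback adjustment above amounts to subtracting the previous sync's duration from the saved bookmark before filtering records. A minimal sketch of that arithmetic, assuming singer-python's utils module; the bookmark value and the 900-second duration are illustrative:

import datetime
from singer import utils

bookmark = '2024-01-01T12:00:00.000000Z'  # value previously written to state
last_sync_duration = 900                  # seconds the previous sync took

# Widen the window by the previous sync's duration so records updated while
# that sync was still running are picked up again.
start = utils.strptime_to_utc(bookmark) - datetime.timedelta(
    seconds=last_sync_duration)
start = utils.strftime(start)             # back to the bookmark string format

print(start)  # 2024-01-01T11:45:00.000000Z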
Example #11
def sync_table(conn_info, stream, state, desired_columns, md_map):
    time_extracted = utils.now()

    #before writing the table version to state, check if we had one to begin with
    first_run = singer.get_bookmark(state, stream.tap_stream_id,
                                    'version') is None

    #pick a new table version IFF we do not have an xmin in our state
    #the presence of an xmin indicates that we were interrupted last time through
    if singer.get_bookmark(state, stream.tap_stream_id, 'xmin') is None:
        nascent_stream_version = int(time.time() * 1000)
    else:
        nascent_stream_version = singer.get_bookmark(state,
                                                     stream.tap_stream_id,
                                                     'version')

    state = singer.write_bookmark(state, stream.tap_stream_id, 'version',
                                  nascent_stream_version)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    schema_name = md_map.get(()).get('schema-name')

    escaped_columns = map(post_db.prepare_columns_sql, desired_columns)

    activate_version_message = singer.ActivateVersionMessage(
        stream=stream.stream, version=nascent_stream_version)

    if first_run:
        singer.write_message(activate_version_message)

    with metrics.record_counter(None) as counter:
        with post_db.open_connection(conn_info) as conn:
            with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
                xmin = singer.get_bookmark(state, stream.tap_stream_id, 'xmin')
                if xmin:
                    LOGGER.info(
                        "Resuming Full Table replication %s from xmin %s",
                        nascent_stream_version, xmin)
                    select_sql = """SELECT {}, xmin::text::bigint
                                      FROM {} where age(xmin::xid) < age('{}'::xid)
                                     ORDER BY xmin::text ASC""".format(
                        ','.join(escaped_columns),
                        post_db.fully_qualified_table_name(
                            schema_name, stream.table), xmin)
                else:
                    LOGGER.info("Beginning new Full Table replication %s",
                                nascent_stream_version)
                    select_sql = """SELECT {}, xmin::text::bigint
                                      FROM {}
                                     ORDER BY xmin::text ASC""".format(
                        ','.join(escaped_columns),
                        post_db.fully_qualified_table_name(
                            schema_name, stream.table))

                LOGGER.info("select %s", select_sql)
                cur.execute(select_sql)

                rows_saved = 0
                rec = cur.fetchone()
                while rec is not None:
                    xmin = rec['xmin']
                    rec = rec[:-1]
                    record_message = post_db.selected_row_to_singer_message(
                        stream, rec, nascent_stream_version, desired_columns,
                        time_extracted, md_map)
                    singer.write_message(record_message)
                    state = singer.write_bookmark(state, stream.tap_stream_id,
                                                  'xmin', xmin)
                    rows_saved = rows_saved + 1
                    if rows_saved % UPDATE_BOOKMARK_PERIOD == 0:
                        singer.write_message(
                            singer.StateMessage(value=copy.deepcopy(state)))

                    counter.increment()
                    rec = cur.fetchone()

    #once we have completed the full table replication, discard the xmin bookmark.
    #the xmin bookmark only comes into play when a full table replication is interrupted
    state = singer.write_bookmark(state, stream.tap_stream_id, 'xmin', None)

    #always send the activate version whether first run or subsequent
    singer.write_message(activate_version_message)

    return state
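The interrupted-sync handling above hinges on whether an 'xmin' bookmark is present in state. A condensed sketch of that decision, assuming singer-python is installed; the stream id 'public-users' and the xmin value are illustrative:

import time
import singer

# An 'xmin' bookmark means the previous full-table run was interrupted:
# reuse its table version and resume the scan past that xmin.
state = {'bookmarks': {'public-users': {'version': 1700000000000,
                                        'xmin': 123456}}}

resuming = singer.get_bookmark(state, 'public-users', 'xmin') is not None
version = (singer.get_bookmark(state, 'public-users', 'version')
           if resuming else int(time.time() * 1000))
print('resuming:', resuming, 'version:', version)

# ... replicate rows, periodically bookmarking the last xmin seen ...

# Once the table is fully replicated, discard the xmin bookmark so the
# next run starts a fresh version.
state = singer.write_bookmark(state, 'public-users', 'xmin', None)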
Example #12
def sync_table(conn_config, stream, state, desired_columns):
    connection = orc_db.open_connection(conn_config)
    connection.outputtypehandler = common.OutputTypeHandler

    cur = connection.cursor()
    cur.execute("ALTER SESSION SET TIME_ZONE = '00:00'")
    cur.execute(
        """ALTER SESSION SET NLS_DATE_FORMAT = 'YYYY-MM-DD"T"HH24:MI:SS."00+00:00"'"""
    )
    cur.execute(
        """ALTER SESSION SET NLS_TIMESTAMP_FORMAT='YYYY-MM-DD"T"HH24:MI:SSXFF"+00:00"'"""
    )
    cur.execute(
        """ALTER SESSION SET NLS_TIMESTAMP_TZ_FORMAT  = 'YYYY-MM-DD"T"HH24:MI:SS.FFTZH:TZM'"""
    )
    time_extracted = utils.now()

    stream_version = singer.get_bookmark(state, stream.tap_stream_id,
                                         'version')
    # If there was no bookmark for stream_version, it is the first time
    # this table is being sync'd, so get a new version, write to
    # state
    if stream_version is None:
        stream_version = int(time.time() * 1000)
        state = singer.write_bookmark(state, stream.tap_stream_id, 'version',
                                      stream_version)
        singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    activate_version_message = singer.ActivateVersionMessage(
        stream=stream.stream, version=stream_version)
    singer.write_message(activate_version_message)

    md = metadata.to_map(stream.metadata)
    schema_name = md.get(()).get('schema-name')

    escaped_columns = map(lambda c: common.prepare_columns_sql(stream, c),
                          desired_columns)
    escaped_schema = schema_name
    escaped_table = stream.table

    replication_key = md.get((), {}).get('replication-key')
    #escaped_replication_key = common.prepare_columns_sql(stream, replication_key)
    replication_key_value = singer.get_bookmark(state, stream.tap_stream_id,
                                                'replication_key_value')
    replication_key_sql_datatype = md.get(
        ('properties', replication_key)).get('sql-datatype')

    with metrics.record_counter(None) as counter:
        if replication_key_value:
            LOGGER.info("Resuming Incremental replication from %s = %s",
                        replication_key, replication_key_value)
            casted_where_clause_arg = common.prepare_where_clause_arg(
                replication_key_value, replication_key_sql_datatype)

            select_sql = """SELECT {}
                                FROM {}.{}
                               WHERE {} >= {}
                               ORDER BY {} ASC
                                """.format(','.join(escaped_columns),
                                           escaped_schema, escaped_table,
                                           replication_key,
                                           casted_where_clause_arg,
                                           replication_key)
        else:
            select_sql = """SELECT {}
                                FROM {}.{}
                               ORDER BY {} ASC
                               """.format(','.join(escaped_columns),
                                          escaped_schema, escaped_table,
                                          replication_key)

        rows_saved = 0
        LOGGER.info("select %s", select_sql)
        for row in cur.execute(select_sql):
            record_message = common.row_to_singer_message(
                stream, row, stream_version, desired_columns, time_extracted)

            singer.write_message(record_message)
            rows_saved = rows_saved + 1

            # Picking a replication_key with NULL values will result in it ALWAYS being synced, which is not great;
            # even worse would be allowing the NULL value to enter the state.
            if record_message.record[replication_key] is not None:
                state = singer.write_bookmark(
                    state, stream.tap_stream_id, 'replication_key_value',
                    record_message.record[replication_key])

            if rows_saved % UPDATE_BOOKMARK_PERIOD == 0:
                singer.write_message(
                    singer.StateMessage(value=copy.deepcopy(state)))

            counter.increment()

    cur.close()
    connection.close()
    return state
Example #13
def sync_rows(config,
              state,
              tap_stream_id,
              key_properties=[],
              auth_method=None,
              max_page=None,
              assume_sorted=True,
              filter_by_schema=True,
              raw_output=False):
    """
    - max_page: Force sync to end after max_page. Mostly used for debugging.
    - assume_sorted: Trust the data to be presorted by the
                     index/timestamp/datetime keys
                     so it is safe to finish the replication once the last
                     update index/timestamp/datetime passes the end.
    """
    schema = load_schema(config["schema_dir"], tap_stream_id)
    params = get_init_endpoint_params(config, state, tap_stream_id)
    bookmark_type = get_bookmark_type(config)
    start = get_start(config, state, tap_stream_id, "last_update")
    end = get_end(config)

    headers = get_http_headers(config)

    if start is None:
        LOGGER.warning("None of timestamp_key, datetime_key, and index_key" +
                       " are set in conifg. Bookmarking is not available.")

    start_str = human_readable(bookmark_type, start)
    end_str = human_readable(bookmark_type, end)
    # Log the conditions
    LOGGER.info("Stream %s has %s set starting %s and ending %s." %
                (tap_stream_id, bookmark_type, start_str, end_str))
    # I trust the URL format you set contains those params. The behavior
    # depends on the data source API's spec.
    # I will not filter out the records outside the boundary. Every record
    # received will be written out.

    LOGGER.info("assume_sorted is set to %s" % assume_sorted)
    # I trust the data to be sorted by the index/timestamp/datetime keys.
    # So it is safe to finish the replication once the last
    # update index/timestamp/datetime passes the end.
    # When in doubt, set this to False. Always perform post-replication dedup.

    LOGGER.info("filter_by_schema is set to %s." % filter_by_schema)
    # When filter_by_schema is False, fields undefined in or not conforming
    # to the schema are still written out.

    LOGGER.info("auth_method is set to %s" % auth_method)

    # Initialize the counters
    last_update = start

    # Offset is the number of records (vs. page)
    offset_number = params.get("current_offset", 0)
    page_number = params.get("current_page", 0)

    # When we rely on index/datetime/timestamp to parse the next GET URL,
    # we will get the record we have already seen in the current process.
    # When we get last_record_extracted from state file, we can also
    # compare with the previous process to further avoiding duplicated
    # records in the target data store.
    prev_written_record = None
    last_record_extracted = singer.get_bookmark(state, tap_stream_id,
                                                "last_record_extracted")
    if last_record_extracted:
        prev_written_record = json.loads(last_record_extracted)

    # First write out the schema
    if raw_output is False:
        singer.write_schema(tap_stream_id, schema, key_properties)

    # Fetch and iterate over to write the records
    with metrics.record_counter(tap_stream_id) as counter:
        while True:
            params.update({"current_page": page_number})
            params.update({"current_page_one_base": page_number + 1})
            params.update({"current_offset": offset_number})
            params.update({"last_update": last_update})

            endpoint = get_endpoint(config["url"], tap_stream_id, params)
            LOGGER.info("GET %s", endpoint)

            rows = generate_request(tap_stream_id, endpoint, auth_method,
                                    headers, config.get("username"),
                                    config.get("password"))
            rows = get_record_list(rows, config.get("record_list_level"))

            LOGGER.info("Current page %d" % page_number)
            LOGGER.info("Current offset %d" % offset_number)

            for row in rows:
                record = get_record(row, config.get("record_level"))
                if filter_by_schema:
                    record = filter_record(record, schema)

                    if not validate(record, schema):
                        LOGGER.debug("Skipping the schema invalidated row %s" %
                                     record)
                        continue

                # It's important to compare the record before adding
                # EXTRACT_TIMESTAMP
                if record == prev_written_record:
                    LOGGER.debug("Skipping the duplicated row %s" % record)
                    continue

                if EXTRACT_TIMESTAMP in schema["properties"].keys():
                    extract_tstamp = datetime.datetime.utcnow()
                    extract_tstamp = extract_tstamp.replace(
                        tzinfo=datetime.timezone.utc)
                    record[EXTRACT_TIMESTAMP] = extract_tstamp.isoformat()

                next_last_update = get_last_update(config, record, last_update)

                if not end or next_last_update < end:
                    if raw_output:
                        sys.stdout.write(json.dumps(record) + "\n")
                    else:
                        singer.write_record(tap_stream_id, record)

                    counter.increment()  # Increment only when we write
                    last_update = next_last_update

                    # prev_written_record may be persisted for the next run.
                    # EXTRACT_TIMESTAMP will be different. So popping it out
                    # before storing.
                    record.pop(EXTRACT_TIMESTAMP)
                    prev_written_record = record

            # Exit conditions
            if len(rows) < config["items_per_page"]:
                LOGGER.info(
                    ("Response contains fewer items than items_per_page (%d). " +
                     "Finishing the extraction.") % config["items_per_page"])
                break
            if max_page and page_number + 1 >= max_page:
                LOGGER.info("Max page %d reached. Finishing the extraction.")
                break
            if assume_sorted and end and next_last_update >= end:
                LOGGER.info(("Record greater than %s and assume_sorted is" +
                             " set. Finishing the extraction.") % end)
                break

            page_number += 1
            offset_number += len(rows)

    state = singer.write_bookmark(state, tap_stream_id, "last_update",
                                  last_update)
    if prev_written_record:
        state = singer.write_bookmark(state, tap_stream_id,
                                      "last_record_extracted",
                                      json.dumps(prev_written_record))

    if raw_output is False:
        singer.write_state(state)

    return state
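The dedup logic above also persists the last record it wrote as a JSON string under 'last_record_extracted', so the next run can skip that record if the API serves it again at the window boundary. A minimal sketch of that round trip, assuming singer-python is installed; the stream name and record contents are illustrative:

import json
import singer

state = {}
record = {'id': 42, 'status': 'active'}  # illustrative last record written

# End of a run: store the record itself (not just its timestamp) in state.
state = singer.write_bookmark(state, 'my_stream', 'last_record_extracted',
                              json.dumps(record))

# Start of the next run: load it back and compare incoming rows against it.
prev_written_record = json.loads(
    singer.get_bookmark(state, 'my_stream', 'last_record_extracted'))

incoming = {'id': 42, 'status': 'active'}
if incoming == prev_written_record:
    print('duplicate row, skipping')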
Example #14
    def do_sync(self):
        logger.debug('Starting sync')

        # resuming when currently_syncing within state
        resume_from_stream = False
        if self.state and 'currently_syncing' in self.state:
            resume_from_stream = self.state['currently_syncing']

        for stream in self.streams:
            stream.tap = self

            if resume_from_stream:
                if stream.schema == resume_from_stream:
                    logger.info('Resuming from {}'.format(resume_from_stream))
                    resume_from_stream = False
                else:
                    logger.info(
                        'Skipping stream {} as resuming from {}'.format(
                            stream.schema, resume_from_stream))
                    continue

            # stream state, from state/bookmark or start_date
            stream.set_initial_state(self.state, self.config['start_date'])

            # currently syncing
            if stream.state_field:
                set_currently_syncing(self.state, stream.schema)
                self.state = singer.write_bookmark(self.state, stream.schema,
                                                   stream.state_field,
                                                   str(stream.initial_state))
                singer.write_state(self.state)

            # schema
            stream.write_schema()

            # paginate
            while stream.has_data():

                with singer.metrics.http_request_timer(stream.schema) as timer:
                    try:
                        response = self.execute_stream_request(stream)
                    except (ConnectionError, RequestException) as e:
                        raise e
                    timer.tags[singer.metrics.Tag.
                               http_status_code] = response.status_code

                self.validate_response(response)
                self.rate_throttling(response)
                stream.paginate(response)

                # records with metrics
                with singer.metrics.record_counter(stream.schema) as counter:
                    with singer.Transformer(singer.NO_INTEGER_DATETIME_PARSING
                                            ) as optimus_prime:
                        for row in self.iterate_response(response):
                            row = stream.process_row(row)
                            row = optimus_prime.transform(
                                row, stream.get_schema())
                            if stream.write_record(row):
                                counter.increment()
                            stream.update_state(row)

            # update state / bookmarking only when supported by stream
            if stream.state_field:
                self.state = singer.write_bookmark(self.state, stream.schema,
                                                   stream.state_field,
                                                   str(stream.earliest_state))
            singer.write_state(self.state)

        # clear currently_syncing
        del self.state['currently_syncing']
        singer.write_state(self.state)
Example #15
def sync_records(sf, catalog_entry, state, counter):
    chunked_bookmark = singer_utils.strptime_with_tz(
        sf.get_start_date(state, catalog_entry))
    stream = catalog_entry['stream']
    schema = catalog_entry['schema']
    stream_alias = catalog_entry.get('stream_alias')
    replication_key = catalog_entry.get('replication_key')
    stream_version = get_stream_version(catalog_entry, state)
    activate_version_message = singer.ActivateVersionMessage(
        stream=(stream_alias or stream), version=stream_version)

    start_time = singer_utils.now()

    LOGGER.info('Syncing Salesforce data for stream %s', stream)
    with Transformer(pre_hook=transform_bulk_data_hook) as transformer:
        for rec in sf.query(catalog_entry, state):
            counter.increment()
            rec = transformer.transform(rec, schema)
            rec = fix_record_anytype(rec, schema)
            singer.write_message(
                singer.RecordMessage(stream=(stream_alias or stream),
                                     record=rec,
                                     version=stream_version,
                                     time_extracted=start_time))

            replication_key_value = replication_key and singer_utils.strptime_with_tz(
                rec[replication_key])

            if sf.pk_chunking:
                if replication_key_value and replication_key_value <= start_time and replication_key_value > chunked_bookmark:
                    # Replace the highest seen bookmark and save the state in case we need to resume later
                    chunked_bookmark = singer_utils.strptime_with_tz(
                        rec[replication_key])
                    state = singer.write_bookmark(
                        state, catalog_entry['tap_stream_id'],
                        'JobHighestBookmarkSeen',
                        singer_utils.strftime(chunked_bookmark))
                    singer.write_state(state)
            # Before writing a bookmark, make sure Salesforce has not given us a
            # record with one outside our range
            elif replication_key_value and replication_key_value <= start_time:
                state = singer.write_bookmark(state,
                                              catalog_entry['tap_stream_id'],
                                              replication_key,
                                              rec[replication_key])
                singer.write_state(state)

        # Tables with no replication_key will send an
        # activate_version message for the next sync
        if not replication_key:
            singer.write_message(activate_version_message)
            state = singer.write_bookmark(state,
                                          catalog_entry['tap_stream_id'],
                                          'version', None)

        # If pk_chunking is set, only write a bookmark at the end
        if sf.pk_chunking:
            # Write a bookmark with the highest value we've seen
            state = singer.write_bookmark(
                state, catalog_entry['tap_stream_id'], replication_key,
                singer_utils.strftime(chunked_bookmark))
Example #16
def sync_deals(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    mdata = metadata.to_map(catalog.get('metadata'))
    bookmark_key = 'hs_lastmodifieddate'
    start = utils.strptime_with_tz(get_start(STATE, "deals", bookmark_key))
    max_bk_value = start
    LOGGER.info("sync_deals from %s", start)
    most_recent_modified_time = start
    params = {'limit': 100, 'includeAssociations': False, 'properties': []}

    schema = load_schema("deals")
    singer.write_schema("deals", schema, ["dealId"], [bookmark_key],
                        catalog.get('stream_alias'))

    # Check if we should include associations
    for key in mdata.keys():
        if 'associations' in key:
            assoc_mdata = mdata.get(key)
            if (assoc_mdata.get('selected')
                    and assoc_mdata.get('selected') == True):
                params['includeAssociations'] = True

    v3_fields = None
    has_selected_properties = mdata.get(('properties', 'properties'),
                                        {}).get('selected')
    if has_selected_properties or has_selected_custom_field(mdata):
        # On 2/12/20, hubspot added a lot of additional properties for
        # deals, and appending all of them to requests ended up leading to
        # 414 (url-too-long) errors. Hubspot recommended we use the
        # `includeAllProperties` and `allPropertiesFetchMode` params
        # instead.
        params['includeAllProperties'] = True
        params['allPropertiesFetchMode'] = 'latest_version'

        # Grab selected `hs_date_entered/exited` fields to call the v3 endpoint with
        v3_fields = [
            breadcrumb[1].replace('property_', '')
            for breadcrumb, mdata_map in mdata.items()
            if breadcrumb and (mdata_map.get('selected') == True
                               or has_selected_properties) and any(
                                   prefix in breadcrumb[1]
                                   for prefix in V3_PREFIXES)
        ]

    url = get_url('deals_all')
    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
        for row in gen_request(STATE,
                               'deals',
                               url,
                               params,
                               'deals',
                               "hasMore", ["offset"], ["offset"],
                               v3_fields=v3_fields):
            row_properties = row['properties']
            modified_time = None
            if bookmark_key in row_properties:
                # Hubspot returns timestamps in millis
                timestamp_millis = row_properties[bookmark_key][
                    'timestamp'] / 1000.0
                modified_time = datetime.datetime.fromtimestamp(
                    timestamp_millis, datetime.timezone.utc)
            elif 'createdate' in row_properties:
                # Hubspot returns timestamps in millis
                timestamp_millis = row_properties['createdate'][
                    'timestamp'] / 1000.0
                modified_time = datetime.datetime.fromtimestamp(
                    timestamp_millis, datetime.timezone.utc)
            if modified_time and modified_time >= max_bk_value:
                max_bk_value = modified_time

            if not modified_time or modified_time >= start:
                record = bumble_bee.transform(
                    lift_properties_and_versions(row), schema, mdata)
                singer.write_record("deals",
                                    record,
                                    catalog.get('stream_alias'),
                                    time_extracted=utils.now())

    STATE = singer.write_bookmark(STATE, 'deals', bookmark_key,
                                  utils.strftime(max_bk_value))
    singer.write_state(STATE)
    return STATE
Example #17
def sync_entity_chunked(STATE, catalog, entity_name, key_properties, path):
    schema = load_schema(entity_name)
    bookmark_key = 'startTimestamp'

    singer.write_schema(entity_name, schema, key_properties, [bookmark_key],
                        catalog.get('stream_alias'))

    start = get_start(STATE, entity_name, bookmark_key)
    LOGGER.info("sync_%s from %s", entity_name, start)

    now = datetime.datetime.utcnow().replace(tzinfo=pytz.UTC)
    now_ts = int(now.timestamp() * 1000)

    start_ts = int(utils.strptime_with_tz(start).timestamp() * 1000)
    url = get_url(entity_name)

    mdata = metadata.to_map(catalog.get('metadata'))

    if entity_name == 'email_events':
        window_size = int(CONFIG['email_chunk_size'])
    elif entity_name == 'subscription_changes':
        window_size = int(CONFIG['subscription_chunk_size'])

    with metrics.record_counter(entity_name) as counter:
        while start_ts < now_ts:
            end_ts = start_ts + window_size
            params = {
                'startTimestamp': start_ts,
                'endTimestamp': end_ts,
                'limit': 1000,
            }
            with Transformer(
                    UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
                while True:
                    our_offset = singer.get_offset(STATE, entity_name)
                    if bool(our_offset) and our_offset.get('offset') is not None:
                        params[StateFields.offset] = our_offset.get('offset')

                    data = request(url, params).json()
                    time_extracted = utils.now()

                    if data.get(path) is None:
                        raise RuntimeError(
                            "Unexpected API response: {} not in {}".format(
                                path, data.keys()))

                    for row in data[path]:
                        counter.increment()
                        record = bumble_bee.transform(
                            lift_properties_and_versions(row), schema, mdata)
                        singer.write_record(entity_name,
                                            record,
                                            catalog.get('stream_alias'),
                                            time_extracted=time_extracted)
                    if data.get('hasMore'):
                        STATE = singer.set_offset(STATE, entity_name, 'offset',
                                                  data['offset'])
                        singer.write_state(STATE)
                    else:
                        STATE = singer.clear_offset(STATE, entity_name)
                        singer.write_state(STATE)
                        break
            STATE = singer.write_bookmark(STATE, entity_name, 'startTimestamp', utils.strftime(datetime.datetime.fromtimestamp((start_ts / 1000), datetime.timezone.utc)))  # pylint: disable=line-too-long
            singer.write_state(STATE)
            start_ts = end_ts

    STATE = singer.clear_offset(STATE, entity_name)
    singer.write_state(STATE)
    return STATE
Example #18
def get_all_pull_requests(schemas, repo_path, state, mdata):
    '''
    https://developer.github.com/v3/pulls/#list-pull-requests
    '''

    bookmark_value = get_bookmark(state, repo_path, "pull_requests", "since")
    if bookmark_value:
        bookmark_time = singer.utils.strptime_to_utc(bookmark_value)
    else:
        bookmark_time = 0

    with metrics.record_counter('pull_requests') as counter:
        with metrics.record_counter('reviews') as reviews_counter:
            for response in authed_get_all_pages(
                    'pull_requests',
                    'https://api.github.com/repos/{}/pulls?state=all&sort=updated&direction=desc'
                    .format(repo_path)):
                pull_requests = response.json()
                extraction_time = singer.utils.now()
                for pr in pull_requests:

                    # Skip records that haven't been updated since the last run.
                    # The GitHub API doesn't currently allow a ?since param for pulls;
                    # once we find the first piece of old data we can return, thanks to
                    # the sorting.
                    if bookmark_time and singer.utils.strptime_to_utc(
                            pr.get('updated_at')) < bookmark_time:
                        return state

                    pr_num = pr.get('number')
                    pr_id = pr.get('id')
                    pr['_sdc_repository'] = repo_path

                    # transform and write pull_request record
                    with singer.Transformer() as transformer:
                        rec = transformer.transform(
                            pr,
                            schemas['pull_requests'],
                            metadata=metadata.to_map(mdata))
                    singer.write_record('pull_requests',
                                        rec,
                                        time_extracted=extraction_time)
                    singer.write_bookmark(
                        state, repo_path, 'pull_requests',
                        {'since': singer.utils.strftime(extraction_time)})
                    counter.increment()

                    # sync reviews if that schema is present (only there if selected)
                    if schemas.get('reviews'):
                        for review_rec in get_reviews_for_pr(
                                pr_num, schemas['reviews'], repo_path, state,
                                mdata):
                            singer.write_record('reviews',
                                                review_rec,
                                                time_extracted=extraction_time)
                            singer.write_bookmark(
                                state, repo_path, 'reviews', {
                                    'since':
                                    singer.utils.strftime(extraction_time)
                                })

                            reviews_counter.increment()

                    # sync review comments if that schema is present (only there if selected)
                    if schemas.get('review_comments'):
                        for review_comment_rec in get_review_comments_for_pr(
                                pr_num, schemas['review_comments'], repo_path,
                                state, mdata):
                            singer.write_record('review_comments',
                                                review_comment_rec,
                                                time_extracted=extraction_time)
                            singer.write_bookmark(
                                state, repo_path, 'review_comments', {
                                    'since':
                                    singer.utils.strftime(extraction_time)
                                })

                    if schemas.get('pr_commits'):
                        for pr_commit in get_commits_for_pr(
                                pr_num, pr_id, schemas['pr_commits'],
                                repo_path, state, mdata):
                            singer.write_record('pr_commits',
                                                pr_commit,
                                                time_extracted=extraction_time)
                            singer.write_bookmark(
                                state, repo_path, 'pr_commits', {
                                    'since':
                                    singer.utils.strftime(extraction_time)
                                })

    return state
Example #19
def do_sync_historical_binlog(mysql_conn, catalog_entry, state, columns, use_gtid: bool, engine: str):
    binlog.verify_binlog_config(mysql_conn)

    if use_gtid and engine == MYSQL_ENGINE:
        binlog.verify_gtid_config(mysql_conn)

    is_view = common.get_is_view(catalog_entry)

    if is_view:
        raise Exception(f"Unable to replicate stream({catalog_entry.stream}) with binlog because it is a view.")

    log_file = singer.get_bookmark(state,
                                   catalog_entry.tap_stream_id,
                                   'log_file')

    log_pos = singer.get_bookmark(state,
                                  catalog_entry.tap_stream_id,
                                  'log_pos')

    gtid = None
    if use_gtid:
        gtid = singer.get_bookmark(state,
                                   catalog_entry.tap_stream_id,
                                   'gtid')

    max_pk_values = singer.get_bookmark(state,
                                        catalog_entry.tap_stream_id,
                                        'max_pk_values')

    write_schema_message(catalog_entry)

    stream_version = common.get_stream_version(catalog_entry.tap_stream_id, state)

    if max_pk_values and ((use_gtid and gtid) or (log_file and log_pos)):
        LOGGER.info("Resuming initial full table sync for LOG_BASED stream %s", catalog_entry.tap_stream_id)
        full_table.sync_table(mysql_conn, catalog_entry, state, columns, stream_version)
    else:
        LOGGER.info("Performing initial full table sync for LOG_BASED stream %s", catalog_entry.tap_stream_id)

        state = singer.write_bookmark(state,
                                      catalog_entry.tap_stream_id,
                                      'initial_binlog_complete',
                                      False)

        current_log_file, current_log_pos = binlog.fetch_current_log_file_and_pos(mysql_conn)

        current_gtid = None
        if use_gtid:
            current_gtid = binlog.fetch_current_gtid_pos(mysql_conn, engine)

        state = singer.write_bookmark(state,
                                      catalog_entry.tap_stream_id,
                                      'version',
                                      stream_version)

        if full_table.pks_are_auto_incrementing(mysql_conn, catalog_entry):
            # We must save log_file, log_pos, gtid across FULL_TABLE syncs when using
            # an incrementing PK
            state = singer.write_bookmark(state,
                                          catalog_entry.tap_stream_id,
                                          'log_file',
                                          current_log_file)

            state = singer.write_bookmark(state,
                                          catalog_entry.tap_stream_id,
                                          'log_pos',
                                          current_log_pos)

            if current_gtid:
                state = singer.write_bookmark(state,
                                              catalog_entry.tap_stream_id,
                                              'gtid',
                                              current_gtid)

            full_table.sync_table(mysql_conn, catalog_entry, state, columns, stream_version)

        else:
            full_table.sync_table(mysql_conn, catalog_entry, state, columns, stream_version)
            state = singer.write_bookmark(state,
                                          catalog_entry.tap_stream_id,
                                          'log_file',
                                          current_log_file)

            state = singer.write_bookmark(state,
                                          catalog_entry.tap_stream_id,
                                          'log_pos',
                                          current_log_pos)

            if current_gtid:
                state = singer.write_bookmark(state,
                                              catalog_entry.tap_stream_id,
                                              'gtid',
                                              current_gtid)
Example #20
def sync_table(conn_info, stream, state, desired_columns, md_map):
    time_extracted = utils.now()

    stream_version = singer.get_bookmark(state, stream['tap_stream_id'],
                                         'version')
    if stream_version is None:
        stream_version = int(time.time() * 1000)

    state = singer.write_bookmark(state, stream['tap_stream_id'], 'version',
                                  stream_version)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    schema_name = md_map.get(()).get('schema-name')

    escaped_columns = map(
        partial(post_db.prepare_columns_for_select_sql, md_map=md_map),
        desired_columns)

    activate_version_message = singer.ActivateVersionMessage(
        stream=post_db.calculate_destination_stream_name(stream, md_map),
        version=stream_version)

    singer.write_message(activate_version_message)

    replication_key = md_map.get((), {}).get('replication-key')
    replication_key_value = singer.get_bookmark(state, stream['tap_stream_id'],
                                                'replication_key_value')
    replication_key_sql_datatype = md_map.get(
        ('properties', replication_key)).get('sql-datatype')

    hstore_available = post_db.hstore_available(conn_info)
    with metrics.record_counter(None) as counter:
        with post_db.open_connection(conn_info) as conn:

            # Client side character encoding defaults to the value in postgresql.conf under client_encoding.
            # The server / db can also have its own configured encoding.
            with conn.cursor() as cur:
                cur.execute("show server_encoding")
                LOGGER.info("Current Server Encoding: %s", cur.fetchone()[0])
                cur.execute("show client_encoding")
                LOGGER.info("Current Client Encoding: %s", cur.fetchone()[0])

            if hstore_available:
                LOGGER.info("hstore is available")
                psycopg2.extras.register_hstore(conn)
            else:
                LOGGER.info("hstore is UNavailable")

            with conn.cursor(cursor_factory=psycopg2.extras.DictCursor,
                             name='pipelinewise') as cur:
                cur.itersize = post_db.CURSOR_ITER_SIZE
                LOGGER.info("Beginning new incremental replication sync %s",
                            stream_version)
                if replication_key_value:
                    select_sql = """SELECT {}
                                    FROM {}
                                    WHERE {} >= '{}'::{}
                                    ORDER BY {} ASC""".format(
                        ','.join(escaped_columns),
                        post_db.fully_qualified_table_name(
                            schema_name, stream['table_name']),
                        post_db.prepare_columns_sql(replication_key),
                        replication_key_value, replication_key_sql_datatype,
                        post_db.prepare_columns_sql(replication_key))
                else:
                    #if not replication_key_value
                    select_sql = """SELECT {}
                                    FROM {}
                                    ORDER BY {} ASC""".format(
                        ','.join(escaped_columns),
                        post_db.fully_qualified_table_name(
                            schema_name, stream['table_name']),
                        post_db.prepare_columns_sql(replication_key))

                LOGGER.info('select statement: %s with itersize %s',
                            select_sql, cur.itersize)
                cur.execute(select_sql)

                rows_saved = 0

                for rec in cur:
                    record_message = post_db.selected_row_to_singer_message(
                        stream, rec, stream_version, desired_columns,
                        time_extracted, md_map)

                    singer.write_message(record_message)
                    rows_saved = rows_saved + 1

                    # Picking a replication_key with NULL values will result in those rows ALWAYS being synced, which is not great.
                    # Even worse would be allowing the NULL value to enter the state.
                    if record_message.record[replication_key] is not None:
                        state = singer.write_bookmark(
                            state, stream['tap_stream_id'],
                            'replication_key_value',
                            record_message.record[replication_key])

                    if rows_saved % UPDATE_BOOKMARK_PERIOD == 0:
                        singer.write_message(
                            singer.StateMessage(value=copy.deepcopy(state)))

                    counter.increment()

    return state
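
A note on the bookmark written above: the periodic StateMessage is what makes this sync resumable, because on the next run the tap only has to read `replication_key_value` back out of state. A minimal sketch of that lookup (the stream id and value are illustrative, not from the tap above; singer.get_bookmark is from singer-python):

import singer

# singer.get_bookmark is just a nested lookup under state['bookmarks'].
example_state = {
    'bookmarks': {
        'public-orders': {'replication_key_value': '2021-01-01T00:00:00+00:00'}
    }
}

resumed = singer.get_bookmark(example_state, 'public-orders',
                              'replication_key_value')
assert resumed == '2021-01-01T00:00:00+00:00'
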
def sync(config, state, catalog):
    errors_encountered = False

    selected_stream_ids = get_selected_streams(catalog)

    client = GAClient(config)

    if not state.get('bookmarks'):
        state['bookmarks'] = {}

    # Loop over streams in catalog
    for stream in catalog['streams']:
        stream_id = stream['tap_stream_id']
        stream_schema = stream['schema']
        report_definition = ReportsHelper.get_report_definition(stream)

        stream_metadata = metadata.to_map(stream['metadata'])
        key_properties = metadata.get(stream_metadata, (),
                                      "table-key-properties")

        if stream_id in selected_stream_ids:
            LOGGER.info('Syncing stream: ' + stream_id)

            start_date = get_start_date(config, state, stream_id)
            end_date = get_end_date(config)
            date_range = []

            while start_date <= end_date:
                date_range.append(utils.strftime(start_date, '%Y-%m-%d'))
                start_date += timedelta(days=1)

            for date in date_range:
                try:
                    results = client.process_stream(date, report_definition)

                    # Write the schema message only after we are sure that
                    # records can be fetched without errors.
                    singer.write_schema(stream_id, stream_schema,
                                        key_properties)
                    singer.write_records(stream_id, results)
                    singer.write_bookmark(state, stream_id, 'last_report_date',
                                          date)
                    singer.write_state(state)
                except TapGaInvalidArgumentError as e:
                    errors_encountered = True
                    LOGGER.error(
                        "Skipping stream: '{}' due to invalid report definition."
                        .format(stream_id))
                    LOGGER.debug("Error: '{}'.".format(e))
                except TapGaRateLimitError as e:
                    errors_encountered = True
                    LOGGER.error(
                        "Skipping stream: '{}' due to Rate Limit Errors.".
                        format(stream_id))
                    LOGGER.debug("Error: '{}'.".format(e))
                except TapGaQuotaExceededError as e:
                    errors_encountered = True
                    LOGGER.error(
                        "Skipping stream: '{}' due to Quota Exceeded Errors.".
                        format(stream_id))
                    LOGGER.debug("Error: '{}'.".format(e))
                except TapGaAuthenticationError as e:
                    LOGGER.error(
                        "Stopping execution while processing '{}' due to Authentication Errors."
                        .format(stream_id))
                    LOGGER.debug("Error: '{}'.".format(e))
                    sys.exit(1)
                except TapGaUnknownError as e:
                    LOGGER.error(
                        "Stopping execution while processing '{}' due to Unknown Errors."
                        .format(stream_id))
                    LOGGER.debug("Error: '{}'.".format(e))
                    sys.exit(1)
        else:
            LOGGER.info('Skipping unselected stream: ' + stream_id)

    # If we encountered errors, exit with 1
    if errors_encountered:
        sys.exit(1)

    return
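
The day-by-day loop above resumes from the `last_report_date` bookmark it writes after each date. The helper `get_start_date` it calls is not shown here; a minimal sketch of what such a helper might do, assuming the bookmark holds a '%Y-%m-%d' date and the config provides a 'start_date' fallback (this is an assumption, not the tap's actual implementation):

from datetime import datetime, timedelta

import singer

def get_start_date(config, state, stream_id):
    # Hypothetical sketch: resume the day after the last fully synced report
    # date, falling back to the configured start_date on the first run.
    bookmark = singer.get_bookmark(state, stream_id, 'last_report_date')
    if bookmark:
        return datetime.strptime(bookmark, '%Y-%m-%d') + timedelta(days=1)
    return datetime.strptime(config['start_date'], '%Y-%m-%d')
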
    def setUp(self):
        self.maxDiff = None
        self.state = {}
        self.conn = test_utils.get_test_connection()

        log_file, log_pos = binlog.fetch_current_log_file_and_pos(self.conn)

        with connect_with_backoff(self.conn) as open_conn:
            with open_conn.cursor() as cursor:
                cursor.execute(
                    'CREATE TABLE binlog_1 (id int, updated datetime)')
                cursor.execute(
                    'CREATE TABLE binlog_2 (id int, updated datetime)')
                cursor.execute(
                    'INSERT INTO binlog_1 (id, updated) VALUES (1, \'2017-06-01\')'
                )
                cursor.execute(
                    'INSERT INTO binlog_1 (id, updated) VALUES (2, \'2017-06-20\')'
                )
                cursor.execute(
                    'INSERT INTO binlog_1 (id, updated) VALUES (3, \'2017-09-22\')'
                )
                cursor.execute(
                    'INSERT INTO binlog_2 (id, updated) VALUES (1, \'2017-10-22\')'
                )
                cursor.execute(
                    'INSERT INTO binlog_2 (id, updated) VALUES (2, \'2017-11-10\')'
                )
                cursor.execute(
                    'INSERT INTO binlog_2 (id, updated) VALUES (3, \'2017-12-10\')'
                )
                cursor.execute(
                    'UPDATE binlog_1 set updated=\'2018-06-18\' WHERE id = 3')
                cursor.execute(
                    'UPDATE binlog_2 set updated=\'2018-06-18\' WHERE id = 2')
                cursor.execute('DELETE FROM binlog_1 WHERE id = 2')
                cursor.execute('DELETE FROM binlog_2 WHERE id = 1')

            open_conn.commit()

        self.catalog = test_utils.discover_catalog(self.conn, {})

        for stream in self.catalog.streams:
            stream.stream = stream.table

            stream.metadata = [{
                'breadcrumb': (),
                'metadata': {
                    'selected': True,
                    'database-name': 'tap_mysql_test',
                    'table-key-properties': ['id']
                }
            }, {
                'breadcrumb': ('properties', 'id'),
                'metadata': {
                    'selected': True
                }
            }, {
                'breadcrumb': ('properties', 'updated'),
                'metadata': {
                    'selected': True
                }
            }]

            test_utils.set_replication_method_and_key(stream, 'LOG_BASED',
                                                      None)

            self.state = singer.write_bookmark(self.state,
                                               stream.tap_stream_id,
                                               'log_file', log_file)

            self.state = singer.write_bookmark(self.state,
                                               stream.tap_stream_id, 'log_pos',
                                               log_pos)

            self.state = singer.write_bookmark(self.state,
                                               stream.tap_stream_id, 'version',
                                               singer.utils.now())
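
The three bookmarks seeded in this setUp (log_file, log_pos, version) are exactly what a LOG_BASED sync reads back before attaching to the binlog. A minimal sketch of that readback; the stream id and values here are illustrative, only singer.get_bookmark is real:

import singer

state = {'bookmarks': {'tap_mysql_test-binlog_1': {
    'log_file': 'mysql-bin.000003',   # illustrative value
    'log_pos': 154,                   # illustrative value
    'version': 1}}}

log_file = singer.get_bookmark(state, 'tap_mysql_test-binlog_1', 'log_file')
log_pos = singer.get_bookmark(state, 'tap_mysql_test-binlog_1', 'log_pos')
version = singer.get_bookmark(state, 'tap_mysql_test-binlog_1', 'version')
# A binlog reader would resume from (log_file, log_pos) and stamp every record
# it emits with this stream version.
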
Example #23
def sync_table(connection, catalog_entry, state):
    columns = list(catalog_entry.schema.properties.keys())
    start_date = CONFIG.get('start_date')
    formatted_start_date = None

    if not columns:
        LOGGER.warning(
            'There are no columns selected for table {}, skipping it'.format(
                catalog_entry.table))
        return

    tap_stream_id = catalog_entry.tap_stream_id
    LOGGER.info('Beginning sync for {} table'.format(tap_stream_id))
    with connection.cursor() as cursor:
        schema, table = catalog_entry.table.split('.')
        select = 'SELECT {} FROM {}.{}'.format(
            ','.join('"{}"'.format(c) for c in columns), '"{}"'.format(schema),
            '"{}"'.format(table))
        params = {}

        if start_date is not None:
            formatted_start_date = datetime.datetime.strptime(
                start_date, '%Y-%m-%dT%H:%M:%SZ').replace(tzinfo=pytz.UTC)

        replication_key = metadata.to_map(catalog_entry.metadata).get(
            (), {}).get('replication-key')
        replication_key_value = None
        bookmark_is_empty = state.get('bookmarks',
                                      {}).get(tap_stream_id) is None
        stream_version = get_stream_version(tap_stream_id, state)
        state = singer.write_bookmark(state, tap_stream_id, 'version',
                                      stream_version)
        activate_version_message = singer.ActivateVersionMessage(
            stream=catalog_entry.stream, version=stream_version)

        # If there's a replication key, we want to emit an ACTIVATE_VERSION
        # message at the beginning so the records show up right away. If
        # there's no bookmark at all for this stream, assume it's the very
        # first replication. That is, clients have never seen rows for this
        # stream before, so they can immediately acknowledge the present
        # version.
        if replication_key or bookmark_is_empty:
            yield activate_version_message

        if replication_key:
            replication_key_value = singer.get_bookmark(
                state, tap_stream_id,
                'replication_key_value') or formatted_start_date.isoformat()

        if replication_key_value is not None:
            entry_schema = catalog_entry.schema

            if entry_schema.properties[replication_key].format == 'date-time':
                replication_key_value = pendulum.parse(replication_key_value)

            select += ' WHERE {} >= %(replication_key_value)s ORDER BY {} ' \
                      'ASC'.format(replication_key, replication_key)
            params['replication_key_value'] = replication_key_value

        elif replication_key is not None:
            select += ' ORDER BY {} ASC'.format(replication_key)

        time_extracted = utils.now()
        query_string = cursor.mogrify(select, params)
        LOGGER.info('Running {}'.format(query_string))
        cursor.execute(select, params)
        row = cursor.fetchone()
        rows_saved = 0

        with metrics.record_counter(None) as counter:
            counter.tags['database'] = catalog_entry.database
            counter.tags['table'] = catalog_entry.table
            while row:
                counter.increment()
                rows_saved += 1
                record_message = row_to_record(catalog_entry, stream_version,
                                               row, columns, time_extracted)
                yield record_message

                if replication_key is not None:
                    state = singer.write_bookmark(
                        state, tap_stream_id, 'replication_key_value',
                        record_message.record[replication_key])
                if rows_saved % 1000 == 0:
                    yield singer.StateMessage(value=copy.deepcopy(state))
                row = cursor.fetchone()

        if not replication_key:
            yield activate_version_message
            state = singer.write_bookmark(state, catalog_entry.tap_stream_id,
                                          'version', None)

        yield singer.StateMessage(value=copy.deepcopy(state))
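
Because sync_table is a generator, nothing reaches the target until a caller drains it. A minimal sketch of such a caller (the loop itself is an assumption; singer.write_message is from singer-python):

import singer

# Drain the generator and emit every message, including the ACTIVATE_VERSION
# messages and the StateMessage yielded every 1000 rows.
for message in sync_table(connection, catalog_entry, state):
    singer.write_message(message)
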
Example #24
async def sync_report_interval(client, account_id, report_stream, start_date,
                               end_date):
    state_key = '{}_{}'.format(account_id, report_stream.stream)
    report_name = stringcase.pascalcase(report_stream.stream)

    report_schema = get_report_schema(client, report_name)
    singer.write_schema(report_stream.stream, report_schema, [])

    report_time = arrow.get().isoformat()

    # Get request id to retrieve report stream
    request_id = get_report_request_id(client, account_id, report_stream,
                                       report_name, start_date, end_date,
                                       state_key)

    singer.write_bookmark(STATE, state_key, 'request_id', request_id)
    singer.write_state(STATE)

    try:
        # Get success status and download url
        success, download_url = await poll_report(client, account_id,
                                                  report_name, start_date,
                                                  end_date, request_id)

    except Exception as some_error:  # pylint: disable=broad-except,unused-variable
        LOGGER.info(
            'The request_id %s for %s is invalid, generating a new one',
            request_id, state_key)
        request_id = get_report_request_id(client,
                                           account_id,
                                           report_stream,
                                           report_name,
                                           start_date,
                                           end_date,
                                           state_key,
                                           force_refresh=True)

        singer.write_bookmark(STATE, state_key, 'request_id', request_id)
        singer.write_state(STATE)

        success, download_url = await poll_report(client, account_id,
                                                  report_name, start_date,
                                                  end_date, request_id)

    if success and download_url:  # pylint: disable=no-else-return
        LOGGER.info('Streaming report: %s for account %s - from %s to %s',
                    report_name, account_id, start_date, end_date)

        stream_report(report_stream.stream, report_name, download_url,
                      report_time)
        singer.write_bookmark(STATE, state_key, 'request_id', None)
        singer.write_bookmark(STATE, state_key, 'date', end_date.isoformat())
        singer.write_state(STATE)
        return True
    elif success and not download_url:
        LOGGER.info('No data for report: %s for account %s - from %s to %s',
                    report_name, account_id, start_date, end_date)
        singer.write_bookmark(STATE, state_key, 'request_id', None)
        singer.write_bookmark(STATE, state_key, 'date', end_date.isoformat())
        singer.write_state(STATE)
        return True
    else:
        LOGGER.info(
            'Unsuccessful request for report: %s for account %s - from %s to %s',
            report_name, account_id, start_date, end_date)
        singer.write_bookmark(STATE, state_key, 'request_id', None)
        singer.write_state(STATE)
        return False
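
The `request_id` bookmark is what lets an interrupted run re-poll the same report instead of submitting a new one. get_report_request_id itself is not shown above; a minimal sketch of how such a helper might use that bookmark (everything except singer.get_bookmark is an assumption, including submit_report_request):

import singer

def get_report_request_id(client, account_id, report_stream, report_name,
                          start_date, end_date, state_key, force_refresh=False):
    # Hypothetical sketch: reuse the bookmarked request id unless asked to refresh.
    if not force_refresh:
        request_id = singer.get_bookmark(STATE, state_key, 'request_id')
        if request_id:
            return request_id
    # Otherwise submit a fresh report request (submit_report_request is assumed).
    return submit_report_request(client, account_id, report_name,
                                 start_date, end_date)
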
Example #25
def sync_query(cursor, catalog_entry, state, select_sql, columns,
               stream_version, params):
    """..."""
    replication_key = singer.get_bookmark(state, catalog_entry.tap_stream_id,
                                          'replication_key')

    time_extracted = utils.now()

    LOGGER.info('Running %s', select_sql)
    cursor.execute(select_sql, params)

    row = cursor.fetchone()
    rows_saved = 0

    database_name = get_database_name(catalog_entry)

    with metrics.record_counter(None) as counter:
        counter.tags['database'] = database_name
        counter.tags['table'] = catalog_entry.table

        while row:
            counter.increment()
            rows_saved += 1
            record_message = row_to_singer_record(catalog_entry,
                                                  stream_version, row, columns,
                                                  time_extracted)
            singer.write_message(record_message)

            md_map = metadata.to_map(catalog_entry.metadata)
            stream_metadata = md_map.get((), {})
            replication_method = stream_metadata.get('replication-method')

            if replication_method == 'FULL_TABLE':
                key_properties = get_key_properties(catalog_entry)

                max_pk_values = singer.get_bookmark(
                    state, catalog_entry.tap_stream_id, 'max_pk_values')

                if max_pk_values:
                    last_pk_fetched = {
                        k: v
                        for k, v in record_message.record.items()
                        if k in key_properties
                    }

                    state = singer.write_bookmark(state,
                                                  catalog_entry.tap_stream_id,
                                                  'last_pk_fetched',
                                                  last_pk_fetched)

            elif replication_method == 'INCREMENTAL':
                if replication_key is not None:
                    state = singer.write_bookmark(state,
                                                  catalog_entry.tap_stream_id,
                                                  'replication_key',
                                                  replication_key)

                    state = singer.write_bookmark(
                        state, catalog_entry.tap_stream_id,
                        'replication_key_value',
                        record_message.record[replication_key])
            if rows_saved % 1000 == 0:
                singer.write_message(
                    singer.StateMessage(value=copy.deepcopy(state)))

            row = cursor.fetchone()

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
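
For FULL_TABLE streams the loop above records `last_pk_fetched` alongside the pre-existing `max_pk_values` bookmark so that an interrupted scan can resume. A minimal sketch of how those two bookmarks could bound the next query, assuming a single-column primary key (build_resume_clause is illustrative, not a helper from this tap):

import singer

def build_resume_clause(state, tap_stream_id, key_property):
    # Hypothetical sketch: WHERE clause that resumes a FULL_TABLE scan from the
    # last primary-key value fetched, up to the maximum captured at sync start.
    last_pk_fetched = singer.get_bookmark(state, tap_stream_id, 'last_pk_fetched') or {}
    max_pk_values = singer.get_bookmark(state, tap_stream_id, 'max_pk_values') or {}

    clauses, params = [], {}
    if key_property in last_pk_fetched:
        clauses.append('`{}` > %(pk_low)s'.format(key_property))
        params['pk_low'] = last_pk_fetched[key_property]
    if key_property in max_pk_values:
        clauses.append('`{}` <= %(pk_high)s'.format(key_property))
        params['pk_high'] = max_pk_values[key_property]

    where_sql = ' WHERE ' + ' AND '.join(clauses) if clauses else ''
    return where_sql, params
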
Example #26
def sync(config: dict, state: dict, catalog: singer.Catalog):
    """
    Synchronise data from source schemas using input context
    """

    session = None

    # Get bookmarks of state of each stream
    bookmarks = state.get('bookmarks', dict())

    # Parse timestamp and convert to date
    start_date = singer.utils.strptime_to_utc(config['start_date'])

    # selected_stream_ids = get_selected_streams(catalog)
    #
    # if not selected_stream_ids:
    #     singer.log_warning('No streams selected')

    # Iterate over streams in catalog
    for stream in catalog.streams:

        stream_id = stream.tap_stream_id

        # Skip if not selected for sync
        # if stream_id not in selected_stream_ids:
        #     continue

        LOGGER.info('Syncing stream: "%s"', stream_id)

        filter_schema(stream.schema, stream.metadata)

        # Emit schema
        singer.write_schema(stream_name=stream_id,
                            schema=stream.schema.to_dict(),
                            key_properties=stream.key_properties)

        # Initialise Gemini HTTP API session (only do this once)
        if session is None:
            session = tap_gemini.transport.GeminiSession(
                # Mandatory
                client_id=config['username'],
                client_secret=config['password'],
                refresh_token=config['refresh_token'],

                # Optional
                api_version=config.get('api_version'),
                user_agent=config.get('user_agent'),
                session_options=config.get('session', dict()),
                sandbox=config.get('sandbox'))

            # Get a list of all the account IDs
            advertiser_ids = config.get(
                'advertiser_ids', [adv['id'] for adv in session.advertisers])

        # Create data stream
        if stream_id in OBJECT_MAP.keys():

            # List API objects
            model = OBJECT_MAP[stream_id]
            write_records(stream=stream,
                          rows=model.list_data(session=session),
                          tags=dict(object=stream_id))

        else:
            # Run report

            # Use bookmark to continue where we left off
            bookmark = bookmarks.get(stream_id, dict())
            start_date = bookmark.get(tap_gemini.settings.BOOKMARK_KEY,
                                      start_date)

            # Define time range
            try:
                # Is there a maximum look back? (i.e. earliest start date for report)
                days = tap_gemini.settings.MAX_LOOK_BACK_DAYS[stream_id]

                # Get the current timestamp and "look back" the specified number of days
                look_back_start_date = singer.utils.now() - datetime.timedelta(
                    days=days)

                # Must we confine the time range to avoid errors?
                if look_back_start_date > start_date:
                    start_date = look_back_start_date
                    singer.log_warning(
                        "\"%s\" enforced maximum look back of %s days, start date set to %s",
                        stream_id, days, start_date)

            except KeyError:
                pass

            # Break into time window chunks, if necessary
            try:
                time_windows = generate_time_windows(
                    start=start_date,
                    size=tap_gemini.settings.MAX_WINDOW_DAYS[stream_id])
            except KeyError:
                # Default time window: just use specified start/end date
                time_windows = ((start_date,
                                 cast_date_to_datetime(
                                     date=datetime.date.today())), )

            # Each report is run within a single time window
            for start, end in time_windows:
                # Build report definition
                report_params = build_report_params(config=config,
                                                    stream=stream,
                                                    start_date=start,
                                                    end_date=end)

                report_params['advertiser_ids'] = advertiser_ids

                # Define the report
                rep = tap_gemini.report.GeminiReport(
                    session=session,
                    poll_interval=config.get('poll_interval'),
                    **report_params)

                # Emit records
                write_records(stream=stream, rows=rep.stream(), tags=rep.tags)

                # Bookmark the progress through the stream
                # Get the time when the data is complete (no further changes will occur)
                bookmark_timestamp = get_books_closed(rep=rep)

                # Preserve state for each stream
                singer.write_bookmark(
                    state=state,
                    tap_stream_id=stream_id,
                    key=tap_gemini.settings.BOOKMARK_KEY,
                    val=cast_date_to_datetime(bookmark_timestamp).isoformat())

                singer.write_state(state)
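
generate_time_windows is only referenced above; a minimal sketch of what such a chunking helper might do, splitting the range from a timezone-aware start up to now into consecutive windows of at most `size` days (the implementation is an assumption, not tap_gemini's actual code):

import datetime

import singer.utils

def generate_time_windows(start, size):
    # Hypothetical sketch: yield (start, end) datetime pairs covering
    # [start, now] in consecutive chunks of at most `size` days.
    end_of_range = singer.utils.now()
    window_start = start
    while window_start <= end_of_range:
        window_end = min(window_start + datetime.timedelta(days=size - 1),
                         end_of_range)
        yield window_start, window_end
        window_start = window_end + datetime.timedelta(days=1)
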
Example #27
def write_stream_duration(state, tap_stream_id, start, end):
    duration = (end - start).total_seconds()
    return singer.write_bookmark(state, tap_stream_id, "last_sync_duration",
                                 duration)
def write_current_sync_start(state, tap_stream_id, start):
    value = start
    if start is not None:
        value = utils.strftime(start)
    return singer.write_bookmark(state, tap_stream_id, "current_sync_start",
                                 value)
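
Both helpers are thin wrappers around singer.write_bookmark, so the state they build up is a plain nested dict. A short illustration of the resulting shape (stream id and timestamps are illustrative; utils is singer.utils as in the functions above):

sync_start = utils.now()
state = write_current_sync_start({}, 'companies', sync_start)
# state == {'bookmarks': {'companies': {'current_sync_start': '<UTC timestamp string>'}}}

# ... run the stream's sync ...

state = write_stream_duration(state, 'companies', sync_start, utils.now())
# state['bookmarks']['companies'] now also holds 'last_sync_duration' in seconds.
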
Example #29
    def setUp(self):
        self.conn = test_utils.get_test_connection()
        self.state = {}

        log_file, log_pos = binlog.fetch_current_log_file_and_pos(self.conn)

        with connect_with_backoff(self.conn) as open_conn:
            with open_conn.cursor() as cursor:
                cursor.execute(
                    'CREATE TABLE datetime_types (id int, datetime_col datetime, timestamp_col timestamp, time_col time, date_col date)'
                )
                cursor.execute(
                    'INSERT INTO datetime_types (id, datetime_col, timestamp_col, time_col, date_col) VALUES (1, \'0000-00-00\', \'0000-00-00 00:00:00\', \'00:00:00\', \'0000-00-00\' )'
                )
                cursor.execute(
                    'INSERT INTO datetime_types (id, datetime_col, timestamp_col, time_col, date_col) VALUES (2, NULL, NULL, NULL, NULL)'
                )
            open_conn.commit()

        self.catalog = test_utils.discover_catalog(self.conn, {})

        for stream in self.catalog.streams:
            stream.stream = stream.table

            stream.metadata = [{
                'breadcrumb': (),
                'metadata': {
                    'selected': True,
                    'database-name': 'tap_mysql_test',
                    'table-key-properties': ['id']
                }
            }, {
                'breadcrumb': ('properties', 'id'),
                'metadata': {
                    'selected': True
                }
            }, {
                'breadcrumb': ('properties', 'datetime_col'),
                'metadata': {
                    'selected': True
                }
            }, {
                'breadcrumb': ('properties', 'timestamp_col'),
                'metadata': {
                    'selected': True
                }
            }, {
                'breadcrumb': ('properties', 'time_col'),
                'metadata': {
                    'selected': True
                }
            }, {
                'breadcrumb': ('properties', 'date_col'),
                'metadata': {
                    'selected': True
                }
            }]

            test_utils.set_replication_method_and_key(stream, 'LOG_BASED',
                                                      None)

            self.state = singer.write_bookmark(self.state,
                                               stream.tap_stream_id,
                                               'log_file', log_file)

            self.state = singer.write_bookmark(self.state,
                                               stream.tap_stream_id, 'log_pos',
                                               log_pos)

            self.state = singer.write_bookmark(self.state,
                                               stream.tap_stream_id, 'version',
                                               singer.utils.now())

def sync_companies(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    mdata = metadata.to_map(catalog.get('metadata'))
    bumble_bee = Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING)
    bookmark_key = 'hs_lastmodifieddate'
    start = utils.strptime_to_utc(get_start(STATE, "companies", bookmark_key))
    LOGGER.info("sync_companies from %s", start)
    schema = load_schema('companies')
    singer.write_schema("companies", schema, ["companyId"], [bookmark_key],
                        catalog.get('stream_alias'))

    # Because this stream doesn't query by `lastUpdated`, it cycles through the
    # data set every time. The problem is a race condition: records may be
    # updated between the start of this table's sync and the end, so some
    # updates would not be captured. To combat this, we store the current
    # sync's start in the state and never move the bookmark past this value.
    current_sync_start = get_current_sync_start(STATE,
                                                "companies") or utils.now()
    STATE = write_current_sync_start(STATE, "companies", current_sync_start)
    singer.write_state(STATE)

    url = get_url("companies_all")
    max_bk_value = start
    if CONTACTS_BY_COMPANY in ctx.selected_stream_ids:
        contacts_by_company_schema = load_schema(CONTACTS_BY_COMPANY)
        singer.write_schema("contacts_by_company", contacts_by_company_schema,
                            ["company-id", "contact-id"])

    with bumble_bee:
        for row in gen_request(STATE, 'companies', url, default_company_params,
                               'companies', 'has-more', ['offset'],
                               ['offset']):
            row_properties = row['properties']
            modified_time = None
            if bookmark_key in row_properties:
                # Hubspot returns timestamps in millis
                timestamp_millis = row_properties[bookmark_key][
                    'timestamp'] / 1000.0
                modified_time = datetime.datetime.fromtimestamp(
                    timestamp_millis, datetime.timezone.utc)
            elif 'createdate' in row_properties:
                # Hubspot returns timestamps in millis
                timestamp_millis = row_properties['createdate'][
                    'timestamp'] / 1000.0
                modified_time = datetime.datetime.fromtimestamp(
                    timestamp_millis, datetime.timezone.utc)

            if modified_time and modified_time >= max_bk_value:
                max_bk_value = modified_time

            if not modified_time or modified_time >= start:
                record = request(
                    get_url("companies_detail",
                            company_id=row['companyId'])).json()
                record = bumble_bee.transform(
                    lift_properties_and_versions(record), schema, mdata)
                singer.write_record("companies",
                                    record,
                                    catalog.get('stream_alias'),
                                    time_extracted=utils.now())
                if CONTACTS_BY_COMPANY in ctx.selected_stream_ids:
                    STATE = _sync_contacts_by_company(STATE, ctx,
                                                      record['companyId'])

    # Don't bookmark past the start of this sync to account for updated records during the sync.
    new_bookmark = min(max_bk_value, current_sync_start)
    STATE = singer.write_bookmark(STATE, 'companies', bookmark_key,
                                  utils.strftime(new_bookmark))
    STATE = write_current_sync_start(STATE, 'companies', None)
    singer.write_state(STATE)
    return STATE
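
get_current_sync_start, the counterpart of write_current_sync_start used above, is not shown; a minimal sketch of what it most likely does, reading the bookmark back as a timezone-aware datetime (this is an assumption about its implementation):

import singer
from singer import utils

def get_current_sync_start(state, tap_stream_id):
    # Hypothetical sketch: return the bookmarked sync start as a datetime, or None.
    value = singer.get_bookmark(state, tap_stream_id, 'current_sync_start')
    if value is None:
        return None
    return utils.strptime_to_utc(value)
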