async def sync_stream(state, instance, counter, *args, start_date=None):
    """Run one stream's async sync, emitting every record it yields.

    For INCREMENTAL streams with no existing bookmark, the bookmark is
    seeded with ``start_date`` before syncing begins.  Progress is
    reported through ``counter`` and the module LOGGER.
    """
    stream = instance.stream
    mdata = stream.metadata
    stream_name = stream.tap_stream_id
    LOGGER.info("%s: Starting sync", stream_name)

    # First incremental run for this stream: seed the bookmark from start_date.
    existing_bookmark = state.get('bookmarks', {}).get(
        stream.tap_stream_id, {}).get(instance.replication_key)
    if instance.replication_method == 'INCREMENTAL' and not existing_bookmark:
        singer.write_bookmark(state, stream.tap_stream_id,
                              instance.replication_key, start_date)

    # Keep a private row count: `counter` may be shared by other async fns.
    emitted = 0
    top_stream = stream
    async for stream, record in instance.sync(state, *args):
        # Sub-stream records are emitted but only parent rows are counted.
        if stream.tap_stream_id == top_stream.tap_stream_id:
            counter.increment()
            emitted += 1
        with singer.Transformer() as transformer:
            rec = transformer.transform(record, stream.schema.to_dict(),
                                        metadata=metadata.to_map(mdata))
            singer.write_record(stream.tap_stream_id, rec)
    LOGGER.info("%s: Completed sync (%s rows)", stream_name, emitted)
def _transform_records(self, start: datetime, end: datetime,
                       counter: singer.metrics.Counter,
                       time_extracted: datetime):
    """Fetch the Pay Summary report for [start, end] and emit its rows.

    The row count is monitored so we can warn as we approach the report
    API's 20,000-row ceiling and log an error once it is hit.
    """
    # The GUID keys are the report's parameter identifiers (the start and
    # end date filters) expected by the report API.
    report_params = {
        "003cd1ea-5f11-4fe8-ae9c-d7af1e3a95d6":
            singer.utils.strftime(start, format_str=self.date_param_fmt),
        "b03cd1ea-5f11-4fe8-ae9c-d7af1e3a95d6":
            singer.utils.strftime(end, format_str=self.date_param_fmt),
    }
    report = self.client.get_report(xrefcode="pay_summary_report",
                                    **report_params)
    rows_returned = 0
    for _, row in report.yield_report_rows(limit=(500, 3600)):
        if not row:
            continue
        rows_returned += 1
        with singer.Transformer() as transformer:
            transformed = transformer.transform(
                data=row,
                schema=self.get_schema(self.tap_stream_id, self.catalog))
            singer.write_record(stream_name=self.tap_stream_id,
                                record=transformed,
                                time_extracted=time_extracted)
            counter.increment()
        if 18000 <= rows_returned < 20000:
            LOGGER.warning(
                "Approaching maximum row limit of 20,000. Consider making request window smaller."
            )
        elif rows_returned >= 20000:
            LOGGER.error(
                "Hit maximum row limit of 20,000. Make request window smaller for Pay Summary Report."
            )
def sync(self, mdata):
    """Emit every message from every channel's conversation history."""
    schema = self.load_schema()
    with singer.metrics.job_timer(
            job_type='list_conversation_history') as timer:
        with singer.metrics.record_counter(endpoint=self.name) as counter:
            for channel in self.channels():
                channel_id = channel.get('id')
                for page in self.webclient.conversations_history(
                        channel=channel_id):
                    for message in page.get('messages'):
                        # Tag each message with its channel before transform.
                        record = {'channel_id': channel_id, **message}
                        with singer.Transformer(
                                integer_datetime_fmt=
                                "unix-seconds-integer-datetime-parsing"
                        ) as transformer:
                            transformed_record = transformer.transform(
                                data=record,
                                schema=schema,
                                metadata=metadata.to_map(mdata))
                            singer.write_record(
                                stream_name=self.name,
                                time_extracted=singer.utils.now(),
                                record=transformed_record)
                            counter.increment()
                    # TODO: handle rate limiting better than this.
                    time.sleep(1)
def sync(self, mdata):
    """Emit one record per Slack user group (disabled groups included)."""
    schema = self.load_schema()
    # pylint: disable=unused-variable
    with singer.metrics.job_timer(job_type='list_user_groups') as timer:
        with singer.metrics.record_counter(endpoint=self.name) as counter:
            pages = self.client.get_user_groups(include_count="true",
                                                include_disabled="true",
                                                include_user="******")
            for page in pages:
                for usergroup in page.get('usergroups'):
                    with singer.Transformer(
                            integer_datetime_fmt="unix-seconds-integer-datetime-parsing") \
                            as transformer:
                        transformed_record = transformer.transform(
                            data=usergroup,
                            schema=schema,
                            metadata=metadata.to_map(mdata))
                        if self.write_to_singer:
                            singer.write_record(
                                stream_name=self.name,
                                time_extracted=singer.utils.now(),
                                record=transformed_record)
                            counter.increment()
def sync(self, mdata):
    """Emit a (channel_id, user_id) record for every channel member."""
    schema = self.load_schema()
    # pylint: disable=unused-variable
    with singer.metrics.job_timer(
            job_type='list_conversation_members') as timer:
        with singer.metrics.record_counter(endpoint=self.name) as counter:
            for channel in self.channels():
                channel_id = channel.get('id')
                for page in self.client.get_channel_members(channel_id):
                    for member in page.get('members'):
                        record = {
                            'channel_id': channel_id,
                            'user_id': member
                        }
                        with singer.Transformer() as transformer:
                            transformed_record = transformer.transform(
                                data=record,
                                schema=schema,
                                metadata=metadata.to_map(mdata))
                            if self.write_to_singer:
                                singer.write_record(
                                    stream_name=self.name,
                                    time_extracted=singer.utils.now(),
                                    record=transformed_record)
                                counter.increment()
def sync(self, mdata, channel_id, ts):
    """Emit every reply of one message thread within the configured window."""
    schema = self.load_schema()
    start, end = self.get_absolute_date_range(self.config.get('start_date'))
    # pylint: disable=unused-variable
    with singer.metrics.job_timer(job_type='list_threads') as timer:
        with singer.metrics.record_counter(endpoint=self.name) as counter:
            pages = self.client.get_thread(channel=channel_id,
                                           ts=ts,
                                           inclusive="true",
                                           oldest=int(start.timestamp()),
                                           latest=int(end.timestamp()))
            for page in pages:
                threads = transform_json(stream=self.name,
                                         data=page.get('messages', []),
                                         date_fields=self.date_fields,
                                         channel_id=channel_id)
                for message in threads:
                    with singer.Transformer(
                            integer_datetime_fmt="unix-seconds-integer-datetime-parsing") \
                            as transformer:
                        transformed_record = transformer.transform(
                            data=message,
                            schema=schema,
                            metadata=metadata.to_map(mdata))
                        if self.write_to_singer:
                            singer.write_record(
                                stream_name=self.name,
                                time_extracted=singer.utils.now(),
                                record=transformed_record)
                            counter.increment()
def get_all_issues(schema, config, state, mdata):
    '''
    Sync the issues stream, incrementally when a 'since' bookmark exists.

    https://developer.github.com/v3/issues/#list-issues-for-a-repository

    Returns the (mutated) state dict with an updated 'since' bookmark.
    '''
    repo_path = config['repository']
    # Resume from the stored bookmark when one exists.
    if bookmarks.get_bookmark(state, "issues", 'since'):
        query_string = '&since={}'.format(
            bookmarks.get_bookmark(state, "issues", 'since'))
    else:
        query_string = ''
    # (Fixed) removed the unused local `last_issue_time`.
    with metrics.record_counter('issues') as counter:
        for response in authed_get_all_pages(
                'issues',
                'https://api.github.com/repos/{}/issues?sort=updated&direction=asc{}'
                .format(repo_path, query_string)):
            issues = response.json()
            extraction_time = singer.utils.now()
            for issue in issues:
                with singer.Transformer() as transformer:
                    rec = transformer.transform(
                        issue, schema, metadata=metadata.to_map(mdata))
                singer.write_record('issues', rec,
                                    time_extracted=extraction_time)
                # Results are sorted ascending by update time, so the
                # extraction time is a safe high-water mark.
                singer.write_bookmark(state, 'issues', 'since',
                                      singer.utils.strftime(extraction_time))
                counter.increment()
    return state
def sync(self, mdata):
    """Emit one record per Slack conversation (channel)."""
    schema = self.load_schema()
    # pylint: disable=unused-variable
    with singer.metrics.job_timer(job_type='list_conversations') as timer:
        with singer.metrics.record_counter(endpoint=self.name) as counter:
            for channel in self.channels():
                # transform_json operates on lists; wrap and unwrap a
                # single channel.
                transformed_channel = transform_json(
                    stream=self.name,
                    data=[channel],
                    date_fields=self.date_fields)
                with singer.Transformer(
                        integer_datetime_fmt="unix-seconds-integer-datetime-parsing") \
                        as transformer:
                    transformed_record = transformer.transform(
                        data=transformed_channel[0],
                        schema=schema,
                        metadata=metadata.to_map(mdata))
                    if self.write_to_singer:
                        singer.write_record(
                            stream_name=self.name,
                            time_extracted=singer.utils.now(),
                            record=transformed_record)
                        counter.increment()
def get_all_commits(schema, config, state, mdata):
    '''
    Sync the commits stream, incrementally when a 'since' bookmark exists.

    https://developer.github.com/v3/repos/commits/#list-commits-on-a-repository

    Returns the (mutated) state dict with an updated 'since' bookmark.
    '''
    repo_path = config['repository']
    # Resume from the stored bookmark when one exists.
    if bookmarks.get_bookmark(state, "commits", 'since'):
        query_string = '?since={}'.format(
            bookmarks.get_bookmark(state, "commits", 'since'))
    else:
        query_string = ''
    # (Fixed) removed the unused local `latest_commit_time`.
    with metrics.record_counter('commits') as counter:
        for response in authed_get_all_pages(
                'commits',
                'https://api.github.com/repos/{}/commits{}'.format(
                    repo_path, query_string)):
            commits = response.json()
            extraction_time = singer.utils.now()
            for commit in commits:
                with singer.Transformer() as transformer:
                    rec = transformer.transform(
                        commit, schema, metadata=metadata.to_map(mdata))
                singer.write_record('commits', rec,
                                    time_extracted=extraction_time)
                singer.write_bookmark(state, 'commits', 'since',
                                      singer.utils.strftime(extraction_time))
                counter.increment()
    return state
def sync(self, mdata):
    """Sync users incrementally on the 'updated' field.

    Records at or before the stored bookmark are skipped; the greatest
    'updated' value seen is written back as the new bookmark.
    """
    schema = self.load_schema()
    bookmark = singer.get_bookmark(state=self.state,
                                   tap_stream_id=self.name,
                                   key=self.replication_key)
    if bookmark is None:
        bookmark = self.config.get('start_date')
    new_bookmark = bookmark
    # pylint: disable=unused-variable
    with singer.metrics.job_timer(job_type='list_users') as timer:
        with singer.metrics.record_counter(endpoint=self.name) as counter:
            for page in self.client.get_users(limit=100):
                members = page.get('members')
                for user in transform_json(stream=self.name,
                                           data=members,
                                           date_fields=self.date_fields):
                    with singer.Transformer(
                            integer_datetime_fmt="unix-seconds-integer-datetime-parsing") \
                            as transformer:
                        transformed_record = transformer.transform(
                            data=user,
                            schema=schema,
                            metadata=metadata.to_map(mdata))
                        new_bookmark = max(new_bookmark,
                                           transformed_record.get('updated'))
                        # Only emit records newer than the stored bookmark.
                        if transformed_record.get('updated') > bookmark:
                            if self.write_to_singer:
                                singer.write_record(
                                    stream_name=self.name,
                                    time_extracted=singer.utils.now(),
                                    record=transformed_record)
                                counter.increment()
    self.state = singer.write_bookmark(state=self.state,
                                       tap_stream_id=self.name,
                                       key=self.replication_key,
                                       val=new_bookmark)
def get_all_commit_comments(schemas, repo_path, state, mdata):
    """Sync commit comments, newest first, stopping at the stored bookmark."""
    # https://developer.github.com/v3/repos/comments/
    bookmark_value = get_bookmark(state, repo_path, "commit_comments", "since")
    bookmark_time = (singer.utils.strptime_to_utc(bookmark_value)
                     if bookmark_value else 0)
    with metrics.record_counter('commit_comments') as counter:
        for response in authed_get_all_pages(
                'commit_comments',
                'https://api.github.com/repos/{}/comments?sort=created_at&direction=desc'.format(repo_path)
        ):
            commit_comments = response.json()
            extraction_time = singer.utils.now()
            for r in commit_comments:
                r['_sdc_repository'] = repo_path
                # The API offers no ?since param, but results are sorted
                # descending: the first record older than the bookmark means
                # everything after it is old too, so stop here.
                if bookmark_time and singer.utils.strptime_to_utc(
                        r.get('updated_at')) < bookmark_time:
                    return state
                with singer.Transformer() as transformer:
                    rec = transformer.transform(
                        r, schemas, metadata=metadata.to_map(mdata))
                singer.write_record('commit_comments', rec,
                                    time_extracted=extraction_time)
                singer.write_bookmark(
                    state, repo_path, 'commit_comments',
                    {'since': singer.utils.strftime(extraction_time)})
                counter.increment()
    return state
def get_all_jobs_for_workflow(
        schemas: dict,
        pipeline_id: str,
        workflow_id: str,
        project: str,
        state: dict,
        metadata: dict,
        job_counter: Optional[metrics.Counter] = None) -> dict:
    """
    Sync every job belonging to one workflow.

    https://circleci.com/docs/api/v2/#get-a-workflow-39-s-jobs

    The caller may pass a shared ``job_counter``; otherwise a fresh one
    is created here.  Returns ``state`` unchanged.
    """
    if job_counter is None:
        # NOTE(review): this counter is never used as a context manager,
        # so its metrics are not flushed on exit -- confirm intended.
        job_counter = metrics.record_counter('jobs')
    job_url = f"https://circleci.com/api/v2/workflow/{workflow_id}/job"
    extraction_time = singer.utils.now()
    for job in get_all_items('jobs', job_url):
        # Denormalize the parent ids onto each job record.
        job.update({'_pipeline_id': pipeline_id,
                    '_workflow_id': workflow_id})
        with singer.Transformer() as transformer:
            record = transformer.transform(
                job,
                schemas['jobs'].to_dict(),
                metadata=metadata_lib.to_map(metadata['jobs']))
        singer.write_record('jobs', record, time_extracted=extraction_time)
        job_counter.increment()
    return state
def do_paginate(self, stream):
    """Fetch and emit every page of `stream`.

    Each HTTP round-trip is timed and tagged with its status code;
    records are transformed and counted, and the stream's state is
    advanced row by row.
    """
    while stream.has_data():
        with singer.metrics.http_request_timer(stream.schema) as timer:
            # (Fixed) removed a try/except that caught ConnectionError /
            # RequestException only to `raise e` again -- it added no
            # handling and rewrote the traceback.
            response = self.execute_stream_request(stream)
            timer.tags[
                singer.metrics.Tag.http_status_code] = response.status_code

        self.validate_response(response)
        self.rate_throttling(response)
        stream.paginate(response)

        # records with metrics
        with singer.metrics.record_counter(stream.schema) as counter:
            with singer.Transformer(
                    singer.NO_INTEGER_DATETIME_PARSING) as optimus_prime:
                for row in self.iterate_response(response):
                    row = stream.process_row(row)
                    if not row:
                        # in case of a non-empty response with an empty element
                        continue
                    row = optimus_prime.transform(row, stream.get_schema())
                    if stream.write_record(row):
                        counter.increment()
                    stream.update_state(row)
def sync(self):
    """Emit a (channel_id, user_id) record for every member of every
    public or private channel."""
    schema = self.load_schema()
    with singer.metrics.job_timer(
            job_type='list_conversation_members') as timer:
        with singer.metrics.record_counter(endpoint=self.name) as counter:
            for channel_page in self.webclient.conversations_list(
                    limit=100,
                    exclude_archived='false',
                    types="public_channel,private_channel"):
                for channel in channel_page.get('channels'):
                    channel_id = channel.get('id')
                    # (Fixed) the inner pagination loop previously reused
                    # the name `page`, shadowing the outer loop variable.
                    for member_page in self.webclient.conversations_members(
                            channel=channel_id):
                        for member in member_page.get('members'):
                            data = {
                                'channel_id': channel_id,
                                'user_id': member
                            }
                            with singer.Transformer() as transformer:
                                transformed_record = transformer.transform(
                                    data=data, schema=schema)
                                singer.write_record(
                                    stream_name=self.name,
                                    time_extracted=singer.utils.now(),
                                    record=transformed_record)
                                counter.increment()
def get_all_stargazers(schema, repo_path, state, mdata):
    '''
    Sync the stargazers stream.

    https://developer.github.com/v3/activity/starring/#list-stargazers
    '''
    bookmark = get_bookmark(state, repo_path, "stargazers", "since")
    query_string = '&since={}'.format(bookmark) if bookmark else ''
    # The star+json media type includes starred_at timestamps.
    stargazers_headers = {'Accept': 'application/vnd.github.v3.star+json'}

    with metrics.record_counter('stargazers') as counter:
        for response in authed_get_all_pages(
                'stargazers',
                'https://api.github.com/repos/{}/stargazers?sort=updated&direction=asc{}'
                .format(repo_path, query_string), stargazers_headers):
            stargazers = response.json()
            extraction_time = singer.utils.now()
            for stargazer in stargazers:
                stargazer['_sdc_repository'] = repo_path
                with singer.Transformer() as transformer:
                    rec = transformer.transform(
                        stargazer, schema, metadata=metadata.to_map(mdata))
                # Promote the nested user id to a top-level column.
                rec['user_id'] = rec['user']['id']
                singer.write_record('stargazers', rec,
                                    time_extracted=extraction_time)
                singer.write_bookmark(
                    state, repo_path, 'stargazers',
                    {'since': singer.utils.strftime(extraction_time)})
                counter.increment()
    return state
def sync(self, mdata):
    """Emit a (channel_id, user_id) record for each member of each channel."""
    schema = self.load_schema()
    with singer.metrics.job_timer(
            job_type='list_conversation_members') as timer:
        with singer.metrics.record_counter(endpoint=self.name) as counter:
            for channel in self.channels():
                channel_id = channel.get('id')
                for page in self.webclient.conversations_members(
                        channel=channel_id):
                    for member in page.get('members'):
                        record = {
                            'channel_id': channel_id,
                            'user_id': member
                        }
                        with singer.Transformer() as transformer:
                            transformed_record = transformer.transform(
                                data=record,
                                schema=schema,
                                metadata=metadata.to_map(mdata))
                            singer.write_record(
                                stream_name=self.name,
                                time_extracted=singer.utils.now(),
                                record=transformed_record)
                            counter.increment()
def get_all_collaborators(schema, repo_path, state, mdata):
    '''
    Sync the collaborators stream (full refresh each run).

    https://developer.github.com/v3/repos/collaborators/#list-collaborators
    '''
    with metrics.record_counter('collaborators') as counter:
        for response in authed_get_all_pages(
                'collaborators',
                'https://api.github.com/repos/{}/collaborators'.format(
                    repo_path)):
            collaborators = response.json()
            extraction_time = singer.utils.now()
            for collaborator in collaborators:
                collaborator['_sdc_repository'] = repo_path
                with singer.Transformer() as transformer:
                    rec = transformer.transform(
                        collaborator, schema, metadata=metadata.to_map(mdata))
                singer.write_record('collaborators', rec,
                                    time_extracted=extraction_time)
                # (Fixed) bookmark key was 'collaborator' (singular),
                # inconsistent with the stream name used by the counter
                # and every other stream's bookmark; the key is never read
                # back in this module, so the rename is safe.
                singer.write_bookmark(
                    state, repo_path, 'collaborators',
                    {'since': singer.utils.strftime(extraction_time)})
                counter.increment()
    return state
def get_all_comments(schema, repo_path, state, mdata):
    '''
    Sync issue comments updated since the bookmark.

    https://developer.github.com/v3/issues/comments/#list-comments-in-a-repository
    '''
    bookmark = get_bookmark(state, repo_path, "comments", "since")
    query_string = '&since={}'.format(bookmark) if bookmark else ''

    with metrics.record_counter('comments') as counter:
        for response in authed_get_all_pages(
                'comments',
                'https://api.github.com/repos/{}/issues/comments?sort=updated&direction=asc{}'
                .format(repo_path, query_string)):
            comments = response.json()
            extraction_time = singer.utils.now()
            for comment in comments:
                comment['_sdc_repository'] = repo_path
                with singer.Transformer() as transformer:
                    rec = transformer.transform(
                        comment, schema, metadata=metadata.to_map(mdata))
                singer.write_record('comments', rec,
                                    time_extracted=extraction_time)
                singer.write_bookmark(
                    state, repo_path, 'comments',
                    {'since': singer.utils.strftime(extraction_time)})
                counter.increment()
    return state
def get_all_project_columns(project_id, schemas, repo_path, state, mdata):
    """Yield transformed project columns, newest first, until the bookmark."""
    bookmark_value = get_bookmark(state, repo_path, "project_columns", "since")
    bookmark_time = (singer.utils.strptime_to_utc(bookmark_value)
                     if bookmark_value else 0)
    with metrics.record_counter('project_columns') as counter:
        for response in authed_get_all_pages(
                'project_columns',
                'https://api.github.com/projects/{}/columns?sort=created_at&direction=desc'
                .format(project_id)):
            for r in response.json():
                r['_sdc_repository'] = repo_path
                # No ?since filter on this endpoint, but results are sorted
                # descending, so the first stale record ends the sync.
                if bookmark_time and singer.utils.strptime_to_utc(
                        r.get('updated_at')) < bookmark_time:
                    return state
                with singer.Transformer() as transformer:
                    rec = transformer.transform(
                        r, schemas, metadata=metadata.to_map(mdata))
                counter.increment()
                yield rec
    return state
def get_all_releases(schemas, repo_path, state, mdata):
    """Sync all releases (full refresh).

    Releases expose no `updated_at` yet can be edited; given the low
    volume, bookmarks are intentionally ignored for this stream.
    """
    with metrics.record_counter('releases') as counter:
        for response in authed_get_all_pages(
                'releases',
                'https://api.github.com/repos/{}/releases?sort=created_at&direction=desc'
                .format(repo_path)):
            releases = response.json()
            extraction_time = singer.utils.now()
            for r in releases:
                r['_sdc_repository'] = repo_path
                with singer.Transformer() as transformer:
                    rec = transformer.transform(
                        r, schemas, metadata=metadata.to_map(mdata))
                singer.write_record('releases', rec,
                                    time_extracted=extraction_time)
                singer.write_bookmark(
                    state, repo_path, 'releases',
                    {'since': singer.utils.strftime(extraction_time)})
                counter.increment()
    return state
def sync_paginated(self, url, params=None, async_session=None):
    """Page through `url` until the API stops returning a `next` offset.

    Returns the accumulated list of all records written.
    """
    table = self.TABLE
    # (Fixed) params defaulted to None but was later mutated with
    # params['offset'] = _next, raising TypeError whenever the API
    # returned a next offset; keep the caller's dict when one is given.
    if params is None:
        params = {}
    _next = True
    page = 1
    all_resources = []
    transformer = singer.Transformer(
        singer.UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING)
    while _next is not None:
        result = self.client.make_request(url, self.API_METHOD, params=params)
        _next = result.get('next')
        data = self.get_stream_data(result['data'], transformer)
        with singer.metrics.record_counter(endpoint=table) as counter:
            singer.write_records(table, data)
            counter.increment(len(data))
            all_resources.extend(data)
        if _next:
            params['offset'] = _next
        LOGGER.info('Synced page {} for {}'.format(page, self.TABLE))
        page += 1
    # Emit any transform warnings accumulated across all pages.
    transformer.log_warning()
    return all_resources
def get_all_issue_labels(schemas, repo_path, state, mdata):
    """Sync all issue labels (full refresh)."""
    # https://developer.github.com/v3/issues/labels/
    with metrics.record_counter('issue_labels') as counter:
        for response in authed_get_all_pages(
                'issue_labels',
                'https://api.github.com/repos/{}/labels'.format(repo_path)):
            issue_labels = response.json()
            extraction_time = singer.utils.now()
            for r in issue_labels:
                r['_sdc_repository'] = repo_path
                with singer.Transformer() as transformer:
                    rec = transformer.transform(
                        r, schemas, metadata=metadata.to_map(mdata))
                singer.write_record('issue_labels', rec,
                                    time_extracted=extraction_time)
                singer.write_bookmark(
                    state, repo_path, 'issue_labels',
                    {'since': singer.utils.strftime(extraction_time)})
                counter.increment()
    return state
def sync_stream(state, start_date, instance, config):
    """Run one stream's sync and return the number of parent records emitted.

    INCREMENTAL streams without a bookmark are seeded with start_date;
    state is written once at the end of the stream.
    """
    stream = instance.stream
    mdata = stream.metadata

    # First incremental run: fall back to start_date as the bookmark.
    has_bookmark = state.get('bookmarks', {}).get(
        stream.tap_stream_id, {}).get(instance.replication_key)
    if instance.replication_method == 'INCREMENTAL' and not has_bookmark:
        singer.write_bookmark(state, stream.tap_stream_id,
                              instance.replication_key, start_date)

    parent_stream = stream
    with metrics.record_counter(stream.tap_stream_id) as counter:
        for stream, record in instance.sync(state):
            # Sub-stream records are emitted but only parent rows counted.
            if stream.tap_stream_id == parent_stream.tap_stream_id:
                counter.increment()
            with singer.Transformer() as transformer:
                rec = transformer.transform(record, stream.schema.to_dict(),
                                            metadata=metadata.to_map(mdata))
            singer.write_record(stream.tap_stream_id, rec)

        # State is only written at the end of a stream's sync: a long sync
        # would never emit a bookmark mid-stream, but we cannot guarantee
        # the order of emitted records either.
        if instance.replication_method == "INCREMENTAL":
            singer.write_state(state)

        return counter.value
def get_all_tags(schemas, repo_path, state, mdata):
    """Sync all tags (full refresh -- the volume of tags is safely low)."""
    with metrics.record_counter('tags') as counter:
        for response in authed_get_all_pages(
                'tags',
                'https://api.github.com/repos/{}/tags?sort=node_id&direction=desc'
                .format(repo_path)):
            tags = response.json()
            extraction_time = singer.utils.now()
            for t in tags:
                t['_sdc_repository'] = repo_path
                with singer.Transformer() as transformer:
                    rec = transformer.transform(
                        t, schemas, metadata=metadata.to_map(mdata))
                singer.write_record('tags', rec,
                                    time_extracted=extraction_time)
                singer.write_bookmark(
                    state, repo_path, 'tags',
                    {'since': singer.utils.strftime(extraction_time)})
                counter.increment()
    return state
def sync(self):
    """Sync customers with their history, tracking the newest last_seen_at
    and writing it back as the stream's bookmark."""
    most_recent_date = self.params["last_seen_at"]
    record_metadata = singer.metadata.to_map(self.metadata)
    with singer.metrics.job_timer(job_type=f"list_{self.tap_stream_id}"), \
            singer.metrics.record_counter(endpoint=self.tap_stream_id) as counter, \
            singer.Transformer() as transformer:
        for page in self._list_resource(url_suffix="/customers/all",
                                        params=self.params):
            for record in page.get(self.stream):
                # Pull the customer's full history with a second request.
                history = self._list_resource(
                    url_suffix="/customers",
                    params={
                        'customer_email': record['email'],
                        'with_history': 'true'
                    }).response
                transformed_record = transformer.transform(
                    data=history,
                    schema=self.schema,
                    metadata=record_metadata)
                singer.write_record(stream_name=self.stream,
                                    time_extracted=singer.utils.now(),
                                    record=transformed_record)
                counter.increment()
                if record["last_seen_at"] > most_recent_date:
                    most_recent_date = record["last_seen_at"]
    singer.bookmarks.write_bookmark(state=self.state,
                                    tap_stream_id=self.tap_stream_id,
                                    key="last_seen_at",
                                    val=most_recent_date)
def get_all_teams(schemas, repo_path, state, mdata):
    """Sync org teams, plus team_members / team_memberships sub-streams
    per team when their schemas are selected.

    Returns the (mutated) state dict with updated 'since' bookmarks.
    """
    # Teams live at the org level; derive the org from "org/repo".
    org = repo_path.split('/')[0]
    with metrics.record_counter('teams') as counter:
        for response in authed_get_all_pages(
                'teams',
                'https://api.github.com/orgs/{}/teams?sort=created_at&direction=desc'.format(org)
        ):
            teams = response.json()
            extraction_time = singer.utils.now()
            for r in teams:
                r['_sdc_repository'] = repo_path
                # transform and write release record
                with singer.Transformer() as transformer:
                    rec = transformer.transform(r, schemas, metadata=metadata.to_map(mdata))
                singer.write_record('teams', rec, time_extracted=extraction_time)
                singer.write_bookmark(state, repo_path, 'teams',
                                      {'since': singer.utils.strftime(extraction_time)})
                counter.increment()
                # Sub-stream: one record per member of this team.
                if schemas.get('team_members'):
                    team_slug = r['slug']
                    for team_members_rec in get_all_team_members(
                            team_slug, schemas['team_members'], repo_path, state, mdata):
                        singer.write_record('team_members', team_members_rec,
                                            time_extracted=extraction_time)
                    singer.write_bookmark(state, repo_path, 'team_members',
                                          {'since': singer.utils.strftime(extraction_time)})
                # Sub-stream: membership detail records for this team.
                # NOTE(review): unlike team_members, no bookmark is written
                # for team_memberships here -- confirm intended.
                if schemas.get('team_memberships'):
                    team_slug = r['slug']
                    for team_memberships_rec in get_all_team_memberships(
                            team_slug, schemas['team_memberships'], repo_path, state, mdata):
                        singer.write_record('team_memberships', team_memberships_rec,
                                            time_extracted=extraction_time)
    return state
def get_all_project_cards(column_id, schemas, organization, state, mdata):
    """Yield transformed project cards for one column, newest first,
    stopping at this column's own bookmark."""
    bookmark_value = get_bookmark(state, organization, "project_cards",
                                  "{}/since".format(column_id))
    bookmark_time = (singer.utils.strptime_to_utc(bookmark_value)
                     if bookmark_value else 0)
    with metrics.record_counter("project_cards") as counter:
        for response in authed_get_all_pages(
                "project_cards",
                "https://api.github.com/projects/columns/{}/cards?sort=created_at&direction=desc"
                .format(column_id),
                {"Accept": "application/vnd.github.inertia-preview+json"},
        ):
            for r in response.json():
                r["_sdc_organization"] = organization
                # No ?since filter on this endpoint, but results are sorted
                # descending, so the first stale record ends the sync.
                updated = singer.utils.strptime_to_utc(r.get("updated_at"))
                if bookmark_time and updated < bookmark_time:
                    return state
                with singer.Transformer() as transformer:
                    rec = transformer.transform(
                        r, schemas, metadata=metadata.to_map(mdata))
                counter.increment()
                yield rec
    return state
def transform_record(self, record):
    """Validate `record` and return it transformed against the catalog
    schema using this stream's metadata (empty when the catalog has none)."""
    with singer.Transformer() as tx:
        # (Fixed) the local was named `metadata`, shadowing the module-level
        # `metadata` name used elsewhere in this file.
        stream_md = self.stream_metadata if self.catalog.metadata else {}
        record = validate_ingestible_data(record)
        return tx.transform(record, self.catalog.schema.to_dict(), stream_md)
def _transform_records(self, start, end, counter):
    """Emit employee punch records falling between `start` and `end` (UTC)."""
    punches = self.client.get_employee_punches(
        filterTransactionStartTimeUTC=singer.utils.strftime(start),
        filterTransactionEndTimeUTC=singer.utils.strftime(end))
    for _, record in punches.yield_records():
        if not record:
            continue
        # Stamp each record with the current bookmark value.
        record["SyncTimestampUtc"] = self.get_bookmark(
            self.config, self.tap_stream_id, self.state,
            self.bookmark_properties)
        with singer.Transformer() as transformer:
            transformed_record = transformer.transform(
                data=record,
                schema=self.get_schema(self.tap_stream_id, self.catalog))
            singer.write_record(stream_name=self.tap_stream_id,
                                time_extracted=singer.utils.now(),
                                record=transformed_record)
            counter.increment()
def get_all_pipelines(schemas: dict, project: str, state: dict,
                      metadata: dict) -> dict:
    """
    Sync pipelines (and, when selected, their workflows/jobs) for `project`.

    https://circleci.com/docs/api/v2/#get-all-pipelines

    Returns the (mutated) state dict with an updated 'since' bookmark.
    """
    bookmark = get_bookmark(state, project, "pipelines", "since")
    if bookmark:
        bookmark_time = singer.utils.strptime_to_utc(bookmark)
    else:
        bookmark_time = None
    pipeline_url = f'https://circleci.com/api/v2/project/{project}/pipeline'
    # NOTE(review): these counters are never used as context managers, so
    # their metrics are not flushed on exit -- confirm intended.
    pipeline_counter = metrics.record_counter('pipelines')
    workflow_counter = metrics.record_counter('workflows') if schemas.get(
        'workflows') else None
    job_counter = metrics.record_counter('jobs') if schemas.get(
        'jobs') else None
    extraction_time = singer.utils.now()
    extraction_time_minus_buffer = extraction_time - TIME_BUFFER_FOR_RUNNING_PIPELINES
    for pipeline in get_all_items('pipelines', pipeline_url):
        # We leave a buffer before extracting a pipeline as a hack to avoid
        # extracting currently running pipelines
        if extraction_time_minus_buffer < singer.utils.strptime_to_utc(
                pipeline.get('updated_at')):
            continue
        # break if the updated time of the pipeline is less than our
        # bookmark_time (results arrive newest-first)
        if bookmark_time is not None and singer.utils.strptime_to_utc(
                pipeline.get('updated_at')) < bookmark_time:
            # Bookmark the buffered time so skipped running pipelines are
            # picked up next run.
            singer.write_bookmark(
                state, project, 'pipelines',
                {'since': singer.utils.strftime(extraction_time_minus_buffer)})
            return state
        # Transform and write record
        with singer.Transformer() as transformer:
            record = transformer.transform(pipeline,
                                           schemas['pipelines'].to_dict(),
                                           metadata=metadata_lib.to_map(
                                               metadata['pipelines']))
            singer.write_record('pipelines', record,
                                time_extracted=extraction_time)
            pipeline_counter.increment()
        # If workflows are selected, grab all workflows for this pipeline
        if schemas.get('workflows'):
            state = get_all_workflows_for_pipeline(schemas,
                                                   pipeline.get("id"),
                                                   project, state, metadata,
                                                   workflow_counter,
                                                   job_counter)
    # Update bookmarks after extraction
    singer.write_bookmark(state, project, 'pipelines',
                          {'since': singer.utils.strftime(extraction_time)})
    return state