def process_s3_bucket_contents():
    import boto3
    import copy
    import csv
    import hashlib
    import json
    import lzma
    import os
    import typing

    from botocore.exceptions import ClientError
    from collectMetrics import shortcuts
    from datetime import datetime, timedelta

    COMMIT_DATE_FORMAT: str = '%Y-%m-%dT%H:%M:%SZ'
    work_queue, done_queue, ologger = utils.comm_binders(process_s3_bucket_contents)
    outputs_dir: str = os.path.join('/tmp', 'outputs', 'process-contents')
    astroconda_contrib_repos = shortcuts.obtain_s3_datum('astroconda-contrib-repos')
    astroconda_dev_repos = shortcuts.obtain_s3_datum('astroconda-dev-repos')
    ascii_date: str = shortcuts.obtain_latest_ascii_date(outputs_dir)
    s3_client = boto3.client('s3')
    filepaths: typing.List[typing.Any] = []
    latest_dataset: typing.List[typing.Dict[str, typing.Any]] = []

    # Debugging logic
    # ascii_date: str = '2019-11-12'
    # contents = []
    # for page in s3_client.get_paginator('list_objects_v2').paginate(Bucket=os.environ['DATASET_BUCKET'], Prefix=f'daily/{ascii_date}'):
    #     for item in page['Contents']:
    #         filename: str = os.path.basename(item['Key'])
    #         filepath: str = os.path.join(outputs_dir, ascii_date, filename)
    #         file_dir: str = os.path.dirname(filepath)
    #         if not os.path.exists(file_dir):
    #             os.makedirs(file_dir)
    #         s3_key: str = f'daily/{ascii_date}/{filename}'
    #         s3_client.download_file(os.environ['DATASET_BUCKET'], s3_key, filepath)
    #         with lzma.open(filepath, 'r', format=lzma.FORMAT_XZ) as stream:
    #             data = json.loads(stream.read())
    #         contents.append(data)
    # for data in contents:
    # end Debugging logic

    for details in work_queue:
        filename: str = os.path.basename(details['key'])
        filepath: str = os.path.join(outputs_dir, ascii_date, filename)
        file_dir: str = os.path.dirname(filepath)
        if not os.path.exists(file_dir):
            os.makedirs(file_dir)

        s3_key: str = f'daily/{ascii_date}/{filename}'
        try:
            s3_client.download_file(os.environ['DATASET_BUCKET'], s3_key, filepath)
        except ClientError:
            ologger.error(f'Unable to download key[{s3_key}] to file[{filepath}]')
            continue

        else:
            with lzma.open(filepath, 'r', format=lzma.FORMAT_XZ) as stream:
                data = json.loads(stream.read())

            os.remove(filepath)
            try:
                data['base']['name']
            except KeyError:
                continue

            try:
                rtcname: str = data['releases'][0]['name']
                rtcname_url: str = data['releases'][0]['html_url']
            except IndexError:
                try:
                    rtcname: str = data['tags'][0]['name']
                    rtcname_url: str = f'https://github.com/{data["base"]["owner"]["login"]}/{data["base"]["name"]}/releases/tag/{data["tags"][0]["name"]}'
                except IndexError:
                    rtcname: str = 'latest commit'
                    rtcname_url: str = data['commits'][0]['html_url']

            try:
                descrip: str = data['releases'][0]['body'].strip()
            except IndexError:
                try:
                    descrip: str = [
                        comm for comm in data['commits']
                        if comm['sha'] == data['tags'][0]['commit']['sha']
                    ][0]['commit']['message'].strip()
                except IndexError:
                    descrip: str = 'N/A'

            try:
                date: str = data['releases'][0]['created_at']
            except IndexError:
                try:
                    date: str = [
                        comm for comm in data['commits']
                        if comm['sha'] == data['tags'][0]['commit']['sha']
                    ][0]['commit']['author']['date']
                except IndexError:
                    try:
                        date: str = data['commits'][0]['commit']['author']['date']
                    except IndexError:
                        date: str = 'N/A'

            try:
                author_name = data['releases'][0]['author']['login']
                author_login = data['releases'][0]['author']['login']
                author_url: str = f'https://github.com/{author_login}'
            except IndexError:
                try:
                    author_commit = [
                        comm for comm in data['commits']
                        if comm['sha'] == data['tags'][0]['commit']['sha']
                    ][0]
                    author_name: str = author_commit['author'].get('name', author_commit['author'].get('login', None))
                    author_login: str = author_commit['author'].get('login', '')
                    author_url: str = f'https://github.com/{author_login}'
                except IndexError:
                    author_commit = data['commits'][0]['commit']
                    author_name: str = author_commit['author'].get('name', author_commit['author'].get('login', None))
                    author_login: str = author_commit['author'].get('login', '')
                    author_url: str = f'https://github.com/{author_login}'

            try:
                last_commit: str = data['commits'][0]['commit']['author']['date']
            except IndexError:
                last_commit: str = 'N/A'

            try:
                top_contributor: str = data['contributors'][0]['login']
                top_contributor_contributations: int = data['contributors'][0]['contributions']
            except (IndexError, KeyError):
                try:
                    top_contributor: str = data['contributors'][0]['name']
                    top_contributor_contributations: int = data['contributors'][0]['contributions']
                except IndexError:
                    top_contributor: str = 'N/A'
                    top_contributor_contributations: int = 0

            try:
                license: str = data['base']['license']['name']
            except (TypeError, KeyError):
                license: str = 'None'

            ologger.info(f'Building Timeseries Data for Org[{data["base"]["owner"]["login"]}]/Repo[{data["base"]["name"]}]')
            dataset_template = {
                'package_name': data['base']['name'],
                'repo_url': f'https://github.com/{data["base"]["owner"]["login"]}/{data["base"]["name"]}/',
                'owner': data['base']['owner']['login'],
                'archived': data['base']['archived'],
                'astroconda_contrib_repo': data['base']['name'] in [repo['name'] for repo in astroconda_contrib_repos],
                'astroconda_dev_repo': data['base']['name'] in [repo['name'] for repo in astroconda_dev_repos],
                'rtcname': rtcname,
                'rtcname_url': rtcname_url,
                'pulse_monthly': f'https://github.com/{data["base"]["owner"]["login"]}/{data["base"]["name"]}/pulse/monthly',
                'pulse_weekly': f'https://github.com/{data["base"]["owner"]["login"]}/{data["base"]["name"]}/pulse/weekly',
                'descrip': descrip,
                'date': date,
                'author': author_name,
                'author_login': author_login,
                'author_url': author_url,
                'last_commit': last_commit,
                'top_contributor': top_contributor,
                'top_contributor_contributations': top_contributor_contributations,
                'total_contributors': len(data['contributors']),
                'travis_badge': f'https://img.shields.io/travis/{data["base"]["owner"]["login"]}/{data["base"]["name"]}.svg',
                'rtd_badge': f'https://readthedocs.org/projects/{data["base"]["name"]}/badge/?version=latest',
                'license': license,
                'forks': data['base']['forks'],
                'watchers': data['base']['watchers'],
                'issues_open': len([issue for issue in data['issues'] if issue['state'] == 'open']),
                'issues_open_url': f'https://github.com/{data["base"]["owner"]["login"]}/{data["base"]["name"]}/issues?q=is%3Aissue+is%3Aopen',
                'issues_closed': len([issue for issue in data['issues'] if issue['state'] == 'closed']),
                'issues_closed_url': f'https://github.com/{data["base"]["owner"]["login"]}/{data["base"]["name"]}/issues?q=is%3Aissue+is%3Aclosed',
                'pull_requests_open': len([pr for pr in data['pull_requests'] if pr['state'] == 'open']),
                'pull_requests_open_url': f'https://github.com/{data["base"]["owner"]["login"]}/{data["base"]["name"]}/pulls?q=is%3Apr+is%3Aopen',
                'pull_requests_closed': len([pr for pr in data['pull_requests'] if pr['state'] == 'closed']),
                'pull_requests_closed_url': f'https://github.com/{data["base"]["owner"]["login"]}/{data["base"]["name"]}/pulls?q=is%3Apr+is%3Aclosed',
            }
            dataset_template['key'] = hashlib.md5(json.dumps(dataset_template).encode('utf-8')).hexdigest()

            # Find latest commits, issues, and pulls
            now: datetime = datetime.utcnow()
            last_week: datetime = datetime.utcnow() - timedelta(days=7)
            last_month: datetime = datetime.utcnow() - timedelta(days=30)
            dataset_template['commits_last_week'] = 0
            dataset_template['commits_last_month'] = 0
            for commit in data['commits']:
                commit_date: datetime = datetime.strptime(commit['commit']['author']['date'], COMMIT_DATE_FORMAT)
                if commit_date > last_week and commit_date < now:
                    dataset_template['commits_last_week'] += 1

                elif commit_date > last_month and commit_date < now:
                    dataset_template['commits_last_month'] += 1

            dataset_template['pull_requests_opened_last_week'] = 0
            dataset_template['pull_requests_opened_last_month'] = 0
            dataset_template['pull_requests_closed_last_week'] = 0
            dataset_template['pull_requests_closed_last_month'] = 0
            for pull_request in data['pull_requests']:
                created_at: datetime = datetime.strptime(pull_request['created_at'], COMMIT_DATE_FORMAT)
                if created_at > last_week and created_at < now:
                    dataset_template['pull_requests_opened_last_week'] += 1

                if created_at > last_month and created_at < now:
                    dataset_template['pull_requests_opened_last_month'] += 1

                try:
                    closed_at: datetime = datetime.strptime(pull_request['closed_at'], COMMIT_DATE_FORMAT)
                except TypeError:
                    closed_at = None

                else:
                    if closed_at > last_week and closed_at < now:
                        dataset_template['pull_requests_closed_last_week'] += 1

                    if closed_at > last_month and closed_at < now:
                        dataset_template['pull_requests_closed_last_month'] += 1

            dataset_template['issues_opened_last_week'] = 0
            dataset_template['issues_opened_last_month'] = 0
            dataset_template['issues_closed_last_week'] = 0
            dataset_template['issues_closed_last_month'] = 0
            for issue in data['issues']:
                created_at: datetime = datetime.strptime(issue['created_at'], COMMIT_DATE_FORMAT)
                if created_at > last_week and created_at < now:
                    dataset_template['issues_opened_last_week'] += 1

                if created_at > last_month and created_at < now:
                    dataset_template['issues_opened_last_month'] += 1

                try:
                    closed_at: datetime = datetime.strptime(issue['closed_at'], COMMIT_DATE_FORMAT)
                except TypeError:
                    closed_at = None

                else:
                    if closed_at > last_week and closed_at < now:
                        dataset_template['issues_closed_last_week'] += 1

                    if closed_at > last_month and closed_at < now:
                        dataset_template['issues_closed_last_month'] += 1

            # Building sliding window
            first_commit_date: datetime = datetime.strptime(data['commits'][-1]['commit']['author']['date'], COMMIT_DATE_FORMAT)
            last_commit_date: datetime = datetime.strptime(data['commits'][0]['commit']['author']['date'], COMMIT_DATE_FORMAT)
            commit_date_step: timedelta = timedelta(days=7)
            date_steps: typing.List[datetime] = [first_commit_date]
            while True:
                next_commit_boundry = date_steps[-1] + commit_date_step
                if next_commit_boundry > last_commit_date:
                    date_steps.append(last_commit_date)
                    break

                else:
                    date_steps.append(next_commit_boundry)

            fitted_commits: typing.List[typing.Any] = []
            fitted_pull_requests: typing.List[typing.Dict[str, int]] = []
            fitted_pull_requests_avg: typing.List[int] = []
            fitted_issues: typing.List[typing.Dict[str, int]] = []
            fitted_issues_avg: typing.List[int] = []
            for idx, next_commit_boundry in enumerate(date_steps[1:], 0):
                previous_commit_boundry = date_steps[idx]
                fitted_commit: typing.List[typing.Any] = []
                for commit in data['commits']:
                    commit_date: datetime = datetime.strptime(commit['commit']['author']['date'], COMMIT_DATE_FORMAT)
                    if previous_commit_boundry <= commit_date and next_commit_boundry >= commit_date:
                        fitted_commit.append(commit['sha'])

                else:
                    fitted_commits.append(fitted_commit)

                fitted_pull_requests_opened: int = 0
                fitted_pull_requests_closed: int = 0
                fitted_pull_requests_avg_open: float = 0.0
                for pull_request in data['pull_requests']:
                    created_at: datetime = datetime.strptime(pull_request['created_at'], COMMIT_DATE_FORMAT)
                    try:
                        closed_at: datetime = datetime.strptime(pull_request['closed_at'], COMMIT_DATE_FORMAT)
                    except TypeError:
                        closed_at: datetime = None

                    if created_at >= previous_commit_boundry and created_at <= next_commit_boundry:
                        fitted_pull_requests_opened += 1

                    if not closed_at is None:
                        if closed_at >= previous_commit_boundry and closed_at <= next_commit_boundry:
                            fitted_pull_requests_closed += 1
                            fitted_pull_requests_avg_open += (closed_at - created_at).total_seconds()

                else:
                    fitted_pull_requests.append({
                        'opened': fitted_pull_requests_opened,
                        'closed': fitted_pull_requests_closed,
                    })
                    if len(data['pull_requests']) == 0:
                        fitted_pull_requests_avg.append(0)

                    else:
                        pull_request_dom: float = 3600.0 * 24.0 * len(data['pull_requests'])
                        fitted_pull_requests_avg.append(fitted_pull_requests_avg_open / pull_request_dom)

                fitted_issues_opened: int = 0
                fitted_issues_closed: int = 0
                fitted_issues_avg_open: float = 0.0
                for issue in data['issues']:
                    created_at: datetime = datetime.strptime(issue['created_at'], COMMIT_DATE_FORMAT)
                    try:
                        closed_at: datetime = datetime.strptime(issue['closed_at'], COMMIT_DATE_FORMAT)
                    except TypeError:
                        closed_at: datetime = None

                    if created_at >= previous_commit_boundry and created_at <= next_commit_boundry:
                        fitted_issues_opened += 1

                    if not closed_at is None:
                        if closed_at >= previous_commit_boundry and closed_at <= next_commit_boundry:
                            fitted_issues_closed += 1
                            fitted_issues_avg_open += (closed_at - created_at).total_seconds()

                else:
                    fitted_issues.append({
                        'opened': fitted_issues_opened,
                        'closed': fitted_issues_closed,
                    })
                    if len(data['issues']) == 0:
                        fitted_issues_avg.append(0)

                    else:
                        issue_dom = 3600.0 * 24.0 * len(data['issues'])
                        fitted_issues_avg.append(fitted_issues_avg_open / issue_dom)

                dataset = copy.deepcopy(dataset_template)
                dataset['pull_requests_opened_weekly'] = fitted_pull_requests_opened
                dataset['pull_requests_closed_weekly'] = fitted_pull_requests_closed
                dataset['avg_issue_time_weekly'] = fitted_issues_avg_open
                dataset['avg_pr_time_weekly'] = fitted_pull_requests_avg_open
                dataset['issues_opened_weekly'] = fitted_issues_opened
                dataset['issues_closed_weekly'] = fitted_issues_closed
                dataset['date_weekly'] = previous_commit_boundry.strftime(COMMIT_DATE_FORMAT)
                dataset['commits_weekly'] = len(fitted_commit)
                done_queue.put({'dataset': dataset})
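
# Illustration only (not wired into the pipeline): a minimal sketch of the weekly
# sliding-window bucketing used in process_s3_bucket_contents above. It assumes
# ISO-8601 'Z' timestamps; the function name and its inputs are hypothetical and
# exist purely to show the boundary/bucket logic in isolation.
def _example_weekly_commit_buckets(commit_dates):
    from datetime import datetime, timedelta

    if not commit_dates:
        return []

    fmt = '%Y-%m-%dT%H:%M:%SZ'
    parsed = sorted(datetime.strptime(value, fmt) for value in commit_dates)
    # Build 7-day boundaries from the first commit date up to the last one
    boundaries = [parsed[0]]
    while boundaries[-1] + timedelta(days=7) < parsed[-1]:
        boundaries.append(boundaries[-1] + timedelta(days=7))

    boundaries.append(parsed[-1])
    # Count commits falling inside each [previous, next] window, inclusive on
    # both ends, mirroring the comparisons in process_s3_bucket_contents
    buckets = []
    for idx, upper in enumerate(boundaries[1:], 0):
        lower = boundaries[idx]
        buckets.append(len([value for value in parsed if lower <= value <= upper]))

    return buckets
    # Example: _example_weekly_commit_buckets(['2019-11-01T00:00:00Z', '2019-11-02T00:00:00Z',
    #                                          '2019-11-12T00:00:00Z']) -> [2, 1]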
def first_aws_lambda_function():
    work_queue, done_queue, ologger = utils.comm_binders(first_aws_lambda_function)
    ologger.info('Awesome')
    for idx in range(0, 10):
        work_queue.put({'idx': idx})
def run_jobs(options: argparse.Namespace, jobs: typing.Dict[str, types.FunctionType]):
    if bert_constants.DEBUG:
        for idx, (job_name, conf) in enumerate(jobs.items()):
            if options.stop_at_job and options.stop_at_job == job_name:
                logger.info(f'Stopping at Job[{job_name}]')
                break

            if options.jump_to_job and options.jump_to_job != job_name:
                logger.info(f'Skipping Job[{job_name}]')
                continue

            if options.jump_to_job and options.jump_to_job == job_name:
                previous_job_name = [k for k in jobs.keys()][idx - 1]
                previous_job_conf = jobs[previous_job_name]
                cache_backend = previous_job_conf['job'].cache_backend
                if cache_backend:
                    cache_backend.clear_queue(conf['job'].work_key)
                    cache_backend.fill_queue(conf['job'].work_key, options.jump_to_number)

            bert_encoders.clear_encoding()
            bert_encoders.load_identity_encoders(conf['encoding']['identity_encoders'])
            bert_encoders.load_queue_encoders(conf['encoding']['queue_encoders'])
            bert_encoders.load_queue_decoders(conf['encoding']['queue_decoders'])
            logger.info(f'Running Job[{job_name}] as [{conf["spaces"]["pipeline-type"]}]')
            execution_role_arn: str = conf['iam'].get('execution-role-arn', None)
            job_worker_queue, job_done_queue, job_logger = bert_utils.comm_binders(conf['job'])
            if options.cognito is True:
                job_worker_queue.put({
                    'cognito-event': inject_cognito_event(conf)
                })

            for invoke_arg in conf['aws-deploy']['invoke-args']:
                job_worker_queue.put(invoke_arg)

            if execution_role_arn is None:
                with bert_datasource.ENVVars(conf['runner']['environment']):
                    conf['job']()

            else:
                with bert_aws.assume_role(execution_role_arn):
                    with bert_datasource.ENVVars(conf['runner']['environment']):
                        conf['job']()

    else:
        for job_name, conf in jobs.items():
            # Todo: Add jump-to-job here
            if options.stop_at_job and options.stop_at_job == job_name:
                logger.info(f'Stopping at Job[{job_name}]')
                break

            logger.info(f'Running Job[{conf["job"].func_space}] as [{conf["job"].pipeline_type.value}] for [{conf["job"].__name__}]')
            logger.info(f'Job worker count[{conf["job"].workers}]')
            processes: typing.List[multiprocessing.Process] = []
            bert_encoders.clear_encoding()
            bert_encoders.load_identity_encoders(conf['encoding']['identity_encoders'])
            bert_encoders.load_queue_encoders(conf['encoding']['queue_encoders'])
            bert_encoders.load_queue_decoders(conf['encoding']['queue_decoders'])
            job_worker_queue, job_done_queue, job_logger = bert_utils.comm_binders(conf['job'])
            if options.cognito is True:
                job_worker_queue.put({
                    'cognito-event': inject_cognito_event(conf)
                })

            for invoke_arg in conf['aws-deploy']['invoke-args']:
                job_worker_queue.put(invoke_arg)

            @functools.wraps(conf['job'])
            def _job_runner(*args, **kwargs) -> None:
                bert_encoders.clear_encoding()
                bert_encoders.load_identity_encoders(conf['encoding']['identity_encoders'])
                bert_encoders.load_queue_encoders(conf['encoding']['queue_encoders'])
                bert_encoders.load_queue_decoders(conf['encoding']['queue_decoders'])
                execution_role_arn: str = conf['iam'].get('execution-role-arn', None)
                job_restart_count: int = 0
                job_work_queue, job_done_queue, ologger = bert_utils.comm_binders(conf['job'])
                while job_restart_count < conf['runner']['max-retries']:
                    try:
                        if execution_role_arn is None:
                            with bert_datasource.ENVVars(conf['runner']['environment']):
                                conf['job']()
                                while job_work_queue.size() > 0:
                                    conf['job']()

                        else:
                            with bert_aws.assume_role(execution_role_arn):
                                with bert_datasource.ENVVars(conf['runner']['environment']):
                                    conf['job']()
                                    while job_work_queue.size() > 0:
                                        conf['job']()

                    except Exception as err:
                        if LOG_ERROR_ONLY:
                            logger.exception(err)

                        else:
                            raise err

                    else:
                        break

                    job_restart_count += 1

                else:
                    logger.exception(f'Job[{conf["job"].func_space}] failed {job_restart_count} times')

            for idx in range(0, conf['job'].workers):
                proc: multiprocessing.Process = multiprocessing.Process(target=_job_runner, args=())
                proc.daemon = True
                proc.start()
                processes.append(proc)

            else:
                while not STOP_DAEMON and any([proc.is_alive() for proc in processes]):
                    time.sleep(bert_constants.DELAY)
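
# Illustration only: a stripped-down sketch of the retry-then-fan-out pattern
# run_jobs uses above, with the bert-specific pieces (encoders, queue binders,
# IAM role assumption, STOP_DAEMON flag) omitted. The names `job`, `max_retries`,
# and `workers` are hypothetical stand-ins, not part of the bert-etl API.
def _example_run_with_retries(job, max_retries: int = 3, workers: int = 2) -> None:
    import multiprocessing
    import time

    def _runner() -> None:
        # Re-run the job on failure, up to max_retries attempts
        restart_count = 0
        while restart_count < max_retries:
            try:
                job()
            except Exception:
                restart_count += 1
            else:
                break

    # Fan the runner out across daemon processes
    processes = []
    for _ in range(workers):
        proc = multiprocessing.Process(target=_runner)
        proc.daemon = True
        proc.start()
        processes.append(proc)

    # Block until every worker process has exited
    while any(proc.is_alive() for proc in processes):
        time.sleep(0.1)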
def generate_stats():
    import hashlib
    import json
    import os
    import sync_utils

    from datetime import timedelta, datetime
    from collectGithubData import reducers

    download_details = reducers.obtain_channel_download_details()
    work_queue, done_queue, ologger = utils.comm_binders(generate_stats)
    repo_list = []
    entry_keys = []
    for details in work_queue:
        org = details['org_name']
        name = details['repo_name']
        repo_list.append([org, name])
        ologger.info(f'Loading Details for Github Repo:{org}/{name}')
        with details['etl']['repo'] as etl_dataset_reader:
            try:
                repo_data = [entry for entry in etl_dataset_reader][0]
            except IndexError:
                ologger.error(f'Unable to load entry[{org}-{name}]')
                continue

        with details['etl']['commits'] as etl_dataset_reader:
            commits = [entry for entry in etl_dataset_reader]

        with details['etl']['issues'] as etl_dataset_reader:
            issues = [entry for entry in etl_dataset_reader]

        with details['etl']['pull-requests'] as etl_dataset_reader:
            pull_requests = [entry for entry in etl_dataset_reader]

        with details['etl']['releases'] as etl_dataset_reader:
            releases = [entry for entry in etl_dataset_reader]

        with details['etl']['collaborators'] as etl_dataset_reader:
            collaborators = [entry for entry in etl_dataset_reader]

        with details['etl']['tags'] as etl_dataset_reader:
            tags = [entry for entry in etl_dataset_reader]

        dataset_template = {
            'url': f'https://github.com/{org}/{name}/',
            'package_name': name,
            'org': org,
            'description': repo_data['description'],
            'authors_commit': reducers.find_commit_authors(commits),
            'authors_issue': reducers.find_issue_authors(issues),
            'authors_pull_request': reducers.find_pull_request_authors(pull_requests),
            'authors_release': reducers.find_release_authors(releases),
            'authors_tag': reducers.find_tag_authors(tags),
            'archived': repo_data['archived'],
            'date_earliest': reducers.find_earliest_date(issues, pull_requests, commits, releases, tags),
            'date_latest': reducers.find_latest_date(issues, pull_requests, commits, releases, tags),
            'release_authors': reducers.find_release_authors(releases),
            'release_latest_notes': reducers.find_latest_release_notes(commits, releases, tags),
            'release_latest_author': reducers.find_latest_release_author(commits, releases, tags),
            'license': repo_data['license']['name'] if repo_data['license'] else None,
            'is_private': repo_data['private'],
            'count_forks': repo_data['forks'],
            'count_watchers': repo_data['watchers'],
            'count_issues_open': len([issue for issue in issues if issue['state'] == 'open']),
            'count_issues_closed': len([issue for issue in issues if issue['state'] == 'closed']),
            'count_issues_total': len(issues),
            'count_pull_requests_open': len([pr for pr in pull_requests if pr['state'] == 'open']),
            'count_pull_requests_closed': len([pr for pr in pull_requests if pr['state'] == 'closed']),
            'count_pull_requests_total': len(pull_requests),
            'count_commits_total': len(commits),
            'count_releases_total': len(releases),
            'count_tags_total': len(tags),
            'pull_requests_open_url': f'https://github.com/{org}/{name}/pulls?q=is%3Apr+is%3Aopen',
            'pull_requests_closed_url': f'https://github.com/{org}/{name}/pulls?q=is%3Apr+is%3Aclosed',
            'issues_open_url': f'https://github.com/{org}/{name}/issues?q=is%3Aissue+is%3Aopen',
            'issues_closed_url': f'https://github.com/{org}/{name}/issues?q=is%3Aissue+is%3Aclosed',
            'badges': [],
            'download_channel': {},
        }
        for badge_name, badge_links in reducers.find_badge_locations(org, name).items():
            dataset_template['badges'].append({
                'name': badge_name,
                'src': badge_links['src'],
                'anchor': badge_links['anchor'],
            })

        entry_key = f'{org}-{name}'.lower()
        entry_keys.append(entry_key)
        entry_download_details = download_details.get(entry_key, None)
        if entry_download_details is None:
            ologger.info(f'Unable to pull Download Details for Repo[{org}-{name}]')

        else:
            for channel in entry_download_details.get('channels', []):
                dataset_downloads_channel_details = dataset_template['download_channel'].get(channel, {
                    'count': 0,
                    'homes': [],
                })
                if not entry_download_details['home'] in dataset_downloads_channel_details['homes']:
                    dataset_downloads_channel_details['homes'].append(entry_download_details['home'])
                    dataset_downloads_channel_details['count'] += entry_download_details['count']

                else:
                    ologger.error(f'Duplicate Download Details for Repo[{org}-{name}] Channel[{channel}]')
                    import sys
                    sys.exit(1)

                dataset_template['download_channel'][channel] = dataset_downloads_channel_details

        # Upload base-repo information
        stats_s3_key = f'stsci-tools/stats/{org}/{name}.json'
        sync_utils.upload_dataset(dataset_template, stats_s3_key, os.environ['DATASET_BUCKET'])

        COMMIT_DATE_FORMAT: str = '%Y-%m-%dT%H:%M:%SZ'
        date_interval = timedelta(days=1)
        date_earliest = datetime.strptime(dataset_template['date_earliest'], COMMIT_DATE_FORMAT)
        date_latest = datetime.strptime(dataset_template['date_latest'], COMMIT_DATE_FORMAT)
        date_range = date_latest - date_earliest
        window_stat_collection = []
        for window_index in range(0, date_range.days):
            window_start = date_earliest + timedelta(days=window_index)
            window_stop = date_earliest + timedelta(days=window_index + 1)
            window_stats = {
                'window_start': window_start.strftime(COMMIT_DATE_FORMAT),
                'window_stop': window_stop.strftime(COMMIT_DATE_FORMAT),
                'count_commits': 0,
                'count_issues_open': 0,
                'count_issues_closed': 0,
                'count_pull_requests_open': 0,
                'count_pull_requests_closed': 0,
                'count_releases': 0,
                'count_tags': 0,
            }
            for commit in commits:
                commit_date = datetime.strptime(commit['commit']['author']['date'], COMMIT_DATE_FORMAT)
                if commit_date >= window_start and commit_date < window_stop:
                    window_stats['count_commits'] = window_stats['count_commits'] + 1

            for issue in issues:
                issue_created = datetime.strptime(issue['created_at'], COMMIT_DATE_FORMAT)
                issue_updated = datetime.strptime(issue['updated_at'], COMMIT_DATE_FORMAT)
                if (issue_created >= window_start and issue_created < window_stop) \
                        or (issue_updated >= window_start and issue_updated < window_stop):
                    if issue['state'] == 'closed':
                        window_stats['count_issues_closed'] = window_stats['count_issues_closed'] + 1

                    elif issue['state'] == 'open':
                        window_stats['count_issues_open'] = window_stats['count_issues_open'] + 1

                    else:
                        raise NotImplementedError(f'Issue State: {issue["state"]}')

            for pr in pull_requests:
                pr_created = datetime.strptime(pr['created_at'], COMMIT_DATE_FORMAT)
                pr_updated = datetime.strptime(pr['updated_at'], COMMIT_DATE_FORMAT)
                if (pr_created >= window_start and pr_created < window_stop) \
                        or (pr_updated >= window_start and pr_updated < window_stop):
                    if pr['state'] == 'closed':
                        window_stats['count_pull_requests_closed'] = window_stats['count_pull_requests_closed'] + 1

                    elif pr['state'] == 'open':
                        window_stats['count_pull_requests_open'] = window_stats['count_pull_requests_open'] + 1

                    else:
                        raise NotImplementedError(f'PR State: {pr["state"]}')

            for release in releases:
                release_created = datetime.strptime(release['created_at'], COMMIT_DATE_FORMAT)
                release_published = datetime.strptime(release['published_at'], COMMIT_DATE_FORMAT)
                if release_published >= window_start and release_published <= window_stop:
                    window_stats['count_releases'] = window_stats['count_releases'] + 1

            for tag in tags:
                # TODO: Update Extractor to pull tag-dates
                pass

            window_stat_collection.append(window_stats)

        window_stat_collection_s3_key = f'stsci-tools/window-stats/{org}/{name}.json'
        sync_utils.upload_dataset(window_stat_collection, window_stat_collection_s3_key, os.environ['DATASET_BUCKET'])
def second_aws_lambda_function():
    work_queue, done_queue, ologger = utils.comm_binders(second_aws_lambda_function)
    for details in work_queue:
        idx = details['idx']
        ologger.info(f'IDX: {idx}')
def init_job_queue() -> None:
    work_queue, done_queue, ologger = utils.comm_binders(init_job_queue)
    # for idx in range(0, 500):
    for idx in range(0, 10):
        done_queue.put({'idx': idx})
def handle_job_queue() -> None:
    work_queue, done_queue, ologger = utils.comm_binders(handle_job_queue)
    for details in work_queue:
        done_queue.put(details)
def finalize_contents():
    import boto3
    import csv
    import json
    import os
    import tempfile
    import typing

    from collectMetrics import shortcuts

    s3_client = boto3.client('s3')
    work_queue, done_queue, ologger = utils.comm_binders(finalize_contents)
    outputs_dir: str = os.path.join('/tmp', 'outputs', 'finalize_contents')
    if not os.path.exists(outputs_dir):
        os.makedirs(outputs_dir)

    latest_dataset: typing.List[typing.Dict[str, typing.Any]] = []
    for details in work_queue:
        latest_dataset.append(details['dataset'])

    # import ipdb; ipdb.set_trace()
    # s3_keys: typing.List[str] = [item for item in {
    ascii_date: str = shortcuts.obtain_latest_ascii_date(outputs_dir)
    timeseries_filename: str = 'github-metrics.csv'
    timeseries_s3_key: str = f'timeseries/{timeseries_filename}'
    timeseries_filepath: str = os.path.join(outputs_dir, 'timeseries', timeseries_filename)
    timeseries_dir: str = os.path.dirname(timeseries_filepath)
    if not os.path.exists(timeseries_dir):
        os.makedirs(timeseries_dir)

    ologger.info(f'Writing dataset to file[{timeseries_filepath}]')
    with open(timeseries_filepath, 'w') as stream:
        writer = csv.DictWriter(stream, fieldnames=latest_dataset[0].keys())
        writer.writeheader()
        for entry in latest_dataset:
            writer.writerow(entry)

    if constants.DEBUG is False:
        ologger.info(f'Uploading file[{timeseries_filepath}] to s3 bucket[{os.environ["DATASET_BUCKET"]}] key[{timeseries_s3_key}]')
        s3_client.upload_file(timeseries_filepath, os.environ['DATASET_BUCKET'], timeseries_s3_key, ExtraArgs={'ACL': 'public-read'})

    # Last Week Stats
    for owner, package_name, stats in shortcuts.find_last_week_stats(latest_dataset):
        filepath = tempfile.NamedTemporaryFile().name
        s3_key = f'timeseries/last-week-stats/{owner}/{package_name}.json'
        with open(filepath, 'wb') as stream:
            stream.write(json.dumps(stats).encode(constants.ENCODING))

        if constants.DEBUG is False:
            ologger.info(f'Uploading Stats for Owner[{owner}] Package[{package_name}]')
            s3_client.upload_file(filepath, os.environ['DATASET_BUCKET'], s3_key, ExtraArgs={'ACL': 'public-read'})

    # Last Week Entries
    last_week_entries_filepath = tempfile.NamedTemporaryFile().name
    last_week_entries_s3_key = 'timeseries/last-week-entries.json'
    ologger.info(f'Writing Last Week Entries to file[{last_week_entries_filepath}]')
    last_week_entries = shortcuts.last_week_entries(latest_dataset)
    with open(last_week_entries_filepath, 'w') as stream:
        stream.write(json.dumps(last_week_entries))

    if constants.DEBUG is False:
        ologger.info(f'Uploading Last Week Entries to S3Key[{last_week_entries_s3_key}]')
        s3_client.upload_file(last_week_entries_filepath, os.environ['DATASET_BUCKET'], last_week_entries_s3_key, ExtraArgs={'ACL': 'public-read'})

    latest_index_filename: str = 'latest.json'
    latest_index_filepath: str = os.path.join(outputs_dir, latest_index_filename)
    latest_index_s3_key: str = f'timeseries/{latest_index_filename}'
    ologger.info(f'Building Latest Index for date[{ascii_date}]')
    unique_dataset: typing.List[typing.Dict[str, typing.Any]] = []
    unique_dataset_index: typing.List[str] = []
    for entry in latest_dataset:
        entry_key: str = f'{entry["owner"]}-{entry["package_name"]}'
        if not entry_key in unique_dataset_index:
            del entry['pull_requests_opened_weekly']
            del entry['pull_requests_closed_weekly']
            del entry['issues_opened_weekly']
            del entry['issues_closed_weekly']
            del entry['commits_weekly']
            unique_dataset.append(entry)
            unique_dataset_index.append(entry_key)

    with open(latest_index_filepath, 'w') as stream:
        stream.write(json.dumps(unique_dataset))

    if constants.DEBUG is False:
        ologger.info(f'Uploading file[{latest_index_filepath}] to s3 bucket[{os.environ["DATASET_BUCKET"]}] key[{latest_index_s3_key}]')
        s3_client.upload_file(latest_index_filepath, os.environ['DATASET_BUCKET'], latest_index_s3_key, ExtraArgs={'ACL': 'public-read'})
def mine_github_repo():
    work_queue, done_queue, ologger = utils.comm_binders(mine_github_repo)
    import boto3
    import lzma
    import os
    import json
    import requests
    import time

    from bert import aws
    from datetime import datetime
    from collectMetrics import shortcuts
    from requests.auth import HTTPBasicAuth
    from urllib.parse import urlencode

    output_dir: str = os.path.join('/tmp', 'outputs')
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    config = {}
    with aws.kms('bert-etl') as keymaster:
        config['username'] = keymaster.decrypt(os.environ['GITHUB_USERNAME'])
        config['password'] = keymaster.decrypt(os.environ['GITHUB_PASSWORD'])

    config['delay'] = 1
    config['base-url'] = 'https://api.github.com'
    config['headers'] = {
        'User-Agent': 'repostats-tool',
        'Accept': 'application/vnd.github.v3+json'
    }
    config['logger'] = ologger
    for details in work_queue:
        url: str = f'{config["base-url"]}/repos/{details["org_name"]}/{details["repo_name"]}'
        response = requests.get(url, auth=HTTPBasicAuth(config['username'], config['password']), headers=config['headers'])
        time.sleep(config['delay'])
        views = shortcuts.mine_repo_attribute(details['org_name'], details['repo_name'], 'traffic/views', {}, config)
        time.sleep(config['delay'])
        clones = shortcuts.mine_repo_attribute(details['org_name'], details['repo_name'], 'traffic/clones', {}, config)
        time.sleep(config['delay'])
        issues = shortcuts.mine_repo_attribute(details['org_name'], details['repo_name'], 'issues', {}, config)
        time.sleep(config['delay'])
        releases = shortcuts.mine_repo_attribute(details['org_name'], details['repo_name'], 'releases', {}, config)
        time.sleep(config['delay'])
        pull_requests = shortcuts.mine_repo_attribute(details['org_name'], details['repo_name'], 'pulls', {}, config)
        time.sleep(config['delay'])
        contributors = shortcuts.mine_repo_attribute(details['org_name'], details['repo_name'], 'contributors', {'anon': 'true'}, config)
        time.sleep(config['delay'])
        commits = shortcuts.mine_repo_attribute(details['org_name'], details['repo_name'], 'commits', {}, config)
        time.sleep(config['delay'])
        tags = shortcuts.mine_repo_attribute(details['org_name'], details['repo_name'], 'tags', {}, config)
        time.sleep(config['delay'])
        contents = shortcuts.mine_repo_attribute(details['org_name'], details['repo_name'], 'contents', {}, config)
        time.sleep(config['delay'])
        date: str = datetime.utcnow().strftime('%Y-%m-%d')
        filename: str = f'{details["org_name"]}-{details["repo_name"]}.json.xz'
        filepath: str = os.path.join(output_dir, date, filename)
        dir_path: str = os.path.dirname(filepath)
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)

        with lzma.open(filepath, mode='w', format=lzma.FORMAT_XZ) as stream:
            data = json.dumps({
                'base': response.json(),
                'views': views,
                'clones': clones,
                'issues': issues,
                'releases': releases,
                'pull_requests': pull_requests,
                'contributors': contributors,
                'commits': commits,
                'tags': tags,
                'contents': contents,
            }).encode('utf-8')
            stream.write(data)

        s3_key: str = f'daily/{date}/{filename}'
        ologger.info(f'Saving Timeseries Data for Repo[{details["repo_name"]}] to S3 Key[{s3_key}]')
        s3_client = boto3.client('s3')
        response = s3_client.upload_file(filepath, os.environ['DATASET_BUCKET'], s3_key)
        done_queue.put({
            'key': s3_key,
            'filename': os.path.basename(filepath),
        })
        os.remove(filepath)

    else:
        # Runs once the work_queue has been drained; records the latest mined date
        outputs_dir: str = os.path.join('/tmp', 'outputs')
        latest_s3_key: str = 'cache/latest-date.txt'
        latest_filepath: str = os.path.join(outputs_dir, latest_s3_key)
        outputs_dir_path: str = os.path.dirname(latest_filepath)
        if not os.path.exists(outputs_dir_path):
            os.makedirs(outputs_dir_path)

        with open(latest_filepath, 'w') as stream:
            stream.write(date)

        s3_client.upload_file(latest_filepath, os.environ['DATASET_BUCKET'], latest_s3_key)
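
# Illustration only: shortcuts.mine_repo_attribute is not shown in this module, so
# this is a hedged sketch of what a paginated GitHub v3 fetch of a list-style repo
# attribute (issues, commits, tags, ...) might look like, using `requests` and the
# documented `per_page`/`page` query parameters. The function name and the `config`
# layout mirror the call sites above but are assumptions, not the actual implementation.
def _example_mine_repo_attribute(org_name, repo_name, attribute, params, config):
    import time

    import requests
    from requests.auth import HTTPBasicAuth

    results = []
    page = 1
    while True:
        query = dict(params, per_page=100, page=page)
        url = f'{config["base-url"]}/repos/{org_name}/{repo_name}/{attribute}'
        response = requests.get(
            url,
            params=query,
            auth=HTTPBasicAuth(config['username'], config['password']),
            headers=config['headers'])
        response.raise_for_status()
        batch = response.json()
        # Stop once a page comes back empty
        if not batch:
            break

        results.extend(batch)
        page += 1
        # Stay polite with the API, as the loop in mine_github_repo does
        time.sleep(config['delay'])

    return results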
def parse_log_file():
    import boto3
    import gzip
    import json
    import os
    import requests
    import sync_utils
    import tempfile
    import typing

    from datetime import datetime
    from urllib.parse import urlparse

    work_queue, done_queue, ologger = utils.comm_binders(parse_log_file)
    ENCODING = 'utf-8'
    ASTROCONDA_DOWNLOADS = {}
    ASTROCONDA_CHANNEL_DATA = _load_channel_data('astroconda')
    ASTROCONDA_ETC_DOWNLOADS = {}
    ASTROCONDA_ETC_CHANNEL_DATA = _load_channel_data('astroconda-etc')
    # ASTROCONDA_TOMB_DOWNLOADS = {}
    # ASTROCONDA_TOMB_CHANNEL_DATA = _load_channel_data('astroconda-tomb')
    CONDA_DEV_DOWNLOADS = {}
    # CONDA_DEV_CHANNEL_DATA = _load_channel_data('conda-dev')

    def _parse_line(line: str) -> typing.Optional[typing.Dict[str, typing.Any]]:
        clone = list(line)
        line_parts = []
        char_arr = []
        CONTAINERS = [('[', ']'), ('"', '"')]
        BREAK_CHAR = [' ']
        inside_container = False
        while True:
            try:
                char = clone.pop(0)
            except IndexError:
                break

            else:
                if inside_container is False and not char in BREAK_CHAR and not char in [c[0] for c in CONTAINERS]:
                    char_arr.append(char)

                elif inside_container is False and char in [c[0] for c in CONTAINERS]:
                    inside_container = True
                    continue

                elif inside_container is True and char in [c[1] for c in CONTAINERS]:
                    inside_container = False
                    line_parts.append(''.join(char_arr))
                    char_arr = []
                    continue

                elif inside_container is False and char in BREAK_CHAR:
                    line_parts.append(''.join(char_arr))
                    char_arr = []

                else:
                    char_arr.append(char)

        try:
            method, path, protocol = line_parts[5].split(' ')
        except ValueError:
            return None

        else:
            # https://developer.mozilla.org/en-US/docs/Web/HTTP/Methods
            if not method.lower() in ['get', 'post', 'delete', 'head', 'put', 'connect', 'options', 'trace', 'patch']:
                return None

            path = urlparse(path).path
            try:
                return {
                    'ipaddress': line_parts[0],
                    'timestamp': datetime.strptime(line_parts[3], '%d/%b/%Y:%H:%M:%S %z'),
                    'method': method,
                    'path': path,
                    'protocol': protocol,
                    'status_code': int(line_parts[7]),
                    'byte_length': line_parts[8],
                    'user_agent': line_parts[11]
                }
            except IndexError:
                ologger.exception(f'Unable to parse Line[{line}]')
                return None

            except ValueError:
                ologger.exception(f'Unable to parse Line[{line}]')
                return None

    def parse_logfile_stream(stream: 'io-file-stream') -> None:
        for ldx, line in enumerate(stream.readlines()):
            try:
                line = line.decode(ENCODING)
            except UnicodeDecodeError:
                ologger.exception(f'Unable to decode Line[{ldx}]')
                continue

            result = _parse_line(line)
            if result is None:
                continue

            package_name = result['path'].rsplit('/', 1)[-1]
            if package_name.startswith('index.'):
                continue

            if result['path'].endswith('.json'):
                continue

            if result['path'].endswith('json.bz2'):
                continue

            if result['path'].endswith('/'):
                continue

            if result['path'].startswith('/astroconda-staging/'):
                continue

            if result['path'].startswith('/astroconda-staging-d/'):
                continue

            if result['path'].startswith('/astroconda-tomb/'):
                continue

            if result['method'].lower() == 'get' and \
                    any(['linux-64' in result['path'], 'osx-64' in result['path']]) and \
                    result['status_code'] == 200:
                if result['path'].startswith('/astroconda-etc/'):
                    home = ASTROCONDA_ETC_CHANNEL_DATA[package_name].get('repo_home', None)
                    if isinstance(home, list):
                        ologger.warning(f'Multiple homes found for Package[{package_name}]')

                    if home is None:
                        ologger.info(f'Home not found for Package[{package_name}]')
                        continue

                    entry = ASTROCONDA_ETC_DOWNLOADS.get(package_name, {
                        'count': 0,
                        'home': home
                    })
                    entry['count'] += 1
                    ASTROCONDA_ETC_DOWNLOADS[package_name] = entry

                # elif result['path'].startswith('/astroconda-tomb/'):
                #     home = ASTROCONDA_TOMB_CHANNEL_DATA[package_name].get('linux-64', {}).get('repo_home', None) or \
                #         ASTROCONDA_TOMB_CHANNEL_DATA[package_name].get('osx-64', {}).get('repo_home', None) or \
                #         ASTROCONDA_TOMB_CHANNEL_DATA[package_name].get('win-64', {}).get('repo_home', None)
                #     entry = ASTROCONDA_TOMB_DOWNLOADS.get(package_name, {
                #         'count': 0,
                #         'home': home
                #     })
                #     entry['count'] += 1
                #     ASTROCONDA_TOMB_DOWNLOADS[package_name] = entry

                elif result['path'].startswith('/astroconda/'):
                    home = ASTROCONDA_CHANNEL_DATA[package_name].get('repo_home', None)
                    if isinstance(home, list):
                        ologger.warning(f'Multiple homes found for Package[{package_name}]')

                    if home is None:
                        ologger.info(f'Home not found for Package[{package_name}]')
                        continue

                    entry = ASTROCONDA_DOWNLOADS.get(package_name, {
                        'count': 0,
                        'home': home
                    })
                    entry['count'] += 1
                    ASTROCONDA_DOWNLOADS[package_name] = entry

                elif result['path'].startswith('/conda-dev'):
                    count = CONDA_DEV_DOWNLOADS.get(package_name, 0) + 1
                    CONDA_DEV_DOWNLOADS[package_name] = count

                else:
                    raise NotImplementedError(result)

    for idx, details in enumerate(work_queue):
        ologger.info(f'Parsing Logfile[{details["filepath"]}]')
        if details['filepath'].endswith('.gz'):
            with gzip.open(details['filepath'], 'rb') as stream:
                parse_logfile_stream(stream)

        else:
            with open(details['filepath'], 'rb') as stream:
                parse_logfile_stream(stream)

    sync_utils.upload_downloads_dataset({
        'channels': {
            'astroconda-etc': ASTROCONDA_ETC_DOWNLOADS,
            'astroconda': ASTROCONDA_DOWNLOADS
        },
        'data': {
            'astroconda-etc': ASTROCONDA_ETC_CHANNEL_DATA,
            'astroconda': ASTROCONDA_CHANNEL_DATA,
        }
    })
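
# Illustration only: an equivalent, standalone take on the character-scanner
# `_parse_line` nested in parse_log_file above, using a regular expression over the
# common/combined access-log layout instead of tracking '[...]' and '"..."'
# containers by hand. Field names mirror the dict `_parse_line` returns; the regex
# and the sample line below are assumptions about the log format, not pipeline code.
def _example_parse_access_log_line(line):
    import re
    from datetime import datetime

    pattern = re.compile(
        r'(?P<ipaddress>\S+) \S+ \S+ \[(?P<timestamp>[^\]]+)\] '
        r'"(?P<method>\S+) (?P<path>\S+) (?P<protocol>\S+)" '
        r'(?P<status_code>\d{3}) (?P<byte_length>\S+)'
        r'(?: "(?P<referrer>[^"]*)" "(?P<user_agent>[^"]*)")?')
    match = pattern.match(line)
    if match is None:
        return None

    return {
        'ipaddress': match.group('ipaddress'),
        'timestamp': datetime.strptime(match.group('timestamp'), '%d/%b/%Y:%H:%M:%S %z'),
        'method': match.group('method'),
        'path': match.group('path'),
        'protocol': match.group('protocol'),
        'status_code': int(match.group('status_code')),
        'byte_length': match.group('byte_length'),
        'user_agent': match.group('user_agent'),
    }
    # Example:
    # _example_parse_access_log_line(
    #     '203.0.113.7 - - [12/Nov/2019:08:30:00 +0000] '
    #     '"GET /astroconda/linux-64/example-1.0-0.tar.bz2 HTTP/1.1" 200 1024 '
    #     '"-" "conda/4.7.12"')['path']
    # -> '/astroconda/linux-64/example-1.0-0.tar.bz2'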