Code Example #1
File: jobs.py Project: stscicrawford/github-metrics
def process_s3_bucket_contents():
    import boto3
    import copy
    import csv
    import hashlib
    import json
    import lzma
    import os
    import typing

    from botocore.exceptions import ClientError
    from collectMetrics import shortcuts

    from datetime import datetime, timedelta

    COMMIT_DATE_FORMAT: str = '%Y-%m-%dT%H:%M:%SZ'

    work_queue, done_queue, ologger = utils.comm_binders(
        process_s3_bucket_contents)
    outputs_dir: str = os.path.join('/tmp', 'outputs', 'process-contents')
    astroconda_contrib_repos = shortcuts.obtain_s3_datum(
        'astroconda-contrib-repos')
    astroconda_dev_repos = shortcuts.obtain_s3_datum('astroconda-dev-repos')
    ascii_date: str = shortcuts.obtain_latest_ascii_date(outputs_dir)
    s3_client = boto3.client('s3')

    filepaths: typing.List[typing.Any] = []
    latest_dataset: typing.List[typing.Dict[str, typing.Any]] = []
    # Debugging logic
    # ascii_date: str = '2019-11-12'
    # contents = []
    # for page in s3_client.get_paginator('list_objects_v2').paginate(Bucket=os.environ['DATASET_BUCKET'], Prefix=f'daily/{ascii_date}'):
    #     for item in page['Contents']:
    #         filename: str = os.path.basename(item['Key'])
    #         filepath: str = os.path.join(outputs_dir, ascii_date, filename)
    #         file_dir: str = os.path.dirname(filepath)
    #         if not os.path.exists(file_dir):
    #             os.makedirs(file_dir)

    #         s3_key: str = f'daily/{ascii_date}/{filename}'
    #         s3_client.download_file(os.environ['DATASET_BUCKET'], s3_key, filepath)
    #         with lzma.open(filepath, 'r', format=lzma.FORMAT_XZ) as stream:
    #             data = json.loads(stream.read())
    #             contents.append(data)

    # for data in contents:
    # end Debugging logic

    for details in work_queue:
        filename: str = os.path.basename(details['key'])
        filepath: str = os.path.join(outputs_dir, ascii_date, filename)
        file_dir: str = os.path.dirname(filepath)
        if not os.path.exists(file_dir):
            os.makedirs(file_dir)

        s3_key: str = f'daily/{ascii_date}/{filename}'
        try:
            s3_client.download_file(os.environ['DATASET_BUCKET'], s3_key,
                                    filepath)
        except ClientError:
            ologger.error(
                f'Unable to download key[{s3_key}] to file[{filepath}]')
            continue

        else:
            with lzma.open(filepath, 'r', format=lzma.FORMAT_XZ) as stream:
                data = json.loads(stream.read())

            os.remove(filepath)
            try:
                # Skip payloads that are missing base repository info
                data['base']['name']
            except KeyError:
                continue

            try:
                rtcname: str = data['releases'][0]['name']
                rtcname_url: str = data['releases'][0]['html_url']
            except IndexError:
                try:
                    rtcname: str = data['tags'][0]['name']
                    rtcname_url: str = f'https://github.com/{data["base"]["owner"]["login"]}/{data["base"]["name"]}/releases/tag/{data["tags"][0]["name"]}'
                except IndexError:
                    rtcname: str = 'latest commit'
                    rtcname_url: str = data['commits'][0]['html_url']

            try:
                descrip: str = data['releases'][0]['body'].strip()
            except IndexError:
                try:
                    descrip: str = [
                        comm for comm in data['commits']
                        if comm['sha'] == data['tags'][0]['commit']['sha']
                    ][0]['commit']['message'].strip()
                except IndexError:
                    descrip: str = 'N/A'

            try:
                date: str = data['releases'][0]['created_at']
            except IndexError:
                try:
                    date: str = [
                        comm for comm in data['commits']
                        if comm['sha'] == data['tags'][0]['commit']['sha']
                    ][0]['commit']['author']['date']
                except IndexError:
                    try:
                        date: str = data['commits'][0]['commit']['author'][
                            'date']
                    except IndexError:
                        date: str = 'N/A'

            try:
                author_name = data['releases'][0]['author']['login']
                author_login = data['releases'][0]['author']['login']
                author_url: str = f'https://github.com/{author_login}'
            except IndexError:
                try:
                    author_commit = [
                        comm for comm in data['commits']
                        if comm['sha'] == data['tags'][0]['commit']['sha']
                    ][0]
                    author_name: str = author_commit['author'].get(
                        'name', author_commit['author'].get('login', None))
                    author_login: str = author_commit['author'].get(
                        'login', '')
                    author_url: str = f'https://github.com/{author_login}'
                except IndexError:
                    author_commit: str = data['commits'][0]['commit']
                    author_name: str = author_commit['author'].get(
                        'name', author_commit['author'].get('login', None))
                    author_login: str = author_commit['author'].get(
                        'login', '')
                    author_url: str = f'https://github.com/{author_login}'

            try:
                last_commit: str = data['commits'][0]['commit']['author'][
                    'date']
            except IndexError:
                last_commit: str = 'N/A'

            try:
                top_contributor: str = data['contributors'][0]['login']
                top_contributor_contributations: int = data['contributors'][0][
                    'contributions']
            except (IndexError, KeyError):
                try:
                    top_contributor: str = data['contributors'][0]['name']
                    top_contributor_contributations: int = data[
                        'contributors'][0]['contributions']
                except IndexError:
                    top_contributor: str = 'N/A'
                    top_contributor_contributations: int = 0

            try:
                license: str = data['base']['license']['name']
            except (TypeError, KeyError):
                license: str = 'None'

            ologger.info(
                f'Building Timeseries Data for Org[{data["base"]["owner"]["login"]}]/Repo[{data["base"]["name"]}]'
            )
            dataset_template = {
                'package_name':
                data['base']['name'],
                'repo_url':
                f'https://github.com/{data["base"]["owner"]["login"]}/{data["base"]["name"]}/',
                'owner':
                data['base']['owner']['login'],
                'archived':
                data['base']['archived'],
                'astroconda_contrib_repo':
                data['base']['name']
                in [repo['name'] for repo in astroconda_contrib_repos],
                'astroconda_dev_repo':
                data['base']['name']
                in [repo['name'] for repo in astroconda_dev_repos],
                'rtcname':
                rtcname,
                'rtcname_url':
                rtcname_url,
                'pulse_monthly':
                f'https://github.com/{data["base"]["owner"]["login"]}/{data["base"]["name"]}/pulse/monthly',
                'pulse_weekly':
                f'https://github.com/{data["base"]["owner"]["login"]}/{data["base"]["name"]}/pulse/weekly',
                'descrip':
                descrip,
                'date':
                date,
                'author':
                author_name,
                'author_login':
                author_login,
                'author_url':
                author_url,
                'last_commit':
                last_commit,
                'top_contributor':
                top_contributor,
                'top_contributor_contributations':
                top_contributor_contributations,
                'total_contributors':
                len(data['contributors']),
                'travis_badge':
                f'https://img.shields.io/travis/{data["base"]["owner"]["login"]}/{data["base"]["name"]}.svg',
                'rtd_badge':
                f'https://readthedocs.org/projects/{data["base"]["name"]}/badge/?version=latest',
                'license':
                license,
                'forks':
                data['base']['forks'],
                'watchers':
                data['base']['watchers'],
                'issues_open':
                len([
                    issue for issue in data['issues']
                    if issue['state'] == 'open'
                ]),
                'issues_open_url':
                f'https://github.com/{data["base"]["owner"]["login"]}/{data["base"]["name"]}/issues?q=is%3Aissue+is%3Aopen',
                'issues_closed':
                len([
                    issue for issue in data['issues']
                    if issue['state'] == 'closed'
                ]),
                'issues_closed_url':
                f'https://github.com/{data["base"]["owner"]["login"]}/{data["base"]["name"]}/issues?q=is%3Aissue+is%3Aclosed',
                'pull_requests_open':
                len([
                    pr for pr in data['pull_requests'] if pr['state'] == 'open'
                ]),
                'pull_requests_open_url':
                f'https://github.com/{data["base"]["owner"]["login"]}/{data["base"]["name"]}/pulls?q=is%3Apr+is%3Aopen',
                'pull_requests_closed':
                len([
                    pr for pr in data['pull_requests']
                    if pr['state'] == 'closed'
                ]),
                'pull_requests_closed_url':
                f'https://github.com/{data["base"]["owner"]["login"]}/{data["base"]["name"]}/pulls?q=is%3Apr+is%3Aclosed',
            }
            dataset_template['key'] = hashlib.md5(
                json.dumps(dataset_template).encode('utf-8')).hexdigest()
            # Find latest commits, issues, and pulls
            now: datetime = datetime.utcnow()
            last_week: datetime = now - timedelta(days=7)
            last_month: datetime = now - timedelta(days=30)

            dataset_template['commits_last_week'] = 0
            dataset_template['commits_last_month'] = 0
            for commit in data['commits']:
                commit_date: datetime = datetime.strptime(
                    commit['commit']['author']['date'], COMMIT_DATE_FORMAT)
                if commit_date > last_week and commit_date < now:
                    dataset_template['commits_last_week'] += 1

                elif commit_date > last_month and commit_date < now:
                    dataset_template['commits_last_month'] += 1

            dataset_template['pull_requests_opened_last_week'] = 0
            dataset_template['pull_requests_opened_last_month'] = 0
            dataset_template['pull_requests_closed_last_week'] = 0
            dataset_template['pull_requests_closed_last_month'] = 0
            for pull_request in data['pull_requests']:
                created_at: datetime = datetime.strptime(
                    pull_request['created_at'], COMMIT_DATE_FORMAT)
                if created_at > last_week and created_at < now:
                    dataset_template['pull_requests_opened_last_week'] += 1

                if created_at > last_month and created_at < now:
                    dataset_template['pull_requests_opened_last_month'] += 1

                try:
                    closed_at: datetime = datetime.strptime(
                        pull_request['closed_at'], COMMIT_DATE_FORMAT)
                except TypeError:
                    closed_at = None

                else:
                    if closed_at > last_week and closed_at < now:
                        dataset_template['pull_requests_closed_last_week'] += 1

                    if closed_at > last_month and closed_at < now:
                        dataset_template[
                            'pull_requests_closed_last_month'] += 1

            dataset_template['issues_opened_last_week'] = 0
            dataset_template['issues_opened_last_month'] = 0
            dataset_template['issues_closed_last_week'] = 0
            dataset_template['issues_closed_last_month'] = 0
            for issue in data['issues']:
                created_at: datetime = datetime.strptime(
                    issue['created_at'], COMMIT_DATE_FORMAT)
                if created_at > last_week and created_at < now:
                    dataset_template['issues_opened_last_week'] += 1

                if created_at > last_month and created_at < now:
                    dataset_template['issues_opened_last_month'] += 1

                try:
                    closed_at: datetime = datetime.strptime(
                        issue['closed_at'], COMMIT_DATE_FORMAT)
                except TypeError:
                    closed_at = None

                else:
                    if closed_at > last_week and closed_at < now:
                        dataset_template['issues_closed_last_week'] += 1

                    if closed_at > last_month and closed_at < now:
                        dataset_template['issues_closed_last_month'] += 1

            # Building sliding window
            first_commit_date: datetime = datetime.strptime(
                data['commits'][-1]['commit']['author']['date'],
                COMMIT_DATE_FORMAT)
            last_commit_date: datetime = datetime.strptime(
                data['commits'][0]['commit']['author']['date'],
                COMMIT_DATE_FORMAT)
            commit_date_step: timedelta = timedelta(days=7)
            date_steps: typing.List[datetime] = [first_commit_date]
            while True:
                next_commit_boundry = date_steps[-1] + commit_date_step
                if next_commit_boundry > last_commit_date:
                    date_steps.append(last_commit_date)
                    break

                else:
                    date_steps.append(next_commit_boundry)

            fitted_commits: typing.List[typing.Any] = []
            fitted_pull_requests: typing.List[typing.Dict[str, int]] = []
            fitted_pull_requests_avg: typing.List[int] = []
            fitted_issues: typing.List[typing.Dict[str, int]] = []
            fitted_issues_avg: typing.List[int] = []
            for idx, next_commit_boundry in enumerate(date_steps[1:], 0):
                previous_commit_boundry = date_steps[idx]
                fitted_commit: typing.List[typing.Any] = []
                for commit in data['commits']:
                    commit_date: datetime = datetime.strptime(
                        commit['commit']['author']['date'], COMMIT_DATE_FORMAT)
                    if previous_commit_boundry <= commit_date and next_commit_boundry >= commit_date:
                        fitted_commit.append(commit['sha'])

                else:
                    fitted_commits.append(fitted_commit)

                fitted_pull_requests_opened: int = 0
                fitted_pull_requests_closed: int = 0
                fitted_pull_requests_avg_open: float = 0.0
                for pull_request in data['pull_requests']:
                    created_at: datetime = datetime.strptime(
                        pull_request['created_at'], COMMIT_DATE_FORMAT)
                    try:
                        closed_at: datetime = datetime.strptime(
                            pull_request['closed_at'], COMMIT_DATE_FORMAT)
                    except TypeError:
                        closed_at: datetime = None

                    if created_at >= previous_commit_boundry and created_at <= next_commit_boundry:
                        fitted_pull_requests_opened += 1

                    if closed_at is not None:
                        if closed_at >= previous_commit_boundry and closed_at <= next_commit_boundry:
                            fitted_pull_requests_closed += 1

                        fitted_pull_requests_avg_open += (
                            closed_at - created_at).total_seconds()

                else:
                    fitted_pull_requests.append({
                        'opened':
                        fitted_pull_requests_opened,
                        'closed':
                        fitted_pull_requests_closed,
                    })
                    if len(data['pull_requests']) == 0:
                        fitted_pull_requests_avg.append(0)

                    else:
                        pull_request_dom: float = 3600.0 * 24.0 * len(
                            data['pull_requests'])
                        fitted_pull_requests_avg.append(
                            fitted_pull_requests_avg_open / pull_request_dom)

                fitted_issues_opened: int = 0
                fitted_issues_closed: int = 0
                fitted_issues_avg_open: float = 0.0
                for issue in data['issues']:
                    created_at: datetime = datetime.strptime(
                        issue['created_at'], COMMIT_DATE_FORMAT)
                    try:
                        closed_at: datetime = datetime.strptime(
                            issue['closed_at'], COMMIT_DATE_FORMAT)
                    except TypeError:
                        closed_at = None

                    if created_at >= previous_commit_boundry and created_at <= next_commit_boundry:
                        fitted_issues_opened += 1

                    if closed_at is not None:
                        if closed_at >= previous_commit_boundry and closed_at <= next_commit_boundry:
                            fitted_issues_closed += 1

                        fitted_issues_avg_open += (closed_at -
                                                   created_at).total_seconds()

                else:
                    fitted_issues.append({
                        'opened': fitted_issues_opened,
                        'closed': fitted_issues_closed,
                    })
                    if len(data['issues']) == 0:
                        fitted_issues_avg.append(0)

                    else:
                        issue_dom = 3600.0 * 24.0 * len(data['issues'])
                        fitted_issues_avg.append(fitted_issues_avg_open /
                                                 issue_dom)

                dataset = copy.deepcopy(dataset_template)
                dataset[
                    'pull_requests_opened_weekly'] = fitted_pull_requests_opened
                dataset[
                    'pull_requests_closed_weekly'] = fitted_pull_requests_closed
                dataset['avg_issue_time_weekly'] = fitted_issues_avg_open
                dataset['avg_pr_time_weekly'] = fitted_pull_requests_avg_open
                dataset['issues_opened_weekly'] = fitted_issues_opened
                dataset['issues_closed_weekly'] = fitted_issues_closed
                dataset['date_weekly'] = previous_commit_boundry.strftime(
                    COMMIT_DATE_FORMAT)
                dataset['commits_weekly'] = len(fitted_commit)
                done_queue.put({'dataset': dataset})
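
The sliding-window logic in process_s3_bucket_contents steps from the first to the last commit date in 7-day increments and buckets commits (and, analogously, issues and pull requests) into each window. A minimal standard-library sketch of that bucketing, using made-up timestamps rather than real GitHub data, might look like this:

from datetime import datetime, timedelta
import typing

COMMIT_DATE_FORMAT: str = '%Y-%m-%dT%H:%M:%SZ'

# Hypothetical commit dates, newest first, mirroring the ordering the job above expects
commit_dates: typing.List[str] = [
    '2019-11-10T12:00:00Z',
    '2019-10-28T09:30:00Z',
    '2019-10-02T17:45:00Z',
]
parsed = [datetime.strptime(value, COMMIT_DATE_FORMAT) for value in commit_dates]
first_commit_date, last_commit_date = parsed[-1], parsed[0]

# Step from the first commit to the last in 7-day increments
step = timedelta(days=7)
date_steps = [first_commit_date]
while date_steps[-1] + step < last_commit_date:
    date_steps.append(date_steps[-1] + step)
date_steps.append(last_commit_date)

# Count the commits that fall inside each [previous, next] window
weekly_counts = []
for idx, next_boundary in enumerate(date_steps[1:], 0):
    previous_boundary = date_steps[idx]
    weekly_counts.append(
        sum(1 for value in parsed if previous_boundary <= value <= next_boundary))

print(weekly_counts)
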
Code Example #2
File: jobs.py Project: jbcurtin/bert-simple
def first_aws_lambda_function():
    work_queue, done_queue, ologger = utils.comm_binders(
        first_aws_lambda_function)
    ologger.info('Awesome')
    for idx in range(0, 10):
        work_queue.put({'idx': idx})
Code Example #3
File: manager.py Project: jbcurtin/bert-etl
def run_jobs(options: argparse.Namespace, jobs: typing.Dict[str, types.FunctionType]):
    if bert_constants.DEBUG:
        for idx, (job_name, conf) in enumerate(jobs.items()):
            if options.stop_at_job and options.stop_at_job == job_name:
                logger.info(f'Stopping at Job[{job_name}]')
                break

            if options.jump_to_job and options.jump_to_job != job_name:
                logger.info(f'Skipping Job[{job_name}]')
                continue

            if options.jump_to_job and options.jump_to_job == job_name:
                previous_job_name = [k for k in jobs.keys()][idx - 1]
                previous_job_conf = jobs[previous_job_name]
                cache_backend = previous_job_conf['job'].cache_backend
                if cache_backend:
                    cache_backend.clear_queue(conf['job'].work_key)
                    cache_backend.fill_queue(conf['job'].work_key, options.jump_to_number)

            bert_encoders.clear_encoding()
            bert_encoders.load_identity_encoders(conf['encoding']['identity_encoders'])
            bert_encoders.load_queue_encoders(conf['encoding']['queue_encoders'])
            bert_encoders.load_queue_decoders(conf['encoding']['queue_decoders'])
            logger.info(f'Running Job[{job_name}] as [{conf["spaces"]["pipeline-type"]}]')
            execution_role_arn: str = conf['iam'].get('execution-role-arn', None)
            job_worker_queue, job_done_queue, job_logger = bert_utils.comm_binders(conf['job'])

            if options.cognito is True:
                job_worker_queue.put({
                    'cognito-event': inject_cognito_event(conf)
                })

            for invoke_arg in conf['aws-deploy']['invoke-args']:
                job_worker_queue.put(invoke_arg)

            if execution_role_arn is None:
                with bert_datasource.ENVVars(conf['runner']['environment']):
                    conf['job']()

            else:
                with bert_aws.assume_role(execution_role_arn):
                    with bert_datasource.ENVVars(conf['runner']['environment']):
                        conf['job']()

    else:
        for job_name, conf in jobs.items():
            # Todo: Add jump-to-job here
            if options.stop_at_job and options.stop_at_job == job_name:
                logger.info(f'Stopping at Job[{job_name}]')
                break

            logger.info(f'Running Job[{conf["job"].func_space}] as [{conf["job"].pipeline_type.value}] for [{conf["job"].__name__}]')
            logger.info(f'Job worker count[{conf["job"].workers}]')
            processes: typing.List[multiprocessing.Process] = []
            bert_encoders.clear_encoding()
            bert_encoders.load_identity_encoders(conf['encoding']['identity_encoders'])
            bert_encoders.load_queue_encoders(conf['encoding']['queue_encoders'])
            bert_encoders.load_queue_decoders(conf['encoding']['queue_decoders'])

            job_worker_queue, job_done_queue, job_logger = bert_utils.comm_binders(conf['job'])
            if options.cognito is True:
                job_worker_queue.put({
                    'cognito-event': inject_cognito_event(conf)
                })

            for invoke_arg in conf['aws-deploy']['invoke-args']:
                job_worker_queue.put(invoke_arg)

            @functools.wraps(conf['job'])
            def _job_runner(*args, **kwargs) -> None:
                bert_encoders.clear_encoding()
                bert_encoders.load_identity_encoders(conf['encoding']['identity_encoders'])
                bert_encoders.load_queue_encoders(conf['encoding']['queue_encoders'])
                bert_encoders.load_queue_decoders(conf['encoding']['queue_decoders'])
                execution_role_arn: str = conf['iam'].get('execution-role-arn', None)
                job_restart_count: int = 0
                job_work_queue, job_done_queue, ologger = bert_utils.comm_binders(conf['job'])
                while job_restart_count < conf['runner']['max-retries']:
                    try:
                        if execution_role_arn is None:
                            with bert_datasource.ENVVars(conf['runner']['environment']):
                                conf['job']()
                                while job_work_queue.size() > 0:
                                    conf['job']()
                        else:
                            with bert_aws.assume_role(execution_role_arn):
                                with bert_datasource.ENVVars(conf['runner']['environment']):
                                    conf['job']()
                                    while job_work_queue.size() > 0:
                                        conf['job']()

                    except Exception as err:
                        if LOG_ERROR_ONLY:
                            logger.exception(err)

                        else:
                            raise err
                    else:
                        break

                    job_restart_count += 1

                else:
                    logger.exception(f'Job[{conf["job"].func_space}] failed {job_restart_count} times')

            for idx in range(0, conf['job'].workers):
                proc: multiprocessing.Process = multiprocessing.Process(target=_job_runner, args=())
                proc.daemon = True
                proc.start()
                processes.append(proc)

            else:
                while not STOP_DAEMON and any([proc.is_alive() for proc in processes]):
                    time.sleep(bert_constants.DELAY)
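
run_jobs and _job_runner lean heavily on Python's for/else and while/else: the else suite runs only when the loop finishes without hitting break, which is how the retry loop above distinguishes "job eventually succeeded" (break) from "job exhausted max-retries" (else). A minimal sketch of that retry shape, with a hypothetical flaky_job standing in for conf['job'] and MAX_RETRIES standing in for conf['runner']['max-retries']:

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

MAX_RETRIES = 3
attempts = {'count': 0}

def flaky_job() -> None:
    # Stand-in for conf['job'](); fails twice, then succeeds
    attempts['count'] += 1
    if attempts['count'] < 3:
        raise RuntimeError('transient failure')

job_restart_count = 0
while job_restart_count < MAX_RETRIES:
    try:
        flaky_job()
    except Exception as err:
        logger.exception(err)
    else:
        break  # success: the while/else suite below is skipped

    job_restart_count += 1
else:
    # Runs only if the loop ended without break, i.e. every retry failed
    logger.error(f'Job failed {job_restart_count} times')
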
Code Example #4
File: jobs.py Project: spacetelescope/github-metrics
def generate_stats():
    import hashlib
    import json
    import os
    import sync_utils

    from datetime import timedelta, datetime
    from collectGithubData import reducers

    download_details = reducers.obtain_channel_download_details()
    work_queue, done_queue, ologger = utils.comm_binders(generate_stats)
    repo_list = []
    entry_keys = []
    for details in work_queue:
        org = details['org_name']
        name = details['repo_name']
        repo_list.append([org, name])
        ologger.info(f'Loading Details for Github Repo:{org}/{name}')
        with details['etl']['repo'] as etl_dataset_reader:
            try:
                repo_data = [entry for entry in etl_dataset_reader][0]
            except IndexError:
                ologger.error(f'Unable to load entry[{org}-{name}]')
                continue

        with details['etl']['commits'] as etl_dataset_reader:
            commits = [entry for entry in etl_dataset_reader]
        with details['etl']['issues'] as etl_dataset_reader:
            issues = [entry for entry in etl_dataset_reader]
        with details['etl']['pull-requests'] as etl_dataset_reader:
            pull_requests = [entry for entry in etl_dataset_reader]
        with details['etl']['releases'] as etl_dataset_reader:
            releases = [entry for entry in etl_dataset_reader]
        with details['etl']['collaborators'] as etl_dataset_reader:
            collaborators = [entry for entry in etl_dataset_reader]
        with details['etl']['tags'] as etl_dataset_reader:
            tags = [entry for entry in etl_dataset_reader]

        dataset_template = {
            'url':
            f'https://github.com/{org}/{name}/',
            'package_name':
            name,
            'org':
            org,
            'description':
            repo_data['description'],
            'authors_commit':
            reducers.find_commit_authors(commits),
            'authors_issue':
            reducers.find_issue_authors(issues),
            'authors_pull_request':
            reducers.find_pull_request_authors(pull_requests),
            'authors_release':
            reducers.find_release_authors(releases),
            'authors_tag':
            reducers.find_tag_authors(tags),
            'archived':
            repo_data['archived'],
            'date_earliest':
            reducers.find_earliest_date(issues, pull_requests, commits,
                                        releases, tags),
            'date_latest':
            reducers.find_latest_date(issues, pull_requests, commits, releases,
                                      tags),
            'release_authors':
            reducers.find_release_authors(releases),
            'release_latest_notes':
            reducers.find_latest_release_notes(commits, releases, tags),
            'release_latest_author':
            reducers.find_latest_release_author(commits, releases, tags),
            'license':
            repo_data['license']['name'] if repo_data['license'] else None,
            'is_private':
            repo_data['private'],
            'count_forks':
            repo_data['forks'],
            'count_watchers':
            repo_data['watchers'],
            'count_issues_open':
            len([issue for issue in issues if issue['state'] == 'open']),
            'count_issues_closed':
            len([issue for issue in issues if issue['state'] == 'closed']),
            'count_issues_total':
            len(issues),
            'count_pull_requests_open':
            len([pr for pr in pull_requests if pr['state'] == 'open']),
            'count_pull_requests_closed':
            len([pr for pr in pull_requests if pr['state'] == 'closed']),
            'count_pull_requests_total':
            len(pull_requests),
            'count_commits_total':
            len(commits),
            'count_releases_total':
            len(releases),
            'count_tags_total':
            len(tags),
            'pull_requests_open_url':
            f'https://github.com/{org}/{name}/pulls?q=is%3Apr+is%3Aopen',
            'pull_requests_closed_url':
            f'https://github.com/{org}/{name}/pulls?q=is%3Apr+is%3Aclosed',
            'issues_open_url':
            f'https://github.com/{org}/{name}/issues?q=is%3Aissue+is%3Aopen',
            'issues_closed_url':
            f'https://github.com/{org}/{name}/issues?q=is%3Aissue+is%3Aclosed',
            'badges': [],
            'download_channel': {},
        }
        for badge_name, badge_links in reducers.find_badge_locations(
                org, name).items():
            dataset_template['badges'].append({
                'name': badge_name,
                'src': badge_links['src'],
                'anchor': badge_links['anchor'],
            })

        entry_key = f'{org}-{name}'.lower()
        entry_keys.append(entry_key)
        entry_download_details = download_details.get(entry_key, None)
        if entry_download_details is None:
            ologger.info(
                f'Unable to pull Download Details for Repo[{org}-{name}]')

        else:
            for channel in entry_download_details.get('channels', []):
                dataset_downloads_channel_details = dataset_template[
                    'download_channel'].get(channel, {
                        'count': 0,
                        'homes': [],
                    })
                if not entry_download_details[
                        'home'] in dataset_downloads_channel_details['homes']:
                    dataset_downloads_channel_details['homes'].append(
                        entry_download_details['home'])
                    dataset_downloads_channel_details[
                        'count'] += entry_download_details['count']

                else:
                    import pdb
                    pdb.set_trace()
                    import sys
                    sys.exit(1)

                dataset_template['download_channel'][
                    channel] = dataset_downloads_channel_details

        stats_s3_key = f'stsci-tools/stats/{org}/{name}.json'
        sync_utils.upload_dataset(dataset_template, stats_s3_key,
                                  os.environ['DATASET_BUCKET'])
        # Upload base-repo information

        COMMIT_DATE_FORMAT: str = '%Y-%m-%dT%H:%M:%SZ'
        date_interval = timedelta(days=1)
        date_earliest = datetime.strptime(dataset_template['date_earliest'],
                                          COMMIT_DATE_FORMAT)
        date_latest = datetime.strptime(dataset_template['date_latest'],
                                        COMMIT_DATE_FORMAT)
        date_range = date_latest - date_earliest
        window_stat_collection = []
        for window_index in range(0, date_range.days):
            window_start = date_earliest + timedelta(days=window_index)
            window_stop = date_earliest + timedelta(days=window_index + 1)
            window_stats = {
                'window_start': window_start.strftime(COMMIT_DATE_FORMAT),
                'window_stop': window_stop.strftime(COMMIT_DATE_FORMAT),
                'count_commits': 0,
                'count_issues_open': 0,
                'count_issues_closed': 0,
                'count_pull_requests_open': 0,
                'count_pull_requests_closed': 0,
                'count_releases': 0,
                'count_tags': 0,
            }
            for commit in commits:
                commit_date = datetime.strptime(
                    commit['commit']['author']['date'], COMMIT_DATE_FORMAT)
                if commit_date >= window_start and commit_date < window_stop:
                    window_stats[
                        'count_commits'] = window_stats['count_commits'] + 1

            for issue in issues:
                issue_created = datetime.strptime(issue['created_at'],
                                                  COMMIT_DATE_FORMAT)
                issue_updated = datetime.strptime(issue['updated_at'],
                                                  COMMIT_DATE_FORMAT)
                if (issue_created >= window_start and issue_created < window_stop) \
                    or (issue_updated >= window_start and issue_updated < window_stop):
                    if issue['state'] == 'closed':
                        window_stats['count_issues_closed'] = window_stats[
                            'count_issues_closed'] + 1

                    elif issue['state'] == 'open':
                        window_stats['count_issues_open'] = window_stats[
                            'count_issues_open'] + 1

                    else:
                        raise NotImplementedError(
                            f'Issue State: {issue["state"]}')

            for pr in pull_requests:
                pr_created = datetime.strptime(pr['created_at'],
                                               COMMIT_DATE_FORMAT)
                pr_updated = datetime.strptime(pr['updated_at'],
                                               COMMIT_DATE_FORMAT)
                if (pr_created >= window_start and pr_created < window_stop) \
                    or (pr_updated >= window_start and pr_updated < window_stop):
                    if pr['state'] == 'closed':
                        window_stats[
                            'count_pull_requests_closed'] = window_stats[
                                'count_pull_requests_closed'] + 1

                    elif pr['state'] == 'open':
                        window_stats[
                            'count_pull_requests_open'] = window_stats[
                                'count_pull_requests_open'] + 1

                    else:
                        raise NotImplementedError(f'PR State: {pr["state"]}')

            for release in releases:
                release_created = datetime.strptime(release['created_at'],
                                                    COMMIT_DATE_FORMAT)
                release_published = datetime.strptime(release['published_at'],
                                                      COMMIT_DATE_FORMAT)
                if release_published >= window_start and release_published <= window_stop:
                    window_stats[
                        'count_releases'] = window_stats['count_releases'] + 1

            for tag in tags:
                # Update Extractor to pull tag-dates
                pass

            window_stat_collection.append(window_stats)

        window_stat_collection_s3_key = f'stsci-tools/window-stats/{org}/{name}.json'
        sync_utils.upload_dataset(window_stat_collection,
                                  window_stat_collection_s3_key,
                                  os.environ['DATASET_BUCKET'])
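
generate_stats merges the per-channel download details into dataset_template['download_channel'], keeping one count and a list of home URLs per channel. A stripped-down sketch of that merge; the shape of entry_download_details is an assumption based on how the code above reads it, not a documented schema:

import typing

# Hypothetical shape of one repo's entry from reducers.obtain_channel_download_details()
entry_download_details = {
    'channels': ['astroconda', 'astroconda-etc'],
    'home': 'https://github.com/spacetelescope/demo',
    'count': 42,
}

download_channel: typing.Dict[str, typing.Dict[str, typing.Any]] = {}
for channel in entry_download_details.get('channels', []):
    channel_details = download_channel.get(channel, {'count': 0, 'homes': []})
    if entry_download_details['home'] not in channel_details['homes']:
        channel_details['homes'].append(entry_download_details['home'])
        channel_details['count'] += entry_download_details['count']
    download_channel[channel] = channel_details

print(download_channel)
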
Code Example #5
File: jobs.py Project: jbcurtin/bert-simple
def second_aws_lambda_function():
    work_queue, done_queue, ologger = utils.comm_binders(
        second_aws_lambda_function)
    for details in work_queue:
        idx = details['idx']
        ologger.info(f'IDX: {idx}')
Code Example #6
File: jobs.py Project: jbcurtin/bert-etl-testing
def init_job_queue() -> None:
    work_queue, done_queue, ologger = utils.comm_binders(init_job_queue)
    # for idx in range(0, 500):
    for idx in range(0, 10):
        done_queue.put({'idx': idx})
Code Example #7
File: jobs.py Project: jbcurtin/bert-etl-testing
def handle_job_queue() -> None:
    work_queue, done_queue, ologger = utils.comm_binders(handle_job_queue)
    for details in work_queue:
        done_queue.put(details)
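
Code Examples #6 and #7 show the typical chaining pattern: whatever init_job_queue puts on its done_queue becomes the work_queue that handle_job_queue iterates. A rough local simulation of that hand-off with plain Python queues, purely illustrative and not the backend comm_binders actually wires up:

import queue
import typing

# Hypothetical stand-in for the queue comm_binders would normally provide
init_done_queue: 'queue.Queue[typing.Dict[str, int]]' = queue.Queue()

def init_job_queue() -> None:
    for idx in range(0, 10):
        init_done_queue.put({'idx': idx})

def handle_job_queue(work_queue: 'queue.Queue[typing.Dict[str, int]]') -> None:
    handled = []
    while not work_queue.empty():
        handled.append(work_queue.get())
    print(f'handled {len(handled)} items')

init_job_queue()
# The done_queue of the first job is fed to the next job as its work_queue
handle_job_queue(init_done_queue)
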
Code Example #8
File: jobs.py Project: spacetelescope/github-metrics
def finalize_contents():
    import boto3
    import csv
    import json
    import os
    import tempfile
    import typing

    from collectMetrics import shortcuts

    s3_client = boto3.client('s3')
    work_queue, done_queue, ologger = utils.comm_binders(finalize_contents)
    outputs_dir: str = os.path.join('/tmp', 'outputs', 'finalize_contents')
    if not os.path.exists(outputs_dir):
        os.makedirs(outputs_dir)

    latest_dataset: typing.List[typing.Dict[str, typing.Any]] = []
    for details in work_queue:
        latest_dataset.append(details['dataset'])

    # import ipdb; ipdb.set_trace()
    # s3_keys: typing.List[str] = [item for item in {
    ascii_date: str = shortcuts.obtain_latest_ascii_date(outputs_dir)
    timeseries_filename: str = 'github-metrics.csv'
    timeseries_s3_key: str = f'timeseries/{timeseries_filename}'
    timeseries_filepath: str = os.path.join(outputs_dir, 'timeseries', timeseries_filename)
    timeseries_dir: str = os.path.dirname(timeseries_filepath)
    if not os.path.exists(timeseries_dir):
        os.makedirs(timeseries_dir)

    ologger.info(f'Writing dataset to file[{timeseries_filepath}]')
    with open(timeseries_filepath, 'w') as stream:
        writer = csv.DictWriter(stream, fieldnames=latest_dataset[0].keys())
        writer.writeheader()
        for entry in latest_dataset:
            writer.writerow(entry)

    if constants.DEBUG is False:
        ologger.info(f'Uploading file[{timeseries_filepath}] to s3 bucket[{os.environ["DATASET_BUCKET"]}] key[{timeseries_s3_key}]')
        s3_client.upload_file(timeseries_filepath, os.environ['DATASET_BUCKET'], timeseries_s3_key, ExtraArgs={'ACL': 'public-read'})

    # Last Week Stats
    for owner, package_name, stats in shortcuts.find_last_week_stats(latest_dataset):
        filepath = tempfile.NamedTemporaryFile().name
        s3_key = f'timeseries/last-week-stats/{owner}/{package_name}.json'
        with open(filepath, 'wb') as stream:
            stream.write(json.dumps(stats).encode(constants.ENCODING))

        if constants.DEBUG is False:
            ologger.info(f'Uploading Stats for Owner[{owner}] Package[{package_name}]')
            s3_client.upload_file(filepath, os.environ['DATASET_BUCKET'], s3_key, ExtraArgs={'ACL': 'public-read'})

    # Last Week Entries
    last_week_entries_filepath = tempfile.NamedTemporaryFile().name
    last_week_entries_s3_key = 'timeseries/last-week-entries.json'
    ologger.info(f'Writing Last Week Entries to file[{last_week_entries_filepath}]')
    last_week_entries = shortcuts.last_week_entries(latest_dataset)
    with open(last_week_entries_filepath, 'w') as stream:
        stream.write(json.dumps(last_week_entries))

    if constants.DEBUG is False:
        ologger.info(f'Uploading Last Week Entries to S3Key[{last_week_entries_s3_key}]')
        s3_client.upload_file(last_week_entries_filepath, os.environ['DATASET_BUCKET'], last_week_entries_s3_key, ExtraArgs={'ACL': 'public-read'})


    latest_index_filename: str = 'latest.json'
    latest_index_filepath: str = os.path.join(outputs_dir, latest_index_filename)
    latest_index_s3_key: str = f'timeseries/{latest_index_filename}'
    ologger.info(f'Building Latest Index for date[{ascii_date}]')
    unique_dataset: typing.List[typing.Dict[str, typing.Any]] = []
    unique_dataset_index: typing.List[str] = []
    for entry in latest_dataset:
        entry_key: str = f'{entry["owner"]}-{entry["package_name"]}'
        if not entry_key in unique_dataset_index:
            del entry['pull_requests_opened_weekly']
            del entry['pull_requests_closed_weekly']
            del entry['issues_opened_weekly']
            del entry['issues_closed_weekly']
            del entry['commits_weekly']

            unique_dataset.append(entry)
            unique_dataset_index.append(entry_key)

    with open(latest_index_filepath, 'w') as stream:
        stream.write(json.dumps(unique_dataset))

    if constants.DEBUG is False:
        ologger.info(f'Uploading file[{latest_index_filepath}] to s3 bucket[{os.environ["DATASET_BUCKET"]}] key[{latest_index_s3_key}]')
        s3_client.upload_file(latest_index_filepath, os.environ['DATASET_BUCKET'], latest_index_s3_key, ExtraArgs={'ACL':'public-read'})
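
finalize_contents writes the accumulated rows with csv.DictWriter and then builds a de-duplicated "latest" index keyed on owner plus package name. A compact sketch of those two steps with made-up rows (the field names are a reduced subset of the real dataset):

import csv
import typing

# Hypothetical rows shaped loosely like the dataset entries above
latest_dataset: typing.List[typing.Dict[str, typing.Any]] = [
    {'owner': 'spacetelescope', 'package_name': 'demo', 'forks': 3},
    {'owner': 'spacetelescope', 'package_name': 'demo', 'forks': 3},
    {'owner': 'spacetelescope', 'package_name': 'other', 'forks': 1},
]

# Write every row to CSV, using the first row's keys as the header
with open('/tmp/github-metrics-example.csv', 'w', newline='') as stream:
    writer = csv.DictWriter(stream, fieldnames=latest_dataset[0].keys())
    writer.writeheader()
    for entry in latest_dataset:
        writer.writerow(entry)

# Keep only the first entry per owner/package pair
unique_dataset: typing.List[typing.Dict[str, typing.Any]] = []
unique_dataset_index: typing.List[str] = []
for entry in latest_dataset:
    entry_key = f'{entry["owner"]}-{entry["package_name"]}'
    if entry_key not in unique_dataset_index:
        unique_dataset.append(entry)
        unique_dataset_index.append(entry_key)

print(len(unique_dataset))  # 2
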
Code Example #9
File: jobs.py Project: spacetelescope/github-metrics
def mine_github_repo():
    work_queue, done_queue, ologger = utils.comm_binders(mine_github_repo)

    import boto3
    import lzma
    import os
    import json
    import requests
    import time

    from bert import aws
    from datetime import datetime

    from collectMetrics import shortcuts

    from requests.auth import HTTPBasicAuth

    from urllib.parse import urlencode


    output_dir: str = os.path.join('/tmp', 'outputs')
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    config = {}
    with aws.kms('bert-etl') as keymaster:
        config['username'] = keymaster.decrypt(os.environ['GITHUB_USERNAME'])
        config['password'] = keymaster.decrypt(os.environ['GITHUB_PASSWORD'])

    config['delay'] = 1
    config['base-url'] = 'https://api.github.com'
    config['headers'] = {
        'User-Agent': 'repostats-tool',
        'Accept': 'application/vnd.github.v3+json'
    }
    config['logger'] = ologger
    for details in work_queue:
        url: str = f'{config["base-url"]}/repos/{details["org_name"]}/{details["repo_name"]}'
        response = requests.get(url, auth=HTTPBasicAuth(config['username'], config['password']), headers=config['headers'])
        time.sleep(config['delay'])

        views = shortcuts.mine_repo_attribute(details['org_name'], details['repo_name'], 'traffic/views', {}, config)
        time.sleep(config['delay'])

        clones = shortcuts.mine_repo_attribute(details['org_name'], details['repo_name'], 'traffic/clones', {}, config)
        time.sleep(config['delay'])

        issues = shortcuts.mine_repo_attribute(details['org_name'], details['repo_name'], 'issues', {}, config)
        time.sleep(config['delay'])
        
        releases = shortcuts.mine_repo_attribute(details['org_name'], details['repo_name'], 'releases', {}, config)
        time.sleep(config['delay'])
        
        pull_requests = shortcuts.mine_repo_attribute(details['org_name'], details['repo_name'], 'pulls', {}, config)
        time.sleep(config['delay'])
        
        contributors = shortcuts.mine_repo_attribute(details['org_name'], details['repo_name'], 'contributors', {'anon': 'true'}, config)
        time.sleep(config['delay'])
        
        commits = shortcuts.mine_repo_attribute(details['org_name'], details['repo_name'], 'commits', {}, config)
        time.sleep(config['delay'])
        
        tags = shortcuts.mine_repo_attribute(details['org_name'], details['repo_name'], 'tags', {}, config)
        time.sleep(config['delay'])
        
        contents = shortcuts.mine_repo_attribute(details['org_name'], details['repo_name'], 'contents', {}, config)
        time.sleep(config['delay'])
        
        date: str = datetime.utcnow().strftime('%Y-%m-%d')
        filename: str = f'{details["org_name"]}-{details["repo_name"]}.json.xz'
        filepath: str = os.path.join(output_dir, date, filename)
        dir_path: str = os.path.dirname(filepath)
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)

        with lzma.open(filepath, mode='w', format=lzma.FORMAT_XZ) as stream:
            data = json.dumps({
                'base': response.json(),
                'views': views,
                'clones': clones,
                'issues': issues,
                'releases': releases,
                'pull_requests': pull_requests,
                'contributors': contributors,
                'commits': commits,
                'tags': tags,
                'contents': contents,
            }).encode('utf-8')
            stream.write(data)

        s3_key: str = f'daily/{date}/{filename}'
        ologger.info(f'Saving Timeseries Data for Repo[{details["repo_name"]}] to S3 Key[{s3_key}]')
        s3_client = boto3.client('s3')
        s3_client.upload_file(filepath, os.environ['DATASET_BUCKET'], s3_key)
        done_queue.put({
            'key': s3_key,
            'filename': os.path.basename(filepath),
        })
        os.remove(filepath)

    else:
        outputs_dir: str = os.path.join('/tmp', 'outputs')
        latest_s3_key: str = f'cache/latest-date.txt'
        latest_filepath: str = os.path.join(outputs_dir, latest_s3_key)
        outputs_dir_path: str = os.path.dirname(latest_filepath)
        if not os.path.exists(outputs_dir_path):
            os.makedirs(outputs_dir_path)

        with open(latest_filepath, 'w') as stream:
            stream.write(date)

        s3_client.upload_file(latest_filepath, os.environ['DATASET_BUCKET'], latest_s3_key)
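
mine_github_repo serializes the collected GitHub payloads as JSON, compresses them with lzma (.xz), and uploads the result; process_s3_bucket_contents in Code Example #1 later decompresses the same files. A small standard-library round-trip sketch of that format, with a hypothetical payload:

import json
import lzma
import os

payload = {'base': {'name': 'example-repo'}, 'commits': []}  # hypothetical payload
filepath = '/tmp/example-repo.json.xz'

# Compress the JSON document the way the job above does
with lzma.open(filepath, mode='w', format=lzma.FORMAT_XZ) as stream:
    stream.write(json.dumps(payload).encode('utf-8'))

# Read it back the way process_s3_bucket_contents does
with lzma.open(filepath, 'r', format=lzma.FORMAT_XZ) as stream:
    data = json.loads(stream.read())

assert data == payload
os.remove(filepath)
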
Code Example #10
def parse_log_file():
    import boto3
    import gzip
    import json
    import os
    import requests
    import sync_utils
    import tempfile
    import typing

    from datetime import datetime
    from urllib.parse import urlparse
    work_queue, done_queue, ologger = utils.comm_binders(parse_log_file)

    ENCODING = 'utf-8'
    ASTROCONDA_DOWNLOADS = {}
    ASTROCONDA_CHANNEL_DATA = _load_channel_data('astroconda')
    ASTROCONDA_ETC_DOWNLOADS = {}
    ASTROCONDA_ETC_CHANNEL_DATA = _load_channel_data('astroconda-etc')
    # ASTROCONDA_TOMB_DOWNLOADS = {}
    # ASTROCONDA_TOMB_CHANNEL_DATA = _load_channel_data('astroconda-tomb')
    CONDA_DEV_DOWNLOADS = {}

    # CONDA_DEV_CHANNEL_DATA = _load_channel_data('conda-dev')

    def _parse_line(line: str) -> typing.Dict[str, typing.Any]:
        clone = list(line)
        line_parts = []
        char_arr = []
        CONTAINERS = [('[', ']'), ('"', '"')]
        BREAK_CHAR = [' ']
        inside_container = False
        while True:
            try:
                char = clone.pop(0)
            except IndexError:
                break

            else:
                if inside_container is False and not char in BREAK_CHAR and not char in [
                        c[0] for c in CONTAINERS
                ]:
                    char_arr.append(char)

                elif inside_container is False and char in [
                        c[0] for c in CONTAINERS
                ]:
                    inside_container = True
                    continue

                elif inside_container is True and char in [
                        c[1] for c in CONTAINERS
                ]:
                    inside_container = False
                    line_parts.append(''.join(char_arr))
                    char_arr = []
                    continue

                elif inside_container is False and char in BREAK_CHAR:
                    line_parts.append(''.join(char_arr))
                    char_arr = []

                else:
                    char_arr.append(char)

        try:
            method, path, protocol = line_parts[5].split(' ')
        except ValueError:
            return None

        else:
            # https://developer.mozilla.org/en-US/docs/Web/HTTP/Methods
            if not method.lower() in [
                    'get', 'post', 'delete', 'head', 'put', 'connect',
                    'options', 'trace', 'patch'
            ]:
                return None

            path = urlparse(path).path

        try:
            return {
                'ipaddress':
                line_parts[0],
                'timestamp':
                datetime.strptime(line_parts[3], '%d/%b/%Y:%H:%M:%S %z'),
                'method':
                method,
                'path':
                path,
                'protocol':
                protocol,
                'status_code':
                int(line_parts[7]),
                'byte_length':
                line_parts[8],
                'user_agent':
                line_parts[11]
            }
        except (IndexError, ValueError):
            ologger.exception(f'Unable to parse Line[{line}]')
            return None

    def parse_logfile_stream(stream: typing.BinaryIO) -> None:
        for ldx, line in enumerate(stream.readlines()):
            try:
                line = line.decode(ENCODING)
            except UnicodeDecodeError:
                # Skip lines that cannot be decoded
                continue

            result = _parse_line(line)
            if result is None:
                continue

            package_name = result['path'].rsplit('/', 1)[-1]
            if package_name.startswith('index.'):
                continue

            if result['path'].endswith('.json'):
                continue

            if result['path'].endswith('json.bz2'):
                continue

            if result['path'].endswith('/'):
                continue

            if result['path'].startswith('/astroconda-staging/'):
                continue

            if result['path'].startswith('/astroconda-staging-d/'):
                continue

            if result['path'].startswith('/astroconda-tomb/'):
                continue

            if result['method'].lower() == 'get' and \
                any([
                    'linux-64' in result['path'],
                    'osx-64' in result['path']
                ]) and \
                result['status_code'] == 200:

                if result['path'].startswith('/astroconda-etc/'):
                    home = ASTROCONDA_ETC_CHANNEL_DATA[package_name].get(
                        'repo_home', None)
                    if isinstance(home, list):
                        import pdb
                        pdb.set_trace()
                        pass

                    if home is None:
                        ologger.info(
                            f'Home not found for Package[{package_name}]')
                        continue

                    entry = ASTROCONDA_ETC_DOWNLOADS.get(
                        package_name, {
                            'count': 0,
                            'home': home
                        })
                    entry['count'] += 1
                    ASTROCONDA_ETC_DOWNLOADS[package_name] = entry

                # elif result['path'].startswith('/astroconda-tomb/'):
                #     home = ASTROCONDA_TOMB_CHANNEL_DATA[package_name].get('linux-64', {}).get('repo_home', None) or \
                #             ASTROCONDA_TOMB_CHANNEL_DATA[package_name].get('osx-64', {}).get('repo_home', None) or \
                #             ASTROCONDA_TOMB_CHANNEL_DATA[package_name].get('win-64', {}).get('repo_home', None)

                #     entry = ASTROCONDA_TOMB_DOWNLOADS.get(package_name, {
                #         'count': 0,
                #         'home': home
                #     })
                #     entry['count'] += 1
                #     ASTROCONDA_TOMB_DOWNLOADS[package_name] = entry

                elif result['path'].startswith('/astroconda/'):
                    home = ASTROCONDA_CHANNEL_DATA[package_name].get(
                        'repo_home', None)

                    if isinstance(home, list):
                        import pdb
                        pdb.set_trace()
                        pass

                    if home is None:
                        ologger.info(
                            f'Home not found for Package[{package_name}]')
                        continue

                    entry = ASTROCONDA_DOWNLOADS.get(package_name, {
                        'count': 0,
                        'home': home
                    })
                    entry['count'] += 1
                    ASTROCONDA_DOWNLOADS[package_name] = entry

                elif result['path'].startswith('/conda-dev'):
                    count = CONDA_DEV_DOWNLOADS.get(package_name, 0) + 1
                    CONDA_DEV_DOWNLOADS[package_name] = count

                else:
                    raise NotImplementedError(result)

    for idx, details in enumerate(work_queue):
        ologger.info(f'Parsing Logfile[{details["filepath"]}]')
        if details['filepath'].endswith('.gz'):
            with gzip.open(details['filepath'], 'rb') as stream:
                parse_logfile_stream(stream)

        else:
            with open(details['filepath'], 'rb') as stream:
                parse_logfile_stream(stream)

    sync_utils.upload_downloads_dataset({
        'channels': {
            'astroconda-etc': ASTROCONDA_ETC_DOWNLOADS,
            'astroconda': ASTROCONDA_DOWNLOADS
        },
        'data': {
            'astroconda-etc': ASTROCONDA_ETC_CHANNEL_DATA,
            'astroconda': ASTROCONDA_CHANNEL_DATA,
        }
    })
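
_parse_line above walks each access-log line character by character to honour the bracketed timestamp and the quoted request and user-agent fields. For the common Apache/NGINX combined log format, a regex achieves the same split; the sketch below is an alternative take rather than the project's code, and the sample line is made up:

import re
from datetime import datetime

# One capture group per field: ip, identity, user, timestamp, request, status, bytes, referrer, user agent
COMBINED_LOG_PATTERN = re.compile(
    r'(\S+) (\S+) (\S+) \[(.*?)\] "(.*?)" (\d{3}) (\S+) "(.*?)" "(.*?)"')

sample = ('203.0.113.7 - - [12/Nov/2019:10:15:32 +0000] '
          '"GET /astroconda/linux-64/astropy-4.0.tar.bz2 HTTP/1.1" 200 1024 '
          '"-" "conda/4.7.12"')

match = COMBINED_LOG_PATTERN.match(sample)
if match is not None:
    ipaddress, _, _, raw_timestamp, request, status_code, byte_length, _, user_agent = match.groups()
    method, path, protocol = request.split(' ')
    parsed = {
        'ipaddress': ipaddress,
        'timestamp': datetime.strptime(raw_timestamp, '%d/%b/%Y:%H:%M:%S %z'),
        'method': method,
        'path': path,
        'protocol': protocol,
        'status_code': int(status_code),
        'byte_length': byte_length,
        'user_agent': user_agent,
    }
    print(parsed['method'], parsed['path'], parsed['status_code'])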