Example 1
def clean_db():
    """Clean stuff."""
    logger.info("Removing commits from fork repos.")
    session = new_session()
    try:
        all_repositories = session.query(Repository) \
            .filter(Repository.fork.is_(True)) \
            .filter(Repository.commits != None) \
            .options(joinedload(Repository.commits)) \
            .all()

        logger.info(f'Found {len(all_repositories)} fork repositories with commits.')
        repositories_count = 0
        for repository in all_repositories:
            repository.commits = []
            session.add(repository)
            repositories_count += 1
            if repositories_count % 100 == 0:
                logger.info(f'Removed commits from {repositories_count} repositories.')
                session.commit()

        logger.info("Remove unattached commits")
        session.query(Commit) \
            .filter(Commit.repositories == None) \
            .delete()
        session.commit()
    finally:
        session.close()
Example 2
def complete_repos():
    """Complete unfinished repsitories."""
    logger.info("Get unfinished or out of date repositories.")
    rescan_interval = int(config['aggregator']['repository_rescan_interval'])
    rescan_threshold = datetime.utcnow() - timedelta(seconds=rescan_interval)
    session = new_session()

    repos = session.query(Repository) \
        .filter(Repository.fork.is_(False)) \
        .filter(Repository.broken.is_(False)) \
        .filter(Repository.too_big.is_(False)) \
        .filter(or_(
            Repository.completely_scanned.is_(False),
            Repository.updated_at <= rescan_threshold,
        )) \
        .all()
    logger.info(f'Found {len(repos)} repositories to scan.')

    full_names = [r.full_name for r in repos]
    repos_to_scan = set(full_names)

    manager = Manager('github_repository', repos_to_scan)
    manager.start()
    manager.run()

    session.close()
Example 3
def get_user_data(user_data: tuple):
    """Get all missing data from a user."""
    try:
        contributor = user_data[0]
        login = contributor.login

        session = new_session()
        contributor = Contributor.get_contributor(login, session, True)

        user = call_github_function(github.github, 'get_user', [login])
        if user.location:
            contributor.location = user.location

        session.add(contributor)
        session.commit()
        response = {'message': f'Scanned user {login}'}

    except GithubException as e:
        # Forbidden or not found (just made private or deleted)
        if e.status == 404:
            response = {'message': f'User {login} not found.'}
        else:
            sentry.captureException()
            response = {
                'message': f'Error while getting user {login}:\n',
                'error': traceback.format_exc(),
            }

    except BaseException as e:
        # Catch any exception and print it, as we won't get any information due to threading otherwise.
        sentry.captureException()
        response = {
            'message': f'Error while getting repos for {login}:\n',
            'error': traceback.format_exc(),
        }
    finally:
        session.close()

    return response
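call_github_function appears throughout these examples as the entry point for every GitHub API call, taking the target object, the method name, and optional positional and keyword arguments. Its implementation is not part of this listing; the sketch below shows one plausible shape for such a wrapper, assuming it simply resolves the method by name and retries when GitHub answers with a rate/abuse-limit error (the retry policy is an assumption, not the project's actual behaviour):

from time import sleep

from github import GithubException


def call_github_function(obj, function_name, args=None, kwargs=None, retries=3):
    """Call a PyGithub method by name, retrying on rate-limit responses."""
    args = args or []
    kwargs = kwargs or {}
    for attempt in range(retries):
        try:
            return getattr(obj, function_name)(*args, **kwargs)
        except GithubException as e:
            # GitHub signals rate/abuse limiting with a 403 status.
            if e.status == 403 and attempt < retries - 1:
                sleep(60)
                continue
            raise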
Example 4
def get_organization_memberships():
    """Refresh all user organizations."""
    session = new_session()

    tz = pytz.timezone('Europe/Berlin')
    now = datetime.now(tz)
    contributors = session.query(Contributor).all()
    for contributor in contributors:
        if contributor.last_full_scan and contributor.last_full_scan > now - timedelta(
                days=2):
            continue
        logger.info(
            f'Checking {contributor.login}. {github.github.rate_limiting[0]} remaining.'
        )

        github_user = call_github_function(github.github, 'get_user',
                                           [contributor.login])

        github_orgs = call_github_function(github_user, 'get_orgs')
        for org in github_orgs:
            organization = Organization.get_organization(
                org.login, org.url, session)
            contributor.organizations.append(organization)
        contributor.last_full_scan = datetime.utcnow()
        session.add(contributor)
        session.commit()
Example 5
def user(login):
    """Plot all graphs for a specific github user."""
    try:
        session = new_session()
        plot_user(login, session)
    except KeyboardInterrupt:
        logger.info("CTRL-C Exiting Gracefully")
        sys.exit(1)
Example 6
def user_for_repositories(login, repositories):
    """Get statistics of an user for specific repositories."""
    try:
        session = new_session()
        plot_employee(login, repositories, session)
    except KeyboardInterrupt:
        logger.info("CTRL-C Exiting Gracefully")
        sys.exit(1)
Example 7
def comparison(users, repos):
    """Get statistics of several user for specific repositories."""
    try:
        if not users or not repos:
            logger.info("Users and Repos are required parameters.")
            return
        session = new_session()
        plot_comparison(users, repos, session)
    except KeyboardInterrupt:
        logger.info("CTRL-C Exiting Gracefully")
        sys.exit(1)
Example 8
def get_punchcard_data(contributors_commits):
    """Analyse the travel path of a few contributers."""
    try:
        session = new_session()
        for contributor, commit_hashes in contributors_commits:
            # Query result again with current session.
            contributor = session.query(Contributor).get(contributor.login)
            result = contributor.analysis_result

            if result is None:
                result = AnalysisResult()
                contributor.analysis_result = result

            if result.intermediate_results is None:
                result.intermediate_results = {}

            commits_changed = (len(commit_hashes) != result.commit_count)
            if 'punchcard' not in result.intermediate_results or commits_changed:
                # Deepcopy intermediate result, otherwise the jsonb won't refresh.
                new_intermediate = deepcopy(result.intermediate_results)
                commits = session.query(Commit) \
                    .filter(Commit.sha.in_(commit_hashes)) \
                    .all()

                # Compute the final punchcard evaluation
                plotter = CommitPunchcard(commits, '/', '')
                plotter.preprocess()

                # Standardize data
                df = plotter.data
                mean = df['count'].mean()
                df['count'] = df['count'] / mean

                # Add data to list
                vector = df['count'].values.tolist()

                # Save the standardized intermediate result into the database
                new_intermediate['punchcard'] = vector
                result.intermediate_results = new_intermediate
                result.last_change = datetime.now()
                result.commit_count = len(commits)

            session.add(result)
            session.add(contributor)
            session.commit()

    finally:
        session.close()

    return {'message': 'Success'}
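The deepcopy-then-reassign step above matters because plain SQLAlchemy JSON/JSONB columns only register a change when the attribute is bound to a new object; mutating the existing dict in place leaves the column untouched on commit. A minimal sketch of the pattern (the helper name is illustrative, not part of the project):

from copy import deepcopy


def update_intermediate(result, key, value):
    """Store a value in a JSONB column so SQLAlchemy notices the change."""
    # In-place mutation would not mark the attribute as dirty;
    # reassigning a fresh dict does.
    updated = deepcopy(result.intermediate_results or {})
    updated[key] = value
    result.intermediate_results = updated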
Example 9
def update_contributors(update_all):
    """Complete contributors."""
    session = new_session()
    logger.info(f'Start Scan.')

    # Look at the last two years
    time_span = datetime.now() - timedelta(days=2 * 365)

    results = session.query(Contributor, func.array_agg(Commit.sha)) \
        .filter(Contributor.login == Contributor.login) \
        .join(Email, Contributor.login == Email.contributor_login) \
        .join(Commit, or_(
            Commit.author_email_address == Email.email,
            Commit.committer_email_address == Email.email,
        )) \
        .filter(Commit.commit_time >= time_span) \
        .filter(or_(
            Contributor.location == None,
        )) \
        .group_by(Contributor.login) \
        .all()

    logger.info(f'Found {len(results)} contributors.')

    if update_all:
        contributors_to_scan = results
        logger.info(f'Scanning {len(contributors_to_scan)} contributors')
    else:
        count = 0
        contributors_to_scan = []
        for contributor, commits in results:
            if len(commits) > 100 and len(commits) < 20000:
                contributors_to_scan.append((contributor, commits))

            count += 1
            if count % 5000 == 0:
                logger.info(
                    f'Found {count} contributors ({len(contributors_to_scan)} big)'
                )

    manager = ListManager('github_user', contributors_to_scan)
    manager.start()
    manager.run()
Example 10
def get_github_organization(name: str, members=False):
    """Get all collaborators of an organization."""
    session = new_session()
    orga = call_github_function(github.github, 'get_organization', [name])

    # Get orga repos
    orga_repos = call_github_function(orga, 'get_repos')
    while orga_repos._couldGrow():
        call_github_function(orga_repos, '_grow')

    # Check orga repos
    repos_to_scan = set()
    for github_repo in orga_repos:
        repository = Repository.get_or_create(
            session,
            github_repo.ssh_url,
            name=github_repo.name,
            full_name=github_repo.full_name,
        )
        if github_repo.fork:
            check_fork(github_repo, session, repository, repos_to_scan)
        session.add(repository)

        if not repository.should_scan():
            continue

        session.commit()
        repos_to_scan.add(github_repo.full_name)

    member_list = set()
    if members:
        # Get members
        members = call_github_function(orga, 'get_members')
        while members._couldGrow():
            call_github_function(members, '_grow')
        member_list = {m.login for m in members}

    # Create and start manager with orga repos and member_list
    sub_manager = Manager('github_repository', repos_to_scan)
    manager = Manager('github_contributor', member_list, sub_manager)
    manager.start()
    manager.run()
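The `while ..._couldGrow(): call_github_function(..., '_grow')` loops in this and the following examples force PyGithub's lazy PaginatedList to fetch every page up front via its private methods, presumably so that the call_github_function wrapper can handle rate limiting on each page request. Using only the public API, the same prefetch collapses to a plain iteration; a simplified sketch that gives up that per-page control:

def prefetch_all(paginated_list):
    """Materialize a lazy PyGithub PaginatedList by iterating it once."""
    return list(paginated_list)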
Example 11
def get_user_with_followers(name: str):
    """Get all relevant Information about all friends of a specific user.."""
    user = call_github_function(github.github, 'get_user', [name])
    followers = call_github_function(user, 'get_followers')
    following = call_github_function(user, 'get_following')

    # Add all following and followed people into list
    # Deduplicate the list as we have to make as few API calls as possible.
    user_list = [user]
    for follower in followers:
        user_list.append(follower)
    for followed in following:
        if not any(x.login == followed.login for x in user_list):
            user_list.append(followed)

    user_logins = [u.login for u in user_list]
    sub_manager = Manager('github_repository', [])
    manager = Manager('github_contributor', user_logins, sub_manager)
    manager.start()
    manager.run()

    try:
        session = new_session()
        for login in user_logins:
            contributor = session.query(Contributor) \
                .filter(Contributor.login.ilike(login)) \
                .one()
            if not contributor.too_big:
                contributor.last_full_scan = datetime.utcnow()
                session.add(contributor)
        session.commit()
    finally:
        session.close()
Example 12
def get_user_repos(user_login: str, skip=True):
    """Get all relevant Information for a single user."""
    try:
        session = new_session()
        contributor = Contributor.get_contributor(user_login, session, True)
        # Checks for already scanned users.
        if not contributor.should_scan():
            return user_up_to_date_message(user_login)
        if contributor.too_big:
            return user_too_big_message(user_login)

        user = call_github_function(github.github, 'get_user', [user_login])
        owned = user.get_repos()
        starred = user.get_starred()
        repos_to_scan = set()

        # Prefetch all owned repositories
        user_too_big = False
        owned_repos = 0
        while owned._couldGrow() and not user_too_big:
            owned_repos += 1
            call_github_function(owned, '_grow')

            # Debug messages to see that the repositories are still collected.
            if owned_repos % 100 == 0:
                logger.info(
                    f'{owned_repos} owned repos for user {user_login}.')

            # The user is too big. Just drop him.
            if skip and owned_repos > int(
                    config['aggregator']['max_repositories_for_user']):
                user_too_big = True

        # Prefetch all starred repositories
        starred_repos = 0
        while starred._couldGrow() and not user_too_big:
            starred_repos += 1
            call_github_function(starred, '_grow')
            # Debug messages to see that the repositories are still collected.
            if starred_repos % 100 == 0:
                logger.info(
                    f'{starred_repos} starred repos for user {user_login}.')

            # The user is too big. Just drop him.
            if skip and starred_repos > int(
                    config['aggregator']['max_repositories_for_user']):
                user_too_big = True

        # User has too many repositories. Flag him and return
        if user_too_big:
            contributor.too_big = True
            sentry.captureMessage(
                'User too big',
                extra={'url': contributor.login},
                level='info',
                tags={
                    'type': 'too_big',
                    'entity': 'user'
                },
            )
            session.add(contributor)
            session.commit()
            return user_too_big_message(user_login)

        # Check own repositories. We assume that we are collaborating in those
        for github_repo in owned:
            repository = Repository.get_or_create(
                session,
                github_repo.ssh_url,
                name=github_repo.name,
                full_name=github_repo.full_name,
            )
            if github_repo.fork and not repository.is_invalid():
                check_fork(github_repo, session, repository, repos_to_scan,
                           user_login)
            session.add(repository)

            if not repository.should_scan():
                continue

            session.commit()
            repos_to_scan.add(github_repo.full_name)

        # Check stars and if the user collaborated to them.
        for github_repo in starred:
            repository = Repository.get_or_create(
                session,
                github_repo.ssh_url,
                name=github_repo.name,
                full_name=github_repo.full_name,
            )

            if github_repo.fork and not repository.is_invalid():
                check_fork(github_repo, session, repository, repos_to_scan,
                           user_login)
            session.add(repository)

            if not repository.should_scan():
                continue

            repos_to_scan.add(github_repo.full_name)

        session.commit()

        rate = github.github.get_rate_limit().core
        message = f'Got repositories for {user.login}. '
        message += f'{rate.remaining} of 5000 remaining.'
        response = {
            'message': message,
            'tasks': list(repos_to_scan),
        }
    except BaseException:
        # Catch any exception and print it, as we won't get any information due to threading otherwise.
        sentry.captureException()
        response = {
            'message': f'Error while getting repos for {user_login}:\n',
            'error': traceback.format_exc(),
        }
    finally:
        session.close()

    return response
Example 13
def analyse_punch_card(existing, method, eps=150, min_samples=5):
    """Analyze the efficiency of the missing time comparison."""
    session = new_session()
    logger.info(f'Start Scan.')

    # If the only_existing parameter is given, we only work with
    # the existing intermediate AnalysisResults.
    if not existing:
        # Only look at commits of the last year
        time_span = datetime.now() - timedelta(days=365)
        results = session.query(Contributor, func.array_agg(Commit.sha)) \
            .filter(Contributor.login == Contributor.login) \
            .join(Email, Contributor.login == Email.contributor_login) \
            .join(Commit, or_(
                Commit.author_email_address == Email.email,
                Commit.committer_email_address == Email.email,
            )) \
            .filter(Commit.commit_time >= time_span) \
            .group_by(Contributor.login) \
            .all()

        logger.info(f'Scanning {len(results)} contributors.')

        count = 0
        big_contributors = []
        for contributor, commits in results:
            if len(commits) > 100 and len(commits) < 20000:
                big_contributors.append((contributor, commits))

            count += 1
            if count % 50000 == 0:
                logger.info(
                    f'Scanned {count} contributors ({len(big_contributors)} big)'
                )

        # Finished searching for contributors with enough commits.
        logger.info(f'Analysing {len(big_contributors)} contributors.')

        # Chunk the contributor list into chunks of 100
        chunks = create_chunks(big_contributors, 100)

        manager = ListManager('analyse_punchcard', chunks)
        manager.start()
        manager.run()

    # Fetch existing intermediate results with enough commits
    analysis_results = session.query(AnalysisResult) \
        .filter(AnalysisResult.intermediate_results != None) \
        .filter(AnalysisResult.commit_count > 100) \
        .filter(AnalysisResult.commit_count < 20000) \
        .options(joinedload('contributor')) \
        .all()

    if existing:
        logger.info(f'Analysing {len(analysis_results)} results.')

    logger.info(f'Using {method} clustering')
    vectorized_data = []
    contributors = []
    for result in analysis_results:
        if 'punchcard' in result.intermediate_results:
            vectorized_data.append(result.intermediate_results['punchcard'])
            contributors.append(result.contributor)

    # Cluster using DBSCAN algorithm
    if method == 'dbscan':
        metric = 'l1'
        cluster_result = DBSCAN(
            eps=eps,
            min_samples=min_samples,
            metric=metric,
            n_jobs=-1,
        ).fit(vectorized_data)
        core_samples_mask = np.zeros_like(cluster_result.labels_, dtype=bool)
        core_samples_mask[cluster_result.core_sample_indices_] = True

    # Cluster using Mean-Shift algorithm
    elif method == 'mean-shift':
        quantile = 0.1
        n_samples = -1
        logger.info(f'Computing bandwidth.')
        bandwidth = estimate_bandwidth(
            vectorized_data,
            quantile=quantile,
            n_samples=n_samples,
            n_jobs=-1,
        )
        logger.info(f'Bandwidth computed.')

        cluster_result = MeanShift(
            bandwidth=bandwidth,
            bin_seeding=True,
            n_jobs=-1,
        ).fit(vectorized_data)
    # Cluster using Affinity Propagation algorithm
    elif method == 'affinity':
        preference = None
        cluster_result = AffinityPropagation(preference=preference) \
            .fit(vectorized_data)

    # Number of entities per label
    labels = cluster_result.labels_
    unique, counts = np.unique(labels, return_counts=True)
    occurrences = dict(zip(unique, counts))

    contributor_by_label = {}
    for index, label in enumerate(labels):
        if contributor_by_label.get(label) is None:
            contributor_by_label[label] = []

        contributor_by_label[label].append(contributors[index].login)

    # Prepare the plot dir for prototype plotting
    plot_dir = config['plotting']['plot_dir']
    plot_dir = os.path.join(plot_dir, 'analysis', 'analyse_punch', method)

    if not os.path.exists(plot_dir):
        os.makedirs(plot_dir)

    logger.info(f'Found {len(occurrences)} labels.')
    # Get the mean-center prototypes for each label and plot them
    prototypes = get_mean_center_prototypes(cluster_result, vectorized_data,
                                            min_samples)

    logger.info(f'Found {len(prototypes)} valid clusters')
    for label, prototype in prototypes.items():
        if method == 'dbscan':
            name = f'{metric}-{min_samples}-{eps}-{label}'
        else:
            name = f'{label}'

        path = os.path.join(plot_dir, name)
        title = f'Prototype for {name} with {occurrences[label]} elements'
        plotter = CommitPunchcard([], path, title)
        plotter.preprocess()
        plotter.data['count'] = np.array(prototype) * 5
        plotter.plot()

    if method == 'dbscan':
        logger.info(f'DBSCAN with EPS: {eps} and {min_samples} min samples.')
    logger.info('Amount of entities in clusters. -1 is an outlier:')
    logger.info(pformat(occurrences))
    logger.info(pformat(contributor_by_label))
    logger.info(f'{len(analysis_results)} contributors are relevant.')

    if method == 'dbscan':
        core_samples = cluster_result.core_sample_indices_
        logger.info(f'Core samples: {len(core_samples)}')

    return
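Both this clustering command and the travel-path analysis below split the filtered contributors into chunks of 100 before handing them to a ListManager. The create_chunks helper itself is not shown in these examples; a minimal sketch of what it plausibly does (an assumption, not the project's actual implementation):

def create_chunks(items, chunk_size):
    """Split a list into consecutive chunks of at most chunk_size items."""
    return [items[i:i + chunk_size] for i in range(0, len(items), chunk_size)]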
Example 14
def get_github_repository(full_name: str):
    """Get all information from a single repository."""
    try:
        session = new_session()
        # Sleep for a random time to avoid hitting the abuse detection.
        sleeptime = randrange(1, 15)
        sleep(sleeptime)

        github_repo = call_github_function(github.github, 'get_repo',
                                           [full_name], {'lazy': False})

        repository = Repository.get_or_create(
            session,
            github_repo.ssh_url,
            name=github_repo.name,
            full_name=github_repo.full_name,
        )

        if repository.broken:
            return {'message': f'Skip broken repo {github_repo.ssh_url}'}
        elif github_repo.size > int(config['aggregator']['max_repository_size']):
            repository.too_big = True
            session.add(repository)
            session.commit()
            sentry.captureMessage(f'Repo filesize too big', level='info',
                                  extra={'repo': repository.clone_url})

            return {'message': f'Repo too big (filesize): {github_repo.ssh_url}'}


        owner = get_github_object(github_repo, 'owner')
        git_repo = get_git_repository(
            github_repo.ssh_url,
            owner.login,
            github_repo.name,
        )
        scanner = CommitScanner(git_repo, session, github_repo)
        commit_count = scanner.scan_repository()

        breadcrumbs.record(
            data={'action': 'Commits scanned. Set repo metadata and debug output'},
            category='info',
        )

        repository = session.query(Repository).get(github_repo.ssh_url)
        rate = github.github.get_rate_limit().core
        time = rate.reset.strftime("%H:%M")
        current_time = datetime.now().strftime('%H:%M')

        message = f'{current_time}: '
        message += f'Scanned {repository.clone_url} with {commit_count} commits.\n'
        message += f'{rate.remaining} of 5000 remaining. Reset at {time}\n'

        response = {'message': message}

        repository.updated_at = datetime.now()
        session.add(repository)
        session.commit()

    except GithubException as e:
        # 451: Access denied. Repository probably gone private.
        # 404: User or repository just got deleted
        if e.status == 451 or e.status == 404:
            repository = session.query(Repository) \
                .filter(Repository.full_name == full_name) \
                .one_or_none()

            if repository:
                repository.broken = True
                session.add(repository)
                session.commit()
            response = {'message': 'Repository access blocked.'}
        # Catch any other GithubException
        else:
            sentry.captureException()
            response = error_message('Error in get_repository:\n')


    except (GitError, UnicodeDecodeError):
        response = error_message('Error in get_repository:\n')

    except BaseException:
        # Catch any exception and print it, as we won't get any information due to threading otherwise.
        sentry.captureException()
        response = error_message('Error in get_repository:\n')

    finally:
        if 'owner' in locals() and 'github_repo' in locals():
            delete_git_repository(owner.login, github_repo.name)
        session.close()

    return response
Example 15
def analyse_travel_path(existing):
    """Analyze the efficiency of the missing time comparison."""
    session = new_session()
    logger.info(f'Start Scan.')

    # Look at the last two years
    time_span = datetime.now() - timedelta(days=2 * 365)

    if not existing:
        results = session.query(Contributor, func.array_agg(Commit.sha)) \
            .filter(Contributor.login == Contributor.login) \
            .join(Email, Contributor.login == Email.contributor_login) \
            .join(Commit, or_(
                Commit.author_email_address == Email.email,
                Commit.committer_email_address == Email.email,
            )) \
            .filter(Commit.commit_time >= time_span) \
            .group_by(Contributor.login) \
            .all()

        logger.info(f'Scanning {len(results)} contributors.')

        count = 0
        big_contributors = []
        for contributor, commits in results:
            if len(commits) > 100 and len(commits) < 20000:
                big_contributors.append((contributor, commits))

            count += 1
            if count % 5000 == 0:
                logger.info(
                    f'Scanned {count} contributors ({len(big_contributors)} big)'
                )

        # Finished searching for contributors with enough commits.
        logger.info(f'Analysing {len(big_contributors)} contributors.')

        # Chunk the contributor list into chunks of 100
        chunks = create_chunks(big_contributors, 100)

        manager = ListManager('analyse_travel_path', chunks)
        manager.start()
        manager.run()

    # Fetch analysis results with detected timezone data and enough commits
    results = session.query(AnalysisResult) \
        .filter(AnalysisResult.timezone_switches != None) \
        .filter(and_(
            AnalysisResult.commit_count != None,
            AnalysisResult.commit_count > 100,
            AnalysisResult.commit_count < 20000,
        )) \
        .options(joinedload('contributor')) \
        .all()

    changed = 0
    unchanged = 0
    distribution = {}
    for result in results:
        amount = result.timezone_switches
        if amount > 1:
            changed += 1
        else:
            unchanged += 1

        if distribution.get(amount) is None:
            distribution[amount] = 1
        else:
            distribution[amount] += 1

    ignored_timezones = set([
        'GB',
        'WET',
        'MET',
        'CET',
        'EET',
        'NZ',
        'MST7MDT',
        'PST8PDT',
        'CST6CDT',
        'W-SU',
        'ROK',
        'NZ-CHAT',
        'GB-Eire',
        'ROC',
        'EST5EDT',
        'PRC',
    ])
    for i in range(0, 16):
        ignored_timezones.add(f'GMT-{i}')
        ignored_timezones.add(f'GMT+{i}')

    correct = 0
    considered_contributors = 0
    survey_results = {}
    detected_timezones = {}
    for result in results:
        contributor = result.contributor
        home = set(result.intermediate_results['home']['set'])
        if 'full_set' in result.intermediate_results['home']:
            full_set = set(result.intermediate_results['home']['full_set'])
        else:
            full_set = set()

        if result.different_timezones is not None:
            if result.different_timezones not in detected_timezones:
                detected_timezones[result.different_timezones] = 0
            detected_timezones[result.different_timezones] += 1

        if contributor.location is None:
            continue

        for item in timezone_evaluations:
            if element_in_string(contributor.location, item['search']):
                survey_string = ', '.join(item['search'])
                if survey_string not in survey_results:
                    survey_results[survey_string] = {}
                    survey_results[survey_string]['set'] = set(home)
                    survey_results[survey_string]['amount'] = 0
                    survey_results[survey_string]['correct'] = 0
                    survey_results[survey_string]['timezone_amount'] = 0
                    survey_results[survey_string]['match'] = item['timezone']
                    survey_results[survey_string]['full_set'] = full_set

                survey_results[survey_string][
                    'set'] = survey_results[survey_string]['set'] | home
                survey_results[survey_string]['amount'] += 1
                survey_results[survey_string]['timezone_amount'] += len(
                    home - ignored_timezones)
                survey_results[survey_string]['ratio'] = survey_results[
                    survey_string]['timezone_amount'] / survey_results[
                        survey_string]['amount']
                considered_contributors += 1

                if 'full_set' in item:
                    survey_results[survey_string]['full_set'] = survey_results[
                        survey_string]['full_set'] | full_set

                # Debug stuff
                if 'roflcopter' == survey_string:
                    print(home)

                if item['timezone'] in home:
                    correct += 1
                    survey_results[survey_string]['correct'] += 1
                break

    logger.info(f'Looked at {len(results)} contributors.')
    logger.info(f'{len(results)} are relevant.')
    logger.info(f'Detected a change in {changed} of those.')
    logger.info(f'Detected no change in {unchanged} of those.')
    logger.info(f'Distribution of users by amount of different timezones:')
    logger.info(pformat(distribution))
    logger.info(f'Distribution of users by amount of detected timezones:')
    logger.info(pformat(detected_timezones))
    logger.info(
        f'Verified contributors {correct} of {considered_contributors}: {correct/considered_contributors}'
    )

    print(
        f"Strings query;Considered contributors;Expected timezone;Home location in subset;Mean size of subset;Max size of subset"
    )
    for key, result in survey_results.items():
        print(
            f"{key};{result['amount']};{result['match']};{result['correct']};{result['ratio']:.2f};{len(result['full_set'])}"
        )

    return
Example 16
def analyse_contributer_travel_path(contributors_commits):
    """Analyse the travel path of a few contributers."""
    try:
        session = new_session()
        count = 0
        for contributor, commit_hashes in contributors_commits:
            # Query result again with current session.
            contributor = session.query(Contributor).get(contributor.login)
            result = contributor.analysis_result

            if result is None:
                result = AnalysisResult()
                contributor.analysis_result = result
                session.add(contributor)
                session.add(result)

            commits_changed = (len(commit_hashes) != result.commit_count)

            # Look at the jsonb intermediate_result to see if we already wrote the data into it
            json_results = result.intermediate_results
            if json_results is None:
                json_results = {}
                result.intermediate_results = json_results

            if result.different_timezones is None \
                    or result.timezone_switches is None \
                    or commits_changed \
                    or json_results.get('travel') is None \
                    or json_results.get('home') is None:
                commits = session.query(Commit) \
                    .filter(Commit.sha.in_(commit_hashes)) \
                    .all()

                plotter = TravelPath(commits, '/')
                plotter.preprocess()

                json_results = deepcopy(result.intermediate_results)

                for timezone_set in plotter.data:
                    del timezone_set['start']
                    del timezone_set['end']
                    timezone_set['set'] = list(timezone_set['set'])
                    timezone_set['full_set'] = list(timezone_set['full_set'])

                json_results['home'] = plotter.home_zone
                json_results['travel'] = plotter.data
                result.intermediate_results = json_results

                result.timezone_switches = len(plotter.data)
                result.different_timezones = plotter.different_timezones
                result.last_change = datetime.now()
                result.commit_count = len(commits)
                session.add(result)

            count += 1
            if count % 50 == 0:
                session.commit()

        session.commit()
    finally:
        session.close()

    return {'message': 'Success'}