Python infoの例、gitalizer.extensions.logger.info Pythonの例

コード例 #1

0

ファイルを表示

def plot_employee(login, repositories, session):
    """Render all employee plots of one contributor for selected repositories.

    The output directory ``<plot_dir>/<login lowercased>/employee`` is created
    if necessary. Contributor and repositories are looked up case-insensitively
    via ``session``. The process exits with status 1 when the contributor or
    all of the requested repositories are unknown.
    """
    path = os.path.join(
        config['plotting']['plot_dir'],
        login.lower(),
        'employee',
    )
    os.makedirs(path, exist_ok=True)

    # Case-insensitive lookup of the contributor.
    contributor_query = session.query(Contributor)
    contributor_query = contributor_query.filter(Contributor.login.ilike(login))
    contributor = contributor_query.one_or_none()

    from gitalizer.models import Repository

    # One case-insensitive name filter per requested repository, OR-ed together.
    name_filters = [Repository.full_name.ilike(name) for name in repositories]
    repositories = session.query(Repository).filter(or_(*name_filters)).all()

    if contributor is None:
        logger.info(f'No contributor with name {login}')
        sys.exit(1)
    if not repositories:
        logger.info('No repositories found with these names.')
        sys.exit(1)

    plot_employee_punchcard(contributor, repositories, path, session)
    plot_employee_timeline_with_holiday(contributor, repositories, path, session)
    plot_employee_missing_time(contributor, repositories, path, session)

コード例 #2

0

ファイルを表示

ファイル: maintenance.py プロジェクト: Nukesor/gitalizer

def complete_repos():
    """Rescan repositories that are unfinished or out of date.

    Selects every repository that is not a fork, not broken and not too big,
    and that is either not completely scanned or was last updated before the
    configured ``repository_rescan_interval`` (seconds). The full names of
    those repositories are handed to a ``github_repository`` manager.
    """
    logger.info("Get unfinished or out of date repositories.")
    rescan_interval = int(config['aggregator']['repository_rescan_interval'])
    rescan_threshold = datetime.utcnow() - timedelta(seconds=rescan_interval)
    session = new_session()

    # Close the session even if the query or the manager run fails.
    try:
        repos = session.query(Repository) \
            .filter(Repository.fork.is_(False)) \
            .filter(Repository.broken.is_(False)) \
            .filter(Repository.too_big.is_(False)) \
            .filter(or_(
                Repository.completely_scanned.is_(False),
                Repository.updated_at <= rescan_threshold,
            )) \
            .all()
        logger.info(f'Found {len(repos)}')

        # A set removes duplicate full names before they become tasks.
        repos_to_scan = {repo.full_name for repo in repos}

        manager = Manager('github_repository', repos_to_scan)
        manager.start()
        manager.run()
    finally:
        session.close()

コード例 #3

0

ファイルを表示

ファイル: organization.py プロジェクト: Nukesor/gitalizer

def members(name):
    """Scan a GitHub organization together with its members.

    Delegates to ``get_github_organization(name, True)``; the second argument
    presumably enables the member scan (confirm against its definition).
    A ``KeyboardInterrupt`` is logged and turned into exit status 1.
    """
    try:
        get_github_organization(name, True)
    except KeyboardInterrupt:
        # Ctrl-C is an expected way to abort a long scan.
        logger.info("CTRL-C Exiting Gracefully")
        sys.exit(1)

コード例 #4

0

ファイルを表示

ファイル: repository.py プロジェクト: Nukesor/gitalizer

def get_github_repository_by_owner_name(owner: str, name: str):
    """Get a repository by its owner and name.

    The ``owner/name`` full name is passed to ``get_github_repository``. The
    message of the returned response is logged at info level; if the response
    contains an ``error`` entry, that is logged at error level as well.
    """
    response = get_github_repository(f'{owner}/{name}')
    logger.info(response['message'])

    if 'error' not in response:
        return
    logger.error(response['error'])

コード例 #5

0

ファイルを表示

def travel(existing):
    """Run the travel path analysis.

    ``existing`` is forwarded unchanged to ``analyse_travel_path``.
    A ``KeyboardInterrupt`` is logged and turned into exit status 1.
    """
    try:
        analyse_travel_path(existing)
    except KeyboardInterrupt:
        # Ctrl-C is an expected way to abort a long analysis.
        logger.info("CTRL-C Exiting Gracefully")
        sys.exit(1)

コード例 #6

0

ファイルを表示

ファイル: organization.py プロジェクト: Nukesor/gitalizer

def get_organization_memberships():
    """Refresh the organization memberships of all known contributors.

    Contributors whose ``last_full_scan`` is less than two days old are
    skipped. For every other contributor the GitHub user and its organizations
    are fetched, the organizations are attached to the contributor and the
    change is committed per contributor.
    """
    session = new_session()

    # Close the session even if a GitHub call or a commit fails.
    try:
        tz = pytz.timezone('Europe/Berlin')
        # NOTE(review): the threshold is timezone-aware while last_full_scan is
        # written below from the naive datetime.utcnow(); confirm that the
        # column returns aware datetimes, otherwise the comparison raises.
        rescan_threshold = datetime.now(tz) - timedelta(days=2)

        contributors = session.query(Contributor).all()
        for contributor in contributors:
            last_scan = contributor.last_full_scan
            if last_scan and last_scan > rescan_threshold:
                continue
            logger.info(
                f'Checking {contributor.login}. {github.github.rate_limiting[0]} remaining.'
            )

            github_user = call_github_function(github.github, 'get_user',
                                               [contributor.login])

            github_orgs = call_github_function(github_user, 'get_orgs')
            for org in github_orgs:
                organization = Organization.get_organization(
                    org.login, org.url, session)
                contributor.organizations.append(organization)
            contributor.last_full_scan = datetime.utcnow()
            session.add(contributor)
            # Commit per contributor so an interruption keeps finished work.
            session.commit()
    finally:
        session.close()

コード例 #7

0

ファイルを表示

def from_github(owner, name):
    """Get a github repository by owner and name.

    Logs the request and delegates to ``get_github_repository_by_owner_name``.
    A ``KeyboardInterrupt`` is logged and turned into exit status 1.
    """
    try:
        # The repository name arrives as `name`; the body previously referred
        # to an undefined `repository` variable.
        logger.info(f'\n\nGet {name} from user {owner}')
        get_github_repository_by_owner_name(owner, name)
    except KeyboardInterrupt:
        logger.info("CTRL-C Exiting Gracefully")
        sys.exit(1)

コード例 #8

0

ファイルを表示

def user_for_repositories(login, repositories):
    """Plot the statistics of one user for specific repositories.

    Opens a new session and hands it, together with both arguments, to
    ``plot_employee``. A ``KeyboardInterrupt`` is logged and turned into exit
    status 1.
    """
    try:
        plot_employee(login, repositories, new_session())
    except KeyboardInterrupt:
        # Ctrl-C is an expected way to abort plotting.
        logger.info("CTRL-C Exiting Gracefully")
        sys.exit(1)

コード例 #9

0

ファイルを表示

ファイル: organization.py プロジェクト: Nukesor/gitalizer

def by_name(name):
    """Scan a single GitHub organization identified by its name.

    Logs the request and delegates to ``get_github_organization``.
    A ``KeyboardInterrupt`` is logged and turned into exit status 1.
    """
    try:
        logger.info(f'\nGet organization {name}')
        get_github_organization(name)
    except KeyboardInterrupt:
        # Ctrl-C is an expected way to abort a long scan.
        logger.info("CTRL-C Exiting Gracefully")
        sys.exit(1)

コード例 #10

0

ファイルを表示

def from_github_for_users(full_name):
    """Scan the users of the github repository called ``full_name``.

    Logs the request and delegates to ``get_github_repository_users``.
    A ``KeyboardInterrupt`` is logged and turned into exit status 1.
    """
    try:
        logger.info(f'\nGet all users from {full_name}')
        get_github_repository_users(full_name)
    except KeyboardInterrupt:
        # Ctrl-C is an expected way to abort a long scan.
        logger.info("CTRL-C Exiting Gracefully")
        sys.exit(1)

コード例 #11

0

ファイルを表示

def user(login):
    """Plot every graph available for one github user.

    Opens a new session and passes it with ``login`` to ``plot_user``.
    A ``KeyboardInterrupt`` is logged and turned into exit status 1.
    """
    try:
        plot_user(login, new_session())
    except KeyboardInterrupt:
        # Ctrl-C is an expected way to abort plotting.
        logger.info("CTRL-C Exiting Gracefully")
        sys.exit(1)

コード例 #12

0

ファイルを表示

def comparison(users, repos):
    """Get statistics of several users for specific repositories.

    Both ``users`` and ``repos`` are required. If either is missing or empty,
    the problem is logged and the process exits with status 1 instead of
    continuing into ``plot_comparison`` with unusable input.
    A ``KeyboardInterrupt`` is logged and turned into exit status 1.
    """
    try:
        if not users or not repos:
            logger.info("Users and Repos are required parameters.")
            # Same convention as the other commands: log, then exit non-zero.
            sys.exit(1)
        session = new_session()
        plot_comparison(users, repos, session)
    except KeyboardInterrupt:
        logger.info("CTRL-C Exiting Gracefully")
        sys.exit(1)

コード例 #13

0

ファイルを表示

def complete():
    """Fill in data that previous runs left incomplete.

    Delegates to ``complete_data``, which covers:
        - Completing all unfinished repositories

    A ``KeyboardInterrupt`` is logged and turned into exit status 1.
    """
    try:
        complete_data()
    except KeyboardInterrupt:
        # Ctrl-C is an expected way to abort a long run.
        logger.info("CTRL-C Exiting Gracefully")
        sys.exit(1)

コード例 #14

0

ファイルを表示

ファイル: organization.py プロジェクト: Nukesor/gitalizer

def user_membership():
    """Fetch the organizations of every user in the current database.

    Only memberships are scanned; no repositories are touched. The work is
    delegated to ``get_organization_memberships`` and is not yet multi
    processed. A ``KeyboardInterrupt`` is logged and turned into exit status 1.
    """
    try:
        get_organization_memberships()
    except KeyboardInterrupt:
        # Ctrl-C is an expected way to abort a long scan.
        logger.info("CTRL-C Exiting Gracefully")
        sys.exit(1)

コード例 #15

0

ファイルを表示

ファイル: maintenance.py プロジェクト: Nukesor/gitalizer

def clean_db():
    """Detach commits from fork repositories and delete orphaned commits.

    Every fork that still has commits gets its commit list emptied; the
    session is committed after each batch of 100 forks. Afterwards all commits
    without any repository are deleted. The session is always closed.
    """
    logger.info("Removing commits from fork repos.")
    session = new_session()
    try:
        # `!= None` / `== None` are intentional: SQLAlchemy overloads them to
        # build the SQL expression, `is None` would not work here.
        fork_query = session.query(Repository)
        fork_query = fork_query.filter(Repository.fork.is_(True))
        fork_query = fork_query.filter(Repository.commits != None)
        fork_query = fork_query.options(joinedload(Repository.commits))
        forks = fork_query.all()

        logger.info(f'Found {len(forks)}')
        for processed, fork in enumerate(forks, start=1):
            fork.commits = []
            session.add(fork)
            # Commit in batches of 100 and report the progress.
            if processed % 100 == 0:
                logger.info(f'Removed {processed}')
                session.commit()

        logger.info("Remove unattached commits")
        orphaned_commits = session.query(Commit) \
            .filter(Commit.repositories == None)
        orphaned_commits.delete()
        # Also flushes the last, incomplete batch of forks.
        session.commit()
    finally:
        session.close()

コード例 #16

0

ファイルを表示

def update(update_all):
    """Refresh data gathered by previous runs.

    With the `--update-all` flag everything is scanned again. Without it only
    contributors with more than 100 commits are scanned, which skips peers
    that are probably uninteresting (for instance users with a single commit
    in an uninteresting repository).

    Delegates to ``update_data``, which covers:
        - Updating all contributors

    A ``KeyboardInterrupt`` is logged and turned into exit status 1.
    """
    try:
        update_data(update_all)
    except KeyboardInterrupt:
        # Ctrl-C is an expected way to abort a long run.
        logger.info("CTRL-C Exiting Gracefully")
        sys.exit(1)

コード例 #17

0

ファイルを表示

def punchcard(existing):
    """Run the punch card cluster analysis.

    ``existing`` is forwarded unchanged to ``analyse_punch_card``. Only the
    'affinity' method is currently enabled; the loop also knows how to run a
    parameter grid for 'dbscan' should that method be added to the list.
    A ``KeyboardInterrupt`` is logged and turned into exit status 1.
    """
    # Other supported values: 'mean-shift', 'dbscan'.
    methods = ['affinity']
    try:
        for method in methods:
            if method != 'dbscan':
                analyse_punch_card(existing, method)
                continue

            # DBSCAN is evaluated over a grid of its two parameters.
            for min_samples in range(5, 10, 5):
                for eps in range(140, 150, 2):
                    analyse_punch_card(
                        existing,
                        method,
                        eps=eps,
                        min_samples=min_samples,
                    )
    except KeyboardInterrupt:
        logger.info("CTRL-C Exiting Gracefully")
        sys.exit(1)

コード例 #18

0

ファイルを表示

def plot_user(login, session):
    """Plot all user related graphs.

    The plots are written to ``<plot_dir>/<login lowercased>``, which is
    created if necessary. The contributor is looked up case-insensitively via
    ``session``; the process exits with status 1 if no contributor matches.
    """
    plot_dir = config['plotting']['plot_dir']
    user_dir = os.path.join(plot_dir, login.lower())
    # Creates plot_dir and user_dir in one step and tolerates existing
    # directories, avoiding the check-then-create race of exists() + mkdir().
    os.makedirs(user_dir, exist_ok=True)

    contributor = session.query(Contributor) \
        .filter(Contributor.login.ilike(login)) \
        .one_or_none()

    if contributor is None:
        logger.info(f'No contributor with name {login}')
        sys.exit(1)

    plot_user_travel_path(contributor, user_dir, session)
    plot_user_punchcard(contributor, user_dir, session)

コード例 #19

0

ファイルを表示

ファイル: maintenance.py プロジェクト: Nukesor/gitalizer

def update_contributors(update_all):
    """Rescan contributors that have recent commits but no known location.

    Collects every contributor without a location together with the SHAs of
    their commits from the last two years. With ``update_all`` all of them are
    scanned; otherwise only those with more than 100 and fewer than 20000
    commits. The selection is handed to a ``github_user`` list manager.
    """
    session = new_session()

    # Close the session even if the query or the manager run fails.
    try:
        logger.info('Start Scan.')

        # Look at the last two years
        time_span = datetime.now() - timedelta(days=2 * 365)

        # `== None` is intentional: SQLAlchemy turns it into `IS NULL`.
        results = session.query(Contributor, func.array_agg(Commit.sha)) \
            .filter(Contributor.login == Contributor.login) \
            .join(Email, Contributor.login == Email.contributor_login) \
            .join(Commit, or_(
                Commit.author_email_address == Email.email,
                Commit.committer_email_address == Email.email,
            )) \
            .filter(Commit.commit_time >= time_span) \
            .filter(or_(
                Contributor.location == None,
            )) \
            .group_by(Contributor.login) \
            .all()

        logger.info(f'Scanning {len(results)} contributors.')

        if update_all:
            contributors_to_scan = results
            logger.info(f'Scanning {len(contributors_to_scan)} contributors')
        else:
            contributors_to_scan = []
            for count, (contributor, commits) in enumerate(results, start=1):
                if 100 < len(commits) < 20000:
                    contributors_to_scan.append((contributor, commits))

                # Progress report for large result sets.
                if count % 5000 == 0:
                    logger.info(
                        f'Found {count} contributors ({len(contributors_to_scan)} big)'
                    )

        manager = ListManager('github_user', contributors_to_scan)
        manager.start()
        manager.run()
    finally:
        session.close()

コード例 #20

0

ファイルを表示

ファイル: user.py プロジェクト: Nukesor/gitalizer

def user(login, with_followers):
    """Scan all repositories of one github user.

    With `--with-followers` the user and ALL of their followers/following are
    scanned. That gives better coverage but takes significantly longer and
    needs significantly more disk space.

    A ``KeyboardInterrupt`` is logged and turned into exit status 1.
    """
    try:
        if not with_followers:
            logger.info(f'\n\nGet user {login}')
            get_user(login)
        else:
            logger.info(f'\n\nGet friends of user {login}')
            get_user_with_followers(login)
    except KeyboardInterrupt:
        # Ctrl-C is an expected way to abort a long scan.
        logger.info("CTRL-C Exiting Gracefully")
        sys.exit(1)

コード例 #21

0

ファイルを表示

    def run(self):
        """Collect one result per task, then run the sub manager if present.

        Puts ``consumer_count + 1`` ``None`` markers on the task queue, then
        blocks on the result queue until as many results as there are tasks
        have arrived. Each result is stored in ``self.results`` and its message
        logged. Results carrying an ``error`` are logged; all other results
        have their ``tasks`` forwarded to the sub manager (if any), which is
        started and run once every result has been received.
        """
        # Only announces the sub manager; it is actually started at the end.
        if self.sub_manager is not None:
            logger.info('Start sub manager.')

        # Poison pills: a `None` task for each consumer plus one extra.
        logger.info('Add poison pills.')
        pill_count = self.consumer_count + 1
        for _ in range(pill_count):
            self.task_queue.put(None)

        logger.info(f'Processing {len(self.tasks)} tasks')
        received = 0
        while received < len(self.tasks):
            logger.info(f'Waiting: {received} of {len(self.tasks)}')
            result = self.result_queue.get()
            self.results.append(result)
            logger.info(result['message'])

            failed = 'error' in result
            if failed:
                logger.info('Encountered an error:')
                logger.info(result['error'])
            if not failed and self.sub_manager is not None:
                self.sub_manager.add_tasks(result['tasks'])
            received += 1

        # Every sub task is queued by now; let the sub manager process them.
        if self.sub_manager is not None:
            self.sub_manager.start()
            self.sub_manager.run()

コード例 #22

0

ファイルを表示

ファイル: user.py プロジェクト: Nukesor/gitalizer

def get_user_repos(user_login: str, skip=True):
    """Collect the repositories that should be scanned for a single user.

    Args:
        user_login: Login of the GitHub user.
        skip: If true, a user whose owned or starred repositories exceed the
            configured ``max_repositories_for_user`` is flagged as too big and
            not processed any further.

    Returns:
        A response dict. On success it holds ``message`` and ``tasks`` (full
        names of the repositories to scan). On failure it holds ``message``
        and ``error`` (the formatted traceback). Users that are up to date or
        too big yield whatever ``user_up_to_date_message`` /
        ``user_too_big_message`` return.
    """
    # Defined before the try block so that the finally clause cannot hit an
    # unbound name (and mask the error response) if new_session() fails.
    session = None
    try:
        session = new_session()
        contributor = Contributor.get_contributor(user_login, session, True)
        # Checks for already scanned users.
        if not contributor.should_scan():
            return user_up_to_date_message(user_login)
        if contributor.too_big:
            return user_too_big_message(user_login)

        user = call_github_function(github.github, 'get_user', [user_login])
        owned = user.get_repos()
        starred = user.get_starred()
        repos_to_scan = set()

        # Prefetch all owned repositories
        user_too_big = False
        owned_repos = 0
        while owned._couldGrow() and not user_too_big:
            owned_repos += 1
            call_github_function(owned, '_grow')

            # Debug messages to see that the repositories are still collected.
            if owned_repos % 100 == 0:
                logger.info(
                    f'{owned_repos} owned repos for user {user_login}.')

            # The user is too big. Just drop him.
            if skip and owned_repos > int(
                    config['aggregator']['max_repositories_for_user']):
                user_too_big = True

        # Prefetch all starred repositories
        starred_repos = 0
        while starred._couldGrow() and not user_too_big:
            starred_repos += 1
            call_github_function(starred, '_grow')
            # Debug messages to see that the repositories are still collected.
            if starred_repos % 100 == 0:
                logger.info(
                    f'{starred_repos} starred repos for user {user_login}.')

            # The user is too big. Just drop him.
            if skip and starred_repos > int(
                    config['aggregator']['max_repositories_for_user']):
                user_too_big = True

        # User has too many repositories. Flag him and return
        if user_too_big:
            contributor.too_big = True
            sentry.captureMessage(
                'User too big',
                extra={'url': contributor.login},
                level='info',
                tags={
                    'type': 'too_big',
                    'entity': 'user'
                },
            )
            session.add(contributor)
            session.commit()
            return user_too_big_message(user_login)

        # Check own repositories. We assume that we are collaborating in those
        for github_repo in owned:
            repository = Repository.get_or_create(
                session,
                github_repo.ssh_url,
                name=github_repo.name,
                full_name=github_repo.full_name,
            )
            if github_repo.fork and not repository.is_invalid():
                check_fork(github_repo, session, repository, repos_to_scan,
                           user_login)
            session.add(repository)

            if not repository.should_scan():
                continue

            session.commit()
            repos_to_scan.add(github_repo.full_name)

        # Check stars and if the user collaborated to them.
        for github_repo in starred:
            repository = Repository.get_or_create(
                session,
                github_repo.ssh_url,
                name=github_repo.name,
                full_name=github_repo.full_name,
            )

            if github_repo.fork and not repository.is_invalid():
                check_fork(github_repo, session, repository, repos_to_scan,
                           user_login)
            session.add(repository)

            if not repository.should_scan():
                continue

            repos_to_scan.add(github_repo.full_name)

        session.commit()

        rate = github.github.get_rate_limit().core
        message = f'Got repositories for {user.login}. '
        message += f'{user.login}. {rate.remaining} of 5000 remaining.'
        response = {
            'message': message,
            'tasks': list(repos_to_scan),
        }
    except BaseException:
        # Deliberately broad: catch any exception and report it in the
        # response, as we won't get any information due to threading otherwise.
        sentry.captureException()
        response = {
            'message': f'Error while getting repos for {user_login}:\n',
            'error': traceback.format_exc(),
        }
    finally:
        if session is not None:
            session.close()

    return response

コード例 #23

0

ファイルを表示

def call_github_function(github_object: object,
                         function_name: str,
                         args: list = None,
                         kwargs: dict = None):
    """Call a pygithub object member function.

    We need to handle those calls in case we get rate limited.

    Args:
        github_object: Object that owns the function.
        function_name: Name of the member function to call.
        args: Positional arguments for the call (defaults to none).
        kwargs: Keyword arguments for the call (defaults to none).

    Returns:
        Whatever the called function returns.

    Raises:
        GithubException: Immediately for status 451 or 404.
        Exception: The last rate limit, Github or socket timeout exception
            once all attempts (one initial call plus ``tries`` retries) failed.
    """
    call_args = args if args else []
    call_kwargs = kwargs if kwargs else {}

    tries = 5
    exception = None
    for _ in range(tries + 1):
        try:
            return getattr(github_object, function_name)(*call_args,
                                                         **call_kwargs)
        except RateLimitExceededException as e:
            # Wait until the rate limiting is reset.
            # NOTE(review): assumes `reset` is a naive UTC datetime, as in
            # PyGithub 1.x. All arithmetic is therefore done in UTC; mixing in
            # the local datetime.now() skewed the wait by the UTC offset.
            resettime = github.github.get_rate_limit().core.reset
            now = datetime.utcnow()
            if resettime < now:
                resettime = now
            # Two minutes of slack on top of the reset time.
            delta = resettime - now + timedelta(minutes=2)
            total_minutes = int(delta.total_seconds() / 60)
            logger.info('Hit the rate limit.')
            logger.info(
                f'Reset at {resettime}. Waiting for {total_minutes} minutes.')
            time.sleep(delta.total_seconds())

            exception = e
        except GithubException as e:
            # Forbidden or not found (Just made private or deleted)
            if e.status == 451 or e.status == 404:
                raise

            # Otherwise abuse detection
            if e.status == 403:
                seconds = randrange(180, 480)
                logger.info('Github abuse detection.')
                logger.info(f'Waiting for {seconds} seconds')
                time.sleep(seconds)

            breadcrumbs.record(
                data={
                    'action': 'Github Exception.',
                    'exception': e
                },
                category='info',
            )

            exception = e
        except timeout as e:
            logger.info('Hit socket timeout waiting 10 secs.')
            breadcrumbs.record(data={'action': 'Socket timeout hit'},
                               category='info')

            time.sleep(10)
            exception = e

    # Every attempt failed; surface the most recent failure.
    raise exception

コード例 #24

0

ファイルを表示

def get_github_object(github_object: object, object_name: str):
    """Get a pygithub object.

    As pygithub sometimes implicitly queries the github api on a class member access,
    we need to handle those accesses in case we get rate limited.

    Args:
        github_object: Object that owns the attribute.
        object_name: Name of the attribute to read.

    Returns:
        The value of the attribute.

    Raises:
        GithubException: Immediately for status 451 or 404.
        Exception: The last rate limit, Github or socket timeout exception
            once all attempts (one initial access plus ``tries`` retries)
            failed.
    """
    tries = 5
    exception = None
    for _ in range(tries + 1):
        try:
            return getattr(github_object, object_name)
        except RateLimitExceededException as e:
            # Wait until the rate limiting is reset.
            # NOTE(review): assumes `reset` is a naive UTC datetime, as in
            # PyGithub 1.x. It is therefore compared against utcnow(); the
            # local datetime.now() skewed the wait by the UTC offset.
            resettime = github.github.get_rate_limit().core.reset
            now = datetime.utcnow()
            if resettime < now:
                resettime = now
            # Two minutes of slack on top of the reset time.
            delta = resettime - now + timedelta(minutes=2)
            total_minutes = int(delta.total_seconds() / 60)
            logger.info('Hit the rate limit.')
            logger.info(
                f'Reset at {resettime}. Waiting for {total_minutes} minutes.')

            time.sleep(delta.total_seconds())

            exception = e
        except GithubException as e:
            # Forbidden or not found (Just made private or deleted)
            if e.status == 451 or e.status == 404:
                raise

            # Otherwise abuse detection
            if e.status == 403:
                seconds = randrange(180, 480)
                logger.info('Github abuse detection.')
                logger.info(f'Waiting for {seconds} seconds')
                time.sleep(seconds)

            breadcrumbs.record(
                data={
                    'action': 'Github Exception.',
                    'exception': e
                },
                category='info',
            )

            exception = e
        except timeout as e:
            logger.info('Hit socket timeout waiting 10 secs.')
            breadcrumbs.record(data={'action': 'Socket timeout hit'},
                               category='info')

            time.sleep(10)
            exception = e

    # Every attempt failed; surface the most recent failure.
    raise exception

コード例 #25

0

ファイルを表示

def analyse_punch_card(existing, method, eps=150, min_samples=5):
    """Cluster contributor punch cards and plot one prototype per cluster.

    Args:
        existing: If true, only the already stored intermediate
            ``AnalysisResult`` rows are clustered. Otherwise the punch cards
            are first recomputed by an ``analyse_punchcard`` list manager.
        method: Clustering algorithm: 'dbscan', 'mean-shift' or 'affinity'.
        eps: DBSCAN ``eps`` parameter (only used for 'dbscan').
        min_samples: DBSCAN ``min_samples`` parameter; also passed to
            ``get_mean_center_prototypes``.

    Raises:
        ValueError: If ``method`` is not one of the supported algorithms.
    """
    # Fail before the expensive scan instead of with an unbound
    # `cluster_result` after it.
    if method not in ('dbscan', 'mean-shift', 'affinity'):
        raise ValueError(f'Unknown clustering method: {method}')

    session = new_session()
    try:
        _cluster_punch_cards(session, existing, method, eps, min_samples)
    finally:
        session.close()


def _cluster_punch_cards(session, existing, method, eps, min_samples):
    """Run the punch card analysis of ``analyse_punch_card`` on ``session``."""
    logger.info('Start Scan.')

    # If the only_existing parameter is given, we only work with
    # the existing intermediate AnalysisResults.
    if not existing:
        # Only look at commits of the last year
        time_span = datetime.now() - timedelta(days=365)
        results = session.query(Contributor, func.array_agg(Commit.sha)) \
            .filter(Contributor.login == Contributor.login) \
            .join(Email, Contributor.login == Email.contributor_login) \
            .join(Commit, or_(
                Commit.author_email_address == Email.email,
                Commit.committer_email_address == Email.email,
            )) \
            .filter(Commit.commit_time >= time_span) \
            .group_by(Contributor.login) \
            .all()

        logger.info(f'Scanning {len(results)} contributors.')

        big_contributors = []
        for count, (contributor, commits) in enumerate(results, start=1):
            if 100 < len(commits) < 20000:
                big_contributors.append((contributor, commits))

            # Progress report for large result sets.
            if count % 50000 == 0:
                logger.info(
                    f'Scanned {count} contributors ({len(big_contributors)} big)'
                )

        # Finished searching for contributors with enough commits.
        logger.info(f'Analysing {len(big_contributors)} contributors.')

        # Chunk the contributor list into chunks of 100
        chunks = create_chunks(big_contributors, 100)

        manager = ListManager('analyse_punchcard', chunks)
        manager.start()
        manager.run()

    # Load the stored results of contributors with a relevant commit count.
    # `!= None` is intentional: SQLAlchemy turns it into `IS NOT NULL`.
    analysis_results = session.query(AnalysisResult) \
        .filter(AnalysisResult.intermediate_results != None) \
        .filter(AnalysisResult.commit_count > 100) \
        .filter(AnalysisResult.commit_count < 20000) \
        .options(joinedload('contributor')) \
        .all()

    if existing:
        logger.info(f'Analysing {len(analysis_results)} results.')

    logger.info(f'Using {method} clustering')
    # vectorized_data[i] is the punch card of contributors[i].
    vectorized_data = []
    contributors = []
    for result in analysis_results:
        if 'punchcard' in result.intermediate_results:
            vectorized_data.append(result.intermediate_results['punchcard'])
            contributors.append(result.contributor)

    # Cluster using DBSCAN algorithm
    if method == 'dbscan':
        metric = 'l1'
        cluster_result = DBSCAN(
            eps=eps,
            min_samples=min_samples,
            metric=metric,
            n_jobs=-1,
        ).fit(vectorized_data)

    # Cluster using Mean-Shift algorithm
    elif method == 'mean-shift':
        quantile = 0.1
        n_samples = -1
        logger.info('Computing bandwidth.')
        bandwidth = estimate_bandwidth(
            vectorized_data,
            quantile=quantile,
            n_samples=n_samples,
            n_jobs=-1,
        )
        logger.info('Bandwidth computed.')

        cluster_result = MeanShift(
            bandwidth=bandwidth,
            bin_seeding=True,
            n_jobs=-1,
        ).fit(vectorized_data)

    # Cluster using Affinity Propagation algorithm
    else:
        preference = None
        cluster_result = AffinityPropagation(preference=preference) \
            .fit(vectorized_data)

    # Number of entities per label
    labels = cluster_result.labels_
    unique, counts = np.unique(labels, return_counts=True)
    occurrences = dict(zip(unique, counts))

    # Logins of the contributors grouped by their cluster label.
    contributor_by_label = {}
    for index, label in enumerate(labels):
        contributor_by_label.setdefault(label, []) \
            .append(contributors[index].login)

    # Prepare the plot dir for prototype plotting
    plot_dir = config['plotting']['plot_dir']
    plot_dir = os.path.join(plot_dir, 'analysis', 'analyse_punch', method)
    os.makedirs(plot_dir, exist_ok=True)

    logger.info(f'Found {len(occurrences)}')
    # Get the mean-center prototypes for each label and plot them
    prototypes = get_mean_center_prototypes(cluster_result, vectorized_data,
                                            min_samples)

    logger.info(f'Found {len(prototypes)} valid clusters')
    for label, prototype in prototypes.items():
        if method == 'dbscan':
            name = f'{metric}-{min_samples}-{eps}-{label}'
        else:
            name = f'{label}'

        path = os.path.join(plot_dir, name)
        title = f'Prototype for {name} with {occurrences[label]} elements'
        # The plotter gets no raw commits; its counts are overwritten with the
        # prototype, scaled by 5.
        plotter = CommitPunchcard([], path, title)
        plotter.preprocess()
        plotter.data['count'] = np.array(prototype) * 5
        plotter.plot()

    if method == 'dbscan':
        logger.info(f'DBSCAN with EPS: {eps} and {min_samples} min samples.')
    logger.info('Amount of entities in clusters. -1 is an outlier:')
    logger.info(pformat(occurrences))
    logger.info(pformat(contributor_by_label))
    logger.info(f'{len(analysis_results)} contributers are relevant.')

    if method == 'dbscan':
        core_samples = cluster_result.core_sample_indices_
        logger.info(f'Core samples: {len(core_samples)}')

コード例 #26

0

ファイルを表示

ファイル: travel_path.py プロジェクト: Nukesor/gitalysis

def analyse_travel_path(existing):
    """Evaluate the stored travel path (timezone) analysis results.

    Args:
        existing: If true, only the already stored ``AnalysisResult`` rows are
            evaluated. Otherwise the results are first recomputed by an
            ``analyse_travel_path`` list manager.

    The statistics are logged; a semicolon separated table with one row per
    matched ``timezone_evaluations`` entry is printed to stdout.
    """
    session = new_session()
    try:
        _evaluate_travel_paths(session, existing)
    finally:
        session.close()


def _evaluate_travel_paths(session, existing):
    """Run the travel path evaluation of ``analyse_travel_path`` on ``session``."""
    logger.info('Start Scan.')

    # Look at the last two years
    time_span = datetime.now() - timedelta(days=2 * 365)

    if not existing:
        results = session.query(Contributor, func.array_agg(Commit.sha)) \
            .filter(Contributor.login == Contributor.login) \
            .join(Email, Contributor.login == Email.contributor_login) \
            .join(Commit, or_(
                Commit.author_email_address == Email.email,
                Commit.committer_email_address == Email.email,
            )) \
            .filter(Commit.commit_time >= time_span) \
            .group_by(Contributor.login) \
            .all()

        logger.info(f'Scanning {len(results)} contributors.')

        big_contributors = []
        for count, (contributor, commits) in enumerate(results, start=1):
            if 100 < len(commits) < 20000:
                big_contributors.append((contributor, commits))

            # Progress report for large result sets.
            if count % 5000 == 0:
                logger.info(
                    f'Scanned {count} contributors ({len(big_contributors)} big)'
                )

        # Finished searching for contributors with enough commits.
        logger.info(f'Analysing {len(big_contributors)} contributors.')

        # Chunk the contributor list into chunks of 100
        chunks = create_chunks(big_contributors, 100)

        manager = ListManager('analyse_travel_path', chunks)
        manager.start()
        manager.run()

    # Load the stored results of contributors with a relevant commit count.
    # `!= None` is intentional: SQLAlchemy turns it into `IS NOT NULL`.
    results = session.query(AnalysisResult) \
        .filter(AnalysisResult.timezone_switches != None) \
        .filter(and_(
            AnalysisResult.commit_count != None,
            AnalysisResult.commit_count > 100,
            AnalysisResult.commit_count < 20000,
        )) \
        .options(joinedload('contributor')) \
        .all()

    # Count results by their number of timezone switches.
    changed = 0
    unchanged = 0
    distribution = {}
    for result in results:
        amount = result.timezone_switches
        if amount > 1:
            changed += 1
        else:
            unchanged += 1
        distribution[amount] = distribution.get(amount, 0) + 1

    # Timezone names that are not counted towards the size of a home set.
    ignored_timezones = {
        'GB',
        'WET',
        'MET',
        'CET',
        'EET',
        'NZ',
        'MST7MDT',
        'PST8PDT',
        'CST6CDT',
        'W-SU',
        'ROK',
        'NZ-CHAT',
        'GB-Eire',
        'ROC',
        'EST5EDT',
        'PRC',
    }
    for i in range(0, 16):
        ignored_timezones.add(f'GMT-{i}')
        ignored_timezones.add(f'GMT+{i}')

    correct = 0
    considered_contributors = 0
    survey_results = {}
    detected_timezones = {}
    for result in results:
        contributor = result.contributor
        home_results = result.intermediate_results['home']
        home = set(home_results['set'])
        if 'full_set' in home_results:
            full_set = set(home_results['full_set'])
        else:
            full_set = set()

        if result.different_timezones is not None:
            detected_timezones[result.different_timezones] = \
                detected_timezones.get(result.different_timezones, 0) + 1

        if contributor.location is None:
            continue

        # Only the first evaluation entry matching the location is used.
        for item in timezone_evaluations:
            if not element_in_string(contributor.location, item['search']):
                continue

            survey_string = ', '.join(item['search'])
            entry = survey_results.get(survey_string)
            if entry is None:
                entry = {
                    'set': set(home),
                    'amount': 0,
                    'correct': 0,
                    'timezone_amount': 0,
                    'match': item['timezone'],
                    'full_set': full_set,
                }
                survey_results[survey_string] = entry

            entry['set'] = entry['set'] | home
            entry['amount'] += 1
            entry['timezone_amount'] += len(home - ignored_timezones)
            entry['ratio'] = entry['timezone_amount'] / entry['amount']
            considered_contributors += 1

            if 'full_set' in item:
                entry['full_set'] = entry['full_set'] | full_set

            if item['timezone'] in home:
                correct += 1
                entry['correct'] += 1
            break

    # Avoid a ZeroDivisionError when no contributor matched any evaluation.
    if considered_contributors:
        verified_ratio = correct / considered_contributors
    else:
        verified_ratio = 0.0

    logger.info(f'Looked at {len(results)} contributors.')
    logger.info(f'{len(results)} are relevant.')
    logger.info(f'Detected a change in {changed} of those.')
    logger.info(f'Detected no change in {unchanged} of those.')
    logger.info('Distribution of users by amount of different timezones:')
    logger.info(pformat(distribution))
    logger.info('Distribution of users by amount of detected timezones:')
    logger.info(pformat(detected_timezones))
    logger.info(
        f'Verified contributors {correct} of {considered_contributors}: {verified_ratio}'
    )

    print(
        "Strings query;Considered contributors;Expected timezone;Home location in subset;Mean size of subset;Max size of subset"
    )
    for key, result in survey_results.items():
        print(
            f"{key};{result['amount']};{result['match']};{result['correct']};{result['ratio']:.2f};{len(result['full_set'])}"
        )