def plot_employee(login, repositories, session):
    """Plot all employee related graphs of one contributor for a set of repositories.

    The plots are written to ``<plot_dir>/<lowercased login>/employee``, where
    ``plot_dir`` is read from ``config['plotting']['plot_dir']``.

    Args:
        login: Login of the contributor, matched with ``ilike``.
        repositories: Iterable of repository full names, each matched with ``ilike``.
        session: Database session used for all queries.

    The process is terminated with exit status 1 if no contributor matches
    ``login`` or if no repository matches any of the given names.
    """
    from gitalizer.models import Repository

    plot_dir = config['plotting']['plot_dir']
    path = os.path.join(plot_dir, login.lower(), 'employee')
    os.makedirs(path, exist_ok=True)

    contributor = session.query(Contributor) \
        .filter(Contributor.login.ilike(login)) \
        .one_or_none()
    # Validate the contributor first, there is no point in querying
    # repositories for somebody we do not know.
    if contributor is None:
        logger.info(f'No contributor with name {login}')
        sys.exit(1)

    conditions = [Repository.full_name.ilike(name) for name in repositories]
    # An argument-less `or_()` is deprecated in SQLAlchemy and would not
    # restrict the query, so no names must mean "nothing found".
    found_repositories = []
    if conditions:
        found_repositories = session.query(Repository) \
            .filter(or_(*conditions)) \
            .all()

    if not found_repositories:
        logger.info('No repositories found with these names.')
        sys.exit(1)

    plot_employee_punchcard(contributor, found_repositories, path, session)
    plot_employee_timeline_with_holiday(contributor, found_repositories, path, session)
    plot_employee_missing_time(contributor, found_repositories, path, session)
def complete_repos():
    """Rescan repositories that are unfinished or out of date.

    A repository is selected if it is no fork, not broken, not too big and
    either not completely scanned yet or last updated before the rescan
    threshold (now minus ``config['aggregator']['repository_rescan_interval']``
    seconds). The full names of the selected repositories are handed to a
    ``Manager`` of type ``'github_repository'``.
    """
    logger.info("Get unfinished or out of date repositories.")
    rescan_interval = int(config['aggregator']['repository_rescan_interval'])
    rescan_threshold = datetime.utcnow() - timedelta(seconds=rescan_interval)

    session = new_session()
    try:
        repos = session.query(Repository) \
            .filter(Repository.fork.is_(False)) \
            .filter(Repository.broken.is_(False)) \
            .filter(Repository.too_big.is_(False)) \
            .filter(or_(
                Repository.completely_scanned.is_(False),
                Repository.updated_at <= rescan_threshold,
            )) \
            .all()
        logger.info(f'Found {len(repos)}')
        # Only the plain names are needed from here on.
        repos_to_scan = {repo.full_name for repo in repos}
    finally:
        # Release the session even if the query fails and do not keep it
        # open while the manager is running.
        session.close()

    manager = Manager('github_repository', repos_to_scan)
    manager.start()
    manager.run()
def members(name):
    """Scan the members of the organization called `name`.

    Delegates to `get_github_organization(name, True)`. A CTRL-C is logged
    and terminates the process with exit status 1.
    """
    try:
        get_github_organization(name, True)
    except KeyboardInterrupt:
        # Leave without a traceback when the user aborts the scan.
        logger.info('CTRL-C Exiting Gracefully')
        sys.exit(1)
def get_github_repository_by_owner_name(owner: str, name: str):
    """Scan the repository `<owner>/<name>` and log the outcome.

    The response of `get_github_repository` is expected to contain a
    `'message'` entry, which is logged on info level. If it also contains an
    `'error'` entry, that entry is logged on error level.
    """
    response = get_github_repository(f'{owner}/{name}')
    logger.info(response['message'])

    if 'error' not in response:
        return
    logger.error(response['error'])
def travel(existing):
    """Run the travel path analysis.

    `existing` is passed through to `analyse_travel_path`. A CTRL-C is logged
    and terminates the process with exit status 1.
    """
    try:
        analyse_travel_path(existing)
    except KeyboardInterrupt:
        # Leave without a traceback when the user aborts the analysis.
        logger.info('CTRL-C Exiting Gracefully')
        sys.exit(1)
def get_organization_memberships():
    """Refresh the organization memberships of all known contributors.

    Contributors whose `last_full_scan` is newer than two days are skipped.
    For every other contributor the organizations reported by GitHub are
    appended to `contributor.organizations`, `last_full_scan` is updated and
    the change is committed.
    """
    session = new_session()
    try:
        tz = pytz.timezone('Europe/Berlin')
        # Everything scanned after this point in time is considered fresh.
        rescan_threshold = datetime.now(tz) - timedelta(days=2)

        contributors = session.query(Contributor).all()
        for contributor in contributors:
            # NOTE(review): the threshold is timezone aware while
            # `last_full_scan` is written as naive `datetime.utcnow()` below.
            # This only works if the column returns aware datetimes - confirm.
            if contributor.last_full_scan and contributor.last_full_scan > rescan_threshold:
                continue

            logger.info(
                f'Checking {contributor.login}. {github.github.rate_limiting[0]} remaining.'
            )

            github_user = call_github_function(github.github, 'get_user',
                                               [contributor.login])
            github_orgs = call_github_function(github_user, 'get_orgs')
            for org in github_orgs:
                organization = Organization.get_organization(
                    org.login, org.url, session)
                contributor.organizations.append(organization)

            contributor.last_full_scan = datetime.utcnow()
            session.add(contributor)
            session.commit()
    finally:
        # Always hand the connection back, even if a GitHub call fails.
        session.close()
def from_github(owner, name):
    """Get a github repository by owner and name.

    Args:
        owner: Login of the repository owner.
        name: Name of the repository.

    A CTRL-C is logged and terminates the process with exit status 1.
    """
    try:
        # The parameter is called `name`; the previous body referenced an
        # undefined `repository` here.
        logger.info(f'\n\nGet {name} from user {owner}')
        get_github_repository_by_owner_name(owner, name)
    except KeyboardInterrupt:
        logger.info("CTRL-C Exiting Gracefully")
        sys.exit(1)
def user_for_repositories(login, repositories):
    """Plot the statistics of a user for specific repositories.

    Opens a new database session and passes it together with `login` and
    `repositories` to `plot_employee`. A CTRL-C is logged and terminates the
    process with exit status 1.
    """
    session = None
    try:
        session = new_session()
        plot_employee(login, repositories, session)
    except KeyboardInterrupt:
        logger.info("CTRL-C Exiting Gracefully")
        sys.exit(1)
    finally:
        # Close the session on every exit path, including `sys.exit`.
        if session is not None:
            session.close()
def by_name(name):
    """Get the github organization called `name`.

    Logs the request and delegates to `get_github_organization(name)`.
    A CTRL-C is logged and terminates the process with exit status 1.
    """
    try:
        logger.info(f'\nGet organization {name}')
        get_github_organization(name)
    except KeyboardInterrupt:
        # Leave without a traceback when the user aborts the scan.
        logger.info('CTRL-C Exiting Gracefully')
        sys.exit(1)
def from_github_for_users(full_name):
    """Scan the users of the github repository `full_name`.

    Logs the request and delegates to `get_github_repository_users`.
    A CTRL-C is logged and terminates the process with exit status 1.
    """
    try:
        logger.info(f'\nGet all users from {full_name}')
        get_github_repository_users(full_name)
    except KeyboardInterrupt:
        # Leave without a traceback when the user aborts the scan.
        logger.info('CTRL-C Exiting Gracefully')
        sys.exit(1)
def user(login):
    """Plot all graphs for a specific github user.

    Opens a new database session and passes it together with `login` to
    `plot_user`. A CTRL-C is logged and terminates the process with exit
    status 1.
    """
    session = None
    try:
        session = new_session()
        plot_user(login, session)
    except KeyboardInterrupt:
        logger.info("CTRL-C Exiting Gracefully")
        sys.exit(1)
    finally:
        # Close the session on every exit path, including `sys.exit`.
        if session is not None:
            session.close()
def comparison(users, repos):
    """Plot the statistics of several users for specific repositories.

    Both `users` and `repos` are required. If one of them is empty a message
    is logged and the process terminates with exit status 1. Otherwise a new
    database session is opened and handed to `plot_comparison`.

    A CTRL-C is logged and terminates the process with exit status 1.
    """
    session = None
    try:
        if not users or not repos:
            logger.info("Users and Repos are required parameters.")
            # Previously the comparison was started anyway.
            sys.exit(1)

        session = new_session()
        plot_comparison(users, repos, session)
    except KeyboardInterrupt:
        logger.info("CTRL-C Exiting Gracefully")
        sys.exit(1)
    finally:
        # Close the session on every exit path, including `sys.exit`.
        if session is not None:
            session.close()
def complete():
    """Complete missing data from previous runs.

    Delegates to `complete_data`. According to the original description this
    covers the completion of all unfinished repositories.

    A CTRL-C is logged and terminates the process with exit status 1.
    """
    try:
        complete_data()
    except KeyboardInterrupt:
        # Leave without a traceback when the user aborts the run.
        logger.info('CTRL-C Exiting Gracefully')
        sys.exit(1)
def user_membership():
    """Get all organizations for all users in the current database.

    Only the memberships of the users are scanned, no repositories.
    The work is done by `get_organization_memberships` and is not
    multi processed yet.

    A CTRL-C is logged and terminates the process with exit status 1.
    """
    try:
        get_organization_memberships()
    except KeyboardInterrupt:
        # Leave without a traceback when the user aborts the scan.
        logger.info('CTRL-C Exiting Gracefully')
        sys.exit(1)
def clean_db():
    """Remove the commits of fork repositories and all unattached commits.

    First every fork repository that still has commits gets its commit list
    emptied; the progress is logged and committed every 100 repositories.
    Afterwards all commits without a repository are deleted and the
    session is committed. The session is closed in any case.
    """
    logger.info("Removing commits from fork repositories.".replace('repositories', 'repos'))
    session = new_session()
    try:
        forks_with_commits = session.query(Repository) \
            .filter(Repository.fork.is_(True)) \
            .filter(Repository.commits != None) \
            .options(joinedload(Repository.commits)) \
            .all()  # noqa: E711 - SQLAlchemy needs the `!=` comparison
        logger.info(f'Found {len(forks_with_commits)}')

        for processed, fork in enumerate(forks_with_commits, start=1):
            fork.commits = []
            session.add(fork)
            # Persist in batches of 100 repositories.
            if processed % 100 == 0:
                logger.info(f'Removed {processed}')
                session.commit()

        logger.info("Remove unattached commits")
        orphaned_commits = session.query(Commit) \
            .filter(Commit.repositories == None)  # noqa: E711
        orphaned_commits.delete()
        session.commit()
    finally:
        session.close()
def update(update_all):
    """Update data from previous runs.

    `update_all` is passed through to `update_data`. According to the
    original description the `--update-all` flag triggers a rescan of
    everything, while without it only contributors with more than 100
    commits are scanned, which skips presumably uninteresting peers with
    very few commits. The update covers all contributors.

    A CTRL-C is logged and terminates the process with exit status 1.
    """
    try:
        update_data(update_all)
    except KeyboardInterrupt:
        # Leave without a traceback when the user aborts the update.
        logger.info('CTRL-C Exiting Gracefully')
        sys.exit(1)
def punchcard(existing):
    """Run the punchcard analysis for every configured clustering method.

    Currently only `'affinity'` is enabled. For `'dbscan'` the analysis would
    be repeated for `min_samples` in `range(5, 10, 5)` and `eps` in
    `range(140, 150, 2)`; every other method is analysed once with the
    default parameters of `analyse_punch_card`.

    A CTRL-C is logged and terminates the process with exit status 1.
    """
    try:
        # Other available methods: 'mean-shift', 'dbscan'
        for method in ['affinity']:
            if method != 'dbscan':
                analyse_punch_card(existing, method)
                continue

            # Parameter sweep for DBSCAN.
            for min_samples in range(5, 10, 5):
                for eps in range(140, 150, 2):
                    analyse_punch_card(
                        existing,
                        method,
                        eps=eps,
                        min_samples=min_samples,
                    )
    except KeyboardInterrupt:
        logger.info('CTRL-C Exiting Gracefully')
        sys.exit(1)
def plot_user(login, session):
    """Plot all user related graphs.

    The plots are written to ``<plot_dir>/<lowercased login>``, where
    ``plot_dir`` is read from ``config['plotting']['plot_dir']``.

    Args:
        login: Login of the contributor, matched with ``ilike``.
        session: Database session used for the query and the plots.

    The process is terminated with exit status 1 if no contributor matches
    ``login``.
    """
    plot_dir = config['plotting']['plot_dir']
    user_dir = os.path.join(plot_dir, login.lower())
    # Creates `plot_dir` as well and does not race with parallel runs, in
    # contrast to a separate exists check followed by `mkdir`.
    os.makedirs(user_dir, exist_ok=True)

    contributor = session.query(Contributor) \
        .filter(Contributor.login.ilike(login)) \
        .one_or_none()

    if contributor is None:
        logger.info(f'No contributor with name {login}')
        sys.exit(1)

    plot_user_travel_path(contributor, user_dir, session)
    plot_user_punchcard(contributor, user_dir, session)
def update_contributors(update_all):
    """Rescan contributors that have no location yet.

    Collects every contributor without a location together with the SHAs of
    the commits from the last two years that belong to one of the
    contributor's emails. With `update_all` all of them are scanned,
    otherwise only those with more than 100 and less than 20000 commits.
    The selection is handed to a `ListManager` of type `'github_user'`.
    """
    session = new_session()
    logger.info('Start Scan.')

    # Look at the last two years
    time_span = datetime.now() - timedelta(days=2 * 365)

    commit_of_email = or_(
        Commit.author_email_address == Email.email,
        Commit.committer_email_address == Email.email,
    )
    query = session.query(Contributor, func.array_agg(Commit.sha))
    query = query.filter(Contributor.login == Contributor.login)
    query = query.join(Email, Contributor.login == Email.contributor_login)
    query = query.join(Commit, commit_of_email)
    query = query.filter(Commit.commit_time >= time_span)
    query = query.filter(or_(
        Contributor.location == None,  # noqa: E711
    ))
    results = query.group_by(Contributor.login).all()

    logger.info(f'Scanning {len(results)} contributors.')

    if not update_all:
        contributors_to_scan = []
        for count, (contributor, commits) in enumerate(results, start=1):
            if 100 < len(commits) < 20000:
                contributors_to_scan.append((contributor, commits))

            # Progress information for big result sets.
            if count % 5000 == 0:
                logger.info(
                    f'Found {count} contributors ({len(contributors_to_scan)} big)'
                )
    else:
        contributors_to_scan = results
        logger.info(f'Scanning {len(contributors_to_scan)} contributors')

    manager = ListManager('github_user', contributors_to_scan)
    manager.start()
    manager.run()
def user(login, with_followers):
    """Get all repositories for a specific github user.

    With `with_followers` the scan is delegated to `get_user_with_followers`,
    otherwise to `get_user`. According to the original description the
    follower variant scans the user and ALL followers/following, which
    gives a better coverage but takes significantly longer and needs
    significantly more disk space.

    A CTRL-C is logged and terminates the process with exit status 1.
    """
    try:
        if not with_followers:
            logger.info(f'\n\nGet user {login}')
            get_user(login)
        else:
            logger.info(f'\n\nGet friends of user {login}')
            get_user_with_followers(login)
    except KeyboardInterrupt:
        logger.info('CTRL-C Exiting Gracefully')
        sys.exit(1)
def run(self):
    """Collect the worker responses until every task is finished.

    Puts `consumer_count + 1` `None` entries (poison pills) on the task
    queue, then reads one result per task from the result queue. Every
    result is stored in `self.results` and its message is logged. Results
    with an `'error'` entry are logged, the `'tasks'` of all other results
    are forwarded to the sub manager, if there is one. Finally the sub
    manager is started and run.
    """
    if self.sub_manager is not None:
        logger.info('Start sub manager.')

    # Poison pills for the consumers
    logger.info('Add poison pills.')
    pill_count = self.consumer_count + 1
    for _ in range(pill_count):
        self.task_queue.put(None)

    logger.info(f'Processing {len(self.tasks)} tasks')

    finished_tasks = 0
    while finished_tasks < len(self.tasks):
        logger.info(f'Waiting: {finished_tasks} of {len(self.tasks)}')
        response = self.result_queue.get()
        self.results.append(response)
        logger.info(response['message'])

        if 'error' in response:
            logger.info('Encountered an error:')
            logger.info(response['error'])
        else:
            if self.sub_manager is not None:
                self.sub_manager.add_tasks(response['tasks'])

        finished_tasks += 1

    # Every sub task has been added by now, wait for them to finish.
    if self.sub_manager is not None:
        self.sub_manager.start()
        self.sub_manager.run()
def _prefetch_exceeds_limit(paginated, kind, user_login, skip):
    """Fetch everything of a paginated repository list and check its size.

    `_grow` is called via `call_github_function` as long as the list can
    grow. Returns True as soon as `skip` is set and the number of `_grow`
    calls exceeds `config['aggregator']['max_repositories_for_user']`,
    otherwise False once the list is completely fetched.
    """
    fetched = 0
    while paginated._couldGrow():
        fetched += 1
        call_github_function(paginated, '_grow')

        # Debug messages to see that the repositories are still collected.
        if fetched % 100 == 0:
            logger.info(f'{fetched} {kind} repos for user {user_login}.')

        # The user is too big. Just drop him.
        if skip and fetched > int(
                config['aggregator']['max_repositories_for_user']):
            return True

    return False


def get_user_repos(user_login: str, skip=True):
    """Collect the repositories that should be scanned for a single user.

    Args:
        user_login: Login of the github user.
        skip: If truthy, users with more owned or starred repositories than
            configured are flagged as too big and skipped.

    Returns:
        The result of `user_up_to_date_message` / `user_too_big_message` if
        the user is not scanned. Otherwise a dict with a `'message'` and the
        list of repository full names in `'tasks'`. If anything fails, a dict
        with a `'message'` and the formatted traceback in `'error'`.
    """
    # Defined up front, `finally` must not fail if `new_session` raises.
    session = None
    try:
        session = new_session()
        contributor = Contributor.get_contributor(user_login, session, True)
        # Checks for already scanned users.
        if not contributor.should_scan():
            return user_up_to_date_message(user_login)
        if contributor.too_big:
            return user_too_big_message(user_login)

        user = call_github_function(github.github, 'get_user', [user_login])
        owned = user.get_repos()
        starred = user.get_starred()
        repos_to_scan = set()

        # Prefetch all owned and starred repositories. The starred ones are
        # not fetched anymore if the owned ones already exceed the limit.
        user_too_big = (
            _prefetch_exceeds_limit(owned, 'owned', user_login, skip)
            or _prefetch_exceeds_limit(starred, 'starred', user_login, skip)
        )

        # User has too many repositories. Flag him and return
        if user_too_big:
            contributor.too_big = True
            sentry.captureMessage(
                'User too big',
                extra={'url': contributor.login},
                level='info',
                tags={
                    'type': 'too_big',
                    'entity': 'user'
                },
            )
            session.add(contributor)
            session.commit()
            return user_too_big_message(user_login)

        # Check own repositories. We assume that we are collaborating in those
        for github_repo in owned:
            repository = Repository.get_or_create(
                session,
                github_repo.ssh_url,
                name=github_repo.name,
                full_name=github_repo.full_name,
            )

            if github_repo.fork and not repository.is_invalid():
                check_fork(github_repo, session, repository, repos_to_scan,
                           user_login)
            session.add(repository)

            if not repository.should_scan():
                continue

            session.commit()
            repos_to_scan.add(github_repo.full_name)

        # Check stars and if the user collaborated to them.
        for github_repo in starred:
            repository = Repository.get_or_create(
                session,
                github_repo.ssh_url,
                name=github_repo.name,
                full_name=github_repo.full_name,
            )

            if github_repo.fork and not repository.is_invalid():
                check_fork(github_repo, session, repository, repos_to_scan,
                           user_login)
            session.add(repository)

            if not repository.should_scan():
                continue

            repos_to_scan.add(github_repo.full_name)

        session.commit()

        rate = github.github.get_rate_limit().core
        message = f'Got repositories for {user.login}. '
        message += f'{user.login}. {rate.remaining} of 5000 remaining.'

        response = {
            'message': message,
            'tasks': list(repos_to_scan),
        }
    except BaseException:
        # Catch any exception and report it, as we won't get any information
        # due to threading otherwise.
        sentry.captureException()
        response = {
            'message': f'Error while getting repos for {user_login}:\n',
            'error': traceback.format_exc(),
        }
    finally:
        if session is not None:
            session.close()

    return response
def _wait_for_rate_limit_reset():
    """Sleep until the github core rate limit is reset, plus two minutes."""
    # NOTE(review): assumes `core.reset` is a naive UTC datetime, as reported
    # by PyGithub 1.x - confirm for the installed version. Both sides of the
    # comparison have to use the same clock; mixing local time and UTC led to
    # waiting times that were off by the UTC offset or even negative.
    now = datetime.utcnow()
    resettime = github.github.get_rate_limit().core.reset
    if resettime < now:
        resettime = now
    delta = resettime - now
    delta += timedelta(minutes=2)
    total_minutes = int(delta.total_seconds() / 60)
    logger.info('Hit the rate limit.')
    logger.info(
        f'Reset at {resettime}. Waiting for {total_minutes} minutes.')
    time.sleep(delta.total_seconds())


def _retry_github_access(access):
    """Run `access()` and retry it up to five times on github hiccups.

    Rate limiting, abuse detection (status 403), other github exceptions and
    socket timeouts lead to a retry. Status 404 and 451 are raised directly.
    If all six attempts fail, the exception of the last attempt is raised.
    """
    tries = 5
    exception = None
    for _ in range(tries + 1):
        try:
            return access()
        except RateLimitExceededException as e:
            # Wait until the rate limiting is reset
            _wait_for_rate_limit_reset()
            exception = e
        except GithubException as e:
            # Forbidden or not found (Just made private or deleted)
            if e.status == 451 or e.status == 404:
                raise
            # Otherwise abuse detection
            if e.status == 403:
                seconds = randrange(180, 480)
                logger.info('Github abuse detection.')
                logger.info(f'Waiting for {seconds} seconds')
                time.sleep(seconds)

            breadcrumbs.record(
                data={
                    'action': 'Github Exception.',
                    'exception': e
                },
                category='info',
            )
            exception = e
        except timeout as e:
            logger.info('Hit socket timeout waiting 10 secs.')
            breadcrumbs.record(data={'action': 'Socket timeout hit'},
                               category='info')
            time.sleep(10)
            exception = e

    raise exception


def call_github_function(github_object: object,
                         function_name: str,
                         args: list = None,
                         kwargs: dict = None):
    """Call a pygithub object member function.

    We need to handle those calls in case we get rate limited.

    Args:
        github_object: Object that owns the function.
        function_name: Name of the member function to call.
        args: Positional arguments for the call, defaults to none.
        kwargs: Keyword arguments for the call, defaults to none.

    Returns:
        Whatever the called function returns.

    Raises:
        The exception of the last attempt if every retry failed, or directly
        a `GithubException` with status 404 or 451.
    """
    args = args or []
    kwargs = kwargs or {}

    # The attribute lookup is part of every attempt, as it has been before.
    return _retry_github_access(
        lambda: getattr(github_object, function_name)(*args, **kwargs))


def get_github_object(github_object: object, object_name: str):
    """Get a pygithub object.

    As pygithub sometimes implicitly queries the github api on a class member
    access, we need to handle those accesses in case we get rate limited.

    Args:
        github_object: Object that owns the attribute.
        object_name: Name of the attribute to read.

    Returns:
        The value of the attribute.

    Raises:
        The exception of the last attempt if every retry failed, or directly
        a `GithubException` with status 404 or 451.
    """
    return _retry_github_access(lambda: getattr(github_object, object_name))
def _cluster_punchcards(method, vectorized_data, eps, min_samples, metric):
    """Fit the clustering algorithm selected by `method` and return it."""
    # Cluster using DBSCAN algorithm
    if method == 'dbscan':
        return DBSCAN(
            eps=eps,
            min_samples=min_samples,
            metric=metric,
            n_jobs=-1,
        ).fit(vectorized_data)

    # Cluster using Mean-Shift algorithm
    if method == 'mean-shift':
        logger.info('Computing bandwidth.')
        bandwidth = estimate_bandwidth(
            vectorized_data,
            quantile=0.1,
            n_samples=-1,
            n_jobs=-1,
        )
        logger.info('Bandwidth computed.')
        return MeanShift(
            bandwidth=bandwidth,
            bin_seeding=True,
            n_jobs=-1,
        ).fit(vectorized_data)

    # Cluster using Affinity Propagation algorithm
    if method == 'affinity':
        return AffinityPropagation(preference=None).fit(vectorized_data)

    raise ValueError(f'Unsupported clustering method: {method}')


def analyse_punch_card(existing, method, eps=150, min_samples=5):
    """Cluster contributors by their punchcard and plot the cluster prototypes.

    Args:
        existing: If truthy, only the already stored intermediate
            `AnalysisResult`s are used. Otherwise the punchcards of all
            contributors with more than 100 and less than 20000 commits
            during the last year are computed first by a `ListManager` of
            type `'analyse_punchcard'`.
        method: Clustering method, one of `'dbscan'`, `'mean-shift'` or
            `'affinity'`.
        eps: `eps` parameter for DBSCAN.
        min_samples: `min_samples` parameter for DBSCAN, also passed to
            `get_mean_center_prototypes`.

    Raises:
        ValueError: If `method` is not one of the supported methods.
    """
    # Fail before any expensive work is done. An unknown method previously
    # surfaced as an UnboundLocalError after the whole scan.
    if method not in ('dbscan', 'mean-shift', 'affinity'):
        raise ValueError(f'Unsupported clustering method: {method}')

    # Distance metric used by DBSCAN, also part of the plot names.
    metric = 'l1'

    session = new_session()
    try:
        logger.info('Start Scan.')

        # If the existing parameter is given, we only work with
        # the existing intermediate AnalysisResults.
        if not existing:
            # Only look at commits of the last year
            time_span = datetime.now() - timedelta(days=365)
            results = session.query(Contributor, func.array_agg(Commit.sha)) \
                .filter(Contributor.login == Contributor.login) \
                .join(Email, Contributor.login == Email.contributor_login) \
                .join(Commit, or_(
                    Commit.author_email_address == Email.email,
                    Commit.committer_email_address == Email.email,
                )) \
                .filter(Commit.commit_time >= time_span) \
                .group_by(Contributor.login) \
                .all()

            logger.info(f'Scanning {len(results)} contributors.')

            count = 0
            big_contributors = []
            for contributor, commits in results:
                if len(commits) > 100 and len(commits) < 20000:
                    big_contributors.append((contributor, commits))

                count += 1
                if count % 50000 == 0:
                    logger.info(
                        f'Scanned {count} contributors ({len(big_contributors)} big)'
                    )

            # Finished searching for contributors with enough commits.
            logger.info(f'Analysing {len(big_contributors)} contributors.')

            # Chunk the contributor list into chunks of 100
            chunks = create_chunks(big_contributors, 100)

            manager = ListManager('analyse_punchcard', chunks)
            manager.start()
            manager.run()

        # Load all results that have intermediate data and a relevant
        # amount of commits.
        analysis_results = session.query(AnalysisResult) \
            .filter(AnalysisResult.intermediate_results != None) \
            .filter(AnalysisResult.commit_count > 100) \
            .filter(AnalysisResult.commit_count < 20000) \
            .options(joinedload('contributor')) \
            .all()  # noqa: E711 - SQLAlchemy needs the `!=` comparison

        if existing:
            logger.info(f'Analysing {len(analysis_results)} results.')

        logger.info(f'Using {method} clustering')

        # Both lists share their indices, the cluster labels are mapped back
        # to the contributors through them.
        vectorized_data = []
        contributors = []
        for result in analysis_results:
            if 'punchcard' in result.intermediate_results:
                vectorized_data.append(result.intermediate_results['punchcard'])
                contributors.append(result.contributor)

        cluster_result = _cluster_punchcards(
            method, vectorized_data, eps, min_samples, metric)

        # Number of entities per label
        labels = cluster_result.labels_
        unique, counts = np.unique(labels, return_counts=True)
        occurrences = dict(zip(unique, counts))

        contributor_by_label = {}
        for index, label in enumerate(labels):
            contributor_by_label.setdefault(label, []) \
                .append(contributors[index].login)

        # Prepare the plot dir for prototype plotting
        plot_dir = config['plotting']['plot_dir']
        plot_dir = os.path.join(plot_dir, 'analysis', 'analyse_punch', method)
        os.makedirs(plot_dir, exist_ok=True)

        logger.info(f'Found {len(occurrences)}')
        # Get the mean-center prototypes for each label and plot them
        prototypes = get_mean_center_prototypes(cluster_result, vectorized_data,
                                                min_samples)

        logger.info(f'Found {len(prototypes)} valid clusters')
        for label, prototype in prototypes.items():
            if method == 'dbscan':
                name = f'{metric}-{min_samples}-{eps}-{label}'
            else:
                name = f'{label}'
            path = os.path.join(plot_dir, name)
            title = f'Prototype for {name} with {occurrences[label]} elements'
            plotter = CommitPunchcard([], path, title)
            plotter.preprocess()
            plotter.data['count'] = np.array(prototype) * 5
            plotter.plot()

        if method == 'dbscan':
            logger.info(f'DBSCAN with EPS: {eps} and {min_samples} min samples.')
        logger.info('Amount of entities in clusters. -1 is an outlier:')
        logger.info(pformat(occurrences))
        logger.info(pformat(contributor_by_label))

        logger.info(f'{len(analysis_results)} contributers are relevant.')
        if method == 'dbscan':
            core_samples = cluster_result.core_sample_indices_
            logger.info(f'Core samples: {len(core_samples)}')
    finally:
        # The session is only needed inside of this function.
        session.close()

    return
def analyse_travel_path(existing):
    """Evaluate the detected home timezones of contributors.

    Args:
        existing: If truthy, only the already stored `AnalysisResult`s are
            used. Otherwise the travel paths of all contributors with more
            than 100 and less than 20000 commits during the last two years
            are computed first by a `ListManager` of type
            `'analyse_travel_path'`.

    The distribution of timezone switches and detected timezones is logged.
    For contributors whose location matches an entry of
    `timezone_evaluations`, the detected home timezones are compared with the
    expected timezone of that entry. The per entry statistics are printed as
    semicolon separated lines.
    """
    session = new_session()
    try:
        logger.info('Start Scan.')

        # Look at the last two years
        time_span = datetime.now() - timedelta(days=2 * 365)

        if not existing:
            results = session.query(Contributor, func.array_agg(Commit.sha)) \
                .filter(Contributor.login == Contributor.login) \
                .join(Email, Contributor.login == Email.contributor_login) \
                .join(Commit, or_(
                    Commit.author_email_address == Email.email,
                    Commit.committer_email_address == Email.email,
                )) \
                .filter(Commit.commit_time >= time_span) \
                .group_by(Contributor.login) \
                .all()

            logger.info(f'Scanning {len(results)} contributors.')

            count = 0
            big_contributors = []
            for contributor, commits in results:
                if len(commits) > 100 and len(commits) < 20000:
                    big_contributors.append((contributor, commits))

                count += 1
                if count % 5000 == 0:
                    logger.info(
                        f'Scanned {count} contributors ({len(big_contributors)} big)'
                    )

            # Finished searching for contributors with enough commits.
            logger.info(f'Analysing {len(big_contributors)} contributors.')

            # Chunk the contributor list into chunks of 100
            chunks = create_chunks(big_contributors, 100)

            manager = ListManager('analyse_travel_path', chunks)
            manager.start()
            manager.run()

        # Load all results with timezone information and a relevant amount
        # of commits.
        results = session.query(AnalysisResult) \
            .filter(AnalysisResult.timezone_switches != None) \
            .filter(and_(
                AnalysisResult.commit_count != None,
                AnalysisResult.commit_count > 100,
                AnalysisResult.commit_count < 20000,
            )) \
            .options(joinedload('contributor')) \
            .all()  # noqa: E711 - SQLAlchemy needs the `!=` comparison

        changed = 0
        unchanged = 0
        distribution = {}
        for result in results:
            amount = result.timezone_switches
            if amount > 1:
                changed += 1
            else:
                unchanged += 1

            distribution[amount] = distribution.get(amount, 0) + 1

        # Timezone names that are not counted when the size of a home
        # timezone set is measured.
        ignored_timezones = {
            'GB', 'WET', 'MET', 'CET', 'EET', 'NZ', 'MST7MDT', 'PST8PDT',
            'CST6CDT', 'W-SU', 'ROK', 'NZ-CHAT', 'GB-Eire', 'ROC', 'EST5EDT',
            'PRC',
        }
        for i in range(0, 16):
            ignored_timezones.add(f'GMT-{i}')
            ignored_timezones.add(f'GMT+{i}')

        correct = 0
        considered_contributors = 0
        survey_results = {}
        detected_timezones = {}
        for result in results:
            contributor = result.contributor
            home = set(result.intermediate_results['home']['set'])
            if 'full_set' in result.intermediate_results['home']:
                full_set = set(result.intermediate_results['home']['full_set'])
            else:
                full_set = set()

            if result.different_timezones is not None:
                if result.different_timezones not in detected_timezones:
                    detected_timezones[result.different_timezones] = 0
                detected_timezones[result.different_timezones] += 1

            if contributor.location is None:
                continue

            for item in timezone_evaluations:
                if element_in_string(contributor.location, item['search']):
                    survey_string = ', '.join(item['search'])
                    entry = survey_results.get(survey_string)
                    if entry is None:
                        entry = {
                            'set': set(home),
                            'amount': 0,
                            'correct': 0,
                            'timezone_amount': 0,
                            'match': item['timezone'],
                            'full_set': full_set,
                        }
                        survey_results[survey_string] = entry

                    entry['set'] = entry['set'] | home
                    entry['amount'] += 1
                    entry['timezone_amount'] += len(home - ignored_timezones)
                    entry['ratio'] = entry['timezone_amount'] / entry['amount']
                    considered_contributors += 1

                    # NOTE(review): this checks the evaluation item, not the
                    # intermediate results - confirm that this is intended.
                    if 'full_set' in item:
                        entry['full_set'] = entry['full_set'] | full_set

                    # Debug stuff
                    if 'roflcopter' == survey_string:
                        print(home)

                    if item['timezone'] in home:
                        correct += 1
                        entry['correct'] += 1
                    # Only the first matching evaluation item is considered.
                    break

        logger.info(f'Looked at {len(results)} contributors.')
        logger.info(f'{len(results)} are relevant.')
        logger.info(f'Detected a change in {changed} of those.')
        logger.info(f'Detected no change in {unchanged} of those.')
        logger.info('Distribution of users by amount of different timezones:')
        logger.info(pformat(distribution))
        logger.info('Distribution of users by amount of detected timezones:')
        logger.info(pformat(detected_timezones))

        # Nothing to verify must not crash the whole analysis.
        if considered_contributors:
            verified_ratio = correct / considered_contributors
        else:
            verified_ratio = 0.0
        logger.info(
            f'Verified contributors {correct} of {considered_contributors}: {verified_ratio}'
        )

        print(
            "Strings query;Considered contributors;Expected timezone;Home location in subset;Mean size of subset;Max size of subset"
        )
        for key, result in survey_results.items():
            print(
                f"{key};{result['amount']};{result['match']};{result['correct']};{result['ratio']:.2f};{len(result['full_set'])}"
            )
    finally:
        # The session is only needed inside of this function.
        session.close()

    return