def add_personal_token(request):
    """Store a personal GitHub access token for the logged-in user.

    On POST, the submitted token is validated by opening a GitHub
    connection and reading the authenticated user's login; an invalid
    token re-renders the form with an error. On success the token row is
    saved and linked to ``request.user``.
    """
    message = ''
    if request.method == 'POST':
        form = PersonalTokenForm(request.POST)
        if form.is_valid():
            try:
                # Probe the token: fetching the authenticated user's login
                # fails fast when the token is invalid or revoked.
                g = GithubApi(token=form.cleaned_data['access_token'])
                g.get_connection().get_user().login
            except Exception:  # was a bare except; keep broad but explicit
                error = 'Wrong access token!'
                return render(request, 'add_personal_token.html', {
                    'form': form,
                    'error': error
                })
            token = form.save(commit=False)
            token.access_token = form.cleaned_data.get('access_token')
            token.save()
            # M2M link requires the row to exist, so it follows save().
            token.user.add(request.user)
            message = 'Your token has been successfully added'
    else:
        form = PersonalTokenForm()
    return render(request, 'add_personal_token.html', {
        'form': form,
        'message': message
    })
def update_personal_token(request):
    """Replace the logged-in user's stored GitHub access token.

    On POST, the new token is validated against GitHub before every
    ``GitAuthentication`` row for the user is updated in place.
    """
    message = ''
    current_user = request.user
    if request.method == 'POST':
        form = PersonalTokenForm(request.POST)
        if form.is_valid():
            try:
                # Probe the token by fetching the authenticated user's
                # login; raises when the token is invalid.
                g = GithubApi(token=form.cleaned_data['access_token'])
                g.get_connection().get_user().login
            except Exception:  # was a bare except; keep broad but explicit
                error = 'Wrong access token!'
                # Fixed: error path previously rendered
                # 'add_personal_token.html' although this view's template
                # is 'update_token.html' (copy-paste from the add view).
                return render(request, 'update_token.html', {
                    'form': form,
                    'error': error
                })
            GitAuthentication.objects.filter(user=current_user).update(
                access_token=form.cleaned_data['access_token'])
            message = 'Your token has been successfully updated'
    else:
        form = PersonalTokenForm()
    return render(request, 'update_token.html', {
        'form': form,
        'message': message
    })
def add_new_issue(request, pk):
    """Create a GitHub issue on the repo identified by ``pk``.

    On POST, the issue is first created remotely via the GitHub API and
    then mirrored into the local ``Issue`` model. Connection failures
    re-render the form with an error instead of saving a local copy.
    """
    message = ''
    repo = get_object_or_404(Repo, pk=pk)
    if request.method == "POST":
        form = IssueForm(request.POST)
        if form.is_valid():
            # NOTE(review): .first() returns None when the user has no
            # stored token, which would raise AttributeError here —
            # confirm callers always have a GitAuthentication row.
            token = GitAuthentication.objects.filter(
                user=request.user).first().access_token
            g = GithubApi(token=token)
            try:
                g.create_new_issue(repo.name,
                                   form.cleaned_data.get('title'),
                                   form.cleaned_data.get('body'),
                                   form.cleaned_data.get('label'),
                                   form.cleaned_data.get('milestone'))
            except Exception:  # was a bare except; keep broad but explicit
                return render(request, 'add_issue.html', {
                    'form': form,
                    'error': 'Problem with connection'
                })
            # Only persist locally once the remote creation succeeded.
            issue = form.save(commit=False)
            issue.title = form.cleaned_data.get('title')
            issue.body = form.cleaned_data.get('body')
            issue.label = form.cleaned_data.get('label')
            issue.milestone = form.cleaned_data.get('milestone')
            issue.repo = repo
            issue.save()
            message = 'Your issue has been successfully added'
    else:
        form = IssueForm()
    return render(request, 'add_issue.html', {
        'form': form,
        'message': message
    })
def __init__(self, token, project_info):
    """Set up the GitHub client and per-instance caches.

    Input
    - token: used to access github
    - project_info: course project data
    """
    super(GithubAnalyzer, self).__init__()
    # Caches start as False, meaning "not loaded yet".
    self.commit_cache = False
    self.project_cache = False
    self.projects = project_info
    self.client = GithubApi(token)
def scrum_board(request, pk):
    """Render the scrum board for repo ``pk``: local issues grouped by
    the labels fetched from GitHub."""
    repo = get_object_or_404(Repo, pk=pk)
    token = GitAuthentication.objects.filter(
        user=request.user).first().access_token
    api = GithubApi(token=token)
    labels = api.get_all_labels_in_repo(repo.name)
    # One column per label, filled from the locally mirrored issues.
    issues_by_label = {
        label: list(Issue.objects.filter(repo_id=pk, label=label).all())
        for label in labels
    }
    return render(request, 'scrum_board.html', {
        'issues': issues_by_label,
        'labels': labels
    })
def refresh_data(request, pk):
    """Re-sync local branches and issues of repo ``pk`` from GitHub,
    then redirect to the home page."""
    repo = get_object_or_404(Repo, pk=pk)
    token = GitAuthentication.objects.filter(
        user=request.user).first().access_token
    api = GithubApi(token=token)
    remote_branches = api.get_names_of_branch(repo.name)
    remote_issues = api.get_all_issues(repo.name)
    # Wipe the local mirror before re-importing the remote state.
    Branch.objects.filter(repo=repo).delete()
    Issue.objects.filter(repo=repo).delete()
    for branch_name in remote_branches:
        add_branch(branch_name, pk)
    for issue in remote_issues:
        add_issue(issue, repo)
    return redirect('home')
def __init__(self, project_info):
    """Load API tokens and prepare the output directory.

    Tokens are read from conf/tokens.json file.
    Input
    - project_info: dataset contains all course projects
    """
    super(ProcessMiningAnalyzer, self).__init__()
    self.project_info = project_info
    with open('conf/tokens.json', 'r') as f_in:
        self.token = json.load(f_in)
    github_token = self.token['github']['token']
    self.gt_analyzer = GithubAnalyzer(github_token, self.project_info)
    self.gt_client = GithubApi(github_token)
    # out_header starts as the bare directory name, then becomes the
    # full results path once the directory is guaranteed to exist.
    self.out_header = 'process_mining'
    if self.out_header not in os.listdir('results'):
        os.mkdir('results/{}'.format(self.out_header))
    self.out_header = 'results/{}'.format(self.out_header)
def add_new_repo(request):
    """Add a GitHub repo to the user's favourites.

    On POST, verifies the repo exists on GitHub and is not already
    tracked locally, saves it, links it to the user, and triggers an
    initial data sync via ``refresh_data``.
    """
    success = ''
    if request.method == "POST":
        form = RepoForm(request.POST)
        if form.is_valid():
            current_user = request.user
            # NOTE(review): .first() returns None when the user has no
            # stored token, which would raise AttributeError here —
            # confirm callers always have a GitAuthentication row.
            token = GitAuthentication.objects.filter(
                user=current_user).first().access_token
            g = GithubApi(token=token)
            try:
                g.get_repo_by_name(form.cleaned_data['name'])
            except Exception:  # was a bare except; keep broad but explicit
                error = "This repo doesn't exist"
                return render(request, 'add_new_repo.html', {
                    'form': form,
                    'error': error
                })
            # .exists() avoids fetching rows just to test for presence.
            if Repo.objects.filter(
                    name=form.cleaned_data.get('name')).exists():
                error = 'This repo has already been in your favourites'
                return render(request, 'add_new_repo.html', {
                    'form': form,
                    'error': error
                })
            repo = form.save(commit=False)
            repo.name = form.cleaned_data.get('name')
            repo.save()
            repo.user.add(current_user)
            # Pull branches and issues for the freshly added repo.
            refresh_data(request, repo.id)
            success = 'Repo has been successfully added'
    else:
        form = RepoForm()
    return render(request, 'add_new_repo.html', {
        'form': form,
        'success': success
    })
class GithubAnalyzer(object):
    """Provide functions to get github statistics and visualize results.

    Commits are fetched through a ``GithubApi`` client and cached as JSON
    files under cache/; plots are written under results/.
    """

    def __init__(self, token, project_info):
        """
        Input
        - token: used to access github
        - project_info: course project data
        """
        super(GithubAnalyzer, self).__init__()
        self.client = GithubApi(token)
        self.projects = project_info
        # False means "not loaded yet"; get_commit() lazily fills
        # commit_cache from cache/sha2commit_new.json.
        self.commit_cache = False
        self.project_cache = False

    def commits(self, reload=False):
        """
        Get all commits of all projects.
        If reload=True, use cached results. Otherwise get data from APIs
        and cache the result.
        Output
        - a list of lists, each list contains all commits of a
          repository(project).
        """
        if reload:
            # Serve straight from the JSON cache, skipping the API.
            with open('cache/commits.json', 'r') as f_in:
                return json.load(f_in)
        lstCommits = []
        for project in self.projects:
            owner, repo = project['repo']['owner'], project['repo']['repo']
            commits = self.client.get_commits(owner, repo)
            lstCommits.append(commits)
        # Refresh the cache for subsequent reload=True calls.
        with open('cache/commits.json', 'w') as f_out:
            json.dump(lstCommits, f_out, sort_keys=True, indent=4,
                      separators=(',', ': '))
        return lstCommits

    def user_commits(self, reload=True):
        """
        Get a dictionary with user ID as key and a list of commits as
        value. self.commits will be called with the input reload.
        WARNING: assume a user map cache file
        cache/new_user_mapping.json exists
        Output
        - a dictionary between user ID and list of commits of the user
        """
        all_commits = self.commits(reload)
        with open('cache/new_user_mapping.json', 'r') as f_in:
            user_map = json.load(f_in)
        user_commits = defaultdict(lambda: [])
        for repo in all_commits:
            for commit in repo:
                user = commit['commit']['author']['name']
                # Commits by authors missing from the map are dropped.
                if user in user_map:
                    user_commits[user_map[user]].append(commit)
        return user_commits

    def get_commit(self, sha):
        """
        Return the information of a single commit. It assumes the cache
        file cache/sha2commit_new.json exists.
        Output
        - a dictionary of commit information, or False if the given
          commit can't be found
        """
        if not self.commit_cache:
            # Lazy load: the cache file is read at most once.
            with open('cache/sha2commit_new.json', 'r') as f_in:
                self.commit_cache = json.load(f_in)
        if sha in self.commit_cache:
            return self.commit_cache[sha]
        else:
            return False

    def cache_commits(self, fix=True):
        '''
        Cache all commits. This will take about two hours for cs169
        fall 2016. Fix is used for fixing some error messages. This
        function should be called only once. It will generate
        cache/sha2commit_new.json file.
        TODO: FUNCTION NEEDS REFACTOR.
        '''
        import time
        if fix:
            # Fix mode: re-fetch only entries that contain a 'message'
            # key (presumably an API error payload — TODO confirm).
            print('Fix mode')
            with open('cache/sha2commit.json', 'r') as f_in:
                sha2commit = json.load(f_in)
            all_commits = self.commits(reload=True)
            for index, commits in enumerate(all_commits):
                owner, repo = self.projects[index]['repo'][
                    'owner'], self.projects[index]['repo']['repo']
                for commit in commits:
                    info = sha2commit[commit['sha']]
                    if 'message' in info:
                        sha2commit[commit['sha']] = self.client.get_commit(
                            owner, repo, commit['sha'])
                        # Throttle requests to stay under rate limits.
                        time.sleep(0.1)
            with open('cache/sha2commit_new.json', 'w') as f_out:
                json.dump(sha2commit, f_out)
            return
        # Cache mode: fetch every commit from scratch.
        print('Cache mode')
        dictSha2Commit = {}
        all_commits = self.commits(reload=True)
        for index, commits in enumerate(all_commits):
            owner, repo = self.projects[index]['repo']['owner'], self.projects[
                index]['repo']['repo']
            for commit in commits:
                dictSha2Commit[commit['sha']] = self.client.get_commit(
                    owner, repo, commit['sha'])
                # Throttle requests to stay under rate limits.
                time.sleep(0.01)
        with open('cache/sha2commit.json', 'w') as f_out:
            json.dump(dictSha2Commit, f_out, sort_keys=True, indent=4,
                      separators=(',', ': '))

    def commits_plot(self):
        """
        Plot
        - a histogram of number of commits per project
        """
        lstCommits = self.commits(reload=True)
        plotdata = pd.DataFrame({
            'x': np.arange(len(lstCommits)),
            'y': [len(item) for item in lstCommits]
        })
        fig, ax = plt.subplots()
        sns.barplot('x', 'y', data=plotdata)
        plt.savefig('results/hist_num_commits.png')
        plt.close(fig)

    def commmits_per_student_plot(self):
        """
        Plot
        - Histogram of number of commits per student.

        NOTE(review): method name carries a typo ('commmits'); kept for
        caller compatibility.
        """
        lstCommits = self.commits(reload=True)
        dictStu2Commits = defaultdict(lambda: 0)
        for proj_commit in lstCommits:
            for commit in proj_commit:
                dictStu2Commits[commit['commit']['author']['name']] += 1
        fig, ax = plt.subplots()
        sns.distplot([v for _, v in dictStu2Commits.items()])
        plt.savefig('results/num_commits_per_student.png')
        plt.close(fig)

    def generate_user_map(self):
        """
        Generate a user mapping which maps a github username to the User
        ID. It will go through all usernames and compare it with
        existing usernames from pivotal tracker and student info.
        Candidates are listed based on edit distance. Input 'Y' will
        link the username to the User ID of the candidate. Input 'S'
        will skip all candidates. It will generate
        cache/new_user_mapping.json. It assumes
        cache/user_mapping.json exists, which is a mapp from tracker
        users and students to user ID.
        TODO: NEED REFACTOR
        """
        with open('cache/user_mapping.json', 'r') as f_in:
            user_map = json.load(f_in)
        self.student_list = user_map.keys()
        lstCommits = self.commits(reload=True)
        setStudents = set()
        for proj_commit in lstCommits:
            for commit in proj_commit:
                setStudents.add(commit['commit']['author']['name'])
        counter = 0
        for student in setStudents:
            # Already-mapped names need no interactive confirmation.
            if student in self.student_list:
                continue
            choices = self._nearest_neighbor(student)
            for choice in choices:
                # Interactive: 'Y' accepts the candidate, 'S' skips all.
                inpt = input('{} and {}?'.format(choice, student))
                if inpt == 'Y':
                    counter += 1
                    user_map[student] = user_map[choice]
                    break
                if inpt == 'S':
                    break
        print('{}/{}'.format(counter, len(setStudents)))
        with open('cache/new_user_mapping.json', 'w') as f_out:
            json.dump(user_map, f_out)

    def _nearest_neighbor(self, wd):
        # Three closest known names by edit distance.
        choices = list(
            sorted(self.student_list, key=lambda x: self._distance(x, wd)))[:3]
        return choices

    def _distance(self, wd_1, wd_2):
        # Compare case-insensitively, with wd_2 flipped to
        # "Last, First" order before measuring edit distance.
        wd_2 = ', '.join(reversed(wd_2.split(' ')))
        return nltk.edit_distance(wd_1.lower(), wd_2.lower())

    def extract_proj(self, commit):
        # Derive "owner/repo" from the commit's API URL; the two path
        # segments after the host are the owner and repo names.
        info = commit['url'].split('/')
        ind = info.index('api.github.com')
        return '{}/{}'.format(info[ind + 2], info[ind + 3])

    def iteration_commits(self):
        """
        Group commits into interations. Assume conf/iterations.json
        exists. Assume there are four iterations.
        Output
        - a list of four lists containing all commits of that iteration.
        """
        commits = self.commits(reload=True)
        iterations = [[], [], [], []]
        with open('conf/iterations.json', 'r') as f_in:
            timestamps = json.load(f_in)
        timestamps = [
            datetime.datetime.strptime(s, '%Y-%m-%d') for s in timestamps
        ]
        for proj in commits:
            for commit in proj:
                t = datetime.datetime.strptime(
                    commit['commit']['author']['date'], '%Y-%m-%dT%H:%M:%SZ')
                # Bucket by iteration boundary; commits before the first
                # or after the last boundary are discarded.
                i = np.searchsorted(timestamps, t)
                if i in [1, 2, 3, 4]:
                    iterations[i - 1].append(commit)
        return iterations
def _load_connection(self):
    """Instantiate the GitHub API client from the stored token."""
    github_token = self.tokens['github']['token']
    self.client = GithubApi(github_token)
class MetricGithub(BasicMetric):
    """All metrics concerning Github.

    Per-iteration commit and pull-request metrics for one project.
    API responses are cached as JSON files under <ROOT_PATH>/cache/ and
    plots/outputs live under <ROOT_PATH>/results/metric_github/.
    """

    def __init__(self, proj, tokens, **args):
        super(MetricGithub, self).__init__(proj, tokens)
        # out_header starts as a bare directory name, then becomes the
        # full results path once the directory exists.
        self.out_header = 'metric_github'
        if not self.out_header in os.listdir(self.ROOT_PATH+'/results'):
            os.mkdir('{}/results/{}'.format(self.ROOT_PATH, self.out_header))
        self.out_header = '{}/results/{}'.format(self.ROOT_PATH,
                                                 self.out_header)
        # In-memory caches keyed by "<project ID>:<sha|number>".
        self.projsha_commit, self.projsha_pr = {}, {}

    def _load_connection(self):
        # Build the GitHub client from the configured token.
        self.client = GithubApi(self.tokens['github']['token'])

    def metrics(self, **args):
        """Compute per-iteration metrics.

        Returns a defaultdict: iteration number (1-4) -> the list
        produced by _extract (same order as metric_name()); missing
        iterations default to a list of Nones.
        """
        commits = self._commits()
        pull_requests = self._pull_requests()
        with open('{}/conf/iterations.json'.format(self.ROOT_PATH),
                  'r') as f_in:
            iterations = json.load(f_in)
        # Iteration boundaries as epoch seconds for bisect().
        iterations = [time.mktime(time.strptime(x, '%Y-%m-%d'))
                      for x in iterations]
        iteration_data = defaultdict(lambda: defaultdict(lambda: []))
        for cmit in commits:
            ctime = time.mktime(time.strptime(
                cmit['commit']['committer']['date'], '%Y-%m-%dT%H:%M:%SZ'))
            # bisect gives the iteration index; only 1-4 are kept.
            nite = bisect(iterations, ctime)
            if not nite in [1, 2, 3, 4]:
                continue
            sha = cmit['sha']
            cmit_info = self._get_commit(sha)
            iteration_data[nite]['num_files'].append(len(cmit_info['files']))
            iteration_data[nite]['comments'].append(
                cmit_info['commit']['message'])
        for pr in pull_requests:
            # Unmerged PRs are skipped.
            if not pr['merged_at']:
                continue
            ctime = time.mktime(time.strptime(pr['created_at'],
                                              '%Y-%m-%dT%H:%M:%SZ'))
            mtime = time.mktime(time.strptime(pr['merged_at'],
                                              '%Y-%m-%dT%H:%M:%SZ'))
            # A PR is attributed to the iteration it was merged in.
            nite = bisect(iterations, mtime)
            if not nite in [1, 2, 3, 4]:
                continue
            pr_info = self._get_pull_request(pr['number'])
            num_comments = pr_info['comments']
            iteration_data[nite]['review_time'].append(mtime-ctime)
            iteration_data[nite]['pr_comments'].append(num_comments)
        result = defaultdict(lambda: [None for _ in self.metric_name()])
        for k, v in iteration_data.items():
            result[k] = self._extract(v)
        return result

    def metric_name(self):
        # Human-readable labels, parallel to _extract()'s return order.
        return ['Files Edited', 'Message Length', 'PR Review', 'PR Comments']

    def _extract(self, info):
        # Aggregate one iteration's raw lists into four scalars; PR
        # metrics are None when the iteration had no merged PRs.
        total_num_files = np.sum(info['num_files'])
        avg_msg_length = np.average(
            [len(nltk.word_tokenize(x)) for x in info['comments']])
        # Review time is log-scaled to damp extreme durations.
        avg_review_time = np.average(
            [np.log(x+1) for x in info['review_time']]
        ) if len(info['review_time']) > 0 else None
        avg_num_comments = np.average(
            info['pr_comments']) if len(info['pr_comments']) > 0 else None
        return [total_num_files, avg_msg_length, avg_review_time,
                avg_num_comments]

    def _commits(self, reload=True):
        """Fetch (or load cached) commit list for this project.

        reload=True means "prefer the cache"; a cache miss falls
        through to the API and refreshes the cache file.
        """
        proj_commit = {}
        if 'proj2commits.json' in os.listdir(
                '{}/cache/'.format(self.ROOT_PATH)):
            with open('{}/cache/proj2commits.json'.format(self.ROOT_PATH),
                      'r') as f_in:
                proj_commit = json.load(f_in)
        if reload and self.proj['ID'] in proj_commit:
            return proj_commit[self.proj['ID']]
        owner, repo = self.proj['repo']['owner'], self.proj['repo']['repo']
        commits = self.client.get_commits(owner, repo)
        proj_commit[self.proj['ID']] = commits
        with open('{}/cache/proj2commits.json'.format(self.ROOT_PATH),
                  'w') as f_out:
            json.dump(proj_commit, f_out, sort_keys=True, indent=4,
                      separators=(',', ': '))
        return commits

    def _get_commit(self, sha, reload=True):
        """Fetch (or load cached) detail for a single commit, retrying
        with exponential backoff while the API returns an error body."""
        dict_key = '{}:{}'.format(self.proj['ID'], sha)
        if dict_key in self.projsha_commit:
            return self.projsha_commit[dict_key]
        if 'projsha2commit.json' in os.listdir(
                '{}/cache/'.format(self.ROOT_PATH)):
            with open('{}/cache/projsha2commit.json'.format(self.ROOT_PATH),
                      'r') as f_in:
                self.projsha_commit = json.load(f_in)
        if reload and dict_key in self.projsha_commit:
            return self.projsha_commit[dict_key]
        owner, repo = self.proj['repo']['owner'], self.proj['repo']['repo']
        commit = self.client.get_commit(owner, repo, sha)
        # A 'message' key presumably marks an API error payload (rate
        # limit etc. — TODO confirm); retry with doubling sleeps.
        multiplier, sleep_time = 2, 0.1
        while 'message' in commit:
            print(commit['message'])
            time.sleep(sleep_time)
            commit = self.client.get_commit(owner, repo, sha)
            sleep_time *= multiplier
        self.projsha_commit[dict_key] = commit
        with open('{}/cache/projsha2commit.json'.format(self.ROOT_PATH),
                  'w') as f_out:
            json.dump(self.projsha_commit, f_out, sort_keys=True, indent=4,
                      separators=(',', ': '))
        return commit

    def _pull_requests(self, reload=True):
        """Fetch (or load cached) pull-request list for this project."""
        proj_requests = {}
        if 'proj2prs.json' in os.listdir('{}/cache/'.format(self.ROOT_PATH)):
            with open('{}/cache/proj2prs.json'.format(self.ROOT_PATH),
                      'r') as f_in:
                proj_requests = json.load(f_in)
        if reload and self.proj['ID'] in proj_requests:
            return proj_requests[self.proj['ID']]
        owner, repo = self.proj['repo']['owner'], self.proj['repo']['repo']
        prs = self.client.get_pull_requests(owner, repo)
        proj_requests[self.proj['ID']] = prs
        with open('{}/cache/proj2prs.json'.format(self.ROOT_PATH),
                  'w') as f_out:
            json.dump(proj_requests, f_out, sort_keys=True, indent=4,
                      separators=(',', ': '))
        return prs

    def _get_pull_request(self, number, reload=True):
        """Fetch (or load cached) detail for a single pull request, with
        the same error-retry loop as _get_commit."""
        dict_key = '{}:{}'.format(self.proj['ID'], number)
        if dict_key in self.projsha_pr:
            return self.projsha_pr[dict_key]
        if 'projsha2pr.json' in os.listdir(
                '{}/cache/'.format(self.ROOT_PATH)):
            with open('{}/cache/projsha2pr.json'.format(self.ROOT_PATH),
                      'r') as f_in:
                self.projsha_pr = json.load(f_in)
        if reload and dict_key in self.projsha_pr:
            return self.projsha_pr[dict_key]
        owner, repo = self.proj['repo']['owner'], self.proj['repo']['repo']
        pr = self.client.get_pull_request(owner, repo, number)
        multiplier, sleep_time = 2, 1
        while 'message' in pr:
            print(pr['message'])
            time.sleep(sleep_time)
            pr = self.client.get_pull_request(owner, repo, number)
            sleep_time *= multiplier
        self.projsha_pr[dict_key] = pr
        with open('{}/cache/projsha2pr.json'.format(self.ROOT_PATH),
                  'w') as f_out:
            json.dump(self.projsha_pr, f_out, sort_keys=True, indent=4,
                      separators=(',', ': '))
        return pr

    def dump(self):
        """Merge the on-disk commit cache into memory, then rewrite it."""
        if 'projsha2commit.json' in os.listdir(
                '{}/cache/'.format(self.ROOT_PATH)):
            with open('{}/cache/projsha2commit.json'.format(self.ROOT_PATH),
                      'r') as f_in:
                tmp_projsha_commit = json.load(f_in)
            # Disk entries win over in-memory ones on key collision.
            self.projsha_commit.update(tmp_projsha_commit)
        with open('{}/cache/projsha2commit.json'.format(self.ROOT_PATH),
                  'w') as f_out:
            json.dump(self.projsha_commit, f_out, sort_keys=True, indent=4,
                      separators=(',', ': '))
class ProcessMiningAnalyzer(object):
    """Process mining based on git commits.

    Builds a commit graph from cached commit data and mines file-type
    patterns over it; outputs go to results/process_mining/.
    """

    def __init__(self, project_info):
        """
        Tokens are read from conf/tokens.json file.
        Input
        - project_info: dataset contains all course projects
        """
        super(ProcessMiningAnalyzer, self).__init__()
        self.project_info = project_info
        with open('conf/tokens.json', 'r') as f_in:
            self.token = json.load(f_in)
        self.gt_analyzer = GithubAnalyzer(self.token['github']['token'],
                                          self.project_info)
        self.gt_client = GithubApi(self.token['github']['token'])
        # out_header: bare name first, full results path after mkdir.
        self.out_header = 'process_mining'
        if not self.out_header in os.listdir('results'):
            os.mkdir('results/{}'.format(self.out_header))
        self.out_header = 'results/{}'.format(self.out_header)

    def ftype_count(self):
        """Plot the distribution of file types touched per commit node
        and how many distinct (non-'unknown') types each node mixes."""
        self._build_graph()
        types, mixed_numbers = [], []
        for _, v in self.commit_graph.sha2node.items():
            valid_types = set()
            for f_type in v.type:
                # Collapse anything containing 'app'/'test' into those
                # two buckets; other types pass through unchanged.
                if 'app' in f_type:
                    valid_type = 'app'
                elif 'test' in f_type:
                    valid_type = 'test'
                else:
                    valid_type = f_type
                valid_types.add(valid_type)
                types.append(valid_type)
            # 'unknown' does not count toward the per-node type mix.
            if 'unknown' in valid_types:
                valid_types.remove('unknown')
            mixed_numbers.append(len(valid_types))
        plotdata = pd.DataFrame({'types': types})
        fig, ax = plt.subplots()
        sns.countplot(y='types', data=plotdata)
        plt.savefig('{}/type_count.png'.format(self.out_header))
        plt.close(fig)
        plotdata = pd.DataFrame({'mixed_numbers': mixed_numbers})
        fig, ax = plt.subplots()
        sns.countplot(y='mixed_numbers', data=plotdata)
        plt.savefig('{}/mixed_number_count.png'.format(self.out_header))
        plt.close(fig)

    def frequent_pattern(self, step_size=3):
        """Count length-(step_size+1) type sequences along the commit
        graph, plot their frequencies, and compare the timing of
        app->test ('A-T') vs test->app ('T-A') transitions."""
        import time

        def get_type(ftype):
            # One-letter upper-case code for a file type label.
            return ftype[0].upper()

        self._build_graph()
        patterns, counter = [], defaultdict(lambda: 0)
        time_atm, time_tam, time_all = [], [], []
        for _, nd in tqdm(self.commit_graph.sha2node.items()):
            for ptn in nd.next(step_size, get_type):
                # Only full-length walks count as patterns.
                if len(ptn) != step_size + 1:
                    continue
                str_id = '-'.join(ptn)
                patterns.append(str_id)
                counter[str_id] += 1
                if 'A-T' in str_id:
                    time_atm.append(time.mktime(nd.timestamp.timetuple()))
                if 'T-A' in str_id:
                    time_tam.append(time.mktime(nd.timestamp.timetuple()))
                time_all.append(time.mktime(nd.timestamp.timetuple()))
        sorted_list = sorted([(k, v) for k, v in counter.items()],
                             key=lambda x: -x[1])
        print(sorted_list[:10])
        plotdata = pd.DataFrame({'patterns': patterns})
        fig, ax = plt.subplots()
        sns.countplot(y='patterns', data=plotdata)
        plt.savefig('{}/pattern_count.png'.format(self.out_header))
        plt.close(fig)
        fig, ax = plt.subplots()
        sns.distplot([np.log(v) for _, v in counter.items()])
        plt.savefig('{}/pattern_count_dist.png'.format(self.out_header))
        plt.close(fig)
        fig, ax = plt.subplots()
        sns.distplot(time_atm, label='at')
        sns.distplot(time_tam, label='ta')
        plt.legend()
        plt.savefig('{}/hist_tam_atm.png'.format(self.out_header))
        plt.close(fig)
        ite_at, ite_ta = [], []
        # Hard-coded epoch-second iteration boundaries (~2015-2017).
        ite_boundary = [1.42e9, 1.45e9, 1.47e9, 1.50e9]

        def boundary_func(x):
            # Closes over the loop variable i below.
            return x > ite_boundary[i] and x < ite_boundary[i + 1]

        for i in range(3):
            num_ite = len(list(filter(boundary_func, time_all)))
            num_at = len(list(filter(boundary_func, time_atm)))
            num_ta = len(list(filter(boundary_func, time_tam)))
            ite_at.append(num_at / num_ite)
            ite_ta.append(num_ta / num_ite)
        print(ite_at)
        print(ite_ta)

    def neighbor_selection(self, step_size=3):
        """Count pattern states and transition strengths between
        neighboring commit nodes, dump them to cache/, and embed the
        filtered transition matrix with t-SNE."""
        import time
        import time  # NOTE(review): duplicated import kept verbatim.

        def get_type(ftype):
            return ftype[0].upper()

        self._build_graph()
        patterns, edge_counter = defaultdict(lambda: 0), defaultdict(
            lambda: 0)
        total_num = len(self.commit_graph.sha2node)
        for _, nd in tqdm(self.commit_graph.sha2node.items()):
            tmp_signature = nd.next(step_size, get_type)
            for ptn in tmp_signature:
                str_id = '-'.join(ptn)
                patterns[str_id] += 1
                if nd.parents:
                    # One edge per parent pattern -> this pattern.
                    for pnd in nd.parents:
                        for pptn in pnd.next(step_size, get_type):
                            edge_counter['{}->{}'.format(
                                '-'.join(pptn), str_id)] += 1
                else:
                    # Root nodes get a synthetic START predecessor.
                    patterns['START'] += 1
                    edge_counter['START->{}'.format(str_id)] += 1
        print(len(patterns))
        with open('cache/pattern_count.json', 'w') as f_out:
            json.dump(patterns, f_out)
        with open('cache/edge_strength.json', 'w') as f_out:
            json.dump(edge_counter, f_out)
        # Keep only states with log-count above -1.
        filtered_patterns = {}
        for k, v in patterns.items():
            if np.log(v) > -1:
                filtered_patterns[k] = v
        # print(sorted(filtered_patterns, key=lambda x: -filtered_patterns[x]))
        print(len(filtered_patterns))
        fig, ax = plt.subplots()
        sns.distplot([np.log(v) for _, v in patterns.items()])
        plt.savefig('{}/state_counter.png'.format(self.out_header))
        plt.close(fig)
        fig, ax = plt.subplots()
        sns.distplot([v for _, v in filtered_patterns.items()])
        plt.savefig('{}/freq_state_counter.png'.format(self.out_header))
        plt.close(fig)
        # Dense transition matrix over the filtered states.
        ptn2index = {}
        for k in filtered_patterns:
            ptn2index[k] = len(ptn2index)
        edge_mat = np.zeros((len(ptn2index), len(ptn2index)))
        for k, v in edge_counter.items():
            ptn_1, ptn_2 = k.split('->')
            if ptn_1 in ptn2index and ptn_2 in ptn2index:
                edge_mat[ptn2index[ptn_1], ptn2index[ptn_2]] = v
        from sklearn.manifold import TSNE
        model = TSNE(n_components=2, random_state=0)
        y = model.fit_transform(edge_mat)
        fig, ax = plt.subplots()
        plt.scatter([x[0] for x in y], [x[1] for x in y])
        plt.savefig('{}/filtered_link_mat_tsne.png'.format(self.out_header))
        plt.close(fig)

    def _build_graph(self, project=None):
        """
        Build a commit graph for a given project. If project is None,
        build a single graph for all projects.
        Input
        - project: a project from project info
        """
        self.commit_graph = CommitGraph()
        with open('cache/sha2commit_new.json', 'r') as f_in:
            sha2cmit = json.load(f_in)
        proj_dict = {}
        if project:
            # Restrict the cached commits to this project's shas.
            commits = self.gt_client.get_commits(project['repo']['owner'],
                                                 project['repo']['repo'])
            for cmit in commits:
                sha = cmit['sha']
                if sha in sha2cmit:
                    proj_dict[sha] = sha2cmit[sha]
        else:
            proj_dict = sha2cmit
        self.commit_graph.construct(proj_dict)
        if project:
            print('{}: {} root'.format(project['project'],
                                       len(self.commit_graph.root)))
        else:
            print('All: {} root'.format(len(self.commit_graph.root)))

    def _convert_commit(self, commit):
        """
        Convert a commit into an object for analysis
        Input
        - commit: a dictionary got from GitHub get single commit API

        NOTE(review): body appears unfinished — the computed list is
        never used or returned; confirm against the project history.
        """
        file_types = [self._file_type(item) for item in commit['files']]
class ProcessSegmentAnalyzer(object):
    """Segment git commits and correlates them with user stories.

    Combines GitHub commit data with Pivotal Tracker stories; plots go
    to results/process_segment/.
    """

    def __init__(self, project_info):
        """
        Tokens are read from conf/tokens.json file.
        Input
        - project_info: dataset contains all course projects
        """
        super(ProcessSegmentAnalyzer, self).__init__()
        self.project_info = project_info
        with open('conf/tokens.json', 'r') as f_in:
            self.token = json.load(f_in)
        self.gt_analyzer = GithubAnalyzer(self.token['github']['token'],
                                          self.project_info)
        self.gt_client = GithubApi(self.token['github']['token'])
        self.pt_client = TrackerApi(self.token['pivotal_tracker']['token'])
        # out_header: bare name first, full results path after mkdir.
        self.out_header = 'process_segment'
        if not self.out_header in os.listdir('results'):
            os.mkdir('results/{}'.format(self.out_header))
        self.out_header = 'results/{}'.format(self.out_header)

    def correlation(self, proj):
        """
        Generate segmentation for a single project.
        Input
        - proj: a data point in project_info

        NOTE(review): not implemented yet.
        """
        pass

    def time_sequence(self, proj):
        """
        Extract time information and files information from commits.
        Input
        - proj: the project
        Output
        - time_sequence: a list of datetime objects
        - file_sequence: a list of file indexes
        - file_dict: a dictionary from file index to file name

        NOTE(review): docstring mentions file_dict but only two values
        are returned; the file index map stays internal.
        """
        commits = self.gt_client.get_commits(proj['repo']['owner'],
                                             proj['repo']['repo'])
        stories = self.pt_client.get_stories(proj['tracker'])
        file_indexer = {}
        time_sequence, file_sequence = [], []
        for cmit in commits:
            # tmp_time = datetime.datetime.strptime(commit['commit']['author']['date'], '%Y-%m-%dT%H:%M:%SZ')
            tmp_file_vec = []
            commit = self.gt_analyzer.get_commit(cmit['sha'])
            # get_commit returns False on a cache miss.
            if not commit:
                print('Commit not found: {}'.format(cmit['sha']))
                continue
            # Merge commits are excluded from the sequence.
            if 'merge' in commit['commit']['message']:
                continue
            for f in commit['files']:
                # Assign each filename a stable integer index on first
                # sight.
                if not f['filename'] in file_indexer:
                    file_indexer[f['filename']] = len(file_indexer)
                tmp_file_vec.append(file_indexer[f['filename']])
            file_sequence.append(tmp_file_vec)
            time_sequence.append(
                datetime.datetime.strptime(cmit['commit']['author']['date'],
                                           '%Y-%m-%dT%H:%M:%SZ'))
        return time_sequence, file_sequence

    def story_time(self, proj):
        """
        Extract time informaiton and story information from pivotral
        tracker
        Input
        - proj: the project
        Output
        - times: list of (created_at, updated_at) datetime pairs
        - info: the corresponding story dictionaries
        """
        times, info = [], []
        for story in self.pt_client.get_stories(proj['tracker']):
            s = datetime.datetime.strptime(story['created_at'],
                                           '%Y-%m-%dT%H:%M:%SZ')
            e = datetime.datetime.strptime(story['updated_at'],
                                           '%Y-%m-%dT%H:%M:%SZ')
            times.append((s, e))
            info.append(story)
        return times, info

    def story_time_overlaps(self):
        """
        Plot
        - a counting plot of 'active' user stories over time
        """
        import time
        if not 'story_time' in os.listdir(self.out_header):
            os.mkdir('{}/story_time'.format(self.out_header))
        # NOTE(review): slice [1:2] processes only the second project —
        # presumably a debugging leftover; confirm before widening.
        for proj in self.project_info[1:2]:
            times, info = self.story_time(proj)
            # Sweep line: +1 at each story start, -1 at each end.
            time_to_val = {}
            for s_t, e_t in times:
                time_to_val[s_t] = 1
                time_to_val[e_t] = -1
            time_seq, count_seq = [], []
            counter = 0
            for t in sorted(time_to_val.keys()):
                time_seq.append(t)
                counter += time_to_val[t]
                count_seq.append(counter)
            fig, ax = plt.subplots()
            plt.plot([time.mktime(t.timetuple()) for t in time_seq],
                     count_seq)
            plt.savefig('{}/story_time/{}_{}'.format(
                self.out_header, proj['ID'],
                proj['project'].replace(" ", "")))
            plt.close(fig)

    def git_commit_overlaps(self):
        """
        Plot
        - a scatter plot between time and files edited for a given
          project.
        """
        import time
        if not 'commit_time' in os.listdir(self.out_header):
            os.mkdir('{}/commit_time'.format(self.out_header))
        # NOTE(review): slice [1:2] processes only the second project —
        # presumably a debugging leftover; confirm before widening.
        for proj in self.project_info[1:2]:
            times, files = self.time_sequence(proj)
            sorted_time = sorted(times)
            t_seq, f_seq = [], []
            # One (timestamp, file index) point per file per commit.
            for i in range(len(times)):
                for f in files[i]:
                    # t_seq.append(sorted_time.index(times[i]))
                    t_seq.append(time.mktime(times[i].timetuple()))
                    f_seq.append(f)
            plotdata = pd.DataFrame({'time': t_seq, 'file': f_seq})
            fig, ax = plt.subplots()
            sns.jointplot(x='time', y='file', data=plotdata)
            plt.savefig('{}/commit_time/{}_{}.png'.format(
                self.out_header, proj['ID'],
                proj['project'].replace(" ", "")))
            plt.close(fig)