Example #1
def add_personal_token(request):
    message = ''
    if request.method == 'POST':
        form = PersonalTokenForm(request.POST)
        if form.is_valid():
            try:
                # Validate the token by making an authenticated call to GitHub.
                g = GithubApi(token=form.cleaned_data['access_token'])
                g.get_connection().get_user().login
            except Exception:
                error = 'Wrong access token!'
                return render(request, 'add_personal_token.html', {
                    'form': form,
                    'error': error
                })
            token = form.save(commit=False)
            token.access_token = form.cleaned_data.get('access_token')
            token.save()
            token.user.add(request.user)
            message = 'Your token has been successfully added'

    else:
        form = PersonalTokenForm()
    return render(request, 'add_personal_token.html', {
        'form': form,
        'message': message
    })
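
These token views bind a PersonalTokenForm and, on save, attach the request user via token.user.add(...), while Example #2 below updates a GitAuthentication row. Neither the form nor the model appears on this page, so the following is a minimal sketch of what they might look like, assuming GitAuthentication is the form's model; only the access_token field and the many-to-many user relation are implied by the views, everything else is hypothetical.

# Minimal sketch (assumption): the GitAuthentication model and the ModelForm
# these views bind to. Only access_token and the M2M 'user' relation are
# implied by the views; the field sizes and module layout are hypothetical.
from django import forms
from django.conf import settings
from django.db import models


class GitAuthentication(models.Model):
    access_token = models.CharField(max_length=255)
    user = models.ManyToManyField(settings.AUTH_USER_MODEL)


class PersonalTokenForm(forms.ModelForm):
    class Meta:
        model = GitAuthentication
        fields = ['access_token']
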
Example #2
def update_personal_token(request):
    message = ''
    current_user = request.user
    if request.method == 'POST':
        form = PersonalTokenForm(request.POST)
        if form.is_valid():
            try:
                # Validate the token by making an authenticated call to GitHub.
                g = GithubApi(token=form.cleaned_data['access_token'])
                g.get_connection().get_user().login
            except Exception:
                error = 'Wrong access token!'
                return render(request, 'add_personal_token.html', {
                    'form': form,
                    'error': error
                })

            GitAuthentication.objects.filter(user=current_user).update(
                access_token=form.cleaned_data['access_token'])
            message = 'Your token has been successfully updated'

    else:
        form = PersonalTokenForm()
    return render(request, 'update_token.html', {
        'form': form,
        'message': message
    })
Example #3
def add_new_issue(request, pk):
    message = ''
    repo = get_object_or_404(Repo, pk=pk)
    if request.method == "POST":
        form = IssueForm(request.POST)
        if form.is_valid():
            token = GitAuthentication.objects.filter(
                user=request.user).first().access_token
            g = GithubApi(token=token)
            try:
                g.create_new_issue(repo.name, form.cleaned_data.get('title'),
                                   form.cleaned_data.get('body'),
                                   form.cleaned_data.get('label'),
                                   form.cleaned_data.get('milestone'))
            except Exception:
                return render(request, 'add_issue.html', {
                    'form': form,
                    'error': 'Problem with connection'
                })

            issue = form.save(commit=False)
            issue.title = form.cleaned_data.get('title')
            issue.body = form.cleaned_data.get('body')
            issue.label = form.cleaned_data.get('label')
            issue.milestone = form.cleaned_data.get('milestone')
            issue.repo = repo
            issue.save()
            message = 'Your issue has been successfully added'

    else:
        form = IssueForm()
    return render(request, 'add_issue.html', {
        'form': form,
        'message': message
    })
Example #4
def __init__(self, token, project_info):
    """
    Input
      - token: used to access github
      - project_info: course project data
    """
    super(GithubAnalyzer, self).__init__()
    self.client = GithubApi(token)
    self.projects = project_info
    self.commit_cache = False
    self.project_cache = False
Example #5
def scrum_board(request, pk):
    repo = get_object_or_404(Repo, pk=pk)
    token = GitAuthentication.objects.filter(
        user=request.user).first().access_token
    g = GithubApi(token=token)
    issues_by_label = {}
    labels = g.get_all_labels_in_repo(repo.name)

    for label in labels:
        issues_by_label[label] = list(
            Issue.objects.filter(repo_id=pk, label=label).all())

    return render(request, 'scrum_board.html', {
        'issues': issues_by_label,
        'labels': labels
    })
Example #6
def refresh_data(request, pk):
    repo = get_object_or_404(Repo, pk=pk)
    token = GitAuthentication.objects.filter(
        user=request.user).first().access_token
    g = GithubApi(token=token)
    branches = g.get_names_of_branch(repo.name)
    issues = g.get_all_issues(repo.name)
    Branch.objects.filter(repo=repo).delete()
    Issue.objects.filter(repo=repo).delete()

    for branch_name in branches:
        add_branch(branch_name, pk)

    for i in issues:
        add_issue(i, repo)

    return redirect('home')
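
Several of these views take a repository pk, and refresh_data redirects to a route named 'home'. The URL configuration is not part of the examples, so here is a minimal, hypothetical urls.py sketch; only the view names and the 'home' route name come from the snippets on this page, while the paths and the home view itself are assumptions.

# Minimal sketch (assumption): URL wiring for the views shown on this page.
# Paths, route names other than 'home', and the home view are hypothetical.
from django.urls import path

from . import views

urlpatterns = [
    path('', views.home, name='home'),  # hypothetical landing view
    path('token/add/', views.add_personal_token, name='add_personal_token'),
    path('token/update/', views.update_personal_token, name='update_personal_token'),
    path('repo/add/', views.add_new_repo, name='add_new_repo'),
    path('repo/<int:pk>/refresh/', views.refresh_data, name='refresh_data'),
    path('repo/<int:pk>/issue/add/', views.add_new_issue, name='add_new_issue'),
    path('repo/<int:pk>/scrum/', views.scrum_board, name='scrum_board'),
]
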
Example #7
def __init__(self, project_info):
    """
    Tokens are read from the conf/tokens.json file.

    Input
      - project_info: dataset containing all course projects
    """
    super(ProcessMiningAnalyzer, self).__init__()
    self.project_info = project_info
    with open('conf/tokens.json', 'r') as f_in:
        self.token = json.load(f_in)
    self.gt_analyzer = GithubAnalyzer(self.token['github']['token'],
                                      self.project_info)
    self.gt_client = GithubApi(self.token['github']['token'])
    self.out_header = 'process_mining'
    if not self.out_header in os.listdir('results'):
        os.mkdir('results/{}'.format(self.out_header))
    self.out_header = 'results/{}'.format(self.out_header)
Example #8
def add_new_repo(request):
    success = ''
    if request.method == "POST":
        form = RepoForm(request.POST)
        if form.is_valid():
            current_user = request.user
            token = GitAuthentication.objects.filter(
                user=current_user).first().access_token
            g = GithubApi(token=token)
            try:
                g.get_repo_by_name(form.cleaned_data['name'])
            except Exception:
                error = "This repo doesn't exist"
                return render(request, 'add_new_repo.html', {
                    'form': form,
                    'error': error
                })
            if Repo.objects.filter(name=form.cleaned_data.get('name')).exists():
                error = 'This repo is already in your favourites'
                return render(request, 'add_new_repo.html', {
                    'form': form,
                    'error': error
                })

            repo = form.save(commit=False)
            repo.name = form.cleaned_data.get('name')
            repo.save()
            repo.user.add(current_user)
            refresh_data(request, repo.id)
            success = 'Repo has been successfully added'

    else:
        form = RepoForm()
    return render(request, 'add_new_repo.html', {
        'form': form,
        'success': success
    })
Example #9
class GithubAnalyzer(object):
    """
    Provide functions to get github statistics and visualize results.
  """
    def __init__(self, token, project_info):
        """
      Input
        - token: used to access github
        - project_info: course project data
    """
        super(GithubAnalyzer, self).__init__()
        self.client = GithubApi(token)
        self.projects = project_info
        self.commit_cache = False
        self.project_cache = False

    def commits(self, reload=False):
        """
      Get all commits of all projects.
      If reload=True, use cached results. Otherwise get data from APIs and cache the result.

      Output
        - a list of lists, where each inner list contains all commits of one repository (project).
    """
        if reload:
            with open('cache/commits.json', 'r') as f_in:
                return json.load(f_in)
        lstCommits = []
        for project in self.projects:
            owner, repo = project['repo']['owner'], project['repo']['repo']
            commits = self.client.get_commits(owner, repo)
            lstCommits.append(commits)
        with open('cache/commits.json', 'w') as f_out:
            json.dump(lstCommits,
                      f_out,
                      sort_keys=True,
                      indent=4,
                      separators=(',', ': '))
        return lstCommits

    def user_commits(self, reload=True):
        """
      Get a dictionary with user ID as key and a list of commits as value.
      self.commits will be called with the input reload.

      WARNING: assumes the user map cache file cache/new_user_mapping.json exists

      Output
        - a dictionary between user ID and list of commits of the user
    """
        all_commits = self.commits(reload)
        with open('cache/new_user_mapping.json', 'r') as f_in:
            user_map = json.load(f_in)
        user_commits = defaultdict(lambda: [])
        for repo in all_commits:
            for commit in repo:
                user = commit['commit']['author']['name']
                if user in user_map:
                    user_commits[user_map[user]].append(commit)
        return user_commits

    def get_commit(self, sha):
        """
      Return the information of a single commit. It assumes the cache file cache/sha2commit_new.json exists.

      Output
        - a dictionary of commit information, or False if the given commit can't be found
    """
        if not self.commit_cache:
            with open('cache/sha2commit_new.json', 'r') as f_in:
                self.commit_cache = json.load(f_in)
        if sha in self.commit_cache:
            return self.commit_cache[sha]
        else:
            return False

    def cache_commits(self, fix=True):
        '''
        Cache all commits. This will take about two hours for cs169 fall 2016.
        If fix=True, re-fetch commits whose cached entry is an API error message.
        This function should be called only once. It will generate the cache/sha2commit_new.json file.

        TODO: FUNCTION NEEDS REFACTOR.
    '''

        import time
        if fix:
            print('Fix mode')
            with open('cache/sha2commit.json', 'r') as f_in:
                sha2commit = json.load(f_in)
            all_commits = self.commits(reload=True)
            for index, commits in enumerate(all_commits):
                owner, repo = self.projects[index]['repo'][
                    'owner'], self.projects[index]['repo']['repo']
                for commit in commits:
                    info = sha2commit[commit['sha']]
                    if 'message' in info:  # cached entry is an API error payload; re-fetch it
                        sha2commit[commit['sha']] = self.client.get_commit(
                            owner, repo, commit['sha'])
                        time.sleep(0.1)
            with open('cache/sha2commit_new.json', 'w') as f_out:
                json.dump(sha2commit, f_out)
            return
        print('Cache mode')
        dictSha2Commit = {}
        all_commits = self.commits(reload=True)
        for index, commits in enumerate(all_commits):
            owner, repo = self.projects[index]['repo']['owner'], self.projects[
                index]['repo']['repo']
            for commit in commits:
                dictSha2Commit[commit['sha']] = self.client.get_commit(
                    owner, repo, commit['sha'])
                time.sleep(0.01)
        with open('cache/sha2commit.json', 'w') as f_out:
            json.dump(dictSha2Commit,
                      f_out,
                      sort_keys=True,
                      indent=4,
                      separators=(',', ': '))

    def commits_plot(self):
        """
      Plot
        - a histogram of number of commits per project
    """
        lstCommits = self.commits(reload=True)

        plotdata = pd.DataFrame({
            'x': np.arange(len(lstCommits)),
            'y': [len(item) for item in lstCommits]
        })
        fig, ax = plt.subplots()
        sns.barplot('x', 'y', data=plotdata)
        plt.savefig('results/hist_num_commits.png')
        plt.close(fig)

    def commmits_per_student_plot(self):
        """
      Plot
        - Histogram of number of commits per student.
    """
        lstCommits = self.commits(reload=True)
        dictStu2Commits = defaultdict(lambda: 0)
        for proj_commit in lstCommits:
            for commit in proj_commit:
                dictStu2Commits[commit['commit']['author']['name']] += 1
        fig, ax = plt.subplots()
        sns.distplot([v for _, v in dictStu2Commits.items()])
        plt.savefig('results/num_commits_per_student.png')
        plt.close(fig)

    def generate_user_map(self):
        """
      Generate a user mapping which maps a github username to the User ID.
      It will go through all usernames and compare them with existing usernames from pivotal tracker and student info.
      Candidates are listed based on edit distance.
      Input 'Y' will link the username to the User ID of the candidate. Input 'S' will skip all candidates.
      It will generate cache/new_user_mapping.json. It assumes cache/user_mapping.json exists, which is a map from
      tracker users and students to user IDs.

      TODO: NEED REFACTOR
    """
        with open('cache/user_mapping.json', 'r') as f_in:
            user_map = json.load(f_in)
        self.student_list = user_map.keys()
        lstCommits = self.commits(reload=True)
        setStudents = set()
        for proj_commit in lstCommits:
            for commit in proj_commit:
                setStudents.add(commit['commit']['author']['name'])

        counter = 0
        for student in setStudents:
            if student in self.student_list:
                continue
            choices = self._nearest_neighbor(student)
            for choice in choices:
                inpt = input('{} and {}?'.format(choice, student))
                if inpt == 'Y':
                    counter += 1
                    user_map[student] = user_map[choice]
                    break
                if inpt == 'S':
                    break
        print('{}/{}'.format(counter, len(setStudents)))
        with open('cache/new_user_mapping.json', 'w') as f_out:
            json.dump(user_map, f_out)

    def _nearest_neighbor(self, wd):
        choices = list(
            sorted(self.student_list, key=lambda x: self._distance(x, wd)))[:3]
        return choices

    def _distance(self, wd_1, wd_2):
        wd_2 = ', '.join(reversed(wd_2.split(' ')))
        return nltk.edit_distance(wd_1.lower(), wd_2.lower())

    def extract_proj(self, commit):
        info = commit['url'].split('/')
        ind = info.index('api.github.com')
        return '{}/{}'.format(info[ind + 2], info[ind + 3])

    def iteration_commits(self):
        """
      Group commits into iterations.
      Assume conf/iterations.json exists. Assume there are four iterations.

      Output
        - a list of four lists containing all commits of that iteration.
    """
        commits = self.commits(reload=True)
        iterations = [[], [], [], []]
        with open('conf/iterations.json', 'r') as f_in:
            timestamps = json.load(f_in)
        timestamps = [
            datetime.datetime.strptime(s, '%Y-%m-%d') for s in timestamps
        ]
        for proj in commits:
            for commit in proj:
                t = datetime.datetime.strptime(
                    commit['commit']['author']['date'], '%Y-%m-%dT%H:%M:%SZ')
                i = np.searchsorted(timestamps, t)
                if i in [1, 2, 3, 4]:
                    iterations[i - 1].append(commit)
        return iterations
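
GithubAnalyzer expects a GitHub token and a project_info list whose entries carry the repository owner and name under a 'repo' key (see commits() above); Example #7 reads the token from conf/tokens.json. A minimal usage sketch under those assumptions:

# Minimal usage sketch. The token file layout follows Example #7; the
# project_info entry shown here is hypothetical and only includes the
# 'repo' key that commits() actually reads.
import json

with open('conf/tokens.json', 'r') as f_in:
    tokens = json.load(f_in)

project_info = [
    {'repo': {'owner': 'some-org', 'repo': 'some-repo'}},  # hypothetical project
]

analyzer = GithubAnalyzer(tokens['github']['token'], project_info)
analyzer.commits()        # fetches commits via the API and writes cache/commits.json
analyzer.commits_plot()   # reads that cache and saves results/hist_num_commits.png
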
Example #10
def _load_connection(self):
    self.client = GithubApi(self.tokens['github']['token'])
Example #11
class MetricGithub(BasicMetric):
  """All metrics concerning Github"""
  def __init__(self, proj, tokens, **args):
    super(MetricGithub, self).__init__(proj, tokens)

    self.out_header = 'metric_github'
    if not self.out_header in os.listdir(self.ROOT_PATH+'/results'):
      os.mkdir('{}/results/{}'.format(self.ROOT_PATH, self.out_header))
    self.out_header = '{}/results/{}'.format(self.ROOT_PATH, self.out_header)
    self.projsha_commit, self.projsha_pr = {}, {}

  def _load_connection(self):
    self.client = GithubApi(self.tokens['github']['token'])

  def metrics(self, **args):
    commits = self._commits()
    pull_requests = self._pull_requests()
    with open('{}/conf/iterations.json'.format(self.ROOT_PATH), 'r') as f_in:
      iterations = json.load(f_in)
    iterations = [time.mktime(time.strptime(x, '%Y-%m-%d')) for x in iterations]
    iteration_data = defaultdict(lambda: defaultdict(lambda: []))
    for cmit in commits:
      ctime = time.mktime(time.strptime(cmit['commit']['committer']['date'], '%Y-%m-%dT%H:%M:%SZ'))
      nite = bisect(iterations, ctime)
      if not nite in [1, 2, 3, 4]:
        continue
      sha = cmit['sha']
      cmit_info = self._get_commit(sha)
      iteration_data[nite]['num_files'].append(len(cmit_info['files']))
      iteration_data[nite]['comments'].append(cmit_info['commit']['message'])
    for pr in pull_requests:
      if not pr['merged_at']:
        continue
      ctime = time.mktime(time.strptime(pr['created_at'], '%Y-%m-%dT%H:%M:%SZ'))
      mtime = time.mktime(time.strptime(pr['merged_at'], '%Y-%m-%dT%H:%M:%SZ'))
      nite = bisect(iterations, mtime)
      if not nite in [1, 2, 3, 4]:
        continue
      pr_info = self._get_pull_request(pr['number'])
      num_comments = pr_info['comments']

      iteration_data[nite]['review_time'].append(mtime-ctime)
      iteration_data[nite]['pr_comments'].append(num_comments)

    result = defaultdict(lambda: [None for _ in self.metric_name()])
    for k, v in iteration_data.items():
      result[k] = self._extract(v)
    return result

  def metric_name(self):
    return ['Files Edited', 'Message Length', 'PR Review', 'PR Comments']

  def _extract(self, info):
    total_num_files = np.sum(info['num_files'])
    avg_msg_length = np.average([len(nltk.word_tokenize(x)) for x in info['comments']])
    avg_review_time = np.average([np.log(x+1) for x in info['review_time']]) if len(info['review_time']) > 0 else None
    avg_num_comments = np.average(info['pr_comments']) if len(info['pr_comments']) > 0 else None
    return [total_num_files, avg_msg_length, avg_review_time, avg_num_comments]

  def _commits(self, reload=True):
    proj_commit = {}
    if 'proj2commits.json' in os.listdir('{}/cache/'.format(self.ROOT_PATH)):
      with open('{}/cache/proj2commits.json'.format(self.ROOT_PATH), 'r') as f_in:
        proj_commit = json.load(f_in)
      if reload and self.proj['ID'] in proj_commit:
        return proj_commit[self.proj['ID']]
    owner, repo = self.proj['repo']['owner'], self.proj['repo']['repo']
    commits = self.client.get_commits(owner, repo)
    proj_commit[self.proj['ID']] = commits
    with open('{}/cache/proj2commits.json'.format(self.ROOT_PATH), 'w') as f_out:
      json.dump(proj_commit, f_out, sort_keys=True, indent=4, separators=(',', ': '))
    return commits

  def _get_commit(self, sha, reload=True):
    dict_key = '{}:{}'.format(self.proj['ID'], sha)
    if dict_key in self.projsha_commit:
      return self.projsha_commit[dict_key]
    if 'projsha2commit.json' in os.listdir('{}/cache/'.format(self.ROOT_PATH)):
      with open('{}/cache/projsha2commit.json'.format(self.ROOT_PATH), 'r') as f_in:
        self.projsha_commit = json.load(f_in)
      if reload and dict_key in self.projsha_commit:
        return self.projsha_commit[dict_key]
    owner, repo = self.proj['repo']['owner'], self.proj['repo']['repo']
    commit = self.client.get_commit(owner, repo, sha)
    multiplier, sleep_time = 2, 0.1
    while 'message' in commit:  # API returned an error payload (e.g. rate limit); back off and retry
      print(commit['message'])
      time.sleep(sleep_time)
      commit = self.client.get_commit(owner, repo, sha)
      sleep_time *= multiplier
    self.projsha_commit[dict_key] = commit
    with open('{}/cache/projsha2commit.json'.format(self.ROOT_PATH), 'w') as f_out:
      json.dump(self.projsha_commit, f_out, sort_keys=True, indent=4, separators=(',', ': '))
    return commit

  def _pull_requests(self, reload=True):
    proj_requests = {}
    if 'proj2prs.json' in os.listdir('{}/cache/'.format(self.ROOT_PATH)):
      with open('{}/cache/proj2prs.json'.format(self.ROOT_PATH), 'r') as f_in:
        proj_requests = json.load(f_in)
      if reload and self.proj['ID'] in proj_requests:
        return proj_requests[self.proj['ID']]
    owner, repo = self.proj['repo']['owner'], self.proj['repo']['repo']
    prs = self.client.get_pull_requests(owner, repo)
    proj_requests[self.proj['ID']] = prs
    with open('{}/cache/proj2prs.json'.format(self.ROOT_PATH), 'w') as f_out:
      json.dump(proj_requests, f_out, sort_keys=True, indent=4, separators=(',', ': '))
    return prs

  def _get_pull_request(self, number, reload=True):
    dict_key = '{}:{}'.format(self.proj['ID'], number)
    if dict_key in self.projsha_pr:
      return self.projsha_pr[dict_key]
    if 'projsha2pr.json' in os.listdir('{}/cache/'.format(self.ROOT_PATH)):
      with open('{}/cache/projsha2pr.json'.format(self.ROOT_PATH), 'r') as f_in:
        self.projsha_pr = json.load(f_in)
      if reload and dict_key in self.projsha_pr:
        return self.projsha_pr[dict_key]
    owner, repo = self.proj['repo']['owner'], self.proj['repo']['repo']
    pr = self.client.get_pull_request(owner, repo, number)
    multiplier, sleep_time = 2, 1
    while 'message' in pr:  # API returned an error payload (e.g. rate limit); back off and retry
      print(pr['message'])
      time.sleep(sleep_time)
      pr = self.client.get_pull_request(owner, repo, number)
      sleep_time *= multiplier
    self.projsha_pr[dict_key] = pr
    with open('{}/cache/projsha2pr.json'.format(self.ROOT_PATH), 'w') as f_out:
      json.dump(self.projsha_pr, f_out, sort_keys=True, indent=4, separators=(',', ': '))
    return pr

  def dump(self):
    if 'projsha2commit.json' in os.listdir('{}/cache/'.format(self.ROOT_PATH)):
      with open('{}/cache/projsha2commit.json'.format(self.ROOT_PATH), 'r') as f_in:
        tmp_projsha_commit = json.load(f_in)
      self.projsha_commit.update(tmp_projsha_commit)
    with open('{}/cache/projsha2commit.json'.format(self.ROOT_PATH), 'w') as f_out:
      json.dump(self.projsha_commit, f_out, sort_keys=True, indent=4, separators=(',', ': '))
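
MetricGithub is constructed with a single project dictionary and a tokens structure exposing tokens['github']['token'], and its metrics() method groups commit and pull-request data into iterations 1-4 using the boundaries in conf/iterations.json. The snippet does not show what calls _load_connection(), so the sketch below calls it explicitly; the proj layout is inferred from the attribute accesses above and the tokens file path follows Example #7, both assumptions.

# Minimal usage sketch (assumptions noted above). ROOT_PATH and the rest of
# BasicMetric are not shown on this page.
import json

with open('conf/tokens.json', 'r') as f_in:
    tokens = json.load(f_in)

proj = {
    'ID': 'team-01',  # hypothetical project id
    'repo': {'owner': 'some-org', 'repo': 'some-repo'},
}

metric = MetricGithub(proj, tokens)
metric._load_connection()        # gives the metric its GithubApi client
per_iteration = metric.metrics()
print(metric.metric_name())      # ['Files Edited', 'Message Length', 'PR Review', 'PR Comments']
print(per_iteration)             # {iteration number: [values in metric_name() order]}
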
Example #12
class ProcessMiningAnalyzer(object):
    """
    Process mining based on git commits.
  """
    def __init__(self, project_info):
        """
      Tokens are read from the conf/tokens.json file.

      Input
        - project_info: dataset containing all course projects
    """
        super(ProcessMiningAnalyzer, self).__init__()
        self.project_info = project_info
        with open('conf/tokens.json', 'r') as f_in:
            self.token = json.load(f_in)
        self.gt_analyzer = GithubAnalyzer(self.token['github']['token'],
                                          self.project_info)
        self.gt_client = GithubApi(self.token['github']['token'])
        self.out_header = 'process_mining'
        if not self.out_header in os.listdir('results'):
            os.mkdir('results/{}'.format(self.out_header))
        self.out_header = 'results/{}'.format(self.out_header)

    def ftype_count(self):
        self._build_graph()
        types, mixed_numbers = [], []
        for _, v in self.commit_graph.sha2node.items():
            valid_types = set()
            for f_type in v.type:
                if 'app' in f_type:
                    valid_type = 'app'
                elif 'test' in f_type:
                    valid_type = 'test'
                else:
                    valid_type = f_type
                valid_types.add(valid_type)
                types.append(valid_type)
            if 'unknown' in valid_types:
                valid_types.remove('unknown')
            mixed_numbers.append(len(valid_types))

        plotdata = pd.DataFrame({'types': types})
        fig, ax = plt.subplots()
        sns.countplot(y='types', data=plotdata)
        plt.savefig('{}/type_count.png'.format(self.out_header))
        plt.close(fig)

        plotdata = pd.DataFrame({'mixed_numbers': mixed_numbers})
        fig, ax = plt.subplots()
        sns.countplot(y='mixed_numbers', data=plotdata)
        plt.savefig('{}/mixed_number_count.png'.format(self.out_header))
        plt.close(fig)

    def frequent_pattern(self, step_size=3):
        import time

        def get_type(ftype):
            return ftype[0].upper()

        self._build_graph()
        patterns, counter = [], defaultdict(lambda: 0)
        time_atm, time_tam, time_all = [], [], []
        for _, nd in tqdm(self.commit_graph.sha2node.items()):
            for ptn in nd.next(step_size, get_type):
                if len(ptn) != step_size + 1:
                    continue
                str_id = '-'.join(ptn)
                patterns.append(str_id)
                counter[str_id] += 1
                if 'A-T' in str_id:
                    time_atm.append(time.mktime(nd.timestamp.timetuple()))
                if 'T-A' in str_id:
                    time_tam.append(time.mktime(nd.timestamp.timetuple()))
                time_all.append(time.mktime(nd.timestamp.timetuple()))

        sorted_list = sorted([(k, v) for k, v in counter.items()],
                             key=lambda x: -x[1])
        print(sorted_list[:10])

        plotdata = pd.DataFrame({'patterns': patterns})
        fig, ax = plt.subplots()
        sns.countplot(y='patterns', data=plotdata)
        plt.savefig('{}/pattern_count.png'.format(self.out_header))
        plt.close(fig)

        fig, ax = plt.subplots()
        sns.distplot([np.log(v) for _, v in counter.items()])
        plt.savefig('{}/pattern_count_dist.png'.format(self.out_header))
        plt.close(fig)

        fig, ax = plt.subplots()
        sns.distplot(time_atm, label='at')
        sns.distplot(time_tam, label='ta')
        plt.legend()
        plt.savefig('{}/hist_tam_atm.png'.format(self.out_header))
        plt.close(fig)

        ite_at, ite_ta = [], []
        ite_boundary = [1.42e9, 1.45e9, 1.47e9, 1.50e9]

        def boundary_func(x):
            return x > ite_boundary[i] and x < ite_boundary[i + 1]

        for i in range(3):
            num_ite = len(list(filter(boundary_func, time_all)))
            num_at = len(list(filter(boundary_func, time_atm)))
            num_ta = len(list(filter(boundary_func, time_tam)))

            ite_at.append(num_at / num_ite)
            ite_ta.append(num_ta / num_ite)
        print(ite_at)
        print(ite_ta)

    def neighbor_selection(self, step_size=3):
        import time

        def get_type(ftype):
            return ftype[0].upper()

        self._build_graph()
        patterns, edge_counter = defaultdict(lambda: 0), defaultdict(lambda: 0)
        total_num = len(self.commit_graph.sha2node)
        for _, nd in tqdm(self.commit_graph.sha2node.items()):
            tmp_signature = nd.next(step_size, get_type)
            for ptn in tmp_signature:
                str_id = '-'.join(ptn)
                patterns[str_id] += 1
                if nd.parents:
                    for pnd in nd.parents:
                        for pptn in pnd.next(step_size, get_type):
                            edge_counter['{}->{}'.format(
                                '-'.join(pptn), str_id)] += 1
                else:
                    patterns['START'] += 1
                    edge_counter['START->{}'.format(str_id)] += 1

        print(len(patterns))
        with open('cache/pattern_count.json', 'w') as f_out:
            json.dump(patterns, f_out)
        with open('cache/edge_strength.json', 'w') as f_out:
            json.dump(edge_counter, f_out)

        filtered_patterns = {}
        for k, v in patterns.items():
            if np.log(v) > -1:
                filtered_patterns[k] = v
        # print(sorted(filtered_patterns, key=lambda x: -filtered_patterns[x]))
        print(len(filtered_patterns))

        fig, ax = plt.subplots()
        sns.distplot([np.log(v) for _, v in patterns.items()])
        plt.savefig('{}/state_counter.png'.format(self.out_header))
        plt.close(fig)

        fig, ax = plt.subplots()
        sns.distplot([v for _, v in filtered_patterns.items()])
        plt.savefig('{}/freq_state_counter.png'.format(self.out_header))
        plt.close(fig)

        ptn2index = {}
        for k in filtered_patterns:
            ptn2index[k] = len(ptn2index)
        edge_mat = np.zeros((len(ptn2index), len(ptn2index)))
        for k, v in edge_counter.items():
            ptn_1, ptn_2 = k.split('->')
            if ptn_1 in ptn2index and ptn_2 in ptn2index:
                edge_mat[ptn2index[ptn_1], ptn2index[ptn_2]] = v

        from sklearn.manifold import TSNE
        model = TSNE(n_components=2, random_state=0)
        y = model.fit_transform(edge_mat)

        fig, ax = plt.subplots()
        plt.scatter([x[0] for x in y], [x[1] for x in y])
        plt.savefig('{}/filtered_link_mat_tsne.png'.format(self.out_header))
        plt.close(fig)

    def _build_graph(self, project=None):
        """
      Build a commit graph for a given project. If project is None, build a single graph for all projects.

      Input
        - project: a project from project info 
    """
        self.commit_graph = CommitGraph()
        with open('cache/sha2commit_new.json', 'r') as f_in:
            sha2cmit = json.load(f_in)
        proj_dict = {}
        if project:
            commits = self.gt_client.get_commits(project['repo']['owner'],
                                                 project['repo']['repo'])
            for cmit in commits:
                sha = cmit['sha']
                if sha in sha2cmit:
                    proj_dict[sha] = sha2cmit[sha]
        else:
            proj_dict = sha2cmit
        self.commit_graph.construct(proj_dict)
        if project:
            print('{}: {} root'.format(project['project'],
                                       len(self.commit_graph.root)))
        else:
            print('All: {} root'.format(len(self.commit_graph.root)))

    def _convert_commit(self, commit):
        """
      Convert a commit into an object for analysis

      Input
        - commit: a dictionary got from GitHub get single commit API
    """
        file_types = [self._file_type(item) for item in commit['files']]
Example #13
class ProcessSegmentAnalyzer(object):
    """
    Segments git commits and correlates them with user stories.
  """
    def __init__(self, project_info):
        """
      Tokens are read from the conf/tokens.json file.

      Input
        - project_info: dataset containing all course projects
    """
        super(ProcessSegmentAnalyzer, self).__init__()
        self.project_info = project_info
        with open('conf/tokens.json', 'r') as f_in:
            self.token = json.load(f_in)
        self.gt_analyzer = GithubAnalyzer(self.token['github']['token'],
                                          self.project_info)
        self.gt_client = GithubApi(self.token['github']['token'])
        self.pt_client = TrackerApi(self.token['pivotal_tracker']['token'])
        self.out_header = 'process_segment'
        if not self.out_header in os.listdir('results'):
            os.mkdir('results/{}'.format(self.out_header))
        self.out_header = 'results/{}'.format(self.out_header)

    def correlation(self, proj):
        """
      Generate segmentation for a single project.

      Input
        - proj: a data point in project_info
    """
        pass

    def time_sequence(self, proj):
        """
      Extract time information and files information from commits.

      Input
        - proj: the project
      Output
        - time_sequence: a list of datetime objects, one per commit
        - file_sequence: a list of lists of file indexes, one list per commit
    """
        commits = self.gt_client.get_commits(proj['repo']['owner'],
                                             proj['repo']['repo'])
        stories = self.pt_client.get_stories(proj['tracker'])

        file_indexer = {}
        time_sequence, file_sequence = [], []
        for cmit in commits:
            # tmp_time = datetime.datetime.strptime(commit['commit']['author']['date'], '%Y-%m-%dT%H:%M:%SZ')

            tmp_file_vec = []
            commit = self.gt_analyzer.get_commit(cmit['sha'])
            if not commit:
                print('Commit not found: {}'.format(cmit['sha']))
                continue
            if 'merge' in commit['commit']['message']:
                continue
            for f in commit['files']:
                if not f['filename'] in file_indexer:
                    file_indexer[f['filename']] = len(file_indexer)
                tmp_file_vec.append(file_indexer[f['filename']])
            file_sequence.append(tmp_file_vec)
            time_sequence.append(
                datetime.datetime.strptime(cmit['commit']['author']['date'],
                                           '%Y-%m-%dT%H:%M:%SZ'))
        return time_sequence, file_sequence

    def story_time(self, proj):
        """
      Extract time information and story information from Pivotal Tracker.

      Input
        - proj: the project
      Output
        - times: a list of (created_at, updated_at) datetime pairs, one per story
        - info: a list of the corresponding story dictionaries
    """
        times, info = [], []
        for story in self.pt_client.get_stories(proj['tracker']):
            s = datetime.datetime.strptime(story['created_at'],
                                           '%Y-%m-%dT%H:%M:%SZ')
            e = datetime.datetime.strptime(story['updated_at'],
                                           '%Y-%m-%dT%H:%M:%SZ')
            times.append((s, e))
            info.append(story)
        return times, info

    def story_time_overlaps(self):
        """
      Plot
        - a counting plot of 'active' user stories over time
    """
        import time
        if not 'story_time' in os.listdir(self.out_header):
            os.mkdir('{}/story_time'.format(self.out_header))
        for proj in self.project_info[1:2]:
            times, info = self.story_time(proj)
            time_to_val = {}
            for s_t, e_t in times:
                time_to_val[s_t] = 1
                time_to_val[e_t] = -1
            time_seq, count_seq = [], []
            counter = 0
            for t in sorted(time_to_val.keys()):
                time_seq.append(t)
                counter += time_to_val[t]
                count_seq.append(counter)

            fig, ax = plt.subplots()
            plt.plot([time.mktime(t.timetuple()) for t in time_seq], count_seq)
            plt.savefig('{}/story_time/{}_{}'.format(
                self.out_header, proj['ID'], proj['project'].replace(" ", "")))
            plt.close(fig)

    def git_commit_overlaps(self):
        """
      Plot
        - a scatter plot between time and files edited for a given project.
    """
        import time
        if not 'commit_time' in os.listdir(self.out_header):
            os.mkdir('{}/commit_time'.format(self.out_header))
        for proj in self.project_info[1:2]:
            times, files = self.time_sequence(proj)
            sorted_time = sorted(times)
            t_seq, f_seq = [], []
            for i in range(len(times)):
                for f in files[i]:
                    # t_seq.append(sorted_time.index(times[i]))
                    t_seq.append(time.mktime(times[i].timetuple()))
                    f_seq.append(f)
            plotdata = pd.DataFrame({'time': t_seq, 'file': f_seq})

            fig, ax = plt.subplots()
            sns.jointplot(x='time', y='file', data=plotdata)
            plt.savefig('{}/commit_time/{}_{}.png'.format(
                self.out_header, proj['ID'], proj['project'].replace(" ", "")))
            plt.close(fig)