Example #1
0
    def test_compute_loc_using_different_time_bins(self):
        """LOC for a given bin edge must not depend on the bin step size.

        Computes LOC twice per service, with approximate step sizes of
        182 and 364 days.  Whenever both runs produce a value for the
        same bin-end timestamp, the two values must be equal.
        """
        service_mgr = ServiceMgr(self.db_path)
        service_names = service_mgr.list_all_service_names()
        until = datetime(year=2019, month=12, day=31)
        step_size = 182
        for service_name in service_names:
            service = service_mgr.get_service(service_name=service_name)
            # maps bin-end isoformat timestamp -> list of LOC values
            # (one per step size that produced that bin edge)
            loc_dict = dict()
            for i in [1, 2]:
                s = step_size * i
                time_bins = self.data_mgr.get_time_bins(
                    service_names=service_names,
                    step_size_aprox=s,
                    until=until)
                locs = self.data_mgr.compute_loc(service=service,
                                                 time_bins=time_bins)
                # time_bins[0] is the left edge of the first bin, so each
                # LOC value is keyed by the *end* timestamp of its bin
                for t, l in zip(time_bins[1:], locs):
                    # setdefault replaces the manual `in .keys()` check
                    loc_dict.setdefault(t.isoformat(), []).append(l)

            for k, v in loc_dict.items():
                if len(v) == 2:
                    self.assertEqual(v[0], v[1])
Example #2
0
    def test_extract_new_data_from_ghe(self):
        """End-to-end check of the GHE extraction pipeline.

        Runs the extractor on example.json, then verifies that every
        service and repository listed in the input is retrievable via
        ServiceMgr with matching start/end dates.
        """
        f = os.path.join(os.getcwd(), 'microservices_miner', 'example.json')
        api_token = os.getenv('MINING_GHE_PERSONAL_ACCESS_TOKEN')
        with open(f) as file:
            input_data = json.load(file)
            self.ghe_extractor.extract_new_data_from_ghe(service_list=input_data,
                                                         db_path=self.db_path, api_token=api_token)
        service_mgr = ServiceMgr(db_path=self.db_path)
        # loop-invariant: the persisted service names do not change while
        # we verify them, so fetch the list once instead of per service
        service_names = service_mgr.list_all_service_names()
        for s in input_data:
            name = s.get('name')
            self.assertIn(name, service_names)
            service = service_mgr.get_service(service_name=name)
            start_dt = datetime.strptime(s.get('start_date'), '%Y-%m-%d').date()
            self.assertEqual(service.start_date, start_dt)
            for input_repo in s.get('repositories'):
                found = False
                for repo_data in service.list_repository_data():
                    repo = repo_data.get('repository')
                    if repo.name == input_repo.get('name'):
                        self.assertEqual(input_repo.get('start_date'),
                                         repo_data.get('start_date'))
                        self.assertEqual(input_repo.get('end_date'),
                                         repo_data.get('end_date'))
                        found = True
                        break  # repository located and verified
                self.assertTrue(found)
Example #3
0
 def test_compute_time_to_repair(self):
     """The repair-time metric must come back as a non-empty DataFrame
     carrying exactly the three expected columns."""
     manager = ServiceMgr(self.db_path)
     names = manager.list_all_service_names()
     frame = self.data_mgr.compute_repair_time(service_names=names)
     self.assertIsInstance(frame, pd.DataFrame)
     self.assertEqual(frame.shape[1], 3)
     for expected in ('closed_at', 'time_to_repair', 'repository'):
         self.assertIn(expected, frame.columns)
     self.assertGreaterEqual(frame.shape[0], 1)
Example #4
0
    def compute_repair_time(self, service_names):
        """
        Compute the time to repair a bug, i.e., the time between the bug
        report and the bug fix.  This time is computed in days.

        Parameters
        ----------
        service_names: list of str
            Names of the services whose repositories are inspected.

        Returns
        -------
        pd.DataFrame
            Columns 'closed_at', 'time_to_repair' (in days) and
            'repository'.  The frame is also written to
            $BASE_DIR/data/metrics/repair_time.csv as a side effect.
        """
        service_mgr = ServiceMgr(db_path=self.db_path)
        issue_mgr = IssueMgr(path_to_db=self.db_path)

        # collect the distinct repositories across all requested services
        distinct_repos = list()
        for name in service_names:
            print('Getting data from {}'.format(name))
            svc = service_mgr.get_service(service_name=name)
            for entry in svc.list_repository_data():
                candidate = entry.get('repository')
                if candidate not in distinct_repos:
                    distinct_repos.append(candidate)

        repair_days = list()
        closed_dates = list()
        repo_names = list()
        for repository in distinct_repos:
            labelled_issues = issue_mgr.get_issues_by_label(
                repository_id=repository.repository_id)
            for issue in labelled_issues:
                # only closed issues have a defined repair time
                if issue.state != 'closed':
                    continue
                delta = issue.closed_at - issue.created_at
                # whole days plus leftover seconds, converted back to days
                total_seconds = delta.days * 24 * 60 * 60 + delta.seconds
                days = total_seconds / (24 * 60 * 60)
                if days >= 500:
                    # surface suspiciously long repair times for inspection
                    print(issue)
                repair_days.append(days)
                closed_dates.append(issue.closed_at)
                repo_names.append(repository.name)

        df = pd.DataFrame(
            data={
                'closed_at': closed_dates,
                'time_to_repair': repair_days,
                'repository': repo_names
            })
        filename = os.path.join(os.getenv('BASE_DIR'), 'data', 'metrics',
                                'repair_time.csv')
        df.to_csv(filename)
        return df
Example #5
0
class TestServiceMgr(unittest.TestCase):
    """Integration tests for ServiceMgr against the SQLite DB at $DB_PATH."""

    def setUp(self) -> None:
        db_path = os.getenv('DB_PATH')
        self.service_mgr = ServiceMgr(db_path=db_path)
        self.conn = sqlite3.connect(db_path)

    def tearDown(self) -> None:
        # close the connection opened in setUp so test runs don't leak
        # SQLite file handles
        self.conn.close()

    def test_get_service_by_name(self):
        """Each service must expose as many repositories as the DB records,
        and every commit's file-modification count must match the DB rows
        accepted by FileSystemMgr.check_filename."""
        cursor = self.conn.cursor()
        service_names = self.service_mgr.list_all_service_names()
        filesystem_mgr = FileSystemMgr(db_path=os.getenv('DB_PATH'))
        for service_name in service_names:
            service = self.service_mgr.get_service(service_name=service_name)

            # only the table name is formatted in; values are bound with
            # '?' placeholders instead of string interpolation
            count_repo_sql = 'select count(*) from {} where service_id = ?;'.format(
                ServiceRepositoryConn.TABLE_NAME)
            cursor.execute(count_repo_sql, (service.service_id,))
            total_number_of_repositories = cursor.fetchone()[0]

            self.assertIsInstance(service, Service)
            self.assertTrue(len(service.list_repository_data()) ==
                            total_number_of_repositories > 0,
                            msg='Error! Invalid number of services')
            for repo_data in service.list_repository_data():
                repo = repo_data.get('repository')
                self.assertIsInstance(repo, Repository)
                for c in repo.commits:
                    filename_sql = 'select filename from {} where commit_id = ?;' \
                        .format(FileModificationConn.TABLE_NAME)
                    cursor.execute(filename_sql, (c.commit_id,))
                    rows = cursor.fetchall()
                    # count only filenames that pass the service's
                    # inclusion/exclusion filters
                    counter = 0
                    for row in rows:
                        filename = row[0]
                        if filesystem_mgr.check_filename(
                                filename=filename,
                                service_id=service.service_id,
                                repository_id=repo.repository_id):
                            counter += 1
                    file_modification_count = len(c.file_modifications)
                    self.assertEqual(
                        counter,
                        file_modification_count,
                        msg='Different number of file modifications')
Example #6
0
    def test_compute_bug_per_loc_ratio(self):
        """The defect-density DataFrame must be non-empty and carry the
        four expected columns: year, defect_density, bugs and loc."""
        until = datetime(year=2019, month=12, day=31)
        service_mgr = ServiceMgr(db_path=self.db_path)
        names = service_mgr.list_all_service_names()
        bins = self.data_mgr.get_time_bins(service_names=names,
                                           step_size_aprox=365,
                                           until=until)
        df = self.data_mgr.compute_bug_per_loc_ratio(
            service_names=names, time_bins=bins)
        self.assertGreater(df.shape[0], 0)
        self.assertEqual(df.shape[1], 4)
        expected_columns = ['year', 'defect_density', 'bugs', 'loc']
        self.assertTrue(all(col in df.columns for col in expected_columns))
Example #7
0
    def test_check_filename(self):
        """For repositories shared by more than one service, check_filename
        must honour the inclusion/exclusion patterns stored in the
        filename_pattern table for that service."""
        service_mgr = ServiceMgr(db_path=self.db_path)
        conn = sqlite3.connect(self.db_path)
        cursor = conn.cursor()
        for name in service_mgr.list_all_service_names():
            service = service_mgr.get_service(service_name=name)
            for repo_data in service.list_repository_data():
                repo = repo_data.get('repository')  # type: Repository
                # values bound with '?' placeholders rather than .format()
                sql = 'select * from {} where repository_id = ?;'.format(
                    ServiceRepositoryConn.TABLE_NAME)
                cursor.execute(sql, (repo.repository_id,))
                rows = cursor.fetchall()
                if len(rows) > 1:
                    # the repository belongs to several services, so the
                    # filename patterns decide which files count here
                    sql = 'select pattern, type, repository_id from filename_pattern where service_id = ?;'
                    cursor.execute(sql, (service.service_id,))
                    patterns = cursor.fetchall()
                    # 'pattern_type' renamed from 'type' to stop shadowing
                    # the builtin
                    for pattern, pattern_type, repo_id in patterns:
                        if repo.repository_id != repo_id:
                            continue
                        for c in repo.commits:
                            for fm in c.file_modifications:
                                is_valid = self.fs.check_filename(filename=fm.filename,
                                                                  service_id=service.service_id,
                                                                  repository_id=repo.repository_id)
                                self.assertIsInstance(is_valid, bool)
                                endswith_py = fm.filename.endswith('.py')
                                if pattern in fm.filename.lower() and pattern_type == FileSystemMgr.INCLUSION:
                                    self.assertTrue(is_valid,
                                                    msg='service = {} filename={} '.format(service.name, fm.filename))
                                    self.assertTrue(endswith_py,
                                                    msg='service = {} filename={} '.format(service.name, fm.filename))
                                elif pattern in fm.filename and pattern_type == FileSystemMgr.EXCLUSION:
                                    self.assertFalse(is_valid,
                                                     msg='service = {} filename={} '.format(service.name, fm.filename))
                        # only the first pattern for this repository is
                        # exercised, matching the original behaviour
                        break
Example #8
0
    def compute_bug_per_loc_ratio(self, service_names, time_bins):
        """
        Compute yearly defect density (bugs per thousand LOC) per service.

        For each service and each time bin, counts bugs (closed labelled
        issues plus commits whose messages look like bug fixes) and pairs
        them with the LOC computed for that bin, then aggregates by year.

        Parameters
        ----------
        service_names: List[str]
        time_bins: Tuple[datetime]

        Returns
        -------
        pd.DataFrame
            Yearly aggregate with a 'defect_density' column (bugs / KLOC).
            As side effects, the per-bin frame is written to
            $BASE_DIR/data/metrics/defect_density.csv and the aggregate to
            $BASE_DIR/data/metrics/<PlotMgr.DEFECT_DENSITY_CSV_FILE>.
        """
        print(
            'Getting bugs and changes from services: {}'.format(service_names))
        # connecting to the database
        issue_mgr = IssueMgr(path_to_db=self.db_path)
        service_mgr = ServiceMgr(db_path=self.db_path)

        # initializing variables
        bugs = list()
        loc_list = list()
        date_list = list()
        service_list = list()

        # for each service
        for service_name in service_names:
            print('Analysing the defect density of {} service'.format(
                service_name))
            service = service_mgr.get_service(service_name=service_name)
            # compute_loc yields one value per bin; extended here so the
            # assert below can check lockstep growth with the bug counts
            loc_aux = self.compute_loc(service, time_bins)
            loc_list.extend(loc_aux)
            for i in range(1, len(time_bins)):

                # bin i spans [time_bins[i-1], time_bins[i])
                prev_t = time_bins[i - 1]
                cur_t = time_bins[i]
                bugs_counter = 0

                for repo_data in service.list_repository_data():
                    repo = repo_data.get('repository')  # type: Repository
                    # print('Getting issues from {}'.format(repo.name))
                    issues = issue_mgr.get_issues_by_label(
                        repository_id=repo.repository_id)

                    # bugs reported as issues and closed inside this bin
                    for issue in issues:
                        if issue.closed_at is not None and prev_t <= issue.closed_at < cur_t:
                            bugs_counter += 1

                    # bugs visible only as fix commits: counted when the
                    # message looks like a fix but does NOT use the GHE
                    # issue-closing keywords (those are already counted
                    # above via their closed issue)
                    for c in repo.commits:
                        if prev_t <= c.date < cur_t:
                            if DataMgr._is_bug_fix(comment=c.comment) \
                                    and not DataMgr.closed_using_keywords(line=c.comment):
                                print('{}: {}'.format(service.name, c.comment))
                                bugs_counter += 1
                bugs.append(bugs_counter)
                service_list.append(service_name)
                date_list.append(cur_t)
            # all four columns must stay in lockstep after every service
            assert len(loc_list) == len(bugs) == len(date_list) == len(
                service_list)
        temp_df = pd.DataFrame(
            data={
                'service': service_list,
                'date': date_list,
                'bugs': bugs,
                'loc': loc_list
            })
        temp_filename = os.path.join(os.getenv('BASE_DIR'), 'data', 'metrics',
                                     'defect_density.csv')
        temp_df.to_csv(temp_filename, sep=',')
        # drop bins with no code: density would divide by zero
        temp_df = temp_df[temp_df['loc'] != 0]
        # NOTE(review): assigning via .loc on a frame produced by boolean
        # filtering can raise SettingWithCopyWarning on some pandas
        # versions — consider .copy() after the filter; verify.
        temp_df.loc[:, 'year'] = temp_df['date'].apply(lambda t: t.year)
        # NOTE(review): .sum() over non-numeric columns ('service', 'date')
        # is pandas-version-dependent (may concatenate, drop, or raise) —
        # confirm against the pandas version pinned by this project.
        df_group = temp_df.groupby(by=['year'], as_index=False).sum()
        # defect density = bugs per thousand lines of code
        df_group['defect_density'] = df_group['bugs'] / (df_group['loc'] /
                                                         1000)

        df_group.to_csv(
            os.path.join(os.getenv('BASE_DIR'), 'data', 'metrics',
                         PlotMgr.DEFECT_DENSITY_CSV_FILE))
        return df_group
Example #9
0
    def test_compute_loc_per_commit(self):
        """Compare per-commit LOC against per-repository cloc oracles.

        For every repository of every service, loads a CSV oracle
        ('service_id_<id>_<repo>__oracle_cloc.csv', indexed by commit SHA
        with a 'CLOC' column) and checks that the LOC computed by
        compute_loc_per_repository matches within a relative error of 0.1
        or an absolute error of 1000 lines.
        """
        service_mgr = ServiceMgr(self.db_path)
        repository_mgr = RepositoryMgr(self.db_path)
        # tolerance thresholds: absolute diff applies only when the
        # relative diff already exceeds max_relative_error
        max_absolute_error = 1000
        max_relative_error = 0.1
        service_names = service_mgr.list_all_service_names()
        for service_name in sorted(service_names):

            print('**********\n{}'.format(service_name))
            service = service_mgr.get_service(
                service_name=service_name)  # type: Service
            for repo_data in service.list_repository_data():
                temp_repo = repo_data.get('repository')  # type: Repository

                # NOTE(review): repo_sha_set is never populated anywhere in
                # this test, so the set-difference prints at the bottom
                # always show the full oracle set — likely a leftover;
                # presumably it was meant to collect sha_list. Verify.
                repo_sha_set = set()
                oracle_sha_set = set()
                repository = repository_mgr.get_repository(
                    service_id=service.service_id,
                    repository_id=temp_repo.repository_id)
                test_oracle = os.path.join(
                    os.getenv('BASE_DIR'), 'data', 'oracle',
                    'service_id_{}_{}__oracle_cloc.csv'.format(
                        service.service_id, temp_repo.name))
                self.assertTrue(
                    os.path.isfile(test_oracle),
                    msg='Error! file does not exist: {}'.format(test_oracle))
                loc_list, sha_list, date_list = self.data_mgr.compute_loc_per_repository(
                    repository=repository)
                df = pd.read_csv(test_oracle, index_col=0, sep=';')
                self.assertGreater(df.shape[0], 0)
                # the oracle index is the commit SHA
                for i in df.index:
                    oracle_sha_set.add(i)
                self.assertGreater(len(loc_list), 0)
                for loc, sha, dt in zip(loc_list, sha_list, date_list):

                    # only commits present in the oracle can be compared
                    if sha in df.index:
                        s = df.loc[sha, ['CLOC']]
                        l = s.tolist()
                        # the oracle must hold exactly one CLOC per SHA
                        assert len(l) == 1, "Error! {}".format(l)
                        cloc = l.pop()
                        if loc == 0:
                            if cloc == 0:
                                # both zero: perfect agreement
                                ratio = 1.0
                            else:
                                # loc == 0 but cloc != 0: ratio undefined,
                                # skip this commit
                                continue
                        else:
                            ratio = cloc / loc
                        rel_diff = np.abs(ratio - 1.0)
                        print('SHA={} LOC={} CLOC={}'.format(sha, loc, cloc))
                        self.assertIsInstance(
                            rel_diff,
                            float,
                            msg='Error! delta is {} type={}'.format(
                                rel_diff, type(rel_diff)))
                        abs_diff = np.abs(loc - cloc)
                        print(
                            '{} relative diff = {} absolute diff = {}'.format(
                                dt, rel_diff, abs_diff))
                        if rel_diff > max_relative_error:

                            # large relative error is tolerated only when
                            # the absolute difference is still small
                            self.assertGreater(max_absolute_error,
                                               abs_diff,
                                               msg='Error! sha={}'.format(sha))

                print('service={} len oracle = {} len GHE = {}'.format(
                    service_name, len(oracle_sha_set), len(repo_sha_set)))
                print('service={} oracle - GHE = {}'.format(
                    service_name, oracle_sha_set - repo_sha_set))
                print('service={} GHE - oracle = {}'.format(
                    service_name, repo_sha_set - oracle_sha_set))
                print('service={} intersection {}'.format(
                    service_name, repo_sha_set.intersection(oracle_sha_set)))
Example #10
0
 def setUp(self) -> None:
     """Open the SQLite database pointed at by $DB_PATH and build the
     ServiceMgr under test."""
     path = os.getenv('DB_PATH')
     self.service_mgr = ServiceMgr(db_path=path)
     self.conn = sqlite3.connect(path)
Example #11
0
    def extract_new_data_from_ghe(service_list: List[Dict], db_path: str,
                                  api_token: str) -> None:
        """
        Extract commit data from GitHub Enterprise for every service in
        *service_list* and persist it into the SQLite DB at *db_path*.

        For each service this creates the service row (and its file
        extensions) if missing, then for each of its repositories creates
        the repository row if missing, links it to the service, fetches
        commits newer than the last stored commit, inserts them, repairs
        inconsistencies, and finally updates the service's dates.

        NOTE(review): no `self`/`cls` parameter — presumably decorated
        with @staticmethod just above this view; confirm.

        Parameters
        ----------
        service_list: List[Dict]
            Each dict provides at least 'name', 'start_date', 'end_date',
            'programming_languages' and 'repositories' (each repository
            with 'url', 'name', 'owner', 'start_date', 'end_date',
            'initial_loc') — inferred from the keys read below.
        db_path: str
        api_token: str

        Returns
        -------
        None
        """
        extractor = GHEExtractor(db_path=db_path)
        commit_mgr = CommitMgr(path_to_db=db_path)
        repo_mgr = RepositoryMgr(path_to_db=db_path)
        service_mgr = ServiceMgr(db_path=db_path)
        filesystem_mgr = FileSystemMgr(db_path)

        for s in service_list:
            service_name = s.get('name')
            end_date_str = s.get('end_date')
            start_date_str = s.get('start_date')
            programming_languages = s.get('programming_languages')
            logging.info('Extracting data from: {}'.format(service_name))
            service = service_mgr.get_service(service_name=service_name)
            if service is None:
                # first time we see this service: create it and register
                # its file extensions
                sid = service_mgr.insert_service(name=service_name,
                                                 start_date_str=start_date_str)
                service = service_mgr.get_service(service_id=sid)
                assert isinstance(sid, int)
                filesystem_mgr.insert_extensions(
                    service_id=sid,
                    programming_languages=programming_languages)
            else:
                sid = service.service_id

            # get list of extensions of this service
            extensions = filesystem_mgr.get_extensions(service_id=sid)
            for rep in s.get('repositories'):
                url = rep.get('url')
                base_url = GHEExtractor._get_base_url(repo_url=url)
                print('Getting data from: {}'.format(url))
                repo_name = rep.get('name')
                owner = rep.get('owner')
                repo = repo_mgr.get_repository(url=url, service_id=sid)
                if repo is None:
                    # repository not in the DB yet: create and insert it
                    repo_url = '{}/{}/{}'.format(base_url, owner, repo_name)
                    repo = repo_mgr.create_repository(name=repo_name,
                                                      url=repo_url)
                    repo.repository_id = repo_mgr.insert_repository(
                        repository=repo)
                    repo.owner = owner

                start_date_repo = rep.get('start_date')
                end_date_repo = rep.get('end_date')
                initial_loc = rep.get('initial_loc')
                # link the repository to the service with its date range
                service_mgr.insert_service_repository(
                    service_name=service_name,
                    repository_id=repo.repository_id,
                    start_date=start_date_repo,
                    end_date=end_date_repo,
                    initial_loc=initial_loc)

                # get most recent commits: only fetch commits newer than
                # the last one already stored (None -> full history)
                last_commit = commit_mgr.get_commit_by_position(
                    repository_id=repo.repository_id, pos=-1)
                if last_commit is not None:
                    since = last_commit.date
                else:
                    since = None
                assert repo is not None, "Error! Invalid repo"
                assert isinstance(
                    repo,
                    Repository), "Error! repo is not a type of Repository"
                assert repo.repository_id is not None, "Error! repo id is none: {}".format(
                    repo)
                commits = extractor.extract_commits_from_ghe(
                    base_url=base_url,
                    owner=owner,
                    repo=repo,
                    since=since,
                    api_token=api_token)
                extractor._insert_commit_data_into_database(
                    commit_list=commits,
                    repo=repo,
                    owner=owner,
                    base_url=base_url,
                    api_token=api_token)

                extractor._find_and_repair_inconsistencies(
                    owner=owner,
                    repository=repo,
                    extensions=extensions,
                    base_url=base_url)
            # missing/invalid dates degrade to None rather than aborting
            # the whole extraction (the exception object is unused)
            try:
                end_date = datetime.strptime(end_date_str, "%Y-%m-%d")
            except (ValueError, TypeError) as e:
                end_date = None

            # NOTE(review): end_date stays a datetime while start_date is
            # narrowed to a date — confirm update_dates expects this mix.
            try:
                start_date_dt = datetime.strptime(start_date_str, "%Y-%m-%d")
                start_date = start_date_dt.date()
            except (ValueError, TypeError) as e:
                start_date = None

            service_mgr.update_dates(service_id=service.service_id,
                                     end_date=end_date,
                                     start_date=start_date)