def setUp(self):
        # Set up paths to the test data
        self.input_path_python = os.path.dirname(
            os.path.realpath(__file__)) + '/data/python_project'
        self.input_path_java = os.path.dirname(
            os.path.realpath(__file__)) + '/data/java_project2'
        self.out_java = os.path.dirname(
            os.path.realpath(__file__)) + '/data/out_java'

        # Clear the database first (small hack needed, as mongomock's drop_database does not work)
        Project.drop_collection()
        VCSSystem.drop_collection()
        Commit.drop_collection()
        File.drop_collection()

        self.project_id = Project(name="zookeeper").save().id
        self.vcs_id = VCSSystem(url="http://test.de",
                                project_id=self.project_id,
                                repository_type="test").save().id
        self.commit_id = Commit(revision_hash="2342",
                                vcs_system_id=self.vcs_id).save()
        self.file1 = File(path="contribs/CekiGulcu/AppenderTable.java",
                          vcs_system_id=self.vcs_id).save()
        self.file2 = File(path="contribs/LeosLiterak/TempFileAppender.java",
                          vcs_system_id=self.vcs_id).save()
        self.file3 = File(
            path="src/main/java/org/apache/log4j/AsyncAppender.java",
            vcs_system_id=self.vcs_id).save()

        shutil.rmtree(self.out_java, ignore_errors=True)
        shutil.rmtree(self.input_path_java, ignore_errors=True)
        os.makedirs(self.out_java)

        # Copy fake files that would be generated by SourceMeter
        self.class_csv = os.path.dirname(
            os.path.realpath(__file__)) + '/data/csv_data/zookeeper-Class.csv'
        self.package_csv = os.path.dirname(os.path.realpath(
            __file__)) + '/data/csv_data/zookeeper-Package.csv'
        self.component_csv = os.path.dirname(os.path.realpath(
            __file__)) + '/data/csv_data/zookeeper-Component.csv'
        shutil.copy(self.class_csv, self.out_java)
        shutil.copy(self.package_csv, self.out_java)
        shutil.copy(self.component_csv, self.out_java)

        # Create files and directories
        os.makedirs(self.input_path_java + '/contribs/CekiGulcu')
        os.makedirs(self.input_path_java + '/contribs/LeosLiterak')
        os.makedirs(self.input_path_java + '/src/main/java/org/apache/log4j')

        Path(self.input_path_java +
             '/contribs/CekiGulcu/AppenderTable.java').touch()
        Path(self.input_path_java +
             '/contribs/LeosLiterak/TempFileAppender.java').touch()
        Path(self.input_path_java +
             '/src/main/java/org/apache/log4j/AsyncAppender.java').touch()
Example 2
def main(args):
    # timing
    start = timeit.default_timer()

    if args.log_level and hasattr(logging, args.log_level):
        log.setLevel(getattr(logging, args.log_level))

    uri = create_mongodb_uri_string(args.db_user, args.db_password, args.db_hostname, args.db_port,
                                    args.db_authentication, args.ssl)
    connect(args.db_database, host=uri)

    # Get the id of the project whose commits shall be labeled
    try:
        project_id = Project.objects(name=args.project_name).get().id
    except DoesNotExist:
        log.error('Project %s not found!' % args.project_name)
        sys.exit(1)

    vcs = VCSSystem.objects(project_id=project_id).get()

    log.info("Starting commit labeling")

    # import every approach defined or all
    if args.approaches == 'all':
        # just list every module in the package and import it
        basepath = os.path.dirname(os.path.abspath(__file__))
        for app in os.listdir(os.path.join(basepath, 'approaches/')):
            if app.endswith('.py') and app != '__init__.py':
                __import__('approaches.{}'.format(app[:-3]))
    else:
        # if we have a list of approaches import only those
        for app in args.approaches.split(','):
            __import__('approaches.{}'.format(app))

    # add specific configs
    labelshark = LabelSHARK()
    commit_count = Commit.objects(vcs_system_id=vcs.id).count()

    for i, commit in enumerate(
            Commit.objects(vcs_system_id=vcs.id).only(
                'id', 'revision_hash', 'vcs_system_id', 'message',
                'linked_issue_ids', 'parents', 'fixed_issue_ids',
                'szz_issue_ids').timeout(False)):
        if i % 100 == 0:
            log.info("%i/%i commits finished", i, commit_count)
        labelshark.set_commit(commit)
        labels = labelshark.get_labels()

        #log.info('commit: {}, labels: {}'.format(commit.revision_hash, labels))

        # save the labels
        if labels:
            tmp = {'set__labels__{}'.format(k): v for k, v in labels}
            Commit.objects(id=commit.id).upsert_one(**tmp)

    end = timeit.default_timer() - start
    log.info("Finished commit labeling in {:.5f}s".format(end))
Example 3
    def _classify_hunk_documentation(self, commit_issue_df):
        commit = Commit.objects(id=commit_issue_df._id[0])
        commit_df = map_mongo_to_pandas(commit)
        commit_df.rename(columns={'_id': 'commit_id'}, inplace=True)
        # read changed files
        file_action_files = self._get_actions_files(commit_df)
        file_action_files = file_action_files.loc[:, ("commit_id", "path",
                                                      "file_id",
                                                      "file_action_id")]
        file_action_files = file_action_files[
            file_action_files.path.str.contains(
                r"\.(?:java|c|cpp|py|h|cc)$", na=False)]
        commit_files = commit_df.merge(file_action_files, how="left")
        hunks = Hunk.objects(
            file_action_id__in=commit_files.file_action_id.values.tolist())
        hunks_df = map_mongo_to_pandas(hunks)

        commit_files_hunk = commit_files.merge(hunks_df,
                                               how="left",
                                               left_on="file_action_id",
                                               right_on="file_action_id")

        # check whether the hunks contain actual code rather than only comments and blank lines
        commit_files_hunk["c_documentation"] = 0
        commit_files_hunk = commit_files_hunk.apply(
            self._find_documentation_hunk, axis=1)
        commit_files_hunk = commit_files_hunk.loc[:, ("commit_id",
                                                      "c_documentation")]
        commit_files_hunk = commit_files_hunk.groupby(
            "commit_id").sum().reset_index()
        commit_files_hunk.loc[commit_files_hunk.c_documentation != 0,
                              ("c_documentation")] = 1
        commit_files_hunk = commit_files_hunk.loc[:, ("commit_id",
                                                      "c_documentation")]
        return commit_files_hunk
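The path filter above relies on pandas' Series.str.contains with a regular expression; here is a small standalone sketch of that filtering step with made-up paths (not taken from the project data).

import pandas as pd

# hypothetical paths; only source files should survive the filter
df = pd.DataFrame({'path': ['src/Foo.java', 'docs/readme.md', 'lib/bar.py', 'img/logo.png']})
mask = df.path.str.contains(r"\.(?:java|c|cpp|py|h|cc)$", na=False)
print(df[mask])  # keeps src/Foo.java and lib/bar.py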
Example 4
    def initialize(self, config, repository_url, repository_type):
        """Initializes the mongostore by connecting to the mongodb, creating the project in the project collection \
        and setting up processes (see: :class:`pyvcsshark.datastores.mongostore.CommitStorageProcess`, which
        read commits out of the commitqueue, process them and store them into the mongodb.

        :param config: all configuration
        :param repository_url: url of the repository, which is to be analyzed
        :param repository_type: type of the repository, which is to be analyzed (e.g. "git")
        """

        logger.setLevel(config.debug_level)
        logger.info("Initializing MongoStore...")

        # Create queue for multiprocessing
        self.commit_queue = multiprocessing.JoinableQueue()
        # We assume that the user we authenticate with is in the admin database
        logger.info("Connecting to MongoDB...")

        uri = create_mongodb_uri_string(config.db_user, config.db_password,
                                        config.db_hostname, config.db_port,
                                        config.db_authentication,
                                        config.ssl_enabled)
        connect(config.db_database, host=uri, connect=False)

        # Get project_id
        try:
            project_id = Project.objects(name=config.project_name).get().id
        except DoesNotExist:
            logger.error('Project with name "%s" does not exist in database!' %
                         config.project_name)
            sys.exit(1)

        # Check if the VCS system already exists and use upsert
        vcs_system_id = VCSSystem.objects(url=repository_url).upsert_one(
            url=repository_url,
            repository_type=repository_type,
            last_updated=datetime.datetime.today(),
            project_id=project_id).id

        # Get the last commit by date of the project (if there is any)
        last_commit = Commit.objects(vcs_system_id=vcs_system_id)\
            .only('committer_date').order_by('-committer_date').first()

        if last_commit is not None:
            last_commit_date = last_commit.committer_date
        else:
            last_commit_date = None

        # Start workers; they will wait until something arrives in the queue and then process it
        for i in range(self.NUMBER_OF_PROCESSES):
            name = "StorageProcess-%d" % i
            process = CommitStorageProcess(self.commit_queue, vcs_system_id,
                                           last_commit_date, config, name)
            process.daemon = True
            process.start()

        logger.info("Starting storage Process...")
Example 5
    def run(self):
        """ Endless loop for the processes, which consists of several steps:

        1. Get an object of class :class:`pyvcsshark.dbmodels.models.CommitModel` from the queue
        2. Check if this commit was stored before and, if so, update branches and tags (if they have changed)
        3. Store author and committer in mongodb
        4. Store tags in mongodb
        5. Create a list of branches to which the commit belongs
        6. Save the file actions of this commit in the mongodb
        7. Save the commit itself


        .. NOTE:: The committer date is used to check if a commit was already stored before. Meaning: we get the \
        last commit out of the database and check if the committer date of each commit we process is greater than \
        the committer date of that last commit.

        .. WARNING:: We only look for changed tags and branches here for already processed commits!
        """
        while True:
            commit = self.queue.get()
            logger.debug("Process %s is processing commit with hash %s." %
                         (self.proc_name, commit.id))

            # Try to get the commit
            try:
                mongo_commit = Commit.objects(vcs_system_id=self.vcs_system_id,
                                              revision_hash=commit.id).get()
            except DoesNotExist:
                mongo_commit = Commit(vcs_system_id=self.vcs_system_id,
                                      revision_hash=commit.id).save()

            self.set_whole_commit(mongo_commit, commit)

            # Save Revision object
            mongo_commit.save()
            logger.debug(
                "Process %s saved commit with hash %s. Queue size: %d" %
                (self.proc_name, commit.id, self.queue.qsize()))

            self.queue.task_done()
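The run loop above is the consumer half of a standard multiprocessing.JoinableQueue pattern; the following self-contained sketch shows the same producer/consumer handshake with placeholder workers and items rather than the pyvcsshark classes.

import multiprocessing


def worker(queue):
    while True:
        item = queue.get()                    # blocks until a producer puts something
        try:
            print('processing', item)         # placeholder for the real work (storing a commit)
        finally:
            queue.task_done()                 # signal completion so join() can return


if __name__ == '__main__':
    queue = multiprocessing.JoinableQueue()
    for i in range(2):
        process = multiprocessing.Process(target=worker, args=(queue,), daemon=True)
        process.start()
    for item in range(10):
        queue.put(item)
    queue.join()                              # returns once every item was marked task_done()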
Example 6
    def get_commit_id(self, vcs_system_id):
        """
        Gets the commit id for the corresponding vcs system and revision
        :param vcs_system_id: id of the vcs system. :class:`bson.objectid.ObjectId`

        :return: commit_id (:class:`bson.objectid.ObjectId`)
        """
        try:
            return Commit.objects(vcs_system_id=vcs_system_id,
                                  revision_hash=self.revision_hash).get().id
        except DoesNotExist:
            logger.error(
                "Commit with vcs_system_id %s and revision %s does not exist" %
                (vcs_system_id, self.revision_hash))
            sys.exit(1)
Example 7
    def _classify_metric_documentation(self, commit_issue_df):
        commit = Commit.objects(id=commit_issue_df._id[0])
        commit_df = map_mongo_to_pandas(commit)
        commit_hash_df = commit_df.apply(lambda x: pd.Series(x['parents']),
                                         axis=1).stack().reset_index(level=1,
                                                                     drop=True)
        commit_hash_df.name = 'parents'
        commit_hash_df = commit_df.drop("parents", axis=1).join(commit_hash_df)
        commit_hash_df = commit_hash_df.loc[:, ("_id", "revision_hash")]
        commit_hash_df.columns = ['commit_id', 'revision_hash']
        # read previous revisions
        parent_commits = Commit.objects(
            revision_hash__in=commit_hash_df.revision_hash.values.tolist())
        prev_df = map_mongo_to_pandas(parent_commits)

        prev_df = prev_df.loc[:, ("_id", "revision_hash")]
        prev_df.columns = ["parent_commit_id", "parent_revision_hash"]
        current_previous_commits_df = commit_hash_df.merge(
            prev_df, left_on="revision_hash", right_on="parent_revision_hash")
        current_previous_commits_df.drop("revision_hash", axis=1, inplace=True)

        # read changed files
        file_action_files = self._get_actions_files(
            current_previous_commits_df)
        file_action_files = file_action_files.loc[:, ("commit_id", "path",
                                                      "file_id")]
        file_action_files = file_action_files[
            file_action_files.path.str.contains(r"\.java", na=False)]
        commit_prev_files = current_previous_commits_df.merge(
            file_action_files)

        current_commits = [{
            'commit_id': commit_id
        } for commit_id in commit_prev_files.commit_id.values.tolist()]
        parent_commits = [{
            'commit_id': commit_id
        } for commit_id in commit_prev_files.parent_commit_id.values.tolist()]
        file_ids = [{
            'file_id': file_id
        } for file_id in commit_prev_files.file_id.values.tolist()]

        # supported types by sourcemeter for java
        code_entity_types = [{
            'ce_type': 'annotation'
        }, {
            'ce_type': 'class'
        }, {
            'ce_type': 'enum'
        }, {
            'ce_type': 'interface'
        }, {
            'ce_type': 'method'
        }]

        current_commits_files = []
        for i in range(len(current_commits)):
            current_commits_files_dict = dict(
                list(current_commits[i].items()) + list(file_ids[i].items()))
            current_commits_files_dict['$or'] = code_entity_types
            current_commits_files.append(current_commits_files_dict)

        parent_commits_files = []
        for i in range(len(parent_commits)):
            parent_commits_files_dict = dict(
                list(parent_commits[i].items()) + list(file_ids[i].items()))
            parent_commits_files_dict['$or'] = code_entity_types
            parent_commits_files.append(parent_commits_files_dict)

        # read metrics related to the commit
        current_code_entity_states = CodeEntityState.objects(
            __raw__={'$or': current_commits_files})
        current_metrics = map_mongo_to_pandas(current_code_entity_states)
        current_metrics = current_metrics.loc[:, ("commit_id", "long_name",
                                                  "metrics", "file_id")]
        current_metrics.rename(columns={'commit_id': 'commit_id_x'},
                               inplace=True)

        parent_code_entity_states = CodeEntityState.objects(
            __raw__={'$or': parent_commits_files})
        parent_metrics = map_mongo_to_pandas(parent_code_entity_states)
        parent_metrics = parent_metrics.loc[:, ("commit_id", "long_name",
                                                "metrics", "file_id")]
        parent_metrics.rename(columns={'commit_id': 'commit_id_y'},
                              inplace=True)

        commit_prev_files_cpy = commit_prev_files.copy()
        commit_metrices_current = pd.merge(commit_prev_files,
                                           current_metrics,
                                           how='left',
                                           left_on=['commit_id', 'file_id'],
                                           right_on=['commit_id_x', 'file_id'])
        commit_metrices_current = commit_metrices_current.loc[:, (
            "commit_id", "parent_commit_id", "file_id", "long_name",
            "metrics")]
        commit_metrices_parent = pd.merge(
            commit_prev_files_cpy,
            parent_metrics,
            how='left',
            left_on=['parent_commit_id', 'file_id'],
            right_on=['commit_id_y', 'file_id'])
        commit_metrices_parent = commit_metrices_parent.loc[:, (
            "commit_id", "parent_commit_id", "file_id", "long_name",
            "metrics")]
        commit_metrics = pd.merge(
            commit_metrices_current,
            commit_metrices_parent,
            how='outer',
            left_on=['commit_id', 'parent_commit_id', 'file_id', 'long_name'],
            right_on=['commit_id', 'parent_commit_id', 'file_id', 'long_name'])

        commit_metrics["delta_DLOC"] = 0
        commit_metrics = commit_metrics.apply(self._calculate_dloc_metric,
                                              axis=1)
        commit_metrics = commit_metrics.loc[:, ("commit_id", "delta_DLOC")]
        commit_metrics = commit_metrics.groupby(
            "commit_id").sum().reset_index()
        commit_metrics.loc[commit_metrics.delta_DLOC != 0, ("delta_DLOC")] = 1
        return commit_metrics
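The per-commit filter documents built above are passed to mongoengine through __raw__; the sketch below shows what a single raw filter looks like and how the list is OR-ed together (the ObjectIds are placeholders, not real database ids).

from bson import ObjectId

# supported SourceMeter code entity types for java, as in the code above
code_entity_types = [{'ce_type': t} for t in
                     ('annotation', 'class', 'enum', 'interface', 'method')]

# one filter document per (commit, file) pair; the ids here are made up
commit_id, file_id = ObjectId(), ObjectId()
one_filter = {'commit_id': commit_id, 'file_id': file_id, '$or': code_entity_types}

# the full query ORs all per-pair filters, e.g. CodeEntityState.objects(__raw__=raw_query)
raw_query = {'$or': [one_filter]}
print(raw_query)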
Example 8
    def _find_boundary_date(self, issues, version_dates, affected_versions):
        """Find suspect boundary date.

        Uses the latest issue information but the earliest relevant date (between created_at and affected versions):

        - latest creation date of linked bugs
        - earliest affected version
        """
        tags = git_tag_filter(self._project_name,
                              discard_patch=False,
                              correct_broken_tags=True)
        issue_dates = []
        affected_version_dates = []
        for issue in issues:

            if not issue.created_at:
                self._log.warning(
                    'no reporting date for issue {} id({}), ignoring it'.
                    format(issue.external_id, issue.id))
                continue

            # direct link match, broken dates are already filtered in pycoshark so we do not need to do that here
            for av in issue.affects_versions:
                for tag in tags:
                    if av.lower() == tag['original'].lower():
                        rev = tag['revision']
                        if 'corrected_revision' in tag.keys():
                            rev = tag['corrected_revision']

                        c = Commit.objects(
                            vcs_system_id=self._vcs_id,
                            revision_hash=rev).only('committer_date').get()
                        affected_version_dates.append(c.committer_date)
                        self._log.debug(
                            'found direct link between tag: {} and affected version: {} using '
                            .format(tag['original'], av))

            for av in get_affected_versions(issue, self._project_name,
                                            self._jira_key):
                avt = tuple(av)
                if avt in version_dates.keys():

                    for version_date in version_dates[avt]:
                        if version_date not in affected_version_dates:
                            affected_version_dates.append(version_date)
                else:
                    self._log.warning(
                        'affected version {} not found in git tags, skipping'.
                        format(avt))

            issue_dates.append(issue.created_at)

        # find latest bug report
        suspect_boundary_date = max(issue_dates)

        # latest bug report
        self._log.debug('latest bug report date is {} of {}'.format(
            suspect_boundary_date, issue_dates))

        # return earliest affected version, only if we want
        if affected_versions and affected_version_dates:
            min_affected_date = min(affected_version_dates)
            self._log.debug(
                'affected versions earliest date is {} while max bug report date is {}'
                .format(min_affected_date, suspect_boundary_date))
            suspect_boundary_date = min(min_affected_date,
                                        suspect_boundary_date)

        self._log.debug(
            'suspect boundary date is {} from issue dates: {} and affected_versions: {}, use affected versions? {}'
            .format(suspect_boundary_date, issue_dates, affected_version_dates,
                    affected_versions))
        return suspect_boundary_date
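A tiny worked example of the boundary logic from the docstring, with made-up dates: the latest bug report date wins unless affected versions are enabled and provide an even earlier date.

import datetime

issue_dates = [datetime.datetime(2020, 3, 1), datetime.datetime(2020, 5, 7)]   # hypothetical
affected_version_dates = [datetime.datetime(2020, 1, 15)]                      # hypothetical

suspect_boundary_date = max(issue_dates)            # 2020-05-07, latest bug report
if affected_version_dates:                          # only when affected_versions is requested
    suspect_boundary_date = min(min(affected_version_dates), suspect_boundary_date)
print(suspect_boundary_date)                        # 2020-01-15 00:00:00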
Example 9
    def initialize(self, config, repository_url, repository_type):
        """Initializes the mongostore by connecting to the mongodb, creating the project in the project collection \
        and setting up processes (see: :class:`pyvcsshark.datastores.mongostore.CommitStorageProcess`, which
        read commits out of the commitqueue, process them and store them into the mongodb.

        :param config: all configuration
        :param repository_url: url of the repository, which is to be analyzed
        :param repository_type: type of the repository, which is to be analyzed (e.g. "git")
        """

        logger.setLevel(config.debug_level)
        logger.info("Initializing MongoStore...")

        # Create queue for multiprocessing
        self.commit_queue = multiprocessing.JoinableQueue()

        # we need an extra queue for branches because all commits need to be finished before we can process branches
        self.branch_queue = multiprocessing.JoinableQueue()
        self.config = config
        self.cores_per_job = config.cores_per_job

        # We assume that the user we authenticate with is in the admin database
        logger.info("Connecting to MongoDB...")

        uri = create_mongodb_uri_string(config.db_user, config.db_password,
                                        config.db_hostname, config.db_port,
                                        config.db_authentication,
                                        config.ssl_enabled)
        connect(config.db_database, host=uri, connect=False)

        # Get project_id
        try:
            project_id = Project.objects(name=config.project_name).get().id
        except DoesNotExist:
            logger.error('Project with name "%s" does not exist in database!' %
                         config.project_name)
            sys.exit(1)

        # Check if the VCS system already exists and use upsert
        vcs_system = VCSSystem.objects(url=repository_url).upsert_one(
            url=repository_url,
            repository_type=repository_type,
            last_updated=datetime.datetime.today(),
            project_id=project_id)
        self.vcs_system_id = vcs_system.id

        # Tar.gz name based on project name
        tar_gz_name = '{}.tar.gz'.format(config.project_name)

        # Tar.gz of repository folder
        with tarfile.open(tar_gz_name, "w:gz") as tar:
            tar.add(config.path, arcname=config.project_name)

        # Add repository to gridfs if not existent
        if vcs_system.repository_file.grid_id is None:
            logger.info('Copying project to gridfs...')

            # Store in gridfs
            with open(tar_gz_name, 'rb') as tar_file:
                vcs_system.repository_file.put(tar_file,
                                               content_type='application/gzip',
                                               filename=tar_gz_name)
                vcs_system.save()
        else:
            # replace the existing file
            logger.info('Replacing project file in gridfs...')
            with open(tar_gz_name, 'rb') as tar_file:
                vcs_system.repository_file.replace(
                    tar_file,
                    content_type='application/gzip',
                    filename=tar_gz_name)
                vcs_system.save()

        # Delete tar.gz file
        os.remove(tar_gz_name)

        # Get the last commit by date of the project (if there is any)
        last_commit = Commit.objects(vcs_system_id=self.vcs_system_id)\
            .only('committer_date').order_by('-committer_date').first()

        if last_commit is not None:
            last_commit_date = last_commit.committer_date
        else:
            last_commit_date = None

        # Start workers; they will wait until something arrives in the queue and then process it
        for i in range(self.cores_per_job):
            name = "StorageProcess-%d" % i
            process = CommitStorageProcess(self.commit_queue,
                                           self.vcs_system_id,
                                           last_commit_date, self.config, name)
            process.daemon = True
            process.start()

        logger.info("Starting storage Process...")
Example 10
        project_id = Project.objects(name=name).get().id
    except DoesNotExist:
        print('unknown project:', name)
        sys.exit(1)

    cur_vcs_system = VCSSystem.objects(project_id=project_id).get().id
    cur_issue_system = IssueSystem.objects(project_id=project_id).get().id

    # 1) fetch commits
    print('fetching commit ids')
    issue_ids = ['LLOC']
    last_commit = None
    commit_bug_map = {}
    for commit in Commit.objects(vcs_system_id=cur_vcs_system,
                                 committer_date__gte=date_start,
                                 committer_date__lt=date_end,
                                 branches=master_branch)\
                        .only('id', 'committer_date', 'revision_hash', 'linked_issue_ids', 'message', 'parents'):
        linked_bugs = []
        if commit.linked_issue_ids is not None and len(
                commit.linked_issue_ids) > 0:
            for issue in Issue.objects(id__in=commit.linked_issue_ids):
                if issue.external_id in excluded_issues:
                    continue
                resolved = False
                fixed = False
                if issue.issue_type and issue.issue_type.lower() == 'bug':
                    if issue.status in ['resolved', 'closed']:
                        resolved = True
                        fixed |= issue.resolution.lower() != 'duplicated'
Example 11
    def start(self, cfg):
        """
        Executes the linkSHARK.
        :param cfg: configuration object that is used
        """
        self._log.setLevel(cfg.get_debug_level())
        start_time = timeit.default_timer()

        uri = create_mongodb_uri_string(cfg.user, cfg.password, cfg.host,
                                        cfg.port, cfg.authentication_db,
                                        cfg.ssl_enabled)
        connect(cfg.database, host=uri)

        # Get the id of the project whose commits shall be linked to issues
        try:
            project_id = Project.objects(name=cfg.project_name).get().id
        except DoesNotExist:
            self._log.error('Project %s not found!' % cfg.project_name)
            sys.exit(1)

        vcs_system = VCSSystem.objects(project_id=project_id).get()
        self._itss = []
        self._log.info('found the following issue tracking systems:')
        for its in IssueSystem.objects(project_id=project_id).order_by('url'):
            self._log.info(its.url)
            self._itss.append(its)

        if len(cfg.correct_key) > 0:
            correct_keys_per_its = cfg.correct_key.split(';')
            if len(correct_keys_per_its) != len(self._itss):
                self._log_critical(
                    '--correct-key must specify keys for all issue tracking systems if specified'
                )
                sys.exit(1)
            for i, correct_key in enumerate(correct_keys_per_its):
                self._correct_key[self._itss[i].url] = correct_key
        if len(cfg.broken_keys) > 0:
            broken_keys_per_its = cfg.broken_keys.split(';')

            if len(broken_keys_per_its) != len(self._itss):
                self._log_critical(
                    '--broken-keys must specify keys for all issue tracking systems if specified. If there are no keys to correct for one of the ITS, just repeat the correct key itself'
                )
                sys.exit(1)
            for i, broken_keys in enumerate(broken_keys_per_its):
                self._broken_keys[self._itss[i].url] = broken_keys.split(',')

        self._log.info("Starting issue linking")
        commit_count = Commit.objects(vcs_system_id=vcs_system.id).count()

        issue_map = {}
        for i, issue_system in enumerate(self._itss):
            project_id_string = correct_keys_per_its[i]

            for issue in Issue.objects(issue_system_id=issue_system.id):
                if issue.external_id.startswith(project_id_string):
                    try:
                        issue_number = [
                            int(s) for s in issue.external_id.split('-')
                            if s.isdigit()
                        ][0]
                    except IndexError:
                        self._log.error(
                            "index error because SZZ currently only support JIRA, may not link all issues correctly:",
                            issue.external_id)
                        continue
                    if issue_number not in issue_map:
                        issue_map[issue_number] = [issue]
                    else:
                        issue_map[issue_number].append(issue)

        for i, commit in enumerate(
                Commit.objects(vcs_system_id=vcs_system.id).only(
                    'id', 'revision_hash', 'vcs_system_id', 'message',
                    'author_id', 'committer_id')):
            if i % 100 == 0:
                self._log.info("%i/%i  commits finished", i, commit_count)
            issue_links = self._get_issue_links(commit)
            if len(issue_links) > 0:
                commit.linked_issue_ids = issue_links
                commit.save()
            szz_links = self._get_szz_issue_links(commit, issue_map)
            if len(szz_links) > 0:
                commit.szz_issue_ids = szz_links
                commit.save()

        elapsed = timeit.default_timer() - start_time
        self._log.info("Execution time: %0.5f s" % elapsed)
Example 12
def main(args):
    # timing
    start = timeit.default_timer()

    if args.log_level and hasattr(logging, args.log_level):
        log.setLevel(getattr(logging, args.log_level))

    uri = create_mongodb_uri_string(args.db_user, args.db_password,
                                    args.db_hostname, args.db_port,
                                    args.db_authentication, args.ssl)
    connect(args.db_database, host=uri)

    vcs = VCSSystem.objects.get(url=args.url)

    itss = []
    if args.issue_systems == 'all':
        for its in IssueSystem.objects.filter(project_id=vcs.project_id):
            itss.append(its)
    else:
        for url in args.issue_systems.split(','):
            its = IssueSystem.objects.get(url=url)
            itss.append(its)

    log.info("Starting commit labeling")

    # import every approach defined or all
    if args.approaches == 'all':
        # just list every module in the package and import it
        basepath = os.path.dirname(os.path.abspath(__file__))
        for app in os.listdir(os.path.join(basepath, 'approaches/')):
            if app.endswith('.py') and app != '__init__.py':
                __import__('approaches.{}'.format(app[:-3]))
    else:
        # if we have a list of approaches import only those
        for app in args.approaches.split(','):
            __import__('approaches.{}'.format(app))

    # add specific configs
    config = {'itss': itss, 'args': args}
    a = LabelSHARK()
    a.configure(config)

    if args.linking_approach:
        log.info('using approach {} for issue links'.format(
            args.linking_approach))

    for commit in Commit.objects.filter(vcs_system_id=vcs.id):
        a.set_commit(commit)
        labels = a.get_labels()
        issue_links = a.get_issue_links()

        # we get a dict of approach_name => [issue_link_ids]
        for k, v in issue_links.items():
            # log.info('commit: {}, links: {}, from approach: {}'.format(commit.revision_hash, v, k))
            if args.linking_approach and k == args.linking_approach:
                if v:
                    log.info('commit: {}, linked to: {}'.format(
                        commit.revision_hash, ','.join([str(l) for l in v])))
                commit.linked_issue_ids = v
                commit.save()

        log.info('commit: {}, labels: {}'.format(commit.revision_hash, labels))

        # save the labels
        if labels:
            tmp = {'set__labels__{}'.format(k): v for k, v in labels}
            Commit.objects(id=commit.id).upsert_one(**tmp)

    end = timeit.default_timer() - start
    log.info("Finished commit labeling in {:.5f}s".format(end))
Example 13
def start():
    """
    Compares the commits and code_entity_states of two MongoDBs, where the first MongoDB is
    condensed with the memeSHARK and the second MongoDB is verbose.
    """
    setup_logging()
    logger = logging.getLogger("main")
    logger.info("Starting consistency checker...")

    parser = argparse.ArgumentParser(description='DB consistency checker.')

    parser.add_argument('-v',
                        '--version',
                        help='Shows the version',
                        action='version',
                        version='0.1.0')
    parser.add_argument('-U1',
                        '--db-user1',
                        help='Database user name',
                        default=None)
    parser.add_argument('-P1',
                        '--db-password1',
                        help='Database user password',
                        default=None)
    parser.add_argument('-DB1',
                        '--db-database1',
                        help='Database name',
                        default='smartshark')
    parser.add_argument(
        '-H1',
        '--db-hostname1',
        help='Name of the host, where the database server is running',
        default='localhost')
    parser.add_argument('-p1',
                        '--db-port1',
                        help='Port, where the database server is listening',
                        default=27017,
                        type=int)
    parser.add_argument('-a1',
                        '--db-authentication1',
                        help='Name of the authentication database',
                        default=None)
    parser.add_argument('--ssl1',
                        help='Enables SSL',
                        default=False,
                        action='store_true')

    parser.add_argument('-U2',
                        '--db-user2',
                        help='Database user name',
                        default=None)
    parser.add_argument('-P2',
                        '--db-password2',
                        help='Database user password',
                        default=None)
    parser.add_argument('-DB2',
                        '--db-database2',
                        help='Database name',
                        default='smartshark_backup')
    parser.add_argument(
        '-H2',
        '--db-hostname2',
        help='Name of the host, where the database server is running',
        default='localhost')
    parser.add_argument('-p2',
                        '--db-port2',
                        help='Port, where the database server is listening',
                        default=27017,
                        type=int)
    parser.add_argument('-a2',
                        '--db-authentication2',
                        help='Name of the authentication database',
                        default=None)
    parser.add_argument('--ssl2',
                        help='Enables SSL',
                        default=False,
                        action='store_true')

    parser.add_argument(
        '--debug',
        help='Sets the debug level.',
        default='DEBUG',
        choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'])
    parser.add_argument('--project-name1',
                        help='Name of the project.',
                        default=None)
    parser.add_argument('--project-name2',
                        help='Name of the project.',
                        default=None)

    args = parser.parse_args()

    logger.info(args)

    logger.info("connecting to database 1 (condensed)...")
    uri1 = create_mongodb_uri_string(args.db_user1, args.db_password1,
                                     args.db_hostname1, args.db_port1,
                                     args.db_authentication1, args.ssl1)
    logger.info(uri1)
    connect(args.db_database1, host=uri1, alias='default')

    logger.info("connecting to database 2 (verbose)...")
    uri2 = create_mongodb_uri_string(args.db_user2, args.db_password2,
                                     args.db_hostname2, args.db_port2,
                                     args.db_authentication2, args.ssl2)
    logger.info(uri2)
    connect(args.db_database2, host=uri2, alias='db-verbose')

    # fetch all verbose commits
    commits_verbose = []
    with switch_db(Commit, 'db-verbose') as CommitVerbose:
        # fetch only commits for selected project
        try:
            project_id = Project.objects(name=args.project_name2).get().id
        except DoesNotExist:
            logger.error('Project %s not found!' % args.project_name2)
            sys.exit(1)
        vcs_systems = VCSSystem.objects(project_id=project_id).get().id
        logger.info("vcs_system_id: %s", vcs_systems)
        commit_objects = Commit.objects(vcs_system_id=vcs_systems)

        for cur_commit_verbose in commit_objects:
            commits_verbose.append(cur_commit_verbose)

    with switch_db(VCSSystem, 'default') as VCSSystemCondensed:
        # fetch only commits for selected project
        try:
            project_id = Project.objects(name=args.project_name1).get().id
        except DoesNotExist:
            logger.error('Project %s not found!' % args.project_name1)
            sys.exit(1)
        vcs_systems_condensed = VCSSystemCondensed.objects(
            project_id=project_id).get().id

    # fetch files verbose
    with switch_db(File, 'db-verbose') as FilesVerbose:
        files_verbose = {}
        for cur_file_verbose in FilesVerbose.objects(
                vcs_system_id=vcs_systems):
            files_verbose[cur_file_verbose.id] = cur_file_verbose.path

    with switch_db(File, 'default') as FilesCondensed:
        files_condensed = {}
        for cur_file_condensed in FilesCondensed.objects(
                vcs_system_id=vcs_systems_condensed):
            files_condensed[cur_file_condensed.id] = cur_file_condensed.path

    num_commits_verbose = len(commits_verbose)
    logger.info("num commits verbose: %i", num_commits_verbose)
    for commit_nr, commit_verbose in enumerate(commits_verbose):
        logger.info("processing commit %s (%i / %i)", commit_verbose.id,
                    commit_nr + 1, num_commits_verbose)
        # fetch verbose CES
        ces_verbose = {}
        ces_verbose_by_id = {}
        with switch_db(CodeEntityState,
                       'db-verbose') as CodeEntityStateVerbose:
            for cur_ces_verbose in CodeEntityStateVerbose.objects(
                    commit_id=commit_verbose.id):
                ces_verbose[
                    cur_ces_verbose.long_name +
                    files_verbose[cur_ces_verbose.file_id]] = cur_ces_verbose
                ces_verbose_by_id[cur_ces_verbose.id] = cur_ces_verbose

        # fetch same commit in condensed DB
        with switch_db(Commit, 'default') as CommitCondensed:
            try:
                commit_condensed = CommitCondensed.objects(
                    revision_hash=commit_verbose.revision_hash,
                    vcs_system_id=vcs_systems_condensed).get()
            except DoesNotExist:
                logger.info("commit %s not found in condensed db",
                            commit_verbose.revision_hash)
                continue

        # fetch CES from condensed DB
        ces_condensed = {}
        ces_condensed_by_id = {}
        with switch_db(CodeEntityState, 'default') as CodeEntityStateCondensed:
            for ces_id in commit_condensed.code_entity_states:
                cur_ces_condensed = CodeEntityStateCondensed.objects(
                    id=ces_id).get()
                ces_condensed[cur_ces_condensed.long_name + files_condensed[
                    cur_ces_condensed.file_id]] = cur_ces_condensed
                ces_condensed_by_id[cur_ces_condensed.id] = cur_ces_condensed

        logger.info("num CES verbose  : %i", len(ces_verbose.keys()))
        logger.info("num CES condensed: %i", len(ces_condensed.keys()))

        ces_unequal = 0
        # compare CES
        for long_name_verbose, cur_ces_verbose in ces_verbose.items():
            if long_name_verbose not in ces_condensed:
                logger.error(
                    "CES with long_name %s not found in condensed DB!",
                    long_name_verbose)
                ces_unequal += 1
                continue

            cur_ces_condensed = ces_condensed[long_name_verbose]
            old, new = compare_dicts(cur_ces_verbose, cur_ces_condensed, {
                'id', 's_key', 'commit_id', 'ce_parent_id', 'cg_ids', 'file_id'
            })
            if len(new.keys()) > 0 or len(old.keys()) > 0:
                logger.error(
                    "CES with long_name %s (id verbose: %s /id condensed %s) not equal!",
                    long_name_verbose, cur_ces_verbose.id,
                    cur_ces_condensed.id)
                logger.error("verbose  : %s", old)
                logger.error("condensed: %s", new)
                ces_unequal += 1
                continue

            # check if CES parent is equal
            ces_parent_verbose = ces_verbose_by_id[cur_ces_verbose.id]
            ces_parent_condensed = ces_condensed_by_id[cur_ces_condensed.id]
            old, new = compare_dicts(
                ces_parent_verbose, ces_parent_condensed,
                {'id', 's_key', 'commit_id', 'ce_parent_id', 'cg_ids', 'file_id'})
            if len(new.keys()) > 0 or len(old.keys()) > 0:
                logger.error("ce_parent of CES with long_name %s not equal!",
                             long_name_verbose)
                logger.error("verbose  : %s", old)
                logger.error("condensed: %s", new)
                ces_unequal += 1
                continue

        logger.info("num CES from verbose not matched: %i", ces_unequal)