Example no. 1
    def update(self):
        """
        updates the Git data stored in the DB
        """
        try:
            self._logger = self._logging_util.get_logger(self._log_path)
            self._fileHandler = self._logging_util.get_file_handler(
                self._logger, self._log_path, "info")

            self._logger.info("Code2DbUpdate started")
            start_time = datetime.now()

            self._querier = GitQuerier(self._git_repo_path, self._logger)
            self._dao = GitDao(self._config, self._logger)

            project_id = self._dao.select_project_id(self._project_name)
            repo_id = self._dao.select_repo_id(self._repo_name)
            self._update_info_code(repo_id, self._get_import_type(repo_id))

            end_time = datetime.now()
            minutes_and_seconds = self._logging_util.calculate_execution_time(
                end_time, start_time)
            self._logger.info("Code2DbUpdate finished after " +
                              str(minutes_and_seconds[0]) + " minutes and " +
                              str(round(minutes_and_seconds[1], 1)) + " secs")
            self._logging_util.remove_file_handler_logger(
                self._logger, self._fileHandler)
        except Exception:
            self._logger.error("Code2DbUpdate failed", exc_info=True)
        finally:
            if self._dao:
                self._dao.close_connection()
Example no. 2
    def extract(self):
        """
        extracts Git data and stores it in the DB
        """
        try:
            self._logger = self._logging_util.get_logger(self._log_path)
            self._fileHandler = self._logging_util.get_file_handler(
                self._logger, self._log_path, "info")

            self._logger.info("Git2DbMain started")
            start_time = datetime.now()

            self._querier = GitQuerier(self._git_repo_path, self._logger)
            self._dao = GitDao(self._config, self._logger)

            project_id = self._dao.select_project_id(self._project_name)
            self._dao.insert_repo(project_id, self._repo_name)
            repo_id = self._dao.select_repo_id(self._repo_name)
            #info contribution does not need a connection to the db
            self._get_info_contribution(repo_id)
            self._dao.restart_connection()
            self._dao.fix_commit_parent_table(repo_id)
            end_time = datetime.now()
            minutes_and_seconds = self._logging_util.calculate_execution_time(
                end_time, start_time)
            self._logger.info("Git2DbMain finished after " +
                              str(minutes_and_seconds[0]) + " minutes and " +
                              str(round(minutes_and_seconds[1], 1)) + " secs")
            self._logging_util.remove_file_handler_logger(
                self._logger, self._fileHandler)
        except Exception:
            self._logger.error("Git2DbMain failed", exc_info=True)
        finally:
            if self._dao:
                self._dao.close_connection()
Example no. 3
    def __call__(self):
        try:
            log_path = self._log_root_path + "-git2db-" + self._make_it_printable(
                self._ref_name)
            self._logger = self._logging_util.get_logger(log_path)
            self._fileHandler = self._logging_util.get_file_handler(
                self._logger, log_path, "info")

            self._querier = GitQuerier(self._git_repo_path, self._logger)
            self._dao = GitDao(self._config, self._logger)
            self.extract()
        except Exception:
            self._logger.error("Git2Db failed", exc_info=True)
        finally:
            if self._dao:
                self._dao.close_connection()
Example no. 4
    def __call__(self):
        self._logging_util = LoggingUtil()
        log_path = self._log_root_path + "-code2db-" + str(
            self._interval[0].get('commit_id')) + "_" + str(
                self._interval[0].get('file_id')) + "-" + str(
                    self._interval[-1].get('commit_id')) + "_" + str(
                        self._interval[-1].get('file_id'))
        self._logger = self._logging_util.get_logger(log_path)
        self._fileHandler = self._logging_util.get_file_handler(
            self._logger, log_path, "info")

        try:
            self._tmp_root_file = log_path + "-tmp."
            self._git_querier = GitQuerier(self._git_repo_path, self._logger)
            self._code_querier = CodeQuerier(self._logger,
                                             self._tmp_root_file + "txt")
            self._dao = GitDao(self._config, self._logger)
            self.extract()
        except Exception:
            self._logger.error("Code2DbTag failed", exc_info=True)
        finally:
            if self._dao:
                self._dao.close_connection()
Example no. 5
class Code2DbUpdate():
    """
    This class handles the update of code data
    """

    NUM_PROCESSES = 5

    def __init__(self, db_name, project_name, repo_name, git_repo_path,
                 extensions, references, num_processes, config, log_root_path):
        """
        :type db_name: str
        :param db_name: the name of an existing DB

        :type project_name: str
        :param project_name: the name of an existing project in the DB

        :type repo_name: str
        :param repo_name: the name of the Git repository to import

        :type git_repo_path: str
        :param git_repo_path: the local path of the Git repository

        :type extensions: list str
        :param extensions: file extensions to analyse. Currently extensions supported: ['java', 'py', 'php', 'scala', 'js', 'rb', 'cs', 'cpp', 'c']

        :type references: list str
        :param references: list of references to analyse

        :type num_processes: int
        :param num_processes: number of processes to import the data (default 5)

        :type config: dict
        :param config: the DB configuration file

        :type log_root_path: str
        :param log_root_path: the log path
        """
        self._log_path = log_root_path + "update-code-" + db_name + "-" + project_name + "-" + repo_name
        self._git_repo_path = git_repo_path
        self._project_name = project_name
        self._db_name = db_name
        self._repo_name = repo_name
        self._extensions = extensions
        self._references = references

        if num_processes:
            self._num_processes = num_processes
        else:
            self._num_processes = Code2DbUpdate.NUM_PROCESSES

        config.update({'database': db_name})
        self._config = config

        self._logging_util = LoggingUtil()

        self._logger = None
        self._fileHandler = None
        self._querier = None
        self._dao = None

    def _get_new_commit_file_pairs(self, repo_id):
        pairs = []

        filter_references = "1 = 1"
        if self._references:
            filter_references = "r.name IN (" + ",".join(
                ["'" + e + "'" for e in self._references]) + ")"
        filter_extensions = "1 = 1"
        if self._extensions:
            filter_extensions = "f.ext IN (" + ",".join(
                ["'" + e + "'" for e in self._extensions]) + ")"

        cursor = self._dao.get_cursor()
        query = "SELECT existing_pairs.* " \
                "FROM ( " \
                "SELECT cac.commit_id, cac.file_id FROM code_at_commit cac GROUP BY cac.commit_id, cac.file_id) AS processed_pairs " \
                "RIGHT JOIN " \
                "(SELECT c.id as commit_id, c.sha, f.id AS file_id, f.name AS file_name, f.ext AS file_ext " \
                "FROM commit_in_reference cin JOIN reference r ON r.id = cin.ref_id " \
                "JOIN commit c ON c.id = cin.commit_id " \
                "JOIN file_modification fm ON fm.commit_id = c.id " \
                "JOIN file f ON f.id = fm.file_id " \
                "WHERE " + filter_references + " AND " + filter_extensions + " AND cin.repo_id = %s " \
                "GROUP BY c.id, f.id) AS existing_pairs " \
                "ON processed_pairs.commit_id = existing_pairs.commit_id AND processed_pairs.file_id = existing_pairs.file_id " \
                "WHERE processed_pairs.commit_id IS NULL"
        arguments = [repo_id]
        self._dao.execute(cursor, query, arguments)

        row = self._dao.fetchone(cursor)

        while row:
            pairs.append({
                "commit_id": row[0],
                "commit_sha": row[1],
                "file_id": row[2],
                "file_name": row[3],
                "file_ext": row[4]
            })
            row = self._dao.fetchone(cursor)
        self._dao.close_cursor(cursor)

        return pairs

    def _update_existing_references(self, repo_id, import_type):
        pairs = self._get_new_commit_file_pairs(repo_id)
        intervals = [
            i for i in multiprocessing_util.get_tasks_intervals(
                pairs, self._num_processes) if len(i) > 0
        ]

        queue_intervals = multiprocessing.JoinableQueue()
        results = multiprocessing.Queue()

        # Start consumers
        multiprocessing_util.start_consumers(self._num_processes,
                                             queue_intervals, results)

        for interval in intervals:
            issue_extractor = Code2DbCommitFile(self._db_name,
                                                self._git_repo_path, interval,
                                                import_type, self._config,
                                                self._log_path)
            queue_intervals.put(issue_extractor)

        # Add end-of-queue markers
        multiprocessing_util.add_poison_pills(self._num_processes,
                                              queue_intervals)

        # Wait for all of the tasks to finish
        queue_intervals.join()

    def _update_info_code(self, repo_id, import_type):
        #updates code data
        self._update_existing_references(repo_id, import_type)

    def _get_import_type(self, repo_id):
        #gets import type
        import_type = 0
        import_type += self._dao.function_at_commit_is_empty(
            repo_id) + self._dao.code_at_commit_is_empty(repo_id)
        return import_type

    def update(self):
        """
        updates the Git data stored in the DB
        """
        try:
            self._logger = self._logging_util.get_logger(self._log_path)
            self._fileHandler = self._logging_util.get_file_handler(
                self._logger, self._log_path, "info")

            self._logger.info("Code2DbUpdate started")
            start_time = datetime.now()

            self._querier = GitQuerier(self._git_repo_path, self._logger)
            self._dao = GitDao(self._config, self._logger)

            project_id = self._dao.select_project_id(self._project_name)
            repo_id = self._dao.select_repo_id(self._repo_name)
            self._update_info_code(repo_id, self._get_import_type(repo_id))

            end_time = datetime.now()
            minutes_and_seconds = self._logging_util.calculate_execution_time(
                end_time, start_time)
            self._logger.info("Code2DbUpdate finished after " +
                              str(minutes_and_seconds[0]) + " minutes and " +
                              str(round(minutes_and_seconds[1], 1)) + " secs")
            self._logging_util.remove_file_handler_logger(
                self._logger, self._fileHandler)
        except Exception:
            self._logger.error("Code2DbUpdate failed", exc_info=True)
        finally:
            if self._dao:
                self._dao.close_connection()
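A minimal usage sketch for Code2DbUpdate. The import path, connection settings and DB/project/repository names below are placeholders, not values taken from the examples:

from importers.code.code2db_update import Code2DbUpdate  # hypothetical module path

# hypothetical MySQL-style connection settings for the DB configuration dictionary
config = {'host': 'localhost', 'port': '3306', 'user': 'root', 'password': 'root'}

updater = Code2DbUpdate("gitana_db", "my_project", "my_repo",
                        "/path/to/local/repo",
                        ['py', 'java'],   # extensions to analyse
                        ['master'],       # references to analyse
                        None,             # None falls back to NUM_PROCESSES
                        config, "/path/to/logs/")
updater.update()  # processes only commit/file pairs not yet in code_at_commit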
Example no. 6
class Git2DbMain():
    """
    This class handles the import of Git data
    """

    NUM_PROCESSES = 5

    def __init__(self, db_name, project_name, repo_name, git_repo_path,
                 before_date, import_type, references, num_processes, config,
                 log_root_path):
        """
        :type db_name: str
        :param db_name: the name of an existing DB

        :type project_name: str
        :param project_name: the name of an existing project in the DB

        :type repo_name: str
        :param repo_name: the name of the Git repository to import

        :type git_repo_path: str
        :param git_repo_path: the local path of the Git repository

        :type before_date: str
        :param before_date: import data before date (YYYY-mm-dd)

        :type import_type: int
        :param import_type: 1 does not import patches, 2 imports patches but not at line level, 3 imports patches with line detail

        :type references: list str
        :param references: list of references to import

        :type num_processes: int
        :param num_processes: number of processes to import the data (default 5)

        :type config: dict
        :param config: the DB configuration file

        :type log_root_path: str
        :param log_root_path: the log path
        """
        self._log_path = log_root_path + "import-git-" + db_name + "-" + project_name + "-" + repo_name
        self._git_repo_path = git_repo_path
        self._project_name = project_name
        self._db_name = db_name
        self._repo_name = repo_name
        self._before_date = before_date
        self._import_type = import_type

        self._references = references

        if num_processes:
            self._num_processes = num_processes
        else:
            self._num_processes = Git2DbMain.NUM_PROCESSES

        config.update({'database': db_name})
        self._config = config

        self._logging_util = LoggingUtil()

        self._logger = None
        self._fileHandler = None
        self._querier = None
        self._dao = None

    def _get_existing_references(self, repo_id):
        #retrieves already imported references
        existing_refs = []

        cursor = self._dao.get_cursor()
        query = "SELECT ref.name " \
                "FROM reference ref JOIN repository r ON ref.repo_id = r.id " \
                "WHERE r.id = %s"
        arguments = [repo_id]
        self._dao.execute(cursor, query, arguments)

        row = self._dao.fetchone(cursor)

        while row:
            existing_refs.append(row[0])
            row = self._dao.fetchone(cursor)
        self._dao.close_cursor(cursor)

        return existing_refs

    def _get_info_contribution(self, repo_id):
        #processes Git data
        existing_refs = self._get_existing_references(repo_id)

        queue_references = multiprocessing.JoinableQueue()
        results = multiprocessing.Queue()

        # Start consumers
        multiprocessing_util.start_consumers(self._num_processes,
                                             queue_references, results)
        for reference in self._querier.get_references():
            if self._references:
                if reference[0] in self._references:
                    git_ref_extractor = Git2DbReference(
                        self._db_name, repo_id, self._git_repo_path,
                        self._before_date, self._import_type, reference[0], "",
                        self._config, self._log_path)

                    queue_references.put(git_ref_extractor)
            else:
                if reference[0] not in existing_refs:
                    git_ref_extractor = Git2DbReference(
                        self._db_name, repo_id, self._git_repo_path,
                        self._before_date, self._import_type, reference[0], "",
                        self._config, self._log_path)

                    queue_references.put(git_ref_extractor)

        # Add end-of-queue markers
        multiprocessing_util.add_poison_pills(self._num_processes,
                                              queue_references)

        # Wait for all of the tasks to finish
        queue_references.join()

    def extract(self):
        """
        extracts Git data and stores it in the DB
        """
        try:
            self._logger = self._logging_util.get_logger(self._log_path)
            self._fileHandler = self._logging_util.get_file_handler(
                self._logger, self._log_path, "info")

            self._logger.info("Git2DbMain started")
            start_time = datetime.now()

            self._querier = GitQuerier(self._git_repo_path, self._logger)
            self._dao = GitDao(self._config, self._logger)

            project_id = self._dao.select_project_id(self._project_name)
            self._dao.insert_repo(project_id, self._repo_name)
            repo_id = self._dao.select_repo_id(self._repo_name)
            #info contribution does not need a connection to the db
            self._get_info_contribution(repo_id)
            self._dao.restart_connection()
            self._dao.fix_commit_parent_table(repo_id)
            end_time = datetime.now()
            minutes_and_seconds = self._logging_util.calculate_execution_time(
                end_time, start_time)
            self._logger.info("Git2DbMain finished after " +
                              str(minutes_and_seconds[0]) + " minutes and " +
                              str(round(minutes_and_seconds[1], 1)) + " secs")
            self._logging_util.remove_file_handler_logger(
                self._logger, self._fileHandler)
        except Exception:
            self._logger.error("Git2DbMain failed", exc_info=True)
        finally:
            if self._dao:
                self._dao.close_connection()
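A minimal sketch of how Git2DbMain might be driven, again with hypothetical import path, connection settings and names; import type 1 is the lightest import (no patches):

from importers.vcs.git.git2db_extract_main import Git2DbMain  # hypothetical module path

config = {'host': 'localhost', 'port': '3306', 'user': 'root', 'password': 'root'}  # assumed DB settings

importer = Git2DbMain("gitana_db", "my_project", "my_repo",
                      "/path/to/local/repo",
                      "2020-01-01",  # import only commits before this date
                      1,             # 1 = no patches, 2 = patches, 3 = patches with line detail
                      None,          # None = import the references not already in the DB
                      None,          # None = default NUM_PROCESSES
                      config, "/path/to/logs/")
importer.extract()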
Example no. 7
class Git2DbUpdate():
    """
    This class handles the update of Git data
    """

    NUM_PROCESSES = 5

    def __init__(self, db_name, project_name,
                 repo_name, git_repo_path, before_date,
                 num_processes, config, log_root_path):
        """
        :type db_name: str
        :param db_name: the name of an existing DB

        :type project_name: str
        :param project_name: the name of an existing project in the DB

        :type repo_name: str
        :param repo_name: the name of the Git repository to import

        :type git_repo_path: str
        :param git_repo_path: the local path of the Git repository

        :type before_date: str
        :param before_date: import data before date (YYYY-mm-dd)

        :type num_processes: int
        :param num_processes: number of processes to import the data (default 5)

        :type config: dict
        :param config: the DB configuration file

        :type log_root_path: str
        :param log_root_path: the log path
        """
        self._log_path = log_root_path + "import-git-" + db_name + "-" + project_name + "-" + repo_name
        self._git_repo_path = git_repo_path
        self._project_name = project_name
        self._db_name = db_name
        self._repo_name = repo_name
        self._before_date = before_date
        self._existing_refs = []

        if num_processes:
            self._num_processes = num_processes
        else:
            self._num_processes = Git2DbUpdate.NUM_PROCESSES

        config.update({'database': db_name})
        self._config = config

        self._logging_util = LoggingUtil()

        self._logger = None
        self._fileHandler = None
        self._querier = None
        self._dao = None

    def _update_existing_references(self, repo_id, import_type):
        #updates existing references in the DB
        cursor = self._dao.get_cursor()
        query = "SELECT c.sha, lc.ref_id " \
                "FROM commit c " \
                "JOIN (SELECT ref_id, max(commit_id) as last_commit_id_in_ref FROM commit_in_reference WHERE repo_id = %s GROUP BY ref_id) as lc " \
                "ON c.id = lc.last_commit_id_in_ref"
        arguments = [repo_id]
        self._dao.execute(cursor, query, arguments)

        queue_references = multiprocessing.JoinableQueue()
        results = multiprocessing.Queue()

        # Start consumers
        multiprocessing_util.start_consumers(self._num_processes, queue_references, results)

        row = self._dao.fetchone(cursor)
        while row:
            sha = row[0]
            ref_id = row[1]
            row = self._dao.fetchone(cursor)

            ref_name = self._dao.select_reference_name(repo_id, ref_id)

            for reference in self._querier.get_references():
                reference_name = reference[0]
                if reference_name == ref_name:
                    self._existing_refs.append(ref_name)

                    git_ref_extractor = Git2DbReference(self._db_name, repo_id, self._git_repo_path,
                                                        self._before_date, import_type, reference[0], sha,
                                                        self._config, self._log_path)

                    queue_references.put(git_ref_extractor)
                    break

        self._dao.close_cursor(cursor)

        # Add end-of-queue markers
        multiprocessing_util.add_poison_pills(self._num_processes, queue_references)

        # Wait for all of the tasks to finish
        queue_references.join()

    def _update_repo(self, repo_id, import_type):
        #updates Git data
        self._update_existing_references(repo_id, import_type)

    def _get_import_type(self, repo_id):
        #gets import type
        import_type = 1
        import_type += self._dao.line_detail_table_is_empty(repo_id) + self._dao.file_modification_patch_is_empty(repo_id)
        return import_type

    def update(self):
        """
        updates the Git data stored in the DB
        """
        try:
            self._logger = self._logging_util.get_logger(self._log_path)
            self._fileHandler = self._logging_util.get_file_handler(self._logger, self._log_path, "info")

            self._logger.info("Git2DbUpdate started")
            start_time = datetime.now()

            self._querier = GitQuerier(self._git_repo_path, self._logger)
            self._dao = GitDao(self._config, self._logger)

            project_id = self._dao.select_project_id(self._project_name)
            repo_id = self._dao.select_repo_id(self._repo_name)
            self._update_repo(repo_id, self._get_import_type(repo_id))
            self._dao.restart_connection()
            self._dao.fix_commit_parent_table(repo_id)

            end_time = datetime.now()
            minutes_and_seconds = self._logging_util.calculate_execution_time(end_time, start_time)
            self._logger.info("Git2DbUpdate finished after " + str(minutes_and_seconds[0])
                         + " minutes and " + str(round(minutes_and_seconds[1], 1)) + " secs")
            self._logging_util.remove_file_handler_logger(self._logger, self._fileHandler)
        except Exception:
            self._logger.error("Git2DbUpdate failed", exc_info=True)
        finally:
            if self._dao:
                self._dao.close_connection()
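Git2DbUpdate resumes every stored reference from its last imported commit and re-derives the import type from which tables are empty; a sketch under the same assumptions (hypothetical import path, connection settings and names):

from importers.vcs.git.git2db_update import Git2DbUpdate  # hypothetical module path

config = {'host': 'localhost', 'port': '3306', 'user': 'root', 'password': 'root'}  # assumed DB settings

updater = Git2DbUpdate("gitana_db", "my_project", "my_repo",
                       "/path/to/local/repo",
                       None,   # no before_date: update up to the current tip of each reference
                       None,   # None = default NUM_PROCESSES
                       config, "/path/to/logs/")
updater.update()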
Example no. 8
class Git2DbReference(object):
    """
    This class handles the import of Git references
    """

    #do not import patches
    LIGHT_IMPORT_TYPE = 1
    #import patches but not at line level
    MEDIUM_IMPORT_TYPE = 2
    #import patches also at line level
    FULL_IMPORT_TYPE = 3

    def __init__(self, db_name, repo_id, git_repo_path, before_date,
                 import_type, ref_name, ref_type, from_sha, config,
                 log_root_path):
        """
        :type db_name: str
        :param db_name: the name of an existing DB

        :type repo_id: int
        :param repo_id: the id of an existing repository in the DB

        :type git_repo_path: str
        :param git_repo_path: local path of the Git repository

        :type before_date: str
        :param before_date: import data before date (YYYY-mm-dd)

        :type import_type: int
        :param import_type: 1 does not import patches, 2 imports patches but not at line level, 3 imports patches with line detail

        :type ref_name: str
        :param ref_name: the name of the reference to import

        :type ref_type: str
        :param ref_type: the type of the reference (e.g. branch or tag)

        :type from_sha: str
        :param from_sha: the SHA of the commit from where to start the import

        :type config: dict
        :param config: the DB configuration file

        :type log_root_path: str
        :param log_root_path: the log path
        """
        self._log_root_path = log_root_path
        self._git_repo_path = git_repo_path
        self._repo_id = repo_id
        self._db_name = db_name
        self._ref_name = ref_name
        self._ref_type = ref_type
        self._before_date = before_date
        self._import_type = import_type
        self._from_sha = from_sha
        self._config = config
        self._logging_util = LoggingUtil()
        self._fileHandler = None
        self._logger = None
        self._querier = None
        self._dao = None

    def __call__(self):
        try:
            log_path = self._log_root_path + "-git2db-" + self._make_it_printable(
                self._ref_name)
            self._logger = self._logging_util.get_logger(log_path)
            self._fileHandler = self._logging_util.get_file_handler(
                self._logger, log_path, "info")

            self._querier = GitQuerier(self._git_repo_path, self._logger)
            self._dao = GitDao(self._config, self._logger)
            self.extract()
        except Exception:
            self._logger.error("Git2Db failed", exc_info=True)
        finally:
            if self._dao:
                self._dao.close_connection()

    def _make_it_printable(self, text):
        #converts string to UTF-8 and removes empty and non-alphanumeric characters
        u = text.decode('utf-8', 'ignore').lower()
        return re.sub(r'(\W|\s)+', '-', u)

    def _get_info_contribution_in_reference(self, reference_name,
                                            reference_type, repo_id, from_sha):
        if from_sha:
            if self._before_date:
                commits = self._querier.collect_all_commits_after_sha_before_date(
                    reference_name, from_sha, self._before_date)
            else:
                commits = self._querier.collect_all_commits_after_sha(
                    reference_name, from_sha)

            self._analyse_commits(commits, reference_name, repo_id)
        else:
            if self._before_date:
                commits = self._querier.collect_all_commits_before_date(
                    reference_name, self._before_date)
            else:
                commits = self._querier.collect_all_commits(reference_name)

            self._analyse_commits(commits, reference_name, repo_id)

    def _load_all_references(self, repo_id):
        # load all git branches and tags into database
        for reference in self._querier.get_references():
            ref_name = reference[0]
            ref_type = reference[1]
            #inserts reference to DB
            self._dao.insert_reference(repo_id, ref_name, ref_type)

    def _get_diffs_from_commit(self, commit, files_in_commit):
        #calculates diffs within files in a commit
        if self._import_type > Git2DbReference.LIGHT_IMPORT_TYPE:
            diffs = self._querier.get_diffs(commit, files_in_commit, True)
        else:
            diffs = self._querier.get_diffs(commit, files_in_commit, False)

        return diffs

    def _analyse_commit(self, commit, repo_id, ref_id):
        #analyses a commit
        try:
            message = self._querier.get_commit_property(commit, "message")
            author_name = self._querier.get_commit_property(
                commit, "author.name")
            author_email = self._querier.get_commit_property(
                commit, "author.email")
            committer_name = self._querier.get_commit_property(
                commit, "committer.name")
            committer_email = self._querier.get_commit_property(
                commit, "committer.email")
            size = self._querier.get_commit_property(commit, "size")
            sha = self._querier.get_commit_property(commit, "hexsha")
            authored_date = self._querier.get_commit_time(
                self._querier.get_commit_property(commit, "authored_date"))
            committed_date = self._querier.get_commit_time(
                self._querier.get_commit_property(commit, "committed_date"))

            if author_name is None and author_email is None:
                self._logger.warning(
                    "author name and email are null for commit: " + sha)

            if committer_name is None and committer_email is None:
                self._logger.warning(
                    "committer name and email are null for commit: " + sha)

            #insert author
            author_id = self._dao.get_user_id(author_name, author_email)
            committer_id = self._dao.get_user_id(committer_name,
                                                 committer_email)

            commit_found = self._dao.select_commit_id(sha, repo_id)

            if not commit_found:
                #insert commit
                self._dao.insert_commit(repo_id, sha, message, author_id,
                                        committer_id, authored_date,
                                        committed_date, size)
                commit_found = self._dao.select_commit_id(sha, repo_id)

                commit_stats_files = commit.stats.files
                try:
                    if self._querier.commit_has_no_parents(commit):
                        for diff in self._querier.get_diffs_no_parent_commit(
                                commit):
                            file_path = diff[0]
                            ext = self._querier.get_ext(file_path)

                            self._dao.insert_file(repo_id, file_path, ext)
                            file_id = self._dao.select_file_id(
                                repo_id, file_path)

                            if self._import_type > Git2DbReference.LIGHT_IMPORT_TYPE:
                                patch_content = re.sub(r'^(\w|\W)*\n@@', '@@',
                                                       diff[1])
                            else:
                                patch_content = None

                            stats = self._querier.get_stats_for_file(
                                commit_stats_files, file_path)
                            status = self._querier.get_status_with_diff(
                                stats, diff)

                            #insert file modification
                            self._dao.insert_file_modification(
                                commit_found, file_id, status, stats[0],
                                stats[1], stats[2], patch_content)

                            if self._import_type == Git2DbReference.FULL_IMPORT_TYPE:
                                file_modification_id = self._dao.select_file_modification_id(
                                    commit_found, file_id)
                                line_details = self._querier.get_line_details(
                                    patch_content, ext)
                                for line_detail in line_details:
                                    self._dao.insert_line_details(
                                        file_modification_id, line_detail)
                    else:
                        for diff in self._get_diffs_from_commit(
                                commit, commit_stats_files.keys()):
                            #self.dao.check_connection_alive()
                            if self._querier.is_renamed(diff):
                                file_previous = self._querier.get_rename_from(
                                    diff)
                                ext_previous = self._querier.get_ext(
                                    file_previous)

                                file_current = self._querier.get_file_current(
                                    diff)
                                ext_current = self._querier.get_ext(
                                    file_current)

                                #insert new file
                                self._dao.insert_file(repo_id, file_current,
                                                      ext_current)

                                #get id new file
                                current_file_id = self._dao.select_file_id(
                                    repo_id, file_current)

                                #retrieve the id of the previous file
                                previous_file_id = self._dao.select_file_id(
                                    repo_id, file_previous)

                                #insert file modification
                                self._dao.insert_file_modification(
                                    commit_found, current_file_id, "renamed",
                                    0, 0, 0, None)

                                if not previous_file_id:
                                    self._dao.insert_file(
                                        repo_id, file_previous, ext_previous)
                                    previous_file_id = self._dao.select_file_id(
                                        repo_id, file_previous)

                                if current_file_id == previous_file_id:
                                    self._logger.warning(
                                        "previous file id is equal to current file id ("
                                        + str(current_file_id) + ") " +
                                        str(sha))
                                else:
                                    file_modification_id = self._dao.select_file_modification_id(
                                        commit_found, current_file_id)
                                    self._dao.insert_file_renamed(
                                        repo_id, current_file_id,
                                        previous_file_id, file_modification_id)

                            else:
                                #insert file
                                #if the file does not have a path, it won't be inserted
                                try:
                                    file_path = self._querier.get_file_path(
                                        diff)

                                    ext = self._querier.get_ext(file_path)

                                    stats = self._querier.get_stats_for_file(
                                        commit_stats_files, file_path)
                                    status = self._querier.get_status_with_diff(
                                        stats, diff)

                                    #if the file is new, add it
                                    if self._querier.is_new_file(diff):
                                        self._dao.insert_file(
                                            repo_id, file_path, ext)
                                    file_id = self._dao.select_file_id(
                                        repo_id, file_path)

                                    if not file_id:
                                        self._dao.insert_file(
                                            repo_id, file_path, ext)
                                        file_id = self._dao.select_file_id(
                                            repo_id, file_path)

                                    if self._import_type > Git2DbReference.LIGHT_IMPORT_TYPE:
                                        #insert file modification (additions, deletions)
                                        patch_content = self._querier.get_patch_content(
                                            diff)
                                    else:
                                        patch_content = None

                                    self._dao.insert_file_modification(
                                        commit_found, file_id, status,
                                        stats[0], stats[1], stats[2],
                                        patch_content)

                                    if self._import_type == Git2DbReference.FULL_IMPORT_TYPE:
                                        file_modification_id = self._dao.select_file_modification_id(
                                            commit_found, file_id)
                                        line_details = self._querier.get_line_details(
                                            patch_content, ext)
                                        for line_detail in line_details:
                                            self._dao.insert_line_details(
                                                file_modification_id,
                                                line_detail)
                                except Exception:
                                    self._logger.error(
                                        "Something went wrong with commit " +
                                        str(sha),
                                        exc_info=True)
                except Exception:
                    self._logger.error("Git2Db failed on commit " + str(sha),
                                       exc_info=True)

            # insert parents of the commit
            self._dao.insert_commit_parents(commit.parents, commit_found, sha,
                                            repo_id)
            # insert commits in reference
            self._dao.insert_commit_in_reference(repo_id, commit_found, ref_id)

            #return commit_found
        except Exception:
            self._logger.error("Git2Db failed on commit " + str(sha),
                               exc_info=True)

    def _analyse_commits(self, commits, ref, repo_id):
        #analyses commits in references
        ref_id = self._dao.select_reference_id(repo_id, ref)
        commits_in_reference = []
        for c in commits:
            self._analyse_commit(c, repo_id, ref_id)
            # self.logger.info("analysing commit " + str(commits.index(c)+1) + "/" + str(len(commits)))
            # to_insert = self._analyse_commit(c, repo_id, ref_id)
            # if to_insert:
            #     commits_in_reference.append((repo_id, to_insert, ref_id))
            # self._analyse_commit(c, repo_id, ref_id)

        #self._dao.insert_commits_in_reference(commits_in_reference)

    def extract(self):
        """
        extracts Git data and stores it in the DB
        """
        try:
            self._logger.info("Git2DbReference started")
            start_time = datetime.now()
            self._load_all_references(self._repo_id)
            self._get_info_contribution_in_reference(self._ref_name,
                                                     self._ref_type,
                                                     self._repo_id,
                                                     self._from_sha)

            end_time = datetime.now()
            minutes_and_seconds = self._logging_util.calculate_execution_time(
                end_time, start_time)
            self._logger.info("Git2DbReference finished after " +
                              str(minutes_and_seconds[0]) + " minutes and " +
                              str(round(minutes_and_seconds[1], 1)) + " secs")
            self._logging_util.remove_file_handler_logger(
                self._logger, self._fileHandler)
        except Exception:
            self._logger.error("Git2DbReference failed", exc_info=True)
Example no. 9
class Code2DbMain():
    """
    This class handles the import of code information
    """

    NUM_PROCESSES = 5

    def __init__(self, db_name, project_name,
                 repo_name, git_repo_path, import_type, extensions, references, num_processes,
                 config, log_root_path):
        """
        :type db_name: str
        :param db_name: the name of an existing DB

        :type project_name: str
        :param project_name: the name of an existing project in the DB

        :type repo_name: str
        :param repo_name: the name of the Git repository to import

        :type git_repo_path: str
        :param git_repo_path: the local path of the Git repository

        :type import_type: int
        :param import_type: 1 = import overall function statistics per file, 2 = import function-level information

        :type extensions: list str
        :param extensions: file extensions to analyse. Gitana calculates loc, comments and blank lines for most of the files.
        For the following languages ['java', 'py', 'php', 'scala', 'js', 'rb', 'cs', 'cpp', 'c'], Gitana also provides insights about ccn, functions and tokens.

        :type references: list str
        :param references: list of references to analyse

        :type num_processes: int
        :param num_processes: number of processes to import the data (default 5)

        :type config: dict
        :param config: the DB configuration file

        :type log_root_path: str
        :param log_root_path: the log path
        """
        self._log_path = log_root_path + "import-code-" + db_name + "-" + project_name + "-" + repo_name
        self._git_repo_path = git_repo_path
        self._project_name = project_name
        self._db_name = db_name
        self._repo_name = repo_name
        self._import_type = import_type
        self._extensions = extensions
        self._references = references

        if num_processes:
            self._num_processes = num_processes
        else:
            self._num_processes = Code2DbMain.NUM_PROCESSES

        config.update({'database': db_name})
        self._config = config

        self._logging_util = LoggingUtil()

        self._logger = None
        self._fileHandler = None
        self._querier = None
        self._dao = None

    def _get_commit_file_pairs(self, repo_id):
        pairs = []

        filter_references = "1 = 1"
        if self._references:
            filter_references = "r.name IN (" + ",".join(["'" + e + "'" for e in self._references]) + ")"
        filter_extensions = "1 = 1"
        if self._extensions:
            filter_extensions = "f.ext IN (" + ",".join(["'" + e + "'" for e in self._extensions]) + ")"

        cursor = self._dao.get_cursor()
        query = "SELECT c.id AS commit_id, c.sha, f.id AS file_id, f.name AS file_name, f.ext AS file_ext " \
                "FROM commit_in_reference cin JOIN reference r ON r.id = cin.ref_id " \
                "JOIN commit c ON c.id = cin.commit_id " \
                "JOIN file_modification fm ON fm.commit_id = c.id " \
                "JOIN file f ON f.id = fm.file_id " \
                "WHERE " + filter_references + " AND " + filter_extensions + " AND cin.repo_id = %s " \
                "GROUP BY c.id, f.id"
        arguments = [repo_id]
        self._dao.execute(cursor, query, arguments)

        row = self._dao.fetchone(cursor)

        while row:
            pairs.append({"commit_id": row[0], "commit_sha": row[1], "file_id": row[2], "file_name": row[3], "file_ext": row[4]})
            row = self._dao.fetchone(cursor)
        self._dao.close_cursor(cursor)

        return pairs

    def _get_info_code(self, repo_id):
        pairs = self._get_commit_file_pairs(repo_id)
        intervals = [i for i in multiprocessing_util.get_tasks_intervals(pairs, self._num_processes) if len(i) > 0]

        queue_intervals = multiprocessing.JoinableQueue()
        results = multiprocessing.Queue()

        # Start consumers
        multiprocessing_util.start_consumers(self._num_processes, queue_intervals, results)

        for interval in intervals:
            issue_extractor = Code2DbCommitFile(self._db_name, self._git_repo_path, interval, self._import_type,
                                                self._config, self._log_path)
            queue_intervals.put(issue_extractor)

        # Add end-of-queue markers
        multiprocessing_util.add_poison_pills(self._num_processes, queue_intervals)

        # Wait for all of the tasks to finish
        queue_intervals.join()

    def extract(self):
        """
        extracts code function data and stores it in the DB
        """
        try:
            self._logger = self._logging_util.get_logger(self._log_path)
            self._fileHandler = self._logging_util.get_file_handler(self._logger, self._log_path, "info")

            self._logger.info("Code2DbMain started")
            start_time = datetime.now()

            self._querier = GitQuerier(self._git_repo_path, self._logger)
            self._dao = GitDao(self._config, self._logger)

            repo_id = self._dao.select_repo_id(self._repo_name)
            self._get_info_code(repo_id)
            end_time = datetime.now()
            minutes_and_seconds = self._logging_util.calculate_execution_time(end_time, start_time)
            self._logger.info("Code2DbMain finished after " + str(minutes_and_seconds[0])
                            + " minutes and " + str(round(minutes_and_seconds[1], 1)) + " secs")
            self._logging_util.remove_file_handler_logger(self._logger, self._fileHandler)
        except Exception:
            self._logger.error("Code2DbMain failed", exc_info=True)
        finally:
            if self._dao:
                self._dao.close_connection()
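A sketch for Code2DbMain under the same assumptions; import type 2 asks for function-level detail:

from importers.code.code2db_extract_main import Code2DbMain  # hypothetical module path

config = {'host': 'localhost', 'port': '3306', 'user': 'root', 'password': 'root'}  # assumed DB settings

importer = Code2DbMain("gitana_db", "my_project", "my_repo",
                       "/path/to/local/repo",
                       2,            # 1 = per-file statistics only, 2 = function-level information
                       ['py'],       # extensions to analyse
                       ['master'],   # references to analyse
                       None,         # None = default NUM_PROCESSES
                       config, "/path/to/logs/")
importer.extract()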
Example no. 10
class Code2DbCommitFile():
    """
    This class handles the import of code function data for a set of commit file pairs
    """

    #import overall function statistics per file
    LIGHT_IMPORT_TYPE = 1
    #import function-level information
    FULL_IMPORT_TYPE = 2

    def __init__(self, db_name, git_repo_path, interval, import_type, config,
                 log_root_path):
        """
        :type db_name: str
        :param db_name: the name of an existing DB

        :type git_repo_path: str
        :param git_repo_path: local path of the Git repository

        :type interval: list dict
        :param interval: a list of commit/file pairs

        :type import_type: int
        :param import_type: 1 = import overall function statistics per file, 2 = import function-level information

        :type config: dict
        :param config: the DB configuration file

        :type log_root_path: str
        :param log_root_path: the log path
        """
        self._log_root_path = log_root_path
        self._git_repo_path = git_repo_path
        self._db_name = db_name
        self._interval = interval
        self._import_type = import_type
        self._config = config
        self._fileHandler = None
        self._logger = None
        self._git_querier = None
        self._code_querier = None
        self._dao = None
        self._tmp_root_file = None

    def __call__(self):
        self._logging_util = LoggingUtil()
        log_path = self._log_root_path + "-code2db-" + str(
            self._interval[0].get('commit_id')) + "_" + str(
                self._interval[0].get('file_id')) + "-" + str(
                    self._interval[-1].get('commit_id')) + "_" + str(
                        self._interval[-1].get('file_id'))
        self._logger = self._logging_util.get_logger(log_path)
        self._fileHandler = self._logging_util.get_file_handler(
            self._logger, log_path, "info")

        try:
            self._tmp_root_file = log_path + "-tmp."
            self._git_querier = GitQuerier(self._git_repo_path, self._logger)
            self._code_querier = CodeQuerier(self._logger,
                                             self._tmp_root_file + "txt")
            self._dao = GitDao(self._config, self._logger)
            self.extract()
        except Exception:
            self._logger.error("Code2DbTag failed", exc_info=True)
        finally:
            if self._dao:
                self._dao.close_connection()

    def _save_content(self, content, target):
        file = codecs.open(target, "w+", "utf-8")
        file.write(content)
        file.close()

    def _delete_tmp_files(self, targets):
        for target in targets:
            if os.path.exists(target):
                os.remove(target)

    def _process_commit_file(self):
        _tmp_files = set()
        for i in self._interval:
            try:
                commit_id = i.get("commit_id")
                commit_sha = i.get("commit_sha")
                file_id = i.get("file_id")
                file_name = i.get("file_name")
                file_ext = i.get("file_ext")

                found = self._dao.select_code_at_commit(commit_id, file_id)
                if file_ext not in CodeQuerier.FORBIDDEN_EXTENSIONS and not found:
                    check_extension = False
                    if file_ext in CodeQuerier.ALLOWED_EXTENSIONS:
                        check_extension = True

                    if not file_ext:
                        file_ext = "unknown"

                    _tmp_file = self._tmp_root_file + file_ext
                    _tmp_files.add(_tmp_file)

                    file_content_at_revision = self._git_querier.get_file_content(
                        commit_sha, file_name)
                    if file_content_at_revision:
                        self._save_content(file_content_at_revision, _tmp_file)

                        if check_extension:
                            file_info, fun_info = self._code_querier.get_complexity_info(
                                _tmp_file, self._import_type)
                            self._dao.insert_code_at_commit(
                                commit_id, file_id, file_info.get('ccn'),
                                file_info.get('loc'),
                                file_info.get('comments'),
                                file_info.get('blanks'), file_info.get('funs'),
                                file_info.get('tokens'),
                                file_info.get('avg_ccn'),
                                file_info.get('avg_loc'),
                                file_info.get('avg_tokens'))

                            if self._import_type == Code2DbCommitFile.FULL_IMPORT_TYPE:
                                for fi in fun_info:
                                    self._dao.insert_function(
                                        fi.get('name'), file_id,
                                        fi.get('args'), fi.get('loc'),
                                        fi.get('tokens'), fi.get('lines'),
                                        fi.get('ccn'), fi.get('start'),
                                        fi.get('end'))

                                    fun_id = self._dao.select_function_id(
                                        file_id, fi.get('start'),
                                        fi.get('end'))

                                    self._dao.insert_function_at_commit(
                                        fun_id, commit_id)

                        else:
                            file_info = self._code_querier.get_comment_info(
                                _tmp_file)
                            self._dao.insert_code_at_commit(
                                commit_id, file_id, None, file_info.get('loc'),
                                file_info.get('comments'),
                                file_info.get('blanks'), None, None, None,
                                None, None)

                    if len(_tmp_files) >= 20:
                        self._delete_tmp_files(_tmp_files)

            except Exception:
                self._logger.error("Code2DbCommitFile failed on pair " +
                                   str(commit_sha) + ", " + str(file_name),
                                   exc_info=True)

        if _tmp_files:
            self._delete_tmp_files(_tmp_files)

    def extract(self):
        """
        extracts code function data and stores it in the DB
        """
        try:
            self._logger.info("Code2DbCommitFile started")
            start_time = datetime.now()

            self._process_commit_file()

            end_time = datetime.now()
            minutes_and_seconds = self._logging_util.calculate_execution_time(
                end_time, start_time)
            self._logger.info("Code2DbCommitFile finished after " +
                              str(minutes_and_seconds[0]) + " minutes and " +
                              str(round(minutes_and_seconds[1], 1)) + " secs")
            self._logging_util.remove_file_handler_logger(
                self._logger, self._fileHandler)
        except Exception:
            self._logger.error("Code2DbCommitFile failed", exc_info=True)