class StackOverflowTopic2Db(object):
    """
    This class handles the import of Stackoverflow topics
    """
    def __init__(self, db_name, forum_id, interval, token, config,
                 log_root_path):
        """
        :type db_name: str
        :param db_name: the name of an existing DB

        :type forum_id: int
        :param forum_id: the id of an existing forum in the DB

        :type interval: list int
        :param interval: a list of topic ids to import

        :type token: str
        :param token: a Stackoverflow token

        :type config: dict
        :param config: the DB configuration file

        :type log_root_path: str
        :param log_root_path: the log path
        """

        self._log_root_path = log_root_path
        self._interval = interval
        self._db_name = db_name
        self._forum_id = forum_id
        self._token = token
        self._config = config

        self._logging_util = LoggingUtil()

        self._fileHandler = None
        self._logger = None
        self._querier = None
        self._dao = None

    def __call__(self):
        try:
            log_path = self._log_root_path + "-topic2db-" + str(
                self._interval[0]) + "-" + str(self._interval[-1])
            self._logger = self._logging_util.get_logger(log_path)
            self._fileHandler = self._logging_util.get_file_handler(
                self._logger, log_path, "info")

            self._querier = StackOverflowQuerier(self._token, self._logger)
            self._dao = StackOverflowDao(self._config, self._logger)
            self.extract()
        except Exception:
            self._logger.error("StackOverflowTopic2Db failed", exc_info=True)
        finally:
            if self._dao:
                self._dao.close_connection()
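
# Hedged usage sketch, not part of the original listing: StackOverflowTopic2Db is a
# callable worker, so it is normally pushed onto a JoinableQueue and executed by the
# consumer processes started through multiprocessing_util, the same pattern the
# *Update/*Main classes below use. The DB name, forum id, topic ids, token, and log
# path are placeholders.
def _run_stackoverflow_topic_worker(config):
    queue_extractors = multiprocessing.JoinableQueue()
    results = multiprocessing.Queue()
    multiprocessing_util.start_consumers(1, queue_extractors, results)
    queue_extractors.put(
        StackOverflowTopic2Db("gitana_db", 1, [101, 102, 103],
                              "SO_TOKEN", config, "logs/stackoverflow"))
    multiprocessing_util.add_poison_pills(1, queue_extractors)
    queue_extractors.join()
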
Example n. 2
class GitHubPullRequest2Db(object):
    """
    This class handles the import of GitHub pull requests
    """
    def __init__(self, db_name, repo_id, issue_tracker_id, url, interval,
                 token, config, log_root_path):
        """
        :type db_name: str
        :param db_name: the name of an existing DB

        :type repo_id: int
        :param repo_id: the id of an existing repository in the DB

        :type issue_tracker_id: int
        :param issue_tracker_id: the id of an existing issue tracker in the DB

        :type url: str
        :param url: full name of the GitHub repository

        :type interval: list int
        :param interval: a list of issue ids to import

        :type token: str
        :param token: a GitHub token

        :type config: dict
        :param config: the DB configuration file

        :type log_root_path: str
        :param log_root_path: the log path
        """
        self._log_root_path = log_root_path
        self._url = url
        self._db_name = db_name
        self._repo_id = repo_id
        self._issue_tracker_id = issue_tracker_id
        self._interval = interval
        self._token = token
        self._config = config
        self._fileHandler = None
        self._logger = None
        self._querier = None
        self._dao = None
        self._git_dao = None

    def __call__(self):
        self._logging_util = LoggingUtil()
        self._date_util = DateUtil()
        log_path = self._log_root_path + "-pr2db-" + str(
            self._interval[0]) + "-" + str(self._interval[-1])
        self._logger = self._logging_util.get_logger(log_path)
        self._fileHandler = self._logging_util.get_file_handler(
            self._logger, log_path, "info")

        try:
            self._querier = GitHubQuerier(self._url, self._token, self._logger)
            self._dao = GitHubDao(self._config, self._logger)
            self._git_dao = GitDao(self._config, self._logger)
            self.extract()
        except Exception:
            self._logger.error("GitHubPullRequest2Db failed", exc_info=True)
        finally:
            if self._dao:
                self._dao.close_connection()
            if self._git_dao:
                # assumed: the GitDao holds its own connection and is closed here as well
                self._git_dao.close_connection()
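
# Hedged sketch, illustration only: since __call__ performs the setup and then calls
# extract, a single interval of pull-request ids can also be imported synchronously by
# invoking the instance directly. The repository name, ids, token, and paths are placeholders.
def _import_pull_requests_synchronously(config):
    importer = GitHubPullRequest2Db("gitana_db", 1, 1, "owner/repo",
                                    [1, 2, 3], "GITHUB_TOKEN", config,
                                    "logs/github")
    importer()  # equivalent to running it from a consumer queue
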
Example n. 3
class Code2DbUpdate():
    """
    This class handles the update of code data
    """

    NUM_PROCESSES = 5

    def __init__(self, db_name, project_name, repo_name, git_repo_path,
                 extensions, references, num_processes, config, log_root_path):
        """
        :type db_name: str
        :param db_name: the name of an existing DB

        :type project_name: str
        :param project_name: the name of an existing project in the DB

        :type repo_name: str
        :param repo_name: the name of the Git repository to import

        :type git_repo_path: str
        :param git_repo_path: the local path of the Git repository

        :type extensions: list str
        :param extensions: file extensions to analyse. Currently extensions supported: ['java', 'py', 'php', 'scala', 'js', 'rb', 'cs', 'cpp', 'c']

        :type references: list str
        :param references: list of references to analyse

        :type num_processes: int
        :param num_processes: number of processes to import the data (default 5)

        :type config: dict
        :param config: the DB configuration file

        :type log_root_path: str
        :param log_root_path: the log path
        """
        self._log_path = log_root_path + "update-code-" + db_name + "-" + project_name + "-" + repo_name
        self._git_repo_path = git_repo_path
        self._project_name = project_name
        self._db_name = db_name
        self._repo_name = repo_name
        self._extensions = extensions
        self._references = references

        if num_processes:
            self._num_processes = num_processes
        else:
            self._num_processes = Code2DbUpdate.NUM_PROCESSES

        config.update({'database': db_name})
        self._config = config

        self._logging_util = LoggingUtil()

        self._logger = None
        self._fileHandler = None
        self._querier = None
        self._dao = None

    def _get_new_commit_file_pairs(self, repo_id):
        pairs = []

        filter_references = "1 = 1"
        if self._references:
            filter_references = "r.name IN (" + ",".join(
                ["'" + e + "'" for e in self._references]) + ")"
        filter_extensions = "1 = 1"
        if self._extensions:
            filter_extensions = "f.ext IN (" + ",".join(
                ["'" + e + "'" for e in self._extensions]) + ")"

        cursor = self._dao.get_cursor()
        query = "SELECT existing_pairs.* " \
                "FROM ( " \
                "SELECT cac.commit_id, cac.file_id FROM code_at_commit cac GROUP BY cac.commit_id, cac.file_id) AS processed_pairs " \
                "RIGHT JOIN " \
                "(SELECT c.id as commit_id, c.sha, f.id AS file_id, f.name AS file_name, f.ext AS file_ext " \
                "FROM commit_in_reference cin JOIN reference r ON r.id = cin.ref_id " \
                "JOIN commit c ON c.id = cin.commit_id " \
                "JOIN file_modification fm ON fm.commit_id = c.id " \
                "JOIN file f ON f.id = fm.file_id " \
                "WHERE " + filter_references + " AND " + filter_extensions + " AND cin.repo_id = %s " \
                "GROUP BY c.id, f.id) AS existing_pairs " \
                "ON processed_pairs.commit_id = existing_pairs.commit_id AND processed_pairs.file_id = existing_pairs.file_id " \
                "WHERE processed_pairs.commit_id IS NULL"
        arguments = [repo_id]
        self._dao.execute(cursor, query, arguments)

        row = self._dao.fetchone(cursor)

        while row:
            pairs.append({
                "commit_id": row[0],
                "commit_sha": row[1],
                "file_id": row[2],
                "file_name": row[3],
                "file_ext": row[4]
            })
            row = self._dao.fetchone(cursor)
        self._dao.close_cursor(cursor)

        return pairs

    def _update_existing_references(self, repo_id, import_type):
        pairs = self._get_new_commit_file_pairs(repo_id)
        intervals = [
            i for i in multiprocessing_util.get_tasks_intervals(
                pairs, self._num_processes) if len(i) > 0
        ]

        queue_intervals = multiprocessing.JoinableQueue()
        results = multiprocessing.Queue()

        # Start consumers
        multiprocessing_util.start_consumers(self._num_processes,
                                             queue_intervals, results)

        for interval in intervals:
            issue_extractor = Code2DbCommitFile(self._db_name,
                                                self._git_repo_path, interval,
                                                import_type, self._config,
                                                self._log_path)
            queue_intervals.put(issue_extractor)

        # Add end-of-queue markers
        multiprocessing_util.add_poison_pills(self._num_processes,
                                              queue_intervals)

        # Wait for all of the tasks to finish
        queue_intervals.join()

    def _update_info_code(self, repo_id, import_type):
        #updates code data
        self._update_existing_references(repo_id, import_type)

    def _get_import_type(self, repo_id):
        #gets import type
        import_type = 0
        import_type += self._dao.function_at_commit_is_empty(
            repo_id) + self._dao.code_at_commit_is_empty(repo_id)
        return import_type

    def update(self):
        """
        updates the Git data stored in the DB
        """
        try:
            self._logger = self._logging_util.get_logger(self._log_path)
            self._fileHandler = self._logging_util.get_file_handler(
                self._logger, self._log_path, "info")

            self._logger.info("Code2DbUpdate started")
            start_time = datetime.now()

            self._querier = GitQuerier(self._git_repo_path, self._logger)
            self._dao = GitDao(self._config, self._logger)

            project_id = self._dao.select_project_id(self._project_name)
            repo_id = self._dao.select_repo_id(self._repo_name)
            self._update_info_code(repo_id, self._get_import_type(repo_id))

            end_time = datetime.now()
            minutes_and_seconds = self._logging_util.calculate_execution_time(
                end_time, start_time)
            self._logger.info("Code2DbUpdate finished after " +
                              str(minutes_and_seconds[0]) + " minutes and " +
                              str(round(minutes_and_seconds[1], 1)) + " secs")
            self._logging_util.remove_file_handler_logger(
                self._logger, self._fileHandler)
        except Exception:
            self._logger.error("Code2DbUpdate failed", exc_info=True)
        finally:
            if self._dao:
                self._dao.close_connection()
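
# Hedged usage sketch, not from the original source: Code2DbUpdate only needs an
# existing Gitana DB plus a local clone of a repository already imported into it.
# Paths and names are placeholders; passing None for num_processes falls back to the
# NUM_PROCESSES default, and empty extensions/references disable those filters.
def _update_code_data(config):
    updater = Code2DbUpdate("gitana_db", "my_project", "my_repo",
                            "/path/to/local/clone", ["py", "java"],
                            ["master"], None, config, "logs/")
    updater.update()
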
Example n. 4
class Slack2DbUpdate():
    """
    This class handles the update of Slack data
    """
    def __init__(self, db_name, project_name, instant_messaging_name, tokens,
                 config, log_root_path):
        """
        :type db_name: str
        :param db_name: the name of an existing DB

        :type project_name: str
        :param project_name: the name of an existing project in the DB

        :type instant_messaging_name: str
        :param instant_messaging_name: the name of an existing instant messaging in the DB to update

        :type tokens: list str
        :param tokens: a list of Slack tokens

        :type config: dict
        :param config: the DB configuration file

        :type log_root_path: str
        :param log_root_path: the log path
        """
        self._log_path = log_root_path + "update-slack-" + db_name + "-" + project_name + "-" + instant_messaging_name
        self._project_name = project_name
        self._db_name = db_name
        self._instant_messaging_name = instant_messaging_name
        self._tokens = tokens

        config.update({'database': db_name})
        self._config = config

        self._logging_util = LoggingUtil()

        self._logger = None
        self._fileHandler = None
        self._querier = None
        self._dao = None

    def _update_channels(self, instant_messaging_id):
        #updates channels of an instant messaging
        channel_ids = self._dao.get_channel_ids(instant_messaging_id)

        if channel_ids:
            intervals = [
                i for i in multiprocessing_util.get_tasks_intervals(
                    channel_ids, len(self._tokens)) if len(i) > 0
            ]

            queue_extractors = multiprocessing.JoinableQueue()
            results = multiprocessing.Queue()

            # Start consumers
            multiprocessing_util.start_consumers(len(self._tokens),
                                                 queue_extractors, results)

            for i in range(len(intervals)):
                channel_extractor = SlackChannel2Db(
                    self._db_name, instant_messaging_id, intervals[i],
                    self._tokens[i], self._config, self._log_path)
                queue_extractors.put(channel_extractor)

            # Add end-of-queue markers
            multiprocessing_util.add_poison_pills(len(self._tokens),
                                                  queue_extractors)

            # Wait for all of the tasks to finish
            queue_extractors.join()

    def update(self):
        """
        updates the Slack data stored in the DB
        """
        try:
            self._logger = self._logging_util.get_logger(self._log_path)
            self._fileHandler = self._logging_util.get_file_handler(
                self._logger, self._log_path, "info")

            self._logger.info("SlackUpdate started")
            start_time = datetime.now()

            self._querier = SlackQuerier(self._tokens[0], self._logger)
            self._dao = SlackDao(self._config, self._logger)

            project_id = self._dao.select_project_id(self._project_name)
            instant_messaging_id = self._dao.select_instant_messaging_id(
                self._instant_messaging_name, project_id)

            if instant_messaging_id:
                self._update_channels(instant_messaging_id)

            end_time = datetime.now()
            minutes_and_seconds = self._logging_util.calculate_execution_time(
                end_time, start_time)
            self._logger.info("SlackDbUpdate extract finished after " +
                              str(minutes_and_seconds[0]) + " minutes and " +
                              str(round(minutes_and_seconds[1], 1)) + " secs")
            self._logging_util.remove_file_handler_logger(
                self._logger, self._fileHandler)
        except Exception:
            self._logger.error("SlackDbUpdate extract failed", exc_info=True)
        finally:
            if self._dao:
                self._dao.close_connection()
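
# Hedged usage sketch, illustration only: Slack2DbUpdate starts one consumer per token,
# so passing several tokens parallelises the channel update. Tokens and names are placeholders.
def _update_slack_data(config):
    updater = Slack2DbUpdate("gitana_db", "my_project", "team-slack",
                             ["SLACK_TOKEN_1", "SLACK_TOKEN_2"], config, "logs/")
    updater.update()
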
Example n. 5
class Git2DbMain():
    """
    This class handles the import of Git data
    """

    NUM_PROCESSES = 5

    def __init__(self, db_name, project_name, repo_name, git_repo_path,
                 before_date, import_type, references, num_processes, config,
                 log_root_path):
        """
        :type db_name: str
        :param db_name: the name of an existing DB

        :type project_name: str
        :param project_name: the name of an existing project in the DB

        :type repo_name: str
        :param repo_name: the name of the Git repository to import

        :type git_repo_path: str
        :param git_repo_path: the local path of the Git repository

        :type before_date: str
        :param before_date: import data before date (YYYY-mm-dd)

        :type import_type: int
        :param import_type: 1 does not import patches, 2 imports patches but not at line level, 3 imports patches with line detail

        :type references: list str
        :param references: list of references to import

        :type num_processes: int
        :param num_processes: number of processes to import the data (default 5)

        :type config: dict
        :param config: the DB configuration file

        :type log_root_path: str
        :param log_root_path: the log path
        """
        self._log_path = log_root_path + "import-git-" + db_name + "-" + project_name + "-" + repo_name
        self._git_repo_path = git_repo_path
        self._project_name = project_name
        self._db_name = db_name
        self._repo_name = repo_name
        self._before_date = before_date
        self._import_type = import_type

        self._references = references

        if num_processes:
            self._num_processes = num_processes
        else:
            self._num_processes = Git2DbMain.NUM_PROCESSES

        config.update({'database': db_name})
        self._config = config

        self._logging_util = LoggingUtil()

        self._logger = None
        self._fileHandler = None
        self._querier = None
        self._dao = None

    def _get_existing_references(self, repo_id):
        #retrieves already imported references
        existing_refs = []

        cursor = self._dao.get_cursor()
        query = "SELECT ref.name " \
                "FROM reference ref JOIN repository r ON ref.repo_id = r.id " \
                "WHERE r.id = %s"
        arguments = [repo_id]
        self._dao.execute(cursor, query, arguments)

        row = self._dao.fetchone(cursor)

        while row:
            existing_refs.append(row[0])
            row = self._dao.fetchone(cursor)
        self._dao.close_cursor(cursor)

        return existing_refs

    def _get_info_contribution(self, repo_id):
        #processes Git data
        existing_refs = self._get_existing_references(repo_id)

        queue_references = multiprocessing.JoinableQueue()
        results = multiprocessing.Queue()

        # Start consumers
        multiprocessing_util.start_consumers(self._num_processes,
                                             queue_references, results)
        for reference in self._querier.get_references():
            if self._references:
                if reference[0] in self._references:
                    git_ref_extractor = Git2DbReference(
                        self._db_name, repo_id, self._git_repo_path,
                        self._before_date, self._import_type, reference[0], "",
                        self._config, self._log_path)

                    queue_references.put(git_ref_extractor)
            else:
                if reference[0] not in existing_refs:
                    git_ref_extractor = Git2DbReference(
                        self._db_name, repo_id, self._git_repo_path,
                        self._before_date, self._import_type, reference[0], "",
                        self._config, self._log_path)

                    queue_references.put(git_ref_extractor)

        # Add end-of-queue markers
        multiprocessing_util.add_poison_pills(self._num_processes,
                                              queue_references)

        # Wait for all of the tasks to finish
        queue_references.join()

    def extract(self):
        """
        extracts Git data and stores it in the DB
        """
        try:
            self._logger = self._logging_util.get_logger(self._log_path)
            self._fileHandler = self._logging_util.get_file_handler(
                self._logger, self._log_path, "info")

            self._logger.info("Git2DbMain started")
            start_time = datetime.now()

            self._querier = GitQuerier(self._git_repo_path, self._logger)
            self._dao = GitDao(self._config, self._logger)

            project_id = self._dao.select_project_id(self._project_name)
            self._dao.insert_repo(project_id, self._repo_name)
            repo_id = self._dao.select_repo_id(self._repo_name)
            #info contribution does not need a connection to the db
            self._get_info_contribution(repo_id)
            self._dao.restart_connection()
            self._dao.fix_commit_parent_table(repo_id)
            end_time = datetime.now()
            minutes_and_seconds = self._logging_util.calculate_execution_time(
                end_time, start_time)
            self._logger.info("Git2DbMain finished after " +
                              str(minutes_and_seconds[0]) + " minutes and " +
                              str(round(minutes_and_seconds[1], 1)) + " secs")
            self._logging_util.remove_file_handler_logger(
                self._logger, self._fileHandler)
        except Exception:
            self._logger.error("Git2DbMain failed", exc_info=True)
        finally:
            if self._dao:
                self._dao.close_connection()
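
# Hedged usage sketch, not from the original source: import_type follows the docstring
# above (1 = no patches, 2 = patches without line detail, 3 = patches with line detail),
# and before_date uses the YYYY-mm-dd format. Names, paths, and dates are placeholders;
# passing None for references imports every reference not already stored in the DB.
def _import_git_data(config):
    importer = Git2DbMain("gitana_db", "my_project", "my_repo",
                          "/path/to/local/clone", "2020-01-01", 2,
                          None, None, config, "logs/")
    importer.extract()
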
Example n. 6
class DbSchema():
    """
    This class initializes the DB schema
    """
    def __init__(self, db_name, config, log_root_path):
        """
        :type db_name: str
        :param db_name: the name of the DB to initialize/connect to; it cannot be null and must follow the format
        allowed in MySQL (http://dev.mysql.com/doc/refman/5.7/en/identifiers.html).
        If a DB with the same name already exists in Gitana, the existing DB will be dropped and a new one will be created

        :type config: dict
        :param config: the DB configuration file

        :type log_root_path: str
        :param log_root_path: the log path
        """
        self._db_name = db_name
        self._config = config
        self._log_root_path = log_root_path
        self._db_util = DbUtil()
        self._logging_util = LoggingUtil()

        log_path = self._log_root_path + "db-schema-" + db_name
        self._logger = self._logging_util.get_logger(log_path)
        self._fileHandler = self._logging_util.get_file_handler(
            self._logger, log_path, "info")
        self._cnx = self._db_util.get_connection(self._config)

    def __del__(self):
        if self._cnx:
            self._db_util.close_connection(self._cnx)
        if self._logger:
            #deletes the file handler of the logger
            self._logging_util.remove_file_handler_logger(
                self._logger, self._fileHandler)

    def add_git_tables(self):
        """
        initializes git tables if they do not exist
        """
        self.set_database(self._db_name)
        self._init_git_tables()

    def add_issue_tracker_tables(self):
        """
        initializes issue tracker tables if they do not exist
        """
        self.set_database(self._db_name)
        self._init_shared_tables_issue_tracker_communication_channels()
        self._init_issue_tracker_tables()

    def add_instant_messaging_tables(self):
        """
        initializes instant messaging tables if they do not exist
        """
        self.set_database(self._db_name)
        self._init_shared_tables_issue_tracker_communication_channels()
        self._init_instant_messaging_tables()

    def add_forum_tables(self):
        """
        initializes forum tables if they do not exist
        """
        self.set_database(self._db_name)
        self._init_shared_tables_issue_tracker_communication_channels()
        self._init_forum_tables()

    def init_database(self, init_git, init_issue_tracker, init_forum,
                      init_instant_messaging):
        """
        initializes the database tables and functions

        :type init_git: bool
        :param init_git: if True, it initializes the tables containing git data

        :type init_issue_tracker: bool
        :param init_issue_tracker: if True, it initializes the tables containing issue tracker data

        :type init_forum: bool
        :param init_forum: if True, it initializes the tables containing forum data

        :type init_instant_messaging: bool
        :param init_instant_messaging: if True, it initializes the tables containing instant messaging data
        """
        try:
            self._logger.info("init database started")
            start_time = datetime.now()
            self._create_database()
            self.set_database(self._db_name)
            self._set_settings()

            self._init_common_tables()

            if init_issue_tracker or init_forum or init_instant_messaging:
                self._init_shared_tables_issue_tracker_communication_channels()

            if init_git:
                self._init_git_tables()

            if init_issue_tracker:
                self._init_issue_tracker_tables()

            if init_forum:
                self._init_forum_tables()

            if init_instant_messaging:
                self._init_instant_messaging_tables()

            self._init_functions()
            self._logger.info("database " + self._db_name + " created")

            end_time = datetime.now()

            minutes_and_seconds = self._logging_util.calculate_execution_time(
                end_time, start_time)
            self._logger.info("Init database finished after " +
                              str(minutes_and_seconds[0]) + " minutes and " +
                              str(round(minutes_and_seconds[1], 1)) + " secs")
        except Exception:
            self._logger.error("init database failed", exc_info=True)

    def create_project(self, project_name):
        """
        inserts a project in the DB

        :type project_name: str
        :param project_name: the name of the project to create
        """
        self._cnx = self._db_util.get_connection(self._config)
        self._db_util.insert_project(self._cnx, self._db_name, project_name)
        self._db_util.close_connection(self._cnx)

    def create_repository(self, project_name, repo_name):
        """
        inserts a repository in the DB

        :type project_name: str
        :param project_name: the name of an existing project

        :type repo_name: str
        :param repo_name: the name of the repository to insert
        """
        self._cnx = self._db_util.get_connection(self._config)
        self.set_database(self._db_name)
        project_id = self._db_util.select_project_id(self._cnx, project_name,
                                                     self._logger)
        try:
            self._db_util.insert_repo(self._cnx, project_id, repo_name,
                                      self._logger)
        except Exception:
            self._logger.error("repository " + repo_name + " not inserted",
                               exc_info=True)
        self._db_util.close_connection(self._cnx)

    def list_projects(self):
        """
        lists all projects contained in the DB
        """
        self._cnx = self._db_util.get_connection(self._config)
        project_names = []
        self.set_database(self._db_name)
        cursor = self._cnx.cursor()
        query = "SELECT name FROM project"
        cursor.execute(query)

        row = cursor.fetchone()

        while row:
            project_names.append(row[0])
            row = cursor.fetchone()

        cursor.close()
        return project_names

    def set_database(self, db_name):
        """
        sets the DB used by the tool

        :type db_name: str
        :param db_name: the name of the DB
        """
        try:
            self._logger.info("set database " + db_name + " started")
            self._db_util.set_database(self._cnx, db_name)
            self._logger.info("set database " + db_name + " finished")
        except Exception:
            self._logger.error("set database failed", exc_info=True)

    def _set_settings(self):
        #sets the settings (max connections, charset, file format, ...) used by the DB
        self._db_util.set_settings(self._cnx)

    def _create_database(self):
        #creates the database
        cursor = self._cnx.cursor()

        drop_database_if_exists = "DROP DATABASE IF EXISTS " + self._db_name
        cursor.execute(drop_database_if_exists)

        create_database = "CREATE DATABASE " + self._db_name
        cursor.execute(create_database)

        cursor.close()

    def _init_functions(self):
        #initializes functions
        cursor = self._cnx.cursor()

        levenshtein_distance = """
        CREATE DEFINER=`root`@`localhost` FUNCTION `levenshtein_distance`(s1 VARCHAR(255) CHARACTER SET utf8, s2 VARCHAR(255) CHARACTER SET utf8) RETURNS int(11)
            DETERMINISTIC
        BEGIN
            DECLARE s1_len, s2_len, i, j, c, c_temp, cost INT;
            DECLARE s1_char CHAR CHARACTER SET utf8;
            -- max strlen=255 for this function
            DECLARE cv0, cv1 VARBINARY(256);

            SET s1_len = CHAR_LENGTH(s1),
                s2_len = CHAR_LENGTH(s2),
                cv1 = 0x00,
                j = 1,
                i = 1,
                c = 0;

            IF (s1 = s2) THEN
              RETURN (0);
            ELSEIF (s1_len = 0) THEN
              RETURN (s2_len);
            ELSEIF (s2_len = 0) THEN
              RETURN (s1_len);
            END IF;

            WHILE (j <= s2_len) DO
              SET cv1 = CONCAT(cv1, CHAR(j)),
                  j = j + 1;
            END WHILE;

            WHILE (i <= s1_len) DO
              SET s1_char = SUBSTRING(s1, i, 1),
                  c = i,
                  cv0 = CHAR(i),
                  j = 1;

              WHILE (j <= s2_len) DO
                SET c = c + 1,
                    cost = IF(s1_char = SUBSTRING(s2, j, 1), 0, 1);

                SET c_temp = ORD(SUBSTRING(cv1, j, 1)) + cost;
                IF (c > c_temp) THEN
                  SET c = c_temp;
                END IF;

                SET c_temp = ORD(SUBSTRING(cv1, j+1, 1)) + 1;
                IF (c > c_temp) THEN
                  SET c = c_temp;
                END IF;

                SET cv0 = CONCAT(cv0, CHAR(c)),
                    j = j + 1;
              END WHILE;

              SET cv1 = cv0,
                  i = i + 1;
            END WHILE;

            RETURN (c);
        END"""

        soundex_match = """
        CREATE DEFINER=`root`@`localhost` FUNCTION `soundex_match`(s1 VARCHAR(255) CHARACTER SET utf8, s2 VARCHAR(255) CHARACTER SET utf8) RETURNS int(1)
            DETERMINISTIC
        BEGIN
            DECLARE _result INT DEFAULT 0;
            IF SOUNDEX(s1) = SOUNDEX(s2) THEN
                SET _result = 1;
            END IF;
            RETURN _result;
        END"""

        cursor.execute(levenshtein_distance)
        cursor.execute(soundex_match)
        cursor.close()

    def _init_common_tables(self):
        #initializes common tables used by tables modeling git, issue tracker, forum and instant messaging data
        cursor = self._cnx.cursor()

        create_table_project = "CREATE TABLE IF NOT EXISTS project( " \
                               "id int(20) AUTO_INCREMENT PRIMARY KEY, " \
                               "name varchar(255), " \
                               "CONSTRAINT name UNIQUE (name)" \
                               ") ENGINE=InnoDB DEFAULT CHARSET=utf8 ROW_FORMAT=DYNAMIC;"

        create_table_user = "******" \
                            "id int(20) AUTO_INCREMENT PRIMARY KEY, " \
                            "name varchar(256), " \
                            "email varchar(256), " \
                            "CONSTRAINT namem UNIQUE (name, email) " \
                            ") ENGINE=InnoDB DEFAULT CHARSET=utf8 ROW_FORMAT=DYNAMIC;"

        create_table_user_alias = "CREATE TABLE IF NOT EXISTS user_alias ( " \
                                  "user_id int(20), " \
                                  "alias_id int(20), " \
                                  "CONSTRAINT a UNIQUE (user_id) " \
                                  ") ENGINE=InnoDB DEFAULT CHARSET=utf8 ROW_FORMAT=DYNAMIC;"

        cursor.execute(create_table_project)
        cursor.execute(create_table_user)
        cursor.execute(create_table_user_alias)
        cursor.close()

    def _init_shared_tables_issue_tracker_communication_channels(self):
        #initializes shared tables used by tables modeling issue tracker, forum and instant messaging data
        cursor = self._cnx.cursor()

        create_table_label = "CREATE TABLE IF NOT EXISTS label ( " \
                             "id int(20) AUTO_INCREMENT PRIMARY KEY, " \
                             "name varchar(256), " \
                             "CONSTRAINT name UNIQUE (name) " \
                             ") ENGINE=InnoDB DEFAULT CHARSET=utf8 ROW_FORMAT=DYNAMIC;"

        create_table_message = "CREATE TABLE IF NOT EXISTS message ( " \
                               "id int(20) AUTO_INCREMENT PRIMARY KEY, " \
                               "own_id varchar(20), " \
                               "pos int(10), " \
                               "type_id int(20), " \
                               "issue_id int(20), " \
                               "topic_id int(20), " \
                               "channel_id int(20), " \
                               "body longblob, " \
                               "votes int(20), " \
                               "author_id int(20), " \
                               "created_at timestamp NULL DEFAULT NULL," \
                               "CONSTRAINT ip UNIQUE (issue_id, topic_id, channel_id, own_id) " \
                               ") ENGINE=InnoDB DEFAULT CHARSET=utf8 ROW_FORMAT=DYNAMIC;"

        create_table_message_dependency = "CREATE TABLE IF NOT EXISTS message_dependency ( " \
                                          "source_message_id int(20), " \
                                          "target_message_id int(20), " \
                                          "PRIMARY KEY st (source_message_id, target_message_id) " \
                                          ") ENGINE=InnoDB DEFAULT CHARSET=utf8 ROW_FORMAT=DYNAMIC;"

        create_table_message_type = "CREATE TABLE IF NOT EXISTS message_type ( " \
                                    "id int(20) AUTO_INCREMENT PRIMARY KEY, " \
                                    "name varchar(255), " \
                                    "CONSTRAINT name UNIQUE (name) " \
                                    ") ENGINE=InnoDB DEFAULT CHARSET=utf8 ROW_FORMAT=DYNAMIC;"

        insert_message_types = "INSERT IGNORE INTO message_type VALUES (NULL, 'question'), " \
                                                               "(NULL, 'answer'), " \
                                                               "(NULL, 'comment'), " \
                                                               "(NULL, 'accepted_answer'), " \
                                                               "(NULL, 'reply'), " \
                                                               "(NULL, 'file_upload'), " \
                                                               "(NULL, 'info');"

        create_table_attachment = "CREATE TABLE IF NOT EXISTS attachment ( " \
                                  "id int(20) AUTO_INCREMENT PRIMARY KEY, " \
                                  "own_id varchar(20), " \
                                  "message_id int(20), " \
                                  "name varchar(256), " \
                                  "extension varchar(10), " \
                                  "bytes int(20), " \
                                  "url varchar(512), " \
                                  "CONSTRAINT ip UNIQUE (message_id, own_id) " \
                                  ") ENGINE=InnoDB DEFAULT CHARSET=utf8 ROW_FORMAT=DYNAMIC;"

        cursor.execute(create_table_label)
        cursor.execute(create_table_message)
        cursor.execute(create_table_message_dependency)
        cursor.execute(create_table_message_type)
        cursor.execute(insert_message_types)
        cursor.execute(create_table_attachment)

        cursor.close()

    def _init_git_tables(self):
        #initializes tables used to model git data
        cursor = self._cnx.cursor()

        create_table_repository = "CREATE TABLE IF NOT EXISTS repository( " \
                                  "id int(20) AUTO_INCREMENT PRIMARY KEY, " \
                                  "project_id int(20), " \
                                  "name varchar(255), " \
                                  "CONSTRAINT name UNIQUE (name)" \
                                  ") ENGINE=InnoDB DEFAULT CHARSET=utf8 ROW_FORMAT=DYNAMIC;"

        create_table_reference = "CREATE TABLE IF NOT EXISTS reference( " \
                                 "id int(20) AUTO_INCREMENT PRIMARY KEY, " \
                                 "repo_id int(20), " \
                                 "name varchar(255), " \
                                 "type varchar(255), " \
                                 "CONSTRAINT name UNIQUE (repo_id, name, type) " \
                                 ") ENGINE=InnoDB DEFAULT CHARSET=utf8 ROW_FORMAT=DYNAMIC;"

        create_table_commit = "CREATE TABLE IF NOT EXISTS commit(" \
                              "id int(20) AUTO_INCREMENT PRIMARY KEY, " \
                              "repo_id int(20), " \
                              "sha varchar(512), " \
                              "message varchar(512), " \
                              "author_id int(20), " \
                              "committer_id int(20), " \
                              "authored_date timestamp NULL DEFAULT NULL, " \
                              "committed_date timestamp NULL DEFAULT NULL, " \
                              "size int(20), " \
                              "INDEX sha (sha), " \
                              "INDEX auth (author_id), " \
                              "INDEX comm (committer_id), " \
                              "CONSTRAINT s UNIQUE (sha, repo_id) " \
                              ") ENGINE=InnoDB DEFAULT CHARSET=utf8 ROW_FORMAT=DYNAMIC;"

        create_table_commit_parent = "CREATE TABLE IF NOT EXISTS commit_parent(" \
                                     "repo_id int(20), " \
                                     "commit_id int(20), " \
                                     "commit_sha varchar(512), " \
                                     "parent_id int(20), " \
                                     "parent_sha varchar(512), " \
                                     "PRIMARY KEY copa (repo_id, commit_id, parent_id), " \
                                     "CONSTRAINT cshapsha UNIQUE (repo_id, commit_id, parent_sha) " \
                                     ") ENGINE=InnoDB DEFAULT CHARSET=utf8 ROW_FORMAT=DYNAMIC;"

        create_table_commits2reference = "CREATE TABLE IF NOT EXISTS commit_in_reference(" \
                                         "repo_id int(20), " \
                                         "commit_id int(20), " \
                                         "ref_id int(20), " \
                                         "PRIMARY KEY core (commit_id, ref_id) " \
                                         ") ENGINE=InnoDB DEFAULT CHARSET=utf8 ROW_FORMAT=DYNAMIC;"

        create_table_file = "CREATE TABLE IF NOT EXISTS file( " \
                            "id int(20) AUTO_INCREMENT PRIMARY KEY, " \
                            "repo_id int(20), " \
                            "name varchar(512), " \
                            "ext varchar(255), " \
                            "CONSTRAINT rerena UNIQUE (repo_id, name) " \
                            ") ENGINE=InnoDB DEFAULT CHARSET=utf8 ROW_FORMAT=DYNAMIC;"

        create_table_file_renamed = "CREATE TABLE IF NOT EXISTS file_renamed ( " \
                                    "repo_id int(20), " \
                                    "current_file_id int(20), " \
                                    "previous_file_id int(20), " \
                                    "file_modification_id int(20), " \
                                    "PRIMARY KEY cpc (current_file_id, previous_file_id, file_modification_id) " \
                                    ") ENGINE=InnoDB DEFAULT CHARSET=utf8 ROW_FORMAT=DYNAMIC;"

        create_table_file_modification = "CREATE TABLE IF NOT EXISTS file_modification ( " \
                                         "id int(20) AUTO_INCREMENT PRIMARY KEY, " \
                                         "commit_id int(20), " \
                                         "file_id int(20), " \
                                         "status varchar(10), " \
                                         "additions numeric(10), " \
                                         "deletions numeric(10), " \
                                         "changes numeric(10), " \
                                         "patch longblob, " \
                                         "CONSTRAINT cf UNIQUE (commit_id, file_id) " \
                                         ") ENGINE=InnoDB DEFAULT CHARSET=utf8 ROW_FORMAT=DYNAMIC;"

        create_table_line_detail = "CREATE TABLE IF NOT EXISTS line_detail( " \
                                   "file_modification_id int(20)," \
                                   "type varchar(25), " \
                                   "line_number numeric(20), " \
                                   "is_commented numeric(1), " \
                                   "is_partially_commented numeric(1), " \
                                   "is_empty numeric(1), " \
                                   "content longblob, " \
                                   "PRIMARY KEY fityli (file_modification_id, type, line_number) " \
                                   ") ENGINE=InnoDB DEFAULT CHARSET=utf8 ROW_FORMAT=DYNAMIC;"

        # adding it here because "file_dependency" depends on "file" table creation.
        # @todo: find a way to move the following table creation to a separate section and
        #   make the "extract_dependency_relations" API interface completely independent.
        create_table_file_dependency = "CREATE TABLE file_dependency ( " \
                                       "repo_id int(20), " \
                                       "ref_id int(20), " \
                                       "source_file_id int(20), " \
                                       "target_file_id int(20), " \
                                       "CONSTRAINT dep UNIQUE (repo_id, ref_id, source_file_id, target_file_id) " \
                                       ") ENGINE=InnoDB DEFAULT CHARSET=utf8 ROW_FORMAT=DYNAMIC;"

        cursor.execute(create_table_repository)
        cursor.execute(create_table_reference)
        cursor.execute(create_table_commit)
        cursor.execute(create_table_commit_parent)
        cursor.execute(create_table_commits2reference)
        cursor.execute(create_table_file)
        cursor.execute(create_table_file_renamed)
        cursor.execute(create_table_file_modification)
        cursor.execute(create_table_line_detail)
        cursor.execute(create_table_file_dependency)
        cursor.close()

    def _init_issue_tracker_tables(self):
        #initializes tables used to model issue tracker data
        cursor = self._cnx.cursor()

        create_table_issue_tracker = "CREATE TABLE IF NOT EXISTS issue_tracker ( " \
                                     "id int(20) AUTO_INCREMENT PRIMARY KEY, " \
                                     "repo_id int(20), " \
                                     "name varchar(512), " \
                                     "type varchar(512), " \
                                     "CONSTRAINT name UNIQUE (name)" \
                                     ") ENGINE=InnoDB DEFAULT CHARSET=utf8 ROW_FORMAT=DYNAMIC;"

        create_table_issue = "CREATE TABLE IF NOT EXISTS issue ( " \
                             "id int(20) AUTO_INCREMENT PRIMARY KEY, " \
                             "own_id varchar(20), " \
                             "issue_tracker_id int(20), " \
                             "summary varchar(512), " \
                             "component varchar(256), " \
                             "version varchar(256), " \
                             "hardware varchar(256), " \
                             "priority varchar(256), " \
                             "severity varchar(256), " \
                             "reference_id int(20), " \
                             "reporter_id int(20), " \
                             "created_at timestamp NULL DEFAULT NULL, " \
                             "last_change_at timestamp NULL DEFAULT NULL, " \
                             "CONSTRAINT ioi UNIQUE (issue_tracker_id, own_id), " \
                             "INDEX u (reporter_id), " \
                             "INDEX r (reference_id) " \
                             ") ENGINE=InnoDB DEFAULT CHARSET=utf8 ROW_FORMAT=DYNAMIC;"

        create_table_issue_assignee = "CREATE TABLE IF NOT EXISTS issue_assignee ( " \
                                      "issue_id int(20), " \
                                      "assignee_id int(20), " \
                                      "PRIMARY KEY il (issue_id, assignee_id) " \
                                      ") ENGINE=InnoDB DEFAULT CHARSET=utf8 ROW_FORMAT=DYNAMIC;"

        create_table_issue_subscriber = "CREATE TABLE IF NOT EXISTS issue_subscriber ( " \
                                        "issue_id int(20), " \
                                        "subscriber_id int(20), " \
                                        "PRIMARY KEY il (issue_id, subscriber_id) " \
                                        ") ENGINE=InnoDB DEFAULT CHARSET=utf8 ROW_FORMAT=DYNAMIC;"

        create_table_issue_event = "CREATE TABLE IF NOT EXISTS issue_event ( " \
                                   "id int(20) AUTO_INCREMENT PRIMARY KEY, " \
                                   "issue_id int(20), " \
                                   "event_type_id int(20), " \
                                   "detail varchar(256), " \
                                   "creator_id int(20), " \
                                   "created_at timestamp NULL DEFAULT NULL, " \
                                   "target_user_id int(20), " \
                                   "CONSTRAINT iecc UNIQUE (issue_id, event_type_id, creator_id, created_at, detail) " \
                                   ") ENGINE=InnoDB DEFAULT CHARSET=utf8 ROW_FORMAT=DYNAMIC;"

        create_table_issue_event_type = "CREATE TABLE IF NOT EXISTS issue_event_type ( " \
                                        "id int(20) AUTO_INCREMENT PRIMARY KEY, " \
                                        "name varchar(256), " \
                                        "CONSTRAINT name UNIQUE (name) " \
                                        ") ENGINE=InnoDB DEFAULT CHARSET=utf8 ROW_FORMAT=DYNAMIC;"

        create_table_issue_labelled = "CREATE TABLE IF NOT EXISTS issue_labelled ( " \
                                      "issue_id int(20), " \
                                      "label_id int(20), " \
                                      "PRIMARY KEY il (issue_id, label_id) " \
                                      ") ENGINE=InnoDB DEFAULT CHARSET=utf8 ROW_FORMAT=DYNAMIC;"

        create_issue_commit_dependency = "CREATE TABLE IF NOT EXISTS issue_commit_dependency ( " \
                                         "issue_id int(20), " \
                                         "commit_id int(20), " \
                                         "PRIMARY KEY ict (issue_id, commit_id) " \
                                         ") ENGINE=InnoDB DEFAULT CHARSET=utf8 ROW_FORMAT=DYNAMIC;"

        create_table_issue_dependency = "CREATE TABLE IF NOT EXISTS issue_dependency ( " \
                                        "issue_source_id int(20), " \
                                        "issue_target_id int(20), " \
                                        "type_id int(20), " \
                                        "PRIMARY KEY st (issue_source_id, issue_target_id, type_id) " \
                                        ") ENGINE=InnoDB DEFAULT CHARSET=utf8 ROW_FORMAT=DYNAMIC;"

        create_issue_dependency_type = "CREATE TABLE IF NOT EXISTS issue_dependency_type (" \
                                       "id int(20) AUTO_INCREMENT PRIMARY KEY, " \
                                       "name varchar(256), " \
                                       "CONSTRAINT name UNIQUE (name) " \
                                       ") ENGINE=InnoDB DEFAULT CHARSET=utf8 ROW_FORMAT=DYNAMIC;"

        insert_issue_dependency_type = "INSERT IGNORE INTO issue_dependency_type VALUES (NULL, 'block'), " \
                                                                                "(NULL, 'depends'), " \
                                                                                "(NULL, 'related'), " \
                                                                                "(NULL, 'duplicated');"

        cursor.execute(create_table_issue_tracker)
        cursor.execute(create_table_issue)
        cursor.execute(create_table_issue_assignee)
        cursor.execute(create_table_issue_subscriber)
        cursor.execute(create_table_issue_event)
        cursor.execute(create_table_issue_event_type)
        cursor.execute(create_table_issue_labelled)
        cursor.execute(create_issue_commit_dependency)
        cursor.execute(create_table_issue_dependency)
        cursor.execute(create_issue_dependency_type)
        cursor.execute(insert_issue_dependency_type)
        cursor.close()

    def _init_forum_tables(self):
        #initializes tables used to model forum data
        cursor = self._cnx.cursor()

        create_table_forum = "CREATE TABLE IF NOT EXISTS forum ( " \
                             "id int(20) AUTO_INCREMENT PRIMARY KEY, " \
                             "project_id int(20), " \
                             "name varchar(512), " \
                             "type varchar(512), " \
                             "CONSTRAINT name UNIQUE (name)" \
                             ") ENGINE=InnoDB DEFAULT CHARSET=utf8 ROW_FORMAT=DYNAMIC;"

        create_table_topic = "CREATE TABLE IF NOT EXISTS topic ( " \
                             "id int(20) AUTO_INCREMENT PRIMARY KEY, " \
                             "own_id varchar(20), " \
                             "forum_id int(20), " \
                             "name varchar(256), " \
                             "votes int(10), " \
                             "views int(10), " \
                             "created_at timestamp NULL DEFAULT NULL, " \
                             "last_change_at timestamp NULL DEFAULT NULL, " \
                             "CONSTRAINT name UNIQUE (forum_id, own_id)" \
                             ") ENGINE=InnoDB DEFAULT CHARSET=utf8 ROW_FORMAT=DYNAMIC;"

        create_table_topic_labelled = "CREATE TABLE IF NOT EXISTS topic_labelled ( " \
                                      "topic_id int(20), " \
                                      "label_id int(20), " \
                                      "PRIMARY KEY il (topic_id, label_id) " \
                                      ") ENGINE=InnoDB DEFAULT CHARSET=utf8 ROW_FORMAT=DYNAMIC;"

        cursor.execute(create_table_forum)
        cursor.execute(create_table_topic)
        cursor.execute(create_table_topic_labelled)

        cursor.close()

    def _init_instant_messaging_tables(self):
        #initializes tables used to model instant messaging data
        cursor = self._cnx.cursor()

        create_table_instant_messaging = "CREATE TABLE IF NOT EXISTS instant_messaging ( " \
                                         "id int(20) AUTO_INCREMENT PRIMARY KEY, " \
                                         "project_id int(20), " \
                                         "name varchar(512), " \
                                         "type varchar(512), " \
                                         "CONSTRAINT name UNIQUE (name)" \
                                         ") ENGINE=InnoDB DEFAULT CHARSET=utf8 ROW_FORMAT=DYNAMIC;"

        create_table_channel = "CREATE TABLE IF NOT EXISTS channel ( " \
                               "id int(20) AUTO_INCREMENT PRIMARY KEY, " \
                               "own_id varchar(20), " \
                               "instant_messaging_id int(20), " \
                               "name varchar(256), " \
                               "description varchar(512), " \
                               "created_at timestamp NULL DEFAULT NULL, " \
                               "last_change_at timestamp NULL DEFAULT NULL, " \
                               "CONSTRAINT name UNIQUE (instant_messaging_id, own_id)" \
                               ") ENGINE=InnoDB DEFAULT CHARSET=utf8 ROW_FORMAT=DYNAMIC;"

        cursor.execute(create_table_instant_messaging)
        cursor.execute(create_table_channel)
        cursor.close()
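
# Hedged usage sketch, illustration only: init_database drops and recreates the DB named
# in the constructor, so it is destructive. The DB name, project and repository names,
# and the config dict are placeholders.
def _bootstrap_schema(config):
    schema = DbSchema("gitana_db", config, "logs/")
    schema.init_database(init_git=True, init_issue_tracker=True,
                         init_forum=False, init_instant_messaging=False)
    schema.create_project("my_project")
    schema.create_repository("my_project", "my_repo")
    print(schema.list_projects())
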
Example n. 7
class StackOverflowTopic2Db(object):
    """
    This class handles the import of Stackoverflow topics
    """
    def __init__(self, db_name, forum_id, interval, token, config,
                 log_root_path):
        """
        :type db_name: str
        :param db_name: the name of an existing DB

        :type forum_id: int
        :param forum_id: the id of an existing forum in the DB

        :type interval: list int
        :param interval: a list of topic ids to import

        :type token: str
        :param token: a Stackoverflow token

        :type config: dict
        :param config: the DB configuration file

        :type log_root_path: str
        :param log_root_path: the log path
        """

        self._log_root_path = log_root_path
        self._interval = interval
        self._db_name = db_name
        self._forum_id = forum_id
        self._token = token
        self._config = config

        self._logging_util = LoggingUtil()

        self._fileHandler = None
        self._logger = None
        self._querier = None
        self._dao = None

    def __call__(self):
        try:
            log_path = self._log_root_path + "-topic2db-" + str(
                self._interval[0]) + "-" + str(self._interval[-1])
            self._logger = self._logging_util.get_logger(log_path)
            self._fileHandler = self._logging_util.get_file_handler(
                self._logger, log_path, "info")

            self._querier = StackOverflowQuerier(self._token, self._logger)
            self._dao = StackOverflowDao(self._config, self._logger)
            self.extract()
        except Exception:
            self._logger.error("StackOverflowTopic2Db failed", exc_info=True)
        finally:
            if self._dao:
                self._dao.close_connection()

    def _extract_answers(self, answers, topic_id, message_id):
        # extracts answers
        for a in answers:
            own_id = self._querier.get_container_own_id(a)
            body = self._querier.get_container_body(a)
            author_id = self._dao.get_user_id(
                self._querier.get_container_author(a))
            created_at = self._querier.get_container_created_at(a)
            votes = self._querier.get_container_votes(a)

            if self._querier.is_accepted_answer(a):
                message_type = "accepted_answer"
            else:
                message_type = "answer"

            answer_message_id = self._dao.select_message_id(own_id, topic_id)

            if answer_message_id:
                self._dao.update_message(own_id, topic_id, body, votes)
            else:
                self._dao.insert_message(
                    own_id, self.pos,
                    self._dao.get_message_type_id(message_type), topic_id,
                    self._querier.remove_html_tags(body), votes, author_id,
                    created_at)
                answer_message_id = self._dao.select_message_id(
                    own_id, topic_id)

            self._dao.insert_message_dependency(message_id, answer_message_id)
            self._extract_attachments(body, answer_message_id)
            self.pos += 1

            self._extract_comment_messages(self._querier.get_comments(a),
                                           topic_id, answer_message_id)

    def _extract_comment_messages(self, comments, topic_id, message_id):
        # extracts comments
        for c in comments:
            own_id = self._querier.get_container_own_id(c)
            body = self._querier.get_container_body(c)
            author_id = self._dao.get_user_id(
                self._querier.get_container_author(c))
            created_at = self._querier.get_container_created_at(c)
            votes = self._querier.get_container_votes(c)

            comment_message_id = self._dao.select_message_id(own_id, topic_id)
            if comment_message_id:
                self._dao.update_message(own_id, topic_id, body, votes)
            else:
                self._dao.insert_message(
                    own_id, self.pos,
                    self._dao.get_message_type_id("comment"), topic_id,
                    self._querier.remove_html_tags(body), votes, author_id,
                    created_at)
                comment_message_id = self._dao.select_message_id(
                    own_id, topic_id)

            self._dao.insert_message_dependency(message_id, comment_message_id)
            self._extract_attachments(body, comment_message_id)
            self.pos += 1

    def _extract_attachments(self, body, message_id):
        # extracts attachments
        attachments = self._querier.get_attachments(body)
        if attachments:
            self._insert_attachments(attachments, message_id)

    def _insert_labels(self, labels, topic_id):
        # inserts the labels of a topic
        for l in labels:
            self._dao.insert_label(l)
            label_id = self._dao.select_label_id(l)
            self._dao.assign_label_to_topic(label_id, topic_id)

    def _insert_attachments(self, attachments, message_id):
        # inserts attachments
        pos = 0
        for attachment in attachments:
            attachment_name = self._querier.get_attachment_name(attachment)
            attachment_own_id = self._querier.generate_attachment_id(
                message_id, pos)
            attachment_url = self._querier.get_attachment_url(attachment)
            self._dao.insert_attachment(attachment_own_id, message_id,
                                        attachment_name, attachment_url)
            pos += 1

    def _extract_topic(self, topic):
        # extracts a topic
        last_change_at = self._querier.get_topic_last_change_at(topic)
        own_id = self._querier.get_container_own_id(topic)

        if self._dao.get_topic_last_change_at(
                own_id, self._forum_id) != last_change_at:
            name = self._querier.get_topic_name(topic)
            votes = self._querier.get_container_votes(topic)
            views = self._querier.get_topic_views(topic)
            created_at = self._querier.get_container_created_at(topic)

            topic_id = self._dao.insert_topic(own_id, self._forum_id, name,
                                              votes, views, created_at,
                                              last_change_at)
            author_id = self._dao.get_user_id(
                self._querier.get_container_author(topic))

            labels = self._querier.get_topic_labels(topic)
            self._insert_labels(labels, topic_id)

            self.pos = 0
            body = self._querier.get_container_body(topic)

            message_id = self._dao.select_message_id(own_id, topic_id)
            if message_id:
                self._dao.update_message(own_id, topic_id,
                                         self._querier.remove_html_tags(body),
                                         votes)
            else:
                self._dao.insert_message(
                    own_id, self.pos,
                    self._dao.get_message_type_id("question"), topic_id,
                    self._querier.remove_html_tags(body), votes, author_id,
                    created_at)
                message_id = self._dao.select_message_id(own_id, topic_id)
            self._extract_attachments(body, message_id)

            self.pos += 1

            self._extract_comment_messages(self._querier.get_comments(topic),
                                           topic_id, message_id)
            self._extract_answers(self._querier.get_answers(topic), topic_id,
                                  message_id)

    def extract(self):
        """
        extracts Stackoverflow topic data and stores it in the DB
        """
        try:
            self._logger.info("StackOverflowTopic2Db started")
            start_time = datetime.now()

            for topic_id in self._interval:
                topic = self._querier.get_topic(topic_id)
                if topic:
                    self._extract_topic(topic)

            end_time = datetime.now()

            minutes_and_seconds = self._logging_util.calculate_execution_time(
                end_time, start_time)
            self._logger.info("StackOverflowTopic2Db finished after " +
                              str(minutes_and_seconds[0]) + " minutes and " +
                              str(round(minutes_and_seconds[1], 1)) + " secs")
            self._logging_util.remove_file_handler_logger(
                self._logger, self._fileHandler)
        except Exception:
            self._logger.error("StackOverflowTopic2Db failed", exc_info=True)
Example no. 8
class BugzillaIssueDependency2Db(object):
    """
    This class inserts the dependencies between Bugzilla issues
    """
    def __init__(self, db_name, repo_id, issue_tracker_id, url, product,
                 interval, config, log_root_path):
        """
        :type db_name: str
        :param db_name: the name of an existing DB

        :type repo_id: int
        :param repo_id: the id of an existing repository in the DB

        :type issue_tracker_id: int
        :param issue_tracker_id: the id of an existing issue tracker in the DB

        :type url: str
        :param url: the URL of the bugzilla issue tracker

        :type product: str
        :param product: the name of the product in the bugzilla issue tracker

        :type interval: list int
        :param interval: a list of issue ids to import

        :type config: dict
        :param config: the DB configuration file

        :type log_root_path: str
        :param log_root_path: the log path
        """
        self._log_root_path = log_root_path
        self._url = url
        self._product = product
        self._db_name = db_name
        self._repo_id = repo_id
        self._issue_tracker_id = issue_tracker_id
        self._interval = interval
        self._logging_util = LoggingUtil()
        self._config = config
        self._filehandler = None
        self._logger = None
        self._querier = None
        self._dao = None

    def __call__(self):
        try:
            log_path = self._log_root_path + "-issue2db-dependency-" + \
                       str(self._interval[0]) + "-" + str(self._interval[-1])
            self._logger = self._logging_util.get_logger(log_path)
            self._filehandler = self._logging_util.get_file_handler(
                self._logger, log_path, "info")

            self._querier = BugzillaQuerier(self._url, self._product,
                                            self._logger)
            self._dao = BugzillaDao(self._config, self._logger)
            self.extract()
        except Exception:
            self._logger.error("Issue2Db failed", exc_info=True)
        finally:
            if self._dao:
                self._dao.close_connection()

    def _extract_single_issue_dependency(self, issue_id, data, type):
        # inserts issue dependency
        extracted = None
        if isinstance(data, int):
            extracted = data
        else:
            if "show_bug" in data:
                extracted = data.split("?id=")[1]

        if extracted:
            dependent_issue = self._dao.select_issue_id(
                extracted, self._issue_tracker_id, self._repo_id)
            if dependent_issue:
                self._dao.insert_issue_dependency(issue_id, dependent_issue,
                                                  type)

    def _extract_issue_dependency(self, issue_id, obj, type):
        # processes issue dependencies
        if isinstance(obj, list):
            for issue in obj:
                self._extract_single_issue_dependency(issue_id, issue, type)
        else:
            self._extract_single_issue_dependency(issue_id, obj, type)

    def _is_duplicated(self, issue):
        # checks whether the issue carries a "dupe_of" attribute
        return hasattr(issue, "dupe_of")

    def _set_dependencies(self):
        cursor = self._dao.get_cursor()
        query = "SELECT i.id FROM issue i " \
                "JOIN issue_tracker it ON i.issue_tracker_id = it.id " \
                "WHERE i.id >= %s AND i.id <= %s AND issue_tracker_id = %s AND repo_id = %s"
        arguments = [
            self._interval[0], self._interval[-1], self._issue_tracker_id,
            self._repo_id
        ]
        self._dao.execute(cursor, query, arguments)

        row = self._dao.fetchone(cursor)

        while row:
            try:
                issue_id = row[0]
                issue_own_id = self._dao.select_issue_own_id(
                    issue_id, self._issue_tracker_id, self._repo_id)
                issue = self._querier.get_issue(issue_own_id)

                if issue.blocks:
                    self._extract_issue_dependency(
                        issue_id, self._querier.get_issue_blocks(issue),
                        self._dao.get_issue_dependency_type_id("block"))

                if issue.depends_on:
                    self._extract_issue_dependency(
                        issue_id, self._querier.get_issue_depends_on(issue),
                        self._dao.get_issue_dependency_type_id("depends"))

                if issue.see_also:
                    self._extract_issue_dependency(
                        issue_id, self._querier.get_issue_see_also(issue),
                        self._dao.get_issue_dependency_type_id("related"))

                if self._is_duplicated(issue):
                    if issue.dupe_of:
                        self._extract_issue_dependency(
                            issue_id, self._querier.get_issue_dupe_of(issue),
                            self._dao.get_issue_dependency_type_id(
                                "duplicated"))

            except Exception:
                self._logger.error(
                    "something went wrong with the following issue id: " +
                    str(issue_id) + " - tracker id " +
                    str(self._issue_tracker_id),
                    exc_info=True)

            row = self._dao.fetchone(cursor)

        self._dao.close_cursor(cursor)

    def extract(self):
        """
        extracts Bugzilla issue dependency data and stores it in the DB
        """
        try:
            self._logger.info("BugzillaIssueDependency2Db started")
            start_time = datetime.now()
            self._set_dependencies()

            end_time = datetime.now()
            minutes_and_seconds = self._logging_util.calculate_execution_time(
                end_time, start_time)
            self._logger.info("BugzillaIssueDependency2Db finished after " +
                              str(minutes_and_seconds[0]) + " minutes and " +
                              str(round(minutes_and_seconds[1], 1)) + " secs")
            self._logging_util.remove_file_handler_logger(
                self._logger, self._filehandler)
        except Exception:
            self._logger.error("BugzillaIssueDependency2Db failed",
                               exc_info=True)
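
# A small illustration of the two reference formats handled by
# _extract_single_issue_dependency: Bugzilla returns plain integer ids for
# "blocks"/"depends_on" and full "show_bug.cgi?id=NNN" URLs for "see_also".
# The helper and values below are only an example, not part of the importer.
def _issue_ref_to_own_id(data):
    if isinstance(data, int):
        return data
    if "show_bug" in data:
        return data.split("?id=")[1]
    return None

print(_issue_ref_to_own_id(42))  # 42
print(_issue_ref_to_own_id(
    "https://bugs.eclipse.org/bugs/show_bug.cgi?id=563412"))  # 563412
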
Example no. 9
class EclipseForum2DbMain(object):
    """
    This class handles the import of Eclipse forum data
    """

    NUM_PROCESSES = 2

    def __init__(self, db_name, project_name, type, forum_name, url,
                 before_date, num_processes, config, log_root_path):
        """
        :type db_name: str
        :param db_name: the name of an existing DB

        :type project_name: str
        :param project_name: the name of an existing project in the DB

        :type type: str
        :param type: type of the forum (Stackoverflow, Eclipse forum)

        :type forum_name: str
        :param forum_name: the name of the forum to import

        :type url: str
        :param url: the URL of the forum

        :type before_date: str
        :param before_date: import data before date (YYYY-mm-dd)

        :type num_processes: int
        :param num_processes: number of processes to import the data (default 2)

        :type config: dict
        :param config: the DB configuration file

        :type log_root_path: str
        :param log_root_path: the log path
        """
        self._log_path = log_root_path + "import-eclipse-forum-" + db_name + "-" + project_name + "-" + forum_name
        self._type = type
        self._url = url
        self._forum_name = forum_name
        self._project_name = project_name
        self._db_name = db_name
        self._before_date = before_date

        config.update({'database': db_name})
        self._config = config

        if num_processes:
            self._num_processes = num_processes
        else:
            self._num_processes = EclipseForum2DbMain.NUM_PROCESSES

        self._logging_util = LoggingUtil()
        self._date_util = DateUtil()

        self._logger = None
        self._fileHandler = None
        self._querier = None
        self._dao = None

    def _get_topic_info(self, forum_id, topic):
        # get topic information
        own_id = self._querier.get_topic_own_id(topic)
        title = self._querier.get_topic_title(topic)
        views = self._querier.get_topic_views(topic)
        last_change_at = self._date_util.get_timestamp(
            self._querier.get_last_change_at(topic), "%a, %d %B %Y %H:%M")

        topic_id = self._dao.select_topic_id(forum_id, own_id)
        if not topic_id:
            if self._before_date:
                topic_created_at = self._querier.get_topic_created_at(topic)
                if self._date_util.get_timestamp(topic_created_at, "%a, %d %B %Y") <= \
                        self._date_util.get_timestamp(self._before_date, "%Y-%m-%d"):
                    self._dao.insert_topic(own_id, forum_id, title, views,
                                           last_change_at)
            else:
                self._dao.insert_topic(own_id, forum_id, title, views,
                                       last_change_at)
            topic_id = self._dao.select_topic_id(forum_id, own_id)

        return topic_id

    def _get_topic_ids(self, forum_id):
        # get list of topic ids of a forum
        topic_ids = []

        next_page = True
        while next_page:
            topics_on_page = self._querier.get_topics()

            for t in topics_on_page:
                topic_id = self._get_topic_info(forum_id, t)
                topic_ids.append(topic_id)

            next_page = self._querier.go_next_page()

        return [ti for ti in topic_ids if ti is not None]

    def _get_topics(self, forum_id):
        # insert topics to DB
        self._querier.start_browser()
        topic_ids = self._get_topic_ids(forum_id)
        self._querier.close_browser()

        intervals = [
            i for i in multiprocessing_util.get_tasks_intervals(
                topic_ids, self._num_processes) if len(i) > 0
        ]

        queue_extractors = multiprocessing.JoinableQueue()
        results = multiprocessing.Queue()

        # Start consumers
        multiprocessing_util.start_consumers(self._num_processes,
                                             queue_extractors, results)

        for interval in intervals:
            topic_extractor = EclipseTopic2Db(self._db_name, forum_id,
                                              interval, self._config,
                                              self._log_path)
            queue_extractors.put(topic_extractor)

        # Add end-of-queue markers
        multiprocessing_util.add_poison_pills(self._num_processes,
                                              queue_extractors)

        # Wait for all of the tasks to finish
        queue_extractors.join()

    def extract(self):
        """
        extracts Eclipse forum data and stores it in the DB
        """
        try:
            self._logger = self._logging_util.get_logger(self._log_path)
            self._fileHandler = self._logging_util.get_file_handler(
                self._logger, self._log_path, "info")

            self._logger.info("EclipseForum2DbMain started")
            start_time = datetime.now()

            self._querier = EclipseForumQuerier(self._url, self._logger)
            self._dao = EclipseForumDao(self._config, self._logger)

            project_id = self._dao.select_project_id(self._project_name)
            forum_id = self._dao.insert_forum(project_id, self._forum_name,
                                              self._type)
            self._get_topics(forum_id)

            end_time = datetime.now()
            minutes_and_seconds = self._logging_util.calculate_execution_time(
                end_time, start_time)
            self._logger.info("EclipseForum2DbMain finished after " +
                              str(minutes_and_seconds[0]) + " minutes and " +
                              str(round(minutes_and_seconds[1], 1)) + " secs")
            self._logging_util.remove_file_handler_logger(
                self._logger, self._fileHandler)
        except Exception:
            self._logger.error("EclipseForum2DbMain failed", exc_info=True)
        finally:
            if self._dao:
                self._dao.close_connection()
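
# A minimal usage sketch for EclipseForum2DbMain; the DB configuration, forum
# type, forum URL and paths below are placeholders.
if __name__ == "__main__":
    config = {"host": "localhost", "port": "3306", "user": "root",
              "password": "root"}  # assumed MySQL connection settings
    forum2db = EclipseForum2DbMain(
        "eclipse_db", "papyrus", "eclipse_forum", "papyrus-forum",
        "https://www.eclipse.org/forums/index.php/f/121/",
        "2017-01-01",  # before_date (YYYY-mm-dd)
        2,             # num_processes (falls back to NUM_PROCESSES when None)
        config, "logs/")
    forum2db.extract()
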
class GitHubIssue2DbMain(object):
    """
    This class handles the import of GitHub issue data
    """
    def __init__(self, db_name, project_name, repo_name, type,
                 issue_tracker_name, url, before_date, tokens, config,
                 log_root_path):
        """
        :type db_name: str
        :param db_name: the name of an existing DB

        :type project_name: str
        :param project_name: the name of an existing project in the DB

        :type repo_name: str
        :param repo_name: the name of an existing repository in the DB

        :type type: str
        :param type: type of the issue tracker (Bugzilla, GitHub)

        :type issue_tracker_name: str
        :param issue_tracker_name: the name of the issue tracker to import

        :type url: str
        :param url: full name of the GitHub repository

        :type before_date: str
        :param before_date: import data before date (YYYY-mm-dd)

        :type tokens: list str
        :param tokens: list of GitHub tokens

        :type config: dict
        :param config: the DB configuration file

        :type log_root_path: str
        :param log_root_path: the log path
        """
        self._log_path = log_root_path + "import-github-" + db_name + "-" + project_name + "-" + issue_tracker_name
        self._type = type
        self._url = url
        self._project_name = project_name
        self._db_name = db_name
        self._issue_tracker_name = issue_tracker_name
        self._repo_name = repo_name
        self._before_date = before_date
        self._tokens = tokens

        config.update({'database': db_name})
        self._config = config

        self._logging_util = LoggingUtil()

        self._logger = None
        self._fileHandler = None
        self._querier = None
        self._dao = None

    def _pass_list_as_argument(self, elements):
        return '-'.join([str(e) for e in elements])

    def _insert_issue_data(self, repo_id, issue_tracker_id):
        #processes issue data
        imported = self._dao.get_already_imported_issue_ids(
            issue_tracker_id, repo_id)
        issues = list(
            set(self._querier.get_issue_ids(self._before_date)) -
            set(imported))

        intervals = [
            i for i in multiprocessing_util.get_tasks_intervals(
                issues, len(self._tokens)) if len(i) > 0
        ]

        queue_intervals = multiprocessing.JoinableQueue()
        results = multiprocessing.Queue()

        # Start consumers
        multiprocessing_util.start_consumers(len(self._tokens),
                                             queue_intervals, results)

        pos = 0
        for interval in intervals:
            issue_extractor = GitHubIssue2Db(self._db_name, repo_id,
                                             issue_tracker_id, self._url,
                                             interval, self._tokens[pos],
                                             self._config, self._log_path)
            queue_intervals.put(issue_extractor)
            pos += 1

        # Add end-of-queue markers
        multiprocessing_util.add_poison_pills(len(self._tokens),
                                              queue_intervals)

        # Wait for all of the tasks to finish
        queue_intervals.join()

    def _insert_issue_dependencies(self, repo_id, issue_tracker_id):
        #processes issue dependency data
        issues = self._dao.get_already_imported_issue_ids(
            issue_tracker_id, repo_id)
        intervals = [
            i for i in multiprocessing_util.get_tasks_intervals(
                issues, len(self._tokens)) if len(i) > 0
        ]

        queue_intervals = multiprocessing.JoinableQueue()
        results = multiprocessing.Queue()

        # Start consumers
        multiprocessing_util.start_consumers(len(self._tokens),
                                             queue_intervals, results)

        pos = 0
        for interval in intervals:
            issue_dependency_extractor = GitHubIssueDependency2Db(
                self._db_name, repo_id, issue_tracker_id, self._url, interval,
                self._tokens[pos], self._config, self._log_path)
            queue_intervals.put(issue_dependency_extractor)
            pos += 1

        # Add end-of-queue markers
        multiprocessing_util.add_poison_pills(len(self._tokens),
                                              queue_intervals)

        # Wait for all of the tasks to finish
        queue_intervals.join()

    def _split_issue_extraction(self):
        #splits the issues found according to the number of processes
        project_id = self._dao.select_project_id(self._project_name)
        repo_id = self._dao.select_repo_id(project_id, self._repo_name)
        issue_tracker_id = self._dao.insert_issue_tracker(
            repo_id, self._issue_tracker_name, self._type)
        self._insert_issue_data(repo_id, issue_tracker_id)

        self._dao.restart_connection()
        #self._insert_issue_dependencies(repo_id, issue_tracker_id)

    def extract(self):
        """
        extracts GitHub issue data and stores it in the DB
        """
        try:
            self._logger = self._logging_util.get_logger(self._log_path)
            self._fileHandler = self._logging_util.get_file_handler(
                self._logger, self._log_path, "info")

            self._logger.info("GitHubIssue2DbMain started")
            start_time = datetime.now()

            self._querier = GitHubQuerier(self._url, self._tokens[0],
                                          self._logger)
            self._dao = GitHubDao(self._config, self._logger)

            self._split_issue_extraction()

            end_time = datetime.now()
            minutes_and_seconds = self._logging_util.calculate_execution_time(
                end_time, start_time)
            self._logger.info("GitHubIssue2DbMain finished after " +
                              str(minutes_and_seconds[0]) + " minutes and " +
                              str(round(minutes_and_seconds[1], 1)) + " secs")
            self._logging_util.remove_file_handler_logger(
                self._logger, self._fileHandler)
        except Exception:
            self._logger.error("GitHubIssue2DbMain failed", exc_info=True)
        finally:
            if self._dao:
                self._dao.close_connection()
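
# The GitHub importers parallelise by token: the issue ids are split into
# len(tokens) chunks and each worker process gets its own token (see
# _insert_issue_data above). The chunking helper below is only a plausible
# stand-in for multiprocessing_util.get_tasks_intervals, not the project's
# actual implementation.
def _chunk(elements, n):
    # split `elements` into at most n contiguous, roughly equal chunks
    size = max(1, len(elements) // n + (1 if len(elements) % n else 0))
    return [elements[i:i + size] for i in range(0, len(elements), size)]

issue_ids = [1, 2, 3, 5, 8, 13, 21]
tokens = ["ghp_tokenA", "ghp_tokenB"]  # placeholder tokens
for token, interval in zip(tokens, _chunk(issue_ids, len(tokens))):
    print("%s -> issues %s..%s" % (token, interval[0], interval[-1]))
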
class BugzillaIssue2Db(object):
    """
    This class handles the import of Bugzilla issues
    """
    def __init__(self, db_name, repo_id, issue_tracker_id, url, product,
                 interval, config, log_root_path):
        """
        :type db_name: str
        :param db_name: the name of an existing DB

        :type repo_id: int
        :param repo_id: the id of an existing repository in the DB

        :type issue_tracker_id: int
        :param issue_tracker_id: the id of an existing issue tracker in the DB

        :type url: str
        :param url: the URL of the bugzilla issue tracker

        :type product: str
        :param product: the name of the product in the bugzilla issue tracker

        :type interval: list int
        :param interval: a list of issue ids to import

        :type config: dict
        :param config: the DB configuration file

        :type log_root_path: str
        :param log_root_path: the log path
        """
        self._log_root_path = log_root_path
        self._url = url
        self._product = product
        self._db_name = db_name
        self._repo_id = repo_id
        self._issue_tracker_id = issue_tracker_id
        self._interval = interval
        self._config = config

        self._logging_util = LoggingUtil()
        self._date_util = DateUtil()

        self._fileHandler = None
        self._logger = None
        self._querier = None
        self._dao = None

    def __call__(self):
        try:
            log_path = self._log_root_path + "-issue2db-" + str(
                self._interval[0]) + "-" + str(self._interval[-1])
            self._logger = self._logging_util.get_logger(log_path)
            self._fileHandler = self._logging_util.get_file_handler(
                self._logger, log_path, "info")

            self._querier = BugzillaQuerier(self._url, self._product,
                                            self._logger)
            self._dao = BugzillaDao(self._config, self._logger)
            self.extract()
        except Exception:
            self._logger.error("BugzillaIssue2Db failed", exc_info=True)
        finally:
            if self._dao:
                self._dao.close_connection()

    def _is_email(self, text):
        # checks that a string is an email address
        return parseaddr(text)[1] != '' and '@' in text

    def _extract_attachment(self, issue_comment_id, attachment_id):
        # inserts an attachment
        attachment_info = self._querier.get_attachment(attachment_id)
        if '.' in attachment_info.name:
            name = '.'.join(attachment_info.name.split('.')[:-1]).strip()
            extension = attachment_info.name.split('.')[-1].lower()
        else:
            name = attachment_info.name
            extension = "patch"

        size = sys.getsizeof(attachment_info)
        self._dao.insert_attachment(attachment_id, issue_comment_id, name,
                                    extension, size, None)

    def _extract_issue_event(self, action, action_content, creator_id,
                             created_at, issue_id, field_name):
        # inserts an issue event
        event_type = action + '-' + field_name
        self._dao.insert_event_type(event_type)
        event_type_id = self._dao.select_event_type(event_type)
        target_user_id = None

        if ',' in action_content and field_name in [
                "keywords", "depends_on", "cc", "flagtypes.name", "blocks",
                "whiteboard", "see_also"
        ]:
            contents = action_content.split(',')
            for content in contents:
                content = content.strip()
                if self._is_email(content):
                    target_user_id = self._dao.get_user_id(
                        self._querier.get_user_name(content), content)

                self._dao.insert_issue_event(issue_id, event_type_id, content,
                                             creator_id, created_at,
                                             target_user_id)
        else:
            if self._is_email(action_content):
                target_user_id = self._dao.get_user_id(
                    self._querier.get_user_name(action_content),
                    action_content)

            self._dao.insert_issue_event(issue_id, event_type_id,
                                         action_content, creator_id,
                                         created_at, target_user_id)

    def _extract_history(self, issue_id, history):
        # inserts the history of an issue
        for event in history:
            created_at = None
            try:
                created_at = self._date_util.get_timestamp(
                    self._querier.get_event_property(event, 'when'),
                    '%Y%m%dT%H:%M:%S')
                creator_email = self._querier.get_event_property(event, 'who')
                creator_id = self._dao.get_user_id(
                    self._querier.get_user_name(creator_email), creator_email)

                for change in self._querier.get_event_property(
                        event, 'changes'):
                    removed = self._querier.get_change_property(
                        change, 'removed')
                    field_name = self._querier.get_change_property(
                        change, 'field_name').lower()
                    added = self._querier.get_change_property(change, 'added')

                    if removed != '':
                        action = "removed"
                        self._extract_issue_event(action, removed, creator_id,
                                                  created_at, issue_id,
                                                  field_name)

                    if added != '':
                        action = "added"
                        self._extract_issue_event(action, added, creator_id,
                                                  created_at, issue_id,
                                                  field_name)
            except Exception:
                self._logger.warning("event at (" + str(created_at) +
                                     ") not extracted for issue id: " +
                                     str(issue_id) + " - tracker id " +
                                     str(self._issue_tracker_id),
                                     exc_info=True)

    def _extract_subscribers(self, issue_id, subscribers):
        # inserts subscribers of an issue
        for subscriber in subscribers:
            try:
                subscriber_id = self._dao.get_user_id(
                    self._querier.get_user_name(subscriber), subscriber)
                self._dao.insert_subscriber(issue_id, subscriber_id)
            except Exception:
                self._logger.warning("subscriber (" + subscriber +
                                     ") not inserted for issue id: " +
                                     str(issue_id) + " - tracker id " +
                                     str(self._issue_tracker_id),
                                     exc_info=True)

    def _extract_assignee(self, issue_id, assignee):
        # inserts the assignee of an issue
        try:
            assignee_id = self._dao.get_user_id(
                self._querier.get_user_name(assignee), assignee)
            self._dao.insert_assignee(issue_id, assignee_id)
        except Exception:
            self._logger.warning(
                "assignee (" + assignee + ") not inserted for issue id: " +
                str(issue_id) + " - tracker id " + str(self._issue_tracker_id),
                exc_info=True)

    def _extract_comments(self, issue_id, comments):
        # inserts the comments of an issue
        for comment in comments:
            position = None
            try:
                own_id = self._querier.get_comment_property(comment, 'id')
                body = self._querier.get_comment_property(comment, 'text')
                position = self._querier.get_comment_property(comment, 'count')
                author_email = self._querier.get_comment_property(
                    comment, 'author')
                author_id = self._dao.get_user_id(
                    self._querier.get_user_name(author_email), author_email)
                created_at = self._date_util.get_timestamp(
                    self._querier.get_comment_property(comment,
                                                       'creation_time'),
                    '%Y%m%dT%H:%M:%S')
                self._dao.insert_issue_comment(
                    own_id, position, self._dao.get_message_type_id("comment"),
                    issue_id, body, None, author_id, created_at)

                attachment_id = self._querier.get_comment_property(
                    comment, 'attachment_id')
                if attachment_id:
                    issue_comment_id = self._dao.select_issue_comment_id(
                        own_id, issue_id, created_at)
                    self._extract_attachment(issue_comment_id, attachment_id)
            except Exception:
                self._logger.warning("comment(" + str(position) +
                                     ") not extracted for issue id: " +
                                     str(issue_id) + " - tracker id " +
                                     str(self._issue_tracker_id),
                                     exc_info=True)
                continue

    def _extract_labels(self, issue_id, labels):
        # inserts the labels of an issue
        for label in labels:
            try:
                digested_label = re.sub(r"^\W+", "",
                                        re.sub(r"\W+$", "", label.lower()))
                self._dao.insert_label(digested_label.strip())
                label_id = self._dao.select_label_id(digested_label)
                self._dao.assign_label_to_issue(issue_id, label_id)
            except Exception:
                self._logger.warning("label (" + label +
                                     ") not extracted for issue id: " +
                                     str(issue_id) + " - tracker id " +
                                     str(self._issue_tracker_id),
                                     exc_info=True)

    def _extract_issue_commit_dependency(self, issue_id, commits):
        # inserts the dependencies between an issue and commits
        flattened_list = [y for x in commits for y in x]
        for id in flattened_list:
            if "commit" in id:
                extracted = id.split("?id=")[1].strip()
                commit_id = self._dao.select_commit(extracted, self._repo_id)
                if commit_id:
                    self._dao.insert_issue_commit_dependency(issue_id,
                                                             commit_id)

    def _is_duplicated(self, issue):
        # checks whether the issue carries a "dupe_of" attribute
        return hasattr(issue, "dupe_of")

    def _get_issue_info(self, issue_own_id):
        # processes each single issue
        flag_insert_issue_data = False

        issue = self._querier.get_issue(issue_own_id)
        summary = self._querier.get_issue_summary(issue)
        component = self._querier.get_issue_component(issue)
        version = self._querier.get_issue_version(issue)
        hardware = self._querier.get_issue_operating_system(issue)
        priority = self._querier.get_issue_priority(issue)
        severity = self._querier.get_issue_severity(issue)
        created_at = self._querier.get_issue_creation_time(issue)
        last_change_at = self._querier.get_issue_last_change_time(issue)

        reference_id = self._dao.find_reference_id(version, issue_own_id,
                                                   self._repo_id)

        issue_creator_email = self._querier.get_issue_creator(issue)
        user_id = self._dao.get_user_id(
            self._querier.get_user_name(issue_creator_email),
            issue_creator_email)

        stored_issue_last_change = self._dao.select_last_change_issue(
            issue_own_id, self._issue_tracker_id, self._repo_id)
        if stored_issue_last_change:
            if last_change_at != stored_issue_last_change:
                flag_insert_issue_data = True
                self._dao.update_issue(issue_own_id, self._issue_tracker_id,
                                       summary, component, version, hardware,
                                       priority, severity, reference_id,
                                       last_change_at)
        else:
            flag_insert_issue_data = True
            self._dao.insert_issue(issue_own_id, self._issue_tracker_id,
                                   summary, component, version, hardware,
                                   priority, severity, reference_id, user_id,
                                   created_at, last_change_at)

        if flag_insert_issue_data:
            issue_id = self._dao.select_issue_id(issue_own_id,
                                                 self._issue_tracker_id,
                                                 self._repo_id)

            try:
                self._extract_labels(issue_id,
                                     self._querier.get_issue_keywords(issue))
            except Exception:
                self._logger.error(
                    "BugzillaError when extracting keywords for issue id: " +
                    str(issue_id) + " - tracker id " +
                    str(self._issue_tracker_id),
                    exc_info=True)

            try:
                self._extract_comments(issue_id,
                                       self._querier.get_issue_comments(issue))
            except Exception:
                self._logger.error(
                    "BugzillaError when extracting comments for issue id: " +
                    str(issue_id) + " - tracker id " +
                    str(self._issue_tracker_id),
                    exc_info=True)

            try:
                self._extract_history(issue_id,
                                      self._querier.get_issue_history(issue))
            except Exception:
                self._logger.error(
                    "BugzillaError when extracting history for issue id: " +
                    str(issue_id) + " - tracker id " +
                    str(self._issue_tracker_id),
                    exc_info=True)

            if issue.cc:
                self._extract_subscribers(issue_id,
                                          self._querier.get_issue_cc(issue))

            if issue.assigned_to:
                self._extract_assignee(issue_id,
                                       self._querier.get_issue_assignee(issue))

            if issue.see_also:
                self._extract_issue_commit_dependency(
                    issue_id, [self._querier.get_issue_see_also(issue)])

    def _get_issues(self):
        # processes issues
        for issue_id in self._interval:
            try:
                self._get_issue_info(issue_id)
            except Exception:
                self._logger.error("something went wrong for issue id: " +
                                   str(issue_id) + " - tracker id " +
                                   str(self._issue_tracker_id),
                                   exc_info=True)

    def extract(self):
        """
        extracts Bugzilla issue data and stores it in the DB
        """
        try:
            self._logger.info("BugzillaIssue2Db started")
            start_time = datetime.now()
            self._get_issues()

            end_time = datetime.now()
            minutes_and_seconds = self._logging_util.calculate_execution_time(
                end_time, start_time)
            self._logger.info("BugzillaIssue2Db finished after " +
                              str(minutes_and_seconds[0]) + " minutes and " +
                              str(round(minutes_and_seconds[1], 1)) + " secs")
            self._logging_util.remove_file_handler_logger(
                self._logger, self._fileHandler)
        except Exception:
            self._logger.error("BugzillaIssue2Db failed", exc_info=True)
Example no. 12
class BugzillaIssue2DbUpdate(object):
    """
    This class handles the update of Bugzilla issue tracker data
    """

    NUM_PROCESSES = 3

    def __init__(self, db_name, project_name,
                 repo_name, issue_tracker_name, url, product, num_processes,
                 config, log_root_path):
        """
        :type db_name: str
        :param db_name: the name of an existing DB

        :type project_name: str
        :param project_name: the name of an existing project in the DB

        :type repo_name: str
        :param repo_name: the name of an existing repository in the DB

        :type issue_tracker_name: str
        :param issue_tracker_name: the name of the issue tracker to import

        :type url: str
        :param url: the URL of the issue tracker

        :type product: str
        :param product: the name of the product to import from the issue tracker

        :type num_processes: int
        :param num_processes: number of processes to import the data (default 3)

        :type config: dict
        :param config: the DB configuration file

        :type log_root_path: str
        :param log_root_path: the log path
        """
        self._log_path = log_root_path + "update-bugzilla-" + db_name + "-" + project_name + "-" + issue_tracker_name
        self._issue_tracker_name = issue_tracker_name
        self._url = url
        self._product = product
        self._project_name = project_name
        self._db_name = db_name
        self._repo_name = repo_name

        if num_processes:
            self._num_processes = num_processes
        else:
            self._num_processes = BugzillaIssue2DbUpdate.NUM_PROCESSES

        config.update({'database': db_name})
        self._config = config

        self._logging_util = LoggingUtil()

        self._logger = None
        self._fileHandler = None
        self._dao = None

    def _update_issue_content(self, repo_id, issue_tracker_id, intervals, url):
        #updates issues already stored in the DB
        queue_intervals = multiprocessing.JoinableQueue()
        results = multiprocessing.Queue()

        # Start consumers
        multiprocessing_util.start_consumers(self._num_processes, queue_intervals, results)

        for interval in intervals:
            issue_extractor = BugzillaIssue2Db(self._db_name, repo_id,
                                               issue_tracker_id, url,
                                               self._product, interval,
                                               self._config, self._log_path)
            queue_intervals.put(issue_extractor)

        # Add end-of-queue markers
        multiprocessing_util.add_poison_pills(self._num_processes, queue_intervals)

        # Wait for all of the tasks to finish
        queue_intervals.join()

    def _update_issue_dependency(self, repo_id, issue_tracker_id, intervals, url):
        #updates issue dependencies already stored in the DB
        queue_intervals = multiprocessing.JoinableQueue()
        results = multiprocessing.Queue()

        # Start consumers
        multiprocessing_util.start_consumers(self._num_processes, queue_intervals, results)

        for interval in intervals:
            issue_dependency_extractor = BugzillaIssueDependency2Db(
                self._db_name, repo_id, issue_tracker_id, url, self._product,
                interval, self._config, self._log_path)
            queue_intervals.put(issue_dependency_extractor)

        # Add end-of-queue markers
        multiprocessing_util.add_poison_pills(self._num_processes, queue_intervals)

        # Wait for all of the tasks to finish
        queue_intervals.join()

    def _update_issues(self):
        #updates issues
        project_id = self._dao.select_project_id(self._project_name)
        repo_id = self._dao.select_repo_id(project_id, self._repo_name)
        issue_tracker_id = self._dao.select_issue_tracker_id(repo_id, self._issue_tracker_name)
        issue_tracker_url = self._url

        if issue_tracker_id:
            cursor = self._dao.get_cursor()
            query = "SELECT i.own_id FROM issue i " \
                    "JOIN issue_tracker it ON i.issue_tracker_id = it.id " \
                    "WHERE issue_tracker_id = %s AND repo_id = %s " \
                    "ORDER BY i.own_id ASC;"
            arguments = [issue_tracker_id, repo_id]
            self._dao.execute(cursor, query, arguments)

            issues = []
            row = self._dao.fetchone(cursor)

            while row:
                issues.append(row[0])
                row = self._dao.fetchone(cursor)
            self._dao.close_cursor(cursor)

            if issues:
                intervals = [i for i in multiprocessing_util.get_tasks_intervals(issues, self._num_processes) if len(i) > 0]

                self._update_issue_content(repo_id, issue_tracker_id, intervals, issue_tracker_url)
                self._update_issue_dependency(repo_id, issue_tracker_id, intervals, issue_tracker_url)

    def update(self):
        """
        updates the Bugzilla issue tracker data stored in the DB
        """
        try:
            self._logger = self._logging_util.get_logger(self._log_path)
            self._fileHandler = self._logging_util.get_file_handler(self._logger, self._log_path, "info")

            self._logger.info("BugzillaIssue2DbUpdate started")
            start_time = datetime.now()

            self._dao = BugzillaDao(self._config, self._logger)

            self._update_issues()

            end_time = datetime.now()
            minutes_and_seconds = self._logging_util.calculate_execution_time(end_time, start_time)
            self._logger.info("BugzillaIssue2DbUpdate finished after " + str(minutes_and_seconds[0])
                         + " minutes and " + str(round(minutes_and_seconds[1], 1)) + " secs")
            self._logging_util.remove_file_handler_logger(self._logger, self._fileHandler)
        except Exception:
            self._logger.error("BugzillaIssue2DbUpdate failed", exc_info=True)
        finally:
            if self._dao:
                self._dao.close_connection()
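
# A minimal usage sketch for the update entry point; the DB settings, tracker
# URL and product name below are placeholders.
if __name__ == "__main__":
    config = {"host": "localhost", "port": "3306", "user": "root",
              "password": "root"}  # assumed MySQL connection settings
    updater = BugzillaIssue2DbUpdate(
        "bugzilla_db", "papyrus", "papyrus-repo", "bugzilla-papyrus",
        "https://bugs.eclipse.org/bugs/xmlrpc.cgi", "MDT.Papyrus",
        None,  # num_processes: None falls back to NUM_PROCESSES (3)
        config, "logs/")
    updater.update()
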
Example no. 13
class FileJsonExporter(object):
    """
    This class handles the export of file information via JSON. It allows the
    use of an earlier version of the bus factor tool
    (https://github.com/SOM-Research/busfactor)
    """

    LOG_FOLDER_PATH = "logs"

    def __init__(self, config, db_name, log_root_path):
        """
        :type config: dict
        :param config: the DB configuration file

        :type db_name: str
        :param db_name: the name of an existing DB

        :type log_root_path: str
        :param log_root_path: the log path
        """
        self._date_util = DateUtil()
        self._db_util = DbUtil()

        self._logging_util = LoggingUtil()
        self._log_path = log_root_path + "export-file-json-" + db_name
        self._logger = self._logging_util.get_logger(self._log_path)
        self._fileHandler = self._logging_util.get_file_handler(
            self._logger, self._log_path, "info")

        self._db_name = db_name
        config.update({'database': db_name})
        self._config = config

        self._cnx = self._db_util.get_connection(self._config)
        self._db_util.set_database(self._cnx, self._db_name)
        self._db_util.set_settings(self._cnx)
        self._file_util = FileUtil(self._config, self._logger)

    def get_diff_info(self, patch_content):
        if patch_content:
            first_line = patch_content.split('\n')[0]
            if re.match(r"^@@(\s|\+|\-|\d|,)+@@", first_line, re.M):
                diff_info = first_line.split("@@")[1]
            else:
                diff_info = "Binary file"
        else:
            diff_info = 'Renamed file'

        return diff_info

    def get_diff_content(self, patch_content):
        if patch_content:
            lines = patch_content.split('\n')
            if re.match(r"^@@(\s|\+|\-|\d|,)+@@", lines[0], re.M):
                first_line_content = lines[0].split("@@")[2]
                diff_content = lines[1:]
                diff_content.insert(0, first_line_content)
                diff_content = '\n'.join(diff_content)
            else:
                diff_content = "No content"
        else:
            diff_content = "No content"

        return diff_content

    def get_patch_info(self, content):
        diff_info = self.get_diff_info(content)
        diff_content = self.get_diff_content(content)
        return {
            'info': diff_info,
            'content': diff_content.decode('utf-8', 'ignore')
        }

    def get_changes_for_file(self, file_ids):
        file_modifications = []
        cursor = self._cnx.cursor()
        query = "SELECT c.author_id, c.committer_id, c.authored_date, c.committed_date, c.sha, fm.additions, fm.deletions, fm.patch " \
                "FROM file_modification fm JOIN file f ON fm.file_id = f.id  " \
                "JOIN commit_in_reference cin ON cin.commit_id = fm.commit_id " \
                "JOIN reference r ON r.id = cin.ref_id " \
                "JOIN commit c ON c.id = cin.commit_id " \
                "WHERE f.id IN (" + ",".join([str(id) for id in file_ids]) + ") " \
                "ORDER BY c.authored_date DESC"
        cursor.execute(query)
        row = cursor.fetchone()

        while row:
            author_id = row[0]
            committer_id = row[1]
            authored_date = row[2].strftime('%Y-%m-%d %H:%M:%S')
            committed_date = row[3].strftime('%Y-%m-%d %H:%M:%S')
            sha = str(row[4])
            additions = str(row[5])
            deletions = str(row[6])
            patch = str(row[7])

            patch_info = self.get_patch_info(patch)

            author_name, author_email = self.get_user_identity(author_id)

            if author_id != committer_id:
                committer_name, committer_email = self.get_user_identity(
                    committer_id)
            else:
                committer_name = author_name
                committer_email = author_email

            author = {'name': author_name, 'email': author_email}
            committer = {'name': committer_name, 'email': committer_email}

            file_modifications.append({
                'author': author,
                'authored_date': authored_date,
                'committer': committer,
                'committed_date': committed_date,
                'additions': additions,
                'deletions': deletions,
                'sha': sha,
                'patch': patch_info
            })

            row = cursor.fetchone()
        cursor.close()

        return file_modifications

    def array2string(self, array):
        return ','.join(str(x) for x in array)

    def get_user_identity(self, user_id):
        found = None

        cursor = self._cnx.cursor()
        query = "SELECT u.name, u.email " \
                "FROM user u " \
                "JOIN (SELECT IFNULL(ua.alias_id, u.id) as user_id FROM user u LEFT JOIN user_alias ua ON u.id = ua.user_id WHERE u.id = %s) as aliased " \
                "ON aliased.user_id = u.id"
        arguments = [user_id]
        cursor.execute(query, arguments)
        row = cursor.fetchone()

        if row:
            name = row[0]
            email = row[1]
            found = (name, email)

        return found

    def get_commits_info(self, file_ids):
        commits = []
        cursor = self._cnx.cursor()
        query = "SELECT c.sha, c.message, r.name, c.author_id, c.committer_id, c.authored_date, c.committed_date " \
                "FROM file_modification fm JOIN file f ON fm.file_id = f.id " \
                "JOIN commit_in_reference cin ON cin.commit_id = fm.commit_id " \
                "JOIN reference r ON r.id = cin.ref_id " \
                "JOIN commit c ON c.id = cin.commit_id " \
                "WHERE f.id IN (" + ",".join([str(id) for id in file_ids]) + ")"
        cursor.execute(query)
        row = cursor.fetchone()
        while row:
            sha = str(row[0])
            message = str(row[1].encode('utf8'))
            ref = str(row[2])
            author_id = row[3]
            committer_id = row[4]
            authored_date = row[5].strftime('%Y-%m-%d %H:%M:%S')
            committed_date = row[6].strftime('%Y-%m-%d %H:%M:%S')

            author_name, author_email = self.get_user_identity(author_id)

            if author_id != committer_id:
                committer_name, committer_email = self.get_user_identity(
                    committer_id)
            else:
                committer_name = author_name
                committer_email = author_email

            author = {'name': author_name, 'email': author_email}
            committer = {'name': committer_name, 'email': committer_email}
            commits.append({
                'sha': sha,
                'author': author,
                'committer': committer,
                'message': message,
                'ref': ref,
                'authored_date': authored_date,
                'committed_date': committed_date
            })
            row = cursor.fetchone()
        cursor.close()

        return commits

    def get_status_file(self, file_id):
        cursor = self._cnx.cursor()
        query = "SELECT fm.status, MAX(c.committed_date) AS last_modification " \
                "FROM file_modification fm JOIN file f ON fm.file_id = f.id " \
                "JOIN commit_in_reference cin ON cin.commit_id = fm.commit_id " \
                "JOIN commit c ON c.id = cin.commit_id " \
                "WHERE f.id = %s"
        arguments = [file_id]
        cursor.execute(query, arguments)

        row = cursor.fetchone()
        cursor.close()

        status = ""
        last_modification = ""
        if row:
            status = row[0]
            last_modification = row[1].strftime('%Y-%m-%d %H:%M:%S')

        return {'status': status, 'last_modification': last_modification}

    def add_file_info_to_json(self, references, repo_json):
        cursor = self._cnx.cursor()
        query = "SELECT f.id, f.name, f.ext, r.name, r.id " \
                "FROM repository repo JOIN commit_in_reference cin ON repo.id = cin.repo_id " \
                "JOIN file_modification fm ON fm.commit_id = cin.commit_id " \
                "JOIN file f ON f.id = fm.file_id " \
                "JOIN reference r ON r.id = cin.ref_id " \
                "WHERE repo.id = %s AND r.name IN (" + ",".join(["'" + ref + "'" for ref in references]) + ") AND " \
                "f.id NOT IN " \
                            "(SELECT deletions.file_id FROM " \
                                        "(SELECT fm.file_id, c.committed_date " \
                                        "FROM commit_in_reference cin " \
                                        "JOIN file_modification fm ON fm.commit_id = cin.commit_id  " \
                                        "JOIN reference r ON r.id = cin.ref_id " \
                                        "JOIN commit c ON c.id = fm.commit_id " \
                                        "WHERE fm.status = 'deleted' AND cin.repo_id = %s ) as deletions " \
                                        "JOIN " \
                                        "(SELECT fm.file_id, max(c.committed_date) as committed_date " \
                                        "FROM commit_in_reference cin " \
                                        "JOIN file_modification fm ON fm.commit_id = cin.commit_id " \
                                        "JOIN reference r ON r.id = cin.ref_id " \
                                        "JOIN commit c ON c.id = fm.commit_id " \
                                        "WHERE fm.status <> 'deleted' AND cin.repo_id = %s " \
                                        "GROUP BY fm.file_id) AS last_action " \
                                        "ON deletions.file_id = last_action.file_id " \
                                        "WHERE deletions.committed_date > last_action.committed_date " \
                                        "UNION " \
                                        "SELECT fr.previous_file_id " \
                                        "FROM file_renamed fr JOIN file f ON fr.previous_file_id = f.id " \
                                        "JOIN file_modification fm ON fm.file_id = f.id " \
                                        "JOIN commit_in_reference cin ON cin.commit_id = fm.commit_id " \
                                        "JOIN reference r ON r.id = cin.ref_id " \
                                        "WHERE cin.repo_id = %s) " \
                "GROUP BY f.id, r.id"
        arguments = [
            self._repo_id, self._repo_id, self._repo_id, self._repo_id
        ]
        cursor.execute(query, arguments)
        row = cursor.fetchone()
        while row:
            file_id = row[0]
            file_name = row[1]
            file_ext = row[2]
            ref_name = row[3]
            ref_id = row[4]

            status = self.get_status_file(file_id)
            file_history = self._file_util.get_file_history_by_id(
                file_id, ref_id)
            file_ids = list(set([h.get("file_id") for h in file_history]))

            commits = self.get_commits_info(file_ids)
            directories = self._file_util.get_directories(file_name)
            changes_info = self.get_changes_for_file(file_ids)

            file_info = {
                'repo': self._repo_name,
                'info': status,
                'commits': commits,
                'ref': ref_name,
                'id': str(file_id),
                'name': file_name.split('/')[-1],
                'ext': file_ext,
                'dirs': directories,
                'file_changes': changes_info
            }
            repo_json.write(json.dumps(file_info) + "\n")
            row = cursor.fetchone()
        cursor.close()

    def export(self, repo_name, references, file_path):
        """
        exports the file data to JSON format

        :type repo_name: str
        :param repo_name: name of the repository to analyse

        :type references: list str
        :param references: list of references to analyse

        :type file_path: str
        :param file_path: the path where to export the file information
        """
        try:
            self._logger.info("FileJSONExporter started")
            start_time = datetime.now()

            repo_json = codecs.open(file_path, 'w', "utf-8")
            self._repo_name = repo_name
            self._repo_id = self._db_util.select_repo_id(
                self._cnx, repo_name, self._logger)
            self.add_file_info_to_json(references, repo_json)
            repo_json.close()

            self._db_util.close_connection(self._cnx)

            end_time = datetime.now()
            minutes_and_seconds = self._logging_util.calculate_execution_time(
                end_time, start_time)
            self._logger.info("FileJSONExporter: process finished after " +
                              str(minutes_and_seconds[0]) + " minutes and " +
                              str(round(minutes_and_seconds[1], 1)) + " secs")
            self._logging_util.remove_file_handler_logger(
                self._logger, self._fileHandler)
        except Exception:
            self._logger.error("FileJSONExporter failed", exc_info=True)
class DependencyExtractor(object):
    """
    Extract dependency information for all repo files
    and load into the mysql datatables
    """
    def __init__(self, config, db_name, project_name, repo_name, log_path):
        """
        :type config: dict
        :param config: mysql database config dict

        :type db_name: str
        :param db_name: mysql database name

        :type project_name: str
        :param project_name: the name of an existing project in the DB

        :type repo_name: str
        :param repo_name: git repo name

        :type log_path: str
        :param log_path: the log path
        """
        self._config = config
        self._project_name = project_name
        self._repo_name = repo_name

        # set database key-val pair in config dict
        self._config['database'] = db_name

        self._logging_util = LoggingUtil()
        log_path = log_path + "extract-relations-" + db_name + ".log"
        self._logger = self._logging_util.get_logger(log_path)
        self._fileHandler = self._logging_util.get_file_handler(
            self._logger, log_path, "info")

    def load_dependencies(self, repo_path, references, extra_paths):
        """
        Extracts and loads dependency information

        :type repo_path: str
        :param repo_path: directory path of the local Git repository

        :type references: list str
        :param references: list of Git references from which source dependency information is loaded; by default, all references are analysed

        :type extra_paths: list str
        :param extra_paths: additional directory paths inside the Git repository in which to look for dependency target files
        """
        if not os.path.exists(repo_path):
            self._logger.error('Invalid repository path: %s', repo_path)
            return

        dep_utils = DependencyUtils(self._config, self._project_name,
                                    self._repo_name, repo_path, self._logger)
        dep_utils.insert_repository()

        source_parser = Parser(repo_path, references, extra_paths,
                               self._logger)
        source_to_targets = source_parser.get_all_dependencies()
        dep_utils.insert_dependencies(source_to_targets)
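
# Illustrative usage sketch for DependencyExtractor: the DB credentials, names
# and paths below are hypothetical placeholders, not values from the original
# sources.
def _example_run_dependency_extractor():
    config = {'host': 'localhost', 'port': 3306, 'user': 'root',
              'password': 'root', 'raise_on_warnings': False, 'buffered': True}
    extractor = DependencyExtractor(config, '_gitana_db', '_project', '_repo', './logs/')
    extractor.load_dependencies('/path/to/local/repo', ['origin/master'], ['src'])
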
Esempio n. 15
class Code2DbMain():
    """
    This class handles the import of code information
    """

    NUM_PROCESSES = 5

    def __init__(self, db_name, project_name,
                 repo_name, git_repo_path, import_type, extensions, references, num_processes,
                 config, log_root_path):
        """
        :type db_name: str
        :param db_name: the name of an existing DB

        :type project_name: str
        :param project_name: the name of an existing project in the DB

        :type repo_name: str
        :param repo_name: the name of the Git repository to import

        :type git_repo_path: str
        :param git_repo_path: the local path of the Git repository

        :type import_type: int
        :param import_type: 1 = import overall function statistics per file, 2 = import function-level information

        :type extensions: list str
        :param extensions: file extensions to analyse. Gitana calculates loc, comments and blank lines for most file types;
        for ['java', 'py', 'php', 'scala', 'js', 'rb', 'cs', 'cpp', 'c'] it also provides insights about ccn, functions and tokens.

        :type references: list str
        :param references: list of references to analyse

        :type num_processes: int
        :param num_processes: number of processes used to import the data (default 5)

        :type config: dict
        :param config: the DB configuration file

        :type log_root_path: str
        :param log_root_path: the log path
        """
        self._log_path = log_root_path + "import-code-" + db_name + "-" + project_name + "-" + repo_name
        self._git_repo_path = git_repo_path
        self._project_name = project_name
        self._db_name = db_name
        self._repo_name = repo_name
        self._import_type = import_type
        self._extensions = extensions
        self._references = references

        if num_processes:
            self._num_processes = num_processes
        else:
            self._num_processes = Code2DbMain.NUM_PROCESSES

        config.update({'database': db_name})
        self._config = config

        self._logging_util = LoggingUtil()

        self._logger = None
        self._fileHandler = None
        self._querier = None
        self._dao = None

    def _get_commit_file_pairs(self, repo_id):
        pairs = []

        filter_references = "1 = 1"
        if self._references:
            filter_references = "r.name IN (" + ",".join(["'" + e + "'" for e in self._references]) + ")"
        filter_extensions = "1 = 1"
        if self._extensions:
            filter_extensions = "f.ext IN (" + ",".join(["'" + e + "'" for e in self._extensions]) + ")"

        cursor = self._dao.get_cursor()
        query = "SELECT c.id AS commit_id, c.sha, f.id AS file_id, f.name AS file_name, f.ext AS file_ext " \
                "FROM commit_in_reference cin JOIN reference r ON r.id = cin.ref_id " \
                "JOIN commit c ON c.id = cin.commit_id " \
                "JOIN file_modification fm ON fm.commit_id = c.id " \
                "JOIN file f ON f.id = fm.file_id " \
                "WHERE " + filter_references + " AND " + filter_extensions + " AND cin.repo_id = %s " \
                "GROUP BY c.id, f.id"
        arguments = [repo_id]
        self._dao.execute(cursor, query, arguments)

        row = self._dao.fetchone(cursor)

        while row:
            pairs.append({"commit_id": row[0], "commit_sha": row[1], "file_id": row[2], "file_name": row[3], "file_ext": row[4]})
            row = self._dao.fetchone(cursor)
        self._dao.close_cursor(cursor)

        return pairs

    def _get_info_code(self, repo_id):
        pairs = self._get_commit_file_pairs(repo_id)
        intervals = [i for i in multiprocessing_util.get_tasks_intervals(pairs, self._num_processes) if len(i) > 0]

        queue_intervals = multiprocessing.JoinableQueue()
        results = multiprocessing.Queue()

        # Start consumers
        multiprocessing_util.start_consumers(self._num_processes, queue_intervals, results)

        for interval in intervals:
            code_extractor = Code2DbCommitFile(self._db_name, self._git_repo_path, interval, self._import_type,
                                               self._config, self._log_path)
            queue_intervals.put(code_extractor)

        # Add end-of-queue markers
        multiprocessing_util.add_poison_pills(self._num_processes, queue_intervals)

        # Wait for all of the tasks to finish
        queue_intervals.join()

    def extract(self):
        """
        extracts code function data and stores it in the DB
        """
        try:
            self._logger = self._logging_util.get_logger(self._log_path)
            self._fileHandler = self._logging_util.get_file_handler(self._logger, self._log_path, "info")

            self._logger.info("Code2DbMain started")
            start_time = datetime.now()

            self._querier = GitQuerier(self._git_repo_path, self._logger)
            self._dao = GitDao(self._config, self._logger)

            repo_id = self._dao.select_repo_id(self._repo_name)
            self._get_info_code(repo_id)
            end_time = datetime.now()
            minutes_and_seconds = self._logging_util.calculate_execution_time(end_time, start_time)
            self._logger.info("Code2DbMain finished after " + str(minutes_and_seconds[0])
                            + " minutes and " + str(round(minutes_and_seconds[1], 1)) + " secs")
            self._logging_util.remove_file_handler_logger(self._logger, self._fileHandler)
        except Exception:
            self._logger.error("Code2DbMain failed", exc_info=True)
        finally:
            if self._dao:
                self._dao.close_connection()
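
# Minimal, self-contained sketch of the consumer / poison-pill pattern that the
# multiprocessing_util helpers used above appear to implement (an illustration of
# the technique, not the actual multiprocessing_util code). Tasks must be
# picklable callables, like the Code2DbCommitFile instances queued in
# _get_info_code.
import multiprocessing


def _example_consumer(task_queue):
    # pull callables off the queue until a poison pill (None) arrives
    while True:
        task = task_queue.get()
        if task is None:
            task_queue.task_done()
            break
        task()
        task_queue.task_done()


def _example_run_tasks(tasks, num_processes=2):
    task_queue = multiprocessing.JoinableQueue()
    workers = [multiprocessing.Process(target=_example_consumer, args=(task_queue,))
               for _ in range(num_processes)]
    for worker in workers:
        worker.start()
    for task in tasks:
        task_queue.put(task)
    for _ in range(num_processes):
        task_queue.put(None)  # one poison pill per consumer
    task_queue.join()  # block until every queued task has been processed
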
Esempio n. 16
class GitHubIssue2DbUpdate():
    """
    This class handles the update of GitHub issue tracker data
    """

    NUM_PROCESSES = 5

    def __init__(self, db_name, project_name, repo_name, issue_tracker_name,
                 url, tokens, config, log_root_path):
        """
        :type db_name: str
        :param db_name: the name of an existing DB

        :type project_name: str
        :param project_name: the name of an existing project in the DB

        :type repo_name: str
        :param repo_name: the name of an existing repository in the DB

        :type issue_tracker_name: str
        :param issue_tracker_name: the name of the issue tracker to import

        :type url: str
        :param url: full name of the GitHub repository

        :type tokens: list str
        :param tokens: list of GitHub tokens

        :type config: dict
        :param config: the DB configuration file

        :type log_root_path: str
        :param log_root_path: the log path
        """
        self._log_path = log_root_path + "import-github-" + db_name + "-" + project_name + "-" + issue_tracker_name
        self._issue_tracker_name = issue_tracker_name
        self._url = url
        self._project_name = project_name
        self._db_name = db_name
        self._repo_name = repo_name
        self._tokens = tokens

        config.update({'database': db_name})
        self._config = config

        self._logging_util = LoggingUtil()

        self._logger = None
        self._fileHandler = None
        self._dao = None

    def _update_issue_content(self, repo_id, issue_tracker_id, intervals, url):
        # updates issues already stored in the DB
        queue_intervals = multiprocessing.JoinableQueue()
        results = multiprocessing.Queue()

        # Start consumers
        multiprocessing_util.start_consumers(len(self._tokens),
                                             queue_intervals, results)

        pos = 0
        for interval in intervals:
            issue_extractor = GitHubIssue2Db(self._db_name, repo_id,
                                             issue_tracker_id, url, interval,
                                             self._tokens[pos], self._config,
                                             self._log_path)
            queue_intervals.put(issue_extractor)
            pos += 1

        # Add end-of-queue markers
        multiprocessing_util.add_poison_pills(len(self._tokens),
                                              queue_intervals)

        # Wait for all of the tasks to finish
        queue_intervals.join()

    def _update_issue_dependency(self, repo_id, issue_tracker_id, intervals,
                                 url):
        # updates issue dependencies already stored in the DB
        queue_intervals = multiprocessing.JoinableQueue()
        results = multiprocessing.Queue()

        # Start consumers
        multiprocessing_util.start_consumers(len(self._tokens),
                                             queue_intervals, results)

        pos = 0
        for interval in intervals:
            issue_dependency_extractor = GitHubIssueDependency2Db(
                self._db_name, repo_id, issue_tracker_id, url, interval,
                self._tokens[pos], self._config, self._log_path)
            queue_intervals.put(issue_dependency_extractor)
            pos += 1

        # Add end-of-queue markers
        multiprocessing_util.add_poison_pills(len(self._tokens),
                                              queue_intervals)

        # Wait for all of the tasks to finish
        queue_intervals.join()

    def _update_issues(self):
        # updates issues
        project_id = self._dao.select_project_id(self._project_name)
        repo_id = self._dao.select_repo_id(project_id, self._repo_name)
        issue_tracker_id = self._dao.select_issue_tracker_id(
            repo_id, self._issue_tracker_name)
        issue_tracker_url = self._url

        if issue_tracker_id:
            imported = self._dao.get_already_imported_issue_ids(
                issue_tracker_id, repo_id)

            if imported:
                intervals = [
                    i for i in multiprocessing_util.get_tasks_intervals(
                        imported, len(self._tokens)) if len(i) > 0
                ]

                self._update_issue_content(repo_id, issue_tracker_id,
                                           intervals, issue_tracker_url)
                self._update_issue_dependency(repo_id, issue_tracker_id,
                                              intervals, issue_tracker_url)

    def update(self):
        """
        updates the GitHub issue tracker data stored in the DB
        """
        try:
            self._logger = self._logging_util.get_logger(self._log_path)
            self._fileHandler = self._logging_util.get_file_handler(
                self._logger, self._log_path, "info")

            self._logger.info("GitHubIssue2DbUpdate started")
            start_time = datetime.now()

            self._dao = GitHubDao(self._config, self._logger)

            self._update_issues()

            end_time = datetime.now()
            minutes_and_seconds = self._logging_util.calculate_execution_time(
                end_time, start_time)
            self._logger.info("GitHubIssue2DbUpdate finished after " +
                              str(minutes_and_seconds[0]) + " minutes and " +
                              str(round(minutes_and_seconds[1], 1)) + " secs")
            self._logging_util.remove_file_handler_logger(
                self._logger, self._fileHandler)
        except Exception:
            self._logger.error("GitHubIssue2DbUpdate failed", exc_info=True)
        finally:
            if self._dao:
                self._dao.close_connection()
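
# Hedged sketch of how the already-imported issue ids can be split into one
# interval per token, in the spirit of multiprocessing_util.get_tasks_intervals
# used in _update_issues above (the real implementation may differ).
def _example_get_tasks_intervals(ids, num_chunks):
    ids = list(ids)
    if not ids or num_chunks <= 0:
        return []
    chunk_size = -(-len(ids) // num_chunks)  # ceiling division
    return [ids[i:i + chunk_size] for i in range(0, len(ids), chunk_size)]

# e.g. _example_get_tasks_intervals(range(1, 11), 3) yields [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10]]
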
Esempio n. 17
class GraphExporter():
    """
    This class exports the Gitana data to a graph representation
    """

    LOG_FOLDER_PATH = "logs"
    INPUT_PATH = os.path.join(os.path.dirname(resources.__file__),
                              'queries.json')

    def __init__(self, config, db_name, log_root_path):
        """
        :type config: dict
        :param config: the DB configuration file

        :type db_name: str
        :param db_name: the name of an existing DB

        :type log_root_path: str
        :param log_root_path: the log path
        """
        self._db_util = DbUtil()
        self._dsl_util = DslUtil()
        self._logging_util = LoggingUtil()
        self._log_path = log_root_path + "export-graph-" + db_name + ".log"
        self._logger = self._logging_util.get_logger(self._log_path)
        self._fileHandler = self._logging_util.get_file_handler(
            self._logger, self._log_path, "info")

        self._db_name = db_name

        self._config = config
        self._cnx = self._db_util.get_connection(self._config)

        self._db_util.set_database(self._cnx, self._db_name)
        self._db_util.set_settings(self._cnx)

    def _create_log_folder(self, name):
        # creates the log folder
        if not os.path.exists(name):
            os.makedirs(name)

    def _create_output_file(self, filename):
        # creates the folder that will contain the output file
        if not os.path.exists(os.path.dirname(filename)):
            try:
                os.makedirs(os.path.dirname(filename))
            except OSError as exc:  # Guard against race condition
                if exc.errno != errno.EEXIST:
                    raise

    def _load_graph_exporter_json(self, json_path):
        # load the JSON that drives the graph exporter process
        with open(json_path) as json_data:
            data = json.load(json_data)

        return data.get('graph')

    def _get_parameter(self, key, parameters):
        # get JSON parameters
        found = None
        if key in ["EDGECOLOR", "NODECOLOR"]:
            found = parameters.get(key.lower())
        else:
            if key.endswith("ID"):
                name = parameters.get(key[:-2].lower())
                found = self._dsl_util.find_entity_id(self._cnx,
                                                      key[:-2].lower(), name,
                                                      self._logger)

        if not found:
            self._logger.error("GraphExporter: parameter " + str(key) +
                               " not found!")

        return found

    def _load_query_json(self, metric_name, parameters):
        # loads the query stored in the JSON file
        with open(GraphExporter.INPUT_PATH) as json_data:
            data = json.load(json_data)

        metrics = data.get('queries')

        try:
            found = [m for m in metrics if m.get('name') == metric_name][0]
            nodes_query = found.get('nodes')
            edges_query = found.get('edges')

            for k in found.keys():
                if k not in ['name', 'edges', 'nodes']:

                    k_value = str(self._get_parameter(k, parameters))

                    nodes_query = nodes_query.replace(k, k_value)
                    edges_query = edges_query.replace(k, k_value)

            return (nodes_query, edges_query)
        except Exception:
            self._logger.error("GraphExporter: metric " + str(metric_name) +
                               " not found!")

    def export(self, file_path, json_path):
        """
        exports the Gitana data to a graph

        :type file_path: str
        :param file_path: the path where to export the graph

        :type json_path: str
        :param json_path: the path of the JSON that drives the export process
        """

        # gtype -> graph type = "undirected", "directed", if null "undirected"
        # gmode -> graph mode = "dynamic", "static", if null "dynamic"
        try:
            self._logger.info("GraphExporter started")
            start_time = datetime.now()

            exporter_data = self._load_graph_exporter_json(json_path)

            metric_name = exporter_data.get("name")
            parameters = exporter_data.get("params")

            (nodes_query,
             edges_query) = self._load_query_json(metric_name, parameters)

            gexf = GexfGenerator(self._cnx, self._logger)
            gexf.create(nodes_query, edges_query, parameters.get("type"),
                        file_path)
            self._db_util.close_connection(self._cnx)

            end_time = datetime.now()
            minutes_and_seconds = self._logging_util.calculate_execution_time(
                end_time, start_time)
            self._logger.info("GraphExporter: process finished after " +
                              str(minutes_and_seconds[0]) + " minutes and " +
                              str(round(minutes_and_seconds[1], 1)) + " secs")
            self._logging_util.remove_file_handler_logger(
                self._logger, self._fileHandler)
        except Exception:
            self._logger.error("GraphExporter failed", exc_info=True)
class StackOverflow2DbMain():
    """
    This class handles the import of Stackoverflow data
    """
    def __init__(self, db_name, project_name, type, forum_name, search_query,
                 before_date, tokens, config, log_root_path):
        """
        :type db_name: str
        :param db_name: the name of an existing DB

        :type project_name: str
        :param project_name: the name of an existing project in the DB

        :type type: str
        :param type: type of the forum (Stackoverflow, Eclipse forum)

        :type forum_name: str
        :param forum_name: the name of the forum to import

        :type search_query: str
        :param search_query: a label used to mark questions in Stackoverflow

        :type before_date: str
        :param before_date: import data before date (YYYY-mm-dd)

        :type tokens: list str
        :param tokens: list of Stackoverflow tokens

        :type config: dict
        :param config: the DB configuration file

        :type log_root_path: str
        :param log_root_path: the log path
        """

        self._log_path = log_root_path + "import-stackoverflow-" + db_name + "-" + project_name + "-" + forum_name
        self._type = type
        self._forum_name = forum_name
        self._search_query = search_query.strip()
        self._project_name = project_name
        self._db_name = db_name
        self._before_date = before_date
        self._tokens = tokens

        config.update({'database': db_name})
        self._config = config

        self._logging_util = LoggingUtil()

        self._logger = None
        self._fileHandler = None
        self._querier = None
        self._dao = None

    def _get_topics(self, forum_id):
        # processes Stackoverflow questions
        topic_imported = self._dao.get_topic_own_ids(forum_id)
        topic_ids = list(
            set(
                self._querier.get_topic_ids(self._search_query,
                                            self._before_date)) -
            set(topic_imported))
        topic_ids.sort()

        intervals = [
            i for i in multiprocessing_util.get_tasks_intervals(
                topic_ids, len(self._tokens)) if len(i) > 0
        ]

        queue_extractors = multiprocessing.JoinableQueue()
        results = multiprocessing.Queue()

        # Start consumers
        multiprocessing_util.start_consumers(len(self._tokens),
                                             queue_extractors, results)

        pos = 0
        for interval in intervals:
            topic_extractor = StackOverflowTopic2Db(self._db_name, forum_id,
                                                    interval,
                                                    self._tokens[pos],
                                                    self._config,
                                                    self._log_path)
            queue_extractors.put(topic_extractor)
            pos += 1

        # Add end-of-queue markers
        multiprocessing_util.add_poison_pills(len(self._tokens),
                                              queue_extractors)

        # Wait for all of the tasks to finish
        queue_extractors.join()

    def extract(self):
        """
        extracts Stackoverflow data and stores it in the DB
        """
        try:
            self._logger = self._logging_util.get_logger(self._log_path)
            self._fileHandler = self._logging_util.get_file_handler(
                self._logger, self._log_path, "info")

            self._logger.info("StackOverflow2DbMain started")
            start_time = datetime.now()

            self._querier = StackOverflowQuerier(self._tokens[0], self._logger)
            self._dao = StackOverflowDao(self._config, self._logger)

            project_id = self._dao.select_project_id(self._project_name)
            forum_id = self._dao.insert_forum(project_id, self._forum_name,
                                              self._type)
            self._get_topics(forum_id)

            end_time = datetime.now()
            minutes_and_seconds = self._logging_util.calculate_execution_time(
                end_time, start_time)
            self._logger.info("StackOverflow2DbMain finished after " +
                              str(minutes_and_seconds[0]) + " minutes and " +
                              str(round(minutes_and_seconds[1], 1)) + " secs")

            self._logging_util.remove_file_handler_logger(
                self._logger, self._fileHandler)
        except Exception:
            self._logger.error("StackOverflow2DbMain failed", exc_info=True)
        finally:
            if self._dao:
                self._dao.close_connection()
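
# Illustrative usage sketch for StackOverflow2DbMain: DB credentials, project,
# forum, tag and token values below are hypothetical placeholders.
def _example_import_stackoverflow():
    config = {'host': 'localhost', 'port': 3306, 'user': 'root',
              'password': 'root', 'raise_on_warnings': False, 'buffered': True}
    importer = StackOverflow2DbMain('_gitana_db', '_project', 'stackoverflow',
                                    '_forum', 'gitana', '2017-01-01',
                                    ['<stackoverflow-token>'], config, './logs/')
    importer.extract()
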
class SlackChannel2Db(object):
    """
    This class handles the import of Slack channels
    """

    def __init__(self, db_name, instant_messaging_id, interval, token,
                 config, log_root_path):
        """
        :type db_name: str
        :param db_name: the name of an existing DB

        :type instant_messaging_id: int
        :param instant_messaging_id: the id of an existing instant messaging in the DB

        :type interval: list int
        :param interval: a list of channel ids to import

        :type token: str
        :param token: a Slack token

        :type config: dict
        :param config: the DB configuration file

        :type log_root_path: str
        :param log_root_path: the log path
        """

        self._log_root_path = log_root_path
        self._interval = interval
        self._db_name = db_name
        self._instant_messaging_id = instant_messaging_id
        self._token = token
        self._config = config

        self._logging_util = LoggingUtil()

        self._fileHandler = None
        self._logger = None
        self._querier = None
        self._dao = None

    def __call__(self):
        try:
            log_path = self._log_root_path + "-channel2db-" + str(self._interval[0]) + "-" + str(self._interval[-1])
            self._logger = self._logging_util.get_logger(log_path)
            self._fileHandler = self._logging_util.get_file_handler(self._logger, log_path, "info")

            self._querier = SlackQuerier(self._token, self._logger)
            self._dao = SlackDao(self._config, self._logger)
            self.extract()
        except Exception:
            self._logger.error("Channel2Db failed", exc_info=True)
        finally:
            if self._dao:
                self._dao.close_connection()

    def _insert_not_recognized_url_attachments(self, message_id, urls):
        #insert not recognized url attachments
        pos = 0
        for url in urls:
            attachment_own_id = self._querier.generate_url_attachment_id(message_id, pos)
            attachment_name = self._querier.get_url_attachment_name(url)
            attachment_extension = self._querier.get_url_attachment_extension(url)
            self._dao.insert_url_attachment(attachment_own_id, message_id, attachment_name, attachment_extension, url)
            pos += 1

    def _extract_file_attachment_info(self, message, message_id):
        #insert file attachments
        file = self._querier.get_file_attachment(message)
        own_id = self._querier.get_file_attachment_property(file, "id")
        name = self._querier.get_file_attachment_property(file, "name")
        extension = self._querier.get_file_attachment_property(file, "filetype")
        url = self._querier.get_file_attachment_property(file, "permalink")
        bytes = self._querier.get_file_attachment_property(file, "size")

        self._dao.insert_attachment(own_id, message_id, name, extension, bytes, url)

    def _extract_url_attachments(self, message, message_id):
        #insert URL attachments
        urls = self._querier.get_url_attachments(self._querier.get_message_body(message))
        attachments = self._querier.get_message_attachments(message)

        for a in attachments:
            url = self._querier.get_attachment_url(a)
            name = self._querier.get_attachament_name(a)
            own_id = self._querier.get_attachment_id(a)
            extension = self._querier.get_attachment_extension(a)
            bytes = self._querier.get_attachment_size(a)
            self._dao.insert_attachment(own_id, message_id, name, extension, bytes, url)

            if url in urls:
                urls.remove(url)

        self._insert_not_recognized_url_attachments(message_id, urls)

    def _extract_file_comment(self, channel_id, comment, pos):
        #insert file comment
        own_id = self._querier.get_comment_id(comment)
        body = self._querier.get_comment_body(comment)
        created_at = self._querier.get_comment_created_at(comment)
        author_name = self._querier.get_message_author_name(comment)
        author_email = self._querier.get_message_author_email(comment)
        author_id = self._dao.get_user_id(author_name, author_email)

        self._dao.insert_message(own_id, pos, self._dao.get_message_type_id("comment"), channel_id, body, author_id, created_at)
        comment_id = self._dao.select_message_id(own_id, channel_id)
        return comment_id

    def _extract_comment(self, message, channel_id):
        #insert comment
        pos = 0
        message_id = None

        initial_comment = self._querier.file_attachment_get_comment(message)
        if initial_comment:
            own_id = self._querier.get_comment_id(initial_comment)
            message_id = self._dao.select_message_id(own_id, channel_id)
            pos = self._dao.get_comments(message_id)

        comment = self._querier.get_comment_message(message)
        comment_id = self._extract_file_comment(channel_id, comment, pos)

        if message_id:
            self._dao.insert_message_dependency(comment_id, message_id)

    def _extract_message(self, message, channel_id, type, pos):
        #insert message
        author_name = self._querier.get_message_author_name(message)
        author_email = self._querier.get_message_author_email(message)
        author_id = self._dao.get_user_id(author_name, author_email)
        body = self._querier.get_message_body(message)
        own_id = self._querier.get_message_own_id(message)
        created_at = self._querier.get_message_created_at(message)

        if type == "message":
            message_type = "reply"
        else:
            message_type = "info"

        self._dao.insert_message(own_id, pos, self._dao.get_message_type_id(message_type), channel_id, body, author_id, created_at)
        message_id = self._dao.select_message_id(own_id, channel_id)
        self._extract_url_attachments(message, message_id)

    def _extract_file_upload(self, message, channel_id, pos):
        #insert file upload
        own_id = self._querier.get_message_own_id(message)
        author_name = self._querier.get_message_author_name(message)
        author_email = self._querier.get_message_author_email(message)
        author_id = self._dao.get_user_id(author_name, author_email)
        created_at = self._querier.get_message_created_at(message)
        body = self._querier.get_message_body(message).split(':')[0]

        self._dao.insert_message(own_id, pos, self._dao.get_message_type_id("file_upload"), channel_id, body, author_id, created_at)
        message_id = self._dao.select_message_id(own_id, channel_id)
        self._extract_file_attachment_info(message, message_id)

        comment = self._querier.file_attachment_get_comment(message)
        if comment:
            comment_id = self._extract_file_comment(channel_id, comment, 0)
            self._dao.insert_message_dependency(comment_id, message_id)

    def _extract_messages(self, channel_id, channel_own_id):
        #insert messages
        pos = 0
        for message in self._querier.get_channel_messages(channel_own_id):
            type = self._querier.get_message_type(message)

            if type == "file_comment":
                self._extract_comment(message, channel_id)
            elif type == "file_share":
                self._extract_file_upload(message, channel_id, pos)
                pos += 1
            else:
                if not self._querier.is_bot_message(message):
                    self._extract_message(message, channel_id, type, pos)
                #TODO deal with bot messages
                pos += 1

    def extract(self):
        """
        extracts Slack channel data and stores it in the DB
        """
        try:
            self._logger.info("SlackChannel2Db started")
            start_time = datetime.now()
            for channel_id in self._interval:
                channel_own_id = self._dao.select_channel_own_id(channel_id, self._instant_messaging_id)
                self._extract_messages(channel_id, channel_own_id)

            end_time = datetime.now()
            minutes_and_seconds = self._logging_util.calculate_execution_time(end_time, start_time)
            self._logger.info("SlackChannel2Db finished after " + str(minutes_and_seconds[0])
                           + " minutes and " + str(round(minutes_and_seconds[1], 1)) + " secs")
            self._logging_util.remove_file_handler_logger(self._logger, self._fileHandler)
        except Exception:
            self._logger.error("SlackChannel2Db failed", exc_info=True)
Esempio n. 20
class Git2DbReference(object):
    """
    This class handles the import of Git references
    """

    #do not import patches
    LIGHT_IMPORT_TYPE = 1
    #import patches but not at line level
    MEDIUM_IMPORT_TYPE = 2
    #import patches also at line level
    FULL_IMPORT_TYPE = 3

    def __init__(self, db_name, repo_id, git_repo_path, before_date,
                 import_type, ref_name, ref_type, from_sha, config,
                 log_root_path):
        """
        :type db_name: str
        :param db_name: the name of an existing DB

        :type repo_id: int
        :param repo_id: the id of an existing repository in the DB

        :type git_repo_path: str
        :param git_repo_path: local path of the Git repository

        :type before_date: str
        :param before_date: import data before date (YYYY-mm-dd)

        :type import_type: int
        :param import_type: 1 does not import patches, 2 imports patches but not at line level, 3 imports patches with line detail

        :type ref_name: str
        :param ref_name: the name of the reference to import

        :type ref_type: str
        :param ref_type: the type of the reference to import

        :type from_sha: str
        :param from_sha: the SHA of the commit from where to start the import

        :type config: dict
        :param config: the DB configuration file

        :type log_root_path: str
        :param log_root_path: the log path
        """
        self._log_root_path = log_root_path
        self._git_repo_path = git_repo_path
        self._repo_id = repo_id
        self._db_name = db_name
        self._ref_name = ref_name
        self._ref_type = ref_type
        self._before_date = before_date
        self._import_type = import_type
        self._from_sha = from_sha
        self._config = config
        self._logging_util = LoggingUtil()
        self._fileHandler = None
        self._logger = None
        self._querier = None
        self._dao = None

    def __call__(self):
        try:
            log_path = self._log_root_path + "-git2db-" + self._make_it_printable(
                self._ref_name)
            self._logger = self._logging_util.get_logger(log_path)
            self._fileHandler = self._logging_util.get_file_handler(
                self._logger, log_path, "info")

            self._querier = GitQuerier(self._git_repo_path, self._logger)
            self._dao = GitDao(self._config, self._logger)
            self.extract()
        except Exception:
            self._logger.error("Git2Db failed", exc_info=True)
        finally:
            if self._dao:
                self._dao.close_connection()

    def _make_it_printable(self, text):
        #converts the string to lower-case UTF-8 and replaces whitespace and non-alphanumeric characters with dashes
        u = text.decode('utf-8', 'ignore').lower()
        return re.sub(r'(\W|\s)+', '-', u)

    def _get_info_contribution_in_reference(self, reference_name,
                                            reference_type, repo_id, from_sha):
        if from_sha:
            if self._before_date:
                commits = self._querier.collect_all_commits_after_sha_before_date(
                    reference_name, from_sha, self._before_date)
            else:
                commits = self._querier.collect_all_commits_after_sha(
                    reference_name, from_sha)

            self._analyse_commits(commits, reference_name, repo_id)
        else:
            if self._before_date:
                commits = self._querier.collect_all_commits_before_date(
                    reference_name, self._before_date)
            else:
                commits = self._querier.collect_all_commits(reference_name)

            self._analyse_commits(commits, reference_name, repo_id)

    def _load_all_references(self, repo_id):
        # load all git branches and tags into database
        for reference in self._querier.get_references():
            ref_name = reference[0]
            ref_type = reference[1]
            #inserts reference to DB
            self._dao.insert_reference(repo_id, ref_name, ref_type)

    def _get_diffs_from_commit(self, commit, files_in_commit):
        #calculates diffs within files in a commit
        if self._import_type > Git2DbReference.LIGHT_IMPORT_TYPE:
            diffs = self._querier.get_diffs(commit, files_in_commit, True)
        else:
            diffs = self._querier.get_diffs(commit, files_in_commit, False)

        return diffs

    def _analyse_commit(self, commit, repo_id, ref_id):
        #analyses a commit
        try:
            message = self._querier.get_commit_property(commit, "message")
            author_name = self._querier.get_commit_property(
                commit, "author.name")
            author_email = self._querier.get_commit_property(
                commit, "author.email")
            committer_name = self._querier.get_commit_property(
                commit, "committer.name")
            committer_email = self._querier.get_commit_property(
                commit, "committer.email")
            size = self._querier.get_commit_property(commit, "size")
            sha = self._querier.get_commit_property(commit, "hexsha")
            authored_date = self._querier.get_commit_time(
                self._querier.get_commit_property(commit, "authored_date"))
            committed_date = self._querier.get_commit_time(
                self._querier.get_commit_property(commit, "committed_date"))

            if author_name is None and author_email is None:
                self._logger.warning(
                    "author name and email are null for commit: " + sha)

            if committer_name is None and committer_email is None:
                self._logger.warning(
                    "committer name and email are null for commit: " + sha)

            #insert author
            author_id = self._dao.get_user_id(author_name, author_email)
            committer_id = self._dao.get_user_id(committer_name,
                                                 committer_email)

            commit_found = self._dao.select_commit_id(sha, repo_id)

            if not commit_found:
                #insert commit
                self._dao.insert_commit(repo_id, sha, message, author_id,
                                        committer_id, authored_date,
                                        committed_date, size)
                commit_found = self._dao.select_commit_id(sha, repo_id)

                commit_stats_files = commit.stats.files
                try:
                    if self._querier.commit_has_no_parents(commit):
                        for diff in self._querier.get_diffs_no_parent_commit(
                                commit):
                            file_path = diff[0]
                            ext = self._querier.get_ext(file_path)

                            self._dao.insert_file(repo_id, file_path, ext)
                            file_id = self._dao.select_file_id(
                                repo_id, file_path)

                            if self._import_type > Git2DbReference.LIGHT_IMPORT_TYPE:
                                patch_content = re.sub(r'^(\w|\W)*\n@@', '@@',
                                                       diff[1])
                            else:
                                patch_content = None

                            stats = self._querier.get_stats_for_file(
                                commit_stats_files, file_path)
                            status = self._querier.get_status_with_diff(
                                stats, diff)

                            #insert file modification
                            self._dao.insert_file_modification(
                                commit_found, file_id, status, stats[0],
                                stats[1], stats[2], patch_content)

                            if self._import_type == Git2DbReference.FULL_IMPORT_TYPE:
                                file_modification_id = self._dao.select_file_modification_id(
                                    commit_found, file_id)
                                line_details = self._querier.get_line_details(
                                    patch_content, ext)
                                for line_detail in line_details:
                                    self._dao.insert_line_details(
                                        file_modification_id, line_detail)
                    else:
                        for diff in self._get_diffs_from_commit(
                                commit, commit_stats_files.keys()):
                            #self.dao.check_connection_alive()
                            if self._querier.is_renamed(diff):
                                file_previous = self._querier.get_rename_from(
                                    diff)
                                ext_previous = self._querier.get_ext(
                                    file_previous)

                                file_current = self._querier.get_file_current(
                                    diff)
                                ext_current = self._querier.get_ext(
                                    file_current)

                                #insert new file
                                self._dao.insert_file(repo_id, file_current,
                                                      ext_current)

                                #get id new file
                                current_file_id = self._dao.select_file_id(
                                    repo_id, file_current)

                                #retrieve the id of the previous file
                                previous_file_id = self._dao.select_file_id(
                                    repo_id, file_previous)

                                #insert file modification
                                self._dao.insert_file_modification(
                                    commit_found, current_file_id, "renamed",
                                    0, 0, 0, None)

                                if not previous_file_id:
                                    self._dao.insert_file(
                                        repo_id, file_previous, ext_previous)
                                    previous_file_id = self._dao.select_file_id(
                                        repo_id, file_previous)

                                if current_file_id == previous_file_id:
                                    self._logger.warning(
                                        "previous file id is equal to current file id ("
                                        + str(current_file_id) + ") " +
                                        str(sha))
                                else:
                                    file_modification_id = self._dao.select_file_modification_id(
                                        commit_found, current_file_id)
                                    self._dao.insert_file_renamed(
                                        repo_id, current_file_id,
                                        previous_file_id, file_modification_id)

                            else:
                                #insert file
                                #if the file does not have a path, it won't be inserted
                                try:
                                    file_path = self._querier.get_file_path(
                                        diff)

                                    ext = self._querier.get_ext(file_path)

                                    stats = self._querier.get_stats_for_file(
                                        commit_stats_files, file_path)
                                    status = self._querier.get_status_with_diff(
                                        stats, diff)

                                    #if the file is new, add it
                                    if self._querier.is_new_file(diff):
                                        self._dao.insert_file(
                                            repo_id, file_path, ext)
                                    file_id = self._dao.select_file_id(
                                        repo_id, file_path)

                                    if not file_id:
                                        self._dao.insert_file(
                                            repo_id, file_path, ext)
                                        file_id = self._dao.select_file_id(
                                            repo_id, file_path)

                                    if self._import_type > Git2DbReference.LIGHT_IMPORT_TYPE:
                                        #insert file modification (additions, deletions)
                                        patch_content = self._querier.get_patch_content(
                                            diff)
                                    else:
                                        patch_content = None

                                    self._dao.insert_file_modification(
                                        commit_found, file_id, status,
                                        stats[0], stats[1], stats[2],
                                        patch_content)

                                    if self._import_type == Git2DbReference.FULL_IMPORT_TYPE:
                                        file_modification_id = self._dao.select_file_modification_id(
                                            commit_found, file_id)
                                        line_details = self._querier.get_line_details(
                                            patch_content, ext)
                                        for line_detail in line_details:
                                            self._dao.insert_line_details(
                                                file_modification_id,
                                                line_detail)
                                except Exception:
                                    self._logger.error(
                                        "Something went wrong with commit " +
                                        str(sha),
                                        exc_info=True)
                except Exception:
                    self._logger.error("Git2Db failed on commit " + str(sha),
                                       exc_info=True)

            # insert parents of the commit
            self._dao.insert_commit_parents(commit.parents, commit_found, sha,
                                            repo_id)
            # insert commits in reference
            self._dao.insert_commit_in_reference(repo_id, commit_found, ref_id)

            #return commit_found
        except Exception:
            self._logger.error("Git2Db failed on commit " + str(sha),
                               exc_info=True)

    def _analyse_commits(self, commits, ref, repo_id):
        #analyses the commits of a reference
        ref_id = self._dao.select_reference_id(repo_id, ref)
        for c in commits:
            self._analyse_commit(c, repo_id, ref_id)

    def extract(self):
        """
        extracts Git data and stores it in the DB
        """
        try:
            self._logger.info("Git2DbReference started")
            start_time = datetime.now()
            self._load_all_references(self._repo_id)
            self._get_info_contribution_in_reference(self._ref_name,
                                                     self._ref_type,
                                                     self._repo_id,
                                                     self._from_sha)

            end_time = datetime.now()
            minutes_and_seconds = self._logging_util.calculate_execution_time(
                end_time, start_time)
            self._logger.info("Git2DbReference finished after " +
                              str(minutes_and_seconds[0]) + " minutes and " +
                              str(round(minutes_and_seconds[1], 1)) + " secs")
            self._logging_util.remove_file_handler_logger(
                self._logger, self._fileHandler)
        except Exception:
            self._logger.error("Git2DbReference failed", exc_info=True)
Esempio n. 21
class GitHubIssueDependency2Db(object):
    """
    This class inserts the dependencies between GitHub issues
    """

    def __init__(self, db_name,
                 repo_id, issue_tracker_id, url, interval, token,
                 config, log_root_path):
        """
        :type db_name: str
        :param db_name: the name of an existing DB

        :type repo_id: int
        :param repo_id: the id of an existing repository in the DB

        :type issue_tracker_id: int
        :param issue_tracker_id: the id of an existing issue tracker in the DB

        :type url: str
        :param url: full name of the GitHub repository

        :type interval: list int
        :param interval: a list of issue ids to import

        :type token: str
        :param token: a GitHub token

        :type config: dict
        :param config: the DB configuration file

        :type log_root_path: str
        :param log_root_path: the log path
        """
        self._log_root_path = log_root_path
        self._url = url
        self._db_name = db_name
        self._repo_id = repo_id
        self._issue_tracker_id = issue_tracker_id
        self._interval = interval
        self._token = token
        self._config = config

        self._logging_util = LoggingUtil()
        self._fileHandler = None
        self._logger = None
        self._querier = None
        self._dao = None

    def __call__(self):
        try:
            log_path = self._log_root_path + "-issue2db-dependency-" + str(self._interval[0]) + \
                       "-" + str(self._interval[-1])
            self._logger = self._logging_util.get_logger(log_path)
            self._fileHandler = self._logging_util.get_file_handler(self._logger, log_path, "info")

            self._querier = GitHubQuerier(self._url, self._token, self._logger)
            self._dao = GitHubDao(self._config, self._logger)
            self.extract()
        except Exception:
            self._logger.error("GitHubIssueDependency2Db failed", exc_info=True)

    def _extract_issue_dependencies(self):
        # inserts issue dependency
        cursor = self._dao.get_cursor()
        query = "SELECT i.id FROM issue i " \
                "JOIN issue_tracker it ON i.issue_tracker_id = it.id " \
                "WHERE i.id >= %s AND i.id <= %s AND issue_tracker_id = %s AND repo_id = %s"
        arguments = [self._interval[0], self._interval[-1], self._issue_tracker_id, self._repo_id]
        self._dao.execute(cursor, query, arguments)

        row = self._dao.fetchone(cursor)

        while row:
            try:
                issue_id = row[0]
                issue_own_id = self._dao.select_issue_own_id(issue_id, self._issue_tracker_id, self._repo_id)
                issue = self._querier.get_issue(issue_own_id)

                comments = [self._querier.get_issue_body(issue)] + [self._querier.get_issue_comment_body(comment)
                                                                    for comment in
                                                                    self._querier.get_issue_comments(issue)]

                for c in comments:
                    if c:
                        referenced_issues = self._querier.get_referenced_issues(c)
                        for ri in referenced_issues:
                            referenced_issue_id = self._dao.select_issue_id(ri, self._issue_tracker_id, self._repo_id)
                            self._dao.insert_issue_dependency(referenced_issue_id, issue_own_id,
                                                              self._dao.get_issue_dependency_type_id("related"))

            except Exception:
                self._logger.error("something went wrong with the following issue id: " + str(issue_id) +
                                   " - tracker id " + str(self._issue_tracker_id), exc_info=True)

            row = self._dao.fetchone(cursor)

        self._dao.close_cursor(cursor)

    def extract(self):
        """
        extracts GitHub issue dependency data and stores it in the DB
        """
        try:
            self._logger.info("GitHubIssueDependency2Db started")
            start_time = datetime.now()

            self._extract_issue_dependencies()

            end_time = datetime.now()
            minutes_and_seconds = self._logging_util.calculate_execution_time(end_time, start_time)
            self._logger.info("GitHubIssueDependency2Db finished after " + str(minutes_and_seconds[0]) +
                              " minutes and " + str(round(minutes_and_seconds[1], 1)) + " secs")
            self._logging_util.remove_file_handler_logger(self._logger, self._fileHandler)
        except Exception:
            self._logger.error("GitHubIssueDependency2Db failed", exc_info=True)
        finally:
            if self._dao:
                self._dao.close_connection()
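
# Hedged sketch: GitHub comments typically reference other issues as "#<number>".
# The regex below only illustrates what GitHubQuerier.get_referenced_issues might
# extract from a comment body; the real implementation may differ.
def _example_referenced_issue_numbers(comment_body):
    import re
    return [int(number) for number in re.findall(r'#(\d+)', comment_body)]

# _example_referenced_issue_numbers("duplicates #12, see also #345") == [12, 345]
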
Esempio n. 22
class EclipseForum2DbUpdate():
    """
    This class handles the update of Eclipse forum data
    """

    NUM_PROCESSES = 2

    def __init__(self, db_name, project_name, forum_name, eclipse_forum_url,
                 num_processes, config, log_root_path):
        """
        :type db_name: str
        :param db_name: the name of an existing DB

        :type project_name: str
        :param project_name: the name of an existing project in the DB

        :type forum_name: str
        :param forum_name: the name of an existing forum in the DB to update

        :type eclipse_forum_url: str
        :param eclipse_forum_url: the URL of the forum

        :type num_processes: int
        :param num_processes: number of processes to import the data (default 2)

        :type config: dict
        :param config: the DB configuration file

        :type log_root_path: str
        :param log_root_path: the log path
        """
        self._log_path = log_root_path + "update-eclipse-forum-" + db_name + "-" + project_name + "-" + forum_name
        self._project_name = project_name
        self._url = eclipse_forum_url
        self._db_name = db_name
        self._forum_name = forum_name

        config.update({'database': db_name})
        self._config = config

        if num_processes:
            self._num_processes = num_processes
        else:
            self._num_processes = EclipseForum2DbUpdate.NUM_PROCESSES

        self._logging_util = LoggingUtil()
        self._date_util = DateUtil()

        self._logger = None
        self._fileHandler = None
        self._querier = None
        self._dao = None

    def _update_topics_info(self, forum_id):
        # update topics of a given forum
        next_page = True
        while next_page:
            topics_on_page = self._querier.get_topics()

            for topic in topics_on_page:

                topic_own_id = self._querier.get_topic_own_id(topic)
                topic_in_db = self._dao.get_topic_id(topic_own_id, forum_id)

                if topic_in_db:
                    views = self._querier.get_topic_views(topic)
                    last_change_at = self._date_util.get_timestamp(
                        self._querier.get_last_change_at(topic),
                        "%a, %d %B %Y %H:%M")
                    self._dao.update_topic_info(topic_in_db, forum_id, views,
                                                last_change_at)

            next_page = self._querier.go_next_page()

    def _get_topics(self, forum_id):
        #collects the topic ids of a forum and schedules their extraction across processes
        topic_ids = self._dao.get_topic_ids(forum_id)

        if topic_ids:
            self._update_topics_info(forum_id)

            intervals = [
                i for i in multiprocessing_util.get_tasks_intervals(
                    topic_ids, self._num_processes) if len(i) > 0
            ]

            queue_extractors = multiprocessing.JoinableQueue()
            results = multiprocessing.Queue()

            # Start consumers
            multiprocessing_util.start_consumers(self._num_processes,
                                                 queue_extractors, results)

            for interval in intervals:
                topic_extractor = EclipseTopic2Db(self._db_name, forum_id,
                                                  interval, self._config,
                                                  self._log_path)
                queue_extractors.put(topic_extractor)

            # Add end-of-queue markers
            multiprocessing_util.add_poison_pills(self._num_processes,
                                                  queue_extractors)

            # Wait for all of the tasks to finish
            queue_extractors.join()

    def update(self):
        """
        updates the Eclipse forum data stored in the DB
        """
        try:
            self._logger = self._logging_util.get_logger(self._log_path)
            self._fileHandler = self._logging_util.get_file_handler(
                self._logger, self._log_path, "info")

            self._logger.info("EclipseForum2DbUpdate started")
            start_time = datetime.now()

            self._querier = EclipseForumQuerier(self._url, self._logger)
            self._dao = EclipseForumDao(self._config, self._logger)

            self._querier.start_browser()

            project_id = self._dao.select_project_id(self._project_name)
            forum_id = self._dao.select_forum_id(self._forum_name, project_id)

            if forum_id:
                self._get_topics(forum_id)

            self._querier.close_browser()

            end_time = datetime.now()
            minutes_and_seconds = self._logging_util.calculate_execution_time(
                end_time, start_time)
            self._logger.info("EclipseForum2DbUpdate finished after " +
                              str(minutes_and_seconds[0]) + " minutes and " +
                              str(round(minutes_and_seconds[1], 1)) + " secs")

            self._logging_util.remove_file_handler_logger(
                self._logger, self._fileHandler)
        except Exception:
            self._logger.error("EclipseForum2DbUpdate failed", exc_info=True)
        finally:
            if self._dao:
                self._dao.close_connection()
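
A minimal usage sketch of the updater above; the DB/project/forum names, the forum URL and the config keys are illustrative assumptions, not values taken from this file:

config = {'host': 'localhost', 'port': 3306, 'user': 'root', 'password': 'root'}  # illustrative MySQL settings
forum_update = EclipseForum2DbUpdate("db_gitana", "my-project", "my-forum",
                                     "https://www.eclipse.org/forums/index.php/f/XYZ/",  # hypothetical forum URL
                                     2, config, "logs/")
forum_update.update()
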
Example 23
class GitHubIssue2Db(object):
    """
    This class handles the import of GitHub issues
    """

    def __init__(self, db_name,
                 repo_id, issue_tracker_id, url, interval, token,
                 config, log_root_path):
        """
        :type db_name: str
        :param db_name: the name of an existing DB

        :type repo_id: int
        :param repo_id: the id of an existing repository in the DB

        :type issue_tracker_id: int
        :param issue_tracker_id: the id of an existing issue tracker in the DB

        :type url: str
        :param url: full name of the GitHub repository

        :type interval: list int
        :param interval: a list of issue ids to import

        :type token: str
        :param token: a GitHub token

        :type config: dict
        :param config: the DB configuration file

        :type log_root_path: str
        :param log_root_path: the log path
        """
        self._log_root_path = log_root_path
        self._url = url
        self._db_name = db_name
        self._repo_id = repo_id
        self._issue_tracker_id = issue_tracker_id
        self._interval = interval
        self._token = token
        self._config = config

        self._fileHandler = None
        self._logger = None
        self._querier = None
        self._dao = None

    def __call__(self):
        self._logging_util = LoggingUtil()
        self._date_util = DateUtil()
        log_path = self._log_root_path + "-issue2db-" + str(self._interval[0]) + "-" + str(self._interval[-1])
        self._logger = self._logging_util.get_logger(log_path)
        self._fileHandler = self._logging_util.get_file_handler(self._logger, log_path, "info")

        try:
            self._querier = GitHubQuerier(self._url, self._token, self._logger)
            self._dao = GitHubDao(self._config, self._logger)
            self.extract()
        except Exception:
            self._logger.error("GitHubIssue2Db failed", exc_info=True)
        finally:
            if self._dao:
                self._dao.close_connection()

    def _insert_attachments(self, attachments, message_id):
        #inserts attachments
        pos = 0
        for attachment in attachments:
            attachment_name = self._querier.get_attachment_name(attachment)
            attachment_own_id = self._querier.generate_attachment_id(message_id, pos)
            attachment_url = self._querier.get_attachment_url(attachment)
            self._dao.insert_attachment(attachment_own_id, message_id, attachment_name, attachment_url)
            pos += 1

    def _find_mentioner_user(self, issue_own_id, actor, created_at):
        #finds the mentioner user
        mentioner = None
        issue = self._querier.get_issue(issue_own_id)

        candidates = []

        if actor:
            if "@" + actor in self._querier.get_issue_body(issue):
                issue_creation = self._querier.get_issue_creation_time(issue)
                #if issue_creation <= created_at:
                candidates.append((self._querier.get_issue_creator(issue), issue_creation))

            for c in self._querier.get_issue_comments(issue):
                if "@" + actor in self._querier.get_issue_comment_body(c):
                    #if c.created_at <= created_at:
                    candidates.append((c.user, c.created_at))

            if candidates:
                found = min(candidates, key=lambda candidate: abs(candidate[1] - created_at))
                mentioner = found[0]
            else:
                self._logger.warning("mentioner not found for issue " + str(issue_own_id))
        #it may happen that the actor is no longer part of GitHub; in that case, to detect the mentioner,
        #the datetime of the mentioned event is compared with the creation times of the issue and its comments
        else:
            if self._querier.get_issue_creation_time(issue) == created_at:
                mentioner = self._querier.get_issue_creator(issue)
            else:
                found = [c for c in self._querier.get_issue_comments(issue) if c.created_at == created_at]

                if found:
                    if len(found) == 1:
                        mentioner = found[0].user
                    else:
                        self._logger.warning("multiple mentioners for issue " + str(issue_own_id))

        if not mentioner:
            self._logger.warning("mentioner not found for issue " + str(issue_own_id))

        return mentioner

    def _extract_history(self, issue_id, issue_own_id, history):
        #inserts the history of an issue

        for event in history:
            try:
                created_at = self._querier.get_event_creation_time(event)
                actor = self._querier.get_event_actor(event)
                actor_id = self._dao.get_user_id(self._querier.get_user_name(actor), self._querier.get_user_email(actor))
                action = event.event

                if action in ["opened", "edited", "closed", "reopened"]:
                    self._dao.insert_event_type(action)
                    event_type_id = self._dao.select_event_type(action)
                    self._dao.insert_issue_event(issue_id, event_type_id, action, actor_id, created_at, None)
                elif action in ["labeled", "unlabeled"]:
                    self._dao.insert_event_type(action)
                    event_type_id = self._dao.select_event_type(action)
                    self._dao.insert_issue_event(issue_id, event_type_id, event._rawData.get('label').get('name').lower(), actor_id, created_at, None)
                elif action in ["mentioned"]:
                    self._dao.insert_event_type(action)
                    event_type_id = self._dao.select_event_type(action)
                    user_mentioner = self._find_mentioner_user(issue_own_id, self._querier.get_user_name(actor), created_at)
                    user_id = self._dao.get_user_id(self._querier.get_user_name(user_mentioner), self._querier.get_user_email(user_mentioner))
                    self._dao.insert_issue_event(issue_id, event_type_id, self._querier.get_user_name(user_mentioner), user_id, created_at, actor_id)
                elif action in ["subscribed"]:
                    self._dao.insert_event_type(action)
                    event_type_id = self._dao.select_event_type(action)
                    self._dao.insert_issue_event(issue_id, event_type_id, action, actor_id, created_at, None)
                elif action in ["assigned", "unassigned"]:
                    self._dao.insert_event_type(action)
                    event_type_id = self._dao.select_event_type(action)

                    assignee_login = event._rawData.get('assignee').get('login')
                    assignee = self._querier.find_user(assignee_login)
                    if assignee:
                        assignee_id = self._dao.get_user_id(self._querier.get_user_name(assignee), self._querier.get_user_email(assignee))
                    else:
                        assignee_id = self._dao.get_user_id(assignee_login, None)

                    assigner_login = event._rawData.get('assigner').get('login')
                    assigner = self._querier.find_user(assigner_login)
                    if assigner:
                        assigner_id = self._dao.get_user_id(self._querier.get_user_name(assigner), self._querier.get_user_email(assigner))
                    else:
                        assigner_id = self._dao.get_user_id(assigner_login, None)

                    self._dao.insert_issue_event(issue_id, event_type_id, action, assigner_id, created_at, assignee_id)


            except Exception:
                self._logger.warning("event at (" + str(created_at) + ") not extracted for issue id: " + str(issue_id) + " - tracker id " + str(self._issue_tracker_id), exc_info=True)

    def _extract_subscribers(self, issue_id, subscribers):
        #inserts subscribers of an issue
        for subscriber in subscribers:
            try:
                subscriber_id = self._dao.get_user_id(self._querier.get_user_name(subscriber), self._querier.get_user_email(subscriber))
                self._dao.insert_subscriber(issue_id, subscriber_id)
            except Exception:
                self._logger.warning("subscriber (" + subscriber.login + ") not inserted for issue id: " + str(issue_id) + " - tracker id " + str(self._issue_tracker_id), exc_info=True)

    def _extract_assignees(self, issue_id, assignees):
        #inserts the assignees of an issue
        for assignee in assignees:
            try:
                assignee_login = assignee.get('login')
                assignee = self._querier.find_user(assignee_login)

                if assignee:
                    assignee_id = self._dao.get_user_id(self._querier.get_user_name(assignee), self._querier.get_user_email(assignee))
                else:
                    assignee_id = self._dao.get_user_id(assignee_login, None)

                self._dao.insert_assignee(issue_id, assignee_id)
            except Exception:
                self._logger.warning("assignee (" + assignee.login + ") not inserted for issue id: " + str(issue_id) + " - tracker id " + str(self._issue_tracker_id), exc_info=True)

    def _extract_first_comment(self, issue_id, issue):
        #inserts first issue comment
        created_at = self._querier.get_issue_creation_time(issue)
        author = self._querier.get_issue_creator(issue)
        author_id = self._dao.get_user_id(self._querier.get_user_name(author), self._querier.get_user_email(author))
        body = self._querier.get_issue_body(issue)
        self._dao.insert_issue_comment(0, 0, self._dao.get_message_type_id("comment"), issue_id, body, None, author_id, created_at)

    def _extract_comments(self, issue_id, issue, comments):
        #inserts the comments of an issue
        self._extract_first_comment(issue_id, issue)
        pos = 1
        for comment in comments:
            try:
                own_id = self._querier.get_issue_comment_id(comment)
                body = self._querier.get_issue_comment_body(comment)
                author = self._querier.get_issue_comment_author(comment)
                author_id = self._dao.get_user_id(self._querier.get_user_name(author), self._querier.get_user_email(author))
                created_at = self._querier.get_issue_comment_creation_time(comment)
                self._dao.insert_issue_comment(own_id, pos, self._dao.get_message_type_id("comment"), issue_id, body, None, author_id, created_at)

                attachments = self._querier.get_attachments(body)
                if attachments:
                    issue_comment_id = self._dao.select_issue_comment_id(own_id, issue_id, created_at)
                    self._insert_attachments(attachments, issue_comment_id)
            except Exception:
                self._logger.warning("comment(" + str(pos) + ") not extracted for issue id: " + str(issue_id) + " - tracker id " + str(self._issue_tracker_id), exc_info=True)
                continue

            pos += 1

    def _extract_labels(self, issue_id, labels):
        #inserts the labels of an issue
        for label in labels:
            try:
                digested_label = re.sub(r"^\W+", "", re.sub(r"\W+$", "", label.lower()))
                self._dao.insert_label(digested_label.strip())
                label_id = self._dao.select_label_id(digested_label)
                self._dao.assign_label_to_issue(issue_id, label_id)
            except Exception:
                self._logger.warning("label (" + label + ") not extracted for issue id: " + str(issue_id) + " - tracker id " + str(self._issue_tracker_id), exc_info=True)

    def _extract_issue_commit_dependency(self, issue_id, commits):
        #inserts the dependencies between an issue and commits
        for sha in commits:
            commit_id = self._dao.select_commit(sha, self._repo_id)
            if commit_id:
                self._dao.insert_issue_commit_dependency(issue_id, commit_id)

    def _get_issue_info(self, issue_own_id):
        #processes a single issue
        flag_insert_issue_data = False

        issue = self._querier.get_issue(issue_own_id)
        summary = self._querier.get_issue_summary(issue)
        component = None
        version = self._querier.get_issue_version(issue)
        hardware = None
        priority = None
        severity = None
        created_at = self._querier.get_issue_creation_time(issue)
        last_change_at = self._querier.get_issue_last_change_time(issue)

        reference_id = self._dao.find_reference_id(version, issue_own_id, self._repo_id)
        user = self._querier.get_issue_creator(issue)
        user_id = self._dao.get_user_id(self._querier.get_user_name(user), self._querier.get_user_email(user))

        stored_issue_last_change = self._dao.select_last_change_issue(issue_own_id, self._issue_tracker_id, self._repo_id)
        if stored_issue_last_change:
            if last_change_at != stored_issue_last_change:
                flag_insert_issue_data = True
                self._dao.update_issue(issue_own_id, self._issue_tracker_id, summary, component, version, hardware, priority, severity, reference_id, last_change_at)
        else:
            flag_insert_issue_data = True
            self._dao.insert_issue(issue_own_id, self._issue_tracker_id, summary, component, version, hardware, priority, severity, reference_id, user_id, created_at, last_change_at)

        if flag_insert_issue_data:
            issue_id = self._dao.select_issue_id(issue_own_id, self._issue_tracker_id, self._repo_id)

            try:
                self._extract_labels(issue_id, self._querier.get_issue_tags(issue))
            except Exception:
                self._logger.error("GitHubError when extracting tags for issue id: " + str(issue_id) + " - tracker id " + str(self._issue_tracker_id), exc_info=True)

            try:
                self._extract_comments(issue_id, issue, self._querier.get_issue_comments(issue))
            except Exception:
                self._logger.error("GitHubError when extracting comments for issue id: " + str(issue_id) + " - tracker id " + str(self._issue_tracker_id), exc_info=True)

            try:
                issue_history = self._querier.get_issue_history(issue)
                self._extract_history(issue_id, issue_own_id, issue_history)
                self._extract_subscribers(issue_id, self._querier.get_issue_subscribers(issue_history))
                self._extract_assignees(issue_id, self._querier.get_issue_assignees(issue_history))
                self._extract_issue_commit_dependency(issue_id, self._querier.get_commit_dependencies(issue_history))
            except Exception:
                self._logger.error("GitHubError when extracting history for issue id: " + str(issue_id) + " - tracker id " + str(self._issue_tracker_id), exc_info=True)

    def _get_issues(self):
        #processes issues
        for issue_id in self._interval:
            try:
                self._get_issue_info(issue_id)
            except Exception:
                self._logger.error("something went wrong for issue id: " + str(issue_id) + " - tracker id " + str(self._issue_tracker_id), exc_info=True)

    def extract(self):
        """
        extracts GitHub issue data and stores it in the DB
        """
        try:
            self._logger.info("GitHubIssue2Db started")
            start_time = datetime.now()
            self._get_issues()

            end_time = datetime.now()
            minutes_and_seconds = self._logging_util.calculate_execution_time(end_time, start_time)
            self._logger.info("GitHubIssue2Db finished after " + str(minutes_and_seconds[0])
                           + " minutes and " + str(round(minutes_and_seconds[1], 1)) + " secs")
            self._logging_util.remove_file_handler_logger(self._logger, self._fileHandler)
        except Exception:
            self._logger.error("GitHubIssue2Db failed", exc_info=True)
        finally:
            if self._dao:
                self._dao.close_connection()
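
Instances of this class are plain callables, so they are meant to be pushed onto the same consumer/poison-pill pipeline used by the update classes in this file. A sketch under that assumption (ids, token and config values are placeholders; multiprocessing and multiprocessing_util are the module-level imports already used above):

config = {'host': 'localhost', 'port': 3306, 'user': 'root', 'password': 'root'}  # illustrative
issue_worker = GitHubIssue2Db("db_gitana", 1, 1, "owner/repo", [1, 2, 3],
                              "<github-token>", config, "logs/import-github")
queue_extractors = multiprocessing.JoinableQueue()
results = multiprocessing.Queue()
multiprocessing_util.start_consumers(1, queue_extractors, results)  # one worker process
queue_extractors.put(issue_worker)                                  # the consumer invokes issue_worker()
multiprocessing_util.add_poison_pills(1, queue_extractors)
queue_extractors.join()
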
Example 24
class Git2DbUpdate():
    """
    This class handles the update of Git data
    """

    NUM_PROCESSES = 5

    def __init__(self, db_name, project_name,
                 repo_name, git_repo_path, before_date,
                 num_processes, config, log_root_path):
        """
        :type db_name: str
        :param db_name: the name of an existing DB

        :type project_name: str
        :param project_name: the name of an existing project in the DB

        :type repo_name: str
        :param repo_name: the name of the Git repository to import

        :type git_repo_path: str
        :param git_repo_path: the local path of the Git repository

        :type before_date: str
        :param before_date: import data before date (YYYY-mm-dd)

        :type num_processes: int
        :param num_processes: number of processes to import the data (default 5)

        :type config: dict
        :param config: the DB configuration file

        :type log_root_path: str
        :param log_root_path: the log path
        """
        self._log_path = log_root_path + "import-git-" + db_name + "-" + project_name + "-" + repo_name
        self._git_repo_path = git_repo_path
        self._project_name = project_name
        self._db_name = db_name
        self._repo_name = repo_name
        self._before_date = before_date
        self._existing_refs = []

        if num_processes:
            self._num_processes = num_processes
        else:
            self._num_processes = Git2DbUpdate.NUM_PROCESSES

        config.update({'database': db_name})
        self._config = config

        self._logging_util = LoggingUtil()

        self._logger = None
        self._fileHandler = None
        self._querier = None
        self._dao = None

    def _update_existing_references(self, repo_id, import_type):
        #updates existing references in the DB
        cursor = self._dao.get_cursor()
        query = "SELECT c.sha, lc.ref_id " \
                "FROM commit c " \
                "JOIN (SELECT ref_id, max(commit_id) as last_commit_id_in_ref FROM commit_in_reference WHERE repo_id = %s GROUP BY ref_id) as lc " \
                "ON c.id = lc.last_commit_id_in_ref"
        arguments = [repo_id]
        self._dao.execute(cursor, query, arguments)

        queue_references = multiprocessing.JoinableQueue()
        results = multiprocessing.Queue()

        # Start consumers
        multiprocessing_util.start_consumers(self._num_processes, queue_references, results)

        row = self._dao.fetchone(cursor)
        while row:
            sha = row[0]
            ref_id = row[1]
            row = self._dao.fetchone(cursor)

            ref_name = self._dao.select_reference_name(repo_id, ref_id)

            for reference in self._querier.get_references():
                reference_name = reference[0]
                if reference_name == ref_name:
                    self._existing_refs.append(ref_name)

                    git_ref_extractor = Git2DbReference(self._db_name, repo_id, self._git_repo_path,
                                                        self._before_date, import_type, reference[0], sha,
                                                        self._config, self._log_path)

                    queue_references.put(git_ref_extractor)
                    break

        self._dao.close_cursor(cursor)

        # Add end-of-queue markers
        multiprocessing_util.add_poison_pills(self._num_processes, queue_references)

        # Wait for all of the tasks to finish
        queue_references.join()

    def _update_repo(self, repo_id, import_type):
        #updates Git data
        self._update_existing_references(repo_id, import_type)

    def _get_import_type(self, repo_id):
        #gets import type
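        #note: the two DAO checks below presumably return 0 or 1, so the result ranges from
        #1 (line details and patches already stored) to 3 (both tables empty); this reading
        #is inferred from the arithmetic, it is not documented in this file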
        import_type = 1
        import_type += self._dao.line_detail_table_is_empty(repo_id) + self._dao.file_modification_patch_is_empty(repo_id)
        return import_type

    def update(self):
        """
        updates the Git data stored in the DB
        """
        try:
            self._logger = self._logging_util.get_logger(self._log_path)
            self._fileHandler = self._logging_util.get_file_handler(self._logger, self._log_path, "info")

            self._logger.info("Git2DbUpdate started")
            start_time = datetime.now()

            self._querier = GitQuerier(self._git_repo_path, self._logger)
            self._dao = GitDao(self._config, self._logger)

            project_id = self._dao.select_project_id(self._project_name)
            repo_id = self._dao.select_repo_id(self._repo_name)
            self._update_repo(repo_id, self._get_import_type(repo_id))
            self._dao.restart_connection()
            self._dao.fix_commit_parent_table(repo_id)

            end_time = datetime.now()
            minutes_and_seconds = self._logging_util.calculate_execution_time(end_time, start_time)
            self._logger.info("Git2DbUpdate finished after " + str(minutes_and_seconds[0])
                         + " minutes and " + str(round(minutes_and_seconds[1], 1)) + " secs")
            self._logging_util.remove_file_handler_logger(self._logger, self._fileHandler)
        except Exception:
            self._logger.error("Git2DbUpdate failed", exc_info=True)
        finally:
            if self._dao:
                self._dao.close_connection()
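
A usage sketch of the Git updater; paths and names are illustrative, and before_date=None is assumed to mean "no date limit":

config = {'host': 'localhost', 'port': 3306, 'user': 'root', 'password': 'root'}  # illustrative
git_update = Git2DbUpdate("db_gitana", "my-project", "my-repo",
                          "/path/to/local/repo", None,  # before_date
                          5, config, "logs/")
git_update.update()
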
Example 25
class GitHubUtil():
    """
    This class helps mapping the identities of the users in the vcs and GitHub
    """
    def __init__(self, db_name, project_name, repo_name, github_repo_full_name,
                 tokens, config, log_root_path):
        """
        :type db_name: str
        :param db_name: the name of an existing DB

        :type project_name: str
        :param project_name: the name of an existing project in the DB

        :type repo_name: str
        :param repo_name: the name of an existing repository in the DB

        :type github_repo_full_name: str
        :param github_repo_full_name: full name of the GitHub repository

        :type tokens: list str
        :param tokens: list of GitHub tokens

        :type config: dict
        :param config: the DB configuration file

        :type log_root_path: str
        :param log_root_path: the log path
        """
        self._log_path = log_root_path + "map-vcs-github-users-" + db_name + "-" + project_name + "-" + repo_name
        self._project_name = project_name
        self._db_name = db_name
        self._repo_name = repo_name
        self._tokens = tokens
        self._active_token = 0
        self._url = github_repo_full_name

        config.update({'database': db_name})
        self._config = config

        self._logging_util = LoggingUtil()
        self._logger = self._logging_util.get_logger(self._log_path)
        self._db_util = DbUtil()
        self._cnx = self._db_util.get_connection(self._config)
        self._git_dao = GitDao(self._config, self._logger)
        self._github_querier = GitHubQuerier(self._url,
                                             self._tokens[self._active_token],
                                             self._logger)

    def _change_token(self):
        if len(self._tokens) > 1:
            if not self._github_querier._token_util._is_usuable(
                    self._tokens[self._active_token]):
                self._active_token = (self._active_token + 1) % len(
                    self._tokens)
                self._github_querier = GitHubQuerier(
                    self._url, self._tokens[self._active_token], self._logger)

    def _analyse_user(self, user, unmatched_user, sha):
        if user:
            user_name = self._github_querier.get_user_name(user)
            user_ids = self._db_util.select_all_user_ids_by_name(
                self._cnx, user_name, self._logger)

            for user_id in user_ids:
                try:
                    user_id, alias_id = self._db_util._identify_user_and_alias(
                        self._cnx, unmatched_user, user_id, self._logger)
                    if user_id != alias_id:
                        self._db_util.insert_user_alias(
                            self._cnx, user_id, alias_id, self._logger)
                        self._logger.info("user ids " + str(user_id) +
                                          " and " + str(alias_id) +
                                          " successfully matched")
                except Exception:
                    self._logger.error("user id " + str(user_id) + " not matched with user " +
                                       str(unmatched_user), exc_info=True)
                    continue
        else:
            self._logger.warning("GitHub user not found for commit " + sha)

    def match(self):
        """
        matches GitHub and Git identities
        """
        try:

            self._fileHandler = self._logging_util.get_file_handler(
                self._logger, self._log_path, "info")

            self._logger.info("GitHubUtil started")
            start_time = datetime.now()
            repo_id = self._git_dao.select_repo_id(self._repo_name)
            user_ids = self._git_dao.select_all_developer_ids(repo_id)
            alias_ids = self._db_util.select_all_aliased_user_ids(
                self._cnx, self._logger)
            unmatched_users = list(set(user_ids) - set(alias_ids))

            for unmatched_user in unmatched_users:
                matched = False
                sha = self._git_dao.select_sha_commit_by_user(
                    unmatched_user, repo_id, match_on="author")
                if sha:
                    author = self._github_querier.get_author_by_commit(sha)
                    self._analyse_user(author, unmatched_user, sha)
                    matched = True
                else:
                    sha = self._git_dao.select_sha_commit_by_user(
                        unmatched_user, repo_id, match_on="committer")
                    if sha:
                        committer = self._github_querier.get_committer_by_commit(
                            sha)
                        self._analyse_user(committer, unmatched_user, sha)
                        matched = True

                if not matched:
                    self._logger.warning("No commits found for user " +
                                         str(unmatched_user))

            end_time = datetime.now()
            minutes_and_seconds = self._logging_util.calculate_execution_time(
                end_time, start_time)
            self._logger.info("GitHubUtil finished after " +
                              str(minutes_and_seconds[0]) + " minutes and " +
                              str(round(minutes_and_seconds[1], 1)) + " secs")
            self._logging_util.remove_file_handler_logger(
                self._logger, self._fileHandler)

        except Exception:
            self._logger.error("GitHubUtil failed", exc_info=True)
        finally:
            if self._git_dao:
                self._git_dao.close_connection()

            if self._cnx:
                self._db_util.close_connection(self._cnx)
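
A usage sketch of the identity matcher; the repository full name and the token are placeholders:

config = {'host': 'localhost', 'port': 3306, 'user': 'root', 'password': 'root'}  # illustrative
gh_util = GitHubUtil("db_gitana", "my-project", "my-repo",
                     "owner/repo", ["<github-token>"], config, "logs/")
gh_util.match()
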
Example 26
class FileUtilWrapper():
    """
    This class wraps the operations provided by the FileUtil class
    """
    def __init__(self, db_name, config, log_root_path):
        """
        :type db_name: str
        :param db_name: the name of an existing DB

        :type config: dict
        :param config: the DB configuration file

        :type log_root_path: str
        :param log_root_path: the log path
        """
        self._log_path = log_root_path + "file-util-wrapper" + db_name
        self._db_name = db_name

        config.update({'database': db_name})
        self._config = config

        self._logging_util = LoggingUtil()
        self._logger = self._logging_util.get_logger(self._log_path)

    def get_file_history(self,
                         repo_name,
                         file_name,
                         reference_name,
                         reversed=False,
                         before_date=None):
        """
        get file history for a given file name within a reference. Optionally, the history can be retrieved before a given date

        :type repo_name: str
        :param repo_name: the name of an existing repository in the DB

        :type file_name: str
        :param file_name: the name of the target file

        :type reference_name: str
        :param reference_name: the name of the reference

        :type reversed: bool
        :param reversed: if True, it returns the changes from the most recent to the earliest

        :type before_date: str (YYYY-mm-dd)
        :param before_date: if not null, it returns only the changes that occurred before the given date
        """
        history = []
        try:
            self._fileHandler = self._logging_util.get_file_handler(
                self._logger, self._log_path, "info")
            self._logger.info("FileUtilWrapper started")
            start_time = datetime.now()

            file_util = FileUtil(self._config, self._logger)
            history = file_util.get_file_history_by_name(
                repo_name, file_name, reference_name, reversed, before_date)

            end_time = datetime.now()
            minutes_and_seconds = self._logging_util.calculate_execution_time(
                end_time, start_time)
            self._logger.info("FileUtilWrapper finished after " +
                              str(minutes_and_seconds[0]) + " minutes and " +
                              str(round(minutes_and_seconds[1], 1)) + " secs")
            self._logging_util.remove_file_handler_logger(
                self._logger, self._fileHandler)
        except Exception:
            self._logger.error("FileUtilWrapper failed", exc_info=True)
        finally:
            return history

    def get_file_version(self,
                         repo_name,
                         file_name,
                         reference_name,
                         before_date=None):
        """
        get file version for a given file name within a reference. Optionally, the version can be retrieved before a given date

        :type repo_name: str
        :param repo_name: the name of an existing repository in the DB

        :type file_name: str
        :param file_name: the name of the target file

        :type reference_name: str
        :param reference_name: the name of the reference

        :type before_date: str (YYYY-mm-dd)
        :param before_date: if not null, it returns the last version of the file before the given date
        """
        content = ""
        try:
            self._fileHandler = self._logging_util.get_file_handler(
                self._logger, self._log_path, "info")
            self._logger.info("FileUtilWrapper started")
            start_time = datetime.now()

            file_util = FileUtil(self._config, self._logger)
            content = file_util.get_file_version_by_name(
                repo_name, file_name, reference_name, before_date)

            end_time = datetime.now()
            minutes_and_seconds = self._logging_util.calculate_execution_time(
                end_time, start_time)
            self._logger.info("FileUtilWrapper finished after " +
                              str(minutes_and_seconds[0]) + " minutes and " +
                              str(round(minutes_and_seconds[1], 1)) + " secs")
            self._logging_util.remove_file_handler_logger(
                self._logger, self._fileHandler)
        except Exception:
            self._logger.error("FileUtilWrapper failed", exc_info=True)
        finally:
            return content
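
A usage sketch of the wrapper; repository, file and reference names are illustrative:

config = {'host': 'localhost', 'port': 3306, 'user': 'root', 'password': 'root'}  # illustrative
file_util_wrapper = FileUtilWrapper("db_gitana", config, "logs/")
history = file_util_wrapper.get_file_history("my-repo", "src/main.py", "master")
content = file_util_wrapper.get_file_version("my-repo", "src/main.py", "master",
                                             before_date="2016-01-01")
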
Example 27
class Slack2DbMain():
    """
    This class handles the import of Slack data
    """
    def __init__(self, db_name, project_name, type, instant_messaging_name,
                 before_date, channels, tokens, config, log_root_path):
        """
        :type db_name: str
        :param db_name: the name of an existing DB

        :type project_name: str
        :param project_name: the name of an existing project in the DB

        :type type: str
        :param type: type of the instant messaging (Slack, IRC)

        :type instant_messaging_name: str
        :param instant_messaging_name: the name of the instant messaging to import

        :type channels: list str
        :param channels: list of channels to import

        :type before_date: str
        :param before_date: import data before date (YYYY-mm-dd)

        :type tokens: list str
        :param tokens: list of Slack tokens

        :type config: dict
        :param config: the DB configuration file

        :type log_root_path: str
        :param log_root_path: the log path
        """
        self._log_path = log_root_path + "import-slack-" + db_name + "-" + project_name + "-" + instant_messaging_name
        self._type = type
        self._instant_messaging_name = instant_messaging_name
        self._project_name = project_name
        self._db_name = db_name
        self._channels = channels
        self._before_date = before_date
        self._tokens = tokens

        config.update({'database': db_name})
        self._config = config

        self._logging_util = LoggingUtil()

        self._logger = None
        self._fileHandler = None
        self._querier = None
        self._dao = None

    def _get_channel_ids(self, instant_messaging_id):
        #get data source channel ids
        channel_ids = []
        channel_own_ids = self._querier.get_channel_ids(
            self._before_date, self._channels)
        for own_id in channel_own_ids:
            channel = self._querier.get_channel(own_id)
            last_change_at = self._querier.get_channel_last_change_at(channel)

            if self._dao.get_channel_last_change_at(
                    own_id, instant_messaging_id) != last_change_at:
                name = self._querier._get_channel_name(channel)
                description = self._querier.get_channel_description(channel)
                created_at = self._querier._get_channel_created_at(channel)

                channel_id = self._dao.insert_channel(own_id,
                                                      instant_messaging_id,
                                                      name, description,
                                                      created_at,
                                                      last_change_at)
                channel_ids.append(channel_id)

        return channel_ids

    def _get_channels(self, instant_messaging_id):
        #processes Slack channels
        channel_ids = self._get_channel_ids(instant_messaging_id)

        intervals = [
            i for i in multiprocessing_util.get_tasks_intervals(
                channel_ids, len(self._tokens)) if len(i) > 0
        ]

        queue_extractors = multiprocessing.JoinableQueue()
        results = multiprocessing.Queue()

        # Start consumers
        multiprocessing_util.start_consumers(len(self._tokens),
                                             queue_extractors, results)

        pos = 0
        for interval in intervals:
            topic_extractor = SlackChannel2Db(self._db_name,
                                              instant_messaging_id, interval,
                                              self._tokens[pos], self._config,
                                              self._log_path)
            queue_extractors.put(topic_extractor)
            pos += 1

        # Add end-of-queue markers
        multiprocessing_util.add_poison_pills(len(self._tokens),
                                              queue_extractors)

        # Wait for all of the tasks to finish
        queue_extractors.join()

    def extract(self):
        """
        extracts Slack data and stores it in the DB
        """
        try:
            self._logger = self._logging_util.get_logger(self._log_path)
            self._fileHandler = self._logging_util.get_file_handler(
                self._logger, self._log_path, "info")

            self._logger.info("SlackDbMain started")
            start_time = datetime.now()

            self._querier = SlackQuerier(self._tokens[0], self._logger)
            self._dao = SlackDao(self._config, self._logger)

            project_id = self._dao.select_project_id(self._project_name)
            instant_messaging_id = self._dao.insert_instant_messaging(
                project_id, self._instant_messaging_name, self._type)
            self._get_channels(instant_messaging_id)

            end_time = datetime.now()
            minutes_and_seconds = self._logging_util.calculate_execution_time(
                end_time, start_time)
            self._logger.info("SlackDbMain extract finished after " +
                              str(minutes_and_seconds[0]) + " minutes and " +
                              str(round(minutes_and_seconds[1], 1)) + " secs")

            self._logging_util.remove_file_handler_logger(
                self._logger, self._fileHandler)
        except Exception:
            self._logger.error("Slack2DbMain extract failed", exc_info=True)
        finally:
            if self._dao:
                self._dao.close_connection()
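
A usage sketch of the Slack importer; the workspace name, channel list and token are placeholders:

config = {'host': 'localhost', 'port': 3306, 'user': 'root', 'password': 'root'}  # illustrative
slack_main = Slack2DbMain("db_gitana", "my-project", "slack", "my-workspace",
                          None, ["general", "random"], ["<slack-token>"],
                          config, "logs/")
slack_main.extract()
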
Example 28
class BugzillaIssue2Db(object):
    """
    This class handles the import of Bugzilla issues
    """
    def __init__(self, db_name, repo_id, issue_tracker_id, url, product,
                 interval, config, log_root_path):
        """
        :type db_name: str
        :param db_name: the name of an existing DB

        :type repo_id: int
        :param repo_id: the id of an existing repository in the DB

        :type issue_tracker_id: int
        :param issue_tracker_id: the id of an existing issue tracker in the DB

        :type url: str
        :param url: the URL of the bugzilla issue tracker

        :type product: str
        :param product: the name of the product in the bugzilla issue tracker

        :type interval: list int
        :param interval: a list of issue ids to import

        :type config: dict
        :param config: the DB configuration file

        :type log_root_path: str
        :param log_root_path: the log path
        """
        self._log_root_path = log_root_path
        self._url = url
        self._product = product
        self._db_name = db_name
        self._repo_id = repo_id
        self._issue_tracker_id = issue_tracker_id
        self._interval = interval
        self._config = config

        self._logging_util = LoggingUtil()
        self._date_util = DateUtil()

        self._fileHandler = None
        self._logger = None
        self._querier = None
        self._dao = None

    def __call__(self):
        try:
            log_path = self._log_root_path + "-issue2db-" + str(
                self._interval[0]) + "-" + str(self._interval[-1])
            self._logger = self._logging_util.get_logger(log_path)
            self._fileHandler = self._logging_util.get_file_handler(
                self._logger, log_path, "info")

            self._querier = BugzillaQuerier(self._url, self._product,
                                            self._logger)
            self._dao = BugzillaDao(self._config, self._logger)
            self.extract()
        except Exception:
            self._logger.error("BugzillaIssue2Db failed", exc_info=True)
        finally:
            if self._dao:
                self._dao.close_connection()
Example 29
class EclipseTopic2Db(object):
    """
    This class handles the import of Eclipse forum topics
    """

    TOPIC_URL = 'https://www.eclipse.org/forums/index.php/t/'

    def __init__(self, db_name, forum_id, interval, config, log_root_path):
        """
        :type db_name: str
        :param db_name: the name of an existing DB

        :type forum_id: int
        :param forum_id: the id of an existing forum in the DB

        :type interval: list int
        :param interval: a list of topic ids to import

        :type config: dict
        :param config: the DB configuration file

        :type log_root_path: str
        :param log_root_path: the log path
        """

        self._log_root_path = log_root_path
        self._interval = interval
        self._db_name = db_name
        self._forum_id = forum_id
        self._config = config
        self._fileHandler = None
        self._logger = None
        self._querier = None
        self._dao = None

    def __call__(self):
        self._logging_util = LoggingUtil()
        self._date_util = DateUtil()
        log_path = self._log_root_path + "-topic2db-" + str(
            self._interval[0]) + "-" + str(self._interval[-1])
        self._logger = self._logging_util.get_logger(log_path)
        self._fileHandler = self._logging_util.get_file_handler(
            self._logger, log_path, "info")

        try:
            self._querier = EclipseForumQuerier(None, self._logger)
            self._dao = EclipseForumDao(self._config, self._logger)
            self.extract()
        except Exception:
            self._logger.error("EclipseTopic2Db failed", exc_info=True)
        finally:
            if self._dao:
                self._dao.close_connection()

    def _get_message_attachments_info(self, message_id, message):
        #gets attachment information of a message
        attachments = self._querier.message_get_attachments(message)

        for a in attachments:
            url = self._querier.get_attachment_url(a)
            own_id = self._querier.get_attachment_own_id(a)
            name = self._querier.get_attachment_name(a)
            extension = name.split('.')[-1].strip().lower()
            size = self._querier.get_attachment_size(a)

            self._dao.insert_message_attachment(url, own_id, name, extension,
                                                size, message_id)

    def _get_message_info(self, topic_id, message, pos):
        #get information of topic messages
        own_id = self._querier.get_message_own_id(message)
        created_at = self._date_util.get_timestamp(
            self._querier.get_created_at(message), "%a, %d %B %Y %H:%M")
        body = self._querier.get_message_body(message)
        author_name = self._querier.get_message_author_name(message)
        message_id = self._dao.insert_message(
            own_id, pos, self._dao.get_message_type_id("reply"), topic_id,
            body, None, self._dao.get_user_id(author_name), created_at)

        if self._querier.message_has_attachments(message):
            self._get_message_attachments_info(message_id, message)

        if pos == 1:
            self._dao.update_topic_created_at(topic_id, created_at,
                                              self._forum_id)

    def extract(self):
        """
        extracts Eclipse forum topic data and stores it in the DB
        """
        self._logger.info("EclipseTopic2Db started")
        start_time = datetime.now()

        for topic_id in self._interval:
            topic_own_id = self._dao.get_topic_own_id(self._forum_id, topic_id)

            self._querier.set_url(EclipseTopic2Db.TOPIC_URL +
                                  str(topic_own_id) + "/")
            self._querier.start_browser()
            time.sleep(3)

            if 'index.php/e/' in self._querier._url:
                self._logger.warning("No URL exists for the topic id " +
                                     str(topic_id) + " - " +
                                     str(self._forum_id))

            next_page = True
            pos = 1

            while next_page:
                messages_on_page = self._querier.get_messages()

                for message in messages_on_page:
                    self._get_message_info(topic_id, message, pos)
                    pos += 1

                next_page = self._querier.go_next_page()

        self._querier.close_browser()
        end_time = datetime.now()
        minutes_and_seconds = self._logging_util.calculate_execution_time(
            end_time, start_time)
        self._logger.info("EclipseTopic2Db finished after " +
                          str(minutes_and_seconds[0]) + " minutes and " +
                          str(round(minutes_and_seconds[1], 1)) + " secs")
        self._logging_util.remove_file_handler_logger(self._logger,
                                                      self._fileHandler)
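
As shown in EclipseForum2DbUpdate._get_topics above, instances of this class are pushed onto a JoinableQueue and executed by consumer processes; a standalone sketch (forum id, interval and config are illustrative):

config = {'host': 'localhost', 'port': 3306, 'user': 'root', 'password': 'root'}  # illustrative
topic_worker = EclipseTopic2Db("db_gitana", 1, [10, 11, 12], config,
                               "logs/update-eclipse-forum")
topic_worker()  # __call__ sets up logging, the querier and the DAO, then runs extract()
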
Example 30
class ActivityReportExporter():
    """
    This class handles the generation of reports
    """

    LOG_FOLDER_PATH = "logs"
    INPUT_PATH = os.path.join(os.path.dirname(resources.__file__), 'queries.json')

    def __init__(self, config, db_name, log_root_path):
        """
        :type config: dict
        :param config: the DB configuration file

        :type db_name: str
        :param db_name: the name of an existing DB

        :type log_root_path: str
        :param log_root_path: the log path
        """
        self._dsl_util = DslUtil()
        self._date_util = DateUtil()
        self._db_util = DbUtil()

        self._logging_util = LoggingUtil()
        self._log_path = log_root_path + "export-report-" + db_name + ".log"
        self._logger = self._logging_util.get_logger(self._log_path)
        self._fileHandler = self._logging_util.get_file_handler(self._logger, self._log_path, "info")

        self._db_name = db_name
        self._config = config
        self._cnx = self._db_util.get_connection(self._config)
        self._db_util.set_database(self._cnx, self._db_name)
        self._db_util.set_settings(self._cnx)

        self._chart_generator = ChartGenerator(self._cnx, self._logger)
        self._html_generator = HtmlGenerator(self._logger)

    def _create_log_folder(self, name):
        #creates the log folder
        if not os.path.exists(name):
            os.makedirs(name)

    def _create_output_file(self, filename):
        #creates the output folder
        if not os.path.exists(os.path.dirname(filename)):
            try:
                os.makedirs(os.path.dirname(filename))
            except OSError as exc: # Guard against race condition
                if exc.errno != errno.EEXIST:
                    raise

    def _load_report_exporter_json(self, json_path):
        #loads the JSON that drives the report export process
        with open(json_path) as json_data:
            data = json.load(json_data)

        return data.get('report')

    def _find_entity_id(self, type, name):
        #finds the id of the tools stored in the DB
        found = None

        if type == "project":
            found = self._db_util.select_project_id(self._cnx, name, self._logger)
        elif type == "repo":
            found = self._db_util.select_repo_id(self._cnx, name, self._logger)
        elif type == "issuetracker":
            found = self._db_util.select_issue_tracker_id(self._cnx, name, self._logger)
        elif type == "forum":
            found = self._db_util.select_forum_id(self._cnx, name, self._logger)
        elif type == "instantmessaging":
            found = self._db_util.select_instant_messaging_id(self._cnx, name, self._logger)

        if not found:
            self._logger.error("ReporExporter: entity " + str(type) + " with name " + str(name) + " not found!")

        return found

    def _get_parameter(self, key, parameters):
        #gets parameters of the JSON
        found = None
        if key in ["AFTERDATE", "INTERVAL"]:
            found = parameters.get(key.lower())
        else:
            if key.endswith("ID"):
                found = parameters.get(key[:-2].lower())
        if not found:
            self._logger.error("ReportExporter: parameter " + str(key) + " not found!")

        return found

    def _load_query_json(self, metric_name, parameters):
        #loads the queries in the JSON configuration file
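        #each entry in queries.json is expected to contain a 'name', a 'query' and placeholder
        #keys (e.g. PROJECTID, AFTERDATE, INTERVAL); every key other than 'name' and 'query'
        #is resolved via _get_parameter and substituted into the query text below
        #(the example placeholder names are inferred from _get_parameter, not listed in this file)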
        with open(ActivityReportExporter.INPUT_PATH) as json_data:
            data = json.load(json_data)

        metrics = data.get('queries')

        try:
            found = [m for m in metrics if m.get('name') == metric_name][0]
            query = found.get('query')

            for k in found.keys():
                if k not in ['name', 'query']:

                    k_value = str(self._get_parameter(k, parameters))

                    query = query.replace(k, k_value)

            return query
        except Exception:
            self._logger.error("ReportExporter: metric " + str(metric_name) + " not found!")

    def _get_activity_name(self, activity):
        #gets the name of the activity
        return activity.replace("_", " ")

    def _get_activity_type(self, activity):
        #gets the type of the activity
        return activity.replace("_activity", "").replace("_", "")

    def _generate_charts(self, activity, activity_data, project_id, time_span):
        #generates charts
        entity2charts = {}
        after_date, interval = self._calculate_time_information(time_span)
        activity_type = self._get_activity_type(activity)
        names = activity_data.get('names')
        measures = activity_data.get('measures')

        for entity_name in names:
            entity_id = self._dsl_util.find_entity_id(self._cnx, activity_type, entity_name, self._logger)
            charts = []
            for measure in measures:
                query = self._load_query_json(measure, {activity_type: entity_id, 'project': project_id, 'afterdate': after_date, 'interval': interval})
                charts.append(self._chart_generator.create(query, interval.lower(), measure, time_span))

            entity2charts.update({entity_name: charts})

        return entity2charts

    def _calculate_time_information(self, time_span):
        #calculates the time span information
        start = None
        interval = None
        current_time = datetime.now() #test datetime.strptime("2015-10-10", "%Y-%m-%d")
        if time_span == "this_week":
            start = self._date_util.get_start_time_span(current_time, "week", "%Y-%m-%d")
            interval = "DAY"
        elif time_span == "this_month":
            start = self._date_util.get_start_time_span(current_time, "month", "%Y-%m-%d")
            interval = "DAY"
        elif time_span == "this_year":
            start = self._date_util.get_start_time_span(current_time, "year", "%Y-%m-%d")
            interval = "MONTH"
        else:
            self._logger.error("ReportExporter: time span " + str(time_span) + " not recognized! Options are: this_week, this_month, this_year")

        return start, interval

    def export(self, file_path, json_path):
        """
        exports the Gitana data to a report

        :type file_path: str
        :param file_path: the path where to export the report

        :type json_path: str
        :param json_path: the path of the JSON that drives the export process
        """
        try:
            self._logger.info("ReportExporter started")
            start_time = datetime.now()

            exporter_data = self._load_report_exporter_json(json_path)

            project_name = exporter_data.get('project')
            project_id = self._dsl_util.find_entity_id(self._cnx, "project", project_name, self._logger)

            time_span = exporter_data.get('time_span')

            activity2charts = {}
            for activity in [attr for attr in exporter_data.keys() if attr.endswith('activity')]:
                activity_name = self._get_activity_name(activity)
                charts = self._generate_charts(activity, exporter_data.get(activity), project_id, time_span)
                activity2charts.update({activity_name: charts})

            html_page = self._html_generator.create(project_name, activity2charts)

            with codecs.open(file_path, 'w', encoding='utf8') as f:
                f.write(html_page)

            self._db_util.close_connection(self._cnx)

            end_time = datetime.now()
            minutes_and_seconds = self._logging_util.calculate_execution_time(end_time, start_time)
            self._logger.info("ReportExporter: process finished after " + str(minutes_and_seconds[0])
                             + " minutes and " + str(round(minutes_and_seconds[1], 1)) + " secs")
            self._logging_util.remove_file_handler_logger(self._logger, self._fileHandler)
        except Exception:
            self._logger.error("ReportExporter failed", exc_info=True)