def extract(self):
    """
    extracts Eclipse forum data and stores it in the DB

    Creates the logger, the forum querier and the DAO, registers the forum
    under the configured project, imports its topics and logs the elapsed
    time. Any Exception raised on the way is logged as
    "EclipseForum2DbMain failed" instead of being propagated. The DB
    connection is closed in every case.
    """
    try:
        self._logger = self._logging_util.get_logger(self._log_path)
        self._fileHandler = self._logging_util.get_file_handler(
            self._logger, self._log_path, "info")

        self._logger.info("EclipseForum2DbMain started")
        start_time = datetime.now()

        self._querier = EclipseForumQuerier(self._url, self._logger)
        self._dao = EclipseForumDao(self._config, self._logger)

        project_id = self._dao.select_project_id(self._project_name)
        forum_id = self._dao.insert_forum(project_id, self._forum_name,
                                          self._type)
        self._get_topics(forum_id)

        end_time = datetime.now()
        minutes_and_seconds = self._logging_util.calculate_execution_time(
            end_time, start_time)
        self._logger.info("EclipseForum2DbMain finished after " +
                          str(minutes_and_seconds[0]) + " minutes and " +
                          str(round(minutes_and_seconds[1], 1)) + " secs")
        self._logging_util.remove_file_handler_logger(
            self._logger, self._fileHandler)
    except Exception:
        # Exception, not a bare except: KeyboardInterrupt and SystemExit
        # must still be able to stop the import instead of being logged
        # and swallowed.
        self._logger.error("EclipseForum2DbMain failed", exc_info=True)
    finally:
        # the DAO is only set once it was created successfully
        if self._dao:
            self._dao.close_connection()
Example #2
0
    def __call__(self):
        """
        runs the topic import for the interval assigned to this instance

        The log path is derived from the log root path plus the first and
        the last topic id of the interval. Any Exception is logged as
        "EclipseTopic2Db failed"; the DB connection is closed afterwards
        if a DAO was created.
        """
        try:
            # <root>-topic2db-<first id>-<last id>
            log_path = "-".join([
                self._log_root_path + "-topic2db",
                str(self._interval[0]),
                str(self._interval[-1]),
            ])

            logger = self._logging_util.get_logger(log_path)
            self._logger = logger
            self._fileHandler = self._logging_util.get_file_handler(
                logger, log_path, "info")

            # the querier gets no URL here
            self._querier = EclipseForumQuerier(None, logger)
            self._dao = EclipseForumDao(self._config, logger)

            self.extract()
        except Exception:
            self._logger.error("EclipseTopic2Db failed", exc_info=True)
        finally:
            dao = self._dao
            if dao:
                dao.close_connection()
Example #3
0
class EclipseTopic2Db(object):
    """
    This class handles the import of Eclipse forum topics

    An instance is a callable task: __init__ only stores the given values,
    while the logging/date helpers, the logger, the querier and the DAO are
    created when the instance is called.
    """

    # base URL of a topic page; the topic own id and a "/" are appended to it
    TOPIC_URL = 'https://www.eclipse.org/forums/index.php/t/'

    def __init__(self, db_name, forum_id, interval, config, log_root_path):
        """
        :type db_name: str
        :param db_name: the name of an existing DB

        :type forum_id: int
        :param forum_id: the id of an existing forum in the DB

        :type interval: list int
        :param interval: a list of topic ids to import

        :type config: dict
        :param config: the DB configuration file

        :type log_root_path: str
        :param log_root_path: the log path
        """

        self._log_root_path = log_root_path
        self._interval = interval
        self._db_name = db_name
        self._forum_id = forum_id
        self._config = config
        self._fileHandler = None
        self._logger = None
        self._querier = None
        self._dao = None

    def __call__(self):
        """
        runs the import of the topics in the interval

        Any Exception raised while importing is logged as
        "EclipseTopic2Db failed" instead of being propagated; the DB
        connection is closed afterwards if a DAO was created.
        """
        self._logging_util = LoggingUtil()
        self._date_util = DateUtil()
        # one log per task: <root>-topic2db-<first id>-<last id>
        log_path = self._log_root_path + "-topic2db-" + str(
            self._interval[0]) + "-" + str(self._interval[-1])
        self._logger = self._logging_util.get_logger(log_path)
        self._fileHandler = self._logging_util.get_file_handler(
            self._logger, log_path, "info")

        try:
            self._querier = EclipseForumQuerier(None, self._logger)
            self._dao = EclipseForumDao(self._config, self._logger)
            self.extract()
        except Exception:
            self._logger.error("EclipseTopic2Db failed", exc_info=True)
        finally:
            if self._dao:
                self._dao.close_connection()

    def _get_message_attachments_info(self, message_id, message):
        #get attachment information of messages
        attachments = self._querier.message_get_attachments(message)

        for a in attachments:
            url = self._querier.get_attachment_url(a)
            own_id = self._querier.get_attachment_own_id(a)
            name = self._querier.get_attachment_name(a)
            # the extension is whatever follows the last dot (the whole
            # name when there is no dot), without surrounding whitespace
            # and lower-cased. strip('') removed nothing, so whitespace
            # used to end up in the stored extension.
            extension = name.split('.')[-1].strip().lower()
            size = self._querier.get_attachment_size(a)

            self._dao.insert_message_attachment(url, own_id, name, extension,
                                                size, message_id)

    def _get_message_info(self, topic_id, message, pos):
        #get information of topic messages
        own_id = self._querier.get_message_own_id(message)
        created_at = self._date_util.get_timestamp(
            self._querier.get_created_at(message), "%a, %d %B %Y %H:%M")
        body = self._querier.get_message_body(message)
        author_name = self._querier.get_message_author_name(message)
        # every message is stored with the "reply" message type
        message_id = self._dao.insert_message(
            own_id, pos, self._dao.get_message_type_id("reply"), topic_id,
            body, None, self._dao.get_user_id(author_name), created_at)

        if self._querier.message_has_attachments(message):
            self._get_message_attachments_info(message_id, message)

        # the timestamp of the first message becomes the topic creation date
        if pos == 1:
            self._dao.update_topic_created_at(topic_id, created_at,
                                              self._forum_id)

    def _extract_topic(self, topic_id):
        #import all the messages of a topic, page by page
        topic_own_id = self._dao.get_topic_own_id(self._forum_id, topic_id)

        self._querier.set_url(EclipseTopic2Db.TOPIC_URL +
                              str(topic_own_id) + "/")
        self._querier.start_browser()
        # presumably gives the page time to load - TODO confirm
        time.sleep(3)

        # NOTE(review): 'index.php/e/' looks like the forum error page;
        # the topic is still processed after the warning - confirm intent
        if 'index.php/e/' in self._querier._url:
            self._logger.warning("No URL exists for the topic id " +
                                 str(topic_id) + " - " +
                                 str(self._forum_id))

        next_page = True
        # message positions start at 1 and keep counting across pages
        pos = 1

        while next_page:
            messages_on_page = self._querier.get_messages()

            for message in messages_on_page:
                self._get_message_info(topic_id, message, pos)
                pos += 1

            next_page = self._querier.go_next_page()

    def extract(self):
        """
        extracts Eclipse forum topic data and stores it in the DB

        Expects the logger, the querier and the DAO to be set already
        (__call__ does that). The browser is closed even when the import
        of a topic fails; the failure itself is propagated to the caller.
        """
        self._logger.info("EclipseTopic2Db started")
        start_time = datetime.now()

        try:
            for topic_id in self._interval:
                self._extract_topic(topic_id)
        finally:
            # do not leave the browser open when a topic import fails
            self._querier.close_browser()

        end_time = datetime.now()
        minutes_and_seconds = self._logging_util.calculate_execution_time(
            end_time, start_time)
        self._logger.info("EclipseTopic2Db finished after " +
                          str(minutes_and_seconds[0]) + " minutes and " +
                          str(round(minutes_and_seconds[1], 1)) + " secs")
        self._logging_util.remove_file_handler_logger(self._logger,
                                                      self._fileHandler)
class EclipseForum2DbUpdate():
    """
    This class handles the update of Eclipse forum data
    """

    # number of processes used when the caller does not provide one
    NUM_PROCESSES = 2

    def __init__(self, db_name, project_name, forum_name, eclipse_forum_url,
                 num_processes, config, log_root_path):
        """
        :type db_name: str
        :param db_name: the name of an existing DB

        :type project_name: str
        :param project_name: the name of an existing project in the DB

        :type forum_name: str
        :param forum_name: the name of an existing forum in the DB to update

        :type eclipse_forum_url: str
        :param eclipse_forum_url: the URL of the forum

        :type num_processes: int
        :param num_processes: number of processes to import the data (default 2)

        :type config: dict
        :param config: the DB configuration file

        :type log_root_path: str
        :param log_root_path: the log path
        """
        self._log_path = log_root_path + "update-eclipse-forum-" + db_name + "-" + project_name + "-" + forum_name
        self._project_name = project_name
        self._url = eclipse_forum_url
        self._db_name = db_name
        self._forum_name = forum_name

        # NOTE: this writes the 'database' key into the dict passed by the
        # caller (no copy is made)
        config.update({'database': db_name})
        self._config = config

        # a falsy value (None, 0) selects the class default
        self._num_processes = (num_processes or
                               EclipseForum2DbUpdate.NUM_PROCESSES)

        self._logging_util = LoggingUtil()
        self._date_util = DateUtil()

        self._logger = None
        self._fileHandler = None
        self._querier = None
        self._dao = None

    def _update_topics_info(self, forum_id):
        # update topics of a given forum
        next_page = True
        while next_page:
            topics_on_page = self._querier.get_topics()

            for topic in topics_on_page:

                topic_own_id = self._querier.get_topic_own_id(topic)
                topic_in_db = self._dao.get_topic_id(topic_own_id, forum_id)

                # only topics already stored in the DB are refreshed;
                # topics missing from the DB are skipped here
                if topic_in_db:
                    views = self._querier.get_topic_views(topic)
                    last_change_at = self._date_util.get_timestamp(
                        self._querier.get_last_change_at(topic),
                        "%a, %d %B %Y %H:%M")
                    self._dao.update_topic_info(topic_in_db, forum_id, views,
                                                last_change_at)

            next_page = self._querier.go_next_page()

    def _get_topics(self, forum_id):
        #update topics of a forum
        topic_ids = self._dao.get_topic_ids(forum_id)

        # nothing to refresh when the forum has no topics in the DB
        if not topic_ids:
            return

        self._update_topics_info(forum_id)

        # split the topic ids among the processes, dropping empty chunks
        intervals = [
            i for i in multiprocessing_util.get_tasks_intervals(
                topic_ids, self._num_processes) if len(i) > 0
        ]

        queue_extractors = multiprocessing.JoinableQueue()
        results = multiprocessing.Queue()

        # Start consumers
        multiprocessing_util.start_consumers(self._num_processes,
                                             queue_extractors, results)

        for interval in intervals:
            topic_extractor = EclipseTopic2Db(self._db_name, forum_id,
                                              interval, self._config,
                                              self._log_path)
            queue_extractors.put(topic_extractor)

        # Add end-of-queue markers
        multiprocessing_util.add_poison_pills(self._num_processes,
                                              queue_extractors)

        # Wait for all of the tasks to finish
        queue_extractors.join()

    def update(self):
        """
        updates the Eclipse forum data stored in the DB

        Any Exception raised on the way is logged as
        "EclipseForum2DbUpdate failed" instead of being propagated. The
        browser (once started) and the DB connection (once opened) are
        released in every case.
        """
        # True only between a successful start_browser() and the regular
        # close_browser() call
        browser_open = False
        try:
            self._logger = self._logging_util.get_logger(self._log_path)
            self._fileHandler = self._logging_util.get_file_handler(
                self._logger, self._log_path, "info")

            self._logger.info("EclipseForum2DbUpdate started")
            start_time = datetime.now()

            self._querier = EclipseForumQuerier(self._url, self._logger)
            self._dao = EclipseForumDao(self._config, self._logger)

            self._querier.start_browser()
            browser_open = True

            project_id = self._dao.select_project_id(self._project_name)
            forum_id = self._dao.select_forum_id(self._forum_name, project_id)

            # an unknown forum is silently skipped
            if forum_id:
                self._get_topics(forum_id)

            # reset first so that a failing close is not retried below
            browser_open = False
            self._querier.close_browser()

            end_time = datetime.now()
            minutes_and_seconds = self._logging_util.calculate_execution_time(
                end_time, start_time)
            self._logger.info("EclipseForum2DbUpdate finished after " +
                              str(minutes_and_seconds[0]) + " minutes and " +
                              str(round(minutes_and_seconds[1], 1)) + " secs")

            self._logging_util.remove_file_handler_logger(
                self._logger, self._fileHandler)
        except Exception:
            # Exception, not a bare except: KeyboardInterrupt and
            # SystemExit must still be able to stop the update
            self._logger.error("EclipseForum2DbUpdate failed", exc_info=True)
        finally:
            try:
                # failure path only: do not leave the browser open
                if browser_open:
                    try:
                        self._querier.close_browser()
                    except Exception:
                        self._logger.error(
                            "EclipseForum2DbUpdate failed to close the browser",
                            exc_info=True)
            finally:
                if self._dao:
                    self._dao.close_connection()
class EclipseForum2DbMain():
    """
    This class handles the import of Eclipse forum data
    """

    # number of processes used when the caller does not provide one
    NUM_PROCESSES = 2

    def __init__(self, db_name, project_name, type, forum_name, url,
                 before_date, num_processes, config, log_root_path):
        """
        :type db_name: str
        :param db_name: the name of an existing DB

        :type project_name: str
        :param project_name: the name of an existing project in the DB

        :type type: str
        :param type: type of the forum (Stackoverflow, Eclipse forum)

        :type forum_name: str
        :param forum_name: the name of the forum to import

        :type url: str
        :param url: the URL of the forum

        :type before_date: str
        :param before_date: import data before date (YYYY-mm-dd)

        :type num_processes: int
        :param num_processes: number of processes to import the data (default 2)

        :type config: dict
        :param config: the DB configuration file

        :type log_root_path: str
        :param log_root_path: the log path
        """
        self._log_path = log_root_path + "import-eclipse-forum-" + db_name + "-" + project_name + "-" + forum_name
        self._type = type
        self._url = url
        self._forum_name = forum_name
        self._project_name = project_name
        self._db_name = db_name
        self._before_date = before_date

        # NOTE: this writes the 'database' key into the dict passed by the
        # caller (no copy is made)
        config.update({'database': db_name})
        self._config = config

        # a falsy value (None, 0) selects the class default
        self._num_processes = (num_processes or
                               EclipseForum2DbMain.NUM_PROCESSES)

        self._logging_util = LoggingUtil()
        self._date_util = DateUtil()

        self._logger = None
        self._fileHandler = None
        self._querier = None
        self._dao = None

    def _get_topic_info(self, forum_id, topic):
        #get topic information
        own_id = self._querier.get_topic_own_id(topic)
        title = self._querier.get_topic_title(topic)
        views = self._querier.get_topic_views(topic)
        last_change_at = self._date_util.get_timestamp(
            self._querier.get_last_change_at(topic), "%a, %d %B %Y %H:%M")

        topic_id = self._dao.select_topic_id(forum_id, own_id)
        # a topic already in the DB is returned as it is: its views and
        # last change date are not refreshed by the import
        if not topic_id:
            if self._before_date:
                # with a before date, only topics created on or before
                # that date are inserted
                topic_created_at = self._querier.get_topic_created_at(topic)
                importable = self._date_util.get_timestamp(
                    topic_created_at,
                    "%a, %d %B %Y") <= self._date_util.get_timestamp(
                        self._before_date, "%Y-%m-%d")
            else:
                importable = True

            if importable:
                self._dao.insert_topic(own_id, forum_id, title, views,
                                       last_change_at)
            # re-read the id; the lookup is repeated even when nothing
            # was inserted
            topic_id = self._dao.select_topic_id(forum_id, own_id)

        return topic_id

    def _get_topic_ids(self, forum_id):
        #get list of topic ids of a forum
        topic_ids = []

        next_page = True
        while next_page:
            topics_on_page = self._querier.get_topics()

            for t in topics_on_page:
                topic_id = self._get_topic_info(forum_id, t)
                topic_ids.append(topic_id)

            next_page = self._querier.go_next_page()

        # None entries (topics without an id in the DB) are dropped
        return [ti for ti in topic_ids if ti is not None]

    def _get_topics(self, forum_id):
        #insert topics to DB
        self._querier.start_browser()
        try:
            topic_ids = self._get_topic_ids(forum_id)
        finally:
            # do not leave the browser open when the topic listing fails
            self._querier.close_browser()

        # split the topic ids among the processes, dropping empty chunks
        intervals = [
            i for i in multiprocessing_util.get_tasks_intervals(
                topic_ids, self._num_processes) if len(i) > 0
        ]

        queue_extractors = multiprocessing.JoinableQueue()
        results = multiprocessing.Queue()

        # Start consumers
        multiprocessing_util.start_consumers(self._num_processes,
                                             queue_extractors, results)

        for interval in intervals:
            topic_extractor = EclipseTopic2Db(self._db_name, forum_id,
                                              interval, self._config,
                                              self._log_path)
            queue_extractors.put(topic_extractor)

        # Add end-of-queue markers
        multiprocessing_util.add_poison_pills(self._num_processes,
                                              queue_extractors)

        # Wait for all of the tasks to finish
        queue_extractors.join()

    def extract(self):
        """
        extracts Eclipse forum data and stores it in the DB

        Any Exception raised on the way is logged as
        "EclipseForum2DbMain failed" instead of being propagated. The DB
        connection is closed in every case.
        """
        try:
            self._logger = self._logging_util.get_logger(self._log_path)
            self._fileHandler = self._logging_util.get_file_handler(
                self._logger, self._log_path, "info")

            self._logger.info("EclipseForum2DbMain started")
            start_time = datetime.now()

            self._querier = EclipseForumQuerier(self._url, self._logger)
            self._dao = EclipseForumDao(self._config, self._logger)

            project_id = self._dao.select_project_id(self._project_name)
            forum_id = self._dao.insert_forum(project_id, self._forum_name,
                                              self._type)
            self._get_topics(forum_id)

            end_time = datetime.now()
            minutes_and_seconds = self._logging_util.calculate_execution_time(
                end_time, start_time)
            self._logger.info("EclipseForum2DbMain finished after " +
                              str(minutes_and_seconds[0]) + " minutes and " +
                              str(round(minutes_and_seconds[1], 1)) + " secs")
            self._logging_util.remove_file_handler_logger(
                self._logger, self._fileHandler)
        except Exception:
            # Exception, not a bare except: KeyboardInterrupt and
            # SystemExit must still be able to stop the import
            self._logger.error("EclipseForum2DbMain failed", exc_info=True)
        finally:
            if self._dao:
                self._dao.close_connection()