def extract(self):
    """
    extracts Eclipse forum data and stores it in the DB

    Errors raised during the import are logged ("EclipseForum2DbMain failed")
    and not re-raised. The DB connection is closed and the log file handler
    is detached whether the import succeeds or fails.
    """
    try:
        self._logger = self._logging_util.get_logger(self._log_path)
        self._fileHandler = self._logging_util.get_file_handler(
            self._logger, self._log_path, "info")

        self._logger.info("EclipseForum2DbMain started")
        start_time = datetime.now()

        self._querier = EclipseForumQuerier(self._url, self._logger)
        self._dao = EclipseForumDao(self._config, self._logger)

        project_id = self._dao.select_project_id(self._project_name)
        forum_id = self._dao.insert_forum(project_id, self._forum_name,
                                          self._type)
        self._get_topics(forum_id)

        end_time = datetime.now()
        minutes_and_seconds = self._logging_util.calculate_execution_time(
            end_time, start_time)
        self._logger.info("EclipseForum2DbMain finished after " +
                          str(minutes_and_seconds[0]) + " minutes and " +
                          str(round(minutes_and_seconds[1], 1)) + " secs")
    except Exception:
        # "except Exception" (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate instead of being silently swallowed.
        failure_logger = self._logger
        if failure_logger is None:
            # The project logger could not be created; fall back to the
            # standard library so the original error is still reported.
            import logging
            failure_logger = logging.getLogger(__name__)
        failure_logger.error("EclipseForum2DbMain failed", exc_info=True)
    finally:
        if self._dao:
            self._dao.close_connection()

        # Detach the file handler on every path (previously it was only
        # removed after a successful run), and clear the reference so it is
        # never removed twice.
        file_handler, self._fileHandler = self._fileHandler, None
        if file_handler is not None:
            self._logging_util.remove_file_handler_logger(
                self._logger, file_handler)
def __call__(self):
    """
    Runs the topic import for the configured interval of topic ids.

    The log path is derived from the log root path plus the first and last
    topic id of the interval. Any error is logged ("EclipseTopic2Db failed")
    and not re-raised; the DB connection is closed in every case.
    """
    try:
        log_path = self._log_root_path + "-topic2db-" + str(
            self._interval[0]) + "-" + str(self._interval[-1])
        self._logger = self._logging_util.get_logger(log_path)
        self._fileHandler = self._logging_util.get_file_handler(
            self._logger, log_path, "info")

        self._querier = EclipseForumQuerier(None, self._logger)
        self._dao = EclipseForumDao(self._config, self._logger)
        self.extract()
    except Exception:
        failure_logger = self._logger
        if failure_logger is None:
            # The failure happened before the project logger existed (for
            # example an empty interval); report through the standard
            # library instead of crashing on a None logger and hiding the
            # original error.
            import logging
            failure_logger = logging.getLogger(__name__)
        failure_logger.error("EclipseTopic2Db failed", exc_info=True)
    finally:
        if self._dao:
            self._dao.close_connection()
class EclipseTopic2Db(object):
    """
    This class handles the import of Eclipse forum topics
    """

    TOPIC_URL = 'https://www.eclipse.org/forums/index.php/t/'

    def __init__(self, db_name, forum_id, interval, config, log_root_path):
        """
        :type db_name: str
        :param db_name: the name of an existing DB

        :type forum_id: int
        :param forum_id: the id of an existing forum in the DB

        :type interval: list int
        :param interval: a list of topic ids to import

        :type config: dict
        :param config: the DB configuration file

        :type log_root_path: str
        :param log_root_path: the log path
        """
        self._log_root_path = log_root_path
        self._interval = interval
        self._db_name = db_name
        self._forum_id = forum_id
        self._config = config

        # Created lazily in __call__
        self._fileHandler = None
        self._logger = None
        self._querier = None
        self._dao = None

    def __call__(self):
        """
        Runs the import for this instance's interval of topic ids.

        Sets up logging (the log path is the log root path plus the first and
        last topic id of the interval), creates the querier and the DAO and
        calls extract(). Errors are logged ("EclipseTopic2Db failed") and not
        re-raised; the DB connection is closed and the log file handler is
        detached in every case.
        """
        self._logging_util = LoggingUtil()
        self._date_util = DateUtil()
        log_path = self._log_root_path + "-topic2db-" + str(
            self._interval[0]) + "-" + str(self._interval[-1])
        self._logger = self._logging_util.get_logger(log_path)
        self._fileHandler = self._logging_util.get_file_handler(
            self._logger, log_path, "info")

        try:
            self._querier = EclipseForumQuerier(None, self._logger)
            self._dao = EclipseForumDao(self._config, self._logger)
            self.extract()
        except Exception:
            self._logger.error("EclipseTopic2Db failed", exc_info=True)
        finally:
            if self._dao:
                self._dao.close_connection()
            # extract() only detaches the handler when it completes, so make
            # sure a failed run does not leave it attached.
            self._release_file_handler()

    def _release_file_handler(self):
        # detach the log file handler from the logger, at most once
        file_handler, self._fileHandler = self._fileHandler, None
        if file_handler is not None:
            self._logging_util.remove_file_handler_logger(
                self._logger, file_handler)

    def _get_message_attachments_info(self, message_id, message):
        # get attachment information of a message and store it in the DB
        for attachment in self._querier.message_get_attachments(message):
            url = self._querier.get_attachment_url(attachment)
            own_id = self._querier.get_attachment_own_id(attachment)
            name = self._querier.get_attachment_name(attachment)
            # Text after the last dot, trimmed and lower-cased. A name
            # without a dot yields the whole name. (The former strip('')
            # removed nothing, so whitespace leaked into the extension.)
            extension = name.split('.')[-1].strip().lower()
            size = self._querier.get_attachment_size(attachment)

            self._dao.insert_message_attachment(url, own_id, name, extension,
                                                size, message_id)

    def _get_message_info(self, topic_id, message, pos):
        # get information of a topic message and store it in the DB
        own_id = self._querier.get_message_own_id(message)
        created_at = self._date_util.get_timestamp(
            self._querier.get_created_at(message), "%a, %d %B %Y %H:%M")
        body = self._querier.get_message_body(message)
        author_name = self._querier.get_message_author_name(message)

        message_id = self._dao.insert_message(
            own_id, pos, self._dao.get_message_type_id("reply"), topic_id,
            body, None, self._dao.get_user_id(author_name), created_at)

        if self._querier.message_has_attachments(message):
            self._get_message_attachments_info(message_id, message)

        # the first message of a topic defines the topic creation date
        if pos == 1:
            self._dao.update_topic_created_at(topic_id, created_at,
                                              self._forum_id)

    def _import_topic_messages(self, topic_id):
        # walk all pages of the currently opened topic and store each message
        pos = 1
        next_page = True
        while next_page:
            for message in self._querier.get_messages():
                self._get_message_info(topic_id, message, pos)
                pos += 1

            next_page = self._querier.go_next_page()

    def extract(self):
        """
        extracts Eclipse forum topic data and stores it in the DB

        Expects the logger, querier and DAO to be already set up (see
        __call__). Exceptions propagate to the caller; the browser opened
        for a topic is closed even when scraping that topic fails.
        """
        self._logger.info("EclipseTopic2Db started")
        start_time = datetime.now()

        for topic_id in self._interval:
            topic_own_id = self._dao.get_topic_own_id(self._forum_id,
                                                      topic_id)

            self._querier.set_url(EclipseTopic2Db.TOPIC_URL +
                                  str(topic_own_id) + "/")
            self._querier.start_browser()
            try:
                # NOTE(review): fixed pause kept from the original,
                # presumably to let the page load - confirm
                time.sleep(3)

                if 'index.php/e/' in self._querier._url:
                    self._logger.warning("No URL exists for the topic id " +
                                         str(topic_id) + " - " +
                                         str(self._forum_id))

                self._import_topic_messages(topic_id)
            finally:
                # every started browser gets closed, also on failure
                self._querier.close_browser()

        end_time = datetime.now()
        minutes_and_seconds = self._logging_util.calculate_execution_time(
            end_time, start_time)
        self._logger.info("EclipseTopic2Db finished after " +
                          str(minutes_and_seconds[0]) + " minutes and " +
                          str(round(minutes_and_seconds[1], 1)) + " secs")
        self._release_file_handler()
class EclipseForum2DbUpdate():
    """
    This class handles the update of Eclipse forum data
    """

    NUM_PROCESSES = 2

    def __init__(self, db_name, project_name, forum_name, eclipse_forum_url,
                 num_processes, config, log_root_path):
        """
        :type db_name: str
        :param db_name: the name of an existing DB

        :type project_name: str
        :param project_name: the name of an existing project in the DB

        :type forum_name: str
        :param forum_name: the name of an existing forum in the DB to update

        :type eclipse_forum_url: str
        :param eclipse_forum_url: the URL of the forum

        :type num_processes: int
        :param num_processes: number of processes to import the data (default 2)

        :type config: dict
        :param config: the DB configuration file

        :type log_root_path: str
        :param log_root_path: the log path
        """
        self._log_path = (log_root_path + "update-eclipse-forum-" + db_name +
                          "-" + project_name + "-" + forum_name)
        self._project_name = project_name
        self._url = eclipse_forum_url
        self._db_name = db_name
        self._forum_name = forum_name

        # NOTE: this updates the caller's config dict in place
        config.update({'database': db_name})
        self._config = config

        # a falsy value (None, 0) selects the class default
        self._num_processes = (num_processes or
                               EclipseForum2DbUpdate.NUM_PROCESSES)

        self._logging_util = LoggingUtil()
        self._date_util = DateUtil()

        # Created in update()
        self._logger = None
        self._fileHandler = None
        self._querier = None
        self._dao = None

    def _update_topics_info(self, forum_id):
        # update views and last-change date of the forum topics already in
        # the DB, walking every page of the topic list
        next_page = True
        while next_page:
            for topic in self._querier.get_topics():
                topic_own_id = self._querier.get_topic_own_id(topic)
                topic_in_db = self._dao.get_topic_id(topic_own_id, forum_id)

                # topics that are not in the DB are skipped
                if not topic_in_db:
                    continue

                views = self._querier.get_topic_views(topic)
                last_change_at = self._date_util.get_timestamp(
                    self._querier.get_last_change_at(topic),
                    "%a, %d %B %Y %H:%M")
                self._dao.update_topic_info(topic_in_db, forum_id, views,
                                            last_change_at)

            next_page = self._querier.go_next_page()

    def _get_topics(self, forum_id):
        # update the topics of a forum: refresh the topic summaries, then
        # re-import the topic messages in parallel
        topic_ids = self._dao.get_topic_ids(forum_id)
        if not topic_ids:
            # nothing stored for this forum, nothing to update
            return

        self._update_topics_info(forum_id)

        intervals = [
            i for i in multiprocessing_util.get_tasks_intervals(
                topic_ids, self._num_processes) if len(i) > 0
        ]

        queue_extractors = multiprocessing.JoinableQueue()
        results = multiprocessing.Queue()

        # Start consumers
        multiprocessing_util.start_consumers(self._num_processes,
                                             queue_extractors, results)

        # One extractor task per non-empty interval of topic ids
        for interval in intervals:
            topic_extractor = EclipseTopic2Db(self._db_name, forum_id,
                                              interval, self._config,
                                              self._log_path)
            queue_extractors.put(topic_extractor)

        # Add end-of-queue markers
        multiprocessing_util.add_poison_pills(self._num_processes,
                                              queue_extractors)

        # Wait for all of the tasks to finish
        queue_extractors.join()

    def update(self):
        """
        updates the Eclipse forum data stored in the DB

        Errors are logged ("EclipseForum2DbUpdate failed") and not re-raised.
        The browser, the DB connection and the log file handler are released
        whether the update succeeds or fails.
        """
        try:
            self._logger = self._logging_util.get_logger(self._log_path)
            self._fileHandler = self._logging_util.get_file_handler(
                self._logger, self._log_path, "info")

            self._logger.info("EclipseForum2DbUpdate started")
            start_time = datetime.now()

            self._querier = EclipseForumQuerier(self._url, self._logger)
            self._dao = EclipseForumDao(self._config, self._logger)

            self._querier.start_browser()
            try:
                project_id = self._dao.select_project_id(self._project_name)
                forum_id = self._dao.select_forum_id(self._forum_name,
                                                     project_id)

                # an unknown forum is silently left alone
                if forum_id:
                    self._get_topics(forum_id)
            finally:
                # close the browser even when the update fails half-way
                self._querier.close_browser()

            end_time = datetime.now()
            minutes_and_seconds = self._logging_util.calculate_execution_time(
                end_time, start_time)
            self._logger.info("EclipseForum2DbUpdate finished after " +
                              str(minutes_and_seconds[0]) + " minutes and " +
                              str(round(minutes_and_seconds[1], 1)) + " secs")
        except Exception:
            # "except Exception" (not a bare except) lets KeyboardInterrupt
            # and SystemExit propagate.
            failure_logger = self._logger
            if failure_logger is None:
                # the project logger could not be created; still report the
                # original error instead of failing on a None logger
                import logging
                failure_logger = logging.getLogger(__name__)
            failure_logger.error("EclipseForum2DbUpdate failed",
                                 exc_info=True)
        finally:
            if self._dao:
                self._dao.close_connection()

            # detach the file handler on every path, at most once
            file_handler, self._fileHandler = self._fileHandler, None
            if file_handler is not None:
                self._logging_util.remove_file_handler_logger(
                    self._logger, file_handler)
class EclipseForum2DbMain():
    """
    This class handles the import of Eclipse forum data
    """

    NUM_PROCESSES = 2

    def __init__(self, db_name, project_name, type, forum_name, url,
                 before_date, num_processes, config, log_root_path):
        """
        :type db_name: str
        :param db_name: the name of an existing DB

        :type project_name: str
        :param project_name: the name of an existing project in the DB

        :type type: str
        :param type: type of the forum (Stackoverflow, Eclipse forum)

        :type forum_name: str
        :param forum_name: the name of the forum to import

        :type url: str
        :param url: the URL of the forum

        :type before_date: str
        :param before_date: import data before date (YYYY-mm-dd)

        :type num_processes: int
        :param num_processes: number of processes to import the data (default 2)

        :type config: dict
        :param config: the DB configuration file

        :type log_root_path: str
        :param log_root_path: the log path
        """
        self._log_path = (log_root_path + "import-eclipse-forum-" + db_name +
                          "-" + project_name + "-" + forum_name)
        self._type = type
        self._url = url
        self._forum_name = forum_name
        self._project_name = project_name
        self._db_name = db_name
        self._before_date = before_date

        # NOTE: this updates the caller's config dict in place
        config.update({'database': db_name})
        self._config = config

        # a falsy value (None, 0) selects the class default
        self._num_processes = (num_processes or
                               EclipseForum2DbMain.NUM_PROCESSES)

        self._logging_util = LoggingUtil()
        self._date_util = DateUtil()

        # Created in extract()
        self._logger = None
        self._fileHandler = None
        self._querier = None
        self._dao = None

    def _get_topic_info(self, forum_id, topic):
        # get topic information; a topic not yet in the DB is inserted unless
        # it was created after the before-date filter. Returns the DB id of
        # the topic, or whatever the DAO lookup yields when it was skipped.
        own_id = self._querier.get_topic_own_id(topic)
        title = self._querier.get_topic_title(topic)
        views = self._querier.get_topic_views(topic)
        last_change_at = self._date_util.get_timestamp(
            self._querier.get_last_change_at(topic), "%a, %d %B %Y %H:%M")

        topic_id = self._dao.select_topic_id(forum_id, own_id)
        if topic_id:
            # topics already in the DB are not refreshed here
            return topic_id

        if self._before_date:
            topic_created_at = self._querier.get_topic_created_at(topic)
            created = self._date_util.get_timestamp(topic_created_at,
                                                    "%a, %d %B %Y")
            limit = self._date_util.get_timestamp(self._before_date,
                                                  "%Y-%m-%d")
            wanted = created <= limit
        else:
            # no date filter: every new topic is imported
            wanted = True

        if wanted:
            self._dao.insert_topic(own_id, forum_id, title, views,
                                   last_change_at)

        return self._dao.select_topic_id(forum_id, own_id)

    def _get_topic_ids(self, forum_id):
        # get the list of DB topic ids of a forum, walking every page of the
        # topic list; skipped topics (None) are left out
        topic_ids = []

        next_page = True
        while next_page:
            for topic in self._querier.get_topics():
                topic_id = self._get_topic_info(forum_id, topic)
                if topic_id is not None:
                    topic_ids.append(topic_id)

            next_page = self._querier.go_next_page()

        return topic_ids

    def _get_topics(self, forum_id):
        # insert topics to DB, then import their messages in parallel
        self._querier.start_browser()
        try:
            topic_ids = self._get_topic_ids(forum_id)
        finally:
            # close the browser even when listing the topics fails
            self._querier.close_browser()

        intervals = [
            i for i in multiprocessing_util.get_tasks_intervals(
                topic_ids, self._num_processes) if len(i) > 0
        ]

        queue_extractors = multiprocessing.JoinableQueue()
        results = multiprocessing.Queue()

        # Start consumers
        multiprocessing_util.start_consumers(self._num_processes,
                                             queue_extractors, results)

        # One extractor task per non-empty interval of topic ids
        for interval in intervals:
            topic_extractor = EclipseTopic2Db(self._db_name, forum_id,
                                              interval, self._config,
                                              self._log_path)
            queue_extractors.put(topic_extractor)

        # Add end-of-queue markers
        multiprocessing_util.add_poison_pills(self._num_processes,
                                              queue_extractors)

        # Wait for all of the tasks to finish
        queue_extractors.join()

    def extract(self):
        """
        extracts Eclipse forum data and stores it in the DB

        Errors are logged ("EclipseForum2DbMain failed") and not re-raised.
        The DB connection is closed and the log file handler is detached
        whether the import succeeds or fails.
        """
        try:
            self._logger = self._logging_util.get_logger(self._log_path)
            self._fileHandler = self._logging_util.get_file_handler(
                self._logger, self._log_path, "info")

            self._logger.info("EclipseForum2DbMain started")
            start_time = datetime.now()

            self._querier = EclipseForumQuerier(self._url, self._logger)
            self._dao = EclipseForumDao(self._config, self._logger)

            project_id = self._dao.select_project_id(self._project_name)
            forum_id = self._dao.insert_forum(project_id, self._forum_name,
                                              self._type)
            self._get_topics(forum_id)

            end_time = datetime.now()
            minutes_and_seconds = self._logging_util.calculate_execution_time(
                end_time, start_time)
            self._logger.info("EclipseForum2DbMain finished after " +
                              str(minutes_and_seconds[0]) + " minutes and " +
                              str(round(minutes_and_seconds[1], 1)) + " secs")
        except Exception:
            # "except Exception" (not a bare except) lets KeyboardInterrupt
            # and SystemExit propagate.
            failure_logger = self._logger
            if failure_logger is None:
                # the project logger could not be created; still report the
                # original error instead of failing on a None logger
                import logging
                failure_logger = logging.getLogger(__name__)
            failure_logger.error("EclipseForum2DbMain failed", exc_info=True)
        finally:
            if self._dao:
                self._dao.close_connection()

            # detach the file handler on every path, at most once
            file_handler, self._fileHandler = self._fileHandler, None
            if file_handler is not None:
                self._logging_util.remove_file_handler_logger(
                    self._logger, file_handler)