def _export_(self):
    """Export the posts found on one page of the topic.

    Requests page ``self.page`` of ``self.topic`` through ``self.session``
    and adds one ``Post`` child for every ``tr.post`` row that carries a
    post id.  Rows without an id anchor are logged and skipped instead of
    aborting the whole page.
    """
    topic_id = self.topic.topic_id
    self.logger.debug("Récupération des messages du sujet %d (page %d)",
                      topic_id, self.page)

    response = self.session.get("/t{}p{}-a".format(topic_id, self.page))
    document = PyQuery(response.text)

    # The author link is matched against "/u<id>"; the captured number is
    # the key used to look the poster up in self.users.
    pattern = re.compile(r"/u(\d+)")

    for element in document.find("tr.post"):
        e = PyQuery(element)

        # The post id is read from the "name" attribute of the anchor.
        # attr() yields None when the anchor is missing, and int(None)
        # would raise, so skip such rows explicitly.
        name = e("td span.name a").attr("name")
        if name is None:
            self.logger.warning(
                "Message %s (sujet %d) irrécupérable, informations manquantes",
                e.attr("class"), topic_id)
            continue
        post_id = int(name)

        self.logger.info("Récupération du message %d (sujet %d)",
                         post_id, topic_id)

        # Resolve the author; fall back to an anonymous user when the
        # author link is absent or does not look like a profile URL.
        href = e("td span.name strong a").eq(0).attr("href") or ""
        match = pattern.fullmatch(clean_url(href))
        if match:
            poster = self.users[int(match.group(1))]
        else:
            poster = AnonymousUser()

        post = e("td div.postbody div").eq(0).html()
        if not post:
            self.logger.warning("Le message %d (sujet %d) semble être vide",
                                post_id, topic_id)
            post = ""

        # NOTE(review): indices 1 and 3 assume a fixed node layout inside
        # span.postdetails (title text at 1, date text at 3) — confirm
        # against the forum's markup.
        details = e("table td span.postdetails").contents()

        # Get title: remove "Sujet :" before the title and spaces at the end
        title = details[1][7:].rstrip()

        # Get the date and time of the post
        timestamp = parse_date(details[3])

        self.add_child(Post(post_id, post, title, timestamp, poster))
def _export_(self):
    """Read the forum statistics page and schedule the child exports.

    Fills the ``total_*``, ``startdate``, ``record_online_*``,
    ``site_name`` and ``site_desc`` attributes from the "/statistics"
    page, then adds the Smilies, Users (or OcrUsers), Groups and Forums
    child nodes, in that order.
    """
    self.logger.info('Récupération des statistiques')

    page = self.session.get("/statistics")
    document = PyQuery(page.text)

    # Each table row holds up to two label/value pairs, spread over the
    # first four "td span" elements: (0 -> 1) and (2 -> 3).
    stats = {}
    for row in document.find("table.forumline tr"):
        spans = PyQuery(row)("td span")
        for label_idx, value_idx in ((0, 1), (2, 3)):
            stats[spans.eq(label_idx).text()] = spans.eq(value_idx).text()

    # Convert the statistics we care about (labels are the French ones
    # shown on the page).
    self.total_posts = int(stats["Messages"])
    self.total_topics = int(stats["Nombre de sujets ouvert dans le forum"])
    self.total_users = int(stats["Nombre d'utilisateurs"])
    self.startdate = parse_date(stats["Ouverture du forum"])
    self.record_online_date = parse_date(
        stats["Date du record de connexions"])
    self.record_online_users = int(
        stats["Nombre record d'utilisateurs connectés en même temps"])

    # Site name is the first "div.maintitle"; the description is the first
    # "span.gen" sibling of those title divs.
    main_titles = document("div.maintitle")
    self.site_name = main_titles.eq(0).text().strip(" \n")
    description = main_titles.siblings("span.gen").eq(0)
    self.site_desc = description.text().strip(" \n")

    self.logger.debug('Messages : %d', self.total_posts)
    self.logger.debug('Sujets : %d', self.total_topics)
    self.logger.debug('Membres : %d', self.total_users)

    # Child nodes handle the export of the smilies, the users, the groups
    # and the forums.
    self.add_child(Smilies())
    # With "use_ocr" enabled, Optical Character Recognition is used to get
    # the users' emails.
    users_node = OcrUsers() if self.config["use_ocr"] else Users()
    self.add_child(users_node)
    self.add_child(Groups())
    self.add_child(Forums())
def _export_(self):
    """Collect the forum-wide statistics and register the child nodes.

    The "/statistics" page is parsed into a label -> value mapping from
    which the counters, dates and site name/description attributes are
    set.  Afterwards the Smilies, Users (or OcrUsers when the "use_ocr"
    option is set), Groups and Forums children are added.
    """
    self.logger.info('Récupération des statistiques')
    document = PyQuery(self.session.get("/statistics").text)

    # Go through the table of statistics: the texts of the first four
    # "td span" cells of every row form two (label, value) pairs.
    stats = {}
    for tr in document.find("table.forumline tr"):
        row = PyQuery(tr)
        cells = [row("td span").eq(i).text() for i in range(4)]
        stats[cells[0]] = cells[1]
        stats[cells[2]] = cells[3]

    # Counters and dates, keyed by the French labels used on the page.
    self.total_posts = int(stats["Messages"])
    self.total_topics = int(stats["Nombre de sujets ouvert dans le forum"])
    self.total_users = int(stats["Nombre d'utilisateurs"])

    self.startdate = parse_date(stats["Ouverture du forum"])

    record_date = stats["Date du record de connexions"]
    self.record_online_date = parse_date(record_date)
    record_users = stats[
        "Nombre record d'utilisateurs connectés en même temps"]
    self.record_online_users = int(record_users)

    # Site title and description, stripped of surrounding spaces/newlines.
    name_text = document("div.maintitle").eq(0).text()
    self.site_name = name_text.strip(" \n")
    desc_text = document("div.maintitle").siblings("span.gen").eq(0).text()
    self.site_desc = desc_text.strip(" \n")

    for label, value in (('Messages : %d', self.total_posts),
                         ('Sujets : %d', self.total_topics),
                         ('Membres : %d', self.total_users)):
        self.logger.debug(label, value)

    # Add the children nodes, which respectively handle the exportation of
    # the smilies, the users, the groups and the forums.
    self.add_child(Smilies())
    # OcrUsers uses Optical Character Recognition to get the users' emails.
    users_class = OcrUsers if self.config["use_ocr"] else Users
    self.add_child(users_class())
    self.add_child(Groups())
    self.add_child(Forums())
def _export_(self):
    """Export the posts found on one page of the topic.

    Requests page ``self.page`` of ``self.topic`` (the URL includes the
    topic slug) through ``self.session`` and adds one ``Post`` child for
    every ``tr.post`` row that carries a post id.  Rows without an id
    anchor are logged and skipped.
    """
    topic_id = self.topic.topic_id
    self.logger.debug('Récupération des messages du sujet %d (page %d)',
                      topic_id, self.page)

    # Build the page path once; it serves both the request and the debug
    # log (which is formatted lazily by the logger).
    path = "t{}p{}-{}".format(topic_id, self.page, self.topic.topic_slug)
    response = self.session.get("/" + path)
    self.logger.debug("url : %s", path)

    document = PyQuery(response.text)

    # The author link is matched against "/u<id>"; the captured number is
    # the key used to look the poster up in self.users.
    pattern = re.compile(r"/u(\d+)")

    for element in document.find('tr.post'):
        e = PyQuery(element)

        # The post id is read from the "name" attribute of the anchor;
        # without it the row cannot be exported.
        name = e("td span.name a").attr("name")
        if name is None:
            self.logger.warning(
                'Message %s (sujet %d) irrécupérable, informations manquantes. !',
                e.attr('class'), topic_id)
            continue
        post_id = int(name)

        self.logger.info('Récupération du message %d (sujet %d)',
                         post_id, topic_id)

        # Resolve the author; fall back to an anonymous user when the
        # author link is absent or does not look like a profile URL.
        href = e("td span.name strong a").eq(0).attr("href") or ""
        match = pattern.fullmatch(clean_url(href))
        if match:
            poster = self.users[int(match.group(1))]
        else:
            poster = AnonymousUser()

        post = e("td div.postbody div").eq(0).html()
        if not post:
            self.logger.warning(
                'Le message %d (sujet %d) semble être vide',
                post_id, topic_id)
            post = ""

        # NOTE(review): indices 1 and 3 assume a fixed node layout inside
        # span.postdetails (title text at 1, date text at 3) — confirm
        # against the forum's markup.
        details = e("table td span.postdetails").contents()

        # Get title: remove "Sujet :" before the title and spaces at the end
        title = details[1][7:].rstrip()

        # Get the date and time of the post
        timestamp = parse_date(details[3])

        self.add_child(Post(post_id, post, title, timestamp, poster))