def _export_(self):
    """Export the topics listed on one page of a forum.

    Downloads the page ``self.page`` of the forum ``self.forum.oldid`` and
    adds a ``Topic`` child for every ``div.topictitle`` entry whose id is
    not already listed in ``self.announcements``. Topics whose type is 2 or
    more are announcements: their id is recorded so that they are not
    exported again.

    Raises:
        AttributeError: if a topic link does not look like ``/t<id>-<slug>``.
        ValueError: if the views cell does not hold an integer.
    """
    self.logger.debug('Récupération du forum %s (page %d)',
                      self.forum.oldid, self.page)

    # Download the page
    response = self.session.get(
        "/{}p{}-a".format(self.forum.oldid, self.page))
    document = PyQuery(response.text)

    # Get the topics
    for element in document.find('div.topictitle'):
        e = PyQuery(element)
        link = e("a")

        # A single search gives both the numeric id and the slug.
        match = re.search(r"/t(\d+)-(.*)", clean_url(link.attr("href")))
        topic_id = int(match.group(1))
        if topic_id in self.announcements:
            continue
        topic_slug = match.group(2)

        f = e.parents().eq(-2)
        # attr() gives None when the status icon or its alt text is
        # missing; treat that as "not locked" instead of raising TypeError.
        alt = f("td img").eq(0).attr("alt") or ""
        locked = 1 if "verrouillé" in alt else 0
        views = int(f("td").eq(5).text())
        topic_type = TOPIC_TYPES.get(e("strong").text(), 0)
        title = link.text()

        self.add_child(
            Topic(topic_id, topic_type, title, locked, views, topic_slug))
        if topic_type >= 2:
            # The topic is an announcement, save its id to avoid
            # exporting it again
            self.announcements.append(topic_id)
def _export_(self):
    """Export the members listed on one page of the administration panel.

    Requests the users list starting at ``self.page`` and adds one ``User``
    child per ``tbody tr`` row of the response.

    Raises:
        MemberPageBlocked: if the URL of the response carries no ``start``
            query parameter.
    """
    self.logger.debug('Récupération des membres (page %d)', self.page)

    # Ask the administration panel for this page of the users list
    response = self.session.get_admin(
        "/admin/index.forum",
        params={
            "part": "users_groups",
            "sub": "users",
            "start": self.page,
        },
    )

    # Check if the page was blocked
    final_query = urllib.parse.parse_qs(
        urllib.parse.urlparse(response.url).query)
    if "start" not in final_query:
        raise MemberPageBlocked()

    for row in PyQuery(response.text)('tbody tr'):
        cells = PyQuery(row)
        links = cells("td a")
        columns = cells("td")

        # The member id is the "u" parameter of the first link of the row
        profile_url = clean_url(links.eq(0).attr("href"))
        oldid = int(re.search(r"&u=(\d+)&", profile_url).group(1))
        self.logger.info('Récupération du membre %d', oldid)

        name = links.eq(0).text()
        mail = links.eq(1).text()
        posts = int(columns.eq(2).text())
        date = parse_admin_date(columns.eq(3).text())
        lastvisit = parse_admin_date(columns.eq(4).text())

        self.add_child(User(oldid, name, mail, posts, date, lastvisit))
def get_subforums_infos(self, html):
    """Get information about the forums listed on a page.

    For every ``a.forumlink`` anchor whose cleaned URL matches
    ``/<f or c><digits>-...``, the matching entry of ``self.forums`` gets
    its status, description and numbers of topics and posts, all read from
    the table row that contains the link. Other anchors are ignored.

    Args:
        html: HTML source of the page to parse.

    Raises:
        KeyError: if a listed forum id is not a key of ``self.forums``.
        ValueError: if a topics or posts cell does not hold an integer.
    """
    document = PyQuery(html)
    idpattern = re.compile(r"/([fc]\d+)-.*")

    for element in document("a.forumlink"):
        e = PyQuery(element)

        match = idpattern.fullmatch(clean_url(e.attr("href")))
        if not match:
            continue

        # Look the forum up once instead of once per attribute.
        forum = self.forums[match.group(1)]
        row = e.closest("tr")

        # Get forum status. attr() gives None when the icon or its alt
        # text is missing; treat that as "not locked" rather than raising
        # TypeError on the membership test.
        alt = row("td:nth-of-type(1) img").eq(0).attr("alt") or ""
        forum.status = 1 if "verrouillé" in alt else 0

        # Get subforum description
        forum.description = row("td:nth-of-type(2) span").eq(1).html() or ""

        # TODO : Get subforum icon

        # Get subforum numbers of topics and posts
        forum.num_topics = int(row("td").eq(2).text())
        forum.num_posts = int(row("td").eq(3).text())
def _export_(self):
    """Export the posts shown on one page of a topic.

    Downloads the page ``self.page`` of the topic ``self.topic.topic_id``
    and adds one ``Post`` child per ``tr.post`` row. The author is looked
    up in ``self.users`` when the row links to a ``/u<id>`` profile, and is
    an ``AnonymousUser`` otherwise. Rows without a post anchor are skipped
    with a warning.

    Raises:
        KeyError: if the linked author id is not a key of ``self.users``.
    """
    self.logger.debug("Récupération des messages du sujet %d (page %d)",
                      self.topic.topic_id, self.page)

    response = self.session.get(
        "/t{}p{}-a".format(self.topic.topic_id, self.page))
    document = PyQuery(response.text)

    pattern = re.compile(r"/u(\d+)")

    for element in document.find("tr.post"):
        e = PyQuery(element)

        # The post id is the name of the anchor placed in the post header.
        # attr() gives None when that anchor is missing; int(None) would
        # raise TypeError and abort the whole page, so skip the row.
        name = e("td span.name a").attr("name")
        if name is None:
            self.logger.warning(
                "Message %s (sujet %d) irrécupérable, informations manquantes",
                e.attr("class"), self.topic.topic_id)
            continue
        post_id = int(name)

        self.logger.info("Récupération du message %d (sujet %d)",
                         post_id, self.topic.topic_id)

        match = pattern.fullmatch(
            clean_url(e("td span.name strong a").eq(0).attr("href") or ""))
        if match:
            poster = self.users[int(match.group(1))]
        else:
            poster = AnonymousUser()

        post = e("td div.postbody div").eq(0).html()
        if not post:
            self.logger.warning("Le message %d (sujet %d) semble être vide",
                                post_id, self.topic.topic_id)
            post = ""

        # The details line holds both the title and the date; query it once.
        details = e("table td span.postdetails").contents()

        # Get title
        title = details[1]
        # Remove "Sujet :" before the title and spaces at the end
        title = title[7:].rstrip()

        # Get the date and time of the post
        timestamp = parse_date(details[3])

        self.add_child(Post(post_id, post, title, timestamp, poster))
def _export_(self):
    """Export one page of members from the administration panel.

    Fetches the users list at offset ``self.page`` and registers a ``User``
    child for each ``tbody tr`` row found in the response.

    Raises:
        MemberPageBlocked: if the response URL has no ``start`` parameter
            in its query string.
    """
    self.logger.debug('Récupération des membres (page %d)', self.page)

    # Get the page of list of users from the administration panel
    params = {"part": "users_groups", "sub": "users", "start": self.page}
    response = self.session.get_admin("/admin/index.forum", params=params)

    # Check if the page was blocked
    parsed_url = urllib.parse.urlparse(response.url)
    if "start" not in urllib.parse.parse_qs(parsed_url.query):
        raise MemberPageBlocked()

    document = PyQuery(response.text)

    for element in document('tbody tr'):
        row = PyQuery(element)

        def link(index):
            """Return the index-th link found in the cells of the row."""
            return row("td a").eq(index)

        def cell_text(index):
            """Return the text of the index-th cell of the row."""
            return row("td").eq(index).text()

        found = re.search(r"&u=(\d+)&", clean_url(link(0).attr("href")))
        oldid = int(found.group(1))

        self.logger.info('Récupération du membre %d', oldid)

        name = link(0).text()
        mail = link(1).text()
        posts = int(cell_text(2))
        date = parse_admin_date(cell_text(3))
        lastvisit = parse_admin_date(cell_text(4))

        self.add_child(User(oldid, name, mail, posts, date, lastvisit))
def get_subforums_infos(self, html):
    """Get information about the forums listed on a page.

    Each ``a.forumlink`` anchor whose cleaned URL matches
    ``/<f or c><digits>-...`` identifies an entry of ``self.forums``. That
    entry receives the status, description and numbers of topics and posts
    read from the table row enclosing the anchor. Anchors that do not match
    are skipped.

    Args:
        html: HTML source of the page to parse.

    Raises:
        KeyError: if a listed forum id is not a key of ``self.forums``.
        ValueError: if a topics or posts cell does not hold an integer.
    """
    document = PyQuery(html)
    idpattern = re.compile(r"/([fc]\d+)-.*")

    for element in document("a.forumlink"):
        e = PyQuery(element)

        match = idpattern.fullmatch(clean_url(e.attr("href")))
        if not match:
            continue

        oldid = match.group(1)
        # Fetch the forum a single time rather than for each attribute.
        forum = self.forums[oldid]
        row = e.closest("tr")

        # Get forum status. A missing icon or alt text makes attr() give
        # None, on which the "in" test would raise TypeError; such a forum
        # is reported as not locked.
        alt = row("td:nth-of-type(1) img").eq(0).attr("alt")
        forum.status = 1 if alt and "verrouillé" in alt else 0

        # Get subforum description
        forum.description = row("td:nth-of-type(2) span").eq(1).html() or ""

        # TODO : Get subforum icon

        # Get subforum numbers of topics and posts
        counters = row("td")
        forum.num_topics = int(counters.eq(2).text())
        forum.num_posts = int(counters.eq(3).text())
def _export_(self):
    """Export the posts shown on one page of a topic.

    Downloads the page ``self.page`` of the topic, addressed by its id and
    slug, and adds one ``Post`` child per ``tr.post`` row. Rows whose post
    anchor has no name are skipped with a warning. The author comes from
    ``self.users`` when the row links to a ``/u<id>`` profile, and is an
    ``AnonymousUser`` otherwise.
    """
    topic = self.topic
    self.logger.debug('Récupération des messages du sujet %d (page %d)',
                      topic.topic_id, self.page)

    response = self.session.get(
        "/t{}p{}-{}".format(topic.topic_id, self.page, topic.topic_slug))
    self.logger.debug(
        "url : t{}p{}-{}".format(topic.topic_id, self.page, topic.topic_slug))

    profile_url = re.compile(r"/u(\d+)")

    for row in PyQuery(response.text).find('tr.post'):
        cells = PyQuery(row)

        # The post id is the name of the anchor placed in the post header
        anchor_name = cells("td span.name a").attr("name")
        if anchor_name is None:
            self.logger.warning(
                'Message %s (sujet %d) irrécupérable, informations manquantes. !',
                cells.attr('class'), self.topic.topic_id)
            continue
        post_id = int(anchor_name)

        self.logger.info('Récupération du message %d (sujet %d)',
                         post_id, self.topic.topic_id)

        author_href = cells("td span.name strong a").eq(0).attr("href") or ""
        found = profile_url.fullmatch(clean_url(author_href))
        poster = self.users[int(found.group(1))] if found else AnonymousUser()

        post = cells("td div.postbody div").eq(0).html()
        if not post:
            self.logger.warning('Le message %d (sujet %d) semble être vide',
                                post_id, self.topic.topic_id)
            post = ""

        # The details line holds the title, then the date of the post
        details = cells("table td span.postdetails").contents()

        # Drop the leading "Sujet :" and the trailing spaces of the title
        title = details[1][7:].rstrip()

        timestamp = parse_date(details[3])

        self.add_child(Post(post_id, post, title, timestamp, poster))
def _export_(self):
    """Export one page of the public member list.

    Fetches ``/memberlist`` at offset ``self.page`` and adds one
    ``OcrUser`` child for every row, except the first, of the first
    ``table.forumline`` that follows the member list form.
    """
    self.logger.debug('Récupération des membres (page %d)', self.page)

    response = self.session.get(
        "/memberlist",
        params={
            "mode": "joined",
            "order": "",
            "start": self.page,
            "username": "",
        },
    )
    document = PyQuery(response.text)

    form = document("form[action=\"/memberlist\"]")
    table = PyQuery(form.next_all("table.forumline").eq(0))

    urlpattern = re.compile(r"/u(\d+)")
    stylepattern = re.compile(r"color:#(.{6})")

    # Skip first row
    rows = iter(table.find("tr"))
    next(rows, None)

    for element in rows:
        row = PyQuery(element)
        links = row("td a")
        cells = row("td")

        profile = urlpattern.fullmatch(clean_url(links.eq(0).attr("href")))
        oldid = int(profile.group(1))

        name = links.eq(1).text()
        posts = int(cells.eq(6).text())

        # The date cell reads day/month/year
        parts = cells.eq(4).text().split("/")
        joined = time.struct_time(
            (int(parts[2]), int(parts[1]), int(parts[0]), 0, 0, 0, 0, 0, 0))
        date = int(time.mktime(joined))

        # The colour, if any, is the style of the span inside the name link
        style = links.eq(1).children("span").attr("style") or ""
        match = stylepattern.fullmatch(style)
        colour = match.group(1) if match else ""

        self.add_child(OcrUser(oldid, name, posts, date, colour))
def _export_(self):
    """Attach the group to the users listed on one page of its members.

    Downloads the page of ``self.group`` starting at ``self.page``. For
    every link whose cleaned URL is exactly ``/u<id>``, ``self.group`` is
    appended to the ``groups`` of the matching entry of ``self.users``,
    unless it is already there.
    """
    self.logger.debug('Récupération du groupe %d (page %d)',
                      self.group.oldid, self.page)

    response = self.session.get(
        r"/g{}-a".format(self.group.oldid),
        params={"start": self.page},
    )
    profile_url = re.compile(r"/u(\d+)")

    for anchor in PyQuery(response.text).find("a"):
        found = profile_url.fullmatch(clean_url(anchor.get("href", "")))
        if found is None:
            # Not a link to a user profile
            continue

        member = self.users[int(found.group(1))]
        if self.group in member.groups:
            continue
        member.groups.append(self.group)
def _export_(self):
    """Export the members listed on one page of the public member list.

    Requests ``/memberlist`` starting at ``self.page``, locates the first
    ``table.forumline`` after the member list form and adds an ``OcrUser``
    child for each of its rows but the first.
    """
    self.logger.debug('Récupération des membres (page %d)', self.page)

    params = {"mode": "joined", "order": "", "start": self.page, "username": ""}
    response = self.session.get("/memberlist", params=params)
    document = PyQuery(response.text)

    table = PyQuery(
        document("form[action=\"/memberlist\"]")
        .next_all("table.forumline")
        .eq(0)
    )

    urlpattern = re.compile(r"/u(\d+)")
    stylepattern = re.compile(r"color:#(.{6})")

    for index, element in enumerate(table.find("tr")):
        # Skip first row
        if index == 0:
            continue

        e = PyQuery(element)

        href = clean_url(e("td a").eq(0).attr("href"))
        oldid = int(urlpattern.fullmatch(href).group(1))

        name = e("td a").eq(1).text()
        posts = int(e("td").eq(6).text())

        # The date cell reads day/month/year
        fields = e("td").eq(4).text().split("/")
        date = int(time.mktime(time.struct_time(
            (int(fields[2]), int(fields[1]), int(fields[0]),
             0, 0, 0, 0, 0, 0))))

        # An empty colour is kept when the name carries no matching style
        colour = ""
        match = stylepattern.fullmatch(
            e("td a").eq(1).children("span").attr("style") or "")
        if match:
            colour = match.group(1)

        self.add_child(OcrUser(oldid, name, posts, date, colour))
def _export_(self):
    """Export the topics listed on one page of a forum.

    Downloads the page ``self.page`` of the forum ``self.forum.oldid`` and
    adds a ``Topic`` child for every ``div.topictitle`` entry whose id is
    not already listed in ``self.announcements``. Topics whose type is 2 or
    more are announcements: their id is recorded so that they are not
    exported again.

    Raises:
        AttributeError: if a topic link does not look like ``/t<id>-...``.
        ValueError: if the views cell does not hold an integer.
    """
    self.logger.debug("Récupération du forum %s (page %d)",
                      self.forum.oldid, self.page)

    # Download the page
    response = self.session.get(
        "/{}p{}-a".format(self.forum.oldid, self.page))
    document = PyQuery(response.text)

    # Get the topics
    for element in document.find("div.topictitle"):
        e = PyQuery(element)
        link = e("a")

        topic_id = int(
            re.search(r"/t(\d+)-.*", clean_url(link.attr("href"))).group(1))
        if topic_id in self.announcements:
            continue

        f = e.parents().eq(-2)
        # attr() gives None when the status icon or its alt text is
        # missing; treat that as "not locked" instead of raising TypeError.
        alt = f("td img").eq(0).attr("alt") or ""
        locked = 1 if "verrouillé" in alt else 0
        views = int(f("td").eq(5).text())
        topic_type = TOPIC_TYPES.get(e("strong").text(), 0)
        title = link.text()

        self.add_child(Topic(topic_id, topic_type, title, locked, views))
        if topic_type >= 2:
            # The topic is an announcement, save its id to avoid
            # exporting it again
            self.announcements.append(topic_id)