Ejemplo n.º 1
0
    def _export_(self):
        """
        Export the topics listed on one page of a forum.

        Downloads ``/<forum oldid>p<page>-a`` and adds one ``Topic`` child per
        ``div.topictitle`` element whose id is not already stored in
        ``self.announcements``. Topics whose type is >= 2 are stored in
        ``self.announcements`` so that they are not exported again.
        """
        self.logger.debug('Récupération du forum %s (page %d)',
                          self.forum.oldid, self.page)

        # Download the page
        response = self.session.get("/{}p{}-a".format(self.forum.oldid,
                                                      self.page))
        document = PyQuery(response.text)

        # Compiled once for the whole page: group 1 is the topic id,
        # group 2 the slug that follows the dash.
        topic_pattern = re.compile(r"/t(\d+)-(.*)")

        # Get the topics
        for element in document.find('div.topictitle'):
            e = PyQuery(element)

            # Parse the link a single time instead of searching it twice
            match = topic_pattern.search(clean_url(e("a").attr("href")))
            topic_id = int(match.group(1))
            if topic_id in self.announcements:
                continue

            topic_slug = match.group(2)

            # Second-to-last entry of the ancestors collection; presumably the
            # table row holding the topic -- TODO confirm against the markup.
            f = e.parents().eq(-2)

            # attr() gives None when the image or its alt text is missing;
            # treat that as "not locked" instead of raising TypeError.
            alt = f("td img").eq(0).attr("alt") or ""
            locked = 1 if "verrouillé" in alt else 0
            views = int(f("td").eq(5).text())
            # Unknown labels fall back to type 0
            topic_type = TOPIC_TYPES.get(e("strong").text(), 0)
            title = e("a").text()

            self.add_child(
                Topic(topic_id, topic_type, title, locked, views,
                      topic_slug))
            if topic_type >= 2:
                # The topic is an announcement, save its id to avoid
                # exporting it again
                self.announcements.append(topic_id)
Ejemplo n.º 2
0
    def _export_(self):
        """
        Export the members listed on one page of the administration panel's
        user list, adding one ``User`` child per ``tbody tr`` row.

        Raises ``MemberPageBlocked`` when the URL of the response has no
        ``start`` query parameter.
        """
        self.logger.debug('Récupération des membres (page %d)', self.page)

        # Request the list of users from the administration panel
        response = self.session.get_admin(
            "/admin/index.forum",
            params=dict(part="users_groups", sub="users", start=self.page))

        # A response URL without "start" means the page was blocked
        response_query = urllib.parse.parse_qs(
            urllib.parse.urlparse(response.url).query)
        if "start" not in response_query:
            raise MemberPageBlocked()

        for element in PyQuery(response.text)('tbody tr'):
            row = PyQuery(element)
            links = row("td a")
            cells = row("td")

            # The member id is the "u" parameter of the first link of the row
            profile_url = clean_url(links.eq(0).attr("href"))
            oldid = int(re.search(r"&u=(\d+)&", profile_url).group(1))

            self.logger.info('Récupération du membre %d', oldid)
            name = links.eq(0).text()
            mail = links.eq(1).text()
            posts = int(cells.eq(2).text())

            date = parse_admin_date(cells.eq(3).text())
            lastvisit = parse_admin_date(cells.eq(4).text())

            self.add_child(User(oldid, name, mail, posts, date, lastvisit))
Ejemplo n.º 3
0
    def get_subforums_infos(self, html):
        """
        Get information (description, number of topics and posts, ...) about
        the forums listed on a page.

        For every ``a.forumlink`` whose URL matches ``/<f|c><digits>-...``,
        the matching entry of ``self.forums`` gets its ``status``,
        ``description``, ``num_topics`` and ``num_posts`` attributes set from
        the table row containing the link. Links that do not match are
        skipped.

        :param html: HTML source of the page to analyse
        """
        document = PyQuery(html)

        idpattern = re.compile(r"/([fc]\d+)-.*")

        for element in document("a.forumlink"):
            e = PyQuery(element)

            match = idpattern.fullmatch(clean_url(e.attr("href")))
            if not match:
                continue

            # Look the forum up once rather than before every assignment
            forum = self.forums[match.group(1)]

            row = e.closest("tr")
            cells = row("td")

            # Get forum status. attr() gives None when the image or its alt
            # text is missing; treat that as "not locked" instead of raising
            # TypeError.
            alt = row("td:nth-of-type(1) img").eq(0).attr("alt") or ""
            forum.status = 1 if "verrouillé" in alt else 0

            # Get subforum description (empty string when there is none)
            forum.description = row("td:nth-of-type(2) span").eq(
                1).html() or ""

            # TODO : Get subforum icon

            # Get subforum numbers of topics and posts
            forum.num_topics = int(cells.eq(2).text())
            forum.num_posts = int(cells.eq(3).text())
Ejemplo n.º 4
0
    def _export_(self):
        self.logger.debug("Récupération des messages du sujet %d (page %d)", self.topic.topic_id, self.page)

        response = self.session.get("/t{}p{}-a".format(self.topic.topic_id, self.page))
        document = PyQuery(response.text)

        pattern = re.compile(r"/u(\d+)")

        for element in document.find("tr.post"):
            e = PyQuery(element)

            post_id = int(e("td span.name a").attr("name"))

            self.logger.info("Récupération du message %d (sujet %d)", post_id, self.topic.topic_id)

            match = pattern.fullmatch(clean_url(e("td span.name strong a").eq(0).attr("href") or ""))
            if match:
                poster = self.users[int(match.group(1))]
            else:
                poster = AnonymousUser()

            post = e("td div.postbody div").eq(0).html()
            if not post:
                self.logger.warning("Le message  %d (sujet %d) semble être vide", post_id, self.topic.topic_id)
                post = ""

            # Get title
            title = e("table td span.postdetails").contents()[1]
            # Remove "Sujet :" before the title and spaces at the end
            title = title[7:].rstrip()

            # Get the date and time of the post
            timestamp = parse_date(e("table td span.postdetails").contents()[3])

            self.add_child(Post(post_id, post, title, timestamp, poster))
Ejemplo n.º 5
0
    def _export_(self):
        """
        Export one page of the administration panel's list of users.

        Each ``tbody tr`` row of the page becomes a ``User`` child.
        ``MemberPageBlocked`` is raised when the response URL does not carry
        a ``start`` query parameter.
        """
        self.logger.debug('Récupération des membres (page %d)', self.page)

        # Get the page of list of users from the administration panel
        params = {}
        params["part"] = "users_groups"
        params["sub"] = "users"
        params["start"] = self.page
        response = self.session.get_admin("/admin/index.forum", params=params)

        # Check if the page was blocked
        parsed_url = urllib.parse.urlparse(response.url)
        if "start" not in urllib.parse.parse_qs(parsed_url.query):
            raise MemberPageBlocked()

        document = PyQuery(response.text)

        for element in document('tbody tr'):
            member_row = PyQuery(element)
            first_link = member_row("td a").eq(0)

            # The member id is the "u" parameter of the first link
            id_match = re.search(r"&u=(\d+)&",
                                 clean_url(first_link.attr("href")))
            oldid = int(id_match.group(1))

            self.logger.info('Récupération du membre %d', oldid)
            name = first_link.text()
            mail = member_row("td a").eq(1).text()
            posts = int(member_row("td").eq(2).text())

            date = parse_admin_date(member_row("td").eq(3).text())
            lastvisit = parse_admin_date(member_row("td").eq(4).text())

            member = User(oldid, name, mail, posts, date, lastvisit)
            self.add_child(member)
Ejemplo n.º 6
0
    def get_subforums_infos(self, html):
        """
        Get information (description, number of topics and posts, ...) about
        the forums listed on a page.

        Every ``a.forumlink`` whose URL matches ``/<f|c><digits>-...`` updates
        the ``status``, ``description``, ``num_topics`` and ``num_posts``
        attributes of the corresponding entry of ``self.forums``, using the
        table row that contains the link. Other links are ignored.

        :param html: HTML source of the page to analyse
        """
        document = PyQuery(html)

        idpattern = re.compile(r"/([fc]\d+)-.*")

        for element in document("a.forumlink"):
            e = PyQuery(element)

            match = idpattern.fullmatch(clean_url(e.attr("href")))
            if not match:
                continue

            oldid = match.group(1)
            # Single lookup instead of one per assigned attribute
            forum = self.forums[oldid]

            row = e.closest("tr")

            # Get forum status. A missing image or alt attribute yields None,
            # which would make the "in" test raise TypeError; fall back to an
            # empty string so such forums are reported as not locked.
            alt = row("td:nth-of-type(1) img").eq(0).attr("alt")
            if alt is None:
                alt = ""
            forum.status = 1 if "verrouillé" in alt else 0

            # Get subforum description (empty string when there is none)
            forum.description = row("td:nth-of-type(2) span").eq(1).html() or ""

            # TODO : Get subforum icon

            # Get subforum numbers of topics and posts
            counters = row("td")
            forum.num_topics = int(counters.eq(2).text())
            forum.num_posts = int(counters.eq(3).text())
Ejemplo n.º 7
0
    def _export_(self):
        """
        Export the posts shown on one page of a topic.

        Downloads ``/t<topic id>p<page>-<topic slug>`` and adds one ``Post``
        child per ``tr.post`` row. Rows without a named anchor are skipped
        with a warning. The poster is taken from ``self.users`` when the
        author link matches ``/u<digits>``, otherwise an ``AnonymousUser`` is
        used.
        """
        self.logger.debug('Récupération des messages du sujet %d (page %d)',
                          self.topic.topic_id, self.page)

        # Build the page path once; it is used for the request and the log
        path = "t{}p{}-{}".format(self.topic.topic_id, self.page,
                                  self.topic.topic_slug)
        response = self.session.get("/" + path)
        # Pass the path as an argument so the message is only formatted when
        # debug logging is enabled
        self.logger.debug("url : %s", path)
        document = PyQuery(response.text)

        pattern = re.compile(r"/u(\d+)")

        for element in document.find('tr.post'):
            e = PyQuery(element)

            # The post id is the name of the anchor found in the row
            name = e("td span.name a").attr("name")

            if name is None:
                self.logger.warning(
                    'Message %s (sujet %d) irrécupérable, informations manquantes. !',
                    e.attr('class'), self.topic.topic_id)
                continue

            post_id = int(name)

            self.logger.info('Récupération du message %d (sujet %d)', post_id,
                             self.topic.topic_id)

            match = pattern.fullmatch(
                clean_url(e("td span.name strong a").eq(0).attr("href") or ""))
            if match:
                poster = self.users[int(match.group(1))]
            else:
                poster = AnonymousUser()

            post = e("td div.postbody div").eq(0).html()
            if not post:
                self.logger.warning(
                    'Le message  %d (sujet %d) semble être vide', post_id,
                    self.topic.topic_id)
                post = ""

            # Query the details block once; it provides both title and date
            details = e("table td span.postdetails").contents()

            # Get title: remove "Sujet :" before the title and spaces at the
            # end
            title = details[1][7:].rstrip()

            # Get the date and time of the post
            timestamp = parse_date(details[3])

            self.add_child(Post(post_id, post, title, timestamp, poster))
Ejemplo n.º 8
0
    def _export_(self):
        """
        Export the members listed on one page of the public member list.

        Requests ``/memberlist`` for the current page and adds one ``OcrUser``
        child per row of the first ``table.forumline`` following the member
        list form, the first row excepted.
        """
        self.logger.debug('Récupération des membres (page %d)', self.page)

        response = self.session.get(
            "/memberlist",
            params=dict(mode="joined", order="", start=self.page,
                        username=""))
        document = PyQuery(response.text)

        form = document("form[action=\"/memberlist\"]")
        table = PyQuery(form.next_all("table.forumline").eq(0))

        urlpattern = re.compile(r"/u(\d+)")
        stylepattern = re.compile(r"color:#(.{6})")

        rows = iter(table.find("tr"))
        # Skip first row
        next(rows, None)

        for element in rows:
            row = PyQuery(element)
            links = row("td a")
            cells = row("td")

            # The member id comes from the URL of the first link of the row
            profile_url = clean_url(links.eq(0).attr("href"))
            oldid = int(urlpattern.fullmatch(profile_url).group(1))

            name = links.eq(1).text()
            posts = int(cells.eq(6).text())

            # The cell text is split on "/" and read as day/month/year
            parts = cells.eq(4).text().split("/")
            joined = time.struct_time((int(parts[2]), int(parts[1]),
                                       int(parts[0]), 0, 0, 0, 0, 0, 0))
            date = int(time.mktime(joined))

            # The colour is the hexadecimal value of the name's inline style
            style = links.eq(1).children("span").attr("style") or ""
            match = stylepattern.fullmatch(style)
            colour = match.group(1) if match else ""

            self.add_child(OcrUser(oldid, name, posts, date, colour))
Ejemplo n.º 9
0
    def _export_(self):
        """
        Export the members shown on one page of a group.

        Requests ``/g<group oldid>-a`` for the current page; for every link
        whose URL matches ``/u<digits>``, appends ``self.group`` to the
        ``groups`` list of the matching entry of ``self.users`` unless it is
        already there.
        """
        self.logger.debug('Récupération du groupe %d (page %d)', self.group.oldid, self.page)

        group_url = r"/g{}-a".format(self.group.oldid)
        response = self.session.get(group_url, params={"start": self.page})

        member_link = re.compile(r"/u(\d+)")

        for anchor in PyQuery(response.text).find("a"):
            match = member_link.fullmatch(clean_url(anchor.get("href", "")))
            if not match:
                # Not a link to a member profile
                continue

            user = self.users[int(match.group(1))]
            if self.group not in user.groups:
                user.groups.append(self.group)
Ejemplo n.º 10
0
    def _export_(self):
        """
        Export one page of the public member list.

        Every row except the first of the first ``table.forumline`` that
        follows the ``/memberlist`` form becomes an ``OcrUser`` child.
        """
        self.logger.debug('Récupération des membres (page %d)', self.page)

        params = {}
        params["mode"] = "joined"
        params["order"] = ""
        params["start"] = self.page
        params["username"] = ""
        response = self.session.get("/memberlist", params=params)
        document = PyQuery(response.text)

        following_tables = document("form[action=\"/memberlist\"]").next_all("table.forumline")
        table = PyQuery(following_tables.eq(0))

        urlpattern = re.compile(r"/u(\d+)")
        stylepattern = re.compile(r"color:#(.{6})")

        for index, element in enumerate(table.find("tr")):
            # Skip first row
            if index == 0:
                continue

            e = PyQuery(element)

            # The member id comes from the URL of the first link of the row
            id_match = urlpattern.fullmatch(clean_url(e("td a").eq(0).attr("href")))
            oldid = int(id_match.group(1))

            name = e("td a").eq(1).text()
            posts = int(e("td").eq(6).text())

            # The cell text is split on "/" and read as day/month/year
            fields = e("td").eq(4).text().split("/")
            year = int(fields[2])
            month = int(fields[1])
            day = int(fields[0])
            date = int(time.mktime(time.struct_time((year, month, day, 0, 0, 0, 0, 0, 0))))

            # The colour is the hexadecimal value of the name's inline style
            colour = ""
            match = stylepattern.fullmatch(e("td a").eq(1).children("span").attr("style") or "")
            if match:
                colour = match.group(1)

            self.add_child(OcrUser(oldid, name, posts, date, colour))
Ejemplo n.º 11
0
    def _export_(self):
        """
        Export the topics listed on one page of a forum.

        Downloads ``/<forum oldid>p<page>-a`` and adds one ``Topic`` child per
        ``div.topictitle`` element whose id is not already stored in
        ``self.announcements``. Topics whose type is >= 2 are stored in
        ``self.announcements`` so that they are not exported again.
        """
        self.logger.debug("Récupération du forum %s (page %d)", self.forum.oldid, self.page)

        # Download the page
        response = self.session.get("/{}p{}-a".format(self.forum.oldid, self.page))
        document = PyQuery(response.text)

        # Compiled once for the whole page instead of once per topic
        topic_pattern = re.compile(r"/t(\d+)-.*")

        # Get the topics
        for element in document.find("div.topictitle"):
            e = PyQuery(element)

            topic_id = int(topic_pattern.search(clean_url(e("a").attr("href"))).group(1))
            if topic_id in self.announcements:
                continue

            # Second-to-last entry of the ancestors collection; presumably the
            # table row holding the topic -- TODO confirm against the markup.
            f = e.parents().eq(-2)

            # attr() gives None when the image or its alt text is missing;
            # treat that as "not locked" instead of raising TypeError.
            alt = f("td img").eq(0).attr("alt") or ""
            locked = 1 if "verrouillé" in alt else 0
            views = int(f("td").eq(5).text())
            # Unknown labels fall back to type 0
            topic_type = TOPIC_TYPES.get(e("strong").text(), 0)
            title = e("a").text()

            self.add_child(Topic(topic_id, topic_type, title, locked, views))
            if topic_type >= 2:
                # The topic is an announcement, save its id to avoid
                # exporting it again
                self.announcements.append(topic_id)