Example #1
    def get_topics(self, list_url):
        """Yield one dict per topic found on each forum listing page."""
        # Requires (Python 2): from urllib2 import urlopen; import BeautifulSoup; import sys
        nb_page = len(list_url)
        for num_page, url in enumerate(list_url, 1):  # pages are numbered from 1
            obj_page = urlopen(url)
            soup = BeautifulSoup.BeautifulSoup(obj_page)
            name_zone = soup.findAll("div", {"id": "vf"})[0].h2.span.string
            # The page heading tells us whether this is a search-result listing
            # ("Résultats de la recherche") or a regular category page; the
            # literal must stay in French because it matches the forum's markup.
            search_category = name_zone == u"Résultats de la recherche"
            if not search_category:
                category = name_zone
                id_category = url.split("id=")[-1].split("&")[0]
            # One-line progress bar; "Obtention des pages" means "Fetching pages".
            sys.stdout.write(
                "\rObtention des pages ▕"
                + "█" * num_page
                + " " * (nb_page - num_page)
                + "▏ "
                + str(num_page)
                + "/"
                + str(nb_page)
            )
            sys.stdout.flush()

            for item in soup.findAll("div", "tclcon"):
                # "Déplacé" ("Moved") in the first cell marks a topic that was
                # moved to another category.
                is_move = bool(
                    item.contents[0] and u"Déplacé" in item.contents[0].strip()
                )
                tr_parent = item.findParents("tr")[0]

                topic_id = item.a["href"].split("id=")[-1]
                titre = htmlentitydecode(item.a.string)
                auteur = item.span.contents[0].replace("par ", "")

                # BeautifulSoup 3 returns the class attribute as a plain string.
                is_closed = tr_parent.get("class") == "iclosed"
                if not is_move:
                    # The date of the last post lives in the "tcr" cell.
                    balise_td = tr_parent.findAll("td", "tcr")[0]
                    date = balise_td.a.string
                    obj_date = transform_date(date)
                else:
                    obj_date = None
                if search_category:
                    # On a search page the category comes from the row's "tc2" cell.
                    td_category = tr_parent.findAll("td", "tc2")[0]
                    category = td_category.a.string
                    id_category = td_category.a["href"].split("id=")[-1]

                yield {
                    "id": topic_id,
                    "auteur": auteur,
                    "titre": titre,
                    "is_closed": is_closed,
                    "date_last": obj_date,
                    "is_move": is_move,
                    "id_category": id_category,
                    "category": category,
                    "num_page": num_page,
                }
        print("")  # terminate the progress-bar line with a newline
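A minimal sketch of how get_topics might be driven. ForumBot and BASE_URL are assumed names for illustration; only the method body appears in the listing, and the source suggests the forum paginates with a "p" query parameter:

    # Hypothetical driver; ForumBot and BASE_URL are assumptions.
    BASE_URL = "http://forum.example.org/viewforum.php?id=4&p=%s"

    bot = ForumBot()
    list_url = [BASE_URL % n for n in range(1, 4)]  # first three pages
    for topic in bot.get_topics(list_url):
        # Each yielded dict carries id, author, title, closed flag, date, etc.
        print("%s | %s | %s" % (topic["id"], topic["auteur"], topic["titre"]))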
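Both examples rely on two helpers that are not part of this listing, htmlentitydecode and transform_date. A minimal sketch of plausible stand-ins, assuming Python 2 and that the forum prints dates in a fixed format (the format string is a guess):

    # Hypothetical stand-ins; the real project's helpers may differ.
    import datetime
    import HTMLParser  # Python 2 stdlib

    def htmlentitydecode(text):
        # Turn HTML entities such as "&eacute;" back into characters.
        return HTMLParser.HTMLParser().unescape(text)

    def transform_date(date):
        # ASSUMPTION: the forum shows dates as "YYYY-MM-DD HH:MM:SS".
        return datetime.datetime.strptime(date, "%Y-%m-%d %H:%M:%S")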
Example #2
    def doublons(self, **kwargs):
        """Look for duplicate topics among the forum's most recent messages."""
        topics = {}
        topic_by_auteur = {}
        url = URL_24H  # module-level constant (not shown in this listing)
        nb_page = self.get_page_max(url)
        nb_page_asked = kwargs.get("nb_page")
        if not nb_page_asked:
            pass  # no limit requested: scan every page
        elif nb_page_asked < nb_page:
            nb_page = nb_page_asked
        else:
            # "You exceed the forum limit, there are %s pages on this forum"
            print(
                "Vous dépassez la limite du forum, "
                "il y a %s pages sur ce forum" % nb_page
            )
        url = url + "&p=%s"
        list_url = [url % item for item in range(1, 1 + nb_page)]
        for topic in self.get_topics(list_url):
            if topic["category"] != "Trash":
                topic_id = topic["id"]
                auteur = topic["auteur"]
                topics[topic_id] = topic
                # Group topic ids by author so titles can be compared per author.
                topic_by_auteur.setdefault(auteur, []).append(topic_id)

        # Keep only authors with more than one topic, dropping closed topics.
        auteur_many_topic = dict(
            (key, [ele for ele in value if not topics[ele]["is_closed"]])
            for key, value in topic_by_auteur.items()
            if len(value) > 1 and any(not topics[ele]["is_closed"] for ele in value)
        )

        matchs_by_auth = {}
        namespace = {}
        for auteur, value in auteur_many_topic.items():
            # set() dedupes ids: a topic can be collected twice if it changes
            # page while the listing is being fetched.
            value = set(value)
            titles = dict((id_top, topics[id_top]) for id_top in value)
            matchs_by_auth[auteur] = []
            for id_topic in value:
                matchs = {}
                title = titles.pop(id_topic)["titre"]
                for id_top in titles:
                    # Skip if the current topic was already reported as a
                    # match of an earlier title by this author.
                    if matchs_by_auth[auteur].count(id_topic) == 0:
                        match = difflib.get_close_matches(
                            title, [titles[id_top]["titre"]], cutoff=0.5
                        )
                        if match:
                            matchs[id_top] = match[0]
                            matchs_by_auth[auteur].append(id_top)
                if matchs:
                    matchs[id_topic] = title
                    # Fetch the topic page to recover the author's user id.
                    obj_page = urlopen(URL_TOPIC % id_topic)
                    soup = BeautifulSoup.BeautifulSoup(obj_page)
                    auteur_id = soup.findAll("div", "postleft")[0].findAll("a")[0]["href"].split("id=")[-1]
                    self.debug("--------------\n" + auteur)
                    for key in matchs.keys():
                        self.debug(
                            htmlentitydecode(matchs[key])
                            + u" (id:"
                            + str(key)
                            + u", "
                            + htmlentitydecode(topics[key]["category"])
                            + u")"
                        )
                    namespace.setdefault("topics", [])
                    list_titre = [
                        {
                            "topic_id": key,
                            "titre": value,
                            "id_category": topics[key]["id_category"],
                            "category": topics[key]["category"],
                        }
                        for key, value in matchs.items()
                    ]
                    namespace["topics"].append(
                        {"auteur_id": auteur_id, "auteur": auteur, "list_titre": list_titre, "topic_id": topic_id}
                    )

        html_page = self.affichage("doublons.txt", namespace)
        with open("doublons.html", "w") as obj_file:
            obj_file.write(html_page)
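The matching core of doublons is difflib.get_close_matches with cutoff=0.5. A standalone sketch of that idea, with invented titles (URL_24H, URL_TOPIC, get_page_max, affichage and debug are referenced by the method but not shown in this listing):

    # Standalone illustration of the fuzzy matching used above;
    # the topic titles are invented for the example.
    import difflib

    titles = {
        "101": u"Problème d'installation de Python",
        "102": u"Probleme installation python",
        "103": u"Recette de la tarte aux pommes",
    }

    reference = titles.pop("101")
    for topic_id, title in titles.items():
        # get_close_matches keeps candidates whose similarity ratio to
        # `reference` is at least `cutoff` (0.5 here, as in doublons).
        if difflib.get_close_matches(reference, [title], cutoff=0.5):
            print("topic %s looks like a duplicate" % topic_id)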