def get_topics(self, list_url):
    """Parse each listing page and yield one dict per topic row found."""
    nb_page = len(list_url)
    for num_page, url in enumerate(list_url):
        num_page += 1
        obj_page = urlopen(url)
        soup = BeautifulSoup.BeautifulSoup(obj_page)
        name_zone = soup.findAll("div", {"id": "vf"})[0].h2.span.string
        # A search results page mixes categories, so each topic's category is
        # read from its own row; a regular forum page has a single category
        # taken from the page header and the URL.
        search_category = False
        if name_zone == u"Résultats de la recherche":
            search_category = True
        else:
            category = name_zone
            id_category = url.split("id=")[-1].split("&")[0]
        # Text progress bar, redrawn on a single line.
        sys.stdout.write(
            "\rObtention des pages ▕"
            + "█" * num_page
            + " " * (nb_page - num_page)
            + "▏ "
            + str(num_page)
            + "/"
            + str(nb_page)
        )
        sys.stdout.flush()
        for item in soup.findAll("div", "tclcon"):
            # "Déplacé" marks a topic moved to another forum; such rows carry
            # no usable last-post date.
            is_move = False
            if item.contents[0] and u"Déplacé" in item.contents[0].strip():
                is_move = True
            tr_parent = item.findParents("tr")[0]
            topic_id = item.a["href"].split("id=")[-1]
            titre = htmlentitydecode(item.a.string)
            auteur = item.span.contents[0].replace("par ", "")
            is_closed = tr_parent.get("class") == "iclosed"
            if not is_move:
                balise_td = tr_parent.findAll("td", "tcr")[0]
                date = balise_td.a.string
                obj_date = transform_date(date)
            else:
                obj_date = None
            if search_category:
                td_category = tr_parent.findAll("td", "tc2")[0]
                category = td_category.a.string
                id_category = td_category.a["href"].split("id=")[-1]
            yield {
                "id": topic_id,
                "auteur": auteur,
                "titre": titre,
                "is_closed": is_closed,
                "date_last": obj_date,
                "is_move": is_move,
                "id_category": id_category,
                "category": category,
                "num_page": num_page,
            }
    print("")
def doublons(self, **kwargs):
    """Look for duplicate topics among the forum's latest messages."""
    topics = {}
    topic_by_auteur = {}
    url = URL_24H
    nb_page = self.get_page_max(url)
    # Optionally cap the number of listing pages to crawl.
    if not kwargs["nb_page"]:
        pass
    elif kwargs["nb_page"] < nb_page:
        nb_page = kwargs["nb_page"]
    else:
        print("Vous dépassez la limite du forum, il y a %s pages sur ce forum" % nb_page)
    url = url + "&p=%s"
    list_url = [url % item for item in range(1, 1 + nb_page)]
    # Index every topic (except the Trash category) by id and by author.
    for topic in self.get_topics(list_url):
        if topic["category"] != "Trash":
            topic_id = topic["id"]
            auteur = topic["auteur"]
            topics[topic_id] = topic
            topic_by_auteur.setdefault(auteur, [])
            topic_by_auteur[auteur].append(topic_id)
    # Keep only authors with more than one topic, ignoring closed topics.
    auteur_many_topic = dict(
        [
            (key, [ele for ele in value if not topics[ele]["is_closed"]])
            for key, value in topic_by_auteur.items()
            if len(value) > 1 and [ele for ele in value if not topics[ele]["is_closed"]]
        ]
    )
    matchs_by_auth = {}
    namespace = {}
    for auteur, value in auteur_many_topic.items():
        # Deduplicate ids: a topic can show up twice if it changes page
        # while the listing is being fetched.
        value = set(value)
        titles = dict([(id_top, topics[id_top]) for id_top in auteur_many_topic[auteur]])
        matchs_by_auth[auteur] = []
        for id_nbr, id_topic in enumerate(value):
            matchs = {}
            title = titles.pop(id_topic)["titre"]
            # Compare this title with the author's remaining titles.
            for id_top in titles:
                if matchs_by_auth[auteur].count(id_topic) == 0:
                    match = difflib.get_close_matches(
                        title, [titles[id_top]["titre"]], cutoff=0.5
                    )
                    if match:
                        matchs[id_top] = match[0]
                        matchs_by_auth[auteur].append(id_top)
            if matchs:
                matchs[id_topic] = title
                # Fetch the topic page to retrieve the author's user id.
                obj_page = urlopen(URL_TOPIC % id_topic)
                soup = BeautifulSoup.BeautifulSoup(obj_page)
                auteur_id = (
                    soup.findAll("div", "postleft")[0].findAll("a")[0]["href"].split("id=")[-1]
                )
                self.debug("--------------\n" + auteur)
                for key in matchs.keys():
                    self.debug(
                        htmlentitydecode(matchs[key])
                        + u" (id:"
                        + str(key)
                        + u", "
                        + htmlentitydecode(topics[key]["category"])
                        + u")"
                    )
                namespace.setdefault("topics", [])
                list_titre = [
                    {
                        "topic_id": key,
                        "titre": value,
                        "id_category": topics[key]["id_category"],
                        "category": topics[key]["category"],
                    }
                    for key, value in matchs.items()
                ]
                namespace["topics"].append(
                    {
                        "auteur_id": auteur_id,
                        "auteur": auteur,
                        "list_titre": list_titre,
                        "topic_id": topic_id,
                    }
                )
    # Render the report from the template and write it to disk.
    html_page = self.affichage("doublons.txt", namespace)
    obj_file = open("doublons.html", "w")
    obj_file.write(html_page)
    obj_file.close()
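# --- Usage sketch (assumptions, not part of the original module) ------------
# A minimal example of how these methods might be called, assuming they live
# on a scraper class (hypothetically named `Crawler` here) and that URL_24H
# and URL_TOPIC are module-level constants as used above:
#
#     crawler = Crawler()
#     crawler.doublons(nb_page=None)   # None: crawl every page of the listing
#     crawler.doublons(nb_page=3)      # or cap the crawl at the first 3 pages
#
# Either call writes the rendered report to doublons.html in the current
# directory.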