import urllib

from bs4 import BeautifulSoup


def scrap_genre(self, channel, genre, url):
    # Fetch the genre listing; soup.article('li') collects every <li> inside the
    # page's first <article> element.
    html = urllib.urlopen(url).read()
    soup = BeautifulSoup(html, from_encoding='utf-8')
    for show in soup.article('li'):
        try:
            # Resolve the show link against the site root, then scrape that series page.
            url = urllib.basejoin(self.BASE_URL, show.a.get('href'))
            print url
            self.get_serie(channel, genre, url)
        except Exception as e:
            print "ERROR: %s" % str(e)
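# Usage sketch (hypothetical): scrap_genre is an instance method, so the wrapper class,
# its BASE_URL, and the get_serie stub below are assumptions, not part of the original
# scraper; the channel, genre, and URL values are placeholders.
class GenreScraper(object):
    BASE_URL = 'http://www.example.com/'

    scrap_genre = scrap_genre  # reuse the function above as a method

    def get_serie(self, channel, genre, url):
        pass  # the real scraper would fetch and parse the individual series page here


if __name__ == '__main__':
    GenreScraper().scrap_genre('channel-1', 'series', 'http://www.example.com/genre/series')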
from bs4 import BeautifulSoup


def scrape_news():
    # Relies on two module-level globals: `browser` (a Splinter Browser) and the
    # `mars_info` dict that collects the scraped values.
    try:
        news_url = "https://mars.nasa.gov/news/"
        browser.visit(news_url)
        html = browser.html
        soup = BeautifulSoup(html, "html.parser")

        # Find the list blocks that hold the title and teaser text.
        article = soup.find_all("div", class_="list_text")

        # Find title and teaser; soup.article("div", ...) searches for <div> tags
        # inside the page's first <article> element.
        news_title = soup.article("div", class_="content_title")
        news_p = soup.article("div", class_="article_teaser_body")

        # Dictionary entries for MARS NEWS
        mars_info['news_title'] = news_title
        mars_info['news_paragraph'] = news_p

        return mars_info
    finally:
        browser.quit()
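# Minimal setup sketch for scrape_news: it depends on two module-level globals, a Splinter
# `browser` and a `mars_info` dict; the Browser arguments below are an assumption and depend
# on the splinter/webdriver versions installed.
from splinter import Browser

if __name__ == "__main__":
    browser = Browser("chrome", headless=True)
    mars_info = {}
    print(scrape_news())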
import requests
from bs4 import BeautifulSoup


def recuperer_infos_conseiller(url):
    # Fetch a councillor's profile page; AFE_DOMAIN and the `correspondance` mapping
    # (field label -> dict key) are module-level globals.
    res = requests.get(f"{AFE_DOMAIN}{url}")
    res.raise_for_status()
    soup = BeautifulSoup(res.content, "lxml")
    header = soup.article.header
    titre, *_ = header.h1.contents[0].strip().split(" ", 1)
    # discard the first one, which is handled separately (date and place of birth)
    fields = [
        li for li in soup.article("li") if len(li.contents) == 2 and li.strong
    ][1:]
    infos = {
        correspondance[li.contents[0].text.strip(": ")]: li.contents[1]
        for li in fields
    }
    bloc_naissance = header.ul("li")[0]
    if len(bloc_naissance.contents) > 1:
        infos["date_naissance"], *lieu = bloc_naissance.contents[1].split(" à ")
        if lieu:
            infos["lieu_naissance"] = lieu[0]
    if "email_principal" in infos:
        infos["email_principal"] = infos["email_principal"].text.replace(" chez ", "@")
    if "email_autre" in infos:
        infos["email_autre"] = infos["email_autre"].text.replace(" chez ", "@")
    return {
        "titre": titre,
        **{
            k: v.strip() if isinstance(v, str) else v.text.strip()
            for k, v in infos.items()
        },
    }
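# Usage sketch for recuperer_infos_conseiller: the domain, the path, and the label-to-key
# entries in `correspondance` below are placeholders for illustration, not values taken
# from the original project.
AFE_DOMAIN = "https://www.example.org"
correspondance = {
    "Circonscription": "circonscription",
    "Courriel": "email_principal",
}

if __name__ == "__main__":
    conseiller = recuperer_infos_conseiller("/conseiller/exemple")
    print(conseiller["titre"], conseiller.get("email_principal"))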
def download_chapter(
    url=None, filename=None, main_title=None,
    title_strip=None, title_re=None, scraper=None
):
    # Download a given chapter and save it as a standalone XHTML file.
    from bs4 import BeautifulSoup, UnicodeDammit
    import requests
    import re

    if url is None or filename is None:
        return False

    # Use the provided session (e.g. a cloudflare-aware scraper) if given.
    if scraper is not None:
        page = scraper.get(url)
    else:
        page = requests.get(url)
    if page.status_code == 404:
        return False
    page.encoding = "utf-8"

    #tree = BeautifulSoup(UnicodeDammit.detwingle(page.text), "html5lib")
    tree = BeautifulSoup(page.text, "html5lib")

    # Trim down to just the article content
    title = ""
    btree = tree.article
    if not btree:
        return False

    # Remove Next/Previous/Index navigation links and presentation-only markup.
    for i in btree("a", string=re.compile("(Next|Prev(ious)?|Index)( ?Chapter)?")):
        i.decompose()
    for i in btree("hr"):
        i.unwrap()
    for i in btree("span", style=re.compile("float: ?right")):
        i.decompose()
    for i in btree("span", style=re.compile("(font-family|color|text-align)")):
        i.unwrap()
    for i in btree("div", class_=re.compile("wpcnt|sharedaddy")):
        i.decompose()
    for i in btree("p"):
        if i.has_attr('style'):
            del i['style']
    if "Previous Chapter" in btree.p.text:
        btree.p.decompose()

    # TODO: remove all empty tags
    # Want to rewrite chapter links
    # pull images from glossary page and embed?

    if main_title is not None and main_title != "":
        title = main_title
    else:
        doc_title = btree.find("h1", class_="entry-title")
        temp_string = ""
        if doc_title and isinstance(doc_title, list):
            doc_title = doc_title[0]
        if doc_title:
            for x in doc_title.stripped_strings:
                temp_string = "{} {}".format(temp_string, x)
            doc_title = temp_string
        if "glossary" in doc_title.lower() or "index" in doc_title.lower():
            title = doc_title
        else:
            t_div = btree("div", class_="entry-content")[0]
            if t_div.u:
                t_div.u.unwrap()
            if t_div.span:
                t_div.span.unwrap()
            if t_div.br:
                t_div.br.decompose()
            if t_div.b:
                # Normalise a bolded title into a single <strong> tag.
                st = tree.new_tag("strong")
                temp_string = ""
                for x in t_div.b.stripped_strings:
                    temp_string = temp_string + " {}".format(x)
                st.string = temp_string
                tree.article.div.b.replace_with(st)
            titles = t_div.strong
            if titles:
                if titles.br:
                    titles.br.decompose()
                if not titles.string:
                    title = ""
                    for x in titles.stripped_strings:
                        title = title + " {}".format(x)
                else:
                    title = titles.string
            if re.match(r'^\s+$', title):
                title = ""
            if title == "" and t_div.h3:
                title = t_div.h3.string
            if title == "":
                title = doc_title
    if not title:
        title = tree.title.string

    # Clean up whitespace in the title and apply any caller-supplied rewrites.
    title = title.strip()
    title = re.sub(re.compile('\n| |\r|\t| '), ' ', title)
    title = title.replace(' ', ' ')
    if title_strip is not None:
        title = re.sub(title_strip, '', title).strip()
    if title_re is not None:
        # title_re is expected as 'pattern||replacement', optionally wrapped in quotes.
        title_re = title_re.strip()
        title_re = title_re.rstrip('"')
        title_re = title_re.lstrip('"')
        title_re = title_re.rstrip("'")
        title_re = title_re.lstrip("'")
        t_regex = title_re.split('||')
        title = re.sub(t_regex[0], t_regex[1], title)

    # Rewrap the chapter content in freshly-created document tags (section, body, html,
    # head, title, stylesheet link) for the EPUB output.
    nt = tree.new_tag("section")
    nt["epub:type"] = "chapter"
    tmp = tree.article("div", class_="entry-meta")
    if tmp:
        tmp[0].decompose()
    tree.article.div.wrap(nt)
    tree.article.div.unwrap()
    nt = tree.new_tag("body")
    tree.article.section.wrap(nt)
    nt = tree.new_tag("html")
    tree.article.section.wrap(nt)
    nt = tree.new_tag("head")
    tree.article.section.insert_before(nt)
    nt = tree.new_tag("title")
    nt.string = title.strip()
    tree.article.head.append(nt)
    nt = tree.new_tag("link", rel="stylesheet", href="style/main.css")
    nt["type"] = "text/css"
    tree.article.head.append(nt)

    # Re-parse the rebuilt document and write it out.
    tree = BeautifulSoup(tree.article.html.prettify(formatter="html"), "html5lib")
    with open(filename, 'w') as f:
        f.write(tree.prettify(formatter="html"))
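# Usage sketch for download_chapter: the URL, output filename, and title_re rewrite rule are
# placeholders, not values from the original project; the function returns False on failure
# and otherwise writes the cleaned chapter to the given file.
if __name__ == "__main__":
    result = download_chapter(
        url="https://example-serial.wordpress.com/2016/01/01/chapter-1/",
        filename="chapter-001.xhtml",
        title_re="Chapter (\\d+).*||Chapter \\1",
    )
    if result is False:
        print("download failed")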