Example #1
def scrap_genre(self, channel, genre, url):
    html = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(html, "html.parser", from_encoding="utf-8")
    # Each show is an <li> inside the page's first <article> element
    for show in soup.article("li"):
        try:
            url = urllib.parse.urljoin(self.BASE_URL, show.a.get("href"))
            print(url)
            self.get_serie(channel, genre, url)
        except Exception as e:
            print("ERROR: %s" % e)
Example #2
def scrape_news():
    # `browser` is assumed to be an already-initialised splinter Browser
    try:
        news_url = "https://mars.nasa.gov/news/"
        browser.visit(news_url)
        html = browser.html
        soup = BeautifulSoup(html, "html.parser")

        # Title and teaser text for the most recent article; the search
        # is rooted at the page's first <article> element
        news_title = soup.article.find("div", class_="content_title").get_text(strip=True)
        news_p = soup.article.find("div", class_="article_teaser_body").get_text(strip=True)

        # Dictionary entry from MARS NEWS
        mars_info = {
            "news_title": news_title,
            "news_paragraph": news_p,
        }

        return mars_info

    finally:
        browser.quit()
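The function above assumes a module-level splinter browser. A minimal setup sketch; the driver choice is an assumption:

from splinter import Browser

# Any supported driver works; chrome is an assumption here
browser = Browser("chrome", headless=True)
mars_info = scrape_news()
print(mars_info)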
Example #3
def recuperer_infos_conseiller(url):
    res = requests.get(f"{AFE_DOMAIN}{url}")
    res.raise_for_status()

    soup = BeautifulSoup(res.content, "lxml")
    header = soup.article.header

    titre, *_ = header.h1.contents[0].strip().split(" ", 1)
    # Keep only <li> entries with a <strong> label and a value; skip the
    # first one, which is handled separately (date and place of birth)
    fields = [
        li for li in soup.article("li") if len(li.contents) == 2 and li.strong
    ][1:]

    infos = {
        correspondance[li.contents[0].text.strip(": ")]: li.contents[1]
        for li in fields
    }

    bloc_naissance = header.ul("li")[0]
    if len(bloc_naissance.contents) > 1:
        infos["date_naissance"], *lieu = bloc_naissance.contents[1].split(
            " à ")
        if lieu:
            infos["lieu_naissance"] = lieu[0]

    if "email_principal" in infos:
        infos["email_principal"] = infos["email_principal"].text.replace(
            " chez ", "@")
    if "email_autre" in infos:
        infos["email_autre"] = infos["email_autre"].text.replace(" chez ", "@")

    return {
        "titre": titre,
        **{
            k: v.strip() if isinstance(v, str) else v.text.strip()
            for k, v in infos.items()
        },
    }
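recuperer_infos_conseiller depends on two module-level names, AFE_DOMAIN and correspondance (a mapping from the French labels on the page to dictionary keys). A hypothetical sketch of their shape, with the domain and label strings invented for illustration:

import requests
from bs4 import BeautifulSoup

# Hypothetical values; the real module defines these elsewhere
AFE_DOMAIN = "https://www.example-afe.fr"
correspondance = {
    "Email": "email_principal",
    "Autre email": "email_autre",
}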
Example #4
def download_chapter(
  url=None, filename=None,
  main_title=None, title_strip=None, title_re=None, scraper=None
):
  # Download a given chapter and write it out as a standalone XHTML file
  from bs4 import BeautifulSoup
  import requests, re

  if (url is None or filename is None):
    return False

  if (scraper is not None):
    page = scraper.get(url)
  else:
    page = requests.get(url)
  if page.status_code == 404:
    return False
  page.encoding="utf-8"
  #tree = BeautifulSoup(UnicodeDammit.detwingle(page.text), "html5lib")
  tree = BeautifulSoup(page.text, "html5lib")
  # Trim down to just the article content
  title = ""
  btree = tree.article
  if not btree:
    return False
  # remove Next/Previous/Index navigation links
  for i in btree("a", string=re.compile("(Next|Prev(ious)?|Index)( ?Chapter)?")):
    i.decompose()
  for i in btree("hr"):
    i.unwrap()
  for i in btree("span", style=re.compile("float: ?right")):
    i.decompose()
  for i in btree("span", style=re.compile("(font-family|color|text-align)")):
    i.unwrap()
  for i in btree("div", class_=re.compile("wpcnt|sharedaddy")):
    i.decompose()
  for i in btree("p"):
    if 'style' in i:
      del i['style']
  if "Previous Chapter" in btree.p.text:
    btree.p.decompose()
  # TODO: remove all empty tags
  # Want to rewrite chapter links
  # pull images from glossary page and embed?
  if (main_title is not None and main_title != ""):
    title = main_title
  else:
    doc_title = btree.find("h1", class_="entry-title")
    temp_string = ""
    if doc_title:
      for x in doc_title.stripped_strings:
        temp_string = "{} {}".format(temp_string, x)
    doc_title = temp_string
    if ("glossary" in doc_title.lower() or
      "index" in doc_title.lower()
    ):
      title = doc_title
    else:
      t_div = btree("div", class_="entry-content")[0]
      if (t_div.u):
        t_div.u.unwrap()
      if t_div.span:
        t_div.span.unwrap()
      if t_div.br:
        t_div.br.decompose()
      if (t_div.b):
        st = tree.new_tag("strong")
        temp_string = ""
        for x in t_div.b.stripped_strings:
          temp_string = temp_string + " {}".format(x)
        st.string = temp_string
        tree.article.div.b.replace_with(st)
      titles = t_div.strong
      #~ print("titles:{}".format(titles))
      if (titles):
        if titles.br:
          titles.br.decompose()
        if not titles.string:
          title = ""
          for x in titles.stripped_strings:
            title = title + " {}".format(x)
        else:
          title = titles.string
        #~ print("strtitle:{}".format(title))
        if (re.match('^\s+$', title)):
          title = ""
      if (title == "" and t_div.h3):
        title = t_div.h3.string
        #~ print("h3title:{}".format(title))
      if (title == ""):
        title = doc_title
        #~ print("dtitle:{}".format(title))
    if not title:
      title = tree.title.string
    #~ print("title:{}".format(title))
    title = title.strip()
    title = re.sub(re.compile('\n|  |\r|\t| '), ' ', title)
    title = title.replace('  ', ' ')
    if (title_strip is not None):
      title = re.sub(title_strip, '', title).strip()
    if (title_re is not None):
      # title_re has the form "pattern||replacement", optionally quoted
      title_re = title_re.strip().strip('"\'')
      t_regex = title_re.split('||')
      title = re.sub(t_regex[0], t_regex[1], title)
  nt = tree.new_tag("section")
  nt["epub:type"] = "chapter"
  tmp = tree.article("div", class_="entry-meta")
  if tmp:
    tmp[0].decompose()
  tree.article.div.wrap(nt)
  tree.article.div.unwrap()
  nt = tree.new_tag("body")
  tree.article.section.wrap(nt)
  nt = tree.new_tag("html")
  tree.article.section.wrap(nt)
  nt = tree.new_tag("head")
  tree.article.section.insert_before(nt)
  nt = tree.new_tag("title")
  nt.string = title.strip()
  tree.article.head.append(nt)
  nt = tree.new_tag("link", rel="stylesheet", href="style/main.css")
  nt["type"] = "text/css"
  tree.article.head.append(nt)
  # Re-parse so html5lib produces a well-formed document, then write it out
  tree = BeautifulSoup(tree.article.html.prettify(formatter="html"), "html5lib")
  with open(filename, 'w', encoding='utf-8') as f:
    f.write(tree.prettify(formatter="html"))
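A hypothetical invocation, with the URL, filename, and strip pattern invented for illustration:

# Placeholders only; download_chapter returns False on a 404 or a missing <article>
download_chapter(
    url="https://example.wordpress.com/2020/01/01/chapter-1/",
    filename="chapter-1.xhtml",
    title_strip=r"My Novel: ",
)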