Example #1
def update():
  paper = "politico"
  feeds = ("http://www.politico.com/rss/congress.xml",
           "http://www.politico.com/rss/politics.xml")

  #Get links and titles from parsing
  updatepaper = MainUpdate()
  updatepaper.parse(paper, feeds)

  if len(updatepaper.links) == 0:
    print("No new articles found.")
    return 0

  #Scrape articles for the printable link
  updatepaper.scrape(paper, updatepaper.links, updatepaper.titles)

  if len(updatepaper.scrapefiles) == 0:
    print("No new articles found.")
    return 0

  #Get printable urls
  actualurls = []
  actualtitles = []
  for num, file in enumerate(updatepaper.scrapefiles):
    for line in file:
      if 'shr-print' in line:
        actualurls.append(line.split('"')[3])
        actualtitles.append(updatepaper.scrapetitles[num])
        break

  #Download the scraped links
  updatepaper.download(paper, actualurls, actualtitles)

  #Insert the downloaded articles into the DB
  updatepaper.insert(paper, updatepaper.outfiles, updatepaper.outtitles)
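
A note on MainUpdate: Example #1 and all of the examples that follow call into a MainUpdate helper class that is not shown on this page. The sketch below is only an assumption inferred from those calls (parse, scrape, download, insert and the lists they fill); the real class presumably also handles duplicate detection and database access, so treat this as a minimal illustration rather than the project's actual implementation.

import re
import urllib.request


class MainUpdate:
    """Minimal sketch of the helper assumed by the update() examples.

    Method names, signatures and attributes are inferred from the calls
    in the examples; the fetching and parsing details are guesses.
    """

    def __init__(self):
        self.links, self.titles = [], []
        self.scrapefiles, self.scrapetitles = [], []
        self.outfiles, self.outtitles = [], []

    def _fetch(self, url):
        #Plain HTTP GET; the real project may add caching, retries, etc.
        with urllib.request.urlopen(url) as resp:
            return resp.read().decode("utf-8", errors="replace")

    def parse(self, paper, feeds):
        #Pull article links and titles out of each RSS feed (naive regex).
        for feed in feeds:
            xml = self._fetch(feed)
            for item in re.findall(r"<item>(.*?)</item>", xml, re.S):
                link = re.search(r"<link>(.*?)</link>", item, re.S)
                title = re.search(r"<title>(.*?)</title>", item, re.S)
                if link and title:
                    self.links.append(link.group(1).strip())
                    self.titles.append(title.group(1).strip())

    def scrape(self, paper, links, titles):
        #Fetch each article page and keep it as a list of lines, since the
        #callers look for the print link with "for line in file".
        for link, title in zip(links, titles):
            self.scrapefiles.append(self._fetch(link).splitlines())
            self.scrapetitles.append(title)

    def download(self, paper, urls, titles):
        #Fetch the printer-friendly pages found by the callers.
        for url, title in zip(urls, titles):
            self.outfiles.append(self._fetch(url))
            self.outtitles.append(title)

    def insert(self, paper, files, titles):
        #The real implementation writes to a database; stubbed out here.
        for title in titles:
            print("would insert '%s' for %s" % (title, paper))
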
Example #2
def update():
  paper = "chicagotribune"
  feeds = ("http://feeds.chicagotribune.com/chicagotribune/news/nationworld/",)

  #Get links and titles from parsing
  updatepaper = MainUpdate()
  updatepaper.parse(paper, feeds)

  if len(updatepaper.links) == 0:
    print("No new articles found.")
    return 0

  #Scrape articles for the printable link
  updatepaper.scrape(paper, updatepaper.links, updatepaper.titles)

  if len(updatepaper.scrapefiles) == 0:
    print("No new articles found.")
    return 0

  #Get printable urls
  actualurls = []
  actualtitles = []
  beginurl = "http://www.chicagotribune.com"
  for num, file in enumerate(updatepaper.scrapefiles):
    for line in file:
      if 'articletools-print' in line:
        actualurls.append(beginurl + line.split('"')[3])
        actualtitles.append(updatepaper.scrapetitles[num])
        break

  #Download the scraped links
  updatepaper.download(paper, actualurls, actualtitles)

  #Insert the downloaded articles into the DB
  updatepaper.insert(paper, updatepaper.outfiles, updatepaper.outtitles)
Example #3
def update():
  paper = "foreignpolicy"
  feeds = ("http://www.foreignpolicy.com/issue/featured_content.php",
           "http://www.foreignpolicy.com/node/feed",
           "http://www.foreignpolicy.com/taxonomy/term/655/0/feed")

  #Get links and titles from parsing
  updatepaper = MainUpdate()
  updatepaper.parse(paper, feeds)

  if len(updatepaper.links) == 0:
    print("No new articles found.")
    return 0

  #Scrape articles for the printable link
  updatepaper.scrape(paper, updatepaper.links, updatepaper.titles)

  if len(updatepaper.scrapefiles) == 0:
    print("No new articles found.")
    return 0

  #Get printable urls
  actualurls = []
  actualtitles = []
  for num, file in enumerate(updatepaper.scrapefiles):
    for line in file:
      if '>PRINT' in line:
        actualurls.append(line.split('"')[1])
        actualtitles.append(updatepaper.scrapetitles[num])
        break

  #Download the scraped links
  updatepaper.download(paper, actualurls, actualtitles)

  #Format the output files before inserting
  insertfiles = []
  for file in updatepaper.outfiles:
    readfile = file.split("\n")
    insertfile = ""

    for line in readfile:
      if not "window.print()" in line:
        insertfile = insertfile + "\n" + line
        
    insertfiles.append(insertfile)


  #Insert the cleaned-up articles into the DB
  updatepaper.insert(paper, insertfiles, updatepaper.outtitles)
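
The loop in Example #3 strips every line containing window.print() before the insert, so the stored copy does not pop a print dialog when it is viewed later. The same cleanup can be written as a small helper (equivalent apart from the leading newline the loop above prepends):

def strip_print_call(page):
    #Drop any line containing window.print() and rejoin the rest.
    return "\n".join(
        line for line in page.split("\n") if "window.print()" not in line)


insertfiles = [strip_print_call(file) for file in updatepaper.outfiles]
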
Example #4
def update():
    paper = "latimes"
    feeds = (
        "http://feeds.latimes.com/latimes/news/nationworld/nation?format=xml",
        "http://feeds.latimes.com/latimes/news/nationworld/world?format=xml")

    #Get links and titles from parsing
    updatepaper = MainUpdate()
    updatepaper.parse(paper, feeds)

    if len(updatepaper.links) == 0:
        print("No new articles found.")
        return 0

    #Scrape articles for the printable link
    scrapeurls = []
    beginurl = "http://www.latimes.com/news"
    for url in updatepaper.links:
        urlparts = url.split("/")
        scrapeurl = (beginurl + "/" + urlparts[6] + "/" + urlparts[7]
                     + "/" + urlparts[-1])
        scrapeurls.append(scrapeurl)

    updatepaper.scrape(paper, scrapeurls, updatepaper.titles)

    if len(updatepaper.scrapefiles) == 0:
        print("No new articles found.")
        return 0

    #Get printable urls
    actualurls = []
    actualtitles = []
    beginurl = "http://www.latimes.com"
    for num, file in enumerate(updatepaper.scrapefiles):
        for line in file:
            if '>Print<' in line:
                actualurls.append(beginurl + line.split('"')[1])
                actualtitles.append(updatepaper.scrapetitles[num])
                break

    #Download the scraped links
    updatepaper.download(paper, actualurls, actualtitles)

    #Insert the downloaded articles into the DB
    updatepaper.insert(paper, updatepaper.outfiles, updatepaper.outtitles)
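
The latimes examples rebuild the scrape URL from pieces of the feed link instead of following the feed link directly. The snippet below uses a made-up feed link purely to show which segments the indexing picks out; the real feedproxy links may be shaped differently:

#Hypothetical feed link, only to illustrate urlparts[6], urlparts[7] and urlparts[-1].
url = "http://feeds.latimes.com/~r/latimes/news/nationworld/nation/~3/abc123/la-na-example-story.html"
urlparts = url.split("/")
#urlparts[6] -> "nationworld", urlparts[7] -> "nation", urlparts[-1] -> "la-na-example-story.html"
scrapeurl = "http://www.latimes.com/news" + "/" + urlparts[6] + "/" + urlparts[7] + "/" + urlparts[-1]
print(scrapeurl)
#-> http://www.latimes.com/news/nationworld/nation/la-na-example-story.html
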
Example #5
def update():
    paper = "latimes"
    feeds = (
        "http://feeds.latimes.com/latimes/news/nationworld/nation?format=xml",
        "http://feeds.latimes.com/latimes/news/nationworld/world?format=xml",
    )

    # Get links and titles from parsing
    updatepaper = MainUpdate()
    updatepaper.parse(paper, feeds)

    if len(updatepaper.links) == 0:
        print("No new articles found.")
        return 0

    # Scrape articles for the printable link
    scrapeurls = []
    beginurl = "http://www.latimes.com/news"
    for url in updatepaper.links:
        urlparts = url.split("/")
        scrapeurl = beginurl + "/" + urlparts[6] + "/" + urlparts[7] + "/" + urlparts[-1]
        scrapeurls.append(scrapeurl)

    updatepaper.scrape(paper, scrapeurls, updatepaper.titles)

    if len(updatepaper.scrapefiles) == 0:
        print("No new articles found.")
        return 0

    # Get printable urls
    actualurls = []
    actualtitles = []
    beginurl = "http://www.latimes.com"
    for num, file in enumerate(updatepaper.scrapefiles):
        for line in file:
            if "articletools-print" in line:
                actualurls.append(beginurl + line.split('"')[3])
                actualtitles.append(updatepaper.scrapetitles[num])
                break

    # Download the scraped links
    updatepaper.download(paper, actualurls, actualtitles)

    # Insert the downloaded articles into the DB
    updatepaper.insert(paper, updatepaper.outfiles, updatepaper.outtitles)
Example #6
def update():
    paper = "chicagotribune"
    feeds = (
        "http://feeds.chicagotribune.com/chicagotribune/news/nationworld/",)

    #Get links and titles from parsing
    updatepaper = MainUpdate()
    updatepaper.parse(paper, feeds)

    if len(updatepaper.links) == 0:
        print("No new articles found.")
        return 0

    #Scrape articles for the printable link
    updatepaper.scrape(paper, updatepaper.links, updatepaper.titles)

    if len(updatepaper.scrapefiles) == 0:
        print("No new articles found.")
        return 0

    #Get printable urls
    actualurls = []
    actualtitles = []
    beginurl = "http://www.chicagotribune.com"
    for num, file in enumerate(updatepaper.scrapefiles):
        for line in file:
            if 'alt="Print"' in line:
                actualurls.append(beginurl + line.split('"')[5])
                actualtitles.append(updatepaper.scrapetitles[num])
                break

    #Download the scraped links
    updatepaper.download(paper, actualurls, actualtitles)

    #Insert the downloaded articles into the DB
    updatepaper.insert(paper, updatepaper.outfiles, updatepaper.outtitles)
Example #7
def update():
  paper = "londontimes"
  feeds = ("http://feeds.timesonline.co.uk/c/32313/f/440158/index.rss",
           "http://feeds.timesonline.co.uk/c/32313/f/440154/index.rss")

  #Get links and titles from parsing
  updatepaper = MainUpdate()
  updatepaper.parse(paper, feeds)

  if len(updatepaper.links) == 0:
    print("No new articles found.")
    return 0

  #Scrape articles for the printable link
  updatepaper.scrape(paper, updatepaper.links, updatepaper.titles)

  if len(updatepaper.scrapefiles) == 0:
    print("No new articles found.")
    return 0

  #Change links to printable
  beginurl = "http://timesonline.co.uk"
  dlext = "?print=yes"
  actualurls = []
  actualtitles = []
  for num, file in enumerate(updatepaper.scrapefiles):
    for line in file:
      if "print-comment" in line:
        actualurls.append(beginurl + line.split("'")[1] + dlext)
        actualtitles.append(updatepaper.scrapetitles[num])
        break

  #Download the links
  updatepaper.download(paper, actualurls, actualtitles)

  #Insert the downloaded articles into the DB
  updatepaper.insert(paper, updatepaper.outfiles, updatepaper.outtitles)
Example #8
def update():
  paper = "foreignaffairs"
  feeds = ("http://www.foreignaffairs.com/rss.xml",)

  #Get links and titles from parsing
  updatepaper = MainUpdate()
  updatepaper.parse(paper, feeds)

  if len(updatepaper.links) == 0:
    print("No new articles found.")
    return 0

  #Scrape articles for the printable link
  updatepaper.scrape(paper, updatepaper.links, updatepaper.titles)

  if len(updatepaper.scrapefiles) == 0:
    print("No new articles found.")
    return 0

  #Get printable urls
  actualurls = []
  actualtitles = []
  beginurl = "http://www.foreignaffairs.com"
  for num, file in enumerate(updatepaper.scrapefiles):
    for line in file:
      if 'print_html' in line:
        actualurls.append(beginurl + line.split('"')[3])
        actualtitles.append(updatepaper.scrapetitles[num])
        break

  #Download the scraped links
  updatepaper.download(paper, actualurls, actualtitles)

  #Insert the downloaded articles into the DB
  updatepaper.insert(paper, updatepaper.outfiles, updatepaper.outtitles)
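
None of the examples show how the individual update() functions are driven. One plausible way to tie them together, assuming (hypothetically) that each paper's updater lives in a module named after its paper string, is a small dispatcher:

import importlib

#Hypothetical module names; the real project layout may differ.
PAPERS = ("politico", "chicagotribune", "foreignpolicy",
          "latimes", "londontimes", "foreignaffairs")


def update_all():
    for name in PAPERS:
        try:
            module = importlib.import_module(name)
        except ImportError:
            print("No updater module found for %s." % name)
            continue
        print("Updating %s..." % name)
        module.update()


if __name__ == "__main__":
    update_all()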