def update(): paper = "politico" feeds = ("http://www.politico.com/rss/congress.xml", "http://www.politico.com/rss/politics.xml") #Get links and titles from parsing updatepaper = MainUpdate() updatepaper.parse(paper, feeds) if len(updatepaper.links) == 0: print("No new articles found.") return 0 #Scrap articles for printable link updatepaper.scrape(paper, updatepaper.links, updatepaper.titles) if len(updatepaper.scrapefiles) == 0: print("No new articles found.") return 0 #Get printable urls actualurls = [] actualtitles = [] for num, file in enumerate(updatepaper.scrapefiles): for line in file: if 'shr-print' in line: actualurls.append(line.split('"')[3]) actualtitles.append(updatepaper.scrapetitles[num]) break #Download the scraped links updatepaper.download(paper, actualurls, actualtitles) #Insert the modified links into the DB updatepaper.insert(paper, updatepaper.outfiles, updatepaper.outtitles)
def update(): paper = "chicagotribune" feeds = ("http://feeds.chicagotribune.com/chicagotribune/news/nationworld/",) #Get links and titles from parsing updatepaper = MainUpdate() updatepaper.parse(paper, feeds) if len(updatepaper.links) == 0: print("No new articles found.") return 0 #Scrap articles for printable link updatepaper.scrape(paper, updatepaper.links, updatepaper.titles) if len(updatepaper.scrapefiles) == 0: print("No new articles found.") return 0 #Get printable urls actualurls = [] actualtitles = [] beginurl = "http://www.chicagotribune.com" for num, file in enumerate(updatepaper.scrapefiles): for line in file: if 'articletools-print' in line: actualurls.append(beginurl + line.split('"')[3]) actualtitles.append(updatepaper.scrapetitles[num]) break #Download the scraped links updatepaper.download(paper, actualurls, actualtitles) #Insert the modified links into the DB updatepaper.insert(paper, updatepaper.outfiles, updatepaper.outtitles)
def update(): paper = "foreignpolicy" feeds = ("http://www.foreignpolicy.com/issue/featured_content.php", "http://www.foreignpolicy.com/node/feed", "http://www.foreignpolicy.com/taxonomy/term/655/0/feed") #Get links and titles from parsing updatepaper = MainUpdate() updatepaper.parse(paper, feeds) if len(updatepaper.links) == 0: print("No new articles found.") return 0 #Scrap articles for printable link updatepaper.scrape(paper, updatepaper.links, updatepaper.titles) if len(updatepaper.scrapefiles) == 0: print("No new articles found.") return 0 #Get printable urls actualurls = [] actualtitles = [] total = len(updatepaper.scrapefiles) for num, file in enumerate(updatepaper.scrapefiles): for line in file: if '>PRINT' in line: actualurls.append(line.split('"')[1]) actualtitles.append(updatepaper.scrapetitles[num]) break #Download the scraped links updatepaper.download(paper, actualurls, actualtitles) #Format outputted files insertfiles = [] for file in updatepaper.outfiles: readfile = file.split("\n") insertfile = "" for line in readfile: if not "window.print()" in line: insertfile = insertfile + "\n" + line insertfiles.append(insertfile) #Insert the modified links into the DB updatepaper.insert(paper, insertfiles, updatepaper.outtitles)
def update(): paper = "latimes" feeds = ( "http://feeds.latimes.com/latimes/news/nationworld/nation?format=xml", "http://feeds.latimes.com/latimes/news/nationworld/world?format=xml") #Get links and titles from parsing updatepaper = MainUpdate() updatepaper.parse(paper, feeds) if len(updatepaper.links) == 0: print("No new articles found.") return 0 #Scrap articles for printable link scrapeurls = [] beginurl = "http://www.latimes.com/news" for url in updatepaper.links: urlparts = url.split("/") scrapeurl = beginurl + "/" + urlparts[6] + "/" + urlparts[ 7] + "/" + urlparts[-1] scrapeurls.append(scrapeurl) updatepaper.scrape(paper, scrapeurls, updatepaper.titles) if len(updatepaper.scrapefiles) == 0: print("No new articles found.") return 0 #Get printable urls actualurls = [] actualtitles = [] beginurl = "http://www.latimes.com" total = len(updatepaper.scrapefiles) for num, file in enumerate(updatepaper.scrapefiles): for line in file: if '>Print<' in line: actualurls.append(beginurl + line.split('"')[1]) actualtitles.append(updatepaper.scrapetitles[num]) break #Download the scraped links updatepaper.download(paper, actualurls, actualtitles) #Insert the modified links into the DB updatepaper.insert(paper, updatepaper.outfiles, updatepaper.outtitles)
def update(): paper = "latimes" feeds = ( "http://feeds.latimes.com/latimes/news/nationworld/nation?format=xml", "http://feeds.latimes.com/latimes/news/nationworld/world?format=xml", ) # Get links and titles from parsing updatepaper = MainUpdate() updatepaper.parse(paper, feeds) if len(updatepaper.links) == 0: print("No new articles found.") return 0 # Scrap articles for printable link scrapeurls = [] beginurl = "http://www.latimes.com/news" for url in updatepaper.links: urlparts = url.split("/") scrapeurl = beginurl + "/" + urlparts[6] + "/" + urlparts[7] + "/" + urlparts[-1] scrapeurls.append(scrapeurl) updatepaper.scrape(paper, scrapeurls, updatepaper.titles) if len(updatepaper.scrapefiles) == 0: print("No new articles found.") return 0 # Get printable urls actualurls = [] actualtitles = [] beginurl = "http://www.latimes.com" for num, file in enumerate(updatepaper.scrapefiles): for line in file: if "articletools-print" in line: actualurls.append(beginurl + line.split('"')[3]) actualtitles.append(updatepaper.scrapetitles[num]) break # Download the scraped links updatepaper.download(paper, actualurls, actualtitles) # Insert the modified links into the DB updatepaper.insert(paper, updatepaper.outfiles, updatepaper.outtitles)
def update(): paper = "chicagotribune" feeds = ( "http://feeds.chicagotribune.com/chicagotribune/news/nationworld/", "http://feeds.chicagotribune.com/chicagotribune/news/nationworld/") #Get links and titles from parsing updatepaper = MainUpdate() updatepaper.parse(paper, feeds) if len(updatepaper.links) == 0: print("No new articles found.") return 0 #Scrap articles for printable link updatepaper.scrape(paper, updatepaper.links, updatepaper.titles) if len(updatepaper.scrapefiles) == 0: print("No new articles found.") return 0 #Get printable urls actualurls = [] actualtitles = [] beginurl = "http://www.chicagotribune.com" total = len(updatepaper.scrapefiles) for num, file in enumerate(updatepaper.scrapefiles): for line in file: if 'alt="Print"' in line: actualurls.append(beginurl + line.split('"')[5]) actualtitles.append(updatepaper.scrapetitles[num]) break #Download the scraped links updatepaper.download(paper, actualurls, actualtitles) #Insert the modified links into the DB updatepaper.insert(paper, updatepaper.outfiles, updatepaper.outtitles)
def update(): paper = "londontimes" feeds = ("http://feeds.timesonline.co.uk/c/32313/f/440158/index.rss", "http://feeds.timesonline.co.uk/c/32313/f/440154/index.rss") #Get links and titles from parsing updatepaper = MainUpdate() updatepaper.parse(paper, feeds) if len(updatepaper.links) == 0: print("No new articles found.") return 0 #Scrap articles for printable link updatepaper.scrape(paper, updatepaper.links, updatepaper.titles) if len(updatepaper.scrapefiles) == 0: print("No new articles found.") return 0 #Change links to printable beginurl = "http://timesonline.co.uk" dlext = "?print=yes" actualurls = [] actualtitles = [] total = len(updatepaper.scrapefiles) for num, file in enumerate(updatepaper.scrapefiles): for line in file: if "print-comment" in line: actualurls.append(beginurl + line.split("'")[1] + dlext) actualtitles.append(updatepaper.scrapetitles[num]) break #Download the links updatepaper.download(paper, actualurls, updatepaper.titles) #Insert the modified links into the DB updatepaper.insert(paper, updatepaper.outfiles, updatepaper.outtitles)
def update(): paper = "foreignaffairs" feeds = ("http://www.foreignaffairs.com/rss.xml",) #Get links and titles from parsing updatepaper = MainUpdate() updatepaper.parse(paper, feeds) if len(updatepaper.links) == 0: print("No new articles found.") return 0 #Scrap articles for printable link updatepaper.scrape(paper, updatepaper.links, updatepaper.titles) if len(updatepaper.scrapefiles) == 0: print("No new articles found.") return 0 #Get printable urls actualurls = [] actualtitles = [] beginurl = "http://www.foreignaffairs.com" total = len(updatepaper.scrapefiles) for num, file in enumerate(updatepaper.scrapefiles): for line in file: if 'print_html' in line: actualurls.append(beginurl + line.split('"')[3]) actualtitles.append(updatepaper.scrapetitles[num]) break #Download the scraped links updatepaper.download(paper, actualurls, actualtitles) #Insert the modified links into the DB updatepaper.insert(paper, updatepaper.outfiles, updatepaper.outtitles)