def update(): paper = "jerusalempost" feeds = ( "http://www.jpost.com/servlet/Satellite?pagename=JPost/Page/RSS&cid=1178443463156", "http://www.jpost.com/servlet/Satellite?pagename=JPost/Page/RSS&cid=1123495333498", "http://www.jpost.com/servlet/Satellite?pagename=JPost/Page/RSS&cid=1178443463144", "http://www.jpost.com/servlet/Satellite?pagename=JPost/Page/RSS&cid=1123495333468" ) #Get links and titles from parsing updatepaper = MainUpdate() updatepaper.parse(paper, feeds) if len(updatepaper.links) == 0: print("No new articles found.") return 0 #Change links to printable actualurls = [] for link in updatepaper.links: actualurl = link.replace("ShowFull", "Printer") actualurls.append(actualurl) #Download the links updatepaper.download(paper, actualurls, updatepaper.titles) #Insert the modified links into the DB updatepaper.insert(paper, updatepaper.outfiles, updatepaper.outtitles)
def update(): paper = "foxnews" feeds = ("http://feeds.foxnews.com/foxnews/world?format=xml", "http://feeds.foxnews.com/foxnews/politics?format=xml") #Get links and titles from parsing updatepaper = MainUpdate() updatepaper.parse(paper, feeds) if len(updatepaper.links) == 0: print("No new articles found.") return 0 #Modify links for printable version beginurl = "http://www.foxnews.com/printer_friendly_story/" actualurls = [] for link in updatepaper.links: actualurl = beginurl + link.split('/')[-1].replace("2933", "3566") actualurls.append(actualurl) #Download the links updatepaper.download(paper, actualurls, updatepaper.titles) #Make sure failures are taken out actualinsertfiles = [] actualtitles = [] for num, outfile in enumerate(updatepaper.outfiles): if not "404 Not Found" in outfile and not "Page cannot be found" in outfile: actualinsertfiles.append(outfile) actualtitles.append(updatepaper.outtitles[num]) #Insert the modified links into the DB updatepaper.insert(paper, actualinsertfiles, actualtitles)
def update(): paper = "economist" feeds = ("http://www.economist.com/rss/briefings_rss.xml", "http://www.economist.com/rss/europe_rss.xml", "http://www.economist.com/rss/united_states_rss.xml", "http://www.economist.com/rss/the_americas_rss.xml", "http://www.economist.com/rss/middle_east_and_africa_rss.xml", "http://www.economist.com/rss/asia_rss.xml", "http://www.economist.com/rss/international_rss.xml", "http://www.economist.com/rss/finance_and_economics_rss.xml") #Get links and titles from parsing updatepaper = MainUpdate() updatepaper.parse(paper, feeds) if len(updatepaper.links) == 0: print("No new articles found.") return 0 #Modify links for printable version actualurls = [] for link in updatepaper.links: actualurl = link.replace("displaystory", "PrinterFriendly") actualurls.append(actualurl) #Download the links updatepaper.download(paper, actualurls, updatepaper.titles) #Insert the modified links into the DB updatepaper.insert(paper, updatepaper.outfiles, updatepaper.outtitles)
def update(): paper = "allafrica" feeds = ("http://allafrica.com/tools/headlines/rdf/latest/headlines.rdf", ) #Get links and titles from parsing updatepaper = MainUpdate() updatepaper.parse(paper, feeds) if len(updatepaper.links) == 0: print("No new articles found.") return 0 #Do allaffrica specific stuff to the links beginurl = "http://allafrica.com/stories/printable/" actualurls = [] for link in updatepaper.links: splitlink = link.split("/") actualurl = beginurl + splitlink[-1] actualurls.append(actualurl) #Download the modified links updatepaper.download(paper, actualurls, updatepaper.titles) #Insert the modified links into the DB updatepaper.insert(paper, updatepaper.outfiles, updatepaper.outtitles)
def update(): paper = "bostonglobe" feeds = ("http://syndication.boston.com/news/nation?mode=rss_10", "http://syndication.boston.com/news/politics/?mode=rss_10", "http://syndication.boston.com/business/?mode=rss_10") #Get links and titles from parsing updatepaper = MainUpdate() updatepaper.parse(paper, feeds) if len(updatepaper.links) == 0: print("No new articles found.") return 0 #Do bostonglobe specific stuff to the links endurl = "?mode=PF" actualurls = [] for link in updatepaper.links: actualurls.append(link.split('?')[0] + endurl) #Download the modified links updatepaper.download(paper, actualurls, updatepaper.titles) if len(updatepaper.outfiles) == 0: return 0 #Strip some bad stuff out of the downloaded files insertfiles = [] for file in updatepaper.outfiles: insertfiles.append( file.replace("document.location.replace(csplit);", "")) #Insert the modified links into the DB updatepaper.insert(paper, insertfiles, updatepaper.outtitles)
def update(): paper = "spiegel" feeds = ("http://www.spiegel.de/schlagzeilen/rss/0,5291,676,00.xml", ) #Get links and titles from parsing updatepaper = MainUpdate() updatepaper.parse(paper, feeds) if len(updatepaper.links) == 0: print("No new articles found.") return 0 #pintable actualurls = [] for link in updatepaper.links: splitlink = link.split(",") actualurl = splitlink[0] + "," + splitlink[1] + ",druck-" + splitlink[ 2] + "," + splitlink[3] actualurls.append(actualurl) #Download the links updatepaper.download(paper, actualurls, updatepaper.titles) if len(updatepaper.outfiles) == 0: return 0 #Strip some bad stuff out of the downloaded files insertfiles = [] for file in updatepaper.outfiles: insertfiles.append(file.replace('window.print()', "")) #Insert the modified links into the DB updatepaper.insert(paper, insertfiles, updatepaper.outtitles)
def update(): paper = "sydneymorningherald" feeds = ("http://feeds.smh.com.au/rssheadlines/top.xml", "http://feeds.smh.com.au/rssheadlines/national.xml", "http://feeds.smh.com.au/rssheadlines/world.xml") #Get links and titles from parsing updatepaper = MainUpdate() updatepaper.parse(paper, feeds) if len(updatepaper.links) == 0: print("No new articles found.") return 0 #printable beginurl = "http://www.smh.com.au/cgi-bin/common/popupPrintArticle.pl?path=/articles/" actualurls = [] for link in updatepaper.links: splitlink = link.split("/") actualurl = beginurl + splitlink[-4] + "/" + splitlink[ -3] + "/" + splitlink[-2] + "/" + splitlink[-1] actualurls.append(actualurl) #Download the links updatepaper.download(paper, actualurls, updatepaper.titles) #Insert the modified links into the DB updatepaper.insert(paper, updatepaper.outfiles, updatepaper.outtitles)
def update(): paper = "washingtonpost" feeds = ( "http://feeds.washingtonpost.com/wp-dyn/rss/politics/administration/index_xml", "http://feeds.washingtonpost.com/wp-dyn/rss/politics/congress/index_xml", "http://feeds.washingtonpost.com/wp-dyn/rss/politics/elections/index_xml", "http://feeds.washingtonpost.com/wp-dyn/rss/world/index_xml", "http://feeds.washingtonpost.com/wp-dyn/rss/business/economy/index_xml" ) #Get links and titles from parsing updatepaper = MainUpdate() updatepaper.parse(paper, feeds, "id") if len(updatepaper.links) == 0: print("No new articles found.") return 0 #printable actualurls = [] for link in updatepaper.links: splitlink = link.split(".") if splitlink[3:]: actualurl = splitlink[0] + "." + splitlink[1] + "." + splitlink[ 2] + "_pf." + splitlink[3] actualurls.append(actualurl) #Download modified links updatepaper.download(paper, actualurls, updatepaper.titles) #Insert the modified links into the DB updatepaper.insert(paper, updatepaper.outfiles, updatepaper.outtitles)
def update(): paper = "bbc" feeds = ( "http://newsrss.bbc.co.uk/rss/newsonline_world_edition/africa/rss.xml", "http://newsrss.bbc.co.uk/rss/newsonline_world_edition/americas/rss.xml", "http://newsrss.bbc.co.uk/rss/newsonline_world_edition/asia-pacific/rss.xml", "http://newsrss.bbc.co.uk/rss/newsonline_world_edition/europe/rss.xml", "http://newsrss.bbc.co.uk/rss/newsonline_world_edition/middle_east/rss.xml", "http://newsrss.bbc.co.uk/rss/newsonline_world_edition/south_asia/rss.xml" ) #Get links and titles from parsing updatepaper = MainUpdate() updatepaper.parse(paper, feeds) if len(updatepaper.links) == 0: print("No new articles found.") return 0 #Do bbc specific stuff to the links filter = ("sport", "default", "thread", "blogs", "picture_gallery", "pop_ups") beginurl = "http://newsvote.bbc.co.uk/mpapps/pagetools/print/news.bbc.co.uk" total = len(updatepaper.links) actualurls = [] actualtitles = [] for num in range(0, total): link = updatepaper.links[num] title = updatepaper.titles[num] append = "yes" for blacklist in filter: if blacklist in link: append = "no" break if append == "yes": splitlink = link.split("/") beginnum = splitlink.index("-") + 1 actualurl = beginurl for urlchunk in splitlink[beginnum:]: actualurl = actualurl + "/" + urlchunk actualurls.append(actualurl) actualtitles.append(title) #Check to see if after filter, there are any urls left if len(actualurls) == 0: print("No new articles found.") return 0 #Download the modified links updatepaper.download(paper, actualurls, actualtitles) #Insert the modified links into the DB updatepaper.insert(paper, updatepaper.outfiles, updatepaper.outtitles)
def update(): paper = "latimes" feeds = ( "http://feeds.latimes.com/latimes/news/nationworld/nation?format=xml", "http://feeds.latimes.com/latimes/news/nationworld/world?format=xml") #Get links and titles from parsing updatepaper = MainUpdate() updatepaper.parse(paper, feeds) if len(updatepaper.links) == 0: print("No new articles found.") return 0 #Scrap articles for printable link scrapeurls = [] beginurl = "http://www.latimes.com/news" for url in updatepaper.links: urlparts = url.split("/") scrapeurl = beginurl + "/" + urlparts[6] + "/" + urlparts[ 7] + "/" + urlparts[-1] scrapeurls.append(scrapeurl) updatepaper.scrape(paper, scrapeurls, updatepaper.titles) if len(updatepaper.scrapefiles) == 0: print("No new articles found.") return 0 #Get printable urls actualurls = [] actualtitles = [] beginurl = "http://www.latimes.com" total = len(updatepaper.scrapefiles) for num, file in enumerate(updatepaper.scrapefiles): for line in file: if '>Print<' in line: actualurls.append(beginurl + line.split('"')[1]) actualtitles.append(updatepaper.scrapetitles[num]) break #Download the scraped links updatepaper.download(paper, actualurls, actualtitles) #Insert the modified links into the DB updatepaper.insert(paper, updatepaper.outfiles, updatepaper.outtitles)
def update(): paper = "financialtimes" feeds = ("http://www.ft.com/rss/world", "http://www.ft.com/rss/companies", "http://www.ft.com/rss/home/uk", "http://www.ft.com/rss/home/us", "http://www.ft.com/rss/home/europe", "http://www.ft.com/rss/home/asia", "http://www.ft.com/rss/lex") #Get links and titles from parsing updatepaper = MainUpdate() updatepaper.parse(paper, feeds) if len(updatepaper.links) == 0: print("No new articles found.") return 0 #Modify links for printable version actualurls = [] for link in updatepaper.links: actualurl = link.replace("0/", "").replace(".html", ",print=yes.html") actualurls.append(actualurl) #Download the links updatepaper.download(paper, actualurls, updatepaper.titles) if len(updatepaper.outfiles) == 0: return 0 #Format outputted files insertfiles = [] for file in updatepaper.outfiles: readfile = file.split("\n") insertfile = "<b>Financial Times</b>" for line in readfile: if "ft-story-header" in line or "ft-story-body" in line or "charset" in line: insertfile = insertfile + "\n" + line insertfile = insertfile.replace('lang= "en">', '').replace( '}// ]]></script>', '').replace('<script type="text/javascript">', '') insertfiles.append(insertfile) #Insert the modified links into the DB updatepaper.insert(paper, insertfiles, updatepaper.outtitles)
def update(): paper = "dallasmorningnews" feeds = ( "http://www.dallasnews.com/newskiosk/rss/dallasnewsnationworld.xml", "http://www.dallasnews.com/newskiosk/rss/dallasnewsnationalpolitics.xml", "http://www.dallasnews.com/newskiosk/rss/dallasnewswash.xml", "http://www.dallasnews.com/newskiosk/rss/dallasnewsworld.xml") #Get links and titles from parsing updatepaper = MainUpdate() updatepaper.parse(paper, feeds) if len(updatepaper.links) == 0: print("No new articles found.") return 0 #Download the links updatepaper.download(paper, updatepaper.links, updatepaper.titles) if len(updatepaper.outfiles) == 0: return 0 #Format outputted files insertfiles = [] for file in updatepaper.outfiles: readfile = file.split("\n") insertfile = "<b>Dallas Morning News</b>" dowrite = 0 for line in readfile: if dowrite == 1: if "<!-- vstory end -->" in line: break else: insertfile = insertfile + "\n" + line elif "<!-- vstory begin -->" in line: dowrite = 1 insertfiles.append(insertfile) #Insert the modified links into the DB updatepaper.insert(paper, insertfiles, updatepaper.outtitles)
def update(): paper = "chicagotribune" feeds = ( "http://feeds.chicagotribune.com/chicagotribune/news/nationworld/", "http://feeds.chicagotribune.com/chicagotribune/news/nationworld/") #Get links and titles from parsing updatepaper = MainUpdate() updatepaper.parse(paper, feeds) if len(updatepaper.links) == 0: print("No new articles found.") return 0 #Scrap articles for printable link updatepaper.scrape(paper, updatepaper.links, updatepaper.titles) if len(updatepaper.scrapefiles) == 0: print("No new articles found.") return 0 #Get printable urls actualurls = [] actualtitles = [] beginurl = "http://www.chicagotribune.com" total = len(updatepaper.scrapefiles) for num, file in enumerate(updatepaper.scrapefiles): for line in file: if 'alt="Print"' in line: actualurls.append(beginurl + line.split('"')[5]) actualtitles.append(updatepaper.scrapetitles[num]) break #Download the scraped links updatepaper.download(paper, actualurls, actualtitles) #Insert the modified links into the DB updatepaper.insert(paper, updatepaper.outfiles, updatepaper.outtitles)
def update(): paper = "haaretz" feeds = ("http://www.haaretz.com/feed/enewsRss.xml", "http://www.haaretz.com/feed/edefenseRss.xml", "http://www.haaretz.com/feed/enationalRss.xml", "http://www.haaretz.com/feed/ejewishworldRss.xml") #Get links and titles from parsing updatepaper = MainUpdate() updatepaper.parse(paper, feeds) if len(updatepaper.links) == 0: print("No new articles found.") return 0 beginurl = "http://www.haaretz.com/hasen/objects/pages/PrintArticleEn.jhtml?itemNo=" actualurls = [] for link in updatepaper.links: actualurl = beginurl + link.split("/")[-1].replace(".html", "") actualurls.append(actualurl) #Download the links updatepaper.download(paper, actualurls, updatepaper.titles) if len(updatepaper.outfiles) == 0: return 0 #Strip some bad stuff out of the downloaded files insertfiles = [] for file in updatepaper.outfiles: insertfiles.append( file.replace('<body bgcolor="" onload="print();">', "").replace('charset="windows-1255"', '')) #Insert the modified links into the DB updatepaper.insert(paper, insertfiles, updatepaper.outtitles ) # 1 - nothing was downloaded, 2 - keyboard interrupt
def update(): paper = "seattlepi" feeds = ("http://seattlepi.nwsource.com/rss/apafrica.rss", "http://seattlepi.nwsource.com/rss/apaa.rss", "http://seattlepi.nwsource.com/rss/apasia.rss", "http://seattlepi.nwsource.com/rss/apelection.rss", "http://seattlepi.nwsource.com/rss/apmideast.rss", "http://seattlepi.nwsource.com/rss/apwash.rss") #Get links and titles from parsing updatepaper = MainUpdate() updatepaper.parse(paper, feeds) if len(updatepaper.links) == 0: print("No new articles found.") return 0 #printable version beginurl = "http://seattlepi.nwsource.com/printer2/index.asp?ploc=t&refer=" actualurls = [] for link in updatepaper.links: actualurl = beginurl + link actualurls.append(actualurl) #Download the links updatepaper.download(paper, actualurls, updatepaper.titles) if len(updatepaper.outfiles) == 0: return 0 #Strip some bad stuff out of the downloaded files insertfiles = [] for file in updatepaper.outfiles: insertfiles.append(file.replace('window.print()', "")) #Insert the modified links into the DB updatepaper.insert(paper, insertfiles, updatepaper.outtitles)
def update(): paper = "japantimes" feeds = ("http://feeds.feedburner.com/japantimes_news", ) #Get links and titles from parsing updatepaper = MainUpdate() updatepaper.parse(paper, feeds) if len(updatepaper.links) == 0: print("No new articles found.") return 0 #Change links to printable beginurl = "http://search.japantimes.co.jp/print/" actualurls = [] for link in updatepaper.links: actualurl = beginurl + link.split("/")[-1] actualurls.append(actualurl) #Download the links updatepaper.download(paper, actualurls, updatepaper.titles) #Insert the modified links into the DB updatepaper.insert(paper, updatepaper.outfiles, updatepaper.outtitles)
def update(): paper = "miamiherald" feeds = ("http://www.miamiherald.com/884/index.xml", "http://www.miamiherald.com/509/index.xml", "http://www.miamiherald.com/578/index.xml", "http://www.miamiherald.com/579/index.xml", "http://www.miamiherald.com/581/index.xml", "http://www.miamiherald.com/582/index.xml") #Get links and titles from parsing updatepaper = MainUpdate() updatepaper.parse(paper, feeds) if len(updatepaper.links) == 0: print("No new articles found.") return 0 #change to printable actualurls = [] for link in updatepaper.links: actualurl = link.replace("/story/", "/v-print/story/") actualurls.append(actualurl) #Download the links updatepaper.download(paper, actualurls, updatepaper.titles) #Weed out the non complete articles insertfiles = [] inserttitles = [] for num, file in enumerate(updatepaper.outfiles): if not "Click here for full story" in file: insertfiles.append(file) inserttitles.append(updatepaper.outtitles[num]) #Insert the modified links into the DB updatepaper.insert(paper, insertfiles, inserttitles)
def update(): paper = "londontimes" feeds = ("http://www.timesonline.co.uk/tol/feeds/rss/worldnews.xml", ) #Get links and titles from parsing updatepaper = MainUpdate() updatepaper.parse(paper, feeds) if len(updatepaper.links) == 0: print("No new articles found.") return 0 #Change links to printable dlext = "?print=yes" actualurls = [] for link in updatepaper.links: actualurl = link.split("#")[0] + dlext actualurls.append(actualurl) #Download the links updatepaper.download(paper, actualurls, updatepaper.titles) #Insert the modified links into the DB updatepaper.insert(paper, updatepaper.outfiles, updatepaper.outtitles)