def update():
    """Refresh the "chicagotribune" paper from its nation/world feed.

    Parses the feed, scrapes the linked articles, picks the printable URL
    out of each scraped page, downloads those URLs and hands the results to
    ``MainUpdate.insert``.

    Returns 0 when the feed or the scrape yields nothing; otherwise falls
    off the end and returns None.
    """
    paper = "chicagotribune"
    feeds = ("http://feeds.chicagotribune.com/chicagotribune/news/nationworld/",)
    site_root = "http://www.chicagotribune.com"

    updater = MainUpdate()

    # Links and titles come from the feed.
    updater.parse(paper, feeds)
    if not updater.links:
        print("No new articles found.")
        return 0

    # Scrape the articles; the printable link is looked up in the results.
    updater.scrape(paper, updater.links, updater.titles)
    if not updater.scrapefiles:
        print("No new articles found.")
        return 0

    # Only the first line containing the print-tool marker counts.  Pages
    # without such a line are dropped together with their title.
    print_urls = []
    print_titles = []
    for index, page in enumerate(updater.scrapefiles):
        hit = next((text for text in page if 'articletools-print' in text), None)
        if hit is None:
            continue
        # The 4th double-quote-delimited piece is used as a site-relative
        # path (presumably the href value -- verify against live markup).
        print_urls.append(site_root + hit.split('"')[3])
        print_titles.append(updater.scrapetitles[index])

    updater.download(paper, print_urls, print_titles)
    updater.insert(paper, updater.outfiles, updater.outtitles)
def update():
    """Refresh the "politico" paper from its congress and politics feeds.

    Parses the feeds, scrapes the linked articles, extracts the printable
    URL from each scraped page, downloads those URLs and passes the output
    to ``MainUpdate.insert``.

    Returns 0 when parsing or scraping yields nothing, otherwise None.
    """
    paper = "politico"
    feeds = (
        "http://www.politico.com/rss/congress.xml",
        "http://www.politico.com/rss/politics.xml",
    )

    updater = MainUpdate()

    updater.parse(paper, feeds)
    if not updater.links:
        print("No new articles found.")
        return 0

    updater.scrape(paper, updater.links, updater.titles)
    if not updater.scrapefiles:
        print("No new articles found.")
        return 0

    # For every scraped page, take the first line with the share/print
    # marker; the URL found there is used as-is (no host is prepended).
    print_urls = []
    print_titles = []
    for index, page in enumerate(updater.scrapefiles):
        hit = next((text for text in page if 'shr-print' in text), None)
        if hit is None:
            continue
        print_urls.append(hit.split('"')[3])
        print_titles.append(updater.scrapetitles[index])

    updater.download(paper, print_urls, print_titles)
    updater.insert(paper, updater.outfiles, updater.outtitles)
def update():
    """Refresh the "washingtonpost" paper from its politics, world and economy feeds.

    Each feed link is rewritten to its ``_pf`` variant before download.
    A link with fewer than four dot-separated pieces cannot be rewritten and
    is skipped *together with its title*, so the URL and title lists passed
    to ``MainUpdate.download`` stay index-aligned.

    Returns 0 when there is nothing to download, otherwise None.
    """
    paper = "washingtonpost"
    feeds = (
        "http://feeds.washingtonpost.com/wp-dyn/rss/politics/administration/index_xml",
        "http://feeds.washingtonpost.com/wp-dyn/rss/politics/congress/index_xml",
        "http://feeds.washingtonpost.com/wp-dyn/rss/politics/elections/index_xml",
        "http://feeds.washingtonpost.com/wp-dyn/rss/world/index_xml",
        "http://feeds.washingtonpost.com/wp-dyn/rss/business/economy/index_xml",
    )

    # Get links and titles from parsing ("id" is forwarded to parse() unchanged).
    updatepaper = MainUpdate()
    updatepaper.parse(paper, feeds, "id")
    if not updatepaper.links:
        print("No new articles found.")
        return 0

    # Build the printable URLs, keeping each title paired with its URL.
    actualurls = []
    actualtitles = []
    for link, title in zip(updatepaper.links, updatepaper.titles):
        splitlink = link.split(".")
        if len(splitlink) < 4:
            # Not in the expected a.b.c.ext shape; drop link and title alike.
            continue
        # Insert "_pf" before the fourth piece; anything after that piece is
        # discarded, exactly as before.
        actualurls.append(".".join(splitlink[:3]) + "_pf." + splitlink[3])
        actualtitles.append(title)

    # Every link may have been filtered out above.
    if not actualurls:
        print("No new articles found.")
        return 0

    # Download modified links
    updatepaper.download(paper, actualurls, actualtitles)

    # Insert the modified links into the DB
    updatepaper.insert(paper, updatepaper.outfiles, updatepaper.outtitles)
def update():
    """Refresh the "jerusalempost" paper from its four feeds.

    Feed links are switched to their printable form by replacing
    ``ShowFull`` with ``Printer``, then downloaded and passed to
    ``MainUpdate.insert``.

    Returns 0 when the feeds yield no links, otherwise None.
    """
    paper = "jerusalempost"
    feeds = (
        "http://www.jpost.com/servlet/Satellite?pagename=JPost/Page/RSS&cid=1178443463156",
        "http://www.jpost.com/servlet/Satellite?pagename=JPost/Page/RSS&cid=1123495333498",
        "http://www.jpost.com/servlet/Satellite?pagename=JPost/Page/RSS&cid=1178443463144",
        "http://www.jpost.com/servlet/Satellite?pagename=JPost/Page/RSS&cid=1123495333468",
    )

    updater = MainUpdate()
    updater.parse(paper, feeds)
    if not updater.links:
        print("No new articles found.")
        return 0

    # Printable variant of every feed link, in feed order.
    print_urls = [url.replace("ShowFull", "Printer") for url in updater.links]

    updater.download(paper, print_urls, updater.titles)
    updater.insert(paper, updater.outfiles, updater.outtitles)
def update():
    """Refresh the "foxnews" paper from its world and politics feeds.

    The last path segment of each feed link (with ``2933`` swapped for
    ``3566``) is appended to the printer-friendly base URL.  Downloaded
    pages that contain a not-found message are discarded, along with their
    titles, before the rest is passed to ``MainUpdate.insert``.

    Returns 0 when the feeds yield no links, otherwise None.
    """
    paper = "foxnews"
    feeds = (
        "http://feeds.foxnews.com/foxnews/world?format=xml",
        "http://feeds.foxnews.com/foxnews/politics?format=xml",
    )
    printer_base = "http://www.foxnews.com/printer_friendly_story/"

    updater = MainUpdate()
    updater.parse(paper, feeds)
    if not updater.links:
        print("No new articles found.")
        return 0

    print_urls = [
        printer_base + url.split('/')[-1].replace("2933", "3566")
        for url in updater.links
    ]
    updater.download(paper, print_urls, updater.titles)

    # Positions of the downloads that do not look like an error page.
    good = [
        pos
        for pos, page in enumerate(updater.outfiles)
        if "404 Not Found" not in page and "Page cannot be found" not in page
    ]
    kept_pages = [updater.outfiles[pos] for pos in good]
    kept_titles = [updater.outtitles[pos] for pos in good]

    updater.insert(paper, kept_pages, kept_titles)
def update():
    """Refresh the "bostonglobe" paper from its nation, politics and business feeds.

    Any query string on a feed link is replaced with ``?mode=PF``.  The
    downloaded pages have the ``document.location.replace(csplit);``
    statement removed before being passed to ``MainUpdate.insert``.

    Returns 0 when the feeds yield no links or the download yields no
    files, otherwise None.
    """
    paper = "bostonglobe"
    feeds = (
        "http://syndication.boston.com/news/nation?mode=rss_10",
        "http://syndication.boston.com/news/politics/?mode=rss_10",
        "http://syndication.boston.com/business/?mode=rss_10",
    )
    print_suffix = "?mode=PF"

    updater = MainUpdate()
    updater.parse(paper, feeds)
    if not updater.links:
        print("No new articles found.")
        return 0

    # Keep everything before the first "?" and request the printable mode.
    print_urls = [url.split('?')[0] + print_suffix for url in updater.links]
    updater.download(paper, print_urls, updater.titles)
    if not updater.outfiles:
        return 0

    # Drop the redirect statement from every downloaded page.
    cleaned = [
        page.replace("document.location.replace(csplit);", "")
        for page in updater.outfiles
    ]
    updater.insert(paper, cleaned, updater.outtitles)
def update():
    """Refresh the "allafrica" paper from its latest-headlines feed.

    The last path segment of every feed link is appended to the printable
    story base URL; the resulting pages are downloaded and passed to
    ``MainUpdate.insert``.

    Returns 0 when the feed yields no links, otherwise None.
    """
    paper = "allafrica"
    feeds = ("http://allafrica.com/tools/headlines/rdf/latest/headlines.rdf", )
    printable_base = "http://allafrica.com/stories/printable/"

    updater = MainUpdate()
    updater.parse(paper, feeds)
    if not updater.links:
        print("No new articles found.")
        return 0

    # Printable URL = fixed base + final "/"-separated piece of the link.
    print_urls = [printable_base + url.split("/")[-1] for url in updater.links]

    updater.download(paper, print_urls, updater.titles)
    updater.insert(paper, updater.outfiles, updater.outtitles)
def update():
    """Refresh the "washingtonpost" paper from its politics, world and economy feeds.

    Feed links are turned into their ``_pf`` form before being downloaded.
    Links that do not split into at least four dot-separated pieces are
    skipped, and the matching title is skipped with them, so
    ``MainUpdate.download`` always receives URL and title lists of equal
    length and order.

    Returns 0 when nothing is left to download, otherwise None.
    """
    paper = "washingtonpost"
    feeds = (
        "http://feeds.washingtonpost.com/wp-dyn/rss/politics/administration/index_xml",
        "http://feeds.washingtonpost.com/wp-dyn/rss/politics/congress/index_xml",
        "http://feeds.washingtonpost.com/wp-dyn/rss/politics/elections/index_xml",
        "http://feeds.washingtonpost.com/wp-dyn/rss/world/index_xml",
        "http://feeds.washingtonpost.com/wp-dyn/rss/business/economy/index_xml",
    )

    # Get links and titles from parsing
    updatepaper = MainUpdate()
    updatepaper.parse(paper, feeds, "id")
    if not updatepaper.links:
        print("No new articles found.")
        return 0

    # printable: pair each rewritten URL with the title it came with
    actualurls = []
    actualtitles = []
    for link, title in zip(updatepaper.links, updatepaper.titles):
        splitlink = link.split(".")
        if not splitlink[3:]:
            # Too few pieces to place "_pf"; skip the link and its title.
            continue
        # Pieces beyond the fourth are intentionally left out, as before.
        actualurls.append(".".join(splitlink[:3]) + "_pf." + splitlink[3])
        actualtitles.append(title)

    # The filter above can leave nothing behind.
    if not actualurls:
        print("No new articles found.")
        return 0

    # Download modified links
    updatepaper.download(paper, actualurls, actualtitles)

    # Insert the modified links into the DB
    updatepaper.insert(paper, updatepaper.outfiles, updatepaper.outtitles)
def update():
    """Refresh the "msnbc" paper from its two feeds.

    The print-display suffix is appended to every feed link; the resulting
    pages are downloaded and passed to ``MainUpdate.insert``.

    Returns 0 when the feeds yield no links, otherwise None.
    """
    paper = "msnbc"
    feeds = (
        "http://rss.msnbc.msn.com/id/3032552/device/rss/rss.xml",
        "http://rss.msnbc.msn.com/id/3032506/device/rss/rss.xml",
    )
    print_suffix = "/print/1/displaymode/1098"

    updater = MainUpdate()
    updater.parse(paper, feeds)
    if not updater.links:
        print("No new articles found.")
        return 0

    # Same link, printable display mode.
    print_urls = [url + print_suffix for url in updater.links]

    updater.download(paper, print_urls, updater.titles)
    updater.insert(paper, updater.outfiles, updater.outtitles)
def update():
    """Refresh the "sydneymorningherald" paper from its top, national and world feeds.

    The last four path segments of each feed link are appended to the
    print-popup script URL; the resulting pages are downloaded and passed
    to ``MainUpdate.insert``.

    Returns 0 when the feeds yield no links, otherwise None.
    """
    paper = "sydneymorningherald"
    feeds = (
        "http://feeds.smh.com.au/rssheadlines/top.xml",
        "http://feeds.smh.com.au/rssheadlines/national.xml",
        "http://feeds.smh.com.au/rssheadlines/world.xml",
    )
    popup_base = "http://www.smh.com.au/cgi-bin/common/popupPrintArticle.pl?path=/articles/"

    updater = MainUpdate()
    updater.parse(paper, feeds)
    if not updater.links:
        print("No new articles found.")
        return 0

    print_urls = []
    for url in updater.links:
        parts = url.split("/")
        # Explicit indexing (not a slice) so a too-short link still raises.
        tail = "/".join((parts[-4], parts[-3], parts[-2], parts[-1]))
        print_urls.append(popup_base + tail)

    updater.download(paper, print_urls, updater.titles)
    updater.insert(paper, updater.outfiles, updater.outtitles)
def update():
    """Refresh the "straitstimes" paper from its world, money and SEA feeds.

    The last four path segments of each feed link are re-rooted under the
    site's ``/print`` path; the resulting pages are downloaded and passed
    to ``MainUpdate.insert``.

    Returns 0 when the feeds yield no links, otherwise None.
    """
    paper = "straitstimes"
    feeds = (
        "http://www.straitstimes.com/STI/STIFILES/rss/break_world.xml",
        "http://www.straitstimes.com/STI/STIFILES/rss/break_money.xml",
        "http://www.straitstimes.com/STI/STIFILES/rss/break_sea.xml",
    )
    print_root = "http://www.straitstimes.com/print"

    updater = MainUpdate()
    updater.parse(paper, feeds)
    if not updater.links:
        print("No new articles found.")
        return 0

    # Straits Times specific rewrite of the links.
    print_urls = []
    for url in updater.links:
        parts = url.split("/")
        # Explicit indexing so a link with too few segments still raises.
        print_urls.append(
            "/".join((print_root, parts[-4], parts[-3], parts[-2], parts[-1]))
        )

    updater.download(paper, print_urls, updater.titles)
    updater.insert(paper, updater.outfiles, updater.outtitles)
def update():
    """Refresh the "bostonglobe" paper from its nation, politics and business feeds.

    Feed links have their query string swapped for ``?mode=PF``; the
    downloaded pages are stripped of the
    ``document.location.replace(csplit);`` statement and then passed to
    ``MainUpdate.insert``.

    Returns 0 when no links are parsed or no files are downloaded,
    otherwise None.
    """
    paper = "bostonglobe"
    feeds = (
        "http://syndication.boston.com/news/nation?mode=rss_10",
        "http://syndication.boston.com/news/politics/?mode=rss_10",
        "http://syndication.boston.com/business/?mode=rss_10",
    )

    updater = MainUpdate()
    updater.parse(paper, feeds)
    if not updater.links:
        print("No new articles found.")
        return 0

    # Printable mode: part before the first "?" plus the PF query.
    print_urls = []
    for url in updater.links:
        bare_url = url.split('?')[0]
        print_urls.append(bare_url + "?mode=PF")

    updater.download(paper, print_urls, updater.titles)
    if not updater.outfiles:
        return 0

    # Remove the redirect call from the downloaded pages.
    unwanted = "document.location.replace(csplit);"
    cleaned = [page.replace(unwanted, "") for page in updater.outfiles]

    updater.insert(paper, cleaned, updater.outtitles)
def update():
    """Refresh the "jerusalempost" paper from its four feeds.

    Every feed link has ``ShowFull`` replaced by ``Printer`` to obtain the
    printable page, which is downloaded and passed to
    ``MainUpdate.insert``.

    Returns 0 when the feeds yield no links, otherwise None.
    """
    paper = "jerusalempost"
    feed_base = "http://www.jpost.com/servlet/Satellite?pagename=JPost/Page/RSS&cid="
    feeds = (
        feed_base + "1178443463156",
        feed_base + "1123495333498",
        feed_base + "1178443463144",
        feed_base + "1123495333468",
    )

    updater = MainUpdate()
    updater.parse(paper, feeds)
    if not updater.links:
        print("No new articles found.")
        return 0

    # Change links to printable
    print_urls = []
    for url in updater.links:
        print_urls.append(url.replace("ShowFull", "Printer"))

    updater.download(paper, print_urls, updater.titles)
    updater.insert(paper, updater.outfiles, updater.outtitles)
def update():
    """Refresh the "foreignpolicy" paper from its three feeds.

    Parses the feeds, scrapes the linked articles, takes the printable URL
    from the first ``>PRINT`` line of each scraped page and downloads it.
    Lines containing ``window.print()`` are removed from the downloaded
    pages before they are passed to ``MainUpdate.insert``.

    Returns 0 when parsing or scraping yields nothing, otherwise None.
    """
    paper = "foreignpolicy"
    feeds = (
        "http://www.foreignpolicy.com/issue/featured_content.php",
        "http://www.foreignpolicy.com/node/feed",
        "http://www.foreignpolicy.com/taxonomy/term/655/0/feed",
    )

    updater = MainUpdate()
    updater.parse(paper, feeds)
    if not updater.links:
        print("No new articles found.")
        return 0

    updater.scrape(paper, updater.links, updater.titles)
    if not updater.scrapefiles:
        print("No new articles found.")
        return 0

    # First ">PRINT" line per page wins; pages without one are dropped
    # together with their title.
    print_urls = []
    print_titles = []
    for index, page in enumerate(updater.scrapefiles):
        hit = next((text for text in page if '>PRINT' in text), None)
        if hit is None:
            continue
        print_urls.append(hit.split('"')[1])
        print_titles.append(updater.scrapetitles[index])

    updater.download(paper, print_urls, print_titles)

    # Rebuild each page without its auto-print lines.  Every kept line is
    # prefixed with a newline, so the result starts with "\n" as before.
    cleaned = []
    for page in updater.outfiles:
        cleaned.append(
            "".join(
                "\n" + text
                for text in page.split("\n")
                if "window.print()" not in text
            )
        )

    updater.insert(paper, cleaned, updater.outtitles)
def update():
    """Refresh the "spiegel" paper from its headline feed.

    Feed links are split on commas and ``druck-`` is inserted in front of
    the third piece to obtain the printable URL.  ``window.print()`` is
    removed from the downloaded pages before they are passed to
    ``MainUpdate.insert``.

    Returns 0 when no links are parsed or no files are downloaded,
    otherwise None.
    """
    paper = "spiegel"
    feeds = ("http://www.spiegel.de/schlagzeilen/rss/0,5291,676,00.xml",)

    updater = MainUpdate()
    updater.parse(paper, feeds)
    if not updater.links:
        print("No new articles found.")
        return 0

    # Printable links: only the first four comma-separated pieces are used.
    print_urls = []
    for url in updater.links:
        parts = url.split(",")
        print_urls.append(
            ",".join((parts[0], parts[1], "druck-" + parts[2], parts[3]))
        )

    updater.download(paper, print_urls, updater.titles)
    if not updater.outfiles:
        return 0

    # Strip the auto-print call from the downloaded pages.
    cleaned = [page.replace('window.print()', "") for page in updater.outfiles]

    updater.insert(paper, cleaned, updater.outtitles)
def update():
    """Refresh the "allafrica" paper from its latest-headlines feed.

    Printable URLs are formed from the printable story base plus the final
    path segment of each feed link; the pages are downloaded and passed to
    ``MainUpdate.insert``.

    Returns 0 when the feed yields no links, otherwise None.
    """
    paper = "allafrica"
    feeds = ("http://allafrica.com/tools/headlines/rdf/latest/headlines.rdf",)

    updater = MainUpdate()
    updater.parse(paper, feeds)
    if not updater.links:
        print("No new articles found.")
        return 0

    # allAfrica specific rewrite of the links.
    print_urls = []
    for url in updater.links:
        story_id = url.split("/")[-1]
        print_urls.append("http://allafrica.com/stories/printable/" + story_id)

    updater.download(paper, print_urls, updater.titles)
    updater.insert(paper, updater.outfiles, updater.outtitles)
def update():
    """Refresh the "sfchronicle" paper from its news and business feeds.

    Only feed links containing ``DTL`` are kept.  Each kept link has the
    feed host removed and is wrapped in the article CGI URL with
    ``&type=printable`` appended.  The pages are downloaded and passed to
    ``MainUpdate.insert``.

    Returns 0 when parsing yields no links or no link survives the filter,
    otherwise None.
    """
    paper = "sfchronicle"
    feeds = (
        "http://feeds.sfgate.com/sfgate/rss/feeds/news",
        "http://feeds.sfgate.com/sfgate/rss/feeds/business",
    )
    cgi_base = "http://www.sfgate.com/cgi-bin/article.cgi?f="

    updater = MainUpdate()
    updater.parse(paper, feeds, "id")
    if not updater.links:
        print("No new articles found.")
        return 0

    # Build URL and title lists side by side so they stay aligned.
    print_urls = []
    print_titles = []
    for index, url in enumerate(updater.links):
        if "DTL" not in url:
            continue
        path = url.replace("http://feeds.sfgate.com", "")
        print_urls.append(cgi_base + path + "&type=printable")
        print_titles.append(updater.titles[index])

    if len(print_urls) == 0:
        print("No new articles found.")
        return 0

    updater.download(paper, print_urls, print_titles)
    updater.insert(paper, updater.outfiles, updater.outtitles)
def update():
    """Refresh the "omahaworldherald" paper from its news and money feeds.

    The last three path segments of each feed link are placed behind the
    article ``AID=`` URL and ``&template=printart`` is appended.
    ``window.print()`` is removed from the downloaded pages before they are
    passed to ``MainUpdate.insert``.

    Returns 0 when no links are parsed or no files are downloaded,
    otherwise None.
    """
    paper = "omahaworldherald"
    feeds = (
        "http://www.omaha.com/apps/pbcs.dll/section?category=rss&c=news03&mime=xml",
        "http://www.omaha.com/apps/pbcs.dll/section?category=rss&c=money01&mime=xml",
    )
    article_base = "http://www.omaha.com/apps/pbcs.dll/article?AID="

    updater = MainUpdate()
    updater.parse(paper, feeds)
    if not updater.links:
        print("No new articles found.")
        return 0

    # Printable links.
    print_urls = []
    for url in updater.links:
        parts = url.split("/")
        article_id = "/".join((article_base, parts[-3], parts[-2], parts[-1]))
        print_urls.append(article_id + "&template=printart")

    updater.download(paper, print_urls, updater.titles)
    if not updater.outfiles:
        return 0

    # Strip the auto-print call from the downloaded pages.
    cleaned = [page.replace('window.print()', "") for page in updater.outfiles]

    updater.insert(paper, cleaned, updater.outtitles)
def update():
    """Refresh the "economist" paper from its eight section feeds.

    Feed links are made printable by replacing ``displaystory`` with
    ``PrinterFriendly``; the pages are downloaded and passed to
    ``MainUpdate.insert``.

    Returns 0 when the feeds yield no links, otherwise None.
    """
    paper = "economist"
    feeds = (
        "http://www.economist.com/rss/briefings_rss.xml",
        "http://www.economist.com/rss/europe_rss.xml",
        "http://www.economist.com/rss/united_states_rss.xml",
        "http://www.economist.com/rss/the_americas_rss.xml",
        "http://www.economist.com/rss/middle_east_and_africa_rss.xml",
        "http://www.economist.com/rss/asia_rss.xml",
        "http://www.economist.com/rss/international_rss.xml",
        "http://www.economist.com/rss/finance_and_economics_rss.xml",
    )

    updater = MainUpdate()
    updater.parse(paper, feeds)
    if not updater.links:
        print("No new articles found.")
        return 0

    # Printable variant of every link.
    print_urls = [
        url.replace("displaystory", "PrinterFriendly") for url in updater.links
    ]

    updater.download(paper, print_urls, updater.titles)
    updater.insert(paper, updater.outfiles, updater.outtitles)
def update():
    """Refresh the "latimes" paper from its nation and world feeds.

    Feed links are first rewritten to article URLs under
    ``http://www.latimes.com/news`` (using the 7th, 8th and last
    "/"-separated pieces of the link) and scraped.  The printable URL is
    then taken from the first ``articletools-print`` line of each scraped
    page, downloaded, and the results passed to ``MainUpdate.insert``.

    Returns 0 when parsing or scraping yields nothing, otherwise None.
    """
    paper = "latimes"
    feeds = (
        "http://feeds.latimes.com/latimes/news/nationworld/nation?format=xml",
        "http://feeds.latimes.com/latimes/news/nationworld/world?format=xml",
    )
    news_root = "http://www.latimes.com/news"
    site_root = "http://www.latimes.com"

    updater = MainUpdate()
    updater.parse(paper, feeds)
    if not updater.links:
        print("No new articles found.")
        return 0

    # Translate feed links into the article URLs that get scraped.
    scrape_urls = []
    for url in updater.links:
        parts = url.split("/")
        scrape_urls.append("/".join((news_root, parts[6], parts[7], parts[-1])))

    updater.scrape(paper, scrape_urls, updater.titles)
    if not updater.scrapefiles:
        print("No new articles found.")
        return 0

    # First print-tool line per page; pages without one are dropped
    # together with their title.
    print_urls = []
    print_titles = []
    for index, page in enumerate(updater.scrapefiles):
        hit = next((text for text in page if "articletools-print" in text), None)
        if hit is None:
            continue
        print_urls.append(site_root + hit.split('"')[3])
        print_titles.append(updater.scrapetitles[index])

    updater.download(paper, print_urls, print_titles)
    updater.insert(paper, updater.outfiles, updater.outtitles)
def update():
    """Refresh the "bbc" paper from its six regional world-edition feeds.

    Links containing any of the blocked words are ignored.  For the rest,
    everything after the ``-`` path segment is appended to the print-tool
    base URL.  A link that has no ``-`` segment is skipped instead of
    aborting the whole run with a ValueError.  URLs and titles are built in
    step so they stay aligned.

    Returns 0 when parsing yields no links or no link survives filtering,
    otherwise None.
    """
    paper = "bbc"
    feeds = (
        "http://newsrss.bbc.co.uk/rss/newsonline_world_edition/africa/rss.xml",
        "http://newsrss.bbc.co.uk/rss/newsonline_world_edition/americas/rss.xml",
        "http://newsrss.bbc.co.uk/rss/newsonline_world_edition/asia-pacific/rss.xml",
        "http://newsrss.bbc.co.uk/rss/newsonline_world_edition/europe/rss.xml",
        "http://newsrss.bbc.co.uk/rss/newsonline_world_edition/middle_east/rss.xml",
        "http://newsrss.bbc.co.uk/rss/newsonline_world_edition/south_asia/rss.xml",
    )

    # Get links and titles from parsing
    updatepaper = MainUpdate()
    updatepaper.parse(paper, feeds)
    if not updatepaper.links:
        print("No new articles found.")
        return 0

    # Do bbc specific stuff to the links.
    # (Named "blocked_words" so the builtin filter() is not shadowed.)
    blocked_words = ("sport", "default", "thread", "blogs", "picture_gallery", "pop_ups")
    beginurl = "http://newsvote.bbc.co.uk/mpapps/pagetools/print/news.bbc.co.uk"
    actualurls = []
    actualtitles = []
    for link, title in zip(updatepaper.links, updatepaper.titles):
        if any(word in link for word in blocked_words):
            continue
        splitlink = link.split("/")
        try:
            beginnum = splitlink.index("-") + 1
        except ValueError:
            # No "-" segment: the print URL cannot be derived from this
            # link, so leave it (and its title) out.
            continue
        actualurls.append("/".join([beginurl] + splitlink[beginnum:]))
        actualtitles.append(title)

    # Check to see if after filter, there are any urls left
    if not actualurls:
        print("No new articles found.")
        return 0

    # Download the modified links
    updatepaper.download(paper, actualurls, actualtitles)

    # Insert the modified links into the DB
    updatepaper.insert(paper, updatepaper.outfiles, updatepaper.outtitles)
def update():
    """Refresh the "londontimes" paper from its two feeds.

    Parses the feeds, scrapes the linked articles and takes the printable
    path from the first ``print-comment`` line of each scraped page.  Pages
    without such a line are dropped together with their title, and the
    filtered title list (not the full feed title list) is what gets passed
    to ``MainUpdate.download`` so that URLs and titles stay aligned.

    Returns 0 when parsing or scraping yields nothing, otherwise None.
    """
    paper = "londontimes"
    feeds = (
        "http://feeds.timesonline.co.uk/c/32313/f/440158/index.rss",
        "http://feeds.timesonline.co.uk/c/32313/f/440154/index.rss",
    )

    # Get links and titles from parsing
    updatepaper = MainUpdate()
    updatepaper.parse(paper, feeds)
    if not updatepaper.links:
        print("No new articles found.")
        return 0

    # Scrape articles for printable link
    updatepaper.scrape(paper, updatepaper.links, updatepaper.titles)
    if not updatepaper.scrapefiles:
        print("No new articles found.")
        return 0

    # Change links to printable
    beginurl = "http://timesonline.co.uk"
    dlext = "?print=yes"
    actualurls = []
    actualtitles = []
    for num, page in enumerate(updatepaper.scrapefiles):
        for line in page:
            if "print-comment" in line:
                # The path sits between the first pair of single quotes.
                actualurls.append(beginurl + line.split("'")[1] + dlext)
                actualtitles.append(updatepaper.scrapetitles[num])
                break

    # Download the links, paired with the titles collected alongside them
    updatepaper.download(paper, actualurls, actualtitles)

    # Insert the modified links into the DB
    updatepaper.insert(paper, updatepaper.outfiles, updatepaper.outtitles)
def update():
    """Refresh the "dallasmorningnews" paper from its four feeds.

    Feed links are downloaded unchanged.  From each downloaded page only
    the lines between the ``<!-- vstory begin -->`` and
    ``<!-- vstory end -->`` marker lines are kept, under a bold paper-name
    heading, and the result is passed to ``MainUpdate.insert``.

    Returns 0 when no links are parsed or no files are downloaded,
    otherwise None.
    """
    paper = "dallasmorningnews"
    feeds = (
        "http://www.dallasnews.com/newskiosk/rss/dallasnewsnationworld.xml",
        "http://www.dallasnews.com/newskiosk/rss/dallasnewsnationalpolitics.xml",
        "http://www.dallasnews.com/newskiosk/rss/dallasnewswash.xml",
        "http://www.dallasnews.com/newskiosk/rss/dallasnewsworld.xml",
    )

    updater = MainUpdate()
    updater.parse(paper, feeds)
    if not updater.links:
        print("No new articles found.")
        return 0

    updater.download(paper, updater.links, updater.titles)
    if not updater.outfiles:
        return 0

    stories = []
    for page in updater.outfiles:
        remaining = iter(page.split("\n"))

        # Consume lines up to and including the begin marker line.
        for text in remaining:
            if "<!-- vstory begin -->" in text:
                break

        # Collect what follows until the end marker line (exclusive).
        kept = ["<b>Dallas Morning News</b>"]
        for text in remaining:
            if "<!-- vstory end -->" in text:
                break
            kept.append(text)

        stories.append("\n".join(kept))

    updater.insert(paper, stories, updater.outtitles)
def update():
    """Refresh the "foreignaffairs" paper from its feed.

    Parses the feed, scrapes the linked articles, takes the printable path
    from the first ``print_html`` line of each scraped page, downloads the
    resulting URLs and passes the output to ``MainUpdate.insert``.

    Returns 0 when parsing or scraping yields nothing, otherwise None.
    """
    paper = "foreignaffairs"
    feeds = ("http://www.foreignaffairs.com/rss.xml",)
    site_root = "http://www.foreignaffairs.com"

    updater = MainUpdate()
    updater.parse(paper, feeds)
    if not updater.links:
        print("No new articles found.")
        return 0

    updater.scrape(paper, updater.links, updater.titles)
    if not updater.scrapefiles:
        print("No new articles found.")
        return 0

    # First matching line per page; pages without one are dropped together
    # with their title.
    print_urls = []
    print_titles = []
    for index, page in enumerate(updater.scrapefiles):
        hit = next((text for text in page if 'print_html' in text), None)
        if hit is None:
            continue
        print_urls.append(site_root + hit.split('"')[3])
        print_titles.append(updater.scrapetitles[index])

    updater.download(paper, print_urls, print_titles)
    updater.insert(paper, updater.outfiles, updater.outtitles)
def update():
    """Refresh the "financialtimes" paper from its seven feeds.

    Feed links get every ``0/`` removed and ``.html`` turned into
    ``,print=yes.html``.  From each downloaded page only the lines
    mentioning ``ft-story-header``, ``ft-story-body`` or ``charset`` are
    kept, under a bold paper-name heading; three leftover markup fragments
    are then removed and the result is passed to ``MainUpdate.insert``.

    Returns 0 when no links are parsed or no files are downloaded,
    otherwise None.
    """
    paper = "financialtimes"
    feeds = (
        "http://www.ft.com/rss/world",
        "http://www.ft.com/rss/companies",
        "http://www.ft.com/rss/home/uk",
        "http://www.ft.com/rss/home/us",
        "http://www.ft.com/rss/home/europe",
        "http://www.ft.com/rss/home/asia",
        "http://www.ft.com/rss/lex",
    )

    updater = MainUpdate()
    updater.parse(paper, feeds)
    if not updater.links:
        print("No new articles found.")
        return 0

    # Printable variant of every link.
    print_urls = [
        url.replace("0/", "").replace(".html", ",print=yes.html")
        for url in updater.links
    ]

    updater.download(paper, print_urls, updater.titles)
    if not updater.outfiles:
        return 0

    # Fragments removed from the assembled page, in this order.
    leftovers = ('lang= "en">', '}// ]]></script>', '<script type="text/javascript">')

    stories = []
    for page in updater.outfiles:
        wanted = [
            text
            for text in page.split("\n")
            if "ft-story-header" in text or "ft-story-body" in text or "charset" in text
        ]
        story = "\n".join(["<b>Financial Times</b>"] + wanted)
        # NOTE(review): the cleanup is applied once per page, after all
        # lines are collected -- confirm against the original layout.
        for fragment in leftovers:
            story = story.replace(fragment, '')
        stories.append(story)

    updater.insert(paper, stories, updater.outtitles)
def update():
    """Refresh the "seattlepi" paper from its six feeds.

    Each feed link is passed as the ``refer`` value of the printer script
    URL.  ``window.print()`` is removed from the downloaded pages before
    they are passed to ``MainUpdate.insert``.

    Returns 0 when no links are parsed or no files are downloaded,
    otherwise None.
    """
    paper = "seattlepi"
    feeds = (
        "http://seattlepi.nwsource.com/rss/apafrica.rss",
        "http://seattlepi.nwsource.com/rss/apaa.rss",
        "http://seattlepi.nwsource.com/rss/apasia.rss",
        "http://seattlepi.nwsource.com/rss/apelection.rss",
        "http://seattlepi.nwsource.com/rss/apmideast.rss",
        "http://seattlepi.nwsource.com/rss/apwash.rss",
    )
    printer_base = "http://seattlepi.nwsource.com/printer2/index.asp?ploc=t&refer="

    updater = MainUpdate()
    updater.parse(paper, feeds)
    if not updater.links:
        print("No new articles found.")
        return 0

    # Printable version: the original link is appended verbatim.
    print_urls = [printer_base + url for url in updater.links]

    updater.download(paper, print_urls, updater.titles)
    if not updater.outfiles:
        return 0

    # Strip the auto-print call from the downloaded pages.
    cleaned = [page.replace('window.print()', "") for page in updater.outfiles]

    updater.insert(paper, cleaned, updater.outtitles)
def update():
    """Refresh the "dallasmorningnews" paper from its four feeds.

    The feed links are downloaded as they are.  Each downloaded page is cut
    down to the lines found after the ``<!-- vstory begin -->`` line and
    before the ``<!-- vstory end -->`` line, prefixed with a bold
    paper-name heading, and then passed to ``MainUpdate.insert``.

    Returns 0 when no links are parsed or no files are downloaded,
    otherwise None.
    """
    paper = "dallasmorningnews"
    feeds = (
        "http://www.dallasnews.com/newskiosk/rss/dallasnewsnationworld.xml",
        "http://www.dallasnews.com/newskiosk/rss/dallasnewsnationalpolitics.xml",
        "http://www.dallasnews.com/newskiosk/rss/dallasnewswash.xml",
        "http://www.dallasnews.com/newskiosk/rss/dallasnewsworld.xml",
    )
    begin_marker = "<!-- vstory begin -->"
    end_marker = "<!-- vstory end -->"

    updater = MainUpdate()
    updater.parse(paper, feeds)
    if not updater.links:
        print("No new articles found.")
        return 0

    updater.download(paper, updater.links, updater.titles)
    if not updater.outfiles:
        return 0

    stories = []
    for page in updater.outfiles:
        story = "<b>Dallas Morning News</b>"
        line_iter = iter(page.split("\n"))

        # Skip ahead past the begin marker line (the marker line itself is
        # not part of the story).
        for text in line_iter:
            if begin_marker in text:
                break

        # Append lines until the end marker line shows up.
        for text in line_iter:
            if end_marker in text:
                break
            story = story + "\n" + text

        stories.append(story)

    updater.insert(paper, stories, updater.outtitles)
def update():
    """Refresh the "csm" paper from its top, USA and world feeds.

    Everything from the first ``.html`` in a feed link onward is replaced
    with ``.htm?print=true``.  Lines containing ``window.print()`` are
    removed from the downloaded pages before they are passed to
    ``MainUpdate.insert``.

    Returns 0 when no links are parsed or no files are downloaded,
    otherwise None.
    """
    paper = "csm"
    feeds = (
        "http://www.csmonitor.com/rss/top.rss",
        "http://rss.csmonitor.com/feeds/usa",
        "http://rss.csmonitor.com/feeds/world",
    )
    print_suffix = ".htm?print=true"

    updater = MainUpdate()
    updater.parse(paper, feeds)
    if not updater.links:
        print("No new articles found.")
        return 0

    print_urls = [url.split(".html")[0] + print_suffix for url in updater.links]

    updater.download(paper, print_urls, updater.titles)
    if not updater.outfiles:
        return 0

    # Rebuild each page without its auto-print lines.  Every kept line is
    # prefixed with a newline, so the result starts with "\n" as before.
    cleaned = []
    for page in updater.outfiles:
        kept = [text for text in page.split("\n") if "window.print()" not in text]
        cleaned.append("".join("\n" + text for text in kept))

    updater.insert(paper, cleaned, updater.outtitles)
def update():
    """Refresh the "feer" paper from its politics, IR and economics feeds.

    Feed links are downloaded unchanged.  Each downloaded page is reduced
    to the lines after the ``<!-- Some content EG article -->`` line and
    before the first following line containing
    ``<div class="content_box">``, under a bold paper-name heading, and the
    result is passed to ``MainUpdate.insert``.

    Returns 0 when no links are parsed or no files are downloaded,
    otherwise None.
    """
    paper = "feer"
    feeds = (
        "http://www.feer.com/rss?cat=politics",
        "http://www.feer.com/rss?cat=international-relations",
        "http://www.feer.com/rss?cat=economics",
    )

    updater = MainUpdate()
    updater.parse(paper, feeds)
    if not updater.links:
        print("No new articles found.")
        return 0

    updater.download(paper, updater.links, updater.titles)
    if not updater.outfiles:
        return 0

    stories = []
    for page in updater.outfiles:
        remaining = iter(page.split("\n"))

        # Discard everything up to and including the article-start marker.
        for text in remaining:
            if "<!-- Some content EG article -->" in text:
                break

        # Keep lines until the content box that follows the article.
        kept = ["<b>Far Eastern Economic Review</b>"]
        for text in remaining:
            if '<div class="content_box">' in text:
                break
            kept.append(text)

        stories.append("\n".join(kept))

    updater.insert(paper, stories, updater.outtitles)
def update():
    """Refresh the "miamiherald" paper from its six feeds.

    Feed links are made printable by replacing ``/story/`` with
    ``/v-print/story/``.  Downloaded pages containing
    ``Click here for full story`` are discarded with their titles; the rest
    is passed to ``MainUpdate.insert``.

    Returns 0 when the feeds yield no links, otherwise None.
    """
    paper = "miamiherald"
    feeds = (
        "http://www.miamiherald.com/884/index.xml",
        "http://www.miamiherald.com/509/index.xml",
        "http://www.miamiherald.com/578/index.xml",
        "http://www.miamiherald.com/579/index.xml",
        "http://www.miamiherald.com/581/index.xml",
        "http://www.miamiherald.com/582/index.xml",
    )

    updater = MainUpdate()
    updater.parse(paper, feeds)
    if not updater.links:
        print("No new articles found.")
        return 0

    # Change to printable.
    print_urls = [
        url.replace("/story/", "/v-print/story/") for url in updater.links
    ]
    updater.download(paper, print_urls, updater.titles)

    # Weed out the incomplete articles, keeping pages and titles aligned.
    complete = [
        pos
        for pos, page in enumerate(updater.outfiles)
        if "Click here for full story" not in page
    ]
    kept_pages = [updater.outfiles[pos] for pos in complete]
    kept_titles = [updater.outtitles[pos] for pos in complete]

    updater.insert(paper, kept_pages, kept_titles)
def update():
    """Refresh the "londontimes" paper from its world-news feed.

    Any ``#`` fragment is cut from each feed link and ``?print=yes`` is
    appended; the pages are downloaded and passed to
    ``MainUpdate.insert``.

    Returns 0 when the feed yields no links, otherwise None.
    """
    paper = "londontimes"
    feeds = ("http://www.timesonline.co.uk/tol/feeds/rss/worldnews.xml",)
    print_suffix = "?print=yes"

    updater = MainUpdate()
    updater.parse(paper, feeds)
    if not updater.links:
        print("No new articles found.")
        return 0

    # Change links to printable.
    print_urls = [url.split("#")[0] + print_suffix for url in updater.links]

    updater.download(paper, print_urls, updater.titles)
    updater.insert(paper, updater.outfiles, updater.outtitles)
def update():
    """Refresh the "japantimes" paper from its news feed.

    The final path segment of each feed link is appended to the print base
    URL; the pages are downloaded and passed to ``MainUpdate.insert``.

    Returns 0 when the feed yields no links, otherwise None.
    """
    paper = "japantimes"
    feeds = ("http://feeds.feedburner.com/japantimes_news",)
    print_base = "http://search.japantimes.co.jp/print/"

    updater = MainUpdate()
    updater.parse(paper, feeds)
    if not updater.links:
        print("No new articles found.")
        return 0

    # Change links to printable.
    print_urls = [print_base + url.split("/")[-1] for url in updater.links]

    updater.download(paper, print_urls, updater.titles)
    updater.insert(paper, updater.outfiles, updater.outtitles)
def update():
    """Refresh the "ajc" paper from its two feeds.

    Any query string on a feed link is replaced with ``?printArticle=y``;
    the pages are downloaded and passed to ``MainUpdate.insert``.

    Returns 0 when the feeds yield no links, otherwise None.
    """
    paper = "ajc"
    feeds = (
        "http://www.ajc.com/section-rss.do?source=nation-world",
        "http://www.ajc.com/genericList-rss.do?source=94547",
    )

    updater = MainUpdate()
    updater.parse(paper, feeds)
    if not updater.links:
        print("No new articles found.")
        return 0

    # AJC specific rewrite: part before the first "?" plus the print query.
    print_urls = []
    for url in updater.links:
        bare_url = url.split("?")[0]
        print_urls.append(bare_url + "?printArticle=y")

    updater.download(paper, print_urls, updater.titles)
    updater.insert(paper, updater.outfiles, updater.outtitles)