def start(searchfor, savepath):
    """Search the Houston Chronicle for *searchfor*, collect article links
    from every result page, save them as a JSON links file under *savepath*,
    then hand off to getarticles_fromlinksjson.

    searchfor -- query string with words joined by '+' (e.g. "some+topic")
    savepath  -- root directory under which the per-query folder lives
    Returns whatever getarticles_fromlinksjson.main_meth returns.
    """
    starturl = ('http://www.chron.com/search/?action=search&firstRequest=1&query="'
                + searchfor + '"&x=0&y=0&searchindex=gsa&sort=date')
    print("\nIn Houston Chronicle searching for "+searchfor+" with save path: "+savepath)
    # Fetch the first results page and the site's reported page count.
    startsoup = get_page(starturl)
    numpages = get_numpages(startsoup)
    print("HC Found "+str(numpages))
    # Links from the first page.
    articles = get_links(startsoup, [])
    i = 2
    if numpages > 1:
        endreached = False
        while not endreached:
            cururl = ('http://www.chron.com/search/?action=search&searchindex=gsa&query="'
                      + searchfor + '"&sort=date&page=' + str(i))
            cursoup = get_page(cururl)
            priorarticles = len(articles)
            articles = get_links(cursoup, articles)
            if len(articles) == priorarticles:
                # No new links on this page -> we have walked past the end.
                endreached = True
            else:
                print("Currently in iteration "+str(i)+" with "+str(len(articles))+" calling get_newpages")
                # The reported page count can grow as we page through results.
                nownum = get_numpages(cursoup)
                if nownum > numpages:
                    numpages = nownum
                    i = i + 1
                elif i == numpages:
                    print("At Last Page: i="+str(i)+" and numpages="+str(numpages)+" and nownum="+str(nownum))
                    endreached = True
                else:
                    i = i + 1
    # BUG FIX: the folder is created with '+' replaced by '_', so the
    # existence check must test that same '_' name. The original checked
    # the '+' name, which never matches the created folder for multi-word
    # queries, making os.makedirs raise on every run after the first.
    folder = searchfor.replace("+", "_")
    if not os.path.exists(savepath + "/" + folder):
        os.makedirs(savepath + "/" + folder)
        # Save links json file in folder searchfor
        saveto = folder + "/" + "links-" + folder
    else:
        print("PATH: "+searchfor+" already exists so save file with date info")
        saveto = folder + "/" + "links-" + folder + "-" + time.strftime("%d-%m-%y")
    save_list(savepath + "/" + saveto, articles)
    print("Done. Links file saved to "+saveto)
    print("NOW get article information from links!")
    return getarticles_fromlinksjson.main_meth(searchfor, savepath)
def start(searchfor, savepath):
    """Search the Texas Observer for *searchfor* with a Selenium driver,
    collect article links from every result page, save them as a JSON links
    file under *savepath*, then hand off to getarticles_fromlinksjson.

    searchfor -- query string with words joined by '+'
    savepath  -- root directory under which the per-query folder lives
    Returns whatever getarticles_fromlinksjson.main_meth returns.
    """
    print("\nIn Texas Observer searching for "+searchfor+" with save path: "+savepath)
    url = 'http://www.texasobserver.org/search-results/?q="' + searchfor + '"'
    print("call url: "+url)
    # check for initial block up or wait element to close!
    success, driver = check_for_blocking_popup(url)
    print("POP to block: "+str(success))
    if success == 0:
        # No popup to dismiss -> use a plain headless browser instead.
        driver = webdriver.PhantomJS()
    links = []
    links = get_links(driver, url, links)
    print("After first page we have "+str(len(links))+" links")
    print("\n".join([f['url'] for f in links]))
    pagenum = 1
    endfound = False
    while not endfound:
        nextpage, driver = get_next_tab(driver, pagenum)
        if nextpage == pagenum:
            # Pager did not advance -> last page reached.
            endfound = True
        else:
            pagenum = nextpage
            # BUG FIX: pagenum is an int; the original concatenated it to a
            # str and raised TypeError on every pagination step.
            print("Get Links for page: "+str(pagenum))
            links = get_links(driver, "", links)
            print("After pagenum "+str(pagenum)+" we have "+str(len(links))+" links")
            print("\n".join([f['url'] for f in links]))
    print("OBSERVER Done with getting links")
    if success == 1:
        driver.quit()  # close chrome tab if opened
    # for saving purposes use _ instead of +
    searchfor = searchfor.replace("+", "_")
    print("CHECK IF FOLDER "+searchfor+" EXISTS IN OBSERVER FOLDER")
    # check if folder searchfor exists
    if not os.path.exists(savepath + "/" + searchfor):
        print("No, so make folder")
        os.makedirs(savepath + "/" + searchfor)
        # Save links json file in folder searchfor
        saveto = searchfor + "/" + "links-" + searchfor
    else:
        print("PATH: "+searchfor+" already exists so save file with date info")
        saveto = searchfor + "/" + "links-" + searchfor + "-" + time.strftime("%d-%m-%y")
    save_list(savepath + "/" + saveto, links)
    print("OBSERVER Done. Links file saved to "+savepath+"/"+saveto)
    print("NOW get article information from links!")
    return getarticles_fromlinksjson.main_meth(searchfor.replace("_", "+"), savepath)
def start(searchfor, savepath):
    """Search the Dallas Morning News for *searchfor* with a Selenium
    driver, collect article links from every result page, save them as a
    JSON links file under *savepath*, then hand off to
    getarticles_fromlinksjson.

    searchfor -- query string with words joined by '+'
    savepath  -- root directory under which the per-query folder lives
    Returns whatever getarticles_fromlinksjson.main_meth returns.
    """
    print("In Dallas Morning news searching for "+searchfor+" with save path: "+savepath)
    url = 'http://www.dallasnews.com/site-search/?q="' + searchfor + '"'
    print("call url: "+url)
    # check for initial block up or wait element to close!
    success, driver = check_for_long_open_time(url)
    print("POP to block: "+str(success))
    if success == 0:
        # Page opened normally -> use a plain headless browser instead.
        driver = webdriver.PhantomJS()
    links = []
    links = get_links(driver, url, links)
    pagenum = 1
    endfound = False
    while not endfound:
        nextpage, driver = get_next_tab(driver, pagenum)
        if nextpage == pagenum:
            # Pager did not advance -> last page reached.
            endfound = True
        else:
            pagenum = nextpage
            # BUG FIX: pagenum is an int; the original concatenated it to a
            # str and raised TypeError on every pagination step.
            print("Get Links for page: "+str(pagenum))
            links = get_links(driver, "", links)
    if success == 1:
        driver.quit()  # close chrome if opened
    # for saving purposes use _ instead of +
    searchfor = searchfor.replace("+", "_")
    # check if folder searchfor exists
    if not os.path.exists(savepath + "/" + searchfor):
        os.makedirs(savepath + "/" + searchfor)
        # Save links json file in folder searchfor
        saveto = searchfor + "/" + "links-" + searchfor
    else:
        print("PATH: "+searchfor+" already exists so save file with date info")
        saveto = searchfor + "/" + "links-" + searchfor + "-" + time.strftime("%d-%m-%y")
    save_list(savepath + "/" + saveto, links)
    print("Done. Links file saved to "+savepath+"/"+saveto)
    print("NOW get article information from links!")
    return getarticles_fromlinksjson.main_meth(searchfor.replace("_", "+"), savepath)
def start(searchfor, savepath):
    """Search the NYTimes site search for *searchfor* with a Selenium
    driver, collect article links from every result page, save them as a
    JSON links file under *savepath*, then hand off to
    getarticles_fromlinksjson.

    searchfor -- query string with words joined by '+'; a leading "Texas+"
                 is stripped and moved into the URL's Texas-scoped fragment
    savepath  -- root directory under which the per-query folder lives
    Returns whatever getarticles_fromlinksjson.main_meth returns.
    """
    print("\nIn NYTimes searching for "+searchfor+" with save path: "+savepath)
    # NOTE(review): the "contentCollection®ion" below looks like a mojibake
    # of "contentCollection&region" (an "&reg" turned into the (R) sign) —
    # preserved as-is since the search terms live after the '#' fragment;
    # confirm against a live request before changing.
    if "Texas+" in searchfor:
        # Scope the search to Texas via the URL fragment and drop the
        # now-redundant prefix (and any stray quotes) from the query.
        searchfor = searchfor.replace("Texas+", "").replace('"', '')
        url = ("http://query.nytimes.com/search/sitesearch/?action=click&contentCollection®ion=TopBar&WT.nav=searchWidget&module=SearchSubmit&pgtype=Homepage#/Texas%22"
               + searchfor + "%22/since1851/allresults/1/allauthors/newest/")
    else:
        url = ("http://query.nytimes.com/search/sitesearch/?action=click&contentCollection®ion=TopBar&WT.nav=searchWidget&module=SearchSubmit&pgtype=Homepage#/%22"
               + searchfor + "%22/since1851/allresults/1/allauthors/newest/")
    print("GET url: "+url)
    driver = webdriver.PhantomJS()
    links = []
    links = get_links(driver, url, links)
    pagenum = 1
    endfound = False
    while not endfound:
        nextpage, driver = get_next_tab(driver, pagenum)
        if nextpage == pagenum:
            # Pager did not advance -> last page reached.
            endfound = True
        else:
            pagenum = nextpage
            # BUG FIX: pagenum is an int; the original concatenated it to a
            # str and raised TypeError on every pagination step.
            print("Get Links for page: "+str(pagenum))
            links = get_links(driver, "", links)
    # for saving purposes use _ instead of +
    searchfor = searchfor.replace("+", "_")
    # check if folder searchfor exists
    if not os.path.exists(savepath + "/" + searchfor):
        os.makedirs(savepath + "/" + searchfor)
        # Save links json file in folder searchfor
        saveto = searchfor + "/" + "links-" + searchfor
    else:
        print("PATH: "+searchfor+" already exists so save file with date info")
        saveto = searchfor + "/" + "links-" + searchfor + "-" + time.strftime("%d-%m-%y")
    save_list(savepath + "/" + saveto, links)
    print("Done. Links file saved to "+savepath+"/"+saveto)
    print("NOW get article information from links!")
    return getarticles_fromlinksjson.main_meth(searchfor.replace("_", "+"), savepath)
def start(searchfor, savepath):
    """Search the Texas Tribune for *searchfor*, collect article links from
    every result page, save them as a JSON links file under *savepath*,
    then hand off to getarticles_fromlinksjson.

    searchfor -- query string with words joined by '+'
    savepath  -- root directory under which the per-query folder lives
    Returns whatever getarticles_fromlinksjson.main_meth returns.
    """
    starturl = 'http://www.texastribune.org/search/?page=1&q="' + searchfor + '"'
    print("\nIn Texas Tribune searching for "+searchfor+" with save path: "+savepath)
    # Fetch the first results page and the total page count.
    startsoup = get_page(starturl)
    numpages, startsoup = get_numpages(startsoup, starturl)
    print("Found "+str(numpages)+" pages of results")
    # Links from the first page, then walk the remaining pages.
    articles = get_links(startsoup, [])
    for i in range(2, int(numpages) + 1):
        cururl = 'http://www.texastribune.org/search/?page=' + str(i) + '&q="' + searchfor + '"'
        cursoup = get_page(cururl)
        articles = get_links(cursoup, articles)
    # BUG FIX: the folder is created with '+' replaced by '_', so the
    # existence check must test that same '_' name. The original checked
    # the '+' name, which never matches the created folder for multi-word
    # queries, making os.makedirs raise on every run after the first.
    folder = searchfor.replace("+", "_")
    if not os.path.exists(savepath + "/" + folder):
        os.makedirs(savepath + "/" + folder)
        # Save links json file in folder searchfor
        saveto = folder + "/" + "links-" + folder
    else:
        print("PATH: "+searchfor+" already exists so save file with date info")
        saveto = folder + "/" + "links-" + folder + "-" + time.strftime("%d-%m-%y")
    save_list(savepath + "/" + saveto, articles)
    print("Done. Links file saved to "+savepath+"/"+saveto)
    print("NOW get article information from links!")
    return getarticles_fromlinksjson.main_meth(searchfor, savepath)