import os
import time

from selenium import webdriver

import getarticles_fromlinksjson


# --- Houston Chronicle: collect search-result links, then scrape each article ---
def start(searchfor,savepath):
	starturl = 'http://www.chron.com/search/?action=search&firstRequest=1&query="'+searchfor+'"&x=0&y=0&searchindex=gsa&sort=date'
	print("\nIn Houston Chronicle searching for "+searchfor+" with save path: "+savepath)

	#get html page
	startsoup = get_page(starturl)

	numpages = get_numpages(startsoup)
	print("HC Found "+str(numpages))

	#get articles for first page
	articles = get_links(startsoup,[])


	i = 2
	if numpages > 1:
		endreached = False
		while not endreached:
			#url template
			cururl = 'http://www.chron.com/search/?action=search&searchindex=gsa&query="'+searchfor+'"&sort=date&page='+str(i)
			cursoup = get_page(cururl)
			priorarticles = len(articles)
			articles = get_links(cursoup,articles)
			if len(articles) == priorarticles:
				endreached = True
			else:
				print "Currently in iteration "+str(i)+" with "+str(len(articles))+" calling get_newpages"
				nownum = get_numpages(cursoup)
				if nownum > numpages:
					numpages = nownum
					i = i + 1
				else:
					if i == numpages:
						print "At Last Page: i="+str(i)+" and numpages="+str(numpages)+" and nownum="+str(nownum)
						endreached = True
					else:
						i = i + 1

	#check if folder searchfor exists (folder names use _ instead of +)
	folder = searchfor.replace("+","_")
	if not os.path.exists(savepath+"/"+folder):
		os.makedirs(savepath+"/"+folder)
		#Save links json file in folder searchfor
		saveto = folder+"/"+"links-"+folder
	else:
		print("PATH: "+folder+" already exists, so save file with date info")
		saveto = folder+"/"+"links-"+folder+"-"+time.strftime("%d-%m-%y")


	save_list(savepath+"/"+saveto,articles)
	print "Done.  Links file saved to "+saveto
	print "NOW get article information from links!"

	return getarticles_fromlinksjson.main_meth(searchfor,savepath)	
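

# ---------------------------------------------------------------------------
# NOTE: get_page, get_numpages, get_links, and save_list are helper functions
# defined elsewhere in this repo. As a rough, hypothetical sketch only,
# assuming get_page fetches a URL and returns a parsed BeautifulSoup tree
# (the headers and the parser choice here are illustrative guesses, not the
# repo's actual implementation):
import requests
from bs4 import BeautifulSoup

def get_page(url):
	#fetch the search page and parse it; returns None if the request fails
	resp = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
	if resp.status_code != 200:
		print("get_page: request failed with status "+str(resp.status_code))
		return None
	return BeautifulSoup(resp.text, "html.parser")
# ---------------------------------------------------------------------------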


# --- Texas Observer: Selenium-based scraper (the site needs JS and popup handling) ---
def start(searchfor,savepath):
	print("\nIn Texas Observer searching for "+searchfor+" with save path: "+savepath)
	url = 'http://www.texasobserver.org/search-results/?q="'+searchfor+'"'
	print("call url: "+url)


	#check for initial block up or wait element to close!
	success,driver = check_for_blocking_popup(url)

	print("POP to block: " +str(success))
	if success == 0:
		driver = webdriver.PhantomJS()

	links = []
	links = get_links(driver,url,links)
	print "After first page we have "+str(len(links))+" links"
	print "\n".join([f['url'] for f in links])

	pagenum = 1
	endfound = False

	while not endfound:
		nextpage,driver = get_next_tab(driver,pagenum)
		if nextpage == pagenum:
			endfound = True
		else:
			pagenum = nextpage
			print("Get Links for page: "+str(pagenum))
			links = get_links(driver,"",links)
			print("After pagenum "+str(pagenum)+" we have "+str(len(links))+" links")
			print("\n".join([f['url'] for f in links]))
			

	print "OBSERVER Done with getting links"
	if success == 1:
		driver.quit()  #close chrome tab if opened

	#for saving purposes use _ instead of +
	searchfor = searchfor.replace("+","_")

	print "CHECK IF FOLDER "+searchfor+" EXISTS IN OBSERVER FOLDER"
	#check if folder searchfor exists
	if not os.path.exists(savepath+"/"+searchfor):
		print "No, so make folder"
		os.makedirs(savepath+"/"+searchfor)
		#Save links json file in folder searchfor
		saveto = searchfor +"/" + "links-"+searchfor
	else:
		print "PATH: "+searchfor+" already exists so save file with date info"
		saveto = searchfor +"/" + "links-"+searchfor+ "-" + time.strftime("%d-%m-%y")

	save_list(savepath+"/"+saveto,links)
	print "OBSERVER Done.  Links file saved to "+savepath+"/"+saveto
	print "NOW get article information from links!"

	return getarticles_fromlinksjson.main_meth(searchfor.replace("_","+"),savepath)	
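

# ---------------------------------------------------------------------------
# save_list is defined elsewhere in this repo. Since getarticles_fromlinksjson
# reads these files back as JSON, a minimal sketch could look like this
# (the ".json" suffix and the indent are assumptions):
import json

def save_list(saveto, links):
	#serialize the collected link dicts to a JSON file at path saveto
	with open(saveto+".json", "w") as f:
		json.dump(links, f, indent=2)
# ---------------------------------------------------------------------------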


# --- Dallas Morning News: Selenium-based scraper (the site can be slow to load) ---
def start(searchfor,savepath):
	print("In Dallas Morning news searching for "+searchfor+" with save path: "+savepath)
	url = 'http://www.dallasnews.com/site-search/?q="'+searchfor+'"'
	print("call url: "+url)

	#check for initial block up or wait element to close!
	success,driver = check_for_long_open_time(url)

	print("POP to block: " +str(success))
	if success == 0:
		driver = webdriver.PhantomJS()

	links = []
	links = get_links(driver,url,links)

	pagenum = 1
	endfound = False

	while not endfound:
		nextpage,driver = get_next_tab(driver,pagenum)
		if nextpage == pagenum:
			endfound = True
		else:
			pagenum = nextpage
			print "Get Links for page: "+pagenum
			links = get_links(driver,"",links)
			

	if success == 1:
		driver.quit() #close chrome if opened

	#for saving purposes use _ instead of +
	searchfor = searchfor.replace("+","_")

	#check if folder searchfor exists
	if not os.path.exists(savepath+"/"+searchfor):
		os.makedirs(savepath+"/"+searchfor)
		#Save links json file in folder searchfor
		saveto = searchfor +"/" + "links-"+searchfor
	else:
		print "PATH: "+searchfor+" already exists so save file with date info"
		saveto = searchfor +"/" + "links-"+searchfor+ "-" + time.strftime("%d-%m-%y")


	save_list(savepath+"/"+saveto,links)
	print "Done.  Links file saved to "+savepath+"/"+saveto
	print "NOW get article information from links!"

	return getarticles_fromlinksjson.main_meth(searchfor.replace("_","+"),savepath)	
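

# ---------------------------------------------------------------------------
# check_for_blocking_popup / check_for_long_open_time are defined elsewhere in
# this repo; both return (success, driver), where success == 1 means a live
# driver was handed back (and must be quit by the caller) and success == 0
# means the caller starts its own PhantomJS driver. A hypothetical sketch,
# assuming the popup has a clickable close button (the CSS selector and the
# 10-second timeout are guesses):
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

def check_for_blocking_popup(url):
	#open the page headlessly and try to dismiss a blocking popup
	driver = webdriver.PhantomJS()
	driver.get(url)
	try:
		close_btn = WebDriverWait(driver, 10).until(
			EC.element_to_be_clickable((By.CSS_SELECTOR, ".popup-close")))  #assumed selector
		close_btn.click()
		return 1, driver
	except TimeoutException:
		#no popup appeared within the wait; let the caller start fresh
		driver.quit()
		return 0, None
# ---------------------------------------------------------------------------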


# --- New York Times: Selenium-based scraper of the sitesearch results pages ---
def start(searchfor,savepath):
	print("\nIn NYTimes searching for "+searchfor+" with save path: "+savepath)
	if "Texas+" in searchfor:
		searchfor = searchfor.replace("Texas+","").replace('"','')
		url = "http://query.nytimes.com/search/sitesearch/?action=click&contentCollection&region=TopBar&WT.nav=searchWidget&module=SearchSubmit&pgtype=Homepage#/Texas%22"+searchfor+"%22/since1851/allresults/1/allauthors/newest/"
	else:
		url = "http://query.nytimes.com/search/sitesearch/?action=click&contentCollection&region=TopBar&WT.nav=searchWidget&module=SearchSubmit&pgtype=Homepage#/%22"+searchfor+"%22/since1851/allresults/1/allauthors/newest/"
	
	print("GET url: "+url)
	driver = webdriver.PhantomJS()
	links = []
	links = get_links(driver,url,links)

	pagenum = 1
	endfound = False

	while not endfound:
		nextpage,driver = get_next_tab(driver,pagenum)
		if nextpage == pagenum:
			endfound = True
		else:
			pagenum = nextpage
			print "Get Links for page: "+pagenum
			links = get_links(driver,"",links)
			

	#for saving purposes use _ instead of +
	searchfor = searchfor.replace("+","_")

	#check if folder searchfor exists
	if not os.path.exists(savepath+"/"+searchfor):
		os.makedirs(savepath+"/"+searchfor)
		#Save links json file in folder searchfor
		saveto = searchfor +"/" + "links-"+searchfor
	else:
		print "PATH: "+searchfor+" already exists so save file with date info"
		saveto = searchfor +"/" + "links-"+searchfor+ "-" + time.strftime("%d-%m-%y")


	save_list(savepath+"/"+saveto,links)
	print "Done.  Links file saved to "+savepath+"/"+saveto
	print "NOW get article information from links!"

	return getarticles_fromlinksjson.main_meth(searchfor.replace("_","+"),savepath)	
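

# ---------------------------------------------------------------------------
# get_next_tab is defined elsewhere in this repo; it returns (nextpage, driver)
# and signals the last page by returning the current pagenum unchanged. A
# hypothetical sketch, assuming a clickable "Next" pagination link (the link
# text and the fixed sleep are guesses):
import time
from selenium.common.exceptions import NoSuchElementException

def get_next_tab(driver, pagenum):
	#click the next-page control; return the old pagenum if there is none
	try:
		driver.find_element_by_link_text("Next").click()  #assumed link text
		time.sleep(2)  #crude wait for the next results page to render
		return pagenum + 1, driver
	except NoSuchElementException:
		return pagenum, driver
# ---------------------------------------------------------------------------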


# --- Texas Tribune: requests/BeautifulSoup scraper with page-count pagination ---
def start(searchfor,savepath):
	starturl = 'http://www.texastribune.org/search/?page=1&q="'+searchfor+'"'
	print("\nIn Texas Tribune searching for "+searchfor+" with save path: "+savepath)

	#get html page
	startsoup = get_page(starturl)

	#get number of pages total
	numpages, startsoup = get_numpages(startsoup,starturl)
	print("Found "+str(numpages)+" pages of results")

	#get articles for first page
	articles = get_links(startsoup,[])


	i = 2
	while i <= int(numpages):
		#url template
		cururl = 'http://www.texastribune.org/search/?page='+str(i)+'&q="'+searchfor+'"'
		cursoup = get_page(cururl)
		articles = get_links(cursoup,articles)
		i = i + 1

	#check if folder searchfor exists (folder names use _ instead of +)
	folder = searchfor.replace("+","_")
	if not os.path.exists(savepath+"/"+folder):
		os.makedirs(savepath+"/"+folder)
		#Save links json file in folder searchfor
		saveto = folder+"/"+"links-"+folder
	else:
		print("PATH: "+folder+" already exists, so save file with date info")
		saveto = folder+"/"+"links-"+folder+"-"+time.strftime("%d-%m-%y")


	save_list(savepath+"/"+saveto,articles)
	print "Done.  Links file saved to "+savepath+"/"+saveto
	print "NOW get article information from links!"

	return getarticles_fromlinksjson.main_meth(searchfor,savepath)
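

# ---------------------------------------------------------------------------
# The BeautifulSoup-based get_links(soup, articles) used by the Chronicle and
# Tribune scrapers is defined elsewhere in this repo. A hypothetical sketch,
# assuming result links sit in <a> tags under a "story" class; the selector is
# a guess, but the {'url': ...} dict shape matches how the links are consumed
# above (e.g. f['url']):
def get_links(soup, articles):
	#append one link dict per search result, skipping duplicates
	for a in soup.select(".story a[href]"):  #assumed selector
		url = a.get("href")
		if url and not any(f['url'] == url for f in articles):
			articles.append({'url': url, 'title': a.get_text(strip=True)})
	return articles
# ---------------------------------------------------------------------------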