def loadsunsearchedkeywords():
    # Walk every keyword ID not yet searched on Bing, look up its text,
    # and run a Bing search for it.
    res = DBkeywords.getunsearchedID('bing')
    for ke in res:
        keyword_row = DBkeywords.findkeyword(ke)
        url = 'https://www.bing.com/search?q=' + keyword_row[0].replace(" ", "+")
        get_search(url, keyword_row[0], 'DE')
        time.sleep(50)  # throttle between queries
def get_search(url, keyword, ac):
    # Scrape a Search.com results page and store each result plus the
    # fetched landing page in the DB.
    browser = get_browser(binary=firefox_binary)
    browser.get(url)
    counter = 0
    page_source = browser.page_source
    soup = BeautifulSoup(page_source, "lxml")
    for s in soup('script'):  # strip <script> tags before scraping
        s.extract()
    ID = DBkeywords.findkeywordID(keyword)
    resultofitems = soup.find("div", {"class": "l-web-results"}).find_all(
        "div", {"class": "web-result"})
    for item in resultofitems:
        try:
            print str(counter)
            title = 'cannot load'      # initialised so the except blocks
            abstract = 'cannot load'   # never hit an unbound name
            further_url = 'cannot load url'
            # abstract = element.find("span", {"class": "st"}).get_text()
            title = item.find("h3", {"class": "web-result-title"}).get_text()
            further_url = item.find("p", {"class": "web-result-url"}).get_text()
            abstract = item.find("p", {"class": "web-result-description"}).get_text()
            if DBkeywords.checktrustful(further_url):
                print "trust url:" + str(further_url)
                continue
            print "working on:" + further_url
            # browser.set_page_load_timeout(180)
            browser.get(getextraweb.checkhttp(further_url))
            # try:
            #     WebDriverWait(browser, 20).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
            #     print "Page is ready!"
            # except TimeoutException:
            #     print "Loading took too much time!"
            body = browser.page_source
            test_DB.search(ID[0], title, further_url, abstract, body,
                           currentdate.getdate(), ac)
            counter = counter + 1
        except AttributeError:
            print "AttributeError"
            test_DB.suspect('search', -1, ID[0], further_url, 3,
                            currentdate.getdate(), 'AttributeError', title, abstract)
            continue
        except Exception, e:
            print "caught exception: site: " + further_url + " keyword: " + keyword
            test_DB.suspect('search', -1, ID[0], further_url, 3,
                            currentdate.getdate(), str(e), title, abstract)
            print str(e)
            continue
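# --- Sketch: assumed project-local helpers (illustrative, not repo code) ----
# getextraweb.checkhttp and currentdate.getdate are called by every scraper in
# this excerpt but defined elsewhere. Minimal stand-ins consistent with how
# they are used might look like this:
import datetime


def checkhttp(url):
    # Result pages often list URLs without a scheme; browser.get() needs one.
    url = url.strip()
    if not (url.startswith("http://") or url.startswith("https://")):
        return "http://" + url
    return url


def getdate():
    # Timestamp string used when storing rows in the DB.
    return datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")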
from selenium.webdriver.common.by import By  # needed for the locator fix below


def get_search(url, keyword, ac):
    # Scrape a Baidu results page and store each result plus the fetched
    # landing page in the DB.
    browser = get_browser(binary=firefox_binary)
    browser.get(url)
    time.sleep(15)
    print "get_search url done"
    page_source = browser.page_source
    soup = BeautifulSoup(page_source, "lxml")
    for s in soup('script'):  # strip <script> tags before scraping
        s.extract()
    ID = DBkeywords.findkeywordID(keyword)
    resultofitems = soup.find_all("div", {"class": "c-container"})
    print "get_search baidu: " + url
    # while resultofitems == None:
    #     print "searching page cannot load"
    #     WebDriverWait(browser, 30)
    #     browser = get_browser(binary=firefox_binary)
    #     browser.get(url)
    #     resultofitems = soup.find_all("div", {"class": "c-container"})
    for item in resultofitems:
        try:
            time.sleep(15)
            title = 'cannot load'
            abstract = 'cannot load'
            currenturl = 'cannot load url'
            further_url = item.a["href"]
            title = item.a.get_text()
            print title
            abstract = item.find_all("div")[0].get_text()
            print abstract
            # thread.start_new_thread(getextraweb.baidufurthersearch, (ID[0], title, abstract, further_url, ac))
            browser.set_page_load_timeout(180)
            browser.get(further_url)
            try:
                # EC.presence_of_element_located expects a (By, selector)
                # locator tuple, not a WebElement as in the original code.
                WebDriverWait(browser, 20).until(
                    EC.presence_of_element_located((By.TAG_NAME, "body")))
                print "Page is ready!"
            except TimeoutException:
                print "Loading took too much time!"
                time.sleep(30)
            currenturl = browser.current_url
            print currenturl
            body = browser.page_source
            test_DB.baidu(ID[0], title, currenturl, abstract, body,
                          currentdate.getdate(), ac)
        # except AttributeError:
        #     print "AttributeError"
        #     test_DB.suspect('baidu', -1, ID[0], currenturl, 3, currentdate.getdate(), 'AttributeError')
        #     continue
        except TimeoutException, e2:
            print "Timeout, retrying..."
            test_DB.suspect('baidu', -1, ID[0], currenturl, 3,
                            currentdate.getdate(), str(e2), title, abstract)
            time.sleep(30)
            continue
        except Exception, e:
            print "caught exception: site: " + currenturl + " keyword: " + keyword
            test_DB.suspect('baidu', -1, ID[0], currenturl, 3,
                            currentdate.getdate(), str(e), title, abstract)
            print str(e)
            continue
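# --- Sketch: correct Selenium wait patterns (illustrative, not repo code) ---
# Two patterns the scrapers above reach for. First, waiting on an element:
# EC.presence_of_element_located takes a (By, selector) locator tuple, which
# is why the fixed Baidu call passes (By.TAG_NAME, "body"). Second, a bare
# WebDriverWait(browser, 30) expression (as in the AOL scraper below) returns
# immediately; .until() with a condition is required for it to actually block.
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


def wait_for_body(browser, timeout=20):
    # Block until a <body> element is attached to the DOM; raises
    # TimeoutException after `timeout` seconds otherwise.
    WebDriverWait(browser, timeout).until(
        EC.presence_of_element_located((By.TAG_NAME, "body")))


def wait_for_load(browser, timeout=30):
    # Block until the document reports it has finished loading.
    WebDriverWait(browser, timeout).until(
        lambda d: d.execute_script("return document.readyState") == "complete")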
def get_search(url, keyword, ac):
    # Scrape an AOL results page and store each result plus the fetched
    # landing page in the DB.
    browser = get_browser(binary=firefox_binary)
    browser.get(url)
    # A bare WebDriverWait(browser, 30) is a no-op; give it a condition.
    WebDriverWait(browser, 30).until(
        lambda d: d.execute_script("return document.readyState") == "complete")
    page_source = browser.page_source
    soup = BeautifulSoup(page_source, "lxml")
    for s in soup('script'):  # strip <script> tags before scraping
        s.extract()
    ID = DBkeywords.findkeywordID(keyword)
    resultofitems = soup.find("ul", {"content": "ALGO"}).find_all("li")
    title = 'cannot load'
    abstract = 'cannot load'
    for item in resultofitems:
        try:
            # abstract = element.find("span", {"class": "st"}).get_text()
            print "start"
            further_url = 'cannot load url'
            title = item.a.get_text()
            further_url = item.a["href"]
            print further_url
            abstract = item.find("p", {"property": "f:desc"}).get_text()
            if DBkeywords.checktrustful(further_url):
                print "trust url:" + str(further_url)
                continue
            print "working on:" + further_url
            WebDriverWait(browser, 30).until(
                lambda d: d.execute_script("return document.readyState") == "complete")
            # browser.set_page_load_timeout(180)
            browser.get(getextraweb.checkhttp(further_url))
            # try:
            #     WebDriverWait(browser, 20).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
            #     print "Page is ready!"
            # except TimeoutException:
            #     print "Loading took too much time!"
            currenturl = browser.current_url
            # soupfurther = BeautifulSoup(browser.page_source, "lxml")
            # [s.extract() for s in soupfurther('script')]
            # body = cleanup(soupfurther.get_text())
            body = browser.page_source
            test_DB.aol(ID[0], title, further_url, abstract, body,
                        currentdate.getdate(), ac)
        except AttributeError:
            print "AttributeError"
            test_DB.suspect('aol', -1, ID[0], further_url, 3,
                            currentdate.getdate(), 'AttributeError', title, abstract)
            continue
        except Exception, e:
            print "caught exception: site: " + further_url + " keyword: " + keyword
            test_DB.suspect('aol', -1, ID[0], further_url, 3,
                            currentdate.getdate(), str(e), title, abstract)
            print str(e)
            continue
def get_search(url, keyword, ac):
    # Scrape an Ask.com results page and store each result plus the fetched
    # landing page in the DB.
    browser = get_browser(binary=firefox_binary)
    browser.get(url)
    page_source = browser.page_source
    soup = BeautifulSoup(page_source, "lxml")
    for s in soup('script'):  # strip <script> tags before scraping
        s.extract()
    ID = DBkeywords.findkeywordID(keyword)
    resultofitems = soup.find_all("div", {"class": "PartialSearchResults-item"})
    for item in resultofitems:
        try:
            time.sleep(10)
            further_url = 'cannot load url'
            title = 'cannot load url'
            abstract = 'cannot load url'
            title = item.find("div", {"class": "PartialSearchResults-item-title"}).get_text()
            print title
            further_url = item.find("p", {"class": "PartialSearchResults-item-url"}).get_text()
            print further_url
            abstract = item.find("p", {"class": "PartialSearchResults-item-abstract"}).get_text()
            print abstract
            if DBkeywords.checktrustful(further_url):
                print "trust url:" + str(further_url)
                continue
            print "working on:" + further_url
            # browser.set_page_load_timeout(180)
            browser.get(getextraweb.checkhttp(further_url))
            # soupfurther = BeautifulSoup(browser.page_source, "lxml")
            # [s.extract() for s in soupfurther('script')]
            # body = cleanup(soupfurther.get_text())
            # try:
            #     WebDriverWait(browser, 20).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
            #     print "Page is ready!"
            # except TimeoutException:
            #     print "Loading took too much time!"
            time.sleep(30)
            body = browser.page_source
            test_DB.ask(ID[0], title, further_url, abstract, body,
                        currentdate.getdate(), ac)
        # except AttributeError:
        #     print "AttributeError"
        #     test_DB.suspect('ask', -1, ID[0], further_url, 3, currentdate.getdate(), 'AttributeError', title, abstract)
        #     continue
        except Exception, e:
            print "caught exception: site: " + further_url + " keyword: " + keyword
            test_DB.suspect('ask', -1, ID[0], further_url, 3,
                            currentdate.getdate(), str(e), title, abstract)
            print str(e)
            continue
def loadallkeywords(ac):
    res = DBkeywords.getkeyword()
    for ke in res:
        try:
            url = 'https://www.bing.com/search?q=' + ke[0].replace(" ", "+")
            get_search(url, ke[0], ac)
        except Exception, e:
            print str(e)
            continue
def get_search(url, keyword, ac):
    # Scrape a Bing results page and store each result plus the fetched
    # landing page in the DB.
    browser = get_browser(binary=firefox_binary)
    browser.get(url)
    page_source = browser.page_source
    soup = BeautifulSoup(page_source, "lxml")
    for s in soup('script'):  # strip <script> tags before scraping
        s.extract()
    letters = soup.find_all("li", {"class": "b_algo"})
    counter = 0
    ID = DBkeywords.findkeywordID(keyword)
    for element in letters:
        try:
            print str(counter)
            title = 'cannot load'      # initialised so the except block
            abstract = 'cannot load'   # never hits an unbound name
            further_url = 'cannot load url'
            ctime = time.time()  # currently unused
            title = element.a.get_text()
            further_url = element.a["href"]
            abstract = element.p.get_text()
            if DBkeywords.checktrustful(further_url):
                print "trust url:" + str(further_url)
                continue
            print "working on:" + further_url
            # browser.set_page_load_timeout(120)
            browser.get(getextraweb.checkhttp(further_url))
            # try:
            #     WebDriverWait(browser, 20).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
            #     print "Page is ready!"
            # except TimeoutException:
            #     print "Loading took too much time!"
            time.sleep(30)
            body = browser.page_source
            test_DB.bing(ID[0], title, further_url, abstract, body,
                         currentdate.getdate(), ac)
            counter = counter + 1
        # except AttributeError:
        #     print "AttributeError"
        #     test_DB.suspect('bing', -1, ID[0], further_url, 3, currentdate.getdate(), 'AttributeError', title, abstract)
        #     browser.quit()
        #     continue
        except Exception, e:
            print str(e)
            # browser.quit()
            test_DB.suspect('bing', -1, ID[0], further_url, 3,
                            currentdate.getdate(), str(e), title, abstract)
            continue
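# --- Sketch: the assumed get_browser/firefox_binary helpers -----------------
# get_browser and firefox_binary are project-local and not shown in this
# excerpt. Under the old Selenium 2/3 Firefox API they plausibly look like the
# following (the binary path and the page-load timeout are assumptions for
# illustration, not the repo's actual values):
from selenium import webdriver
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary

firefox_binary = FirefoxBinary('/usr/bin/firefox')  # assumed install path


def get_browser(binary=None):
    # Start a Firefox session pointed at the given binary and cap page loads.
    browser = webdriver.Firefox(firefox_binary=binary)
    browser.set_page_load_timeout(180)
    return browser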
# print "Page is ready!" # except TimeoutException: # print "Loading took too much time!" body = browser.page_source test_DB.lycos(ID[0],Tittle,further_url,abstract,body,currentdate.getdate(),ac) except AttributeError: print str('AttributeError') test_DB.suspect('lycos',-1,ID[0],further_url,3,currentdate.getdate(),'AttributeError',Tittle,abstract) continue except Exception, e: print "caught exception :site:"+further_url +"keyword: "+keyword+" " test_DB.suspect('lycos',-1,ID[0],further_url,3,currentdate.getdate(),str(e),Tittle,abstract) print str(e) continue # browser.quit() if __name__ == "__main__": li=DBkeywords.loadsunsearchedkeywords('lycos') print li count =0 for l in li: # if count>10: # break try: url = 'http://search.lycos.com/web/?q='+l[0].replace(" ","+") get_search(url,l[0],'FR') time.sleep(50) except Exception, e: print str(e) continue # count +=1
    time.sleep(20)


if __name__ == "__main__":
    # res = DBkeywords.getkeyword()
    li = DBkeywords.loadsunsearchedkeywords('ask')
    for ke in li:
        try:
            url = 'http://www.ask.com/web?q=' + ke[0].replace(" ", "+")
            print url
            get_search(url, ke[0], 'FR')
        except Exception, e:
            print str(e)
            continue
if __name__ == "__main__":
    li = DBkeywords.loadsunsearchedkeywords('bing')
    for l in li:
        try:
            url = 'https://www.bing.com/search?q=' + l[0].replace(" ", "+")
            get_search(url, l[0], 'FR')
        except Exception, e:
            print str(e)
            continue
if __name__ == "__main__":
    res = DBkeywords.loadsunsearchedkeywords('search')
    for ke in res:
        try:
            url = 'https://www.search.com/web?q=' + ke[0].replace(" ", "+")
            get_search(url, ke[0], 'FR')
            time.sleep(50)
        except Exception, e:
            print str(e)
            continue
if __name__ == "__main__":
    # res = DBkeywords.getkeyword()
    li = DBkeywords.loadsunsearchedkeywords('aol')
    for ke in li:
        try:
            url = ('https://search.aol.com/aol/search?s_it=sb-top&v_t=na&q='
                   + ke[0].replace(" ", "+"))
            print url
            get_search(url, ke[0], 'DE')
            print "sleep"
            time.sleep(50)
            print "wake up"
        except Exception, e:
            print str(e)
            continue
if __name__ == "__main__":
    # res = DBkeywords.getkeyword()
    res = DBkeywords.loadsunsearchedkeywords('baidu')
    for ke in res:
        url = 'https://www.baidu.com/s?ie=utf-8&wd=' + ke[0].replace(" ", "+")
        print "main function of baidu: " + url
        try:
            get_search(url, ke[0], 'US')
        except TimeoutException, e:
            print "Timeout, retrying... keyword: " + ke[0]
            time.sleep(30)
            continue
        time.sleep(50)
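# --- Sketch: the DBkeywords interface the scrapers rely on ------------------
# DBkeywords and test_DB are database bindings that this excerpt only calls
# into. The method names below are taken from the calls above; the bodies are
# hypothetical in-memory stand-ins, useful for dry-running a scraper without
# the real database.
class DBkeywordsStub(object):
    def loadsunsearchedkeywords(self, engine):
        # Rows of (keyword,) not yet searched on the given engine.
        return [("example keyword",)]

    def getkeyword(self):
        # All stored keyword rows.
        return [("example keyword",)]

    def getunsearchedID(self, engine):
        # IDs of keywords not yet searched on the given engine.
        return [1]

    def findkeyword(self, keyword_id):
        # (keyword,) row for an ID.
        return ("example keyword",)

    def findkeywordID(self, keyword):
        # (id,) row for a keyword.
        return (1,)

    def checktrustful(self, url):
        # True if the URL is already on the trusted whitelist.
        return False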