def getUrls2(page_num):
    gs = GoogleSearch('shareholder letter')
    gs.results_per_page = 50
    gs.page = page_num
    results = gs.get_results()
    for item in results:
        print item.url.encode("utf8")
def go(self, query, pages):
    search = GoogleSearch(query)
    search.results_per_page = 10
    for i in range(pages):
        search.page = i
        results = search.get_results()
        for page in results:
            self.scrape(page)
def google_search_results(search_query, wait=40, number_of_results=10, encode=True,
                          max_fail_count=5, current_fail_count=1, random_text=None):
    '''DO NOT MESS WITH THIS IT IS PERFECT FOR NOW'''
    # Gets AT LEAST number_of_results results.
    # Don't query too fast or Google will block your IP temporarily; that is
    # what max_result_size and the randomized wait below are for.
    # If your IP does get blocked, try later in the day or wait a day or two.
    try:
        max_result_size = 10  # don't change this: the standard 10 per page looks least suspicious to Google
        gs = GoogleSearch(search_query, random_agent=True)  # does not actually search yet
        gs.results_per_page = max_result_size
        gs.page = 0
        times_tried = 0
        results = []
        prev = 0
        # print "getting results:"
        while len(results) < number_of_results:
            prev = len(results)
            times_tried += 1
            time.sleep(random.uniform(0.5 * wait, 1.5 * wait))
            results += gs.get_results()  # the actual search and extraction of results
            print "\rtimes_tried: %s\tlen(results): %s\tpage_number: %s" % (times_tried, len(results), gs.page),
            print "\n"
        # We now have a list of SearchResult objects, called 'results'.
        # A SearchResult object has three attributes -- "title", "desc", and "url".
        # They are Unicode strings, so do a proper encoding before outputting them (done below).
        if encode:
            for i in range(0, len(results)):
                results[i].title = results[i].title.encode("utf8", "ignore")
                results[i].desc = results[i].desc.encode("utf8", "ignore")
                results[i].url = results[i].url
        # random.shuffle(results)
        return results
    except SearchError, e:
        print "Google Try #%s: Search failed on this url:\t%s" % (current_fail_count, e)
        google_search_redirect(random_text)
        if current_fail_count != max_fail_count:
            return google_search_results(search_query, wait=wait,
                                         number_of_results=number_of_results,
                                         encode=encode, max_fail_count=max_fail_count,
                                         current_fail_count=current_fail_count + 1)
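# A minimal usage sketch for google_search_results above, assuming the function
# and its time/random/xgoogle imports are in scope. The query string and the
# result count are illustrative only; note the default wait of ~40 seconds
# between page fetches makes this deliberately slow.
results = google_search_results("shareholder letter", number_of_results=20)
if results:
    for res in results:
        print res.title
        print res.url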
def searchPage(textToSearch, page):
    items = []
    gs = GoogleSearch(textToSearch)
    gs.results_per_page = 100
    gs.page = page
    results = gs.get_results()
    for res in results:
        url = res.url.encode('utf8')
        items.append(url)
    return items
def scrape(self, keyword, pages=2):
    try:
        gs = GoogleSearch(keyword)
        gs.results_per_page = 10
        gs.page = 0
        results = gs.get_results()
        for res in results:
            url = res.url.encode('utf8')
            Title = res.title
            self.urls.append((url, Title))
    except SearchError, e:
        print "Search failed: %s" % e
def DoSearch(mc, search, page, accountStatus=None):
    gs = GoogleSearch('site:' + ICEFILMS_URL + 'ip ' + search)
    gs.results_per_page = 25
    gs.page = page
    results = gs.get_results()
    for res in results:
        name = res.title.encode('utf8')
        name = CLEANSEARCH(name)
        url = res.url.encode('utf8')
        # keep only the "/ip..." portion of the result url
        index = url.index("/ip")
        match = url[index:len(url)]
        addSearchResult(mc, name, match, 'Movie')
def scrapsomesqlfiles(keyword, pages=20):
    try:
        for i in range(0, pages + 1):
            wt = random.uniform(2, 5)
            gs = GoogleSearch(keyword)
            gs.results_per_page = 50
            gs.page = i
            results = gs.get_results()
            time.sleep(wt)
            print 'This is the %dth iteration and waited %f seconds' % (i, wt)
            for res in results:
                get_url_info(res.url.encode('utf8'))
    except SearchError, e:
        print "Search failed: %s" % e
def websearch(query):
    limit = config['web_results_limit']
    search_library = config['search_library_active']
    search_engine = config['search_engine_active']
    ret = []
    # Bing=50 per page, Google=10 - go figure!
    per_page = config[search_engine + '_per_page']
    pages = int(math.ceil(limit / float(per_page)))
    if search_library == 'pattern':
        if search_engine == 'bing':
            engine = Bing(license='cvzWROzO9Vaxqu0k33+y6h++ts+a4PLQfvA7HlyJyXM=', language="en")
        elif search_engine == 'google':
            engine = Google(license=config[config['use_whose_key'] + '_google_key'], language="en")
        for page in range(pages):
            try:
                # It turns out start = starting page and count = results per page.
                # We could add logic to trim count on the last page: if limit were 130,
                # on page 3 count should be 30, whereas this code fetches 50 for a total
                # of 150. For now we just work in blocks of per_page (see sketch below).
                request = asynchronous(engine.search, clean_query(query), start=page + 1,
                                       count=per_page, type=SEARCH, timeout=10, throttle=0.5)
                while not request.done:
                    time.sleep(0.01)
            except:
                raise
            if request.value is not None:
                for result in request.value:
                    ret.append({'title': result.title, 'description': result.text})
    elif search_library == 'requests':
        for page in range(pages):
            offset = per_page * page
            params = {'$format': 'json', '$top': per_page, '$skip': offset}
            results = bing.search('web', clean_query(query), params)()['d']['results'][0]['Web']
            for result in results:
                ret.append({'title': result['Title'], 'description': result['Description']})
    elif search_library == 'xgoogle':
        for page in range(pages):
            try:
                # inject some delay
                time.sleep(0.04)
                gs = GoogleSearch(clean_query(query))
                gs.page = page + 1
                gs.results_per_page = per_page
                results = gs.get_results()
                for res in results:
                    ret.append({'title': res.title.encode("utf8"), 'description': res.desc.encode("utf8")})
            except SearchError, e:
                print "Search failed: %s" % e
    return ret
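# A minimal sketch (not part of websearch itself) of the last-page trimming its
# comment alludes to: with limit=130 and per_page=50 this yields 50, 50, 30
# instead of fetching 150 results. The helper name page_counts is hypothetical.
import math

def page_counts(limit, per_page):
    pages = int(math.ceil(limit / float(per_page)))
    counts = []
    for page in range(pages):
        remaining = limit - page * per_page
        counts.append(min(per_page, remaining))  # trim the final page
    return counts

print page_counts(130, 50)  # [50, 50, 30]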
def Search_YTonGoogle(self, search):
    # import Google Search
    from xgoogle.search import GoogleSearch
    # search on google, restricted to YouTube
    gs = GoogleSearch(search + ' site:http://www.youtube.com ')
    gs.results_per_page = 25
    gs.page = 0
    # return results or None
    try:
        results = gs.get_results()
        return results
    except Exception, e:
        print 'getTrailer --> Error: %s' % e
        return None
def main():
    # Google dork for pages titled with a Chinese webshell signature
    # ("Ethical Hacker Technology Forum internal edition WEBSHELL")
    gs = GoogleSearch('intitle:道德黑客技术论坛内部专版WEBSHELL')
    gs.results_per_page = 100
    for index in range(4):
        gs.page = index + 1
        results = gs.get_results()
        for result in results:
            url = result.getURL()
            print result
            ret = exploit(url)
            if ret == '':
                continue
            open('result.txt', 'a').write(ret)
def searchHandler(user, command, args, mess):
    try:
        if len(args) < 2:
            return "Please provide your search query"
        else:
            gs = GoogleSearch(args)
            gs.results_per_page = 10
            gs.page = 1
            results = gs.get_results()
            if len(results) > 0:
                # reply with the first result only
                for res in results:
                    return res.title.encode("utf8") + "\n" + res.desc.encode("utf8") + "\n" + res.url.encode("utf8")
            else:
                return "No search result found for your query."
    except SearchError, e:
        return "Search failed: %s" % e
def search(self, search):
    g = GoogleSearch(search)
    g.results_per_page = 50
    g.page = 1
    searchNot = self.mapKeeper.searchNot(search.replace(' ', '_'))
    results = g.get_results()
    print "number of results: ", len(results)
    for res in results:
        url = res.url.encode("utf8")
        base_url = url
        req = urllib2.Request(url, headers=self.hdr)
        try:
            response = urllib2.urlopen(req)
            print "Processing: ", url
        except (UnicodeEncodeError, urllib2.HTTPError, urllib2.URLError, socket.error, httplib.BadStatusLine), e:
            print url + ": ", e
            continue
        page = BeautifulSoup(response, "lxml")
        # consider only images that carry an alt attribute
        images = page.select('img[alt]')
        for image in images:
            if search in image.get('alt').lower():
                imageURL = image.get('src')
                imageURL = urlparse.urljoin(base_url, imageURL)
                if imageURL in searchNot:
                    print "Image is in searchNot: ", imageURL
                    continue
                try:
                    imgdata = urllib2.urlopen(imageURL)
                except urllib2.HTTPError, e:
                    print "Error: " + imageURL + ":", e.code
                    self.mapKeeper.addNot(search.replace(' ', '_') + " " + imageURL)
                    continue
                except urllib2.URLError, e:
                    print "Error: " + imageURL + ":", e.args
                    self.mapKeeper.addNot(search.replace(' ', '_') + " " + imageURL)
                    continue
                # skip images of unknown type and tiny images (icons, spacers)
                image_type, width, height = getimageinfo.getImageInfo(imgdata)
                if image_type == ' ' or (width < 200 and height < 200):
                    print "Image Invalid: ", imageURL
                    self.mapKeeper.addNot(search.replace(' ', '_') + " " + imageURL)
                    continue
                print "image type:", image_type, "width:", width, "height:", height
                return imageURL
def google_search(query):
    try:
        domains = Set()  # collects unique second-level domains
        for i in range(0, 15):
            print "Step: " + str(i) + " for " + query
            gs = GoogleSearch(query)
            gs.results_per_page = 100
            gs.page = i
            results = gs.get_results()
            for res in results:
                url = res.url.encode('utf8')
                # keep only the host part, dropping the scheme and any path
                url = url[url.find(".") + 1:find_nth(url, "/", 3)]
                # strip one more leading label if a subdomain is still present
                if url.count('.', 0, len(url)) > 1:
                    url = url[url.find(".") + 1:len(url)]
                domains.add(url)
        return domains
    except SearchError, e:
        print "Search failed: %s" % e
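# Worked example of the slicing in google_search above, on an illustrative URL:
# find(".")+1 skips past "http://news", and the third "/" marks the end of the
# host, leaving "example.com". A hypothetical find_nth is defined here so the
# sketch runs standalone; the real project supplies its own.
def find_nth(s, sub, n):
    pos = -1
    for _ in range(n):
        pos = s.find(sub, pos + 1)
        if pos == -1:
            break
    return pos

url = "http://news.example.com/path/page.html"
url = url[url.find(".") + 1:find_nth(url, "/", 3)]
if url.count('.', 0, len(url)) > 1:
    url = url[url.find(".") + 1:len(url)]
print url  # example.com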
def GetSearchResults(query=None, type=None, imdb_id=None, exact=False):
    if (type == "movies"):
        # This is a Google search. Excluding "Episode List" and "Series Rating"
        # omits all TV shows.
        search = 'intitle:%s -"Episode List" -"Series Rating" site:%s' % (query, ICEFILMS_URL)
    else:
        search = 'allintitle:%s "Episode List" site:%s' % (query, ICEFILMS_URL)
    gs = GoogleSearch(search)
    gs.results_per_page = 25
    gs.page = 0
    results = gs.get_results()
    items = []
    for res in results:
        # strip markup and site boilerplate from the result title
        name = re.sub('(<em>|</em>|<a>|</a>|DivX|-|icefilms(\.info)?|<b>\.\.\.</b>|Episode List|links)', '', res.title.encode('utf8')).strip()
        url = res.url
        video_url = re.search("icefilms\.info(/.*)", url).group(1)
        res = MediaInfo()
        res.type = type
        res.title = name
        # split a trailing "(year)" out of the title
        match = re.search("(.*)\((\d*)\)", res.title)
        if (match):
            res.title = match.group(1).strip()
            res.year = int(match.group(2).strip())
        res.id = video_url
        items.append(res)
    return items
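# A quick illustration (hypothetical input) of the title/year split performed
# at the end of GetSearchResults: the year is pulled out of a trailing "(NNNN)".
import re
title = "The Matrix (1999)"
match = re.search("(.*)\((\d*)\)", title)
if match:
    print match.group(1).strip()       # The Matrix
    print int(match.group(2).strip())  # 1999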
def getGooogleResults(query, exclude):
    try:
        print "Searching for {0} but excluding these {1}".format(query, exclude)
        page = 1
        gs = GoogleSearch(query)
        gs.results_per_page = 1000
        results = gs.get_results()
        print "results", gs.num_results  # total number of results Google reports
        while page < 100:
            gs.page = page
            results += gs.get_results()
            page += 1
        # print len(results)
        i = 1
        file = open("researchResults/" + query + "/data/searchResults.txt", "w")
        for res in results:
            # write the results to disk in the expected "index^title^url" format
            file.write("{0}^{1}^{2}\n".format(i, res.title.encode("utf8"), res.url.encode("utf8")))
            i += 1
        file.close()
        print "{0} results found".format(i - 1)
    except SearchError, e:
        print "Search failed: %s" % e
import re, sys
from xgoogle.search import GoogleSearch, SearchError

def store_urls(website_text):
    uva_list = []
    f1 = open('uvasearch.txt', 'w')
    for match in re.finditer(r'(href="http://)([\w\.-_]+)(virginia.edu)([\w\.-_]*)', website_text):
        clipped = re.search('(href=")(http://[\w\.-_]+)', match.group(0))
        f1.write(clipped.group(2) + '\n')
        uva_list.append(clipped.group(2))
    f1.close()
    return sorted(uva_list)

# retrieve_web_page is defined elsewhere in this project
handle = retrieve_web_page("http://www.virginia.edu/atoz/")
website_text = handle.read()
uva_list = store_urls(website_text)

# scrape related urls from google
try:
    gs = GoogleSearch("University of Virginia")
    gs.results_per_page = 25
    search_list = []
    f2 = open('googlesearch.txt', 'w')
    for page in range(0, 5):
        gs.page = page
        results = gs.get_results()
        for result in results:
            search_list.append(result.url.encode('utf8'))
            f2.write(result.url.encode('utf8') + '\n')
    f2.close()
except SearchError, e:
    sys.exit(1)

# compare the two lists
common = set(search_list).intersection(set(uva_list))
#!/usr/bin/python
#
# This program stores urls from Google search results on a keyword.
#
import time, random
from xgoogle.search import GoogleSearch, SearchError

for i in range(5, 100, 5):
    f = open('pp' + str(i) + '.txt', 'wb')
    for j in range(i - 5, i):
        wt = random.uniform(1.5, 3.5)
        gs = GoogleSearch("privacy policy")
        gs.results_per_page = 10
        gs.page = j
        results = gs.get_results()
        # Try not to annoy Google, with a random short wait
        time.sleep(wt)
        print 'This is the %dth iteration and waited %f seconds' % (j, wt)
        for res in results:
            f.write(res.url.encode("utf8"))
            f.write("\n")
    print 'Pages %d to %d done' % (i - 4, i)
    f.close()
    # Wait a few more seconds for every 50 urls
    time.sleep(random.uniform(3, 5))
print "All done"
import time, random
from xgoogle.search import GoogleSearch, SearchError

f = open('a.txt', 'wb')
for i in range(0, 1000):
    wt = random.uniform(3, 10)
    # Google dork for Django projects indexed on GitHub
    q = '"manage.py" "settings.py" "urls.py" intitle:"- Github" -inurl:/wiki/ -inurl:/blob/ -inurl:manage.py -inurl:commits -inurl:README -inurl:gist site:github.com'
    gs = GoogleSearch(q)
    gs.results_per_page = 100
    gs.page = i
    results = gs.get_results()
    # Try not to annoy Google, with a random short wait
    time.sleep(wt)
    print 'This is the %dth iteration and waited %f seconds' % (i, wt)
    for res in results:
        f.write(res.url.encode("utf8"))
        f.write("\n")
    f.flush()
print "Done"
f.close()
import time, random
from xgoogle.search import GoogleSearch, SearchError

f = open('a.txt', 'wb')
for i in range(0, 5):
    wt = random.uniform(2, 5)
    gs = GoogleSearch("sport calcio")
    gs.results_per_page = 10
    gs.page = i
    results = gs.get_results()
    # Try not to annoy Google, with a random short wait
    time.sleep(wt)
    print 'This is the %dth iteration and waited %f seconds' % (i, wt)
    for res in results:
        f.write(res.url.encode("utf8"))
        f.write("\n")
print "Done"
f.close()
cnt = 0
gs = GoogleSearch(query)
gs.filetype = mfiletype
gs.results_per_page = 50
pgCnt = 1
if n_cnt != 0:
    print "Download continuing from result number: ", str(n_cnt)
# import pdb
lastUrl = None
while True:
    gs.page = pgCnt
    results = gs.get_results()
    pgCnt = pgCnt + 1  # advance to the next page
    if not results:  # no more results were found
        break
    for res in results:
        cnt = cnt + 1
        print "Search No. : ", str(cnt)
        print res.title.encode("utf8")
        # print res.desc.encode("utf8")
        temp_url = res.url.encode("utf8")
        if n_cnt != 0:
            # skip results already handled in a previous run
            if cnt < n_cnt:
                continue