Example #1
def getUrls2( page_num ):
   gs = GoogleSearch('shareholder letter')
   gs.results_per_page = 50
   gs.page = page_num
   results = gs.get_results()
   for item in results:
      print item.url.encode("utf8")
Example #2
    def go(self, query, pages):
        search = GoogleSearch(query)
        search.results_per_page = 10

        for i in range(pages):
            search.page = i
            results = search.get_results()
            for page in results:
                self.scrape(page)
Example #4
def google_search_results(search_query,
                          wait=40,
                          number_of_results=10,
                          encode=True,
                          max_fail_count=5,
                          current_fail_count=1,
                          random_text=None):
    ''' DO NOT MESS WITH THIS IT IS PERFECT FOR NOW'''
    # gets AT LEAST number_of_results results
    # don't query too fast or Google will block your IP temporarily
    # for this purpose, I have added the variable max_result_size
    # if your IP does get blocked, try later in the day or wait a day or two

    try:
        max_result_size = 10  #don't change it from this: the standard of 10 seems the least suspicious to google
        gs = GoogleSearch(search_query,
                          random_agent=True)  # does not actually search
        gs.results_per_page = max_result_size

        gs.page = 0
        times_tried = 0
        results = []
        prev = 0
        # print "getting results:"
        while len(results) < number_of_results:
            prev = len(results)
            times_tried += 1
            time.sleep(random.uniform(0.5 * wait, 1.5 * wait))
            results += gs.get_results(
            )  # Actual search and extraction of results.
            print "\rtimes_tried: %s\tlen(results): %s\tpage_number: %s" % (
                times_tried, len(results), gs.page),
        print "\n"

        # We now have a list of SearchResult objects, called 'results'.
        # A SearchResult object has three attributes -- "title", "desc", and "url".
        # They are Unicode strings, so do a proper encoding before outputting them. (done below)
        if encode:
            for i in range(0, len(results)):
                results[i].title = results[i].title.encode("utf8", "ignore")
                results[i].desc = results[i].desc.encode("utf8", "ignore")
                results[i].url = results[i].url
        # random.shuffle(results)
        return results

    except SearchError, e:
        print "Google Try #%s: Search failed on this url:\t%s" % (
            current_fail_count, e)
        google_search_redirect(random_text)
        if current_fail_count != max_fail_count:
            return google_search_results(
                search_query,
                wait=wait,
                number_of_results=number_of_results,
                encode=encode,
                max_fail_count=max_fail_count,
                current_fail_count=current_fail_count + 1)
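
A brief usage sketch for the function above; the query string and counts are only illustrative, and the or [] guard covers the None that comes back when every retry fails:

results = google_search_results('"shareholder letter" filetype:pdf', wait=40, number_of_results=30)
for r in results or []:   # None is returned if every retry failed
    print r.title
    print r.url
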
Example #5
def searchPage(textToSearch, page):
    items = []
    gs = GoogleSearch(textToSearch)
    gs.results_per_page = 100
    gs.page = page
    results = gs.get_results()
    for res in results:
        url = res.url.encode('utf8')
        items.append(url)
    return items
Example #6
 def scrape(self, keyword, pages=2):
     try:
         gs = GoogleSearch(keyword)
         gs.results_per_page = 10
         gs.page = 0
         results = gs.get_results()
         for res in results:
             url = res.url.encode('utf8')
             Title = res.title
             self.urls.append((url, Title))
     except SearchError, e:
         print "Search failed: %s" % e
Example #8
def DoSearch(mc, search, page, accountStatus=None):
    gs = GoogleSearch('site:' + ICEFILMS_URL + 'ip ' + search)
    gs.results_per_page = 25
    gs.page = page
    results = gs.get_results()

    for res in results:
        name = res.title.encode('utf8')
        name = CLEANSEARCH(name)
        url = res.url.encode('utf8')
        index = url.index("/ip")
        match = url[index:len(url)]

        addSearchResult(mc, name, match, 'Movie')
Example #9
def scrapsomesqlfiles(keyword, pages=20):
    try:
        for i in range(0,pages+1):
            wt = random.uniform(2, 5)   
            gs = GoogleSearch(keyword)
            gs.results_per_page = 50
            gs.page = i 
            results = gs.get_results()
            time.sleep(wt)
            print 'This is the %dth iteration and waited %f seconds' % (i, wt)
            for res in results:
                get_url_info(res.url.encode('utf8'))    
    except SearchError, e:
      print "Search failed: %s" % e
Example #10
def websearch(query):
    limit = config['web_results_limit']
    search_library = config['search_library_active']
    search_engine = config['search_engine_active']
    
    ret = []
    # Bing=50 per page, Google=10 - go figure!
    per_page = config[search_engine + '_per_page']
    pages = int(math.ceil(limit / float(per_page)))

    if search_library == 'pattern':
        if search_engine == 'bing':
            engine = Bing(license='cvzWROzO9Vaxqu0k33+y6h++ts+a4PLQfvA7HlyJyXM=', language="en")
        elif search_engine == 'google':
            engine = Google(license=config[config['use_whose_key'] + '_google_key'], language="en")
        for page in range(pages):
            try:
                # turns out start is the starting page and count is results per page
                # TODO: adjust count so the last page respects the limit, e.g. with limit=130 the third page
                # should fetch only 30, whereas this code fetches 50 for a total of 150; for now work in blocks of 50
                request = asynchronous(engine.search, clean_query(query), start=page+1, count=per_page, type=SEARCH, timeout=10, throttle=0.5)
                while not request.done:
                    time.sleep(0.01)
            except:
                raise
            if request.value != None:
                for result in request.value:
                    ret.append({'title' : result.title, 'description' : result.text})
            
    elif search_library == 'requests':
        for page in range(pages):
            offset = per_page * page
            params = {'$format': 'json', '$top': per_page,'$skip': offset}
            results = bing.search('web',clean_query(query),params)()['d']['results'][0]['Web']
            for result in results:
                ret.append({'title' : result['Title'], 'description' : result['Description']})
                
    elif search_library == 'xgoogle':
        for page in range(pages):
            try:
                # inject some delay
                time.sleep(0.04)
                gs = GoogleSearch(clean_query(query))
                gs.page = page+1
                gs.results_per_page = per_page
                results = gs.get_results()
                for res in results:
                    ret.append({'title' : res.title.encode("utf8"), 'description' : res.desc.encode("utf8")})
            except SearchError, e:
                print "Search failed: %s" % e
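
The comment in the pattern branch above notes that the last page can overshoot the requested limit (e.g. a limit of 130 fetched in blocks of 50 yields 150 results). A minimal sketch of the per-page count adjustment that comment describes, reusing the example's limit and per_page names purely for illustration:

import math

def per_page_counts(limit, per_page):
    # pages needed to cover the limit, e.g. limit=130, per_page=50 -> 3 pages
    pages = int(math.ceil(limit / float(per_page)))
    counts = []
    for page in range(pages):
        # the last page only asks for what is still missing: [50, 50, 30] rather than [50, 50, 50]
        counts.append(min(per_page, limit - page * per_page))
    return counts
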
Example #11
 def Search_YTonGoogle(self,search):
     # import Google Search
     from xgoogle.search import GoogleSearch
     # search on google
     gs = GoogleSearch(search+' site:http://www.youtube.com ')
     gs.results_per_page = 25
     gs.page = 0
     # return result or None
     try:
         results = gs.get_results()
         return results
     except Exception, e:
         print 'getTrailer --> Error: %s' % e
         return None
Example #12
def main():
    # the intitle: term roughly translates to "Ethical Hacker Technology Forum internal edition WEBSHELL"
    gs = GoogleSearch('intitle:道德黑客技术论坛内部专版WEBSHELL')
    gs.results_per_page = 100
    for index in range(4):
        gs.page = index + 1
        results = gs.get_results()
        for result in results:
            url = result.getURL() 
            print result
            
            ret = exploit(url)
            if ret == '':
                continue

            open('result.txt', 'a').write(ret)
Example #14
def searchHandler(user,command,args,mess):
    try:
      if len(args)<2:
        return "Please Provide your search Query"
      else:
          gs = GoogleSearch(args)
          gs.results_per_page = 10
          gs.page = 1
          results = gs.get_results()
          if len(results) > 0:
              for res in results:
                return res.title.encode("utf8") + "\n" + res.desc.encode("utf8") + "\n" + res.url.encode("utf8")
          else:
            return "No Search Result Found for your query."
    except SearchError, e:
      return "Search failed: %s" % e
Example #15
def searchHandler(user, command, args, mess):
    try:
        if len(args) < 2:
            return "Please Provide your search Query"
        else:
            gs = GoogleSearch(args)
            gs.results_per_page = 10
            gs.page = 1
            results = gs.get_results()
            if len(results) > 0:
                for res in results:
                    return res.title.encode("utf8") + "\n" + res.desc.encode(
                        "utf8") + "\n" + res.url.encode("utf8")
            else:
                return "No Search Result Found for your query."
    except SearchError, e:
        return "Search failed: %s" % e
Example #16
 def search(self, search):
     g = GoogleSearch(search)
     g.results_per_page = 50
     g.page = 1
     searchNot = self.mapKeeper.searchNot(search.replace(' ', '_'))
     results = g.get_results()
     print "number of results: ", len(results)
     for res in results:
         url = res.url.encode("utf8")
         base_url = url
         req = urllib2.Request(url,headers=self.hdr)
         try:
             response = urllib2.urlopen(req)
             print "Processing: ", url
         except (UnicodeEncodeError, urllib2.HTTPError, urllib2.URLError, socket.error, httplib.BadStatusLine), e:
             print url+": ", e
             continue
         page = BeautifulSoup(response, "lxml")
         images = page.select('img[alt]')
         for image in images:
             if search in image.get('alt').lower():
                 imageURL = image.get('src')
                 imageURL = urlparse.urljoin(base_url, imageURL)
                 if imageURL in searchNot:
                     print "Image is in searchNot: ", imageURL
                     continue
                 try:
                     imgdata = urllib2.urlopen(imageURL)
                 except urllib2.HTTPError, e:
                         print "Error: "+imageURL+":", e.code
                         self.mapKeeper.addNot(search.replace(' ', '_')+" "+imageURL)
                         continue
                 except urllib2.URLError, e:
                         print "Error: "+imageURL+":", e.args
                         self.mapKeeper.addNot(search.replace(' ', '_')+" "+imageURL)
                         continue
                 image_type,width,height = getimageinfo.getImageInfo(imgdata)
                 if image_type == ' ' or (width < 200 and height < 200):
                     print "Image Invalid: ", imageURL
                     self.mapKeeper.addNot(search.replace(' ', '_')+" "+imageURL)
                     continue
                 print "image type:", image_type, "width:", width, "height:", height
                 return imageURL
Example #17
def google_search(query):
  try:
    list = Set()
    for i in range(0,15):
      print "Step: " + str(i) + " for "+query
      gs = GoogleSearch(query)
      gs.results_per_page = 100
      gs.page=i
      results = gs.get_results()
      for res in results:
        url = res.url.encode('utf8')
        url = url[url.find(".")+1:find_nth(url, "/", 3)]
        if url.count('.', 0, len(url)) > 1:
          url = url[url.find(".")+1:len(url)]
        list.add(url)
        
    return list 
  except SearchError, e:
    print "Search failed: %s" % e
Example #18
def google_search(query):
    try:
        list = Set()
        for i in range(0, 15):
            print "Step: " + str(i) + " for " + query
            gs = GoogleSearch(query)
            gs.results_per_page = 100
            gs.page = i
            results = gs.get_results()
            for res in results:
                url = res.url.encode('utf8')
                url = url[url.find(".") + 1:find_nth(url, "/", 3)]
                if url.count('.', 0, len(url)) > 1:
                    url = url[url.find(".") + 1:len(url)]
                list.add(url)

        return list
    except SearchError, e:
        print "Search failed: %s" % e
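
Both copies of google_search above call a find_nth helper that is not shown; from its use it returns the index of the n-th occurrence of a substring (here, the third "/" in the URL). A plausible stand-in, offered only as an assumption about the missing helper:

def find_nth(haystack, needle, n):
    # index of the n-th occurrence of needle in haystack, or -1 if there are fewer than n
    start = haystack.find(needle)
    while start >= 0 and n > 1:
        start = haystack.find(needle, start + 1)
        n -= 1
    return start
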
Example #19
def GetSearchResults(query=None,type=None,imdb_id=None, exact=False):
	
	if (type=="movies"):
		# This is a Google search; the exclusion terms omit TV show pages.
		search = 'intitle:%s -"Episode List" -"Series Rating" site:%s' % (query,ICEFILMS_URL)
	else:
		search = 'allintitle:%s "Episode List" site:%s' % (query, ICEFILMS_URL)
	
	gs = GoogleSearch(search)
	gs.results_per_page = 25
	gs.page = 0
	results = gs.get_results()
	items = []
	
	for res in results:
	
		name = re.sub(
			'(<em>|</em>|<a>|</a>|DivX|-|icefilms(\.info)?|<b>\.\.\.</b>|Episode List|links)',
			'',
			res.title.encode('utf8')
		).strip()

		url=res.url
		video_url = re.search("icefilms\.info(/.*)", url).group(1)
		
		res = MediaInfo()
		
		res.type = type
		res.title = name

		match = re.search("(.*)\((\d*)\)", res.title)
		
		if (match):
			res.title = match.group(1).strip()
			res.year = int(match.group(2).strip())
			
		res.id = video_url
		
		items.append(res)
	
	return items
Example #20
def google_search_results(search_query, wait=40, number_of_results=10, encode=True, max_fail_count=5, current_fail_count=1, random_text=None):
	''' DO NOT MESS WITH THIS IT IS PERFECT FOR NOW'''
	# gets AT LEAST number_of_results results
	# don't query too fast or Google will block your IP temporarily 
	# for this purpose, I have added the variable max_result_size
	# if your IP does get blocked, try later in the day or wait a day or two


	try:
		max_result_size=10 #don't change it from this: the standard of 10 seems the least suspicious to google
		gs = GoogleSearch(search_query, random_agent=True) # does not actually search
		gs.results_per_page = max_result_size
		
		gs.page=0
		times_tried=0
		results=[]
		prev=0
		# print "getting results:"	
		while len(results) < number_of_results:
			prev=len(results)
			times_tried+=1
			time.sleep(random.uniform(0.5*wait, 1.5*wait))
			results+=gs.get_results() # Actual search and extraction of results.
			print "\rtimes_tried: %s\tlen(results): %s\tpage_number: %s"%(times_tried, len(results), gs.page),
		print "\n"

		# We now have a list of SearchResult objects, called 'results'.
		# A SearchResult object has three attributes -- "title", "desc", and "url".
		# They are Unicode strings, so do a proper encoding before outputting them. (done below)
		if encode:
			for i in range (0, len(results)):
				results[i].title=results[i].title.encode("utf8", "ignore")
				results[i].desc=results[i].desc.encode("utf8", "ignore")
				results[i].url=results[i].url
		# random.shuffle(results)
		return results

	except SearchError, e:
		print "Google Try #%s: Search failed on this url:\t%s" %(current_fail_count,e)
		google_search_redirect(random_text)
		if current_fail_count!=max_fail_count:
			return google_search_results(search_query, wait=wait, number_of_results=number_of_results, encode=encode, max_fail_count=max_fail_count, current_fail_count=current_fail_count+1)
Example #21
def GetSearchResults(query=None,type=None,imdb_id=None):
	
	if (type=="movies"):
		# This is a Google search; the exclusion terms omit TV show pages.
		search = 'intitle:%s -"Episode List" -"Series Rating" site:%s' % (query,ICEFILMS_URL)
	else:
		search = 'allintitle:%s "Episode List" site:%s' % (query, ICEFILMS_URL)
	
	gs = GoogleSearch(search)
	gs.results_per_page = 25
	gs.page = 0
	results = gs.get_results()
	items = []
	
	for res in results:
	
		name = re.sub(
			'(<em>|</em>|<a>|</a>|DivX|-|icefilms(\.info)?|<b>\.\.\.</b>|Episode List|links)',
			'',
			res.title.encode('utf8')
		).strip()

		url=res.url
		video_url = re.search("icefilms\.info(/.*)", url).group(1)
		
		res = MediaInfo()
		
		res.type = type
		res.title = name

		match = re.search("(.*)\((\d*)\)", res.title)
		
		if (match):
			res.title = match.group(1).strip()
			res.year = int(match.group(2).strip())
			
		res.id = video_url
		
		items.append(res)
	
	return items
Example #22
def getGooogleResults(query, exclude):
    try:
        print "Searching for {0} but excluding these {1}".format(query, exclude)
        page = 1
        gs = GoogleSearch(query)
        gs.results_per_page = 1000
        results = gs.get_results()
        print "results", gs.num_results  # number of results
        while page < 100:
            gs.page = page
            results += gs.get_results()
            page += 1
        #print len(results)
        i = 1
        file = open("researchResults/" + query + "/data/searchResults.txt", "w")
        for res in results:
            #write the file to disk in the expected format
            file.write("{0}^{1}^{2}\n".format(i, res.title.encode("utf8"), res.url.encode("utf8")))
            i += 1
        file.close()
        print "{0} results found".format(i)
    except SearchError, e:
      print "Search failed: %s" % e
Example #23
import re, sys
from xgoogle.search import GoogleSearch, SearchError

def store_urls(website_text):
    uva_list=[]
    f1=open('uvasearch.txt','w')
    for match in re.finditer(r'(href="http://)([\w\.-_]+)(virginia.edu)([\w\.-_]*)',website_text):
        clipped=re.search('(href=")(http://[\w\.-_]+)',match.group(0))
        f1.write(clipped.group(2)+'\n')
        uva_list.append(clipped.group(2))
    f1.close()
    return sorted(uva_list)

handle=retrieve_web_page("http://www.virginia.edu/atoz/")
website_text=handle.read()
uva_list=store_urls(website_text)

#scrape related urls from google
try:
    gs=GoogleSearch("University of Virginia")
    gs.results_per_page=25
    search_list=[]
    f2=open('googlesearch.txt','w')
    for page in range(0,5):
        gs.page=page
        results=gs.get_results()
        for result in results:
            search_list.append(result.url.encode('utf8'))
            f2.write(result.url.encode('utf8')+'\n')
    f2.close()
except SearchError, e:
    sys.exit(1)

#compare the two url lists
common=set(search_list).intersection(set(uva_list))
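
The snippet above depends on a retrieve_web_page helper that is not included; any function returning a file-like HTTP response works, since only its read() is used. A minimal urllib2-based stand-in (an assumption, not the original project's code):

import urllib2

def retrieve_web_page(url):
    # fetch the page and return the response object whose read() supplies the HTML text
    request = urllib2.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    return urllib2.urlopen(request)
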
Example #24
#!/usr/bin/python
#
# This program stores urls from Google search results on a keyword.
#
import time, random
from xgoogle.search import GoogleSearch, SearchError

for i in range(5, 100, 5):
    
    f = open('pp'+str(i)+'.txt','wb')
    
    for j in range(i-5,i):
        wt = random.uniform(1.5, 3.5)
        gs = GoogleSearch("privacy policy")
        gs.results_per_page = 10
        gs.page = j
        results = gs.get_results()
        #Try not to annoy Google: wait a random short time
        time.sleep(wt)
        print 'This is the %dth iteration and waited %f seconds' % (j, wt)
        for res in results:
            f.write(res.url.encode("utf8"))
            f.write("\n")
    print 'Page %d to %d done' % (i-4,i)
    f.close()
    
    #Wait few more seconds for every 50 urls
    time.sleep(random.uniform(3, 5))

print "All done"
Example #25
import time, random
from xgoogle.search import GoogleSearch, SearchError

f = open('a.txt', 'wb')

for i in range(0, 1000):
    wt = random.uniform(3, 10)
    q = '"manage.py" "settings.py" "urls.py" intitle:"- Github" -inurl:/wiki/ -inurl:/blob/ -inurl:manage.py -inurl:commits -inurl:README -inurl:gist site:github.com'
    gs = GoogleSearch(q)
    gs.results_per_page = 100
    gs.page = i
    results = gs.get_results()
    #Try not to annoy Google: wait a random short time
    time.sleep(wt)
    print 'This is the %dth iteration and waited %f seconds' % (i, wt)
    for res in results:
        f.write(res.url.encode("utf8"))
        f.write("\n")
    f.flush()

print "Done"
f.close()
Example #26
import time, random
from xgoogle.search import GoogleSearch, SearchError

f = open('a.txt','wb')

for i in range(0,5):
    wt = random.uniform(2, 5)
    gs = GoogleSearch("sport calcio")
    gs.results_per_page = 10
    gs.page = i
    results = gs.get_results()
    #Try not to annoy Google: wait a random short time
    time.sleep(wt)
    print 'This is the %dth iteration and waited %f seconds' % (i, wt)
    for res in results:
        f.write(res.url.encode("utf8"))
        f.write("\n")

print "Done"
f.close()
Example #27
    cnt = 0

    gs = GoogleSearch(query)
    gs.filetype = mfiletype
    gs.results_per_page = 50
    pgCnt = 1

    if n_cnt <> 0:
        print "Download Continuing from result number: ",str(n_cnt)

    #import pdb

    lastUrl = None

    while True:
        gs.page = pgCnt
        results = gs.get_results()
        pgCnt = pgCnt +1  #Increase page count to next page

        if not results: # no more results were found
            break

        for res in results:
            cnt = cnt +1
            print "Search No. : ",str(cnt)
            print res.title.encode("utf8")
            # print res.desc.encode("utf8")
            temp_url = res.url.encode("utf8")
            if n_cnt <> 0:
                 if cnt < n_cnt:
                    continue