Example #1
def _get_goog_urls(self, query):
    g = pygoogle.pygoogle(query)
    g.pages = 1
    g.hl = "en"
    self.l.info("Google search result count: %s" % g.get_result_count())
    if g.get_result_count() > 0:
        return g.search_page_wise()[0]
    else:
        g = pygoogle.pygoogle("site:blogspot.com groot")
        g.pages = 1
        g.hl = "en"
        self.l.info("No results for original query, retrying with 'groot'")
        return g.search_page_wise()[0]
Example #2
def do_action(lcars, string, case):
	target = string.split(' ', 1)[1]

	g = pygoogle("site:youtube.com " + target)
	g.pages = 1
	urls = g.get_urls()
	if len(urls) == 0:
		reply = "No results found for" + target
		lcars.reply_with(reply)
		return reply
	link = urls[0]

	if "user" in link or "store" in link or "feed" in link or "playlist" in link or "channel" in link:
		for url in urls:
			if "user" in url:
				link = "http://youtube.nestharion.de/" + url.split('/')[-1]
				break
		if not link:
			for url in urls:
				if "store" in url or "feed" in url or "playlist" in url or "channel" in url:
					continue
				else:
					link = url
					break
	if not link:
		link = urls[randint(0, len(urls) - 1)]

	lcars.background(["google-chrome", link])
	reply = "Playing " + target
	lcars.reply_with(reply)
	os.system("sleep 1")
	os.system("xdotool windowactivate --sync $(xdotool search --class Chrome | head -n 1) & sleep 0.3; pkill xdotool")
	os.system("xdotool windowactivate --sync $(xdotool search --class Chrome | tail -n 1) & sleep 0.3; pkill xdotool")

	return reply
Example #3
def get_impact_factor_from_issn(issn="1475-7516", debug=False):
    """
      For the input ISSN in the format NNNN-NNNN obtain
      the headers and the datasets in a nested list
      equivalent to an array of (# headers)*[4 (years)]
    """
    g = pygoogle("site:http://www.bioxbio.com/if/html " + issn)
    g.pages = 1
    if g.get_urls():
        if_file = urllib.urlopen(g.get_urls()[0])
        html = if_file.read()
        if_file.close()
    else:
        return [], []

    if debug:
        print (html)
    soup = BeautifulSoup(html)
    table = soup.find("table")

    # The first tr contains the field names.
    headings = [th.get_text().strip() for th in table.find("tr").find_all("td")]

    datasets = []
    for row in table.find_all("tr")[1:]:
        dataset = [eval(td.get_text().replace("-", "0")) for td in row.find_all("td") if td.get_text().strip()]
        datasets.append(dataset)

    return headings, datasets
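A hypothetical call to the function above (the output shapes are inferred from the parsing code, not verified against a live bioxbio page) might look like this:

headings, datasets = get_impact_factor_from_issn("1475-7516")
if headings:
    print headings              # column names taken from the first table row
    for row in datasets:
        print row               # one list of numeric values per following row
else:
    print "No result URL found for that ISSN"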
Example #4
def LinkedIn(linkedinusername):
	from pygoogle import pygoogle
	g = pygoogle("linkedin "+linkedinusername)
	g.pages = 5
	g.get_result_count()
	myURLs = g.get_urls()
	return myURLs
Example #5
    def dork(self,search_term,p,output):
        print YELLOW + "[+] " + END + WHITE + "Searching for " + END + "%s " % search_term
        gs = pygoogle(search_term)
        gs.pages = p
        print YELLOW + "[+] " + END + WHITE + "Results Found : " + END + "%s " % (gs.get_result_count())
        if gs.get_result_count() == 0:
            print RED + "[-] " + END + WHITE + "No Results Found" + END
            time.sleep(1)
            sys.exit()

        print YELLOW + "[+] " + END + WHITE + "Fetching " + END + "[%s] Results " % (gs.get_result_count())
        url_list = gs.get_urls()

        if len(url_list) == 0:
            print YELLOW + "[!] " + END + WHITE + "Got 0 URLs" + END
            print RED + "[!] " + END + WHITE + "Nothing to save" + END
            time.sleep(1)
            sys.exit()
            
        elif len(url_list) >= 1:
            print YELLOW + "[+] " + END + WHITE + "Got " + END + "[%s] URLs" % (len(url_list))
            print YELLOW + "[+] " + END + WHITE + "Writing URLs to " + END + "[%s] " % (output)

            with open(output,'w') as w_file:
                for i in url_list: w_file.write(i+'\n')
            print YELLOW + "[+] " + END + WHITE + "URLs saved to " + END + "[%s] " % (output)

            time.sleep(2)
Example #6
def googleIt(url):
		db = database()
		source = url
		en = ["it","zh-Hans","fr","nl","es","pt-BR","ca","pa","qu","mr","mo","mn","ne","pcm","nn","or","qu"]
		random.shuffle(en)

		search = pygoogle.pygoogle(hl=en[0],query="site:"+source)
		urlList = search.get_urls()

		print urlList
		sha1 = hashlib.sha1()

		for eachUrl in urlList:
			#Generate hash for url - used as primary key for database
			try:
				eachUrl = u"".join(eachUrl).encode('utf-8').strip()
				sha1.update(eachUrl)
				hash = sha1.hexdigest()
				numTLD = db.countTLD(eachUrl)
				#Persist item in database 
				db.addGoodUrl(source,hash, eachUrl, numTLD)
			except Exception:
				pass
		print 'Done'
		db.close()
Example #7
def fresh_google_check(link: str, attempt=5, debug=False):
    """
    Проверяет, индексировался ли уже ресурс гуглом раньше
    чем за 2 недели до сегодня.
    :param link:
    :param attempt:
    :return:
    """
    if debug:
        return False
    try:
        assert isinstance(link, str)
        today = datetime.date.today()
        date_s = _date_to_julian_day(today - datetime.timedelta(days=365 * 8))
        date_e = _date_to_julian_day(today - datetime.timedelta(days=7 * 2))
        query = u'site:%s daterange:%s-%s' % (link, date_s, date_e,)

        result = False
        for i in range(0, attempt):
            g = pygoogle(
                query.encode('utf-8'),
                raise_http_exceptions=True,
                proxies=settings.PROXIES_FOR_GOOGLING
            )

            try:
                result = bool(g.get_result_count())
            except PyGoogleHttpException as e:
                renew_connection()
                continue
            break
    except (AssertionError, PyGoogleHttpException, stem.SocketError):
        result = False

    return result
Example #8
def searchGoogle(searchTerm):

    print ''
    print 'Searching Google...'
    print ''

    googler = pygoogle(
        searchTerm)  # initialize pygoogle object with search term
    googler.pages = 3  # set max pages

    print '*********************************'
    print 'Google Results'
    print '*********************************'
    print ''

    # display google results in a formatted way
    for keys, values in googler.search().items():
        theKey = keys.replace("&#39;", "'")
        theKey = theKey.replace("&amp;", "&")
        theValue = values.replace("&#39;", "'")
        theValue = theValue.replace("&amp;", "&")
        print 'Title: ' + (theKey.encode('ascii', 'ignore'))
        print 'URL: ' + (theValue.encode('ascii', 'ignore'))
        print ''
    print ''
    print '*********************************'
    print ''
Example #9
def fresh_google_check(link: str, attempt=5, debug=False):
    """Проверяет, индексировался ли уже ресурс гуглом раньше.

    чем за 2 недели до сегодня.
    :param link:
    :param attempt:
    :return:

    """
    if debug:
        return False
    try:
        assert isinstance(link, str)
        today = datetime.date.today()
        date_s = _date_to_julian_day(today - datetime.timedelta(days=365 * 8))
        date_e = _date_to_julian_day(today - datetime.timedelta(days=7 * 2))
        query = u'site:%s daterange:%s-%s' % (link, date_s, date_e, )

        result = False
        for i in range(0, attempt):
            g = pygoogle(query.encode('utf-8'),
                         raise_http_exceptions=True,
                         proxies=settings.PROXIES_FOR_GOOGLING)

            try:
                result = bool(g.get_result_count())
            except PyGoogleHttpException as e:
                renew_connection()
                continue
            break
    except (AssertionError, PyGoogleHttpException, stem.SocketError):
        result = False

    return result
Example #10
def searchGoogle(searchTerm): 

    print ''
    print 'Searching Google...'
    print ''

    googler = pygoogle(searchTerm) # initialize pygoogle object with search term
    googler.pages = 3 # set max pages

    print '*********************************'
    print 'Google Results'
    print '*********************************'
    print ''

    # display google results in a formatted way
    for keys, values in googler.search().items():
        theKey=keys.replace("&#39;","'")
        theKey=theKey.replace("&amp;","&")
        theValue=values.replace("&#39;","'")
        theValue=theValue.replace("&amp;","&")
        print 'Title: ' + (theKey.encode('ascii', 'ignore'))
        print 'URL: ' + (theValue.encode('ascii', 'ignore'))
        print ''
    print ''
    print '*********************************'
    print ''
Example #11
def google_search(search_string):
    g = pygoogle(search_string)
    g.pages = 1
    results = g.get_urls()
    try:
        return results[0]
    except IndexError:
        return "That was not the word you're looking for"
Example #12
def google_first_result(googlestring):
	pygoog = pygoogle(googlestring)
	pygoog.pages = 1
	urls = pygoog.get_urls()
	try:		
		return urls[0]
	except IndexError:
		return "http://www.google.com"
Example #13
def https_search(url):
   
    string_search = "inurl:https site:"+str(url)

    g = pygoogle(string_search)
    g.pages = 5
    g.hl = "br"

    print string_search 

    results_numbers = 0
    count = 0
    temp = 6 # seconds

    while results_numbers == 0:
        results_numbers = g.get_result_count()
        print "Results:", results_numbers
        print
        if results_numbers == 0:
            time.sleep(temp)
            count += temp
            if count > 60: # seconds
                count = -1
                print "Giving up!"
                break

    desired_results = 5
    search_sites = {}

    if count == -1:
        print "No estimate of the search result count"
        return 0

    elif results_numbers < desired_results:
        print "Too few sites!"
        desired_results = results_numbers
    
    while len(search_sites) == 0:
        search_sites = g.search()
        print search_sites
        print
        for key in search_sites.keys():
            #print key, search_sites[key]
            print unicode(key).encode('cp850'), unicode(search_sites[key]).encode('cp850')
        if len(search_sites) == 0 or len(search_sites) < desired_results:
            time.sleep( temp ) 
            count += temp 
            if count > 60: # seconds
                count = -1
                print "Giving up!"
                break

    if count == -1:
        print "Google may be blocking the requests"
        return 0

    print "Done"
Example #14
def googleSearch ( searchString ):
    g = pygoogle(searchString)
    g.pages = 2
    urls = g.get_urls()
    urls = urls[:10]
    for i in range(len(urls)):
        urls[i]=unicode(urlnorm.norm(urls[i]))

    return urls
Example #15
def getSearchAndDownloadPaper(textToSearch, fileNameToSave):
	g = pygoogle(textToSearch + ' filetype:pdf')
	g.pages = 1
	try:
		pdfUrl = g.get_urls()[0]
		urllib.urlretrieve(pdfUrl, "../pdfdownloaded/" + fileNameToSave)
		time.sleep(random.randint(30,60))
	except IndexError:
		print fileNameToSave + " " + textToSearch
		time.sleep(180);
Example #16
def crackedonpygoogle(passhash, plaintext):  #trying this approach
    from pygoogle import pygoogle
    googleresult = pygoogle(passhash)  #default is for moderate safe search.  Probably OK to let this be, since we won't find p**n while googling a password hash.  Probably throwing caution (and Rule 34) to the wind here.
    googleresult.pages = 1
    resulturls = googleresult.get_urls()
    for i in range(0,len(resulturls)):
        resulturls[i] = str(resulturls[i])
    if crackedonweb(passhash, plaintext, resulturls):
        return True
    else:
        return False
Example #17
	def searchtext(self, user_input):
		searchterms = user_input + ' site:stackoverflow.com'
		print "Searching:", searchterms
		g = pygoogle(searchterms)
		g.pages = 1
		urls = g.get_urls()
		#go through search results
		for url in urls[:int(len(urls)/4+0.5)]:
			req = urllib2.Request(url, headers = hdr)
			try:
				self.myParser.answers = 0
				page = urllib2.urlopen(req)
				html = page.read()
				#print html
				html_fixed = html.replace('&gt;', '3cmr93iwm0c9ri3w0')
				html_fixed = html_fixed.replace('&lt;','98jdsf98j3oisdf')
				html_fixed = html_fixed.replace('&amp;','dksljf9w8ejfosidjf')

				#html_fixed = html_fixed.replace('...',' ')
				self.myParser.feed(html_fixed)
				self.snips = self.myParser.snips
				#print self.snips
				for x in self.snips:
					for y in x[0]:
						print url
						answer = sublime.ok_cancel_dialog(y.replace('98jdsf98j3oisdf','<').replace('3cmr93iwm0c9ri3w0','>').replace('dksljf9w8ejfosidjf','&'))
						if answer == 1:
							self.view.insert(self.editor,
								self.view.sel()[0].begin(),y.replace('98jdsf98j3oisdf','<').replace('3cmr93iwm0c9ri3w0','>').replace('dksljf9w8ejfosidjf','&'))
							if self.language in starter:
								self.view.insert(self.editor,
									self.view.sel()[0].begin(),"\n\n"+starter[self.language]+'\n'+x[1].replace('98jdsf98j3oisdf','<').replace('3cmr93iwm0c9ri3w0','>').replace('\t',' ').replace('\n','').replace(starter[self.language],' ').replace(ender[self.language],' ').replace('dksljf9w8ejfosidjf','&')+'\n'+\
									ender[self.language]+"\n\n")
							else:
								self.view.insert(self.editor,
									self.view.sel()[0].begin(),"/*"+x[1].replace('98jdsf98j3oisdf','<').replace('3cmr93iwm0c9ri3w0','>').replace('\t',' ').replace('\n','').replace('dksljf9w8ejfosidjf','&')+\
									'*/'+"\n\n")
							self.myParser.snips = []
							self.myParser.curr_snips = []
							self.myParser.curr_snip = ''
							self.myParser.curr_comment = ''
							self.snips = []
							break
					else: 
						continue
					break
				else:
						self.myParser.snips = []
						continue
				break
			except urllib2.HTTPError,e:
				print e.fp.read()
Example #18
 def google(self, args, irc):
   '''(google [search term]) -- 
   Return the top Google result for the term searched.
   '''
   try:
       g = pygoogle(u' '.join(args))
       g.pages = 1
       for title, descr in g.search().iteritems():
           reply = u'{} | {}'.format(descr.strip(), title.strip())
           return reply
   except:
       log.err('[Error]: Google {}'.format(sys.exc_info()[0]))
       return '[Error]: Cannot contact Google API.'
Example #19
def fresh_google_check(link):
    '''
    Checks whether the resource was already indexed by Google
    earlier than two weeks before today.
    '''
    sleep(random.random())
    today = datetime.date.today()
    date_s = date_to_julian_day( today - datetime.timedelta(days=365 * 8) )
    date_e = date_to_julian_day( today - datetime.timedelta(days=7 * 2) )
    query = u'site:%s daterange:%s-%s' % (link, date_s, date_e,)
    g = pygoogle(query.encode('utf-8'))
    g.pages = 1
    return bool(g.get_result_count())
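All three fresh_google_check variants above call a date_to_julian_day / _date_to_julian_day helper that none of the snippets define; Google's daterange: operator expects Julian Day Numbers, so a minimal sketch of such a helper (an assumption about the missing code, not the original implementation) could be:

import datetime

def date_to_julian_day(d):
    # Convert a datetime.date to a Julian Day Number, the unit used by
    # Google's daterange: operator. date.toordinal() counts days from
    # 0001-01-01 (ordinal 1), which corresponds to Julian Day Number 1721426,
    # hence the fixed offset below.
    return d.toordinal() + 1721425

# Example: date_to_julian_day(datetime.date(2000, 1, 1)) == 2451545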
Example #20
def findUrl(key, numPages, save = False):
    g = pygoogle(key)
    g.pages = numPages
    links = g.get_urls()
    if save:
        try:
            f = open("links.txt","w")
            for link in links:
                f.write(link+"\n")
            f.close()
        except IOError:
            print "cannot open new file"
    else:
        return links
Example #21
def google_query(query):

    g = pygoogle(query)
    g.pages = 1
    g.rsz = 4
    results = {}
    results = g.search()
    rl = results.keys()

    print rl

    s = rl[0]
    s = s.encode('utf-8')
    return s
Example #22
def google_query(query):
	
	g = pygoogle(query) 
	g.pages=1
	g.rsz = 4
	results = {}
	results = g.search()
	rl = results.keys()

	print rl
	

	s = rl[0]
	s = s.encode('utf-8')
	return s
Example #23
 def search(self, group, filename, destination):
     movie_name = getTitle(group['library'])
     movienorm = unicodedata.normalize('NFKD', movie_name).encode('ascii','ignore')
     movie_year = group['library']['year']
     searchstring=movienorm+' '+ str(movie_year) +' bande annonce vf HD'
     time.sleep(3)
     log.info('Searching google for: %s', searchstring)
     g = pygoogle(str(searchstring))
     diclist = g.search()
     urllist = g.get_urls()
     cleanlist=[]
     for x in urllist:
         if 'youtube' in x or 'dailymotion' in x:
             cleanlist.append(x)
     if cleanlist:
         bocount=0
         for bo in cleanlist:
             if bocount==0:
                 tempdest=unicodedata.normalize('NFKD', os.path.join(rootDir,filename)).encode('ascii','ignore')+u'.%(ext)s'
                 dest=destination+u'.%(ext)s'
                 log.info('Trying to download : %s to %s ', bo, tempdest)
                 p=subprocess.Popen([sys.executable, 'youtube_dl/__main__.py', '-o',tempdest,'--newline', bo],cwd=rootDir, shell=False, stdout=subprocess.PIPE,stderr=subprocess.PIPE)
                 while p.poll() is None:
                     l = p.stdout.readline() # This blocks until it receives a newline.
                     lmsg= l.replace('%',' percent')+' '+filename
                     log.info(lmsg)
                 # When the subprocess terminates there might be unconsumed output 
                 # that still needs to be processed.
                 (out, err) = p.communicate()
                 outmsg='Out for '+filename +' : '+out
                 errmsg='Err for '+filename +' : '+err
                 if out:
                     log.info(outmsg)
                 if err:
                     log.info(errmsg)
                     continue
                 else:
                     listetemp=glob.glob(os.path.join(rootDir,'*'))
                     for listfile in listetemp:
                         if unicodedata.normalize('NFKD', filename).encode('ascii','ignore') in listfile:
                             ext=listfile[-4:]
                             finaldest=destination+ext
                             shutil.move(listfile, finaldest)
                             bocount=1
                             log.info('Downloaded trailer for : %s', movienorm)
                             return True
     else:
         return False
Example #24
def giveSong(user, sortedtweets, mood, p): #returns a (user, mood, newsongslist) tuple
	#get latest p tweets
	tweettexts = [x.text for x in sortedtweets[-p:]]
	#use a filtered list of sentiment terms from text + mood
	for text in tweettexts:
		words = [x.strip() for x in text.split()]
		searchterm = mood
		for word in words:
			if word in sentimentlexicon:
				searchterm += ' ' + word
		print 'searching for: ' + searchterm
		if len(searchterm) > 0:
			search = pygoogle.pygoogle(searchterm)
			urls = [x for x in search.get_urls() if isYoutube(x)]
			if len(urls) > 0:
				return urls[0]
Example #25
def autoupdate(request):
	search=request.POST['search']
	db = MySQLdb.connect(host="127.0.0.1",user="******", passwd="root", db="nutch")
	cur = db.cursor()
	cur.execute("insert into web select * from webpage")
	cur.execute("truncate table webpage")
	os.chdir("/root/nutch/runtime/local")
	g = pygoogle(search)
	g.pages = 1
	x=g.get_urls()
	dataFile = open('urls/seed.txt','w')
	for eachitem in x:
		dataFile.write(str(eachitem)+'\n')
	dataFile.close()
	os.system("bin/nutch crawl urls -depth 3 -topN 5")
	return HttpResponseRedirect("/")
Example #26
  def fetch_google_results(self):
    print "Searching Google"
    search = pygoogle(self.query)
    results = search.get_urls()[:10] #Only get the first 10 results
    for result in results:
      print "Google Result: " + str(result)

      if ( self.skip_specific_websites(result) == True ):
        continue

      time = datetime.now().time()
      score,code = self.calculate_BM25_score(result)
      if (not (score == None)) and (code == 200) and (self.is_illegal_folder(result) == False) and (self.is_illegal_extension(result) == False) :
        self.urls.put((score,(str(result),1))) #All google results are at depth 1 with google.com being at depth 0
      self.write_to_file(result,score,int(1),code,time)
      self.pages_crawled += 1
Example #27
def getpost(name):
	searcht = name
	results = pygoogle(searcht + ' wikipedia')
	results.pages = 1
	links = results.get_urls()
	url = links[0]

	wikititle = arttitle(url)
	
	#Check if last char is a ) and fix link if needed
	if url[-1] == ')':
		url = url[:-1]
		url+='\)'
	
	print ('#Here is a Wikipedia link to [' + wikititle + '](' + url + ').\n\n^This ^message ^was ^created ^by ^a ^[bot](http://www.reddit.com/r/wikime/comments/1vweq5/what_is_this_bot/).')

	return ('#Here is a Wikipedia link to [' + wikititle + '](' + url + ').\n\n^This ^message ^was ^created ^by ^a ^[bot](http://www.reddit.com/r/wikime/comments/1vweq5/what_is_this_bot/).')
Example #28
def main(query):
	g = pygoogle(query)
	g.pages = 1		# Get one page of results
	linkFile = open(os.path.join('data', 'linkFile'),'w')	# Store all search URLs
	count=0
	cleaner = re.compile('\[.*?\]')
	for url in g.get_urls():
		linkFile.write(url+'\n')
		target = open(os.path.join('data', query+'_'+str(count)+'.txt' ),'w')	# Create corresponding 'query_' + count filename
		#target.write(get_text(url).encode('ascii','ignore'))
		text = (get_text(url).encode('ascii','ignore'))
		for line in text.split('\n'):
			if len(line) > 600:
				line = re.sub(cleaner,'',line)
				if line[len(line)-1] == '.':
					target.write(line+'\n\n')
				else:
					target.write(line+'.\n\n')
		count = count + 1
Example #29
 def _google_search(self, songs):
     for song in songs:
         print '''fetching ''' + song
         googsearch = pygoogle(song + ' site:youtube.com/watch')
         googsearch.pages = int(self._Entry2.get())
         namelist = []
         index = 0
         for url in googsearch.get_urls():
             if self._CheckVar1.get() > 0 and index > 0:
                 namelist.append(url)
             else:
                 try:
                     video = pafy.new(url)
                     namelist.append(video.title + ''' : ''' + video.author)
                     index = index+1
                 except IOError:
                     namelist.append("Video Not Available")
         self.songlist.append((namelist,googsearch.get_urls()))
         self.selectedurllist.append(0)
Example #30
    def fetch_google_results(self):
        print "Searching Google"
        search = pygoogle(self.query)
        results = search.get_urls()[:10]  #Only get the first 10 results
        for result in results:
            print "Google Result: " + str(result)

            if (self.skip_specific_websites(result) == True):
                continue

            time = datetime.now().time()
            score, code = self.calculate_BM25_score(result)
            if (not (score == None)) and (code == 200) and (
                    self.is_illegal_folder(result)
                    == False) and (self.is_illegal_extension(result) == False):
                self.urls.put(
                    (score, (str(result), 1))
                )  #All google results are at depth 1 with google.com being at depth 0
            self.write_to_file(result, score, int(1), code, time)
            self.pages_crawled += 1
Example #31
 def search(self, search):
     g = pygoogle(search)
     g.pages = 5
     searchNot = self.mapKeeper.searchNot(search.replace(" ", "_"))
     results = g.get_urls()
     print "number of results: ", len(results)
     for url in results:
         base_url = url
         req = urllib2.Request(url, headers=self.hdr)
         try:
             response = urllib2.urlopen(req)
             print "Processing: ", url
         except (UnicodeEncodeError, urllib2.HTTPError, urllib2.URLError, socket.error, httplib.BadStatusLine), e:
             print "Error when opening url -> " + url + ": ", e
             continue
         page = BeautifulSoup(response, "lxml")
         images = page.select("img[alt]")
         for image in images:
             if search in image.get("alt").lower():
                 imageURL = image.get("src")
                 imageURL = urlparse.urljoin(base_url, imageURL)
                 if imageURL in searchNot:
                     print "Image is in searchNot: ", imageURL
                     continue
                 try:
                     imgdata = urllib2.urlopen(imageURL)
                 except urllib2.HTTPError, e:
                     print "Error: " + imageURL + ":", e.code
                     self.mapKeeper.addNot(search.replace(" ", "_") + " " + imageURL)
                     continue
                 except urllib2.URLError, e:
                     print "Error: " + imageURL + ":", e.args
                     self.mapKeeper.addNot(search.replace(" ", "_") + " " + imageURL)
                     continue
                 image_type, width, height = getimageinfo.getImageInfo(imgdata)
                 if image_type == " " or (width < 200 and height < 200):
                     print "Image Invalid: ", imageURL
                     self.mapKeeper.addNot(search.replace(" ", "_") + " " + imageURL)
                     continue
                 print "image type:", image_type, "width:", width, "height:", height
                 return imageURL
Example #32
def main():
    urls = []
    if len(sys.argv) > 1 and sys.argv[1] == '-csv':
        with open(sys.argv[2], 'rb') as csvfile: 
            csvreader = csv.reader(csvfile)
            csvarray = []
            for row in csvreader:
                csvarray.append(row)
            for row in csvarray:
                for song in row:
                    googsearch = pygoogle(song + ' site:youtube.com/watch')
                    googsearch.pages = 1
                    ytlink = googsearch.get_urls()[0]
                    urls.append(ytlink)
            if len(sys.argv) > 3:
                scrape(urls, sys.argv[3])
            else:
                scrape(urls)
    elif len(sys.argv) > 1 and (sys.argv[1] == '-h' or sys.argv[1] == '-help'):
        print '''Youtube Song Downloader
        Usage examples:
        
        OPEN GUI
        python pysonggui.py
        
        DOWNLOAD FROM CSV FILE
        python pysonggui.py -csv <csv filename> [download directory]
        
        DISPLAY HELP
        python pysonggui.py -h'''
    else:
        Root = Tk()
        Pmw.initialise(Root)
        import Tkinter
        del Tkinter
        App = pysonggui(Root)
        App.pack(expand='yes',fill='both')
        Root.geometry('1000x480+10+10')
        Root.title('Python Youtube Downloader')
        Root.mainloop()
Example #33
def getNPages(searchterms, N, verbose):
    myParser = MyHTMLParser()
    myParser.verbose = verbose
    len_modifer = 0
    searchterms += " site:stackoverflow.com"
    print "Searching:", searchterms
    g = pygoogle(searchterms)
    modifer = 0
    if N < 1:
        g.pages = 1
    else:
        g.pages = N
    urls = g.get_urls()
    # can do less than a page too!
    if N < 1:
        urls = urls[: max([int(len(urls) * N), 1])]
        # go through search results
    for url in urls:
        req = urllib2.Request(url, headers=hdr)
        try:
            myParser.answers = 0
            page = urllib2.urlopen(req)
            html = page.read()
            # IDs for unusual characters
            myParser.feed(IDsIn(html))
            snips = myParser.snips
            # print snips
            for x in snips:
                comment = IDsOut(x[1])
                for y in x[0]:
                    yield [termFix(IDsOut(y)), comment, url]

            myParser.code_flag = 0
            myParser.curr_snip = ""
            myParser.curr_snips = []
            myParser.curr_comment = ""
            myParser.snips = []
            myParser.answers = 0
        except urllib2.HTTPError, e:
            print e.fp.read()
Example #34
0
def main(search_root, pages=1, word=None):
    """
    - search_root : the first request on Google
    - word : word to search in sentence
    - pages[1-5] : number of google pages results
    """
    list_sentences = []
    list_soup = []
    list_url = []

    # Initialising research
    research = pygoogle(search_root)
    
    # Define page number
    research.pages = int(pages)
    
    # Getting URLs
    url_list = research.get_urls()
    
    # Parsing
    for url in url_list:
        list_soup.append(get_soup(url))
Example #35
from pygoogle import pygoogle
g = pygoogle('cisco')
g.pages = 1
x = g.search()  # the original called g.results(), which does not appear to be a pygoogle method
print x
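Example #35 above is the minimal pygoogle call pattern the other snippets build on; a slightly fuller sketch, restricted to the attributes and methods that appear elsewhere in these examples (pages, hl, get_result_count, search, get_urls), would be:

from pygoogle import pygoogle

g = pygoogle('cisco')                   # query string
g.pages = 1                             # number of result pages to fetch
g.hl = 'en'                             # interface language
print g.get_result_count()              # estimated number of hits
for title, url in g.search().items():   # search() returns a {title: url} dict
    print title, url
print g.get_urls()                      # flat list of result URLs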

Example #36
def main(input=input, *args):
    response = None
    choose = False
    choice = ""
    YorN = None
    words = ['']
    link = 0
    more = 0
    more2 = 0
    more3 = 0
    doChunk = True
    responseChunks = []
    url = 'https://en.wikipedia.org/wiki/Main_Page'
    global droid, prompt, tts
    exec('with open(storageFile) as file: list1 = file.readlines()'
         )  #in locals(),globals()
    #### MAIN LOOP:
    quit = False
    verbose = True  #False#
    #while response is not "":
    while quit is not True:
        try:
            if verbose: print 0
            ################### input and convert to list of words
            print 'input1=', repr(
                input), "response1=", response  #, "choice=",choice

            while input == "" or not input or input is None:
                # input = droid.recognizeSpeech().result
                # if not response: print 'noresponse'; input = droid.recognizeSpeech().result#exec(channel)
                # if choose: print 'choose'; prompt = choice; choice = droid.recognizeSpeech().result; input="choose"#exec(channel)
                # if not choose and response: input = droid.recognizeSpeech().result # prompt = response+'>'; exec(channel)
                if response is None:
                    prompt = '>'
                    exec(channel)
                #if not choose: prompt = '>'; exec(channel)
                if choose:
                    print 'choose'
                    prompt = choice
                    exec(channel)
                    choice = input.strip('\r')
                    input = ''
                    print choice
                    break
                #print 1
                if input is None:
                    time.sleep(7)
                    print 'input is None'
                    input = ""
                    exec(channel)
                #else: print "input2=",input;

            if verbose: print 1
            input = input.strip('\r')

            #if input == 'set': continue
            # if input == 'loop': response = mainLoop()

            # run=True; tts=False
            # global response; reponse = True
            # code='';i=0
            # input = raw_input('yes?\n').strip('\r'); print repr(input)
            #input = input.strip('\r'); print repr(input)
            for index, item in enumerate(list1):
                try:
                    exec(list1[index])
                    #print i;i=i+1
                except Exception, e:
                    pass  #print 'err', str(e)

            try:
                words = input.split(' ')
            except:
                pass
            if verbose: print 2

            #### set context(s)
            '''if context: 
				phrase2 = raw_input(str(context)+ ' is ')
				context['action'] = phrase2; context = None
				print dctn[df[0]]['action']
				#confirm = raw_input('confirm?')
				#if confirm == 'y':  context = confirm; context = None; input ="okay"'''

            ################# direct commands
            # if input == 'quit': response = ""
            if input == 'quit' or input == 'q' or input == 'end' or input == 'exit':
                break
            if input == 'load':
                exec(
                    'with open(storageFile) as file: list1 = file.readlines()')
            if input == 'dump':
                exec(
                    'with open(storageFile, "wb") as file: file.writelines(list1)'
                )
            if input == 'save':
                PBcreateBranch()
                break
            if input == 'dctn':
                response = str(dctn)
                print response, dctn
                continue
            if input == 'done': choose = False

            if verbose: print 3
            # print 3################### keyword based commands

            ######## parsing phrase
            # if ' is ' in input and not 'what is ' in input and not words[0] == 'is':
            # df = input.split(' is ') #definition
            # try: dctn[df[0]] = df[1]
            # except: print 'error, not entered' #dctn[df[0]]=[df[1]]
            # if df[1] == 'action':
            # dctn[df[0]]={'action':''}
            # response = 'how '+ df[0] +"?"
            # context = dctn[df[0]]
            # response = 'okay'

            # if ' is not ' in input:
            # split= input.split(' is not ') #remove definition
            # try: dctn[split[0]].remove(split[1])
            # except: pass

            ###### question
            if '?' in input:
                input = input.strip('?')
                if 'what is' in input:
                    q = input.split('what is ')
                    # print dctn[q[1]]
                    if q[1] in dctn: response = dctn[q[1]]
                    else:
                        try:
                            input = "search " + q[1]
                        except:
                            response = q[1] + ' is not known'

            ###### google
            if 'search' in input:
                try:
                    query = input.replace('search ', '')
                    print "searching.. " + query
                    from pygoogle import pygoogle
                    g = pygoogle(query)
                    g.pages = 1
                    results = g.__search__()
                    #choose=True;
                    response = results[link]['content']
                    #response = repr(response)
                    response = response.encode('ascii',
                                               'ignore').replace('\n', '')
                    url = list(results[link]['url'])[0]
                    print url
                    # response.encode('ascii', 'ignore');
                    doChunk = False
                except Exception, e:
                    print str(e)
                # print str(results)
                print response

            if verbose: print 5
            # print 5######## browse
            if choose:
                print 'chooseTrue'
                if choice == 'next':
                    link = link + 1
                    print 'link=', link
                    response = results[link]['content']
                    #response = repr(response)
                    response.encode('ascii')
                print choice
                if choice == 'go':
                    try:
                        response = " ".join(go(url))
                    except Exception, e:
                        print str(e)
                        input = raw_input('pause')
Example #37
def search(nome):
    dis = pygoogle(nome)
    dis.pages = 1
    print('dis is', dis.cont())
    result = dis.cont()
    return result
Example #38
def google_search(keyword):
    p = pygoogle(keyword)
    p.pages = 1
    result = p.search().items()[0]

    return result[0] + " - " + result[1]
Example #39
 def pygoogle_test(self):
     g = pygoogle()
Example #40
#choose 3 keywords

from random import randint
import os
import subprocess
from pygoogle import pygoogle

num_lines = sum(1 for line in open('strippedkws'))
print("%s lines in keyword file" % num_lines)
a = randint(0, num_lines - 1)
b = randint(0, num_lines - 1)
c = randint(0, num_lines - 1)

f = open('strippedkws')
lines = f.readlines()
a = (lines[a])
b = (lines[b])
c = (lines[c])

print("Searching for...")
for i in a, b, c:
    print(i)

g = pygoogle(i)
g.pages = 5
print '*Found %s results*' % (g.get_result_count())
g.get_urls()
g.get_urls()
g.display_results()
Example #41
from pygoogle import pygoogle
import wikipedia


searchTarget = raw_input()
g = pygoogle(searchTarget)
g.pages = 1
#g.display_results()
urls = []
urls = g.get_urls()
urlwiki = ""

for url in urls:
    if url.find("wikipedia") == -1:
        continue
    else:
        urlwiki = url
        break

if urlwiki != "":
    urlparse = urlwiki.rstrip().split("/")
    wikipage = wikipedia.page(urlparse[-1])
    print(wikipedia.summary(wikipage.title, sentences=1))
Example #42
def khoj(request):

    #form = PostForm(request.POST)
    #s_term = request.POST.get('s_term',request.GET.get('s_term',None))
    s_term = PostForm(request.POST)
    #s_term = forms.CharField(error_messages = my_default_errors)
    top_10 = {}
    dummy = OrderedDict()
    dummy2 = OrderedDict()
    if request.POST:
        s_term = PostForm(request.POST)
        dummy = OrderedDict()
        #dummy2.clear()
        if s_term.is_valid():
            c_dummy2 = ""
            print s_term.cleaned_data
            #s_term.save()
            if s_term.cleaned_data.values()[0] in infereddic2.keys():
                c_dummy = infereddic2[s_term.cleaned_data.values()[0]]
                print type(c_dummy)
                c_dummy2 = " ".join(c_dummy)
            else:
                infereddic2.setdefault(s_term.cleaned_data.values()[0], [])

            #if s_term.cleaned_data.values()[0] in infereddic.keys():
            #   dummy = infereddic[s_term.cleaned_data.values()[0]]
            #else :
            #   infereddic.setdefault(s_term.cleaned_data.values()[0], {})
            #print infereddic
            #print type(c_dummy)
            print ";;;;;;;;;;;;;;;;;;;;"
            print s_term.cleaned_data.values()[0] + " " + c_dummy2
            result = pygoogle(s_term.cleaned_data.values()[0] + " " + c_dummy2)
            result.pages = 2
            top_10 = {}
            n = 0
            print "dummmmmmmyy"
            print dummy
            for k, v in result.search().iteritems():
                if n < 10:
                    if k in dummy.keys():
                        n += 1
                    else:
                        top_10[k] = v
                        n += 1
                else:
                    break
            print "top_100->"
            print top_10
            #if dummy:
            #
            #  dummy2 = OrderedDict(dummy.items()+top_10.items())
            # print "dummyyy"
            #print dummy
            #print "2"
            #print dummy2
            #else:
            #   dummy2 = top_10

            if 'select-id' in request.POST:
                selected_ids = request.POST.getlist('select-id', [])
                text1 = []
                for i in selected_ids:
                    r_open = urllib.urlopen(i[:-1]).read()
                    soup = BeautifulSoup(r_open)
                    text1.append(soup.title.string)

                stopwords = nltk.corpus.stopwords.words('english')
                print stopwords[:10]
                from nltk.stem.snowball import SnowballStemmer
                stemmer = SnowballStemmer('english')

                def tokenize_and_stem(text):
                    # first tokenize by sentence, then by word to ensure that punctuation is caught as its own token
                    tokens = [
                        word for sent in nltk.sent_tokenize(text)
                        for word in nltk.word_tokenize(sent)
                    ]
                    #print tokens
                    filtered_tokens = []
                    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
                    for token in tokens:
                        if re.search('[a-zA-Z]', token):
                            filtered_tokens.append(token)
                    stems = [stemmer.stem(t) for t in filtered_tokens]
                    #print stems[:20]
                    return stems

                def tokenize_only(text):
                    # first tokenize by sentence, then by word to ensure that punctuation is caught as its own token
                    tokens = [
                        word.lower() for sent in nltk.sent_tokenize(text)
                        for word in nltk.word_tokenize(sent)
                    ]
                    filtered_tokens = []
                    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
                    for token in tokens:
                        if re.search('[a-zA-Z]', token):
                            filtered_tokens.append(token)
                    #print filtered_tokens[:20]
                    return filtered_tokens

                totalvocab_stemmed = []
                totalvocab_tokenized = []

                for i in text1:
                    allwords_stemmed = tokenize_and_stem(
                        i)  #for each item in 'synopses', tokenize/stem
                    totalvocab_stemmed.extend(
                        allwords_stemmed
                    )  #extend the 'totalvocab_stemmed' list

                    allwords_tokenized = tokenize_only(i)
                    totalvocab_tokenized.extend(allwords_tokenized)

                vocab_frame = pd.DataFrame({'words': totalvocab_tokenized},
                                           index=totalvocab_stemmed)
                print 'there are ' + str(
                    vocab_frame.shape[0]) + ' items in vocab_frame'

                from sklearn.feature_extraction.text import TfidfVectorizer

                #define vectorizer parameters
                tfidf_vectorizer = TfidfVectorizer(max_df=0.8,
                                                   max_features=200000,
                                                   min_df=0.2,
                                                   stop_words='english',
                                                   use_idf=True,
                                                   tokenizer=tokenize_and_stem,
                                                   ngram_range=(1, 3))

                tfidf_matrix = tfidf_vectorizer.fit_transform(
                    text1)  #fit the vectorizer to synopses

                print(tfidf_matrix.shape)

                terms = tfidf_vectorizer.get_feature_names()

                from sklearn.metrics.pairwise import cosine_similarity
                dist = 1 - cosine_similarity(tfidf_matrix)

                from sklearn.cluster import KMeans

                num_clusters = 1

                km = KMeans(n_clusters=num_clusters)
                km.fit(tfidf_matrix)

                clusters = km.labels_.tolist()
                print clusters

                #from __future__ import print_function
                order_centroids = km.cluster_centers_.argsort()[:, ::-1]
                print "-------start--------------------------------------------"
                for i in range(num_clusters):
                    print "Cluster %d words:" % i

                    for ind in order_centroids[
                            i, :3]:  #replace 6 with n words per cluster
                        print "---------------------this----------------------------------"
                        print ' %s' % vocab_frame.ix[terms[ind].split(
                            ' ')].values.tolist()[0][0].encode(
                                'utf-8', 'ignore')
                        infereddic2[s_term.cleaned_data.values()[0]].append(
                            vocab_frame.ix[terms[ind].split(
                                ' ')].values.tolist()[0][0].encode(
                                    'utf-8', 'ignore'))

                #infereddic2[s_term.cleaned_data.values()[0]].append(vocab_frame.ix[terms[ind].split(' ')].values.tolist()[0][0].encode('utf-8', 'ignore')[0])])
                #----------------------------------------------------------------------------
                #for i in selected_ids:
                #print "This is id"+i

            #     for k,v in dummy2.iteritems():
            #print k+"-------"+v
            #          if v==i[:-1]:
            #print v
            #               infereddic[s_term.cleaned_data.values()[0]][k]=v
            #print k
            print "-------"
            #print infereddic
            print "=========="
            #print selected_ids
            #print s_term.cleaned_data

    return render_to_response('khoj.html', {
        'form': s_term,
        'result': top_10
    },
                              context_instance=RequestContext(request))
    '''if request.POST:
Example #43
    'metronews', 'nowtoronto', 'torontoist', 'blogto', 'cbc', '680news',
    'citynews'
]
subreddit = r.get_subreddit('toronto')
for submission in subreddit.get_new(limit=5):
    # print submission.title
    print('start submissions')
    # If we haven't replied to this post before
    if (submission.id not in posts_replied_to) and (
            'reddit' not in submission.url) and ('imgur'
                                                 not in submission.url):
        print('gathered submission')
        # Reply to the post
        final_message = 'Hi there! This is the BetterNewsForToronto bot!\n\nI\'m here to provide some information related to this post. Below are a few relevant links from other news sources. (Links are not guaranteed to be news articles...sorry! Bot results depend on the post\'s title.)'
        g = pygoogle(
            'Photo Album: Anti Uber protest at Nathan Phillips Square shut down by police'
        )
        g.pages = 3
        gDict = g.search()
        gTitles = gDict.keys()
        linkCount = 0
        index = 0

        print('Title1: ' + gTitles[0])

        while (linkCount < 5):
            if (index >= len(gTitles)):
                break
            compURL = gDict[gTitles[index]]
            if (submission.url
                    not in compURL) and ('reddit' not in compURL) and any(
Example #44
def main(input=input, *args):
    response = 'hi'
    choose = False
    choice = "go"
    YorN = None
    words = ['']
    chunk = 0
    link = 0
    global droid, prompt

    #### MAIN LOOP:
    while response != "":

        ################### input and convert to list of words
        print 'input1=' + input, "response1=" + response  #, "choice="+choice

        while input == "" or input == 'nospeech' or input is None:
            input = droid.recognizeSpeech().result
            if not response:
                print 'noresponse'
                input = droid.recognizeSpeech().result  #exec(channel)
            if choose:
                print 'choose'
                prompt = choice
                choice = droid.recognizeSpeech().result
                input = "choose"  #exec(channel)
            if not choose and response:
                input = droid.recognizeSpeech(
                ).result  # prompt = response+'>'; exec(channel)

            if input is None:
                time.sleep(7)
                input = ""
                #print 2 #exec(channel)
            else:
                print "input2=", input

        #exec('print 2')
        # if input is None:
        # prompt = response+'>'
        # input = raw_input('>')
        try:
            words = input.split(' ')
        except:
            pass

        #### set context(s)
        '''if context: 
			phrase2 = raw_input(str(context)+ ' is ')
			context['action'] = phrase2; context = None
			print dctn[df[0]]['action']
			#confirm = raw_input('confirm?')
			#if confirm == 'y':  context = confirm; context = None; input ="okay"'''

        ################### direct commands
        if input == 'quit': response = ""
        if input == 'save':
            PBcreateBranch()
            break
        if input == 'dctn':
            response = str(dctn)
            print response, dctn
            continue
        if input == "hi": response = 'hello'
        if prompt == 'anything else? (yes/no)>':
            if YorN == 'yes': pass
            if YorN == 'no': break

        ################### keyword based commands

        ########## definitions
        if ' is ' in input and not 'what is ' in input and not words[0] == 'is':
            df = input.split(' is ')  #definition
            try:
                dctn[df[0]] = df[1]
            except:
                print 'error, not entered'  #dctn[df[0]]=[df[1]]
            if df[1] == 'action':
                dctn[df[0]] = {'action': ''}
                response = 'how ' + df[0] + "?"
                context = dctn[df[0]]
            response = 'okay'
            #continue

        if ' is not ' in input:
            split = input.split(' is not ')  #remove definition
            try:
                dctn[split[0]].remove(split[1])
            except:
                pass

        ######## question
        if '?' in input:
            input = input.strip('?')
            if 'what is' in input:
                q = input.split('what is ')
                #print dctn[q[1]]
                if q[1] in dctn: response = dctn[q[1]]
                else:
                    try:
                        input = "search " + q[1]
                    except:
                        response = q[1] + ' is not known'

        ######## google
        if 'search' in input:
            query = input.replace('search ', '')
            print "searching " + query
            from pygoogle import pygoogle
            g = pygoogle(query)
            g.pages = 1
            results = g.__search__()
            #print str(results)
            choose = True
            response = results[link]['content']
            #response = repr(response)
            response.encode('ascii')
            #response.encode('ascii', 'ignore');

        ##################################################################################################################################
        if choose:
            print 'chooseTrue'
            if choice == 'next':
                link = link + 1
                print 'link=', link
                response = results[link]['content']
                #response = repr(response)
                response.encode('ascii')
            if choice == 'go':
                br = mechanize.Browser()
                br.set_handle_robots(False)
                br.addheaders = [(
                    'User-agent',
                    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1'
                )]
                page = br.open(url)
                response = page.read()
                soup = BeautifulSoup(response, "html.parser")
                #paras=soup.p #findAll('p', text=True)
                VALID_TAGS = ['p', 'span']  #, 'ul', 'li', 'br']'div',
                paras = [
                    i.text.encode('ascii', "ignore")
                    for i in soup.find_all(VALID_TAGS)
                ]  ################## removes <p>s
                paras = filter(None, paras)
                paras = [
                    i.replace('\n', '.').replace('\r', '.') for i in paras
                ]
                paras = [
                    i.replace('(', 'parens').replace(')', 'parens').replace(
                        '[', 'bracket').replace(']', 'bracket') for i in paras
                ]

                input = raw_input('pause')

        ######## actions
        if 'e' in input:
            exec1 = input.split('e ')  #exec
            try:
                exec(exec1[1])
                continue
            except Exception, e:
                print str(e)

        if 'do' in input:  #action
            try:
                exec(dctn[words[1]]['action'] + ' "' +
                     str(''.join(words[2:99])) + '"')
                continue
            except Exception, e:
                print str(e)
Example #45
from __future__ import print_function
from pygoogle import pygoogle
g = pygoogle('barsha biswas')
g.pages = 1
g.display_results()
Example #46
def search(searchfor):
    g = pygoogle(searchfor)
    return g.get_result_count()
Example #47
from pygoogle import pygoogle
g = pygoogle('quake 3 arena')
g.pages = 5
print '*Found %s results*' % (g.get_result_count())
g.get_urls()
Example #48
else:
    usermagento = args.user
    passmagento = args.pwd

if args.dork == None:
    nada = ''
    print " Usage: python letmefuckit.py --dork <dork> [options]"

else:
    saveresults = open("urls.txt", "w")
    print " Searching for: ", args.dork
    print " Total of google pages to process: ", args.pages
    print " Save results is ", save
    print '\n Initializing...'

    g = pygoogle(args.dork)
    g.pages = 5
    print ' [* Found %s results in search engine *]\n' % (g.get_result_count())
    urles = g.get_urls()
    for n, elem in enumerate(urles):
        url = '{1}\n'.format(n, elem)
        saveresults.write(url)
    saveresults.close()
    print "\n"
    print "--------------------------"
    print " Right! Analysing data...."
    print "--------------------------"
    print "\n"
    print "Possible targets found...\n"
    text_file = open(r"C:\exploit\urls.txt", "r")
    for line in text_file:
Example #49
import re, urllib, random, webbrowser, urllib2, sys
import json
import mechanize
from pygoogle import pygoogle

#phishListParsed = []
#json_list = open('seed.json', 'rb')
#phishList = json.load(json_list)
#for row in phishList:
#	phishListParsed.append(row['url'])

phishBank = []
done = 0
try:
    g = pygoogle('sign up email list')
    br = mechanize.Browser()
    br.set_handle_robots(False)  # ignore robots
    br.set_handle_refresh(False)
    # `crawl` is undefined in the original snippet; looping over the Google
    # result URLs is an assumption about the intended behaviour.
    for crawl in g.get_urls():
        response = br.open(crawl)
        print crawl
        br.form = list(br.forms())[0]
        for control in br.form.controls:
            if control.type == "text":  # means it's class ClientForm.TextControl
                control.value = "*****@*****.**"
            response = br.submit()
            print response
            print("SUCCCCCCESSSSSSSSS")

except Exception as e:
    print("Broken link to %s" % crawl)
    print(type(e))
Example #50
    "--output-only",
    dest="outFile",
    default=False,
    help="Only save results to the given file, No further action.")
parser.add_option(
    "-d",
    "--debug",
    dest="debugRun",
    default=False,
    help="Debug the tool without poisoning or exploiting the host")

(options, args) = parser.parse_args()
fk = 7

if options.debugRun not in ["t", "True", "true", "TRUE"]:
    g = pygoogle(options.theDork)
    g.pages = options.numPages  #Set to the number of result url pages to return from google
    fk = g.get_result_count()
    print '*Found %s results*' % (fk)
    # Handle case where we want to use the host for
    urls = g.get_urls()
else:
    print '*Debug test %s*' % (fk)
    urls = ["http://localhost:8087"]

# Handle case where we only want the results to a file.
if options.outFile:
    out_file = open(options.outFile, "w")
    for url in urls:
        out_file.write("%s\n" % url)
    out_file.close()
    print "File saved to: %s" % options.outFile