Example #1
def dangerous_retrieve_urls(topic, num=10, safety=5.0):
    '''
    Add an exception check: if this method returns a list of
    size zero, Google has IP-blocked us and we can fall back to
    our backup crawler (via Google's official API).
    '''
    top_news, niche_news = [], []
    query = str(topic)
    if "news" != topic[-4:]:  # If the last four chars don't equal "news"
        query = str(topic) + " news"

    try:
        for top in g.search(query, tld='com', lang='en',
                            num=num, start=0, stop=num, pause=safety):
            if is_news_link(top):
                # Note that we are appending tuples of the date
                # and the news link, not just the link
                top_news.append(NewsWebsite(raw_link=top))
        for niche in g.search(query, tld='com', lang='en',
                              num=num, start=300, stop=300+num, pause=safety):
            if is_news_link(niche):
                niche_news.append(NewsWebsite(raw_link=niche))
    except Exception:
        print "Google has blocked us"
        return None

    return top_news + niche_news
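
The docstring above asks for a fallback when the scrape returns nothing; a minimal sketch of that check, assuming a hypothetical backup_api_search() helper built on Google's official Custom Search API:

def retrieve_urls_with_fallback(topic, num=10, safety=5.0):
    # Try the scraping-based retrieval first.
    results = dangerous_retrieve_urls(topic, num=num, safety=safety)
    # None or an empty list suggests Google has IP-blocked us,
    # so switch to the backup crawler (hypothetical helper on the official API).
    if not results:
        results = backup_api_search(str(topic) + " news", num=num)
    return results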
Example #2
    def google_search(self):
        # Retrieve pages based on domain search query
        #print "[*] Searching for email addresses in " + str(self.searchMax) + " sites and waiting " + str(self.delay) + " seconds between searches"
        
        # Search for emails based on the search string "@<DOMAIN>"
        print "[*] (PASSIVE) Searching for emails in Google search results: @\"" + self.domain + "\""
        googleResults = googlesearch.SearchGoogle(self.domain, self.searchMax, self.delay)
        emails = googleResults.process()
        if emails:
            for e in emails:
                self.allEmails.append(e)

        # Search for emails not within the domain's site (-site:<domain>)
        query = self.domain + " -site:" + self.domain
        print "[*] (PASSIVE) Searching for emails NOT within the domain's site: " + query
        for url in google.search(query, start=0, stop=self.searchMax, num=self.numMax, pause=self.delay, extra_params={'filter': '0'}):
            self.queue.put(url)  
            
        # Search for emails within the domain's site (site:<domain>)
        if self.active:
            query = "site:" + self.domain
            print "[*] (ACTIVE) Searching for emails within the domain's sites: " + self.domain
            for url in google.search(query, start=0, stop=self.searchMax, num=self.numMax, pause=self.delay, extra_params={'filter': '0'}):
                self.queue.put(url)         
        else:
            print "[*] Active seach (-a) not specified, skipping searching for emails within the domain's sites (*." + self.domain + ")"
        
        th.queue.join()
Example #3
	def get_suggestions(self, keywords, keyword_confidence):
		stackoverflow_query = keywords + " error stackoverflow"
		askubuntu_query = keywords + " error askubuntu"
		suggestions = []
		question_ids = []
		for url in search(stackoverflow_query, tld='es', lang='en', stop=5):
			hostname = urlparse.urlparse(url).hostname
			if(hostname == "stackoverflow.com"):
				path = urlparse.urlsplit(url).path
				pathx = str(path).split('/')
				question_ids.append(pathx[2])
		if len(question_ids)!=0:
			print  "#DRAK : Fetched Stackoverflow Questions\n#DRAK : Fetching answers" 
			suggestions.extend(self.so.get_suggestions(question_ids))
			print "#DRAK : Answers fetched successfully" 
		question_ids = []
		for url in search(askubuntu_query, tld='es', lang='en', stop=5):
			hostname = urlparse.urlparse(url).hostname
			if(hostname == "askubuntu.com"):
				path = urlparse.urlsplit(url).path
				pathx = str(path).split('/')
				question_ids.append(pathx[2])
		if len(question_ids)!=0:
			print  "#DRAK : Fetched AskUbuntu Questions\n#DRAK : Fetching answers" 
			suggestions.extend(self.au.get_suggestions(question_ids))
			print  "#DRAK : Answers fetched successfully" 
		
		for suggestion in suggestions:
			suggestion.keyword_confidence = keyword_confidence
		return suggestions
Example #4
def showresults():
    if(re.match('^Who', request.form['search'])):
        ggen = google.search(request.form['search'], stop=1)
        results = [google.get_page(link) for link in ggen]
        results = [soupify(x) for x in results]
        results = [findName(x) for x in results]
        return render_template("index.html", results = results)
    elif(re.match('^When', request.form['search'])):
        ggen = google.search(request.form['search'], stop=1)
        results = [google.get_page(link) for link in ggen]
        results = [soupify(x) for x in results]
        results = [findDate(x) for x in results]
        return render_template("index.html", results = results)
    return render_template("index.html", results = "Please enter 'Who' or 'When'")
Example #5
def get_tokens(query, src="google"):
	opener = urllib2.build_opener()
	opener.addheaders = [('User-agent', 'Mozilla/5.0')]

	if src == "wikipedia":
		print "Searching Wikipedia for " + query
		infile = opener.open('http://en.wikipedia.org/wiki/' + query)
		page = infile.read()
	else:
		print "Searching Google for " + query
		page = ""
		results = google.search(query, "com", "en", 1, 0, 1, 2.0)
		for result in results:
			print "on " + result
			page = google.get_page(result)
	
	#print page

	raw = nltk.clean_html(page) 

	#parses into tokens and saves as lowercase
	tokens = map(lambda x:x.lower(),word_tokenize(raw))

	#removes punctuation and empty strings
	tokens = [s.translate(None, string.punctuation) for s in tokens]
	tokens = [s for s in tokens if s]

	return tokens
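
Note that nltk.clean_html() was removed in NLTK 3, where it raises NotImplementedError and points users to BeautifulSoup; a minimal sketch of an equivalent HTML-stripping step that could replace that call:

from bs4 import BeautifulSoup

def strip_html(page):
	# Roughly what nltk.clean_html used to do: drop tags, keep visible text.
	return BeautifulSoup(page, "html.parser").get_text()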
Example #6
def get_pubmed_id(pdb_id):
    try:
        url = 'http://www.rcsb.org/pdb/explore.do?structureId='+pdb_id.lower()
        source = urllib.request.urlopen(url).read()
        soup = bs4.BeautifulSoup(source, 'html5lib')
        string = str(soup.find('meta', {'name':'description'}))
        pattern = re.compile('<meta content="[0-9A-Za-z]{4}:\s+(.+)" name')
        article = pattern.match(string).group(1)
    except BaseException:
        print(pdb_id.lower()+': '+"Something's wrong")
        return np.nan
    for counter in range(1):
        try:
            if counter > 0:
                time.sleep(np.random.random_integers(10, 30))
            links = [url for url in search(article, stop=40)]
            pmids = [split[-1] if split[-2] == 'pubmed' else '' for split in [link.split('/') for link in links]]
            global pm
            pm = pmids[np.where(np.array(pmids) != '')[0][0]]
        except BaseException as e:
            if str(e).find('503'):
                raise Exception('Seems google blocked you')
            continue
        else:
            break
    print(pdb_id.lower()+': '+pm)
    return pm
Example #7
def get_wiki_url(term):
    try:
        urls = search(str(term) + ' wikipedia')
        #results = wikipedia.search(term)
        #except wikipedia.exceptions.DisambiguationError as e:
        #print e.options
        while True:
            try:
                url = urls.next()
            except:
                print "Cannot find wiki page for this term, putting as Unknown ", term
                return '', 'Unknown'
            #print url, url.find('wikipedia')
            if (url.find('wikipedia') != -1):
                #print "Found Wikipedia page ", url
                subterms = re.split("/", url)
                #print subterms
                wiki_name = subterms[len(subterms) - 1]
                #try:
                #    wikipedia.summary(wiki_name)
                #except wikipedia.exceptions.DisambiguationError as e:
                #    wiki_name = handle_disambiguation(wiki_name, e)
                #print "Url and term: ", url, term, wiki_name
                break
    except ValueError:
        print "Error in google query for term: ", term
        url = ''
        wiki_name = ""
        #exit(0)
    return url, wiki_name
Example #8
def whenQuery(s):
    """
    TL;DR: Searches input on google for "when" questions.

    Arguments:
    s (string) - A query string; what you are searching for.
       -Ex. "when did World War I start"

    Returns:
    A list of date strings extracted from the HTML of the top 5 Google search results for the query.
    """
    results = google.search(s, num = 5, start = 0, stop = 5)

    r = []
    for pg in results:
        r.append(pg)

    dates = []
    for n in range(len(r)):
        url = urllib2.urlopen(r[n])
        page = url.read()
        soup = bs4.BeautifulSoup(page, "html.parser")
        raw = soup.get_text()
        text = re.findall("\d{2}(-|\/)?\d{2}(-|\/)?\d+", raw)
        for dict in text:
            for i in dict:
                if i != unicode('') and i != unicode(' ') and i != unicode('\n'):
                    dates.append(i)
    return dates
Example #9
def on_message(ws, message):
    message = json.loads(message)
    for i in message:
        unicodedata.normalize('NFKD', i).encode('ascii', 'ignore')
    if message['cmd'] == 'chat':
        if message['text'].lower() == '|ebear' or message['text'].lower() == '|eb':
            _thread.start_new_thread(run, ())
        if message['text'].lower() == '|ebear -s' or message['text'].lower() == '|eb -s':
            _thread.start_new_thread(run, ((1,)))
        elif message['text'].lower() == '|source':
            ws.send(json.dumps({"cmd": "chat", "text": ("%s") % comm['|source']}))
        elif message['text'].lower() == '|help' or message['text'].lower() == '|h':
            ws.send(json.dumps({"cmd": "chat", "text": ("%s") % comm['|help']}))
        elif message['text'].lower()[:3] == '|g ':
            if len(message['text']) > 3:
                ws.send(json.dumps({"cmd": "chat", "text": google.search(message['text'][3:])}))
            else:
                ws.send(json.dumps({"cmd": "chat", "text": "Usage is |g \"string\""}))
        elif message['text'].lower() == '|g':
            ws.send(json.dumps({"cmd": "chat", "text": "Usage is |g \"string\""}))
        elif message['text'].lower()[:4] == '|ed ':
            if len(message['text']) > 4:
                ws.send(json.dumps({"cmd": "chat", "text": ED.search(message['text'][4:])}))
            else:
                ws.send(json.dumps({"cmd": "chat", "text": "Usage is |ed \"string\""}))
        elif message['text'].lower() == '|ed':
            ws.send(json.dumps({"cmd": "chat", "text": "Usage is |ed \"string\""}))
        afk(message)
Example #10
    def run(self, keywords=[]):

        if not keywords:
            # Check if file exists
            if not os.path.isfile(self.default_keyword_file):
                return False
            else:
                keywords = []
                fp = open(self.default_keyword_file,"r")
                for line in fp.readlines():
                    keywords.append(line.strip())
                fp.close()

        self.keywords = keywords
        print "Using Keywords:{0}".format(self.keywords)

        try:
            # Get the hits for the given keywords
            for keyword in self.keywords:
                print "KEYWORD:{0}".format(keyword)
                for url in search(keyword, stop=self.maxResuts):
                    print "Found URL:{0}".format(url)
                    self.urls.append(url)
        except:
            print "Something went wrong scraping Google."
            print "Scraping has stopped"

        return True
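
The keyword-file block above can be condensed with a context manager so the file is always closed; a minimal sketch using a hypothetical load_keywords() helper:

def load_keywords(path):
    # One keyword per line; blank lines are skipped and the file closes automatically.
    with open(path, "r") as fp:
        return [line.strip() for line in fp if line.strip()]

run() could then fall back to load_keywords(self.default_keyword_file) when no keywords are passed in.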
Example #11
def strip(q):
    """
    Args:
    q: query string
    
    Returns:
    A list of up to 15 pages (num=15), each stripped to plain text separately
    """
    r = google.search(q,num=15,start=0,stop=15)

    l = []
    for result in r:
        l.append(result)

    text = []
    for url in l:
        try:
            req = urllib2.Request(url)
            u = urllib2.urlopen(req)
            page = u.read()
            soup = bs4.BeautifulSoup(page,'html')
            raw = soup.get_text()
            
            reg = re.sub("[\t\n ]"," ",raw)
            text.append(reg)
        except:
            pass
    return text
Example #12
    def Websarch(self, requet, numberResu):
        """
        This method uses the approach described at http://breakingcode.wordpress.com/2010/06/29/google-search-python/
        """

        urls=search(requet,stop=numberResu)
        return  urls
Example #13
def main():

    # get a list with first NUM_URLS
    urls = []

    # now have some fun with the results...

    for url in search(SEARCH_CRITERIA, stop=NUM_URLS):
        urls.append(str(url))

    # filter those that belong to the same domain
    urls = not_duplicate(urls)


    # go to each web page and gather data
    data = gather_data(urls)


    # create a CSV file with data just gathered
    send_to_csv(data)

    #sys.exit(0)

    # DONE!
    print "DONE"
Example #14
def getQuery(query):
    pages = google.search(query,start=0,stop=10)
    q = findQuery(query.lower())
    if q == None:
        return "Your query needs to contain either who, when, or where"
    names = {}
    for p in pages:
        try:
            url = urllib2.urlopen(p)
        except:
            continue
        try:
            page = url.read().decode('ascii', 'ignore')
        except httplib.IncompleteRead as e:
            page = e.partial
        soup = BeautifulSoup(page, 'html.parser')
        pnames = []
        if(q == 1):
            list(map(lambda n: pnames.extend(list(findNames(n))), soup.find_all(string = re.compile('[A-Z][a-z]+ [A-Z][a-z]+'))))
        elif(q == 2):
            list(map(lambda n: pnames.extend(list(findDates(n))), soup.find_all(string = re.compile('((?:(?:January|February|March|April|May|June|July|August|September|October|November|December) (?:[1-3]?[1-9], )?)?[1-9]{4})'))))
        elif(q == 3):
            list(map(lambda n: pnames.extend(list(findPlaces(n))), soup.find_all(string = re.compile('(?:[0-9]+ [A-Z][a-z]+)|(?:[A-Z][a-z]+)|(?:[0-9]+ [A-Z][a-z]+ [A-Z][a-z]+)|(?:[A-Z][a-z]+ [A-Z][a-z]+)|(?:[A-Z][a-z]+, [A-Z][a-z]+)|(?:[A-Z][a-z]+ [A-Z][a-z]+, [A-Z][a-z]+)'))))
        for n in pnames:
            if(n in names):
                names[n] += 1
            else:
                names[n] = 1
    sortedNames = sorted(names.items(), key = lambda x: x[1], reverse = True)
    #print names[sortedNames[0][0]]
    try:
        return sortedNames[0][0]
    except:
        return "Invalid Query"
Example #15
def main():
  results=[]
  wiki="site:en.wikipedia.org "
  searchquery=[]

  toopen='C:/users/iceman/Desktop/'+'list1.txt'     #### Input file of search queries
  filewriter=open(toopen,"r")
  for line in filewriter.readlines():
    line=re.sub(r'\s', '', line)
    print line
    searchquery.append(line)

  i=0
  filename=path+'search_results'+ ".txt"
  FILE=open(filename,"w")
  for term in searchquery:
    time.sleep(1)

    for url in search(wiki+term, stop=13):
      flag=0
      print url
      # ignore image links (.jpg/.png/.jpeg) and URL-encoded links
      if url.find(".jpg") > 0 or url.find(".png") > 0 or url.find("jpeg") > 0 or url.find("%") > 0:
        flag=1
      if url.find("Category:") == -1 and flag == 0:          ### ignores Category pages
        cleaned = cleanstring(url)
        print cleaned
        FILE.writelines(cleaned)          ########## Writes result to the file.
Example #16
def get_hot_from_subreddit(message, raw):
	if raw:
		return

	print "Getting from subreddit..."

	try:
		return ["Hot from /r/{}! (Found in Reddit) || ".format("".join(message["arguments"][2:])) + " | ".join([str(x) for x in r.get_subreddit("".join(message["arguments"][2:]).lower()).get_hot(limit=min([int(message["arguments"][1]), 10]))])]

	except IndexError:
		return "Syntax: webabout <num_hots> <topic>"

	try:
		gs = google.search(" ".join(message["arguments"][2:]), num=min([int(message["arguments"][1]), 5]), start=0, stop=min([int(message["arguments"][1]), 5]), pause=2)

	except IndexError:
		return "Syntax: webabout <num_hots> <topic>"

	print "Got Google search!"

	results = []

	for website in gs:
		request = requests.get(str(website))

		print "Parsing website: {}".format(str(website))

		try:
			results.append("{} -> {}".format(str(website.encode("utf-8")), BeautifulSoup.BeautifulSoup(request.text.encode("utf-8")).title.string.encode("utf-8")))

		except AttributeError:
			results.append(website.encode("utf-8") + " (No URL!)")

	return "Hot about {}! (Found in Google) || ".format(" ".join(message["arguments"][2:])) + " | ".join(results)
Example #17
def get_google_result(search_keywords):
    if search_keywords == 'help':
        help_message = "To use this bot start message with @google \
                        followed by what you want to search for. If \
                        found, Zulip will return the first search result \
                        on Google. An example message that could be sent is:\
                        '@google zulip' or '@google how to create a chatbot'."
        return help_message
    else:
        try:
            urls = search(search_keywords, stop=20)
            urlopen('http://216.58.192.142', timeout=1)
        except http.client.RemoteDisconnected as er:
            logging.exception(er)
            return 'Error: No internet connection. {}.'.format(er)
        except Exception as e:
            logging.exception(e)
            return 'Error: Search failed. {}.'.format(e)

        url = next(urls, None)

        if url is None:
            return 'No URLs returned by google.'

        return 'Success: {}'.format(url)
Example #18
def whoAnswer(question,n): #returns top n occurrences of names from question
    numPages = 10;#uses top 10 results
    links = google.search("Who " + question,num=numPages,start=0,stop=0,pause=2.0)
    txt = ""
    for i in range(numPages):
        txt += BeautifulSoup(google.get_page(links.next())).getText()
    return getNames(txt,n)
Example #19
def find(q):
    for url in google.search(q, num = 5, stop = 10):
        
        u = urlopen(url)
        txt = BeautifulSoup(u.read())
        
        #reads text in body tags
        txt = txt.body
        if (txt != None):
            txt = txt.prettify().encode('UTF-8')

            if "who" in q.lower():
                names = regex.findNames(txt)
                addVals(names)
            elif "when" in q.lower():
                dates = regex.findDates(txt)
                print dates
                addVals(dates)

    strresults = ""
    for i in narrow(results):
        print i
        #Why breaks no work
        strresults += i + " OR "
    if (len(strresults) > 4):
        strresults = strresults[0:len(strresults)-4]
    results.clear()
    return strresults
Example #20
def download_documents(query):
    """Download .doc files for given search query from Google

    Args:
        query (string): Search query

    """

    # Modified Google search query for .doc files
    query += " filetype:doc"

    doc_count = 0
    for url in search(query, stop=50):

        # Cleaning the encoded URL for filename
        filename = urllib.unquote(url).split('/')[-1]

        print "\nDownloading: \nFilename: %s\nURL: %s" % (filename, url)
        urllib.urlretrieve(url, filename)
        # print urllib.unquote(url)

        doc_count += 1
        # Hacky check to get desired number of docs
        global NUM_DOCS_DOWNLOAD
        if doc_count == NUM_DOCS_DOWNLOAD:
            break
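
The counter plus global NUM_DOCS_DOWNLOAD check above can also be expressed by slicing the result generator; a minimal sketch of the same download loop using itertools.islice:

import itertools

def download_documents_capped(query, max_docs=NUM_DOCS_DOWNLOAD):
    """Download at most max_docs .doc files for the given search query."""
    query += " filetype:doc"
    for url in itertools.islice(search(query, stop=50), max_docs):
        filename = urllib.unquote(url).split('/')[-1]
        print "\nDownloading: \nFilename: %s\nURL: %s" % (filename, url)
        urllib.urlretrieve(url, filename)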
Example #21
def search():
    """
    Performs a search if given a query via a GET parameter (query) and returns a
    rendered Jinja template

    Params:
        none

    Returns:
        A rendered Jinja template (index.html) with the following parameters:
            QUERY = query
            RESULT = result
    """

    if "query" not in request.args:
        return render_template("index.html")

    # Get the query from the form
    query = request.args["query"]

    # Run a Google search for the query
    gsearch_res = list(google.search(query, num=10, start=0, stop=10))

    # Get the web page and strip the tags. Note that 'wgotten' is a pun from
    # the wget command for retrieving webpages from the command line.
    wgotten = urllib2.urlopen(gsearch_res[0])
    webpage = wgotten.read()
    beautified_soup = bs4.BeautifulSoup(webpage,'html')

    # Run RegEx parsing
    result = regex.parse(query,beautified_soup)

    # Return the rendered Jinja template
    return render_template("index.html", QUERY=query, RESULT=result)
Example #22
	def __init__(self, query='tea', topVectRaw='topvect.csv'):
		# self.mode = mode
		self.start_urls = list(search(query, stop=20))
        
		# if mode =='tfidf':
			# with open('..\iphigeni\webidf.csv') as infile:
				# vectRead = reader(infile)
				# next(vectRead)
				# for line in vectRead: self.idf[line[0]] = float(line[1])
		# if mode == 'svec':
			# with open(self.svecFilename) as infile:
				# k = 1
				# for line in infile:
					# self.svecdict[line.split()[0]] = k
					# k += 1
			# self.svecfile = linereader.dopen(self.svecFilename)
			
		# Use the veblen spider to generate topVectRaw
		with open(topVectRaw) as infile: 
			vectRead = reader(infile)
			next(vectRead)
			# if mode in ['tf', 'tfidf']:
			for line in vectRead:
				try: self.topic[line[1]] += int(line[0])
				except KeyError: self.topic[line[1]] = int(line[0])
Example #23
	def createDiskList(self):
		print self.name
		# use below method to search with google
		# https://breakingcode.wordpress.com/2010/06/29/google-search-python/
		for url in search(keyword + ' site:' + self.url, num=RESULTS, start=0, stop=RESULTS):
			if url.find('.xml') == -1:
				print '  ',url
				soup = BeautifulSoup(urllib.urlopen(url))
				tdatetime = dt.now()
				tstr = tdatetime.strftime('%Y/%m/%d-%H:%M')
				f = open(self.name+'_test.dat', 'w')
				f.write(soup.prettify())
				print 'write ',self.name+'_'+tstr+'.html'
				f.close()
				#print soup.prettify()
				try:
					title = soup.find('title').text
				except AttributeError:
					print 'AttributeError:'
				except:
					print 'Unexpected error', sys.exc_info()[0]
				else:
					self.disk.append(Disk(title,url))
					try:
						print '     > ',title
					except UnicodeEncodeError:
						print "UnicodeEncodeError", "TODO:need to replace illigal words to display terminal"
					else:
						pass
			else:
				pass
Example #24
	def print_link(self):
		with IPTV(stdout=devnull, stderr=devnull):
			for url in google.search(self.query, num=60, stop=1):
				parsed = urlparse(url)
				self.parsedUrls.append(parsed.scheme + '://' + parsed.netloc +"\n")
		time.sleep(1)
		print '\n'.join(self.parsedUrls)
Example #25
def get_wiki_url(film_info, year):
    film_query = "%s %s wikipedia" % (str(film_info['film']), year)
    print film_query
    search_results = []
    for url in search(film_query, stop=20):
            search_results.append(url)
    return search_results[0]
Example #26
    def get_company_domain(self, searchKey):
        '''Looks for the company website among the top five
           URLs returned by Google search. If the company website
           is found, it parses the URL to get the domain name.
        '''
        search_result = search(searchKey, stop=5)

        for url in search_result:
            keywords = searchKey.split(" ")

            print keywords
            if keywords[0] in url.lower():
                # if links is wikipedia link then parse the webpage to get
                # company homepage
                if "en.wikipedia.org" in url:
                    chomepage = get_company_website(url)
                    if chomepage is not None:
                        return extract_domainname(chomepage)
                return extract_domainname(url)

            try:
                htmlpage = get_page(url)
                soup = BeautifulSoup(htmlpage)

                title = soup.title.text.lower()

                if keywords[0] in title:
                    return extract_domainname(url)
            except:
                print searchKey.ljust(52) + ": Can't parse web page at " + colored(url.ljust(100), 'blue')
Example #27
def submit_err(error_msg,no_solution):
	print "\n"
	print "Please wait ................ "
	print "Pyoverflow is checking for the top solutions for your code problems" 
	print ":)"
	print "\n" 

	try:
		search_word = "python"+" "+str(error_msg)
		for url in search(search_word, stop=2):
			search_result.append(url)
	except Exception as e:
		print e
		sys.exit(0)
	
	try:	
		if(int(no_solution) > 0):
			for i in range(0,int(no_solution)):
				#print search_result[i]
				print "Opening"+"\t"+str(i)+" solution in browser"
				webbrowser.open_new_tab(search_result[i])
		else:
			print "Number of solutions should be > 0"

	except Exception as e:
		print e
		sys.exit(0)
Example #28
def get_prospect(s, pool, company, name, title, domain):
	logging.debug('Waiting to join the pool')
	with s:
		thread_name = threading.currentThread().getName()
		pool.makeActive(thread_name)
		if not len(domain):
			try:
				domainsearch = google.search(company)
				domain = domainsearch.next()
				time.sleep(0.1)
				while 'wiki' in domain.lower() and not 'wiki' in company:
					domain = domainsearch.next()
				w = domain.find('www')
				if w >= 0:
					domain = domain[w+4:]
				w = domain.find('http')
				if w >= 0:
					domain = domain[w+7:]
				if domain[-1] == '/':
					domain = domain[:-1]		

			except:
				e = sys.exc_info()[0]
				print "ERROR: ", str(e)
				
		outrow = [name, company, domain, title]
		print outrow
		pool.addPerson(outrow)	

		pool.makeInactive(thread_name)
Example #29
def showsome(searchfor):
  out = []

  for url in search(searchfor, stop=5):
    out.append(url)

  return out
Example #30
def search_who(question):
    g = search(question,num=5,stop=5)
    info = []
    htmls = [x for x in g]
    for url in htmls:
        u = urlopen(url)
        item = BeautifulSoup(u.read())
        item = item.get_text().replace("\n"," ")
        info.append(item)
    #print info

    people = {}
    pages = []
    for a in info:
        pages.append(re.findall("[A-Z][a-z]+\s[A-Z][a-z]+",a))
        #pages.append(re.findall("[A-Z][a-z]+\s([A-Z][a-z]+\s)?[A-Z][a-z]+",a))
    #print pages
    
    for p in pages:#list of lists
        for peep in p:
            #print peep
            firstLast= peep.split()#"Angela Lin" -> ["Angela","Lin"]
            if firstLast[0] in firstDic.keys() and firstLast[1] in surnameDic.keys():
                if peep not in people.keys():
                    people[peep] = 1
                else:
                    people[peep]+= 1
                    
    #print people.keys();
    #for p in people.keys():
    #    print p
    #    print people.get(p,None)

    return max(people, key=people.get)
Example #31
def Tell_Me_Alfred(query="The Himalayas are", answer_type="Description"):

    global ALL_RESULTS
    global ALL_ANSWERS_SORTED
    global ALL_ANSWERS
    ALL_RESULTS = []
    ALL_ANSWERS = dict()

    for url in search(query, stop=20):
        try:
            #print(url)
            ALL_RESULTS.append(url)
        except:
            print "URL Error"

    #ALL_RESULTS=['http://www.victoriamemorial-cal.org/', 'http://www.tripadvisor.in/Attraction_Review-g304558-d311680-Reviews-Victoria_Memorial_Hall-Kolkata_Calcutta_West_Bengal.html', 'http://kolkata.cityseekr.com/venue/403224-victoria-memorial', 'http://www.thecityguide.in/Kolkata/Art-Entertainment/SGGG/Victoria-Memorial-Elgin', 'http://www.justdial.com/Kolkata/Victoria-Memorial-Hall/033P6853927_BZDET', 'http://en.wikipedia.org/wiki/Victoria_Memorial_(India)', 'http://en.wikipedia.org/wiki/Victoria_Memorial_(India)#History', 'http://en.wikipedia.org/wiki/Victoria_Memorial_(India)#Finance', 'http://en.wikipedia.org/wiki/Victoria_Memorial_(India)#Design', 'http://en.wikipedia.org/wiki/Victoria_Memorial_(India)#Construction', 'http://ww.itimes.com/poll/best-image-of-victoria-memorial-kolkata-54ad5bd294fa2', 'http://www.trekearth.com/gallery/Asia/India/East/West_Bengal/Kolkata_(Calcutta)/photo1412050.htm', 'http://www.culturalindia.net/monuments/victoria-memorial.html', 'http://knowindia.gov.in/knowindia/culture_heritage.php?id=68', 'http://www.youtube.com/watch?v=C_0IvslcRqU', 'http://www.ixigo.com/victoria-memorial-kolkata-india-ne-1019165', 'http://www.lonelyplanet.com/india/kolkata-calcutta/sights/architecture/victoria-memorial', 'http://www.indianholiday.com/tourist-attraction/kolkata/victoria-memorial.html', 'http://www.mapsofindia.com/kolkata/places-of-interest/famous-monuments/victoria-memorial.html', 'https://www.facebook.com/pages/Victoria-Memorial-Hall-Kolkata/113100222172879', 'http://www.iloveindia.com/indian-monuments/victoria-memorial.html', 'http://www.kolkata.org.uk/tourist-attractions/victoria-memorial.html', 'http://www.vmsb.org/contact_us.html', 'http://mocomi.com/victoria-memorial-facts/', 'http://www.journeymart.com/de/india/west-bengal/kolkata/victoria-memorial.aspx', 'http://www.theincredibleindiatravel.com/victoria-memorial-hall-india/victoria-memorial.html', 'http://goindia.about.com/od/cities/ig/Kolkata-Photo-Gallery/Victoria-Memorial.htm', 'http://zeenews.india.com/news/sci-tech/victoria-memorial-museum-blackout-in-kolkata-for-earth-hour_1569445.html']
    #ALL_RESULTS=['http://en.wikipedia.org/wiki/Himalayas', 'http://en.wikipedia.org/wiki/Paro_Taktsang', 'http://en.wikipedia.org/wiki/List_of_Himalayan_peaks_and_passes', 'http://en.wikipedia.org/wiki/Indian_Himalayan_Region', 'http://en.wikipedia.org/wiki/Indian_Plate', 'http://simple.wikipedia.org/wiki/Himalayas', 'http://www.thehindu.com/sci-tech/energy-and-environment/emissions-from-biomass-burning-cross-the-himalayas/article7105899.ece', 'http://www.npr.org/blogs/goatsandsoda/2015/04/15/399579066/in-search-of-the-missing-trekkers-in-nepal-s-muddy-morass', 'http://www.nzherald.co.nz/bay-of-plenty-times/news/article.cfm?c_id=1503343&objectid=11434737', 'http://www.youtube.com/watch?v=HuSHOQ6gv5Y', 'http://www.britannica.com/EBchecked/topic/266037/Himalayas', 'http://www.english-online.at/geography/himalayas/himalaya-mountain-range.html', 'http://www.himalayanfootsteps.com/destinations/where-are-the-himalayas/', 'http://www.mountainprofessor.com/the-himalaya.html', 'http://www.himalaya2000.com/himalayan-facts/location-of-himalayas.html', 'http://www.unmuseum.org/yeti.htm', 'http://www.hitt-initiative.org/mla/?page_id=390', 'http://www.robinsonlibrary.com/geography/physical/mountains/himalaya.htm', 'http://geography.howstuffworks.com/asia/the-himalayas.htm', 'http://www.kidsdiscover.com/spotlight/himalayas-kids/', 'http://pubs.usgs.gov/gip/dynamic/himalaya.html', 'http://www.todayifoundout.com/index.php/2013/12/himalayas-formed/', 'http://www.pbs.org/wgbh/nova/everest/earth/birth.html', 'http://www.pbs.org/wnet/nature/the-himalayas-himalayas-facts/6341/', 'http://www.pbs.org/wnet/nature/the-himalayas-introduction/6338/', 'http://www.oddizzi.com/teachers/explore-the-world/physical-features/mountains/mountain-case-study/himalayas/', 'https://vimeo.com/121045965', 'http://www.worldwildlife.org/places/eastern-himalayas', 'http://www.answers.com/Q/What_are_the_Himalayas']

    print 'YOUR TOP ANSWERS ARE:'
    c = 0.0
    for res in ALL_RESULTS:
        Exact_Match_Found_flag = 0
        try:
            timeout = 0
            #print 'Checking Source:',res
            response = urllib2.urlopen(res)
            page_data = response.read()
            page_data = BeautifulSoup(page_data)
            page_data = page_data.get_text()
            page_data = page_data.split('.')

            # Read from Individual Web Pages
            if answer_type == "Description":
                Start_T = time.time()
                for line in page_data:
                    Curr_T = time.time()
                    if (Curr_T - Start_T) > 15.0:
                        break
                    if re.findall(query.lower(), line.lower()) != []:
                        c += 1.0
                        line_low = line.lower()
                        line = line_low.split(query.lower())
                        print '==============================================================================='
                        print 'Answer ', c, ':'
                        line = query + line[1] + '.'
                        print line
                        print '\n\nSource: ', res
                        print '==============================================================================='
                        Exact_Match_Found_flag = 1
                        break

            elif answer_type == "Location":
                query_parts = query.split(' ')
                Start_T = time.time()
                for line in page_data:
                    Curr_T = time.time()
                    if (Curr_T - Start_T) > 30.0:
                        break
                    check_next = 0
                    for each_qp in query_parts:
                        if re.findall(each_qp.lower(), line.lower()) == []:
                            check_next = 1
                            break
                    if check_next == 1:
                        continue
                    else:
                        line_parts = line.split(' ')
                        for each_lp in line_parts:
                            if (each_lp in query_parts) or (
                                    each_lp
                                    in IGNORE_LIST):  #Skip the Query Words
                                continue
                            if check_WordNet(
                                    word=each_lp,
                                    def_word='city') or check_WordNet(
                                        word=each_lp,
                                        def_word='country') or check_WordNet(
                                            word=each_lp, def_word='continent'
                                        ) or check_WordNet(word=each_lp,
                                                           def_word='state'):
                                c += 1.0
                                print each_lp
                                if not ALL_ANSWERS.has_key(each_lp):
                                    ALL_ANSWERS[each_lp] = 1
                                else:
                                    ALL_ANSWERS[each_lp] += 1
                                Exact_Match_Found_flag = 1
                                break
                        if Exact_Match_Found_flag:
                            break

            #print 'Finished Checking Source:',res
        except:
            print

    #Give a Probability for One Word Answers
    if answer_type == "Location":

        ALL_ANSWERS_SORTED = []
        all_ans = ALL_ANSWERS.keys()
        for each_ans in all_ans:
            ALL_ANSWERS_SORTED.append([ALL_ANSWERS[each_ans], each_ans])

        ALL_ANSWERS_SORTED.sort()
        print '==============================================================================='
        print 'SUMMARY:'
        print '---------------------------------------------------------------------------'
        for each_sa in range(0, len(ALL_ANSWERS_SORTED)):
            idx = len(ALL_ANSWERS_SORTED) - 1 - each_sa
            print ALL_ANSWERS_SORTED[idx][1]
            print 'Confidence Measure= ', (ALL_ANSWERS_SORTED[idx][0] / c *
                                           100.0), '%'
            print '---------------------------------------------------------------------------'
        print '==============================================================================='
Example #32
def wechat(request):
    if request.method == 'GET':
        # Verify the signature
        # Extract basic info (signature, timestamp, nonce, xml) from the request
        signature = request.GET.get('signature')
        timestamp = request.GET.get('timestamp')
        nonce = request.GET.get('nonce')

        if not wechat_instance.check_signature(
                signature=signature, timestamp=timestamp, nonce=nonce):
            return HttpResponseBadRequest('Verify Failed')

        return HttpResponse(
            request.GET.get('echostr', ''), content_type="text/plain")


    # Parse the XML data of this request
    try:
        wechat_instance.parse_data(data=request.body)
    except ParseError:
        return HttpResponseBadRequest('Invalid XML Data')

    # Get the parsed WeChat message
    message = wechat_instance.get_message()

    if isinstance(message, TextMessage):
        # Content of the current conversation
        content = message.content.strip()
        if content == '博客' or content == 'blog' or content == '最新':
            return HttpResponse(wechat_instance.response_news(get_new_blogposts(request)), content_type="application/xml")
        if content == '功能':
            reply_text = (
                '目前支持的功能:最新博客、博客搜索\n' +
                '回复【最新】或者【博客】,将返回最新博客' +
                '回复【search:ionic】,将搜索ionic相关内容 \n' +
                '正在实现: Google、Wiki'
            )
            response = wechat_instance.response_text(content=reply_text)
            return HttpResponse(response, content_type="application/xml")

        if 'wiki' in content or '维基' in content:
            import wikipedia
            wiki_content = content.replace("wiki:", "")
            print wiki_content
            wiki = wikipedia.page(wiki_content)

            print wiki.title,wiki.summary,wiki.url
            message = [{
                'title': wiki.title,
                'picurl': '',
                'description': wiki.summary,
                'url': wiki.url
            }]

            return HttpResponse(wechat_instance.response_news(message), content_type="application/xml")

        if 'google' in content:
            from google import search
            for url in search(content.replace('google:', ''), stop=1):
                print(url)

            return HttpResponse(wechat_instance.response_news(get_new_blogposts(request)), content_type="application/xml")
        if 'search' in content:
            keyword = content.replace('search:','')
            blogpost = BlogPost.objects.search(keyword)[:5]
            messages = blogpost_to_array(blogpost)
            messages.append({
                'title': '在https://www.phodal.com/上查看所有结果',
                'picurl': 'https://www.phodal.com/static/media/uploads/search.jpg',
                'url': 'http://www.phodal.com/search/?q=' + keyword
            })
            return HttpResponse(wechat_instance.response_news(messages), content_type="application/xml")
        else:
            response = get_new_blogposts(request)
            message = {
                'title': '稍等:Phodal君正在实现功能中。正在为你返回最新文章。',
                'picurl': 'https://www.phodal.com/static/phodal/images/bg.jpg',
                'description': '稍等:Phodal君正在实现功能中。现在为你返回最新文章。',
                'url': 'https://www.phodal.com/',
            }
            response.insert(0, message)
            return HttpResponse(wechat_instance.response_news(response), content_type="application/xml")

    elif isinstance(message, VoiceMessage):
        reply_text = '语音信息我听不懂/:P-(/:P-(/:P-('
    elif isinstance(message, ImageMessage):
        reply_text = '图片信息我也看不懂/:P-(/:P-(/:P-('
    elif isinstance(message, VideoMessage):
        reply_text = '视频我不会看/:P-('
    elif isinstance(message, LinkMessage):
        reply_text = '链接信息'
    elif isinstance(message, LocationMessage):
        reply_text = '地理位置信息'
    elif isinstance(message, EventMessage):
        if message.type == 'subscribe':
            reply_text = '感谢您的到来!回复【功能】返回使用指南'
            if message.key and message.ticket:
                reply_text += '\n来源:二维码扫描'
            else:
                reply_text += '\n来源:搜索公众号名称'
        elif message.type == 'unsubscribe':
            reply_text = '取消关注事件'
        elif message.type == 'scan':
            reply_text = '已关注用户扫描二维码!'
        elif message.type == 'location':
            reply_text = '上报地理位置'
        elif message.type == 'click':
            reply_text = '自定义菜单点击'
        elif message.type == 'view':
            reply_text = '自定义菜单跳转链接'
        elif message.type == 'templatesendjobfinish':
            reply_text = '模板消息'
    else:
        reply_text = '稍等:Phodal君正在实现功能中。'

    response = wechat_instance.response_text(content=reply_text)
    return HttpResponse(response, content_type="application/xml")
Example #33
#!/usr/bin/env python
# Collect Google hits for site:http://s2.lmcdn.fr/multimedia/ and append them to urls.txt
from google import search
for url in search('site:http://s2.lmcdn.fr/multimedia/', stop=5130):
    print(url)
    myfile = open("urls.txt", "a")
    myfile.write(str(url) + "\n")
    myfile.close()
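
Re-opening urls.txt on every hit works but is wasteful; a minimal sketch of the same loop holding the file open once:

from google import search

with open("urls.txt", "a") as myfile:
    for url in search('site:http://s2.lmcdn.fr/multimedia/', stop=5130):
        print(url)
        myfile.write(str(url) + "\n")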
Example #34
    b = b[4:6]
    if (b[1] != '0'):
        b = b[0]
    b = int(b)
    #print(b)
    if (b > 2):
        print(1)
    else:
        print(-1)
except:
    print(-1)

#google search
c = 0
query = "info" + url
for j in search(query, tld="co.in", num=10, stop=1, pause=0):
    tsd, td, tsu = extract(j)

    hurl = td + '.' + tsu
    if (hurl == host):
        c += 1
if (c != 0):
    print(1)
else:
    print(1)
#no of links pointing to page

print('NA')

#phistank database
Example #35
import requests
from bs4 import BeautifulSoup
from google import search
import urllib

query = raw_input()
quer = query + " tutorials point"
add = []
for j in search(quer, tld="com", num=1, stop=1, pause=2):
    add.append(j)

if "https://www.tutorialspoint.com" in add[0]:
    url = "https://www.tutorialspoint.com/" + str(query) + "/"
    page = requests.get(url)
    soup = BeautifulSoup(page.content, "html.parser")
    data = soup.find("button",
                     class_="btn btn-default btn-sm btn-buy-tutorial")
    data = data.find('a')['href']

    url = "https://www.tutorialspoint.com" + data
    print url
    page = requests.get(url)
    soup = BeautifulSoup(page.content, "html.parser")
    data = soup.find_all("h1")
    rcf = []
    for i in data:
        rcf.append(i.find("a"))
    url = "https://tutorialspoint.com" + rcf[2]['href']
    urllib.urlretrieve(url, query + ".pdf")
else:
    print "Sorry no results found"
Example #36
import urllib.request
from bs4 import BeautifulSoup
from google import search

lst = []
all_links = ['http://www.gemschicago.org Tuition',
        'https://www.latinschool.org Tuition',
        'http://www.nordangliaeducation.com Tuition',
        'http://www.fwparker.org Tuition']
for link in all_links:
    for url in search(link,lang='en',stop=3):
        lst.append(url)
    search_link = lst[0]
    request = urllib.request.Request(search_link)
    response = urllib.request.urlopen(request)
    soup = BeautifulSoup(response, "html.parser")
    print('---------------------------------------------------------------')
    print(search_link)
    for table_tag in soup.find_all('td'):
        print(table_tag.text)
    lst = []
Example #37
def googleSearch():
    lol = raw_input(color.UNDERLINE + "" + T + "Query>" + color.END)
    for url in search(lol, tld='com', lang='es', stop=50):
        print("" + G + "Site: " + W + "" + url)
Example #38
def search(what):
   for url in google.search(what, lang='ru',stop=5):
       return (url)
Example #39
from google import search
import pprint

KEYWORDS = 'duyetdev'

data = []
for d in search(KEYWORDS, tld='com.vn', lang='vi', stop=10):
    data.append(d)

pprint.pprint(data)
Example #40
# the script receives an input argument from the user and displays the important information regarding the hacks to the user
# can be found useful in some cases

import re  # regex --regular expression operation handling library in python....USED HERE FOR THE PATTERN MATCHING HIGHLY NEEDED HERE!!!

import sys
import google
import urllib2

if len(sys.argv) < 2:  # argument count validation supplied by the user

    print sys.argv[0] + ": <dict>"
    sys.exit(2)

fo = open(sys.argv[1])

for word in fo.readlines():
    print "\n searching the " + word.strip()
    results = google.search(word.strip())

    try:
        for implink in results:
            if re.search("youtube", implink) == None: print implink

    except urllib2.HTTPError, e:

        print "Search not found : " + str(e)

# program ends
# happy hacking :)
Example #41
##part 1
# Get the google search results.
##return the URLs
from google import search

search_term = "islamic state"
Newspapers = [
    "champress.net", "timesofindia.indiatimes.com", "aawsat.net",
    "english.people.com.cn", "bbc.co.uk", "tehrantimes.com", "dw.de"
]
for news in Newspapers:
    gs = search_term + " site:" + news
    f = open("/Users/Aseel/FinalProject/" + search_term + "/" + news + ".txt",
             "w")
    for url in search(gs, num=100, stop=1000):
        f.write(url)
        f.write("\n")

    f.close()
Example #42
def main():
    try:

        print "\n\\\\\\\ Welcome to GoooFu //////// \n"
        print "WARNING: Executing this script multiple times may cause Google to block your IP address"
        # FILE CREATION --------------------------------------------------------------------------------
        fName = str(raw_input("Enter filename to save output to: "))
        fileName = fName + '.html'
        f = open(fileName, "w+")
        print "Search results for 'site' & 'string' directive: \n"
        # READING VALUES -------------------------------------------------------------------------------
        searchSite = str(raw_input("Enter the URL: "))
        searchString = str(raw_input("Enter your search string: "))
        searchPage = str(raw_input("\nEnter the page you want to check for: "))
        fileType = str(raw_input("\nEnter filetype to search: "))
        # SEARCH SCRIPT FOLLOWS ------------------------------------------------------------------------
        print "\nPlease wait. Fetching  .. .."
        f.write('<h1>Site: ' + searchSite + ' & String: ' + searchString +
                ' search output:</h1>')
        query = 'site:' + searchSite + ' ' + searchString
        #br = mechanize.Browser()
        #Session = br.open("https://www.google.com/")
        try:
            for j in search(query, tld="co.uk", num=5, stop=1, pause=5):
                print(j)
                f.write('<p> <a href="' + j + '">' + j + '</a> </p>' + '\n')
                #sleep(randint(0.5,1.5))

        except:
            print "HTTP Error"

        print "\nSearch results with 'inurl' directive: "
        print "\nPlease wait. Fetching  .. .."
        f.write('<h1>Site: ' + searchSite + ' & Page: ' + searchPage +
                ' search output:</h1>')
        query2 = 'inurl:' + searchPage + ' ' + 'site:' + searchSite
        try:
            for i in search(query2, tld="co.au", num=5, stop=1, pause=5):
                print(i)
                f.write('<p> <a href="' + i + '">' + i + '</a> </p>' + '\n')
                #sleep(randint(1.0,1.5))

        except:
            print "HTTP Error"

        print "\nFiletype search on supplied URL: "
        print "\nPlease wait. Fetching  .. .."
        f.write('<h1>Site: ' + searchSite + ' & File type: ' + fileType +
                ' search output:</h1>')
        query4 = 'site:' + searchSite + ' filetype:' + fileType
        try:
            for l in search(query4, tld="co.za", num=5, stop=1, pause=5):
                print(l)
                f.write('<p> <a href="' + l + '">' + l + '</a> </p>' + '\n')
                #sleep(randint(1.0,1.5))

        except:
            print "HTTP Error"

        print "\nExecuting search string 'allintitle' search:"
        print "\nPlease wait. Fetching  .. .."
        f.write('<h1>Allintitle search output:</h1>')
        query5 = 'allintitle:' + searchString
        try:
            for m in search(query5, tld="co.in", num=5, stop=1, pause=4):
                print(m)
                f.write('<p> <a href="' + m + '">' + m + '</a> </p>' + '\n')

        except:
            print "HTTP Error"

        print "\nExecuting search string 'intitle' search:"
        print "\nPlease wait. Fetching  .. .."
        f.write('<h1>Intitle search output:</h1>')
        query6 = 'intitle:' + searchString
        try:
            for n in search(query6, tld="co.vi", num=5, stop=1, pause=5):
                print(n)
                f.write('<p> <a href="' + n + '">' + n + '</a> </p>' + '\n')
                #sleep(randint(0.5,1.9))

        except:
            print "HTTP Error"
        f.close()
        print "\nOutput has been saved to %s.html" % fName
        print "\nThank you for using GoooFu. To contribute https://github.com/west-wind/GoooFu"
        print "\nExiting..."
    except KeyboardInterrupt:
        sys.exit(0)
Example #43
def googleSearch(text):
    text = re.sub("/google ", "", text, count=1)
    answer = ""
    for url in google.search(text, lang='se',num=1, stop=2):
        answer += url + "\n"
    return answer
Example #44
 def google_it(dork):
     clear_cookie()
     for title in search(dork, stop=30):
         print(B + ' [!] Profile Found :> ' + C + title)
         time.sleep(0.5)
Example #45
name = your_list[0][1][2]
# [['This is the first line', 'Line1'],
#  ['This is the second line', 'Line2'],
#  ['This is the third line', 'Line3']]

f = csv.writer(open("fraudSearchResults.csv", "a"))
f.writerow(["Search Terms", "Top 10 URL"])

#don't overwrite old data open("learner.csv", "w") to open("learner.csv", "a")
#fraudArray = [fraud, jail, prison, ponzi scheme, steal, money laundering, pyramid scheme, stole, investigation, investigated, indicted, corrupt, bribery]

#fullSearch = loop through array and combine with name

fullSearch = "fraud" + "AND" + name

for url in search(fullSearch, tld='com', lang='en', stop=10):
    print(url)

    f.writerow([fullSearch, url])

#f.writerow([fullSearch, url])
'''error handling:

from xgoogle.search import GoogleSearch, SearchError
try:
  gs = GoogleSearch("quantum mechanics")
  gs.results_per_page = 100
  results = []
  while True:
    tmp = gs.get_results()
    if not tmp: # no more results were found
    thepage = urllib.request.urlopen(url)
    soup = BeautifulSoup(thepage, "html.parser")
    return soup.title.text


with open('designers.json') as data_file:
    data = json.load(data_file)

client = MongoClient()
db = client.FashionData
Designer = db['Designer']
print(db)

for i in data:
    try:
        designer_name = i['designer']
        website = list(search(designer_name, stop=1))[0]
        i['website'] = website
        db.Designer.insert({'name': designer_name, 'website': website})
        print(designer_name, website)
    except:
        print(designer_name)

    #break

#with open('designers.json', "w") as jsonFile:
#  json.dump(data, jsonFile)

cursor = Designer.find()
for document in cursor:
    print(document)
Example #47
def search_multiple_pages(query, link_amount, verbose=False, **kwargs):
    def __config_proxy(proxy_string):
        proxy_type_schema = {
            "http": httplib2.socks.PROXY_TYPE_HTTP,
            "socks4": httplib2.socks.PROXY_TYPE_SOCKS4,
            "socks5": httplib2.socks.PROXY_TYPE_SOCKS5
        }
        proxy_type = get_proxy_type(proxy_string)[0]
        proxy_dict = proxy_string_to_dict(proxy_string)
        proxy_config = httplib2.ProxyInfo(
            proxy_type=proxy_type_schema[proxy_type],
            proxy_host="".join(proxy_dict.keys()),
            proxy_port="".join(proxy_dict.values()))
        return proxy_config

    proxy, agent = kwargs.get("proxy", None), kwargs.get("agent", None)

    if proxy is not None:
        if verbose:
            logger.debug(
                set_color("configuring to use proxy '{}'...".format(proxy),
                          level=10))
        __config_proxy(proxy)

    if agent is not None:
        if verbose:
            logger.debug(
                set_color("settings user-agent to '{}'...".format(agent),
                          level=10))

    logger.warning(
        set_color(
            "multiple pages will be searched using Google's API client, searches may be blocked after a certain "
            "amount of time...",
            level=30))
    results, limit, found, index = set(), link_amount, 0, google_api.search(
        query, user_agent=agent, safe="on")
    try:
        while limit > 0:
            results.add(next(index))
            limit -= 1
            found += 1
    except Exception as e:
        if "Error 503" in str(e):
            logger.fatal(
                set_color(
                    "Google is blocking the current IP address, dumping already found URL's...",
                    level=50))
            results = results
            pass

    retval = set()
    for url in results:
        if URL_REGEX.match(url) and URL_QUERY_REGEX.match(url):
            if verbose:
                logger.debug(set_color("found '{}'...".format(url), level=10))
            retval.add(url)

    if len(retval) != 0:
        logger.info(
            set_color(
                "a total of {} links found out of requested {}...".format(
                    len(retval), link_amount)))
        write_to_log_file(list(retval), URL_LOG_PATH, "url-log-{}.log")
    else:
        logger.error(
            set_color("unable to extract URL's from results...", level=40))
Example #48
# -*- coding: utf-8 -*-

from google import search
from bs4 import BeautifulSoup
import urllib

for url in search("クック パッド 内紛 site:newspicks.com", stop=10, lang="en"):
    soup = BeautifulSoup(urllib.urlopen(url))
    print url
    print soup.find("title").text
Example #49
def url_cek():
    for url in google.search(dork, num=site_sayi, stop=1):
        dosya = open("urller.txt", "a+")
        dosya.write(url + "\n")
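
A variant of url_cek() that opens urller.txt once with a context manager instead of re-opening it (and never closing it) on every result:

def url_cek():
    with open("urller.txt", "a+") as dosya:
        for url in google.search(dork, num=site_sayi, stop=1):
            dosya.write(url + "\n")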
Example #50
if DEBUG3: print "    "+"\n    ".join(list(hostnames))


###################
## search hostnames
###################
if DEBUG2: print "Search Hostnames"

key_cnt = 0
for keyword in remain_keywords:
  key_cnt = key_cnt + 1
  if DEBUG3: print '  '+str(key_cnt)+': '+keyword

  cnt = 0
  try:
    for url in search(keyword, stop=5000, pause=60.0):
      cnt = cnt + 1
      hostname = get_hostname(url)

      ## don't save IP address -- we need hostname
      m = re.match("(\d+\.\d+\.\d+\.\d+)", hostname)
      if m is not None:
        continue
      
      if DEBUG3: print('  %d.%d: %s\n    %s' % (key_cnt, cnt, url, hostname))
      hostnames.add(hostname)

      if cnt % 100 == 0:
        store_output_files()
  except:
    print "  search exception! exit!"
Example #51
def pastebin_search(args, lookup, reportDir, apiKeyDir):

    userAgent = {'User-agent': 'Mozilla/5.0'}

    #return values
    pasteScrapeUrl = []
    pasteScrapeContent = []
    pasteScrapeResult = []

    # check for empty args
    if args.pastebinsearch is not None:

        for a in args.pastebinsearch:
            #init lists
            scrapeURL = []
            scrapeContent = []

            #iterate the lookup list
            for i, l in enumerate(lookup):

                #init textfiles
                scrapedFile = open(
                    reportDir + l + '/' + l + '_pastebin_content.txt', 'w')
                pasteUrlFile = open(
                    reportDir + l + '/' + l + '_pastebin_urls.txt', 'w')

                #show user what is being searched
                print '[+] Searching Pastebin for public pastes containing %s' % (
                    l)
                print '[i] May require a Pastebin Pro account for IP whitelisting'

                #run google query code
                try:
                    #iterate url results from search of dork arg and supplied lookup value against pastebin. return top 20 hits
                    for url in search(str(a) + ' ' + str(l) +
                                      ' site:pastebin.com',
                                      stop=20):
                        #delay 1 second to be polite
                        time.sleep(1)
                        #append results together
                        scrapeURL.append(url)
                        if args.verbose is True:
                            print '[+] Paste containing "%s" and "%s" found at: %s' % (
                                a, l, url)
                except Exception:
                    print '[-] Error dorking pastebin URLs, skipping...'
                    pasteScrapeResult.append('Error scraping Pastebin')
                    continue

                for u in scrapeURL:
                    #http://docs.python-guide.org/en/latest/scenarios/scrape/
                    try:
                        page = requests.get(u, headers=userAgent)
                        pasteUrlFile.writelines(u)
                    except:
                        print '[-] Error opening ' + u + ':'
                        pasteScrapeResult.append('Error opening %s' % u)
                        continue

                    #build html tree
                    tree = html.fromstring(page.content)

                    #if verbose spit out url, search term and domain searched
                    if args.verbose is True:
                        print '[+] Looking for instances of %s and %s in %s \n' % (
                            a, l, url)
                    #grab raw paste data from the textarea
                    rawPasteData = tree.xpath(
                        '//textarea[@class="paste_code"]/text()')

                    #search lines for lookup and keyword
                    for line in rawPasteData:
                        #regex for the lookup value (domain) in that line
                        #if re.search((str(l)), line):
                        if str(l) in line:
                            #if the argument search term is in the line
                            if a in line:
                                scrapedFile.writelines(a)

                return pasteScrapeResult
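
Pastebin also serves pastes as plain text at pastebin.com/raw/<key>, which avoids parsing the textarea; a minimal sketch using the same requests import, with the paste key taken from the scraped URL:

def fetch_raw_paste(paste_url):
    # e.g. https://pastebin.com/AbCdEf12 -> https://pastebin.com/raw/AbCdEf12
    paste_key = paste_url.rstrip('/').split('/')[-1]
    resp = requests.get('https://pastebin.com/raw/' + paste_key,
                        headers={'User-agent': 'Mozilla/5.0'})
    return resp.text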
Example #52
from google import search
import urllib
from bs4 import BeautifulSoup
import webbrowser
import pyperclip


def google_scrape(url):
    thepage = urllib.urlopen(url)
    soup = BeautifulSoup(thepage, "html.parser")
    return soup.title.text


i = 1
query = pyperclip.paste()

for url in search(query, stop=10):
    a = google_scrape(url)
    print str(i) + ". " + a
    print url
    webbrowser.open(url)
    i += 1
Example #53
# Remove extra whitespace and newlines
textData = textData.replace('\\n', '')
textData = ' '.join(textData.split())
# Split data, as Google only accepts queries of 32 words or less
textData = split(textData, 250)

# Loop through each query
for line in textData:
    queryNum += 1
    print("")

    deltaTime = time.time()
    print("Searching for query number " + str(queryNum))
    urls = []
    # This is where we search google for the urls
    for url in search('"' + line + '"', stop=3, num=3):
        urls.append(url)
        time.sleep(1)  # Avoid google detecting our bot
    if len(urls) == 0:
        print("No results found")
        continue
    print("Search completed in " + str(round(time.time() - deltaTime, 2)) +
          " seconds, beginning comparison")

    deltaTime = time.time()
    # Now we process each url
    for url in urls:
        # Optimize popular websites for text
        elementsToSearch = {}
        if "stackoverflow" in url:
            elementsToSearch = {"code"}
Example #54
    def google_it(site, dork):
        clear_cookie()
        for title in search(dork, stop=30):
            print(GR + ' [!] Site Found :> ' + B + title)
            time.sleep(0.5)
Example #55
]

with codecs.open("homepages.csv", "a", "utf8") as outfile:
    facultydictkeys = list(facultydict.keys())
    random.shuffle(facultydictkeys)
    for name in facultydictkeys:
        # Skip any homepages we have already in the database.
        if name in homepages:
            # ...unless it's a Google search page, then we will try again to fix it.
            match = re.search("www.google.com", homepages[name])
            if match == None:
                continue
        str = name + " " + facultydict[name]
        name = name.decode("utf8")
        # Grab first result from Google search.
        results = google.search(str, stop=1)
        actualURL = "FIXME"
        for url in results:
            actualURL = url
            matched = 0
            for t in trim:
                match = re.search(t, url)
                if match != None:
                    matched = matched + 1
            if matched == 0:
                break

        # Output the name and this resolved URL.
        match = re.search("www.google.com", actualURL)
        print(name)
        try:
Example #56
import newspaper
import google
import re
from bs4 import BeautifulSoup


therapists = ['Physiotherapist', 'respiratory therapist', 'Hematologist']
for p in therapists:
    search_results = google.search(p+" salary department of labor", stop=1, lang="en")
    print("*"*30)
    print(p.upper())
    print("_"*15)
    for link in search_results:
        #print(link)
        data = newspaper.Article(url=link)
        data.download()
        text=data.html
        soup = BeautifulSoup(text, 'html.parser')
        rows = soup.find_all('p')
        for tr in rows:
            if 'The median annual wage' in str(tr):
                print(str(tr).split(" ")[-4])
                break
        break
Example #57
	domain_name = domain_name[:-4]
if domain_name.endswith('.in'):
	domain_name = domain_name[:-3]'''

#print domain_name

if flag_no_title:
    query = domain_name + ' ' + title
else:
    extra = lolmax.hitString(7, web_url)
    query = domain_name + ' ' + extra

print "Query is : " + (query)
# Performs a search on google based on title + domain name
print("URLs are : ")
for url in search(query, stop=7):
    print(url)
    #print type(str(url))
    domain_found = get_tld(str(url))
    if (domain_name == domain_found):
        print(
            "========================================Match found====================================="
        )
        flag_found_in_list = True
    #Top URL's fetched on google search of title + web_url

if flag_found_in_list:
    print("Not a phishing site ! You may proceed !")
else:
    print("No match found. Phishing Site ! Do not proceed !")
Example #58
# Get IMDB, RT, Meta and Audience scores for a movie by name.
# Run this module from the terminal to enter a movie name and see its score.

from ratings.get_score import get_score
from google import search
import requests
from bs4 import BeautifulSoup

movie = input('Enter movie name : ')
imdb_url = rotten_url = None

# print('imdb:')
count = 0
for url in search(movie + ' imdb', stop=1):
    imdb_url = url
    break

# print('IMDb URL : ' + imdb_url)

# print('\nRT:')
for url in search(movie + ' rottentomatoes', stop=1):
    rotten_url = url
    break
#
print('IMDb URL : ' + str(imdb_url))
print('RT URL : ' + str(rotten_url))
# exit()

movie_score = get_score(imdb_url, rotten_url)
print('Film Score : ' + str(movie_score))
Example #59
result = []

file = codecs.open("example.txt", "ab+", "utf-8")
final = []

for w in mylist:
    try:
        if codew:
            article = wikipedia.summary(w,
                                        sentences=5,
                                        redirect=True,
                                        auto_suggest=True)
            file.write("<h2>" + w + "</h2><br><p>" + article + "</p>")
            file.write("<br>\n")
        if codeg:
            for url in search(w + ' wikipedia', stop=1):
                if "wikipedia" in url:
                    result.append(url)

            final.append(result[0])
            result[:] = []

            # if any(x in article for x in original):
            #     file.write("<h2>"+w+"</h2><br><p>"+article+"</p>")
            #     file.write("<br>\n")

    except:
        pass

if codeg:
    final = list(set(final))
Example #60
# Gets Google search results

from google import search
for url in search('josh voss linkedin', stop=20):
    print url