def dangerous_retrieve_urls(topic, num=10, safety=5.0):
    '''
    TODO: add an exception check -- if this method returns a list of size
    zero, Google has IP-blocked us and we should fall back to our backup
    crawler (via Google's official API).
    '''
    top_news, niche_news = [], []
    # Append " news" only if the topic does not already end with it
    if "news" != topic[-4:]:
        query = str(topic) + " news"
    else:
        query = str(topic)
    try:
        for top in g.search(query, tld='com', lang='en', num=num,
                            start=0, stop=num, pause=safety):
            if is_news_link(top):
                # Wrap each news link in a NewsWebsite object
                top_news.append(NewsWebsite(raw_link=top))
        for niche in g.search(query, tld='com', lang='en', num=num,
                              start=300, stop=300 + num, pause=safety):
            if is_news_link(niche):
                niche_news.append(NewsWebsite(raw_link=niche))
    except Exception:
        print "Google has blocked us"
        return None
    return top_news + niche_news
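# The docstring above describes a fallback that is not implemented yet. A
# minimal sketch of what that check could look like, assuming a hypothetical
# backup_api_search() helper that wraps Google's official Custom Search API:
def retrieve_urls_with_fallback(topic, num=10, safety=5.0):
    # Try the scraping-based retrieval first
    results = dangerous_retrieve_urls(topic, num=num, safety=safety)
    # An empty (or None) result suggests Google has IP-blocked the scraper,
    # so fall back to the hypothetical official-API backup crawler
    if not results:
        results = backup_api_search(str(topic) + " news")  # assumed helper
    return results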
def google_search(self):
    # Retrieve pages based on a domain search query
    # print "[*] Searching for email addresses in " + str(self.searchMax) + " sites and waiting " + str(self.delay) + " seconds between searches"

    # Search for emails based on the search string "@<DOMAIN>"
    print "[*] (PASSIVE) Searching for emails in Google search results: @\"" + self.domain + "\""
    googleResults = googlesearch.SearchGoogle(self.domain, self.searchMax, self.delay)
    emails = googleResults.process()
    if emails:
        for e in emails:
            self.allEmails.append(e)

    # Search for emails not within the domain's site (-site:<domain>)
    query = self.domain + " -site:" + self.domain
    print "[*] (PASSIVE) Searching for emails NOT within the domain's site: " + query
    for url in google.search(query, start=0, stop=self.searchMax, num=self.numMax,
                             pause=self.delay, extra_params={'filter': '0'}):
        self.queue.put(url)

    # Search for emails within the domain's site (site:<domain>)
    if self.active:
        query = "site:" + self.domain
        print "[*] (ACTIVE) Searching for emails within the domain's sites: " + self.domain
        for url in google.search(query, start=0, stop=self.searchMax, num=self.numMax,
                                 pause=self.delay, extra_params={'filter': '0'}):
            self.queue.put(url)
    else:
        print "[*] Active search (-a) not specified, skipping searching for emails within the domain's sites (*." + self.domain + ")"

    self.queue.join()
def get_suggestions(self, keywords, keyword_confidence):
    stackoverflow_query = keywords + " error stackoverflow"
    askubuntu_query = keywords + " error askubuntu"
    suggestions = []
    question_ids = []
    for url in search(stackoverflow_query, tld='es', lang='en', stop=5):
        hostname = urlparse.urlparse(url).hostname
        if hostname == "stackoverflow.com":
            path = urlparse.urlsplit(url).path
            pathx = str(path).split('/')
            question_ids.append(pathx[2])
    if len(question_ids) != 0:
        print "#DRAK : Fetched Stackoverflow Questions\n#DRAK : Fetching answers"
        suggestions.extend(self.so.get_suggestions(question_ids))
        print "#DRAK : Answers fetched successfully"
    question_ids = []
    for url in search(askubuntu_query, tld='es', lang='en', stop=5):
        hostname = urlparse.urlparse(url).hostname
        if hostname == "askubuntu.com":
            path = urlparse.urlsplit(url).path
            pathx = str(path).split('/')
            question_ids.append(pathx[2])
    if len(question_ids) != 0:
        print "#DRAK : Fetched AskUbuntu Questions\n#DRAK : Fetching answers"
        suggestions.extend(self.au.get_suggestions(question_ids))
        print "#DRAK : Answers fetched successfully"
    for suggestion in suggestions:
        suggestion.keyword_confidence = keyword_confidence
    return suggestions
def showresults():
    if re.match('^Who', request.form['search']):
        ggen = google.search(request.form['search'], stop=1)
        results = [google.get_page(link) for link in ggen]
        results = [soupify(x) for x in results]
        results = [findName(x) for x in results]
        return render_template("index.html", results=results)
    elif re.match('^When', request.form['search']):
        ggen = google.search(request.form['search'], stop=1)
        results = [google.get_page(link) for link in ggen]
        results = [soupify(x) for x in results]
        results = [findDate(x) for x in results]
        return render_template("index.html", results=results)
    return render_template("index.html", results="Please enter 'Who' or 'When'")
def get_tokens(query, src="google"):
    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
    if src == "wikipedia":
        print "Searching Wikipedia for " + query
        infile = opener.open('http://en.wikipedia.org/wiki/' + query)
        page = infile.read()
    else:
        print "Searching Google for " + query
        page = ""
        results = google.search(query, "com", "en", 1, 0, 1, 2.0)
        for result in results:
            print "on " + result
            page = google.get_page(result)
    raw = nltk.clean_html(page)
    # Parse into tokens and save them as lowercase
    tokens = map(lambda x: x.lower(), word_tokenize(raw))
    # Remove punctuation and empty strings
    tokens = [s.translate(None, string.punctuation) for s in tokens]
    tokens = [s for s in tokens if s]
    return tokens
def get_pubmed_id(pdb_id):
    try:
        url = 'http://www.rcsb.org/pdb/explore.do?structureId=' + pdb_id.lower()
        source = urllib.request.urlopen(url).read()
        soup = bs4.BeautifulSoup(source, 'html5lib')
        string = str(soup.find('meta', {'name': 'description'}))
        pattern = re.compile('<meta content="[0-9A-Za-z]{4}:\s+(.+)" name')
        article = pattern.match(string).group(1)
    except BaseException:
        print(pdb_id.lower() + ': ' + "Something's wrong")
        return np.nan
    for counter in range(1):
        try:
            if counter > 0:
                time.sleep(np.random.random_integers(10, 30))
            links = [url for url in search(article, stop=40)]
            pmids = [split[-1] if split[-2] == 'pubmed' else ''
                     for split in [link.split('/') for link in links]]
            global pm
            pm = pmids[np.where(np.array(pmids) != '')[0][0]]
        except BaseException as e:
            if '503' in str(e):
                raise Exception('Seems google blocked you')
            continue
        else:
            break
    print(pdb_id.lower() + ': ' + pm)
    return pm
def get_wiki_url(term):
    try:
        urls = search(str(term) + ' wikipedia')
        # results = wikipedia.search(term)
        # except wikipedia.exceptions.DisambiguationError as e:
        #     print e.options
        while True:
            try:
                url = urls.next()
            except:
                print "Cannot find wiki page for this term, putting as Unknown ", term
                return '', 'Unknown'
            if url.find('wikipedia') != -1:
                # Found a Wikipedia page; keep the last path segment as the page name
                subterms = re.split("/", url)
                wiki_name = subterms[len(subterms) - 1]
                # try:
                #     wikipedia.summary(wiki_name)
                # except wikipedia.exceptions.DisambiguationError as e:
                #     wiki_name = handle_disambiguation(wiki_name, e)
                break
    except ValueError:
        print "Error in google query for term: ", term
        url = ''
        wiki_name = ""
    return url, wiki_name
def whenQuery(s):
    """
    TL;DR: Searches the input on Google for "when" questions.

    Arguments:
        s (string) - A query string; what you are searching for.
            Ex. "when did World War I start"

    Returns:
        A list of date strings extracted from the HTML of the top Google
        search results for the query.
    """
    results = google.search(s, num=5, start=0, stop=5)
    r = []
    for pg in results:
        r.append(pg)
    dates = []
    for n in range(0, len(r)):
        url = urllib2.urlopen(r[n])
        page = url.read()
        soup = bs4.BeautifulSoup(page, "html.parser")
        raw = soup.get_text()
        text = re.findall("\d{2}(-|\/)?\d{2}(-|\/)?\d+", raw)
        for group in text:
            for i in group:
                if i != unicode('') and i != unicode(' ') and i != unicode('\n'):
                    dates.append(i)
    return dates
def on_message(ws, message):
    message = json.loads(message)
    for i in message:
        unicodedata.normalize('NFKD', i).encode('ascii', 'ignore')
    if message['cmd'] == 'chat':
        if message['text'].lower() == '|ebear' or message['text'].lower() == '|eb':
            _thread.start_new_thread(run, ())
        if message['text'].lower() == '|ebear -s' or message['text'].lower() == '|eb -s':
            _thread.start_new_thread(run, ((1,)))
        elif message['text'].lower() == '|source':
            ws.send(json.dumps({"cmd": "chat", "text": ("%s") % comm['|source']}))
        elif message['text'].lower() == '|help' or message['text'].lower() == '|h':
            ws.send(json.dumps({"cmd": "chat", "text": ("%s") % comm['|help']}))
        elif message['text'].lower()[:3] == '|g ':
            if len(message['text']) > 3:
                ws.send(json.dumps({"cmd": "chat", "text": google.search(message['text'][3:])}))
            else:
                ws.send(json.dumps({"cmd": "chat", "text": "Usage is |g \"string\""}))
        elif message['text'].lower() == '|g':
            ws.send(json.dumps({"cmd": "chat", "text": "Usage is |g \"string\""}))
        elif message['text'].lower()[:4] == '|ed ':
            if len(message['text']) > 4:
                ws.send(json.dumps({"cmd": "chat", "text": ED.search(message['text'][4:])}))
            else:
                ws.send(json.dumps({"cmd": "chat", "text": "Usage is |ed \"string\""}))
        elif message['text'].lower() == '|ed':
            ws.send(json.dumps({"cmd": "chat", "text": "Usage is |ed \"string\""}))
    afk(message)
def run(self, keywords=[]):
    if not keywords:
        # Fall back to the default keyword file if it exists
        if not os.path.isfile(self.default_keyword_file):
            return False
        else:
            keywords = []
            fp = open(self.default_keyword_file, "r")
            for line in fp.readlines():
                keywords.append(line.strip())
            fp.close()
    self.keywords = keywords
    print "Using Keywords:{0}".format(self.keywords)
    try:
        # Get the hits for the given keywords
        for keyword in self.keywords:
            print "KEYWORD:{0}".format(keyword)
            for url in search(keyword, stop=self.maxResuts):
                print "Found URL:{0}".format(url)
                self.urls.append(url)
    except:
        print "Something went wrong scraping Google."
        print "Scraping has stopped"
    return True
def strip(q):
    """
    Args:
        q: query string
    Returns:
        A list of up to 15 result pages, each stripped of HTML separately
    """
    r = google.search(q, num=15, start=0, stop=15)
    l = []
    for result in r:
        l.append(result)
    text = []
    for url in l:
        try:
            req = urllib2.Request(url)
            u = urllib2.urlopen(req)
            page = u.read()
            soup = bs4.BeautifulSoup(page, 'html')
            raw = soup.get_text()
            reg = re.sub("[\t\n ]", " ", raw)
            text.append(reg)
        except:
            pass
    return text
def Websarch(self, requet, numberResu):
    """
    Run a Google web search for the given request string and return the
    result URLs. Based on the approach described at
    http://breakingcode.wordpress.com/2010/06/29/google-search-python/
    """
    urls = search(requet, stop=numberResu)
    return urls
def main():
    # Get a list with the first NUM_URLS results
    urls = []
    for url in search(SEARCH_CRITERIA, stop=NUM_URLS):
        urls.append(str(url))
    # Filter out results that belong to the same domain
    urls = not_duplicate(urls)
    # Go to each web page and gather data
    data = gather_data(urls)
    # Create a CSV file with the data just gathered
    send_to_csv(data)
    # DONE!
    print "DONE"
def getQuery(query):
    pages = google.search(query, start=0, stop=10)
    q = findQuery(query.lower())
    if q is None:
        return "Your query needs to contain either who, when, or where"
    names = {}
    for p in pages:
        try:
            url = urllib2.urlopen(p)
        except:
            continue
        try:
            page = url.read().decode('ascii', 'ignore')
        except httplib.IncompleteRead as e:
            page = e.partial
        soup = BeautifulSoup(page, 'html.parser')
        pnames = []
        if q == 1:
            list(map(lambda n: pnames.extend(list(findNames(n))),
                     soup.find_all(string=re.compile('[A-Z][a-z]+ [A-Z][a-z]+'))))
        elif q == 2:
            list(map(lambda n: pnames.extend(list(findDates(n))),
                     soup.find_all(string=re.compile('((?:(?:January|February|March|April|May|June|July|August|September|October|November|December) (?:[1-3]?[1-9], )?)?[1-9]{4})'))))
        elif q == 3:
            list(map(lambda n: pnames.extend(list(findPlaces(n))),
                     soup.find_all(string=re.compile('(?:[0-9]+ [A-Z][a-z]+)|(?:[A-Z][a-z]+)|(?:[0-9]+ [A-Z][a-z]+ [A-Z][a-z]+)|(?:[A-Z][a-z]+ [A-Z][a-z]+)|(?:[A-Z][a-z]+, [A-Z][a-z]+)|(?:[A-Z][a-z]+ [A-Z][a-z]+, [A-Z][a-z]+)'))))
        for n in pnames:
            if n in names:
                names[n] += 1
            else:
                names[n] = 1
    sortedNames = sorted(names.items(), key=lambda x: x[1], reverse=True)
    try:
        return sortedNames[0][0]
    except:
        return "Invalid Query"
def main():
    results = []
    wiki = "site:en.wikipedia.org "
    searchquery = []
    toopen = 'C:/users/iceman/Desktop/' + 'list1.txt'  # Input file of search queries
    filewriter = open(toopen, "r")
    for line in filewriter.readlines():
        line = re.sub(r'\s', '', line)
        print line
        searchquery.append(line)
    i = 0
    filename = path + 'search_results' + ".txt"
    FILE = open(filename, "w")
    for term in searchquery:
        time.sleep(1)
        for url in search(wiki + term, stop=13):
            flag = 0
            print url
            # Ignore image files (.jpg, .png, jpeg) and URLs containing '%'
            if url.find(".jpg") > 0 or url.find(".png") > 0 or url.find("jpeg") > 0 or url.find("%") > 0:
                flag = 1
            # Ignore category pages
            if url.find("Category:") == -1 and flag == 0:
                cleaned = cleanstring(url)
                print cleaned
                FILE.writelines(cleaned)  # Write the result to the file
def get_hot_from_subreddit(message, raw):
    if raw:
        return
    print "Getting from subreddit..."
    try:
        return ["Hot from /r/{}! (Found in Reddit) || ".format("".join(message["arguments"][2:])) +
                " | ".join([str(x) for x in
                            r.get_subreddit("".join(message["arguments"][2:]).lower())
                             .get_hot(limit=min([int(message["arguments"][1]), 10]))])]
    except IndexError:
        return "Syntax: webabout <num_hots> <topic>"
    try:
        gs = google.search(" ".join(message["arguments"][2:]),
                           num=min([int(message["arguments"][1]), 5]),
                           start=0,
                           stop=min([int(message["arguments"][1]), 5]),
                           pause=2)
    except IndexError:
        return "Syntax: webabout <num_hots> <topic>"
    print "Got Google search!"
    results = []
    for website in gs:
        request = requests.get(str(website))
        print "Parsing website: {}".format(str(website))
        try:
            results.append("{} -> {}".format(
                str(website.encode("utf-8")),
                BeautifulSoup.BeautifulSoup(request.text.encode("utf-8")).title.string.encode("utf-8")))
        except AttributeError:
            results.append(website.encode("utf-8") + " (No URL!)")
    return "Hot about {}! (Found in Google) || ".format(" ".join(message["arguments"][2:])) + " | ".join(results)
def get_google_result(search_keywords):
    if search_keywords == 'help':
        help_message = ("To use this bot start message with @google "
                        "followed by what you want to search for. If "
                        "found, Zulip will return the first search result "
                        "on Google. An example message that could be sent is: "
                        "'@google zulip' or '@google how to create a chatbot'.")
        return help_message
    else:
        try:
            urls = search(search_keywords, stop=20)
            # Quick connectivity check against a known Google IP
            urlopen('http://216.58.192.142', timeout=1)
        except http.client.RemoteDisconnected as er:
            logging.exception(er)
            return 'Error: No internet connection. {}.'.format(er)
        except Exception as e:
            logging.exception(e)
            return 'Error: Search failed. {}.'.format(e)
        if not urls:
            return 'No URLs returned by google.'
        url = next(urls)
        return 'Success: {}'.format(url)
def whoAnswer(question, n):
    # Returns the top n occurrences of names found in the results
    numPages = 10  # uses top 10 results
    links = google.search("Who " + question, num=numPages, start=0, stop=0, pause=2.0)
    txt = ""
    for i in range(numPages):
        txt += BeautifulSoup(google.get_page(links.next())).getText()
    return getNames(txt, n)
def find(q):
    for url in google.search(q, num=5, stop=10):
        u = urlopen(url)
        txt = BeautifulSoup(u.read())
        # Read the text inside the body tags
        txt = txt.body
        if txt is not None:
            txt = txt.prettify().encode('UTF-8')
            if "who" in q.lower():
                names = regex.findNames(txt)
                addVals(names)
            elif "when" in q.lower():
                dates = regex.findDates(txt)
                print dates
                addVals(dates)
    strresults = ""
    for i in narrow(results):
        print i  # Why breaks no work
        strresults += i + " OR "
    # Trim the trailing " OR "
    if len(strresults) > 4:
        strresults = strresults[0:len(strresults) - 4]
    results.clear()
    return strresults
def download_documents(query):
    """Download .doc files for the given search query from Google.

    Args:
        query (string): Search query
    """
    # Modify the Google search query to target .doc files
    query += " filetype:doc"
    doc_count = 0
    for url in search(query, stop=50):
        # Clean the encoded URL to build a filename
        filename = urllib.unquote(url).split('/')[-1]
        print "\nDownloading: \nFilename: %s\nURL: %s" % (filename, url)
        urllib.urlretrieve(url, filename)
        doc_count += 1
        # Hacky check to get the desired number of docs
        global NUM_DOCS_DOWNLOAD
        if doc_count == NUM_DOCS_DOWNLOAD:
            break
def search():
    """
    Performs a search if given a query via a GET parameter (query) and
    returns a rendered Jinja template.

    Params:
        none

    Returns:
        A rendered Jinja template (index.html) with the following parameters:
            QUERY = query
            RESULT = result
    """
    if "query" not in request.args:
        return render_template("index.html")
    # Get the query from the form
    query = request.args["query"]
    # Run a Google search for the query
    gsearch_res = list(google.search(query, num=10, start=0, stop=10))
    # Get the web page and strip the tags. Note that 'wgotten' is a pun on
    # the wget command for retrieving webpages from the command line.
    wgotten = urllib2.urlopen(gsearch_res[0])
    webpage = wgotten.read()
    beautified_soup = bs4.BeautifulSoup(webpage, 'html')
    # Run RegEx parsing
    result = regex.parse(query, beautified_soup)
    # Return the rendered Jinja template
    return render_template("index.html", QUERY=query, RESULT=result)
def __init__(self, query='tea', topVectRaw='topvect.csv'):
    # self.mode = mode
    self.start_urls = list(search(query, stop=20))
    # if mode == 'tfidf':
    #     with open('..\iphigeni\webidf.csv') as infile:
    #         vectRead = reader(infile)
    #         next(vectRead)
    #         for line in vectRead:
    #             self.idf[line[0]] = float(line[1])
    # if mode == 'svec':
    #     with open(self.svecFilename) as infile:
    #         k = 1
    #         for line in infile:
    #             self.svecdict[line.split()[0]] = k
    #             k += 1
    #     self.svecfile = linereader.dopen(self.svecFilename)

    # Use the veblen spider to generate topVectRaw
    with open(topVectRaw) as infile:
        vectRead = reader(infile)
        next(vectRead)
        # if mode in ['tf', 'tfidf']:
        for line in vectRead:
            try:
                self.topic[line[1]] += int(line[0])
            except KeyError:
                self.topic[line[1]] = int(line[0])
def createDiskList(self):
    print self.name
    # Search with Google, following:
    # https://breakingcode.wordpress.com/2010/06/29/google-search-python/
    for url in search(keyword + ' site:' + self.url, num=RESULTS, start=0, stop=RESULTS):
        if url.find('.xml') == -1:
            print ' ', url
            soup = BeautifulSoup(urllib.urlopen(url))
            tdatetime = dt.now()
            tstr = tdatetime.strftime('%Y/%m/%d-%H:%M')
            f = open(self.name + '_test.dat', 'w')
            f.write(soup.prettify())
            print 'write ', self.name + '_' + tstr + '.html'
            f.close()
            try:
                title = soup.find('title').text
            except AttributeError:
                print 'AttributeError:'
            except:
                print 'Unexpected error', sys.exc_info()[0]
            else:
                self.disk.append(Disk(title, url))
                try:
                    print ' > ', title
                except UnicodeEncodeError:
                    print "UnicodeEncodeError", "TODO: need to replace illegal words to display on the terminal"
def print_link(self):
    with IPTV(stdout=devnull, stderr=devnull):
        for url in google.search(self.query, num=60, stop=1):
            parsed = urlparse(url)
            self.parsedUrls.append(parsed.scheme + '://' + parsed.netloc + "\n")
            time.sleep(1)
    print '\n'.join(self.parsedUrls)
def get_wiki_url(film_info, year):
    film_query = "%s %s wikipedia" % (str(film_info['film']), year)
    print film_query
    search_results = []
    for url in search(film_query, stop=20):
        search_results.append(url)
    return search_results[0]
def get_company_domain(self, searchKey):
    '''Looks for the company website among the top five URLs returned by a
    Google search. If the company website is found, parses the URL to get
    the domain name.
    '''
    search_result = search(searchKey, stop=5)
    for url in search_result:
        keywords = searchKey.split(" ")
        print keywords
        if keywords[0] in url.lower():
            # If the link is a Wikipedia link, parse the page to get the
            # company homepage
            if "en.wikipedia.org" in url:
                chomepage = get_company_website(url)
                if chomepage is not None:
                    return extract_domainname(chomepage)
            return extract_domainname(url)
        try:
            htmlpage = get_page(url)
            soup = BeautifulSoup(htmlpage)
            title = soup.title.text.lower()
            if keywords[0] in title:
                return extract_domainname(url)
        except:
            print searchKey.ljust(52) + ": Can't parse web page at " + colored(url.ljust(100), 'blue')
def submit_err(error_msg, no_solution):
    print "\n"
    print "Please wait ................ "
    print "Pyoverflow is checking for the top solutions for your code problems"
    print ":)"
    print "\n"
    try:
        search_word = "python" + " " + str(error_msg)
        for url in search(search_word, stop=2):
            search_result.append(url)
    except Exception as e:
        print e
        sys.exit(0)
    try:
        if int(no_solution) > 0:
            for i in range(0, int(no_solution)):
                print "Opening" + "\t" + str(i) + " solution in browser"
                webbrowser.open_new_tab(search_result[i])
        else:
            print "Number of solutions should be > 0"
    except Exception as e:
        print e
        sys.exit(0)
def get_prospect(s, pool, company, name, title, domain):
    logging.debug('Waiting to join the pool')
    with s:
        thread_name = threading.currentThread().getName()
        pool.makeActive(thread_name)
        if not len(domain):
            try:
                domainsearch = google.search(company)
                domain = domainsearch.next()
                time.sleep(0.1)
                while 'wiki' in domain.lower() and not 'wiki' in company:
                    domain = domainsearch.next()
                # Strip a leading "www." or scheme prefix and a trailing slash
                w = domain.find('www')
                if w >= 0:
                    domain = domain[w + 4:]
                w = domain.find('http')
                if w >= 0:
                    domain = domain[w + 7:]
                if domain[-1] == '/':
                    domain = domain[:-1]
            except:
                e = sys.exc_info()[0]
                print "ERROR: ", str(e)
        outrow = [name, company, domain, title]
        print outrow
        pool.addPerson(outrow)
        pool.makeInactive(thread_name)
def showsome(searchfor):
    out = []
    for url in search(searchfor, stop=5):
        out.append(url)
    return out
def search_who(question):
    g = search(question, num=5, stop=5)
    info = []
    htmls = [x for x in g]
    for url in htmls:
        u = urlopen(url)
        item = BeautifulSoup(u.read())
        item = item.get_text().replace("\n", " ")
        info.append(item)
    people = {}
    pages = []
    for a in info:
        pages.append(re.findall("[A-Z][a-z]+\s[A-Z][a-z]+", a))
        # pages.append(re.findall("[A-Z][a-z]+\s([A-Z][a-z]+\s)?[A-Z][a-z]+", a))
    for p in pages:  # list of lists
        for peep in p:
            firstLast = peep.split()  # "Angela Lin" -> ["Angela", "Lin"]
            if firstLast[0] in firstDic.keys() and firstLast[1] in surnameDic.keys():
                if peep not in people.keys():
                    people[peep] = 1
                else:
                    people[peep] += 1
    return max(people, key=people.get)
def Tell_Me_Alfred(query="The Himalayas are", answer_type="Description"):
    global ALL_RESULTS
    global ALL_ANSWERS_SORTED
    global ALL_ANSWERS
    ALL_RESULTS = []
    ALL_ANSWERS = dict()
    for url in search(query, stop=20):
        try:
            ALL_RESULTS.append(url)
        except:
            print "URL Error"
    print 'YOUR TOP ANSWERS ARE:'
    c = 0.0
    for res in ALL_RESULTS:
        Exact_Match_Found_flag = 0
        try:
            timeout = 0
            response = urllib2.urlopen(res)
            page_data = response.read()
            page_data = BeautifulSoup(page_data)
            page_data = page_data.get_text()
            page_data = page_data.split('.')
            # Read from individual web pages
            if answer_type == "Description":
                Start_T = time.time()
                for line in page_data:
                    Curr_T = time.time()
                    if (Curr_T - Start_T) > 15.0:
                        break
                    if re.findall(query.lower(), line.lower()) != []:
                        c += 1.0
                        line_low = line.lower()
                        line = line_low.split(query.lower())
                        print '==============================================================================='
                        print 'Answer ', c, ':'
                        line = query + line[1] + '.'
                        print line
                        print '\n\nSource: ', res
                        print '==============================================================================='
                        Exact_Match_Found_flag = 1
                        break
            elif answer_type == "Location":
                query_parts = query.split(' ')
                Start_T = time.time()
                for line in page_data:
                    Curr_T = time.time()
                    if (Curr_T - Start_T) > 30.0:
                        break
                    check_next = 0
                    for each_qp in query_parts:
                        if re.findall(each_qp.lower(), line.lower()) == []:
                            check_next = 1
                            break
                    if check_next == 1:
                        continue
                    else:
                        line_parts = line.split(' ')
                        for each_lp in line_parts:
                            # Skip the query words themselves
                            if (each_lp in query_parts) or (each_lp in IGNORE_LIST):
                                continue
                            if check_WordNet(word=each_lp, def_word='city') or \
                               check_WordNet(word=each_lp, def_word='country') or \
                               check_WordNet(word=each_lp, def_word='continent') or \
                               check_WordNet(word=each_lp, def_word='state'):
                                c += 1.0
                                print each_lp
                                if not ALL_ANSWERS.has_key(each_lp):
                                    ALL_ANSWERS[each_lp] = 1
                                else:
                                    ALL_ANSWERS[each_lp] += 1
                                Exact_Match_Found_flag = 1
                                break
                    if Exact_Match_Found_flag:
                        break
        except:
            print
    # Give a probability for one-word answers
    if answer_type == "Location":
        ALL_ANSWERS_SORTED = []
        all_ans = ALL_ANSWERS.keys()
        for each_ans in all_ans:
            ALL_ANSWERS_SORTED.append([ALL_ANSWERS[each_ans], each_ans])
        ALL_ANSWERS_SORTED.sort()
        print '==============================================================================='
        print 'SUMMARY:'
        print '---------------------------------------------------------------------------'
        for each_sa in range(0, len(ALL_ANSWERS_SORTED)):
            idx = len(ALL_ANSWERS_SORTED) - 1 - each_sa
            print ALL_ANSWERS_SORTED[idx][1]
            print 'Confidence Measure= ', (ALL_ANSWERS_SORTED[idx][0] / c * 100.0), '%'
            print '---------------------------------------------------------------------------'
        print '==============================================================================='
def wechat(request):
    if request.method == 'GET':
        # Verify the request is legitimate
        # Extract the basic fields from the request (signature, timestamp, nonce, xml)
        signature = request.GET.get('signature')
        timestamp = request.GET.get('timestamp')
        nonce = request.GET.get('nonce')
        if not wechat_instance.check_signature(
                signature=signature, timestamp=timestamp, nonce=nonce):
            return HttpResponseBadRequest('Verify Failed')
        return HttpResponse(
            request.GET.get('echostr', ''), content_type="text/plain")

    # Parse the XML data of this request
    try:
        wechat_instance.parse_data(data=request.body)
    except ParseError:
        return HttpResponseBadRequest('Invalid XML Data')

    # Get the parsed WeChat message
    message = wechat_instance.get_message()

    if isinstance(message, TextMessage):
        # Content of the current conversation
        content = message.content.strip()
        if content == '博客' or content == 'blog' or content == '最新':
            return HttpResponse(wechat_instance.response_news(get_new_blogposts(request)),
                                content_type="application/xml")
        if content == '功能':
            reply_text = (
                '目前支持的功能:最新博客、博客搜索\n' +
                '回复【最新】或者【博客】,将返回最新博客' +
                '回复【search:ionic】,将搜索ionic相关内容 \n' +
                '正在实现: Google、Wiki'
            )
            response = wechat_instance.response_text(content=reply_text)
            return HttpResponse(response, content_type="application/xml")
        if 'wiki' in content or '维基' in content:
            import wikipedia
            wiki_content = content.replace("wiki:", "")
            print wiki_content
            wiki = wikipedia.page(wiki_content)
            print wiki.title, wiki.summary, wiki.url
            message = [{
                'title': wiki.title,
                'picurl': '',
                'description': wiki.summary,
                'url': wiki.url
            }]
            return HttpResponse(wechat_instance.response_news(message),
                                content_type="application/xml")
        if 'google' in content:
            from google import search
            for url in search(content.replace('google:', ''), stop=1):
                print(url)
            return HttpResponse(wechat_instance.response_news(get_new_blogposts(request)),
                                content_type="application/xml")
        if 'search' in content:
            keyword = content.replace('search:', '')
            blogpost = BlogPost.objects.search(keyword)[:5]
            messages = blogpost_to_array(blogpost)
            messages.append({
                'title': '在https://www.phodal.com/上查看所有结果',
                'picurl': 'https://www.phodal.com/static/media/uploads/search.jpg',
                'url': 'http://www.phodal.com/search/?q=' + keyword
            })
            return HttpResponse(wechat_instance.response_news(messages),
                                content_type="application/xml")
        else:
            response = get_new_blogposts(request)
            message = {
                'title': '稍等:Phodal君正在实现功能中。正在为你返回最新文章。',
                'picurl': 'https://www.phodal.com/static/phodal/images/bg.jpg',
                'description': '稍等:Phodal君正在实现功能中。现在为你返回最新文章。',
                'url': 'https://www.phodal.com/',
            }
            response.insert(0, message)
            return HttpResponse(wechat_instance.response_news(response),
                                content_type="application/xml")
    elif isinstance(message, VoiceMessage):
        reply_text = '语音信息我听不懂/:P-(/:P-(/:P-('
    elif isinstance(message, ImageMessage):
        reply_text = '图片信息我也看不懂/:P-(/:P-(/:P-('
    elif isinstance(message, VideoMessage):
        reply_text = '视频我不会看/:P-('
    elif isinstance(message, LinkMessage):
        reply_text = '链接信息'
    elif isinstance(message, LocationMessage):
        reply_text = '地理位置信息'
    elif isinstance(message, EventMessage):
        if message.type == 'subscribe':
            reply_text = '感谢您的到来!回复【功能】返回使用指南'
            if message.key and message.ticket:
                reply_text += '\n来源:二维码扫描'
            else:
                reply_text += '\n来源:搜索公众号名称'
        elif message.type == 'unsubscribe':
            reply_text = '取消关注事件'
        elif message.type == 'scan':
            reply_text = '已关注用户扫描二维码!'
        elif message.type == 'location':
            reply_text = '上报地理位置'
        elif message.type == 'click':
            reply_text = '自定义菜单点击'
        elif message.type == 'view':
            reply_text = '自定义菜单跳转链接'
        elif message.type == 'templatesendjobfinish':
            reply_text = '模板消息'
    else:
        reply_text = '稍等:Phodal君正在实现功能中。'

    response = wechat_instance.response_text(content=reply_text)
    return HttpResponse(response, content_type="application/xml")
#!/usr/bin/env python
# Print and save every result URL for a site: query against s2.lmcdn.fr
from google import search

for url in search('site:http://s2.lmcdn.fr/multimedia/', stop=5130):
    print(url)
    myfile = open("urls.txt", "a")
    myfile.write(str(url) + "\n")
    myfile.close()
    b = b[4:6]
    if b[1] != '0':
        b = b[0]
    b = int(b)
    if b > 2:
        print(1)
    else:
        print(-1)
except:
    print(-1)

# Google search: check whether the host appears in results for "info<url>"
c = 0
query = "info" + url
for j in search(query, tld="co.in", num=10, stop=1, pause=0):
    tsd, td, tsu = extract(j)
    hurl = td + '.' + tsu
    if hurl == host:
        c += 1
if c != 0:
    print(1)
else:
    print(1)

# Number of links pointing to the page
print('NA')

# PhishTank database
import requests
from bs4 import BeautifulSoup
from google import search
import urllib

query = raw_input()
quer = query + " tutorials point"
add = []
for j in search(quer, tld="com", num=1, stop=1, pause=2):
    add.append(j)

if "https://www.tutorialspoint.com" in add[0]:
    url = "https://www.tutorialspoint.com/" + str(query) + "/"
    page = requests.get(url)
    soup = BeautifulSoup(page.content, "html.parser")
    data = soup.find("button", class_="btn btn-default btn-sm btn-buy-tutorial")
    data = data.find('a')['href']
    url = "https://www.tutorialspoint.com" + data
    print url
    page = requests.get(url)
    soup = BeautifulSoup(page.content, "html.parser")
    data = soup.find_all("h1")
    rcf = []
    for i in data:
        rcf.append(i.find("a"))
    url = "https://tutorialspoint.com" + rcf[2]['href']
    urllib.urlretrieve(url, query + ".pdf")
else:
    print "Sorry no results found"
import urllib.request
from bs4 import BeautifulSoup
from google import search

lst = []
all_links = ['http://www.gemschicago.org Tuition',
             'https://www.latinschool.org Tuition',
             'http://www.nordangliaeducation.com Tuition',
             'http://www.fwparker.org Tuition']

for link in all_links:
    for url in search(link, lang='en', stop=3):
        lst.append(url)
    search_link = lst[0]
    request = urllib.request.Request(search_link)
    response = urllib.request.urlopen(request)
    soup = BeautifulSoup(response, "html.parser")
    print('---------------------------------------------------------------')
    print(search_link)
    for table_tag in soup.find_all('td'):
        print(table_tag.text)
    lst = []
def googleSearch():
    lol = raw_input(color.UNDERLINE + "" + T + "Query>" + color.END)
    for url in search(lol, tld='com', lang='es', stop=50):
        print("" + G + "Site: " + W + "" + url)
def search(what):
    # Return only the first result URL
    for url in google.search(what, lang='ru', stop=5):
        return url
from google import search
import pprint

KEYWORDS = 'duyetdev'
data = []
for d in search(KEYWORDS, tld='com.vn', lang='vi', stop=10):
    data.append(d)

pprint.pprint(data)
# This script receives a dictionary file as an input argument from the user and
# displays the relevant search results, filtering out YouTube links.
# Can be useful in some cases.

import re        # regular-expression library, used here for pattern matching
import sys
import google
import urllib2

if len(sys.argv) < 2:
    # Validate the argument count supplied by the user
    print sys.argv[0] + ": <dict>"
    sys.exit(2)

fo = open(sys.argv[1])
for word in fo.readlines():
    print "\n searching the " + word.strip()
    results = google.search(word.strip())
    try:
        for implink in results:
            if re.search("youtube", implink) is None:
                print implink
    except urllib2.HTTPError, e:
        print "Search not found : " + str(e)

# program ends
# happy hacking :)
## Part 1
# Get the Google search results and write the result URLs to one file per newspaper.
from google import search

search_term = "islamic state"
Newspapers = [
    "champress.net",
    "timesofindia.indiatimes.com",
    "aawsat.net",
    "english.people.com.cn",
    "bbc.co.uk",
    "tehrantimes.com",
    "dw.de"
]

for news in Newspapers:
    gs = search_term + " site:" + news
    f = open("/Users/Aseel/FinalProject/" + search_term + "/" + news + ".txt", "w")
    for url in search(gs, num=100, stop=1000):
        f.write(url)
        f.write("\n")
    f.close()
def main():
    try:
        print "\n\\\\\\\ Welcome to GoooFu //////// \n"
        print "WARNING: Executing this script multiple times may cause Google to block your IP address"

        # FILE CREATION --------------------------------------------------------------------------------
        fName = str(raw_input("Enter filename to save output to: "))
        fileName = fName + '.html'
        f = open(fileName, "w+")
        print "Search results for 'site' & 'string' directive: \n"

        # READING VALUES -------------------------------------------------------------------------------
        searchSite = str(raw_input("Enter the URL: "))
        searchString = str(raw_input("Enter your search string: "))
        searchPage = str(raw_input("\nEnter the page you want to check for: "))
        fileType = str(raw_input("\nEnter filetype to search: "))

        # SEARCH SCRIPT FOLLOWS ------------------------------------------------------------------------
        print "\nPlease wait. Fetching .. .."
        f.write('<h1>Site: ' + searchSite + ' & String: ' + searchString + ' search output:</h1>')
        query = 'site:' + searchSite + ' ' + searchString
        try:
            for j in search(query, tld="co.uk", num=5, stop=1, pause=5):
                print(j)
                f.write('<p> <a href="' + j + '">' + j + '</a> </p>' + '\n')
        except:
            print "HTTP Error"

        print "\nSearch results with 'inurl' directive: "
        print "\nPlease wait. Fetching .. .."
        f.write('<h1>Site: ' + searchSite + ' & Page: ' + searchPage + ' search output:</h1>')
        query2 = 'inurl:' + searchPage + ' ' + 'site:' + searchSite
        try:
            for i in search(query2, tld="co.au", num=5, stop=1, pause=5):
                print(i)
                f.write('<p> <a href="' + i + '">' + i + '</a> </p>' + '\n')
        except:
            print "HTTP Error"

        print "\nFiletype search on supplied URL: "
        print "\nPlease wait. Fetching .. .."
        f.write('<h1>Site: ' + searchSite + ' & File type: ' + fileType + ' search output:</h1>')
        query4 = 'site:' + searchSite + ' filetype:' + fileType
        try:
            for l in search(query4, tld="co.za", num=5, stop=1, pause=5):
                print(l)
                f.write('<p> <a href="' + l + '">' + l + '</a> </p>' + '\n')
        except:
            print "HTTP Error"

        print "\nExecuting search string 'allintitle' search:"
        print "\nPlease wait. Fetching .. .."
        f.write('<h1>Allintitle search output:</h1>')
        query5 = 'allintitle:' + searchString
        try:
            for m in search(query5, tld="co.in", num=5, stop=1, pause=4):
                print(m)
                f.write('<p> <a href="' + m + '">' + m + '</a> </p>' + '\n')
        except:
            print "HTTP Error"

        print "\nExecuting search string 'intitle' search:"
        print "\nPlease wait. Fetching .. .."
        f.write('<h1>Intitle search output:</h1>')
        query6 = 'intitle:' + searchString
        try:
            for n in search(query6, tld="co.vi", num=5, stop=1, pause=5):
                print(n)
                f.write('<p> <a href="' + n + '">' + n + '</a> </p>' + '\n')
        except:
            print "HTTP Error"

        f.close()
        print "\nOutput has been saved to %s.html" % fName
        print "\nThank you for using GoooFu. To contribute https://github.com/west-wind/GoooFu"
        print "\nExiting..."
    except KeyboardInterrupt:
        sys.exit(0)
def googleSearch(text):
    text = re.sub("/google ", "", text, count=1)
    answer = ""
    for url in google.search(text, lang='se', num=1, stop=2):
        answer += url + "\n"
    return answer
def google_it(dork):
    clear_cookie()
    for title in search(dork, stop=30):
        print(B + ' [!] Profile Found :> ' + C + title)
        time.sleep(0.5)
name = your_list[0][1][2]
# your_list looks like:
# [['This is the first line', 'Line1'],
#  ['This is the second line', 'Line2'],
#  ['This is the third line', 'Line3']]

f = csv.writer(open("fraudSearchResults.csv", "a"))
f.writerow(["Search Terms", "Top 10 URL"])
# Don't overwrite old data: change open("learner.csv", "w") to open("learner.csv", "a")

# fraudArray = [fraud, jail, prison, ponzi scheme, steal, money laundering,
#               pyramid scheme, stole, investigation, investigated, indicted,
#               corrupt, bribery]
# fullSearch = loop through the array and combine each term with the name
fullSearch = "fraud" + " AND " + name

for url in search(fullSearch, tld='com', lang='en', stop=10):
    print(url)
    f.writerow([fullSearch, url])

'''error handling:
from xgoogle.search import GoogleSearch, SearchError
try:
    gs = GoogleSearch("quantum mechanics")
    gs.results_per_page = 100
    results = []
    while True:
        tmp = gs.get_results()
        if not tmp:  # no more results were found
    thepage = urllib.request.urlopen(url)
    soup = BeautifulSoup(thepage, "html.parser")
    return soup.title.text


with open('designers.json') as data_file:
    data = json.load(data_file)

client = MongoClient()
db = client.FashionData
Designer = db['Designer']
print(db)

for i in data:
    try:
        designer_name = i['designer']
        website = list(search(designer_name, stop=1))[0]
        i['website'] = website
        db.Designer.insert({'name': designer_name, 'website': website})
        print(designer_name, website)
    except:
        print(designer_name)

# with open('designers.json', "w") as jsonFile:
#     json.dump(data, jsonFile)

cursor = Designer.find()
for document in cursor:
    print(document)
def search_multiple_pages(query, link_amount, verbose=False, **kwargs):
    def __config_proxy(proxy_string):
        proxy_type_schema = {
            "http": httplib2.socks.PROXY_TYPE_HTTP,
            "socks4": httplib2.socks.PROXY_TYPE_SOCKS4,
            "socks5": httplib2.socks.PROXY_TYPE_SOCKS5
        }
        proxy_type = get_proxy_type(proxy_string)[0]
        proxy_dict = proxy_string_to_dict(proxy_string)
        proxy_config = httplib2.ProxyInfo(
            proxy_type=proxy_type_schema[proxy_type],
            proxy_host="".join(proxy_dict.keys()),
            proxy_port="".join(proxy_dict.values()))
        return proxy_config

    proxy, agent = kwargs.get("proxy", None), kwargs.get("agent", None)

    if proxy is not None:
        if verbose:
            logger.debug(
                set_color("configuring to use proxy '{}'...".format(proxy), level=10))
        __config_proxy(proxy)

    if agent is not None:
        if verbose:
            logger.debug(
                set_color("setting user-agent to '{}'...".format(agent), level=10))

    logger.warning(
        set_color(
            "multiple pages will be searched using Google's API client, searches may be blocked after a certain "
            "amount of time...", level=30))
    results, limit, found, index = set(), link_amount, 0, google_api.search(
        query, user_agent=agent, safe="on")

    try:
        while limit > 0:
            results.add(next(index))
            limit -= 1
            found += 1
    except Exception as e:
        if "Error 503" in str(e):
            logger.fatal(
                set_color(
                    "Google is blocking the current IP address, dumping already found URL's...",
                    level=50))

    retval = set()
    for url in results:
        if URL_REGEX.match(url) and URL_QUERY_REGEX.match(url):
            if verbose:
                logger.debug(set_color("found '{}'...".format(url), level=10))
            retval.add(url)

    if len(retval) != 0:
        logger.info(
            set_color(
                "a total of {} links found out of requested {}...".format(
                    len(retval), link_amount)))
        write_to_log_file(list(retval), URL_LOG_PATH, "url-log-{}.log")
    else:
        logger.error(
            set_color("unable to extract URL's from results...", level=40))
# -*- coding: utf-8 -*-
from google import search
from bs4 import BeautifulSoup
import urllib

for url in search("クック パッド 内紛 site:newspicks.com", stop=10, lang="en"):
    soup = BeautifulSoup(urllib.urlopen(url))
    print url
    print soup.find("title").text
def url_cek():
    for url in google.search(dork, num=site_sayi, stop=1):
        dosya = open("urller.txt", "a+")
        dosya.write(url + "\n")
if DEBUG3:
    print "  " + "\n  ".join(list(hostnames))

###################
## search hostnames
###################
if DEBUG2:
    print "Search Hostnames"

key_cnt = 0
for keyword in remain_keywords:
    key_cnt = key_cnt + 1
    if DEBUG3:
        print '  ' + str(key_cnt) + ': ' + keyword

    cnt = 0
    try:
        for url in search(keyword, stop=5000, pause=60.0):
            cnt = cnt + 1
            hostname = get_hostname(url)

            ## don't save IP addresses -- we need hostnames
            m = re.match("(\d+\.\d+\.\d+\.\d+)", hostname)
            if m is not None:
                continue

            if DEBUG3:
                print('  %d.%d: %s\n    %s' % (key_cnt, cnt, url, hostname))
            hostnames.add(hostname)

            if cnt % 100 == 0:
                store_output_files()
    except:
        print "  search exception! exit!"
def pastebin_search(args, lookup, reportDir, apiKeyDir):
    userAgent = {'User-agent': 'Mozilla/5.0'}

    # Return values
    pasteScrapeUrl = []
    pasteScrapeContent = []
    pasteScrapeResult = []

    # Check for empty args
    if args.pastebinsearch is not None:
        for a in args.pastebinsearch:
            # Init lists
            scrapeURL = []
            scrapeContent = []

            # Iterate the lookup list
            for i, l in enumerate(lookup):
                # Init text files
                scrapedFile = open(reportDir + l + '/' + l + '_pastebin_content.txt', 'w')
                pasteUrlFile = open(reportDir + l + '/' + l + '_pastebin_urls.txt', 'w')

                # Show the user what is being searched
                print '[+] Searching Pastebin for public pastes containing %s' % (l)
                print '[i] May require a Pastebin Pro account for IP whitelisting'

                # Run the Google query
                try:
                    # Iterate URL results from a search of the dork arg and the supplied
                    # lookup value against pastebin.com; return the top 20 hits
                    for url in search(str(a) + ' ' + str(l) + ' site:pastebin.com', stop=20):
                        # Delay 1 second to be polite
                        time.sleep(1)
                        # Append results together
                        scrapeURL.append(url)
                        if args.verbose is True:
                            print '[+] Paste containing "%s" and "%s" found at: %s' % (a, l, url)
                except Exception:
                    print '[-] Error dorking pastebin URLs, skipping...'
                    pasteScrapeResult.append('Error scraping Pastebin')
                    continue

                for u in scrapeURL:
                    # http://docs.python-guide.org/en/latest/scenarios/scrape/
                    try:
                        page = requests.get(u, headers=userAgent)
                        pasteUrlFile.writelines(u)
                    except:
                        print '[-] Error opening ' + u + ':'
                        pasteScrapeResult.append('Error opening %s' % u)
                        continue

                    # Build the HTML tree
                    tree = html.fromstring(page.content)

                    # If verbose, print the URL, search term and domain searched
                    if args.verbose is True:
                        print '[+] Looking for instances of %s and %s in %s \n' % (a, l, url)

                    # Grab the raw paste data from the textarea
                    rawPasteData = tree.xpath('//textarea[@class="paste_code"]/text()')

                    # Search lines for the lookup value and keyword
                    for line in rawPasteData:
                        # If the lookup value (domain) is in that line
                        if str(l) in line:
                            # If the argument search term is in the line
                            if a in line:
                                scrapedFile.writelines(a)

    return pasteScrapeResult
from google import search
import urllib
from bs4 import BeautifulSoup
import webbrowser
import pyperclip


def google_scrape(url):
    thepage = urllib.urlopen(url)
    soup = BeautifulSoup(thepage, "html.parser")
    return soup.title.text


i = 1
query = pyperclip.paste()
for url in search(query, stop=10):
    a = google_scrape(url)
    print str(i) + ". " + a
    print url
    webbrowser.open(url)
    i += 1
# Remove extra whitespace and newlines
textData = textData.replace('\\n', '')
textData = ' '.join(textData.split())

# Split the data, as Google only accepts queries of 32 words or less
textData = split(textData, 250)

# Loop through each query
for line in textData:
    queryNum += 1
    print("")
    deltaTime = time.time()
    print("Searching for query number " + str(queryNum))
    urls = []

    # This is where we search Google for the URLs
    for url in search('"' + line + '"', stop=3, num=3):
        urls.append(url)
        time.sleep(1)  # Avoid Google detecting our bot

    if len(urls) == 0:
        print("No results found")
        continue

    print("Search completed in " + str(round(time.time() - deltaTime, 2)) +
          " seconds, beginning comparison")
    deltaTime = time.time()

    # Now we process each URL
    for url in urls:
        # Optimize popular websites for text
        elementsToSearch = {}
        if "stackoverflow" in url:
            elementsToSearch = {"code"}
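# The snippet above relies on a split() helper that is not shown. A minimal
# sketch under the assumption that it simply chunks the text into pieces of at
# most `size` characters so each piece fits into a single Google query:
def split(text, size):
    # Break the text into consecutive chunks of at most `size` characters
    return [text[i:i + size] for i in range(0, len(text), size)]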
def google_it(site, dork):
    clear_cookie()
    for title in search(dork, stop=30):
        print(GR + ' [!] Site Found :> ' + B + title)
        time.sleep(0.5)
]

with codecs.open("homepages.csv", "a", "utf8") as outfile:
    facultydictkeys = list(facultydict.keys())
    random.shuffle(facultydictkeys)
    for name in facultydictkeys:
        # Skip any homepages we already have in the database...
        if name in homepages:
            # ...unless it's a Google search page; then we will try again to fix it.
            match = re.search("www.google.com", homepages[name])
            if match == None:
                continue
        str = name + " " + facultydict[name]
        name = name.decode("utf8")
        # Grab the first result from the Google search.
        results = google.search(str, stop=1)
        actualURL = "FIXME"
        for url in results:
            actualURL = url
            matched = 0
            for t in trim:
                match = re.search(t, url)
                if match != None:
                    matched = matched + 1
            if matched == 0:
                break
        # Output the name and this resolved URL.
        match = re.search("www.google.com", actualURL)
        print(name)
        try:
import newspaper
import google
import re
from bs4 import BeautifulSoup

therapists = ['Physiotherapist', 'respiratory therapist', 'Hematologist']

for p in therapists:
    search_results = google.search(p + " salary department of labor", stop=1, lang="en")
    print("*" * 30)
    print(p.upper())
    print("_" * 15)
    for link in search_results:
        data = newspaper.Article(url=link)
        data.download()
        text = data.html
        soup = BeautifulSoup(text, 'html.parser')
        rows = soup.find_all('p')
        for tr in rows:
            if 'The median annual wage' in str(tr):
                print(str(tr).split(" ")[-4])
                break
        break
domain_name = domain_name[:-4]
if domain_name.endswith('.in'):
    domain_name = domain_name[:-3]'''

if flag_no_title:
    query = domain_name + ' ' + title
else:
    extra = lolmax.hitString(7, web_url)
    query = domain_name + ' ' + extra
print "Query is : " + (query)

# Perform a Google search based on the title plus domain name
print("URLs are : ")
for url in search(query, stop=7):
    print(url)
    domain_found = get_tld(str(url))
    if domain_name == domain_found:
        print("========================================Match found=====================================")
        flag_found_in_list = True

# Top URLs fetched on a Google search of title + web_url
if flag_found_in_list:
    print("Not a phishing site ! You may proceed !")
else:
    print("No match found. Phishing Site ! Do not proceed !")
# Get IMDB, RT, Meta and Audience scores for a movie by name.
# Run this module from the terminal to enter a movie name and see its score.
from ratings.get_score import get_score
from google import search
import requests
from bs4 import BeautifulSoup

movie = input('Enter movie name : ')
imdb_url = rotten_url = None

# Grab the first Google result for the IMDb page
for url in search(movie + ' imdb', stop=1):
    imdb_url = url
    break

# Grab the first Google result for the Rotten Tomatoes page
for url in search(movie + ' rottentomatoes', stop=1):
    rotten_url = url
    break

print('RT URL : ' + str(rotten_url))

movie_score = get_score(imdb_url, rotten_url)
print('Film Score : ' + str(movie_score))
result = []
file = codecs.open("example.txt", "ab+", "utf-8")
final = []
for w in mylist:
    try:
        if codew:
            article = wikipedia.summary(w, sentences=5, redirect=True, auto_suggest=True)
            file.write("<h2>" + w + "</h2><br><p>" + article + "</p>")
            file.write("<br>\n")
        if codeg:
            for url in search(w + ' wikipedia', stop=1):
                if "wikipedia" in url:
                    result.append(url)
            final.append(result[0])
            result[:] = []
        # if any(x in article for x in original):
        #     file.write("<h2>" + w + "</h2><br><p>" + article + "</p>")
        #     file.write("<br>\n")
    except:
        pass
if codeg:
    final = list(set(final))
# Gets Google search results for a fixed query
from google import search

for url in search('josh voss linkedin', stop=20):
    print url