def showresults():
    if re.match('^Who', request.form['search']):
        ggen = google.search(request.form['search'], stop=1)
        results = [google.get_page(link) for link in ggen]
        results = [soupify(x) for x in results]
        results = [findName(x) for x in results]
        return render_template("index.html", results=results)
    elif re.match('^When', request.form['search']):
        ggen = google.search(request.form['search'], stop=1)
        results = [google.get_page(link) for link in ggen]
        results = [soupify(x) for x in results]
        results = [findDate(x) for x in results]
        return render_template("index.html", results=results)
    return render_template("index.html", results="Please enter 'Who' or 'When'")
def wiki_zh_to_en(self, s):
    """Use Wikipedia's interlanguage links to find the English title."""
    # Alternative: use Google Translate for Chinese -> English
    # url = "https://translate.googleapis.com/translate_a/single?client=gtx&sl=zh-TW&tl=en&dt=t&q=%s" % \
    #     (urllib.parse.quote(s))
    # response = urllib.request.urlopen(url)
    # html = response.read()
    # print(html)
    url = "https://zh.wikipedia.org/wiki/%s" % (urllib.parse.quote(s))
    try:
        page = get_page(url)
        soup = BeautifulSoup(page, 'lxml')
        en_link = soup.findAll('li', class_='interwiki-en')
        if len(en_link) == 0:
            return None
        p = en_link[0].a
        if p is None:
            return None
        en = p.get('href', None)
        if en is None:
            return None
        tr = en.split('/')[-1].lower().replace("_", " ")
        print("Found translation... [%s] => %s" % (s, tr))
        return tr
    except Exception as e:
        print("Error: %s" % str(e))
        return None
def get_tokens(query, src="google"):
    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
    if src == "wikipedia":
        print "Searching Wikipedia for " + query
        infile = opener.open('http://en.wikipedia.org/wiki/' + query)
        page = infile.read()
    else:
        print "Searching Google for " + query
        page = ""
        results = google.search(query, "com", "en", 1, 0, 1, 2.0)
        for result in results:
            print "on " + result
            page = google.get_page(result)
    # print page
    raw = nltk.clean_html(page)
    # parses into tokens and saves as lowercase
    tokens = map(lambda x: x.lower(), word_tokenize(raw))
    # removes punctuation and empty strings
    tokens = [s.translate(None, string.punctuation) for s in tokens]
    tokens = [s for s in tokens if s]
    return tokens
def whoAnswer(question, n):
    # returns top n occurrences of names from question
    numPages = 10  # uses top 10 results
    links = google.search("Who " + question, num=numPages, start=0, stop=0, pause=2.0)
    txt = ""
    for i in range(numPages):
        txt += BeautifulSoup(google.get_page(links.next())).getText()
    return getNames(txt, n)
def get_company_domain(self, searchKey):
    '''Looks for the company website among the top five URLs returned by a
    Google search. If the company website is found, parses the URL to get
    the domain name.
    '''
    search_result = search(searchKey, stop=5)
    for url in search_result:
        keywords = searchKey.split(" ")
        print keywords
        if keywords[0] in url.lower():
            # if the link is a Wikipedia link, parse the page to get the
            # company homepage
            if "en.wikipedia.org" in url:
                chomepage = get_company_website(url)
                if chomepage is not None:
                    return extract_domainname(chomepage)
            return extract_domainname(url)
        try:
            htmlpage = get_page(url)
            soup = BeautifulSoup(htmlpage)
            title = soup.title.text.lower()
            if keywords[0] in title:
                return extract_domainname(url)
        except:
            print searchKey.ljust(52) + ": Can't parse web page at " + colored(url.ljust(100), 'blue')
def who(s):
    g = google.search(s, num=1, start=0, stop=8)
    # l = BeautifulSoup(google.get_page(g.next()))
    # x = givetext(l.prettify())
    f = []
    # soup = [(BeautifulSoup(google.get_page(x)).find_all('p') for x in urls]  HOW TO SOUP
    urls = [x for x in g]
    f.append("Step 1 - Collecting URLS:")
    for x in urls:
        f.append(x)
    f.append("")
    f.append("")
    soup = [google.get_page(x) for x in urls]
    alphabetsoup = [nameapp.givetext(x) for x in soup]
    f.append("Step 2 - Collecting all the Names:")
    f.append(str(alphabetsoup))
    f.append("")
    f.append("")
    splitted = []
    for x in alphabetsoup:
        splitted += x
    """for i in urls:
        html.append(BeautifulSoup(google.get_page(i)))
    allnames = []
    for i in html:
        allnames.append(nameapp.givetext(i))
    splitted = []
    for i in allnames:
        splitted += i.split(",")"""
    namestats = {}  # all names
    for i in splitted:
        if i in namestats:
            namestats[i] += 1
        else:
            namestats[i] = 1
    f.append("Step 3 - Making a Dictionary:")
    f.append(str(namestats))
    f.append("")
    f.append("")
    final = {}
    for i in namestats:
        if namestats[i] >= 25:
            final[i] = namestats[i]
    f.append("Step 4 - Narrowing Possibilities:")
    f.append(str(final))
    f.append("")
    f.append("")
    maxname = final.keys()[0]
    if maxname == s:
        maxname = final.keys()[1]
    maxvalue = final[maxname]
    for x in final.keys():
        if final[x] > maxvalue and x != s:
            maxname = x
            maxvalue = final[x]
    f.append("Step 5 - The Answer:")
    f.append(maxname)
    return f
def who(s):
    g = google.search(s, num=1, start=0, stop=8)
    # l = BeautifulSoup(google.get_page(g.next()))
    # x = givetext(l.prettify())
    # soup = [(BeautifulSoup(google.get_page(x)).find_all('p') for x in urls]  HOW TO SOUP
    soup = [google.get_page(x) for x in g]
    alphabetsoup = [nameapp.givetext(x) for x in soup]
    splitted = []
    for x in alphabetsoup:
        splitted += x
    """for i in urls:
        html.append(BeautifulSoup(google.get_page(i)))
    allnames = []
    for i in html:
        allnames.append(nameapp.givetext(i))
    splitted = []
    for i in allnames:
        splitted += i.split(",")"""
    namestats = {}  # all names
    for i in splitted:
        if i in namestats:
            namestats[i] += 1
        else:
            namestats[i] = 1
    final = {}
    for i in namestats:
        if namestats[i] >= 25:
            final[i] = namestats[i]
    return final
def get_google(url, seen):
    from bs4 import BeautifulSoup
    from google import search, get_page
    soup = BeautifulSoup(get_page(url), 'lxml')
    content = {url: [str(s) for paragraph in soup.find_all('p') for s in paragraph.strings]}
    remaining = {link.get('href') for link in soup.find_all('a') if link.get('href') not in seen}
    return content, remaining
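A minimal usage sketch for get_google above, assuming the old google package (with get_page) and bs4 are installed; the seed URL and the three-round limit are only illustrative assumptions, not part of the original snippet:

# Usage sketch for get_google() (assumption: `google` package with get_page and bs4 installed;
# the seed URL below is only an example).
seen = set()
frontier = {"https://en.wikipedia.org/wiki/Web_scraping"}
pages = {}
for _ in range(3):  # follow links for a few rounds
    url = frontier.pop()
    seen.add(url)
    content, remaining = get_google(url, seen)
    pages.update(content)
    # keep only absolute http(s) links that have not been visited yet
    frontier |= {u for u in remaining if u and u.startswith("http") and u not in seen}
    if not frontier:
        break
print(len(pages), "pages collected")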
def ask_google_en_to_zh(s):
    q = s + " wikipedia 中文"
    result = search(q, lang="zh", pause=1.0)
    cnt = 0
    for x in result:
        cnt += 1
        if cnt >= 10:
            break
        if 'wikipedia' in x:
            x = urllib.parse.unquote(x)
            x = x.split('/')[-1]
            if mafan.text.is_traditional(x):
                return x
            if mafan.text.is_simplified(x):
                url = "http://zh.wikipedia.org/zh-tw/" + urllib.parse.quote_plus(x)
                print(url)
                try:
                    w = get_page(url)
                except:
                    continue
                soup = BeautifulSoup(w, 'lxml')
                x = soup.title.string
                x = x.strip().split(' ')[0]
                if '維基百科' not in x:
                    return x
    return None
def get_stuff(query):
    # Fetch the top 10 Google results for the query and return their raw HTML.
    s = google.search(query, tld='com', lang='en', start=0, stop=10, pause=2.0)
    l = []
    for url in s:
        l.append(url)
    data = []
    for x in l:
        data.append(google.get_page(x))
    return data
def parse_urls(urlsList):
    # takes a list of urls, returns only the text (extracted with Beautiful Soup)
    html = ""
    for u in urlsList:
        content = google.get_page(u)
        soup = BeautifulSoup(content)
        html += soup.get_text()
    print "Parsed HTML code!"
    return html
def retRandomResult(search):
    result = {}
    res = google.search(search)
    for c, i in enumerate(res):
        if c > 15:
            break
        if c % 6 == random.randint(0, 5):
            g = Goose()
            a = g.extract(raw_html=google.get_page(i))
            result["resNum"] = c
            result["title"] = a.title
            result["url"] = i
            result["blob"] = getBlob(search, a.cleaned_text)
        else:
            continue
    if len(result) == 0:
        for i in res:
            g = Goose()
            a = g.extract(raw_html=google.get_page(i))
            result["title"] = a.title
            result["url"] = i
            result["blob"] = getBlob(search, a.cleaned_text)
            return result
    return result
def search(question):
    name = False
    date = False
    searchtype = question.split(" ")[0]
    if searchtype.lower() == "who":
        name = True
    elif searchtype.lower() == "when":
        date = True
    else:
        return [None, 0]
    g = google.search(question, num=10, start=0, stop=10, pause=3.0)
    utils = [w for w in g]
    d = {}
    count = 0
    for x in utils:
        # run Beautiful Soup to find names
        try:
            html_doc = google.get_page(x)
            soup = BeautifulSoup(html_doc)
            y = soup.get_text()
            if name:
                names = check_names(get_potential_names(y))
                # names = findname(y)
                for k in names.keys():
                    count = count + names[k]
                    if k not in d.keys():
                        d[k] = names[k]
                    else:
                        d[k] = d[k] + names[k]
                # find highest number of names
                # dhigh = findhigh(d)
                # return addition of that
            elif date:
                dates = find_dates(y)
                for k in dates.keys():
                    count = count + dates[k]
                    if k not in d.keys():
                        d[k] = dates[k]
                    else:
                        d[k] = d[k] + dates[k]
            else:
                return
        except Exception, error:
            pass
def get_google_result(q, result_map):
    # Form search url
    google_url = get_google_search_url(q)
    # Parse text of result page
    print(google_url)
    google_search_results = google.get_page(google_url)
    google_search_soup = BeautifulSoup(google_search_results, "html.parser")
    google_text_search_results = google_search_soup.get_text().encode("utf-8").lower()
    result_map["content"] = google_text_search_results
    result_map["words"] = google_text_search_results.split(" ")
    # Return text
    return google_text_search_results
def parsePages(urls, wordsInQuery):
    # For example, {"Zeus": 500, "Jupiter": 366}
    namesByFrequency = {}
    # Loop through urls, remembering the index for weighting
    for index, url in enumerate(urls):
        try:
            html = google.get_page(url)
        except:
            continue
        if wordsInQuery[0] == "WHEN":
            namesInThisPage = searchHelper.extractDates(html)
        else:
            namesInThisPage = searchHelper.extractNames(html)
        namesInThisPage = searchHelper.weightNames(namesInThisPage, index, NUM_GOOGLE_RESULTS)
        namesByFrequency = searchHelper.addDicts(namesByFrequency, namesInThisPage)
    namesByPercent = searchHelper.compareNames(namesByFrequency)
    return namesByPercent
def when(s):
    f = []
    f.append("Step 1 - Collecting URLS:")
    g = google.search(s, num=1, start=0, stop=8)
    urls = [x for x in g]
    for annoyingvariable in urls:
        f.append(annoyingvariable)
    f.append("")
    f.append("")
    soup = [google.get_page(x) for x in urls]
    f.append("Step 2 - Collecting all the Years:")
    yearsoup = [re.findall('\s[1-2][0-9]{3}', x) for x in soup]
    y = yearsoup
    f.append(str(y))
    f.append("")
    f.append("")
    yeardict = {}
    for page in yearsoup:
        if len(page) > 0:
            for year in page:
                if year in yeardict:
                    yeardict[year] += 1
                else:
                    yeardict[year] = 1
    # return yeardict
    # yeardict.update(monthdict)
    f.append("Step 3 - Collecting all the Months:")
    monthsoup = [findmonths(x) for x in soup]
    m = monthsoup
    f.append(str(m))
    f.append("")
    f.append("")
    monthdict = {}
    for page in monthsoup:
        for month in page:
            if len(month) > 0:
                if month[0] in monthdict:
                    monthdict[month[0]] += len(month)
                else:
                    monthdict[month[0]] = len(month)
    f.append("Step 4 - Making the Dictionaries:")
    yd = yeardict
    f.append(str(yd))
    f.append("")
    f.append("")
    md = monthdict
    f.append(str(md))
    f.append("")
    f.append("")
    topyear = yeardict.keys()[0]
    for year in yeardict:
        if yeardict[year] > yeardict[topyear]:
            topyear = year
    topmonth = monthdict.keys()[0]
    for month in monthdict:
        if monthdict[month] > monthdict[topmonth]:
            topmonth = month
    f.append("Step 5 - The Answer:")
    f.append(topmonth + ", " + topyear)
    return f
keyword = 'apple'
res_gen = google.search(keyword, stop=10)
res = []
for site in res_gen:
    if site not in res:
        res.append(site)
        print "a"
fo = open("res.html", "wb")
site_html = google.get_page(res[5])
p = Parser()
p.init_parser()
p.feed(site_html)
print p.data_list
fo.write("<meta charset='UTF-8'><html><body>")
for sentence in p.data_list:
    braille_sentence = t.convert(sentence)
    print braille_sentence
    fo.write("<p>")
    fo.write("".join(braille_sentence))
    fo.write("</p>")
# print site_html
fo.write("</body></html>")
fo.close()
def fetch_pages(question):
    return [google.get_page(url) for url in google.search(question, stop=10)]
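fetch_pages above returns raw HTML strings; a small follow-on sketch (assuming bs4 is installed and that plain-text extraction is the intended next step, which the original snippet does not state) could strip the markup like this:

# Follow-on sketch for fetch_pages() (assumption: bs4 and the `google` package are installed).
from bs4 import BeautifulSoup

def fetch_texts(question):
    # Reduce each result page returned by fetch_pages() to plain text.
    return [BeautifulSoup(html, "html.parser").get_text(" ", strip=True)
            for html in fetch_pages(question)]

# Example call (requires network access):
# texts = fetch_texts("Who invented the telephone")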
import google
import re
from bs4 import BeautifulSoup


def findContactPage(url):
    html = google.get_page(url)
    soup = BeautifulSoup(html)
    contactStr = soup.find_all('a', href=re.compile(".*?contact", re.IGNORECASE))
    return contactStr


if __name__ == "__main__":
    url = "http://www.wrangler.com/"
    contactStr = findContactPage(url)
    if len(contactStr) > 0:
        contactPage = google.get_page(contactStr[0].get("href"))
        print contactStr[0].get("href")  # .find_parents("a")
        soup = BeautifulSoup(contactPage)
        emailStr = soup.find_all(text=re.compile("[\w\.-]+@[\w\.-]+"))
        if len(emailStr) > 0:
            print emailStr
        else:
            print "could not find email"
    else:
        print "could not find contacts page"
def myNewsSearch(key):
    # Google News search (tbm=nws) restricted to the last 72 hours (tbs=qdr:h72).
    data = {'q': key}
    url = 'https://www.google.com/search?hl=en&gl=us&tbm=nws&authuser=0&tbs=qdr:h72&' + urllib.urlencode(data)
    page = google.get_page(url)
    return page
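Most snippets above call google.get_page; if a particular release of the google/googlesearch package does not expose it, a standard-library stand-in (a sketch, not the package's own implementation) might look like this:

# Stand-in for google.get_page() built on the standard library only
# (a sketch, not the googlesearch package's implementation).
import urllib.request

def get_page(url, timeout=10):
    # Send a browser-like User-Agent, as several snippets above do.
    req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        return resp.read().decode("utf-8", errors="replace")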