Example #1
def showresults():
    if(re.match('^Who', request.form['search'])):
        ggen = google.search(request.form['search'], stop=1)
        results = [google.get_page(link) for link in ggen]
        results = [soupify(x) for x in results]
        results = [findName(x) for x in results]
        return render_template("index.html", results = results)
    elif(re.match('^When', request.form['search'])):
        ggen = google.search(request.form['search'], stop=1)
        results = [google.get_page(link) for link in ggen]
        results = [soupify(x) for x in results]
        results = [findDate(x) for x in results]
        return render_template("index.html", results = results)
    return render_template("index.html", results = "Please enter 'Who' or 'When'")
Example #2
def showresults():
    if (re.match('^Who', request.form['search'])):
        ggen = google.search(request.form['search'], stop=1)
        results = [google.get_page(link) for link in ggen]
        results = [soupify(x) for x in results]
        results = [findName(x) for x in results]
        return render_template("index.html", results=results)
    elif (re.match('^When', request.form['search'])):
        ggen = google.search(request.form['search'], stop=1)
        results = [google.get_page(link) for link in ggen]
        results = [soupify(x) for x in results]
        results = [findDate(x) for x in results]
        return render_template("index.html", results=results)
    return render_template("index.html",
                           results="Please enter 'Who' or 'When'")
    def wiki_zh_to_en(self, s):
        """ 利用維基百科的頁面跳轉功能找到英文 """
        #""" 利用 google translate 中翻英 """
        #url = "https://translate.googleapis.com/translate_a/single?client=gtx&sl=zh-TW&tl=en&dt=t&q=%s" %\
        #    (urllib.parse.quote(s))
        #response = urllib.request.urlopen(url)
        #html = response.read()
        #print(html)
        url = "https://zh.wikipedia.org/wiki/%s" % (urllib.parse.quote(s))
        try:
            page = get_page(url)

            soup = BeautifulSoup(page, 'lxml')
            en_link = soup.findAll('li', class_='interwiki-en')
            if len(en_link) == 0:
                return None
            p = en_link[0].a
            if p is None:
                return None
            en = p.get('href', None)
            if en is None:
                return None
            tr = en.split('/')[-1].lower().replace("_", " ")
            print("找到翻譯了... [%s] => %s" % (s, tr))
            return tr
        except Exception as e:
            print("Error: %s" % str(e))
            return None
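The snippet calls get_page without a module prefix; in context this is the page fetcher from the google (googlesearch) package. If that package is unavailable, a rough drop-in written with urllib (an assumption, not the original helper) could look like:

import urllib.request

def get_page(url):
    # Minimal stand-in for google.get_page: fetch a URL with a browser-like User-Agent.
    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    with urllib.request.urlopen(req) as response:
        return response.read()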
Example #4
def get_tokens(query, src="google"):
	opener = urllib2.build_opener()
	opener.addheaders = [('User-agent', 'Mozilla/5.0')]

	if src == "wikipedia":
		print "Searching Wikipedia for " + query
		infile = opener.open('http://en.wikipedia.org/wiki/' + query)
		page = infile.read()
	else:
		print "Searching Google for " + query
		page = ""
		results = google.search(query, "com", "en", 1, 0, 1, 2.0)
		for result in results:
			print "on " + result
			page = google.get_page(result)
	
	#print page

	raw = nltk.clean_html(page) 

	#parses into tokens and saves as lowercase
	tokens = map(lambda x:x.lower(),word_tokenize(raw))

	#removes punctuation and empty strings
	tokens = [s.translate(None, string.punctuation) for s in tokens]
	tokens = [s for s in tokens if s]

	return tokens
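Note that nltk.clean_html was removed in NLTK 3.x, which raises an error pointing users to an HTML parser instead; on current NLTK the stripping step could be replaced with something like this (a sketch, not part of the original example):

from bs4 import BeautifulSoup

def clean_html(page):
    # Stand-in for the removed nltk.clean_html: strip tags and return the visible text.
    return BeautifulSoup(page, 'html.parser').get_text()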
Example #5
def whoAnswer(question,n): #returns top n occurrences of names from question
    numPages = 10  # uses the top 10 results
    links = google.search("Who " + question, num=numPages, start=0, stop=0, pause=2.0)
    txt = ""
    for i in range(numPages):
        txt += BeautifulSoup(google.get_page(links.next())).getText()
    return getNames(txt,n)
    def get_company_domain(self, searchKey):
        '''looks for the company website among the top five
           urls returned by google search. If the company website
           is found, it parses the url to get the domain name
        '''
        search_result = search(searchKey, stop=5)

        for url in search_result:
            keywords = searchKey.split(" ")

            print keywords
            if keywords[0] in url.lower():
                # if links is wikipedia link then parse the webpage to get
                # company homepage
                if "en.wikipedia.org" in url:
                    chomepage = get_company_website(url)
                    if chomepage is not None:
                        return extract_domainname(chomepage)
                return extract_domainname(url)

            try:
                htmlpage = get_page(url)
                soup = BeautifulSoup(htmlpage)

                title = soup.title.text.lower()

                if keywords[0] in title:
                    return extract_domainname(url)
            except:
                print searchKey.ljust(52) + ": Can't parse web page at " + colored(url.ljust(100), 'blue')
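extract_domainname and get_company_website are helpers from the surrounding project and are not shown. A minimal sketch of what extract_domainname might do (hypothetical implementation, written for Python 3):

from urllib.parse import urlparse

def extract_domainname(url):
    # Reduce a full URL to its bare domain, e.g. "https://www.example.com/jobs" -> "example.com".
    netloc = urlparse(url).netloc
    return netloc[4:] if netloc.startswith("www.") else netloc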
Example #7
def who(s):
    g = google.search(s, num=1, start=0, stop=8)
    #l = BeautifulSoup(google.get_page(g.next()))
    #x = givetext(l.prettify())
    f = []
    #soup=[(BeautifulSoup(google.get_page(x)).find_all('p') for x in urls] HOW TO SOUP
    urls = [x for x in g]
    f.append("Step 1 - Collecting URLS:")
    for x in urls:
        f.append(x)
    f.append("")
    f.append("")
    soup = [(google.get_page(x)) for x in urls]
    alphabetsoup = [nameapp.givetext(x) for x in soup]
    f.append("Step 2 - Collecting all the Names:")
    f.append(str(alphabetsoup))
    f.append("")
    f.append("")
    splitted = []
    for x in alphabetsoup:
        splitted += x
    """for i in urls:
        html.append(BeautifulSoup(google.get_page(i)))
    allnames = []
    for i in html:
        allnames.append(nameapp.givetext(i))
    splitted = []
    for i in allnames:
        splitted += i.split(",")"""
    namestats = {}
    ##all names
    for i in splitted:
        if i in namestats:
            namestats[i] += 1
        else:
            namestats[i] = 1
    f.append("Step 3 - Making a Dictionary:")
    f.append(str(namestats))
    f.append("")
    f.append("")
    final = {}
    for i in namestats:
        if namestats[i] >= 25:
            final[i] = namestats[i]

    f.append("Step 4 - Narrowing Possibilites:")
    f.append(str(final))
    f.append("")
    f.append("")
    maxname = list(final.keys())[0]
    if (maxname == s):
        maxname = list(final.keys())[1]
    maxvalue = final[maxname]
    for x in final.keys():
        if final[x] > maxvalue and x != s:
            maxname = x
            maxvalue = final[x]
    f.append("Step 5 - The Answer:")
    f.append(maxname)
    return f
def who(s):
    g = google.search(s, num=1, start=0, stop=8)
    #l = BeautifulSoup(google.get_page(g.next()))
    #x = givetext(l.prettify()) 
    
    #soup=[(BeautifulSoup(google.get_page(x)).find_all('p') for x in urls] HOW TO SOUP
    soup = [(google.get_page(x)) for x in g]
    alphabetsoup = [nameapp.givetext(x) for x in soup]
    splitted = []
    for x in alphabetsoup:
        splitted += x
    """for i in urls:
        html.append(BeautifulSoup(google.get_page(i)))
    allnames = []
    for i in html:
        allnames.append(nameapp.givetext(i))
    splitted = []
    for i in allnames:
        splitted += i.split(",")"""
    namestats = {}
    ##all names
    for i in splitted:
        if i in namestats:
            namestats[i] += 1
        else:
            namestats[i] = 1

    final = {}
    for i in namestats:
        if namestats[i] >= 25:
            final[i] = namestats[i]
    return final
Example #9
def get_tokens(query, src="google"):
    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0')]

    if src == "wikipedia":
        print "Searching Wikipedia for " + query
        infile = opener.open('http://en.wikipedia.org/wiki/' + query)
        page = infile.read()
    else:
        print "Searching Google for " + query
        page = ""
        results = google.search(query, "com", "en", 1, 0, 1, 2.0)
        for result in results:
            print "on " + result
            page = google.get_page(result)

    #print page

    raw = nltk.clean_html(page)

    #parses into tokens and saves as lowercase
    tokens = map(lambda x: x.lower(), word_tokenize(raw))

    #removes punctuation and empty strings
    tokens = [s.translate(None, string.punctuation) for s in tokens]
    tokens = [s for s in tokens if s]

    return tokens
Example #10
def get_google(url, seen):
    from bs4    import BeautifulSoup
    from google import search, get_page
    soup      = BeautifulSoup(get_page(url), 'lxml')
    content   = { url : [str(s) for paragraph in soup.find_all('p') for s in paragraph.strings] }
    remaining = { link.get('href') for link in soup.find_all('a') if link.get('href') not in seen }
    return content, remaining
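A short usage sketch for the helper above, fetching one page and queuing its outgoing links for a later pass (illustrative only; relative links and fetch errors are not handled, and the start URL is just an example):

seen = set()
start_url = 'https://en.wikipedia.org/wiki/Web_scraping'
content, remaining = get_google(start_url, seen)
seen.add(start_url)
print(len(content[start_url]), 'text fragments collected')
print(len(remaining), 'candidate links queued')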
Example #11
def ask_google_en_to_zh(s):
    q = s + " wikipedia 中文"
    result = search(q, lang="zh", pause=1.0)
    cnt = 0
    for x in result:
        cnt += 1
        if cnt >= 10:
            break
        if 'wikipedia' in x:
            x = urllib.parse.unquote(x)
            x = x.split('/')[-1]
            if mafan.text.is_traditional(x):
                return x
            if mafan.text.is_simplified(x):
                url = "http://zh.wikipedia.org/zh-tw/" + urllib.parse.quote_plus(
                    x)
                print(url)
                try:
                    w = get_page(url)
                except:
                    continue
                soup = BeautifulSoup(w, 'lxml')
                x = soup.title.string
                x = x.strip().split(' ')[0]
                if '維基百科' not in x:
                    return x
    return None
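A usage sketch for the function above (the query string is only an example, and the result depends on live search output and the mafan traditional/simplified checks):

zh = ask_google_en_to_zh("deep learning")
print(zh)  # e.g. a Traditional-Chinese Wikipedia title, or None if nothing matched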
Example #12
def who(s):
    g = google.search(s, num=1, start=0, stop=8)
    #l = BeautifulSoup(google.get_page(g.next()))
    #x = givetext(l.prettify()) 
    f=[]
    #soup=[(BeautifulSoup(google.get_page(x)).find_all('p') for x in urls] HOW TO SOUP
    urls = [x for x in g]
    f.append("Step 1 - Collecting URLS:")
    for x in urls:
        f.append(x)
    f.append("")
    f.append("")
    soup = [(google.get_page(x)) for x in urls]
    alphabetsoup = [nameapp.givetext(x) for x in soup]
    f.append("Step 2 - Collecting all the Names:")
    f.append(str(alphabetsoup))
    f.append("")
    f.append("")
    splitted = []
    for x in alphabetsoup:
        splitted += x
    """for i in urls:
        html.append(BeautifulSoup(google.get_page(i)))
    allnames = []
    for i in html:
        allnames.append(nameapp.givetext(i))
    splitted = []
    for i in allnames:
        splitted += i.split(",")"""
    namestats = {}
    ##all names
    for i in splitted:
        if i in namestats:
            namestats[i] += 1
        else:
            namestats[i] = 1
    f.append("Step 3 - Making a Dictionary:")
    f.append(str(namestats))
    f.append("")
    f.append("")
    final = {}
    for i in namestats:
        if namestats[i] >= 25:
            final[i] = namestats[i]

    f.append("Step 4 - Narrowing Possibilites:")
    f.append(str(final))
    f.append("")
    f.append("")
    maxname = list(final.keys())[0]
    if (maxname == s):
        maxname = list(final.keys())[1]
    maxvalue = final[maxname]
    for x in final.keys():
        if final[x] > maxvalue and x != s:
            maxname = x
            maxvalue = final[x]
    f.append("Step 5 - The Answer:")
    f.append(maxname)
    return f
Example #13
def get_stuff(query): 
    s = google.search(query, tld='com', lang='en', start=0, stop=10, pause=2.0)
    l = []
    for url in s:
        l.append(url)
    data = []
    for x in l:
        data.append(google.get_page(x))
    return data
Example #14
def get_stuff(query):
    s = google.search(query, tld='com', lang='en', start=0, stop=10, pause=2.0)
    l = []
    for url in s:
        l.append(url)
    data = []
    for x in l:
        data.append(google.get_page(x))
    return data
def parse_urls(urlsList):
    #takes list of urls, returns only text
    #beautiful soup
    html = ""
    for u in urlsList:
        content = google.get_page(u)
        soup = BeautifulSoup(content)
        html += soup.get_text()
    print "Parsed HTML code!"
    return html
def parse_urls(urlsList):
    #takes list of urls, returns only text
    #beautiful soup
    html = ""
    for u in urlsList:
        content = google.get_page(u)
        soup = BeautifulSoup(content)
        html += soup.get_text()
    print "Parsed HTML code!"
    return html
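parse_urls expects a list of result URLs rather than raw HTML; a short usage sketch (Python 2 to match the print statement above; the query string is only an example):

urls = list(google.search("python web scraping", stop=5))
text = parse_urls(urls)
print "Collected %d characters of text" % len(text)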
Example #17
def retRandomResult(search):
    result = {}
    res = google.search(search)
    for c, i in enumerate(res):
        if c > 15:
            break
        if c % 6 == random.randint(0, 5):
            g = Goose()
            a = g.extract(raw_html=google.get_page(i))
            result["resNum"] = c
            result["title"] = a.title
            result["url"] = i
            result["blob"] = getBlob(search, a.cleaned_text)
        else:
            continue
    if len(result) == 0:
        for i in res:
            g = Goose()
            a = g.extract(raw_html=google.get_page(i))
            result["title"] = a.title
            result["url"] = i
            result["blob"] = getBlob(search, a.cleaned_text)
            return result
    return result
Example #18
def search(question):
    name = False
    date = False

    searchtype = question.split(" ")[0]
    if searchtype.lower() == "who":
        name = True
    elif searchtype.lower() == "when":
        date = True
    else:
        return [None,0]

    g = google.search(question, num=10, start=0, stop=10, pause=3.0)
    utils = [w for w in g]
    
    d = {}
    count = 0
    for x in utils:
    #run beautiful soup to find names
        try:
            html_doc = google.get_page(x)
            soup = BeautifulSoup(html_doc)
            y = soup.get_text()

            if name:
                names = check_names(get_potential_names(y))
                #names = findname(y)
                for k in names.keys():
                    count = count + names[k]
                    if k not in d.keys():
                        d[k] = names[k]
                    else:
                        d[k] = d[k] + names[k]
                    #find highest number of names
                   # dhigh=findhigh(d);
                    #return addition of that 
            elif date:
                dates = find_dates( y )
                for k in dates.keys():
                    count = count + dates[k]
                    if k not in d.keys():
                        d[k] = dates[k]
                    else:
                        d[k] = d[k] + dates[k]
            else: 
                return


        except Exception as error:
            pass
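check_names, get_potential_names and find_dates come from the surrounding project and are not shown; a rough sketch of a find_dates that would fit the dictionary-of-counts usage above (hypothetical):

import re
from collections import Counter

def find_dates(text):
    # Hypothetical helper: map each four-digit year found in the text to its occurrence count.
    return dict(Counter(re.findall(r'\b[12][0-9]{3}\b', text)))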
Example #19
def get_google_result(q, result_map):
    # Form search url
    google_url = get_google_search_url(q)

    # Parse text of result page
    print(google_url)
    google_search_results = google.get_page(google_url)
    google_search_soup = BeautifulSoup(google_search_results, "html.parser")
    google_text_search_results = google_search_soup.get_text().lower()

    result_map["content"] = google_text_search_results
    result_map["words"] = google_text_search_results.split(" ")

    # Return text
    return google_text_search_results
def parsePages(urls, wordsInQuery):
    # For example, {"Zeus":500, "Jupiter": 366}
    namesByFrequency = {}

    # Loop through urls, remembering the index for weighting
    for index, url in enumerate(urls):
        try:
            html = google.get_page(url)
        except:
            continue

        if wordsInQuery[0] == "WHEN":
            namesInThisPage = searchHelper.extractDates(html)
        else:
            namesInThisPage = searchHelper.extractNames(html)

        namesInThisPage = searchHelper.weightNames(namesInThisPage, index, NUM_GOOGLE_RESULTS)
        namesByFrequency = searchHelper.addDicts(namesByFrequency, namesInThisPage)

    namesByPercent = searchHelper.compareNames(namesByFrequency)

    return namesByPercent
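searchHelper is project-specific; addDicts evidently merges two frequency dictionaries, for which a minimal sketch might be (hypothetical):

def addDicts(a, b):
    # Merge two frequency dictionaries, summing the counts of shared keys.
    merged = dict(a)
    for key, value in b.items():
        merged[key] = merged.get(key, 0) + value
    return merged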
Example #21
def when(s):
    f = []
    f.append("Step 1 - Collecting URLS:")
    g = google.search(s, num=1, start=0, stop=8)
    urls = [x for x in g]
    for annoyingvariable in urls:
        f.append(annoyingvariable)
    f.append("")
    f.append("")
    soup = [(google.get_page(x)) for x in urls]
    f.append("Step 2 - Collecting all the Years:")
    yearsoup = [re.findall(r'\s[1-2][0-9]{3}', x) for x in soup]
    y = yearsoup
    f.append(str(y))
    f.append("")
    f.append("")
    yeardict = {}
    for page in yearsoup:
        if len(page) > 0:
            for year in page:
                if year in yeardict:
                    yeardict[year] += 1
                else:
                    yeardict[year] = 1
 #   return yeardict
 #   yeardict.update(monthdict)
    
    f.append("Step 3 - Collecting all the Months:")
    monthsoup = [findmonths(x) for x in soup]
    m = monthsoup
    f.append(str(m))
    f.append("")
    f.append("")
    monthdict = {}
    for page in monthsoup:
        for month in page:
            if len(month) > 0:
                if month[0] in monthdict:
                    monthdict[month[0]] += len(month)
                else:
                    monthdict[month[0]] = len(month)

    f.append("Step 4 - Making the Dictionaries:")
    yd = yeardict
    f.append(str(yd))
    f.append("")
    f.append("")
    md = monthdict
    f.append(str(md))
    f.append("")
    f.append("")
    topyear = list(yeardict.keys())[0]

    for year in yeardict:
        if yeardict[year] > yeardict[topyear]:
            topyear = year

    topmonth = list(monthdict.keys())[0]

    for month in monthdict:
        if monthdict[month] > monthdict[topmonth]:
            topmonth = month

    

    
    f.append("Step 5 - The Answer:")
    f.append(topmonth + ", " + topyear)
    return f
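findmonths is assumed by the snippet above but not shown; judging by how monthsoup is consumed, it should return, per page, one list of matches for each month name. A minimal sketch (hypothetical):

import re

MONTHS = ("January", "February", "March", "April", "May", "June",
          "July", "August", "September", "October", "November", "December")

def findmonths(text):
    # Hypothetical helper: for each month name, return the list of its occurrences in the text.
    return [re.findall(month, text) for month in MONTHS]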
Example #22

keyword = 'apple'

res_gen = google.search(keyword, stop=10)

res = []

for site in res_gen:
    if site not in res:
        res.append(site)

print "a"

fo = open("res.html", "wb")
site_html = google.get_page(res[5])
p = Parser()
p.init_parser()
p.feed(site_html)

print p.data_list
fo.write("<meta charset='UTF-8'><html><body>")
for sentence in p.data_list:
    braille_sentence = t.convert(sentence)
    print braille_sentence
    fo.write("<p>")
    fo.write("".join(braille_sentence))
    fo.write("</p>")
#print site_html
fo.write("</body></html>")
fo.close()
Example #23
def when(s):
    f = []
    f.append("Step 1 - Collecting URLS:")
    g = google.search(s, num=1, start=0, stop=8)
    urls = [x for x in g]
    for annoyingvariable in urls:
        f.append(annoyingvariable)
    f.append("")
    f.append("")
    soup = [(google.get_page(x)) for x in urls]
    f.append("Step 2 - Collecting all the Years:")
    yearsoup = [re.findall(r'\s[1-2][0-9]{3}', x) for x in soup]
    y = yearsoup
    f.append(str(y))
    f.append("")
    f.append("")
    yeardict = {}
    for page in yearsoup:
        if len(page) > 0:
            for year in page:
                if year in yeardict:
                    yeardict[year] += 1
                else:
                    yeardict[year] = 1

#   return yeardict
#   yeardict.update(monthdict)

    f.append("Step 3 - Collecting all the Months:")
    monthsoup = [findmonths(x) for x in soup]
    m = monthsoup
    f.append(str(m))
    f.append("")
    f.append("")
    monthdict = {}
    for page in monthsoup:
        for month in page:
            if len(month) > 0:
                if month[0] in monthdict:
                    monthdict[month[0]] += len(month)
                else:
                    monthdict[month[0]] = len(month)

    f.append("Step 4 - Making the Dictionaries:")
    yd = yeardict
    f.append(str(yd))
    f.append("")
    f.append("")
    md = monthdict
    f.append(str(md))
    f.append("")
    f.append("")
    topyear = list(yeardict.keys())[0]

    for year in yeardict:
        if yeardict[year] > yeardict[topyear]:
            topyear = year

    topmonth = list(monthdict.keys())[0]

    for month in monthdict:
        if monthdict[month] > monthdict[topmonth]:
            topmonth = month

    f.append("Step 5 - The Answer:")
    f.append(topmonth + ", " + topyear)
    return f
Example #24
def fetch_pages(question):
    return [google.get_page(url) for url in google.search(question, stop=10)]
def findContactPage(url):
    html = google.get_page(url)
    soup = BeautifulSoup(html)
    contactStr = soup.find_all('a', href=re.compile(".*?contact", re.IGNORECASE))
    return contactStr
import google
import re
from bs4 import BeautifulSoup


def findContactPage(url):
    html = google.get_page(url)
    soup = BeautifulSoup(html)
    contactStr = soup.find_all('a', href=re.compile(".*?contact", re.IGNORECASE))
    return contactStr


if __name__ == "__main__":
    url = "http://www.wrangler.com/"
    contactStr = findContactPage(url)
    if(len(contactStr) > 0):
        contactPage = google.get_page(contactStr[0].get("href"))
        print contactStr[0].get("href")#.find_parents("a")
        soup = BeautifulSoup(contactPage)
        emailStr = soup.find_all(text=re.compile(r"[\w\.-]+@[\w\.-]+"))
        if (len(emailStr) > 0):
            print emailStr
        else:
            print "could not find email"
    else:
        print "could not find contacts page"
def myNewsSearch(key):
    data = {'q' : key}
    url = 'https://www.google.com/search?hl=en&gl=us&tbm=nws&authuser=0&tbs=qdr:h72&'+urllib.urlencode(data)
    page = google.get_page(url)
    return page
Example #28
def myNewsSearch(key):
    data = {'q': key}
    url = 'https://www.google.com/search?hl=en&gl=us&tbm=nws&authuser=0&tbs=qdr:h72&' + urllib.urlencode(
        data)
    page = google.get_page(url)
    return page
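A usage sketch for the helper above (Python 2 to match the urllib.urlencode call; the query is only an example, and Google may block or reformat scripted result pages):

from bs4 import BeautifulSoup

page = myNewsSearch("electric vehicles")
print BeautifulSoup(page, 'html.parser').title.text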