Example #1
def research_keywords(something_unknown, model, websites_to_scan=10, keyword_count=25, attempts=0, google_api_key=GOOGLE_API_KEY):
	# searches for something unknown on Google to find related websites and returns a ranked list of keywords from across all sites
	maximum_number_of_google_search_attempts = 3
	if attempts < maximum_number_of_google_search_attempts:
		all_keywords = Manager().dict()
		engine = Google(license=google_api_key, throttle=1.0, language="en")
		try:
			processes = []
			for page in range(int(websites_to_scan/10)):
				for website in engine.search(something_unknown, start=page+1, count=10, type=SEARCH, cached=False):
					web_mining_process = Process(target=extract_keywords, args=(website.url, model, all_keywords))
					processes.append(web_mining_process)
					web_mining_process.start()
				[process.join() for process in processes]
		except HTTP403Forbidden:
			print "\nToday's maximum number of free searches from Google shared by this API across all words2map users has expired.\nPlease get your own key at https://code.google.com/apis/console\n\nFrom that site, simply:\n1. In the API Manager Overview, find \"Custom Search API\" and enable it\n2. Copy your new API key from \"Credentials\"\n3. Paste it in words2map.py in the global variable \"GOOGLE_API_KEY\"\n"
			sys.exit(1)
		except (URLError, URLTimeout, HTTPError, SSLError):
			print "\nUnable to reach Google Search for {}, trying one more time".format(something_unknown)
			return research_keywords(something_unknown, model, websites_to_scan=websites_to_scan, keyword_count=keyword_count, attempts=attempts+1, google_api_key=google_api_key)
		all_keywords = sorted(all_keywords.items(), key=itemgetter(1), reverse=True)
		print "\nKeywords about {} to combine vectors for:".format(something_unknown)
		top_keywords = []
		for i in range(keyword_count):
			try:
				keyword, score = all_keywords[i]
				top_keywords.append(all_keywords[i])
				print "{} {}".format(round(score, 3), unidecode(keyword).replace("_", " "))
			except IndexError:
				break
		return top_keywords
	else:
		print "After a few tries, it seems that Google is not returning results for us. If you haven't done so already, please try adding your own API key at https://code.google.com/apis/console\n\nFrom that site, simply:\n1. In the API Manager Overview, find \"Custom Search API\" and enable it\n2. Copy your new API key from \"Credentials\"\n3. Paste it in words2map.py in the global variable \"GOOGLE_API_KEY\"\n"
		sys.exit(1)
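A minimal, hypothetical call of research_keywords() above; it assumes the words2map globals (GOOGLE_API_KEY, the extract_keywords worker used by the multiprocessing code) are defined as in the snippet, and load_model() is only a placeholder for however the word-vector model is actually loaded:

# Sketch only: research_keywords() returns a list of (keyword, score) tuples.
model = load_model()  # placeholder, not part of the snippet above
for keyword, score in research_keywords("permaculture", model, websites_to_scan=20, keyword_count=10):
    print("{} {}".format(round(score, 3), keyword))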
Example #3
def google_search(targetword, itemlist, targetpath):
    resultnum = 0
    engine = Google(license=None)
    file = codecs.open(targetpath, 'a', 'utf-8')
    patt = ur'\W+'
    for item in itemlist:
        for i in range(1, 5):
            for result in engine.search(item, type=SEARCH, start=i):
                url = URL(result.url)
                text = url.download(unicode=True)
                text = plaintext(text)
                text = correctPersianString(text)
                text = text.replace('\n', ' ')
                lines = text.split('.')
                for line in lines:
                    if targetword in line:
                        resultnum += 1
                        # strip punctuation before tokenizing the matching sentence
                        for punc in punclist:
                            if punc in line:
                                line = line.replace(punc, ' ')
                        match = re.findall(patt, line)
                        output = ' '.join(match)
                        print output
                        file.write(output)
                        file.write('\n')
    print str(resultnum) + " found in web"
    file.close()
Example #4
def google_search(match,targetfile):
    engine = Google(license=None)
    for i in range(1,10):
        for result in engine.search(match, type=SEARCH, start=i):
              print plaintext(result.description)
              targetfile.write(plaintext(result.description))
              targetfile.write('\n')
Example #6
def youtubeLinkGoogle(author, album):
	if author == 'unknown':
		author = ''
	name = author + ' ' + album
	key='AIzaSyC0go1dbuPHJhGYnONXvBc8z9Q8GkBSosw'
	engine = Google(license=key, throttle=0)
	results = engine.search(name + " youtube")
	return results[0].url
Example #7
    def run(self, q):  # q is the query
        engine = Google(license=None, language="en")
        # Google is very fast but you can only get up to 100 (10x10) results per query.
        urlsGoogle = []
        for i in range(1, 11):
            for result in engine.search(q, start=i, count=10, type=SEARCH, cached=True):
                urlsGoogle.append(result.url)
        return urlsGoogle
    def get_urls(self, q = "", n = 1, limit = 1):
        url = []
        reload(sys)
        sys.setdefaultencoding(GOOGLE_API_ENCODING)
        engine_google = Google(license=GOOGLE_API_KEY, language=GOOGLE_API_LANG)
        for i in range(1, (n + 1)):
            for result in engine_google.search(q, start=i, count=10, type=SEARCH, cached=False):
                url.append(result.url)

        return url[:limit]
    def __init__(self, query=''):
        engine = Google(license=None, throttle=0.5, language=None)
        c = 0
        for i in range(1, 5):
            results = engine.search(query, start=i)
            for result in results:
                c += 1
                print c
                print result.url + ':' + result.title
                print repr(plaintext(result.text))
                print ""
Example #11
def gnews_polarity(topic):
    engine = Google()
    results = engine.search(topic, type=NEWS)
    score = 0
    scraped = 0
    # only 8 results without using paging/cursor
    for result in results:
        content = heuristic_scrape(urllib.unquote(result.url))
        if content:
            polarity = sentiment(content)
            score = score + polarity
            scraped += 1
    # average sentiment over the results that could actually be scraped;
    # this avoids mutating the result list while iterating over it
    return score / float(scraped) if scraped else 0.0
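A minimal, hypothetical call of gnews_polarity(), assuming heuristic_scrape() and sentiment() are defined as the snippet expects:

# Sketch only: prints the average polarity across the scraped news results.
print(gnews_polarity("electric vehicles"))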
Example #13
def generar_consulta_google(q):
    url = []
    reload(sys)
    sys.setdefaultencoding('utf8')

    # engine_google = Google(license="AIzaSyCvvHb8SYgHvS5gEIQabxuJ0Kl0sYdHl9U", language="en")
    engine_google = Google(license="AIzaSyCKlCEJ41mE_6gqTN2AI9J4iSB-2L55zR0", language="en")
    for consulta in q:
        for i in range(1, 2):
            for result in engine_google.search(consulta, start=i, count=10, type=SEARCH, cached=False):

                titulo = strip_accents(result.title)
                url.append(result.url)
    return url
Example #14
def learn(concept):
    """ Returns a list of properties for the given concept,
        collected from a "I think X is Y".
    """
    q = 'I think %s is *' % concept
    p = []
    g = Google(language='en', license=None)
    for i in range(10):
        for result in g.search(q, start=i, cached=True):
            m = plaintext(result.description)
            m = search(q, m) # Use * as a wildcard.
            if m:
                p.append(m[0][-1].string)
    return [w for w in p if w in PROPERTIES] # only handles known properties...
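A minimal, hypothetical call of learn(), assuming PROPERTIES is a predefined list of known property words defined elsewhere in the script:

# Sketch only: prints whichever known properties the "I think X is Y" queries surface.
print(learn("a chainsaw"))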
Example #15
def get_info(search_query):
	if isinstance(search_query, str):
		search_query = str(search_query)
	else:
		return { "Error": "Pass a string, from mine.py [7]", "Result": [None] }

	result = []
	engineGoogle = Google(license=None, throttle=0.5, language=None)
	engineBing = Bing(license=None, throttle=0.5, language=None)
	engineTwitter = Twitter(license=None, throttle=0.5, language=None)
	engineFacebook = Facebook(license=None, throttle=1.0, language='en')
	engineWikipedia = Wikipedia(license=None, throttle=5.0, language=None)
	engineFlickr = Flickr(license=None, throttle=5.0, language=None)
	engineArray = [engineGoogle, engineBing, engineTwitter, engineFacebook, engineWikipedia, engineFlickr]
	engineArray = [engineGoogle, engineTwitter]

	'''
	for i in range(1,2):
		# result = result + ([repr(plaintext(para.text)) for para in engine[0].search(search_query, type=SEARCH, start=i, count=5)])
		[result.append([result.append(repr(plaintext(para.text))) for para in engine.search(search_query, type=SEARCH, start=i, count=5)]) for engine in engineArray]
			# print repr(plaintext(para.text))
			# print repr(plaintext(para.url)) + '\n\n'
			# result.append(repr(plaintext(para.text)))
	'''

	# Google
	for i in range(1, 5):
		result = result + ([para.text for para in engineGoogle.search(search_query, type=SEARCH, start=i, count=10)])
		
	for i in range(1, 5):
		result = result + ([para.text for para in engineTwitter.search(search_query, type=SEARCH, start=i, count=10)])
	'''
	# for i in range(1,2):
		# result = result + ([repr(plaintext(para.text)) for para in engineBing.search(search_query, type=SEARCH, start=i, count=5)])
	for i in range(1,2):
		result = result + ([repr(plaintext(para.text)) for para in engineTwitter.search(search_query, type=SEARCH, start=i, count=10)])
	# for i in range(1,2):
		# result = result + ([repr(plaintext(para.text)) for para in engineFacebook.search(search_query, type=SEARCH, start=i, count=5)])
	# for i in range(1,2):
		# result = result + ([repr(plaintext(para.text)) for para in engineWikipedia.search(search_query, type=SEARCH, start=i, count=5)])
	# for i in range(1,2):
		# result = result + ([repr(plaintext(para.text)) for para in engineFlickr.search(search_query, type=SEARCH, start=i, count=5)])
	'''

	return { "Error": None, "Result": result }

	# return { "Error": None, "Result": ['Hello World', 'Bye Bye Tommy'] }
Example #16
def learn(concept):

    q = 'I think %s is *' % concept
    p = []
    g = Google(language='en')

    for i in range(10):

        for result in g.search(q, start=i, cached=True):

            m = plaintext(result.description)
            m = search(q, m)  # use * as wildcard

            if m:
                p.append(m[0][-1].string)

    return [w for w in p if w in PROPERTIES]
Example #17
 def patternSearch(self, n=12, m=50):
     logger.info(u"patternSearch")
  
     proxyList = list()
     proxyList.append(u"3128")
     proxyList.append(u"206.217.138.154")
     
     logger.info(u"proxyList - %s" % proxyList)
  
     engine = Google(license=None, throttle=0.5, language=None)
     # engine = Bing(license=None, throttle=0.5, language=None)
 
     for i in range(n):                
         logger.info(u"Search %d" % i)
         results = engine.search(self.searchTerm, start=i+1, count=m, cached=False, proxy=proxyList)
         
         for r in results:
             logger.debug(u"Result=%s" % r.text)
             url = r.url
             logger.debug(u"URL=%s" % url)
             
             # if url[-4:] == ".com":
             #    continue
                     
             s = r.text.lower()
             s = plaintext(s)
             s = parsetree(s)
 
             # self.logSentences(s)
 
             # Execute a Regular Expression Search
             # p = r'(NN)+ (VB)+'
             p = r'(NN)+'
             q = search(p, s)
 
             # logPOS(q)
 
             # Iterate over all the words in the POS
             logger.debug(u"  q.Length=%d" % len(q))
             logger.debug(u"  q[]=%s" % q)
 
             self.g, self.urlConcepts, self.wordConcepts = \
                 self.addNodes(self.g, q, url, self.urlConcepts, self.wordConcepts)
     
     return self.urlConcepts, self.wordConcepts
Example #18
def get_info(search_query):
    if isinstance(search_query, str):
        search_query = str(search_query)
    else:
        return {"Error": "Pass a string, from mine.py [7]"}

    google = [{'text': '', 'url': '', 'title': ''}]
    twitter = [{'text': '', 'url': '', 'title': ''}]
    engineGoogle = Google(license=None, throttle=0.5, language=None)
    # engineBing = Bing(license=None, throttle=0.5, language=None)
    engineTwitter = Twitter(license=None, throttle=0.5, language=None)
    # engineFacebook = Facebook(license=None, throttle=1.0, language='en')
    # engineWikipedia = Wikipedia(license=None, throttle=5.0, language=None)
    # engineFlickr = Flickr(license=None, throttle=5.0, language=None)
    # engineArray = [engineGoogle, engineBing, engineTwitter, engineFacebook, engineWikipedia, engineFlickr]
    engineArray = [engineGoogle, engineTwitter]

    # Google
    for i in range(1, 2):
        for para in engineGoogle.search(search_query,
                                        type=SEARCH,
                                        start=i,
                                        count=5):
            google.append({
                'text': para.text,
                'url': para.url,
                'title': para.title
            })
        #resultGoogle = resultGoogle + ([para.text for para in engineGoogle.search(search_query, type=SEARCH, start=i, count=10)])
    # Twitter
    for i in range(1, 2):
        for para in engineTwitter.search(search_query,
                                         type=SEARCH,
                                         start=i,
                                         count=5):
            twitter.append({
                'text': para.text,
                'url': para.url,
                'title': para.title
            })
        #resultTwitter = resultTwitter + ([para.text for para in engineTwitter.search(search_query, type=SEARCH, start=i, count=10)])

    # print 'From data_mine.py --> google: ', google, ', twitter: ', twitter

    return {"Error": None, "Google": google, "Twitter": twitter}
Example #19
def googlecorpsearch(word,concfilter = '', extraquery='',license=None, start=1, count=8):
    """Searches the web for sentences containing a certain keyword, and possibly a co-occurence word. Generator yielding (leftcontext,word,rightcontext,url) tuples.
       First queries Google, and then retrieves the pages of the top search results.
       Uses 'pattern' (CLiPS, Antwerpen University)
       """
    if not concfilter:
        query = 'allintext: ' + word 
    else:
        query = 'allintext: "' + word + ' * ' + concfilter + '" OR "' + concfilter + ' * ' + word + '"'
    if extraquery:
        query += ' ' + extraquery
        

    engine = Google(license=license)
        
    processed = {}
    
    for result in engine.search(query, start=start,count=count):
        if not result.url in processed:
            processed[result.url] = True
            try:
                content = plaintext(result.download())
            except:
                continue
                
            begin = 0
            wordindex = None
            wordlength = 0
            concindex = None            
            for i in range(1,len(content)):
                if content[i] == '.' or content[i] == '?' or content[i] == '!' or content[i] == '\n':
                    if wordindex >= begin and ((concfilter and concindex >= begin) or (not concfilter)):
                        if len(content[begin:wordindex].strip()) > 5 or len(content[wordindex+wordlength:i+1].strip()) > 5:
                            yield (content[begin:wordindex].strip(), content[wordindex:wordindex+wordlength].strip(), content[wordindex+wordlength:i+1], result.url)
                    wordindex = concindex = None
                    begin = i + 1
                if len(word)+i <= len(content) and content[i:i+len(word)].lower() == word.lower():
                    wordindex = i
                    wordlength = len(word)
                    for j in range(len(word),len(content)):                        
                        if i+j < len(content) and (content[i+j] == ' ' or  content[i+j] == '?' or content[i+j] == '!' or content[i+j] == '\n'):
                            wordlength = j
                            break                                                                
                if concfilter and content[i:i+len(concfilter)].lower() == concfilter.lower():
                    concindex = i
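A minimal, hypothetical use of the googlecorpsearch() generator above (the query word is only an illustration):

# Sketch only: print a small keyword-in-context listing from the top search hits.
for left, kw, right, url in googlecorpsearch("serendipity", count=4):
    print(left + " [" + kw + "] " + right.strip())
    print(url)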
Example #20
def generar_consulta_google(q):
    url = []
    reload(sys)
    sys.setdefaultencoding('utf8')

    # engine_google = Google(license="AIzaSyCvvHb8SYgHvS5gEIQabxuJ0Kl0sYdHl9U", language="en")
    engine_google = Google(license="AIzaSyCKlCEJ41mE_6gqTN2AI9J4iSB-2L55zR0",
                           language="en")
    for consulta in q:
        for i in range(1, 2):
            for result in engine_google.search(consulta,
                                               start=i,
                                               count=10,
                                               type=SEARCH,
                                               cached=False):

                titulo = strip_accents(result.title)
                url.append(result.url)
    return url
Example #21
def main():
    activites = ["Summary", "Named Entity Recognition", "Search", "Keywords"]
    choice = st.sidebar.selectbox("Select Activity", activites)
    if choice == "Summary":
        html_temp = """
	<div style="background-color:#16A085;"><p style="color:white;font-size:60px;">Text Summarizer</p></div>
	"""
        components.html(html_temp)
        text = st.text_area("Input Text For Summary", height=300)
        if st.button("summarize"):
            st.success(summary(text))
        text_range = st.sidebar.slider("Summarize words Range", 25, 500)

    # Named Entity Recognition
    elif choice == "Named Entity Recognition":
        html_temp1 = """
	<div style="background-color:#16A085;"><p style="color:white;font-size:60px;">Text Tokenizer</p></div>
	"""
        components.html(html_temp1)
        row_data = st.text_area("write Text For Tokenizer")
        docx = nlp(row_data)
        if st.button("Tokenizer"):
            spacy_streamlit.visualize_tokens(
                docx, attrs=['text', 'pos_', 'dep_', 'ent_type_'])
        if st.button("NER"):
            spacy_streamlit.visualize_ner(docx,
                                          labels=nlp.get_pipe('ner').labels)
        if st.button("Text Relationship"):
            spacy_streamlit.visualize_parser(docx)

    #Search Bar
    elif choice == "Search":
        html_temp4 = """
	<div style="background-color:#16A085;"><p style="color:white;font-size:60px;,text-align:center;">Search Bar</p></div>
	"""
        components.html(html_temp4)
        row_text = st.text_input("Search Anything")
        google = Google(license=None)
        if st.button("search"):
            for search_result in google.search(row_text):
                st.write(search_result.text)
                st.warning(search_result.url)
Example #22
# The web module has a SearchEngine class with a search() method 
# that yields a list of Result objects.
# Each Result has url, title, description, language, author and date properties.
# Subclasses of SearchEngine include: 
# Google, Yahoo, Bing, Twitter, Wikipedia, Flickr.

# This example retrieves results from Google based on a given query.
# The Google search engine can handle SEARCH, IMAGE, NEWS and BLOG type searches.

# Google's "Custom Search API" is a paid service.
# The web module uses a test account with a 100 free queries per day, shared with all users.
# If the limit is exceeded, SearchEngineLimitError is raised.
# You can obtain your own license key at: https://code.google.com/apis/console/
# Activate "Custom Search API" under "Services" and get the key under "API Access".
# Then use Google(license=[YOUR_KEY]).search().
# This will give you 100 personal free queries, or $5 per 1,000 queries.
engine = Google(license=None, language="en")

# Veale & Hao's method for finding simile using Google's wildcard (*) support.
# http://afflatus.ucd.ie/Papers/LearningFigurative_CogSci07.pdf)
# This will match results such as "as light as a feather", "as cute as a cupcake", etc.
q = "as * as a *"

# Google is very fast but you can only get up to 100 (10x10) results per query.
for i in range(1,2):
    for result in engine.search(q, start=i, count=10, type=SEARCH):
        print plaintext(result.description) # plaintext() removes HTML formatting.
        print result.url
        print result.date
        print
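For reference, a hedged sketch (not part of the original example) of running the same query with a personal key and catching the shared-quota error mentioned in the comments above; the key string is only a placeholder:

from pattern.web import Google, SEARCH, SearchEngineLimitError, plaintext

engine = Google(license="YOUR_CUSTOM_SEARCH_API_KEY", language="en")  # placeholder key
try:
    for result in engine.search("as * as a *", start=1, count=10, type=SEARCH):
        print(plaintext(result.description))
except SearchEngineLimitError:
    print("Query limit reached; supply your own Custom Search API key.")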
Example #23
def gnewshits(topic):
    engine = Google()
    results = engine.search(topic, type=NEWS)
    return results.total
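A minimal, hypothetical call of gnewshits(); results.total is the engine's reported total hit count, so the function returns a single number:

# Sketch only
print(gnewshits("solar power"))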
Example #24
def calc_main():
    st.title("Nimbus Words")
    st.sidebar.header("Input Options")
    expander_bar = st.beta_expander("How To Use This App")
    expander_bar.markdown("""

    **Use the Dropdown Box located within the sidebar on the left to choose 1 of the 6 AI text editing features offered by Nimbus Words.** 

    1) **Summarizer:** Paste in text that will be summarized by our AI model. The first text box will do an automated summary of our program's recommended word count, and the second box beneath that will provide a summary of the exact word count you choose using the slider located within the sidebar.  

    2) **Tokenizer:** Paste in text that will be analyzed by our AI model. The **Tokenizer** button will provide a breakdown on each word within the phrase, for example 'Google' is an organization, or 'Jeff Bezos' is a proper noun. The **NER** button will display all named entities, for example 'Steve Jobs' is a person. The **Text Relationship** button will display a visual graph of the dependency each word has within a sentence or phrase. 

    3) **Synonyms:** Paste in text that will be analyzed by our AI model. The **Synonyms** button will provide you with synonyms to the inputted attribute. The **Definition** checkbox will provide definitions for the attribute. The **Example** checkbox will provide examples of the given attribute in a sentence.

    4) **Translator:** Paste in text that will be translated by our AI model. The **Translate** button will translate the inputted text into one of the many languages that we have provided, and we will automatically detect which language the inputted text is written in.

    5) **Search:** Paste in text that will be preprocessed by our AI model. The **Search** button will do a filtered search for your input.

    6) **Spell Correction:** Paste in text that will be spell-checked by our AI model. The **Correct** button will offer a correct spelling for any grammatical errors that are detected. The **Pluralize**, **Singularize**, **Comparative** and **Superlative** checkboxes do exactly as they say, and output those options for the input you provided.

    """)

    activites = [
        "Summary", "Tokenizer", "Synonyms", "Translator", "Search",
        "Spell Correction"
    ]
    choice = st.sidebar.selectbox("Select Activity", activites)
    if choice == "Summary":
        st.title('AI Text Summarizer')
        text = st.text_area("Input Text For Summary", height=300)
        if st.button("Summarize"):
            st.success(summary(text))
        text_range = st.sidebar.slider("Summarize words Range", 25, 500)
        text = st.text_area("Input Text For Summary", height=250)
        if st.button("Summarize with Custom Word Count"):
            st.warning(summarize(text, word_count=text_range))
    # Tokenizer
    elif choice == "Tokenizer":
        st.title('Text Tokenizer')
        row_data = st.text_area("write Text For Tokenizer")
        docx = nlp(row_data)
        if st.button("Tokenizer"):
            spacy_streamlit.visualize_tokens(
                docx, attrs=['text', 'pos_', 'dep_', 'ent_type_'])
        if st.button("NER"):
            spacy_streamlit.visualize_ner(docx,
                                          labels=nlp.get_pipe('ner').labels)
        if st.button("Text Relationship"):
            spacy_streamlit.visualize_parser(docx)
    # synonyms
    elif choice == "Synonyms":
        st.title('Synonym Generator')
        text = st.text_area("Enter Text")
        if st.button("Synonyms"):
            for syn in wordnet.synsets(text):
                for i in syn.lemmas():
                    st.success(i.name())
        if st.checkbox("Definition"):
            for syn in wordnet.synsets(text):
                st.warning(syn.definition())
        if st.checkbox("Example"):
            for syn in wordnet.synsets(text):
                st.success(syn.examples())
    # Translator
    elif choice == "Translator":
        st.title('Speech Translation')
        row_text = st.text_area("Enter Your Text For Translation", height=300)
        translation_text = TextBlob(row_text)
        list1 = ["en", "ta", "pa", "gu", "hi", "ur", "kn", "bn", "te"]
        a = st.selectbox("select", list1)
        if st.button("search"):
            #input1 = TextBlob("Simple is better than complex")
            st.success(translation_text.translate(to=a))
    #Search Bar
    elif choice == "Search":
        st.title('Web Search')
        row_text = st.text_input("Search Anything")
        google = Google(license=None)
        if st.button("search"):
            for search_result in google.search(row_text):
                st.write(search_result.text)
                st.warning(search_result.url)
    elif choice == "Spell Correction":
        st.title('AI Spell Correction')
        text_data = st.text_area("Enter Text Here")
        a = TextBlob(text_data)
        if st.button("Correct"):
            st.success(a.correct())
        st.title('Pluralize & Singularize')
        text_data1 = st.text_input("Enter a word For pluralize / singularize")
        if st.checkbox("Pluralize"):
            st.warning(pluralize(text_data1))
        if st.checkbox("Singularize"):
            st.warning(singularize(text_data1))

        st.title('Comparative & Superlative')
        text2 = st.text_input("Enter Text For comparative & superlative")
        if st.checkbox("Comparative"):
            st.success(comparative(text2))
        if st.checkbox("Superlative"):
            st.success(superlative(text2))
Example #25
def buildDict(resultString, resultUrl, resultDate):  # signature inferred from the call and return below
    cnt = Counter()
    for i in resultString.split():
        try:
            content = i
            content = u" ".join(content.replace(u"\xa0", u" ").strip().split())
        except UnicodeDecodeError as e:
            print e, i
        if content in schlagList.elements():
            schlagList[content] += 1
            cnt[content] += 1
    return (resultDate, resultUrl, cnt) # why not just write to file?


engine = Google(license=None, language='de') #'0001401664590345725200:dx8-iwqnvyw')

for result in engine.search('Energiewende'):
    articleDict = [buildDict(unicode(result.text), result.url, result.date)]



with codecs.open("google.csv", "a", "utf-8") as f:
    for l,m in sorted(schlagList.most_common(20)):
        f.write(l)
        f.write(",")
        f.write(str(m).encode("utf-8"))
        f.write("\n")
    print schlagList.most_common(20)

'''
with codecs.open("google2.csv", "a", "utf-8") as f:
    for x,y,z in articleDict:
Example #26
from pattern.web import Google, plaintext
import re

words = ["jordan", "germany"]
filename = "../data/corpus-addition"

f = open(filename, "w+")
engine = Google(license=None, throttle=0.5, language=None) # Using Google since I had some issues with Bing.
for word in words:
  print "Working on '%s' ..." % word
  for result in engine.search(word):
    lines = plaintext(result.download()).split("\n")
    lines = [re.sub("[^\\w]", " ", line) for line in lines]
    lines = [re.sub("\\s+", " ", line).strip() for line in lines]
    lines = [line if line.count(" ") >= 4 else "" for line in lines]
    text = re.sub("\\n+", "\n", "\n".join(lines))
    f.write(text + "\n")

f.close()
Example #27
class SourceChecker(object):
    def __init__(self, text, language, max_queries=5, span=5, threshold=.8):
        self.max_queries = max_queries
        self.span = span
        self.threshold = threshold
        self.text = text
        self.language = language
        self.cat_dict = defaultdict(list)
        key = ''
        self.engine = Google(license=key, throttle=0.5, language=None)
        self.cat_weightage_dict = {
            'imposter site': 0.2,
            'fake news': 0.0,
            'parody site': 0.0,
            'some fake stories': 0.0,
            'conspiracy': 0.4,
            'fake': 0.0,
            'rumor': 0.2,
            'unreliable': 0.3,
            'reliable': 0.9,
            'bias': 0.7,
            'clickbait': 0.3,
            'satire': 0.0,
            'junksci': 0.3,
            'political': 0.8,
            'hate': 0.3,
            'blog': 0.5,
            'satirical': 0.1,
            'unrealiable': 0.3,
            'questionable': 0.4,
            'least_biased': 1.0,
            'pseudoscience': 0.5,
            'right_center': 0.8,
            'pro_science': 0.8,
            'left_center': 0.8,
            'right': 0.8,
            'left': 0.8,
            'biased': 0.8,
            'state': 0.5
        }

    def get_queries(self):
        """Function to extract search queries from the text:
        breaks text into ngrams, filters ngrams that consist mostly of stopwords or named entities,
        selects an evenly spaced sample of the remaining ngrams"""

        text = self.text
        beg_quotes = re.findall(r'\"\S', text)
        for each in beg_quotes:
            text = text.replace(each, 'BEGQ' + each[-1])

        end_quotes = re.findall(r'\S\"', text)
        for each in end_quotes:
            text = text.replace(each, each[0] + 'ENDQ')

        text = re.sub('(ENDQ)+', 'ENDQ', text)
        text = re.sub('(BEGQ)+', 'BEGQ', text)
        text = text.replace('--', 'DOUBLEDASH')

        all_ngrams = ngrams(text, n=self.span, punctuation="", continuous=True)
        if self.language in stopwords.fileids():
            stop_words = stopwords.words(self.language)
        else:
            stop_words = []
        queries = []
        queries.append(self.text)
        for ngram in all_ngrams:
            num_stop = len([w for w in ngram if w in stop_words])
            stop_score = float(num_stop) / len(ngram)
            if self.language == 'english':
                chunked = ne_chunk(pos_tag(ngram))
                named_entities = [[w for w, t in elt] for elt in chunked
                                  if isinstance(elt, nltk.Tree)]
                num_ent = sum([len(ent_list) for ent_list in named_entities])
                ent_score = float(num_ent) / len(ngram)
            else:
                ent_score = 0

            if stop_score < self.threshold and ent_score < self.threshold:
                r_string = self.reconstruct_ngram(ngram)
                if r_string in self.text:
                    queries.append(r_string)

        reduction = len(queries) / self.max_queries
        if reduction == 0:
            reduction = 1
        return queries[0::reduction]

    def reconstruct_ngram(self, ngram):
        """Function to reconstruct original substrings from the ngrams"""

        punc_b = ['!', '?', '.', ',', ';', ':', '\'', ')', ']', '}']
        punc_a = ['(', '[', '}', '$']
        ngram = ' '.join(ngram)
        for p in punc_b:
            ngram = ngram.replace(' ' + p, p)
        for p in punc_a:
            ngram = ngram.replace(p + ' ', p)
        ngram = re.sub('(^| )BEGQ', ' "', ngram)
        ngram = re.sub('ENDQ($| )', '" ', ngram)
        ngram = ngram.replace('DOUBLEDASH', '--')
        return ngram

    def load_domains(self):
        """loads domain information"""
        sources_path = pd('data', 'source_data.csv')
        domain_file = Datasheet.load(sources_path, headers=True)
        for row in domain_file:
            url = row[1]
            if str(row[-1]).find("\""):
                cats = row[2:-1]
            else:
                cats = row[2:]
            self.cat_dict[url] = cats

    def pairwise(self, t):
        it = iter(t)
        return izip(it, it)

    def get_urls(self, queries):
        """runs search query through search API and collects returned domain information"""
        domains = defaultdict(list)
        for q in queries:
            q = "\"" + q + "\""
            results = self.engine.search(q)

            for result in results:
                url = result.url
                domain = self.get_domain(url)
                domains[domain].append(q)
        return domains

    def get_domain(self, full_url):
        """function to extract the domain name from the URL"""
        clean_reg = re.compile(r'^((?:https?:\/\/)?(?:www\.)?).*?(\/.*)?$')
        match = re.search(clean_reg, full_url)
        beg, end = match.group(1), match.group(2)
        domain = string.replace(full_url, beg, '')
        domain = string.replace(domain, end, '')
        return domain

    def render_output(self, domains):
        """renders text output"""
        output = defaultdict(list)
        for d, v in domains.items():
            d_cats = [
                c for c in self.cat_dict[d]
                if len(c) > 0 and len(c.split(' ')) < 3
            ]
            overlap = float(len(v)) / self.max_queries
            if overlap <= 0.2:
                output['MINIMAL'].append((d, d_cats))
            elif 0.2 < overlap < 0.6:
                output['SOME'].append((d, d_cats))
            elif overlap >= 0.6:
                output['HIGH'].append((d, d_cats))
        degrees = ['HIGH', 'SOME', 'MINIMAL']
        score1 = {}
        for deg in degrees:
            score2 = 0
            len2 = 0
            if output[deg]:
                for d, cats in sorted(output[deg]):
                    if cats:
                        score3 = 0.0
                        len2 += 1
                        len3 = len(cats)
                        for cat in cats:
                            score3 += self.cat_weightage_dict[cat]
                        score3 /= len3
                        score2 += score3
                    else:
                        continue
            if len2 != 0:
                score2 /= len2
            score1[deg] = score2
        print 'score is'
        cred = (0.5 * score1['HIGH'] + 0.3 * score1['SOME'] +
                0.2 * score1['MINIMAL'])
        print 'credibility score is '
        print cred
        print '\n'
        for deg in degrees:
            if output[deg]:
                print '%s OVERLAP: ' % deg
                for d, cats in sorted(output[deg]):
                    if cats:
                        print d + ': ' + ','.join(cats)
                    else:
                        print d
                print '\n'
        if cred == None:
            cred = 0

        if cred <= 0.5:
            output['RESULT'].append(('false', [int(cred * 100)]))
        else:
            output['RESULT'].append(('true', [int(cred * 100)]))

        return output

    def cleanup_text(self, inputData):
        inputString = re.sub(r'[^\w\s]', "", inputData).strip().lower()
        if re.match(r'^(hello|hi|hey)$', inputString):
            return [False, "Hello. Please enter something useful!"]
        elif re.match(
                r'^(how\s+are\s+you(\s+doing)?|hows\s+it\s+going|hows\s+everything|how\s+are\s+things|hows\s+life)$',
                inputString):
            return [False, "Good. Please enter something useful!"]
        elif re.match(
                r'^(whats\s+up|whats\s+new|whats\s+going\s+on|s+up|whaz+up)$',
                inputString):
            return [False, "Nothing. Please enter something useful!"]
        elif re.match(r'^good\s+(morning|afternoon|evening|night)$',
                      inputString):
            return [
                False,
                re.findall(r'^(good\s+(morning|afternoon|evening|night))$',
                           inputString)[0][0].upper() +
                "! Please enter something useful!"
            ]
        elif len(inputString.split()) < 8:
            return [
                False, "Please make sure the text contains at least 8 words"
            ]
        else:
            return [True, inputData]

    def getSensationalData(self, text):
        model_path = '../SensationalismClassifier/trained_model.pkl'
        clf = SensationalismClassifier(train_data=None,
                                       model=model_path,
                                       dump=False,
                                       debug=False)
        firstFive = ""
        token = text.split()[:5]
        for words in token:
            firstFive = firstFive + words + " "
        myList = [[firstFive, text]]
        res = clf.classify(myList)
        list = res[0]
        # returns 1 if the input is categorized as sensationalist, 0 if not.
        return list[2]
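A hypothetical driver for the class above, assuming the data/source_data.csv sheet that load_domains() expects is present; it only chains the methods already defined:

# Sketch only: run the overlap check end to end on a snippet of text.
checker = SourceChecker("The city council approved the new transit budget after a lengthy public hearing.", "english")
checker.load_domains()
queries = checker.get_queries()
domains = checker.get_urls(queries)
verdict = checker.render_output(domains)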
Example #28
def googlecorpsearch(word,
                     concfilter='',
                     extraquery='',
                     license=None,
                     start=1,
                     count=8):
    """Searches the web for sentences containing a certain keyword, and possibly a co-occurence word. Generator yielding (leftcontext,word,rightcontext,url) tuples.
       First queries Google, and then retrieves the pages of the top search results.
       Uses 'pattern' (CLiPS, Antwerpen University)
       """
    if not concfilter:
        query = 'allintext: ' + word
    else:
        query = 'allintext: "' + word + ' * ' + concfilter + '" OR "' + concfilter + ' * ' + word + '"'
    if extraquery:
        query += ' ' + extraquery

    engine = Google(license=license)

    processed = {}

    for result in engine.search(query, start=start, count=count):
        if not result.url in processed:
            processed[result.url] = True
            try:
                content = plaintext(result.download())
            except:
                continue

            begin = 0
            wordindex = None
            wordlength = 0
            concindex = None
            for i in range(1, len(content)):
                if content[i] == '.' or content[i] == '?' or content[
                        i] == '!' or content[i] == '\n':
                    if wordindex >= begin and (
                        (concfilter and concindex >= begin) or
                        (not concfilter)):
                        if len(content[begin:wordindex].strip()) > 5 or len(
                                content[wordindex + wordlength:i +
                                        1].strip()) > 5:
                            yield (content[begin:wordindex].strip(),
                                   content[wordindex:wordindex +
                                           wordlength].strip(),
                                   content[wordindex + wordlength:i + 1],
                                   result.url)
                    wordindex = concindex = None
                    begin = i + 1
                if len(word) + i <= len(content) and content[i:i +
                                                             len(word)].lower(
                                                             ) == word.lower():
                    wordindex = i
                    wordlength = len(word)
                    for j in range(len(word), len(content)):
                        if i + j < len(content) and (
                                content[i + j] == ' ' or content[i + j] == '?'
                                or content[i + j] == '!'
                                or content[i + j] == '\n'):
                            wordlength = j
                            break
                if concfilter and content[i:i + len(concfilter)].lower() == concfilter.lower():
                    concindex = i
Example #29
def google_search():
    g = Google()
    for r in g.search(query="Barcamp RD 2019", star=1, count=10):
        print(r.title, r.text, r.url)
Example #30
# The web module has a SearchEngine class with a search() method
# that yields a list of Result objects.
# Each Result has url, title, description, date and author properties.
# Subclasses of SearchEngine include:
# Google, Yahoo, Bing, Twitter, Wikipedia, Flickr.

# This example retrieves results from Google based on a given query.
# The Google search engine can handle SEARCH, IMAGE, NEWS and BLOG type searches.

# You should obtain your own license key at:
# http://code.google.com/apis/ajaxsearch/signup.html
# Otherwise you will be sharing the default key with all users of this module.
engine = Google(license=None)

q = "as * as a *"
# Veale & Hao's method for finding simile using Google's wildcard (*) support.
# http://afflatus.ucd.ie/Papers/LearningFigurative_CogSci07.pdf)
# This will match results such as
# - "as light as a feather",
# - "as cute as a cupcake",
# - "as mad as a hatter",
# etc.

# Google is very fast but you can only get up to 64 (8x8) results per query.
for i in range(1, 8):
    for result in engine.search(q, start=i, type=SEARCH):
        print plaintext(
            result.description)  # plaintext() removes HTML formatting.
        print result.url
        print
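Since the comments above note that the Google engine can also handle NEWS-type searches, here is a hedged sketch (not part of the original example) of the same engine running a news query; the topic string is only an illustration:

from pattern.web import Google, NEWS, plaintext

engine = Google(license=None)
for result in engine.search("renewable energy", type=NEWS, start=1, count=5):
    print(result.title)
    print(plaintext(result.description))
    print(result.url)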
Example #31
    'Scale', 'Compass'
])
a
print(a)

#%%% Working with Numbers
# The Pattern library contains functions that can be used to convert numbers in the
# form of text strings into their numeric counterparts and vice versa. To convert from
# text to a numeric representation, the number function is used. Similarly, to convert
# back from numbers to their corresponding text representation, the numerals function
# is used. Look at the following script:

from pattern.en import number, numerals
print(number("one hundred and twenty two"))
print(numerals(256.390, round=2))

#%%%Data Mining
from pattern.web import Google
google = Google()
for results in google.search('Analytics India Magazine'):
    print(results.url)
    print(results.text)

for results in google.search('Gamification'):
    print(results.url)

#twitter
from pattern.web import Twitter

twitter = Twitter()

for results in twitter.search('Analytics India Magazine'):
    print(results.url)
    print(results.text)
Example #32
class FoF(object):
    def __init__(self, text, language, max_queries=10, span=20, threshold=.8):
        self.max_queries = max_queries
        self.span = span
        self.threshold = threshold
        self.text = text
        self.language = language
        self.cat_dict = defaultdict(list)
        key = 'AIzaSyDZzslNRAsgyiiBKx36S8rblRKungcypEA'
        self.engine = Google(license=key, throttle=0.5, language=None)
        self.urls = []

    def reconstruct_ngram(self, ngram):
        punc_b = ['!', '?', '.', ',', ';', ':', '\'', ')', ']', '}']
        punc_a = ['(', '[', '}', '$']
        ngram = ' '.join(ngram)
        for p in punc_b:
            ngram = ngram.replace(' ' + p, p)
        for p in punc_a:
            ngram = ngram.replace(p + ' ', p)
        ngram = re.sub('(^| )BEGQ', ' "', ngram)
        ngram = re.sub('ENDQ($| )', '" ', ngram)
        ngram = ngram.replace('DOUBLEDASH', '--')
        return ngram

    def get_queries(self):
        text = self.text
        beg_quotes = re.findall(r'\"\S', text)
        for each in beg_quotes:
            text = text.replace(each, 'BEGQ' + each[-1])

        end_quotes = re.findall(r'\S\"', text)
        for each in end_quotes:
            text = text.replace(each, each[0] + 'ENDQ')

        text = re.sub('(ENDQ)+', 'ENDQ', text)
        text = re.sub('(BEGQ)+', 'BEGQ', text)
        text = text.replace('--', 'DOUBLEDASH')

        all_ngrams = ngrams(text, n=self.span, punctuation="", continuous=True)
        if self.language in stopwords.fileids():
            stop_words = stopwords.words(self.language)
        else:
            stop_words = []
        queries = []
        for ngram in all_ngrams:
            num_stop = len([w for w in ngram if w in stop_words])
            stop_score = float(num_stop) / len(ngram)
            if self.language == 'english':
                chunked = ne_chunk(pos_tag(ngram))
                named_entities = [[w for w, t in elt] for elt in chunked
                                  if isinstance(elt, nltk.Tree)]
                num_ent = sum([len(ent_list) for ent_list in named_entities])
                ent_score = float(num_ent) / len(ngram)
            else:
                ent_score = 0

            if stop_score < self.threshold and ent_score < self.threshold:
                r_string = self.reconstruct_ngram(ngram)
                if r_string in self.text:
                    queries.append(r_string)
        reduction = len(queries) / self.max_queries
        # guard against a zero slice step when there are fewer queries than max_queries
        if reduction == 0:
            reduction = 1
        return queries[0::reduction]

    def get_domain(self, full_url):
        clean_reg = re.compile(r'^((?:https?:\/\/)?(?:www\.)?).*?(\/.*)?$')
        match = re.search(clean_reg, full_url)
        self.urls.append(full_url)
        beg, end = match.group(1), match.group(2)
        domain = string.replace(full_url, beg, '')
        domain = string.replace(domain, end, '')
        return domain

    def get_urls(self, queries):
        domains = defaultdict(list)
        for q in queries:
            q = "\"" + q + "\""
            results = self.engine.search(q)

            for result in results:
                url = result.url
                domain = self.get_domain(url)
                domains[domain].append(q)
        return domains

    def load_domains(self):
        sources_path = pd('data', 'data.csv')
        domain_file = Datasheet.load(sources_path, headers=True)
        for row in domain_file:
            url = row[1]
            cats = row[2:]
            self.cat_dict[url] = cats

    def render_output(self, domains):
        data = {'SOME': [], 'HIGH': []}
        for d, v in domains.items():
            d_cats = [
                c for c in self.cat_dict[d]
                if len(c) > 0 and len(c.split(' ')) < 3
            ]
            overlap = float(len(v)) / self.max_queries
            if 0.4 < overlap < 0.6:
                data['SOME'].append(d)
            elif overlap >= 0.6:
                data['HIGH'].append(d)
        data['href'] = self.urls
        json_data = json.dumps(data)
        print json_data
        sys.stdout.flush()
Example #33
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Apr  2 02:17:22 2019

@author: abhijithneilabraham
"""

from pattern.web import Bing, SEARCH, plaintext, Google
from ulmfit import ULMFiT
engine = Google(license=key)
searched = []
search_key = 'സഞ്ജു സാംസൺ'

for result in engine.search(search_key, type=SEARCH, start=1):
    print(repr(plaintext(result.text)))
    searched.append(repr(plaintext(result.text)))
print(len(searched))

model = ULMFiT("news/")
for i in searched:
    x = model.predict(i)

    print(x['intent'])
Example #34
rawurl_wsj=[]
rawurl_foxnews=[]
journal_names=['nytimes', 'cbsnews', 'wsj', 'foxnews']
rawurls=[rawurl_nytimes, rawurl_cbsnews, rawurl_wsj, rawurl_foxnews]

g=Google()


#get the New York Times url
for journal, raw_url_title in zip(journal_names, rawurls):

	#in order to get 30 urls with the keyword, used for-loop
	for i in range(1,4):

		# search google results correspoding to the following keyword
		for result in g.search('Donald Trump opinion site:'+journal+'.com', start=i):

		    print result.url # Better not to print all of these -- they're not informative to the person using your program
		    # append the urls to the rawurl_ array
		    # ^ Not a useful comment - you're just stating what the next line does.
		    # Better to say _why_ the next line does what it does
		    raw_url_title.append(result.url)

	print raw_url_title # Again, better not to print all of these -- I'm not going to keep writing this, but it applies throughout your code
	print len(raw_url_title)

	# saves the keyword to the local file in order to reduce query
	# we will use this file for analyzing later on

	f=open('url_'+journal+'.txt', "w")
	print >>f, raw_url_title
Example #35
class Engine(object):

    def __init__(self, provider, key=None):
        if provider.lower() == "bing":
            key = key or 'd6Mz4slIdgIxcKR4609FO+QKOFTEFFRB3i7j8VioPiE'
            self._engine = Bing(license=key)
        elif provider.lower() == "google":
            key = key or 'AIzaSyCAADAKnnkmDwIlLk_Q1p6foqI_ZMrgzcg'
            self._engine = Google(license=key)
        else:
            raise ValueError('Not a recognized provider.')

    def fuzzySearch(self, result, query):
        best, best_i = 0, None
        for i in range(len(result) - len(query) + 1):
            score = similarity(result[i:i+len(query)], query)
            if best < score:
                best = score
                best_i = i
        return result[best_i+len(query):] if best_i != None else ''

    def getPatterns(self, query):
        cleaned = query.strip('?')
        p = [(cleaned, 3)]
        t = parsetree(query)[0]
        for chunk in t.chunks:
            if chunk.pos == 'NP':
                p.append((chunk.string, 2))
        for w in cleaned.split():
            p.append((w, 1))
        return p

    def getGrams(self, results):
        grams = {}
        for text, weight in results:
            uni = set(ngrams(text, n=1))
            bi = set(ngrams(text, n=2))
            tri = set(ngrams(text, n=3))
            for gram in uni:
                grams[gram] = grams.get(gram, 0) + weight
            for gram in bi:
                grams[gram] = grams.get(gram, 0) + weight
            for gram in tri:
                grams[gram] = grams.get(gram, 0) + weight
        return grams

    def removeStopWords(self, grams, queries):
        for gram in grams.keys():
            inter = set([g.lower() for g in gram]) & set(wordlist.STOPWORDS)
            if len(inter) > 1 or inter and len(gram) == 1:
                grams.pop(gram)
        return grams

    def searchQueries(self, queries):
        results = []
        for q, w1, d in queries:
            for r in self._engine.search(q, count=100):
                results.append((re.sub(r'[!,.?]', '', plaintext(r.txt)), w1))
        return results

    def searchQueriesWithPatterns(self, queries):
        # Faster, but still need to refine extraction patterns
        results = []
        for q, w1, d in queries:
            print "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
            print q
            print "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
            for r in self._engine.search(q, count=50):
                # Each result is given a preliminary score based on the weight of the query
                # that retrieved it and the pattern that was matched to it
                for p, w2 in self.getPatterns(q):
                    if d == 'L':
                        m = re.search('(.*?)' + p + '.*\.', plaintext(r.txt), re.IGNORECASE)
                    else:
                        m = re.search(p + '(.*)', plaintext(r.txt), re.IGNORECASE)
                    if m:
                        print plaintext(r.txt)
                        print "-------------------------------------------------"
                        print p, "generated", m.group(1)
                        print "================================================="
                        results.append((m.group(1), w1 + w2))
                        break
        return results

    def searchAndGram(self, queries):
        results = self.searchQueries(queries)
        grams = self.getGrams(results)
        grams = self.removeStopWords(grams, queries)
        return grams
Example #36
def calc_main():
    st.write("Nimbus Words")   
    st.sidebar.header("Input Options") 

    activites = ["Summary", "Tokenizer","Synonyms","Translator","Search","Spell Correction"]
    choice = st.sidebar.selectbox("Select Activity",activites)
    if choice == "Summary":
        st.title('AI Text Summarizer')
        text = st.text_area("Input Text For Summary",height=300)
        if st.button("summarize"):
            st.success(summary(text))
        text_range= st.sidebar.slider("Summarize words Range",25,500)
        text = st.text_area("Input Text For Summary",height=250)
        if st.button("custom summarization"):
           st.warning(summarize(text,word_count=text_range))
    # Tokenizer
    elif choice == "Tokenizer":
        st.title('Text Tokenizer')
        row_data = st.text_area("write Text For Tokenizer")
        docx= nlp(row_data)
        if st.button("Tokenizer"):
            spacy_streamlit.visualize_tokens(docx,attrs=['text','pos_','dep_','ent_type_'])
        if st.button("NER"):
            spacy_streamlit.visualize_ner(docx,labels=nlp.get_pipe('ner').labels)
        if st.button("Text Relationship"):
            spacy_streamlit.visualize_parser(docx)
       # synonyms      
    elif choice == "Synonyms":
        st.title('Synonym Generator')
        text = st.text_area("Enter Text")
        if st.button("Find"):
            for syn in wordnet.synsets(text):
                for i in syn.lemmas():
                    st.success(i.name())
        if st.checkbox("Defination"):
            for syn in wordnet.synsets(text):
                st.warning(syn.definition()) 
        if st.checkbox("Example"):
            for syn in wordnet.synsets(text):
                st.success(syn.examples())
      # Translator          
    elif choice == "Translator":
        st.title('Speech Translation')
        row_text = st.text_area("Enter Your Text For Translation",height=300)
        translation_text = TextBlob(row_text)
        list1 = ["en","ta","pa","gu","hi","ur","kn","bn","te"]
        a= st.selectbox("select",list1)
        if st.button("search"):
            #input1 = TextBlob("Simple is better than complex")
            st.success(translation_text.translate(to=a))
    #Search Bar
    elif choice == "Search":
        st.title('Web Search')
        row_text= st.text_input("Search Anything")
        google = Google(license=None)
        if st.button("search"):
            for search_result in google.search(row_text):
                st.write(search_result.text)
                st.warning(search_result.url)
    elif choice == "Spell Correction":
        st.title('AI Spell Correction')
        text_data = st.text_area("Enter Text Here")
        a = TextBlob(text_data)
        if st.button("Correct"):
            st.success(a.correct())
        st.title('Pluralize & Singularize')
        text_data1 = st.text_input("Enter a word For pluralize / singularize")
        if st.checkbox("pluralize"):
            st.warning(pluralize(text_data1))
        if st.checkbox("singularize"):
            st.warning(singularize(text_data1))
        
        st.title('Comparative & Superlative')
        text2 = st.text_input("Enter Text For comparative & superlative")
        if st.checkbox("comparative"):
            st.success(comparative(text2))
        if st.checkbox("superlative"):
            st.success(superlative(text2))
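The app above relies on module-level imports and an nlp pipeline that the excerpt does not show. A minimal sketch of that setup, assuming spaCy's en_core_web_sm model, gensim's summarize, and that summary() is a helper defined elsewhere in the project:

import streamlit as st
import spacy
import spacy_streamlit
from textblob import TextBlob
from nltk.corpus import wordnet
from gensim.summarization import summarize     # assumes gensim < 4.0, which still ships summarization
from pattern.en import pluralize, singularize, comparative, superlative
from pattern.web import Google

# "en_core_web_sm" is an assumption; any installed spaCy pipeline with an NER component works.
nlp = spacy.load("en_core_web_sm")

if __name__ == "__main__":
    calc_main()   # launch with: streamlit run app.py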
Ejemplo n.º 37
0
def search():
	inputs = request.form
	age = inputs['age']
	sex = inputs['sex']
	race = inputs['race']
	# Every age group maps onto an age code, and every sex/race combination onto one
	# probability column, so the query can be built from two lookup tables instead of
	# one hand-written branch per combination.
	age_codes = {'1-4': '2a', '5-14': '3a', '15-24': '4a', '25-34': '5a', '35-44': '6a',
		'45-54': '7a', '55-64': '8a', '65-74': '9a', '75-84': '10a', '85+': '11a'}
	race_codes = {'Asian/Pacific Islander': 'asian', 'Black/African American': 'black',
		'Hispanic': 'latino', 'Native American/Alaskan': 'native', 'White': 'white'}
	column = 'p_%s_%s' % ('f' if sex == 'Female' else 'm', race_codes[race])
	sql_query = ("SELECT name,%s FROM probabilities WHERE Age='%s' and %s !=1 ORDER BY %s DESC;"
		% (column, age_codes[age], column, column))

	cursor.execute(sql_query)
	rows = cursor.fetchall()
	# Keep the name and probability of the five most likely results.
	new_list = [row[0] for row in rows[:5]]
	pvals = [row[1] for row in rows[:5]]
	padded_list = []

	# Pad the names with spaces so that they are all the same length.
	max_len = len(max(new_list, key=len))
	for name in new_list:
		padded_list.append(name + " " * (max_len - len(name)))

	# Use pattern.web to look up each of the top diseases on Google.

	engine=Google(license=key,throttle=0)

	empty_link_disease=[]
	empty_text_disease=[]

	empty_link_treatment=[]
	empty_text_treatment=[]


	for index in new_list:
		# Search Wikipedia first for a snippet about the disease; if there is no article, fall back to Google.
		about=Wikipedia().search(index)
		if not about:
			about=engine.search(index)
			empty_link_disease.append(about[0].url)
			empty_text_disease.append(plaintext(about[0].text))
		else:
			about_url=engine.search(index+' Wikipedia')
			empty_link_disease.append(about_url[0].url)
			empty_text_disease.append(plaintext(about.sections[0].content[0:1000])+'...')

		treatment=engine.search(index+' treatment site:www.healthtap.com')
		empty_link_treatment.append(treatment[0].url)
		empty_text_treatment.append(plaintext(treatment[0].text))
		
	disease_links = empty_link_disease[:5]
	treatment_links = empty_link_treatment[:5]
	#string_links=[str(list(empty_link[0])),str(list(empty_link[1])),str(list(empty_link[2])),str(list(empty_link[3])),str(list(empty_link[4]))]
	#print string_links
	return render_template('results_feb1.html',disease_name=padded_list,pvalues=pvals, d_links=disease_links, disease_snippets=empty_text_disease, t_links=treatment_links, treatment_snippets=empty_text_treatment)
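This view assumes a Flask app, a database cursor, a Custom Search key and the pattern.web helpers are created at module level, none of which the excerpt shows. A rough sketch of that surrounding setup; the database driver, file name and route are placeholders, not taken from the original app:

import sqlite3                                     # placeholder driver; the original backend is not shown
from flask import Flask, request, render_template
from pattern.web import Google, Wikipedia, plaintext

app = Flask(__name__)
connection = sqlite3.connect('probabilities.db')   # placeholder database file
cursor = connection.cursor()
key = None                                         # set to your Google Custom Search API key

# Expose the view defined above; the original app presumably does this with an @app.route decorator.
app.add_url_rule('/results', 'search', search, methods=['POST'])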
Ejemplo n.º 38
0
#!/usr/bin/python2
# -*- coding: utf-8 -*-
import codecs, os, sys
from collections import Counter
from optparse import OptionParser
from pattern.web import Google

engine = Google(license=None, language='de') #'001401664590345725200:dx8-iwqnvyw', language='de')

for result in engine.search('Energiewende', cached=False):
    print repr(result.text)
class SourceChecker(object):
    def __init__(self, text, max_queries=10, span=20, threshold=.8):
        self.max_queries = max_queries
        self.span = span
        self.threshold = threshold
        self.text = text
        self.cat_dict = defaultdict(list)
        self.engine = Google(license='AIzaSyCFgnXgb9rcwJspcSeXHo7QHvucgM2nLrI',
                             throttle=0.5,
                             language=None)

    def get_queries(self):
        text = self.text
        beg_quotes = re.findall(r'\"\S', text)
        for each in beg_quotes:
            text = text.replace(each, 'BEGQ' + each[-1])

        end_quotes = re.findall(r'\S\"', text)
        for each in end_quotes:
            text = text.replace(each, each[0] + 'ENDQ')

        text = re.sub('(ENDQ)+', 'ENDQ', text)
        text = re.sub('(BEGQ)+', 'BEGQ', text)
        text = text.replace('--', 'DOUBLEDASH')

        all_ngrams = ngrams(text, n=self.span, punctuation="", continuous=True)
        stop_words = stopwords.words('english')
        queries = []

        for ngram in all_ngrams:
            num_stop = len([w for w in ngram if w in stop_words])
            stop_score = float(num_stop) / len(ngram)

            chunked = ne_chunk(pos_tag(ngram))
            named_entities = [[w for w, t in elt] for elt in chunked
                              if isinstance(elt, nltk.Tree)]
            num_ent = sum([len(ent_list) for ent_list in named_entities])
            ent_score = float(num_ent) / len(ngram)

            if stop_score < self.threshold and ent_score < self.threshold:
                r_string = self.reconstruct_ngram(ngram)
                if r_string in self.text:
                    queries.append(r_string)

        # Thin the candidate queries down to at most max_queries evenly spaced entries.
        reduction = max(1, len(queries) // self.max_queries)
        return queries[0::reduction]

    def reconstruct_ngram(self, ngram):
        punc_b = ['!', '?', '.', ',', ';', ':', '\'', ')', ']', '}']
        punc_a = ['(', '[', '{', '$']
        ngram = ' '.join(ngram)
        for p in punc_b:
            ngram = ngram.replace(' ' + p, p)
        for p in punc_a:
            ngram = ngram.replace(p + ' ', p)
        ngram = re.sub('(^| )BEGQ', ' "', ngram)
        ngram = re.sub('ENDQ($| )', '" ', ngram)
        ngram = ngram.replace('DOUBLEDASH', '--')
        return ngram

    def load_domains(self):
        sources_path = pd('data', 'source_data.csv')
        domain_file = Datasheet.load(sources_path, headers=True)
        for row in domain_file:
            url = row[1]
            cats = row[2:]
            self.cat_dict[url] = cats

    def pairwise(self, t):
        it = iter(t)
        return izip(it, it)

    def get_urls(self, queries):
        domains = defaultdict(list)
        for q in queries:
            q = "\"" + q + "\""
            results = self.engine.search(q)

            for result in results:
                url = result.url
                domain = self.get_domain(url)
                domains[domain].append(q)
        return domains

    def get_domain(self, full_url):
        clean_reg = re.compile(r'^((?:https?:\/\/)?(?:www\.)?).*?(\/.*)?$')
        match = re.search(clean_reg, full_url)
        beg, end = match.group(1), match.group(2)
        domain = string.replace(full_url, beg, '')
        domain = string.replace(domain, end, '')
        return domain

    def render_output(self, domains):
        output = defaultdict(list)
        for d, v in domains.items():
            d_cats = [
                c for c in self.cat_dict[d]
                if len(c) > 0 and len(c.split(' ')) < 3
            ]
            overlap = float(len(v)) / self.max_queries
            if overlap <= 0.2:
                output['MINIMAL'].append((d, d_cats))
            elif 0.2 < overlap < 0.6:
                output['SOME'].append((d, d_cats))
            elif overlap >= 0.6:
                output['HIGH'].append((d, d_cats))
        degrees = ['HIGH', 'SOME', 'MINIMAL']
        print '\n'
        for deg in degrees:
            if output[deg]:
                print '%s OVERLAP: ' % deg
                for d, cats in sorted(output[deg]):
                    if cats:
                        print d + ': ' + ','.join(cats)
                    else:
                        print d
                print '\n'

    def render_graph(self, domains):
        g = Graph()
        for domain in domains.keys():
            # Fall back to neutral styling for domains missing from the category sheet.
            categories = self.cat_dict.get(domain, [])
            stroke = (0, 0, 0, 0.5)
            if categories:
                if 'right' in categories:
                    stroke = (255, 0, 0, 1)
                elif 'right_center' in categories:
                    stroke = (255, 0, 0, .5)
                if 'left' in categories:
                    stroke = (0, 0, 255, 1)
                elif 'left_center' in categories:
                    stroke = (0, 0, 255, .5)
                if 'least_biased' in categories:
                    stroke = (0, 255, 0, 1)

            fill = (128, 128, 0, 0.1)
            dub_cats = [
                'fake', 'questionable', 'clickbait', 'unreliable', 'conspiracy'
            ]
            score = len([c for c in categories if c in dub_cats])
            if score:
                fill = (0, 0, 0, float(score) / 5)
            g.add_node(domain,
                       radius=len(domains[domain]) * 6,
                       stroke=stroke,
                       strokewidth=6,
                       fill=fill,
                       font_size=30)

        pairs = self.pairwise(domains.keys())
        for x, y in pairs:
            x_queries = set(domains[x])
            y_queries = set(domains[y])
            intersection = len(x_queries.intersection(y_queries))
            if intersection > 0:
                max_rad = max(len(domains[x]), len(domains[y])) + 1000
                g.add_edge(x, y, length=max_rad, strokewidth=intersection)

        path = 'graph'
        g.export(path,
                 encoding='utf-8',
                 distance=6,
                 directed=False,
                 width=1400,
                 height=900)
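A minimal usage sketch for SourceChecker, assuming the module-level imports the class relies on (re, string, nltk, itertools.izip, collections.defaultdict, the pattern.web and pattern.graph helpers, and a path-joining helper pd) are present; the input file name is illustrative:

if __name__ == '__main__':
    raw_text = codecs.open('article.txt', encoding='utf-8').read()   # placeholder input file
    checker = SourceChecker(raw_text, max_queries=5, span=15)
    checker.load_domains()            # reads data/source_data.csv via the pd() helper
    queries = checker.get_queries()
    domains = checker.get_urls(queries)
    checker.render_output(domains)    # prints domains grouped by query overlap
    checker.render_graph(domains)     # exports a graph visualization to ./graph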
Ejemplo n.º 40
0
# Other search engines may also handle IMAGE, NEWS, ...

# Google's "Custom Search API" is a paid service.
# The pattern.web module uses a test account by default,
# with 100 free queries per day shared by all Pattern users.
# If this limit is exceeded, SearchEngineLimitError is raised.
# You should obtain your own license key at:
# https://code.google.com/apis/console/
# Activate "Custom Search API" under "Services" and get the key under "API Access".
# Then use Google(license=[YOUR_KEY]).search().
# This will give you 100 personal free queries, or $5 per 1000 queries.
engine = Google(license=None, language="en")

# Veale & Hao's method for finding similes using wildcards (*):
# http://afflatus.ucd.ie/Papers/LearningFigurative_CogSci07.pdf
# This will match results such as:
# - "as light as a feather",
# - "as cute as a cupcake",
# - "as drunk as a lord",
# - "as snug as a bug", etc.
q = "as * as a *"

# Google is very fast but you can only get up to 100 (10x10) results per query.
for i in range(1, 2):
    for result in engine.search(q, start=i, count=10, type=SEARCH, cached=True):
        # plaintext() removes all HTML formatting.
        print(plaintext(result.text).encode("utf-8"))
        print(result.url)
        print(result.date)
        print()
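To pull the actual simile pairs out of the snippets, one rough follow-up (not part of the original example, and it spends extra queries against the shared quota) is to match the "as X as a Y" frame in the plain text:

import re

simile = re.compile(r"\bas (\w+) as an? (\w+)", re.IGNORECASE)

for result in engine.search(q, start=1, count=10, type=SEARCH, cached=True):
    for adjective, noun in simile.findall(plaintext(result.text)):
        print("as %s as a %s" % (adjective, noun))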
Ejemplo n.º 41
0
def gnewshits(topic):
    engine = Google()
    results = engine.search(topic, type=NEWS)
    return results.total
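A brief usage note: results.total is the engine's estimated number of matching items, so the figure is approximate. The import line is what the snippet above assumes; the topic is arbitrary:

from pattern.web import Google, NEWS   # imports the function above assumes

if __name__ == '__main__':
    # Prints Google's estimated count of news results for an arbitrary topic.
    print(gnewshits('renewable energy'))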
Ejemplo n.º 42
0
while not asyn_req.done:
    time.sleep(0.1)
    print('searching...')

print(asyn_req.value)

print(find_urls(asyn_req.value, unique=True))

# ### Getting Search Engine Results with APIs

# #### Google

from pattern.web import Google

google = Google(license=None)
for search_result in google.search('artificial intelligence'):
    print(search_result.url)
    print(search_result.text)

# #### Twitter

from pattern.web import Twitter

twitter = Twitter()
index = None
for j in range(3):
    for tweet in twitter.search('artificial intelligence',
                                start=index,
                                count=3):
        print(tweet.text)
        index = tweet.id
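Because pages returned this way can overlap, a small, purely illustrative extension of the loop above collects the tweets while skipping ids that were already seen:

seen, collected = set(), []
index = None
for j in range(3):
    for tweet in twitter.search('artificial intelligence',
                                start=index,
                                count=3):
        if tweet.id not in seen:
            seen.add(tweet.id)
            collected.append(tweet.text)
        index = tweet.id

print('Collected %d unique tweets' % len(collected))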
Ejemplo n.º 43
0
class Engine(object):
    def __init__(self, provider, key=None):
        if provider.lower() == "bing":
            key = key or 'd6Mz4slIdgIxcKR4609FO+QKOFTEFFRB3i7j8VioPiE'
            self._engine = Bing(license=key)
        elif provider.lower() == "google":
            key = key or 'AIzaSyCAADAKnnkmDwIlLk_Q1p6foqI_ZMrgzcg'
            self._engine = Google(license=key)
        else:
            raise ValueError('Not a recognized provider.')

    def fuzzySearch(self, result, query):
        best, best_i = 0, None
        for i in range(len(result) - len(query) + 1):
            score = similarity(result[i:i + len(query)], query)
            if best < score:
                best = score
                best_i = i
        return result[best_i + len(query):] if best_i is not None else ''

    def getPatterns(self, query):
        cleaned = query.strip('?')
        p = [(cleaned, 3)]
        t = parsetree(query)[0]
        for chunk in t.chunks:
            if chunk.pos == 'NP':
                p.append((chunk.string, 2))
        for w in cleaned.split():
            p.append((w, 1))
        return p

    def getGrams(self, results):
        grams = {}
        for text, weight in results:
            uni = set(ngrams(text, n=1))
            bi = set(ngrams(text, n=2))
            tri = set(ngrams(text, n=3))
            for gram in uni:
                grams[gram] = grams.get(gram, 0) + weight
            for gram in bi:
                grams[gram] = grams.get(gram, 0) + weight
            for gram in tri:
                grams[gram] = grams.get(gram, 0) + weight
        return grams

    def removeStopWords(self, grams, queries):
        for gram in grams.keys():
            inter = set([g.lower() for g in gram]) & set(wordlist.STOPWORDS)
            if len(inter) > 1 or inter and len(gram) == 1:
                grams.pop(gram)
        return grams

    def searchQueries(self, queries):
        results = []
        for q, w1, d in queries:
            for r in self._engine.search(q, count=100):
                results.append((re.sub(r'[!,.?]', '', plaintext(r.text)), w1))
        return results

    def searchQueriesWithPatterns(self, queries):
        # Faster, but still need to refine extraction patterns
        results = []
        for q, w1, d in queries:
            print "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
            print q
            print "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
            for r in self._engine.search(q, count=50):
                # Each result is given a preliminary score based on the weight of the query
                # that retrieved it and the pattern that was matched to it
                for p, w2 in self.getPatterns(q):
                    if d == 'L':
                        m = re.search('(.*?)' + p + '.*\.', plaintext(r.text),
                                      re.IGNORECASE)
                    else:
                        m = re.search(p + '(.*)', plaintext(r.text),
                                      re.IGNORECASE)
                    if m:
                        print plaintext(r.text)
                        print "-------------------------------------------------"
                        print p, "generated", m.group(1)
                        print "================================================="
                        results.append((m.group(1), w1 + w2))
                        break
        return results

    def searchAndGram(self, queries):
        results = self.searchQueries(queries)
        grams = self.getGrams(results)
        grams = self.removeStopWords(grams, queries)
        return grams
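A minimal usage sketch for this Engine class, assuming the module-level imports it relies on (re, the pattern.web engines and plaintext, parsetree, ngrams, a stop-word list and a similarity helper) are present. Queries are (text, weight, direction) tuples; the direction flag 'L' or 'R' controls which side of a pattern match searchQueriesWithPatterns keeps. The example queries are illustrative:

if __name__ == "__main__":
    engine = Engine("google")                    # or Engine("bing")
    queries = [("the capital of France is", 3, 'R'),
               ("is the capital of France", 2, 'L')]
    grams = engine.searchAndGram(queries)
    # Rank the accumulated n-grams by weight and show the strongest candidates.
    for gram, weight in sorted(grams.items(), key=lambda kv: kv[1], reverse=True)[:10]:
        print " ".join(gram), weight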
Ejemplo n.º 44
0
# This example retrieves results from Google based on a given query.
# The Google search engine can handle SEARCH type searches.
# Other search engines may also handle IMAGE, NEWS, ...

# Google's "Custom Search API" is a paid service.
# The pattern.web module uses a test account by default,
# with 100 free queries per day shared by all Pattern users.
# If this limit is exceeded, SearchEngineLimitError is raised.
# You should obtain your own license key at: 
# https://code.google.com/apis/console/
# Activate "Custom Search API" under "Services" and get the key under "API Access".
# Then use Google(license=[YOUR_KEY]).search().
# This will give you 100 personal free queries, or $5 per 1000 queries.
engine = Google(license=None, language="en")

# Veale & Hao's method for finding similes using wildcards (*):
# http://afflatus.ucd.ie/Papers/LearningFigurative_CogSci07.pdf
# This will match results such as:
# - "as light as a feather",
# - "as cute as a cupcake",
# - "as drunk as a lord",
# - "as snug as a bug", etc.
q = "as * as a *"

# Google is very fast but you can only get up to 100 (10x10) results per query.
for i in range(1, 2):
    for result in engine.search(q, start=i, count=10, type=SEARCH, cached=True):
        print plaintext(result.text) # plaintext() removes all HTML formatting.
        print result.url
        print result.date
        print
Ejemplo n.º 45
0
#!/usr/bin/python2
# -*- coding: utf-8 -*-
import codecs, os, sys
from collections import Counter
from optparse import OptionParser
from pattern.web import Google

engine = Google(
    license=None,
    language='de')  #'001401664590345725200:dx8-iwqnvyw', language='de')

for result in engine.search('Energiewende', cached=False):
    print repr(result.text)