Example #1
0
 def run(self,q):# q is the query
     """Return the result URLs Bing reports for query `q`.

     Ten searches are issued, with `start` running from 1 to 10, and the
     `url` of every result is collected in the order received.
     """
     bing = Bing(license=None) # Enter your license key.
     collected = []
     for page in range(1, 11):
         collected.extend(hit.url for hit in bing.search(q, type=SEARCH, start=page))
     return collected
    def get_urls(self, q = "", n = 1, limit = 1):
        """Return at most `limit` result URLs for query `q`.

        Runs `n` uncached Bing searches (`start` = 1..n, ten results
        requested each) and truncates the combined URL list to `limit`.
        """
        # Python 2 idiom: reload(sys) restores sys.setdefaultencoding() so the
        # process-wide default encoding can be switched.
        reload(sys)
        sys.setdefaultencoding(GOOGLE_API_ENCODING)
        # The engine is Bing, even though the encoding constant is Google-named.
        bing = Bing(license=BING_API_KEY, language=BING_API_LANG)
        found = []
        for page in range(1, n + 1):
            hits = bing.search(q, start=page, count=10, type=SEARCH, cached=False)
            found.extend(hit.url for hit in hits)

        return found[:limit]
 def get_bing_entries(self, search, nb):
     """Build one annotated `Input` per Bing result for `search`.

     Requests `nb` uncached results starting at 1, using the language that
     `self.dico_lang` maps `self.language` to. The first segment of each
     `Input` is annotated with the source, title, url and search string.
     """
     engine = Bing(language=self.dico_lang[self.language])
     collected = []
     for hit in engine.search(search, start=1, count=nb, cached=False):
         item = Input(hit.text)
         item.segments[0].annotations.update({
             'source' : 'Bing',
             'title': hit.title,
             'url': hit.url,
             'search' : search,
         })
         collected.append(item)
     return collected
Example #4
0
def generar_consulta_bing(q):
    """Run every query in `q` against Bing and return all result URLs.

    Each query is dispatched through asynchronous() (start=1, count=20,
    timeout=10) and polled until it is done; an error reported by the
    request is re-raised. The URLs come back as one flat list, query by query.
    """
    # Python 2 idiom for switching the process-wide default encoding.
    reload(sys)
    sys.setdefaultencoding('utf8')

    # NOTE(review): the license key is hard-coded in source.
    engine = Bing(license="TNMHm68dvf440pSPdnU+2LqxeQi7J2xszPZLBiPYsmI",
                  throttle=0.5,
                  language=None)
    urls = []
    for query in q:
        pending = asynchronous(engine.search, query,
                               start=1, count=20, type=SEARCH, timeout=10)
        # Poll in 10 ms steps until the background search has finished.
        while not pending.done:
            time.sleep(0.01)

        # An error occurred inside engine.search(): propagate it.
        if pending.error:
            raise pending.error

        urls.extend(hit.url for hit in pending.value)

    return urls
Example #5
0
 def get_bing_entries(self, search, nb):
     """Return a list of annotated `Input` objects, one per Bing result.

     `nb` uncached results are requested for `search`, in the language that
     `self.dico_lang` maps `self.language` to. Element 0 of each `Input`
     receives the source, title, url and search string as annotations.
     """
     engine = Bing(language=self.dico_lang[self.language])
     collected = []
     for hit in engine.search(search, start=1, count=nb, cached=False):
         item = Input(hit.text)
         first = item[0]
         first.annotations.update({
             'source': 'Bing',
             'title': hit.title,
             'url': hit.url,
             'search': search,
         })
         # Write the updated segment back into the Input.
         item[0] = first
         collected.append(item)
     return collected
Example #6
0
 def __init__(self, provider, key=None):
     """Create the search engine for `provider` ("bing" or "google", any case).

     `key` is the license key; a built-in default key is used when it is
     falsy. Raises ValueError for any other provider name.
     """
     name = provider.lower()
     if name not in ("bing", "google"):
         raise ValueError('Not a recognized provider.')
     # NOTE(review): the fallback keys below are hard-coded in source.
     if name == "bing":
         self._engine = Bing(license=key or 'd6Mz4slIdgIxcKR4609FO+QKOFTEFFRB3i7j8VioPiE')
     else:
         self._engine = Google(license=key or 'AIzaSyCAADAKnnkmDwIlLk_Q1p6foqI_ZMrgzcg')
Example #7
0
def bingcorpsearch(word,concfilter = '', extraquery='',license=None, start=1, count=50):
    """Searches the web for sentences containing a certain keyword, and possibly a co-occurrence word.

       Generator yielding (leftcontext, word, rightcontext, url) tuples.
       First queries Bing, then downloads each distinct result page and scans
       its plain text for sentences (ended by '.', '?', '!' or a newline) that
       contain `word` and, when `concfilter` is given, that word as well.
       Matching is case-insensitive. Pages that fail to download are skipped.
       Uses 'pattern' (CLiPS, Antwerpen University)

       word         -- keyword to look for
       concfilter   -- optional co-occurrence word required in the same sentence
       extraquery   -- optional extra terms appended to the search query
       license      -- license key handed to the Bing engine
       start, count -- passed through to engine.search()
       """
    query = word
    if concfilter:
        query += ' ' + concfilter
    if extraquery:
        query += ' ' + extraquery

    engine = Bing(license=license)

    processed = set()
    sentence_end = ('.', '?', '!', '\n')
    word_end = (' ', '?', '!', '\n')
    target = word.lower()
    cofilter = concfilter.lower() if concfilter else ''

    for result in engine.search(query, start=start,count=count):
        if result.url in processed:
            continue
        processed.add(result.url)
        try:
            content = plaintext(result.download())
        except Exception:
            # Best effort: skip pages that cannot be fetched or converted.
            continue

        begin = 0
        wordindex = None
        wordlength = 0
        concindex = None
        # Start at 0 so a keyword at the very beginning of the page is seen.
        for i in range(len(content)):
            if content[i] in sentence_end:
                # Explicit None checks: comparing None with an int only
                # works (as False) on Python 2.
                has_word = wordindex is not None and wordindex >= begin
                has_conc = (not concfilter) or (concindex is not None and concindex >= begin)
                if has_word and has_conc:
                    left = content[begin:wordindex].strip()
                    right = content[wordindex+wordlength:i+1]
                    if len(left) > 5 or len(right.strip()) > 5:
                        yield (left, content[wordindex:wordindex+wordlength].strip(), right, result.url)
                wordindex = concindex = None
                begin = i + 1
            if len(word)+i <= len(content) and content[i:i+len(word)].lower() == target:
                wordindex = i
                wordlength = len(word)
                # Extend the match to the end of the token; stop at the end
                # of the content instead of iterating past it.
                for j in range(len(word), len(content) - i):
                    if content[i+j] in word_end:
                        wordlength = j
                        break
            # The slice must start AND end relative to i; slicing up to
            # len(concfilter) only ever matched near the start of the page.
            if concfilter and content[i:i+len(concfilter)].lower() == cofilter:
                concindex = i
Example #8
0
def novelty(word):
    
    """ Returns the novelty of the given word as a value 0.0-1.0 (1.0 = 100% novel).
        The score is based on the number of search results that mention the word:
        the count is capped at 100 and mapped linearly onto 1.0 (no hits) - 0.0.
    """
    
    search_engine = Bing() # Google(license="...")
    
    # Number of search results that mention the given word.
    # http://www.clips.ua.ac.be/pages/pattern-web#services
    # Note: cached=False would be needed to get the most up-to-date count.
    hits = search_engine.search(word, cached=True).total
    
    # Raw counts are not relative (0.0-1.0) by themselves.
    # Some raw numbers noted for reference:
    
    # - "and"                      : 1730000000
    # - "new york"                 : 94700000
    # - "tree"                     : 78200000
    # - "justin bieber"            : 7680000
    # - "computational creativity" : 5330000
    # - "zombification"            : 126000
    # - "zombification machine"    : 37000
    # - "zombology"                : 11100
    # - "zombeliever"              : 11
    # - "zombriefing"              : 0
    # - "zombifractor"             : 0
    
    # Common words get thousands of mentions or more, invented words
    # a few dozen at most, so the count is cut off at 100
    # (= anything mentioned 100x times on the net is not novel).
    capped = min(hits, 100)
    
    # Relativize: 0 hits -> 1.0, 100 or more hits -> 0.0.
    return 1.0 - capped * 0.01
Example #9
0
def novelty(word):
    """ Returns the novelty of the given word as a value 0.0-1.0 (1.0 = 100% novel).
        Novelty is derived from the search result count for the word,
        capped at 100 and scaled linearly so that more hits means less novel.
    """

    bing = Bing()  # Google(license="...")

    # How many search results mention the given word?
    # http://www.clips.ua.ac.be/pages/pattern-web#services
    # Note: a cached count is used; cached=False gives the most recent one.
    mentions = bing.search(word, cached=True).total

    # The raw count is absolute, not a 0.0-1.0 value.
    # Raw numbers noted for a few queries:

    # - "and"                      : 1730000000
    # - "new york"                 : 94700000
    # - "tree"                     : 78200000
    # - "justin bieber"            : 7680000
    # - "computational creativity" : 5330000
    # - "zombification"            : 126000
    # - "zombification machine"    : 37000
    # - "zombology"                : 11100
    # - "zombeliever"              : 11
    # - "zombriefing"              : 0
    # - "zombifractor"             : 0

    # Existing words score far above 100 while invented words stay below it,
    # so everything above 100 is treated alike
    # (= anything mentioned 100x times on the net is not novel).
    mentions = min(mentions, 100)

    # Turn the capped count into a relative value.
    score = 1.0 - mentions * 0.01
    return score
Example #10
0
def get_info(search_query):
	"""Collect result texts for `search_query` from Google and Twitter.

	Returns a dict with two keys: "Error" (None on success, otherwise a
	message) and "Result" (the list of result texts, or [None] on error).
	Anything that is not a `str` is rejected with an error dict. For a
	string, four searches of 10 results each (start=1..4) are issued per
	engine, Google first and Twitter second.
	"""
	if not isinstance(search_query, str):
		return { "Error": "Pass a string, from mine.py [7]", "Result": [None] }

	# Only these two engines are queried; engines that were built but never
	# used have been dropped.
	engineGoogle = Google(license=None, throttle=0.5, language=None)
	engineTwitter = Twitter(license=None, throttle=0.5, language=None)

	result = []
	for engine in (engineGoogle, engineTwitter):
		for i in range(1, 5):
			# extend() appends in place instead of rebuilding the list each page.
			result.extend(para.text for para in engine.search(search_query, type=SEARCH, start=i, count=10))

	return { "Error": None, "Result": result }
Example #11
0
from pattern.web import Bing, asynchronous, plaintext
from pattern.web import SEARCH, IMAGE, NEWS

import time

# This example retrieves results from Bing based on a given query.
# The Bing search engine can retrieve up to 1000 results (10x100) for a query.

# Bing's "Custom Search API" is a paid service.
# The pattern.web module uses a test account by default,
# with 5000 free queries per month shared by all Pattern users.
# If this limit is exceeded, SearchEngineLimitError is raised.
# You should obtain your own license key at:
# https://datamarket.azure.com/account/
# No key is passed here (license=None).
engine = Bing(license=None, language="en")

# Quote a query to match it exactly:
q = "\"is more important than\""

# When you execute a query,
# the script will halt until all results are downloaded.
# In apps with an infinite main loop (e.g., GUI, game),
# it is often more useful if the app keeps on running
# while the search is executed in the background.
# This can be achieved with the asynchronous() function.
# It takes any function plus that function's arguments and keyword arguments;
# here it wraps engine.search() for the first 100 results.
request = asynchronous(engine.search, q, start=1, count=100, type=SEARCH, timeout=10)

# This while-loop simulates an infinite application loop.
# In real-life you would have an app.update() or similar
Example #12
0
from pattern.web import Bing, IMAGE
import urllib

engine = Bing()

# Download every image result for the query into images/, naming each file
# after the result's position in the result list.
for counter, result in enumerate(engine.search('meme', type=IMAGE)):
    try:
        urllib.urlretrieve(result.url, "images/%s.jpg" % counter)
    except Exception:
        # Best effort: a failed download is skipped. Catching Exception
        # instead of using a bare except lets Ctrl-C stop the script.
        continue
Example #13
0
import os, sys
sys.path.insert(0, os.path.join("..", ".."))

from pattern.web import Bing, asynchronous, plaintext
from pattern.web import SEARCH, IMAGE, NEWS

import time

# This example retrieves results from Bing based on a given query.
# NOTE(review): the remark below originally named Yahoo although the engine
# used here is Bing; the figure of up to 1000 results (10x100) per query
# should be confirmed for Bing.

# You should obtain your own license key.
# NOTE(review): the URL given here points at Yahoo's developer site and is
# probably stale for Bing -- confirm where a Bing key is obtained:
# https://developer.apps.yahoo.com/wsregapp/
# Otherwise you will be sharing the default license with all users of this module.
engine = Bing(license=None)

# Quote a query to match it exactly:
q = "\"is more important than\""

# When you execute a query, the script will halt until all results are downloaded.
# In applications with an event loop (e.g. a GUI or an interactive animation)
# it is more useful if the app keeps on running while the search is executed in the background.
# This can be achieved with the asynchronous() command.
# It takes any function and the function's arguments and keyword arguments:
request = asynchronous(engine.search,
                       q,
                       start=1,
                       count=100,
                       type=SEARCH,
                       timeout=10)
Example #14
0
from pattern.db import date, time, NOW
from pattern.web import Bing, NEWS

# It is often useful to keep a date stamp for each row in the table.
# The pattern.db module's date() function can be used for this.
# It is a simple wrapper around Python's datetime.datetime class,
# with extra functionality to make it easy to parse or print it as a string.

# date() is called with NOW, with no arguments, with a string plus the
# format to parse it with, and with an extra format= keyword
# (presumably the output format -- confirm against the pattern.db docs).
print date(NOW)
print date()
print date("2010-11-01 16:30", "%Y-%m-%d %H:%M")
print date("Nov 1, 2010", "%b %d, %Y")
print date("Nov 1, 2010", "%b %d, %Y", format="%d/%m/%Y")
print

# All possible formatting options:
# http://docs.python.org/library/time.html#time.strftime

# Print title, raw date string and parsed date of each Bing news result.
for r in Bing(license=None, language="en").search("today", type=NEWS):
    print r.title
    print repr(
        r.date
    )  # Result.date is a string (e.g. we can't use >, <= or += on it).
    print date(r.date)  # date() can parse any Result.date in the web module.
    print

# A parsed date supports arithmetic: add an offset built with pattern.db's time().
d = date("4 november 2011")
d += time(days=2, hours=5)
print d
Example #15
0
# "X IS MORE IMPORTANT THAN Y"
# Here is a rough example of how to build a web miner.
# It mines comparative statements from Bing and stores the results in a table,
# which can be saved as a text file for further processing later on.

# Pattern matching also works with Sentence objects from the MBSP module.
# MBSP's parser is much more robust (but also slower).
#from MBSP import Sentence, parse

q = '"more important than"'          # Bing search query
p = "NP (VP) more important than NP" # Search pattern.
p = Pattern.fromstring(p)
d = Datasheet()

engine = Bing(license=None)
# Only the first batch of 100 results is fetched; raise the range to get more.
for i in range(1): # max=10
    for result in engine.search(q, start=i+1, count=100, cached=True):
        # Strip markup from the description, then parse it into a Sentence
        # that the search pattern can be matched against.
        s = result.description
        s = plaintext(s)
        s = Sentence(parse(s))
        for m in p.search(s):
            a = m.constituents(constraint=0)[-1] # Left NP.
            b = m.constituents(constraint=5)[ 0] # Right NP.
            # Store the lowercased (left, right) pair as one row.
            d.append((
                a.string.lower(), 
                b.string.lower()))

pprint(d)

print
Example #16
0
def google_search(match,targetfile):
    engine = Google(license=None)
    for i in range(1,10):
        for result in engine.search(match, type=SEARCH, start=i):
              print plaintext(result.description)
              targetfile.write(plaintext(result.description))
              targetfile.write('\n')

def bing_search(match):
    engine = Bing
    for i in range(1,10):
        for result in engine.search(match, type=SEARCH, start=i):
              print plaintext(result.description)

# Demo: print the plain-text description of the Bing results for 'holy',
# running fourteen searches (start=1..14), with a blank line after each one.
engine = Bing() # Enter your license key.
for i in range(1,15):
    for result in engine.search('holy', type=SEARCH, start=i):
        print plaintext(result.description)
        print
#google_search(u'شیر مادر', milkfile)
#google_search(u'شیر وحشی', lionfile)
#google_search(u'شیر آب', tapfile)
##article =  engine.search(match)
#print article.title
#for link in  article.links:
#    print link
#    #subarticle = engine.search(link)
#    url = URL(link)
#    result = Result(url)
#    print result.download()
# Example of pattern: http://www.clips.ua.ac.be/pages/pattern

from pattern.web import Bing, plaintext
from pattern.en import Sentence, Chunk, parse
from pattern.search import Pattern
from pattern.graph import Graph, Node, Edge, export

# Build a graph of "X more important than Y" relations mined from Bing.
g = Graph()
for i in range(1):
    print "--------------", i
    for r in Bing().search('"more important than"', start=i + 1, count=50):
        # Lowercase and strip markup from the description, then parse it.
        s = plaintext(r.description.lower())
        print s
        s = Sentence(parse(s))
        print s
        p = Pattern.fromstring('NP (VP) more important than NP')
        for m in p.search(s):
            a = m.constituents(p[+0])[-1]  # Left NP.
            b = m.constituents(p[-1])[+0]  # Right NP.
            # For a Chunk use its head, otherwise the item itself.
            a = (isinstance(a, Chunk) and a.head or a).string
            b = (isinstance(b, Chunk) and b.head or b).string
            if a and b:
                # Add each term as a node once, then link right NP -> left NP.
                if a not in g:
                    g.add_node(a, radius=5, stroke=(0, 0, 0, 0.8))
                if b not in g:
                    g.add_node(b, radius=5, stroke=(0, 0, 0, 0.8))
                g.add_edge(g[b], g[a], stroke=(0, 0, 0, 0.6))

g = g.split()[0]  # Largest subgraph.

for n in g.sorted()[:40]:  # Sorted by Node.weight.
Example #18
0
def bingcorpsearch(word,
                   concfilter='',
                   extraquery='',
                   license=None,
                   start=1,
                   count=50):
    """Searches the web for sentences containing a certain keyword, and possibly a co-occurrence word.

       Generator yielding (leftcontext, word, rightcontext, url) tuples.
       First queries Bing, then downloads each distinct result page and scans
       its plain text for sentences (ended by '.', '?', '!' or a newline) that
       contain `word` and, when `concfilter` is given, that word as well.
       Matching is case-insensitive. Pages that fail to download are skipped.
       Uses 'pattern' (CLiPS, Antwerpen University)

       word         -- keyword to look for
       concfilter   -- optional co-occurrence word required in the same sentence
       extraquery   -- optional extra terms appended to the search query
       license      -- license key handed to the Bing engine
       start, count -- passed through to engine.search()
       """
    query = word
    if concfilter:
        query += ' ' + concfilter
    if extraquery:
        query += ' ' + extraquery

    engine = Bing(license=license)

    processed = set()
    sentence_end = ('.', '?', '!', '\n')
    word_end = (' ', '?', '!', '\n')
    target = word.lower()
    cofilter = concfilter.lower() if concfilter else ''

    for result in engine.search(query, start=start, count=count):
        if result.url in processed:
            continue
        processed.add(result.url)
        try:
            content = plaintext(result.download())
        except Exception:
            # Best effort: skip pages that cannot be fetched or converted.
            continue

        begin = 0
        wordindex = None
        wordlength = 0
        concindex = None
        # Start at 0 so a keyword at the very beginning of the page is seen.
        for i in range(len(content)):
            if content[i] in sentence_end:
                # Explicit None checks: comparing None with an int only
                # works (as False) on Python 2.
                has_word = wordindex is not None and wordindex >= begin
                has_conc = (not concfilter) or (
                    concindex is not None and concindex >= begin)
                if has_word and has_conc:
                    left = content[begin:wordindex].strip()
                    right = content[wordindex + wordlength:i + 1]
                    if len(left) > 5 or len(right.strip()) > 5:
                        yield (left,
                               content[wordindex:wordindex +
                                       wordlength].strip(),
                               right,
                               result.url)
                wordindex = concindex = None
                begin = i + 1
            if (len(word) + i <= len(content)
                    and content[i:i + len(word)].lower() == target):
                wordindex = i
                wordlength = len(word)
                # Extend the match to the end of the token; stop at the end
                # of the content instead of iterating past it.
                for j in range(len(word), len(content) - i):
                    if content[i + j] in word_end:
                        wordlength = j
                        break
            # The slice must start AND end relative to i; slicing up to
            # len(concfilter) only ever matched near the start of the page.
            if concfilter and content[i:i + len(concfilter)].lower() == cofilter:
                concindex = i
Example #19
0
    engine = Google(license=None)
    for i in range(1, 10):
        for result in engine.search(match, type=SEARCH, start=i):
            print plaintext(result.description)
            targetfile.write(plaintext(result.description))
            targetfile.write('\n')


def bing_search(match):
    engine = Bing
    for i in range(1, 10):
        for result in engine.search(match, type=SEARCH, start=i):
            print plaintext(result.description)


# Demo: print the plain-text description of the Bing results for 'holy',
# running fourteen searches (start=1..14), with a blank line after each one.
engine = Bing()  # Enter your license key.
for i in range(1, 15):
    for result in engine.search('holy', type=SEARCH, start=i):
        print plaintext(result.description)
        print
#google_search(u'شیر مادر', milkfile)
#google_search(u'شیر وحشی', lionfile)
#google_search(u'شیر آب', tapfile)
##article =  engine.search(match)
#print article.title
#for link in  article.links:
#    print link
#    #subarticle = engine.search(link)
#    url = URL(link)
#    result = Result(url)
#    print result.download()
Example #20
0
import sys, random
import machine
from pattern.en import tag
from pattern.web import Bing, SEARCH, IMAGE, URL, extension


def save_image(url, figure):
    """Download `url` and store it in illustrations/ under the name `figure`.

    The file extension is derived from the URL's page name via extension().
    The output file is closed even when the download or the write raises.
    """
    url = URL(url)
    path = 'illustrations/' + figure + extension(url.page)
    # The with-block guarantees the handle is released on any exception.
    with open(path, 'wb') as f:
        f.write(url.download())


# Read the complete input text from standard input and hand it to
# machine.Invention.
text = sys.stdin.read()
invention = machine.Invention(text)
# No license key is passed to the engine (license=None).
engine = Bing(license=None)

#the following searches for patent illustrations on bing, using a generated noun from each description of the illustration
#search_base = "patent illustration "
#for i, illustration in enumerate(invention.unformatted_illustrations):
#nouns = [word for word, pos in tag(illustration) if pos == 'NN']
#if len(nouns) > 0:
#search_string = search_base + random.choice(nouns)#' '.join(nouns)
#print "searching for: " + search_string
#for j, result in enumerate(engine.search(search_string, type=IMAGE, count=5)):
#print "saving: " + result.url
#try:
#save_image(result.url, "fig_" + str(i+1) + "_" + str(j+1))
#except:
#next
import sys, random
import machine
from pattern.en import tag
from pattern.web import Bing, SEARCH, IMAGE, URL, extension

def save_image(url, figure):
    """Download `url` and store it in illustrations/ under the name `figure`.

    The file extension is derived from the URL's page name via extension().
    The output file is closed even when the download or the write raises.
    """
    url = URL(url)
    path = 'illustrations/' + figure + extension(url.page)
    # The with-block guarantees the handle is released on any exception.
    with open(path, 'wb') as f:
        f.write(url.download())


# Read the complete input text from standard input and hand it to
# machine.Invention.
text = sys.stdin.read()
invention = machine.Invention(text)
# No license key is passed to the engine (license=None).
engine = Bing(license=None)

#the following searches for patent illustrations on bing, using a generated noun from each description of the illustration
#search_base = "patent illustration "
#for i, illustration in enumerate(invention.unformatted_illustrations):
    #nouns = [word for word, pos in tag(illustration) if pos == 'NN']
    #if len(nouns) > 0:
        #search_string = search_base + random.choice(nouns)#' '.join(nouns) 
        #print "searching for: " + search_string
        #for j, result in enumerate(engine.search(search_string, type=IMAGE, count=5)):
            #print "saving: " + result.url
            #try:
                #save_image(result.url, "fig_" + str(i+1) + "_" + str(j+1))
            #except:
                #next

# the following searches for "fig N patent illustration"