def run(self, q):
    """Collect result URLs for a query from Bing result pages 1 through 10.

    q is the query. Returns a list with the ``url`` of every result, in the
    order the engine yields them, page after page.
    """
    bing = Bing(license=None)  # Enter your license key.
    return [
        hit.url
        for page in range(1, 11)
        for hit in bing.search(q, type=SEARCH, start=page)
    ]
def get_urls(self, q="", n=1, limit=1):
    """Return at most ``limit`` result URLs for query ``q``.

    Result pages 1..n are requested (10 results per page, cache disabled)
    and their URLs are concatenated in order; the list is then cut with
    ``[:limit]``.
    """
    # Python 2 only: reload sys, then set the process-wide default encoding.
    reload(sys)
    sys.setdefaultencoding(GOOGLE_API_ENCODING)

    # The engine is Bing (configured from the BING_* settings), even though
    # the encoding constant above carries a GOOGLE_ prefix.
    bing = Bing(license=BING_API_KEY, language=BING_API_LANG)

    found = []
    for page in range(1, n + 1):
        hits = bing.search(q, start=page, count=10, type=SEARCH, cached=False)
        found.extend(hit.url for hit in hits)
    return found[:limit]
def get_bing_entries(self, search, nb):
    """Build one annotated ``Input`` per Bing result for ``search``.

    ``nb`` results are requested starting at page 1 (cache disabled), in
    the language looked up as ``self.dico_lang[self.language]``. The first
    segment of each ``Input`` is annotated with the source ('Bing'), the
    result title, the result URL and the search string.
    Returns the list of ``Input`` objects.
    """
    engine = Bing(language=self.dico_lang[self.language])
    collected = []
    hits = engine.search(search, start=1, count=nb, cached=False)
    for hit in hits:
        item = Input(hit.text)
        item.segments[0].annotations.update({
            'source': 'Bing',
            'title': hit.title,
            'url': hit.url,
            'search': search,
        })
        collected.append(item)
    return collected
def generar_consulta_bing(q):
    """Return the Bing result URLs for every query in ``q``.

    ``q`` is an iterable of query strings. Each query is run through
    ``asynchronous()`` (page 1, 20 results, 10 s timeout) and polled until
    it is done. If the request recorded an error, that error is raised;
    otherwise the URLs of its results are appended to the output list.
    """
    # Python 2 only: reload sys, then set the process-wide default encoding.
    reload(sys)
    sys.setdefaultencoding('utf8')

    # NOTE(review): the license key is hard-coded in source; consider moving
    # it to configuration.
    motor = Bing(license="TNMHm68dvf440pSPdnU+2LqxeQi7J2xszPZLBiPYsmI",
                 throttle=0.5,
                 language=None)

    urls = []
    for consulta in q:
        pending = asynchronous(motor.search, consulta,
                               start=1, count=20, type=SEARCH, timeout=10)
        # Poll until the background search has finished.
        while True:
            if pending.done:
                break
            time.sleep(0.01)
        # An error occurred in engine.search(): raise it.
        if pending.error:
            raise pending.error
        # Retrieve the list of search results.
        urls.extend(hit.url for hit in pending.value)
    return urls
def get_bing_entries(self, search, nb):
    """Build one annotated ``Input`` per Bing result for ``search``.

    ``nb`` results are requested starting at page 1 (cache disabled), in
    the language looked up as ``self.dico_lang[self.language]``. Element 0
    of each ``Input`` is fetched, annotated with source / title / url /
    search, and assigned back before the ``Input`` is collected.
    Returns the list of ``Input`` objects.
    """
    engine = Bing(language=self.dico_lang[self.language])
    collected = []
    for hit in engine.search(search, start=1, count=nb, cached=False):
        item = Input(hit.text)
        metadata = dict(
            source='Bing',
            title=hit.title,
            url=hit.url,
            search=search,
        )
        first = item[0]
        first.annotations.update(metadata)
        # The updated segment is explicitly stored back at index 0.
        item[0] = first
        collected.append(item)
    return collected
def __init__(self, provider, key=None):
    """Create the search engine for ``provider`` and store it in ``_engine``.

    ``provider`` is compared case-insensitively against "bing" and
    "google"; any other value raises ``ValueError``. When ``key`` is falsy
    a built-in default license key for that provider is used.
    """
    name = provider.lower()
    if name not in ("bing", "google"):
        raise ValueError('Not a recognized provider.')

    # NOTE(review): the default license keys are hard-coded in source;
    # consider moving them to configuration.
    if name == "bing":
        self._engine = Bing(
            license=key or 'd6Mz4slIdgIxcKR4609FO+QKOFTEFFRB3i7j8VioPiE')
    else:
        self._engine = Google(
            license=key or 'AIzaSyCAADAKnnkmDwIlLk_Q1p6foqI_ZMrgzcg')
def bingcorpsearch(word, concfilter='', extraquery='', license=None, start=1, count=50):
    """Search the web for sentences containing a keyword (concordance search).

    Queries Bing, downloads each distinct result page once, converts it to
    plain text and scans it sentence by sentence. A sentence ends at
    '.', '?', '!' or a newline.
    Uses 'pattern' (CLiPS, Antwerpen University).

    Args:
        word: keyword to look for (matched case-insensitively).
        concfilter: optional co-occurrence word; when given, only sentences
            that also contain it are yielded.
        extraquery: extra text appended to the search query.
        license: license key passed to the Bing engine.
        start: ``start`` argument passed to the engine's search.
        count: ``count`` argument passed to the engine's search.

    Yields:
        (leftcontext, word, rightcontext, url) tuples. ``word`` is the
        matched token as found in the page; a sentence is only yielded when
        the left or the right context is longer than 5 characters.
    """
    query = word if not concfilter else word + ' ' + concfilter
    if extraquery:
        query += ' ' + extraquery

    engine = Bing(license=license)
    word_lower = word.lower()
    conc_lower = concfilter.lower()
    processed = set()

    for result in engine.search(query, start=start, count=count):
        if result.url in processed:
            continue
        processed.add(result.url)
        try:
            content = plaintext(result.download())
        except Exception:
            # Best effort: skip pages that cannot be downloaded or converted.
            continue

        size = len(content)
        begin = 0          # index where the current sentence starts
        wordindex = None   # index of the keyword in the current sentence
        wordlength = 0     # length of the matched token
        concindex = None   # index of the co-occurrence word, if seen
        # Start at 0 so a keyword at the very beginning of the page is found.
        for i in range(size):
            if content[i] in '.?!\n':
                # Explicit None checks: comparing None with an int only
                # works by accident on Python 2 and raises on Python 3.
                has_word = wordindex is not None
                has_conc = (not concfilter) or concindex is not None
                if has_word and has_conc:
                    left = content[begin:wordindex]
                    right = content[wordindex + wordlength:i + 1]
                    if len(left.strip()) > 5 or len(right.strip()) > 5:
                        yield (left.strip(),
                               content[wordindex:wordindex + wordlength].strip(),
                               right,
                               result.url)
                wordindex = concindex = None
                begin = i + 1

            if len(word) + i <= size and content[i:i + len(word)].lower() == word_lower:
                wordindex = i
                wordlength = len(word)
                # Extend the match to the end of the token (next space,
                # '?', '!' or newline), looking only at the remaining text.
                for j in range(len(word), size - i):
                    if content[i + j] in ' ?!\n':
                        wordlength = j
                        break

            # The slice must start AND end relative to i; slicing up to
            # len(concfilter) could never match, so filtered searches
            # yielded nothing.
            if concfilter and content[i:i + len(concfilter)].lower() == conc_lower:
                concindex = i
def novelty(word):
    """ Returns the novelty of the given word as a value 0.0-1.0 (1.0 = 100% novel).
    """
    # Ask the engine how many search results mention the given word.
    # http://www.clips.ua.ac.be/pages/pattern-web#services
    # (Bing is used here; Google(license="...") is the alternative.)
    # Note: cached=False should be used to get the most up-to-date count.
    hits = Bing().search(word, cached=True).total

    # It would be nice if this number was relative (0.0-1.0),
    # then novelty could be expressed as a percentage
    # based on the number of existing web pages that mention the word.
    # Here are some raw numbers:
    # - "and"                      : 1730000000
    # - "new york"                 : 94700000
    # - "tree"                     : 78200000
    # - "justin bieber"            : 7680000
    # - "computational creativity" : 5330000
    # - "zombification"            : 126000
    # - "zombification machine"    : 37000
    # - "zombology"                : 11100
    # - "zombeliever"              : 11
    # - "zombriefing"              : 0
    # - "zombifractor"             : 0
    # So common words are mentioned thousands of times,
    # while invented words are mentioned dozens of times.
    # The result count is cut off above 100
    # (= anything mentioned 100x on the net is not novel) ...
    capped = min(hits, 100)
    # ... and then made relative.
    return 1.0 - capped * 0.01
def get_info(search_query):
    """Collect result texts for ``search_query`` from Google and Twitter.

    Args:
        search_query: the query; must be a ``str``.

    Returns:
        A dict with two keys:
        - "Error": ``None`` on success, or a message when ``search_query``
          is not a ``str``.
        - "Result": the ``text`` of every result from pages 1-4 of Google
          followed by pages 1-4 of Twitter (10 results requested per page),
          or ``[None]`` when the input is rejected.

    Only the Google and Twitter engines are created; the Bing, Facebook,
    Wikipedia and Flickr engines that used to be constructed here were
    never queried.
    """
    if not isinstance(search_query, str):
        return {"Error": "Pass a string, from mine.py [7]", "Result": [None]}

    engines = (
        Google(license=None, throttle=0.5, language=None),
        Twitter(license=None, throttle=0.5, language=None),
    )

    result = []
    for engine in engines:
        for page in range(1, 5):
            hits = engine.search(search_query, type=SEARCH, start=page, count=10)
            result.extend(hit.text for hit in hits)

    return {"Error": None, "Result": result}
from pattern.web import Bing, asynchronous, plaintext
from pattern.web import SEARCH, IMAGE, NEWS
import time

# This example retrieves results from Bing based on a given query.
# The Bing search engine can retrieve up to a 1000 results (10x100) for a query.

# Bing's "Custom Search API" is a paid service.
# The pattern.web module uses a test account by default,
# with 5000 free queries per month shared by all Pattern users.
# If this limit is exceeded, SearchEngineLimitError is raised.
# You should obtain your own license key at:
# https://datamarket.azure.com/account/
engine = Bing(license=None, language="en")

# Quote a query to match it exactly:
q = '"is more important than"'

# When you execute a query,
# the script will halt until all results are downloaded.
# In apps with an infinite main loop (e.g., GUI, game),
# it is often more useful if the app keeps on running
# while the search is executed in the background.
# This can be achieved with the asynchronous() function.
# It takes any function and that function's arguments and keyword arguments:
search_options = dict(start=1, count=100, type=SEARCH, timeout=10)
request = asynchronous(engine.search, q, **search_options)

# This while-loop simulates an infinite application loop.
# In real-life you would have an app.update() or similar
from pattern.web import Bing, IMAGE
import sys
import urllib

# Download every image result for the query 'meme' into images/<n>.jpg,
# where <n> is the position of the result in the result list.
engine = Bing()
for counter, result in enumerate(engine.search('meme', type=IMAGE)):
    try:
        urllib.urlretrieve(result.url, "images/%s.jpg" % counter)
    except Exception as error:
        # Still best effort (a failed download must not stop the loop), but
        # report the failure instead of hiding it. Catching Exception rather
        # than everything lets Ctrl-C interrupt the script. %r keeps the
        # message ASCII-safe for non-ASCII URLs on Python 2.
        sys.stderr.write("skipped %r: %r\n" % (result.url, error))
import os
import sys
sys.path.insert(0, os.path.join("..", ".."))

from pattern.web import Bing, asynchronous, plaintext
from pattern.web import SEARCH, IMAGE, NEWS
import time

# This example retrieves results from Bing based on a given query.
# NOTE(review): the original comments here referred to Yahoo ("Yahoo can
# retrieve up to a 1000 results (10x100) for a query", license key at
# https://developer.apps.yahoo.com/wsregapp/) although the engine used is
# Bing - confirm where a Bing license key is obtained.
# Without your own license key you will be sharing the default license
# with all users of this module.
engine = Bing(license=None)

# Quote a query to match it exactly:
q = '"is more important than"'

# When you execute a query, the script will halt until all results are downloaded.
# In applications with an event loop (e.g. a GUI or an interactive animation)
# it is more useful if the app keeps on running while the search is executed
# in the background.
# This can be achieved with the asynchronous() command.
# It takes any function and the function's arguments and keyword arguments:
search_options = dict(start=1, count=100, type=SEARCH, timeout=10)
request = asynchronous(engine.search, q, **search_options)
from pattern.db import date, time, NOW
from pattern.web import Bing, NEWS

# It is often useful to keep a date stamp for each row in the table.
# The pattern.db module's date() function can be used for this.
# It is a simple wrapper around Python's datetime.datetime class,
# with extra functionality to make it easy to parse or print it as a string.

print(date(NOW))
print(date())
print(date("2010-11-01 16:30", "%Y-%m-%d %H:%M"))
print(date("Nov 1, 2010", "%b %d, %Y"))
print(date("Nov 1, 2010", "%b %d, %Y", format="%d/%m/%Y"))
print("")

# All possible formatting options:
# http://docs.python.org/library/time.html#time.strftime

news_engine = Bing(license=None, language="en")
for article in news_engine.search("today", type=NEWS):
    print(article.title)
    # Result.date is a string (e.g. we can't > <= += with the date).
    print(repr(article.date))
    # date() can parse any Result.date in the web module.
    print(date(article.date))
    print("")

d = date("4 november 2011")
d += time(days=2, hours=5)
print(d)
# "X IS MORE IMPORTANT THAN Y"
# Here is a rough example of how to build a web miner.
# It mines comparative statements from Bing and stores the results in a table,
# which can be saved as a text file for further processing later on.

# Pattern matching also works with Sentence objects from the MBSP module.
# MBSP's parser is much more robust (but also slower).
#from MBSP import Sentence, parse

q = '"more important than"'  # Bing search query
p = Pattern.fromstring("NP (VP) more important than NP")  # Search pattern.

d = Datasheet()

engine = Bing(license=None)
for page in range(1, 2):  # a single page; max=10
    for result in engine.search(q, start=page, count=100, cached=True):
        # Strip markup from the description, then parse it into a Sentence.
        s = Sentence(parse(plaintext(result.description)))
        for m in p.search(s):
            left = m.constituents(constraint=0)[-1]   # Left NP.
            right = m.constituents(constraint=5)[0]   # Right NP.
            d.append((left.string.lower(), right.string.lower()))

pprint(d)
print("")
def google_search(match, targetfile):
    """Print and save the plain-text description of Google results.

    Result pages 1 through 9 are requested for ``match``. Each description
    is printed and written to ``targetfile`` followed by a newline.
    ``targetfile`` must provide a ``write()`` method.
    """
    engine = Google(license=None)
    for i in range(1, 10):
        for result in engine.search(match, type=SEARCH, start=i):
            text = plaintext(result.description)
            print(text)
            targetfile.write(text)
            targetfile.write('\n')


def bing_search(match):
    """Print the plain-text description of Bing results (pages 1 through 9)."""
    # An engine instance is required: binding the class itself (``Bing``)
    # made the ``search`` call below fail.
    engine = Bing()
    for i in range(1, 10):
        for result in engine.search(match, type=SEARCH, start=i):
            print(plaintext(result.description))


engine = Bing()  # Enter your license key.
for i in range(1, 15):
    for result in engine.search('holy', type=SEARCH, start=i):
        print(plaintext(result.description))
        print("")

#google_search(u'شیر مادر', milkfile)
#google_search(u'شیر وحشی', lionfile)
#google_search(u'شیر آب', tapfile)

##article = engine.search(match)
#print article.title
#for link in article.links:
#    print link
#    #subarticle = engine.search(link)
#    url = URL(link)
#    result = Result(url)
#    print result.download()
# Example of pattern: http://www.clips.ua.ac.be/pages/pattern from pattern.web import Bing, plaintext from pattern.en import Sentence, Chunk, parse from pattern.search import Pattern from pattern.graph import Graph, Node, Edge, export g = Graph() for i in range(1): print "--------------", i for r in Bing().search('"more important than"', start=i + 1, count=50): s = plaintext(r.description.lower()) print s s = Sentence(parse(s)) print s p = Pattern.fromstring('NP (VP) more important than NP') for m in p.search(s): a = m.constituents(p[+0])[-1] # Left NP. b = m.constituents(p[-1])[+0] # Right NP. a = (isinstance(a, Chunk) and a.head or a).string b = (isinstance(b, Chunk) and b.head or b).string if a and b: if a not in g: g.add_node(a, radius=5, stroke=(0, 0, 0, 0.8)) if b not in g: g.add_node(b, radius=5, stroke=(0, 0, 0, 0.8)) g.add_edge(g[b], g[a], stroke=(0, 0, 0, 0.6)) g = g.split()[0] # Largest subgraph. for n in g.sorted()[:40]: # Sorted by Node.weight.
def bingcorpsearch(word, concfilter='', extraquery='', license=None, start=1, count=50):
    """Search the web for sentences containing a keyword (concordance search).

    Queries Bing, downloads each distinct result page once, converts it to
    plain text and scans it sentence by sentence. A sentence ends at
    '.', '?', '!' or a newline.
    Uses 'pattern' (CLiPS, Antwerpen University).

    Args:
        word: keyword to look for (matched case-insensitively).
        concfilter: optional co-occurrence word; when given, only sentences
            that also contain it are yielded.
        extraquery: extra text appended to the search query.
        license: license key passed to the Bing engine.
        start: ``start`` argument passed to the engine's search.
        count: ``count`` argument passed to the engine's search.

    Yields:
        (leftcontext, word, rightcontext, url) tuples. ``word`` is the
        matched token as found in the page; a sentence is only yielded when
        the left or the right context is longer than 5 characters.
    """
    query = word if not concfilter else word + ' ' + concfilter
    if extraquery:
        query += ' ' + extraquery

    engine = Bing(license=license)
    word_lower = word.lower()
    conc_lower = concfilter.lower()
    processed = set()

    for result in engine.search(query, start=start, count=count):
        if result.url in processed:
            continue
        processed.add(result.url)
        try:
            content = plaintext(result.download())
        except Exception:
            # Best effort: skip pages that cannot be downloaded or converted.
            continue

        size = len(content)
        begin = 0          # index where the current sentence starts
        wordindex = None   # index of the keyword in the current sentence
        wordlength = 0     # length of the matched token
        concindex = None   # index of the co-occurrence word, if seen
        # Start at 0 so a keyword at the very beginning of the page is found.
        for i in range(size):
            if content[i] in '.?!\n':
                # Explicit None checks: comparing None with an int only
                # works by accident on Python 2 and raises on Python 3.
                has_word = wordindex is not None
                has_conc = (not concfilter) or concindex is not None
                if has_word and has_conc:
                    left = content[begin:wordindex]
                    right = content[wordindex + wordlength:i + 1]
                    if len(left.strip()) > 5 or len(right.strip()) > 5:
                        yield (left.strip(),
                               content[wordindex:wordindex + wordlength].strip(),
                               right,
                               result.url)
                wordindex = concindex = None
                begin = i + 1

            if len(word) + i <= size and content[i:i + len(word)].lower() == word_lower:
                wordindex = i
                wordlength = len(word)
                # Extend the match to the end of the token (next space,
                # '?', '!' or newline), looking only at the remaining text.
                for j in range(len(word), size - i):
                    if content[i + j] in ' ?!\n':
                        wordlength = j
                        break

            # The slice must start AND end relative to i; slicing up to
            # len(concfilter) could never match, so filtered searches
            # yielded nothing.
            if concfilter and content[i:i + len(concfilter)].lower() == conc_lower:
                concindex = i
engine = Google(license=None) for i in range(1, 10): for result in engine.search(match, type=SEARCH, start=i): print plaintext(result.description) targetfile.write(plaintext(result.description)) targetfile.write('\n') def bing_search(match): engine = Bing for i in range(1, 10): for result in engine.search(match, type=SEARCH, start=i): print plaintext(result.description) engine = Bing() # Enter your license key. for i in range(1, 15): for result in engine.search('holy', type=SEARCH, start=i): print plaintext(result.description) print #google_search(u'شیر مادر', milkfile) #google_search(u'شیر وحشی', lionfile) #google_search(u'شیر آب', tapfile) ##article = engine.search(match) #print article.title #for link in article.links: # print link # #subarticle = engine.search(link) # url = URL(link) # result = Result(url) # print result.download()
import sys, random
import machine
from pattern.en import tag
from pattern.web import Bing, SEARCH, IMAGE, URL, extension


def save_image(url, figure):
    """Download ``url`` and save it as illustrations/<figure><extension>.

    The extension is taken from the page part of the URL. The download
    happens before the file is opened, so a failed download does not leave
    an empty or truncated file behind, and the ``with`` block closes the
    file even if writing fails.
    """
    url = URL(url)
    data = url.download()
    path = 'illustrations/' + figure + extension(url.page)
    with open(path, 'wb') as f:
        f.write(data)


text = sys.stdin.read()
invention = machine.Invention(text)

engine = Bing(license=None)

# The following searches for patent illustrations on bing, using a generated
# noun from each description of the illustration.
#search_base = "patent illustration "
#for i, illustration in enumerate(invention.unformatted_illustrations):
#    nouns = [word for word, pos in tag(illustration) if pos == 'NN']
#    if len(nouns) > 0:
#        search_string = search_base + random.choice(nouns)#' '.join(nouns)
#        print "searching for: " + search_string
#        for j, result in enumerate(engine.search(search_string, type=IMAGE, count=5)):
#            print "saving: " + result.url
#            try:
#                save_image(result.url, "fig_" + str(i+1) + "_" + str(j+1))
#            except:
#                next
import sys, random
import machine
from pattern.en import tag
from pattern.web import Bing, SEARCH, IMAGE, URL, extension


def save_image(url, figure):
    """Download ``url`` and save it as illustrations/<figure><extension>.

    The extension is taken from the page part of the URL. The download
    happens before the file is opened, so a failed download does not leave
    an empty or truncated file behind, and the ``with`` block closes the
    file even if writing fails.
    """
    url = URL(url)
    data = url.download()
    path = 'illustrations/' + figure + extension(url.page)
    with open(path, 'wb') as f:
        f.write(data)


text = sys.stdin.read()
invention = machine.Invention(text)

engine = Bing(license=None)

# The following searches for patent illustrations on bing, using a generated
# noun from each description of the illustration.
#search_base = "patent illustration "
#for i, illustration in enumerate(invention.unformatted_illustrations):
#    nouns = [word for word, pos in tag(illustration) if pos == 'NN']
#    if len(nouns) > 0:
#        search_string = search_base + random.choice(nouns)#' '.join(nouns)
#        print "searching for: " + search_string
#        for j, result in enumerate(engine.search(search_string, type=IMAGE, count=5)):
#            print "saving: " + result.url
#            try:
#                save_image(result.url, "fig_" + str(i+1) + "_" + str(j+1))
#            except:
#                next

# the following searches for "fig N patent illustration"