def run(self, keywords=[]): if not keywords: # Check if file exists if not os.path.isfile(self.default_keyword_file): return False else: keywords = [] fp = open(self.default_keyword_file,"r") for line in fp.readlines(): keywords.append(line.strip()) fp.close() self.keywords = keywords print "Using Keywords:{0}".format(self.keywords) try: # Get the hits for the given keywords bing = PyBingSearch(BING_API_KEY) for keyword in self.keywords: print "KEYWORD:{0}".format(keyword) result_list, next_uri = bing.search(keyword, limit=self.maxResuts, format='json') for result in result_list: url = result.url print "Found URL:{0}".format(url) self.urls.append(url) except: print "Something went wrong querying Bing." pass return True
def run(self, keywords=[]): if not keywords: # Check if file exists if not os.path.isfile(self.default_keyword_file): return False else: keywords = [] fp = open(self.default_keyword_file, "r") for line in fp.readlines(): keywords.append(line.strip()) fp.close() self.keywords = keywords print "Using Keywords:{0}".format(self.keywords) try: # Get the hits for the given keywords bing = PyBingSearch(BING_API_KEY) for keyword in self.keywords: print "KEYWORD:{0}".format(keyword) result_list, next_uri = bing.search(keyword, limit=self.maxResuts, format='json') for result in result_list: url = result.url print "Found URL:{0}".format(url) self.urls.append(url) except: print "Something went wrong querying Bing." pass return True
def bing_search(query):
    """Return the list of result URLs Bing finds for *query*.

    Commas in the query are replaced with spaces before searching.
    """
    bing = PyBingSearch('rLSasvRW9cvlU5fG9hoSGjJG2M1eIjR+Ld27nFC9Pj8=')
    # BUG FIX: the sanitized query was built but never used — the raw
    # 'query' was searched instead.
    buildquery = query.replace(',', ' ')
    result_list = bing.search_all(buildquery, limit=10, format='json')
    return [result.url for result in result_list]
def bing_search(query):
    """Return the list of result URLs Bing finds for *query*.

    Commas in the query are replaced with spaces before searching.
    """
    bing = PyBingSearch('rLSasvRW9cvlU5fG9hoSGjJG2M1eIjR+Ld27nFC9Pj8=')
    # BUG FIX: the sanitized query was built but never used — the raw
    # 'query' was searched instead.
    buildquery = query.replace(',', ' ')
    result_list = bing.search_all(buildquery, limit=10, format='json')
    return [result.url for result in result_list]
def GetLinksForQueryBing(query):
    """Return up to 20 non-YouTube result URLs from Bing for *query*.

    Returns [] when the Bing API raises PyBingException.
    """
    bing = PyBingSearch('3Bybyj2qcK/w5FXbBqBUjI9MajN51efC2uYldmzvvnY')
    try:
        result_list = bing.search_all(query, limit=20, format='json')
    except PyBingException:
        return []
    # A plain [:20] already copes with short lists; min() was redundant.
    results = [result.url for result in result_list][:20]
    return [r for r in results if "youtube" not in r]
def bingSearch(self, numresult=10):
    """Return up to *numresult* result URLs for self.query via Bing."""
    bing = PyBingSearch(self.bing_api_key)
    results, next_uri = bing.search(self.query, limit=numresult, format='json')
    # BUG FIX: indexing range(numresult) raised IndexError whenever Bing
    # returned fewer than numresult hits; slicing handles short lists.
    return [result.url for result in results[:numresult]]
def bingSearch(link, limit=4):
    """Return up to *limit* Bing result URLs (utf-8 encoded) for *link*.

    Best-effort: if the first query fails, retries with " news" stripped;
    if everything fails, returns the stripped query itself as the only item.
    """
    #Sanitize input
    try:
        # '^' is used upstream as a stand-in for '|'; restore it.
        linkfile = link.replace("^", "|")
        bing=PyBingSearch('MsYC/eW39AiaY9EYFIC8mlX8C7HPRRooagMKRwVZx7Q')
        try:
            result_list, next_uri = bing.search(linkfile, limit, format='json')
        except:
            # Fallback query without the " news" suffix.
            result_list, next_uri = bing.search(linkfile.replace(" news", ""), limit, format='json')
        returning=[]
        for i in xrange(limit):
            try:
                returning.append(result_list[i].url.encode('utf8'))
            except:
                # Fewer than 'limit' results: stop collecting.
                break
        return returning
    except:
        # Worst case: echo the (stripped) query back to the caller.
        return [link.replace(" news", "")]
def get_improved_term(query):
    """Try to canonicalize *query* via its English Wikipedia article title.

    Searches Bing for '<query> wikipedia'; if an en.wikipedia result that is
    not a disambiguation page is found, returns the URL-decoded article
    title, otherwise returns the original query unchanged.
    """
    bing = PyBingSearch('')  # Add your bing-api key here
    result_list, next_url = bing.search("%s wikipedia" % query, limit=3,
                                        format='json')
    for result in result_list:
        wiki_url = result.url
        wiki_desc = result.description
        if "en.wikipedia" in wiki_url:
            # BUG FIX: the original 'or' of two 'not in' tests was almost
            # always true, so disambiguation pages were never skipped;
            # 'and' expresses "neither disambiguation phrase appears".
            if ("may refer to" not in wiki_desc) and ("may also refer to" not in wiki_desc):
                # Article title is the last path component of the URL.
                wiki_corr_term = wiki_url.split("/")[-1]
                try:
                    return str(urllib.unquote(wiki_corr_term).decode('utf-8'))
                except UnicodeError:
                    # Non-ASCII title cannot round-trip through str(): skip.
                    pass
    return query
def getTopTen():
    """Prompt the user for a query, fetch the top-10 Bing hits and crawl them.

    Relies on module-level state defined elsewhere in this file: fp (open
    log file), pages (running crawl counter) and checkUrl().
    """
    global query
    global pagesToBeCrawled
    # NOTE(review): declares global 'fb' but the code below writes to 'fp' —
    # looks like a typo; confirm against the rest of the module.
    global fb
    bing = PyBingSearch('mMlCxUd5qmU5uDJ1w1VLbDkobVK905A9cZZhYkfqGHg=')
    query = raw_input("Enter a search query ")
    pagesToBeCrawled = input("Enter the number of pages you would like to be crawled? ")
    fp.write('****************************The query searched for is:' + query + ", pages to be crawled: " + str(pagesToBeCrawled) + '\n')
    # get the results
    urlList, next_uri = bing.search(query, limit=10, format='json')
    for result in urlList:
        #initialUrls.append(result); # Add the initial lists to the list
        # Stop once the module-level crawl counter exceeds the budget.
        if (pages > pagesToBeCrawled):
            print 'We have successfully crawled',pagesToBeCrawled,'pages'
            break
        checkUrl(result.url)
def bingSearch(linkfile): print "\nCalling bingSearch with arguments linkfile: {}:".format(str(linkfile)) #Sanitize input linkfile = linkfile.replace("^", "|") bing=PyBingSearch('XXXXX') #Get from bing: result_list, next_uri = bing.search(linkfile, limit=5, format='json') #result_list, next_uri = bing.search("Python Software Foundation", limit=50, format='json') result_list[0].description #first bing result file = open( 'bingResults.txt', 'w') for res in result_list: file.write('"' + res.url + '" ') break print "\nbingSearch complete" return str(result_list[0].url)
def GetLinksForQueryBing(query):
    """Return up to 20 Bing result URLs for *query*, or None on any failure."""
    try:
        bing = PyBingSearch('3Bybyj2qcK/w5FXbBqBUjI9MajN51efC2uYldmzvvnY')
        result_list = bing.search_all(query, limit=20, format='json')
        results = [result.url for result in result_list]
    except Exception:
        # Best-effort: any API failure is reported as None, not raised.
        return None
    # A plain [:20] already copes with short lists; min() was redundant.
    return results[:20]
def get_improved_term(query):
    """Try to canonicalize *query* via its English Wikipedia article title.

    Searches Bing for '<query> wikipedia'; if an en.wikipedia result that is
    not a disambiguation page is found, returns the URL-decoded article
    title, otherwise returns the original query unchanged.
    """
    bing = PyBingSearch('')  # Add your bing-api key here
    result_list, next_url = bing.search("%s wikipedia" % query, limit=3,
                                        format='json')
    for result in result_list:
        wiki_url = result.url
        wiki_desc = result.description
        if "en.wikipedia" in wiki_url:
            # BUG FIX: the original 'or' of two 'not in' tests was almost
            # always true, so disambiguation pages were never skipped;
            # 'and' expresses "neither disambiguation phrase appears".
            if ("may refer to" not in wiki_desc) and ("may also refer to" not in wiki_desc):
                # Article title is the last path component of the URL.
                wiki_corr_term = wiki_url.split("/")[-1]
                try:
                    return str(urllib.unquote(wiki_corr_term).decode('utf-8'))
                except UnicodeError:
                    # Non-ASCII title cannot round-trip through str(): skip.
                    pass
    return query
def __init__(self):
    """Load the Bing API key from apikey.txt and set up search helpers."""
    # 'with' ensures the key-file handle is closed (the original leaked it).
    with open("apikey.txt") as key_file:
        self.APIKEY = key_file.read()
    self.bing = PyBingSearch(self.APIKEY)
    # Browser-like headers so target sites don't reject our requests.
    self.headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
                    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
                    'Accept-Encoding': 'none',
                    'Accept-Language': 'en-US,en;q=0.8',
                    'Connection': 'keep-alive'}
    self.summarizer = Summarizer()
def main(argv):
    """Run a Bing search for argv[1] and write every hit URL to output_file.

    Honors the module-level EXACT flag (quote the query for an exact-phrase
    match) and the module-level output_file path; exits the interpreter
    when finished.
    """
    query = argv[1]
    if EXACT:
        query = '"' + query + '"'
    print("Query:", query)
    bing = PyBingSearch()
    bing.search_all(query, 1000)
    print("-----------------------")
    print("hits:", len(bing.list))
    print("writing results to:", output_file)
    # 'with' closes the file; the original left the handle open.
    with open(output_file, 'w') as output:
        for url in bing.list:
            output.write(url + "\n")
    print("writing finished")
    sys.exit()
def _hits(self,my_query):
    """Estimate the number of search hits for *my_query*.

    Backend is chosen by self.search_engine.  "google": scrape the
    'About N results' counter from the results page.  "bing": use the
    length of the first result page.  Returns count + 0.01 (never zero,
    so callers can divide safely), or 0.01 when nothing is found.
    """
    if self.search_engine == "google":
        query = urllib.urlencode({'q' : my_query})
        # Random pause to look less like a bot to Google.
        time.sleep(randint(0,4))
        r = requests.get('https://www.google.com/search?' + query)
        # Matches e.g. id="resultStats">About 1,234 results</div>
        searchres_param = "id=\"resultStats\">((About |)[0-9,]+) result(|s)</div>"
        print my_query
        try:
            count = re.search(searchres_param,r.text).group(1)
            if "About " in count:
                count = count.strip("About ")
            print "Result found"
            # Strip thousands separators before converting to int.
            return (int(str(re.sub(',','',count))) + 0.01)
        except:
            # Counter absent (no results, or page layout changed).
            print "No results"
            return 0.01
    elif self.search_engine == "bing":
        bing = PyBingSearch('xAFcyVsidXgkpQxwHYkPcPPPRGpjU2qlNtjBD6ZqGNU')
        result_list,next_url = bing.search(my_query)
        # NOTE(review): this counts only the first page of results, not the
        # total hit count — confirm that is intended.
        if len(result_list) > 0:
            return len(result_list) + 0.01
        else:
            return 0.01
#pip install py-bing-search
#Blog post : http://bit.ly/1iEZHZt
from py_bing_search import PyBingSearch

# Write every result URL for the query to siteurl.txt, one per line.
# 'with' closes the file and we no longer shadow the 'file' builtin.
with open("siteurl.txt", "wb") as url_file:
    bing = PyBingSearch('API-KEY')
    result_list, next_uri = bing.search("Sorgu Cümleciği", limit=50, format='json')
    for result in result_list:
        url_file.write(result.url + "\n")
# -*- coding: utf-8 -*- __author__ = 'lufo' from py_bing_search import PyBingSearch bing = PyBingSearch('QkcWAM6VJ/S0LJI9wvVGN4UNQUwikMb4zY/kUVe/hAw') result_list, next_uri = bing.search("Python Software Foundation", limit=50, format='json') for result in result_list: print result.url
from py_bing_search import PyBingSearch
import os
import fileinput
import sys
import time
import re

# Force UTF-8 as the default codec (Python 2 hack, kept deliberately:
# result text may be non-ASCII).
reload(sys)
sys.setdefaultencoding('utf8')

filename = sys.argv[1]
bing = PyBingSearch('UvG/iELD97We0KffqjrVFHwUrEHbe0ZCbeVfraImZRg')
# Output files share the input's base name (".txt" stripped).
outputfile = filename.replace('.txt', "")

# Read the input terms, one per line; 'with' closes the handle (the
# original file object was never closed).
input_text = []
output_text = []
with open(filename, 'r') as text:
    for line in text:
        input_text.append(line.rstrip())
all_text_length = len(input_text)

# These handles stay open on purpose: they are written incrementally by
# code later in the file.
badwords = outputfile + "-badwords.txt"
total = outputfile + "-result.txt"
badwords_output = open(badwords, 'w')
total_output = open(total, 'w')
output = open(outputfile + ".json", 'w')
output.write('{"result":[')
class WebMd:
    """Finds WebMD treatment pages via Bing and extracts/summarizes them."""

    def __init__(self):
        # Bing API key is read from a local file.
        self.APIKEY = open("apikey.txt").read()
        self.bing = PyBingSearch(self.APIKEY)
        # Browser-like headers so WebMD does not reject our requests.
        self.headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
                        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
                        'Accept-Encoding': 'none',
                        'Accept-Language': 'en-US,en;q=0.8',
                        'Connection': 'keep-alive'}
        self.summarizer = Summarizer()

    def extractUrlStructuredText(self, url):
        """Extracts data from webmd url and provides a list of objects
        containing the heading and body.
        """
        html = self.getUrl(url)
        Soup = BeautifulSoup(html)
        soup = Soup.find('div', {'class':'hwDefinition_fmt'}) # better condition but doesn't always exist
        if soup == None:
            soup = Soup.find('div', {'id':'textArea'}) # generally always exists
        body = ""
        blocks = [] # list of objects containing heading and body
        heading = ""
        body = ""
        startNew = False
        skip = False
        for child in soup.recursiveChildGenerator():
            name = getattr(child, "name", None)
            if skip:
                # Previous tag was script/style: drop its text child.
                skip = False
                continue
            if startNew:
                # First node after a heading tag is the new heading text.
                heading = child
                body = ""
                startNew = False
                continue
            if name in ['script', 'style']:
                skip = True
                continue
            if name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'b']:
                # A new heading closes out the previous block.
                blocks.append({'heading':heading, 'body':body})
                startNew = True
            if name is not None:
                pass
            elif not child.isspace():
                # leaf node, don't print spaces
                body = body + " " + child
        if len(blocks)>1:
            # Drop the first (pre-heading) block.
            return blocks[1::]
        return []

    def extractUrlText(self, url):
        """Extracts content text from webmd url."""
        html = self.getUrl(url)
        Soup = BeautifulSoup(html)
        soup = Soup.find('div', {'class':'hwDefinition_fmt'}) # better condition but doesn't always exist
        if soup == None:
            soup = Soup.find('div', {'id':'textArea'}) # generally always exists
        skipNext = False
        body = ""
        for child in soup.recursiveChildGenerator():
            if skipNext:
                # Skip the text child of a script/style tag.
                skipNext = False
                continue
            name = getattr(child, "name", None)
            if name in ["script", "style"]:
                skipNext = True
            if name is not None:
                pass
            elif not child.isspace():
                # leaf node, don't print spaces
                body = body + child
        return body

    def getUrl(self, url):
        """Attempts to summarize webpage contents (assuming webmd url)."""
        # Browser-like headers; WebMD rejects bare urllib2 requests.
        hdr = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
               'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
               'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
               'Accept-Encoding': 'none',
               'Accept-Language': 'en-US,en;q=0.8',
               'Connection': 'keep-alive'}
        req = urllib2.Request(url, headers=hdr)
        response = urllib2.urlopen(req).read()
        #response = requests.get(test_url)
        #response = urllib2.urlopen(test_url).read()
        return response

    def isFirstAidPage(self, url):
        # First-aid pages get special instruction-extraction treatment.
        if url.find('/first-aid/') == -1:
            return False
        else:
            return True

    def search(self, s, limit=3):
        """Searches top limit number of bing searches. Returns the
        summarized/unsummarized data and the format code
        (0=no format, 1=formatted).
        """
        result_list, next_uri = self.bing.search(s + " treatment webmd", limit=limit, format='json')
        ########### Xiuyan's processing. First Aid type instruction format ##########
        for result in result_list:
            print(result.url)
            if self.isFirstAidPage(result.url):
                try:
                    page = requests.get(result.url)
                    print("piece of shit")
                    return (extract_instructions(page), 1)
                except:
                    print("entered Xiuyan's except")
        ########## Rahman's processing. Returns structured data representing all of first link #############
        try:
            blocks = self.extractUrlStructuredText(result_list[0].url)
            return (blocks, 1)
        except:
            print("Able to structure into headers and body")
        ########### Rahman's processing for 'other' pages. Attempts to summarize all first three links ###########
        content = ""
        for result in result_list:
            try:
                content = content + self.extractUrlText(result.url)
            except Exception, e:
                print(e)
                pass
        # NOTE(review): this condition looks inverted — it summarizes the
        # (empty) text exactly when nothing was extracted; confirm intent.
        if content == "":
            print("Other WebMd Page")
            return (self.summarizer.summarizeText(content), 0)
        ########### Worst case: summarize first url ################
        print("Summarizing first")
        return (self.summarizer.summarizeUrl(result_list[0].url), 0)
def get_results(search):
    """Return the list of Bing result objects (dict-like) for *search*."""
    engine = PyBingSearch(BING_SEARCH_KEY)
    # Only the first page is wanted; the continuation URI is discarded.
    page, _next_uri = engine.search(search, limit=NUM_SEARCH_RESULTS, format='json')
    return page
from py_bing_search import PyBingSearch bing = PyBingSearch('3Bybyj2qcK/w5FXbBqBUjI9MajN51efC2uYldmzvvnY') result_list = bing.search_all( "(yawn) AND (other OR early) AND (people) AND (contagious OR catching) AND (room)", limit=50, format='json') print[result.url for result in result_list][:10]
from py_bing_search import PyBingSearch
import sys
import os

# The search phrase comes from the last CLI argument; '^' was used as a
# shell-safe stand-in for '|', so restore it before querying.
linkfile = sys.argv[-1]
linkfile = linkfile.replace("^", "|")

bing = PyBingSearch('MsYC/eW39AiaY9EYFIC8mlX8C7HPRRooagMKRwVZx7Q')
result_list, next_uri = bing.search(linkfile, limit=5, format='json')

# Record only the first hit's URL (the original loop broke after one pass);
# 'with' closes the file and no longer shadows the 'file' builtin.  The
# dead 'result_list[0].description' expression was removed.
with open('bingResults.txt', 'w') as out:
    out.write('"' + result_list[0].url + '" ')
def bingWikiSearch(self):
    """Return the top Bing hit URL for the query's first word plus ':wiki'."""
    first_word = self.query.split(" ")[0]
    engine = PyBingSearch(self.bing_api_key)
    # Only one hit is needed; the continuation URI is discarded.
    hits, _next_uri = engine.search(first_word + " :wiki", limit=1, format='json')
    return hits[0].url
def GetLinksForQuery(query):
    """Return up to 20 Bing result URLs for *query*."""
    engine = PyBingSearch('3Bybyj2qcK/w5FXbBqBUjI9MajN51efC2uYldmzvvnY')
    urls = []
    for hit in engine.search_all(query, limit=20, format='json'):
        urls.append(hit.url)
    # Cap at 20 in case search_all over-delivers.
    return urls[:20]
from py_bing_search import PyBingSearch import urllib import urllib2 import json import os import socket socket.setdefaulttimeout(5) key = '4axpjG94pE8x9yUZqveY2LObcgNLVfX5oTW6+s5JbR0' bing = PyBingSearch('4axpjG94pE8x9yUZqveY2LObcgNLVfX5oTW6+s5JbR0') credentialBing = 'Basic ' + (':%s' % key).encode( 'base64')[:-1] # the "-1" is to remove the trailing "\n" which encode adds photo_directory = 'bingBad' if not os.path.exists(photo_directory): os.makedirs(photo_directory) for offset in range(0, 50000, 50): bing_search_url = "https://api.datamarket.azure.com/Bing/Search/v1/Image?Query=%27bad%20photography%27&$format=json&$top=200&$skip=" + str( offset) request = urllib2.Request(bing_search_url) request.add_header('Authorization', credentialBing) requestOpener = urllib2.build_opener() response = requestOpener.open(request) results = json.load(response) for i, image in enumerate(results['d']['results']):
from py_bing_search import PyBingSearch bing = PyBingSearch('3Bybyj2qcK/w5FXbBqBUjI9MajN51efC2uYldmzvvnY') result_list = bing.search_all("(yawn) AND (other OR early) AND (people) AND (contagious OR catching) AND (room)", limit=50, format='json') print [result.url for result in result_list][:10]
import json
import os
import requests
import time
from collections import defaultdict
from functools import reduce
from py_bing_search import PyBingSearch
import sys

# Edmunds vehicle API configuration.
apikey = 'wtprucmwrgk6bd92rq7tun97'
edmund_url = 'http://api.edmunds.com/api/vehicle/v2/'
end_url = '?fmt=json&view=full&api_key=' + apikey
bing = PyBingSearch('Np5rmrL6fIPP3jpDqVi+Li/rJ1Joih4Q6wP69HrjQro=')

# Running IDs and accumulators for makes/models/engines scraped below.
model_id = 1
make_id = 1
engine_id = 1
models_list = []
makes_list = []
engines_list = []
make_ids = {}
makes_models_dict = defaultdict(list)
makes_json = requests.get(edmund_url + 'makes' + end_url).json()

def add_engines(engine, model_id):
    # NOTE(review): the body appears truncated in this chunk — only the
    # global declarations are visible here.
    global engine_id
    global engines_list
    global models_list
soup = BeautifulSoup(r) body = soup.find('body').text body = unicodedata.normalize('NFKD', body).encode('ascii', 'ignore') body = body.splitlines() body = [i for i in body if i != ''] body = [x for x in body if len(x) > 70] body = map(cut, body) if len(body) < 5: indexes = range(0, len(body)) else: indexes = [randint(0, len(body) - 1) for i in range(0, 5)] return ['"' + body[i] + '"' for i in indexes] #Now the request bing = PyBingSearch('1lQ7z/Ye5Qo/vuWoEuznwGUDQX841pfEkLC77SBTNCs') #Function def request_urls(url): statements = rand_statements(url) list_duplicates = [] for statement in statements: result_list, next_uri = bing.search(statement, limit=50, format='json') results = [ unicodedata.normalize('NFKD', result_list[i].url).encode( 'ascii', 'ignore') for i in range(0, len(result_list)) ] list_duplicates = list_duplicates + results #Get the frequencies of each url we get return Counter(list_duplicates).most_common()
def GetLinksForQueryBing(query):
    """Return at most 20 result URLs that Bing finds for *query*."""
    searcher = PyBingSearch('3Bybyj2qcK/w5FXbBqBUjI9MajN51efC2uYldmzvvnY')
    all_hits = searcher.search_all(query, limit=20, format='json')
    # Slice first, then map to URLs — equivalent to mapping then slicing.
    return [hit.url for hit in all_hits[:20]]