def bingSearch(link, limit=4):
    """Return up to ``limit`` UTF-8 encoded result URLs from Bing for ``link``.

    The query is sanitized by replacing ``^`` with ``|``.  If the first
    search call raises, it is retried once with every `` news`` occurrence
    removed from the query.  If anything still fails, the function degrades
    to a one-element list holding ``link`` with `` news`` removed.

    :param link: search phrase.
    :param limit: maximum number of URLs to return.
    :returns: list of encoded URLs, or ``[link.replace(" news", "")]`` when
        the lookup fails.
    """
    # NOTE(review): the API key below is a credential committed to source;
    # consider loading it from configuration instead.
    try:
        # Sanitize input
        linkfile = link.replace("^", "|")
        bing = PyBingSearch('MsYC/eW39AiaY9EYFIC8mlX8C7HPRRooagMKRwVZx7Q')
        try:
            result_list, next_uri = bing.search(linkfile, limit, format='json')
        except Exception:
            # Retry once without the " news" decoration.
            result_list, next_uri = bing.search(
                linkfile.replace(" news", ""), limit, format='json')
        returning = []
        # Slicing caps the output at ``limit`` without indexing past the end
        # of a short result list; max() keeps a negative limit empty.
        for result in result_list[:max(limit, 0)]:
            try:
                returning.append(result.url.encode('utf8'))
            except Exception:
                # Stop at the first result that cannot be read or encoded.
                break
        return returning
    except Exception:
        # ``except Exception`` (not a bare except) keeps the best-effort
        # fallback but lets KeyboardInterrupt / SystemExit propagate.
        return [link.replace(" news", "")]
Exemple #2
0
    def run(self, keywords=None):
        """Query Bing for each keyword and collect the result URLs.

        When ``keywords`` is empty or omitted, the keywords are read from
        ``self.default_keyword_file``, one per line with surrounding
        whitespace stripped.  Every result URL is printed and appended to
        ``self.urls``.

        :param keywords: optional list of search phrases.
        :returns: ``False`` when no keywords were supplied and the default
            keyword file does not exist; ``True`` otherwise, even if the
            Bing query failed part-way.
        """
        if not keywords:
            # Check if file exists
            if not os.path.isfile(self.default_keyword_file):
                return False
            # The with-block closes the file even if reading raises.
            with open(self.default_keyword_file, "r") as fp:
                keywords = [line.strip() for line in fp]

        self.keywords = keywords
        print("Using Keywords:{0}".format(self.keywords))

        try:
            # Get the hits for the given keywords
            bing = PyBingSearch(BING_API_KEY)
            for keyword in self.keywords:
                print("KEYWORD:{0}".format(keyword))
                result_list, next_uri = bing.search(keyword,
                                                    limit=self.maxResuts,
                                                    format='json')
                for result in result_list:
                    url = result.url
                    print("Found URL:{0}".format(url))
                    self.urls.append(url)
        except Exception:
            # Best effort: report and keep whatever URLs were collected.
            # ``except Exception`` lets Ctrl-C / SystemExit through.
            print("Something went wrong querying Bing.")

        return True
Exemple #3
0
    def run(self, keywords=None):
        """Search Bing for every keyword and record the URLs that come back.

        If ``keywords`` is empty or not given, the list is loaded from
        ``self.default_keyword_file`` (one keyword per line, stripped).
        Each hit's URL is printed and appended to ``self.urls``.

        :param keywords: optional list of search phrases.
        :returns: ``False`` if there are no keywords and the default keyword
            file is missing; ``True`` in every other case, including when
            the Bing query raised.
        """
        if not keywords:
            # Check if file exists
            if not os.path.isfile(self.default_keyword_file):
                return False
            # Context manager guarantees the handle is released on error.
            with open(self.default_keyword_file, "r") as fp:
                keywords = [line.strip() for line in fp]

        self.keywords = keywords
        print("Using Keywords:{0}".format(self.keywords))

        try:
            # Get the hits for the given keywords
            bing = PyBingSearch(BING_API_KEY)
            for keyword in self.keywords:
                print("KEYWORD:{0}".format(keyword))
                result_list, next_uri = bing.search(keyword, limit=self.maxResuts, format='json')
                for result in result_list:
                    url = result.url
                    print("Found URL:{0}".format(url))
                    self.urls.append(url)
        except Exception:
            # Deliberately best-effort, but no longer swallows
            # KeyboardInterrupt / SystemExit like the bare except did.
            print("Something went wrong querying Bing.")

        return True
 def bingSearch(self, numresult=10):
     """Return the URLs of up to ``numresult`` Bing results for ``self.query``.

     :param numresult: maximum number of results to request and return.
     :returns: list of result URLs; shorter than ``numresult`` when Bing
         returns fewer hits.
     """
     bing = PyBingSearch(self.bing_api_key)
     results, next_uri = bing.search(self.query,
                                     limit=numresult,
                                     format='json')
     # Slice rather than index over range(numresult): indexing raised
     # IndexError whenever fewer than ``numresult`` results came back.
     # max() keeps a negative ``numresult`` yielding an empty list.
     return [result.url for result in results[:max(numresult, 0)]]
Exemple #5
0
def get_improved_term(query):
    """Return the Wikipedia article title Bing suggests for ``query``.

    Searches Bing for ``"<query> wikipedia"`` (three results) and returns
    the last path segment of the first ``en.wikipedia`` result whose
    description does not look like a disambiguation page, URL-unquoted and
    decoded as UTF-8.

    :param query: the term to look up.
    :returns: the corrected term, or ``query`` unchanged when no suitable
        result is found.
    """
    bing = PyBingSearch('') # Add your bing-api key here
    result_list, next_url = bing.search("%s wikipedia" % query, limit=3, format='json')
    for result in result_list:
        wiki_url = result.url
        wiki_desc = result.description
        if "en.wikipedia" not in wiki_url:
            continue
        # Skip disambiguation pages.  The original joined the two
        # ``not in`` tests with ``or``, which only rejected descriptions
        # containing BOTH phrases; either phrase is enough to reject.
        if "may refer to" in wiki_desc or "may also refer to" in wiki_desc:
            continue
        wiki_corr_term = wiki_url.split("/")[-1]
        try:
            return str(urllib.unquote(wiki_corr_term).decode('utf-8'))
        except UnicodeError:
            # Title is not valid UTF-8 or not representable as ``str``;
            # try the next result.
            continue
    return query
Exemple #6
0
def getTopTen():
    """Prompt for a query and a page budget, then pass Bing's hits to ``checkUrl``.

    Reads the search query and the number of pages from stdin into the
    module-level names ``query`` and ``pagesToBeCrawled``, logs both through
    ``fp.write``, requests ten Bing results and calls ``checkUrl`` on each
    result URL.  The loop stops early once the module-level ``pages`` value
    exceeds ``pagesToBeCrawled``.

    :raises ValueError: if the page count typed by the user is not an integer.
    """
    global query
    global pagesToBeCrawled
    # NOTE(review): ``fb`` is declared global but never used here; the log
    # handle written below is ``fp`` -- confirm which name was intended.
    global fb
    # NOTE(review): the API key is a credential committed to source.
    bing = PyBingSearch('mMlCxUd5qmU5uDJ1w1VLbDkobVK905A9cZZhYkfqGHg=')
    query = raw_input("Enter a search query ")
    # int(raw_input()) instead of Python 2 input(): input() evaluates the
    # typed text as a Python expression, i.e. runs arbitrary user code.
    pagesToBeCrawled = int(raw_input("Enter the number of pages you would like to be crawled? "))
    fp.write('****************************The query searched for is:' + query + ", pages to be crawled: " + str(pagesToBeCrawled) + '\n')
    urlList, next_uri = bing.search(query, limit=10, format='json') # get the results
    for result in urlList:
        # ``pages`` is not assigned in this function; presumably it is
        # advanced elsewhere (e.g. by checkUrl) -- verify.
        if pages > pagesToBeCrawled:
            print('We have successfully crawled {0} pages'.format(pagesToBeCrawled))
            break
        checkUrl(result.url)
Exemple #7
0
def bingSearch(linkfile):
    """Return the URL of the top Bing result for ``linkfile``.

    ``^`` in the query is replaced by ``|`` before searching.  The top URL
    is also written to ``bingResults.txt`` (overwriting it), wrapped in
    double quotes and followed by a single space.

    :param linkfile: search phrase.
    :returns: the first result's URL as ``str``.
    :raises IndexError: if Bing returns no results.
    """
    print("\nCalling bingSearch with arguments linkfile: {}:".format(str(linkfile)))
    # Sanitize input
    linkfile = linkfile.replace("^", "|")

    bing = PyBingSearch('XXXXX')
    # Get from bing:
    result_list, next_uri = bing.search(linkfile, limit=5, format='json')
    # Take the first hit up front so an empty result set fails before the
    # output file is truncated (the original relied on a bare
    # ``result_list[0].description`` expression for this).
    first_result = result_list[0]
    # Only the first URL was ever written (the loop broke after one pass);
    # the with-block closes the file, which the original never did.
    with open('bingResults.txt', 'w') as out_file:
        out_file.write('"' + first_result.url + '" ')
    print("\nbingSearch complete")
    return str(first_result.url)
def get_improved_term(query):
    """Map ``query`` to the title of its English Wikipedia article via Bing.

    Runs a three-result Bing search for ``"<query> wikipedia"`` and, for the
    first ``en.wikipedia`` hit that is not a disambiguation page, returns
    the final URL path segment after URL-unquoting and UTF-8 decoding.

    :param query: the term to look up.
    :returns: the corrected term, or ``query`` itself when nothing suitable
        is found.
    """
    bing = PyBingSearch('')  # Add your bing-api key here
    result_list, next_url = bing.search("%s wikipedia" % query,
                                        limit=3,
                                        format='json')
    for result in result_list:
        wiki_url = result.url
        wiki_desc = result.description
        if "en.wikipedia" not in wiki_url:
            continue
        # Reject disambiguation pages.  The original combined two
        # ``not in`` checks with ``or``, so a description was rejected only
        # if it contained both phrases; one phrase is sufficient.
        if "may refer to" in wiki_desc or "may also refer to" in wiki_desc:
            continue
        wiki_corr_term = wiki_url.split("/")[-1]
        try:
            return str(urllib.unquote(wiki_corr_term).decode('utf-8'))
        except UnicodeError:
            # Undecodable or non-ASCII title: fall through to the next hit.
            continue
    return query
Exemple #9
0
 def _hits(self,my_query):
     """Estimate the number of search hits for ``my_query``.

     For ``self.search_engine == "google"`` the count is scraped from the
     ``resultStats`` element of the Google results page; for ``"bing"`` it
     is the number of results PyBingSearch returns.  0.01 is always added,
     so the value is never zero.

     :param my_query: the search phrase.
     :returns: hit count plus 0.01, 0.01 when no count could be found, or
         ``None`` when ``self.search_engine`` is neither value.
     """
     if self.search_engine == "google":
         query = urllib.urlencode({'q' : my_query})
         # Random 0-4 s pause before each request (presumably to avoid
         # being rate limited -- confirm).
         time.sleep(randint(0,4))
         r = requests.get('https://www.google.com/search?' + query)
         searchres_param = "id=\"resultStats\">((About |)[0-9,]+) result(|s)</div>"
         print(my_query)
         match = re.search(searchres_param, r.text)
         digits = ""
         if match is not None:
             count = match.group(1)
             # Drop the literal prefix.  str.strip("About ") strips a
             # *character set* from both ends and only worked by accident.
             if count.startswith("About "):
                 count = count[len("About "):]
             digits = count.replace(',', '')
         if not digits:
             print("No results")
             return 0.01
         print("Result found")
         return int(digits) + 0.01
     elif self.search_engine == "bing":
         # NOTE(review): the API key is a credential committed to source.
         bing = PyBingSearch('xAFcyVsidXgkpQxwHYkPcPPPRGpjU2qlNtjBD6ZqGNU')
         result_list,next_url = bing.search(my_query)
         # An empty list gives 0 + 0.01, so no separate branch is needed.
         return len(result_list) + 0.01
Exemple #10
0
#pip install py-bing-search
#Blog post : http://bit.ly/1iEZHZt

from py_bing_search import PyBingSearch

bing = PyBingSearch('API-KEY')
result_list, next_uri = bing.search("Sorgu Cümleciği", limit=50, format='json')

# Write one result URL per line.  The file is opened only after the search
# succeeded (so a failed query no longer truncates it) and the with-block
# closes it even if a write raises.
with open("siteurl.txt", "wb") as url_file:
    for result in result_list:
        url_file.write(result.url + "\n")
Exemple #11
0
# -*- coding: utf-8 -*-
__author__ = 'lufo'

from py_bing_search import PyBingSearch

bing = PyBingSearch('QkcWAM6VJ/S0LJI9wvVGN4UNQUwikMb4zY/kUVe/hAw')
# Ask Bing for up to 50 JSON-formatted hits, then print each hit's URL.
search_response = bing.search("Python Software Foundation", limit=50, format='json')
result_list, next_uri = search_response

for hit in result_list:
    print(hit.url)
for query_full in input_text:
    index += 1
    # The phrase sent to Bing is the raw line with apostrophes removed.
    passphrase = query_full.replace("'", '')
    query = query_full.split()

    # Lower-case each word in place, echoing it as it is converted.
    for i, word in enumerate(query):
        query[i] = word.lower()
        print("qeury[" + str(i) + "] = " + query[i])

    new_list = []

    print(passphrase)
    # Handling exception in case of bad query
    try:
        result_list = bing.search(passphrase, limit=10, format='json')
    except:
        # Gathering bad query
        badwords_output.write(query_full + '\n')
        # NOTE(review): after a failed search ``result_list`` still holds
        # the previous iteration's value (or is unbound on the first pass).

    # Initializing dictionary: one empty list per tracked field.
    matched_list = {
        "passphrase": [],
        "title": [],
        "matchedWords": [],
        "maxMatchNumber": [],
        "percentage": [],
        "uniqueMatchedWords": [],
    }
Exemple #13
0
class WebMd:
    """Find WebMD pages for a condition through Bing and extract or summarize them."""

    def __init__(self):
        # Read the Bing API key.  The with-block closes the handle (the
        # original leaked it) and strip() drops the trailing newline that a
        # key file normally ends with.
        with open("apikey.txt") as key_file:
            self.APIKEY = key_file.read().strip()
        self.bing = PyBingSearch(self.APIKEY)
        # Browser-like request headers used by getUrl().
        self.headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
       'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
       'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
       'Accept-Encoding': 'none',
       'Accept-Language': 'en-US,en;q=0.8',
       'Connection': 'keep-alive'}
        self.summarizer = Summarizer()

    def _findContentDiv(self, html):
        """Return the main content element of a parsed WebMD page, or None."""
        document = BeautifulSoup(html)
        container = document.find('div', {'class':'hwDefinition_fmt'}) # better condition but doesn't always exist
        if container is None:
            container = document.find('div', {'id':'textArea'}) # generally always exists
        return container

    def extractUrlStructuredText(self, url):
        """Extract data from a webmd url as a list of heading/body dicts.

        Walks every descendant of the content div.  Each ``h1``-``h6`` or
        ``b`` tag closes the current section and the node right after it
        becomes the next heading; text nodes that are not pure whitespace
        are appended to the current body.  The node following a ``script``
        or ``style`` tag is skipped.

        Returns the sections that follow the first heading (the text before
        it is dropped), or ``[]`` when the page has no headings.  Raises
        AttributeError when neither content div exists.
        """
        soup = self._findContentDiv(self.getUrl(url))
        blocks = [] # list of objects containing heading and body
        heading = ""
        body = ""
        startNew = False
        skip = False
        for child in soup.recursiveChildGenerator():
            name = getattr(child, "name", None)
            if skip:
                skip = False
                continue
            if startNew:
                heading = child
                body = ""
                startNew = False
                continue
            if name in ['script', 'style']:
                skip = True
                continue
            if name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'b']:
                blocks.append({'heading':heading, 'body':body})
                startNew = True
            if name is None and not child.isspace(): # leaf node, don't keep spaces
                body = body + " " + child
        # Flush the section that was still open when the walk ended; the
        # original silently dropped the last heading and its body.  When a
        # heading tag was the final node, that section is already stored.
        if not startNew:
            blocks.append({'heading':heading, 'body':body})
        if len(blocks) > 1:
            return blocks[1:]
        return []

    def extractUrlText(self, url):
        """Extract the content text from a webmd url as one string.

        Concatenates every non-whitespace text node under the content div,
        skipping the node that follows a ``script`` or ``style`` tag.
        Raises AttributeError when neither content div exists.
        """
        soup = self._findContentDiv(self.getUrl(url))
        skipNext = False
        body = ""
        for child in soup.recursiveChildGenerator():
            if skipNext:
                skipNext = False
                continue
            name = getattr(child, "name", None)
            if name in ["script", "style"]:
                skipNext = True
            if name is None and not child.isspace(): # leaf node, don't keep spaces
                body = body + child
        return body

    def getUrl(self, url):
        """Fetch ``url`` with browser-like headers and return the raw response body."""
        # Reuse the headers built in __init__ instead of a duplicate dict.
        req = urllib2.Request(url, headers=self.headers)
        return urllib2.urlopen(req).read()

    def isFirstAidPage(self, url):
        """Return True when ``url`` contains the ``/first-aid/`` path segment."""
        return url.find('/first-aid/') != -1

    def search(self, s, limit=3):
        """Searches top limit number of bing searches.
           Returns the summarized/unsummarized data and the format code (0=no format, 1=formatted)

           Strategies are tried in order:
           1. first-aid pages -> ``extract_instructions`` output, code 1;
           2. structured heading/body blocks of the first result, code 1;
           3. a summary of the text of all results, code 0;
           4. a summary of the first result's URL, code 0.

           Raises IndexError when Bing returns no results.
        """
        result_list, next_uri = self.bing.search(s + " treatment webmd", limit=limit, format='json')

        ########### Xiuyan's processing. First Aid type instruction format ##########
        for result in result_list:
            print(result.url)
            if self.isFirstAidPage(result.url):
                try:
                    page = requests.get(result.url)
                    return (extract_instructions(page), 1)
                except Exception:
                    print("entered Xiuyan's except")

        ########## Rahman's processing. Returns structured data representing all of first link #############
        try:
            blocks = self.extractUrlStructuredText(result_list[0].url)
            return (blocks, 1)
        except Exception:
            # This branch runs on failure; the old text said "Able".
            print("Unable to structure into headers and body")

        ########### Rahman's processing for 'other' pages. Attempts to summarize all first three links ###########
        content = ""
        for result in result_list:
            try:
                content = content + self.extractUrlText(result.url)
            except Exception as e:
                print(e)
        # Summarize the gathered text only when there IS text.  The original
        # tested ``content == ""`` and so summarized an empty string and
        # discarded any text it had actually extracted.
        if content != "":
            print("Other WebMd Page")
            return (self.summarizer.summarizeText(content), 0)

        ########### Worst case: summarize first url ################
        print("Summarizing first")
        return (self.summarizer.summarizeUrl(result_list[0].url), 0)
def get_results(search):
    """Return the list of Bing results for the phrase ``search``.

    Requests ``NUM_SEARCH_RESULTS`` JSON-formatted hits using
    ``BING_SEARCH_KEY`` and discards the next-page URI.
    """
    client = PyBingSearch(BING_SEARCH_KEY)
    hits, _next_page = client.search(search,
                                     limit=NUM_SEARCH_RESULTS,
                                     format='json')
    return hits
Exemple #15
0
from py_bing_search import PyBingSearch
import sys
import os

# The query is the last command-line argument; "^" stands in for "|".
linkfile = sys.argv[-1]
linkfile = linkfile.replace("^", "|")
# NOTE(review): the API key is a credential committed to source.
bing = PyBingSearch('MsYC/eW39AiaY9EYFIC8mlX8C7HPRRooagMKRwVZx7Q')
result_list, next_uri = bing.search(linkfile, limit=5, format='json')
# Take the first hit before opening the output file so that an empty result
# set raises IndexError without truncating bingResults.txt (the original
# relied on a bare ``result_list[0].description`` expression for this).
first_result = result_list[0]

# Only the first URL was ever written (the loop broke after one pass); the
# with-block closes the file, which the original never did.
with open('bingResults.txt', 'w') as out_file:
    out_file.write('"' + first_result.url + '" ')
 def bingWikiSearch(self):
     """Return the URL of the single Bing hit for the query's first word plus " :wiki".

     Only the text before the first space of ``self.query`` is searched.
     Raises IndexError when Bing returns no result.
     """
     first_word = self.query.split(" ")[0]
     client = PyBingSearch(self.bing_api_key)
     hits, _next_page = client.search(first_word + " :wiki", limit=1, format='json')
     top_hit = hits[0]
     return top_hit.url
Exemple #17
0
# -*- coding: utf-8 -*-
__author__ = 'lufo'

from py_bing_search import PyBingSearch

bing = PyBingSearch('QkcWAM6VJ/S0LJI9wvVGN4UNQUwikMb4zY/kUVe/hAw')
# Fetch up to 50 JSON-formatted hits for a fixed phrase and list their URLs.
search_response = bing.search("Python Software Foundation", limit=50, format='json')
result_list, next_uri = search_response

for hit in result_list:
    print(hit.url)