def bing(request):
    """
    Search Bing using a paid API.
    ---
    type:
      translated:
        type: string
    parameters:
      - name: query
        description: search query
        required: true
        type: string
        paramType: form
      - name: tx
        description: Transaction Id (proof of payment)
        type: string
        paramType: query
    """
    if 'query' not in request.data:
        return Response({"error": "Must provide a 'query' parameter."},
                        status=status.HTTP_400_BAD_REQUEST)
    api = BingSearchAPI(settings.AZURE_MARKETPLACE_KEY)
    result = api.search_web(request.data['query'], payload={'$format': 'json'})
    if result.ok:
        return Response({"results": result.text})
    else:
        return Response({"error": result.text}, status=status.HTTP_400_BAD_REQUEST)
def download_raw_results(infile, outfile):
    # Rotates through a pool of API keys; bing_keys is assumed to be a
    # list of at least 7 keys defined elsewhere in the module.
    skip = 0
    params = {"ImageFilters": '"Face:Face"',
              "$format": "json",
              "$top": 50,
              "$skip": skip}
    lines = tuple(open(infile, "r"))
    search_results = open(outfile, "w+")
    raw_search_results = []
    websites_list = []
    i = 1
    for line in lines:
        bing = BingSearchAPI(bing_keys[i])
        i = i + 1
        if i == 7:
            i = 0
        print line
        r = bing.search("web", line, params)
        if r.status_code == 200:
            # print r
            raw_search_results.append(r.json)
            # res.append(bing.search("web",line,params))
    for result in raw_search_results:
        for elem in result["d"]["results"][0]["Web"]:
            websites_list.append(elem["DisplayUrl"])
    # Extract the links, and write to file so we don't have to search again
    for link in websites_list:
        search_results.write("%s\n" % link.encode("utf-8"))
def query(query_string):
    bing = BingSearchAPI(my_key)
    params = {'ImageFilters': '"Face:Face"',
              '$format': 'json',
              '$top': 10,
              '$skip': 0}
    results = bing.search('web', query_string, params).json()  # requests 1.0+
    return [result['Url'] for result in results['d']['results'][0]['Web']]
def get_actor_url(actor_name):
    bing = BingSearchAPI(BING_KEY)
    params = {'ImageFilters': '"Face:Face"',
              '$format': 'json',
              '$top': 1,
              '$skip': 0}
    actor_name = actor_name.encode('utf-8')
    data = bing.search('image', actor_name, params).json()
    return data['d']['results'][0]['Image'][0]['Thumbnail']['MediaUrl']
def search_bing(query, per_page=10, offset=0):
    try:
        my_key = ""
        bing = BingSearchAPI(my_key)
        params = {'$format': 'json', '$top': per_page, '$skip': offset}
        # .json() is needed before subscripting the response
        results = bing.search('image+web', query, params).json()
        results = results['d']['results'][0]['Web']
        return results
    except Exception as e:
        print e
        return []
def crawl_from_bing(search_query):
    my_key = read_bing_key()
    # search_query = "nba jumpshot"
    bing = BingSearchAPI(my_key)
    for i in range(20):
        params = {'$format': 'json',
                  '$top': 50,
                  '$skip': i * 50}
        result_list = bing.search('image', search_query, params).json()
        print(len(result_list['d']['results'][0]['Image']))
        for result in result_list['d']['results'][0]['Image']:
            image_url = result['MediaUrl']
            title_name = result['Title'].encode('gbk', 'ignore').decode(encoding="utf-8", errors="ignore")
            title_name = title_name.replace('... ', '')
            download_single_image(image_url, search_query, title_name)
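# The crawler above depends on two helpers that are not shown. A minimal
# sketch of what they might look like, assuming the key lives in a one-line
# text file and images go under a per-query directory (all names and paths
# here are hypothetical, not from the original source):
import os
import requests


def read_bing_key(path='bing.key'):
    # Read the API key from a plain text file.
    with open(path) as f:
        return f.read().strip()


def download_single_image(image_url, search_query, title_name):
    # Save the image bytes under a directory named after the query.
    target_dir = os.path.join('images', search_query.replace(' ', '_'))
    if not os.path.isdir(target_dir):
        os.makedirs(target_dir)
    resp = requests.get(image_url, timeout=10)
    if resp.ok:
        with open(os.path.join(target_dir, title_name + '.jpg'), 'wb') as f:
            f.write(resp.content)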
def getRelevantURLForWord(wd, api_key):
    from bing_search_api import BingSearchAPI
    bing = BingSearchAPI(api_key)
    params = {'$format': 'json', '$skip': '10'}
    result = bing.search_web(wd, payload=params)
    if result.status_code == 200:
        entries = result.json()['d']['results']
        if entries:
            # Pick a random entry; its absolute rank is offset by the
            # 10 results that were skipped.
            rank = random.randint(0, len(entries) - 1)
            url = entries[rank]['Url']
            return url, rank + 10
        else:
            return None
    else:
        raise ApiError("Web search api error: {}".format(result.status_code))
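# ApiError is raised above but never defined in the snippet; a minimal
# stand-in (an assumption, not from the original source) would be:
class ApiError(Exception):
    pass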
class BingImageFetcher:
    NUM_IMGS = 10
    TIMEOUT = 10.0
    IMG_FILES = 'img'

    def __init__(self, keypath):
        keyfile = open(keypath, 'r')
        key = keyfile.readline().strip()
        self.bing = BingSearchAPI(key)
        self.params = {  # 'ImageFilters': '"Face:Face"',
            '$format': 'json',
            '$top': self.NUM_IMGS,
            '$skip': 0}

    def create_request(self, word):
        # note, throws ConnectionError if failed to fetch
        resp = self.bing.search('image', word, self.params).json()
        image_results = resp['d']['results'][0]['Image']
        if len(image_results) == 0:
            raise Exception('Failed to find any images for query ' + word)
        # Index within the results actually returned, which may be fewer
        # than NUM_IMGS.
        image_url = image_results[random.randint(0, len(image_results) - 1)]['MediaUrl']
        up = urlparse.urlparse(image_url)
        destfile = os.path.basename(up.path)
        destpath = os.path.join(BingImageFetcher.IMG_FILES, destfile)
        if not os.path.isdir(BingImageFetcher.IMG_FILES):
            os.mkdir(BingImageFetcher.IMG_FILES)
        is_cached = False
        if os.path.isfile(destpath):
            # if we already have that image then just use the cached version
            is_cached = True
        return is_cached, image_url, destpath
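# A possible usage of the fetcher above (the key-file path and query are
# hypothetical; Python 2, matching the snippet's urlparse usage):
import urllib

fetcher = BingImageFetcher('bing.key')
is_cached, image_url, destpath = fetcher.create_request('sunset')
if not is_cached:
    # download only when the image is not already on disk
    urllib.urlretrieve(image_url, destpath)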
def get_text(self):
    if self.conf_lyrics_done:
        print 'Lyrics are already done'
        return self.conf_lyrics_done
    bing = BingSearchAPI()
    tags = self.conf_tags
    search = '%s lyrics %s' % (tags['title'], tags['performer'])
    print 'Searching for lyrics. Search string: %s' % search
    lyrics_search = bing.search('web', search.encode('utf-8'), {'$format': 'json'})
    #print 'Lyrics search result: %s' % pformat(lyrics_search)
    for result in lyrics_search.get('d', {}).get('results', [{}])[0].get('Web', []):
        url = result['Url']
        print 'lyrics in %s?' % url
        for match, (good_attr, bad_part) in lyrics_matches.items():
            if match in url:
                # Good! We have a known site with lyrics - let's extract them.
                print 'yes, lyrics are probably here'
                browser = Browser()
                browser.set_handle_robots(None)
                browser.open(url)
                text = browser.response().read()
                soup = BeautifulSoup(text, convertEntities=BeautifulSoup.HTML_ENTITIES)
                lyrics_el = soup.find(attrs=good_attr)
                if not lyrics_el:
                    #print 'Not found lyrics in %s' % text
                    continue
                #print 'full text: %s' % text
                #print 'Found something like this: %s' % lyrics_el
                parts = list(self.extract_text_parts(lyrics_el.contents, bad_part))
                lyrics = '\n'.join(parts)
                #print 'Found lyrics: \n%s' % lyrics
                print 'Found lyrics: %s...' % lyrics[:150]
                self.conf_lyrics = lyrics
                self.conf_lyrics_done = True
                return self.conf_lyrics_done
        print 'Unsupported lyrics source: %s' % url
    if not self.conf_lyrics_done:
        print 'ERROR: lyrics not found! %s' % self.conf_tags['title']
    return self.conf_lyrics_done
def bing_search_total(_verbose, _search_phrase, _bing_api_key):
    # %22 acts as quotes, facilitating a phrase search
    _search_phrase_parsed = "%22" + _search_phrase.replace(' ', '+').strip(' ') + "%22"
    _bing_search = BingSearchAPI(_bing_api_key)
    _bing_parameters = {'$format': 'json', '$top': 2}
    try:
        res = _bing_search.search('web', _search_phrase_parsed, _bing_parameters).json()
        total_search_results = res["d"]["results"][0]["WebTotal"]
        total = int(total_search_results)
        if isinstance(total, int):
            if _verbose:
                # str(total) is needed here; concatenating an int would raise TypeError
                print('\t' + _search_phrase_parsed.replace('+', ' ').replace('%22', '') + str(total))
        return total
    except Exception as e:
        if _verbose:
            print('\tERROR: in bing.search() - search total\n\t' + str(e))
        return 0
# Tail of an image-search helper; the enclosing def was cut off in the
# source, so the name and signature below are reconstructed guesses:
def getImageURL(query_string, params, random_index):
    result = bing.search('image', query_string, params).json()
    image_url = result['d']['results'][0]['Image'][random_index]['MediaUrl']
    return image_url


def getAmazonURL(item):
    """ A utility for retrieving the Amazon search results URL for some item. """
    return "http://www.amazon.com/s?field-keywords={0}".format(urllib.quote_plus(item))


#
# SETUP
bing_key = 'api_key_here'
angelListAPI = AngelListAPI()
genderGuesserAPI = GenderGuesserAPI()
googleBooksAPI = GoogleBooksAPI()
googleResultsAPI = GoogleResultsAPI()
thesaurusAPI = ThesaurusAPI()
tweetSentimentAPI = TweetSentimentAPI()
bing = BingSearchAPI(bing_key)
APIS = [angelListAPI, genderGuesserAPI, googleBooksAPI, googleResultsAPI,
        thesaurusAPI, tweetSentimentAPI]
CALCULATOR = Calculator(APIS)
def get_pics(self):
    if self.conf_pics_done:
        print 'Pics are already done'
        return self.conf_pics_done
    imgdir = self.imgdir
    if len(glob.glob1(imgdir, "*.png")) > REQUIRED_IMAGE_COUNT:
        self.conf_pics_done = True
        return self.conf_pics_done
    bing = BingSearchAPI()
    tags = self.conf_tags
    search = '%s %s' % (tags['title'], tags['performer'])
    print 'Searching for images. Search string: %s' % search
    img_search = bing.search('image', search.encode('utf-8'), {'$format': 'json'})
    print 'Images: %s' % pformat(img_search)
    registry = processed_image_urls.setdefault(imgdir, set())
    if not os.path.exists(imgdir):
        os.makedirs(imgdir)
    for result in img_search.get('d', {}).get('results', [{}])[0].get('Image', []):
        if result['MediaUrl'] not in registry:
            browser = Browser()
            browser.set_handle_robots(None)
            registry.add(result['MediaUrl'])
            log.debug('%s images in %s' % (len(glob.glob1(imgdir, "*.png")), imgdir))
            try:
                #log.debug('Opening %s' % result['SourceUrl'])
                browser.open(result['SourceUrl'])
                #log.debug('Opening %s' % result['MediaUrl'])
                img = Image.open(browser.open(result['MediaUrl']))
                if img.size[0] >= DEFAULT_VIDEO_RESOLUTION[0] and img.size[1] >= DEFAULT_VIDEO_RESOLUTION[1]:
                    print 'Found image: %s' % result['MediaUrl']
                    # the +1 belongs inside the format argument, not appended to the string
                    img.save(os.path.join(imgdir, 'image%03d.png' % (len(glob.glob1(imgdir, "*.png")) + 1)))
                    if len(glob.glob1(imgdir, "*.png")) > REQUIRED_IMAGE_COUNT:
                        self.conf_pics_done = True
                        break
            except:
                print_exc()
    if len(glob.glob1(imgdir, "*.png")) < REQUIRED_IMAGE_COUNT:
        # Not enough images yet - retry with just the performer name.
        search = tags['performer']
        print 'Searching for images. Search string: %s' % search
        img_search = bing.search('image', search.encode('utf-8'), {'$format': 'json'})
        for result in img_search.get('d', {}).get('results', [{}])[0].get('Image', []):
            if result['MediaUrl'] not in registry:
                browser = Browser()
                browser.set_handle_robots(None)
                registry.add(result['MediaUrl'])
                log.debug('%s images in %s' % (len(glob.glob1(imgdir, "*.png")), imgdir))
                try:
                    #log.debug('Opening %s' % result['SourceUrl'])
                    browser.open(result['SourceUrl'])
                    #log.debug('Opening %s' % result['MediaUrl'])
                    img = Image.open(browser.open(result['MediaUrl']))
                    if img.size[0] >= DEFAULT_VIDEO_RESOLUTION[0] and img.size[1] >= DEFAULT_VIDEO_RESOLUTION[1]:
                        print 'Found image: %s' % result['MediaUrl']
                        img.save(os.path.join(imgdir, 'image%03d.png' % (len(glob.glob1(imgdir, "*.png")) + 1)))
                        if len(glob.glob1(imgdir, "*.png")) > REQUIRED_IMAGE_COUNT:
                            self.conf_pics_done = True
                            break
                except:
                    print_exc()
    return self.conf_pics_done
from bing_search_api import BingSearchAPI
import json

my_key = "8jhH8TwVCHdDiWxXYgC5KqyEmChYTKW0kkFngbVYnH8"
query_string = "Sony"
bing = BingSearchAPI(my_key)
params = {'$format': 'json', '$top': 10, '$skip': 0}
news = bing.search('news', query_string, params).json()
for i in range(10):
    print(news['d']['results'][0]['News'][i])
#news = json.loads(bing.search('news', query_string, params).json())
def request_image(window, keyword, num_of_try=0, translate=True):
    """
    Queries Bing for images and retries up to 5 times if the randomly selected image could not be accessed
    :param keyword: string which specifies the image content
    :param num_of_try: internal parameter that increases if the selected image could not be retrieved (e.g. Forbidden Error)
    :param translate: Should the keyword be translated to english before the search? (may increase result size)
    :return: The image data in bytes
    """
    if keyword is None:
        return None
    if translate:
        ms_key = open('../ms.key').read()
        trans = Translator('__RealTimeStoryIllustrator__', ms_key)
        translatedkw = trans.translate(keyword, lang_from='de', lang_to='en')
        print("IMAGE SERVICE: Getting image for " + str(keyword) +
              ". Searched for the english translation '" + str(translatedkw) + "'.")
    else:
        translatedkw = keyword
        print("IMAGE SERVICE: Getting image for " + str(keyword) + ".")
    if num_of_try > 5:
        # no images were found
        logger.error("IMAGE SERVICE: Could not find an image after 5 tries for " + str(translatedkw) + ".")
        return None
    # OLD CODE FOR SEARCHING BEGIN
    # term = urllib.parse.quote_plus(translatedkw)
    # sites = [line.rstrip() for line in
    #          open(os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'data', 'sites.txt'),
    #               encoding="utf-8")]
    # excludedsites = ""
    # for site in sites:
    #     excludedsites = excludedsites + "-site:" + urllib.parse.quote_plus(site) + '%20'
    # img_type = '%7Eillustration+AND+clipart'
    # opener = urllib.request.build_opener()
    # opener.addheaders = [('User-agent', 'Mozilla/5.0')]
    # url = ('http://ajax.googleapis.com/ajax/services/search/images?' +
    #        'v=1.0&q=' + term + '%20' + img_type + '%20' + excludedsites + '%20&userip=91.141.0.105' +
    #        '&rsz=8&imgsz=medium&safe=active' + '&tbs=ic:color')
    # OLD CODE FOR SEARCHING END
    try:
        params = {'$format': 'json', '$top': 10, 'ImageFilters': '\'Size:Small\''}
        bing_key = open('../bing.key').read()
        api = BingSearchAPI(bing_key)
        result = api.search_image(str(translatedkw + '+AND+(illustration+OR+clipart)'), params)
        amount = len(result.json()['d']['results'])
        # print(json.dumps(result.json(), sort_keys=True, indent=2))
        # print(result.json())
        # print(result.json()['d']['results'][0]['MediaUrl'])
        img_num = random.randint(0, amount - 1)
        data = urllib.request.urlopen(result.json()['d']['results'][img_num]['MediaUrl'], timeout=2).read()
        return data
    except Exception as e:
        # have to catch everything since socket exceptions seem to be broken
        print("ERROR in IMAGE SERVICE: Trying again, request was denied " + str(e))
        return request_image(window, keyword, num_of_try + 1, translate=translate)
import json
from bing_search_api import BingSearchAPI

#read the top keywords(usually 1-3) and generate the search keyword parameter
k = []
with open("topkeywords.txt") as f:
    for line in f:
        k.append(line.strip())
s = ' '.join(k)

n = 1000  #search result limit
my_key = "uAZ6dYNEodLuQxx1W3UKkLegY+Uj8y7e1E3AxPwqtmM"  #API key
query_string = s  #the query string. currently only has keyword parameter.
bing = BingSearchAPI(my_key)  #initialize search request
params = {'$format': 'json'}  #response format as json

#output file
f = open("bingresults.txt", "w")

#get first 50 results from Bing
for obj in bing.search('web', query_string, params).json()['d']['results']:
    for lnk in obj['Web']:
        f.write(lnk['Url'])
        f.write('\n')

i = 50
#get the rest of the results (the source was cut off after the params line;
#the loop body below mirrors the first request above)
while i < n:
    params = {'$format': 'json', '$skip': i}  #skip first i results
    for obj in bing.search('web', query_string, params).json()['d']['results']:
        for lnk in obj['Web']:
            f.write(lnk['Url'])
            f.write('\n')
    i += 50
f.close()
from secret_config import secret_config
from bing_search_api import BingSearchAPI

search_api = BingSearchAPI(secret_config['BING_API_KEY'])


def query(q, sources='web'):
    params = {"$format": "json", "$top": 20}
    response = search_api.search(sources, q.encode('utf-8'), params)
    return response.json()
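# A possible call site for query() above; the result shape ('d' -> 'results'
# -> 'Web') is assumed to match the other snippets in this collection:
results = query('bing search api')
for entry in results['d']['results'][0]['Web']:
    print(entry['Url'])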
import logging
from collections import Counter
from bing_search_api import BingSearchAPI

my_key = "urTuvjb7b6dFiCmC3Jj6ZAxuX8DqyXwQRDccSEQJVbc"
bing = BingSearchAPI(my_key)


def main():
    print(search("Hello", 3))


# category='Web' or category='News'
def search(query_string, num_results, category='Web'):
    query_string = query_string.replace("#", "%23")
    params = {'ImageFilters': '"Face:Face"',
              '$format': 'json',
              '$top': num_results,
              '$skip': 0}
    results = bing.search(category, query_string, params).json()  # requests 1.0+
    return [result['Url'] for result in results['d']['results'][0][category]]


def group_search(query_list, num_results, category='Web', on_wiki=False, on_ubd=False,
import sys
import random
import urllib  #import urllib to download images from their URLs
import time  #to implement delays in making requests to Toronto Deep Learning
from textblob import TextBlob, Word  #import the class TextBlob from textblob, Word to get definitions
from bing_search_api import BingSearchAPI  #import the class BingSearchAPI from https://github.com/xthepoet/pyBingSearchAPI
from cvserver import response_for_image, captions, nearest_neighbour  #import the function response_for_image to fetch image descriptions, captions to extract them from HTML using beautiful soup


#MY FUNCTIONS
def fix_punctuation(sentence):  #pass in a string to fix the punctuation
    return sentence.replace(' .', '').replace(' , ', ', ')


#INFO FOR BING API
my_key = "insert_API_key"  #replace with Bing API Key
query_string = sys.argv[1]  #get query string as input from command line using sys.argv, for multiple words use query between " "
bing = BingSearchAPI(my_key)

#parameters for image searching -- more documentation on params and image filters here http://goo.gl/xG0v0O
params = {'ImageFilters': '"Style:Photo"',
          '$format': 'json',  #specifies format of data response
          '$top': 400,  #specifies number of results to return, default is 50
          '$skip': 0}  #specifies starting point offset for the results

#bing.search() requires sources first (images, web, video, etc.), then query string, then rest of params (above)
#full schema documentation for bing API is here http://goo.gl/xG0v0O
results = bing.search('image', query_string, params).json()  #requests 1.0+
image_list = results['d']['results'][0]['Image']  #this gets us to the list of all the images

#create a new list of all the image source URLs using a list comprehension
image_urls = [image['MediaUrl'] for image in image_list if len(image['MediaUrl']) > 0]
from bing_search_api import BingSearchAPI
import json

my_key = "dWls875YJyXwh7dmX3LdIaETO9IDjfkdG4g8533M9zs"
query_string = raw_input("What is your query? ")
bing = BingSearchAPI(my_key)
params = {'$format': 'json',
          '$top': 10,
          '$skip': 0}
searchJSON = bing.search('news', query_string, params).json()
# the response is a dict keyed by 'd'; indexing searchJSON[1] directly would raise KeyError
print searchJSON['d']['results'][0]['News'][1]
def Collocations_Method_2(_bing_api_key, _n_grams_from_input_text_file, _input_file_path,
                          _apply_POS_restrictions, _verbose):
    if _verbose:
        # A file to save the verbose output of the program
        _output_file_verbose = str(_input_file_path).replace(_input_file_path.split('/')[-1], 'verbose.txt')
        _output_file_verbose = open(_output_file_verbose, 'a')
        print("\n--------------------------------------------------------------------------",
              file=_output_file_verbose)
        print("\tMethod-2: Title-Url - Extracting collocations:", file=_output_file_verbose)
        print("--------------------------------------------------------------------------\n\n",
              file=_output_file_verbose)
    print("\tMethod-2: Title-Url - Extracting collocations ...")

    # A list to store n-gram phrases that are collocations
    title_url_collocations = []
    # A list to store n-gram phrases that are not collocations
    n_grams_not_collocations = []
    # Snowball stemmer is used to stem words
    stemmer = snowballstemmer.stemmer('english')

    # Call to Bing search API
    _bing_search = BingSearchAPI(_bing_api_key)
    _bing_search_parameters = {'$format': 'json', '$top': 10}  # Top 10 search results

    # Python list with words synonymous to 'Wikipedia', 'dictionary', 'definition'
    # (commas restored here; the source's line continuations silently
    # concatenated adjacent string literals)
    _list_of_synonymous_words = ['dictionary', 'lexicon', 'definition', 'meaning', 'unabridged',
                                 'gazetteer', 'spellchecker', 'spellingchecker', 'thesaurus',
                                 'synonymfinder', 'wordfinder', 'wikipedia', 'investorwords',
                                 'investopedia', 'wiktionary']

    for _n_gram in _n_grams_from_input_text_file:
        if _verbose:
            print("\n%s:" % (_n_gram), file=_output_file_verbose)
        if _n_gram in title_url_collocations or _n_gram in n_grams_not_collocations:
            # If a particular n-gram phrase was checked before, it is already in
            # one of the two lists, so we move on to the next n-gram / phrase.
            continue
        else:
            # Before checking if the n-gram is a collocation, check that at least one
            # POS tag is from the valid POS tag list {Noun, Verb, Adverb, Adjective}
            # if _apply_POS_restrictions is set to True.
            if _apply_POS_restrictions:
                valid_POS_tags = ['NN', 'VB', 'RB', 'JJ']
                _valid_POS_tag_counter = 0  # counts the number of valid POS tags in the n-gram
                for _pos_tag in valid_POS_tags:
                    if _pos_tag in _n_gram:
                        _valid_POS_tag_counter += 1
                if _valid_POS_tag_counter == 0:
                    # If no valid POS tag is present in the n-gram, it is not a
                    # collocation when POS restrictions are applied.
                    n_grams_not_collocations.append(_n_gram)
                    if _verbose:
                        print("\t'%s' does not have valid POS tags\n\tMoving on to the next phrase ..."
                              % (_n_gram), file=_output_file_verbose)
                    continue  # We move on to the next phrase
            # If POS restrictions are not to be applied on the n-gram
            _n_gram_lower = _n_gram.lower() + ' '  # Lower case
            _n_gram_lower = re.sub(r'_.*? ', ' ', _n_gram_lower).rstrip(' ')  # strip inline POS tags
            _n_gram_lower_search_phrase = 'define "%s"' % (_n_gram_lower)  # Bing - Phrase search
            try:
                _search_results = _bing_search.search('web', _n_gram_lower_search_phrase,
                                                      _bing_search_parameters).json()
                _search_result_count = len(_search_results["d"]["results"][0]["Web"])
            except Exception as e:
                if _verbose:
                    print("\tERROR: Method-2 - Bing search - Title-Url\n%s" % (str(e)),
                          file=_output_file_verbose)
                print("\tERROR: Method-2 - Bing search - Title-Url\n%s" % (str(e)))
                _search_result_count = 0
                continue

            # List to save top 10 search Titles
            _search_titles = []
            # List to store top 10 search Urls
            _search_urls = []
            # Iterate through each search result and append titles and Urls to their lists
            for x in xrange(0, _search_result_count):
                _url = _search_results["d"]["results"][0]["Web"][x]["Url"]
                _title = _search_results["d"]["results"][0]["Web"][x]["Title"]
                _title = unicodedata.normalize('NFKD', _title).encode('ascii', 'ignore')
                _url = unicodedata.normalize('NFKD', _url).encode('ascii', 'ignore')
                _search_titles.append(_title)
                _search_urls.append(_url)

            # Removing punctuation, special characters and spaces from the keyword
            _n_gram_lower_no_spaces = ''.join(_char for _char in _n_gram_lower if _char.isalnum())
            _n_gram_lower_no_spaces = _n_gram_lower_no_spaces.replace(' ', '')

            _number_of_search_results_returned = len(_search_urls)  # No. of search urls = titles
            # Counts of titles and urls that have valid keywords and match the search phrase
            _number_of_valid_titles = 0
            _number_of_valid_urls = 0
            for x in xrange(0, _number_of_search_results_returned):
                _search_title = _search_titles[x]
                _search_title_lower_case = _search_title.lower()
                _search_title_lower_case_no_spaces = "".join(
                    _char for _char in _search_title_lower_case if _char.isalnum())
                _search_url = _search_urls[x]
                _search_url_lower_case = _search_url.lower()
                _search_url_lower_case_no_spaces = "".join(
                    _char for _char in _search_url_lower_case if _char.isalnum())
                if _verbose:
                    print("\t%d:\n\tSearch title: %s\n\tSearch Url: %s" % (x + 1, _search_title, _search_url),
                          file=_output_file_verbose)
                for _synonym in _list_of_synonymous_words:
                    _synonym_match = False
                    # Check if _synonym is present in the title
                    _title_match = re.search(_synonym, _search_title_lower_case_no_spaces)
                    # Check if _synonym is present in the url
                    _url_match = re.search(_synonym, _search_url_lower_case_no_spaces)
                    # If a match is found either in the title or the url, open the link and
                    # check if the <title> </title> tag from the html matches the keyword
                    if _title_match:
                        _synonym_match = True
                    elif _url_match:
                        _synonym_match = True
                    else:
                        continue
                    if _synonym_match:
                        # Reading HTML from url
                        try:
                            # replaces: _url_response = urllib2.urlopen(_search_url)
                            http = httplib2.Http(".cache")
                            resp, _url_response = http.request(_search_url, "GET")
                            _html = _url_response
                            # print(_html)
                            _beautiful_html = BeautifulSoup(_html, "lxml")
                        except Exception as e:
                            if _verbose:
                                print("\tException - Method-2 - Reading HTML\n%s" % (str(e)),
                                      file=_output_file_verbose)
                            print("\tException - Method-2 - Reading HTML\n%s" % (str(e)))
                            print("-----------------\n" + _search_url + "\n---------------\n")
                            continue  # _beautiful_html is undefined on failure, so skip this synonym

                        # Extracting the page title text
                        try:
                            # The <h1>-based extraction below is not yet coded and stays commented out
                            # _text_from_title = _beautiful_html.find('h1').text
                            # print(_beautiful_html.find('h1').text + "\n")
                            # print("sss" + _beautiful_html.title.string + '\n')
                            _text_from_title = _beautiful_html.title.string
                            # Remove any non-ascii characters from the text extracted
                            _text_from_title_ascii_only = "".join(
                                _char for _char in _text_from_title if ord(_char) < 128)
                            _text_from_title_ascii_only = _text_from_title_ascii_only.lower()
                        except:
                            # If failed to extract the title text
                            _text_from_title_ascii_only = ""
                        """
                        # ------- FOR Stemmed match ------------
                        # Stem the title text extracted and the n-gram phrase.
                        # If the stemmed n-gram phrase is present in the stemmed title,
                        # that n-gram phrase is a collocation.
                        _n_gram_lower_stemmed = ""
                        for _word in _n_gram_lower.split(' '):
                            _n_gram_lower_stemmed = " " + stemmer.stemWord(_word)
                        _text_from_title_ascii_only_stemmed = ""
                        for _word in _text_from_title_ascii_only.split(' '):
                            _text_from_title_ascii_only_stemmed = " " + stemmer.stemWord(_word)
                        if _verbose:
                            print "\t\tStemmed search title: %s\n\t\tStemmed phrase: %s" % (_text_from_title_ascii_only_stemmed, _n_gram_lower_stemmed)
                        if _n_gram_lower_stemmed in _text_from_title_ascii_only_stemmed:
                            _number_of_valid_titles += 1
                            if _verbose:
                                print "\t\t\tMatch"
                        else:
                            if _verbose:
                                print "\t\t\tNot a match"
                        # ---------------------------------------
                        """
                        # ------------ FOR Exact title match -------------
                        if _verbose:
                            print("\t\tSearch TITLE processed: %s\n\t\tPhrase processed: %s" % (
                                _text_from_title_ascii_only, _n_gram_lower), file=_output_file_verbose)
                        if _n_gram_lower in _text_from_title_ascii_only:
                            _number_of_valid_titles += 1
                            if _verbose:
                                print("\t\t\tMatch", file=_output_file_verbose)
                        else:
                            if _verbose:
                                print("\t\t\tNot a match", file=_output_file_verbose)
                        # ------------------------------------------------

                        # Remove numbers from the Url and see if the n-gram / phrase is
                        # present in it. If yes, then that n-gram is a collocation.
                        _search_url_lower_case_no_spaces_no_punctuation = "".join(
                            [_char for _char in _search_url_lower_case_no_spaces if not _char.isdigit()])
                        if _verbose:
                            print("\t\tSearch URL processed: %s\n\t\tPhrase processed: %s" % (
                                _search_url_lower_case_no_spaces_no_punctuation, _n_gram_lower_no_spaces),
                                  file=_output_file_verbose)
                        if _n_gram_lower_no_spaces in _search_url_lower_case_no_spaces_no_punctuation:
                            _number_of_valid_urls += 1
                            if _verbose:
                                print("\t\t\tMatch", file=_output_file_verbose)
                        else:
                            if _verbose:
                                print("\t\t\tNot a match", file=_output_file_verbose)
                        break

            if _number_of_valid_titles > 0 or _number_of_valid_urls > 0:
                title_url_collocations.append(_n_gram)
                if _verbose:
                    print("\n\tTotal number of valid titles: %d\n\tTotal number of valid urls: %d\n\t- Collocation -\n"
                          % (_number_of_valid_titles, _number_of_valid_urls), file=_output_file_verbose)
            else:
                n_grams_not_collocations.append(_n_gram)
                if _verbose:
                    print("\t- Not a collocation -\n", file=_output_file_verbose)

    # Output text file to save collocations
    _output_file_path_title_url_collocations = str(_input_file_path).replace(
        _input_file_path.split('/')[-1], 'collocations_title_url.txt')
    _output_file_title_url_collocations = open(_output_file_path_title_url_collocations, 'w')
    for _collocation in title_url_collocations:
        _output_file_title_url_collocations.write(_collocation + '\n')
    _output_file_title_url_collocations.close()
    if _verbose:
        print("\nMethod-2: Title-Url - Collocations are written to the file:\n%s" % (
            _output_file_path_title_url_collocations), file=_output_file_verbose)

    # Output text file to save n-grams that are not collocations
    _output_file_path_title_url_not_collocations = str(_input_file_path).replace(
        _input_file_path.split('/')[-1], 'not_collocations_title_url.txt')
    _output_file_title_url_not_collocations = open(_output_file_path_title_url_not_collocations, 'w')
    for _n_gram in n_grams_not_collocations:
        _output_file_title_url_not_collocations.write(_n_gram + '\n')
    _output_file_title_url_not_collocations.close()
    if _verbose:
        print("Method-2: Title-Url - N-grams that are not collocations are written to the file:\n%s" % (
            _output_file_path_title_url_not_collocations), file=_output_file_verbose)

    if _verbose:
        print("\n--------------------------------------------------------------------------",
              file=_output_file_verbose)
        print("\tMethod-2: Title-Url - Extracting collocations - Complete", file=_output_file_verbose)
        print("--------------------------------------------------------------------------\n\n",
              file=_output_file_verbose)

    # Returning n-grams that are collocations and n-grams that are not
    if _verbose:
        print("\t\tMethod-2: Collocation extraction successful")
    return title_url_collocations, n_grams_not_collocations
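# A possible invocation of Collocations_Method_2. The key, file path, and the
# POS-tagged n-gram list below are all hypothetical; the inline _NN/_JJ tags
# match what the POS check and the re.sub() tag-stripping above expect:
bing_api_key = 'YOUR_AZURE_MARKETPLACE_KEY'
n_grams = ['machine_NN learning_NN', 'red_JJ car_NN', 'the_DT of_IN']
collocations, non_collocations = Collocations_Method_2(
    bing_api_key, n_grams, '/path/to/input/corpus.txt',
    _apply_POS_restrictions=True, _verbose=False)
print(collocations)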