Example No. 1
from django.conf import settings
from rest_framework import status
from rest_framework.response import Response

from bing_search_api import BingSearchAPI


def bing(request):
    """
    Search bing using a paid API.
    ---
    type:
      translated:
        type: string
    parameters:
        - name: query
          description: search query
          required: true
          type: string
          paramType: form
        - name: tx
          description: Transaction Id  (proof of payment)
          type: string
          paramType: query
    """
    if 'query' not in request.data:
        return Response({"error": "Must provide a 'query' parameter."},
                        status=status.HTTP_400_BAD_REQUEST)
    api = BingSearchAPI(settings.AZURE_MARKETPLACE_KEY)
    result = api.search_web(
        request.data['query'],
        payload={'$format': 'json'}
    )
    if result.ok:
        return Response({"results": result.text})
    else:
        return Response({"error": result.text},
                        status=status.HTTP_400_BAD_REQUEST)
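To exercise this view from a client, here is a minimal hedged sketch with the requests library; the /bing/ route, host, and transaction id are assumptions, not part of the example:

# Minimal client-side sketch; the endpoint URL is hypothetical.
import requests

resp = requests.post(
    "http://localhost:8000/bing/",
    data={"query": "python bing api", "tx": "hypothetical-transaction-id"},
)
print(resp.status_code)
print(resp.json())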
Example No. 3
def download_raw_results(infile, outfile):
    skip = 0
    params = {"ImageFilters": '"Face:Face"', "$format": "json", "$top": 50, "$skip": skip}
    lines = tuple(open(infile, "r"))
    search_results = open(outfile, "w+")
    raw_search_results = []
    websites_list = []
    i = 1
    for line in lines:
        # rotate through the available API keys to spread the request quota
        bing = BingSearchAPI(bing_keys[i])
        i = i + 1
        if i == 7:
            i = 0
        print(line)
        r = bing.search("web", line, params)
        if r.status_code == 200:
            # print r
            raw_search_results.append(r.json())
        # print("Appended an Entry!!!!!!!!")
        # res.append(bing.search("web",line,params))

    for result in raw_search_results:
        for elem in result["d"]["results"][0]["Web"]:
            websites_list.append(elem["DisplayUrl"])
    # Extract the links, and write to file so we don't have
    for link in websites_list:
        search_results.write("%s\n" % link.encode("utf-8"))
    search_results.close()
Example No. 4
def query(query_string):
    bing = BingSearchAPI(my_key)
    params = {'ImageFilters':'"Face:Face"',
              '$format': 'json',
              '$top': 10,
              '$skip': 0}
    results = bing.search('web',query_string,params).json() # requests 1.0+ 

    return [result['Url'] for result in results['d']['results'][0]['Web']]
Example No. 5
def get_actor_url(actor_name):
    bing = BingSearchAPI(BING_KEY)
    params = {
        'ImageFilters': '"Face:Face"',
        '$format': 'json',
        '$top': 1,
        '$skip': 0
    }
    actor_name = actor_name.encode('utf-8')
    data = bing.search('image', actor_name, params).json()
    return data['d']['results'][0]['Image'][0]['Thumbnail']['MediaUrl']
Example No. 7
def search_bing(query, per_page=10, offset=0):
    try:
        my_key = ""  # Bing API key goes here
        bing = BingSearchAPI(my_key)
        params = {'$format': 'json',
                  '$top': per_page,
                  '$skip': offset}
        results = bing.search('image+web', query, params).json()
        results = results['d']['results'][0]['Web']
        return results
    except Exception as e:
        print(e)
        return []
Example No. 8
def crawl_from_bing(search_query):
    my_key = read_bing_key()
    # search_query = "nba jumpshot"
    bing = BingSearchAPI(my_key)
    for i in range(20):
        params = {
              '$format': 'json',
              '$top': 50,
              '$skip': i * 50}
        result_list = bing.search('image', search_query, params).json()
        print(len(result_list['d']['results'][0]['Image']))
        for result in result_list['d']['results'][0]['Image']:
            image_url = result['MediaUrl']
            # round-trip through GBK to drop characters the local filesystem cannot encode
            title_name = result['Title'].encode('gbk', 'ignore').decode('gbk', errors='ignore')
            title_name = title_name.replace('... ', '')
            download_single_image(image_url, search_query, title_name)
Example No. 9
def getRelevantURLForWord(wd, api_key):
    from bing_search_api import BingSearchAPI
    import random

    bing = BingSearchAPI(api_key)
    params = {'$format': 'json', '$skip': '10'}
    result = bing.search_web(wd, payload=params)
    if result.status_code == 200:
        entries = result.json()['d']['results']
        if entries:
            rank = random.randint(0, len(entries)-1)
            url = entries[rank]['Url']
            return url, rank+10
        else:
            return None
    else:
        raise ApiError("Web search api error: {}".format(result.status_code))
Example No. 10
import os
import random
import urlparse  # Python 2 module, matching the urlparse.urlparse call below


class BingImageFetcher:

    NUM_IMGS = 10

    def __init__(self, keypath):
        keyfile = open(keypath, 'r')
        key = keyfile.readline().strip()
        self.bing = BingSearchAPI(key)
        self.params = {
                        #'ImageFilters':'"Face:Face"',
                        '$format': 'json',
                        '$top': self.NUM_IMGS,
                        '$skip': 0}

    TIMEOUT = 10.0
    IMG_FILES = 'img'

    def create_request(self, word):
        # note, throws ConnectionError if failed to fetch
        resp = self.bing.search('image', word, self.params).json()
        image_results = resp['d']['results'][0]['Image']
        if len(image_results) == 0:
            raise Exception('Failed to find any images for query ' + word)
        image_url = image_results[random.randint(0, len(image_results) - 1)]['MediaUrl']
        up = urlparse.urlparse(image_url)
        destfile = os.path.basename(up.path)
        destpath = os.path.join(BingImageFetcher.IMG_FILES, destfile)
        if not os.path.isdir(BingImageFetcher.IMG_FILES):
            os.mkdir(BingImageFetcher.IMG_FILES)
        is_cached = False
        if os.path.isfile(destpath):
            # if we already have that image then just use the cached version
            is_cached = True
        return is_cached, image_url, destpath
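create_request only resolves a cached flag, the image URL, and a destination path; a hedged sketch of the download step a caller might add, using Python 2 urllib to match the urlparse call above (the key path and query are placeholders):

# Hypothetical caller: fetch the image unless a cached copy already exists.
import urllib

fetcher = BingImageFetcher('bing.key')  # path to a one-line key file (assumed)
is_cached, image_url, destpath = fetcher.create_request('sunset')
if not is_cached:
    urllib.urlretrieve(image_url, destpath)  # Python 2 urllib download
print(destpath)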
Example No. 12
 def get_text(self):
     if self.conf_lyrics_done:
         print 'Lyrics are already done'
         return self.conf_lyrics_done
     bing = BingSearchAPI()
     tags = self.conf_tags
     search = '%s lyrics %s' % (tags['title'], tags['performer'])
     print 'Searching for lyrics. Search string: %s' % search
     lyrics_search = bing.search('web', search.encode('utf-8'), {'$format': 'json'})
     #print 'Lyrics search result: %s' % pformat(lyrics_search)
     for result in lyrics_search.get('d', {}).get('results', [{}])[0].get('Web', []):
         url = result['Url']
         print 'lyrics in %s?' % url
         for match, (good_attr, bad_part) in lyrics_matches.items():
             if match in url:
                 # Good! We have a known site with lyrics - let's extract them.
                 print 'yes, lyrics are probably here'
                 browser = Browser()
                 browser.set_handle_robots(None)
                 browser.open(url)
                 text = browser.response().read()
                 soup = BeautifulSoup(text, convertEntities=BeautifulSoup.HTML_ENTITIES)
                 lyrics_el = soup.find(attrs=good_attr)
                 if not lyrics_el:
                     #print 'Not found lyrics in %s' % text
                     continue
                 #print 'full text: %s' % text
                 #print 'Found something like this: %s' % lyrics_el
                 parts = list(self.extract_text_parts(lyrics_el.contents, bad_part))
                 lyrics = '\n'.join(parts)
                 #print 'Found lyrics: \n%s' % lyrics
                 print 'Found lyrics: %s...' % lyrics[:150]
                 self.conf_lyrics = lyrics
                 self.conf_lyrics_done = True
                 return self.conf_lyrics_done
         print 'Unsupported lyrics source: %s' % url
     if not self.conf_lyrics_done:
         print 'ERROR: lyrics not found! %s' % self.conf_tags['title']
     return self.conf_lyrics_done
Example No. 13
def bing_search_total(_verbose, _search_phrase, _bing_api_key):

    # %22 acts as quotes, turning the query into a phrase search
    _search_phrase_parsed = "%22" + _search_phrase.strip(' ').replace(' ', '+') + "%22"
    _bing_search = BingSearchAPI(_bing_api_key)
    _bing_parameters = {'$format': 'json', '$top': 2}

    try:
        res = _bing_search.search('web', _search_phrase_parsed,
                                  _bing_parameters).json()
        total = int(res["d"]["results"][0]["WebTotal"])
        if _verbose:
            print('\t' + _search_phrase_parsed.replace('+', ' ').replace(
                '%22', '') + ' ' + str(total))
        return total
    except Exception as e:
        if _verbose:
            print('\tERROR: in bing.search() - search total\n\t' + str(e))
        return 0
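A hedged usage sketch for the helper above, comparing the WebTotal counts of two phrasings; the key and phrases are placeholders:

# Hypothetical comparison of phrase frequencies via WebTotal.
key = "your-bing-api-key"  # placeholder
for phrase in ("machine learning", "learning machine"):
    total = bing_search_total(True, phrase, key)
    print("%s -> %d results" % (phrase, total))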
Example No. 14
    result = bing.search('image', query_string, params).json()

    image_url = result['d']['results'][0]['Image'][random_index]['MediaUrl']

    return image_url


def getAmazonURL(item):
    """
    A utility for retrieving the Amazon search results URL for some item.
    """

    return "http://www.amazon.com/s?field-keywords={0}".format(urllib.quote_plus(item))


#
# SETUP
bing_key = 'api_key_here'

angelListAPI = AngelListAPI()
genderGuesserAPI = GenderGuesserAPI()
googleBooksAPI = GoogleBooksAPI()
googleResultsAPI = GoogleResultsAPI()
thesaurusAPI = ThesaurusAPI()
tweetSentimentAPI = TweetSentimentAPI()
bing = BingSearchAPI(bing_key)

APIS = [angelListAPI, genderGuesserAPI, googleBooksAPI, googleResultsAPI, thesaurusAPI, tweetSentimentAPI]
CALCULATOR = Calculator(APIS)
Example No. 15
 def get_pics(self):
     if self.conf_pics_done:
         print 'Pics are already done'
         return self.conf_pics_done
     imgdir = self.imgdir
     if len(glob.glob1(imgdir, "*.png")) > REQUIRED_IMAGE_COUNT:
         self.conf_pics_done = True
         return self.conf_pics_done
     bing = BingSearchAPI()
     tags = self.conf_tags
     search = '%s %s' % (tags['title'], tags['performer'])
     print 'Searching for images. Search string: %s' % search
     img_search = bing.search('image', search.encode('utf-8'), {'$format': 'json'})
     print 'Images: %s' % pformat(img_search)
     registry = processed_image_urls.setdefault(imgdir, set())
     if not os.path.exists(imgdir):
         os.makedirs(imgdir)
     for result in img_search.get('d', {}).get('results', [{}])[0].get('Image', []):
         if result['MediaUrl'] not in registry:
             browser = Browser()
             browser.set_handle_robots(None)
             registry.add(result['MediaUrl'])
             log.debug('%s images in %s' % (len(glob.glob1(imgdir, "*.png")), imgdir))
             try:
                 #log.debug('Opening %s' % result['SourceUrl'])
                 browser.open(result['SourceUrl'])
                 #log.debug('Opening %s' % result['MediaUrl'])
                 img = Image.open(browser.open(result['MediaUrl']))
                 if img.size[0] >= DEFAULT_VIDEO_RESOLUTION[0] and img.size[1] >= DEFAULT_VIDEO_RESOLUTION[1]:
                     print 'Found image: %s' % result['MediaUrl']
                     img.save(os.path.join(imgdir, 'image%03d.png'
                         % (len(glob.glob1(imgdir, "*.png")) + 1)))
                     self.conf_pics_done = True
                     if len(glob.glob1(imgdir, "*.png")) > REQUIRED_IMAGE_COUNT:
                         self.conf_pics_done = True
                         break
             except:
                 print_exc()
     if len(glob.glob1(imgdir, "*.png")) < REQUIRED_IMAGE_COUNT:
         search = tags['performer']
         print 'Searching for images. Search string: %s' % search
         img_search = bing.search('image', search.encode('utf-8'), {'$format': 'json'})
         for result in img_search.get('d', {}).get('results', [{}])[0].get('Image', []):
             if result['MediaUrl'] not in registry:
                 browser = Browser()
                 browser.set_handle_robots(None)
                 registry.add(result['MediaUrl'])
                 log.debug('%s images in %s' % (len(glob.glob1(imgdir, "*.png")), imgdir))
                 try:
                     #log.debug('Opening %s' % result['SourceUrl'])
                     browser.open(result['SourceUrl'])
                     #log.debug('Opening %s' % result['MediaUrl'])
                     img = Image.open(browser.open(result['MediaUrl']))
                     if img.size[0] >= DEFAULT_VIDEO_RESOLUTION[0] and img.size[1] >= DEFAULT_VIDEO_RESOLUTION[1]:
                         print 'Found image: %s' % result['MediaUrl']
                         img.save(os.path.join(imgdir, 'image%03d.png'
                             % (len(glob.glob1(imgdir, "*.png")) + 1)))
                         if len(glob.glob1(imgdir, "*.png")) > REQUIRED_IMAGE_COUNT:
                             self.conf_pics_done = True
                             break
                 except:
                     print_exc()
     return self.conf_pics_done
Example No. 16
from bing_search_api import BingSearchAPI
import json

my_key = "8jhH8TwVCHdDiWxXYgC5KqyEmChYTKW0kkFngbVYnH8"
query_string = "Sony"
bing = BingSearchAPI(my_key)
params = {'$format': 'json', '$top': 10, '$skip': 0}
news = bing.search('news', query_string, params).json()
for i in range(10):
    print(news['d']['results'][0]['News'][i])
#news = json.loads(bing.search('news', query_string, params).json())
Example No. 17
def request_image(window, keyword, num_of_try=0, translate=True):
    """
    Queries Bing for images and retries up to 5 times if the randomly selected image could not be accessed
    :param keyword:
        string which specifies the image content
    :param num_of_try:
        internal parameter that increases if the selected image could not be retrieved (e.g. Forbidden Error)
    :param translate:
        Should the keyword be translated to english before the search? (may increase result size)
    :return:
        The image data in bytes
    """

    if keyword is None:
        return None
    if translate:
        ms_key = open('../ms.key').read()
        trans = Translator('__RealTimeStoryIllustrator__', ms_key)
        translatedkw = trans.translate(keyword, lang_from='de', lang_to='en')
        print("IMAGE SERVICE: Getting image for " + str(keyword) +
              ". Searched for the english translation '" + str(translatedkw) +
              "'.")
    else:
        translatedkw = keyword
        print("IMAGE SERVICE: Getting image for " + str(keyword) + ".")

    if num_of_try > 5:  # no images were found
        logger.error(
            "IMAGE SERVICE: Could not find an image after 5 tries for " +
            str(translatedkw) + ".")
        return None

    # OLD CODE FOR SEARCHING BEGIN

    # term = urllib.parse.quote_plus(translatedkw)

    # sites = [line.rstrip() for line in
    #         open(os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'data', 'sites.txt'),
    #              encoding="utf-8")]
    # excludedsites = ""
    # for site in sites:
    #    excludedsites = excludedsites + "-site:" + urllib.parse.quote_plus(site) + '%20'

    # img_type = '%7Eillustration+AND+clipart'
    # opener = urllib.request.build_opener()
    # opener.addheaders = [('User-agent', 'Mozilla/5.0')]

    # url = ('http://ajax.googleapis.com/ajax/services/search/images?' +
    #      'v=1.0&q=' + term + '%20' + img_type + '%20' + excludedsites + '%20&userip=91.141.0.105' +
    #       '&rsz=8&imgsz=medium&safe=active' + '&tbs=ic:color')

    # OLD CODE FOR SEARCHING END

    try:
        params = {
            '$format': 'json',
            '$top': 10,
            'ImageFilters': '\'Size:Small\''
        }
        bing_key = open('../bing.key').read()
        api = BingSearchAPI(bing_key)
        result = api.search_image(
            str(translatedkw + '+AND+(illustration+OR+clipart)'), params)
        amount = len(result.json()['d']['results'])
        # print(json.dumps(result.json(), sort_keys=True, indent=2))

        # print(result.json())
        # print(result.json()['d']['results'][0]['MediaUrl'])
        img_num = random.randint(0, amount - 1)
        data = urllib.request.urlopen(
            result.json()['d']['results'][img_num]['MediaUrl'],
            timeout=2).read()
        return data
    except Exception as e:  # have to catch everything since socket exceptions seem to be broken
        print("ERROR in IMAGE SERVICE: Trying again, request was denied " +
              str(e))
        return request_image(window,
                             keyword,
                             num_of_try + 1,
                             translate=translate)
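A hedged caller sketch for request_image; window is passed as None since the function never touches it, and the keyword and output filename are placeholders:

# Hypothetical caller: fetch an illustration and write the raw bytes to disk.
data = request_image(None, "Haus", translate=True)  # German keyword, per lang_from='de'
if data is not None:
    with open("illustration.jpg", "wb") as out:  # extension is a guess; Bing may serve PNG/GIF
        out.write(data)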
Example No. 18
import json
from bing_search_api import BingSearchAPI

#read the top keywords(usually 1-3) and generate the search keyword parameter
k = []
with open("topkeywords.txt") as f:
    for line in f:
        k.append(line.strip())
s = ' '.join(k)

n = 1000  #search result limit

my_key = "uAZ6dYNEodLuQxx1W3UKkLegY+Uj8y7e1E3AxPwqtmM"  #API key
query_string = s  #the query string. currently only has keyword parameter.
bing = BingSearchAPI(my_key)  #initialize search request
params = {'$format': 'json'}  #response format as json

#output file
f = open("bingresults.txt", "w")

#get first 50 results from Bing
for obj in bing.search('web', query_string, params).json()['d']['results']:
    for lnk in obj['Web']:
        f.write(lnk['Url'])
        f.write('\n')

i = 50

#get the rest results
while i < n:
    params = {'$format': 'json', '$skip': i}  #skip first i results
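The excerpt cuts off inside the paging loop; a hedged sketch of how the remaining pages might be fetched with $skip, reusing the same wrapper call as the first batch above:

# Hypothetical continuation: page through the rest of the results, 50 at a time.
while i < n:
    params = {'$format': 'json', '$skip': i}  # skip the first i results
    for obj in bing.search('web', query_string, params).json()['d']['results']:
        for lnk in obj['Web']:
            f.write(lnk['Url'])
            f.write('\n')
    i += 50
f.close()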
Example No. 20
from secret_config import secret_config
from bing_search_api import BingSearchAPI

search_api = BingSearchAPI(secret_config['BING_API_KEY'])


def query(q, sources='web'):
    params = {"$format": "json", "$top": 20}
    response = search_api.search(sources, q.encode('utf-8'), params)
    return response.json()
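A short hedged usage example; the result shape follows the Azure Datamarket schema seen throughout these examples:

# Hypothetical usage: print the first few web hits for a query.
hits = query(u"python unicode test")['d']['results'][0]['Web']
for hit in hits[:3]:
    print(hit['Url'])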
Example No. 22
import logging
from collections import Counter
from bing_search_api import BingSearchAPI
my_key = "urTuvjb7b6dFiCmC3Jj6ZAxuX8DqyXwQRDccSEQJVbc"
bing = BingSearchAPI(my_key)


def main():
    print(search("Hello", 3))


# category='Web' or category='News'
def search(query_string, num_results, category='Web'):
    query_string = query_string.replace("#", "%23")
    params = {
        'ImageFilters': '"Face:Face"',
        '$format': 'json',
        '$top': num_results,
        '$skip': 0
    }
    results = bing.search(category, query_string,
                          params).json()  # requests 1.0+

    return [result['Url'] for result in results['d']['results'][0][category]]


def group_search(query_list,
                 num_results,
                 category='Web',
                 on_wiki=False,
                 on_ubd=False,
Example No. 23
import sys
import random
import urllib									#import urllib to download images from their URLs	
import time 									#to implement delays in making requests to Toronto Deep Learning
from textblob import TextBlob, Word				#import the class TextBlob from textblob, Word to get definitions
from bing_search_api import BingSearchAPI 		#import the class BingSearchAPI from https://github.com/xthepoet/pyBingSearchAPI
from cvserver import response_for_image, captions, nearest_neighbour	#import the function response_for_image to fetch image descriptions, caption to extract them from HTML using beautiful soup

#MY FUNCTIONS
def fix_punctuation(sentence):			#pass in a string to fix the punctuation
	return sentence.replace(' .', '').replace(' , ', ', ')
	
#INFO FOR BING API
my_key = "insert_API_key"	#replace with Bing API Key
query_string = sys.argv[1]	#get query string as input from command line using sys.argv, for multiple words use query between " "
bing = BingSearchAPI(my_key)

#parameters for image searching -- more documentation on params and image filters here http://goo.gl/xG0v0O
params = {'ImageFilters':'"Style:Photo"',
          '$format': 'json',	#specifies format of data response
      	  '$top': 400,			#specifies number of results to return, default is 50
          '$skip': 0}			#specifies starting point offset for the results
          
#bing.search()requires sources first (images, web, video, etc.), then query string, then rest of params (above)
#full schema documentation for bing API is here http://goo.gl/xG0v0O
results = bing.search('image',query_string,params).json() 	#requests 1.0+ 

image_list = results['d']['results'][0]['Image']	#this gets us to the list of all the images

#create a new list of all the image source URLs using a list comprehension
image_urls = [image['MediaUrl'] for image in image_list if len(image['MediaUrl']) > 0]
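The excerpt stops after collecting image_urls; a minimal, hypothetical sketch of the download step that the urllib and time imports at the top anticipate (Python 2; the filename scheme is a guess):

#hypothetical continuation: download each image with a polite delay between requests
for n, url in enumerate(image_urls):
    try:
        urllib.urlretrieve(url, 'image_%03d.jpg' % n)	#filename scheme is a guess
    except IOError:
        continue	#skip unreachable images
    time.sleep(1)	#throttle requests, using the time import above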
Example No. 24
from bing_search_api import BingSearchAPI
import json

my_key = "dWls875YJyXwh7dmX3LdIaETO9IDjfkdG4g8533M9zs"
query_string = raw_input("What is your query? ")
bing = BingSearchAPI(my_key)
params = {'$format': 'json',
          '$top': 10,
          '$skip': 0}
searchJSON = bing.search('news', query_string, params).json()
print searchJSON['d']['results'][0]['News'][1]

Example No. 25
def Collocations_Method_2(_bing_api_key, _n_grams_from_input_text_file, _input_file_path, _apply_POS_restrictions,
                          _verbose):
    if _verbose:
        # A file to save the verbose output of the program
        _output_file_verbose = str(_input_file_path).replace(_input_file_path.split('/')[-1], 'verbose.txt')
        _output_file_verbose = open(_output_file_verbose, 'a')
        print("\n--------------------------------------------------------------------------", file=_output_file_verbose)
        print("\tMethod-2: Title-Url - Extracting collocations:", file=_output_file_verbose)
        print("--------------------------------------------------------------------------\n\n",
              file=_output_file_verbose)
        print("\tMethod-2: Title-Url - Extracting collocations ...")

    # A list to store n-gram phrases that are collocations
    title_url_collocations = []
    # A list to store n-gram phrases that are not collocations
    n_grams_not_collocations = []

    # Snowball stemmer is used to stem words
    stemmer = snowballstemmer.stemmer('english')
    # Call to Bing search API
    _bing_search = BingSearchAPI(_bing_api_key)
    _bing_search_parameters = {'$format': 'json', '$top': 10}  # Top 10 search results
    # Python list with words synonymous to 'Wikipedia', 'dictionary', 'definition'
    _list_of_synonymous_words = ['dictionary', 'lexicon', 'definition', 'meaning', 'unabridged', 'gazetteer',
                                 'spellchecker', 'spellingchecker', 'thesaurus', 'synonymfinder', 'wordfinder',
                                 'wikipedia', 'investorwords', 'investopedia', 'wiktionary']

    for _n_gram in _n_grams_from_input_text_file:
        if _verbose:
            print("\n%s:" % (_n_gram), file=_output_file_verbose)
        if _n_gram in title_url_collocations or _n_gram in n_grams_not_collocations:
            # If this n-gram phrase has already been checked for being a collocation,
            # it will be present in one of the lists, title_url_collocations OR n_grams_not_collocations
            # Hence, we move on to the next n-gram / phrase
            continue
        else:
            # Before checking if the n-gram is a collocation, we check that at least one
            # POS tag is from the valid POS tag list: {Noun, Verb, Adverb, Adjective},
            # if _apply_POS_restrictions is set to True
            if _apply_POS_restrictions:
                valid_POS_tags = ['NN', 'VB', 'RB', 'JJ']
                _valid_POS_tag_counter = 0  # A counter to count the number of valid POS tags in n-gram
                for _pos_tag in valid_POS_tags:
                    if _pos_tag in _n_gram:
                        _valid_POS_tag_counter += 1
                if _valid_POS_tag_counter == 0:
                    # If no valid POS tag is present in the n-gram, it is not a collocation
                    # when POS restrictions are applied
                    n_grams_not_collocations.append(_n_gram)
                    if _verbose:
                        print("\t'%s' does not have valid POS tags\n\tMoving on to the next phrase ..." % (_n_gram),
                              file=_output_file_verbose)
                    continue  # We move on to the next phrase

            # If POS restrictions are not to be applied on the n-gram
            _n_gram_lower = _n_gram.lower() + ' '  # Lower case
            _n_gram_lower = re.sub(r'_.*? ', ' ', _n_gram_lower).rstrip(' ')
            _n_gram_lower_search_phrase = 'define "%s"' % (_n_gram_lower)  # Bing - Phrase search
            try:
                _search_results = _bing_search.search('web', _n_gram_lower_search_phrase,
                                                      _bing_search_parameters).json()
                _search_result_count = len(_search_results["d"]["results"][0]["Web"])
            except Exception as e:
                if _verbose:
                    print("\tERROR: Method-2 - Bing search - Title-Url\n%s" % (str(e)), file=_output_file_verbose)
                    print("\tERROR: Method-2 - Bing search - Title-Url\n%s" % (str(e)))
                _search_result_count = 0
                continue
            # List to save top 10 search Titles
            _search_titles = []
            # List to store top 10 search Urls
            _search_urls = []
            # We iterate through each search result and append its title and Url to the respective lists
            for x in xrange(0, _search_result_count):
                _url = _search_results["d"]["results"][0]["Web"][x]["Url"]
                _title = _search_results["d"]["results"][0]["Web"][x]["Title"]
                _title = unicodedata.normalize('NFKD', _title).encode('ascii', 'ignore')
                _url = unicodedata.normalize('NFKD', _url).encode('ascii', 'ignore')
                _search_titles.append(_title)
                _search_urls.append(_url)
            # removing punctuation, special characters and spaces from the keyword
            _n_gram_lower_no_spaces = ''.join(_char for _char in _n_gram_lower if _char.isalnum())
            _n_gram_lower_no_spaces = _n_gram_lower_no_spaces.replace(' ', '')
            _number_of_search_results_returned = len(_search_urls)  # No. of search urls = titles
            # Variable to store the count of titles and urls that have valid keywords and match with the search phrase
            _number_of_valid_titles = 0
            _number_of_valid_urls = 0
            for x in xrange(0, _number_of_search_results_returned):
                _search_title = ""
                _search_title = _search_titles[x]
                _search_title_lower_case = _search_title.lower()
                _search_title_lower_case_no_spaces = "".join(
                    _char for _char in _search_title_lower_case if _char.isalnum())
                _search_url = ""
                _search_url = _search_urls[x]
                _search_url_lower_case = _search_url.lower()
                _search_url_lower_case_no_spaces = "".join(_char for _char in _search_url_lower_case if _char.isalnum())
                if _verbose:
                    print("\t%d:\n\tSearch title: %s\n\tSearch Url: %s" % (x + 1, _search_title, _search_url),
                          file=_output_file_verbose)
                for _synonym in _list_of_synonymous_words:
                    _synonym_match = False
                    # Check if _synonym is present in the title
                    _title_match = re.search(_synonym, _search_title_lower_case_no_spaces)
                    # check if _synonym is present in the url
                    _url_match = re.search(_synonym, _search_url_lower_case_no_spaces)
                    # If a match is found either in title or the url, open the link and check if the
                    # <title> </title> tag from the html has a match with the keyword
                    if _title_match:
                        _synonym_match = True
                    elif _url_match:
                        _synonym_match = True
                    else:
                        continue
                    if _synonym_match:
                        # Reading HTML from url
                        try:
                            # replace: _url_response = urllib2.urlopen(_search_url)
                            # _url_response = urllib2.urlopen(_search_url)
                            http = httplib2.Http(".cache")
                            resp, _url_response = http.request(_search_url, "GET")
                            _html = _url_response
                            # print(_html)
                            _beautiful_html = BeautifulSoup(_html, "lxml")
                        except Exception as e:
                            if _verbose:
                                print("\tException - Method-2 - Reading HTML\n%s" % (str(e)), file=_output_file_verbose)
                                print("\tException - Method-2 - Reading HTML\n%s" % (str(e)))
                                # print(e.fp.read())
                                print("-----------------\n" + _search_url + "\n---------------\n")
                        # Extracting text in between <h1> tag
                        try:
                            # Commented lines are to be excluded; this part is yet to be coded

                            # _text_from_title = _beautiful_html.find('h1').text
                            # print(_beautiful_html.find('h1').text + "\n")
                            # print("sss" + _beautiful_html.title.string + '\n')
                            _text_from_title = _beautiful_html.title.string
                            # Remove any non-ascii characters from the text extracted
                            _text_from_title_ascii_only = "".join(
                                _char for _char in _text_from_title if ord(_char) < 128)
                            _text_from_title_ascii_only = _text_from_title_ascii_only.lower()
                        except:
                            # If failed to extract text from <h1>
                            _text_from_title_ascii_only = ""

                        """
						# ------- FOR Stemmed match ------------
						# Stem the title text extracted and the n-gram phrase
						# If the stemmed n-gram phrase is present in the stemmed title, 
						# that n-gram phrase is a collocation
						_n_gram_lower_stemmed = ""
						for _word in _n_gram_lower.split(' '):
							_n_gram_lower_stemmed = " " + stemmer.stemWord(_word)
						_text_from_title_ascii_only_stemmed = ""
						for _word in _text_from_title_ascii_only.split(' '):
							_text_from_title_ascii_only_stemmed = " " + stemmer.stemWord(_word)
						if _verbose:
							print "\t\tStemmed search title: %s\n\t\tStemmed phrase: %s" %(_text_from_title_ascii_only_stemmed, _n_gram_lower_stemmed)
						if _n_gram_lower_stemmed in _text_from_title_ascii_only_stemmed:
							_number_of_valid_titles += 1
							if _verbose:
								print "\t\t\tMatch"
						else:
							if _verbose:
								print "\t\t\tNot a match"
						# ---------------------------------------
						"""
                        # ------------ FOR Exact title match -------------
                        if _verbose:
                            print("\t\tSearch TITLE processed: %s\n\t\tPhrase processed: %s" % (
                                _text_from_title_ascii_only, _n_gram_lower), file=_output_file_verbose)
                        if _n_gram_lower in _text_from_title_ascii_only:
                            _number_of_valid_titles += 1
                            if _verbose:
                                print("\t\t\tMatch", file=_output_file_verbose)
                        else:
                            if _verbose:
                                print("\t\t\tNot a match", file=_output_file_verbose)
                        # ------------------------------------------------
                        # Remove digits from the Url (punctuation is already stripped) and see if the n-gram / phrase is present in it
                        # If yes, then that n-gram is a collocation
                        _search_url_lower_case_no_spaces_no_punctuation = "".join(
                            [_char for _char in _search_url_lower_case_no_spaces if not _char.isdigit()])
                        if _verbose:
                            print("\t\tSearch URL processed: %s\n\t\tPhrase processed: %s" % (
                                _search_url_lower_case_no_spaces_no_punctuation, _n_gram_lower_no_spaces),
                                  file=_output_file_verbose)
                        if _n_gram_lower_no_spaces in _search_url_lower_case_no_spaces_no_punctuation:
                            _number_of_valid_urls += 1
                            if _verbose:
                                print("\t\t\tMatch", file=_output_file_verbose)
                        else:
                            if _verbose:
                                print("\t\t\tNot a match", file=_output_file_verbose)
                        break
                    else:
                        continue
        if _number_of_valid_titles > 0 or _number_of_valid_urls > 0:
            title_url_collocations.append(_n_gram)
            if _verbose:
                print("\n\tTotal number of valid titles: %d\n\tTotal number of valid urls: %d\n\t- Collocation -\n" \
                      % (_number_of_valid_titles, _number_of_valid_urls), file=_output_file_verbose)
        else:
            n_grams_not_collocations.append(_n_gram)
            if _verbose:
                print("\t- Not a collocation -\n", file=_output_file_verbose)

    # Output text file to save collocations
    _output_file_path_title_url_collocations = str(_input_file_path).replace(_input_file_path.split('/')[-1],
                                                                             'collocations_title_url.txt')
    _output_file_title_url_collocations = open(_output_file_path_title_url_collocations, 'w')
    for _collocation in title_url_collocations:
        _output_file_title_url_collocations.write(_collocation + '\n')
    _output_file_title_url_collocations.close()
    if _verbose:
        print("\nMethod-2: Title-Url - Collocations are written to the file:\n%s" % (
            _output_file_path_title_url_collocations), file=_output_file_verbose)

    # Output text file to save n-grams that are not collocations
    _output_file_path_title_url_not_collocations = str(_input_file_path).replace(_input_file_path.split('/')[-1],
                                                                                 'not_collocations_title_url.txt')
    _output_file_title_url_not_collocations = open(_output_file_path_title_url_not_collocations, 'w')
    for _n_gram in n_grams_not_collocations:
        _output_file_title_url_not_collocations.write(_n_gram + '\n')
    _output_file_title_url_not_collocations.close()
    if _verbose:
        print("Method-2: Title-Url - N-grams that are not collocations are written to the file:\n%s" % (
            _output_file_path_title_url_not_collocations), file=_output_file_verbose)

    if _verbose:
        print("\n--------------------------------------------------------------------------", file=_output_file_verbose)
        print("\tMethod-2: Title-Url - Extracting collocations - Complete", file=_output_file_verbose)
        print("--------------------------------------------------------------------------\n\n",
              file=_output_file_verbose)

    # Returning n-grams that are collocations and n-grams that are not
    if _verbose:
        print("\t\tMethod-2: Collocation extraction successful")
    return title_url_collocations, n_grams_not_collocations
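A hedged invocation sketch for the routine above; the key, tagged n-grams, and input path are placeholders (the `_NN`-style suffixes match the `re.sub(r'_.*? ', ' ', ...)` cleanup and the POS check in the function):

# Hypothetical invocation; output files are written next to _input_file_path.
collocations, rejected = Collocations_Method_2(
    _bing_api_key="your-bing-api-key",  # placeholder
    _n_grams_from_input_text_file=["machine_NN learning_NN", "red_JJ car_NN"],
    _input_file_path="data/input.txt",  # placeholder
    _apply_POS_restrictions=True,
    _verbose=True,
)
print(collocations)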