Example #1
# Imports assumed by this Django REST framework view (not shown in the excerpt):
from django.conf import settings
from rest_framework import status
from rest_framework.response import Response
from bing_search_api import BingSearchAPI
def bing(request):
    """
    Search Bing using a paid API.
    ---
    type:
      results:
        type: string
    parameters:
        - name: query
          description: search query
          required: true
          type: string
          paramType: form
        - name: tx
          description: Transaction Id (proof of payment)
          type: string
          paramType: query
    """
    if 'query' not in request.data:
        return Response({"error": "Must provide a 'query' parameter."},
                        status=status.HTTP_400_BAD_REQUEST)
    api = BingSearchAPI(settings.AZURE_MARKETPLACE_KEY)
    result = api.search_web(request.data['query'], payload={'$format': 'json'})
    if result.ok:
        return Response({"results": result.text})
    else:
        return Response({"error": result.text},
                        status=status.HTTP_400_BAD_REQUEST)
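
A hedged client-side sketch for this view; the route '/api/bing/' is an assumption, not part of the excerpt, and presumes the view is wired up as a POST endpoint:

# Hypothetical client call; '/api/bing/' is an assumed route.
import requests

resp = requests.post('http://localhost:8000/api/bing/',
                     data={'query': 'python', 'tx': 'abc123'})
print(resp.json())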
Example #2
# Assumes `from bing_search_api import BingSearchAPI` and a module-level `my_key`.
def query(query_string):
    bing = BingSearchAPI(my_key)
    params = {
        'ImageFilters': '"Face:Face"',
        '$format': 'json',
        '$top': 10,
        '$skip': 0
    }
    results = bing.search('web', query_string, params).json()  # requests 1.0+

    return [result['Url'] for result in results['d']['results'][0]['Web']]
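
A hedged usage sketch for the function above, assuming my_key holds a valid Azure Marketplace key:

# Hypothetical usage: print the URL of each web hit for a query.
for url in query('Berlin'):
    print(url)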
Example #3
# Assumes `from bing_search_api import BingSearchAPI` and a module-level `BING_KEY`.
def get_actor_url(actor_name):
    bing = BingSearchAPI(BING_KEY)
    params = {
        'ImageFilters': '"Face:Face"',
        '$format': 'json',
        '$top': 1,
        '$skip': 0
    }
    actor_name = actor_name.encode('utf-8')
    data = bing.search('image', actor_name, params).json()
    return data['d']['results'][0]['Image'][0]['Thumbnail']['MediaUrl']
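
The chained lookup above raises KeyError or IndexError when Bing returns no image. A hedged defensive variant (name and behavior are illustrative, not from the original repo):

# Hypothetical defensive variant: returns None instead of raising when no image is found.
def get_actor_url_safe(actor_name):
    bing = BingSearchAPI(BING_KEY)
    params = {'ImageFilters': '"Face:Face"', '$format': 'json', '$top': 1, '$skip': 0}
    data = bing.search('image', actor_name.encode('utf-8'), params).json()
    try:
        return data['d']['results'][0]['Image'][0]['Thumbnail']['MediaUrl']
    except (KeyError, IndexError):
        return None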
Example #4
# Assumes `from bing_search_api import BingSearchAPI`.
def bing_search_total(_verbose, _search_phrase, _bing_api_key):

    # %22 acts as quotes, facilitating a phrase search
    _search_phrase_parsed = "%22" + _search_phrase.strip().replace(' ', '+') + "%22"
    _bing_search = BingSearchAPI(_bing_api_key)
    _bing_parameters = {'$format': 'json', '$top': 2}

    try:
        res = _bing_search.search('web', _search_phrase_parsed,
                                  _bing_parameters).json()
        total = int(res["d"]["results"][0]["WebTotal"])
        if _verbose:
            print('\t' + _search_phrase_parsed.replace('+', ' ').replace(
                '%22', '') + ' ' + str(total))
        return total
    except Exception as e:
        print('\tERROR: in bing.search() - search total\n\t' + str(e))
        return 0
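
A hedged usage sketch; the key is a placeholder:

# Hypothetical usage: compare the phrase-search hit counts of two phrases.
total_a = bing_search_total(False, 'machine learning', 'YOUR_BING_API_KEY')
total_b = bing_search_total(False, 'statistical learning', 'YOUR_BING_API_KEY')
print(total_a, total_b)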
Example #5
from bing_search_api import BingSearchAPI

my_key = "8jhH8TwVCHdDiWxXYgC5KqyEmChYTKW0kkFngbVYnH8"
query_string = "Sony"
bing = BingSearchAPI(my_key)
params = {'$format': 'json', '$top': 10, '$skip': 0}
# .json() already returns a parsed dict, so no extra json.loads() is needed
news = bing.search('news', query_string, params).json()
for item in news['d']['results'][0]['News']:
    print(item)
Example #6
from secret_config import secret_config
from bing_search_api import BingSearchAPI

search_api = BingSearchAPI(secret_config['BING_API_KEY'])


def query(q, sources='web'):
    params = {"$format": "json", "$top": 20}
    response = search_api.search(sources, q.encode('utf-8'), params)
    return response.json()
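
A hedged usage sketch for the wrapper above; the 'news' source name follows the same library usage seen in Example #5:

# Hypothetical usage: query different Bing source types through the wrapper.
web_hits = query('python tutorial')
news_hits = query('python tutorial', sources='news')
print(web_hits['d']['results'][0].keys())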
Example #7
# Assumes imports of random, urllib.request, BingSearchAPI, Translator, and a module-level logger.
def request_image(window, keyword, num_of_try=0, translate=True):
    """
    Queries Bing for images and retries up to 5 times if the randomly selected image could not be accessed
    :param window:
        the calling window; only passed through unchanged on retries
    :param keyword:
        string which specifies the image content
    :param num_of_try:
        internal parameter that increases if the selected image could not be retrieved (e.g. Forbidden Error)
    :param translate:
        Should the keyword be translated to English before the search? (may increase result size)
    :return:
        The image data in bytes, or None if no image could be found
    """

    if keyword is None:
        return None
    if translate:
        ms_key = open('../ms.key').read()
        trans = Translator('__RealTimeStoryIllustrator__', ms_key)
        translatedkw = trans.translate(keyword, lang_from='de', lang_to='en')
        print("IMAGE SERVICE: Getting image for " + str(keyword) +
              ". Searched for the english translation '" + str(translatedkw) +
              "'.")
    else:
        translatedkw = keyword
        print("IMAGE SERVICE: Getting image for " + str(keyword) + ".")

    if num_of_try > 5:  # no images were found
        logger.error(
            "IMAGE SERVICE: Could not find an image after 5 tries for " +
            str(translatedkw) + ".")
        return None

    try:
        params = {
            '$format': 'json',
            '$top': 10,
            'ImageFilters': '\'Size:Small\''
        }
        bing_key = open('../bing.key').read()
        api = BingSearchAPI(bing_key)
        result = api.search_image(
            str(translatedkw + '+AND+(illustration+OR+clipart)'), params)
        items = result.json()['d']['results']
        # randint raises ValueError when no results came back; the except below
        # catches it and triggers a retry
        img_num = random.randint(0, len(items) - 1)
        data = urllib.request.urlopen(items[img_num]['MediaUrl'],
                                      timeout=2).read()
        return data
    except Exception as e:  # have to catch everything since socket exceptions seem to be broken
        print("ERROR in IMAGE SERVICE: Trying again, request was denied " +
              str(e))
        return request_image(window,
                             keyword,
                             num_of_try + 1,
                             translate=translate)
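
A hedged usage sketch; window is only passed through to retries, so None works here:

# Hypothetical usage: fetch one image for a German keyword and save it to disk.
img_bytes = request_image(None, 'Katze', translate=True)
if img_bytes:
    with open('result.jpg', 'wb') as fh:
        fh.write(img_bytes)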
Example #8
# Assumes imports of re, unicodedata, snowballstemmer, httplib2, BeautifulSoup (from bs4), and BingSearchAPI.
def Collocations_Method_2(_bing_api_key, _n_grams_from_input_text_file, _input_file_path, _apply_POS_restrictions,
                          _verbose):
    if _verbose:
        # A file to save the verbose output of the program
        _output_file_verbose = str(_input_file_path).replace(_input_file_path.split('/')[-1], 'verbose.txt')
        _output_file_verbose = open(_output_file_verbose, 'a')
        print("\n--------------------------------------------------------------------------", file=_output_file_verbose)
        print("\tMethod-2: Title-Url - Extracting collocations:", file=_output_file_verbose)
        print("--------------------------------------------------------------------------\n\n",
              file=_output_file_verbose)
        print("\tMethod-2: Title-Url - Extracting collocations ...")

    # A list to store n-gram phrases that are collocations
    title_url_collocations = []
    # A list to store n-gram phrases that are not collocations
    n_grams_not_collocations = []

    # Snowball stemmer (only used by the disabled stemmed-match variant further below)
    stemmer = snowballstemmer.stemmer('english')
    # Call to Bing search API
    _bing_search = BingSearchAPI(_bing_api_key)
    _bing_search_parameters = {'$format': 'json', '$top': 10}  # Top 10 search results
    # Python list with words synonymous to 'Wikipedia', 'dictionary', 'definition'
    _list_of_synonymous_words = ['dictionary', 'lexicon', 'definition', 'meaning', 'unabridged', 'gazetteer',
                                 'spellchecker', 'spellingchecker', 'thesaurus', 'synonymfinder', 'wordfinder',
                                 'wikipedia', 'investorwords', 'investopedia', 'wiktionary']

    for _n_gram in _n_grams_from_input_text_file:
        if _verbose:
            print("\n%s:" % (_n_gram), file=_output_file_verbose)
        if _n_gram in title_url_collocations or _n_gram in n_grams_not_collocations:
            # If this n-gram phrase has already been checked, it is present in one of
            # the lists title_url_collocations or n_grams_not_collocations.
            # Hence, we move on to the next n-gram / phrase
            continue
        else:
            # Before checking if the n-gram is a collocation, we check that at least one
            # POS tag is from the valid POS tag list {Noun, Verb, Adverb, Adjective} when
            # _apply_POS_restrictions is set to True
            if _apply_POS_restrictions:
                valid_POS_tags = ['NN', 'VB', 'RB', 'JJ']
                _valid_POS_tag_counter = 0  # A counter to count the number of valid POS tags in n-gram
                for _pos_tag in valid_POS_tags:
                    if _pos_tag in _n_gram:
                        _valid_POS_tag_counter += 1
                if _valid_POS_tag_counter == 0:
                    # If no valid POS tag is present in the n-gram, it is not a collocation
                    # when POS restrictions are applied
                    n_grams_not_collocations.append(_n_gram)
                    if _verbose:
                        print("\t'%s' does not have valid POS tags\n\tMoving on to the next phrase ..." % (_n_gram),
                              file=_output_file_verbose)
                    continue  # We move on to the next phrase

            # If POS restrictions are not to be applied on the n-gram
            _n_gram_lower = _n_gram.lower() + ' '  # Lower case
            _n_gram_lower = re.sub(r'_.*? ', ' ', _n_gram_lower).rstrip(' ')
            _n_gram_lower_search_phrase = 'define "%s"' % (_n_gram_lower)  # Bing - Phrase search
            try:
                _search_results = _bing_search.search('web', _n_gram_lower_search_phrase,
                                                      _bing_search_parameters).json()
                _search_result_count = len(_search_results["d"]["results"][0]["Web"])
            except Exception as e:
                if _verbose:
                    print("\tERROR: Method-2 - Bing search - Title-Url\n%s" % (str(e)), file=_output_file_verbose)
                    print("\tERROR: Method-2 - Bing search - Title-Url\n%s" % (str(e)))
                _search_result_count = 0
                continue
            # List to save top 10 search Titles
            _search_titles = []
            # List to store top 10 search Urls
            _search_urls = []
            # We iterate through each of the search result and append search titles and Urls to their respective lists
            for x in range(0, _search_result_count):  # range, not Python-2-only xrange
                _url = _search_results["d"]["results"][0]["Web"][x]["Url"]
                _title = _search_results["d"]["results"][0]["Web"][x]["Title"]
                _title = unicodedata.normalize('NFKD', _title).encode('ascii', 'ignore')
                _url = unicodedata.normalize('NFKD', _url).encode('ascii', 'ignore')
                _search_titles.append(_title)
                _search_urls.append(_url)
            # Removing punctuation, special characters and spaces from the keyword
            _n_gram_lower_no_spaces = ''.join(_char for _char in _n_gram_lower if _char.isalnum())
            _number_of_search_results_returned = len(_search_urls)  # No. of search urls = titles
            # Variable to store the count of titles and urls that have valid keywords and match with the search phrase
            _number_of_valid_titles = 0
            _number_of_valid_urls = 0
            for x in range(0, _number_of_search_results_returned):
                _search_title = ""
                _search_title = _search_titles[x]
                _search_title_lower_case = _search_title.lower()
                _search_title_lower_case_no_spaces = "".join(
                    _char for _char in _search_title_lower_case if _char.isalnum())
                _search_url = ""
                _search_url = _search_urls[x]
                _search_url_lower_case = _search_url.lower()
                _search_url_lower_case_no_spaces = "".join(_char for _char in _search_url_lower_case if _char.isalnum())
                if _verbose:
                    print("\t%d:\n\tSearch title: %s\n\tSearch Url: %s" % (x + 1, _search_title, _search_url),
                          file=_output_file_verbose)
                for _synonym in _list_of_synonymous_words:
                    _synonym_match = False
                    # Check if _synonym is present in the title
                    _title_match = re.search(_synonym, _search_title_lower_case_no_spaces)
                    # check if _synonym is present in the url
                    _url_match = re.search(_synonym, _search_url_lower_case_no_spaces)
                    # If a match is found either in title or the url, open the link and check if the
                    # <title> </title> tag from the html has a match with the keyword
                    if _title_match:
                        _synonym_match = True
                    elif _url_match:
                        _synonym_match = True
                    else:
                        continue
                    if _synonym_match:
                        # Reading HTML from url
                        try:
                            # Fetch the page with httplib2 (replaces an older urllib2.urlopen call)
                            http = httplib2.Http(".cache")
                            resp, _url_response = http.request(_search_url, "GET")
                            _html = _url_response
                            _beautiful_html = BeautifulSoup(_html, "lxml")
                        except Exception as e:
                            _beautiful_html = None  # the <title> extraction below then falls back to ""
                            if _verbose:
                                print("\tException - Method-2 - Reading HTML\n%s" % (str(e)), file=_output_file_verbose)
                                print("\tException - Method-2 - Reading HTML\n%s" % (str(e)))
                                print("-----------------\n" + _search_url + "\n---------------\n")
                        # Extracting the text of the <title> tag
                        try:
                            _text_from_title = _beautiful_html.title.string
                            # Remove any non-ascii characters from the text extracted
                            _text_from_title_ascii_only = "".join(
                                _char for _char in _text_from_title if ord(_char) < 128)
                            _text_from_title_ascii_only = _text_from_title_ascii_only.lower()
                        except Exception:
                            # If we failed to extract text from the <title> tag
                            _text_from_title_ascii_only = ""

                        """
						# ------- FOR Stemmed match ------------
						# Stem the title text extracted and the n-gram phrase
						# If the stemmed n-gram phrase is present in the stemmed title, 
						# that n-gram phrase is a collocation
						_n_gram_lower_stemmed = ""
						for _word in _n_gram_lower.split(' '):
							_n_gram_lower_stemmed = " " + stemmer.stemWord(_word)
						_text_from_title_ascii_only_stemmed = ""
						for _word in _text_from_title_ascii_only.split(' '):
							_text_from_title_ascii_only_stemmed = " " + stemmer.stemWord(_word)
						if _verbose:
							print "\t\tStemmed search title: %s\n\t\tStemmed phrase: %s" %(_text_from_title_ascii_only_stemmed, _n_gram_lower_stemmed)
						if _n_gram_lower_stemmed in _text_from_title_ascii_only_stemmed:
							_number_of_valid_titles += 1
							if _verbose:
								print "\t\t\tMatch"
						else:
							if _verbose:
								print "\t\t\tNot a match"
						# ---------------------------------------
						"""
                        # ------------ FOR Exact title match -------------
                        if _verbose:
                            print("\t\tSearch TITLE processed: %s\n\t\tPhrase processed: %s" % (
                                _text_from_title_ascii_only, _n_gram_lower), file=_output_file_verbose)
                        if _n_gram_lower in _text_from_title_ascii_only:
                            _number_of_valid_titles += 1
                            if _verbose:
                                print("\t\t\tMatch", file=_output_file_verbose)
                        else:
                            if _verbose:
                                print("\t\t\tNot a match", file=_output_file_verbose)
                        # ------------------------------------------------
                        # Remove digits from the (already alphanumeric-only) Url and see if the n-gram / phrase is present in it
                        # If yes, then that n-gram is a collocation
                        _search_url_lower_case_no_spaces_no_punctuation = "".join(
                            [_char for _char in _search_url_lower_case_no_spaces if not _char.isdigit()])
                        if _verbose:
                            print("\t\tSearch URL processed: %s\n\t\tPhrase processed: %s" % (
                                _search_url_lower_case_no_spaces_no_punctuation, _n_gram_lower_no_spaces),
                                  file=_output_file_verbose)
                        if _n_gram_lower_no_spaces in _search_url_lower_case_no_spaces_no_punctuation:
                            _number_of_valid_urls += 1
                            if _verbose:
                                print("\t\t\tMatch", file=_output_file_verbose)
                        else:
                            if _verbose:
                                print("\t\t\tNot a match", file=_output_file_verbose)
                        break
                    else:
                        continue
        if _number_of_valid_titles > 0 or _number_of_valid_urls > 0:
            title_url_collocations.append(_n_gram)
            if _verbose:
                print("\n\tTotal number of valid titles: %d\n\tTotal number of valid urls: %d\n\t- Collocation -\n" \
                      % (_number_of_valid_titles, _number_of_valid_urls), file=_output_file_verbose)
        else:
            n_grams_not_collocations.append(_n_gram)
            if _verbose:
                print("\t- Not a collocation -\n", file=_output_file_verbose)

    # Output text file to save collocations
    _output_file_path_title_url_collocations = str(_input_file_path).replace(_input_file_path.split('/')[-1],
                                                                             'collocations_title_url.txt')
    _output_file_title_url_collocations = open(_output_file_path_title_url_collocations, 'w')
    for _collocation in title_url_collocations:
        _output_file_title_url_collocations.write(_collocation + '\n')
    _output_file_title_url_collocations.close()
    if _verbose:
        print("\nMethod-2: Title-Url - Collocations are written to the file:\n%s" % (
            _output_file_path_title_url_collocations), file=_output_file_verbose)

    # Output text file to save n-grams that are not collocations
    _output_file_path_title_url_not_collocations = str(_input_file_path).replace(_input_file_path.split('/')[-1],
                                                                                 'not_collocations_title_url.txt')
    _output_file_title_url_not_collocations = open(_output_file_path_title_url_not_collocations, 'w')
    for _n_gram in n_grams_not_collocations:
        _output_file_title_url_not_collocations.write(_n_gram + '\n')
    _output_file_title_url_not_collocations.close()
    if _verbose:
        print("Method-2: Title-Url - N-grams that are not collocations are written to the file:\n%s" % (
            _output_file_path_title_url_not_collocations), file=_output_file_verbose)

    if _verbose:
        print("\n--------------------------------------------------------------------------", file=_output_file_verbose)
        print("\tMethod-2: Title-Url - Extracting collocations - Complete", file=_output_file_verbose)
        print("--------------------------------------------------------------------------\n\n",
              file=_output_file_verbose)

    # Returning n-grams that are collocations and n-grams that are not
    if _verbose:
        print("\t\tMethod-2: Collocation extraction successful")
    return title_url_collocations, n_grams_not_collocations
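
A hedged invocation sketch; the key and path are placeholders, and the n-grams carry POS suffixes as the function's POS check expects:

# Hypothetical call; results are also written to files next to the input file.
collocations, non_collocations = Collocations_Method_2(
    'YOUR_BING_API_KEY',
    ['machine_NN learning_NN', 'of_IN the_DT'],
    '/path/to/input.txt',
    True,    # _apply_POS_restrictions
    False)   # _verbose
print(collocations)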
Example #9
    result = bing.search('image', query_string, params).json()

    image_url = result['d']['results'][0]['Image'][random_index]['MediaUrl']

    return image_url


def getAmazonURL(item):
    """
    A utility for retrieving the Amazon search results URL for some item.
    """

    return "http://www.amazon.com/s?field-keywords={0}".format(urllib.quote_plus(item))


#
# SETUP
bing_key = 'api_key_here'

angelListAPI = AngelListAPI()
genderGuesserAPI = GenderGuesserAPI()
googleBooksAPI = GoogleBooksAPI()
googleResultsAPI = GoogleResultsAPI()
thesaurusAPI = ThesaurusAPI()
tweetSentimentAPI = TweetSentimentAPI()
bing = BingSearchAPI(bing_key)

APIS = [angelListAPI, genderGuesserAPI, googleBooksAPI, googleResultsAPI, thesaurusAPI, tweetSentimentAPI]
CALCULATOR = Calculator(APIS)
Example #10
import json
from bing_search_api import BingSearchAPI

# Read the top keywords (usually 1-3) and build the search keyword parameter
k = []
with open("topkeywords.txt") as f:
    for line in f:
        k.append(line.strip())
s = ' '.join(k)

n = 1000  # search result limit

my_key = "uAZ6dYNEodLuQxx1W3UKkLegY+Uj8y7e1E3AxPwqtmM"  #API key
query_string = s  #the query string. currently only has keyword parameter.
bing = BingSearchAPI(my_key)  #initialize search request
params = {'$format': 'json'}  #response format as json

# output file
f = open("bingresults.txt", "w")

# Get the first 50 results from Bing
for obj in bing.search('web', query_string, params).json()['d']['results']:
    for lnk in obj['Web']:
        f.write(lnk['Url'])
        f.write('\n')

i = 50

# Get the remaining results
while i < n:
    params = {'$format': 'json', '$skip': i}  # skip the first i results