Code example #1
 def test_can_silent_fail_web_search(self):
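     '''
     Searching with silent_fail=True should still return the requested number of results.
     '''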
     web_bing = PyMsCognitiveWebSearch(SECRET_KEY,
                                       "Python Software Foundation",
                                       silent_fail=True)
     result_one = web_bing.search(limit=50)
     self.assertTrue(len(result_one) == 50)
     self.assertTrue("python" in result_one[0].name.lower())
Code example #2
def run_query(search_terms):
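    """Query the Bing Web Search API (Russian market) for search_terms and return a list of result dicts."""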
    bing_api_key = read_bing_key()
    if not bing_api_key:
        raise KeyError("Bing Key Not Found")

    # Create our results list which we'll populate.
    results = []
    # Custom params: restrict results to the Russian market.
    params = 'mkt=ru-RU'

    try:
        search_service = PyMsCognitiveWebSearch(bing_api_key, search_terms, custom_params=params)
        json_response = search_service.search(limit=11, format='json')  # results 1 - 11

        # Loop through each result returned, populating our results list.
        # Each result exposes dict_keys(['json', 'deep_links', 'snippet', 'url', 'title', 'name', 'description', 'id', 'display_url'])
        for result in json_response:
            results.append({'title': result.title,
                            'link': result.display_url,
                            'summary': result.description})
    except Exception as e:
        print("Error when querying the Bing API:", e)

    return results
Code example #3
 def test_empty_response(self):
     '''
     This test checks that searching for a non-existent keyword will not error out.
     '''
     non_existing_result = u'youwillmostdeffinitlynotfindthisveryweirdandlongstringopnanysitewhatsoever123'
     web_bing = PyMsCognitiveWebSearch(SECRET_KEY, non_existing_result)
     self.assertTrue([] == web_bing.search())
Code example #4
File: seedcrawler_bing.py Project: fgsect/fexm
    def website_crawl(self, query):
        """
        This function issues the given query to Bing, then crawls the websites
        returned in the result set for links to a file. To be used with
        queries such as "jpg example file" or "inurl:(avi) intitle:index of".
        :return: A generator that stops when no more links can/should be found.
        """
        self.search_service = PyMsCognitiveWebSearch(self.ms_key, query)
        self.search_service.SEARCH_WEB_BASE = "https://api.cognitive.microsoft.com/bing/v7.0/search"
        results = self.search_service.search_all(format="json",
                                                 quota=LIMIT_RESULTS)
        print(len(results))
        for item in results:
            try:
                r = requests.get(item.url, timeout=MAX_TIMEOUT)
            except Exception as e:
                print("Skipping ", item.url, "because of Exception", str(e))
                continue

            parsed_uri = urlparse(r.url)
            subdomain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
            # extract the top level domain:
            rootdomain = '{uri.scheme}://{ext.domain}.{ext.suffix}'.format(
                uri=parsed_uri, ext=tldextract.extract(r.url))
            try:
                if requests.head(
                        subdomain +
                        "/robots.txt").status_code == 404 and requests.head(
                            rootdomain + "/robots.txt").status_code == 404:
                    # No Robots TXT - Skip
                    print("Skipping", subdomain,
                          "because it does not contain a robots.txt")
                    continue
            except Exception as e:
                print("Skipping", subdomain, "because of exception", str(e))
                continue
            print("Now scanning through", r.url)
            html_text = r.text
            if "index of" in query and not "index of" in html_text:
                # TODO: Really really hacky. This if statement should only be
                # TODO: in place if we are issuing the "index of" query.
                # We probably did not reach a file repository
                continue
            soup = BeautifulSoup(html_text, "html.parser")
            link_anchors = soup.find_all("a")
            links = list(map(lambda x: x.get("href"),
                             link_anchors))  # type: [str]
            links = list(
                filter(
                    lambda x: x is not None and x.lower().endswith(
                        self.filetype), links))
            for link in links:
                path = link
                filelink = urljoin(
                    r.url, path
                )  # Join the two urls. Urljoin handles every case: path is relative and path is absolute
                if self.is_valid_file(self.filetype, filelink):
                    print("Yielding", filelink)
                    yield filelink
Code example #5
def find_omim_link(row):
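    """Search Bing for "omim <gene symbol>" and return the URL of the first OMIM result."""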
    geneName = safely_convert_val_to_str(
        row['Gene Symbol'])  #search based on the ingenuity assigned gene name
    searchTerm = 'omim ' + geneName
    search_service = PyMsCognitiveWebSearch(bingWebSearchKey, searchTerm)
    firstFiftyResults = search_service.search(limit=50, format='json')
    url = find_first_correct_result_url(firstFiftyResults, 'omim')
    return url
Code example #6
def azure_search(claim):
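    """Search Bing for the given claim and return the URLs of the top three results (UTF-8 encoded)."""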
    search_term = claim
    search_service = PyMsCognitiveWebSearch('75d1a40af4bf4ba4bdf561ae25b5db5c',
                                            search_term)
    first_three_result = search_service.search(limit=3, format='json')  # results 1-3

    urls = []
    # To get individual result json:
    for i in first_three_result:
        urls.append(i.url.encode('utf-8'))
    return urls
Code example #7
File: BingSearch.py Project: Selora/OSST
    def do_search(self, search_term):
        """ Perform paged searches while we find new FQDN """

        print(search_term)

        custom_params = {'safeSearch': 'Off'}
        if self.args.offset:
            custom_params['offset'] = self.args.offset

        search_service = MsWeb(self.args.bing_api_key,
                               query=search_term,
                               custom_params=custom_params)

        bing_results = search_service.search(limit=self.args.limit,
                                             format='json')
        self.query_count += 1
        results = BingSearch._parse_results(bing_results)

        # Find all the things until we can't find new URLs anymore
        # We have the first result set, do a new search and compare it to the previous one.
        if self.args.find_all and self.query_count < self.args.max_queries:

            result_set = set((x["url"] for x in results))

            # Internal py_ms_cognitive object deals with the offset
            bing_results = search_service.search(limit=self.args.limit,
                                                 format='json')
            self.query_count += 1
            new_results = BingSearch._parse_results(bing_results)

            new_result_set = set((x["url"] for x in new_results))

            # New result contains everything already inside result_set
            while not result_set.issuperset(new_result_set):

                results += new_results
                # Track every URL seen so far so the loop stops once a query
                # returns nothing new.
                result_set |= new_result_set

                if self.query_count < self.args.max_queries:
                    # Internal py_ms_cognitive object deals with the offset
                    bing_results = search_service.search(limit=self.args.limit,
                                                         format='json')
                    self.query_count += 1
                    new_results = BingSearch._parse_results(bing_results)

                    new_result_set = set(x["url"] for x in new_results)

                else:
                    # Do not perform a search, but previous results were added
                    # Stop looping, exit
                    break

        return results
Code example #8
File: py_crawler.py Project: holmes0078/Web-Crawler
    def get_seed(self, search_term):
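        """Seed the crawler with the top 10 Bing results for search_term, giving each an initial pagerank of 0.1."""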
        search_service = PyMsCognitiveWebSearch(config.key, search_term)
        first_ten_results = search_service.search(limit=10,
                                                  format='json')  # results 1-10

        for i in range(10):
            # Giving each of the 10 links a pagerank of 0.1 (i.e. 1/10)
            link = first_ten_results[i].json['displayUrl']
            link = self.normalize(link)
            self.nodesToVisit.put(
                (-0.1,
                 link))  #Negate for priority queue to favor higher pagerank
            self.bfs_queue.put(link)
            self.ranks[link] = 0.1
Code example #9
def run_query(search_terms):
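    """Query the Bing Web Search API for search_terms and return up to 10 results as dicts with title, link and summary."""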
    bing_api_key = read_bing_key()
    if not bing_api_key:
        raise KeyError('Bing Key Not Found')
    results_per_page = 10

    results = []

    try:
        search_service = PyMsCognitiveWebSearch(bing_api_key, search_terms)
        response = search_service.search(limit=results_per_page, format='json')

        for result in response:
            results.append({
                'title': result.json['name'],
                'link': result.json['displayUrl'],
                'summary': result.json['snippet']})
    except Exception as e:
        print("Error when querying the Bing API:", e)

    return results
Code example #10
 def test_search_all(self):
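     '''
     search_all should page through results until the requested quota of 60 is reached.
     '''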
     web_bing = PyMsCognitiveWebSearch(SECRET_KEY,
                                       "Python Software Foundation")
     result_one = web_bing.search_all(quota=60)
     self.assertTrue(len(result_one) == 60)
     self.assertTrue("python" in result_one[0].name.lower())
Code example #11
print("Status: 200 OK")

form = cgi.FieldStorage()
query = form.getvalue('query')

logfile.write(query)
logfile.write("\n")

#query = "moon snails"

API_KEY = "54bcf3e9f555466fa24f529b88311f65"
search_term = query
limit = 10
search_array = search_term.split()
search_service = PyMsCognitiveWebSearch(API_KEY, search_term)
results = search_service.search(limit=limit, format='json')
x = 0
h = 550
w = 550

freq_array = []
url_array = []
name_array = []
snippet_array = []

for x in range(0, limit):
    try:
        dirty_url = results[x].url[20:]
        clean_url = dirty_url[dirty_url.find("http"):]
        clean_url = clean_url[:clean_url.find("&p=DevEx")]
Code example #12
def query(query):
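    """Return (name, display_url) pairs for the top 10 Bing results for the query."""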
    search_service = PyMsCognitiveWebSearch(keys.get('bing_api_key'), query)
    first_ten_results = search_service.search(limit=10, format='json')  # results 1-10
    return [(x.name, x.display_url) for x in first_ten_results]
Code example #13
File: seedcrawler_bing.py Project: fgsect/fexm
class FileCrawler:
    """
    This class crawls the web for seed files of a specific file type and saves them to a given output directory.
    """
    def __init__(self, filetype: str, ms_key: str, out_dir: str):
        """
        
        :param filetype: The filetype to crawl seeds for.
        :param ms_key: The API key for the Bing search API.
        :param out_dir: The directory to save downloaded seed files to.
        """
        self.filetype = filetype.lower()
        self.ms_key = ms_key
        print("KEY", self.ms_key)
        self.out_dir = out_dir
        self.search_service = None

    @staticmethod
    def is_valid_file(filetype, url: str) -> bool:
        """
        Given a URL, check whether its content is a file of the requested format.
        :param filetype: The file format (extension) to check for.
        :param url: The URL to check.
        :return: True if the URL appears to point to a file of the requested type, False otherwise.
        """

        utils.temp_print("Trying", url)
        try:
            response = requests.head(url, timeout=MAX_TIMEOUT, headers=headers)
        except Exception as e:
            return False
        if response.headers.get("content-type") is not None:
            # return False
            if "text/html" in response.headers["content-type"]:
                return False
            if filetype in response.headers["content-type"]:
                return True
        part = url.rpartition(
            "."
        )  # Returns a three tuple, last tuple containing the part after the "."
        if part[2].lower() == filetype:
            return True
        return False

    def website_crawl(self, query):
        """
        This function issues the given query to Bing, then crawls the websites
        returned in the result set for links to a file. To be used with
        queries such as "jpg example file" or "inurl:(avi) intitle:index of".
        :return: A generator that stops when no more links can/should be found.
        """
        self.search_service = PyMsCognitiveWebSearch(self.ms_key, query)
        self.search_service.SEARCH_WEB_BASE = "https://api.cognitive.microsoft.com/bing/v7.0/search"
        results = self.search_service.search_all(format="json",
                                                 quota=LIMIT_RESULTS)
        print(len(results))
        for item in results:
            try:
                r = requests.get(item.url, timeout=MAX_TIMEOUT)
            except Exception as e:
                print("Skipping ", item.url, "because of Exception", str(e))
                continue

            parsed_uri = urlparse(r.url)
            subdomain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
            # extract the top level domain:
            rootdomain = '{uri.scheme}://{ext.domain}.{ext.suffix}'.format(
                uri=parsed_uri, ext=tldextract.extract(r.url))
            try:
                if requests.head(
                        subdomain +
                        "/robots.txt").status_code == 404 and requests.head(
                            rootdomain + "/robots.txt").status_code == 404:
                    # No Robots TXT - Skip
                    print("Skipping", subdomain,
                          "because it does not contain a robots.txt")
                    continue
            except Exception as e:
                print("Skipping", subdomain, "because of exception", str(e))
                continue
            print("Now scanning through", r.url)
            html_text = r.text
            if "index of" in query and not "index of" in html_text:
                # TODO: Really really hacky. This if statement should only be
                # TODO: in place if we are issuing the "index of" query.
                # We probably did not reach a file repository
                continue
            soup = BeautifulSoup(html_text, "html.parser")
            link_anchors = soup.find_all("a")
            links = list(map(lambda x: x.get("href"),
                             link_anchors))  # type: [str]
            links = list(
                filter(
                    lambda x: x is not None and x.lower().endswith(
                        self.filetype), links))
            for link in links:
                path = link
                filelink = urljoin(
                    r.url, path
                )  # Join the two urls. Urljoin handles every case: path is relative and path is absolute
                if self.is_valid_file(self.filetype, filelink):
                    print("Yielding", filelink)
                    yield filelink

    def try_filetype_crawl(self):
        """
        Try to find download links to files of the given file format. 
        :return: A generator that stops when no more links can/should be found.
        """

        # First: Try a simple  "filetype:" query - works for some, but not all filetypes
        query = "filetype:" + self.filetype
        PyMsCognitiveWebSearch.SEARCH_WEB_BASE = "https://api.cognitive.microsoft.com/bing/v7.0/search"
        self.search_service = PyMsCognitiveWebSearch(self.ms_key, query)
        results = self.search_service.search_all(format="json",
                                                 quota=LIMIT_RESULTS + 20)
        for item in results:
            try:
                r = requests.get(
                    item.url, timeout=MAX_TIMEOUT,
                    headers=headers)  # Request the url to resolve the redirect
            except Exception as e:  # requests.exceptions.ConnectTimeout:
                print("Skipping ", item.url, "because of Exception", str(e))
                # Then just skip
                continue
            if self.is_valid_file(self.filetype, r.url):
                print("Yielding ", r.url)
                yield r.url
        # If this fails, maybe the requested filetype is an image? Then perform an image search
        if self.filetype in image_list:  # Perform an image Search
            query = self.filetype + " sample"
            PyMsCognitiveImageSearch.SEARCH_IMAGE_BASE = "https://api.cognitive.microsoft.com/bing/v7.0/images/search"
            self.search_service = PyMsCognitiveImageSearch(self.ms_key, query)

            results = self.search_service._search(
                limit=LIMIT_RESULTS,
                format="json")  # TODO: Class does not implement pagination? :(
            for item in results:
                utils.temp_print("Checking item", item.content_url)
                try:
                    r = requests.get(item.content_url,
                                     timeout=MAX_TIMEOUT,
                                     headers=headers)
                except Exception as e:
                    print("Skipping ", item.url, "because of Exception",
                          str(e))
                    # print("Timeout, checking next item")
                    continue

                print("Url is", r.url)
                if self.is_valid_file(self.filetype, r.url):
                    print("Yielding ", r.url)
                    yield r.url

        for result in self.website_crawl("." + self.filetype +
                                         " example file"):
            print("Yielding", result)
            yield result
        for result in self.website_crawl("." + self.filetype + " sample file"):
            print("Yielding", result)
            yield result

        # Last resort: the "index of" trick. Note that this can yield some undesired file samples, use with caution!
        query = "inurl:(" + self.filetype + ") intitle:\"index of:\""
        self.search_service = PyMsCognitiveWebSearch(self.ms_key, query)
        results = self.search_service.search_all(format="json",
                                                 quota=LIMIT_RESULTS)
        print(len(results))
        for item in results:
            try:
                r = requests.get(item.url, timeout=MAX_TIMEOUT)
            except Exception as e:
                print("Skipping ", item.url, "because of Exception", str(e))
                continue

            parsed_uri = urlparse(r.url)
            domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
            try:
                if requests.head(domain + "/robots.txt").status_code == 404:
                    # No Robots TXT - Skip
                    print("Skipping", domain,
                          "because it does not contain a robots.txt")
                    continue
            except Exception as e:
                print("Skipping", domain, "because of exception", str(e))
                continue
            print("Now scanning through", r.url)
            html_text = r.text
            if not "index of" in html_text:
                # We probably did not reach a file repository
                continue
            soup = BeautifulSoup(html_text, "html.parser")
            link_anchors = soup.find_all("a")
            links = list(map(lambda x: x.get("href"),
                             link_anchors))  # type: [str]
            links = list(
                filter(
                    lambda x: x is not None and x.lower().endswith(
                        self.filetype), links))
            for link in links:
                path = link
                filelink = urljoin(
                    r.url, path
                )  # Join the two urls. Urljoin handles every case: path is relative and path is absolute
                if self.is_valid_file(self.filetype, filelink):
                    print("Yielding", filelink)
                    yield filelink

    def download(self, max_download=1) -> int:
        """
        Tries to download at most max_download sample files of the given file format to the self.out_dir folder.
        :return: The number of downloaded files.
        """
        print("MAX", max_download)

        i = 0
        for rurl in self.try_filetype_crawl():
            print("Yielded", rurl)
            filename = self.filetype + "_" + str(uuid.uuid4())
            if not os.path.exists(self.out_dir):
                os.makedirs(self.out_dir)
            utils.download_seed_to_folder(download_link=rurl,
                                          to_directory=self.out_dir,
                                          filename=filename)
            # with open(self.out_dir + "/" + filename + "." + self.filetype, "wb") as file:
            #    for chunk in r.iter_content(chunk_size=1024):
            #        if chunk:  # filter out keep-alive new chunks
            #            file.write(chunk)
            i += 1
            if i >= max_download:
                return max_download
            # print("Downloaded",rurl)
        return i
Code example #14
File: seedcrawler_bing.py Project: fgsect/fexm
    def try_filetype_crawl(self):
        """
        Try to find download links to files of the given file format. 
        :return: A generator that stops when no more links can/should be found.
        """

        # First: Try a simple  "filetype:" query - works for some, but not all filetypes
        query = "filetype:" + self.filetype
        PyMsCognitiveWebSearch.SEARCH_WEB_BASE = "https://api.cognitive.microsoft.com/bing/v7.0/search"
        self.search_service = PyMsCognitiveWebSearch(self.ms_key, query)
        results = self.search_service.search_all(format="json",
                                                 quota=LIMIT_RESULTS + 20)
        for item in results:
            try:
                r = requests.get(
                    item.url, timeout=MAX_TIMEOUT,
                    headers=headers)  # Request the url to resolve the redirect
            except Exception as e:  # requests.exceptions.ConnectTimeout:
                print("Skipping ", item.url, "because of Exception", str(e))
                # Then just skip
                continue
            if self.is_valid_file(self.filetype, r.url):
                print("Yielding ", r.url)
                yield r.url
        # If this fails, maybe the requested filetype is an image? Then perform an image search
        if self.filetype in image_list:  # Perform an image Search
            query = self.filetype + " sample"
            PyMsCognitiveImageSearch.SEARCH_IMAGE_BASE = "https://api.cognitive.microsoft.com/bing/v7.0/images/search"
            self.search_service = PyMsCognitiveImageSearch(self.ms_key, query)

            results = self.search_service._search(
                limit=LIMIT_RESULTS,
                format="json")  # TODO: Class does not implement pagination? :(
            for item in results:
                utils.temp_print("Checking item", item.content_url)
                try:
                    r = requests.get(item.content_url,
                                     timeout=MAX_TIMEOUT,
                                     headers=headers)
                except Exception as e:
                    print("Skipping ", item.url, "because of Exception",
                          str(e))
                    # print("Timeout, checking next item")
                    continue

                print("Url is", r.url)
                if self.is_valid_file(self.filetype, r.url):
                    print("Yielding ", r.url)
                    yield r.url

        for result in self.website_crawl("." + self.filetype +
                                         " example file"):
            print("Yielding", result)
            yield result
        for result in self.website_crawl("." + self.filetype + " sample file"):
            print("Yielding", result)
            yield result

        # Last resort: the "index of" trick. Note that this can yield some undesired file samples, use with caution!
        query = "inurl:(" + self.filetype + ") intitle:\"index of:\""
        self.search_service = PyMsCognitiveWebSearch(self.ms_key, query)
        results = self.search_service.search_all(format="json",
                                                 quota=LIMIT_RESULTS)
        print(len(results))
        for item in results:
            try:
                r = requests.get(item.url, timeout=MAX_TIMEOUT)
            except Exception as e:
                print("Skipping ", item.url, "because of Exception", str(e))
                continue

            parsed_uri = urlparse(r.url)
            domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
            try:
                if requests.head(domain + "/robots.txt").status_code == 404:
                    # No Robots TXT - Skip
                    print("Skipping", domain,
                          "because it does not contain a robots.txt")
                    continue
            except Exception as e:
                print("Skipping", domain, "because of exception", str(e))
                continue
            print("Now scanning through", r.url)
            html_text = r.text
            if not "index of" in html_text:
                # We probably did not reach a file repository
                continue
            soup = BeautifulSoup(html_text, "html.parser")
            link_anchors = soup.find_all("a")
            links = list(map(lambda x: x.get("href"),
                             link_anchors))  # type: [str]
            links = list(
                filter(
                    lambda x: x is not None and x.lower().endswith(
                        self.filetype), links))
            for link in links:
                path = link
                filelink = urljoin(
                    r.url, path
                )  # Join the two urls. Urljoin handles every case: path is relative and path is absolute
                if self.is_valid_file(self.filetype, filelink):
                    print("Yielding", filelink)
                    yield filelink
Code example #15
from py_ms_cognitive import PyMsCognitiveWebSearch, PyMsCognitiveImageSearch

key = '82af3a845a3640318879cf8d6db7320a'
query = "New York City"
bing_web = PyMsCognitiveWebSearch(key, query)
bing_Image = PyMsCognitiveImageSearch(key, query)
first_ten_result = bing_web.search(limit=10, format='json')  #1-10
first_ten_image = bing_Image.search(limit=10, format='json')  #1-10

print(first_ten_image[0].name)
Code example #16
def query(query):
    search_service = PyMsCognitiveWebSearch(keys.get('bing_api_key'), query)
    first_ten_results = search_service.search(limit=10, format='json')  # results 1-10
    return [(x.name, x.display_url) for x in first_ten_results]
Code example #17
import json
import os
import csv
import requests
import re
from urllib.parse import urlparse
from py_ms_cognitive import PyMsCognitiveWebSearch

search_service = PyMsCognitiveWebSearch('', search_term)
import clearbit

clearbit.key = 'sk_7177eb671d9fe0c5cbbc4d6113e2157a'

with open(os.path.abspath('out_1_sample.json'), 'r') as f:
    items_json = json.loads(f.read())

csv_f = open('out_1_enriched_sample.csv', 'w')
csv_writer = csv.writer(csv_f, dialect='excel')

# "name": "VelocityEHS", "logo": "https://ga0.imgix.net/logo/o/104315-1467798979-3301337?ixlib=rb-1.0.0&ch=Width%2CDPR&auto=format", "desc": "Environment, Health, Safety (EHS) Management Software", "rating": "4.67", "num_reviews": "3", "website_click_out": "/x/velocityehs-application?route=listing_detail&from_listing=104315", "category": "https://www.getapp.com/business-intelligence-analytics-software/data-visualization/?page=2"},
cols = [
    "name", "domain", "getapp_logo", "clearbit_logo", "desc", "rating",
    "num_reviews", "cat", "subcat", "getapp_cat_link"
]
csv_writer.writerow(cols)
for json_in in items_json:
    csv_out = [None] * len(cols)
    # csv_out[cols.index("")] = json_in[""]
    csv_out[cols.index("name")] = json_in["name"]
    csv_out[cols.index("getapp_logo")] = json_in["logo"]
    csv_out[cols.index("desc")] = json_in["desc"]
Code example #18
File: hello.py Project: cristoper/NLP
from py_ms_cognitive import PyMsCognitiveWebSearch

search_term = "Amy Burkhardt"
search_service = PyMsCognitiveWebSearch(
    "a185959d275247529ba4bb965f9f56ce",
    '"Five-star technology solutions" AND "New York City Department of Education" AND assessment'
)
first_result = search_service.search(limit=1, format='json')  # 1-50
print(first_result[0].title)
print(first_result[0].url)

print('hello world')
Code example #19
File: bingTest.py Project: caitlinstanton/seventhson
def search(query):
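    """Run Bing web and image searches for the query and return the content URL of the first image result."""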
    bing_web = PyMsCognitiveWebSearch(key, query)
    bing_Image = PyMsCognitiveImageSearch(key, query)
    first_ten_result = bing_web.search(limit=10, format='json')  #1-10
    first_ten_image = bing_Image.search(limit=10, format='json')  #1-10
    return (first_ten_image[0].content_url)