Esempio n. 1
0
class VirusTotalApi(object):
    """Client for the VirusTotal v2 HTTP API with optional response caching.

    Requests are sent through a MultiRequest instance. When a cache file name
    is given, responses are looked up in and written to an ApiCache.
    """

    BASE_DOMAIN = u'https://www.virustotal.com/vtapi/v2/'

    def __init__(self, api_key, resources_per_req=25, cache_file_name=None, update_cache=True, req_timeout=None):
        """Establishes basic HTTP params and loads a cache.

        Args:
            api_key: VirusTotal API key
            resources_per_req: Maximum number of resources (hashes, URLs)
                to be sent in a single request
            cache_file_name: String file name of cache. No cache is used when falsy.
            update_cache: Determines whether cache should be written out back to the disk when closing it.
                          Default is `True`.
            req_timeout: Maximum number of seconds to wait without reading a response byte before deciding an error has occurred.
                         Default is None.
        """
        self._api_key = api_key
        self._resources_per_req = resources_per_req
        self._requests = MultiRequest(req_timeout=req_timeout)

        # Create an ApiCache if instructed to
        self._cache = ApiCache(cache_file_name, update_cache) if cache_file_name else None

    @MultiRequest.error_handling
    def get_file_reports(self, resources):
        """Retrieves the most recent reports for a set of md5, sha1, and/or sha2 hashes.

        Args:
            resources: list of string hashes.
        Returns:
            A dict with the hash as key and the VT report as value.
        """
        return self._extract_all_responses(resources, 'file/report', 'virustotal-file-reports')

    def _extract_all_responses(self, resources, api_endpoint, api_name):
        """Aux function to extract all the API endpoint responses.

        Serves cached responses first, then requests the remaining resources
        in chunks of at most `resources_per_req` and merges the results.

        Args:
            resources: list of string hashes.
            api_endpoint: endpoint path
            api_name: endpoint name
        Returns:
            A dict with the hash as key and the VT report as value.
        """
        all_responses, resources = self._bulk_cache_lookup(api_name, resources)
        resource_chunks = self._prepare_resource_chunks(resources)
        response_chunks = self._request_reports("resource", resource_chunks, api_endpoint)
        self._extract_response_chunks(all_responses, response_chunks, api_name)

        return all_responses

    @MultiRequest.error_handling
    def get_file_behaviour(self, resources):
        """Retrieves a report about the behaviour of a md5, sha1, and/or sha2 hash of
        a file when executed in a sandboxed environment (Cuckoo sandbox).

        Args:
            resources: list of string hashes.
        Returns:
            A dict with the hash as key and the VT report as value.
        """
        api_name = 'virustotal-file-behaviour'
        api_endpoint = 'file/behaviour'
        return self._extract_all_responses(resources, api_endpoint, api_name)

    @MultiRequest.error_handling
    def get_file_download(self, resources):
        """Retrieves a file from its md5, sha1, and/or sha2 hash.

        Args:
            resources: list of string hashes.
        Returns:
            a file download
        """
        api_name = 'virustotal-file-download'
        api_endpoint = 'file/download'
        return self._extract_all_responses(resources, api_endpoint, api_name)

    @MultiRequest.error_handling
    def get_file_network_traffic(self, resources):
        """Retrieves a report about the network traffic of a md5, sha1, and/or sha2 hash of
           file, when it is executed.

        Args:
            resources: list of string hashes.
        Returns:
            A dict with the hash as key and the VT report as value.
        """
        api_name = 'virustotal-file-network-traffic'
        api_endpoint = 'file/network-traffic'
        return self._extract_all_responses(resources, api_endpoint, api_name)

    @MultiRequest.error_handling
    def get_domain_reports(self, domains):
        """Retrieves the most recent VT info for a set of domains.

        Args:
            domains: list of string domains.
        Returns:
            A dict with the domain as key and the VT report as value.
        """
        api_name = 'virustotal-domain-reports'

        (all_responses, domains) = self._bulk_cache_lookup(api_name, domains)
        responses = self._request_reports("domain", domains, 'domain/report')

        # One request is sent per domain; this pairing relies on the responses
        # coming back in request order.
        for domain, response in zip(domains, responses):
            if self._cache:
                self._cache.cache_value(api_name, domain, response)
            all_responses[domain] = response

        return all_responses

    @MultiRequest.error_handling
    def get_url_distribution(self, params=None):
        """Retrieves a live feed with the latest URLs submitted to VT.

        Args:
            params: a dictionary with name and value for optional arguments
        Returns:
            A dict with the VT report.
        """
        all_responses = {}
        api_name = 'virustotal-url-distribution'

        response_chunks = self._request_feed(params or {}, 'url/distribution')
        self._extract_response_chunks(all_responses, response_chunks, api_name)

        return all_responses

    @MultiRequest.error_handling
    def get_file_distribution(self, params=None):
        """Retrieves a live feed with the latest hashes submitted to VT.

        Args:
            params: a dictionary with name and values for optional arguments,
            such as: before (timestamp), after (timestamp), reports (boolean),
            limit (retrieve limit file items).
            Example: 'reports': 'true'
        Returns:
            A dict with the VT report.
        """
        all_responses = {}
        api_name = 'virustotal-file-distribution'

        response_chunks = self._request_feed(params or {}, 'file/distribution')
        self._extract_response_chunks(all_responses, response_chunks, api_name)

        return all_responses

    @MultiRequest.error_handling
    def get_url_reports(self, resources):
        """Retrieves a scan report on a given URL.

        Args:
            resources: list of URLs.
        Returns:
            A dict with the URL as key and the VT report as value.
        """
        api_name = 'virustotal-url-reports'

        (all_responses, resources) = self._bulk_cache_lookup(api_name, resources)
        # URLs may themselves contain commas, so they are joined with newlines
        resource_chunks = self._prepare_resource_chunks(resources, '\n')
        response_chunks = self._request_reports("resource", resource_chunks, 'url/report')
        self._extract_response_chunks(all_responses, response_chunks, api_name)

        return all_responses

    @MultiRequest.error_handling
    def get_ip_reports(self, ips):
        """Retrieves the most recent VT info for a set of ips.

        Args:
            ips: list of IPs.
        Returns:
            A dict with the IP as key and the VT report as value.
        """
        api_name = 'virustotal-ip-address-reports'

        (all_responses, ips) = self._bulk_cache_lookup(api_name, ips)
        responses = self._request_reports("ip", ips, 'ip-address/report')

        # One request is sent per IP; this pairing relies on the responses
        # coming back in request order.
        for ip, response in zip(ips, responses):
            if self._cache:
                self._cache.cache_value(api_name, ip, response)
            all_responses[ip] = response

        return all_responses

    @MultiRequest.error_handling
    def get_file_search(self, query):
        """Performs advanced search on samples, matching certain binary/
           metadata/detection criteria.
           Possible queries: file size, file type, first or last submission to
            VT, number of positives, binary content, etc.

        Args:
            query: a search query string, or an enumerable of query strings.
            Example: 'type:peexe size:90kb+ positives:5+ behaviour:"taskkill"'
        Returns:
            A dict with the VT report.
        """
        api_name = 'virustotal-file-search'

        # A bare string would otherwise be iterated character by character
        queries = self._as_list(query)
        (all_responses, queries) = self._bulk_cache_lookup(api_name, queries)
        response_chunks = self._request_reports("query", queries, 'file/search')
        self._extract_response_chunks(all_responses, response_chunks, api_name)

        return all_responses

    @MultiRequest.error_handling
    def get_file_clusters(self, date):
        """Retrieves file similarity clusters for a given time frame.

        Args:
            date: the specific date for which we want the clustering details,
            or an enumerable of such dates.
            Example: '2013-09-10'
        Returns:
            A dict with the VT report.
        """
        api_name = 'virustotal-file-clusters'

        # A bare string would otherwise be iterated character by character,
        # producing one request per character of the date.
        dates = self._as_list(date)
        (all_responses, dates) = self._bulk_cache_lookup(api_name, dates)
        response_chunks = self._request_reports("date", dates, 'file/clusters')
        # NOTE(review): results are keyed by response['resource'] below; confirm
        # that this endpoint's responses carry that field.
        self._extract_response_chunks(all_responses, response_chunks, api_name)

        return all_responses

    @staticmethod
    def _as_list(value):
        """Wraps a single string in a list; any other value is returned unchanged.

        Args:
            value: a string or an enumerable of strings.
        Returns:
            A one-element list for a string, otherwise `value` itself.
        """
        # type('') and type(u'') are str/unicode on Python 2 and both str on Python 3
        if isinstance(value, (type(''), type(u''))):
            return [value]
        return value

    def _bulk_cache_lookup(self, api_name, keys):
        """Performs a bulk cache lookup and returns a tuple with the results
        found and the keys missing in the cache. If cache is not configured
        it will return an empty dictionary of found results and the initial
        list of keys.

        Args:
            api_name: a string name of the API.
            keys: an enumerable of string keys.
        Returns:
            A tuple: (responses found, missing keys).
        """
        if not self._cache:
            return ({}, keys)

        responses = self._cache.bulk_lookup(api_name, keys)
        # Membership is tested on the mapping itself rather than on .keys(),
        # which is a linear list scan per key on Python 2.
        missing_keys = [key for key in keys if key not in responses]
        return (responses, missing_keys)

    def _prepare_resource_chunks(self, resources, resource_delim=','):
        """As in some VirusTotal API methods the call can be made for multiple
        resources at once this method prepares a list of concatenated resources
        according to the maximum number of resources per requests.

        Args:
            resources: a list of the resources.
            resource_delim: a string used to separate the resources.
              Default value is a comma.
        Returns:
            A list of the concatenated resources.
        """
        # range() (not xrange) keeps this working on both Python 2 and 3
        return [self._prepare_resource_chunk(resources, resource_delim, pos)
                for pos in range(0, len(resources), self._resources_per_req)]

    def _prepare_resource_chunk(self, resources, resource_delim, pos):
        """Joins the chunk of resources that starts at index `pos`.

        Args:
            resources: a list of the resources.
            resource_delim: a string used to separate the resources.
            pos: index of the first resource of the chunk.
        Returns:
            A string with at most `resources_per_req` resources joined by the delimiter.
        """
        return resource_delim.join(
            resources[pos:pos + self._resources_per_req])

    def _request_reports(self, resource_param_name, resources, endpoint_name):
        """Sends multiple requests for the resources to a particular endpoint.

        Args:
            resource_param_name: a string name of the resource parameter.
            resources: list of the resources; one request is built per element.
            endpoint_name: VirusTotal endpoint URL suffix.
        Returns:
            A list of the responses.
        """
        params = [{resource_param_name: resource, 'apikey': self._api_key} for resource in resources]
        return self._requests.multi_get(self.BASE_DOMAIN + endpoint_name, query_params=params)

    def _request_feed(self, params, endpoint_name):
        """Sends a single request carrying all optional arguments to a feed endpoint.

        Args:
            params: a dictionary with name and value for optional arguments.
            endpoint_name: VirusTotal endpoint URL suffix.
        Returns:
            A list of the responses.
        """
        # All optional arguments belong to the same request, so they are merged
        # into one query dict together with the API key.
        query = dict(params)
        query['apikey'] = self._api_key
        return self._requests.multi_get(self.BASE_DOMAIN + endpoint_name, query_params=[query])

    def _extract_response_chunks(self, all_responses, response_chunks, api_name):
        """Extracts and caches the responses from the response chunks in case
        of the responses for the requests containing multiple concatenated
        resources. Extracted responses are added to the already cached
        responses passed in the all_responses parameter.

        Args:
            all_responses: a dict of already cached responses; updated in place,
                keyed by each response's 'resource' value.
            response_chunks: a list with response chunks.
            api_name: a string name of the API.
        """
        for response_chunk in response_chunks:
            # A chunk is either a list of responses or a single response
            if not isinstance(response_chunk, list):
                response_chunk = [response_chunk]
            for response in response_chunk:
                # Skip empty/missing responses
                if not response:
                    continue

                if self._cache:
                    self._cache.cache_value(api_name, response['resource'], response)
                all_responses[response['resource']] = response
Esempio n. 2
0
class InvestigateApi(object):
    """Calls the OpenDNS investigate API.

    Applies rate limits and issues parallel requests.
    """

    BASE_URL = u'https://investigate.api.opendns.com/'

    def __init__(self, api_key, cache_file_name=None):
        """Sets up the authenticated request helper and the optional cache.

        Args:
            api_key: OpenDNS Investigate API key, sent as a Bearer token.
            cache_file_name: String file name of cache. No cache is used when falsy.
        """
        auth_header = {'Authorization': 'Bearer {0}'.format(api_key)}
        self._requests = MultiRequest(default_headers=auth_header,
                                      max_requests=12,
                                      rate_limit=30)

        # Create an ApiCache if instructed to
        self._cache = ApiCache(cache_file_name) if cache_file_name else None

    @classmethod
    def _to_url(cls, url_path):
        """Builds the absolute URL for an endpoint path.

        Args:
            url_path: string path relative to BASE_URL
        Returns:
            The full URL as a unicode string.
        Raises:
            Whatever the formatting raised, after it has been logged.
        """
        try:
            return u'{0}{1}'.format(cls.BASE_URL, url_path)
        except Exception as e:
            write_error_message(url_path)
            write_exception(e)
            # Bare raise keeps the original traceback intact
            raise

    @classmethod
    def _to_urls(cls, fmt_url_path, url_path_args):
        """Builds one absolute URL per argument.

        Args:
            fmt_url_path: format string for building URL paths
            url_path_args: An enumerable of values substituted into the format string
        Returns:
            A list of full URLs, in the order of the arguments.
        Raises:
            Whatever the formatting raised, after it has been logged.
        """
        url_paths = []
        for path_arg in url_path_args:
            try:
                url_paths.append(fmt_url_path.format(path_arg))
            except Exception as e:
                write_error_message(path_arg)
                write_exception(e)
                # Bare raise keeps the original traceback intact
                raise

        return [cls._to_url(url_path) for url_path in url_paths]

    @MultiRequest.error_handling
    @_cached_by_domain(api_name='opendns-categorization')
    def categorization(self, domains):
        """Calls categorization end point with all domains in a single POST.

        NOTE(review): earlier documentation stated that an 'is_suspicious' key
        is added to each response; nothing in this method does so - confirm
        whether the caching decorator or a caller is responsible.

        Args:
            domains: An enumerable of domains
        Returns:
            A dict of {domain: categorization_result}
        """
        url_path = u'domains/categorization/?showLabels'
        response = self._requests.multi_post(self._to_url(url_path),
                                             data=simplejson.dumps(domains))
        # A single POST was issued, so only the first response is relevant
        return response[0]

    @MultiRequest.error_handling
    @_cached_by_domain(api_name='opendns-domain_score')
    def domain_score(self, domains):
        """Calls domain scores end point with all domains in a single POST.

        Args:
            domains: An enumerable of domains
        Returns:
            The first (only) response of the POST.
        """
        url_path = 'domains/score/'
        response = self._requests.multi_post(self._to_url(url_path),
                                             data=simplejson.dumps(domains))
        # A single POST was issued, so only the first response is relevant
        return response[0]

    @MultiRequest.error_handling
    def _multi_get(self, cache_api_name, fmt_url_path, url_params):
        """Makes multiple GETs to an OpenDNS endpoint.

        Cached results are served first; only the remaining parameters are
        requested, and their responses are written back to the cache.

        Args:
            cache_api_name: string api_name for caching
            fmt_url_path: format string for building URL paths
            url_params: An enumerable of strings used in building URLs
        Returns:
            A dict of {url_param: api_result}
        """
        # The parameters are traversed several times below, so materialize
        # them once; this also lets one-shot iterables be passed in.
        url_params = list(url_params)
        all_responses = {}

        if self._cache:
            all_responses = self._cache.bulk_lookup(cache_api_name, url_params)
            url_params = [key for key in url_params if key not in all_responses]

        if not url_params:
            return all_responses

        urls = self._to_urls(fmt_url_path, url_params)
        responses = self._requests.multi_get(urls)
        # Relies on the responses coming back in request order
        for url_param, response in zip(url_params, responses):
            if self._cache:
                self._cache.cache_value(cache_api_name, url_param, response)
            all_responses[url_param] = response

        return all_responses

    def security(self, domains):
        """Calls security end point, one GET per domain.

        Args:
            domains: An enumerable of strings
        Returns:
            A dict of {domain: security_result}
        """
        api_name = 'opendns-security'
        fmt_url_path = u'security/name/{0}.json'
        return self._multi_get(api_name, fmt_url_path, domains)

    def cooccurrences(self, domains):
        """Get the domains related to input domains.

        Args:
            domains: an enumerable of strings domain names
        Returns:
            A dict of {domain: api_result}
        """
        api_name = 'opendns-cooccurrences'
        fmt_url_path = u'recommendations/name/{0}.json'
        return self._multi_get(api_name, fmt_url_path, domains)

    def domain_tag(self, domains):
        """Get the date range when a domain is part of OpenDNS block list.

        Args:
            domains: an enumerable of strings domain names
        Returns:
            A dict of {domain: api_result}
        """
        api_name = 'opendns-domain_tag'
        fmt_url_path = u'domains/{0}/latest_tags'
        return self._multi_get(api_name, fmt_url_path, domains)

    def related_domains(self, domains):
        """Get list of domain names that have been seen requested around the
        same time (up to 60 seconds before or after) to the given domain name.

        Args:
            domains: an enumerable of strings domain names
        Returns:
            A dict of {domain: api_result}
        """
        api_name = 'opendns-related_domains'
        fmt_url_path = u'links/name/{0}.json'
        return self._multi_get(api_name, fmt_url_path, domains)

    def rr_history(self, ips):
        """Get the domains related to input ips.

        Args:
            ips: an enumerable of strings as ips
        Returns:
            A dict of {ip: api_result}
        """
        api_name = 'opendns-rr_history'
        fmt_url_path = u'dnsdb/ip/a/{0}.json'
        return self._multi_get(api_name, fmt_url_path, ips)

    def dns_rr(self, ips):
        """Get the resource records related to input domains.

        Args:
            ips: an enumerable of strings as domains; despite the parameter
                name, the values are substituted into the dnsdb/name/a/ path.
        Returns:
            A dict of {domain: api_result}
        """
        api_name = 'opendns-dns_rr'
        fmt_url_path = u'dnsdb/name/a/{0}.json'
        return self._multi_get(api_name, fmt_url_path, ips)

    def latest_malicious(self, ips):
        """Get a list of malicious domains related to input ips.

        Args:
            ips: an enumerable of strings as ips
        Returns:
            A dict of {ip: api_result}
        """
        api_name = 'opendns-latest_malicious'
        fmt_url_path = u'ips/{0}/latest_domains'
        return self._multi_get(api_name, fmt_url_path, ips)
Esempio n. 3
0
class InvestigateApi(object):
    """Calls the OpenDNS investigate API.

    Applies rate limits and issues parallel requests.
    """

    BASE_URL = u'https://investigate.api.opendns.com/'

    # TODO: consider moving this to a config file
    MAX_DOMAINS_IN_POST = 1000

    def __init__(self,
                 api_key,
                 cache_file_name=None,
                 update_cache=True,
                 req_timeout=None):
        """Sets up the authenticated request helper and the optional cache.

        Args:
            api_key: OpenDNS Investigate API key, sent as a Bearer token.
            cache_file_name: String file name of cache. No cache is used when falsy.
            update_cache: passed through to ApiCache together with the file name.
            req_timeout: passed through to MultiRequest as its request timeout.
        """
        auth_header = {'Authorization': 'Bearer {0}'.format(api_key)}
        self._requests = MultiRequest(
            default_headers=auth_header,
            max_requests=12,
            rate_limit=30,
            req_timeout=req_timeout,
            drop_404s=True,
        )

        # Create an ApiCache if instructed to
        self._cache = ApiCache(cache_file_name,
                               update_cache) if cache_file_name else None

    @classmethod
    def _to_url(cls, url_path):
        """Builds the absolute URL for an endpoint path.

        Args:
            url_path: string path relative to BASE_URL
        Returns:
            The full URL as a unicode string.
        Raises:
            Whatever the formatting raised, after it has been logged.
        """
        try:
            return u'{0}{1}'.format(cls.BASE_URL, url_path)
        except Exception as e:
            write_error_message(url_path)
            write_exception(e)
            # Bare raise keeps the original traceback intact
            raise

    @classmethod
    def _to_urls(cls, fmt_url_path, url_path_args):
        """Builds one absolute URL per argument.

        Args:
            fmt_url_path: format string for building URL paths
            url_path_args: An enumerable of values substituted into the format string
        Returns:
            A list of full URLs, in the order of the arguments.
        Raises:
            Whatever the formatting raised, after it has been logged.
        """
        url_paths = []
        for path_arg in url_path_args:
            try:
                url_paths.append(fmt_url_path.format(path_arg))
            except Exception as e:
                write_error_message(path_arg)
                write_exception(e)
                # Bare raise keeps the original traceback intact
                raise

        return [cls._to_url(url_path) for url_path in url_paths]

    @MultiRequest.error_handling
    def _multi_post(self, url_path, domains):
        """POSTs the domains to an endpoint in batches and merges the answers.

        Args:
            url_path: string path relative to BASE_URL
            domains: a sliceable sequence of domains
        Returns:
            A single dict combining the dicts returned for every batch.
        """
        # Each POST body carries at most MAX_DOMAINS_IN_POST domains
        data = [
            simplejson.dumps(domains[pos:pos + self.MAX_DOMAINS_IN_POST])
            for pos in range(0, len(domains), self.MAX_DOMAINS_IN_POST)
        ]
        # multi_post() returns list of dictionaries, so they need to be merged into one dict
        all_responses = self._requests.multi_post(self._to_url(url_path),
                                                  data=data)
        responses = {}
        for r in all_responses:
            responses.update(r)
        return responses

    @_cached_by_domain(api_name='opendns-categorization')
    def categorization(self, domains):
        """Calls categorization end point.

        NOTE(review): earlier documentation stated that an 'is_suspicious' key
        is added to each response; nothing in this method does so - confirm
        whether the caching decorator or a caller is responsible.

        Args:
            domains: An enumerable of domains
        Returns:
            A dict of {domain: categorization_result}
        """
        url_path = u'domains/categorization/?showLabels'
        return self._multi_post(url_path, domains)

    @_cached_by_domain(api_name='opendns-domain_score')
    def domain_score(self, domains):
        """Calls domain scores endpoint.

        This method is deprecated since OpenDNS Investigate API
        endpoint is also deprecated.

        Args:
            domains: An enumerable of domains
        Returns:
            The merged dict produced by the batched POSTs.
        """
        warn(
            'OpenDNS Domain Scores endpoint is deprecated. Use '
            'InvestigateApi.categorization() instead',
            DeprecationWarning,
        )
        url_path = 'domains/score/'
        return self._multi_post(url_path, domains)

    @MultiRequest.error_handling
    def _multi_get(self,
                   cache_api_name,
                   fmt_url_path,
                   url_params,
                   query_params=None):
        """Makes multiple GETs to an OpenDNS endpoint.

        Cached results are served first; only the remaining parameters are
        requested, and their responses are written back to the cache.

        Args:
            cache_api_name: string api_name for caching
            fmt_url_path: format string for building URL paths
            url_params: An enumerable of strings used in building URLs
            query_params: None / dict / list of dicts containing query params
        Returns:
            A dict of {url_param: api_result}
        """
        # The parameters are traversed several times below, so materialize
        # them once; this also lets one-shot iterables be passed in.
        url_params = list(url_params)
        all_responses = {}

        if self._cache:
            all_responses = self._cache.bulk_lookup(cache_api_name, url_params)
            url_params = [key for key in url_params if key not in all_responses]

        if not url_params:
            return all_responses

        urls = self._to_urls(fmt_url_path, url_params)
        responses = self._requests.multi_get(urls, query_params)
        # Relies on the responses coming back in request order
        for url_param, response in zip(url_params, responses):
            if self._cache:
                self._cache.cache_value(cache_api_name, url_param, response)
            all_responses[url_param] = response

        return all_responses

    def security(self, domains):
        """Calls security end point, one GET per domain.

        Args:
            domains: An enumerable of strings
        Returns:
            A dict of {domain: security_result}
        """
        api_name = 'opendns-security'
        fmt_url_path = u'security/name/{0}.json'
        return self._multi_get(api_name, fmt_url_path, domains)

    def whois_emails(self, emails):
        """Calls WHOIS Email end point

        Args:
            emails: An enumerable of string Emails
        Returns:
            A dict of {email: domain_result}
        """
        api_name = 'opendns-whois-emails'
        fmt_url_path = u'whois/emails/{0}'
        return self._multi_get(api_name, fmt_url_path, emails)

    def whois_nameservers(self, nameservers):
        """Calls WHOIS Nameserver end point

        Args:
            nameservers: An enumerable of nameservers
        Returns:
            A dict of {nameserver: domain_result}
        """
        api_name = 'opendns-whois-nameservers'
        fmt_url_path = u'whois/nameservers/{0}'
        return self._multi_get(api_name, fmt_url_path, nameservers)

    def whois_domains(self, domains):
        """Calls WHOIS domain end point

        Args:
            domains: An enumerable of domains
        Returns:
            A dict of {domain: domain_result}
        """
        api_name = 'opendns-whois-domain'
        fmt_url_path = u'whois/{0}'
        return self._multi_get(api_name, fmt_url_path, domains)

    def whois_domains_history(self, domains):
        """Calls WHOIS domain history end point

        Args:
            domains: An enumerable of domains
        Returns:
            A dict of {domain: domain_history_result}
        """
        api_name = 'opendns-whois-domain-history'
        fmt_url_path = u'whois/{0}/history'
        return self._multi_get(api_name, fmt_url_path, domains)

    def cooccurrences(self, domains):
        """Get the domains related to input domains.

        Args:
            domains: an enumerable of strings domain names
        Returns:
            A dict of {domain: api_result}
        """
        api_name = 'opendns-cooccurrences'
        fmt_url_path = u'recommendations/name/{0}.json'
        return self._multi_get(api_name, fmt_url_path, domains)

    def domain_tag(self, domains):
        """Get the date range when a domain is part of OpenDNS block list.

        Args:
            domains: an enumerable of strings domain names
        Returns:
            A dict of {domain: api_result}
        """
        api_name = 'opendns-domain_tag'
        fmt_url_path = u'domains/{0}/latest_tags'
        return self._multi_get(api_name, fmt_url_path, domains)

    def related_domains(self, domains):
        """Get list of domain names that have been seen requested around the
        same time (up to 60 seconds before or after) to the given domain name.

        Args:
            domains: an enumerable of strings domain names
        Returns:
            A dict of {domain: api_result}
        """
        api_name = 'opendns-related_domains'
        fmt_url_path = u'links/name/{0}.json'
        return self._multi_get(api_name, fmt_url_path, domains)

    def rr_history(self, ips):
        """Get the domains related to input ips.

        Args:
            ips: an enumerable of strings as ips
        Returns:
            A dict of {ip: api_result}
        """
        api_name = 'opendns-rr_history'
        fmt_url_path = u'dnsdb/ip/a/{0}.json'
        return self._multi_get(api_name, fmt_url_path, ips)

    def dns_rr(self, ips):
        """Get the resource records related to input domains.

        Args:
            ips: an enumerable of strings as domains; despite the parameter
                name, the values are substituted into the dnsdb/name/a/ path.
        Returns:
            A dict of {domain: api_result}
        """
        api_name = 'opendns-dns_rr'
        fmt_url_path = u'dnsdb/name/a/{0}.json'
        return self._multi_get(api_name, fmt_url_path, ips)

    def latest_malicious(self, ips):
        """Get a list of malicious domains related to input ips.

        Args:
            ips: an enumerable of strings as ips
        Returns:
            A dict of {ip: api_result}
        """
        api_name = 'opendns-latest_malicious'
        fmt_url_path = u'ips/{0}/latest_domains'
        return self._multi_get(api_name, fmt_url_path, ips)

    def sample(self, hashes):
        """Get the information about a sample based on its hash.

        Args:
            hashes: an enumerable of strings as hashes
        Returns:
            A dict of {hash: api_result}
        """
        api_name = 'opendns-sample'
        fmt_url_path = u'sample/{0}'
        return self._multi_get(api_name, fmt_url_path, hashes)

    def search(self, patterns, start=30, limit=1000, include_category=False):
        """Performs pattern searches against the Investigate database.

        Args:
            patterns: An enumerable of RegEx domain patterns to search for
            start:   How far back results extend from in days (max is 30)
            limit:   Number of results to show (max is 1000)
            include_category: Include OpenDNS security categories
        Returns:
            A dict of {pattern: api_result}
        """
        api_name = 'opendns-patterns'
        fmt_url_path = u'search/{0}'
        # The same query params are sent with every pattern: start is a
        # relative offset such as '-30days', and the boolean is sent as
        # lowercase text ('true' / 'false').
        start = '-{0}days'.format(start)
        include_category = str(include_category).lower()
        query_params = {
            'start': start,
            'limit': limit,
            'includecategory': include_category,
        }
        return self._multi_get(api_name, fmt_url_path, patterns, query_params)

    def risk_score(self, domains):
        """Performs Umbrella risk score analysis on the input domains

        Args:
            domains: an enumerable of domains
        Returns:
            A dict of {domain: api_result}
        """
        api_name = 'opendns-risk_score'
        fmt_url_path = u'domains/risk-score/{0}'
        return self._multi_get(api_name, fmt_url_path, domains)
Esempio n. 4
0
class InvestigateApi(object):

    """Calls the OpenDNS investigate API.

    Applies rate limits and issues parallel requests.
    """

    BASE_URL = u'https://investigate.api.opendns.com/'

    def __init__(self, api_key, cache_file_name=None):
        """Prepares the authenticated request helper and the optional cache.

        Args:
            api_key: API key; sent in the Authorization header as a Bearer token.
            cache_file_name: String file name of cache. No cache is used
                when it is empty or None.
        """
        auth_header = {'Authorization': 'Bearer {0}'.format(api_key)}
        self._requests = MultiRequest(default_headers=auth_header, max_requests=12, rate_limit=30)

        # Create an ApiCache if instructed to
        self._cache = ApiCache(cache_file_name) if cache_file_name else None

    @classmethod
    def _to_url(cls, url_path):
        """Builds a full URL by prefixing BASE_URL to a URL path.

        Args:
            url_path: string URL path, relative to BASE_URL
        Returns:
            The full URL string.
        Raises:
            Any formatting error, re-raised after the offending path and
            the exception have been logged.
        """
        try:
            return u'{0}{1}'.format(cls.BASE_URL, url_path)
        except Exception as e:
            write_error_message(url_path)
            write_exception(e)
            # Bare raise keeps the original traceback (also on Python 2).
            raise

    @classmethod
    def _to_urls(cls, fmt_url_path, url_path_args):
        """Builds one full URL per argument from a URL path format string.

        Args:
            fmt_url_path: format string with a `{0}` placeholder
            url_path_args: an enumerable of values for the placeholder
        Returns:
            A list of full URL strings, in the order of url_path_args.
        Raises:
            Any formatting error, re-raised after the offending argument
            and the exception have been logged.
        """
        url_paths = []
        for path_arg in url_path_args:
            try:
                url_paths.append(fmt_url_path.format(path_arg))
            except Exception as e:
                write_error_message(path_arg)
                write_exception(e)
                # Bare raise keeps the original traceback (also on Python 2).
                raise

        return [cls._to_url(url_path) for url_path in url_paths]

    @MultiRequest.error_handling
    @_cached_by_domain(api_name='opendns-categorization')
    def categorization(self, domains):
        """Calls the categorization end point for a set of domains.

        The domains are sent JSON-encoded in a single POST and the first
        response of that request is returned.

        Args:
            domains: An enumerable of domains
        Returns:
            The first multi_post response; presumably a dict of
            {domain: categorization_result} - verify against the API.
        """
        url_path = u'domains/categorization/?showLabels'
        response = self._requests.multi_post(self._to_url(url_path), data=simplejson.dumps(domains))
        return response[0]

    @MultiRequest.error_handling
    @_cached_by_domain(api_name='opendns-domain_score')
    def domain_score(self, domains):
        """Calls the domain score end point for a set of domains.

        The domains are sent JSON-encoded in a single POST and the first
        response of that request is returned.

        Args:
            domains: An enumerable of domains
        Returns:
            The first multi_post response; presumably a dict of
            {domain: score} - verify against the API.
        """
        url_path = 'domains/score/'
        response = self._requests.multi_post(self._to_url(url_path), data=simplejson.dumps(domains))
        return response[0]

    @MultiRequest.error_handling
    def _multi_get(self, cache_api_name, fmt_url_path, url_params):
        """Makes multiple GETs to an OpenDNS endpoint.

        Values found in the cache (when one is configured) are not
        requested again; fresh responses are written to the cache.

        Args:
            cache_api_name: string api_name for caching
            fmt_url_path: format string for building URL paths
            url_params: An enumerable of strings used in building URLs
        Returns:
            A dict of {url_param: api_result}
        """
        # Materialize once: the params are iterated several times below,
        # which would exhaust a generator and breaks len()-less iterables.
        url_params = list(url_params)
        all_responses = {}

        if self._cache:
            all_responses = self._cache.bulk_lookup(cache_api_name, url_params)
            url_params = [key for key in url_params if key not in all_responses]

        if url_params:
            urls = self._to_urls(fmt_url_path, url_params)
            responses = self._requests.multi_get(urls)
            for url_param, response in zip(url_params, responses):
                if self._cache:
                    self._cache.cache_value(cache_api_name, url_param, response)
                all_responses[url_param] = response

        return all_responses

    def security(self, domains):
        """Calls the security end point for each of the input domains.

        Args:
            domains: An enumerable of strings
        Returns:
            A dict of {domain: security_result}
        """
        api_name = 'opendns-security'
        fmt_url_path = u'security/name/{0}.json'
        return self._multi_get(api_name, fmt_url_path, domains)

    def cooccurrences(self, domains):
        """Get the domains related to input domains.

        Args:
            domains: an enumerable of strings domain names
        Returns:
            A dict of {domain: api_result}
        """
        api_name = 'opendns-cooccurrences'
        fmt_url_path = u'recommendations/name/{0}.json'
        return self._multi_get(api_name, fmt_url_path, domains)

    def domain_tag(self, domains):
        """Get the date range when a domain is part of OpenDNS block list.

        Args:
            domains: an enumerable of strings domain names
        Returns:
            A dict of {domain: api_result}
        """
        api_name = 'opendns-domain_tag'
        fmt_url_path = u'domains/{0}/latest_tags'
        return self._multi_get(api_name, fmt_url_path, domains)

    def related_domains(self, domains):
        """Get list of domain names that have been seen requested around the
        same time (up to 60 seconds before or after) to the given domain name.

        Args:
            domains: an enumerable of strings domain names
        Returns:
            A dict of {domain: api_result}
        """
        api_name = 'opendns-related_domains'
        fmt_url_path = u'links/name/{0}.json'
        return self._multi_get(api_name, fmt_url_path, domains)

    def rr_history(self, ips):
        """Get the domains related to input ips.

        Args:
            ips: an enumerable of strings as ips
        Returns:
            A dict of {ip: api_result}
        """
        api_name = 'opendns-rr_history'
        fmt_url_path = u'dnsdb/ip/a/{0}.json'
        return self._multi_get(api_name, fmt_url_path, ips)

    def dns_rr(self, ips):
        """Get the resource records related to the input names.

        Args:
            ips: an enumerable of strings; despite the parameter name each
                value is placed in the `dnsdb/name/a/{0}.json` path, so
                domain names are expected here.
        Returns:
            A dict of {input value: api_result}
        """
        api_name = 'opendns-dns_rr'
        fmt_url_path = u'dnsdb/name/a/{0}.json'
        return self._multi_get(api_name, fmt_url_path, ips)

    def latest_malicious(self, ips):
        """Get a list of malicious domains related to input ips.

        Args:
            ips: an enumerable of strings as ips
        Returns:
            A dict of {ip: api_result}
        """
        api_name = 'opendns-latest_malicious'
        fmt_url_path = u'ips/{0}/latest_domains'
        return self._multi_get(api_name, fmt_url_path, ips)
Esempio n. 5
0
class VirusTotalApi(object):
    """Client for the VirusTotal v2 API with optional response caching."""

    BASE_DOMAIN = u'https://www.virustotal.com/vtapi/v2/'

    def __init__(self, api_key, resources_per_req=25, cache_file_name=None):
        """Establishes basic HTTP params and loads a cache.

        Args:
            api_key: VirusTotal API key
            resources_per_req: Maximum number of resources (hashes, URLs)
                to be send in a single request
            cache_file_name: String file name of cache.
        """
        self._api_key = api_key
        self._resources_per_req = resources_per_req
        self._requests = MultiRequest()

        # Create an ApiCache if instructed to
        self._cache = ApiCache(cache_file_name) if cache_file_name else None

    @MultiRequest.error_handling
    def get_file_reports(self, resources):
        """Retrieves the most recent reports for a set of md5, sha1, and/or sha2 hashes.

        Args:
            resources: list of string hashes.
        Returns:
            A dict with the hash as key and the VT report as value.
        """
        api_name = 'virustotal-file-reports'
        api_endpoint = 'file/report'
        return self._extract_all_responses(resources, api_endpoint, api_name)

    def _extract_all_responses(self, resources, api_endpoint, api_name):
        """ Aux function to extract all the API endpoint responses.

        Looks the resources up in the cache, requests the missing ones in
        chunks and merges the fresh responses into the cached ones.

        Args:
            resources: list of string hashes.
            api_endpoint: endpoint path
            api_name: endpoint name
        Returns:
            A dict with the hash as key and the VT report as value.
        """
        all_responses, resources = self._bulk_cache_lookup(api_name, resources)
        resource_chunks = self._prepare_resource_chunks(resources)
        response_chunks = self._request_reports("resource", resource_chunks, api_endpoint)
        self._extract_response_chunks(all_responses, response_chunks, api_name)

        return all_responses

    @MultiRequest.error_handling
    def get_file_behaviour(self, resources):
        """Retrieves a report about the behaviour of a md5, sha1, and/or sha2 hash of
        a file when executed in a sandboxed environment (Cuckoo sandbox).

        Args:
            resources: list of string hashes.
        Returns:
            A dict with the hash as key and the VT report as value.
        """
        api_name = 'virustotal-file-behaviour'
        api_endpoint = 'file/behaviour'
        return self._extract_all_responses(resources, api_endpoint, api_name)

    @MultiRequest.error_handling
    def get_file_download(self, resources):
        """Retrieves a file from its a md5, sha1, and/or sha2 hash.

        Args:
            resources: list of string hashes.
        Returns:
            A dict with the response's 'resource' value as key and the
            response as value.
        """
        api_name = 'virustotal-file-download'
        api_endpoint = 'file/download'
        return self._extract_all_responses(resources, api_endpoint, api_name)

    @MultiRequest.error_handling
    def get_file_network_traffic(self, resources):
        """Retrieves a report about the network traffic of a md5, sha1, and/or sha2 hash of
           file, when it is executed.

        Args:
            resources: list of string hashes.
        Returns:
            A dict with the hash as key and the VT report as value.
        """
        api_name = 'virustotal-file-network-traffic'
        api_endpoint = 'file/network-traffic'
        return self._extract_all_responses(resources, api_endpoint, api_name)

    @MultiRequest.error_handling
    def get_domain_reports(self, domains):
        """Retrieves the most recent VT info for a set of domains.

        Args:
            domains: list of string domains.
        Returns:
            A dict with the domain as key and the VT report as value.
        """
        api_name = 'virustotal-domain-reports'

        (all_responses, domains) = self._bulk_cache_lookup(api_name, domains)
        responses = self._request_reports("domain", domains, 'domain/report')

        # One request is sent per domain, so responses line up with domains.
        for domain, response in zip(domains, responses):
            if self._cache:
                self._cache.cache_value(api_name, domain, response)
            all_responses[domain] = response

        return all_responses

    @MultiRequest.error_handling
    def get_url_distribution(self, params=None):
        """Retrieves a live feed with the latest URLs submitted to VT.

        Args:
            params: a dictionary with name and value for optional arguments
        Returns:
            A dict with the VT report.
        """
        all_responses = {}
        api_name = 'virustotal-url-distribution'

        response_chunks = self._request_distribution(params, 'url/distribution')
        # NOTE(review): results are keyed by each item's 'resource' field;
        # confirm the feed items carry that field.
        self._extract_response_chunks(all_responses, response_chunks, api_name)

        return all_responses

    @MultiRequest.error_handling
    def get_file_distribution(self, params=None):
        """Retrieves a live feed with the latest hashes submitted to VT.

        Args:
            params: a dictionary with name and values for optional arguments,
            such as: before (timestamp), after (timestamp), reports (boolean),
            limit (retrieve limit file items).
            Example: 'reports': 'true'
        Returns:
            A dict with the VT report.
        """
        all_responses = {}
        api_name = 'virustotal-file-distribution'

        response_chunks = self._request_distribution(params, 'file/distribution')
        # NOTE(review): results are keyed by each item's 'resource' field;
        # confirm the feed items carry that field.
        self._extract_response_chunks(all_responses, response_chunks, api_name)

        return all_responses

    @MultiRequest.error_handling
    def get_url_reports(self, resources):
        """Retrieves a scan report on a given URL.

        Args:
            resources: list of URLs.
        Returns:
            A dict with the URL as key and the VT report as value.
        """
        api_name = 'virustotal-url-reports'

        (all_responses, resources) = self._bulk_cache_lookup(api_name, resources)
        # URLs are joined with a newline instead of the default comma.
        resource_chunks = self._prepare_resource_chunks(resources, '\n')
        response_chunks = self._request_reports("resource", resource_chunks, 'url/report')
        self._extract_response_chunks(all_responses, response_chunks, api_name)

        return all_responses

    @MultiRequest.error_handling
    def get_ip_reports(self, ips):
        """Retrieves the most recent VT info for a set of ips.

        Args:
            ips: list of IPs.
        Returns:
            A dict with the IP as key and the VT report as value.
        """
        api_name = 'virustotal-ip-address-reports'

        (all_responses, ips) = self._bulk_cache_lookup(api_name, ips)
        responses = self._request_reports("ip", ips, 'ip-address/report')

        # One request is sent per IP, so responses line up with ips.
        for ip, response in zip(ips, responses):
            if self._cache:
                self._cache.cache_value(api_name, ip, response)
            all_responses[ip] = response

        return all_responses

    @MultiRequest.error_handling
    def get_file_search(self, query):
        """Performs advanced search on samples, matching certain binary/
           metadata/detection criteria.
           Possible queries: file size, file type, first or last submission to
            VT, number of positives, binary content, etc.

        Args:
            query: an enumerable of search query strings; one request is
                sent per element, with the element as the 'query' parameter.
            Example element: 'type:peexe size:90kb+ positives:5+ behaviour:"taskkill"'
        Returns:
            A dict with the VT report.
        """
        api_name = 'virustotal-file-search'

        (all_responses, query) = self._bulk_cache_lookup(api_name, query)
        response_chunks = self._request_reports("query", query, 'file/search')
        self._extract_response_chunks(all_responses, response_chunks, api_name)

        return all_responses

    @MultiRequest.error_handling
    def get_file_clusters(self, date):
        """Retrieves file similarity clusters for a given time frame.

        Args:
            date: an enumerable of date strings for which we want the
                clustering details; one request is sent per element, with
                the element as the 'date' parameter.
            Example element: '2013-09-10'
        Returns:
            A dict with the VT report.
        """
        api_name = 'virustotal-file-clusters'

        # Only request the dates that were not served from the cache.
        (all_responses, missing_dates) = self._bulk_cache_lookup(api_name, date)
        response = self._request_reports("date", missing_dates, 'file/clusters')
        self._extract_response_chunks(all_responses, response, api_name)

        return all_responses

    def _bulk_cache_lookup(self, api_name, keys):
        """Performs a bulk cache lookup and returns a tuple with the results
        found and the keys missing in the cache. If cache is not configured
        it will return an empty dictionary of found results and the initial
        list of keys.

        Args:
            api_name: a string name of the API.
            keys: an enumerable of string keys.
        Returns:
            A tuple: (responses found, missing keys).
        """
        if self._cache:
            responses = self._cache.bulk_lookup(api_name, keys)
            missing_keys = [key for key in keys if key not in responses]
            return (responses, missing_keys)

        return ({}, keys)

    def _prepare_resource_chunks(self, resources, resource_delim=','):
        """As in some VirusTotal API methods the call can be made for multiple
        resources at once this method prepares a list of concatenated resources
        according to the maximum number of resources per requests.

        Args:
            resources: a list of the resources.
            resource_delim: a string used to separate the resources.
              Default value is a comma.
        Returns:
            A list of the concatenated resources.
        """
        # range (not xrange) so the method works on Python 2 and Python 3.
        return [self._prepare_resource_chunk(resources, resource_delim, pos)
                for pos in range(0, len(resources), self._resources_per_req)]

    def _prepare_resource_chunk(self, resources, resource_delim, pos):
        """Joins up to `_resources_per_req` resources starting at `pos`.

        Args:
            resources: a list of the resources.
            resource_delim: a string used to separate the resources.
            pos: index of the first resource of the chunk.
        Returns:
            A string with the chunk's resources joined by resource_delim.
        """
        return resource_delim.join(
            resources[pos:pos + self._resources_per_req])

    def _request_reports(self, resource_param_name, resources, endpoint_name):
        """Sends multiples requests for the resources to a particular endpoint.

        One set of query params (the resource plus the API key) is built
        per resource.

        Args:
            resource_param_name: a string name of the resource parameter.
            resources: list of of the resources.
            endpoint_name: VirusTotal endpoint URL suffix.
        Returns:
            A list of the responses.
        """
        params = [{resource_param_name: resource, 'apikey': self._api_key} for resource in resources]
        return self._requests.multi_get(self.BASE_DOMAIN + endpoint_name, query_params=params)

    def _request_distribution(self, params, endpoint_name):
        """Sends a single request to a distribution feed endpoint.

        All optional arguments go into one set of query params together
        with the API key.

        Args:
            params: a dictionary of optional query arguments, or None.
            endpoint_name: VirusTotal endpoint URL suffix.
        Returns:
            A list of the responses.
        """
        query_params = dict(params or {})
        query_params['apikey'] = self._api_key
        return self._requests.multi_get(self.BASE_DOMAIN + endpoint_name, query_params=[query_params])

    def _extract_response_chunks(self, all_responses, response_chunks, api_name):
        """Extracts and caches the responses from the response chunks in case
        of the responses for the requests containing multiple concatenated
        resources. Extracted responses are added to the already cached
        responses passed in the all_responses parameter.

        Args:
            all_responses: a dict with the already cached responses; it is
                updated in place.
            response_chunks: a list with response chunks.
            api_name: a string name of the API.
        """
        for response_chunk in response_chunks:
            # A chunk is either a single response or a list of responses.
            if not isinstance(response_chunk, list):
                response_chunk = [response_chunk]
            for response in response_chunk:
                if not response:
                    continue

                if self._cache:
                    self._cache.cache_value(api_name, response['resource'], response)
                all_responses[response['resource']] = response
Esempio n. 6
0
class VirusTotalApi(object):
    """Client for the VirusTotal v3 API with optional response caching."""

    BASE_DOMAIN = u'https://www.virustotal.com/api/v3/'

    def __init__(self,
                 api_key,
                 cache_file_name=None,
                 update_cache=True,
                 req_timeout=None):
        """Sets up the HTTP request helper and, optionally, a cache.

        Args:
            api_key: VirusTotal API key; sent in the 'x-apikey' header.
            cache_file_name: String file name of cache. No cache is used
                when it is empty or None.
            update_cache: Passed on to ApiCache; determines whether cache
                should be written out back to the disk when closing it.
                Default is `True`.
            req_timeout: Passed on to MultiRequest; maximum number of
                seconds to wait without reading a response byte before
                deciding an error has occurred. Default is None.
        """
        self._requests = MultiRequest(req_timeout=req_timeout,
                                      default_headers={'x-apikey': api_key},
                                      drop_404s=True)

        # Only build a cache when a cache file was named.
        if cache_file_name:
            self._cache = ApiCache(cache_file_name, update_cache)
        else:
            self._cache = None

    @MultiRequest.error_handling
    def get_file_reports(self, file_hash_list):
        """Fetches the reports for a set of md5, sha1, and/or sha2 hashes.

        Args:
            file_hash_list: list of string hashes.
        Returns:
            A dict with the extracted response id as key and the VT report
            as value.
        """
        return self._extract_all_responses(
            file_hash_list, 'files/{}', 'virustotal-file-reports')

    @MultiRequest.error_handling
    def get_file_behaviour(self, file_hash_list):
        """Fetches the sandbox behaviour reports for a set of md5, sha1,
        and/or sha2 file hashes.

        Args:
            file_hash_list: list of string hashes.
        Returns:
            A dict with the extracted response id as key and the VT report
            as value.
        """
        return self._extract_all_responses(
            file_hash_list, 'files/{}/behaviours', 'virustotal-file-behaviour')

    @MultiRequest.error_handling
    def get_file_download(self, file_hash_list):
        """Downloads the files identified by md5, sha1, and/or sha2 hashes.

        The requests are issued with file_download enabled.

        Args:
            file_hash_list: list of string hashes.
        Returns:
            A dict with the extracted response id as key and the response
            as value.
        """
        return self._extract_all_responses(
            file_hash_list, 'files/{}/download', 'virustotal-file-download',
            file_download=True)

    @MultiRequest.error_handling
    def get_file_contacted_domains(self, file_hash_list):
        """Fetches the contacted-domains reports for a set of md5, sha1,
        and/or sha2 file hashes.

        Args:
            file_hash_list: list of string hashes.
        Returns:
            A dict with the extracted response id as key and the VT report
            as value.
        """
        return self._extract_all_responses(
            file_hash_list, 'files/{}/contacted_domains',
            'virustotal-file-contacted-domains')

    @MultiRequest.error_handling
    def get_file_contacted_ips(self, file_hash_list):
        """Fetches the contacted-ip-addresses reports for a set of md5,
        sha1, and/or sha2 file hashes.

        Args:
            file_hash_list: list of string hashes.
        Returns:
            A dict with the extracted response id as key and the VT report
            as value.
        """
        return self._extract_all_responses(
            file_hash_list, 'files/{}/contacted_ips',
            'virustotal-file-contacted-ips')

    @MultiRequest.error_handling
    def get_file_contacted_urls(self, file_hash_list):
        """Fetches the contacted-urls reports for a set of md5, sha1,
        and/or sha2 file hashes.

        Args:
            file_hash_list: list of string hashes.
        Returns:
            A dict with the extracted response id as key and the VT report
            as value.
        """
        return self._extract_all_responses(
            file_hash_list, 'files/{}/contacted_urls',
            'virustotal-file-contacted-urls')

    @MultiRequest.error_handling
    def get_file_itw_urls(self, file_hash_list):
        """Fetches the in-the-wild URL reports, i.e. the URLs from where
        the files with the given hashes have been downloaded.

        Args:
            file_hash_list: list of string hashes.
        Returns:
            A dict with the extracted response id as key and the VT report
            as value.
        """
        return self._extract_all_responses(
            file_hash_list, 'files/{}/itw_urls', 'virustotal-file-itw-urls')

    @MultiRequest.error_handling
    def get_domain_communicating_files(self, domain_list):
        """Fetches the reports about files communicating with the given
        internet domains.

        Args:
            domain_list: list of string domains.
        Returns:
            A dict with the extracted response id as key and the VT report
            as value.
        """
        return self._extract_all_responses(
            domain_list, 'domains/{}/communicating_files',
            'virustotal-domain-communicating-files')

    @MultiRequest.error_handling
    def get_domain_referrer_files(self, domain_list):
        """Fetches the reports about files containing the given internet
        domains.

        Args:
            domain_list: list of string domains.
        Returns:
            A dict with the extracted response id as key and the VT report
            as value.
        """
        return self._extract_all_responses(
            domain_list, 'domains/{}/referrer_files',
            'virustotal-domain-referrer-files')

    @MultiRequest.error_handling
    def get_domain_reports(self, domain_list):
        """Fetches the VT info for a set of domains.

        Args:
            domain_list: list of string domains.
        Returns:
            A dict with the domain as key and the VT report as value.
        """
        api_name = 'virustotal-domain-reports'

        found, missing_domains = self._bulk_cache_lookup(api_name, domain_list)
        fresh_reports = self._request_reports(missing_domains, 'domains/{}')

        # Reports are paired with the requested domains by position.
        for domain, report in zip(missing_domains, fresh_reports):
            if self._cache:
                self._cache.cache_value(api_name, domain, report)
            found[domain] = report

        return found

    @MultiRequest.error_handling
    def get_feeds_url(self, time_frame):
        """Fetches the URL feed for the given time frames.

        The cache is not consulted before requesting; the requests are
        issued with file_download enabled.

        Args:
            time_frame: a list of timeframe strings in date format YYYYMMDDhhmm.
        Returns:
            A dict with the extracted response id as key and the response
            as value.
        """
        collected = {}
        feed_responses = self._request_reports(
            time_frame, 'feeds/urls/{}', file_download=True)
        self._extract_response_chunks(
            collected, feed_responses, 'virustotal-url-distribution')
        return collected

    @MultiRequest.error_handling
    def get_file_distribution(self, time_frame):
        """Fetches the file feed for the given time frames.

        The cache is not consulted before requesting.

        Args:
            time_frame: A list of strings in format YYYYMMDDhhmm.
        Returns:
            A dict with the extracted response id as key and the VT report
            as value.
        """
        collected = {}
        feed_responses = self._request_reports(time_frame, 'feeds/files/{}')
        self._extract_response_chunks(
            collected, feed_responses, 'virustotal-file-distribution')
        return collected

    @MultiRequest.error_handling
    def get_url_reports(self, url_hash_list):
        """Fetches the scan reports for a set of URLs.

        Args:
            url_hash_list: list of sha256 hashed urls.
        Returns:
            A dict with the extracted response id as key and the VT report
            as value.
        """
        return self._extract_all_responses(
            url_hash_list, 'urls/{}', 'virustotal-url-reports')

    @MultiRequest.error_handling
    def get_ip_reports(self, ips):
        """Fetches the VT info for a set of ips.

        Args:
            ips: list of IPs.
        Returns:
            A dict with the IP as key and the VT report as value.
        """
        api_name = 'virustotal-ip-address-reports'

        found, missing_ips = self._bulk_cache_lookup(api_name, ips)
        fresh_reports = self._request_reports(missing_ips, 'ip_addresses/{}')

        # Reports are paired with the requested IPs by position.
        for ip, report in zip(missing_ips, fresh_reports):
            if self._cache:
                self._cache.cache_value(api_name, ip, report)
            found[ip] = report

        return found

    @MultiRequest.error_handling
    def get_file_search(self, query):
        """Runs advanced searches on samples, matching binary/metadata/
        detection criteria such as file size, file type, first or last
        submission to VT, number of positives, binary content, etc.

        Args:
            query: an enumerable of search query strings; each element is
                substituted as-is into the search URL.
            Example element: 'type:peexe size:90kb+ positives:5+ behaviour:"taskkill"'
        Returns:
            A dict with the extracted response id as key and the VT report
            as value.
        """
        return self._extract_all_responses(
            query, 'intelligence/search?query={}', 'virustotal-file-search')

    @MultiRequest.error_handling
    def get_file_clusters(self, time_frame):
        """Fetches the file behaviour feed for the given time frames.

        Args:
            time_frame: a list of time frames for which we want the
                clustering details in YYYYMMDDhhmm format.
        Returns:
            A dict with the extracted response id as key and the VT report
            as value.
        """
        return self._extract_all_responses(
            time_frame, 'feeds/file-behaviours/{}', 'virustotal-file-clusters')

    def _bulk_cache_lookup(self, api_name, keys):
        """Looks a set of keys up in the cache.

        Without a configured cache nothing is found and the keys are
        handed back unchanged.

        Args:
            api_name: a string name of the API.
            keys: an enumerable of string keys.
        Returns:
            A tuple: (responses found, missing keys).
        """
        if not self._cache:
            return ({}, keys)

        cached = self._cache.bulk_lookup(api_name, keys)
        not_cached = [key for key in keys if key not in cached]
        return (cached, not_cached)

    def _request_reports(self, ids, endpoint_name, file_download=False):
        """Requests one URL per id from a particular endpoint.

        Args:
            ids: list of the ids to substitute into the endpoint.
            endpoint_name: VirusTotal endpoint URL suffix with a `{}`
                placeholder for the id.
            file_download: boolean, whether a file download is expected
        Returns:
            A list of the responses; empty when there are no ids.
        """
        urls = []
        for resource_id in ids:
            endpoint = endpoint_name.format(resource_id)
            urls.append('{}{}'.format(self.BASE_DOMAIN, endpoint))

        # No request is issued when there is nothing to ask for.
        if not urls:
            return []
        return self._requests.multi_get(urls, file_download=file_download)

    def _extract_cache_id(self, response):
        """Derives the id under which a response is stored.

        Args:
            response: response object with a 'data' entry that is either
                an item or a list of items carrying an 'id'.
        Returns:
            The id of the (first) data item, cut at the first underscore,
            or None when the data list is empty.
        """
        data = response['data']
        if isinstance(data, list):
            # A list result is identified by its first item, if any.
            cache_id = data[0]['id'] if data else None
        else:
            cache_id = data['id']

        # sandbox id output has an underscore as the separator
        if cache_id and '_' in cache_id:
            cache_id = cache_id.partition('_')[0]
        return cache_id

    def _extract_all_responses(self,
                               resources,
                               api_endpoint,
                               api_name,
                               file_download=False):
        """Serves resources from the cache and requests the missing ones.

        Args:
            resources: list of string hashes.
            api_endpoint: endpoint path
            api_name: endpoint name
            file_download: boolean, whether a file download is expected
        Returns:
            A dict with the extracted response id as key and the VT report
            as value.
        """
        collected, missing = self._bulk_cache_lookup(api_name, resources)
        fresh_chunks = self._request_reports(missing, api_endpoint,
                                             file_download)
        self._extract_response_chunks(collected, fresh_chunks, api_name)
        return collected

    def _extract_response_chunks(self, all_responses, response_chunks,
                                 api_name):
        """Stores every usable response of the chunks under its cache id.

        Empty responses and responses without a cache id are skipped.
        The remaining ones are written to the cache (when configured) and
        added to all_responses.

        Args:
            all_responses: a dict with the already cached responses; it is
                updated in place.
            response_chunks: a list with response chunks; a chunk is a
                single response or a list of responses.
            api_name: a string name of the API.
        """
        for chunk in response_chunks:
            responses = chunk if isinstance(chunk, list) else [chunk]
            for response in responses:
                if not response:
                    continue

                cache_id = self._extract_cache_id(response)
                if not cache_id:
                    continue

                if self._cache:
                    self._cache.cache_value(api_name, cache_id, response)
                all_responses[cache_id] = response
Esempio n. 7
0
class AlexaRankingApi(object):
    """Retrieves Alexa ranking attributes for domains, with optional caching.

    Every domain is requested from `BASE_URL` with the domain passed as the
    `url` query parameter; the XML answer is reduced to a small dictionary
    of ranking attributes.
    """

    BASE_URL = u'https://data.alexa.com/data?cli=10'

    # Maps every XML tag of interest to the name of the attribute of that
    # tag whose value is reported for it.
    _ALEXA_KEYS = {'POPULARITY': 'TEXT', 'REACH': 'RANK', 'RANK': 'DELTA'}

    def __init__(self,
                 resources_per_req=10,
                 cache_file_name=None,
                 update_cache=True,
                 req_timeout=None):
        """Establishes basic HTTP params and loads a cache.

        Args:
            resources_per_req: Maximum number of resources (hashes, URLs)
                to be send in a single request
            cache_file_name: String file name of cache.
            update_cache: Determines whether cache should be written out
                          back to the disk when closing it.
                          Default is `True`.
            req_timeout: Maximum number of seconds to wait without reading
                         a response byte before deciding an error has occurred.
                         Default is None.
        """
        self._resources_per_req = resources_per_req
        self._requests = MultiRequest(req_timeout=req_timeout)

        # Create an ApiCache if instructed to
        self._cache = ApiCache(cache_file_name,
                               update_cache) if cache_file_name else None

    @MultiRequest.error_handling
    def get_alexa_rankings(self, domains):
        """Retrieves the Alexa ranking attributes for a set of domains.

        Domains found in the cache are served from it; the remaining ones
        are requested and their parsed results are added to the cache.

        Args:
            domains: list of string domains.
        Returns:
            A dict with the domain as key and, for every requested domain,
            a dict of the form {'attributes': {...}} as value.
        """
        api_name = 'alexa_rankings'

        # The domains are iterated several times below (cache lookup,
        # request building, pairing with the responses), so materialize
        # them once to stay correct for one-shot iterables.
        domains = list(domains)

        (all_responses, domains) = self._bulk_cache_lookup(api_name, domains)
        responses = self._request_reports(domains)

        for domain, response in zip(domains, responses):
            xml_response = self._extract_response_xml(domain, response)
            if self._cache:
                # Cache the parsed attributes, not the raw HTTP response:
                # cached entries are handed back to the caller as-is, so
                # they must have the same shape as freshly parsed results.
                self._cache.cache_value(api_name, domain, xml_response)
            all_responses[domain] = xml_response

        return all_responses

    def _request_reports(self, domains):
        """Sends one request per domain to the Alexa endpoint.

        Args:
            domains: list of string domains.
        Returns:
            A list of the responses, as returned by the multi-request
            helper (presumably one per domain, in the same order - the
            caller pairs them up with the domains by position).
        """
        params = [{'url': domain} for domain in domains]
        responses = self._requests.multi_get(self.BASE_URL,
                                             query_params=params,
                                             to_json=False)
        return responses

    def _extract_response_xml(self, domain, response):
        """Extract XML content of an HTTP response into dictionary format.

        Args:
            domain: string domain the response was requested for.
            response: HTML Response objects
        Returns:
            A dictionary {'attributes': {...}} where the inner dictionary
            holds the lower-cased Alexa keys found in the response and
            always the 'domain' key.
        """
        attributes = {}
        try:
            # NOTE(review): this reads the private `_content` attribute of
            # the response - consider the public accessor once the response
            # type is confirmed.
            xml_root = ET.fromstring(response._content)
        except ParseError:
            # Skip ill-formatted XML and return no Alexa attributes
            pass
        else:
            for xml_child in xml_root.findall('SD//'):
                value_attr = self._ALEXA_KEYS.get(xml_child.tag)
                if value_attr is not None and value_attr in xml_child.attrib:
                    attributes[xml_child.tag.lower()] = \
                        xml_child.attrib[value_attr]
        attributes['domain'] = domain
        return {'attributes': attributes}

    def _bulk_cache_lookup(self, api_name, keys):
        """Performes a bulk cache lookup and returns a tuple with the results
        found and the keys missing in the cache. If cached is not configured
        it will return an empty dictionary of found results and the initial
        list of keys.

        Args:
            api_name: a string name of the API.
            keys: an enumerable of string keys.
        Returns:
            A tuple: (responses found, missing keys).
        """
        if self._cache:
            responses = self._cache.bulk_lookup(api_name, keys)
            missing_keys = [key for key in keys if key not in responses]
            return (responses, missing_keys)

        return ({}, keys)