Esempio n. 1
0
def main():
    """Fetch a few AWIS metrics for the sites given on the command line.

    Required command-line options: ``--key-id``, ``--secret-key`` and one or
    more ``--sites``.  The raw response is printed first, followed by the tag
    and text of every ``UrlInfoResult`` element and of the ``DataUrl`` and
    ``Rank`` elements found inside it.

    Raises:
        AssertionError: if the response has no ``StatusCode`` element or its
            text is not ``'Success'``.
    """
    argparser = configargparse.ArgumentParser(
        description="AWIS API Proof of Concept")

    argparser.add_argument('--key-id', required=True)
    argparser.add_argument('--secret-key', required=True)
    argparser.add_argument('--sites', required=True, nargs='+')
    args = argparser.parse_args()

    client = AwisApi(args.key_id, args.secret_key)

    tree = client.url_info(args.sites, "Rank", "LinksInCount", "Speed")
    # Single-argument print(...) produces the same output whether print is a
    # statement (Python 2) or a function (Python 3).
    print(etree_tostring(tree))

    print("client ns_prefixes:  %s" % (client.NS_PREFIXES,))
    alexa_prefix = client.NS_PREFIXES['alexa']
    awis_prefix = client.NS_PREFIXES['awis']

    # './/' is the supported spelling of "any descendant"; a leading '//' on
    # a tree-level search is deprecated and only works through a compatibility
    # rewrite to './/'.
    status = tree.find('.//{%s}StatusCode' % alexa_prefix)
    if status is None or status.text != 'Success':
        # Raised explicitly rather than via ``assert`` so the check still runs
        # under ``python -O`` and a missing element gives a clear message.
        raise AssertionError(
            'AWIS request did not succeed, status: %s'
            % (None if status is None else status.text,))

    for elem_result in tree.findall('.//{%s}UrlInfoResult' % awis_prefix):
        print("elem_result tag: %s, text: %s" % (elem_result.tag,
                                                 elem_result.text))

        # Search below this result only, so values of other sites in the same
        # response are not picked up.
        elem_url = elem_result.find('.//{%s}DataUrl' % awis_prefix)
        if elem_url is not None:
            print("elem_url tag: %s, text: %s" % (elem_url.tag, elem_url.text))
        elem_metric = elem_result.find('.//{%s}Rank' % awis_prefix)
        if elem_metric is not None:
            print("elem_metric tag: %s, text: %s " % (elem_metric.tag,
                                                      elem_metric.text))
Esempio n. 2
0
    def get_metrics(cls, domains, metrics, options):
        """Fetch the requested AWIS metrics for each domain.

        Args:
            domains: sequence of domains to query; every result is checked
                against the domain at the same position.
            metrics: names passed to ``url_info``; each one is also looked up
                by name inside every ``UrlInfoResult``.
            options: object exposing ``key_id`` and ``secret_key``.

        Returns:
            A list with one ``{metric: text}`` dict per ``UrlInfoResult``, in
            response order.

        Raises:
            UserWarning: if the response status is missing or is not
                ``'Success'``, or if a requested metric is absent from a
                result.
            AssertionError: if a result's ``DataUrl`` does not match the
                domain at the same position.
        """
        awis_client = AwisApi(options.key_id, options.secret_key)

        tree = awis_client.url_info(domains, *metrics)

        alexa_prefix = awis_client.NS_PREFIXES['alexa']
        awis_prefix = awis_client.NS_PREFIXES['awis']

        # './/' is the supported "any descendant" path; a leading '//' on a
        # tree-level search is deprecated.
        status = tree.find('.//{%s}StatusCode' % alexa_prefix)
        # A response without a StatusCode is reported the same way as a
        # failed one instead of crashing on ``None.text``.
        if status is None or status.text != 'Success':
            raise UserWarning('unable to get metrics: %s' %
                              etree_tostring(tree))

        metric_values = []
        results = tree.findall('.//{%s}UrlInfoResult' % awis_prefix)
        for result_count, elem_result in enumerate(results):
            # Wrap the result so lookups and the error dump below are scoped
            # to this single UrlInfoResult.
            tree_result = ElementTree(elem_result)
            domain = None
            elem_url = tree_result.find('.//{%s}DataUrl' % awis_prefix)
            if elem_url is not None:
                domain = elem_url.text
                # Drop one trailing slash; the truthiness test keeps empty or
                # missing text from raising before the sanity check runs.
                if domain and domain.endswith("/"):
                    domain = domain[:-1]

            assert domain == domains[result_count], \
                "sanity check %s == %s" % (domain, domains[result_count])

            domain_metrics = {}
            for metric in metrics:
                elem_metric = tree_result.find('.//{%s}%s' %
                                               (awis_prefix, metric))
                if elem_metric is None:
                    raise UserWarning(
                        'unable to find metric within UrlInfoResult: %s'
                        % etree_tostring(tree_result))
                domain_metrics[metric] = elem_metric.text
            metric_values.append(domain_metrics)

        print("success: %s" % metric_values)
        return metric_values
Esempio n. 3
0
	def start( self, baseUrl ):
		"""Collect Alexa (AWIS) data for ``baseUrl`` and pass it to ``sendAndSaveReport``.

		When the AWIS status is ``'Success'`` the report ``content`` receives:
		``visitorsLocation`` (HTML list built from the first three per-country
		ranks), ``relatedLinks`` (HTML list built from the first five related
		links), ``worldRank`` (text of the first ``Rank`` element, or None)
		and, when a median load time is present, ``loadTimeMs``.  For any
		other status, or when no status element is found, ``content`` is sent
		empty.

		Values taken from the response are HTML-escaped before being placed
		into markup.
		"""
		# Function-scope import: standard library, same API on Python 2 and 3.
		from xml.sax.saxutils import escape

		def html( value ):
			# '%s' stringifies the way the templates below would (None becomes
			# 'None'); the extra entity makes the result safe inside
			# double-quoted attributes as well as in element text.
			return escape( '%s' % ( value, ), { '"': '&quot;' } )

		def first_child( parent, tag ):
			# First child node of the first ``tag`` descendant, or None.
			nodes = parent.getElementsByTagName( tag )
			return nodes[0].firstChild if nodes else None

		def awis_text( tag ):
			# Text of the first ``tag`` element in the AWIS namespace, or None.
			# './/' is the supported "any descendant" path; a leading '//' on a
			# tree-level search is deprecated.
			node = respXml.find( './/{%s}%s' % ( api.NS_PREFIXES['awis'], tag ) )
			return None if node is None else node.text

		queryUrl = 'http://' + baseUrl

		content = {}

		api = AwisApi(aws_config['accessKeyId'], aws_config['secretAccessKey'])
		respXml = api.url_info(
			queryUrl, 'RelatedLinks', 'Categories', 'Rank', 'RankByCountry',
			'UsageStats', 'ContactInfo', 'Speed', 'Language', 'Keywords',
			'OwnedDomains', 'LinksInCount', 'SiteData' )
		xml = etree.tostring( respXml, encoding = 'UTF-8' )

		status = respXml.find( './/{%s}StatusCode' % api.NS_PREFIXES['alexa'] )
		if status is not None and 'Success' == status.text:
			dom_doc = parseString( xml )

			rank_list_items = []
			for country in dom_doc.getElementsByTagName( 'aws:Country' ):
				country_code = country.getAttribute( 'Code' )
				country_name = country_name_by_code( country_code )
				rank_node = first_child( country, 'aws:Rank' )
				if rank_node is None:
					# No rank value for this country: nothing to show.
					continue
				try:
					rank_list_items.append( '<li>%(rank)s<sup>th</sup> most visited website in <img src="/images/flags/%(countryCode)s.png" alt="%(countryName)s flag" /> %(countryName)s</li>' % {
						'countryCode': html( country_code.lower() ),
						'countryName': html( country_name ),
						'rank': html( rank_node.nodeValue ) } )
				except Exception:
					# Deliberate best effort: an entry that cannot be rendered
					# (e.g. mixed text encodings) is left out of the list.
					# Unlike a bare ``except`` this lets KeyboardInterrupt and
					# SystemExit through.
					continue
			content['visitorsLocation'] = '<ul>' + ''.join( rank_list_items[:3] ) + '</ul>'

			related_list_items = []
			for related in dom_doc.getElementsByTagName( 'aws:RelatedLink' ):
				url_node = first_child( related, 'aws:NavigableUrl' )
				title_node = first_child( related, 'aws:Title' )
				if url_node is None or title_node is None:
					# Incomplete entry: skip it rather than abort the report.
					continue
				related_list_items.append( '<li><a href="%s" rel="nofollow" class="external" target="_blank">%s</a></li>' % ( html( url_node.nodeValue ), html( title_node.nodeValue ) ) )
			content['relatedLinks'] = '<ul>' + ''.join( related_list_items[:5] ) + '</ul>'

			content['worldRank'] = awis_text( 'Rank' )

			temp = awis_text( 'MedianLoadTime' )
			if temp is not None:
				# int() exists on Python 2 and 3 and grows to arbitrary size.
				content['loadTimeMs'] = int( temp )
				# NOTE: the Percentile element is not read here; its value
				# was never used, and parsing it could only abort the report
				# when the element was missing.

		self.sendAndSaveReport( baseUrl, content )
Esempio n. 4
0
import os

from awis import AwisApi

# Credentials come from the environment; both variables must be non-empty.
ACCESS_ID = os.environ.get("AWS_ACCESS_ID")
SECRET_ACCESS_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY")
if not (ACCESS_ID and SECRET_ACCESS_KEY):
    # Raised explicitly rather than via ``assert`` so the check still runs
    # under ``python -O``; the exception type is the one ``assert`` raises.
    raise AssertionError("You must set credentials in the environment.")

api = AwisApi(ACCESS_ID, SECRET_ACCESS_KEY)
# Alternative request, kept for reference:
# tree = api.top_info(count=10, offset=0, path='Top', recursive=True, descriptions=True)
tree = api.url_info("www.dailydot.com", "Rank", "LinksInCount")
Esempio n. 5
0
# AUTHOR: JOHN SKANDALAKIS
# USE: This program queries the Alexa API to get the URLs associated with an
# Alexa category. To use it you need an Alexa API key file ("rootkey.csv") as
# provided by AWS, located in the directory the script is run from.

from awis import AwisApi
import os.path

# Read the credentials from rootkey.csv and create the Alexa client.
# The first line holds the access key id and the second the secret key,
# each written as "name=value".
ACCESS_ID = None
SECRET_ACCESS_KEY = None
with open("rootkey.csv", "r") as keyfile:
    # Split on the first "=" only, so a value that itself contains "=" is
    # kept whole instead of being cut at its first "=".
    ACCESS_ID = keyfile.readline().split("=", 1)[1].strip()
    SECRET_ACCESS_KEY = keyfile.readline().split("=", 1)[1].strip()
api = AwisApi(ACCESS_ID, SECRET_ACCESS_KEY)

# Walk the categories to fetch from Alexa, one per line of categories.csv.
with open('categories.csv', 'r') as c:

    for category in c:
        category = category.strip()
        if not category:
            # Blank line: there is no category to query.
            continue

        # Output file for this category; "/" in the category path becomes "-".
        fname = "categories/" + category.replace('/', '-')

        # Skip categories that were already fetched, so no paid request is
        # repeated.
        if os.path.isfile(fname):
            # Single-argument print(...) gives the same output on Python 2
            # and Python 3.
            print("%s already exists" % fname)
            continue

        i = 1
Esempio n. 6
0
# USE: This program queries the Alexa API to get the URLs associated with an
# Alexa category. To use it you need an Alexa API key file ("rootkey.csv") as
# provided by AWS, located in the directory the script is run from.


from awis import AwisApi
import os.path


# Load the API credentials from rootkey.csv and build the Alexa client.
# Line one carries the access key id, line two the secret key, both in
# "name=value" form.
ACCESS_ID = None
SECRET_ACCESS_KEY = None
with open("rootkey.csv", "r") as keyfile:
    # maxsplit=1 keeps everything after the first "=", so a value containing
    # "=" is not truncated.
    ACCESS_ID = keyfile.readline().split("=", 1)[1].strip()
    SECRET_ACCESS_KEY = keyfile.readline().split("=", 1)[1].strip()
api = AwisApi(ACCESS_ID, SECRET_ACCESS_KEY)

# Go through the categories to take from Alexa (one per line of the file).
with open('categories.csv', 'r') as c:

    for category in c:
        category = category.strip()
        if not category:
            # Nothing to query for an empty line.
            continue

        # Per-category output file; "/" in the category path is replaced
        # by "-".
        fname = "categories/" + category.replace('/', '-')

        # Do not fetch a category twice, so no money is spent on a request
        # whose result is already on disk.
        if os.path.isfile(fname):
            # Parenthesised single-argument form works on Python 2 and 3.
            print("%s already exists" % fname)
            continue
Esempio n. 7
0
def test_unicode():
    """A category path with non-ASCII characters yields at least one listing."""
    client = AwisApi(os.environ["AWS_ACCESS_ID"],
                     os.environ["AWS_SECRET_ACCESS_KEY"])

    response = client.category_listings("Top/World/Dansk/Børn_og_unge/Kultur")
    found = response.findall(".//awis:Listing", AwisApi.NS_PREFIXES)

    assert len(found) > 0
Esempio n. 8
0
 def __enter__(self):
     """Start a session: build the API client and reset the per-session stores."""
     # Client built from the stored credential pair.
     self.api = AwisApi(*self.auth)
     # Per-session results and the list of raw responses both start empty.
     self.session_result = dict()
     self.session_list_of_raw_result = list()
     return self
Esempio n. 9
0
class AWISContextManager:
    """Context-managed AWIS session with a result cache and threaded requests.

    Entering the context creates the ``AwisApi`` client and empty per-session
    stores; leaving it writes every collected result to the cache and marks
    the session as closed.
    """

    def __init__(self, access_id, secret_access_key, workers_count=5):
        """Store the UTF-8 encoded credentials and create the cache.

        ``workers_count`` is the size of the thread pool used by
        ``url_info``.
        """
        credentials = (access_id, secret_access_key)
        self.auth = tuple(item.encode('utf-8') for item in credentials)
        self.workers_count = workers_count
        self.closed = False
        self.cache = Cache()

    def __enter__(self):
        """Create the API client and reset the per-session stores."""
        self.api = AwisApi(*self.auth)
        self.session_result = {}
        self.session_list_of_raw_result = []
        return self

    def __exit__(self, *args):
        """Persist the session results in the cache and close the session."""
        # New entries are added and existing ones overwritten.
        store = self.cache.set
        for domain, row in self.session_result.items():
            store(domain, row)

        self.closed = True

    def url_info(self, domains, *categories):
        """Run ``AwisApi.url_info`` over ``domains`` in parallel chunks.

        Domains found in the cache are not requested again.  Raw responses
        are appended to ``session_list_of_raw_result`` as they complete.

        Raises:
            AttributeError: if the session has already been closed.
        """
        if self.closed:
            raise AttributeError('Session is closed')

        CHUNK_SIZE = 5  # AWIS`s limit

        def request(domains, categories):
            logger.info('AWIS request for %s', domains)
            return self.api.url_info(domains, *categories)

        with ThreadPoolExecutor(self.workers_count) as executor:
            tasks = []
            for chunk in chunks(CHUNK_SIZE, self.handle_cache(domains)):
                tasks.append(executor.submit(request, chunk, categories))

            collect = self.session_list_of_raw_result.append
            for future in as_completed(tasks):
                collect(future.result())

    def get_value(self, root, path, default=None):
        """Return the text of the first node matching ``path`` under ``root``.

        ``default`` is returned when no node matches.
        """
        xpath = self.handle_path(path)
        match = root.find(xpath, self.api.NS_PREFIXES)
        return getattr(match, 'text', default)

    def iter_results(self, path):
        """Yield ``(domain, node, result_row)`` for every node matching ``path``.

        ``result_row`` is the per-domain entry of ``session_result``; it is
        created on first use as a dict whose missing keys read as None.
        """
        xpath = self.handle_path(path)
        for raw_tree in self.session_list_of_raw_result:
            for match in raw_tree.findall(xpath, self.api.NS_PREFIXES):
                domain = self.get_value(match, 'DataUrl')
                row = self.session_result.setdefault(
                    domain, defaultdict(lambda: None))

                yield domain, match, row

    @staticmethod
    def handle_path(path):
        """Turn a plain ``A/B`` path into ``.//awis:A/awis:B``."""
        # TODO: compile reg-exp?
        # Leading/trailing '.' and '/' characters are removed first.
        trimmed = path.strip('.//').strip('/')
        prefixed = trimmed.replace('/', '/awis:')
        return './/awis:%s' % prefixed

    def handle_cache(self, domains):
        """Yield the domains that still need a request.

        Domains with a truthy cached value are copied into
        ``session_result`` instead of being yielded.
        """
        for domain in domains:
            hit = self.cache.get(domain)

            if not hit:
                yield domain
            else:
                self.session_result[domain] = hit
Esempio n. 10
0
    def start(self, baseUrl):
        """Build the Alexa (AWIS) report for ``baseUrl`` and send it.

        On an AWIS status of ``'Success'`` the ``content`` dict handed to
        ``sendAndSaveReport`` contains ``visitorsLocation`` (HTML list of the
        first three per-country ranks), ``relatedLinks`` (HTML list of the
        first five related links), ``worldRank`` (text of the first ``Rank``
        element, or None) and, if a median load time is present,
        ``loadTimeMs``.  Otherwise, including when the status element is
        missing, ``content`` is sent empty.

        Text taken from the response is HTML-escaped before it is inserted
        into markup.
        """
        # Local import: standard library, available on Python 2 and 3 alike.
        from xml.sax.saxutils import escape

        def html(value):
            # '%s' stringifies as the templates below would (None becomes
            # 'None'); escaping the double quote too makes the result usable
            # inside double-quoted attributes.
            return escape('%s' % (value,), {'"': '&quot;'})

        def first_child(parent, tag):
            # First child node of the first ``tag`` descendant, or None.
            nodes = parent.getElementsByTagName(tag)
            return nodes[0].firstChild if nodes else None

        def awis_text(tag):
            # Text of the first ``tag`` element in the AWIS namespace, or
            # None.  './/' is the supported "any descendant" path; a leading
            # '//' on a tree-level search is deprecated.
            node = respXml.find('.//{%s}%s' % (api.NS_PREFIXES['awis'], tag))
            return None if node is None else node.text

        queryUrl = 'http://' + baseUrl

        content = {}

        api = AwisApi(aws_config['accessKeyId'], aws_config['secretAccessKey'])
        respXml = api.url_info(queryUrl, 'RelatedLinks', 'Categories', 'Rank',
                               'RankByCountry', 'UsageStats', 'ContactInfo',
                               'Speed', 'Language', 'Keywords', 'OwnedDomains',
                               'LinksInCount', 'SiteData')
        xml = etree.tostring(respXml, encoding='UTF-8')

        status = respXml.find('.//{%s}StatusCode' % api.NS_PREFIXES['alexa'])
        if status is not None and 'Success' == status.text:
            dom_doc = parseString(xml)

            rank_list_items = []
            for country in dom_doc.getElementsByTagName('aws:Country'):
                country_code = country.getAttribute('Code')
                country_name = country_name_by_code(country_code)
                rank_node = first_child(country, 'aws:Rank')
                if rank_node is None:
                    # This country carries no rank value.
                    continue
                try:
                    rank_list_items.append(
                        '<li>%(rank)s<sup>th</sup> most visited website in <img src="/images/flags/%(countryCode)s.png" alt="%(countryName)s flag" /> %(countryName)s</li>'
                        % {
                            'countryCode': html(country_code.lower()),
                            'countryName': html(country_name),
                            'rank': html(rank_node.nodeValue),
                        })
                except Exception:
                    # Deliberate best effort: an entry that cannot be
                    # rendered (e.g. mixed text encodings) is omitted.  Unlike
                    # a bare ``except`` this does not swallow
                    # KeyboardInterrupt or SystemExit.
                    continue
            content['visitorsLocation'] = '<ul>' + ''.join(
                rank_list_items[:3]) + '</ul>'

            related_list_items = []
            for related in dom_doc.getElementsByTagName('aws:RelatedLink'):
                url_node = first_child(related, 'aws:NavigableUrl')
                title_node = first_child(related, 'aws:Title')
                if url_node is None or title_node is None:
                    # Incomplete entry: skip it instead of aborting the
                    # whole report.
                    continue
                related_list_items.append(
                    '<li><a href="%s" rel="nofollow" class="external" target="_blank">%s</a></li>'
                    % (html(url_node.nodeValue), html(title_node.nodeValue)))
            content['relatedLinks'] = '<ul>' + ''.join(
                related_list_items[:5]) + '</ul>'

            content['worldRank'] = awis_text('Rank')

            temp = awis_text('MedianLoadTime')
            if temp is not None:
                # int() is available on Python 2 and 3 and has no size limit.
                content['loadTimeMs'] = int(temp)
                # NOTE: the Percentile element is not read here; its value
                # was never used, and parsing it could only abort the report
                # when the element was missing.

        self.sendAndSaveReport(baseUrl, content)