Exemple #1
0
 def test_chunked_by_partsize_divisible(self):
     """Chunk self.seq1 with partsize=5 and expect three full 5-item chunks.

     The expected chunks cover the integers 0..14, so self.seq1 is
     presumably that sequence (fixture is not shown here -- confirm).
     """
     actual = sequt.chunked_by_partsize(self.seq1, partsize=5)
     # Three consecutive runs of five integers: 0-4, 5-9, 10-14.
     wanted = [list(range(lo, lo + 5)) for lo in (0, 5, 10)]
     self.assertListEqual(actual, wanted)
Exemple #2
0
 def test_chunked_by_partsize_nondivisible(self):
     """Chunk self.seq1 with a part size of 4 and expect a short last chunk.

     The expected chunks cover the integers 0..14 (three chunks of four,
     then one of three), so self.seq1 is presumably that sequence
     (fixture is not shown here -- confirm).
     """
     actual = sequt.chunked_by_partsize(self.seq1, 4)
     # Runs start at 0, 4, 8, 12; the last one is cut off at 15.
     wanted = [list(range(lo, min(lo + 4, 15))) for lo in range(0, 15, 4)]
     self.assertListEqual(actual, wanted)
Exemple #3
0
def crawl(domains):
    """ Crawls WOT data for a sequence of domains and returns two dicts,
        'target' and 'scores', as the tuple (target, scores).

        'target':
            Mapping a domain to the 'target' value the API reported for it,
            recorded only when it differs from the domain itself,
            e.g. 'www.cs.umn.edu' -> 'cs.umn.edu'

        'scores':
            Mapping a WOT target to a list of 4 entries, one for each of
            the response keys '0', '1', '2' and '4' (in that order).
            Each entry is whatever the API returned under that key
            (presumably a <reputation,confidence> pair -- confirm against
            the API docs), or (0, 0) when the key is absent,
            e.g. [(91,56), (91,40), (83,5), (0,0)]

        Domains are requested in chunks of 98 hosts per URL.  The crawl
        stops at the first response whose status code is not 200, so the
        returned dicts may cover only part of 'domains'.
    """
    components = ('0', '1', '2', '4')
    WOT_API = 'http://api.mywot.com/0.4/public_link_json?hosts=%s/'
    t0 = time.time()
    target = {}
    scores = {}

    # auto keep-alive (i.e. reusing TCP conn) within a session.
    # Only old requests releases expose 'session.config'; newer ones have
    # no such attribute, so skip the setting instead of crashing.
    session = requests.session()
    config = getattr(session, 'config', None)
    if config is not None:
        config['keep_alive'] = True

    # NOTE(review): 98 presumably keeps each request under a per-request
    # host limit of the API -- confirm.
    domainchunks = sequt.chunked_by_partsize(domains, 98, shuffle=True)
    print('Total domains: %d' % len(domains))
    print('Total chunks: %d' % len(domainchunks))

    for i, domainchunk in enumerate(domainchunks):
        if i % 500 == 0:
            print('%ds: crawling chunk no.%d' % (time.time() - t0, i))
        url = WOT_API % '/'.join(domainchunk)
        response, session = _crawl_chunk(url, session)

        statuscode = response.status_code
        if statuscode != 200:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            if statuscode == 403:
                print('Ivalid URL or target name, URL=')
                print(url)
            elif statuscode == 500:
                print('server-side error')
            else:
                # Previously any other failure aborted the crawl silently.
                print('unexpected HTTP status %d' % statuscode)
            break
        time.sleep(0.1)  # being nice

        # 'json' is a property in old requests releases and a method in
        # newer ones; accept both.
        jsondict = response.json
        if callable(jsondict):
            jsondict = jsondict()

        for domain, valuedict in jsondict.items():
            t = valuedict['target']
            if domain != t:
                target[domain] = t
            scores[t] = [valuedict.get(key, (0, 0)) for key in components]

    return target, scores
Exemple #4
0
 def test_chunked_by_partsize_nondivisible(self):
     """A part size of 4 should give three full chunks and a 3-item tail.

     The expected chunks are slices of the integers 0..14, so self.seq1
     is presumably that sequence (fixture is not shown here -- confirm).
     """
     actual = sequt.chunked_by_partsize(self.seq1, 4)
     numbers = list(range(15))
     wanted = [numbers[0:4], numbers[4:8], numbers[8:12], numbers[12:15]]
     self.assertListEqual(actual, wanted)
Exemple #5
0
 def test_chunked_by_partsize_divisible(self):
     """partsize=5 should split the sequence into three equal chunks.

     The expected chunks are slices of the integers 0..14, so self.seq1
     is presumably that sequence (fixture is not shown here -- confirm).
     """
     actual = sequt.chunked_by_partsize(self.seq1, partsize=5)
     numbers = list(range(15))
     wanted = [numbers[:5], numbers[5:10], numbers[10:]]
     self.assertListEqual(actual, wanted)