def test_chunked_by_partsize_divisible(self):
    # Part size 5 against self.seq1: the expected result is three full
    # parts covering 0..14 in order, with no short trailing part.
    wanted = [
        [0, 1, 2, 3, 4],
        [5, 6, 7, 8, 9],
        [10, 11, 12, 13, 14],
    ]
    chunks = sequt.chunked_by_partsize(self.seq1, partsize=5)
    self.assertListEqual(chunks, wanted)
def test_chunked_by_partsize_nondivisible(self):
    # Part size 4 against self.seq1: the expected result is three full
    # parts followed by a shorter final part holding the leftover items.
    wanted = [
        [0, 1, 2, 3],
        [4, 5, 6, 7],
        [8, 9, 10, 11],
        [12, 13, 14],
    ]
    chunks = sequt.chunked_by_partsize(self.seq1, 4)
    self.assertListEqual(chunks, wanted)
def crawl(domains):
    """
    Crawl WOT reputations and confidences for a sequence of domains.

    The domains are split with ``sequt.chunked_by_partsize(domains, 98,
    shuffle=True)`` and every chunk is fetched with one request to the WOT
    ``public_link_json`` endpoint through ``_crawl_chunk``. The crawl stops
    at the first response whose status code is not 200; whatever was
    collected up to that point is still returned.

    Returns a ``(target, scores)`` tuple of two dicts:

    'target': Maps a queried domain to its effective WOT scoring target,
              only for domains where the two differ,
              e.g. 'www.cs.umn.edu' -> 'cs.umn.edu'
    'scores': Maps a WOT target to a list of 4 entries, one for each of the
              response keys '0', '1', '2' and '4' (in that order). An entry
              is whatever the API returned under that key -- presumably a
              <reputation, confidence> pair -- or (0, 0) when the key is
              absent, e.g. [(91,56), (91,40), (83,5), (0,0)]
    """
    X0124 = '0124'
    WOT_API = 'http://api.mywot.com/0.4/public_link_json?hosts=%s/'
    t0 = time.time()
    target = {}
    scores = {}

    # auto keep-alive (i.e. reusing TCP conn) within a session.
    # Only requests < 1.0 exposes session.config; later releases removed it
    # and reuse connections on their own, so the switch is skipped there.
    session = requests.session()
    config = getattr(session, 'config', None)
    if config is not None:
        config['keep_alive'] = True

    domainchunks = sequt.chunked_by_partsize(domains, 98, shuffle=True)
    print('Total domains: %d' % len(domains))
    print('Total chunks: %d' % len(domainchunks))

    for i, domainchunk in enumerate(domainchunks):
        if i % 500 == 0:
            print('%ds: crawling chunk no.%d' % (time.time() - t0, i))
        url = WOT_API % '/'.join(domainchunk)
        response, session = _crawl_chunk(url, session)
        statuscode = response.status_code
        if statuscode != 200:
            # Any failed request aborts the crawl; report why before leaving.
            if statuscode == 403:
                print('Invalid URL or target name, URL=')
                print(url)
            elif statuscode == 500:
                print('server-side error')
            else:
                print('unexpected HTTP status %d' % statuscode)
            break
        time.sleep(0.1)  # being nice

        # response.json is a property in requests < 1.0 and a method from
        # 1.0 on; support both.
        jsondict = response.json
        if callable(jsondict):
            jsondict = jsondict()

        for domain, valuedict in jsondict.items():
            t = valuedict['target']
            if domain != t:
                target[domain] = t
            scores[t] = [valuedict.get(x, (0, 0)) for x in X0124]

    return target, scores
def test_chunked_by_partsize_nondivisible(self):
    # With a part size of 4 the expected split of self.seq1 ends in a
    # shorter part ([12, 13, 14]) after three full ones.
    expected_parts = [
        [0, 1, 2, 3],
        [4, 5, 6, 7],
        [8, 9, 10, 11],
        [12, 13, 14],
    ]
    actual_parts = sequt.chunked_by_partsize(self.seq1, 4)
    self.assertListEqual(actual_parts, expected_parts)
def test_chunked_by_partsize_divisible(self):
    # With a part size of 5 the expected split of self.seq1 is exactly
    # three full parts covering 0..14, leaving no remainder.
    expected_parts = [
        [0, 1, 2, 3, 4],
        [5, 6, 7, 8, 9],
        [10, 11, 12, 13, 14],
    ]
    actual_parts = sequt.chunked_by_partsize(self.seq1, partsize=5)
    self.assertListEqual(actual_parts, expected_parts)