Esempio n. 1
0
 def test_redis_url_noexists_referer(self):
     """Insert a page whose referer page was never stored.

     Only the linked page is handed to ``insert_htmls``; the page it names
     as referer is built locally but not inserted.  The URL ranking is then
     expected to hold exactly one member.
     """
     referer_page = Html(url='http://rs.crosswarp.com/')
     linked_page = Html(url='http://www.crosswarp.com/', priority=10)

     # Wire the two pages together in both directions.
     referer_page.destinations.append(linked_page.url)
     linked_page.referer = referer_page.url

     # Store the linked page only; the referer page stays unknown to the store.
     self.data_access.insert_htmls([linked_page])

     ranked_count = self.cli.zcard(self.data_access.url_rank)
     self.assertEqual(ranked_count, 1)
Esempio n. 2
0
 def get_next_url(self):
     """Claim the next uncrawled document and return it as an ``Html``.

     A single ``find_and_modify`` call selects a document that has no
     ``crawled_at`` field (sorted by ``priority`` descending) and stamps it
     with the current local time.

     Returns:
         An ``Html`` populated from the matched document, or ``None`` when
         no document matches.
     """
     not_yet_crawled = {'crawled_at': {'$exists': False}}
     mark_crawled = {'$set': {'crawled_at': datetime.now()}}
     record = self.htmls.find_and_modify(
         query=not_yet_crawled,
         update=mark_crawled,
         upsert=False,
         sort={'priority': -1})
     if record is not None:
         page = Html()
         page.from_dict(record)
         return page
     return None
Esempio n. 3
0
 def get_next_url(self):
     """Pop the top-ranked URL from the Redis sorted set and return it.

     The member with the highest score in ``self.url_rank`` is read, its
     JSON payload is fetched from ``self.prefix + <member>``, and the member
     is removed from the ranking inside a MULTI/EXEC transaction.

     Returns:
         An ``Html`` built from the stored JSON with ``crawled_at`` set to
         the current local time, or ``None`` when the ranking is empty.
     """
     if self.cli.zcard(self.url_rank) == 0:
         return None
     with self.cli.pipeline() as p:
         while True:
             try:
                 # Watch the ranking set that is actually read below, so a
                 # concurrent pop of the same member aborts the transaction
                 # and triggers a retry.  The original 'get_next_url' key is
                 # still watched in case other code relies on touching it.
                 p.watch('get_next_url', self.url_rank)
                 md5hash_list = p.zrevrange(self.url_rank, 0, 0)
                 if not md5hash_list:
                     # The set was drained after the zcard check above;
                     # retrying here would spin forever, so report "nothing
                     # to crawl".  Leaving the ``with`` block resets the
                     # pipeline and drops the WATCH.
                     return None
                 md5hash = md5hash_list[0]
                 url_json = p.get(self.prefix + md5hash)
                 p.multi()
                 p.zrem(self.url_rank, md5hash)
                 p.execute()
                 html = Html(json_str=url_json)
                 html.crawled_at = datetime.now()
                 return html
             except redis.WatchError:
                 # A watched key changed between WATCH and EXEC: retry.
                 self.logger.exception({'datasource': 'redis', 'message': 'redis.WatchError', 'method': 'get_next_url'})
                 continue
             except Exception:
                 # Best-effort retry on ordinary errors, but let
                 # KeyboardInterrupt / SystemExit propagate so the worker
                 # can be stopped.
                 self.logger.exception({'datasource': 'redis', 'message': 'Exception', 'method': 'get_next_url'})
                 continue
Esempio n. 4
0
 def test_html_to_json(self):
     """Check the JSON produced by a default-constructed ``Html``.

     The serialized text is decoded before comparison so the assertion does
     not depend on the order in which ``to_json`` happens to emit its keys.
     """
     # Local import keeps this test self-contained.
     import json

     m = Html()
     expected = {
         'priority': 1,
         'response_code': 0,
         # This value is the MD5 digest of an empty input.
         'md5hash': 'd41d8cd98f00b204e9800998ecf8427e',
     }
     self.assertEqual(json.loads(m.to_json()), expected)