def __init__(self, cfg):
    self.from_queue = cfg["from_queue"]
    self.from_host = cfg["from_host"]
    self.from_port = cfg["from_port"]
    self.crawler = cfg["crawler"]
    self.credentials = pika.PlainCredentials(cfg["user"], cfg["password"])
    nodes = Config().get()["SSDBNodes"]
    self.ssdb_clients = get_clients(nodes=nodes)
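# A minimal sketch of the cfg dict this constructor expects. The keys are taken
# from the attribute lookups above; the concrete values are hypothetical and
# only illustrate the shape of the config.
example_cfg = {
    "from_queue": "http_response:weibo",  # queue to consume from (assumed name)
    "from_host": "127.0.0.1",             # RabbitMQ host
    "from_port": 5672,                    # RabbitMQ port
    "crawler": "weibo",                   # crawler name
    "user": "guest",                      # broker credentials
    "password": "guest",
}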
def setUp(self):
    self.crawler_name = 'test'
    self.req_d = {
        'crawler_name': self.crawler_name,
        'url': 'http://stackoverflow.com/users/1144035/gordon-linoff',
        'proxy_name': 'http_china',
        'method': 'GET',
        'headers': {},
        'files': None,
        'data': None,
        'params': {},
        'auth': None,
        'cookies': {},
        'hooks': None,
        'json': None,
        'timeout': 10,
    }
    test_html_file = os.path.join(os.path.dirname(__file__), "test.html")
    with open(test_html_file, 'r') as f:
        html = f.read()
    self.resp_d = {
        'crawler_name': self.crawler_name,
        'http_request': json.dumps(self.req_d),
        'error_code': 0,
        'error_msg': '',
        'status_code': 200,
        'reason': 'OK',
        'html': html,
        'cookies': {},
        'url': 'http://stackoverflow.com/users/1144035/gordon-linoff',
        'headers': {},
        'encoding': None,
        'elapsed': None,
        'http_proxy': '127.0.0.1:8000'
    }
    cfg = Config().get()
    self.ssdb_clients = get_clients(nodes=cfg["SSDBNodes"])
    conn = create_conn(cfg)
    self.publish_channel = conn.channel()
    self.bloomd_client = bloomd.get_client(nodes=cfg["BloomdNodes"])
    self.filter_q = FilterQueue(
        crawler_name=self.crawler_name,
        bloomd_client=self.bloomd_client
    )
    self.req_q = RequestQueue(
        self.crawler_name,
        ssdb_clients=self.ssdb_clients,
        filter_q=self.filter_q
    )
    html_404_strings = [['Page', 'Not', 'Found'], [u"页面不存在"]]
    # Build a minimal stand-in worker carrying only the attributes the plugin reads.
    fake_worker = type("Worker", (object, ), {})
    setattr(fake_worker, "crawler_name", self.crawler_name)
    setattr(fake_worker, "req_q", self.req_q)
    setattr(fake_worker, "publish_channel", self.publish_channel)
    setattr(fake_worker, "html_404_strings", html_404_strings)
    self.error_handler = Plugin(fake_worker)
def get(args):
    k = args.key
    if k == "":
        print "key cannot be empty"
        return
    nodes = Config().get()["SSDBNodes"]
    clients = get_clients(nodes=nodes)
    client = get_client(clients, k)
    r = redis.Redis(connection_pool=client["connection_pool"])
    resp = r.get(k)
    print resp
    print "client: %s:%s" % (client["node"]["Host"], client["node"]["Port"])
def status(args):
    crawler_name = args.crawler_name
    if crawler_name == "":
        print "crawler_name cannot be empty"
        return
    print "show %s crawler http_response status..." % crawler_name
    start = "http_response:%s:" % crawler_name
    end = "http_response:%s:z" % crawler_name
    nodes = Config().get()["SSDBNodes"]
    clients = get_clients(nodes=nodes)
    total = 0
    for client in clients:
        print "%s:%s" % (client["node"]["Host"], client["node"]["Port"])
        r = redis.Redis(connection_pool=client["connection_pool"])
        keys = r.execute_command("keys", start, end, -1)
        total += len(keys)
        print "length: ", len(keys)
    print "total: ", total
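# A hedged sketch of how get() and status() might be wired up as CLI
# subcommands. The argparse layout, flag names, and main() entry point are
# assumptions for illustration, not the project's actual command-line driver.
import argparse

def main():
    parser = argparse.ArgumentParser(description="SSDB inspection tool")
    sub = parser.add_subparsers()

    get_parser = sub.add_parser("get", help="fetch a single key from SSDB")
    get_parser.add_argument("--key", default="")
    get_parser.set_defaults(func=get)

    status_parser = sub.add_parser("status", help="count stored http_response keys")
    status_parser.add_argument("--crawler_name", default="")
    status_parser.set_defaults(func=status)

    args = parser.parse_args()
    args.func(args)

if __name__ == "__main__":
    main()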
def test(): crawler_name = "weibo" cfg = Config().get() ssdb_clients, ring = get_clients(nodes=cfg["SSDBNodes"]) conn = create_conn(cfg) bloomd_client = bloomd.get_client(nodes=cfg["BloomdNodes"]) filter_q = FilterQueue(bloomd_client=bloomd_client, crawler_name=crawler_name) queue_name = "http_request:%s:test" % crawler_name queue = RequestQueue(crawler_name, ssdb_clients=ssdb_clients, filter_q=filter_q, queue_name=queue_name) ch = conn.channel() ch.exchange_declare( exchange=crawler_name, exchange_type="topic", durable=True ) ch.queue_declare( queue=queue_name, durable=True ) ch.queue_bind( exchange=crawler_name, queue=queue_name, routing_key=queue_name ) ch.close() publish_channel = conn.channel() publish_channel.confirm_delivery() page_cnt = 50000 cnt = 0 logger = get_logger("test_queue") start_time = time.time() for i in xrange(1, page_cnt + 1): url = "http://stackoverflow.com/users?page=%d&tab=reputation&filter=week" % i cnt += 1 if cnt % 1000 == 0: logger.info(str(cnt)) r = Request(url=url, timeout=15, headers={}, crawler_name=crawler_name) queue.push(r, publish_channel) end_time = time.time() print "start time: ", start_time print "end time: ", end_time print "speed: %f times/second" % (page_cnt / (end_time - start_time))
def test(): crawler_name = "weibo" cfg = Config().get() ssdb_clients, ring = get_clients(nodes=cfg["SSDBNodes"]) conn = create_conn(cfg) bloomd_client = bloomd.get_client(nodes=cfg["BloomdNodes"]) filter_q = FilterQueue(bloomd_client=bloomd_client, crawler_name=crawler_name) queue_name = "http_request:%s:test" % crawler_name queue = RequestQueue(crawler_name, ssdb_clients=ssdb_clients, filter_q=filter_q, queue_name=queue_name) ch = conn.channel() ch.exchange_declare(exchange=crawler_name, exchange_type="topic", durable=True) ch.queue_declare(queue=queue_name, durable=True) ch.queue_bind(exchange=crawler_name, queue=queue_name, routing_key=queue_name) ch.close() publish_channel = conn.channel() publish_channel.confirm_delivery() page_cnt = 50000 cnt = 0 logger = get_logger("test_queue") start_time = time.time() for i in xrange(1, page_cnt + 1): url = "http://stackoverflow.com/users?page=%d&tab=reputation&filter=week" % i cnt += 1 if cnt % 1000 == 0: logger.info(str(cnt)) r = Request(url=url, timeout=15, headers={}, crawler_name=crawler_name) queue.push(r, publish_channel) end_time = time.time() print "start time: ", start_time print "end time: ", end_time print "speed: %f times/second" % (page_cnt / (end_time - start_time))
def setUp(self):
    cfg = Config().get()
    self.ssdb_clients = get_clients(nodes=cfg["SSDBNodes"])
    conn = create_conn(cfg)
    self.crawler_name = 'test_crawler'
    self.bloomd_client = bloomd.get_client(nodes=cfg["BloomdNodes"])
    self.filter_q = FilterQueue(
        crawler_name=self.crawler_name,
        bloomd_client=self.bloomd_client
    )
    self.req_q = RequestQueue(
        self.crawler_name,
        ssdb_clients=self.ssdb_clients,
        filter_q=self.filter_q
    )
    self.resp_q = ResponseQueue(
        self.crawler_name,
        ssdb_clients=self.ssdb_clients,
    )
    self.publish_channel = conn.channel()
    self.req_d = {
        'crawler_name': self.crawler_name,
        'url': 'http://stackoverflow.com/users/1144035/gordon-linoff',
        'proxy_name': 'http_china',
        'method': 'GET',
        'headers': {},
        'files': None,
        'data': None,
        'params': {},
        'auth': None,
        'cookies': {},
        'hooks': None,
        'json': None,
        'timeout': 10,
    }
    self.req = Request(**self.req_d)
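# A hedged sketch of a test method that could follow this fixture. The
# push(request, channel) call mirrors the usage in the queue benchmark above;
# the method name test_push_request is hypothetical, and no assertions are
# shown because they would depend on the actual RequestQueue API.
def test_push_request(self):
    # Push the prepared request through the request queue via the publish channel.
    self.req_q.push(self.req, self.publish_channel)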