def setUp(self):
    self.proxy_name = "http_china"
    self.cfg = Config().get()
    self.proxy_client = get_proxy_client(cfg=self.cfg)
    fake_worker = type('Worker', (object, ), {})
    setattr(fake_worker, "proxy_name", self.proxy_name)
    setattr(fake_worker, "proxy_client", self.proxy_client)
    self.proxy_handler = Plugin(fake_worker)
def __init__(self, cfg):
    self.from_queue = cfg["from_queue"]
    self.from_host = cfg["from_host"]
    self.from_port = cfg["from_port"]
    self.crawler = cfg["crawler"]
    self.credentials = pika.PlainCredentials(cfg["user"], cfg["password"])
    nodes = Config().get()["SSDBNodes"]
    self.ssdb_clients = get_clients(nodes=nodes)
def setUp(self):
    self.crawler_name = 'test'
    self.req_d = {
        'crawler_name': self.crawler_name,
        'url': 'http://stackoverflow.com/users/1144035/gordon-linoff',
        'proxy_name': 'http_china',
        'method': 'GET',
        'headers': {},
        'files': None,
        'data': None,
        'params': {},
        'auth': None,
        'cookies': {},
        'hooks': None,
        'json': None,
        'timeout': 10,
    }
    test_html_file = os.path.join(os.path.dirname(__file__), "test.html")
    with open(test_html_file, 'r') as f:
        html = f.read()
    self.resp_d = {
        'crawler_name': self.crawler_name,
        'http_request': json.dumps(self.req_d),
        'error_code': 0,
        'error_msg': '',
        'status_code': 200,
        'reason': 'OK',
        'html': html,
        'cookies': {},
        'url': 'http://stackoverflow.com/users/1144035/gordon-linoff',
        'headers': {},
        'encoding': None,
        'elapsed': None,
        'http_proxy': '127.0.0.1:8000'
    }
    cfg = Config().get()
    self.ssdb_clients = get_clients(nodes=cfg["SSDBNodes"])
    conn = create_conn(cfg)
    self.publish_channel = conn.channel()
    self.bloomd_client = bloomd.get_client(nodes=cfg["BloomdNodes"])
    self.filter_q = FilterQueue(
        crawler_name=self.crawler_name,
        bloomd_client=self.bloomd_client
    )
    self.req_q = RequestQueue(
        self.crawler_name,
        ssdb_clients=self.ssdb_clients,
        filter_q=self.filter_q
    )
    # u"页面不存在" means "page does not exist"
    html_404_strings = [['Page', 'Not', 'Found'], [u"页面不存在"]]
    fake_worker = type("Worker", (object, ), {})
    setattr(fake_worker, "crawler_name", self.crawler_name)
    setattr(fake_worker, "req_q", self.req_q)
    setattr(fake_worker, "publish_channel", self.publish_channel)
    setattr(fake_worker, "html_404_strings", html_404_strings)
    self.error_handler = Plugin(fake_worker)
def status(args):
    cfg = Config().get()
    client = get_proxy_client(cfg=cfg)
    r = redis.Redis(connection_pool=client["connection_pool"])
    status_str = ["Stopped", "Running"]
    keys = r.hkeys("proxy_config")
    for k in keys:
        content = r.hget("proxy_config", k)
        proxy_config = json.loads(content)
        status = status_str[proxy_config["Status"]]
        print "proxy_config %s Status: %s" % (proxy_config["Name"], status)
def get(args):
    k = args.key
    if k == "":
        print "key can not be empty"
        return
    nodes = Config().get()["SSDBNodes"]
    clients = get_clients(nodes=nodes)
    client = get_client(clients, k)
    r = redis.Redis(connection_pool=client["connection_pool"])
    resp = r.get(k)
    print resp
    print "client: %s:%s" % (client["node"]["Host"], client["node"]["Port"])
def status(args):
    cfg = Config().get()
    status_str = ["Stopped", "Running"]
    client = get_proxy_client(cfg=cfg)
    r = redis.Redis(connection_pool=client["connection_pool"])
    keys = r.hkeys("proxy_config")
    for k in keys:
        content = r.hget("proxy_config", k)
        proxy_config = json.loads(content)
        proxy_name = proxy_config["Name"]
        status = status_str[proxy_config["Status"]]
        cnt = r.scard("http_proxy:%s" % proxy_name)
        print "%s[%s]: %s items" % (proxy_name, status, cnt)
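# Hypothetical helper, not part of the original code: seed one proxy_config
# entry plus a couple of proxy addresses so the status()/stop() commands above
# have data to work with. The "proxy_config" hash, the "http_proxy:<name>" set
# and the "Name"/"Status" fields mirror how they are read above; the proxy
# addresses themselves are placeholders.
import json

def seed_sample_proxy_config(r):
    # r is a redis.Redis client, e.g. built from get_proxy_client() as above
    sample = {"Name": "http_china", "Status": 1}  # 1 -> "Running" in status_str
    r.hset("proxy_config", sample["Name"], json.dumps(sample))
    r.sadd("http_proxy:%s" % sample["Name"], "127.0.0.1:8000", "127.0.0.1:8001")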
def load(args):
    cfg = Config().get()
    client = get_proxy_client(cfg=cfg)
    r = redis.Redis(connection_pool=client["connection_pool"])
    print "load proxy_config ..."
    keys = r.hkeys("proxy_config")
    for key in keys:
        r.hdel("proxy_config", key)
        print "delete proxy_config by key: %s" % key
    for data in get_proxy_json():
        print "Add proxy_config %s" % data["Name"]
        r.hset("proxy_config", data["Name"], json.dumps(data))
    new_keys = r.hkeys("proxy_config")
    print "load ok keys: %s" % ",".join(new_keys)
def delete(args):
    cfg = Config().get()
    client = get_proxy_client(cfg=cfg)
    r = redis.Redis(connection_pool=client["connection_pool"])
    print "delete proxy_config %s ..." % args.proxy
    keys = []
    if args.proxy:
        key = r.hget("proxy_config", args.proxy)
        if key:
            keys.append(args.proxy)
        else:
            print "%s not found" % args.proxy
    else:
        keys = r.hkeys("proxy_config")
    for key in keys:
        r.hdel("proxy_config", key)
        print "delete proxy_config %s ok" % key
def status(args):
    crawler_name = args.crawler_name
    if crawler_name == "":
        print "crawler_name can not be empty"
        return
    print "show %s crawler http_response status..." % crawler_name
    start = "http_response:%s:" % crawler_name
    end = "http_response:%s:z" % crawler_name
    nodes = Config().get()["SSDBNodes"]
    clients = get_clients(nodes=nodes)
    total = 0
    for client in clients:
        print "%s:%s" % (client["node"]["Host"], client["node"]["Port"])
        r = redis.Redis(connection_pool=client["connection_pool"])
        # SSDB-style "keys" range scan between start and end on each node
        keys = r.execute_command("keys", start, end, -1)
        total += len(keys)
        print "length: ", len(keys)
    print "total: ", total
def stop(args):
    cfg = Config().get()
    client = get_proxy_client(cfg=cfg)
    r = redis.Redis(connection_pool=client["connection_pool"])
    print "stop proxy_config %s..." % args.proxy
    keys = []
    if args.proxy:
        key = r.hget("proxy_config", args.proxy)
        if key:
            keys.append(args.proxy)
        else:
            print "%s not found" % args.proxy
    else:
        keys = r.hkeys("proxy_config")
    for key in keys:
        print "stop %s" % key
        proxy_config = json.loads(r.hget("proxy_config", key))
        proxy_config["Status"] = 0
        r.hset("proxy_config", key, json.dumps(proxy_config))
def test():
    crawler_name = "weibo"
    cfg = Config().get()
    ssdb_clients, ring = get_clients(nodes=cfg["SSDBNodes"])
    conn = create_conn(cfg)
    bloomd_client = bloomd.get_client(nodes=cfg["BloomdNodes"])
    filter_q = FilterQueue(bloomd_client=bloomd_client, crawler_name=crawler_name)
    queue_name = "http_request:%s:test" % crawler_name
    queue = RequestQueue(crawler_name, ssdb_clients=ssdb_clients,
                         filter_q=filter_q, queue_name=queue_name)
    # declare a durable topic exchange and bind the test queue to it
    ch = conn.channel()
    ch.exchange_declare(exchange=crawler_name, exchange_type="topic", durable=True)
    ch.queue_declare(queue=queue_name, durable=True)
    ch.queue_bind(exchange=crawler_name, queue=queue_name, routing_key=queue_name)
    ch.close()
    publish_channel = conn.channel()
    publish_channel.confirm_delivery()
    # push page_cnt requests and measure the publish throughput
    page_cnt = 50000
    cnt = 0
    logger = get_logger("test_queue")
    start_time = time.time()
    for i in xrange(1, page_cnt + 1):
        url = "http://stackoverflow.com/users?page=%d&tab=reputation&filter=week" % i
        cnt += 1
        if cnt % 1000 == 0:
            logger.info(str(cnt))
        r = Request(url=url, timeout=15, headers={}, crawler_name=crawler_name)
        queue.push(r, publish_channel)
    end_time = time.time()
    print "start time: ", start_time
    print "end time: ", end_time
    print "speed: %f times/second" % (page_cnt / (end_time - start_time))
def setUp(self):
    cfg = Config().get()
    self.ssdb_clients = get_clients(nodes=cfg["SSDBNodes"])
    conn = create_conn(cfg)
    self.crawler_name = 'test_crawler'
    self.bloomd_client = bloomd.get_client(nodes=cfg["BloomdNodes"])
    self.filter_q = FilterQueue(
        crawler_name=self.crawler_name,
        bloomd_client=self.bloomd_client
    )
    self.req_q = RequestQueue(
        self.crawler_name,
        ssdb_clients=self.ssdb_clients,
        filter_q=self.filter_q
    )
    self.resp_q = ResponseQueue(
        self.crawler_name,
        ssdb_clients=self.ssdb_clients,
    )
    self.publish_channel = conn.channel()
    self.req_d = {
        'crawler_name': self.crawler_name,
        'url': 'http://stackoverflow.com/users/1144035/gordon-linoff',
        'proxy_name': 'http_china',
        'method': 'GET',
        'headers': {},
        'files': None,
        'data': None,
        'params': {},
        'auth': None,
        'cookies': {},
        'hooks': None,
        'json': None,
        'timeout': 10,
    }
    self.req = Request(**self.req_d)
def test_config(self):
    cfg = Config().get()
    self.assertTrue("RabbitmqIp" in cfg)
    self.assertTrue("RabbitmqPort" in cfg)
    self.assertTrue("MongoIp" in cfg)
    self.assertTrue("MongoPort" in cfg)
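# A minimal illustration (not from the original repo) of the config keys this
# section touches, assuming Config().get() returns a plain dict. All values are
# placeholders; the Host/Port layout of SSDBNodes is inferred from the
# client["node"]["Host"] / client["node"]["Port"] accesses above, and the same
# shape is assumed for BloomdNodes.
SAMPLE_CFG = {
    "RabbitmqIp": "127.0.0.1",
    "RabbitmqPort": 5672,
    "MongoIp": "127.0.0.1",
    "MongoPort": 27017,
    "SSDBNodes": [{"Host": "127.0.0.1", "Port": 8888}],
    "BloomdNodes": [{"Host": "127.0.0.1", "Port": 8673}],
}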
def main():
    args = input_params()
    cfg = Config(conf_file=args.conf).get()
    del_bloomd_filter(cfg, args.crawler_name)
    del_rabbitmq_queue(cfg, args.crawler_name, args.req, args.resp)
    del_ssdb_cache(cfg, args.crawler_name)
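# Hypothetical sketch of input_params(), which main() calls but which is not
# shown here. The attribute names (conf, crawler_name, req, resp) come from how
# args is used above; the flag names, types and defaults are assumptions.
import argparse

def input_params():
    parser = argparse.ArgumentParser(description="Delete a crawler's cached state")
    parser.add_argument("--conf", default=None, help="path to the config file")
    parser.add_argument("--crawler_name", required=True, help="crawler to clean up")
    parser.add_argument("--req", action="store_true", help="also delete the request queue")
    parser.add_argument("--resp", action="store_true", help="also delete the response queue")
    return parser.parse_args()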