Example #1
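    # Build a minimal fake worker carrying the proxy name and proxy client,
    # then hand it to the proxy Plugin under test.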
    def setUp(self):
        self.proxy_name = "http_china"
        self.cfg = Config().get()
        self.proxy_client = get_proxy_client(cfg=self.cfg)
        fake_worker = type('Worker', (object,), {})
        setattr(fake_worker, "proxy_name", self.proxy_name)
        setattr(fake_worker, "proxy_client", self.proxy_client)
        self.proxy_handler = Plugin(fake_worker)
Example #2
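    # Read the consumer queue settings and RabbitMQ credentials from the
    # passed-in config, then open clients for the configured SSDB nodes.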
    def __init__(self, cfg):
        self.from_queue = cfg["from_queue"]
        self.from_host = cfg["from_host"]
        self.from_port = cfg["from_port"]
        self.crawler = cfg["crawler"]
        self.credentials = pika.PlainCredentials(cfg["user"], cfg["password"])
        nodes = Config().get()["SSDBNodes"]
        self.ssdb_clients = get_clients(nodes=nodes)
Example #3
    def setUp(self):
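        # Prepare a sample HTTP request/response pair (the response body is
        # read from the local test.html fixture), wire up the SSDB-backed
        # request queue with its bloomd filter queue, and build a fake worker
        # for the error-handling Plugin under test.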
        self.crawler_name = 'test'
        self.req_d = {
            'crawler_name': self.crawler_name,
            'url': 'http://stackoverflow.com/users/1144035/gordon-linoff',
            'proxy_name': 'http_china',
            'method': 'GET',
            'headers': {},
            'files': None,
            'data': None,
            'params': {},
            'auth': None,
            'cookies': {},
            'hooks': None,
            'json': None,
            'timeout': 10,
        }
        test_html_file = os.path.join(os.path.dirname(__file__), "test.html")
        with open(test_html_file, 'r') as f:
            html = f.read()

        self.resp_d = {
            'crawler_name': self.crawler_name,
            'http_request': json.dumps(self.req_d),
            'error_code': 0,
            'error_msg': '',
            'status_code': 200,
            'reason': 'OK',
            'html': html,
            'cookies': {},
            'url': 'http://stackoverflow.com/users/1144035/gordon-linoff',
            'headers': {},
            'encoding': None,
            'elapsed': None,
            'http_proxy': '127.0.0.1:8000'
        }
        cfg = Config().get()
        self.ssdb_clients = get_clients(nodes=cfg["SSDBNodes"])
        conn = create_conn(cfg)
        self.publish_channel = conn.channel()
        self.bloomd_client = bloomd.get_client(nodes=cfg["BloomdNodes"])
        self.filter_q = FilterQueue(
            crawler_name=self.crawler_name,
            bloomd_client=self.bloomd_client
        )
        self.req_q = RequestQueue(
            self.crawler_name,
            ssdb_clients=self.ssdb_clients,
            filter_q=self.filter_q
        )
        html_404_strings = [['Page', 'Not', 'Found'], [u"页面不存在"]]
        fake_worker = type("Worker", (object, ), {})
        setattr(fake_worker, "crawler_name", self.crawler_name)
        setattr(fake_worker, "req_q", self.req_q)
        setattr(fake_worker, "publish_channel", self.publish_channel)
        setattr(fake_worker, "html_404_strings", html_404_strings)
        self.error_handler = Plugin(fake_worker)
Example #4
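# Print the Name and Running/Stopped status of every entry stored in the
# "proxy_config" Redis hash.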
def status(args):
    cfg = Config().get()
    client = get_proxy_client(cfg=cfg)
    r = redis.Redis(connection_pool=client["connection_pool"])
    status_str = ["Stopped", "Running"]
    keys = r.hkeys("proxy_config")
    for k in keys:
        content = r.hget("proxy_config", k)
        proxy_config = json.loads(content)
        status = status_str[proxy_config["Status"]]
        print "proxy_config %s Status: %s" % (proxy_config["Name"], status)
Example #5
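# Fetch a single key: pick the SSDB client responsible for the key, print the
# stored value and the host/port of the node that served it.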
def get(args):
    k = args.key
    if k == "":
        print "key can not be empty"
        return
    nodes = Config().get()["SSDBNodes"]
    clients = get_clients(nodes=nodes)
    client = get_client(clients, k)
    r = redis.Redis(connection_pool=client["connection_pool"])
    resp = r.get(k)
    print resp
    print "client: %s:%s" % (client["node"]["Host"], client["node"]["Port"])
Example #6
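# For each entry in the "proxy_config" hash, print its name, Running/Stopped
# status and the size of the matching "http_proxy:<name>" set.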
def status(args):
    cfg = Config().get()
    status_str = ["Stopped", "Running"]
    client = get_proxy_client(cfg=cfg)
    r = redis.Redis(connection_pool=client["connection_pool"])
    keys = r.hkeys("proxy_config")
    for k in keys:
        content = r.hget("proxy_config", k)
        proxy_config = json.loads(content)
        proxy_name = proxy_config["Name"]
        status = status_str[proxy_config["Status"]]
        cnt = r.scard("http_proxy:%s" % proxy_name)
        print "%s[%s]: %s items" % (proxy_name, status, cnt)
Example #7
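# Reload the proxy configuration: clear the existing "proxy_config" hash and
# re-populate it from the JSON definitions returned by get_proxy_json().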
def load(args):
    cfg = Config().get()
    client = get_proxy_client(cfg=cfg)
    r = redis.Redis(connection_pool=client["connection_pool"])
    print "load proxy_config ..."
    keys = r.hkeys("proxy_config")
    for key in keys:
        r.hdel("proxy_config", key)
        print "delete proxyconfig by key: %s" % key
    for data in get_proxy_json():
        print "Add proxy_config %s" % data["Name"]
        r.hset("proxy_config", data["Name"], json.dumps(data))
    new_keys = r.hkeys("proxy_config")
    print "load ok keys: %s" % ",".join(new_keys)
Example #8
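# Delete the proxy entry named in args.proxy, or every entry in the
# "proxy_config" hash when no proxy is given.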
def delete(args):
    cfg = Config().get()
    client = get_proxy_client(cfg=cfg)
    r = redis.Redis(connection_pool=client["connection_pool"])
    print "delete proxy_config %s ..." % args.proxy
    keys = []
    if args.proxy:
        key = r.hget("proxy_config", args.proxy)
        if key:
            keys.append(args.proxy)
        else:
            print "%s not found " % args.proxy
    else:
        keys = r.hkeys("proxy_config")
    for key in keys:
        r.hdel("proxy_config", key)
        print "delete proxy_config %s ok" % key
Example #9
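# Count the cached http_response keys for the given crawler on every SSDB node
# and print per-node and total counts.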
def status(args):
    crawler_name = args.crawler_name
    if crawler_name == "":
        print "crawler_name can not be empty"
        return
    print "show %s crawler http_response status..." % crawler_name
    start = "http_response:%s:" % crawler_name
    end = "http_response:%s:z" % crawler_name
    nodes = Config().get()["SSDBNodes"]
    clients = get_clients(nodes=nodes)
    total = 0
    for client in clients:
        print "%s:%s" % (client["node"]["Host"], client["node"]["Port"])
        r = redis.Redis(connection_pool=client["connection_pool"])
        keys = r.execute_command("keys", start, end, -1)
        total += len(keys)
        print "length: ", len(keys)
    print "total: ", total
Example #10
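# Stop the proxy named in args.proxy (or all proxies when none is given) by
# setting its Status to 0 in the "proxy_config" hash.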
def stop(args):
    cfg = Config().get()
    client = get_proxy_client(cfg=cfg)
    r = redis.Redis(connection_pool=client["connection_pool"])
    print "stop proxy_config %s..." % args.proxy
    keys = []
    if args.proxy:
        key = r.hget("proxy_config", args.proxy)
        if key:
            keys.append(args.proxy)
        else:
            print "%s not found " % args.proxy
    else:
        keys = r.hkeys("proxy_config")
    for key in keys:
        print "stop %s" % key
        proxy_config = json.loads(r.hget("proxy_config", key))
        proxy_config["Status"] = 0
        r.hset("proxy_config", key, json.dumps(proxy_config))
Example #11
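# Benchmark the request queue: declare the RabbitMQ exchange, queue and
# binding, push 50,000 Stack Overflow user-list URLs through the queue and
# report the achieved push rate.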
def test():
    crawler_name = "weibo"
    cfg = Config().get()
    ssdb_clients, ring = get_clients(nodes=cfg["SSDBNodes"])
    conn = create_conn(cfg)
    bloomd_client = bloomd.get_client(nodes=cfg["BloomdNodes"])
    filter_q = FilterQueue(bloomd_client=bloomd_client,
                           crawler_name=crawler_name)
    queue_name = "http_request:%s:test" % crawler_name
    queue = RequestQueue(crawler_name,
                         ssdb_clients=ssdb_clients,
                         filter_q=filter_q,
                         queue_name=queue_name)
    ch = conn.channel()
    ch.exchange_declare(exchange=crawler_name,
                        exchange_type="topic",
                        durable=True)
    ch.queue_declare(queue=queue_name, durable=True)
    ch.queue_bind(exchange=crawler_name,
                  queue=queue_name,
                  routing_key=queue_name)
    ch.close()
    publish_channel = conn.channel()
    publish_channel.confirm_delivery()
    page_cnt = 50000
    cnt = 0
    logger = get_logger("test_queue")
    start_time = time.time()
    for i in xrange(1, page_cnt + 1):
        url = "http://stackoverflow.com/users?page=%d&tab=reputation&filter=week" % i
        cnt += 1
        if cnt % 1000 == 0:
            logger.info(str(cnt))
        r = Request(url=url, timeout=15, headers={}, crawler_name=crawler_name)
        queue.push(r, publish_channel)
    end_time = time.time()
    print "start time: ", start_time
    print "end time: ", end_time
    print "speed: %f times/second" % (page_cnt / (end_time - start_time))
Example #12
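    # Connect to SSDB, RabbitMQ and bloomd, build the filter, request and
    # response queues for the test crawler, and prepare a sample Request.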
    def setUp(self):
        cfg = Config().get()
        self.ssdb_clients = get_clients(nodes=cfg["SSDBNodes"])
        conn = create_conn(cfg)
        self.crawler_name = 'test_crawler'
        self.bloomd_client = bloomd.get_client(nodes=cfg["BloomdNodes"])
        self.filter_q = FilterQueue(
            crawler_name=self.crawler_name,
            bloomd_client=self.bloomd_client
        )
        self.req_q = RequestQueue(
            self.crawler_name,
            ssdb_clients=self.ssdb_clients,
            filter_q=self.filter_q
        )
        self.resp_q = ResponseQueue(
            self.crawler_name,
            ssdb_clients=self.ssdb_clients,
        )
        self.publish_channel = conn.channel()
        self.req_d = {
            'crawler_name': self.crawler_name,
            'url': 'http://stackoverflow.com/users/1144035/gordon-linoff',
            'proxy_name': 'http_china',
            'method': 'GET',
            'headers': {},
            'files': None,
            'data': None,
            'params': {},
            'auth': None,
            'cookies': {},
            'hooks': None,
            'json': None,
            'timeout': 10,
        }
        self.req = Request(**self.req_d)
Example #13
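    # The loaded config should expose the RabbitMQ and MongoDB endpoints.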
    def test_config(self):
        cfg = Config().get()
        self.assertIn("RabbitmqIp", cfg)
        self.assertIn("RabbitmqPort", cfg)
        self.assertIn("MongoIp", cfg)
        self.assertIn("MongoPort", cfg)
Example #14
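# Tear down a crawler's state: delete its bloomd filter, its RabbitMQ request
# and response queues, and its SSDB cache, using the config file passed on the
# command line.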
def main():
    args = input_params()
    cfg = Config(conf_file=args.conf).get()
    del_bloomd_filter(cfg, args.crawler_name)
    del_rabbitmq_queue(cfg, args.crawler_name, args.req, args.resp)
    del_ssdb_cache(cfg, args.crawler_name)