Example #1
def del_bloomd_filter(cfg, crawler_name):
    # Get a handle on the crawler's filter in the bloomd cluster and drop it.
    client = bloomd.get_client(nodes=cfg["BloomdNodes"])
    f = client.create_filter(crawler_name)
    try:
        f.drop()
        print "[info] drop bloomd %s success" % crawler_name
    except Exception as e:
        print "[error] drop bloomd %s fail: %s" % (crawler_name, e)
Example #2
    def setUp(self):
        self.crawler_name = 'test'
        self.req_d = {
            'crawler_name': self.crawler_name,
            'url': 'http://stackoverflow.com/users/1144035/gordon-linoff',
            'proxy_name': 'http_china',
            'method': 'GET',
            'headers': {},
            'files': None,
            'data': None,
            'params': {},
            'auth': None,
            'cookies': {},
            'hooks': None,
            'json': None,
            'timeout': 10,
        }
        test_html_file = os.path.join(os.path.dirname(__file__), "test.html")
        with open(test_html_file, 'r') as f:
            html = f.read()

        self.resp_d = {
            'crawler_name': self.crawler_name,
            'http_request': json.dumps(self.req_d),
            'error_code': 0,
            'error_msg': '',
            'status_code': 200,
            'reason': 'OK',
            'html': html,
            'cookies': {},
            'url': 'http://stackoverflow.com/users/1144035/gordon-linoff',
            'headers': {},
            'encoding': None,
            'elapsed': None,
            'http_proxy': '127.0.0.1:8000'
        }
        cfg = Config().get()
        self.ssdb_clients = get_clients(nodes=cfg["SSDBNodes"])
        conn = create_conn(cfg)
        self.publish_channel = conn.channel()
        self.bloomd_client = bloomd.get_client(nodes=cfg["BloomdNodes"])
        self.filter_q = FilterQueue(
            crawler_name=self.crawler_name,
            bloomd_client=self.bloomd_client
        )
        self.req_q = RequestQueue(
            self.crawler_name,
            ssdb_clients=self.ssdb_clients,
            filter_q=self.filter_q
        )
        html_404_strings = [['Page', 'Not', 'Found'], [u"页面不存在"]]  # u"页面不存在" = "page does not exist"
        # Build a throwaway Worker class on the fly and attach only the
        # attributes the plugin under test expects.
        fake_worker = type("Worker", (object, ), {})
        setattr(fake_worker, "crawler_name", self.crawler_name)
        setattr(fake_worker, "req_q", self.req_q)
        setattr(fake_worker, "publish_channel", self.publish_channel)
        setattr(fake_worker, "html_404_strings", html_404_strings)
        self.error_handler = Plugin(fake_worker)
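Since type() accepts the class attributes directly as its third argument, the setattr chain above can be collapsed into the construction itself; an equivalent sketch:

        fake_worker = type("Worker", (object, ), {
            "crawler_name": self.crawler_name,
            "req_q": self.req_q,
            "publish_channel": self.publish_channel,
            "html_404_strings": html_404_strings,
        })
        self.error_handler = Plugin(fake_worker)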
Example #5
def test():
    bloomd_client = get_client()
    crawler_name = "test_crawler_1"
    queue = FilterQueue(bloomd_client=bloomd_client, crawler_name=crawler_name)
    page_cnt = 50000
    cnt = 0
    logger = get_logger("test_queue")
    start_time = time.time()
    for i in xrange(1, page_cnt + 1):
        url = "http://stackoverflow.com/users?page=%d&tab=reputation&filter=week" % i
        cnt += 1
        if cnt % 1000 == 0:
            logger.info(str(cnt))
        queue.push(url)  # one set operation against the crawler's bloom filter
    end_time = time.time()
    print "start time: ", start_time
    print "end time: ", end_time
    print "speed: %f times/second" % (page_cnt / (end_time - start_time))
Example #6
def test():
    crawler_name = "weibo"
    cfg = Config().get()
    ssdb_clients, ring = get_clients(nodes=cfg["SSDBNodes"])
    conn = create_conn(cfg)
    bloomd_client = bloomd.get_client(nodes=cfg["BloomdNodes"]) 
    filter_q = FilterQueue(bloomd_client=bloomd_client, crawler_name=crawler_name)
    queue_name = "http_request:%s:test" % crawler_name
    queue = RequestQueue(crawler_name, ssdb_clients=ssdb_clients, filter_q=filter_q, queue_name=queue_name)
    ch = conn.channel()
    # Declare a durable topic exchange and queue for this crawler, and bind
    # them so messages routed with the queue name land in the queue.
    ch.exchange_declare(
        exchange=crawler_name,
        exchange_type="topic",
        durable=True
    )
    ch.queue_declare(
        queue=queue_name,
        durable=True
    )
    ch.queue_bind(
        exchange=crawler_name,
        queue=queue_name,
        routing_key=queue_name
    )
    ch.close()
    # Publish on a separate channel with publisher confirms enabled.
    publish_channel = conn.channel()
    publish_channel.confirm_delivery()
    page_cnt = 50000
    cnt = 0
    logger = get_logger("test_queue")
    start_time = time.time()
    for i in xrange(1, page_cnt + 1):
        url = "http://stackoverflow.com/users?page=%d&tab=reputation&filter=week" % i
        cnt += 1
        if cnt % 1000 == 0:
            logger.info(str(cnt))
        r = Request(url=url, timeout=15, headers={}, crawler_name=crawler_name)
        queue.push(r, publish_channel)
    end_time = time.time()
    print "start time: ", start_time
    print "end time: ", end_time
    print "speed: %f times/second" % (page_cnt / (end_time - start_time))
Example #7
def test():
    crawler_name = "weibo"
    cfg = Config().get()
    ssdb_clients, ring = get_clients(nodes=cfg["SSDBNodes"])
    conn = create_conn(cfg)
    bloomd_client = bloomd.get_client(nodes=cfg["BloomdNodes"])
    filter_q = FilterQueue(bloomd_client=bloomd_client,
                           crawler_name=crawler_name)
    queue_name = "http_request:%s:test" % crawler_name
    queue = RequestQueue(crawler_name,
                         ssdb_clients=ssdb_clients,
                         filter_q=filter_q,
                         queue_name=queue_name)
    ch = conn.channel()
    ch.exchange_declare(exchange=crawler_name,
                        exchange_type="topic",
                        durable=True)
    ch.queue_declare(queue=queue_name, durable=True)
    ch.queue_bind(exchange=crawler_name,
                  queue=queue_name,
                  routing_key=queue_name)
    ch.close()
    publish_channel = conn.channel()
    publish_channel.confirm_delivery()
    page_cnt = 50000
    cnt = 0
    logger = get_logger("test_queue")
    start_time = time.time()
    for i in xrange(1, page_cnt + 1):
        url = "http://stackoverflow.com/users?page=%d&tab=reputation&filter=week" % i
        cnt += 1
        if cnt % 1000 == 0:
            logger.info(str(cnt))
        r = Request(url=url, timeout=15, headers={}, crawler_name=crawler_name)
        queue.push(r, publish_channel)
    end_time = time.time()
    print "start time: ", start_time
    print "end time: ", end_time
    print "speed: %f times/second" % (page_cnt / (end_time - start_time))
Example #8
def setUp(self):
    cfg = Config().get()
    self.ssdb_clients = get_clients(nodes=cfg["SSDBNodes"])
    conn = create_conn(cfg)
    self.crawler_name = 'test_crawler'
    self.bloomd_client = bloomd.get_client(nodes=cfg["BloomdNodes"])
    self.filter_q = FilterQueue(
        crawler_name=self.crawler_name,
        bloomd_client=self.bloomd_client
    )
    self.req_q = RequestQueue(
        self.crawler_name,
        ssdb_clients=self.ssdb_clients,
        filter_q=self.filter_q
    )
    self.resp_q = ResponseQueue(
        self.crawler_name,
        ssdb_clients=self.ssdb_clients,
    )
    self.publish_channel = conn.channel()
    self.req_d = {
        'crawler_name': self.crawler_name,
        'url': 'http://stackoverflow.com/users/1144035/gordon-linoff',
        'proxy_name': 'http_china',
        'method': 'GET',
        'headers': {},
        'files': None,
        'data': None,
        'params': {},
        'auth': None,
        'cookies': {},
        'hooks': None,
        'json': None,
        'timeout': 10,
    }
    self.req = Request(**self.req_d)
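The fixture keeps the request twice: as an object (self.req) and, in the earlier setUp's resp_d, as a JSON string under http_request. Since Request(**self.req_d) works, the dict round-trips through json; a quick sketch of rebuilding a Request from the stored form (note that Python 2's json returns unicode strings):

import json

req_json = json.dumps(self.req_d)           # same shape as resp_d["http_request"]
req_copy = Request(**json.loads(req_json))  # rebuild the Request from the JSON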