import time

def test():
    # Benchmark FilterQueue.push(): run 50,000 URLs through the
    # bloomd-backed dedup queue and report the push rate.
    # get_client, FilterQueue and get_logger come from the surrounding
    # crawler project.
    bloomd_client = get_client()
    crawler_name = "test_crawler_1"
    queue = FilterQueue(bloomd_client=bloomd_client, crawler_name=crawler_name)
    page_cnt = 50000
    cnt = 0
    logger = get_logger("test_queue")
    start_time = time.time()
    for i in range(1, page_cnt + 1):
        url = "http://stackoverflow.com/users?page=%d&tab=reputation&filter=week" % i
        cnt += 1
        if cnt % 1000 == 0:
            logger.info(str(cnt))  # progress mark every 1,000 pushes
        queue.push(url)
    end_time = time.time()
    print("start time:", start_time)
    print("end time:", end_time)
    print("speed: %f times/second" % (page_cnt / (end_time - start_time)))
Example #3
import time

def test():
    # End-to-end benchmark: build 50,000 Request objects, dedup them through
    # the bloomd-backed FilterQueue, and publish them to RabbitMQ via the
    # SSDB-backed RequestQueue. Config, get_clients, create_conn, bloomd,
    # FilterQueue, RequestQueue, Request and get_logger come from the
    # surrounding crawler project.
    crawler_name = "weibo"
    cfg = Config().get()
    ssdb_clients, ring = get_clients(nodes=cfg["SSDBNodes"])
    conn = create_conn(cfg)
    bloomd_client = bloomd.get_client(nodes=cfg["BloomdNodes"])
    filter_q = FilterQueue(bloomd_client=bloomd_client, crawler_name=crawler_name)
    queue_name = "http_request:%s:test" % crawler_name
    queue = RequestQueue(crawler_name, ssdb_clients=ssdb_clients,
                         filter_q=filter_q, queue_name=queue_name)
    # Declare a durable topic exchange and queue, bound so that messages
    # published with the queue name as routing key land in the queue.
    ch = conn.channel()
    ch.exchange_declare(exchange=crawler_name,
                        exchange_type="topic",
                        durable=True)
    ch.queue_declare(queue=queue_name, durable=True)
    ch.queue_bind(exchange=crawler_name,
                  queue=queue_name,
                  routing_key=queue_name)
    ch.close()
    # Publish on a separate channel with publisher confirms enabled.
    publish_channel = conn.channel()
    publish_channel.confirm_delivery()
    page_cnt = 50000
    cnt = 0
    logger = get_logger("test_queue")
    start_time = time.time()
    for i in range(1, page_cnt + 1):
        url = "http://stackoverflow.com/users?page=%d&tab=reputation&filter=week" % i
        cnt += 1
        if cnt % 1000 == 0:
            logger.info(str(cnt))  # progress mark every 1,000 requests
        r = Request(url=url, timeout=15, headers={}, crawler_name=crawler_name)
        queue.push(r, publish_channel)
    end_time = time.time()
    print("start time:", start_time)
    print("end time:", end_time)
    print("speed: %f times/second" % (page_cnt / (end_time - start_time)))
Example #4
import time

import pymongo

def test():
    # Benchmark MongoDB upserts: write 10,000 user documents keyed by "id"
    # and report the write rate. get_logger and settings come from the
    # surrounding crawler project.
    item_cnt = 10000
    cnt = 0
    logger = get_logger("test_queue")
    start_time = time.time()
    mongo_cli = pymongo.MongoClient(host=settings['MongoIp'], port=settings['MongoPort'])
    # Index the lookup key so each upsert avoids a collection scan.
    # (create_index supersedes the deprecated ensure_index.)
    mongo_cli.test.user.create_index('id')
    for i in range(1, item_cnt + 1):
        item = {
            "id": i,
            "url": "http://stackoverflow.com/users?page=%d&tab=reputation&filter=week" % i,
        }
        cnt += 1
        if cnt % 1000 == 0:
            logger.info(str(cnt))  # progress mark every 1,000 upserts
        # Upsert: replace the document with this id, inserting it if absent.
        # (replace_one supersedes the deprecated update(spec, doc, True) form.)
        mongo_cli.test.user.replace_one({'id': item['id']}, item, upsert=True)
    end_time = time.time()
    print("start time:", start_time)
    print("end time:", end_time)
    print("speed: %f times/second" % (item_cnt / (end_time - start_time)))