Code Example #1
class QueueManage(object):

    def __init__(self, name):
        self.q_obj = RedisQueue(name)

    def get_queue_data(self):
        q_re = self.q_obj.get_all()
        return q_re

    def queue_size(self):
        return self.q_obj.qsize()
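
All of the examples on this page call into a RedisQueue wrapper class that the excerpts never show. For reference, here is a minimal sketch of such a FIFO queue built on a Redis list with redis-py; the actual RedisQueue classes used by these projects may differ (extra constructor arguments such as namespace, password, or db, different serialization, and so on), so treat this only as an illustration of the qsize()/put()/get() surface the examples rely on.

import redis


class RedisQueue(object):
    """Minimal FIFO queue on top of a Redis list (illustrative sketch only)."""

    def __init__(self, name, namespace='queue', **redis_kwargs):
        self.__db = redis.Redis(**redis_kwargs)
        self.key = '%s:%s' % (namespace, name)

    def qsize(self):
        # LLEN: number of items currently waiting in the list.
        return self.__db.llen(self.key)

    def empty(self):
        return self.qsize() == 0

    def put(self, item):
        # RPUSH: append the item to the tail of the list.
        self.__db.rpush(self.key, item)

    def get(self, block=True, timeout=None):
        # BLPOP blocks until an item arrives; LPOP returns None when the list is empty.
        if block:
            item = self.__db.blpop(self.key, timeout=timeout)
            if item:
                item = item[1]
        else:
            item = self.__db.lpop(self.key)
        return item

    def get_all(self):
        # LRANGE 0 -1: snapshot of every queued item, without removing anything.
        return self.__db.lrange(self.key, 0, -1)

    def flushdb(self):
        # Wipes the Redis database this queue lives in (Code Example #3 does this before a run).
        self.__db.flushdb()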
Code Example #2
def main(run_type, store_num):
    q = RedisQueue(store_num)
    if run_type == 'gen':
        for i in range(len(sectorPolygons1954.sectors)):
            s = generateSectors(sectorPolygons1954.sectors[i], i, q, store_num)
            print('starting thread ' + str(i))
            s.start()

    elif run_type == 'run':
        batch_size = 30
        uri = 'localhost:27017'
        client = pymongo.MongoClient(uri)
        db = client['streets']
        streets = db[store_num]
        streets.create_index([("latitude", pymongo.DESCENDING),
                              ("longitude", pymongo.DESCENDING)])
        for i in range(batch_size):
            rg = request_getter(q, store_num)
            print('starting request thread ' + str(i))
            rg.start()

        # Block until the Redis queue drains, reporting its size every 10 seconds.
        while q.qsize():
            sleep(10)
            print(q.qsize())
Code Example #3
def main():

    done_que = RedisQueue('seed')
    run_que = RedisQueue('run')

    run_que.flushdb()

    conn = sqlite3.connect('site_data.db')
    conn.execute(
        "create table if not exists mainpages (id integer primary key autoincrement, url TEXT,headers TEXT,content BLOB)"
    )

    spend = 0
    cnt = 0
    size = 0
    while True:

        data = cPickle.loads(done_que.get())
        st = time.time()
        urls = geturls(data['url'], data['content'])
        if len(urls) == 0:
            continue

        for url in urls:
            # bfdone: set/Bloom filter of already-seen URLs, defined elsewhere in the project.
            if url not in bfdone:
                run_que.put(url)

        gziphtml = sqlite3.Binary(gzip.zlib.compress(data['content']))
        size += len(gziphtml)
        conn.execute(
            "insert into mainpages (url,headers,content) values (?,?,?)",
            (data['url'], str(data['headers']), gziphtml))

        et = time.time()
        spend += (et - st)
        cnt += 1
        if cnt % 10 == 0:
            print "cost:", spend / cnt, cnt, done_que.qsize(), size / 1024 / 1024
            conn.commit()
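
Code Example #3 shows only the consumer side: it unpickles dicts with 'url', 'headers', and 'content' keys from the 'seed' queue and archives them into SQLite. The producing crawler is not part of the excerpt; a hypothetical fetcher worker (the name fetch_worker and its structure are my assumption, not the project's code) might feed the queue like this:

# Hypothetical producer for the 'seed' queue consumed above (Python 2, to match the snippet).
import cPickle
import urllib2


def fetch_worker(run_que, done_que):
    while True:
        url = run_que.get()                      # blocks until a URL is queued
        resp = urllib2.urlopen(url, timeout=10)
        item = {
            'url': url,
            'headers': dict(resp.info().items()),
            'content': resp.read(),
        }
        done_que.put(cPickle.dumps(item))        # main() above unpickles this dict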
Code Example #4
File: Queue.py  Project: 0x24bin/pythonCodes
    html = page
    return html


def next_page():
    base_url = 'http://jandan.net/ooxx/page-1006#comments'
    for i in range(3):
        html = user_agent(base_url).read()
        soup = BeautifulSoup(html)
        next_url = soup.find('a', {
            'class': 'next-comment-page',
            'title': 'Newer Comments'
        }).get('href')
        yield base_url
        base_url = next_url


for page in next_page():
    queue.put(page)
print 'There are %d pages' % queue.qsize()

while not queue.empty():
    page_url = queue.get()
    html = user_agent(page_url).read()
    soup = BeautifulSoup(html)
    img_urls = soup.find_all(['img'])
    for myimg in img_urls:
        Jpgurl = myimg.get('src')
        redis.put(Jpgurl)
print 'There are %d pictures' % redis.qsize()
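
Code Example #4 (and its near-duplicate, Code Example #8 below) relies on two queues created earlier in Queue.py and not shown here: queue holds page URLs and redis holds scraped image URLs. A hypothetical setup, with queue names that are purely my assumption:

# Placeholder construction of the two queues the snippet uses (names are assumptions).
queue = RedisQueue('jandan_pages')    # page URLs to crawl
redis = RedisQueue('jandan_images')   # image URLs scraped from each page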
Code Example #5
File: account_login.py  Project: Tencent-Luis/python
q = RedisQueue('account_login', **redis_conn)
http = urllib3.PoolManager(num_pools=50)

def worker(value):
    params = {}
    params['account_login'] = base64.encodestring(value)
    r = http.request('POST', author_login, params)

    # Server-side failure: push the value back onto the queue.
    if r.status != 200:
        q.put(value)

    # IP whitelist check failed: push the value back onto the queue.
    # r.data is the raw response body, assumed here to be JSON (requires import json).
    if json.loads(r.data)['status'] == 10002:
        q.put(value)
    print r.data

while 1:
    # time.sleep(1)
    if q.empty():
        print 'empty queue'
        break

    s = q.qsize()
    for i in range(s):
        value = q.get()
        t = threading.Thread(target=worker, args=(value,))
        t.start()
        # Throttle: pause once 500 threads are alive before spawning more.
        if threading.active_count() >= 500:
            time.sleep(1)
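
The snippet also references a redis_conn mapping and an author_login URL defined elsewhere in account_login.py. For illustration only, placeholder values might look like the following; the project's real configuration is not shown in the excerpt.

# Placeholder values for names the snippet references but does not define.
redis_conn = {'host': '127.0.0.1', 'port': 6379, 'db': 0}
author_login = 'http://example.com/api/account_login'   # hypothetical endpoint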
Code Example #6
File: test_redis.py  Project: salmonx/fengbei
#
# store the website
# dbname with ip?
#
# sitedata_2nd_
#
# db = dbd['runque']
# db = dbd['extracturls']
#

dbd = dict()
dbd['runque'] = 1
dbd['extracturls'] = 2

host = "127.0.0.1"
password = '******'
# first insert into the done_site.bin

rq = RedisQueue(name='extracturls', host=host, password=password, db=dbd['extracturls'])
rr = RedisQueue(name='runque', host=host, password=password, db=dbd['runque'])


print rq.qsize()
print rr.qsize()
#exit(0)


Code Example #7
File: scanret.py  Project: sry309/crawler_old
# then the master pops them to check whether they are already in done_site.bin
# if not, send the host (without http://) to the DNS server
# the DNS server tells us whether the website host is accessible or not
# when the reply from DNS is received, insert the parsed URL (if accessible) into the runqueue in a specific Redis queue
#
#
# store the website
#
#
# db = dbd['runque']
# db = dbd['extracturls']
#

dbd = dict()
dbd['runque'] = 1
dbd['extracturls'] = 2

host = "127.0.0.1"
password = '******'
# first insert into the done_site.bin

rq = RedisQueue(name='extracturls',
                host=host,
                password=password,
                db=dbd['extracturls'])
rr = RedisQueue(name='runque', host=host, password=password, db=dbd['runque'])

print rq.qsize()
print rr.qsize()
#exit(0)
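
The comment block above sketches a master loop: pop extracted URLs, skip hosts already recorded in done_site.bin, ask a DNS service whether the host resolves, and push accessible URLs onto the run queue. A rough illustration of that loop under the comments' own assumptions (dns_resolves() and the done_hosts set are stand-ins for the project's DNS service and done_site.bin):

import socket
import urlparse  # Python 2, to match the snippets on this page


def dns_resolves(host):
    # Stand-in for the project's DNS service: simply try to resolve the host.
    try:
        socket.gethostbyname(host)
        return True
    except socket.error:
        return False


done_hosts = set()   # stand-in for done_site.bin

while rq.qsize():
    url = rq.get()
    host = urlparse.urlparse(url).netloc
    if host in done_hosts:
        continue
    if dns_resolves(host):
        done_hosts.add(host)
        rr.put(url)   # host is reachable: hand the URL to the run queue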
Code Example #8
File: Queue.py  Project: madaoCN/pythonCodes
    req_timeout = 20
    req = urllib2.Request(url, None, req_header)
    page = urllib2.urlopen(req, None, req_timeout)
    html = page
    return html


def next_page():
    base_url = "http://jandan.net/ooxx/page-1006#comments"
    for i in range(3):
        html = user_agent(base_url).read()
        soup = BeautifulSoup(html)
        next_url = soup.find("a", {"class": "next-comment-page", "title": "Newer Comments"}).get("href")
        yield base_url
        base_url = next_url


for page in next_page():
    queue.put(page)
print "There are %d pages" % queue.qsize()

while not queue.empty():
    page_url = queue.get()
    html = user_agent(page_url).read()
    soup = BeautifulSoup(html)
    img_urls = soup.find_all(["img"])
    for myimg in img_urls:
        Jpgurl = myimg.get("src")
        redis.put(Jpgurl)
print "There are %d pictures" % redis.qsize()