Python RedisClient.llen Examples

Programming Language: Python

Namespace/Package Name: db

Class/Type: RedisClient

Method/Function: llen

Examples at hotexamples.com: 2

Python RedisClient.llen — 2 examples found. These are the top-rated real-world Python examples of db.RedisClient.llen, extracted from open source projects. You can rate the examples to help us improve their quality.

Frequently Used Methods

Show Hide

RedisClient(30)

max(30)

add(30)

decrease(30)

all(30)

count(17)

batch(16)

get(12)

isExist(10)

remove(7)

delete(6)

getOnceProxy(6)

getProxy(6)

exists(4)

get_proxy_by_score(3)

lpush(2)

pop(2)

drop(2)

get_all(2)

llen(2)

push(1)

addProxy(1)

mysql_add(1)

pop_mmsi(1)

proxies(1)

all_proxies(1)

push_mmsi(1)

push_spare_mmsi(1)

put(1)

random(1)

randomChoic(1)

randow(1)

reduce_proxy_score(1)

rand_proxy(1)

add_new_url(1)

list(1)

is_url_lock(1)

is_old_url(1)

adjust_score(1)

increase_proxy_score(1)

hget(1)

hexists(1)

hadd(1)

add_old_urls(1)

get_new_urls(1)

get_allproxy(1)

add_proxy(1)

derease(1)

del_all(1)

add_score(1)

Example #1

Show file

def main():
    """Report the length of the 'employment' Redis list, then run the scheduler.

    A ``Scheduler`` and a ``RedisClient`` are created, the current length of
    the 'employment' list is printed, and ``Scheduler.run`` is invoked as long
    as the reported length is not negative.
    """
    scheduler = Scheduler()
    print('程序开始运行。。')
    client = RedisClient()

    queue_length = client.llen('employment')
    print('redis队列长度：' + str(queue_length))

    # NOTE(review): a list length is presumably never negative, so this guard
    # likely never triggers — confirm whether `> 0` was intended.
    if queue_length < 0:
        return
    scheduler.run()

Example #2

Show file

File: scheduler.py Project: zack7wong/spiders

class Scheduler(object):
    """Coordinate the crawl between the downloader, MySQL and Redis.

    List-page jobs are read from the Redis list 'employment', pages are
    fetched through ``self.download`` and rows are written through ``self.db``.

    NOTE(review): every SQL statement in this class is built by string
    interpolation from scraped text, so a quote character in the page content
    will break (or inject into) the statement. The signature of
    ``MysqlClient.save`` is not visible here; switch to parameterized queries
    if it supports them.
    """

    def __init__(self):
        self.download = Download()
        self.db = MysqlClient()
        self.redisClient = RedisClient()

    def run(self):
        """Process queued jobs until the 'employment' list reports no entries."""
        while True:
            redis_len = self.redisClient.llen('employment')
            print('redis队列长度：' + str(redis_len))
            if redis_len <= 0:
                break
            self.get_position()

    def _fetch_condition_links(self, url, dl_class):
        """Download *url* and pair up the filter links found on it.

        Returns a list of ``(href, text)`` tuples taken from the anchors under
        ``<dl class="dl_class">``, or ``None`` when the download failed or did
        not answer with status 200.
        """
        response = self.download.get_html(url)
        # Check for None first: a failed download has no status_code attribute.
        if response is None or response.status_code != 200:
            return None
        tree = HTML(response.text)
        anchors = '//dl[@class="{}"]/dd/a'.format(dl_class)
        hrefs = tree.xpath(anchors + '/@href')
        names = tree.xpath(anchors + '/text()')
        # The first anchor is skipped — presumably a catch-all option; confirm.
        return list(zip(hrefs[1:], names[1:]))

    def get_qu(self):
        """Insert one ``qu`` row per district link found on each ``shi`` page."""
        for res in self.db.find_all('select * from shi'):
            shi_id = res[2]
            url = SHI_URL.format(shi_id='c' + shi_id)
            print(url)
            links = self._fetch_condition_links(
                url, 'condition-district show-condition-district')
            if links is None:
                print('该url无数据')
                continue
            for href, name in links:
                # The id is the third '/'-separated segment of the href.
                qu_id = href.split('/')[2]
                sql = '''insert into qu(pid,qu_id,name) VALUES ('{pid}','{qu_id}','{name}')'''\
                    .format(pid=shi_id, qu_id=qu_id, name=name)
                print(sql)
                self.db.save(sql)

    def get_zhen(self):
        """Insert one ``zhen`` row per area link found on each ``qu`` page."""
        for res in self.db.find_all('select * from qu'):
            shi_id = res[1]
            qu_id = res[2]
            url = QU_URL.format(shi_id='c' + shi_id, qu_id=qu_id)
            print(url)
            links = self._fetch_condition_links(
                url, 'condition-area show-condition-area')
            if links is None:
                print('该url无数据')
                continue
            for href, name in links:
                # The id is the third '/'-separated segment of the href.
                zhen_id = href.split('/')[2]
                sql = '''insert into zhen(pid,qu_id, zhen_id,name) VALUES ('{pid}','{qu_id}','{zhen_id}','{name}')'''\
                    .format(pid=shi_id, qu_id=qu_id, zhen_id=zhen_id, name=name)
                print(sql)
                self.db.save(sql)

    def get_position(self):
        """Pop one job from 'employment' and crawl the list page it names.

        The popped value's element ``[1]`` is decoded as UTF-8 JSON carrying
        the keys ``url``, ``cityId`` and ``zhiweiId``. Every job link on the
        list page that is not yet fully stored in ``positions`` is handed to
        ``get_detail``. Returns ``None`` in all cases.
        """
        redis_results = self.redisClient.pop('employment')
        try:
            json_obj = json.loads(redis_results[1].decode('utf8'))
        except (TypeError, IndexError, AttributeError, ValueError):
            # Nothing popped, unexpected payload shape, or invalid UTF-8/JSON.
            return None

        if not json_obj:
            return None

        url = json_obj['url']
        cityId = json_obj['cityId']
        zhiweiId = json_obj['zhiweiId']
        print(url)
        html = self.download.get_html(url)
        if html is None or html.status_code != 200:
            print('该url无数据')
            return None

        html = HTML(html.text)
        detail_url_list = []
        # NOTE(review): the original comment said only same-day postings
        # should be followed, but no such date check is implemented.
        for li in html.xpath('//div[@class="job-list"]/ul/li'):
            content = etree.tostring(li)
            content = HT.unescape(content.decode())
            content = HTML(content)
            href_url = content.xpath(
                'string(//div[@class="info-primary"]//h3/a/@href)')

            # Reset per item so the error message below can never report an
            # unbound name or the id of the previous item.
            cid = None
            try:
                cid = re.match(r'^/job_detail/(.*?)\.html',
                               href_url).group(1)
                sql = "select * from positions where cid='%s'" % (cid)
                find_one_res = self.db.find_one(sql)
                if find_one_res is None:
                    # Insert the cid first so the same job is not queued twice.
                    sql = "insert into positions(cid) values ('%s')" % (cid)
                    self.db.save(sql)
                    detail_url_list.append(config.HOST_URL + href_url)
                elif find_one_res[2] is None:
                    # Row exists but column index 2 is still empty, so the
                    # detail page is fetched again (presumably an incomplete
                    # earlier crawl — confirm against the table schema).
                    detail_url_list.append(config.HOST_URL + href_url)
                else:
                    print('数据库存在该记录：' + str(cid))
            except Exception:
                # Best effort: one bad item must not abort the whole page.
                print('查询数据库出错：' + str(cid))

        self.get_detail(detail_url_list, cityId, zhiweiId)

    def get_detail(self, detail_url_list, cityId, zhiweiId):
        """Download each detail page and upsert its fields into ``positions``.

        Args:
            detail_url_list: absolute detail-page URLs to fetch.
            cityId: value stored in the ``cityId`` column of every row.
            zhiweiId: value stored in the ``zhiweiId`` column of every row.

        Pages that fail to download, whose URL carries no job id, or that have
        an empty ``<h1>`` are skipped. Returns ``None``.
        """
        for url in detail_url_list:
            print('下载该详情页：' + url)
            html = self.download.get_html(url)
            if html is None or html.status_code != 200:
                print('请求详情页失败：' + str(url))
                continue
            html = HTML(html.text)

            cid_match = re.match(
                r'^https://www.zhipin.com/job_detail/(.*?)\.html', url)
            if cid_match is None:
                print('获取cid失败')
                continue
            cid = cid_match.group(1)

            title = html.xpath('string(//h1)')
            if title == '':
                continue

            try:
                publishDateStr = html.xpath(
                    'string(//span[@class="time"])').split('发布于')[1]
                publishDate = int(
                    time.mktime(
                        time.strptime(publishDateStr, "%Y-%m-%d %H:%M")))
            except (IndexError, ValueError, OverflowError):
                # Marker text missing or timestamp not in the expected format.
                publishDateStr = None
                publishDate = None

            try:
                info = html.xpath(
                    'string(//div[@class="job-banner"]//div[@class="info-primary"]/p)'
                )
                info = info.split('：')
                # The last two characters of the first two fields are cut off
                # (presumably the label of the following field — confirm).
                city = info[1][:-2]
                jingyan = info[2][:-2]
                xueli = info[3]
            except IndexError:
                city = None
                jingyan = None
                xueli = None

            price = html.xpath(
                'string(//div[@class="info-primary"]//span[@class="badge"])'
            ).strip()
            posterName = html.xpath('string(//h2)')
            posterId = None
            posterUrl = html.xpath(
                'string(//div[@class="detail-figure"]/img/@src)')
            content = html.xpath(
                'string(//div[@class="job-sec"]/div[@class="text"])'
            ).strip()

            company_text = html.xpath(
                'string(//a[@ka="job-cominfo"]/@href)')
            company_match = re.match(r'/gongsi/(.*?)\.html', company_text)
            companyID = company_match.group(1) if company_match else None
            createDate = int(time.time())

            res_obj = {
                'cid': cid,
                'title': title,
                'url': url,
                'publishDateStr': publishDateStr,
                'publishDate': publishDate,
                'city': city,
                'jingyan': jingyan,
                'xueli': xueli,
                'price': price,
                'posterName': posterName,
                'posterId': posterId,
                'posterUrl': posterUrl,
                'content': content,
                'companyID': companyID,
                'createDate': createDate,
                'cityId': cityId,
                'zhiweiId': zhiweiId
            }
            print(res_obj)

            # Same order as the column list below; the UPDATE clause reuses
            # everything after the leading cid.
            insert_values = (cid, title, url, publishDate, publishDateStr,
                             city, jingyan, xueli, price, posterName,
                             posterId, posterUrl, content, companyID,
                             createDate, cityId, zhiweiId)
            placeholders = ", ".join(["'%s'"] * len(insert_values))
            # NOTE(review): None values are written as the literal string
            # 'None', and quotes in scraped text are not escaped.
            insert_sql = (
                "insert into positions(cid,title,url,publishDate,publishDateStr,city,jingyan,xueli,price,posterName,posterId,posterUrl,content,companyID,createDate,cityId, zhiweiId)"
                " VALUES (" + placeholders + ")"
            ) % insert_values
            update_sql = (
                "ON DUPLICATE KEY UPDATE title='%s', url='%s', publishDate='%s', publishDateStr='%s', city='%s', jingyan='%s', xueli='%s', price='%s', posterName='%s', posterId='%s', posterUrl='%s', content='%s', companyID='%s', createDate='%s',cityId='%s', zhiweiId='%s'"
                % insert_values[1:]
            )
            self.db.save(insert_sql + update_sql)

    def push_url_to_redis(self):
        """Queue list pages 1-10 for every ``shi`` row on the 'employment' list.

        Each queued value is a JSON object with the keys ``url``, ``cityId``
        and ``zhiweiId`` (always '1'). The search term baked into the URL is
        the percent-encoded text '数据分析师'.
        """
        url_template = 'https://www.zhipin.com/c{pid}/?query=%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90%E5%B8%88&page={pageToken}&ka=page-{pageToken}'
        for res in self.db.find_all('select * from shi'):
            pid = res[2]
            for page in range(1, 11):
                url = url_template.format(pid=pid, pageToken=page)
                url_obj = {"url": url, "cityId": pid, 'zhiweiId': '1'}
                self.redisClient.push('employment', json.dumps(url_obj))