Example #1
def main():
    s = Scheduler()
    print('Program starting...')
    redisClient = RedisClient()
    # flag = True
    # while flag:
    redis_len = redisClient.llen('employment')
    print('Redis queue length: ' + str(redis_len))
    if redis_len > 0:  # only start the scheduler when the queue has work
        s.run()


if __name__ == '__main__':
    main()
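
The RedisClient used by main() is defined in neither example. The sketch below shows what it plausibly wraps, assuming redis-py: the method names are taken from how the scheduler calls them, while the class internals and connection parameters are placeholder assumptions.

import redis


class RedisClient(object):
    # Hypothetical reconstruction of the Redis wrapper used by the scheduler;
    # only llen/push/pop are needed, matching the calls in Example #2.
    def __init__(self, host='localhost', port=6379, db=0):
        self._conn = redis.StrictRedis(host=host, port=port, db=db)

    def llen(self, key):
        # Length of the list stored at key (0 if the key does not exist).
        return self._conn.llen(key)

    def push(self, key, value):
        # Append a value to the tail of the queue.
        self._conn.rpush(key, value)

    def pop(self, key, timeout=1):
        # Blocking pop from the head of the queue; returns a (key, value)
        # tuple of bytes, or None on timeout -- which is why get_position()
        # reads redis_results[1] and decodes it.
        return self._conn.blpop(key, timeout=timeout)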
Example #2
# Imports assumed by this example: HT is taken to be the stdlib html module
# (for HT.unescape) and HTML/etree come from lxml. Download, MysqlClient,
# RedisClient, config, and the SHI_URL/QU_URL templates are project-local
# helpers that are not shown here.
import json
import re
import time
import html as HT
from lxml import etree
from lxml.etree import HTML

class Scheduler(object):
    def __init__(self):
        self.download = Download()
        self.db = MysqlClient()
        self.redisClient = RedisClient()

    def run(self):

        # self.push_url_to_redis()

        flag = True
        while flag:
            redis_len = self.redisClient.llen('employment')
            print('Redis queue length: ' + str(redis_len))
            if redis_len > 0:
                self.get_position()
            else:
                flag = False

    def get_qu(self):
        sql = 'select * from shi'
        results = self.db.find_all(sql)
        for res in results:
            shi_id = res[2]
            url = SHI_URL.format(shi_id='c' + shi_id)
            print(url)
            html = self.download.get_html(url)
            if html is not None and html.status_code == 200:
                html = HTML(html.text)
                qu_id_list = html.xpath(
                    '//dl[@class="condition-district show-condition-district"]/dd/a/@href'
                )
                qu_name_list = html.xpath(
                    '//dl[@class="condition-district show-condition-district"]/dd/a/text()'
                )
                for qu_id, name in zip(qu_id_list[1:], qu_name_list[1:]):
                    qu_id = qu_id.split('/')
                    qu_id = qu_id[2]
                    sql = '''insert into qu(pid,qu_id,name) VALUES ('{pid}','{qu_id}','{name}')'''\
                        .format(pid=shi_id,qu_id=qu_id, name=name)
                    print(sql)
                    self.db.save(sql)
            else:
                print('No data at this URL')

    def get_zhen(self):
        sql = 'select * from qu'
        results = self.db.find_all(sql)
        for res in results:
            shi_id = res[1]
            qu_id = res[2]
            url = QU_URL.format(shi_id='c' + shi_id, qu_id=qu_id)
            print(url)
            html = self.download.get_html(url)
            if html is not None and html.status_code == 200:
                html = HTML(html.text)
                zhen_id_list = html.xpath(
                    '//dl[@class="condition-area show-condition-area"]/dd/a/@href'
                )
                zhen_name_list = html.xpath(
                    '//dl[@class="condition-area show-condition-area"]/dd/a/text()'
                )
                for zhen_id, name in zip(zhen_id_list[1:], zhen_name_list[1:]):
                    zhen_id = zhen_id.split('/')
                    zhen_id = zhen_id[2]
                    sql = '''insert into zhen(pid,qu_id, zhen_id,name) VALUES ('{pid}','{qu_id}','{zhen_id}','{name}')'''\
                        .format(pid=shi_id,qu_id=qu_id,zhen_id=zhen_id, name=name)
                    print(sql)
                    self.db.save(sql)
            else:
                print('No data at this URL')

    def get_position(self):
        redis_results = self.redisClient.pop('employment')
        try:
            json_obj = json.loads(redis_results[1].decode('utf8'))
        except (TypeError, ValueError):
            # Queue empty (pop returned None) or malformed JSON.
            return None

        if json_obj:
            detail_url_list = []
            url = json_obj['url']
            # pre_page = re.search('\/\?page=(.*?)&', url).group(1)
            # if int(pageToken) > 10:
            #     break
            # url = url.replace('page='+pre_page+'&sort=2&ka=page-'+pre_page, 'page=' + str(pageToken) + '&sort=2&ka=page-' + str(pageToken))
            cityId = json_obj['cityId']
            zhiweiId = json_obj['zhiweiId']
            print(url)
            html = self.download.get_html(url)

            if html is not None and html.status_code == 200:
                html = HTML(html.text)

                # Check whether it was posted today; if so, fetch the detail page.
                # If the record already exists in the database, skip it (for now).
                li_xpath = html.xpath('//div[@class="job-list"]/ul/li')
                for li in li_xpath:
                    content = etree.tostring(li)
                    content = HT.unescape(content.decode())
                    content = HTML(content)
                    li_time = content.xpath(
                        'string(//div[@class="info-publis"]/p)')
                    href_url = content.xpath(
                        'string(//div[@class="info-primary"]//h3/a/@href)')
                    # Check whether the record already exists in the database:
                    try:
                        cid = re.match(r'^/job_detail/(.*?)\.html',
                                       href_url).group(1)
                        sql = "select * from positions where cid='%s'" % (cid)
                        find_one_res = self.db.find_one(sql)
                        if find_one_res is None:
                            # Insert the cid first to avoid duplicate crawling
                            sql = "insert into positions(cid) values ('%s')" % (
                                cid)
                            self.db.save(sql)
                            detail_url_list.append(config.HOST_URL + href_url)
                        elif find_one_res[2] is None:
                            detail_url_list.append(config.HOST_URL + href_url)
                        else:
                            print('Record already in database: ' + str(cid))
                    except Exception:
                        # cid may be unbound if the regex failed, so log the URL instead.
                        print('Database lookup failed for: ' + str(href_url))

                self.get_detail(detail_url_list, cityId, zhiweiId)

            else:
                print('No data at this URL')

    def get_detail(self, detail_url_list, cityId, zhiweiId):
        for url in detail_url_list:
            print('Downloading detail page: ' + url)
            html = self.download.get_html(url)
            if html is not None and html.status_code == 200:
                html = HTML(html.text)

                try:
                    cid = re.match(
                        r'^https://www.zhipin.com/job_detail/(.*?)\.html',
                        url).group(1)
                except AttributeError:
                    print('Failed to extract cid from: ' + url)
                    continue

                title = html.xpath('string(//h1)')
                if title == '':
                    continue
                try:
                    # The page shows the timestamp as '发布于YYYY-MM-DD HH:MM'.
                    publishDateStr = html.xpath(
                        'string(//span[@class="time"])').split('发布于')[1]
                    publishDate = int(
                        time.mktime(
                            time.strptime(publishDateStr, "%Y-%m-%d %H:%M")))
                except (IndexError, ValueError):
                    publishDateStr = None
                    publishDate = None

                try:
                    info = html.xpath(
                        'string(//div[@class="job-banner"]//div[@class="info-primary"]/p)'
                    )
                    info = info.split(':')
                    city = info[1][:-2]
                    jingyan = info[2][:-2]
                    xueli = info[3]
                except IndexError:
                    city = None
                    jingyan = None
                    xueli = None
                price = html.xpath(
                    'string(//div[@class="info-primary"]//span[@class="badge"])'
                )
                price = price.strip()
                posterName = html.xpath('string(//h2)')
                posterId = None
                posterUrl = html.xpath(
                    'string(//div[@class="detail-figure"]/img/@src)')
                content = html.xpath(
                    'string(//div[@class="job-sec"]/div[@class="text"])'
                ).strip()

                try:
                    company_text = html.xpath(
                        'string(//a[@ka="job-cominfo"]/@href)')
                    companyID = re.match(r'/gongsi/(.*?)\.html',
                                         company_text).group(1)
                except AttributeError:
                    companyID = None
                createDate = int(time.time())

                temp_time = time.localtime(int(time.time()))
                now_DateStr = time.strftime("%Y-%m-%d", temp_time)
                lt = time.strptime(now_DateStr, "%Y-%m-%d")
                now_timestamp = int(time.mktime(lt))

                res_obj = {
                    'cid': cid,
                    'title': title,
                    'url': url,
                    'publishDateStr': publishDateStr,
                    'publishDate': publishDate,
                    'city': city,
                    'jingyan': jingyan,
                    'xueli': xueli,
                    'price': price,
                    'posterName': posterName,
                    'posterId': posterId,
                    'posterUrl': posterUrl,
                    'content': content,
                    'companyID': companyID,
                    'createDate': createDate,
                    'cityId': cityId,
                    'zhiweiId': zhiweiId
                }
                print(res_obj)
                sql = "insert into positions(cid,title,url,publishDate,publishDateStr,city,jingyan,xueli,price,posterName,posterId,posterUrl,content,companyID,createDate,cityId, zhiweiId)" \
                      " VALUES ('%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s')" \
                      % (cid,title,url,publishDate,publishDateStr,city,jingyan,xueli,price,posterName,posterId,posterUrl,content,companyID,createDate,cityId, zhiweiId)\
                      + "ON DUPLICATE KEY UPDATE title='%s', url='%s', publishDate='%s', publishDateStr='%s', city='%s', jingyan='%s', xueli='%s', price='%s', posterName='%s', posterId='%s', posterUrl='%s', content='%s', companyID='%s', createDate='%s',cityId='%s', zhiweiId='%s'" \
                      %(title,url,publishDate,publishDateStr,city,jingyan,xueli,price,posterName,posterId,posterUrl,content,companyID,createDate,cityId, zhiweiId)
                self.db.save(sql)
            else:
                print('Failed to fetch detail page: ' + str(url))

    def push_url_to_redis(self):
        # zhiwei_list = []
        # zhiwei_sql = 'select * from zhiwei'
        # zhiwei_results = self.db.find_all(zhiwei_sql)
        # for zhiwei in zhiwei_results:
        #     zhiwei_list.append(zhiwei[2])
        #
        # zhen_sql = 'select * from zhen'
        # zhen_results = self.db.find_all(zhen_sql)
        #
        # for res in zhen_results:
        #     pid = res[1]
        #     zhen_id = res[2]
        #     for zhiwei_id in zhiwei_list:
        #         url = POSITION_URL.format(pid=pid, zhen_id=zhen_id, zhiwei_id=zhiwei_id, pageToken='1')
        #         self.redisClient.push('employment',url)

        zhiwei_list = []
        zhiwei_sql = 'select * from zhiwei'
        zhiwei_results = self.db.find_all(zhiwei_sql)
        for zhiwei in zhiwei_results:
            zhiwei_list.append(zhiwei[2])

        shi_sql = 'select * from shi'
        shi_results = self.db.find_all(shi_sql)

        for res in shi_results:
            pid = res[2]
            # NEW_POSITION_URL = 'https://www.zhipin.com/job_detail/?query=%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90%E5%B8%88&scity={pid}&industry=&position='
            # The query string is the URL-encoded keyword '数据分析师' ("data
            # analyst"); ten result pages are seeded per city.
            NEW_POSITION_URL = 'https://www.zhipin.com/c{pid}/?query=%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90%E5%B8%88&page={pageToken}&ka=page-{pageToken}'
            for i in range(1, 11):
                url = NEW_POSITION_URL.format(pid=pid, pageToken=i)
                url_obj = {"url": url, "cityId": pid, 'zhiweiId': '1'}
                self.redisClient.push('employment', json.dumps(url_obj))
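
Download and MysqlClient are likewise project-local helpers that neither example defines. Below is a minimal sketch of both, assuming requests and pymysql; the connection parameters, database name, and user-agent string are placeholders, not values from the original project.

import pymysql
import requests


class Download(object):
    # Hypothetical downloader matching how the scheduler uses it: get_html()
    # returns a requests.Response (checked via .status_code and .text) or
    # None when the request fails.
    def get_html(self, url):
        headers = {'User-Agent': 'Mozilla/5.0'}  # placeholder user agent
        try:
            return requests.get(url, headers=headers, timeout=10)
        except requests.RequestException:
            return None


class MysqlClient(object):
    # Hypothetical MySQL wrapper exposing the three methods the scheduler
    # calls: find_all, find_one, and save.
    def __init__(self, host='localhost', user='root', password='',
                 db='employment'):
        self._conn = pymysql.connect(host=host, user=user, password=password,
                                     db=db, charset='utf8mb4', autocommit=True)

    def find_all(self, sql):
        with self._conn.cursor() as cursor:
            cursor.execute(sql)
            return cursor.fetchall()

    def find_one(self, sql):
        with self._conn.cursor() as cursor:
            cursor.execute(sql)
            return cursor.fetchone()

    def save(self, sql):
        with self._conn.cursor() as cursor:
            cursor.execute(sql)

With these helpers in place, push_url_to_redis() seeds the 'employment' queue and the main() in Example #1 drains it end to end.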