def main(): s = Scheduler() print('程序开始运行。。') redisClient = RedisClient() # flag = True # while flag: redis_len = redisClient.llen('employment') print('redis队列长度:' + str(redis_len)) if redis_len >= 0: s.run()
class Scheduler(object): def __init__(self): self.download = Download() self.db = MysqlClient() self.redisClient = RedisClient() def run(self): # self.push_url_to_redis() flag = True while flag: redis_len = self.redisClient.llen('employment') print('redis队列长度:' + str(redis_len)) if redis_len > 0: self.get_position() else: flag = False def get_qu(self): sql = 'select * from shi' results = self.db.find_all(sql) for res in results: shi_id = res[2] url = SHI_URL.format(shi_id='c' + shi_id) print(url) html = self.download.get_html(url) if html.status_code == 200 and html is not None: html = HTML(html.text) qu_id_list = html.xpath( '//dl[@class="condition-district show-condition-district"]/dd/a/@href' ) qu_name_list = html.xpath( '//dl[@class="condition-district show-condition-district"]/dd/a/text()' ) for qu_id, name in zip(qu_id_list[1:], qu_name_list[1:]): qu_id = qu_id.split('/') qu_id = qu_id[2] sql = '''insert into qu(pid,qu_id,name) VALUES ('{pid}','{qu_id}','{name}')'''\ .format(pid=shi_id,qu_id=qu_id, name=name) print(sql) self.db.save(sql) else: print('该url无数据') def get_zhen(self): sql = 'select * from qu' results = self.db.find_all(sql) for res in results: shi_id = res[1] qu_id = res[2] url = QU_URL.format(shi_id='c' + shi_id, qu_id=qu_id) print(url) html = self.download.get_html(url) if html is not None and html.status_code == 200: html = HTML(html.text) zhen_id_list = html.xpath( '//dl[@class="condition-area show-condition-area"]/dd/a/@href' ) zhen_name_list = html.xpath( '//dl[@class="condition-area show-condition-area"]/dd/a/text()' ) for zhen_id, name in zip(zhen_id_list[1:], zhen_name_list[1:]): zhen_id = zhen_id.split('/') zhen_id = zhen_id[2] sql = '''insert into zhen(pid,qu_id, zhen_id,name) VALUES ('{pid}','{qu_id}','{zhen_id}','{name}')'''\ .format(pid=shi_id,qu_id=qu_id,zhen_id=zhen_id, name=name) print(sql) self.db.save(sql) else: print('该url无数据') def get_position(self): redis_results = self.redisClient.pop('employment') try: json_obj = json.loads(redis_results[1].decode('utf8')) except: return None if json_obj: detail_url_list = [] url = json_obj['url'] # pre_page = re.search('\/\?page=(.*?)&', url).group(1) # if int(pageToken) > 10: # break # url = url.replace('page='+pre_page+'&sort=2&ka=page-'+pre_page, 'page=' + str(pageToken) + '&sort=2&ka=page-' + str(pageToken)) cityId = json_obj['cityId'] zhiweiId = json_obj['zhiweiId'] print(url) html = self.download.get_html(url) if html is not None and html.status_code == 200: html = HTML(html.text) #判断是否是当天发布,是的话请求详情页, 判断数据库是否有这条数据,有的话不请求(暂时) li_xpath = html.xpath('//div[@class="job-list"]/ul/li') for li in li_xpath: content = etree.tostring(li) content = HT.unescape(content.decode()) content = HTML(content) li_time = content.xpath( 'string(//div[@class="info-publis"]/p)') href_url = content.xpath( 'string(//div[@class="info-primary"]//h3/a/@href)') #判断数据库存不存在: try: cid = re.match('^/job_detail/(.*?)\.html', href_url).group(1) sql = "select * from positions where cid='%s'" % (cid) find_one_res = self.db.find_one(sql) if find_one_res is None: #先把cid插入,避免重复抓取 sql = "insert into positions(cid) values ('%s')" % ( cid) self.db.save(sql) detail_url_list.append(config.HOST_URL + href_url) elif find_one_res[2] is None: detail_url_list.append(config.HOST_URL + href_url) else: print('数据库存在该记录:' + str(cid)) except: print('查询数据库出错:' + str(cid)) results = self.get_detail(detail_url_list, cityId, zhiweiId) else: print('该url无数据') def get_detail(self, detail_url_list, cityId, zhiweiId): for url in detail_url_list: print('下载该详情页:' + url) html = self.download.get_html(url) if html is not None and html.status_code == 200: html = HTML(html.text) try: cid = re.match( '^https://www.zhipin.com/job_detail/(.*?)\.html', url).group(1) except: print('获取cid失败') continue title = html.xpath('string(//h1)') if title == '': continue url = url try: publishDateStr = html.xpath( 'string(//span[@class="time"])').split('发布于')[1] publishDate = int( time.mktime( time.strptime(publishDateStr, "%Y-%m-%d %H:%M"))) except: publishDateStr = None publishDate = None try: info = html.xpath( 'string(//div[@class="job-banner"]//div[@class="info-primary"]/p)' ) info = info.split(':') city = info[1][:-2] jingyan = info[2][:-2] xueli = info[3] except: city = None jingyan = None xueli = None price = html.xpath( 'string(//div[@class="info-primary"]//span[@class="badge"])' ) price = price.strip() posterName = html.xpath('string(//h2)') posterId = None posterUrl = html.xpath( 'string(//div[@class="detail-figure"]/img/@src)') content = html.xpath( 'string(//div[@class="job-sec"]/div[@class="text"])' ).strip() try: company_text = html.xpath( 'string(//a[@ka="job-cominfo"]/@href)') companyID = re.match('/gongsi/(.*?)\.html', company_text).group(1) except: companyID = None createDate = int(time.time()) temp_time = time.localtime(int(time.time())) now_DateStr = time.strftime("%Y-%m-%d", temp_time) lt = time.strptime(now_DateStr, "%Y-%m-%d") now_timestamp = int(time.mktime(lt)) res_obj = { 'cid': cid, 'title': title, 'url': url, 'publishDateStr': publishDateStr, 'publishDate': publishDate, 'city': city, 'jingyan': jingyan, 'xueli': xueli, 'price': price, 'posterName': posterName, 'posterId': posterId, 'posterUrl': posterUrl, 'content': content, 'companyID': companyID, 'createDate': createDate, 'cityId': cityId, 'zhiweiId': zhiweiId } print(res_obj) sql = "insert into positions(cid,title,url,publishDate,publishDateStr,city,jingyan,xueli,price,posterName,posterId,posterUrl,content,companyID,createDate,cityId, zhiweiId)" \ " VALUES ('%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s')" \ % (cid,title,url,publishDate,publishDateStr,city,jingyan,xueli,price,posterName,posterId,posterUrl,content,companyID,createDate,cityId, zhiweiId)\ + "ON DUPLICATE KEY UPDATE title='%s', url='%s', publishDate='%s', publishDateStr='%s', city='%s', jingyan='%s', xueli='%s', price='%s', posterName='%s', posterId='%s', posterUrl='%s', content='%s', companyID='%s', createDate='%s',cityId='%s', zhiweiId='%s'" \ %(title,url,publishDate,publishDateStr,city,jingyan,xueli,price,posterName,posterId,posterUrl,content,companyID,createDate,cityId, zhiweiId) self.db.save(sql) else: print('请求详情页失败:' + str(url)) def push_url_to_redis(self): # zhiwei_list = [] # zhiwei_sql = 'select * from zhiwei' # zhiwei_results = self.db.find_all(zhiwei_sql) # for zhiwei in zhiwei_results: # zhiwei_list.append(zhiwei[2]) # # zhen_sql = 'select * from zhen' # zhen_results = self.db.find_all(zhen_sql) # # for res in zhen_results: # pid = res[1] # zhen_id = res[2] # for zhiwei_id in zhiwei_list: # url = POSITION_URL.format(pid=pid, zhen_id=zhen_id, zhiwei_id=zhiwei_id, pageToken='1') # self.redisClient.push('employment',url) zhiwei_list = [] zhiwei_sql = 'select * from zhiwei' zhiwei_results = self.db.find_all(zhiwei_sql) for zhiwei in zhiwei_results: zhiwei_list.append(zhiwei[2]) shi_sql = 'select * from shi' shi_results = self.db.find_all(shi_sql) for res in shi_results: pid = res[2] # NEW_POSITION_URL = 'https://www.zhipin.com/job_detail/?query=%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90%E5%B8%88&scity={pid}&industry=&position=' for i in range(1, 11): NEW_POSITION_URL = 'https://www.zhipin.com/c{pid}/?query=%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90%E5%B8%88&page={pageToken}&ka=page-{pageToken}' url = NEW_POSITION_URL.format(pid=pid, pageToken=i) url_obj = {"url": url, "cityId": pid, 'zhiweiId': '1'} self.redisClient.push('employment', json.dumps(url_obj))