def main(check_day):
    """Collect the ids of resumes whose awake/download step failed on *check_day*.

    Queries the ``morgan-v3-<date>`` Elasticsearch index for log lines
    containing "判断是否需要下载 id=" within the hour before *check_day*,
    extracts the numeric id from each hit and stores it into the Redis set
    ``ZHI_LIAN_AWAKE_FAILED-<date>``.

    :param check_day: date string parsable by str2datetime.
    """
    es = Elasticsearch(hosts='172.16.25.9')
    index = 'morgan-v3-%s' \
            % (datetime2str(str2datetime(check_day), fmt='%Y.%m.%d'))
    # Query window: [check_day - 1h, check_day], in epoch milliseconds.
    start_time = int(
        time.mktime((str2datetime(check_day) -
                     datetime.timedelta(hours=1)).timetuple())) * 1000 - 1
    end_time = int(time.mktime((str2datetime(check_day)).timetuple())) * 1000
    body = {
        "version": True,
        "size": 10000,  # cap on the number of hits returned
        "query": {
            "bool": {
                "must": [{
                    "query_string": {
                        "query": "log_message:\"判断是否需要下载 id=\"",
                        "analyze_wildcard": True
                    }
                }, {
                    "range": {
                        "@timestamp": {
                            "gte": start_time,
                            "lte": end_time,
                            "format": "epoch_millis"
                        }
                    }
                }],
                "must_not": []
            }
        },
        "_source": {
            "excludes": []
        },
        "aggs": {
            "2": {
                "date_histogram": {
                    "field": "@timestamp",
                    "interval": "30m",
                    "time_zone": "Asia/Shanghai",
                    "min_doc_count": 1
                }
            }
        }
    }
    res = es.search(index=index, body=body)
    hits = res.get('hits').get('hits')
    today = datetime2str(str2datetime(check_day), fmt='%Y-%m-%d')
    pools = RedisSet('ZHI_LIAN_AWAKE_FAILED-%s' % today)
    logger.info("匹配到%s个唤醒失败的简历 『%s - %s』。"
                % (len(hits), start_time, end_time))
    for hit in hits:
        log_message = hit.get('_source').get('log_message')
        # Extract the numeric id following "id=" (raw string for the regex).
        matches = re.findall(r'(?<=id=)\d+(?=\s)', log_message)
        if not matches:
            # BUGFIX: the original indexed [0] unconditionally and raised
            # IndexError on any log line without a parsable id.
            continue
        pools.sadd(matches[0])
    # BUGFIX: scard() already returns an int; the original wrapped it in
    # len(), which raised TypeError before this line could ever log.
    logger.info('当前集合长度为: %s' % pools.scard())
def resume_search(self, **search_args):
    """Page through 51job (ehire) resume search results and persist each hit.

    Loops forever over the paged result list (page one via
    ``init_search_page``, later pages via pager postbacks), de-duplicates
    resumes against a Redis hash, stores the raw resume HTML in MySQL and
    pushes a parse message downstream.

    :param search_args: must contain 'keywords' (e.g. "销售代表|3001") and
        'city' (e.g. "北京|010000"); forwarded to get_resume_list().
    :raises FiveOneResumeException: on an empty result page or when the
        account hits its daily record limit.
    """
    self.get_cookie()
    # NOTE(review): eval() on the stored proxy string is dangerous if the
    # value can ever come from an untrusted source -- ast.literal_eval or
    # json would be safer; confirm where auth_kwargs['proxy'] originates.
    ip = eval(self.auth_kwargs['proxy'])['ip']
    port = eval(self.auth_kwargs['proxy'])['port']
    self.proxies = {
        'http': 'http://%s:%s' % (ip, port),
        'https': 'https://%s:%s' % (ip, port),
    }
    # Leading '|' so the value can be appended to the username as a key.
    today = '|' + datetime2str(datetime.datetime.now(), '%Y-%m-%d')
    # print(search_args)
    flag = False
    while True:
        # Pause switch: status '400' for the current awake flow suspends
        # the whole pipeline.
        awake_flow_no = self.h_awake_flow_no.hget('FIVE_ONE')
        if self.h_status.hget(awake_flow_no) == '400':
            self.logger.info("程序当前处于暂停状态.sleep 60s")
            time.sleep(60)
            continue
        if flag is False:
            # First iteration: scan page one.
            page_html = self.init_search_page()
            resume_list, page_html = self.get_resume_list(
                page_html, action='pagerTopNew$ctl00', **search_args)
            flag = True
        else:
            resume_list, page_html = self.get_resume_list(
                page_html, **search_args)
        if not resume_list:
            raise FiveOneResumeException('resume_list_empty')
        time.sleep(random.randint(1, 5))
        for resume_args in resume_list:
            if self.is_limited(self.auth_kwargs['username']) is True:
                raise FiveOneResumeException('user_record_limited')
            # NOTE(review): is_limited() is used as a boolean above and as a
            # counter here -- presumably it returns the current usage count
            # when the account is not limited; confirm.
            count = self.is_limited(self.auth_kwargs['username'])
            # Resume de-duplication: key on the hidUserID query parameter.
            resume_id = re.findall('''(?<=hidUserID=)\d+?(?=&)''',
                                   resume_args)[0].encode('utf-8')
            last_search_day = self.h_black_list.hget(resume_id)
            if last_search_day:
                distance = (
                    str2datetime(today.replace('|', ''), '%Y-%m-%d') -
                    str2datetime(last_search_day, '%Y-%m-%d')).days
            else:
                # Never seen before: force it past the threshold below.
                distance = DAY_LIMITED + 1
            if distance < DAY_LIMITED:
                self.logger.warning('该简历%s天内已经被采集过: %s'
                                    % (DAY_LIMITED, resume_id))
                continue
            self.h_black_list.hset(resume_id, today.replace('|', ''))
            resume_detail = self.get_resume_detail(resume_url=resume_args)
            if not resume_detail:
                continue
            resume_uuid = str(uuid.uuid1())
            # content_origin = {'name': '', 'email': '', 'phone': '',
            #                   'html': resume_detail.decode('utf-8')}
            # content = json.dumps(content_origin, ensure_ascii=False)
            content = resume_detail.decode('utf-8')
            sql = '''INSERT INTO spider_search.resume_raw
            (source, content, createBy, trackId, createtime, email,
            emailJobType, emailCity, subject)
            VALUES (%s, %s, "python", %s, now(), %s, %s, %s, %s)'''
            sql_value = (self.common_settings.SOURCE, content, resume_uuid,
                         self.auth_kwargs['username'],
                         search_args['keywords'].split('|')[0],
                         search_args['city'].split('|')[0],
                         str(resume_id))
            resume_update_time = ''
            msg_data = {
                "channelType": "WEB",
                "content": {
                    "content": content,
                    "id": '',
                    "createBy": "python",
                    "createTime": int(time.time() * 1000),
                    "ip": '',
                    "resumeSubmitTime": '',
                    "resumeUpdateTime": resume_update_time,
                    "source": self.common_settings.SOURCE,
                    "trackId": str(resume_uuid),
                    "avatarUrl": '',
                    "email": self.auth_kwargs['username'],
                    'emailJobType': search_args['keywords'].split('|')[0],
                    'emailCity': search_args['city'].split('|')[0],
                    'subject': str(resume_id)
                },
                "interfaceType": "PARSE",
                "resourceDataType": "RAW",
                "resourceType": "RESUME_SEARCH_AWAKE",
                "source": self.common_settings.SOURCE,
                "trackId": str(resume_uuid),
                'traceID': str(resume_uuid),
                'callSystemID': self.common_settings.CALL_SYSTEM_ID,
            }
            # self.mysql_handler.save(sql=sql, data=sql_value)
            res = self.save_data(sql=sql, data=sql_value, msg_data=msg_data)
            if res:
                # Count only successful saves against the daily quota.
                count += 1
                self.h_use_record.hset(
                    self.auth_kwargs['username'] + today, count)
                # Audit record of the awake in a separate MySQL instance.
                mysql_ = self.init_mysql(
                    user='******',
                    passwd='bi_admin#@1mofanghr',
                    host='172.16.25.1',
                    # user='******',
                    # passwd='bi_admin#@1mofanghr',
                    # host='10.0.3.52',
                    cursorclass=DictCursor,
                    cls_singleton=False)
                sql = '''
                INSERT INTO spider.resume_awake_record
                (source, position, city, raw_id, create_time, username)
                VALUES ('FIVE_ONE', %s, %s, %s, now(), %s)
                '''
                value = (search_args['keywords'].split('|')[0],
                         search_args['city'].split('|')[0],
                         res,
                         self.auth_kwargs['username'])
                mysql_.save(sql, value)
            time.sleep(random.randint(5, 7))
def get_resume_list(self, previous_page_html, action='pagerTopNew$ctl03',
                    **search_args):
    """Replay the 51job ASP.NET search form and scrape one page of resume links.

    :param previous_page_html: HTML of the previous result page; supplies
        __VIEWSTATE and the other hidden state fields the server expects.
    :param action: __EVENTTARGET pager control ('pagerTopNew$ctl00' posts
        the first page, 'pagerTopNew$ctl03' posts "next page").
    :param search_args: city['北京|010000'], keywords['销售代表|3001']
    :return: (resume_list, page_html) -- detail-page hrefs plus the raw
        response HTML needed to post the next page ('' when pagination ends).
    """
    url = 'https://ehire.51job.com/Candidate/SearchResumeNew.aspx'
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;'
                  'q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Host': 'ehire.51job.com',
        'Origin': 'https://ehire.51job.com',
        'Pragma': 'no-cache',
        'Referer': 'https://ehire.51job.com/Candidate/'
                   'SearchResumeNew.aspx',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': self.user_agent,
        'Cookie': self.cookie
    }
    _soups = self.html_parser.parser(previous_page_html)
    # Full WebForms replay: every visible control plus the hidden
    # anti-forgery fields copied from the previous page's HTML.
    data = {
        '__EVENTTARGET': action,
        '__EVENTARGUMENT': '',
        '__LASTFOCUS': '',
        '__VIEWSTATE': _soups.find('input', id='__VIEWSTATE').get('value'),
        'ctrlSerach$search_keyword_txt':
            search_args['keywords'].split('|')[0],
        'ctrlSerach$search_company_txt': '',
        'ctrlSerach$search_area_input': '',
        'ctrlSerach$search_area_hid': '',
        'ctrlSerach$search_funtype_hid': '',
        'ctrlSerach$search_expectsalaryf_input': '不限',
        'ctrlSerach$search_expectsalaryt_input': '不限',
        'ctrlSerach$search_industry_hid': '',
        'ctrlSerach$search_wyf_input': '不限',
        'ctrlSerach$search_wyt_input': '不限',
        'ctrlSerach$search_df_input': '不限',
        'ctrlSerach$search_dt_input': '不限',
        'ctrlSerach$search_cursalaryf_input': '不限',
        'ctrlSerach$search_cursalaryt_input': '不限',
        'ctrlSerach$search_age_input': '年龄:18-30',
        'ctrlSerach$search_agef_input': '18',
        'ctrlSerach$search_aget_input': '30',
        'ctrlSerach$search_expjobarea_input':
            search_args['city'].split('|')[0],
        'ctrlSerach$search_expjobarea_hid': search_args['city'],
        'ctrlSerach$search_forlang_input': '语言',
        'ctrlSerach$search_fl_input': '不限',
        'ctrlSerach$search_fllsabilityll_input': '不限',
        'ctrlSerach$search_englishlevel_input': '英语等级',
        'ctrlSerach$search_sex_input': '性别',
        'ctrlSerach$search_major_input': '专业',
        'ctrlSerach$search_major_hid': '',
        'ctrlSerach$search_hukou_input': '户口',
        'ctrlSerach$search_hukou_hid': '',
        'ctrlSerach$search_rsmupdate_input': '近1周',
        'ctrlSerach$search_jobstatus_input': '求职状态',
        'send_cycle': '1',
        'send_time': '7',
        'send_sum': '10',
        # Packed搜索条件 blob the site builds client-side; keyword + city.
        'ctrlSerach$hidSearchValue':
            u'%s##0#######20#35############近1周|1##1#0##%s#0#0#0'
            % (search_args['keywords'].split('|')[0],
               search_args['city']),
        'ctrlSerach$hidKeyWordMind': '',
        'ctrlSerach$hidRecommend': '',
        'ctrlSerach$hidWorkYearArea': '',
        'ctrlSerach$hidDegreeArea': '',
        'ctrlSerach$hidSalaryArea': '',
        'ctrlSerach$hidCurSalaryArea': '',
        'ctrlSerach$hidIsRecDisplay': '1',
        'showselected': '',
        'pagerTopNew$ctl06': '50',
        'cbxColumns$0': 'AGE',
        'cbxColumns$1': 'WORKYEAR',
        'cbxColumns$2': 'SEX',
        'cbxColumns$3': 'AREA',
        'cbxColumns$4': 'WORKFUNC',
        'cbxColumns$5': 'TOPDEGREE',
        'cbxColumns$6': 'LASTUPDATE',
        'hidAccessKey': _soups.find('input', id='hidAccessKey').get('value'),
        'hidShowCode': '0',
        'hidDisplayType': '1',
        'hidEhireDemo': '',
        'hidUserID': '',
        'hidCheckUserIds':
            _soups.find('input', id='hidCheckUserIds').get('value'),
        'hidCheckKey': _soups.find('input', id='hidCheckKey').get('value'),
        'hidEvents': '',
        'hidNoSearchIDs': '',
        'hidBtnType': '',
        'hideMarkid': '',
        'hidStrAuthority':
            _soups.find('input', id='hidStrAuthority').get('value'),
        'hidDownloadNum':
            _soups.find('input', id='hidDownloadNum').get('value'),
        'hidKeywordCookie': '',
        'showGuide': '',
    }
    if not search_args['use_keywords']:
        # Search by job function id instead of free-text keyword.
        self.logger.info('采用职能进行搜索.')
        data['ctrlSerach$search_keyword_txt'] = ''
        data['ctrlSerach$search_funtype_hid'] = search_args['keywords']
        # NOTE(review): this key lacks the 'ctrlSerach$' prefix, so it adds
        # a new form field instead of overriding 'ctrlSerach$hidSearchValue'
        # above -- looks like a bug; confirm against the live form.
        data['hidSearchValue'] = \
            u'##0#%s######20#35############近1周|1##1#0##%s#0#0#0' \
            % (search_args['keywords'], search_args['city'])
    else:
        self.logger.info('采用关键词进行搜索.')
    res = self.html_downloader.download(url, method='POST', headers=headers,
                                        data=data, proxies=self.proxies,
                                        allow_redirects=False)
    # A 302 here means the session cookie has expired.
    if res.status_code == 302:
        self.logger.warning('cookie invalid.')
        # self.h.hset(
        #     self.source + '|' + self.auth_kwargs['username'].encode(
        #         'utf-8'), '')
        raise MfCookieValidException('cookie_invalid')
    access_key = self.html_parser.parser(res.content).find(
        'input', id='hidAccessKey').get('value')
    # auth_ = self.html_parser.parser(res.content).find(
    #     'div', id='divVerifyCode_ch').get('style')
    soups = self.html_parser.parser(res.content).find_all(
        'td', class_='Common_list_table-id-text')
    resume_list = []
    if not soups:
        # empty_times tracks consecutive empty responses per account; after
        # 10 in a row we assume a captcha wall and go solve it.
        empty_times = int(self.h_search_empty_times.hget(
            self.auth_kwargs['username'])) \
            if self.h_search_empty_times.hget(
                self.auth_kwargs['username']) else 0
        if empty_times > 10:
            self.logger.warning(
                '搜索列表遇到验证码. %s'
                % self.auth_kwargs['username'].encode('utf-8'))
            self.get_captcha(referer=res.url, access_key=access_key,
                             do_type='CheckSearchResume')
            self.h_search_empty_times.hset(self.auth_kwargs['username'], 0)
            raise Exception
        else:
            self.logger.warning(
                '未匹配到搜索结果,跳过该任务[%s, %s, %s]'
                % (self.auth_kwargs['username'].encode('utf-8'),
                   search_args['keywords'].encode('utf-8'),
                   search_args['city'].encode('utf-8')))
            empty_times += 1
            self.h_search_empty_times.hset(self.auth_kwargs['username'],
                                           empty_times)
            return resume_list, ''
    for soup in soups:
        # Second-to-last <td> of the row holds the resume's refresh date.
        ref_time = soup.find_parent().find_all('td')[-2].text.encode(
            'utf-8')
        if datetime.datetime.now().isoweekday() == 1:  # Monday
            # global DAY_LIMITED
            # DAY_LIMITED = 5
            # limited_day = datetime.datetime.now() - datetime.timedelta(
            #     days=4)
            global DAY_LIMITED
            DAY_LIMITED = settings.DAY_LIMITED
            limited_day = datetime.datetime.now() - datetime.timedelta(
                days=1)
        else:
            # NOTE(review): both weekday branches are currently identical;
            # presumably Monday once looked back over the weekend.
            global DAY_LIMITED
            DAY_LIMITED = settings.DAY_LIMITED
            limited_day = datetime.datetime.now() - datetime.timedelta(
                days=1)
        # Results are sorted by refresh date, so the first stale row ends
        # the page.
        if str2datetime(ref_time, '%Y-%m-%d').date() < \
                limited_day.date():
            self.logger.warning('匹配到%s天前的简历,执行跳过操作.'
                                % DAY_LIMITED)
            break
        resume_list.append(soup.find('a').get('href'))
    try:
        page = self.html_parser.parser(res.content).find(
            'div', class_='Search_page-numble').find(
            'a', class_='active').get('title').encode('utf-8')
    except Exception as e:
        # No pager widget: single-page result, nothing more to fetch.
        self.logger.warning('未找到分页组件,跳过该任务[%s, %s]'
                            % (search_args['keywords'].encode('utf-8'),
                               search_args['city'].encode('utf-8')))
        return resume_list, ''
    self.logger.info(
        'page: %s, 总计获取到简历%s份, 搜索条件[%s, %s]'
        % (page, len(resume_list),
           search_args['keywords'].encode('utf-8'),
           search_args['city'].encode('utf-8')))
    if int(page) > settings.TASK_PAGE_LIMIT:
        raise FiveOneResumeException('task_page_limit')
    return resume_list, res.content
def resume_search(self, page, **search_args):
    """Fetch one page of zhaopin resumes (APP channel) and push each detail.

    De-duplicates against a Redis hash, enforces the per-account daily
    detail-view quota, then pushes each resume payload downstream via
    push_resume().

    :param page: 1-based result page number, forwarded to get_resume_list().
    :param search_args: search filters; must include 'keywords' and 'city'.
    :raises ZhiLianResumeException: empty result page or account limited.
    """
    self.get_cookie()
    # NOTE(review): eval() on the stored proxy string -- safe only if
    # auth_kwargs['proxy'] is always produced internally; confirm.
    ip = eval(self.auth_kwargs['proxy'])['ip']
    port = eval(self.auth_kwargs['proxy'])['port']
    self.proxies = {
        'http': 'http://%s:%s' % (ip, port),
        'https': 'https://%s:%s' % (ip, port),
    }
    # Leading '|' so the value can be appended to the username as a key.
    today = '|' + datetime2str(datetime.datetime.now(), '%Y-%m-%d')
    # print(search_args)
    resume_list = self.get_resume_list(page=page, **search_args)
    if not resume_list:
        raise ZhiLianResumeException('resume_list_empty')
    for resume_args in resume_list:
        # Per-account counter of detail pages opened today.
        if not self.h_use_record.hget(
                self.auth_kwargs['username'] + today):
            self.h_use_record.hset(self.auth_kwargs['username'] + today, 0)
            count = 0
        else:
            count = int(
                self.h_use_record.hget(
                    self.auth_kwargs['username'] + today))
        if self.check_limit(count=count):
            # NOTE(review): 'today' is reassigned here WITHOUT the '|'
            # prefix; harmless only because we raise immediately after.
            today = datetime2str(datetime.datetime.now(), '%Y-%m-%d')
            self.h_over_search_limit.hset(today + '|' + self.auth_kwargs[
                'username'].encode('utf-8'), 1)
            raise ZhiLianResumeException('user_record_limited')
        # Resume de-duplication: first 10 chars of the resume number;
        # falls back to the 'number' field when 'resumeNo' is absent.
        try:
            resume_id = str(resume_args.get('resumeNo').encode('utf-8')[
                            :10])
        except:
            resume_id = str(resume_args.get('number')[:10])
        last_search_day = self.h_black_list.hget(resume_id)
        if last_search_day:
            distance = (str2datetime(today.replace('|', ''), '%Y-%m-%d') -
                        str2datetime(last_search_day, '%Y-%m-%d')).days
        else:
            # Never seen before: force it past the threshold below.
            distance = DAY_LIMITED + 1
        if distance < DAY_LIMITED:
            self.logger.warning('该简历%s天内已经被采集过: %s'
                                % (DAY_LIMITED, resume_id))
            continue
        self.h_black_list.hset(resume_id, today.replace('|', ''))
        resume_detail = self.get_resume_detail(
            resume_args=resume_args)
        if not resume_detail:
            continue
        # 'download' source means the resume came from the inbox, not from
        # an active search.
        if resume_detail.get('resumeSource').encode('utf-8').lower() == \
                'download':
            resource_type = 'RESUME_INBOX'
        else:
            resource_type = 'RESUME_SEARCH'
        content = json.dumps({'name': '', 'email': '', 'phone': '',
                              'html': resume_detail}, ensure_ascii=False)
        data = {
            'ChannelType': 'APP',
            'Source': self.source,
            'ResourceType': resource_type,
            'content': content,
            'accountContent': json.dumps(self.auth_kwargs,
                                         ensure_ascii=False,
                                         cls=JsonCustomEncoder),
            'resumeId': resume_detail.get('resumeNo'),
            'searchData': json.dumps(
                search_args.get('origin_search_args'),
                ensure_ascii=False),
            'code': 200
        }
        self.push_resume(**data)
        time.sleep(random.randint(1, 5))
def execute_awake():
    """Task loop: pull a search task, page through ZHI_LIAN / liepin resume
    lists, fetch details, persist raw resumes and mark them as collected.

    Runs forever; when the task queue is empty a new batch is pushed.
    Cookie failures re-queue the current task; any other exception marks
    the task FAILURE with the exception text.
    """
    runner = ResumeFen()
    while True:
        task_id, params = runner.get_task()
        if not task_id:
            runner.push_task()
            continue
        try:
            runner.get_cookie()
            page = 1
            has_next_page = True
            while has_next_page:
                # The task's model_name decides which site's list endpoint
                # is used ('zl' = zhaopin, 'lp' = liepin).
                if params.get('model_name') == 'ZHI_LIAN':
                    mode = 'zl'
                    resume_list = runner.get_resume_list_zl(page, **params)
                else:
                    mode = 'lp'
                    resume_list = runner.get_resume_list_lp(page, **params)
                if not resume_list:
                    runner.logger.warning('简历列表为空,开始切换任务.')
                    runner.update_task(task_id=task_id)
                    break
                flag = 0  # already-collected resumes seen on this page
                for resume in resume_list:
                    resume_id = resume.get('id').encode('utf-8')
                    today = datetime2str(datetime.datetime.now(),
                                         '%Y-%m-%d')
                    last_search_day = runner.h_search_back_list.hget(
                        resume_id)
                    if flag > 25:
                        runner.logger.info('当前页存在超过25个已采集简历,跳过任务.')
                        has_next_page = False
                        break
                    # De-duplication window of TIME_LIMIT days.
                    if last_search_day:
                        # BUGFIX: the original compared day-of-month values
                        # (day - last_search_day), which went negative
                        # across month boundaries and wrongly skipped fresh
                        # resumes; subtract real dates like the sibling
                        # resume_search() implementations do.
                        distance = (
                            str2datetime(today, fmt='%Y-%m-%d') -
                            str2datetime(last_search_day,
                                         fmt='%Y-%m-%d')).days
                        if distance <= TIME_LIMIT:
                            runner.logger.warning(
                                '该简历[%s] %s天内已采集过.'
                                % (resume_id, TIME_LIMIT))
                            flag += 1
                            continue
                    content = runner.get_resume_detail(resume_id=resume_id,
                                                       mode=mode)
                    if not content:
                        continue
                    content = json.dumps(content, ensure_ascii=False)
                    resume_uuid = str(uuid.uuid1())
                    sql = '''insert into spider_search.resume_raw
                    (source, content, createBy, trackId, createtime,
                    email, emailJobType, emailCity, subject)
                    values (%s, %s, "python", %s, now(), %s, %s, %s, %s)'''
                    sql_value = (runner.common_settings.SOURCE, content,
                                 resume_uuid,
                                 runner.auth_kwargs['username'],
                                 params['job_name'], params['area_name'],
                                 str(resume_id))
                    resume_update_time = ''
                    msg_data = {
                        "channelType": "WEB",
                        "content": {
                            "content": content,
                            "id": '',
                            "createBy": "python",
                            "createTime": int(time.time() * 1000),
                            "ip": '',
                            "resumeSubmitTime": '',
                            "resumeUpdateTime": resume_update_time,
                            "source": runner.common_settings.SOURCE,
                            "trackId": str(resume_uuid),
                            "avatarUrl": '',
                            "email": runner.auth_kwargs['username'],
                            'emailJobType': params['job_name'],
                            'emailCity': params['area_name'],
                            'subject': resume_id
                        },
                        "interfaceType": "PARSE",
                        "resourceDataType": "RAW",
                        "resourceType": "RESUME_SEARCH",
                        "source": runner.common_settings.SOURCE,
                        "trackId": str(resume_uuid),
                        'traceID': str(resume_uuid),
                        'callSystemID':
                            runner.common_settings.CALL_SYSTEM_ID,
                    }
                    # self.mysql_handler.save(sql=sql, data=sql_value)
                    res = runner.save_data(sql=sql, data=sql_value,
                                           msg_data=msg_data)
                    if res:
                        # Reset the cookie-retry counter and remember the
                        # collection date for future de-duplication.
                        runner.h_account_limit.hset(
                            runner.auth_kwargs['username'], 0)
                        runner.h_search_back_list.hset(resume_id, today)
                    time.sleep(random.randint(1, 5))
                if len(resume_list) < 30:
                    # A short page means this was the last one.
                    runner.logger.info('当前页简历小于30,任务结束。')
                    has_next_page = False
                page += 1
            runner.update_task(task_id=task_id)
        except MfCookieValidException:
            runner.update_task(task_id=task_id)
            runner.add_task(param=json.dumps(params, ensure_ascii=False))
            runner.logger.warning('因Cookie失败导致任务退出,重新添加任务!')
        except Exception as e:
            runner.update_task(task_id=task_id, execute_status='FAILURE',
                               execute_result=str(e))
            runner.logger.exception(str(e))
def get_resume_list(self, page=1, is_download=False, **search_args):
    """Fetch one page of the ihr.zhaopin.com resume search API.

    搜索条件: 关键词/所在地/年龄20-35/学历/最近三天upDate

    :param page: 1-based page number (30 rows per page).
    :param is_download: when True, skip the freshness cutoff and accept
        every returned resume.
    :param search_args: 'keywords' ("名称|编码"), 'city' ("名称|编码"),
        'use_keywords' plus optional filter overrides (age_start, sex, ...).
    :return: accepted resume dicts; [] when the response carries no results.
    :raises MfCookieValidException: API code 6001 (cookie expired).
    :raises ZhiLianResumeException: API code 808 (daily search limit hit).
    """
    url = 'https://ihr.zhaopin.com/resumesearch/search.do?' \
          'access_token=%s' % self.access_token
    headers = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Host': 'ihr.zhaopin.com',
        'Origin': 'https://ihr.zhaopin.com',
        'Pragma': 'no-cache',
        'Referer': 'https://ihr.zhaopin.com/resumesearch/search/',
        'User-Agent': self.user_agent,
        'Cookie': self.cookie,
        'X-Requested-With': 'XMLHttpRequest',
    }
    data = {
        'keywords': search_args['keywords'].split('|')[0].encode('utf-8'),
        'startNum': (page - 1) * 30,
        'rowsCount': '30',
        'resumeGrade': '',
        'sortColumnName': 'sortUpDate',
        'sortColumn': 'sortUpDate desc',
        'onlyHasImg': 'false',
        'anyKeyWord': 'false',
        'hopeWorkCity': search_args['city'].split('|')[1].encode('utf-8'),
        'ageStart': search_args.get('age_start', '18'),
        'ageEnd': search_args.get('age_end', '30'),
        'workYears': search_args.get('work_years', ''),
        'liveCity': search_args.get('live_city', ''),
        'sex': search_args.get('sex', ''),
        'edu': search_args.get('degree', '5'),
        'upDate': search_args.get('up_date', ''),  # 默认搜索最近三天
        'companyName': search_args.get('company_name', ''),
        'jobType': '',
        'desiredJobType': search_args.get('desired_job_type', ''),
        'industry': search_args.get('industry', ''),
        'desiredIndustry': '',
        'careerStatus': '',
        'desiredSalary': '',
        'langSkill': '',
        'hukouCity': '',
        'major': '',
        'onlyLastWork': 'false',
    }
    # print(json.dumps(data, ensure_ascii=False, indent=4))
    if search_args['use_keywords'] is False:
        # Search by desired-job-type code instead of free-text keyword.
        data['desiredJobType'] = search_args['keywords'].split('|')[1]
        self.logger.info('采用职能进行搜索.')
    else:
        self.logger.info('采用关键词进行搜索')
    res = self.html_downloader.download(url, method='POST', data=data,
                                        headers=headers,
                                        proxies=self.proxies)
    # self.logger.info('搜索返回 %s' % res.json())
    # Parse the JSON body once instead of re-parsing per check.
    payload = res.json()
    if payload.get('code') == 6001:
        self.logger.info(self.logger_prefix + 'cookie失效了')
        self.set_cookie_invalid()
        raise MfCookieValidException('cookie_invalid')
    if payload.get('code') == 808:
        # Daily big-library search quota exhausted for this account.
        self.logger.warning(self.logger_prefix +
                            payload.get('message').encode('utf-8'))
        today = datetime2str(datetime.datetime.now(), '%Y-%m-%d')
        self.h_over_search_limit.hset(
            today + '|' + self.auth_kwargs['username'].encode('utf-8'), 1)
        # 当日搜索大库简历已达上限
        global LIMIT_MESSAGE_BOX
        if not LIMIT_MESSAGE_BOX.get(
                self.auth_kwargs['username'].encode('utf-8'), ''):
            # Notify the ops channel only once per account.
            LIMIT_MESSAGE_BOX[self.auth_kwargs['username'].encode(
                'utf-8')] = 1
            self.robot_login.send_markdown(
                title="智联简历搜索",
                content="#### 智联简历当日关键词搜索量已达上限.\n"
                        "- 帐号: %s\n"
                        "- 密码: %s\n"
                        "- 代理: %s\n"
                        "- 达到上限账号总数: %s\n"
                        % (self.auth_kwargs['username'].encode('utf-8'),
                           self.auth_kwargs['password'].encode('utf-8'),
                           self.auth_kwargs['ip'].encode('utf-8') + ':' +
                           self.auth_kwargs['port'].encode('utf-8'),
                           len(LIMIT_MESSAGE_BOX)))
        raise ZhiLianResumeException('user_record_limited')
    try:
        resume_list = payload.get('results')
        if not resume_list:
            raise Exception
    except Exception as e:
        self.logger.exception('获取list失败: %s | %s'
                              % (str(e), res.content))
        return []
    # Freshness cutoff. Hoisted out of the loop: the original recomputed
    # DAY_LIMITED / limited_day per resume, with a weekday if/else whose
    # two branches were byte-identical (dead duplication).
    if is_download is False:
        global DAY_LIMITED
        DAY_LIMITED = 2
        limited_day = datetime.datetime.now() - datetime.timedelta(days=1)
    resume_accept_list = []
    for resume in resume_list:
        if is_download is False:
            # Results are sorted by modifyDate desc, so the first stale
            # entry ends the page.
            if str2datetime(resume.get('modifyDate'),
                            '%Y-%m-%d').date() < limited_day.date():
                self.logger.warning('匹配到%s天前的简历,执行跳过操作.'
                                    % DAY_LIMITED)
                break
        resume_accept_list.append(resume)
    self.logger.info('page: %s, 总计获取到简历%s份, 搜索条件[%s, %s]'
                     % (page, len(resume_accept_list),
                        search_args['keywords'].encode('utf-8'),
                        search_args['city'].encode('utf-8')))
    return resume_accept_list
def get_resume_list(self, page=1, **search_args):
    """Scrape one page of the ihrsearch.zhaopin.com custom search results.

    搜索条件: 关键词/所在地/年龄20-35/学历/最近三天SF_1_1_7

    :param page: 1-based page number (30 rows per page).
    :param search_args: 'keywords' ("名称|编码"), 'city' ("名称|编码"),
        'degree', 'use_keywords'.
    :return: list of dicts with 'resumeNo', 't', 'k' for each fresh resume;
        [] when the result table cannot be parsed.
    :raises MfCookieValidException: on a 302 redirect (cookie expired).
    """
    # Build the query URL once. The original duplicated the whole URL
    # string four times (url + referer in each branch); the Referer is
    # always identical to the requested URL. Function search uses field
    # SF_1_1_2 with the keyword code, keyword search uses SF_1_1_1 with
    # the keyword text.
    if search_args['use_keywords'] is False:
        self.logger.info('采用职能进行搜索.')
        search_field = 'SF_1_1_2'
        search_value = search_args['keywords'].split('|')[1].encode('utf-8')
    else:
        self.logger.info('采用关键词进行搜索.')
        search_field = 'SF_1_1_1'
        search_value = search_args['keywords'].split('|')[0].encode('utf-8')
    url = 'https://ihrsearch.zhaopin.com/Home/ResultForCustom?' \
          '%s=%s&' \
          'SF_1_1_18=%s&' \
          'orderBy=DATE_MODIFIED,1&' \
          'pageSize=30&' \
          'SF_1_1_27=0&' \
          'SF_1_1_5=%s,16&' \
          'SF_1_1_8=18,30&' \
          'SF_1_1_7=1,9&' \
          'exclude=1&pageIndex=%s' \
          % (search_field, search_value,
             search_args['city'].split('|')[1].encode('utf-8'),
             search_args['degree'], page)
    referer = url
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;'
                  'q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Host': 'ihrsearch.zhaopin.com',
        'Pragma': 'no-cache',
        'Referer': referer,
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': self.user_agent,
        'Cookie': self.cookie
    }
    res = self.html_downloader.download(url, headers=headers,
                                        proxies=self.proxies,
                                        allow_redirects=False)
    # A 302 here means the session cookie has expired.
    if res.status_code == 302:
        self.logger.warning('cookie失效了')
        self.set_cookie_invalid()
        raise MfCookieValidException('cookie_invalid')
    try:
        # 'info' rows carry the resume link; the parallel 'middle' rows
        # carry the metadata cells (last column = modified date).
        soups = self.html_parser.parser(res.content).find(
            'form', attrs={
                'name': 'frmResult'
            }).find('tbody').find_all('tr', class_='info')
        soups1 = self.html_parser.parser(res.content).find(
            'form', attrs={
                'name': 'frmResult'
            }).find('tbody').find_all('tr', valign='middle')
    except Exception as e:
        self.logger.exception('获取resume_list失败: %s' % str(e))
        return []
    # Freshness cutoff. Hoisted out of the loop: the original recomputed
    # DAY_LIMITED / limited_day per row, via a weekday if/else whose two
    # branches were byte-identical (dead duplication).
    global DAY_LIMITED
    DAY_LIMITED = 2
    limited_day = datetime.datetime.now() - datetime.timedelta(days=1)
    resume_list = []
    for index, soup in enumerate(soups):
        # Rows are sorted by modified date desc ('%y-%m-%d', two-digit
        # year), so the first stale row ends the page.
        if str2datetime(
                soups1[index].find_all('td')[-1].text.encode('utf-8'),
                '%y-%m-%d').date() < limited_day.date():
            self.logger.warning('匹配到%s天前的简历,执行跳过操作.'
                                % DAY_LIMITED)
            break
        resume_item = dict()
        resume_item['resumeNo'] = soup.find('a').get('resumeurlpart')
        resume_item['t'] = soup.find('a').get('t')
        resume_item['k'] = soup.find('a').get('k')
        resume_list.append(resume_item)
    self.logger.info(
        'page: %s, 总计获取到简历%s份, 搜索条件[%s, %s]'
        % (page, len(resume_list),
           search_args['keywords'].encode('utf-8'),
           search_args['city'].encode('utf-8')))
    return resume_list
def resume_search(self, page, **search_args):
    """Fetch one page of zhaopin resumes (awake flow) and persist each hit.

    De-duplicates against a Redis hash, enforces the per-account daily
    quota, logs every seen resume id into the no-repeat audit table, then
    saves accepted resumes to MySQL and pushes a parse message downstream.

    :param page: 1-based result page number, forwarded to get_resume_list().
    :param search_args: search filters; must include 'keywords' and 'city'.
    :raises ZhiLianResumeException: empty result page or account limited.
    """
    self.get_cookie()
    # NOTE(review): eval() on the stored proxy string -- safe only if
    # auth_kwargs['proxy'] is always produced internally; confirm.
    ip = eval(self.auth_kwargs['proxy'])['ip']
    port = eval(self.auth_kwargs['proxy'])['port']
    self.proxies = {
        'http': 'http://%s:%s' % (ip, port),
        'https': 'https://%s:%s' % (ip, port),
    }
    # Leading '|' so the value can be appended to the username as a key.
    today = '|' + datetime2str(datetime.datetime.now(), '%Y-%m-%d')
    # print(search_args)
    resume_list = self.get_resume_list(page=page, **search_args)
    if not resume_list:
        raise ZhiLianResumeException('resume_list_empty')
    for resume_args in resume_list:
        # Per-account counter of detail pages opened today.
        if not self.h_use_record.hget(self.auth_kwargs['username'] + today):
            self.h_use_record.hset(self.auth_kwargs['username'] + today, 0)
            count = 0
        else:
            count = int(
                self.h_use_record.hget(self.auth_kwargs['username'] + today))
        if self.check_limit(count=count):
            # NOTE(review): 'today' is reassigned here WITHOUT the '|'
            # prefix; harmless only because we raise immediately after.
            today = datetime2str(datetime.datetime.now(), '%Y-%m-%d')
            self.h_over_search_limit.hset(
                today + '|' + self.auth_kwargs['username'].encode('utf-8'),
                1)
            raise ZhiLianResumeException('user_record_limited')
        # Resume de-duplication: first 10 chars of the resume number;
        # falls back to the 'number' field when 'resumeNo' is absent.
        try:
            resume_id = str(
                resume_args.get('resumeNo').encode('utf-8')[:10])
        except:
            resume_id = str(resume_args.get('number')[:10])
        # Audit every id seen (pre-dedup) in the no-repeat record table.
        mysql_1 = self.init_mysql(
            user='******',
            passwd='bi_admin#@1mofanghr',
            host='172.16.25.1',
            # user='******',
            # passwd='bi_admin#@1mofanghr',
            # host='10.0.3.52',
            cursorclass=DictCursor,
            cls_singleton=False)
        sql = '''
        insert into spider.resume_awake_record_no_repeat
        (source, position, city, raw_id, create_time, username)
        VALUES ('ZHI_LIAN', %s, %s, %s, now(), %s)
        '''
        value = (search_args['keywords'].split('|')[0],
                 search_args['city'].split('|')[0],
                 resume_id,
                 self.auth_kwargs['username'])
        mysql_1.save(sql, value)
        del mysql_1
        last_search_day = self.h_black_list.hget(resume_id)
        if last_search_day:
            distance = (str2datetime(today.replace('|', ''), '%Y-%m-%d') -
                        str2datetime(last_search_day, '%Y-%m-%d')).days
        else:
            # Never seen before: force it past the threshold below.
            distance = DAY_LIMITED + 1
        if distance < DAY_LIMITED:
            self.logger.warning('该简历%s天内已经被采集过: %s'
                                % (DAY_LIMITED, resume_id))
            continue
        self.h_black_list.hset(resume_id,
                               today.replace('|', ''))
        resume_detail = self.get_resume_detail(resume_args=resume_args)
        if not resume_detail:
            continue
        resume_uuid = str(uuid.uuid1())
        content = json.dumps(
            {
                'name': '',
                'email': '',
                'phone': '',
                'html': resume_detail
            },
            ensure_ascii=False)
        sql = '''insert into spider_search.resume_raw
        (source, content, createBy, trackId, createtime, email,
        emailJobType, emailCity, subject)
        values (%s, %s, "python", %s, now(), %s, %s, %s, %s)'''
        sql_value = (self.common_settings.SOURCE, content, resume_uuid,
                     self.auth_kwargs['username'],
                     search_args['keywords'],
                     search_args['city'],
                     str(resume_detail.get('resumeNo')))
        resume_update_time = ''
        msg_data = {
            "channelType": "APP",
            "content": {
                "content": content,
                "id": '',
                "createBy": "python",
                "createTime": int(time.time() * 1000),
                "ip": '',
                "resumeSubmitTime": '',
                "resumeUpdateTime": resume_update_time,
                "source": self.common_settings.SOURCE,
                "trackId": str(resume_uuid),
                "avatarUrl": '',
                "email": self.auth_kwargs['username'],
                'emailJobType': search_args['keywords'],
                'emailCity': search_args['city'],
                'subject': str(resume_detail.get('resumeNo'))
            },
            "interfaceType": "PARSE",
            "resourceDataType": "RAW",
            "resourceType": "RESUME_SEARCH_AWAKE",
            "source": self.common_settings.SOURCE,
            "trackId": str(resume_uuid),
            'traceID': str(resume_uuid),
            'callSystemID': self.common_settings.CALL_SYSTEM_ID,
        }
        # self.mysql_handler.save(sql=sql, data=sql_value)
        res = self.save_data(sql=sql, data=sql_value, msg_data=msg_data)
        if res:
            # Count only successful saves against the daily quota.
            count += 1
            self.h_use_record.hset(self.auth_kwargs['username'] + today,
                                   count)
            # Audit record of the successful awake.
            mysql_ = self.init_mysql(
                user='******',
                passwd='bi_admin#@1mofanghr',
                host='172.16.25.1',
                # user='******',
                # passwd='bi_admin#@1mofanghr',
                # host='10.0.3.52',
                cursorclass=DictCursor,
                cls_singleton=False)
            sql = '''
            insert into spider.resume_awake_record
            (source, position, city, raw_id, create_time, username)
            VALUES ('ZHI_LIAN', %s, %s, %s, now(), %s)
            '''
            value = (search_args['keywords'].split('|')[0],
                     search_args['city'].split('|')[0],
                     res,
                     self.auth_kwargs['username'])
            mysql_.save(sql, value)
            del mysql_
        time.sleep(random.randint(3, 5))