def main(check_day):
    es = Elasticsearch(hosts='172.16.25.9')
    index = 'morgan-v3-%s' \
            % (datetime2str(str2datetime(check_day), fmt='%Y.%m.%d'))
    start_time = int(time.mktime(
        (str2datetime(check_day)
         - datetime.timedelta(hours=1)).timetuple())) * 1000 - 1
    end_time = int(time.mktime(str2datetime(check_day).timetuple())) * 1000
    body = {
        "version": True,
        "size": 10000,  # caps the number of returned hits
        "query": {
            "bool": {
                "must": [{
                    "query_string": {
                        # the Chinese phrase is matched against upstream
                        # log data and must stay as-is
                        "query": "log_message:\"判断是否需要下载 id=\"",
                        "analyze_wildcard": True
                    }
                }, {
                    "range": {
                        "@timestamp": {
                            "gte": start_time,
                            "lte": end_time,
                            "format": "epoch_millis"
                        }
                    }
                }],
                "must_not": []
            }
        },
        "_source": {"excludes": []},
        "aggs": {
            "2": {
                "date_histogram": {
                    "field": "@timestamp",
                    "interval": "30m",
                    "time_zone": "Asia/Shanghai",
                    "min_doc_count": 1
                }
            }
        }
    }
    res = es.search(index=index, body=body)
    hits = res.get('hits').get('hits')
    today = datetime2str(str2datetime(check_day), fmt='%Y-%m-%d')
    pools = RedisSet('ZHI_LIAN_AWAKE_FAILED-%s' % today)
    logger.info("Matched %s resumes whose awakening failed [%s - %s]."
                % (len(hits), start_time, end_time))
    for hit in hits:
        log_message = hit.get('_source').get('log_message')
        # extract the numeric id with a regex
        normal_id = re.findall(r'(?<=id=)\d+(?=\s)', log_message)[0]
        pools.sadd(normal_id)
    # scard() already returns the cardinality, so no len() around it
    logger.info('Current set size: %s' % pools.scard())
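# A standalone sketch of the two fiddly pieces in main(): the one-hour
# epoch-millis window and the lookbehind/lookahead id regex. The date and
# log line below are illustrative, not tied to the real index, and the
# window assertion assumes no DST transition in the local timezone.
import re
import time
import datetime

check_day = datetime.datetime(2018, 9, 1)
start_ms = int(time.mktime(
    (check_day - datetime.timedelta(hours=1)).timetuple())) * 1000 - 1
end_ms = int(time.mktime(check_day.timetuple())) * 1000
assert end_ms - start_ms == 3600 * 1000 + 1  # one hour, exclusive lower edge

log_message = u'判断是否需要下载 id=123456 other=fields'
normal_id = re.findall(r'(?<=id=)\d+(?=\s)', log_message)[0]
assert normal_id == '123456'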
def get_resume_list(self, page=1, **params):
    self.logger.info('Starting search task, USER: {} PAGE: {} {}'.format(
        self.auth_kwargs['username'].encode('utf-8'), page,
        json.dumps(params, ensure_ascii=False).encode('utf-8')))
    end = datetime.datetime.now() + datetime.timedelta(days=1)
    start = end - datetime.timedelta(days=settings.SEARCH_DAY)
    post_date = datetime2str(start, fmt='%Y%m%d') + '000000_' \
        + datetime2str(end, fmt='%Y%m%d') + '000000'
    url = '{}pn{}/pve_5593_{}/?key={}&age=18_30&postdate={}'.format(
        params.get('city_url'), page, params.get('degree', '4'),
        params.get('keyword').encode('utf-8'), post_date)
    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;'
                  'q=0.9,image/webp,image/apng,*/*;q=0.8',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        'cache-control': 'no-cache',
        'cookie': self.cookie,
        'pragma': 'no-cache',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/68.0.3440.84 Safari/537.36'
    }
    res = self.html_downloader.download(url, headers=headers,
                                        proxies=self.proxies)
    if 'passport.58.com' in res.url:
        raise MfCookieValidException('cookie invalid. {}'.format(
            self.auth_kwargs['username'].encode('utf-8')))
    real_html = self.get_real_html(res.content)
    soups = self.html_parser.parser(real_html).find(
        'div', id='infolist').find_all('dl')
    has_next = bool(self.html_parser.parser(real_html).find(
        'div', class_='pagerout').find('a', class_='next'))
    url_lst = []
    for soup in soups:
        url = soup.find('dt').find('a').get('href')
        resume_id = self.get_resume_id(url)
        if self.do_filter(resume_id):
            self.logger.info('Resume {} was already collected within '
                             'the last {} days'.format(
                                 resume_id, settings.DELAY_DAY))
            continue
        url_lst.append(url)
    time.sleep(random.randint(1, 2))
    # the trailing <dl> is presumably not a real result row, so drop it
    return has_next, url_lst[:-1]
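# A hedged sketch of the postdate window string built in get_resume_list():
# '<start>000000_<end>000000', where end is tomorrow and start is SEARCH_DAY
# days earlier. SEARCH_DAY=3 is an assumed value; the real one lives in
# settings.
import datetime

SEARCH_DAY = 3
end = datetime.datetime.now() + datetime.timedelta(days=1)
start = end - datetime.timedelta(days=SEARCH_DAY)
post_date = (start.strftime('%Y%m%d') + '000000_'
             + end.strftime('%Y%m%d') + '000000')
print(post_date)  # e.g. '20180901000000_20180904000000'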
def is_limited(self, username):
    # caps how many times an account may open detail pages per day
    today = '|' + datetime2str(datetime.datetime.now(), '%Y-%m-%d')
    if not self.h_use_record.hget(username + today):
        self.h_use_record.hset(username + today, 0)
        count = 0
    else:
        count = int(self.h_use_record.hget(username + today))
    # a membership test was clearly intended here, not tuple equality
    if username in ('E4gl36307', '18518261507', 'bjlb7610'):
        # if count >= 2000:
        if count >= settings.COUNT_LIMITED:
            return True
    # elif username in ('lzmy9771', 'hgmy2130'):
    #     if count > 5000:
    #         return True
    else:
        if count >= settings.COUNT_LIMITED:
            return True
    return count
def is_limit(self, change=True):
    """
    Check whether the daily search limit has been reached.
    :param change: whether to increment the stored counter
    :return:
    """
    today = datetime2str(datetime.datetime.now(), '%Y-%m-%d')
    key = today + '|' + self.auth_kwargs['username'].encode('utf-8')
    limit = self.h_search_limit.hget(key)
    if not limit:
        self.h_search_limit.hset(key, 0)
        return False
    if int(limit) >= settings.SEARCH_LIMIT:
        self.logger.warning(
            'Account %s has hit the daily cap of %s; no more searches today.'
            % (self.auth_kwargs['username'].encode('utf-8'),
               settings.SEARCH_LIMIT))
        return True
    if change:
        limit = int(limit) + 1
        self.h_search_limit.hset(key, limit)
    return False
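# The hget/hset read-modify-write in is_limited()/is_limit() is racy when
# several workers share one account. A hedged alternative using redis-py's
# atomic HINCRBY; the key name, host, and cap below are illustrative only,
# not values from the source.
import datetime

import redis

r = redis.StrictRedis(host='localhost', port=6379, db=0)
field = 'someuser|' + datetime.datetime.now().strftime('%Y-%m-%d')
count = r.hincrby('h_use_record', field, 1)  # atomic, returns the new value
print(count >= 2000)  # True once the assumed daily cap is reached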
def run(self):
    try:
        email_lst = self.mysql_handle.query_by_sql('''
            SELECT email, password, pop3_host FROM spider.t_email
            WHERE valid=1 AND pop3_host!='imap.exmail.qq.com' ''')
        if email_lst:
            self.logger.info("Loaded %s mailboxes in total." % len(email_lst))
            for index, item in enumerate(email_lst):
                username, password, host = item
                self.logger.info("Processing: %s | %s | %s" % item)
                imap = MailReceiver(host, username, password)
                try:
                    imap.server.login(username, password)
                    self.logger.info("%s: login succeeded <%s>"
                                     % (username, index))
                except Exception as e:
                    # the exception itself is not a string; match on str(e)
                    if "LOGIN Login error" in str(e):
                        self.robot.send_text("Mailbox login failure: %s %s %s"
                                             % (username, password, host))
                    self.logger.exception("%s: login failed %s <%s>"
                                          % (username, e, index))
                    continue
                try:
                    folder_list = imap.folder_list()
                except Exception as e:
                    self.logger.exception("%s: failed to load folders %s"
                                          % (username, e))
                    continue
                if not folder_list:
                    continue
                # the trash folder name differs per provider; these literals
                # are server-side folder names and must stay untranslated
                if "Deleted Messages" in folder_list:  # Tencent
                    delete_folder = u"Deleted Messages"
                elif "已删除邮件" in folder_list:  # Aliyun
                    delete_folder = u"已删除邮件"
                else:  # 163
                    delete_folder = u"已删除"
                before_day = 7
                check_day = 60
                start_date = datetime.datetime.now() - datetime.timedelta(
                    days=before_day)
                end_date = start_date - datetime.timedelta(days=check_day)
                self.logger.info("Processing window: %s to %s"
                                 % (datetime2str(start_date, fmt='%Y-%m-%d'),
                                    datetime2str(end_date, fmt='%Y-%m-%d')))
                # search_ = 'BEFORE %s' % fmt_date
                for day in xrange(check_day):
                    date = start_date - datetime.timedelta(days=day)
                    fmt_date = datetime.datetime.strftime(date, "%d-%b-%Y")
                    search_ = 'ON %s' % fmt_date
                    try:
                        message_list = imap.message_list(
                            mailbox=delete_folder, search_=search_)
                    except Exception as e:
                        self.logger.exception(
                            "%s: failed to load message list %s"
                            % (username, e))
                        continue
                    if not message_list:
                        self.logger.debug("%s: no deleted mail for [%s]"
                                          % (username, search_))
                        continue
                    self.logger.info("%s: [%s] matched %s messages"
                                     % (username, search_, len(message_list)))
                    if imap.delete_message(message_list,
                                           deleted_folder=None) is not True:
                        self.logger.warning("%s deletion failed." % username)
                        continue
                    self.logger.info("%s deletion succeeded." % username)
                self.logger.info('%s mailbox fully processed.' % username)
                try:
                    imap.close()
                except Exception as e:
                    self.logger.exception('imap close error: %s' % e)
                    continue
    except Exception as e:
        self.logger.exception(e)
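# A minimal stdlib sketch of the per-day IMAP search loop in run(): IMAP4
# SEARCH expects dates as DD-Mon-YYYY (%b is locale-dependent, so an English
# locale is assumed). Host, credentials, and folder are placeholders, not
# values from the source.
import datetime
import imaplib

M = imaplib.IMAP4_SSL('imap.example.com')
M.login('user@example.com', 'password')
M.select('Deleted Messages')
day = datetime.datetime.now() - datetime.timedelta(days=7)
criterion = day.strftime('ON %d-%b-%Y')  # e.g. 'ON 01-Sep-2018'
typ, data = M.search(None, criterion)
print(data[0].split())  # message sequence numbers for that day
M.logout()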
def resume_search(self, **search_args):
    self.get_cookie()
    # the stored proxy is a dict literal in a string; eval matches the
    # original behaviour (ast.literal_eval would be the safer choice)
    proxy = eval(self.auth_kwargs['proxy'])
    self.proxies = {
        'http': 'http://%s:%s' % (proxy['ip'], proxy['port']),
        'https': 'https://%s:%s' % (proxy['ip'], proxy['port']),
    }
    today = '|' + datetime2str(datetime.datetime.now(), '%Y-%m-%d')
    flag = False
    while True:
        awake_flow_no = self.h_awake_flow_no.hget('FIVE_ONE')
        if self.h_status.hget(awake_flow_no) == '400':
            self.logger.info("Runner is currently paused. Sleeping 60s.")
            time.sleep(60)
            continue
        if flag is False:
            # scan the first page
            page_html = self.init_search_page()
            resume_list, page_html = self.get_resume_list(
                page_html, action='pagerTopNew$ctl00', **search_args)
            flag = True
        else:
            resume_list, page_html = self.get_resume_list(
                page_html, **search_args)
        if not resume_list:
            raise FiveOneResumeException('resume_list_empty')
        time.sleep(random.randint(1, 5))
        for resume_args in resume_list:
            # is_limited() returns True at the cap, otherwise the count;
            # one call does both jobs
            count = self.is_limited(self.auth_kwargs['username'])
            if count is True:
                raise FiveOneResumeException('user_record_limited')
            # dedupe resumes by id
            resume_id = re.findall(r'(?<=hidUserID=)\d+?(?=&)',
                                   resume_args)[0].encode('utf-8')
            last_search_day = self.h_black_list.hget(resume_id)
            if last_search_day:
                distance = (
                    str2datetime(today.replace('|', ''), '%Y-%m-%d')
                    - str2datetime(last_search_day, '%Y-%m-%d')).days
            else:
                distance = DAY_LIMITED + 1
            if distance < DAY_LIMITED:
                self.logger.warning(
                    'Resume already collected within %s days: %s'
                    % (DAY_LIMITED, resume_id))
                continue
            self.h_black_list.hset(resume_id, today.replace('|', ''))
            resume_detail = self.get_resume_detail(resume_url=resume_args)
            if not resume_detail:
                continue
            resume_uuid = str(uuid.uuid1())
            content = resume_detail.decode('utf-8')
            sql = '''INSERT INTO spider_search.resume_raw
                     (source, content, createBy, trackId, createtime, email,
                      emailJobType, emailCity, subject)
                     VALUES (%s, %s, "python", %s, now(), %s, %s, %s, %s)'''
            sql_value = (self.common_settings.SOURCE, content, resume_uuid,
                         self.auth_kwargs['username'],
                         search_args['keywords'].split('|')[0],
                         search_args['city'].split('|')[0], str(resume_id))
            resume_update_time = ''
            msg_data = {
                "channelType": "WEB",
                "content": {
                    "content": content,
                    "id": '',
                    "createBy": "python",
                    "createTime": int(time.time() * 1000),
                    "ip": '',
                    "resumeSubmitTime": '',
                    "resumeUpdateTime": resume_update_time,
                    "source": self.common_settings.SOURCE,
                    "trackId": str(resume_uuid),
                    "avatarUrl": '',
                    "email": self.auth_kwargs['username'],
                    'emailJobType': search_args['keywords'].split('|')[0],
                    'emailCity': search_args['city'].split('|')[0],
                    'subject': str(resume_id)
                },
                "interfaceType": "PARSE",
                "resourceDataType": "RAW",
                "resourceType": "RESUME_SEARCH_AWAKE",
                "source": self.common_settings.SOURCE,
                "trackId": str(resume_uuid),
                'traceID': str(resume_uuid),
                'callSystemID': self.common_settings.CALL_SYSTEM_ID,
            }
            res = self.save_data(sql=sql, data=sql_value, msg_data=msg_data)
            if res:
                count += 1
                self.h_use_record.hset(
                    self.auth_kwargs['username'] + today, count)
                mysql_ = self.init_mysql(
                    user='******',
                    passwd='bi_admin#@1mofanghr',
                    host='172.16.25.1',
                    # user='******',
                    # passwd='bi_admin#@1mofanghr',
                    # host='10.0.3.52',
                    cursorclass=DictCursor,
                    cls_singleton=False)
                sql = '''INSERT INTO spider.resume_awake_record
                         (source, position, city, raw_id, create_time,
                          username)
                         VALUES ('FIVE_ONE', %s, %s, %s, now(), %s)'''
                value = (search_args['keywords'].split('|')[0],
                         search_args['city'].split('|')[0],
                         res, self.auth_kwargs['username'])
                mysql_.save(sql, value)
            time.sleep(random.randint(5, 7))
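# A standalone sketch of the DAY_LIMITED dedupe shared by the
# resume_search() variants: skip a resume collected fewer than DAY_LIMITED
# days ago. DAY_LIMITED=2 is an assumed illustrative value.
import datetime

DAY_LIMITED = 2


def should_skip(last_search_day, today):
    """Return True when last_search_day falls inside the dedupe window."""
    if not last_search_day:
        return False
    distance = (datetime.datetime.strptime(today, '%Y-%m-%d')
                - datetime.datetime.strptime(last_search_day,
                                             '%Y-%m-%d')).days
    return distance < DAY_LIMITED


print(should_skip('2018-08-31', '2018-09-01'))  # True: collected 1 day ago
print(should_skip('2018-08-20', '2018-09-01'))  # False: outside the window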
def resume_search(self, page, **search_args):
    self.get_cookie()
    # dict literal stored as a string; eval matches the original behaviour
    proxy = eval(self.auth_kwargs['proxy'])
    self.proxies = {
        'http': 'http://%s:%s' % (proxy['ip'], proxy['port']),
        'https': 'https://%s:%s' % (proxy['ip'], proxy['port']),
    }
    today = '|' + datetime2str(datetime.datetime.now(), '%Y-%m-%d')
    resume_list = self.get_resume_list(page=page, **search_args)
    if not resume_list:
        raise ZhiLianResumeException('resume_list_empty')
    for resume_args in resume_list:
        # cap how often this account may open detail pages
        if not self.h_use_record.hget(
                self.auth_kwargs['username'] + today):
            self.h_use_record.hset(self.auth_kwargs['username'] + today, 0)
            count = 0
        else:
            count = int(self.h_use_record.hget(
                self.auth_kwargs['username'] + today))
        if self.check_limit(count=count):
            day = datetime2str(datetime.datetime.now(), '%Y-%m-%d')
            self.h_over_search_limit.hset(
                day + '|' + self.auth_kwargs['username'].encode('utf-8'), 1)
            raise ZhiLianResumeException('user_record_limited')
        # dedupe resumes by id
        try:
            resume_id = str(
                resume_args.get('resumeNo').encode('utf-8')[:10])
        except Exception:
            resume_id = str(resume_args.get('number')[:10])
        last_search_day = self.h_black_list.hget(resume_id)
        if last_search_day:
            distance = (str2datetime(today.replace('|', ''), '%Y-%m-%d')
                        - str2datetime(last_search_day, '%Y-%m-%d')).days
        else:
            distance = DAY_LIMITED + 1
        if distance < DAY_LIMITED:
            self.logger.warning(
                'Resume already collected within %s days: %s'
                % (DAY_LIMITED, resume_id))
            continue
        self.h_black_list.hset(resume_id, today.replace('|', ''))
        resume_detail = self.get_resume_detail(resume_args=resume_args)
        if not resume_detail:
            continue
        if resume_detail.get('resumeSource').encode('utf-8').lower() == \
                'download':
            resource_type = 'RESUME_INBOX'
        else:
            resource_type = 'RESUME_SEARCH'
        content = json.dumps({'name': '', 'email': '', 'phone': '',
                              'html': resume_detail}, ensure_ascii=False)
        data = {
            'ChannelType': 'APP',
            'Source': self.source,
            'ResourceType': resource_type,
            'content': content,
            'accountContent': json.dumps(self.auth_kwargs,
                                         ensure_ascii=False,
                                         cls=JsonCustomEncoder),
            'resumeId': resume_detail.get('resumeNo'),
            'searchData': json.dumps(search_args.get('origin_search_args'),
                                     ensure_ascii=False),
            'code': 200
        }
        self.push_resume(**data)
        time.sleep(random.randint(1, 5))
"excludes": [] }, "aggs": { "2": { "date_histogram": { "field": "@timestamp", "interval": "30m", "time_zone": "Asia/Shanghai", "min_doc_count": 1 } } } } res = es.search(index=index, body=body) hits = res.get('hits').get('hits') today = datetime2str(str2datetime(check_day), fmt='%Y-%m-%d') pools = RedisSet('ZHI_LIAN_AWAKE_FAILED-%s' % today) logger.info("匹配到%s个唤醒失败的简历 『%s - %s』。" % (len(hits), start_time, end_time)) for hit in hits: log_message = hit.get('_source').get('log_message') # 正则匹配id normal_id = re.findall('(?<=id=)\d+(?=\s)', log_message)[0] pools.sadd(normal_id) logger.info('当前集合长度为: %s' % len(pools.scard())) if __name__ == '__main__': check_day = datetime2str(datetime.datetime.now()) main(check_day=check_day)
def execute_awake():
    runner = ResumeFen()
    while True:
        task_id, params = runner.get_task()
        if not task_id:
            runner.push_task()
            continue
        try:
            runner.get_cookie()
            page = 1
            has_next_page = True
            while has_next_page:
                if params.get('model_name') == 'ZHI_LIAN':
                    mode = 'zl'
                    resume_list = runner.get_resume_list_zl(page, **params)
                else:
                    mode = 'lp'
                    resume_list = runner.get_resume_list_lp(page, **params)
                if not resume_list:
                    runner.logger.warning('Resume list empty; switching task.')
                    runner.update_task(task_id=task_id)
                    break
                flag = 0
                for resume in resume_list:
                    resume_id = resume.get('id').encode('utf-8')
                    today = datetime2str(datetime.datetime.now(), '%Y-%m-%d')
                    last_search_day = runner.h_search_back_list.hget(
                        resume_id)
                    if flag > 25:
                        runner.logger.info(
                            'Over 25 already-collected resumes on this '
                            'page; skipping task.')
                        has_next_page = False
                        break
                    # dedupe window controlled by TIME_LIMIT; compare full
                    # dates, since subtracting .day values breaks across
                    # month boundaries
                    if last_search_day:
                        distance = (
                            str2datetime(today, fmt='%Y-%m-%d')
                            - str2datetime(last_search_day,
                                           fmt='%Y-%m-%d')).days
                        if distance <= TIME_LIMIT:
                            runner.logger.warning(
                                'Resume [%s] already collected within '
                                '%s days.' % (resume_id, TIME_LIMIT))
                            flag += 1
                            continue
                    content = runner.get_resume_detail(resume_id=resume_id,
                                                       mode=mode)
                    if not content:
                        continue
                    content = json.dumps(content, ensure_ascii=False)
                    resume_uuid = str(uuid.uuid1())
                    sql = '''INSERT INTO spider_search.resume_raw
                             (source, content, createBy, trackId,
                              createtime, email, emailJobType, emailCity,
                              subject)
                             VALUES (%s, %s, "python", %s, now(), %s, %s,
                                     %s, %s)'''
                    sql_value = (runner.common_settings.SOURCE, content,
                                 resume_uuid, runner.auth_kwargs['username'],
                                 params['job_name'], params['area_name'],
                                 str(resume_id))
                    resume_update_time = ''
                    msg_data = {
                        "channelType": "WEB",
                        "content": {
                            "content": content,
                            "id": '',
                            "createBy": "python",
                            "createTime": int(time.time() * 1000),
                            "ip": '',
                            "resumeSubmitTime": '',
                            "resumeUpdateTime": resume_update_time,
                            "source": runner.common_settings.SOURCE,
                            "trackId": str(resume_uuid),
                            "avatarUrl": '',
                            "email": runner.auth_kwargs['username'],
                            'emailJobType': params['job_name'],
                            'emailCity': params['area_name'],
                            'subject': resume_id
                        },
                        "interfaceType": "PARSE",
                        "resourceDataType": "RAW",
                        "resourceType": "RESUME_SEARCH",
                        "source": runner.common_settings.SOURCE,
                        "trackId": str(resume_uuid),
                        'traceID': str(resume_uuid),
                        'callSystemID': runner.common_settings.CALL_SYSTEM_ID,
                    }
                    res = runner.save_data(sql=sql, data=sql_value,
                                           msg_data=msg_data)
                    if res:
                        # reset the cookie retry counter
                        runner.h_account_limit.hset(
                            runner.auth_kwargs['username'], 0)
                        runner.h_search_back_list.hset(resume_id, today)
                    time.sleep(random.randint(1, 5))
                if len(resume_list) < 30:
                    runner.logger.info('Fewer than 30 resumes on this page; '
                                       'task finished.')
                    has_next_page = False
                page += 1
            runner.update_task(task_id=task_id)
        except MfCookieValidException:
            runner.update_task(task_id=task_id)
            runner.add_task(param=json.dumps(params, ensure_ascii=False))
            runner.logger.warning('Task aborted due to cookie failure; '
                                  're-queued the task!')
        except Exception as e:
            runner.update_task(task_id=task_id, execute_status='FAILURE',
                               execute_result=str(e))
            runner.logger.exception(str(e))
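# Why the original `.day` arithmetic in execute_awake() was replaced with a
# full-date comparison: day-of-month subtraction wraps at month boundaries.
import datetime

last = datetime.date(2018, 8, 31)
today = datetime.date(2018, 9, 1)
print(today.day - last.day)  # -30: nonsense across the month boundary
print((today - last).days)   # 1: the intended distance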
def get_resume_list(self, page=1, is_download=False, **search_args):
    """
    Fetch a page of resume search results.
    Search criteria: keyword / location / age 18-30 / degree /
    updated within the last three days.
    :param page:
    :param search_args:
    :return:
    """
    url = 'https://ihr.zhaopin.com/resumesearch/search.do?' \
          'access_token=%s' % self.access_token
    headers = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Host': 'ihr.zhaopin.com',
        'Origin': 'https://ihr.zhaopin.com',
        'Pragma': 'no-cache',
        'Referer': 'https://ihr.zhaopin.com/resumesearch/search/',
        'User-Agent': self.user_agent,
        'Cookie': self.cookie,
        'X-Requested-With': 'XMLHttpRequest',
    }
    data = {
        'keywords': search_args['keywords'].split('|')[0].encode('utf-8'),
        'startNum': (page - 1) * 30,
        'rowsCount': '30',
        'resumeGrade': '',
        'sortColumnName': 'sortUpDate',
        'sortColumn': 'sortUpDate desc',
        'onlyHasImg': 'false',
        'anyKeyWord': 'false',
        'hopeWorkCity': search_args['city'].split('|')[1].encode('utf-8'),
        'ageStart': search_args.get('age_start', '18'),
        'ageEnd': search_args.get('age_end', '30'),
        'workYears': search_args.get('work_years', ''),
        'liveCity': search_args.get('live_city', ''),
        'sex': search_args.get('sex', ''),
        'edu': search_args.get('degree', '5'),
        'upDate': search_args.get('up_date', ''),  # defaults to last 3 days
        'companyName': search_args.get('company_name', ''),
        'jobType': '',
        'desiredJobType': search_args.get('desired_job_type', ''),
        'industry': search_args.get('industry', ''),
        'desiredIndustry': '',
        'careerStatus': '',
        'desiredSalary': '',
        'langSkill': '',
        'hukouCity': '',
        'major': '',
        'onlyLastWork': 'false',
    }
    if search_args['use_keywords'] is False:
        data['desiredJobType'] = search_args['keywords'].split('|')[1]
        self.logger.info('Searching by job function.')
    else:
        self.logger.info('Searching by keyword.')
    res = self.html_downloader.download(url, method='POST', data=data,
                                        headers=headers,
                                        proxies=self.proxies)
    if res.json().get('code') == 6001:
        self.logger.info(self.logger_prefix + 'cookie expired')
        self.set_cookie_invalid()
        raise MfCookieValidException('cookie_invalid')
    if res.json().get('code') == 808:
        self.logger.warning(self.logger_prefix +
                            res.json().get('message').encode('utf-8'))
        today = datetime2str(datetime.datetime.now(), '%Y-%m-%d')
        # this account has hit its daily quota for the full resume pool
        self.h_over_search_limit.hset(
            today + '|' + self.auth_kwargs['username'].encode('utf-8'), 1)
        global LIMIT_MESSAGE_BOX
        if not LIMIT_MESSAGE_BOX.get(
                self.auth_kwargs['username'].encode('utf-8'), ''):
            LIMIT_MESSAGE_BOX[
                self.auth_kwargs['username'].encode('utf-8')] = 1
            self.robot_login.send_markdown(
                title="Zhilian resume search",
                content="#### Daily keyword search quota reached.\n"
                        "- Account: %s\n"
                        "- Password: %s\n"
                        "- Proxy: %s\n"
                        "- Accounts at quota: %s\n"
                        % (self.auth_kwargs['username'].encode('utf-8'),
                           self.auth_kwargs['password'].encode('utf-8'),
                           self.auth_kwargs['ip'].encode('utf-8') + ':' +
                           self.auth_kwargs['port'].encode('utf-8'),
                           len(LIMIT_MESSAGE_BOX)))
        raise ZhiLianResumeException('user_record_limited')
    try:
        resume_list = res.json().get('results')
        if not resume_list:
            raise ValueError('empty results')
    except Exception as e:
        self.logger.exception('Failed to fetch list: %s | %s'
                              % (str(e), res.content))
        return []
    resume_accept_list = []
    for resume in resume_list:
        if is_download is False:
            # a Monday-specific 5-day window was tried upstream and then
            # disabled; both branches had collapsed to the same 1-day cutoff
            global DAY_LIMITED
            DAY_LIMITED = 2
            limited_day = datetime.datetime.now() - datetime.timedelta(
                days=1)
            if str2datetime(resume.get('modifyDate'),
                            '%Y-%m-%d').date() < limited_day.date():
                self.logger.warning('Hit a resume older than %s days; '
                                    'skipping the rest of the page.'
                                    % DAY_LIMITED)
                break
        resume_accept_list.append(resume)
    self.logger.info('page: %s, fetched %s resumes, criteria [%s, %s]'
                     % (page, len(resume_accept_list),
                        search_args['keywords'].encode('utf-8'),
                        search_args['city'].encode('utf-8')))
    return resume_accept_list
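# Sketch of the early-break filter in get_resume_list(): results arrive
# sorted by update date descending, so once one stale resume appears the
# rest of the page can be skipped wholesale. Dates below are illustrative.
import datetime

cutoff = datetime.date(2018, 8, 31)
resumes = [{'modifyDate': '2018-09-01'}, {'modifyDate': '2018-08-30'},
           {'modifyDate': '2018-08-29'}]
accepted = []
for r in resumes:
    if datetime.datetime.strptime(r['modifyDate'],
                                  '%Y-%m-%d').date() < cutoff:
        break  # sorted desc: everything after this is older still
    accepted.append(r)
print(len(accepted))  # 1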
def resume_search(self, page, **search_args):
    self.get_cookie()
    # dict literal stored as a string; eval matches the original behaviour
    proxy = eval(self.auth_kwargs['proxy'])
    self.proxies = {
        'http': 'http://%s:%s' % (proxy['ip'], proxy['port']),
        'https': 'https://%s:%s' % (proxy['ip'], proxy['port']),
    }
    today = '|' + datetime2str(datetime.datetime.now(), '%Y-%m-%d')
    resume_list = self.get_resume_list(page=page, **search_args)
    if not resume_list:
        raise ZhiLianResumeException('resume_list_empty')
    for resume_args in resume_list:
        # cap how often this account may open detail pages
        if not self.h_use_record.hget(
                self.auth_kwargs['username'] + today):
            self.h_use_record.hset(self.auth_kwargs['username'] + today, 0)
            count = 0
        else:
            count = int(self.h_use_record.hget(
                self.auth_kwargs['username'] + today))
        if self.check_limit(count=count):
            day = datetime2str(datetime.datetime.now(), '%Y-%m-%d')
            self.h_over_search_limit.hset(
                day + '|' + self.auth_kwargs['username'].encode('utf-8'), 1)
            raise ZhiLianResumeException('user_record_limited')
        # dedupe resumes by id
        try:
            resume_id = str(
                resume_args.get('resumeNo').encode('utf-8')[:10])
        except Exception:
            resume_id = str(resume_args.get('number')[:10])
        mysql_1 = self.init_mysql(
            user='******',
            passwd='bi_admin#@1mofanghr',
            host='172.16.25.1',
            # user='******',
            # passwd='bi_admin#@1mofanghr',
            # host='10.0.3.52',
            cursorclass=DictCursor,
            cls_singleton=False)
        sql = '''INSERT INTO spider.resume_awake_record_no_repeat
                 (source, position, city, raw_id, create_time, username)
                 VALUES ('ZHI_LIAN', %s, %s, %s, now(), %s)'''
        value = (search_args['keywords'].split('|')[0],
                 search_args['city'].split('|')[0],
                 resume_id, self.auth_kwargs['username'])
        mysql_1.save(sql, value)
        del mysql_1
        last_search_day = self.h_black_list.hget(resume_id)
        if last_search_day:
            distance = (str2datetime(today.replace('|', ''), '%Y-%m-%d')
                        - str2datetime(last_search_day, '%Y-%m-%d')).days
        else:
            distance = DAY_LIMITED + 1
        if distance < DAY_LIMITED:
            self.logger.warning(
                'Resume already collected within %s days: %s'
                % (DAY_LIMITED, resume_id))
            continue
        self.h_black_list.hset(resume_id, today.replace('|', ''))
        resume_detail = self.get_resume_detail(resume_args=resume_args)
        if not resume_detail:
            continue
        resume_uuid = str(uuid.uuid1())
        content = json.dumps({'name': '', 'email': '', 'phone': '',
                              'html': resume_detail}, ensure_ascii=False)
        sql = '''INSERT INTO spider_search.resume_raw
                 (source, content, createBy, trackId, createtime, email,
                  emailJobType, emailCity, subject)
                 VALUES (%s, %s, "python", %s, now(), %s, %s, %s, %s)'''
        sql_value = (self.common_settings.SOURCE, content, resume_uuid,
                     self.auth_kwargs['username'], search_args['keywords'],
                     search_args['city'],
                     str(resume_detail.get('resumeNo')))
        resume_update_time = ''
        msg_data = {
            "channelType": "APP",
            "content": {
                "content": content,
                "id": '',
                "createBy": "python",
                "createTime": int(time.time() * 1000),
                "ip": '',
                "resumeSubmitTime": '',
                "resumeUpdateTime": resume_update_time,
                "source": self.common_settings.SOURCE,
                "trackId": str(resume_uuid),
                "avatarUrl": '',
                "email": self.auth_kwargs['username'],
                'emailJobType': search_args['keywords'],
                'emailCity': search_args['city'],
                'subject': str(resume_detail.get('resumeNo'))
            },
            "interfaceType": "PARSE",
            "resourceDataType": "RAW",
            "resourceType": "RESUME_SEARCH_AWAKE",
            "source": self.common_settings.SOURCE,
            "trackId": str(resume_uuid),
            'traceID': str(resume_uuid),
            'callSystemID': self.common_settings.CALL_SYSTEM_ID,
        }
        res = self.save_data(sql=sql, data=sql_value, msg_data=msg_data)
        if res:
            count += 1
            self.h_use_record.hset(self.auth_kwargs['username'] + today,
                                   count)
            mysql_ = self.init_mysql(
                user='******',
                passwd='bi_admin#@1mofanghr',
                host='172.16.25.1',
                # user='******',
                # passwd='bi_admin#@1mofanghr',
                # host='10.0.3.52',
                cursorclass=DictCursor,
                cls_singleton=False)
            sql = '''INSERT INTO spider.resume_awake_record
                     (source, position, city, raw_id, create_time, username)
                     VALUES ('ZHI_LIAN', %s, %s, %s, now(), %s)'''
            value = (search_args['keywords'].split('|')[0],
                     search_args['city'].split('|')[0],
                     res, self.auth_kwargs['username'])
            mysql_.save(sql, value)
            del mysql_
        time.sleep(random.randint(3, 5))
def init_search_account(self, use_type='SEARCH_AWAKE'):
    # non-blocking acquire replaces the racy locked()/acquire() pair and
    # skips re-initialisation when another thread already holds the lock
    if not sem.acquire(False):
        return
    try:
        self.q_search_account.clean()
        self.logger.info('Initialising the search-account queue')
        if not self.q_search_account.empty():
            self.logger.info('Queue not empty; %s accounts left'
                             % self.q_search_account.qsize())
            return
        mysql_ = self.init_mysql(
            user='******',
            passwd='nMGZKQIXr4PE8aR2',
            host='rm-2ze15h84ax219xf08.mysql.rds.aliyuncs.com',
            # user='******',
            # passwd='bi_admin#@1mofanghr',
            # host='10.0.3.52',
            cursorclass=DictCursor,
            cls_singleton=False)
        account_list = mysql_.query_by_sql("""
            SELECT a.* FROM autojob_v2.t_account a
            WHERE EXISTS(
                SELECT 1 FROM autojob_v2.t_account_use_type b
                WHERE a.id = b.accountId AND b.useType = '%s'
                    AND b.valid = 1 AND a.valid = 1
                    AND a.status = 1 AND a.source = 'ZHI_LIAN')
            ORDER BY a.id DESC;
        """ % use_type)
        self.logger.info('Matched %s valid accounts' % len(account_list))
        effective_account_list = []
        for account in account_list:
            today = datetime2str(datetime.datetime.now(), '%Y-%m-%d')
            global RUN_STATUS
            if RUN_STATUS.get(account['username'].encode('utf-8')):
                self.logger.warning('Account %s is currently running.'
                                    % account['username'].encode('utf-8'))
                continue
            if self.h_over_search_limit.hget(
                    today + '|' + account['username'].encode('utf-8')):
                self.logger.warning(
                    'Account %s has hit the daily cap (%s); skipping '
                    'searches today.'
                    % (account['username'].encode('utf-8'),
                       self.h_use_record.hget(
                           account['username'].encode('utf-8')
                           + '|' + today)))
                continue
            self.q_search_account.put(account)
            effective_account_list.append(account)
        self.logger.info('Search-account queue initialised: %s'
                         % len(effective_account_list))
    except Exception as e:
        self.logger.exception('Failed to initialise account queue: '
                              + str(e))
    finally:
        # release in finally so the early return above cannot leak the lock
        sem.release()
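# The acquire/release pairing fixed above, as an isolated pattern: a
# non-blocking acquire avoids the locked()-then-acquire() race, and the
# finally clause guarantees release even on early returns. Pure-stdlib
# sketch; the work inside is a placeholder.
import threading

sem = threading.Lock()


def guarded_init():
    if not sem.acquire(False):  # someone else is initialising; skip
        return
    try:
        pass  # ... rebuild the account queue here ...
    finally:
        sem.release()


guarded_init()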