Example #1
def main(check_day):
    es = Elasticsearch(hosts='172.16.25.9')
    index = 'morgan-v3-%s' \
            % (datetime2str(str2datetime(check_day), fmt='%Y.%m.%d'))
    start_time = int(
        time.mktime((str2datetime(check_day) -
                     datetime.timedelta(hours=1)).timetuple())) * 1000 - 1
    end_time = int(time.mktime((str2datetime(check_day)).timetuple())) * 1000

    body = {
        "version": True,
        "size": 10000,  # 用于控制输出数量
        "query": {
            "bool": {
                "must": [{
                    "query_string": {
                        "query": "log_message:\"判断是否需要下载 id=\"",
                        "analyze_wildcard": True
                    }
                }, {
                    "range": {
                        "@timestamp": {
                            "gte": start_time,
                            "lte": end_time,
                            "format": "epoch_millis"
                        }
                    }
                }],
                "must_not": []
            }
        },
        "_source": {
            "excludes": []
        },
        "aggs": {
            "2": {
                "date_histogram": {
                    "field": "@timestamp",
                    "interval": "30m",
                    "time_zone": "Asia/Shanghai",
                    "min_doc_count": 1
                }
            }
        }
    }
    res = es.search(index=index, body=body)
    hits = res.get('hits').get('hits')
    today = datetime2str(str2datetime(check_day), fmt='%Y-%m-%d')
    pools = RedisSet('ZHI_LIAN_AWAKE_FAILED-%s' % today)

    logger.info("匹配到%s个唤醒失败的简历 『%s - %s』。" % (len(hits), start_time, end_time))
    for hit in hits:
        log_message = hit.get('_source').get('log_message')
        # extract the resume id with a regex
        normal_id = re.findall(r'(?<=id=)\d+(?=\s)', log_message)[0]
        pools.sadd(normal_id)
    logger.info('当前集合长度为: %s' % pools.scard())
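This example (and most of those below) leans on two helpers, str2datetime and datetime2str, that are not shown anywhere in the listing. A minimal sketch of what they presumably look like, assuming they are thin wrappers around datetime.strptime/strftime with a shared default format (the project's actual default may differ):

import datetime

DEFAULT_FMT = '%Y-%m-%d %H:%M:%S'  # assumed default; the project may use another


def datetime2str(dt, fmt=DEFAULT_FMT):
    # format a datetime as a string, e.g. datetime2str(now, fmt='%Y.%m.%d')
    return dt.strftime(fmt)


def str2datetime(s, fmt=DEFAULT_FMT):
    # parse a date string back into a datetime
    return datetime.datetime.strptime(s, fmt)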
Example #2
    def get_resume_list(self, page=1, **params):
        self.logger.info('开始执行搜索任务, USER: {} PAGE: {} {}'.format(
            self.auth_kwargs['username'].encode('utf-8'), page,
            json.dumps(params, ensure_ascii=False).encode('utf-8')))
        end = datetime.datetime.now() + datetime.timedelta(days=1)
        start = end - datetime.timedelta(days=settings.SEARCH_DAY)
        post_date = datetime2str(start, fmt='%Y%m%d') + '000000_' \
                    + datetime2str(end, fmt='%Y%m%d') + '000000'
        url = '{}pn{}/pve_5593_{}/?key={}&age=18_30&postdate={}'.format(
            params.get('city_url'), page, params.get('degree', '4'),
            params.get('keyword').encode('utf-8'), post_date)
        # print(url)
        headers = {
            'accept':
            'text/html,application/xhtml+xml,application/xml;'
            'q=0.9,image/webp,image/apng,*/*;q=0.8',
            'accept-encoding':
            'gzip, deflate, br',
            'accept-language':
            'zh-CN,zh;q=0.9',
            'cache-control':
            'no-cache',
            'cookie':
            self.cookie,
            'pragma':
            'no-cache',
            'upgrade-insecure-requests':
            '1',
            'user-agent':
            'Mozilla/5.0 (X11; Linux x86_64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/68.0.3440.84 Safari/537.36'
        }
        res = self.html_downloader.download(url,
                                            headers=headers,
                                            proxies=self.proxies)
        if 'passport.58.com' in res.url:
            raise MfCookieValidException('cookie invalid. {}'.format(
                self.auth_kwargs['username'].encode('utf-8')))
        real_html = self.get_real_html(res.content)
        soups = self.html_parser.parser(real_html).find(
            'div', id='infolist').find_all('dl')

        has_next = True if self.html_parser.parser(real_html).find(
            'div', class_='pagerout').find('a', class_='next') else False

        url_lst = []
        for soup in soups:
            url = soup.find('dt').find('a').get('href')
            resume_id = self.get_resume_id(url)
            if self.do_filter(resume_id):
                self.logger.info('简历: {}, {}天内已被采集过'.format(
                    resume_id, settings.DELAY_DAY))
                continue
            url_lst.append(url)
        time.sleep(random.randint(1, 2))
        return has_next, url_lst[:-1]
Example #3
    def is_limited(self, username):
        # limit how many times an account may open resume detail pages per day
        today = '|' + datetime2str(datetime.datetime.now(), '%Y-%m-%d')

        if not self.h_use_record.hget(username + today):
            self.h_use_record.hset(username + today, 0)
            count = 0
        else:
            count = int(self.h_use_record.hget(username + today))

        if username in ('E4gl36307', '18518261507', 'bjlb7610'):
            # if count >= 2000:
            if count >= settings.COUNT_LIMITED:
                return True

        # elif username in ('lzmy9771', 'hgmy2130'):
        #     if count > 5000:
        #         return True

        else:
            if count >= settings.COUNT_LIMITED:
                return True
        return count
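Note that is_limited returns True once the cap is reached, but otherwise returns the current count rather than False. Callers rely on that double meaning by invoking it twice, as the resume_search loop in Example #6 below does (condensed):

if self.is_limited(self.auth_kwargs['username']) is True:
    raise FiveOneResumeException('user_record_limited')

# a second call yields the numeric counter for this account
count = self.is_limited(self.auth_kwargs['username'])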
Example #4
    def is_limit(self, change=True):
        """
        判断是否达到上限
        :param change: 用于标识是否变更limit
        :return:
        """
        today = datetime2str(datetime.datetime.now(), '%Y-%m-%d')
        key = today + '|' + self.auth_kwargs['username'].encode('utf-8')
        limit = self.h_search_limit.hget(key)

        if not limit:
            self.h_search_limit.hset(key, 0)
            return False

        if int(limit) >= settings.SEARCH_LIMIT:
            self.logger.warning('当前帐号 %s 已达上限 %s,当天不再进行搜索.' %
                                (self.auth_kwargs['username'].encode('utf-8'),
                                 settings.SEARCH_LIMIT))
            return True
        if change:
            limit = int(limit)
            limit += 1
            self.h_search_limit.hset(key, limit)
        return False
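The h_search_limit, h_use_record, h_black_list and h_over_search_limit objects, as well as the RedisSet used in Example #1, are project-specific Redis wrappers that only expose hget/hset and sadd/scard. A rough sketch of how they could sit on top of redis-py; the class names mirror the usage above, but the connection settings and internal layout are assumptions:

import redis


class RedisHash(object):
    # hypothetical wrapper: one Redis hash per logical record
    def __init__(self, name, host='localhost', port=6379, db=0):
        self.name = name
        self.conn = redis.StrictRedis(host=host, port=port, db=db)

    def hget(self, field):
        return self.conn.hget(self.name, field)

    def hset(self, field, value):
        return self.conn.hset(self.name, field, value)


class RedisSet(object):
    # hypothetical wrapper: a single Redis set, as used for ZHI_LIAN_AWAKE_FAILED
    def __init__(self, name, host='localhost', port=6379, db=0):
        self.name = name
        self.conn = redis.StrictRedis(host=host, port=port, db=db)

    def sadd(self, member):
        return self.conn.sadd(self.name, member)

    def scard(self):
        # scard() already returns an integer, hence pools.scard() is logged directly above
        return self.conn.scard(self.name)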
Example #5
    def run(self):
        try:
            email_lst = self.mysql_handle.query_by_sql('''
                select email, password, pop3_host from spider.t_email 
                WHERE valid=1 and pop3_host!='imap.exmail.qq.com'
                ''')

            if email_lst:
                self.logger.info("总计加载邮箱%s个." % len(email_lst))
            for index, item in enumerate(email_lst):
                username, password, host = item
                self.logger.info("开始处理: %s | %s | %s" % item)
                imap = MailReceiver(host, username, password)
                try:
                    imap.server.login(username, password)
                    self.logger.info("%s: 登录成功 <%s>" % (username, index))
                except Exception as e:
                    if "LOGIN Login error" in e:
                        self.robot.send_text("邮箱登录异常: %s %s %s" %
                                             (username, password, host))
                    self.logger.exception("%s: 登录失败 %s <%s>" %
                                          (username, e, index))
                    continue

                try:
                    folder_list = imap.folder_list()
                except Exception as e:
                    self.logger.exception("%s: 邮件箱加载失败 %s" % (username, e))
                    continue

                if not folder_list:
                    continue
                # print(folder_list)
                if "Deleted Messages" in folder_list:
                    # Tencent (QQ Exmail)
                    delete_folder = u"Deleted Messages"
                elif "已删除邮件" in folder_list:
                    # Aliyun
                    delete_folder = u"已删除邮件"
                else:
                    # NetEase 163
                    delete_folder = u"已删除"

                # print delete_folder

                before_day = 7
                check_day = 60

                start_date = datetime.datetime.now() - datetime.timedelta(
                    days=before_day)
                # date = datetime.datetime.now()
                end_date = start_date - datetime.timedelta(days=check_day)

                self.logger.info("当前处理时间段为: %s 到 %s" %
                                 (datetime2str(start_date, fmt='%Y-%m-%d'),
                                  datetime2str(end_date, fmt='%Y-%m-%d')))

                # search_ = 'BEFORE %s' % fmt_date
                for day in xrange(check_day):
                    date = start_date - datetime.timedelta(days=day)
                    fmt_date = datetime.datetime.strftime(date, "%d-%b-%Y")

                    search_ = 'ON %s' % fmt_date

                    try:
                        message_list = imap.message_list(mailbox=delete_folder,
                                                         search_=search_)
                    except Exception as e:
                        self.logger.exception("%s: 邮件list加载失败 %s" %
                                              (username, e))
                        continue

                    if not message_list:
                        self.logger.debug("%s: 没有 [%s] 已删除的邮件" %
                                          (username, search_))
                        continue

                    self.logger.info("%s: [%s] 匹配到邮件%s个" %
                                     (username, search_, len(message_list)))

                    if imap.delete_message(message_list,
                                           deleted_folder=None) is not True:
                        self.logger.warning("%s 删除失败." % username)
                        continue
                    self.logger.info("%s 删除成功." % username)

                self.logger.info('%s 邮箱处理完毕.' % username)
                try:
                    imap.close()
                except Exception as e:
                    self.logger.exception('imap close 异常: %s ' % e)
                    continue

        except Exception as e:
            self.logger.exception(e)
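MailReceiver is another project-internal helper; Example #5 only touches its server attribute plus folder_list, message_list, delete_message and close. A hedged sketch of how those calls could map onto the standard imaplib module (non-ASCII mailbox names such as "已删除邮件" would additionally need IMAP modified UTF-7 encoding, which is skipped here):

import imaplib


class MailReceiver(object):
    # hypothetical reconstruction; the real class is not shown in the listing
    def __init__(self, host, username, password):
        # Example #5 calls imap.server.login(...) itself, so the constructor
        # only opens the SSL connection
        self.server = imaplib.IMAP4_SSL(host)
        self.username = username
        self.password = password

    def folder_list(self):
        typ, data = self.server.list()
        # responses typically look like: (\HasNoChildren) "/" "INBOX"
        return [line.split(' "/" ')[-1].strip('"') for line in data if line]

    def message_list(self, mailbox='INBOX', search_='ALL'):
        self.server.select(mailbox)
        typ, data = self.server.search(None, '(%s)' % search_)
        return data[0].split() if data and data[0] else []

    def delete_message(self, message_list, deleted_folder=None):
        # flag the messages as deleted and purge them from the current mailbox
        for num in message_list:
            self.server.store(num, '+FLAGS', '\\Deleted')
        self.server.expunge()
        return True

    def close(self):
        try:
            self.server.close()  # only valid while a mailbox is selected
        finally:
            self.server.logout()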
Example #6
    def resume_search(self, **search_args):
        self.get_cookie()
        ip = eval(self.auth_kwargs['proxy'])['ip']
        port = eval(self.auth_kwargs['proxy'])['port']

        self.proxies = {
            'http': 'http://%s:%s' % (ip, port),
            'https': 'https://%s:%s' % (ip, port),
        }
        today = '|' + datetime2str(datetime.datetime.now(), '%Y-%m-%d')

        # print(search_args)

        flag = False
        while True:

            awake_flow_no = self.h_awake_flow_no.hget('FIVE_ONE')
            if self.h_status.hget(awake_flow_no) == '400':
                self.logger.info("程序当前处于暂停状态.sleep 60s")
                time.sleep(60)
                continue

            if flag is False:
                # scan the first page
                page_html = self.init_search_page()
                resume_list, page_html = self.get_resume_list(
                    page_html, action='pagerTopNew$ctl00', **search_args)
                flag = True
            else:
                resume_list, page_html = self.get_resume_list(
                    page_html, **search_args)

            if not resume_list:
                raise FiveOneResumeException('resume_list_empty')

            time.sleep(random.randint(1, 5))
            for resume_args in resume_list:

                if self.is_limited(self.auth_kwargs['username']) is True:
                    raise FiveOneResumeException('user_record_limited')

                count = self.is_limited(self.auth_kwargs['username'])

                # resume de-duplication
                resume_id = re.findall('''(?<=hidUserID=)\d+?(?=&)''',
                                       resume_args)[0].encode('utf-8')

                last_search_day = self.h_black_list.hget(resume_id)
                if last_search_day:
                    distance = (
                        str2datetime(today.replace('|', ''), '%Y-%m-%d') -
                        str2datetime(last_search_day, '%Y-%m-%d')).days
                else:
                    distance = DAY_LIMITED + 1
                if distance < DAY_LIMITED:
                    self.logger.warning('该简历%s天内已经被采集过: %s' %
                                        (DAY_LIMITED, resume_id))
                    continue
                self.h_black_list.hset(resume_id, today.replace('|', ''))
                resume_detail = self.get_resume_detail(resume_url=resume_args)
                if not resume_detail:
                    continue
                resume_uuid = str(uuid.uuid1())
                # content_origin = {'name': '', 'email': '', 'phone': '',
                #                   'html': resume_detail.decode('utf-8')}
                # content = json.dumps(content_origin, ensure_ascii=False)

                content = resume_detail.decode('utf-8')

                sql = '''INSERT INTO spider_search.resume_raw (source, content, 
                createBy, 
                trackId, createtime, email, emailJobType, emailCity, subject) VALUES 
                (%s, %s, "python", %s, now(), %s, %s, %s, %s)'''
                sql_value = (self.common_settings.SOURCE, content, resume_uuid,
                             self.auth_kwargs['username'],
                             search_args['keywords'].split('|')[0],
                             search_args['city'].split('|')[0], str(resume_id))

                resume_update_time = ''
                msg_data = {
                    "channelType": "WEB",
                    "content": {
                        "content": content,
                        "id": '',
                        "createBy": "python",
                        "createTime": int(time.time() * 1000),
                        "ip": '',
                        "resumeSubmitTime": '',
                        "resumeUpdateTime": resume_update_time,
                        "source": self.common_settings.SOURCE,
                        "trackId": str(resume_uuid),
                        "avatarUrl": '',
                        "email": self.auth_kwargs['username'],
                        'emailJobType': search_args['keywords'].split('|')[0],
                        'emailCity': search_args['city'].split('|')[0],
                        'subject': str(resume_id)
                    },
                    "interfaceType": "PARSE",
                    "resourceDataType": "RAW",
                    "resourceType": "RESUME_SEARCH_AWAKE",
                    "source": self.common_settings.SOURCE,
                    "trackId": str(resume_uuid),
                    'traceID': str(resume_uuid),
                    'callSystemID': self.common_settings.CALL_SYSTEM_ID,
                }
                # self.mysql_handler.save(sql=sql, data=sql_value)
                res = self.save_data(sql=sql,
                                     data=sql_value,
                                     msg_data=msg_data)

                if res:
                    count += 1
                    self.h_use_record.hset(
                        self.auth_kwargs['username'] + today, count)
                    mysql_ = self.init_mysql(
                        user='******',
                        passwd='bi_admin#@1mofanghr',
                        host='172.16.25.1',
                        # user='******',
                        # passwd='bi_admin#@1mofanghr',
                        # host='10.0.3.52',
                        cursorclass=DictCursor,
                        cls_singleton=False)
                    sql = '''
                    INSERT INTO spider.resume_awake_record (source, position, city,
                     raw_id, create_time, username) VALUES ('FIVE_ONE', %s, 
                     %s, %s, now(), %s)
                    '''
                    value = (search_args['keywords'].split('|')[0],
                             search_args['city'].split('|')[0], res,
                             self.auth_kwargs['username'])
                    mysql_.save(sql, value)

                time.sleep(random.randint(5, 7))
Example #7
    def resume_search(self, page, **search_args):
        self.get_cookie()
        ip = eval(self.auth_kwargs['proxy'])['ip']
        port = eval(self.auth_kwargs['proxy'])['port']

        self.proxies = {
            'http': 'http://%s:%s' % (ip, port),
            'https': 'https://%s:%s' % (ip, port),
        }
        today = '|' + datetime2str(datetime.datetime.now(), '%Y-%m-%d')

        # print(search_args)
        resume_list = self.get_resume_list(page=page, **search_args)

        if not resume_list:
            raise ZhiLianResumeException('resume_list_empty')

        for resume_args in resume_list:
            # limit how many times the account may open resume detail pages
            if not self.h_use_record.hget(
                    self.auth_kwargs['username'] + today):
                self.h_use_record.hset(self.auth_kwargs['username'] + today, 0)
                count = 0
            else:
                count = int(
                    self.h_use_record.hget(
                        self.auth_kwargs['username'] + today))

            if self.check_limit(count=count):
                today = datetime2str(datetime.datetime.now(), '%Y-%m-%d')
                self.h_over_search_limit.hset(today + '|' + self.auth_kwargs[
                    'username'].encode('utf-8'), 1)
                raise ZhiLianResumeException('user_record_limited')

            # resume de-duplication
            try:
                resume_id = str(resume_args.get('resumeNo').encode('utf-8')[
                                :10])
            except Exception:
                resume_id = str(resume_args.get('number')[:10])
            last_search_day = self.h_black_list.hget(resume_id)
            if last_search_day:
                distance = (str2datetime(today.replace('|', ''), '%Y-%m-%d')
                            - str2datetime(last_search_day, '%Y-%m-%d')).days
            else:
                distance = DAY_LIMITED + 1
            if distance < DAY_LIMITED:
                self.logger.warning('该简历%s天内已经被采集过: %s'
                                    % (DAY_LIMITED, resume_id))
                continue
            self.h_black_list.hset(resume_id, today.replace('|', ''))
            resume_detail = self.get_resume_detail(
                resume_args=resume_args)

            if not resume_detail:
                continue

            if resume_detail.get('resumeSource').encode('utf-8').lower() == \
                    'download':
                resource_type = 'RESUME_INBOX'
            else:
                resource_type = 'RESUME_SEARCH'

            content = json.dumps({'name': '', 'email': '', 'phone': '',
                                  'html': resume_detail},
                                 ensure_ascii=False)
            data = {
                'ChannelType': 'APP',
                'Source': self.source,
                'ResourceType': resource_type,
                'content': content,
                'accountContent': json.dumps(self.auth_kwargs,
                                             ensure_ascii=False,
                                             cls=JsonCustomEncoder),
                'resumeId': resume_detail.get('resumeNo'),
                'searchData': json.dumps(
                    search_args.get('origin_search_args'), ensure_ascii=False),
                'code': 200
            }
            self.push_resume(**data)
            time.sleep(random.randint(1, 5))
Example #8
            "excludes": []
        },
        "aggs": {
            "2": {
                "date_histogram": {
                    "field": "@timestamp",
                    "interval": "30m",
                    "time_zone": "Asia/Shanghai",
                    "min_doc_count": 1
                }
            }
        }
    }
    res = es.search(index=index, body=body)
    hits = res.get('hits').get('hits')
    today = datetime2str(str2datetime(check_day), fmt='%Y-%m-%d')
    pools = RedisSet('ZHI_LIAN_AWAKE_FAILED-%s' % today)

    logger.info("匹配到%s个唤醒失败的简历 『%s - %s』。" % (len(hits), start_time, end_time))
    for hit in hits:
        log_message = hit.get('_source').get('log_message')
        # extract the resume id with a regex
        normal_id = re.findall(r'(?<=id=)\d+(?=\s)', log_message)[0]
        pools.sadd(normal_id)
    logger.info('当前集合长度为: %s' % pools.scard())


if __name__ == '__main__':
    check_day = datetime2str(datetime.datetime.now())
    main(check_day=check_day)
Example #9
def execute_awake():
    runner = ResumeFen()
    while True:
        task_id, params = runner.get_task()
        if not task_id:
            runner.push_task()
            continue

        try:
            runner.get_cookie()
            page = 1
            has_next_page = True
            while has_next_page:
                if params.get('model_name') == 'ZHI_LIAN':
                    mode = 'zl'
                    resume_list = runner.get_resume_list_zl(page, **params)
                else:
                    mode = 'lp'
                    resume_list = runner.get_resume_list_lp(page, **params)

                if not resume_list:
                    runner.logger.warning('简历列表为空,开始切换任务.')
                    runner.update_task(task_id=task_id)
                    break

                flag = 0
                for resume in resume_list:
                    resume_id = resume.get('id').encode('utf-8')
                    today = datetime2str(datetime.datetime.now(), '%Y-%m-%d')
                    last_search_day = runner.h_search_back_list.hget(resume_id)

                    if flag > 25:
                        runner.logger.info('当前页存在超过25个已采集简历,跳过任务.')
                        has_next_page = False
                        break

                    # de-duplication window of TIME_LIMIT days
                    if last_search_day:
                        days_since = (str2datetime(today, fmt='%Y-%m-%d') -
                                      str2datetime(last_search_day,
                                                   fmt='%Y-%m-%d')).days

                        if days_since <= TIME_LIMIT:
                            runner.logger.warning('该简历[%s] %s天内已采集过.' %
                                                  (resume_id, TIME_LIMIT))
                            flag += 1
                            continue

                    content = runner.get_resume_detail(resume_id=resume_id,
                                                       mode=mode)

                    if not content:
                        continue

                    content = json.dumps(content, ensure_ascii=False)

                    resume_uuid = str(uuid.uuid1())

                    sql = '''insert into spider_search.resume_raw (source, content, 
                                createBy, 
                                trackId, createtime, email, emailJobType, emailCity, subject) values 
                                (%s, %s, "python", %s, now(), %s, %s, %s, %s)'''
                    sql_value = (runner.common_settings.SOURCE, content,
                                 resume_uuid, runner.auth_kwargs['username'],
                                 params['job_name'], params['area_name'],
                                 str(resume_id))

                    resume_update_time = ''
                    msg_data = {
                        "channelType": "WEB",
                        "content": {
                            "content": content,
                            "id": '',
                            "createBy": "python",
                            "createTime": int(time.time() * 1000),
                            "ip": '',
                            "resumeSubmitTime": '',
                            "resumeUpdateTime": resume_update_time,
                            "source": runner.common_settings.SOURCE,
                            "trackId": str(resume_uuid),
                            "avatarUrl": '',
                            "email": runner.auth_kwargs['username'],
                            'emailJobType': params['job_name'],
                            'emailCity': params['area_name'],
                            'subject': resume_id
                        },
                        "interfaceType": "PARSE",
                        "resourceDataType": "RAW",
                        "resourceType": "RESUME_SEARCH",
                        "source": runner.common_settings.SOURCE,
                        "trackId": str(resume_uuid),
                        'traceID': str(resume_uuid),
                        'callSystemID': runner.common_settings.CALL_SYSTEM_ID,
                    }
                    # self.mysql_handler.save(sql=sql, data=sql_value)
                    res = runner.save_data(sql=sql,
                                           data=sql_value,
                                           msg_data=msg_data)
                    if res:
                        # reset the cookie retry counter
                        runner.h_account_limit.hset(
                            runner.auth_kwargs['username'], 0)
                        runner.h_search_back_list.hset(resume_id, today)
                    time.sleep(random.randint(1, 5))
                if len(resume_list) < 30:
                    runner.logger.info('当前页简历小于30,任务结束。')
                    has_next_page = False

                page += 1
                runner.update_task(task_id=task_id)

        except MfCookieValidException:
            runner.update_task(task_id=task_id)
            runner.add_task(param=json.dumps(params, ensure_ascii=False))
            runner.logger.warning('因Cookie失败导致任务退出,重新添加任务!')

        except Exception as e:
            runner.update_task(task_id=task_id,
                               execute_status='FAILURE',
                               execute_result=str(e))
            runner.logger.exception(str(e))
Example #10
    def get_resume_list(self, page=1, is_download=False, **search_args):
        """
        获取简历列表页
        搜索条件: 关键词/所在地/年龄20-35/学历/最近三天upDate
        :param page:
        :param search_args:
        :return:
        """
        url = 'https://ihr.zhaopin.com/resumesearch/search.do?' \
              'access_token=%s' % self.access_token
        headers = {
            'Accept': '*/*',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Host': 'ihr.zhaopin.com',
            'Origin': 'https://ihr.zhaopin.com',
            'Pragma': 'no-cache',
            'Referer': 'https://ihr.zhaopin.com/resumesearch/search/',
            'User-Agent': self.user_agent,
            'Cookie': self.cookie,
            'X-Requested-With': 'XMLHttpRequest',
        }

        data = {
            'keywords': search_args['keywords'].split('|')[0].encode('utf-8'),
            'startNum': (page - 1) * 30,
            'rowsCount': '30',
            'resumeGrade': '',
            'sortColumnName': 'sortUpDate',
            'sortColumn': 'sortUpDate desc',
            'onlyHasImg': 'false',
            'anyKeyWord': 'false',
            'hopeWorkCity': search_args['city'].split('|')[1].encode('utf-8'),
            'ageStart': search_args.get('age_start', '18'),
            'ageEnd': search_args.get('age_end', '30'),
            'workYears': search_args.get('work_years', ''),
            'liveCity': search_args.get('live_city', ''),
            'sex': search_args.get('sex', ''),
            'edu': search_args.get('degree', '5'),
            'upDate': search_args.get('up_date', ''),  # defaults to resumes updated in the last three days
            'companyName': search_args.get('company_name', ''),
            'jobType': '',
            'desiredJobType': search_args.get('desired_job_type', ''),
            'industry': search_args.get('industry', ''),
            'desiredIndustry': '',
            'careerStatus': '',
            'desiredSalary': '',
            'langSkill': '',
            'hukouCity': '',
            'major': '',
            'onlyLastWork': 'false',
        }
        # print(json.dumps(data, ensure_ascii=False, indent=4))
        if search_args['use_keywords'] is False:
            data['desiredJobType'] = search_args['keywords'].split('|')[1]
            self.logger.info('采用职能进行搜索.')
        else:
            self.logger.info('采用关键词进行搜索')

        res = self.html_downloader.download(url,
                                            method='POST',
                                            data=data,
                                            headers=headers,
                                            proxies=self.proxies)
        # self.logger.info('搜索返回 %s' % res.json())
        if res.json().get('code') == 6001:
            self.logger.info(self.logger_prefix + 'cookie失效了')
            self.set_cookie_invalid()
            raise MfCookieValidException('cookie_invalid')

        if res.json().get('code') == 808:
            self.logger.warning(self.logger_prefix +
                                res.json().get('message').encode('utf-8'))
            today = datetime2str(datetime.datetime.now(), '%Y-%m-%d')
            self.h_over_search_limit.hset(
                today + '|' + self.auth_kwargs['username'].encode('utf-8'), 1)
            # the daily quota for searching the full resume database has been reached

            global LIMIT_MESSAGE_BOX
            if not LIMIT_MESSAGE_BOX.get(
                    self.auth_kwargs['username'].encode('utf-8'), ''):
                LIMIT_MESSAGE_BOX[self.auth_kwargs['username'].encode(
                    'utf-8')] = 1
                self.robot_login.send_markdown(
                    title="智联简历搜索",
                    content="#### 智联简历当日关键词搜索量已达上限.\n"
                    "- 帐号: %s\n"
                    "- 密码: %s\n"
                    "- 代理: %s\n"
                    "- 达到上限账号总数: %s\n" %
                    (self.auth_kwargs['username'].encode('utf-8'),
                     self.auth_kwargs['password'].encode('utf-8'),
                     self.auth_kwargs['ip'].encode('utf-8') + ':' +
                     self.auth_kwargs['port'].encode('utf-8'),
                     len(LIMIT_MESSAGE_BOX)))

            raise ZhiLianResumeException('user_record_limited')

        try:
            resume_list = res.json().get('results')
            if not resume_list:
                raise Exception
        except Exception as e:
            self.logger.exception('获取list失败: %s | %s' % (str(e), res.content))
            return []

        resume_accept_list = []
        for resume in resume_list:
            if is_download is False:
                # only accept resumes modified within the last DAY_LIMITED days
                # (an earlier, commented-out revision used a 5-day window on Mondays)
                global DAY_LIMITED
                DAY_LIMITED = 2
                limited_day = datetime.datetime.now() - datetime.timedelta(
                    days=1)

                if str2datetime(resume.get('modifyDate'),
                                '%Y-%m-%d').date() < limited_day.date():
                    self.logger.warning('匹配到%s天前的简历,执行跳过操作.' % DAY_LIMITED)
                    break
            resume_accept_list.append(resume)

        self.logger.info('page: %s, 总计获取到简历%s份, 搜索条件[%s, %s]' %
                         (page, len(resume_accept_list),
                          search_args['keywords'].encode('utf-8'),
                          search_args['city'].encode('utf-8')))
        return resume_accept_list
Example #11
    def resume_search(self, page, **search_args):
        self.get_cookie()
        ip = eval(self.auth_kwargs['proxy'])['ip']
        port = eval(self.auth_kwargs['proxy'])['port']

        self.proxies = {
            'http': 'http://%s:%s' % (ip, port),
            'https': 'https://%s:%s' % (ip, port),
        }
        today = '|' + datetime2str(datetime.datetime.now(), '%Y-%m-%d')

        # print(search_args)
        resume_list = self.get_resume_list(page=page, **search_args)

        if not resume_list:
            raise ZhiLianResumeException('resume_list_empty')

        for resume_args in resume_list:
            # limit how many times the account may open resume detail pages
            if not self.h_use_record.hget(self.auth_kwargs['username'] +
                                          today):
                self.h_use_record.hset(self.auth_kwargs['username'] + today, 0)
                count = 0
            else:
                count = int(
                    self.h_use_record.hget(self.auth_kwargs['username'] +
                                           today))

            if self.check_limit(count=count):
                today = datetime2str(datetime.datetime.now(), '%Y-%m-%d')
                self.h_over_search_limit.hset(
                    today + '|' + self.auth_kwargs['username'].encode('utf-8'),
                    1)
                raise ZhiLianResumeException('user_record_limited')

            # resume de-duplication
            try:
                resume_id = str(
                    resume_args.get('resumeNo').encode('utf-8')[:10])
            except Exception:
                resume_id = str(resume_args.get('number')[:10])

            mysql_1 = self.init_mysql(
                user='******',
                passwd='bi_admin#@1mofanghr',
                host='172.16.25.1',
                # user='******',
                # passwd='bi_admin#@1mofanghr',
                # host='10.0.3.52',
                cursorclass=DictCursor,
                cls_singleton=False)
            sql = '''
                 insert into spider.resume_awake_record_no_repeat (source, 
                 position, 
                 city,
                  raw_id, create_time, username) VALUES ('ZHI_LIAN', %s, 
                  %s, %s, now(), %s)
                 '''
            value = (search_args['keywords'].split('|')[0],
                     search_args['city'].split('|')[0], resume_id,
                     self.auth_kwargs['username'])
            mysql_1.save(sql, value)
            del mysql_1

            last_search_day = self.h_black_list.hget(resume_id)
            if last_search_day:
                distance = (str2datetime(today.replace('|', ''), '%Y-%m-%d') -
                            str2datetime(last_search_day, '%Y-%m-%d')).days
            else:
                distance = DAY_LIMITED + 1
            if distance < DAY_LIMITED:
                self.logger.warning('该简历%s天内已经被采集过: %s' %
                                    (DAY_LIMITED, resume_id))
                continue
            self.h_black_list.hset(resume_id, today.replace('|', ''))
            resume_detail = self.get_resume_detail(resume_args=resume_args)
            if not resume_detail:
                continue
            resume_uuid = str(uuid.uuid1())
            content = json.dumps(
                {
                    'name': '',
                    'email': '',
                    'phone': '',
                    'html': resume_detail
                },
                ensure_ascii=False)
            sql = '''insert into spider_search.resume_raw (source, content, 
            createBy, 
            trackId, createtime, email, emailJobType, emailCity, subject) values 
            (%s, %s, "python", %s, now(), %s, %s, %s, %s)'''
            sql_value = (self.common_settings.SOURCE, content, resume_uuid,
                         self.auth_kwargs['username'],
                         search_args['keywords'], search_args['city'],
                         str(resume_detail.get('resumeNo')))

            resume_update_time = ''
            msg_data = {
                "channelType": "APP",
                "content": {
                    "content": content,
                    "id": '',
                    "createBy": "python",
                    "createTime": int(time.time() * 1000),
                    "ip": '',
                    "resumeSubmitTime": '',
                    "resumeUpdateTime": resume_update_time,
                    "source": self.common_settings.SOURCE,
                    "trackId": str(resume_uuid),
                    "avatarUrl": '',
                    "email": self.auth_kwargs['username'],
                    'emailJobType': search_args['keywords'],
                    'emailCity': search_args['city'],
                    'subject': str(resume_detail.get('resumeNo'))
                },
                "interfaceType": "PARSE",
                "resourceDataType": "RAW",
                "resourceType": "RESUME_SEARCH_AWAKE",
                "source": self.common_settings.SOURCE,
                "trackId": str(resume_uuid),
                'traceID': str(resume_uuid),
                'callSystemID': self.common_settings.CALL_SYSTEM_ID,
            }
            # self.mysql_handler.save(sql=sql, data=sql_value)
            res = self.save_data(sql=sql, data=sql_value, msg_data=msg_data)

            if res:
                count += 1
                self.h_use_record.hset(self.auth_kwargs['username'] + today,
                                       count)
                mysql_ = self.init_mysql(
                    user='******',
                    passwd='bi_admin#@1mofanghr',
                    host='172.16.25.1',
                    # user='******',
                    # passwd='bi_admin#@1mofanghr',
                    # host='10.0.3.52',
                    cursorclass=DictCursor,
                    cls_singleton=False)
                sql = '''
                    insert into spider.resume_awake_record (source, position, city,
                     raw_id, create_time, username) VALUES ('ZHI_LIAN', %s, 
                     %s, %s, now(), %s)
                    '''
                value = (search_args['keywords'].split('|')[0],
                         search_args['city'].split('|')[0], res,
                         self.auth_kwargs['username'])
                mysql_.save(sql, value)
                del mysql_

            time.sleep(random.randint(3, 5))
Example #12
    def init_search_account(self, use_type='SEARCH_AWAKE'):
        if sem.locked():
            return

        sem.acquire()
        try:
            self.q_search_account.clean()
            self.logger.info('开始初始化搜索帐号队列')
            if not self.q_search_account.empty():
                self.logger.info('当前队列非空,剩余帐号: %s' %
                                 self.q_search_account.qsize())
                sem.release()  # release before the early return so the semaphore is not held forever
                return

            mysql_ = self.init_mysql(
                user='******',
                passwd='nMGZKQIXr4PE8aR2',
                host='rm-2ze15h84ax219xf08.mysql.rds.aliyuncs.com',
                # user='******',
                # passwd='bi_admin#@1mofanghr',
                # host='10.0.3.52',
                cursorclass=DictCursor,
                cls_singleton=False)
            account_list = mysql_.query_by_sql("""
                SELECT a.*
                FROM
                  autojob_v2.t_account a
                WHERE
                  EXISTS(
                      SELECT 1
                      FROM autojob_v2.t_account_use_type b
                      WHERE a.id = b.accountId
                            AND b.useType = '%s'
                            AND b.valid=1
                            AND a.valid = 1
                            AND a.status = 1
                            AND a.source = 'ZHI_LIAN'
                  )
                ORDER BY a.id DESC;
                """ % use_type)
            self.logger.info('共匹配到%s个有效帐号' % len(account_list))
            effective_account_list = []
            for account in account_list:
                today = datetime2str(datetime.datetime.now(), '%Y-%m-%d')
                global RUN_STATUS
                if RUN_STATUS.get(account['username'].encode('utf-8')):
                    self.logger.warning('当前帐号 %s 处于执行状态.' %
                                        account['username'].encode('utf-8'))
                    continue

                if self.h_over_search_limit.hget(
                        today + '|' + account['username'].encode('utf-8')):
                    self.logger.warning(
                        '当前帐号 %s 已达上限 %s,当天不再进行搜索.' %
                        (account['username'].encode('utf-8'),
                         self.h_use_record.hget(account['username'].encode(
                             'utf-8') + '|' + today)))
                    continue

                self.q_search_account.put(account)
                effective_account_list.append(account)

            self.logger.info('初始化搜索帐号队列完毕 %s' % len(effective_account_list))
            sem.release()

        except Exception as e:
            self.logger.exception('初始化搜索帐号队列失败: ' + str(e))
            sem.release()