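# -*- coding: utf-8 -*-
# The snippets below are Python 2 spider code and depend on a number of
# in-house helpers (Logger, RedisHash, RedisQueue, DingDingRobot, Identify,
# InitCore/InitCorePlus, cls_catch_exception, cls_refresh_cookie, cookie2str,
# datetime2str, str2datetime, get_proxy, remove_emoji, settings, sem, lock,
# RUN_STATUS, DAY_LIMITED) that are defined outside this section. A minimal
# sketch of the third-party imports the code needs is given here; the
# in-house module paths are unknown, so those are left as commented
# assumptions rather than real import lines.
import datetime
import json
import random
import re
import time
import uuid

import requests
from lxml import etree
from pymysql.cursors import DictCursor
from retrying import retry
# import PyV8  # legacy V8 binding used by the 51job captcha helper

# Assumed in-house imports (module paths are guesses, not from the source):
# from morgan.logger import Logger
# from morgan.redis_helper import RedisHash, RedisQueue
# from morgan.dingding import DingDingRobot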
def __init__(self, local_setting=None):
    super(ResumeZhiLianCrmBase, self).__init__(
        local_setting=local_setting,
        # common_settings_path='/data/config/morgan/'
        #                      'morgan_spider_common_settings_test.cfg'
    )
    # Blacklist of resumes already collected (detail-page dedup)
    self.h_black_list = RedisHash("zhi_lian_resume_crm_back_list",
                                  host=REDIS_HOST, port=REDIS_PORT,
                                  password=REDIS_PASSWORD)
    # Per-account usage count for the current day
    self.h_use_record = RedisHash("zhi_lian_resume_crm_search_use_record",
                                  host=REDIS_HOST, port=REDIS_PORT,
                                  password=REDIS_PASSWORD)
    # Flag marking accounts that have hit the daily search limit
    self.h_over_search_limit = RedisHash("zhi_lian_crm_over_search_limit",
                                         host=REDIS_HOST, port=REDIS_PORT,
                                         password=REDIS_PASSWORD)
    self.q_search_account = RedisQueue(
        "zhi_lian_resume_crm_search_account",
        host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD)
    self.robot_login = DingDingRobot(
        access_token="b80ce3024c1818fe341bfad52bed12f2"
                     "4448d6180174a88bc1570c8908f4623a")
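# The RedisHash wrapper above is in-house. For reference, a rough plain
# redis-py equivalent of the daily per-account counter pattern these hashes
# implement might look like the sketch below; the '<username>|<YYYY-MM-DD>'
# field layout matches the usage later in this file, but the function name
# and the redis-py client setup are assumptions, not part of the source.
import datetime
import redis

def bump_daily_use(client, hash_name, username, limit):
    """Increment today's counter for an account; True when over the limit."""
    today = datetime.datetime.now().strftime('%Y-%m-%d')
    field = '%s|%s' % (username, today)
    count = client.hincrby(hash_name, field, 1)  # atomic, unlike hget+hset
    return count >= limit

# client = redis.StrictRedis(host=REDIS_HOST, port=REDIS_PORT,
#                            password=REDIS_PASSWORD)
# if bump_daily_use(client, 'zhi_lian_resume_crm_search_use_record',
#                   'some_account', 2000):
#     pass  # mark the account as over its daily limit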
class Resume51job(InitCorePlus):
    def __init__(self, local_setting=settings.project_settings):
        super(Resume51job, self).__init__(
            local_setting=local_setting,
            # common_settings_path='/data/config/morgan/'
            #                      'morgan_spider_common_settings_test.cfg'
        )
        self.cookie = None
        self.proxies = None
        self.auth_kwargs = None
        self.source = 'FIVE_ONE'
        self.logger = Logger.timed_rt_logger()
        self.h = RedisHash("cookie_pool", host=REDIS_HOST, port=REDIS_PORT,
                           password=REDIS_PASSWORD)
        self.h_limit = RedisHash("account_limit", host=REDIS_HOST,
                                 port=REDIS_PORT, password=REDIS_PASSWORD)
        self.h_black_list = RedisHash("five_one_resume_back_list",
                                      host=REDIS_HOST, port=REDIS_PORT,
                                      password=REDIS_PASSWORD)
        self.h_use_record = RedisHash("five_one_resume_search_use_record",
                                      host=REDIS_HOST, port=REDIS_PORT,
                                      password=REDIS_PASSWORD)
        self.h_search_empty_times = RedisHash(
            "five_one_resume_search_empty_times",
            host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD)
        self.q_search_account = RedisQueue("five_one_resume_search_account",
                                           host=REDIS_HOST, port=REDIS_PORT,
                                           password=REDIS_PASSWORD)
        self.h_status = RedisHash(
            'FIVE_ONE',
            # host='172.16.25.35',
            # port='6379',
            host='r-2ze95acf94ea4a04.redis.rds.aliyuncs.com',
            port='6379',
            password='******',
            db=6)
        self.h_awake_flow_no = RedisHash("awake_flow_no", host=REDIS_HOST,
                                         port=REDIS_PORT,
                                         password=REDIS_PASSWORD)
        # self.search_db = self.init_mysql(db='spider_search')
        self.mns_handler = self.init_mns(
            endpoint='http://1315265288610488.mns.cn-beijing.aliyuncs.com',
            queue='morgan-queue-resume-raw'
            # queue='morgan-queue-test-resume-raw'
        )
        self.mysql_handler = self.init_mysql()
        self.user_agent = 'Mozilla/5.0 (X11; Linux x86_64) ' \
                          'AppleWebKit/537.36 (KHTML, ' \
                          'like Gecko) Chrome/62.0.3202.62 ' \
                          'Safari/537.36'
        self.robot_login = DingDingRobot(
            access_token="3c7b5bd12b49cfdd5f47f00df7fa9c478"
                         "485254485d567ff6abcbf45927e648a")

    def release_run_status(self):
        global RUN_STATUS
        RUN_STATUS[self.auth_kwargs['username'].encode('utf-8')] = 0
        self.logger.info('Marked account %s as not running'
                         % self.auth_kwargs['username'].encode('utf-8'))

    def get_cookie(self):
        count = self.h_limit.hget(
            self.source + '|' + self.auth_kwargs['username'].encode('utf-8'))
        count = int(count) if count else 0
        count += 1
        # Write the retry count back for this account.
        self.h_limit.hset(
            self.source + '|' + self.auth_kwargs['username'].encode('utf-8'),
            count)
        cookies = self.h.hget(
            self.source + '|' + self.auth_kwargs['username'].encode('utf-8'))
        # No cached cookie: force-disable the account.
        if not cookies:
            count = 5
        # (A login-service call used to live here but is commented out in the
        #  source: it posted the credentials and proxy to an internal login
        #  API at http://172.16.25.36:5000/fiveone_login/ and cached the
        #  returned cookies.)
        if count >= 5:
            self.logger.warning('Account %s is login-restricted'
                                % self.auth_kwargs['username'].encode('utf-8'))
            if int(count) == 6:
                self.robot_login.send_markdown(
                    title="Resume search",
                    content="#### 51job resume-search account disabled.\n"
                            "- Account: %s\n"
                            "- Password: %s\n"
                            "- Member name: %s\n"
                            "- Proxy: %s\n\n"
                            % (self.auth_kwargs['username'].encode('utf-8'),
                               self.auth_kwargs['password'].encode('utf-8'),
                               self.auth_kwargs['account_name'].encode('utf-8'),
                               self.auth_kwargs['ip'].encode('utf-8') + ':' +
                               self.auth_kwargs['port'].encode('utf-8')))
            raise FiveOneResumeException('account_limited')
        self.cookie = cookie2str(eval(cookies))
        self.h_limit.hset(
            self.source + '|' + self.auth_kwargs['username'].encode('utf-8'),
            0)
        return cookies

    def update_status(self, **kwargs):
        try:
            url = self.common_settings.BACK_ENDS + \
                '/task/searchAwake/update.json'
            data = {
                'flowNo': kwargs['flow_no'],
                'source': self.common_settings.SOURCE,
                'status': kwargs['status'],
                'message': ''
            }
            res = self.html_downloader.download(url, method='POST', data=data)
            if res.json().get('code') == 200:
                self.logger.info('Task status updated.')
            else:
                self.logger.warning('Task status update failed: %s'
                                    % str(res.json()))
        except Exception as e:
            self.logger.exception('Task status update error: %s' % str(e))

    def is_limited(self, username):
        # Caps how many detail pages an account may open per day.
        today = '|' + datetime2str(datetime.datetime.now(), '%Y-%m-%d')
        if not self.h_use_record.hget(username + today):
            self.h_use_record.hset(username + today, 0)
            count = 0
        else:
            count = int(self.h_use_record.hget(username + today))
        # NOTE: the original compared `username == (tuple)`, which is always
        # False, and both branches applied the same threshold anyway, so the
        # special-case account list ('E4gl36307', '18518261507', 'bjlb7610')
        # is kept only as this comment.
        if count >= settings.COUNT_LIMITED:
            return True
        return count

    def init_search_account(self):
        if sem.locked():
            return
        sem.acquire()
        try:
            self.logger.info('Initializing the search-account queue')
            if not self.q_search_account.empty():
                self.logger.info('Queue is not empty, %s accounts left'
                                 % self.q_search_account.qsize())
                return
            mysql_ = self.init_mysql(
                user='******',
                passwd='nMGZKQIXr4PE8aR2',
                host='rm-2ze15h84ax219xf08.mysql.rds.aliyuncs.com',
                # user='******',
                # passwd='bi_admin#@1mofanghr',
                # host='10.0.3.52',
                cursorclass=DictCursor,
                cls_singleton=False)
            sql = """
            SELECT a.*
            FROM autojob_v2.t_account a
            WHERE EXISTS(
                SELECT 1 FROM autojob_v2.t_account_use_type b
                WHERE a.id = b.accountId
                  AND b.useType = 'SEARCH_AWAKE'
                  AND b.valid = 1
                  AND a.valid = 1
                  AND a.status = 1
                  AND a.source = 'FIVE_ONE'
            );
            """
            account_list = mysql_.query_by_sql(sql)
            self.logger.info('Matched %s valid accounts' % len(account_list))
            for account in account_list:
                global RUN_STATUS
                if RUN_STATUS.get(account['username'].encode('utf-8')):
                    self.logger.warning('Account %s is currently running.'
                                        % account['username'].encode('utf-8'))
                    continue
                if self.is_limited(username=account['username']) is True:
                    continue
                self.q_search_account.put(account)
            self.logger.info('Search-account queue initialized, %s'
                             % self.q_search_account.qsize())
        except Exception as e:
            self.logger.exception('Failed to initialize search-account '
                                  'queue: ' + str(e))
        finally:
            # try/finally fixes the original leak where the early return
            # above skipped sem.release().
            sem.release()

    def start_push_task(self):
        if sem.locked():
            return
        sem.acquire()
        try:
            self.logger.info('Pushing 51job awake tasks.')
            act = CreateTask()
            act.create_task_from_mysql(use_keywords=True)
        except Exception as e:
            self.logger.exception(e)
        finally:
            sem.release()

    @retry(stop_max_attempt_number=5)
    @cls_catch_exception
    @cls_refresh_cookie
    def init_search_page(self):
        url = 'https://ehire.51job.com/Candidate/SearchResumeNew.aspx'
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;'
                      'q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Connection': 'keep-alive',
            'Host': 'ehire.51job.com',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': self.user_agent,
            'Cookie': self.cookie
        }
        res = self.html_downloader.download(url, headers=headers,
                                            proxies=self.proxies,
                                            allow_redirects=False)
        if res.status_code == 302:
            self.logger.warning('cookie invalid.')
            raise MfCookieValidException('cookie_invalid')
        self.logger.info('Search page initialized.')
        return res.content

    @retry(stop_max_attempt_number=5)
    @cls_catch_exception
    def get_captcha(self, referer, access_key, do_type='CheckResumeView'):
        js_context = PyV8.JSContext()
        js_context.__enter__()
        js_context.eval(FIVE_ONE_JS_FUNC)
        get_guid = js_context.locals.get_guid
        guid = get_guid(20, 16)
        url = 'https://ehire.51job.com/ajax/Validate/PageValidate.aspx?' \
              'type=getverify&key=%s&guid=%s' \
              '&doType=%s' % (access_key, guid, do_type)
        headers = {
            'Accept': 'image/webp,image/apng,image/*,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            'Host': 'ehire.51job.com',
            'Pragma': 'no-cache',
            'Referer': referer,
            'User-Agent': self.user_agent,
            'Cookie': self.cookie
        }
        res = self.html_downloader.download(url, headers=headers,
                                            proxies=self.proxies,
                                            allow_redirects=False)
        if res.status_code != 200:
            self.logger.warning('Failed to fetch captcha.')
            raise Exception
        self.logger.info('Captcha image fetched.')
        img_path = './data/img/test-%s.png' % str(uuid.uuid1())
        with open(img_path, 'wb') as f:
            f.write(res.content)
        self.check_captcha(img_path=img_path, referer=referer,
                           guid=get_guid(20, 16), access_key=access_key,
                           do_type=do_type)

    @retry(stop_max_attempt_number=3)
    @cls_catch_exception
    def check_captcha(self, img_path, referer, guid, access_key, do_type):
        identify = Identify()
        captcha_order = ('{"height_row3": 156, "height_row2": 98, '
                         '"height_row1": 40, "index_list_1": [12, 7, 2, 11, '
                         '5, 8, 4, 0, 1, 3, 6, 9, 10, 13, 14], '
                         '"index_list_3": [23, 4, 22, 10, 12, 5, 26, 28, 0, '
                         '11, 17, 25, 16, 29, 20], "col_num": 15, '
                         '"index_list_2": [3, 13, 18, 2, 7, 1, 19, 9, 24, '
                         '27, 14, 8, 15, 21, 6]}')
        new_img_path = identify.recover_captcha(captcha_file_path=img_path,
                                                captcha_order=captcha_order)
        res_string = identify.analysis_captcha(img_path=new_img_path,
                                               type_code=9104)
        # os.rmdir(img_path)
        # os.rmdir(new_img_path)
        if not res_string:
            raise Exception
        self.logger.info('Captcha coordinates returned: %s'
                         % res_string.encode('utf-8'))
        url = 'https://ehire.51job.com/ajax/Validate/PageValidate.aspx'
        headers = {
            'Accept': 'application/xml, text/xml, */*; q=0.01',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Host': 'ehire.51job.com',
            'Origin': 'https://ehire.51job.com',
            'Pragma': 'no-cache',
            'Referer': referer,
            'User-Agent': self.user_agent,
            'X-Requested-With': 'XMLHttpRequest',
        }
        # The answer comes back as 'x,y|x,y|...'; the page expects each y
        # shifted up by the 40px header row.
        real_string = ';'.join([
            ','.join([item.split(',')[0],
                      str(int(item.split(',')[1]) - 40)])
            for item in res_string.split('|')
        ])
        self.logger.info('Adjusted captcha coordinates: %s'
                         % real_string.encode('utf-8'))
        data = {
            'type': 'checkverift',
            'key': access_key,
            'p': real_string,
            'guid': guid,
            'doType': do_type,
        }
        res = self.html_downloader.download(url, method='POST', data=data,
                                            headers=headers,
                                            proxies=self.proxies,
                                            allow_redirects=False)
        code = self.html_parser.parser(
            res.content).find('msgtype').text.encode('utf-8')
        if code == '0':
            self.logger.warning('Captcha answer was wrong')
            return
        if code == '1':
            self.logger.info('Captcha verified')
        else:
            self.logger.warning('Unknown return code: %s' % code)

    @retry(stop_max_attempt_number=5)
    @cls_catch_exception
    @cls_refresh_cookie
    def get_resume_list(self, previous_page_html,
                        action='pagerTopNew$ctl03', **search_args):
        """
        :param previous_page_html:
        :param action:
        :param search_args: city['北京|010000'], keywords['销售代表|3001']
        :return:
        """
        url = 'https://ehire.51job.com/Candidate/SearchResumeNew.aspx'
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;'
                      'q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            'Content-Type': 'application/x-www-form-urlencoded',
            'Host': 'ehire.51job.com',
            'Origin': 'https://ehire.51job.com',
            'Pragma': 'no-cache',
            'Referer': 'https://ehire.51job.com/Candidate/'
                       'SearchResumeNew.aspx',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': self.user_agent,
            'Cookie': self.cookie
        }
        _soups = self.html_parser.parser(previous_page_html)
        # WebForms postback payload. The Chinese literals ('不限', '近1周',
        # '年龄:18-30', ...) are the exact values the site expects and must
        # stay verbatim.
        data = {
            '__EVENTTARGET': action,
            '__EVENTARGUMENT': '',
            '__LASTFOCUS': '',
            '__VIEWSTATE': _soups.find('input', id='__VIEWSTATE').get('value'),
            'ctrlSerach$search_keyword_txt':
                search_args['keywords'].split('|')[0],
            'ctrlSerach$search_company_txt': '',
            'ctrlSerach$search_area_input': '',
            'ctrlSerach$search_area_hid': '',
            'ctrlSerach$search_funtype_hid': '',
            'ctrlSerach$search_expectsalaryf_input': '不限',
            'ctrlSerach$search_expectsalaryt_input': '不限',
            'ctrlSerach$search_industry_hid': '',
            'ctrlSerach$search_wyf_input': '不限',
            'ctrlSerach$search_wyt_input': '不限',
            'ctrlSerach$search_df_input': '不限',
            'ctrlSerach$search_dt_input': '不限',
            'ctrlSerach$search_cursalaryf_input': '不限',
            'ctrlSerach$search_cursalaryt_input': '不限',
            'ctrlSerach$search_age_input': '年龄:18-30',
            'ctrlSerach$search_agef_input': '18',
            'ctrlSerach$search_aget_input': '30',
            'ctrlSerach$search_expjobarea_input':
                search_args['city'].split('|')[0],
            'ctrlSerach$search_expjobarea_hid': search_args['city'],
            'ctrlSerach$search_forlang_input': '语言',
            'ctrlSerach$search_fl_input': '不限',
            'ctrlSerach$search_fllsabilityll_input': '不限',
            'ctrlSerach$search_englishlevel_input': '英语等级',
            'ctrlSerach$search_sex_input': '性别',
            'ctrlSerach$search_major_input': '专业',
            'ctrlSerach$search_major_hid': '',
            'ctrlSerach$search_hukou_input': '户口',
            'ctrlSerach$search_hukou_hid': '',
            'ctrlSerach$search_rsmupdate_input': '近1周',
            'ctrlSerach$search_jobstatus_input': '求职状态',
            'send_cycle': '1',
            'send_time': '7',
            'send_sum': '10',
            'ctrlSerach$hidSearchValue':
                u'%s##0#######20#35############近1周|1##1#0##%s#0#0#0'
                % (search_args['keywords'].split('|')[0],
                   search_args['city']),
            'ctrlSerach$hidKeyWordMind': '',
            'ctrlSerach$hidRecommend': '',
            'ctrlSerach$hidWorkYearArea': '',
            'ctrlSerach$hidDegreeArea': '',
            'ctrlSerach$hidSalaryArea': '',
            'ctrlSerach$hidCurSalaryArea': '',
            'ctrlSerach$hidIsRecDisplay': '1',
            'showselected': '',
            'pagerTopNew$ctl06': '50',
            'cbxColumns$0': 'AGE',
            'cbxColumns$1': 'WORKYEAR',
            'cbxColumns$2': 'SEX',
            'cbxColumns$3': 'AREA',
            'cbxColumns$4': 'WORKFUNC',
            'cbxColumns$5': 'TOPDEGREE',
            'cbxColumns$6': 'LASTUPDATE',
            'hidAccessKey': _soups.find('input',
                                        id='hidAccessKey').get('value'),
            'hidShowCode': '0',
            'hidDisplayType': '1',
            'hidEhireDemo': '',
            'hidUserID': '',
            'hidCheckUserIds': _soups.find('input',
                                           id='hidCheckUserIds').get('value'),
            'hidCheckKey': _soups.find('input', id='hidCheckKey').get('value'),
            'hidEvents': '',
            'hidNoSearchIDs': '',
            'hidBtnType': '',
            'hideMarkid': '',
            'hidStrAuthority': _soups.find('input',
                                           id='hidStrAuthority').get('value'),
            'hidDownloadNum': _soups.find('input',
                                          id='hidDownloadNum').get('value'),
            'hidKeywordCookie': '',
            'showGuide': '',
        }
        if not search_args['use_keywords']:
            self.logger.info('Searching by job function.')
            data['ctrlSerach$search_keyword_txt'] = ''
            data['ctrlSerach$search_funtype_hid'] = search_args['keywords']
            # NOTE: the original wrote this to a bare 'hidSearchValue' key;
            # the form field used everywhere else is
            # 'ctrlSerach$hidSearchValue', which looks like the intent.
            data['ctrlSerach$hidSearchValue'] = \
                u'##0#%s######20#35############近1周|1##1#0##%s#0#0#0' \
                % (search_args['keywords'], search_args['city'])
        else:
            self.logger.info('Searching by keyword.')
        res = self.html_downloader.download(url, method='POST',
                                            headers=headers, data=data,
                                            proxies=self.proxies,
                                            allow_redirects=False)
        if res.status_code == 302:
            self.logger.warning('cookie invalid.')
            raise MfCookieValidException('cookie_invalid')
        access_key = self.html_parser.parser(res.content).find(
            'input', id='hidAccessKey').get('value')
        soups = self.html_parser.parser(res.content).find_all(
            'td', class_='Common_list_table-id-text')
        resume_list = []
        if not soups:
            # Once an account has seen 10 empty result pages in a row,
            # assume a captcha is blocking it and solve one.
            empty_times = int(self.h_search_empty_times.hget(
                self.auth_kwargs['username'])) \
                if self.h_search_empty_times.hget(
                    self.auth_kwargs['username']) else 0
            if empty_times > 10:
                self.logger.warning(
                    'Hit a captcha on the search list. %s'
                    % self.auth_kwargs['username'].encode('utf-8'))
                self.get_captcha(referer=res.url, access_key=access_key,
                                 do_type='CheckSearchResume')
                self.h_search_empty_times.hset(
                    self.auth_kwargs['username'], 0)
                raise Exception
            else:
                self.logger.warning(
                    'No search results, skipping task [%s, %s, %s]'
                    % (self.auth_kwargs['username'].encode('utf-8'),
                       search_args['keywords'].encode('utf-8'),
                       search_args['city'].encode('utf-8')))
                empty_times += 1
                self.h_search_empty_times.hset(
                    self.auth_kwargs['username'], empty_times)
                return resume_list, ''
        for soup in soups:
            ref_time = soup.find_parent().find_all('td')[-2].text.encode(
                'utf-8')
            # The original had a Monday special-case (isoweekday() == 1)
            # whose branches were identical to the default, so the two
            # branches are collapsed here.
            global DAY_LIMITED
            DAY_LIMITED = settings.DAY_LIMITED
            limited_day = datetime.datetime.now() - datetime.timedelta(days=1)
            if str2datetime(ref_time, '%Y-%m-%d').date() < \
                    limited_day.date():
                self.logger.warning('Matched a resume older than %s day(s), '
                                    'skipping the rest.' % DAY_LIMITED)
                break
            resume_list.append(soup.find('a').get('href'))
        try:
            page = self.html_parser.parser(res.content).find(
                'div', class_='Search_page-numble').find(
                'a', class_='active').get('title').encode('utf-8')
        except Exception:
            self.logger.warning('No pagination widget found, skipping task '
                                '[%s, %s]'
                                % (search_args['keywords'].encode('utf-8'),
                                   search_args['city'].encode('utf-8')))
            return resume_list, ''
        self.logger.info(
            'page: %s, fetched %s resumes in total, search args [%s, %s]'
            % (page, len(resume_list),
               search_args['keywords'].encode('utf-8'),
               search_args['city'].encode('utf-8')))
        if int(page) > settings.TASK_PAGE_LIMIT:
            raise FiveOneResumeException('task_page_limit')
        return resume_list, res.content

    @retry(stop_max_attempt_number=5)
    @cls_catch_exception
    @cls_refresh_cookie
    def get_resume_detail(self, resume_url):
        url = 'https://ehire.51job.com/' + resume_url
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;'
                      'q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'ehire.51job.com',
            'Referer': 'https://ehire.51job.com/Candidate/SearchResumeNew.aspx',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': self.user_agent,
            'Cookie': self.cookie
        }
        res = self.html_downloader.download(url, headers=headers,
                                            proxies=self.proxies,
                                            allow_redirects=False)
        if res.status_code == 302:
            self.logger.warning('cookie invalid.')
            raise MfCookieValidException('cookie_invalid')
        # '简历ID:' ("resume ID:") is a marker on the detail page; keep it
        # verbatim.
        if '简历ID:' in res.content:
            self.logger.info('Fetched resume body, account %s'
                             % self.auth_kwargs['username'].encode('utf-8'))
            return res.content
        self.logger.warning('Failed to fetch resume body, account %s'
                            % self.auth_kwargs['username'].encode('utf-8'))
        access_key = self.html_parser.parser(res.content).find(
            'input', id='hidAccessKey').get('value')
        # (A DingDing alert for detail-page captchas used to live here; it is
        #  commented out in the source.)
        self.get_captcha(referer=res.url, access_key=access_key)
        time.sleep(60)
        raise Exception

    def resume_search(self, **search_args):
        self.get_cookie()
        ip = eval(self.auth_kwargs['proxy'])['ip']
        port = eval(self.auth_kwargs['proxy'])['port']
        self.proxies = {
            'http': 'http://%s:%s' % (ip, port),
            'https': 'https://%s:%s' % (ip, port),
        }
        today = '|' + datetime2str(datetime.datetime.now(), '%Y-%m-%d')
        flag = False
        while True:
            awake_flow_no = self.h_awake_flow_no.hget('FIVE_ONE')
            if self.h_status.hget(awake_flow_no) == '400':
                self.logger.info('Crawler is paused. Sleeping 60s')
                time.sleep(60)
                continue
            if flag is False:
                # Scan the first page.
                page_html = self.init_search_page()
                resume_list, page_html = self.get_resume_list(
                    page_html, action='pagerTopNew$ctl00', **search_args)
                flag = True
            else:
                resume_list, page_html = self.get_resume_list(
                    page_html, **search_args)
            if not resume_list:
                raise FiveOneResumeException('resume_list_empty')
            time.sleep(random.randint(1, 5))
            for resume_args in resume_list:
                if self.is_limited(self.auth_kwargs['username']) is True:
                    raise FiveOneResumeException('user_record_limited')
                count = self.is_limited(self.auth_kwargs['username'])
                # Dedup by resume ID.
                resume_id = re.findall(r'(?<=hidUserID=)\d+?(?=&)',
                                       resume_args)[0].encode('utf-8')
                last_search_day = self.h_black_list.hget(resume_id)
                if last_search_day:
                    distance = (
                        str2datetime(today.replace('|', ''), '%Y-%m-%d') -
                        str2datetime(last_search_day, '%Y-%m-%d')).days
                else:
                    distance = DAY_LIMITED + 1
                if distance < DAY_LIMITED:
                    self.logger.warning(
                        'Resume already collected within the last %s '
                        'day(s): %s' % (DAY_LIMITED, resume_id))
                    continue
                self.h_black_list.hset(resume_id, today.replace('|', ''))
                resume_detail = self.get_resume_detail(resume_url=resume_args)
                if not resume_detail:
                    continue
                resume_uuid = str(uuid.uuid1())
                # content_origin = {'name': '', 'email': '', 'phone': '',
                #                   'html': resume_detail.decode('utf-8')}
                # content = json.dumps(content_origin, ensure_ascii=False)
                content = resume_detail.decode('utf-8')
                sql = '''INSERT INTO spider_search.resume_raw
                         (source, content, createBy, trackId, createtime,
                          email, emailJobType, emailCity, subject)
                         VALUES (%s, %s, "python", %s, now(), %s, %s, %s, %s)'''
                sql_value = (self.common_settings.SOURCE, content,
                             resume_uuid, self.auth_kwargs['username'],
                             search_args['keywords'].split('|')[0],
                             search_args['city'].split('|')[0],
                             str(resume_id))
                resume_update_time = ''
                msg_data = {
                    "channelType": "WEB",
                    "content": {
                        "content": content,
                        "id": '',
                        "createBy": "python",
                        "createTime": int(time.time() * 1000),
                        "ip": '',
                        "resumeSubmitTime": '',
                        "resumeUpdateTime": resume_update_time,
                        "source": self.common_settings.SOURCE,
                        "trackId": str(resume_uuid),
                        "avatarUrl": '',
                        "email": self.auth_kwargs['username'],
                        'emailJobType': search_args['keywords'].split('|')[0],
                        'emailCity': search_args['city'].split('|')[0],
                        'subject': str(resume_id)
                    },
                    "interfaceType": "PARSE",
                    "resourceDataType": "RAW",
                    "resourceType": "RESUME_SEARCH_AWAKE",
                    "source": self.common_settings.SOURCE,
                    "trackId": str(resume_uuid),
                    'traceID': str(resume_uuid),
                    'callSystemID': self.common_settings.CALL_SYSTEM_ID,
                }
                # self.mysql_handler.save(sql=sql, data=sql_value)
                res = self.save_data(sql=sql, data=sql_value,
                                     msg_data=msg_data)
                if res:
                    count += 1
                    self.h_use_record.hset(
                        self.auth_kwargs['username'] + today, count)
                    mysql_ = self.init_mysql(
                        user='******',
                        passwd='bi_admin#@1mofanghr',
                        host='172.16.25.1',
                        # user='******',
                        # passwd='bi_admin#@1mofanghr',
                        # host='10.0.3.52',
                        cursorclass=DictCursor,
                        cls_singleton=False)
                    sql = '''
                    INSERT INTO spider.resume_awake_record
                    (source, position, city, raw_id, create_time, username)
                    VALUES ('FIVE_ONE', %s, %s, %s, now(), %s)
                    '''
                    value = (search_args['keywords'].split('|')[0],
                             search_args['city'].split('|')[0],
                             res, self.auth_kwargs['username'])
                    mysql_.save(sql, value)
                time.sleep(random.randint(5, 7))
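# How the pieces above fit together: a worker pops an account, sets
# auth_kwargs, and runs one search task. This driver is a sketch, not part
# of the source; the exception names come from the class above, and the
# task payload (keywords/city/use_keywords) mirrors the **search_args the
# class reads. The account dict is assumed to carry the t_account columns
# ('username', 'password', 'proxy', ...).
def run_one_task(spider, account, task):
    spider.auth_kwargs = account
    try:
        spider.resume_search(keywords=task['keywords'],  # e.g. u'销售代表|3001'
                             city=task['city'],          # e.g. u'北京|010000'
                             use_keywords=True)
    except FiveOneResumeException as e:
        # 'resume_list_empty' / 'task_page_limit' end the task normally;
        # 'account_limited' / 'user_record_limited' retire the account.
        spider.logger.info('task finished: %s' % str(e))
    finally:
        spider.release_run_status()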
class ZhiLianEmailMobile(InitCore):
    def __init__(self):
        super(ZhiLianEmailMobile, self).__init__()
        self.mns_handler = self.init_mns(
            endpoint='http://1315265288610488.mns.cn-beijing.aliyuncs.com',
            queue='morgan-queue-resume-raw'
        )
        # self.spider_admin = self.init_mysql(host='10.0.3.52',
        #                                     port=3306,
        #                                     user='******',
        #                                     passwd='bi_admin#@1mofanghr',
        #                                     db='spider',
        #                                     cursorclass=DictCursor)
        self.spider_admin = self.init_mysql(
            host='rm-2ze15h84ax219xf08.mysql.rds.aliyuncs.com',
            port=3306,
            user='******',
            passwd='n4UZknFH6F',
            db='spider',
            cursorclass=DictCursor)
        self.robot = DingDingRobot(
            access_token="2d021b9d686a5432aa0d65f0e75d03bc"
                         "6200d2b766616c75224d8e7bfb1cbc57")
        self.logger = Logger.timed_rt_logger()
        self.q = RedisQueue('zhi_lian_need_get_mobile',
                            host='172.16.25.36', port=6379)

    def prepare_resume_ids(self):
        if lock.locked():
            return
        lock.acquire()
        try:
            download_record_sql = '''
            SELECT * FROM spider.download_record
            WHERE source=3 AND valid=0
            ORDER BY id ASC LIMIT 100
            '''
            result = self.spider_admin.query_by_sql(download_record_sql)
            if not result:
                self.logger.info('No resumes left to download %s'
                                 % download_record_sql)
                time.sleep(60)
                return
            for item in result:
                self.q.put(item)
            self.logger.info('Tasks queued, qsize %s' % self.q.qsize())
        except Exception as e:
            self.logger.exception(str(e))
        finally:
            # try/finally fixes the original pattern, which released the lock
            # manually on each path and could skip it on an early return.
            lock.release()

    def load_data(self, result):
        """
        Load the record that needs fetching.
        valid == 0 marks a pending download, 2 marks it done.
        """
        self.logger.info('Loading record %s, resume %s'
                         % (result['id'], result['resumeId'].encode('utf-8')))
        sql = '''
        SELECT * FROM spider.resume_raw WHERE id = %s
        ''' % result['resumeId'].encode('utf-8')
        data_lst = self.spider_admin.query_by_sql(sql)
        self.logger.info('Fetched %s resume row(s)' % len(data_lst))
        if len(data_lst) > 0:
            raw_data = data_lst[0]
            # '我要联系TA' is the "contact this candidate" link on the page.
            # origin_url = re.search(
            #     '(https://ihr\.zhaopin\.com/job/relay\.html?.*?)">我要联系TA',
            #     data_lst[0]['content'])
            origin_url = re.search('(https://ihr.*?)">我要联系TA</a>',
                                   data_lst[0]['content'].encode('utf-8'))
            if origin_url:
                origin_url = origin_url.group(1)
                raw_data['origin_url'] = origin_url
                raw_data['download_id'] = result['id']
                return raw_data
            else:
                self.logger.warning('No contact-details link matched.')
                self.update_record(result['id'], 3)
        else:
            self.logger.error('Resume body not found %s '
                              % result['resumeId'].encode('utf-8'))
        return None

    def update_record(self, id, result):
        """
        Update the valid flag on download_record.
        """
        sql = '''
        UPDATE spider.download_record SET valid=%s WHERE id=%s
        '''
        result = self.spider_admin.save(sql, [result, id])
        self.logger.info('%s download_record updated, result %s'
                         % (id, result))

    def load_html(self, origin_url):
        """
        Request the contact-details page.
        """
        proxy = get_proxy()
        self.logger.info('Using proxy: %s' % proxy)
        session = requests.session()
        content1 = session.get(
            url=origin_url, proxies=proxy,
            headers={
                'Accept': 'text/html,application/xhtml+xml,application/xml;'
                          'q=0.9,image/webp,image/apng,*/*;q=0.8',
                'Connection': 'keep-alive',
                'Host': 'ihr.zhaopin.com',
                'Upgrade-Insecure-Requests': '1',
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) '
                              'AppleWebKit/537.36 (KHTML, like Gecko) '
                              'Chrome/65.0.3325.181 Safari/537.36'
            }).content
        document = etree.HTML(content1)
        # NOTE: the original called xpath('//a') on the element, which
        # searches the whole document; './/a' is the relative form that was
        # presumably intended.
        name = document.xpath(
            '//div[@class="login_content"]/p')[0].xpath('.//a')[0].text
        if not name:
            params = re.search('param=(.*?)$', origin_url)
            if params:
                params = params.group(1)
            else:
                return None
            url = ('https://ihr.zhaopin.com/resumemanage/emailim.do?s=%s'
                   % params)
            headers = {
                'Accept': 'application/json, text/javascript, */*; q=0.01',
                'Host': 'ihr.zhaopin.com',
                'Referer': 'https://ihr.zhaopin.com/job/relay.html?param=%s'
                           % params,
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) '
                              'AppleWebKit/537.36 (KHTML, like Gecko) '
                              'Chrome/65.0.3325.181 Safari/537.36',
                'X-Requested-With': 'XMLHttpRequest',
            }
            content = session.get(url=url, headers=headers, proxies=proxy,
                                  allow_redirects=True).content
            self.logger.info('%s' % content)
            # The site answers with a Chinese IP-risk notice when the proxy
            # is flagged; the markers are matched verbatim.
            if '近期我们监控到您所用的IP地址出现风险,您的IP地址是' in content \
                    and '临时将此IP地址进行安全保护,但可能导致您无法正常登陆' in content \
                    and '如有问题,请与客服联系,给您带来的不便,敬请谅解' in content:
                self.robot.send_text('Proxy is not usable %s ' % proxy)
                return None
            if content:
                data = json.loads(content, encoding='utf-8')
                if data['code'] == 200:
                    return json.loads(content, encoding='utf-8')
        return None

    def push_mns(self, raw_data):
        try:
            msg_data = {
                "channelType": "WEB",
                "content": {
                    "content": raw_data['content'],
                    "id": raw_data['id'],
                    "createBy": "python",
                    "createTime": str(raw_data['createTime']),
                    "ip": '',
                    "resumeSubmitTime": '',
                    "resumeUpdateTime": str(raw_data['resumeUpdateTime']),
                    "source": raw_data['source'],
                    "trackId": str(raw_data['trackId']),
                    "avatarUrl": '',
                    "email": raw_data['email'],
                    'emailJobType': raw_data['emailJobType'],
                    'emailCity': raw_data['emailCity'],
                    'subject': raw_data['subject'],
                    'externalInfo': raw_data['externalInfo']
                },
                "interfaceType": "PARSE",
                "resourceDataType": "RAW",
                "resourceType": "RESUME_EMAIL",
                "source": raw_data['source'],
                "trackId": raw_data['trackId'],
                'traceID': raw_data['trackId'],
                'callSystemID': 'resume_awake_search',
            }
            dumps = json.dumps(msg_data, ensure_ascii=False).encode('utf-8')
            dumps = remove_emoji(dumps)
            self.mns_handler.save(dumps)
            self.logger.info('Pushed: %s' % raw_data['id'])
        except Exception as e:
            self.logger.exception(str(e))
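# A sketch of the worker loop these methods imply (not in the source): pop a
# download_record row, locate the relay URL in the stored resume, fetch the
# contact details, and push the enriched record to MNS. The RedisQueue.get()
# call and the 'externalInfo' payload shape are assumptions about the
# in-house queue and the downstream parser's contract.
def contact_worker(spider):
    while True:
        record = spider.q.get()  # row queued by prepare_resume_ids
        raw_data = spider.load_data(record)
        if not raw_data:
            continue
        detail = spider.load_html(raw_data['origin_url'])
        if not detail:
            continue
        raw_data['externalInfo'] = json.dumps(detail, ensure_ascii=False)
        spider.push_mns(raw_data)
        spider.update_record(raw_data['download_id'], 2)  # 2 = downloaded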
class ResumeZhiLianBase(InitCorePlus):
    def __init__(self, local_setting=None):
        super(ResumeZhiLianBase, self).__init__(
            local_setting=local_setting,
            # common_settings_path='/data/config/morgan/'
            #                      'morgan_spider_common_settings_test.cfg'
        )
        self.cookie = None
        self.access_token = None
        self.proxies = None
        self.auth_kwargs = None
        self.source = 'ZHI_LIAN'
        self.logger = Logger.timed_rt_logger()
        self.h = RedisHash("cookie_pool", host=REDIS_HOST, port=REDIS_PORT,
                           password=REDIS_PASSWORD)
        self.h_limit = RedisHash("account_limit", host=REDIS_HOST,
                                 port=REDIS_PORT, password=REDIS_PASSWORD)
        # Blacklist of resumes already collected (detail-page dedup)
        self.h_black_list = RedisHash("zhi_lian_resume_back_list",
                                      host=REDIS_HOST, port=REDIS_PORT,
                                      password=REDIS_PASSWORD)
        # Per-account usage count for the current day
        self.h_use_record = RedisHash("zhi_lian_resume_search_use_record",
                                      host=REDIS_HOST, port=REDIS_PORT,
                                      password=REDIS_PASSWORD)
        # Flag marking accounts that have hit the daily search limit
        self.h_over_search_limit = RedisHash("zhi_lian_over_search_limit",
                                             host=REDIS_HOST,
                                             port=REDIS_PORT,
                                             password=REDIS_PASSWORD)
        self.q_search_account = RedisQueue("zhi_lian_resume_search_account",
                                           host=REDIS_HOST, port=REDIS_PORT,
                                           password=REDIS_PASSWORD)
        self.h_status = RedisHash(
            'ZHI_LIAN',
            # host='172.16.25.35',
            # port='6379',
            host='r-2ze95acf94ea4a04.redis.rds.aliyuncs.com',
            port='6379',
            password='******',
            db=6)
        self.h_awake_flow_no = RedisHash("awake_flow_no", host=REDIS_HOST,
                                         port=REDIS_PORT,
                                         password=REDIS_PASSWORD)
        # self.search_db = self.init_mysql(db='spider_search')
        self.mns_handler = self.init_mns(
            endpoint='http://1315265288610488.mns.cn-beijing.aliyuncs.com',
            queue='morgan-queue-resume-raw'
            # queue='morgan-queue-test-resume-raw'
        )
        self.mysql_handler = self.init_mysql()
        self.user_agent = 'Mozilla/5.0 (X11; Linux x86_64) ' \
                          'AppleWebKit/537.36 (KHTML, ' \
                          'like Gecko) Chrome/62.0.3202.62 ' \
                          'Safari/537.36'
        self.robot = DingDingRobot(
            access_token="eb749abfe9080a69da6524b77f589b8f6"
                         "ddbcc182c7a41bf095b095336edb0a1")
        self.robot_login = DingDingRobot(
            access_token="3c7b5bd12b49cfdd5f47f00df7fa9c478"
                         "485254485d567ff6abcbf45927e648a")

    def init_search_account(self, use_type='SEARCH_AWAKE'):
        if sem.locked():
            return
        sem.acquire()
        try:
            self.q_search_account.clean()
            self.logger.info('Initializing the search-account queue')
            # NOTE: the queue was just cleaned, so this check can never
            # trigger; kept from the original.
            if not self.q_search_account.empty():
                self.logger.info('Queue is not empty, %s accounts left'
                                 % self.q_search_account.qsize())
                return
            mysql_ = self.init_mysql(
                user='******',
                passwd='nMGZKQIXr4PE8aR2',
                host='rm-2ze15h84ax219xf08.mysql.rds.aliyuncs.com',
                # user='******',
                # passwd='bi_admin#@1mofanghr',
                # host='10.0.3.52',
                cursorclass=DictCursor,
                cls_singleton=False)
            account_list = mysql_.query_by_sql("""
            SELECT a.*
            FROM autojob_v2.t_account a
            WHERE EXISTS(
                SELECT 1 FROM autojob_v2.t_account_use_type b
                WHERE a.id = b.accountId
                  AND b.useType = '%s'
                  AND b.valid = 1
                  AND a.valid = 1
                  AND a.status = 1
                  AND a.source = 'ZHI_LIAN'
            )
            ORDER BY a.id DESC;
            """ % use_type)
            self.logger.info('Matched %s valid accounts' % len(account_list))
            effective_account_list = []
            for account in account_list:
                today = datetime2str(datetime.datetime.now(), '%Y-%m-%d')
                global RUN_STATUS
                if RUN_STATUS.get(account['username'].encode('utf-8')):
                    self.logger.warning('Account %s is currently running.'
                                        % account['username'].encode('utf-8'))
                    continue
                if self.h_over_search_limit.hget(
                        today + '|' + account['username'].encode('utf-8')):
                    self.logger.warning(
                        'Account %s hit its daily limit %s, skipping for '
                        'today.'
                        % (account['username'].encode('utf-8'),
                           self.h_use_record.hget(
                               account['username'].encode('utf-8') +
                               '|' + today)))
                    continue
                self.q_search_account.put(account)
                effective_account_list.append(account)
            self.logger.info('Search-account queue initialized %s'
                             % len(effective_account_list))
        except Exception as e:
            self.logger.exception('Failed to initialize search-account '
                                  'queue: ' + str(e))
        finally:
            # try/finally fixes the original leak where the early return
            # above skipped sem.release().
            sem.release()

    def start_push_task(self):
        if sem.locked():
            return
        sem.acquire()
        try:
            self.logger.info('Pushing ZhiLian awake tasks.')
            act = CreateTask()
            act.create_task_from_mysql(use_keywords=True)
        except Exception as e:
            self.logger.exception(e)
        finally:
            sem.release()

    def release_run_status(self):
        global RUN_STATUS
        RUN_STATUS[self.auth_kwargs['username'].encode('utf-8')] = 0
        self.logger.info('Marked account %s as not running'
                         % self.auth_kwargs['username'].encode('utf-8'))

    def check_limit(self, count):
        if self.auth_kwargs['username'].encode('utf-8') in \
                settings.SAFE_LIMITED:
            if count >= settings.SAFE_COUNT_LIMITED:
                return True
            return False
        if random.randint(1, 200) == 1:
            # 0.5% chance to check against the lower search limit.
            if count >= settings.COUNT_LIMITED_L:
                return True
        if count >= settings.COUNT_LIMITED:
            return True
        return False

    def get_cookie(self):
        count = self.h_limit.hget(
            self.source + '|' + self.auth_kwargs['username'].encode('utf-8'))
        if not count:
            pass
        elif int(count) >= 5:
            self.logger.warning('Account %s is login-restricted'
                                % self.auth_kwargs['username'].encode('utf-8'))
            count = int(count) + 1
            self.h_limit.hset(
                self.source + '|' +
                self.auth_kwargs['username'].encode('utf-8'), count)
            if int(count) == 6:
                self.robot_login.send_markdown(
                    title="ZhiLian resume search",
                    content="#### ZhiLian resume-search account disabled.\n"
                            "- Account: %s\n"
                            "- Password: %s\n"
                            "- Proxy: %s\n\n"
                            % (self.auth_kwargs['username'].encode('utf-8'),
                               self.auth_kwargs['password'].encode('utf-8'),
                               self.auth_kwargs['ip'].encode('utf-8') + ':' +
                               self.auth_kwargs['port'].encode('utf-8')))
            raise AccountLimitedException('account_limited')
        cookies = self.h.hget(
            self.source + '|' + self.auth_kwargs['username'].encode('utf-8'))
        # (A login-service call used to live here but is commented out in the
        #  source: it posted the credentials and proxy to a local login API
        #  at http://127.0.0.1:5000/zhilian_login/ and cached the returned
        #  cookies.)
        # Force-disable the account when no cached cookie exists.
        if not cookies:
            self.cookie = ''
            self.h_limit.hset(
                self.source + '|' +
                self.auth_kwargs['username'].encode('utf-8'), 5)
            return
        else:
            self.cookie = cookie2str(eval(cookies))
            self.access_token = eval(cookies).get('Token', '')
            if not self.access_token:
                raise AccountLimitedException('account_limited')
            return eval(cookies)

    def set_cookie_invalid(self):
        self.h.hset(
            self.source + '|' + self.auth_kwargs['username'].encode('utf-8'),
            '')
        # (A rate-limited variant that invalidated the cookie at most once
        #  per five minutes is commented out in the source.)

    def update_status(self, **kwargs):
        try:
            url = self.common_settings.BACK_ENDS + \
                '/task/searchAwake/update.json'
            data = {
                'flowNo': kwargs['flow_no'],
                'source': self.common_settings.SOURCE,
                'status': kwargs['status'],
                'message': ''
            }
            res = self.html_downloader.download(url, method='POST', data=data)
            if res.json().get('code') == 200:
                self.logger.info('Task status updated.')
            else:
                self.logger.warning('Task status update failed: %s'
                                    % str(res.json()))
        except Exception as e:
            self.logger.exception('Task status update error: %s' % str(e))

    def resume_search(self, page, **search_args):
        self.get_cookie()
        ip = eval(self.auth_kwargs['proxy'])['ip']
        port = eval(self.auth_kwargs['proxy'])['port']
        self.proxies = {
            'http': 'http://%s:%s' % (ip, port),
            'https': 'https://%s:%s' % (ip, port),
        }
        today = '|' + datetime2str(datetime.datetime.now(), '%Y-%m-%d')
        resume_list = self.get_resume_list(page=page, **search_args)
        if not resume_list:
            raise ZhiLianResumeException('resume_list_empty')
        for resume_args in resume_list:
            # Caps how many detail pages an account may open per day.
            if not self.h_use_record.hget(
                    self.auth_kwargs['username'] + today):
                self.h_use_record.hset(
                    self.auth_kwargs['username'] + today, 0)
                count = 0
            else:
                count = int(self.h_use_record.hget(
                    self.auth_kwargs['username'] + today))
            if self.check_limit(count=count):
                today = datetime2str(datetime.datetime.now(), '%Y-%m-%d')
                self.h_over_search_limit.hset(
                    today + '|' +
                    self.auth_kwargs['username'].encode('utf-8'), 1)
                raise ZhiLianResumeException('user_record_limited')
            # Dedup by resume ID.
            try:
                resume_id = str(
                    resume_args.get('resumeNo').encode('utf-8')[:10])
            except Exception:
                resume_id = str(resume_args.get('number')[:10])
            mysql_1 = self.init_mysql(
                user='******',
                passwd='bi_admin#@1mofanghr',
                host='172.16.25.1',
                # user='******',
                # passwd='bi_admin#@1mofanghr',
                # host='10.0.3.52',
                cursorclass=DictCursor,
                cls_singleton=False)
            sql = '''
            INSERT INTO spider.resume_awake_record_no_repeat
            (source, position, city, raw_id, create_time, username)
            VALUES ('ZHI_LIAN', %s, %s, %s, now(), %s)
            '''
            value = (search_args['keywords'].split('|')[0],
                     search_args['city'].split('|')[0],
                     resume_id,
                     self.auth_kwargs['username'])
            mysql_1.save(sql, value)
            del mysql_1
            last_search_day = self.h_black_list.hget(resume_id)
            if last_search_day:
                distance = (str2datetime(today.replace('|', ''),
                                         '%Y-%m-%d') -
                            str2datetime(last_search_day, '%Y-%m-%d')).days
            else:
                distance = DAY_LIMITED + 1
            if distance < DAY_LIMITED:
                self.logger.warning(
                    'Resume already collected within the last %s day(s): %s'
                    % (DAY_LIMITED, resume_id))
                continue
            self.h_black_list.hset(resume_id, today.replace('|', ''))
            resume_detail = self.get_resume_detail(resume_args=resume_args)
            if not resume_detail:
                continue
            resume_uuid = str(uuid.uuid1())
            content = json.dumps(
                {
                    'name': '',
                    'email': '',
                    'phone': '',
                    'html': resume_detail
                },
                ensure_ascii=False)
            sql = '''INSERT INTO spider_search.resume_raw
                     (source, content, createBy, trackId, createtime,
                      email, emailJobType, emailCity, subject)
                     VALUES (%s, %s, "python", %s, now(), %s, %s, %s, %s)'''
            sql_value = (self.common_settings.SOURCE, content, resume_uuid,
                         self.auth_kwargs['username'],
                         search_args['keywords'],
                         search_args['city'],
                         str(resume_detail.get('resumeNo')))
            resume_update_time = ''
            msg_data = {
                "channelType": "APP",
                "content": {
                    "content": content,
                    "id": '',
                    "createBy": "python",
                    "createTime": int(time.time() * 1000),
                    "ip": '',
                    "resumeSubmitTime": '',
                    "resumeUpdateTime": resume_update_time,
                    "source": self.common_settings.SOURCE,
                    "trackId": str(resume_uuid),
                    "avatarUrl": '',
                    "email": self.auth_kwargs['username'],
                    'emailJobType': search_args['keywords'],
                    'emailCity': search_args['city'],
                    'subject': str(resume_detail.get('resumeNo'))
                },
                "interfaceType": "PARSE",
                "resourceDataType": "RAW",
                "resourceType": "RESUME_SEARCH_AWAKE",
                "source": self.common_settings.SOURCE,
                "trackId": str(resume_uuid),
                'traceID': str(resume_uuid),
                'callSystemID': self.common_settings.CALL_SYSTEM_ID,
            }
            # self.mysql_handler.save(sql=sql, data=sql_value)
            res = self.save_data(sql=sql, data=sql_value, msg_data=msg_data)
            if res:
                count += 1
                self.h_use_record.hset(
                    self.auth_kwargs['username'] + today, count)
                mysql_ = self.init_mysql(
                    user='******',
                    passwd='bi_admin#@1mofanghr',
                    host='172.16.25.1',
                    # user='******',
                    # passwd='bi_admin#@1mofanghr',
                    # host='10.0.3.52',
                    cursorclass=DictCursor,
                    cls_singleton=False)
                sql = '''
                INSERT INTO spider.resume_awake_record
                (source, position, city, raw_id, create_time, username)
                VALUES ('ZHI_LIAN', %s, %s, %s, now(), %s)
                '''
                value = (search_args['keywords'].split('|')[0],
                         search_args['city'].split('|')[0],
                         res, self.auth_kwargs['username'])
                mysql_.save(sql, value)
                del mysql_
            time.sleep(random.randint(3, 5))
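# resume_search above handles exactly one result page, so a caller steps
# through pages until the task-level exceptions stop it. This loop is a
# sketch, not part of the source; the page bounds and the task dict shape
# are assumptions.
def crawl_task(spider, task, max_page=10):
    for page in range(1, max_page + 1):
        try:
            spider.resume_search(page,
                                 keywords=task['keywords'],
                                 city=task['city'])
        except ZhiLianResumeException as e:
            # 'resume_list_empty' means the listing ran out;
            # 'user_record_limited' retires the account for the day.
            spider.logger.info('stopping at page %s: %s' % (page, str(e)))
            break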