Exemple #1
0
def get_code():
    """Download a captcha from zhixing.court.gov.cn and have RClient read it.

    A random 32-character hex ``captchaId`` is generated once.  The captcha
    image for that id is then downloaded to ``code.jpg``, read back, deleted
    and passed to ``RClient.rk_create`` (second argument 2040).  The attempt
    is repeated until a non-empty ``'Result'`` is obtained, so this call
    blocks until recognition succeeds.

    Returns:
        tuple: ``(captchaId, code)``.
    """
    captchaId = ''.join(random.sample('0123456789abcdef0123456789abcdef', 32))
    rc = RClient('XXX', 'XXX', 'XXX', 'XXX')
    # The URL only depends on captchaId, so build it once.
    img_url = 'http://zhixing.court.gov.cn/search/captcha.do?captchaId={}&random=0.4750974032186893'.format(
        captchaId)
    code = ''
    while code == '':
        try:
            urllib.urlretrieve(img_url, 'code.jpg')
            with open('code.jpg', 'rb') as f:
                res = f.read()
            # Delete only after the handle is closed: removing an open file
            # fails on Windows, which previously made this loop spin forever.
            os.remove('code.jpg')
            code = rc.rk_create(res, 2040)['Result']
        except Exception:
            # Best-effort retry on any download/recognition failure.
            # ``Exception`` (not a bare except) so Ctrl-C can still abort.
            pass
    return captchaId, code
Exemple #2
0
 def get_captcha(self):
     """Download a BonusCloud captcha image and have RClient recognise it.

     The image is saved under ``img/`` with a name built from ``self.bcUser``
     and ``get_now_time()``, then its bytes are sent to ``RClient.rk_create``
     (second argument 3060).

     Returns:
         tuple: ``(code, cookies)`` where ``cookies`` is a dict built from the
         download response's cookies, or ``(None, None)`` when the
         recognition result has no ``'Result'`` key.
     """
     url = "https://console.bonuscloud.io/api/web/captcha/get/"
     self.mkdir('img')
     png_file = "img/cap_" + self.bcUser + "_" + str(
         get_now_time()) + ".png"
     header = {
         'user-agent':
         'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
     }
     response = download(url, headers=header, file_name=png_file)
     rc = RClient(RUO_KUAI_USER, RUO_KUAI_PASSWORD, RUO_KUAI_SOFT_ID,
                  RUO_KUAI_SOFT_KEY)
     # Read through a context manager so the file handle is always closed.
     with open(png_file, 'rb') as image_file:
         im = image_file.read()
     rc_result = rc.rk_create(im, 3060)
     if 'Result' not in rc_result:
         # Recognition failed: show the raw reply and signal failure.
         print(rc_result)
         return None, None
     code = rc_result['Result']
     print('验证码为:', code)
     return code, dict(response.cookies.items())
Exemple #3
0
def get_code_shixin():
    """Download a captcha from shixin.court.gov.cn and have RClient read it.

    A random 32-character hex ``captchaId`` is generated once.  The captcha
    image is fetched, converted to RGB, encoded as JPEG in memory and passed
    to ``RClient.rk_create`` (second argument 2040).  The attempt is repeated
    until a non-empty ``'Result'`` is obtained, so this call blocks until
    recognition succeeds.

    Returns:
        tuple: ``(captchaId, code)``.
    """
    captchaId = ''.join(random.sample('0123456789abcdef0123456789abcdef', 32))
    rc = RClient('XXX', 'XXX', 'XXX', 'XXX')
    # The URL only depends on captchaId, so build it once.
    img_url = 'http://shixin.court.gov.cn/captchaNew.do?captchaId={}&random=0.23790190675874312'.format(
        captchaId)
    code = ''
    while code == '':
        try:
            response = requests.get(img_url)
            image = Image.open(BytesIO(response.content))
            # convert('RGB') drops an alpha band like the old split/merge did,
            # and additionally copes with palette or greyscale images.
            jpeg_buffer = BytesIO()
            image.convert('RGB').save(jpeg_buffer, format='JPEG')
            # Encoding in memory avoids the temporary file, which used to be
            # deleted while still open.
            code = rc.rk_create(jpeg_buffer.getvalue(), 2040)['Result']
        except Exception:
            # Best-effort retry on any download/decoding/recognition failure.
            # ``Exception`` (not a bare except) so Ctrl-C can still abort.
            pass
    return captchaId, code
Exemple #4
0
    def golog(self):
        """Log in to kyfw.12306.cn through ``self.session`` and return the session.

        Steps, in order: open the login page, fetch a device id and store it
        as the ``RAIL_DEVICEID`` cookie, download the captcha image to
        ``captcha-image.jpg``, obtain the answer (via ``RClient`` when
        ``self.AIcaptcha`` is truthy, otherwise typed in by the user), submit
        the captcha, post the credentials ``self.use`` / ``self.pw``, then
        exchange the ``newapptk`` token via ``uamtk`` and ``uamauthclient``.

        Returns:
            The same ``self.session`` object, after the login requests.

        Raises:
            Exception: when the captcha check reports "验证码校验失败".
            IndexError: when the device-id response has no ``"dfp"`` field.
        """
        # ----------------------- https://kyfw.12306.cn/otn/login/init -----------------------#
        self.session.get(
            'https://kyfw.12306.cn/otn/login/init',
            headers={
                'Accept':
                'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                'Accept-Encoding':
                'gzip, deflate,br',
                'Accept-Language':
                'zh-CN,zh;q=0.9',
                'Host':
                'kyfw.12306.cn',
                "User-Agent":
                "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36",
            })
        # ----------------------- https://kyfw.12306.cn/otn/HttpZF/logdevice? -----------------------#
        RAIL_DEVICEID = self.session.get(
            'https://kyfw.12306.cn/otn/HttpZF/logdevice?algID=pKLII3uPX6&hashCode=BgallKZtsTeM_GGfGVe3d559Qrrq-Hvw1Dibmj7s8Ro&FMQw=0&q4f3=zh-CN&VySQ=FGF1IEufT3O2t01Huu8aoSHcpOMqDxQT&VPIf=1&custID=133&VEek=unknown&dzuS=0&yD16=0&EOQP=8f58b1186770646318a429cb33977d8c&lEnu=2886991894&jp76=52d67b2a5aa5e031084733d5006cc664&hAqN=Win32&platform=WEB&ks0Q=d22ca0b81584fbea62237b14bd04c866&TeRS=1040x1920&tOHY=24xx1080x1920&Fvje=i1l1o1s1&q5aJ=-8&wNLf=99115dfb07133750ba677d055874de87&0aew=Mozilla/5.0%20(Windows%20NT%206.1;%20Win64;%20x64)%20AppleWebKit/537.36%20(KHTML,%20like%20Gecko)%20Chrome/72.0.3626.121%20Safari/537.36&E3gR=982178053f420f9b3bc3a6c108acf9ae',
            headers={
                'Accept':
                'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                'Accept-Encoding':
                'gzip, deflate,br',
                'Accept-Language':
                'zh-CN,zh;q=0.9',
                'Host':
                'kyfw.12306.cn',
                "User-Agent":
                "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36",
            }).text
        # The response text embeds the id as "dfp":"<value>"}.
        RAIL_DEVICEID = re.findall('"dfp":"(.+?)"}', RAIL_DEVICEID)[0]
        self.session.cookies.set('RAIL_DEVICEID',
                                 RAIL_DEVICEID,
                                 domain='.12306.cn')
        # ----------------------- captcha-image: download the captcha -----------------------#
        captchaurl = 'https://kyfw.12306.cn/passport/captcha/captcha-image?login_site=E&module=login&rand=sjrand'
        captchaheaders = {
            'Accept':
            'image / webp, image / apng, image / *, * / *;q = 0.8',
            'Accept-Encoding':
            'gzip, deflate,br',
            'Accept-Language':
            'zh-CN,zh;q=0.9',
            'Host':
            'kyfw.12306.cn',
            'Referer':
            'https://kyfw.12306.cn/otn/leftTicket/init',
            "User-Agent":
            "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36",
        }
        checkcodecontent = self.session.get(captchaurl, headers=captchaheaders)
        with open('captcha-image.jpg', 'wb') as f:
            f.write(checkcodecontent.content)

        if self.AIcaptcha:
            # ----------------------- automatic captcha recognition -----------------------#
            # Placeholder credentials: replace with real account details.
            rc = RClient('用户名', '用户密码'.encode("utf-8"), '软件ID', '软件Key')
            # Context manager so the image handle is closed after reading.
            with open('captcha-image.jpg', 'rb') as captcha_file:
                im = captcha_file.read()
            im_num = rc.rk_create(im, 6113)
            checkcode = [int(x) for x in list(im_num['Result'])]
        else:
            # ----------------------- manual captcha recognition -----------------------#
            # os.startfile exists on Windows only.
            os.startfile('captcha-image.jpg')
            print("""
                -----------------
                | 1 | 2 | 3 | 4 |
                -----------------
                | 5 | 6 | 7 | 8 |
                -----------------
                """)
            checkcode = [int(x) for x in input('输入图片序号用,分隔:').split(',')]
        # Both paths yield picture indices; self.pick turns them into the
        # answer value that is posted below.
        checkcode = self.pick(checkcode)
        # ----------------------- captcha-check: submit the captcha -----------------------#
        check_url = 'https://kyfw.12306.cn/passport/captcha/captcha-check'
        check_headers = {
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Accept-Encoding': 'gzip, deflate,br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Host': 'kyfw.12306.cn',
            'Origin': 'https://kyfw.12306.cn',
            'Referer': 'https://kyfw.12306.cn/otn/login/init',
            "User-Agent":
            "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36",
            'X-Requested-With': 'XMLHttpRequest',
        }
        captcha_data = {
            'answer': checkcode,
            'login_site': 'E',
            'rand': 'sjrand',
        }
        captcha_response = self.session.post(check_url,
                                             data=captcha_data,
                                             headers=check_headers)
        print(captcha_response.text)
        if captcha_response.json()["result_message"] == "验证码校验失败":
            raise Exception("验证码校验失败")

        # ----------------------- web/login -----------------------#
        login_url = 'https://kyfw.12306.cn/passport/web/login'
        form_data = {
            'username': self.use,
            'password': self.pw,
            'appid': 'otn',
        }
        log_response = self.session.post(login_url,
                                         data=form_data,
                                         headers=check_headers)
        print(log_response.text)
        # ----------------------- userLogin -----------------------#
        self.session.post(
            'https://kyfw.12306.cn/otn/login/userLogin',
            data={
                '_json_att': '',
            },
            headers={
                'Accept':
                'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                'Accept-Encoding':
                'gzip, deflate,br',
                'Accept-Language':
                'zh-CN,zh;q=0.9',
                'Host':
                'kyfw.12306.cn',
                'Upgrade-Insecure-Requests':
                '1',
                'User-Agent':
                'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36',
            })

        # ----------------------- uamtk -----------------------#
        uamtk_response1 = self.session.post(
            'https://kyfw.12306.cn/passport/web/auth/uamtk',
            data={
                'appid': 'otn',
            },
            headers={
                'Accept': 'application/json, text/javascript, */*; q=0.01',
                'Accept-Encoding': 'gzip, deflate,br',
                'Accept-Language': 'zh-CN,zh;q=0.9',
                'Host': 'kyfw.12306.cn',
                'Origin': 'https://kyfw.12306.cn',
                'User-Agent':
                "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36",
                'X-Requested-With': 'XMLHttpRequest',
            })
        tk = json.loads(uamtk_response1.text)["newapptk"]
        # ----------------------- uamauthclient -----------------------#
        self.session.post(
            'https://kyfw.12306.cn/otn/uamauthclient',
            data={
                'tk': tk,
            },
            headers={
                'Accept': '*/*',
                'Accept-Encoding': 'gzip, deflate,br',
                'Accept-Language': 'zh-CN,zh;q=0.9',
                'Host': 'kyfw.12306.cn',
                'Origin': 'https://kyfw.12306.cn',
                'User-Agent':
                "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36",
                'X-Requested-With': 'XMLHttpRequest',
            })

        return self.session
Exemple #5
0
    def process_request(cls, request, spider):
        """Render ``request.url`` in Firefox, answer any captcha, return the page.

        The URL is opened in a Selenium Firefox driver.  If an element with id
        ``seccodeImage`` (Sogou) and/or ``verify_img`` (WeChat) is present, it
        is cropped out of a full-page screenshot, sent to
        ``RClient.rk_create``, and the recognised text is typed into the
        matching input and submitted.  The driver is always quit, even when a
        step raises.

        Args:
            request: Request whose ``url`` is loaded.
            spider: Not used by this method.

        Returns:
            HtmlResponse: The UTF-8 encoded page source for ``request.url``.
        """

        def solve_captcha(driver, timestamp, prefix, image_id, type_code,
                          input_id, submit_id, banner):
            # Recognise and submit one captcha; a missing element is only printed.
            try:
                element = driver.find_element_by_id(image_id)
                print(banner)
                shot_path = 'E:/pyworkspace/Wechat/{0}screenshot{1}.png'.format(
                    prefix, timestamp)
                code_path = 'E:/pyworkspace/Wechat/{0}code{1}.png'.format(
                    prefix, timestamp)
                # Take a full-page screenshot, then crop the captcha element.
                driver.get_screenshot_as_file(shot_path)
                location = element.location
                size = element.size
                left = int(location['x'])
                top = int(location['y'])
                right = int(location['x'] + size['width'])
                bottom = int(location['y'] + size['height'])
                Image.open(shot_path).crop(
                    (left, top, right, bottom)).save(code_path)

                # NOTE(review): account credentials are hard-coded here; they
                # should come from configuration rather than source code.
                rc = RClient('289594665', 'huangwen3895170', '86899',
                             'da005218780a43269aa9260fa26ec25d')
                with open(code_path, 'rb') as code_file:
                    image_bytes = code_file.read()

                # Recognise the captcha through the RClient service.
                code = str(rc.rk_create(image_bytes, type_code)["Result"])
                print("{0}code{1}:".format(prefix, timestamp) + code)
                # Type the captcha into the form and submit it.
                elem = driver.find_element_by_id(input_id)
                elem.clear()
                elem.send_keys(code)
                driver.find_element_by_id(submit_id).click()
                # Wait 5 seconds for the page redirect to finish.
                time.sleep(5)
            except NoSuchElementException as e:
                print(e)

        # The page needs a real browser so its JavaScript content is loaded.
        print("WechatDownloaderMiddleware")

        # PhantomJS capabilities are kept for the disabled driver below;
        # Firefox is what actually runs, so ``dcap`` is currently unused.
        dcap = dict(DesiredCapabilities.PHANTOMJS)
        dcap["phantomjs.page.settings.userAgent"] = (
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1 QQBrowser/6.9.11079.201"
        )
        # driver = webdriver.PhantomJS(desired_capabilities=dcap)
        # driver.set_window_size(1280, 800)
        # driver.set_page_load_timeout(20)
        driver = webdriver.Firefox()
        try:
            driver.get(request.url)

            # Timestamp (seconds plus a millisecond part) used in file names.
            ct = time.time()
            data_head = time.strftime("%Y%m%d%H%M%S", time.localtime(ct))
            data_secs = (ct - int(ct)) * 1000
            timestamp = "%s%05d" % (data_head, data_secs)

            solve_captcha(driver, timestamp, '', 'seccodeImage', 3060,
                          'seccodeInput', 'submit',
                          u"Middleware 搜狗分析填入验证码!")
            solve_captcha(driver, timestamp, 'we', 'verify_img', 3040,
                          'input', 'bt',
                          u"Middleware 微信分析填入验证码!")

            time.sleep(5)
            content = driver.page_source.encode('utf-8')
        finally:
            # Always release the browser, even when a step above failed.
            driver.quit()
        return HtmlResponse(request.url,
                            encoding='utf-8',
                            body=content,
                            request=request)
Exemple #6
0
    def login(self):
        """Run the 12306 login sequence on ``self.session``.

        The sequence is: request ``uamtk-static``, download the base64 captcha
        and pass it to ``RClient.rk_create``, call ``captcha-check``, post the
        module-level ``username`` / ``password`` to ``web/login``, request
        ``auth/uamtk`` and finally post the returned ``newapptk`` to
        ``uamauthclient``.  Each response body is printed and logged.

        Returns:
            bool: Always ``True``; the responses are not inspected.
        """

        def announce(message):
            # Progress output goes to stdout first, then to the log.
            print(message)
            log.info(message)

        def post(endpoint, form):
            # Every POST shares the session's proxies and headers.
            return self.session.post(endpoint,
                                     data=form,
                                     proxies=self.proxies,
                                     headers=self.headers)

        def now_ms():
            return int(round(time.time() * 1000))

        announce("bagin login ...")
        self.headers['Referer'] = 'https://www.12306.cn/index/'

        # 1. Visit the initial endpoint to obtain cookies.
        post('https://kyfw.12306.cn/passport/web/auth/uamtk-static',
             {'appid': 'otn'})

        # 2. Fetch the captcha, extract it, decode it and recognise it.
        jquery = "jQuery191080472046843172_%s" % now_ms()
        captcha_url = (
            "https://kyfw.12306.cn/passport/captcha/captcha-image64?"
            "login_site=E&module=login&rand=sjrand&%s&callback=%s&_=%s" %
            (now_ms(), jquery, now_ms()))
        ret = self.session.get(captcha_url,
                               proxies=self.proxies,
                               headers=self.headers)

        # The reply is a JSONP call; the JSON sits between the parentheses.
        ret_text = re.split("\(|\)", ret.content)[1]
        log.info("verify photo base64: %s" % ret_text)
        verify_photo = base64.b64decode(json.loads(ret_text).get('image'))

        rc = RClient(rk_username, rk_password, '1',
                     'b40ffbee5c1cf4e38028c197eb2fc751')
        verify_text = rc.rk_create(verify_photo, 6113)
        log.info("verify text: %s" % verify_text)

        # 3. Check the captcha.
        # NOTE(review): verify_text is the whole rk_create return value, not
        # just its 'Result' entry -- confirm this is what the server expects.
        announce("passport/captcha/captcha-check")
        check_params = {
            'callback': "%s" % jquery,
            'answer': "%s" % verify_text,
            'rand': "sjrand",
            'login_site': "E",
            '_': "%s" % now_ms(),
        }
        ret = self.session.get(
            "https://kyfw.12306.cn/passport/captcha/captcha-check",
            params=check_params,
            proxies=self.proxies,
            headers=self.headers)
        announce(ret.content)

        # 4. Check the user name and password.
        announce("login")
        credentials = {
            'username': username,
            'password': password,
            'appid': 'otn',
            'answer': verify_text,
        }
        ret = post("https://kyfw.12306.cn/passport/web/login", credentials)
        announce(ret.content)

        # Fetch the user token.
        announce("auth/uamtk")
        ret = post("https://kyfw.12306.cn/passport/web/auth/uamtk",
                   {"appid": "otn"})
        announce(ret.content)

        # Exchange the token for authorisation.
        announce("uam auth client")
        token_form = {'tk': ret.json().get('newapptk')}
        ret = post("https://kyfw.12306.cn/otn/uamauthclient", token_form)
        announce(ret.content)

        return True
Exemple #7
0
# Crop the captcha out of the screenshot and store it for recognition.
im = im.crop((left, top, right, bottom))
im.save('verify.png')

# Disabled alternative that filled the form through injected JavaScript:
# js = """
#     document.getElementsByClassName('inputGroup')[0].value='18883362563';
#     document.getElementsByClassName('inputGroup')[1].value='442891187';
#     document.getElementsByClassName('inputGroup')[2].value='442891187';
#     document.getElementById('checkbox').click()
# """
#
# driver.execute_script(js)

# Placeholder credentials: register as a user and as a developer on the
# Ruokuai site to obtain real values.
rc = RClient('username', 'password', 'soft_id',
             'soft_key')
# Read through a context manager so the file handle is closed.
with open('verify.png', 'rb') as captcha_file:
    im = captcha_file.read()
verify = rc.rk_create(im, 3040).get('Result')
print(verify)

# Fill in the phone number and the password (entered twice).
element = driver.find_elements_by_class_name('inputGroup')
element[0].send_keys(phone)
element[1].send_keys(password)
element[2].send_keys(password)
driver.find_element_by_id('checkbox').click()  # accept the agreement

driver.find_element_by_id('code_input').send_keys(verify)  # type the image captcha

driver.find_element_by_id('verifyphonebtn').click()  # request the SMS code

print('请输入手机验证码!')

verify_phone = input()
def register(phone):
    """Register an account for ``phone`` on www.fungugu.com.

    Steps: open the application page to obtain the ``jrbqiantai`` cookie,
    fetch and recognise picture captchas until the site accepts one, request
    an SMS code, read that code from the console, verify it, then submit the
    registration form with a name from ``yonghuming()``.  On success the user
    name is inserted into the MongoDB collection from ``connect_mongodb``.

    Args:
        phone: Phone number to register; sent unchanged as a request
            parameter.
    """
    coll_insert = connect_mongodb('192.168.0.235', 27017, 'fgg', 'user_info')
    s = requests.session()
    # NOTE(review): service credentials and internal addresses are
    # hard-coded; they should come from configuration.
    rc = RClient('ruokuaimiyao', 'goojia123456', '95632',
                 '6b8205cf61944329a5841c30e5ed0d5d')
    proxies = {'http': 'http://192.168.0.93:4234/'}
    c = int(time.time() * 1000)
    s.get('http://www.fungugu.com/ShenQingShiYong/fillInformation',
          proxies=proxies)
    jrbqiantai = s.cookies.get_dict()['jrbqiantai']
    cookie = 'Hm_lpvt_203904e114edfe3e6ab6bc0bc04207cd' + str(
        c) + ';Hm_lvt_203904e114edfe3e6ab6bc0bc04207cd' + str(
            c) + ';jrbqiantai=' + jrbqiantai
    # NOTE(review): these headers are printed but never passed to a request;
    # confirm whether they were meant to be sent.
    headers = {
        'Cookie':
        cookie,
        'Referer':
        'http://www.fungugu.com/ShenQingShiYong/fillInformation',
        'User-Agent':
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.109 Safari/537.36'
    }
    print(headers)

    # Both URLs are the same on every attempt, so build them once.
    captcha_url = 'http://www.fungugu.com/yzmControlller/refreshPicCode?id=pwdCode&' + str(
        c)
    code_url = 'http://www.fungugu.com/yzmControlller/verificationPicCode'
    while True:
        response = s.get(url=captcha_url, proxies=proxies)
        rk_result = rc.rk_create(response.content, 3040)
        code = rk_result.get('Result')
        if not code:
            # Recognition failed (no 'Result'): show the reply and fetch a
            # new image instead of crashing with KeyError.
            print(rk_result)
            continue
        print('code:', code)
        params = {
            'code': code,
            'type': 'pwdCode',
        }
        result = s.post(url=code_url, params=params, proxies=proxies)
        print(result.content)
        if 'true' in result.text:
            break

    # Ask the site to send the SMS code to the phone.
    params = {'phone': phone, 'picCode': code}
    send_url = 'http://www.fungugu.com/yzmControlller/sendCode'
    result = s.post(url=send_url, params=params, proxies=proxies)
    print(result.content.decode())
    print(phone)
    # phone_verification = get_msg(token, phone)
    phone_verification = input('验证码 :')
    params = {'securityCode': phone_verification, 'phone': phone}
    phone_url = 'http://www.fungugu.com/yzmControlller/verificationSMSCode'
    result = s.post(url=phone_url, params=params, proxies=proxies)
    print('aaa:', result.content)

    # Submit the registration form.
    user_name = yonghuming()
    params = {
        'keyCode': '',
        'keHuShouJi': phone,
        'yongHuMing': user_name,
        'keHuMiMa': '4ac9fa21a775e4239e4c72317cdca870',
        'quanXianChengShi': '上海',
        'jiGouMingCheng': user_name,
    }
    register_url = 'http://www.fungugu.com/ShenQingShiYong/completionUserInfo'
    result = s.post(url=register_url, params=params, proxies=proxies)
    print(result.text)
    if 'true' in result.text:
        data = {
            'user_name': user_name,
        }
        coll_insert.insert_one(data)
        print('插入成功')
    else:
        print('错误')
Exemple #9
0
class Crawl(object):
    def __init__(self):
        """Set up logging, load crawl state from MySQL and build the clients.

        Lowers the ``wechatsogou``, ``peewee`` and ``requests`` loggers to
        WARNING, attaches a stream handler to the root logger at DEBUG level,
        loads the bad-proxy list, the target list and the already-crawled
        URLs, then creates the proxy source, the ``WechatSogouAPI`` client,
        the table helpers and the ``RClient`` captcha client.
        """
        for noisy_logger in ("wechatsogou", "peewee", "requests"):
            logging.getLogger(noisy_logger).setLevel(logging.WARNING)
        self.logger = logging.getLogger()
        handler = logging.StreamHandler()
        formatter = logging.Formatter(
            '%(asctime)s [%(threadName)s][%(levelname)s] %(message)s')
        handler.setFormatter(formatter)
        self.logger.addHandler(handler)
        self.logger.setLevel(logging.DEBUG)
        self.get_bad_proxies()
        self.WxTable = WechatInfo()
        # The connection opened here was never used; close it at once so the
        # connectivity check stays but no connection is leaked.
        self.get_conn().close()
        self.create_target()
        self.get_saved_data()
        self.proxies_list = NewGenerationProxy({
            'anony': 'L4',
            'post': 'false',
            'speed': 3000
        })
        proxyLine = self.proxies_list.getProxy()
        self.wx_api = wechatsogou.WechatSogouAPI(timeout=8,
                                                 proxies={
                                                     'http': proxyLine,
                                                     'https': proxyLine
                                                 })
        SpiderConfig = Config.SpiderConfig
        self.headers = SpiderConfig.headers.json()
        self.weChat_table = WechatInfo()
        self.proxies_table = UnableProxies()
        self.crawled_table = CrawledData()
        # NOTE(review): captcha-service credentials are hard-coded; they
        # should come from configuration.
        self.rk = RClient('ghost2017b', 'Ghost2017b', '107539',
                          'a8bd936aa1574ddb96d14564c1a0d022')

    def get_conn(self):
        """Open and return a new MySQL connection built from ``DBinfo``.

        Returns:
            The connection object returned by ``pymysql.connect``; the caller
            is responsible for closing it.
        """
        settings = {
            'host': DBinfo.MYSQL_HOST,
            'port': DBinfo.MYSQL_PORT,
            'user': DBinfo.MYSQL_USER,
            # The password is passed as a string whatever type DBinfo holds.
            'passwd': str(DBinfo.MYSQL_PASSWD),
            'db': DBinfo.MYSQL_DANAME,
            'charset': 'utf8',
        }
        return pymysql.connect(**settings)

    def get_saved_data(self):
        conn = self.get_conn()
        cursor = conn.cursor()
        sql = """select main_url from crawleddata"""
        cursor.execute(sql)
        self.saved_data_list = [x for x, in cursor.fetchall()]

    def get_bad_proxies(self):
        conn = self.get_conn()
        cursor = conn.cursor()
        sql = """select object from unableproxies """
        cursor.execute(sql)
        self.bad_proxies_list = [x for x, in cursor.fetchall()]

    def create_target(self):
        self.target_list = []
        conn = self.get_conn()
        cursor = conn.cursor()
        sql = """select jobTypeId , jobTypeName from data_job_type where level = '3'"""
        cursor.execute(sql)
        data = cursor.fetchall()
        for num, item in enumerate(data):
            jobTypeId = item[0]
            jobTypeName = item[1]
            target = {
                'jobTypeId': jobTypeId,
                'jobTypeName': jobTypeName,
                'items': []
            }
            self.target_list.append(target)
        cursor.close()

    # def get_proxy(self, scene='default'):
    #     while True:
    #         # url = 'http://api.xdaili.cn/xdaili-api//greatRecharge/getGreatIp?spiderId=c6ffc08035cc49f49ed15c834ba2c8ee&orderno=YZ20185305368PAYAqN&returnType=1&count=1'
    #         # url = 'http://api.xdaili.cn/xdaili-api//greatRecharge/getGreatIp?spiderId=c6ffc08035cc49f49ed15c834ba2c8ee&orderno=YZ2018634506zLQTDj&returnType=2&count=1'
    #         # url = 'http://webapi.http.zhimacangku.com/getip?num=1&type=1&pro=&city=0&yys=0&port=1&time=1&ts=0&ys=0&cs=0&lb=1&sb=0&pb=4&mr=1&regions='
    #         url = 'http://dev.kdlapi.com/api/getproxy/?orderid=953059683695998&num=1&b_pcchrome=1&b_pcie=1&b_pcff=1&protocol=2&method=1&an_an=1&an_ha=1&sep=1'
    #         html = requests.get(url).text
    #         time.sleep(random.randint(5, 10))
    #         if re.findall('普通订单5秒钟内只允许提取1次', html):
    #             self.logger.debug('need to get proxy ' + scene + ' again!')
    #             continue
    #
    #         try:
    #             proxyLine = html
    #             proxies = {
    #                 'https': proxyLine
    #             }
    #         except:
    #             pprint(html)
    #             continue
    #
    #         if str(proxies) in self.bad_proxies_list:
    #             self.logger.debug('ip is bad')
    #             continue
    #         else:
    #             self.logger.debug(u'Update %s proxy to %s' % (scene, proxyLine))
    #             return proxies  # proxyLine   proxies
    '''
        for run.py
    '''

    def get_target_list(self, target):
        page = 10
        keyword = target['jobTypeName']
        target['items'] = []
        target['main_url'] = []
        for i in range(page):
            url = u'http://weixin.sogou.com/weixin?type=2&page=%s&ie=utf8&query=%s&interation=' % (
                i + 1, parse.quote(str(keyword)))
            if url in self.saved_data_list:
                self.logger.debug('the url had been crawled %s/%s' %
                                  (keyword, i + 1))
                continue
            while True:

                def identify_image_callback(image):
                    result = self.rk.rk_create(image, 3060)
                    if 'Result' in result:
                        self.logger.debug(u'Captcha: %s, ID: %s' %
                                          (result['Result'], result['Id']))
                        return result['Result']
                    self.logger.debug(result)
                    return ''

                try:
                    target_little_list = self.wx_api.search_article(
                        target['jobTypeName'],
                        page=i + 1,
                        identify_image_callback=identify_image_callback,
                    )
                    target['main_url'].append(url)
                    self.logger.debug(
                        u'Name: %s, Page: %s, Count: %s' %
                        (target['jobTypeName'], i, len(target_little_list)))
                    break
                except Exception as e:
                    self.logger.debug(traceback.format_exc())
                    self.logger.debug(u'Name: %s, Page: %s, Error: %s' %
                                      (target['jobTypeName'], i, e.__repr__()))
                    proxyLine = self.proxies_list.getProxy()
                    self.logger.debug(u'Update proxy to %s' % (proxyLine))
                    self.wx_api.requests_kwargs['proxies'] = {
                        "https": proxyLine,
                    }
            if target_little_list.__len__() == 0:
                break
            target['items'].append(target_little_list)
        return target

    def get_target_list_v2(self, target):
        """Collect up to 10 pages of search results with plain ``requests`` calls.

        For each page, a proxy is taken from the module-level ``q_proxies``
        queue and the search URL is fetched.  The fetch is retried with a new
        proxy after an exception, a non-OK response, or a redirect whose URL
        contains ``antispider`` (that proxy is also recorded through
        ``self.proxies_table``).  Pages already in ``self.saved_data_list``
        and pages without articles are skipped.

        Args:
            target: Dict with a ``jobTypeName`` key.  Its ``items`` and
                ``main_url`` entries are reset and then filled in place.

        Returns:
            dict: The same ``target``; ``items`` and ``main_url`` hold one
            entry per page that returned articles.
        """
        page = 10
        keyword = target['jobTypeName']
        target['items'] = []
        target['main_url'] = []
        for i in range(page):
            page_no = i + 1
            # The URL is the same on every retry, so build it once per page.
            url = u'http://weixin.sogou.com/weixin?type=2&page=%s&ie=utf8&query=%s&interation=' % (
                page_no, parse.quote(keyword))
            while True:
                self.headers['Referer'] = url
                self.headers['Cookie'] = 'SUV="";SNUID="";'
                if url in self.saved_data_list:
                    self.logger.debug('the url had been crawled')
                    break  # this page was already crawled
                try:
                    self.logger.debug("queue get before size:%s" %
                                      q_proxies.qsize())
                    proxies = q_proxies.get()
                    self.logger.debug("queue get after size :%s" %
                                      q_proxies.qsize())
                    resp = requests.get(
                        url,
                        proxies=proxies,
                        headers=self.headers,
                        timeout=8,
                    )
                    if not resp.ok:
                        self.logger.debug(
                            u'Name: %s, Page: %s, HttpError: %s' %
                            (keyword, str(page_no), str(resp.status_code)))
                        continue  # retry with the next proxy
                    if u'antispider' in resp.url:
                        # Record the proxy that was detected as a crawler.
                        self.proxies_table.create(object=proxies['http'], )
                        self.logger.debug(
                            u'Name: %s, Page: %s, DetachAntiSpider: %s' %
                            (keyword, str(page_no), proxies['http']))
                        continue  # retry with the next proxy
                    time.sleep(random.randint(2, 5))
                    target_little_list = WechatSogouStructuring.get_article_by_search(
                        resp.text)
                    if len(target_little_list) == 0:
                        break  # the page has no data
                    target['items'].append(target_little_list)
                    target['main_url'].append(url)
                    self.logger.debug(
                        'get item %s page %d total %d ' %
                        (keyword, page_no, len(target_little_list)))
                    break  # success
                except Exception as e:
                    # Any failure is logged and the page is retried.
                    self.logger.debug(u'Name: %s, Page: %s, Error: %s' %
                                      (keyword, page_no, type(e)))
        self.logger.debug(u'Name: %s, Total: %s' %
                          (keyword, len(target['items'])))
        return target

    def get_data(self, target):
        try:
            for num, topItem in enumerate(target['items']):
                url = target['main_url'][num]
                self.crawled_table.create(
                    main_url=url,
                    target=target['jobTypeName'],
                )
                for item in topItem:
                    detail_url = item['article']['url']
                    html = requests.get(url=detail_url,
                                        headers=self.headers).text
                    content = re.findall(
                        '<div class="rich_media_content " lang=="en" id="js_content">([\s\S]*)</div>',
                        html)
                    biz = re.findall('var biz = ""\|\|"(.*?)"', html)
                    is_delete = re.findall('该内容已被发布者删除', html)
                    if is_delete.__len__() == 0:
                        wechatId = re.findall(
                            '"profile_meta_value">(.*?)</span>', html)
                        if wechatId.__len__() == 0:
                            wechatId = ''
                        elif wechatId[0] == '':
                            wechatId = ''
                        else:
                            wechatId = wechatId[0]
                        if biz.__len__() == 0:
                            biz = ''
                        else:
                            biz = biz[0]
                        obj_wechat = self.weChat_table.create(
                            url=detail_url,
                            jobTypeId=target['jobTypeId'],
                            title=item['article']['title'],
                            content=''.join(content),
                            desc=item['article']['abstract'],
                            createAt=time.strftime(
                                '%Y-%m-%dT%H:%M:%S',
                                time.localtime(item['article']['time'])),
                            wechatId=wechatId,
                            wechatName=item['gzh']['wechat_name'],
                            biz=biz,
                        )
                        self.logger.debug(
                            'item: %s save id: %s' %
                            (target['jobTypeName'], str(obj_wechat.id)))
                    else:
                        self.logger.debug('pass for item is deleted by author')
        except Exception as e:
            self.logger.warning('something wrong : ' + e.__repr__())