def get_code():
    """Fetch and solve a captcha from zhixing.court.gov.cn.

    A random 32-character ``captchaId`` is generated once. The captcha image
    for that id is then downloaded and handed to the RClient recognition
    service (type code 2040) until a non-empty answer is returned.

    Returns:
        tuple: ``(captchaId, code)`` -- the id sent to the site and the
        recognised captcha text.
    """
    import logging
    import tempfile
    try:
        from urllib import urlretrieve  # Python 2
    except ImportError:
        from urllib.request import urlretrieve  # Python 3

    captchaId = ''.join(random.sample('0123456789abcdef0123456789abcdef', 32))
    rc = RClient('XXX', 'XXX', 'XXX', 'XXX')
    img_url = 'http://zhixing.court.gov.cn/search/captcha.do?captchaId={}&random=0.4750974032186893'.format(
        captchaId)
    code = ''
    while code == '':
        # A unique scratch file per attempt: a fixed name in the working
        # directory would race with any other caller running concurrently.
        fd, img_path = tempfile.mkstemp(suffix='.jpg')
        os.close(fd)
        try:
            urlretrieve(img_url, img_path)
            with open(img_path, 'rb') as f:
                res = f.read()
            code = rc.rk_create(res, 2040)['Result']
        except Exception:
            # Best-effort retry, but no longer swallows KeyboardInterrupt /
            # SystemExit, and the failure is visible in the log.
            logging.getLogger(__name__).warning(
                'captcha attempt failed, retrying', exc_info=True)
        finally:
            try:
                os.remove(img_path)
            except OSError:
                pass
    return captchaId, code
def get_captcha(self):
    """Download the BonusCloud console captcha and have it recognised.

    The image is saved under ``img/`` with a name built from ``self.bcUser``
    and the current time, then sent to the RClient service (type code 3060).

    Returns:
        tuple: ``(code, cookies)`` where ``cookies`` is a dict built from the
        download response, or ``(None, None)`` when the recognition result
        contains no ``'Result'`` key.
    """
    url = "https://console.bonuscloud.io/api/web/captcha/get/"
    self.mkdir('img')
    png_file = "img/cap_" + self.bcUser + "_" + str(
        get_now_time()) + ".png"
    header = {
        'user-agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
    }
    response = download(url, headers=header, file_name=png_file)
    rc = RClient(RUO_KUAI_USER, RUO_KUAI_PASSWORD, RUO_KUAI_SOFT_ID,
                 RUO_KUAI_SOFT_KEY)
    # Read through a context manager so the handle is closed deterministically.
    with open(png_file, 'rb') as image_file:
        im = image_file.read()
    rc_result = rc.rk_create(im, 3060)
    if 'Result' not in rc_result:
        # Recognition failed: show the raw service reply for diagnosis.
        print(rc_result)
        return None, None
    code = rc_result['Result']
    print('验证码为:', code)
    return code, dict(response.cookies.items())
def get_code_shixin():
    """Fetch and solve a captcha from shixin.court.gov.cn.

    A random 32-character ``captchaId`` is generated once. The captcha image
    for that id is downloaded, normalised to an RGB JPEG and handed to the
    RClient recognition service (type code 2040) until a non-empty answer is
    returned.

    Returns:
        tuple: ``(captchaId, code)`` -- the id sent to the site and the
        recognised captcha text.
    """
    import logging

    captchaId = ''.join(random.sample('0123456789abcdef0123456789abcdef', 32))
    rc = RClient('XXX', 'XXX', 'XXX', 'XXX')
    img_url = 'http://shixin.court.gov.cn/captchaNew.do?captchaId={}&random=0.23790190675874312'.format(
        captchaId)
    code = ''
    while code == '':
        try:
            # A timeout keeps one stalled connection from blocking forever.
            response = requests.get(img_url, timeout=10)
            image = Image.open(BytesIO(response.content))
            # convert('RGB') drops an alpha band like the former split/merge
            # did, and also copes with palette or greyscale images.
            jpeg_buffer = BytesIO()
            image.convert('RGB').save(jpeg_buffer, format='JPEG')
            # Encoding in memory avoids a shared scratch file on disk.
            code = rc.rk_create(jpeg_buffer.getvalue(), 2040)['Result']
        except Exception:
            # Best-effort retry, but no longer swallows KeyboardInterrupt /
            # SystemExit, and the failure is visible in the log.
            logging.getLogger(__name__).warning(
                'captcha attempt failed, retrying', exc_info=True)
    return captchaId, code
def golog(self):
    """Run the 12306 web login sequence on ``self.session``.

    Steps: open the login page, obtain and set the ``RAIL_DEVICEID`` cookie,
    download and answer the picture captcha (via RClient when
    ``self.AIcaptcha`` is truthy, otherwise by asking the user), submit the
    captcha, post the credentials ``self.use`` / ``self.pw``, then exchange
    the ``newapptk`` token through ``uamtk`` and ``uamauthclient``.

    Returns:
        The same ``self.session`` object, after the requests above.

    Raises:
        Exception: when the captcha check reply carries the failure message.
        IndexError: when the device-id reply contains no ``dfp`` field.
    """
    # Step 1: open the login page.
    self.session.get(
        'https://kyfw.12306.cn/otn/login/init',
        headers={
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate,br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Host': 'kyfw.12306.cn',
            "User-Agent":
            "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36",
        })

    # Step 2: request a device id and store it as the RAIL_DEVICEID cookie.
    RAIL_DEVICEID = self.session.get(
        'https://kyfw.12306.cn/otn/HttpZF/logdevice?algID=pKLII3uPX6&hashCode=BgallKZtsTeM_GGfGVe3d559Qrrq-Hvw1Dibmj7s8Ro&FMQw=0&q4f3=zh-CN&VySQ=FGF1IEufT3O2t01Huu8aoSHcpOMqDxQT&VPIf=1&custID=133&VEek=unknown&dzuS=0&yD16=0&EOQP=8f58b1186770646318a429cb33977d8c&lEnu=2886991894&jp76=52d67b2a5aa5e031084733d5006cc664&hAqN=Win32&platform=WEB&ks0Q=d22ca0b81584fbea62237b14bd04c866&TeRS=1040x1920&tOHY=24xx1080x1920&Fvje=i1l1o1s1&q5aJ=-8&wNLf=99115dfb07133750ba677d055874de87&0aew=Mozilla/5.0%20(Windows%20NT%206.1;%20Win64;%20x64)%20AppleWebKit/537.36%20(KHTML,%20like%20Gecko)%20Chrome/72.0.3626.121%20Safari/537.36&E3gR=982178053f420f9b3bc3a6c108acf9ae',
        headers={
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate,br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Host': 'kyfw.12306.cn',
            "User-Agent":
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36",
        }).text
    RAIL_DEVICEID = re.findall('"dfp":"(.+?)"}', RAIL_DEVICEID)[0]
    self.session.cookies.set('RAIL_DEVICEID',
                             RAIL_DEVICEID,
                             domain='.12306.cn')

    # Step 3: download the captcha image.
    captchaurl = 'https://kyfw.12306.cn/passport/captcha/captcha-image?login_site=E&module=login&rand=sjrand'
    captchaheaders = {
        'Accept': 'image / webp, image / apng, image / *, * / *;q = 0.8',
        'Accept-Encoding': 'gzip, deflate,br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Host': 'kyfw.12306.cn',
        'Referer': 'https://kyfw.12306.cn/otn/leftTicket/init',
        "User-Agent":
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36",
    }
    checkcodecontent = self.session.get(captchaurl, headers=captchaheaders)
    captcha_bytes = checkcodecontent.content
    # The file on disk is still written: the manual branch opens it.
    with open('captcha-image.jpg', 'wb') as f:
        f.write(captcha_bytes)

    if self.AIcaptcha:
        # Automatic recognition through the RClient service.
        # The credentials below are placeholders (sensitive information).
        rc = RClient('用户名', '用户密码'.encode("utf-8"), '软件ID', '软件Key')
        # Use the bytes already in memory instead of re-reading the file
        # through a handle that was never closed.
        im_num = rc.rk_create(captcha_bytes, 6113)
        checkcode = [int(x) for x in im_num['Result']]
        checkcode = self.pick(checkcode)
    else:
        # Manual recognition: show the image and ask for the tile numbers.
        os.startfile('captcha-image.jpg')
        print(""" ----------------- | 1 | 2 | 3 | 4 | ----------------- | 5 | 6 | 7 | 8 | ----------------- """)
        checkcode = [int(x) for x in input('输入图片序号用,分隔:').split(',')]
        checkcode = self.pick(checkcode)

    # Step 4: submit the captcha answer.
    check_url = 'https://kyfw.12306.cn/passport/captcha/captcha-check'
    check_headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate,br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Host': 'kyfw.12306.cn',
        'Origin': 'https://kyfw.12306.cn',
        'Referer': 'https://kyfw.12306.cn/otn/login/init',
        "User-Agent":
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36",
        'X-Requested-With': 'XMLHttpRequest',
    }
    captcha_data = {
        'answer': checkcode,
        'login_site': 'E',
        'rand': 'sjrand',
    }
    captcha_response = self.session.post(check_url,
                                         data=captcha_data,
                                         headers=check_headers)
    print(captcha_response.text)
    if captcha_response.json()["result_message"] == "验证码校验失败":
        raise Exception("验证码校验失败")

    # Step 5: post the user name and password.
    login_url = 'https://kyfw.12306.cn/passport/web/login'
    form_data = {
        'username': self.use,
        'password': self.pw,
        'appid': 'otn',
    }
    log_response = self.session.post(login_url,
                                     data=form_data,
                                     headers=check_headers)
    print(log_response.text)

    # Step 6: userLogin.
    self.session.post(
        'https://kyfw.12306.cn/otn/login/userLogin',
        data={
            '_json_att': '',
        },
        headers={
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate,br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Host': 'kyfw.12306.cn',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36',
        })

    # Step 7: uamtk -- fetch the "newapptk" token.
    uamtk_response1 = self.session.post(
        'https://kyfw.12306.cn/passport/web/auth/uamtk',
        data={
            'appid': 'otn',
        },
        headers={
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Accept-Encoding': 'gzip, deflate,br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Host': 'kyfw.12306.cn',
            'Origin': 'https://kyfw.12306.cn',
            'User-Agent':
            "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36",
            'X-Requested-With': 'XMLHttpRequest',
        })
    tk = json.loads(uamtk_response1.text)["newapptk"]

    # Step 8: uamauthclient -- hand the token back.
    self.session.post(
        'https://kyfw.12306.cn/otn/uamauthclient',
        data={
            'tk': tk,
        },
        headers={
            'Accept': '*/*',
            'Accept-Encoding': 'gzip, deflate,br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Host': 'kyfw.12306.cn',
            'Origin': 'https://kyfw.12306.cn',
            'User-Agent':
            "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36",
            'X-Requested-With': 'XMLHttpRequest',
        })
    return self.session
def process_request(cls, request, spider):
    """Load ``request.url`` in a browser, solving captchas when present.

    The page is opened in Firefox. If an element with id ``seccodeImage`` or
    ``verify_img`` exists, the page is screenshotted, the element is cropped
    out, the crop is sent to the RClient service and the answer is typed into
    the matching form. The final page source is returned.

    Args:
        request: object whose ``url`` attribute is loaded.
        spider: not used by this function.

    Returns:
        HtmlResponse: built from the UTF-8 encoded page source.
    """
    print("WechatDownloaderMiddleware")
    # Only the disabled PhantomJS variant below consumes these capabilities.
    dcap = dict(DesiredCapabilities.PHANTOMJS)
    dcap["phantomjs.page.settings.userAgent"] = (
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1 QQBrowser/6.9.11079.201"
    )
    # driver = webdriver.PhantomJS(desired_capabilities=dcap)
    # driver.set_window_size(1280, 800)
    # driver.set_page_load_timeout(20)
    driver = webdriver.Firefox()
    try:
        driver.get(request.url)

        # Timestamp (with a millisecond suffix) used in the image file names.
        ct = time.time()
        data_head = time.strftime("%Y%m%d%H%M%S", time.localtime(ct))
        data_secs = (ct - int(ct)) * 1000
        timestamp = "%s%05d" % (data_head, data_secs)

        def solve_captcha(image_id, input_id, submit_id, prefix, type_code,
                          banner):
            """Solve one captcha; raises NoSuchElementException if absent."""
            element = driver.find_element_by_id(image_id)
            print(banner)
            # Screenshot the whole page.
            shot_path = ('E:/pyworkspace/Wechat/' + prefix +
                         'screenshot{0}.png').format(timestamp)
            driver.get_screenshot_as_file(shot_path)
            # Bounding box of the captcha element inside the screenshot.
            left = int(element.location['x'])
            top = int(element.location['y'])
            right = int(element.location['x'] + element.size['width'])
            bottom = int(element.location['y'] + element.size['height'])
            # Crop the captcha out and save it.
            code_path = ('E:/pyworkspace/Wechat/' + prefix +
                         'code{0}.png').format(timestamp)
            Image.open(shot_path).crop((left, top, right,
                                        bottom)).save(code_path)
            # Recognise the captcha through the RClient service.
            rc = RClient('289594665', 'huangwen3895170', '86899',
                         'da005218780a43269aa9260fa26ec25d')
            with open(code_path, 'rb') as code_file:
                im = code_file.read()
            rk_ret = rc.rk_create(im, type_code)
            code = str(rk_ret["Result"])
            print((prefix + "code{0}:").format(timestamp) + code)
            # Type the answer and submit the form.
            elem = driver.find_element_by_id(input_id)
            elem.clear()
            elem.send_keys(code)
            driver.find_element_by_id(submit_id).click()
            # Wait 5 seconds for the page to redirect.
            time.sleep(5)

        captcha_forms = (
            # Sogou captcha page.
            ('seccodeImage', 'seccodeInput', 'submit', '', 3060,
             u"Middleware 搜狗分析填入验证码!"),
            # WeChat captcha page.
            ('verify_img', 'input', 'bt', 'we', 3040,
             u"Middleware 微信分析填入验证码!"),
        )
        for form in captcha_forms:
            try:
                solve_captcha(*form)
            except NoSuchElementException as e:
                # No captcha of this kind (or its form is missing).
                print(e)

        # NOTE(review): the collapsed original does not show whether this
        # pause sat inside the last except clause; it is kept unconditional.
        time.sleep(5)
        content = driver.page_source.encode('utf-8')
    finally:
        # Always release the browser, even when a step above fails.
        driver.quit()
    return HtmlResponse(request.url,
                        encoding='utf-8',
                        body=content,
                        request=request)
def login(self):
    """Walk through the 12306 passport login sequence.

    Requests are sent on ``self.session`` with ``self.proxies`` and
    ``self.headers``: uamtk-static, captcha download and recognition,
    captcha-check, web/login, uamtk and uamauthclient. Each reply body is
    printed and logged.

    Returns:
        bool: always ``True``; the replies are not inspected.
    """
    def now_ms():
        # Current time in whole milliseconds.
        return int(round(time.time() * 1000))

    def announce(message):
        # Echo to stdout and to the log, in that order.
        print(message)
        log.info(message)

    net = dict(proxies=self.proxies, headers=self.headers)

    announce("bagin login ...")
    self.headers['Referer'] = 'https://www.12306.cn/index/'

    # 1. Visit the initial endpoint to obtain cookies.
    self.session.post(
        'https://kyfw.12306.cn/passport/web/auth/uamtk-static',
        data={'appid': 'otn'},
        **net)

    # 2. Fetch the captcha (JSONP wrapping a base64 image).
    jquery = "jQuery191080472046843172_%s" % now_ms()
    captcha_url = (
        "https://kyfw.12306.cn/passport/captcha/captcha-image64?"
        "login_site=E&module=login&rand=sjrand&%s&callback=%s&_=%s" %
        (now_ms(), jquery, now_ms()))
    reply = self.session.get(captcha_url, **net)
    # Extract the JSON payload between the JSONP parentheses.
    payload_text = re.split("\(|\)", reply.content)[1]
    log.info("verify photo base64: %s" % payload_text)
    payload = json.loads(payload_text)
    verify_photo = base64.b64decode(payload.get('image'))
    # Recognise the captcha.
    recognizer = RClient(rk_username, rk_password, '1',
                         'b40ffbee5c1cf4e38028c197eb2fc751')
    verify_text = recognizer.rk_create(verify_photo, 6113)
    log.info("verify text: %s" % verify_text)

    # 3. Check the captcha answer.
    announce("passport/captcha/captcha-check")
    check_params = {
        'callback': jquery,
        'answer': "%s" % verify_text,
        'rand': "sjrand",
        'login_site': "E",
        '_': "%s" % now_ms(),
    }
    reply = self.session.get(
        "https://kyfw.12306.cn/passport/captcha/captcha-check",
        params=check_params,
        **net)
    announce(reply.content)

    # 4. Check the user name and password.
    announce("login")
    credentials = {
        'username': username,
        'password': password,
        'appid': 'otn',
        'answer': verify_text,
    }
    reply = self.session.post("https://kyfw.12306.cn/passport/web/login",
                              data=credentials,
                              **net)
    announce(reply.content)

    # Fetch the user token.
    announce("auth/uamtk")
    reply = self.session.post(
        "https://kyfw.12306.cn/passport/web/auth/uamtk",
        data={"appid": "otn"},
        **net)
    announce(reply.content)

    # Exchange the token for access.
    announce("uam auth client")
    token_form = {'tk': reply.json().get('newapptk')}
    reply = self.session.post("https://kyfw.12306.cn/otn/uamauthclient",
                              data=token_form,
                              **net)
    announce(reply.content)
    return True
im = im.crop((left, top, right, bottom)) im.save('verify.png') # js = """ # document.getElementsByClassName('inputGroup')[0].value='18883362563'; # document.getElementsByClassName('inputGroup')[1].value='442891187'; # document.getElementsByClassName('inputGroup')[2].value='442891187'; # document.getElementById('checkbox').click() # """ # # driver.execute_script(js) rc = RClient('username', 'password', 'soft_id', 'soft_key') # 从若快官网注册用户以及开发者即可接入 im = open('verify.png', 'rb').read() verify = rc.rk_create(im, 3040).get('Result') print(verify) element = driver.find_elements_by_class_name('inputGroup') element[0].send_keys(phone) element[1].send_keys(password) element[2].send_keys(password) driver.find_element_by_id('checkbox').click() # 接受协议 driver.find_element_by_id('code_input').send_keys(verify) # 输入图片验证码 driver.find_element_by_id('verifyphonebtn').click() # 获取手机验证码 print('请输入手机验证码!') verify_phone = input()
def register(phone):
    """Register ``phone`` on www.fungugu.com and record the new user name.

    The flow solves the picture captcha through RClient (retrying until the
    site accepts it), requests an SMS code, asks the operator to type that
    code, submits the registration form and, on success, inserts the
    generated user name into the MongoDB collection.

    Args:
        phone: phone number to register.
    """
    collection = connect_mongodb('192.168.0.235', 27017, 'fgg', 'user_info')
    session = requests.session()
    recognizer = RClient('ruokuaimiyao', 'goojia123456', '95632',
                         '6b8205cf61944329a5841c30e5ed0d5d')
    proxies = {'http': 'http://192.168.0.93:4234/'}
    stamp = str(int(time.time() * 1000))

    # Visit the form page first so the session receives its cookie.
    session.get('http://www.fungugu.com/ShenQingShiYong/fillInformation',
                proxies=proxies)
    session_cookie = session.cookies.get_dict()['jrbqiantai']
    headers = {
        'Cookie':
        ''.join([
            'Hm_lpvt_203904e114edfe3e6ab6bc0bc04207cd', stamp,
            ';Hm_lvt_203904e114edfe3e6ab6bc0bc04207cd', stamp,
            ';jrbqiantai=', session_cookie
        ]),
        'Referer':
        'http://www.fungugu.com/ShenQingShiYong/fillInformation',
        'User-Agent':
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.109 Safari/537.36'
    }
    # These headers are only printed; no request below sends them.
    print(headers)

    # Picture captcha: fetch, recognise and verify until accepted.
    captcha_url = 'http://www.fungugu.com/yzmControlller/refreshPicCode?id=pwdCode&' + stamp
    verify_url = 'http://www.fungugu.com/yzmControlller/verificationPicCode'
    accepted = False
    while not accepted:
        picture = session.get(url=captcha_url, proxies=proxies)
        code = recognizer.rk_create(picture.content, 3040)['Result']
        print('code:', code)
        reply = session.post(url=verify_url,
                             params={
                                 'code': code,
                                 'type': 'pwdCode',
                             },
                             proxies=proxies)
        print(reply.content)
        accepted = 'true' in reply.text

    # Ask the site to send the SMS code to the phone.
    reply = session.post(url='http://www.fungugu.com/yzmControlller/sendCode',
                         params={
                             'phone': phone,
                             'picCode': code
                         },
                         proxies=proxies)
    print(reply.content.decode())
    print(phone)

    # phone_verification = get_msg(token, phone)
    sms_code = input('验证码 :')
    reply = session.post(
        url='http://www.fungugu.com/yzmControlller/verificationSMSCode',
        params={
            'securityCode': sms_code,
            'phone': phone
        },
        proxies=proxies)
    print('aaa:', reply.content)

    # Submit the registration form with a generated user name.
    user_name = yonghuming()
    form = {
        'keyCode': '',
        'keHuShouJi': phone,
        'yongHuMing': user_name,
        'keHuMiMa': '4ac9fa21a775e4239e4c72317cdca870',
        'quanXianChengShi': '上海',
        'jiGouMingCheng': user_name,
    }
    reply = session.post(
        url='http://www.fungugu.com/ShenQingShiYong/completionUserInfo',
        params=form,
        proxies=proxies)
    print(reply.text)
    if 'true' not in reply.text:
        print('错误')
        return
    collection.insert_one({
        'user_name': user_name,
    })
    print('插入成功')
class Crawl(object):
    """Crawler that searches Sogou WeChat for job-type keywords.

    Targets (job types) are loaded from MySQL, search result pages are
    fetched through rotating proxies, and article details are stored through
    the ``WechatInfo`` / ``CrawledData`` tables.
    """

    def __init__(self):
        """Configure logging, load state from the database, build clients."""
        logging.getLogger("wechatsogou").setLevel(logging.WARNING)
        logging.getLogger("peewee").setLevel(logging.WARNING)
        logging.getLogger("requests").setLevel(logging.WARNING)
        self.logger = logging.getLogger()
        handler = logging.StreamHandler()
        formatter = logging.Formatter(
            '%(asctime)s [%(threadName)s][%(levelname)s] %(message)s')
        handler.setFormatter(formatter)
        self.logger.addHandler(handler)
        self.logger.setLevel(logging.DEBUG)
        self.get_bad_proxies()
        self.WxTable = WechatInfo()
        # The former bare self.get_conn() call here opened a connection that
        # was discarded and never closed; the loaders open their own.
        self.create_target()
        self.get_saved_data()
        self.proxies_list = NewGenerationProxy({
            'anony': 'L4',
            'post': 'false',
            'speed': 3000
        })
        proxyLine = self.proxies_list.getProxy()
        self.wx_api = wechatsogou.WechatSogouAPI(timeout=8,
                                                 proxies={
                                                     'http': proxyLine,
                                                     'https': proxyLine
                                                 })
        SpiderConfig = Config.SpiderConfig
        self.headers = SpiderConfig.headers.json()
        self.weChat_table = WechatInfo()
        self.proxies_table = UnableProxies()
        self.crawled_table = CrawledData()
        self.rk = RClient('ghost2017b', 'Ghost2017b', '107539',
                          'a8bd936aa1574ddb96d14564c1a0d022')

    def get_conn(self):
        """Open and return a new MySQL connection; the caller must close it."""
        conn = pymysql.connect(host=DBinfo.MYSQL_HOST,
                               port=DBinfo.MYSQL_PORT,
                               user=DBinfo.MYSQL_USER,
                               passwd=str(DBinfo.MYSQL_PASSWD),
                               db=DBinfo.MYSQL_DANAME,
                               charset='utf8')
        return conn

    def _query(self, sql):
        """Run ``sql`` on a fresh connection and return all rows.

        The cursor and the connection are closed even when the query fails.
        """
        conn = self.get_conn()
        try:
            cursor = conn.cursor()
            try:
                cursor.execute(sql)
                return cursor.fetchall()
            finally:
                cursor.close()
        finally:
            conn.close()

    def get_saved_data(self):
        """Load already-crawled search URLs into ``self.saved_data_list``."""
        rows = self._query("""select main_url from crawleddata""")
        self.saved_data_list = [x for x, in rows]

    def get_bad_proxies(self):
        """Load the unusable proxies into ``self.bad_proxies_list``."""
        rows = self._query("""select object from unableproxies """)
        self.bad_proxies_list = [x for x, in rows]

    def create_target(self):
        """Build ``self.target_list``, one entry per level-3 job type."""
        rows = self._query(
            """select jobTypeId , jobTypeName from data_job_type where level = '3'"""
        )
        self.target_list = [{
            'jobTypeId': row[0],
            'jobTypeName': row[1],
            'items': []
        } for row in rows]

    # NOTE: a commented-out legacy get_proxy() implementation (polling a
    # proxy vendor's HTTP API) used to live here. Proxies now come from
    # self.proxies_list.getProxy() and the q_proxies queue.

    # for run.py
    def get_target_list(self, target):
        """Search up to 10 result pages for ``target`` via the wechatsogou API.

        Pages whose search URL is in ``self.saved_data_list`` are skipped. A
        failing page is retried with a new proxy until it succeeds. Paging
        stops at the first empty page.

        Args:
            target: dict with a ``'jobTypeName'`` key; its ``'items'`` and
                ``'main_url'`` lists are reset and filled in place.

        Returns:
            dict: the same ``target``.
        """
        page = 10
        keyword = target['jobTypeName']
        target['items'] = []
        target['main_url'] = []

        def identify_image_callback(image):
            # Captcha hook: returns the recognised text, or '' on failure.
            result = self.rk.rk_create(image, 3060)
            if 'Result' in result:
                self.logger.debug(u'Captcha: %s, ID: %s' %
                                  (result['Result'], result['Id']))
                return result['Result']
            self.logger.debug(result)
            return ''

        for i in range(page):
            url = u'http://weixin.sogou.com/weixin?type=2&page=%s&ie=utf8&query=%s&interation=' % (
                i + 1, parse.quote(str(keyword)))
            if url in self.saved_data_list:
                self.logger.debug('the url had been crawled %s/%s' %
                                  (keyword, i + 1))
                continue
            while True:
                try:
                    target_little_list = self.wx_api.search_article(
                        target['jobTypeName'],
                        page=i + 1,
                        identify_image_callback=identify_image_callback,
                    )
                    target['main_url'].append(url)
                    self.logger.debug(
                        u'Name: %s, Page: %s, Count: %s' %
                        (target['jobTypeName'], i, len(target_little_list)))
                    break
                except Exception as e:
                    self.logger.debug(traceback.format_exc())
                    self.logger.debug(u'Name: %s, Page: %s, Error: %s' %
                                      (target['jobTypeName'], i, repr(e)))
                    proxyLine = self.proxies_list.getProxy()
                    self.logger.debug(u'Update proxy to %s' % (proxyLine))
                    # Set both schemes, as __init__ does; an https-only
                    # mapping would send plain-http requests unproxied.
                    self.wx_api.requests_kwargs['proxies'] = {
                        "http": proxyLine,
                        "https": proxyLine,
                    }
            if len(target_little_list) == 0:
                break
            target['items'].append(target_little_list)
        return target

    def get_target_list_v2(self, target):
        """Search up to 10 result pages for ``target`` with plain requests.

        Each page is retried, with a proxy taken from ``q_proxies`` per
        attempt, until it succeeds, turns out empty, or is already in
        ``self.saved_data_list``.

        Args:
            target: dict with a ``'jobTypeName'`` key; its ``'items'`` and
                ``'main_url'`` lists are reset and filled in place.

        Returns:
            dict: the same ``target``.
        """
        page = 10
        keyword = target['jobTypeName']
        target['items'] = []
        target['main_url'] = []
        for i in range(page):
            # The URL depends only on the page number, not on the attempt.
            url = u'http://weixin.sogou.com/weixin?type=2&page=%s&ie=utf8&query=%s&interation=' % (
                i + 1, parse.quote(keyword))
            while True:
                self.headers['Referer'] = url
                self.headers['Cookie'] = 'SUV="";SNUID="";'
                if url in self.saved_data_list:
                    self.logger.debug('the url had been crawled')
                    break  # already crawled
                try:
                    self.logger.debug("queue get before size:%s" %
                                      q_proxies.qsize())
                    proxies = q_proxies.get()
                    self.logger.debug("queue get after size :%s" %
                                      q_proxies.qsize())
                    resp = requests.get(
                        url,
                        proxies=proxies,
                        headers=self.headers,
                        timeout=8,
                    )
                    if not resp.ok:
                        self.logger.debug(
                            u'Name: %s, Page: %s, HttpError: %s' %
                            (keyword, str(i + 1), str(resp.status_code)))
                        continue
                    if u'antispider' in resp.url:
                        # TODO: record the proxy that was detected as a
                        # crawler in the database.
                        # self.proxies_table.create( object=proxies['http'], )
                        self.logger.debug(
                            u'Name: %s, Page: %s, DetachAntiSpider: %s' %
                            (keyword, str(i + 1), proxies['http']))
                        continue
                    time.sleep(random.randint(2, 5))
                    target_little_list = WechatSogouStructuring.get_article_by_search(
                        resp.text)
                    if len(target_little_list) == 0:
                        break  # this page has no data
                    target['items'].append(target_little_list)
                    target['main_url'].append(url)
                    self.logger.debug(
                        'get item %s page %d total %d ' %
                        (keyword, i + 1, len(target_little_list)))
                    break  # success
                except Exception as e:
                    self.logger.debug(u'Name: %s, Page: %s, Error: %s' %
                                      (keyword, i + 1, type(e)))
        self.logger.debug(u'Name: %s, Total: %s' %
                          (keyword, len(target['items'])))
        return target

    def get_data(self, target):
        """Download and store the articles collected in ``target``.

        For each result page a ``crawled_table`` row is created, then each
        article is fetched and, unless it is marked as deleted by its
        publisher, saved through ``weChat_table``. Any exception aborts the
        remaining work and is logged as a warning.

        Args:
            target: dict with ``'items'``, ``'main_url'``, ``'jobTypeName'``
                and ``'jobTypeId'`` keys.
        """
        try:
            for num, topItem in enumerate(target['items']):
                url = target['main_url'][num]
                self.crawled_table.create(
                    main_url=url,
                    target=target['jobTypeName'],
                )
                for item in topItem:
                    detail_url = item['article']['url']
                    html = requests.get(url=detail_url,
                                        headers=self.headers).text
                    # NOTE(review): 'lang=="en"' looks like a typo that may
                    # stop this pattern from matching; kept as-is, confirm
                    # against a real article page.
                    content = re.findall(
                        r'<div class="rich_media_content " lang=="en" id="js_content">([\s\S]*)</div>',
                        html)
                    biz = re.findall(r'var biz = ""\|\|"(.*?)"', html)
                    is_delete = re.findall('该内容已被发布者删除', html)
                    if is_delete:
                        self.logger.debug(
                            'pass for item is deleted by author')
                        continue
                    wechatId = re.findall(
                        '"profile_meta_value">(.*?)</span>', html)
                    # First match, or '' when there is none.
                    wechatId = wechatId[0] if wechatId else ''
                    biz = biz[0] if biz else ''
                    obj_wechat = self.weChat_table.create(
                        url=detail_url,
                        jobTypeId=target['jobTypeId'],
                        title=item['article']['title'],
                        content=''.join(content),
                        desc=item['article']['abstract'],
                        createAt=time.strftime(
                            '%Y-%m-%dT%H:%M:%S',
                            time.localtime(item['article']['time'])),
                        wechatId=wechatId,
                        wechatName=item['gzh']['wechat_name'],
                        biz=biz,
                    )
                    self.logger.debug(
                        'item: %s save id: %s' %
                        (target['jobTypeName'], str(obj_wechat.id)))
        except Exception as e:
            self.logger.warning('something wrong : ' + repr(e))