def shibiecode(self, response):
    """Handle the captcha-recognition service's reply.

    If a code was recognised, submit it for verification; otherwise fetch
    a fresh captcha image and try again.
    """
    if response.status != 200:
        return
    recognised = json.loads(response.content).get('code')
    if recognised:
        # Recognition succeeded: verify the code with the site.
        self.push(Request(
            url='http://wenshu.court.gov.cn/Content/CheckVisitCode',
            proxy=response.proxy,
            method='POST',
            meta=response.request.meta,
            cookies=response.request.cookies,
            data={'ValidateCode': recognised},
            callback='checkcode',
            priority=9,
            allow_redirects=False,
            allow_proxy=False))
    else:
        # Recognition failed: request a new captcha image.
        self.push(Request(
            url='http://wenshu.court.gov.cn/User/ValidateCode',
            meta=response.request.meta,
            proxy=response.proxy,
            method='GET',
            callback='certifycode',
            priority=7,
            allow_redirects=False,
            allow_proxy=False))
def start_request(self):
    """Seed the wenshu crawl: load pending search parameters from MySQL
    (rows with count = -1) and push one captcha-code request per row.
    """
    # return  # NOTE(review): uncommenting this line disables the spider
    connect = pymysql.Connect(
        host='localhost',
        port=3306,
        user='******',
        passwd='Elements123',  # NOTE(review): hard-coded DB credentials
        db=self.custom_settings['MYSQL_DBNAME'],
        charset='utf8',
        use_unicode=True
    )
    print('开始查询')
    cursor = connect.cursor()
    # Only a small batch (limit 10) of not-yet-crawled parameter rows.
    cursor.execute(
        "select id,canshu from a_copy where id >2000000 and count=-1 limit 10")
    connect.commit()
    result = cursor.fetchall()
    # NOTE(review): the connection/cursor are never closed.
    for i in result:
        # First step of the wenshu flow: obtain a captcha/validation code.
        url = 'http://wenshu.court.gov.cn/ValiCode/GetCode'
        guid = self.get_guid()
        data = {'guid': guid}
        headers = {
            'Accept': '*/*',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36',
            'X-Requested-With': 'XMLHttpRequest'
        }
        # i[0] = row id, i[1] = serialized search parameters ('canshu').
        request = Request(url=url, method='POST', headers=headers, data=data,
                          meta={'code': i[1], 'id': i[0], 'guid': guid},
                          callback='parse', priority=1, allow_proxy=True,
                          allow_redirects=False)
        self.start_push(request)
def start_request(self):
    """Queue the first 58.com listing page for every city/work-type pair."""
    for _, city_map in self.city_dict.items():
        for city_name, city_code in city_map.items():
            for job_type in self.work_type:
                # City sub-domain is the part before '|' in the code.
                first_page = 'https://%s.58.com%s' % (
                    city_code.split('|')[0], job_type)
                headers = {
                    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                    'Accept-Encoding': 'gzip, deflate, sdch, br',
                    'Accept-Language': 'zh-CN,zh;q=0.8',
                    'Upgrade-Insecure-Requests': '1',
                    'User-Agent': user_agent.random
                }
                self.start_push(Request(
                    url=first_page,
                    method='GET',
                    headers=headers,
                    meta={'city': city_name, 'page': 1, 'url': first_page},
                    callback='parse',
                    allow_redirects=False,
                    allow_proxy=False))
                time.sleep(0.1)  # throttle queue growth
def start_request(self):
    """Load 58.com company URLs (label = 9) from MySQL and queue them."""
    connect = pymysql.Connect(host='localhost',
                              port=3306,
                              user='******',
                              passwd='Elements123',  # NOTE(review): hard-coded credentials
                              db=self.custom_settings['MYSQL_DBNAME'],
                              charset='utf8',
                              use_unicode=True)
    print('开始查询')
    try:
        cursor = connect.cursor()
        cursor.execute("select id,url from zp_58_urls WHERE label=9")
        connect.commit()
        result = cursor.fetchall()
    finally:
        # Fix: the original never closed the connection (resource leak);
        # release it as soon as the rows are in memory.
        connect.close()
    for row_id, row_url in result:
        headers = {
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'accept-encoding': 'gzip, deflate, sdch, br',
            'accept-language': 'zh-CN,zh;q=0.8',
            'upgrade-insecure-requests': '1',
            'User-Agent': user_agent.random
        }
        request = Request(url=row_url, method='GET', headers=headers,
                          callback='parse', priority=1, meta={'id': row_id},
                          allow_redirects=False)
        self.start_push(request)
def im_parse(self, response):
    """Decode the nine-grid captcha image and queue the validation call.

    The answer is built by joining, with '|', the coordinate returned by
    the local model for each character listed in meta['check'].
    """
    if response.status == 200:
        code = self.model.get_code(response.content)
        code1 = '|'.join(
            [code.get(i) for i in response.request.meta['check']])
        print(code, code1)
        # Fix: the original wrote "'...%s' % time.time() * 1000", which
        # formats the epoch seconds first and then repeats the *string*
        # 1000 times (% and * share precedence, left-to-right).  The
        # endpoint expects a single millisecond timestamp.
        millis = int(time.time() * 1000)
        params = {
            'data': code1,
            'callback': 'jQuery111309236942442398923_%s' % millis,
            '_': '%s' % millis
        }
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': user_agent.random
        }
        request = Request(url=response.request.meta['check_url'],
                          method='GET',
                          params=params,
                          headers=headers,
                          callback='check_parse',
                          meta=response.request.meta,
                          allow_redirects=True,
                          allow_proxy=False,
                          priority=5)
        self.push(request)
    return
def js_parse(self, response):
    """Extract the captcha character sequence from the verify JS and
    queue a download of the captcha image."""
    if response.status == 200:
        matched = re.findall('请在下方的键盘中依次点击 <i>(.*)</i>',
                             response.text())
        if matched:
            # Characters to click, in order; stored for im_parse.
            response.request.meta['check'] = [
                part.strip() for part in matched[0].split('-')
            ]
            headers = {
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Accept-Encoding': 'gzip, deflate, sdch',
                'Accept-Language': 'zh-CN,zh;q=0.8',
                'Upgrade-Insecure-Requests': '1',
                'User-Agent': user_agent.random
            }
            self.push(Request(url=response.request.meta['img_url'],
                              method='GET',
                              headers=headers,
                              callback='im_parse',
                              meta=response.request.meta,
                              allow_redirects=True,
                              allow_proxy=False,
                              priority=4))
    return
def next_parse(self, response):
    """Expand a 58.com category page: queue page 1 of each sub-category."""
    if response.status != 200:
        return
    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'accept-encoding': 'gzip, deflate, sdch, br',
        'accept-language': 'zh-CN,zh;q=0.8',
        'upgrade-insecure-requests': '1',
        'User-Agent': user_agent.random
    }
    city = response.request.meta['city']
    doc = Selector(text=response.content)
    for anchor in doc.xpath("//dl[@class='selIndCate']/dd/span/a"):
        href = anchor.xpath("./@href").extract_first('')
        # 'pn1' suffix selects page 1; the bare URL is kept as model_url
        # so later pages can be derived from it.
        self.push(Request(url='https:' + href + 'pn1',
                          method='GET',
                          headers=headers,
                          callback='info_parse',
                          priority=2,
                          allow_redirects=False,
                          meta={'city': city,
                                'page': 1,
                                'model_url': 'https:' + href},
                          allow_proxy=False))
def start_request(self):
    """Seed the Baidu zhaopin crawl with one request per city from MySQL.

    NOTE(review): the bare ``return`` right below makes this method a
    no-op — the spider appears to be deliberately disabled.  Remove the
    ``return`` to re-enable it.
    """
    # time.sleep(100)
    return
    connect = pymysql.Connect(
        host='127.0.0.1',
        port=3306,
        user='******',
        passwd='Elements123',  # NOTE(review): hard-coded DB credentials
        db=self.custom_settings['MYSQL_DBNAME'],
        charset='utf8',
        use_unicode=True
    )
    print('开始查询')
    cursor = connect.cursor()
    cursor.execute(
        "select city from wander.baidu_zhaopin_city")
    connect.commit()
    result = cursor.fetchall()
    print('查询成功')
    for i in result:
        url = 'http://zhaopin.baidu.com/quanzhi?city=%s' % i[0]
        headers = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                   'Accept-Encoding': 'gzip, deflate, sdch',
                   'Accept-Language': 'zh-CN,zh;q=0.8',
                   'Upgrade-Insecure-Requests': '1',
                   'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}
        request = Request(url=url, method='GET', headers=headers,
                          callback='parse', priority=1,
                          allow_redirects=False, meta={'city': i[0]},
                          allow_proxy=True)
        self.start_push(request)
def check_parse(self, response):
    """Handle the captcha-validation JSONP reply; on success, replay the
    originally blocked page request with the verification sign attached.

    The body is JSONP (``callback({...})``), so the JSON payload is cut
    out from between the parentheses before parsing.
    """
    if response.status == 200:
        result = json.loads(response.text().split('(')[-1].split(')')[0])
        code = result.get('code')
        if code:
            headers = {
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Accept-Encoding': 'gzip, deflate, sdch',
                'Accept-Language': 'zh-CN,zh;q=0.8',
                'Upgrade-Insecure-Requests': '1',
                'User-Agent': user_agent.random
            }
            # Fix: "'%s' % time.time() * 1000" formatted the epoch
            # seconds first and then repeated the resulting *string*
            # 1000 times (% and * share precedence, left-to-right); the
            # server expects a single millisecond timestamp.
            params = {
                'ez_verify_code': response.request.params['data'],
                'ez_verify_sign': code,
                'timestamp': '%s' % int(time.time() * 1000),
                'identity': 'spider',
                'redirect': response.request.meta['url'],
                'scene': 'spider'
            }
            request = Request(url=response.request.meta['check_url'],
                              method='GET',
                              params=params,
                              headers=headers,
                              callback='parse',
                              meta=response.request.meta,
                              allow_redirects=True,
                              allow_proxy=False,
                              priority=5)
            self.push(request)
    return
def cookie_parse(self, response):
    """Capture the vjkl5 cookie and ask the local helper service to
    derive the matching vl5x value."""
    if response.status != 200:
        return
    vjkl5 = response.cookies.get('vjkl5').value
    response.request.meta['vjkl5'] = vjkl5
    self.push(Request(
        url='http://127.0.0.1:8080/get_vl5x?cookie=%s' % vjkl5,
        method='GET',
        meta=response.request.meta,
        callback='vjkl5_parse',
        priority=3,
        allow_redirects=False,
        allow_proxy=False))
def start_request(self):
    """Load baixing city start URLs from MySQL and queue one request each."""
    connect = pymysql.Connect(host='localhost',
                              port=3306,
                              user='******',
                              passwd='Elements123',  # NOTE(review): hard-coded credentials
                              db=self.custom_settings['MYSQL_DBNAME'],
                              charset='utf8',
                              use_unicode=True)
    print('开始查询')
    try:
        cursor = connect.cursor()
        cursor.execute("select city,url from bxw_city_url")
        connect.commit()
        result = cursor.fetchall()
    finally:
        # Fix: the original never closed the connection (resource leak).
        connect.close()
    for city, url in result:
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': user_agent.random
        }
        request = Request(url=url, method='GET', headers=headers,
                          callback='parse', meta={'city': city},
                          allow_redirects=True, allow_proxy=False,
                          priority=1)
        self.start_push(request)
def start_request(self):
    """Queue company-list search POSTs against qzrc.com.

    NOTE(review): the loop variable ``i`` is never used — all 50 requests
    post ``'p': 1`` and therefore fetch the same first page.  ``'p': i``
    was probably intended; confirm against the site's paging before
    changing.
    """
    for i in range(1, 51):
        url = 'http://www.qzrc.com/Search.ashx?action=c&rnd=0.16554745083462485'
        data = {
            'stype': 'k',
            'p': 1,  # page number — hard-coded, see NOTE above
            'k': '公司',
            'pn': '150',
            'urlfrom': 'http://www.qzrc.com/companyList.shtml',
            'ps': '25'
        }
        headers = {
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'X-Requested-With': 'XMLHttpRequest',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
        }
        request = Request(url=url, method='POST', headers=headers, data=data,
                          callback='parse', allow_proxy=False,
                          allow_redirects=False)
        self.start_push(request)
def start_request(self):
    """Kick off the crawl from the hxrc.com senior-search landing page."""
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
    }
    self.start_push(Request(
        url='http://www.hxrc.com/rcnew/SeniorSearchJobInFront.aspx?SearchKind=1&KeyWord=&area=',
        method='GET',
        headers=headers,
        callback='parse',
        allow_redirects=False,
        allow_proxy=False,
        timeout=40))
def checkcode(self, response):
    """Handle the wenshu captcha verification result.

    Body '1' => captcha accepted: replay the request that originally hit
    the captcha wall and clear this proxy's "verifying" mark.
    Body '2' => captcha rejected: fetch a fresh captcha image and retry.
    """
    if response.text() == '1':
        print(str(response.proxy) + '验证成功')
        # Replay the request that was interrupted by the captcha.
        self.push(response.request.meta.get('request_pre'))
        try:
            # Remove the proxy from the "currently verifying" registry;
            # '127.0.0.1' stands in for a direct (proxy-less) connection.
            self.proxy_dict.pop((response.proxy or '127.0.0.1'))
        except Exception as e:
            # Key may already be gone if several responses race; log only.
            print(e, (response.proxy or '127.0.0.1'))
    elif response.text() == '2':
        request = Request(url='http://wenshu.court.gov.cn/User/ValidateCode',
                          meta=response.request.meta,
                          proxy=response.proxy,
                          method='GET',
                          callback='certifycode',
                          priority=7,
                          allow_redirects=False,
                          allow_proxy=False)
        self.push(request)
def start_request(self):
    """Walk cjob.gov.cn detail pages from the newest id downwards."""
    for page_id in range(11919613, 1, -1):
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': user_agent.random
        }
        self.start_push(Request(
            url='http://www.cjob.gov.cn/cjobs/htmls/cb21dwPages/%s.html' % page_id,
            method='GET',
            headers=headers,
            callback='parse',
            meta={'id': page_id},
            allow_redirects=False,
            allow_proxy=False))
        time.sleep(0.02)  # pace the queue
def start_request(self):
    """Walk scrc168.com company pages from the newest id downwards."""
    for company_id in range(71110, 1, -1):
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': user_agent.random
        }
        self.start_push(Request(
            url='http://www.scrc168.com/PersonalJobs/CompanyInfo.aspx?companyid=%s' % company_id,
            method='GET',
            headers=headers,
            callback='parse',
            allow_redirects=False,
            allow_proxy=False))
        time.sleep(0.1)  # pace the queue
def start_request(self):
    """Walk baicai.com (Shanghai) company pages over a fixed id range.

    NOTE(review): the Cookie header below is a captured browser session
    (PHPSESSID, analytics cookies, a logged-in user) — requests will
    degrade or fail once that session expires.
    """
    for i in range(50016110,51397988):
        url = 'http://shanghai.baicai.com/company/%s/'%i
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Cookie': 'baicai_p=0ghp5s5hruu3lv6ndhfub367g4828; baicai_u=0gejb5b66ih6r4s8iujeieikj3272; baicai_uID=p_15643299; baicai_area=shanghai; PHPSESSID=q2f4brenaqmotmenl6k5q6npo7; bdshare_firstime=1540355107811; baicai_s=hcnr00t3r4ma2f4l67067njkb7; __utmt=1; BC_VisitCookie=61; BC_VisitNum=61; Hm_lvt_2cb4ec3f3a8343adb1703d1115ec562b=1540353836; Hm_lpvt_2cb4ec3f3a8343adb1703d1115ec562b=1540380599; __utma=104663071.661557839.1540353837.1540353837.1540380084.2; __utmb=104663071.25.10.1540380084; __utmc=104663071; __utmz=104663071.1540353837.1.1.utmcsr=hao123.com|utmccn=(referral)|utmcmd=referral|utmcct=/zhaopin/wangzhi',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}
        request=Request(url=url,method='GET',headers=headers,callback='parse',allow_redirects=True,allow_proxy=False,timeout=60)
        self.start_push(request)
        time.sleep(0.02)  # pace the queue (~50 pushes/s)
def start_request(self):
    """Walk cjol.com company pages across a fixed id range."""
    # (a disabled 'return' toggle used to sit here to switch the spider off)
    for company_id in range(417972, 700000):
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': user_agent.random
        }
        self.start_push(Request(
            url='http://www.cjol.com/jobs/company-%s' % company_id,
            method='GET',
            headers=headers,
            callback='parse',
            meta={'id': company_id},
            allow_proxy=False,
            allow_redirects=False))
        time.sleep(0.03)  # pace the queue
def start_request(self):
    """Seed the nbrc.com.cn crawl with the first job-search page.

    ``pageNumber`` is advanced later by parse() re-pushing this request.
    """
    url = 'https://www.nbrc.com.cn/baseApp/app/search/job'
    # Empty filter values mean "no filtering" on that dimension.
    data = {
        'pageNumber': 1,
        'pageSize': 20,
        'lieBieIds': '',
        'name': '',
        'jobTypeId': '',
        'cityId': '',
        'salaryId': '',
        'xingZhiId': '',
        'gongLingIds': '',
        'xueLiId': '',
        'guiMoId': '',
        'order': ''
    }
    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'jsonType': 'jsonType',
        # NOTE(review): keyid/nbrcafter/nbrcbefore look like tokens
        # captured from one browser session — likely to expire; verify.
        'keyid': 'bb5259528637fe5466a8d77128dd01c2',
        'nbrcafter': 'd5e6332262e2426f810677d6abb191c9',
        'nbrcbefore': '1540447660000',
        'nbrctoken': '',
        'Referer': 'https://www.nbrc.com.cn/job/list.html',
        'X-Requested-With': 'XMLHttpRequest',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
    }
    request = Request(url=url, method='POST', headers=headers, data=data,
                      callback='parse', allow_proxy=False,
                      allow_redirects=False)
    self.start_push(request)
def parse(self,response):
    """Receive the wenshu search 'number' token and chain to cookie
    acquisition for the list endpoint.

    The response body IS the token; it is stashed in meta alongside the
    guid obtained earlier.
    """
    if response.status==200:
        number=response.text()
        response.request.meta['number']=number
        guid=response.request.meta['guid']
        headers={'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                 'Accept-Encoding':'gzip, deflate, sdch',
                 'Accept-Language':'zh-CN,zh;q=0.8',
                 'Upgrade-Insecure-Requests':'1',
                 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36'}
        # The 'conditions' query is a URL-encoded case-number search
        # (（2012）行监字第154-1号); only number/guid vary per request.
        url='http://wenshu.court.gov.cn/list/list/?sorttype=1&number={}&guid={}&conditions=searchWord+%EF%BC%882012%EF%BC%89%E8%A1%8C%E7%9B%91%E5%AD%97%E7%AC%AC154-1%E5%8F%B7+AH++%E6%A1%88%E5%8F%B7:%EF%BC%882012%EF%BC%89%E8%A1%8C%E7%9B%91%E5%AD%97%E7%AC%AC154-1%E5%8F%B7'.format(number,guid)
        request = Request(url=url, method='GET', meta=response.request.meta,headers=headers,callback='cookie_parse', priority=2,allow_redirects=False,allow_proxy=True)
        self.push(request)
    pass
def start_request(self):
    """Walk hbsc.cn corp-contact AJAX endpoints from the newest id down."""
    for corp_id in range(2140000, 1, -1):
        headers = {
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'X-Requested-With': 'XMLHttpRequest',
            # Endpoint checks the Referer against the corp detail page.
            'Referer': 'http://www.hbsc.cn/corp/corpinfo-%s.html' % corp_id,
            'User-Agent': user_agent.random
        }
        self.start_push(Request(
            url='http://www.hbsc.cn/ashx/Corp/GetContact.ashx?id=%s&_=0.7466502927798564' % corp_id,
            method='GET',
            headers=headers,
            callback='parse',
            allow_redirects=False,
            allow_proxy=False,
            timeout=40))
        time.sleep(0.03)  # pace the queue
def start_request(self):
    """Walk zhaogong.chinalao.com listing pages over a fixed id range."""
    for listing_id in range(1, 11654):
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'X-Requested-With': 'XMLHttpRequest',
            'Referer': 'http://zhaogong.chinalao.com/4/',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': user_agent.random
        }
        self.start_push(Request(
            url='http://zhaogong.chinalao.com/%s/' % listing_id,
            method='GET',
            headers=headers,
            callback='parse',
            allow_proxy=False,
            allow_redirects=False))
        time.sleep(0.1)  # pace the queue
def start_request(self):
    """Walk stzp.cn enterprise pages over a fixed id range.

    NOTE(review): the Cookie header is a captured, logged-in browser
    session (ASP.NET_SessionId, Jw_UserName/Jw_PassWord) — requests will
    degrade once it expires.
    """
    # return  # NOTE(review): uncommenting this line disables the spider
    for i in range(1847270,2100000):
        url = 'http://www.stzp.cn/jw/showent_%s.aspx'%i
        headers = {
            'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Encoding':'gzip, deflate, sdch',
            'Accept-Language':'zh-CN,zh;q=0.8',
            'Cookie':'GeolocationTimeOutName_2=1; Geolocation_1=%7B%22bczp_CityCN%22%3A%22%u6DC4%u535A%22%2C%22bczp_citycode%22%3A291600%2C%22city%22%3A%22%u6DC4%u535A%u5E02%22%2C%22geodist%22%3Anull%2C%22geodist_int%22%3A0%2C%22lat%22%3A36.80468485%2C%22lng%22%3A118.05913428%2C%22province%22%3A%22%u5C71%u4E1C%u7701%22%2C%22reftime%22%3A%222018-10-24%2014%3A38%3A33%22%2C%22street%22%3A%22%22%2C%22street_number%22%3A%22%22%7D; hidePtAD_1=1; ASP.NET_SessionId=ynldhjyeobvt4prquncvgylo; route=c8088b91cb0f2fbcbdf107bd31e3d195; UM_distinctid=166a49655510-025ae9911e2cdb-474f0820-1fa400-166a49655528a6; bdshare_firstime=1540359621817; Jw_UserName=bczp78663707d; Jw_PassWord=qaw0%2b7P4aWPp0ju05uA%2bDw%3d%3d; Admin_SN=0; Jw_LogIP=218.247.217.98; EntSearchCookies=%cf%fa%ca%db; Hm_lvt_9c09fb6bb32d4dafc6fd4ec18d310d5b=1540359607; Hm_lpvt_9c09fb6bb32d4dafc6fd4ec18d310d5b=1540437667; CNZZDATA49160=cnzz_eid%3D374406922-1540359096-null%26ntime%3D1540434052; bchatjw7866370=0',
            'Upgrade-Insecure-Requests':'1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}
        request=Request(url=url,method='GET',headers=headers,callback='parse',allow_redirects=False,allow_proxy=False,timeout=40)
        self.start_push(request)
        time.sleep(0.05)  # pace the queue
def parse(self, response):
    """Parse the nbrc.com.cn job-list JSON.

    Queues one detail POST per job id, then pages through the list by
    mutating ``pageNumber`` on the original request and re-pushing it.
    """
    if response.status == 200:
        result = json.loads(response.content)
        datas = result.get('data')
        items = datas.get('list')
        for item in items:
            id = item.get('id')
            url = 'https://www.nbrc.com.cn/baseApp/app/job/getJobDetail'
            data = {'jobId': id}
            headers = {
                'Accept': 'application/json, text/javascript, */*; q=0.01',
                'Accept-Encoding': 'gzip, deflate, br',
                'Accept-Language': 'zh-CN,zh;q=0.8',
                'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
                'jsonType': 'jsonType',
                # NOTE(review): keyid/nbrcafter/nbrcbefore look like
                # captured session tokens — likely to expire; verify.
                'keyid': 'bb5259528637fe5466a8d77128dd01c2',
                'nbrcafter': 'd5e6332262e2426f810677d6abb191c9',
                'nbrcbefore': '1540447660000',
                'nbrctoken': '',
                'Referer': 'https://www.nbrc.com.cn/job/list.html',
                'X-Requested-With': 'XMLHttpRequest',
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
            }
            request = Request(url=url, method='POST', headers=headers,
                              data=data, callback='info_parse',
                              allow_proxy=False, allow_redirects=False)
            self.push(request)
        pager = datas.get('page')
        totalPage = pager.get('totalPage')
        # Pagination: reuse the same request object with the next page no.
        if response.request.data['pageNumber'] < totalPage:
            response.request.data['pageNumber'] += 1
            self.push(response.request)
    pass
def start_request(self):
    """Walk jobcn.com company-info endpoints from the newest id down."""
    for com_id in range(626207, 0, -1):
        self.start_push(Request(
            url='https://m.jobcn.com/wxapp/getComInfo.ujson?m.comId=%s' % com_id,
            method='GET',
            headers={'User-Agent': user_agent.random},
            data={'page': com_id},
            callback='parse',
            allow_proxy=False,
            allow_redirects=False,
            timeout=30))
        time.sleep(0.06)  # pace the queue
def parse(self,response):
    """Fan out filtered Baidu zhaopin list queries for one city.

    The page embeds an anti-scrape token as ``window.zp_pc_nekot``
    ('token' reversed); it must be reversed and de-escaped before use,
    and the BAIDUID cookie must accompany every follow-up request.
    """
    # Salary buckets and date windows used to partition the result set
    # (the API caps results per query, so queries are sliced finely).
    salary_items=['0_0','1_1999','2001_2999','3001_4999','5001_7999','8001_9999','10000_99999999']
    date_items=self.get_date(1)
    if response.status==200:
        print({'BAIDUID':response.cookies.get('BAIDUID').value})
        token=''
        selector=Selector(text=response.text())
        re_tag="window.zp_pc_nekot = '(.*?)';"
        match=re.findall(re_tag,response.text())
        if match:
            # The token is stored reversed — flip it back and strip the
            # JS escape backslashes.
            aa=list(match[0])
            aa.reverse()
            token=''.join(aa).replace('\\','')
        else:
            # No token on the page: requeue this request and retry.
            self.push(response.request)
            return
        area_items=selector.xpath("//span[@class='areaitem']/text()").extract()
        # One API query per (district, salary bucket, date window).
        for area in area_items:
            for salary in salary_items:
                for date in date_items:
                    url='http://zhaopin.baidu.com/api/qzasync'
                    params={'query':'',
                            'city':response.request.meta['city'],
                            'is_adq':'1',
                            'pcmod':'1',
                            'district':area,
                            'sort_type':'1',
                            'sort_key':'5',
                            'pn':0,
                            'rn':10,
                            'token':token,
                            'salary':salary,
                            'date':date}
                    headers = {
                        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                        'Accept-Encoding': 'gzip, deflate, sdch',
                        'Accept-Language': 'zh-CN,zh;q=0.8',
                        'Upgrade-Insecure-Requests': '1',
                        'Referer':str(response.url),
                        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}
                    # The BAIDUID cookie pairs with the token for validation.
                    request = Request(url=url, method='GET',cookies={'BAIDUID':response.cookies.get('BAIDUID').value},
                                      params=params,headers=headers, callback='info_parse', priority=2,
                                      allow_redirects=False,allow_proxy=True)
                    self.push(request)
    pass
def lasted_parse(self,response):
    """Parse a wenshu document-list response and persist the rows.

    Handles three cases: a 'remind key' notice (ignored), a '"remind"'
    body meaning a captcha is required (queue captcha flow, once per
    proxy), and otherwise a doubly-JSON-encoded list of documents that is
    written to MySQL, with in-place pagination on the same request.
    """
    if response.status == 200:
        if b'remind key' in response.content:
            print('remind key')
            pass
        # NOTE(review): equality against the whole body — only an exact
        # '"remind"' payload triggers the captcha flow.
        if b'"remind"' == response.content:
            if not self.proxy_dict.get((response.proxy or '127.0.0.1')):
                # First sighting for this proxy: start captcha verification
                # and remember the blocked request in meta['request_pre'].
                print((response.proxy or '127.0.0.1'),'出现验证码')
                headers = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                           'Accept-Encoding': 'gzip, deflate, sdch',
                           'Accept-Language': 'zh-CN,zh;q=0.8',
                           'Upgrade-Insecure-Requests': '1',
                           'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36'}
                response.request.meta['request_pre']=response.request
                request = Request(url='http://wenshu.court.gov.cn/User/ValidateCode',
                                  meta=response.request.meta, method='GET',
                                  callback='certifycode',headers=headers, priority=7,
                                  proxy=response.proxy,allow_proxy=False,
                                  allow_redirects=False)
                # Mark the proxy as "verifying" so we don't queue twice.
                self.proxy_dict[(response.proxy or '127.0.0.1')] = time.time()
                self.push(request)
            else:
                # Captcha already in flight for this proxy — just retry.
                self.push(response.request)
        else:
            # Body is JSON-in-JSON: a list whose first element holds
            # RunEval/Count metadata, followed by one dict per document.
            list_content = json.loads(json.loads(response.text()))
            try:
                RunEval = list_content[0].get('RunEval')
            except Exception as e:
                # Malformed payload — log and requeue for another attempt.
                print(e,list_content,response.request.meta['code'])
                self.push(response.request)
                return
            count = list_content[0].get('Count')
            for i in list_content[1:]:
                nopublish_reason = i.get('不公开理由')
                jgdge_cx = i.get('审判程序')
                wenshu_id = i.get('文书ID')
                aj_name = i.get('案件名称')
                aj_type = i.get('案件类型')
                aj_code = i.get('案号')
                court_name = i.get('法院名称')
                judge_date = i.get('裁判日期')
                judge_brief = i.get('裁判要旨段原文')
                insert_sql='''insert into court_docid(aj_code,RunEval,wenshu_id,aj_name,aj_type,court_name,judge_date,judge_brief,nopublish_reason,jgdge_cx) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
                self.Pipeline.process_item((insert_sql, (aj_code,RunEval,wenshu_id,aj_name,aj_type,court_name,judge_date,judge_brief,nopublish_reason,jgdge_cx)))
            # Pagination: advance Index on the same request while results
            # remain, capped at 20 pages per search.
            if response.request.data['Index']*response.request.data['Page']<int(count) and response.request.data['Index']<20:
                response.request.data['Index']+=1
                self.push(response.request)
def parse(self, response):
    """Parse a baixing listing page.

    Detects the nine-grid captcha wall and, if present, diverts into the
    captcha flow; otherwise stores ad URLs and follows pagination by
    mutating and re-pushing the same request.
    """
    if response.status != 200:
        print(response.status, response.url)
    if response.status == 200:
        selector = Selector(text=response.content)
        if '系统检测到异常行为,请先进行九宫格验证' in response.text():
            # Captcha wall: the verify <script src> yields the image
            # (.jpg) and validation (.valid) URLs by suffix swap.
            im_url = selector.xpath(
                "//script[contains(@src,'verify.baixing.com.cn/')]/@src"
            ).extract_first('')
            headers = {
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Accept-Encoding': 'gzip, deflate, sdch',
                'Accept-Language': 'zh-CN,zh;q=0.8',
                'Upgrade-Insecure-Requests': '1',
                'User-Agent': user_agent.random
            }
            response.request.meta['img_url'] = im_url.replace(
                '.js', '.jpg')
            response.request.meta['check_url'] = im_url.replace(
                '.js', '.valid')
            # Remember the page we were blocked on so it can be replayed.
            response.request.meta['url'] = response.request.url
            request = Request(url=im_url,
                              method='GET',
                              headers=headers,
                              callback='js_parse',
                              meta=response.request.meta,
                              allow_redirects=True,
                              allow_proxy=False,
                              priority=3)
            self.push(request)
            return
        items = selector.xpath("//a[@class='ad-title']/@href").extract()
        for i in items:
            # 'insert ignore' silently skips URLs already stored.
            insert_sql = '''insert ignore into bxw_url_new(city,url) VALUES (%s,%s)'''
            self.Pipeline.process_item(
                (insert_sql, (response.request.meta['city'], i)))
        next_url = selector.xpath("//a[text()='下一页']/@href").extract_first(
            '')
        if next_url:
            # Pagination: rewrite this request's URL in place and requeue.
            response.request.url = urljoin(response.request.url, next_url)
            response.request.priority = 3
            self.push(response.request)
    pass
def start_request(self):
    """Seed the crawl with the 58.com enterprise city-list page."""
    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'accept-encoding': 'gzip, deflate, sdch, br',
        'accept-language': 'zh-CN,zh;q=0.8',
        'upgrade-insecure-requests': '1',
        'User-Agent': user_agent.random
    }
    self.start_push(Request(url='https://qy.58.com/citylist/',
                            method='GET',
                            headers=headers,
                            callback='parse',
                            priority=1,
                            allow_redirects=False,
                            allow_proxy=False))
def start_request(self):
    """Seed the crawl with the newjobs.com.cn empty-name search page."""
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': user_agent.random
    }
    self.start_push(Request(url='http://jobs.newjobs.com.cn/Jobs/SearchResult?name=',
                            method='GET',
                            headers=headers,
                            callback='parse',
                            allow_proxy=False,
                            allow_redirects=False,
                            timeout=50))