def start_requests(self):
    """Search for 'PP粒进出口数据简析' articles and yield a login request per result.

    Flow: POST the search page, collect the result <li> fragments, pass the
    image captcha via ``code_verify`` (retrying until the service answers
    'true'), then emit one FormRequest per article with the article URL as
    the post-login redirect target. The article date travels in ``meta``.
    """
    response = parsel.Selector(
        requests.post(self.search_url, {
            'pageNo': '1',
            'keyword': 'PP粒进出口数据简析'
        }).text)
    items = response.xpath("//ul[@class='contentList']/li").getall()
    temp = code_verify(self.img_url, self.code_verify_url)
    while temp.text != 'true':
        self.count += 1
        print("第{}次识别出错。".format(self.count))
        temp = code_verify(self.img_url, self.code_verify_url)
    for item in items:
        time.sleep(2)
        # Build the fragment selector once per item instead of twice.
        sel = parsel.Selector(item)
        url = sel.xpath("//h2//a/@href").get()
        date = sel.xpath("//span[@class='date']//text()").get()
        data = {
            'username': self.username,
            'password': self.password,
            'target': url,
            'errorPaw': self.errorPaw
        }
        yield FormRequest(url=self.login_succeed_url,
                          formdata=data,
                          callback=self.parse,
                          meta={'date': date})
def start_requests(self):
    """Look up the newest '聚丙烯粉料主要生产企业开工分析' article and yield its login request.

    Takes the first result from page 1 of the search, resolves its URL and
    date, passes the image captcha, then emits a single FormRequest whose
    ``target`` redirects to the article after login.
    """
    response = parsel.Selector(
        requests.post(self.search_url, {
            'pageNo': '1',
            'keyword': '聚丙烯粉料主要生产企业开工分析'
        }).text)
    url = response.xpath("//ul[@class='contentList']/li[1]//a/@href").get()
    date = response.xpath(
        "//ul[@class='contentList']/li[1]//span[@class='date']/text()"
    ).get()
    # Today's date is rendered in red inside a <font> tag, so the DOM path
    # differs. BUG FIX: the fallback xpath result was previously discarded
    # instead of being assigned to `date`.
    if date is None:
        date = response.xpath(
            "//ul[@class='contentList']/li[1]//span[@class='date']/font/text()"
        ).get()
    temp = code_verify(self.img_url, self.code_verify_url)
    while temp.text != 'true':
        self.count += 1
        print("第{}次识别出错。".format(self.count))
        temp = code_verify(self.img_url, self.code_verify_url)
    data = {
        'username': self.username,
        'password': self.password,
        'target': url,
        'errorPaw': self.errorPaw
    }
    yield FormRequest(url=self.login_succeed_url,
                      formdata=data,
                      callback=self.parse,
                      meta={'date': date})
def start_requests(self):
    """Fetch page 12 of the '国内丙烯腈厂家一周产量统计' results and yield a
    post-login request for every listed article."""
    search_html = requests.post(self.search_url, {
        'pageNo': '12',
        'keyword': '国内丙烯腈厂家一周产量统计'
    }).text
    selector = parsel.Selector(search_html)
    # Keep retrying the captcha until the verification service accepts it.
    verification = code_verify(self.img_url, self.code_verify_url)
    while verification.text != 'true':
        self.count += 1
        print("第{}次识别出错。".format(self.count))
        verification = code_verify(self.img_url, self.code_verify_url)
    for entry in selector.css('div.zixun.contentactive > ul.contentList > li'):
        time.sleep(5)
        article_url = entry.css('h2 > a::attr(href)').get()
        published = entry.css('span.date::text').get()
        yield FormRequest(
            url=self.login_succeed_url,
            formdata={
                'username': self.username,
                'password': self.password,
                'target': article_url,
                'errorPaw': self.errorPaw
            },
            callback=self.parse,
            meta={'date': published},
            headers={'Referer': self.search_url}
        )
def before_parse(self, response):
    """Yield a post-login request for every article in the result list.

    Passes the image captcha first, then emits one FormRequest per <li>,
    carrying the article URL as the login redirect target and its date in
    ``meta``.
    """
    lists = response.css('div.zixun.contentactive > ul.contentList > li')
    temp = code_verify(self.img_url, self.code_verify_url)
    while temp.text != 'true':
        self.count += 1
        print("第{}次识别出错。".format(self.count))
        temp = code_verify(self.img_url, self.code_verify_url)
    for li in lists:
        time.sleep(5)
        url = li.css('h2 > a::attr(href)').get()
        date = li.css('span.date::text').get()
        # Today's date is rendered in red inside a <font> tag, so the DOM
        # path differs. BUG FIX: read the fallback from the CURRENT <li>
        # instead of always from li[1] of the whole page, which attached
        # the first item's date to every later item.
        if date is None:
            date = li.css('span.date font::text').get()
        data = {
            'username': self.username,
            'password': self.password,
            'target': url,
            'errorPaw': self.errorPaw
        }
        yield FormRequest(url=self.login_succeed_url,
                          formdata=data,
                          callback=self.parse,
                          meta={'date': date},
                          headers={'Referer': self.search_url})
def getCode(self, response):
    """Solve the login captcha via the external recognizer, then log in.

    Retries ``code_verify`` until the service answers 'true', sleeping
    ``self.sleep_time`` between attempts, then returns a single login
    FormRequest handled by ``afterLogin``.

    NOTE: a previous local approach (pytesseract OCR / manual console
    input on the downloaded code.png) was removed as dead code — the
    original comment noted its recognition accuracy was far too low.
    """
    temp = code_verify(self.url_code, self.code_verify_url)
    while temp.text != 'true':
        self.count += 1
        print("第{}次识别出错。".format(self.count))
        time.sleep(self.sleep_time)
        temp = code_verify(self.url_code, self.code_verify_url)
    # Credentials are redacted in this copy of the source.
    form_data = {
        "username": "******",
        "password": "******"
    }
    return [
        FormRequest(url=self.url_login,
                    callback=self.afterLogin,
                    formdata=form_data,
                    dont_filter=True,
                    headers=self.get_headers(self.host_login_oilchem))
    ]
def start_requests(self):
    """Crawl every result page for '[PP粒]:中油华南PP' and yield login requests.

    First reads the total page count from the pager text on page 1, passes
    the image captcha, then walks each page, emitting one FormRequest per
    dated result item.
    """
    keyword = '[PP粒]:中油华南PP'
    # Determine the total number of result pages from the pager text.
    res = requests.post(self.search_url, {
        'pageNo': '1',
        'keyword': keyword
    }).text
    page_match = re.search(r"第1页/共 (\d+)页", res)
    # Robustness fix: fall back to a single page when the pager text is
    # absent (the original findall(...)[0] raised IndexError then).
    max_page = int(page_match.group(1)) if page_match else 1
    temp = code_verify(self.img_url, self.code_verify_url)
    while temp.text != 'true':
        self.count += 1
        print("第{}次识别出错。".format(self.count))
        temp = code_verify(self.img_url, self.code_verify_url)
    # Crawl each result page.
    for pageNo in range(1, max_page + 1):
        response = parsel.Selector(
            requests.post(self.search_url, {
                'pageNo': str(pageNo),
                'keyword': keyword
            }).text)
        items = response.xpath("//ul[@class='contentList']/li").getall()
        for item in items:
            time.sleep(2)
            # Build the fragment selector once per item instead of twice.
            sel = parsel.Selector(item)
            url = sel.xpath("//h2//a/@href").get()
            date = sel.xpath("//span[@class='date']//text()").get()
            data = {
                'username': self.username,
                'password': self.password,
                'target': url,
                'errorPaw': self.errorPaw
            }
            # Items without a date are skipped, as in the original.
            if date:
                yield FormRequest(url=self.login_succeed_url,
                                  formdata=data,
                                  callback=self.parse,
                                  meta={'date': date})
def before_parse(self, response):
    """Yield login requests for each '库存早报' result, then follow pagination.

    After all items on the current page are emitted, waits two minutes and
    requests the next page derived from the pager's ``goPage(n)`` link,
    re-entering this callback.
    """
    lists = response.css('div.zixun.contentactive > ul.contentList > li')
    temp = code_verify(self.img_url, self.code_verify_url)
    while temp.text != 'true':
        self.count += 1
        print("第{}次识别出错。".format(self.count))
        temp = code_verify(self.img_url, self.code_verify_url)
    for li in lists:
        time.sleep(5)
        url = li.css('h2 > a::attr(href)').get()
        date = li.css('span.date::text').get()
        # Today's date is rendered in red inside a <font> tag, so the DOM
        # path differs. BUG FIX: read the fallback from the CURRENT <li>
        # instead of always from li[1] of the whole page.
        if date is None:
            date = li.css('span.date font::text').get()
        data = {
            'username': self.username,
            'password': self.password,
            'target': url,
            'errorPaw': self.errorPaw
        }
        yield FormRequest(url=self.login_succeed_url,
                          formdata=data,
                          callback=self.parse,
                          meta={'date': date},
                          headers={'Referer': self.search_url})
    # Throttle before moving to the next result page.
    time.sleep(120)
    next_url = response.css(
        '#simpledatatable_paginate > ul > li:nth-last-child(2) > a::attr(href)'
    ).get()
    # BUG FIXES: (1) guard against a missing pager link — re.search(None)
    # raised TypeError before the None check could ever run; (2) guard
    # against a non-matching link — .group(1) on None raised
    # AttributeError; (3) (\d+) instead of (\d) so page numbers above 9
    # are followed.
    if next_url:
        match = re.search(r"(?<=goPage)\((\d+)\)", next_url)
        if match:
            yield FormRequest(url=self.search_url,
                              formdata={
                                  'pageNo': match.group(1),
                                  'keyword': '库存早报'
                              },
                              callback=self.before_parse)
def getCode(self, response):
    """Pass the login captcha, then return the login form request."""
    attempts = 0
    verification = code_verify(self.url_code, self.code_verify_url)
    # Keep asking the recognizer until it reports success, pausing
    # half a second between attempts.
    while verification.text != 'true':
        attempts += 1
        print("第{}次识别出错。".format(attempts))
        time.sleep(0.5)
        verification = code_verify(self.url_code, self.code_verify_url)
    credentials = {
        "username": "******",
        "password": "******",
        'errorPaw': "deya1589",
    }
    login_request = FormRequest(
        url=self.url_login,
        callback=self.afterLogin,
        formdata=credentials,
        dont_filter=True,
        headers=self.get_headers(self.host_login_oilchem)
    )
    return [login_request]
def start_requests(self):
    """Search for '全国PP装置生产情况汇总' reports and yield login requests.

    mode == 1: walk every result page, collect all article URLs, and yield
    one login FormRequest per URL. Any other mode: take only the newest
    article and yield it only if it was published today.

    Errors are logged and swallowed (best-effort crawl, kept from the
    original) rather than aborting the whole spider.
    """
    search_api = "https://search.oilchem.net/solrSearch/select.htm"
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36'
    }
    try:
        res = parsel.Selector(
            requests.post(search_api,
                          headers=headers,
                          data={
                              'pageNo': 1,
                              'keyword': '全国PP装置生产情况汇总'
                          }).text)
        if self.mode == 1:
            urls = []
            # ROBUSTNESS FIX: parse the total page count from the pager
            # text ("第1页/共 N页") with a regex instead of the fixed
            # [6:8] slice, which broke for 1- or 3-digit page counts and
            # crashed when the pager was missing. Default to one page.
            info = res.xpath(
                '//*[@id="simpledatatable_info"]/text()').extract_first() or ''
            page_match = re.search(r"共\s*(\d+)\s*页", info)
            last_page = int(page_match.group(1)) if page_match else 1
            for i in range(1, last_page + 1):
                res = parsel.Selector(
                    requests.post(search_api,
                                  headers=headers,
                                  data={
                                      'pageNo': i,
                                      'keyword': '全国PP装置生产情况汇总'
                                  }).text)
                # Deduplicate hrefs within the page.
                temp_urls = list(
                    set(
                        res.xpath(
                            '/html/body/div/div[3]/div[1]/div[2]/ul/li//a/@href'
                        ).extract()))
                time.sleep(5 + random.uniform(1, 10))
                urls.extend(temp_urls)
            print("共爬取 " + str(len(urls)) + ' 条url')
            for target in urls:
                # Captcha misrecognition counter for this target.
                count = 0
                if target[0:4] != 'http':
                    target = 'https:' + target
                temp = code_verify(self.img_url, self.code_verify_url)
                while temp.text != 'true':
                    count += 1
                    print("第{}次识别出错。".format(count))
                    temp = code_verify(self.img_url, self.code_verify_url)
                yield FormRequest(url=self.login_succeed_url,
                                  formdata={
                                      'username': self.username,
                                      'password': self.password,
                                      'target': target,
                                      'errorPaw': self.errorPaw
                                  },
                                  callback=self.parse)
                time.sleep(10 + int(random.uniform(1, 10)))
        else:
            url = res.xpath(
                '/html/body/div/div[3]/div[1]/div[2]/ul/li[1]/h2/a/@href'
            ).extract_first()
            date = res.xpath(
                '/html/body/div/div[3]/div[1]/div[2]/ul/li[1]/div/div/span/span/text()'
            ).extract_first()
            today = datetime.date.today().strftime('%Y-%m-%d')
            # Only fetch the newest report when it was published today.
            if today != date:
                print("当日全国PP数据还没有出来")
                return
            count = 0
            if url[0:4] != 'http':
                url = 'https:' + url
            temp = code_verify(self.img_url, self.code_verify_url)
            while temp.text != 'true':
                count += 1
                print("第{}次识别出错。".format(count))
                temp = code_verify(self.img_url, self.code_verify_url)
            yield FormRequest(url=self.login_succeed_url,
                              formdata={
                                  'username': self.username,
                                  'password': self.password,
                                  'target': url,
                                  'errorPaw': self.errorPaw
                              },
                              callback=self.parse)
    except Exception as e:
        # Deliberate broad catch: log loudly and skip this run rather than
        # crash the whole crawl on a single failed search.
        print('!' * 30)
        print('step1')
        print(e)
        print('!' * 30)