def process_start_built_url(self):
    """Fetch the to8to.com index page and extract the city list.

    Retries indefinitely (with a random 2-5 s pause between attempts)
    until the index page is fetched with HTTP 200, then parses the
    city links out of the ``div.xzcs_dt`` block.

    :return: list of dicts, each with ``city`` (display name) and
        ``city_num`` (the city's subdomain token, e.g. ``sz`` for
        ``https://sz.to8to.com/``).
    """
    url = 'https://www.to8to.com/index.html'
    while True:
        try:
            response = requests.get(
                url,
                headers={'user-agent': random_useragent()},
                proxies=AbuyunSpider.returnRequestProxies())
            # apparent_encoding sniffs the real charset from the body
            response.encoding = response.apparent_encoding
            if response.status_code == 200:
                break
        except Exception as e:
            print(e)
        # back off before retrying (also covers non-200 responses,
        # so the loop never hammers the site in a tight spin)
        time.sleep(random.randint(2, 5))
    document = pq(response.text)
    # subdomain token is the host prefix: '//<token>.' in the href;
    # raw string avoids the invalid '\.' escape warning
    pattern = re.compile(r'//(.*?)\.')
    res = []
    for x in document('div[@class="xzcs_dt"] > a').items():
        match = pattern.search(x.attr('href') or '')
        if not match:
            # original code crashed with AttributeError here when a
            # link had no recognizable subdomain; skip it instead
            continue
        res.append({'city': x.text(), 'city_num': match.group(1)})
    return res
def process_requesst(self, session, url):
    """Issue GET requests on *session* until one returns HTTP 200.

    Retries forever, printing any exception and sleeping a random
    2-5 seconds between attempts.

    :param session: a requests.Session carrying cookies/state.
    :param url: URL to fetch.
    :return: ``(session, response)`` once a 200 response arrives.
    """
    while True:
        try:
            resp = session.get(
                url,
                headers={'user-agent': random_useragent()},
                proxies=AbuyunSpider.returnRequestProxies(),
                timeout=6)
        except Exception as err:
            print(err)
        else:
            if resp.status_code == 200:
                return session, resp
        time.sleep(random.randint(2, 5))
def process_request(self, nextPage, meta, item, Referer=None):
    """Fetch *nextPage* with forged session headers, retrying until 200.

    Issues two warm-up GETs first (the listing URL without its trailing
    character, and the same page over plain http) whose bodies are
    dumped to ``ss1.html`` / ``ss2.html`` for debugging, then performs
    the real request with the cookie-laden headers and redirects
    disabled.

    :param nextPage: full URL of the page to fetch.
    :param meta: mutable dict; ``firstUrl`` is written into it.
    :param item: city descriptor forwarded to returnBuiltHeaders.
    :param Referer: optional Referer header value.
    :return: ``(response, meta)`` once a 200 response is obtained.
    """
    path_params = '/' + '/'.join(nextPage.split('/')[-3:])
    headers = self.returnBuiltHeaders(path=path_params,
                                      RefererUrl=Referer, item=item)
    meta['firstUrl'] = headers['sourceUrl']
    # 'sourceUrl' is only smuggled through the headers dict by
    # returnBuiltHeaders; it must not be sent as a real HTTP header.
    del headers['sourceUrl']
    while True:
        try:
            # warm-up request 1: URL with its last character stripped
            first_url = nextPage[:-1]
            first_res = requests.get(
                first_url, headers={'user-agent': random_useragent()})
            # encoding='utf-8' so Chinese page text never raises
            # UnicodeEncodeError on locale-encoded platforms
            with open('ss1.html', 'w', encoding='utf-8') as f:
                f.write(first_res.text)
            # warm-up request 2: same page over plain http
            second_url = nextPage.replace('https', 'http')
            second_res = requests.get(
                second_url, headers={'user-agent': random_useragent()})
            with open('ss2.html', 'w', encoding='utf-8') as f:
                f.write(second_res.text)
            # the real request: redirects disabled so anti-bot 30x
            # responses count as failures and are retried
            response = requests.get(url=nextPage, headers=headers,
                                    timeout=3, allow_redirects=False,
                                    proxies=self.reutnRequestsProxies())
            if response.status_code == 200:
                return response, meta
            print(response)
        except Exception as e:
            print(e)
        time.sleep(random.randint(2, 5))
def get_cityid(self, url):
    """Fetch *url* and return the value of its ``#cityId`` element.

    Retries forever, printing a progress line each attempt and
    sleeping a random 2-5 seconds after each failure.

    :param url: page URL carrying the hidden cityId field.
    :return: the ``value`` attribute of the ``#cityId`` element.
    """
    while True:
        print('###############获取城市ID{}'.format(url))
        try:
            resp = requests.get(
                url,
                headers={'user-agent': random_useragent()},
                proxies=AbuyunSpider.returnRequestProxies())
        except Exception as err:
            print(err)
        else:
            if resp.status_code == 200:
                return pq(resp.text)('#cityId').attr('value')
        time.sleep(random.randint(2, 5))
def returnBuiltHeaders(self, path, RefererUrl=None):
    """Build browser-like request headers for www.chenyang.com.

    :param path: request path (unused here; kept for interface
        compatibility with the other spiders' returnBuiltHeaders).
    :param RefererUrl: optional Referer header value.
    :return: dict of HTTP headers with a randomized user-agent.
    """
    headers = {
        "Accept": 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        "Accept-Encoding": "gzip, deflate",
        # FIX: the original values contained stray spaces
        # ("zh - CN, zh;q = 0.9", "max - age = 0", "keep - alive"),
        # which are not valid header values per RFC 7231/7234;
        # normalized to the standard forms.
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Cache-Control": "max-age=0",
        "Connection": "keep-alive",
        "Host": "www.chenyang.com",
        "Upgrade-Insecure-Requests": "1",
        "user-agent": random_useragent(),
    }
    if RefererUrl:
        headers['Referer'] = RefererUrl
    return headers
def returnBuiltHeaders(self, path, RefererUrl=None):
    """Assemble the AJAX-style headers for the Dulux store-locator
    endpoint.

    :param path: unused; kept for a uniform spider interface.
    :param RefererUrl: optional Referer header override.
    :return: dict of HTTP headers with a randomized user-agent.
    """
    header_pairs = [
        ("accept", "application/json, text/plain, */*"),
        ("accept-encoding", "gzip, deflate, br"),
        ("accept-language", "zh-CN,zh;q=0.9"),
        ("if-none-match", "1591637021-1"),
        ("referer", "https://www.dulux.com.cn/zh/find-a-stockist"),
        ("sec-fetch-dest", "empty"),
        ("sec-fetch-mode", "cors"),
        ("sec-fetch-site", "same-origin"),
        ("user-agent", random_useragent()),
    ]
    built = dict(header_pairs)
    if RefererUrl:
        built['Referer'] = RefererUrl
    return built
def returnBuiltHeaders(self, path, item, RefererUrl=None, page=None):
    """Build the forged-session headers (mainly a cookie string) for
    to8to.com company-listing requests.

    :param path: request path (not read in this body; part of the
        shared returnBuiltHeaders interface).
    :param item: city descriptor; must contain ``city``, ``city_num``,
        ``city_type`` and may carry ``sourceUrl`` / ``firstUrl`` from a
        previous page.
    :param RefererUrl: optional Referer header value.
    :param page: when falsy, a fixed landing page is used; otherwise
        the landing page is the first-page URL.
    :return: headers dict; the extra ``sourceUrl`` entry is NOT a real
        HTTP header — process_request pops it into meta and deletes it
        before sending.
    """
    city = parse.quote(item['city'])
    city_num = item['city_num']
    city_type = item['city_type']
    # fallback values, percent-encoded once ('%3A%2F%2F' == '://')
    sourceUrl_built_url = 'https%3A%2F%2F{}.to8to.com%2Fcompany%2F'.format(
        city_num)
    firstUrl_built_url = 'https://{city_num}.to8to.com/company/{city_type}/'.format(
        city_num=city_num, city_type=city_type)
    # prefer the URLs carried in item; otherwise encode the fallbacks
    sourceUrl = item['sourceUrl'].replace('://', '%3A%2F%2F').replace(
        '/', "%2F") if item.get('sourceUrl') else sourceUrl_built_url
    firstUrl = item.get('firstUrl').replace('://', '%3A%2F%2F').replace(
        '/', "%2F") if item.get('firstUrl') else firstUrl_built_url.replace(
        ':/', '%3A%2F%2F').replace('/', "%2F")
    # nowpage is DOUBLE percent-encoded ('%253A' decodes to '%3A')
    nowpage = item.get('firstUrl').replace(
        '://', '%253A%252F%252F').replace('/', '%252F') if item.get(
        'firstUrl') else firstUrl_built_url.replace(
        ':/', '%253A%252F%252F').replace('/', "%252F")
    if not page:
        landpage = 'https%3A//sz.to8to.com/'
    else:
        landpage = firstUrl
    headers = {
        # browser headers below were disabled by the original author;
        # kept for reference
        # "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        # "accept-encoding": "gzip, deflate, br",
        # "accept-language": "zh-CN,zh;q=0.9",
        # "cache-control": "no-cache",
        # "pragma": "no-cache",
        # "sec-fetch-dest": "document",
        # "sec-fetch-mode": "navigate",
        # "sec-fetch-site": "cross-site",
        # "sec-fetch-user": "******",
        # "upgrade-insecure-requests": '1',
        "user-agent": random_useragent(),
        # cookie string imitating a prior browsing session; the fixed
        # ids/timestamps are captured values, the {placeholders} are
        # filled per city/request below
        "cookie": "uid=CgoKUF61XAWCtJc0A7vvAg==; "
                  "to8tocookieid=f982c677f999237b9fe9e5ee3947f2cc806225; "
                  "tracker2019session=%7B%22session%22%3A%2217201522ce2109-0c32fb225c0495-14291003-2073600-17201522ce3201%22%7D; "
                  "tracker2019jssdkcross=%7B%22distinct_id%22%3A%2217201522ce612f-03e21b7f111ee1-14291003-2073600-17201522ce71e%22%7D; "
                  "to8to_sourcepage=; to8to_landtime=1589160062; "
                  "to8to_cook=OkOcClPzRWV8ZFJlCIF4Ag==; "
                  "to8to_townid=1103; to8to_tcode=sh; "
                  "to8to_tname=%E4%B8%8A%E6%B5%B7; "
                  "Hm_lvt_dbdd94468cf0ef471455c47f380f58d2=1589160063; "
                  "tender_popup_flag=true;"
                  " ONEAPM_BI_sessionid=9238.924|1589197648127; "
                  "Hm_lpvt_dbdd94468cf0ef471455c47f380f58d2={times}; act=freshen;"
                  "to8to_landpage={landpage}; "
                  "to8to_tcode={city_num}; to8to_tname={city}; "
                  "to8to_cmp_sourceUrl={sourceUrl}; "
                  "to8to_cmp_firstUrl={firstUrl}; "
                  "to8to_nowpage={nowpage}; ".format(city=city,
                                                     city_num=item['city_num'],
                                                     sourceUrl=sourceUrl,
                                                     firstUrl=firstUrl,
                                                     nowpage=nowpage,
                                                     landpage=landpage,
                                                     times=now_to_timestamp())
    }
    if RefererUrl:
        headers['Referer'] = RefererUrl
    # side-channel entry consumed (and deleted) by process_request
    headers['sourceUrl'] = sourceUrl
    return headers