def newThread(self):
    # Flush the verified proxies to disk, clear the pool, then reload the
    # raw list and re-test every "ip:port" entry.
    with open("d:/IP/availableIP.txt", "w") as file:
        for ip in self.ip_pool:
            file.write(ip + '\n')
    self.ip_pool = []
    with open("d:/IP/ip.txt") as file:
        ipList = [line for line in file]
    ipSize = len(ipList)
    for xx in ipList:
        ip = str(xx)
        if ":" in ip:
            self.requestCount += 1
            rr = Request(
                url="https://baidu.com/",
                callback=lambda response, typeid=ip.strip(): self.parse_ipResponse(response, typeid))
            rr.meta['proxy'] = "http://" + ip.strip()
            print("ip test: %s (%d/%d)" % (ip.strip(), self.requestCount, ipSize))
            rr.dont_filter = True
            rr.meta['download_timeout'] = 5  # Scrapy expects the lowercase meta key
            if self.requestCount == ipSize:
                self.requestCount = 0
                # newThread() is a generator: it must be delegated to with
                # "yield from", not merely called, or its body never runs.
                yield from self.newThread()
                break
            yield rr
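The callback parse_ipResponse is referenced throughout these snippets but never shown. A minimal sketch, assuming its only job is to record proxies that answered successfully into self.ip_pool so that newThread() can flush them to disk (the method name and signature come from the snippets; the body is an assumption):

def parse_ipResponse(self, response, typeid):
    # Hypothetical body: any 200 response fetched through the proxy counts
    # as "alive", so remember its "ip:port" string for the next flush.
    if response.status == 200:
        self.ip_pool.append(typeid)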
def start_requests(self):
    # Endlessly cycle through the raw proxy list, testing each "ip:port"
    # entry against a random target site with a random User-Agent.
    while True:
        with open("d:/IP/ip.txt") as file:
            ipList = [line for line in file]
        ipSize = len(ipList)
        for xx in ipList:
            ip = str(xx)
            if ":" in ip:
                self.requestCount += 1
                rr = Request(
                    url=random.choice(self.webs),
                    callback=lambda response, typeid=ip.strip(): self.parse_ipResponse(response, typeid))
                rr.meta['proxy'] = "http://" + ip.strip()
                rr.headers.setdefault('User-Agent', random.choice(self.user_agent))
                print("ip test: %s (%d/%d)" % (ip.strip(), self.requestCount, ipSize))
                rr.dont_filter = True
                # rr.meta['download_timeout'] = 5
                yield rr
def start_requests(self): request = Request(url="http://www.qidian.com") request.meta['dont_retry'] = True # 代理池中为空的时候不等待代理,因为爬取代理的爬虫不使用爬取的代理 request.meta['dont_wait_proxy'] = True request.dont_filter = True yield request
def start_requests(self):
    with open("D:/Python/scrapy_learning/getproxiesIP/getproxiesIP/files/ip.txt") as file:
        for ip in file:
            ip = ip.strip()  # drop the trailing newline before building the proxy URL
            rr = Request(
                url="https://baidu.com/",
                callback=lambda response, typeid=ip: self.parse_ipResponse(response, typeid))
            rr.meta['proxy'] = "http://" + ip
            rr.dont_filter = True
            rr.meta['download_timeout'] = 5  # 'time_out' is not a Scrapy meta key
            yield rr
def start_requests(self):
    # Crawl a random page out of the first 100 proxy-list pages.
    flag = True
    while flag:
        flag = ProxySpiderSwitch.flag
        page = random.choice(range(1, 100))
        url = 'http://www.xicidaili.com/wt/%d' % page
        request = Request(url=url)
        # Skip ProxyFilterMiddleware and don't resend the request: if it
        # fails, let it fail.
        request.meta['dont_retry'] = True
        # When the proxy pool is empty, don't wait for a proxy: the spider
        # that crawls proxies must not itself depend on crawled proxies.
        request.meta['dont_wait_proxy'] = True
        request.dont_filter = True
        yield request
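dont_retry is honored by Scrapy's built-in RetryMiddleware, but dont_wait_proxy is a custom meta key, so some downloader middleware in this project must consume it. A minimal sketch of what such a middleware could look like (the class name ProxyFilterMiddleware comes from the comment above; the pool attribute and the waiting logic are assumptions):

import random
import time

class ProxyFilterMiddleware:
    # Hypothetical pool, filled elsewhere by the proxy-crawling spider.
    proxy_pool = []

    def process_request(self, request, spider):
        # Requests flagged dont_wait_proxy go out directly, even when the
        # pool is empty -- this is what the spiders above rely on.
        if request.meta.get('dont_wait_proxy'):
            return None
        # Assumed behavior: every other request waits for a proxy, then
        # gets one attached.
        while not self.proxy_pool:
            time.sleep(1)
        request.meta['proxy'] = "http://" + random.choice(self.proxy_pool)
        return None

In a real Twisted-based crawl a blocking sleep would stall the reactor; the sketch only illustrates where the meta key would be consumed.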
def gen_request(self, url, **kwargs):
    """
    Generate a Request.

    :param url: the url to generate the request for
    :return: Request
    """
    r = Request(
        url=url,
        callback=self.parse,
        errback=self.parse_err,
    )
    if 'dont_filter' in kwargs:
        r.dont_filter = True
    return r
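A call site might look like the following (the seed URL is hypothetical). Note that as written the method treats the mere presence of the dont_filter keyword as True, so even dont_filter=False disables the dupefilter:

def start_requests(self):
    # Hypothetical usage of gen_request from the same spider.
    yield self.gen_request('http://example.com/seed', dont_filter=True)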
def retry_request_with_get(self, request: Request) -> Generator[Request, None, None]:
    request.method = 'GET'
    request.dont_filter = True
    yield request
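One plausible call site, assuming the spider wants to re-issue a failed non-GET request as a GET from its errback (the errback wiring is an assumption, not shown in the source):

def parse_err(self, failure):
    # Retry the failed request as a GET, bypassing the dupefilter.
    yield from self.retry_request_with_get(failure.request)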
def parse(self, response):
    # TODO: dispatch tasks first with utils' redisListUpload, otherwise the
    # spider blocks waiting for a task to arrive!
    url = response.url
    print("currently parsing:", url)
    request = response.request
    if '玄鸟' not in response.text and 'company_getinfos' not in url:
        # The sentinel string '玄鸟' only appears when logged in.
        print("Page reachable but login failed; the cookie needs updating")
        print(response.text)
        with open("../web_msg/qcc_loginErr.html", "w", encoding='utf-8') as fp:
            fp.write(response.text)
        raise Exception("Page reachable but login failed; the cookie needs updating")
    elif '#ipo' in url:
        print("IPO listing page, not collected")
        # Request.url is read-only; derive a copy pointing at the base tab.
        request = request.replace(url=request.url.replace('#ipo', '#base'),
                                  dont_filter=True)
        yield request
    elif 'base' in url or 'firm' in url:
        print("Basic-information page")
        item = QichachaHtmlItem()
        item['base_html'] = response.text
        item['name'] = response.css('h1::text')[0].extract()
        item['id'] = url.split('firm_')[1].split('.')[0]
        item['mid_requests'] = 1  # Xie Jun's Request download-framework MID
        print("Qichacha front page parsed:", item['name'], item['id'])
        requestNew = Request(
            url=self.URL_BASE.format(item['id'], item['name'], 'susong'))
        requestNew.meta['item'] = item
        requestNew.meta['item']['mid_requests'] = 2
        requestNew.priority = request.priority + 100
        requestNew.dont_filter = True
        yield requestNew
    elif 'susong' in url:
        print("Legal-proceedings page")
        request.meta['item']['susong_html'] = response.text
        request.meta['item']['mid_requests'] = 2  # Xie Jun's Request download-framework MID
        requestNew = Request(url=request.url.replace('susong', 'run'))
        requestNew.meta['item'] = request.meta['item']
        requestNew.priority = request.priority + 100
        requestNew.dont_filter = True
        yield requestNew
    elif 'run' in url:
        print("Operating-status page")
        request.meta['item']['run_html'] = response.text
        request.meta['item']['mid_requests'] = 2  # Xie Jun's Request download-framework MID
        requestNew = Request(url=request.url.replace('run', 'fengxian'))
        requestNew.meta['item'] = request.meta['item']
        requestNew.priority = request.priority + 100
        requestNew.dont_filter = True
        yield requestNew
    elif 'fengxian' in url:
        print("Business-risk page")
        request.meta['item']['fengxian_html'] = response.text
        request.meta['item']['mid_requests'] = 2  # Xie Jun's Request download-framework MID
        requestNew = Request(url=request.url.replace('fengxian', 'report'))
        requestNew.meta['item'] = request.meta['item']
        requestNew.priority = request.priority + 100
        requestNew.dont_filter = True
        yield requestNew
    elif 'report' in url:
        print("Annual-report page")
        request.meta['item']['report_html'] = response.text
        request.meta['item']['mid_requests'] = 2  # Xie Jun's Request download-framework MID
        requestNew = Request(url=request.url.replace('report', 'history'))
        requestNew.meta['item'] = request.meta['item']
        requestNew.priority = request.priority + 100
        requestNew.dont_filter = True
        yield requestNew
    elif 'history' in url:
        print("Historical-shareholders page")
        request.meta['item']['history_html'] = response.text
        request.meta['item']['mid_requests'] = 2  # Xie Jun's Request download-framework MID
        print("storing item")
        yield request.meta['item']
    else:
        print("Redirected to an unexpected page; stopping")
        print(response.text)
        raise Exception("Crawl error: URL contains no known keyword; stopping")
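The original '#ipo' branch assigned to request._set_url, which silently overwrites a bound method without ever changing the URL. In Scrapy, Request.url is a read-only property, and Request.replace() is the supported way to derive a modified copy of an existing request. A minimal illustration:

from scrapy import Request

r = Request(url='https://example.com/page#ipo')
r2 = r.replace(url=r.url.replace('#ipo', '#base'), dont_filter=True)
assert r2.url.endswith('#base') and r2.dont_filter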