def parse_forum_block_list(self, response: HtmlResponse):
    post_list = response.css('table#threadlisttableid')
    post_urls = []
    for post in post_list.css('tbody[id!="separatorline"]'):
        loader = ItemLoader(item=LyCommunityPostItem(), selector=post,
                            base_url='http://www.lysq.com/')
        loader.add_value('block_name', response.meta['forum_block'])
        loader.add_css('title', 'a.s.xst::text')
        loader.add_css('url', 'a.s.xst::attr("href")')
        loader.add_css('author_username', 'td.by:nth-child(3) a::text')
        loader.add_css('created_time', 'td.by:nth-child(3) span::text')
        loader.add_css('last_comment_username', 'td.by:nth-child(5) a::text')
        loader.add_css('last_comment_time', 'td.by:nth-child(5) em span::attr("title")')
        loader.add_css('last_comment_time', 'td.by:nth-child(5) em span::text')
        loader.add_css('last_comment_time', 'td.by:nth-child(5) em a::text')
        loader.add_css('comment_count', 'td.num a::text')
        loader.add_css('view_count', 'td.num em::text')
        has_image = bool(post.css('th.common img[alt="attach_img"]'))
        loader.add_value('has_image', has_image)
        has_attachment = bool(post.css('th.common img[alt="attachment"]'))
        loader.add_value('has_attachment', has_attachment)
        item = loader.load_item()
        post_urls.append(item['url'])
        yield item
    for post_url in post_urls:
        # post_url is relative and points at the first page of the post,
        # so response.url cannot be used here.
        yield Request(
            response.urljoin(post_url),
            callback=self.parse_forum_post,
            meta={'post_url': post_url, 'page': response.meta['page']})
    next_page = response.css('div.pg a.nxt::attr("href")').extract_first()
    if next_page:
        yield Request(response.urljoin(next_page),
                      callback=self.parse_forum_block_list,
                      meta={**response.meta, 'page': response.meta['page'] + 1})

def parse_main_page(self, response: HtmlResponse):
    car_types = response.xpath(
        "//select[@id='make_id']/option[position()>1]/text()").getall()
    for car in car_types:
        car = car.replace(' ', '-').replace('&', 'and')
        car = response.urljoin('/tuning-specs/' + car)
        yield scrapy.Request(car, callback=self.parse_models_page)

def parse_json(self, response):
    # Strip the single wrapping character on each side before decoding.
    data = response.body[1:-1]
    js = json.loads(data)
    # Rebuild an HtmlResponse from the HTML embedded in the JSON payload
    # so that CSS selectors and urljoin() are available on it.
    response = HtmlResponse(url=response.url, body=js['data'].encode('utf8'))
    for href in response.css(settings["el_nacional"]['links']):
        full_url = response.urljoin(href.extract())
        yield scrapy.Request(full_url, callback=self.parse_links)

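# A self-contained sketch of the unwrap-then-reparse pattern used in
# parse_json() above; the payload and url are made up for illustration. The
# endpoint is assumed to wrap JSON in one extra character on each side, with
# the real HTML under the 'data' key.
import json
from scrapy.http import HtmlResponse

raw = b'({"data": "<a href=\\"/news/1\\">headline</a>"})'[1:-1]
payload = json.loads(raw)
page = HtmlResponse(url='http://example.com/feed',
                    body=payload['data'].encode('utf8'))
assert page.urljoin(page.css('a::attr(href)').get()) == 'http://example.com/news/1'
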
def parse(self, response):
    urls = response.css('li.lista-noticia-item > a::attr(href)').extract()
    for url in urls:
        url = response.urljoin(url)
        yield scrapy.Request(url=url, callback=self.detalhes_noticia,
                             dont_filter=True)
    i = 0
    while True:
        i += 1
        # Click through to the next page with Selenium, then rebuild a
        # Scrapy response from the rendered page source.
        self.driver.find_element_by_link_text('Próxima').click()
        response = HtmlResponse(self.driver.current_url,
                                body=self.driver.page_source,
                                encoding='utf-8')
        urls = response.css('li.lista-noticia-item > a::attr(href)').extract()
        for url in urls:
            url = response.urljoin(url)
            yield scrapy.Request(url=url, callback=self.detalhes_noticia,
                                 dont_filter=True)
        print(i)
        time.sleep(3)
        if i > 500:
            break
    self.escrever_dados()

def get_video_urls(url):
    rsp = requests.get(url)
    # Wrap the raw requests response in a Scrapy HtmlResponse so that
    # XPath selectors and urljoin() are available.
    response = HtmlResponse(body=rsp.content, url=rsp.url)
    video_urls = response.xpath(
        '//ul[contains(@class, "ff-playurl") and contains(@class, "active")]'
        '/li/a/@href').extract()
    return [response.urljoin(u) for u in video_urls]

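# A minimal usage sketch for get_video_urls(); the listing url below is
# hypothetical and only illustrates the expected call shape.
if __name__ == '__main__':
    for video_url in get_video_urls('https://example.com/play/12345.html'):
        print(video_url)  # absolute episode urls, resolved via urljoin()
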
def _parse_links(self, item):
    """Parse or generate links."""
    res = HtmlResponse(url=item["url"], body=item["description"], encoding="utf-8")
    links = []
    for link in res.css("a"):
        link_href = res.urljoin(link.attrib["href"])
        link_title = " ".join(link.css("* ::text").extract()).strip()
        if "viewform" not in link_href and link_title:
            links.append({"href": link_href, "title": link_title})
    return links

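# A minimal, made-up check of the technique _parse_links() relies on: building
# an HtmlResponse from an HTML fragment and resolving its relative hrefs.
from scrapy.http import HtmlResponse

res = HtmlResponse(url="https://example.org/events/1",
                   body='<a href="/docs/agenda.pdf">Agenda</a>',
                   encoding="utf-8")
assert res.urljoin(res.css("a")[0].attrib["href"]) == "https://example.org/docs/agenda.pdf"
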
def se_res_parser(self, response: HtmlResponse):
    """
    Reference: https://docs.scrapy.org/en/latest/intro/tutorial.html#more-examples-and-patterns
    :param response: the response for each requested url, delivered by the
        framework through an asynchronous callback
    :return:
    """
    try:
        filter_: Filter = Filter(self.cursor, PLATFORM_CODE)
        all_ori_urls: List[str] = filter_.fetch_urls_from_detail_url_tb()
    except MySQLError:
        import traceback
        traceback.print_exc()
        self.cursor.close()
        self.conn.close()
        self.log('The DB connection has closed!')
        return  # without the url list there is nothing to deduplicate against
    soup_god: Soup = Soup(response.text, 'lxml')
    # All property <a> tags on the search-result page, as Soup objects.
    all_comm_soup: List[Soup] = soup_god.select('p.house-name a')
    for comm in all_comm_soup:
        # Build the full url with the framework's urljoin().
        href: str = response.urljoin(comm.get('href'))
        if href in all_ori_urls:
            self.log('Hit the crawled community of {}!'.format(href))
            continue
        self.detail_item['comm_url'] = href
        self.detail_item['city_id'] = self.helper.get_city_id_by_url(href)
        self.detail_item['commit_time'] = self.helper.date_getter()
        self.detail_item['comm_from'] = PLATFORM_CODE
        self.detail_item['is_new'] = 0
        yield self.detail_item
    next_page_wrapper = re.search(r'尾页(.*)下一页', response.text)
    if next_page_wrapper is not None:
        next_page_wrapper_txt: str = next_page_wrapper.group().replace('"', '')
        next_page_reg = re.search(r'href=(.+) class', next_page_wrapper_txt)
        if next_page_reg is not None:
            next_page: str = response.urljoin(next_page_reg.group(1))
            yield scrapy.Request(next_page, callback=self.se_res_parser)
            self.helper.avoid_anti_crawling(
                20, 'before the next searching page', self)

def get_video_channels(url):
    rsp = requests.get(url)
    response = HtmlResponse(body=rsp.content, url=rsp.url)
    channels = response.xpath(
        '//div[contains(@class, "tab-content")]//ul[contains(@class, "ff-playurl")]')
    video_channels = []
    for channel in channels:
        video_urls = channel.xpath('./li/a/@href').extract()
        video_channels.append([response.urljoin(u) for u in video_urls])
    return video_channels

def parse(self, response: HtmlResponse):
    li_eles = response.selector.css('.container li')
    for ele in li_eles:
        cmd_group = ele.attrib['data-group']
        cmd_name = ele.attrib['data-name']
        detail_url = ele.css('a').attrib['href']
        yield Request(
            response.urljoin(detail_url),
            callback=self.parse_details,
            meta={'cmd_group': cmd_group, 'cmd_name': cmd_name}
        )

def parse(self, response: HtmlResponse):
    # response is the response object and exposes the common attributes:
    # print(response.encoding)  # cannot be modified
    # print(response.headers)
    # print(response.status)
    # print(response.url)
    # print(response.request.url)
    # Query the page's title tag directly; xpath() returns a Selector object:
    # print(response.xpath('//title/text()').extract())
    # print(response.selector.xpath('//title/text()').extract())
    # print(response.css('div[class="author clearfix"] a'))
    # print(response.text)  # text content
    # print(response.body)  # page content as bytes
    articles = response.xpath('//div[starts-with(@class,"article")]')
    for art in articles:
        # art is of type Selector
        try:
            name = art.xpath('./div[1]//img/@alt').extract()[0]
            img = art.xpath('./div[1]//img/@src').extract()[0]
            content = art.xpath(
                './/div[@class="content"]/span[1]/text()').extract()
        except IndexError:
            pass
        else:
            from qiubai.items import QiubaiItem
            item = QiubaiItem()
            item['name'] = name
            item['img'] = 'http:' + img
            item['content'] = ''.join(content).replace('\n', '')
            # Hand the item over to the pipeline; a plain dict could be
            # yielded here instead of an Item.
            yield item
    # Read the next page.
    next_url = response.xpath(
        '//ul[@class="pagination"]/li[last()]/a/@href').extract()[0]
    next_page_url = response.urljoin(next_url)
    if next_page_url:
        # Request the next page.
        yield scrapy.Request(next_page_url, callback=self.parse)

def parse(self, response: HtmlResponse):
    # response is the response object; common attributes:
    # print(response.encoding)  # cannot be modified
    # print(response.headers)
    # print(response.status)
    # print(response.url)
    # print(response.request.url)
    # Query the page's title tag directly:
    # print(response.xpath('//title/text()'))
    # print(response.text)  # text content
    # print(response.body)
    # print(response.selector.xpath('//div[starts-with(@class,"author")]/a'))
    # print(response.css('div[class="author clearfix"] a'))
    articles = response.xpath('//div[starts-with(@class,"article")]')
    for article in articles:
        try:
            name = article.xpath('./div[1]//img/@alt').extract()[0]
            img = article.xpath('./div[1]//img/@src').extract()[0]
            content = article.xpath('.//div[@class="content"]/span[1]/text()').extract()
        except IndexError:
            pass
        else:
            # Hand the item data to the pipeline as a plain dict
            # (a QiubaiItem could be used here instead).
            yield {
                'name': name,
                'img': 'http:' + img,
                'content': ''.join(content).replace('\n', ''),
            }
    # Read the next page.
    next_url = response.xpath('//ul[@class="pagination"]/li[last()]/a/@href').extract()[0]
    next_page_url = response.urljoin(next_url)
    print(next_page_url)
    yield scrapy.Request(next_page_url, callback=self.parse)

def parse(self, response):
    # Wiener Linien returns HTML with an XML content type, which creates an
    # XmlResponse.
    response = HtmlResponse(url=response.url, body=response.body)
    for item in response.css('.block-news-item'):
        il = FeedEntryItemLoader(response=response,
                                 timezone=self._timezone,
                                 base_url='http://{}'.format(self.name))
        link = response.urljoin(item.css('a::attr(href)').extract_first())
        il.add_value('link', link)
        il.add_value('title', item.css('h3::text').extract_first())
        il.add_value('updated', item.css('.date::text').extract_first())
        yield scrapy.Request(link, self.parse_item, meta={'il': il})

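# A hedged alternative to constructing a new HtmlResponse by hand, as above:
# Response.replace() accepts a cls argument, so an XmlResponse can be re-typed
# while keeping its url, headers and body, e.g.
#
#   response = response.replace(cls=HtmlResponse)
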
def search(keyword):
    url = SEARCH_URL.format(keyword=keyword)
    rsp = requests.get(url)
    response = HtmlResponse(body=rsp.content, url=rsp.url)
    search_result = response.xpath(
        '//div[@class="container ff-bg"]/ul[contains(@class, "list-unstyled")]/li')
    videos = []
    for item in search_result:
        status = item.xpath('.//span[@class="continu"]/text()').extract_first()
        videos.append({
            'name': item.xpath('./h2/a/text()').extract_first(),
            'url': response.urljoin(item.xpath('./h2/a/@href').extract_first()),
            'status': status.strip() if status else ''
        })
    return videos

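# A minimal usage sketch for search(); SEARCH_URL is assumed to be a template
# defined elsewhere in the module, e.g. 'https://example.com/search?q={keyword}'
# (the exact template is an assumption).
if __name__ == '__main__':
    for video in search('三国'):
        print(video['name'], video['status'], video['url'])
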
def parse_product_list(self, response):
    if not isinstance(response, HtmlResponse):
        # AJAX pages arrive as JSON with the HTML under the 'data' key;
        # rebuild an HtmlResponse so the XPath below works either way.
        try:
            data = demjson.decode(response.body)
            response = HtmlResponse(response.url, body=data['data'],
                                    encoding='utf-8', request=response.request)
        except Exception:
            self.log('No valid json found in %s' % response.url)
            return
    products = response.xpath(u'//a[@itemprop="url"]/@href').extract()
    for url in products:
        url = response.urljoin(url)
        yield Request(url, callback=self.parse_product)
    if products:
        pages = response.xpath(
            '//ul[contains(@class, "uk-pagination")]//a/@href').extract()
        for page in pages:
            yield Request(response.urljoin(page), callback=self.parse_product_list)

def parse(self, response: HtmlResponse):
    # response is the response object; common attributes:
    # print(response.encoding)  # cannot be modified
    # headers holds bytes data as a dict whose values are lists, e.g.
    # {b'Server': [b'openresty'], b'Date': [b'Wed, 20 Jun 2018 01:59:00 GMT']}
    # print(response.headers)
    # print(response.status)
    # print(response.url)  # the response url, same as response.request.url
    # print(response.request.url)
    # print(response.body)  # bytes
    # print(response.text)  # text
    # xpath() can be issued directly to query nodes; it returns a list of
    # Selector objects, and extract()/extract_first() pull out their content:
    # print(response.xpath('//div[@class="content"]/span/text()').extract_first())
    # print(response.css('div[class="author clearfix"] a'))
    articles = response.xpath('//div[starts-with(@class,"article")]')
    for article in articles:
        try:
            # article is of type Selector
            name = article.xpath('./div[1]//img/@alt').extract()[0]
            img = article.xpath('./div[1]//img/@src').extract()[0]
            content = article.xpath('.//div[@class="content"]/span[1]/text()').extract()
        except IndexError:
            pass
        else:
            print(name, img)
            print(''.join(content).replace('\n', ''))
            # Hand the item data over to the item pipeline.
            yield {
                'name': name,
                'img': 'http:' + img,
                'content': ''.join(content).replace('\n', ''),
            }
    # Read the next page and join it against start_url,
    # e.g. https://www.qiushibaike.com/8hr/page/2/
    next_url = response.xpath('//ul[@class="pagination"]/li[last()]/a/@href').extract()[0]
    next_page_url = response.urljoin(next_url)
    # Request the next page; callback is the callback function.
    yield scrapy.Request(next_page_url, callback=self.parse)
    print(next_url)

def parse(self, response):
    # Wiener Linien returns HTML with an XML content type, which creates an
    # XmlResponse.
    response = HtmlResponse(url=response.url, body=response.body)
    for item in response.css(".block-news-item"):
        il = FeedEntryItemLoader(
            response=response,
            timezone="Europe/Vienna",
            ignoretz=True,
            base_url="https://www.{}".format(self.name),
        )
        link = response.urljoin(item.css("a::attr(href)").extract_first())
        il.add_value("link", link)
        il.add_value("title", item.css("h3::text").extract_first())
        il.add_value("updated", item.css(".date::text").extract_first())
        yield scrapy.Request(link, self.parse_item, meta={"il": il})

def parse(self, response: HtmlResponse):
    # Work out the current page number from the url.
    url_list = response.url.split('page=')
    if len(url_list) == 1:
        current_page_num = 1
    else:
        current_page_num = int(url_list[1])
    # Handle a missing or expired cookie.
    if 'login.php' in response.url:
        raise ValueError('cookie is not set or has expired')
    else:
        myvideo_list = response.css('div.maindescwithoutborder')
        video_info_list = myvideo_list.css('a')
        self.logger.warn('Parsed {0}; found {1} videos'.format(
            response.url, len(video_info_list)))
        link_and_title_dict = {}
        for item in video_info_list:
            title = item.css('::text').extract_first()
            link = item.css('a::attr(href)').extract_first()
            # 'email protected' sometimes appears here for unknown reasons.
            if 'email protected' in title:
                continue
            link_and_title_dict[link] = title
            # Hand off to another callback to resolve the real video url.
            yield scrapy.Request(url=link, callback=self.parse_my_follow_real_link)
        self.logger.warn('Parsed {0} videos in total'.format(len(link_and_title_dict)))
        # Record the current page's content.
        yield SaveMovieInfoItem(page_number=current_page_num,
                                movie_link_and_name=link_and_title_dict)
        self.logger.warn('Finished parsing {0}; checking for a next page'.format(
            response.url))
        next_page_tag = response.css('a[href*="?&page="]')
        for i in next_page_tag:
            if '»' == i.css('a::text').extract_first():
                ori_link = i.css('a::attr(href)').extract_first()
                next_link = response.urljoin(ori_link)
                self.logger.warn('Next page found')
                next_headers = {
                    'Cookie': self.cookie,
                    'Referer': response.url
                }
                yield scrapy.Request(url=next_link, callback=self.parse,
                                     headers=next_headers)

def push_to_mq(self, item):
    """Push the item to RabbitMQ.

    :param item:
    """
    # Push search items.
    if isinstance(item, SearchItem):
        if item.get('Content'):
            body = self.packaged_search(item)
            self.channel.basic_publish(exchange='',
                                       routing_key=self.news_queue,
                                       body=body)
        # Push everything else.
        url = item.get("Url")
        # 1. Push the images embedded in the content.
        content = item.get("Content")
        if content:
            response = HtmlResponse(url=url, body=content, encoding='utf8')
            img_list = response.xpath("//img/@src").extract()
            if img_list:
                image_list = [response.urljoin(img_url) for img_url in img_list]
                body = self.packaged_data(website=self.website, url=url,
                                          resource_urls=image_list,
                                          resource_type="Picture",
                                          content=content)
                self.channel.basic_publish(exchange='',
                                           routing_key=self.image_queue,
                                           body=body)
        # 2. Push the attachments of the search content to MQ.
        pdf_file = item.pop("pdf_file") if item.get('pdf_file') else None
        if pdf_file:
            pdf_file_list = json.loads(pdf_file).get("附件")  # key: "attachments"
            body = self.packaged_data(website=self.website, url=url,
                                      resource_urls=pdf_file_list,
                                      resource_type="Pdf")
            self.channel.basic_publish(exchange='',
                                       routing_key=self.file_queue,
                                       body=body)
    # Push expert items.
    elif isinstance(item, ExpertItem):
        body = self.packaged_expert(item)
        self.channel.basic_publish(exchange='',
                                   routing_key=self.expert_queue,
                                   body=body)
        # 3. Push the expert's head portrait (picture).
        head_portrait = item.get("img_url")
        url = item.get("experts_url")
        if head_portrait:
            body = self.packaged_data(website=self.website, url=url,
                                      resource_urls=[head_portrait],
                                      resource_type="Picture")
            self.channel.basic_publish(exchange='',
                                       routing_key=self.expert_img_queue,
                                       body=body)

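# A small, made-up illustration of the image-extraction step above: stored
# HTML content is wrapped in an HtmlResponse so relative image urls can be
# resolved against the article url.
from scrapy.http import HtmlResponse

resp = HtmlResponse(url='http://example.com/news/1',
                    body='<p><img src="/img/a.png"><img src="/img/b.png"></p>',
                    encoding='utf8')
assert [resp.urljoin(u) for u in resp.xpath('//img/@src').extract()] == [
    'http://example.com/img/a.png', 'http://example.com/img/b.png']
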
def parse(self, response: HtmlResponse):
    json_resp = self._process_dirty_json(response)
    for hero in json_resp['soul']:
        yield SanguoOlHeroItem(
            name=hero['name'],
            pic=response.urljoin(hero['pic']),
            name_pinyin=hero['pinyin'],
            sex=hero['sex'],
            name_zi=hero['zi'],
            life_range=hero['shengsi'],
            come_from=hero['jiguan'],
            brief=hero['content'],
            cata=hero['cata']
        )
    if json_resp['page'] < json_resp['mpage']:
        yield response.follow(
            url=self.LIST_API.format(page=json_resp['page'] + 1),
            callback=self.parse
        )

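# response.follow() resolves a relative url against the current response
# before building the Request, so the pagination above is equivalent to the
# more verbose form below (assuming LIST_API is a relative url template):
#
#   yield scrapy.Request(
#       response.urljoin(self.LIST_API.format(page=json_resp['page'] + 1)),
#       callback=self.parse,
#   )
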
def get_video_detail(detail_url):
    video_detail = {}
    try:
        rsp = requests.get(detail_url)
        response = HtmlResponse(body=rsp.content, url=rsp.url)
        video_detail['title'] = response.xpath(
            '//div[@class="media"]//a[@class="ff-text"]/text()').extract_first()
        # Further metadata (actors, directors, categories, area, year) can be
        # pulled from the <dd> following the matching <dt>, e.g.:
        # video_detail['actors'] = response.xpath(
        #     '//div[@class="media"]//dt[contains(text(), "主演")]'
        #     '/following-sibling::dd[1]//text()').extract()
        data_active = response.xpath(
            '//ul[contains(@class, "ff-playurl") and contains(@class, "active")]'
            '/@data-active').extract_first()
        video_detail['episode'] = response.xpath(
            '//li[@data-id="{data_active}"]//text()'.format(
                data_active=data_active)).extract_first()
        player_url = response.xpath(
            '//div[@id="cms_player"]/script[1]/@src').extract_first()
        rsp = requests.get(response.urljoin(player_url))
        video_info = json.loads(
            re.findall(r'var cms_player = (\{[\s\S]+?\});', rsp.text)[0])
        # Rebuild the player url with the query params each provider expects.
        req = PreparedRequest()
        url = video_info['url']
        if video_info['name'] == 'haokan_baidu':
            params = {}
            req.prepare_url(url, params)
        elif video_info['name'] == '360biaofan':
            params = {'time': video_info['time'],
                      'auth_key': video_info['auth_key']}
            req.prepare_url(url, params)
        rsp = requests.get(req.url)
        script_text = Selector(text=rsp.text).xpath('//script/text()').extract_first()
        video_detail['video_url'] = re.findall(r'url: *\"(\S+?)\"', script_text)[0]
        return video_detail
    except KeyboardInterrupt:
        print('Interrupted')
        exit(0)
    except Exception:
        return video_detail

def parse(self, response: HtmlResponse):
    reque = response.request  # type: scrapy.Request
    # Fetch fresh proxies while fewer than 3 are available.
    while len(PROXY_LIST) < 3:
        get_proxy_url = ('http://webapi.http.zhimacangku.com/getip'
                         '?num=1&type=2&pro=0&city=0&yys=0&port=1&pack=126415'
                         '&ts=1&ys=1&cs=1&lb=1&sb=0&pb=45&mr=1&regions=')
        proxy_obj = requests.get(get_proxy_url).content.decode()
        proxy_obj = json.loads(proxy_obj)
        if proxy_obj['success']:
            temp = {
                'ip': proxy_obj['data'][0]['ip'],
                'port': proxy_obj['data'][0]['port'],
                'expire_time': proxy_obj['data'][0]['expire_time'],
                'city': proxy_obj['data'][0]['city'],
                'isp': proxy_obj['data'][0]['isp'],
            }
            PROXY_LIST.append(temp)
    # print(response.request.meta)
    # print(response.request.headers['User-Agent'])
    movie_el_list = response.xpath('//*[@class="grid_view"]/li')
    for movie_el in movie_el_list:
        item = DoubanmovieItem()
        item['movie_name_zh'] = movie_el.xpath(
            './/span[@class="title"][1]/text()').extract_first()
        item['score'] = movie_el.xpath(
            './/span[@class="rating_num"]/text()').extract_first()
        item['description'] = movie_el.xpath(
            './/span[@class="inq"]/text()').extract_first()
        yield item
    next_url = response.xpath(
        '//span[@class="next"]/a/@href').extract_first()
    if next_url is not None:
        url = response.urljoin(next_url)
        yield scrapy.Request(url=url)

def parse(self, response):
    href = response.selector.xpath(
        '//div[@id="sogou_vr_11002301_box_0"]/@href').extract()[0]
    # Render the page with PhantomJS and capture the generated body.
    cmd = "~/bin/phantomjs ./getBody.js '%s'" % href
    time.sleep(1)
    stdout, stderr = subprocess.Popen(cmd, shell=True,
                                      stdout=subprocess.PIPE,
                                      stderr=subprocess.PIPE).communicate()
    print(stderr)
    response = HtmlResponse(url=href, body=stdout)
    for selector in Selector(response=response).xpath('//*[@id="history"]/div/div/div/div'):
        href = selector.xpath('h4/@hrefs').extract()[0].strip()
        title = ""
        for elem in selector.xpath('h4/text()').extract():
            if len(elem.strip()) > 0:
                title = elem.strip()
        abstract = selector.xpath(
            '//*[contains(@class, "weui_media_desc")]/text()').extract()[0].strip()
        pubtime = selector.xpath(
            '//*[contains(@class, "weui_media_extra_info")]/text()').extract()[0].strip()
        full_url = response.urljoin(href)
        n = 0
        if len(title) != 0:
            # NOTE: string interpolation in SQL is injection-prone; a
            # parameterized query would be preferable here.
            sql = "select * from CrawlPage where title='%s'" % title
            n = self.cursor.execute(sql)
        if len(title) == 0 or n == 0:
            yield scrapy.Request(full_url, callback=self.parse_profile)

# -*- coding: utf-8 -*-
from scrapy.http import HtmlResponse

url = 'http://www.mzitu.com/32288'
response = HtmlResponse(url=url)

# url
assert response.url == url
# Without a trailing slash, urljoin() replaces the last path segment.
assert response.urljoin(url='2') == 'http://www.mzitu.com/2'
assert response.url + '/2' == 'http://www.mzitu.com/32288/2'

response = HtmlResponse(url=url + '/2')
# Negative-step slicing reads the path segments back from the end.
photo_number, album_number = response.url.split('/')[:2:-1]
assert photo_number == '2'
assert album_number == '32288'

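# For a response without a <base> tag, Response.urljoin() behaves like the
# standard library's urljoin with the response url as base, so the assertions
# above can be reproduced without Scrapy:
from urllib.parse import urljoin

assert urljoin('http://www.mzitu.com/32288', '2') == 'http://www.mzitu.com/2'
assert urljoin('http://www.mzitu.com/32288/', '2') == 'http://www.mzitu.com/32288/2'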