# Imports these excerpts rely on (the original module header is not shown;
# the IpspiderItem import path below is an assumption -- adjust to the project):
import re

import scrapy
from scrapy.selector import Selector
# from ipspider.items import IpspiderItem


def parse_film(self, response):
    # The partially filled item is handed over from the listing callback.
    item = response.meta['item']
    # Scrapy responses expose .xpath() directly; wrapping them in
    # Selector(response=response) is redundant.
    film_rank = response.xpath(
        '/html/body/div[2]/div/div[1]/div[2]/div/ul/li[1]/p/text()'
    ).extract_first()
    # The view-count selector is not reliable yet, so use a placeholder:
    # film_viewcount = response.xpath('//*[@id="resource_views"]//text()').extract_first()
    film_viewcount = '0'
    film_class = response.xpath(
        '/html/body/div[2]/div/div[1]/div[1]/div[2]/div[2]/div/img/@src'
    ).extract_first()
    film_cover = response.xpath(
        '/html/body/div[2]/div/div[1]/div[1]/div[2]/div[1]/div[1]/a/img/@src'
    ).extract_first()
    # Keep only the text after the label's colon; partition() tolerates a
    # missing separator, where .index(':') would raise ValueError.
    if film_rank:
        film_rank = film_rank.partition(':')[2].strip()
    print(f'film_rank: {film_rank}')
    print(f'film_class: {film_class}')
    print(f'film_cover: {film_cover}')
    print(f'film_viewcount: {film_viewcount}')
    item['film_rank'] = film_rank
    item['film_class'] = film_class
    item['film_cover'] = film_cover
    item['film_viewcount'] = film_viewcount
    print(f"in parse_film, item: {item}")
    return item
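
# A minimal hardening sketch (an assumption, not the original spider's code):
# the absolute /html/body/... XPaths above break whenever the page layout
# shifts, so passing a default to extract_first() keeps the field populated
# instead of propagating None. extract_cover is a hypothetical helper that
# reuses the cover-image XPath from parse_film().
def extract_cover(response):
    return response.xpath(
        '/html/body/div[2]/div/div[1]/div[1]/div[2]/div[1]/div[1]/a/img/@src'
    ).extract_first(default='')
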
def parse(self, response):
    try:
        for div in response.xpath(
            "//div[@id='list']/table[@class='table']/tbody/tr/td[@class='ip']"
        ):
            # The site scatters the address across nested tags. findall()
            # yields the tag text for the capturing alternative and an empty
            # string for the non-capturing one, which marks the boundary
            # between IP and port.
            tag_str = re.findall(r'>(\S+)</|>\W+<', str(div.extract()))
            if not tag_str:
                continue
            ip_port = ""
            for temp_str in tag_str:
                if not temp_str:
                    # Empty match: insert the IP/port separator.
                    temp_str = ":"
                target_str = re.search(r'[^<>/]\d+|\d+\.|\.|\d+|:', temp_str)
                if target_str:
                    ip_port += target_str.group()
            print(ip_port)
            ip, port = ip_port.strip().split(":", 1)
            yield IpspiderItem(ip=ip, port=port)

        current_page = response.xpath(".//span[@class='current']/text()")
        if current_page:
            current_page = current_page.extract()[0]
            nextPage_list = Selector(response=response).re(
                r'<a href="(\S*)">\d+</a>'
            )
            # Find the first link whose page number is greater than the page
            # we are on; None means this is the last page.
            index = None
            for temp in nextPage_list:
                page_num = re.search(r'\d+', str(temp))
                if page_num and int(page_num.group()) > int(current_page):
                    index = nextPage_list.index(temp)
                    break
            if index is not None:
                next_page = nextPage_list[index]
                print(next_page)
                yield scrapy.Request(
                    url="http://www.goubanjia.com/free/" + str(next_page),
                    callback=self.parse,
                )
    except Exception as e:
        print(e)
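
# A standalone sketch of the next-page selection above, pulled out so it can
# be tested without a live response. The name pick_next_page is hypothetical;
# it mirrors the loop in parse(): return the first href whose page number is
# greater than the current page, or None when there is none.
def pick_next_page(hrefs, current_page):
    for href in hrefs:
        page_num = re.search(r'\d+', href)
        if page_num and int(page_num.group()) > int(current_page):
            return href
    return None

# Viewing page 2 with links to pages 1-3, page 3 is the one to follow:
assert pick_next_page(['index1.html', 'index2.html', 'index3.html'], '2') == 'index3.html'
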
def xt(cls, response):
    docs = []
    raw_docs = response.xpath(cls.LI_XPATH)
    for raw_doc in raw_docs:
        html_url, pdf_url = "", ""
        # Each entry may link to both an HTML and a PDF version.
        urls = raw_doc.css("a").xpath("@href").extract()
        for url in urls:
            if url.endswith(".pdf"):
                pdf_url = url
            elif url.endswith(".html"):
                html_url = url
        # The first anchor's text carries the title; keep only the part
        # before the slash. extract_first() tolerates a missing anchor and
        # partition() tolerates a title without a slash.
        title = raw_doc.xpath(".//a[1]/text()").extract_first() or ""
        title = title.partition("/")[0].strip()
        docs.append({"title": title, "html_url": html_url, "pdf_url": pdf_url})
    return docs
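
# A usage sketch with an invented fixture (the HTML, URL, and _Demo class are
# assumptions, not from the original project). It shows what LI_XPATH is
# expected to select and the shape of the dicts xt() returns.
from scrapy.http import HtmlResponse

class _Demo:
    LI_XPATH = "//ul/li"

_body = (b'<ul><li><a href="doc.html">A Sample Title / 2019</a>'
         b'<a href="doc.pdf">PDF</a></li></ul>')
_response = HtmlResponse(url="http://example.com/list", body=_body,
                         encoding="utf-8")
print(xt(_Demo, _response))
# [{'title': 'A Sample Title', 'html_url': 'doc.html', 'pdf_url': 'doc.pdf'}]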