def parse_do(self, response):
    item = DianyingItem()
    # Extract the download links from the detail page.
    # link = response.xpath('//*[@id="Zoom"]/table[2]/tbody/tr/td/a/@href').extract()
    link = response.xpath('//*[@id="Zoom"]//a/text()').extract()
    if not link:
        # Fall back to the alternative table layout used by some detail pages.
        link = response.xpath(
            '//*[@id="Zoom"]/table[1]/tbody/tr/td/anchor/a/text()'
        ).extract()
    # Join all links into one "##"-separated string.
    video_link = ""
    for i in link:
        video_link += i + "##"
    item['VideoId'] = str(uuid.uuid4())
    item['VideoLink'] = video_link
    item['VideoTitle'] = response.xpath(
        '//*[@id="header"]/div/div[3]/div[2]/div[6]/div[1]/h1/text()'
    ).extract()[0]
    try:
        item['VideoTag'] = response.xpath(
            '//*[@id="header"]/div/div[3]/div[2]/div[6]/div[2]/ul/div[1]/span[2]/a/text()'
        ).extract()[0]
    except Exception:
        item['VideoTag'] = ""
    try:
        item['VideoND'] = response.xpath(
            '//*[@id="header"]/div/div[3]/div[2]/div[6]/div[2]/ul/div[1]/span[3]/text()'
        ).extract()[0]
    except Exception:
        item['VideoND'] = ""
    try:
        item['VideoPF'] = response.xpath(
            '//*[@id="header"]/div/div[3]/div[2]/div[6]/div[2]/ul/div[1]/span[1]/strong/text()'
        ).extract()[0]
    except Exception:
        item['VideoPF'] = ""
    try:
        item['VideoContent'] = response.xpath('//*[@id="Zoom"]').extract()
    except Exception:
        item['VideoContent'] = ""
    imglink = response.xpath('//*[@id="Zoom"]/p[1]/img/@src').extract()
    item['VideoImg'] = imglink
    if imglink:
        # Use the file-name portion of the image URL.
        item['VideoImgName'] = imglink[0].split('/')[-1]
    yield item
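# The callbacks in this file all populate a DianyingItem, whose definition is
# not shown here. A minimal sketch, assuming the field names used by parse_do
# above (the other spiders would declare their own fields the same way):
import scrapy

class DianyingItem(scrapy.Item):
    VideoId = scrapy.Field()
    VideoLink = scrapy.Field()
    VideoTitle = scrapy.Field()
    VideoTag = scrapy.Field()
    VideoND = scrapy.Field()
    VideoPF = scrapy.Field()
    VideoContent = scrapy.Field()
    VideoImg = scrapy.Field()
    VideoImgName = scrapy.Field()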
def parse_get(self, response):
    # Crawl the thread listing page.
    keyword = '百度云'  # keep only threads whose title mentions a Baidu Cloud share
    lists = response.xpath("//tbody[starts-with(@id,'normalthread')]")
    for row in lists:
        # Extract the thread title; xpath returns a list, so join it into a str.
        lname = row.xpath("./tr/th/a[2]/text()").extract()
        names = ''.join(lname)
        if keyword in names:
            # Extract the thread URL the same way.
            lurl = row.xpath("./tr/th/a[2]/@href").extract()
            urls = ''.join(lurl)
            baiduurl, tiquma = self.get_url(urls)
            item = DianyingItem()
            item['name'] = names
            item['baiduurl'] = baiduurl
            item['tiquma'] = tiquma
            yield item
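# get_url is called above but not defined in this snippet. A hypothetical
# sketch of what it might do, assuming it fetches the thread page
# synchronously with requests and reuses the same selectors as parse_onepage
# at the bottom of this file; the real helper may differ:
import requests
from scrapy import Selector

def get_url(self, url):
    # Fetch the thread page outside the Scrapy scheduler and parse it.
    html = requests.get(url).text
    sel = Selector(text=html)
    baiduurl, tiquma = '', ''
    for href in sel.xpath("//td[@class='t_f']//a/@href").extract():
        if 'baidu.com' in href:
            baiduurl = href
    for text in sel.xpath("//td[@class='t_f']//text()").extract():
        if '提取' in text:
            tiquma = text
    return baiduurl, tiquma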
def neirong(self, response):
    data = Selector(response)
    item = DianyingItem()
    item['moviename'] = data.xpath('//h1[@class="font14w"]/text()').extract()
    item['jianjie'] = ''.join(
        data.xpath(
            '//div[@class="info" and child::h1[@class="font14w"]]/span/text()'
        )[0:2].extract()).strip()
    item['actor'] = data.xpath(
        '//span/a[contains(@href,"actor")]/text()').extract()
    item['kind'] = data.xpath(
        '//span/a[contains(@href,"----")]/text()').extract()
    item['country'] = data.xpath(
        '//span[child::span[contains(text(),"地区")]]/a/text()').extract()
    item['language'] = data.xpath(
        '//span[child::span[contains(text(),"语言")]]/a/text()').extract()
    item['daoyan'] = data.xpath(
        '//span/a[contains(@href,"dir")]/text()').extract()
    # Release date, runtime, and update date sit in plain text, so use regexes.
    item['sysj'] = data.re(r'上映日期:.*?(\d{4}-\d{2}-\d{2})')
    item['pc'] = data.re(r'片长:\D+?(\d+[\u4E00-\u9FA5]+)')
    item['gxsj'] = data.re(r'更新日期:.*?(\d{4}-\d{2}-\d{2})')
    item['jqjs'] = ''.join(
        data.xpath('//div[@id="movie_content"]/text()').extract()).strip()
    item['dbpf'] = data.xpath(
        '//span[child::span[contains(text(),"豆瓣评分")]]/text()').re(r'\d+\.\d+')
    # Collect download links from both the link list and the hidden checkboxes.
    downlink = data.xpath(
        '//div[@id="cpdl2list"]//a[@rel="nofollow"]/@href').extract()
    downlink.extend(data.xpath('//input[@class="checkone"]/@value').extract())
    item['downlink'] = downlink
    return item
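# Selector.re() runs the regex over the selected markup and returns every
# capture as a list of strings. A tiny usage example with the release-date
# pattern used above (sample HTML is made up for illustration):
from scrapy import Selector

sel = Selector(text='<p>上映日期:2023-05-01</p>')
assert sel.re(r'上映日期:.*?(\d{4}-\d{2}-\d{2})') == ['2023-05-01']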
def parse_1(self, response):
    item = DianyingItem()
    item['title'] = response.css('title::text').extract_first()
    item['url'] = response.url
    return item
def parse(self, response):
    # Extract every movie block from the listing page.
    video_list = response.xpath("//div[@class='co_content8']//table")
    for video in video_list:
        item = DianyingItem()
        item['title'] = video.xpath(".//a/text()").extract_first()
        item['data'] = video.xpath(".//font/text()").extract_first()
        url_next = "http://www.dytt8.net" + video.xpath(
            ".//a/@href").extract_first()
        # Follow the detail page, carrying the partially filled item in meta.
        yield scrapy.Request(url=url_next, callback=self.parse_next,
                             meta={'item': item})
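# parse_next is referenced above but not shown. A minimal sketch, assuming it
# recovers the item from response.meta and fills in a download-link field from
# the detail page's #Zoom block; the field name 'link' and the xpath are
# assumptions, and the real callback may extract more:
def parse_next(self, response):
    item = response.meta['item']
    # Download links on dytt8 detail pages live inside the #Zoom block.
    item['link'] = response.xpath('//*[@id="Zoom"]//a/@href').extract()
    yield item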
def parse(self, response):
    items = DianyingItem()
    # Extract the movie name.
    name = response.xpath(
        '//*[@id="header"]/div/div[3]/div[2]/div[6]/div[1]/h1/text()'
    ).extract()[0]
    items['name'] = name
    # Extract the download link.
    link = response.xpath(
        '//*[@id="Zoom"]/table[2]/tbody/tr/td/a/@href').extract()
    if link:
        items['link'] = link
        yield items
    # Walk the numbered detail pages one by one, up to page 98050.
    if self.i < 98050:
        self.i += 1
        yield scrapy.Request(self.url + str(self.i) + ".html",
                             callback=self.parse)
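# The parse method above assumes the spider carries a page counter self.i and
# a URL prefix self.url. A minimal sketch of those class attributes; the
# prefix URL is hypothetical, since the real site root is not shown here:
class DianyingSpider(scrapy.Spider):
    name = 'dianying'
    url = 'http://www.example.com/html/'  # hypothetical prefix
    i = 1
    start_urls = [url + str(i) + '.html']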
def parse_onepage(self, response):
    item = DianyingItem()
    # The post body may contain several hrefs, returned as a list, so iterate.
    links = response.xpath("//td[@class='t_f']//a/@href").extract()
    for link in links:
        if 'baidu.com' in link:
            # Cast to str: with multiple <a> tags the values are lxml unicode
            # objects, so convert first.
            item['baiduurl'] = str(link)
    # Scan all text nodes in the post body for the extraction code.
    all_text = response.xpath("//td[@class='t_f']//text()").extract()
    for text in all_text:
        if '提取' in text:
            item['tiquma'] = text
    # Grab the thread title.
    item['name'] = response.xpath(
        "//span[@id='thread_subject']/text()").extract_first()
    yield item
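# parse_onepage expects to be reached from a listing callback. A minimal
# sketch of the hand-off, assuming thread URLs are relative to the forum root
# and the same row xpath as parse_get above; the real spider may route
# through parse_get and get_url instead:
def parse(self, response):
    rows = response.xpath("//tbody[starts-with(@id,'normalthread')]")
    for href in rows.xpath("./tr/th/a[2]/@href").extract():
        yield scrapy.Request(response.urljoin(href),
                             callback=self.parse_onepage)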