def parse(self, response):
    """Parse a Douban top-250 listing page.

    Yields one MovieItem per movie entry, then follows the "next"
    pagination link back into this callback.

    FIXES:
    - a fresh MovieItem is created per entry; the original reused one
      instance across every yield, so deferred consumers could observe
      later mutations;
    - `star` is already the full rating string (e.g. "9.2"); the original
      `star[0]` kept only its first character.
    """
    for sel in response.xpath('//div[@class="info"]'):
        item = MovieItem()
        # The title may be split across several <span> nodes; join them.
        full_title = ''.join(
            sel.xpath('div[@class="hd"]/a/span/text()').extract())
        movie_info = sel.xpath('div[@class="bd"]/p/text()').extract()
        star = sel.xpath(
            'div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()'
        ).extract()[0]
        quote = sel.xpath('div[@class="bd"]/p/span/text()').extract()
        quote = quote[0] if quote else ''

        item['title'] = full_title
        # GBK round-trip drops characters the console encoding can't show.
        print(item['title'].encode('GBK', 'ignore').decode('GBK'))
        item['movie_info'] = ';'.join(movie_info).replace(' ', '').replace('\n', '')
        print(item['movie_info'].encode('GBK', 'ignore').decode('GBK'))
        item['star'] = star
        print(item['star'])
        item['quote'] = quote
        print(item['quote'].encode('GBK', 'ignore').decode('GBK'))
        yield item

    next_page = response.xpath('//span[@class="next"]/link/@href').extract()
    if next_page:
        next_page = next_page[0]
        print(self.start_urls[0] + str(next_page))
        yield scrapy.Request(self.start_urls[0] + str(next_page),
                             callback=self.parse)
def parse(self, response):
    """Parse a listing page: yield one detail-page Request per movie
    (carrying a partially-filled MovieItem in meta), then paginate.

    FIX: `star` is already the full rating string; the original
    `star[0]` truncated it to a single character.
    """
    selector = Selector(response)
    for movie in selector.xpath('//div[@class="info"]'):
        item = MovieItem()
        # Title text may be split across multiple <span> nodes.
        fullTitle = ''.join(
            movie.xpath('div[@class="hd"]/a/span/text()').extract())
        movieInfo = movie.xpath('div[@class="bd"]/p/text()').extract()
        star = movie.xpath(
            'div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()'
        ).extract()[0]
        quote = movie.xpath('div[@class="bd"]/p/span/text()').extract()
        url1 = movie.xpath('div[@class="hd"]/a/@href').extract()[0]

        item['title'] = fullTitle
        item['movieInfo'] = ';'.join(movieInfo).replace(' ', '').replace('\n', '')
        item['star'] = star
        item['quote'] = quote[0] if quote else ''
        item['url'] = url1
        print(url1)
        # Hand the half-built item to parseContent via request meta.
        yield Request(url1, callback=self.parseContent, meta={'item': item})

    nextPage = selector.xpath('//span[@class="next"]/link/@href').extract()
    if nextPage:
        nextPage = nextPage[0]
        print(self.url + str(nextPage))
        yield Request(self.url + str(nextPage), callback=self.parse)
def parse(self, response):
    """Yield a MovieItem (name/status/tv/update_time) per top-list entry.

    Status and update time appear either as plain text or wrapped in a
    <font> element, so each has a two-shape fallback.

    FIX: removed the dead `else` branch that re-computed
    `sta_lst[0].strip()` — the value was already assigned above.
    """
    for each_movie in response.xpath('//ul[@class="top-list fn-clear"]/li'):
        item = MovieItem()
        item['name'] = each_movie.xpath('./h5/a/@title').extract()[0]

        sta_lst = each_movie.xpath(
            './span[@class="state1 new100state1"]/text()').extract()
        sta = sta_lst[0].strip() if sta_lst else None
        if not sta:  # missing or whitespace-only: fall back to <font> text
            sta = each_movie.xpath(
                './span[@class="state1 new100state1"]/font/text()'
            ).extract()[0]
        item['status'] = sta

        tv = each_movie.xpath('./span[@class="mjtv"]/text()').extract()
        item['tv'] = tv[0] if tv else None

        lst_update_tm = each_movie.xpath(
            './div[@class="lasted-time new100time fn-right"]/text()'
        ).extract()
        lst_up_time = lst_update_tm[0] if lst_update_tm else None
        if not lst_up_time:  # same fallback shape for the timestamp
            lst_up_time = each_movie.xpath(
                './div[@class="lasted-time new100time fn-right"]/font/text()'
            ).extract()[0]
        item['update_time'] = lst_up_time
        yield item
def parse(self, response):
    """Parse a JSON search-results response.

    Yields one MovieItem per movie that is still pending in
    self.movieMap, removing the entry and bumping self.totalNum once
    emitted.

    FIX: replaced `not in self.movieMap.keys()` + empty `pass` branch
    with a direct `in self.movieMap` guard — `.keys()` is redundant for
    membership tests.
    """
    searchResults = json.loads(response.body)['data']['searchResults']
    for result in searchResults:
        movie = result["movie"]
        if movie["movieId"] not in self.movieMap:
            continue  # not a movie we are waiting for
        item = MovieItem()
        item['imageUrls'] = [self.imageUrlPrefix + movie["posterPath"]]
        item['movieId'] = movie["movieId"]
        item['movieName'] = movie['title']
        item['directors'] = movie['directors']
        item['actors'] = movie['actors']
        item['posterPath'] = movie['posterPath']
        item['plotSummary'] = movie['plotSummary']
        item['avgRating'] = movie['avgRating']
        item['numRatings'] = movie['numRatings']
        # Mark this movie as handled and count it.
        self.movieMap.pop(movie["movieId"])
        self.totalNum += 1
        yield item
def parse(self, response):
    """Emit a MovieItem for every title link in the list_2 listing."""
    for li in response.xpath('//div[@class="list_2"]/ul/li'):
        movie = MovieItem()
        movie['name'] = li.xpath('./a/@title').extract()[0]
        yield movie
def parse2(self, response):
    """Parse a Douban comments page; yield one MovieItem per *rated*
    comment with its short text, numeric star (1-5) and record time.

    FIX: a fresh MovieItem is created per comment — the original reused
    one instance across every yield, so deferred consumers could see
    later mutations overwrite earlier ones.
    """
    star_to_num = {
        '力荐': 5,  # "highly recommend"
        '推荐': 4,  # "recommend"
        '还行': 3,  # "OK"
        '较差': 2,  # "poor"
        '很差': 1,  # "very poor"
    }
    # NOTE: the trailing space in class "comment-item " is significant.
    comments = Selector(response=response).xpath(
        '//*[@id="comments"]/div[@class="comment-item "]')
    for comment in comments:
        star = comment.xpath('./div[2]/h3/span[2]/span[2]/@title').get()
        if star not in star_to_num:
            # Unrated comment: this slot holds the full timestamp
            # (e.g. "2020-05-17 17:27:09") instead of a rating title.
            continue
        record_time = comment.xpath(
            './div[2]/h3/span[2]/span[3]/text()').get().strip()
        item = MovieItem()
        item['short'] = comment.xpath('./div[2]/p/span/text()').get()
        item['star'] = star_to_num[star]
        item['record_time'] = record_time
        yield item
def parse_detail(self, response):
    """Extract the 240p mp4 URL from a detail page, append it to mov.txt
    and yield a populated movie item via the item loader.

    FIX: the original did `'\\r\\n'.join(res)` where `res` was a *string*,
    which inserted CRLF between every single character of the URL. The
    joined-over-chars value was then written to disk and stored in the
    item; the plain URL is used instead.
    """
    import re

    # Quoted CDN mp4 URL anywhere in the raw body.
    pattern = re.compile(r'(http:\/\/cdn.*?_240.mp4.*?\")')
    matches = pattern.findall(str(response.body))
    parts = re.sub('"', '', matches[0]).split(':')
    # Re-attach the scheme to the last colon-separated segment.
    video_url = 'http:' + parts[-1]
    # codecs.open is legacy; the builtin open takes an encoding directly.
    with open('mov.txt', 'a', encoding='utf-8') as f:
        f.write(video_url + '\r\n')

    item_loader = MovItem(item=MovieItem(), response=response)
    item_loader.add_value("url", response.url)
    item_loader.add_value("thumb", response.meta.get("thumb"))
    item_loader.add_value("title", response.meta.get("title"))
    item_loader.add_value("duration", response.meta.get("duration"))
    item_loader.add_value("post_date", response.meta.get("post_date"))
    item_loader.add_value("video_url", video_url)
    item_loader.add_xpath(
        "views_num",
        "//div[contains(@id,'tabInfo')]/div[contains(@class,'col3')]/p[1]/text()"
    )
    item_loader.add_xpath(
        "channel",
        "//div[contains(@id,'tabInfo')]/div[contains(@class,'col3')]/p[2]/a/text()"
    )
    yield item_loader.load_item()
def parse(self, response):
    """Collect every numeric run from the f4 paragraph texts into
    MovieItem['level'].

    FIX: the regex is now a raw string — the original "\\d+" relied on
    Python passing the unrecognised escape through, which raises a
    SyntaxWarning/DeprecationWarning on modern interpreters.
    """
    # Non-framework parse: build an lxml DOM from the response text.
    selector2 = lxml.etree.HTML(response.text)
    num = selector2.xpath('//p[@class="f4"]/text()')
    # str(num) stringifies the node list; findall pulls all digit runs.
    level = re.findall(r"\d+", str(num))
    item = MovieItem()
    item['level'] = level
    yield item
def parse(self, response):
    """Parse one JSON page of Douban comments, yield an item per comment,
    then request the next page until the offset passes 1001.

    FIX: the bare `except:` (which also swallowed KeyboardInterrupt and
    genuine bugs) is narrowed to the exceptions a missing rating span can
    actually raise.
    """
    html = json.loads(response.text)['html']
    soup = BeautifulSoup(html, 'lxml')
    for comment in soup.find_all(class_='comment'):
        item = MovieItem()
        item['Commentator'] = comment.find('a', class_='').text
        item['time'] = comment.find('span', class_='comment-time').text.strip()
        item['votes'] = comment.find('span', class_='votes').text
        item['short'] = comment.find('span', class_='short').text
        try:
            # Class looks like "allstar50"; char [-2] is the rating digit.
            allstar = comment.find('span', {
                'class': re.compile(r'allstar\d.*')
            }).attrs['class']
            item['allstar'] = allstar[0][-2]
        except (AttributeError, KeyError, IndexError):
            item['allstar'] = 0  # comment has no rating span
        yield item

    self.offset += 20
    if self.offset > 1001:
        return
    data = {
        'start': self.offset,
        'limit': '20',
        'sort': 'new_score',
        'status': 'P',
        'comments_only': '1'
    }
    url = ('https://movie.douban.com/subject/26985127/comments?'
           + urlencode(data))
    yield scrapy.Request(url, callback=self.parse)
def parse_item(self, response):
    """Parse a Douban movie detail page into a MovieItem whose text
    fields are normalised/escaped for later storage.

    IMPROVEMENT: the identical five-step
    `.strip().replace(',', ';')...replace(':', ';')` chain was repeated
    for every field; it is now a single `_clean` helper.
    """

    def _clean(text):
        # ',' and ':' become ';'; single/double quotes get
        # backslash-escaped — same transformation for every field.
        return (text.strip().replace(',', ';').replace('\'', '\\\'')
                .replace('\"', '\\\"').replace(':', ';'))

    hxs = Selector(response)
    movie_name = hxs.xpath('//*[@id="content"]/h1/span[1]/text()').extract()
    movie_director = hxs.xpath(
        '//*[@id="info"]/span[1]/span[2]/a/text()').extract()
    movie_writer = hxs.xpath(
        '/html/body/div[3]/div[1]/div/div[1]/div[1]/div[1]/div[1]/div[2]/span[2]/span[2]/a[1]/text()'
    ).extract()
    movie_description = hxs.xpath(
        '//*[@id="info"]/span[2]/span[2]/a/text()').extract()

    movie_roles = []
    for movie_roles_path in hxs.xpath('//*[@id="info"]/span[3]/span[2]'):
        # NOTE(review): mirrors the original — each iteration overwrites,
        # so only the last span's starring list is kept.
        movie_roles = movie_roles_path.xpath(
            './/*[@rel="v:starring"]/text()').extract()

    item = MovieItem()
    item['movie_name'] = _clean(''.join(movie_name))
    item['movie_director'] = _clean(movie_director[0]) if movie_director else ''
    item['movie_description'] = (_clean(movie_description[0])
                                 if movie_description else '')
    item['movie_writer'] = _clean(';'.join(movie_writer))
    item['movie_roles'] = _clean(';'.join(movie_roles))
    yield item
def parse(self, response):
    """Yield a MovieItem per top-list entry, echoing each name."""
    entries = response.xpath('//ul[@class="top-list fn-clear"]/li')
    for entry in entries:
        movie = MovieItem()  # one fresh item per list entry
        movie['name'] = entry.xpath('./h5/a/@title').extract()[0]
        print(movie['name'])
        yield movie
def parse(self, response):
    """Yield one MovieItem (name only) per top-list entry."""
    top_list = response.xpath('//ul[@class="top-list fn-clear"]/li')
    for entry in top_list:
        # Scrapy's built-in selector pulls the title attribute.
        record = MovieItem()
        record['name'] = entry.xpath('./h5/a/@title').extract()[0]
        yield record
def parse(self, response):
    """Populate name/state/type/updateDate for each top-list entry.

    `state` and `updateDate` each appear either wrapped in a <font> tag
    or as plain text, so both use a two-step fallback.
    """
    for li in response.xpath('//ul[@class="top-list fn-clear"]/li'):
        item = MovieItem()
        item['name'] = li.xpath('./h5/a/@title').extract()[0]

        font_state = li.xpath(
            './span[@class="state1 new100state1"]/font/text()').extract()
        if font_state:
            item['state'] = font_state[0]
        else:
            item['state'] = li.xpath(
                './span[@class="state1 new100state1"]/text()').extract()[0]

        item['type'] = li.xpath('./span[@class="mjjq"]/text()').extract()[0]

        font_date = li.xpath('./div/font/text()').extract()
        if font_date:
            item['updateDate'] = font_date[0]
        else:
            item['updateDate'] = li.xpath('./div/text()').extract()[0]

        yield item
def parse(self, response):
    """Yield a MovieItem per top-list entry.

    FIX: converted the Python-2 `print` statement to the print()
    function, consistent with the Python-3 style used elsewhere in this
    project.
    """
    print("crazylog------------------response =", type(response))
    movies = response.xpath('//ul[@class="top-list fn-clear"]/li')
    for each_movie in movies:
        item = MovieItem()
        item['name'] = each_movie.xpath('./h5/a/@title').extract()[0]
        yield item
def parse(self, response):
    """Collect title/img/intro/names for every poster-list entry and
    return them as a list of MovieItems.

    IMPROVEMENTS: the intro xpath is evaluated once instead of twice
    (the original ran it for the length check and again for the value);
    the '#'-prefixed actor string is built with a join instead of
    repeated concatenation. Output is unchanged, including the leading
    '#' before the first actor name.
    """
    items = []
    for li in response.xpath('/html/body//ul[@class="picList clearfix"]/li'):
        item = MovieItem()
        item['title'] = li.xpath(
            './div[@class="txt"]/p[@class="pTit"]/span[@class="sTit"]//text()'
        ).extract()[0]
        item['img'] = 'http:' + li.xpath(
            './div[@class="pic"]/img/@src').extract()[0]
        # The intro paragraph uses one of two classes (hidden/shown).
        intro = li.xpath(
            './div[@class="txt"]/p[@class="pTxt pIntroHide"]//text()'
        ).extract()
        if not intro:
            intro = li.xpath(
                './div[@class="txt"]/p[@class="pTxt pIntroShow"]//text()'
            ).extract()
        item['intro'] = intro[0]
        # e.g. "#actor1#actor2" — empty string when there are no actors.
        item['names'] = ''.join(
            '#' + name.extract()
            for name in li.xpath(
                './div[@class="txt"]/p[@class="pActor"]/a//text()'))
        items.append(item)
    return items
def parse(self, response):
    """Yield a MovieItem per top-list entry.

    FIXES:
    - './h5/a@title' was invalid XPath (missing '/' before the attribute
      axis); corrected to './h5/a/@title' as used by the sibling spiders;
    - Python-2 print statements converted to the print() function.
    """
    print(response)
    movies = response.xpath('//ul[@class="top-list fn-clear"]/li')
    print('movies %r' % movies)
    for each_movie in movies:
        item = MovieItem()
        item['name'] = each_movie.xpath('./h5/a/@title').extract()[0]
        yield item
def parse_detail(self, response):
    """Parse the detail page into a MovieItem (name + description) and
    yield it for persistence.

    FIXES:
    - the original built the item but never yielded or returned it, so
      nothing ever reached the pipelines despite the stated intent
      ("进行持久化存储" — persist to storage);
    - `''.join(desc)` was a no-op on a string and crashed with TypeError
      when the xpath matched nothing (.get() returns None).
    """
    item = MovieItem()
    item['name'] = response.xpath(
        '//div[@class="stui-content__detail"]/h1/text()').get()
    desc = response.xpath('//span[@class="detail-content"]/text()').get()
    item['desc'] = desc if desc is not None else ''
    yield item
def parse(self, response):
    """Yield one MovieItem per <li> of the first 'top-list' <ul>."""
    soup = BeautifulSoup(response.body, "lxml")
    top_list = soup.find_all('ul', class_="top-list")[0]
    for entry in top_list.find_all("li"):
        item = MovieItem()
        item['name'] = entry.find('a').get_text()
        yield item
def parse(self, response):
    """Log progress and yield a MovieItem per top-list entry."""
    print('start crawl')
    movie_nodes = response.xpath('//ul[@class="top-list fn-clear"]/li')
    print(len(movie_nodes))
    for node in movie_nodes:
        record = MovieItem()
        record['name'] = node.xpath('./h5/a/@title').extract()[0]
        yield record
def parse(self, response):
    """Parse the top list; each <li> yields name/classification/state
    into a MovieItem handed to the pipelines."""
    rows = response.xpath(
        '//ul[contains(@class,"top-list") and contains(@class,"fn-clear")]/li')
    for row in rows:
        item = MovieItem()
        item['name'] = row.xpath('./h5/a/@title').extract()[0]
        item["classification"] = row.xpath('./span[2]/text()').extract_first()
        item["state"] = row.xpath('./span[1]/font[1]/text()').extract_first()
        yield item
def parse(self, response):
    """Yield a MovieItem (name only) for each top-list row."""
    # Matches ul elements carrying both 'top-list' and 'fn-clear' classes.
    rows = response.xpath(
        '//ul[contains(@class,"top-list") and contains(@class,"fn-clear")]/li')
    for row in rows:
        record = MovieItem()
        record['name'] = row.xpath('./h5/a/@title').extract()[0]
        yield record
def parse_item(self, response):
    """Yield a MovieItem only when the page exposes both a title and a
    download link (the <td bgcolor="#fdfddf"> anchor)."""
    sel = Selector(response)
    titles = sel.xpath('//div[@class="title_all"]/h1/font/text()')
    links = sel.xpath('//td[@bgcolor="#fdfddf"]/a/@href')
    # SelectorList truthiness == non-empty, same as the len() checks.
    if titles and links:
        item = MovieItem()
        item['movie_title'] = titles[0].extract()
        item['movie_link'] = links[0].extract()
        yield item
def parse(self, response):
    """Yield a MovieItem per top-list entry.

    FIXES:
    - removed the stray lxml demo lines: `dom.xpath('')` raises
      XPathEvalError (empty expression) and crashed this callback before
      any item was produced;
    - `item.name = name` raises on a scrapy Item — fields must be set
      with item['name'] (the author's own comment showed the fix).
    """
    movie_list = response.xpath('//ul[@class="top-list fn-clear"]/li')
    for movie in movie_list:
        # extract_first() returns the first matched text node, or None.
        name = movie.xpath('./h5/text()').extract_first()
        item = MovieItem()
        item['name'] = name
        yield item
def parse(self, response):
    """Yield a MovieItem for every entry in the week-hot box.

    The title lives either directly on the <a> or on an <a> nested in a
    <p>, hence the xpath union.
    """
    hot_entries = response.xpath('//div[@class="l week-hot layout-box"]/ul/li')
    for entry in hot_entries:
        item = MovieItem()
        item['name'] = entry.xpath('./a/@title | ./p/a/@title').extract()[0]
        yield item
def parse_detail_page(self, response):
    """Build a MovieItem (name, url, cover, download links) from a movie
    detail page parsed with pyquery."""
    doc = pq(response.text)
    item = MovieItem()
    item['movie_name'] = doc(
        '#header > div > div.bd2 > div.bd3 > div.co_area2 > div.title_all > h1'
    ).text().strip()
    item['raw_url'] = response.url
    item['cover_image'] = doc('#Zoom > p:nth-child(1) > img').attr('src')
    # Keep only truthy download urls (drops None / empty strings).
    item['download_url'] = [u for u in self.get_download_url(doc) if u]
    yield item
def parse_movie_item(self, response):
    """Fill url/name/summary/score from a Douban-style detail page."""
    item = MovieItem()
    item['url'] = response.url
    # Each field maps to one RDFa property on the page.
    field_xpaths = (
        ('name', '//span[@property="v:itemreviewed"]/text()'),
        ('summary', '//span[@property="v:summary"]/text()'),
        ('score', '//strong[@property="v:average"]/text()'),
    )
    for field, xp in field_xpaths:
        item[field] = response.xpath(xp).extract_first()
    print('------------------', item)
    yield item
def parse(self, response):
    """Yield name + subType (genre) for every top-list entry."""
    for li in response.xpath('//ul[@class="top-list fn-clear"]/li'):
        record = MovieItem()
        record['name'] = li.xpath('./h5/a/@title').extract()[0]
        record['subType'] = li.xpath(
            './span[@class="mjjq"]/text()').extract()[0]
        yield record
def parse_item(self, response):
    """For every result-table link, start a MovieItem with the title and
    request its detail page, passing the item along in request meta."""
    for anchor in response.xpath('//div[@class="co_content8"]//table//a'):
        title = anchor.xpath('./text()').extract_first()
        href = anchor.xpath('./@href').extract_first()
        detail_url = 'http://www.ygdy8.net' + href
        movie = MovieItem(title=title)
        # parse_detail completes the item using meta['movie'].
        yield scrapy.Request(url=detail_url, callback=self.parse_detail,
                             meta={'movie': movie})
def parse(self, response):
    """Parse a Douban top-250 page into MovieItems
    (title/movie_info/star/quote).

    FIX: a fresh MovieItem is created per entry — the original reused
    one instance for every yield, so deferred consumers saw only the
    last movie's data.
    """
    selector = Selector(response)
    for each in selector.xpath('//*[@id="content"]/div/div[1]/ol/li'):
        item = MovieItem()
        item['title'] = each.xpath(
            'div/div[2]/div[1]/a/span[1]/text()').extract()[0]
        print(item['title'])
        # The info paragraph spans two text nodes; concatenate both.
        content = (each.xpath('div/div[2]/div[2]/p[1]/text()').extract()[0]
                   + each.xpath('div/div[2]/div[2]/p[1]/text()[2]').extract()[0])
        item['movie_info'] = content.replace('\n', '').replace(' ', '')
        item['star'] = each.xpath(
            'div/div[2]/div[2]/div/span[2]/text()').extract()[0]
        item['quote'] = each.xpath(
            'div/div[2]/div[2]/p[2]/span/text()').extract()[0]
        yield item
def parse(self, response):
    """Follow every '/article-...' category link in the main nav to
    parse_cat, carrying the partially-filled item in request meta.

    FIXES: Python-2 `print` statement converted to print(); dead
    commented-out experiments removed.
    """
    print(response.url)
    for each_movie in response.xpath('//nav[@class="main-nav"]/div/ul/li'):
        item = MovieItem()
        item['cat_url'] = each_movie.xpath('./a/@href').extract()[0]
        # Category links all start with "/article-"; skip everything else.
        if item['cat_url'].startswith("/article-"):
            yield Request("http://www.dawnfly.cn" + item['cat_url'],
                          callback=self.parse_cat,
                          meta={'cat_url': item})