def parse(self, response):
    """Parse a Maoyan list page and schedule one detail request per movie.

    Skips the first 10 ``<dd>`` entries, extracts name / type / show time /
    grade from each remaining entry, and forwards a populated SpidersItem to
    ``parse_detail`` via request meta.

    Fixes: locals ``type`` and ``time`` shadowed builtins (renamed);
    commented-out xpaths and a dead ``try`` were removed.
    """
    self.items = []  # kept: original initialises this attribute (may be read elsewhere)
    html = etree.HTML(response.text)
    dds = html.xpath('//*/dd')
    for dd in dds[10:]:
        name = dd.xpath('./div[1]/div[2]/a/div/div[1]/span/text()')[0]
        movie_type = dd.xpath('./div[1]/div[2]/a/div/div[2]/text()')[1].strip()
        show_time = dd.xpath('./div[1]/div[2]/a/div/div[4]/text()')[1].strip()
        # the score is split into integer and fraction <i> nodes on the page
        integer = dd.xpath('./div[1]/div[2]/a/div/div[1]/span[2]/i[1]/text()')[0]
        fraction = dd.xpath('./div[1]/div[2]/a/div/div[1]/span[2]/i[2]/text()')[0]
        grade = f'{integer}{fraction}'
        href = dd.xpath("./div[1]/a/@href")[0]
        item = SpidersItem()
        item["name"] = name
        item["type"] = movie_type
        item["grade"] = grade
        item["time"] = show_time
        url = f'https://maoyan.com{href}'
        yield scrapy.Request(url=url, meta={"item": item}, callback=self.parse_detail)
def parse2(self, response):
    """Yield title / type / showtime items for the top-10 movies on the page.

    Any extraction failure is reported to stdout, matching the original
    best-effort behavior.
    """
    try:
        print('parse2')
        hover_blocks = Selector(response=response).xpath('//div[@class="movie-hover-info"]')
        # only the first ten movies are wanted
        for block in hover_blocks[0:10]:
            item = SpidersItem()
            item['mtitle'] = block.xpath('./div[1]/@title').extract_first()
            item['mtype'] = block.xpath('./div[2]/text()[2]').extract_first().strip()
            item['mshowtime'] = block.xpath('./div[4]/text()[2]').extract_first().strip()
            yield item
    except Exception as e:
        print(e)
def parse(self, response):
    """Yield one item per movie block, honouring the per-page quota.

    The number of movies expected on this page is derived from the spider's
    total movie count and the page index carried in request meta (30 per page).
    """
    page = response.meta['page']
    remaining = self.movie_num - (page - 1) * 30
    page_sel = Selector(response=response)
    try:
        blocks = page_sel.xpath('//div[@class="movie-hover-info"]')
        for idx, block in enumerate(blocks):
            if idx == remaining:
                break
            item = SpidersItem()
            name = kind = shown = None
            for child in block.xpath('./div'):
                # the last child's @title wins, matching the original scan order
                name = child.xpath('./@title').extract_first()
                texts = child.xpath('./text()').extract()
                label = child.xpath('./span/text()').extract_first()
                if label == '类型:':
                    kind = texts[1].strip()
                elif label == '上映时间:':
                    shown = texts[1].strip()
            item['movie_name'] = name
            item['movie_type'] = kind
            item['movie_time'] = shown
            yield item
    except Exception as e:
        print(e)
def parse(self, response):
    """Print and yield name / type / time for the first ten movies."""
    print(response.url)
    hover_infos = Selector(response=response).xpath("//div[@class='movie-hover-info']")
    for idx in range(10):
        # title rows inside one hover block: [0]=name, [1]=type, [3]=date
        title_rows = hover_infos[idx].xpath("./div[contains(@class,'movie-hover-title')]")
        res_name = title_rows[0].xpath("./span[1]/text()").get()
        res_type = title_rows[1].xpath("./text()").extract()[1].strip()
        res_time = title_rows[3].xpath("./text()").extract()[1].strip()
        print(res_name)
        print(res_type)
        print(res_time)
        item = SpidersItem()
        item['name'] = res_name
        item['m_type'] = res_type
        item['m_time'] = res_time
        yield item
def parse(self, response):
    """Parse one Douban top-250 page, then request the next page.

    Bug fix: the original created a single SpidersItem *outside* the loop and
    mutated/yielded the same instance for every movie; because Scrapy
    pipelines may process items asynchronously, later mutations could clobber
    earlier items. Each movie now gets its own instance.
    """
    movies = response.xpath("//div[@class='info']")
    for each in movies:
        item = SpidersItem()  # fresh item per movie
        item['title'] = each.xpath(
            'div[@class="hd"]/a/span[@class="title"]/text()').extract_first()
        content = each.xpath('div[@class="bd"]/p/text()').extract()
        item['content'] = ';'.join(content)
        item['score'] = each.xpath(
            'div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()'
        ).extract_first()
        item['info'] = each.xpath(
            'div[@class="bd"]/p[@class="quote"]/span/text()').extract_first()
        yield item
    # advance pagination until offset 225 (the last top-250 page)
    if self.start <= 225:
        self.start += 25
        yield scrapy.Request(self.url + str(self.start) + self.end,
                             callback=self.parse)
def parse(self, response):
    """Yield name / type / time for the first ten movie hover blocks.

    Fix: removed a large commented-out BeautifulSoup implementation that
    duplicated this logic as dead code; the Selector path below is the live
    one and is unchanged in behavior.
    """
    select = Selector(response=response)
    movies = select.xpath('//div[@class="movie-hover-info"]')
    for movie in movies[:10]:
        item = SpidersItem()
        item['name'] = movie.xpath('./div/@title')[0].extract()
        # text node 4 carries the type line; the last text node the release date
        item['movietype'] = movie.xpath('./div/text()')[4].extract().strip()
        item['time'] = movie.xpath('./div/text()')[-1].extract().strip()
        yield item
def parse(self, Response):
    """Collect name / type / showtime items for the first ten movies.

    Returns the accumulated list (Scrapy accepts a list of items from a
    callback). The parameter keeps its original non-PEP8 name ``Response``
    because callers may pass it by keyword.

    Bug fix: the original used a bare ``except:``, which also swallowed
    KeyboardInterrupt/SystemExit and discarded the error details; it now
    catches ``Exception`` and reports what went wrong.
    """
    print('scrapy is parsing')
    print(Response.url)
    items = []
    movies = Selector(response=Response).xpath('//div[@class="movie-hover-info"]')
    try:
        for movie in movies[:10]:
            name = movie.xpath('./div[1]/span[@class="name "]/text()').get()
            movie_type = movie.xpath('./div[2]/text()')[1].get().strip()
            show_time = movie.xpath('./div[3]/text()')[1].get().strip()
            item = SpidersItem()
            item['name'] = name
            item['movie_type'] = movie_type
            item['show_time'] = show_time
            items.append(item)
    except Exception as e:
        print('an error occurred', e)
    finally:
        print('items are ', items)
    return items
def get_spiders_item(sel, fields, item=None):
    """Populate *item* from selector *sel* using declarative field specs.

    Each field dict requires: ``name`` (item key), ``type`` (``'xpath'`` or
    anything else for CSS), ``extract_type`` (``'text'`` or anything else for
    attribute), ``query``, and ``attribute`` when extracting an attribute.
    Returns the populated item (a new SpidersItem when *item* is None).

    Bug fix: the XPath attribute query was built as ``query/@("attr")``,
    which is not valid XPath (the attribute axis takes a bare name);
    it is now ``query/@attr``.
    """
    if item is None:
        item = SpidersItem()
    for f in fields:
        if f['type'] == 'xpath':  # XPath selector
            if f['extract_type'] == 'text':
                query = f['query'] + '/text()'
            else:
                # attribute axis: //a/@href  (no quotes/parentheses)
                query = f['query'] + '/@' + f['attribute']
            item[f['name']] = sel.xpath(query).extract_first()
        else:  # CSS selector
            if f['extract_type'] == 'text':
                query = f['query'] + '::text'
            else:
                attribute = f['attribute']
                query = f['query'] + f'::attr("{attribute}")'
            item[f['name']] = sel.css(query).extract_first()
    return item
def parse(self, response):
    """Parse the Douban top list and request detail pages for the top ten.

    Extracts title and link from the first ten ranked entries and forwards the
    partially-filled item to ``parse2`` via request meta.

    Fix: removed a dead commented-out BeautifulSoup variant and a dozen
    commented-out debug prints that obscured the live logic.
    """
    print(response.url)
    movies = Selector(response=response).xpath('//div[@class="hd"]')
    for i in range(10):
        item = SpidersItem()
        title = movies[i].xpath('./a/span/text()')
        link = movies[i].xpath('./a/@href')
        item['title'] = title.extract_first().strip()
        item['link'] = link.extract_first().strip()
        yield scrapy.Request(url=link.extract_first().strip(),
                             meta={'item': item}, callback=self.parse2)
def parse(self, response):
    """Yield a title/link item for every movie heading on the page.

    Fix: removed a commented-out pymysql connectivity check (dead code,
    including stale credentials placeholders) that did not belong in the
    parse callback.
    """
    movies = Selector(response=response).xpath('//div[@class="hd"]')
    for movie in movies:
        item = SpidersItem()
        item['title'] = movie.xpath('./a/span/text()').extract_first().strip()
        item['link'] = movie.xpath('./a/@href').extract_first().strip()
        yield item
def parse(self, response):
    """Schedule a cookie-carrying detail request for the first ten films."""
    film_blocks = Selector(
        response=response).xpath('//div[@class="movie-item film-channel"]')
    print('----0')
    print(len(film_blocks))
    # session cookies the site expects on detail-page requests
    cookies = {
        'uuid': '66a0f5e7546b4e068497.1542881406.1.0.0',
        '_lxsdk_cuid': '1673ae5bfd3c8-0ab24c91d32ccc8-143d7240-144000-1673ae5bfd4c8',
        '__mta': '222746148.1542881402495.1542881402495.1542881402495.1',
        'ci': '20',
        'rvct': '20%2C92%2C282%2C281%2C1',
        '_lx_utm': 'utm_source%3DBaidu%26utm_medium%3Dorganic',
        '_lxsdk_s': '1674f401e2a-d02-c7d-438%7C%7C35'
    }
    for idx in range(10):
        item = SpidersItem()
        link = film_blocks[idx].xpath('./a/@href').extract_first()
        print('----1')
        print(link)
        print('----2')
        link = "https://maoyan.com" + link
        print(link)
        yield scrapy.Request(url=link, meta={'item': item},
                             cookies=cookies, callback=self.parse2)
def parse(self, response):
    """Yield title / types / date for the ten leading hover-info blocks."""
    hover_blocks = Selector(response=response).xpath('//div[@class="movie-hover-info"]')
    for block in hover_blocks[:10]:
        title = block.xpath('./div[1]/span[1]/text()').extract_first()
        types = block.xpath('./div[2]/text()').extract()[1].strip()
        date = block.xpath('./div[4]/text()').extract()[1].strip()
        item = SpidersItem()
        item['title'] = title
        item['types'] = types
        item['date'] = date
        yield item
def parse(self, response):
    """Follow every ranked-movie link found on the listing page."""
    doc = lxml.etree.HTML(response.text)
    for href in doc.xpath('//*[@class="hd"]/a/@href'):
        item = SpidersItem()
        item['link'] = str(href)
        yield scrapy.Request(url=href, meta={'item': item}, callback=self.parse2)
def parse(self, response):
    """Yield a detail request (carrying title/link) per ranked movie.

    Bug fix: the original appended every item to a local ``items`` list that
    was never returned or read — dead accumulation removed.
    """
    movies = Selector(response).xpath('//div[@class="hd"]')
    for movie in movies:
        item = SpidersItem()
        item['title'] = movie.xpath('./a/span/text()').extract_first()
        item['link'] = movie.xpath('./a/@href').extract_first()
        yield scrapy.Request(url=item['link'], meta={'item': item},
                             callback=self.parse2)
def parse(self, response):
    """Request the detail page for each of the first ten channel movies."""
    title_divs = Selector(response=response).xpath(
        '//div[@class="channel-detail movie-item-title"]')
    for div in title_divs[0:10]:
        relative = div.xpath('./a/@href').extract_first().strip()
        full_url = 'https://maoyan.com' + relative
        item = SpidersItem()
        item['link'] = full_url
        yield scrapy.Request(url=full_url, meta={'item': item},
                             callback=self.parse2)
def parse(self, response):
    """Extract brief-container fields for each movie link on the page.

    Fixes several errors that made the original raise at runtime:
    ``movies.response.xpath`` (Selector has no ``.response`` attribute),
    the ``.xpatn`` typo, a missing ``@`` in ``[class=...]``, and
    ``.get().extract()`` (``get()`` already returns a plain string).
    """
    movies = Selector(response=response).xpath(
        '//div[@class="channel-detail movie-item-title"]')
    links = movies.xpath('./a/@href')
    for link in links:
        brief = Selector(response=response).xpath(
            '//div[@class="movie-brief-container"]')
        item = SpidersItem()
        title = brief.xpath('./div[1]/text()').get()
        ca = brief.xpath('./div[3]/text()').get()
        date = brief.xpath('./div[4]/text()').get()
        # NOTE(review): title/ca/date are computed but never stored on the
        # item, mirroring the original control flow — confirm the intended
        # SpidersItem field names before wiring them up.
        yield item
def parse(self, response):
    """Follow each movie heading link discovered via BeautifulSoup."""
    soup = BeautifulSoup(response.text, 'html.parser')
    for heading in soup.find_all('div', attrs={'class': 'hd'}):
        anchor = heading.find('a')
        item = SpidersItem()
        item['title'] = anchor.find('span').text
        item['link'] = anchor.get('href')
        yield scrapy.Request(url=item['link'], meta={'item': item},
                             callback=self.parse2)
def parse(self, response):
    """Yield name / type / time for the first ten hover-info blocks."""
    page_sel = Selector(response=response)
    for block in page_sel.xpath('//div[@class="movie-hover-info"]')[:10]:
        item = SpidersItem()
        name = block.xpath('./div[1]/span[1]/text()').extract_first()
        kind = block.xpath('./div[2]/text()[2]').extract_first()
        when = block.xpath('./div[4]/text()[2]').extract_first()
        item['movie_name'] = name
        item['movie_type'] = kind.strip()
        item['movie_time'] = when.strip()
        yield item
def parse_details(self, response):
    """Yield title, stripped category list and release time from a detail page."""
    brief = Selector(response=response).xpath('//div[@class="movie-brief-container"]')
    item = SpidersItem()
    item['title'] = brief.xpath('./h1/text()').get()
    raw_categories = brief.xpath('.//a/text()').getall()
    item['categories'] = [c.strip() for c in raw_categories]
    item['release_time'] = brief.xpath('.//li[last()]/text()').get()
    yield item
def parse(self, response):
    """Yield film name / type / date for the first ten hover-info blocks.

    Bug fix: removed the unused ``et_html = et.HTML(response.text)`` local —
    the lxml tree was built and immediately discarded.
    """
    for selector in response.xpath('//div[@class="movie-hover-info"]')[:10]:
        item = SpidersItem()
        item['film_name'] = selector.xpath(
            './div[1]/span[1]/text()').extract_first().strip()
        item['film_type'] = selector.xpath(
            './div[2]/text()[2]').extract_first().strip()
        item['file_date'] = selector.xpath(
            './div[4]/text()[2]').extract_first().strip()
        yield item
def parse(self, response):
    """Yield name / type / time from the first ten film-channel blocks."""
    film_items = Selector(
        response=response).xpath('//div[@class="movie-item film-channel"]')
    for idx in range(10):
        block = film_items[idx]
        item = SpidersItem()
        item['movie_name'] = block.xpath(
            './/span[contains(@class,"name")]/text()').extract_first()
        # siblings of the hover-tag spans carry the type and release-date text
        tag_texts = block.xpath('.//span[@class="hover-tag"]/../text()').extract()
        item['movie_type'] = tag_texts[1].strip('\n').strip()
        item['movie_time'] = tag_texts[5].strip('\n').strip()
        yield item
def parse(self, response):
    """Queue a detail-page request for every channel movie title."""
    base_url = 'https://maoyan.com'
    entries = Selector(response=response).xpath(
        '//div[@class="channel-detail movie-item-title"]')
    for entry in entries:
        item = SpidersItem()
        item['film_name'] = entry.xpath('./a/text()').extract_first().strip()
        # the page carries relative hrefs; prepend the site root
        item['link'] = base_url + entry.xpath('./a/@href').extract_first().strip()
        yield scrapy.Request(url=item['link'], meta={'item': item},
                             callback=self.parse2)
def parse(self, response):
    """Follow every hover-item link, printing the raw link list first."""
    doc = lxml.etree.HTML(response.text)
    links = doc.xpath('//*[@class="movie-item-hover"]/a/@href')
    print('-------------------------')
    print(links)
    print('-------------------------')
    for link in links:
        item = SpidersItem()
        item['link'] = str('https://maoyan.com' + link)
        yield scrapy.Request(url=item['link'], meta={'item': item},
                             callback=self.parse2)
def parse(self, response):
    """Yield name / type / time for the first ten hover-info blocks."""
    page_sel = Selector(response=response)
    for block in page_sel.xpath('//div[@class="movie-hover-info"]')[:10]:
        item = SpidersItem()
        name_node = block.xpath('./div/@title')[0]
        type_node = block.xpath('./div/text()')[4]
        time_node = block.xpath('./div/text()')[-1]
        item['name'] = name_node.extract()
        item['movie_type'] = type_node.extract().strip()
        item['time'] = time_node.extract().strip()
        # debug trace kept from the original implementation
        print(name_node.extract(), type_node.extract().strip(),
              block.xpath('./div/text()')[-1].extract())
        yield item
def parse(self, response):
    """Return one item per movie anchor found under each 'hd' heading."""
    results = []
    soup = bs(response.text, 'html.parser')
    for heading in soup.find_all('div', {'class': 'hd'}):
        for anchor in heading.find_all('a'):
            item = SpidersItem()
            item["name"] = anchor.find('span').text
            item["href"] = anchor.get('href')
            results.append(item)
    return results
def parse(self, response):
    """Return title / type / date items for every hover-info block."""
    collected = []
    blocks = Selector(response=response).xpath('//div[@class="movie-hover-info"]')
    for block in blocks:
        item = SpidersItem()
        item['title'] = block.xpath('./div/span/text()').get().strip()
        # text node 4 is the type line; the last node is the release date
        info_texts = block.xpath('./div/text()')
        item['movie_type'] = info_texts[4].get().strip()
        item['date'] = info_texts[-1].get().strip()
        collected.append(item)
    return collected
def parse(self, response):
    """Yield plain-dict items for the first ten hover entries.

    Bug fix: the original built a ``SpidersItem()`` and then immediately
    rebound ``item`` to a dict literal, so the SpidersItem was dead —
    removed. The manual counter loop is replaced by an equivalent slice.
    """
    movies = Selector(response).xpath('//div[@class="movie-item-hover"]')
    for movie in movies[:10]:
        title = movie.xpath('./a/div/div[1]/span[1]/text()').extract()[0].strip()
        t = movie.xpath('./a/div/div[2]//text()').extract()[2].strip()
        timet = movie.xpath('./a/div/div[4]//text()').extract()[2].strip()
        yield {'title': title, 'type': t, 'time': timet}
def parse(self, response):
    """Yield film name / type / date for at most ten hover-info blocks."""
    hover_blocks = Selector(response=response).xpath('//div[@class="movie-hover-info"]')
    # the manual counter in the original is equivalent to a ten-element slice
    for block in hover_blocks[:10]:
        item = SpidersItem()
        item['film_name'] = block.xpath('./div[1]/span/text()').extract()[0].strip()
        item['film_type'] = block.xpath('./div[2]/text()').extract()[1].strip()
        item['plan_date'] = block.xpath('./div[4]/text()').extract()[1].strip()
        yield item
def parse(self, response):
    """Request detail pages for the first ten channel movies.

    Bug fix: a single SpidersItem was created once and shared through the
    meta of every request, so all ``parse2`` callbacks mutated the same
    object; each request now carries its own item. Also dropped a pointless
    f-string prefix on a constant URL and replaced the manual counter with
    a slice.
    """
    movies = Selector(response=response).xpath(
        '//div[@class="channel-detail movie-item-title"]')
    for movie in movies[:10]:
        url = 'https://maoyan.com' + movie.xpath('./a/@href').extract_first()
        yield scrapy.Request(url=url, meta={'item': SpidersItem()},
                             callback=self.parse2)
def parse(self, response):
    """Yield movie name / type / release time for the first ten entries."""
    selector = Selector(response=response)
    for block in selector.xpath('//div[@class="movie-hover-info"]')[:10]:
        content = SpidersItem()
        content['movie_name'] = block.xpath('./div[1]/span[1]/text()').extract()[0]
        content['movie_type'] = block.xpath('./div[2]/text()[2]').extract()[0].strip()
        content['release_time'] = block.xpath('./div[4]/text()[2]').extract()[0].strip()
        yield content