Beispiel #1
0
    def parse(self, response):
        """Yield one item for each of the first ten movie cards.

        Title, absolute link, category and time are read from every
        ``movie-item-hover`` div found under a ``dd`` element.

        The previous loop compared ``counter <= 10`` and so emitted eleven
        items, then yielded a bare ``None`` for each remaining card; slicing
        the selection removes both problems.

        :param response: the downloaded listing page.
        """
        cards = Selector(response=response).xpath(
            '//dd/div/div[@class="movie-item-hover"]')

        for card in cards[:10]:
            title = card.xpath(
                './a/div/div/span[@class="name "]/text()').extract_first()
            link = card.xpath(
                './a[@data-act="movie-click"]/@href').extract_first()
            # The value is taken from the second text node of each row
            category = card.xpath(
                './a/div/div[2]/text()').extract()[1].strip()
            release_time = card.xpath(
                './a/div/div[4]/text()').extract()[1].strip()

            item = MaoyanmovieItem()
            item['title'] = title
            item['link'] = 'https://maoyan.com' + link
            item['time'] = release_time
            item['category'] = category
            yield item
Beispiel #2
0
 def parse(self, response):
     """Dump the response, then yield one item per hover-info block.

     Every scraped name, type and show time is also printed.
     """
     print(response.url)
     print(response.text)
     hover_blocks = Selector(response=response).xpath(
         '//div[@class="movie-hover-info"]')
     for block in hover_blocks:
         name = block.xpath(
             './div[@class="movie-hover-title"]/span/text()[1]'
         ).extract_first()
         # The type is taken from the third-from-last title text node
         title_texts = block.xpath(
             './div[@class="movie-hover-title"]/text()').extract()
         kind = title_texts[-3].strip()
         brief_texts = block.xpath(
             './div[@class="movie-hover-title movie-hover-brief"]/text()'
         ).extract()
         shown = brief_texts[1].strip()

         item = MaoyanmovieItem()
         item['moviename'] = name
         item['movietype'] = kind
         item['showtime'] = shown

         print('-----------')
         for value in (name, kind, shown):
             print(value)
         yield item
Beispiel #3
0
    def parse(self, response):
        """Yield items for the first ten ``movie-hover-info`` blocks.

        Each item receives a name, type, release time and an ``id`` that is
        the dash-less UUID5 of the name. Any exception raised while parsing
        is caught and printed, which also ends the generator.
        """
        try:
            hover_blocks = Selector(response=response).xpath(
                '//div[@class="movie-hover-info"]')
            for block in hover_blocks[:10]:
                title_rows = block.xpath('./div[@class="movie-hover-title"]')
                name_row = title_rows[0]
                # Fall back to the "noscore" span when the plain one is empty
                name = (
                    name_row.xpath('./span[@class="name "]/text()').get()
                    or name_row.xpath(
                        './span[@class="name noscore"]/text()').get()
                )
                type_texts = title_rows[1].xpath('./text()').getall()
                kind = type_texts[1].strip()
                brief_texts = block.xpath(
                    './div[@class="movie-hover-title movie-hover-brief"]'
                ).xpath('./text()').getall()
                released = brief_texts[1].strip()

                item = MaoyanmovieItem()
                # Maps to the database table column id (uuid)
                item['id'] = uuid.uuid5(uuid.NAMESPACE_DNS, name).hex
                item['movie_name'] = name
                item['movie_type'] = kind
                item['movie_releasetime'] = released

                yield item
        except Exception as e:
            print(f'页面下载异常:{e}')
Beispiel #4
0
    def parse(self, response):
        """Request the detail page of every movie listed in the response.

        Each ``movie-item-info`` div supplies a title, an absolute link and
        the release text; the partly filled item travels to ``self.parse2``
        in the request meta.
        """
        soup = BeautifulSoup(response.text, 'html.parser')
        for info in soup.find_all('div', attrs={'class': 'movie-item-info'}):
            item = MaoyanmovieItem()
            anchor = info.find('a')
            detail_url = 'https://maoyan.com' + anchor.get('href')
            release = info.find('p', attrs={'class': 'releasetime'})
            item['title'] = anchor.get('title')
            item['link'] = detail_url
            item['plan_date'] = release.get_text()
            yield scrapy.Request(
                url=detail_url,
                meta={'item': item},
                callback=self.parse2,
            )
Beispiel #5
0
    def parse(self, response):
        """Yield a name/type/date item for every ``movie-hover-info`` block."""
        hover_blocks = Selector(response=response).xpath(
            '//div[@class="movie-hover-info"]')

        for block in hover_blocks:
            item = MaoyanmovieItem()

            # The name comes from the title attribute of the first title row
            name = block.xpath(
                './div[@class="movie-hover-title"][1]/@title'
            ).extract_first()
            item['film_name'] = name.strip()

            # Type and date are the last text node of their rows
            type_texts = block.xpath(
                './div[@class="movie-hover-title"][2]/text()').extract()
            item['film_type'] = type_texts[-1].strip()

            date_texts = block.xpath(
                './div[@class="movie-hover-title movie-hover-brief"]/text()'
            ).extract()
            item['plan_date'] = date_texts[-1].strip()

            yield item
Beispiel #6
0
    def parse2(self, response):
        """Yield name, genres and release date from each brief container.

        The list-entry selection is printed for every container.
        """
        containers = Selector(response=response).xpath(
            '//div[@class="movie-brief-container"]')
        for container in containers:
            # The heading text is the movie name
            name = container.xpath('./h1/text()').extract_first().strip()
            # All list entries of the container
            li_elements = container.xpath('./ul/li')
            print(li_elements)
            # The third list entry holds the release date
            date = container.xpath('./ul/li[3]/text()').extract_first().strip()
            # Genres are the link texts inside the first list entry only
            genres = [
                link_text.extract().strip()
                for first_li in li_elements[:1]
                for link_text in first_li.xpath('./a/text()')
            ]

            item = MaoyanmovieItem()
            item['movie_name'] = name
            item['movie_type'] = ",".join(genres)
            item['movie_time'] = date

            yield item
Beispiel #7
0
 def parse(self, response):
     """Print and yield title, type and release time for the first ten ``dd`` nodes."""
     entries = Selector(response=response).xpath('//dd')
     for entry in entries[0:10]:
         name_nodes = entry.xpath(
             './div[1]/div[2]/a/div/div[1]/span[1]/text()')
         type_nodes = entry.xpath('./div[1]/div[2]/a/div/div[2]/text()')
         time_nodes = entry.xpath('./div[1]/div[2]/a/div/div[4]/text()')

         print('--------------')
         name = name_nodes.extract()[0]
         # Type and release time use the second text node of their rows
         kind = type_nodes.extract()[1].strip()
         released = time_nodes.extract()[1].strip()
         for value in (name, kind, released):
             print(value)

         item = MaoyanmovieItem()
         item['title'] = name
         item['movieType'] = kind
         item['releaseTime'] = released
         yield item
Beispiel #8
0
    def parse2(self, response):
        """Scrape title, type and date from a movie detail page.

        Nothing is yielded when no brief container matches. Debug output is
        printed for the URL, every selection and the finished item.
        """
        # Print the page URL
        print("")
        print(response.url)

        containers = Selector(response=response).xpath(
            '//div[1][@class="movie-brief-container"]')
        print('movie_container:', containers)
        if not containers:
            return

        item = MaoyanmovieItem()
        brief = containers[0]
        # (debug label, item key, relative XPath, separator for the texts)
        fields = (
            ('filename:', 'title', './h1[@class="name"]/text()', " "),
            ('file_type:', 'type', './ul/li[1]/a/text()', "/"),
            ('file_date:', 'film_date', './ul/li[3]/text()', " "),
        )
        for label, key, path, separator in fields:
            found = brief.xpath(path)
            print(label, found)
            item[key] = separator.join(found.extract())

        print('item:', item)
        yield item
Beispiel #9
0
 def parse(self, response):
     """Print and yield items for the first ten ``movie-hover-info`` blocks.

     The previous version kept looping past the tenth block and yielded a
     bare ``None`` for each extra one; the selection is sliced instead so
     only real items are produced.

     :param response: the downloaded listing page.
     """
     # Print the page URL
     print(response.url)
     movies = Selector(
         response=response).xpath('//div[@class="movie-hover-info"]')
     for count, movie in enumerate(movies[:10], start=1):
         # XPath prefixes "/", "." and ".." have different meanings
         filmname = movie.xpath(
             './div/span[@class="name "]/text()').extract_first()
         # Type and date use the second text node of their rows
         filmtype = movie.xpath('./div[2]/text()').extract()[1].strip()
         plandate = movie.xpath('./div[4]/text()').extract()[1].strip()
         print('-----------')
         print(filmname)
         print(filmtype)
         print(plandate)
         print(count)
         print('-----------')
         item = MaoyanmovieItem()
         item['filmname'] = filmname
         item['filmtype'] = filmtype
         item['plandate'] = plandate
         yield item
Beispiel #10
0
 def parse1(self, response):
     """Yield title, type and release date for up to ten hover-info blocks.

     Indexing ``details[i]`` over ``range(10)`` raised ``IndexError`` when
     fewer than ten blocks matched; slicing yields whatever is available.

     :param response: the downloaded listing page.
     """
     details = Selector(response=response).xpath(
         '//div[@class="movie-hover-info"]')
     for detail in details[:10]:
         item = MaoyanmovieItem()
         item['title'] = detail.xpath(
             './div[1]/span[1]/text()').extract()[0]
         # text()[2] selects the second text node of the row
         item['movie_type'] = detail.xpath(
             './div[2]/text()[2]').extract()[0].strip()
         item['release_date'] = detail.xpath(
             './div[4]/text()[2]').extract()[0].strip()
         yield item
Beispiel #11
0
 def parse(self, response):
     """Yield name, tag and brief for every ``movie-item film-channel`` div."""
     # Item key paired with the XPath, relative to the card, that fills it
     field_paths = (
         ('movie_name', './div[2]/a/div/div[1]/span[1]/text()'),
         ('movie_tag', './div[2]/a/div/div[2]/text()[2]'),
         ('movie_brief', './div[2]/a/div/div[4]/text()[2]'),
     )
     cards = Selector(response=response).xpath(
         '//div[@class="movie-item film-channel"]')
     for card in cards:
         item = MaoyanmovieItem()
         for key, path in field_paths:
             item[key] = card.xpath(path).get().strip()
         yield item
Beispiel #12
0
 def parse(self, response):
     """Request the detail page of the first ten movies on the landing page.

     The absolute URL is stored on the item, printed, and the item is
     handed to ``self.parse_single_movie`` through the request meta.
     """
     cards = Selector(response=response).xpath(
         '//div[@class="movie-item film-channel"]')
     for card in cards[:10]:
         item = MaoyanmovieItem()
         hrefs = card.xpath('./a/@href').extract()
         detail_url = 'https://maoyan.com' + hrefs[0]
         item['movie_url'] = detail_url
         print(item['movie_url'])
         yield scrapy.Request(
             url=detail_url,
             meta={'item': item},
             callback=self.parse_single_movie,
         )
Beispiel #13
0
    def get_target_urls(self, response):
        """Request the detail page of the first ten listed movies.

        A fresh item is created for every request. The previous version
        built a single item before the loop and placed that same object in
        the meta of all requests, so every callback wrote into one shared
        item. Entries without an ``href`` are skipped instead of being
        turned into a URL ending in ``None``.

        :param response: the downloaded listing page.
        """
        movies = scrapy.Selector(response).xpath(
            '//div[@class="channel-detail movie-item-title"]')
        for m in movies[:10]:
            href = m.xpath('./a/@href').extract_first()
            if not href:
                continue

            yield scrapy.Request(f'https://maoyan.com{href}',
                                 callback=self.get_movie_info,
                                 cookies=self.cookies,
                                 meta={'item': MaoyanmovieItem()})
Beispiel #14
0
    def parse(self, response):
        """Print the URL and request the detail page of every listed title.

        The link is built from the first entry of ``self.allowed_domains``;
        the item with title and link is passed to ``self.parse2``.
        """
        print(response.url)
        title_divs = Selector(response=response).xpath(
            '//div[@class="channel-detail movie-item-title"]')
        for title_div in title_divs:
            item = MaoyanmovieItem()
            anchor_text = title_div.xpath('./a/text()')
            anchor_href = title_div.xpath('./a/@href')
            item['title'] = anchor_text.extract_first().strip()
            item['link'] = (
                'https://'
                + self.allowed_domains[0]
                + anchor_href.extract_first().strip()
            )
            yield scrapy.Request(
                url=item['link'],
                meta={'item': item},
                callback=self.parse2,
            )
Beispiel #15
0
 def parse(self, response):
     """Request the detail page of the first ten listed titles.

     Title and absolute link are stored on the item that accompanies the
     request to ``self.parse2``.
     """
     films = Selector(response=response).xpath(
         '//div[@class="channel-detail movie-item-title"]')
     for film in films[:10]:
         name = film.xpath('./a/text()').extract()[0]
         detail_url = 'https://maoyan.com' + film.xpath(
             './a/@href').extract()[0]

         item = MaoyanmovieItem()
         item['title'] = name
         item['link'] = detail_url
         yield scrapy.Request(
             url=detail_url,
             meta={'item': item},
             callback=self.parse2,
         )
Beispiel #16
0
 def parse(self, response):
     """Request the detail page of every listed title.

     The item, carrying the stripped title and absolute link, is passed
     to ``self.parsedetail`` through the request meta.
     """
     title_divs = Selector(response=response).xpath(
         '//div[@class="channel-detail movie-item-title"]')
     for title_div in title_divs:
         item = MaoyanmovieItem()
         name_nodes = title_div.xpath('./a/text()')
         href_nodes = title_div.xpath('./a/@href')
         item['title'] = name_nodes.extract_first().strip()
         relative = href_nodes.extract_first().strip()
         item['link'] = 'https://maoyan.com' + relative
         yield scrapy.Request(
             url=item['link'],
             meta={'item': item},
             callback=self.parsedetail,
         )
Beispiel #17
0
    def parse(self, response):
        """Print and yield name, categories and release date for ten blocks.

        Categories and release date are located through the label text of
        a span and read from the last text node of that span's parent.
        """
        hover_blocks = Selector(response=response).xpath(
            '//div[@class="movie-hover-info"]')
        for block in hover_blocks[:10]:
            name = block.xpath('./div[1]/span[@class="name "]/text()').get()
            print(name)

            category_texts = block.xpath(
                './div/span[contains(text(), "类型")]/parent::*/text()'
            ).getall()
            categories = category_texts[-1].strip()
            print(categories)

            date_texts = block.xpath(
                './div/span[contains(text(), "上映时间")]/parent::*/text()'
            ).getall()
            released = date_texts[-1].strip()
            print(released)

            item = MaoyanmovieItem()
            item['movie_name'] = name
            item['catagories'] = categories
            item['release_date'] = released
            yield item
Beispiel #18
0
    def parse(self, response):
        """Yield name, type and time for the first ten hover-info blocks.

        Newlines and spaces are removed from the type and time texts.
        """
        hover_blocks = Selector(response=response).xpath(
            '//div[@class="movie-hover-info"]')
        for block in hover_blocks[:10]:
            item = MaoyanmovieItem()
            name = block.xpath('./div[1]/span[1]/text()').extract_first()
            item['my_name'] = name.strip()

            # Type and time are the second text node of rows 2 and 4
            type_text = block.xpath('./div[2]/text()').extract()[1]
            item['my_type'] = type_text.replace(
                '\n', '').replace(' ', '').strip()

            time_text = block.xpath('./div[4]/text()').extract()[1]
            item['my_time'] = time_text.replace(
                '\n', '').replace(' ', '').strip()
            yield item
Beispiel #19
0
    def parse(self, response):
        """Schedule a detail request for each ``movie-item-info`` div.

        Title, absolute link and release text are stored on an item that is
        forwarded to ``self.parse2`` via the request meta.
        """
        document = BeautifulSoup(response.text, 'html.parser')
        info_divs = document.find_all(
            'div', attrs={'class': 'movie-item-info'})

        for info_div in info_divs:
            item = MaoyanmovieItem()
            anchor = info_div.find('a')
            name = anchor.get('title')
            target = 'https://maoyan.com' + anchor.get('href')
            released = info_div.find(
                'p', attrs={'class': 'releasetime'}).get_text()
            item['title'] = name
            item['link'] = target
            item['plan_date'] = released
            yield scrapy.Request(
                url=target,
                meta={'item': item},
                callback=self.parse2,
            )
 def parse(self, response):
     """Return a list of items for the first ten hover-info blocks.

     Each item holds the name, type and time of one movie.
     """
     collected = []
     hover_blocks = Selector(
         response=response).xpath('//div[@class="movie-hover-info"]')
     for block in hover_blocks[0:10]:
         entry = MaoyanmovieItem()
         name = block.xpath('./div[1]/span[1]/text()').extract()[0]
         # Type and time are the second text node of rows 2 and 4
         kind = block.xpath('./div[2]/text()').extract()[1].strip()
         shown = block.xpath('./div[4]/text()').extract()[1].strip()
         entry['movie_name'] = name
         entry['movie_type'] = kind
         entry['movie_time'] = shown
         collected.append(entry)
     return collected
 def parse(self, response):
     """Print the URL and request the detail page of every listed title.

     Only the absolute link is stored on the item passed to
     ``self.parse2``.
     """
     print(response.url)
     title_divs = Selector(response=response).xpath(
         '//div[@class="channel-detail movie-item-title"]')
     for title_div in title_divs:
         item = MaoyanmovieItem()
         href = title_div.xpath('./a/@href').get()
         # Absolute link to the detail page
         detail_url = 'https://maoyan.com' + href.strip()
         item['link'] = detail_url
         yield scrapy.Request(
             url=detail_url,
             meta={'item': item},
             callback=self.parse2,
         )
 def parse(self, response):
     """Request the detail page of the first ten listed titles.

     The item stores the name and the link exactly as found in the page;
     the request uses that link prefixed with the site address. An
     exception raised for one entry is printed and the loop continues.
     """
     title_divs = Selector(response=response).xpath(
         '//div[@class="channel-detail movie-item-title"]')
     for title_div in title_divs[:10]:
         try:
             item = MaoyanmovieItem()
             label = title_div.xpath('./a/text()').extract_first().strip()
             href = title_div.xpath('./a/@href').extract_first().strip()
             item['name'] = label
             item['link'] = href
             yield scrapy.Request(
                 url=f'https://maoyan.com{href}',
                 meta={'item': item},
                 callback=self.parse2,
             )
         except Exception as e:
             print(e)
Beispiel #23
0
    def parse(self, response):
        """Print the URL and yield title, category and release for ten movies.

        A separator line is printed before each movie is scraped.
        """
        print(response.url)

        hover_blocks = Selector(response=response).xpath(
            '//dd/div[@class="movie-item film-channel"]/div[@class="movie-item-hover"]/a/div[@class="movie-hover-info"]')

        for block in hover_blocks[:10]:
            item = MaoyanmovieItem()

            print('------------------------------')
            name = block.xpath('./div[1]/span/text()').extract()[0]
            # Category and release are the second text node of rows 2 and 4
            kind = block.xpath('./div[2]/text()').extract()[1].strip()
            shown = block.xpath('./div[4]/text()').extract()[1].strip()

            item['title'] = name
            item['category'] = kind
            item['release'] = shown

            yield item
Beispiel #24
0
    def parse(self, response):
        """Yield title, type and date for the first ten ``dd`` nodes.

        All text nodes of each field are joined and then stripped.
        """
        # Item key paired with the XPath, relative to the dd node
        field_paths = (
            ('title', './div[1]/div[2]/a/div/div[1]/span[1]/text()'),
            ('film_type', './div[1]/div[2]/a/div/div[2]/text()'),
            ('film_date', './div[1]/div[2]/a/div/div[4]/text()'),
        )
        dd_nodes = Selector(response=response).xpath(
            '//*[@id="app"]/div/div[2]/div[2]/dl[@class="movie-list"]//dd'
        )
        for dd_node in dd_nodes[:10]:
            item = MaoyanmovieItem()
            for key, path in field_paths:
                texts = dd_node.xpath(path).extract()
                item[key] = ''.join(texts).strip()
            yield item
Beispiel #25
0
 def parse(self, response):
     """Request the detail page of the first ten listed movies.

     The absolute URL is stored on the item under ``url`` and the item is
     passed to ``self.parse_single_movie`` through the request meta.
     """
     title_divs = Selector(response=response).xpath(
         '//div[@class="channel-detail movie-item-title"]')
     # Only the first 10 movies are of interest
     for title_div in title_divs[:10]:
         item = MaoyanmovieItem()
         href = title_div.xpath('./a/@href').extract_first()
         detail_url = 'https://maoyan.com' + href.strip()
         item['url'] = detail_url
         yield scrapy.Request(
             url=detail_url,
             meta={'item': item},
             callback=self.parse_single_movie,
         )
Beispiel #26
0
    def parse(self, response):
        """Yield title, type and time for the first ten hover-info blocks."""
        # Item key paired with the XPath, relative to the block
        field_paths = (
            ('title', './div[1]/span[1]/text()'),
            ('movie_type', './div[2]/text()[2]'),
            ('time', './div[4]/text()[2]'),
        )
        hover_blocks = Selector(response=response).xpath(
            '//div[@class="movie-hover-info"]')
        for block in hover_blocks[:10]:
            item = MaoyanmovieItem()
            for key, path in field_paths:
                item[key] = block.xpath(path).extract_first().strip()
            yield item
Beispiel #27
0
    def parse(self, response):
        """Yield a title/type/time item for each of the first ten movies.

        Values are read from ``movie-hover-info`` blocks; type and time use
        the second text node of rows 2 and 4.
        """
        selector = Selector(response=response)
        for block in selector.xpath('//div[@class="movie-hover-info"]')[:10]:
            item = MaoyanmovieItem()

            name = block.xpath('./div[1]/span[1]/text()').extract_first()
            item['title'] = name.strip()

            kind = block.xpath('./div[2]/text()[2]').extract_first()
            item['movie_type'] = kind.strip()

            shown = block.xpath('./div[4]/text()[2]').extract_first()
            item['time'] = shown.strip()

            yield item
Beispiel #28
0
    def parse(self, response):
        """Yield name, genre and release date for the first ten movies.

        Genre and release date come from the second text node of rows 2 and
        4, taking the part after the first newline.
        """
        hover_blocks = Selector(
            response=response).xpath('//div[@class="movie-hover-info"]')

        for block in hover_blocks[:10]:
            item = MaoyanmovieItem()
            item['name'] = block.xpath(
                './div[1]/span[1]/text()').extract_first()

            genre_text = block.xpath('./div[2]/text()').extract()[1]
            item['genra'] = genre_text.split('\n')[1].strip()

            date_node = block.xpath('./div[4]/text()')[1]
            item['release_date'] = date_node.extract().split('\n')[1].strip()

            yield item
Beispiel #29
0
 def parse(self, response):
     """Yield name, type and time for up to ten listed movies.

     Indexing ``movies[rank]`` over ``range(10)`` raised ``IndexError``
     whenever fewer than ten ``dd`` nodes matched; the selection is sliced
     instead so short pages still produce their items.

     :param response: the downloaded listing page.
     """
     # Base XPath selecting the movie entries
     movies = Selector(
         response=response).xpath('//*[@id="app"]/div/div[2]/div[2]/dl/dd')
     # Walk the entries in order
     for movie in movies[:10]:
         # Create the item object
         item = MaoyanmovieItem()
         # Movie name
         item['movie_name'] = movie.xpath(
             "./div[1]/div[2]/a/div/div[1]/span[1]/text()").extract_first(
             ).strip()
         # Movie type
         item['movie_type'] = movie.xpath(
             "./div[1]/div[2]/a/div/div[2]/text()").extract_first().strip()
         # Release time
         item['movie_time'] = movie.xpath(
             './div[1]/div[2]/a/div/div[4]/text()').extract_first().strip()
         # Yield the scraped item so it can be handled by the item pipeline
         yield item
Beispiel #30
0
    def parse(self, response):
        """Yield title, link, time and category for the first ten cards.

        Cards are the ``movie-item-hover`` divs; the link is prefixed with
        the site address.
        """
        cards = Selector(
            response=response).xpath('//div[@class="movie-item-hover"]')

        for card in cards[:10]:
            name = card.xpath('./a/div/div[1]/span[1]/text()').extract_first()
            href = card.xpath(
                './a[@data-act="movie-click"]/@href').extract_first()
            # Category and time are the second text node of rows 2 and 4
            category_texts = card.xpath('./a/div/div[2]/text()').extract()
            kind = category_texts[1].strip('\n').strip()
            time_texts = card.xpath('./a/div/div[4]/text()').extract()
            shown = time_texts[1].strip('\n').strip()

            item = MaoyanmovieItem()
            item['title'] = name
            item['link'] = 'https://maoyan.com' + href
            item['time'] = shown
            item['category'] = kind

            yield item