def parse2(self, response):
        """Detail-page callback: fill the movie synopsis into the item.

        Expects the partially-filled item in ``response.meta['item']``;
        falls back to the second <span> when the first text node is blank.
        """
        item = response.meta['item']
        movie = Selector(response).xpath('//*[@class="indent"]/span[1]/text()')
        # default='' guards against a missing node: extract_first() would
        # otherwise return None and .strip() would raise AttributeError.
        if movie.extract_first(default='').strip() == '':
            movie = Selector(response).xpath('//*[@class="indent"]/span[2]')

        item['content'] = movie.extract_first(default='').strip()
        yield item
# Beispiel #2 (0)
 def parse2(self,response):
     """Detail-page callback: yield one Spider2Item with title, type and
     release date scraped from the movie-brief block."""
     title = Selector(response=response).xpath('//div[@class="movie-brief-container"]/h1/text()')
     # renamed from 'type' so the builtin is not shadowed
     movie_type = Selector(response=response).xpath('//li[@class="ellipsis"]/a/text()')
     movie_date = Selector(response=response).xpath('//li[@class="ellipsis"][last()]/text()')
     item = Spider2Item()
     # default='' keeps .strip() safe when a node is missing
     item['title'] = title.extract_first(default='').strip()
     item['type'] = movie_type.extract_first(default='').strip()
     item['movie_date'] = movie_date.extract_first(default='').strip()
     yield item
# Beispiel #3 (0)
 def parse_single_movie(self, response):
     """Detail-page callback: title, type and release date into the item.

     Uses absolute /html/body/... paths, so it is brittle against any
     layout change of the source page.
     """
     item = response.meta['item']
     movie_title = Selector(response=response).xpath(
         '/html/body/div[3]/div/div[2]/div[1]/h1/text()')
     movie_type = Selector(response=response).xpath(
         '/html/body/div[3]/div/div[2]/div[1]/ul/li[1]/a[1]/text()')
     movie_release_date = Selector(response=response).xpath(
         '/html/body/div[3]/div/div[2]/div[1]/ul/li[3]/text()')
     # default='' guards .strip() when a node is missing
     item['title'] = movie_title.extract_first(default='').strip()
     item['release_date'] = movie_release_date.extract_first(default='').strip()
     item['type'] = movie_type.extract_first(default='').strip()
     yield item
# Beispiel #4 (0)
    def parse2(self, response):
        """Detail-page callback: read the vote count and the score into
        the item carried in ``response.meta['item']``."""
        item = response.meta['item']
        count = Selector(response=response).xpath(
            '//*[@id="score_list"]/div[1]/div[2]/text()')
        score = Selector(response=response).xpath(
            '//div[@class="box score-box"]/ul/li[1]/p/text()')

        # default='' guards .strip() when a node is missing
        item['counts'] = count.extract_first(default='').strip()
        item['scores'] = score.extract_first(default='').strip()
        yield item
# Beispiel #5 (0)
 def parse2(self, response):
     """Detail-page callback: copy the synopsis text into the item."""
     item = response.meta['item']
     info = Selector(
         response=response).xpath('//*[@id="link-report"]/span/text()')
     # default='' guards .strip() when the synopsis node is missing
     item['content'] = info.extract_first(default='').strip()
     yield item
 def parse2(self, response):
     """Detail-page callback: best-effort extraction of name, up to three
     tag links and the runtime row; the item is ALWAYS yielded, even when
     extraction fails (errors are printed, not raised)."""
     item = response.meta['item']
     try:
         sel = Selector(response=response)
         # gather everything first, then assign, so a failure leaves the
         # item untouched (same semantics as the original)
         name = sel.xpath('//h1[@class="name"]/text()').extract_first()
         tags = [
             sel.xpath(f'//ul/li[1]/a[{n}][@class="text-link"]/text()').get()
             for n in (1, 2, 3)
         ]
         showtime = sel.xpath(
             '//ul/li[3][@class="ellipsis"]/text()').extract_first()
         item['name'] = name
         item['tag1'], item['tag2'], item['tag3'] = tags
         item['time'] = showtime
     except Exception as err:
         print(err)
     finally:
         yield item
    def parseMovieDetail(self, response):
        """Detail-page callback: movie level, brief description and browse
        count, merged into the dict carried in response.meta['movie_info'].
        """
        movie_info = response.meta['movie_info']

        # level icon url -> numeric level via the spider's helper;
        # extract_first() returning None would crash .strip() here
        level_sel = Selector(
            response=response).xpath('//div[@class="level-item"]//img/@src')
        level = self.levelImgUrl2levelNum(level_sel.extract_first().strip())
        movie_info['movie_level'] = level

        # first <span> of the resource-description block
        brief_sel = Selector(response=response).xpath(
            '//div[@class="resource-desc"]/div[@class="con"]/span[1]//text()')
        brief = brief_sel.extract_first().strip()
        movie_info['brief_desc'] = brief

        # raises ValueError if the label text is not an integer
        browse_times_sel = Selector(
            response=response).xpath('//li[@class="score-star"]//label/text()')
        browse_times = int(browse_times_sel.extract_first().strip())
        movie_info['browse_times'] = browse_times

        yield movie_info
# Beispiel #8 (0)
    def parse2(self, response):
        """Detail-page callback: ranking, level icon and cover url, then
        chase the per-movie JSON endpoint for the view count."""
        item = response.meta['item']
        # movie ranking text
        seniority = Selector(
            response=response).xpath('//p[@class="f4"]/text()')
        # movie level (icon url)
        mvrank = Selector(
            response=response).xpath('//div[@class="level-item"]/img/@src')
        # cover image url
        cover = Selector(
            response=response).xpath('//div[@class="imglink"]/a/img/@src')
        # default='' guards .strip() when a node is missing
        item['seniority'] = seniority.extract_first(default='').strip()
        item['rank'] = mvrank.extract_first(default='').strip()
        item['cover'] = cover.extract_first(default='').strip()

        # rewrite the detail link into the site's JSON "index info" endpoint
        bt_url = item['link'].replace(
            '/resource', '/resource/index_json/rid') + '/channel/movie'
        yield scrapy.Request(url=bt_url,
                             meta={'item': item},
                             callback=self.parse_views)
    def parse2(self, response):
        """Detail-page callback: ranking, level grade, view count and cover
        url for one movie."""
        item = response.meta['item']
        sel = Selector(response=response)

        # ranking: first run of digits near #score_star (raw string avoids
        # the deprecated '\d' escape in a plain literal)
        filmtop = sel.xpath("//div[@id='score_star']/../p/text()").re(r'\d+')
        item['film_top'] = filmtop[0]

        # level icon url -> grade label; an unknown/missing url leaves the
        # field unset, exactly like the original if/elif chain did
        level_by_icon = {
            'http://js.jstucdn.com/images/level-icon/a-big-1.png': 'A级',
            'http://js.jstucdn.com/images/level-icon/b-big-1.png': 'B级',
            'http://js.jstucdn.com/images/level-icon/c-big-1.png': 'C级',
            'http://js.jstucdn.com/images/level-icon/d-big-1.png': 'D级',
            'http://js.jstucdn.com/images/level-icon/e-big-1.png': 'E级',
        }
        film_level = sel.xpath(
            "//div[@class='level-item']/img/@src").extract_first()
        if film_level in level_by_icon:
            item['film_level'] = level_by_icon[film_level]

        # NOTE(review): assumes at least two digit groups exist in the
        # score list block; raises IndexError otherwise, as before
        film_views = sel.xpath("//li[@id='score_list']/div[1]").re(r'\d+')
        item['film_views'] = film_views[1]

        item['film_covertinfo'] = sel.xpath(
            '//div[@class="imglink"]/a/img/@src').extract_first()

        yield item
# Beispiel #10 (0)
    def movie_detail_parse(self, response):
        """Detail-page callback: rank, cover link, grade letter and view
        count for one movie.

        NOTE(review): calls blocking requests.get() inside a Scrapy
        callback, which stalls the event loop — a chained scrapy.Request
        would be the idiomatic fix; confirm before changing.
        """
        item = response.meta['item']

        # rank: first integer found in the score-box <p class="f4"> text
        rank_elmt = Selector(response=response).xpath(
            './/div[@class="box score-box"]//p[@class="f4"]/text()')
        rank_p = re.compile("[0-9]+")
        rank = int(rank_p.findall(rank_elmt.extract_first())[0])

        image_elmt = Selector(
            response=response).xpath('.//div[@class="imglink"]//a/@href')
        image_link = image_elmt.extract_first()

        # grade = first character of the level icon's file name
        grade_image_elmt = Selector(
            response=response).xpath('.//div[@class="level-item"]//img/@src')
        grade_image_link = grade_image_elmt.extract_first()
        grade = grade_image_link.split('/')[-1][0]

        # the view count lives in a side-loaded JS file whose src contains "rid"
        view_selector = Selector(response=response).xpath(
            '//script[@type="text/javascript" and contains(@src,"rid")]/@src')
        view_uri = view_selector.extract_first()
        parsed_uri = urllib.parse.urlsplit(response.url)
        view_link = f'{parsed_uri.scheme}://{parsed_uri.netloc}{view_uri}'
        view_response = requests.get(view_link)
        view = None
        if view_response.status_code == 200:
            # body looks like "...index_info={json}"; parse the JSON tail
            view = int(
                json.loads(
                    view_response.text.split("index_info=")[1])['views'])

        item['rank'] = rank
        item['image'] = image_link
        item['grade'] = grade
        item['view'] = view

        print(item)
        yield item
# Beispiel #11 (0)
    def parse2(self, response):
        """Parse callback 2: detail info (level icon, cover url, browse
        count) for a single movie."""
        item = response.meta['item']
        movie_class = Selector(response=response).xpath(
            '//div[@class="fl view-left"]//div[@class="level-item"]//img/@src')
        cover_info = Selector(response=response).xpath(
            '//div[@class="fl view-left"]//div[@class="imglink"]//img/@src')
        browse_times = Selector(
            response=response).xpath('//li[@class="score-star"]//label/text()')
        browse_count = int(browse_times.extract_first().strip())
        # store the extracted url strings, not the SelectorList objects:
        # the original put raw selectors into the item, which serialize
        # as their reprs in any pipeline/export
        item['Movie_class'] = movie_class.extract_first()
        item['Cover_info'] = cover_info.extract_first()
        item['Browse_times'] = browse_count

        yield item
# Beispiel #12 (0)
 def parse2(self, response):
     """Detail-page callback: ranking, level icon url and cover image url."""
     print(response)
     item = response.meta['item']
     rank_selector = Selector(
         response=response).xpath('//p[@class="f4"]/text()')
     level_selector = Selector(
         response=response).xpath('//div[@class="level-item"]')
     # BUG FIX: the cover selector was copy-pasted from the level one and
     # pointed at "level-item"; the sibling spiders in this file read the
     # cover from the "imglink" block.
     cover_selector = Selector(
         response=response).xpath('//div[@class="imglink"]')
     rank = rank_selector.extract_first().strip()
     item['rank'] = rank
     level = level_selector.xpath('./img/@src').extract_first().strip()
     item['level'] = level
     # imglink nests the <img> under an <a>, so search descendants
     cover = cover_selector.xpath('.//img/@src').extract_first().strip()
     item['cover'] = cover
     yield item
# Beispiel #13 (0)
    def parse2(self, response):
        """Detail-page callback: classification icon, cover image and
        browse count for one movie.

        NOTE(review): 'classification' and 'coverInfo' are stored as raw
        SelectorList objects, not extracted strings — presumably
        .extract_first() was intended; verify against the item pipeline
        before changing.
        """
        item = response.meta['item']
        # movie = Selector(response=response).xpath('//div[@class="fl box top24"]//li')
        # //div[@class="fl view-left"]//div[@class="level-item"]//img/@src
        # //div[@class="fl view-left"]//div[@class="imglink"]//img/@src
        # content = movie.xpath('./a/@href').get_text().strip()
        # item['content'] = content
        classification=Selector(response=response).xpath('//div[@class="fl view-left"]//div[@class="level-item"]//img/@src')
        coverInfo=Selector(response=response).xpath('//div[@class="fl view-left"]//div[@class="imglink"]//img/@src')
        browse_times = Selector(response=response).xpath('//li[@class="score-star"]//label/text()')
        # raises if the label is missing (None) or not numeric
        browseTimes = int(browse_times.extract_first().strip())
        item['classification']=classification
        item['coverInfo']=coverInfo
        item['browseTimes']=browseTimes

        yield item
# Beispiel #14 (0)
    def parse(self, response):
        """List-page callback: scrape the first 10 movies into SpidersItems.

        Returns a list whose first element is a header dict followed by
        the scraped items (original contract, kept for compatibility).
        """
        items = [dict(film_name='电影名称',film_type='电影类型',plan_date='上映日期')]
        print(response.encoding)
        # close every <dd> so the broken source markup parses predictably
        html = response.text.replace("<dd>", "</dd><dd>")
        # build the Selector ONCE instead of three times per movie
        sel = Selector(text=html)
        for i in range(1, 11):
            item = SpidersItem()
            base = f'//*[@id="app"]/div/div[2]/div[2]/dl/dd[{i}]/div[1]/div[2]/a/div'
            film_name = sel.xpath(f'{base}/div[1]/span[1]/text()')
            film_type = sel.xpath(f'{base}/div[2]/text()')
            plan_date = sel.xpath(f'{base}/div[4]/text()')
            item['film_name'] = film_name.extract_first().strip()
            # index [1]: the original skipped the first extracted text
            # node — presumably whitespace from the rebuilt markup; kept
            item['film_type'] = film_type.extract()[1].strip()
            item['plan_date'] = plan_date.extract()[1].strip()

            items.append(item)
        return items
# Beispiel #15 (0)
    def parse2(self, response):
        """Comment-page callback: yield one DoubanbookItem per short review,
        then follow the pagination link.

        NOTE(review): the dedup check stops the WHOLE page (return, not
        continue) at the first already-seen comment — confirm intended.
        """
        print('response.url: ', response.url)
        commonitems = Selector(
            response=response).xpath('//li[@class="comment-item"]')
        for ci in commonitems:
            short = ci.xpath(
                './div[@class="comment"]/p[@class="comment-content"]/span[@class="short"]/text()'
            ).extract_first().strip()
            shorttime = ci.xpath(
                './div[@class="comment"]//span[@class="comment-info"]/span[2]/text()'
            ).extract_first().strip()
            # Check whether this comment was stored before; return if so.
            # SECURITY: the SQL is built by string interpolation from
            # scraped text — injection-prone; use a parameterized query if
            # db.readtable supports it.
            sql = 'select count(*) from hlmshorts_new t where t.S_SHORTSTIME = "%s" and t.S_SHORTS = "%s"' % (
                shorttime, short)
            df = db.readtable(sql)
            cnt = df.iat[0, 0]
            if cnt > 0:
                return

            star = ci.xpath(
                './div[@class="comment"]/h3/span[@class="comment-info"]/span[1]/@title'
            ).extract_first().strip()
            vote = ci.xpath(
                './div[@class="comment"]/h3/span[@class="comment-vote"]/span[@class="vote-count"]/text()'
            ).extract_first().strip()
            # DoubanbookItem is defined in items.py
            item = DoubanbookItem()
            item['star'] = star
            item['vote'] = vote
            item['short'] = short
            item['shorttime'] = shorttime
            yield item
        # follow the "next page" link, if any
        nextpage1 = Selector(response=response).xpath(
            '//div[@class="paginator-wrapper"]/ul[@class="comment-paginator"]/li[last()]/a/@href'
        )
        if nextpage1:
            nextpage = nextpage1.extract_first().strip()
            print('nextpage: ', nextpage)
            url = f'{HongloumengSpider.start_urls[0]}{nextpage}'
            yield scrapy.Request(url=url, callback=self.parse2)
            # crude politeness delay between pages
            time.sleep(5)
# Beispiel #16 (0)
 def parse_link(self, response):
     """
     Fetch the synopsis and the Top-250 ranking id for one movie.
     """
     item = response.meta['item']
     content = Selector(
         response=response).xpath('//div[@id="link-report"]/span/text()')
     # take the part after the dot of the badge text (inferred from the
     # split); default='' keeps the chain safe when the badge is missing,
     # leaving m_id as ''
     m_id = Selector(
         response=response).xpath('//span[@class="top250-no"]/text()'
                                  ).extract_first(default='').strip().split('.')[-1]
     item['content'] = content.extract_first(default='').strip()
     item['m_id'] = m_id
     yield item
# Beispiel #17 (0)
class ParseNombre:
    """Split a full name scraped from Venezuelan registry HTML into given
    name(s) and surname(s).

    The heuristics key off the number of whitespace-separated tokens plus
    two markers the source pages emit: ``' </b>'`` (single surname) and a
    double space (single given name).
    """

    # empty-string defaults: the original assigned the ``str`` TYPE here,
    # which made .title() blow up whenever no parsing branch matched
    nombre_de_pila = ""
    apellidos = ""
    nombre_completo = ""
    html_data = None

    def __init__(self, decoded_html=None):
        """decoded_html: page HTML, or None for an empty parser.

        (Original default was the ``str`` type itself, so calling
        ParseNombre() crashed; None makes the no-arg form a no-op.)
        """
        if decoded_html is not None:
            self.html_data = Selector(text=decoded_html,
                                      type="html").xpath("//b")
            if decoded_html.find("REGISTRO ELECTORAL - CONSULTA DE DATOS") > 0:
                self._extraer_nombre_html_cne()
            else:
                self._calc_nombre(self.html_data.extract_first())
            self.nombre_de_pila = self.nombre_de_pila.title()
            self.apellidos = self.apellidos.title()
            self.nombre_completo = self.nombre_completo.title()

    def _extraer_nombre_html_cne(self):
        # CNE pages carry the name in the 4th <b> element
        nombre_html = self.html_data[3].extract()
        self._calc_nombre(nombre_html)

    def _extraer_nombre_html_registro_civil(self):
        # civil-registry pages: same position as the CNE layout
        nombre_html = self.html_data[3].extract()
        self._calc_nombre(nombre_html)

    def _calc_nombre(self, nombre_html_de_scrapy):
        """Strip the <b> wrapper and split the name by token count."""
        self.nombre_completo = nombre_html_de_scrapy.replace('</b>',
                                                             '').replace(
                                                                 '<b>', '')
        nombre = self.nombre_completo.split()
        # True when the citizen has a single surname
        un_apellido_test = nombre_html_de_scrapy.find(' </b>') > 0
        # True when the citizen has a single given name
        un_nombre_test = nombre_html_de_scrapy.find('  ') > 0
        if len(nombre) == 4:
            self.nombre_de_pila = f"{nombre[0]} {nombre[1]}"
            self.apellidos = f"{nombre[-2]} {nombre[-1]}"
        elif len(nombre) == 3:
            if un_apellido_test:
                self.nombre_de_pila = f"{nombre[0]} {nombre[1]}"
                # the surname is the last token (-1)
                self.apellidos = nombre[-1]
            if un_nombre_test:
                # place given name and surnames where they belong
                self.nombre_de_pila = f"{nombre[0]} {nombre[1]}"
                self.apellidos = f"{nombre[1]} {nombre[2]}"
        elif len(nombre) == 2:
            self.nombre_de_pila = f"{nombre[0]}"
            self.apellidos = nombre[-1]
        elif len(nombre) == 5:
            self.nombre_de_pila = f"{nombre[0]} {nombre[1]} {nombre[2]}"
            self.apellidos = f"{nombre[-2]} {nombre[-1]}"
        elif "DE" in nombre or "DEL" in nombre:
            # BUG FIX: the original guard
            #   `elif nombre == "DE" or nombre == "DEL" in nombre:`
            # chains to (nombre == "DEL") and ("DEL" in nombre), which is
            # always False for a list — this branch was dead code.
            conectivos = []
            for k, v in enumerate(nombre):
                if v == "DE" or v == "DEL":
                    decision = len(nombre) / 2
                    if k <= decision:
                        print("CONECTIVO INICIO ENCONTRADO >" + v)
                        conectivos.append([k, v])
                        offset = k + 1
                        self.nombre_de_pila = f"{nombre[0]} {nombre[offset - 1]} {nombre[offset]}"
                        self.apellidos = nombre[-1]
                    else:
                        print("CONECTIVO FINAL ENCONTRADO >" + v)
                        conectivos.append([k, v])
                        self.nombre_de_pila = f"{nombre[0]} {nombre[1]}"
                        self.apellidos = f"{nombre[-3]} {conectivos[0][1]} {nombre[conectivos[0][0] + 1]}"
            if len(conectivos) > 1:
                print("HAY MAS DE 1 CONECTIVO> ")
                print(conectivos)
                self.nombre_de_pila = f"{nombre[0]} {nombre[1]} {nombre[2]}"
                self.apellidos = f"{nombre[-3]} {conectivos[-1][1]} {nombre[-1]}"
# Beispiel #18 (0)
    def parse_novel(self, response):
        """Parse a novel's index page: metadata rows, tags, keywords,
        poster and the chapter table; yields one Request per free chapter
        and finally the Novel item itself.
        """
        soup = BeautifulSoup(response.body, "lxml")
        novel_id = response.meta.get('novel_id')
        novel = Novel()
        novel['id'] = novel_id
        novel['title'] = response.xpath(
            '//span[@class="bigtext"]//span/text()').extract_first()
        novel['author'] = response.xpath(
            '//span[@itemprop="author"]/text()').extract_first()

        novel['intro'] = soup.find('div', {"id": "novelintro"}).text

        # metadata rows, one <li> per field, parsed positionally below
        novel_metadata = response.xpath(
            '//ul[@name="printright"]/li').extract()
        # row 1: genre
        novel['genre'] = Selector(text=novel_metadata[0]).xpath('//span[@itemprop="genre"]/text()').extract_first()\
            .strip()
        # row 2: narrative point of view
        novel['view'] = Selector(text=novel_metadata[1]).xpath(
            '//li/text()').extract_first().strip()

        # row 3: writing style
        novel['style'] = Selector(text=novel_metadata[2]).xpath(
            '//li/text()').extract_first().strip()

        # row 4: series the work belongs to
        novel['series'] = Selector(text=novel_metadata[3]).xpath('//span[@itemprop="series"]/text()').extract_first()\
            .strip()

        # row 5: update status (wrapped in <font> when highlighted)
        if (Selector(text=novel_metadata[4]).xpath(
                '//span[@itemprop="updataStatus"]/font')):
            novel['updateStatus'] = Selector(text=novel_metadata[4]).xpath(
                '//span[@itemprop="updataStatus"]/font/text()') \
                .extract_first().strip()
        else:
            novel['updateStatus'] = Selector(text=novel_metadata[4]).xpath('//span[@itemprop="updataStatus"]/text()')\
                .extract_first().strip()

        # row 6: total word count
        novel['wordCount'] = Selector(text=novel_metadata[5]).xpath('//span[@itemprop="wordCount"]/text()')\
            .extract_first().strip()

        # row 7: publication status (labels stripped from the cell text)
        published = BeautifulSoup(novel_metadata[6], "lxml").text
        novel['published'] = published.replace("是否出版:",
                                               "").replace("(联系出版)",
                                                           "").strip()

        # row 8: contract-signed status
        novel['signed'] = Selector(text=novel_metadata[7]).xpath(
            '//font/text()').extract_first().strip()

        # row 9: author's remark, optional
        comment = Selector(text=novel_metadata[8]).xpath(
            '//div[@id="marknovel_message"]/text()')
        if comment:
            novel['comment'] = comment.extract_first()
        else:
            novel['comment'] = ''

        # poster: file name and full url of the cover image
        poster = parse.urlsplit(
            soup.find('img', {"itemprop": "image"})["src"]).path.split('/')[-1]
        poster_url = soup.find('img', {"itemprop": "image"})["src"]
        novel["poster"] = poster
        novel["images"] = [poster]
        novel["image_urls"] = [poster_url]

        # tags: <font> elements of the last "smallreadbody" block
        tags = []
        html_tags = soup.findAll(
            "div", {"class": "smallreadbody"})[-1].findAll("font")
        for html_tag in html_tags:
            tags.append(html_tag.text.strip())
        novel["tags"] = '|'.join(tags)

        # search keywords: "主角:…┃配角:…┃其它:…" split into three lists
        html_keys = soup.findAll("div", {"class": "smallreadbody"})[-1].find(
            "span", {
                "class": "bluetext"
            }).text
        key_array = html_keys.replace("搜索关键字:", "").split("┃")
        key_leadings = key_array[0].replace("主角:", "").strip().split(",")
        key_supportings = key_array[1].replace("配角:", "").strip().split(",")
        key_other = key_array[2].replace("其它:", "").strip().split(",")

        novel["key_leadings"] = key_leadings
        novel["key_supportings"] = key_supportings
        novel["key_other"] = key_other

        # chapter table: skip 3 header rows and the trailing footer row
        soup_table = soup.find("table", {"id": "oneboolt"})

        soup_lines = soup_table.findAll('tr')[3:-1]
        current_group = ""
        for soup_line in soup_lines:
            soup_tds = soup_line.findAll("td")
            if len(soup_tds) == 1:
                # single-cell row = a volume/group header for what follows
                current_group = soup_tds[0].text
            else:
                chapter = Chapter()
                # VIP chapters carry their link in 'rel' instead of 'href'
                if soup_tds[1].find('a').has_attr("href"):
                    chapter['vip'] = "No"
                    chapter['url'] = soup_tds[1].find('a')['href']
                else:
                    chapter['vip'] = "Yes"
                    chapter['url'] = soup_tds[1].find('a')['rel']
                chapter['novel_id'] = novel_id
                chapter['group'] = current_group
                # NOTE(review): this overwrites novel_id with the value
                # from column 0 — looks like it was meant to be a separate
                # chapter-id field; confirm against the Chapter item.
                chapter['novel_id'] = soup_tds[0].text.strip()
                chapter['title'] = soup_tds[1].text.strip()
                chapter['summary'] = soup_tds[2].text.strip()
                chapter['word_count'] = soup_tds[3].text.strip()
                if (len(soup_tds) == 5):
                    chapter['updated'] = soup_tds[4].text.strip()
                else:
                    chapter['updated'] = soup_tds[5].text.strip()
                if (chapter['vip'] == "No"):
                    yield scrapy.Request(url=chapter['url'],
                                         callback=self.parse_chapter,
                                         meta={'chapter': chapter})
        yield (novel)
# Beispiel #19 (0)
 def parse_detail(self, response):
     """Detail callback: copy the description text into the item and
     yield it back to the pipeline."""
     item = response.meta["item"]
     selector = Selector(response=response)
     text_nodes = selector.xpath('//div[@class="mod-content"]/span/text()')
     item["content"] = text_nodes.extract_first().strip()
     yield item