Ejemplo n.º 1
0
    def parse_kepu_article(self, response):
        """Parse a kepu article page into an ArticleItem.

        Joins the body paragraphs with <br>, reads the meta
        keywords/description tags, and collects inline image URLs.
        Yields one populated ArticleItem.
        """
        item = ArticleItem()
        item["imageList"] = []

        ptags = response.xpath(
            '//div[@class="atcle-mid"]/div[@class="artle-cont"]/p')

        # string() flattens each <p>, including text inside nested tags.
        text = [p.xpath("string()").extract()[0].strip() for p in ptags]

        content = "<br>".join(text)

        def extract_with_css(query):
            return response.css(query).get(default="").strip()

        item["tagName"] = self.keyword
        item["keyword"] = response.xpath(
            "//meta[@name='keywords']/@content").extract()[0]
        item["description"] = response.xpath(
            "//meta[@name='description']/@content").extract()[0]
        item["title"] = extract_with_css(
            "div.atcle-mid div.atcle-top  h1::text")
        item["author"] = ""
        item["content"] = content
        item["source"] = response.request.url

        images = response.xpath(
            '//div[@class="atcle-mid"]/div[@class="artle-cont"]//img')

        for img in images:
            # Fix: extract()[0] raised IndexError for an <img> without a src
            # attribute (the "or ''" fallback never ran); get(default='') is safe.
            item["imageList"].append(img.xpath('./@src').get(default=''))

        yield item
Ejemplo n.º 2
0
    def parse_article(self, response):
        """Parse a post-details page into an ArticleItem with comments.

        Collects all text nodes of the post body (joined with <br>),
        the read/like counters, the comment list, and inline images.
        Yields one populated ArticleItem.
        """
        item = ArticleItem()
        item["images"] = []

        text = response.xpath(
            '//div[@class="postDetailsContent"]//text()').extract()

        _text = [t.strip() for t in text]

        content = "<br>".join(_text)

        def extract_with_css(query):
            return response.css(query).get(default="").strip()

        item["tagName"] = self.keyword
        item["keyword"] = response.xpath(
            "//meta[@name='keywords']/@content").extract()[0]
        item["description"] = response.xpath(
            "//meta[@name='description']/@content").extract()[0]
        item["title"] = extract_with_css("div.postTitle::text")
        item["author"] = ""
        item["content"] = content
        item["topicUrl"] = ""
        item["source"] = response.request.url
        item["visits"] = response.xpath(
            "//span[@class='read_icon']/text()").extract()[0]
        item["likes"] = response.xpath(
            "//span[@class='L-zann']/text()").extract()[0]
        item["commentList"] = []

        # No emptiness guard needed: iterating an empty SelectorList is a no-op.
        comments = response.xpath("//ul[contains(@class, 'commentList')]/li")

        for comment in comments:
            _item = {}
            _item["username"] = comment.xpath(
                ".//h6[contains(@class, 'username')]/a/text()").extract(
                )[0].strip()
            _item["content"] = comment.xpath(
                ".//div[contains(@class, 'reply')]/text()").extract(
                )[0].strip()
            _item["headPortrait"] = comment.xpath(
                ".//div[@class='head_img']//img/@src").extract()[0]

            item["commentList"].append(_item)

        images = response.xpath('//div[@class="postDetailsContent"]//img')

        for img in images:
            # Fix: extract()[0] raised IndexError for an <img> without a src
            # attribute (the "or ''" fallback never ran); get(default='') is safe.
            item["images"].append(img.xpath('./@src').get(default=''))

        yield item
Ejemplo n.º 3
0
    def parse_news(self, response):
        """Build an ArticleItem from an audio-intro news page and yield it.

        The author node appears either as an <a class='a'> or a
        <span class='a'>; both markup variants are handled.
        """
        article = ArticleItem()

        article["tagName"] = self.keyword
        has_keywords = response.xpath("//meta[@name='keywords']")
        article["keyword"] = (response.xpath(
            "//meta[@name='keywords']/@content").extract()[0]
            if has_keywords else "")

        article["description"] = response.xpath(
            "//meta[@name='description']/@content").extract()[0]
        article["title"] = response.xpath(
            "//h1[@class='audio-intro-h1']/text()").extract()[0]

        # Prefer the <a class='a'> author markup, fall back to the
        # <span class='a'> variant.
        if response.xpath(
                "//div[@class='dr-li-item audio-intro-l']/a[@class='a']"):
            article["author"] = response.xpath(
                "//div[@class='dr-li-item audio-intro-l']/a[@class='a']/span[@class='nm']/text()"
            ).extract()[0]
        else:
            article["author"] = response.xpath(
                "//div[@class='dr-li-item audio-intro-l']/span[@class='a']/span[@class='nm']/text()"
            ).extract()[0]

        # The visit count follows the "阅读量:" label inside the shows span.
        shows_text = response.xpath(
            "//div[@class='intro-ts mt25']/span[contains(@class, 'shows')]/text()"
        ).extract()[0]
        article["visits"] = shows_text.split("阅读量:")[1]

        article["images"] = []
        article["likes"] = 0
        article["topicUrl"] = ""
        article["commentList"] = []

        yield article
Ejemplo n.º 4
0
    def parse_news(self, response):
        """Parse a news article page into an ArticleItem.

        The body markup varies between pages: prefer the
        <td id='article_content'> container, then the ul-nested div/p
        tags, and finally fall back to the flattened text of the whole
        ul block. Yields one populated ArticleItem.
        """
        article = ArticleItem()
        article["tagName"] = self.keyword
        article["keyword"] = response.xpath(
            "//meta[@name='keywords']/@content").extract()[0]
        article["description"] = response.xpath(
            "//meta[@name='description']/@content").extract()[0]
        article["title"] = response.xpath(
            "//p[@class='a_title']/text()").extract()[0]
        article["author"] = response.xpath(
            "//p[@class='box_p']/span[1]/text()").extract()[0]

        article_content = response.xpath(
            "//div[@class='yxli']//td[@id='article_content']")

        if article_content:
            divs = article_content.xpath("./div | ./p")
            # Keep only non-empty flattened paragraphs.
            text = []
            for d in divs:
                t = d.xpath("string()").extract()[0].strip()
                if len(t) > 0:
                    text.append(t)
            article["content"] = "<br>".join(text)
        else:
            ptags = response.xpath(
                "//div[@class='yxli']/div/ul/ul/div | //div[@class='yxli']/div/ul/ul/p")

            if len(ptags) > 0:
                text = []
                for p in ptags:
                    t = p.xpath("string()").extract()[0].strip()
                    if len(t) > 0:
                        text.append(t)
                article["content"] = "<br>".join(text)
            else:
                # Last resort: flatten the whole <ul> block in one go.
                article["content"] = response.xpath(
                    "string(//div[@class='yxli']/div/ul/ul)").extract()[0]

        article["source"] = response.meta["origin_url"]
        article["images"] = []

        # Keep only URLs that contain "http" (relative paths are skipped).
        images = response.xpath("//div[@class='yxli']//img/@src").extract()
        for img in images:
            if img.find("http") >= 0:
                article["images"].append(img)

        article["visits"] = 0
        article["likes"] = response.xpath(
            "//td[@id='diggnum']/strong/text()").extract()[0]
        article["topicUrl"] = ""
        article["commentList"] = []

        yield article
Ejemplo n.º 5
0
    def parse_article(self, response):
        """Parse a shiguanzhijia article page into an ArticleItem.

        Author and visit count are parsed from the '|'-separated info
        line (expected to contain "原作者: …" and "查看: …" segments);
        the content is the flattened div/p tags of the article body and
        comments come from the #comment_ul list.
        """
        article = ArticleItem()

        article["tagName"] = self.keyword
        article["keyword"] = response.xpath(
            "//meta[@name='keywords']/@content").extract()[0]
        article["description"] = response.xpath(
            "//meta[@name='description']/@content").extract()[0]
        article["title"] = response.xpath(
            "//div[contains(@class, 'white')]//div[@class='h hm']/h1/text()"
        ).extract()[0]
        article["author"] = "shiguanzhijia"
        article["visits"] = 0
        article_info = response.xpath(
            "string(//div[contains(@class, 'white')]//p[@class='xg1'])"
        ).extract()[0].split("|")

        # Pick out the author / visit-count segments when present; the
        # defaults above are kept otherwise.
        for info in article_info:
            if info.find("原作者") >= 0:
                article["author"] = info.split("原作者: ")[1]
            elif info.find("查看") >= 0:
                article["visits"] = info.split("查看: ")[1]

        article["likes"] = 0
        article["topicUrl"] = ""
        article["commentList"] = []

        comments = response.xpath("//div[@id='comment_ul']/dl")
        for comment in comments:
            _item = {}
            _item["username"] = comment.xpath("./dt/a/text()").extract()[0]
            _item["content"] = comment.xpath(
                "./dd/text()").extract()[0].strip()
            article["commentList"].append(_item)

        text = []
        div_tags = response.xpath(
            "//td[@id='article_content']//div | //td[@id='article_content']//p"
        )
        for div in div_tags:
            text.append(div.xpath("string()").extract()[0].strip())

        article["content"] = "<br>".join(text)
        article["source"] = response.meta["origin_url"]
        article["images"] = []

        # Fix: the image selector used @class='article_content' while the
        # content selector above uses @id='article_content' — almost
        # certainly a typo that kept images from ever being collected.
        imgs = response.xpath("//td[@id='article_content']//img")

        for img in imgs:
            article["images"].append(img.xpath("./@src").extract()[0])

        yield article
Ejemplo n.º 6
0
    def parse_article(self, response):
        """Parse a note-detail page into an ArticleItem.

        Content is built from the p/span tags of the detailinfo block;
        visits/likes are taken from the text after the "浏览:" / "点赞:"
        labels in the title bar. Yields one populated ArticleItem.
        """
        article = ArticleItem()
        article["tagName"] = self.keyword
        if response.xpath("//meta[@name='keywords']"):
            article["keyword"] = response.xpath(
                "//meta[@name='keywords']/@content").extract()[0]
        else:
            article["keyword"] = self.keyword

        article["description"] = response.xpath(
            "//meta[@name='description']/@content").extract()[0]
        article["title"] = response.xpath(
            "//div[@class='note_detail_title']/h1/text()").extract()[0]
        article["author"] = ""

        ptags = response.xpath(
            "//div[contains(@class, 'detailinfo')]//p | //div[contains(@class, 'detailinfo')]//span"
        )

        _text = []
        for p in ptags:
            t = p.xpath("string()").extract()[0].strip()
            # Fix: str.replace returns a new string; the result was
            # previously discarded, so embedded newlines never became <br>.
            t = t.replace('\n', '<br>')
            _text.append(t)

        article["content"] = "<br>".join(_text)
        article["source"] = response.meta["origin_url"]

        article["images"] = []
        img_tags = response.xpath("//div[contains(@class, 'detailinfo')]//img")

        for img in img_tags:
            article["images"].append(img.xpath(".//@src").extract()[0])

        article["visits"] = response.xpath(
            "//div[@class='note_detail_title']/span/dl[1]/text()").extract(
            )[0].split("浏览:")[1]

        article["likes"] = response.xpath(
            "//div[@class='note_detail_title']/span/dl[2]/text()").extract(
            )[0].split("点赞:")[1]

        # Fix: was the integer 0; every sibling parser uses "" for an
        # absent topic URL, so keep the field type consistent.
        article["topicUrl"] = ""
        article["commentList"] = []

        yield article
Ejemplo n.º 7
0
    def parse_article(self, response):
        """Turn an article detail page into an ArticleItem and yield it.

        The info row under the title carries source ("来源:") and view
        count ("浏览数:"), separated by runs of four non-breaking spaces.
        """
        item = ArticleItem()
        item["tagName"] = self.keyword
        item["images"] = []

        paragraphs = response.xpath(
            './/div[@class="main_content"]//div[@class="detail"]/ul[@class="detailc"]//p'
        )

        # Flatten every <p> (string() includes nested-tag text) and join.
        body = "<br>".join(
            p.xpath("string()").extract()[0].strip() for p in paragraphs)

        def extract_with_css(query):
            return response.css(query).get(default="").strip()

        item["keyword"] = response.xpath(
            "//meta[@name='keywords']/@content").extract()[0]
        item["description"] = response.xpath(
            "//meta[@name='description']/@content").extract()[0]
        item["title"] = extract_with_css("div.detail > ul.detaila::text")
        item["content"] = body
        item["source"] = response.request.url

        info = response.xpath(
            "//div[@class='detail']/ul[@class='detailb']/ul[1]/text()"
        ).extract()[0]
        _, source_part, visits_part = info.split("\xa0\xa0\xa0\xa0")

        item["author"] = source_part.split("来源:")[1]
        item["visits"] = visits_part.split("浏览数:")[1]
        item["likes"] = 0
        item["topicUrl"] = ""
        item["commentList"] = []

        yield item
Ejemplo n.º 8
0
    def parse_article(self, response):
        """Parse an article page, then fan out one request per pager page.

        Paginated articles are re-fetched page by page via
        self.parse_content, which receives the partially-filled item in
        request.meta. Single-page articles (no pager found) are yielded
        directly.
        """
        item = ArticleItem()
        item["images"] = []

        def extract_with_css(query):
            return response.css(query).get(default="").strip()

        item["tagName"] = self.keyword
        item["keyword"] = response.xpath(
            "//meta[@name='Keywords']/@content").extract()[0]

        # The site emits either 'Description' or 'description'.
        if response.xpath("//meta[@name='Description']"):
            item["description"] = response.xpath(
                "//meta[@name='Description']/@content").extract()[0]
        else:
            item["description"] = response.xpath(
                "//meta[@name='description']/@content").extract()[0]
        item["title"] = extract_with_css("div.art_box h1::text")
        item["author"] = ""
        item["source"] = response.request.url
        item["visits"] = 0
        item["likes"] = 0
        item["topicUrl"] = ""
        item["commentList"] = []

        ptags = response.xpath('.//div[@class="art_con"]/p')

        # content
        text = [p.xpath("string()").extract()[0].strip() for p in ptags]
        item["content"] = "<br>".join(text)

        # images
        images = response.xpath('.//div[@class="art_con"]//img')
        for img in images:
            item["images"].append(img.xpath('./@src').extract()[0] or '')

        page_base_url = ""
        page_count = 0

        # Two pager markup variants; the last link carries the highest page
        # number. Fix: the dot before "html" is now escaped so the pattern
        # no longer matches an arbitrary character there.
        if response.css("div.art_page"):
            page_end_url = response.xpath(
                '//div[@class="art_page"]//a/@href').extract()[-1]
            page_base_url, page_count = regex.findall(r'(.*?)_(.*?)\.html',
                                                      page_end_url)[0]
            page_count = int(page_count)
        elif response.css("div.art_con + div.atp_yema"):
            page_end_url = response.css(
                'div.art_con + div.atp_yema a::attr(href)').extract()[-1]
            page_base_url, page_count = regex.findall(r'(.*?)_(.*?)\.html',
                                                      page_end_url)[0]
            page_count = int(page_count)

        if page_count == 0:
            # Fix: unpaginated articles were silently dropped before —
            # the loop below never ran, so the item was never yielded.
            yield item
            return

        for page in range(1, page_count + 1):
            url = page_base_url + '_{}.html'.format(page)
            request = scrapy.Request(url=url,
                                     callback=self.parse_content,
                                     priority=-page)
            request.meta["item"] = item
            request.meta["page_count"] = page_count
            request.meta["page"] = page
            yield request
Ejemplo n.º 9
0
    def parse_article_page(self, response):
        """Fill response.meta['huatiContent'] with a parsed post.

        Yields nothing when the normal post container is absent;
        otherwise attaches a populated ArticleItem under the 'content'
        key and yields the enriched huatiContent.
        """
        huatiContent = response.meta["huatiContent"]

        # No recognisable post body -> emit nothing.
        if not response.xpath("//article[@class='Post-Main Post-NormalMain']"):
            return

        article = ArticleItem()
        article["tagName"] = self.keyword
        article["keyword"] = response.xpath(
            "//meta[@name='keywords']/@content").extract()[0]
        article["description"] = response.xpath(
            "//meta[@name='description']/@content").extract()[0]
        article["title"] = response.xpath(
            "//article[@class='Post-Main Post-NormalMain']/header[@class='Post-Header']/h1/text()"
        ).extract()[0]
        article["author"] = response.xpath(
            "//div[@class='AuthorInfo-head']//a[@class='UserLink-link']/text()"
        ).extract()[0]

        paragraphs = response.xpath(
            "//div[@class='Post-RichTextContainer']/div/p")
        article["content"] = "<br>".join(
            p.xpath("string()").extract()[0].strip() for p in paragraphs)

        article["commentList"] = []

        # Keep only image URLs that start with "http".
        article["images"] = []
        for img in response.xpath("//div[@class='Post-RichTextContainer']//img"):
            src = img.xpath("./@src").extract()[0]
            if src.find("http") == 0:
                article["images"].append(src)

        article["visits"] = 0
        raw_likes = response.xpath(
            "//div[@class='ContentItem-actions']/span/button[contains(@class, 'VoteButton--up')]/text()"
        ).extract()[0].strip()
        article["likes"] = raw_likes.split("赞同")[1]
        article["source"] = response.meta["origin_url"]
        article["topicUrl"] = response.meta["topic_url"]

        for comment in response.xpath("//div[@class='CommentItemV2']"):
            entry = {}
            # Anonymous comments have no user link; mask the name then.
            if comment.xpath(".//span[@class='UserLink']/a"):
                entry["username"] = comment.xpath(
                    ".//span[@class='UserLink']/a/text()").extract()[0]
            else:
                entry["username"] = "******"
            entry["content"] = comment.xpath(
                "string(.//div[@class='CommentItemV2-metaSibling'])"
            ).extract()[0]
            entry["headPortrait"] = comment.xpath(
                ".//span[@class='UserLink CommentItemV2-avatar']//img/@src"
            ).extract()[0]
            article["commentList"].append(entry)

        huatiContent["content"] = article

        yield huatiContent
Ejemplo n.º 10
0
    def parse_article(self, response):
        """Parse an article-detail page into an ArticleItem.

        The body markup comes in two variants: a wrapper <div> whose <p>
        text nodes are collected directly, or loose <p> tags that are
        flattened with string(). Yields one populated ArticleItem.
        """
        item = ArticleItem()
        item["images"] = []

        detail_div = response.xpath(
            './/div[@class="pb20 article_detail"]/div').get()

        content = ""
        images = []

        if detail_div:
            text = response.xpath(
                './/div[@class="pb20 article_detail"]/div/p//text()').extract()

            _text = [t.strip() for t in text]

            content = '<br>'.join(_text)
            images = response.xpath(
                './/div[@class="pb20 article_detail"]/div//img')

        else:
            ptags = response.xpath(
                './/div[@class="pb20 article_detail"]//p')
            text = [p.xpath("string()").extract()[0].strip() for p in ptags]
            content = "<br>".join(text)

            images = response.xpath(
                './/div[@class="pb20 article_detail"]//img')

        def extract_with_css(query):
            return response.css(query).get(default="").strip()

        item["keyword"] = response.xpath(
            "//meta[@name='keywords']/@content").extract()[0]
        # The site emits either 'Description' or 'description'.
        if response.xpath("//meta[@name='Description']"):
            item["description"] = response.xpath(
                "//meta[@name='Description']/@content").extract()[0]
        else:
            item["description"] = response.xpath(
                "//meta[@name='description']/@content").extract()[0]

        item["tagName"] = self.keyword
        item["title"] = extract_with_css("div.article_l h1.fn + p::text")
        item["author"] = extract_with_css('a.article_writer::text')
        item["content"] = content
        item["source"] = response.request.url
        item["visits"] = extract_with_css("font.orange1::text")
        item["likes"] = 0
        item["topicUrl"] = ""
        item["commentList"] = []

        for img in images:
            # Fix: extract()[0] raised IndexError for an <img> without a src
            # attribute (the "or ''" fallback never ran); get(default='') is safe.
            item["images"].append(img.xpath('./@src').get(default=''))

        yield item
Ejemplo n.º 11
0
    def parse_article(self, response):
        """Parse an article page into an ArticleItem with comments.

        Extracts body paragraphs, the view/like counters and the comment
        list; image tags without a src attribute are skipped.
        Yields one populated ArticleItem.
        """
        item = ArticleItem()
        item["images"] = []

        ptags = response.xpath('//article/p')

        _text = [p.xpath("string()").extract()[0].strip() for p in ptags]

        content = '<br>'.join(_text)

        images = response.xpath('//article//img')

        def extract_with_css(query):
            return response.css(query).get(default="").strip()

        item["tagName"] = self.keyword

        if response.xpath("//meta[@name='keywords']"):
            item["keyword"] = response.xpath(
                "//meta[@name='keywords']/@content").extract()[0]
        else:
            item["keyword"] = self.keyword

        if response.xpath("//meta[@name='description']"):
            item["description"] = response.xpath(
                "//meta[@name='description']/@content").extract()[0]
        else:
            item["description"] = ""

        item["title"] = extract_with_css("section.ouvJEz h1::text")
        item["author"] = extract_with_css('div.rEsl9f a._1OhGeD::text')
        item["content"] = content
        item["source"] = response.meta["origin_url"]

        item["visits"] = response.xpath(
            "//div[@class='rEsl9f']//div[@class='s-dsoj']/span[last()]/text()").extract()[0].split("阅读 ")[1]
        item["likes"] = extract_with_css(
            "div._1kCBjS span._1LOh_5::text").split("人点赞")[0]
        item["topicUrl"] = ""
        item["commentList"] = []

        for img in images:
            img_src = img.xpath('./@src').extract()
            if len(img_src) > 0:
                item["images"].append(img_src[0] or '')

        comments = response.xpath(
            "//div[@class='_2gPNSa']//div[contains(@class,'_2IUqvs _3uuww8')]")

        for comment in comments:
            _item = {}
            # Fix: select the link's text node; the old selector (no /text())
            # serialized the whole <a> element into the username field.
            _item["username"] = comment.xpath(
                ".//div[@class='_23G05g']/a/text()").extract()[0]
            _item["content"] = comment.xpath(
                ".//div[@class='_2bDGm4']//text()").extract()[0].strip()
            _item["headPortrait"] = comment.xpath(
                ".//a[@class='_1OhGeD']/img/@src").extract()[0]
            item["commentList"].append(_item)

        yield item
Ejemplo n.º 12
0
    def parse_news(self, response):
        """Parse a news page (two layout variants) into huatiContent.

        The 'content-top-One' layout and the plain 'title'/'content'
        layout are handled separately; in both cases the resulting
        ArticleItem is attached to huatiContent['content'] and the
        enriched huatiContent is yielded.
        """
        huatiContent = response.meta["huatiContent"]

        if response.xpath("//div[@class='content-top-One']"):
            article = ArticleItem()
            article["tagName"] = response.meta["tagName"]
            article["keyword"] = response.xpath(
                "//meta[@name='keywords']/@content").extract()[0]
            article["description"] = response.xpath(
                "//meta[@name='description']/@content").extract()[0]
            article["title"] = response.xpath(
                "//div[@class='content-top-One']/text()").extract()[0]
            article["author"] = "相因网"

            ptags = response.xpath("//div[@class='Article-content']//p")

            # Keep only non-empty flattened paragraphs.
            text = []
            for p in ptags:
                t = p.xpath("string()").extract()[0].strip()
                if len(t) > 0:
                    text.append(t)

            article["content"] = "<br>".join(text)

            article["images"] = response.xpath(
                "//div[@class='Article-content']//img/@src").extract()
            article["commentList"] = []
            article["visits"] = response.xpath(
                "//span[@class='follow']//em/text()").extract()[0]
            article["likes"] = 0
            article["source"] = response.meta["origin_url"]
            article["topicUrl"] = response.meta["topic_url"]
            huatiContent["content"] = article

            yield huatiContent
        else:
            article = ArticleItem()
            # Fix: this branch never set tagName, unlike the branch above,
            # leaving the field unpopulated for this layout.
            article["tagName"] = response.meta["tagName"]
            article["keyword"] = response.xpath(
                "//meta[@name='keywords']/@content").extract()[0]
            article["description"] = response.xpath(
                "//meta[@name='description']/@content").extract()[0]
            article["title"] = response.xpath(
                "//div[@class='title']/text()").extract()[0]
            article["author"] = ""

            ptags = response.xpath("//div[@class='content']//p")

            _text = []
            for p in ptags:
                t = p.xpath("string()").extract()[0].strip()
                # Fix: str.replace returns a new string; the result was
                # previously discarded, so newlines never became <br>.
                t = t.replace("\n", "<br>")
                _text.append(t)

            article["content"] = "<br>".join(_text)

            img_tags = response.xpath("//div[@class='content']//img")

            _images = []

            for img in img_tags:
                _images.append(img.xpath("./@src").extract()[0])

            article["images"] = _images
            article["commentList"] = []
            # Takes the text before the "已阅读" label — presumably the
            # read counter; verify against a live page.
            article["visits"] = response.xpath(
                "//div[@class='gnxx']//div[@class='onclick']/text()").extract(
                )[0].split("已阅读")[0]
            article["likes"] = 0
            article["source"] = response.meta["origin_url"]
            article["topicUrl"] = response.meta["topic_url"]

            huatiContent["content"] = article

            yield huatiContent