Beispiel #1
0
    def parse_detail(self, response):
        """Build an ``ArticleItem`` from an article detail response.

        Each scalar field is read with an XPath query; when the queried node
        is absent the field is ``None`` instead of aborting the callback.

        Args:
            response: Response object exposing ``xpath()`` and ``url``.

        Yields:
            ArticleItem: One item populated from the page.
        """
        title = response.xpath("//h1[@class='title']/text()").get()
        avatar = response.xpath("//a[@class='avatar']/img/@src").get()
        author = response.xpath("//span[@class='name']/a/text()").get()

        pub_time = response.xpath(
            "//span[@class='publish-time']/text()").get()
        if pub_time is not None:
            # Strip the "*" characters embedded in the publish-time text.
            pub_time = pub_time.replace("*", "")

        # The article id is the last path segment, ignoring any query string.
        article_id = response.url.split("?")[0].split("/")[-1]

        # No /text(): the element itself is selected, so markup is kept.
        content = response.xpath("//div[@class='show-content']").get()
        word_count = response.xpath("//span[@class='wordage']/text()").get()
        comment_count = response.xpath(
            "//span[@class='comments-count']/text()").get()
        read_count = response.xpath(
            "//span[@class='views-count']/text()").get()
        like_count = response.xpath(
            "//span[@class='likes-count']/text()").get()

        # getall() returns a list; flatten it to one comma-separated string.
        subjects = ",".join(
            response.xpath(
                "//div[@class='include-collection']/a/div/text()").getall())

        yield ArticleItem(title=title,
                          avatar=avatar,
                          author=author,
                          content=content,
                          pub_time=pub_time,
                          origin_url=response.url,
                          article_id=article_id,
                          subjects=subjects,
                          word_count=word_count,
                          comment_count=comment_count,
                          read_count=read_count,
                          like_count=like_count)
Beispiel #2
0
 def parse_detail(self, response):
     """Build an ``ArticleItem`` from an article detail response.

     Missing nodes produce ``None`` fields instead of raising, and the
     article id is taken from the URL path with any query string removed.

     Args:
         response: Response object exposing ``xpath()`` and ``url``.

     Yields:
         ArticleItem: One item populated from the page.
     """

     def last_token(text):
         """Return the last whitespace-separated token of *text*, or None."""
         tokens = text.split() if text else []
         return tokens[-1] if tokens else None

     title = response.xpath("//h1[@class='_1RuRku']/text()").get()
     author = response.xpath("//span[@class='_22gUMi']/text()").get()
     # No /text(): the element itself is selected, so markup is kept.
     content = response.xpath("//article[@class='_2rhmJa']").get()
     pub_time = response.xpath("//time/text()").get()
     origin_url = response.url
     # Last path segment of the URL, ignoring any query string.
     article_id = origin_url.split("?")[0].split("/")[-1]

     # Only the trailing token of each counter text is kept.
     word_count = last_token(
         response.xpath("//div[@class='s-dsoj']/span[2]/text()").get())
     read_count = last_token(
         response.xpath("//div[@class='s-dsoj']/span[3]/text()").get())
     dimand = response.xpath(
         "//div[@class='s-dsoj']/span[1]/span/text()").get()

     # getall() returns a list; flatten it to one comma-separated string.
     subjects = ','.join(
         response.xpath("//div[@class='_2Nttfz']/a/span/text()").getall())

     yield ArticleItem(title=title,
                       author=author,
                       content=content,
                       pub_time=pub_time,
                       origin_url=origin_url,
                       article_id=article_id,
                       read_count=read_count,
                       dimand=dimand,
                       subjects=subjects,
                       word_count=word_count)
Beispiel #3
0
    def parse_detail(self, response):
        """Extract title, id, content and subjects into an ``ArticleItem``.

        Args:
            response: Response object exposing ``xpath()`` and ``url``.

        Yields:
            ArticleItem: One item populated from the page.
        """
        heading = response.xpath("//h1[@class='_1RuRku']/text()").get()

        # Cut off the query string, then split on '/' and keep the last
        # piece, which is used as the article id.
        page_url = response.url
        without_query = page_url.split('?')[0]
        last_segment = without_query.split('/')[-1]

        body_html = response.xpath("//article[@class='_2rhmJa']").get()

        # getall() returns a list; per the original author's note a list
        # cannot be stored in MySQL, so it is joined into a single string.
        subject_names = response.xpath(
            "//div[@class='_2Nttfz']/a/span/text()").getall()
        joined_subjects = ','.join(subject_names)

        yield ArticleItem(
            title=heading,
            origin_url=response.url,
            article_id=last_segment,
            content=body_html,
            subjects=joined_subjects,
        )
Beispiel #4
0
    def parse_detail(self, response):
        """Build an ``ArticleItem`` from an article detail response.

        Missing avatar or publish-time nodes yield ``None`` for the
        corresponding field instead of raising.

        Args:
            response: Response object exposing ``xpath()`` and ``url``.

        Yields:
            ArticleItem: One item populated from the page.
        """
        title = response.xpath('//h1[@class="title"]/text()').get()
        author = response.xpath('//div[@class="info"]/span/a/text()').get()

        avatar = response.xpath('//div[@class="author"]/a/img/@src').get()
        if avatar is not None:
            # NOTE(review): self.HTTPS is presumably a scheme prefix for a
            # protocol-relative src -- confirm against the spider class.
            avatar = self.HTTPS + avatar

        pub_time = response.xpath(
            '//span[@class="publish-time"]/text()').get()
        if pub_time is not None:
            # Strip the "*" characters embedded in the publish-time text.
            pub_time = pub_time.replace("*", "")

        current_url = response.url
        # Last path segment of the URL, ignoring any query string.
        article_id = current_url.split("?")[0].split("/")[-1]

        # Keep the HTML tags of the content so the formatting is preserved
        # for later layout.
        content = response.xpath('//div[@class="show-content"]').get()

        yield ArticleItem(title=title,
                          avatar=avatar,
                          pubtime=pub_time,
                          origin_url=current_url,
                          author=author,
                          article_id=article_id,
                          content=content)
Beispiel #5
0
    def parse_detail(self, response):
        """Build an ``ArticleItem`` from an article detail response.

        All queries are positional paths below the first ``section`` under
        the element with id ``__next``.

        Args:
            response: Response object exposing ``xpath()`` and ``url``.

        Yields:
            ArticleItem: One item populated from the page.
        """
        # Common prefix shared by every query in this callback.
        section = '//*[@id="__next"]/div[1]/div/div/section[1]'
        header = section + '/div[1]/div'

        title = response.xpath(section + '/h1/text()').get()
        avatar = response.xpath(
            header + '/a[@class="_1OhGeD"]/img/@src').get()
        # The path must start with "//"; a leading "///" is not valid XPath.
        author = response.xpath(
            header + '/div/div[1]/span[@class="FxYr8x"]/a/text()').get()
        pub_time = response.xpath(header + '/div/div[2]/time/text()').get()

        url = response.url
        # Last path segment of the URL, ignoring any query string.
        article_id = url.split('?')[0].split('/')[-1]

        # Concatenate every text node under the article into plain text.
        content = ''.join(
            response.xpath(section + '/article//text()').getall())

        yield ArticleItem(title=title,
                          avatar=avatar,
                          author=author,
                          pub_time=pub_time,
                          origin_url=url,
                          article_id=article_id,
                          content=content)
Beispiel #6
0
    def parse_detail(self, response):
        """Build an ``ArticleItem`` from an article detail response.

        The article id is the last path segment of the URL with any query
        string removed.

        Args:
            response: Response object exposing ``xpath()`` and ``url``.

        Yields:
            ArticleItem: One item populated from the page.
        """
        title = response.xpath("//h1[@class='_1RuRku']/text()").get()
        avatar = response.xpath("//div[@class='_2mYfmT']/a/img/@src").get()
        author = response.xpath(
            "//span[@class='FxYr8x']/a[@class='_1OhGeD']/text()").get()
        put_time = response.xpath("//div[@class='s-dsoj']/time/text()").get()
        url = response.url
        content = response.xpath("//article[@class='_2rhmJa']").get()
        # Drop the query string first so tracking parameters do not end up
        # inside the id.
        article_id = url.split("?")[0].split("/")[-1]

        # NOTE(review): these four queries select element nodes (no /text()),
        # so get() returns the element markup rather than the bare text --
        # confirm that this is what the item is expected to store.
        word_count = response.xpath("//div[@class='s-dsoj']/span[2]").get()
        comment_count = response.xpath(
            "//div[@class='_3nj4GN'][1]/span").get()
        like_count = response.xpath("//div[@class='_3nj4GN'][2]/span").get()
        read_count = response.xpath("//div[@class='s-dsoj']/span[3]").get()

        # getall() returns a list; flatten it to one comma-separated string.
        subjects = ",".join(
            response.xpath("//div[@class='_2Nttfz']/a/span/text()").getall())

        # NOTE(review): the keyword is spelled put_time; confirm it matches
        # the field declared on ArticleItem.
        yield ArticleItem(title=title,
                          author=author,
                          avatar=avatar,
                          put_time=put_time,
                          article_id=article_id,
                          origin_url=url,
                          content=content,
                          word_count=word_count,
                          comment_count=comment_count,
                          like_count=like_count,
                          read_count=read_count,
                          subjects=subjects)
Beispiel #7
0
 def parse_detail(self, response):
     """Collect the article fields of a detail page into an ``ArticleItem``.

     Args:
         response: Response object exposing ``xpath()`` and ``url``.

     Yields:
         ArticleItem: One item populated from the page.
     """

     def first(query):
         """Return the first result of *query* on the response, or None."""
         return response.xpath(query).get()

     # Last path segment of the URL once the query string is cut off.
     path_only = response.url.split("?")[0]

     subject_names = response.xpath(
         ".//div[@class='_2Nttfz']/a/span/text()").getall()

     fields = {
         "title": first(".//h1[@class='_1RuRku']/text()"),
         "avatar": first(".//a[@class='_1OhGeD']/img/@src"),
         "pub_time": first(".//time/text()"),
         "origin_url": response.url,
         "article_id": path_only.split('/')[-1],
         "author": first(".//span[@class='_22gUMi']/text()"),
         # Element node is selected, so the markup is kept.
         "content": first(" .//article[@class='_2rhmJa']"),
         "subjects": ",".join(subject_names),
         "word_count": first(".//div[@class='s-dsoj']/span[1]/text()"),
         "like_count": first(".//span[@class='_1LOh_5']/text()"),
         "read_count": first(".//div[@class='s-dsoj']/span[2]/text()"),
     }
     yield ArticleItem(**fields)
Beispiel #8
0
    def parse_detail(self, response):
        """Collect the article fields of a detail page into an ``ArticleItem``.

        Args:
            response: Response object exposing ``xpath()`` and ``url``.

        Yields:
            ArticleItem: One item populated from the page.
        """
        select = response.xpath

        # Word and read counts are the 1st and 2nd span siblings that follow
        # the span with class '_3tCVn5'.
        sibling_query = (
            "//span[@class='_3tCVn5']/following-sibling::span[%d]/text()")
        words = select(sibling_query % 1).get()
        reads = select(sibling_query % 2).get()

        # Last path segment of the URL once the query string is cut off.
        base_url = response.url.split("?")[0]
        identifier = base_url.split('/')[-1]

        subject_list = select("//div[@class='_2Nttfz']/a/span/text()").getall()

        yield ArticleItem(
            title=select("//h1[@class='_1RuRku']/text()").get(),
            avatar=select("//a[@class='_1OhGeD']/img/@src").get(),
            author=select("//span[@class='FxYr8x']/a/text()").get(),
            pub_time=select("//div[@class='s-dsoj']/time/text()").get(),
            origin_url=response.url,
            article_id=identifier,
            # Element node is selected, so the markup is kept.
            content=select("//article[@class='_2rhmJa']").get(),
            word_count=words,
            read_count=reads,
            subjects=",".join(subject_list),
        )
Beispiel #9
0
    def parse_detail(self, response):
        """Build an ``ArticleItem`` from an article detail response.

        Counter fields are converted to ``int`` when their text contains a
        number. A missing node yields ``None`` for its field instead of
        raising ``IndexError``.

        Args:
            response: Response object exposing ``xpath()`` and ``url``.

        Yields:
            ArticleItem: One item populated from the page.
        """

        def first(query):
            """Return the first result of *query*, or None when absent."""
            return response.xpath(query).get()

        def count_field(query):
            """Return the first integer in the node text.

            Falls back to the raw text when it holds no digits, and to
            ``None`` when the node is missing.
            """
            text = first(query)
            if text is None:
                return None
            # Raw string: "\d" in a plain literal is an invalid escape.
            found = re.match(r".*?(\d+)", text)
            return int(found.group(1)) if found else text

        title = first("//h1[@class='title']/text()")
        author = first("//span[@class='name']/a/text()")
        avatar = first("//a[@class='avatar']/img/@src")
        pub_time = first("//span[@class='publish-time']/text()")

        # Last path segment of the URL, ignoring any query string.
        article_id = response.url.split("?")[0].split("/")[-1]
        # Element node is selected, so the markup is kept.
        content = first("//div[@class='show-content']")

        word_count = count_field("//span[@class='wordage']/text()")
        like_count = count_field("//span[@class='likes-count']/text()")
        read_count = count_field("//span[@class='views-count']/text()")
        comments_count = count_field("//span[@class='comments-count']/text()")

        # getall() returns a list; flatten it to one comma-separated string.
        subjects = ",".join(
            response.xpath(
                "//div[@class='include-collection']/a/div/text()").getall())

        yield ArticleItem(title=title,
                          author=author,
                          avatar=avatar,
                          pub_time=pub_time,
                          origin_url=response.url,
                          article_id=article_id,
                          content=content,
                          word_count=word_count,
                          subjects=subjects,
                          like_count=like_count,
                          read_count=read_count,
                          comments_count=comments_count)
Beispiel #10
0
 def parse_item(self, response):
     """Build an ``ArticleItem`` from an article page response.

     A missing publish-time node yields ``None`` for ``pub_time`` instead
     of raising.

     Args:
         response: Response object exposing ``xpath()`` and ``url``.

     Yields:
         ArticleItem: One item populated from the page.
     """
     title = response.xpath('//h1[@class="title"]/text()').get()
     avatar = response.xpath('//a[@class="avatar"]/img/@src').get()
     author = response.xpath('//span[@class="name"]/a/text()').get()

     pub_time = response.xpath('//span[@class="publish-time"]/text()').get()
     if pub_time is not None:
         # Strip the "*" characters embedded in the publish-time text.
         pub_time = pub_time.replace("*", "")

     # Last path segment of the URL, ignoring any query string.
     article_id = response.url.split("?")[0].split("/")[-1]
     # Element node is selected, so the markup is kept.
     content = response.xpath("//div[@class='show-content-free']").get()

     yield ArticleItem(title=title,
                       avatar=avatar,
                       pub_time=pub_time,
                       origin_url=response.url,
                       article_id=article_id,
                       author=author,
                       content=content)
Beispiel #11
0
 def parse_detail(self, response):
     """Return an ``ArticleItem`` built from the page's article container.

     All field queries are evaluated relative to the
     ``div[@class='article']`` selection.

     Args:
         response: Response object exposing ``xpath()`` and ``url``.

     Returns:
         ArticleItem: The item populated from the page.
     """
     container = response.xpath("//div[@class='article']")

     def pick(relative_query):
         """Return the first match of *relative_query* in the container."""
         return container.xpath(relative_query).get()

     page_url = response.url
     # Last path segment of the URL once the query string is cut off.
     identifier = page_url.split('?')[0].split('/')[-1]

     return ArticleItem(
         title=pick(".//h1[@class='title']/text()"),
         avatar=pick(".//a[@class='avatar']/img/@src"),
         author=pick(".//span[@class='name']/a/text()"),
         pub_time=pick('.//span[@class="publish-time"]/text()'),
         content=pick(".//div[@class='show-content']"),
         origin_url=page_url,
         article_id=identifier,
     )
Beispiel #12
0
    def parse_detail(self, response):
        """Build an ``ArticleItem`` from an article detail response.

        Counter fields are reduced to the first run of digits in their text.
        Missing nodes, or counter text without digits, yield ``None`` instead
        of raising.

        Args:
            response: Response object exposing ``xpath()`` and ``url``.

        Yields:
            ArticleItem: One item populated from the page.
        """

        def first_number(query):
            """Return the first run of digits in the node text, or None."""
            text = response.xpath(query).get()
            digits = re.findall(r"\d+", text) if text else []
            return digits[0] if digits else None

        title = response.xpath("//h1[@class='title']/text()").get()
        avatar = response.xpath("//a[@class='avatar']/img/@src").get()
        author = response.xpath("//span[@class='name']/a/text()").get()

        pub_time = response.xpath(
            "//span[@class='publish-time']/text()").get()
        if pub_time is not None:
            # Strip the "*" characters embedded in the publish-time text.
            pub_time = pub_time.replace("*", "")

        # The article id is hidden in the URL: last path segment, without
        # the query string.
        article_id = response.url.split("?")[0].split("/")[-1]
        # The content keeps its layout tags (element node is selected).
        content = response.xpath("//div[@class='show-content']").get()

        word_count = first_number("//span[@class='wordage']/text()")
        if word_count is not None:
            # Per the original author's note the int() conversion is
            # optional because the value is converted automatically later
            # on -- unverified from this block.
            word_count = int(word_count)
        comment_count = first_number("//span[@class='comments-count']/text()")
        read_count = first_number("//span[@class='views-count']/text()")
        like_count = first_number("//span[@class='likes-count']/text()")

        # getall() returns a list; flatten it to one comma-separated string.
        subjects = ",".join(
            response.xpath(
                "//div[@class='include-collection']/a/div/text()").getall())

        yield ArticleItem(title=title,
                          avatar=avatar,
                          author=author,
                          pub_time=pub_time,
                          origin_url=response.url,
                          article_id=article_id,
                          content=content,
                          subjects=subjects,
                          word_count=word_count,
                          comment_count=comment_count,
                          read_count=read_count,
                          like_count=like_count)
Beispiel #13
0
    def parse_detail(self, response):
        """Build an ``ArticleItem`` from an article detail response.

        Word and read counts are the first and second span texts inside
        ``div[@class='s-dsoj']``. When fewer spans are present the affected
        field is ``None`` instead of raising ``IndexError``.

        Args:
            response: Response object exposing ``xpath()`` and ``url``.

        Yields:
            ArticleItem: One item populated from the page.
        """
        title = response.xpath("//h1[@class='_1RuRku']/text()").get()
        pub_time = response.xpath("//div[@class='s-dsoj']//time/text()").get()

        # Evaluate the shared query once and index into the result safely.
        span_texts = response.xpath(
            "//div[@class='s-dsoj']/span/text()").getall()
        word_count = span_texts[0] if len(span_texts) > 0 else None
        read_count = span_texts[1] if len(span_texts) > 1 else None

        # No /text(): the element itself is selected, so the HTML markup is
        # included in the content.
        content = response.xpath("//div[@class='_gp-ck']").get()
        # Subject names are concatenated without a separator.
        subjects = "".join(
            response.xpath("//div[@class='_2Nttfz']/a/span/text()").getall())

        # NOTE(review): the keyword is spelled origin_rul; it looks like a
        # typo for origin_url -- confirm against the ArticleItem fields
        # before renaming.
        yield ArticleItem(title=title,
                          pub_time=pub_time,
                          origin_rul=response.url,
                          content=content,
                          word_count=word_count,
                          read_count=read_count,
                          subjects=subjects)
Beispiel #14
0
    def parse_detail(self, response):
        """Build an ``ArticleItem`` from an article detail response.

        A missing publish-time node yields ``None`` for ``pub_time`` instead
        of raising.

        Args:
            response: Response object exposing ``xpath()`` and ``url``.

        Yields:
            ArticleItem: One item populated from the page.
        """
        title = response.xpath("//h1[@class='title']/text()").get()
        avatar = response.xpath("//a[@class='avatar']/img/@src").get()
        author = response.xpath("//span[@class='name']/a/text()").get()

        pub_time = response.xpath(
            "//span[@class='publish-time']/text()").get()
        if pub_time is not None:
            # Strip the "*" characters embedded in the publish-time text.
            pub_time = pub_time.replace("*", "")

        # Example: both
        #   https://www.jianshu.com/p/d30d0f91554a?utm_campaign=maleskine&...
        #   https://www.jianshu.com/p/d30d0f91554a
        # give "https://www.jianshu.com/p/d30d0f91554a" after split("?")[0],
        # whose last "/" segment is the article id.
        url_without_query = response.url.split("?")[0]
        article_id = url_without_query.split('/')[-1]

        # Element node is selected, so the markup is kept.
        content = response.xpath("//div[@class='show-content']").get()

        word_count = response.xpath("//span[@class='wordage']/text()").get()
        comment_count = response.xpath(
            "//span[@class='comments-count']/text()").get()
        read_count = response.xpath(
            "//span[@class='views-count']/text()").get()
        like_count = response.xpath(
            "//span[@class='likes-count']/text()").get()

        # getall() returns a list; flatten it to one comma-separated string.
        subjects = ",".join(
            response.xpath(
                "//div[@class='include-collection']/a/div/text()").getall())

        yield ArticleItem(title=title,
                          avatar=avatar,
                          author=author,
                          pub_time=pub_time,
                          origin_url=response.url,
                          article_id=article_id,
                          content=content,
                          subjects=subjects,
                          word_count=word_count,
                          comment_count=comment_count,
                          read_count=read_count,
                          like_count=like_count)
Beispiel #15
0
    def parse_detail(self, response):
        """Build an ``ArticleItem`` from an article detail response.

        Like and comment counts are the last span text of the last and
        second-to-last ``_3nj4GN`` blocks. When a block has no span text the
        field is ``None`` instead of raising ``IndexError``.

        Args:
            response: Response object exposing ``xpath()`` and ``url``.

        Yields:
            ArticleItem: One item populated from the page.
        """

        def last_text(query):
            """Return the last text matched by *query*, or None if none."""
            texts = response.xpath(query).getall()
            return texts[-1] if texts else None

        title = response.xpath("//h1[@class='_1RuRku']/text()").get()
        author = response.xpath(
            "//span[@class='FxYr8x']/a[@class='_1OhGeD']/text()").get()
        avatar = response.xpath(
            "//div[@class='_2mYfmT']/a[@class='_1OhGeD']/img/@src").get()
        pub_time = response.xpath("//div[@class='s-dsoj']/time/text()").get()

        # Example: both
        #   https://www.jianshu.com/p/e86c6f35c556
        #   https://www.jianshu.com/p/e86c6f35c556?utm_campaign=maleskine&...
        # give the article id "e86c6f35c556".
        article_id = response.url.split("?")[0].split("/")[-1]
        # Element node is selected, so the markup is kept.
        content = response.xpath("//article[@class='_2rhmJa']").get()

        read_count = response.xpath(
            "//div[@class='s-dsoj']/span[last()]/text()").get()
        word_count = response.xpath(
            "//div[@class='s-dsoj']/span[last()-1]/text()").get()
        like_count = last_text(
            "//div[@class='-pXE92']/div[@class='_3nj4GN'][last()]/span/text()")
        comment_count = last_text(
            "//div[@class='-pXE92']/div[@class='_3nj4GN'][last()-1]/span/text()"
        )

        # getall() returns a list; flatten it to one comma-separated string.
        subjects = ",".join(
            response.xpath("//div[@class='_2Nttfz']/a/span/text()").getall())

        yield ArticleItem(title=title,
                          author=author,
                          avatar=avatar,
                          pub_time=pub_time,
                          article_id=article_id,
                          content=content,
                          origin_url=response.url,
                          word_count=word_count,
                          read_count=read_count,
                          comment_count=comment_count,
                          like_count=like_count,
                          subjects=subjects)
Beispiel #16
0
    def parse_detail(self, response):
        """Collect the article fields of a detail page into an ``ArticleItem``.

        Prints a marker line followed by the title, publish time, word count
        and read count, then substitutes ``"_NULL"`` for an empty or missing
        read count before building the item.

        Args:
            response: Response object exposing ``xpath()`` and ``url``.

        Yields:
            ArticleItem: One item populated from the page.
        """

        def first(query):
            """Return the first result of *query* on the response, or None."""
            return response.xpath(query).get()

        page_url = response.url
        # Last path segment of the URL once the query string is cut off.
        identifier = page_url.split("?")[0].split('/')[-1]

        heading = first("//h1[@class='_1RuRku']/text()")
        body = first("//div[@class='_2rhmJa']")
        writer = first(
            "//div[1]/div/div/section[1]/div[1]/div/div/div[1]/span/a/text()")
        portrait = first("//a[@class='_1OhGeD']/img/@src")
        published = first("//div[@class='s-dsoj']/time/text()")

        words = first("//div[@class='s-dsoj']/span[2]/text()")
        reads = first("//div[@class='s-dsoj']/span[3]/text()")
        likes = first(
            "//div[@class='_3U4Smb']/div[@class='s-dsoj']/span[1]/span/text()")

        # Trace output, one value per line; reads is printed before the
        # placeholder substitution below.
        for value in (">>>", heading, published, words, reads):
            print(value)

        # Falsy read count (None or empty string) becomes a placeholder.
        reads = reads or "_NULL"

        yield ArticleItem(
            title=heading,
            content=body,
            author=writer,
            avatar=portrait,
            pub_time=published,
            origin_url=page_url,
            article_id=identifier,
            read_count=reads,
            like_count=likes,
            word_count=words,
        )