Beispiel #1
0
    def parse_baijiahao_contents(self, response):
        """Extract the article text from ``response`` and yield it as an item.

        The text nodes of every ``span.bjh-p`` element inside the
        ``#detail-page > #content-container`` element are concatenated and
        stripped of whitespace-like characters.

        Yields:
            One ``MuseumNewsSpiderItem`` with ``news_id`` (from
            ``self.getNewsID(response.url)``) and ``main_content``.

        Side effects:
            Increments ``self.id`` by one.
        """
        # Locate the article container. No extra keyword arguments are
        # passed: Selector.xpath() treats them as XPath variables, not as an
        # encoding hint.
        container = response.xpath(
            '/html/body/div[@id = "detail-page"]/div[@id="content-container"]')
        # ".//" keeps the search inside the container. A bare "//" restarts
        # from the document root and would ignore the container entirely.
        fragments = container.xpath('.//span[@class="bjh-p"]/text()').extract()

        # Remove NBSP, the copyright sign, newlines, ASCII spaces and
        # ideographic spaces in a single pass over the joined text.
        unwanted = str.maketrans('', '', '\xa0\xa9\n \u3000')
        content_string = ''.join(fragments).translate(unwanted)

        # Store the cleaned text in the item.
        item = MuseumNewsSpiderItem()
        item['news_id'] = self.getNewsID(response.url)
        item['main_content'] = content_string

        self.id += 1
        yield item
Beispiel #2
0
    def parse(self, response):
        """Yield one news item per list entry, then request the next page.

        Entries are the ``div.list_list`` elements inside the first
        ``div.content_czyd`` element. An entry is skipped when its title,
        time, summary or link is missing.

        Yields:
            ``MuseumNewsSpiderItem`` objects, followed by a ``Request`` for
            the next page while ``self.page <= 6``.
        """
        containers = response.xpath("//div[@class='content_czyd']")
        if containers:
            news_list = containers[0].xpath(".//div[@class='list_list']")
        else:
            # Indexing an empty result used to raise IndexError and abort
            # the callback; treat the page as having no entries instead.
            self.logger.warning("news container not found on %s",
                                response.url)
            news_list = []

        for news in news_list:
            title = news.xpath("./div/dt/a/text()").extract_first()
            published = news.xpath("./div/dt/span/text()").extract_first()
            summary = news.xpath("./div/dd/a/text()").extract_first()
            link = news.xpath("./div/dt/a/@href").extract_first()
            # extract_first() returns None when nothing matched.
            if None in (title, published, summary, link):
                continue

            item = MuseumNewsSpiderItem()
            item['title'] = title
            item['author'] = "北京鲁迅博物馆"
            item['time'] = published
            item['description'] = "1"
            item['content'] = summary
            item['url'] = prefixURL + link
            item['tag'] = 1
            yield item

        print('page = {}'.format(self.page))
        if self.page <= 6:
            self.page += 1
            new_url = URL.format(page=self.page)
            print(new_url)
            yield Request(new_url, callback=self.parse, dont_filter=True)
    def parse(self, response):
        """Yield one news item per list entry, then request the next page.

        Entries are the ``table[width='85%']`` elements inside the first
        ``td[height='450']`` element. The first text node of an entry is
        used as the time and the second as both title and content.

        Yields:
            ``MuseumNewsSpiderItem`` objects, followed by a ``Request`` for
            the next page while ``self.page < 71``.
        """
        containers = response.xpath("//td[@height='450']")
        if containers:
            news_list = containers[0].xpath(".//table[@width='85%']")
        else:
            # Indexing an empty result used to raise IndexError and abort
            # the callback; treat the page as having no entries instead.
            self.logger.warning("news container not found on %s",
                                response.url)
            news_list = []

        for news in news_list:
            texts = news.xpath(".//text()").extract()
            links = news.xpath(".//@href").extract()
            # Both texts[0] and texts[1] are read below, and a link is
            # required, so skip entries that lack any of them (the previous
            # emptiness-only check let short entries raise IndexError).
            if len(texts) < 2 or not links:
                continue

            title = texts[1]
            item = MuseumNewsSpiderItem()
            item['title'] = title
            item['author'] = "首都博物馆"
            item['time'] = texts[0].replace("\xa0", "")
            item['description'] = "1"
            item['content'] = title
            item['url'] = prefixURL + links[0]
            item['tag'] = 1
            yield item

        print('page = {}'.format(self.page))
        if self.page < 71:
            self.page += 1
            new_url = URL.format(page=self.page)
            print(new_url)
            yield Request(new_url, callback=self.parse, dont_filter=True)
Beispiel #4
0
    def parse(self, response):
        """Yield one news item per ``div.result`` entry, then the next page.

        When the page contains no ``div.result`` element, ``self.end`` is
        set to ``True`` and nothing is yielded. Otherwise each entry's link,
        title, summary and author/time line are extracted, with newlines and
        ASCII spaces removed from every field.

        Yields:
            ``MuseumNewsSpiderItem`` objects, followed by a ``Request`` for
            the next page unless ``self.end`` is set.
        """

        def squash(parts):
            # Join the extracted fragments and drop newlines / ASCII spaces.
            return "".join(parts).replace("\n", "").replace(" ", "")

        news_list = response.xpath('//div[@class="result"]')
        if not news_list:
            self.end = True
            return

        for news in news_list:
            url = squash(
                news.xpath('./h3[@class="c-title"]/a/@href').extract())
            title = squash(
                news.xpath('./h3[@class="c-title"]/a/text()').extract())

            # The summary text is either directly under the summary div or,
            # when that is empty, under its second child div.
            content = squash(
                news.xpath(
                    './div[@class="c-summary c-row "]/text()').extract())
            if content == "":
                content = squash(
                    news.xpath(
                        './div[@class="c-summary c-row "]/div[2]/text()'
                    ).extract())

            # After ASCII spaces are removed, split() separates on whatever
            # whitespace remains between the author and the time.
            author_time = squash(
                news.xpath(
                    './div[@class="c-summary c-row "]'
                    '//p[@class="c-author"]/text()'
                ).extract()).split()

            # Some news entries have no author and/or no time; reading
            # author_time[1] unconditionally raised IndexError when only one
            # token was present.
            author = author_time[0] if author_time else ""
            time = ""
            if len(author_time) > 1:
                time = self.parse_time(author_time[1])

            item = MuseumNewsSpiderItem()
            item['title'] = title
            item['author'] = author
            item['time'] = time
            item['description'] = "1"
            item['content'] = content
            item['url'] = url
            item['tag'] = 1
            yield item

        print('page = {}'.format(self.page))
        if not self.end:
            self.page += 1
            new_url = URL.format(museum=self.museum,
                                 bt=self.startTime,
                                 et=self.endTime,
                                 page=self.page * 10)
            print(new_url)
            yield Request(new_url, callback=self.parse, dont_filter=True)
Beispiel #5
0
    def parse(self, response):
        """Route ``response`` to a site-specific content parser.

        The first marker in the table below that occurs in ``response.url``
        selects the callback, and the URL is requested again with it. When
        no marker matches, the stored ``content`` for the URL is read from
        the ``new`` table via ``self.mydatabase`` and yielded directly.

        Yields:
            Either one ``scrapy.Request`` for the matching parser, or one
            ``MuseumNewsSpiderItem`` built from the database row. Nothing
            is yielded when no marker matches and no row exists.
        """
        url = response.url

        # (substring, method name) pairs. Order matters: the first match
        # wins. Methods are resolved lazily by name, so only the selected
        # one has to exist.
        routes = (
            ("baijiahao", "parse_baijiahao_contents"),
            ("new.qq.com", "parse_tencent_contents"),
            ("news.ifeng.com", "parse_ifeng_contents"),
            ("news.163.com", "parse_163_news_contents"),
            ("3g.163.com", "parse_3g_163_contents"),
            ("thepaper.cn", "parse_pengpai_contents"),
            ("news.sina.com.cn", "parse_sina_contents"),
            ("paper.people.com.cn", "parse_paper_people_contents"),
            ("xinhuanet.com", "parse_xinhuanet_contents"),
            ("bmnh.org.cn", "parse_bmnh_contents"),
            ("capitalmuseum", "parse_capital_museum_contents"),
            ("cstm.cdstm.cn", "parse_cstm_contents"),
            ("luxunmuseum", "parse_luxunmuseum_contents"),
            ("jb.mil.cn", "parse_military_museum_contents"),
            ("gmc", "parse_gmc_contents"),
        )
        for marker, handler_name in routes:
            if marker in url:
                yield scrapy.Request(url, callback=getattr(self, handler_name))
                return

        # No dedicated parser: fall back to the content already stored.
        cursor = self.mydatabase.cursor()
        try:
            # NOTE(review): the URL is interpolated straight into the SQL
            # text, so a URL containing a quote breaks (or alters) the
            # query. Switch to a parameterized execute() once the driver's
            # paramstyle is confirmed.
            sql = "select content from new where url='{}'".format(url)
            cursor.execute(sql)
            result = cursor.fetchone()
        finally:
            # Close the cursor even when the query raises.
            cursor.close()

        if result is None:
            # fetchone() returns None when no row matched; indexing it used
            # to raise TypeError.
            self.logger.warning("no stored content found for %s", url)
            return

        news_id = self.getNewsID(url)
        item = MuseumNewsSpiderItem()
        # An empty list from getNewsID is stored as the sentinel -1.
        item['news_id'] = news_id if news_id != [] else -1
        item['main_content'] = result[0]
        yield item
Beispiel #6
0
    def parse_xinhuanet_contents(self, response):
        """Extract the article text from ``response`` and yield it as an item.

        The text nodes of every ``<p>`` inside the ``#p-detail`` element are
        concatenated and stripped of whitespace-like characters.

        Yields:
            One ``MuseumNewsSpiderItem`` with ``news_id`` (from
            ``self.getNewsID(response.url)``) and ``main_content``.
        """
        # No extra keyword arguments: Selector.xpath() treats them as XPath
        # variables, not as an encoding hint.
        container = response.xpath('//div[@id="p-detail"]')
        # ".//" restricts the search to the container. A bare "//" would
        # collect every <p> in the document.
        fragments = container.xpath('.//p/text()').extract()

        # Remove NBSP, the copyright sign, newlines, ASCII spaces and
        # ideographic spaces in a single pass.
        unwanted = str.maketrans('', '', '\xa0\xa9\n \u3000')
        content_string = ''.join(fragments).translate(unwanted)

        item = MuseumNewsSpiderItem()
        item['news_id'] = self.getNewsID(response.url)
        item['main_content'] = content_string

        yield item
Beispiel #7
0
    def parse_tencent_contents(self, response):
        """Extract the article text from ``response`` and yield it as an item.

        The text nodes of every ``p.one-p`` element under ``<body>`` are
        concatenated and stripped of whitespace-like characters.

        Yields:
            One ``MuseumNewsSpiderItem`` with ``news_id`` (from
            ``self.getNewsID(response.url)``) and ``main_content``.
        """
        # No extra keyword arguments: Selector.xpath() treats them as XPath
        # variables, not as an encoding hint.
        body = response.xpath('/html/body')
        # ".//" makes the query relative to the selected <body> node.
        fragments = body.xpath('.//p[@class="one-p"]/text()').extract()

        # Remove NBSP, the copyright sign, newlines, ASCII spaces and
        # ideographic spaces in a single pass.
        unwanted = str.maketrans('', '', '\xa0\xa9\n \u3000')
        content_string = ''.join(fragments).translate(unwanted)

        item = MuseumNewsSpiderItem()
        item['news_id'] = self.getNewsID(response.url)
        item['main_content'] = content_string

        yield item
Beispiel #8
0
    def parse_gmc_contents(self, response):
        """Extract the article text from ``response`` and yield it as an item.

        The text nodes of every ``p > span`` inside the ``div.article-cont``
        element are concatenated and stripped of whitespace-like characters.

        Yields:
            One ``MuseumNewsSpiderItem`` with ``news_id`` (from
            ``self.getNewsID(response.url)``) and ``main_content``.
        """
        container = response.xpath('//div[@class="article-cont"]')
        # ".//" restricts the search to the container. A bare "//" would
        # collect matching nodes from the whole document.
        fragments = container.xpath('.//p/span/text()').extract()

        # Remove NBSP, the copyright sign, newlines, carriage returns, ASCII
        # spaces and ideographic spaces in a single pass.
        unwanted = str.maketrans('', '', '\xa0\xa9\n\r \u3000')
        content_string = ''.join(fragments).translate(unwanted)

        item = MuseumNewsSpiderItem()
        item['news_id'] = self.getNewsID(response.url)
        item['main_content'] = content_string

        yield item
Beispiel #9
0
    def parse_capital_museum_contents(self, response):
        """Extract the article text from ``response`` and yield it as an item.

        The text nodes of every ``<p>`` inside the ``span.wcontent`` element
        are concatenated and stripped of whitespace-like characters.

        Yields:
            One ``MuseumNewsSpiderItem`` with ``news_id`` (from
            ``self.getNewsID(response.url)``) and ``main_content``.
        """
        container = response.xpath("//span[@class='wcontent']")
        # ".//" restricts the search to the container. A bare "//" would
        # collect every <p> in the document.
        fragments = container.xpath(".//p/text()").extract()

        # Remove NBSP, the copyright sign, newlines, carriage returns, ASCII
        # spaces and ideographic spaces in a single pass.
        unwanted = str.maketrans('', '', '\xa0\xa9\n\r \u3000')
        content_string = ''.join(fragments).translate(unwanted)

        item = MuseumNewsSpiderItem()
        item['news_id'] = self.getNewsID(response.url)
        item['main_content'] = content_string

        yield item
Beispiel #10
0
    def parse_ifeng_contents(self, response):
        """Extract the article text from ``response`` and yield it as an item.

        The text nodes of every ``<p>`` inside the ``div.text-3zQ3cZD4``
        element are concatenated and stripped of whitespace-like characters.

        Yields:
            One ``MuseumNewsSpiderItem`` with ``news_id`` (from
            ``self.getNewsID(response.url)``) and ``main_content``.
        """
        # No extra keyword arguments: Selector.xpath() treats them as XPath
        # variables, not as an encoding hint.
        container = response.xpath("//div[@class='text-3zQ3cZD4']")
        # ".//" restricts the search to the container. A bare "//" would
        # collect every <p> in the document.
        fragments = container.xpath(".//p/text()").extract()

        # Remove NBSP, the copyright sign, newlines, ASCII spaces and
        # ideographic spaces in a single pass.
        unwanted = str.maketrans('', '', '\xa0\xa9\n \u3000')
        content_string = ''.join(fragments).translate(unwanted)

        item = MuseumNewsSpiderItem()
        item['news_id'] = self.getNewsID(response.url)
        item['main_content'] = content_string

        yield item
Beispiel #11
0
    def parse_pengpai_contents(self, response):
        """Extract the article text from ``response`` and yield it as an item.

        The text nodes of every ``<p>`` inside the first element matched by
        the positional path under ``#root`` are concatenated and stripped of
        whitespace-like characters. When the path matches nothing, the item
        is yielded with an empty ``main_content``.

        Yields:
            One ``MuseumNewsSpiderItem`` with ``news_id`` (from
            ``self.getNewsID(response.url)``) and ``main_content``.
        """
        # No extra keyword arguments: Selector.xpath() treats them as XPath
        # variables, not as an encoding hint.
        containers = response.xpath(
            '//*[@id="root"]/div/div[3]/div[1]/div[1]/div[3]/div/div[1]')
        if containers:
            # ".//" restricts the search to the first matched container.
            # A bare "//" would collect every <p> in the document.
            fragments = containers[0].xpath('.//p/text()').extract()
        else:
            # Indexing an empty result used to raise IndexError; fall back
            # to empty content instead.
            fragments = []

        # Remove NBSP, the copyright sign, newlines, ASCII spaces and
        # ideographic spaces in a single pass.
        unwanted = str.maketrans('', '', '\xa0\xa9\n \u3000')
        content_string = ''.join(fragments).translate(unwanted)

        item = MuseumNewsSpiderItem()
        item['news_id'] = self.getNewsID(response.url)
        item['main_content'] = content_string

        yield item
Beispiel #12
0
    def parse_bmnh_contents(self, response):
        """Extract the article text from ``response`` and yield it as an item.

        The text nodes of every ``<span>`` below a ``<p>`` inside
        ``div.content_singler > div.single_block`` are concatenated and
        stripped of whitespace-like characters.

        Yields:
            One ``MuseumNewsSpiderItem`` with ``news_id`` (from
            ``self.getNewsID(response.url)``) and ``main_content``.
        """
        # No extra keyword arguments: Selector.xpath() treats them as XPath
        # variables, not as an encoding hint.
        container = response.xpath(
            '//div[@class="content_singler"]/div[@class="single_block"]')
        # One relative query for "spans under paragraphs in the container".
        # The previous chain of two absolute "//" queries re-ran the
        # whole-document span search once per <p>, duplicating the text.
        fragments = container.xpath('.//p//span/text()').extract()

        # Remove NBSP, the copyright sign, newlines, ASCII spaces and
        # ideographic spaces in a single pass.
        unwanted = str.maketrans('', '', '\xa0\xa9\n \u3000')
        content_string = ''.join(fragments).translate(unwanted)

        item = MuseumNewsSpiderItem()
        item['news_id'] = self.getNewsID(response.url)
        item['main_content'] = content_string

        yield item