Code Example #1
    def parse(self, response):
        print('a new page'.center(60,'='))
        item = ArticleItem()
        item['title'] = response.xpath('//*[@class="wxTitle"]/h2/text()').extract()[0].strip()
        item['abstract'] = response.xpath('//*[@class="wxBaseinfo"]//*[@id="ChDivSummary"]/text()').extract()[0]
        # item['cite'] = response.xpath('//div[@class="map"]/div/span/text()').extract()
        item['date'] = response.xpath('//div[@class="sourinfo"]/p[3]/a/text()').extract()[0].strip().replace(';\r\n','')

        if response.xpath('//div[@class="wxBaseinfo"]/p[2]/label[@id="catalog_FUND"]'):
            keywords = response.xpath('//div[@class="wxBaseinfo"]/p[3]/a/text()').extract()
            item['keywords'] = [i.strip().replace(';','')  for i in keywords]
            fund = response.xpath('//div[@class="wxBaseinfo"]/p[2]/a/text()').extract()
            item['fund'] = [i.strip().replace(';','') for i in fund]
        else:
            keywords = response.xpath('//div[@class="wxBaseinfo"]/p[2]/a/text()').extract()
            item['keywords'] = [i.strip().replace(';','') for i in keywords]
            item['fund'] = []

        # Each author link hides its (name, code) pair inside an onclick
        # handler; parse it out and follow the author's detail page.
        authors_info = response.xpath('//div[@class="author"]/span/a/@onclick').extract()
        authors_list = [i.strip()[21:-3].replace('\'', '').split(',') for i in authors_info]
        for author in authors_list:
            author_url = 'http://kns.cnki.net/kcms/detail/knetsearch.aspx?sfield=au&skey={}&code={}'.format(author[0], author[1])
            # item['ins'] = author_url
            yield scrapy.Request(author_url, meta={'item': item}, callback=self.parse_author)
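The item is handed off to a parse_author callback that is not shown in this snippet; the commented-out line above suggests it fills an ins (institution) field. A minimal sketch of what that callback might look like, assuming the field name from the comment and an illustrative XPath that is not taken from the source:

    def parse_author(self, response):
        # Hypothetical sketch: the real parse_author is not shown here.
        item = response.meta['item']
        # Illustrative XPath for the author's institution; adjust to the real page.
        ins = response.xpath('//div[@class="wxInfo"]/a/text()').extract()
        item['ins'] = ins[0].strip() if ins else ''
        yield item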
Code Example #2
    def parse_article(self, response):

        city = response.meta['city']
        item = ArticleItem()

        # item["the_id"] it is a counter will be asigned in pipelines
        item["website"] = '搜狐焦点 资讯 ' + city
        item["title"] = remove_csv_noise(
            response.xpath('//div[@class="main-content"]/h1/text()').extract())
        item["link"] = response.url
        item["summary"] = remove_csv_noise(response.meta['summary'])
        item["category"] = remove_csv_noise(
            response.xpath(
                '//div[@class="bread-crumbs-area global-clearfix"]/span/a/text()'
            ).extract())
        item["date"] = remove_csv_noise(
            response.xpath(
                '//div[@class="info-source"]/span/text()').extract()[0])
        item["author"] = remove_csv_noise(
            response.xpath(
                '//div[@class="info-source"]/span/a/text()').extract()[0])
        # Select the article body HTML with XPath and pass it to html2text
        item["text"] = remove_csv_noise(
            html2text.html2text(
                response.xpath('//div[@class="info-content"]').extract()[0]))
        # item["crwaler_time"] =
        item["other"] = '搜狐焦点 资讯 ' + city

        yield item
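remove_csv_noise is imported from elsewhere in these projects and never shown. Examples #9 and #11 below inline what looks like the same cleanup by hand, so a plausible minimal sketch is the following (an assumption reconstructed from those call sites, not the projects' actual helper):

def remove_csv_noise(value):
    # Assumed implementation: flatten list results from extract(), swap ASCII
    # commas for fullwidth ones, and strip line breaks and tabs so fields
    # cannot break a comma-separated output file.
    if isinstance(value, list):
        value = ' '.join(value)
    return str(value).replace(',', '，').replace('\n', '').replace('\t', '').replace('\r', '')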
Code Example #3
File: yunyuedu_spider.py  Project: whu404/getontrip-1
    def parse_list(self, response):

        items = []
        bookItem = response.meta['bookItem']
        result = JSONDecoder().decode(response.body)
        for jsonitem in result['catalog']:
            if jsonitem['grade'] == 2:
                sourceUuid = result['book']['sourceUuid']
                item = ArticleItem()
                item['author'] = bookItem['author']
                item['source'] = bookItem['source']
                item['issue'] = bookItem['issue']
                item['title'] = jsonitem['title']
                item['url'] = ('http://yuedu.163.com/book_reader/' +
                               sourceUuid + '/' + jsonitem['uuid'])
                # Temporarily store the next step's AJAX request URL in the
                # content field
                item['content'] = ('http://yuedu.163.com/getArticleContent.do?sourceUuid=' +
                                   sourceUuid + '&articleUuid=' + jsonitem['uuid'])
                items.append(item)
                # yield item
        for item in items:
            yield Request(item['content'],
                          meta={'item': item},
                          callback=self.parse_details)
Code Example #4
File: yunyuedu_spider.py  Project: whu404/getontrip-1
 def parse_pages(self, response):
     # Python 2 encoding hack; run it once up front rather than per item.
     reload(sys)
     sys.setdefaultencoding('utf-8')
     items = []
     for sel in response.xpath(
             '//*[@id="page-163-com"]/div[2]/div[3]/div/div[2]/div[2]/div/div[2]/div'
     ):
         item = ArticleItem()
         sourceUuid = sel.xpath('a/@href').extract()[0].split('/')[2]
         item['author'] = sel.xpath(
             '//*[@class="author-container"]/dl/dd/text()').extract()[0]
         # The heading text looks like 《source》issue; split it once into
         # the two fields.
         title_parts = sel.xpath('a/h2/text()').extract()[0].replace(
             '《', '').replace('》', ' ').split()
         item['source'] = title_parts[0]
         item['issue'] = title_parts[1]
         item['url'] = ('http://yuedu.163.com/newBookReader.do?'
                        'operation=info&catalogOnly=true&sourceUuid=' + sourceUuid)
         # Pair each item with its own sourceUuid so the requests below do not
         # all reuse the value left over from the last iteration.
         items.append((item, sourceUuid))
         # yield item_url
     for item, sourceUuid in items:
         yield Request(item['url'],
                       meta={
                           'sourceUuid': sourceUuid,
                           'bookItem': item
                       },
                       callback=self.parse_list)
Code Example #5
File: dmoz_spider.py  Project: whu404/getontrip-1
 def parseBooks(self, response):
     for sel in response.xpath('//*[@id="main"]/article/div[1]/div[1]'):
         item = ArticleItem()
         item['url'] = response.url
         item['title'] = sel.xpath('h1/text()').extract()[0].strip()
         return item
Code Example #6
 def parse_pages(self, response):
     item_urls = []
     for sel in response.xpath('//*[@id="top2"]/div/ul/li/div[1]'):
         item_url = ArticleItem()
         item_url['url'] = sel.xpath('a/@href').extract()[0]
         item_urls.append(item_url)
         # yield item_url
     for item_url in item_urls:
         yield Request(item_url['url'], callback=self.parse_list)
Code Example #7
    def parse_list(self, response):
        items = []
        for sel in response.xpath('//*[@class="jx_Article"]/ul/li/h2'):
            item = ArticleItem()
            item['url'] = 'http://www.dooland.com/magazine/' + sel.xpath(
                'a/@href').extract()[0].strip()
            # item['title'] = sel.xpath('a/@title').extract()[0].strip()
            items.append(item)

        for item in items:
            # yield Request(item['url'],meta={'item': item}, callback=self.parse_details)
            yield Request(item['url'], callback=self.parse_details)
Code Example #8
    def parse_article(self, response):

        item = ArticleItem()

        # item["the_id"]       =
        item["website"]      = "辅导圈"
        item["title"]        = remove_csv_noise(response.meta['title'])
        item["link"]         = response.url
        item["summary"]      = remove_csv_noise(response.meta['summary'])
        item["date"]         = remove_csv_noise(response.meta['date'])
        item["category"]     = remove_csv_noise(response.xpath('//div[@class="article-meta"]/span/a/text()').extract())
        item["author"]       = remove_csv_noise(response.meta['author'])
        # Select the article body HTML with XPath and pass it to html2text
        item["text"]         = remove_csv_noise(html2text.html2text(response.xpath('//article[@class="article-content"]').extract()[0]))
        # item["crwaler_time"] = 
        item["other"]        = '教育'

        yield item
Code Example #9
    def parse_article(self, response):

        # Swap ASCII commas for fullwidth ones and strip line breaks and tabs
        # so every field stays safe for comma-separated output.
        def clean(value):
            return str(value).replace(',', '，').replace('\n', '').replace('\t', '').replace('\r', '')

        item = ArticleItem()

        # item["the_id"]       =
        item["website"]      = "鲸媒体"
        item["title"]        = clean(response.xpath('//h1[@class="title"]/text()').extract()[0])
        item["link"]         = response.url
        item["summary"]      = clean(response.meta['summary'])
        item["category"]     = clean(response.xpath('//span[@itemprop="name"]/text()').extract())
        item["date"]         = clean(response.xpath('//span[@class="postclock"]/text()').extract()[0])
        item["author"]       = clean(response.xpath('//span[@class="postoriginal"]/text()').extract())
        # Select the article body HTML with XPath and pass it to html2text
        item["text"]         = clean(html2text.html2text(response.xpath('//div[@class="post-content"]').extract()[0]))
        # item["crwaler_time"] =
        item["other"]        = ''

        yield item
Code Example #10
    def parse_details(self, response):

        item = ArticleItem()
        sel = Selector(response)
        item['url'] = response.url
        item['title'] = sel.xpath(
            '//*[@class="title"]/div/h1/text()').extract()[0].strip()
        item['content'] = sel.xpath('//*[@id="article"]/div').extract()[0]

        # source and issue share one heading; extract it once and split.
        source_issue = sel.xpath(
            '//*[@id="main"]/aside/section[1]/h3/text()').extract()[0]
        item['source'] = source_issue.split()[0]
        item['issue'] = source_issue.split()[1]

        # TODO source ID
        item['source_id'] = source_issue
        item['author'] = ''

        return item
Code Example #11
    def parse_article(self, response):
        # Swap ASCII commas for fullwidth ones and strip line breaks and tabs
        # so every field stays safe for comma-separated output.
        def clean(value):
            return str(value).replace(',', '，').replace('\n', '').replace('\t', '').replace('\r', '')

        item = ArticleItem()
        # item["the_id"]       =
        item["website"] = "胡润百富"
        item["title"] = clean(response.xpath('//div[@class="title"]/text()').extract())
        item["link"] = response.url
        item["summary"] = clean(response.xpath('//section[@class][@style]/text()').extract())
        item["category"] = clean(response.xpath('//ol/li/text()').extract())
        item["date"] = clean(response.xpath('//div[@class="col-sm-6 navsource-l"]/text()').extract())
        item["author"] = clean(response.xpath('//div[@class="col-xs-12 text-right"]/text()').extract())
        # Select the article body HTML with XPath and pass it to html2text
        item["text"] = clean(html2text.html2text(response.xpath(
            '//section[@style="font-size:16px;line-height:24px;"]').extract()[0]))
        # item["crwaler_time"] =
        item["other"] = ''

        yield item
Code Example #12
File: jobbole.py  Project: QFyears/python_crawler
    def parse_detail(self, response):
        item = ArticleItem()
        # Extract the target fields
        # front_img_url = response.meta["front_img_url"]
        front_img_url = response.meta.get('front_img_url', '')  # URL of the article cover image
        title = response.css('div.entry-header h1::text').extract()[0]
        release_date = response.css(
            'p.entry-meta-hide-on-mobile ::text').extract()[0].replace(
                ' ·', '').strip()
        tag = response.css('p.entry-meta-hide-on-mobile a::text').extract()
        tags = ','.join(tag)
        voteup_num = int(
            response.css('span.vote-post-up h10::text').extract()[0])
        collection_num = response.css('span.bookmark-btn::text').extract()[0]
        collection_pattern = re.match(r'.*?(\d+).*', collection_num)
        if collection_pattern:
            collection_num = int(collection_pattern.group(1))
        else:
            collection_num = 0

        comment_num = response.css(
            'a[href="#article-comment"] span::text').extract()[0]
        comment_pattern = re.match(r'.*?(\d+).*', comment_num)
        if comment_pattern:
            comment_num = int(comment_pattern.group(1))
        else:
            comment_num = 0

        content = response.css('div.entry').extract()[0]

        item['front_img_url'] = front_img_url
        item['title'] = title
        item['url'] = response.url
        item['release_date'] = release_date
        item['tags'] = tags
        item['voteup_num'] = voteup_num
        item['collection_num'] = collection_num
        item['comment_num'] = comment_num
        item['content'] = content

        yield item
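All of these snippets assume an ArticleItem declared in each project's items.py, which is never shown and whose fields clearly differ from project to project. As one illustration, a declaration matching the fields this example fills might look like this (hypothetical, reconstructed from usage above):

import scrapy


class ArticleItem(scrapy.Item):
    # Hypothetical declaration reconstructed from the fields used above;
    # the real items.py is not shown in these snippets.
    front_img_url = scrapy.Field()
    title = scrapy.Field()
    url = scrapy.Field()
    release_date = scrapy.Field()
    tags = scrapy.Field()
    voteup_num = scrapy.Field()
    collection_num = scrapy.Field()
    comment_num = scrapy.Field()
    content = scrapy.Field()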
Code Example #13
File: jobbole.py  Project: yutao9023/py
 def parse_detail(self, response):
     item = ArticleItem()
     item['url_object_id'] = get_md5(response.url)
     item['front_image_url'] = [response.meta.get('front_image_url', '')]
     item['post_url'] = response.url
     item['description'] = response.meta.get('description', '')  # defaults to empty
     item['title'] = response.xpath(
         '//div[@class="entry-header"]/h1/text()').extract()[0]
     item['date'] = response.xpath(
         '//p[@class="entry-meta-hide-on-mobile"]/text()').extract(
         )[0].strip().replace('·', '').strip()
     item['category'] = response.xpath(
         '//p[@class="entry-meta-hide-on-mobile"]/a[@rel="category tag"]/text()'
     ).extract()[0]
     # Evaluate each XPath once, then fall back to 0 when no digits match.
     fav_path = '//span[contains(@class, "vote-post-up")]/h10/text()'
     fav_nums = response.xpath(fav_path).re(r'\d+')
     item['fav_num'] = int(fav_nums[0]) if fav_nums else 0
     collections_path = '//span[@class=" btn-bluet-bigger href-style bookmark-btn  register-user-only "]/text()'
     collections = response.xpath(collections_path).re(r'\d+')
     item['collections'] = int(collections[0]) if collections else 0
     comment_path = '//span[@class="btn-bluet-bigger href-style hide-on-480"]/text()'
     comments = response.xpath(comment_path).re(r'\d+')
     item['comment'] = int(comments[0]) if comments else 0
     yield item
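get_md5 is a project utility that is not shown. A minimal sketch of the URL-fingerprint helper it appears to be, assuming the conventional hashlib implementation rather than the project's actual code:

import hashlib


def get_md5(url):
    # Assumed helper: reduce the URL to a stable fixed-length id
    # suitable for url_object_id.
    if isinstance(url, str):
        url = url.encode('utf-8')
    return hashlib.md5(url).hexdigest()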
Code Example #14
    def readFile(self, response):
        # def parse(self, response):
        dir_path = "E:\\kanlishi"
        wildcard = ".txt"
        exts = wildcard.split(" ")
        files = os.listdir(dir_path)
        count = 0
        items = []
        for name in files:
            for ext in exts:
                if name.endswith(ext):
                    # File names are expected to look like <issue>_..._<aid>.txt
                    aid = name.split('_')[2]
                    count = count + 1
                    item = ArticleItem()
                    item['source'] = '看历史'.decode('utf8')
                    item['issue'] = name.split('_')[0].decode('GBK')
                    item['url'] = ('http://www.dooland.com/magazine/article_' +
                                   aid + '.html')
                    items.append(item)
                    yield Request(item['url'],
                                  meta={'item': item},
                                  callback=self.parse_kanlishi_details)

                    break
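This callback reads local files rather than the response, so something must still trigger it; in Scrapy that is typically a seed request. A hypothetical driver, assuming any fetchable URL works as the trigger:

    def start_requests(self):
        # Hypothetical sketch: readFile above never touches the response body,
        # so a single request to any reachable page is enough to invoke it.
        yield Request('http://www.dooland.com/', callback=self.readFile)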
Code Example #15
    def parse_article(self, response):
        if 'html><head>' in response.body_as_unicode():
            return

        item = ArticleItem()

        # item["the_id"] it is a counter will be asigned in pipelines
        item["website"] = '观点房产'
        item["title"] = remove_csv_noise(response.meta['title'])
        item["link"] = response.url
        item["summary"] = remove_csv_noise(response.meta['summary'])
        item["date"] = remove_csv_noise(response.meta['date'])
        item["category"] = '资讯'
        item["author"] = remove_csv_noise(
            response.xpath('//div[@class="con_l_info_l"]/a/text()').extract()
            [-1])
        # Select the article body HTML with XPath and pass it to html2text
        item["text"] = remove_csv_noise(
            html2text.html2text(
                response.xpath('//div[@class="con_l_inner"]').extract()[0]))
        # item["crwaler_time"] =
        item["other"] = '观点房产 资讯'

        yield item
Code Example #16
    def parse(self, response):
        if len(self.currentUrl) == 0:
            self.currentUrl = str(response.url)
        sel = Selector(response)
        title = sel.xpath('//div[@class="content"]/h1/text()').extract()
        nextArticleUrlList = sel.xpath(
            '//div[@class="content"]/div[@class="pre_art"]/a').extract()
        nextArticleUrl = ''
        if len(nextArticleUrlList) > 1:
            nextArticleUrl = sel.xpath(
                '//div[@class="content"]/div[@class="pre_art"]/a[last()]/@href'
            ).extract()[0]
        contents = sel.xpath(
            '//div[@class="content"]/div[@class="content_01"]/p')
        nextPage = sel.xpath('//div[@class="page2"]/a[last()]')
        nextPageStr = nextPage.xpath('./text()').extract()[0].encode('utf-8')
        nextPageUrl = nextPage.xpath('./@href').extract()[0]
        # log.msg("Append done." + nextPageStr + nextPageUrl)
        for content in contents:
            # Filter out a few special cases.
            # First, check whether this node is an image.
            imgs = content.xpath('./img')
            if imgs:  # it is an image
                for img in imgs:
                    if str(img.xpath('@src').extract()[0]).startswith(
                            'data:image/'):
                        log.msg('discard data-URI image')
                    else:
                        imgpath = img.xpath('@src').extract()[0]
                        if imgpath.startswith('http:') or imgpath.startswith(
                                'https:'):
                            log.msg('nothing to do')
                        else:
                            imgpath = self.imageQianZhui + imgpath
                        if len(self.currentMainImage) == 0:
                            self.currentMainImage = imgpath
                        self.contentList.append(imgpath)
            else:  # not an image
                # Bold text is extracted separately
                if content.xpath('./strong'):
                    strongStr = content.xpath('./strong/text()').extract()[0]
                    self.contentList.append(strongStr)
                else:
                    textStr = content.xpath('./text()').extract()[0]
                    self.contentList.append(textStr)

        # nextPageStr holds utf-8 bytes, so encode the marker the same way.
        nextStr = u'下一页'.encode('utf-8')
        if nextPageStr == nextStr:  # there is a next page
            # log.msg("Append done.----equal")
            # log.msg("Append done.----nextPageUrl:" + nextPageUrl)
            yield Request(nextPageUrl, callback=self.parse)
        else:  # no more pages
            item = ArticleItem()
            item['title'] = [t.encode('utf-8') for t in title]
            item['title'] = item['title'][0]
            contentStr = ""
            for index in range(len(self.contentList)):
                contentStr += self.contentList[index].encode('utf-8')
                if index != len(self.contentList) - 1:
                    contentStr += '$'

            item['content'] = contentStr
            item['url'] = self.currentUrl.encode('utf-8')
            item['mainImage'] = self.currentMainImage.encode('utf-8')
            print(self.contentList)
            self.contentList = []
            self.articleCount += 1
            self.currentUrl = ''
            self.currentMainImage = ''
            yield item
            # try to crawl the next article
            if nextArticleUrl and self.articleCount < self.articleMaxCount:
                log.msg("Append done.----nextArticleUrl:" + nextArticleUrl)
                yield Request(nextArticleUrl, callback=self.parse)