Example no. 1
    def parse(self, response):
        
        nextPage = True
            
        for sel in response.xpath('//div[@class="list"]/ul/li'):
            item = NewsItem()
            url = sel.xpath('a/@href').extract()[0]            
            title = sel.xpath('a/text()').extract()[0]
            # "pub_time" rather than "time", to avoid shadowing the time module
            pub_time = sel.xpath('span/text()').extract()[0] + ':00'
            # zero-padded "YYYY-MM-DD HH:MM:SS" strings compare lexicographically
            # in chronological order, so stop once articles are older than endDate
            if pub_time < self.endDate:
                nextPage = False
                break
            item['url'] = url
            item['title'] = title
            item['pub_date'] = pub_time
            item['crawl_ts'] = TimeUtils.getCurrentTimeStamp()

            self.logger.debug("get article time %s" % pub_time)
            
            request = scrapy.Request(url, callback=self.parse_article)
            request.meta['item'] = item

            yield request
            
            
        if nextPage:
            # URL looks like ".../ccjdd_<page>.html": pull out the page number
            page = int(response.url[response.url.find("_") + 1:len(response.url) - 5]) + 1
            url = 'http://finance.eastmoney.com/news/ccjdd_' + str(page) + '.html'
            
            self.logger.debug("get nextPage url %s" % url)
            request = scrapy.Request(url, callback=self.parse)
            yield request
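
The pattern worth noting in Example no. 1 is the hand-off of a partially filled item to a second callback through request.meta. A minimal, self-contained sketch of that pattern (the spider name, URL, and XPaths are illustrative, not from the source):

import scrapy

class MetaHandoffSpider(scrapy.Spider):
    name = "meta_handoff_demo"
    start_urls = ["http://example.com/list.html"]

    def parse(self, response):
        for href in response.xpath('//a/@href').extract():
            item = {'url': href}                       # partially filled item
            request = scrapy.Request(href, callback=self.parse_detail)
            request.meta['item'] = item                # carry it to the callback
            yield request

    def parse_detail(self, response):
        item = response.meta['item']                   # same dict, now completed
        item['title'] = response.xpath('//title/text()').extract_first()
        yield item
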
Example no. 2
    # requires: import html2text, time
    #           from datetime import datetime
    #           from time import mktime
    def parse_article(self, response):
        item = response.meta['item']
        item['crawl_ts'] = TimeUtils.getCurrentTimeStamp()
        # retrieve document body
        converter = html2text.HTML2Text()
        converter.ignore_links = True
        raw = response.xpath('//*[@id="artibody"]').extract()[0]
        if raw.find(u"原始正文start") != -1:
            real_content_start = raw.find(u"原始正文start") + 13
            raw = raw[real_content_start:].trim()

        content = converter.handle(raw)
        item['content'] = content

        # retrieve source
        src_raw = response.xpath('//span[@class="time-source"]').extract()[0]
        src_txt = converter.handle(src_raw).strip()
        source = src_txt.split(" ", 1)[1]  # text after the first space (the source name)
        item['article_source'] = source
        # parse "YYYY年MM月DD日HH:MM" from the first 16 chars once nbsp/spaces are stripped
        pub_date = str(
            datetime.fromtimestamp(
                mktime(
                    time.strptime(
                        src_txt.replace(u"\xa0", "").replace(" ", "")[:16],
                        u"%Y年%m月%d日%H:%M"))))
        item['pub_date'] = pub_date
        self.logger.info('pub_date: ' + item['pub_date'])
        yield item
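
The pub_date expression above packs four conversions into one statement. A standalone sketch of the same chain, with an illustrative input string:

# strptime matches the Chinese date markers as literal characters
import time
from datetime import datetime
from time import mktime

src_txt = u"2016年07月13日 14:08\xa0新浪财经"                  # illustrative input
cleaned = src_txt.replace(u"\xa0", "").replace(" ", "")[:16]  # "2016年07月13日14:08"
parsed = time.strptime(cleaned, u"%Y年%m月%d日%H:%M")
pub_date = str(datetime.fromtimestamp(mktime(parsed)))
print(pub_date)  # 2016-07-13 14:08:00
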
Example no. 3
 def parse_comment(self, response):
     self.logger.debug("parse_comment, url: %s" % response.url)
     article_hive_id = response.meta["article_hive_id"]
     nextPage = True
     
     response_xpath = response.xpath("//div[@class='comment-mod-bd']/div[@*]")
     if len(response_xpath) == 0:  # past the last page: nothing left to parse
         return

     for sel in response_xpath:
         item = CommentItem()
         comment_id = sel.xpath('@id').extract()[0][8:]  # drop the 8-char element-id prefix
         self.logger.debug("comment_id: %s" % comment_id)
         item["id"] = article_hive_id + "&&" + comment_id
         
         if self.redis_conn.zscore(RedisKeys.xueqiu_comment_crawled, comment_id) is not None:
             # already crawled; older comments were seen too, so stop paging
             nextPage = False
             self.logger.debug("nextPage=False")
             break
         
         userName = sel.xpath("div[@class='comment-item-bd']/h4/a[@class='name']/text()").extract()[0]
         item["username"] = userName
         comment_content = sel.xpath("div[@class='comment-item-bd']/div[@class='comment-item-content']").extract()[0]
         converter = html2text.HTML2Text()
         converter.ignore_links = True
         comment_content = converter.handle(comment_content)
         item["content"] = comment_content
     
         comment_pub_date = sel.xpath("div[@class='comment-item-ft']/div[@class='comment-meta']/div[@class='meta-info']/span[@class='time']/text()").extract()[0]
         if comment_pub_date.find(u"今天") != -1:  # e.g. "今天 17:24" ("today 17:24")
             comment_pub_date = TimeUtils.getCurrentDate() + comment_pub_date[2:] + ":00"
         elif comment_pub_date.find(u"分钟前") != -1:  # e.g. "5分钟前" ("5 minutes ago")
             minute_before = comment_pub_date[0:comment_pub_date.find(u"分钟前")]
             comment_pub_date = TimeUtils.getDateSubtractMinutes(int(minute_before))
         else:  # e.g. "07-13 14:08" (month-day, current year implied)
             comment_pub_date = TimeUtils.getCurrentYear() + "-" + comment_pub_date + ":00"

         item["pub_date"] = comment_pub_date
         item["crawl_ts"] = TimeUtils.getCurrentTimeStamp()

         self.logger.info("comment_pub_date: %s" % comment_pub_date)
         
         yield item
         
     if nextPage:
         # assumes a single-digit page number at the end of the URL
         page = int(response.url[-1]) + 1
         article_id = response.meta["article_id"]
         user_id = response.meta["user_id"]
         comment_url = "https://xueqiu.com/service/comment/list?id=" + article_id + "&user_id=" + user_id + "&type=status&sort=false&page=" + str(page)
         request = scrapy.Request(comment_url, headers=self.headers, callback=self.parse_comment)
         request.meta["article_hive_id"] = article_hive_id
         request.meta["article_id"] = article_id
         request.meta["user_id"] = user_id
         yield request
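
The three timestamp formats handled above all normalize to one absolute "YYYY-MM-DD HH:MM:SS" form. A hedged sketch of the same logic using only the standard library, since TimeUtils is a project-specific helper:

from datetime import datetime, timedelta

def normalize_comment_date(raw, now=None):
    # raw is one of: "今天 17:24", "5分钟前", "07-13 14:08"
    now = now or datetime.now()
    if u"今天" in raw:                                   # "today HH:MM"
        return now.strftime("%Y-%m-%d") + raw[2:] + ":00"
    if u"分钟前" in raw:                                 # "N minutes ago"
        minutes = int(raw[:raw.find(u"分钟前")])
        return (now - timedelta(minutes=minutes)).strftime("%Y-%m-%d %H:%M:%S")
    return now.strftime("%Y") + "-" + raw + ":00"        # "MM-DD HH:MM", year implied
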
Example no. 4
    def parse_article(self, response):
        item = response.meta["item"]
        item["crawl_ts"] = TimeUtils.getCurrentTimeStamp()
        # some articles carry no title in the body, so title extraction is skipped:
        # title = response.xpath("//div[@class='status-content']/h4[@class='status-title']/text()").extract()[0]
        
        # retrieve content
        converter = html2text.HTML2Text()
        converter.ignore_links = True
        raw = response.xpath("//div[@class='status-content']/div[@class='detail']/text()").extract()[0]
        content = converter.handle(raw)
        item["content"] = content
        self.logger.debug("content: %s" % content)

        # retrieve source
        src_raw = response.xpath("//div[@class='subtitle']/span[@class='source']/text()").extract()[0]
        src_txt = converter.handle(src_raw).strip()
        source = src_txt[2:]  # drop the two-character label prefix
        item["article_source"] = source
        self.logger.debug("article_source: %s" % source)
        
        yield item
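
Every example flattens HTML with html2text and ignore_links set, so anchors lose their URLs but keep their text. A quick, self-contained demonstration:

import html2text

converter = html2text.HTML2Text()
converter.ignore_links = True
print(converter.handle(u"<div>hello <a href='/x'>world</a></div>").strip())
# -> hello world   (the link target is dropped, the anchor text survives)
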
Example no. 5
 def parse(self, response):
     result = re.findall(r'type=(\d+)', response.url)
     flag = result[0]
             
     jsonresponse = json.loads(response.body_as_unicode())  # response.text in newer Scrapy
     try:
         if self.endDate is None:
             historyOpinion = jsonresponse["data"]["historyOpinion"][0]  # only the current day
             latest_date = historyOpinion['opinionTime']
             self.logger.debug("latest_date: " + latest_date)
             self.logger.debug("current_date: " + self.current_date)
             latest_date = latest_date.encode('UTF-8', 'ignore')  # Python 2-era normalization
             if latest_date != self.current_date:
                 current_time = str(time.strftime("%Y%m%d %H:%M:%S", time.localtime()))
                 self.logger.debug("there is no data at the current time: " + current_time)
                 return
             
             self.logger.debug("start to crawl date: "+latest_date)
             stocks=historyOpinion['hotSearchOpinionDetail']
             for i in range(5):
                 item=BaiduStockOpinionItem()
                 item['pub_date']=latest_date
                 item['batch']=int(self.batch)
                 item['code']=stocks[i]['stockCode']
                 item['name']=stocks[i]['stockName']
                 rankString=stocks[i]['showtext']  # No.1
                 item['rank']=int(rankString.split('.')[1])
                 opinionKeywords=stocks[i]['opinionKeywords']
                 item['keywords']=",".join(opinionKeywords)
                 item['flag']=int(flag)
                 item['crawl_ts']=TimeUtils.getCurrentTimeStamp()
                 yield item
         else:
             for historyOpinion in jsonresponse["data"]["historyOpinion"]:
                 latest_date = historyOpinion['opinionTime']
                 if self.endDate > latest_date:
                     # past the requested window: stop and do not page further
                     self.nextPage = False
                     break
                 # skip dates already recorded in redis as crawled
                 if self.redis_conn.zscore(RedisKeys.baidu_opinion_crawled + flag, latest_date) is not None:
                     continue
                 self.logger.info("start to crawl date: " + latest_date)
                 stocks = historyOpinion['hotSearchOpinionDetail']
                 for i in range(5):  # first five entries
                     item = BaiduStockOpinionItem()
                     item['pub_date'] = latest_date
                     item['batch'] = int(self.batch)
                     item['code'] = stocks[i]['stockCode']
                     item['name'] = stocks[i]['stockName']
                     rankString = stocks[i]['showtext']  # e.g. "No.1"
                     item['rank'] = int(rankString.split('.')[1])
                     opinionKeywords = stocks[i]['opinionKeywords']
                     item['keywords'] = ",".join(opinionKeywords)
                     item['flag'] = int(flag)
                     item['crawl_ts'] = TimeUtils.getCurrentTimeStamp()
                     yield item
             
             if self.nextPage:
                 # assumes a single-digit page number at the end of the URL
                 page = int(response.url[-1]) + 1
                 url = response.url[0:-1] + str(page)
                 request = scrapy.Request(url, callback=self.parse)
                 yield request
     except KeyError as e:
         self.logger.error("exception is %s" % e)
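
Both the xueqiu and Baidu spiders use a Redis sorted set as a crawl ledger: zscore returns None for members never added, which doubles as a "seen before?" check. A hedged sketch of that pattern (the key name is illustrative, not the project's RedisKeys constant):

import time
import redis

r = redis.StrictRedis()
key = "baidu_opinion_crawled" + "0"           # one ledger per flag, as in the spider
if r.zscore(key, "2016-07-13") is None:       # never recorded: crawl this date
    # ... crawl, then record the date with the crawl time as its score
    r.zadd(key, {"2016-07-13": time.time()})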