def parse_comment(self, response):
    """Parse one page of an article's comment list and paginate onward.

    Expects ``response.meta`` to carry ``article_hive_id``, ``article_id``
    and ``user_id`` (set by the callback that issued this request).
    Yields one ``CommentItem`` per not-yet-crawled comment, then a
    follow-up ``scrapy.Request`` for the next page.  Pagination stops as
    soon as an already-crawled comment id is found in Redis (the feed is
    assumed newest-first -- everything after it is old), or when the
    comment container is empty (past the last page).
    """
    self.logger.info("parse_comment, url: " + response.url)
    article_hive_id = response.meta["article_hive_id"]
    next_page = True

    # One child <div> per comment inside the list body.
    comment_nodes = response.xpath("//div[@class='comment-mod-bd']/div[@*]")
    if len(comment_nodes) == 0:  # empty container -> last page reached
        return

    for sel in comment_nodes:
        item = CommentItem()
        # The node id looks like "comment-<id>"; drop the 8-char prefix.
        comment_id = sel.xpath('@id').extract()[0][8:]
        self.logger.debug("comment_id:" + comment_id)
        item["id"] = article_hive_id + "&&" + comment_id

        # Seen before -> everything older was crawled too; stop paging.
        if self.redis_conn.zscore(RedisKeys.xueqiu_comment_crawled,
                                  comment_id) is not None:
            next_page = False
            self.logger.debug("nextPage=False")
            break

        item["username"] = sel.xpath(
            "div[@class='comment-item-bd']/h4/a[@class='name']/text()"
        ).extract()[0]

        raw_content = sel.xpath(
            "div[@class='comment-item-bd']/div[@class='comment-item-content']"
        ).extract()[0]
        converter = html2text.HTML2Text()
        converter.ignore_links = True  # plain text only, drop anchors
        item["content"] = converter.handle(raw_content)

        comment_pub_date = sel.xpath(
            "div[@class='comment-item-ft']/div[@class='comment-meta']/div[@class='meta-info']/span[@class='time']/text()"
        ).extract()[0]
        # Normalise the three display formats the site uses into a full
        # "YYYY-MM-DD HH:MM:SS" string.
        if comment_pub_date.find(u"今天") != -1:  # "today HH:MM"
            comment_pub_date = TimeUtils.getCurrentDate() + comment_pub_date[2:] + ":00"
        elif comment_pub_date.find(u"分钟前") != -1:  # "N minutes ago"
            minute_before = comment_pub_date[0:comment_pub_date.find(u"分钟前")]
            comment_pub_date = TimeUtils.getDateSubtractMinutes(int(minute_before))
        else:  # "MM-DD HH:MM" -> prepend the current year
            comment_pub_date = TimeUtils.getCurrentYear() + "-" + comment_pub_date + ":00"

        item["pub_date"] = comment_pub_date
        item['crawl_ts'] = TimeUtils.getCurrentTimeStamp()
        self.logger.info("comment_pub_date:" + comment_pub_date)

        yield item

    if next_page:
        # BUG FIX: int(response.url[-1]) read only the LAST CHARACTER of
        # the url, so page 10 parsed as 0 and pagination looped forever.
        # The comment urls built below always end with "page=<n>", so
        # parse the whole number.  (Assumes the seed comment url also
        # ends with "page=<n>" -- TODO confirm in the issuing callback.)
        page = int(response.url.rsplit("page=", 1)[-1]) + 1
        article_id = response.meta["article_id"]
        user_id = response.meta["user_id"]
        comment_url = ("https://xueqiu.com/service/comment/list?id=" + article_id
                       + "&user_id=" + user_id
                       + "&type=status&sort=false&page=" + str(page))
        request = scrapy.Request(comment_url, headers=self.headers,
                                 callback=self.parse_comment)
        request.meta["article_hive_id"] = article_hive_id
        request.meta["article_id"] = article_id
        request.meta["user_id"] = user_id
        yield request
# --- Esempio n. 2 (Example 2): next pasted snippet from the same spider ---
    def parse_list(self, response):
        """Parse one JSON page of the xueqiu topic list.

        Yields a ``scrapy.Request`` (callback ``parse_article``) for every
        entry whose URL is not already recorded in Redis, then a request
        for the next list page.  Paging stops once more than 4 entries on
        a page are older than ``self.endDate`` (a single old entry can be
        an edited post floated back to the front of the feed).
        """
        result = json.loads(response.body_as_unicode())
        scope = response.meta['scope']
        next_page = True
        older_than_end_date = 0

        # The endpoint returns up to 20 entries per page.  BUG FIX:
        # `for i in range(20)` indexed blindly and raised IndexError on a
        # short (final) page; slicing caps at 20 without crashing.
        for entry in result[:20]:
            target = entry["target"]
            if target == "_blank":  # placeholder/ad rows carry no article path
                continue

            item = NewsItem()
            url = "https://xueqiu.com/" + target
            item["url"] = url
            if self.redis_conn.zscore(RedisKeys.xueqiu_url_crawled,
                                      url) is not None:
                self.logger.debug('url has been got: ' + url)
                continue

            item["scope"] = scope
            item["title"] = entry["topic_title"]
            self.logger.debug("title: " + item["title"])

            # 'timeBefore' is a display string in one of three formats;
            # normalise it to "YYYY-MM-DD HH:MM:SS".
            pub_date = entry['timeBefore']
            if pub_date.find(u"今天") != -1:  # "today HH:MM"
                pub_date = TimeUtils.getCurrentDate() + pub_date[2:] + ":00"
            elif pub_date.find(u"分钟前") != -1:  # "N minutes ago"
                minute_before = pub_date[0:pub_date.find(u"分钟前")]
                pub_date = TimeUtils.getDateSubtractMinutes(int(minute_before))
            else:  # "MM-DD HH:MM" -> prepend the current year
                pub_date = TimeUtils.getCurrentYear() + "-" + pub_date + ":00"
            self.logger.debug("pub_date: " + pub_date)
            item["pub_date"] = pub_date

            # Lexicographic compare is valid while both sides share the
            # zero-padded "YYYY-MM-DD ..." layout -- TODO confirm the
            # format of self.endDate matches.
            if pub_date < self.endDate:
                older_than_end_date += 1

            article_id = str(entry["id"])
            user_id = str(entry["user_id"])
            item["id"] = user_id + "&&" + article_id + "&&"
            request = scrapy.Request(url,
                                     headers=self.headers,
                                     callback=self.parse_article)
            request.meta['item'] = item
            request.meta['user_id'] = user_id
            request.meta['id'] = article_id
            yield request

        # Require several hits below endDate before stopping, so one
        # re-surfaced old post does not end the crawl early.
        if older_than_end_date > 4:
            next_page = False

        if next_page:
            # BUG FIX: int(response.url[-1]) read only the last digit, so
            # page 10 parsed as 0.  The list urls end with "page=<n>";
            # parse the whole number.  (Assumes the seed list url also
            # ends with "page=<n>" -- TODO confirm.)
            page = int(response.url.rsplit("page=", 1)[-1]) + 1
            # BUG FIX: str(time.time())[0:-3] left a stray "." inside the
            # millisecond cache-buster; build a proper ms timestamp.
            time_stamp = str(int(time.time() * 1000))
            url = ("https://xueqiu.com/statuses/topic.json?simple_user=1"
                   "&filter_text=1&topicType=0&_=" + time_stamp
                   + "&page=" + str(page))
            request = scrapy.Request(url,
                                     headers=self.headers,
                                     callback=self.parse_list)
            # BUG FIX: without propagating 'scope' the next page's
            # parse_list raised KeyError on response.meta['scope'].
            request.meta['scope'] = scope
            yield request
# --- Esempio n. 3 (Example 3): duplicate paste of parse_list ---
 def parse_list(self, response):
     """Parse one JSON page of the topic list; yield article requests.

     Duplicate of the other pasted ``parse_list``: emits a request per
     uncrawled entry and follows to the next page until more than 4
     entries on a page fall before ``self.endDate``.
     """
     result = json.loads(response.body_as_unicode())
     scope = response.meta['scope']
     next_page = True
     older_count = 0

     # Up to 20 entries per page.  BUG FIX: range(20) indexing raised
     # IndexError on a short final page; a slice caps safely at 20.
     for row in result[:20]:
         if row["target"] == "_blank":  # placeholder row, no article path
             continue

         item = NewsItem()
         url = "https://xueqiu.com/" + row["target"]
         item["url"] = url
         if self.redis_conn.zscore(RedisKeys.xueqiu_url_crawled, url) is not None:
             self.logger.debug('url has been got: ' + url)
             continue

         item["scope"] = scope
         item["title"] = row["topic_title"]
         self.logger.debug("title: " + item["title"])

         # Normalise the 'timeBefore' display string (three formats)
         # into "YYYY-MM-DD HH:MM:SS".
         pub_date = row['timeBefore']
         if pub_date.find(u"今天") != -1:  # "today HH:MM"
             pub_date = TimeUtils.getCurrentDate() + pub_date[2:] + ":00"
         elif pub_date.find(u"分钟前") != -1:  # "N minutes ago"
             minutes = pub_date[0:pub_date.find(u"分钟前")]
             pub_date = TimeUtils.getDateSubtractMinutes(int(minutes))
         else:  # "MM-DD HH:MM" -> prepend current year
             pub_date = TimeUtils.getCurrentYear() + "-" + pub_date + ":00"
         self.logger.debug("pub_date: " + pub_date)
         item["pub_date"] = pub_date

         # String compare relies on both sides being zero-padded
         # "YYYY-MM-DD ..." -- TODO confirm self.endDate's format.
         if pub_date < self.endDate:
             older_count += 1

         article_id = str(row["id"])
         user_id = str(row["user_id"])
         item["id"] = user_id + "&&" + article_id + "&&"
         request = scrapy.Request(url, headers=self.headers,
                                  callback=self.parse_article)
         request.meta['item'] = item
         request.meta['user_id'] = user_id
         request.meta['id'] = article_id
         yield request

     # More than 4 old entries on one page -> genuinely past endDate
     # (a lone old entry may be an edited post floated to the front).
     if older_count > 4:
         next_page = False

     if next_page:
         # BUG FIX: int(response.url[-1]) only read the final digit, so
         # page 10 became 0.  List urls end with "page=<n>"; parse it
         # whole.  (Assumes the seed url ends the same way -- TODO confirm.)
         page = int(response.url.rsplit("page=", 1)[-1]) + 1
         # BUG FIX: str(time.time())[0:-3] kept a "." in the cache-buster;
         # use a real millisecond timestamp.
         time_stamp = str(int(time.time() * 1000))
         url = ("https://xueqiu.com/statuses/topic.json?simple_user=1"
                "&filter_text=1&topicType=0&_=" + time_stamp
                + "&page=" + str(page))
         request = scrapy.Request(url, headers=self.headers,
                                  callback=self.parse_list)
         # BUG FIX: propagate 'scope' so the next page's parse_list does
         # not raise KeyError on response.meta['scope'].
         request.meta['scope'] = scope
         yield request
# --- Esempio n. 4 (Example 4): duplicate paste of parse_comment ---
    def parse_comment(self, response):
        """Parse one page of an article's comments and follow to the next.

        ``response.meta`` must carry ``article_hive_id``, ``article_id``
        and ``user_id``.  Yields a ``CommentItem`` per new comment and a
        request for the next page; stops when the comment container is
        empty (past last page) or an already-crawled comment id appears
        in Redis (the list is assumed newest-first).
        """
        self.logger.info("parse_comment, url: " + response.url)
        article_hive_id = response.meta["article_hive_id"]
        next_page = True

        # One child <div> per comment in the list body.
        comment_nodes = response.xpath(
            "//div[@class='comment-mod-bd']/div[@*]")
        if len(comment_nodes) == 0:  # empty -> last page reached
            return

        for sel in comment_nodes:
            item = CommentItem()
            # Node id has the form "comment-<id>": strip the 8-char prefix.
            comment_id = sel.xpath('@id').extract()[0][8:]
            self.logger.debug("comment_id:" + comment_id)
            item["id"] = article_hive_id + "&&" + comment_id

            # Already in Redis -> everything older is crawled; stop paging.
            if self.redis_conn.zscore(RedisKeys.xueqiu_comment_crawled,
                                      comment_id) is not None:
                next_page = False
                self.logger.debug("nextPage=False")
                break

            item["username"] = sel.xpath(
                "div[@class='comment-item-bd']/h4/a[@class='name']/text()"
            ).extract()[0]

            raw_html = sel.xpath(
                "div[@class='comment-item-bd']/div[@class='comment-item-content']"
            ).extract()[0]
            converter = html2text.HTML2Text()
            converter.ignore_links = True  # keep text only, drop anchors
            item["content"] = converter.handle(raw_html)

            comment_pub_date = sel.xpath(
                "div[@class='comment-item-ft']/div[@class='comment-meta']/div[@class='meta-info']/span[@class='time']/text()"
            ).extract()[0]
            # Normalise the site's three display formats into
            # "YYYY-MM-DD HH:MM:SS".
            if comment_pub_date.find(u"今天") != -1:  # "today HH:MM"
                comment_pub_date = (TimeUtils.getCurrentDate()
                                    + comment_pub_date[2:] + ":00")
            elif comment_pub_date.find(u"分钟前") != -1:  # "N minutes ago"
                minutes = comment_pub_date[0:comment_pub_date.find(u"分钟前")]
                comment_pub_date = TimeUtils.getDateSubtractMinutes(int(minutes))
            else:  # "MM-DD HH:MM" -> prepend current year
                comment_pub_date = (TimeUtils.getCurrentYear() + "-"
                                    + comment_pub_date + ":00")

            item["pub_date"] = comment_pub_date
            item['crawl_ts'] = TimeUtils.getCurrentTimeStamp()
            self.logger.info("comment_pub_date:" + comment_pub_date)

            yield item

        if next_page:
            # BUG FIX: int(response.url[-1]) read only the last character
            # of the url, so page 10 parsed as 0 and pagination looped.
            # The urls built below end with "page=<n>"; parse the whole
            # number.  (Assumes the seed url ends the same way -- TODO
            # confirm in the issuing callback.)
            page = int(response.url.rsplit("page=", 1)[-1]) + 1
            article_id = response.meta["article_id"]
            user_id = response.meta["user_id"]
            comment_url = ("https://xueqiu.com/service/comment/list?id="
                           + article_id + "&user_id=" + user_id
                           + "&type=status&sort=false&page=" + str(page))
            request = scrapy.Request(comment_url,
                                     headers=self.headers,
                                     callback=self.parse_comment)
            request.meta["article_hive_id"] = article_hive_id
            request.meta["article_id"] = article_id
            request.meta["user_id"] = user_id
            yield request