def parse(self, response):
    try:
        logging.info('<JS_progress> parse url: %s' % response.url)
        # Each recommended author is rendered as a .wrap block inside the .col-xs-8 column.
        items = response.xpath('//div[@class="col-xs-8"]/div[@class="wrap"]')
        if len(items) > 0:
            for item in items:
                # The author id is the last path segment of the profile link.
                author_url = item.xpath('a/@href').extract_first()
                if not author_url:
                    continue
                author_id = author_url.split('/').pop()
                parsing_item = AuthorIdItem()
                parsing_item['author_id'] = author_id
                logging.info('<JS><parsingitem> author_id: %s' % author_id)
                yield parsing_item

            # Open the DB session lazily on the first recommendation page,
            # then queue the next page of the recommendation list.
            if self.recommend_page_index == 1:
                self.session = get_db_session()
            self.recommend_page_index += 1
            url = self.recommend_base_url % self.recommend_page_index
            yield Request(url, headers=self.headers, callback=self.parse, dont_filter=True)

            # Kick off crawling of the next pending author, if any.
            request = self.start_request_author()
            if request:
                logging.info('<js_scrapy> start next author....4')
                yield request
    except Exception as ex:
        logging.error('<JS><Author_Commit> parse error:\n' + repr(ex))
        logging.error(traceback.format_exc())
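The parse() method above relies on a few module-level names that this snippet does not show (logging, traceback, Request, AuthorIdItem). A minimal sketch of what that context might look like, assuming AuthorIdItem is a plain Scrapy Item with only the field the method populates:

```python
# Assumed context for the parse() snippet above; module layout is illustrative.
import logging
import traceback

import scrapy
from scrapy import Request


class AuthorIdItem(scrapy.Item):
    # The only field parse() populates; the real project may define more.
    author_id = scrapy.Field()
```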
def __init__(self):
    super(jianshu_spider, self).__init__()
    init_mysql()
    self.session = get_db_session()
    # Reset the per-user completion flags so articles and followers are re-crawled.
    # Note: combine the conditions with SQLAlchemy's | (or or_()); Python's `or`
    # would not build an OR clause in the generated SQL.
    users = self.session.query(User).filter(
        (User.is_article_complete == 1) | (User.is_follower_complete == 1)).all()
    if users:
        for user in users:
            if user.is_article_complete == 1:
                user.is_article_complete = 0
            if user.is_follower_complete == 1:
                user.is_follower_complete = 0
        self.session.flush()
        self.session.commit()
    # Mark previously parsed items as unparsed so they are processed again.
    parsed_items = self.session.query(ParsingItem).filter(ParsingItem.is_parsed == 1).all()
    if parsed_items:
        for parsed in parsed_items:
            parsed.is_parsed = 0
        self.session.flush()
        self.session.commit()
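The constructor calls init_mysql() and get_db_session(), which are not part of this snippet. Assuming they wrap a standard SQLAlchemy engine/sessionmaker pair, they might look roughly like the sketch below; the connection URL and module-level names are placeholders, not the project's actual code:

```python
# Hypothetical sketch of the DB helpers used above, assuming plain SQLAlchemy.
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

_engine = None
_Session = None


def init_mysql():
    # Placeholder connection URL; the real project supplies its own credentials.
    global _engine, _Session
    _engine = create_engine('mysql+pymysql://user:password@localhost/jianshu')
    _Session = sessionmaker(bind=_engine)


def get_db_session():
    # Return a new session bound to the engine created by init_mysql().
    return _Session()
```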
Example 3
def open_spider(self, spider):
    # Acquire a database session when the spider opens.
    self.session = get_db_session()
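open_spider() is the Scrapy item-pipeline hook that runs once when the spider starts. A minimal pipeline built around it would typically pair it with close_spider() to release the session; the class name and the process_item() body below are illustrative, not taken from the project:

```python
# Illustrative pipeline skeleton around the open_spider() hook shown above.
class JianshuDbPipeline(object):
    def open_spider(self, spider):
        self.session = get_db_session()

    def process_item(self, item, spider):
        # Real persistence logic would go here; the item is passed through unchanged.
        return item

    def close_spider(self, spider):
        # Release the session opened in open_spider().
        self.session.close()
```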