def parse(self, page):
    """Parse a search-result page and store hot weibos into the DB.

    Args:
        page: raw page content, handed to page_2_weibo_trees_adv.

    Returns:
        [crawl_feed_count, new_feed_count, increment, {mid: n_comment}]
        Always four elements, including the empty-page early exit
        (BUGFIX: the early exit previously returned only three).

    Raises:
        AdvKeywordWeiboPageParseException: when the weibo trees cannot
            be extracted from the page; processing stops immediately.
    """
    url_wrapper = self.url_wrapper
    search_url = url_wrapper.to_url()
    weibo_type = url_wrapper.get_url_type()
    page_num = url_wrapper.page_num

    weibo_comment_infor = {}
    crawl_feed_count = 0
    new_feed_count = 0
    increment = 0            # feeds newer than url_wrapper.last_mid
    increment_mark = True    # False once the previous crawl's last_mid is seen
    last_mid = '0'
    last_mid_mark = True     # True until the first (newest) mid is recorded

    # If tree extraction fails the whole page is unusable; bare `raise`
    # re-raises while preserving the original traceback (BUGFIX: was
    # `raise err`, which loses it under Python 2).
    try:
        weibo_trees = page_2_weibo_trees_adv(page)
    except AdvKeywordWeiboPageParseException:
        raise

    if not weibo_trees:
        # BUGFIX: keep the return shape consistent with the normal path
        # (four elements; increment is 0 when nothing was crawled).
        return [crawl_feed_count, new_feed_count, increment, weibo_comment_infor]

    for weibo_tree in weibo_trees:
        weibo = None
        crawl_feed_count += 1
        try:
            weibo = weibo_tree_2_weibo_adv(self.url_wrapper.keyword,
                                           weibo_tree, page_num, weibo_type)
        except AdvKeywordWeiboPageParseException as err:
            # A single bad feed is logged and skipped; the page goes on.
            my_log.write_log(log_type=weibo_type,
                            operation_status=0,
                            fail_code=err.get_error_code(),
                            err_msg=search_url)
        if not weibo:
            continue

        storer = AdvKeywordHotWeiboStorer(weibo)
        weibo_comment_infor[weibo.mid] = weibo.n_comment
        # store() presumably returns the bool True exactly when the weibo
        # was newly inserted — the identity check is kept deliberately.
        if storer.store() is True:
            new_feed_count += 1

        if last_mid_mark:
            # Remember the first (= newest) mid on the page.
            last_mid = weibo.mid
            last_mid_mark = False
        if weibo.mid == self.url_wrapper.last_mid:
            # Reached the newest mid of the previous crawl: everything
            # from here on is old.
            increment_mark = False
        if increment_mark:
            increment += 1

    # NOTE(review): post-loop clamp — the increment can never be smaller
    # than the number of newly stored feeds.
    if increment < new_feed_count:
        increment = new_feed_count
    self.url_wrapper.last_mid = last_mid
    return [crawl_feed_count, new_feed_count, increment, weibo_comment_infor]
def parse(self, page):
    """Parse a search-result page and store real-time weibos into the DB.

    Args:
        page: raw page content, handed to page_2_weibo_trees_adv.

    Returns:
        [crawl_feed_count, new_feed_count, increment, {mid: n_comment}]
        Always four elements, including the empty-page early exit
        (BUGFIX: the early exit previously returned only three).

    Raises:
        AdvKeywordWeiboPageParseException: when the weibo trees cannot
            be extracted from the page; the failure is logged with its
            traceback and then re-raised.
    """
    url_wrapper = self.url_wrapper
    search_url = url_wrapper.to_url()
    weibo_type = url_wrapper.get_url_type()
    page_num = url_wrapper.page_num

    weibo_comment_infor = {}
    crawl_feed_count = 0
    new_feed_count = 0

    # Log the full traceback plus the offending URL, then re-raise with
    # a bare `raise` (BUGFIX: `raise err` loses the traceback on Python 2).
    try:
        weibo_trees = page_2_weibo_trees_adv(page)
    except AdvKeywordWeiboPageParseException:
        scheduler_logger.error(traceback.format_exc())
        scheduler_logger.error(self.url_wrapper.tostring() + "\t" +
                               self.url_wrapper.to_url())
        raise

    if not weibo_trees:
        scheduler_logger.info('no weibo_trees')
        # BUGFIX: keep the return shape consistent with the normal path
        # (four elements; increment is 0 when nothing was crawled).
        return [crawl_feed_count, new_feed_count, 0, weibo_comment_infor]

    increment = 0            # feeds newer than url_wrapper.last_mid
    increment_mark = True    # False once the previous crawl's last_mid is seen
    last_mid = '0'
    last_mid_mark = True     # True until the first (newest) mid is recorded

    for weibo_tree in weibo_trees:
        weibo = None
        crawl_feed_count += 1
        try:
            weibo = weibo_tree_2_weibo_adv(self.url_wrapper.keyword,
                                           weibo_tree, page_num, weibo_type)
        except AdvKeywordWeiboPageParseException as err:
            # A single bad feed is logged and skipped; the page goes on.
            my_log.write_log(log_type=weibo_type,
                            operation_status=0,
                            fail_code=err.get_error_code(),
                            err_msg=search_url)
        except AttributeError:
            # Malformed tree: skip this feed silently (best-effort).
            continue
        if not weibo:
            continue

        storer = AdvKeywordRealWeiboStorer(weibo)
        weibo_comment_infor[weibo.mid] = weibo.n_comment
        # store() presumably returns the bool True exactly when the weibo
        # was newly inserted — the identity check is kept deliberately.
        if storer.store() is True:
            new_feed_count += 1

        if last_mid_mark:
            # Remember the first (= newest) mid on the page.
            last_mid = weibo.mid
            last_mid_mark = False
        if weibo.mid == self.url_wrapper.last_mid:
            # Reached the newest mid of the previous crawl: everything
            # from here on is old.
            increment_mark = False
        if increment_mark:
            increment += 1

    # NOTE(review): post-loop clamp — the increment can never be smaller
    # than the number of newly stored feeds.
    if increment < new_feed_count:
        increment = new_feed_count
    self.url_wrapper.last_mid = last_mid
    return [crawl_feed_count, new_feed_count, increment, weibo_comment_infor]
def parse(self, page):
    """Parse a search-result page and store real-time weibos into the DB.

    Duplicate of the previous real-weibo variant in this chunk; kept as-is
    because the enclosing classes are outside the visible file region.

    Args:
        page: raw page content, handed to page_2_weibo_trees_adv.

    Returns:
        [crawl_feed_count, new_feed_count, increment, {mid: n_comment}]
        Always four elements, including the empty-page early exit
        (BUGFIX: the early exit previously returned only three).

    Raises:
        AdvKeywordWeiboPageParseException: when the weibo trees cannot
            be extracted from the page; the failure is logged with its
            traceback and then re-raised.
    """
    url_wrapper = self.url_wrapper
    search_url = url_wrapper.to_url()
    weibo_type = url_wrapper.get_url_type()
    page_num = url_wrapper.page_num

    weibo_comment_infor = {}
    crawl_feed_count = 0
    new_feed_count = 0

    # Log the full traceback plus the offending URL, then re-raise with
    # a bare `raise` (BUGFIX: `raise err` loses the traceback on Python 2).
    try:
        weibo_trees = page_2_weibo_trees_adv(page)
    except AdvKeywordWeiboPageParseException:
        scheduler_logger.error(traceback.format_exc())
        scheduler_logger.error(self.url_wrapper.tostring() + "\t" +
                               self.url_wrapper.to_url())
        raise

    if not weibo_trees:
        scheduler_logger.info('no weibo_trees')
        # BUGFIX: keep the return shape consistent with the normal path
        # (four elements; increment is 0 when nothing was crawled).
        return [crawl_feed_count, new_feed_count, 0, weibo_comment_infor]

    increment = 0            # feeds newer than url_wrapper.last_mid
    increment_mark = True    # False once the previous crawl's last_mid is seen
    last_mid = '0'
    last_mid_mark = True     # True until the first (newest) mid is recorded

    for weibo_tree in weibo_trees:
        weibo = None
        crawl_feed_count += 1
        try:
            weibo = weibo_tree_2_weibo_adv(self.url_wrapper.keyword,
                                           weibo_tree, page_num, weibo_type)
        except AdvKeywordWeiboPageParseException as err:
            # A single bad feed is logged and skipped; the page goes on.
            my_log.write_log(log_type=weibo_type,
                            operation_status=0,
                            fail_code=err.get_error_code(),
                            err_msg=search_url)
        except AttributeError:
            # Malformed tree: skip this feed silently (best-effort).
            continue
        if not weibo:
            continue

        storer = AdvKeywordRealWeiboStorer(weibo)
        weibo_comment_infor[weibo.mid] = weibo.n_comment
        # store() presumably returns the bool True exactly when the weibo
        # was newly inserted — the identity check is kept deliberately.
        if storer.store() is True:
            new_feed_count += 1

        if last_mid_mark:
            # Remember the first (= newest) mid on the page.
            last_mid = weibo.mid
            last_mid_mark = False
        if weibo.mid == self.url_wrapper.last_mid:
            # Reached the newest mid of the previous crawl: everything
            # from here on is old.
            increment_mark = False
        if increment_mark:
            increment += 1

    # NOTE(review): post-loop clamp — the increment can never be smaller
    # than the number of newly stored feeds.
    if increment < new_feed_count:
        increment = new_feed_count
    self.url_wrapper.last_mid = last_mid
    return [crawl_feed_count, new_feed_count, increment, weibo_comment_infor]