def _taskHandler(self, url):
    """ Fetch the page at the given url and apply the corresponding access control. """
    print "Visiting : " + url
    webPage = WebPage(url)
    # Fetch the page content
    flag = webPage.fetch()
    match_obj = RETopic.match(url)
    match_obj2 = REComment.match(url)
    if flag:
        if match_obj is not None:
            # First page of a topic
            topic_id = match_obj.group(1)
            topic = Topic(topic_id, self.group_id)
            comment_list = topic.parse(webPage, isFirstPage = True)  # first-page parsing
            self.topic_dict[topic_id] = topic
            # Save to a single file (deprecated, no longer used)
            #self.save_thread.putTask(self._saveHandler, comment_list, topic = topic)
        elif match_obj2 is not None:
            # Extract comments from a non-first page
            topic_id = match_obj2.group(1)
            start = int(match_obj2.group(2))  # start offset of this comment page
            if topic_id in self.topic_dict:
                topic = self.topic_dict[topic_id]
                if topic is None:
                    log.error('Unknown program error: extraction for topic id %s already ended and its memory was released.' % topic_id)
                    self.topic_dict[topic_id] = None
                    return False
            else:
                # The first page of comments must be processed first; otherwise
                # this topic_id never appears as a key of self.topic_dict.
                log.error('Error: the first page of comments must be extracted first, topic id: %s' % topic_id)
                self.failed.add(topic_id)
                self.finished.add(topic_id)
                return False
            comment_list = topic.parse(webPage, isFirstPage = False)  # non-first-page parsing
            # Save to a single file (deprecated, no longer used)
            #self.save_thread.putTask(self._saveHandler, comment_list, topic = None)
        else:
            #pdb.set_trace()
            log.info('Malformed topic URL: %s in Group: %s.' % (url, self.group_id))
            # Nothing was parsed for this URL; return early so the undefined
            # `topic` below is never touched.
            self.visited_href.add(url)
            return False
        # Check whether this topic has been fully crawled; if so, hand it to the
        # save thread and release the dict entry. This matters because memory
        # usage grows quickly as the number of topics increases.
        if topic.isComplete():
            self.save_thread.putTask(self._saveTopicHandler, self.topic_dict, topic_id)
            #self.topic_dict[topic_id] = None # release the resource
            self.finished.add(topic_id)
            log.info('Topic: %s finished crawling.' % topic_id)
        self.visited_href.add(url)
        return True
    else:
        # Handle pages that failed to fetch: a single failed page marks the
        # whole topic as finished.
        if match_obj is not None:
            # The first page of the topic could not be fetched; mark it finished.
            topic_id = match_obj.group(1)
        elif match_obj2 is not None:
            topic_id = match_obj2.group(1)
            start = int(match_obj2.group(2))
        else:
            log.info('Malformed topic URL: %s in Group: %s.' % (url, self.group_id))
            # No topic id to record for an unrecognized URL; return early.
            self.visited_href.add(url)
            return False
        # Record the failed topic id and mark the topic as finished;
        # some information for this topic may already have been recorded.
        self.failed.add(topic_id)
        self.finished.add(topic_id)
        self.visited_href.add(url)
        return False
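# ---------------------------------------------------------------------------
# Illustrative sketch (assumption, not part of the original module): the
# handler above depends on two module-level regexes, RETopic and REComment,
# defined elsewhere in the project. Assuming douban group topic URLs, they
# could look roughly like the following; the real definitions may differ.
# group(1) captures the topic id, and for paginated comment pages group(2)
# captures the "start" offset, which is why the handler reads
# match_obj2.group(2).
# ---------------------------------------------------------------------------
import re

RETopic = re.compile(r'^http://www\.douban\.com/group/topic/(\d+)/?$')
REComment = re.compile(r'^http://www\.douban\.com/group/topic/(\d+)/\?start=(\d+)')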
def _taskHandler(self, url):
    """ Fetch the page at the given url and apply the corresponding access control. """
    print "Visiting : " + url
    webPage = WebPage(url)
    # Fetch the page content
    flag = webPage.fetch()
    match_obj = RETopic.match(url)
    match_obj2 = REComment.match(url)
    if flag:
        if match_obj is not None:
            # First page of a topic
            topic_id = match_obj.group(1)
            topic = Topic(topic_id, self.groupID)
            comment_list = topic.parse(webPage, True)  # first-page parsing
            self.topicDict[topic_id] = topic
            # Save to file
            self.saveThread.putTask(self._save_handler, comment_list, topic = topic)
        elif match_obj2 is not None:
            # Extract comments from a non-first page
            topic_id = match_obj2.group(1)
            start = int(match_obj2.group(2))  # start offset of this comment page
            if topic_id in self.topicDict:
                topic = self.topicDict[topic_id]
                if topic is None:
                    log.error('Unknown program error: this topic has already finished crawling and its memory was released, topic id: %s' % topic_id)
                    return False
            else:
                log.error('Unknown program error: topic id %s not found in topicDict.' % topic_id)
                self.failed.add(topic_id)
                self.finished.add(topic_id)
                return False
            comment_list = topic.parse(webPage, False)  # non-first-page parsing
            # Save to file
            self.saveThread.putTask(self._save_handler, comment_list, topic = None)
        else:
            #pdb.set_trace()
            log.info('Malformed topic URL: %s in Group: %s.' % (url, self.groupID))
            # Nothing was parsed for this URL; return early so the undefined
            # `topic` below is never touched.
            self.visitedHref.add(url)
            return False
        # Check whether this topic has been fully crawled; if so, release the
        # dict entry. This matters because memory usage grows quickly as the
        # number of topics increases.
        if topic.isComplete():
            self.topicDict[topic_id] = None
            self.finished.add(topic_id)
            log.info('Topic: %s finished crawling.' % topic_id)
        self.visitedHref.add(url)
        return True
    else:
        # Handle pages that failed to fetch: a single failed page marks the
        # whole topic as finished.
        if match_obj is not None:
            # The first page of the topic could not be fetched; mark it finished.
            topic_id = match_obj.group(1)
        elif match_obj2 is not None:
            topic_id = match_obj2.group(1)
            start = int(match_obj2.group(2))
        else:
            log.info('Malformed topic URL: %s in Group: %s.' % (url, self.groupID))
            # No topic id to record for an unrecognized URL; return early.
            self.visitedHref.add(url)
            return False
        # Record the failed topic id and mark the topic as finished;
        # some information for this topic may already have been recorded.
        self.failed.add(topic_id)
        self.finished.add(topic_id)
        self.visitedHref.add(url)
        return False
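# ---------------------------------------------------------------------------
# Illustrative usage sketch (assumptions, not from the original code):
# _taskHandler is meant to be submitted to a putTask-style worker pool, one
# call per URL, with the visited set used to avoid re-fetching pages. The
# names _dispatchURLs, urlQueue and threadPool below are placeholders for
# illustration, not the project's real members.
# ---------------------------------------------------------------------------
def _dispatchURLs(self, urlQueue):
    """Hypothetical driver: feed every unseen URL to _taskHandler."""
    while not urlQueue.empty():
        url = urlQueue.get()
        if url in self.visitedHref:
            continue  # already fetched (or already marked as failed) once
        self.threadPool.putTask(self._taskHandler, url)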