def init_record():
    init_db()


init_record()
# mnews.get_info()
# crawl_news("http://www.solidot.org/story?sid=40747")
# pool.wait()
print "news sources:"
News_source_m.print_all()
print "unparsable pages:"
UnparsePage_m.print_all()
def __init__(self, link):
    self.link = link
    self.title = ""
    self.time = 0
    self.content = ""
    self.keywords = ""
    self.refer = []
    self.status = False  # whether the page was parsed successfully

    # Check whether the link was already parsed
    # Check whether the site is on the unparsable blacklist
    if link.find('http://') == -1:
        return  # invalid link
    # Derive the site's base URL (scheme + host) for the blacklist below
    base_url = 'http://' + link.split('/')[2]
    # unparse_check = store.find(UnparsePage_m, UnparsePage_m.url == base_url.decode('utf-8'))
    # if unparse_check.count() != 0:
    #     print "can not parse this link"
    #     return

    self.pq = None
    try:
        # Fetching or parsing the page may fail
        self.pq = pq(url=link).make_links_absolute()
    except Exception:
        print "failed to open this link " + link
    if self.pq is None:
        return

    # Extract title, timestamp, body text and referenced links
    self.title = get_title(self.pq)
    self.time = time.time()
    self.content = get_content(self.pq)
    self.refer = get_refer(self.pq)
    if len(self.title) == 0 or \
       len(self.content) == 0 or len(self.refer) == 0:
        # Parsing failed: record the site in the unparsable-page table
        print "can not parse " + link
        mpage = UnparsePage_m()
        mpage.url = base_url.decode('utf-8')
        mpage.save()
        self.keywords = ''
        return
    else:
        # Segment the title into search keywords (jieba returns an
        # iterator of words)
        self.keywords = jieba.cut_for_search(self.title)
        self.status = True
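# Illustrative usage sketch: the enclosing class is not named in this
# snippet, so "NewsPage" below is a hypothetical stand-in for it.
# Construction fetches and parses the page; status reports success.
#
# page = NewsPage("http://www.solidot.org/story?sid=40747")
# if page.status:
#     print page.title
#     print " ".join(page.keywords)  # keywords is an iterator of words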