# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup   # bs4 assumed; older scripts import BeautifulSoup directly


def crawl(self, urls, depth=2):
    # Fetch each seed URL and collect the absolute (http...) links found in
    # its <a> tags. `depth` is accepted but not used in this excerpt, and the
    # collected url_list is not returned or followed further here.
    for url in urls:
        html = getPage(url)
        if html is None or html == '':
            continue
        soup = BeautifulSoup(html)
        url_list = []
        for link in soup('a'):
            url_str = link.get('href')
            if url_str is None or url_str == '':
                continue
            elif url_str[0:4] == 'http':
                url_list.append(url_str)
            else:
                pass
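# getPage() is called above and in the script below but is never defined in
# this snippet. The helper sketched here is an assumption (a minimal urllib2
# wrapper), not the original implementation: it returns the raw page body,
# or None if the request fails.
import urllib2

def getPage(url):
    try:
        return urllib2.urlopen(url, timeout=10).read()
    except Exception:
        return None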
# Nested dict of movie ratings, keyed by reviewer name and then by movie
# title (ratings kept as strings, as in the original data).
critics = {
    '李楠': {'一代宗师': '2.5', '小时代': '3.5', '钢铁侠': '2.0', '蜘蛛侠': '3.5', '重返地球': '2.5', '007': '3.0'},
    '宋瑶': {'一代宗师': '1.0', '小时代': '3.5', '超人': '1.0', '蜘蛛侠': '2.5', '星际穿越': '2.5', '007': '2.0'},
    '吴琼': {'一代宗师': '1.5', '小时代': '2.5', '钢铁侠': '3.0', '蝙蝠侠': '2.5', '星际穿越': '1.5', '007': '1.0'},
    '卞雪达': {'一代宗师': '2.0', '小时代': '1.5', '钢铁侠': '3.0', '蜘蛛侠': '2.5', '星际穿越': '1.5', '007': '4.0', '生化危机': '5.0'},
    '卞冬至': {'一代宗师': '2.5', '小时代': '4.0', '超人': '3.0', '蜘蛛侠': '1.5', '星际穿越': '1.5', '007': '3.0'},
    '吴会来': {'一代宗师': '3.0', '小时代': '1.0', '钢铁侠': '3.5', '蜘蛛侠': '4.0', '星际穿越': '4.0', '007': '2.0'},
    '张薇': {'一代宗师': '4.0', '小时代': '1.0', '超人': '3.5', '蜘蛛侠': '2.0', '星际穿越': '3.0', '007': '3.5', '生化危机': '4.0'},
    '尼培伦': {'一代宗师': '3.5', '小时代': '2.5', '钢铁侠': '4.0', '蜘蛛侠': '3.0', '重返地球': '2.0', '007': '2.5'},
    '石相扬': {'一代宗师': '4.5', '小时代': '0.5', '钢铁侠': '4.5', '蜘蛛侠': '3.5', '星际穿越': '2.5', '007': '1.5'},
    '王瑞元': {'一代宗师': '0.5', '小时代': '4.5', '超人': '1.0', '蝙蝠侠': '3.5', '星际穿越': '3.0', '007': '3.0'},
    '大宝': {'我爱你': '0.5', '小时代3': '4.5', '超人2': '1.0', '蝙蝠侠2': '3.5'},
}

# Fetch one Baidu Zhidao question page, save it locally, then extract and
# print every link on the page (relative links are made absolute).
url_base = 'http://zhidao.baidu.com'
url_resource = '/question/1432695535674275539.html'
url_para = '?push=asking&entry=qb_home_new'

html = getPage(url_base + url_resource + url_para)
if html is None or html == '':
    print 'failed to fetch html'
else:
    with open('a.html', 'w+') as f:
        f.write(html)
    soup = BeautifulSoup(html)
    url_list = []
    for link in soup('a'):
        url = link.get('href')
        if url is None or url == '':
            continue
        if url[0] == '/':
            # relative link: prefix the site root
            url_list.append(url_base + url)
        elif url[0:4] == 'http':
            # already an absolute link
            url_list.append(url)
        else:
            pass
    for url in url_list:
        print url
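# The ratings in the critics dict above are stored as strings, so any numeric
# use of them (comparison, averaging, similarity scores) needs an explicit
# conversion first; a quick, purely illustrative lookup:
print float(critics['李楠']['007'])   # prints 3.0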