def get_pages(url):
    """Return the total number of pages for the listing at *url*.

    The count is parsed from the last pager link, whose href embeds the
    page number as "..._<N>.<ext>".
    """
    soup = BeautifulSoup(response.get_source(url))
    pager_links = soup.find('div', {'id': 'Pages'}).findAll('a')
    last_href = pager_links[-1].get('href')
    # strip the extension, then take the number after the final underscore
    return int(last_href.split('.')[0].split('_')[1])
def get_types(url):
    """Return (label, href) pairs for each question-type link in a chapter page."""
    soup = BeautifulSoup(response.get_source(url))
    container = soup.find('div', {'id': 'TypeIn'})
    pairs = []
    for anchor in container.findAll('a'):
        pairs.append((anchor.text, anchor.get('href')))
    return pairs
def get_page_items(url):
    """Return the analysis-page URLs for every question on the current page.

    Only absolute links under the tikubaba.com domain are kept.
    """
    soup = BeautifulSoup(response.get_source(url))
    urls = []
    for anchor in soup.find("div", {"id": "ProDiv"}).findAll("a"):
        href = anchor.get("href")
        if href.startswith("http://www.tikubaba.com"):
            urls.append(href)
    return urls
def get_page_items(url):
    """Collect links to the per-question analysis pages on this listing page."""
    page = BeautifulSoup(response.get_source(url))
    anchors = page.find('div', {'id': 'ProDiv'}).findAll('a')
    # keep only links that point back into the site itself
    return [
        a.get('href')
        for a in anchors
        if a.get('href').startswith('http://www.tikubaba.com')
    ]
def get_tiku():
    """Return a dict mapping course name -> list of question link tags
    scraped from the site's index page.

    Tables that do not match the expected structure are skipped
    (best-effort scraping).
    """
    linkdict = {}
    tables = BeautifulSoup(response.get_source(URL)).findAll("div", id="search_main")[0].findAll("table")
    for table in tables:
        name = table.findAll(attrs={"align": "center"})
        try:
            if len(name) > 0:
                name = name[0].select("font")[0].text
                questions = table.findAll(attrs={"style": "width:750px;"})[0].findAll("a")
                linkdict[name] = questions
        # was a bare `except:` — narrowed to what the lookups above can
        # actually raise, so KeyboardInterrupt/SystemExit and genuine
        # bugs are no longer silently swallowed
        except (IndexError, AttributeError):
            pass
    return linkdict
def get_tiku():
    """Scrape the index page and return {course name: [question <a> tags]}.

    Any table whose markup lacks the expected font/width elements is
    ignored rather than aborting the whole scrape.
    """
    linkdict = {}
    tables = BeautifulSoup(response.get_source(URL)).findAll(
        'div', id='search_main')[0].findAll('table')
    for table in tables:
        name = table.findAll(attrs={'align': 'center'})
        try:
            if len(name) > 0:
                name = name[0].select('font')[0].text
                questions = table.findAll(
                    attrs={'style': 'width:750px;'})[0].findAll('a')
                linkdict[name] = questions
        # narrowed from a bare `except:` — only the indexing/attribute
        # failures of malformed tables should be skipped
        except (IndexError, AttributeError):
            pass
    return linkdict
def top250():
    """Scrape all pages of the Top-250 movie list, enrich each entry with
    search-index stats from index(), and export everything via createxls().
    """
    result = []
    # the listing is paginated in steps of 50; `i` is the start offset
    for i in [0, 50, 100, 150, 200, 250]:
        soup = BeautifulSoup(response.get_source(
            TOP250 % i)).findAll(attrs={'class': 'item'})
        for movie in soup:
            result.append({
                'movie_id': movie.find(attrs={
                    'class': 'm_order'
                }).text.strip(),
                'url': movie.find('a').get('href'),
                'title': movie.a.text,
                'year': movie.span.text,
                'rating': movie.em.text,
                'comments': movie.find(attrs={
                    'headers': 'm_rating_num'
                }).text.strip()
            })
            # original note: "(login count, movie name)" — presumably the
            # meaning of the two index() arguments; TODO confirm against index()
            print 'id:', result[-1]['movie_id']
            index_result = index(
                int(movie.find(attrs={
                    'class': 'm_order'
                }).text.strip()),
                movie.a.text.split('/')[0].strip())
            # positions 6-11 of index()'s return are copied verbatim into the
            # row; field semantics are defined by index() — verify there
            result[-1]['ztsszs'] = index_result[6]
            result[-1]['ydsszs'] = index_result[7]
            result[-1]['zttb'] = index_result[8]
            result[-1]['zthb'] = index_result[9]
            result[-1]['ydtb'] = index_result[10]
            result[-1]['ydhb'] = index_result[11]
    createxls(result)
def top250():
    """Scrape the paginated Top-250 movie list, attach index() statistics to
    every movie, then write the collected rows to an xls file.
    """
    result = []
    # one request per listing page; offsets step by 50 movies
    for i in [0, 50, 100, 150, 200, 250]:
        soup = BeautifulSoup(response.get_source(TOP250 % i)).findAll(attrs={'class': 'item'})
        for movie in soup:
            result.append({
                'movie_id' : movie.find(attrs={'class':'m_order'}).text.strip(),
                'url' : movie.find('a').get('href'),
                'title' : movie.a.text,
                'year' : movie.span.text,
                'rating' : movie.em.text,
                'comments' : movie.find(attrs={'headers':'m_rating_num'}).text.strip()
            })
            # original note: "(login count, movie name)" — apparently the two
            # arguments passed to index() below; TODO confirm
            print 'id:',result[-1]['movie_id']
            index_result = index(int(movie.find(attrs={'class':'m_order'}).text.strip()),
                movie.a.text.split('/')[0].strip())
            # index() returns a sequence; slots 6-11 hold the stats stored
            # under these abbreviated keys — semantics defined in index()
            result[-1]['ztsszs'] = index_result[6]
            result[-1]['ydsszs'] = index_result[7]
            result[-1]['zttb'] = index_result[8]
            result[-1]['zthb'] = index_result[9]
            result[-1]['ydtb'] = index_result[10]
            result[-1]['ydhb'] = index_result[11]
    createxls(result)
def get_pages(url):
    """Parse the pager of the page at *url* and return its last page number."""
    soup = BeautifulSoup(response.get_source(url))
    # the final <a> in the pager carries an href like "..._<N>.<ext>";
    # isolate <N> by dropping the extension and splitting on '_'
    final_anchor = soup.find("div", {"id": "Pages"}).findAll("a")[-1]
    stem = final_anchor.get("href").split(".")[0]
    max_page = int(stem.split("_")[1])
    return max_page
def get_types(url):
    """List the question types of a single chapter as (text, href) tuples."""
    soup = BeautifulSoup(response.get_source(url))
    type_links = soup.find("div", {"id": "TypeIn"}).findAll("a")
    return [(link.text, link.get("href")) for link in type_links]