Beispiel #1
0
def get_pages(url):
    """Return the total page count of the current listing page.

    The last pagination anchor's href ends in ``..._<N>.<ext>``; ``<N>``
    is the maximum page number.
    """
    markup = BeautifulSoup(response.get_source(url))
    pager = markup.find('div', {'id': 'Pages'})
    last_href = pager.findAll('a')[-1].get('href')
    stem = last_href.split('.')[0]
    return int(stem.split('_')[1])
Beispiel #2
0
def get_types(url):
    """Return (label, href) pairs for the question types of one chapter."""
    markup = BeautifulSoup(response.get_source(url))
    container = markup.find('div', {'id': 'TypeIn'})
    pairs = []
    for anchor in container.findAll('a'):
        pairs.append((anchor.text, anchor.get('href')))
    return pairs
Beispiel #3
0
def get_page_items(url):
    """Return the analysis-page URL of every question on the current page.

    Only absolute links into www.tikubaba.com are kept.
    """
    markup = BeautifulSoup(response.get_source(url))
    anchors = markup.find("div", {"id": "ProDiv"}).findAll("a")
    urls = []
    for anchor in anchors:
        href = anchor.get("href")
        if href.startswith("http://www.tikubaba.com"):
            urls.append(href)
    return urls
Beispiel #4
0
def get_page_items(url):
    """Collect the analysis-page address of each question on this page.

    Filters the ProDiv anchors down to absolute tikubaba.com links.
    """
    container = BeautifulSoup(response.get_source(url)).find('div', {
        'id': 'ProDiv'
    })
    hrefs = (a.get('href') for a in container.findAll('a'))
    return [h for h in hrefs if h.startswith('http://www.tikubaba.com')]
Beispiel #5
0
def get_tiku():
    """Return a mapping of course name -> list of question link tags,
    scraped from the site's front page.

    Tables that lack a centered name cell or the expected layout are
    skipped (best-effort), matching the original silent-skip behavior —
    but without a bare ``except`` that would also hide real bugs.
    """
    linkdict = {}
    tables = BeautifulSoup(response.get_source(URL)).findAll("div", id="search_main")[0].findAll("table")
    for table in tables:
        name_cells = table.findAll(attrs={"align": "center"})
        if not name_cells:
            # No name cell: previously this fell through and failed with an
            # unhashable-key TypeError swallowed by `except: pass`. Skip
            # explicitly instead.
            continue
        try:
            name = name_cells[0].select("font")[0].text
            questions = table.findAll(attrs={"style": "width:750px;"})[0].findAll("a")
        except IndexError:
            # Table doesn't match the expected layout — skip it.
            continue
        linkdict[name] = questions
    return linkdict
Beispiel #6
0
def get_tiku():
    """Return a dict of course name -> question link tags from the home page.

    Malformed tables (missing name cell or question column) are skipped,
    preserving the original best-effort behavior while replacing the bare
    ``except: pass`` — which hid every error, including the TypeError
    raised when a table had no name cell — with targeted handling.
    """
    linkdict = {}
    tables = BeautifulSoup(response.get_source(URL)).findAll(
        'div', id='search_main')[0].findAll('table')
    for table in tables:
        name_cells = table.findAll(attrs={'align': 'center'})
        if not name_cells:
            # No centered name cell: skip explicitly instead of relying on
            # a swallowed unhashable-key TypeError.
            continue
        try:
            name = name_cells[0].select('font')[0].text
            questions = table.findAll(
                attrs={'style': 'width:750px;'})[0].findAll('a')
        except IndexError:
            # Unexpected table layout — best-effort: skip this table.
            continue
        linkdict[name] = questions
    return linkdict
Beispiel #7
0
def top250():
    """Scrape a top-250 movie listing (6 pages of 50 via the TOP250 URL
    template), enrich each row with stats from index(), and export
    everything with createxls().

    Side effects: prints each movie id, writes a spreadsheet. Returns None.
    Note: Python 2 code (print statement).
    """
    result = []
    # Offsets paginate the listing; 250 points past the 250th entry —
    # presumably that page is empty and the inner loop is a no-op. TODO confirm.
    for i in [0, 50, 100, 150, 200, 250]:
        soup = BeautifulSoup(response.get_source(
            TOP250 % i)).findAll(attrs={'class': 'item'})
        for movie in soup:
            result.append({
                'movie_id':
                movie.find(attrs={
                    'class': 'm_order'
                }).text.strip(),
                'url':
                movie.find('a').get('href'),
                'title':
                movie.a.text,
                'year':
                movie.span.text,
                'rating':
                movie.em.text,
                'comments':
                movie.find(attrs={
                    'headers': 'm_rating_num'
                }).text.strip()
            })
            # index() args: (rank number, movie title) — original comment
            # read "(login count, movie name)".
            print 'id:', result[-1]['movie_id']
            index_result = index(
                int(movie.find(attrs={
                    'class': 'm_order'
                }).text.strip()),
                movie.a.text.split('/')[0].strip())
            # Keys look like pinyin abbreviations (zt=overall, yd=mobile,
            # sszs=search index, tb=YoY, hb=MoM) — TODO confirm against index().
            result[-1]['ztsszs'] = index_result[6]
            result[-1]['ydsszs'] = index_result[7]
            result[-1]['zttb'] = index_result[8]
            result[-1]['zthb'] = index_result[9]
            result[-1]['ydtb'] = index_result[10]
            result[-1]['ydhb'] = index_result[11]
    createxls(result)
Beispiel #8
0
def top250():
    """Scrape a top-250 movie listing (6 offset pages via TOP250 % i),
    attach per-movie stats from index(), and export with createxls().

    Side effects: prints each movie id, writes a spreadsheet. Returns None.
    Note: Python 2 code (print statement).
    """
    result = []
    # Offsets paginate by 50; offset 250 is past the end — presumably yields
    # an empty page so the inner loop does nothing. TODO confirm.
    for i in [0, 50, 100, 150, 200, 250]:
        soup = BeautifulSoup(response.get_source(TOP250 % i)).findAll(attrs={'class': 'item'})
        for movie in soup:
            result.append({
                'movie_id' : movie.find(attrs={'class':'m_order'}).text.strip(),
                'url' : movie.find('a').get('href'),
                'title' : movie.a.text,
                'year' : movie.span.text,
                'rating' : movie.em.text,
                'comments' : movie.find(attrs={'headers':'m_rating_num'}).text.strip()
            })
            # index() args: (rank number, movie title) — original comment
            # read "(login count, movie name)".
            print 'id:',result[-1]['movie_id']
            index_result = index(int(movie.find(attrs={'class':'m_order'}).text.strip()), movie.a.text.split('/')[0].strip())
            # Keys look like pinyin abbreviations (zt=overall, yd=mobile,
            # sszs=search index, tb=YoY, hb=MoM) — TODO confirm against index().
            result[-1]['ztsszs'] = index_result[6]
            result[-1]['ydsszs'] = index_result[7]
            result[-1]['zttb'] = index_result[8]
            result[-1]['zthb'] = index_result[9]
            result[-1]['ydtb'] = index_result[10]
            result[-1]['ydhb'] = index_result[11]
    createxls(result)
Beispiel #9
0
def get_pages(url):
    """获取当前页面的页数"""
    soup = BeautifulSoup(response.get_source(url))
    max_page = int(soup.find("div", {"id": "Pages"}).findAll("a")[-1].get("href").split(".")[0].split("_")[1])
    return max_page
Beispiel #10
0
def get_types(url):
    """获取单章的题目类型"""
    soup = BeautifulSoup(response.get_source(url))
    types = [(i.text, i.get("href")) for i in soup.find("div", {"id": "TypeIn"}).findAll("a")]
    return types