Example #1
0
def parsed_html(url):
    """Fetch a Douban Top 250 page and extract the per-movie fields.

    The page body is scanned for four kinds of fragments:

    1. title       <span class="title">...</span>
    2. score       <span class="rating_num" property="v:average">...</span>
    3. vote count  <span>...人评价</span>
    4. quote       <span class="inq">...</span>

    Args:
        url: Address of the page to fetch. A falsy value falls back to the
            first Top 250 page.

    Returns:
        A 4-tuple ``(movies, sorces, coments, inqs)`` holding whatever
        ``find_element_by_label`` produced for each kind of fragment.
    """
    # Honour the caller's URL so other pages (e.g. ``?start=25``) can be
    # parsed; only fall back to the first page when none is given.
    if not url:
        url = 'http://movie.douban.com/top250'
    _status_code, _headers, body = get(url)

    # (left marker, right marker) for each kind of fragment.
    movies_labels = ('<span class="title">', '</span>')
    sorces_labels = (
        '<span class="rating_num" property="v:average">', '</span>'
    )
    coments_labels = ('<span>', '人评价</span>')
    inqs_labels = ('<span class="inq">', '</span>')

    movies = find_element_by_label(body, *movies_labels)
    sorces = find_element_by_label(body, *sorces_labels)
    coments = find_element_by_label(body, *coments_labels)
    inqs = find_element_by_label(body, *inqs_labels)

    # Logging the four lengths makes a mismatch between the lists visible.
    log("len", len(movies), len(sorces), len(coments), len(inqs))

    # zip() stops at the shortest list, so a page where one kind of fragment
    # occurs less often no longer raises IndexError.
    # NOTE(review): rows are paired purely by position; if the lists differ
    # in length the pairing may be misaligned -- check the "len" log above.
    op = '\n'
    for movie, score, votes, quote in zip(movies, sorces, coments, inqs):
        result = movie + op + score + op + votes + op + quote + op
        log("result\n", result)

    return movies, sorces, coments, inqs
Example #2
0
def test_find_between():
    """Manually exercise ``find_between`` against the live Top 250 page.

    Fetches the page, extracts the text between the title markers and the
    text between the rating markers, and prints both results so they can be
    checked by eye. Nothing is asserted and nothing is returned.
    """
    url = 'http://movie.douban.com/top250'
    _status_code, _headers, content = get(url)

    right = '</span>'
    title_left = '<span class="title">'
    rating_left = '<span class="rating_num" property="v:average">'

    title = find_between(content, title_left, right)
    rating = find_between(content, rating_left, right)

    print('结果1 ', title)
    # Show the rating extraction as well so both calls are actually checked.
    print('结果2 ', rating)
Example #3
0
def parsed_html(url):
    """Fetch a page and extract the elements delimited by ``l`` and ``r``.

    The fragments of interest on a Douban Top 250 page are:

    1. title       <span class="title">...</span>
    2. score       <span class="rating_num" property="v:average">...</span>
    3. vote count  <span>...人评价</span>
    4. quote       <span class="inq">...</span>

    Args:
        url: Address of the page to fetch. A falsy value falls back to the
            first Top 250 page.

    Returns:
        Whatever ``find_element_by_label(body, l, r)`` returns for the
        fetched page body.
    """
    # Use the caller's URL; only fall back to the first page when none is
    # given.
    if not url:
        url = 'http://movie.douban.com/top250'
    _status_code, _headers, body = get(url)

    # NOTE(review): ``l`` and ``r`` are not defined inside this function;
    # presumably they are module-level left/right markers -- confirm.
    movies = find_element_by_label(body, l, r)
    # Hand the result back so the caller can actually use what was parsed.
    return movies
Example #4
0
def test_find_element_by_label():
    """Run ``find_element_by_label`` over the live Top 250 page.

    Each case is a ``(left_label, right_label)`` pair. The case is logged and
    then handed to ``find_element_by_label`` together with the page body. The
    return value is not inspected.
    """
    page_url = 'http://movie.douban.com/top250'
    _status, _response_headers, page = get(page_url)

    # Only the movie-title markers are exercised at the moment.
    title_case = ('<span class="title">', '</span>')
    cases = [title_case]

    for case in cases:
        log("t", case)
        opening, closing = case
        find_element_by_label(page, opening, closing)
Example #5
0
def test_find_element_by_label():
    """Manually exercise ``find_element_by_label`` on the Top 250 page.

    Fetches the page, then for every ``(left_label, right_label)`` pair in
    ``test_items`` logs the pair and calls ``find_element_by_label`` with the
    page body and the two markers. Nothing is asserted or returned.
    """
    url = 'http://movie.douban.com/top250'
    _status_code, _headers, body = get(url)

    # (left marker, right marker) pairs to try against the page body.
    test_items = [
        ('<span class="title">', '</span>'),
    ]
    for t in test_items:
        log("t", t)
        left_label, right_label = t
        find_element_by_label(body, left_label, right_label)
Example #6
0
# 作业 2.6
#
"""
通过在浏览器页面中访问 豆瓣电影 top250 可以发现
1, 每页 25 个条目
2, 下一页的 URL 如下
https://movie.douban.com/top250?start=25

因此可以用循环爬出豆瓣 top250 的所有网页

于是就有了豆瓣电影 top250 的所有网页

由于这 10 个页面都是一样的结构,所以我们只要能解析其中一个页面就能循环得到所有信息

所以现在的程序就只剩下了解析 HTML

请观察规律,解析出
1,电影名
2,分数
3,评价人数
4,引用语(比如第一部肖申克的救赎中的「希望让人自由。」)

解析方式可以用任意手段,如果你没有想法,用字符串查找匹配比较好(find 特征字符串加切片)
"""
if __name__ == '__main__':
    # Script entry point: fetch the first Top 250 page and pass part of the
    # result to parse_body().
    # Earlier manual checks, left disabled:
    # test_path_with_query()
    # test_header_from_dict()
    # NOTE(review): despite its name, `dic` is indexed positionally below,
    # so get() presumably returns a sequence rather than a dict -- confirm.
    dic = get('https://movie.douban.com/top250')
    # log('body', dic[-1])
    # The last element of get()'s result is what gets parsed; presumably the
    # response body -- confirm against get().
    parse_body(dic[-1])