def parsed_html(url):
    """Download the page at *url* and pull out the per-movie fields.

    The response body is scanned with ``find_element_by_label`` for four
    opening/closing marker pairs. Example fragments the markers target:

    1. title         ``<span class="title">这个杀手不太冷</span>``
    2. score         ``<span class="rating_num" property="v:average">9.4</span>``
    3. rating count  ``<span>994476人评价</span>``
    4. quote         ``<span class="inq">怪蜀黍和小萝莉不得不说的故事。</span>``

    Args:
        url: Address of the page to fetch. It is passed straight to
            ``get``; earlier revisions overwrote it with a fixed address,
            which made every page request return the same page.

    Returns:
        A 4-tuple ``(movies, sorces, coments, inqs)`` containing whatever
        ``find_element_by_label`` returned for each marker pair, in the
        order listed above.
    """
    status_code, headers, body = get(url)

    # (opening marker, closing marker) for each field, in return order.
    label_pairs = (
        ('<span class="title">', '</span>'),
        ('<span class="rating_num" property="v:average">', '</span>'),
        ('<span>', '人评价</span>'),
        ('<span class="inq">', '</span>'),
    )
    movies, sorces, coments, inqs = (
        find_element_by_label(body, left, right)
        for left, right in label_pairs
    )

    log("len", len(movies), len(sorces), len(coments), len(inqs))

    # zip() stops at the shortest list, so a field that matched fewer times
    # than the titles no longer raises IndexError while logging.
    for movie, score, comment, inq in zip(movies, sorces, coments, inqs):
        result = '\n'.join((movie, score, comment, inq)) + '\n'
        log("result\n", result)

    return movies, sorces, coments, inqs
def test_find_between():
    """Manually exercise ``find_between`` against the live Top-250 page.

    Fetches the page, runs ``find_between`` once with the title markers and
    once with the score markers, and prints the title result. Nothing is
    asserted; the output is meant to be inspected by eye.
    """
    page_url = 'http://movie.douban.com/top250'
    status_code, headers, body = get(page_url)

    closing_tag = '</span>'
    title_open = '<span class="title">'
    score_open = '<span class="rating_num" property="v:average">'

    title_text = find_between(body, title_open, closing_tag)
    # The score lookup is still performed, but its result is not displayed.
    score_text = find_between(body, score_open, closing_tag)

    print('结果1 ', title_text)
def parsed_html(url):
    """Download the page at *url* and return the movie titles found in it.

    Fields of interest on the page look like:

    1. title         ``<span class="title">这个杀手不太冷</span>``
    2. score         ``<span class="rating_num" property="v:average">9.4</span>``
    3. rating count  ``<span>994476人评价</span>``
    4. quote         ``<span class="inq">怪蜀黍和小萝莉不得不说的故事。</span>``

    Only the title markers are searched here.

    NOTE(review): a function with the same name is defined earlier in this
    file; being later, this definition is the one bound at import time.
    Confirm which of the two is meant to survive.

    Args:
        url: Address of the page to fetch; passed straight to ``get``.

    Returns:
        Whatever ``find_element_by_label`` returns for the title markers.
    """
    status_code, headers, body = get(url)

    # The previous revision passed undefined names for the two markers and
    # therefore always raised NameError; the title markers are spelled out.
    left_label = '<span class="title">'
    right_label = '</span>'
    movies = find_element_by_label(body, left_label, right_label)
    return movies
def test_find_element_by_label():
    """Manually exercise ``find_element_by_label`` on the live Top-250 page.

    Each case is an ``(opening marker, closing marker)`` pair. The case is
    logged and then handed to ``find_element_by_label`` together with the
    page body. The return value is discarded and nothing is asserted.
    """
    _status, _headers, page = get('http://movie.douban.com/top250')

    cases = [
        ('<span class="title">', '</span>'),
    ]

    for case in cases:
        log("t", case)
        opening, closing = case
        find_element_by_label(page, opening, closing)
def test_find_element_by_label():
    """Run ``find_element_by_label`` over the live Top-250 page body.

    For every ``(left_label, right_label)`` pair in ``test_items`` the pair
    is logged and ``find_element_by_label`` is called with the page body and
    the two labels. The return value is not checked; this is a manual smoke
    run rather than an asserting test.

    NOTE(review): a function with the same name is defined earlier in this
    file; being later, this definition is the one bound at import time.
    """
    url = 'http://movie.douban.com/top250'
    status_code, headers, body = get(url)

    # Two locals that were assigned but never read have been dropped; only
    # the pairs listed here are ever exercised.
    test_items = [
        ('<span class="title">', '</span>'),
    ]

    for t in test_items:
        log("t", t)
        left_label, right_label = t
        find_element_by_label(body, left_label, right_label)
# Homework 2.6
#
# The string below is the original assignment text. In short: the Douban
# Top-250 list shows 25 entries per page, the next page is reached with
# ``?start=25``, all pages share one structure, and the task is to parse the
# title, score, rating count and quote out of the HTML (string find plus
# slicing is suggested).
"""
通过在浏览器页面中访问 豆瓣电影 top250 可以发现
1, 每页 25 个条目
2, 下一页的 URL 如下
https://movie.douban.com/top250?start=25

因此可以用循环爬出豆瓣 top250 的所有网页

于是就有了豆瓣电影 top250 的所有网页

由于这 10 个页面都是一样的结构,所以我们只要能解析其中一个页面就能循环得到所有信息
所以现在的程序就只剩下了解析 HTML

请观察规律,解析出
1,电影名
2,分数
3,评价人数
4,引用语(比如第一部肖申克的救赎中的「希望让人自由。」)

解析方式可以用任意手段,如果你没有想法,用字符串查找匹配比较好(find 特征字符串加切片)
"""


if __name__ == '__main__':
    # Manual checks that can be switched on when needed:
    # test_path_with_query()
    # test_header_from_dict()

    # Other functions in this file unpack get()'s result as
    # (status_code, headers, body), so the last element is the page body.
    response = get('https://movie.douban.com/top250')
    # log('body', response[-1])
    parse_body(response[-1])