def getMovies(category, location):
    Url_list = getMovieUrl(category, location)
    results_all = []
    for url in Url_list:
        html = expanddouban.getHtml(url)
        # html = requests.get(getMovieUrl(category, location)).text
        soup = BeautifulSoup(html, "html.parser")
        movie_list = soup.find(class_="list-wp")
        movie_list = movie_list.find_all("a")
        result_tag = []
        for i in movie_list:
            # Parse the HTML and pull out the elements we need.
            movie_tag = soup.find_all("ul", class_="category")
            category_name = movie_tag[1].find(class_="tag-checked tag").get_text()
            name = i.p(class_="title")[0].get_text()
            rate = i.p(class_="rate")[0].get_text()
            category = category_name
            info_link = i.get('href')
            cover_link = i.find('img').get('src')
            movie_info_html = expanddouban.getHtml(info_link)
            info_soup = BeautifulSoup(movie_info_html, "html.parser")
            a = info_soup.find_all("div", id="info")
            patt = ".*制片国家/地区:</span> (.*)<br/>"
            location_name = re.search(patt, str(a)).group(1)
            location = location_name
            m = Movie(name, rate, location, category, info_link, cover_link)
            result = m.get_list()
            result_tag.append(result)
        m = result_tag
        results_all = results_all + m
    return results_all
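# Many of these snippets construct Movie(name, rate, location, category, info_link, cover_link)
# without showing the class itself. A minimal sketch of what such a class could look like,
# assuming plain attribute storage plus a get_list helper that returns the fields as a flat
# list (the attribute names and get_list are assumptions, not the original implementation):
class Movie:
    def __init__(self, name, rate, location, category, info_link, cover_link):
        self.name = name
        self.rate = rate
        self.location = location
        self.category = category
        self.info_link = info_link
        self.cover_link = cover_link

    def get_list(self):
        # Return the attributes as a flat list, matching how getMovies collects results above.
        return [self.name, self.rate, self.location, self.category,
                self.info_link, self.cover_link]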
def myFavoriteMovies(categories):
    num_movies = 0
    name_movies = []
    for category in categories:
        for location in getLocations(category):
            url = getMovieUrl(category, location)
            html = expanddouban.getHtml(url, True)
            soup = bs4.BeautifulSoup(html, "html.parser")
            content_div = soup.find("div", class_="list-wp")
            for element in content_div.find_all("a", recursive=False):
                if element.p.span:
                    name = element.p.span.get_text()
                    rate = element.p.span.find_next_sibling("span").get_text()
                    info_link = element.get('href')
                    cover_link = element.img.get('src')
                    if name not in name_movies:
                        name_movies.append(name)
                        createVar = locals()
                        createVar['movie' + str(num_movies)] = Movie(
                            name, rate, location, category, info_link, cover_link)
                        num_movies += 1
    with open('movies.csv', 'w', newline='') as csvfile:
        spamwriter = csv.writer(csvfile, dialect='excel')
        for num in range(len(name_movies)):
            spamwriter.writerow([
                createVar['movie' + str(num)].name,
                createVar['movie' + str(num)].rate,
                createVar['movie' + str(num)].location,
                createVar['movie' + str(num)].category,
                createVar['movie' + str(num)].info_link,
                createVar['movie' + str(num)].cover_link
            ])
            print(num)
def getMovies(category, location): """return a list of obj of films""" # 获取url, 调用函数getMovieUrl url = getMovieUrl(category, location) # 爬取网页,返回html html = expanddouban.getHtml(url, loadmore=True, waittime=2) soup = bs4.BeautifulSoup(html, "html.parser") # 创建各元素(属性)列表 names = [] rates = [] info_links = [] cover_links = [] movies_shelf = soup.find(class_="list-wp") # 锁定页面范围 # 查找,并为各属性赋值 for each in movies_shelf.find_all('a'): # 遍历每一条电影记录, tag=a names.append(each.find(class_="title").string) # .string方法获得tag的唯一子节点 rates.append(each.find(class_="rate").string) info_links.append(each.get("href")) cover_links.append(each.find("img").get("src")) # 获取<img>中的src超链接 movies = [] # 该列表储存电影对象 # 创建类Movie的对象,并将对象放入列表movies中 for index in range(len(names)): # 调用类Movies来实例化 movies.append( Movie(names[index], rates[index], location, category, info_links[index], cover_links[index])) return movies
def getMovies(category, location):
    # Define a few lists to hold the parsed HTML elements.
    m = []
    r = []
    l = []
    c = []
    link = []
    pic = []
    # Build the URL to open from the given category and location.
    searchURL = getMovieUrl(category, location)
    html = expanddouban.getHtml(searchURL)
    soup = bs4.BeautifulSoup(html, "html.parser")
    # Task 4: build the URL from the category and location, fetch the corresponding HTML,
    # parse every movie element in it, and build the lists of movie data.
    # Core loop: use soup to locate each movie, then extract its fields one by one;
    # location and category are identical for every movie, so the incoming arguments
    # are appended directly.
    for i in soup.find_all('a', class_='item'):
        m.append(i.find('span', class_='title').string)
        r.append(i.find('span', class_='rate').string)
        l.append(location)
        c.append(category)
        link.append(i.get('href'))
        pic.append(i.find('img').get('src'))
    return m, r, l, c, link, pic
def getMovies(category, location): """ return a list of Movie objects with a given category and location. """ url = getMovieUrl(category, location) html = expanddouban.getHtml(url, True, 3) #获取html soup = BeautifulSoup(html, "html.parser") #将html变为可以解析的soup对象 movieList = [] #获取父元素,class必须加下划线 parent_a = soup.find('div', class_="list-wp").find_all("a", class_="item") for child in parent_a: #从html中获取名称、评分等信息 name = child.find('span', class_="title").string rate = child.find('span', class_="rate").string info_link = child.get('href') #找到链接,见BeautifulSoup“从文档中找到所有<a>标签的链接” cover_link = child.find('img').get('src') #建立新实例,用getOneMovie方法变成tuple,并添加到list m = Movie(name, rate, location, category, info_link, cover_link).getOneMovie() movieList.append(m) return movieList
def getMovies(category, location):
    # List that stores the movie records.
    movie_lists = list()
    # Build the request URL.
    url = getMovieUrl(category, location)
    # Fetch the HTML for that URL.
    html = expanddouban.getHtml(url, loadmore=True, waittime=5)
    soup = bs4.BeautifulSoup(html, 'html.parser')
    context_div = soup.find(id='app').find(class_='article').find(class_='list-wp', recursive=False)
    ul_list = soup.find(id='app').find(class_='article').find(class_='tags', recursive=False).find_all('ul', recursive=False)
    # Currently selected location tag.
    movie_location = ul_list[-2].find(class_="tag-checked tag").text
    # Currently selected category tag.
    movie_category = ul_list[1].find(class_="tag-checked tag").text
    for element in context_div.find_all('a', recursive=False):
        if element.get('href'):
            info_link = element.get('href')
            cover_link = element.find('img').get('src')
            name = element.find('p').find(class_='title', recursive=False).text
            rate = element.find('p').find(class_='rate', recursive=False).text
            m = Movie(name, rate, movie_location, movie_category, info_link, cover_link)
            movie_lists.append(m)
    return movie_lists
def getMovie(category, location):
    """
    category: movie category
    location: region
    return: a list of Movie instances
    """
    # Build the movie listing URL.
    url = getMovieUrl(category, location)
    # Download the HTML from the URL.
    html = expanddouban.getHtml(url, loadmore=True)
    # Parse the HTML.
    soup = BeautifulSoup(html, "html.parser")
    # Create an empty movie list.
    movie_list = []
    # Find every 'a' tag of class 'item' in the HTML.
    item_list = soup.find_all('a', class_='item')
    for item in item_list:
        info_link = item.get('href')
        cover_link = item.div.img.get('src')
        name = item.p.span.text
        rate_str = item.p.find('span', class_='rate').text
        try:
            rate = float(rate_str)
        except Exception as err:
            print('The rate of {} is {} which is not a number'.format(name, rate_str))
        # print(name, rate, info_link, cover_link)
        m = Movie(name, rate, location, category, info_link, cover_link)
        movie_list.append(m)
    return movie_list
def getHtmlTxT(url, loadmore=False, waittime=2):
    # loadmore=True keeps paging through the results; False only loads the first page.
    # While testing the code it is best to leave it as False.
    # For submission you do not need to crawl every movie of a category; uploading the
    # results produced with loadmore=False is enough.
    # waittime is how long to wait before giving up on an unresponsive page. Raise it if
    # your connection is slow; if it is too high, crawling all movies of a category may
    # page so many times that it takes hours to reach the end.
    html = expanddouban.getHtml(url, loadmore, waittime)
    soup = bs4.BeautifulSoup(html, "html.parser")
    return soup
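# A minimal usage sketch for the helper above, assuming a getMovieUrl(category, location)
# helper that builds the Douban listing URL; the tag values are illustrative only.
def demo_getHtmlTxT():
    # Keep loadmore=False while testing so only the first page is fetched.
    soup = getHtmlTxT(getMovieUrl('剧情', '大陆'), loadmore=False, waittime=2)
    # Each <a class="item"> on the listing page is one movie entry.
    return soup.find_all('a', class_='item')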
def getMovies(category, location):
    # Build the URL.
    url = getMovieUrl(category, location)
    # Fetch the DOM.
    html = expanddouban.getHtml(url)
    # Parse the DOM content.
    soup = bs4.BeautifulSoup(html, "html.parser")
    html_content = soup.find_all('a', 'item')
    moviesList = []
    for item in html_content:
        name = item.find('span', 'title').string
        rate = item.find('span', 'rate').string
        info_link = item['href']
        cover_link = item.find('img')['src']
        moviesList.append(
            Movie(name, rate, location, category, info_link, cover_link).print_data())
    return moviesList
def getMovies(category, location):
    goal_url = getMovieUrl(category, location)
    html = expanddouban.getHtml(goal_url)
    soup = bs4.BeautifulSoup(html, "html.parser")
    content = soup.find(id="app")
    # print(content)
    allImage = content.findAll("a")
    print(allImage)
    list_item = []
    for element in allImage:
        row = []
        if element.find("span", class_="title") is not None:
            name = element.find("span", class_="title").text
            row.append(name)
        if element.find("span", class_="rate") is not None:
            rate = element.find("span", class_="rate").text
            row.append(rate)
            row.append(location)
            row.append(category)
            info_link = element.get("href")
            cover_link = element.find("img", x="movie:cover_x").get('src')
            # print(info_link)
            row.append(info_link)
            row.append(cover_link)
        # print(tuple(row))
        if row != []:
            list_item.append(tuple(row))
    print(list_item)
    return list_item
def getMovies(category, location):
    html = expanddouban.getHtml(getMovieUrl(category, location), loadmore=True, waittime=2)
    # Parse the fetched HTML with BeautifulSoup.
    soup = bs4.BeautifulSoup(html, "html.parser")
    movies = soup.find(class_='list-wp')
    names = []
    rates = []
    info_links = []
    cover_links = []
    for movie in movies.find_all('a'):
        names.append(movie.find(class_="title").string)
        rates.append(movie.find(class_="rate").string)
        info_links.append(movie.get("href"))
        cover_links.append(movie.find("img").get("src"))
    movie_list = []
    for i in range(len(names)):
        movie_list.append(
            Movie(names[i], rates[i], location, category, info_links[i], cover_links[i]))
    return movie_list
def TotalAmount(category):
    url_c = 'https://movie.douban.com/tag/#/?sort=S&range=9,10&tags=电影,{}'.format(category)
    html = expanddouban.getHtml(url_c, True)
    soup = BeautifulSoup(html, 'html.parser')
    item_c = soup.find(id='content').find(class_='list-wp').find_all(class_='item')
    total_amount = len(item_c)
    return total_amount
def getLocationTags(url):
    # Return a list of location tags from the movie page.
    ListLocation = []
    ListPageHtml = expanddouban.getHtml(url)
    soup = BeautifulSoup(ListPageHtml, "html.parser")
    LocationDiv = soup.find(string="全部地区")
    for name in LocationDiv.parent.parent.next_siblings:
        ListLocation.append(name.string)
    return ListLocation
def task4(category, location):
    # Variable definitions:
    cache_name_dict = {}
    cache_rate_dict = {}
    cache_info_link_dict = {}
    cache_cover_link_dict = {}
    cache_jump_number = 0
    m = {}
    m_cache = []
    cache_name_string_replace = ""
    # Step 1: isolate the needed portion of the page's HTML, then filter and extract fields from it.
    soup = BeautifulSoup(expanddouban.getHtml(getMovieUrl(category, location), True), "lxml")
    content_div = soup.find(class_="list-wp")
    for element in content_div.find_all("a", recursive=False):
        cache_jump_number += 1
        for name in element.find_all(class_="title"):
            if "," not in name.string and "," not in name.string:
                cache_name_dict[cache_jump_number] = name.string
            elif ", " in name.string:
                cache_name_string_replace = name.string
                cache_name_dict[cache_jump_number] = cache_name_string_replace.replace(", ", "/")
                cache_name_string_replace = ""
            elif "," in name.string:
                cache_name_string_replace = name.string
                cache_name_dict[cache_jump_number] = cache_name_string_replace.replace(",", "/")
                cache_name_string_replace = ""
        for rate in element.find_all(class_="rate"):
            if rate.string is None:
                cache_rate_dict[cache_jump_number] = "9.0"
            else:
                cache_rate_dict[cache_jump_number] = rate.string
        for info in content_div.find_all(class_="item"):
            # Note: when previously using bs4's "html.parser" I had not noticed that the <a> tags
            # should be taken from content_div.
            cache_info_link_dict[cache_jump_number] = info.get("href")
        for img in element.find_all(x="movie:cover_x"):
            cache_cover_link_dict[cache_jump_number] = img.get("src")
    # Step 2: copy the extracted fields from the cache variables into the constructed output;
    # m holds the values as nested lists keyed by index.
    for i in range(1, cache_jump_number + 1):
        class_cache = Movie(cache_name_dict[i], cache_rate_dict[i],
                            location.replace(' ', ''), category.replace(' ', ''),
                            cache_info_link_dict[i], cache_cover_link_dict[i])
        m_cache.append(class_cache.name)
        m_cache.append(class_cache.rate)
        m_cache.append(class_cache.location)
        m_cache.append(class_cache.category)
        m_cache.append(class_cache.info_link)
        m_cache.append(class_cache.cover_link)
        m[i] = m_cache
        m_cache = []
    return m, cache_jump_number
def getMovieHtml(category, location):
    # Get the full HTML document using expanddouban and BeautifulSoup.
    html = expanddouban.getHtml(getMovieUrl(category, location), loadmore=True, waittime=2)
    soup = BeautifulSoup(html, 'html.parser')
    # Find the movie information in the HTML document and turn it into a string.
    all_movie_info = str(soup.find_all("a", class_="item"))
    return all_movie_info
def getMovies(category, location):
    url = getMovieUrl(category, location)
    html = expanddouban.getHtml(url, True)
    soup = bs4.BeautifulSoup(html, "html.parser")
    content_div = soup.find("div", class_="list-wp")
    movie_name = []
    for element in content_div.find_all("a", recursive=False):
        if element.p.span:
            movie_name.append(element.p.span.get_text())
    return movie_name
def get_movies(category, location):
    url = get_movie_url(category, location)
    html = expanddouban.getHtml(url, loadmore=True)
    soup = BeautifulSoup(html, 'lxml')
    movies_list = soup.find('div', 'list-wp')
    movies_items = movies_list.find_all('a', 'item')
    yield from (parse_movie(category, location, item) for item in movies_items)
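# The generator above delegates to a parse_movie helper that is not shown in this snippet.
# A hedged sketch of what it might do, assuming the same Movie constructor used elsewhere
# (the function body and its return value are assumptions, not the original code):
def parse_movie(category, location, item):
    # Extract the fields from one <a class="item"> tag and wrap them in a Movie object.
    name = item.find('span', 'title').string
    rate = item.find('span', 'rate').string
    info_link = item.get('href')
    cover_link = item.find('img').get('src')
    return Movie(name, rate, location, category, info_link, cover_link)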
def get_location():
    location_list = []
    url = 'https://movie.douban.com/tag/#/?sort=S&range=9,10&tags=电影'
    html = expanddouban.getHtml(url)
    soup = BeautifulSoup(html, 'html.parser')
    for tag in soup.find(class_='tags').find_all(class_='category'):
        if tag.find(class_="tag-checked tag").get_text() == '全部地区':
            for loc in tag.find(class_="tag-checked tag").parent.next_siblings:
                location_list.append(loc.get_text())
    return location_list
def get_location_tags():
    url = getMovieUrl()
    html = getHtml(url)
    soup = BeautifulSoup(html, 'html.parser')
    tags = soup.find(id='wrapper').find(class_='tags').find_all(class_='category')[2].find_all(class_='tag')
    categories = []
    for item in tags[1:]:
        categories.append(item.text)
    return categories
def getalllocations():
    loclist = list()
    url = getMovieUrl('剧情', '大陆')
    html = expanddouban.getHtml(url, True)
    soup = BeautifulSoup(html, "html.parser")
    loc = soup.find(id='app').find(class_='category').next_sibling.next_sibling.li
    while loc is not None:
        loclist.append(loc.string)
        loc = loc.next_sibling
    return loclist
def getLocations():
    html = expanddouban.getHtml('https://movie.douban.com/tag/#/?sort=S&range=9,10&tags=电影')
    soup = BeautifulSoup(html, 'html.parser')
    locationList = []
    for child in soup.find(class_='tags').find(class_='category').next_sibling.next_sibling:
        location = child.find(class_='tag').get_text()
        if location != '全部地区':
            locationList.append(location)
    return locationList
def getLocations():
    locations = []
    url = 'https://movie.douban.com/tag/#/'
    html = expanddouban.getHtml(url)
    soup = BeautifulSoup(html, 'html.parser')
    categories = soup.find_all('ul', class_='category')
    for span in categories[2].find_all('span', class_='tag'):
        locations.append(span.string)
    return locations[1:]
def getLocations():
    url = 'https://movie.douban.com/tag/#/?sort=S&range=9,10&tags=电影'
    html = expanddouban.getHtml(url, loadmore=False, waittime=2)
    soup = bs4.BeautifulSoup(html, "html.parser")
    content = soup.find(class_='tags').find(class_='category').next_sibling.next_sibling
    locations = []
    for sibling in content:
        location = sibling.find(class_='tag').get_text()
        if location != '全部地区':
            locations.append(location)
    return locations
def getLocations(category):
    url = "https://movie.douban.com/tag/#/?sort=S&range=9,10&tags=电影,{}".format(category)
    html = expanddouban.getHtml(url)
    soup = bs4.BeautifulSoup(html, "html.parser")
    locations = []
    content_ul = soup.find("div", class_="tags").ul.find_next_sibling("ul").find_next_sibling("ul")
    for child in content_ul.children:
        locations.append(child.span.get_text())
    return locations[1:]
def get_all_locations():
    all_location_url = getMovieUrl('全部类型', '全部地区')
    html = expanddouban.getHtml(all_location_url)
    soup = BeautifulSoup(html, 'lxml')
    locationList = []
    for child in soup.find(class_='tags').find(class_='category').next_sibling.next_sibling:
        location = child.find(class_='tag').get_text()
        if location != '全部地区':
            locationList.append(location)
    return locationList
def getLocations():
    result = []
    base_url = getMovieUrl()
    html = edb.getHtml(base_url)
    soup = BeautifulSoup(html, "html.parser")
    ul_list = soup.find("div", class_="tags").select('ul[class="category"]')
    if len(ul_list) == 4:
        for li in ul_list[2].find_all("li"):
            location = li.find('span', class_="tag").get_text()
            if location != "全部地区":
                result.append(location)
    return result
def getMovies(category, location, minRateValue=0):
    html = getHtml(getMovieUrl(category, location, minRateValue), True)
    soup = BeautifulSoup(html, "html.parser")
    movie_tags = soup.find_all("a", class_="item")

    def construct_movie(movie_tag, location, category):
        name = movie_tag.find("span", class_="title").text
        rate = movie_tag.find("span", class_="rate").text
        info = movie_tag["href"]
        cover = movie_tag.find("img")["src"]
        return Movie(name, rate, category, location, info, cover)

    return [construct_movie(tag, location, category) for tag in movie_tags]
def getMovies(category, location):
    url = getMovieUrl(category, location)
    html = expanddouban.getHtml(url, True)
    soup = BeautifulSoup(html, 'html.parser')
    content = soup.find(id="content").find(class_="list-wp").find_all("a", class_="item")
    movie_list = []
    for element in content:
        cover_link = element.find("img").get("src")
        name = element.find("p").find(class_="title").string
        rate = element.find("p").find(class_="rate").string
        info_link = element.get("href")
        m = Movie(name, rate, location, category, info_link, cover_link)
        movie_list.append(m)
    return movie_list
def getMovies(category, location):
    movies = []
    for loc in location:
        html = expanddouban.getHtml(getMovieUrl(category, loc), True)
        soup = BeautifulSoup(html, 'html.parser')
        content_a = soup.find(id='content').find(class_='list-wp').find_all('a', recursive=False)
        for element in content_a:
            M_name = element.find(class_='title').string
            M_rate = element.find(class_='rate').string
            M_location = loc
            M_category = category
            M_info_link = element.get('href')
            M_cover_link = element.find('img').get('src')
            movies.append(Movie(M_name, M_rate, M_location, M_category,
                                M_info_link, M_cover_link).print_data())
    return movies
def count_movie():
    final_category = ['暴力', '恐怖', '文艺']
    final_location = []
    final_movie = []
    url = "https://movie.douban.com/tag/#/?sort=S&range=9,10&tags=电影"
    html = expanddouban.getHtml(url, True)
    soup = BeautifulSoup(html, "html.parser")
    for child in soup.find(class_='tags').find(class_='category').next_sibling.next_sibling:
        location = child.find(class_='tag').get_text()
        if location != '全部地区':
            final_location.append(location)
    # Collect the movies from every category/location combination into the final_movie list.
    for e in final_category:
        for l in final_location:
            final_movie += getMovies(e, l)
    # Write the results out to a CSV file.
    with open('movies.csv', 'w', encoding='utf_8_sig') as f:
        writer = csv.writer(f)
        for e in final_movie:
            writer.writerow([e.name, e.rate, e.location, e.category, e.in_link, e.c_link])
    # Compute the statistics.
    with open('movie.txt', 'w', encoding='utf-8') as f:
        with open('movies.csv', 'r', encoding='utf-8') as g:
            file = list(csv.reader(g))
            for i in final_category:
                # Select the movies of the current category into the sametype list.
                sametype_movie = [k for k in file if k[3] == i]
                num_samearea = []
                for j in final_location:
                    # Pair each location with its movie count in a tuple.
                    num_samearea.append((j, len([k for k in sametype_movie if k[2] == j])))
                sort = sorted(num_samearea, key=lambda x: x[1], reverse=True)
                fir_location, fri_number = sort[0]
                sec_location, sec_number = sort[1]
                third_location, third_number = sort[2]
                total_number = len(sametype_movie)
                f.write(
                    "[{}]类型电影数量排名前三的地区分别为{},{},{},它们分别占总量的{:.2%},{:.2%},{:.2%}。\n"
                    .format(i, fir_location, sec_location, third_location,
                            fri_number / total_number, sec_number / total_number,
                            third_number / total_number))
def getUrlHtml(category, location):
    # Build the movie listing URL from the given category and location and return that
    # page's HTML parsed into a BeautifulSoup object.
    html_of_url = expanddouban.getHtml(getMovieUrl(category, location), True, 5)
    # waittime is set to 5 per Douban's robots.txt.
    soup_of_html = BeautifulSoup(html_of_url, 'html.parser')
    return soup_of_html