コード例 #1
0
def getMovies(category, location):
    """Return a list of movie records for every listing URL matching
    *category*/*location*.

    Each record is whatever Movie.get_list() returns; the movie's
    location is scraped from its detail page, the category from the
    checked tag on the listing page.
    """
    results_all = []
    for url in getMovieUrl(category, location):
        html = expanddouban.getHtml(url)
        soup = BeautifulSoup(html, "html.parser")
        # The checked category tag is the same for the whole page, so look
        # it up once here instead of once per movie (the original re-queried
        # it inside the inner loop) — and keep it in its own name instead of
        # clobbering the `category` parameter.
        category_tags = soup.find_all("ul", class_="category")
        category_name = category_tags[1].find(class_="tag-checked tag").get_text()
        movie_anchors = soup.find(class_="list-wp").find_all("a")
        for anchor in movie_anchors:  # one <a> per movie entry
            name = anchor.p(class_="title")[0].get_text()
            rate = anchor.p(class_="rate")[0].get_text()
            info_link = anchor.get('href')
            cover_link = anchor.find('img').get('src')
            # Fetch the detail page to extract the production country/region.
            info_soup = BeautifulSoup(expanddouban.getHtml(info_link),
                                      "html.parser")
            info_div = info_soup.find_all("div", id="info")
            # Raw string so the regex is not subject to escape processing.
            match = re.search(r".*制片国家/地区:</span> (.*)<br/>", str(info_div))
            location_name = match.group(1)
            m = Movie(name, rate, location_name, category_name,
                      info_link, cover_link)
            results_all.append(m.get_list())
    return results_all
コード例 #2
0
def myFavoriteMovies(categories):
    """Scrape every (category, location) combination and write the collected
    movies to movies.csv, one row per unique movie name.

    Replaces the original `locals()` / dynamically-named-variable hack
    (``createVar['movie0']`` …), which CPython does not guarantee to work
    inside a function, with a plain list of Movie objects.
    """
    movies = []          # Movie objects, in discovery order
    seen_names = set()   # O(1) duplicate detection by movie name
    for category in categories:
        for location in getLocations(category):
            url = getMovieUrl(category, location)
            html = expanddouban.getHtml(url, True)
            soup = bs4.BeautifulSoup(html, "html.parser")
            content_div = soup.find("div", class_="list-wp")
            for element in content_div.find_all("a", recursive=False):
                # Entries without a <span> inside <p> are not movie cards.
                if not element.p.span:
                    continue
                name = element.p.span.get_text()
                if name in seen_names:
                    continue
                seen_names.add(name)
                rate = element.p.span.find_next_sibling("span").get_text()
                movies.append(Movie(name, rate, location, category,
                                    element.get('href'),
                                    element.img.get('src')))
    with open('movies.csv', 'w', newline='') as csvfile:
        writer = csv.writer(csvfile, dialect='excel')
        for num, movie in enumerate(movies):
            writer.writerow([movie.name, movie.rate, movie.location,
                             movie.category, movie.info_link,
                             movie.cover_link])
            print(num)
コード例 #3
0
def getMovies(category, location):
    """Return a list of Movie objects scraped from the listing page for
    the given category and location."""
    url = getMovieUrl(category, location)
    html = expanddouban.getHtml(url, loadmore=True, waittime=2)
    soup = bs4.BeautifulSoup(html, "html.parser")

    movies = []
    shelf = soup.find(class_="list-wp")  # restrict the search to the movie list
    for anchor in shelf.find_all('a'):   # one <a> tag per movie entry
        movies.append(Movie(
            anchor.find(class_="title").string,  # .string: the tag's sole child
            anchor.find(class_="rate").string,
            location,
            category,
            anchor.get("href"),
            anchor.find("img").get("src"),       # cover image URL from <img>
        ))
    return movies
コード例 #4
0
def getMovies(category,location):
    """Collect movie attributes from the listing page.

    Returns six parallel lists: names, rates, locations, categories,
    detail-page links and cover-image links.
    """
    names, rates, locs, cats, links, pics = [], [], [], [], [], []

    # Build the search URL from the requested category and location.
    search_url = getMovieUrl(category, location)
    soup = bs4.BeautifulSoup(expanddouban.getHtml(search_url), "html.parser")

    # Every movie card is an <a class="item">. Location and category are
    # the same for every entry, so the inputs are echoed into their lists.
    for card in soup.find_all('a', class_='item'):
        names.append(card.find('span', class_='title').string)
        rates.append(card.find('span', class_='rate').string)
        locs.append(location)
        cats.append(category)
        links.append(card.get('href'))
        pics.append(card.find('img').get('src'))

    return names, rates, locs, cats, links, pics
コード例 #5
0
def getMovies(category, location):
    """Return a list of movie tuples (via Movie.getOneMovie) for the
    given category and location."""
    url = getMovieUrl(category, location)
    soup = BeautifulSoup(expanddouban.getHtml(url, True, 3), "html.parser")

    # Every film entry is an <a class="item"> under the list container
    # (class_ needs the trailing underscore in bs4).
    items = soup.find('div', class_="list-wp").find_all("a", class_="item")

    movie_list = []
    for item in items:
        movie = Movie(
            item.find('span', class_="title").string,
            item.find('span', class_="rate").string,
            location,
            category,
            item.get('href'),             # detail-page link
            item.find('img').get('src'),  # cover image link
        )
        # getOneMovie() converts the instance to a tuple for the caller.
        movie_list.append(movie.getOneMovie())
    return movie_list
コード例 #6
0
ファイル: DoubanCrawler.py プロジェクト: ln0491/douban
def getMovies(category, location):
    """Return a list of Movie objects built from the listing page.

    The checked category/location tags scraped from the page itself are
    used for the Movie fields rather than the raw arguments.
    """
    url = getMovieUrl(category, location)
    html = expanddouban.getHtml(url, loadmore=True, waittime=5)
    soup = bs4.BeautifulSoup(html, 'html.parser')

    article = soup.find(id='app').find(class_='article')
    content_div = article.find(class_='list-wp', recursive=False)
    tag_uls = article.find(class_='tags', recursive=False).find_all(
        'ul', recursive=False)
    # Currently selected location (second-to-last tag group) and category.
    movie_location = tag_uls[-2].find(class_="tag-checked tag").text
    movie_category = tag_uls[1].find(class_="tag-checked tag").text

    movie_lists = []
    for element in content_div.find_all('a', recursive=False):
        # Skip anchors without an href — they are not movie cards.
        if not element.get('href'):
            continue
        paragraph = element.find('p')
        movie_lists.append(Movie(
            paragraph.find(class_='title', recursive=False).text,
            paragraph.find(class_='rate', recursive=False).text,
            movie_location,
            movie_category,
            element.get('href'),
            element.find('img').get('src'),
        ))
    return movie_lists
コード例 #7
0
def getMovie(category, location):
    """Return a list of Movie objects for the given category and location.

    category: movie genre string.
    location: region string.

    Movies whose rate cannot be parsed as a number are reported and
    skipped. (The original referenced an undefined name ``title`` in the
    handler, then used ``rate`` unbound after the failed parse.)
    """
    # Build the listing URL and fetch its HTML.
    url = getMovieUrl(category, location)
    html = expanddouban.getHtml(url, loadmore=True)
    # Name the parser explicitly so bs4 does not warn or pick a
    # platform-dependent default.
    soup = BeautifulSoup(html, "html.parser")
    movie_list = []
    # Every movie card is an <a class="item">.
    for item in soup.find_all('a', class_='item'):
        info_link = item.get('href')
        cover_link = item.div.img.get('src')
        name = item.p.span.text
        rate_str = item.p.find('span', class_='rate').text
        try:
            rate = float(rate_str)
        except ValueError:
            print('The rate of {} is {} which is not a number'.format(
                name, rate_str))
            continue  # skip entries without a numeric rate
        movie_list.append(Movie(name, rate, location, category,
                                info_link, cover_link))
    return movie_list
コード例 #8
0
def getHtmlTxT(url, loadmore=False, waittime=2):
    """Fetch *url* and return it parsed as a BeautifulSoup object.

    loadmore: when True the pager keeps expanding to load every page;
        keep it False while testing so only the first page is fetched.
    waittime: seconds to wait before giving up on an unresponsive page.
        Raise it on a slow connection, but a large value can make a full
        category crawl page through for a very long time.
    """
    page_html = expanddouban.getHtml(url, loadmore, waittime)
    return bs4.BeautifulSoup(page_html, "html.parser")
コード例 #9
0
def getMovies(category, location):
    """Collect the result of Movie.print_data() for every film found for
    the given category and location."""
    # Build the listing URL and fetch + parse its DOM.
    url = getMovieUrl(category, location)
    soup = bs4.BeautifulSoup(expanddouban.getHtml(url), "html.parser")

    results = []
    for card in soup.find_all('a', 'item'):  # one card per movie
        movie = Movie(
            card.find('span', 'title').string,
            card.find('span', 'rate').string,
            location,
            category,
            card['href'],
            card.find('img')['src'],
        )
        results.append(movie.print_data())
    return results
コード例 #10
0
def getMovies(category, location):
    """Return a list of tuples describing each movie found for the given
    category and location.

    Fixes: the original passed an undefined name ``url`` as an extra
    first argument to getMovieUrl (NameError on every call), shadowed the
    builtin ``list``, and left debug prints in place.
    """
    goal_url = getMovieUrl(category, location)
    html = expanddouban.getHtml(goal_url)

    soup = bs4.BeautifulSoup(html, "html.parser")
    content = soup.find(id="app")
    anchors = content.findAll("a")
    list_item = []
    for element in anchors:
        fields = []
        title_span = element.find("span", class_="title")
        if title_span is not None:
            fields.append(title_span.text)
        rate_span = element.find("span", class_="rate")
        if rate_span is not None:
            fields.append(rate_span.text)
            fields.append(location)
            fields.append(category)
            # NOTE(review): this stores the nested <a> Tag object itself,
            # not its href; kept as-is since callers may rely on it.
            info_link = element.find("a", class_="item", target="_blank")
            cover_link = element.find("img", x="movie:cover_x").get('src')
            fields.append(info_link)
            fields.append(cover_link)
        if fields:
            list_item.append(tuple(fields))
    return list_item
コード例 #11
0
def getMovies(category, location):
    """Return Movie objects for every film listed under the given
    category and location."""
    html = expanddouban.getHtml(getMovieUrl(category, location),
                                loadmore=True,
                                waittime=2)
    soup = bs4.BeautifulSoup(html, "html.parser")  # parse the fetched page

    container = soup.find(class_='list-wp')  # the movie-list container

    movie_list = []
    for anchor in container.find_all('a'):
        movie_list.append(Movie(
            anchor.find(class_="title").string,
            anchor.find(class_="rate").string,
            location,
            category,
            anchor.get("href"),
            anchor.find("img").get("src"),
        ))
    return movie_list
コード例 #12
0
def TotalAmount(category):
    """Return how many movies (rated 9-10) douban lists for *category*."""
    url_c = 'https://movie.douban.com/tag/#/?sort=S&range=9,10&tags=电影,{}'.format(
        category)
    soup = BeautifulSoup(expanddouban.getHtml((url_c), True), 'html.parser')
    # Each listed movie is one element with class "item".
    items = soup.find(id='content').find(class_='list-wp').find_all(
        class_='item')
    return len(items)
コード例 #13
0
def getLocationTags(url):
    """Return the list of location tag strings scraped from the movie page."""
    page_html = expanddouban.getHtml(url)
    soup = BeautifulSoup(page_html, "html.parser")
    # Anchor on the "全部地区" label, then walk the siblings of its
    # grandparent element to collect each location tag's text.
    anchor = soup.find(string="全部地区")
    return [sibling.string for sibling in anchor.parent.parent.next_siblings]
コード例 #14
0
def task4(category, location):
    """Scrape the listing for (category, location) and return the movies.

    Returns (m, count): ``m`` maps a 1-based index to a list of
    [name, rate, location, category, info_link, cover_link]; ``count`` is
    the number of movies found.
    """
    cache_name_dict = {}
    cache_rate_dict = {}
    cache_info_link_dict = {}
    cache_cover_link_dict = {}
    cache_jump_number = 0
    m = {}
    cache_name_string_replace = ""
    # step 1: isolate the movie list from the page HTML and extract fields.
    soup = BeautifulSoup(
        expanddouban.getHtml(getMovieUrl(category, location), True), "lxml")
    content_div = soup.find(class_="list-wp")
    for element in content_div.find_all("a", recursive=False):
        cache_jump_number += 1
        for name in element.find_all(class_="title"):
            # Normalize comma-separated titles to slash-separated ones.
            # NOTE(review): both halves of this condition test the same
            # literal; one was presumably meant to be the fullwidth comma.
            # Kept as-is — confirm the intended character before changing.
            if "," not in name.string and "," not in name.string:
                cache_name_dict[cache_jump_number] = name.string
            elif ", " in name.string:
                cache_name_string_replace = name.string
                cache_name_dict[
                    cache_jump_number] = cache_name_string_replace.replace(
                        ", ", "/")
                cache_name_string_replace = ""
            elif "," in name.string:
                cache_name_string_replace = name.string
                cache_name_dict[
                    cache_jump_number] = cache_name_string_replace.replace(
                        ",", "/")
                cache_name_string_replace = ""
        for rate in element.find_all(class_="rate"):
            # Movies without a visible rate default to "9.0".
            if rate.string is None:
                cache_rate_dict[cache_jump_number] = "9.0"
            else:
                cache_rate_dict[cache_jump_number] = rate.string
        # BUG FIX: the original iterated content_div.find_all(class_="item")
        # here, overwriting this slot repeatedly so every movie ended up
        # with the LAST item's href. The link belongs to this element.
        cache_info_link_dict[cache_jump_number] = element.get("href")
        for img in element.find_all(x="movie:cover_x"):
            cache_cover_link_dict[cache_jump_number] = img.get("src")
    # step 2: assemble each movie's fields into the output dict.
    for i in range(1, cache_jump_number + 1):
        class_cache = Movie(cache_name_dict[i], cache_rate_dict[i],
                            location.replace(' ', ''),
                            category.replace(' ', ''),
                            cache_info_link_dict[i], cache_cover_link_dict[i])
        m[i] = [class_cache.name, class_cache.rate, class_cache.location,
                class_cache.category, class_cache.info_link,
                class_cache.cover_link]
    return m, cache_jump_number
コード例 #15
0
def getMovieHtml(category, location):
    """Return the movie <a class="item"> tags for category/location,
    serialized to a single string."""
    page = expanddouban.getHtml(getMovieUrl(category, location),
                                loadmore=True,
                                waittime=2)
    soup = BeautifulSoup(page, 'html.parser')

    # Serialize the matched tag list so callers can inspect it as text.
    return str(soup.find_all("a", class_="item"))
コード例 #16
0
def getMovies(category, location):
    """Return the list of movie names found for category/location."""
    url = getMovieUrl(category, location)
    soup = bs4.BeautifulSoup(expanddouban.getHtml(url, True), "html.parser")
    content_div = soup.find("div", class_="list-wp")
    # Only direct <a> children that carry a <p><span> title are movie cards.
    return [
        element.p.span.get_text()
        for element in content_div.find_all("a", recursive=False)
        if element.p.span
    ]
コード例 #17
0
def get_movies(category, location):
    """Lazily yield one parsed movie per item listed for
    category/location."""
    url = get_movie_url(category, location)
    html = expanddouban.getHtml(url, loadmore=True)

    soup = BeautifulSoup(html, 'lxml')
    container = soup.find('div', 'list-wp')
    for item in container.find_all('a', 'item'):
        yield parse_movie(category, location, item)
コード例 #18
0
def get_location():
    """Return every location tag except the "全部地区" (all) placeholder."""
    url = 'https://movie.douban.com/tag/#/?sort=S&range=9,10&tags=电影'
    soup = BeautifulSoup(expanddouban.getHtml(url), 'html.parser')
    location_list = []
    for tag in soup.find(class_='tags').find_all(class_='category'):
        checked = tag.find(class_="tag-checked tag")
        # Only the location group has "全部地区" as its checked tag.
        if checked.get_text() == '全部地区':
            for sibling in checked.parent.next_siblings:
                location_list.append(sibling.get_text())
    return location_list
コード例 #19
0
def get_location_tags():
    """Return the tag names from the third tag group on the listing page."""
    soup = BeautifulSoup(getHtml(getMovieUrl()), 'html.parser')
    tag_group = soup.find(id='wrapper').find(class_='tags').find_all(
        class_='category')[2].find_all(class_='tag')
    # Skip the first tag (the "all" placeholder).
    return [item.text for item in tag_group[1:]]
コード例 #20
0
def getalllocations():
    """Return every location label found in the location tag group."""
    url = getMovieUrl('剧情', '大陆')
    soup = BeautifulSoup(expanddouban.getHtml(url, True), "html.parser")
    # Jump from the first category <ul> two siblings over to the location
    # <ul>, then walk its <li> children via next_sibling.
    node = soup.find(id='app').find(class_='category').next_sibling.next_sibling.li
    loclist = list()
    while node is not None:
        loclist.append(node.string)
        node = node.next_sibling
    return loclist
コード例 #21
0
def getLocations():
    """Return every location tag on the page except "全部地区"."""
    html = expanddouban.getHtml(
        'https://movie.douban.com/tag/#/?sort=S&range=9,10&tags=电影')
    soup = BeautifulSoup(html, 'html.parser')
    # The location <ul> is two siblings past the first category group.
    location_ul = soup.find(class_='tags').find(
        class_='category').next_sibling.next_sibling
    locationList = []
    for child in location_ul:
        name = child.find(class_='tag').get_text()
        if name != '全部地区':
            locationList.append(name)
    return locationList
コード例 #22
0
def getLocations():
    """Return all location names (the third tag group), minus the
    "all locations" placeholder that leads the list."""
    url = 'https://movie.douban.com/tag/#/'
    soup = BeautifulSoup(expanddouban.getHtml(url), 'html.parser')
    tag_groups = soup.find_all('ul', class_='category')

    names = [span.string
             for span in tag_groups[2].find_all('span', class_='tag')]
    return names[1:]
コード例 #23
0
def getLocations():
    """Return the list of location tags, excluding the "全部地区"
    placeholder.

    Fixes: the original appended to an undefined name ``locations``
    (NameError on the first hit) and never returned anything.
    """
    url = 'https://movie.douban.com/tag/#/?sort=S&range=9,10&tags=电影'
    html = expanddouban.getHtml(url, loadmore=False, waittime=2)
    soup = bs4.BeautifulSoup(html, "html.parser")

    locations = []
    # The location <ul> is two siblings past the first category group.
    content = soup.find(class_='tags').find(
        class_='category').next_sibling.next_sibling
    for sibling in content:
        location = sibling.find(class_='tag').get_text()
        if location != '全部地区':
            locations.append(location)
    return locations
コード例 #24
0
def getLocations(category):
    """Return the location names for *category*, minus the leading
    "all locations" entry."""
    url = "https://movie.douban.com/tag/#/?sort=S&range=9,10&tags=电影,{}".format(
        category)
    soup = bs4.BeautifulSoup(expanddouban.getHtml(url), "html.parser")
    # The location list is the third <ul> under the tags <div>.
    location_ul = soup.find(
        "div",
        class_="tags").ul.find_next_sibling("ul").find_next_sibling("ul")
    locations = [child.span.get_text() for child in location_ul.children]
    return locations[1:]
コード例 #25
0
def get_all_locations():
    """Return every location tag except the "全部地区" placeholder."""
    all_location_url = getMovieUrl('全部类型', '全部地区')
    soup = BeautifulSoup(expanddouban.getHtml(all_location_url), 'lxml')
    # The location <ul> sits two siblings past the first category group.
    location_ul = soup.find(class_='tags').find(
        class_='category').next_sibling.next_sibling
    locationList = []
    for child in location_ul:
        name = child.find(class_='tag').get_text()
        if name != '全部地区':
            locationList.append(name)

    return locationList
コード例 #26
0
def getLocations():
    """Return all location names, skipping the "全部地区" placeholder.

    Only parses the tag groups when exactly four are present, as a
    sanity check on the expected page layout.
    """
    base_url = getMovieUrl()
    soup = BeautifulSoup(edb.getHtml(base_url), "html.parser")
    ul_list = soup.find("div", class_="tags").select('ul[class="category"]')
    result = []
    if len(ul_list) == 4:
        # The third group holds the location tags.
        for li in ul_list[2].find_all("li"):
            name = li.find('span', class_="tag").get_text()
            if name != "全部地区":
                result.append(name)
    return result
コード例 #27
0
def getMovies(category, location, minRateValue=0):
    """Return a list of Movie objects for category/location rated at or
    above minRateValue.

    Fixes: the cover image was read from a nonexistent "scr" attribute
    (KeyError at runtime; should be "src"), and location/category were
    passed to Movie in swapped positions relative to the
    Movie(name, rate, location, category, info, cover) order used by
    every other caller in this file.
    """
    html = getHtml(getMovieUrl(category, location, minRateValue), True)
    soup = BeautifulSoup(html, "html.parser")
    movie_tags = soup.find_all("a", class_="item")

    def construct_movie(movie_tag, location, category):
        # Pull the per-movie fields out of one <a class="item"> card.
        name = movie_tag.find("span", class_="title").text
        rate = movie_tag.find("span", class_="rate").text
        info = movie_tag["href"]
        cover = movie_tag.find("img")["src"]
        return Movie(name, rate, location, category, info, cover)

    return [construct_movie(tag, location, category) for tag in movie_tags]
コード例 #28
0
def getMovies(category, location):
    """Return a list of Movie objects for the given category and location.

    Fixes: the original appended to an undefined name ``movie_list``
    (NameError on the first movie) and had no return statement.
    """
    url = getMovieUrl(category, location)
    html = expanddouban.getHtml(url, True)
    soup = BeautifulSoup(html, 'html.parser')
    content = soup.find(id="content").find(class_="list-wp").find_all(
        "a", class_="item")
    movie_list = []
    for element in content:
        cover_link = element.find("img").get("src")
        name = element.find("p").find(class_="title").string
        rate = element.find("p").find(class_="rate").string
        info_link = element.get("href")
        movie_list.append(Movie(name, rate, location, category,
                                info_link, cover_link))
    return movie_list
コード例 #29
0
def getMovies(category, location):
    """Return movie records for *category* across every location in
    *location* (an iterable of location names).

    Fixes: the original ``return`` sat inside the location loop, so only
    the first location was ever scraped; the body also mixed tabs and
    spaces (a SyntaxError/TabError hazard).
    """
    movies = []
    for loc in location:
        html = expanddouban.getHtml(getMovieUrl(category, loc), True)
        soup = BeautifulSoup(html, 'html.parser')
        content_a = soup.find(id='content').find(class_='list-wp').find_all(
            'a', recursive=False)
        for element in content_a:
            movie = Movie(
                element.find(class_='title').string,
                element.find(class_='rate').string,
                loc,
                category,
                element.get('href'),
                element.find('img').get('src'),
            )
            movies.append(movie.print_data())
    return movies
コード例 #30
0
def count_movie():
    """Scrape movies for three fixed categories across all locations,
    write them to movies.csv, and write per-category top-3 location
    statistics to movie.txt.
    """
    final_category = ['暴力', '恐怖', '文艺']
    final_location = []
    final_movie = []
    url = "https://movie.douban.com/tag/#/?sort=S&range=9,10&tags=电影"
    html = expanddouban.getHtml(url, True)
    soup = BeautifulSoup(html, "html.parser")
    # Collect every location tag, skipping the "全部地区" (all) placeholder.
    for child in soup.find(class_='tags').find(
            class_='category').next_sibling.next_sibling:
        location = child.find(class_='tag').get_text()
        if location != '全部地区':
            final_location.append(location)

    for e in final_category:
        for l in final_location:
            final_movie += getMovies(e,
                                     l)  # gather movies for every category/location pair

    # NOTE(review): rows rely on Movie exposing in_link / c_link — other
    # snippets use info_link / cover_link; confirm against the Movie class.
    with open('movies.csv', 'w', encoding='utf_8_sig') as f:  # dump to CSV
        writer = csv.writer(f)
        for e in final_movie:
            writer.writerow(
                [e.name, e.rate, e.location, e.category, e.in_link, e.c_link])

    # Aggregate statistics per category.
    with open('movie.txt', 'w', encoding='utf-8') as f:
        with open('movies.csv', 'r', encoding='utf-8') as g:
            file = list(csv.reader(g))
            for i in final_category:
                sametype_movie = [k for k in file
                                  if k[3] == i]  # rows of this category
                num_samearea = []
                for j in final_location:
                    num_samearea.append(
                        (j, len([k for k in sametype_movie
                                 if k[2] == j])))  # (location, count) pairs
                sort = sorted(num_samearea, key=lambda x: x[1], reverse=True)
                fir_location, fri_number = sort[0]
                sec_location, sec_number = sort[1]
                third_location, third_number = sort[2]
                total_number = len(sametype_movie)
                f.write(
                    "[{}]类型电影数量排名前三的地区分别为{},{},{},它们分别占总量的{:.2%},{:.2%},{:.2%}。\n"
                    .format(i, fir_location, sec_location, third_location,
                            fri_number / total_number,
                            sec_number / total_number,
                            third_number / total_number))
コード例 #31
0
def getUrlHtml(category, location):
    """Fetch the movie-list page for category/location and return it as a
    parsed BeautifulSoup object.

    waittime is pinned to 5 seconds per douban's robots.txt crawl delay.
    """
    raw_html = expanddouban.getHtml(getMovieUrl(category, location), True, 5)
    return BeautifulSoup(raw_html, 'html.parser')