Example #1
def get_movies(username, celebrity, start_number, role='performer'):
	choose_role = {'performer': 'A', 'director': 'D'}
	movie_role = choose_role[role]
	url = 'http://movie.douban.com/celebrity/%s/movies?start=%s&format=text&sortby=vote&role=%s' % (celebrity.ID, start_number, movie_role)
	soup = get_soup(url, timeout=15)
	movie_htmls = soup.findAll('a', href=re.compile(r'http://movie.douban.com/subject/\d{7,8}'))
	star_htmls = soup.findAll('span', class_='rating_nums')

	movie_IDs = [re.search(r'\d{7,8}', movie_html['href']).group() for movie_html in movie_htmls]
	movie_names = [movie_html.text for movie_html in movie_htmls]
	stars = [star_html.text for star_html in star_htmls]
	recommend_movies = Movie_list([Recommend_movie(movie_ID, movie_name, star, score=celebrity.final_score)
								   for movie_ID, movie_name, star in zip(movie_IDs, movie_names, stars)])

	# If no 'allstar00' span appears on this page, queue the celebrity
	# for the next listing page.
	choose_list = {0: second_page_celebrities, 25: third_page_celebrities, 50: []}
	exist_html = soup.find("span", class_='allstar00')
	if not exist_html:
		choose_list[start_number].append(celebrity)

	for movie in recommend_movies:
		movie.add_celebrity(celebrity)

	movie_list.extends(recommend_movies, celebrity)
	print('4.celebrity ID %s OK' % (celebrity.ID))
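Every example in this listing calls a get_soup helper that is never shown. Below is a minimal sketch of what it presumably does, assuming it wraps requests and BeautifulSoup; the priority argument is accepted but ignored here because the real scheduling behavior is unknown:

import requests
from bs4 import BeautifulSoup

def get_soup(url, timeout=10, priority='normal', content=False):
    # Fetch the page; return None on any network error so that callers
    # can guard with `if soup:` as several of the examples do.
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException:
        return None
    if content:
        # Some callers (e.g. _get_image below) ask for the raw bytes instead.
        return response.content
    return BeautifulSoup(response.text, 'html.parser')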
Example #2
def get_movies(username, celebrity, start_number, role='performer'):
	choose_role = {'performer': 'A', 'director': 'D'}
	movie_role = choose_role[role]
	url = 'http://movie.douban.com/celebrity/%s/movies?start=%s&format=text&sortby=vote&role=%s' % (celebrity.ID, start_number, movie_role)
	soup = get_soup(url, timeout=8)
	movie_htmls = soup.findAll('a', href=re.compile(r'http://movie.douban.com/subject/\d{7,8}'))
	star_htmls = soup.findAll('span', class_='rating_nums')

	movie_IDs = [re.search(r'\d{7,8}', movie_html['href']).group() for movie_html in movie_htmls]
	movie_names = [movie_html.text for movie_html in movie_htmls]
	stars = [star_html.text for star_html in star_htmls]
	recommend_movies = Movie_list([Recommend_movie(movie_ID, movie_name, star, score=celebrity.final_score)
								   for movie_ID, movie_name, star in zip(movie_IDs, movie_names, stars)])

	# If no 'allstar00' span appears on this page, queue the celebrity
	# for the next listing page.
	choose_list = {0: second_page_celebrities, 25: third_page_celebrities, 50: []}
	exist_html = soup.find("span", class_='allstar00')
	if not exist_html:
		choose_list[start_number].append(celebrity)

	for movie in recommend_movies:
		movie.add_celebrity(celebrity)

	movie_list.extends(recommend_movies, celebrity)
	print('4.celebrity ID %s OK' % (celebrity.ID))
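Both get_movies variants mutate collections that must already exist at module level, and they rely on custom container classes whose extends() method accepts extra arguments. None of that is shown in the listing; the stand-ins below are assumptions inferred purely from usage:

# Presumed module-level state, inferred from how the examples use it.
second_page_celebrities = []   # celebrities queued for the second listing page
third_page_celebrities = []    # celebrities queued for the third listing page
movie_list = Movie_list([])    # assumed custom container; its extends() takes (movies, celebrity)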
Example #3
def get_celebrities(username, star, star_movie_ID):
	url = 'http://movie.douban.com/subject/%s/'%(star_movie_ID)
	soup = get_soup(url, timeout=15)
	# celebrity
	celebrity_htmls = soup.findAll('a', {'rel': 'v:starring'}, href=re.compile(r'/celebrity/\d{7}'), limit=4)
	page_celebrity_IDs = [re.search(r'(\d{7})', celebrity_html['href']).group() for celebrity_html in celebrity_htmls]
	page_celebrity_names = [celebrity.text for celebrity in celebrity_htmls]
	# TODO: the directors are not included!
	directors_htmls = soup.findAll('a', {'rel': 'v:directedBy'}, href=re.compile(r'/celebrity/\d{7}'))
	directors_IDs = [re.search(r'(\d{7})', directors_html['href']).group() for directors_html in directors_htmls]
	directors_names = [director.text for director in directors_htmls]
	page_directors = [Celebrity(directors_ID, original_score=star, name=name, role='director')
						 for directors_ID, name in zip(directors_IDs, directors_names) ]
	
	page_celebrities = [Celebrity(page_celebrity_ID, original_score=star, name=name)
						 for page_celebrity_ID,name in zip(page_celebrity_IDs, page_celebrity_names)]
	# movie information
	movie_name = soup.find('span', {'property': 'v:itemreviewed'}).text
	movie = Movie(star_movie_ID, movie_name)
	for page_celebrity in page_celebrities:
		page_celebrity.add_loved_movie(movie) 
	for page_director in page_directors:
		page_director.add_loved_movie(movie)

	star_directors.extends(page_directors, movie, star)
	star_celebrities.extends(page_celebrities, movie, star)
	print('3.OK %s movie ID'%(star_movie_ID))
Example #4
def get_celebrities_pages(username):
	url = 'http://movie.douban.com/people/%s/celebrities'%(username)
	print('Start!')
	soup = get_soup(url, priority='high', timeout=2)
	title = soup.title.text
	pages = int(re.search(r'\((\d+)\)$', title).group(1))
	return pages
Example #5
def get_celebrities_pages(username):
    url = 'http://movie.douban.com/people/%s/celebrities' % (username)
    print('Start!')
    soup = get_soup(url, priority='high', timeout=10)
    title = soup.title.text
    pages = int(re.search(r'\((\d+)\)$', title).group(1))
    return pages
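Both variants read the page count out of the parenthesized number that ends the page title. A quick illustration against a made-up title (the real title text is an assumption; only the trailing '(N)' suffix matters):

import re

title = 'someuser关注的影人 (137)'   # hypothetical title string
pages = int(re.search(r'\((\d+)\)$', title).group(1))
print(pages)   # -> 137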
Example #6
def get_movies_pages(username):
    url = 'http://movie.douban.com/people/%s/collect' % (username)
    soup = get_soup(url, timeout=15, priority='high')
    title_text = soup.title.text
    movies_pages = (int(re.search(r'\((\d+)\)', title_text).group(1)) // 30 +
                    1) * 30
    return movies_pages
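The arithmetic rounds the collection size up to the next multiple of 30 (Douban's list-mode page size), so the result can be fed straight into the start= offsets used elsewhere. For example:

# With 95 movies: (95 // 30 + 1) * 30 == 120, so offsets 0, 30, 60, 90 cover every page.
for start_number in range(0, (95 // 30 + 1) * 30, 30):
    print(start_number)   # 0, 30, 60, 90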
Example #7
def get_special(celebrity):
    url = 'http://movie.douban.com/celebrity/%s/' % (celebrity.ID)
    soup = get_soup(url, timeout=5)
    image_url = soup.find('img', title=u'点击看大图')['src']
    pp = re.compile('medium')
    image_url = pp.sub('small', image_url)
    celebrity.image_url = image_url
    print('6.celebrity%s image' % (celebrity.ID))
Example #9
def __analysis_article__(self, base_url):
    soup = get_soup(base_url)
    if not soup:
        return
    # Collect the pager links, dropping bare newlines and nodes without an href.
    children = []
    pager = soup.find(name='li', class_='pager_theme_4')
    if pager:
        children = [child for child in pager.children
                    if str(child) != '\n' and 'href' in str(child)]
    pages = [base_url]
    try:
        if len(children) > 2:
            # The last pager link carries the highest page number (?pn=N).
            last_page = int(re.findall(r'.*pn=(\d+)', children[-1]['href'])[0])
            pages.extend(base_url + '?pn=' + str(i) for i in range(2, last_page + 1))
    except Exception as ex:
        print('get_all_pages...exception:{} base_url:{}'.format(str(ex), base_url))
    for page in pages:
        soup = get_soup(page)
        if not soup:
            continue
        for content_div in soup.find_all('div', attrs={'class': 'l_post_bright'}):
            try:
                user, content = parse_user_and_content(content_div)
                with self.lock:
                    # setdefault creates the list the first time a user is seen.
                    self.all_user_contents.setdefault(user, []).append(content)
            except Exception as ex:
                print(ex)
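parse_user_and_content is not shown in the original. A plausible sketch for Tieba's post markup follows; the selector class names are assumptions about the page structure, not confirmed:

def parse_user_and_content(content_div):
    # Hypothetical selectors; the real class names depend on Tieba's current markup.
    user = content_div.find('a', class_='p_author_name').text.strip()
    content = content_div.find('div', class_='d_post_content').text.strip()
    return user, content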
Example #10
def __get_articles__(self):
    while len(self.pages) > 0:
        url = self.pages.pop()
        soup = get_soup(url)
        if not soup:
            continue
        article_links = soup.findAll('a', attrs={'href': re.compile(r'^/p/\d{10}')})
        with self.lock:
            self.article_urls.extend(baidu_base_url + a['href'] for a in article_links)
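baidu_base_url is assumed to be the Tieba host, since the matched hrefs (e.g. /p/1234567890) are site-relative:

baidu_base_url = 'https://tieba.baidu.com'   # assumption; not shown in the original source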
Example #11
def get_celebrities(username, start_number):
	url = 'http://movie.douban.com/people/%s/celebrities?start=%s' % (username, start_number)
	soup = get_soup(url, timeout=4)
	celebrity_htmls = soup.findAll('a', href=re.compile(r'http://movie.douban.com/celebrity/\d{7}/$'))
	# keep every other matched link
	page_celebrity_IDs = [re.search(r'\d{7}', celebrity['href']).group()
						  for index, celebrity in enumerate(celebrity_htmls)
						  if index % 2 == 0]

	names_html = soup.findAll('em')
	names = [name.text for name in names_html]
	page_celebrities = [Celebrity(page_celebrity_ID, collect_or_watch='collect',
						original_score=5, name=name)
						for page_celebrity_ID, name in zip(page_celebrity_IDs, names)]
	celebrities.extend(page_celebrities)
	print('1.collect page %s OK' % (start_number))
Example #12
def _get_image(url, movie_ID):
	'''
	Return the image's URL path and store the image file.

	Parameters:
	* url : the poster's url
	* movie_ID : used for naming the image file
	'''
	content = spider.get_soup(url, timeout=5, priority='high',
							  content=True)
	# Warning! It is really a bad design!!
	image_path = os.path.join(dirname, 'media/poster/%s.jpg' % (movie_ID))
	with open(image_path, 'wb') as image_file:
		image_file.write(content)
	image_url = '/picture/poster/%s.jpg' % (movie_ID)
	return image_url
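Because the function writes into a hard-coded media/poster directory (the "bad design" its own comment warns about), a caller has to make sure that directory exists first. A minimal usage sketch, with dirname assumed to be the module's base path and the URL and movie ID purely illustrative:

import os

dirname = os.path.dirname(os.path.abspath(__file__))   # assumed module-level constant
os.makedirs(os.path.join(dirname, 'media/poster'), exist_ok=True)
poster = _get_image('http://img3.douban.com/hypothetical_poster.jpg', '1291546')
print(poster)   # -> /picture/poster/1291546.jpg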
Example #13
def get_celebrities(username, star, star_movie_ID):
    url = 'http://movie.douban.com/subject/%s/' % (star_movie_ID)
    soup = get_soup(url, timeout=8)
    # celebrity
    if soup:
        celebrity_htmls = soup.findAll('a', {'rel': 'v:starring'},
                                       href=re.compile(r'/celebrity/\d{7}'),
                                       limit=4)
        page_celebrity_IDs = [
            re.search(r'(\d{7})', celebrity_html['href']).group()
            for celebrity_html in celebrity_htmls
        ]
        page_celebrity_names = [
            celebrity.text for celebrity in celebrity_htmls
        ]
        directors_htmls = soup.findAll('a', {'rel': 'v:directedBy'},
                                       href=re.compile(r'/celebrity/\d{7}'))
        directors_IDs = [
            re.search(r'(\d{7})', directors_html['href']).group()
            for directors_html in directors_htmls
        ]
        directors_names = [director.text for director in directors_htmls]
        page_directors = [
            Celebrity(directors_ID,
                      original_score=star,
                      name=name,
                      role='director')
            for directors_ID, name in zip(directors_IDs, directors_names)
        ]

        page_celebrities = [
            Celebrity(page_celebrity_ID, original_score=star,
                      name=name) for page_celebrity_ID, name in zip(
                          page_celebrity_IDs, page_celebrity_names)
        ]
        # movie information
        movie_name = soup.find('span', {'property': 'v:itemreviewed'}).text
        movie = Movie(star_movie_ID, movie_name)
        for page_celebrity in page_celebrities:
            page_celebrity.add_loved_movie(movie)
        for page_director in page_directors:
            page_director.add_loved_movie(movie)

        star_directors.extends(page_directors, movie, star)
        star_celebrities.extends(page_celebrities, movie, star)
        print('3.OK %s movie ID' % (star_movie_ID))
Example #14
def get_movies(username, start_number):
	url = 'http://movie.douban.com/people/%s/collect?start=%s&mode=list'%(username, start_number)
	soup = get_soup(url, timeout=10)
	htmls = soup.findAll('li', id=re.compile(r'list\d{7,8}'), class_=re.compile('item'))
	for html in htmls:
		star_html = html.find('span', class_=re.compile(r'rating\d-t'))
		if star_html:
			star_html = star_html['class'][0]
		else:
			star_html = None
		movie_ID = re.search(r'\d{7,8}', html['id']).group()
		if star_html == 'rating5-t':
			five_star_movies_IDs.append(movie_ID)
		elif star_html == 'rating4-t':
			four_star_movies_IDs.append(movie_ID)
		movies_have_seen.append(Movie(movie_ID))
	print('2.start number %s'%(start_number))
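The rating\d-t class encodes the user's star rating directly in the list markup. A small self-contained check against a fabricated fragment (the HTML here is an assumption about Douban's list mode, not captured from the site):

import re
from bs4 import BeautifulSoup

html = '<li id="list1291546" class="item"><span class="rating5-t"></span></li>'
li = BeautifulSoup(html, 'html.parser').find('li', id=re.compile(r'list\d{7,8}'))
star = li.find('span', class_=re.compile(r'rating\d-t'))['class'][0]
print(star)                                      # -> rating5-t
print(re.search(r'\d{7,8}', li['id']).group())   # -> 1291546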
Example #15
def get_final_movies(movie):
    def find_author_comment(star):
        star = soup.find('span', class_=("allstar%s0 rating" % (star)))
        if star:
            comment = star.parent.parent.next_sibling.next_sibling.next
            comment = str(comment)
            author = star.previous_sibling.previous_sibling.text
        else:
            comment = None
            author = None
        return comment, author

    def get_comment():
        is_movie = not bool(soup.find('div', class_='episode_list'))

        audience_number = soup.find('span', {'property': 'v:votes'})
        if audience_number:
            audience_number = audience_number.text
        else:
            audience_number = 0
        has_enough_audience = int(audience_number) > 250
        if is_movie and has_enough_audience:
            four_comment, four_author = find_author_comment(4)
            five_comment, five_author = find_author_comment(5)
            if not five_comment:
                movie.comment = Comment(four_comment, four_author, 4)
            else:
                movie.comment = Comment(five_comment, five_author, 5)
            return movie.comment

    url = 'http://movie.douban.com/subject/%s/' % (movie.ID)
    soup = get_soup(url, timeout=8)
    comment = get_comment()
    # Poster url
    poster_url = soup.find('img', {'rel': 'v:image', 'title': u'点击看更多海报'})['src']
    if comment:
        movie.poster_url = poster_url
        print('5.Ok %s' % (movie.ID))
        final_movies.append(movie)
Example #16
def get_movies(username, start_number):
    url = 'http://movie.douban.com/people/%s/collect?start=%s&mode=list' % (
        username, start_number)
    soup = get_soup(url, timeout=10)
    if soup:
        htmls = soup.findAll('li',
                             id=re.compile(r'list\d{7,8}'),
                             class_=re.compile('item'))
        for html in htmls:
            star_html = html.find('span', class_=re.compile(r'rating\d-t'))
            if star_html:
                star_html = star_html['class'][0]
            else:
                star_html = None
            movie_ID = re.search(r'\d{7,8}', html['id']).group()
            if star_html == 'rating5-t':
                five_star_movies_IDs.append(movie_ID)
            elif star_html == 'rating4-t':
                four_star_movies_IDs.append(movie_ID)
            movies_have_seen.append(Movie(movie_ID))
        print('2.start number %s' % (start_number))
Example #18
def get_celebrities(username, start_number):
    url = 'http://movie.douban.com/people/%s/celebrities?start=%s' % (
        username, start_number)
    soup = get_soup(url, timeout=10)
    page_celebrities = soup.findAll(
        'a', href=re.compile(r'http://movie.douban.com/celebrity/\d{7}/$'))
    # keep every other matched link
    page_celebrities = [
        re.search(r'\d{7}', celebrity['href']).group()
        for index, celebrity in enumerate(page_celebrities)
        if index % 2 == 0
    ]

    names_html = soup.findAll('em')
    names = [name.text for name in names_html]
    page_celebrities = [
        Celebrity(page_celebrity,
                  collect_or_watch='collect',
                  original_score=5,
                  name=name)
        for page_celebrity, name in zip(page_celebrities, names)
    ]
    celebrities.extend(page_celebrities)
    print('1.collect page%s OK' % (start_number))
Example #19
def parse_user(url):
    soup = get_soup(url)
    if not soup:
        return None
    userInfo = soup.find(name='div', attrs={'class': 'userinfo_userdata'})
    userinfo__head = soup.find(name='div', attrs={'id': 'j_userhead'})
    if userInfo:
        # Only a male marker is checked; its absence is treated as female.
        sex = userInfo.find(name='span', attrs={'class': 'userinfo_sex_male'})
        sex_ = 'male' if sex else 'female'
        try:
            user_head = userinfo__head.find(
                name='img', attrs={'src': re.compile(r'http://?.*')})['src']
        except Exception as ex:
            print(ex)
            user_head = ''
        name_ = re.findall(r'用户名:(.+?)<', str(userInfo))[0]
        age_ = re.findall(r'吧龄:(.+?)<', str(userInfo))[0]
        titles_ = re.findall(r'发贴:(.+?)<', str(userInfo))[0]
        u = User(name_, user_head, url, sex_, age_, titles_)
        return str(json.dumps(u.__dict__, ensure_ascii=False))
    return str(json.dumps(User(url=url).__dict__, ensure_ascii=False))
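The User class is not included in the listing. A minimal stand-in consistent with both call sites (all six fields positional in the success path, url-only keyword in the fallback), with the field names being assumptions:

class User:
    # Field order matches the positional call User(name_, user_head, url, sex_, age_, titles_).
    def __init__(self, name='', head='', url='', sex='', age='', titles=''):
        self.name = name
        self.head = head
        self.url = url
        self.sex = sex
        self.age = age
        self.titles = titles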
Example #20
def get_final_movies(movie):
	url = 'http://movie.douban.com/subject/%s/'%(movie.ID)
	soup = get_soup(url, timeout=15)
	is_movie = not bool(soup.find('div', class_='episode_list'))

	def find_author_comment(star):
		star = soup.find('span', class_=("allstar%s0 rating"%(star)))
		if star:
			comment = star.parent.parent.next_sibling.next_sibling.next
			author = star.previous_sibling.previous_sibling.text
		else:
			comment = None
			author = None
		return comment, author
	if is_movie:
		four_comment, four_author = find_author_comment(4)
		five_comment, five_author = find_author_comment(5)
		movie.comment = [Comment(four_comment, four_author, 4),
						 Comment(five_comment, five_author, 5)]
		# Poster url
		poster_url = soup.find('img', {'rel': 'v:image', 'title': u'点击看更多海报'})['src']
		movie.poster_url = poster_url
		print('5.Ok %s' % (movie.ID))
		final_movies.append(movie)
Example #21
def get_special(celebrity):
	url = 'http://movie.douban.com/celebrity/%s/'%(celebrity.ID)
	soup = get_soup(url, timeout=12)
	image_url = soup.find('img', title=u'点击看大图')['src']
	celebrity.image_url = image_url
Example #22
import spider
import download

# Set the root url and fetch the Vip.com product page content
root_url = 'https://category.vip.com/'
soup = spider.get_soup(root_url)

# Scrape tree_id and c_id
tree_id = spider.get_tree_id(soup)
c_id = spider.get_c_id(soup)

for c_item in c_id[6:7]:
    # Use tree_id and c_id to build the url for each category's link list
    format_url = 'https://category.vip.com/ajax/getTreeList.php?cid={}&tree_id={}'.format(
        c_item, tree_id)
    url_soup = spider.get_soup(format_url)
    name_dic = spider.get_name_dic(url_soup)
    url_dic = spider.get_url_dic(url_soup, name_dic)
    for folder, url in url_dic.items():
        folder = spider.folder_name_simplify(folder)
        product_list = spider.get_product_list(url)
        # Vip.com limits requests, so fetch and save the first fifty items first
        download.down_img(folder, product_list[:50])
        if len(product_list) >= 50:
            download.down_img(folder, product_list[50:])
        print(folder + ' download finished')
Example #23
def get_movies_pages(username):
	url = 'http://movie.douban.com/people/%s/collect?&mode=list'%(username)
	soup = get_soup(url, timeout=1, priority='high')
	title_text = soup.title.text
	movies_pages = (int(re.search(r'\((\d+)\)', title_text).group(1)) // 30 + 1) * 30
	return movies_pages