def content_handle(movie):
    # Extract the movie id from the subject links on the page; there must be exactly one.
    _id = list(set(re.findall(r'https://movie.douban.com/subject/(\d*?)/', movie)))
    if len(_id) == 1:
        _id = _id[0]
    else:
        print(_id)
        raise Exception("More than one movie id found!")
    comment_table_create(_id)
    db = DbHandle(database='comment')
    db.table = _id
    # Limit parsing to the comment list between the "comments" and "paginator" anchors.
    block = re.findall('id="comments".*?id="paginator"', movie, re.S)[0]
    pattern = ('title="(.*?)".*?"https://www.douban.com/people/(.*?)/"'
               '.*?src="(.*?)".*?<.*?comment-time.*?title="(.*?)".*?short">(.*?)<')
    comments = re.findall(pattern, block, re.S)
    for comment in comments[:10]:
        # Rewrite the avatar URL from the thumbnail form (/uNNN-x.jpg) to the large one (/ulNNN.jpg).
        pic = re.sub(r'/u(.*?)-.*?\.', r'/ul\1.', comment[2])
        data = [
            comment[1],                          # user_id
            comment[0],                          # user_name
            str(comment[3]).replace('\n', ''),   # comment date
            comment[4],                          # comment text
        ]
        print(_id, comment[1], pic, comment[2], data[2], data[3])
        db.save(data)
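# A minimal usage sketch for the handler above, assuming a plain requests
# fetch (the crawler itself wires content_handle through GUrlHandle; the
# subject URL below is just an example value, not taken from the code):
if __name__ == '__main__':
    import requests
    page = requests.get(
        'https://movie.douban.com/subject/1292052/comments?status=P',
        headers={'User-Agent': 'Mozilla/5.0'})
    page.encoding = 'utf-8'
    content_handle(page.text)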
def get_urls():
    db = DbHandle(database='ranking')
    db.table = 'movie'
    actor_ids = db.get(_range='details')
    urls = []
    for ids in actor_ids:
        if ids[0]:
            id_list = re.findall(r'celebrity/(\d*?)/', str(ids[0]))
            url_list = [
                'https://movie.douban.com/celebrity/%s/' % _id for _id in id_list
            ]
            urls.extend(url_list)
    # This celebrity page is broken on Douban, so skip it.
    urls.remove('https://movie.douban.com/celebrity/1376098/')
    request = GUrlHandle(content_handle=content_handle, max_=400)
    request.get_contents(urls)
def db_save(name, data):
    db = DbHandle()
    db.table = 'init'
    data_ = [name, data]
    try:
        db.save(data_)
    except Exception as e:
        print(e)
def get_urls():
    db = DbHandle()
    _id = db.get(table='movie', _range='id')
    urls = []
    for i in _id:
        urls.append('https://movie.douban.com/subject/{}/'.format(*i))
    session = GUrlHandle(content_handle=content_handle, max_=200, use_id=True)
    session.get_contents(urls)
def get_urls():
    db = DbHandle()
    db.table = 'movie'
    _id_list = db.get(_range='id')
    id_list = [i[0] for i in _id_list]
    urls = [
        'https://movie.douban.com/subject/%s/comments?status=P' % _id
        for _id in id_list
    ]
    request = GUrlHandle(content_handle=content_handle)
    request.get_contents(urls)
def create_table():
    sql = '''CREATE TABLE `person`(
        `id` CHAR(15) PRIMARY KEY NOT NULL,
        `name` CHAR(50) NOT NULL,
        `sex` VARCHAR(5),
        `constellation` VARCHAR(10),
        `birthday` VARCHAR(30),
        `birthplace` VARCHAR(50),
        `profession` VARCHAR(60),
        `imdb` VARCHAR(15),
        `introduce` TEXT
    )'''
    db = DbHandle()
    db.create_table(sql)
def comment_table_create(_id, database=None):
    db = DbHandle(database=database or 'comment')
    # Table names cannot be bound as query parameters, so the movie id is
    # interpolated into the statement directly.
    sql = '''CREATE TABLE `%s`(
        `user_id` VARCHAR(30),
        `user_name` VARCHAR(30),
        `date` VARCHAR(30),
        `comment` TEXT
    )''' % _id
    try:
        db.execute(query=sql)
    except Exception as e:
        print(e)
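# Because the table name above is string-formatted rather than parameterized,
# it is worth guarding against non-numeric ids before interpolation. A minimal
# sketch; safe_comment_table_create is a hypothetical wrapper, and the
# isdigit() check assumes Douban movie ids are always numeric:
def safe_comment_table_create(_id):
    if not str(_id).isdigit():  # assumption: valid ids contain only digits
        raise ValueError('unexpected table name: %r' % _id)
    comment_table_create(_id)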
class Main(object):
    def __init__(self):
        self.db = DbHandle()
        self.cursor = self.db.cursor
        self.fake = UserAgent()

    def get_urls(self):
        db = DbHandle(database='spider')
        self.db.table = 'ranking'
        _id_list = db.get(table='ranking', _range='movie_id')
        id_list = list(set([i[0] for i in _id_list]))
        # Drop ids that have already been crawled into `spider_movie`.
        done_ = db.get(table='spider_movie', _range='id')
        print(len(id_list))
        for i in done_:
            id_list.remove(i[0])
        print(len(id_list))
        # Handle comments
        # comments_urls = ['https://movie.douban.com/subject/%s/comments?status=P' % _id
        #                  for _id in id_list]
        # Handle movies
        movies_urls = [
            'https://movie.douban.com/subject/%s/' % _id for _id in id_list
        ]
        # Handle awards
        for m, i in zip(id_list, movies_urls):
            # requests fires the response hook synchronously, but binding `m`
            # as a default argument guards against late binding all the same.
            requests.get(
                i,
                hooks={
                    'response':
                    lambda r, *args, _id=m, **kwargs: self.movie_handle(_id, r.text)
                })
        # request = GUrlHandle(content_handle=self.movie_handle, use_id=True)
        # request.get_contents(movies_urls)

    def movie_handle(self, _id, t):
        movie_id = _id
        print(movie_id)
        self.cursor.execute('USE `spider`')
        response = etree.HTML(t)
        try:
            name = response.xpath(
                '//*[@id="content"]/h1/span[1]/text()')[0].split()[0]
        except Exception:
            print(response.xpath('//*[@id="content"]/h1/text()'))
            return
        try:
            rank = re.findall('ratingValue": "(.*?)"', t, re.S)[0]
        except Exception:
            rank = '0.0'
        if rank == '':
            rank = '0.0'
        try:
            star_num = re.findall('ratingCount": "(.*?)"', t, re.S)[0]
        except Exception:
            star_num = None
        try:
            year = re.findall('datePublished": "(.*?)"', t, re.S)[0]
        except Exception:
            year = None
        try:
            _class = json.dumps(
                json.loads(re.findall(r'genre": (\[.*?\])', t, re.S)[0]),
                ensure_ascii=False)
        except Exception:
            _class = None
        try:
            # "制片国家/地区" is the production country/region label on the page.
            countries = re.findall('制片国家/地区:</span> (.*?)<', t)[0]
        except Exception:
            countries = None
        try:
            long = response.xpath(
                '//*[@id="info"]/span[@property="v:runtime"]/text()')[0]
        except Exception:
            long = None
        # Download the poster image.
        poster = response.xpath('//*[@id="mainpic"]/a/img/@src')[0]
        pic = movie_id + '.jpg'
        pic_path = os.path.join('movie', pic)
        with open(os.path.join(SOURCE, pic_path), 'wb') as f:
            f.write(
                requests.get(poster, headers={
                    'User-Agent': self.fake.random
                }).content)
        poster = pic_path
        review = None
        # Prefer the full (hidden) summary over the truncated one.
        content = ''
        contents = response.xpath('//span[@class="all hidden"]/text()')
        if contents:
            for i in contents:
                content += i.strip()
        else:
            for i in response.xpath('//span[@property="v:summary"]/text()'):
                content += i.strip()
        details = content
        # Collect the movie stills.
        images = response.xpath('//*[@id="related-pic"]/ul/li')
        image = []
        for image_ in images:
            img = image_.xpath('./a/img/@src')
            image.extend(img)
        _image = []
        for n, i in enumerate(image):
            pic = movie_id + '_' + str(n) + '.jpg'
            pic_path = os.path.join('movie', pic)
            with open(os.path.join(SOURCE, pic_path), 'wb') as f:
                f.write(
                    requests.get(i, headers={
                        'User-Agent': self.fake.random
                    }).content)
            _image.append(pic_path.replace('\\', '/'))
        image = json.dumps(_image)
        if not self.cursor.execute(
                'SELECT `movie_name` FROM `spider_movie` WHERE id=%s', (_id, )):
            self.cursor.execute(
                '''
                INSERT INTO `spider_movie`(
                    `movie_name`, `long`, `rank`, `star_num`, `year`, `class`,
                    `countries`, `id`, `review`, `details`, `poster`, `image`
                ) VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                ''', [
                    name, long, rank, star_num, year, _class, countries, _id,
                    review, details, poster, image
                ])
            self.db.commit()
            print([
                name, long, rank, star_num, year, _class, countries, _id,
                review, details, poster, image
            ])
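# A minimal standalone illustration of the requests hook pattern used in
# get_urls above: the callable registered under 'response' runs synchronously
# inside requests.get(), so each page can be processed as soon as it arrives.
# The URL is just an example value.
import requests

def show_status(r, *args, **kwargs):
    print(r.url, r.status_code)

requests.get('https://movie.douban.com/subject/1292052/',
             hooks={'response': show_status})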
import requests
from scrapy.handle_db.DBApi import DbHandle
from scrapy.requests.g_handle import GUrlHandle
from lxml import etree

db = DbHandle()
url = 'https://movie.douban.com/subject/1292052/awards/'


def get_awards(url):
    sessions = requests.session()
    sessions.headers['User-Agent'] = (
        'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36')
    r = sessions.get(url)
    r.encoding = 'utf-8'
    text = r.text
    awards_handle(text)


def get_url():
    l = db.get(table='movie', _range='(`id`, `awards_url`)')
    for i in l:
        # get_content (defined elsewhere) is expected to fetch the page and
        # pass its text to the hook.
        get_content(i[0], hook=awards_handle)


def awards_handle(text):
    content = etree.HTML(text)
    d = content.xpath("//div[@class='awards']")
    num = len(d)
    data_list = []
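# awards_handle above stops after collecting the award blocks. A hedged sketch
# of how the loop might continue, assuming each div.awards holds an <h2> with
# the festival name and <ul class="award"> rows; that page structure is an
# assumption, not confirmed by the code above:
def awards_handle_sketch(text):
    content = etree.HTML(text)
    data_list = []
    for block in content.xpath("//div[@class='awards']"):
        # e.g. "第66届奥斯卡金像奖 (1994)" in the block heading (assumed layout)
        title = ''.join(block.xpath('.//h2//text()')).strip()
        for row in block.xpath(".//ul[@class='award']"):
            award = ''.join(row.xpath('.//li//text()')).strip()
            data_list.append((title, award))
    return data_list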
def content_handle(info):
    _id = re.findall(
        r'id="headline".*?rel="nofollow".*?https://movie.douban.com/celebrity/(\d*?)/',
        info, re.S)
    data = [_id[0]]
    name = re.findall(r'<div id="content">.*?<h1>(.+)</h1>', info, re.S)[0]
    # The <span> labels below are the Chinese field names on the celebrity
    # page: 性别 sex, 星座 constellation, 出生日期 birthday, 生卒日期 dates of
    # birth and death, 出生地 birthplace, 职业 profession, imdb编号 IMDb number.
    try:
        sex = re.findall(r'<span>性别<.+>:\s*(.*)\s*', info)[0]
    except Exception:
        print('Cannot find actor sex')
        sex = None
    try:
        constellation = re.findall(r'<span>星座<.+>:\s*(.*)\s*', info)[0]
    except Exception:
        print('Cannot find constellation')
        constellation = None
    try:
        birthday = re.findall(r'<span>出生日期<.+>:\s*(.*)\s*', info)[0]
    except Exception:
        try:
            birthday = re.findall(r'<span>生卒日期<.+>:\s*(.*)\s*', info)[0]
        except Exception:
            print('Cannot find birthday')
            birthday = None
    try:
        birthplace = re.findall(r'<span>出生地<.+>:\s*(.*)\s*', info)[0]
    except Exception:
        print('Cannot find birthplace')
        birthplace = None
    try:
        profession = re.findall(r'<span>职业<.+>:\s*(.*)\s*', info)[0]
    except Exception:
        print('Cannot find profession')
        profession = None
    try:
        imdb_number = re.findall(r'<span>imdb编号<.+>:\s*.+>(.+)</a>', info)[0]
    except Exception:
        print('Cannot find IMDb number')
        imdb_number = None
    # Prefer the full (hidden) introduction; fall back to the short one
    # under the "影人简介" (filmmaker bio) heading.
    all_introduce = re.findall(r'<span class="all hidden">\s*(.+)<', info)
    if all_introduce:
        introduce = all_introduce[0]
    else:
        introduce = re.findall(
            r'<h2>\s*影人简介\s*.+\s*<.+>\s*</div>\s*<div class="bd">\s*(.+)\s*',
            info)[0]
    # Order must line up with the `movie_person` columns.
    data.extend([name, sex, constellation, birthday, birthplace, profession,
                 imdb_number, introduce])
    # An introduction that failed to parse leaves a stray closing tag behind.
    if data[-1] == '</div>':
        data[-1] = None
    db = DbHandle()
    db.table = 'movie_person'
    if not db.get_by_id(_id=int(_id[0])):
        print(data)
        db.save(data)
    else:
        print('Already have this person')
class Main(object):
    def __init__(self):
        self.db = DbHandle(database='spider')
        self.cursor = self.db.cursor
        self.fake = UserAgent()

    def get_urls(self):
        self.db.table = 'ranking'
        _id_list = self.db.get(_range='movie_id')
        id_list = list(set([i[0] for i in _id_list]))
        self.cursor.execute('USE `spider_comment`')
        self.cursor.execute('SHOW TABLES')
        tables = self.cursor.fetchall()
        # Each crawled movie has its own comment table: drop ids that already
        # have one; table ids missing from the ranking are queued instead.
        for i in tables:
            try:
                id_list.remove(i[0])
            except ValueError:
                id_list.append(i[0])
        # Handle comments
        comments_urls = [
            'https://movie.douban.com/subject/%s/comments?status=P' % _id
            for _id in id_list
        ]
        print(comments_urls)
        # Handle movies
        movies_urls = [
            'https://movie.douban.com/subject/%s/' % _id for _id in id_list
        ]
        # Handle awards
        for m, i in zip(id_list, comments_urls):
            # The hook runs synchronously; binding `m` as a default argument
            # guards against late binding all the same.
            requests.get(
                i,
                hooks={
                    'response':
                    lambda r, *args, _id=m, **kwargs: self.comments_handle(_id, r.text)
                })
        # request = GUrlHandle(content_handle=self.comments_handle, use_id=True)
        # request.get_contents(comments_urls)

    def comments_handle(self, _id, text):
        self.cursor.execute('USE `spider_comment`')
        movie_id = _id
        self.cursor.execute('SHOW TABLES')
        all_table = self.cursor.fetchall()
        if (movie_id, ) not in all_table:
            try:
                # Table names cannot be parameterized, so build the statement.
                s = 'CREATE TABLE ' + '`' + movie_id + '`'
                self.cursor.execute('''
                    {}(
                        `user_id` VARCHAR(30),
                        `user_name` VARCHAR(30),
                        `comment_time` DATETIME,
                        `comment` TEXT,
                        `image` VARCHAR(50)
                    )
                '''.format(s))
            except Exception as e:
                print(e)
        comments = re.findall(
            'class="avatar".*?title="(.*?)".*?"https://www.douban.com/people/(.*?)/"'
            '.*?src="(.*?)".*?<.*?comment-time.*?title="(.*?)".*?short">(.*?)<',
            text, re.S)
        for comment_ in comments:
            user_id = comment_[1]
            user_name = comment_[0]
            comm_time = str(comment_[3]).replace('\n', '')
            comm = comment_[4]
            # Rewrite the avatar URL from the thumbnail form (/uNNN-x.jpg)
            # to the large one (/ulNNN.jpg), then download it.
            image = re.sub(r'/u(.*?)-.*?\.', r'/ul\1.', comment_[2])
            pic = user_id + '.jpg'
            pic_path = os.path.join('user', pic)
            with open(os.path.join(SOURCE, pic_path), 'wb') as f:
                f.write(
                    requests.get(image, headers={
                        'User-Agent': self.fake.random
                    }).content)
            image = pic_path
            sql = 'INSERT INTO ' + '`' + movie_id + '`'
            self.cursor.execute(
                sql + '(`user_id`, `user_name`, `comment_time`, `comment`, '
                '`image`) VALUES(%s, %s, %s, %s, %s)',
                [user_id, user_name, comm_time, comm, image])
            self.db.commit()
            print(user_id, user_name, comm_time, comm, image)
def content_handle(movie_id, content):
    # The cast links live in the ld+json block near the top of the page.
    block = re.findall(r'type="application/ld\+json".*?datePublished', content, re.S)
    actors = re.findall(r'celebrity/(\d*?)/', block[0])
    db = DbHandle()
    db.table = 'movie_cast'
    for actor in actors:
        try:
            if not db.get(_filter='where person_id={} and movie_id={}'.format(
                    actor, movie_id)):
                print(actor)
                db.save(data=[movie_id, actor], _range='movie_id, person_id')
            else:
                print(movie_id, actor, 'Already have this connection')
        except Exception as e:
            # Kirimoto Takuya (桐本拓哉) again: his page id 1376098 is broken,
            # so map it to 1250852.
            if actor == '1376098':
                db.save(data=[movie_id, '1250852'], _range='movie_id, person_id')
                continue
            db.close()
            print(e)
            # extra() (defined elsewhere) backfills the missing person record.
            extra(actor)
            print(actor)
            db = DbHandle()
            db.table = 'movie_cast'
            db.save(data=[movie_id, actor], _range='movie_id, person_id')
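# extra() is referenced above but not defined in this file. A hypothetical
# sketch of what such a backfill helper might do, reusing the person-page
# crawler pattern seen elsewhere in this repo; the name extra_sketch, the
# person_content_handle reference, and the single-URL flow are all
# assumptions, not the author's actual implementation:
def extra_sketch(actor_id):
    url = 'https://movie.douban.com/celebrity/%s/' % actor_id
    # person_content_handle stands in for the celebrity-page handler (assumed).
    request = GUrlHandle(content_handle=person_content_handle, max_=1)
    request.get_contents([url])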