def content_handle(movie):
    """Parse one Douban comments page and store its first ten short comments.

    The page must reference exactly one distinct
    ``https://movie.douban.com/subject/<id>/`` id. That id is handed to
    ``comment_table_create`` and then used as the ``DbHandle`` table name
    (presumably one table per movie -- confirm against ``DbHandle``).

    Args:
        movie: HTML text of the comments page.

    Raises:
        Exception: if the number of distinct subject ids found is not one.
        IndexError: if no ``id="comments"`` ... ``id="paginator"`` section
            is present in the page.
    """
    subject_ids = list(
        set(re.findall('https://movie.douban.com/subject/(\\d*?)/', movie)))
    if len(subject_ids) != 1:
        print(subject_ids)
        raise Exception("!!!more than one!")
    subject_id = subject_ids[0]

    comment_table_create(subject_id)
    store = DbHandle(database='comment')
    store.table = subject_id

    # Only the markup between the comments container and the paginator is
    # scanned for comment entries.
    section = re.findall('id="comments".*?id="paginator"', movie, re.S)[0]

    # Five capture groups, in order: a title attribute, the /people/ path
    # segment, an image src, the comment-time title attribute, and the text
    # following ``short">``.
    entry_pattern = (
        'title="(.*?)".*?'
        '"https://www.douban.com/people/(.*?)/".*?'
        'src="(.*?)".*?'
        '<.*?comment-time.*?title="(.*?)".*?'
        'short">(.*?)<'
    )
    entries = re.findall(entry_pattern, section, re.S)

    for title_attr, people_slug, img_src, time_title, short_text in entries[:10]:
        # The rewritten image URL is only printed below; it is not stored.
        pic_url = re.sub('/u(.*?)-.*?\\.', '/ul\\1.', img_src)
        stamp = str(time_title).replace('\n', '')
        text = '''{}'''.format(short_text)
        print(subject_id, people_slug, pic_url, img_src, stamp, text)
        store.save([people_slug, title_attr, stamp, text])
def db_save(name, data):
    """Save a ``[name, data]`` row to the ``'init'`` table, best effort.

    Any exception raised by ``DbHandle.save`` is printed and swallowed, so
    this function never propagates a save failure to its caller.

    Args:
        name: first column value of the row.
        data: second column value of the row.
    """
    handle = DbHandle()
    handle.table = 'init'
    try:
        handle.save([name, data])
    except Exception as e:
        print(e)
def content_handle(movie_id, content):
    """Record movie-to-celebrity links found in a movie page's ld+json block.

    Every ``celebrity/<digits>/`` id located between the
    ``type="application/ld+json"`` marker and ``datePublished`` is saved to
    the ``movie_cast`` table as ``(movie_id, person_id)`` unless
    ``DbHandle.get`` already returns something for that pair.

    Args:
        movie_id: id of the movie the page belongs to.
        content: HTML text of the movie page.

    Raises:
        IndexError: if the ld+json ... datePublished section is not found.
    """
    columns = 'movie_id, person_id'
    pair_filter = 'where person_id={} and movie_id={}'

    sections = re.findall(
        'type="application/ld\\+json".*?datePublished', content, re.S)
    person_ids = re.findall('celebrity/(\\d*?)/', sections[0])

    cast_db = DbHandle()
    cast_db.table = 'movie_cast'
    for actor in person_ids:
        try:
            if cast_db.get(_filter=pair_filter.format(actor, movie_id)):
                print(movie_id, actor, 'Already have this connect')
            else:
                print(actor)
                cast_db.save(data=[movie_id, actor], _range=columns)
        except Exception as e:
            # Special case kept from the original author's note about
            # Takuya Kirimoto: id 1376098 is stored as 1250852 instead
            # (presumably a duplicate profile -- TODO confirm).
            if actor == '1376098':
                cast_db.save(data=[movie_id, '1250852'], _range=columns)
                continue
            # Recovery path: drop the handle, let ``extra`` process the
            # person (presumably it fetches/stores the missing person row --
            # verify), then retry the save on a fresh handle.
            cast_db.close()
            print(e)
            extra(actor)
            print(actor)
            cast_db = DbHandle()
            cast_db.table = 'movie_cast'
            cast_db.save(data=[movie_id, actor], _range=columns)
def _first_match(pattern, text, missing_message=None):
    """Return the first ``re.findall`` result for *pattern* in *text*, or None.

    When nothing matches and *missing_message* is given, it is printed.
    """
    found = re.findall(pattern, text)
    if found:
        return found[0]
    if missing_message is not None:
        print(missing_message)
    return None


def content_handle(info):
    """Parse a Douban celebrity page and store the person in ``movie_person``.

    The saved row is, in order: celebrity id, name, sex, constellation,
    birthday, birthplace, profession, IMDb number, biography. Optional
    fields that cannot be found are stored as ``None``. Nothing is saved
    when ``DbHandle.get_by_id`` already returns a record for the id.

    Args:
        info: HTML text of the celebrity page.

    Raises:
        IndexError: if the celebrity id or the ``<h1>`` name is not found.
        ValueError: if the captured celebrity id is not a valid integer.
    """
    # Raw string: ``\d`` in a plain literal is an invalid escape sequence.
    person_ids = re.findall(
        r'id="headline".*?rel="nofollow".*?'
        r'https://movie.douban.com/celebrity/(\d*?)/',
        info, re.S)
    person_id = person_ids[0]
    name = re.findall(r'<div id="content">.*?<h1>(.+)</h1>', info, re.S)[0]

    sex = _first_match(
        r'<span>性别<.+>:\s*(.*)\s*', info, 'Can not find actor sex')
    constellation = _first_match(
        r'<span>星座<.+>:\s*(.*)\s*', info, 'Can not find constellation')

    # Birth date is tried first; the birth-and-death date label is the
    # fallback, and only a miss on both is reported.
    birthday = _first_match(r'<span>出生日期<.+>:\s*(.*)\s*', info)
    if birthday is None:
        birthday = _first_match(
            r'<span>生卒日期<.+>:\s*(.*)\s*', info, 'Can not find birthday')

    birthplace = _first_match(
        r'<span>出生地<.+>:\s*(.*)\s*', info, 'Can not find birthplace')
    profession = _first_match(
        r'<span>职业<.+>:\s*(.*)\s*', info, 'Can not find profession')
    imdb_number = _first_match(
        r'<span>imdb编号<.+>:\s*.+>(.+)</a>', info, 'Can not find IMDB编号')

    # Prefer the ``all hidden`` span; otherwise fall back to the text right
    # after the biography heading. A page with neither yields None instead
    # of raising, matching how the other optional fields behave.
    introduce = _first_match(r'<span class="all hidden">\s*(.+)<', info)
    if introduce is None:
        introduce = _first_match(
            r'<h2>\s*影人简介\s*.+\s*<.+>\s*</div>\s*<div class="bd">\s*(.+)\s*',
            info, 'Can not find introduction')
    # The fallback pattern can capture a bare closing tag when the
    # biography body is empty; treat that as "no biography".
    if introduce == '</div>':
        introduce = None

    data = [
        person_id,
        name,
        sex,
        constellation,
        birthday,
        birthplace,
        profession,
        imdb_number,
        introduce,
    ]

    db = DbHandle()
    db.table = 'movie_person'
    if db.get_by_id(_id=int(person_id)):
        print('Already have this')
    else:
        print(data)
        db.save(data)