def content_handle(movie):
    """Extract the first ten short comments from a Douban comment page and save them.

    The page must reference exactly one ``movie.douban.com/subject/<id>/`` URL;
    that id is used as the name of the table (in the ``comment`` database) the
    rows are written to.  All parsing happens before any database work so a
    malformed page does not leave an empty table behind.

    Args:
        movie: HTML text of the comment page.

    Raises:
        ValueError: the page references no subject id, or more than one.
        IndexError: the page has no ``id="comments"`` ... ``id="paginator"``
            section.
    """
    subject_ids = sorted(
        set(re.findall(r'https://movie.douban.com/subject/(\d*?)/', movie)))
    if not subject_ids:
        raise ValueError('no Douban subject id found in page')
    if len(subject_ids) > 1:
        raise ValueError(
            'more than one Douban subject id found in page: {}'.format(
                subject_ids))
    subject_id = subject_ids[0]

    sections = re.findall('id="comments".*?id="paginator"', movie, re.S)
    if not sections:
        raise IndexError(
            'comments section not found for subject {}'.format(subject_id))

    # Capture groups, in order: a title attribute, the /people/<slug>/ part of
    # the profile URL, an image src, the comment-time title attribute and the
    # text inside the "short" element.
    pattern = 'title="(.*?)".*?"https://www.douban.com/people/(.*?)/".*?src="(.*?)".*?<.*?comment-time.*?title="(.*?)".*?short">(.*?)<'
    comments = re.findall(pattern, sections[0], re.S)

    comment_table_create(subject_id)
    db = DbHandle(database='comment')
    db.table = subject_id
    # NOTE(review): the handle is never closed here; confirm whether DbHandle
    # releases its connection on its own.
    for title_attr, people_slug, img_src, time_title, short_text in comments[:10]:
        # Rewritten image URL ('/u<x>-<y>.' -> '/ul<x>.'); it is only printed,
        # not stored.
        pic = re.sub(r'/u(.*?)-.*?\.', r'/ul\1.', img_src)
        posted = time_title.replace('\n', '')
        print(subject_id, people_slug, pic, img_src, posted, short_text)
        db.save([people_slug, title_attr, posted, short_text])
Exemple #2
0
def db_save(name, data):
    """Save the row ``[name, data]`` to the ``init`` table.

    Any exception raised by the save is printed and otherwise ignored, so this
    function never propagates a storage error to its caller.
    """
    handle = DbHandle()
    handle.table = 'init'
    row = [name, data]
    try:
        handle.save(row)
    except Exception as err:
        print(err)
Exemple #3
0
def content_handle(movie_id, content):
    """Store a movie/person link for every celebrity id in the page's JSON-LD section.

    The section is the text between ``type="application/ld+json"`` and the first
    following ``datePublished``; every ``celebrity/<digits>/`` id inside it is
    written to the ``movie_cast`` table unless that pair is already present.

    Args:
        movie_id: id of the movie the page belongs to.
        content: HTML text of the movie page.

    Raises:
        IndexError: the page contains no such JSON-LD section.
    """
    ld_sections = re.findall(
        'type="application/ld\\+json".*?datePublished', content, re.S)
    person_ids = re.findall('celebrity/(\\d*?)/', ld_sections[0])

    cast_db = DbHandle()
    cast_db.table = 'movie_cast'
    columns = 'movie_id, person_id'

    for person_id in person_ids:
        try:
            already_linked = cast_db.get(
                _filter='where person_id={} and movie_id={}'.format(
                    person_id, movie_id))
            if already_linked:
                print(movie_id, person_id, 'Already have this connect')
            else:
                print(person_id)
                cast_db.save(data=[movie_id, person_id], _range=columns)
        except Exception as err:
            # Special case for Takuya Kirimoto: when id 1376098 fails, the
            # link is stored under id 1250852 instead.
            if person_id == '1376098':
                cast_db.save(data=[movie_id, '1250852'], _range=columns)
                continue
            # Drop the failed handle, let extra() deal with the person id
            # (presumably it fetches the missing person record -- confirm),
            # then retry the save on a fresh handle.
            cast_db.close()
            print(err)
            extra(person_id)
            print(person_id)
            cast_db = DbHandle()
            cast_db.table = 'movie_cast'
            cast_db.save(data=[movie_id, person_id], _range=columns)
Exemple #4
0
def _first_match(pattern, text, not_found_message=None):
    """Return the first ``re.findall`` hit of *pattern* in *text*, or ``None``.

    When nothing matches and *not_found_message* is given, the message is
    printed so the missing field is visible in the crawl output.
    """
    hits = re.findall(pattern, text)
    if hits:
        return hits[0]
    if not_found_message is not None:
        print(not_found_message)
    return None


def content_handle(info):
    """Parse a Douban celebrity page and save it to the ``movie_person`` table.

    The saved row is ``[id, name, sex, constellation, birthday, birthplace,
    profession, imdb number, biography]``.  Every field except id and name is
    optional and stored as ``None`` when the page does not contain it.  Nothing
    is written when ``get_by_id`` already finds the id.

    Args:
        info: HTML text of the celebrity page.

    Raises:
        IndexError: the celebrity id or the name cannot be found in the page.
    """
    person_ids = re.findall(
        r'id="headline".*?rel="nofollow".*?https://movie.douban.com/celebrity/(\d*?)/',
        info, re.S)
    if not person_ids:
        raise IndexError('celebrity id not found in page')
    person_id = person_ids[0]

    names = re.findall(r'<div id="content">.*?<h1>(.+)</h1>', info, re.S)
    if not names:
        raise IndexError('celebrity name not found in page')
    name = names[0]

    sex = _first_match(
        r'<span>性别<.+>:\s*(.*)\s*', info, 'Can not find actor sex')
    constellation = _first_match(
        r'<span>星座<.+>:\s*(.*)\s*', info, 'Can not find constellation')
    # The page carries either a birth date or a birth/death date range.
    birthday = _first_match(r'<span>出生日期<.+>:\s*(.*)\s*', info)
    if birthday is None:
        birthday = _first_match(
            r'<span>生卒日期<.+>:\s*(.*)\s*', info, 'Can not find birthday')
    birthplace = _first_match(
        r'<span>出生地<.+>:\s*(.*)\s*', info, 'Can not find birthplace')
    profession = _first_match(
        r'<span>职业<.+>:\s*(.*)\s*', info, 'Can not find profession')
    imdb_number = _first_match(
        r'<span>imdb编号<.+>:\s*.+>(.+)</a>', info, 'Can not find IMDB编号')

    # Prefer the full ("all hidden") biography; fall back to the plain one.
    introduce = _first_match(r'<span class="all hidden">\s*(.+)<', info)
    if introduce is None:
        introduce = _first_match(
            r'<h2>\s*影人简介\s*.+\s*<.+>\s*</div>\s*<div class="bd">\s*(.+)\s*',
            info)
    # An empty biography section makes the fallback pattern capture the
    # closing tag; treat that as "no biography".
    if introduce == '</div>':
        introduce = None

    data = [
        person_id, name, sex, constellation, birthday, birthplace,
        profession, imdb_number, introduce,
    ]
    db = DbHandle()
    db.table = 'movie_person'
    if not db.get_by_id(_id=int(person_id)):
        print(data)
        db.save(data)
    else:
        print('Already have this')