Beispiel #1
0
def _report_insert(insert_fn, args, expt_queue, proc_num, source, what, label):
    # Run one database insert; on failure, log it and queue an AvExpt instead
    # of aborting the crawl (best-effort persistence, matching original flow).
    try:
        insert_fn(*args)
    except Exception as e:
        print('Process {} facing exception when insert {}:'.format(
            proc_num, what))
        print(e)
        expt_queue.put(
            AvExpt(proc_num, 'database_insert', source, str(e), label))


def crawl_movie(av_info, expt_queue, proc_num):
    """Crawl one movie page and persist its data to the database.

    Fetches the page referenced by ``av_info[0]`` (a URL — presumably a
    javbus detail/listing link; ``av_info[1]`` is forwarded to
    ``get_movie_class`` and its meaning is not visible here — TODO confirm)
    and stores the movie record plus its star, genre, magnet-link and
    sample-image rows.

    Parameters:
        av_info: sequence whose first element is the movie page URL.
        expt_queue: queue collecting AvExpt error reports for the caller.
        proc_num: worker-process id, used only in log messages.

    Returns:
        None.  Crawl failures (AvExpt from page parsing) abort early;
        database-insert failures are reported to expt_queue and the
        crawl continues.
    """
    av_num = get_av_num(av_info[0])
    # Skip movies already stored (dropped the un-Pythonic `is True` check).
    if database.check_existence(av_num):
        return
    print('Process {} get avnum: {}'.format(proc_num, av_num))

    # Fetch and parse the movie page once; everything below reads this soup.
    soup = get_movie_soup(av_info, proc_num)

    # Build the movie record; if this fails the rest is pointless, so bail.
    try:
        movie = get_movie_class(soup, av_num, av_info[1])
    except AvExpt as e:
        # BUGFIX: the original message said "when database inserting",
        # but this failure occurs while crawling/parsing the movie page.
        print('Process {} facing exception when crawling movie:'.format(
            proc_num))
        print(e)
        expt_queue.put(e)
        return
    print('movie class:', movie)

    # Related data extracted from the same soup.
    star_id_iter = get_star_iter(soup)
    genres = get_genre_iter(soup)

    try:
        link_iter = get_download_iter(soup, av_info[0], av_num, proc_num)
    except AvExpt as e:
        print(
            'Process {} facing exception when crawling link:'.format(proc_num))
        print(e)
        expt_queue.put(e)
        return

    images = get_sample_img_iter(soup)

    # Movie row goes in first: the m_s / m_g link rows below reference it.
    _report_insert(database.insert_movie, (movie,), expt_queue, proc_num,
                   av_info[0], 'movie', 'movie_insert')

    # Stars: crawl any star not yet in the database, then link it to the
    # movie.  The link row is attempted in both paths, as in the original.
    for s in star_id_iter:
        if not database.check_stars(s[0]):
            try:
                p = get_star(s, proc_num)
            except AvExpt as e:
                print('Process {} facing exception when crawling star:'.format(
                    proc_num))
                print(e)
                expt_queue.put(e)
                continue
            _report_insert(database.insert_star, (p,), expt_queue, proc_num,
                           av_info[0], 'star', 'star_insert')
        _report_insert(database.insert_m_s, (av_num, s[0]), expt_queue,
                       proc_num, av_info[0], 'm_s', 'm_s_insert')

    # Genres: same crawl-if-missing-then-link pattern as stars.
    for g in genres:
        if not database.check_genres(g[0]):
            try:
                p = crawl_genre(g, proc_num)
            except AvExpt as e:
                print(
                    'Process {} facing exception when crawling genre:'.format(
                        proc_num))
                print(e)
                expt_queue.put(e)
                continue
            _report_insert(database.insert_genre, (p,), expt_queue, proc_num,
                           av_info[0], 'genre', 'genre_insert')
        _report_insert(database.insert_m_g, (av_num, g[0]), expt_queue,
                       proc_num, av_info[0], 'm_g', 'm_g_insert')

    # Magnet / download links.
    for li in link_iter:
        _report_insert(database.insert_magnet, (li,), expt_queue, proc_num,
                       av_info[0], 'magnet', 'magnet_insert')

    # Sample-image URLs.
    for im in images:
        _report_insert(database.insert_img, (im, av_num), expt_queue,
                       proc_num, av_info[0], 'img', 'img_insert')

    print('Process {} 已扒取完毕:第 {} 页 番号:{}'.format(
        str(proc_num), str(os.path.basename(av_info[0])), av_num))

    return
Beispiel #2
0
def parse_page(url, thread_num, counter):
    """Crawl every movie listed on one index page and persist it.

    Skips movies already present in the database, bumps the parse counter
    once per newly stored movie, and returns the URL of the next index
    page so the caller can continue paging.
    """
    # Fetch the index page once; derive the movie list and the next-page
    # URL from the same soup.
    index_soup = parser.get_main_page_soup(url)
    movie_boxes = parser.get_movie_page_list(index_soup)
    next_page = parser.get_next_page_url(index_soup)

    for box in movie_boxes:
        av_num = parser.get_av_num(box)

        # Already stored — nothing to do for this movie.
        if database.check_existence(av_num):
            print('* 已存在 %s 停止爬取 *' % av_num)
            continue

        # Fetch the movie's detail page and report progress.
        detail_soup = parser.get_link_soup(box)
        print('Thread {} 正在扒取:第 {} 页 番号:{}'.format(
            str(thread_num), str(os.path.basename(url)), av_num))

        # Extract everything we persist from the detail soup.
        movie = parser.get_movie(detail_soup, av_num)
        stars = parser.get_star_list(detail_soup)
        links = parser.get_download_link(detail_soup, url, av_num)
        images = parser.get_sample_img_list(detail_soup)

        # Movie row first, then its related rows.
        database.insert_movie(movie)
        for star in stars:
            database.insert_star(star, av_num)
        for magnet in links:
            database.insert_magnet(magnet)
        for image in images:
            database.insert_img(image, av_num)

        counter.increment_parse()

    print('第 ' + str(os.path.basename(url)) + ' 页扒取完毕')
    print('-------------------------')

    return next_page
Beispiel #3
0
# encoding: utf-8
"""
@project = javbus_crawler
@file = parsetest
@author = ThinkPad
@create_time = 2019-10-28 19:48
"""

import parsertest
import database

# Ad-hoc test script: crawl the star list of one fixed movie page and insert
# any stars not already present in the database.  Performs live network and
# database side effects when run.
link_url = 'https://www.dmmsee.icu/AVOP-144'
av_num = 'AVOP-144'
# Fetch and parse the movie detail page.
soup = parsertest.get_link_soup(link_url)
# List of star entries; s[0] is used as the star id below.
star_id_list = parsertest.get_starID_list(soup)
for s in star_id_list:
    print('enter')  # debug trace: confirms the loop body is reached
    if database.check_stars(s[0]):
        # Star already stored — movie/star link insertion left disabled.
        continue  # database.insert_m_s(av_num, s[0])
    else:
        # Unknown star: crawl its details and store the star row only.
        p = parsertest.get_star(s)
        database.insert_star(p)
        # database.insert_m_s(av_num, s[0])