# The scrapers below come from three modules (javbus.py, javdb.py, javlib.py,
# per each result dict's 'source' field). Shared helpers such as get_html()
# and the get*() parsers, plus the fanza module, are defined alongside them in
# this project; the imports below cover what these functions use directly.
import json
import re
import time

from bs4 import BeautifulSoup
from lxml import etree, html


def main_uncensored(number):
    # javbus.py: look up an uncensored title on the Japanese-language site.
    htmlcode = get_html('https://www.javbus.com/ja/' + number)
    if getTitle(htmlcode) == '':
        # Some uncensored IDs use an underscore instead of a hyphen.
        htmlcode = get_html('https://www.javbus.com/ja/' + number.replace('-', '_'))
    try:
        dww_htmlcode = fanza.main_htmlcode(getCID(htmlcode))
    except Exception:
        dww_htmlcode = ''
    dic = {
        'title': str(re.sub(r'\w+-\d+-', '', getTitle(htmlcode))).replace(getNum(htmlcode) + '-', ''),
        'studio': getStudio(htmlcode),
        'year': getYear(htmlcode),
        'outline': getOutline(dww_htmlcode),
        'runtime': getRuntime(htmlcode),
        'director': getDirector(htmlcode),
        'actor': getActor(htmlcode),
        'release': getRelease(htmlcode),
        'number': getNum(htmlcode),
        'cover': getCover(htmlcode),
        'tag': getTag(htmlcode),
        'extrafanart': getExtrafanart(htmlcode),
        'label': getSerise(htmlcode),
        'imagecut': 0,
        'actor_photo': '',
        'website': 'https://www.javbus.com/ja/' + number,
        'source': 'javbus.py',
        'series': getSerise(htmlcode),
    }
    js = json.dumps(
        dic,
        ensure_ascii=False,
        sort_keys=True,
        indent=4,
        separators=(',', ':'),
    )  # .encode('UTF-8')
    return js
def main(number):
    # javbus.py: censored lookup, falling back to the uncensored scraper.
    try:
        try:
            try:
                htmlcode = get_html('https://www.fanbus.us/' + number)
            except Exception:
                htmlcode = get_html('https://www.javbus.com/' + number)
            try:
                dww_htmlcode = fanza.main_htmlcode(getCID(htmlcode))
            except Exception:
                dww_htmlcode = ''
            dic = {
                'title': str(re.sub(r'\w+-\d+-', '', getTitle(htmlcode))),
                'studio': getStudio(htmlcode),
                'year': str(re.search(r'\d{4}', getYear(htmlcode)).group()),
                'outline': getOutline(dww_htmlcode),
                'runtime': getRuntime(htmlcode),
                'director': getDirector(htmlcode),
                'actor': getActor(htmlcode),
                'release': getRelease(htmlcode),
                'number': getNum(htmlcode),
                'cover': getCover(htmlcode),
                'imagecut': 1,
                'tag': getTag(htmlcode),
                'extrafanart': getExtrafanart(htmlcode),
                'label': getSerise(htmlcode),
                'actor_photo': getActorPhoto(htmlcode),
                'website': 'https://www.javbus.com/' + number,
                'source': 'javbus.py',
                'series': getSerise(htmlcode),
            }
            js = json.dumps(
                dic,
                ensure_ascii=False,
                sort_keys=True,
                indent=4,
                separators=(',', ':'),
            )  # .encode('UTF-8')
            return js
        except Exception:
            # The censored page failed to parse; try the uncensored site.
            return main_uncensored(number)
    except Exception:
        # Last resort: an empty title tells the caller the lookup failed.
        data = {
            "title": "",
        }
        js = json.dumps(data, ensure_ascii=False, sort_keys=True, indent=4, separators=(",", ":"))
        return js
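# A minimal usage sketch, not part of the original modules: every main() here
# returns a JSON string rather than a dict, and signals failure with an empty
# 'title', so a caller typically decodes the string and checks that field.
# 'ABC-123' is a placeholder number.
if __name__ == '__main__':
    data = json.loads(main('ABC-123'))
    if data.get('title'):
        print(data['number'], data['title'])
    else:
        print('lookup failed')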
def main(number):
    # javdb.py
    try:
        number = number.upper()
        # raw_cookies, user_agent = get_javdb_cookie()
        #
        # if not raw_cookies:
        #     return json.dumps({}, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))
        #
        # s_cookie = SimpleCookie()
        # s_cookie.load(raw_cookies)
        # cookies = {}
        # for key, morsel in s_cookie.items():
        #     cookies[key] = morsel.value
        #
        # correct_url = ''
        time.sleep(3)
        try:
            # Try the autocomplete AJAX endpoint first.
            query_result = get_html(
                'https://javdb.com/videos/search_autocomplete.json?q=' + number)
            items = json.loads(query_result)
            links = []
            titles = []
            for item in items:
                if item['number'].upper() == number:
                    links.append('/v/' + item['uid'])
                    titles.append(item['title'])
            if len(links) > 1:
                for i, link in enumerate(links):
                    print(str(i + 1) + ": " + titles[i])
                    print('https://javdb.com' + link)
                index = int(input("input index: ")) - 1
                if index < 0 or index >= len(links):
                    raise ValueError("out of range")
                correct_url = links[index]
            else:
                correct_url = links[0]
        except Exception:
            # Fall back to the HTML search page, retrying on rate limiting.
            ok = 0
            for i in range(1, 10):
                try:
                    query_result = get_html('https://javdb.com/search?q=' + number + '&f=all')
                except Exception:
                    query_result = get_html('https://javdb4.com/search?q=' + number + '&f=all')
                html = etree.fromstring(
                    query_result,
                    etree.HTMLParser())  # //table/tr[1]/td[1]/text()
                if str(
                        html.xpath(
                            '/html/body/section/div/div[4]/article/div/text()')
                ).strip(" ['']") == '':
                    ok = 1
                    break
                print("Requests too frequent, retrying: " + str(i))
                time.sleep(30)
            if ok == 0:
                raise ValueError("retry max")
            # javdb sometimes returns multiple results,
            # and the first element may not be the one we are looking for,
            # so iterate over all candidates and find the matching one.
            urls = html.xpath('//*[@id="videos"]/div/div/a/@href')
            ids = html.xpath(
                '//*[@id="videos"]/div/div/a/div[contains(@class, "uid")]/text()'
            )
            allTitles = html.xpath(
                '//*[@id="videos"]/div/div/a/div[contains(@class, "video-title")]/text()'
            )
            links = []
            titles = []
            for i, id in enumerate(ids):
                if id.upper() == number:
                    links.append(urls[i])
                    titles.append(allTitles[i])
            if len(links) > 1:
                for i, link in enumerate(links):
                    print(str(i + 1) + ": " + titles[i])
                    print('https://javdb.com' + link)
                index = int(input("input index: ")) - 1
                if index < 0 or index >= len(links):
                    raise ValueError("out of range")
                correct_url = links[index]
            else:
                correct_url = links[0]
        detail_page = get_html('https://javdb.com' + correct_url)
        # # If a gray placeholder image exists, replace it with the normal cover.
        # cover_small = getCover_small(query_result, index=ids.index(number))
        # if 'placeholder' in cover_small:
        #     cover_small = getCover(detail_page)
        try:
            dww_htmlcode = fanza.main_htmlcode(getCID(detail_page))
        except Exception:
            dww_htmlcode = ''
        dic = {
            'actor': getActor(detail_page),
            'title': getTitle(detail_page).replace(getNum(detail_page), '').strip(),
            'studio': getStudio(detail_page),
            'outline': getOutline(dww_htmlcode),
            'runtime': getRuntime(detail_page),
            'director': getDirector(detail_page),
            'release': getRelease(detail_page),
            'number': getNum(detail_page),
            'cover': getCover(detail_page),
            # 'cover_small': cover_small,
            'imagecut': 1,
            'tag': getTag(detail_page),
            'label': getLabel(detail_page),
            'year': getYear(getRelease(detail_page)),  # str(re.search(r'\d{4}', getRelease(a)).group()),
            'actor_photo': getActorPhoto(getActor(detail_page)),
            'website': 'https://javdb.com' + correct_url,
            'source': 'javdb.py',
            'series': getSeries(detail_page),
        }
        title = dic['title']
        # '無碼' means "uncensored"; this scraper does not support those titles.
        if title.find('無碼') >= 0:
            raise ValueError("unsupport")
    except Exception as e:
        # print(e)
        dic = {"title": ""}
    js = json.dumps(
        dic,
        ensure_ascii=False,
        sort_keys=True,
        indent=4,
        separators=(',', ':'),
    )  # .encode('UTF-8')
    return js
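# The interactive "print candidates, ask for an index" loop above is repeated
# almost verbatim in several of these scrapers. A hypothetical shared helper
# (not in the original modules) could replace those blocks:
def choose_link(links, titles, site=''):
    """Return the single candidate, or prompt the user to pick one."""
    if len(links) == 1:
        return links[0]
    for i, link in enumerate(links):
        print(str(i + 1) + ": " + titles[i])
        print(site + link)
    index = int(input("input index: ")) - 1
    if index < 0 or index >= len(links):
        raise ValueError("out of range")
    return links[index]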
def main(number):
    # javbus.py: search the uncensored listing and scrape the matching entry.
    try:
        number = number.upper()
        # htmlMultiText = get_html('https://www.javbus.com/search/' + number + '&type=1', cookies={'existmag': 'all'})
        htmlMultiText = get_html('https://www.javbus.com/uncensored/search/' + number + '&type=1',
                                 cookies={'existmag': 'all'})
        htmlMulti = etree.fromstring(htmlMultiText, etree.HTMLParser())
        links = htmlMulti.xpath('//*[@id="waterfall"]/div/a/@href')
        titles = htmlMulti.xpath('//*[@id="waterfall"]/div/a/div/span/text()[1]')
        ids = htmlMulti.xpath('//*[@id="waterfall"]/div/a/div/span/date[1]/text()[1]')
        movieList = []
        for i, e in enumerate(links):
            if str(ids[i]).upper().replace('_', '-') == number.replace('_', '-'):
                movie = {'link': str(links[i]), 'title': str(titles[i]), 'id': str(ids[i])}
                movieList.append(movie)
        index = 0
        if len(movieList) <= 0:
            raise ValueError("no movie")
        elif len(movieList) >= 2:
            for i, link in enumerate(movieList):
                print(str(i + 1) + ": " + movieList[i]['title'])
                print(movieList[i]['link'])
            index = int(input("input index: ")) - 1
            if index < 0 or index >= len(movieList):
                raise ValueError("out of range")
        link = movieList[index]['link']
        if link == '':
            raise ValueError("no match")
        htmlcode = get_html(link)
        if isUnCensored(htmlcode) != 1:
            raise ValueError("unsupport")
        try:
            dww_htmlcode = fanza.main_htmlcode(getCID(htmlcode))
        except Exception:
            dww_htmlcode = ''
        dic = {
            'title': getTitle(htmlcode).replace(getNum(htmlcode), '').strip(),
            'studio': getStudio(htmlcode),
            'year': str(re.search(r'\d{4}', getYear(htmlcode)).group()),
            'outline': getOutline(dww_htmlcode),
            'runtime': getRuntime(htmlcode),
            'director': getDirector(htmlcode).strip(),
            'actor': getActor(htmlcode),
            'release': getRelease(htmlcode),
            'number': getNum(htmlcode),
            'cover': getCover(htmlcode),
            'imagecut': 1,
            'tag': getTag(htmlcode),
            'label': getSerise(htmlcode),
            'actor_photo': getActorPhoto(htmlcode),
            'website': 'https://www.javbus.com/' + number,
            'source': 'javbus.py',
            'series': getSerise(htmlcode),
        }
        js = json.dumps(
            dic,
            ensure_ascii=False,
            sort_keys=True,
            indent=4,
            separators=(',', ':'),
        )  # .encode('UTF-8')
        return js
    except Exception:
        data = {
            "title": "",
        }
        js = json.dumps(
            data, ensure_ascii=False, sort_keys=True, indent=4, separators=(",", ":")
        )
        return js
def main(number: str):
    # javlib.py: search a javlibrary mirror and scrape the detail page.
    number = number.upper()
    oldNumber = number
    # Normalize numbers like '123ID-456' to 'ID-123456'.
    if re.match(r'^([0-9]+)ID-(.+)$', number):
        g = re.search(r'^([0-9]+)ID-(.+)$', number)
        number = 'ID-' + g[1] + g[2]
    # raw_cookies, user_agent = get_javlib_cookie()
    #
    # # Blank cookies mean the javlib site returned an error.
    # if not raw_cookies:
    #     return json.dumps({}, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))
    #
    # # Manually construct a dictionary.
    # s_cookie = SimpleCookie()
    # s_cookie.load(raw_cookies)
    # cookies = {}
    # for key, morsel in s_cookie.items():
    #     cookies[key] = morsel.value

    # Scraping
    result = get_html(
        "http://www.b47w.com/cn/vl_searchbyid.php?keyword={}".format(number),
        # cookies=cookies,
        # ua=user_agent,
        return_type="object")
    soup = BeautifulSoup(result.text, "html.parser")
    lx = html.fromstring(str(soup))
    multiLabel = get_from_xpath(lx, '//*[@id="rightcolumn"]/div[1]/text()')
    # '识别码搜寻结果' ("ID search results") means we landed on a results list
    # rather than being redirected straight to a detail page.
    if multiLabel.find('识别码搜寻结果') > 0:
        links = []
        titles = []
        for i in range(1, get_link_count(lx) + 1):
            id, href, title = get_link(lx, i)
            # Skip Blu-ray re-releases ('(ブルーレイディスク)' = "(Blu-ray Disc)").
            if title.count('(ブルーレイディスク)') > 0:
                continue
            if id.upper() == number:
                links.append('http://www.b47w.com/cn' + href)
                titles.append(title)
        link = ''
        if len(links) > 1:
            for i, link in enumerate(links):
                print(str(i + 1) + ": " + titles[i])
                print(link)
            index = int(input("input index: ")) - 1
            if index < 0 or index >= len(links):
                raise ValueError("out of range")
            link = links[index]
        else:
            link = links[0]
        if link == '':
            raise ValueError("no match")
        result = get_html(link, return_type="object")
        soup = BeautifulSoup(result.text, "html.parser")
        lx = html.fromstring(str(soup))
    try:
        dww_htmlcode = fanza.main_htmlcode(getCID(lx))
    except Exception:
        dww_htmlcode = ''
    realnumber = get_table_el_td(soup, "video_id")
    if oldNumber != number:
        realnumber = oldNumber
    if "/?v=jav" in result.url:
        dic = {
            "title": get_title(lx, soup),
            "studio": get_table_el_single_anchor(soup, "video_maker"),
            "year": get_table_el_td(soup, "video_date")[:4],
            "outline": getOutline(dww_htmlcode),
            "director": get_table_el_single_anchor(soup, "video_director"),
            "cover": get_cover(lx),
            "imagecut": 1,
            "actor_photo": "",
            "website": result.url.replace('www.b47w.com', 'www.javlibrary.com'),
            "source": "javlib.py",
            "actor": get_table_el_multi_anchor(soup, "video_cast"),
            "label": get_table_el_single_anchor(soup, "video_label"),
            "tag": getTag(get_table_el_multi_anchor(soup, "video_genres")),
            "number": realnumber,
            "release": get_table_el_td(soup, "video_date"),
            "runtime": get_from_xpath(
                lx, '//*[@id="video_length"]/table/tr/td[2]/span/text()'),
            "series": '',
        }
    else:
        dic = {}
    return json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))
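# All of the scrapers above assume a project-wide get_html() helper whose real
# implementation lives elsewhere. A rough sketch consistent with the call sites
# seen here (optional cookies, and return_type="object" yielding a response
# with .text and .url) might look like this; the User-Agent and timeout values
# are assumptions, not taken from the original module:
import requests

def get_html(url, cookies=None, return_type=None):
    resp = requests.get(url, cookies=cookies,
                        headers={'User-Agent': 'Mozilla/5.0'}, timeout=30)
    resp.encoding = resp.apparent_encoding  # guard against mis-detected charsets
    if return_type == 'object':
        return resp  # callers use resp.text and resp.url
    return resp.text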