コード例 #1
0
def check_update(local_version):
    """Compare *local_version* against the latest GitHub release tag.

    Prints a download banner when a newer release exists.  Any network or
    JSON failure is non-fatal: a manual-check hint is printed and the
    function simply returns.
    """
    try:
        data = json.loads(get_html("https://api.github.com/repos/yoshiko2/AV_Data_Capture/releases/latest"))
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate instead of being swallowed here.
        print("[-]Failed to update! Please check new version manually:")
        print("[-] https://github.com/yoshiko2/AV_Data_Capture/releases")
        print("[*]======================================================")
        return

    # Versions are compared by stripping dots ("1.2.3" -> 123).  NOTE: this
    # only orders correctly while every component stays a single digit
    # ("1.10" would compare as 110 vs "1.2" as 12).
    remote = int(data["tag_name"].replace(".", ""))
    local_version = int(local_version.replace(".", ""))
    if local_version < remote:
        print("[*]" + ("* New update " + str(data["tag_name"]) + " *").center(54))
        print("[*]" + "↓ Download ↓".center(54))
        print("[*]https://github.com/yoshiko2/AV_Data_Capture/releases")
        print("[*]======================================================")
コード例 #2
0
def check_update(local_version):
    """Compare *local_version* against the latest GitHub release tag.

    On connection failure this variant deliberately hard-exits the whole
    process after a 60-second grace period (behavior preserved).
    """
    htmlcode = ""
    try:
        htmlcode = get_html(
            "https://api.github.com/repos/yoshiko2/Movie_Data_Capture/releases/latest"
        )
    except Exception:
        # Narrowed from a bare `except:`; Ctrl-C during the fetch now
        # interrupts instead of triggering the 60s auto-exit path.
        print("===== Failed to connect to github =====")
        print("========== AUTO EXIT IN 60s ===========")
        time.sleep(60)
        os._exit(-1)
    data = json.loads(htmlcode)
    # Dot-stripped comparison ("1.2.3" -> 123); correct only while each
    # version component is a single digit.
    remote = int(data["tag_name"].replace(".", ""))
    local_version = int(local_version.replace(".", ""))
    if local_version < remote:
        print("[*]" +
              ("* New update " + str(data["tag_name"]) + " *").center(54))
        print("[*]" + "↓ Download ↓".center(54))
        print("[*]https://github.com/yoshiko2/Movie_Data_Capture/releases")
        print("[*]======================================================")
コード例 #3
0
def main(number: str):
    """Scrape the m45e javlib mirror for *number*.

    Returns pretty-printed JSON metadata, or a JSON empty object when the
    cookie handshake fails, or JSON for an empty dict when the search does
    not redirect to a detail page.
    """
    raw_cookies, user_agent = get_javlib_cookie()

    # An empty cookie jar means the javlib site returned an error page.
    if not raw_cookies:
        return json.dumps({},
                          ensure_ascii=False,
                          sort_keys=True,
                          indent=4,
                          separators=(',', ':'))

    # Convert the raw Set-Cookie text into a plain name -> value dict.
    jar = SimpleCookie()
    jar.load(raw_cookies)
    cookies = {name: morsel.value for name, morsel in jar.items()}

    # Fetch the search-by-id page; an exact match redirects to "/?v=jav...".
    result = get_html(
        "http://www.m45e.com/cn/vl_searchbyid.php?keyword={}".format(number),
        cookies=cookies,
        ua=user_agent,
        return_type="object")
    soup = BeautifulSoup(result.text, "html.parser")
    lx = html.fromstring(str(soup))

    if "/?v=jav" in result.url:
        dic = {
            "title": get_title(lx, soup),
            "studio": get_table_el_single_anchor(soup, "video_maker"),
            "year": get_table_el_td(soup, "video_date")[:4],
            "outline": "",
            "director": get_table_el_single_anchor(soup, "video_director"),
            "cover": get_cover(lx),
            "imagecut": 1,
            "actor_photo": "",
            "website": result.url,
            "source": "javlib.py",
            "actor": get_table_el_multi_anchor(soup, "video_cast"),
            "label": get_table_el_td(soup, "video_label"),
            "tag": get_table_el_multi_anchor(soup, "video_genres"),
            "number": get_table_el_td(soup, "video_id"),
            "release": get_table_el_td(soup, "video_date"),
            "runtime": get_from_xpath(
                lx, '//*[@id="video_length"]/table/tr/td[2]/span/text()'),
            "series": '',
        }
    else:
        dic = {}

    return json.dumps(dic,
                      ensure_ascii=False,
                      sort_keys=True,
                      indent=4,
                      separators=(',', ':'))
コード例 #4
0
ファイル: javlib.py プロジェクト: yoshiko2/AV_Data_Capture
def _detail_dic(lx, soup, result, number):
    """Assemble the metadata dict from a javlibrary video detail page.

    Extracted because the direct-hit and search-result branches of main()
    previously duplicated this dict literal verbatim.
    """
    return {
        "title": get_title(lx, soup),
        "studio": get_table_el_single_anchor(soup, "video_maker"),
        "year": get_table_el_td(soup, "video_date")[:4],
        "outline": get_outline(number),
        "director": get_table_el_single_anchor(soup, "video_director"),
        "cover": get_cover(lx),
        "imagecut": 1,
        "actor_photo": "",
        "website": result.url,
        "source": "javlib.py",
        "actor": get_table_el_multi_anchor(soup, "video_cast"),
        "label": get_table_el_td(soup, "video_label"),
        "tag": get_table_el_multi_anchor(soup, "video_genres"),
        "number": get_table_el_td(soup, "video_id"),
        "release": get_table_el_td(soup, "video_date"),
        "runtime": get_from_xpath(lx, '//*[@id="video_length"]/table/tr/td[2]/span/text()'),
        "series": '',
    }


def main(number: str):
    """Scrape javlibrary.com for *number* and return pretty-printed JSON.

    Falls back to ``{"title": ""}`` when the id is not found and to an
    empty object when the cookie handshake fails.
    """
    raw_cookies, user_agent = get_javlib_cookie()

    # Blank cookies mean the javlib site returned an error page.
    if not raw_cookies:
        return json.dumps({}, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))

    # Manually construct a cookie name -> value dictionary.
    s_cookie = SimpleCookie()
    s_cookie.load(raw_cookies)
    cookies = {key: morsel.value for key, morsel in s_cookie.items()}

    # Search by id; an exact match redirects straight to the detail page.
    result = get_html(
        "http://www.javlibrary.com/cn/vl_searchbyid.php?keyword={}".format(number),
        cookies=cookies,
        ua=user_agent,
        return_type="object"
    )
    soup = BeautifulSoup(result.text, "html.parser")
    lx = html.fromstring(str(soup))

    # Ids listed on a search-result page (used to detect an indirect match).
    fanhao_pather = re.compile(r'<a href=".*?".*?><div class="id">(.*?)</div>')
    fanhao = fanhao_pather.findall(result.text)

    if "/?v=jav" in result.url:
        # Redirected directly to a detail page.
        dic = _detail_dic(lx, soup, result, number)
    elif number.upper() in fanhao:
        # Search-result page: map each listed id to its detail-page URL,
        # then fetch the page for the requested id.
        url_pather = re.compile(r'<a href="(.*?)".*?><div class="id">(.*?)</div>')
        s = {}
        url_list = url_pather.findall(result.text)
        for url in url_list:
            s[url[1]] = 'http://www.javlibrary.com/cn/' + url[0].lstrip('.')
        av_url = s[number.upper()]
        result = get_html(
            av_url,
            cookies=cookies,
            ua=user_agent,
            return_type="object"
        )
        soup = BeautifulSoup(result.text, "html.parser")
        lx = html.fromstring(str(soup))

        dic = _detail_dic(lx, soup, result, number)
    else:
        dic = {"title": ""}

    return json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))
コード例 #5
0
def main(number: str):
    """Scrape the b47w javlibrary mirror for *number*; return JSON metadata.

    Raises ValueError when a search-result page contains no matching entry
    or when the user picks an out-of-range index interactively.
    """
    number = number.upper()
    oldNumber = number

    # "<digits>ID-<rest>" ids are listed on the site as "ID-<digits><rest>";
    # rewrite for the search but report oldNumber back to the caller.
    if re.match(r'^([0-9]+)ID-(.+)$', number):
        g = re.search(r'^([0-9]+)ID-(.+)$', number)
        number = 'ID-' + g[1] + g[2]

    # NOTE: the javlib cookie handshake is intentionally disabled for this
    # mirror — it is fetched without cookies or a custom user agent.

    # Scraping
    result = get_html(
        "http://www.b47w.com/cn/vl_searchbyid.php?keyword={}".format(number),
        return_type="object")
    soup = BeautifulSoup(result.text, "html.parser")
    lx = html.fromstring(str(soup))

    # If we landed on a search-result page ("识别码搜寻结果" heading), pick
    # the entry whose id matches.  NOTE(review): `find(...) > 0` misses a
    # match at index 0 — presumably the heading never starts the string.
    multiLabel = get_from_xpath(lx, '//*[@id="rightcolumn"]/div[1]/text()')
    if multiLabel.find('识别码搜寻结果') > 0:
        links = []
        titles = []

        for i in range(1, get_link_count(lx) + 1):
            vid, href, title = get_link(lx, i)
            # Skip Blu-ray disc duplicates of the same release.
            if title.count('(ブルーレイディスク)') > 0:
                continue
            if vid.upper() == number:
                links.append('http://www.b47w.com/cn' + href)
                titles.append(title)

        if not links:
            # BUG FIX: the original ran `link = links[0]` unconditionally in
            # its else-branch, raising IndexError before its own
            # `if link == '': raise ValueError("no match")` check could run.
            raise ValueError("no match")

        if len(links) > 1:
            # Ambiguous result: let the user choose interactively.
            for i, link in enumerate(links):
                print(str(i + 1) + ": " + titles[i])
                print(link)

            index = int(input("input index: ")) - 1

            if index < 0 or index >= len(links):
                raise ValueError("out of range")

            link = links[index]
        else:
            link = links[0]

        result = get_html(link, return_type="object")
        soup = BeautifulSoup(result.text, "html.parser")
        lx = html.fromstring(str(soup))

    # Best effort: fetch the fanza page for the outline text; any failure
    # just leaves the outline empty.
    try:
        dww_htmlcode = fanza.main_htmlcode(getCID(lx))
    except Exception:
        dww_htmlcode = ''

    realnumber = get_table_el_td(soup, "video_id")
    if oldNumber != number:
        # Report the caller's original id when we rewrote it for the search.
        realnumber = oldNumber

    if "/?v=jav" in result.url:
        dic = {
            "title": get_title(lx, soup),
            "studio": get_table_el_single_anchor(soup, "video_maker"),
            "year": get_table_el_td(soup, "video_date")[:4],
            "outline": getOutline(dww_htmlcode),
            "director": get_table_el_single_anchor(soup, "video_director"),
            "cover": get_cover(lx),
            "imagecut": 1,
            "actor_photo": "",
            # Normalize the mirror host back to the canonical site.
            "website": result.url.replace('www.b47w.com', 'www.javlibrary.com'),
            "source": "javlib.py",
            "actor": get_table_el_multi_anchor(soup, "video_cast"),
            "label": get_table_el_single_anchor(soup, "video_label"),
            "tag": getTag(get_table_el_multi_anchor(soup, "video_genres")),
            "number": realnumber,
            "release": get_table_el_td(soup, "video_date"),
            "runtime": get_from_xpath(
                lx, '//*[@id="video_length"]/table/tr/td[2]/span/text()'),
            "series": '',
        }
    else:
        dic = {}

    return json.dumps(dic,
                      ensure_ascii=False,
                      sort_keys=True,
                      indent=4,
                      separators=(',', ':'))