def main(keyword: str) -> Metadata:
    keyword = keyword.upper()
    query_result = get_html(
        'https://xcity.jp/result_published/?genre=%2Fresult_published%2F&q='
        + keyword.replace('-', '') + '&sg=main&num=30')
    html = etree.fromstring(query_result, etree.HTMLParser())
    urls = html.xpath(
        "//table[contains(@class, 'resultList')]/tr[2]/td[1]/a/@href")[0]
    detail_page = get_html('https://xcity.jp' + urls)
    return Metadata(**{
        'actresses': getActresses(detail_page),
        'title': getTitle(detail_page),
        'studio': getStudio(detail_page),
        'overview': getOverview(detail_page),
        'runtime': getRuntime(detail_page),
        'director': getDirector(detail_page),
        'release': getRelease(detail_page),
        'vid': getVID(detail_page),
        'cover': getCover(detail_page),
        # 'small_cover': '',
        'images': getImages(detail_page),
        'genres': getGenres(detail_page),
        'label': getLabel(detail_page),
        # 'star_photos': '',
        'source': 'https://xcity.jp' + urls,
        'provider': 'xcity',
        'series': getSeries(detail_page),
    })

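# `get_html` is a shared project helper that is not defined in this section.
# The sketch below is only an assumption about its interface, reconstructed
# from the call sites (plain GET, optional `params`/`cookies`, and a
# `raise_for_status` flag that may also be a predicate, as in searchVID);
# it is not the project's actual implementation.
import requests

def get_html(url, params=None, cookies=None, raise_for_status=False):
    resp = requests.get(url, params=params, cookies=cookies)
    if callable(raise_for_status):
        # e.g. `lambda r: r.status_code != 404`: raise for anything but 404
        if raise_for_status(resp):
            resp.raise_for_status()
    elif raise_for_status:
        resp.raise_for_status()
    return resp.text
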
def main(keyword: str) -> Metadata:
    keyword = keyword.upper()
    query_result = get_html('https://javdb.com/search?q=' + keyword + '&f=all')
    html = etree.fromstring(query_result, etree.HTMLParser())
    # javdb sometimes returns multiple results, and the first element may
    # not be the one we are looking for, so iterate over all candidates
    # and find the matching one
    urls = html.xpath('//*[@id="videos"]/div/div/a/@href')
    # Western releases carry no uid on the result page; record the ids
    # ourselves, e.g. ['Blacked', 'Blacked']
    if re.search(r'[a-zA-Z]+\.\d{2}\.\d{2}\.\d{2}', keyword):
        ids = [keyword]
        correct_url = urls[0]
    else:
        ids = html.xpath(
            '//*[@id="videos"]/div/div/a/div[contains(@class, "uid")]/text()')
        correct_url = urls[ids.index(keyword)]
    detail_page = get_html('https://javdb.com' + correct_url,
                           params={'locale': 'zh'})
    # no cut image by default
    # if a gray placeholder image exists, replace it with the normal cover
    # if re.search(r'[a-zA-Z]+\.\d{2}\.\d{2}\.\d{2}', keyword):
    #     small_cover = getSmallCover(query_result)
    # else:
    #     small_cover = getSmallCover(query_result, index=ids.index(keyword))
    # if 'placeholder' in small_cover:
    #     # replace with the normal cover and cut it
    #     small_cover = getCover(detail_page)
    vid = getVID(detail_page)
    title = getTitle(detail_page)
    if title and vid:
        # drop the duplicated vid from the title
        title = title.replace(vid, '').strip()
    return Metadata(**{
        'actresses': getActresses(detail_page),
        'title': title,
        'studio': getStudio(detail_page),
        'overview': getOverview(detail_page),
        'runtime': getRuntime(detail_page),
        'director': getDirector(detail_page),
        'release': getRelease(detail_page),
        'vid': vid,
        'cover': getCover(detail_page),
        # 'small_cover': small_cover,
        'images': getImages(detail_page),
        'genres': getGenres(detail_page),
        'label': getLabel(detail_page),
        # 'star_photos': getActressPhoto(detail_page),
        'source': 'https://javdb.com' + correct_url,
        'provider': 'javdb',
        'series': getSeries(detail_page),
    })

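# A quick illustration of the Western-release detection used above: the
# regex matches the studio.yy.mm.dd naming scheme, which has no uid element
# on the javdb result page (the sample inputs are hypothetical).
import re

_western = re.compile(r'[a-zA-Z]+\.\d{2}\.\d{2}\.\d{2}')
assert _western.search('BLACKED.20.01.31') is not None  # Western scheme
assert _western.search('ABP-123') is None               # JAV code, use uid list
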
def main(keyword: str) -> Metadata:
    keyword = keyword.upper()
    url = f'https://www.dlsite.com/pro/work/=/product_id/{keyword}.html'
    text = get_html(url, cookies={'locale': 'zh-cn'}, raise_for_status=True)
    return Metadata(**{
        'actresses': getActresses(text),
        'title': getTitle(text),
        'studio': getStudio(text),
        'overview': getOverview(text),
        'runtime': '',
        'director': getDirector(text),
        'release': getRelease(text),
        'vid': keyword,
        # getCover returns a protocol-relative URL, so prepend the scheme
        'cover': 'https:' + getCover(text),
        # 'small_cover': '',
        'genres': getGenres(text),
        'label': getLabel(text),
        # 'star_photos': '',
        'source': url,
        'provider': 'dlsite',
        'series': getSeries(text),
    })

def main(keyword: str) -> Metadata:
    keyword = keyword.upper()
    url = f'https://www.mgstage.com/product/product_detail/{keyword}/'
    # the 'adc' cookie marks the age check as already confirmed
    text = str(get_html(url, cookies={'adc': '1'}))
    soup = BeautifulSoup(text, 'lxml')
    a = str(soup.find(attrs={'class': 'detail_data'}))
    b = str(soup.find(attrs={'id': 'introduction'}))
    return Metadata(**{
        'title': getTitle(text),
        'studio': getStudio(a),
        'overview': getOverview(b),
        'runtime': getRuntime(a),
        # 'director': getDirector(a),
        'actresses': getActresses(a),
        'release': getRelease(a),
        'vid': getVID(a),
        'cover': getCover(text),
        # 'small_cover': getSmallCover(text),
        'genres': getGenres(a),
        'label': getLabel(a),
        'images': getImages(text),
        # 'star_photos': '',
        'source': url,
        'provider': 'mgstage',
        'series': getSeries(a),
    })

def searchVID(keyword: str):
    keyword = keyword.upper()

    def _searchVID(text: str):
        _keyword = keyword.replace('-', '').replace('_', '')
        soup = BeautifulSoup(text, 'lxml')
        results = soup.find_all(attrs={'class': 'movie-box'})
        if not results:
            return
        r = re.compile(rf'href="{BASE_URL}/(.*?)"')
        for result in results:
            items = re.findall(r, str(result))
            for vid in items:
                vid = vid.strip().upper()
                if vid.startswith(_keyword):
                    return vid

    # query the censored and uncensored listings concurrently; a 404 only
    # means the keyword has no result on that listing, so don't raise on it
    search_page, search_page_uncensored = concurrentMap(
        lambda url: get_html(url,
                             raise_for_status=lambda r: r.status_code != 404),
        [
            f'{BASE_URL}/search/{keyword}',
            f'{BASE_URL}/uncensored/search/{keyword}',
        ],
        max_workers=2)
    return _searchVID(search_page) or _searchVID(search_page_uncensored)

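# `concurrentMap` is another project helper not shown in this section. A
# minimal sketch of the behavior searchVID appears to rely on (map a
# function over items on a thread pool, preserving input order); this is
# an assumption, not the project's actual implementation.
from concurrent.futures import ThreadPoolExecutor

def concurrentMap(fn, items, max_workers=8):
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # executor.map yields results in the order of `items`, so the two
        # search pages above unpack in the order their URLs were given
        return list(executor.map(fn, items))
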
def main(keyword: str) -> Metadata:
    vid = searchVID(keyword)
    if not vid:
        vid = keyword
    url = f'{BASE_URL}/{vid}'
    text = get_html(url, raise_for_status=True)
    return Metadata(**{
        'title': getTitle(text),
        'studio': getStudio(text) or getPublisher(text),
        'overview': getOverview(text),
        'runtime': getRuntime(text),
        'director': getDirector(text),
        'actresses': getActresses(text),
        'release': getRelease(text),
        'vid': getVID(text),
        'cover': getCover(text),
        'genres': getGenres(text),
        'images': getImages(text),
        'label': getSeries(text),
        'source': url,
        'provider': 'javbus',
        'series': getSeries(text),
    })

def main(keyword: str) -> Metadata:
    # fanza allows letters + numbers + underscores; normalize the input here
    # @note: the only underscore usage found so far is h_test123456789
    fanza_search_number = keyword
    # AV_Data_Capture.py.getVIDber() over-formats the input; restore the h_ prefix
    if fanza_search_number.startswith('h-'):
        fanza_search_number = fanza_search_number.replace('h-', 'h_')
    fanza_search_number = re.sub(r'[^0-9a-zA-Z_]', '',
                                 fanza_search_number).lower()
    fanza_urls = [
        'https://www.dmm.co.jp/digital/videoa/-/detail/=/cid=',
        'https://www.dmm.co.jp/mono/dvd/-/detail/=/cid=',
        'https://www.dmm.co.jp/digital/anime/-/detail/=/cid=',
        'https://www.dmm.co.jp/mono/anime/-/detail/=/cid=',
        'https://www.dmm.co.jp/digital/videoc/-/detail/=/cid=',
        'https://www.dmm.co.jp/digital/nikkatsu/-/detail/=/cid=',
        'https://www.dmm.co.jp/rental/-/detail/=/cid=',
    ]
    chosen_url = ''
    text = ''
    # try each category until one returns a real page
    for url in fanza_urls:
        chosen_url = url + fanza_search_number
        text = get_html(
            'https://www.dmm.co.jp/age_check/=/declared=yes/?{}'.format(
                urlencode({'rurl': chosen_url})),
            # raise_for_status=True,
        )
        if '404 Not Found' not in text:
            break
    if not text or '404 Not Found' in text:
        raise NotFound(f'fanza: {keyword} not found')
    # for some old pages the input number does not match the page: the url
    # may be cid=test012 while the hinban on the page is test00012, so get
    # the hinban first and pass it to the following functions
    fanza_hinban = getVID(text)
    return Metadata(**{
        'title': getTitle(text).strip(),
        'studio': getStudio(text),
        'overview': getOverview(text),
        'runtime': getRuntime(text),
        'director': getDirector(text) if 'anime' not in chosen_url else '',
        'actresses': getActresses(text) if 'anime' not in chosen_url else '',
        'release': getRelease(text),
        'vid': fanza_hinban,
        'cover': getCover(text, fanza_hinban),
        'genres': getGenres(text),
        'images': getImages(text),
        'label': getLabel(text),
        'source': chosen_url,
        'provider': 'fanza',
        'series': getSeries(text),
    })

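# The cid normalization above, extracted into a hypothetical helper with a
# couple of sample inputs (the helper exists only for illustration):
import re

def _normalize_cid(keyword):
    if keyword.startswith('h-'):
        keyword = keyword.replace('h-', 'h_')
    return re.sub(r'[^0-9a-zA-Z_]', '', keyword).lower()

assert _normalize_cid('h-test123456789') == 'h_test123456789'  # h_ restored
assert _normalize_cid('SSIS-001') == 'ssis001'  # hyphen stripped, lowercased
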
def getOnePhoto(url: str) -> str:
    html_page = get_html(url)
    ir = re.compile(
        r'<span class="avatar" style="background-image: url\((.*?)\)')
    img_url = ir.findall(html_page)
    if img_url:
        return img_url[0]
    return ''

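# What the avatar regex above extracts, shown against hypothetical markup
# shaped like xslist's inline background-image style:
import re

_avatar = re.compile(r'<span class="avatar" style="background-image: url\((.*?)\)')
_sample = '<span class="avatar" style="background-image: url(/img/a.jpg)">'
assert _avatar.findall(_sample) == ['/img/a.jpg']
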
def main(name: str):
    url = parseURL(name)
    if not url:
        return
    text = get_html(url, raise_for_status=True)
    return Actress(extractName(text),
                   *extractInfo(text),
                   source=url,
                   provider='xslist')

def search(name: str) -> list[tuple[str, str]]:
    text = get_html(f'{XSLIST_URL}/search',
                    params={'query': name, 'lg': 'zh'})
    tree = etree.fromstring(text, etree.HTMLParser())
    results = []
    for item in tree.xpath('//ul/li'):
        title = item.xpath('.//h3/a/text()')[0]
        href = item.xpath('.//h3/a/@href')[0]
        results.append((title, href))
    return results

def main(keyword: str) -> Metadata:
    avsox_site = _getAvsoxSite()

    def search_url(v):
        # return the first search hit together with the search page itself
        x = get_html(avsox_site + '/ja/search/' + v)
        tree = etree.fromstring(x, etree.HTMLParser())
        for r in tree.xpath('//*[@id="waterfall"]/div/a/@href'):
            return str(r), x
        return None, x

    url, search_page = search_url(keyword)
    # a href containing more than one 'https://' is malformed; treat it as a miss
    if not url or url.count('https://') > 1:
        raise NotFound(f'avsox: {keyword} not found')
    text = get_html(url, raise_for_status=True)
    soup = BeautifulSoup(text, 'lxml')
    info = str(soup.find(attrs={'class': 'row movie'}))
    return Metadata(**{
        'actresses': getActresses(text),
        'title': getTitle(text),
        'studio': getStudio(info),
        'overview': '',
        # 'runtime': getRuntime(info),
        'director': '',
        # 'release': getRelease(info),
        'vid': getVID(info),
        'cover': getCover(text),
        # 'small_cover': getSmallCover(search_page),
        'genres': getGenres(text),
        'label': getLabel(info),
        'source': url,
        'provider': 'avsox',
        'series': getSeries(info),
    })

def getGenres(number: str) -> list[str]:
    url = f'https://adult.contents.fc2.com/api/v4/article/{number}/tag?'
    data = json.loads(get_html(url))
    if data['code'] != 200:
        raise ValueError(f"Bad code: {data['code']}")
    return [i['tag'] for i in data.get('tags', [])]

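# The shape of the v4 tag endpoint's response is inferred from the parsing
# above, not from any FC2 documentation; a stubbed payload for illustration:
_sample = {'code': 200, 'tags': [{'tag': 'COS'}, {'tag': '素人'}]}
assert [i['tag'] for i in _sample.get('tags', [])] == ['COS', '素人']
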
def _getAvsoxSite() -> str:
    # tellme.pw/avsox publishes the current address of the avsox mirror
    text = get_html('https://tellme.pw/avsox')
    return etree.HTML(text).xpath('//div[@class="container"]/div/a/@href')[0]

def _getIndex() -> dict:
    url = f'{REPO_RAW_URL}/Filetree.json'
    return json.loads(get_html(url, raise_for_status=True))