# The scrapers below come from three modules (javbus.py, javdb.py, javlib.py,
# per each result dict's 'source' field). Shared helpers such as get_html()
# and the get*() parsers, plus the fanza module, are defined alongside them in
# this project; the imports below cover what these functions use directly.
import json
import re
import time

from bs4 import BeautifulSoup
from lxml import etree, html


def main_uncensored(number):
    # javbus.py: look up an uncensored title on the Japanese-language site.
    htmlcode = get_html('https://www.javbus.com/ja/' + number)
    if getTitle(htmlcode) == '':
        # Some uncensored IDs use an underscore instead of a hyphen.
        htmlcode = get_html('https://www.javbus.com/ja/' + number.replace('-', '_'))
    try:
        dww_htmlcode = fanza.main_htmlcode(getCID(htmlcode))
    except Exception:
        dww_htmlcode = ''
    dic = {
        'title': str(re.sub(r'\w+-\d+-', '', getTitle(htmlcode))).replace(getNum(htmlcode) + '-', ''),
        'studio': getStudio(htmlcode),
        'year': getYear(htmlcode),
        'outline': getOutline(dww_htmlcode),
        'runtime': getRuntime(htmlcode),
        'director': getDirector(htmlcode),
        'actor': getActor(htmlcode),
        'release': getRelease(htmlcode),
        'number': getNum(htmlcode),
        'cover': getCover(htmlcode),
        'tag': getTag(htmlcode),
        'extrafanart': getExtrafanart(htmlcode),
        'label': getSerise(htmlcode),
        'imagecut': 0,
        'actor_photo': '',
        'website': 'https://www.javbus.com/ja/' + number,
        'source': 'javbus.py',
        'series': getSerise(htmlcode),
    }
    js = json.dumps(
        dic,
        ensure_ascii=False,
        sort_keys=True,
        indent=4,
        separators=(',', ':'),
    )  # .encode('UTF-8')
    return js
def main(number):
    # javbus.py: censored lookup, falling back to the uncensored scraper.
    try:
        try:
            try:
                htmlcode = get_html('https://www.fanbus.us/' + number)
            except Exception:
                htmlcode = get_html('https://www.javbus.com/' + number)
            try:
                dww_htmlcode = fanza.main_htmlcode(getCID(htmlcode))
            except Exception:
                dww_htmlcode = ''
            dic = {
                'title': str(re.sub(r'\w+-\d+-', '', getTitle(htmlcode))),
                'studio': getStudio(htmlcode),
                'year': str(re.search(r'\d{4}', getYear(htmlcode)).group()),
                'outline': getOutline(dww_htmlcode),
                'runtime': getRuntime(htmlcode),
                'director': getDirector(htmlcode),
                'actor': getActor(htmlcode),
                'release': getRelease(htmlcode),
                'number': getNum(htmlcode),
                'cover': getCover(htmlcode),
                'imagecut': 1,
                'tag': getTag(htmlcode),
                'extrafanart': getExtrafanart(htmlcode),
                'label': getSerise(htmlcode),
                'actor_photo': getActorPhoto(htmlcode),
                'website': 'https://www.javbus.com/' + number,
                'source': 'javbus.py',
                'series': getSerise(htmlcode),
            }
            js = json.dumps(
                dic,
                ensure_ascii=False,
                sort_keys=True,
                indent=4,
                separators=(',', ':'),
            )  # .encode('UTF-8')
            return js
        except Exception:
            # The censored page failed to parse; try the uncensored site.
            return main_uncensored(number)
    except Exception:
        # Last resort: an empty title tells the caller the lookup failed.
        data = {
            "title": "",
        }
        js = json.dumps(data, ensure_ascii=False, sort_keys=True, indent=4, separators=(",", ":"))
        return js
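# A minimal usage sketch, not part of the original modules: every main() here
# returns a JSON string rather than a dict, and signals failure with an empty
# 'title', so a caller typically decodes the string and checks that field.
# 'ABC-123' is a placeholder number.
if __name__ == '__main__':
    data = json.loads(main('ABC-123'))
    if data.get('title'):
        print(data['number'], data['title'])
    else:
        print('lookup failed')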
def main(number):
    # javdb.py
    try:
        number = number.upper()
        # raw_cookies, user_agent = get_javdb_cookie()
        #
        # if not raw_cookies:
        #     return json.dumps({}, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))
        #
        # s_cookie = SimpleCookie()
        # s_cookie.load(raw_cookies)
        # cookies = {}
        # for key, morsel in s_cookie.items():
        #     cookies[key] = morsel.value
        #
        # correct_url = ''
        time.sleep(3)
        try:
            # Try the autocomplete AJAX endpoint first.
            query_result = get_html(
                'https://javdb.com/videos/search_autocomplete.json?q=' + number)
            items = json.loads(query_result)
            links = []
            titles = []
            for item in items:
                if item['number'].upper() == number:
                    links.append('/v/' + item['uid'])
                    titles.append(item['title'])
            if len(links) > 1:
                for i, link in enumerate(links):
                    print(str(i + 1) + ": " + titles[i])
                    print('https://javdb.com' + link)
                index = int(input("input index: ")) - 1
                if index < 0 or index >= len(links):
                    raise ValueError("out of range")
                correct_url = links[index]
            else:
                correct_url = links[0]
        except Exception:
            # Fall back to the HTML search page, retrying on rate limiting.
            ok = 0
            for i in range(1, 10):
                try:
                    query_result = get_html('https://javdb.com/search?q=' + number + '&f=all')
                except Exception:
                    query_result = get_html('https://javdb4.com/search?q=' + number + '&f=all')
                html = etree.fromstring(
                    query_result,
                    etree.HTMLParser())  # //table/tr[1]/td[1]/text()
                if str(
                        html.xpath(
                            '/html/body/section/div/div[4]/article/div/text()')
                ).strip(" ['']") == '':
                    ok = 1
                    break
                print("Requests too frequent, retrying: " + str(i))
                time.sleep(30)
            if ok == 0:
                raise ValueError("retry max")
            # javdb sometimes returns multiple results,
            # and the first element may not be the one we are looking for,
            # so iterate over all candidates and find the matching one.
            urls = html.xpath('//*[@id="videos"]/div/div/a/@href')
            ids = html.xpath(
                '//*[@id="videos"]/div/div/a/div[contains(@class, "uid")]/text()'
            )
            allTitles = html.xpath(
                '//*[@id="videos"]/div/div/a/div[contains(@class, "video-title")]/text()'
            )
            links = []
            titles = []
            for i, id in enumerate(ids):
                if id.upper() == number:
                    links.append(urls[i])
                    titles.append(allTitles[i])
            if len(links) > 1:
                for i, link in enumerate(links):
                    print(str(i + 1) + ": " + titles[i])
                    print('https://javdb.com' + link)
                index = int(input("input index: ")) - 1
                if index < 0 or index >= len(links):
                    raise ValueError("out of range")
                correct_url = links[index]
            else:
                correct_url = links[0]
        detail_page = get_html('https://javdb.com' + correct_url)
        # # If a gray placeholder image exists, replace it with the normal cover.
        # cover_small = getCover_small(query_result, index=ids.index(number))
        # if 'placeholder' in cover_small:
        #     cover_small = getCover(detail_page)
        try:
            dww_htmlcode = fanza.main_htmlcode(getCID(detail_page))
        except Exception:
            dww_htmlcode = ''
        dic = {
            'actor': getActor(detail_page),
            'title': getTitle(detail_page).replace(getNum(detail_page), '').strip(),
            'studio': getStudio(detail_page),
            'outline': getOutline(dww_htmlcode),
            'runtime': getRuntime(detail_page),
            'director': getDirector(detail_page),
            'release': getRelease(detail_page),
            'number': getNum(detail_page),
            'cover': getCover(detail_page),
            # 'cover_small': cover_small,
            'imagecut': 1,
            'tag': getTag(detail_page),
            'label': getLabel(detail_page),
            'year': getYear(getRelease(detail_page)),  # str(re.search(r'\d{4}', getRelease(a)).group()),
            'actor_photo': getActorPhoto(getActor(detail_page)),
            'website': 'https://javdb.com' + correct_url,
            'source': 'javdb.py',
            'series': getSeries(detail_page),
        }
        title = dic['title']
        # '無碼' means "uncensored"; this scraper does not support those titles.
        if title.find('無碼') >= 0:
            raise ValueError("unsupport")
    except Exception as e:
        # print(e)
        dic = {"title": ""}
    js = json.dumps(
        dic,
        ensure_ascii=False,
        sort_keys=True,
        indent=4,
        separators=(',', ':'),
    )  # .encode('UTF-8')
    return js
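# The interactive "print candidates, ask for an index" loop above is repeated
# almost verbatim in several of these scrapers. A hypothetical shared helper
# (not in the original modules) could replace those blocks:
def choose_link(links, titles, site=''):
    """Return the single candidate, or prompt the user to pick one."""
    if len(links) == 1:
        return links[0]
    for i, link in enumerate(links):
        print(str(i + 1) + ": " + titles[i])
        print(site + link)
    index = int(input("input index: ")) - 1
    if index < 0 or index >= len(links):
        raise ValueError("out of range")
    return links[index]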
def main(number):
    # javbus.py: search the uncensored listing and scrape the matching entry.
    try:
        number = number.upper()
        # htmlMultiText = get_html('https://www.javbus.com/search/' + number + '&type=1', cookies={'existmag': 'all'})
        htmlMultiText = get_html('https://www.javbus.com/uncensored/search/' + number + '&type=1',
                                 cookies={'existmag': 'all'})
        htmlMulti = etree.fromstring(htmlMultiText, etree.HTMLParser())
        links = htmlMulti.xpath('//*[@id="waterfall"]/div/a/@href')
        titles = htmlMulti.xpath('//*[@id="waterfall"]/div/a/div/span/text()[1]')
        ids = htmlMulti.xpath('//*[@id="waterfall"]/div/a/div/span/date[1]/text()[1]')
        movieList = []
        for i, e in enumerate(links):
            if str(ids[i]).upper().replace('_', '-') == number.replace('_', '-'):
                movie = {'link': str(links[i]), 'title': str(titles[i]), 'id': str(ids[i])}
                movieList.append(movie)
        index = 0
        if len(movieList) <= 0:
            raise ValueError("no movie")
        elif len(movieList) >= 2:
            for i, link in enumerate(movieList):
                print(str(i + 1) + ": " + movieList[i]['title'])
                print(movieList[i]['link'])
            index = int(input("input index: ")) - 1
            if index < 0 or index >= len(movieList):
                raise ValueError("out of range")
        link = movieList[index]['link']
        if link == '':
            raise ValueError("no match")
        htmlcode = get_html(link)
        if isUnCensored(htmlcode) != 1:
            raise ValueError("unsupport")
        try:
            dww_htmlcode = fanza.main_htmlcode(getCID(htmlcode))
        except Exception:
            dww_htmlcode = ''
        dic = {
            'title': getTitle(htmlcode).replace(getNum(htmlcode), '').strip(),
            'studio': getStudio(htmlcode),
            'year': str(re.search(r'\d{4}', getYear(htmlcode)).group()),
            'outline': getOutline(dww_htmlcode),
            'runtime': getRuntime(htmlcode),
            'director': getDirector(htmlcode).strip(),
            'actor': getActor(htmlcode),
            'release': getRelease(htmlcode),
            'number': getNum(htmlcode),
            'cover': getCover(htmlcode),
            'imagecut': 1,
            'tag': getTag(htmlcode),
            'label': getSerise(htmlcode),
            'actor_photo': getActorPhoto(htmlcode),
            'website': 'https://www.javbus.com/' + number,
            'source': 'javbus.py',
            'series': getSerise(htmlcode),
        }
        js = json.dumps(
            dic,
            ensure_ascii=False,
            sort_keys=True,
            indent=4,
            separators=(',', ':'),
        )  # .encode('UTF-8')
        return js
    except Exception:
        data = {
            "title": "",
        }
        js = json.dumps(
            data, ensure_ascii=False, sort_keys=True, indent=4, separators=(",", ":")
        )
        return js
def main(number: str):
    # javlib.py: search a javlibrary mirror and scrape the detail page.
    number = number.upper()
    oldNumber = number
    # Normalize numbers like '123ID-456' to 'ID-123456'.
    if re.match(r'^([0-9]+)ID-(.+)$', number):
        g = re.search(r'^([0-9]+)ID-(.+)$', number)
        number = 'ID-' + g[1] + g[2]
    # raw_cookies, user_agent = get_javlib_cookie()
    #
    # # Blank cookies mean the javlib site returned an error.
    # if not raw_cookies:
    #     return json.dumps({}, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))
    #
    # # Manually construct a dictionary.
    # s_cookie = SimpleCookie()
    # s_cookie.load(raw_cookies)
    # cookies = {}
    # for key, morsel in s_cookie.items():
    #     cookies[key] = morsel.value

    # Scraping
    result = get_html(
        "http://www.b47w.com/cn/vl_searchbyid.php?keyword={}".format(number),
        # cookies=cookies,
        # ua=user_agent,
        return_type="object")
    soup = BeautifulSoup(result.text, "html.parser")
    lx = html.fromstring(str(soup))
    multiLabel = get_from_xpath(lx, '//*[@id="rightcolumn"]/div[1]/text()')
    # '识别码搜寻结果' ("ID search results") means we landed on a results list
    # rather than being redirected straight to a detail page.
    if multiLabel.find('识别码搜寻结果') > 0:
        links = []
        titles = []
        for i in range(1, get_link_count(lx) + 1):
            id, href, title = get_link(lx, i)
            # Skip Blu-ray re-releases ('(ブルーレイディスク)' = "(Blu-ray Disc)").
            if title.count('(ブルーレイディスク)') > 0:
                continue
            if id.upper() == number:
                links.append('http://www.b47w.com/cn' + href)
                titles.append(title)
        link = ''
        if len(links) > 1:
            for i, link in enumerate(links):
                print(str(i + 1) + ": " + titles[i])
                print(link)
            index = int(input("input index: ")) - 1
            if index < 0 or index >= len(links):
                raise ValueError("out of range")
            link = links[index]
        else:
            link = links[0]
        if link == '':
            raise ValueError("no match")
        result = get_html(link, return_type="object")
        soup = BeautifulSoup(result.text, "html.parser")
        lx = html.fromstring(str(soup))
    try:
        dww_htmlcode = fanza.main_htmlcode(getCID(lx))
    except Exception:
        dww_htmlcode = ''
    realnumber = get_table_el_td(soup, "video_id")
    if oldNumber != number:
        realnumber = oldNumber
    if "/?v=jav" in result.url:
        dic = {
            "title": get_title(lx, soup),
            "studio": get_table_el_single_anchor(soup, "video_maker"),
            "year": get_table_el_td(soup, "video_date")[:4],
            "outline": getOutline(dww_htmlcode),
            "director": get_table_el_single_anchor(soup, "video_director"),
            "cover": get_cover(lx),
            "imagecut": 1,
            "actor_photo": "",
            "website": result.url.replace('www.b47w.com', 'www.javlibrary.com'),
            "source": "javlib.py",
            "actor": get_table_el_multi_anchor(soup, "video_cast"),
            "label": get_table_el_single_anchor(soup, "video_label"),
            "tag": getTag(get_table_el_multi_anchor(soup, "video_genres")),
            "number": realnumber,
            "release": get_table_el_td(soup, "video_date"),
            "runtime": get_from_xpath(
                lx, '//*[@id="video_length"]/table/tr/td[2]/span/text()'),
            "series": '',
        }
    else:
        dic = {}
    return json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))
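# All of the scrapers above assume a project-wide get_html() helper whose real
# implementation lives elsewhere. A rough sketch consistent with the call sites
# seen here (optional cookies, and return_type="object" yielding a response
# with .text and .url) might look like this; the User-Agent and timeout values
# are assumptions, not taken from the original module:
import requests

def get_html(url, cookies=None, return_type=None):
    resp = requests.get(url, cookies=cookies,
                        headers={'User-Agent': 'Mozilla/5.0'}, timeout=30)
    resp.encoding = resp.apparent_encoding  # guard against mis-detected charsets
    if return_type == 'object':
        return resp  # callers use resp.text and resp.url
    return resp.text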