def main(number: str) -> Movie:
    """Scrape *number* via javlib, then enrich the result with fields from
    jav321 and javdb.

    javlib is the primary source; the other two are only consulted when
    javlib produced a usable record, and their data is discarded unless
    ``match_movie`` confirms they describe the same title.
    """
    javlib_scrap = javlib.main(number)
    # Primary source failed — nothing to enrich, return the empty record.
    if not javlib_scrap.is_filled():
        return javlib_scrap
    # Use the canonical id resolved by javlib for the secondary lookups.
    jav321_scrap = jav321.main(javlib_scrap.movie_id)
    javdb_scrap = javdb.main(javlib_scrap.movie_id)
    # Blank out secondary results that don't match the primary record,
    # so the merge below treats them as "no data".
    if not javlib_scrap.match_movie(jav321_scrap):
        jav321_scrap = Movie()
    if not javlib_scrap.match_movie(javdb_scrap):
        javdb_scrap = Movie()
    # NOTE(review): series/outline/extra_fanart are overwritten
    # unconditionally here — presumably javlib never supplies these fields;
    # confirm, since any javlib value would be discarded.
    javlib_scrap.series = jav321_scrap.series or javdb_scrap.series
    javlib_scrap.outline = jav321_scrap.outline or javdb_scrap.outline
    javlib_scrap.extra_fanart = (jav321_scrap.extra_fanart or javdb_scrap.extra_fanart)
    javlib_scrap.ratings.extend(javdb_scrap.ratings)
    # Fill remaining gaps only when javlib itself had nothing.
    if not javlib_scrap.actors:
        javlib_scrap.actors = javdb_scrap.actors or jav321_scrap.actors
    if not javlib_scrap.director:
        javlib_scrap.director = javdb_scrap.director or jav321_scrap.director
    if not javlib_scrap.studio:
        javlib_scrap.studio = javdb_scrap.studio or jav321_scrap.studio
    # NOTE(review): redundant with the unconditional series assignment above
    # unless both secondary sources lacked a series — kept as-is.
    if not javlib_scrap.series:
        javlib_scrap.series = javdb_scrap.series or jav321_scrap.series
    javlib_scrap.merge_tags(jav321_scrap)
    javlib_scrap.merge_tags(javdb_scrap)
    return javlib_scrap
def _add_rating(movie: Movie, lx: html.HtmlElement) -> None:
    """Parse the javlib review score (rendered as ``(x.xx)``) from the detail
    page and attach it to *movie* on a 10-point scale.

    Best-effort: a missing or unparseable score is logged at debug level and
    otherwise ignored.
    """
    score = lx.xpath('//*[@id="video_review"]//span[@class="score"]/text()')
    if not score:
        return
    try:
        # Score text looks like "(9.21)" — strip the parentheses/spaces.
        score = float(score[0].strip('( )'))
        movie.add_rating(rating=score, source='javlib', max_rating=10.0)
    except Exception:
        # Was a bare `except:` (also swallowed SystemExit/KeyboardInterrupt);
        # narrowed while keeping the best-effort behavior.
        logger.debug('评分刮削失败。')
def _set_rating(movie: Movie, htmlcode) -> None:
    """Parse the mgstage "評価" (rating) table row and attach a 5-point
    rating with its vote count to *movie*.

    Best-effort: any parse failure is logged at debug level and ignored.
    """
    # Renamed the local from `html` — it shadowed the lxml `html` module
    # used elsewhere in this file.
    tree = etree.fromstring(htmlcode, etree.HTMLParser())
    result = tree.xpath('//th[contains(text(),"評価")]/../td/text()')
    result = ''.join([i.strip() for i in result])
    try:
        # Expected shape like "4.5 (123 ...)": chars [0:3] are the score,
        # the token after position 4 is the vote count.
        movie.add_rating(rating=float(result[:3]),
                         votes=int(result[4:].split(' ')[0]),
                         source='mgstage',
                         max_rating=5.0)
    except Exception:
        # Was a bare `except:`; narrowed so SystemExit/KeyboardInterrupt
        # are no longer swallowed.
        logger.debug('评分刮削失败。')
def set_rating(movie: Movie, htmlcode) -> None:
    """Parse the FC2 star-rating widget and attach a 5-point rating with
    vote count to *movie*.

    Best-effort: a missing widget or unexpected markup is logged at debug
    level and ignored.
    """
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    try:
        rating_div = html.xpath('//a[@class="items_article_Stars"]')[0]
        stars = rating_div.xpath('.//span')
        # The star count is encoded as the last character of the first
        # span's class attribute (e.g. "...Star4" -> 4 stars).
        rating = float(stars[0].attrib['class'][-1])
        votes = int(stars[1].text)
        movie.add_rating(rating=rating, max_rating=5.0, source='fc2', votes=votes)
    except Exception:
        # Was a bare `except:`; narrowed while keeping best-effort behavior.
        logger.debug('评分刮削失败')
def set_rating(movie: Movie, a) -> None:
    """Extract the javdb rating line ("x.xx ... 由N人評價") and attach a
    5-point rating with its vote count to *movie*.

    Does nothing when the rating span is absent or does not match the
    expected pattern.
    """
    tree = etree.fromstring(a, etree.HTMLParser())
    texts = tree.xpath('//strong[contains(text(),"評分")]/../span//text()')
    # Keep only non-empty fragments after stripping whitespace.
    fragments = [t.strip() for t in texts]
    fragments = [t for t in fragments if t]
    if not fragments:
        return
    match = re.search(r'^(\d\.\d+).*由(\d+)人評價', fragments[0])
    if not match:
        return
    movie.add_rating(rating=float(match.group(1)),
                     max_rating=5.0,
                     votes=int(match.group(2)),
                     source='javdb')
def _set_tags(movie: Movie, a) -> None:
    """Collect the mgstage "ジャンル" (genre) entries, deduplicate them, add
    the fixed region tags, and store the result on *movie*.
    """
    tree = etree.fromstring(a, etree.HTMLParser())
    # Genres may appear as anchor text or as bare cell text — query both.
    raw = tree.xpath('//th[contains(text(),"ジャンル:")]/../td/a/text()|'
                     '//th[contains(text(),"ジャンル:")]/../td/text()')
    tags = {t.strip() for t in raw if t.strip()}
    tags.update(('日本', '有码'))
    movie.tags = list(tags)
def main(number: str) -> Movie:
    """Search jav321.com for *number* and scrape the resulting detail page.

    Returns an empty Movie when the search response is blank or did not
    redirect to a ``/video/`` detail URL.
    """
    result = post_html(url="https://www.jav321.com/search", query={"sn": number})
    movie = Movie()
    if not result.text.strip():
        return movie
    soup = BeautifulSoup(result.text, "html.parser")
    lx = html.fromstring(str(soup))
    # A successful search redirects straight to the video detail page.
    if "/video/" in result.url:
        parse_info(soup, movie)
        movie.title = get_title(lx)
        movie.outline = get_outline(lx)
        movie.cover = get_cover(lx)
        movie.extra_fanart = get_extrafanart(result.text)
        movie.trailer = get_trailer(result.text)
        movie.imagecut = 1
        movie.scraper_source = 'jav321'
        movie.website = result.url
    return movie
def parse_info(soup: BeautifulSoup, movie: Movie) -> None:
    """Populate *movie* from the jav321 info column.

    The column is one HTML blob with fields separated by ``<br/>``; each
    fragment is keyed by its bold label text and handed to the field
    extractors. Does nothing when the column is absent.
    """
    data = soup.select_one("div.row > div.col-md-9")
    if not data:
        return
    sections = {}
    for fragment in str(data).split("<br/>"):
        sections[get_bold_text(h=fragment)] = fragment
    movie.actors = get_actor(sections)
    movie.studio = get_studio(sections)
    movie.tags = get_tag(sections)
    movie.release = get_release(sections)
    movie.runtime = get_runtime(sections)
    movie.series = get_series(sections)
    movie.movie_id = get_number(sections)
def main(number: str) -> Movie:
    """Scrape javlibrary for *number*.

    Fetches the site's anti-bot cookies first, runs an id search, and either
    lands directly on a detail page (``/?v=jav`` redirect) or picks the best
    candidate from the result list. Returns an empty Movie on failure.
    """
    raw_cookies, user_agent = get_javlib_cookie()

    # Blank cookies mean the javlib site returned an error.
    # BUGFIX: this used to return a json.dumps(...) *string* from a function
    # annotated -> Movie; the aggregator immediately calls .is_filled() on
    # the result, which would crash on a str. An empty Movie signals
    # "not found" through the same path as every other failure.
    if not raw_cookies:
        return Movie()

    # Convert the raw cookie header into a plain dict for requests.
    s_cookie = SimpleCookie()
    s_cookie.load(raw_cookies)
    cookies = {key: morsel.value for key, morsel in s_cookie.items()}

    # Scraping
    result = get_html(
        "http://www.javlibrary.com/cn/vl_searchbyid.php?keyword={}".format(number),
        cookies=cookies, ua=user_agent, return_type="object")
    soup = BeautifulSoup(result.text, "html.parser")
    lx = html.fromstring(str(soup))

    # An exact id match redirects straight to the detail page.
    if "/?v=jav" in result.url:
        return extract_movie(lx, soup, result.url)

    # Otherwise pick the closest match from the search-result list.
    url = _find_best_movie_match(lx, number)
    if url:
        result = get_html(url, cookies=cookies, ua=user_agent, return_type="object")
        soup = BeautifulSoup(result.text, "html.parser")
        lx = html.fromstring(str(soup))
        return extract_movie(lx, soup, result.url)
    return Movie()
def main(number):
    """Scrape javdb for *number* and return a populated Movie.

    Searches javdb (falling back to the javdb4 mirror), matches the wanted
    id among the result candidates, then scrapes the detail page. Returns an
    empty Movie when no candidate matches or anything goes wrong.
    """
    movie = Movie()
    try:
        number = number.upper()
        # Primary domain first; fall back to the mirror on any failure.
        try:
            query_result = get_html('https://javdb.com/search?q=' + number + '&f=all')
        except Exception:
            # Was a bare `except:`; narrowed to Exception.
            query_result = get_html('https://javdb4.com/search?q=' + number + '&f=all')
        # Renamed from `html` — it shadowed the lxml `html` module.
        tree = etree.fromstring(query_result, etree.HTMLParser())

        # javdb sometimes returns multiple results, and the first element
        # may not be the one we are looking for: iterate all candidates and
        # find the matching one.
        urls = tree.xpath('//*[@id="videos"]/div/div/a/@href')
        correct_url = ''
        if re.search(r'[a-zA-Z]+\.\d{2}\.\d{2}\.\d{2}', number):
            # Western-style ids (e.g. Blacked.20.01.01) have no uid column;
            # take the first hit.
            correct_url = urls[0]
        else:
            # 记录一下欧美的ids ['Blacked','Blacked']
            ids = tree.xpath('//*[@id="videos"]/div/div/a/div[contains(@class, "uid")]/text()')
            ids = [i.upper() for i in ids]
            if number.upper() in ids:
                correct_url = urls[ids.index(number.upper())]
        if not correct_url:
            return Movie()

        detail_page = get_html('https://javdb.com' + correct_url)
        # no cut image by default
        imagecut = 3
        # If a grey placeholder thumbnail exists, replace with normal cover.
        if re.search(r'[a-zA-Z]+\.\d{2}\.\d{2}\.\d{2}', number):
            cover_small = getCover_small(query_result)
        else:
            cover_small = getCover_small(query_result, index=ids.index(number))
        if 'placeholder' in cover_small:
            # replace with normal cover and cut it
            imagecut = 1
            cover_small = getCover(detail_page)

        number = getNum(detail_page)
        title = getTitle(detail_page)
        if title and number:
            # remove duplicate id from the title
            title = title.replace(number, '').strip()
        movie.actors = getActor(detail_page)
        movie.title = title
        movie.studio = getStudio(detail_page)
        movie.outline = getOutline(detail_page)
        movie.runtime = getRuntime(detail_page)
        movie.director = getDirector(detail_page)
        movie.release = getRelease(detail_page)
        movie.movie_id = number
        movie.cover = getCover(detail_page)
        movie.cover_small = cover_small
        movie.trailer = getTrailer(detail_page)
        movie.extra_fanart = getExtrafanart(detail_page)
        movie.imagecut = imagecut
        movie.tags = getTag(detail_page)
        movie.label = getLabel(detail_page)
        # 'actor_photo': getActorPhoto(detail_page),
        movie.website = 'https://javdb.com' + correct_url
        movie.scraper_source = 'javdb'
        movie.series = getSeries(detail_page)
        set_rating(movie, detail_page)
    except Exception as e:
        # Scrapers are best-effort: log with traceback, return empty Movie.
        logger.warning(e, exc_info=True)
        movie = Movie()
    return movie
def get_data_from_json(file_number: str, filepath: str) -> Movie:
    # 从JSON返回元数据
    """Iterate through all scraper services and fetch metadata for
    *file_number*.

    Sources are tried in the configured order; filename-pattern heuristics
    promote the most likely source to the front of the list. Newer scrapers
    return a Movie, legacy ones a JSON string — both are handled. On total
    failure the file is moved to the configured failed folder and an empty
    Movie is returned.
    """
    conf = Config.get_instance()
    func_mapping = {
        "airav": airav.main,
        "avsox": avsox.main,
        "fc2": fc2.main,
        "fanza": fanza.main,
        "javdb": javdb.main,
        "javbus": javbus.main,
        "mgstage": mgstage.main,
        "jav321": jav321.main,
        "xcity": xcity.main,
        "javlib": javlib.main,
        "dlsite": dlsite.main,
        "metajavlib": metajavlib.main,
    }
    # default fetch order list, from the beginning to the end
    sources = conf.sources().split(',')

    # if the input file name matches certain rules,
    # move some web service to the beginning of the list
    if "avsox" in sources and (re.match(r"^\d{5,}", file_number)
                               or "HEYZO" in file_number
                               or "heyzo" in file_number
                               or "Heyzo" in file_number):
        sources.insert(0, sources.pop(sources.index("avsox")))
    elif "mgstage" in sources and (re.match(r"\d+\D+", file_number)
                                   or "SIRO" in file_number.upper()):
        sources.insert(0, sources.pop(sources.index("mgstage")))
    elif "fc2" in sources and ("FC2" in file_number.upper()):
        sources.insert(0, sources.pop(sources.index("fc2")))
    elif "dlsite" in sources and ("RJ" in file_number or "rj" in file_number
                                  or "VJ" in file_number or "vj" in file_number):
        sources.insert(0, sources.pop(sources.index("dlsite")))

    json_data = {}
    movie = None
    for source in sources:
        try:
            if conf.debug():
                logger.attn(f'select {source}')
            returnval = func_mapping[source](file_number)
            if isinstance(returnval, Movie):
                if returnval.is_filled():
                    movie = returnval
                    break
            else:
                # Legacy scrapers return a JSON string instead of a Movie.
                json_data = json.loads(returnval)
                # if any service return a valid return, break
                if get_data_state(json_data):
                    break
        except Exception:
            # BUGFIX: this was a bare `except:` followed by `break`, so one
            # crashing source aborted the entire fallback chain — defeating
            # the point of having multiple sources. Log and try the next.
            traceback.print_exc()
            continue

    # Return if data not found in all sources
    if not json_data and not movie:
        print('[-]Movie Data not found!')
        moveFailedFolder(filepath, conf.failed_folder())
        return Movie()

    # ================================================网站规则添加结束================================================
    # Legacy JSON path: copy the dict fields into a Movie.
    if not movie:
        movie = Movie()
        movie.title = json_data.get('title')
        movie.actors = json_data.get('actor')
        movie.release = json_data.get('release')
        movie.cover_small = json_data.get('cover_small')
        movie.cover = json_data.get('cover')
        movie.tags = json_data.get('tag')
        movie.year = json_data.get('year')
        movie.series = json_data.get('series')
        movie.runtime = json_data.get('runtime')
        movie.outline = json_data.get('outline')
        movie.scraper_source = json_data.get('source')
        movie.label = json_data.get('label')
        movie.studio = json_data.get('studio')
        movie.director = json_data.get('director')
        movie.movie_id = json_data.get('number')
        movie.trailer = json_data.get('trailer')
        movie.website = json_data.get('website')
        movie.imagecut = json_data.get('imagecut')
        movie.extra_fanart = json_data.get('extrafanart')

    movie.original_path = filepath
    if not movie.is_filled():
        print('[-]Movie Data not found!')
        moveFailedFolder(filepath, conf.failed_folder())
        return Movie()

    """ TODO: 翻译以后再说
    if conf.is_transalte():
        translate_values = conf.transalte_values().split(",")
        for translate_value in translate_values:
            if json_data[translate_value] == "":
                continue
            # if conf.get_transalte_engine() == "baidu":
            #     json_data[translate_value] = translate(
            #         json_data[translate_value],
            #         target_language="zh",
            #         engine=conf.get_transalte_engine(),
            #         app_id=conf.get_transalte_appId(),
            #         key=conf.get_transalte_key(),
            #         delay=conf.get_transalte_delay(),
            #     )
            if conf.get_transalte_engine() == "azure":
                json_data[translate_value] = translate(
                    json_data[translate_value],
                    target_language="zh-Hans",
                    engine=conf.get_transalte_engine(),
                    key=conf.get_transalte_key(),
                )
            else:
                json_data[translate_value] = translate(json_data[translate_value])
    """
    logger.debug(movie)
    return movie
def _set_actors(movie: Movie, a):
    """Extract the mgstage "出演" (cast) entries and store the non-empty,
    stripped names on *movie*.
    """
    # //*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text()
    tree = etree.fromstring(a, etree.HTMLParser())
    # Names may appear as anchor text or as bare cell text — query both.
    names = tree.xpath('//th[contains(text(),"出演:")]/../td/a/text()|'
                       '//th[contains(text(),"出演:")]/../td/text()')
    movie.actors = [name.strip() for name in names if name.strip()]
def main(number2) -> Movie:
    """Scrape mgstage.com for *number2* (upper-cased) and return a Movie.

    NOTE(review): the long whitespace literals in the `.replace()` chains
    below strip the page's indentation; their exact contents look mangled in
    this copy of the file — verify against upstream before editing them.
    """
    number = number2.upper()
    # The 'adc=1' cookie bypasses the adult-content confirmation page.
    htmlcode = str(get_html('https://www.mgstage.com/product/product_detail/' + str(number) + '/', cookies={'adc': '1'}))
    soup = BeautifulSoup(htmlcode, 'lxml')
    # `a` is the metadata table, `b` the introduction block, both flattened
    # to strings with the surrounding indentation removed.
    a = str(soup.find(attrs={'class': 'detail_data'})).replace('\n ', '').replace(' ', '').replace('\n ', '').replace('\n ', '')
    b = str(soup.find(attrs={'id': 'introduction'})).replace('\n ', '').replace(' ', '').replace('\n ', '').replace('\n ', '')
    movie = Movie()
    movie.title = getTitle(htmlcode).replace("\\n", '').replace(' ', '')
    movie.studio = getStudio(a)
    movie.outline = getOutline(b)
    movie.runtime = getRuntime(a)
    _set_actors(movie, a)
    movie.release = getRelease(a)
    movie.movie_id = getNum(a)
    movie.cover = getCover(htmlcode)
    movie.imagecut = 0
    _set_tags(movie, a)
    movie.extra_fanart = getExtrafanart(htmlcode)
    movie.website = f'https://www.mgstage.com/product/product_detail/{number}/'
    movie.scraper_source = 'mgstage'
    movie.series = getSeries(a)
    _set_rating(movie, a)
    return movie
def main(number):
    """Scrape adult.contents.fc2.com for the given FC2 number and return a
    populated Movie; an empty Movie on any failure.
    """
    movie = Movie()
    try:
        fc2_id = number.replace('FC2-', '').replace('fc2-', '')
        htmlcode = get_html('https://adult.contents.fc2.com/article/' + fc2_id + '/')
        # FC2 has no cast list, so the uploader stands in for actor,
        # studio and director alike.
        uploader = getStudio_fc2com(htmlcode)
        movie.title = getTitle_fc2com(htmlcode)
        movie.release = getRelease_fc2com(htmlcode)
        movie.movie_id = f'FC2-{fc2_id}'
        movie.actors = [uploader]
        movie.studio = uploader
        movie.director = uploader
        movie.cover = getCover_fc2com(htmlcode)
        movie.imagecut = 0
        movie.extra_fanart = getExtrafanart(htmlcode)
        movie.trailer = getTrailer(htmlcode)
        movie.tags = getTag_fc2com(fc2_id)
        movie.website = f'https://adult.contents.fc2.com/article/{fc2_id}/'
        movie.scraper_source = 'fc2'
        set_rating(movie, htmlcode)
    except Exception:
        logger.error('fc2刮削失败。')
        logger.debug('', exc_info=True)
        movie = Movie()
    return movie
def extract_movie(lx: html.HtmlElement, soup: BeautifulSoup, url: str) -> Movie:
    """Build a Movie from an already-fetched javlib detail page.

    *lx* and *soup* are two parses of the same document; each field uses
    whichever representation its helper expects.
    """
    movie = Movie()
    # Provenance and presentation settings.
    movie.website = url
    movie.scraper_source = 'javlib'
    movie.imagecut = 1
    # Fields parsed from the lxml tree.
    movie.title = get_title(lx, soup)
    movie.cover = get_cover(lx)
    movie.runtime = get_from_xpath(
        lx, '//*[@id="video_length"]/table/tr/td[2]/span/text()')
    # Fields parsed from the info table via BeautifulSoup.
    movie.studio = get_table_el_single_anchor(soup, "video_maker")
    movie.director = get_table_el_single_anchor(soup, "video_director")
    movie.actors = get_table_el_multi_anchor(soup, "video_cast").split(',')
    movie.tags = get_table_el_multi_anchor(soup, "video_genres").split(',')
    movie.label = get_table_el_td(soup, "video_label")
    movie.movie_id = get_table_el_td(soup, "video_id")
    movie.release = get_table_el_td(soup, "video_date")
    _add_rating(movie, lx)
    # Fixed region tags appended after the scraped genres.
    for region_tag in ('日本', '有码'):
        movie.add_tag(region_tag)
    return movie