def javlib_set_page(page_template: str, page_num=1, url_parameter=None, config=None) -> tuple:
    """Scrape one javlibrary listing page.

    :param page_template: URL path template containing ``{page_num}`` /
        ``{url_parameter}`` placeholders.
    :param page_num: 1-based page index to fetch (also the fallback max page).
    :param url_parameter: extra value substituted into the template
        (e.g. a search keyword); may be ``None``.
    :param config: unused; kept so all ``*_set_page`` scrapers share one signature.
    :return: ``(jav_objs_raw, max_page)`` — a defaultlist of dicts with keys
        ``title``/``javid``/``img``/``car``, and the highest page number found.
        (Annotation fixed: the original said ``-> dict`` but a tuple is returned.)
    """
    xpath_dict = {
        'title': '//*[@class="video"]/a/@title',
        'javid': '//*[@class="video"]/@id',
        'img': '//*[@class="video"]/a/img/@src',
        'car': '//*/div[@class="video"]/a/div[@class="id"]/text()',
    }
    xpath_max_page = '//*/div[@class="page_selector"]/a[@class="page last"]/@href'

    # force to get url from ini file each time
    javlib_url = return_config_string(['其他设置', 'javlibrary网址'])
    lib_url = javlib_url + page_template.format(page_num=page_num, url_parameter=url_parameter)
    print(f'accessing {lib_url}')

    res = return_post_res(lib_url, behind_cloudflare=True).content
    root = etree.HTML(res)

    # merge each xpath result column-wise into per-item dicts
    jav_objs_raw = defaultlist(dict)
    for key, xpath in xpath_dict.items():
        for idx, value in enumerate(root.xpath(xpath)):
            jav_objs_raw[idx].update({key: value})

    try:
        max_page = find_max_page(root.xpath(xpath_max_page)[0])
    except IndexError:
        # no "last page" link present; assume the current page is the last one
        max_page = page_num
    return jav_objs_raw, max_page
def javdb_set_page(page_template: str, page_num=1, url_parameter=None, config=None) -> tuple:
    """Scrape one javdb listing page.

    :param page_template: URL path template containing ``{page_num}`` /
        ``{url_parameter}`` placeholders.
    :param page_num: 1-based page index to fetch (also the fallback max page).
    :param url_parameter: extra value substituted into the template; may be ``None``.
    :param config: unused; kept so all ``*_set_page`` scrapers share one signature.
    :return: ``(jav_objs_raw, max_page)`` — a defaultlist of dicts with keys
        ``title``/``javid``/``img``/``car``, and the last page number found.
        (Annotation fixed: the original said ``-> dict`` but a tuple is returned.)
    """
    xpath_dict = {
        'title': '//a[@class="box"]/div[@class="video-title"]/text()',
        'javid': '//a[@class="box"]/div[@class="uid"]/text()',
        'img': '//div[@class="item-image fix-scale-cover"]/img/@data-src',
        'car': '//a[@class="box"]/div[@class="uid"]/text()',
    }
    xpath_max_page = '//ul[@class="pagination-list"]/li/a[@class="pagination-link"][last()]/text()'

    # NOTE(review): unlike the other scrapers this URL is hardcoded, not read
    # from the ini file — confirm whether it should come from config instead
    javdb_url = 'https://javdb4.com/'
    set_url = javdb_url + page_template.format(page_num=page_num, url_parameter=url_parameter)
    print(f'accessing {set_url}')

    # not really behind cloudflare but may prevent python scrape;
    # over18 cookie skips the age gate
    res = return_post_res(set_url, cookies={'over18': "1"}, behind_cloudflare=True).content
    root = etree.HTML(res.decode('utf-8'))

    # merge each xpath result column-wise into per-item dicts
    jav_objs_raw = defaultlist(dict)
    for key, xpath in xpath_dict.items():
        for idx, value in enumerate(root.xpath(xpath)):
            jav_objs_raw[idx].update({key: value})

    try:
        # last pagination link text is the highest page number
        max_page = root.xpath(xpath_max_page)[-1]
    except IndexError:  # was a bare except; only the empty-result lookup can fail here
        max_page = page_num
    if not max_page:
        # guard against an empty text node in the pager
        max_page = page_num
    return jav_objs_raw, max_page
def javbus_set_page(page_template: str, page_num=1, url_parameter=None, config=None) -> tuple:
    """Scrape one javbus listing page.

    :param page_template: URL path template containing ``{page_num}`` /
        ``{url_parameter}`` placeholders.
    :param page_num: 1-based page index to fetch (also the fallback max page).
    :param url_parameter: extra value substituted into the template; may be ``None``.
    :param config: unused; kept so all ``*_set_page`` scrapers share one signature.
    :return: ``(jav_objs_raw, max_page)`` — a defaultlist of dicts with keys
        ``title``/``javid``/``img``/``car``, and the last page number found.
        (Annotation fixed: the original said ``-> dict`` but a tuple is returned.)
    """
    xpath_dict = {
        'title': '//div[@class="photo-frame"]/img[not(contains(@src, "actress"))]/@title',
        'javid': '//div[@class="photo-info"]/span/date[1]/text()',
        'img': '//div[@class="photo-frame"]/img[not(contains(@src, "actress"))]/@src',
        'car': '//div[@class="photo-info"]/span/date[1]/text()',
    }
    xpath_max_page = '//ul[@class="pagination pagination-lg"]/li/a/text()'

    # force to get url from ini file each time
    javbus_url = return_config_string(['其他设置', 'javbus网址'])
    set_url = javbus_url + page_template.format(page_num=page_num, url_parameter=url_parameter)
    print(f'accessing {set_url}')

    res = return_post_res(set_url).content
    root = etree.HTML(res)

    # merge each xpath result column-wise into per-item dicts
    jav_objs_raw = defaultlist(dict)
    for key, xpath in xpath_dict.items():
        for idx, value in enumerate(root.xpath(xpath)):
            jav_objs_raw[idx].update({key: value})

    try:
        # [-2] skips the trailing "next" link; the entry before it is the
        # highest page number
        max_page = root.xpath(xpath_max_page)[-2]
    except IndexError:  # was a bare except; only the short-list lookup can fail here
        max_page = page_num
    if not max_page:
        # guard against an empty text node in the pager
        max_page = page_num
    return jav_objs_raw, max_page
def jav321_set_page(page_template: str, page_num=1, url_parameter=None, config=None) -> tuple:
    """Scrape one jav321 listing page.

    :param page_template: URL path template containing ``{page_num}`` /
        ``{url_parameter}`` placeholders.
    :param page_num: 1-based page index to fetch.
    :param url_parameter: extra value substituted into the template; may be ``None``.
    :param config: unused; kept so all ``*_set_page`` scrapers share one signature.
    :return: ``(jav_objs_raw, max_page)`` — a defaultlist of dicts with keys
        ``title``/``javid``/``img``/``car``. jav321 only exposes a "next" pager
        link, so ``max_page`` is ``page_num + 1`` while a next link exists,
        otherwise ``page_num``.
        (Annotation fixed: the original said ``-> dict`` but a tuple is returned.)
    """
    xpath_dict = {
        'title': '//div[@class="thumbnail"]/a/text()',
        'javid': '//div[@class="thumbnail"]/a/@href',  # need to extract from link
        'img': '//div[@class="thumbnail"]/a/img/@src',
        'car': '//div[@class="thumbnail"]/a/text()',  # need to extract from title
    }
    xpath_max_page = '//ul[@class="pager"]/li[@class="next"]/a/text()'
    max_page = page_num  # default value

    # jav321 URL is fixed (not read from the ini file like other scrapers)
    jav_url = 'https://www.jav321.com/'
    set_url = jav_url + page_template.format(page_num=page_num, url_parameter=url_parameter)
    print(f'accessing {set_url}')

    res = return_post_res(set_url).content
    root = etree.HTML(res)

    # merge each xpath result column-wise into per-item dicts
    jav_objs_raw = defaultlist(dict)
    for key, xpath in xpath_dict.items():
        for idx, value in enumerate(root.xpath(xpath)):
            if key == 'car':
                # listing titles end with the car; normalize it
                value = _jav321_extract_car(value)
            jav_objs_raw[idx].update({key: value})

    # a "next" link means at least one more page exists
    # (was a bare try/except-pass; a static xpath on a parsed root cannot fail)
    if root.xpath(xpath_max_page):
        max_page = int(max_page) + 1
    return jav_objs_raw, max_page


def _jav321_extract_car(title: str) -> str:
    """Extract and normalize a car ("PRE-123") from a jav321 listing title.

    The numeric part is normalized by stripping leading zeros and then
    left-padding back to at least 3 digits (045 -> 045, 0830 -> 830,
    0002 -> 002). Falls back to the raw last token when no standard
    pattern matches.
    """
    # the car is the last space-separated token of the title
    candidate = title.split(' ')[-1]
    try:
        name_group = re.search(DEFAULT_FILENAME_PATTERN, candidate)
        name_digits = name_group.group('digit')
        # only keep 0 under 3 digits
        # keep 045, 0830 > 830, 1130, 0002 > 002, 005
        if name_digits.isdigit():
            name_digits = str(int(name_digits)).zfill(3)
        return name_group.group('pre') + '-' + name_digits
    except AttributeError as e:  # re.search returned None
        print(f'cannot extract standard car format from {candidate} due to {e}')
        return candidate
def actress_searcher(self, search_str: str):
    """Search the site for an actress and return the raw HTML of her page.

    Posts the search form (criterion ``'f'``, presumably "female") and
    follows the first exact-match result.

    :param search_str: actress name to search for.
    :return: raw response content of the first matching actress page.
    :raises ActorNotFoundException: when no exact match is returned.
    """
    # for female search most likely, s-12 might be changing?
    endpoint = 'en/s-12/search'
    form_data = {'recherche_valeur': search_str, 'recherche_critere': 'f'}
    page = return_post_res(self.top_url + endpoint, data=form_data).content

    matches = etree.HTML(page).xpath(
        '//div[@class="resultat-pornostar correspondance_exacte"]/p/a')
    if not matches:
        raise ActorNotFoundException(f'cannot find actor {search_str}')

    # we only use 1st return
    return return_get_res(self.top_url + matches[0].get('href')).content
def get_single_jav_page(self):
    """POST a car search to jav321 and return the resulting page.

    jav321's search responds with the single best match directly, so
    ``self.total_index`` is always set to 1.

    :return: ``(search_page_content, total_index)``.
    :raises JAVNotFoundException: when the not-found banner is present.
    """
    # e.g. https://www.jav321.com/search with POST form data sn: ssni-854
    search_url = self.jav_url + 'search'
    print(f'accessing {search_url}')

    search_page = return_post_res(
        search_url, data={'sn': self.car}, behind_cloudflare=True).content
    if '抱歉,未找到您要找的AV' in str(search_page):
        raise JAVNotFoundException('{} cannot be found in jav321'.format(self.car))

    self.total_index = 1  # jav321 only returns the optimal result
    return search_page, self.total_index
def jav777_set_page(page_template: str, page_num=1, url_parameter=None, config=None) -> tuple:
    """Scrape one jav777 listing page.

    :param page_template: URL path template containing ``{page_num}`` /
        ``{url_parameter}`` placeholders.
    :param page_num: 1-based page index to fetch (also the fallback max page).
    :param url_parameter: extra value substituted into the template; may be ``None``.
    :param config: unused; kept so all ``*_set_page`` scrapers share one signature.
    :return: ``(jav_objs_raw, max_page)`` — a defaultlist of dicts with keys
        ``title``/``javid``/``img``/``car``, and the last page number found.
        (Annotation fixed: the original said ``-> dict`` but a tuple is returned.)
    """
    xpath_dict = {
        'title': '//h2[@class="post-title"]/a/@title',
        'javid': '//div[@class="post-container"]/div/@id',
        'img': '//div[@class="featured-media"]/a/img/@src',
        'car': '//h2[@class="post-title"]/a/@title',
    }
    xpath_max_page = '//center/a[position() = (last()-1)]/text()'

    jav777_url = JAV777_URL
    set_url = jav777_url + page_template.format(page_num=page_num, url_parameter=url_parameter)
    print(f'accessing {set_url}')

    res = return_post_res(set_url).content
    root = etree.HTML(res)

    jav_objs_raw = defaultlist(dict)
    for k, v in xpath_dict.items():
        _values = root.xpath(v)
        for _i, _value in enumerate(_values):
            # need to extract car from title, reusing file_scanner function
            if k == 'car':
                # BUGFIX: the original used _value.lstrip('(HD)'), which strips
                # any run of the characters ( H D ) — not the literal prefix —
                # and could eat the start of a car beginning with H or D.
                if _value.startswith('(HD)'):
                    _value = _value[len('(HD)'):]
                try:
                    name_group = re.search(DEFAULT_FILENAME_PATTERN, _value)
                    name_digits = name_group.group('digit')
                    # only keep 0 under 3 digits
                    # keep 045, 0830 > 830, 1130, 0002 > 002, 005
                    if name_digits.isdigit():
                        name_digits = str(int(name_digits)).zfill(3)
                    _value = name_group.group('pre') + '-' + name_digits
                except AttributeError as e:
                    # re.search returned None — keep the raw title as the car
                    # instead of failing the whole page (matches jav321 scraper)
                    print(f'cannot extract standard car format from {_value} due to {e}')
            jav_objs_raw[_i].update({k: _value})

    try:
        max_page = root.xpath(xpath_max_page)[0]
    except IndexError:  # was a bare except; only the empty-result lookup can fail here
        max_page = page_num
    if not max_page:
        # guard against an empty text node in the pager
        max_page = page_num
    return jav_objs_raw, max_page
def javlib_set_page(page_prefix: str, page_num: int, config=None) -> tuple:
    """Scrape one javlibrary listing page (prefix + page-number URL variant).

    NOTE(review): this shares its name with another ``javlib_set_page`` that
    takes a page template; if both live in the same module the later
    definition silently shadows the earlier one — confirm which is intended.

    :param page_prefix: URL path prefix; the page number is appended to it.
    :param page_num: 1-based page index to fetch (also the fallback max page).
    :param config: request config with ``'proxies'`` and ``'cookies'`` keys;
        defaults to a deep copy of ``DEFAULT_JAVLIB_CONFIG``.
    :return: ``(jav_objs_raw, max_page)`` — a defaultlist of dicts with keys
        ``title``/``javid``/``img``/``car``, and the highest page number found.
        (Annotation fixed: the original said ``-> dict`` but a tuple is returned.)
    """
    xpath_dict = {
        'title': '//*[@class="video"]/a/@title',
        'javid': '//*[@class="video"]/@id',
        'img': '//*[@class="video"]/a/img/@src',
        'car': '//*/div[@class="video"]/a/div[@class="id"]/text()',
    }
    xpath_max_page = '//*/div[@class="page_selector"]/a[@class="page last"]/@href'

    # force to get url from ini file each time
    javlib_url = return_config_string(['其他设置', 'javlibrary网址'])

    # fill missing parameters; deepcopy so the shared default is never mutated
    # ("config == None" replaced with the idiomatic identity check)
    if config is None:
        config = deepcopy(DEFAULT_JAVLIB_CONFIG)

    lib_url = javlib_url + page_prefix + str(page_num)
    print(f'accessing {lib_url}')

    res = return_post_res(lib_url, proxies=config['proxies'], cookies=config['cookies']).content
    root = etree.HTML(res)

    # merge each xpath result column-wise into per-item dicts
    jav_objs_raw = defaultlist(dict)
    for key, xpath in xpath_dict.items():
        for idx, value in enumerate(root.xpath(xpath)):
            jav_objs_raw[idx].update({key: value})

    try:
        max_page = find_max_page(root.xpath(xpath_max_page)[0])
    except IndexError:
        # no "last page" link present; assume the current page is the last one
        max_page = page_num
    return jav_objs_raw, max_page