Esempio n. 1
0
def javlib_set_page(page_template: str,
                    page_num=1,
                    url_parameter=None,
                    config=None) -> dict:
    """
    Fetch one javlibrary listing page and collect its raw video entries.

    The page URL is the configured javlibrary base URL followed by
    ``page_template`` formatted with ``page_num`` and ``url_parameter``.
    ``config`` is accepted but not used by this function.

    Returns a ``(jav_objs_raw, max_page)`` pair. ``jav_objs_raw`` is a
    ``defaultlist`` of dicts holding the ``title``/``javid``/``img``/``car``
    values found on the page; ``max_page`` is whatever ``find_max_page``
    returns for the "last page" link, or ``page_num`` when no such link
    exists.

    NOTE(review): the ``dict`` return annotation does not match the tuple
    that is actually returned.
    """
    # one xpath per field; every xpath yields a flat list of values
    field_xpaths = {
        'title': '//*[@class="video"]/a/@title',
        'javid': '//*[@class="video"]/@id',
        'img': '//*[@class="video"]/a/img/@src',
        'car': '//*/div[@class="video"]/a/div[@class="id"]/text()'
    }
    last_page_xpath = '//*/div[@class="page_selector"]/a[@class="page last"]/@href'

    # the base url is re-read from the ini file on every call
    base_url = return_config_string(['其他设置', 'javlibrary网址'])
    page_path = page_template.format(page_num=page_num,
                                     url_parameter=url_parameter)
    lib_url = base_url + page_path
    print(f'accessing {lib_url}')

    response = return_post_res(lib_url, behind_cloudflare=True)
    root = etree.HTML(response.content)

    # values sharing the same position in their xpath result are merged
    # into the same entry
    jav_objs_raw = defaultlist(dict)
    for field, field_xpath in field_xpaths.items():
        for idx, value in enumerate(root.xpath(field_xpath)):
            jav_objs_raw[idx][field] = value

    try:
        last_page_href = root.xpath(last_page_xpath)[0]
        max_page = find_max_page(last_page_href)
    except IndexError:
        # no "last page" link found: report the requested page instead
        max_page = page_num

    return jav_objs_raw, max_page
Esempio n. 2
0
def javdb_set_page(page_template: str, page_num=1, url_parameter=None, config=None) -> tuple:
    """
    Fetch one javdb listing page and parse it into raw jav entries.

    The page URL is the hard-coded javdb base URL followed by
    ``page_template`` formatted with ``page_num`` and ``url_parameter``.
    ``config`` is accepted but not used by this function.

    Returns a ``(jav_objs_raw, max_page)`` tuple. ``jav_objs_raw`` is a
    ``defaultlist`` of dicts with ``title``/``javid``/``img``/``car`` keys,
    where values at the same position of each xpath result are merged into
    one entry. ``max_page`` is the text of the last pagination link, or
    ``page_num`` when that link is missing or empty.
    """
    xpath_dict = {
        'title': '//a[@class="box"]/div[@class="video-title"]/text()',
        'javid': '//a[@class="box"]/div[@class="uid"]/text()',
        'img': '//div[@class="item-image fix-scale-cover"]/img/@data-src',
        'car': '//a[@class="box"]/div[@class="uid"]/text()'
    }
    xpath_max_page = '//ul[@class="pagination-list"]/li/a[@class="pagination-link"][last()]/text()'

    # the base url is hard-coded here; it is NOT read from the ini file
    javdb_url = 'https://javdb4.com/'
    set_url = javdb_url + page_template.format(page_num=page_num, url_parameter=url_parameter)
    print(f'accessing {set_url}')

    # not really behind cloudflare but may prevent python scrape
    res = return_post_res(set_url, cookies={'over18': "1"}, behind_cloudflare=True).content
    root = etree.HTML(res.decode('utf-8'))

    jav_objs_raw = defaultlist(dict)
    for k, v in xpath_dict.items():
        for _i, _value in enumerate(root.xpath(v)):
            jav_objs_raw[_i][k] = _value

    # an empty xpath result is the only expected failure here, so catch
    # IndexError only instead of swallowing every exception
    try:
        max_page = root.xpath(xpath_max_page)[-1]
    except IndexError:
        max_page = None

    # missing or empty pagination text: fall back to the requested page
    if not max_page:
        max_page = page_num

    return jav_objs_raw, max_page
Esempio n. 3
0
def javbus_set_page(page_template: str, page_num=1, url_parameter=None, config=None) -> tuple:
    """
    Fetch one javbus listing page and parse it into raw jav entries.

    The page URL is the configured javbus base URL followed by
    ``page_template`` formatted with ``page_num`` and ``url_parameter``.
    ``config`` is accepted but not used by this function.

    Returns a ``(jav_objs_raw, max_page)`` tuple. ``jav_objs_raw`` is a
    ``defaultlist`` of dicts with ``title``/``javid``/``img``/``car`` keys,
    where values at the same position of each xpath result are merged into
    one entry. ``max_page`` is the text of the second-to-last pagination
    link, or ``page_num`` when fewer than two links exist or the text is
    empty.
    """
    xpath_dict = {
        'title': '//div[@class="photo-frame"]/img[not(contains(@src, "actress"))]/@title',
        'javid': '//div[@class="photo-info"]/span/date[1]/text()',
        'img': '//div[@class="photo-frame"]/img[not(contains(@src, "actress"))]/@src',
        'car': '//div[@class="photo-info"]/span/date[1]/text()'
    }
    xpath_max_page = '//ul[@class="pagination pagination-lg"]/li/a/text()'

    # force to get url from ini file each time
    javbus_url = return_config_string(['其他设置', 'javbus网址'])
    set_url = javbus_url + page_template.format(page_num=page_num, url_parameter=url_parameter)
    print(f'accessing {set_url}')

    res = return_post_res(set_url).content
    root = etree.HTML(res)

    jav_objs_raw = defaultlist(dict)
    for k, v in xpath_dict.items():
        for _i, _value in enumerate(root.xpath(v)):
            jav_objs_raw[_i][k] = _value

    # presumably the last pagination link is a "next" control, hence [-2]
    # -- TODO confirm against the live markup.
    # Too few links is the only expected failure, so catch IndexError only
    # instead of swallowing every exception.
    try:
        max_page = root.xpath(xpath_max_page)[-2]
    except IndexError:
        max_page = None

    # missing or empty pagination text: fall back to the requested page
    if not max_page:
        max_page = page_num

    return jav_objs_raw, max_page
Esempio n. 4
0
def jav321_set_page(page_template: str, page_num=1, url_parameter=None, config=None) -> tuple:
    """
    Fetch one jav321 listing page and parse it into raw jav entries.

    The page URL is the hard-coded jav321 base URL followed by
    ``page_template`` formatted with ``page_num`` and ``url_parameter``.
    ``config`` is accepted but not used by this function.

    The ``car`` field is derived from the last space-separated token of the
    thumbnail link text: it is matched against ``DEFAULT_FILENAME_PATTERN``
    and rebuilt as ``<pre>-<digits>`` with the digits normalised to at least
    three characters. When the pattern does not match, the raw token is kept.

    Returns a ``(jav_objs_raw, max_page)`` tuple. ``max_page`` is
    ``page_num + 1`` when the page has a "next" pager link, otherwise
    ``page_num``.
    """
    xpath_dict = {
        'title': '//div[@class="thumbnail"]/a/text()',
        'javid': '//div[@class="thumbnail"]/a/@href',  # need to extract from link
        'img': '//div[@class="thumbnail"]/a/img/@src',
        'car': '//div[@class="thumbnail"]/a/text()'  # need to extract from title
    }
    xpath_max_page = '//ul[@class="pager"]/li[@class="next"]/a/text()'
    max_page = page_num  # default value

    # the base url is hard-coded here; it is NOT read from the ini file
    jav_url = 'https://www.jav321.com/'
    set_url = jav_url + page_template.format(page_num=page_num, url_parameter=url_parameter)
    print(f'accessing {set_url}')

    res = return_post_res(set_url).content
    root = etree.HTML(res)

    jav_objs_raw = defaultlist(dict)
    for k, v in xpath_dict.items():
        for _i, _value in enumerate(root.xpath(v)):
            # need to extract car from title, reusing file_scanner function
            if k == 'car':
                # need to separate text with car first
                _preprocess = _value.split(' ')[-1]

                # try to extract proper car
                try:
                    name_group = re.search(DEFAULT_FILENAME_PATTERN, _preprocess)
                    name_digits = name_group.group('digit')

                    # only keep 0 under 3 digits
                    # keep 045, 0830 > 830, 1130, 0002 > 002, 005
                    if name_digits.isdigit():
                        name_digits = str(int(name_digits))
                    name_digits = name_digits.rjust(3, '0')
                    _value = name_group.group('pre') + '-' + name_digits
                except AttributeError as e:
                    # raised when the pattern did not match (name_group is None)
                    print(f'cannot extract standard car format from {_preprocess} due to {e}')
                    _value = _preprocess
            jav_objs_raw[_i][k] = _value

    # the site only exposes a "next" link, so the best known upper bound
    # is the page after the current one
    if root.xpath(xpath_max_page):
        try:
            max_page = int(max_page) + 1
        except (TypeError, ValueError):
            # page_num is not numeric: keep it unchanged rather than fail
            pass

    return jav_objs_raw, max_page
Esempio n. 5
0
    def actress_searcher(self, search_str: str):
        """
        Search for an actress by name and return her page content.

        Posts ``search_str`` to the search endpoint under ``self.top_url``,
        takes the first exact-match result link and returns the ``content``
        of a GET request to that link.

        Raises ``ActorNotFoundException`` when the search yields no
        exact-match result.
        """
        # for female search most likely, s-12 might be changing?
        search_url = self.top_url + 'en/s-12/search'
        form_data = {'recherche_valeur': search_str, 'recherche_critere': 'f'}
        response = return_post_res(search_url, data=form_data)

        root = etree.HTML(response.content)
        exact_matches = root.xpath('//div[@class="resultat-pornostar correspondance_exacte"]/p/a')
        if not exact_matches:
            raise ActorNotFoundException(f'cannot find actor {search_str}')

        # we only use 1st return
        actress_href = exact_matches[0].get('href')
        actress_url = self.top_url + actress_href

        return return_get_res(actress_url).content
Esempio n. 6
0
    def get_single_jav_page(self):
        """
        Search jav321 for ``self.car`` and return the resulting page.

        Posts ``{'sn': self.car}`` to ``<self.jav_url>search``
        (e.g. https://www.jav321.com/search with form data ``sn: ssni-854``).

        Returns ``(content, total_index)`` where ``content`` is the raw
        response content and ``total_index`` is always 1; it is also stored
        on ``self.total_index``.

        Raises ``JAVNotFoundException`` when the response contains the
        site's "not found" message.
        """
        search_url = self.jav_url + 'search'
        print(f'accessing {search_url}')

        jav_search_content = return_post_res(search_url, data={'sn': self.car}, behind_cloudflare=True).content

        # str() on bytes yields the escaped repr ("b'\\xe6...'"), in which the
        # non-ASCII marker below can never be found; decode the body instead
        if isinstance(jav_search_content, bytes):
            page_text = jav_search_content.decode('utf-8', errors='ignore')
        else:
            page_text = str(jav_search_content)

        if '抱歉,未找到您要找的AV' in page_text:
            raise JAVNotFoundException('{} cannot be found in jav321'.format(self.car))

        self.total_index = 1  # jav321 only return optimal result

        return jav_search_content, self.total_index
Esempio n. 7
0
def jav777_set_page(page_template: str,
                    page_num=1,
                    url_parameter=None,
                    config=None) -> tuple:
    """
    Fetch one jav777 listing page and parse it into raw jav entries.

    The page URL is ``JAV777_URL`` followed by ``page_template`` formatted
    with ``page_num`` and ``url_parameter``. ``config`` is accepted but not
    used by this function.

    The ``car`` field is derived from the post title: leading ``(HD)``
    markers are removed, the rest is matched against
    ``DEFAULT_FILENAME_PATTERN`` and rebuilt as ``<pre>-<digits>`` with the
    digits normalised to at least three characters. When the pattern does
    not match, the stripped title is kept as the car.

    Returns a ``(jav_objs_raw, max_page)`` tuple. ``max_page`` is the text
    of the second-to-last link inside ``<center>``, or ``page_num`` when it
    is missing or empty.
    """
    hd_prefix = '(HD)'
    xpath_dict = {
        'title': '//h2[@class="post-title"]/a/@title',
        'javid': '//div[@class="post-container"]/div/@id',
        'img': '//div[@class="featured-media"]/a/img/@src',
        'car': '//h2[@class="post-title"]/a/@title'
    }
    xpath_max_page = '//center/a[position() = (last()-1)]/text()'

    # the base url comes from the JAV777_URL constant, not the ini file
    jav777_url = JAV777_URL
    set_url = jav777_url + page_template.format(page_num=page_num,
                                                url_parameter=url_parameter)
    print(f'accessing {set_url}')

    res = return_post_res(set_url).content
    root = etree.HTML(res)

    jav_objs_raw = defaultlist(dict)
    for k, v in xpath_dict.items():
        for _i, _value in enumerate(root.xpath(v)):
            # need to extract car from title, reusing file_scanner function
            if k == 'car':
                # remove hd prefixes; str.lstrip('(HD)') would strip any run
                # of the characters "(", "H", "D", ")" and so eat the start
                # of cars such as "HND-123"
                while _value.startswith(hd_prefix):
                    _value = _value[len(hd_prefix):]

                name_group = re.search(DEFAULT_FILENAME_PATTERN, _value)
                if name_group is None:
                    # keep the stripped title rather than failing the page
                    print(f'cannot extract standard car format from {_value}')
                else:
                    name_digits = name_group.group('digit')

                    # only keep 0 under 3 digits
                    # keep 045, 0830 > 830, 1130, 0002 > 002, 005
                    if name_digits.isdigit():
                        name_digits = str(int(name_digits))
                    name_digits = name_digits.rjust(3, '0')
                    _value = name_group.group('pre') + '-' + name_digits
            jav_objs_raw[_i][k] = _value

    # an empty xpath result is the only expected failure here, so catch
    # IndexError only instead of swallowing every exception
    try:
        max_page = root.xpath(xpath_max_page)[0]
    except IndexError:
        max_page = None

    # missing or empty pagination text: fall back to the requested page
    if not max_page:
        max_page = page_num

    return jav_objs_raw, max_page
Esempio n. 8
0
def javlib_set_page(page_prefix: str, page_num: int, config=None) -> tuple:
    """
    Fetch one javlibrary listing page and parse it into raw jav entries.

    The page URL is the configured javlibrary base URL followed by
    ``page_prefix`` and ``page_num``. ``config`` supplies the ``proxies``
    and ``cookies`` passed to the request; when it is ``None`` a deep copy
    of ``DEFAULT_JAVLIB_CONFIG`` is used.

    Returns a ``(jav_objs_raw, max_page)`` tuple. ``jav_objs_raw`` is a
    ``defaultlist`` of dicts with ``title``/``javid``/``img``/``car`` keys,
    where values at the same position of each xpath result are merged into
    one entry. ``max_page`` is whatever ``find_max_page`` returns for the
    "last page" link, or ``page_num`` when no such link exists.
    """
    xpath_dict = {
        'title': '//*[@class="video"]/a/@title',
        'javid': '//*[@class="video"]/@id',
        'img': '//*[@class="video"]/a/img/@src',
        'car': '//*/div[@class="video"]/a/div[@class="id"]/text()'
    }
    xpath_max_page = '//*/div[@class="page_selector"]/a[@class="page last"]/@href'

    # force to get url from ini file each time
    javlib_url = return_config_string(['其他设置', 'javlibrary网址'])

    # fill missing parameters; copy so the shared default is never mutated
    if config is None:
        config = deepcopy(DEFAULT_JAVLIB_CONFIG)

    lib_url = javlib_url + page_prefix + str(page_num)
    print(f'accessing {lib_url}')

    res = return_post_res(lib_url,
                          proxies=config['proxies'],
                          cookies=config['cookies']).content
    root = etree.HTML(res)

    jav_objs_raw = defaultlist(dict)
    for k, v in xpath_dict.items():
        for _i, _value in enumerate(root.xpath(v)):
            jav_objs_raw[_i][k] = _value

    try:
        max_page = find_max_page(root.xpath(xpath_max_page)[0])
    except IndexError:
        # no "last page" link found: report the requested page instead
        max_page = page_num

    return jav_objs_raw, max_page