Ejemplo n.º 1
0
    def get_single_jav_page(self):
        """
        Fetch the javbus detail page for ``self.car``.

        The site search is tried first. When it yields no ``movie-box``
        links, ``jav_url + car`` is requested directly and accepted as the
        only result if that page carries a ``bigImage`` cover title.

        This search method is currently NOT DETERMINISTIC!
        Example: SW-098 -> has 3 outputs

        Side effects:
            Sets ``self.total_index`` to the number of candidate results.

        Returns:
            A 2-tuple: the ``.content`` of the response for the result
            selected by ``self.pick_index``, and ``self.total_index``.

        Raises:
            JAVNotFoundException: neither the search nor the direct access
                produced a result.
        """

        # perform search first
        # https://www.javbus.com/search/OFJE-235&type=&parent=ce
        search_url = self.jav_url + 'search/{}&type=&parent=ce'.format(
            self.car)
        print(f'accessing {search_url}')

        jav_search_content = return_get_res(search_url).content
        search_root = etree.HTML(jav_search_content)

        search_results = search_root.xpath('//a[@class="movie-box"]/@href')

        if not search_results:
            # sometimes the access will fail, try directly access by car
            direct_url = self.jav_url + self.car
            # log the URL that is really requested (previously this printed
            # the search URL again, which made the fallback hard to trace)
            print(f'no search result, try direct accessing {direct_url}')
            jav_search_content = return_get_res(direct_url).content
            search_root = etree.HTML(jav_search_content)

            # a cover image title marks a real detail page
            if search_root.xpath('//a[@class="bigImage"]/img/@title'):
                search_results = [direct_url]

        if not search_results:
            raise JAVNotFoundException('{} cannot be found in javbus'.format(
                self.car))

        self.total_index = len(search_results)
        result_first_url = search_results[self.pick_index]

        return return_get_res(result_first_url).content, self.total_index
Ejemplo n.º 2
0
def javbus_magnet_search(car: str):
    """
    Collect the magnet links javbus lists for ``car``.

    The detail page is fetched first to read the ``gid`` embedded in its
    inline script; the magnet table is then requested from the ajax endpoint
    with the detail page as referer.

    Args:
        car: identifier appended to the configured javbus base URL.

    Returns:
        A ``defaultlist`` of dicts, one per table row, with the keys
        ``magnet``, ``title``, ``size`` and ``size_sort`` (the latter being
        ``parsed_size_to_int`` applied to the size text). The list is empty
        when the detail page contains no ``gid``.
    """
    jav_url = return_config_string(['其他设置', 'javbus网址'])
    gid_match = r'.*?var gid = (\d*);.*?'
    magnet_xpath = {
        'magnet': '//tr/td[position()=1]/a[1]/@href',
        'title': '//tr/td[position()=1]/a[1]/text()',
        'size': '//tr/td[position()=2]/a[1]/text()'
    }
    main_url_template = jav_url+'{car}'
    magnet_url_template = jav_url+'ajax/uncledatoolsbyajax.php?gid={gid}&uc=0'
    main_url = main_url_template.format(car=car)

    res = return_get_res(main_url).text
    gid_found = re.search(gid_match, res)
    if gid_found is None:
        # No gid in the page means there is no magnet table to query;
        # report "no magnets" rather than failing on a None match.
        return defaultlist(dict)
    gid = gid_found.group(1)

    res = return_get_res(magnet_url_template.format(gid=gid), headers={'referer': main_url}).content
    root = etree.HTML(res)

    magnets = defaultlist(dict)
    for k, v in magnet_xpath.items():
        for _i, _value in enumerate(root.xpath(v)):
            # strip() with no argument already removes tabs, CR and LF
            cleaned = _value.strip()
            magnets[_i][k] = cleaned
            if k == 'size':
                magnets[_i]['size_sort'] = parsed_size_to_int(cleaned)

    return magnets
Ejemplo n.º 3
0
    def get_single_jav_page(self):
        """
        Look up ``self.car`` through the javlibrary id search and return the
        matching detail page.

        This search method is currently NOT DETERMINISTIC!
        Example: SW-098 -> has 3 outputs

        Returns a 2-tuple of the page ``.content`` and the number of
        candidates (1 when the search landed directly on a detail page).
        Raises JAVNotFoundException when the search page lists nothing.
        """
        query_url = self.jav_url + 'vl_searchbyid.php?keyword=' + self.car
        page_text = return_html_text(query_url, behind_cloudflare=True)

        # The search response is usually the detail page itself, but it can
        # also be a listing of several results. A detail page is recognised
        # by its <title>; T28 and R18 ids need their own title pattern.
        car_upper = self.car.upper()
        if car_upper.startswith('T28'):
            title_pattern = r'<title>((T28-|T-28)\d{1,5}.+?) - JAVLibrary<\/title>'
        elif car_upper.startswith('R18'):
            title_pattern = r'<title>((R18-|R-18)\d{1,5}.+?) - JAVLibrary<\/title>'
        else:
            title_pattern = r'<title>([a-zA-Z]{1,6}-\d{1,5}.+?) - JAVLibrary</title>'

        # Case 1: the search response is already the detail page.
        if re.search(title_pattern, page_text):
            detail_response = return_get_res(query_url, behind_cloudflare=True)
            return detail_response.content, 1

        # Case 2: a listing page; collect the id suffix of every entry.
        candidates = re.findall(
            r'v=javli(.+?)" title=".+?-\d+?[a-z]? ', page_text)

        # Case 3: the listing is empty, the title is unknown to the site.
        if not candidates:
            raise JAVNotFoundException(
                '{} cannot be found in javlib'.format(self.car))

        self.total_index = len(candidates)
        detail_url = self.jav_url + '?v=javli' + candidates[self.pick_index]
        detail_response = return_get_res(detail_url, behind_cloudflare=True)
        return detail_response.content, self.total_index
Ejemplo n.º 4
0
    def get_single_jav_page(self):
        """
        Search arzon for ``self.car`` and return the matching item page.

        With a single search hit that page is returned as is. With several
        hits each page is fetched in turn and its 品番 field, normalised by
        ``self.clean_up_car``, must equal ``self.car`` to be accepted.

        Side effects:
            Sets ``self.total_index`` to the number of search hits whenever
            there is at least one.

        Returns:
            A 2-tuple of the page ``.content`` and ``self.total_index``, or
            ``('', 0)`` when nothing was found or no hit matched.
        """
        arzon_cookies = self.get_site_sessions().cookies.get_dict()
        arz_search_url = 'https://www.arzon.jp/itemlist.html?t=&m=all&s=&q=' + self.car
        search_html = return_html_text(arz_search_url, cookies=arzon_cookies)

        # links of all search results (re.findall always returns a list)
        AVs = re.findall(r'<h2><a href="(/item.+?)" title=', search_html)
        if not AVs:
            return '', 0

        # Record the count up front so the single-result path reports it too;
        # previously that path returned whatever value was set earlier.
        self.total_index = len(AVs)
        # only verify when there are multiple results
        needs_verification = len(AVs) != 1
        car_xpath = '//tr[td="品番:"]/td[2]/text()'

        for av in AVs:
            arz_url = 'https://www.arzon.jp' + av
            print(f'accessing {arz_url}')
            page_content = return_get_res(arz_url,
                                          cookies=arzon_cookies).content

            if not needs_verification:
                return page_content, self.total_index

            # the search result is not reliable so need to double check
            found_cars = etree.HTML(page_content).xpath(car_xpath)
            if not found_cars:
                # a page without the 品番 row cannot be the one we want
                continue
            if self.clean_up_car(found_cars[0]) == self.car:
                return page_content, self.total_index

        return '', 0
Ejemplo n.º 5
0
    def get_single_jav_page(self):
        """
        Return the javdb detail page for ``self.car``.

        The autocomplete endpoint is queried first; any failure there is
        printed and the regular search page is used as a fallback. Both
        paths set ``self.total_index`` and return a 2-tuple of the UTF-8
        decoded page and ``self.total_index``.

        Raises Exception when the fallback search lists nothing, or when the
        first listed uid differs from ``self.car`` (case-insensitive).
        """
        # new autocomplete search, no rate limit
        # https://javdb.com/videos/search_autocomplete.json?q=luxu-1298
        autocomplete_url = (self.jav_url +
                            'videos/search_autocomplete.json?q={}'.format(self.car))
        raw_suggestions = return_html_text(autocomplete_url,
                                           behind_cloudflare=True)
        try:
            suggestions = json.loads(raw_suggestions)
            self.total_index = len(suggestions)
            for suggestion in suggestions:
                if suggestion['number'] != self.car.upper():
                    continue
                detail_url = self.jav_url + 'v/{}'.format(suggestion['uid'])
                detail_page = return_get_res(detail_url).content.decode('utf-8')
                return detail_page, self.total_index
        except Exception as e:
            # best effort only: fall through to the regular search below
            print(f'issue encounter when autocomplete search javdb {self.car} - {e}')

        # perform search first, not reliable at all, often multiple results
        # https://javdb4.com/search?q=MILK-08&f=all
        listing_url = self.jav_url + 'search?q={}&f=all'.format(self.car)
        listing_root = etree.HTML(return_get_res(listing_url).content)

        hrefs = listing_root.xpath('//a[@class="box"]/@href')
        self.total_index = len(hrefs)

        # the first listed uid has to match the requested car
        uids = listing_root.xpath('//a[@class="box"]/div[@class="uid"]/text()')
        if self.total_index < 1:
            raise Exception(f'nothing found for {self.car} from javdb')
        if self.car.upper() != uids[0].upper():
            raise Exception(f'{self.car} does not match javdb search result: {uids}')

        # hrefs start with "/", which the base URL already ends with
        detail_url = self.jav_url + hrefs[self.pick_index][1:]
        detail_page = return_get_res(detail_url).content.decode('utf-8')
        return detail_page, self.total_index
Ejemplo n.º 6
0
    def actress_searcher(self, search_str: str):
        """
        Post ``search_str`` to the site search and return the ``.content``
        of the first exact-match profile page.

        Raises ActorNotFoundException when no exact match is listed.
        """
        form_data = {'recherche_valeur': search_str, 'recherche_critere': 'f'}
        # for female search most likely, s-12 might be changing?
        listing = return_post_res(self.top_url + 'en/s-12/search', data=form_data).content

        exact_hits = etree.HTML(listing).xpath(
            '//div[@class="resultat-pornostar correspondance_exacte"]/p/a')
        if not exact_hits:
            raise ActorNotFoundException(f'cannot find actor {search_str}')

        # we only use 1st return
        profile_path = exact_hits[0].get('href')
        return return_get_res(self.top_url + profile_path).content
Ejemplo n.º 7
0
    def get_single_jav_page(self):
        """
        Return the javdb detail page for ``self.car`` using the site search.

        Sets ``self.total_index`` to the number of listed results and returns
        a 2-tuple of the UTF-8 decoded page picked by ``self.pick_index`` and
        ``self.total_index``.

        Raises Exception when nothing is listed, or when the first listed uid
        differs from ``self.car`` (case-insensitive).
        """
        # perform search first, not reliable at all, often multiple results
        # https://javdb4.com/search?q=MILK-08&f=all
        listing_url = self.jav_url + 'search?q={}&f=all'.format(self.car)
        listing_root = etree.HTML(return_get_res(listing_url).content)

        hrefs = listing_root.xpath('//a[@class="box"]/@href')
        self.total_index = len(hrefs)

        # need to match car
        uids = listing_root.xpath('//a[@class="box"]/div[@class="uid"]/text()')
        if self.total_index < 1:
            raise Exception(f'nothing found for {self.car} from javdb')
        if self.car.upper() != uids[0].upper():
            raise Exception(f'{self.car} does not match javdb search result: {uids}')

        # drop the leading "/" of the href before joining with the base URL
        relative_path = hrefs[self.pick_index][1:]
        detail_response = return_get_res(self.jav_url + relative_path)
        return detail_response.content.decode('utf-8'), self.total_index
Ejemplo n.º 8
0
    def get_single_jav_page(self):
        """
        Search the site with a ``?s=<car>`` query and return the page of the
        result selected by ``self.pick_index``.

        This search method is currently NOT DETERMINISTIC!
        Example: SW-098 -> has 3 outputs

        Sets ``self.total_index`` to the number of ``post-title`` links and
        returns a 2-tuple of the page ``.content`` and ``self.total_index``.
        Raises JAVNotFoundException when the search lists nothing.
        """
        # perform search first
        query_url = self.jav_url + '?s={}'.format(self.car)
        print(f'accessing {query_url}')

        listing_root = etree.HTML(return_get_res(query_url).content)
        post_links = listing_root.xpath('//h2[@class="post-title"]/a/@href')

        if not post_links:
            raise JAVNotFoundException('{} cannot be found in {}'.format(
                self.car, self.source))

        self.total_index = len(post_links)
        chosen_url = post_links[self.pick_index]
        chosen_page = return_get_res(chosen_url).content
        return chosen_page, self.total_index
Ejemplo n.º 9
0
def javdb_set_page(page_template: str,
                   page_num=1,
                   url_parameter=None,
                   config=None) -> tuple:
    """
    Parse one javdb listing page into a list of raw entries.

    Args:
        page_template: path template appended to the javdb base URL; it is
            formatted with ``page_num`` and ``url_parameter``.
        page_num: page number substituted into the template.
        url_parameter: extra value substituted into the template.
        config: accepted but not used by this function.

    Returns:
        A 2-tuple ``(jav_objs_raw, max_page)``. ``jav_objs_raw`` is a
        ``defaultlist`` of dicts with the keys ``title``, ``javid``, ``img``
        and ``car``, filled positionally from each xpath result.
        ``max_page`` is the text of the last pagination link, or
        ``page_num`` when the page has no pagination links.
    """
    xpath_dict = {
        'title': '//a[@class="box"]/div[@class="video-title"]/text()',
        'javid': '//a[@class="box"]/div[@class="uid"]/text()',
        'img': '//div[@class="item-image fix-scale-cover"]/img/@data-src',
        'car': '//a[@class="box"]/div[@class="uid"]/text()'
    }
    xpath_max_page = '//ul[@class="pagination-list"]/li/a[@class="pagination-link"][last()]/text()'

    # NOTE: the base URL is hard-coded here, it is not read from the ini file
    javdb_url = 'https://javdb4.com/'
    set_url = javdb_url + page_template.format(page_num=page_num,
                                               url_parameter=url_parameter)
    print(f'accessing {set_url}')

    # not really behind cloudflare but may prevent python scrape
    res = return_get_res(set_url,
                         cookies={
                             'over18': "1"
                         },
                         behind_cloudflare=True).content
    root = etree.HTML(res.decode('utf-8'))

    # entries are assembled column by column: the i-th value of every xpath
    # result goes into the i-th dict
    jav_objs_raw = defaultlist(dict)
    for k, v in xpath_dict.items():
        for _i, _value in enumerate(root.xpath(v)):
            jav_objs_raw[_i][k] = _value

    # A page without pagination links yields an empty list; fall back to the
    # requested page number in that case (and for an empty link text).
    page_labels = root.xpath(xpath_max_page)
    max_page = page_labels[-1] if page_labels else page_num
    if not max_page:
        max_page = page_num

    return jav_objs_raw, max_page