Code example #1
def return_get_res(url,
                   cookies=None,
                   proxies=None,
                   headers=None,
                   encoding='utf-8',
                   behind_cloudflare=False):
    """Send a GET request and return the ``requests.Response``.

    :param url: target url
    :param cookies: optional cookie dict (defaults to an empty dict)
    :param proxies: optional proxies dict; overrides the ini proxy setting
    :param headers: optional headers; defaults to ``DEFAULT_HEADERS``
    :param encoding: encoding forced onto the response before returning
    :param behind_cloudflare: use ``cloudflare_get`` for protected sites
    """
    # was `cookies={}`: avoid the shared mutable default argument
    if cookies is None:
        cookies = {}
    if not headers:
        headers = DEFAULT_HEADERS

    # read proxy settings from the ini file
    use_proxy = return_config_string(['代理', '是否使用代理?'])

    # an explicitly passed-in proxies dict takes precedence over the ini
    if use_proxy == '是' and not proxies:
        proxies = return_config_string(['代理', '代理IP及端口'])

    if behind_cloudflare:
        # cloudflare-protected sites need the dedicated helper
        res = cloudflare_get(url, cookies=cookies, proxies=proxies)
    else:
        res = requests.get(url,
                           headers=headers,
                           cookies=cookies,
                           proxies=proxies)
    res.encoding = encoding
    return res
Code example #2
def return_post_res(url,
                    data=None,
                    cookies=None,
                    proxies=None,
                    headers=None,
                    encoding='utf-8'):
    """Send a POST request and return the ``requests.Response``.

    :param url: target url
    :param data: POST body forwarded to ``requests.post``
    :param cookies: optional cookie dict
    :param proxies: optional proxies dict; overrides the ini proxy setting
    :param headers: optional headers; defaults to ``DEFAULT_HEADERS``
    :param encoding: encoding forced onto the response before returning
    """
    if not headers:
        headers = DEFAULT_HEADERS

    # read proxy settings from the ini file
    use_proxy = return_config_string(['代理', '是否使用代理?'])

    # an explicitly passed-in proxies dict takes precedence over the ini
    # (dead `else: pass` branch with commented-out print removed)
    if use_proxy == '是' and not proxies:
        proxies = return_config_string(['代理', '代理IP及端口'])

    res = requests.post(url,
                        data=data,
                        headers=headers,
                        cookies=cookies,
                        proxies=proxies)
    res.encoding = encoding
    return res
Code example #3
def return_html_text(url, cookies=None, proxies=None, encoding='utf-8'):
    """Fetch *url* via GET and return the decoded HTML text."""
    # the ini file decides whether a proxy should be used at all
    use_proxy = return_config_string(['代理', '是否使用代理?'])

    # an explicit proxies argument wins over the ini configuration
    if not proxies and use_proxy == '是':
        proxies = return_config_string(['代理', '代理IP及端口'])

    response = requests.get(url, cookies=cookies, proxies=proxies)
    response.encoding = encoding
    return response.text
Code example #4
def return_html_text(url, cookies=None, proxies=None, encoding='utf-8', behind_cloudflare=False):
    """Fetch *url* and return the decoded HTML text.

    Uses a cloudscraper session when *behind_cloudflare* is set, otherwise a
    plain ``requests.get``.
    """
    # the ini file decides whether a proxy should be used at all
    use_proxy = return_config_string(['代理', '是否使用代理?'])

    # an explicit proxies argument wins over the ini configuration
    if not proxies and use_proxy == '是':
        proxies = return_config_string(['代理', '代理IP及端口'])

    if behind_cloudflare:
        scraper = cloudscraper.create_scraper()
        response = scraper.get(url, cookies=cookies, proxies=proxies)
    else:
        response = requests.get(url, cookies=cookies, proxies=proxies)
    response.encoding = encoding
    return response.text
Code example #5
def send_emby_images(image_folder_path):
    """Upload local actress portrait images to an Emby server.

    Iterates every actress known to the Emby server and, when a matching
    ``<name>.jpg`` or ``<name>.png`` exists in *image_folder_path*, posts it
    as that actress's portrait.  Yields one newline-terminated JSON log line
    per actress plus a final summary line.

    :param image_folder_path: folder holding the portrait image files
    :raises Exception: when *image_folder_path* does not exist
    """
    num = 0      # actresses processed
    up_num = 0   # images successfully uploaded

    if not os.path.exists(image_folder_path):
        print('current path: {}'.format(os.getcwd()))
        raise Exception('{} image folder doesn\'t exist, please specify correct path'.format(image_folder_path))

    emby_url = return_config_string(["emby专用", "网址"])
    api_key = return_config_string(["emby专用", "api id"])

    # normalize the emby url so url joining below works
    if not emby_url.endswith('/'):
        emby_url += '/'

    try:
        for actress in list_emby_actress(emby_url, api_key):
            num += 1
            if num % 500 == 0:
                print('have processed', num, '个actress')

            actress_name = actress['Name']
            actress_id = actress['Id']
            res_info = {'log': f'processed 女优:{actress_name}, ID:{actress_id}'}

            # look for a local portrait; jpg preferred over png
            # (was two duplicated if/elif branches)
            for ext in ('jpg', 'png'):
                file_path = os.path.join(image_folder_path,
                                         f'{actress_name}.{ext}')
                if os.path.isfile(file_path):
                    up_num += post_image_to_actress(actress_id, file_path,
                                                    emby_url, api_key)
                    break
            else:
                # fixed typo in log message: "doen't" -> "doesn't"
                res_info = {'log': f'{actress_name} image file doesn\'t exist'}
            print(res_info)

            yield json.dumps(res_info)+'\n'

    except requests.exceptions.ConnectionError:
        print('emby服务端无法访问,请检查:', emby_url, '\n')
    except Exception as err:
        traceback.print_exc()
        print('发生未知错误,请截图给作者:', emby_url, err)

    print(f'成功upload {up_num} 个女优头像!')
    yield json.dumps({'log': f'成功upload {up_num} 个女优头像!'})+'\n'
Code example #6
def javlib_set_page(page_template: str,
                    page_num=1,
                    url_parameter=None,
                    config=None) -> tuple:
    """Scrape one javlibrary listing page.

    :param page_template: url template with ``{page_num}`` / ``{url_parameter}``
    :param page_num: listing page number to fetch
    :param url_parameter: extra value substituted into the template
    :param config: unused here; kept for interface compatibility
    :returns: ``(jav_objs_raw, max_page)`` — a defaultlist of per-video dicts
        (title/javid/img/car) and the last available page number.
        (Return annotation fixed: function returns a tuple, not a dict.)
    """
    # xpaths for the fields of every video tile on the listing page
    xpath_dict = {
        'title': '//*[@class="video"]/a/@title',
        'javid': '//*[@class="video"]/@id',
        'img': '//*[@class="video"]/a/img/@src',
        'car': '//*/div[@class="video"]/a/div[@class="id"]/text()'
    }
    xpath_max_page = '//*/div[@class="page_selector"]/a[@class="page last"]/@href'

    # force to get url from ini file each time
    javlib_url = return_config_string(['其他设置', 'javlibrary网址'])

    lib_url = javlib_url + page_template.format(page_num=page_num,
                                                url_parameter=url_parameter)
    print(f'accessing {lib_url}')

    # NOTE(review): relies on return_post_res accepting behind_cloudflare —
    # confirm against the version of return_post_res actually in this module
    res = return_post_res(lib_url, behind_cloudflare=True).content
    root = etree.HTML(res)

    # collate per-field xpath hits into one dict per video tile
    jav_objs_raw = defaultlist(dict)
    for k, v in xpath_dict.items():
        _values = root.xpath(v)
        for _i, _value in enumerate(_values):
            jav_objs_raw[_i].update({k: _value})

    try:
        max_page = find_max_page(root.xpath(xpath_max_page)[0])
    except IndexError:
        # no "last page" link means this is the only page
        max_page = page_num

    return jav_objs_raw, max_page
Code example #7
def javbus_magnet_search(car: str):
    """Search javbus for magnet links of one film.

    :param car: the film id (e.g. "ABC-123")
    :returns: defaultlist of dicts with magnet/title/size and a numeric
        'size_sort' key for sorting
    """
    jav_url = return_config_string(['其他设置', 'javbus网址'])
    gid_match = r'.*?var gid = (\d*);.*?'
    magnet_xpath = {
        'magnet': '//tr/td[position()=1]/a[1]/@href',
        'title': '//tr/td[position()=1]/a[1]/text()',
        'size': '//tr/td[position()=2]/a[1]/text()'
    }
    main_url_template = jav_url+'{car}'
    magnet_url_template = jav_url+'ajax/uncledatoolsbyajax.php?gid={gid}&uc=0'

    # the magnet ajax endpoint needs the page's gid and a referer header
    res = return_get_res(main_url_template.format(car=car)).text
    gid = re.search(gid_match, res).groups()[0]

    res = return_get_res(magnet_url_template.format(gid=gid), headers={'referer': main_url_template.format(car=car)}).content
    root = etree.HTML(res)

    magnets = defaultlist(dict)
    for k, v in magnet_xpath.items():
        _values = root.xpath(v)
        for _i, _value in enumerate(_values):
            # compute the cleaned value once (was duplicated inline twice)
            cleaned = _value.strip('\t').strip('\r').strip('\n').strip()
            magnets[_i].update({k: cleaned})
            if k == 'size':
                magnets[_i].update({'size_sort': parsed_size_to_int(cleaned)})

    return magnets
Code example #8
def javbus_set_page(page_template: str, page_num=1, url_parameter=None, config=None) -> tuple:
    """Scrape one javbus listing page.

    :param page_template: url template with ``{page_num}`` / ``{url_parameter}``
    :param page_num: listing page number to fetch
    :param url_parameter: extra value substituted into the template
    :param config: unused here; kept for interface compatibility
    :returns: ``(jav_objs_raw, max_page)`` — a defaultlist of per-video dicts
        and the max page indicator from the pagination bar.
        (Return annotation fixed: function returns a tuple, not a dict.)
    """
    xpath_dict = {
        'title': '//div[@class="photo-frame"]/img[not(contains(@src, "actress"))]/@title',
        'javid': '//div[@class="photo-info"]/span/date[1]/text()',
        'img': '//div[@class="photo-frame"]/img[not(contains(@src, "actress"))]/@src',
        'car': '//div[@class="photo-info"]/span/date[1]/text()'
    }
    xpath_max_page = '//ul[@class="pagination pagination-lg"]/li/a/text()'

    # force to get url from ini file each time
    javbus_url = return_config_string(['其他设置', 'javbus网址'])
    set_url = javbus_url + page_template.format(page_num=page_num, url_parameter=url_parameter)
    print(f'accessing {set_url}')

    res = return_post_res(set_url).content
    root = etree.HTML(res)

    # collate per-field xpath hits into one dict per video tile
    jav_objs_raw = defaultlist(dict)
    for k, v in xpath_dict.items():
        _values = root.xpath(v)
        for _i, _value in enumerate(_values):
            jav_objs_raw[_i].update({k: _value})

    # second-to-last pagination entry holds the max page; fall back to the
    # requested page when pagination is missing or empty.
    # (was a bare `except:` — narrowed to the lookup failure it guards)
    try:
        max_page = root.xpath(xpath_max_page)[-2]
    except IndexError:
        max_page = page_num
    if not max_page:
        max_page = page_num

    return jav_objs_raw, max_page
Code example #9
def read_local_ini():
    """Return the local ini configuration as a JSON response.

    With a ``filter_dict`` query argument (a literal dict mapping output keys
    to [section, option] lookup paths), only the requested options are
    returned along with any lookup errors; otherwise every section is dumped.
    """
    filter_arg = request.args.get('filter_dict')
    if not filter_arg:
        # no filter requested: dump every ini section as a plain dict
        return jsonify({'local_config': load_ini_file()._sections})

    res = {}
    errors = []
    # NOTE: literal_eval only parses Python literals, so untrusted query
    # input cannot execute code here
    for key, lookup in literal_eval(filter_arg).items():
        try:
            res[key] = return_config_string(lookup)
        except IniNotFoundException as err:
            errors.append(str(err))
    return jsonify({'local_config': res, 'error': errors})
Code example #10
    def __init__(self, *args, **kwargs):
        """Configure xpath lookup tables and the base url for javlibrary."""
        super(JavLibraryScraper, self).__init__(*args, **kwargs)
        self.source = 'javlibrary'
        # xpaths for single-valued fields ('search_field') and multi-valued
        # fields ('search_list_field') on a javlibrary film page
        self.xpath_dict = {
            'search_field': {
                'title': '//title/text()',
                'studio': '//tr[td="制作商:"]/td[2]/span/a/text()',
                'premiered': '//tr[td="发行日期:"]/td[2]/text()',
                # 'year' is derived from the release date, so no xpath here
                'length': '//tr[td="长度:"]/td[2]/span/text()',
                'director': '//tr[td="导演:"]/td[2]/text()',
                'image': '//img[@id="video_jacket_img"]/@src',
                'score': '//span[@class="score"]/text()'
            },
            'search_list_field': {
                # 'plot' has no good source on javlibrary
                'all_actress': '//span[@class="star"]/a/text()',
                'genres': '//span[@class="genre"]/a/text()'
            },
        }

        # re-read the base url from the ini file on every instantiation
        self.jav_url = return_config_string(['其他设置', 'javlibrary网址'])
Code example #11
    def __init__(self, *args, **kwargs):
        """Configure xpath lookup tables and the base url for javbus."""
        super(JavBusScraper, self).__init__(*args, **kwargs)
        # fixed copy-paste bug: this scraper previously tagged itself as
        # 'javlibrary' even though every xpath and the base url are javbus
        self.source = 'javbus'
        # xpaths for single-valued fields ('search_field') and multi-valued
        # fields ('search_list_field') on a javbus film page
        self.xpath_dict = {
            'search_field': {
                'title': '//a[@class="bigImage"]/img/@title',
                'studio': '//p[span="製作商:"]/a/text()',
                'premiered': '//p[span="發行日期:"]/text()',
                # 'year' is derived from the release date, so no xpath here
                'length': '//p[span="長度:"]/text()',
                'director': '//p[span="導演:"]/a/text()',
                'image': '//a[@class="bigImage"]/img/@src',
                # 'score' has no good source on javbus
            },
            'search_list_field': {
                # 'plot' has no good source on javbus
                'all_actress': '//span[@class="genre" and @onmouseover]/a/text()',
                'genres': '//span[@class="genre"]/a[contains(@href, "genre")]/text()'
            },
        }

        # re-read the base url from the ini file on every instantiation
        self.jav_url = return_config_string(['其他设置', 'javbus网址'])
Code example #12
def javlib_set_page(page_prefix: str, page_num: int, config=None) -> tuple:
    """Scrape one javlibrary listing page.

    :param page_prefix: url path prefix the page number is appended to
    :param page_num: listing page number to fetch
    :param config: request config with 'proxies' and 'cookies'; defaults to a
        copy of ``DEFAULT_JAVLIB_CONFIG``
    :returns: ``(jav_objs_raw, max_page)`` — a defaultlist of per-video dicts
        (title/javid/img/car) and the last available page number.
        (Return annotation fixed: function returns a tuple, not a dict.)
    """
    # xpaths for the fields of every video tile on the listing page
    xpath_dict = {
        'title': '//*[@class="video"]/a/@title',
        'javid': '//*[@class="video"]/@id',
        'img': '//*[@class="video"]/a/img/@src',
        'car': '//*/div[@class="video"]/a/div[@class="id"]/text()'
    }
    xpath_max_page = '//*/div[@class="page_selector"]/a[@class="page last"]/@href'

    # force to get url from ini file each time
    javlib_url = return_config_string(['其他设置', 'javlibrary网址'])

    # fill missing parameters (was `config == None`)
    if config is None:
        config = deepcopy(DEFAULT_JAVLIB_CONFIG)

    lib_url = javlib_url + page_prefix + str(page_num)
    print(f'accessing {lib_url}')

    res = return_post_res(lib_url,
                          proxies=config['proxies'],
                          cookies=config['cookies']).content
    root = etree.HTML(res)

    # collate per-field xpath hits into one dict per video tile
    jav_objs_raw = defaultlist(dict)
    for k, v in xpath_dict.items():
        _values = root.xpath(v)
        for _i, _value in enumerate(_values):
            jav_objs_raw[_i].update({k: _value})

    try:
        max_page = find_max_page(root.xpath(xpath_max_page)[0])
    except IndexError:
        # no "last page" link means this is the only page
        max_page = page_num

    return jav_objs_raw, max_page
Code example #13
def parse_javlib(jav_obj: dict, config=None) -> dict:
    """Search javlibrary for ``jav_obj['car']`` and fill in its metadata.

    Handles three search outcomes: a direct hit (the film page), a
    multi-result listing (the first result is followed), or no result
    (``JAVNotFoundException``).  Also fetches Japanese actress names from the
    ja mirror and merges them as ``cn[jp]``.

    :param jav_obj: dict that must contain 'car'; updated in place
    :param config: request/regex config; defaults to a copy of
        ``DEFAULT_JAVLIB_CONFIG``
    :returns: the updated *jav_obj*
    :raises JAVNotFoundException: when the car cannot be found on javlib
    """
    # force to get url from ini file each time
    javlib_url = return_config_string(['其他设置', 'javlibrary网址'])

    # fill missing parameters (was `config == None`)
    if config is None:
        config = deepcopy(DEFAULT_JAVLIB_CONFIG)

    # perform search first
    lib_search_url = javlib_url + 'vl_searchbyid.php?keyword=' + jav_obj['car']
    print(f'accessing {lib_search_url}')

    jav_html = return_html_text(lib_search_url,
                                proxies=config['proxies'],
                                cookies=config['cookies'])
    # The result page is usually the film's own page but may be a listing of
    # multiple results.  Finding a <title> match means we hit the film page.
    if jav_obj['car'].startswith('T28'):
        # special filter for T28 cars, which the generic regex misses
        title_re = re.search(r'<title>(T28-\d{1,5}.+?) - JAVLibrary</title>',
                             jav_html)
        # also update the title regex so re_parse_html below matches
        config['search_field'][
            'title'] = r'<title>(T28-\d{1,5}.+?) - JAVLibrary</title>'
    else:
        title_re = re.search(
            r'<title>([a-zA-Z]{1,6}-\d{1,5}.+?) - JAVLibrary</title>',
            jav_html)  # generic "<car> <title>" pattern
    if title_re:
        # case 1: already on the film page, nothing more to do
        pass
    else:
        # case 2: maybe a multi-result listing; collect the result links
        search_results = re.findall(r'v=javli(.+?)" title=".+?-\d+?[a-z]? ',
                                    jav_html)
        if search_results:
            # follow the first result; the rest are ignored (e.g. avop-00127bod)
            result_first_url = javlib_url + '?v=javli' + search_results[0]
            jav_html = return_html_text(result_first_url,
                                        proxies=config['proxies'],
                                        cookies=config['cookies'])
        else:
            # case 3: the search page contains nothing at all
            raise JAVNotFoundException('{} cannot be found in javlib'.format(
                jav_obj['car']))

    print('>>正在处理:', jav_obj['car'])
    # process standard fields; update() is fine as each allows one value only
    jav_obj.update(re_parse_html(config['search_field'], jav_html))
    # process list fields
    for k, v in config['search_list_field'].items():
        for each_v in re.findall(v, jav_html):
            jav_obj.setdefault(k, []).append(each_v)

    # strip the leading car from the title ("CAR Title" -> "Title")
    if 'title' in jav_obj:
        title_re = re.search(r'(.+?) (.+)', jav_obj['title'])
        if title_re:
            jav_obj['title'] = title_re.group(2)
    else:
        # was `import ipdb; ipdb.set_trace()` — debugger left in production
        print('warning: no title parsed for', jav_obj['car'])

    # rescale the 0-10 javlib score into a 0-100 range
    if 'score' in jav_obj:
        score = (float(jav_obj['score']) - 4) * 5 / 3
        if score >= 0:
            score = '%.1f' % score
        jav_obj['score'] = str(float(score) * 10)

    # extra processing: fetch japanese actress names from the ja mirror
    jav_obj['all_actress'] = []
    actress_jav_ids = r'<a href=\"vl_star\.php\?s=(.+?)\" rel=\"tag\">(.+?)</a>'
    # (dropped dead `True and`) TODO: read this switch from the ini file
    if 'ja' not in javlib_url:
        javlib_url_jp = javlib_url.replace('cn', 'ja')
        for act_id_re in re.findall(actress_jav_ids, jav_html):
            if len(act_id_re) != 2:
                print(f'skipping {act_id_re}, not enough info')
                continue

            ind_url_jp = javlib_url_jp + f'vl_star.php?s={act_id_re[0]}'
            print(f'requesting {ind_url_jp} for jp name')
            jp_html_text = return_html_text(ind_url_jp)

            act_name = act_id_re[1]
            # look up the jp name; re.search returns None on no match, so the
            # .group() call raises AttributeError.  (was a bare except that
            # dropped into ipdb and left jp_name unbound)
            try:
                jp_name_filter = r'<div class="boxtitle">(.+?)のビデオ</div>'
                jp_name = re.search(jp_name_filter, jp_html_text).group(1)
            except AttributeError:
                jp_name = None

            # merge jp into cn name as "cn[jp]" when they differ
            if jp_name and jp_name != act_name:
                act_name = '{}[{}]'.format(act_name, jp_name)

            jav_obj['all_actress'].append(act_name)

    # force set year if not detected
    if not jav_obj.get('year'):
        jav_obj['year'] = 'unknown'

    return jav_obj
Code example #14
    def send_emby_images(self, image_folder_path=None):
        """Upload actress portrait images to the Emby server.

        For every actress yielded by ``actress_yielder``, prefer a local
        ``<name>.jpg`` / ``<name>.png`` under *image_folder_path*; when none
        exists, scrape an image url via ``WarashiScraper`` (cached in
        ``self.walked_actress``).  Yields one newline-terminated JSON log
        line per actress plus a final summary line.

        :param image_folder_path: optional folder with local portrait files
        """
        # counters / bookkeeping
        num = 0
        up_num = []
        failed_names = []

        emby_url = return_config_string(["emby专用", "网址"], config=self.config)
        api_key = return_config_string(["emby专用", "api id"],
                                       config=self.config)
        image_scraper = WarashiScraper()

        # normalize the emby url with a trailing /
        if not emby_url.endswith('/'):
            emby_url += '/'

        try:
            for actress in self.actress_yielder(emby_url, req_site='emby'):
                num += 1
                if num % 500 == 0:
                    print('have processed', num, '个actress')

                actress_name = actress['Name']
                actress_id = actress['Id']

                # names may be "cn[jp]"; use the jp part for searching if so
                actress_formatter = r'(.+?)\[(.+?)\]'
                actress_groups = re.search(actress_formatter, actress_name)
                if actress_groups and len(actress_groups.groups()) == 2:
                    search_term = actress_groups.groups()[1]
                    print(f'use {search_term} for search')
                else:
                    search_term = None

                # unless replacing, skip actresses that already have an image
                if not self.replace and actress.get('ImageTags', {}) != {}:
                    res_info = {
                        'log':
                        f'skipping 女优:{actress_name}, already has existing images'
                    }
                    yield json.dumps(res_info, ensure_ascii=False) + '\n'
                    continue

                # prefer a local image file when a folder was supplied
                has_local_image = False
                if image_folder_path:
                    if os.path.isfile(
                            os.path.join(image_folder_path,
                                         f'{actress_name}.jpg')):
                        file_path = os.path.join(image_folder_path,
                                                 f'{actress_name}.jpg')
                        self.post_image_to_actress(actress_id, file_path,
                                                   emby_url, api_key)
                        up_num.append(actress_name)
                        has_local_image = True
                    elif os.path.isfile(
                            os.path.join(image_folder_path,
                                         f'{actress_name}.png')):
                        file_path = os.path.join(image_folder_path,
                                                 f'{actress_name}.png')
                        self.post_image_to_actress(actress_id, file_path,
                                                   emby_url, api_key)
                        up_num.append(actress_name)
                        has_local_image = True

                if not has_local_image:
                    try:
                        # walked_actress caches name -> scraped image url
                        if not self.walked_actress.get(
                                search_term or actress_name, ''):
                            image_url = image_scraper.return_image_by_name(
                                search_term or actress_name)
                            self.walked_actress[search_term
                                                or actress_name] = image_url
                        else:
                            image_url = self.walked_actress[search_term
                                                            or actress_name]
                        self.post_image_to_actress(actress_id, image_url,
                                                   emby_url, api_key)
                        up_num.append(actress_name)
                    except ActorNotFoundException as e:
                        res_info = {'log': str(e)}
                        failed_names.append(actress_name)
                        yield json.dumps(res_info, ensure_ascii=False) + '\n'
                        continue
                    except Exception as e:
                        # best-effort: emit the traceback as a log line and
                        # keep going with the next actress
                        traceback_str = traceback.format_exc()
                        yield json.dumps(traceback_str,
                                         ensure_ascii=False) + '\n'
                        continue

                res_info = {
                    'log': f'processed 女优:{actress_name}, ID:{actress_id}'
                }

                yield json.dumps(res_info, ensure_ascii=False) + '\n'

        except requests.exceptions.ConnectionError:
            print('emby服务端无法访问,请检查:', emby_url)
        except Exception as err:
            traceback.print_exc()
            print('发生未知错误,请截图给作者:', emby_url, err)

        print(f'成功upload {len(up_num)} 个女优头像!')
        yield json.dumps(
            {
                'log':
                f'成功upload {len(up_num)} 个女优头像!succeeded on {up_num} \n failed on {failed_names}'
            },
            ensure_ascii=False) + '\n'
Code example #15
 def setup_credentials(self):
     """Read the Emby api key from the ini config and return it as a dict."""
     return {
         'api_key': return_config_string(["emby专用", "api id"],
                                         config=self.config)
     }