Beispiel #1
0
 def firefox_ver(self, value):
     '''
     Update the stored Firefox version bounds from a dict.

     Input that is not a dict is ignored and the current bounds are kept;
     a 'min' or 'max' entry that is not an int is ignored as well.
     :param value: dict that may carry integer 'min' and/or 'max' entries
     :return: None
     '''
     if not helper.match_expect_type(value, 'dict'):
         return

     if 'min' in value:
         lower = value['min']
         if helper.match_expect_type(lower, 'int'):
             self._firefox_ver['min'] = lower

     if 'max' in value:
         upper = value['max']
         if helper.match_expect_type(upper, 'int'):
             # range() leaves out its end value, so max is stored as
             # max + 1: range(74, 75) only yields 74.
             self._firefox_ver['max'] = upper + 1
Beispiel #2
0
 def storage_type(self, value):
     '''
     Store the storage type after checking it against StorageType.

     The current value is kept when enum_set_check returns None.
     :param value: candidate value handed to gbh_helper.enum_set_check
     :return: None
     '''
     checked = gbh_helper.enum_set_check(
         value=value, enum_type=gfp_self_enum.StorageType)
     if checked is not None:
         self._storage_type = checked
Beispiel #3
0
 def proxy_type(self, value):
     '''
     Store the proxy type after checking it against ProxyType.

     The current value is kept when enum_set_check returns None.
     :param value: candidate value handed to gbh_helper.enum_set_check
     :return: None
     '''
     checked = gbh_helper.enum_set_check(
         value=value, enum_type=gfp_self_enum.ProxyType)
     if checked is not None:
         self._proxy_type = checked
Beispiel #4
0
 def country(self, value):
     '''
     Store the country setting after checking it against Country.

     enum_set_check is called with replace=False. The current value is kept
     when the check returns None.
     :param value: candidate value handed to gbh_helper.enum_set_check
     :return: None
     '''
     checked = gbh_helper.enum_set_check(
         value=value, enum_type=gfp_self_enum.Country, replace=False)
     if checked is not None:
         self._country = checked
Beispiel #5
0
def get_chrome_ver(setting, url, if_need_proxy, proxies):
    '''
    Collect Chrome version strings from a version listing page.

    :param setting: setting instance; chrome_max_release_year is read from it
    :param url: URL of the page that lists the Chrome versions
    :param if_need_proxy: whether the request has to go through a proxy
    :param proxies: proxy to use when one is needed
    :return: set of version strings
    '''
    this_year = datetime.date.today().year

    response = helper.send_request_get_response(
        url=url,
        if_use_proxy=if_need_proxy,
        proxies=proxies,
        header=self_constant.HEADER)
    rows = response.html.find(
        'div.download_content>ul.fix>'
        'li[class!=divide-line]', first=False)

    versions = set()
    for row in rows:
        version_links = row.find('span.version_title>a')
        release_dates = row.find('span.release_date')
        # The first li is the table header and carries no version link.
        if not version_links:
            continue

        # The release date text starts with the year, followed by '-'.
        release_year = int(release_dates[0].text.split('-')[0])
        age_in_years = this_year - release_year + 1
        if age_in_years > setting.chrome_max_release_year:
            # Released too long ago for the configured limit.
            continue

        # Fourth '_'-separated field of the link text
        # (presumably the version number; verify against the page).
        versions.add(version_links[0].text.split('_')[3])
    return versions
Beispiel #6
0
 def raw_site(self, value):
     '''
     Store the raw site selection and rebuild the derived site list.

     Nothing changes when enum_set_check returns None.
     :param value: candidate value checked against SupportedWeb
     :return: None
     '''
     checked = gbh_helper.enum_set_check(
         value=value, enum_type=gfp_self_enum.SupportedWeb)
     if checked is None:
         return

     self._raw_site = checked
     # The site list depends on the raw sites, so it is regenerated here.
     self._site = self._generate_site(
         enumset_site=self._raw_site,
         enumset_protocol=self._protocol,
         int_site_max_page_no=self._site_max_page_no)
Beispiel #7
0
 def chrome_max_release_year(self, value):
     '''
     Store the maximum Chrome release age if the value is acceptable.

     The value must be an int, must not be negative and must not exceed
     CHROME_MAX_RELEASE_YEAR; otherwise the current value is kept.
     :param value: candidate value
     :return: None
     '''
     if (helper.match_expect_type(value, 'int')
             and 0 <= value <= CHROME_MAX_RELEASE_YEAR):
         self._chrome_max_release_year = value
Beispiel #8
0
    def site_max_page_no(self, value):
        '''
        Set the maximum page number and rebuild the site list.

        :param value: int between 1 and 10, inclusive
        :return: None
        :raises ValueError: if value is not an int or lies outside 1..10
        '''
        if not gbh_helper.match_expect_type(value, 'int'):
            raise ValueError('site_max_page_no的值必须是整数')
        # The check accepts 1..10 inclusive. The message used to say
        # "1 to 9", which did not match the range enforced here.
        if value < 1 or value > 10:
            raise ValueError('site_max_page_no的值必须在1到10之间')
        # Stored as value + 1: per the original note the URLs are built with
        # a list expression, and the extra one makes the given page number
        # behave as an inclusive upper bound.
        self._site_max_page_no = value + 1

        # The site list depends on the page limit, so it is regenerated.
        self._site = self._generate_site(
            enumset_site=self._raw_site,
            enumset_protocol=self._protocol,
            int_site_max_page_no=self._site_max_page_no)
Beispiel #9
0
    def time_interval_in_seconds(self, old_date_time, new_date_time):
        '''
        Return the time from old_date_time to new_date_time in whole seconds.

        Either argument may be a datetime.datetime or a string in the
        format '%Y-%m-%d %H:%M:%S'.
        :param old_date_time: start of the interval
        :param new_date_time: end of the interval
        :return: int, new_date_time minus old_date_time in seconds
        :raises ValueError: if an argument is neither a datetime nor a str
        '''
        text_format = '%Y-%m-%d %H:%M:%S'

        if not helper.match_expect_type(old_date_time, 'datetime.datetime'):
            if not helper.match_expect_type(old_date_time, 'str'):
                raise ValueError('old_date_time的格式不正确')
            old_date_time = datetime.datetime.strptime(
                old_date_time, text_format)

        if not helper.match_expect_type(new_date_time, 'datetime.datetime'):
            if not helper.match_expect_type(new_date_time, 'str'):
                raise ValueError('new_date_time的格式不正确')
            new_date_time = datetime.datetime.strptime(
                new_date_time, text_format)

        elapsed = new_date_time - old_date_time
        return int(elapsed.total_seconds())
Beispiel #10
0
def getPrefilterFunction(webenum):
    '''
    Return the prefilter function that belongs to a SupportedWeb member.

    :param webenum: SupportedWeb member, matched by its name
    :return: function, or None when webenum is not a SupportedWeb or no
        prefilter is registered for it
    '''
    if gbh_helper.match_expect_type(webenum, 'SupportedWeb'):
        web_name = webenum.name
        supported = gfp_self_enum.SupportedWeb
        if web_name == supported.Xici.name:
            return prefilter.pre_filter_xicidaili
        elif web_name == supported.Kuai.name:
            return prefilter.pre_filter_kuaidaili
        elif web_name == supported.Proxylist.name:
            return prefilter.pre_filter_proxy_list
        elif web_name == supported.Hidemy.name:
            return prefilter.pre_filter_hidemy

    return None
Beispiel #11
0
def getExtractDataFunction(webenum):
    '''
    Return the page data extraction function for a SupportedWeb member.

    :param webenum: SupportedWeb member, matched by its name
    :return: function, or None when webenum is not a SupportedWeb or no
        extraction function is registered for it
    '''
    if gbh_helper.match_expect_type(webenum, 'SupportedWeb'):
        web_name = webenum.name
        supported = gfp_self_enum.SupportedWeb
        if web_name == supported.Xici.name:
            return gen_proxy_from_page.extra_data_from_page_xicidaili
        elif web_name == supported.Kuai.name:
            return gen_proxy_from_page.extra_data_from_page_kuaidaili
        elif web_name == supported.Proxylist.name:
            return gen_proxy_from_page.extra_data_from_page_proxylist
        elif web_name == supported.Hidemy.name:
            return gen_proxy_from_page.extra_data_from_page_hidemy

    return None
Beispiel #12
0
 def validate_single_proxy(self, single_proxy, url, final_result):
     '''
     Check whether one proxy works for a URL and collect it when it does.

     :param single_proxy: dict, one record of the gen_proxy result; its
         'ip' and 'port' entries are read here
     :param url: URL the proxy is tested against
     :param final_result: list that receives single_proxy when the proxy is
         usable; per the original note the return value cannot be used when
         this runs in a coroutine, so the result is delivered through it
     :return: True when the proxy is usable, otherwise False
     '''
     ip, port = single_proxy['ip'], single_proxy['port']
     address = '%s:%s' % (ip, port)
     # The same address serves both plain and TLS traffic.
     proxy = {'http': address, 'https': address}
     print('开始检测代理%s:%s对网站%s是否有效' % (ip, port, url))

     if not gbh_helper.detect_if_proxy_usable(proxies=proxy, url=url):
         print('代理 %s 无效' % address)
         return False

     print('代理 %s 有效' % address)
     final_result.append(single_proxy)
     return True
Beispiel #13
0
 def check_if_site_need_proxy(self):
     '''
     Record on every configured site whether it needs a proxy.

     The first URL of each site is probed with
     gbh_helper.detect_if_need_proxy and the result is written to the
     site's 'need_proxy' entry.
     :return: None
     '''
     for site_entry in self._site:
         first_url = site_entry['urls'][0]
         site_entry['need_proxy'] = gbh_helper.detect_if_need_proxy(first_url)
Beispiel #14
0
def gen_header(setting, url, num=None):
    '''
    Build request headers, one per generated user-agent string.

    :param setting: setting instance; browser_type, firefox_header_no_ua and
        chrome_header_no_ua are read here and the instance is passed on to
        the user-agent generators
    :param url: URL whose host becomes the 'Host' header
    :param num: wanted number of headers; None asks the generators for
        everything they can produce
    :return: list of header dicts; a user agent that contains neither
        'Firefox' nor 'Chrome' yields no header
    '''
    browsers = setting.browser_type
    want_firefox = self_enum.BrowserType.FireFox in browsers
    want_chrome = self_enum.BrowserType.Chrome in browsers

    # A generator may return None instead of a list (generate_chrome_ua
    # does so when it cannot build its URLs); "or []" keeps "+=" from
    # raising TypeError in that case.
    ua = []
    if num is None:
        # No limit: take everything both generators offer.
        if want_firefox:
            ua += gen_ua.generate_firefox_ua(setting=setting) or []
        if want_chrome:
            ua += gen_ua.generate_chrome_ua(setting=setting) or []
    elif num == 1:
        # A single header: Firefox is preferred over Chrome.
        if want_firefox:
            ua += gen_ua.generate_firefox_ua(setting=setting, num=1) or []
        elif want_chrome:
            ua += gen_ua.generate_chrome_ua(setting=setting, num=1) or []
    else:
        # Several headers: Firefox first, Chrome only for the shortfall.
        if want_firefox:
            ua += gen_ua.generate_firefox_ua(setting=setting, num=num) or []
        missing = num - len(ua)
        if missing > 0 and want_chrome:
            # Ask only for what is still missing, so the total does not
            # exceed num.
            ua += gen_ua.generate_chrome_ua(
                setting=setting, num=missing) or []

    host = gbh_helper.extract_host_from_url(url)
    header = []
    for single_ua in ua:
        if 'Firefox' in single_ua:
            base_header = setting.firefox_header_no_ua
        elif 'Chrome' in single_ua:
            base_header = setting.chrome_header_no_ua
        else:
            continue
        header.append({
            **base_header,
            'User-Agent': single_ua,
            'Host': host,
        })
    return header
Beispiel #15
0
def generate_chrome_ua(setting, num=None):
    '''
    Generate Chrome user-agent strings for Windows.

    :param setting: setting instance; proxies, os_type and WIN_VER are read
        here and the instance is passed on to the helper functions
    :param num: wanted number of user agents; 1 returns a fixed user agent
        without any network access, None returns every combination
    :return: list of user-agent strings; an empty list when the version URLs
        cannot be generated
    :raises Exception: when a proxy is needed but setting has none, or none
        of the configured proxies works
    '''
    # A single user agent needs no version lookup at all.
    if num == 1:
        return [
            'Mozilla/5.0 (Windows NT 6.0; Win64; x64) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36'
        ]

    try:
        version_url = generate_chrome_url_base_on_type(setting)
    except ValueError as e:
        print(e)
        # Return an empty list rather than None so that callers can
        # concatenate the result without a special case.
        return []

    # Find out whether the version site needs a proxy and, if so, pick the
    # first configured proxy that works.
    if_need_proxy = helper.detect_if_need_proxy(self_constant.CHROME_BASE_URL)
    valid_proxies = None
    if if_need_proxy:
        if setting.proxies is None:
            # The message is written as adjacent literals: the former
            # backslash continuation put stray spaces inside the URL.
            raise Exception(
                'setting没有设置任何代理,无法连接到https://www.chromedownloads'
                '.net获得chrome版本')

        valid_proxies = next(
            (single_proxies for single_proxies in setting.proxies
             if helper.detect_if_proxy_usable(
                 proxies=single_proxies,
                 url=self_constant.CHROME_BASE_URL)),
            None)
        if valid_proxies is None:
            raise Exception('尝试了所有代理,都无法连接https://www.chromedownloads.net')

    # Merge the versions found on every listing page.
    chrome_ver = set()
    for single_url in version_url:
        chrome_ver |= get_chrome_ver(url=single_url,
                                     setting=setting,
                                     if_need_proxy=if_need_proxy,
                                     proxies=valid_proxies)

    os_bit = set()
    if self_enum.OsType.All in setting.os_type:
        os_bit = {'Win32; x32', 'Win64; x64'}
    else:
        if self_enum.OsType.Win32 in setting.os_type:
            os_bit.add('Win32; x32')
        if self_enum.OsType.Win64 in setting.os_type:
            os_bit.add('Win64; x64')

    # One user agent per (bitness, Windows version, Chrome version).
    chrome_ua = [
        'Mozilla/5.0 (%s; %s) AppleWebKit/537.36 (KHTML, '
        'like Gecko) Chrome/%s Safari/537.36' % (winver, osbit, chromever)
        for osbit in os_bit for winver in setting.WIN_VER
        for chromever in chrome_ver
    ]

    # Too many candidates: hand back a random selection of the wanted size.
    if num is not None and len(chrome_ua) > num:
        return random.sample(chrome_ua, num)

    return chrome_ua
Beispiel #16
0
 def chrome_type(self, value):
     '''
     Store the Chrome type after checking it against ChromeType.

     The current value is kept when enum_set_check returns None.
     :param value: candidate value handed to helper.enum_set_check
     :return: None
     '''
     checked = helper.enum_set_check(value, self_enum.ChromeType)
     if checked is not None:
         self._chrome_type = checked
Beispiel #17
0
 def os_type(self, value):
     '''
     Store the OS type after checking it against OsType.

     The current value is kept when enum_set_check returns None.
     :param value: candidate value handed to helper.enum_set_check
     :return: None
     '''
     checked = helper.enum_set_check(value, self_enum.OsType)
     if checked is not None:
         self._os_type = checked
Beispiel #18
0
 def browser_type(self, value):
     '''
     Store the browser type after checking it against BrowserType.

     The current value is kept when enum_set_check returns None.
     :param value: candidate value handed to helper.enum_set_check
     :return: None
     '''
     checked = helper.enum_set_check(value, self_enum.BrowserType)
     if checked is not None:
         self._browser_type = checked
Beispiel #19
0
 def valid_time_in_db(self, value):
     '''
     Set how long a record stays valid in the database.

     :param value: int between 300 and 86400 * 5, inclusive
     :return: None
     :raises ValueError: if value is not an int or lies outside that range
     '''
     if not gbh_helper.match_expect_type(value, 'int'):
         raise ValueError('valid_time_in_db的值必须是整数')

     lowest, highest = 300, 86400 * 5
     if not lowest <= value <= highest:
         raise ValueError('valid_time_in_db的值必须在300到86400×5之间')

     self._valid_time_in_db = value