def generate_ips(self):
    """
    Lazily yield the usable proxy addresses stored in ``ips_ok.txt``.

    The file is looked up under ``self.config["abs_dir"]``. Every line is
    passed through ``toolBox.strip`` before it is yielded, one per step.
    """
    ips_path = self.config["abs_dir"] + "/ips_ok.txt"
    with open(ips_path, "rt") as source:
        yield from (toolBox.strip(raw_line) for raw_line in source)
def get_ips_from_file(self):
    """
    Read all usable proxy addresses from ``ips_ok.txt``.

    The file is looked up under ``self.config["abs_dir"]``.

    :return: list holding every line of the file after ``toolBox.strip``
    """
    ips_path = self.config["abs_dir"] + "/ips_ok.txt"
    with open(ips_path, "rt") as source:
        return [toolBox.strip(raw_line) for raw_line in source]
def get_66_ips(self):
    """
    Collect proxy addresses from the 66ip.cn site.

    Performs ten fetches, each against a randomly chosen endpoint
    (``proxytype=0`` for http, ``proxytype=1`` for https), and adds every
    extracted address, prefixed with its scheme, to ``self._ip_cache_lib``.
    """
    # Each endpoint is paired with the scheme prefix of the proxies it lists.
    endpoints = [
        ("http://www.66ip.cn/nmtq.php?proxytype=0", "http://"),   # http
        ("http://www.66ip.cn/nmtq.php?proxytype=1", "https://"),  # https
    ]
    for _ in range(10):
        url, scheme = random.choice(endpoints)
        page = self.get_html(url, "gbk")
        # Take every second item among positions 10..48 of the page
        # iteration; presumably the skipped items are separators between
        # addresses -- TODO confirm against what get_html returns.
        candidates = list(islice(page, 10, 49, 2))
        for node in candidates:
            if node is not None:
                self._ip_cache_lib.add(scheme + toolBox.strip(node))
        # Wait the configured interval before issuing the next request.
        time.sleep(self.config["frequency"])
def load_config():
    """
    Parse ``config.txt`` and return the settings as a dict.

    The file path is resolved with ``os.path.abspath("config.txt")``, i.e.
    relative to the current working directory. Each non-blank line must
    have the form ``key=value``; only the first ``=`` separates key from
    value, so a value may itself contain ``=`` (e.g. a URL with a query
    string). Options missing from the file receive default values.

    :return: dict of configuration values with defaults filled in
    :raises AnalysisError: if a key is present but its value is empty
    :raises ValueError: if a non-blank line contains no ``=`` at all
    """
    config = {}
    # Everything read from the file is a string, but the values of these
    # keys must not stay strings, so they are converted after parsing.
    need_eval = ["proxy", "connect_timeout", "read_timeout", "frequency"]
    config_path = os.path.abspath("config.txt")
    with open(config_path, "rt") as config_file:
        for item in config_file:
            item = toolBox.strip(item)
            if not item:
                # A blank line carries no setting; skip it instead of
                # failing on the key/value unpacking below.
                continue
            # Split on the first "=" only, so "=" inside the value is kept.
            key, value = item.split("=", 1)
            if value == "":
                # A key without a value is a parse error.
                msg = "'" + key + "'"
                raise AnalysisError(msg)
            # Convert the string to its proper type where required.
            # NOTE(review): eval() executes arbitrary expressions from the
            # config file; consider ast.literal_eval if the file can ever
            # come from an untrusted source.
            config[key] = eval(value) if key in need_eval else value
    toolBox.print_format("Loading config")
    # Defaults for every configurable option.
    config.setdefault("proxy", False)
    config.setdefault("dir_name", "ips_lib/")
    config.setdefault("abs_dir", os.path.abspath(config["dir_name"]))
    # Only html.parser is supported for now; lxml and others may follow.
    config.setdefault("parser", "html.parser")
    config.setdefault("connect_timeout", 3)
    config.setdefault("read_timeout", 6)
    config.setdefault("frequency", 6)
    config.setdefault("test_domain", "https://book.douban.com/")
    toolBox.print_dict(config)
    return config