def final_steps(xml_total, proxy_total):
    """Run the optional XML and proxy steps, then log a completion banner.

    Each argument is converted with ``int()`` only to decide whether its step
    runs; the original, unconverted value is what gets forwarded.

    Args:
        xml_total: count passed to ``xml_c.create_xml`` when it is > 0.
        proxy_total: count passed to ``gp.get_proxy`` when it is > 0.
    """
    xml_requested = int(xml_total) > 0
    if xml_requested:
        xml_c.create_xml(xml_total)

    proxies_requested = int(proxy_total) > 0
    if proxies_requested:
        gp.get_proxy(proxy_total)

    logger.info(f"{'='*25} SCRIPT COMPLETE {'='*25}")
def parse_state(html, proxies):
    """Walk the country links on the index page and probe each region link.

    The fourth ``div.col-xs-12`` of ``html`` is scanned for anchors.  For each
    anchor a proxy is requested from ``get_proxy.get_proxy``, the country page
    is downloaded with ``getHTMl``, the country URL is probed with
    ``Code_getHTMl``, and every anchor found in the country's
    ``div.col-md-3.col-xs-6.my-padding-6`` blocks is probed as
    ``country_href + region_href``.

    Args:
        html: markup of the index page.
        proxies: not read; it is rebound to a fresh mapping for every country.
    """
    # One pattern serves both the country list and the region list.
    anchor_re = re.compile('<a.*?href="(.*?)".*?>(.*?)</a', re.S)

    # Original author's note: this div is the set of all European countries.
    index_divs = BeautifulSoup(html, 'lxml').find_all(
        name='div', attrs={'class': 'col-xs-12'})
    country_links = anchor_re.findall(str(index_divs[3]))  # (href, name) pairs

    # Connection settings handed to the proxy lookup.
    db_host = "10.34.7.141"
    db_port = 3306
    db_password = "123456"
    db_name = "ip_proxy"
    db_user = "crawl"

    for country_link in country_links:
        country_href = country_link[0]

        proxy_addr = get_proxy.get_proxy(
            db_host, db_port, db_password, db_name, db_user)
        proxies = {'http': "http://" + proxy_addr}

        country_page = getHTMl(country_href, proxies)
        Code_getHTMl(country_href)

        # Original author's note: the country's main administrative regions.
        region_divs = BeautifulSoup(country_page, 'lxml').find_all(
            name='div', attrs={'class': 'col-md-3 col-xs-6 my-padding-6'})
        region_links = anchor_re.findall(str(region_divs))  # (href, name) pairs

        for region_link in region_links:
            Code_getHTMl(country_href + region_link[0])
def get_url(url_queue):
    """Push every subject link from the bangumi.tv ranking pages to a queue.

    Pages are requested in order, starting at page 1, through the proxy
    mapping returned by ``get_proxy()``.  A page whose download fails is
    requested again, because the page counter is not advanced.  The loop ends
    at the first page where the ``#browserItemList li .subjectCover`` selector
    matches nothing.

    Args:
        url_queue: object with a ``put`` method; receives each ``href`` value.
    """
    start_url = "http://bangumi.tv/anime/browser?sort=rank&page="
    page = 1
    while True:
        try:
            html = requests.get(start_url + str(page),
                                proxies=get_proxy()).content
        except Exception:
            # This was a bare ``except:``.  Inside an unbounded retry loop it
            # also swallowed KeyboardInterrupt and SystemExit, so the crawler
            # could not be stopped while the site or proxy was failing.
            print("获取目录出错,重试中...")
            continue

        covers = BeautifulSoup(html, "lxml").select(
            '#browserItemList li .subjectCover')
        if not covers:
            break

        for cover in covers:
            url_queue.put(cover.get("href"))

        # NOTE(review): nesting reconstructed from a flattened line; the
        # counter is assumed to advance once per successfully parsed page.
        page += 1
        print("正在获取第" + str(page) + "页列表...")
def __init__(self):
    """Load stored proxies and top the pool up to ``PROXY_LENGTH`` entries.

    Whatever ``__read_all`` returns is appended to ``self.proxy_list``.  If
    the list is still shorter than ``PROXY_LENGTH``, the missing number of
    proxies is requested from ``get_proxy``, appended, and passed to
    ``__save``.  The resulting list is logged at INFO level.
    """
    super(Proxy, self).__init__()
    self.proxy_list.extend(self.__read_all())

    missing = PROXY_LENGTH - len(self.proxy_list)
    if missing > 0:
        fetched = get_proxy(missing)
        self.proxy_list.extend(fetched)
        self.__save(fetched)

    logging.info(self.proxy_list)
def update(self, index):
    """Swap the proxy at ``index`` for a newly fetched one and persist the list.

    The entry at ``index`` is removed, the first item of ``get_proxy(1)`` is
    inserted at the same position, the list is logged and rewritten to
    ``paid-proxy.txt`` (one proxy per line), and ``self.now_index`` is moved
    back by one, wrapping to the last position when it drops below zero.

    Args:
        index: position in ``self.proxy_list`` to replace.  A position at or
            past the end of the list is ignored.
    """
    # ``index == len(list)`` is already out of range for ``pop``.  The
    # previous ``>`` comparison let it through and raised IndexError.
    if index >= len(self.proxy_list):
        return

    self.proxy_list.pop(index)
    self.proxy_list.insert(index, get_proxy(1)[0])
    logging.info(self.proxy_list)

    with open('paid-proxy.txt', 'w') as f:
        f.write('\n'.join(self.proxy_list))

    self.now_index -= 1
    if self.now_index < 0:
        self.now_index = len(self.proxy_list) - 1
def Code_getHTMl(url):
    """Request ``url`` through a looked-up proxy and report the status code.

    A proxy address is obtained from ``get_proxy.get_proxy`` and used for the
    ``http`` scheme of a ``requests.get`` call with a 10-second timeout.  For
    a non-200 response the body is decoded as UTF-8 and passed to
    ``write_to_file``, and the status code is printed.

    Args:
        url: address to request.
    """
    # Connection settings handed to the proxy lookup.
    db_host = "10.34.7.141"
    db_port = 3306
    db_password = "123456"
    db_name = "ip_proxy"
    db_user = "crawl"
    proxy_addr = get_proxy.get_proxy(
        db_host, db_port, db_password, db_name, db_user)

    request_headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'
    }
    resp = requests.get(
        url,
        headers=request_headers,
        proxies={'http': "http://" + proxy_addr},
        timeout=10,
    )

    # NOTE(review): nesting reconstructed from a flattened line; the final
    # ``print(200)`` is assumed to sit outside the ``if`` -- confirm.
    if resp.status_code != 200:
        resp.encoding = 'UTF-8'
        write_to_file(resp.text)
        print(resp.status_code)
    print(200)
def content_crawler(url_queue, data_queue):
    """Worker loop: download queued bangumi.tv pages and hand them to ``data_get``.

    The loop never returns.  While ``url_queue`` is empty it sleeps one second
    between checks.  Each queued path is joined onto ``http://bangumi.tv/``,
    downloaded through the proxy mapping returned by ``get_proxy()``, and the
    raw body is passed to ``data_get`` together with the URL and
    ``data_queue``.

    Args:
        url_queue: queue of paths/URLs to fetch (``empty`` and ``get`` are used).
        data_queue: forwarded unchanged to ``data_get``.
    """
    print("爬虫线程启动...")
    while True:
        if url_queue.empty():
            time.sleep(1)
            continue

        try:
            url = parse.urljoin("http://bangumi.tv/", url_queue.get())
            print("开始抓取: " + url)
            # The download now sits inside the ``try``.  It used to run
            # outside it, so any network or proxy error ended the worker
            # instead of reaching the handler below.
            html = requests.get(url, proxies=get_proxy()).content
        except Exception:
            # This was a bare ``except:``, which also swallowed
            # KeyboardInterrupt and SystemExit.  The failed URL is not put
            # back on the queue; the original did not re-queue either.
            print("抓取失败,重试中...")
        else:
            # Parsing stays outside the ``try`` so errors in ``data_get``
            # still surface.
            data_get(html, url, data_queue)
def _anti_progrosse(self):
    """Fetch one listing page through a proxy and run it through the parser.

    A proxy address is requested from ``get_proxy.get_proxy()``.  When one is
    available, ``begin_url`` is formatted with ``page * 90``, ``city`` and
    ``keyword`` and requested with ``headers`` through that proxy.  Those
    names are not defined in this method and must come from the enclosing
    scope.  A 200 response body is passed to ``self.parser.list_zhilian``.

    Returns:
        None in every case; failures are reported with ``print``.
    """
    # TODO: anti-crawler proxy function
    proxy = get_proxy.get_proxy()

    # The condition used to be inverted: the request branch ran only when no
    # proxy was returned (failing on ``'http://' + proxy``), and a usable
    # proxy fell through to the "can't find a proxy" message.
    if not proxy:
        print("Can't seek useful proxy!")
        return

    proxies = {
        'http': 'http://' + proxy,
        'https': 'https://' + proxy,
    }
    response = requests.get(begin_url.format(page * 90, city, keyword),
                            headers=headers, proxies=proxies)
    if response.status_code != 200:
        print('proxy mode fail!!! please wait a few time, and try again')
        return

    # NOTE(review): the parsed result is not used or returned yet, which is
    # consistent with the TODO above.
    urls = self.parser.list_zhilian(response.text)
soup1 = BeautifulSoup(country_page,'lxml') county = soup1.find_all(name = 'div',attrs = {'class':'col-md-3 col-xs-6 my-padding-6'})#国家分类的主要行政区 # print(county) county = str(county) pattern1 = re.compile('<a.*?href="(.*?)".*?>(.*?)</a',re.S) city = re.findall(pattern1,county) #获取到具体国家的href和名称 for ic in city: Code_getHTMl(c[0]+ic[0]) # print(c) # country_page = getHTMl(country[0],proxies) # print(country[0]) # soup1 = BeautifulSoup(country_page,'lxml') # county = soup1.find_all(name = 'div',attrs = {'class':'col-md-3 col-xs-6 my-padding-6'})#国家分类的主要行政区 # countries = str(countries).replace('"','\"').replace('\t','').replace('\n','').replace('\r','') if __name__ == "__main__": ht = "10.34.7.141" pt = 3306 pw = "123456" db = "ip_proxy" us = "crawl" pr = get_proxy.get_proxy(ht,pt,pw,db,us) proxies = {'http': "http://"+pr} url = 'https://www.nowmsg.com/' html = getHTMl(url,proxies) parse_state(html,proxies)
def __init__(self):
    """Load the configuration and build the proxy's runtime state.

    In order: reads basic settings from ``self.userconf``, works out the
    listen address and the local IP set, builds the PAC payload (bytes),
    optionally attaches a rotating file log handler, registers parent
    proxies, loads static host entries from the config and from
    ``./fgfw-lite/hosts``, and finally creates the redirector, the parent
    proxy selector and the DNS resolver.
    """
    self.logger = logging.getLogger('FW_Lite')
    self.version = SConfigParser()
    self.userconf = SConfigParser()
    self.reload()
    self.UPDATE_INTV = 6
    self.timeout = self.userconf.dgetint('fgfwproxy', 'timeout', 4)
    ParentProxy.DEFAULT_TIMEOUT = self.timeout
    self.parentlist = ParentProxyList()
    self.HOSTS = defaultdict(list)
    self.GUI = '-GUI' in sys.argv
    self.rproxy = self.userconf.dgetbool('fgfwproxy', 'rproxy', False)

    # "listen" is either a bare port (bound to 127.0.0.1) or "host:port".
    listen = self.userconf.dget('fgfwproxy', 'listen', '8118')
    if listen.isdigit():
        self.listen = ('127.0.0.1', int(listen))
    else:
        self.listen = (listen.rsplit(':', 1)[0], int(listen.rsplit(':', 1)[1]))

    # Local addresses: resolve our own hostname first; if that fails, use the
    # source address of a UDP socket connected to 8.8.8.8, then loopback.
    try:
        self.local_ip = set(socket.gethostbyname_ex(socket.gethostname())[2])
    except Exception:  # was a bare except, which also caught SystemExit etc.
        try:
            csock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
            try:
                csock.connect(('8.8.8.8', 53))
                addr = csock.getsockname()[0]
            finally:
                # Close the socket even when connect() fails.
                csock.close()
            self.local_ip = set([addr])
        except socket.error:
            self.local_ip = set(['127.0.0.1'])

    # Take any one local address for the PAC text and put it straight back.
    ip = self.local_ip.pop()
    self.local_ip.add(ip)

    # One template now serves both the default PAC and the "pac = host:port"
    # case; the two literals differed only in the proxy placeholder.
    # NOTE(review): line breaks inside the PAC text were reconstructed; they
    # are not significant to the PAC script itself.
    pac_template = '''\
function FindProxyForURL(url, host) {
if (isPlainHostName(host) ||
    host.indexOf('127.') == 0 ||
    host.indexOf('192.168.') == 0 ||
    host.indexOf('10.') == 0 ||
    shExpMatch(host, 'localhost.*'))
    {
        return 'DIRECT';
    }
return "PROXY %s; DIRECT";}'''

    pac_setting = self.userconf.dget('fgfwproxy', 'pac', '')
    if not pac_setting:
        pac_text = pac_template % ('%s:%s' % (ip, self.listen[1]))
    elif os.path.isfile(pac_setting):
        # The setting names a PAC file: serve its content verbatim.
        with open(pac_setting) as pac_file:
            pac_text = pac_file.read()
    else:
        # Otherwise the setting itself is used as the proxy address.
        pac_text = pac_template % pac_setting
    self.PAC = pac_text.encode()

    # Optional rotating log file (1 MiB per file, 5 backups).
    if self.userconf.dget('FGFW_Lite', 'logfile', ''):
        path = self.userconf.dget('FGFW_Lite', 'logfile', '')
        dirname = os.path.dirname(path)
        if dirname and not os.path.exists(dirname):
            os.makedirs(dirname)
        formatter = logging.Formatter('FW-Lite %(asctime)s %(levelname)s %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
        hdlr = logging.handlers.RotatingFileHandler(path, maxBytes=1048576, backupCount=5)
        hdlr.setFormatter(formatter)
        self.logger.addHandler(hdlr)

    self.region = set(x.upper() for x in self.userconf.dget('fgfwproxy', 'region', '').split('|') if x.strip())
    self.profiles = len(self.userconf.dget('fgfwproxy', 'profile', '13'))
    self.xheaders = self.userconf.dgetbool('fgfwproxy', 'xheaders', False)

    # A configured upstream replaces "direct"; plain direct access is then
    # still registered as "local" with priority 100.
    if self.userconf.dget('fgfwproxy', 'parentproxy', ''):
        self.addparentproxy('direct', '%s 0' % self.userconf.dget('fgfwproxy', 'parentproxy', ''))
        self.addparentproxy('local', 'direct 100')
    else:
        self.addparentproxy('direct', 'direct 0')

    ParentProxy.set_via(self.parentlist.direct)

    for k, v in self.userconf.items('parents'):
        # Entries containing this hard-coded token are dropped from the
        # config and the config is saved.
        if '6Rc59g0jFlTppvel' in v:
            self.userconf.remove_option('parents', k)
            self.confsave()
            continue
        self.addparentproxy(k, v)

    if not self.rproxy and len([k for k in self.parentlist.httpsparents() if k.httpspriority < 100]) == 0:
        self.logger.warning('No parent proxy available!')

    self.maxretry = self.userconf.dgetint('fgfwproxy', 'maxretry', 4)

    def addhost(host, ip):
        # Record a static host entry tagged 2 for IPv4 or 10 otherwise.
        # NOTE(review): presumably the AF_INET / AF_INET6 numbers -- confirm.
        try:
            ipo = ip_address(ip)
            if isinstance(ipo, IPv4Address):
                self.HOSTS[host].append((2, ip))
            else:
                self.HOSTS[host].append((10, ip))
        except Exception:
            # Was ``self.logging.warning``: no such attribute exists, so the
            # handler itself raised AttributeError.
            self.logger.warning('unsupported host: %s' % ip)

    for host, ip in self.userconf.items('hosts'):
        addhost(host, ip)

    # Optional hosts file with "ip host" per line; '#' lines are comments.
    if os.path.isfile('./fgfw-lite/hosts'):
        with open('./fgfw-lite/hosts') as hosts_file:
            for line in hosts_file:
                line = line.strip()
                if line and not line.startswith('#'):
                    try:
                        ip, host = line.split()
                        addhost(host, ip)
                    except Exception as e:
                        self.logger.warning('%s %s' % (e, line))

    self.localdns = parse_hostport(self.userconf.dget('dns', 'localdns', '8.8.8.8:53' if self.rproxy else '223.5.5.5:53'))
    self.remotedns = self.localdns if self.rproxy else parse_hostport(self.userconf.dget('dns', 'remotedns', '208.67.222.222:5353'))
    self.REDIRECTOR = redirector(self)
    self.PARENT_PROXY = get_proxy(self)
    self.resolver = resolver.get_resolver(self.localdns, self.remotedns, ParentProxy('self', 'http://127.0.0.1:%d' % self.listen[1]), self.PARENT_PROXY.force)
# -*- coding: utf-8 -*-
"""
Crawl through a proxy.
"""
__author__ = 'katherinelove'

import requests
from get_proxy import get_proxy

PROXY_POOL_URL = 'http://localhost:5555/random'

if __name__ == '__main__':
    # Ask the pool endpoint for one proxy address and show it.
    proxy = get_proxy(PROXY_POOL_URL)
    print(proxy)

    # Route both schemes through the same proxy address.
    http_proxy = 'http://' + proxy
    https_proxy = 'https://' + proxy
    proxies = {'http': http_proxy, 'https': https_proxy}

    try:
        response = requests.get('http://httpbin.org/get', proxies=proxies)
        # Only a 200 response is echoed; other statuses are ignored silently.
        if response.status_code == 200:
            print(response.text)
    except Exception as exc:
        print('error:', exc.args)
# Call get_proxy() with no arguments and print whatever it returns.
from get_proxy import get_proxy

print(get_proxy())
def __init__(self):
    """Load the configuration and build the proxy's runtime state.

    In order: reads basic settings from ``self.userconf``, works out the
    listen address and the local IP set, builds the PAC payload (bytes),
    optionally attaches a rotating file log handler, registers parent
    proxies, loads static host entries from the config and from
    ``./fgfw-lite/hosts``, and finally creates the redirector, the parent
    proxy selector and the DNS resolver.
    """
    self.logger = logging.getLogger('FW_Lite')
    self.version = SConfigParser()
    self.userconf = SConfigParser()
    self.reload()
    self.UPDATE_INTV = 6
    self.timeout = self.userconf.dgetint('fgfwproxy', 'timeout', 4)
    ParentProxy.DEFAULT_TIMEOUT = self.timeout
    self.parentlist = ParentProxyList()
    self.HOSTS = defaultdict(list)
    self.GUI = '-GUI' in sys.argv
    self.rproxy = self.userconf.dgetbool('fgfwproxy', 'rproxy', False)

    # "listen" is either a bare port (bound to 127.0.0.1) or "host:port".
    listen = self.userconf.dget('fgfwproxy', 'listen', '8118')
    if listen.isdigit():
        self.listen = ('127.0.0.1', int(listen))
    else:
        self.listen = (listen.rsplit(':', 1)[0],
                       int(listen.rsplit(':', 1)[1]))

    # Local addresses: resolve our own hostname first; if that fails, use the
    # source address of a UDP socket connected to 8.8.8.8, then loopback.
    try:
        self.local_ip = set(
            socket.gethostbyname_ex(socket.gethostname())[2])
    except Exception:  # was a bare except, which also caught SystemExit etc.
        try:
            csock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
            try:
                csock.connect(('8.8.8.8', 53))
                addr = csock.getsockname()[0]
            finally:
                # Close the socket even when connect() fails.
                csock.close()
            self.local_ip = set([addr])
        except socket.error:
            self.local_ip = set(['127.0.0.1'])

    # Take any one local address for the PAC text and put it straight back.
    ip = self.local_ip.pop()
    self.local_ip.add(ip)

    # One template now serves both the default PAC and the "pac = host:port"
    # case; the two literals differed only in the proxy placeholder.
    # NOTE(review): line breaks inside the PAC text were reconstructed; they
    # are not significant to the PAC script itself.
    pac_template = '''\
function FindProxyForURL(url, host) {
if (isPlainHostName(host) ||
    host.indexOf('127.') == 0 ||
    host.indexOf('192.168.') == 0 ||
    host.indexOf('10.') == 0 ||
    shExpMatch(host, 'localhost.*'))
    {
        return 'DIRECT';
    }
return "PROXY %s; DIRECT";}'''

    pac_setting = self.userconf.dget('fgfwproxy', 'pac', '')
    if not pac_setting:
        pac_text = pac_template % ('%s:%s' % (ip, self.listen[1]))
    elif os.path.isfile(pac_setting):
        # The setting names a PAC file: serve its content verbatim.
        with open(pac_setting) as pac_file:
            pac_text = pac_file.read()
    else:
        # Otherwise the setting itself is used as the proxy address.
        pac_text = pac_template % pac_setting
    self.PAC = pac_text.encode()

    # Optional rotating log file (1 MiB per file, 5 backups).
    if self.userconf.dget('FGFW_Lite', 'logfile', ''):
        path = self.userconf.dget('FGFW_Lite', 'logfile', '')
        dirname = os.path.dirname(path)
        if dirname and not os.path.exists(dirname):
            os.makedirs(dirname)
        formatter = logging.Formatter(
            'FW-Lite %(asctime)s %(levelname)s %(message)s',
            datefmt='%Y-%m-%d %H:%M:%S')
        hdlr = logging.handlers.RotatingFileHandler(path,
                                                    maxBytes=1048576,
                                                    backupCount=5)
        hdlr.setFormatter(formatter)
        self.logger.addHandler(hdlr)

    self.region = set(
        x.upper()
        for x in self.userconf.dget('fgfwproxy', 'region', '').split('|')
        if x.strip())
    self.profiles = len(self.userconf.dget('fgfwproxy', 'profile', '13'))
    self.xheaders = self.userconf.dgetbool('fgfwproxy', 'xheaders', False)

    # A configured upstream replaces "direct"; plain direct access is then
    # still registered as "local" with priority 100.
    if self.userconf.dget('fgfwproxy', 'parentproxy', ''):
        self.addparentproxy(
            'direct',
            '%s 0' % self.userconf.dget('fgfwproxy', 'parentproxy', ''))
        self.addparentproxy('local', 'direct 100')
    else:
        self.addparentproxy('direct', 'direct 0')

    ParentProxy.set_via(self.parentlist.direct)

    for k, v in self.userconf.items('parents'):
        # Entries containing this hard-coded token are dropped from the
        # config and the config is saved.
        if '6Rc59g0jFlTppvel' in v:
            self.userconf.remove_option('parents', k)
            self.confsave()
            continue
        self.addparentproxy(k, v)

    if not self.rproxy and len([
            k for k in self.parentlist.httpsparents()
            if k.httpspriority < 100
    ]) == 0:
        self.logger.warning('No parent proxy available!')

    self.maxretry = self.userconf.dgetint('fgfwproxy', 'maxretry', 4)

    def addhost(host, ip):
        # Record a static host entry tagged 2 for IPv4 or 10 otherwise.
        # NOTE(review): presumably the AF_INET / AF_INET6 numbers -- confirm.
        try:
            ipo = ip_address(ip)
            if isinstance(ipo, IPv4Address):
                self.HOSTS[host].append((2, ip))
            else:
                self.HOSTS[host].append((10, ip))
        except Exception:
            # Was ``self.logging.warning``: no such attribute exists, so the
            # handler itself raised AttributeError.
            self.logger.warning('unsupported host: %s' % ip)

    for host, ip in self.userconf.items('hosts'):
        addhost(host, ip)

    # Optional hosts file with "ip host" per line; '#' lines are comments.
    if os.path.isfile('./fgfw-lite/hosts'):
        with open('./fgfw-lite/hosts') as hosts_file:
            for line in hosts_file:
                line = line.strip()
                if line and not line.startswith('#'):
                    try:
                        ip, host = line.split()
                        addhost(host, ip)
                    except Exception as e:
                        self.logger.warning('%s %s' % (e, line))

    self.localdns = parse_hostport(
        self.userconf.dget(
            'dns', 'localdns',
            '8.8.8.8:53' if self.rproxy else '223.5.5.5:53'))
    self.remotedns = self.localdns if self.rproxy else parse_hostport(
        self.userconf.dget('dns', 'remotedns', '208.67.222.222:5353'))
    self.REDIRECTOR = redirector(self)
    self.PARENT_PROXY = get_proxy(self)
    self.resolver = resolver.get_resolver(
        self.localdns, self.remotedns,
        ParentProxy('self', 'http://127.0.0.1:%d' % self.listen[1]),
        self.PARENT_PROXY.force)
def __init__(self):
    """Load the configuration and build the proxy's runtime state.

    In order: sets up the ``config`` logger with a stream handler, reads
    basic settings from ``self.userconf``, works out the listen address and
    the local IP, builds the PAC payload (bytes), optionally attaches a
    rotating file log handler, registers parent proxies, loads static host
    entries, determines local and remote DNS servers, and finally creates
    the redirector, the proxy selector and the DNS resolver.
    """
    self.logger = logging.getLogger('config')
    self.logger.setLevel(logging.INFO)
    hdr = logging.StreamHandler()
    formatter = logging.Formatter(
        '%(asctime)s %(name)s:%(levelname)s %(message)s',
        datefmt='%H:%M:%S')
    hdr.setFormatter(formatter)
    self.logger.addHandler(hdr)

    self.version = SConfigParser()
    self.userconf = SConfigParser()
    self.reload()
    self.UPDATE_INTV = 6
    self.timeout = self.userconf.dgetint('fgfwproxy', 'timeout', 3)
    ParentProxy.DEFAULT_TIMEOUT = self.timeout
    self.parentlist = ParentProxyList()
    self.HOSTS = defaultdict(list)
    self.GUI = '-GUI' in sys.argv
    self.rproxy = self.userconf.dgetbool('fgfwproxy', 'rproxy', False)

    # "listen" is either a bare port (bound to 127.0.0.1) or "host:port".
    listen = self.userconf.dget('fgfwproxy', 'listen', '8118')
    if listen.isdigit():
        self.listen = ('127.0.0.1', int(listen))
    else:
        self.listen = (listen.rsplit(':', 1)[0],
                       int(listen.rsplit(':', 1)[1]))

    # Local address: source address of a UDP socket connected to 8.8.8.8,
    # falling back to loopback on socket errors.
    try:
        csock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        try:
            csock.connect(('8.8.8.8', 53))
            addr = csock.getsockname()[0]
        finally:
            # Close the socket even when connect() fails.
            csock.close()
        self.local_ip = addr
    except socket.error:
        self.local_ip = '127.0.0.1'

    ip = self.local_ip
    self.PAC = PAC.replace('__PROXY__',
                           'PROXY %s:%s' % (ip, self.listen[1]))
    pac_setting = self.userconf.dget('fgfwproxy', 'pac', '')
    if pac_setting and os.path.isfile(pac_setting):
        # A configured PAC file replaces the generated text verbatim.
        with open(pac_setting) as pac_file:
            self.PAC = pac_file.read()
    self.PAC = self.PAC.encode()

    # Optional rotating log file (1 MiB per file, 5 backups).
    if self.userconf.dget('FGFW_Lite', 'logfile', ''):
        path = self.userconf.dget('FGFW_Lite', 'logfile', '')
        dirname = os.path.dirname(path)
        if dirname and not os.path.exists(dirname):
            os.makedirs(dirname)
        formatter = logging.Formatter(
            'FW-Lite %(asctime)s %(levelname)s %(message)s',
            datefmt='%Y-%m-%d %H:%M:%S')
        hdlr = logging.handlers.RotatingFileHandler(path,
                                                    maxBytes=1048576,
                                                    backupCount=5)
        hdlr.setFormatter(formatter)
        self.logger.addHandler(hdlr)

    self.region = set(
        x.upper()
        for x in self.userconf.dget('fgfwproxy', 'region', '').split('|')
        if x.strip())
    self.profile_num = len(self.userconf.dget('fgfwproxy', 'profile', '13'))
    self.xheaders = self.userconf.dgetbool('fgfwproxy', 'xheaders', False)

    # A configured upstream replaces "direct"; plain direct access is then
    # still registered as "local" with priority 100.
    if self.userconf.dget('fgfwproxy', 'parentproxy', ''):
        self.addparentproxy(
            'direct',
            '%s 0' % self.userconf.dget('fgfwproxy', 'parentproxy', ''))
        self.addparentproxy('local', 'direct 100')
    else:
        self.addparentproxy('direct', 'direct 0')

    ParentProxy.set_via(self.parentlist.direct)

    for k, v in self.userconf.items('parents'):
        # Entries containing this hard-coded token are dropped from the
        # config and the config is saved.
        if '6Rc59g0jFlTppvel' in v:
            self.userconf.remove_option('parents', k)
            self.confsave()
            continue
        self.addparentproxy(k, v)

    if not self.rproxy and len([
            k for k in self.parentlist.httpsparents()
            if k.httpspriority < 100
    ]) == 0:
        self.logger.warning('No parent proxy available!')

    self.maxretry = self.userconf.dgetint('fgfwproxy', 'maxretry', 4)

    def addhost(host, ip):
        # Record a static host entry tagged 2 for IPv4 or 10 otherwise.
        # NOTE(review): presumably the AF_INET / AF_INET6 numbers -- confirm.
        try:
            ipo = ip_address(ip)
            if isinstance(ipo, IPv4Address):
                self.HOSTS[host].append((2, ip))
            else:
                self.HOSTS[host].append((10, ip))
        except Exception:
            self.logger.warning('unsupported host: %s' % ip)
            sys.stderr.write(traceback.format_exc() + '\n')
            sys.stderr.flush()

    for host, ip in self.userconf.items('hosts'):
        addhost(host, ip)

    # Optional hosts file with "ip host" per line; '#' lines are comments.
    if os.path.isfile('./fgfw-lite/hosts'):
        with open('./fgfw-lite/hosts') as hosts_file:
            for line in hosts_file:
                line = line.strip()
                if line and not line.startswith('#'):
                    try:
                        ip, host = line.split()
                        addhost(host, ip)
                    except Exception as e:
                        self.logger.warning('%s %s' % (e, line))

    fallback_dns = '119.29.29.29'
    localdns = self.userconf.dget('dns', 'localdns', '')
    # No explicit setting: take the local DNS server(s) from the system.
    if not localdns:
        if sys.platform.startswith('win'):
            import subprocess
            # Second token of the second line of the nslookup output.
            localdns = subprocess.check_output(
                ['nslookup', '127.0.0.1']).splitlines()[1].split()[1].decode()
        elif sys.platform.startswith('linux'):
            # Was ``== 'linux2'``, which only matches Python 2; on Python 3.3+
            # the value is 'linux', so resolv.conf was never consulted.
            lst = []
            with open('/etc/resolv.conf') as f:
                for line in f:
                    if line.startswith('nameserver'):
                        lst.append(line.split()[1])
            localdns = '|'.join(lst)
        else:
            localdns = fallback_dns
        if not localdns:
            # Detection found no server (e.g. resolv.conf without any
            # nameserver line); an empty string must not reach parse_hostport.
            localdns = fallback_dns
    self.logger.info('localdns: ' + localdns)
    self.localdns = [
        parse_hostport(dns, 53) for dns in localdns.split('|')
    ]

    remotedns = localdns if self.rproxy else self.userconf.dget(
        'dns', 'remotedns', '8.8.8.8')
    self.logger.info('remotedns: ' + remotedns)
    self.remotedns = [
        parse_hostport(dns, 53) for dns in remotedns.split('|')
    ]

    self.REDIRECTOR = redirector(self)
    self.GET_PROXY = get_proxy(self)
    bad_ip = set(self.userconf.dget('dns', 'bad_ip', '').split('|'))
    self.resolver = resolver.get_resolver(
        self.localdns,
        self.remotedns,
        proxy=ParentProxy('self', 'http://127.0.0.1:%d' % self.listen[1]),
        apfilter=[self.GET_PROXY.gfwlist, self.GET_PROXY.local],
        bad_ip=bad_ip)
for i in range(pg): pg = str(i + 1) params = { 'kw': loc, } extend_url = parse.urlencode(params) full_url = base_url + pg + '/?' + extend_url proxies = random.choice(proxy_list) response = requests.get(full_url, headers=headers, proxies={'http': proxies}) if response.status_code == 200: url_list = get_url(response.text) print('page', i, 'url list get') get_contents(full_url, *url_list) else: print('page', i, 'failed') time.sleep(5) def main(): page = 1 loc = '国际创新园' get_html(page, loc) write_to_json(an_house_info) if __name__ == '__main__': proxy_list = get_proxy() an_house_info = [] main()
from get_proxy import get_proxy import random, json ans = get_proxy() print(f'[INFO] found {len(ans)} proxies!') with open('OmegaOptions.bak', 'rb') as f: s = f.read() s = json.loads(s) basketball_teams = [ "Hawks", "Bobcats", "Raptors", "Knicks", "Bucks", "Nets", "Pacers", "76ers", "Bulls", "Magic", "Wizards", "Cavaliers", "Celtics", "Pistons", "Heat", "Hornets", "Warriors", "Jazz", "Blazers", "Clippers", "Nuggets", "Timberwolves", "Lakers", "Grizzlies", "Rockets", "Kings", "Mavericks", "Supersonics", "Spurs", "Suns" ] for idx, x in enumerate(ans): if idx == len(basketball_teams): break ip, port = x.split(':') port = int(port) color = hex(random.randint(0, 0xfff))[2:].zfill(3) name = basketball_teams[idx] data = { "bypassList": [{ "conditionType": "BypassCondition", "pattern": "127.0.0.1" }, { "conditionType": "BypassCondition", "pattern": "[::1]" }, { "conditionType": "BypassCondition",