class TestProxy(object):
    """Checks whether a single proxy server can fetch ``self.URL``."""

    def __init__(self):
        self.URL = r'http://www.cctv.com/'  # page used to verify a proxy works
        self.timeout = 3  # seconds per connection attempt
        self.myLog = LogUtils()
        # self.run()

    # protocol: http   server: http://123.23.45.4:8080
    def link_with_server_port(self, protocol, server) -> bool:
        """Return True if ``server`` can proxy a request to self.URL within the timeout."""
        # self.myLog.info('link_with_server_port currentThreadName=%s' % threading.currentThread().getName())
        # FIX: use the opener directly instead of install_opener() — installing
        # a global opener mutates process-wide state and is not thread-safe.
        opener = urllib.request.build_opener(
            urllib.request.ProxyHandler({protocol: server}))
        try:
            response = opener.open(self.URL, timeout=self.timeout)
        except Exception as e:
            self.myLog.warn('使用代理 %s connect failed, exception=%s' % (server, e))
            return False
        self.myLog.info('成功得到响应数据,响应码%s' % response.code)
        try:
            # response.read() returns bytes; decoding proves the body is readable text.
            response.read().decode()
        except Exception as e2:
            self.myLog.warn('%s connect response.read() failed, exception=%s' % (server, e2))
            return False
        if str(response.code) == '200':
            self.myLog.info('%s 请求成功' % self.URL)
            return True
        self.myLog.info('%s 请求失败' % self.URL)
        return False
class RandomProxy(object):
    """Downloader middleware that attaches a randomly chosen proxy to each request."""

    def __init__(self):
        self.proxys = Resource.get_proxy()
        self.myLog = LogUtils()

    def process_request(self, request, spider):
        # FIX: fetch the pool once per call (the original queried it twice).
        proxies = Resource.get_proxy()
        if proxies:
            # Lines come straight from alive.txt; strip() removes the trailing
            # newline and any surrounding whitespace (the original's extra
            # replace('\n', '') was redundant).
            proxy = random.choice(proxies).strip()
            self.myLog.debug('random choice proxy = %s' % proxy)
            request.meta['proxy'] = proxy  # http proxy
class InterceptorProxy(object):
    """Logging-only downloader middleware: traces each request and its response."""

    def __init__(self):
        self.myLog = LogUtils()

    def process_request(self, request, spider):
        # Dump the full outgoing request so proxy problems can be traced.
        self.myLog.debug(
            'InterceptorProxy request url=%s headers=%s cookies=%s meta=%s' %
            (request.url, request.headers, request.cookies, request.meta))

    def process_response(self, request, response, spider):
        self.myLog.debug('InterceptorProxy %s 的 response = %s' % (request.url, response))
        # FIX: Scrapy requires process_response to return a Response (or
        # Request); the original implicitly returned None, which Scrapy rejects.
        return response
def parse(self, response):
    """Parse a proxy360 listing page into GetproxyItem objects."""
    log = LogUtils()
    log.info('proxy360Spider')
    rows = response.xpath(
        '//div[@class="proxylistitem" and @name="list_proxy_ip"]')
    items = []
    for row in rows:
        item = GetproxyItem()
        # Spans 1-4 hold ip / port / anonymity type / location, in order.
        for column, field in enumerate(('ip', 'port', 'type', 'location'), start=1):
            item[field] = row.xpath('.//span[%d]/text()' % column).extract()[0]
        item['protocol'] = 'HTTP'
        item['source'] = 'proxy360'
        items.append(item)
    return items
def __init__(self):
    """Set up file paths, the check URL and the test parameters."""
    # r-prefixed strings need no escaping of special characters.
    print(
        get_root_path()
    )  # e.g. '/Users/imac/MyDir/Project/PyProject/SpiderProject/xiciSpider/xiciSpider'
    self.totalProxyFile = os.path.join(get_root_path(),
                                       'build/proxy.txt')  # every scraped proxy
    self.aliveProxyFile = os.path.join(get_root_path(),
                                       'build/alive.txt')  # proxies that passed the check
    # self.URL = r'http://www.xicidaili.com/nn/1'  # testing xici proxies against xici itself fails: connect refused
    # self.URL = r'http://www.baidu.com/'
    self.URL = r'http://www.cctv.com/'
    self.threads = 5  # 10
    self.timeout = 3  # seconds per connection attempt
    self.aliveList = []  # proxies confirmed working, filled by worker threads
    self.myLog = LogUtils()
class ToTxtPipeline(object):
    """Appends each scraped proxy item as one tab-separated line to build/proxy.txt."""

    myLog = LogUtils()

    def process_item(self, item, spider):
        fileName = os.path.join(get_root_path(), 'build/proxy.txt')
        # self.myLog.info('开始写入到%s,item=%s' % (fileName, item))
        # FIX: explicit utf-8 — 'location' contains Chinese text and the
        # platform default encoding (e.g. on Windows) may fail to encode it.
        with open(fileName, 'a', encoding='utf-8') as fp:
            fp.write(item['ip'] + '\t')
            fp.write(item['port'] + '\t')
            fp.write(item['protocol'] + '\t')
            fp.write(item['type'] + '\t\t')  # Chinese columns get two tabs to keep alignment
            fp.write(item['location'] + '\t\t')
            fp.write(item['source'] + '\n')
        return item
class Resource:
    """Shared static resources: user-agent pool and a cached proxy list."""

    log = LogUtils()

    # Pool of browser user-agent strings for request rotation.
    userAgents = [
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
        "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
        "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
        "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
        "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
        "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
        "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
    ]

    # Process-wide cache of the proxy lines loaded from build/alive.txt.
    __PROXIES = []

    @staticmethod
    def get_proxy():
        """Return the proxy-line list, loading build/alive.txt on first use."""
        if Resource.__PROXIES:
            # Resource.log.info('get_proxy 返回长度为:%s' % len(Resource.__PROXIES))
            return Resource.__PROXIES
        aliveProxyFile = os.path.join(get_root_path(), 'build/alive.txt')  # proxies that passed the check
        if not os.path.exists(aliveProxyFile):
            # Nothing on disk yet — hand back the (empty) cache.
            return Resource.__PROXIES
        with open(aliveProxyFile, 'r') as fp:
            lines = fp.readlines()
        Resource.__PROXIES = lines
        # Resource.log.info('get_proxy 返回长度为:%s' % len(Resource.__PROXIES))
        return lines
class ToCsvPipeline(object):
    """Appends each scraped proxy item as a row in build/proxy.csv."""

    myLog = LogUtils()

    # Column order for the csv header and each row.
    COLUMNS = ['ip', 'port', 'protocol', 'type', 'location', 'source']

    def process_item(self, item, spider):
        csvFileName = os.path.join(get_root_path(), 'build/proxy.csv')
        # self.myLog.info('开始写入到%s,item=%s' % (csvFileName, item))
        # BUG FIX: the original opened with 'w+', truncating the file for every
        # item so the csv only ever contained the last item.  Append instead,
        # and write the header only when the file is new or empty.
        writeHeader = (not os.path.exists(csvFileName)
                       or os.path.getsize(csvFileName) == 0)
        # newline='' is required by the csv module (avoids blank rows on Windows);
        # utf-8 because 'location' holds Chinese text.
        with open(csvFileName, 'a', newline='', encoding='utf-8') as file:
            csvfile = csv.DictWriter(file, self.COLUMNS)
            if writeHeader:
                csvfile.writeheader()
            csvfile.writerow({col: item[col] for col in self.COLUMNS})
        return item
class TestProxy(object):
    """Reads build/proxy.txt, checks every proxy on worker threads, and writes
    the live ones to build/alive.txt."""

    def __init__(self):
        # r-prefixed strings need no escaping of special characters.
        print(
            get_root_path()
        )  # e.g. '/Users/imac/MyDir/Project/PyProject/SpiderProject/xiciSpider/xiciSpider'
        self.totalProxyFile = os.path.join(get_root_path(), 'build/proxy.txt')  # every scraped proxy
        self.aliveProxyFile = os.path.join(get_root_path(), 'build/alive.txt')  # proxies that passed the check
        # self.URL = r'http://www.xicidaili.com/nn/1'  # testing xici proxies against xici itself fails: connect refused
        # self.URL = r'http://www.baidu.com/'
        self.URL = r'http://www.cctv.com/'
        self.threads = 5  # 10
        self.timeout = 3  # seconds per connection attempt
        self.aliveList = []  # proxies confirmed working, filled by worker threads
        self.myLog = LogUtils()
        # self.run()

    def run(self):
        """Split proxy.txt across self.threads workers, join them, then dump
        self.aliveList to alive.txt."""
        with open(self.totalProxyFile, 'r') as fp:
            lines = fp.readlines()
        if len(lines) == 0:
            self.myLog.error('读取 %s 内容是空的,退出程序' % self.totalProxyFile)
            return  # BUG FIX: the original logged "exit" but fell through and kept running
        # Ceiling division so every line is covered by exactly one thread.
        perThreadDataSize = -(-len(lines) // self.threads)
        self.myLog.info(
            'len(lines) = %s, self.threads = %s, perThreadDataSize=%s' %
            (len(lines), self.threads, perThreadDataSize))
        threadList = []
        for index in range(self.threads):
            startIndex = index * perThreadDataSize
            stopIndex = min((index + 1) * perThreadDataSize, len(lines))
            subLine = lines[startIndex:stopIndex]
            tempThread = threading.Thread(target=self.link_with_proxy,
                                          args=(subLine, ))
            self.myLog.debug('开启线程: threadName=%s' % tempThread.name)
            tempThread.start()
            threadList.append(tempThread)
        # Join after starting them all, so the workers actually run in parallel.
        for tt in threadList:
            tt.join()
        self.myLog.info('-- thread join 结束,准备写入 alive.txt --')
        if len(self.aliveList) != 0:
            self.myLog.info(
                'self.aliveList 不为空,开始写入 alive.txt, len(aliveList)=%s' %
                len(self.aliveList))
            with open(self.aliveProxyFile, 'w') as fp:
                for alive in self.aliveList:
                    fp.write(alive)
                    fp.write('\n')
        else:
            self.myLog.error('self.aliveList 是空的,无法写入到 alive.txt')

    # protocol: http   server: http://123.23.45.4:8080
    def link_with_server_port(self, protocol, server) -> bool:
        """Return True if ``server`` can proxy a request to self.URL within the timeout."""
        # FIX: use the opener directly instead of install_opener() — installing
        # a global opener mutates process-wide state, which races with the
        # worker threads started in run().
        opener = urllib.request.build_opener(
            urllib.request.ProxyHandler({protocol: server}))
        try:
            response = opener.open(self.URL, timeout=self.timeout)
        except Exception as e:
            self.myLog.warn('使用代理 %s connect failed, exception=%s' % (server, e))
            return False
        self.myLog.info('成功得到响应数据,响应码%s' % response.code)
        try:
            # response.read() returns bytes; decoding proves the body is readable text.
            response.read().decode()
        except Exception as e2:
            self.myLog.warn(
                '%s connect response.read() failed, exception=%s' % (server, e2))
            return False
        if str(response.code) == '200':
            self.myLog.info('%s 请求成功' % self.URL)
            return True
        self.myLog.info('%s 请求失败' % self.URL)
        return False

    def link_with_proxy(self, lineList):
        """Worker: test every proxy line in ``lineList`` and record the live ones."""
        for line in lineList:
            if line.find('HTTP') == -1:
                # BUG FIX: the original returned here, silently dropping the
                # rest of this thread's lines; skip only the malformed line.
                self.myLog.warn('发现了畸形数据: %s, threadName=%s' %
                                (line, threading.current_thread().getName()))
                continue
            # Renamed from 'lineList' — the original shadowed the parameter.
            parts = line.split('\t')
            protocol = parts[2].lower()
            ip_port = parts[0] + ':' + parts[1]
            server = protocol + r'://' + ip_port  # e.g. http://175.42.158.71:9999
            if self.link_with_server_port(protocol, server):
                self.myLog.info('aliveList 开始 append 数据: %s' % server)
                # list.append() is thread-safe in CPython.
                self.aliveList.append(server)
def __init__(self):
    """Initialise the check target, the timeout and the logger."""
    self.URL = r'http://www.cctv.com/'  # page used to verify a proxy works
    self.timeout = 3  # seconds per connection attempt
    self.myLog = LogUtils()
def __init__(self):
    """Cache the current proxy pool and create this middleware's logger."""
    self.proxys = Resource.get_proxy()
    self.myLog = LogUtils()
class ProxyXiciSpider(scrapy.Spider):
    """Scrapes proxy listings from xicidaili.com."""

    name = "proxy_xici_spider"
    allowed_domains = ["xicidaili.com"]
    # nn: domestic elite  nt: domestic plain  wn: domestic HTTPS  wt: domestic HTTP
    wds = ['nn', 'nt', 'wn', 'wt']
    pages = 20
    sleepTime = 10  # seconds between requests, to avoid an IP ban
    start_urls = []
    myLog = LogUtils()
    # Renamed loop variable from 'type' — it shadowed the builtin.
    for wd in wds:
        for i in range(1, pages + 1):
            start_urls.append('http://www.xicidaili.com/' + wd + '/' + str(i))

    def start_requests(self):
        # NOTE(review): this replaces the full list built above with a single
        # page — presumably for testing; confirm before relying on coverage.
        self.start_urls = ['http://www.xicidaili.com/nn/1']
        for url in self.start_urls:
            self.myLog.info('开始请求: request url = %s' % url)
            sleep(self.sleepTime)  # throttle; DOWNLOAD_DELAY may make this redundant
            yield scrapy.Request(url,
                                 callback=self.parse,
                                 errback=self.errback_httpbin,
                                 dont_filter=True)

    def errback_httpbin(self, failure):
        """Log every failed request along with its failure type."""
        self.myLog.info('响应失败, {}'.format(repr(failure)))
        self.myLog.info(repr(failure))
        # FIX: the original passed the url as an extra positional argument
        # (logging-style lazy args); format explicitly to match how LogUtils
        # is called everywhere else in this file.
        if failure.check(HttpError):
            response = failure.value.response
            self.myLog.info('HttpError错误 on %s' % response.url)
        elif failure.check(DNSLookupError):
            # this is the original request
            request = failure.request
            self.myLog.info('DNSLookupError错误 on %s' % request.url)
        elif failure.check(TimeoutError, TCPTimedOutError):
            request = failure.request
            self.myLog.info('TimeoutError错误 on %s' % request.url)

    def parse(self, response):
        self.myLog.info('响应成功, url={}'.format(response.url))
        for ele in self.process_data(response):
            yield ele

    def process_data(self, response):
        """Extract one GetproxyItem per table row of the listing page."""
        # self.myLog.debug('ProxyXiciSpider parse response=%s' % (response.text))
        subSelector = response.xpath('//tr[@class=""]|//tr[@class="odd"]')
        items = []
        for sub in subSelector:
            item = GetproxyItem()
            item['ip'] = sub.xpath('.//td[2]/text()').extract()[0]
            item['port'] = sub.xpath('.//td[3]/text()').extract()[0]
            item['type'] = sub.xpath('.//td[5]/text()').extract()[0]
            if sub.xpath('.//td[4]/a/text()'):
                # BUG FIX: the original used '//td[4]/a/text()' without the
                # leading dot, which searches the whole document and stamps
                # every item with the first row's location.
                item['location'] = sub.xpath('.//td[4]/a/text()').extract()[0]
            else:
                item['location'] = sub.xpath('.//td[4]/text()').extract()[0]
            item['protocol'] = sub.xpath('.//td[6]/text()').extract()[0]
            item['source'] = 'xicidaili'
            items.append(item)
        return items
def __init__(self):
    """Create this component's logger."""
    self.myLog = LogUtils()