Code example #1
import os

# LogUtils and get_root_path are this project's own helpers; the import
# path below is an assumption, not shown in the original listing
from utils import LogUtils, get_root_path


class ToTxtPipeline(object):
    myLog = LogUtils()

    def process_item(self, item, spider):
        fileName = os.path.join(get_root_path(), 'build/proxy.txt')
        # self.myLog.info('Writing to %s, item=%s' % (fileName, item))
        with open(fileName, 'a') as fp:
            fp.write(item['ip'] + '\t')
            fp.write(item['port'] + '\t')
            fp.write(item['protocol'] + '\t')
            fp.write(item['type'] + '\t\t')  # two \t after Chinese text so the columns stay aligned
            fp.write(item['location'] + '\t\t')
            fp.write(item['source'] + '\n')
        return item
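
For these pipelines to run, Scrapy must be told about them in settings.py. A minimal sketch, assuming the module path xiciSpider.pipelines (inferred from the project path printed in example #3, not confirmed by the listing); ToCsvPipeline is shown later in example #5:

# settings.py -- register the item pipelines; module path assumed
ITEM_PIPELINES = {
    'xiciSpider.pipelines.ToTxtPipeline': 300,  # lower numbers run first
    'xiciSpider.pipelines.ToCsvPipeline': 400,
}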
Code example #2
def parse(self, response):
    tempMyLog = LogUtils()
    tempMyLog.info('proxy360Spider')
    subSelector = response.xpath(
        '//div[@class="proxylistitem" and @name="list_proxy_ip"]')
    items = []
    for sub in subSelector:
        item = GetproxyItem()
        item['ip'] = sub.xpath('.//span[1]/text()').extract()[0]
        item['port'] = sub.xpath('.//span[2]/text()').extract()[0]
        item['type'] = sub.xpath('.//span[3]/text()').extract()[0]
        item['location'] = sub.xpath('.//span[4]/text()').extract()[0]
        item['protocol'] = 'HTTP'
        item['source'] = 'proxy360'
        items.append(item)
    return items
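
The GetproxyItem the spider fills is not defined anywhere in this listing; judging from the fields used across the examples, it is presumably a plain scrapy.Item along these lines (a sketch, field set inferred from usage):

import scrapy


class GetproxyItem(scrapy.Item):
    # fields inferred from the spiders and pipelines in this listing
    ip = scrapy.Field()
    port = scrapy.Field()
    protocol = scrapy.Field()
    type = scrapy.Field()
    location = scrapy.Field()
    source = scrapy.Field()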
Code example #3
def __init__(self):
    # An r prefix makes a raw string, so special characters inside it
    # need no escaping
    print(
        get_root_path()
    )  # '/Users/imac/MyDir/Project/PyProject/SpiderProject/xiciSpider/xiciSpider'
    self.totalProxyFile = os.path.join(get_root_path(),
                                       'build/proxy.txt')  # all scraped proxies
    self.aliveProxyFile = os.path.join(get_root_path(),
                                       'build/alive.txt')  # verified live proxies
    # self.URL = r'http://www.xicidaili.com/nn/1'  # testing xici's own
    # proxies against xicidaili fails: connection refused
    # self.URL = r'http://www.baidu.com/'
    self.URL = r'http://www.cctv.com/'
    self.threads = 5  # 10
    self.timeout = 3
    self.aliveList = []
    self.myLog = LogUtils()
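
get_root_path() is another project helper whose implementation is not shown. Given the sample path in the comment above, it resolves the project root; a minimal sketch of what it might do, not the author's code:

import os


def get_root_path():
    # hypothetical: treat the directory containing this file as the root
    return os.path.dirname(os.path.abspath(__file__))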
Code example #4
import os

# project helpers; import path assumed as in example #1
from utils import LogUtils, get_root_path


class Resource:
    log = LogUtils()

    userAgents = [
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
        "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
        "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
        "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
        "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
        "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
        "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
    ]

    __PROXIES = []

    @staticmethod
    def get_proxy():
        if len(Resource.__PROXIES) != 0:
            # Resource.log.info('get_proxy returning %s entries' % len(Resource.__PROXIES))
            return Resource.__PROXIES
        else:
            aliveProxyFile = os.path.join(get_root_path(),
                                          'build/alive.txt')  # verified live proxies
            if not os.path.exists(aliveProxyFile):
                return Resource.__PROXIES
            with open(aliveProxyFile, 'r') as fp:
                # strip trailing newlines so callers get clean proxy strings
                lines = [line.strip() for line in fp if line.strip()]
                Resource.__PROXIES = lines
                # Resource.log.info('get_proxy returning %s entries' % len(Resource.__PROXIES))
                return lines
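
A natural consumer of Resource is a downloader middleware that rotates the user agent and proxy per request. The sketch below is not part of the original listing; it assumes the lines in alive.txt hold host:port strings.

import random


class RandomProxyMiddleware(object):
    # hypothetical middleware, not from the original project
    def process_request(self, request, spider):
        request.headers['User-Agent'] = random.choice(Resource.userAgents)
        proxies = Resource.get_proxy()
        if proxies:
            # Scrapy expects a full URL in request.meta['proxy']
            request.meta['proxy'] = 'http://' + random.choice(proxies)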
Code example #5
import csv
import os

# project helpers; import path assumed as in example #1
from utils import LogUtils, get_root_path


class ToCsvPipeline(object):
    myLog = LogUtils()

    def process_item(self, item, spider):
        csvFileName = os.path.join(get_root_path(), 'build/proxy.csv')
        # self.myLog.info('Writing to %s, item=%s' % (csvFileName, item))

        # 'w+' truncated the file for every item, so only the last row
        # survived; append instead and write the header just once
        writeHeader = (not os.path.exists(csvFileName)
                       or os.path.getsize(csvFileName) == 0)
        with open(csvFileName, 'a') as file:
            columns = ['ip', 'port', 'protocol', 'type', 'location', 'source']
            csvfile = csv.DictWriter(file, columns)
            if writeHeader:
                # write the CSV column names
                csvfile.writeheader()
            # write one data row
            csvfile.writerow({
                'ip': item['ip'],
                'port': item['port'],
                'protocol': item['protocol'],
                'type': item['type'],
                'location': item['location'],
                'source': item['source']
            })
        return item
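
Reading the file back is symmetric; a usage sketch with the standard csv module:

import csv

with open('build/proxy.csv') as fp:
    for row in csv.DictReader(fp):
        print(row['ip'], row['port'])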
Code example #6
def __init__(self):
    self.URL = r'http://www.cctv.com/'
    self.timeout = 3
    self.myLog = LogUtils()
Code example #7
def __init__(self):
    self.proxys = Resource.get_proxy()
    self.myLog = LogUtils()
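
This fragment only loads the proxy list; actually probing one entry typically looks like the sketch below. It uses the third-party requests library and is not the author's checker, which the listing does not show.

import requests


def check_proxy(proxy, url='http://www.cctv.com/', timeout=3):
    # hypothetical helper: True if the proxy answers within the timeout
    try:
        resp = requests.get(url,
                            proxies={'http': 'http://' + proxy},
                            timeout=timeout)
        return resp.status_code == 200
    except requests.RequestException:
        return False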
Code example #8
import scrapy
from time import sleep
from scrapy.spidermiddlewares.httperror import HttpError
from twisted.internet.error import DNSLookupError, TCPTimedOutError, TimeoutError

# GetproxyItem and LogUtils are project modules; import paths assumed
from items import GetproxyItem
from utils import LogUtils


class ProxyXiciSpider(scrapy.Spider):

    name = "proxy_xici_spider"
    allowed_domains = ["xicidaili.com"]
    # nn: domestic high-anonymity proxies    nt: domestic regular proxies    wn: domestic HTTPS proxies    wt: domestic HTTP proxies
    wds = ['nn', 'nt', 'wn', 'wt']
    pages = 20
    sleepTime = 10
    start_urls = []
    myLog = LogUtils()

    # 'wd' instead of 'type' to avoid shadowing the builtin
    for wd in wds:
        for i in range(1, pages + 1):
            start_urls.append('http://www.xicidaili.com/' + wd + '/' + str(i))

    def start_requests(self):
        # NOTE: this overrides the full start_urls list built above,
        # apparently left in to test a single page
        self.start_urls = ['http://www.xicidaili.com/nn/1']
        for url in self.start_urls:
            self.myLog.info('requesting url = %s' % url)
            sleep(self.sleepTime)  # sleep to avoid an IP ban; with DOWNLOAD_DELAY set this is probably redundant
            yield scrapy.Request(url,
                                 callback=self.parse,
                                 errback=self.errback_httpbin,
                                 dont_filter=True)

    def errback_httpbin(self, failure):
        # log all failures
        self.myLog.info('request failed: %s' % repr(failure))

        if failure.check(HttpError):
            response = failure.value.response
            self.myLog.info('HttpError on %s' % response.url)

        elif failure.check(DNSLookupError):
            # this is the original request
            request = failure.request
            self.myLog.info('DNSLookupError on %s' % request.url)

        elif failure.check(TimeoutError, TCPTimedOutError):
            request = failure.request
            self.myLog.info('TimeoutError on %s' % request.url)

    def parse(self, response):
        self.myLog.info('response ok, url={}'.format(response.url))
        for ele in self.process_data(response):
            yield ele

    def process_data(self, response):
        # self.myLog.debug('ProxyXiciSpider parse response=%s' % (response.text))

        subSelector = response.xpath('//tr[@class=""]|//tr[@class="odd"]')
        items = []
        for sub in subSelector:
            item = GetproxyItem()

            item['ip'] = sub.xpath('.//td[2]/text()').extract()[0]
            item['port'] = sub.xpath('.//td[3]/text()').extract()[0]
            item['type'] = sub.xpath('.//td[5]/text()').extract()[0]
            if sub.xpath('.//td[4]/a/text()'):
                # note the leading dot: without it the XPath searched the
                # whole document instead of the current row
                item['location'] = sub.xpath('.//td[4]/a/text()').extract()[0]
            else:
                item['location'] = sub.xpath('.//td[4]/text()').extract()[0]
            item['protocol'] = sub.xpath('.//td[6]/text()').extract()[0]
            item['source'] = 'xicidaili'
            items.append(item)
        return items
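
The manual sleep in start_requests hints at throttling; the same effect is cleaner via project settings. The values below are illustrative, not taken from the original project:

# settings.py -- throttle requests instead of sleeping in the spider
DOWNLOAD_DELAY = 10
RANDOMIZE_DOWNLOAD_DELAY = True
RETRY_ENABLED = True
RETRY_TIMES = 2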
Code example #9
def __init__(self):
    self.myLog = LogUtils()