Example #1
0
from RedisQueue import RedisQueue

# Module-level queue handle used by the download loop in this script; both
# arguments are passed straight through to RedisQueue.
# NOTE(review): presumably a database/queue identifier and a key namespace —
# confirm against the RedisQueue constructor.
redis = RedisQueue('0', 'jandan')


def user_agent(url):
    """Open ``url`` through the local HTTP proxy with a Firefox User-Agent.

    The proxy-aware opener is installed via ``urllib2.install_opener`` on
    every call, so it also becomes the opener used by later plain
    ``urllib2.urlopen`` calls.

    :param url: address to fetch.
    :return: the response object returned by ``urllib2.urlopen`` (not yet
        read); the caller is expected to ``read()`` it.
    """
    # Route plain-HTTP traffic through the proxy listening on 127.0.0.1:8080.
    urllib2.install_opener(
        urllib2.build_opener(
            urllib2.ProxyHandler({'http': '127.0.0.1:8080'})))

    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0'
    }
    request = urllib2.Request(url, None, headers)
    # No request body (None); the third positional argument is the timeout
    # in seconds.
    return urllib2.urlopen(request, None, 20)


# Download worker: for every URL currently in the queue, fetch it via
# user_agent() and store the payload in the working directory under the last
# 11 characters of the URL. The queue entry is popped only after the file has
# been written, so a failed download leaves the entry in place.
# NOTE(review): when the queue is empty the outer loop spins without sleeping —
# confirm whether a short pause is wanted here.
while True:
    while not redis.empty():
        down_url = redis.get()
        print(down_url)
        try:
            page = user_agent(down_url)
            try:
                data = page.read()
            finally:
                # Release the connection even if reading fails part-way.
                page.close()
            with open('./' + down_url[-11:], 'wb') as code:
                code.write(data)
            redis.pop()
        except Exception as exc:
            # Stay best-effort (one bad URL must not stop the worker), but
            # report the failure and let KeyboardInterrupt/SystemExit
            # propagate so the script can still be stopped.
            print('download failed for %s: %r' % (down_url, exc))
class Spider(object):
    """Crawl Sogou WeChat search results for a keyword and store the articles.

    Pending requests live in a ``RedisQueue``. ``schedule`` pops them one at a
    time, performs the HTTP call and feeds the response to the callback stored
    on the request. Callbacks yield either new ``WeiXinRequest`` objects
    (queued again) or ``dict`` records (written through ``MySQL.insert``).

    NOTE(review): the queue's enqueue method is spelled ``pusp`` throughout;
    confirm this matches the RedisQueue API and is not a typo for ``push``.
    """

    def __init__(self, url='http://weixin.sogou.com/weixin', key_word=None):
        """Store the search endpoint and keyword and create the collaborators.

        :param url: base search URL; query strings and next-page links are
            appended to it.
        :param key_word: search term sent as the ``query`` parameter.
        """
        self.url = url
        self.key_word = key_word
        self.headers = {
            # Media ranges are comma-separated ("xhtml+xml,application/xml").
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            # Each language range carries its weight as ";q=<value>".
            'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,ja;q=0.4,zh-TW;q=0.2,mt;q=0.2',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'weixin.sogou.com',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
        }
        self.session = Session()
        self.queue = RedisQueue()
        self.mysql = MySQL()

    def start(self):
        """Install the default headers on the session and queue the first search request."""
        self.session.headers.update(self.headers)
        start_url = self.url + '?' + urlencode({'query': self.key_word, 'type': 2})
        weixin_request = WeiXinRequest(url=start_url,
                callback=self.parse_index, need_proxy=True)
        self.queue.pusp(weixin_request)

    def parse_index(self, response):
        """Parse a search-result page.

        Yields one ``WeiXinRequest`` per article link (handled by
        ``parse_detail``) and, if a next-page link exists, one more request
        for that page (handled by ``parse_index`` again).

        :param response: response whose ``text`` holds the result page HTML.
        """
        doc = pq(response.text)
        for item in doc('.news-box .news-list li .txt-box h3 a').items():
            yield WeiXinRequest(url=item.attr('href'),
                    callback=self.parse_detail, headers=self.headers)
        # Link to the next result page; its href is appended to the base URL.
        next_href = doc('#sogou_next').attr('href')
        if next_href:
            yield WeiXinRequest(url=self.url + str(next_href),
                    callback=self.parse_index, headers=self.headers)

    def parse_detail(self, response):
        """Parse an article page and yield a single record ``dict``.

        :param response: response whose ``text`` holds the article HTML.
        """
        # The document must be built from the response before querying it.
        doc = pq(response.text)
        data = {
            'title': doc('.rich_media_title').text(),
            # Class selector (leading '.') for the article body element.
            'content': doc('.rich_media_content').text(),
            # NOTE(review): key is 'data' although the value comes from
            # '#post-date' — confirm against the MySQL schema before renaming.
            'data': doc('#post-date').text(),
            'nickname': doc('#js_profile_qrcode > div > strong').text(),
            'wechat': doc('#js_profile_qrcode > div > p:nth-child(3) > span').text(),
        }
        yield data

    def error(self, weixin_request: WeiXinRequest):
        """Record a failed attempt and re-queue the request while under the limit.

        :param weixin_request: request that failed; its ``fail_times`` counter
            is incremented in place.
        """
        weixin_request.fail_times += 1
        print ('Request Faile: {}, Url is{}'.format(weixin_request.fail_times,
            weixin_request.url))
        if weixin_request.fail_times < MAX_FAIL_TIME:
            self.queue.pusp(weixin_request)

    def get_proxy(self, url='http://localhost:5000/random'):
        """Fetch one proxy address from the local proxy-pool service.

        :param url: endpoint of the proxy pool; must include the scheme,
            otherwise ``requests`` rejects it before connecting.
        :return: the response body on HTTP 200, otherwise ``None``.
        """
        try:
            resp = requests.get(url)
        except Exception:
            # Best effort: an unreachable pool just means "no proxy available".
            return None
        if resp.status_code == 200:
            return resp.text
        return None

    def schedule(self):
        """Process queued requests until the queue is empty.

        Each request is popped, executed, and its callback run on a 200
        response. Yielded ``WeiXinRequest`` objects are queued, yielded
        ``dict`` objects are inserted into MySQL. A non-200 / failed response,
        or a callback that yields nothing, is passed to ``error``.
        """
        while not self.queue.empty():
            weixin_request = self.queue.pop()
            callback = weixin_request.callback
            print ('Schedule: {}'.format(weixin_request.url))
            response = self.request(weixin_request)
            if not (response and response.status_code == 200):
                self.error(weixin_request)
                continue
            # Materialize the generator so an empty result can be detected.
            results = list(callback(response))
            if not results:
                self.error(weixin_request)
                continue
            for result in results:
                if isinstance(result, WeiXinRequest):
                    # Enqueue the follow-up request itself.
                    self.queue.pusp(result)
                elif isinstance(result, dict):
                    self.mysql.insert(result)

    def request(self, weixin_request: WeiXinRequest):
        """Send the prepared request, via a pool proxy when one is required and available.

        :param weixin_request: request to execute; ``need_proxy`` and
            ``timeout`` are read from it.
        :return: the response from ``Session.send``, or ``False`` on a
            connection error or read timeout.
        """
        try:
            send_kwargs = {
                'timeout': weixin_request.timeout,
                'allow_redirects': False,
            }
            if weixin_request.need_proxy:
                proxy = self.get_proxy()
                if proxy:
                    # Only pass ``proxies`` when a proxy was obtained, so the
                    # session's own proxy settings apply otherwise.
                    send_kwargs['proxies'] = {'http': proxy, 'https': proxy}
            return self.session.send(weixin_request.prepare(), **send_kwargs)
        except (ConnectionError, ReadTimeout):
            return False

    def run(self):
        """Queue the initial request, then drain the queue."""
        self.start()
        self.schedule()
Example #3
0
    try:
        ip, port = host.strip().split(":")
        proxies = dict(http='socks5://' + host, https='socks5://' + host)
        timeout = 1.0
        resp = requests.get('http://2017.ip138.com/ic.asp',
                            proxies=proxies,
                            timeout=timeout)
        r = resp.text.encode('UTF-8')
        ip = re.findall('\[(\d*?\.\d*?\.\d*?\.\d*?)\]', r)[0]
        if ip:
            rWrite.put(host)
            print "############" + host + "############"
    except:
        pass


# Proxy verification driver: move proxies from rRead into lProxy and hand
# them to the thread pool (one `check` call per proxy) every time the running
# counter reaches a multiple of 100; whatever is left once rRead is drained
# is dispatched as a final, smaller batch.
while True:
    while not rRead.empty():
        sProxy = rRead.pop()
        lProxy.append(sProxy)
        nCounter += 1
        if nCounter % 100 != 0:
            continue
        # Counter hit a multiple of 100: dispatch and wait for the batch.
        lRequests = threadpool.makeRequests(check, lProxy)
        for req in lRequests:
            pool.putRequest(req)
        pool.wait()
        lProxy = []
    if lProxy:
        # Queue drained with a partial batch still pending.
        lRequests = threadpool.makeRequests(check, lProxy)
        for req in lRequests:
            pool.putRequest(req)
        pool.wait()
        lProxy = []
class Spider(object):
    """Crawl Sogou WeChat search results for a keyword and store the articles.

    Pending requests live in a ``RedisQueue``. ``schedule`` pops them one at a
    time, performs the HTTP call and feeds the response to the callback stored
    on the request. Callbacks yield either new ``WeiXinRequest`` objects
    (queued again) or ``dict`` records (written through ``MySQL.insert``).

    NOTE(review): the queue's enqueue method is spelled ``pusp`` throughout;
    confirm this matches the RedisQueue API and is not a typo for ``push``.
    """

    def __init__(self, url='http://weixin.sogou.com/weixin', key_word=None):
        """Store the search endpoint and keyword and create the collaborators.

        :param url: base search URL; query strings and next-page links are
            appended to it.
        :param key_word: search term sent as the ``query`` parameter.
        """
        self.url = url
        self.key_word = key_word
        self.headers = {
            # Media ranges are comma-separated ("xhtml+xml,application/xml").
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding':
            'gzip, deflate',
            # Each language range carries its weight as ";q=<value>".
            'Accept-Language':
            'zh-CN,zh;q=0.8,en;q=0.6,ja;q=0.4,zh-TW;q=0.2,mt;q=0.2',
            'Cache-Control':
            'max-age=0',
            'Connection':
            'keep-alive',
            'Host':
            'weixin.sogou.com',
            'Upgrade-Insecure-Requests':
            '1',
            'User-Agent':
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
        }
        self.session = Session()
        self.queue = RedisQueue()
        self.mysql = MySQL()

    def start(self):
        """Install the default headers on the session and queue the first search request."""
        self.session.headers.update(self.headers)
        start_url = self.url + '?' + urlencode({
            'query': self.key_word,
            'type': 2
        })
        weixin_request = WeiXinRequest(url=start_url,
                                       callback=self.parse_index,
                                       need_proxy=True)
        self.queue.pusp(weixin_request)

    def parse_index(self, response):
        """Parse a search-result page.

        Yields one ``WeiXinRequest`` per article link (handled by
        ``parse_detail``) and, if a next-page link exists, one more request
        for that page (handled by ``parse_index`` again).

        :param response: response whose ``text`` holds the result page HTML.
        """
        doc = pq(response.text)
        for item in doc('.news-box .news-list li .txt-box h3 a').items():
            yield WeiXinRequest(url=item.attr('href'),
                                callback=self.parse_detail,
                                headers=self.headers)
        # Link to the next result page; its href is appended to the base URL.
        next_href = doc('#sogou_next').attr('href')
        if next_href:
            yield WeiXinRequest(url=self.url + str(next_href),
                                callback=self.parse_index,
                                headers=self.headers)

    def parse_detail(self, response):
        """Parse an article page and yield a single record ``dict``.

        :param response: response whose ``text`` holds the article HTML.
        """
        # The document must be built from the response before querying it.
        doc = pq(response.text)
        data = {
            'title':
            doc('.rich_media_title').text(),
            # Class selector (leading '.') for the article body element.
            'content':
            doc('.rich_media_content').text(),
            # NOTE(review): key is 'data' although the value comes from
            # '#post-date' — confirm against the MySQL schema before renaming.
            'data':
            doc('#post-date').text(),
            'nickname':
            doc('#js_profile_qrcode > div > strong').text(),
            'wechat':
            doc('#js_profile_qrcode > div > p:nth-child(3) > span').text(),
        }
        yield data

    def error(self, weixin_request: WeiXinRequest):
        """Record a failed attempt and re-queue the request while under the limit.

        :param weixin_request: request that failed; its ``fail_times`` counter
            is incremented in place.
        """
        weixin_request.fail_times += 1
        print('Request Faile: {}, Url is{}'.format(weixin_request.fail_times,
                                                   weixin_request.url))
        if weixin_request.fail_times < MAX_FAIL_TIME:
            self.queue.pusp(weixin_request)

    def get_proxy(self, url='http://localhost:5000/random'):
        """Fetch one proxy address from the local proxy-pool service.

        :param url: endpoint of the proxy pool; must include the scheme,
            otherwise ``requests`` rejects it before connecting.
        :return: the response body on HTTP 200, otherwise ``None``.
        """
        try:
            resp = requests.get(url)
        except Exception:
            # Best effort: an unreachable pool just means "no proxy available".
            return None
        if resp.status_code == 200:
            return resp.text
        return None

    def schedule(self):
        """Process queued requests until the queue is empty.

        Each request is popped, executed, and its callback run on a 200
        response. Yielded ``WeiXinRequest`` objects are queued, yielded
        ``dict`` objects are inserted into MySQL. A non-200 / failed response,
        or a callback that yields nothing, is passed to ``error``.
        """
        while not self.queue.empty():
            weixin_request = self.queue.pop()
            callback = weixin_request.callback
            print('Schedule: {}'.format(weixin_request.url))
            response = self.request(weixin_request)
            if not (response and response.status_code == 200):
                self.error(weixin_request)
                continue
            # Materialize the generator so an empty result can be detected.
            results = list(callback(response))
            if not results:
                self.error(weixin_request)
                continue
            for result in results:
                if isinstance(result, WeiXinRequest):
                    # Enqueue the follow-up request itself.
                    self.queue.pusp(result)
                elif isinstance(result, dict):
                    self.mysql.insert(result)

    def request(self, weixin_request: WeiXinRequest):
        """Send the prepared request, via a pool proxy when one is required and available.

        :param weixin_request: request to execute; ``need_proxy`` and
            ``timeout`` are read from it.
        :return: the response from ``Session.send``, or ``False`` on a
            connection error or read timeout.
        """
        try:
            send_kwargs = {
                'timeout': weixin_request.timeout,
                'allow_redirects': False,
            }
            if weixin_request.need_proxy:
                proxy = self.get_proxy()
                if proxy:
                    # Only pass ``proxies`` when a proxy was obtained, so the
                    # session's own proxy settings apply otherwise.
                    send_kwargs['proxies'] = {
                        'http': proxy,
                        'https': proxy,
                    }
            return self.session.send(weixin_request.prepare(), **send_kwargs)
        except (ConnectionError, ReadTimeout):
            return False

    def run(self):
        """Queue the initial request, then drain the queue."""
        self.start()
        self.schedule()