Example #1
import json
from time import sleep
from redis import StrictRedis

class Record(object):
    def __init__(self, host='127.0.0.1', port=6379):
        self.r = StrictRedis(host=host, port=port)
    
    def run(self):
        while True:
            value = self.r.rpop('alerts')
            if value:
                obj = json.loads(value)
                # Group events by flow: src_ip_src_port_dest_ip_dest_port
                keyredis = (obj['src_ip'] + '_' + str(obj['src_port']) + '_' +
                            obj['dest_ip'] + '_' + str(obj['dest_port']))
                entry = self.r.get(keyredis)
                if entry:
                    restruct = json.loads(entry)
                else:
                    restruct = {}
                # Make sure every bucket exists before appending
                if 'http' not in restruct:
                    restruct['http'] = []
                if 'alerts' not in restruct:
                    restruct['alerts'] = []
                if 'files' not in restruct:
                    restruct['files'] = []
                # Append whatever this event carries
                if 'alert' in obj:
                    restruct['alerts'].append(obj['alert']['signature'])
                if 'fileinfo' in obj:
                    restruct['files'].append(obj['fileinfo'])
                if 'http' in obj:
                    restruct['http'].append(obj['http'])
                self.r.set(keyredis, json.dumps(restruct))
            else:
                # Nothing queued; wait a second before polling again
                sleep(1)
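
The run loop above expects some producer to LPUSH EVE-style JSON events (src_ip, src_port, dest_ip, dest_port, alert, fileinfo, http fields, as emitted for example by Suricata) onto the 'alerts' list. A minimal sketch of feeding it one hand-made event; the field values below are illustrative, not taken from the original:

import json
from redis import StrictRedis

r = StrictRedis(host='127.0.0.1', port=6379)
# Illustrative event with the fields the Record class looks at
event = {
    'src_ip': '10.0.0.5', 'src_port': 51234,
    'dest_ip': '93.184.216.34', 'dest_port': 80,
    'alert': {'signature': 'ET POLICY curl User-Agent Outbound'},
}
r.lpush('alerts', json.dumps(event))

Record().run()  # blocks forever, aggregating events per flow key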
import base64
import random
import re
import threading
from Queue import Queue

import pymongo
import requests
from lxml import etree
from redis import StrictRedis

# HEADERS_USER is assumed to be a list of User-Agent strings defined elsewhere.

class DuanZiSpider(object):
    """Neihanba spider: crawls jokes and the images embedded in them."""
    def __init__(self):
        """Initialize the spider."""
        self.base_url = 'http://www.neihanpa.com/article'
        self.start_index = int(raw_input('Enter start page: '))
        self.end_index = int(raw_input('Enter end page: '))
        self.headers = HEADERS_USER
        # Queue holding the index-page numbers to crawl
        self.queue = Queue(self.end_index - self.start_index + 1)
        # XPath rule that extracts detail-page urls from an index page
        self.xpath_urls = '//a[@class="title" and @title]/@href'
        # Redis connection, used as a shared url queue
        self.redis_cli = StrictRedis('127.0.0.1')

    def send_request(self, url, query=None):
        """Send a GET request and return the raw response body."""
        print 'Thread: %s, fetching page: %s' % (threading.current_thread(), url)
        s = requests.session()
        s.keep_alive = False
        response = s.get(
            url,
            params=query,
            headers={'User-Agent': random.choice(self.headers)})
        return response.content

    def __open(self):
        """Open the MongoDB connection (database test, collection neihan)."""
        self.client = pymongo.MongoClient(host="127.0.0.1", port=27017)
        self.db = self.client.test
        self.collection = self.db.neihan

    def save_content(self, html):
        """Extract one joke page and save it to MongoDB."""
        html_obj = etree.HTML(html)
        # Joke text
        content_str = ''
        contents = html_obj.xpath('//div[@class="detail"]//p/text()')
        for co in contents:
            content_str += (co + '\n')
        # Joke title
        title = html_obj.xpath('//h1[@class="title"]/text()')

        # Save the joke image to disk; the url is stored in MongoDB
        img = html_obj.xpath('//div[@class="detail"]//img/@src')
        try:
            url = img[0]
        except Exception as e:
            print e
            return
        try:
            file_name = re.search(r'/(\w+\.png)$', url).group(1)
        except Exception as e:
            # Fallback file name when the url does not match the pattern
            file_name = base64.b16encode('dadasda') + '.png'
            print "Could not extract the image name"
        response = self.send_request(url)
        with open(r'd:/neihan/images/' + file_name, 'wb') as f:
            f.write(response)

        self.__open()
        item_list = {}
        item_list['title'] = title[0]
        item_list['img_path'] = url
        item_list['content'] = content_str
        print "[INFO] Writing to MongoDB"
        print self.client
        try:
            self.collection.insert(item_list)
            print "[INFO] Write succeeded!"
        except Exception as e:
            print 'Write to MongoDB failed'

    def parse_index_page(self, html):
        """Parse an index page and push the detail-page urls to Redis."""
        html_obj = etree.HTML(html)
        urls = html_obj.xpath(self.xpath_urls)
        print urls
        # Store the scraped urls in Redis
        for url in urls:
            self.redis_cli.lpush('urls', url)
            # print 'saved: %s, ok' % url

    def do_job(self):
        """Worker loop: take one index page from the queue and crawl it."""
        while True:
            i = self.queue.get()
            # Fetch and parse the index page for this page number
            url = self.base_url + '/index_' + str(i) + '.html'
            html = self.send_request(url)
            self.parse_index_page(html)

            while True:
                # Pop detail urls from Redis and crawl each one
                url_detail = self.redis_cli.rpop('urls')
                if not url_detail:
                    break
                detail_url = "http://www.neihanpa.com" + url_detail
                detail_html = self.send_request(detail_url)
                self.save_content(detail_html)

            # Tell the queue this page is finished
            self.queue.task_done()

    def main(self):
        # Fill the queue with the page numbers to crawl
        for i in range(self.start_index, self.end_index + 1):
            self.queue.put(i)
        # Start a pool of 9 worker threads
        for _ in range(9):
            t = threading.Thread(target=self.do_job)
            # Daemon threads are killed when the main thread exits
            t.daemon = True
            t.start()
        # Block until every queued page has been processed
        self.queue.join()
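
A minimal way to launch the spider, assuming a local Redis and MongoDB are running. HEADERS_USER is shown here with a single illustrative entry; in the original script it is a pre-built list of User-Agent strings:

if __name__ == '__main__':
    # Illustrative placeholder for the real HEADERS_USER list defined elsewhere
    HEADERS_USER = ['Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36']
    duanzi = DuanZiSpider()
    duanzi.main()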