import json
from time import sleep

from redis import StrictRedis


class Record(object):
    """Consume alert events from the Redis list 'alerts' and aggregate
    them per flow under the key src_ip_src_port_dest_ip_dest_port."""

    def __init__(self, host='127.0.0.1', port=6379):
        # Pass host/port through instead of silently using the defaults.
        self.r = StrictRedis(host=host, port=port)

    def run(self):
        while True:
            value = self.r.rpop('alerts')
            if value:
                obj = json.loads(value)
                # Flow key: src_ip_src_port_dest_ip_dest_port
                keyredis = (obj['src_ip'] + '_' + str(obj['src_port']) + '_' +
                            obj['dest_ip'] + '_' + str(obj['dest_port']))
                entry = self.r.get(keyredis)
                if entry:
                    restruct = json.loads(entry)
                else:
                    restruct = {}
                if 'http' not in restruct:
                    restruct['http'] = []
                if 'alerts' not in restruct:
                    restruct['alerts'] = []
                if 'files' not in restruct:
                    restruct['files'] = []
                if 'alert' in obj:
                    restruct['alerts'].append(obj['alert']['signature'])
                if 'fileinfo' in obj:
                    restruct['files'].append(obj['fileinfo'])
                if 'http' in obj:
                    restruct['http'].append(obj['http'])
                if len(restruct) > 0:
                    self.r.set(keyredis, json.dumps(restruct))
            else:
                sleep(1)
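# A minimal sketch of exercising the consumer above end to end, assuming a
# local Redis at 127.0.0.1:6379. The sample event is hypothetical; it carries
# only the fields Record actually reads.
import json
import threading
from time import sleep

from redis import StrictRedis

r = StrictRedis()
event = {
    'src_ip': '10.0.0.1', 'src_port': 34567,
    'dest_ip': '93.184.216.34', 'dest_port': 80,
    'alert': {'signature': 'ET POLICY example signature'},
}
r.lpush('alerts', json.dumps(event))

# Record.run() loops forever, so run it on a daemon thread for this demo.
worker = threading.Thread(target=Record().run)
worker.daemon = True
worker.start()
sleep(2)

key = '10.0.0.1_34567_93.184.216.34_80'
print(json.loads(r.get(key)))  # -> {'http': [], 'alerts': ['ET POLICY ...'], 'files': []}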
import base64
import random
import re
import threading
from queue import Queue

import pymongo
import requests
from lxml import etree
from redis import StrictRedis


class DuanZiSpider(object):
    """Neihanba spider: crawls jokes and the images embedded in them."""

    def __init__(self):
        """Initialize the spider."""
        self.base_url = 'http://www.neihanpa.com/article'
        self.start_index = int(input('Start page: '))
        self.end_index = int(input('End page: '))
        self.headers = HEADERS_USER  # list of User-Agent strings, assumed defined elsewhere
        # Queue holding the index-page numbers to crawl (inclusive range).
        self.queue = Queue(self.end_index - self.start_index + 1)
        # XPath rule for extracting detail-page URLs.
        self.xpath_urls = '//a[@class="title" and @title]/@href'
        # Redis connection (decode_responses so rpop returns str, not bytes).
        self.redis_cli = StrictRedis('127.0.0.1', decode_responses=True)

    def send_request(self, url, query=None):
        """Send a GET request and return the raw response body."""
        print('Thread: %s, crawling page: %s' % (threading.current_thread(), url))
        s = requests.Session()
        s.keep_alive = False
        # Use the session we just created, with a random User-Agent.
        response = s.get(
            url,
            params=query,
            headers={'User-Agent': random.choice(self.headers)})
        return response.content

    def __open(self):
        self.client = pymongo.MongoClient(host='127.0.0.1', port=27017)
        self.db = self.client.test
        self.collection = self.db.neihan

    def save_content(self, html):
        """Save the joke content to MongoDB."""
        html_obj = etree.HTML(html)
        # Extract the joke text.
        content_str = ''
        contents = html_obj.xpath('//div[@class="detail"]//p/text()')
        for co in contents:
            content_str += (co + '\n')
        # Joke title.
        title = html_obj.xpath('//h1[@class="title"]/text()')
        # Save the joke image locally; store its path in MongoDB.
        img = html_obj.xpath('//div[@class="detail"]//img/@src')
        try:
            url = img[0]
        except Exception as e:
            print(e)
            return
        try:
            file_name = re.search(r'/(\w+\.png)$', url).group(1)
        except Exception:
            # Fallback: derive the name from the URL so files do not collide.
            file_name = base64.b16encode(url.encode('utf-8')).decode() + '.png'
            print('Failed to extract the image file name')
        response = self.send_request(url)
        with open(r'd:/neihan/images/' + file_name, 'wb') as f:
            f.write(response)
        self.__open()
        item_list = {}
        item_list['title'] = title[0]
        item_list['img_path'] = url
        item_list['content'] = content_str
        print('[INFO] Writing to MongoDB')
        print(self.client)
        try:
            self.collection.insert_one(item_list)
            print('[INFO] Write succeeded!')
        except Exception:
            print('Write to MongoDB failed')

    def parse_index_page(self, html):
        """Parse an index page and extract detail-page URLs."""
        html_obj = etree.HTML(html)
        urls = html_obj.xpath(self.xpath_urls)
        print(urls)
        # Store the extracted URLs in Redis.
        for url in urls:
            self.redis_cli.lpush('urls', url)
            # print('Saved: %s, ok' % url)

    def do_job(self):
        """Worker loop: crawl one index page, then drain its detail URLs."""
        while True:
            i = self.queue.get()
            # Execute the task.
            url = self.base_url + '/index_' + str(i) + '.html'
            html = self.send_request(url)
            self.parse_index_page(html)
            while True:
                # Pop URLs from Redis and crawl them.
                url_detail = self.redis_cli.rpop('urls')
                if not url_detail:
                    break
                detail_url = 'http://www.neihanpa.com' + url_detail
                detail_html = self.send_request(detail_url)
                self.save_content(detail_html)
            # Notify the queue each time a task finishes.
            self.queue.task_done()

    def main(self):
        # Enqueue the page numbers the workers will consume; without this the
        # workers would block forever on queue.get().
        for i in range(self.start_index, self.end_index + 1):
            self.queue.put(i)
        # Create a pool of 9 threads.
        for _ in range(9):
            t = threading.Thread(target=self.do_job)
            # Daemon threads: when the main thread exits, all workers die with it.
            t.daemon = True
            # Start the thread.
            t.start()
        # Block until every queued page has been processed.
        self.queue.join()
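# A minimal sketch of running the spider, assuming Redis on 127.0.0.1:6379,
# MongoDB on 127.0.0.1:27017, and an existing d:/neihan/images/ directory.
# HEADERS_USER here is a hypothetical stand-in; the real list of User-Agent
# strings lives elsewhere in the project.
HEADERS_USER = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
]

if __name__ == '__main__':
    duanzi = DuanZiSpider()
    duanzi.main()  # prompts for start/end pages, then blocks until done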