class DuanZiSpider(object): """内涵吧爬虫:爬取段子以及段子中的图片""" def __init__(self): """初始化对象""" self.base_url = 'http://www.neihanpa.com/article' self.start_index = int(raw_input('请输入开始页:')) self.end_index = int(raw_input('请输入结束页:')) self.headers = HEADERS_USER # 创建队列存储页面 self.queue = Queue(int(self.end_index - self.start_index)) # 创建匹配规则获取urls self.xpath_urls = '//a[@class="title"and @title]/@href ' # 创建Redis链接 self.redis_cli = StrictRedis('127.0.0.1') def send_request(self, url, query={}): """发送请求""" print '线程: %s ,正在爬取页面: %s' % (threading.current_thread(), url) s = requests.session() s.keep_alive = False response = requests.get( url, params=query, headers={'User-Agent': random.choice(self.headers)}) return response.content def __open(self): self.client = pymongo.MongoClient(host="127.0.0.1", port=27017) self.db = self.client.test self.collection = self.db.neihan def save_content(self, html): """保存内容到mangodb""" html_obj = etree.HTML(html) # 找到段子文本 content_str = '' contents = html_obj.xpath('//div[@class="detail"]//p/text()') for co in contents: content_str += (co + '\n') # 段子标题 title = html_obj.xpath('//h1[@class="title"]/text()') # 段子图片保存在本地,返回名字,保存路径到mangodb img = html_obj.xpath('//div[@class="detail"]//img/@src') try: url = img[0] except Exception as e: print e return try: file_name = re.search(r'/(\w+\.png)$', url).group(1) except Exception as e: file_name = base64.b16encode('dadasda') + '.png' print "图片名称提取失败" response = self.send_request(url) with open(r'd:/neihan/images/' + file_name, 'wb') as f: f.write(response) self.__open() item_list = {} item_list['title'] = title[0] item_list['img_path'] = url item_list['content'] = content_str print "[INFO] 正在写入MongoDB" print self.client try: self.collection.insert(item_list) print "[INFO] 写入成功!" except Exception as e: print '写入mongodb失败' def parse_index_page(self, html): """处理抓取url页面内容""" html_obj = etree.HTML(html) urls = html_obj.xpath(self.xpath_urls) print urls # 抓取到的url存入Redis for url in urls: self.redis_cli.lpush('urls', url) # print '保存:%s,ok'%url def do_job(self): """爬虫开始工作""" while True: i = self.queue.get() # 执行任务 url = self.base_url + '/index_' + str(i) + '.html' html = self.send_request(url) self.parse_index_page(html) while True: # 从Redis获取url爬取 url_detail = self.redis_cli.rpop('urls') if not url_detail: break detail_url = "http://www.neihanpa.com" + url_detail detail_html = self.send_request(detail_url) self.save_content(detail_html) # 每执行完一个任务通知队列 self.queue.task_done() def main(self): # 创建9个线程的线程池 for _ in range(1, 10): t = threading.Thread(target=duanzi.do_job) # 设置成守护线程,主线程退出,所有线程也会挂掉 t.daemon = True # 开启线程 t.start()
class RedisStore: def __init__(self, db_host, db_port, db_num, db_pw): self.pool = ConnectionPool(max_connections=2, db=db_num, host=db_host, port=db_port, password=db_pw, decode_responses=True) self.redis = StrictRedis(connection_pool=self.pool) self.redis.ping() self._object_map = WeakValueDictionary() def create_object(self, dbo_class, dbo_dict, update_timestamp=True): dbo_class = get_dbo_class(getattr(dbo_class, 'dbo_key_type', dbo_class)) if not dbo_class: return try: dbo_id = dbo_dict['dbo_id'] except KeyError: dbo_id, dbo_dict = dbo_dict, {} if dbo_id is None or dbo_id == '': log.warn("create_object called with empty dbo_id") return dbo_id = str(dbo_id).lower() if self.object_exists(dbo_class.dbo_key_type, dbo_id): raise ObjectExistsError(dbo_id) dbo = dbo_class() dbo.dbo_id = dbo_id dbo.hydrate(dbo_dict) dbo.db_created() if dbo.dbo_set_key: self.redis.sadd(dbo.dbo_set_key, dbo.dbo_id) self.save_object(dbo, update_timestamp) return dbo def load_object(self, dbo_key, key_type=None, silent=False): if key_type: try: key_type = key_type.dbo_key_type except AttributeError: pass try: dbo_key, dbo_id = ':'.join((key_type, dbo_key)), dbo_key except TypeError: if not silent: log.exception("Invalid dbo_key passed to load_object", stack_info=True) return else: key_type, _, dbo_id = dbo_key.partition(':') cached_dbo = self._object_map.get(dbo_key) if cached_dbo: return cached_dbo json_str = self.redis.get(dbo_key) if not json_str: if not silent: log.warn("Failed to find {} in database", dbo_key) return return self._json_to_obj(json_str, key_type, dbo_id) def save_object(self, dbo, update_timestamp=False, autosave=False): if update_timestamp: dbo.dbo_ts = int(time.time()) if dbo.dbo_indexes: self._update_indexes(dbo) self._clear_old_refs(dbo) save_root, new_refs = dbo.to_db_value() self.redis.set(dbo.dbo_key, json_encode(save_root)) if new_refs: self._set_new_refs(dbo, new_refs) log.debug("db object {} {}saved", dbo.dbo_key, "auto" if autosave else "") self._object_map[dbo.dbo_key] = dbo return dbo def update_object(self, dbo, dbo_dict): dbo.hydrate(dbo_dict) return self.save_object(dbo, True) def delete_object(self, dbo): key = dbo.dbo_key dbo.db_deleted() self.delete_key(key) self._clear_old_refs(dbo) if dbo.dbo_set_key: self.redis.srem(dbo.dbo_set_key, dbo.dbo_id) for children_type in dbo.dbo_children_types: self.delete_object_set( get_dbo_class(children_type), "{}_{}s:{}".format(dbo.dbo_key_type, children_type, dbo.dbo_id)) for ix_name in dbo.dbo_indexes: ix_value = getattr(dbo, ix_name, None) if ix_value is not None and ix_value != '': self.delete_index('ix:{}:{}'.format(dbo.dbo_key_type, ix_name), ix_value) log.debug("object deleted: {}", key) self.evict_object(dbo) def load_cached(self, dbo_key): return self._object_map.get(dbo_key) def object_exists(self, obj_type, obj_id): return self.redis.exists('{}:{}'.format(obj_type, obj_id)) def load_object_set(self, dbo_class, set_key=None): dbo_class = get_dbo_class(getattr(dbo_class, 'dbo_key_type', dbo_class)) key_type = dbo_class.dbo_key_type if not set_key: set_key = dbo_class.dbo_set_key results = set() keys = deque() pipeline = self.redis.pipeline() for key in self.fetch_set_keys(set_key): dbo_key = ':'.join([key_type, key]) try: results.add(self._object_map[dbo_key]) except KeyError: keys.append(key) pipeline.get(dbo_key) for dbo_id, json_str in zip(keys, pipeline.execute()): if json_str: obj = self._json_to_obj(json_str, key_type, dbo_id) if obj: results.add(obj) continue log.warn("Removing missing object from set {}", set_key) self.delete_set_key(set_key, dbo_id) return results def delete_object_set(self, dbo_class, set_key=None): if not set_key: set_key = dbo_class.dbo_set_key for dbo in self.load_object_set(dbo_class, set_key): self.delete_object(dbo) self.delete_key(set_key) def reload_object(self, dbo_key): dbo = self._object_map.get(dbo_key) if dbo: json_str = self.redis.get(dbo_key) if not json_str: log.warn("Failed to find {} in database for reload", dbo_key) return None return self.update_object(dbo, json_decode(json_str)) return self.load_object(dbo_key) def evict_object(self, dbo): self._object_map.pop(dbo.dbo_key, None) def load_value(self, key, default=None): json = self.redis.get(key) if json: return json_decode(json) return default def save_value(self, key, value): self.redis.set(key, json_encode(value)) def fetch_set_keys(self, set_key): return self.redis.smembers(set_key) def add_set_key(self, set_key, *values): self.redis.sadd(set_key, *values) def delete_set_key(self, set_key, value): self.redis.srem(set_key, value) def set_key_exists(self, set_key, value): return self.redis.sismember(set_key, value) def db_counter(self, counter_id, inc=1): return self.redis.incr("counter:{}".format(counter_id), inc) def delete_key(self, key): self.redis.delete(key) def set_index(self, index_name, key, value): return self.redis.hset(index_name, key, value) def get_index(self, index_name, key): return self.redis.hget(index_name, key) def get_full_index(self, index_name): return self.redis.hgetall(index_name) def delete_index(self, index_name, key): return self.redis.hdel(index_name, key) def get_all_hash(self, index_name): return { key: json_decode(value) for key, value in self.redis.hgetall(index_name).items() } def get_hash_keys(self, hash_id): return self.redis.hkeys(hash_id) def set_db_hash(self, hash_id, hash_key, value): return self.redis.hset(hash_id, hash_key, json_encode(value)) def get_db_hash(self, hash_id, hash_key): return json_decode(self.redis.hget(hash_id, hash_key)) def remove_db_hash(self, hash_id, hash_key): self.redis.hdel(hash_id, hash_key) def get_all_db_hash(self, hash_id): return [ json_decode(value) for value in self.redis.hgetall(hash_id).values() ] def get_db_list(self, list_id, start=0, end=-1): return [ json_decode(value) for value in self.redis.lrange(list_id, start, end) ] def add_db_list(self, list_id, value): self.redis.lpush(list_id, json_encode(value)) def trim_db_list(self, list_id, start, end): return self.redis.ltrim(list_id, start, end) def dbo_holders(self, dbo_key, degrees=0): all_keys = set() def find(find_key, degree): holder_keys = self.fetch_set_keys('{}:holders'.format(find_key)) for new_key in holder_keys: if new_key != dbo_key and new_key not in all_keys: all_keys.add(new_key) if degree < degrees: find(new_key, degree + 1) find(dbo_key, 0) return all_keys def _json_to_obj(self, json_str, key_type, dbo_id): dbo_dict = json_decode(json_str) dbo = get_mixed_type(key_type, dbo_dict.get('mixins'))() dbo.dbo_id = dbo_id dbo.hydrate(dbo_dict) self._object_map[dbo.dbo_key] = dbo return dbo def _update_indexes(self, dbo): try: old_dbo = json_decode(self.redis.get(dbo.dbo_key)) except TypeError: old_dbo = None for ix_name in dbo.dbo_indexes: new_val = getattr(dbo, ix_name, None) old_val = old_dbo.get(ix_name) if old_dbo else None if old_val == new_val: continue ix_key = 'ix:{}:{}'.format(dbo.dbo_key_type, ix_name) if old_val is not None: self.delete_index(ix_key, old_val) if new_val is not None and new_val != '': if self.get_index(ix_key, new_val): raise NonUniqueError(ix_key, new_val) self.set_index(ix_key, new_val, dbo.dbo_id) def _clear_old_refs(self, dbo): dbo_key = dbo.dbo_key ref_key = '{}:refs'.format(dbo_key) for ref_id in self.fetch_set_keys(ref_key): holder_key = '{}:holders'.format(ref_id) self.delete_set_key(holder_key, dbo_key) self.delete_key(ref_key) def _set_new_refs(self, dbo, new_refs): dbo_key = dbo.dbo_key self.add_set_key("{}:refs".format(dbo_key), *new_refs) for ref_id in new_refs: holder_key = '{}:holders'.format(ref_id) self.add_set_key(holder_key, dbo_key)
class RedisStore(): def __init__(self, db_host, db_port, db_num, db_pw): self.pool = ConnectionPool(max_connections=2, db=db_num, host=db_host, port=db_port, password=db_pw, decode_responses=True) self.redis = StrictRedis(connection_pool=self.pool) self.redis.ping() self._object_map = WeakValueDictionary() def create_object(self, dbo_class, dbo_dict, update_timestamp=True): dbo_class = get_dbo_class(getattr(dbo_class, 'dbo_key_type', dbo_class)) if not dbo_class: return try: dbo_id = dbo_dict['dbo_id'] except KeyError: dbo_id, dbo_dict = dbo_dict, {} if dbo_id is None or dbo_id == '': warn("create_object called with empty dbo_id") return dbo_id = str(dbo_id).lower() if self.object_exists(dbo_class.dbo_key_type, dbo_id): raise ObjectExistsError(dbo_id) dbo = dbo_class() dbo.dbo_id = dbo_id dbo.hydrate(dbo_dict) dbo.db_created() if dbo.dbo_set_key: self.redis.sadd(dbo.dbo_set_key, dbo.dbo_id) self.save_object(dbo, update_timestamp) return dbo def save_object(self, dbo, update_timestamp=False, autosave=False): if update_timestamp: dbo.dbo_ts = int(time.time()) if dbo.dbo_indexes: self._update_indexes(dbo) self._clear_old_refs(dbo) save_root, new_refs = dbo.to_db_value() self.redis.set(dbo.dbo_key, json_encode(save_root)) if new_refs: self._set_new_refs(dbo, new_refs) debug("db object {} {}saved", dbo.dbo_key, "auto" if autosave else "") self._object_map[dbo.dbo_key] = dbo return dbo def save_raw(self, key, raw): self.redis.set(key, json_encode(raw)) def load_raw(self, key, default=None): json = self.redis.get(key) if json: return json_decode(json) return default def load_cached(self, dbo_key): return self._object_map.get(dbo_key) def load_object(self, dbo_key, key_type=None, silent=False): if key_type: try: key_type = key_type.dbo_key_type except AttributeError: pass try: dbo_key, dbo_id = ':'.join([key_type, dbo_key]), dbo_key except TypeError: if not silent: exception("Invalid dbo_key passed to load_object", stack_info=True) return else: key_type, _, dbo_id = dbo_key.partition(':') cached_dbo = self._object_map.get(dbo_key) if cached_dbo: return cached_dbo json_str = self.redis.get(dbo_key) if not json_str: if not silent: warn("Failed to find {} in database", dbo_key) return return self.load_from_json(json_str, key_type, dbo_id) def load_from_json(self, json_str, key_type, dbo_id): dbo_dict = json_decode(json_str) dbo = get_mixed_type(key_type, dbo_dict.get('mixins'))() dbo.dbo_id = dbo_id self._object_map[dbo.dbo_key] = dbo dbo.hydrate(dbo_dict) return dbo def object_exists(self, obj_type, obj_id): return self.redis.exists('{}:{}'.format(obj_type, obj_id)) def load_object_set(self, dbo_class, set_key=None): key_type = dbo_class.dbo_key_type if not set_key: set_key = dbo_class.dbo_set_key results = set() keys = deque() pipeline = self.redis.pipeline() for key in self.fetch_set_keys(set_key): dbo_key = ':'.join([key_type, key]) try: results.add(self._object_map[dbo_key]) except KeyError: keys.append(key) pipeline.get(dbo_key) for dbo_id, json_str in zip(keys, pipeline.execute()): if json_str: obj = self.load_from_json(json_str, key_type, dbo_id) if obj: results.add(obj) continue warn("Removing missing object from set {}", set_key) self.delete_set_key(set_key, dbo_id) return results def delete_object_set(self, dbo_class, set_key=None): if not set_key: set_key = dbo_class.dbo_set_key for dbo in self.load_object_set(dbo_class, set_key): self.delete_object(dbo) self.delete_key(set_key) def update_object(self, dbo, dbo_dict): dbo.hydrate(dbo_dict) return self.save_object(dbo, True) def delete_object(self, dbo): key = dbo.dbo_key dbo.db_deleted() self.delete_key(key) self._clear_old_refs(dbo) if dbo.dbo_set_key: self.redis.srem(dbo.dbo_set_key, dbo.dbo_id) for children_type in dbo.dbo_children_types: self.delete_object_set(get_dbo_class(children_type), "{}_{}s:{}".format(dbo.dbo_key_type, children_type, dbo.dbo_id)) for ix_name in dbo.dbo_indexes: ix_value = getattr(dbo, ix_name, None) if ix_value is not None and ix_value != '': self.delete_index('ix:{}:{}'.format(dbo.dbo_key_type, ix_name), ix_value) debug("object deleted: {}", key) self.evict_object(dbo) def reload_object(self, dbo_key): dbo = self._object_map.get(dbo_key) if dbo: json_str = self.redis.get(dbo_key) if not json_str: warn("Failed to find {} in database for reload", dbo_key) return None return self.update_object(dbo, json_decode(json_str)) return self.load_object(dbo_key) def evict_object(self, dbo): self._object_map.pop(dbo.dbo_key, None) def fetch_set_keys(self, set_key): return self.redis.smembers(set_key) def add_set_key(self, set_key, *values): self.redis.sadd(set_key, *values) def delete_set_key(self, set_key, value): self.redis.srem(set_key, value) def set_key_exists(self, set_key, value): return self.redis.sismember(set_key, value) def db_counter(self, counter_id, inc=1): return self.redis.incr("counter:{}".format(counter_id), inc) def delete_key(self, key): self.redis.delete(key) def set_index(self, index_name, key, value): return self.redis.hset(index_name, key, value) def get_index(self, index_name, key): return self.redis.hget(index_name, key) def get_full_index(self, index_name): return self.redis.hgetall(index_name) def delete_index(self, index_name, key): return self.redis.hdel(index_name, key) def get_all_hash(self, index_name): return {key: json_decode(value) for key, value in self.redis.hgetall(index_name).items()} def set_db_hash(self, hash_id, hash_key, value): return self.redis.hset(hash_id, hash_key, json_encode(value)) def get_db_hash(self, hash_id, hash_key): return json_decode(self.redis.hget(hash_id, hash_key)) def remove_db_hash(self, hash_id, hash_key): self.redis.hdel(hash_id, hash_key) def get_all_db_hash(self, hash_id): return [json_decode(value) for value in self.redis.hgetall(hash_id).values()] def get_db_list(self, list_id, start=0, end=-1): return [json_decode(value) for value in self.redis.lrange(list_id, start, end)] def add_db_list(self, list_id, value): self.redis.lpush(list_id, json_encode(value)) def trim_db_list(self, list_id, start, end): return self.redis.ltrim(list_id, start, end) def _update_indexes(self, dbo): try: old_dbo = json_decode(self.redis.get(dbo.dbo_key)) except TypeError: old_dbo = None for ix_name in dbo.dbo_indexes: new_val = getattr(dbo, ix_name, None) old_val = old_dbo.get(ix_name) if old_dbo else None if old_val == new_val: continue ix_key = 'ix:{}:{}'.format(dbo.dbo_key_type, ix_name) if old_val is not None: self.delete_index(ix_key, old_val) if new_val is not None and new_val != '': if self.get_index(ix_key, new_val): raise NonUniqueError(ix_key, new_val) self.set_index(ix_key, new_val, dbo.dbo_id) def _clear_old_refs(self, dbo): dbo_key = dbo.dbo_key ref_key = '{}:refs'.format(dbo_key) for ref_id in self.fetch_set_keys(ref_key): holder_key = '{}:holders'.format(ref_id) self.delete_set_key(holder_key, dbo_key) self.delete_key(ref_key) def _set_new_refs(self, dbo, new_refs): dbo_key = dbo.dbo_key self.add_set_key("{}:refs".format(dbo_key), *new_refs) for ref_id in new_refs: holder_key = '{}:holders'.format(ref_id) self.add_set_key(holder_key, dbo_key)