def main(config):
    cfg = cliconfig(config)
    session = SessionFactory(cfg['database']['url']).create()
    server = Solr(str(cfg['solr']['url']),
                  http_user=cfg['solr'].get('username'),
                  http_pass=cfg['solr'].get('password'))

    documents = []
    # "Address.prefecture is not None" is evaluated by Python itself and is
    # always True; SQLAlchemy needs isnot() to emit an IS NOT NULL clause.
    q = session.query(Address).filter(Address.prefecture.isnot(None))
    q = q.order_by(Address.zipcode)
    for r in ifilter(lambda r: r, imap(transform, q)):
        documents.append(r)
        if len(documents) >= COMMIT_UNIT:
            server.add_many(documents)
            documents = []
    if len(documents) > 0:
        server.add_many(documents)
    server.commit()
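
# Illustrative sketch (not part of the original script): the nested keys that
# main() reads from the parsed configuration. The on-disk format handled by
# cliconfig() is not shown here; every value below is a placeholder.
EXAMPLE_CONFIG = {
    'database': {
        'url': 'postgresql://user:password@localhost/addresses',
    },
    'solr': {
        'url': 'http://localhost:8983/solr/zipcode',
        'username': 'solr_user',      # optional, passed as http_user
        'password': 'solr_password',  # optional, passed as http_pass
    },
}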
class SolrPipeline(object):
    """
    Scrapy Solr storage pipeline. SOLR_SERVERS must be configured in the
    Scrapy project's settings.py to specify the Solr server(s) to connect
    to, and at least one of SOLR_COLLECTION_MAP and SOLR_COLLECTION_DEFAULT
    must be set: the former maps spider names to Solr collection names, the
    latter names the default collection to store items into.

    The following settings are optional:
    SOLR_CLOUD_MODE: whether to run in SolrCloud mode; defaults to False
    SOLR_WEB_APP: the web app Solr runs under; defaults to "solr"
    SOLR_USER: Solr cluster user
    SOLR_PASSWORD: password of the Solr cluster user
    SOLR_DETECT_LIVE_NODES: whether to auto-detect the active nodes of the
        Solr cluster; defaults to False
    SOLR_TIMEOUT: Solr timeout; defaults to 10 seconds
    SOLR_CACHE_MAX_SIZE_PER_SPIDER: per-spider size, in bytes, of the Solr
        batch-commit cache; defaults to 10 * 1024 * 1024 (10 MB)
    DEFAULT_CACHE_MAX_SIZE_PER_SPIDER: used as the value of
        SOLR_CACHE_MAX_SIZE_PER_SPIDER when that setting is absent
    SOLR_CACHE_MAX_ELEMENTS_PER_SPIDER: maximum number of elements in the
        per-spider Solr batch-commit cache; defaults to 100
    DEFAULT_CACHE_MAX_ELEMENTS_PER_SPIDER: used as the value of
        SOLR_CACHE_MAX_ELEMENTS_PER_SPIDER when that setting is absent

    This component assumes that every field name of the item objects has a
    corresponding definition in the schema.xml of the target collection.
    """

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def __init__(self, crawler):
        self.solr_connection = None
        self.solr_collection = None

        solr_cloud_mode = crawler.settings.get("SOLR_CLOUD_MODE")
        self.solr_cloud_mode = solr_cloud_mode if solr_cloud_mode else False

        solr_servers = crawler.settings.get("SOLR_SERVERS")
        if not solr_servers:
            log.msg("No field SOLR_SERVERS in settings.py!",
                    level=log.CRITICAL)
            raise NotConfigured
        elif not isinstance(solr_servers, list):
            log.msg("Field SOLR_SERVERS in settings.py must be a list of "
                    "URL(s) of solr server(s)!", level=log.CRITICAL)
            raise NotConfigured
        elif not self.solr_cloud_mode and len(solr_servers) > 1:
            log.msg("Can't specify multiple URLs when SOLR_CLOUD_MODE is False!",
                    level=log.CRITICAL)
            raise NotConfigured
        self.solr_servers = solr_servers

        solr_web_app = crawler.settings.get("SOLR_WEB_APP")
        solr_web_app = solr_web_app if solr_web_app else "solr"
        self.solr_web_app = (solr_web_app if solr_web_app.endswith('/')
                             else solr_web_app + '/')

        solr_collection_default = crawler.settings.get('SOLR_COLLECTION_DEFAULT')
        solr_collection_map = crawler.settings.get('SOLR_COLLECTION_MAP')
        if not solr_collection_default and not solr_collection_map:
            log.msg(
                "You must at least set one of the two fields SOLR_COLLECTION_MAP "
                "and SOLR_COLLECTION_DEFAULT in settings.py!",
                level=log.CRITICAL)
            raise NotConfigured
        if solr_collection_map and not isinstance(solr_collection_map, dict):
            log.msg("Field SOLR_COLLECTION_MAP in settings.py must be a dict!",
                    level=log.CRITICAL)
            raise NotConfigured
        self.solr_collection_name = solr_collection_default
        self.solr_collection_map = solr_collection_map if solr_collection_map else {}

        self.solr_user = crawler.settings.get("SOLR_USER")
        self.solr_password = crawler.settings.get("SOLR_PASSWORD")

        solr_detect_live_nodes = crawler.settings.get("SOLR_DETECT_LIVE_NODES")
        self.solr_detect_live_nodes = (solr_detect_live_nodes
                                       if solr_detect_live_nodes else False)

        solr_timeout = crawler.settings.get("SOLR_TIMEOUT")
        self.solr_timeout = solr_timeout if solr_timeout is not None else 10

        solr_cache_max_size = crawler.settings.get("SOLR_CACHE_MAX_SIZE_PER_SPIDER")
        if solr_cache_max_size is None:
            solr_cache_max_size = crawler.settings.get("DEFAULT_CACHE_MAX_SIZE_PER_SPIDER")
        self.solr_cache_max_size = (solr_cache_max_size
                                    if solr_cache_max_size is not None
                                    else 10 * 1024 * 1024)

        solr_cache_max_len = crawler.settings.get("SOLR_CACHE_MAX_ELEMENTS_PER_SPIDER")
        if solr_cache_max_len is None:
            solr_cache_max_len = crawler.settings.get("DEFAULT_CACHE_MAX_ELEMENTS_PER_SPIDER")
        self.solr_cache_max_len = (solr_cache_max_len
                                   if solr_cache_max_len is not None else 100)

        self.crawler = crawler
        self.cache_buffer = {}
        self.locks = {}

    def open_spider(self, spider):
        solr_collection_name = self.solr_collection_map.get(spider.name)
        if solr_collection_name:
            self.solr_collection_name = solr_collection_name
        if not self.solr_collection_name:
            spider.log("No collection associated with " + spider.name + "!",
                       level=log.CRITICAL)
            raise CloseSpider

        if self.solr_cloud_mode:
            from solrcloudpy import SolrConnection
            self.solr_connection = SolrConnection(
                server=self.solr_servers,
                detect_live_nodes=self.solr_detect_live_nodes,
                user=self.solr_user,
                password=self.solr_password,
                timeout=self.solr_timeout,
                webappdir=self.solr_web_app)
            self.solr_collection = self.solr_connection[self.solr_collection_name]
        else:
            from solr import Solr
            from urlparse import urljoin
            collection_url = reduce(urljoin, (self.solr_servers[0],
                                              self.solr_web_app,
                                              self.solr_collection_name))
            if isinstance(collection_url, unicode):
                collection_url = collection_url.encode("UTF-8")
            self.solr_collection = Solr(url=collection_url,
                                        http_user=self.solr_user,
                                        http_pass=self.solr_password,
                                        timeout=self.solr_timeout)

        if self.solr_cache_max_len > 0:
            max_len = self.solr_cache_max_len * 2
        else:
            max_len = 2
        self.cache_buffer[spider.name] = SpiderCache(maxlen=max_len)
        self.locks[spider.name] = Lock()

    def close_spider(self, spider):
        try:
            self.index_item(None, spider, True)
        finally:
            self.cache_buffer[spider.name].clear()

    @check_spider_pipeline
    def process_item(self, item, spider):
        self.index_item(item, spider, False)
        return item

    def index_item(self, item, spider, close_spider):
        lock = self.locks[spider.name]
        lock.acquire()
        cache_queue = self.cache_buffer[spider.name]
        if not close_spider:
            cache_queue.append(dict(item))
        cache_len = len(cache_queue)
        cache_size = sizeof(cache_queue)
        try:
            if (close_spider and cache_len > 0) or cache_len >= self.solr_cache_max_len \
                    or (cache_len > 0 and cache_size >= self.solr_cache_max_size):
                if self.solr_cloud_mode:
                    self.solr_collection.add(list(cache_queue))
                else:
                    self.solr_collection.add_many(list(cache_queue))
                # self.solr_collection.commit()
                spider.log(
                    "{cache_len} items of size {cache_size} byte(s) indexed in solr".format(
                        cache_len=cache_len, cache_size=cache_size),
                    level=log.INFO)
                cache_queue.clear()
        except Exception, e:
            trace_info = traceback.format_exc()
            spider.log(
                "Failed to index item(s): {message}\n{trace_info}".format(
                    message=e.message, trace_info=trace_info),
                level=log.ERROR)
        finally:
            lock.release()
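
# Illustrative sketch of the settings.py entries this pipeline expects, based
# on the docstring above. The module path "myproject.pipelines.SolrPipeline"
# and all values are hypothetical placeholders, not taken from the original
# project.
#
# ITEM_PIPELINES = {'myproject.pipelines.SolrPipeline': 300}
#
# SOLR_SERVERS = ['http://localhost:8983/']   # a single URL unless SOLR_CLOUD_MODE is True
# SOLR_WEB_APP = 'solr'                       # default
# SOLR_COLLECTION_MAP = {'zipcode_spider': 'zipcodes'}  # spider name -> collection name
# SOLR_COLLECTION_DEFAULT = 'scraped_items'   # fallback collection
# SOLR_CLOUD_MODE = False
# SOLR_USER = None
# SOLR_PASSWORD = None
# SOLR_DETECT_LIVE_NODES = False
# SOLR_TIMEOUT = 10
# SOLR_CACHE_MAX_ELEMENTS_PER_SPIDER = 100
# SOLR_CACHE_MAX_SIZE_PER_SPIDER = 10 * 1024 * 1024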