def add_sounds_to_solr(sounds):
    """Convert each sound in *sounds* to a Solr document and post the
    whole batch to the sound index at ``settings.SOLR_URL``."""
    logger.info("adding multiple sounds to solr index")
    solr = Solr(settings.SOLR_URL)
    logger.info("creating XML")
    documents = map(convert_to_solr_document, sounds)
    logger.info("posting to Solr")
    solr.add(documents)
# NOTE(review): this is a duplicate of an identical add_sounds_to_solr
# definition earlier in the file; the later definition wins at import time.
def add_sounds_to_solr(sounds):
    """Post *sounds* (converted to Solr documents) to the sound index."""
    logger.info("adding multiple sounds to solr index")
    solr = Solr(settings.SOLR_URL)
    logger.info("creating XML")
    documents = [convert_to_solr_document(sound) for sound in sounds]
    logger.info("posting to Solr")
    solr.add(documents)
def add_posts_to_solr(posts):
    """Post *posts* (converted to Solr documents) to the forum index.

    The optimize step is commented out; the surrounding log lines remain.
    """
    logger.info("adding multiple forum posts to solr index")
    solr = Solr(settings.SOLR_FORUM_URL)
    logger.info("creating XML")
    documents = [convert_to_solr_document(post) for post in posts]
    logger.info("posting to Solr")
    solr.add(documents)
    logger.info("optimizing solr index")
    # solr.optimize()
    logger.info("done")
# NOTE(review): near-duplicate of the add_posts_to_solr definition above;
# this variant actually runs solr.optimize().
def add_posts_to_solr(posts):
    """Convert *posts* to Solr documents, post them to the forum index,
    then optimize the index."""
    logger.info("adding multiple forum posts to solr index")
    solr = Solr(settings.SOLR_FORUM_URL)
    logger.info("creating XML")
    documents = map(convert_to_solr_document, posts)
    logger.info("posting to Solr")
    solr.add(documents)
    logger.info("optimizing solr index")
    solr.optimize()
    logger.info("done")
def send_posts_to_solr(posts):
    """Index *posts* in the forum Solr core and commit.

    Best-effort: a SolrException is logged and swallowed so a Solr outage
    does not propagate to the caller.

    :param posts: iterable of forum post objects accepted by
        ``convert_to_solr_document``.
    """
    logger.info("adding forum posts to solr index")
    logger.info("creating XML")
    documents = [convert_to_solr_document(p) for p in posts]
    try:
        logger.info("posting to Solr")
        solr = Solr(settings.SOLR_FORUM_URL)
        solr.add(documents)
        solr.commit()
    except SolrException as e:
        # Fix: use the logger's lazy %-args instead of eagerly building the
        # message with "% str(e)" — the rendered message is identical.
        logger.error("failed to add posts to solr index, reason: %s", e)
    logger.info("done")
def index(): s = Solr('http://localhost:8983/solr/jobs') db = get_db() for job in db.jobs.find(): job["id"] = str(job.pop("_id")) job["updated_at"] = date.today() try: company = job["company"] location = job["location"] geo = db.company_coordinates.find(_get_query(company, location))[0] job["geo_location"] = "%f,%f" % (geo["lat"], geo["lng"]) print dict(job) s.add(dict(job), commit=True) except Exception as e: print e pass
def add_item(solr_address, doc):
    """Add a single document to the Solr core at *solr_address* and commit.

    :param solr_address: URL of the Solr core.
    :param doc: document (dict) to index.
    :return: the Solr response.  Fix: the original bound the response to an
        unused local and returned None; returning it is backward compatible
        for callers that ignore the result.
    """
    s = Solr(solr_address)
    return s.add(doc, commit=True)
class SolrPipeline(object): """ Scrapy solr 存储中间件,需要在 scrapy 工程 settings.py 中配置 SOLR_SERVERS 信息以指明需要连接的 solr 服务器,同时需至少配置 SOLR_COLLECTION_MAP 与 SOLR_COLLECTION_DEFAULT 选项中的一个,前者用于指定 spider 名与 solr collection 名的对应关系,后者用于指定默认存储的 solr collection。 另外,可根据需要配置: SOLR_CLOUD_MODE: 是否以 solrcloud 模式运行,默认为 False SOLR_WEB_APP: solr 运行的 web app,默认为 "solr" SOLR_USER: solr 集群用户 SOLR_PASSWORD: solr 集群用户密码 SOLR_DETECT_LIVE_NODES: 是否自动探测 solr 集群中 active 的节点,默认为 False SOLR_TIMEOUT: solr 超时设置,默认为 10 秒 SOLR_CACHE_MAX_SIZE_PER_SPIDER: 每个 spider 可使用的以字节数计算的 solr 批量 提交缓存大小,默认为 10 * 1024 * 1024(10 兆) DEFAULT_CACHE_MAX_SIZE_PER_SPIDER: 若未设置 SOLR_CACHE_MAX_SIZE_PER_SPIDER, 则以此配置作为它的值 SOLR_CACHE_MAX_ELEMENTS_PER_SPIDER: 每个 spider solr 批量提交缓存的最大元素个数, 默认为 100 个 DEFAULT_CACHE_MAX_ELEMENTS_PER_SPIDER: 若未设置 SOLR_CACHE_MAX_ELEMENTS_PER_SPIDER, 则以此配置作为它的值 本组件假定 item 对象所有的域名在对应 collection 的 schema.xml 文件中都有相应设置。 """ @classmethod def from_crawler(cls, crawler): return cls(crawler) def __init__(self, crawler): self.solr_connection = None self.solr_collection = None solr_cloud_mode = crawler.settings.get("SOLR_CLOUD_MODE") self.solr_cloud_mode = solr_cloud_mode if solr_cloud_mode else False solr_servers = crawler.settings.get("SOLR_SERVERS") if not solr_servers: log.msg("No field SOLR_SERVERS in settings.py!", level=log.CRITICAL) raise NotConfigured elif not isinstance(solr_servers, list): log.msg("Field SOLR_SERVERS in settings.py must be a list of URL(s) of solr server(s)!", level=log.CRITICAL) raise NotConfigured elif not self.solr_cloud_mode and len(solr_servers) > 1: log.msg("Can't specify multi URL(s) when SOLR_CLOUD_MODE is False!", level=log.CRITICAL) raise NotConfigured self.solr_servers = solr_servers solr_web_app = crawler.settings.get("SOLR_WEB_APP") solr_web_app = solr_web_app if solr_web_app else "solr" self.solr_web_app = solr_web_app if solr_web_app.endswith('/') else solr_web_app + '/' solr_collection_default = crawler.settings.get('SOLR_COLLECTION_DEFAULT') solr_collection_map = 
crawler.settings.get('SOLR_COLLECTION_MAP') if not solr_collection_default and not solr_collection_map: log.msg( "You must at least set one of the two fields SOLR_COLLECTION_MAP " "and SOLR_COLLECTION_DEFAULT in settings.py!", level=log.CRITICAL) raise NotConfigured if solr_collection_map and not isinstance(solr_collection_map, dict): log.msg("Field SOLR_COLLECTION_MAP in settings.py must be a dict!", level=log.CRITICAL) raise NotConfigured self.solr_collection_name = solr_collection_default self.solr_collection_map = solr_collection_map if solr_collection_map else {} self.solr_user = crawler.settings.get("SOLR_USER") self.solr_password = crawler.settings.get("SOLR_PASSWORD") solr_detect_live_nodes = crawler.settings.get("SOLR_DETECT_LIVE_NODES") self.solr_detect_live_nodes = solr_detect_live_nodes if solr_detect_live_nodes else False solr_timeout = crawler.settings.get("SOLR_TIMEOUT") self.solr_timeout = solr_timeout if solr_timeout is not None else 10 solr_cache_max_size = crawler.settings.get("SOLR_CACHE_MAX_SIZE_PER_SPIDER") if solr_cache_max_size is None: solr_cache_max_size = crawler.settings.get("DEFAULT_CACHE_MAX_SIZE_PER_SPIDER") self.solr_cache_max_size = solr_cache_max_size if solr_cache_max_size is not None else 10 * 1024 * 1024 solr_cache_max_len = crawler.settings.get("SOLR_CACHE_MAX_ELEMENTS_PER_SPIDER") if solr_cache_max_len is None: solr_cache_max_len = crawler.settings.get("DEFAULT_CACHE_MAX_ELEMENTS_PER_SPIDER") self.solr_cache_max_len = solr_cache_max_len if solr_cache_max_len is not None else 100 self.crawler = crawler self.cache_buffer = {} self.locks = {} def open_spider(self, spider): solr_collection_name = self.solr_collection_map.get(spider.name) if solr_collection_name: self.solr_collection_name = solr_collection_name if not self.solr_collection_name: spider.log("No collection associated with " + spider.name + "!", level=log.CRITICAL) raise CloseSpider if self.solr_cloud_mode: from solrcloudpy import SolrConnection self.solr_connection = 
SolrConnection(server=self.solr_servers, detect_live_nodes=self.solr_detect_live_nodes, user=self.solr_user, password=self.solr_password, timeout=self.solr_timeout, webappdir=self.solr_web_app) self.solr_collection = self.solr_connection[self.solr_collection_name] else: from solr import Solr from urlparse import urljoin collection_url = reduce(urljoin, (self.solr_servers[0], self.solr_web_app, self.solr_collection_name)) if isinstance(collection_url, unicode): collection_url = collection_url.encode("UTF-8") self.solr_collection = Solr(url=collection_url, http_user=self.solr_user, http_pass=self.solr_password, timeout=self.solr_timeout) if self.solr_cache_max_len > 0: max_len = self.solr_cache_max_len * 2 else: max_len = 2 self.cache_buffer[spider.name] = SpiderCache(maxlen=max_len) self.locks[spider.name] = Lock() def close_spider(self, spider): try: self.index_item(None, spider, True) finally: self.cache_buffer[spider.name].clear() @check_spider_pipeline def process_item(self, item, spider): self.index_item(item, spider, False) return item def index_item(self, item, spider, close_spider): lock = self.locks[spider.name] lock.acquire() cache_queue = self.cache_buffer[spider.name] if not close_spider: cache_queue.append(dict(item)) cache_len = len(cache_queue) cache_size = sizeof(cache_queue) try: if (close_spider and cache_len > 0) or cache_len >= self.solr_cache_max_len \ or (cache_len > 0 and cache_size >= self.solr_cache_max_size): if self.solr_cloud_mode: self.solr_collection.add(list(cache_queue)) else: self.solr_collection.add_many(list(cache_queue)) # self.solr_collection.commit() spider.log( "{cache_len} items of size {cache_size} byte(s) indexed in solr".format( cache_len=cache_len, cache_size=cache_size), level=log.INFO) cache_queue.clear() except Exception, e: trace_info = traceback.format_exc() spider.log( "Failed to index item(s): {message}\n{trace_info}".format( message=e.message, trace_info=trace_info), level=log.ERROR) finally:
class SolrPipeline(object): """ Scrapy solr 存储中间件,需要在 scrapy 工程 settings.py 中配置 SOLR_SERVERS 信息以指明需要连接的 solr 服务器,同时需至少配置 SOLR_COLLECTION_MAP 与 SOLR_COLLECTION_DEFAULT 选项中的一个,前者用于指定 spider 名与 solr collection 名的对应关系,后者用于指定默认存储的 solr collection。 另外,可根据需要配置: SOLR_CLOUD_MODE: 是否以 solrcloud 模式运行,默认为 False SOLR_WEB_APP: solr 运行的 web app,默认为 "solr" SOLR_USER: solr 集群用户 SOLR_PASSWORD: solr 集群用户密码 SOLR_DETECT_LIVE_NODES: 是否自动探测 solr 集群中 active 的节点,默认为 False SOLR_TIMEOUT: solr 超时设置,默认为 10 秒 SOLR_CACHE_MAX_SIZE_PER_SPIDER: 每个 spider 可使用的以字节数计算的 solr 批量 提交缓存大小,默认为 10 * 1024 * 1024(10 兆) DEFAULT_CACHE_MAX_SIZE_PER_SPIDER: 若未设置 SOLR_CACHE_MAX_SIZE_PER_SPIDER, 则以此配置作为它的值 SOLR_CACHE_MAX_ELEMENTS_PER_SPIDER: 每个 spider solr 批量提交缓存的最大元素个数, 默认为 100 个 DEFAULT_CACHE_MAX_ELEMENTS_PER_SPIDER: 若未设置 SOLR_CACHE_MAX_ELEMENTS_PER_SPIDER, 则以此配置作为它的值 本组件假定 item 对象所有的域名在对应 collection 的 schema.xml 文件中都有相应设置。 """ @classmethod def from_crawler(cls, crawler): return cls(crawler) def __init__(self, crawler): self.solr_connection = None self.solr_collection = None solr_cloud_mode = crawler.settings.get("SOLR_CLOUD_MODE") self.solr_cloud_mode = solr_cloud_mode if solr_cloud_mode else False solr_servers = crawler.settings.get("SOLR_SERVERS") if not solr_servers: log.msg("No field SOLR_SERVERS in settings.py!", level=log.CRITICAL) raise NotConfigured elif not isinstance(solr_servers, list): log.msg( "Field SOLR_SERVERS in settings.py must be a list of URL(s) of solr server(s)!", level=log.CRITICAL) raise NotConfigured elif not self.solr_cloud_mode and len(solr_servers) > 1: log.msg( "Can't specify multi URL(s) when SOLR_CLOUD_MODE is False!", level=log.CRITICAL) raise NotConfigured self.solr_servers = solr_servers solr_web_app = crawler.settings.get("SOLR_WEB_APP") solr_web_app = solr_web_app if solr_web_app else "solr" self.solr_web_app = solr_web_app if solr_web_app.endswith( '/') else solr_web_app + '/' solr_collection_default = crawler.settings.get( 'SOLR_COLLECTION_DEFAULT') solr_collection_map = 
crawler.settings.get('SOLR_COLLECTION_MAP') if not solr_collection_default and not solr_collection_map: log.msg( "You must at least set one of the two fields SOLR_COLLECTION_MAP " "and SOLR_COLLECTION_DEFAULT in settings.py!", level=log.CRITICAL) raise NotConfigured if solr_collection_map and not isinstance(solr_collection_map, dict): log.msg("Field SOLR_COLLECTION_MAP in settings.py must be a dict!", level=log.CRITICAL) raise NotConfigured self.solr_collection_name = solr_collection_default self.solr_collection_map = solr_collection_map if solr_collection_map else {} self.solr_user = crawler.settings.get("SOLR_USER") self.solr_password = crawler.settings.get("SOLR_PASSWORD") solr_detect_live_nodes = crawler.settings.get("SOLR_DETECT_LIVE_NODES") self.solr_detect_live_nodes = solr_detect_live_nodes if solr_detect_live_nodes else False solr_timeout = crawler.settings.get("SOLR_TIMEOUT") self.solr_timeout = solr_timeout if solr_timeout is not None else 10 solr_cache_max_size = crawler.settings.get( "SOLR_CACHE_MAX_SIZE_PER_SPIDER") if solr_cache_max_size is None: solr_cache_max_size = crawler.settings.get( "DEFAULT_CACHE_MAX_SIZE_PER_SPIDER") self.solr_cache_max_size = solr_cache_max_size if solr_cache_max_size is not None else 10 * 1024 * 1024 solr_cache_max_len = crawler.settings.get( "SOLR_CACHE_MAX_ELEMENTS_PER_SPIDER") if solr_cache_max_len is None: solr_cache_max_len = crawler.settings.get( "DEFAULT_CACHE_MAX_ELEMENTS_PER_SPIDER") self.solr_cache_max_len = solr_cache_max_len if solr_cache_max_len is not None else 100 self.crawler = crawler self.cache_buffer = {} self.locks = {} def open_spider(self, spider): solr_collection_name = self.solr_collection_map.get(spider.name) if solr_collection_name: self.solr_collection_name = solr_collection_name if not self.solr_collection_name: spider.log("No collection associated with " + spider.name + "!", level=log.CRITICAL) raise CloseSpider if self.solr_cloud_mode: from solrcloudpy import SolrConnection 
self.solr_connection = SolrConnection( server=self.solr_servers, detect_live_nodes=self.solr_detect_live_nodes, user=self.solr_user, password=self.solr_password, timeout=self.solr_timeout, webappdir=self.solr_web_app) self.solr_collection = self.solr_connection[ self.solr_collection_name] else: from solr import Solr from urlparse import urljoin collection_url = reduce(urljoin, (self.solr_servers[0], self.solr_web_app, self.solr_collection_name)) if isinstance(collection_url, unicode): collection_url = collection_url.encode("UTF-8") self.solr_collection = Solr(url=collection_url, http_user=self.solr_user, http_pass=self.solr_password, timeout=self.solr_timeout) if self.solr_cache_max_len > 0: max_len = self.solr_cache_max_len * 2 else: max_len = 2 self.cache_buffer[spider.name] = SpiderCache(maxlen=max_len) self.locks[spider.name] = Lock() def close_spider(self, spider): try: self.index_item(None, spider, True) finally: self.cache_buffer[spider.name].clear() @check_spider_pipeline def process_item(self, item, spider): self.index_item(item, spider, False) return item def index_item(self, item, spider, close_spider): lock = self.locks[spider.name] lock.acquire() cache_queue = self.cache_buffer[spider.name] if not close_spider: cache_queue.append(dict(item)) cache_len = len(cache_queue) cache_size = sizeof(cache_queue) try: if (close_spider and cache_len > 0) or cache_len >= self.solr_cache_max_len \ or (cache_len > 0 and cache_size >= self.solr_cache_max_size): if self.solr_cloud_mode: self.solr_collection.add(list(cache_queue)) else: self.solr_collection.add_many(list(cache_queue)) # self.solr_collection.commit() spider.log( "{cache_len} items of size {cache_size} byte(s) indexed in solr" .format(cache_len=cache_len, cache_size=cache_size), level=log.INFO) cache_queue.clear() except Exception, e: trace_info = traceback.format_exc() spider.log( "Failed to index item(s): {message}\n{trace_info}".format( message=e.message, trace_info=trace_info), level=log.ERROR) 
finally: