def main(config):
    cfg = cliconfig(config)
    session = SessionFactory(cfg['database']['url']).create()
    server = Solr(str(cfg['solr']['url']),
        http_user=cfg['solr'].get('username'),
        http_pass=cfg['solr'].get('password'))

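    # Stream Address rows through transform(), drop empty/None results, and
    # index the rest into Solr in batches of COMMIT_UNIT documents.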
    documents = []
    # "Address.prefecture is not None" would be evaluated by Python, not by
    # SQLAlchemy; use isnot(None) so the NULL check ends up in the SQL query.
    q = session.query(Address).filter(Address.prefecture.isnot(None))
    q = q.order_by(Address.zipcode)
    for r in ifilter(lambda r: r, imap(transform, q)):
        documents.append(r)
        if len(documents) >= COMMIT_UNIT:
            server.add_many(documents)
            documents = []
    if len(documents) > 0:
        server.add_many(documents)
    server.commit()
Example #2
class SolrPipeline(object):
    """
    Scrapy pipeline that stores items in Solr. SOLR_SERVERS must be set in the
    project's settings.py to point at the Solr server(s) to connect to, and at
    least one of SOLR_COLLECTION_MAP and SOLR_COLLECTION_DEFAULT must be set:
    the former maps spider names to Solr collection names, the latter names the
    default collection items are stored in.

    The following settings are optional:
    SOLR_CLOUD_MODE: whether to run in SolrCloud mode, defaults to False
    SOLR_WEB_APP: the web app Solr runs under, defaults to "solr"
    SOLR_USER: Solr cluster user
    SOLR_PASSWORD: password of the Solr cluster user
    SOLR_DETECT_LIVE_NODES: whether to auto-detect the active nodes of the Solr
    cluster, defaults to False
    SOLR_TIMEOUT: Solr timeout, defaults to 10 seconds
    SOLR_CACHE_MAX_SIZE_PER_SPIDER: per-spider size, in bytes, of the Solr
    batch-commit buffer, defaults to 10 * 1024 * 1024 (10 MB)
    DEFAULT_CACHE_MAX_SIZE_PER_SPIDER: used as the value of
    SOLR_CACHE_MAX_SIZE_PER_SPIDER when that setting is absent
    SOLR_CACHE_MAX_ELEMENTS_PER_SPIDER: per-spider maximum number of items in
    the batch-commit buffer, defaults to 100
    DEFAULT_CACHE_MAX_ELEMENTS_PER_SPIDER: used as the value of
    SOLR_CACHE_MAX_ELEMENTS_PER_SPIDER when that setting is absent

    This component assumes that every field of an item has a matching field
    definition in the schema.xml of the corresponding collection.
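
    A minimal settings.py sketch (the server URL, spider name, collection name
    and the pipeline's module path below are illustrative assumptions, not part
    of this component):

        SOLR_SERVERS = ["http://localhost:8983"]
        SOLR_WEB_APP = "solr"
        SOLR_COLLECTION_DEFAULT = "my_collection"
        SOLR_COLLECTION_MAP = {"my_spider": "my_collection"}
        SOLR_CLOUD_MODE = False
        SOLR_TIMEOUT = 10
        ITEM_PIPELINES = {"myproject.pipelines.SolrPipeline": 300}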
    """

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def __init__(self, crawler):
        self.solr_connection = None
        self.solr_collection = None

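        # Read connection settings from the crawler; raise NotConfigured so the
        # pipeline is disabled when a required setting is missing or malformed.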
        solr_cloud_mode = crawler.settings.get("SOLR_CLOUD_MODE")
        self.solr_cloud_mode = solr_cloud_mode if solr_cloud_mode else False

        solr_servers = crawler.settings.get("SOLR_SERVERS")
        if not solr_servers:
            log.msg("No field SOLR_SERVERS in settings.py!", level=log.CRITICAL)
            raise NotConfigured
        elif not isinstance(solr_servers, list):
            log.msg("Field SOLR_SERVERS in settings.py must be a list of URL(s) of solr server(s)!", level=log.CRITICAL)
            raise NotConfigured
        elif not self.solr_cloud_mode and len(solr_servers) > 1:
            log.msg("Can't specify multi URL(s) when SOLR_CLOUD_MODE is False!", level=log.CRITICAL)
            raise NotConfigured
        self.solr_servers = solr_servers

        solr_web_app = crawler.settings.get("SOLR_WEB_APP")
        solr_web_app = solr_web_app if solr_web_app else "solr"
        self.solr_web_app = solr_web_app if solr_web_app.endswith('/') else solr_web_app + '/'

        solr_collection_default = crawler.settings.get('SOLR_COLLECTION_DEFAULT')
        solr_collection_map = crawler.settings.get('SOLR_COLLECTION_MAP')
        if not solr_collection_default and not solr_collection_map:
            log.msg(
                "You must at least set one of the two fields SOLR_COLLECTION_MAP "
                "and SOLR_COLLECTION_DEFAULT in settings.py!",
                level=log.CRITICAL)
            raise NotConfigured
        if solr_collection_map and not isinstance(solr_collection_map, dict):
            log.msg("Field SOLR_COLLECTION_MAP in settings.py must be a dict!",
                    level=log.CRITICAL)
            raise NotConfigured
        self.solr_collection_name = solr_collection_default
        self.solr_collection_map = solr_collection_map if solr_collection_map else {}

        self.solr_user = crawler.settings.get("SOLR_USER")
        self.solr_password = crawler.settings.get("SOLR_PASSWORD")
        solr_detect_live_nodes = crawler.settings.get("SOLR_DETECT_LIVE_NODES")
        self.solr_detect_live_nodes = solr_detect_live_nodes if solr_detect_live_nodes else False
        solr_timeout = crawler.settings.get("SOLR_TIMEOUT")
        self.solr_timeout = solr_timeout if solr_timeout is not None else 10

        solr_cache_max_size = crawler.settings.get("SOLR_CACHE_MAX_SIZE_PER_SPIDER")
        if solr_cache_max_size is None:
            solr_cache_max_size = crawler.settings.get("DEFAULT_CACHE_MAX_SIZE_PER_SPIDER")
        self.solr_cache_max_size = solr_cache_max_size if solr_cache_max_size is not None else 10 * 1024 * 1024

        solr_cache_max_len = crawler.settings.get("SOLR_CACHE_MAX_ELEMENTS_PER_SPIDER")
        if solr_cache_max_len is None:
            solr_cache_max_len = crawler.settings.get("DEFAULT_CACHE_MAX_ELEMENTS_PER_SPIDER")
        self.solr_cache_max_len = solr_cache_max_len if solr_cache_max_len is not None else 100

        self.crawler = crawler

        self.cache_buffer = {}
        self.locks = {}

    def open_spider(self, spider):
        solr_collection_name = self.solr_collection_map.get(spider.name)
        if solr_collection_name:
            self.solr_collection_name = solr_collection_name
        if not self.solr_collection_name:
            spider.log("No collection associated with " + spider.name + "!", level=log.CRITICAL)
            raise CloseSpider
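        # Connect via solrcloudpy in SolrCloud mode, otherwise via the plain
        # solr client pointed at the single configured server.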
        if self.solr_cloud_mode:
            from solrcloudpy import SolrConnection

            self.solr_connection = SolrConnection(server=self.solr_servers,
                                                  detect_live_nodes=self.solr_detect_live_nodes,
                                                  user=self.solr_user,
                                                  password=self.solr_password,
                                                  timeout=self.solr_timeout,
                                                  webappdir=self.solr_web_app)
            self.solr_collection = self.solr_connection[self.solr_collection_name]
        else:
            from solr import Solr
            from urlparse import urljoin

            collection_url = reduce(urljoin, (self.solr_servers[0], self.solr_web_app, self.solr_collection_name))
            if isinstance(collection_url, unicode):
                collection_url = collection_url.encode("UTF-8")
            self.solr_collection = Solr(url=collection_url,
                                        http_user=self.solr_user,
                                        http_pass=self.solr_password,
                                        timeout=self.solr_timeout)

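        # Per-spider batch buffer, sized at twice the flush threshold
        # (minimum 2), plus a lock guarding concurrent flushes.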
        if self.solr_cache_max_len > 0:
            max_len = self.solr_cache_max_len * 2
        else:
            max_len = 2
        self.cache_buffer[spider.name] = SpiderCache(maxlen=max_len)
        self.locks[spider.name] = Lock()

    def close_spider(self, spider):
        try:
            self.index_item(None, spider, True)
        finally:
            self.cache_buffer[spider.name].clear()

    @check_spider_pipeline
    def process_item(self, item, spider):
        self.index_item(item, spider, False)
        return item

    def index_item(self, item, spider, close_spider):
        lock = self.locks[spider.name]
        lock.acquire()

        cache_queue = self.cache_buffer[spider.name]
        if not close_spider:
            cache_queue.append(dict(item))

        cache_len = len(cache_queue)
        cache_size = sizeof(cache_queue)
        try:
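            # Flush when the spider is closing with items buffered, or when the
            # buffer reaches the configured element-count or byte-size limit.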
            if (close_spider and cache_len > 0) or cache_len >= self.solr_cache_max_len \
                    or (cache_len > 0 and cache_size >= self.solr_cache_max_size):
                if self.solr_cloud_mode:
                    self.solr_collection.add(list(cache_queue))
                else:
                    self.solr_collection.add_many(list(cache_queue))
                # self.solr_collection.commit()
                spider.log(
                    "{cache_len} items of size {cache_size} byte(s) indexed in solr".format(
                        cache_len=cache_len,
                        cache_size=cache_size),
                    level=log.INFO)
                cache_queue.clear()
        except Exception as e:
            trace_info = traceback.format_exc()
            spider.log(
                "Failed to index item(s): {message}\n{trace_info}".format(
                    message=e.message,
                    trace_info=trace_info),
                level=log.ERROR)
        finally:
            # Always release the per-spider lock acquired at the top.
            lock.release()