Beispiel #1
0
    def init_cass_cache(self,
                        cluster,
                        caches,
                        cassandra_seeds,
                        memcached_kw=None,
                        cassandra_kw=None):
        """Build a cache chain, register it in ``self.cache_chains``, return it.

        The chain always starts with a local cache (``SelfEmptyingCache``
        when ``self.running_as_script`` is true, otherwise ``LocalCache``).
        A ``CMemcache`` layer is appended when ``caches`` is truthy and a
        ``CassandraCache`` layer is appended when ``cassandra_seeds`` is
        truthy.

        cluster         -- passed as the first two positional arguments of
                           ``CassandraCache``
        caches          -- memcached servers handed to ``CMemcache``
        cassandra_seeds -- iterable of seeds; copied to a list and shuffled
                           before being handed to ``CassandraCache``
        memcached_kw    -- extra keyword arguments for ``CMemcache``
                           (``None`` means none)
        cassandra_kw    -- extra keyword arguments for ``CassandraCache``
                           (``None`` means none)

        Returns a ``CassandraCacheChain`` (with ``cache_negative_results``
        enabled) when seeds were given, otherwise a ``MemcacheChain``.
        """
        # Default to None rather than ``{}`` so that no dict object is shared
        # between calls through the function's default values.
        memcached_kw = {} if memcached_kw is None else memcached_kw
        cassandra_kw = {} if cassandra_kw is None else cassandra_kw

        localcache_cls = (SelfEmptyingCache
                          if self.running_as_script else LocalCache)

        pmc_chain = (localcache_cls(), )

        # if caches, append
        if caches:
            pmc_chain += (CMemcache(caches,
                                    num_clients=self.num_mc_clients,
                                    **memcached_kw), )

        # if seeds, append
        if cassandra_seeds:
            # shuffle a private copy so the caller's sequence is not reordered
            cassandra_seeds = list(cassandra_seeds)
            random.shuffle(cassandra_seeds)
            # NOTE(review): ``cluster`` is deliberately kept as both of the
            # first two positional arguments, as in the original -- confirm
            # this matches the CassandraCache signature in use.
            pmc_chain += (CassandraCache(cluster, cluster, cassandra_seeds,
                                         **cassandra_kw), )
            mc = CassandraCacheChain(pmc_chain, cache_negative_results=True)
        else:
            mc = MemcacheChain(pmc_chain)

        self.cache_chains.append(mc)
        return mc
Beispiel #2
0
 def init_cass_cache(self, keyspace, column_family, cassandra_client,
                     lock_factory,
                     memcache = None,
                     read_consistency_level = CL_ONE,
                     write_consistency_level = CL_ONE,
                     localcache_cls = LocalCache):
     """Assemble and return a CassandraCacheChain.

     The chain is built from a fresh ``localcache_cls()`` instance and a
     ``CassandraCache`` created from ``keyspace``, ``column_family`` and
     ``cassandra_client`` with the given read/write consistency levels.
     ``memcache`` and ``lock_factory`` are forwarded to the chain unchanged.
     """
     # Build the layers front to back: local cache first, then cassandra.
     local_layer = localcache_cls()
     cassandra_layer = CassandraCache(
         keyspace,
         column_family,
         cassandra_client,
         read_consistency_level=read_consistency_level,
         write_consistency_level=write_consistency_level,
     )
     return CassandraCacheChain(
         local_layer,
         cassandra_layer,
         memcache=memcache,
         lock_factory=lock_factory,
     )
Beispiel #3
0
    def setup(self):
        """Initialise the runtime state of this globals object.

        The phases run strictly in order, and each one is reported through
        ``self.startup_timer.intermediate``: provider selection,
        configuration, zookeeper (or ini-file fallback), memcache clients,
        the cassandra pool, postgres parameters, cache chains and finally
        source revision bookkeeping.

        Besides setting many attributes on ``self`` this also configures the
        'reddit' logger, routes Python warnings through logging, installs a
        SIGUSR1 handler where the platform has that signal, and sets the
        process locale.

        Raises ValueError when ``amqp_host`` or ``cassandra_seeds`` is not
        set.
        """
        self.queues = queues.declare_queues(self)

        ################# PROVIDERS
        self.media_provider = select_provider(
            self.config,
            self.pkg_resources_working_set,
            "r2.provider.media",
            self.media_provider,
        )
        self.startup_timer.intermediate("providers")

        ################# CONFIGURATION
        # AMQP is required
        if not self.amqp_host:
            raise ValueError("amqp_host not set in the .ini")

        if not self.cassandra_seeds:
            raise ValueError("cassandra_seeds not set in the .ini")

        # heavy load mode is read only mode with a different infobar
        if self.heavy_load_mode:
            self.read_only_mode = True

        # the origin always uses the http scheme; the prefix (plus a dot) is
        # only prepended when a domain_prefix is configured
        origin_prefix = self.domain_prefix + "." if self.domain_prefix else ""
        self.origin = "http://" + origin_prefix + self.domain

        self.trusted_domains = set([self.domain])
        if self.https_endpoint:
            https_url = urlparse(self.https_endpoint)
            self.trusted_domains.add(https_url.hostname)

        # load the unique hashed names of files under static
        # (a missing names.json simply results in an empty mapping)
        static_files = os.path.join(self.paths.get('static_files'), 'static')
        names_file_path = os.path.join(static_files, 'names.json')
        if os.path.exists(names_file_path):
            with open(names_file_path) as handle:
                self.static_names = json.load(handle)
        else:
            self.static_names = {}

        # make python warnings go through the logging system
        logging.captureWarnings(capture=True)

        log = logging.getLogger('reddit')

        # when we're a script (paster run) just set up super simple logging
        if self.running_as_script:
            log.setLevel(logging.INFO)
            log.addHandler(logging.StreamHandler())

        # if in debug mode, override the logging level to DEBUG
        if self.debug:
            log.setLevel(logging.DEBUG)

        # attempt to figure out which pool we're in and add that to the
        # LogRecords.
        try:
            with open("/etc/ec2_asg", "r") as f:
                pool = f.read().strip()
            # clean up the pool name since we're putting stuff after "-"
            # (only the text before the first "-" is kept)
            pool = pool.partition("-")[0]
        except IOError:
            # file missing or unreadable: use a fixed name, which is not
            # passed through the partition() clean-up above
            pool = "reddit-app"
        self.log = logging.LoggerAdapter(log, {"pool": pool})

        # set locations
        # NOTE(review): the resource stream is read but never explicitly
        # closed here.
        locations = pkg_resources.resource_stream(__name__,
                                                  "../data/locations.json")
        self.locations = json.loads(locations.read())

        if not self.media_domain:
            self.media_domain = self.domain
        # both warnings below go to stderr and do not abort setup
        if self.media_domain == self.domain:
            print >> sys.stderr, ("Warning: g.media_domain == g.domain. " +
                   "This may give untrusted content access to user cookies")
        if self.oauth_domain == self.domain:
            print >> sys.stderr, ("Warning: g.oauth_domain == g.domain. "
                    "CORS requests to g.domain will be allowed")

        # command line overrides: every argument that splits into exactly
        # two parts on "=" is set as an attribute on self. The value stays a
        # string; arguments containing more than one "=" are ignored.
        for arg in sys.argv:
            tokens = arg.split("=")
            if len(tokens) == 2:
                k, v = tokens
                self.log.debug("Overriding g.%s to %s" % (k, v))
                setattr(self, k, v)

        self.reddit_host = socket.gethostname()
        self.reddit_pid  = os.getpid()

        if hasattr(signal, 'SIGUSR1'):
            # not all platforms have user signals
            signal.signal(signal.SIGUSR1, thread_dump)

        locale.setlocale(locale.LC_ALL, self.locale)

        # Pre-calculate ratelimit values
        # (assumes the RL_* config values are already numeric -- TODO confirm)
        self.RL_RESET_SECONDS = self.config["RL_RESET_MINUTES"] * 60
        self.RL_MAX_REQS = int(self.config["RL_AVG_REQ_PER_SEC"] *
                                      self.RL_RESET_SECONDS)

        self.RL_OAUTH_RESET_SECONDS = self.config["RL_OAUTH_RESET_MINUTES"] * 60
        self.RL_OAUTH_MAX_REQS = int(self.config["RL_OAUTH_AVG_REQ_PER_SEC"] *
                                     self.RL_OAUTH_RESET_SECONDS)

        # the login limit is computed over the general RL_RESET_SECONDS
        # window; there is no login-specific window here
        self.RL_LOGIN_MAX_REQS = int(self.config["RL_LOGIN_AVG_PER_SEC"] *
                                     self.RL_RESET_SECONDS)

        self.startup_timer.intermediate("configuration")

        ################# ZOOKEEPER
        # for now, zookeeper will be an optional part of the stack.
        # if it's not configured, we will grab the expected config from the
        # [live_config] section of the ini file
        zk_hosts = self.config.get("zookeeper_connection_string")
        if zk_hosts:
            # imported lazily so the zookeeper module is only needed when a
            # connection string is configured
            from r2.lib.zookeeper import (connect_to_zookeeper,
                                          LiveConfig, LiveList)
            zk_username = self.config["zookeeper_username"]
            zk_password = self.config["zookeeper_password"]
            self.zookeeper = connect_to_zookeeper(zk_hosts, (zk_username,
                                                             zk_password))
            self.live_config = LiveConfig(self.zookeeper, LIVE_CONFIG_NODE)
            self.secrets = fetch_secrets(self.zookeeper)
            self.throttles = LiveList(self.zookeeper, "/throttles",
                                      map_fn=ipaddress.ip_network,
                                      reduce_fn=ipaddress.collapse_addresses)

            # close our zk connection when the app shuts down
            SHUTDOWN_CALLBACKS.append(self.zookeeper.stop)
        else:
            self.zookeeper = None
            parser = ConfigParser.RawConfigParser()
            # keep option names case-sensitive (the parser lowercases them
            # by default)
            parser.optionxform = str
            parser.read([self.config["__file__"]])
            self.live_config = extract_live_config(parser, self.plugins)
            self.secrets = extract_secrets(parser)
            self.throttles = tuple()  # immutable since it's not real

        self.startup_timer.intermediate("zookeeper")

        ################# PRIVILEGED USERS
        # all three lists are built from the same live_config and differ
        # only in the ``type`` they are given
        self.admins = PermissionFilteredEmployeeList(
            self.live_config, type="admin")
        self.sponsors = PermissionFilteredEmployeeList(
            self.live_config, type="sponsor")
        self.employees = PermissionFilteredEmployeeList(
            self.live_config, type="employee")

        ################# MEMCACHE
        num_mc_clients = self.num_mc_clients

        # the main memcache pool. used for most everything.
        memcache = CMemcache(
            self.memcaches,
            min_compress_len=1400,
            num_clients=num_mc_clients,
            binary=True,
        )

        # a pool just used for @memoize results
        memoizecaches = CMemcache(
            self.memoizecaches,
            min_compress_len=50 * 1024,
            num_clients=num_mc_clients,
            binary=True,
        )

        # a pool just for srmember rels
        srmembercaches = CMemcache(
            self.srmembercaches,
            min_compress_len=96,
            num_clients=num_mc_clients,
            binary=True,
        )

        # a pool just for rels
        relcaches = CMemcache(
            self.relcaches,
            min_compress_len=96,
            num_clients=num_mc_clients,
            binary=True,
        )

        # a pool for ratelimit data; unlike the pools above it is created
        # without binary=True
        ratelimitcaches = CMemcache(
            self.ratelimitcaches,
            min_compress_len=96,
            num_clients=num_mc_clients,
        )

        # a smaller pool of caches used only for distributed locks.
        # TODO: move this to ZooKeeper
        self.lock_cache = CMemcache(self.lockcaches,
                                    binary=True,
                                    num_clients=num_mc_clients)
        self.make_lock = make_lock_factory(self.lock_cache, self.stats)

        # memcaches used in front of the permacache CF in cassandra.
        # XXX: this is a legacy thing; permacache was made when C* didn't have
        # a row cache.
        if self.permacache_memcaches:
            permacache_memcaches = CMemcache(self.permacache_memcaches,
                                             min_compress_len=50 * 1024,
                                             num_clients=num_mc_clients)
        else:
            permacache_memcaches = None

        # the stalecache is a memcached local to the current app server used
        # for data that's frequently fetched but doesn't need to be fresh.
        if self.stalecaches:
            stalecaches = CMemcache(self.stalecaches,
                                    binary=True,
                                    num_clients=num_mc_clients)
        else:
            stalecaches = None

        # rendercache holds rendered partial templates.
        rendercaches = CMemcache(
            self.rendercaches,
            noreply=True,
            no_block=True,
            num_clients=num_mc_clients,
            min_compress_len=480,
        )

        # pagecaches hold fully rendered pages
        pagecaches = CMemcache(
            self.pagecaches,
            noreply=True,
            no_block=True,
            num_clients=num_mc_clients,
            min_compress_len=1400,
        )

        self.startup_timer.intermediate("memcache")

        ################# CASSANDRA
        keyspace = "reddit"
        # only a single pool, keyed "main", is created here
        self.cassandra_pools = {
            "main":
                StatsCollectingConnectionPool(
                    keyspace,
                    stats=self.stats,
                    logging_name="main",
                    server_list=self.cassandra_seeds,
                    pool_size=self.cassandra_pool_size,
                    timeout=4,
                    max_retries=3,
                    prefill=False
                ),
        }

        # the pool is looked up by name, so cassandra_default_pool has to be
        # a key of the dict built just above
        permacache_cf = CassandraCache(
            'permacache',
            self.cassandra_pools[self.cassandra_default_pool],
            read_consistency_level=self.cassandra_rcl,
            write_consistency_level=self.cassandra_wcl
        )

        self.startup_timer.intermediate("cassandra")

        ################# POSTGRES
        self.dbm = self.load_db_params()
        self.startup_timer.intermediate("postgres")

        ################# CHAINS
        # initialize caches. Any cache-chains built here must be added
        # to cache_chains (closed around by reset_caches) so that they
        # can properly reset their local components
        cache_chains = {}
        # every chain below gets its own fresh local cache instance
        localcache_cls = (SelfEmptyingCache if self.running_as_script
                          else LocalCache)

        # cache, memoizecache, srmembercache and relcache follow the same
        # pattern: with a stalecache configured it is placed between the
        # local cache and the memcache pool, otherwise the chain is just
        # local cache + memcache pool
        if stalecaches:
            self.cache = StaleCacheChain(
                localcache_cls(),
                stalecaches,
                memcache,
            )
        else:
            self.cache = MemcacheChain((localcache_cls(), memcache))
        cache_chains.update(cache=self.cache)

        if stalecaches:
            self.memoizecache = StaleCacheChain(
                localcache_cls(),
                stalecaches,
                memoizecaches,
            )
        else:
            self.memoizecache = MemcacheChain(
                (localcache_cls(), memoizecaches))
        cache_chains.update(memoizecache=self.memoizecache)

        if stalecaches:
            self.srmembercache = StaleCacheChain(
                localcache_cls(),
                stalecaches,
                srmembercaches,
            )
        else:
            self.srmembercache = MemcacheChain(
                (localcache_cls(), srmembercaches))
        cache_chains.update(srmembercache=self.srmembercache)

        if stalecaches:
            self.relcache = StaleCacheChain(
                localcache_cls(),
                stalecaches,
                relcaches,
            )
        else:
            self.relcache = MemcacheChain(
                (localcache_cls(), relcaches))
        cache_chains.update(relcache=self.relcache)

        # the remaining memcache-backed chains never use the stalecache
        self.ratelimitcache = MemcacheChain(
                (localcache_cls(), ratelimitcaches))
        cache_chains.update(ratelimitcache=self.ratelimitcache)

        self.rendercache = MemcacheChain((
            localcache_cls(),
            rendercaches,
        ))
        cache_chains.update(rendercache=self.rendercache)

        self.pagecache = MemcacheChain((
            localcache_cls(),
            pagecaches,
        ))
        cache_chains.update(pagecache=self.pagecache)

        # the thing_cache is used in tdb_cassandra.
        # (it consists of a local cache only)
        self.thing_cache = CacheChain((localcache_cls(),))
        cache_chains.update(thing_cache=self.thing_cache)

        self.permacache = CassandraCacheChain(
            localcache_cls(),
            permacache_cf,
            memcache=permacache_memcaches,
            lock_factory=self.make_lock,
        )
        cache_chains.update(permacache=self.permacache)

        # hardcache is used for various things that tend to expire
        # TODO: replace hardcache w/ cassandra stuff
        self.hardcache = HardcacheChain(
            (localcache_cls(), memcache, HardCache(self)),
            cache_negative_results=True,
        )
        cache_chains.update(hardcache=self.hardcache)

        # I know this sucks, but we need non-request-threads to be
        # able to reset the caches, so we need them be able to close
        # around 'cache_chains' without being able to call getattr on
        # 'g'
        def reset_caches():
            # reset every registered chain and give it a fresh stats
            # collector named after its key in cache_chains
            for name, chain in cache_chains.iteritems():
                chain.reset()
                chain.stats = CacheStats(self.stats, name)
        # self.cache_chains is the very same dict object the closure
        # iterates over (no copy is made)
        self.cache_chains = cache_chains

        self.reset_caches = reset_caches
        # run once now so every chain starts out with its stats attached
        self.reset_caches()

        self.startup_timer.intermediate("cache_chains")

        # try to set the source control revision numbers
        self.versions = {}
        # the repository root is taken to be two directory levels above
        # paths["root"]
        r2_root = os.path.dirname(os.path.dirname(self.paths["root"]))
        r2_gitdir = os.path.join(r2_root, ".git")
        self.short_version = self.record_repo_version("r2", r2_gitdir)

        if I18N_PATH:
            i18n_git_path = os.path.join(os.path.dirname(I18N_PATH), ".git")
            self.record_repo_version("i18n", i18n_git_path)

        self.startup_timer.intermediate("revisions")
Beispiel #4
0
    def setup(self):
        """Initialise the runtime state of this globals object.

        The phases run strictly in order, and each one is reported through
        ``self.startup_timer.intermediate``: configuration, zookeeper (or
        ini-file fallback), memcache clients, the cassandra pool, postgres
        hooks and parameters, cache chains and finally source revision
        bookkeeping.

        Besides setting many attributes on ``self`` this also configures the
        'reddit' logger, silences the 'pycountry.db' logger below CRITICAL,
        installs a SIGUSR1 handler where the platform has that signal, and
        registers cursor-execute listeners on the database engine class.

        Raises ValueError when ``amqp_host`` or ``cassandra_seeds`` is not
        set, and Exception when ``write_query_queue`` is enabled without
        ``use_query_cache``.
        """
        self.queues = queues.declare_queues(self)

        ################# CONFIGURATION
        # AMQP is required
        if not self.amqp_host:
            raise ValueError("amqp_host not set in the .ini")

        # This requirement doesn't *have* to be a requirement, but there are
        # bugs at the moment that will pop up if you violate it
        # XXX: get rid of these options. new query cache is always on.
        if self.write_query_queue and not self.use_query_cache:
            raise Exception("write_query_queue requires use_query_cache")

        if not self.cassandra_seeds:
            raise ValueError("cassandra_seeds not set in the .ini")

        # heavy load mode is read only mode with a different infobar
        if self.heavy_load_mode:
            self.read_only_mode = True

        # the origin always uses the http scheme; the prefix (plus a dot) is
        # only prepended when a domain_prefix is configured
        origin_prefix = self.domain_prefix + "." if self.domain_prefix else ""
        self.origin = "http://" + origin_prefix + self.domain
        # secure_domains starts with the network location of payment_domain
        self.secure_domains = set([urlparse(self.payment_domain).netloc])

        self.trusted_domains = set([self.domain])
        self.trusted_domains.update(self.authorized_cnames)
        if self.https_endpoint:
            https_url = urlparse(self.https_endpoint)
            # note: netloc goes into secure_domains, hostname into
            # trusted_domains
            self.secure_domains.add(https_url.netloc)
            self.trusted_domains.add(https_url.hostname)
        # oauth_domain is optional: a missing or empty attribute is skipped
        if getattr(self, 'oauth_domain', None):
            self.secure_domains.add(self.oauth_domain)

        # load the unique hashed names of files under static
        # (a missing names.json simply results in an empty mapping)
        static_files = os.path.join(self.paths.get('static_files'), 'static')
        names_file_path = os.path.join(static_files, 'names.json')
        if os.path.exists(names_file_path):
            with open(names_file_path) as handle:
                self.static_names = json.load(handle)
        else:
            self.static_names = {}

        #setup the logger
        # (a stream handler is always attached; the level is DEBUG in debug
        # mode and INFO otherwise)
        self.log = logging.getLogger('reddit')
        self.log.addHandler(logging.StreamHandler())
        if self.debug:
            self.log.setLevel(logging.DEBUG)
        else:
            self.log.setLevel(logging.INFO)

        # set log level for pycountry which is chatty
        logging.getLogger('pycountry.db').setLevel(logging.CRITICAL)

        if not self.media_domain:
            self.media_domain = self.domain
        # this warning is printed and does not abort setup
        if self.media_domain == self.domain:
            print ("Warning: g.media_domain == g.domain. " +
                   "This may give untrusted content access to user cookies")

        # command line overrides: every argument that splits into exactly
        # two parts on "=" is set as an attribute on self. The value stays a
        # string; arguments containing more than one "=" are ignored.
        for arg in sys.argv:
            tokens = arg.split("=")
            if len(tokens) == 2:
                k, v = tokens
                self.log.debug("Overriding g.%s to %s" % (k, v))
                setattr(self, k, v)

        self.reddit_host = socket.gethostname()
        self.reddit_pid  = os.getpid()

        if hasattr(signal, 'SIGUSR1'):
            # not all platforms have user signals
            signal.signal(signal.SIGUSR1, thread_dump)

        self.startup_timer.intermediate("configuration")

        ################# ZOOKEEPER
        # for now, zookeeper will be an optional part of the stack.
        # if it's not configured, we will grab the expected config from the
        # [live_config] section of the ini file
        zk_hosts = self.config.get("zookeeper_connection_string")
        if zk_hosts:
            # imported lazily so the zookeeper module is only needed when a
            # connection string is configured
            from r2.lib.zookeeper import (connect_to_zookeeper,
                                          LiveConfig, LiveList)
            zk_username = self.config["zookeeper_username"]
            zk_password = self.config["zookeeper_password"]
            self.zookeeper = connect_to_zookeeper(zk_hosts, (zk_username,
                                                             zk_password))
            self.live_config = LiveConfig(self.zookeeper, LIVE_CONFIG_NODE)
            self.throttles = LiveList(self.zookeeper, "/throttles",
                                      map_fn=ipaddress.ip_network,
                                      reduce_fn=ipaddress.collapse_addresses)
        else:
            self.zookeeper = None
            parser = ConfigParser.RawConfigParser()
            parser.read([self.config["__file__"]])
            self.live_config = extract_live_config(parser, self.plugins)
            self.throttles = tuple()  # immutable since it's not real
        self.startup_timer.intermediate("zookeeper")

        ################# MEMCACHE
        num_mc_clients = self.num_mc_clients

        # the main memcache pool. used for most everything.
        self.memcache = CMemcache(self.memcaches, num_clients=num_mc_clients)

        # a smaller pool of caches used only for distributed locks.
        # TODO: move this to ZooKeeper
        self.lock_cache = CMemcache(self.lockcaches,
                                    num_clients=num_mc_clients)
        self.make_lock = make_lock_factory(self.lock_cache, self.stats)

        # memcaches used in front of the permacache CF in cassandra.
        # XXX: this is a legacy thing; permacache was made when C* didn't have
        # a row cache.
        if self.permacache_memcaches:
            permacache_memcaches = CMemcache(self.permacache_memcaches,
                                             num_clients=num_mc_clients)
        else:
            permacache_memcaches = None

        # the stalecache is a memcached local to the current app server used
        # for data that's frequently fetched but doesn't need to be fresh.
        if self.stalecaches:
            stalecaches = CMemcache(self.stalecaches,
                                    num_clients=num_mc_clients)
        else:
            stalecaches = None

        # rendercache holds rendered partial templates as well as fully
        # cached pages.
        rendercaches = CMemcache(
            self.rendercaches,
            noreply=True,
            no_block=True,
            num_clients=num_mc_clients,
        )

        self.startup_timer.intermediate("memcache")

        ################# CASSANDRA
        keyspace = "reddit"
        # only a single pool, keyed "main", is created here
        self.cassandra_pools = {
            "main":
                StatsCollectingConnectionPool(
                    keyspace,
                    stats=self.stats,
                    logging_name="main",
                    server_list=self.cassandra_seeds,
                    pool_size=self.cassandra_pool_size,
                    timeout=2,
                    max_retries=3,
                    prefill=False
                ),
        }

        # the pool is looked up by name, so cassandra_default_pool has to be
        # a key of the dict built just above
        permacache_cf = CassandraCache(
            'permacache',
            self.cassandra_pools[self.cassandra_default_pool],
            read_consistency_level=self.cassandra_rcl,
            write_consistency_level=self.cassandra_wcl
        )

        self.startup_timer.intermediate("cassandra")

        ################# POSTGRES
        # hook the stats object into cursor execution on the engine class so
        # it is called before and after each cursor execute
        event.listens_for(engine.Engine, 'before_cursor_execute')(
            self.stats.pg_before_cursor_execute)
        event.listens_for(engine.Engine, 'after_cursor_execute')(
            self.stats.pg_after_cursor_execute)

        self.dbm = self.load_db_params()
        self.startup_timer.intermediate("postgres")

        ################# CHAINS
        # initialize caches. Any cache-chains built here must be added
        # to cache_chains (closed around by reset_caches) so that they
        # can properly reset their local components
        self.cache_chains = {}
        # every chain below gets its own fresh local cache instance
        localcache_cls = (SelfEmptyingCache if self.running_as_script
                          else LocalCache)

        # with a stalecache configured it is placed between the local cache
        # and the main memcache pool
        if stalecaches:
            self.cache = StaleCacheChain(
                localcache_cls(),
                stalecaches,
                self.memcache,
            )
        else:
            self.cache = MemcacheChain((localcache_cls(), self.memcache))
        self.cache_chains.update(cache=self.cache)

        self.rendercache = MemcacheChain((
            localcache_cls(),
            rendercaches,
        ))
        self.cache_chains.update(rendercache=self.rendercache)

        # the thing_cache is used in tdb_cassandra.
        # (it consists of a local cache only)
        self.thing_cache = CacheChain((localcache_cls(),))
        self.cache_chains.update(thing_cache=self.thing_cache)

        self.permacache = CassandraCacheChain(
            localcache_cls(),
            permacache_cf,
            memcache=permacache_memcaches,
            lock_factory=self.make_lock,
        )
        self.cache_chains.update(permacache=self.permacache)

        # hardcache is used for various things that tend to expire
        # TODO: replace hardcache w/ cassandra stuff
        self.hardcache = HardcacheChain(
            (localcache_cls(), self.memcache, HardCache(self)),
            cache_negative_results=True,
        )
        self.cache_chains.update(hardcache=self.hardcache)

        # I know this sucks, but we need non-request-threads to be
        # able to reset the caches, so we need them be able to close
        # around 'cache_chains' without being able to call getattr on
        # 'g'
        # The closure works on a snapshot copy: chains added to
        # self.cache_chains after this point are not seen by reset_caches.
        cache_chains = self.cache_chains.copy()
        def reset_caches():
            # reset every snapshotted chain and give it a fresh stats
            # collector named after its key
            for name, chain in cache_chains.iteritems():
                chain.reset()
                chain.stats = CacheStats(self.stats, name)

        self.reset_caches = reset_caches
        # run once now so every chain starts out with its stats attached
        self.reset_caches()

        self.startup_timer.intermediate("cache_chains")

        # try to set the source control revision numbers
        self.versions = {}
        # the repository root is taken to be two directory levels above
        # paths["root"]
        r2_root = os.path.dirname(os.path.dirname(self.paths["root"]))
        r2_gitdir = os.path.join(r2_root, ".git")
        self.short_version = self.record_repo_version("r2", r2_gitdir)

        if I18N_PATH:
            i18n_git_path = os.path.join(os.path.dirname(I18N_PATH), ".git")
            self.record_repo_version("i18n", i18n_git_path)

        self.startup_timer.intermediate("revisions")
Beispiel #5
0
def pushup_permacache(verbosity=1000):
    """Copy permacache entries from the last cache in the chain into the
       caches that sit in front of it.

       Needed when cassandra is put into the permacache chain: every key
       that used the permacache (as of that check-in) is generated, read
       from the final cache in g.permacache.caches and written to a chain
       built from caches[1:-1]. Keys are processed in chunks of
       `verbosity`, and one progress line is printed per chunk."""
    from pylons import app_globals as g
    from r2.models import Link, Subreddit, Account
    from r2.lib.db.operators import desc
    from r2.lib.comment_tree import comments_key, messages_key
    from r2.lib.utils import fetch_things2, in_chunks
    from r2.lib.utils import last_modified_key
    from r2.lib.promote import promoted_memo_key
    from r2.lib.subreddit_search import load_all_reddits
    from r2.lib.db import queries
    from r2.lib.cache import CassandraCacheChain

    # the last cache holds the data; everything between the first and
    # the last cache is what gets filled in
    authority = g.permacache.caches[-1]
    nonauthority = CassandraCacheChain(g.permacache.caches[1:-1])

    def populate(keys):
        # copy whatever the authority knows about these keys
        found = authority.simple_get_multi(keys)
        if not found:
            return
        nonauthority.set_multi(found)

    def gen_keys():
        yield promoted_memo_key

        # just let this one do its own writing
        load_all_reddits()

        yield queries.get_all_comments().iden

        # --- links -------------------------------------------------
        link_query = Link._query(Link.c._spam == (True, False),
                                 Link.c._deleted == (True, False),
                                 sort=desc('_date'),
                                 data=True)
        for link in fetch_things2(link_query, verbosity):
            yield comments_key(link._id)
            yield last_modified_key(link, 'comments')

        # --- accounts ----------------------------------------------
        account_query = Account._query(Account.c._spam == (True, False),
                                       sort=desc('_date'))
        for account in fetch_things2(account_query, verbosity):
            yield messages_key(account._id)

            for listing in ('overview', 'commented', 'submitted',
                            'liked', 'disliked'):
                yield last_modified_key(account, listing)

            for sorted_listing in (queries.get_comments,
                                   queries.get_submitted):
                yield sorted_listing(account, 'new', 'all').iden

            for account_listing in (queries.get_liked,
                                    queries.get_disliked,
                                    queries.get_hidden,
                                    queries.get_saved,
                                    queries.get_inbox_messages,
                                    queries.get_unread_messages,
                                    queries.get_inbox_comments,
                                    queries.get_unread_comments,
                                    queries.get_inbox_selfreply,
                                    queries.get_unread_selfreply,
                                    queries.get_sent):
                yield account_listing(account).iden

        # --- subreddits --------------------------------------------
        sr_query = Subreddit._query(Subreddit.c._spam == (True, False),
                                    sort=desc('_date'))
        for sr in fetch_things2(sr_query, verbosity):
            yield last_modified_key(sr, 'stylesheet_contents')

            for sort_name in ('hot', 'new'):
                yield queries.get_links(sr, sort_name, 'all').iden

            for sort_name in ('top', 'controversial'):
                for period in ('hour', 'day', 'week',
                               'month', 'year', 'all'):
                    links = queries.get_links(sr, sort_name, period,
                                              merge_batched=False)
                    yield links.iden

            for sr_listing in (queries.get_spam_links,
                               queries.get_spam_comments,
                               queries.get_reported_links,
                               queries.get_reported_comments,
                               queries.get_subreddit_messages,
                               queries.get_unread_subreddit_messages):
                yield sr_listing(sr).iden

    processed = 0
    for chunk in in_chunks(gen_keys(), verbosity):
        g.reset_caches()
        processed += len(chunk)
        print('Done %d: %r' % (processed, chunk[-1]))
        populate(chunk)
Beispiel #6
0
    def setup(self):
        """Build the runtime services and derived settings on this object.

        In order, this method:

        * forces ``read_only_mode`` on when ``heavy_load_mode`` is set and
          installs ``thread_dump`` as the SIGUSR1 handler where the
          platform has that signal;
        * connects to zookeeper when ``zookeeper_connection_string`` is
          configured, otherwise reads the live config from the ini file;
        * creates the memcache clients, the stats client, the lock
          factory and the cassandra connection pool;
        * builds every cache chain, registers each one in
          ``self.cache_chains`` and installs ``self.reset_caches``;
        * derives the origin / secure / trusted domains, loads the static
          file name map, configures logging, applies ``key=value``
          overrides found in ``sys.argv`` and records the source control
          versions.

        Raises:
            ValueError: if ``cassandra_seeds`` is empty.
            Exception: if ``write_query_queue`` is set without
                ``amqp_host`` or without ``use_query_cache``.
        """
        # heavy load mode is read only mode with a different infobar
        if self.heavy_load_mode:
            self.read_only_mode = True

        if hasattr(signal, 'SIGUSR1'):
            # not all platforms have user signals
            signal.signal(signal.SIGUSR1, thread_dump)

        # initialize caches. Any cache-chains built here must be added
        # to cache_chains (closed around by reset_caches) so that they
        # can properly reset their local components

        # the first layer of every chain: a SelfEmptyingCache when running
        # as a script, a LocalCache otherwise
        localcache_cls = (SelfEmptyingCache
                          if self.running_as_script else LocalCache)
        num_mc_clients = self.num_mc_clients

        # chain name -> chain; the name is later handed to CacheStats
        self.cache_chains = {}

        # for now, zookeeper will be an optional part of the stack.
        # if it's not configured, we will grab the expected config from the
        # [live_config] section of the ini file
        zk_hosts = self.config.get("zookeeper_connection_string")
        if zk_hosts:
            # imported lazily so the module is only needed when zookeeper
            # is actually configured
            from r2.lib.zookeeper import (connect_to_zookeeper, LiveConfig,
                                          LiveList)
            zk_username = self.config["zookeeper_username"]
            zk_password = self.config["zookeeper_password"]
            self.zookeeper = connect_to_zookeeper(zk_hosts,
                                                  (zk_username, zk_password))
            self.live_config = LiveConfig(self.zookeeper, LIVE_CONFIG_NODE)
            self.throttles = LiveList(self.zookeeper,
                                      "/throttles",
                                      map_fn=ipaddress.ip_network,
                                      reduce_fn=ipaddress.collapse_addresses)
        else:
            self.zookeeper = None
            # re-read the ini file this app was configured from
            parser = ConfigParser.RawConfigParser()
            parser.read([self.config["__file__"]])
            self.live_config = extract_live_config(parser, self.plugins)
            self.throttles = tuple()  # immutable since it's not real

        # general purpose memcache plus a separate client used for locks
        self.memcache = CMemcache(self.memcaches, num_clients=num_mc_clients)
        self.lock_cache = CMemcache(self.lockcaches,
                                    num_clients=num_mc_clients)

        self.stats = Stats(self.config.get('statsd_addr'),
                           self.config.get('statsd_sample_rate'))

        # register the stats object's hooks as listeners for the
        # before/after cursor-execute events of engine.Engine
        event.listens_for(engine.Engine, 'before_cursor_execute')(
            self.stats.pg_before_cursor_execute)
        event.listens_for(engine.Engine, 'after_cursor_execute')(
            self.stats.pg_after_cursor_execute)

        self.make_lock = make_lock_factory(self.lock_cache, self.stats)

        # cassandra is mandatory: refuse to start without seeds
        if not self.cassandra_seeds:
            raise ValueError("cassandra_seeds not set in the .ini")

        keyspace = "reddit"
        # pool name -> connection pool; only "main" is defined here, so
        # cassandra_default_pool (used below) has to name an existing key
        self.cassandra_pools = {
            "main":
            StatsCollectingConnectionPool(keyspace,
                                          stats=self.stats,
                                          logging_name="main",
                                          server_list=self.cassandra_seeds,
                                          pool_size=self.cassandra_pool_size,
                                          timeout=2,
                                          max_retries=3,
                                          prefill=False),
        }

        # the memcache layer of the permacache is optional
        perma_memcache = (CMemcache(self.permacache_memcaches,
                                    num_clients=num_mc_clients)
                          if self.permacache_memcaches else None)
        self.permacache = CassandraCacheChain(
            localcache_cls(),
            CassandraCache('permacache',
                           self.cassandra_pools[self.cassandra_default_pool],
                           read_consistency_level=self.cassandra_rcl,
                           write_consistency_level=self.cassandra_wcl),
            memcache=perma_memcache,
            lock_factory=self.make_lock)

        self.cache_chains.update(permacache=self.permacache)

        # hardcache is done after the db info is loaded, and then the
        # chains are reset to use the appropriate initial entries

        # with stalecaches configured, the main cache gets an extra
        # memcache layer in front of self.memcache
        if self.stalecaches:
            self.cache = StaleCacheChain(
                localcache_cls(),
                CMemcache(self.stalecaches, num_clients=num_mc_clients),
                self.memcache)
        else:
            self.cache = MemcacheChain((localcache_cls(), self.memcache))
        self.cache_chains.update(cache=self.cache)

        # the render cache client is created with noreply/no_block set
        self.rendercache = MemcacheChain(
            (localcache_cls(),
             CMemcache(self.rendercaches,
                       noreply=True,
                       no_block=True,
                       num_clients=num_mc_clients)))
        self.cache_chains.update(rendercache=self.rendercache)

        # thing_cache consists of a local cache only
        self.thing_cache = CacheChain((localcache_cls(), ))
        self.cache_chains.update(thing_cache=self.thing_cache)

        #load the database info
        self.dbm = self.load_db_params()

        # can't do this until load_db_params() has been called
        self.hardcache = HardcacheChain(
            (localcache_cls(), self.memcache, HardCache(self)),
            cache_negative_results=True)
        self.cache_chains.update(hardcache=self.hardcache)

        # I know this sucks, but we need non-request-threads to be
        # able to reset the caches, so we need them be able to close
        # around 'cache_chains' without being able to call getattr on
        # 'g'
        # NOTE: the closure works on a snapshot, so chains added to
        # self.cache_chains after this point are not reset by it
        cache_chains = self.cache_chains.copy()

        def reset_caches():
            # reset every chain and give it a fresh CacheStats object
            for name, chain in cache_chains.iteritems():
                chain.reset()
                chain.stats = CacheStats(self.stats, name)

        self.reset_caches = reset_caches
        self.reset_caches()

        # set the modwindow
        # (the configured value is passed as timedelta's first positional
        # argument, i.e. it is interpreted as a number of days)
        self.MODWINDOW = timedelta(self.MODWINDOW)

        self.REDDIT_MAIN = bool(os.environ.get('REDDIT_MAIN'))

        # parses as (domain_prefix + ".") if domain_prefix else ""
        origin_prefix = self.domain_prefix + "." if self.domain_prefix else ""
        self.origin = "http://" + origin_prefix + self.domain
        self.secure_domains = set([urlparse(self.payment_domain).netloc])

        self.trusted_domains = set([self.domain])
        self.trusted_domains.update(self.authorized_cnames)
        if self.https_endpoint:
            https_url = urlparse(self.https_endpoint)
            # netloc keeps an explicit port, hostname does not
            self.secure_domains.add(https_url.netloc)
            self.trusted_domains.add(https_url.hostname)
        if getattr(self, 'oauth_domain', None):
            self.secure_domains.add(self.oauth_domain)

        # load the unique hashed names of files under static
        static_files = os.path.join(self.paths.get('static_files'), 'static')
        names_file_path = os.path.join(static_files, 'names.json')
        if os.path.exists(names_file_path):
            with open(names_file_path) as handle:
                self.static_names = json.load(handle)
        else:
            # no names file: fall back to an empty mapping
            self.static_names = {}

        #setup the logger
        self.log = logging.getLogger('reddit')
        self.log.addHandler(logging.StreamHandler())
        if self.debug:
            self.log.setLevel(logging.DEBUG)
        else:
            self.log.setLevel(logging.INFO)

        # set log level for pycountry which is chatty
        logging.getLogger('pycountry.db').setLevel(logging.CRITICAL)

        # media_domain defaults to the main domain, which triggers the
        # warning below
        if not self.media_domain:
            self.media_domain = self.domain
        if self.media_domain == self.domain:
            print("Warning: g.media_domain == g.domain. " +
                  "This may give untrusted content access to user cookies")

        self.reddit_host = socket.gethostname()
        self.reddit_pid = os.getpid()

        # every command line argument of the form key=value (exactly one
        # "=") overrides the attribute of that name; the value is stored
        # as the raw string, without any type conversion
        for arg in sys.argv:
            tokens = arg.split("=")
            if len(tokens) == 2:
                k, v = tokens
                self.log.debug("Overriding g.%s to %s" % (k, v))
                setattr(self, k, v)

        #if we're going to use the query_queue, we need amqp
        if self.write_query_queue and not self.amqp_host:
            raise Exception("amqp_host must be defined to use the query queue")

        # This requirement doesn't *have* to be a requirement, but there are
        # bugs at the moment that will pop up if you violate it
        if self.write_query_queue and not self.use_query_cache:
            raise Exception("write_query_queue requires use_query_cache")

        # try to set the source control revision numbers
        self.versions = {}
        # the repository's .git directory is looked up two directory
        # levels above paths["root"]
        r2_root = os.path.dirname(os.path.dirname(self.paths["root"]))
        r2_gitdir = os.path.join(r2_root, ".git")
        self.short_version = self.record_repo_version("r2", r2_gitdir)

        if I18N_PATH:
            i18n_git_path = os.path.join(os.path.dirname(I18N_PATH), ".git")
            self.record_repo_version("i18n", i18n_git_path)

        # NOTE(review): logged at error level, presumably so the start-up
        # line shows up regardless of the configured log level -- confirm
        if self.log_start:
            self.log.error("reddit app %s:%s started %s at %s" %
                           (self.reddit_host, self.reddit_pid,
                            self.short_version, datetime.now()))
Beispiel #7
0
    def __init__(self, global_conf, app_conf, paths, **extra):
        """
        Globals acts as a container for objects available throughout
        the life of the application.

        One instance of Globals is created by Pylons during
        application initialization and is available during requests
        via the 'g' variable.

        ``global_conf``
            The same variable used throughout ``config/middleware.py``
            namely, the variables from the ``[DEFAULT]`` section of the
            configuration file.

        ``app_conf``
            The same ``kw`` dictionary used throughout
            ``config/middleware.py`` namely, the variables from the
            section in the config file for your application.

        ``paths``
            Mapping of path names; ``static_files`` is read here and
            the mapping itself is stored as ``self.paths``.

        ``extra``
            The configuration returned from ``load_config`` in
            ``config/middleware.py`` which may be of use in the setup of
            your global variables.

        Raises ``ValueError`` when a choice option has an unknown value
        or ``cassandra_seeds`` is empty, and ``Exception`` when
        ``write_query_queue`` is set without ``amqp_host`` or without
        ``use_query_cache``.  Failing to determine the source control
        revision is not an error: the version falls back to
        ``'(unknown)'``.

        """

        # slop over all variables to start with, converting the ones
        # whose names are listed in the *_props collections
        for k, v in global_conf.iteritems():
            if not k.startswith("_") and not hasattr(self, k):
                if k in self.int_props:
                    v = int(v)
                elif k in self.float_props:
                    v = float(v)
                elif k in self.bool_props:
                    v = self.to_bool(v)
                elif k in self.tuple_props:
                    v = tuple(self.to_iter(v))
                elif k in self.choice_props:
                    if v not in self.choice_props[k]:
                        raise ValueError(
                            "Unknown option for %r: %r not in %r" %
                            (k, v, self.choice_props[k]))
                    v = self.choice_props[k][v]
                setattr(self, k, v)

        self.running_as_script = global_conf.get('running_as_script', False)

        if hasattr(signal, 'SIGUSR1'):
            # not all platforms have user signals
            signal.signal(signal.SIGUSR1, thread_dump)

        # initialize caches. Any cache-chains built here must be added
        # to cache_chains (closed around by reset_caches) so that they
        # can properly reset their local components

        localcache_cls = (SelfEmptyingCache
                          if self.running_as_script else LocalCache)
        num_mc_clients = self.num_mc_clients

        self.cache_chains = []

        self.memcache = CMemcache(self.memcaches, num_clients=num_mc_clients)
        self.make_lock = make_lock_factory(self.memcache)

        # cassandra is mandatory: refuse to start without seeds
        if not self.cassandra_seeds:
            raise ValueError("cassandra_seeds not set in the .ini")
        self.cassandra = PycassaConnectionPool(
            'reddit',
            server_list=self.cassandra_seeds,
            pool_size=len(self.cassandra_seeds),
            # TODO: .ini setting
            timeout=15,
            max_retries=3,
            prefill=False)
        # the memcache layer of the permacache is optional
        perma_memcache = (CMemcache(self.permacache_memcaches,
                                    num_clients=num_mc_clients)
                          if self.permacache_memcaches else None)
        self.permacache = CassandraCacheChain(
            localcache_cls(),
            CassandraCache('permacache',
                           self.cassandra,
                           read_consistency_level=self.cassandra_rcl,
                           write_consistency_level=self.cassandra_wcl),
            memcache=perma_memcache,
            lock_factory=self.make_lock)

        self.cache_chains.append(self.permacache)

        # hardcache is done after the db info is loaded, and then the
        # chains are reset to use the appropriate initial entries

        if self.stalecaches:
            self.cache = StaleCacheChain(
                localcache_cls(),
                CMemcache(self.stalecaches, num_clients=num_mc_clients),
                self.memcache)
        else:
            self.cache = MemcacheChain((localcache_cls(), self.memcache))
        self.cache_chains.append(self.cache)

        self.rendercache = MemcacheChain(
            (localcache_cls(),
             CMemcache(self.rendercaches,
                       noreply=True,
                       no_block=True,
                       num_clients=num_mc_clients)))
        self.cache_chains.append(self.rendercache)

        self.servicecache = MemcacheChain(
            (localcache_cls(),
             CMemcache(self.servicecaches, num_clients=num_mc_clients)))
        self.cache_chains.append(self.servicecache)

        self.thing_cache = CacheChain((localcache_cls(), ))
        self.cache_chains.append(self.thing_cache)

        # set default time zone if one is not set
        tz = global_conf.get('timezone')
        dtz = global_conf.get('display_timezone', tz)

        self.tz = pytz.timezone(tz)
        self.display_tz = pytz.timezone(dtz)

        #load the database info
        self.dbm = self.load_db_params(global_conf)

        # can't do this until load_db_params() has been called
        self.hardcache = HardcacheChain(
            (localcache_cls(), self.memcache, HardCache(self)),
            cache_negative_results=True)
        self.cache_chains.append(self.hardcache)

        # I know this sucks, but we need non-request-threads to be
        # able to reset the caches, so we need them be able to close
        # around 'cache_chains' without being able to call getattr on
        # 'g'
        cache_chains = self.cache_chains[::]

        def reset_caches():
            for chain in cache_chains:
                chain.reset()

        self.reset_caches = reset_caches
        self.reset_caches()

        #make a query cache
        self.stats_collector = QueryStats()

        # set the modwindow
        self.MODWINDOW = timedelta(self.MODWINDOW)

        self.REDDIT_MAIN = bool(os.environ.get('REDDIT_MAIN'))

        # turn on for language support
        self.languages, self.lang_name = get_active_langs(
            default_lang=self.lang)

        self.all_languages = sorted(self.lang_name.keys())

        self.paths = paths

        # load the md5 hashes of files under static: every "<key>.md5"
        # file contributes one key -> hash entry
        static_files = os.path.join(paths.get('static_files'), 'static')
        self.static_md5 = {}
        if os.path.exists(static_files):
            for md5_name in os.listdir(static_files):
                if md5_name.endswith('.md5'):
                    key = md5_name[0:-4]
                    md5_path = os.path.join(static_files, md5_name)
                    with open(md5_path, 'r') as handle:
                        md5 = handle.read().strip('\n')
                    self.static_md5[key] = md5

        #set up the logging directory
        log_path = self.log_path
        process_iden = global_conf.get('scgi_port', 'default')
        self.reddit_port = process_iden
        if log_path:
            if not os.path.exists(log_path):
                os.makedirs(log_path)
            # remove the files in the log directory whose name starts
            # with this process's identifier
            for fname in os.listdir(log_path):
                if fname.startswith(process_iden):
                    full_name = os.path.join(log_path, fname)
                    os.remove(full_name)

        #setup the logger
        self.log = logging.getLogger('reddit')
        self.log.addHandler(logging.StreamHandler())
        if self.debug:
            self.log.setLevel(logging.DEBUG)
        else:
            self.log.setLevel(logging.INFO)

        # set log level for pycountry which is chatty
        logging.getLogger('pycountry.db').setLevel(logging.CRITICAL)

        if not self.media_domain:
            self.media_domain = self.domain
        if self.media_domain == self.domain:
            print("Warning: g.media_domain == g.domain. " +
                  "This may give untrusted content access to user cookies")

        #read in our CSS so that it can become a default for subreddit
        #stylesheets
        stylesheet_path = os.path.join(paths.get('static_files'),
                                       self.static_path.lstrip('/'),
                                       self.stylesheet)
        with open(stylesheet_path) as s:
            self.default_stylesheet = s.read()

        # one word per line; the words are joined into a single
        # case-insensitive, whole-word alternation (they are not escaped)
        self.profanities = None
        if self.profanity_wordlist and os.path.exists(self.profanity_wordlist):
            with open(self.profanity_wordlist, 'r') as handle:
                words = [line.strip(' \n\r') for line in handle]
                if words:
                    self.profanities = re.compile(
                        r"\b(%s)\b" % '|'.join(words), re.I | re.U)

        self.reddit_host = socket.gethostname()
        self.reddit_pid = os.getpid()

        # every command line argument of the form key=value (exactly one
        # "=") overrides the attribute of that name; the value is stored
        # as the raw string
        for arg in sys.argv:
            tokens = arg.split("=")
            if len(tokens) == 2:
                k, v = tokens
                self.log.debug("Overriding g.%s to %s" % (k, v))
                setattr(self, k, v)

        #the shutdown toggle
        self.shutdown = False

        #if we're going to use the query_queue, we need amqp
        if self.write_query_queue and not self.amqp_host:
            raise Exception("amqp_host must be defined to use the query queue")

        # This requirement doesn't *have* to be a requirement, but there are
        # bugs at the moment that will pop up if you violate it
        if self.write_query_queue and not self.use_query_cache:
            raise Exception("write_query_queue requires use_query_cache")

        # try to set the source control revision number. This is best
        # effort and must never prevent the application from starting:
        #  * OSError    - git is not installed / could not be spawned
        #  * ValueError - the output did not consist of exactly two
        #                 fields (e.g. git failed and printed nothing)
        try:
            popen = subprocess.Popen(
                ["git", "log", "--date=short", "--pretty=format:%H %h", '-n1'],
                stdin=subprocess.PIPE,
                stdout=subprocess.PIPE)
            resp = popen.communicate()[0]
            resp = resp.strip().split(' ')
            self.version, self.short_version = resp
        except (OSError, ValueError) as e:
            self.log.info("Couldn't read source revision (%r)" % e)
            self.version = self.short_version = '(unknown)'
Beispiel #8
0
    def setup(self, global_conf):
        # heavy load mode is read only mode with a different infobar
        if self.heavy_load_mode:
            self.read_only_mode = True

        if hasattr(signal, 'SIGUSR1'):
            # not all platforms have user signals
            signal.signal(signal.SIGUSR1, thread_dump)

        # initialize caches. Any cache-chains built here must be added
        # to cache_chains (closed around by reset_caches) so that they
        # can properly reset their local components

        localcache_cls = (SelfEmptyingCache
                          if self.running_as_script else LocalCache)
        num_mc_clients = self.num_mc_clients

        self.cache_chains = {}

        self.memcache = CMemcache(self.memcaches, num_clients=num_mc_clients)
        self.make_lock = make_lock_factory(self.memcache)

        self.stats = Stats(global_conf.get('statsd_addr'),
                           global_conf.get('statsd_sample_rate'))

        if not self.cassandra_seeds:
            raise ValueError("cassandra_seeds not set in the .ini")

        keyspace = "reddit"
        self.cassandra_pools = {
            "main":
            StatsCollectingConnectionPool(keyspace,
                                          stats=self.stats,
                                          logging_name="main",
                                          server_list=self.cassandra_seeds,
                                          pool_size=len(self.cassandra_seeds),
                                          timeout=2,
                                          max_retries=3,
                                          prefill=False),
            "noretries":
            StatsCollectingConnectionPool(keyspace,
                                          stats=self.stats,
                                          logging_name="noretries",
                                          server_list=self.cassandra_seeds,
                                          pool_size=len(self.cassandra_seeds),
                                          timeout=.1,
                                          max_retries=0,
                                          prefill=False),
        }

        perma_memcache = (CMemcache(self.permacache_memcaches,
                                    num_clients=num_mc_clients)
                          if self.permacache_memcaches else None)
        self.permacache = CassandraCacheChain(
            localcache_cls(),
            CassandraCache('permacache',
                           self.cassandra_pools[self.cassandra_default_pool],
                           read_consistency_level=self.cassandra_rcl,
                           write_consistency_level=self.cassandra_wcl),
            memcache=perma_memcache,
            lock_factory=self.make_lock)

        self.cache_chains.update(permacache=self.permacache)

        # hardcache is done after the db info is loaded, and then the
        # chains are reset to use the appropriate initial entries

        if self.stalecaches:
            self.cache = StaleCacheChain(
                localcache_cls(),
                CMemcache(self.stalecaches, num_clients=num_mc_clients),
                self.memcache)
        else:
            self.cache = MemcacheChain((localcache_cls(), self.memcache))
        self.cache_chains.update(cache=self.cache)

        self.rendercache = MemcacheChain(
            (localcache_cls(),
             CMemcache(self.rendercaches,
                       noreply=True,
                       no_block=True,
                       num_clients=num_mc_clients)))
        self.cache_chains.update(rendercache=self.rendercache)

        self.servicecache = MemcacheChain(
            (localcache_cls(),
             CMemcache(self.servicecaches, num_clients=num_mc_clients)))
        self.cache_chains.update(servicecache=self.servicecache)

        self.thing_cache = CacheChain((localcache_cls(), ))
        self.cache_chains.update(thing_cache=self.thing_cache)

        #load the database info
        self.dbm = self.load_db_params(global_conf)

        # can't do this until load_db_params() has been called
        self.hardcache = HardcacheChain(
            (localcache_cls(), self.memcache, HardCache(self)),
            cache_negative_results=True)
        self.cache_chains.update(hardcache=self.hardcache)

        # I know this sucks, but we need non-request-threads to be
        # able to reset the caches, so we need them be able to close
        # around 'cache_chains' without being able to call getattr on
        # 'g'
        cache_chains = self.cache_chains.copy()

        def reset_caches():
            for name, chain in cache_chains.iteritems():
                chain.reset()
                chain.stats = CacheStats(self.stats, name)

        self.reset_caches = reset_caches
        self.reset_caches()

        #make a query cache
        self.stats_collector = QueryStats()

        # set the modwindow
        self.MODWINDOW = timedelta(self.MODWINDOW)

        self.REDDIT_MAIN = bool(os.environ.get('REDDIT_MAIN'))

        origin_prefix = self.domain_prefix + "." if self.domain_prefix else ""
        self.origin = "http://" + origin_prefix + self.domain
        self.secure_domains = set([urlparse(self.payment_domain).netloc])

        self.trusted_domains = set([self.domain])
        self.trusted_domains.update(self.authorized_cnames)
        if self.https_endpoint:
            https_url = urlparse(self.https_endpoint)
            self.secure_domains.add(https_url.netloc)
            self.trusted_domains.add(https_url.hostname)

        # load the unique hashed names of files under static
        static_files = os.path.join(self.paths.get('static_files'), 'static')
        names_file_path = os.path.join(static_files, 'names.json')
        if os.path.exists(names_file_path):
            with open(names_file_path) as handle:
                self.static_names = json.load(handle)
        else:
            self.static_names = {}

        #setup the logger
        self.log = logging.getLogger('reddit')
        self.log.addHandler(logging.StreamHandler())
        if self.debug:
            self.log.setLevel(logging.DEBUG)
        else:
            self.log.setLevel(logging.INFO)

        # set log level for pycountry which is chatty
        logging.getLogger('pycountry.db').setLevel(logging.CRITICAL)

        if not self.media_domain:
            self.media_domain = self.domain
        if self.media_domain == self.domain:
            print("Warning: g.media_domain == g.domain. " +
                  "This may give untrusted content access to user cookies")

        self.reddit_host = socket.gethostname()
        self.reddit_pid = os.getpid()

        for arg in sys.argv:
            tokens = arg.split("=")
            if len(tokens) == 2:
                k, v = tokens
                self.log.debug("Overriding g.%s to %s" % (k, v))
                setattr(self, k, v)

        #the shutdown toggle
        self.shutdown = False

        #if we're going to use the query_queue, we need amqp
        if self.write_query_queue and not self.amqp_host:
            raise Exception("amqp_host must be defined to use the query queue")

        # This requirement doesn't *have* to be a requirement, but there are
        # bugs at the moment that will pop up if you violate it
        if self.write_query_queue and not self.use_query_cache:
            raise Exception("write_query_queue requires use_query_cache")

        # try to set the source control revision number
        try:
            self.version = subprocess.check_output(
                ["git", "rev-parse", "HEAD"])
        except subprocess.CalledProcessError, e:
            self.log.info("Couldn't read source revision (%r)" % e)
            self.version = self.short_version = '(unknown)'