def setup(self, global_conf):
    # heavy load mode is read only mode with a different infobar
    if self.heavy_load_mode:
        self.read_only_mode = True

    if hasattr(signal, 'SIGUSR1'):
        # not all platforms have user signals
        signal.signal(signal.SIGUSR1, thread_dump)

    # initialize caches. Any cache-chains built here must be added
    # to cache_chains (closed around by reset_caches) so that they
    # can properly reset their local components
    localcache_cls = (SelfEmptyingCache if self.running_as_script
                      else LocalCache)
    num_mc_clients = self.num_mc_clients
    self.cache_chains = []

    self.memcache = CMemcache(self.memcaches, num_clients=num_mc_clients)
    self.make_lock = make_lock_factory(self.memcache)

    if not self.cassandra_seeds:
        raise ValueError("cassandra_seeds not set in the .ini")
    self.cassandra = PycassaConnectionPool(
        'reddit',
        server_list=self.cassandra_seeds,
        pool_size=len(self.cassandra_seeds),  # TODO: .ini setting
        timeout=15,
        max_retries=3,
        prefill=False)

    perma_memcache = (CMemcache(self.permacache_memcaches,
                                num_clients=num_mc_clients)
                      if self.permacache_memcaches
                      else None)
    self.permacache = CassandraCacheChain(
        localcache_cls(),
        CassandraCache('permacache', self.cassandra,
                       read_consistency_level=self.cassandra_rcl,
                       write_consistency_level=self.cassandra_wcl),
        memcache=perma_memcache,
        lock_factory=self.make_lock)
    self.cache_chains.append(self.permacache)

    # hardcache is done after the db info is loaded, and then the
    # chains are reset to use the appropriate initial entries
    if self.stalecaches:
        self.cache = StaleCacheChain(
            localcache_cls(),
            CMemcache(self.stalecaches, num_clients=num_mc_clients),
            self.memcache)
    else:
        self.cache = MemcacheChain((localcache_cls(), self.memcache))
    self.cache_chains.append(self.cache)

    self.rendercache = MemcacheChain(
        (localcache_cls(),
         CMemcache(self.rendercaches, noreply=True, no_block=True,
                   num_clients=num_mc_clients)))
    self.cache_chains.append(self.rendercache)

    self.servicecache = MemcacheChain(
        (localcache_cls(),
         CMemcache(self.servicecaches, num_clients=num_mc_clients)))
    self.cache_chains.append(self.servicecache)

    self.thing_cache = CacheChain((localcache_cls(),))
    self.cache_chains.append(self.thing_cache)

    # load the database info
    self.dbm = self.load_db_params(global_conf)

    # can't do this until load_db_params() has been called
    self.hardcache = HardcacheChain(
        (localcache_cls(), self.memcache, HardCache(self)),
        cache_negative_results=True)
    self.cache_chains.append(self.hardcache)

    # I know this sucks, but we need non-request-threads to be
    # able to reset the caches, so we need them be able to close
    # around 'cache_chains' without being able to call getattr on
    # 'g'
    cache_chains = self.cache_chains[::]
    def reset_caches():
        for chain in cache_chains:
            chain.reset()
    self.reset_caches = reset_caches
    self.reset_caches()

    # make a query cache
    self.stats_collector = QueryStats()

    # set the modwindow
    self.MODWINDOW = timedelta(self.MODWINDOW)

    self.REDDIT_MAIN = bool(os.environ.get('REDDIT_MAIN'))

    origin_prefix = self.domain_prefix + "." if self.domain_prefix else ""
    self.origin = "http://" + origin_prefix + self.domain
    self.secure_domains = set([urlparse(self.payment_domain).netloc])
    if self.https_endpoint:
        self.secure_domains.add(urlparse(self.https_endpoint).netloc)
    self.trusted_origins = ([self.origin, self.https_endpoint] +
                            ['http://' + origin_prefix + cname
                             for cname in self.authorized_cnames])

    # load the unique hashed names of files under static
    static_files = os.path.join(self.paths.get('static_files'), 'static')
    names_file_path = os.path.join(static_files, 'names.json')
    if os.path.exists(names_file_path):
        with open(names_file_path) as handle:
            self.static_names = json.load(handle)
    else:
        self.static_names = {}

    # setup the logger
    self.log = logging.getLogger('reddit')
    self.log.addHandler(logging.StreamHandler())
    if self.debug:
        self.log.setLevel(logging.DEBUG)
    else:
        self.log.setLevel(logging.INFO)

    # set log level for pycountry which is chatty
    logging.getLogger('pycountry.db').setLevel(logging.CRITICAL)

    if not self.media_domain:
        self.media_domain = self.domain
    if self.media_domain == self.domain:
        print ("Warning: g.media_domain == g.domain. " +
               "This may give untrusted content access to user cookies")

    self.profanities = None
    if self.profanity_wordlist and os.path.exists(self.profanity_wordlist):
        with open(self.profanity_wordlist, 'r') as handle:
            words = []
            for line in handle:
                words.append(line.strip(' \n\r'))
            if words:
                self.profanities = re.compile(r"\b(%s)\b" % '|'.join(words),
                                              re.I | re.U)

    self.reddit_host = socket.gethostname()
    self.reddit_pid = os.getpid()

    for arg in sys.argv:
        tokens = arg.split("=")
        if len(tokens) == 2:
            k, v = tokens
            self.log.debug("Overriding g.%s to %s" % (k, v))
            setattr(self, k, v)

    # the shutdown toggle
    self.shutdown = False

    # if we're going to use the query_queue, we need amqp
    if self.write_query_queue and not self.amqp_host:
        raise Exception("amqp_host must be defined to use the query queue")

    # This requirement doesn't *have* to be a requirement, but there are
    # bugs at the moment that will pop up if you violate it
    if self.write_query_queue and not self.use_query_cache:
        raise Exception("write_query_queue requires use_query_cache")

    # try to set the source control revision number
    try:
        popen = subprocess.Popen(["git", "log", "--date=short",
                                  "--pretty=format:%H %h", '-n1'],
                                 stdin=subprocess.PIPE,
                                 stdout=subprocess.PIPE)
        resp, stderrdata = popen.communicate()
        resp = resp.strip().split(' ')
        self.version, self.short_version = resp
    except Exception, e:
        self.log.info("Couldn't read source revision (%r)" % e)
        self.version = self.short_version = '(unknown)'
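
# The reset_caches closure above is what lets non-request threads flush the
# request-local layer of every chain without reaching into 'g'. A minimal
# standalone sketch of the same pattern (FakeChain and make_cache_resetter
# are illustrative stand-ins, not part of the original code):

def make_cache_resetter(chains):
    # copy the list so later additions to the original don't leak into the
    # closure, mirroring the self.cache_chains[::] slice above
    chains = list(chains)
    def reset_caches():
        for chain in chains:
            chain.reset()
    return reset_caches

class FakeChain(object):
    def __init__(self):
        self.resets = 0
    def reset(self):
        self.resets += 1

chains = [FakeChain(), FakeChain()]
reset = make_cache_resetter(chains)
reset()   # every FakeChain now has resets == 1
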
def setup(self): self.env = '' if ( # handle direct invocation of "nosetests" "test" in sys.argv[0] or # handle "setup.py test" and all permutations thereof. "setup.py" in sys.argv[0] and "test" in sys.argv[1:] ): self.env = "unit_test" self.queues = queues.declare_queues(self) self.extension_subdomains = dict( simple="mobile", i="compact", api="api", rss="rss", xml="xml", json="json", ) ################# PROVIDERS self.auth_provider = select_provider( self.config, self.pkg_resources_working_set, "r2.provider.auth", self.authentication_provider, ) self.media_provider = select_provider( self.config, self.pkg_resources_working_set, "r2.provider.media", self.media_provider, ) self.cdn_provider = select_provider( self.config, self.pkg_resources_working_set, "r2.provider.cdn", self.cdn_provider, ) self.ticket_provider = select_provider( self.config, self.pkg_resources_working_set, "r2.provider.support", # TODO: fix this later, it refuses to pick up # g.config['ticket_provider'] value, so hardcoding for now. # really, the next uncommented line should be: #self.ticket_provider, # instead of: "zendesk", ) self.image_resizing_provider = select_provider( self.config, self.pkg_resources_working_set, "r2.provider.image_resizing", self.image_resizing_provider, ) self.email_provider = select_provider( self.config, self.pkg_resources_working_set, "r2.provider.email", self.email_provider, ) self.startup_timer.intermediate("providers") ################# CONFIGURATION # AMQP is required if not self.amqp_host: raise ValueError("amqp_host not set in the .ini") if not self.cassandra_seeds: raise ValueError("cassandra_seeds not set in the .ini") # heavy load mode is read only mode with a different infobar if self.heavy_load_mode: self.read_only_mode = True origin_prefix = self.domain_prefix + "." if self.domain_prefix else "" self.origin = self.default_scheme + "://" + origin_prefix + self.domain self.trusted_domains = set([self.domain]) if self.https_endpoint: https_url = urlparse(self.https_endpoint) self.trusted_domains.add(https_url.hostname) # load the unique hashed names of files under static static_files = os.path.join(self.paths.get('static_files'), 'static') names_file_path = os.path.join(static_files, 'names.json') if os.path.exists(names_file_path): with open(names_file_path) as handle: self.static_names = json.load(handle) else: self.static_names = {} # make python warnings go through the logging system logging.captureWarnings(capture=True) log = logging.getLogger('reddit') # when we're a script (paster run) just set up super simple logging if self.running_as_script: log.setLevel(logging.INFO) log.addHandler(logging.StreamHandler()) # if in debug mode, override the logging level to DEBUG if self.debug: log.setLevel(logging.DEBUG) # attempt to figure out which pool we're in and add that to the # LogRecords. try: with open("/etc/ec2_asg", "r") as f: pool = f.read().strip() # clean up the pool name since we're putting stuff after "-" pool = pool.partition("-")[0] except IOError: pool = "reddit-app" self.log = logging.LoggerAdapter(log, {"pool": pool}) # set locations locations = pkg_resources.resource_stream(__name__, "../data/locations.json") self.locations = json.loads(locations.read()) if not self.media_domain: self.media_domain = self.domain if self.media_domain == self.domain: print >> sys.stderr, ("Warning: g.media_domain == g.domain. " + "This may give untrusted content access to user cookies") if self.oauth_domain == self.domain: print >> sys.stderr, ("Warning: g.oauth_domain == g.domain. 
" "CORS requests to g.domain will be allowed") for arg in sys.argv: tokens = arg.split("=") if len(tokens) == 2: k, v = tokens self.log.debug("Overriding g.%s to %s" % (k, v)) setattr(self, k, v) self.reddit_host = socket.gethostname() self.reddit_pid = os.getpid() if hasattr(signal, 'SIGUSR1'): # not all platforms have user signals signal.signal(signal.SIGUSR1, thread_dump) locale.setlocale(locale.LC_ALL, self.locale) # Pre-calculate ratelimit values self.RL_RESET_SECONDS = self.config["RL_RESET_MINUTES"] * 60 self.RL_MAX_REQS = int(self.config["RL_AVG_REQ_PER_SEC"] * self.RL_RESET_SECONDS) self.RL_OAUTH_RESET_SECONDS = self.config["RL_OAUTH_RESET_MINUTES"] * 60 self.RL_OAUTH_MAX_REQS = int(self.config["RL_OAUTH_AVG_REQ_PER_SEC"] * self.RL_OAUTH_RESET_SECONDS) self.RL_LOGIN_MAX_REQS = int(self.config["RL_LOGIN_AVG_PER_SEC"] * self.RL_RESET_SECONDS) self.RL_LOGIN_IP_MAX_REQS = int(self.config["RL_LOGIN_IP_AVG_PER_SEC"] * self.RL_RESET_SECONDS) self.RL_SHARE_MAX_REQS = int(self.config["RL_SHARE_AVG_PER_SEC"] * self.RL_RESET_SECONDS) # Compile ratelimit regexs user_agent_ratelimit_regexes = {} for agent_re, limit in self.user_agent_ratelimit_regexes.iteritems(): user_agent_ratelimit_regexes[re.compile(agent_re)] = limit self.user_agent_ratelimit_regexes = user_agent_ratelimit_regexes self.startup_timer.intermediate("configuration") ################# ZOOKEEPER zk_hosts = self.config["zookeeper_connection_string"] zk_username = self.config["zookeeper_username"] zk_password = self.config["zookeeper_password"] self.zookeeper = connect_to_zookeeper(zk_hosts, (zk_username, zk_password)) self.throttles = IPNetworkLiveList( self.zookeeper, root="/throttles", reduced_data_node="/throttles_reduced", ) parser = ConfigParser.RawConfigParser() parser.optionxform = str parser.read([self.config["__file__"]]) if self.config["liveconfig_source"] == "zookeeper": self.live_config = LiveConfig(self.zookeeper, LIVE_CONFIG_NODE) else: self.live_config = extract_live_config(parser, self.plugins) if self.config["secrets_source"] == "zookeeper": self.secrets = fetch_secrets(self.zookeeper) else: self.secrets = extract_secrets(parser) ################# PRIVILEGED USERS self.admins = PermissionFilteredEmployeeList( self.live_config, type="admin") self.sponsors = PermissionFilteredEmployeeList( self.live_config, type="sponsor") self.employees = PermissionFilteredEmployeeList( self.live_config, type="employee") # Store which OAuth clients employees may use, the keys are just for # readability. self.employee_approved_clients = \ self.live_config["employee_approved_clients"].values() self.startup_timer.intermediate("zookeeper") ################# MEMCACHE num_mc_clients = self.num_mc_clients # a smaller pool of caches used only for distributed locks. self.lock_cache = CMemcache( "lock", self.lockcaches, num_clients=num_mc_clients, ) self.make_lock = make_lock_factory(self.lock_cache, self.stats) # memcaches used in front of the permacache CF in cassandra. # XXX: this is a legacy thing; permacache was made when C* didn't have # a row cache. permacache_memcaches = CMemcache( "perma", self.permacache_memcaches, min_compress_len=1400, num_clients=num_mc_clients, ) # the stalecache is a memcached local to the current app server used # for data that's frequently fetched but doesn't need to be fresh. 
if self.stalecaches: stalecaches = CMemcache( "stale", self.stalecaches, num_clients=num_mc_clients, ) else: stalecaches = None self.startup_timer.intermediate("memcache") ################# MCROUTER self.mcrouter = Mcrouter( "mcrouter", self.mcrouter_addr, min_compress_len=1400, num_clients=num_mc_clients, ) ################# THRIFT-BASED SERVICES activity_endpoint = self.config.get("activity_endpoint") if activity_endpoint: # make ActivityInfo objects rendercache-key friendly # TODO: figure out a more general solution for this if # we need to do this for other thrift-generated objects ActivityInfo.cache_key = lambda self, style: repr(self) activity_pool = ThriftConnectionPool(activity_endpoint, timeout=0.1) self.baseplate.add_to_context("activity_service", ThriftContextFactory(activity_pool, ActivityService.Client)) self.startup_timer.intermediate("thrift") ################# CASSANDRA keyspace = "reddit" self.cassandra_pools = { "main": StatsCollectingConnectionPool( keyspace, stats=self.stats, logging_name="main", server_list=self.cassandra_seeds, pool_size=self.cassandra_pool_size, timeout=4, max_retries=3, prefill=False ), } permacache_cf = Permacache._setup_column_family( 'permacache', self.cassandra_pools[self.cassandra_default_pool], ) self.startup_timer.intermediate("cassandra") ################# POSTGRES self.dbm = self.load_db_params() self.startup_timer.intermediate("postgres") ################# CHAINS # initialize caches. Any cache-chains built here must be added # to cache_chains (closed around by reset_caches) so that they # can properly reset their local components cache_chains = {} localcache_cls = (SelfEmptyingCache if self.running_as_script else LocalCache) if stalecaches: self.gencache = StaleCacheChain( localcache_cls(), stalecaches, self.mcrouter, ) else: self.gencache = CacheChain((localcache_cls(), self.mcrouter)) cache_chains.update(gencache=self.gencache) if stalecaches: self.thingcache = StaleCacheChain( localcache_cls(), stalecaches, self.mcrouter, ) else: self.thingcache = CacheChain((localcache_cls(), self.mcrouter)) cache_chains.update(thingcache=self.thingcache) if stalecaches: self.memoizecache = StaleCacheChain( localcache_cls(), stalecaches, self.mcrouter, ) else: self.memoizecache = MemcacheChain( (localcache_cls(), self.mcrouter)) cache_chains.update(memoizecache=self.memoizecache) if stalecaches: self.srmembercache = StaleCacheChain( localcache_cls(), stalecaches, self.mcrouter, ) else: self.srmembercache = MemcacheChain( (localcache_cls(), self.mcrouter)) cache_chains.update(srmembercache=self.srmembercache) if stalecaches: self.relcache = StaleCacheChain( localcache_cls(), stalecaches, self.mcrouter, ) else: self.relcache = MemcacheChain( (localcache_cls(), self.mcrouter)) cache_chains.update(relcache=self.relcache) self.ratelimitcache = MemcacheChain( (localcache_cls(), self.mcrouter)) cache_chains.update(ratelimitcache=self.ratelimitcache) # rendercache holds rendered partial templates. 
self.rendercache = MemcacheChain(( localcache_cls(), self.mcrouter, )) cache_chains.update(rendercache=self.rendercache) # commentpanecaches hold fully rendered comment panes self.commentpanecache = MemcacheChain(( localcache_cls(), self.mcrouter, )) cache_chains.update(commentpanecache=self.commentpanecache) # cassandra_local_cache is used for request-local caching in tdb_cassandra self.cassandra_local_cache = localcache_cls() cache_chains.update(cassandra_local_cache=self.cassandra_local_cache) if stalecaches: permacache_cache = StaleCacheChain( localcache_cls(), stalecaches, permacache_memcaches, ) else: permacache_cache = CacheChain( (localcache_cls(), permacache_memcaches), ) cache_chains.update(permacache=permacache_cache) self.permacache = Permacache( permacache_cache, permacache_cf, lock_factory=self.make_lock, ) # hardcache is used for various things that tend to expire # TODO: replace hardcache w/ cassandra stuff self.hardcache = HardcacheChain( (localcache_cls(), HardCache(self)), cache_negative_results=True, ) cache_chains.update(hardcache=self.hardcache) # I know this sucks, but we need non-request-threads to be # able to reset the caches, so we need them be able to close # around 'cache_chains' without being able to call getattr on # 'g' def reset_caches(): for name, chain in cache_chains.iteritems(): if isinstance(chain, TransitionalCache): chain = chain.read_chain chain.reset() if isinstance(chain, LocalCache): continue elif isinstance(chain, StaleCacheChain): chain.stats = StaleCacheStats(self.stats, name) else: chain.stats = CacheStats(self.stats, name) self.cache_chains = cache_chains self.reset_caches = reset_caches self.reset_caches() self.startup_timer.intermediate("cache_chains") # try to set the source control revision numbers self.versions = {} r2_root = os.path.dirname(os.path.dirname(self.paths["root"])) r2_gitdir = os.path.join(r2_root, ".git") self.short_version = self.record_repo_version("r2", r2_gitdir) if I18N_PATH: i18n_git_path = os.path.join(os.path.dirname(I18N_PATH), ".git") self.record_repo_version("i18n", i18n_git_path) # Initialize the amqp module globals, start the worker, etc. r2.lib.amqp.initialize(self) self.events = EventQueue() self.startup_timer.intermediate("revisions")
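
# Every chain above repeats the same decision: wrap the local cache and the
# backing store in a StaleCacheChain when a stalecache pool is configured,
# otherwise fall back to a plain two-entry chain. A hypothetical helper that
# factors that choice out (not in the original code; it assumes the
# constructor signatures used above):

def build_chain(localcache_cls, stalecaches, backend, plain_cls=MemcacheChain):
    if stalecaches:
        # local cache -> stale memcached -> authoritative backend
        return StaleCacheChain(localcache_cls(), stalecaches, backend)
    # local cache -> authoritative backend
    return plain_cls((localcache_cls(), backend))

# e.g. the srmembercache above could then read:
#   self.srmembercache = build_chain(localcache_cls, stalecaches, self.mcrouter)
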
def setup(self): self.env = '' if ( # handle direct invocation of "nosetests" "test" in sys.argv[0] or # handle "setup.py test" and all permutations thereof. "setup.py" in sys.argv[0] and "test" in sys.argv[1:] ): self.env = "unit_test" self.queues = queues.declare_queues(self) self.extension_subdomains = dict( simple="mobile", i="compact", api="api", rss="rss", xml="xml", json="json", ) ################# PROVIDERS self.auth_provider = select_provider( self.config, self.pkg_resources_working_set, "r2.provider.auth", self.authentication_provider, ) self.media_provider = select_provider( self.config, self.pkg_resources_working_set, "r2.provider.media", self.media_provider, ) self.cdn_provider = select_provider( self.config, self.pkg_resources_working_set, "r2.provider.cdn", self.cdn_provider, ) self.ticket_provider = select_provider( self.config, self.pkg_resources_working_set, "r2.provider.support", # TODO: fix this later, it refuses to pick up # g.config['ticket_provider'] value, so hardcoding for now. # really, the next uncommented line should be: #self.ticket_provider, # instead of: "zendesk", ) self.image_resizing_provider = select_provider( self.config, self.pkg_resources_working_set, "r2.provider.image_resizing", self.image_resizing_provider, ) self.email_provider = select_provider( self.config, self.pkg_resources_working_set, "r2.provider.email", self.email_provider, ) self.startup_timer.intermediate("providers") ################# CONFIGURATION # AMQP is required if not self.amqp_host: raise ValueError("amqp_host not set in the .ini") if not self.cassandra_seeds: raise ValueError("cassandra_seeds not set in the .ini") # heavy load mode is read only mode with a different infobar if self.heavy_load_mode: self.read_only_mode = True origin_prefix = self.domain_prefix + "." if self.domain_prefix else "" self.origin = self.default_scheme + "://" + origin_prefix + self.domain self.trusted_domains = set([self.domain]) if self.https_endpoint: https_url = urlparse(self.https_endpoint) self.trusted_domains.add(https_url.hostname) # load the unique hashed names of files under static static_files = os.path.join(self.paths.get('static_files'), 'static') names_file_path = os.path.join(static_files, 'names.json') if os.path.exists(names_file_path): with open(names_file_path) as handle: self.static_names = json.load(handle) else: self.static_names = {} # make python warnings go through the logging system logging.captureWarnings(capture=True) log = logging.getLogger('reddit') # when we're a script (paster run) just set up super simple logging if self.running_as_script: log.setLevel(logging.INFO) log.addHandler(logging.StreamHandler()) # if in debug mode, override the logging level to DEBUG if self.debug: log.setLevel(logging.DEBUG) # attempt to figure out which pool we're in and add that to the # LogRecords. try: with open("/etc/ec2_asg", "r") as f: pool = f.read().strip() # clean up the pool name since we're putting stuff after "-" pool = pool.partition("-")[0] except IOError: pool = "reddit-app" self.log = logging.LoggerAdapter(log, {"pool": pool}) # set locations locations = pkg_resources.resource_stream(__name__, "../data/locations.json") self.locations = json.loads(locations.read()) if not self.media_domain: self.media_domain = self.domain if self.media_domain == self.domain: print >> sys.stderr, ("Warning: g.media_domain == g.domain. " + "This may give untrusted content access to user cookies") if self.oauth_domain == self.domain: print >> sys.stderr, ("Warning: g.oauth_domain == g.domain. 
" "CORS requests to g.domain will be allowed") for arg in sys.argv: tokens = arg.split("=") if len(tokens) == 2: k, v = tokens self.log.debug("Overriding g.%s to %s" % (k, v)) setattr(self, k, v) self.reddit_host = socket.gethostname() self.reddit_pid = os.getpid() if hasattr(signal, 'SIGUSR1'): # not all platforms have user signals signal.signal(signal.SIGUSR1, thread_dump) locale.setlocale(locale.LC_ALL, self.locale) # Pre-calculate ratelimit values self.RL_RESET_SECONDS = self.config["RL_RESET_MINUTES"] * 60 self.RL_MAX_REQS = int(self.config["RL_AVG_REQ_PER_SEC"] * self.RL_RESET_SECONDS) self.RL_OAUTH_RESET_SECONDS = self.config["RL_OAUTH_RESET_MINUTES"] * 60 self.RL_OAUTH_MAX_REQS = int(self.config["RL_OAUTH_AVG_REQ_PER_SEC"] * self.RL_OAUTH_RESET_SECONDS) self.RL_LOGIN_MAX_REQS = int(self.config["RL_LOGIN_AVG_PER_SEC"] * self.RL_RESET_SECONDS) self.RL_LOGIN_IP_MAX_REQS = int(self.config["RL_LOGIN_IP_AVG_PER_SEC"] * self.RL_RESET_SECONDS) self.RL_SHARE_MAX_REQS = int(self.config["RL_SHARE_AVG_PER_SEC"] * self.RL_RESET_SECONDS) # Compile ratelimit regexs user_agent_ratelimit_regexes = {} for agent_re, limit in self.user_agent_ratelimit_regexes.iteritems(): user_agent_ratelimit_regexes[re.compile(agent_re)] = limit self.user_agent_ratelimit_regexes = user_agent_ratelimit_regexes self.startup_timer.intermediate("configuration") ################# ZOOKEEPER # for now, zookeeper will be an optional part of the stack. # if it's not configured, we will grab the expected config from the # [live_config] section of the ini file zk_hosts = self.config.get("zookeeper_connection_string") if zk_hosts: from r2.lib.zookeeper import (connect_to_zookeeper, LiveConfig, LiveList) zk_username = self.config["zookeeper_username"] zk_password = self.config["zookeeper_password"] self.zookeeper = connect_to_zookeeper(zk_hosts, (zk_username, zk_password)) self.live_config = LiveConfig(self.zookeeper, LIVE_CONFIG_NODE) self.secrets = fetch_secrets(self.zookeeper) self.throttles = LiveList(self.zookeeper, "/throttles", map_fn=ipaddress.ip_network, reduce_fn=ipaddress.collapse_addresses) # close our zk connection when the app shuts down SHUTDOWN_CALLBACKS.append(self.zookeeper.stop) else: self.zookeeper = None parser = ConfigParser.RawConfigParser() parser.optionxform = str parser.read([self.config["__file__"]]) self.live_config = extract_live_config(parser, self.plugins) self.secrets = extract_secrets(parser) self.throttles = tuple() # immutable since it's not real ################# PRIVILEGED USERS self.admins = PermissionFilteredEmployeeList( self.live_config, type="admin") self.sponsors = PermissionFilteredEmployeeList( self.live_config, type="sponsor") self.employees = PermissionFilteredEmployeeList( self.live_config, type="employee") # Store which OAuth clients employees may use, the keys are just for # readability. self.employee_approved_clients = \ self.live_config["employee_approved_clients"].values() self.startup_timer.intermediate("zookeeper") ################# MEMCACHE num_mc_clients = self.num_mc_clients # the main memcache pool. used for most everything. 
memcaches = CMemcache( "main", self.memcaches, min_compress_len=1400, num_clients=num_mc_clients, validators=[validate_size_error], ) # a pool just used for @memoize results memoizecaches = CMemcache( "memoize", self.memoizecaches, min_compress_len=50 * 1024, num_clients=num_mc_clients, validators=[validate_size_error], ) # a pool just for srmember rels srmembercaches = CMemcache( "srmember", self.srmembercaches, min_compress_len=96, num_clients=num_mc_clients, validators=[validate_size_error], ) # a pool just for rels relcaches = CMemcache( "rel", self.relcaches, min_compress_len=96, num_clients=num_mc_clients, validators=[validate_size_error], ) ratelimitcaches = CMemcache( "ratelimit", self.ratelimitcaches, min_compress_len=96, num_clients=num_mc_clients, validators=[validate_size_error], ) # a smaller pool of caches used only for distributed locks. self.lock_cache = CMemcache( "lock", self.lockcaches, num_clients=num_mc_clients, validators=[validate_size_error], ) self.make_lock = make_lock_factory(self.lock_cache, self.stats) # memcaches used in front of the permacache CF in cassandra. # XXX: this is a legacy thing; permacache was made when C* didn't have # a row cache. permacache_memcaches = CMemcache("perma", self.permacache_memcaches, min_compress_len=1400, num_clients=num_mc_clients, validators=[],) # the stalecache is a memcached local to the current app server used # for data that's frequently fetched but doesn't need to be fresh. if self.stalecaches: stalecaches = CMemcache( "stale", self.stalecaches, num_clients=num_mc_clients, validators=[validate_size_error], ) else: stalecaches = None # hardcache memcache pool hardcache_memcaches = CMemcache( "hardcache", self.hardcache_memcaches, binary=True, min_compress_len=1400, num_clients=num_mc_clients, validators=[validate_size_error], ) self.startup_timer.intermediate("memcache") ################# MCROUTER self.mcrouter = Mcrouter( "mcrouter", self.mcrouter_addr, min_compress_len=1400, num_clients=1, ) ################# THRIFT-BASED SERVICES activity_endpoint = self.config.get("activity_endpoint") if activity_endpoint: # make ActivityInfo objects rendercache-key friendly # TODO: figure out a more general solution for this if # we need to do this for other thrift-generated objects ActivityInfo.cache_key = lambda self, style: repr(self) activity_pool = ThriftConnectionPool(activity_endpoint, timeout=0.1) self.baseplate.add_to_context("activity_service", ThriftContextFactory(activity_pool, ActivityService.Client)) self.startup_timer.intermediate("thrift") ################# CASSANDRA keyspace = "reddit" self.cassandra_pools = { "main": StatsCollectingConnectionPool( keyspace, stats=self.stats, logging_name="main", server_list=self.cassandra_seeds, pool_size=self.cassandra_pool_size, timeout=4, max_retries=3, prefill=False ), } permacache_cf = Permacache._setup_column_family( 'permacache', self.cassandra_pools[self.cassandra_default_pool], ) self.startup_timer.intermediate("cassandra") ################# POSTGRES self.dbm = self.load_db_params() self.startup_timer.intermediate("postgres") ################# CHAINS # initialize caches. 
Any cache-chains built here must be added # to cache_chains (closed around by reset_caches) so that they # can properly reset their local components cache_chains = {} localcache_cls = (SelfEmptyingCache if self.running_as_script else LocalCache) if stalecaches: self.cache = StaleCacheChain( localcache_cls(), stalecaches, memcaches, ) else: self.cache = CacheChain((localcache_cls(), memcaches)) cache_chains.update(cache=self.cache) if stalecaches: self.thingcache = StaleCacheChain( localcache_cls(), stalecaches, self.mcrouter, ) else: self.thingcache = CacheChain((localcache_cls(), self.mcrouter)) cache_chains.update(thingcache=self.thingcache) if stalecaches: self.memoizecache = StaleCacheChain( localcache_cls(), stalecaches, memoizecaches, ) else: self.memoizecache = MemcacheChain( (localcache_cls(), memoizecaches)) cache_chains.update(memoizecache=self.memoizecache) if stalecaches: self.srmembercache = StaleCacheChain( localcache_cls(), stalecaches, srmembercaches, ) else: self.srmembercache = MemcacheChain( (localcache_cls(), srmembercaches)) cache_chains.update(srmembercache=self.srmembercache) if stalecaches: self.relcache = StaleCacheChain( localcache_cls(), stalecaches, relcaches, ) else: self.relcache = MemcacheChain( (localcache_cls(), relcaches)) cache_chains.update(relcache=self.relcache) self.ratelimitcache = MemcacheChain( (localcache_cls(), ratelimitcaches)) cache_chains.update(ratelimitcache=self.ratelimitcache) # rendercache holds rendered partial templates. self.rendercache = MemcacheChain(( localcache_cls(), self.mcrouter, )) cache_chains.update(rendercache=self.rendercache) # pagecaches hold fully rendered pages (includes comment panes) self.pagecache = MemcacheChain(( localcache_cls(), self.mcrouter, )) cache_chains.update(pagecache=self.pagecache) # cassandra_local_cache is used for request-local caching in tdb_cassandra self.cassandra_local_cache = localcache_cls() cache_chains.update(cassandra_local_cache=self.cassandra_local_cache) if stalecaches: permacache_cache = StaleCacheChain( localcache_cls(), stalecaches, permacache_memcaches, ) else: permacache_cache = CacheChain( (localcache_cls(), permacache_memcaches), ) cache_chains.update(permacache=permacache_cache) self.permacache = Permacache( permacache_cache, permacache_cf, lock_factory=self.make_lock, ) # hardcache is used for various things that tend to expire # TODO: replace hardcache w/ cassandra stuff self.hardcache = HardcacheChain( (localcache_cls(), hardcache_memcaches, HardCache(self)), cache_negative_results=True, ) cache_chains.update(hardcache=self.hardcache) # I know this sucks, but we need non-request-threads to be # able to reset the caches, so we need them be able to close # around 'cache_chains' without being able to call getattr on # 'g' def reset_caches(): for name, chain in cache_chains.iteritems(): if isinstance(chain, TransitionalCache): chain = chain.read_chain chain.reset() if isinstance(chain, LocalCache): continue elif isinstance(chain, StaleCacheChain): chain.stats = StaleCacheStats(self.stats, name) else: chain.stats = CacheStats(self.stats, name) self.cache_chains = cache_chains self.reset_caches = reset_caches self.reset_caches() self.startup_timer.intermediate("cache_chains") # try to set the source control revision numbers self.versions = {} r2_root = os.path.dirname(os.path.dirname(self.paths["root"])) r2_gitdir = os.path.join(r2_root, ".git") self.short_version = self.record_repo_version("r2", r2_gitdir) if I18N_PATH: i18n_git_path = os.path.join(os.path.dirname(I18N_PATH), 
".git") self.record_repo_version("i18n", i18n_git_path) # Initialize the amqp module globals, start the worker, etc. r2.lib.amqp.initialize(self) self.events = EventQueue() self.startup_timer.intermediate("revisions")
def setup(self): self.queues = queues.declare_queues(self) ################# PROVIDERS self.media_provider = select_provider( self.config, self.pkg_resources_working_set, "r2.provider.media", self.media_provider, ) self.startup_timer.intermediate("providers") ################# CONFIGURATION # AMQP is required if not self.amqp_host: raise ValueError("amqp_host not set in the .ini") if not self.cassandra_seeds: raise ValueError("cassandra_seeds not set in the .ini") # heavy load mode is read only mode with a different infobar if self.heavy_load_mode: self.read_only_mode = True origin_prefix = self.domain_prefix + "." if self.domain_prefix else "" self.origin = "http://" + origin_prefix + self.domain self.trusted_domains = set([self.domain]) if self.https_endpoint: https_url = urlparse(self.https_endpoint) self.trusted_domains.add(https_url.hostname) # load the unique hashed names of files under static static_files = os.path.join(self.paths.get('static_files'), 'static') names_file_path = os.path.join(static_files, 'names.json') if os.path.exists(names_file_path): with open(names_file_path) as handle: self.static_names = json.load(handle) else: self.static_names = {} # make python warnings go through the logging system logging.captureWarnings(capture=True) log = logging.getLogger('reddit') # when we're a script (paster run) just set up super simple logging if self.running_as_script: log.setLevel(logging.INFO) log.addHandler(logging.StreamHandler()) # if in debug mode, override the logging level to DEBUG if self.debug: log.setLevel(logging.DEBUG) # attempt to figure out which pool we're in and add that to the # LogRecords. try: with open("/etc/ec2_asg", "r") as f: pool = f.read().strip() # clean up the pool name since we're putting stuff after "-" pool = pool.partition("-")[0] except IOError: pool = "reddit-app" self.log = logging.LoggerAdapter(log, {"pool": pool}) # set locations self.locations = {} if not self.media_domain: self.media_domain = self.domain if self.media_domain == self.domain: print >> sys.stderr, ("Warning: g.media_domain == g.domain. " + "This may give untrusted content access to user cookies") for arg in sys.argv: tokens = arg.split("=") if len(tokens) == 2: k, v = tokens self.log.debug("Overriding g.%s to %s" % (k, v)) setattr(self, k, v) self.reddit_host = socket.gethostname() self.reddit_pid = os.getpid() if hasattr(signal, 'SIGUSR1'): # not all platforms have user signals signal.signal(signal.SIGUSR1, thread_dump) locale.setlocale(locale.LC_ALL, self.locale) # Pre-calculate ratelimit values self.RL_RESET_SECONDS = self.config["RL_RESET_MINUTES"] * 60 self.RL_MAX_REQS = int(self.config["RL_AVG_REQ_PER_SEC"] * self.RL_RESET_SECONDS) self.RL_OAUTH_RESET_SECONDS = self.config["RL_OAUTH_RESET_MINUTES"] * 60 self.RL_OAUTH_MAX_REQS = int(self.config["RL_OAUTH_AVG_REQ_PER_SEC"] * self.RL_OAUTH_RESET_SECONDS) self.startup_timer.intermediate("configuration") ################# ZOOKEEPER # for now, zookeeper will be an optional part of the stack. 
# if it's not configured, we will grab the expected config from the # [live_config] section of the ini file zk_hosts = self.config.get("zookeeper_connection_string") if zk_hosts: from r2.lib.zookeeper import (connect_to_zookeeper, LiveConfig, LiveList) zk_username = self.config["zookeeper_username"] zk_password = self.config["zookeeper_password"] self.zookeeper = connect_to_zookeeper(zk_hosts, (zk_username, zk_password)) self.live_config = LiveConfig(self.zookeeper, LIVE_CONFIG_NODE) self.secrets = fetch_secrets(self.zookeeper) self.throttles = LiveList(self.zookeeper, "/throttles", map_fn=ipaddress.ip_network, reduce_fn=ipaddress.collapse_addresses) else: self.zookeeper = None parser = ConfigParser.RawConfigParser() parser.optionxform = str parser.read([self.config["__file__"]]) self.live_config = extract_live_config(parser, self.plugins) self.secrets = extract_secrets(parser) self.throttles = tuple() # immutable since it's not real self.startup_timer.intermediate("zookeeper") ################# MEMCACHE num_mc_clients = self.num_mc_clients # the main memcache pool. used for most everything. self.memcache = CMemcache( self.memcaches, min_compress_len=50 * 1024, num_clients=num_mc_clients, ) # a pool just used for @memoize results memoizecaches = CMemcache( self.memoizecaches, min_compress_len=50 * 1024, num_clients=num_mc_clients, ) # a pool just for srmember rels srmembercaches = CMemcache( self.srmembercaches, min_compress_len=96, num_clients=num_mc_clients, ) ratelimitcaches = CMemcache( self.ratelimitcaches, min_compress_len=96, num_clients=num_mc_clients, ) # a smaller pool of caches used only for distributed locks. # TODO: move this to ZooKeeper self.lock_cache = CMemcache(self.lockcaches, num_clients=num_mc_clients) self.make_lock = make_lock_factory(self.lock_cache, self.stats) # memcaches used in front of the permacache CF in cassandra. # XXX: this is a legacy thing; permacache was made when C* didn't have # a row cache. if self.permacache_memcaches: permacache_memcaches = CMemcache(self.permacache_memcaches, min_compress_len=50 * 1024, num_clients=num_mc_clients) else: permacache_memcaches = None # the stalecache is a memcached local to the current app server used # for data that's frequently fetched but doesn't need to be fresh. if self.stalecaches: stalecaches = CMemcache(self.stalecaches, num_clients=num_mc_clients) else: stalecaches = None # rendercache holds rendered partial templates. rendercaches = CMemcache( self.rendercaches, noreply=True, no_block=True, num_clients=num_mc_clients, min_compress_len=480, ) # pagecaches hold fully rendered pages pagecaches = CMemcache( self.pagecaches, noreply=True, no_block=True, num_clients=num_mc_clients, min_compress_len=1400, ) self.startup_timer.intermediate("memcache") ################# CASSANDRA keyspace = "reddit" self.cassandra_pools = { "main": StatsCollectingConnectionPool( keyspace, stats=self.stats, logging_name="main", server_list=self.cassandra_seeds, pool_size=self.cassandra_pool_size, timeout=4, max_retries=3, prefill=False ), } permacache_cf = CassandraCache( 'permacache', self.cassandra_pools[self.cassandra_default_pool], read_consistency_level=self.cassandra_rcl, write_consistency_level=self.cassandra_wcl ) self.startup_timer.intermediate("cassandra") ################# POSTGRES self.dbm = self.load_db_params() self.startup_timer.intermediate("postgres") ################# CHAINS # initialize caches. 
Any cache-chains built here must be added # to cache_chains (closed around by reset_caches) so that they # can properly reset their local components cache_chains = {} localcache_cls = (SelfEmptyingCache if self.running_as_script else LocalCache) if stalecaches: self.cache = StaleCacheChain( localcache_cls(), stalecaches, self.memcache, ) else: self.cache = MemcacheChain((localcache_cls(), self.memcache)) cache_chains.update(cache=self.cache) if stalecaches: self.memoizecache = StaleCacheChain( localcache_cls(), stalecaches, memoizecaches, ) else: self.memoizecache = MemcacheChain( (localcache_cls(), memoizecaches)) cache_chains.update(memoizecache=self.memoizecache) if stalecaches: self.srmembercache = StaleCacheChain( localcache_cls(), stalecaches, srmembercaches, ) else: self.srmembercache = MemcacheChain( (localcache_cls(), srmembercaches)) cache_chains.update(srmembercache=self.srmembercache) self.ratelimitcache = MemcacheChain( (localcache_cls(), ratelimitcaches)) cache_chains.update(ratelimitcache=self.ratelimitcache) self.rendercache = MemcacheChain(( localcache_cls(), rendercaches, )) cache_chains.update(rendercache=self.rendercache) self.pagecache = MemcacheChain(( localcache_cls(), pagecaches, )) cache_chains.update(pagecache=self.pagecache) # the thing_cache is used in tdb_cassandra. self.thing_cache = CacheChain((localcache_cls(),)) cache_chains.update(thing_cache=self.thing_cache) self.permacache = CassandraCacheChain( localcache_cls(), permacache_cf, memcache=permacache_memcaches, lock_factory=self.make_lock, ) cache_chains.update(permacache=self.permacache) # hardcache is used for various things that tend to expire # TODO: replace hardcache w/ cassandra stuff self.hardcache = HardcacheChain( (localcache_cls(), self.memcache, HardCache(self)), cache_negative_results=True, ) cache_chains.update(hardcache=self.hardcache) # I know this sucks, but we need non-request-threads to be # able to reset the caches, so we need them be able to close # around 'cache_chains' without being able to call getattr on # 'g' def reset_caches(): for name, chain in cache_chains.iteritems(): chain.reset() chain.stats = CacheStats(self.stats, name) self.cache_chains = cache_chains self.reset_caches = reset_caches self.reset_caches() self.startup_timer.intermediate("cache_chains") # try to set the source control revision numbers self.versions = {} r2_root = os.path.dirname(os.path.dirname(self.paths["root"])) r2_gitdir = os.path.join(r2_root, ".git") self.short_version = self.record_repo_version("r2", r2_gitdir) if I18N_PATH: i18n_git_path = os.path.join(os.path.dirname(I18N_PATH), ".git") self.record_repo_version("i18n", i18n_git_path) self.startup_timer.intermediate("revisions")
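
# The sys.argv loop above means any "key=value" token on the command line of
# a script run overrides the corresponding attribute on g (as a string). A
# standalone sketch of that behaviour, using a plain object in place of g:

def apply_overrides(obj, argv):
    for arg in argv:
        tokens = arg.split("=")
        if len(tokens) == 2:
            key, value = tokens
            setattr(obj, key, value)   # note: stored as a string, not coerced

class FakeGlobals(object):
    read_only_mode = False

g = FakeGlobals()
apply_overrides(g, ["read_only_mode=true"])
# g.read_only_mode is now the string "true"; callers must coerce the type
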
def __init__(self, global_conf, app_conf, paths, **extra):
    """
    Globals acts as a container for objects available throughout
    the life of the application.

    One instance of Globals is created by Pylons during
    application initialization and is available during requests
    via the 'g' variable.

    ``global_conf``
        The same variable used throughout ``config/middleware.py``
        namely, the variables from the ``[DEFAULT]`` section of the
        configuration file.

    ``app_conf``
        The same ``kw`` dictionary used throughout
        ``config/middleware.py`` namely, the variables from the
        section in the config file for your application.

    ``extra``
        The configuration returned from ``load_config`` in
        ``config/middleware.py`` which may be of use in the setup of
        your global variables.

    """
    # slop over all variables to start with
    for k, v in global_conf.iteritems():
        if not k.startswith("_") and not hasattr(self, k):
            if k in self.int_props:
                v = int(v)
            elif k in self.float_props:
                v = float(v)
            elif k in self.bool_props:
                v = self.to_bool(v)
            elif k in self.tuple_props:
                v = tuple(self.to_iter(v))
            setattr(self, k, v)

    self.running_as_script = global_conf.get('running_as_script', False)

    # initialize caches. Any cache-chains built here must be added
    # to cache_chains (closed around by reset_caches) so that they
    # can properly reset their local components
    localcache_cls = (SelfEmptyingCache if self.running_as_script
                      else LocalCache)
    num_mc_clients = self.num_mc_clients
    self.cache_chains = []

    self.permacache = self.init_cass_cache('permacache',
                                           self.permacache_memcaches,
                                           self.cassandra_seeds)
    self.urlcache = self.init_cass_cache('urls',
                                         self.url_caches,
                                         self.url_seeds)

    # hardcache is done after the db info is loaded, and then the
    # chains are reset to use the appropriate initial entries
    self.cache = self.init_memcached(self.memcaches)
    self.memcache = self.cache.caches[1]  # used by lock.py
    self.rendercache = self.init_memcached(self.rendercaches,
                                           noreply=True, no_block=True)
    self.servicecache = self.init_memcached(self.servicecaches)
    self.make_lock = make_lock_factory(self.memcache)

    # set default time zone if one is not set
    tz = global_conf.get('timezone')
    dtz = global_conf.get('display_timezone', tz)
    self.tz = pytz.timezone(tz)
    self.display_tz = pytz.timezone(dtz)

    # load the database info
    self.dbm = self.load_db_params(global_conf)

    # can't do this until load_db_params() has been called
    self.hardcache = HardcacheChain(
        (localcache_cls(), self.memcache, HardCache(self)),
        cache_negative_results=True)
    self.cache_chains.append(self.hardcache)

    # I know this sucks, but we need non-request-threads to be
    # able to reset the caches, so we need them be able to close
    # around 'cache_chains' without being able to call getattr on
    # 'g'
    cache_chains = self.cache_chains[::]
    def reset_caches():
        for chain in cache_chains:
            chain.reset()
    self.reset_caches = reset_caches
    self.reset_caches()

    # make a query cache
    self.stats_collector = QueryStats()

    # set the modwindow
    self.MODWINDOW = timedelta(self.MODWINDOW)

    self.REDDIT_MAIN = bool(os.environ.get('REDDIT_MAIN'))

    # turn on for language support
    self.languages, self.lang_name = \
        get_active_langs(default_lang=self.lang)

    all_languages = self.lang_name.keys()
    all_languages.sort()
    self.all_languages = all_languages

    self.paths = paths

    # load the md5 hashes of files under static
    static_files = os.path.join(paths.get('static_files'), 'static')
    self.static_md5 = {}
    if os.path.exists(static_files):
        for f in os.listdir(static_files):
            if f.endswith('.md5'):
                # take the filename minus the ".md5" suffix
                # (str.strip would also eat characters from the name itself)
                key = f[:-len('.md5')]
                f = os.path.join(static_files, f)
                with open(f, 'r') as handle:
                    md5 = handle.read().strip('\n')
                self.static_md5[key] = md5

    # set up the logging directory
    log_path = self.log_path
    process_iden = global_conf.get('scgi_port', 'default')
    self.reddit_port = process_iden
    if log_path:
        if not os.path.exists(log_path):
            os.makedirs(log_path)
        for fname in os.listdir(log_path):
            if fname.startswith(process_iden):
                full_name = os.path.join(log_path, fname)
                os.remove(full_name)

    # setup the logger
    self.log = logging.getLogger('reddit')
    self.log.addHandler(logging.StreamHandler())
    if self.debug:
        self.log.setLevel(logging.DEBUG)
    else:
        self.log.setLevel(logging.WARNING)

    # set log level for pycountry which is chatty
    logging.getLogger('pycountry.db').setLevel(logging.CRITICAL)

    if not self.media_domain:
        self.media_domain = self.domain
    if self.media_domain == self.domain:
        print("Warning: g.media_domain == g.domain. " +
              "This may give untrusted content access to user cookies")

    # read in our CSS so that it can become a default for subreddit
    # stylesheets
    stylesheet_path = os.path.join(paths.get('static_files'),
                                   self.static_path.lstrip('/'),
                                   self.stylesheet)
    with open(stylesheet_path) as s:
        self.default_stylesheet = s.read()

    self.profanities = None
    if self.profanity_wordlist and os.path.exists(self.profanity_wordlist):
        with open(self.profanity_wordlist, 'r') as handle:
            words = []
            for line in handle:
                words.append(line.strip(' \n\r'))
            if words:
                self.profanities = re.compile(
                    r"\b(%s)\b" % '|'.join(words), re.I | re.U)

    self.reddit_host = socket.gethostname()
    self.reddit_pid = os.getpid()

    # the shutdown toggle
    self.shutdown = False

    # if we're going to use the query_queue, we need amqp
    if self.write_query_queue and not self.amqp_host:
        raise Exception("amqp_host must be defined to use the query queue")

    # try to set the source control revision number
    try:
        popen = subprocess.Popen(
            ["git", "log", "--date=short", "--pretty=format:%H %h", '-n1'],
            stdin=subprocess.PIPE, stdout=subprocess.PIPE)
        resp, stderrdata = popen.communicate()
        resp = resp.strip().split(' ')
        self.version, self.short_version = resp
    except Exception, e:
        self.log.info("Couldn't read source revision (%r)" % e)
        self.version = self.short_version = '(unknown)'
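
# The profanity_wordlist handling above joins the word list into a single
# case-insensitive alternation wrapped in word boundaries. A small standalone
# sketch of that construction and how such a pattern matches (the word list
# here is made up):

import re

words = ["foo", "bar"]
profanities = re.compile(r"\b(%s)\b" % '|'.join(words), re.I | re.U)

profanities.search("some Foo here")   # matches thanks to re.I
profanities.search("food")            # no match: \b requires a word boundary
# note: words containing regex metacharacters would need re.escape() first
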
def setup(self): self.queues = queues.declare_queues(self) ################# CONFIGURATION # AMQP is required if not self.amqp_host: raise ValueError("amqp_host not set in the .ini") # This requirement doesn't *have* to be a requirement, but there are # bugs at the moment that will pop up if you violate it # XXX: get rid of these options. new query cache is always on. if self.write_query_queue and not self.use_query_cache: raise Exception("write_query_queue requires use_query_cache") if not self.cassandra_seeds: raise ValueError("cassandra_seeds not set in the .ini") # heavy load mode is read only mode with a different infobar if self.heavy_load_mode: self.read_only_mode = True origin_prefix = self.domain_prefix + "." if self.domain_prefix else "" self.origin = "http://" + origin_prefix + self.domain self.secure_domains = set([urlparse(self.payment_domain).netloc]) self.trusted_domains = set([self.domain]) self.trusted_domains.update(self.authorized_cnames) if self.https_endpoint: https_url = urlparse(self.https_endpoint) self.secure_domains.add(https_url.netloc) self.trusted_domains.add(https_url.hostname) if getattr(self, 'oauth_domain', None): self.secure_domains.add(self.oauth_domain) # load the unique hashed names of files under static static_files = os.path.join(self.paths.get('static_files'), 'static') names_file_path = os.path.join(static_files, 'names.json') if os.path.exists(names_file_path): with open(names_file_path) as handle: self.static_names = json.load(handle) else: self.static_names = {} #setup the logger self.log = logging.getLogger('reddit') self.log.addHandler(logging.StreamHandler()) if self.debug: self.log.setLevel(logging.DEBUG) else: self.log.setLevel(logging.INFO) # set log level for pycountry which is chatty logging.getLogger('pycountry.db').setLevel(logging.CRITICAL) if not self.media_domain: self.media_domain = self.domain if self.media_domain == self.domain: print ("Warning: g.media_domain == g.domain. " + "This may give untrusted content access to user cookies") for arg in sys.argv: tokens = arg.split("=") if len(tokens) == 2: k, v = tokens self.log.debug("Overriding g.%s to %s" % (k, v)) setattr(self, k, v) self.reddit_host = socket.gethostname() self.reddit_pid = os.getpid() if hasattr(signal, 'SIGUSR1'): # not all platforms have user signals signal.signal(signal.SIGUSR1, thread_dump) self.startup_timer.intermediate("configuration") ################# ZOOKEEPER # for now, zookeeper will be an optional part of the stack. # if it's not configured, we will grab the expected config from the # [live_config] section of the ini file zk_hosts = self.config.get("zookeeper_connection_string") if zk_hosts: from r2.lib.zookeeper import (connect_to_zookeeper, LiveConfig, LiveList) zk_username = self.config["zookeeper_username"] zk_password = self.config["zookeeper_password"] self.zookeeper = connect_to_zookeeper(zk_hosts, (zk_username, zk_password)) self.live_config = LiveConfig(self.zookeeper, LIVE_CONFIG_NODE) self.throttles = LiveList(self.zookeeper, "/throttles", map_fn=ipaddress.ip_network, reduce_fn=ipaddress.collapse_addresses) else: self.zookeeper = None parser = ConfigParser.RawConfigParser() parser.read([self.config["__file__"]]) self.live_config = extract_live_config(parser, self.plugins) self.throttles = tuple() # immutable since it's not real self.startup_timer.intermediate("zookeeper") ################# MEMCACHE num_mc_clients = self.num_mc_clients # the main memcache pool. used for most everything. 
self.memcache = CMemcache(self.memcaches, num_clients=num_mc_clients) # a smaller pool of caches used only for distributed locks. # TODO: move this to ZooKeeper self.lock_cache = CMemcache(self.lockcaches, num_clients=num_mc_clients) self.make_lock = make_lock_factory(self.lock_cache, self.stats) # memcaches used in front of the permacache CF in cassandra. # XXX: this is a legacy thing; permacache was made when C* didn't have # a row cache. if self.permacache_memcaches: permacache_memcaches = CMemcache(self.permacache_memcaches, num_clients=num_mc_clients) else: permacache_memcaches = None # the stalecache is a memcached local to the current app server used # for data that's frequently fetched but doesn't need to be fresh. if self.stalecaches: stalecaches = CMemcache(self.stalecaches, num_clients=num_mc_clients) else: stalecaches = None # rendercache holds rendered partial templates. rendercaches = CMemcache( self.rendercaches, noreply=True, no_block=True, num_clients=num_mc_clients, ) # pagecaches hold fully rendered pages pagecaches = CMemcache( self.pagecaches, noreply=True, no_block=True, num_clients=num_mc_clients, ) self.startup_timer.intermediate("memcache") ################# CASSANDRA keyspace = "reddit" self.cassandra_pools = { "main": StatsCollectingConnectionPool( keyspace, stats=self.stats, logging_name="main", server_list=self.cassandra_seeds, pool_size=self.cassandra_pool_size, timeout=4, max_retries=3, prefill=False ), } permacache_cf = CassandraCache( 'permacache', self.cassandra_pools[self.cassandra_default_pool], read_consistency_level=self.cassandra_rcl, write_consistency_level=self.cassandra_wcl ) self.startup_timer.intermediate("cassandra") ################# POSTGRES event.listens_for(engine.Engine, 'before_cursor_execute')( self.stats.pg_before_cursor_execute) event.listens_for(engine.Engine, 'after_cursor_execute')( self.stats.pg_after_cursor_execute) self.dbm = self.load_db_params() self.startup_timer.intermediate("postgres") ################# CHAINS # initialize caches. Any cache-chains built here must be added # to cache_chains (closed around by reset_caches) so that they # can properly reset their local components self.cache_chains = {} localcache_cls = (SelfEmptyingCache if self.running_as_script else LocalCache) if stalecaches: self.cache = StaleCacheChain( localcache_cls(), stalecaches, self.memcache, ) else: self.cache = MemcacheChain((localcache_cls(), self.memcache)) self.cache_chains.update(cache=self.cache) self.rendercache = MemcacheChain(( localcache_cls(), rendercaches, )) self.cache_chains.update(rendercache=self.rendercache) self.pagecache = MemcacheChain(( localcache_cls(), pagecaches, )) self.cache_chains.update(pagecache=self.pagecache) # the thing_cache is used in tdb_cassandra. 
self.thing_cache = CacheChain((localcache_cls(),)) self.cache_chains.update(thing_cache=self.thing_cache) self.permacache = CassandraCacheChain( localcache_cls(), permacache_cf, memcache=permacache_memcaches, lock_factory=self.make_lock, ) self.cache_chains.update(permacache=self.permacache) # hardcache is used for various things that tend to expire # TODO: replace hardcache w/ cassandra stuff self.hardcache = HardcacheChain( (localcache_cls(), self.memcache, HardCache(self)), cache_negative_results=True, ) self.cache_chains.update(hardcache=self.hardcache) # I know this sucks, but we need non-request-threads to be # able to reset the caches, so we need them be able to close # around 'cache_chains' without being able to call getattr on # 'g' cache_chains = self.cache_chains.copy() def reset_caches(): for name, chain in cache_chains.iteritems(): chain.reset() chain.stats = CacheStats(self.stats, name) self.reset_caches = reset_caches self.reset_caches() self.startup_timer.intermediate("cache_chains") # try to set the source control revision numbers self.versions = {} r2_root = os.path.dirname(os.path.dirname(self.paths["root"])) r2_gitdir = os.path.join(r2_root, ".git") self.short_version = self.record_repo_version("r2", r2_gitdir) if I18N_PATH: i18n_git_path = os.path.join(os.path.dirname(I18N_PATH), ".git") self.record_repo_version("i18n", i18n_git_path) self.startup_timer.intermediate("revisions")
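
# The throttles LiveList above runs every stored entry through
# ipaddress.ip_network and then collapses the whole list with
# ipaddress.collapse_addresses. A standalone sketch of what those two
# callables do to the raw strings (example addresses are made up):

import ipaddress

raw_entries = [u"10.0.0.0/25", u"10.0.0.128/25", u"192.0.2.7"]
networks = [ipaddress.ip_network(entry) for entry in raw_entries]
collapsed = list(ipaddress.collapse_addresses(networks))
# collapsed now holds just 10.0.0.0/24 and 192.0.2.7/32: adjacent networks
# are merged, and a bare address becomes a /32
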
def __init__(self, global_conf, app_conf, paths, **extra):
    """
    Globals acts as a container for objects available throughout
    the life of the application.

    One instance of Globals is created by Pylons during
    application initialization and is available during requests
    via the 'g' variable.

    ``global_conf``
        The same variable used throughout ``config/middleware.py``
        namely, the variables from the ``[DEFAULT]`` section of the
        configuration file.

    ``app_conf``
        The same ``kw`` dictionary used throughout
        ``config/middleware.py`` namely, the variables from the
        section in the config file for your application.

    ``extra``
        The configuration returned from ``load_config`` in
        ``config/middleware.py`` which may be of use in the setup of
        your global variables.

    """
    def to_bool(x):
        return (x.lower() == 'true') if x else None

    def to_iter(name, delim=','):
        return (x.strip() for x in global_conf.get(name, '').split(delim))

    # slop over all variables to start with
    for k, v in global_conf.iteritems():
        if not k.startswith("_") and not hasattr(self, k):
            if k in self.int_props:
                v = int(v)
            elif k in self.bool_props:
                v = to_bool(v)
            elif k in self.tuple_props:
                v = tuple(to_iter(k))
            setattr(self, k, v)

    # initialize caches
    mc = Memcache(self.memcaches)
    self.cache = CacheChain((LocalCache(), mc))
    self.permacache = Memcache(self.permacaches)
    self.rendercache = Memcache(self.rendercaches)
    self.make_lock = make_lock_factory(mc)
    self.rec_cache = Memcache(self.rec_cache)

    # set default time zone if one is not set
    self.tz = pytz.timezone(global_conf.get('timezone'))

    # make a query cache
    self.stats_collector = QueryStats()

    # set the modwindow
    self.MODWINDOW = timedelta(self.MODWINDOW)

    self.REDDIT_MAIN = bool(os.environ.get('REDDIT_MAIN'))

    # turn on for language support
    self.languages, self.lang_name = _get_languages()

    all_languages = self.lang_name.keys()
    all_languages.sort()
    self.all_languages = all_languages

    # load the md5 hashes of files under static
    static_files = os.path.join(paths.get('static_files'), 'static')
    self.static_md5 = {}
    if os.path.exists(static_files):
        for f in os.listdir(static_files):
            if f.endswith('.md5'):
                key = f[0:-4]
                f = os.path.join(static_files, f)
                with open(f, 'r') as handle:
                    md5 = handle.read().strip('\n')
                self.static_md5[key] = md5

    # set up the logging directory
    log_path = self.log_path
    process_iden = global_conf.get('scgi_port', 'default')
    if log_path:
        if not os.path.exists(log_path):
            os.makedirs(log_path)
        for fname in os.listdir(log_path):
            if fname.startswith(process_iden):
                full_name = os.path.join(log_path, fname)
                os.remove(full_name)

    # setup the logger
    self.log = logging.getLogger('reddit')
    self.log.addHandler(logging.StreamHandler())
    if self.debug:
        self.log.setLevel(logging.DEBUG)

    # read in our CSS so that it can become a default for subreddit
    # stylesheets
    stylesheet_path = os.path.join(paths.get('static_files'),
                                   self.static_path.lstrip('/'),
                                   self.stylesheet)
    with open(stylesheet_path) as s:
        self.default_stylesheet = s.read()

    self.reddit_host = socket.gethostname()
    self.reddit_pid = os.getpid()
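
# The to_bool/to_iter helpers above turn raw .ini strings into usable Python
# values before they are slopped onto g. A standalone sketch of the same
# coercions with a made-up config dict standing in for global_conf (here
# to_iter takes the value directly rather than looking up a key name):

def to_bool(x):
    return (x.lower() == 'true') if x else None

def to_iter(value, delim=','):
    return (part.strip() for part in value.split(delim))

conf = {'read_only_mode': 'True',
        'memcaches': '127.0.0.1:11211, 127.0.0.1:11212'}

read_only = to_bool(conf['read_only_mode'])        # True
memcaches = tuple(to_iter(conf['memcaches']))      # two cleaned host:port strings
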
def setup(self): self.queues = queues.declare_queues(self) ################# CONFIGURATION # AMQP is required if not self.amqp_host: raise ValueError("amqp_host not set in the .ini") if not self.cassandra_seeds: raise ValueError("cassandra_seeds not set in the .ini") # heavy load mode is read only mode with a different infobar if self.heavy_load_mode: self.read_only_mode = True origin_prefix = self.domain_prefix + "." if self.domain_prefix else "" self.origin = "http://" + origin_prefix + self.domain self.secure_domains = set([urlparse(self.payment_domain).netloc]) self.trusted_domains = set([self.domain]) self.trusted_domains.update(self.authorized_cnames) if self.https_endpoint: https_url = urlparse(self.https_endpoint) self.secure_domains.add(https_url.netloc) self.trusted_domains.add(https_url.hostname) if getattr(self, "oauth_domain", None): self.secure_domains.add(self.oauth_domain) # load the unique hashed names of files under static static_files = os.path.join(self.paths.get("static_files"), "static") names_file_path = os.path.join(static_files, "names.json") if os.path.exists(names_file_path): with open(names_file_path) as handle: self.static_names = json.load(handle) else: self.static_names = {} # if we're a web app running on old uwsgi, force load the logging # config from the file since uwsgi didn't do it for us if not self.running_as_script and self.old_uwsgi_load_logging_config: logging.config.fileConfig(self.config["__file__"]) # make python warnings go through the logging system logging.captureWarnings(capture=True) log = logging.getLogger("reddit") # when we're a script (paster run) just set up super simple logging if self.running_as_script: log.setLevel(logging.INFO) log.addHandler(logging.StreamHandler()) # if in debug mode, override the logging level to DEBUG if self.debug: log.setLevel(logging.DEBUG) # attempt to figure out which pool we're in and add that to the # LogRecords. try: with open("/etc/ec2_asg", "r") as f: pool = f.read().strip() # clean up the pool name since we're putting stuff after "-" pool = pool.partition("-")[0] except IOError: pool = "reddit-app" self.log = logging.LoggerAdapter(log, {"pool": pool}) # make cssutils use the real logging system csslog = logging.getLogger("cssutils") cssutils.log.setLog(csslog) # load the country list countries_file_path = os.path.join(static_files, "countries.json") try: with open(countries_file_path) as handle: self.countries = json.load(handle) self.log.debug("Using countries.json.") except IOError: self.log.warning("Couldn't find countries.json. Using pycountry.") self.countries = get_countries_and_codes() if not self.media_domain: self.media_domain = self.domain if self.media_domain == self.domain: print("Warning: g.media_domain == g.domain. " + "This may give untrusted content access to user cookies") for arg in sys.argv: tokens = arg.split("=") if len(tokens) == 2: k, v = tokens self.log.debug("Overriding g.%s to %s" % (k, v)) setattr(self, k, v) self.reddit_host = socket.gethostname() self.reddit_pid = os.getpid() if hasattr(signal, "SIGUSR1"): # not all platforms have user signals signal.signal(signal.SIGUSR1, thread_dump) self.startup_timer.intermediate("configuration") ################# ZOOKEEPER # for now, zookeeper will be an optional part of the stack. 
# if it's not configured, we will grab the expected config from the # [live_config] section of the ini file zk_hosts = self.config.get("zookeeper_connection_string") if zk_hosts: from r2.lib.zookeeper import connect_to_zookeeper, LiveConfig, LiveList zk_username = self.config["zookeeper_username"] zk_password = self.config["zookeeper_password"] self.zookeeper = connect_to_zookeeper(zk_hosts, (zk_username, zk_password)) self.live_config = LiveConfig(self.zookeeper, LIVE_CONFIG_NODE) self.throttles = LiveList( self.zookeeper, "/throttles", map_fn=ipaddress.ip_network, reduce_fn=ipaddress.collapse_addresses ) self.banned_domains = LiveDict(self.zookeeper, "/banned-domains", watch=True) else: self.zookeeper = None parser = ConfigParser.RawConfigParser() parser.read([self.config["__file__"]]) self.live_config = extract_live_config(parser, self.plugins) self.throttles = tuple() # immutable since it's not real self.banned_domains = dict() self.startup_timer.intermediate("zookeeper") ################# MEMCACHE num_mc_clients = self.num_mc_clients # the main memcache pool. used for most everything. self.memcache = CMemcache(self.memcaches, num_clients=num_mc_clients) # a smaller pool of caches used only for distributed locks. # TODO: move this to ZooKeeper self.lock_cache = CMemcache(self.lockcaches, num_clients=num_mc_clients) self.make_lock = make_lock_factory(self.lock_cache, self.stats) # memcaches used in front of the permacache CF in cassandra. # XXX: this is a legacy thing; permacache was made when C* didn't have # a row cache. if self.permacache_memcaches: permacache_memcaches = CMemcache(self.permacache_memcaches, num_clients=num_mc_clients) else: permacache_memcaches = None # the stalecache is a memcached local to the current app server used # for data that's frequently fetched but doesn't need to be fresh. if self.stalecaches: stalecaches = CMemcache(self.stalecaches, num_clients=num_mc_clients) else: stalecaches = None # rendercache holds rendered partial templates. rendercaches = CMemcache(self.rendercaches, noreply=True, no_block=True, num_clients=num_mc_clients) # pagecaches hold fully rendered pages pagecaches = CMemcache(self.pagecaches, noreply=True, no_block=True, num_clients=num_mc_clients) self.startup_timer.intermediate("memcache") ################# CASSANDRA keyspace = "reddit" self.cassandra_pools = { "main": StatsCollectingConnectionPool( keyspace, stats=self.stats, logging_name="main", server_list=self.cassandra_seeds, pool_size=self.cassandra_pool_size, timeout=4, max_retries=3, prefill=False, ) } permacache_cf = CassandraCache( "permacache", self.cassandra_pools[self.cassandra_default_pool], read_consistency_level=self.cassandra_rcl, write_consistency_level=self.cassandra_wcl, ) self.startup_timer.intermediate("cassandra") ################# POSTGRES event.listens_for(engine.Engine, "before_cursor_execute")(self.stats.pg_before_cursor_execute) event.listens_for(engine.Engine, "after_cursor_execute")(self.stats.pg_after_cursor_execute) self.dbm = self.load_db_params() self.startup_timer.intermediate("postgres") ################# CHAINS # initialize caches. 
    # Any cache-chains built here must be added
    # to cache_chains (closed around by reset_caches) so that they
    # can properly reset their local components
    self.cache_chains = {}
    localcache_cls = (SelfEmptyingCache if self.running_as_script
                      else LocalCache)

    if stalecaches:
        self.cache = StaleCacheChain(localcache_cls(),
                                     stalecaches,
                                     self.memcache)
    else:
        self.cache = MemcacheChain((localcache_cls(), self.memcache))
    self.cache_chains.update(cache=self.cache)

    self.rendercache = MemcacheChain((localcache_cls(), rendercaches))
    self.cache_chains.update(rendercache=self.rendercache)

    self.pagecache = MemcacheChain((localcache_cls(), pagecaches))
    self.cache_chains.update(pagecache=self.pagecache)

    # the thing_cache is used in tdb_cassandra.
    self.thing_cache = CacheChain((localcache_cls(),))
    self.cache_chains.update(thing_cache=self.thing_cache)

    self.permacache = CassandraCacheChain(
        localcache_cls(),
        permacache_cf,
        memcache=permacache_memcaches,
        lock_factory=self.make_lock
    )
    self.cache_chains.update(permacache=self.permacache)

    # hardcache is used for various things that tend to expire
    # TODO: replace hardcache w/ cassandra stuff
    self.hardcache = HardcacheChain(
        (localcache_cls(), self.memcache, HardCache(self)),
        cache_negative_results=True)
    self.cache_chains.update(hardcache=self.hardcache)

    # I know this sucks, but we need non-request-threads to be
    # able to reset the caches, so we need them be able to close
    # around 'cache_chains' without being able to call getattr on
    # 'g'
    cache_chains = self.cache_chains.copy()

    def reset_caches():
        for name, chain in cache_chains.iteritems():
            chain.reset()
            chain.stats = CacheStats(self.stats, name)

    self.reset_caches = reset_caches
    self.reset_caches()

    self.startup_timer.intermediate("cache_chains")

    # try to set the source control revision numbers
    self.versions = {}
    r2_root = os.path.dirname(os.path.dirname(self.paths["root"]))
    r2_gitdir = os.path.join(r2_root, ".git")
    self.short_version = self.record_repo_version("r2", r2_gitdir)

    if I18N_PATH:
        i18n_git_path = os.path.join(os.path.dirname(I18N_PATH), ".git")
        self.record_repo_version("i18n", i18n_git_path)

    self.startup_timer.intermediate("revisions")
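# --- Illustrative sketch; not part of the original source. ---
# The reset_caches closure above lets non-request threads clear every chain's
# process-local layer without reaching back through 'g'. A minimal,
# self-contained version of the same pattern, with a hypothetical Chain class
# standing in for MemcacheChain and friends:
class Chain(object):
    def __init__(self):
        self.local = {}            # the per-process local component

    def reset(self):
        self.local.clear()         # only the local layer is dropped

def make_reset_fn(registry):
    # close over a snapshot so later changes to the registry don't change
    # what a background thread will reset
    chains = registry.copy()
    def reset_caches():
        for chain in chains.itervalues():
            chain.reset()
    return reset_caches

registry = {"cache": Chain(), "rendercache": Chain()}
reset_caches = make_reset_fn(registry)
reset_caches()                     # safe to call from any thread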
def setup(self): # heavy load mode is read only mode with a different infobar if self.heavy_load_mode: self.read_only_mode = True if hasattr(signal, 'SIGUSR1'): # not all platforms have user signals signal.signal(signal.SIGUSR1, thread_dump) # initialize caches. Any cache-chains built here must be added # to cache_chains (closed around by reset_caches) so that they # can properly reset their local components localcache_cls = (SelfEmptyingCache if self.running_as_script else LocalCache) num_mc_clients = self.num_mc_clients self.cache_chains = {} # for now, zookeeper will be an optional part of the stack. # if it's not configured, we will grab the expected config from the # [live_config] section of the ini file zk_hosts = self.config.get("zookeeper_connection_string") if zk_hosts: from r2.lib.zookeeper import (connect_to_zookeeper, LiveConfig, LiveList) zk_username = self.config["zookeeper_username"] zk_password = self.config["zookeeper_password"] self.zookeeper = connect_to_zookeeper(zk_hosts, (zk_username, zk_password)) self.live_config = LiveConfig(self.zookeeper, LIVE_CONFIG_NODE) self.throttles = LiveList(self.zookeeper, "/throttles", map_fn=ipaddress.ip_network, reduce_fn=ipaddress.collapse_addresses) else: self.zookeeper = None parser = ConfigParser.RawConfigParser() parser.read([self.config["__file__"]]) self.live_config = extract_live_config(parser, self.plugins) self.throttles = tuple() # immutable since it's not real self.memcache = CMemcache(self.memcaches, num_clients=num_mc_clients) self.lock_cache = CMemcache(self.lockcaches, num_clients=num_mc_clients) self.stats = Stats(self.config.get('statsd_addr'), self.config.get('statsd_sample_rate')) event.listens_for(engine.Engine, 'before_cursor_execute')( self.stats.pg_before_cursor_execute) event.listens_for(engine.Engine, 'after_cursor_execute')( self.stats.pg_after_cursor_execute) self.make_lock = make_lock_factory(self.lock_cache, self.stats) if not self.cassandra_seeds: raise ValueError("cassandra_seeds not set in the .ini") keyspace = "reddit" self.cassandra_pools = { "main": StatsCollectingConnectionPool(keyspace, stats=self.stats, logging_name="main", server_list=self.cassandra_seeds, pool_size=self.cassandra_pool_size, timeout=2, max_retries=3, prefill=False), } perma_memcache = (CMemcache(self.permacache_memcaches, num_clients=num_mc_clients) if self.permacache_memcaches else None) self.permacache = CassandraCacheChain( localcache_cls(), CassandraCache('permacache', self.cassandra_pools[self.cassandra_default_pool], read_consistency_level=self.cassandra_rcl, write_consistency_level=self.cassandra_wcl), memcache=perma_memcache, lock_factory=self.make_lock) self.cache_chains.update(permacache=self.permacache) # hardcache is done after the db info is loaded, and then the # chains are reset to use the appropriate initial entries if self.stalecaches: self.cache = StaleCacheChain( localcache_cls(), CMemcache(self.stalecaches, num_clients=num_mc_clients), self.memcache) else: self.cache = MemcacheChain((localcache_cls(), self.memcache)) self.cache_chains.update(cache=self.cache) self.rendercache = MemcacheChain( (localcache_cls(), CMemcache(self.rendercaches, noreply=True, no_block=True, num_clients=num_mc_clients))) self.cache_chains.update(rendercache=self.rendercache) self.thing_cache = CacheChain((localcache_cls(), )) self.cache_chains.update(thing_cache=self.thing_cache) #load the database info self.dbm = self.load_db_params() # can't do this until load_db_params() has been called self.hardcache = HardcacheChain( 
(localcache_cls(), self.memcache, HardCache(self)), cache_negative_results=True) self.cache_chains.update(hardcache=self.hardcache) # I know this sucks, but we need non-request-threads to be # able to reset the caches, so we need them be able to close # around 'cache_chains' without being able to call getattr on # 'g' cache_chains = self.cache_chains.copy() def reset_caches(): for name, chain in cache_chains.iteritems(): chain.reset() chain.stats = CacheStats(self.stats, name) self.reset_caches = reset_caches self.reset_caches() # set the modwindow self.MODWINDOW = timedelta(self.MODWINDOW) self.REDDIT_MAIN = bool(os.environ.get('REDDIT_MAIN')) origin_prefix = self.domain_prefix + "." if self.domain_prefix else "" self.origin = "http://" + origin_prefix + self.domain self.secure_domains = set([urlparse(self.payment_domain).netloc]) self.trusted_domains = set([self.domain]) self.trusted_domains.update(self.authorized_cnames) if self.https_endpoint: https_url = urlparse(self.https_endpoint) self.secure_domains.add(https_url.netloc) self.trusted_domains.add(https_url.hostname) if getattr(self, 'oauth_domain', None): self.secure_domains.add(self.oauth_domain) # load the unique hashed names of files under static static_files = os.path.join(self.paths.get('static_files'), 'static') names_file_path = os.path.join(static_files, 'names.json') if os.path.exists(names_file_path): with open(names_file_path) as handle: self.static_names = json.load(handle) else: self.static_names = {} #setup the logger self.log = logging.getLogger('reddit') self.log.addHandler(logging.StreamHandler()) if self.debug: self.log.setLevel(logging.DEBUG) else: self.log.setLevel(logging.INFO) # set log level for pycountry which is chatty logging.getLogger('pycountry.db').setLevel(logging.CRITICAL) if not self.media_domain: self.media_domain = self.domain if self.media_domain == self.domain: print("Warning: g.media_domain == g.domain. " + "This may give untrusted content access to user cookies") self.reddit_host = socket.gethostname() self.reddit_pid = os.getpid() for arg in sys.argv: tokens = arg.split("=") if len(tokens) == 2: k, v = tokens self.log.debug("Overriding g.%s to %s" % (k, v)) setattr(self, k, v) #if we're going to use the query_queue, we need amqp if self.write_query_queue and not self.amqp_host: raise Exception("amqp_host must be defined to use the query queue") # This requirement doesn't *have* to be a requirement, but there are # bugs at the moment that will pop up if you violate it if self.write_query_queue and not self.use_query_cache: raise Exception("write_query_queue requires use_query_cache") # try to set the source control revision numbers self.versions = {} r2_root = os.path.dirname(os.path.dirname(self.paths["root"])) r2_gitdir = os.path.join(r2_root, ".git") self.short_version = self.record_repo_version("r2", r2_gitdir) if I18N_PATH: i18n_git_path = os.path.join(os.path.dirname(I18N_PATH), ".git") self.record_repo_version("i18n", i18n_git_path) if self.log_start: self.log.error("reddit app %s:%s started %s at %s" % (self.reddit_host, self.reddit_pid, self.short_version, datetime.now()))
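# --- Illustrative sketch; not part of the original source. ---
# The secure_domains / trusted_domains sets built above lean on a urlparse
# detail: netloc keeps any explicit port while hostname does not (and is
# lowercased), which is the practical difference between the two attributes
# added to each set. The URL below is a made-up example:
from urlparse import urlparse

https_url = urlparse("https://pay.example.com:8443/checkout")
print https_url.netloc      # 'pay.example.com:8443'
print https_url.hostname    # 'pay.example.com'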
def __init__(self, global_conf, app_conf, paths, **extra): """ Globals acts as a container for objects available throughout the life of the application. One instance of Globals is created by Pylons during application initialization and is available during requests via the 'g' variable. ``global_conf`` The same variable used throughout ``config/middleware.py`` namely, the variables from the ``[DEFAULT]`` section of the configuration file. ``app_conf`` The same ``kw`` dictionary used throughout ``config/middleware.py`` namely, the variables from the section in the config file for your application. ``extra`` The configuration returned from ``load_config`` in ``config/middleware.py`` which may be of use in the setup of your global variables. """ # slop over all variables to start with for k, v in global_conf.iteritems(): if not k.startswith("_") and not hasattr(self, k): if k in self.int_props: v = int(v) elif k in self.float_props: v = float(v) elif k in self.bool_props: v = self.to_bool(v) elif k in self.tuple_props: v = tuple(self.to_iter(v)) elif k in self.choice_props: if v not in self.choice_props[k]: raise ValueError("Unknown option for %r: %r not in %r" % (k, v, self.choice_props[k])) v = self.choice_props[k][v] setattr(self, k, v) self.running_as_script = global_conf.get('running_as_script', False) # initialize caches. Any cache-chains built here must be added # to cache_chains (closed around by reset_caches) so that they # can properly reset their local components localcache_cls = (SelfEmptyingCache if self.running_as_script else LocalCache) num_mc_clients = self.num_mc_clients self.cache_chains = [] self.memcache = CMemcache(self.memcaches, num_clients = num_mc_clients) self.make_lock = make_lock_factory(self.memcache) if not self.cassandra_seeds: raise ValueError("cassandra_seeds not set in the .ini") if not self.url_seeds: raise ValueError("url_seeds not set in the .ini") self.cassandra_seeds = list(self.cassandra_seeds) random.shuffle(self.cassandra_seeds) self.cassandra = pycassa.connect_thread_local(self.cassandra_seeds) perma_memcache = (CMemcache(self.permacache_memcaches, num_clients = num_mc_clients) if self.permacache_memcaches else None) self.permacache = self.init_cass_cache('permacache', 'permacache', self.cassandra, self.make_lock, memcache = perma_memcache, read_consistency_level = self.cassandra_rcl, write_consistency_level = self.cassandra_wcl, localcache_cls = localcache_cls) self.cache_chains.append(self.permacache) self.url_seeds = list(self.url_seeds) random.shuffle(self.url_seeds) self.url_cassandra = pycassa.connect_thread_local(self.url_seeds) self.urlcache = self.init_cass_cache('urls', 'urls', self.url_cassandra, self.make_lock, # until we've merged this # with the regular # cluster, this will # always be CL_ONE read_consistency_level = CL_ONE, write_consistency_level = CL_ONE, localcache_cls = localcache_cls) self.cache_chains.append(self.urlcache) # hardcache is done after the db info is loaded, and then the # chains are reset to use the appropriate initial entries self.cache = MemcacheChain((localcache_cls(), self.memcache)) self.cache_chains.append(self.cache) self.rendercache = MemcacheChain((localcache_cls(), CMemcache(self.rendercaches, noreply=True, no_block=True, num_clients = num_mc_clients))) self.cache_chains.append(self.rendercache) self.servicecache = MemcacheChain((localcache_cls(), CMemcache(self.servicecaches, num_clients = num_mc_clients))) self.cache_chains.append(self.servicecache) self.thing_cache = CacheChain((localcache_cls(),)) 
self.cache_chains.append(self.thing_cache) # set default time zone if one is not set tz = global_conf.get('timezone') dtz = global_conf.get('display_timezone', tz) self.tz = pytz.timezone(tz) self.display_tz = pytz.timezone(dtz) #load the database info self.dbm = self.load_db_params(global_conf) # can't do this until load_db_params() has been called self.hardcache = HardcacheChain((localcache_cls(), self.memcache, HardCache(self)), cache_negative_results = True) self.cache_chains.append(self.hardcache) # I know this sucks, but we need non-request-threads to be # able to reset the caches, so we need them be able to close # around 'cache_chains' without being able to call getattr on # 'g' cache_chains = self.cache_chains[::] def reset_caches(): for chain in cache_chains: chain.reset() self.reset_caches = reset_caches self.reset_caches() #make a query cache self.stats_collector = QueryStats() # set the modwindow self.MODWINDOW = timedelta(self.MODWINDOW) self.REDDIT_MAIN = bool(os.environ.get('REDDIT_MAIN')) # turn on for language support self.languages, self.lang_name = \ get_active_langs(default_lang= self.lang) all_languages = self.lang_name.keys() all_languages.sort() self.all_languages = all_languages self.paths = paths # load the md5 hashes of files under static static_files = os.path.join(paths.get('static_files'), 'static') self.static_md5 = {} if os.path.exists(static_files): for f in os.listdir(static_files): if f.endswith('.md5'): key = f[0:-4] f = os.path.join(static_files, f) with open(f, 'r') as handle: md5 = handle.read().strip('\n') self.static_md5[key] = md5 #set up the logging directory log_path = self.log_path process_iden = global_conf.get('scgi_port', 'default') self.reddit_port = process_iden if log_path: if not os.path.exists(log_path): os.makedirs(log_path) for fname in os.listdir(log_path): if fname.startswith(process_iden): full_name = os.path.join(log_path, fname) os.remove(full_name) #setup the logger self.log = logging.getLogger('reddit') self.log.addHandler(logging.StreamHandler()) if self.debug: self.log.setLevel(logging.DEBUG) else: self.log.setLevel(logging.INFO) # set log level for pycountry which is chatty logging.getLogger('pycountry.db').setLevel(logging.CRITICAL) if not self.media_domain: self.media_domain = self.domain #if self.media_domain == self.domain: #print ("Warning: g.media_domain == g.domain. 
" + # "This may give untrusted content access to user cookies") #read in our CSS so that it can become a default for subreddit #stylesheets stylesheet_path = os.path.join(paths.get('static_files'), self.static_path.lstrip('/'), self.stylesheet) with open(stylesheet_path) as s: self.default_stylesheet = s.read() self.profanities = None if self.profanity_wordlist and os.path.exists(self.profanity_wordlist): with open(self.profanity_wordlist, 'r') as handle: words = [] for line in handle: words.append(line.strip(' \n\r')) if words: self.profanities = re.compile(r"\b(%s)\b" % '|'.join(words), re.I | re.U) self.reddit_host = socket.gethostname() self.reddit_pid = os.getpid() #the shutdown toggle self.shutdown = False #if we're going to use the query_queue, we need amqp if self.write_query_queue and not self.amqp_host: raise Exception("amqp_host must be defined to use the query queue") # This requirement doesn't *have* to be a requirement, but there are # bugs at the moment that will pop up if you violate it if self.write_query_queue and not self.use_query_cache: raise Exception("write_query_queue requires use_query_cache") # try to set the source control revision number try: popen = subprocess.Popen(["git", "log", "--date=short", "--pretty=format:%H %h", '-n1'], stdin=subprocess.PIPE, stdout=subprocess.PIPE) resp, stderrdata = popen.communicate() resp = resp.strip().split(' ') self.version, self.short_version = resp except object, e: self.log.info("Couldn't read source revision (%r)" % e) self.version = self.short_version = '(unknown)'
def setup(self, global_conf): # heavy load mode is read only mode with a different infobar if self.heavy_load_mode: self.read_only_mode = True if hasattr(signal, 'SIGUSR1'): # not all platforms have user signals signal.signal(signal.SIGUSR1, thread_dump) # initialize caches. Any cache-chains built here must be added # to cache_chains (closed around by reset_caches) so that they # can properly reset their local components localcache_cls = (SelfEmptyingCache if self.running_as_script else LocalCache) num_mc_clients = self.num_mc_clients self.cache_chains = {} self.memcache = CMemcache(self.memcaches, num_clients = num_mc_clients) self.make_lock = make_lock_factory(self.memcache) self.stats = Stats(global_conf.get('statsd_addr'), global_conf.get('statsd_sample_rate')) event.listens_for(engine.Engine, 'before_cursor_execute')( self.stats.pg_before_cursor_execute) event.listens_for(engine.Engine, 'after_cursor_execute')( self.stats.pg_after_cursor_execute) if not self.cassandra_seeds: raise ValueError("cassandra_seeds not set in the .ini") keyspace = "reddit" self.cassandra_pools = { "main": StatsCollectingConnectionPool( keyspace, stats=self.stats, logging_name="main", server_list=self.cassandra_seeds, pool_size=self.cassandra_pool_size, timeout=2, max_retries=3, prefill=False ), "noretries": StatsCollectingConnectionPool( keyspace, stats=self.stats, logging_name="noretries", server_list=self.cassandra_seeds, pool_size=len(self.cassandra_seeds), timeout=2, max_retries=0, prefill=False ), } perma_memcache = (CMemcache(self.permacache_memcaches, num_clients = num_mc_clients) if self.permacache_memcaches else None) self.permacache = CassandraCacheChain(localcache_cls(), CassandraCache('permacache', self.cassandra_pools[self.cassandra_default_pool], read_consistency_level = self.cassandra_rcl, write_consistency_level = self.cassandra_wcl), memcache = perma_memcache, lock_factory = self.make_lock) self.cache_chains.update(permacache=self.permacache) # hardcache is done after the db info is loaded, and then the # chains are reset to use the appropriate initial entries if self.stalecaches: self.cache = StaleCacheChain(localcache_cls(), CMemcache(self.stalecaches, num_clients=num_mc_clients), self.memcache) else: self.cache = MemcacheChain((localcache_cls(), self.memcache)) self.cache_chains.update(cache=self.cache) self.rendercache = MemcacheChain((localcache_cls(), CMemcache(self.rendercaches, noreply=True, no_block=True, num_clients = num_mc_clients))) self.cache_chains.update(rendercache=self.rendercache) self.thing_cache = CacheChain((localcache_cls(),)) self.cache_chains.update(thing_cache=self.thing_cache) #load the database info self.dbm = self.load_db_params(global_conf) # can't do this until load_db_params() has been called self.hardcache = HardcacheChain((localcache_cls(), self.memcache, HardCache(self)), cache_negative_results = True) self.cache_chains.update(hardcache=self.hardcache) # I know this sucks, but we need non-request-threads to be # able to reset the caches, so we need them be able to close # around 'cache_chains' without being able to call getattr on # 'g' cache_chains = self.cache_chains.copy() def reset_caches(): for name, chain in cache_chains.iteritems(): chain.reset() chain.stats = CacheStats(self.stats, name) self.reset_caches = reset_caches self.reset_caches() #make a query cache self.stats_collector = QueryStats() # set the modwindow self.MODWINDOW = timedelta(self.MODWINDOW) self.REDDIT_MAIN = bool(os.environ.get('REDDIT_MAIN')) origin_prefix = self.domain_prefix + 
"." if self.domain_prefix else "" self.origin = "http://" + origin_prefix + self.domain self.secure_domains = set([urlparse(self.payment_domain).netloc]) self.trusted_domains = set([self.domain]) self.trusted_domains.update(self.authorized_cnames) if self.https_endpoint: https_url = urlparse(self.https_endpoint) self.secure_domains.add(https_url.netloc) self.trusted_domains.add(https_url.hostname) # load the unique hashed names of files under static static_files = os.path.join(self.paths.get('static_files'), 'static') names_file_path = os.path.join(static_files, 'names.json') if os.path.exists(names_file_path): with open(names_file_path) as handle: self.static_names = json.load(handle) else: self.static_names = {} #setup the logger self.log = logging.getLogger('reddit') self.log.addHandler(logging.StreamHandler()) if self.debug: self.log.setLevel(logging.DEBUG) else: self.log.setLevel(logging.INFO) # set log level for pycountry which is chatty logging.getLogger('pycountry.db').setLevel(logging.CRITICAL) if not self.media_domain: self.media_domain = self.domain if self.media_domain == self.domain: print ("Warning: g.media_domain == g.domain. " + "This may give untrusted content access to user cookies") self.reddit_host = socket.gethostname() self.reddit_pid = os.getpid() for arg in sys.argv: tokens = arg.split("=") if len(tokens) == 2: k, v = tokens self.log.debug("Overriding g.%s to %s" % (k, v)) setattr(self, k, v) #if we're going to use the query_queue, we need amqp if self.write_query_queue and not self.amqp_host: raise Exception("amqp_host must be defined to use the query queue") # This requirement doesn't *have* to be a requirement, but there are # bugs at the moment that will pop up if you violate it if self.write_query_queue and not self.use_query_cache: raise Exception("write_query_queue requires use_query_cache") # try to set the source control revision number try: self.version = subprocess.check_output(["git", "rev-parse", "HEAD"]) except subprocess.CalledProcessError, e: self.log.info("Couldn't read source revision (%r)" % e) self.version = self.short_version = '(unknown)'
def setup(self, global_conf): # heavy load mode is read only mode with a different infobar if self.heavy_load_mode: self.read_only_mode = True if hasattr(signal, 'SIGUSR1'): # not all platforms have user signals signal.signal(signal.SIGUSR1, thread_dump) # initialize caches. Any cache-chains built here must be added # to cache_chains (closed around by reset_caches) so that they # can properly reset their local components localcache_cls = (SelfEmptyingCache if self.running_as_script else LocalCache) num_mc_clients = self.num_mc_clients self.cache_chains = {} self.memcache = CMemcache(self.memcaches, num_clients=num_mc_clients) self.make_lock = make_lock_factory(self.memcache) self.stats = Stats(global_conf.get('statsd_addr'), global_conf.get('statsd_sample_rate')) if not self.cassandra_seeds: raise ValueError("cassandra_seeds not set in the .ini") keyspace = "reddit" self.cassandra_pools = { "main": StatsCollectingConnectionPool(keyspace, stats=self.stats, logging_name="main", server_list=self.cassandra_seeds, pool_size=len(self.cassandra_seeds), timeout=2, max_retries=3, prefill=False), "noretries": StatsCollectingConnectionPool(keyspace, stats=self.stats, logging_name="noretries", server_list=self.cassandra_seeds, pool_size=len(self.cassandra_seeds), timeout=.1, max_retries=0, prefill=False), } perma_memcache = (CMemcache(self.permacache_memcaches, num_clients=num_mc_clients) if self.permacache_memcaches else None) self.permacache = CassandraCacheChain( localcache_cls(), CassandraCache('permacache', self.cassandra_pools[self.cassandra_default_pool], read_consistency_level=self.cassandra_rcl, write_consistency_level=self.cassandra_wcl), memcache=perma_memcache, lock_factory=self.make_lock) self.cache_chains.update(permacache=self.permacache) # hardcache is done after the db info is loaded, and then the # chains are reset to use the appropriate initial entries if self.stalecaches: self.cache = StaleCacheChain( localcache_cls(), CMemcache(self.stalecaches, num_clients=num_mc_clients), self.memcache) else: self.cache = MemcacheChain((localcache_cls(), self.memcache)) self.cache_chains.update(cache=self.cache) self.rendercache = MemcacheChain( (localcache_cls(), CMemcache(self.rendercaches, noreply=True, no_block=True, num_clients=num_mc_clients))) self.cache_chains.update(rendercache=self.rendercache) self.servicecache = MemcacheChain( (localcache_cls(), CMemcache(self.servicecaches, num_clients=num_mc_clients))) self.cache_chains.update(servicecache=self.servicecache) self.thing_cache = CacheChain((localcache_cls(), )) self.cache_chains.update(thing_cache=self.thing_cache) #load the database info self.dbm = self.load_db_params(global_conf) # can't do this until load_db_params() has been called self.hardcache = HardcacheChain( (localcache_cls(), self.memcache, HardCache(self)), cache_negative_results=True) self.cache_chains.update(hardcache=self.hardcache) # I know this sucks, but we need non-request-threads to be # able to reset the caches, so we need them be able to close # around 'cache_chains' without being able to call getattr on # 'g' cache_chains = self.cache_chains.copy() def reset_caches(): for name, chain in cache_chains.iteritems(): chain.reset() chain.stats = CacheStats(self.stats, name) self.reset_caches = reset_caches self.reset_caches() #make a query cache self.stats_collector = QueryStats() # set the modwindow self.MODWINDOW = timedelta(self.MODWINDOW) self.REDDIT_MAIN = bool(os.environ.get('REDDIT_MAIN')) origin_prefix = self.domain_prefix + "." 
if self.domain_prefix else "" self.origin = "http://" + origin_prefix + self.domain self.secure_domains = set([urlparse(self.payment_domain).netloc]) self.trusted_domains = set([self.domain]) self.trusted_domains.update(self.authorized_cnames) if self.https_endpoint: https_url = urlparse(self.https_endpoint) self.secure_domains.add(https_url.netloc) self.trusted_domains.add(https_url.hostname) # load the unique hashed names of files under static static_files = os.path.join(self.paths.get('static_files'), 'static') names_file_path = os.path.join(static_files, 'names.json') if os.path.exists(names_file_path): with open(names_file_path) as handle: self.static_names = json.load(handle) else: self.static_names = {} #setup the logger self.log = logging.getLogger('reddit') self.log.addHandler(logging.StreamHandler()) if self.debug: self.log.setLevel(logging.DEBUG) else: self.log.setLevel(logging.INFO) # set log level for pycountry which is chatty logging.getLogger('pycountry.db').setLevel(logging.CRITICAL) if not self.media_domain: self.media_domain = self.domain if self.media_domain == self.domain: print("Warning: g.media_domain == g.domain. " + "This may give untrusted content access to user cookies") self.reddit_host = socket.gethostname() self.reddit_pid = os.getpid() for arg in sys.argv: tokens = arg.split("=") if len(tokens) == 2: k, v = tokens self.log.debug("Overriding g.%s to %s" % (k, v)) setattr(self, k, v) #the shutdown toggle self.shutdown = False #if we're going to use the query_queue, we need amqp if self.write_query_queue and not self.amqp_host: raise Exception("amqp_host must be defined to use the query queue") # This requirement doesn't *have* to be a requirement, but there are # bugs at the moment that will pop up if you violate it if self.write_query_queue and not self.use_query_cache: raise Exception("write_query_queue requires use_query_cache") # try to set the source control revision number try: self.version = subprocess.check_output( ["git", "rev-parse", "HEAD"]) except subprocess.CalledProcessError, e: self.log.info("Couldn't read source revision (%r)" % e) self.version = self.short_version = '(unknown)'
def __init__(self, global_conf, app_conf, paths, **extra): """ Globals acts as a container for objects available throughout the life of the application. One instance of Globals is created by Pylons during application initialization and is available during requests via the 'g' variable. ``global_conf`` The same variable used throughout ``config/middleware.py`` namely, the variables from the ``[DEFAULT]`` section of the configuration file. ``app_conf`` The same ``kw`` dictionary used throughout ``config/middleware.py`` namely, the variables from the section in the config file for your application. ``extra`` The configuration returned from ``load_config`` in ``config/middleware.py`` which may be of use in the setup of your global variables. """ def to_bool(x): return (x.lower() == 'true') if x else None def to_iter(name, delim = ','): return (x.strip() for x in global_conf.get(name, '').split(delim)) # slop over all variables to start with for k, v in global_conf.iteritems(): if not k.startswith("_") and not hasattr(self, k): if k in self.int_props: v = int(v) elif k in self.bool_props: v = to_bool(v) elif k in self.tuple_props: v = tuple(to_iter(k)) setattr(self, k, v) # initialize caches mc = Memcache(self.memcaches, debug=True) self.cache = CacheChain((LocalCache(), mc)) self.permacache = Memcache(self.permacaches, debug=True) self.rendercache = Memcache(self.rendercaches, debug=True) self.make_lock = make_lock_factory(mc) self.rec_cache = Memcache(self.rec_cache, debug=True) # set default time zone if one is not set self.tz = pytz.timezone(global_conf.get('timezone')) #make a query cache self.stats_collector = QueryStats() # set the modwindow self.MODWINDOW = timedelta(self.MODWINDOW) self.REDDIT_MAIN = bool(os.environ.get('REDDIT_MAIN')) # turn on for language support self.languages, self.lang_name = _get_languages() all_languages = self.lang_name.keys() all_languages.sort() self.all_languages = all_languages # load the md5 hashes of files under static static_files = os.path.join(paths.get('static_files'), 'static') self.static_md5 = {} if os.path.exists(static_files): for f in os.listdir(static_files): if f.endswith('.md5'): key = f[0:-4] f = os.path.join(static_files, f) with open(f, 'r') as handle: md5 = handle.read().strip('\n') self.static_md5[key] = md5 #set up the logging directory log_path = self.log_path process_iden = global_conf.get('scgi_port', 'default') if log_path: if not os.path.exists(log_path): os.makedirs(log_path) for fname in os.listdir(log_path): if fname.startswith(process_iden): full_name = os.path.join(log_path, fname) os.remove(full_name) #setup the logger self.log = logging.getLogger('reddit') self.log.addHandler(logging.StreamHandler()) if self.debug: self.log.setLevel(logging.DEBUG) #read in our CSS so that it can become a default for subreddit #stylesheets stylesheet_path = os.path.join(paths.get('static_files'), self.static_path.lstrip('/'), self.stylesheet) with open(stylesheet_path) as s: self.default_stylesheet = s.read() self.reddit_host = socket.gethostname() self.reddit_pid = os.getpid()
def setup(self): # heavy load mode is read only mode with a different infobar if self.heavy_load_mode: self.read_only_mode = True if hasattr(signal, 'SIGUSR1'): # not all platforms have user signals signal.signal(signal.SIGUSR1, thread_dump) # initialize caches. Any cache-chains built here must be added # to cache_chains (closed around by reset_caches) so that they # can properly reset their local components localcache_cls = (SelfEmptyingCache if self.running_as_script else LocalCache) num_mc_clients = self.num_mc_clients self.cache_chains = {} # for now, zookeeper will be an optional part of the stack. # if it's not configured, we will grab the expected config from the # [live_config] section of the ini file zk_hosts = self.config.get("zookeeper_connection_string") if zk_hosts: from r2.lib.zookeeper import (connect_to_zookeeper, LiveConfig, LiveList) zk_username = self.config["zookeeper_username"] zk_password = self.config["zookeeper_password"] self.zookeeper = connect_to_zookeeper(zk_hosts, (zk_username, zk_password)) self.live_config = LiveConfig(self.zookeeper, LIVE_CONFIG_NODE) self.throttles = LiveList(self.zookeeper, "/throttles", map_fn=ipaddress.ip_network, reduce_fn=ipaddress.collapse_addresses) else: self.zookeeper = None parser = ConfigParser.RawConfigParser() parser.read([self.config["__file__"]]) self.live_config = extract_live_config(parser, self.plugins) self.throttles = tuple() # immutable since it's not real self.memcache = CMemcache(self.memcaches, num_clients = num_mc_clients) self.lock_cache = CMemcache(self.lockcaches, num_clients=num_mc_clients) self.stats = Stats(self.config.get('statsd_addr'), self.config.get('statsd_sample_rate')) event.listens_for(engine.Engine, 'before_cursor_execute')( self.stats.pg_before_cursor_execute) event.listens_for(engine.Engine, 'after_cursor_execute')( self.stats.pg_after_cursor_execute) self.make_lock = make_lock_factory(self.lock_cache, self.stats) if not self.cassandra_seeds: raise ValueError("cassandra_seeds not set in the .ini") keyspace = "reddit" self.cassandra_pools = { "main": StatsCollectingConnectionPool( keyspace, stats=self.stats, logging_name="main", server_list=self.cassandra_seeds, pool_size=self.cassandra_pool_size, timeout=2, max_retries=3, prefill=False ), } perma_memcache = (CMemcache(self.permacache_memcaches, num_clients = num_mc_clients) if self.permacache_memcaches else None) self.permacache = CassandraCacheChain(localcache_cls(), CassandraCache('permacache', self.cassandra_pools[self.cassandra_default_pool], read_consistency_level = self.cassandra_rcl, write_consistency_level = self.cassandra_wcl), memcache = perma_memcache, lock_factory = self.make_lock) self.cache_chains.update(permacache=self.permacache) # hardcache is done after the db info is loaded, and then the # chains are reset to use the appropriate initial entries if self.stalecaches: self.cache = StaleCacheChain(localcache_cls(), CMemcache(self.stalecaches, num_clients=num_mc_clients), self.memcache) else: self.cache = MemcacheChain((localcache_cls(), self.memcache)) self.cache_chains.update(cache=self.cache) self.rendercache = MemcacheChain((localcache_cls(), CMemcache(self.rendercaches, noreply=True, no_block=True, num_clients = num_mc_clients))) self.cache_chains.update(rendercache=self.rendercache) self.thing_cache = CacheChain((localcache_cls(),)) self.cache_chains.update(thing_cache=self.thing_cache) #load the database info self.dbm = self.load_db_params() # can't do this until load_db_params() has been called self.hardcache = 
HardcacheChain((localcache_cls(), self.memcache, HardCache(self)), cache_negative_results = True) self.cache_chains.update(hardcache=self.hardcache) # I know this sucks, but we need non-request-threads to be # able to reset the caches, so we need them be able to close # around 'cache_chains' without being able to call getattr on # 'g' cache_chains = self.cache_chains.copy() def reset_caches(): for name, chain in cache_chains.iteritems(): chain.reset() chain.stats = CacheStats(self.stats, name) self.reset_caches = reset_caches self.reset_caches() # set the modwindow self.MODWINDOW = timedelta(self.MODWINDOW) self.REDDIT_MAIN = bool(os.environ.get('REDDIT_MAIN')) origin_prefix = self.domain_prefix + "." if self.domain_prefix else "" self.origin = "http://" + origin_prefix + self.domain self.secure_domains = set([urlparse(self.payment_domain).netloc]) self.trusted_domains = set([self.domain]) self.trusted_domains.update(self.authorized_cnames) if self.https_endpoint: https_url = urlparse(self.https_endpoint) self.secure_domains.add(https_url.netloc) self.trusted_domains.add(https_url.hostname) if getattr(self, 'oauth_domain', None): self.secure_domains.add(self.oauth_domain) # load the unique hashed names of files under static static_files = os.path.join(self.paths.get('static_files'), 'static') names_file_path = os.path.join(static_files, 'names.json') if os.path.exists(names_file_path): with open(names_file_path) as handle: self.static_names = json.load(handle) else: self.static_names = {} #setup the logger self.log = logging.getLogger('reddit') self.log.addHandler(logging.StreamHandler()) if self.debug: self.log.setLevel(logging.DEBUG) else: self.log.setLevel(logging.INFO) # set log level for pycountry which is chatty logging.getLogger('pycountry.db').setLevel(logging.CRITICAL) if not self.media_domain: self.media_domain = self.domain if self.media_domain == self.domain: print ("Warning: g.media_domain == g.domain. " + "This may give untrusted content access to user cookies") self.reddit_host = socket.gethostname() self.reddit_pid = os.getpid() for arg in sys.argv: tokens = arg.split("=") if len(tokens) == 2: k, v = tokens self.log.debug("Overriding g.%s to %s" % (k, v)) setattr(self, k, v) #if we're going to use the query_queue, we need amqp if self.write_query_queue and not self.amqp_host: raise Exception("amqp_host must be defined to use the query queue") # This requirement doesn't *have* to be a requirement, but there are # bugs at the moment that will pop up if you violate it if self.write_query_queue and not self.use_query_cache: raise Exception("write_query_queue requires use_query_cache") # try to set the source control revision numbers self.versions = {} r2_root = os.path.dirname(os.path.dirname(self.paths["root"])) r2_gitdir = os.path.join(r2_root, ".git") self.short_version = self.record_repo_version("r2", r2_gitdir) if I18N_PATH: i18n_git_path = os.path.join(os.path.dirname(I18N_PATH), ".git") self.record_repo_version("i18n", i18n_git_path) if self.log_start: self.log.error("reddit app %s:%s started %s at %s" % (self.reddit_host, self.reddit_pid, self.short_version, datetime.now()))
def setup(self): self.queues = queues.declare_queues(self) ################# CONFIGURATION # AMQP is required if not self.amqp_host: raise ValueError("amqp_host not set in the .ini") # This requirement doesn't *have* to be a requirement, but there are # bugs at the moment that will pop up if you violate it # XXX: get rid of these options. new query cache is always on. if self.write_query_queue and not self.use_query_cache: raise Exception("write_query_queue requires use_query_cache") if not self.cassandra_seeds: raise ValueError("cassandra_seeds not set in the .ini") # heavy load mode is read only mode with a different infobar if self.heavy_load_mode: self.read_only_mode = True origin_prefix = self.domain_prefix + "." if self.domain_prefix else "" self.origin = "http://" + origin_prefix + self.domain self.secure_domains = set([urlparse(self.payment_domain).netloc]) self.trusted_domains = set([self.domain]) self.trusted_domains.update(self.authorized_cnames) if self.https_endpoint: https_url = urlparse(self.https_endpoint) self.secure_domains.add(https_url.netloc) self.trusted_domains.add(https_url.hostname) if getattr(self, 'oauth_domain', None): self.secure_domains.add(self.oauth_domain) # load the unique hashed names of files under static static_files = os.path.join(self.paths.get('static_files'), 'static') names_file_path = os.path.join(static_files, 'names.json') if os.path.exists(names_file_path): with open(names_file_path) as handle: self.static_names = json.load(handle) else: self.static_names = {} #setup the logger self.log = logging.getLogger('reddit') self.log.addHandler(logging.StreamHandler()) if self.debug: self.log.setLevel(logging.DEBUG) else: self.log.setLevel(logging.INFO) # set log level for pycountry which is chatty logging.getLogger('pycountry.db').setLevel(logging.CRITICAL) if not self.media_domain: self.media_domain = self.domain if self.media_domain == self.domain: print ("Warning: g.media_domain == g.domain. " + "This may give untrusted content access to user cookies") for arg in sys.argv: tokens = arg.split("=") if len(tokens) == 2: k, v = tokens self.log.debug("Overriding g.%s to %s" % (k, v)) setattr(self, k, v) self.reddit_host = socket.gethostname() self.reddit_pid = os.getpid() if hasattr(signal, 'SIGUSR1'): # not all platforms have user signals signal.signal(signal.SIGUSR1, thread_dump) self.startup_timer.intermediate("configuration") ################# ZOOKEEPER # for now, zookeeper will be an optional part of the stack. # if it's not configured, we will grab the expected config from the # [live_config] section of the ini file zk_hosts = self.config.get("zookeeper_connection_string") if zk_hosts: from r2.lib.zookeeper import (connect_to_zookeeper, LiveConfig, LiveList) zk_username = self.config["zookeeper_username"] zk_password = self.config["zookeeper_password"] self.zookeeper = connect_to_zookeeper(zk_hosts, (zk_username, zk_password)) self.live_config = LiveConfig(self.zookeeper, LIVE_CONFIG_NODE) self.throttles = LiveList(self.zookeeper, "/throttles", map_fn=ipaddress.ip_network, reduce_fn=ipaddress.collapse_addresses) else: self.zookeeper = None parser = ConfigParser.RawConfigParser() parser.read([self.config["__file__"]]) self.live_config = extract_live_config(parser, self.plugins) self.throttles = tuple() # immutable since it's not real self.startup_timer.intermediate("zookeeper") ################# MEMCACHE num_mc_clients = self.num_mc_clients # the main memcache pool. used for most everything. 
self.memcache = CMemcache(self.memcaches, num_clients=num_mc_clients) # a smaller pool of caches used only for distributed locks. # TODO: move this to ZooKeeper self.lock_cache = CMemcache(self.lockcaches, num_clients=num_mc_clients) self.make_lock = make_lock_factory(self.lock_cache, self.stats) # memcaches used in front of the permacache CF in cassandra. # XXX: this is a legacy thing; permacache was made when C* didn't have # a row cache. if self.permacache_memcaches: permacache_memcaches = CMemcache(self.permacache_memcaches, num_clients=num_mc_clients) else: permacache_memcaches = None # the stalecache is a memcached local to the current app server used # for data that's frequently fetched but doesn't need to be fresh. if self.stalecaches: stalecaches = CMemcache(self.stalecaches, num_clients=num_mc_clients) else: stalecaches = None # rendercache holds rendered partial templates as well as fully # cached pages. rendercaches = CMemcache( self.rendercaches, noreply=True, no_block=True, num_clients=num_mc_clients, ) self.startup_timer.intermediate("memcache") ################# CASSANDRA keyspace = "reddit" self.cassandra_pools = { "main": StatsCollectingConnectionPool( keyspace, stats=self.stats, logging_name="main", server_list=self.cassandra_seeds, pool_size=self.cassandra_pool_size, timeout=2, max_retries=3, prefill=False ), } permacache_cf = CassandraCache( 'permacache', self.cassandra_pools[self.cassandra_default_pool], read_consistency_level=self.cassandra_rcl, write_consistency_level=self.cassandra_wcl ) self.startup_timer.intermediate("cassandra") ################# POSTGRES event.listens_for(engine.Engine, 'before_cursor_execute')( self.stats.pg_before_cursor_execute) event.listens_for(engine.Engine, 'after_cursor_execute')( self.stats.pg_after_cursor_execute) self.dbm = self.load_db_params() self.startup_timer.intermediate("postgres") ################# CHAINS # initialize caches. Any cache-chains built here must be added # to cache_chains (closed around by reset_caches) so that they # can properly reset their local components self.cache_chains = {} localcache_cls = (SelfEmptyingCache if self.running_as_script else LocalCache) if stalecaches: self.cache = StaleCacheChain( localcache_cls(), stalecaches, self.memcache, ) else: self.cache = MemcacheChain((localcache_cls(), self.memcache)) self.cache_chains.update(cache=self.cache) self.rendercache = MemcacheChain(( localcache_cls(), rendercaches, )) self.cache_chains.update(rendercache=self.rendercache) # the thing_cache is used in tdb_cassandra. 
self.thing_cache = CacheChain((localcache_cls(),))
    self.cache_chains.update(thing_cache=self.thing_cache)

    self.permacache = CassandraCacheChain(
        localcache_cls(),
        permacache_cf,
        memcache=permacache_memcaches,
        lock_factory=self.make_lock,
    )
    self.cache_chains.update(permacache=self.permacache)

    # hardcache is used for various things that tend to expire
    # TODO: replace hardcache w/ cassandra stuff
    self.hardcache = HardcacheChain(
        (localcache_cls(), self.memcache, HardCache(self)),
        cache_negative_results=True,
    )
    self.cache_chains.update(hardcache=self.hardcache)

    # I know this sucks, but we need non-request-threads to be
    # able to reset the caches, so we need them be able to close
    # around 'cache_chains' without being able to call getattr on
    # 'g'
    cache_chains = self.cache_chains.copy()

    def reset_caches():
        for name, chain in cache_chains.iteritems():
            chain.reset()
            chain.stats = CacheStats(self.stats, name)

    self.reset_caches = reset_caches
    self.reset_caches()

    self.startup_timer.intermediate("cache_chains")

    # try to set the source control revision numbers
    self.versions = {}
    r2_root = os.path.dirname(os.path.dirname(self.paths["root"]))
    r2_gitdir = os.path.join(r2_root, ".git")
    self.short_version = self.record_repo_version("r2", r2_gitdir)

    if I18N_PATH:
        i18n_git_path = os.path.join(os.path.dirname(I18N_PATH), ".git")
        self.record_repo_version("i18n", i18n_git_path)

    self.startup_timer.intermediate("revisions")
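# --- Illustrative sketch; not part of the original source. ---
# record_repo_version's body isn't shown in this listing; one minimal way to
# capture a short revision for each checkout, assuming the plain git binary
# is available, is a subprocess call against the repo's .git directory:
import subprocess

def read_short_revision(gitdir):
    try:
        out = subprocess.check_output(
            ["git", "--git-dir", gitdir, "rev-parse", "--short", "HEAD"])
        return out.strip()
    except (OSError, subprocess.CalledProcessError):
        return "(unknown)"

# e.g. versions["r2"] = read_short_revision(r2_gitdir)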
def setup(self): self.queues = queues.declare_queues(self) self.extension_subdomains = dict( m="mobile", i="compact", api="api", rss="rss", xml="xml", json="json", ) ################# PROVIDERS self.auth_provider = select_provider( self.config, self.pkg_resources_working_set, "r2.provider.auth", self.authentication_provider, ) self.media_provider = select_provider( self.config, self.pkg_resources_working_set, "r2.provider.media", self.media_provider, ) self.cdn_provider = select_provider( self.config, self.pkg_resources_working_set, "r2.provider.cdn", self.cdn_provider, ) self.startup_timer.intermediate("providers") ################# CONFIGURATION # AMQP is required if not self.amqp_host: raise ValueError("amqp_host not set in the .ini") if not self.cassandra_seeds: raise ValueError("cassandra_seeds not set in the .ini") # heavy load mode is read only mode with a different infobar if self.heavy_load_mode: self.read_only_mode = True origin_prefix = self.domain_prefix + "." if self.domain_prefix else "" self.origin = "http://" + origin_prefix + self.domain self.trusted_domains = set([self.domain]) if self.https_endpoint: https_url = urlparse(self.https_endpoint) self.trusted_domains.add(https_url.hostname) # load the unique hashed names of files under static static_files = os.path.join(self.paths.get('static_files'), 'static') names_file_path = os.path.join(static_files, 'names.json') if os.path.exists(names_file_path): with open(names_file_path) as handle: self.static_names = json.load(handle) else: self.static_names = {} # make python warnings go through the logging system logging.captureWarnings(capture=True) log = logging.getLogger('reddit') # when we're a script (paster run) just set up super simple logging if self.running_as_script: log.setLevel(logging.INFO) log.addHandler(logging.StreamHandler()) # if in debug mode, override the logging level to DEBUG if self.debug: log.setLevel(logging.DEBUG) # attempt to figure out which pool we're in and add that to the # LogRecords. try: with open("/etc/ec2_asg", "r") as f: pool = f.read().strip() # clean up the pool name since we're putting stuff after "-" pool = pool.partition("-")[0] except IOError: pool = "reddit-app" self.log = logging.LoggerAdapter(log, {"pool": pool}) # set locations locations = pkg_resources.resource_stream(__name__, "../data/locations.json") self.locations = json.loads(locations.read()) if not self.media_domain: self.media_domain = self.domain if self.media_domain == self.domain: print >> sys.stderr, ( "Warning: g.media_domain == g.domain. " + "This may give untrusted content access to user cookies") if self.oauth_domain == self.domain: print >> sys.stderr, ("Warning: g.oauth_domain == g.domain. 
" "CORS requests to g.domain will be allowed") for arg in sys.argv: tokens = arg.split("=") if len(tokens) == 2: k, v = tokens self.log.debug("Overriding g.%s to %s" % (k, v)) setattr(self, k, v) self.reddit_host = socket.gethostname() self.reddit_pid = os.getpid() if hasattr(signal, 'SIGUSR1'): # not all platforms have user signals signal.signal(signal.SIGUSR1, thread_dump) locale.setlocale(locale.LC_ALL, self.locale) # Pre-calculate ratelimit values self.RL_RESET_SECONDS = self.config["RL_RESET_MINUTES"] * 60 self.RL_MAX_REQS = int(self.config["RL_AVG_REQ_PER_SEC"] * self.RL_RESET_SECONDS) self.RL_OAUTH_RESET_SECONDS = self.config["RL_OAUTH_RESET_MINUTES"] * 60 self.RL_OAUTH_MAX_REQS = int(self.config["RL_OAUTH_AVG_REQ_PER_SEC"] * self.RL_OAUTH_RESET_SECONDS) self.RL_LOGIN_MAX_REQS = int(self.config["RL_LOGIN_AVG_PER_SEC"] * self.RL_RESET_SECONDS) self.startup_timer.intermediate("configuration") ################# ZOOKEEPER # for now, zookeeper will be an optional part of the stack. # if it's not configured, we will grab the expected config from the # [live_config] section of the ini file zk_hosts = self.config.get("zookeeper_connection_string") if zk_hosts: from r2.lib.zookeeper import (connect_to_zookeeper, LiveConfig, LiveList) zk_username = self.config["zookeeper_username"] zk_password = self.config["zookeeper_password"] self.zookeeper = connect_to_zookeeper(zk_hosts, (zk_username, zk_password)) self.live_config = LiveConfig(self.zookeeper, LIVE_CONFIG_NODE) self.secrets = fetch_secrets(self.zookeeper) self.throttles = LiveList(self.zookeeper, "/throttles", map_fn=ipaddress.ip_network, reduce_fn=ipaddress.collapse_addresses) # close our zk connection when the app shuts down SHUTDOWN_CALLBACKS.append(self.zookeeper.stop) else: self.zookeeper = None parser = ConfigParser.RawConfigParser() parser.optionxform = str parser.read([self.config["__file__"]]) self.live_config = extract_live_config(parser, self.plugins) self.secrets = extract_secrets(parser) self.throttles = tuple() # immutable since it's not real self.startup_timer.intermediate("zookeeper") ################# PRIVILEGED USERS self.admins = PermissionFilteredEmployeeList(self.live_config, type="admin") self.sponsors = PermissionFilteredEmployeeList(self.live_config, type="sponsor") self.employees = PermissionFilteredEmployeeList(self.live_config, type="employee") ################# MEMCACHE num_mc_clients = self.num_mc_clients # the main memcache pool. used for most everything. memcache = CMemcache( self.memcaches, min_compress_len=1400, num_clients=num_mc_clients, binary=True, ) # a pool just used for @memoize results memoizecaches = CMemcache( self.memoizecaches, min_compress_len=50 * 1024, num_clients=num_mc_clients, binary=True, ) # a pool just for srmember rels srmembercaches = CMemcache( self.srmembercaches, min_compress_len=96, num_clients=num_mc_clients, binary=True, ) # a pool just for rels relcaches = CMemcache( self.relcaches, min_compress_len=96, num_clients=num_mc_clients, binary=True, ) ratelimitcaches = CMemcache( self.ratelimitcaches, min_compress_len=96, num_clients=num_mc_clients, ) # a smaller pool of caches used only for distributed locks. # TODO: move this to ZooKeeper self.lock_cache = CMemcache(self.lockcaches, binary=True, num_clients=num_mc_clients) self.make_lock = make_lock_factory(self.lock_cache, self.stats) # memcaches used in front of the permacache CF in cassandra. # XXX: this is a legacy thing; permacache was made when C* didn't have # a row cache. 
permacache_memcaches = CMemcache(self.permacache_memcaches, min_compress_len=1400, num_clients=num_mc_clients) # the stalecache is a memcached local to the current app server used # for data that's frequently fetched but doesn't need to be fresh. if self.stalecaches: stalecaches = CMemcache(self.stalecaches, binary=True, num_clients=num_mc_clients) else: stalecaches = None # rendercache holds rendered partial templates. rendercaches = CMemcache( self.rendercaches, noreply=True, no_block=True, num_clients=num_mc_clients, min_compress_len=480, ) # pagecaches hold fully rendered pages pagecaches = CMemcache( self.pagecaches, noreply=True, no_block=True, num_clients=num_mc_clients, min_compress_len=1400, ) self.startup_timer.intermediate("memcache") ################# CASSANDRA keyspace = "reddit" self.cassandra_pools = { "main": StatsCollectingConnectionPool(keyspace, stats=self.stats, logging_name="main", server_list=self.cassandra_seeds, pool_size=self.cassandra_pool_size, timeout=4, max_retries=3, prefill=False), } permacache_cf = Permacache._setup_column_family( 'permacache', self.cassandra_pools[self.cassandra_default_pool], ) self.startup_timer.intermediate("cassandra") ################# POSTGRES self.dbm = self.load_db_params() self.startup_timer.intermediate("postgres") ################# CHAINS # initialize caches. Any cache-chains built here must be added # to cache_chains (closed around by reset_caches) so that they # can properly reset their local components cache_chains = {} localcache_cls = (SelfEmptyingCache if self.running_as_script else LocalCache) if stalecaches: self.cache = StaleCacheChain( localcache_cls(), stalecaches, memcache, ) else: self.cache = MemcacheChain((localcache_cls(), memcache)) cache_chains.update(cache=self.cache) if stalecaches: self.memoizecache = StaleCacheChain( localcache_cls(), stalecaches, memoizecaches, ) else: self.memoizecache = MemcacheChain( (localcache_cls(), memoizecaches)) cache_chains.update(memoizecache=self.memoizecache) if stalecaches: self.srmembercache = StaleCacheChain( localcache_cls(), stalecaches, srmembercaches, ) else: self.srmembercache = MemcacheChain( (localcache_cls(), srmembercaches)) cache_chains.update(srmembercache=self.srmembercache) if stalecaches: self.relcache = StaleCacheChain( localcache_cls(), stalecaches, relcaches, ) else: self.relcache = MemcacheChain((localcache_cls(), relcaches)) cache_chains.update(relcache=self.relcache) self.ratelimitcache = MemcacheChain( (localcache_cls(), ratelimitcaches)) cache_chains.update(ratelimitcache=self.ratelimitcache) self.rendercache = MemcacheChain(( localcache_cls(), rendercaches, )) cache_chains.update(rendercache=self.rendercache) self.pagecache = MemcacheChain(( localcache_cls(), pagecaches, )) cache_chains.update(pagecache=self.pagecache) # the thing_cache is used in tdb_cassandra. 
    # the thing_cache is used in tdb_cassandra.
    self.thing_cache = CacheChain((localcache_cls(),), check_keys=False)
    cache_chains.update(thing_cache=self.thing_cache)

    if stalecaches:
        permacache_cache = StaleCacheChain(
            localcache_cls(),
            stalecaches,
            permacache_memcaches,
            check_keys=False,
        )
    else:
        permacache_cache = CacheChain(
            (localcache_cls(), permacache_memcaches),
            check_keys=False,
        )
    cache_chains.update(permacache=permacache_cache)

    self.permacache = Permacache(
        permacache_cache,
        permacache_cf,
        lock_factory=self.make_lock,
    )

    # hardcache is used for various things that tend to expire
    # TODO: replace hardcache w/ cassandra stuff
    self.hardcache = HardcacheChain(
        (localcache_cls(), memcache, HardCache(self)),
        cache_negative_results=True,
    )
    cache_chains.update(hardcache=self.hardcache)

    # I know this sucks, but we need non-request-threads to be able to
    # reset the caches, so we need them to be able to close around
    # 'cache_chains' without being able to call getattr on 'g'
    def reset_caches():
        for name, chain in cache_chains.iteritems():
            chain.reset()
            if isinstance(chain, StaleCacheChain):
                chain.stats = StaleCacheStats(self.stats, name)
            else:
                chain.stats = CacheStats(self.stats, name)

    self.cache_chains = cache_chains
    self.reset_caches = reset_caches
    self.reset_caches()

    self.startup_timer.intermediate("cache_chains")

    # try to set the source control revision numbers
    self.versions = {}
    r2_root = os.path.dirname(os.path.dirname(self.paths["root"]))
    r2_gitdir = os.path.join(r2_root, ".git")
    self.short_version = self.record_repo_version("r2", r2_gitdir)

    if I18N_PATH:
        i18n_git_path = os.path.join(os.path.dirname(I18N_PATH), ".git")
        self.record_repo_version("i18n", i18n_git_path)

    self.startup_timer.intermediate("revisions")
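    # At this point startup_timer holds intermediate marks for
    # "configuration", "zookeeper", "memcache", "cassandra", "postgres",
    # "cache_chains", and "revisions", which makes it straightforward to
    # see which stage of setup dominates application start-up time.
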
def __init__(self, global_conf, app_conf, paths, **extra):
    """
    Globals acts as a container for objects available throughout the
    life of the application.

    One instance of Globals is created by Pylons during application
    initialization and is available during requests via the 'g'
    variable.

    ``global_conf``
        The same variable used throughout ``config/middleware.py``,
        namely, the variables from the ``[DEFAULT]`` section of the
        configuration file.

    ``app_conf``
        The same ``kw`` dictionary used throughout
        ``config/middleware.py``, namely, the variables from the
        section in the config file for your application.

    ``extra``
        The configuration returned from ``load_config`` in
        ``config/middleware.py`` which may be of use in the setup of
        your global variables.
    """
    # slop over all variables to start with
    for k, v in global_conf.iteritems():
        if not k.startswith("_") and not hasattr(self, k):
            if k in self.int_props:
                v = int(v)
            elif k in self.float_props:
                v = float(v)
            elif k in self.bool_props:
                v = self.to_bool(v)
            elif k in self.tuple_props:
                v = tuple(self.to_iter(v))
            setattr(self, k, v)

    self.paid_sponsors = set(x.lower() for x in self.paid_sponsors)

    # initialize caches
    mc = Memcache(self.memcaches, pickleProtocol=1)
    self.memcache = mc
    self.cache = CacheChain((LocalCache(), mc))
    self.permacache = Memcache(self.permacaches, pickleProtocol=1)
    self.rendercache = Memcache(self.rendercaches, pickleProtocol=1)
    self.make_lock = make_lock_factory(mc)
    self.rec_cache = Memcache(self.rec_cache, pickleProtocol=1)

    # set default time zone if one is not set
    tz = global_conf.get("timezone")
    dtz = global_conf.get("display_timezone", tz)
    self.tz = pytz.timezone(tz)
    self.display_tz = pytz.timezone(dtz)

    # load the database info
    self.dbm = self.load_db_params(global_conf)

    # make a query cache
    self.stats_collector = QueryStats()

    # set the modwindow
    self.MODWINDOW = timedelta(self.MODWINDOW)

    self.REDDIT_MAIN = bool(os.environ.get("REDDIT_MAIN"))

    # turn on for language support
    self.languages, self.lang_name = get_active_langs(
        default_lang=self.lang)

    all_languages = self.lang_name.keys()
    all_languages.sort()
    self.all_languages = all_languages

    # load the md5 hashes of files under static
    static_files = os.path.join(paths.get("static_files"), "static")
    self.static_md5 = {}
    if os.path.exists(static_files):
        for f in os.listdir(static_files):
            if f.endswith(".md5"):
                # slice off the suffix rather than using str.strip(),
                # which would also eat leading '.', 'm', 'd', '5' chars
                key = f[:-len(".md5")]
                f = os.path.join(static_files, f)
                with open(f, "r") as handle:
                    md5 = handle.read().strip("\n")
                self.static_md5[key] = md5

    # set up the logging directory
    log_path = self.log_path
    process_iden = global_conf.get("scgi_port", "default")
    if log_path:
        if not os.path.exists(log_path):
            os.makedirs(log_path)
        for fname in os.listdir(log_path):
            if fname.startswith(process_iden):
                full_name = os.path.join(log_path, fname)
                os.remove(full_name)

    # setup the logger
    self.log = logging.getLogger("digg")
    self.log.addHandler(logging.StreamHandler())
    if self.debug:
        self.log.setLevel(logging.DEBUG)
    else:
        self.log.setLevel(logging.WARNING)

    # set log level for pycountry which is chatty
    logging.getLogger("pycountry.db").setLevel(logging.CRITICAL)

    if not self.media_domain:
        self.media_domain = self.domain
    if self.media_domain == self.domain:
        print ("Warning: g.media_domain == g.domain. "
               "This may give untrusted content access to user cookies")
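    # The warning above matters because anything served from the same host
    # as the main site is same-origin with it: scripts in user-submitted
    # media could then read or set the site's user cookies. Serving media
    # from a separate media_domain keeps that content in a different origin.
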
    # read in our CSS so that it can become a default for subdigg
    # stylesheets
    stylesheet_path = os.path.join(paths.get("static_files"),
                                   self.static_path.lstrip("/"),
                                   self.stylesheet)
    with open(stylesheet_path) as s:
        self.default_stylesheet = s.read()

    self.digg_host = socket.gethostname()
    self.digg_pid = os.getpid()

    # the shutdown toggle
    self.shutdown = False

    # if we're going to use the query_queue, we need amqp
    if self.write_query_queue and not self.amqp_host:
        raise Exception("amqp_host must be defined to use the query queue")
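    # A minimal sketch of the .ini settings the check above assumes (the
    # exact values shown are hypothetical):
    #
    #     amqp_host = localhost:5672
    #     write_query_queue = true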