class CachedQueryMutator(object):
    """Groups cached-query updates into a single batch.

    Works as a context manager: the batch is flushed by send() when the
    ``with`` block exits.
    """

    def __init__(self):
        self.mutator = Mutator(CONNECTION_POOL)
        self.to_prune = set()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, tb):
        self.send()

    def insert(self, query, things):
        """Queue an insert of things into a non-precomputed query.

        Items already present simply have their sorts refreshed.
        Pruning of the query is occasionally scheduled, with probability
        PRUNE_CHANCE scaled down by the number of things inserted.
        """
        if not things:
            return

        LOG.debug("Inserting %r into query %r", things, query)

        assert not query.is_precomputed
        query._insert(self.mutator, things)

        # larger batches lower the per-call prune probability
        if random.random() / len(things) < PRUNE_CHANCE:
            self.to_prune.add(query)

    def delete(self, query, things):
        """Queue removal of things from the query."""
        if not things:
            return

        LOG.debug("Deleting %r from query %r", things, query)
        query._delete(self.mutator, things)

    def send(self):
        """Flush everything queued so far and prune marked queries.

        __exit__ calls this automatically when this object is used as a
        context manager.
        """
        self.mutator.send()

        if self.to_prune:
            LOG.debug("Pruning queries %r", self.to_prune)
            CachedQuery._prune_multi(self.to_prune)
def set_account_ip(account_id, ip, date=None):
    """Record that `ip` accessed `account_id`, in both lookup directions.

    Writes one column to IPsByAccount and one to AccountsByIP, each with
    a TTL of CF_TTL.  `date` defaults to the current time in g.tz.
    """
    when = date if date is not None else datetime.datetime.now(g.tz)
    account_key = str(account_id)

    batch = Mutator(CONNECTION_POOL)
    batch.insert(IPsByAccount._cf, account_key, {when: ip}, ttl=CF_TTL)
    batch.insert(AccountsByIP._cf, ip, {when: account_key}, ttl=CF_TTL)
    batch.send()
def set_account_ip(account_id, ip, date=None):
    """Record that `ip` accessed `account_id`, in both lookup directions.

    Private addresses are never stored; instead a stats event is emitted
    so the attempt can be alerted on.  `date` defaults to the current
    time in g.tz, and both writes carry a TTL of CF_TTL.
    """
    # don't store private IPs, send a graphite event so we can alert on this
    if ip_address(ip).is_private:
        g.stats.simple_event('ip.private_ip_storage_prevented')
        return

    when = datetime.datetime.now(g.tz) if date is None else date
    account_key = str(account_id)

    batch = Mutator(CONNECTION_POOL)
    batch.insert(IPsByAccount._cf, account_key, {when: ip}, ttl=CF_TTL)
    batch.insert(AccountsByIP._cf, ip, {when: account_key}, ttl=CF_TTL)
    batch.send()
class CachedQueryMutator(object):
    """Context manager that batches mutations to cached queries."""

    def __init__(self):
        self.mutator = Mutator(CONNECTION_POOL)
        self.to_prune = set()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, tb):
        self.send()

    def insert(self, query, things):
        """Queue things for insertion into the given cached query."""
        if not things:
            return

        LOG.debug("Inserting %r into query %r", things, query)
        query._insert(self.mutator, things)

        # occasionally mark the query for pruning; bigger inserts make
        # the per-call probability smaller
        if random.random() / len(things) < PRUNE_CHANCE:
            self.to_prune.add(query)

    def delete(self, query, things):
        """Queue things for removal from the given cached query."""
        if not things:
            return

        LOG.debug("Deleting %r from query %r", things, query)
        query._delete(self.mutator, things)

    def send(self):
        """Flush the batch, then prune any queries marked above."""
        self.mutator.send()

        if self.to_prune:
            LOG.debug("Pruning queries %r", self.to_prune)
            CachedQuery._prune_multi(self.to_prune)
class CachedQueryMutator(object):
    """Batches up changes to cached queries.

    Instances are context managers: leaving the ``with`` block calls
    send(), which flushes the batch and performs any pending pruning.
    """

    def __init__(self):
        self.mutator = Mutator(CONNECTION_POOL)
        self.to_prune = set()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, tb):
        self.send()

    def insert(self, query, things):
        """Add things to a non-precomputed cached query.

        Things already present just get their sorts refreshed.  With a
        probability controlled by PRUNE_CHANCE (divided by the batch
        size) the query is marked for pruning at send() time.
        """
        if not things:
            return

        LOG.debug("Inserting %r into query %r", things, query)

        assert not query.is_precomputed
        query._insert(self.mutator, things)

        if random.random() / len(things) < PRUNE_CHANCE:
            self.to_prune.add(query)

    def replace(self, query, things, ttl=None):
        """Swap out the full contents of a precomputed query.

        The query index is updated as well.  When a ttl (seconds, or a
        datetime.timedelta) is given it is applied to every column this
        writes, letting stale precomputed queries expire on their own.
        """
        assert query.is_precomputed

        if isinstance(ttl, datetime.timedelta):
            ttl = ttl.total_seconds()

        query._replace(self.mutator, things, ttl)

    def delete(self, query, things):
        """Take things out of the query."""
        if not things:
            return

        LOG.debug("Deleting %r from query %r", things, query)
        query._delete(self.mutator, things)

    def send(self):
        """Flush batched mutations and prune any marked queries.

        Called automatically by __exit__ when this is used as a context
        manager.
        """
        self.mutator.send()

        if self.to_prune:
            LOG.debug("Pruning queries %r", self.to_prune)
            CachedQuery._prune_multi(self.to_prune)
class CassandraDataStore(Delegate):
    """Cassandra-backed data store for the relationship/graph layer.

    Manages a pycassa connection pool, keyspace lifecycle, column-family
    creation and caching, and optional batched mutations via a pycassa
    batch Mutator.
    """

    def __init__(self, keyspace='agamemnon', server_list=['localhost:9160'],
                 replication_factor=1,
                 default_consistency_level=ConsistencyLevel.QUORUM,
                 create_keyspace=False, **kwargs):
        # NOTE(review): server_list's mutable default is shared across calls;
        # harmless while never mutated, but worth replacing with None + fallback.
        super(CassandraDataStore, self).__init__()
        self._keyspace = keyspace
        self._server_list = server_list
        self._replication_factor = replication_factor
        self._consistency_level = default_consistency_level
        # extra keyword args are forwarded to the ConnectionPool (see init_pool)
        self._pool_args = kwargs
        if create_keyspace:
            self.create()
        else:
            self.init_pool()

    def init_pool(self):
        """(Re)build the connection pool, reset caches and batch state,
        and ensure the core relationship column families exist."""
        # NOTE(review): _pool_args is passed positionally, so the kwargs dict
        # lands in ConnectionPool's third positional parameter instead of
        # being expanded as **kwargs — confirm this is intended.
        self._pool = pycassa.pool.ConnectionPool(self._keyspace,
                                                 self._server_list,
                                                 self._pool_args)
        self._cf_cache = {}
        self._index_cache = {}
        # batching state: _batch is the active Mutator (if any); batch_count
        # tracks nested start_batch/commit_batch pairs
        self._batch = None
        self.in_batch = False
        self.batch_count = 0
        if not self.cf_exists(OUTBOUND_RELATIONSHIP_CF):
            self.create_cf(OUTBOUND_RELATIONSHIP_CF, super=True)
        if not self.cf_exists(INBOUND_RELATIONSHIP_CF):
            self.create_cf(INBOUND_RELATIONSHIP_CF, super=True)
        if not self.cf_exists(RELATIONSHIP_INDEX):
            self.create_cf(RELATIONSHIP_INDEX, super=True)
        if not self.cf_exists(RELATIONSHIP_CF):
            self.create_cf(RELATIONSHIP_CF, super=False)

    @property
    def system_manager(self):
        """Return a SystemManager for the first reachable server.

        Raises CassandraClusterNotFoundException when no server in
        server_list accepts a connection.
        """
        for server in self._server_list:
            try:
                return pycassa.system_manager.SystemManager(server)
            except TTransportException as e:
                # try the next server in the list
                log.warning("Could not connect to Cassandra server {0}".format(server))
        raise CassandraClusterNotFoundException("Could not connect to any Cassandra server in list")

    @property
    def keyspace(self):
        # name of the keyspace this store operates on
        return self._keyspace

    def create(self):
        """Create the keyspace if it is missing, then initialize the pool."""
        if self._keyspace not in self.system_manager.list_keyspaces():
            strategy_options = {
                'replication_factor': str(self._replication_factor)
            }
            self.system_manager.create_keyspace(self._keyspace,
                                                strategy_options=strategy_options)
        self.init_pool()

    def drop(self):
        """Drop the keyspace and dispose of the connection pool."""
        self.system_manager.drop_keyspace(self._keyspace)
        self._pool.dispose()
        self._pool = None

    def truncate(self):
        """Drop everything (ignoring a missing keyspace) and recreate it."""
        try:
            self.drop()
        except InvalidRequestException:
            # keyspace didn't exist; nothing to drop
            pass
        self.create()
        # NOTE(review): create() already calls init_pool(); this second call
        # looks redundant — confirm before removing.
        self.init_pool()

    def get_count(self, type, row, columns=None, column_start=None,
                  super_column=None, column_finish=None):
        """Return the number of columns in `row` of column family `type`.

        Only the filter kwargs actually supplied are forwarded to pycassa.
        """
        args = {}
        if columns is not None:
            args['columns'] = columns
        if column_start is not None:
            args['column_start'] = column_start
        if column_finish is not None:
            args['column_finish'] = column_finish
        if super_column is not None:
            args['super_column'] = super_column
        return self.get_cf(type).get_count(row, **args)

    def create_cf(self, type, column_type=pycassa.system_manager.ASCII_TYPE,
                  super=False, index_columns=list()):
        """Create column family `type` plus any secondary indices, and
        return a ColumnFamily handle bound to the configured consistency
        levels.

        NOTE(review): index_columns uses a mutable default; it is only
        iterated here, but a tuple default would be safer.
        """
        self.system_manager.create_column_family(self._keyspace, type,
                                                 super=super,
                                                 comparator_type=column_type)
        for column in index_columns:
            self.create_secondary_index(type, column, column_type)
        return cf.ColumnFamily(self._pool, type, autopack_names=False,
                               autopack_values=False,
                               read_consistency_level=self._consistency_level,
                               write_consistency_level=self._consistency_level)

    def create_secondary_index(self, type, column,
                               column_type=pycassa.system_manager.ASCII_TYPE):
        """Create a secondary index named '<type>_<column>_index'."""
        self.system_manager.create_index(self._keyspace, type, column,
                                         column_type,
                                         index_name='%s_%s_index' % (type, column))

    def cf_exists(self, type):
        """Return True if column family `type` exists.

        A cached handle counts as existence; otherwise existence is probed
        by attempting to open the column family.
        """
        if type in self._cf_cache:
            return True
        try:
            cf.ColumnFamily(self._pool, type, autopack_names=False,
                            autopack_values=False,
                            read_consistency_level=self._consistency_level,
                            write_consistency_level=self._consistency_level)
        except NotFoundException:
            return False
        return True

    def get_cf(self, type, create=True):
        """Return a (cached) ColumnFamily handle for `type`.

        When the column family does not exist and `create` is true, it is
        created; otherwise None is returned.
        """
        column_family = None
        if type in self._cf_cache:
            return self._cf_cache[type]
        try:
            column_family = cf.ColumnFamily(self._pool, type,
                                            autopack_names=False,
                                            autopack_values=False,
                                            read_consistency_level=self._consistency_level,
                                            write_consistency_level=self._consistency_level)
            self._cf_cache[type] = column_family
        except NotFoundException:
            if create:
                column_family = self.create_cf(type)
        return column_family

    def insert(self, column_family, key, columns):
        """Write `columns` to `key`; queued on the batch when one is open,
        otherwise sent immediately via a throwaway Mutator."""
        if self._batch is not None:
            self._batch.insert(column_family, key, columns)
        else:
            with Mutator(self._pool) as b:
                b.insert(column_family, key, columns)

    def remove(self, column_family, key, columns=None, super_column=None):
        """Delete columns (or the whole row when columns is None); queued on
        the batch when one is open, otherwise applied immediately."""
        if self._batch is not None:
            self._batch.remove(column_family, key, columns=columns,
                               super_column=super_column)
        else:
            column_family.remove(key, columns=columns,
                                 super_column=super_column)

    def start_batch(self, queue_size=0):
        """Open (or nest into) a batch; writes queue until commit_batch."""
        if self._batch is None:
            self.in_batch = True
            self._batch = Mutator(self._pool, queue_size)
        self.batch_count += 1

    def commit_batch(self):
        """Close one nesting level; send the batch when the last one closes."""
        self.batch_count -= 1
        if not self.batch_count:
            self._batch.send()
            self._batch = None
class CassandraDataStore(object):
    """Cassandra data store built on an externally supplied pycassa pool
    and system manager.

    Caches ColumnFamily handles and supports batched writes through
    nested start_batch/commit_batch calls.
    """

    def __init__(self, keyspace, pool, system_manager):
        # caches of ColumnFamily handles / secondary indices, keyed by name
        self._cf_cache = {}
        self._index_cache = {}
        self._system_manager = system_manager
        self._pool = pool
        self._keyspace = keyspace
        # batching state: _batch is the active Mutator (if any); batch_count
        # tracks nested start_batch/commit_batch pairs
        self._batch = None
        self.in_batch = False
        self.batch_count = 0
        # ensure the relationship column families exist
        if not self.cf_exists(OUTBOUND_RELATIONSHIP_CF):
            self.create_cf(OUTBOUND_RELATIONSHIP_CF, super=True)
        if not self.cf_exists(INBOUND_RELATIONSHIP_CF):
            self.create_cf(INBOUND_RELATIONSHIP_CF, super=True)
        if not self.cf_exists(RELATIONSHIP_INDEX):
            self.create_cf(RELATIONSHIP_INDEX, super=True)

    def get_count(self, type, row, columns=None, column_start=None,
                  super_column=None, column_finish=None):
        """Return the number of columns in `row` of column family `type`.

        Only the filter kwargs actually supplied are forwarded to pycassa.
        """
        args = {}
        if columns is not None:
            args['columns'] = columns
        if column_start is not None:
            args['column_start'] = column_start
        if column_finish is not None:
            args['column_finish'] = column_finish
        if super_column is not None:
            args['super_column'] = super_column
        return self.get_cf(type).get_count(row, **args)

    def create_cf(self, type, column_type=system_manager.ASCII_TYPE,
                  super=False, index_columns=()):
        """Create column family `type` plus any secondary indices and
        return a ColumnFamily handle for it.

        index_columns previously defaulted to a shared mutable list; it
        is only iterated, so an immutable tuple default is equivalent
        and safe.
        """
        self._system_manager.create_column_family(self._keyspace, type,
                                                  super=super,
                                                  comparator_type=column_type)
        for column in index_columns:
            self._system_manager.create_index(
                self._keyspace, type, column, column_type,
                index_name='%s_%s_index' % (type, column))
        return cf.ColumnFamily(self._pool, type, autopack_names=False,
                               autopack_values=False)

    def cf_exists(self, type):
        """Return True if column family `type` exists (a cached handle
        counts; otherwise existence is probed by opening it)."""
        if type in self._cf_cache:
            return True
        try:
            cf.ColumnFamily(self._pool, type, autopack_names=False,
                            autopack_values=False)
        except NotFoundException:
            return False
        return True

    def get_cf(self, type, create=True):
        """Return a (cached) ColumnFamily handle for `type`, creating the
        column family when missing and `create` is true; else None."""
        if type in self._cf_cache:
            return self._cf_cache[type]
        column_family = None
        try:
            column_family = cf.ColumnFamily(self._pool, type,
                                            autopack_names=False,
                                            autopack_values=False)
            self._cf_cache[type] = column_family
        except NotFoundException:
            if create:
                column_family = self.create_cf(type)
        return column_family

    def insert(self, column_family, key, columns):
        """Write `columns` to `key`, batched if a batch is active.

        Bug fix: the immediate-write path was missing its ``else`` — when
        a batch was active, every insert was both queued on the batch AND
        written immediately through a throwaway Mutator, duplicating the
        write (the sibling implementations of this class have the else).
        """
        if self._batch is not None:
            self._batch.insert(column_family, key, columns)
        else:
            with Mutator(self._pool) as b:
                b.insert(column_family, key, columns)

    def remove(self, column_family, key, columns=None, super_column=None):
        """Delete columns (or the whole row when columns is None), batched
        if a batch is active."""
        if self._batch is not None:
            self._batch.remove(column_family, key, columns=columns,
                               super_column=super_column)
        else:
            column_family.remove(key, columns=columns,
                                 super_column=super_column)

    def start_batch(self):
        """Open (or nest into) a batch; writes queue until commit_batch."""
        if self._batch is None:
            self.in_batch = True
            self._batch = Mutator(self._pool, 0)
        self.batch_count += 1

    def commit_batch(self):
        """Close one nesting level; send the batch when the last closes."""
        self.batch_count -= 1
        if not self.batch_count:
            self._batch.send()
            self._batch = None
def parse_logs(self, build_ids):
    """Parse the logs for the specified build IDs into storage.

    Generator: yields a progress string per build parsed.  For each
    build it writes a per-step timeline row, a step-name index, counter
    updates, and finally stamps the build with the parser version so it
    is skipped on later runs.
    """
    # TODO hook up parallel processing.

    OUR_VERSION = '1'
    mut = Mutator(self._pool)
    cf = ColumnFamily(self._pool, 'build_timelines')
    i_cf = ColumnFamily(self._pool, 'indices')
    builds_cf = ColumnFamily(self._pool, 'builds')
    counters = ColumnFamily(self._pool, 'counters')
    super_counters = ColumnFamily(self._pool, 'super_counters')

    for build_id in build_ids:
        info = self._connection.build_from_id(build_id)
        if not info:
            continue

        # skip builds already parsed by this (or a newer) parser version
        # NOTE(review): versions compare as strings here; lexicographic
        # ordering breaks once versions pass '9' — confirm intended.
        existing_version = info.get('log_parsing_version')
        if existing_version and existing_version >= OUR_VERSION:
            continue

        # only parse logs that have been fully fetched
        if info['log_fetch_status'] != 'fetched':
            continue

        log = self._connection.file_data(info['log_url'])
        if not log:
            continue

        parsed = parse_build_log(log)
        cat = info['builder_category']

        # cols: timeline row keyed by step start time (epoch seconds)
        # indices: step name -> build id, for reverse lookup
        cols = {}
        indices = {}
        for step in parsed.steps:
            start = calendar.timegm(step.start.utctimetuple())
            end = calendar.timegm(step.end.utctimetuple())
            elapsed = end - start
            name = step.name

            cols[start] = {
                'name': name,
                'state': step.state,
                'results': step.results,
                'start': unicode(start),
                'end': unicode(end),
                'elapsed': unicode(elapsed),
            }

            start_date = step.start.date().isoformat()
            indices[name] = {build_id: ''}

            # aggregate step counts/durations globally, by builder
            # category, by day, and by day+category
            counters.add('build_step_number', name)
            counters.add('build_step_duration', name, elapsed)
            super_counters.add('build_step_number_by_category', name, 1, cat)
            super_counters.add('build_step_duration_by_category', name,
                               elapsed, cat)
            super_counters.add('build_step_number_by_day', name, 1,
                               start_date)
            super_counters.add('build_step_duration_by_day', name, elapsed,
                               start_date)
            day_cat = '%s.%s' % (start_date, cat)
            super_counters.add('build_step_number_by_day_and_category', name,
                               1, day_cat)
            super_counters.add('build_step_duration_by_day_and_category',
                               name, elapsed, day_cat)

        mut.insert(cf, build_id, cols)
        mut.insert(i_cf, 'build_step_name_to_build_ids', indices)
        # mark the build as parsed at this version
        mut.insert(builds_cf, build_id, {'log_parsing_version': OUR_VERSION})

        yield 'Parsed build %s into %d steps.' % (build_id, len(parsed.steps))

    mut.send()
test_cardinalities = [1, 2, 3, 4, 5] test_cardinalities_multiplier = 1000 line = "-" * 62 print line print "| %5s | %10s | %10s | %10s | %10s |" % ("bits", "card", "estim", "diff", "diff") print line for card in test_cardinalities: x = CubicHyperLogLogCassandra(cf, "my_counter_test", 9, mutator=mut) x.clear() for i in range(card): print i for j in range(test_cardinalities_multiplier): x.add(str(i) + "-" + str(j)) mut.send() x.load() card = card * test_cardinalities_multiplier card2 = len(x) perc = float(card - card2) / card * 100 print "| %5d | %10d | %10d | %10d | %10.2f%% |" % (x.m, card, card2, card - card2, perc) print "Bloomfilter test", ("Niki" in x), ("Peter Peterson" in x), ("123-123" in x)
class CassandraDataStore(Delegate):
    """Cassandra-backed data store using a single, eagerly created
    SystemManager.

    Manages a pycassa connection pool, keyspace lifecycle, column-family
    creation and caching, and optional batched mutations.
    """

    def __init__(self, keyspace='agamemnon', server_list=['localhost:9160'],
                 replication_factor=1, create_keyspace=False, **kwargs):
        # NOTE(review): server_list's mutable default is shared across calls;
        # harmless while never mutated, but worth replacing with None + fallback.
        super(CassandraDataStore, self).__init__()
        self._keyspace = keyspace
        self._server_list = server_list
        self._replication_factor = replication_factor
        # extra keyword args are forwarded to the ConnectionPool (see init_pool)
        self._pool_args = kwargs
        # only the first server in the list is used for schema operations
        self._system_manager = pycassa.system_manager.SystemManager(
            server_list[0])
        if create_keyspace:
            self.create()
        else:
            self.init_pool()

    def init_pool(self):
        """(Re)build the connection pool, reset caches and batch state,
        and ensure the core relationship column families exist."""
        # NOTE(review): _pool_args is passed positionally, so the kwargs dict
        # lands in ConnectionPool's third positional parameter rather than
        # being expanded as **kwargs — confirm this is intended.
        self._pool = pycassa.pool.ConnectionPool(self._keyspace,
                                                 self._server_list,
                                                 self._pool_args)
        self._cf_cache = {}
        self._index_cache = {}
        # batching state: _batch is the active Mutator (if any); batch_count
        # tracks nested start_batch/commit_batch pairs
        self._batch = None
        self.in_batch = False
        self.batch_count = 0
        if not self.cf_exists(OUTBOUND_RELATIONSHIP_CF):
            self.create_cf(OUTBOUND_RELATIONSHIP_CF, super=True)
        if not self.cf_exists(INBOUND_RELATIONSHIP_CF):
            self.create_cf(INBOUND_RELATIONSHIP_CF, super=True)
        if not self.cf_exists(RELATIONSHIP_INDEX):
            self.create_cf(RELATIONSHIP_INDEX, super=True)
        if not self.cf_exists(RELATIONSHIP_CF):
            self.create_cf(RELATIONSHIP_CF, super=False)

    @property
    def system_manager(self):
        # the SystemManager created in __init__ for the first listed server
        return self._system_manager

    @property
    def keyspace(self):
        # name of the keyspace this store operates on
        return self._keyspace

    def create(self):
        """Create the keyspace if it is missing, then initialize the pool."""
        if self._keyspace not in self._system_manager.list_keyspaces():
            strategy_options = {
                'replication_factor': str(self._replication_factor)
            }
            self._system_manager.create_keyspace(
                self._keyspace, strategy_options=strategy_options)
        self.init_pool()

    def drop(self):
        """Drop the keyspace and dispose of the connection pool."""
        self._system_manager.drop_keyspace(self._keyspace)
        self._pool.dispose()
        self._pool = None

    def truncate(self):
        """Drop everything (ignoring a missing keyspace) and recreate it."""
        try:
            self.drop()
        except InvalidRequestException:
            # keyspace didn't exist; nothing to drop
            pass
        self.create()
        # NOTE(review): create() already calls init_pool(); this second call
        # looks redundant — confirm before removing.
        self.init_pool()

    def get_count(self, type, row, columns=None, column_start=None,
                  super_column=None, column_finish=None):
        """Return the number of columns in `row` of column family `type`.

        Only the filter kwargs actually supplied are forwarded to pycassa.
        """
        args = {}
        if columns is not None:
            args['columns'] = columns
        if column_start is not None:
            args['column_start'] = column_start
        if column_finish is not None:
            args['column_finish'] = column_finish
        if super_column is not None:
            args['super_column'] = super_column
        return self.get_cf(type).get_count(row, **args)

    def create_cf(self, type, column_type=pycassa.system_manager.ASCII_TYPE,
                  super=False, index_columns=list()):
        """Create column family `type` plus any secondary indices and
        return a ColumnFamily handle for it.

        NOTE(review): index_columns uses a mutable default; it is only
        iterated here, but a tuple default would be safer.
        """
        self._system_manager.create_column_family(self._keyspace, type,
                                                  super=super,
                                                  comparator_type=column_type)
        for column in index_columns:
            self.create_secondary_index(type, column, column_type)
        return cf.ColumnFamily(self._pool, type, autopack_names=False,
                               autopack_values=False)

    def create_secondary_index(self, type, column,
                               column_type=pycassa.system_manager.ASCII_TYPE):
        """Create a secondary index named '<type>_<column>_index'."""
        self._system_manager.create_index(self._keyspace, type, column,
                                          column_type,
                                          index_name='%s_%s_index' % (type, column))

    def cf_exists(self, type):
        """Return True if column family `type` exists (a cached handle
        counts; otherwise existence is probed by opening it)."""
        if type in self._cf_cache:
            return True
        try:
            cf.ColumnFamily(self._pool, type, autopack_names=False,
                            autopack_values=False)
        except NotFoundException:
            return False
        return True

    def get_cf(self, type, create=True):
        """Return a (cached) ColumnFamily handle for `type`, creating the
        column family when missing and `create` is true; else None."""
        column_family = None
        if type in self._cf_cache:
            return self._cf_cache[type]
        try:
            column_family = cf.ColumnFamily(self._pool, type,
                                            autopack_names=False,
                                            autopack_values=False)
            self._cf_cache[type] = column_family
        except NotFoundException:
            if create:
                column_family = self.create_cf(type)
        return column_family

    def insert(self, column_family, key, columns):
        """Write `columns` to `key`; queued on the batch when one is open,
        otherwise sent immediately via a throwaway Mutator."""
        if self._batch is not None:
            self._batch.insert(column_family, key, columns)
        else:
            with Mutator(self._pool) as b:
                b.insert(column_family, key, columns)

    def remove(self, column_family, key, columns=None, super_column=None):
        """Delete columns (or the whole row when columns is None); queued on
        the batch when one is open, otherwise applied immediately."""
        if self._batch is not None:
            self._batch.remove(column_family, key, columns=columns,
                               super_column=super_column)
        else:
            column_family.remove(key, columns=columns,
                                 super_column=super_column)

    def start_batch(self, queue_size=0):
        """Open (or nest into) a batch; writes queue until commit_batch."""
        if self._batch is None:
            self.in_batch = True
            self._batch = Mutator(self._pool, queue_size)
        self.batch_count += 1

    def commit_batch(self):
        """Close one nesting level; send the batch when the last one closes."""
        self.batch_count -= 1
        if not self.batch_count:
            self._batch.send()
            self._batch = None