class CachedQueryMutator(object): """Utility to manipulate cached queries with batching. This implements the context manager protocol so it can be used with the with statement for clean batches. """ def __init__(self): self.mutator = Mutator(CONNECTION_POOL) self.to_prune = set() def __enter__(self): return self def __exit__(self, type, value, traceback): self.send() def insert(self, query, things): """Insert items into the given cached query. If the items are already in the query, they will have their sorts updated. This will sometimes trigger pruning with a configurable probability (see g.querycache_prune_chance). """ if not things: return LOG.debug("Inserting %r into query %r", things, query) assert not query.is_precomputed query._insert(self.mutator, things) if (random.random() / len(things)) < PRUNE_CHANCE: self.to_prune.add(query) def delete(self, query, things): """Remove things from the query.""" if not things: return LOG.debug("Deleting %r from query %r", things, query) query._delete(self.mutator, things) def send(self): """Commit the mutations batched up so far and potentially do pruning. This is automatically called by __exit__ when used as a context manager. """ self.mutator.send() if self.to_prune: LOG.debug("Pruning queries %r", self.to_prune) CachedQuery._prune_multi(self.to_prune)
class CachedQueryMutator(object):
    def __init__(self):
        self.mutator = Mutator(CONNECTION_POOL)
        self.to_prune = set()

    def __enter__(self):
        self.mutator.__enter__()
        return self

    def __exit__(self, type, value, traceback):
        self.mutator.__exit__(type, value, traceback)

        if self.to_prune:
            CachedQuery._prune_multi(self.to_prune)

    def insert(self, query, things):
        if not things:
            return

        query._insert(self.mutator, things)

        if (random.random() / len(things)) < PRUNE_CHANCE:
            self.to_prune.add(query)

    def delete(self, query, things):
        if not things:
            return

        query._delete(self.mutator, things)
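Both variants are designed to be driven through the with statement, so callers never touch the underlying Mutator directly: inserts and deletes are queued on the shared Mutator and flushed once when the block exits. A minimal usage sketch, where `cached_query` and `links` are hypothetical stand-ins for a CachedQuery instance and the Thing-like objects it stores:

def add_to_cached_query(cached_query, links):
    # everything queued inside the block is sent in one batch on exit,
    # and any queries marked for pruning are pruned afterwards
    with CachedQueryMutator() as m:
        m.insert(cached_query, links)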
def set_account_ip(account_id, ip, date=None):
    """Set an IP address as having accessed an account.

    Updates all underlying datastores.
    """
    if date is None:
        date = datetime.datetime.now(g.tz)
    m = Mutator(CONNECTION_POOL)
    m.insert(IPsByAccount._cf, str(account_id), {date: ip}, ttl=CF_TTL)
    m.insert(AccountsByIP._cf, ip, {date: str(account_id)}, ttl=CF_TTL)
    m.send()
def set_account_ip(account_id, ip, date=None):
    """Set an IP address as having accessed an account.

    Updates all underlying datastores.
    """
    # don't store private IPs, send a graphite event so we can alert on this
    if ip_address(ip).is_private:
        g.stats.simple_event('ip.private_ip_storage_prevented')
        return

    if date is None:
        date = datetime.datetime.now(g.tz)
    m = Mutator(CONNECTION_POOL)
    m.insert(IPsByAccount._cf, str(account_id), {date: ip}, ttl=CF_TTL)
    m.insert(AccountsByIP._cf, ip, {date: str(account_id)}, ttl=CF_TTL)
    m.send()
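This version additionally refuses to record private addresses. The guard relies on ip_address(...).is_private from the ipaddress module (available as a backport on Python 2), which covers the RFC 1918 and loopback ranges. A small sketch of the expected behavior, with a made-up account id:

from ipaddress import ip_address

# addresses the guard would drop vs. store
assert ip_address(u'192.168.1.10').is_private      # RFC 1918, rejected
assert not ip_address(u'8.8.8.8').is_private       # public, accepted

set_account_ip(1234, '8.8.8.8')        # written to both column families
set_account_ip(1234, '192.168.1.10')   # dropped; only a stats event is emitted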
def _prune_multi(cls, queries):
    cls._fetch_multi(queries)

    with Mutator(CONNECTION_POOL) as m:
        for q in queries:
            q._sort_data()
            q._prune(m)
def save_log(self, application, host, severity, timestamp, message):
    """
    Saves a log message.

    Raises:
    - DaedalusException if any parameter isn't valid.
    """
    _check_application(application)
    _check_severity(severity)
    _check_host(host)
    _check_message(message)
    try:
        timestamp = float(timestamp)
    except (TypeError, ValueError):
        raise DaedalusException(
            "The timestamp '{0}' couldn't be transformed to a float".format(
                timestamp))

    event_uuid = convert_time_to_uuid(timestamp, randomize=True)
    _id = event_uuid.get_hex()
    json_message = json.dumps({
        'application': application,
        'host': host,
        'severity': severity,
        'timestamp': timestamp,
        '_id': _id,
        'message': message,
    })

    pool = self._get_pool()
    with Mutator(pool) as batch:
        # Save on CF_LOGS, keyed by the day derived from the event's UUID
        row_key = ymd_from_uuid1(event_uuid)
        batch.insert(self._get_cf_logs(), str(row_key), {
            event_uuid: json_message,
        })

        # Save on the CF_LOGS_BY_APP index
        batch.insert(self._get_cf_logs_by_app(), application, {
            event_uuid: EMPTY_VALUE,
        })

        # Save on the CF_LOGS_BY_HOST index
        batch.insert(self._get_cf_logs_by_host(), host, {
            event_uuid: EMPTY_VALUE,
        })

        # Save on the CF_LOGS_BY_SEVERITY index
        batch.insert(self._get_cf_logs_by_severity(), severity, {
            event_uuid: EMPTY_VALUE,
        })
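A sketch of a call into save_log; `storage` stands in for an instance of the surrounding Daedalus storage class, and all field values here are illustrative:

import time

storage.save_log(
    application='web-frontend',
    host='app01.example.com',
    severity='ERROR',
    timestamp=time.time(),   # anything float() accepts
    message='upstream request timed out',
)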
class CachedQueryMutator(object):
    def __init__(self):
        self.mutator = Mutator(CONNECTION_POOL)
        self.to_prune = set()

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        self.send()

    def insert(self, query, things):
        if not things:
            return

        LOG.debug("Inserting %r into query %r", things, query)

        query._insert(self.mutator, things)

        if (random.random() / len(things)) < PRUNE_CHANCE:
            self.to_prune.add(query)

    def delete(self, query, things):
        if not things:
            return

        LOG.debug("Deleting %r from query %r", things, query)

        query._delete(self.mutator, things)

    def send(self):
        self.mutator.send()

        if self.to_prune:
            LOG.debug("Pruning queries %r", self.to_prune)
            CachedQuery._prune_multi(self.to_prune)
def start_batch(self, queue_size=0):
    if self._batch is None:
        self.in_batch = True
        self._batch = Mutator(self._pool, queue_size)
    self.batch_count += 1
def __init__(self):
    self.mutator = Mutator(CONNECTION_POOL)
    self.to_prune = set()
class CassandraDataStore(Delegate):
    def __init__(self, keyspace='agamemnon', server_list=['localhost:9160'],
                 replication_factor=1, create_keyspace=False, **kwargs):
        super(CassandraDataStore, self).__init__()
        self._keyspace = keyspace
        self._server_list = server_list
        self._replication_factor = replication_factor
        self._pool_args = kwargs
        self._system_manager = pycassa.system_manager.SystemManager(
            server_list[0])
        if create_keyspace:
            self.create()
        else:
            self.init_pool()

    def init_pool(self):
        self._pool = pycassa.pool.ConnectionPool(self._keyspace,
                                                 self._server_list,
                                                 **self._pool_args)
        self._cf_cache = {}
        self._index_cache = {}
        self._batch = None
        self.in_batch = False
        self.batch_count = 0
        if not self.cf_exists(OUTBOUND_RELATIONSHIP_CF):
            self.create_cf(OUTBOUND_RELATIONSHIP_CF, super=True)
        if not self.cf_exists(INBOUND_RELATIONSHIP_CF):
            self.create_cf(INBOUND_RELATIONSHIP_CF, super=True)
        if not self.cf_exists(RELATIONSHIP_INDEX):
            self.create_cf(RELATIONSHIP_INDEX, super=True)
        if not self.cf_exists(RELATIONSHIP_CF):
            self.create_cf(RELATIONSHIP_CF, super=False)

    @property
    def system_manager(self):
        return self._system_manager

    @property
    def keyspace(self):
        return self._keyspace

    def create(self):
        if self._keyspace not in self._system_manager.list_keyspaces():
            strategy_options = {
                'replication_factor': str(self._replication_factor)
            }
            self._system_manager.create_keyspace(
                self._keyspace, strategy_options=strategy_options)
        self.init_pool()

    def drop(self):
        self._system_manager.drop_keyspace(self._keyspace)
        self._pool.dispose()
        self._pool = None

    def truncate(self):
        try:
            self.drop()
        except InvalidRequestException:
            pass
        self.create()
        self.init_pool()

    def get_count(self, type, row, columns=None, column_start=None,
                  super_column=None, column_finish=None):
        args = {}
        if columns is not None:
            args['columns'] = columns
        if column_start is not None:
            args['column_start'] = column_start
        if column_finish is not None:
            args['column_finish'] = column_finish
        if super_column is not None:
            args['super_column'] = super_column
        return self.get_cf(type).get_count(row, **args)

    def create_cf(self, type, column_type=pycassa.system_manager.ASCII_TYPE,
                  super=False, index_columns=list()):
        self._system_manager.create_column_family(self._keyspace, type,
                                                  super=super,
                                                  comparator_type=column_type)
        for column in index_columns:
            self.create_secondary_index(type, column, column_type)
        return cf.ColumnFamily(self._pool, type, autopack_names=False,
                               autopack_values=False)

    def create_secondary_index(self, type, column,
                               column_type=pycassa.system_manager.ASCII_TYPE):
        self._system_manager.create_index(self._keyspace, type, column,
                                          column_type,
                                          index_name='%s_%s_index'
                                                     % (type, column))

    def cf_exists(self, type):
        if type in self._cf_cache:
            return True
        try:
            cf.ColumnFamily(self._pool, type, autopack_names=False,
                            autopack_values=False)
        except NotFoundException:
            return False
        return True

    def get_cf(self, type, create=True):
        column_family = None
        if type in self._cf_cache:
            return self._cf_cache[type]
        try:
            column_family = cf.ColumnFamily(self._pool, type,
                                            autopack_names=False,
                                            autopack_values=False)
            self._cf_cache[type] = column_family
        except NotFoundException:
            if create:
                column_family = self.create_cf(type)
        return column_family

    def insert(self, column_family, key, columns):
        if self._batch is not None:
            self._batch.insert(column_family, key, columns)
        else:
            with Mutator(self._pool) as b:
                b.insert(column_family, key, columns)

    def remove(self, column_family, key, columns=None, super_column=None):
        if self._batch is not None:
            self._batch.remove(column_family, key, columns=columns,
                               super_column=super_column)
        else:
            column_family.remove(key, columns=columns,
                                 super_column=super_column)

    def start_batch(self, queue_size=0):
        if self._batch is None:
            self.in_batch = True
            self._batch = Mutator(self._pool, queue_size)
        self.batch_count += 1

    def commit_batch(self):
        self.batch_count -= 1
        if not self.batch_count:
            self._batch.send()
            self._batch = None
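start_batch() and commit_batch() form a reference-counted pair: nested start_batch() calls reuse the one Mutator and only bump batch_count, and nothing is written until the outermost commit_batch() drives the count back to zero and calls send(). A sketch, assuming `store` is a CassandraDataStore and 'my_type' a hypothetical column family name:

store.start_batch()                          # batch_count 0 -> 1, Mutator made
store.start_batch()                          # batch_count 1 -> 2, same Mutator
cf = store.get_cf('my_type')
store.insert(cf, 'row1', {'col': 'val'})     # queued, nothing written yet
store.commit_batch()                         # batch_count 2 -> 1, still queued
store.commit_batch()                         # batch_count 1 -> 0, send() runs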
class CachedQueryMutator(object): """Utility to manipulate cached queries with batching. This implements the context manager protocol so it can be used with the with statement for clean batches. """ def __init__(self): self.mutator = Mutator(CONNECTION_POOL) self.to_prune = set() def __enter__(self): return self def __exit__(self, type, value, traceback): self.send() def insert(self, query, things): """Insert items into the given cached query. If the items are already in the query, they will have their sorts updated. This will sometimes trigger pruning with a configurable probability (see g.querycache_prune_chance). """ if not things: return LOG.debug("Inserting %r into query %r", things, query) assert not query.is_precomputed query._insert(self.mutator, things) if (random.random() / len(things)) < PRUNE_CHANCE: self.to_prune.add(query) def replace(self, query, things, ttl=None): """Replace a precomputed query with a new set of things. The query index will be updated. If a TTL is specified, it will be applied to all columns generated by this action allowing old precomputed queries to fall away after they're no longer useful. """ assert query.is_precomputed if isinstance(ttl, datetime.timedelta): ttl = ttl.total_seconds() query._replace(self.mutator, things, ttl) def delete(self, query, things): """Remove things from the query.""" if not things: return LOG.debug("Deleting %r from query %r", things, query) query._delete(self.mutator, things) def send(self): """Commit the mutations batched up so far and potentially do pruning. This is automatically called by __exit__ when used as a context manager. """ self.mutator.send() if self.to_prune: LOG.debug("Pruning queries %r", self.to_prune) CachedQuery._prune_multi(self.to_prune)
def update(self):
    things = list(self.query)

    with Mutator(CONNECTION_POOL) as m:
        self.model.remove(m, self.key, None)  # empty the whole row
        self._insert(m, things)
#!/usr/bin/python
#
# PyCassa test
#

from cubichyperloglog import CubicHyperLogLogCassandra
from pycassa.pool import ConnectionPool
from pycassa.columnfamily import ColumnFamily
from pycassa.batch import Mutator

pool = ConnectionPool("test", ["localhost:9160"])
cf = ColumnFamily(pool, "hll")
mut = Mutator(pool, 5000)

test_cardinalities = [1, 2, 3, 4, 5]
test_cardinalities_multiplier = 1000

line = "-" * 62
print line
print "| %5s | %10s | %10s | %10s | %10s |" % ("bits", "card", "estim",
                                               "diff", "diff")
print line

for card in test_cardinalities:
    x = CubicHyperLogLogCassandra(cf, "my_counter_test", 9, mutator=mut)
    x.clear()
    for i in range(card):
def start_batch(self, queue_size=0):
    if self._batch is None:
        self.in_batch = True
        self._batch = Mutator(self._pool, queue_size)
    self.batch_count += 1
class CassandraDataStore(Delegate):
    def __init__(self, keyspace='agamemnon', server_list=['localhost:9160'],
                 replication_factor=1,
                 default_consistency_level=ConsistencyLevel.QUORUM,
                 create_keyspace=False, **kwargs):
        super(CassandraDataStore, self).__init__()
        self._keyspace = keyspace
        self._server_list = server_list
        self._replication_factor = replication_factor
        self._consistency_level = default_consistency_level
        self._pool_args = kwargs
        if create_keyspace:
            self.create()
        else:
            self.init_pool()

    def init_pool(self):
        self._pool = pycassa.pool.ConnectionPool(self._keyspace,
                                                 self._server_list,
                                                 **self._pool_args)
        self._cf_cache = {}
        self._index_cache = {}
        self._batch = None
        self.in_batch = False
        self.batch_count = 0
        if not self.cf_exists(OUTBOUND_RELATIONSHIP_CF):
            self.create_cf(OUTBOUND_RELATIONSHIP_CF, super=True)
        if not self.cf_exists(INBOUND_RELATIONSHIP_CF):
            self.create_cf(INBOUND_RELATIONSHIP_CF, super=True)
        if not self.cf_exists(RELATIONSHIP_INDEX):
            self.create_cf(RELATIONSHIP_INDEX, super=True)
        if not self.cf_exists(RELATIONSHIP_CF):
            self.create_cf(RELATIONSHIP_CF, super=False)

    @property
    def system_manager(self):
        for server in self._server_list:
            try:
                return pycassa.system_manager.SystemManager(server)
            except TTransportException:
                log.warning(
                    "Could not connect to Cassandra server {0}".format(server))
        raise CassandraClusterNotFoundException(
            "Could not connect to any Cassandra server in list")

    @property
    def keyspace(self):
        return self._keyspace

    def create(self):
        if self._keyspace not in self.system_manager.list_keyspaces():
            strategy_options = {
                'replication_factor': str(self._replication_factor)
            }
            self.system_manager.create_keyspace(
                self._keyspace, strategy_options=strategy_options)
        self.init_pool()

    def drop(self):
        self.system_manager.drop_keyspace(self._keyspace)
        self._pool.dispose()
        self._pool = None

    def truncate(self):
        try:
            self.drop()
        except InvalidRequestException:
            pass
        self.create()
        self.init_pool()

    def get_count(self, type, row, columns=None, column_start=None,
                  super_column=None, column_finish=None):
        args = {}
        if columns is not None:
            args['columns'] = columns
        if column_start is not None:
            args['column_start'] = column_start
        if column_finish is not None:
            args['column_finish'] = column_finish
        if super_column is not None:
            args['super_column'] = super_column
        return self.get_cf(type).get_count(row, **args)

    def create_cf(self, type, column_type=pycassa.system_manager.ASCII_TYPE,
                  super=False, index_columns=list()):
        self.system_manager.create_column_family(self._keyspace, type,
                                                 super=super,
                                                 comparator_type=column_type)
        for column in index_columns:
            self.create_secondary_index(type, column, column_type)
        return cf.ColumnFamily(
            self._pool, type,
            autopack_names=False,
            autopack_values=False,
            read_consistency_level=self._consistency_level,
            write_consistency_level=self._consistency_level)

    def create_secondary_index(self, type, column,
                               column_type=pycassa.system_manager.ASCII_TYPE):
        self.system_manager.create_index(self._keyspace, type, column,
                                         column_type,
                                         index_name='%s_%s_index'
                                                    % (type, column))

    def cf_exists(self, type):
        if type in self._cf_cache:
            return True
        try:
            cf.ColumnFamily(
                self._pool, type,
                autopack_names=False,
                autopack_values=False,
                read_consistency_level=self._consistency_level,
                write_consistency_level=self._consistency_level)
        except NotFoundException:
            return False
        return True

    def get_cf(self, type, create=True):
        column_family = None
        if type in self._cf_cache:
            return self._cf_cache[type]
        try:
            column_family = cf.ColumnFamily(
                self._pool, type,
                autopack_names=False,
                autopack_values=False,
                read_consistency_level=self._consistency_level,
                write_consistency_level=self._consistency_level)
            self._cf_cache[type] = column_family
        except NotFoundException:
            if create:
                column_family = self.create_cf(type)
        return column_family

    def insert(self, column_family, key, columns):
        if self._batch is not None:
            self._batch.insert(column_family, key, columns)
        else:
            with Mutator(self._pool) as b:
                b.insert(column_family, key, columns)

    def remove(self, column_family, key, columns=None, super_column=None):
        if self._batch is not None:
            self._batch.remove(column_family, key, columns=columns,
                               super_column=super_column)
        else:
            column_family.remove(key, columns=columns,
                                 super_column=super_column)

    def start_batch(self, queue_size=0):
        if self._batch is None:
            self.in_batch = True
            self._batch = Mutator(self._pool, queue_size)
        self.batch_count += 1

    def commit_batch(self):
        self.batch_count -= 1
        if not self.batch_count:
            self._batch.send()
            self._batch = None
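Unlike the earlier variant, this one threads a read/write consistency level through every ColumnFamily it opens and retries each server in the list before giving up. A construction sketch (keyspace, servers, and the chosen level are placeholders):

from pycassa import ConsistencyLevel

store = CassandraDataStore(
    keyspace='agamemnon',
    server_list=['cass01:9160', 'cass02:9160'],
    replication_factor=3,
    default_consistency_level=ConsistencyLevel.ONE,  # trade safety for latency
    create_keyspace=True,
)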
def start_batch(self):
    if self._batch is None:
        self.in_batch = True
        self._batch = Mutator(self._pool, 0)
    self.batch_count += 1
class CassandraDataStore(object):
    def __init__(self, keyspace, pool, system_manager):
        self._cf_cache = {}
        self._index_cache = {}
        self._system_manager = system_manager
        self._pool = pool
        self._keyspace = keyspace
        self._batch = None
        self.in_batch = False
        self.batch_count = 0
        if not self.cf_exists(OUTBOUND_RELATIONSHIP_CF):
            self.create_cf(OUTBOUND_RELATIONSHIP_CF, super=True)
        if not self.cf_exists(INBOUND_RELATIONSHIP_CF):
            self.create_cf(INBOUND_RELATIONSHIP_CF, super=True)
        if not self.cf_exists(RELATIONSHIP_INDEX):
            self.create_cf(RELATIONSHIP_INDEX, super=True)

    def get_count(self, type, row, columns=None, column_start=None,
                  super_column=None, column_finish=None):
        args = {}
        if columns is not None:
            args['columns'] = columns
        if column_start is not None:
            args['column_start'] = column_start
        if column_finish is not None:
            args['column_finish'] = column_finish
        if super_column is not None:
            args['super_column'] = super_column
        return self.get_cf(type).get_count(row, **args)

    def create_cf(self, type, column_type=system_manager.ASCII_TYPE,
                  super=False, index_columns=list()):
        self._system_manager.create_column_family(self._keyspace, type,
                                                  super=super,
                                                  comparator_type=column_type)
        for column in index_columns:
            self._system_manager.create_index(self._keyspace, type, column,
                                              column_type,
                                              index_name='%s_%s_index'
                                                         % (type, column))
        return cf.ColumnFamily(self._pool, type, autopack_names=False,
                               autopack_values=False)

    def cf_exists(self, type):
        if type in self._cf_cache:
            return True
        try:
            cf.ColumnFamily(self._pool, type, autopack_names=False,
                            autopack_values=False)
        except NotFoundException:
            return False
        return True

    def get_cf(self, type, create=True):
        column_family = None
        if type in self._cf_cache:
            return self._cf_cache[type]
        try:
            column_family = cf.ColumnFamily(self._pool, type,
                                            autopack_names=False,
                                            autopack_values=False)
            self._cf_cache[type] = column_family
        except NotFoundException:
            if create:
                column_family = self.create_cf(type)
        return column_family

    def insert(self, column_family, key, columns):
        if self._batch is not None:
            self._batch.insert(column_family, key, columns)
        else:
            with Mutator(self._pool) as b:
                b.insert(column_family, key, columns)

    def remove(self, column_family, key, columns=None, super_column=None):
        if self._batch is not None:
            self._batch.remove(column_family, key, columns=columns,
                               super_column=super_column)
        else:
            column_family.remove(key, columns=columns,
                                 super_column=super_column)

    def start_batch(self):
        if self._batch is None:
            self.in_batch = True
            self._batch = Mutator(self._pool, 0)
        self.batch_count += 1

    def commit_batch(self):
        self.batch_count -= 1
        if not self.batch_count:
            self._batch.send()
            self._batch = None
def insert(self, column_family, key, columns):
    if self._batch is not None:
        self._batch.insert(column_family, key, columns)
    else:
        with Mutator(self._pool) as b:
            b.insert(column_family, key, columns)
def parse_logs(self, build_ids):
    """Parse the logs for the specified build IDs into storage."""
    # TODO hook up parallel processing.

    OUR_VERSION = '1'

    mut = Mutator(self._pool)
    cf = ColumnFamily(self._pool, 'build_timelines')
    i_cf = ColumnFamily(self._pool, 'indices')
    builds_cf = ColumnFamily(self._pool, 'builds')
    counters = ColumnFamily(self._pool, 'counters')
    super_counters = ColumnFamily(self._pool, 'super_counters')

    for build_id in build_ids:
        info = self._connection.build_from_id(build_id)
        if not info:
            continue

        existing_version = info.get('log_parsing_version')
        if existing_version and existing_version >= OUR_VERSION:
            continue

        if info['log_fetch_status'] != 'fetched':
            continue

        log = self._connection.file_data(info['log_url'])
        if not log:
            continue

        parsed = parse_build_log(log)
        cat = info['builder_category']

        cols = {}
        indices = {}
        for step in parsed.steps:
            start = calendar.timegm(step.start.utctimetuple())
            end = calendar.timegm(step.end.utctimetuple())
            elapsed = end - start
            name = step.name

            cols[start] = {
                'name': name,
                'state': step.state,
                'results': step.results,
                'start': unicode(start),
                'end': unicode(end),
                'elapsed': unicode(elapsed),
            }

            start_date = step.start.date().isoformat()
            indices[name] = {build_id: ''}

            counters.add('build_step_number', name)
            counters.add('build_step_duration', name, elapsed)
            super_counters.add('build_step_number_by_category', name, 1, cat)
            super_counters.add('build_step_duration_by_category', name,
                               elapsed, cat)
            super_counters.add('build_step_number_by_day', name, 1,
                               start_date)
            super_counters.add('build_step_duration_by_day', name, elapsed,
                               start_date)

            day_cat = '%s.%s' % (start_date, cat)
            super_counters.add('build_step_number_by_day_and_category',
                               name, 1, day_cat)
            super_counters.add('build_step_duration_by_day_and_category',
                               name, elapsed, day_cat)

        mut.insert(cf, build_id, cols)
        mut.insert(i_cf, 'build_step_name_to_build_ids', indices)
        mut.insert(builds_cf, build_id,
                   {'log_parsing_version': OUR_VERSION})

        yield 'Parsed build %s into %d steps.' % (build_id,
                                                  len(parsed.steps))

    mut.send()
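The single mut.send() at the end flushes whatever is still queued; a pycassa Mutator also sends on its own once its buffer reaches queue_size mutations, so the threshold controls the trade-off between round trips and buffered memory. A sketch of tuning it, with placeholder keyspace, column family, and row data:

from pycassa.pool import ConnectionPool
from pycassa.columnfamily import ColumnFamily
from pycassa.batch import Mutator

pool = ConnectionPool('builds', ['localhost:9160'])
cf = ColumnFamily(pool, 'build_timelines')

mut = Mutator(pool, queue_size=1000)   # auto-send() every 1000 mutations
for build_id in ('b1', 'b2', 'b3'):    # stand-ins for real build IDs
    mut.insert(cf, build_id, {'status': 'parsed'})
mut.send()                             # flush the remainder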