class TestBigInt(unittest.TestCase):
    """Round-trip negative integer column names through the ``StdInteger`` column family."""

    @classmethod
    def setup_class(cls):
        """Create the ``StdInteger`` column family with an ``IntegerType`` comparator."""
        # Named ``manager`` (not ``sys``) so the stdlib module is not shadowed,
        # and closed explicitly so the management connection is not leaked.
        manager = SystemManager()
        try:
            manager.create_column_family(TEST_KS, 'StdInteger',
                                         comparator_type=IntegerType())
        finally:
            manager.close()

    @classmethod
    def teardown_class(cls):
        """Drop the column family created by ``setup_class``."""
        manager = SystemManager()
        try:
            manager.drop_column_family(TEST_KS, 'StdInteger')
        finally:
            manager.close()

    def setUp(self):
        self.key = 'TestBigInt'
        self.cf = ColumnFamily(pool, 'StdInteger')

    def tearDown(self):
        self.cf.remove(self.key)

    def test_negative_integers(self):
        """Each negative column name must come back equal to its stored string value."""
        for number in (-1, -12342390, -255, -256, -257):
            self.cf.insert(self.key, {number: str(number)})

        for key, cols in self.cf.get_range():
            # Check every column of the row, not only the first one.
            for name, value in cols.items():
                self.assertEqual(str(name), value)
def execute(self):
    """Validate the payload and, for delete operations, remove its row.

    Returns ``(False, fault)`` when validation fails, ``(True, None)`` after
    a successful delete and ``(False, exception)`` when the delete raises.
    Any other operation type is not handled here and yields ``None``.
    """
    valid, fault = self._validate_data()
    if not valid:
        return (False, fault)

    is_delete = self.op_type == CassandraQuery.OP_DELETE
    if not is_delete:
        return None

    try:
        record = self.data
        domain = record.domain
        row_key = record.get_pk()
        family = ColumnFamily(db_connection.get_client(), domain)

        if self.cascade:
            # Cascading removal of nested objects/collections is not implemented.
            pass

        # Remove the row belonging to this element.
        family.remove(row_key)
    except Exception as ex:
        return (False, ex)
    return (True, None)
def remove_column(columnFamily, uid, columns):
    """Remove ``columns`` from the row ``uid`` of the column family ``columnFamily``.

    Returns ``None`` when the removal succeeds and ``{'status': 0}`` when it
    raises.
    """
    try:
        ColumnFamily(pool, columnFamily).remove(uid, columns)
    except Exception:
        # Narrowed from a bare ``except`` so that KeyboardInterrupt and
        # SystemExit are no longer swallowed.
        return {'status': 0}
def column_family_remove(self, machine_id, keyspace_name, column_family_name, key):
    """Remove the row ``key`` from a column family of the given keyspace.

    ``machine_id`` is accepted but not used by this method.

    Returns ``False`` (after printing an error) when ``keyspace_contains``
    does not find the keyspace/column family pair, ``True`` once the row has
    been removed.
    """
    if not self.keyspace_contains(keyspace_name, column_family_name):
        print("Error : Keyspace:column family could not be found.")
        return False

    pool = ConnectionPool(keyspace=keyspace_name,
                          server_list=keyspace.server_ips,
                          prefill=False)
    try:
        ColumnFamily(pool, column_family_name).remove(key)
    finally:
        # The pool is private to this call; dispose of it so that the
        # connection opened for the removal is not leaked.
        pool.dispose()
    return True
def column_family_remove(self, machine_id, keyspace_name, column_family_name, key):
    """Delete the row stored under ``key`` in ``keyspace_name``/``column_family_name``.

    ``machine_id`` is part of the signature but is not read here.

    Returns ``True`` after the removal, or ``False`` (with a printed error)
    when ``keyspace_contains`` reports that the pair does not exist.
    """
    pair_exists = self.keyspace_contains(keyspace_name, column_family_name)
    if not pair_exists:
        print("Error : Keyspace:column family could not be found.")
        return False

    pool = ConnectionPool(keyspace=keyspace_name,
                          server_list=keyspace.server_ips,
                          prefill=False)
    try:
        col_fam = ColumnFamily(pool, column_family_name)
        col_fam.remove(key)
    finally:
        # A fresh pool is built on every call, so release it here instead of
        # leaving its connection open.
        pool.dispose()
    return True
def truncate_build_metadata(self):
    """Wipe every piece of derived build metadata.

    Truncates the slave/master/builder/build column families and removes the
    build-related rows from the shared index and counter column families.
    Only run this when all derived data is about to be reloaded.
    """
    for family_name in ('slaves', 'masters', 'builders', 'builds'):
        ColumnFamily(self.pool, family_name).truncate()

    def purge(family_name, row_keys):
        # Remove the listed rows one by one from a single column family.
        family = ColumnFamily(self.pool, family_name)
        for row_key in row_keys:
            family.remove(row_key)

    purge('indices', BUILD_METADATA_INDICES)
    purge('simple_indices', BUILD_METADATA_SIMPLE_INDICES)
    purge('counters', BUILD_METADATA_COUNTERS)
    purge('super_counters', BUILD_METADATA_SUPER_COUNTERS)
class TestTypeErrors(unittest.TestCase):
    """Inserting non-string names or values into ``Standard1`` must raise ``TypeError``."""

    def _assert_rejects_bad_types(self):
        """Insert one valid row, then check that each ill-typed insert raises."""
        self.cf.insert("key", {"col": "val"})
        try:
            for bad_columns in ({123: "val"}, {"col": 123}, {123: 123}):
                # Arguments are passed positionally.  The previous
                # ``args=(...)`` keyword was forwarded to ``insert`` itself,
                # so the TypeError came from the unexpected keyword and the
                # type check was never exercised.
                assert_raises(TypeError, self.cf.insert, "key", bad_columns)
        finally:
            # Clean up even when an assertion fails.
            self.cf.remove("key")

    def test_packing_enabled(self):
        self.cf = ColumnFamily(pool, "Standard1")
        self._assert_rejects_bad_types()

    def test_packing_disabled(self):
        self.cf = ColumnFamily(pool, "Standard1",
                               autopack_names=False, autopack_values=False)
        self._assert_rejects_bad_types()
def test_validated_columns(self): sys = SystemManager() sys.create_column_family( TEST_KS, 'Validators', ) sys.alter_column(TEST_KS, 'Validators', 'long', LongType()) sys.alter_column(TEST_KS, 'Validators', 'int', IntegerType()) sys.alter_column(TEST_KS, 'Validators', 'time', TimeUUIDType()) sys.alter_column(TEST_KS, 'Validators', 'lex', LexicalUUIDType()) sys.alter_column(TEST_KS, 'Validators', 'ascii', AsciiType()) sys.alter_column(TEST_KS, 'Validators', 'utf8', UTF8Type()) sys.alter_column(TEST_KS, 'Validators', 'bytes', BytesType()) sys.close() cf = ColumnFamily(pool, 'Validators') key = 'key1' col = {'long': 1L} cf.insert(key, col) assert_equal(cf.get(key)['long'], 1L) col = {'int': 1} cf.insert(key, col) assert_equal(cf.get(key)['int'], 1) col = {'time': TIME1} cf.insert(key, col) assert_equal(cf.get(key)['time'], TIME1) col = {'lex': uuid.UUID(bytes='aaa aaa aaa aaaa')} cf.insert(key, col) assert_equal(cf.get(key)['lex'], uuid.UUID(bytes='aaa aaa aaa aaaa')) col = {'ascii': 'aaa'} cf.insert(key, col) assert_equal(cf.get(key)['ascii'], 'aaa') col = {'utf8': u'a\u0020'} cf.insert(key, col) assert_equal(cf.get(key)['utf8'], u'a\u0020') col = {'bytes': 'aaa'} cf.insert(key, col) assert_equal(cf.get(key)['bytes'], 'aaa') cf.remove(key)
def test_validated_columns(self): sys = SystemManager() sys.create_column_family( TEST_KS, 'Validators', ) sys.alter_column(TEST_KS, 'Validators', 'long', LONG_TYPE) sys.alter_column(TEST_KS, 'Validators', 'int', INT_TYPE) sys.alter_column(TEST_KS, 'Validators', 'time', TIME_UUID_TYPE) sys.alter_column(TEST_KS, 'Validators', 'lex', LEXICAL_UUID_TYPE) sys.alter_column(TEST_KS, 'Validators', 'ascii', ASCII_TYPE) sys.alter_column(TEST_KS, 'Validators', 'utf8', UTF8_TYPE) sys.alter_column(TEST_KS, 'Validators', 'bytes', BYTES_TYPE) sys.close() cf = ColumnFamily(pool, 'Validators') key = 'key1' col = {'long': 1L} cf.insert(key, col) assert_equal(cf.get(key)['long'], 1L) col = {'int': 1} cf.insert(key, col) assert_equal(cf.get(key)['int'], 1) col = {'time': TIME1} cf.insert(key, col) assert_equal(cf.get(key)['time'], TIME1) col = {'lex': uuid.UUID(bytes='aaa aaa aaa aaaa')} cf.insert(key, col) assert_equal(cf.get(key)['lex'], uuid.UUID(bytes='aaa aaa aaa aaaa')) col = {'ascii': 'aaa'} cf.insert(key, col) assert_equal(cf.get(key)['ascii'], 'aaa') col = {'utf8': u'a\u0020'} cf.insert(key, col) assert_equal(cf.get(key)['utf8'], u'a\u0020') col = {'bytes': 'aaa'} cf.insert(key, col) assert_equal(cf.get(key)['bytes'], 'aaa') cf.remove(key)
class TestTypeErrors(unittest.TestCase):
    """``insert`` must raise ``TypeError`` for integer column names or values."""

    # Column dicts that are expected to be rejected by ``insert``.
    BAD_COLUMNS = ({123: 'val'}, {'col': 123}, {123: 123})

    def _check_bad_inserts(self, cf):
        """Seed a valid row in ``cf`` and assert every bad insert raises."""
        self.cf = cf
        self.cf.insert('key', {'col': 'val'})
        try:
            for columns in self.BAD_COLUMNS:
                # ``assert_raises`` forwards extra positional arguments to the
                # callable.  Passing ``args=(...)`` made ``insert`` fail on
                # the unknown keyword, so the test passed for the wrong reason.
                assert_raises(TypeError, self.cf.insert, 'key', columns)
        finally:
            # Remove the seeded row even if an assertion failed.
            self.cf.remove('key')

    def test_packing_enabled(self):
        self._check_bad_inserts(ColumnFamily(pool, 'Standard1'))

    def test_packing_disabled(self):
        self._check_bad_inserts(
            ColumnFamily(pool, 'Standard1',
                         autopack_names=False, autopack_values=False))
def remove(self, instance, columns=None, write_consistency_level=None):
    """Delete a stored instance, or only some of its columns.

    ``columns`` lists the columns to delete; with the default ``None`` the
    whole stored instance is removed.  For super column families the
    instance's ``super_column`` is passed along as well.
    """
    if not self.super:
        return ColumnFamily.remove(
            self, instance.key, columns,
            write_consistency_level=write_consistency_level)

    return ColumnFamily.remove(
        self, instance.key,
        super_column=instance.super_column,
        columns=columns,
        write_consistency_level=write_consistency_level)
def create_new_thread(self, thread_name):
    """Register a new thread row and return its generated id.

    When more than 99 threads already exist, the oldest one (as reported by
    ``_get_oldest_thread``) is removed first, together with its column
    family.
    """
    threads = ColumnFamily(self.conn, 'threads')

    existing = list(threads.get_range())
    if len(existing) >= 100:
        stale_id = str(self._get_oldest_thread()['thread_id'])
        threads.remove(stale_id)
        self._drop_cf(stale_id)

    thread_id = str(random.randint(1, sys.maxint))
    stamp = datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S')
    row = {
        'thread_name': thread_name,
        'post_count': '1',
        'create_time': stamp,
        'update_time': stamp,
    }
    threads.insert(thread_id, row)
    self._create_cf(thread_id)
    return thread_id
def test_validated_columns(self): sys = SystemManager() sys.create_column_family(TEST_KS, "Validators") sys.alter_column(TEST_KS, "Validators", "long", LongType()) sys.alter_column(TEST_KS, "Validators", "int", IntegerType()) sys.alter_column(TEST_KS, "Validators", "time", TimeUUIDType()) sys.alter_column(TEST_KS, "Validators", "lex", LexicalUUIDType()) sys.alter_column(TEST_KS, "Validators", "ascii", AsciiType()) sys.alter_column(TEST_KS, "Validators", "utf8", UTF8Type()) sys.alter_column(TEST_KS, "Validators", "bytes", BytesType()) sys.close() cf = ColumnFamily(pool, "Validators") key = "key1" col = {"long": 1L} cf.insert(key, col) assert_equal(cf.get(key)["long"], 1L) col = {"int": 1} cf.insert(key, col) assert_equal(cf.get(key)["int"], 1) col = {"time": TIME1} cf.insert(key, col) assert_equal(cf.get(key)["time"], TIME1) col = {"lex": uuid.UUID(bytes="aaa aaa aaa aaaa")} cf.insert(key, col) assert_equal(cf.get(key)["lex"], uuid.UUID(bytes="aaa aaa aaa aaaa")) col = {"ascii": "aaa"} cf.insert(key, col) assert_equal(cf.get(key)["ascii"], "aaa") col = {"utf8": u"a\u0020"} cf.insert(key, col) assert_equal(cf.get(key)["utf8"], u"a\u0020") col = {"bytes": "aaa"} cf.insert(key, col) assert_equal(cf.get(key)["bytes"], "aaa") cf.remove(key)
def test_validated_columns(self): sys = SystemManager() sys.create_column_family(TEST_KS, 'Validators',) sys.alter_column(TEST_KS, 'Validators', 'long', LongType()) sys.alter_column(TEST_KS, 'Validators', 'int', IntegerType()) sys.alter_column(TEST_KS, 'Validators', 'time', TimeUUIDType()) sys.alter_column(TEST_KS, 'Validators', 'lex', LexicalUUIDType()) sys.alter_column(TEST_KS, 'Validators', 'ascii', AsciiType()) sys.alter_column(TEST_KS, 'Validators', 'utf8', UTF8Type()) sys.alter_column(TEST_KS, 'Validators', 'bytes', BytesType()) sys.close() cf = ColumnFamily(pool, 'Validators') key = 'key1' col = {'long':1L} cf.insert(key, col) assert_equal(cf.get(key)['long'], 1L) col = {'int':1} cf.insert(key, col) assert_equal(cf.get(key)['int'], 1) col = {'time':TIME1} cf.insert(key, col) assert_equal(cf.get(key)['time'], TIME1) col = {'lex':uuid.UUID(bytes='aaa aaa aaa aaaa')} cf.insert(key, col) assert_equal(cf.get(key)['lex'], uuid.UUID(bytes='aaa aaa aaa aaaa')) col = {'ascii':'aaa'} cf.insert(key, col) assert_equal(cf.get(key)['ascii'], 'aaa') col = {'utf8':u'a\u0020'} cf.insert(key, col) assert_equal(cf.get(key)['utf8'], u'a\u0020') col = {'bytes':'aaa'} cf.insert(key, col) assert_equal(cf.get(key)['bytes'], 'aaa') cf.remove(key)
def test_validated_columns(self): sys = SystemManager() sys.create_column_family(TEST_KS, 'Validators',) sys.alter_column(TEST_KS, 'Validators', 'long', LONG_TYPE) sys.alter_column(TEST_KS, 'Validators', 'int', INT_TYPE) sys.alter_column(TEST_KS, 'Validators', 'time', TIME_UUID_TYPE) sys.alter_column(TEST_KS, 'Validators', 'lex', LEXICAL_UUID_TYPE) sys.alter_column(TEST_KS, 'Validators', 'ascii', ASCII_TYPE) sys.alter_column(TEST_KS, 'Validators', 'utf8', UTF8_TYPE) sys.alter_column(TEST_KS, 'Validators', 'bytes', BYTES_TYPE) sys.close() cf = ColumnFamily(pool, 'Validators') key = 'key1' col = {'long':1L} cf.insert(key, col) assert_equal(cf.get(key)['long'], 1L) col = {'int':1} cf.insert(key, col) assert_equal(cf.get(key)['int'], 1) col = {'time':TIME1} cf.insert(key, col) assert_equal(cf.get(key)['time'], TIME1) col = {'lex':uuid.UUID(bytes='aaa aaa aaa aaaa')} cf.insert(key, col) assert_equal(cf.get(key)['lex'], uuid.UUID(bytes='aaa aaa aaa aaaa')) col = {'ascii':'aaa'} cf.insert(key, col) assert_equal(cf.get(key)['ascii'], 'aaa') col = {'utf8':u'a\u0020'} cf.insert(key, col) assert_equal(cf.get(key)['utf8'], u'a\u0020') col = {'bytes':'aaa'} cf.insert(key, col) assert_equal(cf.get(key)['bytes'], 'aaa') cf.remove(key)
def truncate_log_metadata(self):
    """Remove all derived log metadata.

    Truncates ``build_timelines``, deletes the log-related index and counter
    rows, and strips the ``log_parsing_version`` column from every build row
    that has one.
    """
    for family_name in ('build_timelines',):
        ColumnFamily(self.pool, family_name).truncate()

    def purge(family_name, row_keys):
        # Delete each listed row from one column family.
        family = ColumnFamily(self.pool, family_name)
        for row_key in row_keys:
            family.remove(row_key)

    purge('indices', LOG_METADATA_INDICES)
    purge('counters', LOG_METADATA_COUNTERS)
    purge('super_counters', LOG_METADATA_SUPER_COUNTERS)

    # Remove log parsing state from builds, sending the removals as one batch.
    builds = ColumnFamily(self.pool, 'builds')
    pending = builds.batch()
    for row_key, columns in builds.get_range(columns=['log_parsing_version']):
        if 'log_parsing_version' in columns:
            pending.remove(row_key, ['log_parsing_version'])
    pending.send()
def column_family_remove(self, keyspace_name, column_family_name, key):
    """Remove the row ``key`` from a column family of the given keyspace.

    ``keyspace.error`` is set to a generic message on entry and to a more
    specific one when the keyspace/column family pair is not found; it is
    not cleared on success.

    Returns ``True`` when the row was removed, ``False`` when the pair is
    missing or when connecting or removing raised (the exception is
    printed).
    """
    keyspace.error = "Unknown error occur please check your inputs"
    if not self.keyspace_contains(keyspace.local_system, keyspace_name, column_family_name):
        keyspace.error = "Desired Keyspace,Column Family pair could not be found."
        return False

    try:
        pool = ConnectionPool(keyspace=keyspace_name,
                              server_list=keyspace.server_ips,
                              prefill=False)
    except Exception as e:
        print(e)
        return False

    try:
        ColumnFamily(pool, column_family_name).remove(key)
    except Exception as e:
        print(e)
        return False
    finally:
        # The pool only lives for this call; dispose of it on every path so
        # its connection is not leaked.
        pool.dispose()
    return True
class TestDateTypes(unittest.TestCase):
    """Dates written with one date validator must be readable with the next one."""

    def _compare_dates(self, d1, d2):
        """Assert both datetimes agree down to millisecond precision."""
        self.assertEquals(d1.timetuple(), d2.timetuple())
        millis_1 = int(d1.microsecond / 1e3)
        millis_2 = int(d2.microsecond / 1e3)
        self.assertEquals(millis_1, millis_2)

    def test_compatibility(self):
        self.cf = ColumnFamily(pool, "Standard1")
        validators = self.cf.column_validators

        def stored_date():
            return self.cf.get("key1")["date"]

        validators["date"] = OldPycassaDateType()
        d = datetime.utcnow()
        self.cf.insert("key1", {"date": d})
        self._compare_dates(stored_date(), d)

        for newer_type in (IntermediateDateType, DateType):
            validators["date"] = newer_type()
            # First read the value written under the previous validator ...
            self._compare_dates(stored_date(), d)
            # ... then rewrite it with the current one and read it again.
            self.cf.insert("key1", {"date": d})
            self._compare_dates(stored_date(), d)

        self.cf.remove("key1")
def undo(self):
    """Revert the insert described by ``self.data``.

    * ``INS_BASIC``  - ``self.data`` is ``(domain, row_key, columns)``; the
      row is removed.
    * ``INS_OBJECT`` - ``self.data`` is an object whose ``delete`` is called
      without cascading (skipped when ``self.data`` is falsy).
    * ``INS_BATCH``  - ``self.data`` is ``(domain, {row_key: ...})``; every
      row key is removed in a single batch.
    """
    # The branches used to compare the *builtin* ``type`` with the INS_*
    # constants, so none of them could ever run and undo was a no-op.
    # NOTE(review): assumes the command keeps its kind in ``self.type`` --
    # confirm against the constructor.  ``getattr`` keeps the old no-op
    # behaviour if the attribute does not exist.
    kind = getattr(self, 'type', None)

    if kind == InsertCommand.INS_BASIC:
        # Data for a basic insert is a (domain, row_key, columns) tuple.
        domain, row_key, _ = self.data
        client = db_connection.get_client()
        ColumnFamily(client, domain).remove(row_key)
    elif kind == InsertCommand.INS_OBJECT:
        # Let the object delete itself, without cascading.
        if self.data:
            self.data.delete(cascade=False)
    elif kind == InsertCommand.INS_BATCH:
        domain, basic_type_item_dict = self.data
        client = db_connection.get_client()
        batch = ColumnFamily(client, domain).batch()
        for row_key in basic_type_item_dict:
            batch.remove(row_key)
        batch.send()
import pycassa
from pycassa.pool import ConnectionPool
from pycassa.columnfamily import ColumnFamily

# Open a pool on the 'ApplicationData' keyspace at localhost:9160 and bind
# the 'UserInfo' column family to it.
pool = ConnectionPool('ApplicationData', ['localhost:9160'])
col_fam = ColumnFamily(pool, 'UserInfo')

# Single-row operations: write one column, read it back, then remove it.
col_fam.insert('Diego', {'email': '*****@*****.**'})
readData = col_fam.get('Diego', columns=['email'])
col_fam.remove('Diego', columns=['email'])

# Batch operations: queue inserts and removals, then send them explicitly.
b = col_fam.batch(queue_size=10)
b.insert('John', {'email': '*****@*****.**', 'state': 'IL', 'gender': 'M'})
b.insert('Jane', {'email': '*****@*****.**', 'state': 'CA'})
b.remove('John', ['gender'])  # removes only the 'gender' column
b.remove('Jane')              # removes the whole row
b.send()
class DailyTemporalBloomFilter(DailyTemporalBase):
    """Long Range Temporal BloomFilter using a daily resolution.

    For really high values of expiration (like 60 days) with a low
    requirement on precision.  The actual error of this BF will be the native
    error of the BF plus the error related to the coarse aspect of the
    expiration, since information is no longer expired precisely.  Also, as
    opposed to a classic Bloom Filter, this one will also have false
    positives (reporting membership for a non-member) AND false negatives
    (reporting non-membership for a member).

    The upper bound of the temporal_error can theoretically be quite high.
    However, if the items of the set are uniformly distributed over time,
    the average error will be something like 1.0 / expiration.

    Every added key is also archived in Cassandra (one row per filter name
    and hour) so the filter can be rebuilt, and per-day snapshots are kept on
    disk under ``snapshot_path``.
    """

    def __new__(cls, capacity, error_rate, expiration, name, cassandra_session, snapshot_path='./'):
        # The base ``__new__`` only takes the sizing parameters.
        return super(DailyTemporalBloomFilter, cls).__new__(
            cls, capacity=capacity, error_rate=error_rate)

    def __init__(self, capacity, error_rate, expiration, name, cassandra_session, snapshot_path='./'):
        """Set up the filter and make sure its Cassandra archive exists.

        :capacity: forwarded to the base filter.
        :error_rate: forwarded to the base filter.
        :expiration: retention, in days.
        :name: filter name, used in archive row keys and snapshot filenames.
        :cassandra_session: pool/connection handed to ``ColumnFamily``.
        :snapshot_path: directory holding the on-disk snapshots.
        """
        super(DailyTemporalBloomFilter, self).__init__(capacity=capacity, error_rate=error_rate)
        self.bf_name = name
        self.expiration = expiration
        self.initialize_period()
        self.cassandra_session = cassandra_session
        self.cassandra_columns_family = "temporal_bf"
        self.keyspace = 'parsely'
        self.uncommited_keys = []
        self.commit_batch = 1000
        self.columnfamily = None
        self.ensure_cassandra_cf()
        self.snapshot_path = snapshot_path

    def ensure_cassandra_cf(self):
        """Create the keyspace and column family if missing, then bind ``self.columnfamily``."""
        manager = SystemManager()
        try:
            if self.keyspace not in manager.list_keyspaces():
                manager.create_keyspace(self.keyspace, SIMPLE_STRATEGY, {'replication_factor': '1'})
            if self.cassandra_columns_family not in manager.get_keyspace_column_families(self.keyspace):
                manager.create_column_family(self.keyspace, self.cassandra_columns_family)
        finally:
            # Release the management connection instead of leaking it.
            manager.close()
        self.columnfamily = ColumnFamily(self.cassandra_session, self.cassandra_columns_family)

    def archive_bf_key(self, bf_key):
        """Buffer ``bf_key`` and flush the buffer to Cassandra once it is full.

        The buffer is written as the columns of the row
        ``<name>_<YYYY-MM-DD:HH>`` for the current hour as soon as it holds
        ``commit_batch`` keys.
        """
        self.uncommited_keys.append(bf_key)
        if len(self.uncommited_keys) >= self.commit_batch:
            current_period_hour = dt.datetime.now().strftime('%Y-%m-%d:%H')
            self.columnfamily.insert('%s_%s' % (self.bf_name, current_period_hour),
                                     {k: '' for k in self.uncommited_keys})
            self.uncommited_keys = []

    def _hour_range(self, start, end, reverse=False, inclusive=True):
        """Generator that gives us all the hours between a start and end datetime (inclusive)."""
        hours = int(math.ceil((end - start).total_seconds() / (60.0 * 60.0)))
        if inclusive:
            hours += 1
        for i in xrange(hours):
            if reverse:
                yield end - dt.timedelta(hours=i)
            else:
                yield start + dt.timedelta(hours=i)

    def _day_range(self, start, end, reverse=False, inclusive=True):
        """Generator that gives us all the days between a start and end datetime (inclusive)."""
        days = (end - start).days
        if inclusive:
            days += 1
        for i in xrange(days):
            if reverse:
                yield end - dt.timedelta(days=i)
            else:
                yield start + dt.timedelta(days=i)

    def _drop_archive(self):
        """Remove every hourly archive row of the retention window (best effort)."""
        last_period = self.current_period - dt.timedelta(days=self.expiration - 1)
        for hour in self._hour_range(last_period, dt.datetime.now()):
            row = "%s_%s" % (self.bf_name, hour.strftime('%Y-%m-%d:%H'))
            try:
                self.columnfamily.remove(row)
            except Exception:
                # Dropping the archive stays best effort: a failure on one
                # row must not prevent the remaining rows from being
                # removed.  Narrowed from a bare ``except`` so that
                # KeyboardInterrupt/SystemExit still propagate.
                continue

    def rebuild_from_archive(self, rebuild_snapshot=True):
        """Rebuild the BF using the archived items.

        :rebuild_snapshot: also write a snapshot for every replayed day.
        """
        self.initialize_bitarray()

        def multi_rows_itr(rows):
            # Yield every column name (archived key) of every fetched row.
            for row in rows.values():
                for k in row.keys():
                    yield k

        last_period = self.current_period - dt.timedelta(days=self.expiration - 1)
        hour_labels = ["%02d" % hour for hour in range(24)]

        for day in self._day_range(last_period, dt.datetime.now()):
            day_label = day.strftime('%Y-%m-%d')
            rows = ["%s_%s:%s" % (self.bf_name, day_label, hour_label)
                    for hour_label in hour_labels]
            # ``column_count`` is passed as an int; it used to be the float 1E6.
            rows_content = self.columnfamily.multiget(rows, column_count=1000000)
            update_current = day == self.current_period

            for k in multi_rows_itr(rows_content):
                self.add_rebuild(k, update_current)

            if rebuild_snapshot:
                self.save_snaphot(override_period=day)

            if not update_current:
                self.initialize_current_day_bitarray()

    def restore_from_disk(self, clean_old_snapshot=False):
        """Restore the state of the BF using previous snapshots.

        :clean_old_snapshot: Delete the old snapshot on the disk (period < current - expiration)
        """
        base_filename = "%s/%s_%s_*.dat" % (self.snapshot_path, self.bf_name, self.expiration)
        availables_snapshots = glob.glob(base_filename)
        last_period = self.current_period - dt.timedelta(days=self.expiration - 1)
        suffix = '.dat'
        for filename in availables_snapshots:
            period_label = filename.split('_')[-1]
            # Drop the extension as a suffix.  ``str.strip('.dat')`` strips a
            # *set of characters* from both ends and only worked by accident.
            if period_label.endswith(suffix):
                period_label = period_label[:-len(suffix)]
            snapshot_period = dt.datetime.strptime(period_label, "%Y-%m-%d")

            if snapshot_period < last_period:
                # Expired snapshots are never merged into the filter.  They
                # used to be merged right before deletion whenever
                # ``clean_old_snapshot`` was set.
                if clean_old_snapshot:
                    os.remove(filename)
                continue

            self._union_bf_from_file(filename)
            if snapshot_period == self.current_period:
                self._union_bf_from_file(filename, current=True)

        self.ready = True

    def add_rebuild(self, key, update_current=True):
        """Add ``key`` to the filter without archiving it again."""
        super(DailyTemporalBloomFilter, self).add(key, update_current)

    def add(self, key_string):
        """Archive ``key_string`` and add it to the filter; returns the base ``add`` result."""
        if isinstance(key_string, unicode):
            key = key_string.encode('utf8')
        else:
            key = key_string

        self.archive_bf_key(key)
        return super(DailyTemporalBloomFilter, self).add(key)

    def resize(self, new_capacity=None, new_error_rate=None):
        """Change capacity and/or error rate, then rebuild the filter from the archive."""
        self._set_capacity(new_capacity or self.capacity)
        self._set_error_rate(new_error_rate or self.error_rate)
        self._initialize_parameters()
        self.initialize_bitarray()
        self.rebuild_from_archive(rebuild_snapshot=True)

    def initialize_period(self, period=None):
        """Initialize the period of BF.

        :period: datetime.datetime for setting the period explicitly.
        """
        if not period:
            self.current_period = dt.datetime.now()
        else:
            self.current_period = period
        # Truncate to midnight: the filter works at day resolution.
        self.current_period = dt.datetime(self.current_period.year,
                                          self.current_period.month,
                                          self.current_period.day)
        self.date = self.current_period.strftime("%Y-%m-%d")

    def save_snaphot(self, override_period=None):
        """Save the current state of the current day bitarray on disk.

        Save the internal representation (bitarray) into a binary file using this format:
            filename : name_expiration_2013-01-01.dat

        :override_period: day to use in the filename instead of the current period.
        """
        period = override_period or self.current_period
        filename = "%s/%s_%s_%s.dat" % (self.snapshot_path, self.bf_name,
                                        self.expiration, period.strftime("%Y-%m-%d"))
        self._save_snapshot(filename)
class Buyer(Llama):
    """Consumer that buys and sells on stock quotes, persisting its state in Cassandra.

    Cash, holdings and the quote history are kept in memory and mirrored to
    the ``Cash``, ``Holdings`` and ``Quotes`` column families.
    """

    def __init__(self, client, qname, trend=5):
        """Connect to storage and reload cash, holdings and quote history.

        :client: forwarded to the ``Llama`` base class.
        :qname: accepted but not used; the base class receives a random hex name.
        :trend: number of most recent quotes used to compute min/max/avg.
        """
        super(Buyer, self).__init__(client, uuid.uuid4().hex)
        self.holdings = {}
        self.cash = 100000.0
        self.history = {}
        self.trend = trend
        self.pool = ConnectionPool('example_consumer_Buyer')
        self.stored_holdings = ColumnFamily(self.pool, 'Holdings')
        self.quote_history = ColumnFamily(self.pool, 'Quotes')
        self.stored_cash = ColumnFamily(self.pool, 'Cash')

        try:
            cash = self.stored_cash.get('current')
            self.cash = cash['amount']
        except ttypes.NotFoundException:
            # First run: persist the starting balance.
            self.stored_cash.insert('current', {'amount': self.cash})

        for symbol, columns in self.stored_holdings.get_range():
            self.holdings[symbol] = (columns['number_of_shares'],
                                     columns['price'],
                                     columns['cost'])

        # NOTE(review): an index clause on 'timestamp' used to be built here
        # but was never passed to any query, so it has been removed.  The
        # whole quote history is replayed, as before.
        for key, columns in self.quote_history.get_range():
            self.add_quote(columns['symbol'], columns['price'])

    def add_quote(self, symbol, price):
        """Record ``price`` for ``symbol`` and return ``(low, max, avg)`` of the trend window.

        Returns ``(-1, -1, -1)`` (and prints how many quotes are still
        missing) while fewer than ``self.trend`` quotes are known.
        """
        quotes = self.history.setdefault(symbol, [])
        quotes.append(price)

        if len(quotes) < self.trend:
            print("%s quotes until we start deciding whether to buy or sell %s" % (
                self.trend - len(quotes), symbol))
            return (-1, -1, -1)

        window = quotes[-self.trend:]
        return (min(window), max(window), sum(window) / self.trend)

    def do_message(self, quote):
        """Handle one ``(symbol, price, date, counter)`` quote: record it, then buy or sell."""
        symbol, price, date, counter = quote
        price_low, price_max, price_avg = self.add_quote(symbol, price)
        self.save_quote(symbol, price)
        if price_low == -1:
            # Not enough history yet to take a decision.
            return

        # The loop variable is deliberately not called ``symbol``: under
        # Python 2 a list comprehension leaks its variable, which used to
        # rebind ``symbol`` to the last held symbol and made the buy/sell
        # decision below run against the wrong stock.
        value = sum(shares[0] * self.history[held][-1]
                    for held, shares in self.holdings.items())
        print("Net worth is %s + %s = %s" % (self.cash, value, self.cash + value))

        if symbol not in self.holdings:
            if price < 1.01 * price_low:
                shares_to_buy = random.choice([10, 15, 20, 25, 30])
                print("I don't own any %s yet, and the price is below the trending minimum of %s so I'm buying %s shares." % (
                    symbol, price_low, shares_to_buy))
                cost = shares_to_buy * price
                print("Cost is %s, cash is %s" % (cost, self.cash))
                if cost < self.cash:
                    self.buy_holdings(symbol, shares_to_buy, price, cost)
                    self.update_cash(-cost)
                    print("Cash is now %s" % self.cash)
                else:
                    print("Unfortunately, I don't have enough cash at this time.")
        else:
            shares_held, price_paid, cost_paid = self.holdings[symbol]
            if price > price_paid and price > 0.99 * price_max:
                print("+++++++ Price of %s is higher than my holdings, so I'm going to sell!" % symbol)
                sale_value = shares_held * price
                print("Sale value is %s" % sale_value)
                print("Holdings value is %s" % cost_paid)
                print("Total net is %s" % (sale_value - cost_paid))
                self.update_cash(sale_value)
                print("Cash is now %s" % self.cash)
                self.sell_holdings(symbol)

    def update_cash(self, change):
        """Apply ``change`` to the balance and persist the new amount."""
        self.cash += change
        cash = self.stored_cash.get('current')
        cash['amount'] = self.cash
        self.stored_cash.insert('current', cash)

    def buy_holdings(self, symbol, shares_to_buy, price, cost):
        """Record a purchase in memory and in the ``Holdings`` column family."""
        self.holdings[symbol] = (shares_to_buy, price, cost)
        stored_holding = {
            'number_of_shares': shares_to_buy,
            'price': price,
            'cost': cost,
        }
        self.stored_holdings.insert(symbol, stored_holding)

    def sell_holdings(self, symbol):
        """Forget the holding for ``symbol`` in memory and in storage."""
        del self.holdings[symbol]
        self.stored_holdings.remove(symbol)

    def save_quote(self, symbol, price):
        """Persist one quote under a fresh random UUID key."""
        key = str(uuid.uuid4())
        self.quote_history.insert(key, {'symbol': symbol, 'price': price})
import pycassa
from pycassa.pool import ConnectionPool
from pycassa.columnfamily import ColumnFamily

# Open a pool on the 'superKeySpace' keyspace at localhost:9160 and bind the
# 'superGroup' column family to it.
pool = ConnectionPool('superKeySpace', ['localhost:9160'])
col_fam = ColumnFamily(pool, 'superGroup')

# Fetch the whole row stored under the key 'Will'.
row = col_fam.get('Will')
print('Got and printing row = will')
print(row)

# Pick the 'name' super column out of the fetched row.
superCol= row.get('name')
print('printing the super column "name" for Will\'s record')
print(superCol)

# Add the first name 'Will'.
col_fam.insert('Will', {'name': {'first': 'Will'}})
print('added Will\'s first name and printing Will\'s record')
print(col_fam.get('Will'))

# Change the first name from 'Will' to 'Bill'.
col_fam.insert('Will', {'name': {'first': 'Bill'}})
print('changed Will\'s first name to Bill and printing Will\'s record')
print(col_fam.get('Will'))

# Remove the 'first' sub-column of the 'name' super column again.
print('removeing Will\'s first name')
col_fam.remove('Will', super_column='name', columns=['first'])
print(col_fam.get('Will'))
def remove_row(columnFamily, uid):
    """Delete the whole row ``uid`` from the column family named ``columnFamily``."""
    ColumnFamily(pool, columnFamily).remove(uid)
class Buyer(Llama):
    """Quote consumer that trades on short-term price trends.

    The in-memory state (cash, holdings, per-symbol quote history) is
    mirrored to the ``Cash``, ``Holdings`` and ``Quotes`` column families.
    """

    def __init__(self, client, qname, trend=5):
        """Open the storage pool and reload the persisted state.

        :client: passed through to ``Llama``.
        :qname: not used here; ``Llama`` is given a random hex name instead.
        :trend: size of the window of recent quotes used for decisions.
        """
        super(Buyer, self).__init__(client, uuid.uuid4().hex)
        self.holdings = {}
        self.cash = 100000.0
        self.history = {}
        self.trend = trend
        self.pool = ConnectionPool('example_consumer_Buyer')
        self.stored_holdings = ColumnFamily(self.pool, 'Holdings')
        self.quote_history = ColumnFamily(self.pool, 'Quotes')
        self.stored_cash = ColumnFamily(self.pool, 'Cash')

        try:
            self.cash = self.stored_cash.get('current')['amount']
        except ttypes.NotFoundException:
            # Nothing stored yet: write the initial balance.
            self.stored_cash.insert('current', {'amount': self.cash})

        for symbol, columns in self.stored_holdings.get_range():
            self.holdings[symbol] = (columns['number_of_shares'],
                                     columns['price'],
                                     columns['cost'])

        # NOTE(review): the index expression/clause on 'timestamp' that used
        # to be built here was never handed to a query and has been dropped.
        # Every stored quote is replayed, exactly as before.
        for key, columns in self.quote_history.get_range():
            self.add_quote(columns['symbol'], columns['price'])

    def add_quote(self, symbol, price):
        """Append ``price`` to the history of ``symbol``.

        Returns ``(low, max, avg)`` over the last ``self.trend`` quotes, or
        ``(-1, -1, -1)`` while the history is still shorter than that (a
        progress message is printed in that case).
        """
        if symbol not in self.history:
            self.history[symbol] = [price]
        else:
            self.history[symbol].append(price)

        known = self.history[symbol]
        if len(known) >= self.trend:
            recent = known[-self.trend:]
            price_low = min(recent)
            price_max = max(recent)
            price_avg = sum(recent) / self.trend
        else:
            price_low, price_max, price_avg = (-1, -1, -1)
            print("%s quotes until we start deciding whether to buy or sell %s" % (self.trend - len(known), symbol))
        return (price_low, price_max, price_avg)

    def do_message(self, quote):
        """Process one ``(symbol, price, date, counter)`` quote and trade on it."""
        symbol, price, date, counter = quote
        price_low, price_max, price_avg = self.add_quote(symbol, price)
        self.save_quote(symbol, price)
        if price_low == -1:
            # Still collecting history for this symbol.
            return

        # Use a generator with its own variable name.  The previous list
        # comprehension iterated with ``symbol``; Python 2 leaks that
        # variable, so the quoted symbol was replaced by the last held one
        # and all decisions below were taken for the wrong stock.
        value = sum(self.holdings[held][0] * self.history[held][-1]
                    for held in self.holdings)
        print("Net worth is %s + %s = %s" % (self.cash, value, self.cash + value))

        if symbol in self.holdings:
            if price > self.holdings[symbol][1] and price > 0.99 * price_max:
                print("+++++++ Price of %s is higher than my holdings, so I'm going to sell!" % symbol)
                sale_value = self.holdings[symbol][0] * price
                print("Sale value is %s" % sale_value)
                print("Holdings value is %s" % self.holdings[symbol][2])
                print("Total net is %s" % (sale_value - self.holdings[symbol][2]))
                self.update_cash(sale_value)
                print("Cash is now %s" % self.cash)
                self.sell_holdings(symbol)
            return

        if price < 1.01 * price_low:
            shares_to_buy = random.choice([10, 15, 20, 25, 30])
            print("I don't own any %s yet, and the price is below the trending minimum of %s so I'm buying %s shares." % (symbol, price_low, shares_to_buy))
            cost = shares_to_buy * price
            print("Cost is %s, cash is %s" % (cost, self.cash))
            if cost < self.cash:
                self.buy_holdings(symbol, shares_to_buy, price, cost)
                self.update_cash(-cost)
                print("Cash is now %s" % self.cash)
            else:
                print("Unfortunately, I don't have enough cash at this time.")

    def update_cash(self, change):
        """Add ``change`` to the balance and write the new amount to storage."""
        self.cash += change
        cash = self.stored_cash.get('current')
        cash['amount'] = self.cash
        self.stored_cash.insert('current', cash)

    def buy_holdings(self, symbol, shares_to_buy, price, cost):
        """Store a new holding both in memory and in ``Holdings``."""
        self.holdings[symbol] = (shares_to_buy, price, cost)
        stored_holding = {'number_of_shares': shares_to_buy,
                          'price': price,
                          'cost': cost}
        self.stored_holdings.insert(symbol, stored_holding)

    def sell_holdings(self, symbol):
        """Drop the holding for ``symbol`` from memory and from ``Holdings``."""
        del self.holdings[symbol]
        self.stored_holdings.remove(symbol)

    def save_quote(self, symbol, price):
        """Write the quote to ``Quotes`` under a new random UUID key."""
        key = str(uuid.uuid4())
        self.quote_history.insert(key, {'symbol': symbol, 'price': price})
class TestTimeUUIDs(unittest.TestCase):
    """Slice a TimeUUID-named row of the 'StdTimeUUID' column family by time."""

    def setUp(self):
        self.cf_time = ColumnFamily(pool, 'StdTimeUUID')

    def tearDown(self):
        # Every test writes to the same row, so wipe it after each one.
        self.cf_time.remove('key1')

    def _check_time_slices(self, clock, pause):
        """Insert two uuid1 columns separated by pauses and verify slicing.

        ``clock`` is called before, between and after the two inserts to
        produce the slice boundaries; ``pause`` is the number of seconds
        slept after each insert so the boundaries are strictly ordered.
        """
        key = 'key1'
        marks = [clock()]

        first_id = uuid1()
        first_col = {first_id: '0'}
        self.cf_time.insert(key, first_col)
        time.sleep(pause)
        marks.append(clock())

        second_id = uuid1()
        second_col = {second_id: '1'}
        self.cf_time.insert(key, second_col)
        time.sleep(pause)
        marks.append(clock())

        both = {first_id: '0', second_id: '1'}

        # (slice bounds passed to get(), columns expected back)
        expectations = [
            ({'column_start': marks[0]}, both),
            ({'column_finish': marks[2]}, both),
            ({'column_start': marks[0], 'column_finish': marks[2]}, both),
            ({'column_start': marks[0], 'column_finish': marks[2]}, both),
            ({'column_start': marks[0], 'column_finish': marks[1]}, first_col),
            ({'column_start': marks[1], 'column_finish': marks[2]}, second_col),
        ]
        for bounds, expected in expectations:
            assert_equal(self.cf_time.get(key, **bounds), expected)

    def test_datetime_to_uuid(self):
        """Slice boundaries given as datetime objects."""
        self._check_time_slices(datetime.now, 1)

    def test_time_to_uuid(self):
        """Slice boundaries given as float timestamps from time.time()."""
        self._check_time_slices(time.time, 0.1)

    def test_auto_time_to_uuid1(self):
        """A float timestamp used as a column name reads back as a UUID
        whose time matches the timestamp to three decimal places."""
        key = 'key1'
        now = time.time()
        self.cf_time.insert(key, {now: 'foo'})

        stored = self.cf_time.get(key)
        stored_uuid = stored.keys()[0]
        assert_almost_equal(convert_uuid_to_time(stored_uuid), now, places=3)
names3.iteritems()]) name_cf.insert("sacharya3", {'last_name': attrs['last_name'].append("acharya3")}) print name_cf.get('sacharya3') ################################# COUNT ####################################### # Count the number of columns for the row key count=author_cf.get_count("sacharya1") print count count=author_cf.multiget_count(["sacharya1","sacharya2"]) print count ################################## REMOVE ##################################### # Remove the column for the row key and column key print "Removing the column last_name for row key sacharya1" author_cf.remove('sacharya1', columns=['last_name']) time.sleep(5) authors = author_cf.get('sacharya') print authors # REMOVE the entire row author_cf.remove('sacharya') try: time.sleep(5) print "Getting object already deleted" author_cf.get('sacharya') except Exception as e: print e
class DailyTemporalBloomFilter(object):
    """Long Range Temporal BloomFilter using a daily resolution.

    For really high values of expiration (like 60 days) with a low
    requirement on precision. The actual error of this BF will be the native
    error of the BF plus the error related to the coarse aspect of the
    expiration, since information is no longer expired precisely. Also, as
    opposed to a classic Bloom Filter, this one will also have false
    positives (reporting membership for a non-member) AND false negatives
    (reporting non-membership for a member).

    The upper bound of the temporal_error can be theoretically quite high.
    However, if the items of the set are uniformly distributed over time, the
    average error will be something like 1.0 / expiration.
    """

    # Snapshot files are named "<name>_<expiration>_<YYYY-MM-DD>.dat".
    _SNAPSHOT_SUFFIX = '.dat'
    _SNAPSHOT_DATE_FORMAT = "%Y-%m-%d"

    def __init__(self, capacity, error_rate, expiration, name, snapshot_path,
                 cassandra_session):
        """Build an empty filter and make sure its archive column family exists.

        :capacity: value used, with error_rate, to size the bit arrays.
        :error_rate: target error rate used to size the bit arrays.
        :expiration: retention window, in days.
        :name: prefix of the snapshot files and of the archive row keys.
        :snapshot_path: directory holding the daily snapshot files.
        :cassandra_session: object handed to ColumnFamily as its connection.
        """
        self.error_rate = error_rate
        self.capacity = capacity
        # Sizing must happen before the bit arrays are allocated.
        self._initialize_parameters()
        self.initialize_bitarray()
        self.count = 0
        self.hashed_values = []
        self.name = name
        self.snapshot_path = snapshot_path
        self.expiration = expiration
        self.initialize_period()

        self.snapshot_to_load = None
        self.ready = False
        self.warm_period = None
        self.next_snapshot_load = time.time()

        self.cassandra_session = cassandra_session
        self.cassandra_columns_family = "temporal_bf"
        self.keyspace = 'parsely'
        self.uncommited_keys = []
        self.commit_batch = 1000
        self.columnfamily = None
        self.ensure_cassandra_cf()

    def _initialize_parameters(self):
        """Derive slice count, slice width and hash functions from
        capacity and error_rate."""
        self.nbr_slices = int(np.ceil(np.log2(1.0 / self.error_rate)))
        self.bits_per_slice = int(
            np.ceil((self.capacity * abs(np.log(self.error_rate))) /
                    (self.nbr_slices * (np.log(2) ** 2))))
        self.nbr_bits = self.nbr_slices * self.bits_per_slice
        self.hashes = generate_hashfunctions(self.bits_per_slice,
                                             self.nbr_slices)

    def ensure_cassandra_cf(self):
        """Create the keyspace and column family if missing, then bind
        self.columnfamily to the archive column family."""
        s = SystemManager()
        try:
            if self.keyspace not in s.list_keyspaces():
                s.create_keyspace(self.keyspace, SIMPLE_STRATEGY,
                                  {'replication_factor': '1'})
            if self.cassandra_columns_family not in \
                    s.get_keyspace_column_families(self.keyspace):
                s.create_column_family(self.keyspace,
                                       self.cassandra_columns_family)
        finally:
            # Release the schema-management connection even on failure.
            s.close()
        self.columnfamily = ColumnFamily(self.cassandra_session,
                                         self.cassandra_columns_family)

    def archive_bf_key(self, bf_key):
        """Buffer a key and flush the buffer to the archive once it reaches
        commit_batch keys.

        The batch is written as one row keyed "<name>_<YYYY-MM-DD:HH>" of the
        flush time, one empty-valued column per key. Keys below the batch
        threshold stay in memory only.
        """
        self.uncommited_keys.append(bf_key)
        if len(self.uncommited_keys) >= self.commit_batch:
            current_period_hour = dt.datetime.now().strftime('%Y-%m-%d:%H')
            self.columnfamily.insert(
                '%s_%s' % (self.name, current_period_hour),
                {k: '' for k in self.uncommited_keys})
            self.uncommited_keys = []

    def _hour_range(self, start, end, reverse=False, inclusive=True):
        """Generator that gives us all the hours between a start and end
        datetime (inclusive)."""

        def total_seconds(td):
            return (td.microseconds +
                    (td.seconds + td.days * 24.0 * 3600.0) * 10.0**6) / 10.0**6

        hours = int(math.ceil(total_seconds(end - start) / (60.0 * 60.0)))
        if inclusive:
            hours += 1
        for i in xrange(hours):
            if reverse:
                yield end - dt.timedelta(hours=i)
            else:
                yield start + dt.timedelta(hours=i)

    def _archive_rows(self):
        """Return the archive row keys covering the retention window up to
        the current time, one per hour."""
        last_period = self.current_period - dt.timedelta(
            days=self.expiration - 1)
        return ["%s_%s" % (self.name, hour.strftime('%Y-%m-%d:%H'))
                for hour in self._hour_range(last_period, dt.datetime.now())]

    def resize(self, new_capacity):
        """Re-size the filter for new_capacity and repopulate it from the
        archive."""
        self.capacity = new_capacity
        self._initialize_parameters()
        self.rebuild_from_archive()

    def _drop_archive(self):
        """Best-effort removal of every archive row in the retention window."""
        for row in self._archive_rows():
            try:
                self.columnfamily.remove(row)
            except Exception:
                # Deliberately best effort: one failing row must not stop the
                # remaining rows from being dropped.
                continue

    def rebuild_from_archive(self):
        """Rebuild the BF using the archived items"""
        self.initialize_bitarray()
        rows = self._archive_rows()
        # column_count is passed as an integer (the original passed the
        # float 1E6).
        rows_content = self.columnfamily.multiget(rows, column_count=1000000)
        for row_content in rows_content.values():
            for k in row_content.keys():
                self.add(k, rebuild_mode=True)

    def initialize_bitarray(self):
        """Initialize both bitarray.

        This BF contains two bit arrays instead of a single one like a plain
        BF. bitarray is the main bit array where all the historical items are
        stored. It's the one used for the membership query. The second one,
        current_day_bitarray, is the one used for creating the daily snapshot.
        """
        self.bitarray = bitarray.bitarray(self.nbr_bits)
        self.current_day_bitarray = bitarray.bitarray(self.nbr_bits)
        self.bitarray.setall(False)
        self.current_day_bitarray.setall(False)

    def __contains__(self, key):
        """Check membership.

        Side effect: stores the hashes of key in self.hashed_values, which
        add() reuses.
        """
        self.hashed_values = self.hashes(key)
        offset = 0
        for value in self.hashed_values:
            if not self.bitarray[offset + value]:
                return False
            offset += self.bits_per_slice
        return True

    def add(self, key, rebuild_mode=False):
        """Add key to the filter.

        Returns True when the key was already reported as present, False
        when it was newly set. Unless rebuild_mode is set, the key is also
        queued for archiving (even when already present).
        """
        if not rebuild_mode:
            self.archive_bf_key(key)
        if key in self:
            return True
        offset = 0
        # The membership test above already hashed the key; only re-hash if
        # that left nothing usable.
        # NOTE(review): assumes self.hashes returns a re-iterable sequence,
        # not a one-shot generator - confirm against generate_hashfunctions.
        if not self.hashed_values:
            self.hashed_values = self.hashes(key)
        for value in self.hashed_values:
            self.bitarray[offset + value] = True
            self.current_day_bitarray[offset + value] = True
            offset += self.bits_per_slice
        self.count += 1
        return False

    def initialize_period(self, period=None):
        """Initialize the period of BF.

        :period: datetime.datetime for setting the period explicitly;
            defaults to now. Only the year/month/day part is kept.
        """
        base = period if period else dt.datetime.now()
        self.current_period = dt.datetime(base.year, base.month, base.day)
        self.date = self.current_period.strftime("%Y-%m-%d")

    def maintenance(self):
        """Expire the old element of the set.

        Initialize a new bitarray and load the previous snapshots.
        Execute this at the beginning of each day.
        """
        self.initialize_period()
        self.initialize_bitarray()
        self.restore_from_disk()

    def compute_refresh_period(self):
        """Set warm_period: the number of seconds between two snapshot loads.

        A day is divided by (expiration - 2); the divisor is clamped to 1 so
        an expiration of 2 days or less no longer divides by zero or yields a
        negative period.
        """
        self.warm_period = (60 * 60 * 24) // max(self.expiration - 2, 1)

    def _should_warm(self):
        """Return True once the scheduled time of the next load is reached."""
        return time.time() >= self.next_snapshot_load

    def _snapshot_files(self):
        """Return the paths of all snapshot files of this filter on disk."""
        base_filename = "%s/%s_%s_*.dat" % (self.snapshot_path, self.name,
                                            self.expiration)
        return glob.glob(base_filename)

    @classmethod
    def _snapshot_period(cls, filename):
        """Parse the day encoded at the end of a snapshot filename."""
        date_part = filename.split('_')[-1]
        # Remove the suffix explicitly: str.strip('.dat') strips a character
        # set from both ends, not a suffix.
        if date_part.endswith(cls._SNAPSHOT_SUFFIX):
            date_part = date_part[:-len(cls._SNAPSHOT_SUFFIX)]
        return dt.datetime.strptime(date_part, cls._SNAPSHOT_DATE_FORMAT)

    def warm(self, jittering_ratio=0.2):
        """Progressively load the previous snapshot during the day.

        Loading all the snapshots at once can take a substantial amount of
        time. This method, if called periodically during the day, will
        progressively load those snapshots one by one. Because many workers
        are going to use this method at the same time, we add a jittering to
        the period between loads to avoid hammering the disk at the same time.
        """
        if self.snapshot_to_load is None:
            # First call: list the snapshots still inside the window.
            last_period = self.current_period - dt.timedelta(
                days=self.expiration - 1)
            self.compute_refresh_period()
            self.snapshot_to_load = [
                filename for filename in self._snapshot_files()
                if self._snapshot_period(filename) >= last_period
            ]
            self.ready = False

        if self.snapshot_to_load and self._should_warm():
            filename = self.snapshot_to_load.pop()
            self._union_bf_from_file(filename)
            jittering = self.warm_period * (np.random.random() - 0.5) * \
                jittering_ratio
            self.next_snapshot_load = time.time() + self.warm_period + \
                jittering
            if not self.snapshot_to_load:
                self.ready = True

    def _union_bf_from_file(self, filename, current=False):
        """OR the bit array stored in a snapshot file into this filter.

        :current: merge into current_day_bitarray instead of bitarray.
        """
        # Snapshots are zlib-compressed pickles: read as binary and close
        # the handle deterministically.
        with open(filename, 'rb') as f:
            payload = f.read()
        # SECURITY: unpickling executes arbitrary code; snapshot_path must
        # only ever contain files written by save_snaphot().
        snapshot = cPickle.loads(zlib.decompress(payload))
        if current:
            self.current_day_bitarray = self.current_day_bitarray | snapshot
        else:
            self.bitarray = self.bitarray | snapshot

    def restore_from_disk(self, clean_old_snapshot=False):
        """Restore the state of the BF using previous snapshots.

        :clean_old_snapshot: Delete the old snapshots on the disk
            (period < current - expiration)
        """
        last_period = self.current_period - dt.timedelta(
            days=self.expiration - 1)
        for filename in self._snapshot_files():
            snapshot_period = self._snapshot_period(filename)
            if snapshot_period < last_period:
                # Expired snapshots are never merged into the filter; they
                # are only removed when cleanup was requested.
                if clean_old_snapshot:
                    os.remove(filename)
                continue
            self._union_bf_from_file(filename)
            if snapshot_period == self.current_period:
                self._union_bf_from_file(filename, current=True)
        self.ready = True

    def save_snaphot(self):
        """Save the current state of the current day bitarray on disk.

        Save the internal representation (bitarray) into a binary file using
        this format:
            filename : name_expiration_2013-01-01.dat
        """
        filename = "%s/%s_%s_%s.dat" % (self.snapshot_path, self.name,
                                        self.expiration, self.date)
        # Compressed pickle data is binary, so write in binary mode.
        with open(filename, 'wb') as f:
            f.write(
                zlib.compress(
                    cPickle.dumps(self.current_day_bitarray,
                                  protocol=cPickle.HIGHEST_PROTOCOL)))

    def union_current_day(self, bf):
        """Union only the current_day of an other BF."""
        self.bitarray = self.bitarray | bf.current_day_bitarray
#key = 'rss:' + v[0] + ':' + tzids[tzid] + ':SUM' key = 'rss:' + v[0] + ':SUM' if (long(v[1]) < tt): delete[v[0]] = tzids.keys()[0] else: save[v[0]] = tzids.keys()[0] filter(oids) ### delete rows in counter #delete_rows_in_counter(delete) ### delete columns in meta meta.remove('rss.All', columns = delete.keys()) oid = save.keys()[0] counterkey = save[oid] print counterkey print "Start to get all counters ..." counters_generator = countercf.xget(counterkey, column_reversed=True, include_timestamp=True) print "A lot ...." for counter in counters_generator: print counter #print oids_generator #counter.remove(key)
class DailyTemporalBloomFilter(object):
    """Long Range Temporal BloomFilter using a daily resolution.

    For really high values of expiration (like 60 days) with a low
    requirement on precision. The actual error of this BF will be the native
    error of the BF plus the error related to the coarse aspect of the
    expiration, since information is no longer expired precisely. Also, as
    opposed to a classic Bloom Filter, this one will also have false
    positives (reporting membership for a non-member) AND false negatives
    (reporting non-membership for a member).

    The upper bound of the temporal_error can be theoretically quite high.
    However, if the items of the set are uniformly distributed over time, the
    average error will be something like 1.0 / expiration.
    """

    # Snapshot files are named "<name>_<expiration>_<YYYY-MM-DD>.dat".
    _SNAPSHOT_SUFFIX = '.dat'
    _SNAPSHOT_DATE_FORMAT = "%Y-%m-%d"

    def __init__(self, capacity, error_rate, expiration, name, snapshot_path,
                 cassandra_session):
        """Build an empty filter and make sure its archive column family exists.

        :capacity: value used, with error_rate, to size the bit arrays.
        :error_rate: target error rate used to size the bit arrays.
        :expiration: retention window, in days.
        :name: prefix of the snapshot files and of the archive row keys.
        :snapshot_path: directory holding the daily snapshot files.
        :cassandra_session: object handed to ColumnFamily as its connection.
        """
        self.error_rate = error_rate
        self.capacity = capacity
        # Sizing must happen before the bit arrays are allocated.
        self._initialize_parameters()
        self.initialize_bitarray()
        self.count = 0
        self.hashed_values = []
        self.name = name
        self.snapshot_path = snapshot_path
        self.expiration = expiration
        self.initialize_period()

        self.snapshot_to_load = None
        self.ready = False
        self.warm_period = None
        self.next_snapshot_load = time.time()

        self.cassandra_session = cassandra_session
        self.cassandra_columns_family = "temporal_bf"
        self.keyspace = 'parsely'
        self.uncommited_keys = []
        self.commit_batch = 1000
        self.columnfamily = None
        self.ensure_cassandra_cf()

    def _initialize_parameters(self):
        """Derive slice count, slice width and hash functions from
        capacity and error_rate."""
        self.nbr_slices = int(np.ceil(np.log2(1.0 / self.error_rate)))
        self.bits_per_slice = int(
            np.ceil((self.capacity * abs(np.log(self.error_rate))) /
                    (self.nbr_slices * (np.log(2) ** 2))))
        self.nbr_bits = self.nbr_slices * self.bits_per_slice
        self.hashes = generate_hashfunctions(self.bits_per_slice,
                                             self.nbr_slices)

    def ensure_cassandra_cf(self):
        """Create the keyspace and column family if missing, then bind
        self.columnfamily to the archive column family."""
        s = SystemManager()
        try:
            if self.keyspace not in s.list_keyspaces():
                s.create_keyspace(self.keyspace, SIMPLE_STRATEGY,
                                  {'replication_factor': '1'})
            if self.cassandra_columns_family not in \
                    s.get_keyspace_column_families(self.keyspace):
                s.create_column_family(self.keyspace,
                                       self.cassandra_columns_family)
        finally:
            # Release the schema-management connection even on failure.
            s.close()
        self.columnfamily = ColumnFamily(self.cassandra_session,
                                         self.cassandra_columns_family)

    def archive_bf_key(self, bf_key):
        """Buffer a key and flush the buffer to the archive once it reaches
        commit_batch keys.

        The batch is written as one row keyed "<name>_<YYYY-MM-DD:HH>" of the
        flush time, one empty-valued column per key. Keys below the batch
        threshold stay in memory only.
        """
        self.uncommited_keys.append(bf_key)
        if len(self.uncommited_keys) >= self.commit_batch:
            current_period_hour = dt.datetime.now().strftime('%Y-%m-%d:%H')
            self.columnfamily.insert(
                '%s_%s' % (self.name, current_period_hour),
                {k: '' for k in self.uncommited_keys})
            self.uncommited_keys = []

    def _hour_range(self, start, end, reverse=False, inclusive=True):
        """Generator that gives us all the hours between a start and end
        datetime (inclusive)."""

        def total_seconds(td):
            return (td.microseconds +
                    (td.seconds + td.days * 24.0 * 3600.0) * 10.0**6) / 10.0**6

        hours = int(math.ceil(total_seconds(end - start) / (60.0 * 60.0)))
        if inclusive:
            hours += 1
        for i in xrange(hours):
            if reverse:
                yield end - dt.timedelta(hours=i)
            else:
                yield start + dt.timedelta(hours=i)

    def _archive_rows(self):
        """Return the archive row keys covering the retention window up to
        the current time, one per hour."""
        last_period = self.current_period - dt.timedelta(
            days=self.expiration - 1)
        return ["%s_%s" % (self.name, hour.strftime('%Y-%m-%d:%H'))
                for hour in self._hour_range(last_period, dt.datetime.now())]

    def resize(self, new_capacity):
        """Re-size the filter for new_capacity and repopulate it from the
        archive."""
        self.capacity = new_capacity
        self._initialize_parameters()
        self.rebuild_from_archive()

    def _drop_archive(self):
        """Best-effort removal of every archive row in the retention window."""
        for row in self._archive_rows():
            try:
                self.columnfamily.remove(row)
            except Exception:
                # Deliberately best effort: one failing row must not stop the
                # remaining rows from being dropped.
                continue

    def rebuild_from_archive(self):
        """Rebuild the BF using the archived items"""
        self.initialize_bitarray()
        rows = self._archive_rows()
        # column_count is passed as an integer (the original passed the
        # float 1E6).
        rows_content = self.columnfamily.multiget(rows, column_count=1000000)
        for row_content in rows_content.values():
            for k in row_content.keys():
                self.add(k, rebuild_mode=True)

    def initialize_bitarray(self):
        """Initialize both bitarray.

        This BF contains two bit arrays instead of a single one like a plain
        BF. bitarray is the main bit array where all the historical items are
        stored. It's the one used for the membership query. The second one,
        current_day_bitarray, is the one used for creating the daily snapshot.
        """
        self.bitarray = bitarray.bitarray(self.nbr_bits)
        self.current_day_bitarray = bitarray.bitarray(self.nbr_bits)
        self.bitarray.setall(False)
        self.current_day_bitarray.setall(False)

    def __contains__(self, key):
        """Check membership.

        Side effect: stores the hashes of key in self.hashed_values, which
        add() reuses.
        """
        self.hashed_values = self.hashes(key)
        offset = 0
        for value in self.hashed_values:
            if not self.bitarray[offset + value]:
                return False
            offset += self.bits_per_slice
        return True

    def add(self, key, rebuild_mode=False):
        """Add key to the filter.

        Returns True when the key was already reported as present, False
        when it was newly set. Unless rebuild_mode is set, the key is also
        queued for archiving (even when already present).
        """
        if not rebuild_mode:
            self.archive_bf_key(key)
        if key in self:
            return True
        offset = 0
        # The membership test above already hashed the key; only re-hash if
        # that left nothing usable.
        # NOTE(review): assumes self.hashes returns a re-iterable sequence,
        # not a one-shot generator - confirm against generate_hashfunctions.
        if not self.hashed_values:
            self.hashed_values = self.hashes(key)
        for value in self.hashed_values:
            self.bitarray[offset + value] = True
            self.current_day_bitarray[offset + value] = True
            offset += self.bits_per_slice
        self.count += 1
        return False

    def initialize_period(self, period=None):
        """Initialize the period of BF.

        :period: datetime.datetime for setting the period explicitly;
            defaults to now. Only the year/month/day part is kept.
        """
        base = period if period else dt.datetime.now()
        self.current_period = dt.datetime(base.year, base.month, base.day)
        self.date = self.current_period.strftime("%Y-%m-%d")

    def maintenance(self):
        """Expire the old element of the set.

        Initialize a new bitarray and load the previous snapshots.
        Execute this at the beginning of each day.
        """
        self.initialize_period()
        self.initialize_bitarray()
        self.restore_from_disk()

    def compute_refresh_period(self):
        """Set warm_period: the number of seconds between two snapshot loads.

        A day is divided by (expiration - 2); the divisor is clamped to 1 so
        an expiration of 2 days or less no longer divides by zero or yields a
        negative period.
        """
        self.warm_period = (60 * 60 * 24) // max(self.expiration - 2, 1)

    def _should_warm(self):
        """Return True once the scheduled time of the next load is reached."""
        return time.time() >= self.next_snapshot_load

    def _snapshot_files(self):
        """Return the paths of all snapshot files of this filter on disk."""
        base_filename = "%s/%s_%s_*.dat" % (self.snapshot_path, self.name,
                                            self.expiration)
        return glob.glob(base_filename)

    @classmethod
    def _snapshot_period(cls, filename):
        """Parse the day encoded at the end of a snapshot filename."""
        date_part = filename.split('_')[-1]
        # Remove the suffix explicitly: str.strip('.dat') strips a character
        # set from both ends, not a suffix.
        if date_part.endswith(cls._SNAPSHOT_SUFFIX):
            date_part = date_part[:-len(cls._SNAPSHOT_SUFFIX)]
        return dt.datetime.strptime(date_part, cls._SNAPSHOT_DATE_FORMAT)

    def warm(self, jittering_ratio=0.2):
        """Progressively load the previous snapshot during the day.

        Loading all the snapshots at once can take a substantial amount of
        time. This method, if called periodically during the day, will
        progressively load those snapshots one by one. Because many workers
        are going to use this method at the same time, we add a jittering to
        the period between loads to avoid hammering the disk at the same time.
        """
        if self.snapshot_to_load is None:
            # First call: list the snapshots still inside the window.
            last_period = self.current_period - dt.timedelta(
                days=self.expiration - 1)
            self.compute_refresh_period()
            self.snapshot_to_load = [
                filename for filename in self._snapshot_files()
                if self._snapshot_period(filename) >= last_period
            ]
            self.ready = False

        if self.snapshot_to_load and self._should_warm():
            filename = self.snapshot_to_load.pop()
            self._union_bf_from_file(filename)
            jittering = self.warm_period * (np.random.random() - 0.5) * \
                jittering_ratio
            self.next_snapshot_load = time.time() + self.warm_period + \
                jittering
            if not self.snapshot_to_load:
                self.ready = True

    def _union_bf_from_file(self, filename, current=False):
        """OR the bit array stored in a snapshot file into this filter.

        :current: merge into current_day_bitarray instead of bitarray.
        """
        # Snapshots are zlib-compressed pickles: read as binary and close
        # the handle deterministically.
        with open(filename, 'rb') as f:
            payload = f.read()
        # SECURITY: unpickling executes arbitrary code; snapshot_path must
        # only ever contain files written by save_snaphot().
        snapshot = cPickle.loads(zlib.decompress(payload))
        if current:
            self.current_day_bitarray = self.current_day_bitarray | snapshot
        else:
            self.bitarray = self.bitarray | snapshot

    def restore_from_disk(self, clean_old_snapshot=False):
        """Restore the state of the BF using previous snapshots.

        :clean_old_snapshot: Delete the old snapshots on the disk
            (period < current - expiration)
        """
        last_period = self.current_period - dt.timedelta(
            days=self.expiration - 1)
        for filename in self._snapshot_files():
            snapshot_period = self._snapshot_period(filename)
            if snapshot_period < last_period:
                # Expired snapshots are never merged into the filter; they
                # are only removed when cleanup was requested.
                if clean_old_snapshot:
                    os.remove(filename)
                continue
            self._union_bf_from_file(filename)
            if snapshot_period == self.current_period:
                self._union_bf_from_file(filename, current=True)
        self.ready = True

    def save_snaphot(self):
        """Save the current state of the current day bitarray on disk.

        Save the internal representation (bitarray) into a binary file using
        this format:
            filename : name_expiration_2013-01-01.dat
        """
        filename = "%s/%s_%s_%s.dat" % (self.snapshot_path, self.name,
                                        self.expiration, self.date)
        # Compressed pickle data is binary, so write in binary mode.
        with open(filename, 'wb') as f:
            f.write(
                zlib.compress(
                    cPickle.dumps(self.current_day_bitarray,
                                  protocol=cPickle.HIGHEST_PROTOCOL)))

    def union_current_day(self, bf):
        """Union only the current_day of an other BF."""
        self.bitarray = self.bitarray | bf.current_day_bitarray
def Remove(pool, columnFamily, key, val, *args, **kwargs):
    """Remove the columns ``val`` from row ``key`` of ``columnFamily``.

    A ColumnFamily handle is opened on ``pool`` for the call. Any extra
    positional and keyword arguments are forwarded unchanged to
    ``ColumnFamily.remove``. Nothing is returned.
    """
    target = ColumnFamily(pool, columnFamily)
    target.remove(key, columns=val, *args, **kwargs)
class TestTimeUUIDs(unittest.TestCase):
    """Slice a TimeUUID-named row of the 'StdTimeUUID' column family by time."""

    def setUp(self):
        self.cf_time = ColumnFamily(pool, 'StdTimeUUID')

    def tearDown(self):
        # Every test writes to the same row, so wipe it after each one.
        self.cf_time.remove('key1')

    def _check_time_slices(self, clock, pause):
        """Insert two uuid1 columns separated by pauses and verify slicing.

        ``clock`` is called before, between and after the two inserts to
        produce the slice boundaries; ``pause`` is the number of seconds
        slept after each insert so the boundaries are strictly ordered.
        """
        key = 'key1'
        marks = [clock()]

        first_id = uuid1()
        first_col = {first_id: '0'}
        self.cf_time.insert(key, first_col)
        time.sleep(pause)
        marks.append(clock())

        second_id = uuid1()
        second_col = {second_id: '1'}
        self.cf_time.insert(key, second_col)
        time.sleep(pause)
        marks.append(clock())

        both = {first_id: '0', second_id: '1'}

        # (slice bounds passed to get(), columns expected back)
        expectations = [
            ({'column_start': marks[0]}, both),
            ({'column_finish': marks[2]}, both),
            ({'column_start': marks[0], 'column_finish': marks[2]}, both),
            ({'column_start': marks[0], 'column_finish': marks[2]}, both),
            ({'column_start': marks[0], 'column_finish': marks[1]}, first_col),
            ({'column_start': marks[1], 'column_finish': marks[2]}, second_col),
        ]
        for bounds, expected in expectations:
            assert_equal(self.cf_time.get(key, **bounds), expected)

    def test_datetime_to_uuid(self):
        """Slice boundaries given as datetime objects."""
        self._check_time_slices(datetime.now, 1)

    def test_time_to_uuid(self):
        """Slice boundaries given as float timestamps from time.time()."""
        self._check_time_slices(time.time, 0.1)

    def test_auto_time_to_uuid1(self):
        """A float timestamp used as a column name reads back as a UUID
        whose time matches the timestamp to three decimal places."""
        key = 'key1'
        now = time.time()
        self.cf_time.insert(key, {now: 'foo'})

        stored = self.cf_time.get(key)
        stored_uuid = stored.keys()[0]
        assert_almost_equal(convert_uuid_to_time(stored_uuid), now, places=3)
class CassandraDemo(object): def __init__(self, database, table): self.database = database self.table = table def create_connections(self): self.pool = ConnectionPool(self.database) self.cf = ColumnFamily(self.pool, self.table) def create_database_and_table(self): super_cf = False # consider super columns to be deprecated s = SystemManager() # create keyspace if it doesn't exist if database not in s.list_keyspaces(): s.create_keyspace(database, SIMPLE_STRATEGY, {'replication_factor': '1'}) # delete column family from the keyspace if it does exist. if table in s.get_keyspace_column_families(database): s.drop_column_family(database, table) # create coulmn family in the keyspace if table not in s.get_keyspace_column_families(database): print("table is creating...") s.create_column_family(database, table, super = super_cf, comparator_type = ASCII_TYPE) s.close() return True def insert_data(self): print '\nemployee data is inserting...' self.cf.insert('1', {'fn':'yogesh', 'ln':'kumar', 'ct': 'Ajmer', 'em': '*****@*****.**'}) self.cf.insert('2', {'fn':'amit', 'ln':'pandita', 'ct': 'Delhi', 'em': '*****@*****.**'}) self.cf.insert('3', {'fn':'sandeep', 'ln':'tak', 'ct': 'Ajmer', 'em': '*****@*****.**', 'mb': '8890467032'}) def get_data(self): print '\nemployee data is featching...' data1 = self.cf.get('1') data2 = self.cf.get('2', columns = ['fn', 'ln', 'em']) data3 = self.cf.get('3', column_start = 'ct', column_finish = 'fn') data4 = self.cf.get('1', column_reversed = False, column_count = 3) data5 = self.cf.get('1', column_reversed = True, column_count = 3) print data1 print data2 print data3 print data4 print data5 def get_multiple_data(self): print '\ngetting multiple employees data...' row_keys = ['1','2','3'] data = self.cf.multiget(row_keys) print data def get_data_by_range(self): ''' if you get an error don't worry about this, it's a Cassandra limitation Issue ''' print '\ngetting employees data by range...' 
start_row_key = '1' end_row_key = '3' data = self.cf.get_range(start = start_row_key, finish = end_row_key) for key, columns in data: print key,coulmns def get_count(self): print '\nget employee row\'s colunm count' print self.cf.get_count('1') print self.cf.get_count('1', columns = ['fn', 'ln']) print self.cf.get_count('1', column_start = 'em') def get_multi_count(self): print '\nget multiple employees row\'s colunm count' row_keys = ['1','2','3'] columns = ['fn', 'ln', 'mb'] column_start = 'ct' column_finish = 'fn' print self.cf.multiget_count(row_keys) print self.cf.multiget_count(row_keys, columns = columns) print self.cf.multiget_count(row_keys, column_start = column_start, column_finish = column_finish) def update_data(self): print '\nemployee data is updating...' self.cf.insert('1', {'pwd':'yoku@2010', 'ct':'Noida'}) def delete_data(self): print '\ndelete data from employee' row = '2' self.cf.remove(row) def get_all_rows(self): print '\ngetting rows name...' print [v[0] for v in self.cf.get_range()] def get_all_columns_of_row(self): print '\ngetting columns name of a row' row = '1' data = self.cf.get(row) print data.keys()
class DailyTemporalBloomFilter(DailyTemporalBase):
    """Long Range Temporal BloomFilter using a daily resolution.

    For really high values of expiration (like 60 days) with a low
    requirement on precision. The actual error of this BF will be the native
    error of the BF plus the error related to the coarse aspect of the
    expiration, since information is no longer expired precisely. Also, as
    opposed to a classic Bloom Filter, this one will also have false
    positives (reporting membership for a non-member) AND false negatives
    (reporting non-membership for a member).

    The upper bound of the temporal_error can theoretically be quite high.
    However, if the items of the set are uniformly distributed over time,
    the average error will be something like 1.0 / expiration.
    """

    def __new__(cls, capacity, error_rate, expiration, name,
                cassandra_session, snapshot_path='./'):
        # Only capacity and error_rate are forwarded to the base allocator.
        return super(DailyTemporalBloomFilter, cls).__new__(
            cls, capacity=capacity, error_rate=error_rate)

    def __init__(self, capacity, error_rate, expiration, name,
                 cassandra_session, snapshot_path='./'):
        """Set up the filter and make sure its Cassandra archive exists.

        :capacity: forwarded to the base class.
        :error_rate: forwarded to the base class.
        :expiration: number of days an item is kept.
        :name: filter name, used in archive row keys and snapshot filenames.
        :cassandra_session: object handed to ColumnFamily as its connection.
        :snapshot_path: directory in which snapshot files are read/written.
        """
        super(DailyTemporalBloomFilter, self).__init__(
            capacity=capacity, error_rate=error_rate)
        self.bf_name = name
        self.expiration = expiration
        self.initialize_period()
        self.cassandra_session = cassandra_session
        self.cassandra_columns_family = "temporal_bf"
        self.keyspace = 'parsely'
        self.uncommited_keys = []
        self.commit_batch = 1000
        self.columnfamily = None
        self.ensure_cassandra_cf()
        self.snapshot_path = snapshot_path

    def ensure_cassandra_cf(self):
        """Create the keyspace / column family if missing and bind to it."""
        s = SystemManager()
        try:
            if self.keyspace not in s.list_keyspaces():
                s.create_keyspace(self.keyspace, SIMPLE_STRATEGY,
                                  {'replication_factor': '1'})
            if self.cassandra_columns_family not in \
                    s.get_keyspace_column_families(self.keyspace):
                s.create_column_family(self.keyspace,
                                       self.cassandra_columns_family)
        finally:
            # the system manager is only needed for the schema calls
            s.close()
        self.columnfamily = ColumnFamily(self.cassandra_session,
                                         self.cassandra_columns_family)

    def archive_bf_key(self, bf_key):
        """Buffer a key and flush the buffer to Cassandra once it is full.

        Keys are written as column names (with empty values) of a row named
        ``<bf_name>_<YYYY-MM-DD:HH>`` for the hour at which the flush occurs.
        """
        self.uncommited_keys.append(bf_key)
        if len(self.uncommited_keys) >= self.commit_batch:
            current_period_hour = dt.datetime.now().strftime('%Y-%m-%d:%H')
            self.columnfamily.insert(
                '%s_%s' % (self.bf_name, current_period_hour),
                dict.fromkeys(self.uncommited_keys, ''))
            self.uncommited_keys = []

    def _hour_range(self, start, end, reverse=False, inclusive=True):
        """Generator that gives us all the hours between a start and end
        datetime (inclusive)."""
        hours = int(math.ceil((end - start).total_seconds() / (60.0 * 60.0)))
        if inclusive:
            hours += 1
        for i in xrange(hours):
            if reverse:
                yield end - dt.timedelta(hours=i)
            else:
                yield start + dt.timedelta(hours=i)

    def _day_range(self, start, end, reverse=False, inclusive=True):
        """Generator that gives us all the days between a start and end
        datetime (inclusive)."""
        days = (end - start).days
        if inclusive:
            days += 1
        for i in xrange(days):
            if reverse:
                yield end - dt.timedelta(days=i)
            else:
                yield start + dt.timedelta(days=i)

    def _drop_archive(self):
        """Remove every hourly archive row of the retention window.

        Removal is best effort: a failure on one row does not prevent the
        remaining rows from being removed.
        """
        last_period = self.current_period - dt.timedelta(
            days=self.expiration - 1)
        for hour in self._hour_range(last_period, dt.datetime.now()):
            row = "%s_%s" % (self.bf_name, hour.strftime('%Y-%m-%d:%H'))
            try:
                self.columnfamily.remove(row)
            except Exception:
                # best effort: keep going with the next hourly row
                continue

    def rebuild_from_archive(self, rebuild_snapshot=True):
        """Rebuild the BF using the archived items.

        :rebuild_snapshot: when true, save a snapshot for each replayed day.
        """
        self.initialize_bitarray()
        # Existing snapshots are not deleted here (a delete_snapshots() call
        # was previously disabled at this point).

        last_period = self.current_period - dt.timedelta(
            days=self.expiration - 1)
        hour_suffixes = ["%02d" % hour for hour in range(24)]

        for day in self._day_range(last_period, dt.datetime.now()):
            day_str = day.strftime('%Y-%m-%d')
            rows = ["%s_%s:%s" % (self.bf_name, day_str, suffix)
                    for suffix in hour_suffixes]
            # column_count must be an integer, not the float literal 1E6
            rows_content = self.columnfamily.multiget(rows,
                                                      column_count=1000000)
            update_current = day == self.current_period

            for row in rows_content.values():
                for k in row.keys():
                    self.add_rebuild(k, update_current)

            if rebuild_snapshot:
                self.save_snaphot(override_period=day)

            if not update_current:
                self.initialize_current_day_bitarray()

    def restore_from_disk(self, clean_old_snapshot=False):
        """Restore the state of the BF using previous snapshots.

        Snapshots older than the retention window are never merged into the
        filter.

        :clean_old_snapshot: Delete the old snapshot on the disk
            (period < current - expiration)
        """
        base_filename = "%s/%s_%s_*.dat" % (self.snapshot_path, self.bf_name,
                                            self.expiration)
        last_period = self.current_period - dt.timedelta(
            days=self.expiration - 1)

        for filename in glob.glob(base_filename):
            # Drop the real ".dat" suffix; str.strip('.dat') would strip a
            # character set from both ends instead.
            date_part = os.path.splitext(filename.split('_')[-1])[0]
            snapshot_period = dt.datetime.strptime(date_part, "%Y-%m-%d")

            if snapshot_period < last_period:
                # Expired snapshot: skip it, deleting the file on request.
                if clean_old_snapshot:
                    os.remove(filename)
                continue

            self._union_bf_from_file(filename)
            if snapshot_period == self.current_period:
                self._union_bf_from_file(filename, current=True)

        self.ready = True

    def add_rebuild(self, key, update_current=True):
        """Add a key through the base class only, without archiving it."""
        super(DailyTemporalBloomFilter, self).add(key, update_current)

    def add(self, key_string):
        """Archive the key, then add it to the filter.

        Unicode keys are encoded to UTF-8 first; other values are used as is.
        Returns whatever the base class ``add`` returns.
        """
        if isinstance(key_string, unicode):
            key = key_string.encode('utf8')
        else:
            key = key_string

        self.archive_bf_key(key)
        return super(DailyTemporalBloomFilter, self).add(key)

    def resize(self, new_capacity=None, new_error_rate=None):
        """Change capacity and/or error rate, then rebuild from the archive.

        A falsy argument keeps the current value of that setting.
        """
        self._set_capacity(new_capacity or self.capacity)
        self._set_error_rate(new_error_rate or self.error_rate)
        self._initialize_parameters()
        self.initialize_bitarray()
        self.rebuild_from_archive(rebuild_snapshot=True)

    def initialize_period(self, period=None):
        """Initialize the period of BF.

        The period is truncated to midnight of its day.

        :period: datetime.datetime for setting the period explicitly.
        """
        reference = period if period else dt.datetime.now()
        self.current_period = dt.datetime(reference.year, reference.month,
                                          reference.day)
        self.date = self.current_period.strftime("%Y-%m-%d")

    def save_snaphot(self, override_period=None):
        """Save the current state of the current day bitarray on disk.

        Save the internal representation (bitarray) into a binary file using
        this format:
            filename : name_expiration_2013-01-01.dat

        :override_period: datetime used for the filename date instead of the
            current period.
        """
        period = override_period or self.current_period
        filename = "%s/%s_%s_%s.dat" % (self.snapshot_path, self.bf_name,
                                        self.expiration,
                                        period.strftime("%Y-%m-%d"))
        self._save_snapshot(filename)