def read(metric, start_time, end_time, tags):
    """Read the stored points of ``metric`` between two timestamps.

    The column family is chosen from the width of the requested window as
    reported by ``timeDiff(start_time, end_time)`` (presumably seconds --
    TODO confirm): ``rawdata`` up to 3600, ``rollups60`` up to 7200,
    ``rollups300`` up to 86400, ``rollups7200`` up to 2592000 and
    ``rollups86400`` beyond that.

    One row is read per "uper time" bucket between
    ``start_time // upertime_interval`` and ``end_time // upertime_interval``
    (inclusive); the columns of all buckets are merged in bucket order.

    Returns the merged columns as an ordered mapping, or ``None`` as soon as
    one of the bucket rows does not exist.
    """
    from collections import OrderedDict

    # Decide which column family to read based on the time difference.
    span = timeDiff(start_time, end_time)
    if span <= 3600:
        cf_name = 'rawdata'
    elif span <= 7200:
        cf_name = 'rollups60'
    elif span <= 86400:
        cf_name = 'rollups300'
    elif span <= 2592000:
        cf_name = 'rollups7200'
    else:
        cf_name = 'rollups86400'

    # Convert start_time / end_time to their bucket ("uper time") indexes.
    start_upertime = start_time // upertime_interval
    end_upertime = end_time // upertime_interval

    pool = ConnectionPool(keyspace, [address])
    try:
        col_fam = pycassa.ColumnFamily(pool, cf_name)
        points = OrderedDict()
        for bucket in range(start_upertime, end_upertime + 1):
            key = generate_key(metric, bucket, tags)
            try:
                # Accumulate instead of rebinding, so earlier buckets are
                # not thrown away when the window spans several rows.
                # NOTE(review): get() is called without column_count, so the
                # library default cap on returned columns applies -- confirm
                # that it is large enough for one bucket.
                points.update(col_fam.get(key,
                                          column_start=start_time,
                                          column_finish=end_time))
            except pycassa.NotFoundException:
                return None
        return points
    finally:
        # Runs on every exit path, including the early ``return None``.
        pool.dispose()
def main(filename):
    """Load a file of tweets (one per line) into the 'tuitterdb' keyspace.

    Every line is parsed with ``tweet_get`` and written to the column
    families 'user', 'tweet', 'userTweets', 'followers' and 'followsTweets'.
    A tweet that fails while being processed is reported on stdout and
    skipped; the import carries on with the next line.

    filename -- path of the file to import.
    """
    # Set up the connection pool
    pool = ConnectionPool('tuitterdb', ['localhost:9160'])
    try:
        # CF connections
        user_family = ColumnFamily(pool, 'user')
        tweet_family = ColumnFamily(pool, 'tweet')
        user_tweets_family = ColumnFamily(pool, 'userTweets')
        followers = ColumnFamily(pool, 'followers')
        followerTweets = ColumnFamily(pool, 'followsTweets')

        # Batch definitions
        user_batch = user_family.batch(queue_size=1000)
        followers_batch = followers.batch(queue_size=500)
        user_tweets_batch = user_tweets_family.batch(queue_size=500)
        followerTweets_batch = followerTweets.batch(queue_size=500)
        batches = (user_batch, followers_batch, user_tweets_batch,
                   followerTweets_batch)

        try:
            with open(filename) as data:
                for line in data:
                    tweet = tweet_get(line)
                    try:
                        tweet_data = get_tweet_data(tweet)

                        # The sender is needed by every insert below, so it
                        # is resolved even when the user row already exists.
                        sender = get_sender(tweet)
                        # check_user's return type is not visible here, so
                        # the explicit comparison with False is kept.
                        if check_user(tweet[u"from_user_id_str"]) == False:
                            # create user entry for sender
                            user_batch.insert(
                                sender.user_id,
                                {'user_name': sender.user_name,
                                 'screen_name': sender.from_user})
                        # The whole tweet line is the column name of the
                        # sender's userTweets row.
                        user_tweets_batch.insert(sender.user_id, {line: ''})

                        if (tweet[u"to_user"] is not None
                                and check_user(tweet[u"to_user_id"]) == False):
                            to_user = get_to_user(tweet)
                            user_batch.insert(
                                to_user.user_id,
                                {'user_name': to_user.user_name,
                                 'screen_name': to_user.from_user})
                            followers_batch.insert(
                                to_user.user_id,
                                {sender.user_id: 'follower_id'})
                            # The whole tweet line is the column name of the
                            # addressee's followsTweets row.
                            followerTweets_batch.insert(to_user.user_id,
                                                        {line: ''})

                        # Add the mentioned users to users and follows.
                        if u"entities" in tweet:
                            for obj in get_mentions(tweet) if tweet[u"entities"][u"user_mentions"] != [] else []:
                                if check_user(obj.user_id) == False:
                                    user_batch.insert(
                                        obj.user_id,
                                        {'user_name': obj.user_name,
                                         'screen_name': obj.from_user})
                                # NOTE(review): the column name here is the
                                # constant 'user_id', unlike the to_user case
                                # above which uses the follower id -- confirm
                                # which layout is intended.
                                followers_batch.insert(
                                    obj.user_id, {'user_id': sender.user_id})
                                followerTweets_batch.insert(obj.user_id,
                                                            {line: ''})

                        # Store the tweet itself whether or not it mentions
                        # anybody (previously an ``else: continue`` skipped
                        # this insert for tweets without mentions).
                        tweet_family.insert(
                            tweet_data.tweet_id,
                            {'text': tweet_data.textbody,
                             'user_id': sender.user_id,
                             'timeanddate': tweet_data.timestamp})
                    except Exception:
                        # Report the failure with its traceback and go on
                        # with the next tweet.
                        err = sys.exc_info()
                        print("Broken cos %s %s %s" % (
                            err[0], err[1],
                            "".join(traceback.format_tb(err[2]))))
        finally:
            # A batch only sends by itself when its queue fills up; flush
            # whatever is still queued.
            for batch in batches:
                batch.send()
    finally:
        # Pools closed.
        pool.dispose()
def main(filename):
    """Load a file of tweets (one per line) into the 'tuitterdb' keyspace.

    Every line is parsed with ``tweet_get`` and written to the column
    families 'user', 'tweet', 'userTweets' and 'followers'.  A tweet that
    fails while being processed is reported on stdout and skipped; the
    import carries on with the next line.

    filename -- path of the file to import.
    """
    # Set up the connection pool
    pool = ConnectionPool('tuitterdb', ['localhost:9160'])
    try:
        # CF connections
        user_family = ColumnFamily(pool, 'user')
        tweet_family = ColumnFamily(pool, 'tweet')
        user_tweets_family = ColumnFamily(pool, 'userTweets')
        followers = ColumnFamily(pool, 'followers')

        # Batch definitions
        user_batch = user_family.batch(queue_size=1000)
        followers_batch = followers.batch(queue_size=500)
        user_tweets_batch = user_tweets_family.batch(queue_size=500)
        batches = (user_batch, followers_batch, user_tweets_batch)

        try:
            # Iterating the file stops at end of file; the previous
            # ``readline() is None`` test could never be true because
            # readline() returns '' there.
            with open(filename) as data:
                for line in data:
                    tweet = tweet_get(line)
                    try:
                        tweet_data = get_tweet_data(tweet)

                        # The sender is needed by every insert below, so it
                        # is resolved even when the user row already exists.
                        sender = get_sender(tweet)
                        # check_user's return type is not visible here, so
                        # the explicit comparison with False is kept.
                        if check_user(tweet[u"from_user_id_str"]) == False:
                            user_batch.insert(
                                sender.user_id,
                                {'user_name': sender.user_name,
                                 'screen_name': sender.from_user})
                        user_tweets_batch.insert(
                            sender.user_id,
                            {tweet_data.tweet_id: tweet_data.timestamp})

                        if (tweet[u"to_user"] is not None
                                and check_user(tweet[u"to_user_id"]) == False):
                            to_user = get_to_user(tweet)
                            user_batch.insert(
                                to_user.user_id,
                                {'user_name': to_user.user_name,
                                 'screen_name': to_user.from_user})
                            # NOTE(review): the column name is the constant
                            # 'user_id', so each insert replaces the previous
                            # follower of that row -- confirm this is wanted.
                            followers_batch.insert(
                                to_user.user_id, {'user_id': sender.user_id})

                        if (u"entities" in tweet
                                and tweet[u"entities"][u"user_mentions"] != []):
                            for obj in get_mentions(tweet):
                                user_batch.insert(
                                    obj.user_id,
                                    {'user_name': obj.user_name,
                                     'screen_name': obj.from_user})
                                followers_batch.insert(
                                    obj.user_id, {'user_id': sender.user_id})

                        # Store the tweet itself whether or not it mentions
                        # anybody (previously an ``else: continue`` skipped
                        # this insert for tweets without mentions).
                        tweet_family.insert(
                            tweet_data.tweet_id,
                            {'text': tweet_data.textbody,
                             'user_id': sender.user_id,
                             'timeanddate': tweet_data.timestamp})
                    except Exception:
                        # Report the failure with its traceback and go on
                        # with the next tweet.
                        err = sys.exc_info()
                        print("Broken cos %s %s %s" % (
                            err[0], err[1],
                            "".join(traceback.format_tb(err[2]))))
        finally:
            # A batch only sends by itself when its queue fills up; flush
            # whatever is still queued.
            for batch in batches:
                batch.send()
    finally:
        # Pools closed.
        pool.dispose()

#if __name__ == "__main__":
    #unittest.main()
def get_values(servlst, ks, cf, key):
    """Return the column values stored under ``key``.

    servlst -- server list handed to ConnectionPool.
    ks      -- keyspace name.
    cf      -- column family name.
    key     -- row key to read.

    Returns the values of the row's columns.  When the row does not exist an
    error line is printed and "" is returned.  Any other failure prints the
    error and terminates the process with exit status -1.
    """
    # Bound before the try so the finally clause can tell whether a pool was
    # ever created; otherwise a failing ConnectionPool() would end in a
    # NameError that hides the real error.
    pool = None
    try:
        pool = ConnectionPool(ks, servlst)
        cf_handle = ColumnFamily(pool, cf)
        result = cf_handle.get(key).values()
    except pycassa.NotFoundException:
        print("[ERROR] " + key + " not found")
        result = ""
    except Exception as err:
        print("[ERROR] " + str(err))
        exit(-1)
    finally:
        if pool is not None:
            pool.dispose()
    return result
def write(vl, data=None):
    """Store every value of the sample ``vl`` in the 'rawdata' column family.

    The row key joins host, plugin, plugin_instance, type, type_instance and
    the sample's year-month with '#', so rows are partitioned by month.  Each
    value is also appended to /tmp/workfile as a test aid.

    vl   -- sample object; its host, plugin, plugin_instance, type,
            type_instance, time and values attributes are read.
    data -- unused by this function.
    """
    # Get a connection from the pycassa connection pool; the 'Monitor'
    # keyspace holds the monitoring data.
    pool = ConnectionPool('Monitor', ['localhost:9160'])
    try:
        col_fam_rawdata = pycassa.ColumnFamily(pool, 'rawdata')

        # Include the time in the key to partition the data by month.
        time_string = time.strftime("%Y-%m", time.localtime(vl.time))
        key_string = "#".join([str(vl.host), str(vl.plugin),
                               str(vl.plugin_instance), str(vl.type),
                               str(vl.type_instance), time_string])

        for value in vl.values:
            # Insert into rawdata.
            # NOTE(review): every value of one sample uses the same column
            # name (vl.time), so later values replace earlier ones -- confirm
            # that samples carry a single value.
            col_fam_rawdata.insert(key_string, {vl.time: value})
            # Also write to a file, for testing.  The with-block closes the
            # file; no separate close() is needed (the old trailing
            # f.close() failed with NameError when vl.values was empty).
            with open('/tmp/workfile', 'a') as f:
                f.write(key_string + " " + str(vl.time) + " " + str(value) + "\n")
    finally:
        pool.dispose()
def test_pool(self):
    # Walks a ConnectionPool (pool_size=5, max_overflow=5) through checkout,
    # exhaustion, check-in and re-checkout, asserting the counters that the
    # StatsLogger listener records after each step.
    listener = StatsLogger()
    pool = ConnectionPool(
        keyspace='PycassaTestKeyspace',
        credentials=_credentials,
        pool_size=5,
        max_overflow=5,
        recycle=10000,
        prefill=True,
        pool_timeout=0.1,
        timeout=1,
        listeners=[listener],
        use_threadlocal=False)

    # Check out ten connections.
    conns = [pool.get() for _ in range(10)]
    assert_equal(listener.stats['created']['success'], 10)
    assert_equal(listener.stats['created']['failure'], 0)
    assert_equal(listener.stats['checked_out'], 10)
    assert_equal(listener.stats['opened'], {'current': 10, 'max': 10})

    # Pool is maxed out now
    assert_raises(NoConnectionAvailable, pool.get)
    assert_equal(listener.stats['created']['success'], 10)
    assert_equal(listener.stats['checked_out'], 10)
    assert_equal(listener.stats['opened'], {'current': 10, 'max': 10})
    assert_equal(listener.stats['at_max'], 1)

    # Hand back the first five: nothing is expected to be disposed yet.
    for conn in conns[:5]:
        pool.return_conn(conn)
    assert_equal(listener.stats['disposed']['success'], 0)
    assert_equal(listener.stats['checked_in'], 5)
    assert_equal(listener.stats['opened'], {'current': 5, 'max': 10})

    # Hand back the other five: five disposals are expected.
    for conn in conns[5:10]:
        pool.return_conn(conn)
    assert_equal(listener.stats['disposed']['success'], 5)
    assert_equal(listener.stats['checked_in'], 10)

    # These connections should come from the pool
    conns = [pool.get() for _ in range(5)]
    assert_equal(listener.stats['created']['success'], 10)
    assert_equal(listener.stats['checked_out'], 15)

    # But these will need to be made
    conns.extend(pool.get() for _ in range(5))
    assert_equal(listener.stats['created']['success'], 15)
    assert_equal(listener.stats['checked_out'], 20)
    assert_equal(listener.stats['disposed']['success'], 5)

    for conn in conns:
        conn.return_to_pool()
    assert_equal(listener.stats['checked_in'], 20)
    assert_equal(listener.stats['disposed']['success'], 10)

    # Returning a connection a second time must be rejected and must not
    # change the counters.
    assert_raises(InvalidRequestError, conns[0].return_to_pool)
    assert_equal(listener.stats['checked_in'], 20)
    assert_equal(listener.stats['disposed']['success'], 10)

    print("in test: %s" % id(conns[-1]))
    assert_raises(InvalidRequestError, conns[-1].return_to_pool)
    assert_equal(listener.stats['checked_in'], 20)
    assert_equal(listener.stats['disposed']['success'], 10)

    pool.dispose()
# Copy every metric row from MySQL into the Cassandra 'metrics' column
# family.  Both connections are released even when the copy fails part-way.
try:
    # connect to cassandra
    pool = ConnectionPool('metrink')
    try:
        # get the column family
        col_fam = ColumnFamily(pool, 'metrics')

        # A cursor object is needed to execute queries.
        cur = db.cursor()

        # Selected columns, in order:
        # company, client, device, groupName, name, time_stamp, value
        cur.execute(
            'select company, client, device, groupName, name, time_stamp, value from metrics join metrics_devices on metrics.device_id = metrics_devices.device_id join metrics_groups on metrics.group_id = metrics_groups.group_id join metrics_names on metrics.name_id = metrics_names.name_id join metrics_owners on metrics.ownerId = metrics_owners.ownerId'
        )

        for row in cur.fetchall():
            # time_stamp is divided by 1000 before conversion -- presumably
            # it is stored in milliseconds; verify against the schema.
            stamp = datetime.datetime.fromtimestamp(row[5] // 1000)
            time_str = stamp.strftime("%Y%m")
            # Row key layout: company:client:YYYYMM:device:groupName:name
            row_key = ":".join(
                str(part)
                for part in (row[0], row[1], time_str, row[2], row[3], row[4]))
            print(row_key)
            # Column name is the original time_stamp, column value the metric.
            col_fam.insert(row_key, {row[5]: row[6]})
    finally:
        # close our cassandra connection
        pool.dispose()
finally:
    # close our connection to mysql
    db.close()
def test_pool(self):
    # Walks a ConnectionPool (pool_size=5, max_overflow=5) through checkout,
    # exhaustion, check-in and re-checkout, asserting the counters that the
    # StatsLogger listener records after each step.
    listener = StatsLogger()
    pool = ConnectionPool(
        keyspace='PycassaTestKeyspace',
        credentials=_credentials,
        pool_size=5,
        max_overflow=5,
        recycle=10000,
        prefill=True,
        pool_timeout=0.1,
        timeout=1,
        listeners=[listener],
        use_threadlocal=False)

    # Check out ten connections.
    conns = [pool.get() for _ in range(10)]
    assert_equal(listener.stats['created']['success'], 10)
    assert_equal(listener.stats['created']['failure'], 0)
    assert_equal(listener.stats['checked_out'], 10)
    assert_equal(listener.stats['opened'], {'current': 10, 'max': 10})

    # Pool is maxed out now
    assert_raises(NoConnectionAvailable, pool.get)
    assert_equal(listener.stats['created']['success'], 10)
    assert_equal(listener.stats['checked_out'], 10)
    assert_equal(listener.stats['opened'], {'current': 10, 'max': 10})
    assert_equal(listener.stats['at_max'], 1)

    # Hand back the first five: nothing is expected to be disposed yet.
    for conn in conns[:5]:
        pool.return_conn(conn)
    assert_equal(listener.stats['disposed']['success'], 0)
    assert_equal(listener.stats['checked_in'], 5)
    assert_equal(listener.stats['opened'], {'current': 5, 'max': 10})

    # Hand back the other five: five disposals are expected.
    for conn in conns[5:10]:
        pool.return_conn(conn)
    assert_equal(listener.stats['disposed']['success'], 5)
    assert_equal(listener.stats['checked_in'], 10)

    # These connections should come from the pool
    conns = [pool.get() for _ in range(5)]
    assert_equal(listener.stats['created']['success'], 10)
    assert_equal(listener.stats['checked_out'], 15)

    # But these will need to be made
    conns.extend(pool.get() for _ in range(5))
    assert_equal(listener.stats['created']['success'], 15)
    assert_equal(listener.stats['checked_out'], 20)
    assert_equal(listener.stats['disposed']['success'], 5)

    for conn in conns:
        conn.return_to_pool()
    assert_equal(listener.stats['checked_in'], 20)
    assert_equal(listener.stats['disposed']['success'], 10)

    # Returning a connection a second time must be rejected and must not
    # change the counters.
    assert_raises(InvalidRequestError, conns[0].return_to_pool)
    assert_equal(listener.stats['checked_in'], 20)
    assert_equal(listener.stats['disposed']['success'], 10)

    print("in test:", id(conns[-1]))
    conns[-1].return_to_pool()
    assert_equal(listener.stats['checked_in'], 20)
    assert_equal(listener.stats['disposed']['success'], 10)

    pool.dispose()
def query2(user_id):
    """Print the tweets of the followers of ``user_id``.

    Runs ``query`` against the 'followsTweets' column family of the
    'tuitterdb' keyspace.
    """
    pool = ConnectionPool('tuitterdb')
    try:
        followsTweets_family = ColumnFamily(pool, 'followsTweets')
        query(followsTweets_family, user_id)
    finally:
        # Release the connections even when the query raises.
        pool.dispose()
def query1(user_id):
    """Print the tweets of ``user_id``.

    Runs ``query`` against the 'userTweets' column family of the
    'tuitterdb' keyspace.
    """
    pool = ConnectionPool('tuitterdb')
    try:
        userTweets_family = ColumnFamily(pool, 'userTweets')
        query(userTweets_family, user_id)
    finally:
        # Release the connections even when the query raises.
        pool.dispose()
def _update_rollup(pool, cf_name, state, in_same_window, metric, timestamp,
                   value):
    """Fold ``value`` into the running average kept for ``metric`` in ``state``.

    state          -- in-memory dict keyed by metric; each entry holds
                      'avg', 'counter' and 'timestamp'.
    in_same_window -- predicate(timestamp, previous_timestamp) telling
                      whether both fall into the same rollup window.

    While samples stay in the same window only the in-memory average is
    updated.  When a new window starts, the finished average is written to
    the column family ``cf_name`` and the in-memory entry is restarted.
    """
    entry = state[metric]
    if entry['timestamp'] == 0:
        # First sample ever seen for this metric.
        entry['avg'] = value
        entry['counter'] = 1
    elif in_same_window(timestamp, entry['timestamp']):
        entry['avg'] = caculate(entry['avg'], entry['counter'], value)
        entry['counter'] += 1
    else:
        col_fam = pycassa.ColumnFamily(pool, cf_name)
        # The state dict is keyed by metric, so the flushed average must be
        # read from the metric's entry (it used to be looked up by row key).
        # NOTE(review): the row key used here is the bare metric name, while
        # 'rawdata' rows use generate_key(...) -- confirm which key the
        # readers of the rollup column families expect.
        col_fam.insert(metric, {entry['timestamp']: entry['avg']})
        entry['avg'] = value
        entry['counter'] = 1
    entry['timestamp'] = timestamp


def write(metric, timestamp, value, tags, ds_type):
    """Store one sample in 'rawdata' and update the rollup averages.

    The value is first passed through ``normalize_value``; if that raises
    ValueError the sample is dropped silently.  The raw value is inserted
    under ``generate_key(metric, timestamp // upertime_interval, tags)`` and
    the running averages for rollups60, rollups300, rollups7200 and
    rollups86400 are then updated.
    """
    try:
        value = normalize_value(metric, tags, value, timestamp, ds_type)
    except ValueError:
        return

    upertime = timestamp // upertime_interval
    # Get the key from the database; if some id does not exist a new one is
    # created.
    key = generate_key(metric, upertime, tags)

    # A single pool serves the whole call (a second pool used to be created
    # here and the first one was never disposed).
    pool = ConnectionPool(keyspace, [address])
    try:
        # save to rawdata
        col_fam_rawdata = pycassa.ColumnFamily(pool, 'rawdata')
        col_fam_rawdata.insert(key, {timestamp: value})

        # Save to the rollups: within the same window only the memory is
        # updated; on a new window the old value is written to cassandra.
        rollups = (
            ('rollups60', dictAvg60, inOneMinute),
            ('rollups300', dictAvg300, inFiveMinutes),
            ('rollups7200', dictAvg7200, inTwoHours),
            ('rollups86400', dictAvg86400, inOneDay),
        )
        for cf_name, state, in_same_window in rollups:
            _update_rollup(pool, cf_name, state, in_same_window, metric,
                           timestamp, value)
    finally:
        pool.dispose()
class Cassa(object):
    '''
    Provides a simple key=value functionality built on a cassandra
    table with a key and a single column.  Used in ZookeeperTaskQueue
    to replace the two tables that grow in size with the number of
    tasks rather than the number of workers.
    '''

    def __init__(self, namespace, server_list=['localhost:9160']):
        '''
        Connect to the keyspace ``namespace`` on ``server_list``,
        creating the keyspace and the column families 'tasks',
        'available', 'task_count' and 'available_count' when they do
        not exist yet.
        '''
        # Copy the list so an instance never aliases the shared default
        # argument (or a list the caller keeps mutating).
        self.server_list = list(server_list)
        self.namespace = namespace
        self._closed = False

        # Connect to the server creating the namespace if it doesn't
        # already exist
        try:
            self.pool = ConnectionPool(namespace, self.server_list,
                                       max_retries=500, pool_timeout=600,
                                       timeout=10)
        except pycassa.InvalidRequestException:
            self._create_namespace(namespace)
            self.pool = ConnectionPool(namespace, self.server_list,
                                       max_retries=500, pool_timeout=600,
                                       timeout=10)

        try:
            self._tasks = pycassa.ColumnFamily(self.pool, 'tasks')
        except pycassa.NotFoundException:
            self._create_column_family('tasks',
                                       key_validation_class=ASCII_TYPE,
                                       bytes_columns=['task_data'])
            self._tasks = pycassa.ColumnFamily(self.pool, 'tasks')

        try:
            self._available = pycassa.ColumnFamily(self.pool, 'available')
        except pycassa.NotFoundException:
            self._create_column_family('available',
                                       key_validation_class=ASCII_TYPE,
                                       bytes_columns=['available'])
            self._available = pycassa.ColumnFamily(self.pool, 'available')

        try:
            self._task_count = pycassa.ColumnFamily(self.pool, 'task_count')
        except pycassa.NotFoundException:
            self._create_counter_column_family(
                'task_count',
                key_validation_class=ASCII_TYPE,
                counter_columns=['task_count'])
            self._task_count = pycassa.ColumnFamily(self.pool, 'task_count')
            # Start the freshly created counter row at zero.
            self._task_count.insert('RowKey', {'task_count': 0})

        try:
            self._available_count = pycassa.ColumnFamily(
                self.pool, 'available_count')
        except pycassa.NotFoundException:
            self._create_counter_column_family(
                'available_count',
                key_validation_class=ASCII_TYPE,
                counter_columns=['available_count'])
            self._available_count = pycassa.ColumnFamily(
                self.pool, 'available_count')
            # Start the freshly created counter row at zero.
            self._available_count.insert('RowKey', {'available_count': 0})

    def delete_namespace(self):
        ''' Drop the whole keyspace of this instance. '''
        sm = SystemManager(random.choice(self.server_list))
        try:
            sm.drop_keyspace(self.namespace)
        finally:
            sm.close()

    def _create_namespace(self, namespace):
        ''' Create the keyspace ``namespace`` with replication factor 1. '''
        sm = SystemManager(random.choice(self.server_list))
        try:
            sm.create_keyspace(namespace, SIMPLE_STRATEGY,
                               {'replication_factor': '1'})
        finally:
            sm.close()

    def _create_column_family(self, family, bytes_columns=[],
                              key_validation_class=TIME_UUID_TYPE):
        '''
        Creates a column family of the name 'family' and sets any of
        the names in the bytes_column list to have the BYTES_TYPE.

        key_validation_class defaults to TIME_UUID_TYPE and could also
        be ASCII_TYPE for md5 hash keys, like we use for 'inbound'
        '''
        sm = SystemManager(random.choice(self.server_list))
        try:
            sm.create_column_family(self.namespace, family, super=False,
                                    key_validation_class=key_validation_class,
                                    default_validation_class=TIME_UUID_TYPE,
                                    column_name_class=ASCII_TYPE)
            for column in bytes_columns:
                sm.alter_column(self.namespace, family, column, BYTES_TYPE)
        finally:
            sm.close()

    def _create_counter_column_family(self, family, counter_columns=[],
                                      key_validation_class=UTF8Type):
        '''
        Creates a counter column family of the name 'family' and sets
        any of the names in the counter_columns list to have the
        COUNTER_COLUMN_TYPE.

        key_validation_class defaults to UTF8Type.
        '''
        sm = SystemManager(random.choice(self.server_list))
        try:
            sm.create_column_family(self.namespace, family, super=False,
                                    key_validation_class=key_validation_class,
                                    default_validation_class="CounterColumnType",
                                    column_name_class=ASCII_TYPE)
            for column in counter_columns:
                sm.alter_column(self.namespace, family, column,
                                COUNTER_COLUMN_TYPE)
        finally:
            sm.close()

    def tasks(self, key_prefix=''):
        '''
        generate the data objects for every task whose key starts with
        key_prefix; each object gets its row key under 'task_key'
        '''
        for row in self._tasks.get_range():
            logger.debug(row)
            if not row[0].startswith(key_prefix):
                continue
            data = json.loads(row[1]['task_data'])
            data['task_key'] = row[0]
            yield data

    def put_task(self, key, task_data):
        '''
        Store task_data (JSON-encoded) under key, incrementing the task
        counter when the key is new.  Returns True if the key already
        existed, False otherwise.
        '''
        try:
            self._tasks.get(key, column_count=1)
            exists = True
        except pycassa.cassandra.ttypes.NotFoundException:
            exists = False

        self._tasks.insert(key, {'task_data': json.dumps(task_data)})
        if not exists:
            self._task_count.insert('RowKey', {'task_count': 1})
        return exists

    def get_task(self, key):
        ''' Return the decoded task data stored under key. '''
        data = self._tasks.get(key)
        return json.loads(data['task_data'])

    def pop_task(self, key):
        ''' Remove the task row, decrement the task counter, return key. '''
        self._tasks.remove(key)
        self._task_count.insert('RowKey', {'task_count': -1})
        return key

    @property
    def task_keys(self):
        ''' generate the row keys of the tasks table '''
        for key, _ in self._tasks.get_range(column_count=0,
                                            filter_empty=False):
            yield key

    def num_tasks(self):
        ''' Return the value of the task counter. '''
        data = self._task_count.get('RowKey')
        return data['task_count']

    def num_available(self):
        ''' Return the value of the available counter. '''
        data = self._available_count.get('RowKey')
        return data['available_count']

    def put_available(self, key):
        '''
        Mark key as available; the available counter is incremented
        only when the key was not already present.
        '''
        ## closest thing to storing only the key
        try:
            self._available.get(key, column_count=1)
            exists = True
        except pycassa.cassandra.ttypes.NotFoundException:
            exists = False

        if not exists:
            self._available.insert(key, {'available': ''})
            self._available_count.insert('RowKey', {'available_count': 1})

    def get_random_available(self, max_iter=10000):
        '''
        get a random key out of the first max_iter rows; returns None
        when there are no rows
        '''
        keeper = None

        ## note the ConsistencyLevel here.  If we do not do this, and
        ## get all slick with things like column_count=0 and filter
        ## empty False, then we can get keys that were recently
        ## deleted... EVEN if the default consistency would seem to
        ## rule that out!  Why we need ConsistencyLevel.ALL on a single
        ## node is not clear, but experience indicates it is needed.

        ## A random start key (so that all workers do not hit the same
        ## place in the key range) was tried and dropped: using a random
        ## start key OR using row_count=1 can cause get_range to hang
        ## for hours.

        ## note that putting a finite row_count is problematic in two
        ## ways:
        # 1) if there are more workers than max_iter, some will not
        # get tasks
        #
        # 2) if there are more than max_iter records, then all workers
        # have to wade through all of these just to get a task!  What
        # we really want is a "pick random row" function, and that is
        # probably best implemented using CQL3 token function via the
        # cql python module instead of pycassa...
        rows = self._available.get_range(
            row_count=max_iter,
            read_consistency_level=pycassa.ConsistencyLevel.ALL)
        for c, row in enumerate(rows, 1):
            logger.debug('considering %r' % (row, ))
            # Keep the c-th row with probability 1/c.  The division must be
            # a float division: with integer operands Python 2 evaluates
            # 1 / c to 0 for every c > 1, which always kept the first row.
            if random.random() < 1.0 / c:
                keeper = row[0]
            if c == max_iter:
                break
        return keeper

    def in_available(self, key):
        ''' Return True if key has a row in the available table. '''
        try:
            self._available.get(key)
            return True
        except pycassa.NotFoundException:
            return False

    def pop_available(self, key):
        '''
        Remove key from the available table (write consistency ALL),
        decrement the available counter and return key.
        '''
        self._available.remove(
            key, write_consistency_level=pycassa.ConsistencyLevel.ALL)
        self._available_count.insert('RowKey', {'available_count': -1})
        assert not self.in_available(key)
        return key

    def close(self):
        ''' Mark the instance closed and dispose of the pool, if any. '''
        self._closed = True
        # __init__ may have failed before the pool was assigned.
        if hasattr(self, 'pool'):
            self.pool.dispose()
replicate_on_write='true' AND compression={'sstable_compression': 'SnappyCompressor'}; ''' #UpdateValue(pool,testcf[0],'172.16.40.147',{('apache','port'):'8080',('apache','docbase'):'/var/www',('cassandra','ver'):'1.2.0'}) # UpdateValue(pool,testcf[0],'172.16.40.147',{('apache','port'):'8080'}) # UpdateValue(pool,testcf[0],'172.16.40.145',{('apache','port'):'8080'}) # UpdateValue(pool,testcf[0],'172.16.40.146',{('apache','port'):'8080'}) # UpdateValue(pool,testcf[0],'172.16.40.147',{('apache','docbase'):'/var/www'}) # UpdateValue(pool,testcf[0],'172.16.40.145',{('apache','docbase'):'/var/www'}) # UpdateValue(pool,testcf[0],'172.16.40.146',{('apache','docbase'):'/var/www'}) # UpdateValue(pool,testcf[0],'172.16.40.145',{('cassandra','ver'):'1.2.0'}) # UpdateValue(pool,testcf[0],'172.16.40.146',{('cassandra','ver'):'1.2.0'}) # UpdateValue(pool,testcf[0],'172.16.40.145',{('cassandra','seeds'):'172.16.40.145,172.16.40.147'}) # UpdateValue(pool,testcf[0],'172.16.40.146',{('cassandra','seeds'):'172.16.40.145,172.16.40.147'}) # UpdateValue(pool,testcf[0],'172.16.40.145',{('apache333','zzz'):'empty'}) # s = GetValue(pool,testcf[0],('172.16.40.145'), column_start = ('apache','ver'), column_finish = ('cassandra',)) #s = GetValue(pool,'jobstatus',('wait'), column_start = ('GANGLIA_CHECK_20130104133051_1',''), column_finish = ('GANGLIA_CHECK_20130104133051_1',)) s = GetValue(pool,'nodesoft', ('172.16.40.149_cent_yum'), column_start = ('ganglia',''), column_finish = ('ganglia',)) #s = Remove(pool,'jobstatus',('analysised'),[('GANGLIA_CHECK_20121231175157_1','2013-01-04 09:27:58.996000'),]) #s = Remove(pool,'jobstatus',('analysised'),None) print(s) pool.dispose()
class Cassa(object):
    '''
    Provides a simple key=value functionality built on a cassandra
    table with a key and a single column.  Used in ZookeeperTaskQueue
    to replace the two tables that grow in size with the number of
    tasks rather than the number of workers.
    '''

    def __init__(self, namespace, server_list=['localhost:9160']):
        '''
        Connect to the keyspace ``namespace`` on ``server_list``,
        creating the keyspace and the column families 'tasks',
        'available', 'task_count' and 'available_count' when they do
        not exist yet.
        '''
        # Copy the list so an instance never aliases the shared default
        # argument (or a list the caller keeps mutating).
        self.server_list = list(server_list)
        self.namespace = namespace
        self._closed = False

        # Connect to the server creating the namespace if it doesn't
        # already exist
        try:
            self.pool = ConnectionPool(namespace, self.server_list,
                                       max_retries=500, pool_timeout=600,
                                       timeout=10)
        except pycassa.InvalidRequestException:
            self._create_namespace(namespace)
            self.pool = ConnectionPool(namespace, self.server_list,
                                       max_retries=500, pool_timeout=600,
                                       timeout=10)

        try:
            self._tasks = pycassa.ColumnFamily(self.pool, 'tasks')
        except pycassa.NotFoundException:
            self._create_column_family('tasks',
                                       key_validation_class=ASCII_TYPE,
                                       bytes_columns=['task_data'])
            self._tasks = pycassa.ColumnFamily(self.pool, 'tasks')

        try:
            self._available = pycassa.ColumnFamily(self.pool, 'available')
        except pycassa.NotFoundException:
            self._create_column_family('available',
                                       key_validation_class=ASCII_TYPE,
                                       bytes_columns=['available'])
            self._available = pycassa.ColumnFamily(self.pool, 'available')

        try:
            self._task_count = pycassa.ColumnFamily(self.pool, 'task_count')
        except pycassa.NotFoundException:
            self._create_counter_column_family(
                'task_count',
                key_validation_class=ASCII_TYPE,
                counter_columns=['task_count'])
            self._task_count = pycassa.ColumnFamily(self.pool, 'task_count')
            # Start the freshly created counter row at zero.
            self._task_count.insert('RowKey', {'task_count': 0})

        try:
            self._available_count = pycassa.ColumnFamily(
                self.pool, 'available_count')
        except pycassa.NotFoundException:
            self._create_counter_column_family(
                'available_count',
                key_validation_class=ASCII_TYPE,
                counter_columns=['available_count'])
            self._available_count = pycassa.ColumnFamily(
                self.pool, 'available_count')
            # Start the freshly created counter row at zero.
            self._available_count.insert('RowKey', {'available_count': 0})

    def delete_namespace(self):
        ''' Drop the whole keyspace of this instance. '''
        sm = SystemManager(random.choice(self.server_list))
        try:
            sm.drop_keyspace(self.namespace)
        finally:
            sm.close()

    def _create_namespace(self, namespace):
        ''' Create the keyspace ``namespace`` with replication factor 1. '''
        sm = SystemManager(random.choice(self.server_list))
        try:
            sm.create_keyspace(namespace, SIMPLE_STRATEGY,
                               {'replication_factor': '1'})
        finally:
            sm.close()

    def _create_column_family(self, family, bytes_columns=[],
                              key_validation_class=TIME_UUID_TYPE):
        '''
        Creates a column family of the name 'family' and sets any of
        the names in the bytes_column list to have the BYTES_TYPE.

        key_validation_class defaults to TIME_UUID_TYPE and could also
        be ASCII_TYPE for md5 hash keys, like we use for 'inbound'
        '''
        sm = SystemManager(random.choice(self.server_list))
        try:
            sm.create_column_family(self.namespace, family, super=False,
                                    key_validation_class=key_validation_class,
                                    default_validation_class=TIME_UUID_TYPE,
                                    column_name_class=ASCII_TYPE)
            for column in bytes_columns:
                sm.alter_column(self.namespace, family, column, BYTES_TYPE)
        finally:
            sm.close()

    def _create_counter_column_family(self, family, counter_columns=[],
                                      key_validation_class=UTF8Type):
        '''
        Creates a counter column family of the name 'family' and sets
        any of the names in the counter_columns list to have the
        COUNTER_COLUMN_TYPE.

        key_validation_class defaults to UTF8Type.
        '''
        sm = SystemManager(random.choice(self.server_list))
        try:
            sm.create_column_family(self.namespace, family, super=False,
                                    key_validation_class=key_validation_class,
                                    default_validation_class="CounterColumnType",
                                    column_name_class=ASCII_TYPE)
            for column in counter_columns:
                sm.alter_column(self.namespace, family, column,
                                COUNTER_COLUMN_TYPE)
        finally:
            sm.close()

    def tasks(self, key_prefix=''):
        '''
        generate the data objects for every task whose key starts with
        key_prefix; each object gets its row key under 'task_key'
        '''
        for row in self._tasks.get_range():
            logger.debug(row)
            if not row[0].startswith(key_prefix):
                continue
            data = json.loads(row[1]['task_data'])
            data['task_key'] = row[0]
            yield data

    def put_task(self, key, task_data):
        '''
        Store task_data (JSON-encoded) under key, incrementing the task
        counter when the key is new.  Returns True if the key already
        existed, False otherwise.
        '''
        try:
            self._tasks.get(key, column_count=1)
            exists = True
        except pycassa.cassandra.ttypes.NotFoundException:
            exists = False

        self._tasks.insert(key, {'task_data': json.dumps(task_data)})
        if not exists:
            self._task_count.insert('RowKey', {'task_count': 1})
        return exists

    def get_task(self, key):
        ''' Return the decoded task data stored under key. '''
        data = self._tasks.get(key)
        return json.loads(data['task_data'])

    def pop_task(self, key):
        ''' Remove the task row, decrement the task counter, return key. '''
        self._tasks.remove(key)
        self._task_count.insert('RowKey', {'task_count': -1})
        return key

    @property
    def task_keys(self):
        ''' generate the row keys of the tasks table '''
        for key, _ in self._tasks.get_range(column_count=0,
                                            filter_empty=False):
            yield key

    def num_tasks(self):
        ''' Return the value of the task counter. '''
        data = self._task_count.get('RowKey')
        return data['task_count']

    def num_available(self):
        ''' Return the value of the available counter. '''
        data = self._available_count.get('RowKey')
        return data['available_count']

    def put_available(self, key):
        '''
        Mark key as available; the available counter is incremented
        only when the key was not already present.
        '''
        ## closest thing to storing only the key
        try:
            self._available.get(key, column_count=1)
            exists = True
        except pycassa.cassandra.ttypes.NotFoundException:
            exists = False

        if not exists:
            self._available.insert(key, {'available': ''})
            self._available_count.insert('RowKey', {'available_count': 1})

    def get_random_available(self, max_iter=10000):
        '''
        get a random key out of the first max_iter rows; returns None
        when there are no rows
        '''
        keeper = None

        ## note the ConsistencyLevel here.  If we do not do this, and
        ## get all slick with things like column_count=0 and filter
        ## empty False, then we can get keys that were recently
        ## deleted... EVEN if the default consistency would seem to
        ## rule that out!  Why we need ConsistencyLevel.ALL on a single
        ## node is not clear, but experience indicates it is needed.

        ## A random start key (so that all workers do not hit the same
        ## place in the key range) was tried and dropped: using a random
        ## start key OR using row_count=1 can cause get_range to hang
        ## for hours.

        ## note that putting a finite row_count is problematic in two
        ## ways:
        # 1) if there are more workers than max_iter, some will not
        # get tasks
        #
        # 2) if there are more than max_iter records, then all workers
        # have to wade through all of these just to get a task!  What
        # we really want is a "pick random row" function, and that is
        # probably best implemented using CQL3 token function via the
        # cql python module instead of pycassa...
        rows = self._available.get_range(
            row_count=max_iter,
            read_consistency_level=pycassa.ConsistencyLevel.ALL)
        for c, row in enumerate(rows, 1):
            logger.debug('considering %r' % (row,))
            # Keep the c-th row with probability 1/c.  The division must be
            # a float division: with integer operands Python 2 evaluates
            # 1 / c to 0 for every c > 1, which always kept the first row.
            if random.random() < 1.0 / c:
                keeper = row[0]
            if c == max_iter:
                break
        return keeper

    def in_available(self, key):
        ''' Return True if key has a row in the available table. '''
        try:
            self._available.get(key)
            return True
        except pycassa.NotFoundException:
            return False

    def pop_available(self, key):
        '''
        Remove key from the available table (write consistency ALL),
        decrement the available counter and return key.
        '''
        self._available.remove(
            key, write_consistency_level=pycassa.ConsistencyLevel.ALL)
        self._available_count.insert('RowKey', {'available_count': -1})
        assert not self.in_available(key)
        return key

    def close(self):
        ''' Mark the instance closed and dispose of the pool, if any. '''
        self._closed = True
        # __init__ may have failed before the pool was assigned.
        if hasattr(self, 'pool'):
            self.pool.dispose()