Example #1
def read(metric, start_time, end_time, tags):
    pool = ConnectionPool(keyspace, [address])
#    decide which column family to read based on time difference
    if timeDiff(start_time, end_time) <= 3600:
        col_fam = pycassa.ColumnFamily(pool, 'rawdata')
    elif timeDiff(start_time, end_time) <= 7200:
        col_fam = pycassa.ColumnFamily(pool, 'rollups60')
    elif timeDiff(start_time, end_time) <= 86400:
        col_fam = pycassa.ColumnFamily(pool, 'rollups300')
    elif timeDiff(start_time, end_time) <= 2592000:
        col_fam = pycassa.ColumnFamily(pool, 'rollups7200')
    else:
        col_fam = pycassa.ColumnFamily(pool, 'rollups86400') 
        
#  change start_time and end_time to upper timestamps
    start_upertime = start_time/upertime_interval
    end_upertime = end_time/upertime_interval
    points = {}
    for i in range(start_upertime, end_upertime + 1):
        key = generate_key(metric, i, tags)
        try:
            # accumulate points from every row key instead of overwriting them
            points.update(col_fam.get(key, column_start=start_time, column_finish=end_time))
        except pycassa.NotFoundException:
            continue  # no data stored under this row key for the requested range
    pool.dispose()
    return points
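
A minimal usage sketch for the read() helper above (the metric name and tags are hypothetical, and the module-level keyspace, address and upertime_interval globals it relies on are assumed to be configured); a 30-minute window keeps the lookup on the 'rawdata' column family:

import time

end = int(time.time())
points = read('cpu.load', end - 1800, end, {'host': 'web01'})
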
def main(filename):
    data = open(filename)
    # Set up the connection pool
    pool = ConnectionPool('tuitterdb',['localhost:9160'])
    
    # CF connections
    user_family = ColumnFamily(pool, 'user')
    tweet_family = ColumnFamily(pool, 'tweet')
    user_tweets_family = ColumnFamily(pool, 'userTweets')
    followers = ColumnFamily(pool, 'followers')
    followerTweets = ColumnFamily(pool, 'followsTweets')

    # Batch Definitions
    user_batch = user_family.batch(queue_size=1000)
    followers_batch = followers.batch(queue_size=500)
    user_tweets_batch = user_tweets_family.batch(queue_size = 500)
    followerTweets_batch = followerTweets.batch(queue_size = 500)
    
                           
    while True: # loop
        line = data.readline()
        if line == "": break # This isn't handled properly
        else:
            tweet = tweet_get(line)
            try:
                tweet_data = get_tweet_data(tweet)
                sender = get_sender(tweet)  # needed below even when the user already exists
                if check_user(tweet[u"from_user_id_str"]) == False: # check in script if user is there.
                    user_batch.insert(sender.user_id,{'user_name':sender.user_name, 'screen_name':sender.from_user}) # create user entry for sender
                    user_tweets_batch.insert(sender.user_id,{line:''}) # insert the whole tweet into a userTweets column header

                if tweet[u"to_user"] is not None and check_user(tweet[u"to_user_id"]) == False:
                    to_user = get_to_user(tweet)
                    user_batch.insert(to_user.user_id,{'user_name':to_user.user_name, 'screen_name':to_user.from_user})
                    followers_batch.insert(to_user.user_id,{sender.user_id:'follower_id'}) 
                    followerTweets_batch.insert(to_user.user_id,{line:''}) # insert the whole tweet into a followsTweets column header for the to_user.


                if u"entities" in tweet: # iterate over the users in mentions and add them to users and follows if necessary
                    if tweet[u"entities"][u"user_mentions"] != []:
                        user_mentions = get_mentions(tweet)
                        for obj in user_mentions:
                            if check_user(obj.user_id) == False:
                                user_batch.insert(obj.user_id,{'user_name':obj.user_name, 'screen_name':obj.from_user})
                            followers_batch.insert(obj.user_id,{'user_id':sender.user_id})
                            followerTweets_batch.insert(obj.user_id,{line:''}) # insert the whole tweet to a followerTweet entry for the mentioned user
                else:
                    continue

                tweet_family.insert(tweet_data.tweet_id,{'text':tweet_data.textbody,'user_id':sender.user_id,'timeanddate':tweet_data.timestamp})

            except Exception:
                err = sys.exc_info()
                print "Broken cos %s %s %s" % (err[0],err[1], traceback.print_tb(err[2])) #print the exception data with traceback and continue.
                continue

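    # (Suggested addition, not in the original script) pycassa batch mutators only
    # auto-flush once queue_size mutations are queued, so anything still queued at
    # end of file is sent explicitly before the pool is disposed.
    user_batch.send()
    followers_batch.send()
    user_tweets_batch.send()
    followerTweets_batch.send()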
    # Pools Closed.
    pool.dispose()
Example #3
def main(filename):
    data = open(filename)
    # Set up the connection pool
    pool = ConnectionPool('tuitterdb',['localhost:9160'])
    # CF connections
    user_family = ColumnFamily(pool, 'user')
    tweet_family = ColumnFamily(pool, 'tweet')
    user_tweets_family = ColumnFamily(pool, 'userTweets')
    #follows_tweets_family = ColumnFamily(pool, 'follows.tweets')
    followers = ColumnFamily(pool, 'followers')

    # Batch Definitions
    user_batch = user_family.batch(queue_size=1000)
    followers_batch = followers.batch(queue_size=500)
    user_tweets_batch = user_tweets_family.batch(queue_size = 500)
    
                           
    while True:
        line = data.readline()
        if line == "": break  # readline() returns "" at end of file, never None
        else:
            tweet = tweet_get(line)
            try:
                tweet_data = get_tweet_data(tweet)
                sender = get_sender(tweet)  # needed below even when the user already exists
                if check_user(tweet[u"from_user_id_str"]) == False:
                    user_batch.insert(sender.user_id,{'user_name':sender.user_name, 'screen_name':sender.from_user})
                    user_tweets_batch.insert(sender.user_id,{tweet_data.tweet_id:tweet_data.timestamp})

                if tweet[u"to_user"] is not None and check_user(tweet[u"to_user_id"]) == False:
                    to_user = get_to_user(tweet)
                    user_batch.insert(to_user.user_id,{'user_name':to_user.user_name, 'screen_name':to_user.from_user})
                    followers_batch.insert(to_user.user_id,{'user_id':sender.user_id})


                if u"entities" in tweet:
                    if tweet[u"entities"][u"user_mentions"] != []:
                        user_mentions = get_mentions(tweet)
                        for obj in user_mentions:
                            user_batch.insert(obj.user_id,{'user_name':obj.user_name, 'screen_name':obj.from_user})
                            followers_batch.insert(obj.user_id,{'user_id':sender.user_id})
                else:
                    continue

                tweet_family.insert(tweet_data.tweet_id,{'text':tweet_data.textbody,'user_id':sender.user_id,'timeanddate':tweet_data.timestamp})

            except Exception:
                err = sys.exc_info()
                print "Broken cos %s %s %s" % (err[0],err[1], traceback.print_tb(err[2]))
                continue

    # Pools Closed.
    pool.dispose()

#if __name__ == "__main__":
    #unittest.main()
Example #4
def get_values(servlst, ks, cf, key):
    #print key
    # create the pool outside the try block so dispose() in finally always has it
    pool = ConnectionPool(ks, servlst)
    try:
        cf_handle = ColumnFamily(pool, cf)
        result = cf_handle.get(key).values()
    except pycassa.NotFoundException:
        print "[ERROR] " + key + " not found"
        result = ""
    except Exception as err:
        print "[ERROR] " + str(err)
        exit(-1)
    finally:
        pool.dispose()

    return result
def write(vl, data=None):
#   get a connection from the pycassa connection pool;
#   the keyspace 'Monitor' holds the monitoring data
    pool = ConnectionPool('Monitor', ['localhost:9160'])
#   column families: rawdata, rollups60, rollups300, rollups7200, rollups86400
    col_fam_rawdata = pycassa.ColumnFamily(pool, 'rawdata')

#   partition the row key by time, using one row per month
    timeString = time.strftime("%Y-%m", time.localtime(vl.time))
    key = [str(vl.host), str(vl.plugin), str(vl.plugin_instance), str(vl.type), str(vl.type_instance), timeString]
    keyString = "#".join(key)

#   insert each value into rawdata and, as a test, also append it to a file
    with open('/tmp/workfile', 'a') as f:
        for i in vl.values:
            col_fam_rawdata.insert(keyString, {vl.time: i})
            f.write(keyString + " " + str(vl.time) + " " + str(i) + "\n")

    pool.dispose()
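
For reference, a quick sketch of the row key produced by the month-based partitioning above (all values are hypothetical):

# collectd value identifiers joined with '#', plus a per-month bucket suffix
key = "#".join(["web01", "cpu", "0", "cpu", "idle", "2013-05"])
# -> 'web01#cpu#0#cpu#idle#2013-05'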
Example #6
    def test_pool(self):
        listener = StatsLogger()
        pool = ConnectionPool(pool_size=5,
                              max_overflow=5,
                              recycle=10000,
                              prefill=True,
                              pool_timeout=0.1,
                              timeout=1,
                              keyspace='PycassaTestKeyspace',
                              credentials=_credentials,
                              listeners=[listener],
                              use_threadlocal=False)
        conns = []
        for i in range(10):
            conns.append(pool.get())
        assert_equal(listener.stats['created']['success'], 10)
        assert_equal(listener.stats['created']['failure'], 0)
        assert_equal(listener.stats['checked_out'], 10)
        assert_equal(listener.stats['opened'], {'current': 10, 'max': 10})

        # Pool is maxed out now
        assert_raises(NoConnectionAvailable, pool.get)
        assert_equal(listener.stats['created']['success'], 10)
        assert_equal(listener.stats['checked_out'], 10)
        assert_equal(listener.stats['opened'], {'current': 10, 'max': 10})
        assert_equal(listener.stats['at_max'], 1)

        for i in range(0, 5):
            pool.return_conn(conns[i])
        assert_equal(listener.stats['disposed']['success'], 0)
        assert_equal(listener.stats['checked_in'], 5)
        assert_equal(listener.stats['opened'], {'current': 5, 'max': 10})

        for i in range(5, 10):
            pool.return_conn(conns[i])
        assert_equal(listener.stats['disposed']['success'], 5)
        assert_equal(listener.stats['checked_in'], 10)

        conns = []

        # These connections should come from the pool
        for i in range(5):
            conns.append(pool.get())
        assert_equal(listener.stats['created']['success'], 10)
        assert_equal(listener.stats['checked_out'], 15)

        # But these will need to be made
        for i in range(5):
            conns.append(pool.get())
        assert_equal(listener.stats['created']['success'], 15)
        assert_equal(listener.stats['checked_out'], 20)

        assert_equal(listener.stats['disposed']['success'], 5)
        for i in range(10):
            conns[i].return_to_pool()
        assert_equal(listener.stats['checked_in'], 20)
        assert_equal(listener.stats['disposed']['success'], 10)

        assert_raises(InvalidRequestError, conns[0].return_to_pool)
        assert_equal(listener.stats['checked_in'], 20)
        assert_equal(listener.stats['disposed']['success'], 10)

        print "in test:", id(conns[-1])
        assert_raises(InvalidRequestError, conns[-1].return_to_pool)
        assert_equal(listener.stats['checked_in'], 20)
        assert_equal(listener.stats['disposed']['success'], 10)

        pool.dispose()
Example #7
# connect to cassandra
pool = ConnectionPool('metrink')

# get the column family
col_fam = ColumnFamily(pool, 'metrics')

# you must create a Cursor object. It will let
#  you execute all the queries you need
cur = db.cursor()

# Use all the SQL you like
cur.execute(
    'select company, client, device, groupName, name, time_stamp, value from metrics join metrics_devices on metrics.device_id = metrics_devices.device_id join metrics_groups on metrics.group_id = metrics_groups.group_id join metrics_names on metrics.name_id = metrics_names.name_id join metrics_owners on metrics.ownerId = metrics_owners.ownerId'
)

# build a Cassandra row key from each MySQL row and insert the metric value
for row in cur.fetchall():
    time = datetime.datetime.fromtimestamp(row[5] // 1000)
    time_str = str(time.strftime("%Y%m"))
    row_key = str(row[0]) + ":" + str(row[1]) + ":" + time_str + ":" + str(
        row[2]) + ":" + str(row[3]) + ":" + str(row[4])

    print row_key

    col_fam.insert(row_key, {row[5]: row[6]})

# close our cassandra connection
pool.dispose()

# close our connection to mysql
db.close()
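
The loop above reads from MySQL through a cursor on db, but the connection itself is not shown in the snippet; a minimal sketch of how it might be opened (host and credentials are hypothetical):

import MySQLdb

# assumed MySQL connection backing db.cursor() above; substitute real credentials
db = MySQLdb.connect(host='localhost', user='metrink', passwd='secret', db='metrink')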
Example #8
    def test_pool(self):
        listener = StatsLogger()
        pool = ConnectionPool(pool_size=5, max_overflow=5, recycle=10000,
                              prefill=True, pool_timeout=0.1, timeout=1,
                              keyspace='PycassaTestKeyspace', credentials=_credentials,
                              listeners=[listener], use_threadlocal=False)
        conns = []
        for i in range(10):
            conns.append(pool.get())
        assert_equal(listener.stats['created']['success'], 10)
        assert_equal(listener.stats['created']['failure'], 0)
        assert_equal(listener.stats['checked_out'], 10)
        assert_equal(listener.stats['opened'], {'current': 10, 'max': 10})

        # Pool is maxed out now
        assert_raises(NoConnectionAvailable, pool.get)
        assert_equal(listener.stats['created']['success'], 10)
        assert_equal(listener.stats['checked_out'], 10)
        assert_equal(listener.stats['opened'], {'current': 10, 'max': 10})
        assert_equal(listener.stats['at_max'], 1)

        for i in range(0, 5):
            pool.return_conn(conns[i])
        assert_equal(listener.stats['disposed']['success'], 0)
        assert_equal(listener.stats['checked_in'], 5)
        assert_equal(listener.stats['opened'], {'current': 5, 'max': 10})

        for i in range(5, 10):
            pool.return_conn(conns[i])
        assert_equal(listener.stats['disposed']['success'], 5)
        assert_equal(listener.stats['checked_in'], 10)

        conns = []

        # These connections should come from the pool
        for i in range(5):
            conns.append(pool.get())
        assert_equal(listener.stats['created']['success'], 10)
        assert_equal(listener.stats['checked_out'], 15)

        # But these will need to be made
        for i in range(5):
            conns.append(pool.get())
        assert_equal(listener.stats['created']['success'], 15)
        assert_equal(listener.stats['checked_out'], 20)

        assert_equal(listener.stats['disposed']['success'], 5)
        for i in range(10):
            conns[i].return_to_pool()
        assert_equal(listener.stats['checked_in'], 20)
        assert_equal(listener.stats['disposed']['success'], 10)

        assert_raises(InvalidRequestError, conns[0].return_to_pool)
        assert_equal(listener.stats['checked_in'], 20)
        assert_equal(listener.stats['disposed']['success'], 10)

        print("in test:", id(conns[-1]))
        conns[-1].return_to_pool()
        assert_equal(listener.stats['checked_in'], 20)
        assert_equal(listener.stats['disposed']['success'], 10)

        pool.dispose()
Example #9
def query2(user_id):
    pool = ConnectionPool('tuitterdb')
    followsTweets_family = ColumnFamily(pool,'followsTweets')
    # Print the tweets of the followers of user supplied in the parameter.
    query(followsTweets_family,user_id)
    pool.dispose()
Example #10
def query1(user_id):
    pool = ConnectionPool('tuitterdb')
    userTweets_family = ColumnFamily(pool, 'userTweets')
    # Print the tweets of the user supplied in the parameter. 
    query(userTweets_family,user_id)
    pool.dispose()
Example #11
def write(metric, timestamp, value, tags, ds_type):
    try:
        value = normalize_value(metric, tags, value, timestamp, ds_type)
    except ValueError:
        return

    pool = ConnectionPool(keyspace, [address])

    upertime = timestamp/upertime_interval
#    get key from database; if some id does not exist yet, create a new one
    key = generate_key(metric, upertime, tags)

#    save to rawdata
    col_fam_rawdata = pycassa.ColumnFamily(pool, 'rawdata')
    col_fam_rawdata.insert(key, {timestamp: value})
    
#   save to rollups60: if still within the same minute, update the in-memory average;
#   if a new minute has started, write the old value to cassandra and reset the memory
    if dictAvg60[metric]['timestamp'] == 0:
        dictAvg60[metric]['avg'] = value
        dictAvg60[metric]['counter'] = 1
    elif inOneMinute(timestamp, dictAvg60[metric]['timestamp']):
        newAvg  = caculate(dictAvg60[metric]['avg'], dictAvg60[metric]['counter'], value)
        dictAvg60[metric]['avg'] = newAvg
        dictAvg60[metric]['counter'] += 1
    else:
        col_fam_rollups60 = pycassa.ColumnFamily(pool, 'rollups60')
        col_fam_rollups60.insert(metric, {dictAvg60[metric]['timestamp']:  dictAvg60[metric]['avg']})
        dictAvg60[metric]['avg'] = value
        dictAvg60[metric]['counter'] = 1
    dictAvg60[metric]['timestamp'] = timestamp
    
 #   save to rollups300
    if dictAvg300[metric]['timestamp'] == 0:
        dictAvg300[metric]['avg'] = value
        dictAvg300[metric]['counter'] = 1
    elif inFiveMinutes(timestamp, dictAvg300[metric]['timestamp']):
        newAvg  = caculate(dictAvg300[metric]['avg'], dictAvg300[metric]['counter'], value)
        dictAvg300[metric]['avg'] = newAvg
        dictAvg300[metric]['counter'] += 1
    else:
        col_fam_rollups300 = pycassa.ColumnFamily(pool, 'rollups300')
        col_fam_rollups300.insert(metric, {dictAvg300[metric]['timestamp']:  dictAvg300[metric]['avg']})
        dictAvg300[metric]['avg'] = value
        dictAvg300[metric]['counter'] = 1
    dictAvg300[metric]['timestamp'] = timestamp
    
#   save to rollups7200
    if dictAvg7200[metric]['timestamp'] == 0:
        dictAvg7200[metric]['avg'] = value
        dictAvg7200[metric]['counter'] = 1
    elif inTwoHours(timestamp, dictAvg7200[metric]['timestamp']):
        newAvg  = caculate(dictAvg7200[metric]['avg'], dictAvg7200[metric]['counter'], value)
        dictAvg7200[metric]['avg'] = newAvg
        dictAvg7200[metric]['counter'] += 1
    else:
        col_fam_rollups7200 = pycassa.ColumnFamily(pool, 'rollups7200')
        col_fam_rollups7200.insert(metric, {dictAvg7200[metric]['timestamp']:  dictAvg7200[metric]['avg']})
        dictAvg7200[metric]['avg'] = value
        dictAvg7200[metric]['counter'] = 1
    dictAvg7200[metric]['timestamp'] = timestamp
    
#   save to rollups86400
    if dictAvg86400[metric]['timestamp'] == 0:
        dictAvg86400[metric]['avg'] = value
        dictAvg86400[metric]['counter'] = 1
    elif inOneDay(timestamp, dictAvg86400[metric]['timestamp']):
        newAvg  = caculate(dictAvg86400[metric]['avg'], dictAvg86400[metric]['counter'], value)
        dictAvg86400[metric]['avg'] = newAvg
        dictAvg86400[metric]['counter'] += 1
    else:
        col_fam_rollups86400 = pycassa.ColumnFamily(pool, 'rollups86400')
        col_fam_rollups86400.insert(metric, {dictAvg86400[metric]['timestamp']:  dictAvg86400[metric]['avg']})
        dictAvg86400[metric]['avg'] = value
        dictAvg86400[metric]['counter'] = 1
    dictAvg86400[metric]['timestamp'] = timestamp
    pool.dispose()
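
The rollup blocks above rely on a caculate() helper defined elsewhere; a minimal sketch of the incremental-average logic it presumably implements (an assumption, not the original code):

def caculate(avg, counter, value):
    # fold one more sample into a running mean computed over 'counter' samples
    return (avg * counter + value) / float(counter + 1)
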
class Cassa(object):
    '''
    Provides a simple key=value functionality built on a cassandra
    table with a key and a single column.  Used in ZookeeperTaskQueue
    to replace the two tables that grow in size with the number of
    tasks rather than the number of workers.
    '''
    def __init__(self, namespace, server_list=['localhost:9160']):
        # save cassandra server
        self.server_list = server_list
        self.namespace = namespace
        self._closed = False

        #setup_logging(self)

        # Connect to the server creating the namespace if it doesn't
        # already exist
        try:
            self.pool = ConnectionPool(namespace,
                                       self.server_list,
                                       max_retries=500,
                                       pool_timeout=600,
                                       timeout=10)
        except pycassa.InvalidRequestException:
            self._create_namespace(namespace)
            self.pool = ConnectionPool(namespace,
                                       self.server_list,
                                       max_retries=500,
                                       pool_timeout=600,
                                       timeout=10)

        try:
            self._tasks = pycassa.ColumnFamily(self.pool, 'tasks')
        except pycassa.NotFoundException:
            self._create_column_family('tasks',
                                       key_validation_class=ASCII_TYPE,
                                       bytes_columns=['task_data'])
            self._tasks = pycassa.ColumnFamily(self.pool, 'tasks')

        try:
            self._available = pycassa.ColumnFamily(self.pool, 'available')
        except pycassa.NotFoundException:
            self._create_column_family('available',
                                       key_validation_class=ASCII_TYPE,
                                       bytes_columns=['available'])
            self._available = pycassa.ColumnFamily(self.pool, 'available')

        try:
            self._task_count = pycassa.ColumnFamily(self.pool, 'task_count')
        except pycassa.NotFoundException:
            self._create_counter_column_family('task_count',
                                               key_validation_class=ASCII_TYPE,
                                               counter_columns=['task_count'])
            self._task_count = pycassa.ColumnFamily(self.pool, 'task_count')
            self._task_count.insert('RowKey', {'task_count': 0})

        try:
            self._available_count = pycassa.ColumnFamily(
                self.pool, 'available_count')
        except pycassa.NotFoundException:
            self._create_counter_column_family(
                'available_count',
                key_validation_class=ASCII_TYPE,
                counter_columns=['available_count'])
            self._available_count = pycassa.ColumnFamily(
                self.pool, 'available_count')
            self._available_count.insert('RowKey', {'available_count': 0})

    def delete_namespace(self):
        sm = SystemManager(random.choice(self.server_list))
        sm.drop_keyspace(self.namespace)
        sm.close()

    def _create_namespace(self, namespace):
        sm = SystemManager(random.choice(self.server_list))
        sm.create_keyspace(namespace, SIMPLE_STRATEGY,
                           {'replication_factor': '1'})
        sm.close()

    def _create_column_family(self,
                              family,
                              bytes_columns=[],
                              key_validation_class=TIME_UUID_TYPE):
        '''
        Creates a column family of the name 'family' and sets any of
        the names in the bytes_column list to have the BYTES_TYPE.

        key_validation_class defaults to TIME_UUID_TYPE and could also
        be ASCII_TYPE for md5 hash keys, like we use for 'inbound'
        '''
        sm = SystemManager(random.choice(self.server_list))
        # sys.create_column_family(self.namespace, family, super=False)
        sm.create_column_family(self.namespace,
                                family,
                                super=False,
                                key_validation_class=key_validation_class,
                                default_validation_class=TIME_UUID_TYPE,
                                column_name_class=ASCII_TYPE)
        for column in bytes_columns:
            sm.alter_column(self.namespace, family, column, BYTES_TYPE)
        sm.close()

    def _create_counter_column_family(self,
                                      family,
                                      counter_columns=[],
                                      key_validation_class=UTF8Type):
        '''
        Creates a column family of the name 'family' and sets any of
        the names in the bytes_column list to have the BYTES_TYPE.

        key_validation_class defaults to TIME_UUID_TYPE and could also
        be ASCII_TYPE for md5 hash keys, like we use for 'inbound'
        '''
        sm = SystemManager(random.choice(self.server_list))
        # sys.create_column_family(self.namespace, family, super=False)
        sm.create_column_family(self.namespace,
                                family,
                                super=False,
                                key_validation_class=key_validation_class,
                                default_validation_class="CounterColumnType",
                                column_name_class=ASCII_TYPE)
        for column in counter_columns:
            sm.alter_column(self.namespace, family, column,
                            COUNTER_COLUMN_TYPE)
        sm.close()

    def tasks(self, key_prefix=''):
        '''
        generate the data objects for every task
        '''
        for row in self._tasks.get_range():
            logger.debug(row)
            if not row[0].startswith(key_prefix):
                continue
            data = json.loads(row[1]['task_data'])
            data['task_key'] = row[0]
            yield data

    def put_task(self, key, task_data):
        try:
            found = self._tasks.get(key, column_count=1)
            exists = True
        except pycassa.cassandra.ttypes.NotFoundException:
            exists = False

        self._tasks.insert(key, {'task_data': json.dumps(task_data)})
        if not exists:
            self._task_count.insert('RowKey', {'task_count': 1})
        return exists

    def get_task(self, key):
        data = self._tasks.get(key)
        return json.loads(data['task_data'])

    def pop_task(self, key):
        self._tasks.remove(key)
        self._task_count.insert('RowKey', {'task_count': -1})
        return key

    @property
    def task_keys(self):
        c = 0
        for key, _ in self._tasks.get_range(column_count=0,
                                            filter_empty=False):
            c += 1
            yield key

    def num_tasks(self):
        data = self._task_count.get('RowKey')
        return data['task_count']

    def num_available(self):
        data = self._available_count.get('RowKey')
        return data['available_count']

    def put_available(self, key):
        ## closest thing to storing only the key
        try:
            found = self._available.get(key, column_count=1)
            exists = True
        except pycassa.cassandra.ttypes.NotFoundException:
            exists = False

        if not exists:
            self._available.insert(key, {'available': ''})
            self._available_count.insert('RowKey', {'available_count': 1})

    #def push_batch(self, row_iter):
    #    '''
    #    Push opaque vertex data objects into the inbound queue
    #    '''
    #    return self._tasks.batch_insert({k: json.dumps(v) for k, v in row_iter})

    def get_random_available(self, max_iter=10000):
        '''
        get a random key out of the first max_iter rows
        '''
        c = 1
        keeper = None
        ## note the ConsistencyLevel here.  If we do not do this, and
        ## get all slick with things like column_count=0 and filter
        ## empty False, then we can get keys that were recently
        ## deleted... EVEN if the default consistency would seem to
        ## rule that out!

        ## note the random start key, so that we do not always hit the
        ## same place in the key range with all workers
        #random_key = hashlib.md5(str(random.random())).hexdigest()
        #random_key = '0' * 32
        #logger.debug('available.get_range(%r)' % random_key)
        ## scratch that idea: turns out that using a random start key
        ## OR using row_count=1 can cause get_range to hang for hours

        ## why we need ConsistencyLevel.ALL on a single node is not
        ## clear, but experience indicates it is needed.

        ## note that putting a finite row_count is problematic in two
        ## ways:
        # 1) if there are more workers than max_iter, some will not
        # get tasks
        #
        # 2) if there are more than max_iter records, then all workers
        # have to wade through all of these just to get a task!  What
        # we really want is a "pick random row" function, and that is
        # probably best implemented using CQL3 token function via the
        # cql python module instead of pycassa...
        for row in self._available.get_range(
                row_count=max_iter,
                read_consistency_level=pycassa.ConsistencyLevel.ALL):
            #for row in self._available.get_range(row_count=100):
            logger.debug('considering %r' % (row, ))
            # use float division; 1 / c truncates to 0 under Python 2 for c > 1
            if random.random() < 1.0 / c:
                keeper = row[0]
            if c == max_iter:
                break
            c += 1
        return keeper

    def in_available(self, key):
        try:
            row = self._available.get(key)
            return True
        except pycassa.NotFoundException:
            return False

    def pop_available(self, key):
        self._available.remove(
            key, write_consistency_level=pycassa.ConsistencyLevel.ALL)
        self._available_count.insert('RowKey', {'available_count': -1})
        assert not self.in_available(key)
        return key

    def close(self):
        self._closed = True
        if hasattr(self, 'pool'):
            self.pool.dispose()
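
A minimal usage sketch for the Cassa wrapper above, assuming a Cassandra node on localhost:9160 and a hypothetical keyspace name:

cassa = Cassa('task_queue_test')               # creates the keyspace on first use
cassa.put_task('task-0001', {'url': 'http://example.com'})
print cassa.num_tasks()                        # counter column family tracks the total
cassa.put_available('task-0001')
key = cassa.get_random_available()
if key is not None:
    print cassa.get_task(key)
    cassa.pop_available(key)
    cassa.pop_task(key)
cassa.close()
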
Example #13
      replicate_on_write='true' AND
      compression={'sstable_compression': 'SnappyCompressor'};
    
    
    '''
    #UpdateValue(pool,testcf[0],'172.16.40.147',{('apache','port'):'8080',('apache','docbase'):'/var/www',('cassandra','ver'):'1.2.0'})
    # UpdateValue(pool,testcf[0],'172.16.40.147',{('apache','port'):'8080'})
    # UpdateValue(pool,testcf[0],'172.16.40.145',{('apache','port'):'8080'})
    # UpdateValue(pool,testcf[0],'172.16.40.146',{('apache','port'):'8080'})
    # UpdateValue(pool,testcf[0],'172.16.40.147',{('apache','docbase'):'/var/www'})
    # UpdateValue(pool,testcf[0],'172.16.40.145',{('apache','docbase'):'/var/www'})
    # UpdateValue(pool,testcf[0],'172.16.40.146',{('apache','docbase'):'/var/www'})
    # UpdateValue(pool,testcf[0],'172.16.40.145',{('cassandra','ver'):'1.2.0'})
    # UpdateValue(pool,testcf[0],'172.16.40.146',{('cassandra','ver'):'1.2.0'})
    # UpdateValue(pool,testcf[0],'172.16.40.145',{('cassandra','seeds'):'172.16.40.145,172.16.40.147'})
    # UpdateValue(pool,testcf[0],'172.16.40.146',{('cassandra','seeds'):'172.16.40.145,172.16.40.147'})
    # UpdateValue(pool,testcf[0],'172.16.40.145',{('apache333','zzz'):'empty'})
    # s = GetValue(pool,testcf[0],('172.16.40.145'), column_start = ('apache','ver'), column_finish = ('cassandra',))
    #s = GetValue(pool,'jobstatus',('wait'), column_start = ('GANGLIA_CHECK_20130104133051_1',''), column_finish = ('GANGLIA_CHECK_20130104133051_1',))
    s = GetValue(pool,'nodesoft', ('172.16.40.149_cent_yum'), column_start = ('ganglia',''), column_finish = ('ganglia',))
    
    #s = Remove(pool,'jobstatus',('analysised'),[('GANGLIA_CHECK_20121231175157_1','2013-01-04 09:27:58.996000'),])
    #s = Remove(pool,'jobstatus',('analysised'),None)
    
    print(s)
    pool.dispose()


    

class Cassa(object):
    '''
    Provides a simple key=value functionality built on a cassandra
    table with a key and a single column.  Used in ZookeeperTaskQueue
    to replace the two tables that grow in size with the number of
    tasks rather than the number of workers.
    '''

    def __init__(self, namespace, server_list=['localhost:9160']):
        # save cassandra server
        self.server_list = server_list
        self.namespace = namespace
        self._closed = False

        #setup_logging(self)

        # Connect to the server creating the namespace if it doesn't
        # already exist
        try:
            self.pool = ConnectionPool(namespace, self.server_list, max_retries=500, pool_timeout=600, timeout=10)
        except pycassa.InvalidRequestException:
            self._create_namespace(namespace)
            self.pool = ConnectionPool(namespace, self.server_list, max_retries=500, pool_timeout=600, timeout=10)

        try:
            self._tasks = pycassa.ColumnFamily(self.pool, 'tasks')
        except pycassa.NotFoundException:
            self._create_column_family('tasks', 
                                       key_validation_class=ASCII_TYPE, 
                                       bytes_columns=['task_data'])
            self._tasks = pycassa.ColumnFamily(self.pool, 'tasks')

        try:
            self._available = pycassa.ColumnFamily(self.pool, 'available')
        except pycassa.NotFoundException:
            self._create_column_family('available', 
                                        key_validation_class=ASCII_TYPE, 
                                        bytes_columns=['available'])
            self._available = pycassa.ColumnFamily(self.pool, 'available')

        try:
            self._task_count = pycassa.ColumnFamily(self.pool, 'task_count')
        except pycassa.NotFoundException:
            self._create_counter_column_family('task_count', 
                                       key_validation_class=ASCII_TYPE, 
                                       counter_columns=['task_count'])
            self._task_count = pycassa.ColumnFamily(self.pool, 'task_count')
            self._task_count.insert('RowKey', {'task_count': 0})

        try:
            self._available_count = pycassa.ColumnFamily(self.pool, 'available_count')
        except pycassa.NotFoundException:
            self._create_counter_column_family('available_count', 
                                       key_validation_class=ASCII_TYPE, 
                                       counter_columns=['available_count'])
            self._available_count = pycassa.ColumnFamily(self.pool, 'available_count')
            self._available_count.insert('RowKey', {'available_count': 0})

    def delete_namespace(self):
        sm = SystemManager(random.choice(self.server_list))
        sm.drop_keyspace(self.namespace)
        sm.close()

    def _create_namespace(self, namespace):
        sm = SystemManager(random.choice(self.server_list))
        sm.create_keyspace(namespace, SIMPLE_STRATEGY, {'replication_factor': '1'})
        sm.close()

    def _create_column_family(self, family, bytes_columns=[], 
                              key_validation_class=TIME_UUID_TYPE):
        '''
        Creates a column family of the name 'family' and sets any of
        the names in the bytes_column list to have the BYTES_TYPE.

        key_validation_class defaults to TIME_UUID_TYPE and could also
        be ASCII_TYPE for md5 hash keys, like we use for 'inbound'
        '''
        sm = SystemManager(random.choice(self.server_list))
        # sys.create_column_family(self.namespace, family, super=False)
        sm.create_column_family(self.namespace, family, super=False,
                key_validation_class = key_validation_class, 
                default_validation_class  = TIME_UUID_TYPE,
                column_name_class = ASCII_TYPE)
        for column in bytes_columns:
            sm.alter_column(self.namespace, family, column, BYTES_TYPE)
        sm.close()

    def _create_counter_column_family(self, family, counter_columns=[],
                              key_validation_class=UTF8Type):
        '''
        Creates a column family of the name 'family' and sets any of
        the names in the bytes_column list to have the BYTES_TYPE.

        key_validation_class defaults to TIME_UUID_TYPE and could also
        be ASCII_TYPE for md5 hash keys, like we use for 'inbound'
        '''
        sm = SystemManager(random.choice(self.server_list))
        # sys.create_column_family(self.namespace, family, super=False)
        sm.create_column_family(self.namespace, family, super=False,
                key_validation_class = key_validation_class, 
                default_validation_class="CounterColumnType",
                column_name_class = ASCII_TYPE)
        for column in counter_columns:
            sm.alter_column(self.namespace, family, column, COUNTER_COLUMN_TYPE)
        sm.close()

    def tasks(self, key_prefix=''):
        '''
        generate the data objects for every task
        '''
        for row in self._tasks.get_range():
            logger.debug(row)
            if not row[0].startswith(key_prefix):
                continue
            data = json.loads(row[1]['task_data'])
            data['task_key'] = row[0]
            yield data

    def put_task(self, key, task_data):
        try:
            found = self._tasks.get(key, column_count=1)
            exists = True
        except pycassa.cassandra.ttypes.NotFoundException:
            exists = False

        self._tasks.insert(key, {'task_data': json.dumps(task_data)})
        if not exists:
            self._task_count.insert('RowKey', {'task_count': 1})
        return exists

    def get_task(self, key):
        data = self._tasks.get(key)
        return json.loads(data['task_data'])

    def pop_task(self, key):
        self._tasks.remove(key)
        self._task_count.insert('RowKey', {'task_count': -1})
        return key

    @property
    def task_keys(self):
        c = 0
        for key, _ in self._tasks.get_range(column_count=0, filter_empty=False):
            c += 1
            yield key

    def num_tasks(self):
        data = self._task_count.get('RowKey')
        return data['task_count']

    def num_available(self):
        data = self._available_count.get('RowKey')
        return data['available_count']

    def put_available(self, key):
        ## closest thing to storing only the key
        try:
            found = self._available.get(key, column_count=1)
            exists = True
        except pycassa.cassandra.ttypes.NotFoundException:
            exists = False

        if not exists:
            self._available.insert(key, {'available': ''})
            self._available_count.insert('RowKey', {'available_count': 1})

    #def push_batch(self, row_iter):
    #    '''
    #    Push opaque vertex data objects into the inbound queue
    #    '''
    #    return self._tasks.batch_insert({k: json.dumps(v) for k, v in row_iter})

    def get_random_available(self, max_iter=10000):
        '''
        get a random key out of the first max_iter rows
        '''
        c = 1
        keeper = None
        ## note the ConsistencyLevel here.  If we do not do this, and
        ## get all slick with things like column_count=0 and filter
        ## empty False, then we can get keys that were recently
        ## deleted... EVEN if the default consistency would seem to
        ## rule that out!

        ## note the random start key, so that we do not always hit the
        ## same place in the key range with all workers
        #random_key = hashlib.md5(str(random.random())).hexdigest()
        #random_key = '0' * 32
        #logger.debug('available.get_range(%r)' % random_key)
        ## scratch that idea: turns out that using a random start key
        ## OR using row_count=1 can cause get_range to hang for hours

        ## why we need ConsistencyLevel.ALL on a single node is not
        ## clear, but experience indicates it is needed.

        ## note that putting a finite row_count is problematic in two
        ## ways:
        # 1) if there are more workers than max_iter, some will not
        # get tasks
        #
        # 2) if there are more than max_iter records, then all workers
        # have to wade through all of these just to get a task!  What
        # we really want is a "pick random row" function, and that is
        # probably best implemented using CQL3 token function via the
        # cql python module instead of pycassa...
        for row in self._available.get_range(row_count=max_iter, read_consistency_level=pycassa.ConsistencyLevel.ALL):
        #for row in self._available.get_range(row_count=100):
            logger.debug('considering %r' % (row,))
            # use float division; 1 / c truncates to 0 under Python 2 for c > 1
            if random.random() < 1.0 / c:
                keeper = row[0]
            if c == max_iter:
                break
            c += 1
        return keeper

    def in_available(self, key):
        try:
            row = self._available.get(key)
            return True
        except pycassa.NotFoundException:
            return False

    def pop_available(self, key):
        self._available.remove(key, write_consistency_level=pycassa.ConsistencyLevel.ALL)
        self._available_count.insert('RowKey', {'available_count': -1})
        assert not self.in_available(key)
        return key

    def close(self):
        self._closed = True
        if hasattr(self, 'pool'):
            self.pool.dispose()