def read(metric, start_time, end_time, tags):
    pool = ConnectionPool(keyspace, [address])
    # decide which column family to read based on the time difference
    if timeDiff(start_time, end_time) <= 3600:
        col_fam = pycassa.ColumnFamily(pool, 'rawdata')
    elif timeDiff(start_time, end_time) <= 7200:
        col_fam = pycassa.ColumnFamily(pool, 'rollups60')
    elif timeDiff(start_time, end_time) <= 86400:
        col_fam = pycassa.ColumnFamily(pool, 'rollups300')
    elif timeDiff(start_time, end_time) <= 2592000:
        col_fam = pycassa.ColumnFamily(pool, 'rollups7200')
    else:
        col_fam = pycassa.ColumnFamily(pool, 'rollups86400')
    # convert start_time and end_time to upper-timestamp (row partition) indexes
    start_upertime = start_time / upertime_interval
    end_upertime = end_time / upertime_interval
    points = {}
    for i in range(start_upertime, end_upertime + 1):
        key = generate_key(metric, i, tags)
        try:
            # accumulate results instead of overwriting them on each row
            points.update(col_fam.get(key, column_start=start_time,
                                      column_finish=end_time))
        except pycassa.NotFoundException:
            # a missing partition just means no samples in that window
            continue
    pool.dispose()
    return points if points else None
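# The helpers used above (timeDiff, generate_key, upertime_interval) are not
# shown in the source; a minimal sketch of what they are assumed to look like,
# where row keys partition a metric by a fixed "upper time" interval:
upertime_interval = 86400  # assumed: one row per metric per day

def timeDiff(start_time, end_time):
    return end_time - start_time

def generate_key(metric, upertime, tags):
    # assumed key scheme: metric, partition index, then sorted tag pairs
    tag_part = ','.join('%s=%s' % (k, tags[k]) for k in sorted(tags))
    return '%s#%d#%s' % (metric, upertime, tag_part)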
def get_cassandra_connection(keyspace_name, hosts):
    key = keyspace_name, tuple(hosts)
    connection_pool, created_at = connection_pool_cache.get(key, (None, None))
    init_new_pool = connection_pool is None or connection_pool_expired(created_at)
    if connection_pool is not None and len(connection_pool.server_list) == 0:
        logger.error('connection pool had no active hosts')
        init_new_pool = True
    if init_new_pool:
        nodes = detect_nodes(hosts, keyspace_name)
        logger.info('setting up a new connection pool')
        connection_pool = ConnectionPool(
            keyspace_name,
            nodes,
            pool_size=settings.FEEDLY_CASSANDRA_CONNECTION_POOL_SIZE,
            prefill=False,
            timeout=settings.FEEDLY_CASSANDRA_TIMEOUT,
            max_retries=3
        )
        listener = FeedlyPoolListener(connection_pool)
        connection_pool.add_listener(listener)
        connection_pool_cache[key] = (connection_pool, time.time())
    return connection_pool
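# connection_pool_cache and connection_pool_expired are not shown in the
# source; a minimal sketch under the assumption that cached pools are
# recycled after a fixed TTL:
connection_pool_cache = {}
POOL_TTL_SECONDS = 300  # assumed recycle interval

def connection_pool_expired(created_at):
    return created_at is None or (time.time() - created_at) > POOL_TTL_SECONDS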
def main(filename):
    data = open(filename)

    # Set up the connection pool
    pool = ConnectionPool('tuitterdb', ['localhost:9160'])

    # CF connections
    user_family = ColumnFamily(pool, 'user')
    tweet_family = ColumnFamily(pool, 'tweet')
    user_tweets_family = ColumnFamily(pool, 'userTweets')
    followers = ColumnFamily(pool, 'followers')
    followerTweets = ColumnFamily(pool, 'followsTweets')

    # Batch definitions
    user_batch = user_family.batch(queue_size=1000)
    followers_batch = followers.batch(queue_size=500)
    user_tweets_batch = user_tweets_family.batch(queue_size=500)
    followerTweets_batch = followerTweets.batch(queue_size=500)

    while True:
        line = data.readline()
        if line == "":  # readline() returns '' at EOF
            break
        tweet = tweet_get(line)
        try:
            tweet_data = get_tweet_data(tweet)
            # the sender is needed below even when the user row already exists
            sender = get_sender(tweet)
            if check_user(tweet[u"from_user_id_str"]) == False:
                # create user entry for the sender
                user_batch.insert(sender.user_id, {'user_name': sender.user_name,
                                                   'screen_name': sender.from_user})
                # insert the whole tweet into a userTweets column header
                user_tweets_batch.insert(sender.user_id, {line: ''})
            if tweet[u"to_user"] is not None and check_user(tweet[u"to_user_id"]) == False:
                to_user = get_to_user(tweet)
                user_batch.insert(to_user.user_id, {'user_name': to_user.user_name,
                                                    'screen_name': to_user.from_user})
                followers_batch.insert(to_user.user_id, {sender.user_id: 'follower_id'})
                # insert the whole tweet into a followerTweets column header for the to_user
                followerTweets_batch.insert(to_user.user_id, {line: ''})
            if u"entities" in tweet:
                # iterate over the mentioned users and add them to users and follows if necessary
                if tweet[u"entities"][u"user_mentions"] != []:
                    user_mentions = get_mentions(tweet)
                    for obj in user_mentions:
                        if check_user(obj.user_id) == False:
                            user_batch.insert(obj.user_id, {'user_name': obj.user_name,
                                                            'screen_name': obj.from_user})
                            followers_batch.insert(obj.user_id, {'user_id': sender.user_id})
                            # insert the whole tweet into a followerTweets entry for the mentioned user
                            followerTweets_batch.insert(obj.user_id, {line: ''})
            tweet_family.insert(tweet_data.tweet_id, {'text': tweet_data.textbody,
                                                      'user_id': sender.user_id,
                                                      'timeanddate': tweet_data.timestamp})
        except Exception:
            # print the exception data with traceback and continue
            err = sys.exc_info()
            print "Broken cos %s %s %s" % (err[0], err[1], traceback.print_tb(err[2]))
            continue

    # flush anything still queued in the batches, then close the pool
    user_batch.send()
    followers_batch.send()
    user_tweets_batch.send()
    followerTweets_batch.send()
    # Pools Closed.
    pool.dispose()
def main(filename):
    data = open(filename)

    # Set up the connection pool
    pool = ConnectionPool('tuitterdb', ['localhost:9160'])

    # CF connections
    user_family = ColumnFamily(pool, 'user')
    tweet_family = ColumnFamily(pool, 'tweet')
    user_tweets_family = ColumnFamily(pool, 'userTweets')
    #follows_tweets_family = ColumnFamily(pool, 'follows.tweets')
    followers = ColumnFamily(pool, 'followers')

    # Batch definitions
    user_batch = user_family.batch(queue_size=1000)
    followers_batch = followers.batch(queue_size=500)
    user_tweets_batch = user_tweets_family.batch(queue_size=500)

    while True:
        line = data.readline()
        if line == "":  # readline() returns '' at EOF, never None
            break
        tweet = tweet_get(line)
        try:
            tweet_data = get_tweet_data(tweet)
            # the sender is needed below even when the user row already exists
            sender = get_sender(tweet)
            if check_user(tweet[u"from_user_id_str"]) == False:
                user_batch.insert(sender.user_id, {'user_name': sender.user_name,
                                                   'screen_name': sender.from_user})
                user_tweets_batch.insert(sender.user_id,
                                         {tweet_data.tweet_id: tweet_data.timestamp})
            if tweet[u"to_user"] is not None and check_user(tweet[u"to_user_id"]) == False:
                to_user = get_to_user(tweet)
                user_batch.insert(to_user.user_id, {'user_name': to_user.user_name,
                                                    'screen_name': to_user.from_user})
                followers_batch.insert(to_user.user_id, {'user_id': sender.user_id})
            if u"entities" in tweet:
                if tweet[u"entities"][u"user_mentions"] != []:
                    user_mentions = get_mentions(tweet)
                    for obj in user_mentions:
                        user_batch.insert(obj.user_id, {'user_name': obj.user_name,
                                                        'screen_name': obj.from_user})
                        followers_batch.insert(obj.user_id, {'user_id': sender.user_id})
            tweet_family.insert(tweet_data.tweet_id, {'text': tweet_data.textbody,
                                                      'user_id': sender.user_id,
                                                      'timeanddate': tweet_data.timestamp})
        except Exception:
            err = sys.exc_info()
            print "Broken cos %s %s %s" % (err[0], err[1], traceback.print_tb(err[2]))
            continue

    # flush the batches, then close the pool
    user_batch.send()
    followers_batch.send()
    user_tweets_batch.send()
    # Pools Closed.
    pool.dispose()

#if __name__ == "__main__":
    #unittest.main()
def test_big_batched_writes():
    ## this is an m1.xlarge doing nothing but supporting this test
    server = 'localhost:9160'
    keyspace = 'testkeyspace_' + getpass.getuser().replace('-', '_')
    family = 'testcf'
    sm = SystemManager(server)
    try:
        sm.drop_keyspace(keyspace)
    except pycassa.InvalidRequestException:
        pass
    sm.create_keyspace(keyspace, SIMPLE_STRATEGY, {'replication_factor': '1'})
    sm.create_column_family(keyspace, family, super=False,
                            key_validation_class=LEXICAL_UUID_TYPE,
                            default_validation_class=LEXICAL_UUID_TYPE,
                            column_name_class=ASCII_TYPE)
    sm.alter_column(keyspace, family, 'test', ASCII_TYPE)
    sm.close()

    pool = ConnectionPool(keyspace, [server], max_retries=10,
                          pool_timeout=0, pool_size=10, timeout=120)
    pool.fill()
    pool.add_listener(Listener())

    ## assert that we are using framed transport
    conn = pool._q.get()
    assert isinstance(conn.transport, thrift.transport.TTransport.TFramedTransport)
    pool._q.put(conn)

    try:
        for num_rows in range(14, 20):
            ## write some data to cassandra using increasing data sizes
            one_mb = ' ' * 2**20
            rows = []
            for i in xrange(num_rows):
                key = uuid.uuid4()
                rows.append((key, dict(test=one_mb)))
            testcf = pycassa.ColumnFamily(pool, family)
            with testcf.batch() as batch:
                for (key, data_dict) in rows:
                    data_size = len(data_dict.values()[0])
                    logger.critical('adding %r with %.6f MB' % (key, float(data_size) / 2**20))
                    batch.insert(key, data_dict)
            logger.critical('%d rows written' % num_rows)
    finally:
        sm = SystemManager(server)
        try:
            sm.drop_keyspace(keyspace)
        except pycassa.InvalidRequestException:
            pass
        sm.close()
        logger.critical('clearing test keyspace: %r' % keyspace)
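# The test above exercises Thrift's framed transport: a single batch send must
# fit in one frame (thrift_framed_transport_size_in_mb on the server; 15 MB
# was the default in that era of Cassandra, but that figure is an assumption
# about the server config, not read from it). A hedged sketch of capping batch
# payloads before they approach the frame limit:
MAX_FRAME_BYTES = 15 * 2**20  # assumed server frame size

def batched_insert_with_cap(cf, rows, max_bytes=MAX_FRAME_BYTES // 2):
    pending = 0
    batch = cf.batch()
    for key, cols in rows:
        size = sum(len(v) for v in cols.values())  # assumes string values
        if pending and pending + size > max_bytes:
            batch.send()   # flush before the frame would overflow
            pending = 0
        batch.insert(key, cols)
        pending += size
    batch.send()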
def get_values(servlst, ks, cf, key):
    #print key
    pool = None
    try:
        pool = ConnectionPool(ks, servlst)
        cf_handle = ColumnFamily(pool, cf)
        result = cf_handle.get(key).values()
    except pycassa.NotFoundException:
        print "[ERROR] " + key + " not found"
        result = ""
    except Exception as err:
        print "[ERROR] " + str(err)
        exit(-1)
    finally:
        if pool is not None:  # the pool may never have been created
            pool.dispose()
    return result
def write(vl, data=None):
    # get a connection from the pycassa connection pool;
    # the 'Monitor' keyspace holds the monitoring data
    pool = ConnectionPool('Monitor', ['localhost:9160'])
    # column families: RawData, Rollups60, Rollups300, Rollups7200, Rollups86400
    col_fam_rawdata = pycassa.ColumnFamily(pool, 'rawdata')
    # partition rows by time, one row per month
    timeString = time.strftime("%Y-%m", time.localtime(vl.time))
    key = [str(vl.host), str(vl.plugin), str(vl.plugin_instance),
           str(vl.type), str(vl.type_instance), timeString]
    keyString = "#".join(key)
    for i in vl.values:
        # insert into RawData
        col_fam_rawdata.insert(keyString, {vl.time: i})
        # also append to a file, for testing ('with' closes it for us)
        with open('/tmp/workfile', 'a') as f:
            f.write(keyString + " " + str(vl.time) + " " + str(i) + "\n")
    pool.dispose()
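# A hedged read-side sketch matching the month-partitioned key scheme above
# (the function name and parameters are illustrative, not from the source):
def read_month(host, plugin, plugin_instance, vtype, type_instance, year_month):
    pool = ConnectionPool('Monitor', ['localhost:9160'])
    rawdata = pycassa.ColumnFamily(pool, 'rawdata')
    key = "#".join([host, plugin, plugin_instance, vtype, type_instance, year_month])
    try:
        return rawdata.get(key)  # {timestamp: value, ...} for that month
    except pycassa.NotFoundException:
        return {}
    finally:
        pool.dispose()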
def _update_analytics_start_time(self, start_time):
    pool = ConnectionPool(COLLECTOR_KEYSPACE,
                          ['127.0.0.1:%s' % (self.__class__.cassandra_port)])
    col_family = ColumnFamily(pool, SYSTEM_OBJECT_TABLE)
    col_family.insert(SYSTEM_OBJECT_ANALYTICS,
                      {SYSTEM_OBJECT_START_TIME: start_time})
def write(metric, timestamp, value, tags, ds_type):
    try:
        value = normalize_value(metric, tags, value, timestamp, ds_type)
    except ValueError:
        return

    pool = ConnectionPool(keyspace, [address])
    upertime = timestamp / upertime_interval
    # get key from database; if some id does not exist, create a new one
    key = generate_key(metric, upertime, tags)

    # save to rawdata
    col_fam_rawdata = pycassa.ColumnFamily(pool, 'rawdata')
    col_fam_rawdata.insert(key, {timestamp: value})

    # save to rollups60: if still in the same minute, update the in-memory
    # average; on a new minute, write the old average to cassandra and reset
    if dictAvg60[metric]['timestamp'] == 0:
        dictAvg60[metric]['avg'] = value
        dictAvg60[metric]['counter'] = 1
    elif inOneMinute(timestamp, dictAvg60[metric]['timestamp']):
        newAvg = caculate(dictAvg60[metric]['avg'], dictAvg60[metric]['counter'], value)
        dictAvg60[metric]['avg'] = newAvg
        dictAvg60[metric]['counter'] += 1
    else:
        col_fam_rollups60 = pycassa.ColumnFamily(pool, 'rollups60')
        col_fam_rollups60.insert(metric, {dictAvg60[metric]['timestamp']: dictAvg60[metric]['avg']})
        dictAvg60[metric]['avg'] = value
        dictAvg60[metric]['counter'] = 1
    dictAvg60[metric]['timestamp'] = timestamp

    # save to rollups300
    if dictAvg300[metric]['timestamp'] == 0:
        dictAvg300[metric]['avg'] = value
        dictAvg300[metric]['counter'] = 1
    elif inFiveMinutes(timestamp, dictAvg300[metric]['timestamp']):
        newAvg = caculate(dictAvg300[metric]['avg'], dictAvg300[metric]['counter'], value)
        dictAvg300[metric]['avg'] = newAvg
        dictAvg300[metric]['counter'] += 1
    else:
        col_fam_rollups300 = pycassa.ColumnFamily(pool, 'rollups300')
        col_fam_rollups300.insert(metric, {dictAvg300[metric]['timestamp']: dictAvg300[metric]['avg']})
        dictAvg300[metric]['avg'] = value
        dictAvg300[metric]['counter'] = 1
    dictAvg300[metric]['timestamp'] = timestamp

    # save to rollups7200
    if dictAvg7200[metric]['timestamp'] == 0:
        dictAvg7200[metric]['avg'] = value
        dictAvg7200[metric]['counter'] = 1
    elif inTwoHours(timestamp, dictAvg7200[metric]['timestamp']):
        newAvg = caculate(dictAvg7200[metric]['avg'], dictAvg7200[metric]['counter'], value)
        dictAvg7200[metric]['avg'] = newAvg
        dictAvg7200[metric]['counter'] += 1
    else:
        col_fam_rollups7200 = pycassa.ColumnFamily(pool, 'rollups7200')
        col_fam_rollups7200.insert(metric, {dictAvg7200[metric]['timestamp']: dictAvg7200[metric]['avg']})
        dictAvg7200[metric]['avg'] = value
        dictAvg7200[metric]['counter'] = 1
    dictAvg7200[metric]['timestamp'] = timestamp

    # save to rollups86400
    if dictAvg86400[metric]['timestamp'] == 0:
        dictAvg86400[metric]['avg'] = value
        dictAvg86400[metric]['counter'] = 1
    elif inOneDay(timestamp, dictAvg86400[metric]['timestamp']):
        newAvg = caculate(dictAvg86400[metric]['avg'], dictAvg86400[metric]['counter'], value)
        dictAvg86400[metric]['avg'] = newAvg
        dictAvg86400[metric]['counter'] += 1
    else:
        col_fam_rollups86400 = pycassa.ColumnFamily(pool, 'rollups86400')
        col_fam_rollups86400.insert(metric, {dictAvg86400[metric]['timestamp']: dictAvg86400[metric]['avg']})
        dictAvg86400[metric]['avg'] = value
        dictAvg86400[metric]['counter'] = 1
    dictAvg86400[metric]['timestamp'] = timestamp

    pool.dispose()
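# The rollup helpers used above are not shown in the source; minimal sketches
# under the assumption that 'caculate' maintains a running average and the
# in*() predicates test whether two timestamps fall in the same bucket:
def caculate(avg, counter, value):
    # running average extended by one more sample
    return (avg * counter + value) / (counter + 1.0)

def inOneMinute(ts, prev_ts):
    return ts // 60 == prev_ts // 60

def inFiveMinutes(ts, prev_ts):
    return ts // 300 == prev_ts // 300

def inTwoHours(ts, prev_ts):
    return ts // 7200 == prev_ts // 7200

def inOneDay(ts, prev_ts):
    return ts // 86400 == prev_ts // 86400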
# UNABLE TO CONNECT -- cause: 9042 is the CQL native-protocol port;
# pycassa speaks Thrift, which listens on 9160 (rpc_port) by default.
import csv

from pycassa.pool import ConnectionPool
from pycassa.columnfamily import ColumnFamily

pool = ConnectionPool('test', ['127.0.0.1:9160'])
cf = ColumnFamily(pool, "testtable")

with open('test.csv', 'rb') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        print str(row)
        key = row['id']
        del row['id']
        cf.insert(key, row)

pool.dispose()

# TO RUN
# $ python seedCassandra.py
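# A quick way to confirm the Thrift port is reachable before seeding
# (SystemManager opens a plain Thrift connection; host is an assumption):
from pycassa.system_manager import SystemManager
sysm = SystemManager('127.0.0.1:9160')
print sysm.list_keyspaces()
sysm.close()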
from pycassa.types import *
from pycassa.system_manager import *
from pycassa.pool import ConnectionPool
from pycassa.columnfamily import ColumnFamily

def create_ks():
    # create the test column family (assumes the 'testing' keyspace already exists)
    sys = SystemManager()
    comparator = CompositeType(LongType(), BytesType())
    sys.create_column_family("testing", "testing", comparator_type=comparator)

pool = ConnectionPool('testing')
cf = ColumnFamily(pool, 'testing')

# Check the column added by the Haskell test script
# print [k for k in cf.get_range()]
# cf.insert("row2", {(125, 'oklahoma'): 'asdf'})
print cf.get('row1')
print cf.get('row2')  # should see: OrderedDict([((125, 'oklahoma'), 'asdf')])
import pycassa
from pycassa.pool import ConnectionPool
from pycassa import index
from pycassa.columnfamily import ColumnFamily

pool1 = ConnectionPool('MINDNET', ['localhost:9160'], timeout=10000000)
pool2 = ConnectionPool('MINDNET', ['213.136.81.102:9160'], timeout=10000000)

def migr(tab1, tab2, tb):
    # copy every row from tab1 to tab2, removing it from tab1, in chunks
    while True:
        cach = []
        r1 = tab1.get_range()
        for ky, col in r1:
            cach.append([ky, col])
            if len(cach) % 1000 == 0:
                print 'collect(', tb, '):', len(cach)
            if len(cach) >= 500000:
                break
        if len(cach) == 0:
            break
        # move the chunk using batch mutators so writes/removes are buffered
        b1 = tab2.batch(55000)
        b2 = tab1.batch(55000)
        for ky, col in cach:
            b1.insert(ky, col)
            b2.remove(ky)
        b1.send()
        b2.send()
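# A hedged usage sketch for the migration helper above, moving a column
# family from the local node to the remote one (CF name and direction are
# assumptions, not from the source):
src = pycassa.ColumnFamily(pool1, 'web_cache3')
dst = pycassa.ColumnFamily(pool2, 'web_cache3')
migr(src, dst, 'web_cache3')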
import json

from twitter import settings
from pycassa.pool import ConnectionPool
from pycassa.columnfamily import ColumnFamily
from pycassa.cassandra.ttypes import NotFoundException
from twitter.backend.base import TimelineFile as BaseTimelineFile
from twitter.backend.base import FollowerFile as BaseFollowerFile

print "Initializing connection pool..."

POOL = ConnectionPool(settings.CASSANDRA_KEYSPACE, settings.CASSANDRA_POOL, timeout=2)

FOLLOWERS = ColumnFamily(POOL, 'Followers')
USERTIMELINE = ColumnFamily(POOL, 'UserTimeline')
TIMELINE = ColumnFamily(POOL, 'Timeline')
COUNTERS = ColumnFamily(POOL, 'Counters')

class TimelineFile(BaseTimelineFile):
    def __init__(self, user_id):
        BaseTimelineFile.__init__(self, user_id)

    def get_first(self):
        # newest column first; grab its value (a tweet id)
        dct = USERTIMELINE.get(self.user_id, column_count=1, column_reversed=True)
        tweet_id = dct[dct.keys()[0]]
from bs4 import BeautifulSoup
import datetime
import hashlib
import numpy as np
import pandas as pd
import csv
import re
import os
from pycassa.pool import ConnectionPool
from pycassa.columnfamily import ColumnFamily

# Needs to be in the format of '169.53.141.8:9160'
pool = ConnectionPool('dnm', ['158.85.217.74:9160'])
cf = ColumnFamily(pool, 'products')

wdir = "/sandisk1/darknetmarket/silkroad2"  # Ensure the current directory is correctly set
os.chdir(wdir)

c = 0  # Counter
non_decimal = re.compile(r'[^\d.]+')  # Clean strings with numbers

##### CODE TO READ IN CSV OF BITCOIN PRICES GOES HERE #####
bitcoin = pd.read_csv('/sandisk1/darknetmarket/Bitcoin Prices.csv', sep=',')  # Reads in historical bitcoin prices
bitcoin['Date'] = pd.to_datetime(bitcoin['Date'])  # Converts 'Date' field to Datetime
#########################################################
import time

from pycassa.pool import ConnectionPool
from pycassa.columnfamily import ColumnFamily
from pycassa.cassandra.ttypes import NotFoundException

__all__ = ['get_user_by_username', 'get_friend_usernames',
    'get_follower_usernames', 'get_users_for_usernames', 'get_friends',
    'get_followers', 'get_timeline', 'get_userline', 'get_tweet', 'save_user',
    'save_tweet', 'add_friends', 'remove_friends', 'DatabaseError',
    'NotFound', 'InvalidDictionary', 'PUBLIC_USERLINE_KEY']

POOL = ConnectionPool('Twissandra')

USER = ColumnFamily(POOL, 'User')
FRIENDS = ColumnFamily(POOL, 'Friends')
FOLLOWERS = ColumnFamily(POOL, 'Followers')
TWEET = ColumnFamily(POOL, 'Tweet')
TIMELINE = ColumnFamily(POOL, 'Timeline')
USERLINE = ColumnFamily(POOL, 'Userline')

# NOTE: Having a single userline key to store all of the public tweets is not
#       scalable. Currently, Cassandra requires that an entire row (meaning
#       every column under a given key) be able to fit in memory. You can
#       imagine that after a while, the entire public timeline would exceed
#       available memory.
#
#       The fix for this is to partition the timeline by time, so we could use
#       a key like !PUBLIC!2010-04-01 to partition it per day. We could drill
#       down even further into hourly keys, etc. Since this is a demonstration
#       and that would add quite a bit of extra code, this exercise is left to
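# The partitioning the NOTE above describes, as a hedged sketch: write each
# public tweet under a per-day key and read a window by fanning out over the
# days it spans. Helper names are illustrative, and numeric timestamp column
# names are an assumption (the real schema may use TimeUUIDs):
def _public_key_for(timestamp):
    day = time.strftime('%Y-%m-%d', time.gmtime(timestamp))
    return '!PUBLIC!' + day

def save_public_tweet(tweet_id, timestamp):
    USERLINE.insert(_public_key_for(timestamp), {timestamp: tweet_id})

def get_public_timeline(start_ts, end_ts):
    tweets = {}
    day = start_ts - (start_ts % 86400)
    while day <= end_ts:
        try:
            tweets.update(USERLINE.get(_public_key_for(day),
                                       column_start=start_ts,
                                       column_finish=end_ts))
        except NotFoundException:
            pass
        day += 86400
    return tweets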
import time

from pycassa.pool import ConnectionPool
from pycassa.columnfamily import ColumnFamily
from pycassa.cassandra.ttypes import NotFoundException

__all__ = ['get_user_by_userid', 'DatabaseError', 'NotFound', 'InvalidDictionary']

POOL = ConnectionPool(keyspace='TEST', server_list=['localhost:9160'], prefill=False)

USER = ColumnFamily(POOL, 'Users')
Board = ColumnFamily(POOL, 'Board')

class DatabaseError(Exception):
    """
    The base error that functions in this module will raise when things go
    wrong.
    """
    pass

class NotFound(DatabaseError):
    pass

class InvalidDictionary(DatabaseError):
    pass

def get_user_by_userid(userid):
    try:
        user = USER.get(str(userid), columns=['name', 'password'])
    except NotFoundException:
        raise NotFound('User %s not found' % (userid,))
    return user
def setup_keyspace(sysm):
    if keyspace in sysm.list_keyspaces():
        sysm.drop_keyspace(keyspace)
    sysm.create_keyspace(keyspace, system_manager.SIMPLE_STRATEGY,
                         {'replication_factor': '1'})
    sysm.create_column_family(keyspace, columnfamily)
    sysm.alter_column(keyspace, columnfamily, 'strcol', system_manager.ASCII_TYPE)
    sysm.alter_column(keyspace, columnfamily, 'intcol', system_manager.INT_TYPE)
    sysm.alter_column(keyspace, columnfamily, 'longcol', system_manager.LONG_TYPE)
    sysm.alter_column(keyspace, columnfamily, 'floatcol', system_manager.FLOAT_TYPE)
    sysm.alter_column(keyspace, columnfamily, 'doublecol', system_manager.DOUBLE_TYPE)
    sysm.alter_column(keyspace, columnfamily, 'datecol', system_manager.DATE_TYPE)

if __name__ == "__main__":
    if len(sys.argv) < 2:
        print "Error. Pass the name of the YAML configuration file as parameter."
        sys.exit(-1)
    conffile = sys.argv[1]
    sysm = system_manager.SystemManager()
    setup_keyspace(sysm)
    pool = ConnectionPool(keyspace)
    cf = ColumnFamily(pool, columnfamily)
    # Write and read keys
    write(cf)
    clist = read_cl(cf)
    #print "First rows of clist ->", clist[:10]
    sarray = read_np(cf, conffile)
    print "First rows of sarray->", repr(sarray[:10])
def setUp(self):
    n = 10000
    self.weibos = self._load_items(n)
    pool = ConnectionPool('master_timeline',
                          server_list=['219.224.135.60:9160', '219.224.135.61:9160'],
                          pool_size=10)
    col_fam = pycassa.ColumnFamily(pool, 'weibos')
    self.weibos_col_fam = col_fam
import random
import sys
import pycassa
from random import choice
from random import sample
from pycassa.index import *
from pycassa.pool import ConnectionPool
from pycassa.columnfamily import ColumnFamily
from datetime import datetime

t1 = datetime.now()
pool = ConnectionPool('JBJ')
col_fam_master = pycassa.ColumnFamily(pool, 'Master')
delta = 0.003
T0 = 2

# Function: count how many of a node's neighbors already carry the given color
def get_color_degree(graph, index, color):
    neighbors = list(graph[index][1])
    energy = 0
    for neighbor in neighbors:
        if graph[neighbor][0] == color:
            energy += 1
    return energy

# Function: pick a random neighbor of the given node
def get_neighbor(graph, index):
    temp = list(graph[index][1])
    return temp[random.randint(0, len(temp) - 1)]
from pycassa.cassandra.ttypes import *
from pycassa.pool import ConnectionPool
from pycassa.columnfamily import ColumnFamily

try:
    cp = ConnectionPool("demo")
    cf = ColumnFamily(cp, "Test3")
    # insert() takes a dict of columns, not a bare value
    # (the column name 'col' is a placeholder)
    cf.insert('2345', {'col': 'ss'})
    x = cf.get('1234')
    print(x)
except InvalidRequestException as e:
    print("ERROR " + e.why)
except NotFoundException as e:
    print("ERROR " + e.why)
#!/usr/bin/python
#-*- coding:utf-8 -*-

from pycassa.pool import ConnectionPool
from pycassa.columnfamily import ColumnFamily

pool = ConnectionPool('MyKeyspace')
cf = ColumnFamily(pool, 'MyCF')

#cf.insert('row_key', {'col_name': 'col_val'})
#cf.insert('row_key', {'col_name': 'col_val', 'col_name2': 'col_val2'})
#cf.batch_insert({'row1': {'name1': 'val1', 'name2': 'val2'}, 'row2': {'foo': 'bar'}})

print cf.get('row_key')
print cf.get('row_key', columns=['col_name', 'col_name2'])

#for i in xrange(10):
#    cf.insert('row_key', {str(i): 'val'})

print cf.get('row_key', column_start='5', column_finish='7')
print cf.get('row_key', column_reversed=True, column_count=3)
print cf.multiget(['row1', 'row2'])

result = cf.get_range(start='row_key5', finish='row_key7')
for key, columns in result:
    print key, '=>', columns
import sys
from datetime import datetime as dt
from functools import wraps
import zlib

import msgpack
import pycassa
from pycassa.pool import ConnectionPool
from pycassa.index import create_index_clause, create_index_expression
from pycassa.cassandra.ttypes import NotFoundException, ConsistencyLevel

from pyhackers.common import unix_time_millisecond, time_with_ms, epoch_to_date, unix_time
from pyhackers.config import config

pool = ConnectionPool("sweetio", [config.cassandra])

status_cf = pycassa.ColumnFamily(pool, "status")
user_timeline_cf = pycassa.ColumnFamily(pool, "user_timeline")
user_cf = pycassa.ColumnFamily(pool, "user2")
channel_timeline_cf = pycassa.ColumnFamily(pool, "channel_timeline")
# create column family user_following_timeline with comparator = IntegerType;
user_following_timeline_cf = pycassa.ColumnFamily(pool, "user_following_timeline")
counters_cf = pycassa.ColumnFamily(pool, "counters")
status_upvotes_cf = pycassa.ColumnFamily(pool, "status_upvotes")
status_downvotes_cf = pycassa.ColumnFamily(pool, "status_downvotes")
status_replies_cf = pycassa.ColumnFamily(pool, "status_replies")
status_resweets_cf = pycassa.ColumnFamily(pool, "status_resweets")
status_favs_cf = pycassa.ColumnFamily(pool, "status_favs")
user_follower_cf = pycassa.ColumnFamily(pool, "user_followers")
user_following_cf = pycassa.ColumnFamily(pool, "user_following")
def query2(user_id):
    pool = ConnectionPool('tuitterdb')
    followsTweets_family = ColumnFamily(pool, 'followsTweets')
    # Print the tweets of the followers of the user supplied in the parameter.
    query(followsTweets_family, user_id)
    pool.dispose()
        with open(filename, 'w') as f:
            f.write(zlib.compress(cPickle.dumps(
                self.current_day_bitarray, protocol=cPickle.HIGHEST_PROTOCOL)))

    def union_current_day(self, bf):
        """Union only the current_day of another BF."""
        self.bitarray = self.bitarray | bf.current_day_bitarray

if __name__ == "__main__":
    import numpy as np
    from pycassa.pool import ConnectionPool

    pool = ConnectionPool('parsely')
    bf = DailyTemporalBloomFilter(100000, 0.01, 60, 'session_site', './', pool)

    random_items = [str(r) for r in np.random.randn(200000)]
    for item in random_items[:100000]:
        bf.add(item)

    false_positive = 0
    for item in random_items[100000:200000]:
        if item in bf:
            false_positive += 1

    print "Error rate (false positive): %s" % str(float(false_positive) / 100000)
#!/usr/bin/python
#-*- coding:utf-8 -*-

import csv
import glob
from pycassa.system_manager import *
from pycassa.pool import ConnectionPool
from pycassa.columnfamily import ColumnFamily

sys = SystemManager('localhost:9160')
#sys.create_keyspace('employees', SIMPLE_STRATEGY, {'replication_factor': '1'})
pool = ConnectionPool('employees')

filenames = glob.glob('employees/*.csv')
for filename in filenames:
    only_name = filename.split('/')[-1].split('.')[0]
    print only_name
    #sys.create_column_family('employees', only_name, super=False)
    #sys.drop_column_family('employees', only_name)
    cf = ColumnFamily(pool, only_name)
    csv_file = open(filename, 'rb')
    reader = csv.reader(csv_file)
    r = 1
    for row in reader:
        c = 1
        for col in row:
            cf.insert('row' + str(r), {'col' + str(c): col})
            c += 1
        r += 1
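# One insert per cell costs a round trip per column; a hedged batched variant
# of the loader loop above, one mutation per CSV row (same 'rowN'/'colN'
# naming scheme; function name is illustrative):
def load_csv_row_per_insert(cf, filename):
    with open(filename, 'rb') as csv_file:
        for r, row in enumerate(csv.reader(csv_file), start=1):
            row_dict = dict(('col' + str(c), col)
                            for c, col in enumerate(row, start=1))
            cf.insert('row' + str(r), row_dict)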
#nodeCfg = {'ip': '172.16.40.147', 'hostname': 'centos123', 'user': '******',
#           'passwd': '1', 'desc': u'没有表述'}  # desc: "no description"
nodeinfo1 = {'software': 'apache2', 'ver': '2.0.1', 'docbase': '/var/www'}
nodeinfo2 = {'software': 'cassandra', 'ver': '1.0.12', 'seeds': '172.16.40.145'}
cols = [{'software': UTF8_TYPE, 'ver': UTF8_TYPE}]
#CreateCFByDefaultConn(ks, testcf, cols)

#ip_name:
#  apache:ver
#  apache:docbase
#  cassandra:ver
#  cassandra:seeds

#comparator = CompositeType(UTF8Type(), UTF8Type(), UTF8Type())
#cols = [{"param": comparator}]
#CreateCompositeCF(getConnectString()[0], ks, testcf, None, [comparator])

pool = ConnectionPool(ks, getConnectString())
#print(pool)
# key = '172.16.40.145:cent_client1:' + nodeinfo1.get('software')
# print(key)
#UpdateValue(pool, testcf[0], '172.16.40.147',
#            {('172.16.40.147', 'centos123', 'tomcat', '7.0', 'port'): '8080'})
#key = '172.16.40.145:cent_client1:' + nodeinfo2.get('software')
#print(key)
#UpdateValue(pool, testcf[0], key, nodeinfo2)
#s = GetValue(pool, 'testcf', '172.16.40.145')
#print(s)

# update column family testcf with column_metadata=[{column_name: docbase,
#     validation_class: UTF8Type, index_type: KEYS}]
'''
CREATE TABLE testcf (
    key ascii,
import pycassa
from pycassa.pool import ConnectionPool
from pycassa.columnfamily import ColumnFamily
import csv
import time

pool = ConnectionPool('highwaydata',
                      ['10.138.0.5', '10.138.0.4', '10.138.0.3'],
                      use_threadlocal=False, pool_size=3)

detectorFile = '/home/highway_data/csv_fies/ProjectData-Cloud2015/freeway_detectors.csv'
loopFile = '/home/highway_data/csv_fies/ProjectData-Cloud2015/freeway_loopdata.csv'

superLoops_start_time = time.time()
print('starting to load detector loopdata supercolumn family')

with open(detectorFile, 'rU') as fin:
    cin = csv.DictReader(fin)
    detectorData = [row for row in cin]

col_fam_detectors = ColumnFamily(pool, 'superLoops')
for detector in detectorData:
    detectorid = detector['detectorid']
    col_fam_detectors.insert(
        detectorid, {
            'detectorInfo': {
                'highwayid': detector['highwayid'],
                'milepost': detector['milepost'],
                'locationtext': detector['locationtext'],
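# A hedged read-back sketch for the super-column layout above: fetch just the
# 'detectorInfo' super column for one detector (the id value is illustrative):
info = col_fam_detectors.get('1345', super_column='detectorInfo')
print(info.get('highwayid'), info.get('milepost'))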
def setup_module():
    global pool
    credentials = {'username': '******', 'password': '******'}
    pool = ConnectionPool(TEST_KS, pool_size=10, credentials=credentials)
    'mobile:Your Club Specials',
    'mobile:Weekly Specials',
    'mobile:savings'
]

pageNameList = [
    'mobile:safeway:savings',
    'mobile:safeway:savings:couponctr',
    'mobile:safeway:savings:personaldeal',
    'mobile:safeway:savings:clubspecial',
    'mobile:safeway:Weekly Specials',
    'mobile:safeway:Your Club Specials'
]

print 'Connecting to Cassandra ' + gKeyspace + '/' + gColumnFamily + '...'
pool = ConnectionPool(gKeyspace, ['10.5.14.58:9160'])
deviceLogCF = pycassa.ColumnFamily(pool, gColumnFamily)
print 'Connected to ' + gKeyspace + '/' + gColumnFamily

if len(sys.argv) < 2:
    usage()
    sys.exit()

operation = sys.argv[1]
option = "full"
if len(sys.argv) >= 3:
    option = sys.argv[2]

if operation == 'insert':
    # profile1()
#EXEC
import mdER
import mdNeural
import umisc
import sys

sys.path.append('./pymongo')
sys.path.append('./pycassa')

import pycassa
from pycassa.pool import ConnectionPool
from pycassa import index
from pycassa.columnfamily import ColumnFamily
import pymongo
import bson

#============ local test database =============================
pool2 = ConnectionPool('MINDNET', ['localhost:9160'], timeout=10)
to_posting = pycassa.ColumnFamily(pool2, 'to_posting')
wb3 = pycassa.ColumnFamily(pool2, 'web_cache3')
to_posting2 = pycassa.ColumnFamily(pool2, 'to_posting')
to_posting3 = pycassa.ColumnFamily(pool2, 'to_posting')

#=========== production database ==============================
'''
MONGO_URL='mongodb://*****:*****@91.205.172.85:27017/mdnet'
connMC = pymongo.Connection(MONGO_URL)
dbMC = connMC.mdnet
to_posting1 = dbMC['to_posting']
'''
'''
pool2 = ConnectionPool('MINDNET', ['79.143.185.3:9160'], timeout=10000)
#to_posting2 = pycassa.ColumnFamily(pool2, 'to_posting')
to_posting3 = pycassa.ColumnFamily(pool2, 'to_posting2')
from pycassa import index
import logging
from StringIO import StringIO

logging.basicConfig(level=logging.DEBUG)
log = logging.getLogger('pblnksExtra')
ch = logging.StreamHandler()
lbuffer = StringIO()
logHandler = logging.StreamHandler(lbuffer)
log.addHandler(logHandler)
log.addHandler(ch)

pool2 = ConnectionPool('MINDNET', ['79.143.185.3:9160'], timeout=10)
tab2 = pycassa.ColumnFamily(pool2, 'cache_products')
wb2 = pycassa.ColumnFamily(pool2, 'web_cache3')  # place to index

def short_url(urllong):
    return bitly.short_url(urllong)

def lomadeezar_links(links_tw):
    areturn = []
    lnk = []
    ind = 1
    for l in links_tw:
        #l = urllib.quote(l)
        lnk.append(['link' + str(ind), l])
def test_pool(self):
    listener = StatsLogger()
    pool = ConnectionPool(pool_size=5, max_overflow=5, recycle=10000,
                          prefill=True, pool_timeout=0.1, timeout=1,
                          keyspace='PycassaTestKeyspace', credentials=_credentials,
                          listeners=[listener], use_threadlocal=False)
    conns = []
    for i in range(10):
        conns.append(pool.get())

    assert_equal(listener.stats['created']['success'], 10)
    assert_equal(listener.stats['created']['failure'], 0)
    assert_equal(listener.stats['checked_out'], 10)
    assert_equal(listener.stats['opened'], {'current': 10, 'max': 10})

    # Pool is maxed out now
    assert_raises(NoConnectionAvailable, pool.get)
    assert_equal(listener.stats['created']['success'], 10)
    assert_equal(listener.stats['checked_out'], 10)
    assert_equal(listener.stats['opened'], {'current': 10, 'max': 10})
    assert_equal(listener.stats['at_max'], 1)

    for i in range(0, 5):
        pool.return_conn(conns[i])
    assert_equal(listener.stats['disposed']['success'], 0)
    assert_equal(listener.stats['checked_in'], 5)
    assert_equal(listener.stats['opened'], {'current': 5, 'max': 10})

    for i in range(5, 10):
        pool.return_conn(conns[i])
    assert_equal(listener.stats['disposed']['success'], 5)
    assert_equal(listener.stats['checked_in'], 10)

    conns = []

    # These connections should come from the pool
    for i in range(5):
        conns.append(pool.get())
    assert_equal(listener.stats['created']['success'], 10)
    assert_equal(listener.stats['checked_out'], 15)

    # But these will need to be made
    for i in range(5):
        conns.append(pool.get())
    assert_equal(listener.stats['created']['success'], 15)
    assert_equal(listener.stats['checked_out'], 20)

    assert_equal(listener.stats['disposed']['success'], 5)
    for i in range(10):
        conns[i].return_to_pool()
    assert_equal(listener.stats['checked_in'], 20)
    assert_equal(listener.stats['disposed']['success'], 10)

    assert_raises(InvalidRequestError, conns[0].return_to_pool)
    assert_equal(listener.stats['checked_in'], 20)
    assert_equal(listener.stats['disposed']['success'], 10)

    print("in test:", id(conns[-1]))
    conns[-1].return_to_pool()
    assert_equal(listener.stats['checked_in'], 20)
    assert_equal(listener.stats['disposed']['success'], 10)

    pool.dispose()
import os, sys
import pycassa
import logging
from pycassa.pool import ConnectionPool
from pycassa.columnfamily import ColumnFamily
from datetime import datetime as d

__author__ = 'rahul'

logging.basicConfig(filename="example.log", level=logging.DEBUG,
                    format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S')

pool = ConnectionPool('icepice')
col_fam = ColumnFamily(pool, 'info')

# Sample insert operations
#col_fam.insert('row_key', {'col_name': 'value'})

'''Sample get operations.
Need to wrap the get operations in an exception block for safe failing
from 'key not found' errors.
'''
try:
    for i in range(0, 1000):
        s = d.now()
        col_fam.get(str(i))  # row keys are strings
        logging.info(str(i) + "=>" + str((d.now() - s).microseconds))
except Exception:
    logging.info("Error" + str(sys.exc_info()[0]))
def __init__(self, keyspace, column_family_name):
    self.pool = ConnectionPool(keyspace, cassandra_settings.NODE_POOL)
    self.cf = ColumnFamily(self.pool, column_family_name)
    self.batch = {}
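# self.batch above is a plain dict of buffered mutations; a hedged sketch of
# the flush this implies (the method name and semantics are assumptions),
# using pycassa's ColumnFamily.batch_insert to send all buffered rows at once:
def flush(self):
    if self.batch:
        self.cf.batch_insert(self.batch)  # {row_key: {col_name: value}}
        self.batch = {}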
# Include python library for pycassa
import sys
import functions
import node
import pycassa
from pycassa.index import *
from datetime import datetime

# Connect to Cassandra instance
from pycassa.pool import ConnectionPool
pool = ConnectionPool('OSN')
#pool = ConnectionPool('OSN', ['localhost:9160'])

from pycassa.columnfamily import ColumnFamily
col_fam_master = pycassa.ColumnFamily(pool, 'Master')
col_fam_replica = pycassa.ColumnFamily(pool, 'Replica')
col_fam_edge = pycassa.ColumnFamily(pool, 'Edge')
col_fam_mme = pycassa.ColumnFamily(pool, 'Master_Master_Edge')
col_fam_mse = pycassa.ColumnFamily(pool, 'Master_Slave_Edge')

# Input: number of servers & number of replicas (K-redundancy)
total_servers = 4
total_replicas = 2

# Variables
server_id = 0
num_replica = 0
replica_id = 0
total_replicas_needed = 0
# set up the cassandra object
cass = pycassa.system_manager.SystemManager('localhost')

# Normally you wouldn't drop the keyspace first;
# I only do it here to make everything clean
print "Dropping keyspace"
if 'jedberg_test' in cass.list_keyspaces():
    cass.drop_keyspace('jedberg_test')

# create the keyspace
print "Creating keyspace"
cass.create_keyspace('jedberg_test', topology, {'replication_factor': '1'})
cass.ks = 'jedberg_test'

pool = ConnectionPool('jedberg_test')
conn = pool.get()

# create the column families
families = ['collected_properties', 'collection_cache_by_times', 'collections_by_cache']
print "Creating column families"
for fam in families:
    cass.create_column_family(cass.ks, fam)

# Let's see if those keyspaces are set up correctly
print "Keyspaces: "
print cass.list_keyspaces()
print
def serve_stats(dmu, dmu2):
    global atu_reg
    s.bind((host, port))  # Bind to the port
    s.listen(5)           # Now wait for client connection.
    while True:
        c, addr = s.accept()  # Establish connection with client.
        #atu_reg = 'Got connection from', addr
        msg = c.recv(1024)
        #print addr, ' >> ', msg
        msg = str(atu_reg)
        c.send(msg)

atu_reg = 'Connect to cassandra...'
pool2 = ConnectionPool('MINDNET', ['91.205.172.85:9160'], timeout=10000)
fcb = pycassa.ColumnFamily(pool2, 'fcb_users2')
fcb2 = pycassa.ColumnFamily(pool2, 'fcb_users3')

thread.start_new_thread(serve_stats, (0, 0))

ind_files = 1
total_collected = 0

import os.path
def query1(user_id):
    pool = ConnectionPool('tuitterdb')
    userTweets_family = ColumnFamily(pool, 'userTweets')
    # Print the tweets of the user supplied in the parameter.
    query(userTweets_family, user_id)
    pool.dispose()
import pycassa
from pycassa.pool import ConnectionPool
from pycassa.columnfamily import ColumnFamily
import csv

pool = ConnectionPool('highwayData', ['localhost:9160'])

# change to not super column
col_fam = ColumnFamily(pool, 'stationid')
print('\n\ngetting record for station 1098 from the stations column family')
print(col_fam.get('1098'))
print('\n\n')

col_fam_detectors = ColumnFamily(pool, 'detectors')
print('getting record for detector 1345, 09-15-2011 from the detectors & loopdata super-column family')
print('record check: 1345,2011-09-15 00:00:00-07,0,,0,0,0')
print(col_fam_detectors.get('1345', columns=['2011-09-15 00:00:00-07']))
print('\n\n')
#print(col_fam_detectors.get('1345'))
#1346,2011-09-24 21:21:20-07,7,63,11,2,0
#1348,2011-11-06 03:53:20-08,0,,0,0,0
# capture web_know
import pycassa
from pycassa.pool import ConnectionPool
from pycassa import index
from pycassa.columnfamily import ColumnFamily

pool2 = ConnectionPool('MINDNET', ['91.205.172.85:9160'], timeout=10000)
pool1 = ConnectionPool('MINDNET', ['79.143.185.3:9160'], timeout=10000)

tb_web1 = pycassa.ColumnFamily(pool1, 'web_know')
#tb_web2 = pycassa.ColumnFamily(pool2, 'web_know2')  # unused alternate source CF
tb_web2 = pycassa.ColumnFamily(pool2, 'web_know')

#===
tb_web1.truncate()
rg1 = tb_web2.get_range()
ind = 0
for k, r in rg1:
    tb_web1.insert(k, r)
    print r
    ind += 1
    if ind % 1000 == 0:
        print 'ind:', ind
from pycassa.index import *
from pycassa.cassandra import ttypes
from pycassa.pool import ConnectionPool        # needed for the pool below
from pycassa.columnfamily import ColumnFamily  # needed for the CF below
import json
import datetime

class JSONDateTimeEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, (datetime.date, datetime.datetime)):
            return obj.isoformat()
        else:
            return json.JSONEncoder.default(self, obj)

def has_many_woeids(x):
    key, values = x
    return len(values) > 1

pool = ConnectionPool('processing_llama_Processor')
trends = ColumnFamily(pool, 'Trend')

for trend_name, country_specifics in filter(has_many_woeids, trends.get_range()):
    print json.dumps(country_specifics, sort_keys=True, indent=4,
                     separators=(',', ': '), cls=JSONDateTimeEncoder)
    #track_trend(trend_name, country_specifics['query'], country_specifics.keys)
import time
import pycassa
from pycassa.pool import ConnectionPool
from pycassa.columnfamily import ColumnFamily
import sys

sys.path.append('/Neural')
import conn3

conn = conn3.conn_mx

pool2 = ConnectionPool('MINDNET', ['localhost:9160'], timeout=10)
tab2 = pycassa.ColumnFamily(pool2, 'fcb_users')

'''
create column family fcb_users
    with comparator = UTF8Type
    and default_validation_class = UTF8Type
    and column_metadata = [
        { column_name : user_name, validation_class : UTF8Type,
          index_name : user_name_idx1, index_type : 0 },
        { column_name : id, validation_class : UTF8Type,
          index_name : id_idx1, index_type : 0 },
        { column_name : u_name, validation_class : UTF8Type,
class Cassa(object):
    '''
    Provides a simple key=value functionality built on a cassandra
    table with a key and a single column. Used in ZookeeperTaskQueue to
    replace the two tables that grow in size with the number of tasks
    rather than the number of workers.
    '''

    def __init__(self, namespace, server_list=['localhost:9160']):
        # save cassandra server
        self.server_list = server_list
        self.namespace = namespace
        self._closed = False
        #setup_logging(self)

        # Connect to the server creating the namespace if it doesn't
        # already exist
        try:
            self.pool = ConnectionPool(namespace, self.server_list,
                                       max_retries=500, pool_timeout=600, timeout=10)
        except pycassa.InvalidRequestException:
            self._create_namespace(namespace)
            self.pool = ConnectionPool(namespace, self.server_list,
                                       max_retries=500, pool_timeout=600, timeout=10)

        try:
            self._tasks = pycassa.ColumnFamily(self.pool, 'tasks')
        except pycassa.NotFoundException:
            self._create_column_family('tasks',
                                       key_validation_class=ASCII_TYPE,
                                       bytes_columns=['task_data'])
            self._tasks = pycassa.ColumnFamily(self.pool, 'tasks')

        try:
            self._available = pycassa.ColumnFamily(self.pool, 'available')
        except pycassa.NotFoundException:
            self._create_column_family('available',
                                       key_validation_class=ASCII_TYPE,
                                       bytes_columns=['available'])
            self._available = pycassa.ColumnFamily(self.pool, 'available')

        try:
            self._task_count = pycassa.ColumnFamily(self.pool, 'task_count')
        except pycassa.NotFoundException:
            self._create_counter_column_family('task_count',
                                               key_validation_class=ASCII_TYPE,
                                               counter_columns=['task_count'])
            self._task_count = pycassa.ColumnFamily(self.pool, 'task_count')
            self._task_count.insert('RowKey', {'task_count': 0})

        try:
            self._available_count = pycassa.ColumnFamily(self.pool, 'available_count')
        except pycassa.NotFoundException:
            self._create_counter_column_family('available_count',
                                               key_validation_class=ASCII_TYPE,
                                               counter_columns=['available_count'])
            self._available_count = pycassa.ColumnFamily(self.pool, 'available_count')
            self._available_count.insert('RowKey', {'available_count': 0})

    def delete_namespace(self):
        sm = SystemManager(random.choice(self.server_list))
        sm.drop_keyspace(self.namespace)
        sm.close()

    def _create_namespace(self, namespace):
        sm = SystemManager(random.choice(self.server_list))
        sm.create_keyspace(namespace, SIMPLE_STRATEGY, {'replication_factor': '1'})
        sm.close()

    def _create_column_family(self, family, bytes_columns=[],
                              key_validation_class=TIME_UUID_TYPE):
        '''
        Creates a column family of the name 'family' and sets any of
        the names in the bytes_columns list to have the BYTES_TYPE.

        key_validation_class defaults to TIME_UUID_TYPE and could also
        be ASCII_TYPE for md5 hash keys, like we use for 'inbound'
        '''
        sm = SystemManager(random.choice(self.server_list))
        # sys.create_column_family(self.namespace, family, super=False)
        sm.create_column_family(self.namespace, family, super=False,
                                key_validation_class=key_validation_class,
                                default_validation_class=TIME_UUID_TYPE,
                                column_name_class=ASCII_TYPE)
        for column in bytes_columns:
            sm.alter_column(self.namespace, family, column, BYTES_TYPE)
        sm.close()

    def _create_counter_column_family(self, family, counter_columns=[],
                                      key_validation_class=UTF8_TYPE):
        '''
        Creates a counter column family of the name 'family' and sets any of
        the names in the counter_columns list to have COUNTER_COLUMN_TYPE.

        key_validation_class defaults to UTF8_TYPE and could also
        be ASCII_TYPE for md5 hash keys, like we use for 'inbound'
        '''
        sm = SystemManager(random.choice(self.server_list))
        sm.create_column_family(self.namespace, family, super=False,
                                key_validation_class=key_validation_class,
                                default_validation_class="CounterColumnType",
                                column_name_class=ASCII_TYPE)
        for column in counter_columns:
            sm.alter_column(self.namespace, family, column, COUNTER_COLUMN_TYPE)
        sm.close()

    def tasks(self, key_prefix=''):
        '''
        generate the data objects for every task
        '''
        for row in self._tasks.get_range():
            logger.debug(row)
            if not row[0].startswith(key_prefix):
                continue
            data = json.loads(row[1]['task_data'])
            data['task_key'] = row[0]
            yield data

    def put_task(self, key, task_data):
        try:
            found = self._tasks.get(key, column_count=1)
            exists = True
        except pycassa.cassandra.ttypes.NotFoundException:
            exists = False
        self._tasks.insert(key, {'task_data': json.dumps(task_data)})
        if not exists:
            self._task_count.insert('RowKey', {'task_count': 1})
        return exists

    def get_task(self, key):
        data = self._tasks.get(key)
        return json.loads(data['task_data'])

    def pop_task(self, key):
        self._tasks.remove(key)
        self._task_count.insert('RowKey', {'task_count': -1})
        return key

    @property
    def task_keys(self):
        c = 0
        for key, _ in self._tasks.get_range(column_count=0, filter_empty=False):
            c += 1
            yield key

    def num_tasks(self):
        data = self._task_count.get('RowKey')
        return data['task_count']

    def num_available(self):
        data = self._available_count.get('RowKey')
        return data['available_count']

    def put_available(self, key):
        ## closest thing to storing only the key
        try:
            found = self._available.get(key, column_count=1)
            exists = True
        except pycassa.cassandra.ttypes.NotFoundException:
            exists = False
        if not exists:
            self._available.insert(key, {'available': ''})
            self._available_count.insert('RowKey', {'available_count': 1})

    #def push_batch(self, row_iter):
    #    '''
    #    Push opaque vertex data objects into the inbound queue
    #    '''
    #    return self._tasks.batch_insert({k: json.dumps(v) for k, v in row_iter})

    def get_random_available(self, max_iter=10000):
        '''
        get a random key out of the first max_iter rows
        '''
        c = 1
        keeper = None
        ## note the ConsistencyLevel here.  If we do not do this, and
        ## get all slick with things like column_count=0 and filter
        ## empty False, then we can get keys that were recently
        ## deleted... EVEN if the default consistency would seem to
        ## rule that out!
        ## note the random start key, so that we do not always hit the
        ## same place in the key range with all workers
        #random_key = hashlib.md5(str(random.random())).hexdigest()
        #random_key = '0' * 32
        #logger.debug('available.get_range(%r)' % random_key)
        ## scratch that idea: turns out that using a random start key
        ## OR using row_count=1 can cause get_range to hang for hours
        ## why we need ConsistencyLevel.ALL on a single node is not
        ## clear, but experience indicates it is needed.
        ## note that putting a finite row_count is problematic in two
        ## ways:
        #  1) if there are more workers than max_iter, some will not
        #  get tasks
        #
        #  2) if there are more than max_iter records, then all workers
        #  have to wade through all of these just to get a task!  What
        #  we really want is a "pick random row" function, and that is
        #  probably best implemented using CQL3 token function via the
        #  cql python module instead of pycassa...
        for row in self._available.get_range(row_count=max_iter,
                                             read_consistency_level=pycassa.ConsistencyLevel.ALL):
            #for row in self._available.get_range(row_count=100):
            logger.debug('considering %r' % (row,))
            if random.random() < 1.0 / c:  # 1 / c would be integer division in Python 2
                keeper = row[0]
            if c == max_iter:
                break
            c += 1
        return keeper

    def in_available(self, key):
        try:
            row = self._available.get(key)
            return True
        except pycassa.NotFoundException:
            return False

    def pop_available(self, key):
        self._available.remove(key,
                               write_consistency_level=pycassa.ConsistencyLevel.ALL)
        self._available_count.insert('RowKey', {'available_count': -1})
        assert not self.in_available(key)
        return key

    def close(self):
        self._closed = True
        if hasattr(self, 'pool'):
            self.pool.dispose()
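# A hedged usage sketch for the Cassa class above (the namespace name and
# task key are illustrative; keys must be ASCII per the schema):
if __name__ == '__main__':
    cassa = Cassa('task_queue_test')
    cassa.put_task('md5-of-something', {'state': 'pending'})
    cassa.put_available('md5-of-something')
    key = cassa.get_random_available()
    if key is not None:
        print cassa.get_task(key)
        cassa.pop_available(key)
        cassa.pop_task(key)
    cassa.close()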