def load_builders(self, o):
    cf = ColumnFamily(self._pool, 'builders')
    indices = ColumnFamily(self._pool, 'indices')
    batch = cf.batch()
    i_batch = indices.batch()

    for builder_id, params in o.items():
        cat = params['category']
        columns = {
            'category': cat,
            'master': unicode(params['master_id']),
            'name': params['name'],
        }
        batch.insert(builder_id, columns)
        i_batch.insert('builder_category_to_builder_ids', {cat: {builder_id: ''}})
        i_batch.insert('master_id_to_slave_ids', {columns['master']: {builder_id: ''}})

        if len(params['slaves']):
            i_batch.insert('builder_id_to_slave_ids', {builder_id: {
                unicode(slave_id): '' for slave_id in params['slaves']}})

    batch.send()
    i_batch.send()

    return len(o)

def main(filename):
    data = open(filename)

    # Set up the connection pool
    pool = ConnectionPool('tuitterdb', ['localhost:9160'])

    # CF connections
    user_family = ColumnFamily(pool, 'user')
    tweet_family = ColumnFamily(pool, 'tweet')
    user_tweets_family = ColumnFamily(pool, 'userTweets')
    followers = ColumnFamily(pool, 'followers')
    followerTweets = ColumnFamily(pool, 'followsTweets')

    # Batch definitions
    user_batch = user_family.batch(queue_size=1000)
    followers_batch = followers.batch(queue_size=500)
    user_tweets_batch = user_tweets_family.batch(queue_size=500)
    followerTweets_batch = followerTweets.batch(queue_size=500)

    while True:  # loop
        line = data.readline()
        if line == "":  # readline() returns "" at EOF; this isn't handled properly
            break
        else:
            tweet = tweet_get(line)
            try:
                tweet_data = get_tweet_data(tweet)
                if check_user(tweet[u"from_user_id_str"]) == False:
                    # the sender isn't known yet, so create a user entry for them
                    sender = get_sender(tweet)
                    user_batch.insert(sender.user_id,
                                      {'user_name': sender.user_name,
                                       'screen_name': sender.from_user})
                    # insert the whole tweet into a userTweets column header
                    user_tweets_batch.insert(sender.user_id, {line: ''})
                if tweet[u"to_user"] is not None and check_user(tweet[u"to_user_id"]) == False:
                    to_user = get_to_user(tweet)
                    user_batch.insert(to_user.user_id,
                                      {'user_name': to_user.user_name,
                                       'screen_name': to_user.from_user})
                    followers_batch.insert(to_user.user_id, {sender.user_id: 'follower_id'})
                    # insert the whole tweet into a followsTweets column header for the to_user
                    followerTweets_batch.insert(to_user.user_id, {line: ''})
                if u"entities" in tweet:
                    # iterate over the user mentions and add them to users and followers if necessary
                    if tweet[u"entities"][u"user_mentions"] != []:
                        user_mentions = get_mentions(tweet)
                        for obj in user_mentions:
                            if check_user(obj.user_id) == False:
                                user_batch.insert(obj.user_id,
                                                  {'user_name': obj.user_name,
                                                   'screen_name': obj.from_user})
                                followers_batch.insert(obj.user_id, {'user_id': sender.user_id})
                                # insert the whole tweet into a followerTweets entry for the mentioned user
                                followerTweets_batch.insert(obj.user_id, {line: ''})
                    else:
                        continue
                tweet_family.insert(tweet_data.tweet_id,
                                    {'text': tweet_data.textbody,
                                     'user_id': sender.user_id,
                                     'timeanddate': tweet_data.timestamp})
            except Exception:
                # print the exception data with traceback and continue
                err = sys.exc_info()
                print "Broken cos %s %s %s" % (err[0], err[1], traceback.print_tb(err[2]))
                continue

    # Pools Closed.
    pool.dispose()

def main(filename):
    data = open(filename)

    # Set up the connection pool
    pool = ConnectionPool('tuitterdb', ['localhost:9160'])

    # CF connections
    user_family = ColumnFamily(pool, 'user')
    tweet_family = ColumnFamily(pool, 'tweet')
    user_tweets_family = ColumnFamily(pool, 'userTweets')
    #follows_tweets_family = ColumnFamily(pool, 'follows.tweets')
    followers = ColumnFamily(pool, 'followers')

    # Batch definitions
    user_batch = user_family.batch(queue_size=1000)
    followers_batch = followers.batch(queue_size=500)
    user_tweets_batch = user_tweets_family.batch(queue_size=500)

    while True:
        line = data.readline()
        if line == "":  # readline() returns "" at EOF, never None
            break
        else:
            tweet = tweet_get(line)
            try:
                tweet_data = get_tweet_data(tweet)
                if check_user(tweet[u"from_user_id_str"]) == False:
                    sender = get_sender(tweet)
                    user_batch.insert(sender.user_id,
                                      {'user_name': sender.user_name,
                                       'screen_name': sender.from_user})
                    user_tweets_batch.insert(sender.user_id,
                                             {tweet_data.tweet_id: tweet_data.timestamp})
                if tweet[u"to_user"] is not None and check_user(tweet[u"to_user_id"]) == False:
                    to_user = get_to_user(tweet)
                    user_batch.insert(to_user.user_id,
                                      {'user_name': to_user.user_name,
                                       'screen_name': to_user.from_user})
                    followers_batch.insert(to_user.user_id, {'user_id': sender.user_id})
                if u"entities" in tweet:
                    if tweet[u"entities"][u"user_mentions"] != []:
                        user_mentions = get_mentions(tweet)
                        for obj in user_mentions:
                            user_batch.insert(obj.user_id,
                                              {'user_name': obj.user_name,
                                               'screen_name': obj.from_user})
                            followers_batch.insert(obj.user_id, {'user_id': sender.user_id})
                    else:
                        continue
                tweet_family.insert(tweet_data.tweet_id,
                                    {'text': tweet_data.textbody,
                                     'user_id': sender.user_id,
                                     'timeanddate': tweet_data.timestamp})
            except Exception:
                err = sys.exc_info()
                print "Broken cos %s %s %s" % (err[0], err[1], traceback.print_tb(err[2]))
                continue

    # Pools Closed.
    pool.dispose()

#if __name__ == "__main__":
#    unittest.main()

def load_builds(self, o, builders):
    cf = ColumnFamily(self._pool, 'builds')
    batch = cf.batch()
    indices = ColumnFamily(self._pool, 'indices')
    i_batch = indices.batch()
    simple_indices = ColumnFamily(self._pool, 'simple_indices')
    si_batch = simple_indices.batch()

    # We defer counter inserts until the end because Python
    # increments are cheaper than Cassandra increments (hopefully).
    counters = {
        'builder_number': Counter(),
        'builder_duration': Counter(),
        'builder_number_by_day': {},
        'builder_duration_by_day': {},
        'builder_number_by_category': {},
        'builder_duration_by_category': {},
        'builder_number_by_day_and_category': {},
        'builder_duration_by_day_and_category': {},
    }

    existing_filenames = set(self._connection.filenames())

    for build in o:
        self._load_build(batch, i_batch, si_batch, counters, build,
                         builders, existing_filenames)

    batch.send()
    i_batch.send()
    si_batch.send()

    cf = ColumnFamily(self._pool, 'counters')
    for builder, count in counters['builder_number'].items():
        cf.add('builder_number', builder, count)

    for builder, count in counters['builder_duration'].items():
        cf.add('builder_duration', builder, count)

    cf = ColumnFamily(self._pool, 'super_counters')
    del counters['builder_number']
    del counters['builder_duration']

    for key, super_columns in counters.items():
        for super_column, counts in super_columns.items():
            for column, count in counts.items():
                cf.add(key, column, count, super_column)

    return len(o)

def truncate_log_metadata(self):
    for cf in ['build_timelines']:
        cf = ColumnFamily(self.pool, cf)
        cf.truncate()

    cf = ColumnFamily(self.pool, 'indices')
    for key in LOG_METADATA_INDICES:
        cf.remove(key)

    cf = ColumnFamily(self.pool, 'counters')
    for key in LOG_METADATA_COUNTERS:
        cf.remove(key)

    cf = ColumnFamily(self.pool, 'super_counters')
    for key in LOG_METADATA_SUPER_COUNTERS:
        cf.remove(key)

    cf = ColumnFamily(self.pool, 'builds')
    batch = cf.batch()

    # Remove log parsing state from builds.
    for key, cols in cf.get_range(columns=['log_parsing_version']):
        if 'log_parsing_version' not in cols:
            continue

        batch.remove(key, ['log_parsing_version'])

    batch.send()

def load_slaves(self, o):
    cf = ColumnFamily(self._pool, 'slaves')
    with cf.batch() as batch:
        for slave_id, name in o.items():
            batch.insert(slave_id, {'name': name})

    return len(o)

def load_masters(self, o):
    cf = ColumnFamily(self._pool, 'masters')
    with cf.batch() as batch:
        for master_id, info in o.items():
            batch.insert(master_id, info)

    return len(o)

def undo(self):
    if self.type == InsertCommand.INS_BASIC:
        ## I know that data for a basic insert is of this tuple type
        domain, row_key, basic_type_dict = self.data
        client = db_connection.get_client()
        cf = ColumnFamily(client, domain)
        cf.remove(row_key)
    elif self.type == InsertCommand.INS_OBJECT:
        ## call the delete operation for the object
        if self.data:
            self.data.delete(cascade=False)
    elif self.type == InsertCommand.INS_BATCH:
        domain, basic_type_item_dict = self.data
        client = db_connection.get_client()
        cf = ColumnFamily(client, domain)
        b = cf.batch()
        for row_key in basic_type_item_dict.keys():
            b.remove(row_key)
        b.send()

def update(self, values):
    """
    Changes an entity that already exists in the database.

    :param values: A list of (field, new-value) pairs.
    """
    pool = self.connection
    column_family_name = get_column_family()
    col_fam = CF(pool, column_family_name)

    pk_column = get_pk_column()
    pk_index = -1
    fields = self.get_fields()
    for index in range(len(fields)):
        if fields[index].column == pk_column:
            pk_index = index
            break
    if pk_index == -1:
        raise DatabaseError('Invalid primary key column.')

    b = col_fam.batch(
        write_consistency_level=self.connection.write_consistency_level)
    row_count = 0
    for result in self.results_iter():
        row_count += 1
        key = result[pk_index]
        for k, v in values.items():
            # queue the updated column value under this row's primary key
            b.insert(key, {k: v})
    b.send()
    return row_count

def do(self):
    if self.type == InsertCommand.INS_BASIC:
        ## I know that data for a basic insert is of this tuple type
        domain, row_key, basic_type_dict = self.data
        client = db_connection.get_client()
        cf = ColumnFamily(client, domain)
        cf.insert(row_key, basic_type_dict)
    elif self.type == InsertCommand.INS_OBJECT:
        ## call the save operation for the object
        if self.data:
            self.data.save()
    elif self.type == InsertCommand.INS_BATCH:
        ## Again, I know data for a batch insert will be of the following tuple type
        domain, basic_type_item_dict = self.data
        client = db_connection.get_client()
        cf = ColumnFamily(client, domain)
        b = cf.batch()
        for row_key in basic_type_item_dict.keys():
            b.insert(row_key, basic_type_item_dict[row_key])
        b.send()

def add_data(opt):
    pool = ConnectionPool('CrashData', ['localhost:9160'])
    col_fam = ColumnFamily(pool, 'CrashInfo2')
    col_fam.insert('7d625afa-ca2b-41e7-bcf3-e180d2140202', {
        "useragent_locale": "en-US", "AdapterVendorID": "0x10de",
        "TotalVirtualMemory": "4294836224", "BreakpadReserveAddress": "44826624",
        "Theme": "classic/1.0", "Version": "29.0a1",
        "id": "{ec8030f7-c20a-464f-9b0e-13a3a9e97384}",
        "BIOS_Manufacturer": "stuff", "Vendor": "Mozilla",
        "uuid": "7d625afa-ca2b-41e7-bcf3-e180d2140202",
        "EMCheckCompatibility": "true", "Throttleable": "1", "throttle_rate": "100",
        "AvailablePageFile": "14036480000", "version": "29.0a1",
        "AdapterDeviceID": "0x1080", "ReleaseChannel": "nightly",
        "submitted_timestamp": "2014-02-02T23:32:59.584636+00:00",
        "buildid": "20140202030204",
        "Notes": "AdapterVendorID: 0x10de, AdapterDeviceID: 0x1080, AdapterSubsysID: 15803842, AdapterDriverVersion: 9.18.13.3158\nD2D? D2D+ DWrite? DWrite+ D3D10 Layers? D3D10 Layers+ D3D10 Layers- D3D9 Layers? D3D9 Layers- ",
        "CrashTime": "1391383937",
        "Winsock_LSP": "MSAFD Tcpip [TCP/IP] : 2 : 1 : %SystemRoot%\\system32\\mswsock.dll \n MSAFD Tcpip [UDP/IP] : 2 : 2 : \n MSAFD Tcpip [RAW/IP] : 2 : 3 : %SystemRoot%\\system32\\mswsock.dll \n MSAFD Tcpip [TCP/IPv6] : 2 : 1 : \n MSAFD Tcpip [UDP/IPv6] : 2 : 2 : %SystemRoot%\\system32\\mswsock.dll \n MSAFD Tcpip [RAW/IPv6] : 2 : 3 : \n RSVP TCPv6 Service Provider : 2 : 1 : %SystemRoot%\\system32\\mswsock.dll \n RSVP TCP Service Provider : 2 : 1 : \n RSVP UDPv6 Service Provider : 2 : 2 : %SystemRoot%\\system32\\mswsock.dll \n RSVP UDP Service Provider : 2 : 2 : ",
        "FramePoisonBase": "00000000f0de0000",
        "AvailablePhysicalMemory": "5240811520", "FramePoisonSize": "65536",
        "BreakpadReserveSize": "37748736", "StartupTime": "1391382356",
        "Add-ons": "%7B972ce4c6-7e08-4474-a285-3208198ce6fd%7D:29.0a1",
        "BuildID": "20140202030204", "SecondsSinceLastCrash": "930758",
        "ProductName": "Firefox", "legacy_processing": "0", "BlockedDllList": "",
        "AvailableVirtualMemory": "3497549824", "SystemMemoryUsePercentage": "38",
        "ProductID": "{ec8030f7-c20a-464f-9b0e-13a3a9e97384}"})

    crash_column_family = ColumnFamily(pool, opt.column_family_counter)
    crash_batch = crash_column_family.batch(queue_size=100)

    # Insert buckets of data hourly for crashes
    crash_seed_signatures = [
        "FakeSignature1", "FakeSignature2", "FakeSignature3",
        "FakeSignature4", "FakeSignature5", "FakeSignature6",
        "FakeSignature7", "FakeSignature8", "FakeSignature9"
    ]

    now = datetime.datetime.now()
    for i in xrange(100):
        current_time = now + datetime.timedelta(seconds=i)
        row_bucket = "h-%d" % int(current_time.strftime("%H"))
        print "Adding row_bucket %s" % row_bucket
        column_bucket = int(current_time.strftime("%M"))
        print "Adding to column_bucket %s" % column_bucket
        crash_batch.insert(row_bucket, {column_bucket: 1})

        next_sig = random.randint(0, len(crash_seed_signatures) - 1)
        #new_row_bucket = "%s-%s" % (row_bucket, crash_seed_signatures[next_sig])
        new_row_bucket = '{ "hour": "%d", "signature": "%s" }' % (
            int(current_time.strftime("%H")), crash_seed_signatures[next_sig]
        )
        print "Adding row_bucket %s, column_bucket %s" % (new_row_bucket, column_bucket)
        crash_batch.insert(new_row_bucket, {column_bucket: 1})
        print "Just put %s into cassandra." % crash_seed_signatures[next_sig]

    # flush any mutations still queued below the queue_size threshold
    crash_batch.send()

from pycassa.pool import ConnectionPool
from pycassa.columnfamily import ColumnFamily

pool = ConnectionPool("pykeyspace", ["localhost:9160"])
col_family = ColumnFamily(pool, "UserInfo")

col_family.insert("dosht2", {"email": "*****@*****.**", "name": "mostafa"})
# print col_family.get("dosht2", columns=["email"])['email']
print col_family.get("dosht2")

b = col_family.batch()
b.insert("dodo", {"email": "*****@*****.**"})
b.remove("dosht2", ["name"])
b.send()

print col_family.get("dosht2")
print col_family.multiget(["dosht", "dodo"])["dodo"]

from pycassa.types import *


class User(object):
    key = UTF8Type()  # an attribute named "key" is mandatory
    email = AsciiType()
    age = IntegerType()

    def __repr__(self):
        return "User(key: %s, email: %s, age: %s)" % (self.key, self.email, self.age)


from pycassa.columnfamilymap import ColumnFamilyMap

cfmap = ColumnFamilyMap(User, pool, "UserInfo")
user = User()
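
# Illustrative continuation (not part of the original snippet): one plausible way
# to round-trip the mapped object through the ColumnFamilyMap created above. The
# key, email, and age values below are made-up examples.
user.key = "dosht3"
user.email = "user3@example.com"
user.age = 30
cfmap.insert(user)           # writes the mapped attributes as columns of row "dosht3"
print cfmap.get("dosht3")    # rebuilds a User instance from the stored columns
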
import pycassa
from pycassa.pool import ConnectionPool
from pycassa.columnfamily import ColumnFamily

pool = ConnectionPool('ApplicationData', ['localhost:9160'])
col_fam = ColumnFamily(pool, 'UserInfo')

col_fam.insert('Diego', {'email': '*****@*****.**'})
readData = col_fam.get('Diego', columns=['email'])
col_fam.remove('Diego', columns=['email'])

# batch
b = col_fam.batch(queue_size=10)
b.insert('John', {'email': '*****@*****.**', 'state': 'IL', 'gender': 'M'})
b.insert('Jane', {'email': '*****@*****.**', 'state': 'CA'})
b.remove('John', ['gender'])
b.remove('Jane')
b.send()
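
# Illustrative follow-up (not part of the original snippet): read back the batched
# rows to see the net effect. 'John' should come back without the removed 'gender'
# column, and 'Jane' should be absent because the whole-row remove was queued after
# her insert in the same batch.
print col_fam.multiget(['John', 'Jane'])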