def test_Binify_Hours(self):
    """Check which second-level keys binify('hours', ...) returns.

    Ids are saved at seconds 58, 59, 60 and at second 3600. The test then
    expects binify('hours', 1, 2) to contain the 3600 key, and
    binify('hours', 0, 1) to contain the 58/59/60 keys but not the 3600 one.
    """
    self.dL = DimensionListModel('user_ids', mode='cassandra')
    bin_name = 'hours'

    # (first second, one-past-last second) of each stretch that gets data.
    for first, stop in ((58, 61), (3600, 3600 + 1)):
        for second in xrange(first, stop):
            # A random number of ids per second; only the presence of the
            # per-second key is asserted below, not the id count.
            for uid in xrange(int(uniform(1, 6))):
                self.dL.save(uid, second)

    second_hour = self.dL.binify(bin_name, 1, 2)
    self.assertIn(u'user_ids_by_seconds_3600', second_hour)

    first_hour = self.dL.binify(bin_name, 0, 1)
    print(first_hour)
    expected_keys = (
        u'user_ids_by_seconds_58',
        u'user_ids_by_seconds_59',
        u'user_ids_by_seconds_60',
    )
    for key in expected_keys:
        self.assertIn(key, first_hour)
    self.assertNotIn(u'user_ids_by_seconds_3600', first_hour)
def testGetDimensionListRangeByHours(self):
    """Check the per-hour bins returned by getByTime(..., binSize='hours').

    Four distinct ids (1, 2, 3, 4) are saved at timestamps below 3600, so
    the bin 'user_ids_by_hours_0' is expected to hold 4 entries. Ids saved
    afterwards at timestamp 4000 are expected to show up one by one in
    'user_ids_by_hours_1'.
    """
    dimension = 'user_ids'
    self.dL = DimensionListModel(dimension, 'cassandra')

    self.dL.save(dimension_id=1, timestamp=59)
    self.dL.save(dimension_id=2, timestamp=59)
    self.dL.save(dimension_id=3, timestamp=59)
    self.dL.save(dimension_id=1, timestamp=61)
    self.dL.save(dimension_id=4, timestamp=3000)

    # Hour 0: id 1 was saved twice but the test expects it counted once.
    id_stats = self.dL.getByTime(0, 1, binSize='hours')
    self.assertEqual(4, len(id_stats[u'user_ids_by_hours_0']), "wrong length")

    # Hour 1 grows as new ids arrive.
    # NOTE(review): renew=True presumably forces the hour bin to be
    # recomputed instead of served from a cache -- confirm in
    # DimensionListModel.getByTime.
    self.dL.save(dimension_id=4, timestamp=4000)
    id_stats = self.dL.getByTime(1, 2, binSize='hours', renew=True)
    self.assertEqual(1, len(id_stats[u'user_ids_by_hours_1']), "wrong length")

    self.dL.save(dimension_id=3, timestamp=4000)
    id_stats = self.dL.getByTime(1, 2, binSize='hours', renew=True)
    self.assertEqual(2, len(id_stats[u'user_ids_by_hours_1']), "wrong length")
def test_Binify_Minutes(self):
    """Check which second-level keys binify('minutes', ...) returns.

    Ids are saved at seconds 58, 59 and 60. binify('minutes', 0, 1) is
    expected to contain the keys for seconds 58 and 59 but not 60;
    binify('minutes', 1, 2) is expected to contain only the key for 60.
    """
    self.dL = DimensionListModel('user_ids', mode='cassandra')
    binSize = 'minutes'

    start_seconds = 58
    end_seconds = 61
    for curr_timestamp in xrange(start_seconds, end_seconds):
        # A random number of ids per second; only key presence is asserted.
        for user_id in xrange(int(uniform(1, 6))):
            self.dL.save(user_id, curr_timestamp)

    # The result used to be bound to a local named `list`, which shadowed
    # the builtin and was never read. The call itself is kept in case it
    # has side effects the assertions below rely on.
    # NOTE(review): confirm whether this call is needed at all.
    self.dL.getByTime(45, 75, 'seconds')

    binified = self.dL.binify(binSize, 0, 1)
    print(binified)
    self.assertIn(u'user_ids_by_seconds_58', binified)
    self.assertIn(u'user_ids_by_seconds_59', binified)
    self.assertNotIn(u'user_ids_by_seconds_60', binified)

    binified = self.dL.binify(binSize, 1, 2)
    self.assertNotIn(u'user_ids_by_seconds_58', binified)
    self.assertNotIn(u'user_ids_by_seconds_59', binified)
    self.assertIn(u'user_ids_by_seconds_60', binified)
    print(binified)
def testGetDimensionListRangeByDay(self):
    """Check how many day bins getByTime(..., binSize='days') returns.

    A few ids are saved at timestamps below 86400 and a batch is saved
    starting at timestamp 90000. The range (0, 2) is expected to yield two
    bins and the range (1, 2) one bin.
    """
    dimension = 'user_ids'
    self.dL = DimensionListModel(dimension, 'cassandra')

    self.dL.save(dimension_id=1, timestamp=59)
    self.dL.save(dimension_id=2, timestamp=59)
    self.dL.save(dimension_id=3, timestamp=59)
    self.dL.save(dimension_id=1, timestamp=61)
    self.dL.save(dimension_id=4, timestamp=3000)

    # Second batch, saved at timestamps 90000..90004 (past 86400).
    timestamp_start = 90000
    time_stamp_range = 5
    id_range = 10
    self.save_test_data(timestamp_start, time_stamp_range, id_range)

    id_stats = self.dL.getByTime(0, 2, binSize='days')
    self.assertEqual(2, len(id_stats), "wrong length")

    id_stats = self.dL.getByTime(1, 2, binSize='days')
    self.assertEqual(1, len(id_stats), "wrong length")
    print(id_stats)
    # NOTE(review): the original carried disabled assertions on the length
    # of the individual 'user_ids_by_days_N' bins; why they were disabled
    # is not visible here.
def testSaveDimensionListRangeBySeconds(self):
    """Check the per-second bins returned by getByTime with its default bin size.

    save_test_data() is asked for 50 consecutive timestamps starting at
    4000 with 98 ids each. The bin for second 4001 must start with id 0 and
    hold 98 entries, and a five-second range must yield five bins.
    """
    self.dL = DimensionListModel('user_ids', 'cassandra')

    first_second = 4000
    ids_per_second = 98
    self.save_test_data(first_second, 50, ids_per_second)

    # One-second window starting at 4001.
    one_second = self.dL.getByTime(first_second + 1, first_second + 2)
    bin_4001 = one_second[u'user_ids_by_seconds_4001']
    self.assertEqual(bin_4001[0], 0, "the entries are not equal")
    self.assertEqual(len(bin_4001), ids_per_second, "the list has the wrong length")

    # Five-second window starting at 4000.
    wanted_bins = 5
    window = self.dL.getByTime(first_second, first_second + wanted_bins)
    failure_text = "the requested list has not {} entries".format(wanted_bins)
    self.assertEqual(len(window), wanted_bins, failure_text)
def testSetComputedIds(self):
    """Round-trip setComputedIds / getComputedIds for two bin sizes.

    After marking the range 0..10 for 'minutes', ten entries are expected
    back. For 'hours' nothing is expected before the range 0..5 is marked,
    and five entries afterwards.
    """
    name = 'user_ids'
    model = DimensionListModel(name, mode='cassandra')

    # minutes: mark 0..10, then read the same range back
    model.setComputedIds(name, 0, 10, 'minutes')
    marked = model.getComputedIds(name, 0, 10, 'minutes')
    print(marked)
    self.assertEqual(10, len(marked))

    # hours: nothing is yet computed for hours
    marked = model.getComputedIds(name, 0, 5, 'hours')
    self.assertEqual(0, len(marked))

    # hours: mark 0..5, then read the same range back
    model.setComputedIds(name, 0, 5, 'hours')
    marked = model.getComputedIds(name, 0, 5, 'hours')
    print(marked)
    self.assertEqual(5, len(marked))
def testGetDimensionListRangeByMinutes(self):
    """Check the per-minute bins returned by getByTime(..., binSize='minutes').

    Ids 1, 2, 3 are saved at second 59, id 1 again at second 61 and id 4 at
    second 3000. 'user_ids_by_minutes_0' is expected to hold 3 entries and
    'user_ids_by_minutes_1' one entry, whether the minutes are requested
    one at a time or together.
    """
    dimension = 'user_ids'
    self.dL = DimensionListModel(dimension, 'cassandra')

    self.dL.save(dimension_id=1, timestamp=59)
    self.dL.save(dimension_id=2, timestamp=59)
    self.dL.save(dimension_id=3, timestamp=59)
    self.dL.save(dimension_id=1, timestamp=61)
    self.dL.save(dimension_id=4, timestamp=3000)

    # Each minute on its own.
    id_stats = self.dL.getByTime(0, 1, binSize='minutes')
    self.assertEqual(3, len(id_stats[u'user_ids_by_minutes_0']), "wrong length")

    id_stats = self.dL.getByTime(1, 2, binSize='minutes')
    self.assertEqual(1, len(id_stats[u'user_ids_by_minutes_1']), "wrong length")

    # Both minutes in a single request.
    id_stats = self.dL.getByTime(0, 2, binSize='minutes')
    self.assertEqual(1, len(id_stats[u'user_ids_by_minutes_1']), "wrong length")
    self.assertEqual(3, len(id_stats[u'user_ids_by_minutes_0']), "wrong length")
print "DimensionListModel: no time given at" dL = DimensionListModel('days') try: dL.save( getTimestamp.gettimeStampIn_Days( flattenedJson['item_created'] ) ) except: print "DimensionListModel: no time given at" """ """ now we want a log for each user and each item with the corresponding seen items or users respectiveley """ if (SAVE_DIMENSION_LIST): dimension = 'user_ids' dL = DimensionListModel( dimension, mode = 'redis' ) if(debug): print "userid:\t" + str(flattenedJson['client_id']) dL.save( dimension_id = flattenedJson['client_id'], timestamp = timestamp_sec ) if( SAVE_HADOOP_SINK ): if ( ('item_id' in flattenedJson) and ('client_id' in flattenedJson) ): if ( 519516260 == int(flattenedJson['client_id']) ): print "" #print int(flattenedJson['item_id']) hS.save_mode1(int(flattenedJson['client_id']), int(flattenedJson['item_id']), 1) #_mysql.connection.query()
class TestDimensionListModel(unittest.TestCase):
    """Tests for DimensionListModel in 'cassandra' mode.

    Every test talks to the Cassandra instance configured in config_local
    and uses the 'unitTest' keyspace, so a reachable database is required.
    """

    def setUp(self):
        """Point the configuration at the 'unitTest' keyspace and connect.

        Raises whatever cql.connect()/cursor() raise when the database is
        not reachable. Previously the error was swallowed and the test died
        later with an unrelated AttributeError on self.cursor.
        """
        print("setting up database")
        config_global.cassandra_default_keyspace = 'unitTest'

        # Instantiated for their side effects only.
        # NOTE(review): presumably these create the keyspace and the
        # dimension-list tables -- confirm in their constructors.
        Setup_Keyspaces()
        dimensionListsMigration()

        try:
            self.dbconn = cql.connect(config_local.cassandra_host,
                                      config_local.cassandra_port)
            self.cursor = self.dbconn.cursor()
        except Exception:
            print("not able to create a database connection")
            raise
        # Close the connection after each test instead of leaking one
        # connection per test method.
        self.addCleanup(self.dbconn.close)

        self.cursor.execute("USE " + config_global.cassandra_default_keyspace)
        print("setting up database done")

    def save_test_data(self, timestamp_start=4000, time_stamp_range=5, id_range=2):
        """Save ids 0..id_range-1 at each of time_stamp_range consecutive timestamps.

        Requires self.dL to have been set by the calling test.

        :param timestamp_start: first timestamp to write
        :param time_stamp_range: number of consecutive timestamps to write
        :param id_range: number of ids (0 .. id_range - 1) saved per timestamp
        """
        for offset in xrange(time_stamp_range):
            curr_timestamp = timestamp_start + offset
            for user_id in xrange(id_range):
                self.dL.save(user_id, curr_timestamp)

    def testSaveDimensionListRangeBySeconds(self):
        """Check the per-second bins returned by getByTime with its default bin size.

        50 consecutive timestamps starting at 4000 are written with 98 ids
        each. The bin for second 4001 must start with id 0 and hold 98
        entries, and a five-second range must yield five bins.
        """
        dimension = 'user_ids'
        self.dL = DimensionListModel(dimension, 'cassandra')

        timestamp_start = 4000
        time_stamp_range = 50
        id_range = 98
        self.save_test_data(timestamp_start, time_stamp_range, id_range)

        id_stats = self.dL.getByTime(timestamp_start + 1, timestamp_start + 2)
        self.assertEqual(id_stats[u'user_ids_by_seconds_4001'][0], 0,
                         "the entries are not equal")
        self.assertEqual(len(id_stats[u'user_ids_by_seconds_4001']), id_range,
                         "the list has the wrong length")

        desired_length = 5
        id_stats = self.dL.getByTime(timestamp_start, timestamp_start + desired_length)
        self.assertEqual(len(id_stats), desired_length,
                         "the requested list has not {} entries".format(desired_length))

    def testGetDimensionListRangeByMinutes(self):
        """Check the per-minute bins returned by getByTime(..., binSize='minutes').

        'user_ids_by_minutes_0' is expected to hold 3 entries and
        'user_ids_by_minutes_1' one entry, whether the minutes are requested
        one at a time or together.
        """
        dimension = 'user_ids'
        self.dL = DimensionListModel(dimension, 'cassandra')

        self.dL.save(dimension_id=1, timestamp=59)
        self.dL.save(dimension_id=2, timestamp=59)
        self.dL.save(dimension_id=3, timestamp=59)
        self.dL.save(dimension_id=1, timestamp=61)
        self.dL.save(dimension_id=4, timestamp=3000)

        # Each minute on its own.
        id_stats = self.dL.getByTime(0, 1, binSize='minutes')
        self.assertEqual(3, len(id_stats[u'user_ids_by_minutes_0']), "wrong length")

        id_stats = self.dL.getByTime(1, 2, binSize='minutes')
        self.assertEqual(1, len(id_stats[u'user_ids_by_minutes_1']), "wrong length")

        # Both minutes in a single request.
        id_stats = self.dL.getByTime(0, 2, binSize='minutes')
        self.assertEqual(1, len(id_stats[u'user_ids_by_minutes_1']), "wrong length")
        self.assertEqual(3, len(id_stats[u'user_ids_by_minutes_0']), "wrong length")

    def testGetDimensionListRangeByHours(self):
        """Check the per-hour bins returned by getByTime(..., binSize='hours').

        Four distinct ids are saved below timestamp 3600, so
        'user_ids_by_hours_0' is expected to hold 4 entries. Ids saved
        afterwards at timestamp 4000 are expected to show up one by one in
        'user_ids_by_hours_1'.
        """
        dimension = 'user_ids'
        self.dL = DimensionListModel(dimension, 'cassandra')

        self.dL.save(dimension_id=1, timestamp=59)
        self.dL.save(dimension_id=2, timestamp=59)
        self.dL.save(dimension_id=3, timestamp=59)
        self.dL.save(dimension_id=1, timestamp=61)
        self.dL.save(dimension_id=4, timestamp=3000)

        id_stats = self.dL.getByTime(0, 1, binSize='hours')
        self.assertEqual(4, len(id_stats[u'user_ids_by_hours_0']), "wrong length")

        # NOTE(review): renew=True presumably forces the hour bin to be
        # recomputed instead of served from a cache -- confirm in
        # DimensionListModel.getByTime.
        self.dL.save(dimension_id=4, timestamp=4000)
        id_stats = self.dL.getByTime(1, 2, binSize='hours', renew=True)
        self.assertEqual(1, len(id_stats[u'user_ids_by_hours_1']), "wrong length")

        self.dL.save(dimension_id=3, timestamp=4000)
        id_stats = self.dL.getByTime(1, 2, binSize='hours', renew=True)
        self.assertEqual(2, len(id_stats[u'user_ids_by_hours_1']), "wrong length")

    def testGetDimensionListRangeByDay(self):
        """Check how many day bins getByTime(..., binSize='days') returns.

        A few ids are saved at timestamps below 86400 and a batch is saved
        starting at timestamp 90000. The range (0, 2) is expected to yield
        two bins and the range (1, 2) one bin.
        """
        dimension = 'user_ids'
        self.dL = DimensionListModel(dimension, 'cassandra')

        self.dL.save(dimension_id=1, timestamp=59)
        self.dL.save(dimension_id=2, timestamp=59)
        self.dL.save(dimension_id=3, timestamp=59)
        self.dL.save(dimension_id=1, timestamp=61)
        self.dL.save(dimension_id=4, timestamp=3000)

        # Second batch, saved at timestamps 90000..90004 (past 86400).
        timestamp_start = 90000
        time_stamp_range = 5
        id_range = 10
        self.save_test_data(timestamp_start, time_stamp_range, id_range)

        id_stats = self.dL.getByTime(0, 2, binSize='days')
        self.assertEqual(2, len(id_stats), "wrong length")

        id_stats = self.dL.getByTime(1, 2, binSize='days')
        self.assertEqual(1, len(id_stats), "wrong length")
        print(id_stats)
        # NOTE(review): the original carried disabled assertions on the
        # length of the individual 'user_ids_by_days_N' bins; why they were
        # disabled is not visible here.

    def testSetComputedIds(self):
        """Round-trip setComputedIds / getComputedIds for two bin sizes.

        After marking the range 0..10 for 'minutes', ten entries are
        expected back. For 'hours' nothing is expected before the range
        0..5 is marked, and five entries afterwards.
        """
        dimension = 'user_ids'
        dL = DimensionListModel(dimension, mode='cassandra')

        rangeStart = 0
        rangeEnd = 10
        binSize = 'minutes'
        dL.setComputedIds(dimension, rangeStart, rangeEnd, binSize)
        r = dL.getComputedIds(dimension, rangeStart, rangeEnd, binSize)
        print(r)
        self.assertEqual(rangeEnd, len(r))

        rangeEnd = 5
        binSize = 'hours'
        r = dL.getComputedIds(dimension, rangeStart, rangeEnd, binSize)
        # nothing is yet computed for hours
        self.assertEqual(0, len(r))

        dL.setComputedIds(dimension, rangeStart, rangeEnd, binSize)
        r = dL.getComputedIds(dimension, rangeStart, rangeEnd, binSize)
        print(r)
        self.assertEqual(rangeEnd, len(r))

    def test_Binify_Minutes(self):
        """Check which second-level keys binify('minutes', ...) returns.

        Ids are saved at seconds 58, 59 and 60. binify('minutes', 0, 1) is
        expected to contain the keys for 58 and 59 but not 60;
        binify('minutes', 1, 2) is expected to contain only the key for 60.
        """
        self.dL = DimensionListModel('user_ids', mode='cassandra')
        binSize = 'minutes'

        start_seconds = 58
        end_seconds = 61
        for curr_timestamp in xrange(start_seconds, end_seconds):
            # A random number of ids per second; only key presence is asserted.
            for user_id in xrange(int(uniform(1, 6))):
                self.dL.save(user_id, curr_timestamp)

        # The result used to be bound to a local named `list`, which
        # shadowed the builtin and was never read. The call itself is kept
        # in case it has side effects the assertions below rely on.
        # NOTE(review): confirm whether this call is needed at all.
        self.dL.getByTime(45, 75, 'seconds')

        binified = self.dL.binify(binSize, 0, 1)
        print(binified)
        self.assertIn(u'user_ids_by_seconds_58', binified)
        self.assertIn(u'user_ids_by_seconds_59', binified)
        self.assertNotIn(u'user_ids_by_seconds_60', binified)

        binified = self.dL.binify(binSize, 1, 2)
        self.assertNotIn(u'user_ids_by_seconds_58', binified)
        self.assertNotIn(u'user_ids_by_seconds_59', binified)
        self.assertIn(u'user_ids_by_seconds_60', binified)
        print(binified)

    def test_Binify_Hours(self):
        """Check which second-level keys binify('hours', ...) returns.

        Ids are saved at seconds 58, 59, 60 and at second 3600.
        binify('hours', 1, 2) is expected to contain the 3600 key, and
        binify('hours', 0, 1) the 58/59/60 keys but not the 3600 one.
        """
        self.dL = DimensionListModel('user_ids', mode='cassandra')
        binSize = 'hours'

        start_seconds = 58
        end_seconds = 61
        for curr_timestamp in xrange(start_seconds, end_seconds):
            for user_id in xrange(int(uniform(1, 6))):
                self.dL.save(user_id, curr_timestamp)

        start_seconds = 3600
        end_seconds = start_seconds + 1
        for curr_timestamp in xrange(start_seconds, end_seconds):
            for user_id in xrange(int(uniform(1, 6))):
                self.dL.save(user_id, curr_timestamp)

        binified = self.dL.binify(binSize, 1, 2)
        self.assertIn(u'user_ids_by_seconds_3600', binified)

        binified = self.dL.binify(binSize, 0, 1)
        print(binified)
        self.assertIn(u'user_ids_by_seconds_58', binified)
        self.assertIn(u'user_ids_by_seconds_59', binified)
        self.assertIn(u'user_ids_by_seconds_60', binified)
        self.assertNotIn(u'user_ids_by_seconds_3600', binified)

    def test_Binify_Days(self):
        """Check which second-level keys binify('days', ...) returns.

        Ids are saved at seconds 58, 59, 60 and at second 86400. The ranges
        (0, 1), (1, 2) and (0, 2) are expected to contain the first three
        keys only, the 86400 key, and all four keys respectively.
        """
        self.dL = DimensionListModel('user_ids', mode='cassandra')
        binSize = 'days'

        start_seconds = 58
        end_seconds = 61
        for curr_timestamp in xrange(start_seconds, end_seconds):
            for user_id in xrange(int(uniform(1, 6))):
                self.dL.save(user_id, curr_timestamp)

        start_seconds = 86400
        end_seconds = start_seconds + 1
        for curr_timestamp in xrange(start_seconds, end_seconds):
            for user_id in xrange(int(uniform(1, 6))):
                self.dL.save(user_id, curr_timestamp)

        binified = self.dL.binify(binSize, 0, 1)
        print(binified)
        self.assertIn(u'user_ids_by_seconds_58', binified)
        self.assertIn(u'user_ids_by_seconds_59', binified)
        self.assertIn(u'user_ids_by_seconds_60', binified)
        self.assertNotIn(u'user_ids_by_seconds_86400', binified)

        binified = self.dL.binify(binSize, 1, 2)
        self.assertIn(u'user_ids_by_seconds_86400', binified)

        binified = self.dL.binify(binSize, 0, 2)
        print(binified)
        self.assertIn(u'user_ids_by_seconds_58', binified)
        self.assertIn(u'user_ids_by_seconds_59', binified)
        self.assertIn(u'user_ids_by_seconds_60', binified)
        self.assertIn(u'user_ids_by_seconds_86400', binified)

    def _test_Binify_Performance(self):
        """Bulk variant of the day binify test over seconds 58..89999.

        The leading underscore keeps unittest from collecting it; rename it
        to test_... to run it by hand.
        """
        self.dL = DimensionListModel('user_ids', mode='cassandra')
        binSize = 'days'

        start_seconds = 58
        end_seconds = 90000
        for curr_timestamp in xrange(start_seconds, end_seconds):
            for user_id in xrange(int(uniform(1, 6))):
                self.dL.save(user_id, curr_timestamp)

        binified = self.dL.binify(binSize, 0, 1)
        print(binified)
        self.assertIn(u'user_ids_by_seconds_58', binified)
        self.assertIn(u'user_ids_by_seconds_59', binified)
        self.assertIn(u'user_ids_by_seconds_60', binified)
        self.assertNotIn(u'user_ids_by_seconds_86400', binified)

        binified = self.dL.binify(binSize, 1, 2)
        self.assertIn(u'user_ids_by_seconds_86400', binified)
hours: 366560 2011-10-26 12:26:01 1319624761 hours: 366562 2011-10-26 13:53:33 1319630013 2011-10-27 15:21:41 1319721701 """ dL = DimensionListModel('user_ids', mode = 'redis') """ userList = dL.getByTime(1319617057, 1319617424, binSize = 'seconds') print len( userList ) """ first_computation = time.time() userList = dL.getByTime(366560, 366562, binSize = 'hours', renew = True) print "\nhour bin" print len( userList ) first_computation = time.time() - first_computation print first_computation second_computation = time.time() userList = dL.getByTime(366560, 366562, binSize = 'hours', renew = False ) print "\nhour bin"