def test_metadata(self):
    """Check the timestamp bounds reported for 'testtrail.tdb'."""
    tdb = TrailDB('testtrail.tdb')
    first_ts, last_ts = 1, 3

    self.assertEqual(first_ts, tdb.min_timestamp())
    self.assertEqual(last_ts, tdb.max_timestamp())

    # time_range() is expected to report the same bounds both with its
    # default arguments and with parsetime disabled.
    bounds = (first_ts, last_ts)
    self.assertEqual(bounds, tdb.time_range())
    self.assertEqual(bounds, tdb.time_range(parsetime=False))
def test_simple_disjunction(self):
    """Filter trail 0 with a flat list of terms and check the two matches."""
    db = TrailDB('testtrail')

    # Shorthand notation: a flat list of terms instead of a list of lists.
    terms = [('field1', 'a'), ('field2', '4')]
    matched = list(db.trail(0, event_filter=terms))

    self.assertEqual(len(matched), 2)
    first, second = matched[0], matched[1]
    self.assertEqual((first.field1, first.field2), ('a', '1'))
    self.assertEqual((second.field1, second.field2), ('d', '4'))
def test_negation(self):
    """Filter trail 0 with a three-element term and expect a single event."""
    db = TrailDB('testtrail')

    # NOTE(review): the trailing True presumably marks the term as negated
    # (per the test name) -- confirm against the filter documentation.
    term = ('field3', 'x', True)
    matched = list(db.trail(0, event_filter=[term]))

    self.assertEqual(len(matched), 1)
    only = matched[0]
    self.assertEqual((only.field1, only.field2, only.field3),
                     ('c', '3', 'y'))
def test_apply_blacklist(self):
    """Build a five-trail database and check blacklisted trails come back empty."""
    uuids = [
        "02345678123456781234567812345678",
        "12345678123456781234567812345678",
        "22345678123456781234567812345678",
        "32345678123456781234567812345678",
        "42345678123456781234567812345678"
    ]
    rows = [(1, ['a', '1']), (2, ['b', '2']), (3, ['c', '3'])]

    builder = TrailDBConstructor('blacklist_testtrail', ['field1', 'field2'])
    for trail_id in uuids:
        for timestamp, values in rows:
            builder.add(trail_id, timestamp, values)
    builder.finalize()

    db = TrailDB('blacklist_testtrail')
    blocked = [uuids[1], uuids[2]]
    db.apply_blacklist(blocked)

    # Blacklisted trails are expected to yield no events; the rest keep all three.
    for found_uuid, cursor in list(db.trails(parsetime=False)):
        wanted = 0 if found_uuid in blocked else 3
        self.assertEqual(len(list(cursor)), wanted)
def test_conjunction(self):
    """Filter trail 0 with two clauses and expect exactly one matching event."""
    db = TrailDB('testtrail')

    # A list of lists: each inner list is one clause of the filter.
    clauses = [
        [('field1', 'e'), ('field1', 'c')],
        [('field3', 'y', True)],
    ]
    matched = list(db.trail(0, event_filter=clauses))

    self.assertEqual(len(matched), 1)
    hit = matched[0]
    self.assertEqual((hit.field1, hit.field2), ('e', '5'))
def test_time_range(self):
    """Filter trail 0 with a two-integer term and check the returned timestamps.

    The term ``(2, 4)`` is expected to select the events at timestamps 2
    and 3 when ``parsetime`` is disabled.
    NOTE(review): presumably a time-range term (per the test name) --
    confirm the bounds semantics against the filter documentation.
    """
    tdb = TrailDB('testtrail')
    events = list(tdb.trail(0, event_filter=[[(2, 4)]], parsetime=False))
    self.assertEqual(len(events), 2)
    # Plain int literals: the ``2L`` long suffix is a SyntaxError on
    # Python 3, and on Python 2 an int compares equal to the same long.
    self.assertEqual(events[0].time, 2)
    self.assertEqual(events[1].time, 3)
def test_silly_open(self):
    """Open the database with and without the '.tdb' suffix; reject a missing file."""
    # Only the suffixed file exists on disk.
    self.assertTrue(os.path.exists('testtrail.tdb'))
    self.assertFalse(os.path.exists('testtrail'))

    # Both spellings of the name are expected to open without raising.
    with_suffix = TrailDB('testtrail.tdb')
    without_suffix = TrailDB('testtrail')

    with self.assertRaises(TrailDBError):
        TrailDB('foo.tdb')
def test_filter_object(self):
    """Create a filter object once and apply it to trail 0 twice."""
    db = TrailDB('testtrail')
    clauses = [
        [('field1', 'e'), ('field1', 'c')],
        [('field3', 'y', True)],
    ]
    reusable = db.create_filter(clauses)

    # The same filter object must give the same result on repeated use.
    for _ in range(2):
        matched = list(db.trail(0, event_filter=reusable))
        self.assertEqual(len(matched), 1)
        self.assertEqual((matched[0].field1, matched[0].field2), ('e', '5'))
def test_crumbs(self):
    """Iterate all trails and check uuid, cursor type and event count."""
    tdb = TrailDB('testtrail.tdb')

    seen = 0
    for trail_uuid, cursor in tdb.trails():
        seen += 1
        self.assertEqual(self.uuid, trail_uuid)
        self.assertIsInstance(cursor, TrailDBCursor)
        self.assertEqual(3, len(list(cursor)))

    # Exactly one trail is expected in the test database.
    self.assertEqual(1, seen)
def test_trails(self):
    """Check the trail count and that trail 0 yields three well-formed events."""
    tdb = TrailDB('testtrail')
    self.assertEqual(1, tdb.num_trails)

    cursor = tdb.trail(0)
    self.assertIsInstance(cursor, TrailDBCursor)

    # Materialise the cursor so the events can be counted and inspected.
    collected = list(cursor)
    self.assertEqual(3, len(collected))

    for item in collected:
        for attribute in ('time', 'field1', 'field2'):
            self.assertTrue(hasattr(item, attribute))
def loading():
    """Count events with a non-empty ``user`` versus a non-empty ``ip`` and print both.

    An event with a non-empty ``user`` is counted as a user edit; otherwise
    it is counted as an IP edit when its ``ip`` is non-empty.
    """
    tdb = TrailDB("/mnt/data/wikipedia-history-small.tdb")
    by_user = 0
    by_ip = 0

    for _uuid, events in tdb.trails():
        for ev in events:
            if ev.user != "":
                by_user += 1
                continue
            if ev.ip != "":
                by_ip += 1

    print("User edits: {}".format(by_user))
    print("IP edits: {}".format(by_ip))
def get_dataframe():
    """Build a DataFrame with one row per event of 'pydata-tutorial.tdb'.

    Columns:
        x: ``old_div`` of the event's offset from the database's minimum
           timestamp by ``24 * 3600``.
        y: position of the event's group in the sequence from ``get_events``.
        type: categorical, 'user' when ``event.user`` is truthy, else 'anon'.
    """
    db = TrailDB('pydata-tutorial.tdb')
    origin = db.min_timestamp()
    day_length = 24 * 3600

    day_offsets = []
    row_numbers = []
    kinds = []

    # Try iterating over sorted(get_events(db), reverse=True) instead to
    # see the rows in a different order.
    for row, (_first_ts, trail_events) in enumerate(get_events(db)):
        for ev in trail_events:
            day_offsets.append(old_div(int(ev.time - origin), day_length))
            row_numbers.append(row)
            kinds.append('user' if ev.user else 'anon')

    frame = pd.DataFrame({'x': day_offsets, 'y': row_numbers})
    frame['type'] = pd.Series(kinds, dtype='category')
    return frame
def get_dataframe():
    """Build a DataFrame with one row per event of 'pydata-tutorial.tdb'.

    Columns:
        x: the event's offset from the database's minimum timestamp,
           divided by ``24 * 3600`` with the ``/`` operator.
        y: position of the event's group in the sequence from ``get_events``.
        type: categorical, 'user' when ``event.user`` is truthy, else 'anon'.
    """
    db = TrailDB('pydata-tutorial.tdb')
    origin = db.min_timestamp()
    day_length = 24 * 3600

    day_offsets = []
    row_numbers = []
    kinds = []

    # Try iterating over sorted(get_events(db), reverse=True) instead to
    # see the rows in a different order.
    for row, (_first_ts, trail_events) in enumerate(get_events(db)):
        for ev in trail_events:
            day_offsets.append(int(ev.time - origin) / day_length)
            row_numbers.append(row)
            kinds.append('user' if ev.user else 'anon')

    frame = pd.DataFrame({'x': day_offsets, 'y': row_numbers})
    frame['type'] = pd.Series(kinds, dtype='category')
    return frame
def test_metadata(self):
    """Verify min/max timestamps and the time range of 'testtrail.tdb'."""
    tdb = TrailDB('testtrail.tdb')
    lowest = 1
    highest = 3

    self.assertEqual(lowest, tdb.min_timestamp())
    self.assertEqual(highest, tdb.max_timestamp())

    # The range must match with default arguments and with parsetime off.
    self.assertEqual((lowest, highest), tdb.time_range())
    self.assertEqual((lowest, highest), tdb.time_range(parsetime=False))
def test_trails_selected_uuids(self):
    """Build a five-trail database and iterate only a selected subset of uuids."""
    uuids = [
        "02345678123456781234567812345678",
        "12345678123456781234567812345678",
        "22345678123456781234567812345678",
        "32345678123456781234567812345678",
        "42345678123456781234567812345678"
    ]
    rows = [(1, ['a', '1']), (2, ['b', '2']), (3, ['c', '3'])]

    builder = TrailDBConstructor('whitelist_testtrail', ['field1', 'field2'])
    for trail_id in uuids:
        for timestamp, values in rows:
            builder.add(trail_id, timestamp, values)
    builder.finalize()

    db = TrailDB('whitelist_testtrail')
    chosen = [uuids[0], uuids[3], uuids[4]]

    # Every trail returned for the selection must still hold all its events.
    for _found_uuid, cursor in db.trails(selected_uuids=chosen):
        self.assertEqual(len(list(cursor)), 3)
def test_lexicons(self):
    """Check lexicon sizes and contents, and that field 3 is rejected."""
    tdb = TrailDB('testtrail')

    # Field 1: the reported size is 4 while the lexicon lists three values.
    self.assertEqual(4, tdb.lexicon_size(1))
    field1_values = list(tdb.lexicon(1))
    self.assertEqual(['a', 'b', 'c'], field1_values)

    # Field 2
    field2_values = list(tdb.lexicon(2))
    self.assertEqual(['1', '2', '3'], field2_values)

    # Field 3 is out of bounds for this database.
    with self.assertRaises(TrailDBError):
        tdb.lexicon(3)
def traildb_to_coo(db, fieldname):
    """Return uuids, timestamps, columns and a COO matrix for one TrailDB field.

    Args:
        db: name of the TrailDB; opened with ``TrailDB(db)`` and also
            passed, encoded, to ``traildb_coo_repr_func``.
        fieldname: field name, passed encoded to ``traildb_coo_repr_func``.

    Returns:
        A 4-tuple ``(uuids, timestamps, cols, matrix)`` where ``matrix`` is
        a ``sparse.coo_matrix`` with a 1 at every ``(r_idx, c_idx)`` pair.

    Raises:
        ImportError: if ``TrailDB`` is falsy (traildb could not be found).
    """
    if not TrailDB:
        raise ImportError("Could not find traildb")

    # The handle is only needed for the event count; drop it right away.
    handle = TrailDB(db)
    num_events = handle.num_events
    del handle

    # Buffers handed to the native routine.
    # NOTE(review): presumably filled in place by it -- confirm.
    row_indices = np.zeros(num_events, dtype=np.uint64)
    col_indices = np.zeros(num_events, dtype=np.uint64)
    uuids = np.zeros((num_events, 16), dtype=np.uint8)
    timestamps = np.zeros(num_events, dtype=np.uint64)

    cols = traildb_coo_repr_func(db.encode(), fieldname.encode(),
                                 row_indices, col_indices,
                                 uuids, timestamps)

    matrix = sparse.coo_matrix(
        (np.ones(num_events), (row_indices, col_indices)))
    return uuids, timestamps, cols, matrix
def test_fields(self):
    """Check that the field list starts with 'time' followed by the two fields."""
    tdb = TrailDB('testtrail')
    expected_fields = ['time', 'field1', 'field2']
    self.assertEqual(expected_fields, tdb.fields)
from __future__ import division from __future__ import print_function from __future__ import unicode_literals from __future__ import absolute_import from past.utils import old_div from random import random import sys from traildb import TrailDB, TrailDBConstructor def extract(tdb, cons, sample_size): for uuid, trail in tdb.trails(): if random() < sample_size: for event in trail: cons.add(uuid, event.time, list(event)[1:]) return cons.finalize() if __name__ == '__main__': if len(sys.argv) < 3: print( 'Usage: extract_sample source_tdb destination_tdb sample_percentage' ) sys.exit(1) tdb = TrailDB(sys.argv[1]) cons = TrailDBConstructor(sys.argv[2], tdb.fields[1:]) num = extract(tdb, cons, old_div(float(sys.argv[3]), 100.)).num_trails print('Extracted %d trails to %s' % (num, sys.argv[2]))
"""Create a tiny three-trail TrailDB and print each trail's events."""
from traildb import TrailDBConstructor, TrailDB
from uuid import uuid4
from datetime import datetime

cons = TrailDBConstructor('tiny', ['username', 'action'])

for i in range(3):
    uuid = uuid4().hex
    # The previous literal had no conversion specifier, so ``% i`` raised
    # "TypeError: not all arguments converted during string formatting".
    # NOTE(review): 'user%d' is the presumed intended pattern -- confirm.
    username = 'user%d' % i
    # One event per action, on consecutive days of month i + 1 of 2016.
    for day, action in enumerate(['open', 'save', 'close']):
        cons.add(uuid, datetime(2016, i + 1, day + 1), (username, action))

cons.finalize()

for uuid, trail in TrailDB('tiny').trails():
    # Single-argument print(...) gives the same output on Python 2 and 3.
    print('%s %s' % (uuid, list(trail)))
"""Print the number of sessions and events of every trail in a TrailDB."""
import sys

from traildb import TrailDB

# Largest gap between consecutive timestamps (30 * 60) that still counts
# as the same session.
SESSION_LIMIT = 30 * 60


def sessions(tdb):
    """Print session and event counts for each trail of ``tdb``.

    A new session is counted whenever the difference between two
    consecutive timestamps of a trail exceeds ``SESSION_LIMIT``.

    Args:
        tdb: database iterated with ``tdb.trails(only_timestamp=True)``.
    """
    for i, (uuid, trail) in enumerate(tdb.trails(only_timestamp=True)):
        # The built-in next() works on Python 2 and 3; the ``.next()``
        # method is Python-2-only iterator protocol.
        prev_time = next(trail)
        num_events = 1
        num_sessions = 1
        for timestamp in trail:
            if timestamp - prev_time > SESSION_LIMIT:
                num_sessions += 1
            prev_time = timestamp
            num_events += 1
        # Single-argument print(...) gives the same output on Python 2 and 3.
        print('Trail[%d] Number of Sessions: %d Number of Events: %d' %
              (i, num_sessions, num_events))


if __name__ == '__main__':
    if len(sys.argv) < 2:
        print('Usage: tutorial_wikipedia_sessions <wikipedia-history.tdb>')
    else:
        sessions(TrailDB(sys.argv[1]))
def test_negation(self):
    """Apply a single three-element filter term to trail 0 and expect one event."""
    db = TrailDB('testtrail')

    # NOTE(review): the trailing True presumably negates the term (per the
    # test name) -- confirm against the filter documentation.
    filter_terms = [('field3', 'x', True)]
    remaining = list(db.trail(0, event_filter=filter_terms))

    self.assertEqual(len(remaining), 1)
    survivor = remaining[0]
    observed = (survivor.field1, survivor.field2, survivor.field3)
    self.assertEqual(observed, ('c', '3', 'y'))
def item_top():
    """Return the five most common titles, counted on raw items.

    Events are read with ``rawitems=True`` and counted by ``event.title``.
    The five most common keys are then passed through
    ``get_item_value`` and returned as ``(value, count)`` pairs.
    """
    db = TrailDB('pydata-tutorial')

    counts = Counter()
    for _uuid, events in db.trails(rawitems=True):
        for ev in events:
            counts[ev.title] += 1

    top = []
    for raw_item, freq in counts.most_common(5):
        top.append((db.get_item_value(raw_item), freq))
    return top
def string_top():
    """Return the five most common ``event.title`` values with their counts."""
    db = TrailDB('pydata-tutorial')

    counts = Counter()
    for _uuid, events in db.trails():
        for ev in events:
            counts[ev.title] += 1

    return counts.most_common(5)
def test_uuids(self):
    """Check uuid <-> trail id lookups and uuid membership in the database."""
    tdb = TrailDB('testtrail')
    known_uuid = self.uuid

    self.assertEqual(0, tdb.get_trail_id(known_uuid))
    self.assertEqual(known_uuid, tdb.get_uuid(0))
    self.assertTrue(known_uuid in tdb)
def test_conjunction(self):
    """Apply a two-clause filter to trail 0 and expect a single ('e', '5') event."""
    db = TrailDB('testtrail')

    # The filter is a list of clauses; each clause is a list of terms.
    field1_clause = [('field1', 'e'), ('field1', 'c')]
    field3_clause = [('field3', 'y', True)]
    remaining = list(
        db.trail(0, event_filter=[field1_clause, field3_clause]))

    self.assertEqual(len(remaining), 1)
    survivor = remaining[0]
    self.assertEqual((survivor.field1, survivor.field2), ('e', '5'))