from pprint import pprint
import time

from bsddb3 import db as bdb
import numpy as np

import u

# Benchmark: time outer_ct batches of inner_ct B-tree puts, syncing after
# each batch. Each value is a 720-element int32 vector (2880 bytes).
outer_ct = 5
inner_ct = 50000

u.logging_init('bdb', verbose_=True)
l = u.l
u.memory_use_log()

db = bdb.DB()
#db.set_flags(bdb.DB_TXN_NOT_DURABLE)
db.set_cachesize(0, 32*1024*1024)   # 32 MiB cache
db.set_pagesize(64*1024)            # 64 KiB pages
db.open('/data6/foo.db', dbtype=bdb.DB_BTREE, flags=(bdb.DB_CREATE))

start_out = time.time()
for j in range(outer_ct):
   start = time.time()
   for i in range(inner_ct):
      # Keys are globally unique decimal strings; values rely on the
      # ndarray buffer interface.
      db.put(str(j * inner_ct + i).encode('UTF-8'),
             np.ones(720, dtype=np.int32))
   db.sync()
   end = time.time()
   # FIX: per-batch and total timings were computed but never reported,
   # which defeated the purpose of the benchmark.
   l.info('batch %d: %d puts in %.3f s (%.0f puts/s)'
          % (j, inner_ct, end - start, inner_ct / (end - start)))
l.info('total: %d puts in %.3f s' % (outer_ct * inner_ct,
                                     time.time() - start_out))
from pprint import pprint
import time

from bsddb3 import db as bdb
import numpy as np

import u

# Benchmark parameters: how many sync'ed batches, and puts per batch.
outer_ct = 5
inner_ct = 50000

u.logging_init('bdb', verbose_=True)
l = u.l
u.memory_use_log()

# Open (creating if needed) a B-tree database with a 32 MiB cache and
# 64 KiB pages.
db = bdb.DB()
#db.set_flags(bdb.DB_TXN_NOT_DURABLE)
db.set_cachesize(0, 32 * 1024 * 1024)
db.set_pagesize(64 * 1024)
db.open('/data6/foo.db', dbtype=bdb.DB_BTREE, flags=(bdb.DB_CREATE))

start_out = time.time()
for batch in range(outer_ct):
   start = time.time()
   base = batch * inner_ct
   for offset in range(inner_ct):
      # Unique decimal-string key; fixed int32 vector as the value.
      key = str(base + offset).encode('UTF-8')
      db.put(key, np.ones(720, dtype=np.int32))
   db.sync()
def main():
   """Pipeline driver: load inputs, fit one model per test context in
      parallel, then evaluate, aggregate, and pickle the results.

      Operates on module-level state: reads ``args`` and stores everything
      on the shared namespace ``g``. Presumably run once per experiment;
      NOTE(review): not re-entrant — it mutates ``g`` in place."""
   l.info('starting')
   start_time = time.time()
   args_clean()
   g.args = args
   u.memory_use_log(level=l.info)
   l.info('loading input data')
   g.truth = truth_load()
   g.graph = graph_load()
   g.vectors = vectors_load()
   u.memory_use_log(level=l.info)
   g.tests = tests_enumerate()
   l.info('scheduled %d tests' % len(g.tests))
   l.info('saving input data')
   # Snapshot the inputs before any model fitting, for reproducibility.
   pickle_dump('input', None, g)
   with jl.Parallel(n_jobs=-1, verbose=5) as P:
      l.info('1. Building models')
      #
      # { Context: sk.LinearModel [fitted model] }
      #
      # zip() pairs each context with its fitted model; contexts whose
      # build returned None (failed/skipped fits) are dropped.
      models = { ctx: m
                 for (ctx, m)
                 in zip(g.tests, P(jl.delayed(model_build)(t)
                                   for t in g.tests))
                 if m is not None }
      l.info('built %d models (%d at max iterations)'
             % (len(models),
                sum(not m.converged for (_, m) in models.items())))
      l.info('2. Dumping models')
      # These dumps are self-contained enough to be loaded in a Python
      # interpreter that is not QUAC-aware.
      #
      # { outbreak: { horizon: { training: { now: fitted model } } } }
      summs = u.defaultdict_recursive()
      for (ctx, m) in models.items():
         summs[ctx.outbreak][ctx.horizon][ctx.training][ctx.now] = m
      for (ob, ob_data) in summs.as_dict().items():
         pickle_dump(ob, 'model', ob_data)
      l.info('3. Evaluating models')
      # Evaluations run in ~0.15s (according to joblib), so it's not clear to
      # me that distributing the computation outweighs overhead.
      #
      # { Context: Series [predicted incidence]
      #     index: period
      #     values: prediction }
      preds = dict(model_predict(cm) for cm in models.items())
      l.info('4. Aggregating results')
      # Re-key so we can aggregate the nows
      #
      # [ ((outbreak, training, horizon),
      #    (now, Series [predicted incidence])), ... ]
      #
      # sorted() is required: groupby() below only groups adjacent items
      # with equal keys.
      preds = sorted(((ctx.outbreak, ctx.training, ctx.horizon),
                      (ctx.now, p))
                     for (ctx, p) in preds.items())
      # Aggregate into DataFrames.
      #
      # { (outbreak, training, horizon): DataFrame [predicted incidence]
      #     index: period
      #     columns: now }
      preds = { k: model_summarize(preds)
                for (k, preds)
                in itertools.groupby(preds, operator.itemgetter(0)) }
      l.info('5. Dumping results')
      # Gather by outbreak
      #
      # { outbreak: { horizon: { training: DataFrame [predicted incidence] } } }
      preds2 = u.defaultdict_recursive()
      for ((ob, tr, ho), df) in preds.items():
         preds2[ob][ho][tr] = df
      # For each outbreak, dump a pickle file containing the dict above. These
      # are then translated to TSV files for plotting in later steps.
      for (ob, ob_data) in preds2.as_dict().items():
         pickle_dump(ob, 'out', ob_data)
   l.info('done in %s' % u.fmt_seconds(time.time() - start_time))
def main(self):
   """Run the scheduled test sequence against the SQLite tweet database.

      Normalizes the requested time window (defaulting to the full span of
      the tweet table), builds the test schedule, then runs or reloads each
      test in order, shrinking results to disk as it goes."""
   u.memory_use_log()
   t_start = time.time()
   db = db_glue.DB(self.args.database_file)
   l.info('opened database %s' % (self.args.database_file))
   assert (db.metadata_get('schema_version') == '5')
   # normalize start and end times
   if (self.args.start is None):
      sql = 'SELECT min(created_at) AS "st [timestamp]" FROM tweet'
      self.args.start = db.sql(sql)[0]['st']
   if (self.args.end is None):
      sql = 'SELECT max(created_at) AS "et [timestamp]" FROM tweet'
      # add one second because end time is exclusive
      self.args.end = db.sql(sql)[0]['et'] + timedelta(seconds=1)
   self.args.start = time_.as_utc(self.args.start)
   self.args.end = time_.as_utc(self.args.end)
   # print test sequence parameters
   self.log_parameters()
   # set up model parameters
   model_class = u.class_by_name(self.args.model)
   model_class.parms_init(self.args.model_parms, log_parms=True)
   # build schedule
   self.schedule_build(self.args.limit)
   l.info('scheduled %s tests (%s left over)'
          % (len(self.schedule), self.args.end - self.schedule[-1].end))
   if (not os.path.exists(self.args.output_dir)):
      os.mkdir(self.args.output_dir)
   l.info('results in %s' % (self.args.output_dir))
   # testing loop
   for (i, t) in enumerate(self.schedule):
      if (i+1 < self.args.start_test):
         # Earlier tests were already run; reload their saved results
         # instead of recomputing.
         l.info('using saved test %d per --start-test' % (i+1))
         l.warning('token and tweet counts will be incorrect')
         # FIXME: hack.....
         try:
            t.model = u.Deleted_To_Save_Memory()
            t.results = u.Deleted_To_Save_Memory()
            t.i = i
            t.train_tweet_ct = -1e6
            t.train_token_ct = -1e6
            t.test_tweet_ct = -1e6
            t.unshrink_from_disk(self.args.output_dir, results=True)
            t.attempted = True
         except IOError as x:  # FIX: was Py2-only "except IOError, x:"
            if (x.errno != 2):  # errno 2 == ENOENT; anything else is fatal
               raise
            t.attempted = False
      else:
         l.info('starting test %d of %d: %s' % (i+1, len(self.schedule), t))
         t.do_test(model_class, db, self.args, i)
      t.summarize()
      if (t.attempted):
         if (self.args.profile_memory):
            # We dump a memory profile here because it's the high water
            # mark; we're about to reduce usage significantly.
            import meliae.scanner as ms
            filename = 'memory.%d.json' % (i)
            l.info('dumping memory profile %s' % (filename))
            ms.dump_all_objects('%s/%s' % (self.args.output_dir, filename))
         t.shrink_to_disk(self.args.output_dir)
      l.debug('result: %s' % (t.summary))
      u.memory_use_log()
def main(self):
   """Run the scheduled test sequence using the DB cursor variant.

      Like the db_glue version, but issues SQL through ``self.cur`` against
      ``self.table``. Runs/reloads each scheduled test, then computes and
      saves the summary, TSV results, and a pickled self."""
   u.memory_use_log()
   t_start = time.time()
   # Replaced with self.cur in __init__
   # db = db_glue.DB(self.args.database_file)
   # assert (db.metadata_get('schema_version') == '5')
   # normalize start and end times
   if (self.args.start is None):
      sql = 'SELECT min(created_at) AS st FROM {0};'.format(self.table)
      self.cur.execute(sql)
      self.args.start = self.cur.fetchone()[0]
   if (self.args.end is None):
      sql = 'SELECT max(created_at) AS et FROM {0};'.format(self.table)
      self.cur.execute(sql)
      # add one second because end time is exclusive
      self.args.end = self.cur.fetchone()[0] + timedelta(seconds=1)
   self.args.start = time_.as_utc(self.args.start)
   self.args.end = time_.as_utc(self.args.end)
   # print test sequence parameters
   self.log_parameters()
   # set up model parameters
   model_class = u.class_by_name(self.args.model)
   model_class.parms_init(self.args.model_parms, log_parms=True)
   # build schedule
   self.schedule_build(self.args.limit)
   l.info('scheduled %s tests (%s left over)'
          % (len(self.schedule), self.args.end - self.schedule[-1].end))
   if (not os.path.exists(self.args.output_dir)):
      os.mkdir(self.args.output_dir)
   l.info('results in %s' % (self.args.output_dir))
   # testing loop
   for (i, t) in enumerate(self.schedule):
      if (i+1 < self.args.start_test):
         l.info('using saved test %d per --start-test' % (i+1))
         l.warning('token and tweet counts will be incorrect')
         # FIXME: hack.....
         try:
            t.model = u.Deleted_To_Save_Memory()
            t.results = u.Deleted_To_Save_Memory()
            t.i = i
            t.train_tweet_ct = -1e6
            t.train_token_ct = -1e6
            t.test_tweet_ct = -1e6
            t.unshrink_from_disk(self.args.output_dir, results=True)
            t.attempted = True
         # BUG FIX: was "except (IOError, x):", which catches the tuple
         # (IOError, x) and raises NameError on the undefined name "x"
         # the moment an IOError actually occurs.
         except IOError as x:
            if (x.errno != 2):  # errno 2 == ENOENT; anything else is fatal
               raise
            t.attempted = False
      else:
         l.info('starting test %d of %d: %s' % (i+1, len(self.schedule), t))
         t.do_test(model_class, self.cur, self.args, i)
      t.summarize()
      if (t.attempted):
         if (self.args.profile_memory):
            # We dump a memory profile here because it's the high water
            # mark; we're about to reduce usage significantly.
            import meliae.scanner as ms
            filename = 'memory.%d.json' % (i)
            l.info('dumping memory profile %s' % (filename))
            ms.dump_all_objects('%s/%s' % (self.args.output_dir, filename))
         t.shrink_to_disk(self.args.output_dir)
      l.debug('result: %s' % (t.summary))
      u.memory_use_log()
   # done!
   l.debug('computing summary')
   self.summarize()
   l.debug('summary: %s' % (self.summary))
   l.debug('saving TSV results')
   test_indices = u.sl_union_fromtext(len(self.schedule), ':')
   self.tsv_save_tests('%s/%s' % (self.args.output_dir, 'tests.tsv'),
                       test_indices)
   l.debug('saving pickled summary')
   self.memory_use = u.memory_use()
   self.memory_use_peak = "Not implemented"
   self.time_use = time.time() - t_start
   u.pickle_dump('%s/%s' % (self.args.output_dir, 'summary'), self)
   u.memory_use_log()
   l.info('done in %s' % (u.fmt_seconds(self.time_use)))
def main():
   """Fit, evaluate, aggregate, and dump forecasting models.

      Driver over module globals: reads ``args``, stages all intermediate
      state on the shared namespace ``g``, and pickles inputs, per-outbreak
      models, and per-outbreak prediction DataFrames."""
   l.info('starting')
   start_time = time.time()
   args_clean()
   g.args = args
   u.memory_use_log(level=l.info)
   l.info('loading input data')
   g.truth = truth_load()
   g.graph = graph_load()
   g.vectors = vectors_load()
   u.memory_use_log(level=l.info)
   g.tests = tests_enumerate()
   l.info('scheduled %d tests' % len(g.tests))
   l.info('saving input data')
   # Dump the inputs up front so the run can be reproduced/inspected later.
   pickle_dump('input', None, g)
   with jl.Parallel(n_jobs=-1, verbose=5) as P:
      l.info('1. Building models')
      #
      # { Context: sk.LinearModel [fitted model] }
      #
      # Fit one model per context in parallel; drop contexts whose build
      # returned None.
      models = { ctx: m
                 for (ctx, m)
                 in zip(g.tests, P(jl.delayed(model_build)(t)
                                   for t in g.tests))
                 if m is not None }
      l.info('built %d models (%d at max iterations)'
             % (len(models),
                sum(not m.converged for (_, m) in models.items())))
      l.info('2. Dumping models')
      # These dumps are self-contained enough to be loaded in a Python
      # interpreter that is not QUAC-aware.
      #
      # { outbreak: { horizon: { training: { now: fitted model } } } }
      summs = u.defaultdict_recursive()
      for (ctx, m) in models.items():
         summs[ctx.outbreak][ctx.horizon][ctx.training][ctx.now] = m
      for (ob, ob_data) in summs.as_dict().items():
         pickle_dump(ob, 'model', ob_data)
      l.info('3. Evaluating models')
      # Evaluations run in ~0.15s (according to joblib), so it's not clear to
      # me that distributing the computation outweighs overhead.
      #
      # { Context: Series [predicted incidence]
      #     index: period
      #     values: prediction }
      preds = dict(model_predict(cm) for cm in models.items())
      l.info('4. Aggregating results')
      # Re-key so we can aggregate the nows
      #
      # [ ((outbreak, training, horizon),
      #    (now, Series [predicted incidence])), ... ]
      #
      # Sorting is what makes the groupby below correct: groupby only
      # merges adjacent equal keys.
      preds = sorted(((ctx.outbreak, ctx.training, ctx.horizon),
                      (ctx.now, p))
                     for (ctx, p) in preds.items())
      # Aggregate into DataFrames.
      #
      # { (outbreak, training, horizon): DataFrame [predicted incidence]
      #     index: period
      #     columns: now }
      preds = { k: model_summarize(preds)
                for (k, preds)
                in itertools.groupby(preds, operator.itemgetter(0)) }
      l.info('5. Dumping results')
      # Gather by outbreak
      #
      # { outbreak: { horizon: { training: DataFrame [predicted incidence] } } }
      preds2 = u.defaultdict_recursive()
      for ((ob, tr, ho), df) in preds.items():
         preds2[ob][ho][tr] = df
      # For each outbreak, dump a pickle file containing the dict above. These
      # are then translated to TSV files for plotting in later steps.
      for (ob, ob_data) in preds2.as_dict().items():
         pickle_dump(ob, 'out', ob_data)
   l.info('done in %s' % u.fmt_seconds(time.time() - start_time))
def main(self):
   """Run the scheduled test sequence against the tweet database.

      Normalizes the start/end window (defaulting to the tweet table's full
      span), builds the schedule, then runs each test — or reloads saved
      results for tests skipped via --start-test — and shrinks results to
      disk after each one."""
   u.memory_use_log()
   t_start = time.time()
   db = db_glue.DB(self.args.database_file)
   l.info('opened database %s' % (self.args.database_file))
   assert (db.metadata_get('schema_version') == '5')
   # normalize start and end times
   if (self.args.start is None):
      sql = 'SELECT min(created_at) AS "st [timestamp]" FROM tweet'
      self.args.start = db.sql(sql)[0]['st']
   if (self.args.end is None):
      sql = 'SELECT max(created_at) AS "et [timestamp]" FROM tweet'
      # add one second because end time is exclusive
      self.args.end = db.sql(sql)[0]['et'] + timedelta(seconds=1)
   self.args.start = time_.as_utc(self.args.start)
   self.args.end = time_.as_utc(self.args.end)
   # print test sequence parameters
   self.log_parameters()
   # set up model parameters
   model_class = u.class_by_name(self.args.model)
   model_class.parms_init(self.args.model_parms, log_parms=True)
   # build schedule
   self.schedule_build(self.args.limit)
   l.info('scheduled %s tests (%s left over)'
          % (len(self.schedule), self.args.end - self.schedule[-1].end))
   if (not os.path.exists(self.args.output_dir)):
      os.mkdir(self.args.output_dir)
   l.info('results in %s' % (self.args.output_dir))
   # testing loop
   for (i, t) in enumerate(self.schedule):
      if (i + 1 < self.args.start_test):
         l.info('using saved test %d per --start-test' % (i + 1))
         l.warning('token and tweet counts will be incorrect')
         # FIXME: hack.....
         try:
            t.model = u.Deleted_To_Save_Memory()
            t.results = u.Deleted_To_Save_Memory()
            t.i = i
            t.train_tweet_ct = -1e6
            t.train_token_ct = -1e6
            t.test_tweet_ct = -1e6
            t.unshrink_from_disk(self.args.output_dir, results=True)
            t.attempted = True
         except IOError as x:  # FIX: was Py2-only "except IOError, x:"
            if (x.errno != 2):  # errno 2 == ENOENT; anything else is fatal
               raise
            t.attempted = False
      else:
         l.info('starting test %d of %d: %s'
                % (i + 1, len(self.schedule), t))
         t.do_test(model_class, db, self.args, i)
      t.summarize()
      if (t.attempted):
         if (self.args.profile_memory):
            # We dump a memory profile here because it's the high water
            # mark; we're about to reduce usage significantly.
            import meliae.scanner as ms
            filename = 'memory.%d.json' % (i)
            l.info('dumping memory profile %s' % (filename))
            ms.dump_all_objects('%s/%s' % (self.args.output_dir, filename))
         t.shrink_to_disk(self.args.output_dir)
      l.debug('result: %s' % (t.summary))
      u.memory_use_log()
class Test_Sequence(object):
   """A sequence of timed tests run against the tweet database.

      Construct with parsed command-line ``args``, then call ``main()`` to
      normalize the time window, build the schedule, run (or reload) every
      test, and save summaries, TSVs, and a pickled self to the output
      directory. (Python 2 module: uses itertools.ifilter.)"""

   def __init__(self, args):
      self.args = args

   @property
   def first_good_test(self):
      # Any attempted test will give us what we need, but an arbitrary
      # number of tests might not have been attempted.
      return next(itertools.ifilter(lambda t: t.attempted, self.schedule))

   def main(self):
      u.memory_use_log()
      t_start = time.time()
      db = db_glue.DB(self.args.database_file)
      l.info('opened database %s' % (self.args.database_file))
      assert (db.metadata_get('schema_version') == '5')
      # normalize start and end times
      if (self.args.start is None):
         sql = 'SELECT min(created_at) AS "st [timestamp]" FROM tweet'
         self.args.start = db.sql(sql)[0]['st']
      if (self.args.end is None):
         sql = 'SELECT max(created_at) AS "et [timestamp]" FROM tweet'
         # add one second because end time is exclusive
         self.args.end = db.sql(sql)[0]['et'] + timedelta(seconds=1)
      self.args.start = time_.as_utc(self.args.start)
      self.args.end = time_.as_utc(self.args.end)
      # print test sequence parameters
      self.log_parameters()
      # set up model parameters
      model_class = u.class_by_name(self.args.model)
      model_class.parms_init(self.args.model_parms, log_parms=True)
      # build schedule
      self.schedule_build(self.args.limit)
      l.info('scheduled %s tests (%s left over)'
             % (len(self.schedule), self.args.end - self.schedule[-1].end))
      if (not os.path.exists(self.args.output_dir)):
         os.mkdir(self.args.output_dir)
      l.info('results in %s' % (self.args.output_dir))
      # testing loop
      for (i, t) in enumerate(self.schedule):
         if (i + 1 < self.args.start_test):
            l.info('using saved test %d per --start-test' % (i + 1))
            l.warning('token and tweet counts will be incorrect')
            # FIXME: hack.....
            try:
               t.model = u.Deleted_To_Save_Memory()
               t.results = u.Deleted_To_Save_Memory()
               t.i = i
               t.train_tweet_ct = -1e6
               t.train_token_ct = -1e6
               t.test_tweet_ct = -1e6
               t.unshrink_from_disk(self.args.output_dir, results=True)
               t.attempted = True
            except IOError as x:  # FIX: was Py2-only "except IOError, x:"
               if (x.errno != 2):  # errno 2 == ENOENT; anything else fatal
                  raise
               t.attempted = False
         else:
            l.info('starting test %d of %d: %s'
                   % (i + 1, len(self.schedule), t))
            t.do_test(model_class, db, self.args, i)
         t.summarize()
         if (t.attempted):
            if (self.args.profile_memory):
               # We dump a memory profile here because it's the high water
               # mark; we're about to reduce usage significantly.
               import meliae.scanner as ms
               filename = 'memory.%d.json' % (i)
               l.info('dumping memory profile %s' % (filename))
               ms.dump_all_objects('%s/%s'
                                   % (self.args.output_dir, filename))
            t.shrink_to_disk(self.args.output_dir)
         l.debug('result: %s' % (t.summary))
         u.memory_use_log()
      # done!
      l.debug('computing summary')
      self.summarize()
      l.debug('summary: %s' % (self.summary))
      l.debug('saving TSV results')
      test_indices = u.sl_union_fromtext(len(self.schedule), ':')
      self.tsv_save_tests('%s/%s' % (self.args.output_dir, 'tests.tsv'),
                          test_indices)
      l.debug('saving pickled summary')
      self.memory_use = u.memory_use()
      self.memory_use_peak = u.memory_use(True)
      self.time_use = time.time() - t_start
      u.pickle_dump('%s/%s' % (self.args.output_dir, 'summary'), self)
      u.memory_use_log()
      l.info('done in %s' % (u.fmt_seconds(self.time_use)))