def sae_opt(tms, tweets, tokenpoints):
    '''Optimize token weights to minimize SAE over all training tweets.

    Every value of tms first has populate_best_point() called on it. Then,
    for each tweet with at least one relevant model (as selected by
    relevant_gmms() from the tweet's tokens), the relevant models and their
    sae() against the tweet's geom are collected. Both lists are handed to
    optimize.Weight, configured from model_parms, and the result of its
    optimize() call is returned.

    tokenpoints is accepted but not used in this function.'''
    l.debug('preparing token models')
    prep_began = time.time()
    # FIXME: multicore?
    for model in tms.values():
        model.populate_best_point()
    l.debug('done preparing in %s'
            % (u.fmt_seconds(time.time() - prep_began)))

    gmms_list = []
    errors_list = []
    l.debug('computing MSAE for all tweets')
    sae_began = time.time()
    for tw in tweets:
        relevant = relevant_gmms(tw.tokens, tms)
        # Tweets with no relevant model contribute nothing.
        if (len(relevant) != 0):
            per_model_errors = [model.sae(tw.geom) for model in relevant]
            gmms_list.append(relevant)
            errors_list.append(per_model_errors)
    l.debug('done computing SAE in %s'
            % (u.fmt_seconds(time.time() - sae_began)))

    weigher = optimize.Weight(
        gmms_list, errors_list,
        regularizer=model_parms['opt_reg'],
        identity_feature=model_parms['opt_feature_id'],
        misc_feature=model_parms['opt_feature_misc'],
        init_by_feature=model_parms['opt_init'])
    return weigher.optimize()
def fetch(self, db, srid, phase, tzer, fields, unify, excluded=None):
    '''Load, filter, and tokenize the tweets for one phase.

    Rows are selected from table tweet using the condition returned by
    self.where(phase, 'created_at'), with geom passed through ST_Transform
    to srid, and turned into objects with tweet.Tweet.from_dict().

    If excluded is None, every tweet is kept. Otherwise a tweet is kept only
    if its user_screen_name is neither in excluded nor already seen in this
    call (i.e., at most one tweet per user). Kept tweets are tokenized with
    tw.tokenize(tzer, fields, unify).

    Returns (tweets, users): the list of kept tweets and the set of their
    screen names.'''
    # fetch tweets
    columns = (('tweet_id', 'tweet_id'),
               ('created_at', 'created_at'),
               ('day', 'day'),
               ('hour', 'hour'),
               ('text', 'text'),
               ('user_screen_name', 'user_screen_name'),
               ('user_description', 'user_description'),
               ('user_lang', 'user_lang'),
               ('user_location', 'user_location'),
               ('user_time_zone', 'user_time_zone'),
               ('ST_Transform(geom, %d)' % (srid), '"geom [geometry]"'))
    source = "FROM tweet WHERE %s" % (self.where(phase, 'created_at'))
    rows = db.select(columns, source)
    l.debug('fetched %d rows' % (len(rows)))
    tweets_raw = [tweet.Tweet.from_dict(row) for row in rows]
    l.debug('fetched %d tweets' % (len(tweets_raw)))

    # filter out duplicate users
    users = set()
    tweets = list()
    for tw in tweets_raw:
        if (excluded is not None
            and (tw.user_screen_name in excluded
                 or tw.user_screen_name in users)):
            continue
        users.add(tw.user_screen_name)
        tweets.append(tw)
    l.info('%s on %d tweets by %d users' % (phase, len(tweets), len(users)))

    # tokenize tweets
    tokenize_began = time.time()
    for tw in tweets:
        # FIXME: This could be refactored to run in parallel
        tw.tokenize(tzer, fields, unify)
    l.debug('tokenized in %s'
            % (u.fmt_seconds(time.time() - tokenize_began)))

    # done
    return (tweets, users)
def fetch(self, cur, srid, phase, tzer, fields, unify, excluded=None):
    '''Load, filter, and tokenize the tweets for one phase via cursor cur.

    Rows are selected from table tweet using the condition returned by
    self.where(phase, 'created_at') and turned into objects with
    tweet.Tweet.from_dict(). srid is accepted but not used by this variant.

    If excluded is None, every tweet is kept. Otherwise a tweet is kept only
    if its user_screen_name is neither in excluded nor already seen in this
    call (i.e., at most one tweet per user). Kept tweets are tokenized with
    tw.tokenize(tzer, fields, unify).

    Returns (tweets, users): the list of kept tweets and the set of their
    screen names.

    If the query fails, the failure is logged and the original exception is
    re-raised, so callers see the real database error and its traceback.'''
    # fetch tweets
    try:
        cur.execute(
            "SELECT tweet_id as tweet_id, created_at as created_at, day as day, \
            hour as hour, text as text, user_screen_name as user_screen_name, \
            user_description as user_description, user_lang as user_lang, \
            user_location as user_location, user_time_zone as user_time_zone, \
            lat as lat, lon as lon, geotagged as geom_src \
            FROM tweet WHERE {0}".format(self.where(phase, 'created_at')))
        rows = cur.fetchall()
    except Exception:
        # Not a bare "except:", which would also trap KeyboardInterrupt and
        # SystemExit. Re-raise as-is rather than masking the cause with a
        # blank Exception.
        l.info("tweet selection from db failed")
        raise
    l.debug('fetched %d rows' % (len(rows)))
    tweets_raw = [tweet.Tweet.from_dict(row) for row in rows]
    l.debug('fetched %d tweets' % (len(tweets_raw)))

    # filter out duplicate users
    users = set()
    tweets = list()
    for tw in tweets_raw:
        if (excluded is None
            or (tw.user_screen_name not in excluded
                and tw.user_screen_name not in users)):
            users.add(tw.user_screen_name)
            tweets.append(tw)
    l.info('%s on %d tweets by %d users' % (phase, len(tweets), len(users)))

    # tokenize tweets
    t = time.time()
    for tw in tweets:
        # FIXME: This could be refactored to run in parallel
        tw.tokenize(tzer, fields, unify)
    l.debug('tokenized in %s' % (u.fmt_seconds(time.time() - t)))

    # done
    return (tweets, users)
def optimize(self):
    '''Minimize self.func with L-BFGS-B and return a dict of token->weight.

    The starting point comes from initialize_random() when init_by_feature
    is the empty string, otherwise from initialize_from_feature(). After
    minimization, score_gmms() is called with the solution; each model in
    all_gmms then contributes its first token key mapped to its score,
    floored at min_value. Finally, feature_vector is cleared on every
    model.'''
    if self.init_by_feature == '':
        initializer = self.initialize_random
    else:
        initializer = self.initialize_from_feature
    start_point = initializer()

    began = time.time()
    l.debug('minimizing obj f\'n with %d weights...'
            % len(self.feature_alphabet))
    l.debug('initial function value=%g' % self.func(start_point))
    solution = scopt.minimize(self.func, start_point,
                              method='L-BFGS-B',
                              jac=self.func_deriv,
                              options={'disp': self.verbose},
                              tol=1e-4)
    l.debug('minimized in %s; %d f calls and %d f\' calls (%d cache hits)'
            % (u.fmt_seconds(time.time() - began),
               self.n_fun_calls, self.n_deriv_calls, self.n_cache_hits))
    l.debug('final function value=%g' % self.func(solution.x))

    self.score_gmms(solution.x)
    token_weights = dict()
    for gmm in self.all_gmms:
        token = next(gmm.tokens.iterkeys())
        token_weights[token] = max(self.min_value, gmm.score)

    if self.verbose:
        for (name, index) in self.feature_alphabet.iteritems():
            l.debug('feature weight %s=%g' % (name, solution.x[index]))
        for (token, weight) in token_weights.iteritems():
            l.debug('token weight %s=%s' % (token, str(weight)))

    # clean up
    for gmm in self.all_gmms:
        gmm.feature_vector = None
    return token_weights
def optimize(self):
    '''Minimize self.func with L-BFGS-B and return a dict of token->weight.

    The starting point comes from initialize_random() when init_by_feature
    is the empty string, otherwise from initialize_from_feature(). After
    minimization, score_gmms() is called with the solution; each model in
    all_gmms then contributes its first token key mapped to its score,
    floored at min_value. Finally, feature_vector is cleared on every
    model.'''
    if self.init_by_feature == '':
        initializer = self.initialize_random
    else:
        initializer = self.initialize_from_feature
    start_point = initializer()

    began = time.time()
    l.debug('minimizing obj f\'n with %d weights...'
            % len(self.feature_alphabet))
    l.debug('initial function value=%g' % self.func(start_point))
    solution = scopt.minimize(self.func, start_point,
                              method='L-BFGS-B',
                              jac=self.func_deriv,
                              options={'disp': self.verbose},
                              tol=1e-4)
    l.debug('minimized in %s; %d f calls and %d f\' calls (%d cache hits)'
            % (u.fmt_seconds(time.time() - began),
               self.n_fun_calls, self.n_deriv_calls, self.n_cache_hits))
    l.debug('final function value=%g' % self.func(solution.x))

    self.score_gmms(solution.x)
    token_weights = dict()
    for gmm in self.all_gmms:
        token = next(gmm.tokens.iterkeys())
        token_weights[token] = max(self.min_value, gmm.score)

    if self.verbose:
        for (name, index) in self.feature_alphabet.iteritems():
            l.debug('feature weight %s=%g' % (name, solution.x[index]))
        for (token, weight) in token_weights.iteritems():
            l.debug('token weight %s=%s' % (token, str(weight)))

    # clean up
    for gmm in self.all_gmms:
        gmm.feature_vector = None
    return token_weights
def do_test(self, m_class, db, args, i):
    '''Run test number i: fetch tweets, build a model, and test it.

    Training and testing tweets are loaded with self.fetch(). Unless
    args.dup_users is set, duplicate users are filtered within the training
    set, and users seen in training are excluded from the testing set.

    If args.skip_small_tests is set and enough_data_p() rejects the tweet
    counts, the test is skipped: attempted is set False, results is set to
    an empty list, and nothing else happens. Otherwise attempted is set
    True, a model of class m_class is built from the grouped training
    tokens, and the (possibly downsampled) test tweets are evaluated with
    multicore.do(); the outcome is stored in self.results.'''
    self.i = i
    # create tokenizer
    tokenizer_class = u.class_by_name(args.tokenizer)
    tzer = tokenizer_class(args.ngram)

    # load training & testing tweets from database
    if args.dup_users:
        excluded = None
    else:
        excluded = set()
    (tr_tweets, tr_users) = self.fetch(db, args.srid, 'training', tzer,
                                       args.fields, args.unify_fields,
                                       excluded)
    if args.dup_users:
        excluded = None
    else:
        excluded = tr_users
    (te_tweets, _) = self.fetch(db, args.srid, 'testing', tzer,
                                args.fields, args.unify_fields, excluded)

    if (args.skip_small_tests
        and not self.enough_data_p(len(tr_tweets), len(te_tweets))):
        l.info('insufficient data, skipping test %s ' % (self))
        self.attempted = False
        self.results = []
        return
    self.attempted = True

    # tokenize training tweets
    tr_tokens = self.group_tokens(tr_tweets,
                                  args.trim_head, args.min_instances)
    self.train_tweet_ct = len(tr_tweets)
    self.train_token_ct = len(tr_tokens)

    # downsample test tweets
    if (len(te_tweets) > args.test_tweet_limit):
        te_tweets = u.rand.sample(te_tweets, args.test_tweet_limit)
        l.info('sampled %d test tweets per --test-tweet-limit'
               % (args.test_tweet_limit))
    self.test_tweet_ct = len(te_tweets)

    # build model
    self.model = m_class(tr_tokens, args.srid, tr_tweets)
    l.debug('starting model build')
    build_began = time.time()
    self.model.build()
    l.info('built model in %s'
           % (u.fmt_seconds(time.time() - build_began)))

    # test 'em
    test_began = time.time()
    self.results = multicore.do(test_tweet,
                                (self.model, args.fields), te_tweets)
    l.info('tested tweets in %s'
           % (u.fmt_seconds(time.time() - test_began)))
def wt_inv_error(tms, tweets, tokenpts, errattr):
    '''Return a dict mapping each token to min(1, |1/(1 + E^x)|).

    E is the error reported for the token by model_error, which is run over
    the values of tms via multicore.do() with arguments (errattr, tokenpts);
    x is model parm wt_inv_error_exponent.

    Per the original notes on this function: E is the mean error between the
    token and each tweet having that token, using measure errattr ('sae' or
    'cae'); the number of samples used in computing CAE is model parm
    wt_inv_sample_ct; and a token with fewer tweets than model parm
    wt_inv_min_tweets gets weight 0. None of that is implemented here, so it
    presumably lives in model_error -- TODO confirm.

    tweets is accepted but not used in this function. An empty tms yields an
    empty dict.'''
    l.debug('computing inverse errors')
    t1 = time.time()
    # We work in chunks to keep memory use down. The chunk size is currently
    # not configurable, though we could make it so if needed.
    models = tms.values()
    weights = dict()
    x = model_parms['wt_inv_error_exponent']
    for chunk in u.groupn(models, 20000):
        weights.update((tok, min(1, abs(1 / (1 + err**x))))
                       for (tok, err)
                       in multicore.do(model_error,
                                       (errattr, tokenpts), chunk))
        l.debug('inverse error chunk completed')
    dur = time.time() - t1
    # With no models there is no per-token time; avoid dividing by zero just
    # to produce a log message.
    model_ct = len(models)
    per_token = (dur / model_ct) if model_ct else float('nan')
    l.debug('computed inverse errors in %s (%.2gs per token)'
            % (u.fmt_seconds(dur), per_token))
    return weights
def fetch(self, db, srid, phase, tzer, fields, unify, excluded=None):
    '''Load, filter, and tokenize the tweets for one phase.

    Rows are selected from table tweet using the condition returned by
    self.where(phase, 'created_at'), with geom passed through ST_Transform
    to srid, and turned into objects with tweet.Tweet.from_dict().

    If excluded is None, every tweet is kept. Otherwise a tweet is kept only
    if its user_screen_name is neither in excluded nor already seen in this
    call (i.e., at most one tweet per user). Kept tweets are tokenized with
    tw.tokenize(tzer, fields, unify).

    Returns (tweets, users): the list of kept tweets and the set of their
    screen names.'''
    # fetch tweets
    columns = (('tweet_id', 'tweet_id'),
               ('created_at', 'created_at'),
               ('day', 'day'),
               ('hour', 'hour'),
               ('text', 'text'),
               ('user_screen_name', 'user_screen_name'),
               ('user_description', 'user_description'),
               ('user_lang', 'user_lang'),
               ('user_location', 'user_location'),
               ('user_time_zone', 'user_time_zone'),
               ('ST_Transform(geom, %d)' % (srid), '"geom [geometry]"'))
    source = "FROM tweet WHERE %s" % (self.where(phase, 'created_at'))
    rows = db.select(columns, source)
    l.debug('fetched %d rows' % (len(rows)))
    tweets_raw = [tweet.Tweet.from_dict(row) for row in rows]
    l.debug('fetched %d tweets' % (len(tweets_raw)))

    # filter out duplicate users
    users = set()
    tweets = list()
    for tw in tweets_raw:
        if (excluded is not None
            and (tw.user_screen_name in excluded
                 or tw.user_screen_name in users)):
            continue
        users.add(tw.user_screen_name)
        tweets.append(tw)
    l.info('%s on %d tweets by %d users' % (phase, len(tweets), len(users)))

    # tokenize tweets
    tokenize_began = time.time()
    for tw in tweets:
        # FIXME: This could be refactored to run in parallel
        tw.tokenize(tzer, fields, unify)
    l.debug('tokenized in %s'
            % (u.fmt_seconds(time.time() - tokenize_began)))

    # done
    return (tweets, users)
def wt_inv_error(tms, tweets, tokenpts, errattr):
    '''Return a dict mapping each token to min(1, |1/(1 + E^x)|).

    E is the error reported for the token by model_error, which is run over
    the values of tms via multicore.do() with arguments (errattr, tokenpts);
    x is model parm wt_inv_error_exponent.

    Per the original notes on this function: E is the mean error between the
    token and each tweet having that token, using measure errattr ('sae' or
    'cae'); the number of samples used in computing CAE is model parm
    wt_inv_sample_ct; and a token with fewer tweets than model parm
    wt_inv_min_tweets gets weight 0. None of that is implemented here, so it
    presumably lives in model_error -- TODO confirm.

    tweets is accepted but not used in this function. An empty tms yields an
    empty dict.'''
    l.debug('computing inverse errors')
    t1 = time.time()
    # We work in chunks to keep memory use down. The chunk size is currently
    # not configurable, though we could make it so if needed.
    models = tms.values()
    weights = dict()
    x = model_parms['wt_inv_error_exponent']
    for chunk in u.groupn(models, 20000):
        weights.update((tok, min(1, abs(1 / (1 + err**x))))
                       for (tok, err)
                       in multicore.do(model_error,
                                       (errattr, tokenpts), chunk))
        l.debug('inverse error chunk completed')
    dur = time.time() - t1
    # With no models there is no per-token time; avoid dividing by zero just
    # to produce a log message.
    model_ct = len(models)
    per_token = (dur / model_ct) if model_ct else float('nan')
    l.debug('computed inverse errors in %s (%.2gs per token)'
            % (u.fmt_seconds(dur), per_token))
    return weights
def main(self):
    '''Run the whole test sequence and save its results.

    Fills in args.start and args.end from the minimum and maximum created_at
    in self.table when they are None, initializes the model class named by
    args.model, builds the schedule, and then walks it. Tests numbered below
    args.start_test are reloaded from args.output_dir instead of being run;
    a saved test whose file is missing (errno 2) is marked not attempted,
    and any other IOError propagates. Attempted tests are shrunk to disk.
    Finally the sequence is summarized, tests.tsv is written, and self is
    pickled to the file 'summary' in args.output_dir.'''
    u.memory_use_log()
    t_start = time.time()
    # Replaced with self.cur in __init__
    # db = db_glue.DB(self.args.database_file)
    # assert (db.metadata_get('schema_version') == '5')
    # normalize start and end times
    if (self.args.start is None):
        sql = 'SELECT min(created_at) AS st FROM {0};'.format(self.table)
        self.cur.execute(sql)
        self.args.start = self.cur.fetchone()[0]
    if (self.args.end is None):
        sql = 'SELECT max(created_at) AS et FROM {0};'.format(self.table)
        self.cur.execute(sql)
        # add one second because end time is exclusive
        self.args.end = self.cur.fetchone()[0] + timedelta(seconds=1)
    self.args.start = time_.as_utc(self.args.start)
    self.args.end = time_.as_utc(self.args.end)
    # print test sequence parameters
    self.log_parameters()
    # set up model parameters
    model_class = u.class_by_name(self.args.model)
    model_class.parms_init(self.args.model_parms, log_parms=True)
    # build schedule
    self.schedule_build(self.args.limit)
    l.info('scheduled %s tests (%s left over)'
           % (len(self.schedule), self.args.end - self.schedule[-1].end))
    if (not os.path.exists(self.args.output_dir)):
        os.mkdir(self.args.output_dir)
    l.info('results in %s' % (self.args.output_dir))
    # testing loop
    for (i, t) in enumerate(self.schedule):
        if (i+1 < self.args.start_test):
            l.info('using saved test %d per --start-test' % (i+1))
            l.warning('token and tweet counts will be incorrect')
            # FIXME: hack.....
            try:
                t.model = u.Deleted_To_Save_Memory()
                t.results = u.Deleted_To_Save_Memory()
                t.i = i
                t.train_tweet_ct = -1e6
                t.train_token_ct = -1e6
                t.test_tweet_ct = -1e6
                t.unshrink_from_disk(self.args.output_dir, results=True)
                t.attempted = True
            except IOError as x:
                # errno 2 means the saved test file does not exist, i.e. the
                # test was never attempted; anything else is a real error.
                if (x.errno != 2):
                    raise
                t.attempted = False
        else:
            l.info('starting test %d of %d: %s'
                   % (i+1, len(self.schedule), t))
            t.do_test(model_class, self.cur, self.args, i)
        t.summarize()
        if (t.attempted):
            if (self.args.profile_memory):
                # We dump a memory profile here because it's the high water
                # mark; we're about to reduce usage significantly.
                import meliae.scanner as ms
                filename = 'memory.%d.json' % (i)
                l.info('dumping memory profile %s' % (filename))
                ms.dump_all_objects('%s/%s'
                                    % (self.args.output_dir, filename))
            t.shrink_to_disk(self.args.output_dir)
        l.debug('result: %s' % (t.summary))
        u.memory_use_log()
    # done!
    l.debug('computing summary')
    self.summarize()
    l.debug('summary: %s' % (self.summary))
    l.debug('saving TSV results')
    test_indices = u.sl_union_fromtext(len(self.schedule), ':')
    self.tsv_save_tests('%s/%s' % (self.args.output_dir, 'tests.tsv'),
                        test_indices)
    l.debug('saving pickled summary')
    self.memory_use = u.memory_use()
    self.memory_use_peak = "Not implemented"
    self.time_use = time.time() - t_start
    u.pickle_dump('%s/%s' % (self.args.output_dir, 'summary'), self)
    u.memory_use_log()
    l.info('done in %s' % (u.fmt_seconds(self.time_use)))
# Benchmark fragment: time bulk insertion of vectors into a Berkeley DB
# B-tree, then compact and close the database. Relies on db, bdb, np, l, u,
# pprint, outer_ct, and inner_ct being set up earlier in the script.
db.set_cachesize(0, 32 * 1024 * 1024)
db.set_pagesize(64 * 1024)
db.open('/data6/foo.db', dbtype=bdb.DB_BTREE, flags=(bdb.DB_CREATE))

start_out = time.time()
for j in range(outer_ct):
    # One timed batch of inner_ct puts, followed by a sync.
    start = time.time()
    for i in range(inner_ct):
        # Key is the running vector number as UTF-8 text; value is a
        # 720-element int32 array of ones.
        db.put(
            str(j * inner_ct + i).encode('UTF-8'),
            np.ones(720, dtype=np.int32))
    db.sync()
    end = time.time()
    elapsed = end - start
    # Logs batch size, batch duration, rate, and fraction of the run done.
    l.info('%d vectors in %s (%d/s), %.3f'
           % (inner_ct, u.fmt_seconds(elapsed), inner_ct / elapsed,
              (j + 1) * inner_ct / (outer_ct * inner_ct)))
    u.memory_use_log()

# Print stats before and after compaction so they can be compared.
l.info('compacting database')
pprint(db.stat())
db.compact(flags=bdb.DB_FREE_SPACE)
l.info('closing database')
pprint(db.stat())
db.close()

# Overall figures; the elapsed time includes compaction and close.
end_out = time.time()
elapsed_out = end_out - start_out
l.info('%d vectors in %s (%d/s)'
       % (outer_ct * inner_ct, u.fmt_seconds(elapsed_out),
          (outer_ct * inner_ct) / elapsed_out))
u.memory_use_log()
# Benchmark fragment: time bulk insertion of vectors into SQL table ts.
# Relies on db, conn, np, l, u, os, outer_ct, and inner_ct being set up
# earlier in the script. NOTE(review): db looks like a cursor on connection
# conn -- confirm.
db.execute('PRAGMA synchronous = OFF')
db.execute('CREATE TABLE ts (namespace TEXT, name TEXT, total INT, data TEXT)')
db.execute('CREATE INDEX ts_idx ON ts (namespace, name)')

start_out = time.time()
for j in range(outer_ct):
    # One timed batch: inner_ct rows in a single executemany, then commit.
    start = time.time()
    # Names are 10 times the running row number, as text; data is the
    # buffer of a 720-element int32 array of ones.
    db.executemany('INSERT INTO ts VALUES (?, ?, ?, ?)',
                   (('en', str(10 * (j * inner_ct + i)), 8675309,
                     np.ones(720, dtype=np.int32).data)
                    for i in range(inner_ct)))
    conn.commit()
    end = time.time()
    elapsed = end - start
    # Logs batch size, batch duration, rate, rows so far, and fraction done.
    l.info('inserted %d vectors in %s (%d/s), %d, %.3f'
           % (inner_ct, u.fmt_seconds(elapsed), inner_ct/elapsed,
              (j+1)*inner_ct, (j+1)*inner_ct/(outer_ct*inner_ct)))
    #u.memory_use_log()

# Runs the external command clear-disk-cache, then restarts the overall
# timer for whatever follows this fragment.
os.system('clear-disk-cache')
start_out = time.time()
# for j in range(outer_ct):
#     start = time.time()
#     db.execute('begin')
#     insert = list(range(0, inner_ct, 100))
#     for i in insert:
#         db.execute('UPDATE ts SET total=?, data=? WHERE namespace=? AND name=?',
#                    (1, np.zeros(720, dtype=np.int32).data,
#                     'en', str(10 * (j * inner_ct + i))))
def main():
    '''Run the forecasting experiment on Spark.

    Sets up a SparkContext, broadcasts the arguments, ground truth, and test
    list, and then runs the numbered stages below: distribute shard indexes,
    find candidate articles, build and dump models, and compute, aggregate,
    and dump predictions. Several values are published as module globals
    (args_b, truth, truth_b, tests, tests_b, article_ct, eval_elapsed) for
    use by the functions mapped over the RDDs.

    The closing log line reports evaluation counts and time; when nothing
    was evaluated, the per-evaluation time is reported as nan rather than
    raising ZeroDivisionError.'''
    l.info('starting')
    args_clean()

    # set up Spark
    conf = pyspark.SparkConf()
    conf.setExecutorEnv('PYTHONPATH', QUACLIB)
    if (args.profile):
        conf.set('spark.python.profile', 'true')
    sc = pyspark.SparkContext(conf=conf)
    global args_b
    args_b = sc.broadcast(args)

    # load ground truth data
    global truth
    truth = truth_load()
    l.info('found truth with %d outbreaks' % len(truth.columns))
    global truth_b
    truth_b = sc.broadcast(truth)

    # find dataset
    shard_ct = shards_count()
    l.info('found dataset with %d shards' % shard_ct)
    if (args.shards is not None):
        shard_ct = args.shards
    l.info('will process %d shards' % shard_ct)

    # figure out what tests to do
    global tests
    tests = tests_enumerate()
    l.info('planning %d tests' % len(tests))
    global tests_b
    tests_b = sc.broadcast(tests)

    # some timing accumulators
    global article_ct
    article_ct = sc.accumulator(0)
    global eval_elapsed
    eval_elapsed = sc.accumulator(0)

    # let's go
    l.info('starting computation')

    # 1. Distribute shard indexes
    #
    shards = sc.parallelize(range(shard_ct), shard_ct)

    # 2. Find candidate articles
    #
    # 2a. Find top candidates within each shard for each context
    #
    #     key: Context
    #     val: Priority_Queue:
    #          pri: r [correlation with ground truth on training data]
    #          val: (Series [complete time series, .name is URL],
    #                Series [shifted/truncated training data, .name is URL])
    cands = shards.flatMap(candidates_read)

    # 2b. Find global top candidates for each context
    #
    #     (form same as above)
    cands = cands.reduceByKey(candidates_merge)
    cands.cache()

    # 2c. Dump top candidate summaries
    #
    #     articles and correlations for each context
    #     key: outbreak
    #     val: dict:
    #          key: (training duration (timedelta),
    #                forecast horizon (timedelta),
    #                now (datetime))
    #          val: articles (ordered list of (URL, r))
    #l.info('dumping candidate summaries')
    #summs = cands.map(candidate_summarize) \
    #             .reduceByKey(u.dicts_merge)
    #summs.foreach(pickle_dump('r'))

    # 3. Build models
    #
    # 3a. Build a model for each context
    #
    #     key: Context
    #     val: (sk.LinearModel [fitted model],
    #           DataFrame [full candidate time series, URL columns],
    #           DataFrame [training candidate time series, URL columns])
    #
    #     Order of coefficients in model and DataFrame are the same.
    models = cands.map(model_build)
    models.cache()

    # 3b. Dump models and article data for each context. These dumps are
    #     self-contained enough to be loaded in a Python interpreter that is
    #     not QUAC-aware. This should produce a few 10's of GiB of data.
    #
    #     key: outbreak
    #     val: { horizon:
    #            { training:
    #              { now:
    #                { 'model': sk.LinearModel [fitted model],
    #                  'data': DataFrame [full data, URL columns],
    #                  'trdata': DataFrame [training data, URL columns] }}}}
    summs = models.map(model_summarize) \
                  .reduceByKey(u.dicts_merge)
    summs.foreach(pickle_dump('model'))

    # 4. Evaluate models
    #
    # 4a. Compute predicted values
    #
    #     key: Context
    #     val: Series [predicted incidence]
    #          index: period
    #          values: prediction)
    preds = models.map(model_predict)

    # 4b. Re-key results to put nows in value
    #
    #     key: (outbreak, training duration, forecast horizon)
    #     val: (now, Series [predicted incidence])
    preds = preds.map(lambda x: ((x[0].outbreak, x[0].training, x[0].horizon),
                                 (x[0].now, x[1])))

    # 4c. Summarize results (~2K keys)
    #
    #     key: (outbreak, training duration, forecast horizon)
    #     val: (DataFrame [predicted incidence]:
    #           index: period
    #           columns: nows)
    preds = preds.groupByKey() \
                 .map(model_result_summarize)

    # 4d. Gather results by outbreak (~20 keys, ~20MB/key)
    #
    #     key: outbreak
    #     val: dict:
    #          key: forecast horizon
    #          val: dict:
    #               key: training duration
    #               val: DataFrame [predicted incidence]
    #
    #     Note: we could also use a Panel4D for this, but I haven't put in
    #     the effort to wrap my head around it.
    preds = preds.map(lambda x: (x[0][0], { x[0][2]: { x[0][1]: x[1] } })) \
                 .reduceByKey(u.dicts_merge)

    # 4e. Dump predictions
    #
    #     For each outbreak, dump a pickle file containing the dict above.
    #     These are then translated to TSV files for plotting in later steps.
    preds.foreach(pickle_dump('out'))

    # finish up
    eval_ct = article_ct.value * len(tests)
    # No articles or no tests means no evaluations; don't let the summary
    # line crash the run (and skip the profile dump) by dividing by zero.
    if (eval_ct != 0):
        us_per_eval = eval_elapsed.value * 1e6 / eval_ct
    else:
        us_per_eval = float('nan')
    l.info('evaluated: %d articles, %d contexts, %d total; %s (%.0f µs/eval)'
           % (article_ct.value, len(tests), eval_ct,
              u.fmt_seconds(eval_elapsed.value), us_per_eval))
    l.info('done')
    try:
        sc.dump_profiles(args.outdir)
        #sc.show_profiles()
    except AttributeError:
        # presumably dump_profiles is missing on some Spark versions;
        # profiles are optional, so carry on without them -- confirm
        pass
# Benchmark fragment: time bulk insertion of vectors into SQL table ts.
# Relies on db, conn, np, l, u, os, outer_ct, and inner_ct being set up
# earlier in the script. NOTE(review): db looks like a cursor on connection
# conn -- confirm.
db.execute('PRAGMA synchronous = OFF')
db.execute('CREATE TABLE ts (namespace TEXT, name TEXT, total INT, data TEXT)')
db.execute('CREATE INDEX ts_idx ON ts (namespace, name)')

start_out = time.time()
for j in range(outer_ct):
    # One timed batch: inner_ct rows in a single executemany, then commit.
    start = time.time()
    # Names are 10 times the running row number, as text; data is the
    # buffer of a 720-element int32 array of ones.
    db.executemany('INSERT INTO ts VALUES (?, ?, ?, ?)',
                   (('en', str(10 * (j * inner_ct + i)), 8675309,
                     np.ones(720, dtype=np.int32).data)
                    for i in range(inner_ct)))
    conn.commit()
    end = time.time()
    elapsed = end - start
    # Logs batch size, batch duration, rate, rows so far, and fraction done.
    l.info('inserted %d vectors in %s (%d/s), %d, %.3f'
           % (inner_ct, u.fmt_seconds(elapsed), inner_ct / elapsed,
              (j + 1) * inner_ct, (j + 1) * inner_ct / (outer_ct * inner_ct)))
    #u.memory_use_log()

# Runs the external command clear-disk-cache, then restarts the overall
# timer for whatever follows this fragment.
os.system('clear-disk-cache')
start_out = time.time()
# for j in range(outer_ct):
#     start = time.time()
#     db.execute('begin')
#     insert = list(range(0, inner_ct, 100))
#     for i in insert:
#         db.execute('UPDATE ts SET total=?, data=? WHERE namespace=? AND name=?',
#                    (1, np.zeros(720, dtype=np.int32).data,
#                     'en', str(10 * (j * inner_ct + i))))
def main():
    '''Run the forecasting experiment using joblib.

    Loads the truth, graph, and vector inputs into g, enumerates the tests,
    and pickles the inputs. Then, in five logged steps: builds one model per
    test in parallel (dropping tests whose build returned None), dumps the
    models per outbreak, computes predictions serially, aggregates them into
    one result per (outbreak, training, horizon), and dumps the results per
    outbreak.'''
    l.info('starting')
    began = time.time()
    args_clean()
    g.args = args
    u.memory_use_log(level=l.info)

    l.info('loading input data')
    g.truth = truth_load()
    g.graph = graph_load()
    g.vectors = vectors_load()
    u.memory_use_log(level=l.info)
    g.tests = tests_enumerate()
    l.info('scheduled %d tests' % len(g.tests))

    l.info('saving input data')
    pickle_dump('input', None, g)

    with jl.Parallel(n_jobs=-1, verbose=5) as P:

        l.info('1. Building models')
        #
        # { Context: sk.LinearModel [fitted model] }
        models = dict()
        for (ctx, fitted) in zip(g.tests,
                                 P(jl.delayed(model_build)(t)
                                   for t in g.tests)):
            if fitted is not None:
                models[ctx] = fitted
        unconverged_ct = sum(not m.converged for m in models.values())
        l.info('built %d models (%d at max iterations)'
               % (len(models), unconverged_ct))

        l.info('2. Dumping models')
        # These dumps are self-contained enough to be loaded in a Python
        # interpreter that is not QUAC-aware.
        #
        # { outbreak: { horizon: { training: { now: fitted model } } } }
        by_outbreak = u.defaultdict_recursive()
        for (ctx, fitted) in models.items():
            by_outbreak[ctx.outbreak][ctx.horizon][ctx.training][ctx.now] \
                = fitted
        for (ob, ob_data) in by_outbreak.as_dict().items():
            pickle_dump(ob, 'model', ob_data)

        l.info('3. Evaluating models')
        # Evaluations run in ~0.15s (according to joblib), so it's not clear
        # to me that distributing the computation outweighs overhead.
        #
        # { Context: Series [predicted incidence]
        #            index: period
        #            values: prediction }
        predicted = dict(model_predict(item) for item in models.items())

        l.info('4. Aggregating results')
        # Re-key so we can aggregate the nows
        #
        # [ ((outbreak, training, horizon),
        #    (now, Series [predicted incidence])), ... ]
        rekeyed = sorted(((ctx.outbreak, ctx.training, ctx.horizon),
                          (ctx.now, series))
                         for (ctx, series) in predicted.items())
        # Aggregate into DataFrames. groupby needs its input sorted by the
        # grouping key, which the sort above provides.
        #
        # { (outbreak, training, horizon): DataFrame [predicted incidence]
        #                                  index: period
        #                                  columns: now }
        summarized = dict()
        for (key, group) in itertools.groupby(rekeyed,
                                              operator.itemgetter(0)):
            summarized[key] = model_summarize(group)

        l.info('5. Dumping results')
        # Gather by outbreak
        #
        # { outbreak: { horizon: { training: DataFrame } } }
        gathered = u.defaultdict_recursive()
        for ((ob, tr, ho), df) in summarized.items():
            gathered[ob][ho][tr] = df
        # For each outbreak, dump a pickle file containing the dict above.
        # These are then translated to TSV files for plotting in later
        # steps.
        for (ob, ob_data) in gathered.as_dict().items():
            pickle_dump(ob, 'out', ob_data)

    l.info('done in %s' % u.fmt_seconds(time.time() - began))
class Test_Sequence(object):
    '''A scheduled sequence of tests run against a tweet database.'''

    def __init__(self, args):
        '''Remember args, the options object read throughout main().'''
        self.args = args

    @property
    def first_good_test(self):
        '''The first test in the schedule that was attempted.

        Raises StopIteration if no test was attempted.'''
        # Any attempted test will give us what we need, but an arbitrary
        # number of tests might not have been attempted.
        return next(t for t in self.schedule if t.attempted)

    def main(self):
        '''Run the whole test sequence and save its results.

        Opens the database named by args.database_file (which must be at
        schema version 5), fills in args.start and args.end from the minimum
        and maximum created_at in table tweet when they are None,
        initializes the model class named by args.model, builds the
        schedule, and then walks it. Tests numbered below args.start_test
        are reloaded from args.output_dir instead of being run; a saved test
        whose file is missing (errno 2) is marked not attempted, and any
        other IOError propagates. Attempted tests are shrunk to disk.
        Finally the sequence is summarized, tests.tsv is written, and self
        is pickled to the file 'summary' in args.output_dir.'''
        u.memory_use_log()
        t_start = time.time()
        db = db_glue.DB(self.args.database_file)
        l.info('opened database %s' % (self.args.database_file))
        assert (db.metadata_get('schema_version') == '5')
        # normalize start and end times
        if (self.args.start is None):
            sql = 'SELECT min(created_at) AS "st [timestamp]" FROM tweet'
            self.args.start = db.sql(sql)[0]['st']
        if (self.args.end is None):
            sql = 'SELECT max(created_at) AS "et [timestamp]" FROM tweet'
            # add one second because end time is exclusive
            self.args.end = db.sql(sql)[0]['et'] + timedelta(seconds=1)
        self.args.start = time_.as_utc(self.args.start)
        self.args.end = time_.as_utc(self.args.end)
        # print test sequence parameters
        self.log_parameters()
        # set up model parameters
        model_class = u.class_by_name(self.args.model)
        model_class.parms_init(self.args.model_parms, log_parms=True)
        # build schedule
        self.schedule_build(self.args.limit)
        l.info('scheduled %s tests (%s left over)'
               % (len(self.schedule), self.args.end - self.schedule[-1].end))
        if (not os.path.exists(self.args.output_dir)):
            os.mkdir(self.args.output_dir)
        l.info('results in %s' % (self.args.output_dir))
        # testing loop
        for (i, t) in enumerate(self.schedule):
            if (i + 1 < self.args.start_test):
                l.info('using saved test %d per --start-test' % (i + 1))
                l.warning('token and tweet counts will be incorrect')
                # FIXME: hack.....
                try:
                    t.model = u.Deleted_To_Save_Memory()
                    t.results = u.Deleted_To_Save_Memory()
                    t.i = i
                    t.train_tweet_ct = -1e6
                    t.train_token_ct = -1e6
                    t.test_tweet_ct = -1e6
                    t.unshrink_from_disk(self.args.output_dir, results=True)
                    t.attempted = True
                except IOError as x:
                    # errno 2 means the saved test file does not exist, i.e.
                    # the test was never attempted; anything else is a real
                    # error.
                    if (x.errno != 2):
                        raise
                    t.attempted = False
            else:
                l.info('starting test %d of %d: %s'
                       % (i + 1, len(self.schedule), t))
                t.do_test(model_class, db, self.args, i)
            t.summarize()
            if (t.attempted):
                if (self.args.profile_memory):
                    # We dump a memory profile here because it's the high
                    # water mark; we're about to reduce usage significantly.
                    import meliae.scanner as ms
                    filename = 'memory.%d.json' % (i)
                    l.info('dumping memory profile %s' % (filename))
                    ms.dump_all_objects('%s/%s'
                                        % (self.args.output_dir, filename))
                t.shrink_to_disk(self.args.output_dir)
            l.debug('result: %s' % (t.summary))
            u.memory_use_log()
        # done!
        l.debug('computing summary')
        self.summarize()
        l.debug('summary: %s' % (self.summary))
        l.debug('saving TSV results')
        test_indices = u.sl_union_fromtext(len(self.schedule), ':')
        self.tsv_save_tests('%s/%s' % (self.args.output_dir, 'tests.tsv'),
                            test_indices)
        l.debug('saving pickled summary')
        self.memory_use = u.memory_use()
        self.memory_use_peak = u.memory_use(True)
        self.time_use = time.time() - t_start
        u.pickle_dump('%s/%s' % (self.args.output_dir, 'summary'), self)
        u.memory_use_log()
        l.info('done in %s' % (u.fmt_seconds(self.time_use)))
def main():
    '''Run the forecasting experiment using joblib.

    Loads the truth, graph, and vector inputs into g, enumerates the tests,
    and pickles the inputs. Then, in five logged steps: builds one model per
    test in parallel (dropping tests whose build returned None), dumps the
    models per outbreak, computes predictions serially, aggregates them into
    one result per (outbreak, training, horizon), and dumps the results per
    outbreak.'''
    l.info('starting')
    began = time.time()
    args_clean()
    g.args = args
    u.memory_use_log(level=l.info)

    l.info('loading input data')
    g.truth = truth_load()
    g.graph = graph_load()
    g.vectors = vectors_load()
    u.memory_use_log(level=l.info)
    g.tests = tests_enumerate()
    l.info('scheduled %d tests' % len(g.tests))

    l.info('saving input data')
    pickle_dump('input', None, g)

    with jl.Parallel(n_jobs=-1, verbose=5) as P:

        l.info('1. Building models')
        #
        # { Context: sk.LinearModel [fitted model] }
        models = dict()
        for (ctx, fitted) in zip(g.tests,
                                 P(jl.delayed(model_build)(t)
                                   for t in g.tests)):
            if fitted is not None:
                models[ctx] = fitted
        unconverged_ct = sum(not m.converged for m in models.values())
        l.info('built %d models (%d at max iterations)'
               % (len(models), unconverged_ct))

        l.info('2. Dumping models')
        # These dumps are self-contained enough to be loaded in a Python
        # interpreter that is not QUAC-aware.
        #
        # { outbreak: { horizon: { training: { now: fitted model } } } }
        by_outbreak = u.defaultdict_recursive()
        for (ctx, fitted) in models.items():
            by_outbreak[ctx.outbreak][ctx.horizon][ctx.training][ctx.now] \
                = fitted
        for (ob, ob_data) in by_outbreak.as_dict().items():
            pickle_dump(ob, 'model', ob_data)

        l.info('3. Evaluating models')
        # Evaluations run in ~0.15s (according to joblib), so it's not clear
        # to me that distributing the computation outweighs overhead.
        #
        # { Context: Series [predicted incidence]
        #            index: period
        #            values: prediction }
        predicted = dict(model_predict(item) for item in models.items())

        l.info('4. Aggregating results')
        # Re-key so we can aggregate the nows
        #
        # [ ((outbreak, training, horizon),
        #    (now, Series [predicted incidence])), ... ]
        rekeyed = sorted(((ctx.outbreak, ctx.training, ctx.horizon),
                          (ctx.now, series))
                         for (ctx, series) in predicted.items())
        # Aggregate into DataFrames. groupby needs its input sorted by the
        # grouping key, which the sort above provides.
        #
        # { (outbreak, training, horizon): DataFrame [predicted incidence]
        #                                  index: period
        #                                  columns: now }
        summarized = dict()
        for (key, group) in itertools.groupby(rekeyed,
                                              operator.itemgetter(0)):
            summarized[key] = model_summarize(group)

        l.info('5. Dumping results')
        # Gather by outbreak
        #
        # { outbreak: { horizon: { training: DataFrame } } }
        gathered = u.defaultdict_recursive()
        for ((ob, tr, ho), df) in summarized.items():
            gathered[ob][ho][tr] = df
        # For each outbreak, dump a pickle file containing the dict above.
        # These are then translated to TSV files for plotting in later
        # steps.
        for (ob, ob_data) in gathered.as_dict().items():
            pickle_dump(ob, 'out', ob_data)

    l.info('done in %s' % u.fmt_seconds(time.time() - began))
#db.set_flags(bdb.DB_TXN_NOT_DURABLE) db.set_cachesize(0, 32*1024*1024) db.set_pagesize(64*1024) db.open('/data6/foo.db', dbtype=bdb.DB_BTREE, flags=(bdb.DB_CREATE)) start_out = time.time() for j in range(outer_ct): start = time.time() for i in range(inner_ct): db.put(str(j * inner_ct + i).encode('UTF-8'), np.ones(720, dtype=np.int32)) db.sync() end = time.time() elapsed = end - start l.info('%d vectors in %s (%d/s), %.3f' % (inner_ct, u.fmt_seconds(elapsed), inner_ct/elapsed, (j+1)*inner_ct/(outer_ct*inner_ct))) u.memory_use_log() l.info('compacting database') pprint(db.stat()) db.compact(flags=bdb.DB_FREE_SPACE) l.info('closing database') pprint(db.stat()) db.close() end_out = time.time() elapsed_out = end_out - start_out l.info('%d vectors in %s (%d/s)' % (outer_ct * inner_ct, u.fmt_seconds(elapsed_out), (outer_ct * inner_ct)/elapsed_out)) u.memory_use_log()
def main():
    '''Run the forecasting experiment on Spark.

    Sets up a SparkContext, broadcasts the arguments, ground truth, and test
    list, and then runs the numbered stages below: distribute shard indexes,
    find candidate articles, build and dump models, and compute, aggregate,
    and dump predictions. Several values are published as module globals
    (args_b, truth, truth_b, tests, tests_b, article_ct, eval_elapsed) for
    use by the functions mapped over the RDDs.

    The closing log line reports evaluation counts and time; when nothing
    was evaluated, the per-evaluation time is reported as nan rather than
    raising ZeroDivisionError.'''
    l.info('starting')
    args_clean()

    # set up Spark
    conf = pyspark.SparkConf()
    conf.setExecutorEnv('PYTHONPATH', QUACLIB)
    if (args.profile):
        conf.set('spark.python.profile', 'true')
    sc = pyspark.SparkContext(conf=conf)
    global args_b
    args_b = sc.broadcast(args)

    # load ground truth data
    global truth
    truth = truth_load()
    l.info('found truth with %d outbreaks' % len(truth.columns))
    global truth_b
    truth_b = sc.broadcast(truth)

    # find dataset
    shard_ct = shards_count()
    l.info('found dataset with %d shards' % shard_ct)
    if (args.shards is not None):
        shard_ct = args.shards
    l.info('will process %d shards' % shard_ct)

    # figure out what tests to do
    global tests
    tests = tests_enumerate()
    l.info('planning %d tests' % len(tests))
    global tests_b
    tests_b = sc.broadcast(tests)

    # some timing accumulators
    global article_ct
    article_ct = sc.accumulator(0)
    global eval_elapsed
    eval_elapsed = sc.accumulator(0)

    # let's go
    l.info('starting computation')

    # 1. Distribute shard indexes
    #
    shards = sc.parallelize(range(shard_ct), shard_ct)

    # 2. Find candidate articles
    #
    # 2a. Find top candidates within each shard for each context
    #
    #     key: Context
    #     val: Priority_Queue:
    #          pri: r [correlation with ground truth on training data]
    #          val: (Series [complete time series, .name is URL],
    #                Series [shifted/truncated training data, .name is URL])
    cands = shards.flatMap(candidates_read)

    # 2b. Find global top candidates for each context
    #
    #     (form same as above)
    cands = cands.reduceByKey(candidates_merge)
    cands.cache()

    # 2c. Dump top candidate summaries
    #
    #     articles and correlations for each context
    #     key: outbreak
    #     val: dict:
    #          key: (training duration (timedelta),
    #                forecast horizon (timedelta),
    #                now (datetime))
    #          val: articles (ordered list of (URL, r))
    #l.info('dumping candidate summaries')
    #summs = cands.map(candidate_summarize) \
    #             .reduceByKey(u.dicts_merge)
    #summs.foreach(pickle_dump('r'))

    # 3. Build models
    #
    # 3a. Build a model for each context
    #
    #     key: Context
    #     val: (sk.LinearModel [fitted model],
    #           DataFrame [full candidate time series, URL columns],
    #           DataFrame [training candidate time series, URL columns])
    #
    #     Order of coefficients in model and DataFrame are the same.
    models = cands.map(model_build)
    models.cache()

    # 3b. Dump models and article data for each context. These dumps are
    #     self-contained enough to be loaded in a Python interpreter that is
    #     not QUAC-aware. This should produce a few 10's of GiB of data.
    #
    #     key: outbreak
    #     val: { horizon:
    #            { training:
    #              { now:
    #                { 'model': sk.LinearModel [fitted model],
    #                  'data': DataFrame [full data, URL columns],
    #                  'trdata': DataFrame [training data, URL columns] }}}}
    summs = models.map(model_summarize) \
                  .reduceByKey(u.dicts_merge)
    summs.foreach(pickle_dump('model'))

    # 4. Evaluate models
    #
    # 4a. Compute predicted values
    #
    #     key: Context
    #     val: Series [predicted incidence]
    #          index: period
    #          values: prediction)
    preds = models.map(model_predict)

    # 4b. Re-key results to put nows in value
    #
    #     key: (outbreak, training duration, forecast horizon)
    #     val: (now, Series [predicted incidence])
    preds = preds.map(lambda x: ((x[0].outbreak, x[0].training, x[0].horizon),
                                 (x[0].now, x[1])))

    # 4c. Summarize results (~2K keys)
    #
    #     key: (outbreak, training duration, forecast horizon)
    #     val: (DataFrame [predicted incidence]:
    #           index: period
    #           columns: nows)
    preds = preds.groupByKey() \
                 .map(model_result_summarize)

    # 4d. Gather results by outbreak (~20 keys, ~20MB/key)
    #
    #     key: outbreak
    #     val: dict:
    #          key: forecast horizon
    #          val: dict:
    #               key: training duration
    #               val: DataFrame [predicted incidence]
    #
    #     Note: we could also use a Panel4D for this, but I haven't put in
    #     the effort to wrap my head around it.
    preds = preds.map(lambda x: (x[0][0], { x[0][2]: { x[0][1]: x[1] } })) \
                 .reduceByKey(u.dicts_merge)

    # 4e. Dump predictions
    #
    #     For each outbreak, dump a pickle file containing the dict above.
    #     These are then translated to TSV files for plotting in later steps.
    preds.foreach(pickle_dump('out'))

    # finish up
    eval_ct = article_ct.value * len(tests)
    # No articles or no tests means no evaluations; don't let the summary
    # line crash the run (and skip the profile dump) by dividing by zero.
    if (eval_ct != 0):
        us_per_eval = eval_elapsed.value * 1e6 / eval_ct
    else:
        us_per_eval = float('nan')
    l.info('evaluated: %d articles, %d contexts, %d total; %s (%.0f µs/eval)'
           % (article_ct.value, len(tests), eval_ct,
              u.fmt_seconds(eval_elapsed.value), us_per_eval))
    l.info('done')
    try:
        sc.dump_profiles(args.outdir)
        #sc.show_profiles()
    except AttributeError:
        # presumably dump_profiles is missing on some Spark versions;
        # profiles are optional, so carry on without them -- confirm
        pass