Exemple #1
0
def sae_opt(tms, tweets, tokenpoints):
    '''Return the token weights that minimize SAE over the training tweets.

       Each model in tms first has its best point populated. A tweet
       contributes only if relevant_gmms() finds at least one model for its
       tokens; its models and their SAE against the tweet's geometry are
       collected and handed to optimize.Weight, which is configured from
       model_parms. tokenpoints is accepted but not used here.'''
    l.debug('preparing token models')
    began = time.time()
    # FIXME: multicore?
    for model in tms.values():
        model.populate_best_point()
    l.debug('done preparing in %s' % (u.fmt_seconds(time.time() - began)))
    l.debug('computing MSAE for all tweets')
    began = time.time()
    # Parallel lists: entry i of each describes the i-th tweet that was kept.
    gmms_list = []
    errors_list = []
    for tweet in tweets:
        relevant = relevant_gmms(tweet.tokens, tms)
        if (len(relevant) > 0):
            tweet_errors = [gmm.sae(tweet.geom) for gmm in relevant]
            gmms_list.append(relevant)
            errors_list.append(tweet_errors)
    l.debug('done computing SAE in %s' %
            (u.fmt_seconds(time.time() - began)))
    weigher = optimize.Weight(gmms_list,
                              errors_list,
                              regularizer=model_parms['opt_reg'],
                              identity_feature=model_parms['opt_feature_id'],
                              misc_feature=model_parms['opt_feature_misc'],
                              init_by_feature=model_parms['opt_init'])
    return weigher.optimize()
Exemple #2
0
def sae_opt(tms, tweets, tokenpoints):
   '''Return the token weights that minimize SAE over the training tweets.

      Each model in tms first has its best point populated. A tweet
      contributes only if relevant_gmms() finds at least one model for its
      tokens; its models and their SAE against the tweet's geometry are
      collected and handed to optimize.Weight, which is configured from
      model_parms. tokenpoints is accepted but not used here.'''
   l.debug('preparing token models')
   began = time.time()
   # FIXME: multicore?
   for model in tms.values():
      model.populate_best_point()
   l.debug('done preparing in %s' % (u.fmt_seconds(time.time() - began)))
   l.debug('computing MSAE for all tweets')
   began = time.time()
   # Parallel lists: entry i of each describes the i-th tweet that was kept.
   gmms_list = []
   errors_list = []
   for tweet in tweets:
      relevant = relevant_gmms(tweet.tokens, tms)
      if (len(relevant) > 0):
         tweet_errors = [gmm.sae(tweet.geom) for gmm in relevant]
         gmms_list.append(relevant)
         errors_list.append(tweet_errors)
   l.debug('done computing SAE in %s' % (u.fmt_seconds(time.time() - began)))
   weigher = optimize.Weight(gmms_list, errors_list,
                             regularizer=model_parms['opt_reg'],
                             identity_feature=model_parms['opt_feature_id'],
                             misc_feature=model_parms['opt_feature_misc'],
                             init_by_feature=model_parms['opt_init'])
   return weigher.optimize()
Exemple #3
0
 def fetch(self, db, srid, phase, tzer, fields, unify, excluded=None):
     '''Load, de-duplicate, and tokenize the tweets belonging to phase.

        Rows are selected from the tweet table using the WHERE clause built
        by self.where(phase, 'created_at'), with the geometry transformed to
        srid. If excluded is None every tweet is kept; otherwise a tweet is
        kept only when its user is neither in excluded nor already seen.
        Returns (tweets, users): the kept tweets, tokenized in place, and
        the set of their screen names.'''
     # Plain columns are selected under their own names; only the geometry
     # column needs an expression and a typed alias.
     plain = ('tweet_id', 'created_at', 'day', 'hour', 'text',
              'user_screen_name', 'user_description', 'user_lang',
              'user_location', 'user_time_zone')
     columns = tuple((name, name) for name in plain)
     columns += (('ST_Transform(geom, %d)' % (srid), '"geom [geometry]"'),)
     from_where = "FROM tweet WHERE %s" % (self.where(phase, 'created_at'))
     rows = db.select(columns, from_where)
     l.debug('fetched %d rows' % (len(rows)))
     candidates = [tweet.Tweet.from_dict(row) for row in rows]
     l.debug('fetched %d tweets' % (len(candidates)))
     # filter out duplicate users
     users = set()
     tweets = list()
     for tw in candidates:
         who = tw.user_screen_name
         if (excluded is not None and (who in excluded or who in users)):
             continue
         users.add(who)
         tweets.append(tw)
     l.info('%s on %d tweets by %d users' %
            (phase, len(tweets), len(users)))
     # tokenize tweets
     began = time.time()
     # FIXME: This could be refactored to run in parallel
     for tw in tweets:
         tw.tokenize(tzer, fields, unify)
     l.debug('tokenized in %s' % (u.fmt_seconds(time.time() - began)))
     # done
     return (tweets, users)
Exemple #4
0
 def fetch(self, cur, srid, phase, tzer, fields, unify, excluded=None):
     '''Load, de-duplicate, and tokenize the tweets belonging to phase.

        Rows are read through the database cursor cur using the WHERE clause
        built by self.where(phase, 'created_at'). srid is accepted but not
        used by this version. If excluded is None every tweet is kept;
        otherwise a tweet is kept only when its user is neither in excluded
        nor already seen. Returns (tweets, users): the kept tweets,
        tokenized in place, and the set of their screen names.

        If the query fails, the failure is logged and the original exception
        is re-raised unchanged, so callers see the real database error.'''
     # fetch tweets
     try:
         cur.execute(
             "SELECT tweet_id as tweet_id, created_at as created_at, day as day, \
                 hour as hour, text as text, user_screen_name as user_screen_name, \
                 user_description as user_description, user_lang as user_lang, \
                 user_location as user_location, user_time_zone as user_time_zone, \
                 lat as lat, lon as lon, geotagged as geom_src \
             FROM tweet WHERE {0}".format(self.where(phase, 'created_at')))
         rows = cur.fetchall()
     except Exception:
         # Don't trap KeyboardInterrupt/SystemExit, and keep the original
         # exception and traceback instead of replacing them with a bare
         # Exception that says nothing about the cause.
         l.info("tweet selection from db failed")
         raise
     l.debug('fetched %d rows' % (len(rows)))
     tweets_raw = [tweet.Tweet.from_dict(row) for row in rows]
     l.debug('fetched %d tweets' % (len(tweets_raw)))
     # filter out duplicate users
     users = set()
     tweets = list()
     for tw in tweets_raw:
         if (excluded is None or (tw.user_screen_name not in excluded
                                  and tw.user_screen_name not in users)):
             users.add(tw.user_screen_name)
             tweets.append(tw)
     l.info('%s on %d tweets by %d users'
            % (phase, len(tweets), len(users)))
     # tokenize tweets
     t = time.time()
     for tw in tweets:
         # FIXME: This could be refactored to run in parallel
         tw.tokenize(tzer, fields, unify)
     l.debug('tokenized in %s' % (u.fmt_seconds(time.time() - t)))
     # done
     return (tweets, users)
Exemple #5
0
 def optimize(self):
    '''Run optimization and return dictionary of token->weight.

       Starting values are random when self.init_by_feature is the empty
       string and feature-derived otherwise. self.func is minimized with
       L-BFGS-B using self.func_deriv as the gradient, the models in
       self.all_gmms are then scored with the solution, and each model's
       first token key is mapped to its score, floored at self.min_value.
       The feature vector of every model is cleared before returning.'''
    if self.init_by_feature == '':
       init_vals = self.initialize_random()
    else:
       init_vals = self.initialize_from_feature()
    t_start = time.time()
    l.debug('minimizing obj f\'n with %d weights...' %
            len(self.feature_alphabet))
    l.debug('initial function value=%g' % self.func(init_vals))
    res = scopt.minimize(self.func, init_vals,
                         method='L-BFGS-B', jac=self.func_deriv,
                         options={'disp': self.verbose}, tol=1e-4)
    l.debug('minimized in %s; %d f calls and %d f\' calls (%d cache hits)'
            % (u.fmt_seconds(time.time() - t_start), self.n_fun_calls,
               self.n_deriv_calls, self.n_cache_hits))
    l.debug('final function value=%g' % self.func(res.x))
    self.score_gmms(res.x)
    # iter() and .items() work on both Python 2 and 3; the iterkeys() and
    # iteritems() methods used previously exist only on Python 2.
    di = dict((next(iter(gmm.tokens)), max(self.min_value, gmm.score))
              for gmm in self.all_gmms)
    if self.verbose:
       for (fv, fi) in self.feature_alphabet.items():
          l.debug('feature weight %s=%g' % (fv, res.x[fi]))
       for (t, w) in di.items():
          l.debug('token weight %s=%s' % (t, str(w)))
    # clean up
    for g in self.all_gmms:
       g.feature_vector = None
    return di
Exemple #6
0
 def optimize(self):
     '''Run optimization and return dictionary of token->weight.

        Starting values are random when self.init_by_feature is the empty
        string and feature-derived otherwise. self.func is minimized with
        L-BFGS-B using self.func_deriv as the gradient, the models in
        self.all_gmms are then scored with the solution, and each model's
        first token key is mapped to its score, floored at self.min_value.
        The feature vector of every model is cleared before returning.'''
     if self.init_by_feature == '':
         init_vals = self.initialize_random()
     else:
         init_vals = self.initialize_from_feature()
     t_start = time.time()
     l.debug('minimizing obj f\'n with %d weights...' %
             len(self.feature_alphabet))
     l.debug('initial function value=%g' % self.func(init_vals))
     res = scopt.minimize(self.func,
                          init_vals,
                          method='L-BFGS-B',
                          jac=self.func_deriv,
                          options={'disp': self.verbose},
                          tol=1e-4)
     l.debug(
         'minimized in %s; %d f calls and %d f\' calls (%d cache hits)' %
         (u.fmt_seconds(time.time() - t_start), self.n_fun_calls,
          self.n_deriv_calls, self.n_cache_hits))
     l.debug('final function value=%g' % self.func(res.x))
     self.score_gmms(res.x)
     # iter() and .items() work on both Python 2 and 3; the iterkeys() and
     # iteritems() methods used previously exist only on Python 2.
     di = dict((next(iter(gmm.tokens)), max(self.min_value, gmm.score))
               for gmm in self.all_gmms)
     if self.verbose:
         for (fv, fi) in self.feature_alphabet.items():
             l.debug('feature weight %s=%g' % (fv, res.x[fi]))
         for (t, w) in di.items():
             l.debug('token weight %s=%s' % (t, str(w)))
     # clean up
     for g in self.all_gmms:
         g.feature_vector = None
     return di
Exemple #7
0
 def do_test(self, m_class, db, args, i):
    '''Run test number i: fetch data, build an m_class model, and test it.

       Training and testing tweets are fetched from db. Unless
       args.dup_users is set, users seen in training are excluded from
       testing. If args.skip_small_tests is set and self.enough_data_p()
       rejects the tweet counts, the test is marked as not attempted, its
       results are set to an empty list, and nothing else happens.
       Otherwise the counts, the built model, and the per-tweet results are
       stored on self.'''
    self.i = i
    # create tokenizer
    tokenizer = u.class_by_name(args.tokenizer)(args.ngram)
    # load training & testing tweets from database
    if args.dup_users:
       train_excluded = None
    else:
       train_excluded = set()
    (train_tweets, train_users) = self.fetch(db, args.srid, 'training',
                                             tokenizer, args.fields,
                                             args.unify_fields,
                                             train_excluded)
    test_excluded = None if args.dup_users else train_users
    (test_tweets, _) = self.fetch(db, args.srid, 'testing', tokenizer,
                                  args.fields, args.unify_fields,
                                  test_excluded)
    # bail out early when there is too little data to bother
    enough = (not args.skip_small_tests
              or self.enough_data_p(len(train_tweets), len(test_tweets)))
    if (not enough):
       l.info('insufficient data, skipping test %s ' % (self))
       self.attempted = False
       self.results = []
       return
    self.attempted = True
    # tokenize training tweets
    train_tokens = self.group_tokens(train_tweets,
                                     args.trim_head, args.min_instances)
    self.train_tweet_ct = len(train_tweets)
    self.train_token_ct = len(train_tokens)
    # downsample test tweets
    limit = args.test_tweet_limit
    if (len(test_tweets) > limit):
       test_tweets = u.rand.sample(test_tweets, args.test_tweet_limit)
       l.info('sampled %d test tweets per --test-tweet-limit'
              % (args.test_tweet_limit))
    self.test_tweet_ct = len(test_tweets)
    # build model
    self.model = m_class(train_tokens, args.srid, train_tweets)
    l.debug('starting model build')
    began = time.time()
    self.model.build()
    l.info('built model in %s' % (u.fmt_seconds(time.time() - began)))
    began = time.time()
    # test 'em
    self.results = multicore.do(test_tweet,
                                (self.model, args.fields), test_tweets)
    l.info('tested tweets in %s' % (u.fmt_seconds(time.time() - began)))
Exemple #8
0
 def do_test(self, m_class, db, args, i):
     '''Run test number i: fetch data, build an m_class model, and test it.

        Training and testing tweets are fetched from db. Unless
        args.dup_users is set, users seen in training are excluded from
        testing. If args.skip_small_tests is set and self.enough_data_p()
        rejects the tweet counts, the test is marked as not attempted, its
        results are set to an empty list, and nothing else happens.
        Otherwise the counts, the built model, and the per-tweet results are
        stored on self.'''
     self.i = i
     # create tokenizer
     tokenizer = u.class_by_name(args.tokenizer)(args.ngram)
     # load training & testing tweets from database
     if args.dup_users:
         train_excluded = None
     else:
         train_excluded = set()
     (train_tweets, train_users) = self.fetch(db, args.srid, 'training',
                                              tokenizer, args.fields,
                                              args.unify_fields,
                                              train_excluded)
     test_excluded = None if args.dup_users else train_users
     (test_tweets, _) = self.fetch(db, args.srid, 'testing', tokenizer,
                                   args.fields, args.unify_fields,
                                   test_excluded)
     # bail out early when there is too little data to bother
     enough = (not args.skip_small_tests
               or self.enough_data_p(len(train_tweets), len(test_tweets)))
     if (not enough):
         l.info('insufficient data, skipping test %s ' % (self))
         self.attempted = False
         self.results = []
         return
     self.attempted = True
     # tokenize training tweets
     train_tokens = self.group_tokens(train_tweets, args.trim_head,
                                      args.min_instances)
     self.train_tweet_ct = len(train_tweets)
     self.train_token_ct = len(train_tokens)
     # downsample test tweets
     limit = args.test_tweet_limit
     if (len(test_tweets) > limit):
         test_tweets = u.rand.sample(test_tweets, args.test_tweet_limit)
         l.info('sampled %d test tweets per --test-tweet-limit' %
                (args.test_tweet_limit))
     self.test_tweet_ct = len(test_tweets)
     # build model
     self.model = m_class(train_tokens, args.srid, train_tweets)
     l.debug('starting model build')
     began = time.time()
     self.model.build()
     l.info('built model in %s' % (u.fmt_seconds(time.time() - began)))
     began = time.time()
     # test 'em
     self.results = multicore.do(test_tweet, (self.model, args.fields),
                                 test_tweets)
     l.info('tested tweets in %s' % (u.fmt_seconds(time.time() - began)))
Exemple #9
0
def wt_inv_error(tms, tweets, tokenpts, errattr):
   '''Return a dict mapping each token T to min(1, |1/(1 + E^x)|), where E
      is the error that model_error reports for T using measure errattr
      ('sae' or 'cae') and x is model parm wt_inv_error_exponent.

      According to the original notes, E is the mean error between T and
      each tweet having that token, the number of samples used in computing
      CAE is model parm wt_inv_sample_ct, and tokens with fewer than model
      parm wt_inv_min_tweets tweets get weight 0; that logic is presumably
      in model_error, which is not part of this function.

      tweets is accepted but not used here. An empty tms yields an empty
      dict.'''
   l.debug('computing inverse errors')
   t1 = time.time()
   # We work in chunks to keep memory use down. The chunk size is currently
   # not configurable, though we could make it so if needed.
   models = tms.values()
   weights = dict()
   x = model_parms['wt_inv_error_exponent']
   for chunk in u.groupn(models, 20000):
      weights.update((tok, min(1, abs(1/(1+err**x))))
                     for (tok, err)
                     in multicore.do(model_error, (errattr, tokenpts), chunk))
      l.debug('inverse error chunk completed')
   dur = time.time() - t1
   # Guard the per-token average: with no models there is nothing to
   # divide by, and a log line must not raise ZeroDivisionError.
   model_ct = len(models)
   per_token = (dur / model_ct) if model_ct else 0.0
   l.debug('computed inverse errors in %s (%.2gs per token)'
           % (u.fmt_seconds(dur), per_token))
   return weights
Exemple #10
0
 def fetch(self, db, srid, phase, tzer, fields, unify, excluded=None):
    '''Load, de-duplicate, and tokenize the tweets belonging to phase.

       Rows are selected from the tweet table using the WHERE clause built
       by self.where(phase, 'created_at'), with the geometry transformed to
       srid. If excluded is None every tweet is kept; otherwise a tweet is
       kept only when its user is neither in excluded nor already seen.
       Returns (tweets, users): the kept tweets, tokenized in place, and
       the set of their screen names.'''
    # Plain columns are selected under their own names; only the geometry
    # column needs an expression and a typed alias.
    plain = ('tweet_id', 'created_at', 'day', 'hour', 'text',
             'user_screen_name', 'user_description', 'user_lang',
             'user_location', 'user_time_zone')
    columns = tuple((name, name) for name in plain)
    columns += (('ST_Transform(geom, %d)' % (srid), '"geom [geometry]"'),)
    from_where = "FROM tweet WHERE %s" % (self.where(phase, 'created_at'))
    rows = db.select(columns, from_where)
    l.debug('fetched %d rows' % (len(rows)))
    candidates = [tweet.Tweet.from_dict(row) for row in rows]
    l.debug('fetched %d tweets' % (len(candidates)))
    # filter out duplicate users
    users = set()
    tweets = list()
    for tw in candidates:
       who = tw.user_screen_name
       if (excluded is not None and (who in excluded or who in users)):
          continue
       users.add(who)
       tweets.append(tw)
    l.info('%s on %d tweets by %d users'
           % (phase, len(tweets), len(users)))
    # tokenize tweets
    began = time.time()
    # FIXME: This could be refactored to run in parallel
    for tw in tweets:
       tw.tokenize(tzer, fields, unify)
    l.debug('tokenized in %s' % (u.fmt_seconds(time.time() - began)))
    # done
    return (tweets, users)
Exemple #11
0
def wt_inv_error(tms, tweets, tokenpts, errattr):
    '''Return a dict mapping each token T to min(1, |1/(1 + E^x)|), where E
       is the error that model_error reports for T using measure errattr
       ('sae' or 'cae') and x is model parm wt_inv_error_exponent.

       According to the original notes, E is the mean error between T and
       each tweet having that token, the number of samples used in computing
       CAE is model parm wt_inv_sample_ct, and tokens with fewer than model
       parm wt_inv_min_tweets tweets get weight 0; that logic is presumably
       in model_error, which is not part of this function.

       tweets is accepted but not used here. An empty tms yields an empty
       dict.'''
    l.debug('computing inverse errors')
    t1 = time.time()
    # We work in chunks to keep memory use down. The chunk size is currently
    # not configurable, though we could make it so if needed.
    models = tms.values()
    weights = dict()
    x = model_parms['wt_inv_error_exponent']
    for chunk in u.groupn(models, 20000):
        weights.update(
            (tok, min(1, abs(1 / (1 + err**x))))
            for (tok,
                 err) in multicore.do(model_error, (errattr, tokenpts), chunk))
        l.debug('inverse error chunk completed')
    dur = time.time() - t1
    # Guard the per-token average: with no models there is nothing to
    # divide by, and a log line must not raise ZeroDivisionError.
    model_ct = len(models)
    per_token = (dur / model_ct) if model_ct else 0.0
    l.debug('computed inverse errors in %s (%.2gs per token)' %
            (u.fmt_seconds(dur), per_token))
    return weights
Exemple #12
0
 def main(self):
     '''Run every scheduled test and save the results.

        Missing start/end times are filled in from the earliest and latest
        created_at values in self.table. The schedule is then built and each
        test is either run, or -- for tests numbered below
        self.args.start_test -- restored from results previously saved in
        self.args.output_dir. A saved test whose files cannot be found is
        marked as not attempted; any other I/O error propagates. Finally a
        TSV of the tests and a pickled summary of self are written to the
        output directory.'''
     u.memory_use_log()
     t_start = time.time()
     # Replaced with self.cur in __init__
     # db = db_glue.DB(self.args.database_file)
     # assert (db.metadata_get('schema_version') == '5')
     # normalize start and end times
     if (self.args.start is None):
         sql = 'SELECT min(created_at) AS st FROM {0};'.format(self.table)
         self.cur.execute(sql)
         self.args.start = self.cur.fetchone()[0]
     if (self.args.end is None):
         sql = 'SELECT max(created_at) AS et FROM {0};'.format(self.table)
         self.cur.execute(sql)
         # add one second because end time is exclusive
         self.args.end = self.cur.fetchone()[0] + timedelta(seconds=1)
     self.args.start = time_.as_utc(self.args.start)
     self.args.end = time_.as_utc(self.args.end)
     # print test sequence parameters
     self.log_parameters()
     # set up model parameters
     model_class = u.class_by_name(self.args.model)
     model_class.parms_init(self.args.model_parms, log_parms=True)
     # build schedule
     self.schedule_build(self.args.limit)
     l.info('scheduled %s tests (%s left over)'
            % (len(self.schedule), self.args.end - self.schedule[-1].end))
     if (not os.path.exists(self.args.output_dir)):
         os.mkdir(self.args.output_dir)
     l.info('results in %s' % (self.args.output_dir))
     # testing loop
     for (i, t) in enumerate(self.schedule):
         if (i+1 < self.args.start_test):
             l.info('using saved test %d per --start-test' % (i+1))
             l.warning('token and tweet counts will be incorrect')
             # FIXME: hack.....
             try:
                 t.model = u.Deleted_To_Save_Memory()
                 t.results = u.Deleted_To_Save_Memory()
                 t.i = i
                 # Obviously-bogus counts, since the real ones were not
                 # saved (see the warning above).
                 t.train_tweet_ct = -1e6
                 t.train_token_ct = -1e6
                 t.test_tweet_ct = -1e6
                 t.unshrink_from_disk(self.args.output_dir, results=True)
                 t.attempted = True
             except IOError as x:
                 # "except (IOError, x)" looked x up as a name and failed
                 # with NameError instead of binding the exception. errno 2
                 # is ENOENT on POSIX: no saved files, so the test was
                 # never attempted. Anything else is a real error.
                 if (x.errno != 2):
                     raise
                 t.attempted = False
         else:
             l.info('starting test %d of %d: %s' % (i+1, len(self.schedule), t))
             t.do_test(model_class, self.cur, self.args, i)
         t.summarize()
         if (t.attempted):
             if (self.args.profile_memory):
                 # We dump a memory profile here because it's the high water
                 # mark; we're about to reduce usage significantly.
                 import meliae.scanner as ms
                 filename = 'memory.%d.json' % (i)
                 l.info('dumping memory profile %s' % (filename))
                 ms.dump_all_objects('%s/%s' % (self.args.output_dir, filename))
             t.shrink_to_disk(self.args.output_dir)
         l.debug('result: %s' % (t.summary))
         u.memory_use_log()
     # done!
     l.debug('computing summary')
     self.summarize()
     l.debug('summary: %s' % (self.summary))
     l.debug('saving TSV results')
     test_indices = u.sl_union_fromtext(len(self.schedule), ':')
     self.tsv_save_tests('%s/%s' % (self.args.output_dir, 'tests.tsv'),
                         test_indices)
     l.debug('saving pickled summary')
     self.memory_use = u.memory_use()
     self.memory_use_peak = "Not implemented"
     self.time_use = time.time() - t_start
     u.pickle_dump('%s/%s' % (self.args.output_dir, 'summary'), self)
     u.memory_use_log()
     l.info('done in %s' % (u.fmt_seconds(self.time_use)))
Exemple #13
0
# Benchmark: time bulk insertion of int32 vectors into a B-tree database
# file. db, bdb, outer_ct and inner_ct are defined outside this excerpt.
#
# Sizes below are 32 MiB (cache) and 64 KiB (page). The leading 0 passed to
# set_cachesize is presumably the gigabytes part of the size -- confirm
# against the bdb binding's documentation.
db.set_cachesize(0, 32 * 1024 * 1024)
db.set_pagesize(64 * 1024)
db.open('/data6/foo.db', dbtype=bdb.DB_BTREE, flags=(bdb.DB_CREATE))

start_out = time.time()
for j in range(outer_ct):
    # One batch: inner_ct puts followed by a sync, timed as a unit.
    start = time.time()
    for i in range(inner_ct):
        # Key is the global vector number as UTF-8 text; value is a
        # 720-element int32 vector of ones.
        db.put(
            str(j * inner_ct + i).encode('UTF-8'), np.ones(720,
                                                           dtype=np.int32))
    db.sync()
    end = time.time()
    elapsed = end - start
    # Logs batch size, batch duration, insert rate, and the fraction of all
    # batches completed so far.
    l.info('%d vectors in %s (%d/s), %.3f' %
           (inner_ct, u.fmt_seconds(elapsed), inner_ct / elapsed,
            (j + 1) * inner_ct / (outer_ct * inner_ct)))
    u.memory_use_log()

# Database statistics are printed before and after compaction. The overall
# timing below includes compacting and closing.
l.info('compacting database')
pprint(db.stat())
db.compact(flags=bdb.DB_FREE_SPACE)
l.info('closing database')
pprint(db.stat())
db.close()
end_out = time.time()
elapsed_out = end_out - start_out
l.info('%d vectors in %s (%d/s)' %
       (outer_ct * inner_ct, u.fmt_seconds(elapsed_out),
        (outer_ct * inner_ct) / elapsed_out))
u.memory_use_log()
Exemple #14
0
# Benchmark: time bulk insertion of int32 vectors into a database table.
# db, conn, outer_ct and inner_ct are created outside this excerpt; db is
# presumably a cursor on the connection conn -- confirm.
#
# NOTE(review): the PRAGMA suggests SQLite, where synchronous = OFF trades
# durability for write speed -- confirm this is never used outside
# benchmarking.
db.execute('PRAGMA synchronous = OFF')
db.execute('CREATE TABLE ts (namespace TEXT, name TEXT, total INT, data TEXT)')
db.execute('CREATE INDEX ts_idx ON ts (namespace, name)')

start_out = time.time()
for j in range(outer_ct):
   # One batch: inner_ct rows inserted and committed, timed as a unit.
   start = time.time()
   # Row names are 10 times the global row number, as text; each row
   # carries the buffer of a 720-element int32 vector of ones.
   db.executemany('INSERT INTO ts VALUES (?, ?, ?, ?)',
                  (('en', str(10 * (j * inner_ct + i)), 8675309,
                    np.ones(720, dtype=np.int32).data)
                   for i in range(inner_ct)))
   conn.commit()
   end = time.time()
   elapsed = end - start
   # Logs batch size, batch duration, insert rate, rows inserted so far,
   # and the fraction of all rows inserted so far.
   l.info('inserted %d vectors in %s (%d/s), %d, %.3f'
          % (inner_ct, u.fmt_seconds(elapsed), inner_ct/elapsed,
             (j+1)*inner_ct, (j+1)*inner_ct/(outer_ct*inner_ct)))
   #u.memory_use_log()

# External command; presumably drops the OS disk cache so that the next
# phase starts cold -- confirm.
os.system('clear-disk-cache')

# Restart the overall timer for whatever phase follows.
start_out = time.time()

# for j in range(outer_ct):
#    start = time.time()
#    db.execute('begin')
#    insert = list(range(0, inner_ct, 100))
#    for i in insert:
#       db.execute('UPDATE ts SET total=?, data=? WHERE namespace=? AND name=?',
#                  (1, np.zeros(720, dtype=np.int32).data,
#                   'en', str(10 * (j * inner_ct + i))))
Exemple #15
0
def main():
   '''Run the whole experiment on Spark: find candidate articles, build a
      model for each context, dump the models, evaluate them, and dump the
      predictions.

      The module globals args_b, truth, truth_b, tests, tests_b, article_ct,
      and eval_elapsed are (re)bound here; presumably they are read by the
      worker functions defined elsewhere in this file.'''
   global args_b, truth, truth_b, tests, tests_b, article_ct, eval_elapsed

   def rekey_without_now(item):
      # (Context, Series) -> ((outbreak, training, horizon), (now, Series))
      return ((item[0].outbreak, item[0].training, item[0].horizon),
              (item[0].now, item[1]))

   def nest_by_outbreak(item):
      # ((outbreak, training, horizon), DataFrame)
      #   -> (outbreak, { horizon: { training: DataFrame } })
      return (item[0][0], { item[0][2]: { item[0][1]: item[1] } })

   l.info('starting')
   args_clean()
   # set up Spark
   spark_conf = pyspark.SparkConf()
   spark_conf.setExecutorEnv('PYTHONPATH', QUACLIB)
   if (args.profile):
      spark_conf.set('spark.python.profile', 'true')
   sc = pyspark.SparkContext(conf=spark_conf)
   args_b = sc.broadcast(args)
   # load ground truth data
   truth = truth_load()
   l.info('found truth with %d outbreaks' % len(truth.columns))
   truth_b = sc.broadcast(truth)
   # find dataset
   shard_ct = shards_count()
   l.info('found dataset with %d shards' % shard_ct)
   if (args.shards is not None):
      shard_ct = args.shards
   l.info('will process %d shards' % shard_ct)
   # figure out what tests to do
   tests = tests_enumerate()
   l.info('planning %d tests' % len(tests))
   tests_b = sc.broadcast(tests)
   # some timing accumulators
   article_ct = sc.accumulator(0)
   eval_elapsed = sc.accumulator(0)
   # let's go
   l.info('starting computation')

   # 1. Distribute shard indexes, one partition per shard.
   shard_ids = sc.parallelize(range(shard_ct), shard_ct)

   # 2. Find candidate articles
   #
   # 2a. Top candidates within each shard for each context
   #
   #     key: Context
   #     val: Priority_Queue:
   #             pri:  r [correlation with ground truth on training data]
   #             val:  (Series [complete time series, .name is URL],
   #                    Series [shifted/truncated training data, .name is URL])
   shard_cands = shard_ids.flatMap(candidates_read)

   # 2b. Global top candidates for each context (same form as 2a)
   top_cands = shard_cands.reduceByKey(candidates_merge)
   top_cands.cache()

   # 2c. Dump top candidate summaries (currently disabled)
   #
   #     articles and correlations for each context
   #       key: outbreak
   #       val: dict:
   #              key: (training duration (timedelta),
   #                    forecast horizon (timedelta),
   #                    now (datetime))
   #              val: articles (ordered list of (URL, r))
   #l.info('dumping candidate summaries')
   #summs = cands.map(candidate_summarize) \
   #             .reduceByKey(u.dicts_merge)
   #summs.foreach(pickle_dump('r'))

   # 3. Build models
   #
   # 3a. One model per context
   #
   #       key: Context
   #       val: (sk.LinearModel [fitted model],
   #             DataFrame [full candidate time series, URL columns],
   #             DataFrame [training candidate time series, URL columns])
   #
   #     Order of coefficients in model and DataFrame are the same.
   models = top_cands.map(model_build)
   models.cache()

   # 3b. Dump models and article data for each context. These dumps are
   #     self-contained enough to be loaded in a Python interpreter that is
   #     not QUAC-aware. This should produce a few 10's of GiB of data.
   #
   #       key: outbreak
   #       val: { horizon:
   #              { training:
   #                { now:
   #                  { 'model':  sk.LinearModel [fitted model],
   #                    'data':   DataFrame [full data, URL columns],
   #                    'trdata': DataFrame [training data, URL columns] }}}}
   model_summs = models.map(model_summarize)
   model_summs = model_summs.reduceByKey(u.dicts_merge)
   model_summs.foreach(pickle_dump('model'))

   # 4. Evaluate models
   #
   # 4a. Predicted values
   #
   #       key: Context
   #       val: Series [predicted incidence]
   #               index: period
   #               values: prediction
   by_context = models.map(model_predict)

   # 4b. Re-key results to put nows in value
   #
   #       key: (outbreak, training duration, forecast horizon)
   #       val: (now, Series [predicted incidence])
   by_test = by_context.map(rekey_without_now)

   # 4c. Summarize results (~2K keys)
   #
   #       key: (outbreak, training duration, forecast horizon)
   #       val: (DataFrame [predicted incidence]:
   #               index: period
   #               columns: nows)
   summarized = by_test.groupByKey().map(model_result_summarize)

   # 4d. Gather results by outbreak (~20 keys, ~20MB/key)
   #
   #       key: outbreak
   #       val: dict:
   #              key: forecast horizon
   #              val: dict:
   #                     key: training duration
   #                     val: DataFrame [predicted incidence]
   #
   #       Note: we could also use a Panel4D for this, but I haven't put in
   #       the effort to wrap my head around it.
   by_outbreak = summarized.map(nest_by_outbreak)
   by_outbreak = by_outbreak.reduceByKey(u.dicts_merge)

   # 4e. Dump predictions
   #
   #       For each outbreak, dump a pickle file containing the dict above.
   #       These are then translated to TSV files for plotting in later steps.
   by_outbreak.foreach(pickle_dump('out'))

   # finish up
   eval_ct = article_ct.value * len(tests)
   l.info('evaluated: %d articles, %d contexts, %d total; %s (%.0f µs/eval)'
          % (article_ct.value, len(tests), eval_ct,
             u.fmt_seconds(eval_elapsed.value),
             eval_elapsed.value * 1e6 / eval_ct))
   l.info('done')
   # dump_profiles may be missing from the context; in that case there is
   # simply nothing to dump.
   try:
      sc.dump_profiles(args.outdir)
      #sc.show_profiles()
   except AttributeError:
      pass
Exemple #16
0
# Benchmark: time bulk insertion of int32 vectors into a database table.
# db, conn, outer_ct and inner_ct are created outside this excerpt; db is
# presumably a cursor on the connection conn -- confirm.
#
# NOTE(review): the PRAGMA suggests SQLite, where synchronous = OFF trades
# durability for write speed -- confirm this is never used outside
# benchmarking.
db.execute('PRAGMA synchronous = OFF')
db.execute('CREATE TABLE ts (namespace TEXT, name TEXT, total INT, data TEXT)')
db.execute('CREATE INDEX ts_idx ON ts (namespace, name)')

start_out = time.time()
for j in range(outer_ct):
    # One batch: inner_ct rows inserted and committed, timed as a unit.
    start = time.time()
    # Row names are 10 times the global row number, as text; each row
    # carries the buffer of a 720-element int32 vector of ones.
    db.executemany('INSERT INTO ts VALUES (?, ?, ?, ?)',
                   (('en', str(10 * (j * inner_ct + i)), 8675309,
                     np.ones(720, dtype=np.int32).data)
                    for i in range(inner_ct)))
    conn.commit()
    end = time.time()
    elapsed = end - start
    # Logs batch size, batch duration, insert rate, rows inserted so far,
    # and the fraction of all rows inserted so far.
    l.info('inserted %d vectors in %s (%d/s), %d, %.3f' %
           (inner_ct, u.fmt_seconds(elapsed), inner_ct / elapsed,
            (j + 1) * inner_ct, (j + 1) * inner_ct / (outer_ct * inner_ct)))
    #u.memory_use_log()

# External command; presumably drops the OS disk cache so that the next
# phase starts cold -- confirm.
os.system('clear-disk-cache')

# Restart the overall timer for whatever phase follows.
start_out = time.time()

# for j in range(outer_ct):
#    start = time.time()
#    db.execute('begin')
#    insert = list(range(0, inner_ct, 100))
#    for i in insert:
#       db.execute('UPDATE ts SET total=?, data=? WHERE namespace=? AND name=?',
#                  (1, np.zeros(720, dtype=np.int32).data,
#                   'en', str(10 * (j * inner_ct + i))))
Exemple #17
0
def main():
   '''Run the whole experiment in this process, using joblib workers for
      model building: load the inputs, build a model for each test context,
      dump the models, evaluate them, and dump the predictions grouped by
      outbreak. Inputs and the test list are stored on the shared object g.'''

   l.info('starting')
   start_time = time.time()
   args_clean()
   g.args = args

   u.memory_use_log(level=l.info)
   l.info('loading input data')
   g.truth = truth_load()
   g.graph = graph_load()
   g.vectors = vectors_load()
   u.memory_use_log(level=l.info)

   g.tests = tests_enumerate()
   l.info('scheduled %d tests' % len(g.tests))

   l.info('saving input data')
   pickle_dump('input', None, g)

   with jl.Parallel(n_jobs=-1, verbose=5) as P:

      l.info('1. Building models')
      # Contexts whose build returned None are dropped.
      #
      # { Context: sk.LinearModel [fitted model] }
      built = P(jl.delayed(model_build)(t) for t in g.tests)
      models = dict()
      for (ctx, m) in zip(g.tests, built):
         if m is not None:
            models[ctx] = m
      unconverged_ct = sum(1 for m in models.values() if not m.converged)
      l.info('built %d models (%d at max iterations)'
             % (len(models), unconverged_ct))

      l.info('2. Dumping models')
      # These dumps are self-contained enough to be loaded in a Python
      # interpreter that is not QUAC-aware.
      #
      # { outbreak: { horizon: { training: { now: fitted model } } } }
      model_tree = u.defaultdict_recursive()
      for (ctx, m) in models.items():
         model_tree[ctx.outbreak][ctx.horizon][ctx.training][ctx.now] = m
      for (outbreak, subtree) in model_tree.as_dict().items():
         pickle_dump(outbreak, 'model', subtree)

      l.info('3. Evaluating models')
      # Evaluations run in ~0.15s (according to joblib), so it's not clear to
      # me that distributing the computation outweighs overhead.
      #
      # { Context: Series [predicted incidence]
      #              index: period
      #              values: prediction }
      by_context = dict(map(model_predict, models.items()))

      l.info('4. Aggregating results')
      # Re-key so the nows can be aggregated; sorting puts equal keys next
      # to each other, which groupby() below requires.
      #
      # [ ((outbreak, training, horizon),
      #    (now, Series [predicted incidence])), ... ]
      rekeyed = [((ctx.outbreak, ctx.training, ctx.horizon), (ctx.now, p))
                 for (ctx, p) in by_context.items()]
      rekeyed.sort()
      # Aggregate into DataFrames.
      #
      # { (outbreak, training, horizon): DataFrame [predicted incidence]
      #                                    index: period
      #                                    columns: now }
      by_test = dict()
      for (key, group) in itertools.groupby(rekeyed, operator.itemgetter(0)):
         by_test[key] = model_summarize(group)

      l.info('5. Dumping results')
      # Gather by outbreak
      #
      # { outbreak: { horizon: { training: DataFrame [predicted incidence] } } }
      result_tree = u.defaultdict_recursive()
      for ((ob, tr, ho), df) in by_test.items():
         result_tree[ob][ho][tr] = df
      # For each outbreak, dump a pickle file containing the dict above. These
      # are then translated to TSV files for plotting in later steps.
      for (outbreak, subtree) in result_tree.as_dict().items():
         pickle_dump(outbreak, 'out', subtree)

   l.info('done in %s' % u.fmt_seconds(time.time() - start_time))
Exemple #18
0
class Test_Sequence(object):
    """Drive a scheduled sequence of model tests and save their results.

    The parsed command-line arguments are kept in ``self.args``. This class
    relies on attributes and methods that are not defined in this excerpt
    (``schedule``, ``summary``, ``log_parameters()``, ``schedule_build()``,
    ``summarize()``, ``tsv_save_tests()``); presumably they are provided
    elsewhere in the class or by a subclass.
    """

    def __init__(self, args):
        """Remember ``args`` (the parsed arguments) for later use."""
        self.args = args

    @property
    def first_good_test(self):
        """Return the first test in ``self.schedule`` that was attempted.

        Raises StopIteration if no scheduled test has been attempted.
        """
        # Any attempted test will give us what we need, but an arbitrary
        # number of tests might not have been attempted.
        #
        # A generator expression is used instead of itertools.ifilter(),
        # which only exists on Python 2.
        return next(t for t in self.schedule if t.attempted)

    def main(self):
        """Run the whole test sequence.

        Opens the database, normalizes the start/end times, builds the
        schedule, runs each test (or reloads saved results for tests that
        precede ``--start-test``), and finally writes ``tests.tsv`` and a
        pickled summary into ``self.args.output_dir``.

        Raises IOError if reloading a saved test fails for any reason other
        than the saved file being missing.
        """
        # Local import: only this method needs it.
        import errno

        u.memory_use_log()
        t_start = time.time()
        db = db_glue.DB(self.args.database_file)
        l.info('opened database %s' % (self.args.database_file))
        # NOTE: this check disappears under "python -O".
        assert (db.metadata_get('schema_version') == '5')
        # normalize start and end times
        if (self.args.start is None):
            sql = 'SELECT min(created_at) AS "st [timestamp]" FROM tweet'
            self.args.start = db.sql(sql)[0]['st']
        if (self.args.end is None):
            sql = 'SELECT max(created_at) AS "et [timestamp]" FROM tweet'
            # add one second because end time is exclusive
            self.args.end = db.sql(sql)[0]['et'] + timedelta(seconds=1)
        self.args.start = time_.as_utc(self.args.start)
        self.args.end = time_.as_utc(self.args.end)
        # print test sequence parameters
        self.log_parameters()
        # set up model parameters
        model_class = u.class_by_name(self.args.model)
        model_class.parms_init(self.args.model_parms, log_parms=True)
        # build schedule
        self.schedule_build(self.args.limit)
        l.info('scheduled %s tests (%s left over)' %
               (len(self.schedule), self.args.end - self.schedule[-1].end))
        if (not os.path.exists(self.args.output_dir)):
            os.mkdir(self.args.output_dir)
        l.info('results in %s' % (self.args.output_dir))
        # testing loop
        for (i, t) in enumerate(self.schedule):
            if (i + 1 < self.args.start_test):
                l.info('using saved test %d per --start-test' % (i + 1))
                l.warning('token and tweet counts will be incorrect')
                # FIXME: hack.....
                try:
                    t.model = u.Deleted_To_Save_Memory()
                    t.results = u.Deleted_To_Save_Memory()
                    t.i = i
                    # Placeholder counts; the real ones are not recomputed
                    # for saved tests (hence the warning above).
                    t.train_tweet_ct = -1e6
                    t.train_token_ct = -1e6
                    t.test_tweet_ct = -1e6
                    t.unshrink_from_disk(self.args.output_dir, results=True)
                    t.attempted = True
                except IOError as x:
                    # A missing saved file just means the test was never
                    # attempted; anything else is a real error.
                    if (x.errno != errno.ENOENT):
                        raise
                    t.attempted = False
            else:
                l.info('starting test %d of %d: %s' %
                       (i + 1, len(self.schedule), t))
                t.do_test(model_class, db, self.args, i)
            t.summarize()
            if (t.attempted):
                if (self.args.profile_memory):
                    # We dump a memory profile here because it's the high water
                    # mark; we're about to reduce usage significantly.
                    import meliae.scanner as ms
                    filename = 'memory.%d.json' % (i)
                    l.info('dumping memory profile %s' % (filename))
                    ms.dump_all_objects('%s/%s' %
                                        (self.args.output_dir, filename))
                t.shrink_to_disk(self.args.output_dir)
            l.debug('result: %s' % (t.summary))
            u.memory_use_log()
        # done!
        l.debug('computing summary')
        self.summarize()
        l.debug('summary: %s' % (self.summary))
        l.debug('saving TSV results')
        test_indices = u.sl_union_fromtext(len(self.schedule), ':')
        self.tsv_save_tests('%s/%s' % (self.args.output_dir, 'tests.tsv'),
                            test_indices)
        l.debug('saving pickled summary')
        self.memory_use = u.memory_use()
        self.memory_use_peak = u.memory_use(True)
        self.time_use = time.time() - t_start
        u.pickle_dump('%s/%s' % (self.args.output_dir, 'summary'), self)
        u.memory_use_log()
        l.info('done in %s' % (u.fmt_seconds(self.time_use)))
Exemple #19
0
def main():
    """Run the forecasting pipeline end to end.

    Loads the inputs into the shared namespace ``g``, enumerates the tests,
    then builds, dumps, evaluates, and aggregates one model per test context,
    writing per-outbreak pickle files along the way.
    """

    l.info('starting')
    start_time = time.time()
    args_clean()
    g.args = args

    u.memory_use_log(level=l.info)
    l.info('loading input data')
    g.truth = truth_load()
    g.graph = graph_load()
    g.vectors = vectors_load()
    u.memory_use_log(level=l.info)

    g.tests = tests_enumerate()
    l.info('scheduled %d tests' % len(g.tests))

    l.info('saving input data')
    pickle_dump('input', None, g)

    with jl.Parallel(n_jobs=-1, verbose=5) as P:

        l.info('1. Building models')
        # Fit in parallel, then keep only the contexts that produced a model.
        #
        # { Context: sk.LinearModel [fitted model] }
        fitted = P(jl.delayed(model_build)(t) for t in g.tests)
        models = {}
        for (ctx, model) in zip(g.tests, fitted):
            if model is not None:
                models[ctx] = model
        unconverged_ct = sum(not model.converged for model in models.values())
        l.info('built %d models (%d at max iterations)' %
               (len(models), unconverged_ct))

        l.info('2. Dumping models')
        # These dumps are self-contained enough to be loaded in a Python
        # interpreter that is not QUAC-aware.
        #
        # { outbreak: { horizon: { training: { now: fitted model } } } }
        models_nested = u.defaultdict_recursive()
        for (ctx, model) in models.items():
            by_training = models_nested[ctx.outbreak][ctx.horizon][ctx.training]
            by_training[ctx.now] = model
        for (outbreak, outbreak_models) in models_nested.as_dict().items():
            pickle_dump(outbreak, 'model', outbreak_models)

        l.info('3. Evaluating models')
        # Evaluations run in ~0.15s (according to joblib), so it's not clear
        # that distributing the computation outweighs overhead; run serially.
        #
        # { Context: Series [predicted incidence]
        #              index: period
        #              values: prediction }
        predictions = dict(map(model_predict, models.items()))

        l.info('4. Aggregating results')
        # Re-key so the nows can be aggregated; sorted because groupby()
        # only merges adjacent items with equal keys.
        #
        # [ ((outbreak, training, horizon),
        #    (now, Series [predicted incidence])), ... ]
        rekeyed = [((ctx.outbreak, ctx.training, ctx.horizon), (ctx.now, series))
                   for (ctx, series) in predictions.items()]
        rekeyed.sort()
        # Aggregate into DataFrames.
        #
        # { (outbreak, training, horizon): DataFrame [predicted incidence]
        #                                    index: period
        #                                    columns: now }
        frames = {}
        for (key, group) in itertools.groupby(rekeyed, operator.itemgetter(0)):
            frames[key] = model_summarize(group)

        l.info('5. Dumping results')
        # Gather by outbreak
        #
        # { outbreak: { horizon: { training: DataFrame [predicted incidence] } } }
        frames_nested = u.defaultdict_recursive()
        for ((outbreak, training, horizon), frame) in frames.items():
            frames_nested[outbreak][horizon][training] = frame
        # For each outbreak, dump a pickle file containing the dict above. These
        # are then translated to TSV files for plotting in later steps.
        for (outbreak, outbreak_frames) in frames_nested.as_dict().items():
            pickle_dump(outbreak, 'out', outbreak_frames)

    l.info('done in %s' % u.fmt_seconds(time.time() - start_time))
Exemple #20
0
# Write benchmark for a Berkeley DB B-tree: insert outer_ct batches of
# inner_ct fixed-size int32 vectors, logging throughput and memory use after
# each batch, then compact and close the database and log the overall rate.
# `db`, `outer_ct` and `inner_ct` are set up before this point.

#db.set_flags(bdb.DB_TXN_NOT_DURABLE)
# 32 MiB cache -- presumably the arguments are (gbytes, bytes); confirm
# against the Berkeley DB bindings in use.
db.set_cachesize(0, 32*1024*1024)
# 64 KiB pages
db.set_pagesize(64*1024)
db.open('/data6/foo.db', dbtype=bdb.DB_BTREE, flags=(bdb.DB_CREATE))

# Overall timer: covers all batches plus compaction and close.
start_out = time.time()
for j in range(outer_ct):
   # Per-batch timer: covers the puts and the sync for this batch.
   start = time.time()
   for i in range(inner_ct):
      # Key is the running record number as UTF-8 text; value is a vector of
      # 720 int32 ones.
      db.put(str(j * inner_ct + i).encode('UTF-8'),
             np.ones(720, dtype=np.int32))
   db.sync()
   end = time.time()
   elapsed = end - start
   # Last field is the fraction of all vectors written so far.
   l.info('%d vectors in %s (%d/s), %.3f'
          % (inner_ct, u.fmt_seconds(elapsed), inner_ct/elapsed,
             (j+1)*inner_ct/(outer_ct*inner_ct)))
   u.memory_use_log()

l.info('compacting database')
# Statistics are printed before and after compaction so they can be compared.
pprint(db.stat())
db.compact(flags=bdb.DB_FREE_SPACE)
l.info('closing database')
pprint(db.stat())
db.close()
end_out = time.time()
elapsed_out = end_out - start_out
l.info('%d vectors in %s (%d/s)'
       % (outer_ct * inner_ct, u.fmt_seconds(elapsed_out),
          (outer_ct * inner_ct)/elapsed_out))
u.memory_use_log()
Exemple #21
0
def main():
    """Run the Spark forecasting pipeline end to end.

    Configures a SparkContext, broadcasts the arguments, ground truth and
    test list, then finds candidate articles per context, builds and dumps a
    model per context, computes predictions, and dumps them per outbreak.
    Finally logs evaluation statistics and dumps Spark profiles when the
    context supports it.

    Several module-level globals are (re)bound here: ``args_b``, ``truth``,
    ``truth_b``, ``tests``, ``tests_b``, ``article_ct`` and ``eval_elapsed``.
    """
    l.info('starting')
    args_clean()
    # set up Spark
    conf = pyspark.SparkConf()
    conf.setExecutorEnv('PYTHONPATH', QUACLIB)
    if (args.profile):
        conf.set('spark.python.profile', 'true')
    sc = pyspark.SparkContext(conf=conf)
    global args_b
    args_b = sc.broadcast(args)
    # load ground truth data
    global truth
    truth = truth_load()
    l.info('found truth with %d outbreaks' % len(truth.columns))
    global truth_b
    truth_b = sc.broadcast(truth)
    # find dataset
    shard_ct = shards_count()
    l.info('found dataset with %d shards' % shard_ct)
    if (args.shards is not None):
        shard_ct = args.shards
    l.info('will process %d shards' % shard_ct)
    # figure out what tests to do
    global tests
    tests = tests_enumerate()
    l.info('planning %d tests' % len(tests))
    global tests_b
    tests_b = sc.broadcast(tests)
    # some timing accumulators
    global article_ct
    article_ct = sc.accumulator(0)
    global eval_elapsed
    eval_elapsed = sc.accumulator(0)
    # let's go
    l.info('starting computation')

    # 1. Distribute shard indexes
    #
    shards = sc.parallelize(range(shard_ct), shard_ct)

    # 2. Find candidate articles
    #
    # 2a. Find top candidates within each shard for each context
    #
    #     key: Context
    #     val: Priority_Queue:
    #             pri:  r [correlation with ground truth on training data]
    #             val:  (Series [complete time series, .name is URL],
    #                    Series [shifted/truncated training data, .name is URL])
    cands = shards.flatMap(candidates_read)

    # 2b. Find global top candidates for each context
    #
    #     (form same as above)
    cands = cands.reduceByKey(candidates_merge)
    cands.cache()

    # 2c. Dump top candidate summaries
    #
    #     articles and correlations for each context
    #       key: outbreak
    #       val: dict:
    #              key: (training duration (timedelta),
    #                    forecast horizon (timedelta),
    #                    now (datetime))
    #              val: articles (ordered list of (URL, r))
    #l.info('dumping candidate summaries')
    #summs = cands.map(candidate_summarize) \
    #             .reduceByKey(u.dicts_merge)
    #summs.foreach(pickle_dump('r'))

    # 3. Build models
    #
    # 3a. Build a model for each context
    #
    #       key: Context
    #       val: (sk.LinearModel [fitted model],
    #             DataFrame [full candidate time series, URL columns],
    #             DataFrame [training candidate time series, URL columns])
    #
    #     Order of coefficients in model and DataFrame are the same.
    models = cands.map(model_build)
    # Cached because both the dump (3b) and the evaluation (4a) consume it.
    models.cache()

    # 3b. Dump models and article data for each context. These dumps are
    #     self-contained enough to be loaded in a Python interpreter that is
    #     not QUAC-aware. This should produce a few 10's of GiB of data.
    #
    #       key: outbreak
    #       val: { horizon:
    #              { training:
    #                { now:
    #                  { 'model':  sk.LinearModel [fitted model],
    #                    'data':   DataFrame [full data, URL columns],
    #                    'trdata': DataFrame [training data, URL columns] }}}}
    summs = models.map(model_summarize) \
                  .reduceByKey(u.dicts_merge)
    summs.foreach(pickle_dump('model'))

    # 4. Evaluate models
    #
    # 4a. Compute predicted values
    #
    #       key: Context
    #       val: Series [predicted incidence]
    #               index: period
    #               values: prediction)
    preds = models.map(model_predict)

    # 4b. Re-key results to put nows in value
    #
    #       key: (outbreak, training duration, forecast horizon)
    #       val: (now, Series [predicted incidence])
    preds = preds.map(lambda x: ((x[0].outbreak, x[0].training, x[0].horizon),
                                 (x[0].now, x[1])))

    # 4c. Summarize results (~2K keys)
    #
    #       key: (outbreak, training duration, forecast horizon)
    #       val: (DataFrame [predicted incidence]:
    #               index: period
    #               columns: nows)
    preds = preds.groupByKey() \
                 .map(model_result_summarize)

    # 4d. Gather results by outbreak (~20 keys, ~20MB/key)
    #
    #       key: outbreak
    #       val: dict:
    #              key: forecast horizon
    #              val: dict:
    #                     key: training duration
    #                     val: DataFrame [predicted incidence]
    #
    #       Note: we could also use a Panel4D for this, but I haven't put in
    #       the effort to wrap my head around it.
    preds = preds.map(lambda x: (x[0][0], { x[0][2]: { x[0][1]: x[1] } })) \
                 .reduceByKey(u.dicts_merge)

    # 4e. Dump predictions
    #
    #       For each outbreak, dump a pickle file containing the dict above.
    #       These are then translated to TSV files for plotting in later steps.
    preds.foreach(pickle_dump('out'))

    # finish up
    eval_ct = article_ct.value * len(tests)
    # If no articles or no tests were evaluated, eval_ct is zero; report NaN
    # rather than raising ZeroDivisionError and skipping the profile dump.
    if (eval_ct > 0):
        us_per_eval = eval_elapsed.value * 1e6 / eval_ct
    else:
        us_per_eval = float('nan')
    l.info('evaluated: %d articles, %d contexts, %d total; %s (%.0f µs/eval)' %
           (article_ct.value, len(tests), eval_ct,
            u.fmt_seconds(eval_elapsed.value), us_per_eval))
    l.info('done')
    # Presumably dump_profiles() is missing on some Spark versions, hence the
    # AttributeError guard -- confirm against the deployed pyspark.
    try:
        sc.dump_profiles(args.outdir)
        #sc.show_profiles()
    except AttributeError:
        pass