Code Example #1
File: bdbtest.py Project: reidpr/quac
from pprint import pprint
import time

from bsddb3 import db as bdb
import numpy as np

import u

outer_ct = 5
inner_ct = 50000

u.logging_init('bdb', verbose_=True)
l = u.l

u.memory_use_log()

db = bdb.DB()
#db.set_flags(bdb.DB_TXN_NOT_DURABLE)
db.set_cachesize(0, 32*1024*1024)
db.set_pagesize(64*1024)
db.open('/data6/foo.db', dbtype=bdb.DB_BTREE, flags=(bdb.DB_CREATE))

start_out = time.time()
for j in range(outer_ct):
   start = time.time()
   for i in range(inner_ct):
      db.put(str(j * inner_ct + i).encode('UTF-8'),
             np.ones(720, dtype=np.int32))
   db.sync()
   end = time.time()
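
The excerpt ends inside its timing loop; as a companion sketch (not part of the original file), the records written above could be read back and checked like this, with np.frombuffer reinterpreting each stored value as int32:

# Hypothetical read-back check against the database written above.
start = time.time()
for i in range(outer_ct * inner_ct):
   buf = db.get(str(i).encode('UTF-8'))
   assert buf is not None
   a = np.frombuffer(buf, dtype=np.int32)  # zero-copy view of the bytes
   assert a.shape == (720,)
l.info('read back in %.2fs' % (time.time() - start))
db.close()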
Code Example #2
File: bdbtest.py Project: suen049/quac
from pprint import pprint
import time

from bsddb3 import db as bdb
import numpy as np

import u

outer_ct = 5
inner_ct = 50000

u.logging_init('bdb', verbose_=True)
l = u.l

u.memory_use_log()

db = bdb.DB()
#db.set_flags(bdb.DB_TXN_NOT_DURABLE)
db.set_cachesize(0, 32 * 1024 * 1024)
db.set_pagesize(64 * 1024)
db.open('/data6/foo.db', dbtype=bdb.DB_BTREE, flags=(bdb.DB_CREATE))

start_out = time.time()
for j in range(outer_ct):
    start = time.time()
    for i in range(inner_ct):
        db.put(
            str(j * inner_ct + i).encode('UTF-8'), np.ones(720,
                                                           dtype=np.int32))
    db.sync()
Code Example #3
File: experiment.py Project: reidpr/quac
def main():

   l.info('starting')
   start_time = time.time()
   args_clean()
   g.args = args

   u.memory_use_log(level=l.info)
   l.info('loading input data')
   g.truth = truth_load()
   g.graph = graph_load()
   g.vectors = vectors_load()
   u.memory_use_log(level=l.info)

   g.tests = tests_enumerate()
   l.info('scheduled %d tests' % len(g.tests))

   l.info('saving input data')
   pickle_dump('input', None, g)

   with jl.Parallel(n_jobs=-1, verbose=5) as P:

      l.info('1. Building models')
      #
      # { Context: sk.LinearModel [fitted model] }
      models = { ctx: m for (ctx, m)
                 in zip(g.tests, P(jl.delayed(model_build)(t) for t in g.tests))
                 if m is not None }
      l.info('built %d models (%d at max iterations)'
             % (len(models), sum(not m.converged for (_, m) in models.items())))

      l.info('2. Dumping models')
      # These dumps are self-contained enough to be loaded in a Python
      # interpreter that is not QUAC-aware.
      #
      # { outbreak: { horizon: { training: { now: fitted model } } } }
      summs = u.defaultdict_recursive()
      for (ctx, m) in models.items():
         summs[ctx.outbreak][ctx.horizon][ctx.training][ctx.now] = m
      for (ob, ob_data) in summs.as_dict().items():
         pickle_dump(ob, 'model', ob_data)

      l.info('3. Evaluating models')
      # Evaluations run in ~0.15s (according to joblib), so it's not clear to
      # me that distributing the computation outweighs overhead.
      #
      # { Context: Series [predicted incidence]
      #              index: period
      #              values: prediction }
      preds = dict(model_predict(cm) for cm in models.items())

      l.info('4. Aggregating results')
      # Re-key so we can aggregate the nows
      #
      # [ ((outbreak, training, horizon),
      #    (now, Series [predicted incidence])), ... ]
      preds = sorted(((ctx.outbreak, ctx.training, ctx.horizon), (ctx.now, p))
                     for (ctx, p) in preds.items())
      # Aggregate into DataFrames.
      #
      # { (outbreak, training, horizon): DataFrame [predicted incidence]
      #                                    index: period
      #                                    columns: now }
      preds = { k: model_summarize(preds)
                for (k, preds)
                in itertools.groupby(preds, operator.itemgetter(0)) }

      l.info('5. Dumping results')
      # Gather by outbreak
      #
      # { outbreak: { horizon: { training: DataFrame [predicted incidence] } } }
      preds2 = u.defaultdict_recursive()
      for ((ob, tr, ho), df) in preds.items():
         preds2[ob][ho][tr] = df
      # For each outbreak, dump a pickle file containing the dict above. These
      # are then translated to TSV files for plotting in later steps.
      for (ob, ob_data) in preds2.as_dict().items():
         pickle_dump(ob, 'out', ob_data)

   l.info('done in %s' % u.fmt_seconds(time.time() - start_time))
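
The comment in step 2 says the model dumps are loadable in a Python interpreter that is not QUAC-aware. A minimal sketch of that, assuming a hypothetical file name ('flu.model.pkl'; the excerpt's pickle_dump() controls the real naming scheme, which is not shown):

import pickle

with open('flu.model.pkl', 'rb') as fp:
   ob_data = pickle.load(fp)
# ob_data: { horizon: { training: { now: fitted sklearn model } } }
for (horizon, by_training) in ob_data.items():
   for (training, by_now) in by_training.items():
      for (now, model) in by_now.items():
         print(horizon, training, now, model.coef_.shape)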
Code Example #4
File: model_test.py Project: bussiere/quac
 def main(self):
    u.memory_use_log()
    t_start = time.time()
    db = db_glue.DB(self.args.database_file)
    l.info('opened database %s' % (self.args.database_file))
    assert (db.metadata_get('schema_version') == '5')
    # normalize start and end times
    if (self.args.start is None):
       sql = 'SELECT min(created_at) AS "st [timestamp]" FROM tweet'
       self.args.start = db.sql(sql)[0]['st']
    if (self.args.end is None):
       sql = 'SELECT max(created_at) AS "et [timestamp]" FROM tweet'
       # add one second because end time is exclusive
       self.args.end = db.sql(sql)[0]['et'] + timedelta(seconds=1)
    self.args.start = time_.as_utc(self.args.start)
    self.args.end = time_.as_utc(self.args.end)
    # print test sequence parameters
    self.log_parameters()
    # set up model parameters
    model_class = u.class_by_name(self.args.model)
    model_class.parms_init(self.args.model_parms, log_parms=True)
    # build schedule
    self.schedule_build(self.args.limit)
    l.info('scheduled %s tests (%s left over)'
           % (len(self.schedule), self.args.end - self.schedule[-1].end))
    if (not os.path.exists(self.args.output_dir)):
       os.mkdir(self.args.output_dir)
    l.info('results in %s' % (self.args.output_dir))
    # testing loop
    for (i, t) in enumerate(self.schedule):
       if (i+1 < self.args.start_test):
          l.info('using saved test %d per --start-test' % (i+1))
          l.warning('token and tweet counts will be incorrect')
          # FIXME: hack.....
          try:
             t.model = u.Deleted_To_Save_Memory()
             t.results = u.Deleted_To_Save_Memory()
             t.i = i
             t.train_tweet_ct = -1e6
             t.train_token_ct = -1e6
             t.test_tweet_ct = -1e6
             t.unshrink_from_disk(self.args.output_dir, results=True)
             t.attempted = True
          except IOError as x:
             if (x.errno != 2):
                raise
             t.attempted = False
       else:
          l.info('starting test %d of %d: %s' % (i+1, len(self.schedule), t))
          t.do_test(model_class, db, self.args, i)
       t.summarize()
       if (t.attempted):
          if (self.args.profile_memory):
             # We dump a memory profile here because it's the high water
             # mark; we're about to reduce usage significantly.
             import meliae.scanner as ms
             filename = 'memory.%d.json' % (i)
             l.info('dumping memory profile %s' % (filename))
             ms.dump_all_objects('%s/%s' % (self.args.output_dir, filename))
          t.shrink_to_disk(self.args.output_dir)
       l.debug('result: %s' % (t.summary))
       u.memory_use_log()
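
The loop above can write meliae heap dumps at the memory high-water mark. A hedged sketch of inspecting one offline with meliae's loader (Python 2, matching the code above; the path is illustrative):

# Inspect a dump written by the loop above (Python 2).
from meliae import loader

om = loader.load('output_dir/memory.0.json')  # illustrative path
om.compute_parents()                          # attach referrer info
print om.summarize()                          # per-type counts and bytes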
Code Example #5
File: model_test.py Project: joh12041/quac
 def main(self):
     u.memory_use_log()
     t_start = time.time()
     # Replaced with self.cur in __init__
     # db = db_glue.DB(self.args.database_file)
     # assert (db.metadata_get('schema_version') == '5')
     # normalize start and end times
     if (self.args.start is None):
         sql = 'SELECT min(created_at) AS st FROM {0};'.format(self.table)
         self.cur.execute(sql)
         self.args.start = self.cur.fetchone()[0]
     if (self.args.end is None):
         sql = 'SELECT max(created_at) AS et FROM {0};'.format(self.table)
         self.cur.execute(sql)
         # add one second because end time is exclusive
         self.args.end = self.cur.fetchone()[0] + timedelta(seconds=1)
     self.args.start = time_.as_utc(self.args.start)
     self.args.end = time_.as_utc(self.args.end)
     # print test sequence parameters
     self.log_parameters()
     # set up model parameters
     model_class = u.class_by_name(self.args.model)
     model_class.parms_init(self.args.model_parms, log_parms=True)
     # build schedule
     self.schedule_build(self.args.limit)
     l.info('scheduled %s tests (%s left over)'
            % (len(self.schedule), self.args.end - self.schedule[-1].end))
     if (not os.path.exists(self.args.output_dir)):
         os.mkdir(self.args.output_dir)
     l.info('results in %s' % (self.args.output_dir))
     # testing loop
     for (i, t) in enumerate(self.schedule):
         if (i+1 < self.args.start_test):
             l.info('using saved test %d per --start-test' % (i+1))
             l.warning('token and tweet counts will be incorrect')
             # FIXME: hack.....
             try:
                 t.model = u.Deleted_To_Save_Memory()
                 t.results = u.Deleted_To_Save_Memory()
                 t.i = i
                 t.train_tweet_ct = -1e6
                 t.train_token_ct = -1e6
                 t.test_tweet_ct = -1e6
                 t.unshrink_from_disk(self.args.output_dir, results=True)
                 t.attempted = True
              except IOError as x:
                 if (x.errno != 2):
                     raise
                 t.attempted = False
         else:
             l.info('starting test %d of %d: %s' % (i+1, len(self.schedule), t))
             t.do_test(model_class, self.cur, self.args, i)
         t.summarize()
         if (t.attempted):
             if (self.args.profile_memory):
                 # We dump a memory profile here because it's the high water
                 # mark; we're about to reduce usage significantly.
                 import meliae.scanner as ms
                 filename = 'memory.%d.json' % (i)
                 l.info('dumping memory profile %s' % (filename))
                 ms.dump_all_objects('%s/%s' % (self.args.output_dir, filename))
             t.shrink_to_disk(self.args.output_dir)
         l.debug('result: %s' % (t.summary))
         u.memory_use_log()
     # done!
     l.debug('computing summary')
     self.summarize()
     l.debug('summary: %s' % (self.summary))
     l.debug('saving TSV results')
     test_indices = u.sl_union_fromtext(len(self.schedule), ':')
     self.tsv_save_tests('%s/%s' % (self.args.output_dir, 'tests.tsv'),
                         test_indices)
     l.debug('saving pickled summary')
     self.memory_use = u.memory_use()
     self.memory_use_peak = "Not implemented"
     self.time_use = time.time() - t_start
     u.pickle_dump('%s/%s' % (self.args.output_dir, 'summary'), self)
     u.memory_use_log()
     l.info('done in %s' % (u.fmt_seconds(self.time_use)))
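
Note the contrast with the experiment.py dumps above: the final u.pickle_dump call pickles the Test_Sequence object itself, so reading it back requires QUAC's own modules to be importable. A hedged sketch, assuming a plain uncompressed pickle and an illustrative path (u.pickle_dump may add a suffix):

import pickle
import sys

sys.path.insert(0, '/path/to/quac/lib')       # QUAC classes must import
with open('output_dir/summary', 'rb') as fp:  # illustrative path/suffix
    seq = pickle.load(fp)
print(seq.summary, seq.time_use)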
Code Example #6
File: experiment.py Project: suen049/quac
def main():

    l.info('starting')
    start_time = time.time()
    args_clean()
    g.args = args

    u.memory_use_log(level=l.info)
    l.info('loading input data')
    g.truth = truth_load()
    g.graph = graph_load()
    g.vectors = vectors_load()
    u.memory_use_log(level=l.info)

    g.tests = tests_enumerate()
    l.info('scheduled %d tests' % len(g.tests))

    l.info('saving input data')
    pickle_dump('input', None, g)

    with jl.Parallel(n_jobs=-1, verbose=5) as P:

        l.info('1. Building models')
        #
        # { Context: sk.LinearModel [fitted model] }
        models = {
            ctx: m
            for (ctx, m) in zip(g.tests,
                                P(jl.delayed(model_build)(t) for t in g.tests))
            if m is not None
        }
        l.info('built %d models (%d at max iterations)' %
               (len(models), sum(not m.converged
                                 for (_, m) in models.items())))

        l.info('2. Dumping models')
        # These dumps are self-contained enough to be loaded in a Python
        # interpreter that is not QUAC-aware.
        #
        # { outbreak: { horizon: { training: { now: fitted model } } } }
        summs = u.defaultdict_recursive()
        for (ctx, m) in models.items():
            summs[ctx.outbreak][ctx.horizon][ctx.training][ctx.now] = m
        for (ob, ob_data) in summs.as_dict().items():
            pickle_dump(ob, 'model', ob_data)

        l.info('3. Evaluating models')
        # Evaluations run in ~0.15s (according to joblib), so it's not clear to
        # me that distributing the computation outweighs overhead.
        #
        # { Context: Series [predicted incidence]
        #              index: period
        #              values: prediction }
        preds = dict(model_predict(cm) for cm in models.items())

        l.info('4. Aggregating results')
        # Re-key so we can aggregate the nows
        #
        # [ ((outbreak, training, horizon),
        #    (now, Series [predicted incidence])), ... ]
        preds = sorted(
            ((ctx.outbreak, ctx.training, ctx.horizon), (ctx.now, p))
            for (ctx, p) in preds.items())
        # Aggregate into DataFrames.
        #
        # { (outbreak, training, horizon): DataFrame [predicted incidence]
        #                                    index: period
        #                                    columns: now }
        preds = {
            k: model_summarize(preds)
            for (k, preds) in itertools.groupby(preds, operator.itemgetter(0))
        }

        l.info('5. Dumping results')
        # Gather by outbreak
        #
        # { outbreak: { horizon: { training: DataFrame [predicted incidence] } } }
        preds2 = u.defaultdict_recursive()
        for ((ob, tr, ho), df) in preds.items():
            preds2[ob][ho][tr] = df
        # For each outbreak, dump a pickle file containing the dict above. These
        # are then translated to TSV files for plotting in later steps.
        for (ob, ob_data) in preds2.as_dict().items():
            pickle_dump(ob, 'out', ob_data)

    l.info('done in %s' % u.fmt_seconds(time.time() - start_time))
Code Example #7
File: model_test.py Project: suen049/quac
 def main(self):
     u.memory_use_log()
     t_start = time.time()
     db = db_glue.DB(self.args.database_file)
     l.info('opened database %s' % (self.args.database_file))
     assert (db.metadata_get('schema_version') == '5')
     # normalize start and end times
     if (self.args.start is None):
         sql = 'SELECT min(created_at) AS "st [timestamp]" FROM tweet'
         self.args.start = db.sql(sql)[0]['st']
     if (self.args.end is None):
         sql = 'SELECT max(created_at) AS "et [timestamp]" FROM tweet'
         # add one second because end time is exclusive
         self.args.end = db.sql(sql)[0]['et'] + timedelta(seconds=1)
     self.args.start = time_.as_utc(self.args.start)
     self.args.end = time_.as_utc(self.args.end)
     # print test sequence parameters
     self.log_parameters()
     # set up model parameters
     model_class = u.class_by_name(self.args.model)
     model_class.parms_init(self.args.model_parms, log_parms=True)
     # build schedule
     self.schedule_build(self.args.limit)
     l.info('scheduled %s tests (%s left over)' %
            (len(self.schedule), self.args.end - self.schedule[-1].end))
     if (not os.path.exists(self.args.output_dir)):
         os.mkdir(self.args.output_dir)
     l.info('results in %s' % (self.args.output_dir))
     # testing loop
     for (i, t) in enumerate(self.schedule):
         if (i + 1 < self.args.start_test):
             l.info('using saved test %d per --start-test' % (i + 1))
             l.warning('token and tweet counts will be incorrect')
             # FIXME: hack.....
             try:
                 t.model = u.Deleted_To_Save_Memory()
                 t.results = u.Deleted_To_Save_Memory()
                 t.i = i
                 t.train_tweet_ct = -1e6
                 t.train_token_ct = -1e6
                 t.test_tweet_ct = -1e6
                 t.unshrink_from_disk(self.args.output_dir, results=True)
                 t.attempted = True
              except IOError as x:
                 if (x.errno != 2):
                     raise
                 t.attempted = False
         else:
             l.info('starting test %d of %d: %s' %
                    (i + 1, len(self.schedule), t))
             t.do_test(model_class, db, self.args, i)
         t.summarize()
         if (t.attempted):
             if (self.args.profile_memory):
                 # We dump a memory profile here because it's the high water
                 # mark; we're about to reduce usage significantly.
                 import meliae.scanner as ms
                 filename = 'memory.%d.json' % (i)
                 l.info('dumping memory profile %s' % (filename))
                 ms.dump_all_objects('%s/%s' %
                                     (self.args.output_dir, filename))
             t.shrink_to_disk(self.args.output_dir)
         l.debug('result: %s' % (t.summary))
         u.memory_use_log()
Code Example #8
File: model_test.py Project: suen049/quac
class Test_Sequence(object):
    def __init__(self, args):
        self.args = args

    @property
    def first_good_test(self):
        # Any attempted test will give us what we need, but an arbitrary
        # number of tests might not have been attempted.
        return next(itertools.ifilter(lambda t: t.attempted, self.schedule))

    def main(self):
        u.memory_use_log()
        t_start = time.time()
        db = db_glue.DB(self.args.database_file)
        l.info('opened database %s' % (self.args.database_file))
        assert (db.metadata_get('schema_version') == '5')
        # normalize start and end times
        if (self.args.start is None):
            sql = 'SELECT min(created_at) AS "st [timestamp]" FROM tweet'
            self.args.start = db.sql(sql)[0]['st']
        if (self.args.end is None):
            sql = 'SELECT max(created_at) AS "et [timestamp]" FROM tweet'
            # add one second because end time is exclusive
            self.args.end = db.sql(sql)[0]['et'] + timedelta(seconds=1)
        self.args.start = time_.as_utc(self.args.start)
        self.args.end = time_.as_utc(self.args.end)
        # print test sequence parameters
        self.log_parameters()
        # set up model parameters
        model_class = u.class_by_name(self.args.model)
        model_class.parms_init(self.args.model_parms, log_parms=True)
        # build schedule
        self.schedule_build(self.args.limit)
        l.info('scheduled %s tests (%s left over)' %
               (len(self.schedule), self.args.end - self.schedule[-1].end))
        if (not os.path.exists(self.args.output_dir)):
            os.mkdir(self.args.output_dir)
        l.info('results in %s' % (self.args.output_dir))
        # testing loop
        for (i, t) in enumerate(self.schedule):
            if (i + 1 < self.args.start_test):
                l.info('using saved test %d per --start-test' % (i + 1))
                l.warning('token and tweet counts will be incorrect')
                # FIXME: hack.....
                try:
                    t.model = u.Deleted_To_Save_Memory()
                    t.results = u.Deleted_To_Save_Memory()
                    t.i = i
                    t.train_tweet_ct = -1e6
                    t.train_token_ct = -1e6
                    t.test_tweet_ct = -1e6
                    t.unshrink_from_disk(self.args.output_dir, results=True)
                    t.attempted = True
                except IOError as x:
                    if (x.errno != 2):
                        raise
                    t.attempted = False
            else:
                l.info('starting test %d of %d: %s' %
                       (i + 1, len(self.schedule), t))
                t.do_test(model_class, db, self.args, i)
            t.summarize()
            if (t.attempted):
                if (self.args.profile_memory):
                    # We dump a memory profile here because it's the high water
                    # mark; we're about to reduce usage significantly.
                    import meliae.scanner as ms
                    filename = 'memory.%d.json' % (i)
                    l.info('dumping memory profile %s' % (filename))
                    ms.dump_all_objects('%s/%s' %
                                        (self.args.output_dir, filename))
                t.shrink_to_disk(self.args.output_dir)
            l.debug('result: %s' % (t.summary))
            u.memory_use_log()
        # done!
        l.debug('computing summary')
        self.summarize()
        l.debug('summary: %s' % (self.summary))
        l.debug('saving TSV results')
        test_indices = u.sl_union_fromtext(len(self.schedule), ':')
        self.tsv_save_tests('%s/%s' % (self.args.output_dir, 'tests.tsv'),
                            test_indices)
        l.debug('saving pickled summary')
        self.memory_use = u.memory_use()
        self.memory_use_peak = u.memory_use(True)
        self.time_use = time.time() - t_start
        u.pickle_dump('%s/%s' % (self.args.output_dir, 'summary'), self)
        u.memory_use_log()
        l.info('done in %s' % (u.fmt_seconds(self.time_use)))
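
The first_good_test property above depends on the Python-2-only itertools.ifilter; under Python 3 the same property can be written with a generator expression (a sketch, preserving the StopIteration behavior when no test was attempted):

    @property
    def first_good_test(self):
        # Python 3 equivalent of the ifilter version above; still raises
        # StopIteration if no test was attempted.
        return next(t for t in self.schedule if t.attempted)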