Example #1
 def tsv_save_tokens(self, filename, geofiles_p, geoimage_width,
                     test_indices, token_idx, tw_tokens):
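    '''Write per-token summary rows (or, if geofiles_p, geofiles) for each
       attempted test in test_indices; a token is included if its rank is
       selected by token_idx or if the test index appears in tw_tokens.'''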
    l.info('writing token summaries to %s' % (filename))
    if (not geofiles_p):
       tsv = self.tsv_open(filename)
       self.first_good_test.unshrink_from_disk(self.args.output_dir,
                                               model=True)
       tsv.writerow(['test_idx', 'token_idx']
                 + list(self.first_good_test.model.token_summary_keys))
    for i in sorted(test_indices):
       test = self.schedule[i]
       if (not test.attempted):
          continue
       test.unshrink_from_disk(self.args.output_dir, model=True)
       tokenrows = [test.model.token_summary(token)
                    for token in test.model.tokens]
       tokenrows.sort(key=operator.itemgetter('point_ct'), reverse=True)
       token_indices = u.sl_union_fromtext(len(tokenrows), token_idx)
       for j in xrange(len(tokenrows)):
          tokenrow = tokenrows[j]
          if (not (j in token_indices
                   or i in tw_tokens.get(tokenrow['token'], set()))):
             continue
          if (not geofiles_p):
             tsv.writerow([i, j] + tokenrow.values())
          else:
             assert (geoimage_width > 0)
             gi_basename = u'%s.%d.%d' % (u.without_ext(filename), i, j)
             l.debug('writing geofiles %s' % (gi_basename))
             test.model.dump_geofiles(gi_basename, geoimage_width,
                                      tokenrow['token'])
       test.shrink()
Example #2
 def tsv_save_tokens(self, filename, geofiles_p, geoimage_width,
                     test_indices, token_idx, tw_tokens):
     l.info('writing token summaries to %s' % (filename))
     if (not geofiles_p):
         tsv = self.tsv_open(filename)
         self.first_good_test.unshrink_from_disk(self.args.output_dir,
                                                 model=True)
         tsv.writerow(['test_idx', 'token_idx'] +
                      list(self.first_good_test.model.token_summary_keys))
     for i in sorted(test_indices):
         test = self.schedule[i]
         if (not test.attempted):
             continue
         test.unshrink_from_disk(self.args.output_dir, model=True)
         tokenrows = [
             test.model.token_summary(token) for token in test.model.tokens
         ]
         tokenrows.sort(key=operator.itemgetter('point_ct'), reverse=True)
         token_indices = u.sl_union_fromtext(len(tokenrows), token_idx)
         for j in xrange(len(tokenrows)):
             tokenrow = tokenrows[j]
             if (not (j in token_indices
                      or i in tw_tokens.get(tokenrow['token'], set()))):
                 continue
             if (not geofiles_p):
                 tsv.writerow([i, j] + tokenrow.values())
             else:
                 assert (geoimage_width > 0)
                 gi_basename = u'%s.%d.%d' % (u.without_ext(filename), i, j)
                 l.debug('writing geofiles %s' % (gi_basename))
                 test.model.dump_geofiles(gi_basename, geoimage_width,
                                          tokenrow['token'])
         test.shrink()
Example #3
 def tsv_save_tweets(self, filename, include_fails_p, geofiles_p,
                     geoimage_width, test_indices, tweet_idx):
    '''Return value is a mapping from tokens involved in printed tweets to
       a set of test indices in which they appeared.'''
    tokens = defaultdict(set)
    l.info('writing tweet summaries%s to %s'
           % (' and geoimages' if geofiles_p else '', filename))
    if (not geofiles_p):
       tsv = self.tsv_open(filename)
       self.first_good_test.unshrink_from_disk(self.args.output_dir,
                                               results=True)
       tsv.writerow(['test_idx', 'tweet_idx']
                    + self.first_good_test.results[0].summary_dict.keys())
    for i in sorted(test_indices):
       test = self.schedule[i]
       if (not test.attempted):
          continue
       test.unshrink_from_disk(self.args.output_dir, results=True)
       tweetrows = test.results[:]
       tweetrows.sort(key=operator.attrgetter('cae'))
       for j in sorted(u.sl_union_fromtext(len(tweetrows), tweet_idx)):
          r = tweetrows[j]
          if (not r.success_ct and not include_fails_p):
             continue
          for token in r.location_estimate.explanation.iterkeys():
             tokens[token].add(i)
          if (not geofiles_p):
             tsv.writerow([i, j] + r.summary_dict.values())
          else:
             assert (geoimage_width > 0)
             gi_basename = u'%s.%d.%d' % (u.without_ext(filename), i, j)
             l.debug('writing geofiles %s' % (gi_basename))
             # FIXME: ugly hack to get PR_90 instead of PR_95
             import geo.gmm
             geo.gmm.Token.parms_init({})
             r.location_estimate.dump_geofiles(gi_basename,
                                               geoimage_width, 0.90)
             srs.dump_geojson(gi_basename + '.truth', r.tweet.geom)
       test.shrink()
    return tokens
Example #4
 def tsv_save_tweets(self, filename, include_fails_p, geofiles_p,
                     geoimage_width, test_indices, tweet_idx):
     '''Return value is a mapping from tokens involved in printed tweets to
      a set of test indices in which they appeared.'''
     tokens = defaultdict(set)
     l.info('writing tweet summaries%s to %s' %
            (' and geoimages' if geofiles_p else '', filename))
     if (not geofiles_p):
         tsv = self.tsv_open(filename)
         self.first_good_test.unshrink_from_disk(self.args.output_dir,
                                                 results=True)
         tsv.writerow(['test_idx', 'tweet_idx'] +
                      self.first_good_test.results[0].summary_dict.keys())
     for i in sorted(test_indices):
         test = self.schedule[i]
         if (not test.attempted):
             continue
         test.unshrink_from_disk(self.args.output_dir, results=True)
         tweetrows = test.results[:]
         tweetrows.sort(key=operator.attrgetter('cae'))
         for j in sorted(u.sl_union_fromtext(len(tweetrows), tweet_idx)):
             r = tweetrows[j]
             if (not r.success_ct and not include_fails_p):
                 continue
             for token in r.location_estimate.explanation.iterkeys():
                 tokens[token].add(i)
             if (not geofiles_p):
                 tsv.writerow([i, j] + r.summary_dict.values())
             else:
                 assert (geoimage_width > 0)
                 gi_basename = u'%s.%d.%d' % (u.without_ext(filename), i, j)
                 l.debug('writing geofiles %s' % (gi_basename))
                 # FIXME: ugly hack to get PR_90 instead of PR_95
                 import geo.gmm
                 geo.gmm.Token.parms_init({})
                 r.location_estimate.dump_geofiles(gi_basename,
                                                   geoimage_width, 0.90)
                 srs.dump_geojson(gi_basename + '.truth', r.tweet.geom)
         test.shrink()
     return tokens
Example #5
 def main(self):
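     '''Run the test sequence: normalize the time window, build the schedule,
        run (or reload) each test, and save TSV results and a pickled
        summary.'''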
     u.memory_use_log()
     t_start = time.time()
     # Replaced with self.cur in __init__
     # db = db_glue.DB(self.args.database_file)
     # assert (db.metadata_get('schema_version') == '5')
     # normalize start and end times
     if (self.args.start is None):
         sql = 'SELECT min(created_at) AS st FROM {0};'.format(self.table)
         self.cur.execute(sql)
         self.args.start = self.cur.fetchone()[0]
     if (self.args.end is None):
         sql = 'SELECT max(created_at) AS et FROM {0};'.format(self.table)
         self.cur.execute(sql)
         # add one second because end time is exclusive
         self.args.end = self.cur.fetchone()[0] + timedelta(seconds=1)
     self.args.start = time_.as_utc(self.args.start)
     self.args.end = time_.as_utc(self.args.end)
     # print test sequence parameters
     self.log_parameters()
     # set up model parameters
     model_class = u.class_by_name(self.args.model)
     model_class.parms_init(self.args.model_parms, log_parms=True)
     # build schedule
     self.schedule_build(self.args.limit)
     l.info('scheduled %s tests (%s left over)'
            % (len(self.schedule), self.args.end - self.schedule[-1].end))
     if (not os.path.exists(self.args.output_dir)):
         os.mkdir(self.args.output_dir)
     l.info('results in %s' % (self.args.output_dir))
     # testing loop
     for (i, t) in enumerate(self.schedule):
         if (i+1 < self.args.start_test):
             l.info('using saved test %d per --start-test' % (i+1))
             l.warning('token and tweet counts will be incorrect')
             # FIXME: hack.....
             try:
                 t.model = u.Deleted_To_Save_Memory()
                 t.results = u.Deleted_To_Save_Memory()
                 t.i = i
                 t.train_tweet_ct = -1e6
                 t.train_token_ct = -1e6
                 t.test_tweet_ct = -1e6
                 t.unshrink_from_disk(self.args.output_dir, results=True)
                 t.attempted = True
             except IOError as x:
                 if (x.errno != 2):
                     raise
                 t.attempted = False
         else:
             l.info('starting test %d of %d: %s' % (i+1, len(self.schedule), t))
             t.do_test(model_class, self.cur, self.args, i)
         t.summarize()
         if (t.attempted):
             if (self.args.profile_memory):
                 # We dump a memory profile here because it's the high water
                 # mark; we're about to reduce usage significantly.
                 import meliae.scanner as ms
                 filename = 'memory.%d.json' % (i)
                 l.info('dumping memory profile %s' % (filename))
                 ms.dump_all_objects('%s/%s' % (self.args.output_dir, filename))
             t.shrink_to_disk(self.args.output_dir)
         l.debug('result: %s' % (t.summary))
         u.memory_use_log()
     # done!
     l.debug('computing summary')
     self.summarize()
     l.debug('summary: %s' % (self.summary))
     l.debug('saving TSV results')
     test_indices = u.sl_union_fromtext(len(self.schedule), ':')
     self.tsv_save_tests('%s/%s' % (self.args.output_dir, 'tests.tsv'),
                         test_indices)
     l.debug('saving pickled summary')
     self.memory_use = u.memory_use()
     self.memory_use_peak = "Not implemented"
     self.time_use = time.time() - t_start
     u.pickle_dump('%s/%s' % (self.args.output_dir, 'summary'), self)
     u.memory_use_log()
     l.info('done in %s' % (u.fmt_seconds(self.time_use)))
Example #6
class Test_Sequence(object):
    def __init__(self, args):
        self.args = args

    @property
    def first_good_test(self):
        # Any attempted test will give us what we need, but an arbitrary
        # number of tests might not have been attempted.
        return next(itertools.ifilter(lambda t: t.attempted, self.schedule))

    def main(self):
        u.memory_use_log()
        t_start = time.time()
        db = db_glue.DB(self.args.database_file)
        l.info('opened database %s' % (self.args.database_file))
        assert (db.metadata_get('schema_version') == '5')
        # normalize start and end times
        if (self.args.start is None):
            sql = 'SELECT min(created_at) AS "st [timestamp]" FROM tweet'
            self.args.start = db.sql(sql)[0]['st']
        if (self.args.end is None):
            sql = 'SELECT max(created_at) AS "et [timestamp]" FROM tweet'
            # add one second because end time is exclusive
            self.args.end = db.sql(sql)[0]['et'] + timedelta(seconds=1)
        self.args.start = time_.as_utc(self.args.start)
        self.args.end = time_.as_utc(self.args.end)
        # print test sequence parameters
        self.log_parameters()
        # set up model parameters
        model_class = u.class_by_name(self.args.model)
        model_class.parms_init(self.args.model_parms, log_parms=True)
        # build schedule
        self.schedule_build(self.args.limit)
        l.info('scheduled %s tests (%s left over)' %
               (len(self.schedule), self.args.end - self.schedule[-1].end))
        if (not os.path.exists(self.args.output_dir)):
            os.mkdir(self.args.output_dir)
        l.info('results in %s' % (self.args.output_dir))
        # testing loop
        for (i, t) in enumerate(self.schedule):
            if (i + 1 < self.args.start_test):
                l.info('using saved test %d per --start-test' % (i + 1))
                l.warning('token and tweet counts will be incorrect')
                # FIXME: hack.....
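                # Stub out the fields a real test run would fill in, then
                # reload this test's saved results from disk; errno 2 (ENOENT)
                # means no saved results exist, so mark the test unattempted.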
                try:
                    t.model = u.Deleted_To_Save_Memory()
                    t.results = u.Deleted_To_Save_Memory()
                    t.i = i
                    t.train_tweet_ct = -1e6
                    t.train_token_ct = -1e6
                    t.test_tweet_ct = -1e6
                    t.unshrink_from_disk(self.args.output_dir, results=True)
                    t.attempted = True
                except IOError, x:
                    if (x.errno != 2):
                        raise
                    t.attempted = False
            else:
                l.info('starting test %d of %d: %s' %
                       (i + 1, len(self.schedule), t))
                t.do_test(model_class, db, self.args, i)
            t.summarize()
            if (t.attempted):
                if (self.args.profile_memory):
                    # We dump a memory profile here because it's the high water
                    # mark; we're about to reduce usage significantly.
                    import meliae.scanner as ms
                    filename = 'memory.%d.json' % (i)
                    l.info('dumping memory profile %s' % (filename))
                    ms.dump_all_objects('%s/%s' %
                                        (self.args.output_dir, filename))
                t.shrink_to_disk(self.args.output_dir)
            l.debug('result: %s' % (t.summary))
            u.memory_use_log()
        # done!
        l.debug('computing summary')
        self.summarize()
        l.debug('summary: %s' % (self.summary))
        l.debug('saving TSV results')
        test_indices = u.sl_union_fromtext(len(self.schedule), ':')
        self.tsv_save_tests('%s/%s' % (self.args.output_dir, 'tests.tsv'),
                            test_indices)
        l.debug('saving pickled summary')
        self.memory_use = u.memory_use()
        self.memory_use_peak = u.memory_use(True)
        self.time_use = time.time() - t_start
        u.pickle_dump('%s/%s' % (self.args.output_dir, 'summary'), self)
        u.memory_use_log()
        l.info('done in %s' % (u.fmt_seconds(self.time_use)))