# Standard-library imports needed by the code below; the project-local
# modules (l = logger, u = utility module, db_glue, time_, srs, geo) are
# assumed to be imported elsewhere in this module.
import itertools
import operator
import os
import time
from collections import defaultdict
from datetime import timedelta


def tsv_save_tokens(self, filename, geofiles_p, geoimage_width,
                    test_indices, token_idx, tw_tokens):
    l.info('writing token summaries to %s' % (filename))
    if (not geofiles_p):
        tsv = self.tsv_open(filename)
        self.first_good_test.unshrink_from_disk(self.args.output_dir,
                                                model=True)
        tsv.writerow(['test_idx', 'token_idx']
                     + list(self.first_good_test.model.token_summary_keys))
    for i in sorted(test_indices):
        test = self.schedule[i]
        if (not test.attempted):
            continue
        test.unshrink_from_disk(self.args.output_dir, model=True)
        tokenrows = [test.model.token_summary(token)
                     for token in test.model.tokens]
        tokenrows.sort(key=operator.itemgetter('point_ct'), reverse=True)
        token_indices = u.sl_union_fromtext(len(tokenrows), token_idx)
        for j in xrange(len(tokenrows)):
            tokenrow = tokenrows[j]
            # tw_tokens maps tokens to the set of test indices in which they
            # appeared in printed tweets (see tsv_save_tweets()); such tokens
            # are kept even if not selected by token_idx.
            if (not (j in token_indices
                     or i in tw_tokens.get(tokenrow['token'], set()))):
                continue
            if (not geofiles_p):
                tsv.writerow([i, j] + tokenrow.values())
            else:
                assert (geoimage_width > 0)
                gi_basename = u'%s.%d.%d' % (u.without_ext(filename), i, j)
                l.debug('writing geofiles %s' % (gi_basename))
                test.model.dump_geofiles(gi_basename, geoimage_width,
                                         tokenrow['token'])
        test.shrink()
def tsv_save_tweets(self, filename, include_fails_p, geofiles_p,
                    geoimage_width, test_indices, tweet_idx):
    '''Return value is a mapping from tokens involved in printed tweets to a
       set of test indices in which they appeared.'''
    tokens = defaultdict(set)
    l.info('writing tweet summaries%s to %s'
           % (' and geoimages' if geofiles_p else '', filename))
    if (not geofiles_p):
        tsv = self.tsv_open(filename)
        self.first_good_test.unshrink_from_disk(self.args.output_dir,
                                                results=True)
        tsv.writerow(['test_idx', 'tweet_idx']
                     + self.first_good_test.results[0].summary_dict.keys())
    for i in sorted(test_indices):
        test = self.schedule[i]
        if (not test.attempted):
            continue
        test.unshrink_from_disk(self.args.output_dir, results=True)
        tweetrows = test.results[:]
        tweetrows.sort(key=operator.attrgetter('cae'))
        for j in sorted(u.sl_union_fromtext(len(tweetrows), tweet_idx)):
            r = tweetrows[j]
            if (not r.success_ct and not include_fails_p):
                continue
            for token in r.location_estimate.explanation.iterkeys():
                tokens[token].add(i)
            if (not geofiles_p):
                tsv.writerow([i, j] + r.summary_dict.values())
            else:
                assert (geoimage_width > 0)
                gi_basename = u'%s.%d.%d' % (u.without_ext(filename), i, j)
                l.debug('writing geofiles %s' % (gi_basename))
                # FIXME: ugly hack to get PR_90 instead of PR_95
                import geo.gmm
                geo.gmm.Token.parms_init({})
                r.location_estimate.dump_geofiles(gi_basename,
                                                  geoimage_width, 0.90)
                srs.dump_geojson(gi_basename + '.truth', r.tweet.geom)
        test.shrink()
    return tokens
def main(self):
    u.memory_use_log()
    t_start = time.time()
    # Replaced with self.cur in __init__
    # db = db_glue.DB(self.args.database_file)
    # assert (db.metadata_get('schema_version') == '5')
    # normalize start and end times
    if (self.args.start is None):
        sql = 'SELECT min(created_at) AS st FROM {0};'.format(self.table)
        self.cur.execute(sql)
        self.args.start = self.cur.fetchone()[0]
    if (self.args.end is None):
        sql = 'SELECT max(created_at) AS et FROM {0};'.format(self.table)
        self.cur.execute(sql)
        # add one second because end time is exclusive
        self.args.end = self.cur.fetchone()[0] + timedelta(seconds=1)
    self.args.start = time_.as_utc(self.args.start)
    self.args.end = time_.as_utc(self.args.end)
    # print test sequence parameters
    self.log_parameters()
    # set up model parameters
    model_class = u.class_by_name(self.args.model)
    model_class.parms_init(self.args.model_parms, log_parms=True)
    # build schedule
    self.schedule_build(self.args.limit)
    l.info('scheduled %s tests (%s left over)'
           % (len(self.schedule), self.args.end - self.schedule[-1].end))
    if (not os.path.exists(self.args.output_dir)):
        os.mkdir(self.args.output_dir)
    l.info('results in %s' % (self.args.output_dir))
    # testing loop
    for (i, t) in enumerate(self.schedule):
        if (i + 1 < self.args.start_test):
            l.info('using saved test %d per --start-test' % (i + 1))
            l.warning('token and tweet counts will be incorrect')
            # FIXME: hack.....
            try:
                t.model = u.Deleted_To_Save_Memory()
                t.results = u.Deleted_To_Save_Memory()
                t.i = i
                t.train_tweet_ct = -1e6
                t.train_token_ct = -1e6
                t.test_tweet_ct = -1e6
                t.unshrink_from_disk(self.args.output_dir, results=True)
                t.attempted = True
            except IOError, x:
                if (x.errno != 2):
                    raise
                t.attempted = False
        else:
            l.info('starting test %d of %d: %s'
                   % (i + 1, len(self.schedule), t))
            t.do_test(model_class, self.cur, self.args, i)
            t.summarize()
        if (t.attempted):
            if (self.args.profile_memory):
                # We dump a memory profile here because it's the high water
                # mark; we're about to reduce usage significantly.
                import meliae.scanner as ms
                filename = 'memory.%d.json' % (i)
                l.info('dumping memory profile %s' % (filename))
                ms.dump_all_objects('%s/%s' % (self.args.output_dir,
                                               filename))
            t.shrink_to_disk(self.args.output_dir)
        l.debug('result: %s' % (t.summary))
        u.memory_use_log()
    # done!
    l.debug('computing summary')
    self.summarize()
    l.debug('summary: %s' % (self.summary))
    l.debug('saving TSV results')
    test_indices = u.sl_union_fromtext(len(self.schedule), ':')
    self.tsv_save_tests('%s/%s' % (self.args.output_dir, 'tests.tsv'),
                        test_indices)
    l.debug('saving pickled summary')
    self.memory_use = u.memory_use()
    self.memory_use_peak = "Not implemented"
    self.time_use = time.time() - t_start
    u.pickle_dump('%s/%s' % (self.args.output_dir, 'summary'), self)
    u.memory_use_log()
    l.info('done in %s' % (u.fmt_seconds(self.time_use)))
class Test_Sequence(object):

    def __init__(self, args):
        self.args = args

    @property
    def first_good_test(self):
        # Any attempted test will give us what we need, but an arbitrary
        # number of tests might not have been attempted.
        return next(itertools.ifilter(lambda t: t.attempted, self.schedule))

    def main(self):
        u.memory_use_log()
        t_start = time.time()
        db = db_glue.DB(self.args.database_file)
        l.info('opened database %s' % (self.args.database_file))
        assert (db.metadata_get('schema_version') == '5')
        # normalize start and end times
        if (self.args.start is None):
            sql = 'SELECT min(created_at) AS "st [timestamp]" FROM tweet'
            self.args.start = db.sql(sql)[0]['st']
        if (self.args.end is None):
            sql = 'SELECT max(created_at) AS "et [timestamp]" FROM tweet'
            # add one second because end time is exclusive
            self.args.end = db.sql(sql)[0]['et'] + timedelta(seconds=1)
        self.args.start = time_.as_utc(self.args.start)
        self.args.end = time_.as_utc(self.args.end)
        # print test sequence parameters
        self.log_parameters()
        # set up model parameters
        model_class = u.class_by_name(self.args.model)
        model_class.parms_init(self.args.model_parms, log_parms=True)
        # build schedule
        self.schedule_build(self.args.limit)
        l.info('scheduled %s tests (%s left over)'
               % (len(self.schedule), self.args.end - self.schedule[-1].end))
        if (not os.path.exists(self.args.output_dir)):
            os.mkdir(self.args.output_dir)
        l.info('results in %s' % (self.args.output_dir))
        # testing loop
        for (i, t) in enumerate(self.schedule):
            if (i + 1 < self.args.start_test):
                l.info('using saved test %d per --start-test' % (i + 1))
                l.warning('token and tweet counts will be incorrect')
                # FIXME: hack.....
                try:
                    t.model = u.Deleted_To_Save_Memory()
                    t.results = u.Deleted_To_Save_Memory()
                    t.i = i
                    t.train_tweet_ct = -1e6
                    t.train_token_ct = -1e6
                    t.test_tweet_ct = -1e6
                    t.unshrink_from_disk(self.args.output_dir, results=True)
                    t.attempted = True
                except IOError, x:
                    if (x.errno != 2):
                        raise
                    t.attempted = False
            else:
                l.info('starting test %d of %d: %s'
                       % (i + 1, len(self.schedule), t))
                t.do_test(model_class, db, self.args, i)
                t.summarize()
            if (t.attempted):
                if (self.args.profile_memory):
                    # We dump a memory profile here because it's the high
                    # water mark; we're about to reduce usage significantly.
                    import meliae.scanner as ms
                    filename = 'memory.%d.json' % (i)
                    l.info('dumping memory profile %s' % (filename))
                    ms.dump_all_objects('%s/%s' % (self.args.output_dir,
                                                   filename))
                t.shrink_to_disk(self.args.output_dir)
            l.debug('result: %s' % (t.summary))
            u.memory_use_log()
        # done!
        l.debug('computing summary')
        self.summarize()
        l.debug('summary: %s' % (self.summary))
        l.debug('saving TSV results')
        test_indices = u.sl_union_fromtext(len(self.schedule), ':')
        self.tsv_save_tests('%s/%s' % (self.args.output_dir, 'tests.tsv'),
                            test_indices)
        l.debug('saving pickled summary')
        self.memory_use = u.memory_use()
        self.memory_use_peak = u.memory_use(True)
        self.time_use = time.time() - t_start
        u.pickle_dump('%s/%s' % (self.args.output_dir, 'summary'), self)
        u.memory_use_log()
        l.info('done in %s' % (u.fmt_seconds(self.time_use)))
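# --- Hypothetical usage sketch (not part of the original module) -----------
# A minimal illustration, assuming the tsv_save_tweets()/tsv_save_tokens()
# functions above are methods of Test_Sequence and that an argparse-style
# namespace carries the attributes this class reads; every value below is a
# placeholder. It shows the intended call order: main() runs the scheduled
# tests, then tsv_save_tweets() returns the token -> test-index mapping that
# tsv_save_tokens() expects as its tw_tokens argument.

def _example_usage():
    import argparse
    args = argparse.Namespace(
        database_file='geo.db',    # placeholder path
        start=None, end=None,      # None means "infer from the database"
        model='geo.gmm.Model',     # placeholder; resolved by u.class_by_name
        model_parms={},            # placeholder model parameters
        limit=None,
        output_dir='results',
        start_test=1,
        profile_memory=False)
    seq = Test_Sequence(args)
    seq.main()
    all_tests = u.sl_union_fromtext(len(seq.schedule), ':')
    tw_tokens = seq.tsv_save_tweets('results/tweets.tsv', False, False, 0,
                                    all_tests, ':')
    seq.tsv_save_tokens('results/tokens.tsv', False, 0,
                        all_tests, ':', tw_tokens)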