def map_init(self):
    pickle_ = u.pickle_load(self.params['total_file'])
    self.totals = pickle_['projects']
    # Compute masks (find days with insufficient data)
    try:
        # use precomputed mask if available (Wikipedia)
        self.mask = pickle_['mask']
    except KeyError:
        # compute a mask for Twitter
        pdata = self.totals['t@']
        mask = [tweet.is_enough(pdata['series'].date(i), pdata['series'][i],
                                sample_rate=self.params['tw_sample_rate'])
                for i in xrange(len(pdata['series']))]
        self.mask = math_.Date_Vector(pdata['series'].first_day,
                                      np.array(mask, dtype=np.bool))
    if (self.mask.sum() < 0.5 * len(self.mask)):
        u.abort('too many low-data days (%d of %d); check sample rate?'
                % (self.mask.sum(), len(self.mask)))
    # Read target time series
    self.targets = list()
    short_names = u.without_common_prefix(self.params['input_sss'])
    for (sn, ln) in zip(short_names, self.params['input_sss']):
        e = ssheet.Excel(file_=ln)
        for (name, (series, mask)) in e.data.iteritems():
            name = '%s:%s' % (urllib.quote_plus(u.without_ext(sn, '.xls')),
                              urllib.quote_plus(name))
            self.targets.append({'name': name,
                                 'series': series,
                                 'mask': mask})
def tsv_save_tokens(self, filename, geofiles_p, geoimage_width,
                    test_indices, token_idx, tw_tokens):
    l.info('writing token summaries to %s' % (filename))
    if (not geofiles_p):
        tsv = self.tsv_open(filename)
        self.first_good_test.unshrink_from_disk(self.args.output_dir,
                                                model=True)
        tsv.writerow(['test_idx', 'token_idx']
                     + list(self.first_good_test.model.token_summary_keys))
    for i in sorted(test_indices):
        test = self.schedule[i]
        if (not test.attempted):
            continue
        test.unshrink_from_disk(self.args.output_dir, model=True)
        # summarize each token in the model, most points first
        tokenrows = [test.model.token_summary(token)
                     for token in test.model.tokens]
        tokenrows.sort(key=operator.itemgetter('point_ct'), reverse=True)
        token_indices = u.sl_union_fromtext(len(tokenrows), token_idx)
        for j in xrange(len(tokenrows)):
            tokenrow = tokenrows[j]
            # keep tokens selected by index or appearing in printed tweets
            if (not (j in token_indices
                     or i in tw_tokens.get(tokenrow['token'], set()))):
                continue
            if (not geofiles_p):
                tsv.writerow([i, j] + tokenrow.values())
            else:
                assert (geoimage_width > 0)
                gi_basename = u'%s.%d.%d' % (u.without_ext(filename), i, j)
                l.debug('writing geofiles %s' % (gi_basename))
                test.model.dump_geofiles(gi_basename, geoimage_width,
                                         tokenrow['token'])
        test.shrink()
def tsv_save_tweets(self, filename, include_fails_p, geofiles_p,
                    geoimage_width, test_indices, tweet_idx):
    '''Return value is a mapping from tokens involved in printed tweets to a
       set of test indices in which they appeared.'''
    tokens = defaultdict(set)
    l.info('writing tweet summaries%s to %s'
           % (' and geoimages' if geofiles_p else '', filename))
    if (not geofiles_p):
        tsv = self.tsv_open(filename)
        self.first_good_test.unshrink_from_disk(self.args.output_dir,
                                                results=True)
        tsv.writerow(['test_idx', 'tweet_idx']
                     + self.first_good_test.results[0].summary_dict.keys())
    for i in sorted(test_indices):
        test = self.schedule[i]
        if (not test.attempted):
            continue
        test.unshrink_from_disk(self.args.output_dir, results=True)
        tweetrows = test.results[:]
        tweetrows.sort(key=operator.attrgetter('cae'))
        for j in sorted(u.sl_union_fromtext(len(tweetrows), tweet_idx)):
            r = tweetrows[j]
            if (not r.success_ct and not include_fails_p):
                continue
            for token in r.location_estimate.explanation.iterkeys():
                tokens[token].add(i)
            if (not geofiles_p):
                tsv.writerow([i, j] + r.summary_dict.values())
            else:
                assert (geoimage_width > 0)
                gi_basename = u'%s.%d.%d' % (u.without_ext(filename), i, j)
                l.debug('writing geofiles %s' % (gi_basename))
                # FIXME: ugly hack to get PR_90 instead of PR_95
                import geo.gmm
                geo.gmm.Token.parms_init({})
                r.location_estimate.dump_geofiles(gi_basename,
                                                  geoimage_width, 0.90)
                srs.dump_geojson(gi_basename + '.truth', r.tweet.geom)
        test.shrink()
    return tokens