Example #1
 def map_init(self):
     pickle_ = u.pickle_load(self.params['total_file'])
     self.totals = pickle_['projects']
     # Compute masks (find days with insufficient data)
     try:
         # use precomputed mask if available (Wikipedia)
         self.mask = pickle_['mask']
     except KeyError:
         # compute a mask for Twitter
         pdata = self.totals['t@']
         mask = [
             tweet.is_enough(pdata['series'].date(i),
                             pdata['series'][i],
                             sample_rate=self.params['tw_sample_rate'])
             for i in xrange(len(pdata['series']))
         ]
         self.mask = math_.Date_Vector(pdata['series'].first_day,
                                       np.array(mask, dtype=np.bool))
     if (self.mask.sum() < 0.5 * len(self.mask)):
         u.abort('too many low-data days (%d of %d); check sample rate?' %
                 (self.mask.sum(), len(self.mask)))
     # Read target time series
     self.targets = list()
     short_names = u.without_common_prefix(self.params['input_sss'])
     for (sn, ln) in zip(short_names, self.params['input_sss']):
         e = ssheet.Excel(file_=ln)
         for (name, (series, mask)) in e.data.iteritems():
             name = '%s:%s' % (urllib.quote_plus(u.without_ext(
                 sn, '.xls')), urllib.quote_plus(name))
             self.targets.append({
                 'name': name,
                 'series': series,
                 'mask': mask
             })
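A minimal sketch of the masking step above, in plain Python 3 with pickle/numpy stand-ins; the helpers u.pickle_load, tweet.is_enough, and math_.Date_Vector are project-specific, so is_enough and load_mask below are hypothetical simplifications rather than the project's API.

import pickle
import numpy as np

def is_enough(count, sample_rate, threshold=1000):
    # Hypothetical stand-in: a day has "enough" data if its scaled count
    # clears a fixed threshold.
    return count / sample_rate >= threshold

def load_mask(total_file, sample_rate):
    with open(total_file, 'rb') as fp:
        pickle_ = pickle.load(fp)
    try:
        # Use the precomputed mask if the pickle carries one.
        mask = pickle_['mask']
    except KeyError:
        # Otherwise derive one from the per-day totals series.
        series = pickle_['projects']['t@']['series']
        mask = np.array([is_enough(c, sample_rate) for c in series], dtype=bool)
    if mask.sum() < 0.5 * len(mask):
        # Abort (here: exit) if more than half the days are low-data.
        raise SystemExit('too many low-data days (%d of %d)'
                         % (mask.sum(), len(mask)))
    return mask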
Example #2
 def tsv_save_tokens(self, filename, geofiles_p, geoimage_width,
                     test_indices, token_idx, tw_tokens):
    l.info('writing token summaries to %s' % (filename))
    if (not geofiles_p):
       tsv = self.tsv_open(filename)
       self.first_good_test.unshrink_from_disk(self.args.output_dir,
                                               model=True)
       tsv.writerow(['test_idx', 'token_idx']
                 + list(self.first_good_test.model.token_summary_keys))
    for i in sorted(test_indices):
       test = self.schedule[i]
       if (not test.attempted):
          continue
       test.unshrink_from_disk(self.args.output_dir, model=True)
       tokenrows = [test.model.token_summary(token)
                    for token in test.model.tokens]
       tokenrows.sort(key=operator.itemgetter('point_ct'), reverse=True)
       token_indices = u.sl_union_fromtext(len(tokenrows), token_idx)
       for j in xrange(len(tokenrows)):
          tokenrow = tokenrows[j]
          if (not (j in token_indices
                   or i in tw_tokens.get(tokenrow['token'], set()))):
             continue
          if (not geofiles_p):
             tsv.writerow([i, j] + tokenrow.values())
          else:
             assert (geoimage_width > 0)
             gi_basename = u'%s.%d.%d' % (u.without_ext(filename), i, j)
             l.debug('writing geofiles %s' % (gi_basename))
             test.model.dump_geofiles(gi_basename, geoimage_width,
                                      tokenrow['token'])
       test.shrink()
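A minimal sketch of the per-test token dump above, with csv.writer standing in for self.tsv_open() and plain dicts for the model's token summaries; save_tokens and wanted_indices are hypothetical names, and the column set is reduced to two fields for illustration.

import csv

def save_tokens(filename, tests, wanted_indices):
    # tests: one list of token-summary dicts per test index.
    with open(filename, 'w', newline='') as fp:
        tsv = csv.writer(fp, delimiter='\t')
        tsv.writerow(['test_idx', 'token_idx', 'token', 'point_ct'])
        for i, rows in enumerate(tests):
            # Most-frequent tokens first, as in the example above.
            rows.sort(key=lambda r: r['point_ct'], reverse=True)
            for j, row in enumerate(rows):
                if j not in wanted_indices:
                    continue
                tsv.writerow([i, j, row['token'], row['point_ct']])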
Example #3
 def map_init(self):
    pickle_ = u.pickle_load(self.params['total_file'])
    self.totals = pickle_['projects']
    # Compute masks (find days with insufficient data)
    try:
       # use precomputed mask if available (Wikipedia)
       self.mask = pickle_['mask']
    except KeyError:
       # compute a mask for Twitter
       pdata = self.totals['t@']
       mask = [tweet.is_enough(pdata['series'].date(i),
                               pdata['series'][i],
                               sample_rate=self.params['tw_sample_rate'])
               for i in xrange(len(pdata['series']))]
       self.mask = math_.Date_Vector(pdata['series'].first_day,
                                     np.array(mask, dtype=np.bool))
    if (self.mask.sum() < 0.5 * len(self.mask)):
       u.abort('too many low-data days (%d of %d); check sample rate?'
               % (self.mask.sum(), len(self.mask)))
    # Read target time series
    self.targets = list()
    short_names = u.without_common_prefix(self.params['input_sss'])
    for (sn, ln) in zip(short_names, self.params['input_sss']):
       e = ssheet.Excel(file_=ln)
       for (name, (series, mask)) in e.data.iteritems():
          name = '%s:%s' % (urllib.quote_plus(u.without_ext(sn, '.xls')),
                            urllib.quote_plus(name))
          self.targets.append({ 'name':   name,
                                'series': series,
                                'mask':   mask })
Example #4
 def tsv_save_tokens(self, filename, geofiles_p, geoimage_width,
                     test_indices, token_idx, tw_tokens):
     l.info('writing token summaries to %s' % (filename))
     if (not geofiles_p):
         tsv = self.tsv_open(filename)
         self.first_good_test.unshrink_from_disk(self.args.output_dir,
                                                 model=True)
         tsv.writerow(['test_idx', 'token_idx'] +
                      list(self.first_good_test.model.token_summary_keys))
     for i in sorted(test_indices):
         test = self.schedule[i]
         if (not test.attempted):
             continue
         test.unshrink_from_disk(self.args.output_dir, model=True)
         tokenrows = [
             test.model.token_summary(token) for token in test.model.tokens
         ]
         tokenrows.sort(key=operator.itemgetter('point_ct'), reverse=True)
         token_indices = u.sl_union_fromtext(len(tokenrows), token_idx)
         for j in xrange(len(tokenrows)):
             tokenrow = tokenrows[j]
             if (not (j in token_indices
                      or i in tw_tokens.get(tokenrow['token'], set()))):
                 continue
             if (not geofiles_p):
                 tsv.writerow([i, j] + tokenrow.values())
             else:
                 assert (geoimage_width > 0)
                 gi_basename = u'%s.%d.%d' % (u.without_ext(filename), i, j)
                 l.debug('writing geofiles %s' % (gi_basename))
                 test.model.dump_geofiles(gi_basename, geoimage_width,
                                          tokenrow['token'])
         test.shrink()
Example #5
 def tsv_save_tweets(self, filename, include_fails_p, geofiles_p,
                     geoimage_width, test_indices, tweet_idx):
    '''Return value is a mapping from tokens involved in printed tweets to
       a set of test indices in which they appeared.'''
    tokens = defaultdict(set)
    l.info('writing tweet summaries%s to %s'
           % (' and geoimages' if geofiles_p else '', filename))
    if (not geofiles_p):
       tsv = self.tsv_open(filename)
       self.first_good_test.unshrink_from_disk(self.args.output_dir,
                                               results=True)
       tsv.writerow(['test_idx', 'tweet_idx']
                    + self.first_good_test.results[0].summary_dict.keys())
    for i in sorted(test_indices):
       test = self.schedule[i]
       if (not test.attempted):
          continue
       test.unshrink_from_disk(self.args.output_dir, results=True)
       tweetrows = test.results[:]
       tweetrows.sort(key=operator.attrgetter('cae'))
       for j in sorted(u.sl_union_fromtext(len(tweetrows), tweet_idx)):
          r = tweetrows[j]
          if (not r.success_ct and not include_fails_p):
             continue
          for token in r.location_estimate.explanation.iterkeys():
             tokens[token].add(i)
          if (not geofiles_p):
             tsv.writerow([i, j] + r.summary_dict.values())
          else:
             assert (geoimage_width > 0)
             gi_basename = u'%s.%d.%d' % (u.without_ext(filename), i, j)
             l.debug('writing geofiles %s' % (gi_basename))
             # FIXME: ugly hack to get PR_90 instead of PR_95
             import geo.gmm
             geo.gmm.Token.parms_init({})
             r.location_estimate.dump_geofiles(gi_basename,
                                               geoimage_width, 0.90)
             srs.dump_geojson(gi_basename + '.truth', r.tweet.geom)
       test.shrink()
    return tokens
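A minimal sketch of the bookkeeping behind the return value above: while walking the per-test result rows, record which tests each explanation token appeared in. The rows here are plain dicts with hypothetical keys (cae, success_ct, explanation); the real code uses project-specific result objects and also writes TSV rows or geofiles along the way.

from collections import defaultdict

def collect_tokens(rows_by_test, include_fails=False):
    # Maps token -> set of test indices in which it appeared.
    tokens = defaultdict(set)
    for i, rows in enumerate(rows_by_test):
        rows.sort(key=lambda r: r['cae'])   # best (lowest error) first
        for r in rows:
            if not r['success_ct'] and not include_fails:
                continue
            for token in r['explanation']:
                tokens[token].add(i)
    return tokens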
Example #6
 def tsv_save_tweets(self, filename, include_fails_p, geofiles_p,
                     geoimage_width, test_indices, tweet_idx):
     '''Return value is a mapping from tokens involved in printed tweets to
      a set of test indices in which they appeared.'''
     tokens = defaultdict(set)
     l.info('writing tweet summaries%s to %s' %
            (' and geoimages' if geofiles_p else '', filename))
     if (not geofiles_p):
         tsv = self.tsv_open(filename)
         self.first_good_test.unshrink_from_disk(self.args.output_dir,
                                                 results=True)
         tsv.writerow(['test_idx', 'tweet_idx'] +
                      self.first_good_test.results[0].summary_dict.keys())
     for i in sorted(test_indices):
         test = self.schedule[i]
         if (not test.attempted):
             continue
         test.unshrink_from_disk(self.args.output_dir, results=True)
         tweetrows = test.results[:]
         tweetrows.sort(key=operator.attrgetter('cae'))
         for j in sorted(u.sl_union_fromtext(len(tweetrows), tweet_idx)):
             r = tweetrows[j]
             if (not r.success_ct and not include_fails_p):
                 continue
             for token in r.location_estimate.explanation.iterkeys():
                 tokens[token].add(i)
             if (not geofiles_p):
                 tsv.writerow([i, j] + r.summary_dict.values())
             else:
                 assert (geoimage_width > 0)
                 gi_basename = u'%s.%d.%d' % (u.without_ext(filename), i, j)
                 l.debug('writing geofiles %s' % (gi_basename))
                 # FIXME: ugly hack to get PR_90 instead of PR_95
                 import geo.gmm
                 geo.gmm.Token.parms_init({})
                 r.location_estimate.dump_geofiles(gi_basename,
                                                   geoimage_width, 0.90)
                 srs.dump_geojson(gi_basename + '.truth', r.tweet.geom)
         test.shrink()
     return tokens