Beispiel #1
0
 def map_init(self):
     """Load project totals, the low-data mask, and the target time series.

     Reads the pickle named by ``self.params['total_file']`` and stores its
     ``'projects'`` entry in ``self.totals``. The mask is taken from the
     pickle's ``'mask'`` entry when present; otherwise it is built from the
     ``'t@'`` project's series by calling ``tweet.is_enough()`` once per
     index. If the mask's sum is less than half its length, ``u.abort()``
     is called (presumably terminating the job -- confirm in ``u``).

     Finally, ``self.targets`` is filled with one dict (keys ``'name'``,
     ``'series'``, ``'mask'``) per series found in each spreadsheet listed
     in ``self.params['input_sss']``.
     """
     # NOTE(review): pickle data is only safe to load from trusted files.
     pickle_ = u.pickle_load(self.params['total_file'])
     self.totals = pickle_['projects']
     # Compute masks (find days with insufficient data)
     try:
         # use precomputed mask if available (Wikipedia)
         self.mask = pickle_['mask']
     except KeyError:
         # compute a mask for Twitter
         series = self.totals['t@']['series']
         sample_rate = self.params['tw_sample_rate']
         enough = [
             tweet.is_enough(series.date(i), series[i],
                             sample_rate=sample_rate)
             for i in xrange(len(series))
         ]
         # Builtin bool instead of the np.bool alias, which was deprecated
         # in NumPy 1.20 and removed in 1.24; the resulting dtype is the same.
         self.mask = math_.Date_Vector(series.first_day,
                                       np.array(enough, dtype=bool))
     good_ct = self.mask.sum()
     total_ct = len(self.mask)
     if good_ct < 0.5 * total_ct:
         u.abort('too many low-data days (%d of %d); check sample rate?' %
                 (good_ct, total_ct))
     # Read target time series
     self.targets = list()
     short_names = u.without_common_prefix(self.params['input_sss'])
     for (sn, ln) in zip(short_names, self.params['input_sss']):
         e = ssheet.Excel(file_=ln)
         # Target names are "<quoted short file name sans .xls>:<quoted
         # series name>".
         file_part = urllib.quote_plus(u.without_ext(sn, '.xls'))
         for (series_name, (t_series, t_mask)) in e.data.iteritems():
             self.targets.append({
                 'name': '%s:%s' % (file_part,
                                    urllib.quote_plus(series_name)),
                 'series': t_series,
                 'mask': t_mask
             })
Beispiel #2
0
 def map_init(self):
    """Set up self.totals, self.mask, and self.targets for mapping.

    self.totals is the 'projects' entry of the pickle named by
    self.params['total_file']. self.mask is that pickle's 'mask' entry if
    it has one; otherwise it is computed from the 't@' project's series
    using tweet.is_enough() for each index. u.abort() is called when the
    mask's sum is below half of its length (presumably this ends the run;
    confirm in u).

    self.targets becomes a list of dicts with keys 'name', 'series', and
    'mask', one per series in each spreadsheet named in
    self.params['input_sss']."""
    # NOTE(review): unpickling is only safe for trusted input files.
    pickle_ = u.pickle_load(self.params['total_file'])
    self.totals = pickle_['projects']
    # Compute masks (find days with insufficient data)
    try:
       # use precomputed mask if available (Wikipedia)
       self.mask = pickle_['mask']
    except KeyError:
       # compute a mask for Twitter
       tw_series = self.totals['t@']['series']
       rate = self.params['tw_sample_rate']
       flags = [tweet.is_enough(tw_series.date(i), tw_series[i],
                                sample_rate=rate)
                for i in xrange(len(tw_series))]
       # np.bool was only an alias for the builtin bool; it was deprecated
       # in NumPy 1.20 and removed in 1.24, so use bool directly.
       self.mask = math_.Date_Vector(tw_series.first_day,
                                     np.array(flags, dtype=bool))
    ok_days = self.mask.sum()
    all_days = len(self.mask)
    if (ok_days < 0.5 * all_days):
       u.abort('too many low-data days (%d of %d); check sample rate?'
               % (ok_days, all_days))
    # Read target time series
    self.targets = list()
    short_names = u.without_common_prefix(self.params['input_sss'])
    for (sn, ln) in zip(short_names, self.params['input_sss']):
       e = ssheet.Excel(file_=ln)
       # Quoted short file name (without .xls) prefixes every target name.
       prefix = urllib.quote_plus(u.without_ext(sn, '.xls'))
       for (sheet_name, (target_series, target_mask)) in e.data.iteritems():
          full_name = '%s:%s' % (prefix, urllib.quote_plus(sheet_name))
          self.targets.append({ 'name':   full_name,
                                'series': target_series,
                                'mask':   target_mask })
Beispiel #3
0
 def summarize(self):
    """Combine the per-test summaries in self.schedule into self.summary.

    self.summary is the sum of all test summaries divided by the total
    attempted count, with test_ct and attempted_ct restored to their
    summed values. For each metric, an 's'-prefixed attribute holds the
    np.std() of that metric over tests whose attempted_ct is positive.
    Calls u.abort() if no tests were attempted."""
    per_test = [entry.summary for entry in self.schedule]
    total = sum(per_test)
    if (total.attempted_ct <= 0):
       u.abort('%d tests were attempted' % (total.attempted_ct))
    # In this case it is mostly OK that we are simply averaging the test
    # results, since counts should be averaged across the tests.
    mean = total / total.attempted_ct
    mean.test_ct = total.test_ct
    mean.attempted_ct = total.attempted_ct
    self.summary = mean
    # standard deviations (FIXME: awkward)
    attempted = [s for s in per_test if s.attempted_ct > 0]
    metrics = ('success_ct', 'mcae', 'msae', 'mpra95', 'mpra90', 'mpra50',
               'mcontour', 'mcovt95', 'mcovt90', 'mcovt50')
    for metric in metrics:
       spread = np.std([getattr(s, metric) for s in attempted])
       setattr(self.summary, 's' + metric, spread)
Beispiel #4
0
def setup(args):
    """Validate the job arguments, then generate the job's support files.

    Calls u.abort() unless either both args.map and args.reduce are given
    or args.python is given, and again if all three are given. Then runs
    directories_setup(), pythonify() (only when args.python is set),
    makefile_dump(), and slurm_dump(), in that order.
    """
    # These tests are here, rather than in argument parsing, so scripts can
    # insert mappers and reducers not specified on the command line.
    has_map_reduce = args.map and args.reduce
    if not has_map_reduce and not args.python:
        u.abort('must specify both mapper and reducer')
    if has_map_reduce and args.python:
        u.abort('cannot specify all of --python, --map, --reduce')

    # Bad script might screw this up. It's a programming problem, not a user
    # problem, so assert instead of erroring.
    assert len(args.inputs) > 0

    directories_setup(args)
    if args.python:
        pythonify(args)
    makefile_dump(args)
    slurm_dump(args)
Beispiel #5
0
def setup(args):
   """Check mapper/reducer/python arguments and write out the job files.

   u.abort() is called when neither (args.map and args.reduce) nor
   args.python is set, and when all three are set. Afterwards the
   directories are set up, the Python wrapper step runs if args.python is
   set, and the makefile and SLURM files are dumped."""
   # These tests are here, rather than in argument parsing, so scripts can
   # insert mappers and reducers not specified on the command line.
   mapper_and_reducer = (args.map and args.reduce)
   if (mapper_and_reducer or args.python):
      pass
   else:
      u.abort('must specify both mapper and reducer')
   if (mapper_and_reducer and args.python):
      u.abort('cannot specify all of --python, --map, --reduce')

   # Bad script might screw this up. It's a programming problem, not a user
   # problem, so assert instead of erroring.
   input_ct = len(args.inputs)
   assert (input_ct > 0)

   directories_setup(args)
   if (args.python):
      pythonify(args)
   makefile_dump(args)
   slurm_dump(args)
Beispiel #6
0
 def summarize(self):
     """Aggregate the schedule's test summaries into ``self.summary``.

     The summaries of all tests in ``self.schedule`` are summed and divided
     by the summed ``attempted_ct``; ``test_ct`` and ``attempted_ct`` on the
     result are then reset to the summed (undivided) values. For every
     metric name ``m`` in the list below, ``self.summary.s<m>`` is set to
     ``np.std`` of ``m`` across tests with ``attempted_ct > 0``.

     Calls ``u.abort()`` when the summed ``attempted_ct`` is not positive.
     """
     summaries = []
     for test in self.schedule:
         summaries.append(test.summary)
     grand_total = sum(summaries)
     if grand_total.attempted_ct <= 0:
         u.abort('%d tests were attempted' % (grand_total.attempted_ct))
     # In this case it is mostly OK that we are simply averaging the test
     # results, since counts should be averaged across the tests.
     self.summary = grand_total / grand_total.attempted_ct
     self.summary.test_ct = grand_total.test_ct
     self.summary.attempted_ct = grand_total.attempted_ct
     # standard deviations (FIXME: awkward)
     for attr in ('success_ct', 'mcae', 'msae', 'mpra95', 'mpra90',
                  'mpra50', 'mcontour', 'mcovt95', 'mcovt90', 'mcovt50'):
         values = []
         for one in summaries:
             if one.attempted_ct > 0:
                 values.append(getattr(one, attr))
         setattr(self.summary, 's' + attr, np.std(values))