def map_init(self):
    """Load totals, establish the usable-day mask, and read target series.

    Reads the pickle named by self.params['total_file'] and sets:

      self.totals   the pickle's 'projects' entry
      self.mask     the pickle's 'mask' entry if it has one; otherwise a
                    math_.Date_Vector built by calling tweet.is_enough()
                    on every day of the 't@' series in self.totals
      self.targets  list of {'name', 'series', 'mask'} dicts, one per
                    series found in each spreadsheet listed in
                    self.params['input_sss']

    Calls u.abort() when fewer than half of the mask's entries are set.
    """
    pickle_ = u.pickle_load(self.params['total_file'])
    self.totals = pickle_['projects']

    # Compute masks (find days with insufficient data)
    try:
        # use precomputed mask if available (Wikipedia)
        self.mask = pickle_['mask']
    except KeyError:
        # compute a mask for Twitter
        tw_series = self.totals['t@']['series']
        enough = [tweet.is_enough(tw_series.date(i), tw_series[i],
                                  sample_rate=self.params['tw_sample_rate'])
                  for i in xrange(len(tw_series))]
        # The builtin bool is what np.bool aliased; spelling it this way
        # also works on NumPy releases that dropped the alias.
        self.mask = math_.Date_Vector(tw_series.first_day,
                                      np.array(enough, dtype=bool))

    usable_ct = self.mask.sum()
    day_ct = len(self.mask)
    if usable_ct < 0.5 * day_ct:
        # The test above treats set entries as usable days, so the number
        # of low-data days to report is the complement of the sum.
        u.abort('too many low-data days (%d of %d); check sample rate?'
                % (day_ct - usable_ct, day_ct))

    # Read target time series
    self.targets = list()
    short_names = u.without_common_prefix(self.params['input_sss'])
    for (sn, ln) in zip(short_names, self.params['input_sss']):
        e = ssheet.Excel(file_=ln)
        # The quoted file part of the name is the same for every series in
        # this spreadsheet, so build it once.
        file_part = urllib.quote_plus(u.without_ext(sn, '.xls'))
        for (name, (series, series_mask)) in e.data.iteritems():
            self.targets.append({
                'name': '%s:%s' % (file_part, urllib.quote_plus(name)),
                'series': series,
                'mask': series_mask,
            })
def map_init(self):
    """Prepare per-mapper state: totals, day mask, and target series.

    Side effects (all on self):

      totals   set to the 'projects' entry of the pickle loaded from
               self.params['total_file']
      mask     taken from the pickle's 'mask' entry when present; when the
               pickle has no such key, computed from the 't@' series in
               totals using tweet.is_enough() with
               self.params['tw_sample_rate']
      targets  one {'name', 'series', 'mask'} dict for every series in
               every spreadsheet named in self.params['input_sss']; each
               name is '<quoted short file name sans .xls>:<quoted series
               name>'

    u.abort() is called if the mask has fewer set entries than half its
    length.
    """
    loaded = u.pickle_load(self.params['total_file'])
    self.totals = loaded['projects']

    # Compute masks (find days with insufficient data)
    try:
        # use precomputed mask if available (Wikipedia)
        self.mask = loaded['mask']
    except KeyError:
        # compute a mask for Twitter
        pdata = self.totals['t@']
        day_ok = []
        for i in xrange(len(pdata['series'])):
            day_ok.append(
                tweet.is_enough(pdata['series'].date(i),
                                pdata['series'][i],
                                sample_rate=self.params['tw_sample_rate']))
        # dtype=bool rather than np.bool: np.bool was only an alias for the
        # builtin, and the alias is absent from some NumPy releases.
        self.mask = math_.Date_Vector(pdata['series'].first_day,
                                      np.array(day_ok, dtype=bool))

    total_days = len(self.mask)
    ok_days = self.mask.sum()
    if ok_days < 0.5 * total_days:
        # Report how many days are *missing* from the mask; the raw sum is
        # the count of days that passed, which contradicts the message.
        low_days = total_days - ok_days
        u.abort('too many low-data days (%d of %d); check sample rate?'
                % (low_days, total_days))

    # Read target time series
    self.targets = list()
    paths = self.params['input_sss']
    for (short, path) in zip(u.without_common_prefix(paths), paths):
        workbook = ssheet.Excel(file_=path)
        # Loop-invariant: the file prefix does not depend on the series.
        prefix = urllib.quote_plus(u.without_ext(short, '.xls'))
        for (label, (series, series_mask)) in workbook.data.iteritems():
            target = {
                'name': '%s:%s' % (prefix, urllib.quote_plus(label)),
                'series': series,
                'mask': series_mask,
            }
            self.targets.append(target)
def summarize(self):
    """Aggregate the per-test summaries in self.schedule into self.summary.

    self.summary is the sum of every test's summary divided by the total
    attempted_ct, with test_ct and attempted_ct then restored to their
    un-averaged totals. For each metric X in the tuple below, an attribute
    named 's' + X is set to np.std of X over the tests whose attempted_ct
    is positive.

    Calls u.abort() if the total attempted_ct is not positive, which now
    includes the case of an empty schedule.
    """
    summaries = [t.summary for t in self.schedule]
    sum_ = sum(summaries)
    # sum() of an empty list is the int 0, which has no attempted_ct;
    # count an empty schedule as zero attempts so it reaches the abort
    # below instead of raising AttributeError.
    attempted_ct = sum_.attempted_ct if summaries else 0
    if attempted_ct <= 0:
        u.abort('%d tests were attempted' % (attempted_ct))
    # In this case it is mostly OK that we are simply averaging the test
    # results, since counts should be averaged across the tests.
    self.summary = sum_ / sum_.attempted_ct
    self.summary.test_ct = sum_.test_ct
    self.summary.attempted_ct = sum_.attempted_ct
    # standard deviations (FIXME: awkward)
    # Only tests that attempted something contribute; the filter does not
    # depend on the attribute, so apply it once.
    attempted = [s for s in summaries if s.attempted_ct > 0]
    for attr in ('success_ct', 'mcae', 'msae', 'mpra95', 'mpra90',
                 'mpra50', 'mcontour', 'mcovt95', 'mcovt90', 'mcovt50'):
        setattr(self.summary, 's' + attr,
                np.std([getattr(s, attr) for s in attempted]))
def setup(args):
    """Check the mapper/reducer options, then generate the job's files.

    Aborts via u.abort() unless either both args.map and args.reduce are
    given or args.python is given, and also when all three are given.
    Afterwards calls directories_setup(), pythonify() (only when
    args.python is set), makefile_dump() and slurm_dump(), in that order,
    each with args.
    """
    # These tests are here, rather than in argument parsing, so scripts can
    # insert mappers and reducers not specified on the command line.
    map_and_reduce = args.map and args.reduce
    if not map_and_reduce and not args.python:
        u.abort('must specify both mapper and reducer')
    if map_and_reduce and args.python:
        u.abort('cannot specify all of --python, --map, --reduce')

    # Bad script might screw this up. It's a programming problem, not a user
    # problem, so assert instead of erroring.
    assert len(args.inputs) > 0

    directories_setup(args)
    if args.python:
        pythonify(args)
    makefile_dump(args)
    slurm_dump(args)
def summarize(self):
    """Combine every scheduled test's summary into self.summary.

    The combined summary is sum(summaries) divided by the summed
    attempted_ct; its test_ct and attempted_ct are then overwritten with
    the plain sums. A standard deviation attribute ('s' prefixed to the
    metric name) is added for each metric listed below, computed with
    np.std over the tests that have attempted_ct > 0.

    If the summed attempted_ct is zero or negative, or the schedule is
    empty, u.abort() is called.
    """
    per_test = [t.summary for t in self.schedule]
    if per_test:
        total = sum(per_test)
        total_attempted = total.attempted_ct
    else:
        # With nothing to add, sum() yields the int 0, which carries no
        # attempted_ct; report zero attempts rather than crashing on the
        # attribute lookup.
        total = 0
        total_attempted = 0
    if total_attempted <= 0:
        u.abort('%d tests were attempted' % (total_attempted))

    # In this case it is mostly OK that we are simply averaging the test
    # results, since counts should be averaged across the tests.
    self.summary = total / total.attempted_ct
    self.summary.test_ct = total.test_ct
    self.summary.attempted_ct = total.attempted_ct

    # standard deviations (FIXME: awkward)
    std_attrs = ('success_ct', 'mcae', 'msae', 'mpra95', 'mpra90',
                 'mpra50', 'mcontour', 'mcovt95', 'mcovt90', 'mcovt50')
    # Tests that attempted nothing are left out of the spread; pick the
    # contributing tests once rather than once per metric.
    contributing = [s for s in per_test if s.attempted_ct > 0]
    for attr in std_attrs:
        values = [getattr(s, attr) for s in contributing]
        setattr(self.summary, 's' + attr, np.std(values))