def map_init(self):
    """Load totals, the day mask, and the target time series.

    Reads the pickle named by ``self.params['total_file']`` and sets:

    * ``self.totals``: the pickle's ``'projects'`` entry.
    * ``self.mask``: the pickle's ``'mask'`` entry if it has one; otherwise
      a ``math_.Date_Vector`` of booleans built by calling
      ``tweet.is_enough()`` on every day of the ``'t@'`` totals series.
    * ``self.targets``: a list of ``{'name', 'series', 'mask'}`` dicts, one
      per entry in the ``data`` of each spreadsheet listed in
      ``self.params['input_sss']``. Names have the form
      ``<quoted short file name>:<quoted series name>``.

    Calls ``u.abort()`` when fewer than half of the mask's days are set."""
    pickle_ = u.pickle_load(self.params['total_file'])
    self.totals = pickle_['projects']

    # Compute masks (find days with insufficient data)
    try:
        # use precomputed mask if available (Wikipedia)
        self.mask = pickle_['mask']
    except KeyError:
        # compute a mask for Twitter
        tw_series = self.totals['t@']['series']
        sample_rate = self.params['tw_sample_rate']
        enough = [tweet.is_enough(tw_series.date(i), tw_series[i],
                                  sample_rate=sample_rate)
                  for i in xrange(len(tw_series))]
        # Builtin bool rather than the np.bool alias, which newer NumPy
        # releases no longer provide; the resulting dtype is the same.
        self.mask = math_.Date_Vector(tw_series.first_day,
                                      np.array(enough, dtype=bool))

    # NOTE(review): this sanity check is applied to whichever mask was
    # chosen above, precomputed or not -- confirm that is the intent.
    good_days = self.mask.sum()
    total_days = len(self.mask)
    if good_days < 0.5 * total_days:
        # Report the number of *low-data* days, i.e. the unset mask entries,
        # so the count matches the wording of the message.
        u.abort('too many low-data days (%d of %d); check sample rate?'
                % (total_days - good_days, total_days))

    # Read target time series
    self.targets = list()
    short_names = u.without_common_prefix(self.params['input_sss'])
    for (sn, ln) in zip(short_names, self.params['input_sss']):
        e = ssheet.Excel(file_=ln)
        # The file part of the name is the same for every series in a sheet.
        file_part = urllib.quote_plus(u.without_ext(sn, '.xls'))
        for (name, (series, mask)) in e.data.iteritems():
            name = '%s:%s' % (file_part, urllib.quote_plus(name))
            self.targets.append({'name': name,
                                 'series': series,
                                 'mask': mask})
def map_init(self):
    """Load totals, the day mask, and the target time series.

    Reads the pickle named by ``self.params['total_file']`` and sets:

    * ``self.totals``: the pickle's ``'projects'`` entry.
    * ``self.mask``: the pickle's ``'mask'`` entry if it has one; otherwise
      a ``math_.Date_Vector`` of booleans built by calling
      ``tweet.is_enough()`` on every day of the ``'t@'`` totals series.
    * ``self.targets``: a list of ``{'name', 'series', 'mask'}`` dicts, one
      per entry in the ``data`` of each spreadsheet listed in
      ``self.params['input_sss']``. Names have the form
      ``<quoted short file name>:<quoted series name>``.

    Calls ``u.abort()`` when fewer than half of the mask's days are set."""
    pickle_ = u.pickle_load(self.params['total_file'])
    self.totals = pickle_['projects']

    # Compute masks (find days with insufficient data)
    try:
        # use precomputed mask if available (Wikipedia)
        self.mask = pickle_['mask']
    except KeyError:
        # compute a mask for Twitter
        tw_series = self.totals['t@']['series']
        sample_rate = self.params['tw_sample_rate']
        enough = [tweet.is_enough(tw_series.date(i), tw_series[i],
                                  sample_rate=sample_rate)
                  for i in xrange(len(tw_series))]
        # Builtin bool rather than the np.bool alias, which newer NumPy
        # releases no longer provide; the resulting dtype is the same.
        self.mask = math_.Date_Vector(tw_series.first_day,
                                      np.array(enough, dtype=bool))

    # NOTE(review): this sanity check is applied to whichever mask was
    # chosen above, precomputed or not -- confirm that is the intent.
    good_days = self.mask.sum()
    total_days = len(self.mask)
    if good_days < 0.5 * total_days:
        # Report the number of *low-data* days, i.e. the unset mask entries,
        # so the count matches the wording of the message.
        u.abort('too many low-data days (%d of %d); check sample rate?'
                % (total_days - good_days, total_days))

    # Read target time series
    self.targets = list()
    short_names = u.without_common_prefix(self.params['input_sss'])
    for (sn, ln) in zip(short_names, self.params['input_sss']):
        e = ssheet.Excel(file_=ln)
        # The file part of the name is the same for every series in a sheet.
        file_part = urllib.quote_plus(u.without_ext(sn, '.xls'))
        for (name, (series, mask)) in e.data.iteritems():
            name = '%s:%s' % (file_part, urllib.quote_plus(name))
            self.targets.append({'name': name,
                                 'series': series,
                                 'mask': mask})
def graph_load():
    """Load the pickled wiki graph and canonicalize its inner URL keys.

    The graph is read from ``wiki-graph.pkl.gz`` in the ``articles``
    directory under ``args.in_``. For every root, the inner mapping is
    rebuilt so that each URL key is passed through
    ``timeseries.name_url_canonicalize()``; the associated values are kept
    as they are. Returns the updated graph."""
    graph = u.pickle_load(args.in_ + '/articles/wiki-graph.pkl.gz')
    for root in graph.keys():
        canonical = dict()
        for (url, dist) in graph[root].items():
            # URLs that canonicalize to the same key: the last one wins.
            canonical[timeseries.name_url_canonicalize(url)] = dist
        graph[root] = canonical
    return graph
def unshrink_from_disk(self, dir_, model=False, results=False):
    """Reload the model and/or results from pickles in ``dir_``.

    At least one of ``model`` and ``results`` must be true. A requested
    attribute is reloaded only if it currently holds a
    ``u.Deleted_To_Save_Memory`` instance; the pickle is read from
    ``<dir_>/model.<self.i>`` or ``<dir_>/results.<self.i>`` respectively."""
    assert (model or results)
    if model:
        if isinstance(self.model, u.Deleted_To_Save_Memory):
            model_path = '%s/model.%d' % (dir_, self.i)
            self.model = u.pickle_load(model_path)
    if results:
        if isinstance(self.results, u.Deleted_To_Save_Memory):
            results_path = '%s/results.%d' % (dir_, self.i)
            self.results = u.pickle_load(results_path)