def filter(self):
    starttime = time.time()
    parser = soomparse.SoomFilterParse(self.parent_dataset, self.expr)
    record_ids = parser.filter()
    # Empty filter?
    if record_ids is None or len(record_ids) == 0:
        record_ids = []
    self.record_ids = Numeric.array(record_ids, typecode=Numeric.Int)
    del record_ids
    self.generation = self.parent_dataset.generation
    self.length = len(self.record_ids)
    filename = self._blobstore_filename(mkdirs=True)
    if self.backed and filename:
        # initialise a BLOB dict to hold filter record ID vector
        self.filter_blob = ArrayDict(filename, 'w+')
        self.filter_blob['vector'] = self.record_ids
        # this syncs the data to disc - EVIL - relying on cyclic GC to
        # reap immediately.
        del self.filter_blob
        # re-instate the reference to the BLOBstore
        self.filter_blob = ArrayDict(filename, 'r')
        self.record_ids = self.filter_blob['vector']
    else:
        self.filter_blob = None
    self.save_metadata()
    soom.info('Assembling filter %s containing %d elements took %.3fs' %
              (self.name, len(self.record_ids), time.time() - starttime))

def yield_rows(self):
    row = SummaryRow()
    isect_time = 0.0
    for item in Utils.combinations(*self.condcols.veckey_pairs(self.zeros)):
        row.level = len(item)
        row.suppress = False
        if row.level in self.levels:
            row.type_string = ['0'] * len(self.condcols)
            colnames = []
            colvalues = []
            intersect_rows = []
            for var_val, var_rows, suppress, condcol in item:
                row.type_string[condcol.index] = '1'
                intersect_rows.append(var_rows)
                colnames.append(condcol.name)
                colvalues.append(var_val)
                if suppress:
                    row.suppress = True
            isect_start = time.time()
            if len(intersect_rows) == 0:
                row.count = len(self.filtered_ds)
                row.extract = self.filtered_ds
            else:
                if len(intersect_rows) == 1:
                    cellrows = intersect_rows[0]
                else:
                    cellrows = soomfunc.intersect(*intersect_rows)
                row.count = len(cellrows)
                row.extract = DatasetTake(self.dataset, cellrows)
            isect_time += time.time() - isect_start
            row.colnames = tuple(colnames)
            row.colvalues = tuple(colvalues)
            yield row
    soom.info('Summarise intersect() time: %.3f' % isect_time)

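# A minimal pure-Python sketch (not part of the original module) of the
# set operation soomfunc.intersect performs above: each condcol value
# contributes a sorted vector of row ordinals, and a summary cell holds
# the rows common to all vectors.
def _intersect_sorted_demo(*vectors):
    # Intersect sorted integer sequences; soomfunc.intersect does the
    # same over Numeric arrays in compiled code.
    result = set(vectors[0])
    for v in vectors[1:]:
        result &= set(v)
    return sorted(result)

# e.g. rows where sex == 'F' and agegrp == 3 (hypothetical row vectors):
# _intersect_sorted_demo([0, 2, 5, 9], [2, 3, 5, 8]) -> [2, 5]
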
def from_summset(cls, ds, shaped_like=None):
    self = cls(ds.name)
    st = time.time()
    cols = ds.get_columns()
    if shaped_like is not None:
        for axis in xtab_axes(shaped_like):
            try:
                col = ds[axis.name]
            except KeyError:
                pass
            else:
                self.axes.append(CrossTabAxis.from_col(col, axis.values))
                cols.remove(col)
    for col in cols:
        if col.is_discrete() and not col.name.startswith('_'):
            self.axes.append(CrossTabAxis.from_col(col))
    if not self.axes:
        raise Error('dataset %r must have at least one discrete column' %
                    (ds.name,))
    indices = [axis.indices.filled() for axis in self.axes]
    masks = [axis.indices.mask() for axis in self.axes]
    map = MA.transpose(MA.array(indices, mask=masks))
    shape = self.get_shape()
    for col in ds.get_columns():
        if col.is_scalar():
            self.add_table(col.name,
                           data=self.from_vector(map, col.data, shape),
                           label=col.label)
    elapsed = time.time() - st
    soom.info('%r crosstab generation took %.3f, %.1f rows/s' %
              (self.name, elapsed, len(map) / elapsed))
    return self

def load_data(self):
    if self.parent_dataset.backed:
        if self._data is None:
            starttime = time.time()
            filename = self.object_path(self.datatype.file_extension)
            self._data = self.datatype.load_data(filename)
            elapsed = time.time() - starttime
            soom.info('load of %r data vector took %.3f seconds.' %
                      (self.name, elapsed))

def dsunload(self, dsname):
    """Unloads datasets"""
    if isinstance(dsname, Dataset):
        dsname = dsname.name
    try:
        ds = self.datasets.pop(dsname.lower())
    except KeyError:
        pass
    else:
        ds.unload()
        soom.info('Dataset %r unloaded.' % dsname)

def load_inverted(self):
    if self.parent_dataset.backed:
        if self._inverted is None:
            starttime = time.time()
            filename = self.object_path('SOOMblobstore', 'inverted')
            self._inverted = ArrayDict(filename, 'r')
            elapsed = time.time() - starttime
            soom.info('load of %r index took %.3f seconds.' %
                      (self.name, elapsed))
    else:
        # we need to build the inverted index!
        self._build_inverted()

def as_dict(self):
    start_time = time.time()
    freqcol = '_freq_'
    if self.proportions and self.default_weightcol:
        # proportions code needs to know weighted frequency
        wgtfreq_method = SummaryStats.freq()
        self.stat_methods.append(wgtfreq_method)
        freqcol = self.stat_methods.get_method_statcolname(wgtfreq_method)
    summaryset = TempSummarySet()
    summaryset.addcolumn('_freq_', 'Frequency', 'int', 'weighting')
    summaryset.addcolumn('_level_', 'Level', 'int', 'scalar')
    summaryset.addcolumn('_type_', 'Summary type', 'str', 'categorical')
    summaryset.addcolumn('_condcols_', 'Conditioning Columns', 'tuple',
                         'categorical')
    for condcol in self.condcols:
        summaryset.addcolumnfromcondcol(condcol)
    _freq = summaryset['_freq_'].data
    _level = summaryset['_level_'].data
    _type = summaryset['_type_'].data
    _condcols = summaryset['_condcols_'].data
    self.stat_methods.add_statcols(self.dataset, summaryset)
    row_ordinal = -1
    for row in self.yield_rows():
        row_ordinal += 1
        _freq.append(row.count)
        _level.append(row.level)
        _type.append(''.join(row.type_string))
        _condcols.append(row.colnames)
        for colname, colvalue in zip(row.colnames, row.colvalues):
            summaryset[colname].data.append(colvalue)
        if row.suppress:
            summaryset.suppressed_rows.append(row_ordinal)
        if row.level != len(self.condcols):
            mtvals = []
            for condcol in self.condcols:
                if condcol.name not in row.colnames:
                    colvalue = condcol.col.all_value
                    summaryset[condcol.name].data.append(colvalue)
                    mtvals.append(summaryset[condcol.name].data[-1])
            summaryset.marginal_total_idx[tuple(mtvals)] = row_ordinal
            summaryset.marginal_total_rows.append(row_ordinal)
        self.stat_methods.calc(summaryset, row.extract)
    if self.proportions:
        allvals = [col.all_value for col in self.condcols.cols()]
        calc_props(summaryset, self.condcols.names(), allvals, freqcol)
    summaryset.suppress_rows(suppress_marginal_totals=self.nomt)
    soom.info('Summarise as_dict() time: %.3f' % (time.time() - start_time))
    return summaryset

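# Illustrative note (an assumption drawn from the code above, not part of
# the original module): for condcols ('sex', 'agegrp'), as_dict() records
# which columns condition each summary row as a '0'/'1' bitmap:
#
#   _level_  _type_  _condcols_           meaning
#   0        '00'    ()                   grand total (marginal)
#   1        '10'    ('sex',)             total per sex
#   1        '01'    ('agegrp',)          total per agegrp
#   2        '11'    ('sex', 'agegrp')    fully conditioned cell
#
# Rows with _level_ < len(condcols) are marginal totals; missing condcol
# values are filled with the column's all_value.
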
def create_wordidx(self):
    """ Open the word index files read-write and prepare them """
    if not self.parent_dataset.backed:
        raise NotImplementedError
    if self._wordidx is None:
        starttime = time.time()
        self._wordidx = ArrayVocab(self.wordidx_filename, 'c')
        self._occurrences = file(self.occurrences_filename, 'wb+')
        # create an empty block 0
        self._occurrences.write("\0" * self.BLOCK_SIZE)
        elapsed = time.time() - starttime
        soom.info('creation of %r index took %.3f seconds.' %
                  (self.name, elapsed))

def load_wordidx(self):
    """ Open the word index files read-only """
    if not self.parent_dataset.backed:
        raise NotImplementedError
    if self._wordidx is None:
        starttime = time.time()
        self._wordidx = ArrayVocab(self.wordidx_filename, 'r')
        self._occurrences = file(self.occurrences_filename, 'rb')
        blocks = os.fstat(
            self._occurrences.fileno())[stat.ST_SIZE] / self.BLOCK_SIZE
        elapsed = time.time() - starttime
        soom.info('load of %r index (%d words/%d blocks) took %.3f seconds.' %
                  (self.name, len(self._wordidx), blocks, elapsed))

def dsload(self, dsname, path=None):
    """
    Load a stored dataset definition (but not all of its data) from
    disc - the data is loaded column by column, only as required.
    Returns a Dataset object instance.
    """
    ds = self._dsload(dsname, path)
    # now load all the columns if soom.lazy_column_loading is turned off
    if not soom.lazy_column_loading:
        soom.info('Loading columns for dataset %r' % ds.name)
        for col in ds.get_columns():
            col.load('data')
            col.load('inverted')
    return ds

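# A minimal usage sketch (assumed API; the container class name is
# hypothetical, based only on the methods above):
#
#   dstore = DatasetStore()             # hypothetical owner of self.datasets
#   ds = dstore.dsload('nhds')          # metadata only, columns stay lazy
#   ds['age'].load('data')              # columns load on demand
#   dstore.dsunload('nhds')             # drop it from the cache
#
# With soom.lazy_column_loading false, dsload() instead eagerly loads
# every column's data vector and inverted index.
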
def flush(self):
    starttime = time.time()
    for col, data in self.columns:
        fn = self._chunk_filename(col.name, self.numchunks)
        f = open(fn, 'wb')
        try:
            if self.compress_chunk:
                f.write(zlib.compress(cPickle.dumps(data, -1)))
            else:
                cPickle.dump(data, f, -1)
        finally:
            f.close()
        del data[:]
    soom.mem_report()
    soom.info('chunk flush took %.3f seconds' % (time.time() - starttime))
    self.numchunks += 1

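# A minimal sketch (not part of the original module) of reading one
# flushed chunk back - the inverse of the writes above. It reuses the
# module's cPickle and zlib imports; 'compressed' must match the
# compress_chunk setting used when the chunk was written.
def _load_chunk_demo(fn, compressed):
    f = open(fn, 'rb')
    try:
        if compressed:
            # chunk was written as zlib-compressed pickle bytes
            return cPickle.loads(zlib.decompress(f.read()))
        # chunk was written with cPickle.dump directly
        return cPickle.load(f)
    finally:
        f.close()
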
def makedataset(self, dsname, path=None, **kwargs):
    """
    Factory function to create a new DataSet instance (inheriting
    metadata from any existing dataset with the same name). The
    returned dataset is locked for update.
    """
    kwargs['backed'] = True
    try:
        ds = self._dsload(dsname, path)
    except DatasetNotFound:
        ds = Dataset(dsname, **kwargs)
        ds.lock()
        soom.info('Dataset %r created.' % dsname)
    else:
        ds.lock()
        ds.new_generation()
    self.datasets[dsname.lower()] = ds
    return ds

def store_column(self, data, mask=None):
    st = time.time()
    # The chain of generators returned by get_store_chain does the actual
    # processing.
    if not getattr(self, 'trace_load', False):
        for value in self.get_store_chain(data, mask):
            pass
    else:
        tracer = store_trace()
        for value in self.get_store_chain(data, mask):
            tracer.flush()
        tracer.done()
    soom.info('Stored data for column %s in dataset %s (%.3fs)' %
              (self.name, self.parent_dataset.name, time.time() - st))
    if self.multisourcecols:
        # We unload source cols to contain mapped memory use on 32 bit plats
        for colname in self.multisourcecols:
            self.parent_dataset[colname].unload()
    soom.mem_report()

def _build_inverted(self):
    """
    Build an inverted index

    NOTE - This is now only used where there is no on-disk inverted
    index, but the column is discrete. For persistent discrete
    columns, the inverted index is built as the data is filtered,
    and the inverted index is saved along with the data.
    """
    starttime = time.time()             # keep track of time
    inverted_dict = {}
    # Use fast NumPy methods if the column type is numeric
    if self.is_numerictype():
        # first get all the unique values
        uniquevalues = soomfunc.unique(
            Numeric.sort(self._data.compressed()))
        ordinals = Numeric.array(range(len(self._data)),
                                 typecode=Numeric.Int)
        for value in uniquevalues:
            inverted = Numeric.compress(
                Numeric.where(Numeric.equal(self._data, value), 1, 0),
                ordinals)
            inverted_dict[value] = inverted
    else:
        # loop over each element
        for rownum, value in enumerate(self._data):
            if type(value) is tuple:
                for v in value:
                    row_nums = inverted_dict.setdefault(v, [])
                    row_nums.append(rownum)
            else:
                row_nums = inverted_dict.setdefault(value, [])
                row_nums.append(rownum)
        for value, row_nums in inverted_dict.iteritems():
            row_array = Numeric.array(row_nums, typecode=Numeric.Int)
            if self.datatype.name == 'tuple':
                row_array = soomfunc.unique(Numeric.sort(row_array))
            inverted_dict[value] = row_array
    self._inverted = inverted_dict
    soom.info('Building inverted index for column %s in dataset %s '
              'took %.3f seconds' %
              (self.name, self.parent_dataset.name, time.time() - starttime))

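# Illustration (not part of the original module): the inverted index maps
# each distinct column value to the ordinals of the rows holding it, e.g.
# for column data ['M', 'F', 'F', 'M', 'F']:
#
#   {'M': array([0, 3]), 'F': array([1, 2, 4])}
#
# Filters and summaries can then fetch the row vector for a value directly
# and intersect vectors across columns instead of scanning the data.
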
def _dsload(self, dsname, path):
    try:
        return self.datasets[dsname.lower()]
    except KeyError:
        pass
    metadata_file = os.path.join(dsname, soom.metadata_filename)
    path = soom.object_path(metadata_file, path)
    if not path:
        raise DatasetNotFound('Unknown dataset %r' % dsname)
    f = open(os.path.join(path, metadata_file), 'rb')
    try:
        ds = cPickle.load(f)
    finally:
        f.close()
    soom.info('Dataset %r loaded.' % dsname)
    if (hasattr(ds, 'soom_version_info')
            and ds.soom_version_info[:2] != version_info[:2]):
        soom.warning('Dataset created by SOOM %s, this is SOOM %s' %
                     (ds.soom_version, version))
    ds.load_notify(path)
    self.datasets[dsname.lower()] = ds
    return ds

def loadrows(self, datasource, chunkrows=0, rowlimit=0):
    source_rownum = 0
    starttime = time.time()
    if not rowlimit:
        rowlimit = -1
    for row in datasource:
        source_rownum += 1
        for col, data in self.columns:
            data.append(row.get(col.name, None))
        self.rownum += 1
        if source_rownum == rowlimit:
            break
        if chunkrows and source_rownum and source_rownum % chunkrows == 0:
            self.flush()
        if source_rownum and source_rownum % 1000 == 0:
            soom.info('%s (%d total) rows read from source %s (%.1f per sec)' %
                      (source_rownum, self.rownum, datasource.name,
                       source_rownum / (time.time() - starttime)))
    self.flush()
    soom.info('%s rows read from DataSource %s, in %.3f seconds '
              '(%d rows total)' %
              (source_rownum, datasource.name, time.time() - starttime,
               self.rownum))
    return source_rownum

def summ(self, *args, **kwargs):
    '''Summarise a Dataset

    summ(conditioning_columns..., stat_methods..., options...)

    For example:

        summary_set = dataset.summ('sex', 'agegrp',
                                   mean('age'), median('age'),
                                   allcalc=True)

    Options include:
        name            name of summary set
        label           summary set label
        allcalc         calculate all combinations
        datasetpath     for persistent summary sets, the dataset path.
        filtername      apply the named filter
        levels          calculate combinations at the specified levels,
                        eg: 2 & 3 is '23'
        permanent       resulting summary dataset should be written to
                        disk.
        proportions
    '''
    from SOOMv0.Dataset import SummarisedDataset
    starttime = time.time()
    # Method argument parsing
    label = kwargs.pop('label', None)
    # datasetpath = kwargs.pop('datasetpath', soom.default_object_path)
    name = kwargs.pop('name', None)
    # permanent = kwargs.pop('permanent', False)
    summarise = Summarise(self, *args, **kwargs)
    summaryset = summarise.as_dict()
    # print "summaryset:",              # debug
    # print summaryset                  # debug
    soom.info('Summarise took %.3fs' % (time.time() - starttime))
    if not name:
        by = ['_by_%s' % condcol.name for condcol in summarise.condcols]
        name = 'sumof_%s%s' % (self.name, ''.join(by))
    if not label:
        label = self.label
    by = [' by %s' % condcol.col.label for condcol in summarise.condcols]
    summ_label = ''.join(by)
    starttime = time.time()
    sumset = SummarisedDataset(name, label=label,
                               summ_label=summ_label,
                               filter_label=summarise.filtered_ds.filter_label,
                               # path=datasetpath, backed=permanent,
                               weightcol="_freq_",
                               date_created=summarise.filtered_ds.date_created,
                               date_updated=summarise.filtered_ds.date_updated)
    summaryset.columntodataset(sumset)
    sumset.stat_methods = summarise.stat_methods
    sumset.nonprintcols = ('_level_', '_type_', '_condcols_')
    soom.info('summary dict into dataset took %.3f' %
              (time.time() - starttime))
    return sumset

def delete(self, name):
    filter = self.filters.pop(name, None)
    if filter:
        filter.delete()
        soom.info('deleted filter %r' % name)

def calc_indirectly_std_ratios(summset, popset, stdsummset, stdpopset,
                               conflev=0.95, baseratio=100,
                               timeinterval='years',
                               popset_popcol='_freq_',
                               stdpopset_popcol='_stdpop_',
                               ci_method='daly', debug=False):
    """
    Calculate Indirectly Standardised Population Event Ratios

    - summset is a summary dataset of counts of events for the
      population-of-interest being compared to the standard population.
    - popset is the stratified population counts for the
      population-of-interest
    - stdsummset is a summary dataset of counts of events for the
      standard population
    - stdpopset is the stratified population counts for the standard
      population
    """
    from rpy import r, get_default_mode, set_default_mode, BASIC_CONVERSION

    alpha = get_alpha(conflev)
    if ci_method != 'daly':
        raise Error("Only Daly method for confidence intervals "
                    "currently implemented")
    if not popset.has_column(popset_popcol):
        raise Error('Denominator population dataset %r does not have a '
                    '%r column' % (popset.label or popset.name,
                                   popset_popcol))
    if not stdpopset.has_column(stdpopset_popcol):
        raise Error('Standard population dataset %r does not have a '
                    '%r column' % (stdpopset.label or stdpopset.name,
                                   stdpopset_popcol))
    st = time.time()
    r_mode = get_default_mode()
    try:
        set_default_mode(BASIC_CONVERSION)
        shape = shape_union(stdsummset, summset)
        summtab = CrossTab.from_summset(summset, shaped_like=shape)
        stdsummtab = CrossTab.from_summset(stdsummset, shaped_like=shape)
        stdpoptab = CrossTab.from_summset(stdpopset, shaped_like=shape)
        stdpoptab.collapse_axes_not_in(stdsummtab)
        stdsummtab.replicate_axes(shape)
        stdpoptab.replicate_axes(shape)
        poptab = CrossTab.from_summset(popset, shaped_like=shape)
        poptab.collapse_axes_not_in(shape)
        if poptab.get_shape() != stdsummtab.get_shape():
            raise Error('Observed population does not have all the '
                        'required columns')
        popfreq = poptab[popset_popcol].data.astype(MA.Float64)
        result = stdsummtab.empty_copy()
        result.add_table('popfreq', data=popfreq,
                         label='Total person-' + timeinterval + ' at risk')
        expected_cols = []
        for table, name, n_add, l_add in just_freq_tables(stdsummtab):
            stdsummfreq = stdsummtab[name].data.astype(MA.Float64)
            stdpopfreq = stdpoptab[stdpopset_popcol].data.astype(MA.Float64)
            std_strata_rates = stdsummfreq / stdpopfreq
            strata_expected_freq = std_strata_rates * popfreq
            # print stdsummfreq[0,0,0], stdpopfreq[0,0,0], popfreq[0,0,0]
            result.add_table('expected' + n_add,
                             data=strata_expected_freq,
                             label='Expected events' + l_add)
            expected_cols.append('expected' + n_add)
        result.collapse_axes_not_in(summtab)
        axis = 0
        baseratio = float(baseratio)
        for table, name, n_add, l_add in just_freq_tables(summtab):
            observed = table.data.astype(Numeric.Float64)
            result.add_table('observed' + n_add, data=observed,
                             label='Observed events' + l_add)
            expected = result['expected' + n_add].data
            isr = observed / expected
            result.add_table('isr' + n_add, data=isr * baseratio,
                             label='Indirectly Standardised Event Ratio')
            # Confidence Intervals
            if alpha is None or name != '_freq_':
                # Can only calculate confidence intervals on freq cols
                continue
            conflev_l = (1 - conflev) / 2.0
            conflev_u = (1 + conflev) / 2.0
            # get shape of observed
            observed_shape = observed.shape
            # flattened version
            observed_flat = MA.ravel(observed)
            # sanity check on shapes - should be the same!
            assert expected.shape == observed.shape
            # flattened version of expected
            expected_flat = MA.ravel(expected)
            # arrays to hold results, and associated masks
            isr_ll = Numeric.empty(len(observed_flat),
                                   typecode=Numeric.Float64)
            isr_ul = Numeric.empty(len(observed_flat),
                                   typecode=Numeric.Float64)
            isr_ll_mask = Numeric.zeros(len(observed_flat),
                                        typecode=Numeric.Int8)
            isr_ul_mask = Numeric.zeros(len(observed_flat),
                                        typecode=Numeric.Int8)
            obs_mask = MA.getmaskarray(observed_flat)
            exp_mask = MA.getmaskarray(expected_flat)
            for i, v in enumerate(observed_flat):
                if obs_mask[i] or exp_mask[i]:
                    isr_ll[i] = 0.0
                    isr_ul[i] = 0.0
                    isr_ll_mask[i] = 1
                    isr_ul_mask[i] = 1
                else:
                    if v == 0.:
                        obs_ll = 0.0
                        obs_ul = -math.log(1 - conflev)
                    else:
                        obs_ll = r.qgamma(conflev_l, v, scale=1.)
                        obs_ul = r.qgamma(conflev_u, v + 1., scale=1.)
                    isr_ll[i] = obs_ll / expected_flat[i]
                    isr_ul[i] = obs_ul / expected_flat[i]
            isr_ll = MA.array(isr_ll, typecode=MA.Float64, mask=isr_ll_mask)
            isr_ul = MA.array(isr_ul, typecode=MA.Float64, mask=isr_ul_mask)
            isr_ll.shape = observed_shape
            isr_ul.shape = observed_shape
            isr_base = 'ISR %d%%' % (100.0 * conflev)
            result.add_table('isr_ll' + n_add, data=isr_ll * baseratio,
                             label=isr_base + ' lower confidence limit' +
                                   l_add)
            result.add_table('isr_ul' + n_add, data=isr_ul * baseratio,
                             label=isr_base + ' upper confidence limit' +
                                   l_add)
    finally:
        set_default_mode(r_mode)
    soom.info('calc_indirectly_std_ratios took %.03f' % (time.time() - st))
    name = 'indir_std_ratios_' + summset.name
    label = ('Indirectly Standardised Ratios for ' +
             (summset.label or summset.name))
    if conflev:
        label += ' (%g%% conf. limits)' % (conflev * 100)
    if debug:
        global vars
        vars = Vars(locals())
    return result.to_summset(name, label=label)

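# A minimal usage sketch (hypothetical dataset names; argument roles per
# the docstring above):
#
#   isr = calc_indirectly_std_ratios(
#       summset=study_deaths,      # events in population of interest
#       popset=study_pop,          # person-time of population of interest
#       stdsummset=std_deaths,     # events in the standard population
#       stdpopset=std_pop,         # person-time of the standard population
#       conflev=0.95, baseratio=100)
#
# The result is a summary dataset with observed, expected and ISR columns
# (plus Daly-method confidence limits on the '_freq_' column).
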
def calc_directly_std_rates(summset, popset, stdpopset=None,
                            conflev=0.95, basepop=100000,
                            timeinterval='years', ci_method='dobson',
                            popset_popcol='_freq_',
                            stdpopset_popcol='_stdpop_',
                            axis=0, debug=False):
    """
    Calculate Directly Standardised Population Rates

    summset is a summary dataset of counts of events for the
    population-of-interest being compared to the standard population.

    popset is the stratified population counts for the
    population-of-interest

    stdpopset is the stratified population counts for the standard
    population
    """
    from rpy import r, get_default_mode, set_default_mode, BASIC_CONVERSION

    alpha = get_alpha(conflev)
    if ci_method not in ('dobson', 'ff'):
        raise Error('Only Dobson et al. (dobson) and Fay-Feuer (ff) methods '
                    'for confidence intervals currently implemented')
    if not popset.has_column(popset_popcol):
        raise Error('Denominator population dataset %r does not have a '
                    '%r column' % (popset.label or popset.name,
                                   popset_popcol))
    if stdpopset is not None and not stdpopset.has_column(stdpopset_popcol):
        raise Error('Standard population dataset %r does not have a '
                    '%r column' % (stdpopset.label or stdpopset.name,
                                   stdpopset_popcol))
    st = time.time()
    r_mode = get_default_mode()
    try:
        set_default_mode(BASIC_CONVERSION)
        # We turn the summset into an Ncondcols-dimensional matrix
        summtab = CrossTab.from_summset(summset)
        if stdpopset is not None:
            # Then attempt to do the same to the stdpop data, summing any
            # axes not required and replicating any missing until we have
            # an array the same shape as the summtab array.
            stdtab = CrossTab.from_summset(stdpopset, shaped_like=summtab)
            stdtab.collapse_axes_not_in(summtab)
            stdtab.replicate_axes(summtab)
            stdpop = stdtab[stdpopset_popcol].data.astype(Numeric.Float64)
        # The population dataset must have at least as many dimensions as
        # the summary dataset. Any additional axes are eliminated by
        # summing; any missing axes are created by replication.
        poptab = CrossTab.from_summset(popset, shaped_like=summtab)
        poptab.collapse_axes_not_in(summtab)
        poptab.replicate_axes(summtab)
        popfreq = poptab[popset_popcol].data.astype(Numeric.Float64)
        # Manufacture a CrossTab for the result, with one less axis
        # (the first, by default)
        result = summtab.empty_copy()
        del result.axes[axis]
        if stdpopset is not None:
            sum_stdpop = sumaxis(stdpop)
            stdwgts = stdpop / sum_stdpop
            stdpop_sq = stdpop ** 2
            sum_stdpop_sq = sum_stdpop ** 2
            ffwi = stdwgts / popfreq
            ffwm = MA.maximum(MA.ravel(ffwi))
        basepop = float(basepop)
        for table, name, n_add, l_add in just_freq_tables(summtab):
            # avoid integer overflows...
            summfreq = table.data.astype(Numeric.Float64)
            strata_rate = summfreq / popfreq
            result.add_table('summfreq' + n_add,
                             data=sumaxis(summfreq, axis),
                             label='Total events' + l_add)
            result.add_table('popfreq' + n_add,
                             data=sumaxis(popfreq, axis),
                             label='Total person-' + timeinterval +
                                   ' at risk' + l_add)
            if stdpopset is not None:
                std_strata_summfreq = summfreq * Numeric.where(
                    MA.getmask(stdwgts), 0., 1.)
                wgtrate = strata_rate * stdwgts
                result.add_table('std_strata_summfreq' + n_add,
                                 data=sumaxis(std_strata_summfreq, axis),
                                 label='Total events in standard strata' +
                                       l_add)
            # Crude rate
            cr = sumaxis(summfreq, axis) / sumaxis(popfreq, axis) * basepop
            result.add_table('cr' + n_add, data=cr,
                             label='Crude Rate per ' + '%d' % basepop +
                                   ' person-' + timeinterval + l_add)
            if alpha is not None:
                # CIs for crude rate
                count = sumaxis(summfreq, axis)
                count_shape = count.shape
                count_flat = MA.ravel(count)
                totpop = sumaxis(popfreq, axis)
                assert totpop.shape == count.shape
                totpop_flat = MA.ravel(totpop)
                cr_ll = Numeric.empty(len(count_flat),
                                      typecode=Numeric.Float64)
                cr_ul = Numeric.empty(len(count_flat),
                                      typecode=Numeric.Float64)
                cr_ll_mask = Numeric.zeros(len(count_flat),
                                           typecode=Numeric.Int8)
                cr_ul_mask = Numeric.zeros(len(count_flat),
                                           typecode=Numeric.Int8)
                for i, v in enumerate(count_flat):
                    try:
                        if v == 0:
                            cr_ll[i] = 0.0
                        else:
                            cr_ll[i] = ((r.qchisq(alpha / 2., df=2.0 * v) /
                                         2.0) / totpop_flat[i]) * basepop
                        cr_ul[i] = ((r.qchisq(1. - alpha / 2.,
                                              df=2.0 * (v + 1)) / 2.0) /
                                    totpop_flat[i]) * basepop
                    except:
                        cr_ll[i] = 0.0
                        cr_ul[i] = 0.0
                        cr_ll_mask[i] = 1
                        cr_ul_mask[i] = 1
                cr_ll = MA.array(cr_ll, mask=cr_ll_mask, typecode=MA.Float64)
                cr_ul = MA.array(cr_ul, mask=cr_ul_mask, typecode=MA.Float64)
                cr_ll.shape = count_shape
                cr_ul.shape = count_shape
                cr_base = 'Crude rate %d%%' % (100.0 * conflev)
                result.add_table('cr_ll' + n_add, data=cr_ll,
                                 label=cr_base + ' lower confidence limit ' +
                                       l_add)
                result.add_table('cr_ul' + n_add, data=cr_ul,
                                 label=cr_base + ' upper confidence limit ' +
                                       l_add)
            if stdpopset is not None:
                # Directly Standardised Rate
                dsr = sumaxis(wgtrate, axis)
                result.add_table('dsr' + n_add, data=dsr * basepop,
                                 label='Directly Standardised Rate per ' +
                                       '%d' % basepop + ' person-' +
                                       timeinterval + l_add)
                # Confidence Intervals
                if alpha is None or name != '_freq_':
                    # Can only calculate confidence intervals on freq cols
                    continue
                if ci_method == 'dobson':
                    # Dobson et al method - see: Dobson A, Kuulasmaa K,
                    # Eberle E, Scherer J. Confidence intervals for
                    # weighted sums of Poisson parameters, Statistics in
                    # Medicine, Vol. 10, 1991, pp. 457-62.
                    # se_wgtrate = summfreq*((stdwgts/(popfreq/basepop))**2)
                    se_wgtrate = summfreq * ((stdwgts / popfreq) ** 2)
                    stderr = stdpop_sq * strata_rate * (1.0 - strata_rate)
                    se_rate = sumaxis(se_wgtrate, axis)
                    sumsei = sumaxis(stderr, axis)
                    total_freq = sumaxis(std_strata_summfreq, axis)
                    # get shape of total_freq
                    total_freq_shape = total_freq.shape
                    total_freq_flat = MA.ravel(total_freq)
                    # flat arrays to hold results and associated masks
                    l_lam = Numeric.empty(len(total_freq_flat),
                                          typecode=Numeric.Float64)
                    u_lam = Numeric.empty(len(total_freq_flat),
                                          typecode=Numeric.Float64)
                    l_lam_mask = Numeric.zeros(len(total_freq_flat),
                                               typecode=Numeric.Int8)
                    u_lam_mask = Numeric.zeros(len(total_freq_flat),
                                               typecode=Numeric.Int8)
                    conflev_l = (1 - conflev) / 2.0
                    conflev_u = (1 + conflev) / 2.0
                    for i, v in enumerate(total_freq_flat):
                        try:
                            if v == 0.:
                                u_lam[i] = -math.log(1 - conflev)
                                l_lam[i] = 0.0
                            else:
                                l_lam[i] = r.qgamma(conflev_l, v, scale=1.)
                                u_lam[i] = r.qgamma(conflev_u, v + 1.,
                                                    scale=1.)
                        except:
                            l_lam[i] = 0.0
                            u_lam[i] = 0.0
                            l_lam_mask[i] = 1
                            u_lam_mask[i] = 1
                    l_lam = MA.array(l_lam, mask=l_lam_mask,
                                     typecode=MA.Float64)
                    u_lam = MA.array(u_lam, mask=u_lam_mask,
                                     typecode=MA.Float64)
                    l_lam.shape = total_freq_shape
                    u_lam.shape = total_freq_shape
                    dsr_ll = dsr + (((se_rate / total_freq) ** 0.5) *
                                    (l_lam - total_freq))
                    dsr_ul = dsr + (((se_rate / total_freq) ** 0.5) *
                                    (u_lam - total_freq))
                elif ci_method == 'ff':
                    # Fay and Feuer method - see: Fay MP, Feuer EJ.
                    # Confidence intervals for directly standardized rates:
                    # a method based on the gamma distribution. Statistics
                    # in Medicine 1997 Apr 15;16(7):791-801.
                    ffvari = summfreq * ffwi ** 2.0
                    ffvar = sumaxis(ffvari, axis)
                    dsr_flat = Numeric.ravel(MA.filled(dsr, 0))
                    dsr_shape = dsr.shape
                    ffvar_flat = Numeric.ravel(MA.filled(ffvar, 0))
                    # flat arrays to hold results and associated masks
                    dsr_ll = Numeric.empty(len(dsr_flat),
                                           typecode=Numeric.Float64)
                    dsr_ul = Numeric.empty(len(dsr_flat),
                                           typecode=Numeric.Float64)
                    dsr_ll_mask = Numeric.zeros(len(dsr_flat),
                                                typecode=Numeric.Int8)
                    dsr_ul_mask = Numeric.zeros(len(dsr_flat),
                                                typecode=Numeric.Int8)
                    for i, y in enumerate(dsr_flat):
                        try:
                            dsr_ll[i] = ((ffvar_flat[i] / (2.0 * y)) *
                                         r.qchisq(alpha / 2.,
                                                  df=(2.0 * (y ** 2.) /
                                                      ffvar_flat[i])))
                            dsr_ul[i] = (((ffvar_flat[i] + (ffwm ** 2.0)) /
                                          (2.0 * (y + ffwm))) *
                                         r.qchisq(1. - alpha / 2.,
                                                  df=((2.0 *
                                                       ((y + ffwm) ** 2.0)) /
                                                      (ffvar_flat[i] +
                                                       ffwm ** 2.0))))
                        except:
                            dsr_ll[i] = 0.0
                            dsr_ul[i] = 0.0
                            dsr_ll_mask[i] = 1
                            dsr_ul_mask[i] = 1
                    dsr_ll = MA.array(dsr_ll, mask=dsr_ll_mask,
                                      typecode=MA.Float64)
                    dsr_ul = MA.array(dsr_ul, mask=dsr_ul_mask,
                                      typecode=MA.Float64)
                    dsr_ll.shape = dsr_shape
                    dsr_ul.shape = dsr_shape
                result.add_table('dsr_ll' + n_add, data=dsr_ll * basepop,
                                 label='DSR ' + '%d' % (100.0 * conflev) +
                                       '% lower confidence limit' + l_add)
                result.add_table('dsr_ul' + n_add, data=dsr_ul * basepop,
                                 label='DSR ' + '%d' % (100.0 * conflev) +
                                       '% upper confidence limit' + l_add)
    finally:
        set_default_mode(r_mode)
    soom.info('calc_directly_std_rates took %.03f' % (time.time() - st))
    if stdpopset is not None:
        name = 'dir_std_rates_' + summset.name
        label = ('Directly Standardised Rates for ' +
                 (summset.label or summset.name))
    else:
        name = 'crude_rates_' + summset.name
        label = 'Crude Rates for ' + (summset.label or summset.name)
    if conflev:
        label += ' (%g%% conf. limits)' % (conflev * 100)
    if debug:
        global vars
        vars = Vars(locals())
    return result.to_summset(name, label=label)

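# A minimal usage sketch (hypothetical dataset names): with a standard
# population the result contains directly standardised rates; without
# one, only crude rates are produced.
#
#   dsr = calc_directly_std_rates(study_deaths, study_pop,
#                                 stdpopset=who_world_std,
#                                 conflev=0.95, basepop=100000,
#                                 ci_method='dobson')
#   crude = calc_directly_std_rates(study_deaths, study_pop)
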
def calc_stratified_rates(summset, popset, conflev=0.95, basepop=100000,
                          timeinterval='years', ci_method='dobson',
                          popset_popcol='_freq_', debug=False):
    """
    Calculate stratified population rates

    summset is a stratified summary dataset of counts of events for
    the population-of-interest

    popset is the stratified population counts for the
    population-of-interest
    """
    from rpy import r, get_default_mode, set_default_mode, BASIC_CONVERSION

    alpha = get_alpha(conflev)
    if ci_method not in ('dobson', 'ff'):
        raise Error('Only Dobson et al. (dobson) and Fay-Feuer (ff) '
                    'methods for confidence intervals currently '
                    'implemented')
    if not popset.has_column(popset_popcol):
        raise Error('Denominator population dataset %r does not have a '
                    '%r column' % (popset.label or popset.name,
                                   popset_popcol))
    st = time.time()
    r_mode = get_default_mode()
    try:
        set_default_mode(BASIC_CONVERSION)
        # We turn the summset into an Ncondcols-dimensional matrix
        summtab = CrossTab.from_summset(summset)
        # The population dataset must have at least as many dimensions as
        # the summary dataset. Any additional axes are eliminated by
        # summing; any missing axes are created by replication.
        poptab = CrossTab.from_summset(popset, shaped_like=summtab)
        poptab.collapse_axes_not_in(summtab)
        poptab.replicate_axes(summtab)
        popfreq = poptab[popset_popcol].data.astype(Numeric.Float64)
        # Manufacture a CrossTab for the result
        result = summtab.empty_copy()
        basepop = float(basepop)
        for table, name, n_add, l_add in just_freq_tables(summtab):
            # avoid integer overflows...
            summfreq = table.data.astype(Numeric.Float64)
            strata_rate = summfreq / popfreq
            result.add_table('summfreq' + n_add, data=summfreq,
                             label='Events' + l_add)
            result.add_table('popfreq' + n_add, data=popfreq,
                             label='Person-' + timeinterval + ' at risk' +
                                   l_add)
            result.add_table('sr' + n_add, data=strata_rate * basepop,
                             label='Strata-specific Rate per ' +
                                   '%d' % basepop + ' person-' +
                                   timeinterval + l_add)
            if alpha is not None:
                # CIs for stratified rates
                summfreq_shape = summfreq.shape
                summfreq_flat = MA.ravel(summfreq)
                assert popfreq.shape == summfreq.shape
                popfreq_flat = MA.ravel(popfreq)
                sr_ll = Numeric.empty(len(summfreq_flat),
                                      typecode=Numeric.Float64)
                sr_ul = Numeric.empty(len(summfreq_flat),
                                      typecode=Numeric.Float64)
                sr_ll_mask = Numeric.zeros(len(summfreq_flat),
                                           typecode=Numeric.Int8)
                sr_ul_mask = Numeric.zeros(len(summfreq_flat),
                                           typecode=Numeric.Int8)
                for i, v in enumerate(summfreq_flat):
                    try:
                        if v == 0:
                            sr_ll[i] = 0.0
                        else:
                            sr_ll[i] = ((r.qchisq(alpha / 2., df=2.0 * v) /
                                         2.0) / popfreq_flat[i]) * basepop
                        sr_ul[i] = ((r.qchisq(1. - alpha / 2.,
                                              df=2.0 * (v + 1)) / 2.0) /
                                    popfreq_flat[i]) * basepop
                    except:
                        sr_ll[i] = 0.0
                        sr_ul[i] = 0.0
                        sr_ll_mask[i] = 1
                        sr_ul_mask[i] = 1
                sr_ll = MA.array(sr_ll, mask=sr_ll_mask, typecode=MA.Float64)
                sr_ul = MA.array(sr_ul, mask=sr_ul_mask, typecode=MA.Float64)
                sr_ll.shape = summfreq_shape
                sr_ul.shape = summfreq_shape
                sr_base = 'Stratified rate %s%%' % (100.0 * conflev)
                result.add_table('sr_ll' + n_add, data=sr_ll,
                                 label=sr_base + ' lower confidence limit ' +
                                       l_add)
                result.add_table('sr_ul' + n_add, data=sr_ul,
                                 label=sr_base + ' upper confidence limit ' +
                                       l_add)
    finally:
        set_default_mode(r_mode)
    soom.info('calc_stratified_rates took %.03f' % (time.time() - st))
    name = 'stratified_rates_' + summset.name
    label = 'Stratified Rates for ' + (summset.label or summset.name)
    if conflev:
        label += ' (%g%% conf. limits)' % (conflev * 100)
    if debug:
        global vars
        vars = Vars(locals())
    return result.to_summset(name, label=label)

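# A minimal usage sketch (hypothetical dataset names): both arguments
# must be stratified the same way (e.g. by agegrp and sex); the result
# holds events, person-time and strata-specific rates per 100,000, with
# confidence limits on the '_freq_' column.
#
#   sr = calc_stratified_rates(deaths_by_age_sex, pop_by_age_sex,
#                              conflev=0.95, basepop=100000)
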