Example #1
    def filter(self):
        starttime = time.time()
        parser = soomparse.SoomFilterParse(self.parent_dataset, self.expr)
        record_ids = parser.filter()
        # Empty filter?
        if record_ids is None or len(record_ids) == 0:
            record_ids = []
        self.record_ids = Numeric.array(record_ids, typecode=Numeric.Int)
        del record_ids
        self.generation = self.parent_dataset.generation
        self.length = len(self.record_ids)

        filename = self._blobstore_filename(mkdirs=True)
        if self.backed and filename:
            # initialise a BLOB dict to hold filter record ID vector
            self.filter_blob = ArrayDict(filename, 'w+')
            self.filter_blob['vector'] = self.record_ids
            # this syncs the data to disc - EVIL - relying on CPython's
            # refcount-based reaping to flush immediately.
            del self.filter_blob
            # re-instate the reference to the BLOBstore
            self.filter_blob = ArrayDict(filename, 'r')
            self.record_ids = self.filter_blob['vector']
        else:
            self.filter_blob = None
        self.save_metadata()

        soom.info('Assembling filter %s containing %d elements took %.3fs' %\
                  (self.name, len(self.record_ids), time.time() - starttime))
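
The delete-and-reopen sequence above is a deliberate persistence idiom: write the vector, drop the only handle so the store flushes to disc, then reopen read-only. A minimal sketch of the same idiom, with the ArrayDict open modes assumed from the usage above:

    def persist_vector(store_cls, filename, vector):
        # Write, then drop the only reference so the store's destructor
        # flushes to disc (relies on CPython reaping it immediately).
        blob = store_cls(filename, 'w+')
        blob['vector'] = vector
        del blob
        # Reopen read-only and hand back the store and its vector.
        blob = store_cls(filename, 'r')
        return blob, blob['vector']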
Example #2
 def yield_rows(self):
     row = SummaryRow()
     isect_time = 0.0
     for item in Utils.combinations(
             *self.condcols.veckey_pairs(self.zeros)):
         row.level = len(item)
         row.suppress = False
         if row.level in self.levels:
             row.type_string = ['0'] * len(self.condcols)
             colnames = []
             colvalues = []
             intersect_rows = []
             for var_val, var_rows, suppress, condcol in item:
                 row.type_string[condcol.index] = '1'
                 intersect_rows.append(var_rows)
                 colnames.append(condcol.name)
                 colvalues.append(var_val)
                 if suppress:
                     row.suppress = True
             isect_start = time.time()
             if len(intersect_rows) == 0:
                 row.count = len(self.filtered_ds)
                 row.extract = self.filtered_ds
             else:
                 if len(intersect_rows) == 1:
                     cellrows = intersect_rows[0]
                 else:
                     cellrows = soomfunc.intersect(*intersect_rows)
                 row.count = len(cellrows)
                 row.extract = DatasetTake(self.dataset, cellrows)
             isect_time += time.time() - isect_start
             row.colnames = tuple(colnames)
             row.colvalues = tuple(colvalues)
             yield row
     soom.info('Summarise intersect() time: %.3f' % isect_time)
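
The per-cell counts above depend on soomfunc.intersect combining per-value row-ID vectors drawn from the inverted indexes. Since those vectors are kept sorted, a merge-style intersection is the natural implementation; a pure-Python sketch of the idea (the real soomfunc is presumably an optimised extension):

    def intersect_sorted(a, b):
        # Intersect two ascending integer sequences in O(len(a) + len(b)).
        result, i, j = [], 0, 0
        while i < len(a) and j < len(b):
            if a[i] < b[j]:
                i += 1
            elif a[i] > b[j]:
                j += 1
            else:
                result.append(a[i])
                i += 1
                j += 1
        return result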
Example #3
 def from_summset(cls, ds, shaped_like=None):
     self = cls(ds.name)
     st = time.time()
     cols = ds.get_columns()
     if shaped_like is not None:
         for axis in xtab_axes(shaped_like):
             try:
                 col = ds[axis.name]
             except KeyError:
                 pass
             else:
                 self.axes.append(CrossTabAxis.from_col(col, axis.values))
                 cols.remove(col)
     for col in cols:
         if col.is_discrete() and not col.name.startswith('_'):
             self.axes.append(CrossTabAxis.from_col(col))
     if not self.axes:
         raise Error('dataset %r must have at least one discrete column' % 
                     (ds.name,))
     indices = [axis.indices.filled() for axis in self.axes]
     masks = [axis.indices.mask() for axis in self.axes]
     map = MA.transpose(MA.array(indices, mask=masks))
     shape = self.get_shape()
     for col in ds.get_columns():
         if col.is_scalar():
             self.add_table(col.name, 
                            data=self.from_vector(map, col.data, shape),
                            label=col.label)
     elapsed = time.time() - st
     soom.info('%r crosstab generation took %.3f, %.1f rows/s' % 
                 (self.name, elapsed, len(map) / elapsed))
     return self
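
Here map is an (nrows, naxes) matrix holding each summary row's index along every axis, and from_vector (not shown in this excerpt) scatters a flat column of cell values into an array of the crosstab's shape at those coordinates. A sketch of that scatter step, using modern NumPy for brevity and ignoring masked indices:

    import numpy as np

    def scatter_to_array(index_map, values, shape):
        # index_map: (nrows, ndim) matrix of per-axis cell indices
        # values:    one flat cell value per summary row
        out = np.zeros(shape, dtype=float)
        for idx, val in zip(index_map, values):
            out[tuple(idx)] = val
        return out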
Example #4
 def load_data(self):
     if self.parent_dataset.backed:
         if self._data is None:
             starttime = time.time()
             filename = self.object_path(self.datatype.file_extension)
             self._data = self.datatype.load_data(filename)
             elapsed = time.time() - starttime
             soom.info('load of %r data vector took %.3f seconds.' %\
                         (self.name, elapsed))
Example #5
 def dsunload(self, dsname):
     """Unloads datasets"""
     if isinstance(dsname, Dataset):
         dsname = dsname.name
     try:
         ds = self.datasets.pop(dsname.lower())
     except KeyError:
         pass
     else:
         ds.unload()
         soom.info('Dataset %r unloaded.' % dsname)
Example #6
 def load_inverted(self):
     if self.parent_dataset.backed:
         if self._inverted is None:
             starttime = time.time()
             filename = self.object_path('SOOMblobstore', 'inverted')
             self._inverted = ArrayDict(filename, 'r')
             elapsed = time.time() - starttime
             soom.info('load of %r index took %.3f seconds.' %\
                         (self.name, elapsed))
     else:
         # we need to build the inverted index!
         self._build_inverted()
Example #7
    def as_dict(self):
        start_time = time.time()

        freqcol = '_freq_'
        if self.proportions and self.default_weightcol:
            # proportions code needs to know weighted frequency
            wgtfreq_method = SummaryStats.freq()
            self.stat_methods.append(wgtfreq_method)
            freqcol = self.stat_methods.get_method_statcolname(wgtfreq_method)

        summaryset = TempSummarySet()
        summaryset.addcolumn('_freq_', 'Frequency', 'int', 'weighting')
        summaryset.addcolumn('_level_', 'Level', 'int', 'scalar')
        summaryset.addcolumn('_type_', 'Summary type', 'str', 'categorical')
        summaryset.addcolumn('_condcols_', 'Conditioning Columns', 'tuple',
                             'categorical')
        for condcol in self.condcols:
            summaryset.addcolumnfromcondcol(condcol)
        _freq = summaryset['_freq_'].data
        _level = summaryset['_level_'].data
        _type = summaryset['_type_'].data
        _condcols = summaryset['_condcols_'].data
        self.stat_methods.add_statcols(self.dataset, summaryset)
        row_ordinal = -1
        for row in self.yield_rows():
            row_ordinal += 1
            _freq.append(row.count)
            _level.append(row.level)
            _type.append(''.join(row.type_string))
            _condcols.append(row.colnames)
            for colname, colvalue in zip(row.colnames, row.colvalues):
                summaryset[colname].data.append(colvalue)
            if row.suppress:
                summaryset.suppressed_rows.append(row_ordinal)
            if row.level != len(self.condcols):
                mtvals = []
                for condcol in self.condcols:
                    if condcol.name not in row.colnames:
                        colvalue = condcol.col.all_value
                        summaryset[condcol.name].data.append(colvalue)
                    mtvals.append(summaryset[condcol.name].data[-1])
                summaryset.marginal_total_idx[tuple(mtvals)] = row_ordinal
                summaryset.marginal_total_rows.append(row_ordinal)
            self.stat_methods.calc(summaryset, row.extract)

        if self.proportions:
            allvals = [col.all_value for col in self.condcols.cols()]
            calc_props(summaryset, self.condcols.names(), allvals, freqcol)
        summaryset.suppress_rows(suppress_marginal_totals=self.nomt)
        soom.info('Summarise as_dict() time: %.3f' %
                  (time.time() - start_time))
        return summaryset
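
The _type_ string records which conditioning columns a row actually conditions on, and lower-level (marginal) rows are padded with each missing column's all_value so every output column stays the same length. A tiny illustration of the encoding:

    condcols = ('sex', 'agegrp')
    row_cols = ('sex',)   # a level-1 row, conditioned on sex only
    type_string = ''.join('1' if c in row_cols else '0' for c in condcols)
    # type_string == '10'; the absent agegrp cell is filled with all_value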
Example #8
 def create_wordidx(self):
     """
     Open the word index files read-write and prepare them
     """
     if not self.parent_dataset.backed:
         raise NotImplementedError
     if self._wordidx is None:
         starttime = time.time()
         self._wordidx = ArrayVocab(self.wordidx_filename, 'c')
         self._occurrences = file(self.occurrences_filename, 'wb+')
         # create an empty block 0
         self._occurrences.write("\0" * self.BLOCK_SIZE)
         elapsed = time.time() - starttime
         soom.info('creation of %r index took %.3f seconds.' % (self.name, elapsed))
Example #9
 def load_wordidx(self):
     """
     Open the word index files read-only
     """
     if not self.parent_dataset.backed:
         raise NotImplementedError
     if self._wordidx is None:
         starttime = time.time()
         self._wordidx = ArrayVocab(self.wordidx_filename, 'r')
         self._occurrences = file(self.occurrences_filename, 'rb')
         blocks = os.fstat(self._occurrences.fileno())[stat.ST_SIZE] / self.BLOCK_SIZE
         elapsed = time.time() - starttime
         soom.info('load of %r index (%d words/%d blocks) took %.3f seconds.' %\
                     (self.name, len(self._wordidx), blocks, elapsed))
Example #10
 def dsload(self, dsname, path=None):
     """
     Function to load a stored data set definition (but not all
     its data) from disc. The data is loaded column by column only
     as required. The function returns a DataSet object instance.
     """
     ds = self._dsload(dsname, path)
     # now load all the columns if soom.lazy_column_loading is turned off
     if not soom.lazy_column_loading:
         soom.info('Loading columns for dataset %r' % ds.name)
         for col in ds.get_columns():
             col.load('data')
             col.load('inverted')
     return ds
Example #11
 def flush(self):
     starttime = time.time()
     for col, data in self.columns:
         fn = self._chunk_filename(col.name, self.numchunks)
         f = open(fn, 'wb')
         try:
             if self.compress_chunk:
                 f.write(zlib.compress(cPickle.dumps(data, -1)))
             else:
                 cPickle.dump(data, f, -1)
         finally:
             f.close()
         del data[:]
     soom.mem_report()
     soom.info('chunk flush took %.3f seconds' % (time.time() - starttime))
     self.numchunks += 1
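
Each flushed chunk is a self-contained pickle, optionally zlib-compressed, so reading one back simply reverses the two steps. A sketch in the excerpt's Python 2 idiom, assuming the same compress_chunk convention:

    import zlib
    import cPickle

    def load_chunk(filename, compressed):
        f = open(filename, 'rb')
        try:
            raw = f.read()
        finally:
            f.close()
        if compressed:
            raw = zlib.decompress(raw)
        return cPickle.loads(raw)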
Example #12
 def makedataset(self, dsname, path=None, **kwargs):
     """
     Factory function to create a new DataSet instance (inheriting
     metadata from any existing dataset with the same name). The
     returned dataset is locked for update.
     """
     kwargs['backed'] = True
     try:
         ds = self._dsload(dsname, path)
     except DatasetNotFound:
         ds = Dataset(dsname, **kwargs)
         ds.lock()
         soom.info('Dataset %r created.' % dsname)
     else:
         ds.lock()
         ds.new_generation()
     self.datasets[dsname.lower()] = ds
     return ds
Example #13
 def store_column(self, data, mask=None): 
     st = time.time()
     # The chain of generators returned by get_store_chain does the actual
     # processing.
     if not getattr(self, 'trace_load', False):
         for value in self.get_store_chain(data, mask):
             pass
     else:
         tracer = store_trace()
         for value in self.get_store_chain(data, mask):
             tracer.flush()
         tracer.done()
     soom.info('Stored data for column %s in dataset %s (%.3fs)' %
               (self.name, self.parent_dataset.name, time.time() - st))
     if self.multisourcecols:
         # We unload source cols to contain mapped memory use on 32 bit plats
         for colname in self.multisourcecols:
             self.parent_dataset[colname].unload()
     soom.mem_report()
Example #14
    def _build_inverted(self):
        """
        Build an inverted index

        NOTE - This is now only used where there is no on-disk
        inverted index, but the column is discrete. For persistent
        discrete columns, the inverted index is built as the data
        is filtered, and the inverted index is saved along with
        the data.
        """
        starttime = time.time()  # keep track of time
        inverted_dict = {}
        # Use fast NumPy methods if the column type is numeric
        if self.is_numerictype():
            # first get all the unique values
            uniquevalues = soomfunc.unique(
                Numeric.sort(self._data.compressed()))
            ordinals = Numeric.array(range(len(self._data)),
                                     typecode=Numeric.Int)
            for value in uniquevalues:
                inverted = Numeric.compress(
                    Numeric.where(Numeric.equal(self._data, value), 1, 0),
                    ordinals)
                inverted_dict[value] = inverted
        else:
            # loop over each element
            for rownum, value in enumerate(self._data):
                if type(value) is tuple:
                    for v in value:
                        row_nums = inverted_dict.setdefault(v, [])
                        row_nums.append(rownum)
                else:
                    row_nums = inverted_dict.setdefault(value, [])
                    row_nums.append(rownum)
            for value, row_nums in inverted_dict.iteritems():
                row_array = Numeric.array(row_nums, typecode=Numeric.Int)
                if self.datatype.name == 'tuple':
                    row_array = soomfunc.unique(Numeric.sort(row_array))
                inverted_dict[value] = row_array
        self._inverted = inverted_dict
        soom.info(
            'Building inverted index for column %s in dataset %s took %.3f seconds'
            % (self.name, self.parent_dataset.name, time.time() - starttime))
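
Concretely, the inverted index maps each distinct value to the ordinals of the rows holding it, which is what makes the intersection-based cell counting in the summariser cheap. A worked micro-example of the generic (non-numeric) branch:

    data = ['m', 'f', 'm', 'f', 'm']
    inverted = {}
    for rownum, value in enumerate(data):
        inverted.setdefault(value, []).append(rownum)
    # inverted == {'m': [0, 2, 4], 'f': [1, 3]}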
Example #15
 def _dsload(self, dsname, path):
     try:
         return self.datasets[dsname.lower()]
     except KeyError:
         pass
     metadata_file = os.path.join(dsname, soom.metadata_filename)
     path = soom.object_path(metadata_file, path)
     if not path:
         raise DatasetNotFound('Unknown dataset %r' % dsname)
     f = open(os.path.join(path, metadata_file), 'rb')
     try:
         ds = cPickle.load(f)
     finally:
         f.close()
     soom.info('Dataset %r loaded.' % dsname)
     if (hasattr(ds, 'soom_version_info') 
         and ds.soom_version_info[:2] != version_info[:2]):
         soom.warning('Dataset created by SOOM %s, this is SOOM %s' %\
                      (ds.soom_version, version))
     ds.load_notify(path)
     self.datasets[dsname.lower()] = ds
     return ds
Example #16
 def loadrows(self, datasource, chunkrows=0, rowlimit=0):
     source_rownum = 0
     starttime = time.time()
     if not rowlimit:
         rowlimit = -1
     for row in datasource:
         source_rownum += 1
         for col, data in self.columns:
             data.append(row.get(col.name, None))
         self.rownum += 1
         if source_rownum == rowlimit:
             break
         if chunkrows and source_rownum and source_rownum % chunkrows == 0:
             self.flush()
         if source_rownum and source_rownum % 1000 == 0:
             soom.info('%s (%d total) rows read from source %s (%.1f per sec)' %\
                       (source_rownum, self.rownum, datasource.name,
                        source_rownum / (time.time() - starttime)))
     self.flush()
     soom.info(
         '%s rows read from DataSource %s, in %.3f seconds (%d rows total)'
         % (source_rownum, datasource.name, time.time() - starttime,
            self.rownum))
     return source_rownum
Example #17
def summ(self, *args, **kwargs):
    '''Summarise a Dataset

    summ(conditioning_columns..., stat_methods..., options...)

For example:

    summary_set = dataset.summ('sex', 'agegrp', 
                                mean('age'), median('age'), 
                                allcalc = True)

Options include:

    name                    name of summary set
    label                   summary set label
    allcalc                 calculate all combinations
    datasetpath             for persistent summary sets,
                            the dataset path.
    filtername              apply the named filter
    levels                  calculate combinations at the
                            specified levels, eg: 2 & 3 is '23'
    permanent               resulting summary dataset should
                            be written to disk.
    proportions

'''
    from SOOMv0.Dataset import SummarisedDataset

    starttime = time.time()
    # Method argument parsing
    label = kwargs.pop('label', None)
#    datasetpath = kwargs.pop('datasetpath', soom.default_object_path)
    name = kwargs.pop('name', None)
#    permanent = kwargs.pop('permanent', False)

    summarise = Summarise(self, *args, **kwargs)
    summaryset = summarise.as_dict()

    # print "summaryset:", # debug
    # print summaryset # debug

    soom.info('Summarise took %.3fs' % (time.time() - starttime))

    if not name:
        by = ['_by_%s' % condcol.name for condcol in summarise.condcols]
        name = 'sumof_%s%s' % (self.name, ''.join(by))
    if not label:
        label = self.label

    by = [' by %s' % condcol.col.label 
            for condcol in summarise.condcols]
    summ_label = ''.join(by)

    starttime = time.time()
    sumset = SummarisedDataset(name, label=label,
                            summ_label=summ_label,
                            filter_label=summarise.filtered_ds.filter_label,
#                            path=datasetpath, backed=permanent,
                            weightcol="_freq_",
                            date_created=summarise.filtered_ds.date_created,
                            date_updated=summarise.filtered_ds.date_updated)
    summaryset.columntodataset(sumset)
    sumset.stat_methods = summarise.stat_methods
    sumset.nonprintcols = ('_level_', '_type_', '_condcols_')
    soom.info('summary dict into dataset took %.3f' % (time.time() - starttime))
    return sumset
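
Pulling the docstring's options together, a hedged usage sketch (the column names and statistical methods follow the docstring's own example; levels takes a string of level digits, e.g. '23'):

    summary_set = dataset.summ('sex', 'agegrp',
                               mean('age'), median('age'),
                               levels='23', name='sex_age_summary')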
Example #18
 def delete(self, name):
     filter = self.filters.pop(name, None)
     if filter:
         filter.delete()
         soom.info('deleted filter %r' % name)
Example #19
def calc_indirectly_std_ratios(summset,
                               popset,
                               stdsummset,
                               stdpopset,
                               conflev=0.95,
                               baseratio=100,
                               timeinterval='years',
                               popset_popcol='_freq_',
                               stdpopset_popcol='_stdpop_',
                               ci_method='daly',
                               debug=False):
    """
    Calculate Indirectly Standardised Population Event Ratios

    - summset is a summary dataset of counts of events for the
      population-of-interest being compared to the standard population.
    - popset is the stratified population counts for the
      population-of-interest
    - stdsummset is a summary dataset of counts of events for the
      standard population
    - stdpopset is the stratified population counts for the standard
      population
    """
    from rpy import r, get_default_mode, set_default_mode, BASIC_CONVERSION

    alpha = get_alpha(conflev)

    if ci_method != 'daly':
        raise Error("Only Daly method for confidence intervals "
                    "currently implemented")
    if not popset.has_column(popset_popcol):
        raise Error('Denominator population dataset %r does not have a '
                    '%r column' % (popset.label or popset.name, popset_popcol))
    if not stdpopset.has_column(stdpopset_popcol):
        raise Error('Standard population dataset %r does not have a '
                    '%r column' %
                    (stdpopset.label or stdpopset.name, stdpopset_popcol))

    st = time.time()
    r_mode = get_default_mode()
    try:
        set_default_mode(BASIC_CONVERSION)

        shape = shape_union(stdsummset, summset)

        summtab = CrossTab.from_summset(summset, shaped_like=shape)

        stdsummtab = CrossTab.from_summset(stdsummset, shaped_like=shape)

        stdpoptab = CrossTab.from_summset(stdpopset, shaped_like=shape)
        stdpoptab.collapse_axes_not_in(stdsummtab)

        stdsummtab.replicate_axes(shape)
        stdpoptab.replicate_axes(shape)

        poptab = CrossTab.from_summset(popset, shaped_like=shape)
        poptab.collapse_axes_not_in(shape)
        if poptab.get_shape() != stdsummtab.get_shape():
            raise Error(
                'Observed population does not have all the required columns')
        popfreq = poptab[popset_popcol].data.astype(MA.Float64)

        result = stdsummtab.empty_copy()
        result.add_table('popfreq',
                         data=popfreq,
                         label='Total person-' + timeinterval + ' at risk')

        expected_cols = []
        for table, name, n_add, l_add in just_freq_tables(stdsummtab):
            stdsummfreq = stdsummtab[name].data.astype(MA.Float64)
            stdpopfreq = stdpoptab[stdpopset_popcol].data.astype(MA.Float64)
            std_strata_rates = stdsummfreq / stdpopfreq
            strata_expected_freq = std_strata_rates * popfreq
            #            print stdsummfreq[0,0,0], stdpopfreq[0,0,0], popfreq[0,0,0]
            result.add_table('expected' + n_add,
                             data=strata_expected_freq,
                             label='Expected events' + l_add)
            expected_cols.append('expected' + n_add)

        result.collapse_axes_not_in(summtab)

        axis = 0
        baseratio = float(baseratio)

        for table, name, n_add, l_add in just_freq_tables(summtab):
            observed = table.data.astype(Numeric.Float64)
            result.add_table('observed' + n_add,
                             data=observed,
                             label='Observed events' + l_add)

            expected = result['expected' + n_add].data

            isr = observed / expected
            result.add_table('isr' + n_add,
                             data=isr * baseratio,
                             label='Indirectly Standardised Event Ratio')

            # Confidence Intervals
            if alpha is None or name != '_freq_':
                # Can only calculate confidence intervals on freq cols
                continue

            conflev_l = (1 - conflev) / 2.0
            conflev_u = (1 + conflev) / 2.0

            # get shape of observed
            observed_shape = observed.shape
            # flattened version
            observed_flat = MA.ravel(observed)

            # sanity check on shapes - should be the same!
            assert expected.shape == observed.shape

            # flattened version of expected
            expected_flat = MA.ravel(expected)

            # lists to hold results
            isr_ll = Numeric.empty(len(observed_flat),
                                   typecode=Numeric.Float64)
            isr_ul = Numeric.empty(len(observed_flat),
                                   typecode=Numeric.Float64)
            isr_ll_mask = Numeric.zeros(len(observed_flat),
                                        typecode=Numeric.Int8)
            isr_ul_mask = Numeric.zeros(len(observed_flat),
                                        typecode=Numeric.Int8)

            obs_mask = MA.getmaskarray(observed_flat)
            exp_mask = MA.getmaskarray(expected_flat)

            for i, v in enumerate(observed_flat):
                if obs_mask[i] or exp_mask[i]:
                    isr_ll[i] = 0.0
                    isr_ul[i] = 0.0
                    isr_ll_mask[i] = 1
                    isr_ul_mask[i] = 1
                else:
                    if v == 0.:
                        obs_ll = 0.0
                        obs_ul = -math.log(1 - conflev)
                    else:
                        obs_ll = r.qgamma(conflev_l, v, scale=1.)
                        obs_ul = r.qgamma(conflev_u, v + 1., scale=1.)
                    isr_ll[i] = obs_ll / expected_flat[i]
                    isr_ul[i] = obs_ul / expected_flat[i]

            isr_ll = MA.array(isr_ll, typecode=MA.Float64, mask=isr_ll_mask)
            isr_ul = MA.array(isr_ul, typecode=MA.Float64, mask=isr_ul_mask)
            isr_ll.shape = observed_shape
            isr_ul.shape = observed_shape

            isr_base = 'ISR %d%%' % (100.0 * conflev)
            result.add_table('isr_ll' + n_add,
                             data=isr_ll * baseratio,
                             label=isr_base + ' lower confidence limit' +
                             l_add)
            result.add_table('isr_ul' + n_add,
                             data=isr_ul * baseratio,
                             label=isr_base + ' upper confidence limit' +
                             l_add)
    finally:
        set_default_mode(r_mode)
    soom.info('calc_indirectly_std_ratios took %.03f' % (time.time() - st))
    name = 'indir_std_ratios_' + summset.name
    label = 'Indirectly Standardised Ratios for ' + (summset.label
                                                     or summset.name)
    if conflev:
        label += ' (%g%% conf. limits)' % (conflev * 100)

    if debug:
        global vars
        vars = Vars(locals())
    return result.to_summset(name, label=label)
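
In formula terms, the Daly limits computed above are the exact Poisson limits for the observed count $O$, scaled by the expected count $E$ (and then by baseratio): with $\alpha = 1 - \text{conflev}$ and $G^{-1}(p;\,k)$ the gamma quantile evaluated via r.qgamma,

$$\mathrm{ISR} = \frac{O}{E}, \qquad \mathrm{ISR}_L = \frac{G^{-1}(\alpha/2;\, O)}{E}, \qquad \mathrm{ISR}_U = \frac{G^{-1}(1 - \alpha/2;\, O + 1)}{E},$$

with the zero-count special case $\mathrm{ISR}_L = 0$, $\mathrm{ISR}_U = -\ln(1 - \text{conflev})/E$.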
Example #20
def calc_directly_std_rates(summset,
                            popset,
                            stdpopset=None,
                            conflev=0.95,
                            basepop=100000,
                            timeinterval='years',
                            ci_method='dobson',
                            popset_popcol='_freq_',
                            stdpopset_popcol='_stdpop_',
                            axis=0,
                            debug=False):
    """
    Calculate Directly Standardised Population Rates

    summset     is a summary dataset of counts of events for the
                population-of-interest being compared to the standard
                population.  
    popset      is the stratified population counts for the
                population-of-interest
    stdpopset   is the stratified population counts for the standard
                population
    """
    from rpy import r, get_default_mode, set_default_mode, BASIC_CONVERSION

    alpha = get_alpha(conflev)

    if ci_method not in ('dobson', 'ff'):
        raise Error('Only Dobson et al. (dobson) and Fay-Feuer (ff) methods '
                    'for confidence intervals currently implemented')
    if not popset.has_column(popset_popcol):
        raise Error('Denominator population dataset %r does not have a '
                    '%r column' % (popset.label or popset.name, popset_popcol))
    if stdpopset is not None and not stdpopset.has_column(stdpopset_popcol):
        raise Error('Standard population dataset %r does not have a '
                    '%r column' %
                    (stdpopset.label or stdpopset.name, stdpopset_popcol))

    st = time.time()
    r_mode = get_default_mode()
    try:
        set_default_mode(BASIC_CONVERSION)

        # We turn the summset into an Ncondcols-dimensional matrix
        summtab = CrossTab.from_summset(summset)

        if stdpopset is not None:
            # Then attempt to do the same to the stdpop data, summing any
            # axes not required and replicate any missing until we have an
            # array the same shape as the summtab array.
            stdtab = CrossTab.from_summset(stdpopset, shaped_like=summtab)
            stdtab.collapse_axes_not_in(summtab)
            stdtab.replicate_axes(summtab)
            stdpop = stdtab[stdpopset_popcol].data.astype(Numeric.Float64)

        # The population dataset must have at least as many dimensions as
        # the summary dataset. Any additional axes are eliminated by summing;
        # any missing axes are created by replication.
        poptab = CrossTab.from_summset(popset, shaped_like=summtab)
        poptab.collapse_axes_not_in(summtab)
        poptab.replicate_axes(summtab)
        popfreq = poptab[popset_popcol].data.astype(Numeric.Float64)

        # Manufacture a CrossTab for the result, with one less axis (the first)
        result = summtab.empty_copy()
        del result.axes[axis]

        if stdpopset is not None:
            sum_stdpop = sumaxis(stdpop)
            stdwgts = stdpop / sum_stdpop
            stdpop_sq = stdpop**2
            sum_stdpop_sq = sum_stdpop**2
            ffwi = stdwgts / popfreq
            ffwm = MA.maximum(MA.ravel(ffwi))

        basepop = float(basepop)

        for table, name, n_add, l_add in just_freq_tables(summtab):

            # avoid integer overflows...
            summfreq = table.data.astype(Numeric.Float64)
            strata_rate = summfreq / popfreq

            result.add_table('summfreq' + n_add,
                             data=sumaxis(summfreq, axis),
                             label='Total events' + l_add)
            result.add_table('popfreq' + n_add,
                             data=sumaxis(popfreq, axis),
                             label='Total person-' + timeinterval +
                             ' at risk' + l_add)

            if stdpopset is not None:
                std_strata_summfreq = summfreq * Numeric.where(
                    MA.getmask(stdwgts), 0., 1.)
                wgtrate = strata_rate * stdwgts
                result.add_table('std_strata_summfreq' + n_add,
                                 data=sumaxis(std_strata_summfreq, axis),
                                 label="Total events in standard strata" +
                                 l_add)

            # Crude rate
            cr = sumaxis(summfreq, axis) / sumaxis(popfreq, axis) * basepop
            result.add_table('cr' + n_add,
                             data=cr,
                             label='Crude Rate per ' + '%d' % basepop +
                             ' person-' + timeinterval + l_add)

            if alpha is not None:
                # CIs for crude rate
                count = sumaxis(summfreq, axis)
                count_shape = count.shape
                count_flat = MA.ravel(count)
                totpop = sumaxis(popfreq, axis)
                assert totpop.shape == count.shape
                totpop_flat = MA.ravel(totpop)

                cr_ll = Numeric.empty(len(count_flat),
                                      typecode=Numeric.Float64)
                cr_ul = Numeric.empty(len(count_flat),
                                      typecode=Numeric.Float64)
                cr_ll_mask = Numeric.zeros(len(count_flat),
                                           typecode=Numeric.Int8)
                cr_ul_mask = Numeric.zeros(len(count_flat),
                                           typecode=Numeric.Int8)

                for i, v in enumerate(count_flat):
                    try:
                        if v == 0:
                            cr_ll[i] = 0.0
                        else:
                            cr_ll[i] = (
                                (r.qchisq(alpha / 2., df=2.0 * v) / 2.0) /
                                totpop_flat[i]) * basepop
                        cr_ul[i] = (
                            (r.qchisq(1. - alpha / 2., df=2.0 *
                                      (v + 1)) / 2.0) /
                            totpop_flat[i]) * basepop
                    except:
                        cr_ll[i] = 0.0
                        cr_ul[i] = 0.0
                        cr_ll_mask[i] = 1
                        cr_ul_mask[i] = 1

                cr_ll = MA.array(cr_ll, mask=cr_ll_mask, typecode=MA.Float64)
                cr_ul = MA.array(cr_ul, mask=cr_ul_mask, typecode=MA.Float64)
                cr_ll.shape = count_shape
                cr_ul.shape = count_shape

                cr_base = 'Crude rate %d%%' % (100.0 * conflev)
                result.add_table('cr_ll' + n_add,
                                 data=cr_ll,
                                 label=cr_base + ' lower confidence limit ' +
                                 l_add)
                result.add_table('cr_ul' + n_add,
                                 data=cr_ul,
                                 label=cr_base + ' upper confidence limit ' +
                                 l_add)

            if stdpopset is not None:

                # Directly Standardised Rate
                dsr = sumaxis(wgtrate, axis)
                result.add_table('dsr' + n_add,
                                 data=dsr * basepop,
                                 label='Directly Standardised Rate per ' +
                                 '%d' % basepop + ' person-' + timeinterval +
                                 l_add)

                # Confidence Intervals
                if alpha is None or name != '_freq_':
                    # Can only calculate confidence intervals on freq cols
                    continue

                if ci_method == 'dobson':
                    # Dobson et al method
                    # see: Dobson A, Kuulasmaa K, Eberle E, Schere J. Confidence intervals for weighted sums
                    # of Poisson parameters, Statistics in Medicine, Vol. 10, 1991, pp. 457-62.
                    # se_wgtrate = summfreq*((stdwgts/(popfreq/basepop))**2)
                    se_wgtrate = summfreq * ((stdwgts / (popfreq))**2)
                    stderr = stdpop_sq * strata_rate * (1.0 - strata_rate)
                    se_rate = sumaxis(se_wgtrate, axis)
                    sumsei = sumaxis(stderr, axis)
                    total_freq = sumaxis(std_strata_summfreq, axis)
                    # get shape of total_freq
                    total_freq_shape = total_freq.shape

                    total_freq_flat = MA.ravel(total_freq)

                    # flat arrays to hold results and associated masks
                    l_lam = Numeric.empty(len(total_freq_flat),
                                          typecode=Numeric.Float64)
                    u_lam = Numeric.empty(len(total_freq_flat),
                                          typecode=Numeric.Float64)
                    l_lam_mask = Numeric.zeros(len(total_freq_flat),
                                               typecode=Numeric.Int8)
                    u_lam_mask = Numeric.zeros(len(total_freq_flat),
                                               typecode=Numeric.Int8)

                    conflev_l = (1 - conflev) / 2.0
                    conflev_u = (1 + conflev) / 2.0

                    for i, v in enumerate(total_freq_flat):
                        try:
                            if v == 0.:
                                u_lam[i] = -math.log(1 - conflev)
                                l_lam[i] = 0.0
                            else:
                                l_lam[i] = r.qgamma(conflev_l, v, scale=1.)
                                u_lam[i] = r.qgamma(conflev_u,
                                                    v + 1.,
                                                    scale=1.)
                        except:
                            l_lam[i] = 0.0
                            u_lam[i] = 0.0
                            l_lam_mask[i] = 1
                            u_lam_mask[i] = 1

                    l_lam = MA.array(l_lam,
                                     mask=l_lam_mask,
                                     typecode=MA.Float64)
                    u_lam = MA.array(u_lam,
                                     mask=u_lam_mask,
                                     typecode=MA.Float64)
                    l_lam.shape = total_freq_shape
                    u_lam.shape = total_freq_shape
                    dsr_ll = dsr + (((se_rate / total_freq)**0.5) *
                                    (l_lam - total_freq))
                    dsr_ul = dsr + (((se_rate / total_freq)**0.5) *
                                    (u_lam - total_freq))

                elif ci_method == 'ff':
                    # Fay and Feuer method
                    # see: Fay MP, Feuer EJ. Confidence intervals for directly standardized rates:
                    # a method based on the gamma distribution. Statistics in Medicine 1997 Apr 15;16(7):791-801.

                    ffvari = summfreq * ffwi**2.0
                    ffvar = sumaxis(ffvari, axis)

                    dsr_flat = Numeric.ravel(MA.filled(dsr, 0))
                    dsr_shape = dsr.shape

                    ffvar_flat = Numeric.ravel(MA.filled(ffvar, 0))

                    # flat arrays to hold results and associated masks
                    dsr_ll = Numeric.empty(len(dsr_flat),
                                           typecode=Numeric.Float64)
                    dsr_ul = Numeric.empty(len(dsr_flat),
                                           typecode=Numeric.Float64)
                    dsr_ll_mask = Numeric.zeros(len(dsr_flat),
                                                typecode=Numeric.Int8)
                    dsr_ul_mask = Numeric.zeros(len(dsr_flat),
                                                typecode=Numeric.Int8)

                    for i, y in enumerate(dsr_flat):
                        try:
                            dsr_ll[i] = (ffvar_flat[i] / (2.0 * y)) * r.qchisq(
                                alpha / 2., df=(2.0 * (y**2.) / ffvar_flat[i]))
                            dsr_ul[i] = ((ffvar_flat[i] + (ffwm**2.0)) /
                                         (2.0 * (y + ffwm))) * r.qchisq(
                                             1. - alpha / 2.,
                                             df=((2.0 * ((y + ffwm)**2.0)) /
                                                 (ffvar_flat[i] + ffwm**2.0)))
                        except:
                            dsr_ll[i] = 0.0
                            dsr_ul[i] = 0.0
                            dsr_ll_mask[i] = 1
                            dsr_ul_mask[i] = 1
                    dsr_ll = MA.array(dsr_ll,
                                      mask=dsr_ll_mask,
                                      typecode=MA.Float64)
                    dsr_ul = MA.array(dsr_ul,
                                      mask=dsr_ul_mask,
                                      typecode=MA.Float64)
                    dsr_ll.shape = dsr_shape
                    dsr_ul.shape = dsr_shape

                result.add_table('dsr_ll' + n_add,
                                 data=dsr_ll * basepop,
                                 label='DSR ' + '%d' % (100.0 * conflev) +
                                 '% lower confidence limit' + l_add)
                result.add_table('dsr_ul' + n_add,
                                 data=dsr_ul * basepop,
                                 label='DSR ' + '%d' % (100.0 * conflev) +
                                 '% upper confidence limit' + l_add)

    finally:
        set_default_mode(r_mode)
    soom.info('calc_directly_std_rates took %.03f' % (time.time() - st))
    if stdpopset is not None:
        name = 'dir_std_rates_' + summset.name
        label = 'Directly Standardised Rates for ' + (summset.label
                                                      or summset.name)
    else:
        name = 'crude_rates_' + summset.name
        label = 'Crude Rates for ' + (summset.label or summset.name)
    if conflev:
        label += ' (%g%% conf. limits)' % (conflev * 100)
    if debug:
        global vars
        vars = Vars(locals())
    return result.to_summset(name, label=label)
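
For reference, the Dobson et al. limits assembled above shift the DSR by rescaled exact Poisson limits of the total count $O$ observed in the standard strata: with $\mathrm{var}(\mathrm{DSR}) = \sum_i w_i^2 O_i / n_i^2$ (the se_wgtrate sum) and $O_L$, $O_U$ the gamma-based Poisson limits for $O$,

$$\mathrm{DSR}_{L,U} = \mathrm{DSR} + \sqrt{\frac{\mathrm{var}(\mathrm{DSR})}{O}}\,(O_{L,U} - O).$$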
Example #21
def calc_stratified_rates(summset,
                          popset,
                          conflev=0.95,
                          basepop=100000,
                          timeinterval='years',
                          ci_method='dobson',
                          popset_popcol='_freq_',
                          debug=False):
    """
    Calculate stratified population rates

    summset     is a stratified summary dataset of counts of events for
                the population-of-interest
    popset      is the stratified population counts for the
                population-of-interest
    """
    from rpy import r, get_default_mode, set_default_mode, BASIC_CONVERSION

    alpha = get_alpha(conflev)

    if ci_method not in ('dobson', 'ff'):
        raise Error('Only Dobson et al. (dobson) and Fay-Feuer (ff) '
                    'methods for confidence intervals currently '
                    'implemented')
    if not popset.has_column(popset_popcol):
        raise Error('Denominator population dataset %r does not have a '
                    '%r column' % (popset.label or popset.name, popset_popcol))

    st = time.time()
    r_mode = get_default_mode()
    try:
        set_default_mode(BASIC_CONVERSION)

        # We turn the summset into an Ncondcols-dimensional matrix
        summtab = CrossTab.from_summset(summset)

        # The population dataset must have at least as many dimensions as
        # the summary dataset. Any additional axes are eliminated by summing;
        # any missing axes are created by replication.
        poptab = CrossTab.from_summset(popset, shaped_like=summtab)
        poptab.collapse_axes_not_in(summtab)
        poptab.replicate_axes(summtab)
        popfreq = poptab[popset_popcol].data.astype(Numeric.Float64)

        # Manufacture a CrossTab for the result
        result = summtab.empty_copy()

        basepop = float(basepop)

        for table, name, n_add, l_add in just_freq_tables(summtab):
            # avoid integer overflows...
            summfreq = table.data.astype(Numeric.Float64)

            strata_rate = summfreq / popfreq

            result.add_table('summfreq' + n_add,
                             data=summfreq,
                             label='Events' + l_add)
            result.add_table('popfreq' + n_add,
                             data=popfreq,
                             label='Person-' + timeinterval + ' at risk' +
                             l_add)
            result.add_table('sr' + n_add,
                             data=strata_rate * basepop,
                             label='Strata-specific Rate per ' +
                             '%d' % basepop + ' person-' + timeinterval +
                             l_add)

            if alpha is not None:
                # CIs for stratified rates
                summfreq_shape = summfreq.shape
                summfreq_flat = MA.ravel(summfreq)
                assert popfreq.shape == summfreq.shape
                popfreq_flat = MA.ravel(popfreq)

                sr_ll = Numeric.empty(len(summfreq_flat),
                                      typecode=Numeric.Float64)
                sr_ul = Numeric.empty(len(summfreq_flat),
                                      typecode=Numeric.Float64)
                sr_ll_mask = Numeric.zeros(len(summfreq_flat),
                                           typecode=Numeric.Int8)
                sr_ul_mask = Numeric.zeros(len(summfreq_flat),
                                           typecode=Numeric.Int8)

                for i, v in enumerate(summfreq_flat):
                    try:
                        if v == 0:
                            sr_ll[i] = 0.0
                        else:
                            sr_ll[i] = (
                                (r.qchisq(alpha / 2., df=2.0 * v) / 2.0) /
                                popfreq_flat[i]) * basepop
                        sr_ul[i] = (
                            (r.qchisq(1. - alpha / 2., df=2.0 *
                                      (v + 1)) / 2.0) /
                            popfreq_flat[i]) * basepop
                    except:
                        sr_ll[i] = 0.0
                        sr_ul[i] = 0.0
                        sr_ll_mask[i] = 1
                        sr_ul_mask[i] = 1

                sr_ll = MA.array(sr_ll, mask=sr_ll_mask, typecode=MA.Float64)
                sr_ul = MA.array(sr_ul, mask=sr_ul_mask, typecode=MA.Float64)
                sr_ll.shape = summfreq_shape
                sr_ul.shape = summfreq_shape

                sr_base = 'Stratified rate %d%%' % (100.0 * conflev)
                result.add_table('sr_ll' + n_add,
                                 data=sr_ll,
                                 label=sr_base + ' lower confidence limit ' +
                                 l_add)
                result.add_table('sr_ul' + n_add,
                                 data=sr_ul,
                                 label=sr_base + ' upper confidence limit ' +
                                 l_add)

    finally:
        set_default_mode(r_mode)
    soom.info('calc_stratified_rates took %.03f' % (time.time() - st))
    name = 'stratified_rates_' + summset.name
    label = 'Stratified Rates for ' + (summset.label or summset.name)
    if conflev:
        label += ' (%g%% conf. limits)' % (conflev * 100)
    if debug:
        global vars
        vars = Vars(locals())
    return result.to_summset(name, label=label)
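
The confidence limits above are the familiar chi-square form of the exact Poisson interval for an observed count $v$ with person-time denominator $n$, scaled to the base population:

$$\mathrm{rate}_L = \frac{\chi^2_{\alpha/2,\,2v}/2}{n} \cdot \text{basepop}, \qquad \mathrm{rate}_U = \frac{\chi^2_{1-\alpha/2,\,2(v+1)}/2}{n} \cdot \text{basepop},$$

with $\mathrm{rate}_L = 0$ when $v = 0$.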