Python clean Examples, gbd.dismod3.utils.clean Python Examples

Example #1

0

Show file

File: models.py Project: jjdu/gbd

    def calculate_covariate(self, covariate_type):
        """ Calculate and cache specified covariate in self.params to avoid
        repeatedly making the database queries required to compute it.

        The value stored is the unweighted mean of every Covariate row that
        matches `covariate_type`, this datum's sex, and one of the
        country-year keys built below.  When `self.region` is set only that
        area is used; otherwise every country listed in
        `countries_for[clean(self.gbd_region)]` contributes.

        Parameters:
        -----------
          covariate_type : str
            slug of the covariate type to look up; the result is stored
            under `clean(covariate_type)` in self.params

        Nothing is returned.  If no rows match, a warning is passed to
        `debug` and self.params is left untouched; otherwise the params are
        cached, the object is saved, and the new value is reported via
        `debug`.
        """

        import numpy as np
        from covariate_data_server.models import Covariate
        from gbd.dismod3.utils import clean

        # list(...) keeps the concatenation valid when range() does not
        # return a list (it is a lazy sequence on Python 3)
        years = [gbd.fields.ALL_YEARS] + list(range(self.year_start, self.year_end + 1))

        # TODO: allow a way for one db query to calculate covariates for many data points
        if self.region:
            areas = [self.region]
        else:  # average value from all countries to get regional covariate (FIXME: should this be a population weighted average?)
            from gbd.dismod3.neg_binom_model import countries_for
            areas = countries_for[clean(self.gbd_region)]
        cy_list = ['%s-%d' % (area, y) for area in areas for y in years]

        sex = self.sex
        if sex == 'all':  # if data is applied to males and females using sex == 'all', take the covariate value for sex == 'total'
            sex = 'total'

        covariates = Covariate.objects.filter(
            type__slug=covariate_type,
            sex=sex,
            country_year__in=cy_list)
        if len(covariates) == 0:
            debug(("WARNING: Covariate %s not found for %s %s-%s, "
                   + "(Data_id=%d)" )
                  % (covariate_type, self.sex, self.region, self.year_str(), self.id))

        else:
            # compute the mean once and reuse it for both the cache and the log line
            mean_value = np.mean([c.value for c in covariates])
            self.params[clean(covariate_type)] = mean_value
            self.cache_params()
            self.save()
            debug('updated %s %s %s-%s to %f, (Data_id=%d)' % (covariate_type, self.sex, self.region, self.year_str(), mean_value, self.id))

Example #2

0

Show file

File: models.py Project: flaxter/gbd

    def study_level_covariates(self):
        """ Build the study-level covariate settings from this model's data

        Every key that appears in the `params` dict of some datum, and that
        is neither a required column nor one of the redundant bookkeeping
        keys, is treated as a candidate covariate.  A candidate is kept only
        if its numeric values are not all equal.

        Returns a dict mapping each kept key to a dict with entries `rate`,
        `error`, `value`, `range` (the [min, max] of the numeric values) and
        `category` (sorted non-numeric values).  If no key is kept, the
        result holds a single placeholder entry named 'none'.
        """
        data_list = [d.params for d in self.data.all()]

        all_keys = set()

        for d in data_list:
            all_keys |= set(d.keys())

        required_keys = ['GBD Cause', 'Parameter', 'GBD Region', 'Country ISO3 Code',
                         'Sex', 'Year Start', 'Year End', 'Age Start', 'Age End',
                         'Parameter Value', 'Standard Error', 'Units', ]

        redundant_keys = ['_row', 'age_weights', 'id', 'value', 'condition', 'data_type', 'region']

        from dismod3.utils import clean
        from numpy import inf
        additional_keys = sorted(all_keys - set([clean(k) for k in required_keys] + redundant_keys))

        cov_dict = {}
        for k in additional_keys:
            x_vals = set()
            # start from +/-inf so that values of any magnitude update the
            # bounds (fixed sentinels would clip values beyond them)
            x_min = inf
            x_max = -inf
            # missing or falsy entries are counted as 0.
            for x in [d.get(k) or 0. for d in data_list]:
                try:
                    x = float(x)
                    x_min = min(x_min, x)
                    x_max = max(x_max, x)
                except ValueError:
                    # not a number: remember it as a category label
                    x_vals.add(x)

            # bounds untouched means no numeric value was seen for this key
            if x_min == inf and x_max == -inf:
                x_min = 0.
                x_max = 0.

            # TODO: consider how to get this right, in the case of users adding data in batches
            # for now, only allow numerical covariates: a key whose numeric
            # values never vary is skipped, whatever its category labels are
            if x_min == x_max:
                continue
            cov_dict[k] = dict(rate=dict(value=0, default=0),
                               error=dict(value=0, default=0),
                               value=dict(value='0', default='0'),
                               range=[x_min, x_max],
                               category=sorted(x_vals)
                               )

        if len(cov_dict) == 0:
            cov_dict['none'] = {
                'rate': dict(value=0, default=0),
                'error': dict(value=0, default=0),
                'value': dict(value='', default='0.'),  # value must be a string
                'range': [0, 1],
                'category': ['0', '.5', '1']
                }

        return cov_dict

Example #3

0

Show file

File: views.py Project: flaxter/gbd

def covariate_data_value_show(request, type, area, format='png'):
    """ Serve a representation of the covariate for the specified type and country

    One panel is drawn per sex (male, female, total), plotting covariate
    value against year.  A panel with no matching rows is left empty.

    Parameters:
    -----------
      type : str
        the covariate_type
      area : str
        either the country code or the gbd region; a three-character value
        is treated as a country code, anything else is looked up as a
        cleaned gbd region name (an unknown name raises KeyError)
      format : str, optional
        the format to return the results in, may be one of the following:
        json, csv, png, pdf
    """
    ct = get_object_or_404(CovariateType, slug=type)
    fig_width = 18.
    fig_height = 4.5
    sexes = ['male', 'female', 'total']
    pl.figure(figsize=(fig_width, fig_height), dpi=100)

    # work out once which field selects the rows and names the place in the title
    if len(area) == 3:
        area_filter = {'iso3': area}
        place_attr = 'iso3'
    else:
        region_dict = {}
        for r in gbd_regions:
            region_dict[clean(r)] = r
        area_filter = {'region': region_dict[area]}
        place_attr = 'region'

    for i, s in enumerate(sexes):
        pl.subplot(1, 3, i + 1)
        # keep the rows in a list: the labels below need one of them, and a
        # comprehension variable is not visible after the comprehension on
        # Python 3
        covariates = list(ct.covariate_set.filter(sex=s, **area_filter))
        X = pl.array(sorted([[c.year, c.value] for c in covariates]))
        if len(X) > 0:
            # the last row fetched supplies the label text
            last = covariates[-1]
            pl.plot(X[:,0], X[:,1], '.-')
            pl.ylabel(last.type)
            pl.xlabel('Time (Years)')
            pl.title('%s for %s in %s' % (last.type, s, getattr(last, place_attr)))

    response = view_utils.figure_data(format)

    return HttpResponse(response, view_utils.MIMETYPE[format])

Example #4

0

Show file

File: views.py Project: flaxter/gbd

def covariate_data_count_show(request, id):
    """ Show amount of data for each country of the selected covariate type

    For a region-only covariate type the rows are grouped by region instead
    of by country, and an error message is added to the template context
    when the number of distinct regions is not 21.

    Parameters:
    -----------
      id : int
        the id of the covariate type to display
    """
    ct = get_object_or_404(CovariateType, id=id)

    def css_class(n_rows):
        # highlight any area holding fewer rows than three per year of the type's span
        full_coverage = (ct.year_end - ct.year_start + 1) * 3
        return 'class=highlight' if n_rows < full_coverage else ''

    if not ct.region_only:
        pm = ct.covariate_set.all().distinct().values('iso3')
        for row in pm:
            row['count'] = ct.covariate_set.filter(iso3=row['iso3']).count()
            row['color'] = css_class(row['count'])

        context = {'ct': ct, 'level': 'country',
                   'paginated_models': view_utils.paginated_models(request, pm)}
        return render_to_response('covariate_data_count_show.html', context)

    pm = ct.covariate_set.all().distinct().values('region')
    for row in pm:
        region = row['region']
        row['clean_region'] = clean(region)
        row['count'] = ct.covariate_set.filter(region=region).count()
        row['color'] = css_class(row['count'])

    n_regions = len(pm)
    if n_regions == 21:
        error = ''
    else:
        error = 'Total number of regions are wrong.  Found ' + str(n_regions) + '.  Should be 21.'

    context = {'ct': ct, 'level': 'region', 'error': error,
               'paginated_models': view_utils.paginated_models(request, pm)}
    return render_to_response('covariate_data_count_show.html', context)

Example #5

0

Show file

File: views.py Project: flaxter/gbd

def covariate_type_show(request, id):
    """ Show an index page for the selected covariate type

    Lists one row per (area, sex) pair, where the area is a region for a
    region-only covariate type and a country otherwise.  Each row carries
    its number of covariate entries and a highlight marker when that number
    is smaller than the length of the type's year span.

    Parameters:
    -----------
      id : int
        the id of the covariate type to display
    """
    ct = get_object_or_404(CovariateType, id=id)

    def css_class(n_rows):
        # highlight any (area, sex) pair with fewer rows than years in the span
        n_years = ct.year_end - ct.year_start + 1
        return 'class=highlight' if n_rows < n_years else ''

    if not ct.region_only:
        pm = ct.covariate_set.all().distinct().values('iso3', 'sex')
        for row in pm:
            matching = ct.covariate_set.filter(iso3=row['iso3'], sex=row['sex'])
            row['count'] = matching.count()
            row['color'] = css_class(row['count'])

        context = {'ct': ct, 'level': 'country',
                   'paginated_models': view_utils.paginated_models(request, pm)}
        return render_to_response('covariate_type_show.html', context)

    pm = ct.covariate_set.all().distinct().values('region', 'sex')
    for row in pm:
        row['clean_region'] = clean(row['region'])
        matching = ct.covariate_set.filter(region=row['region'], sex=row['sex'])
        row['count'] = matching.count()
        row['color'] = css_class(row['count'])

    context = {'ct': ct, 'level': 'region',
               'paginated_models': view_utils.paginated_models(request, pm)}
    return render_to_response('covariate_type_show.html', context)

Example #6

0

Show file

File: forms.py Project: flaxter/gbd

    def validate(self, lines):
        """
Required data fields:
--------------------------------------------------------------------------------
Name                               Type    Limit
--------------------------------------------------------------------------------
GBD Cause                          str     one of the GBD causes
Region                             str     one of the GBD regions
Parameter                          str     standardize_data_type
Sex                                str     standardize_sex
Country ISO3 Code                  str     an ISO3 code in the region (or blank to apply to all countries in region)
Age Start                          int     [0, 100], <= Age End
Age End                            int     [0, 100], >= Age Start
Year Start                         int     [1950, 2010], <= Year End
Year End                           int     [1950, 2010], >= Year Start
Parameter Value                    float   >= 0
Units                              float   >= 1

Recommended data fields:
--------------------------------------------------------------------------------
Name                               Type    Limit
--------------------------------------------------------------------------------
Study ID                           empty or int     >= 0
Sequela                            empty or str     one of the GBD sequela codes
Case Definition                    empty or str     none
Coverage                           empty or float   [0,1]
Effective Sample Size*             empty or int     > 0, <= Total Study Size N
Lower CI*                          empty or float   >= 0 <= Parameter Value
Upper CI*                          empty or float   > Parameter Value
Standard Error*                    empty or float   > 0
Total Study Size N                 empty or int     > 0
Design Factor                      empty or float   >= 1
Citation                           empty or str     none
Urbanicity                         empty or float   [0, 1]
Ignore                             empty or int     [0, 1]

Optional data fields:
No checks

* Either of Effective Sample Size, Lower CI and Upper CI, or Standard Error must be given.
        """
        col_names = [clean(col) for col in lines.next()]

        # check that required fields appear
        for field in NewDataForm.required_data_fields:
            if not clean(field) in col_names:
                raise forms.ValidationError(_('Column "%s" is missing') % field)

        data_list = []
        for ii, cells in enumerate(lines):
            # skip blank lines
            if sum([cell == '' for cell in cells]) == len(cells):
                continue
            
            # ensure that something appears for each column
            if len(cells) != len(col_names):
                raise forms.ValidationError(
                    _('Error loading row %d:  found %d fields (expected %d))')
                    % (ii+2, len(cells), len(col_names)))

            # make an associative array from the row data
            data = {}
            for key, val in zip(col_names, cells):
                data[clean(key)] = val.strip()
            data['_row'] = ii+2

            data_list.append(data)

        # ensure that certain cells are the right format
        error_str = _('Row %d:  could not understand entry for %s')
        gbd_cause = ''

        for r in data_list:
            # check required data fields
            try:
                r['gbd_cause'] = str(r['gbd_cause'])
            except ValueError:
                raise forms.ValidationError(error_str % (r['_row'], 'GBD Cause'))
            if gbd_cause == '':
                gbd_cause = r['gbd_cause']
            else:
                if gbd_cause != r['gbd_cause']:
                    raise forms.ValidationError(error_str % (r['_row'], 'GBD Cause (all GBD Causes must be the same)'))

            try:
                r['region'] = str(r['region'])
            except ValueError:
                raise forms.ValidationError(error_str % (r['_row'], 'Region'))
            if not clean(r['region']) in [clean(region) for region in dismod3.gbd_regions] + ['all']:
                raise forms.ValidationError(error_str % (r['_row'], 'Region'))

            try:
                r['parameter'] = gbd.fields.standardize_data_type[r['parameter']]
            except KeyError:
                raise forms.ValidationError(error_str % (r['_row'], 'Parameter'))

            try:
                r['sex'] = gbd.fields.standardize_sex[r['sex']]
            except KeyError:
                raise forms.ValidationError(error_str % (r['_row'], 'Sex'))

            try:
                r['country_iso3_code'] = str(r['country_iso3_code'])
            except ValueError:
                raise forms.ValidationError(error_str % (r['_row'], 'Country ISO3 Code'))
            if r['region'] != 'all':
                if not r['country_iso3_code'] in countries_for[clean(r['region'])] + ['']:
                    raise forms.ValidationError(error_str % (r['_row'], 'Country ISO3 Code (%s is not in %s)' % (r['country_iso3_code'], r['region'])))
            elif r['country_iso3_code'] != 'all':
                raise forms.ValidationError(error_str % (r['_row'], 'Country ISO3 Code (%s must be "all" if region is "all")' % r['country_iso3_code']))

            try:
                r['age_start'] = int(r['age_start'])
            except ValueError:
                raise forms.ValidationError(error_str % (r['_row'], 'Age Start'))
            if r['age_start'] < 0 or r['age_start'] > 100:
                raise forms.ValidationError(error_str % (r['_row'], 'Age Start (must be in range [0, 100])'))

            try:
                r['age_end'] = int(r['age_end'])
            except ValueError:
                raise forms.ValidationError(error_str % (r['_row'], 'Age End'))
            if r['age_end'] < 0 or r['age_end'] > 100:
                raise forms.ValidationError(error_str % (r['_row'], 'Age End (must be in range [0, 100])'))

            if r['age_start'] > r['age_end']:
                raise forms.ValidationError(error_str % (r['_row'], 'Age Start (must be greater than Age End)'))

            try:
                r['year_start'] = int(r['year_start'])
            except ValueError:
                raise forms.ValidationError(error_str % (r['_row'], 'Year Start'))
            if r['year_start'] < 1950 or r['year_start'] > 2010:
                raise forms.ValidationError(error_str % (r['_row'], 'Year Start (must be in range [1950, 2010])'))

            try:
                r['year_end'] = int(r['year_end'])
            except ValueError:
                raise forms.ValidationError(error_str % (r['_row'], 'Year End'))
            if r['year_end'] < 1950 or r['year_end'] > 2010:
                raise forms.ValidationError(error_str % (r['_row'], 'Year End (must be in range [1950, 2010])'))
   
            if r['year_start'] > r['year_end']:
                raise forms.ValidationError(error_str % (r['_row'], 'Year Start (must be greater than Year End)'))

            units = 0
            try:
                units = float(r['units'].replace(',', '').replace('per ', ''))
            except ValueError:
                raise forms.ValidationError(error_str % (r['_row'], 'Units'))
            if units < 1:
                raise forms.ValidationError(error_str % (r['_row'], 'Units (must be greater than 1)'))

            try:
                r['parameter_value'] = float(r['parameter_value'])
            except ValueError:
                raise forms.ValidationError(error_str % (r['_row'], 'Parameter Value'))
            if r['parameter_value'] < 0:
                raise forms.ValidationError(error_str % (r['_row'], 'Parameter Value (must be greater than 0)'))
            value = r['parameter_value'] / units
            param = r['parameter']
            if param == 'prevalence data' and value > 1:
                raise forms.ValidationError(error_str % (r['_row'], 'Parameter Value of prevalence (must not be greater than 1)'))
            if param == 'duration data' and value > 100:
                raise forms.ValidationError(error_str % (r['_row'], 'Parameter Value of duration (must not be greater than 100)'))
            if param == 'relative-risk data' and value < 1:
                raise forms.ValidationError(error_str % (r['_row'], 'Parameter Value of relative-risk (must not be smaller than 1)'))
            if param == 'smr data' and value < 1:
                raise forms.ValidationError(error_str % (r['_row'], 'Parameter Value of smr (must not be smaller than 1)'))

            # check recommended data fields
            if 'study_id' in col_names and r['study_id'] != '':
                try:
                    r['study_id'] = int(r['study_id'])
                except ValueError:
                    raise forms.ValidationError(error_str % (r['_row'], 'Study ID'))
                if r['study_id'] < 0:
                    raise forms.ValidationError(error_str % (r['_row'], 'Study ID (must be greater than 0)'))

            #if 'sequela' in col_names and r['sequela'] != '':
            #    try:
            #        r['sequela'] = str(r['sequela'])
            #    except ValueError:
            #        raise forms.ValidationError(error_str % (r['_row'], 'Sequela'))

            #if 'case_definition' in col_names and r['case_definition'] != '':
            #    try:
            #        r['case_definition'] = str(r['case_definition'])
            #    except ValueError:
            #        raise forms.ValidationError(error_str % (r['_row'], 'Case Definition'))

            if 'coverage' in col_names and r['coverage'] != '':
                try:
                    r['coverage'] = float(r['coverage'])
                except ValueError:
                    raise forms.ValidationError(error_str % (r['_row'], 'Coverage'))
                if r['coverage'] < 0 or r['coverage'] > 1:
                    raise forms.ValidationError(error_str % (r['_row'], 'Coverage (must be in range [0, 1])'))

            effective_sample_size = 'effective_sample_size' in col_names and r['effective_sample_size'] != ''
            lower_ci = 'lower_ci' in col_names and r['lower_ci'] != ''
            upper_ci = 'upper_ci' in col_names and r['upper_ci'] != ''
            standard_error = 'standard_error' in col_names and r['standard_error'] != ''

            if not (effective_sample_size or (lower_ci and upper_ci) or standard_error):
                raise forms.ValidationError(error_str % (r['_row'], 'Either Effective Sample Size or both Lower CI and Upper CI or Standard Error must be given'))

            if effective_sample_size:
                try:
                    r['effective_sample_size'] = int(r['effective_sample_size'])
                except ValueError:
                    raise forms.ValidationError(error_str % (r['_row'], 'Effective Sample Size'))
                if r['effective_sample_size'] <= 0:
                    raise forms.ValidationError(error_str % (r['_row'], 'Effective Sample Size (must be greater than 0)'))

            if lower_ci:
                try:
                    r['lower_ci'] = float(r['lower_ci'])
                except ValueError:
                    raise forms.ValidationError(error_str % (r['_row'], 'Lower CI'))
                if r['lower_ci'] < 0 or r['lower_ci'] > r['parameter_value']:
                    raise forms.ValidationError(error_str % (r['_row'], 'Lower CI (must be less than parameter value)'))

            if upper_ci:
                try:
                    r['upper_ci'] = float(r['upper_ci'])
                except ValueError:
                    raise forms.ValidationError(error_str % (r['_row'], 'Upper CI'))
                if r['upper_ci'] <= r['parameter_value']:
                    raise forms.ValidationError(error_str % (r['_row'], 'Upper CI (must be greater than Parameter Value)'))

            if standard_error:
                try:
                    r['standard_error'] = float(r['standard_error'])
                except ValueError:
                    raise forms.ValidationError(error_str % (r['_row'], 'Standard Error'))
                if r['standard_error'] <= 0 and r['standard_error'] != -99:
                    raise forms.ValidationError(error_str % (r['_row'], 'Standard Error (must be greater than 0 or -99 for missing)'))

            if 'total_study_size_n' in col_names and r['total_study_size_n'] != '':
                try:
                    r['total_study_size_n'] = int(r['total_study_size_n'])
                except ValueError:
                    raise forms.ValidationError(error_str % (r['_row'], 'Total Study Size N'))
                if r['total_study_size_n'] <= 0:
                    raise forms.ValidationError(error_str % (r['_row'], 'Total Study Size N (must be greater than 0)'))

            if 'total_study_size_n' in col_names and 'effective_sample_size' in col_names and r['effective_sample_size'] != '' and r['total_study_size_n'] != '':
                if r['effective_sample_size'] > r['total_study_size_n']:
                    raise forms.ValidationError(error_str % (r['_row'], 'Effective Sample Size (must be at most Total Study Size N)'))

            if 'design_factor' in col_names and r['design_factor'] != '':
                try:
                    r['design_factor'] = float(r['design_factor'])
                except ValueError:
                    raise forms.ValidationError(error_str % (r['_row'], 'Design Factor'))
                if r['design_factor'] < 1:
                    raise forms.ValidationError(error_str % (r['_row'], 'Design Factor (must be greater than 1)'))

            #if 'citation' in col_names and r['citation'] != '':
            #    try:
            #        r['citation'] = str(r['citation'])
            #    except ValueError:
            #        raise forms.ValidationError(error_str % (r['_row'], 'Citation'))

            if 'urbanicity' in col_names and r['urbanicity'] != '':
                try:
                    r['urbanicity'] = float(r['urbanicity'])
                except ValueError:
                    raise forms.ValidationError(error_str % (r['_row'], 'Urbanicity'))
                if r['urbanicity'] < 0 or r['urbanicity'] > 1:
                    raise forms.ValidationError(error_str % (r['_row'], 'Urbanicity (must be in range [0, 1])'))

            if 'ignore' in col_names and r['ignore'] != '':
                try:
                    r['ignore'] = int(r['ignore'])
                except ValueError:
                    raise forms.ValidationError(error_str % (r['_row'], 'Ignore'))
                if r['ignore'] < 0 or r['ignore'] > 1:
                    raise forms.ValidationError(error_str % (r['_row'], 'Ignore (must be 0 or 1)'))

        return data_list