def calculate_covariate(self, covariate_type):
    """ Calculate and cache the specified covariate in self.params

    The value is cached to avoid repeatedly making the database queries
    required to compute it.  It is the unweighted mean of every matching
    Covariate row: the rows for self.region when that is set, otherwise
    the rows for all countries listed for self.gbd_region.

    Parameters:
    -----------
      covariate_type : str
        the slug of the covariate type to look up

    Results:
    --------
      When matching covariates exist, self.params[clean(covariate_type)]
      is set to their mean and the object is saved.  When none exist a
      warning is passed to debug() and nothing is changed.
    """
    import numpy as np
    from covariate_data_server.models import Covariate
    from gbd.dismod3.utils import clean

    # TODO: allow a way for one db query to calculate covariates for many data points

    # list(...) keeps the concatenation valid where range() is lazy
    years = [gbd.fields.ALL_YEARS] + list(range(self.year_start, self.year_end + 1))

    if self.region:
        areas = [self.region]
    else:
        # average value from all countries to get regional covariate
        # (FIXME: should this be a population weighted average?)
        from gbd.dismod3.neg_binom_model import countries_for
        areas = countries_for[clean(self.gbd_region)]

    cy_list = ['%s-%d' % (area, y) for area in areas for y in years]

    sex = self.sex
    if sex == 'all':
        # if data is applied to males and females using sex == 'all',
        # take the covariate value for sex == 'total'
        sex = 'total'

    covariates = Covariate.objects.filter(
        type__slug=covariate_type,
        sex=sex,
        country_year__in=cy_list)

    # pull the values out once; they are needed for the emptiness test,
    # the cached value and the log message
    values = [c.value for c in covariates]

    if not values:
        debug('WARNING: Covariate %s not found for %s %s-%s, (Data_id=%d)'
              % (covariate_type, self.sex, self.region, self.year_str(), self.id))
        return

    mean_value = np.mean(values)
    self.params[clean(covariate_type)] = mean_value
    self.cache_params()
    self.save()
    debug('updated %s %s %s-%s to %f, (Data_id=%d)'
          % (covariate_type, self.sex, self.region, self.year_str(), mean_value, self.id))
def study_level_covariates(self):
    """ Describe the extra (non-standard) data columns that vary across the data

    Every key that appears in the params of some entry of self.data, and
    that is neither a required column nor one of the redundant
    bookkeeping keys, is a candidate.  A candidate is kept only if its
    numeric values are not all equal; missing or falsy values count as 0.

    Results:
    --------
      dict mapping each kept key to a dict with entries 'rate', 'error',
      'value', 'range' ([min, max] of the numeric values) and 'category'
      (sorted list of the values that could not be read as floats).  If
      no key is kept, the dict holds a single placeholder entry 'none'.
    """
    from dismod3.utils import clean

    data_list = [d.params for d in self.data.all()]

    all_keys = set()
    for d in data_list:
        all_keys |= set(d.keys())

    required_keys = ['GBD Cause', 'Parameter', 'GBD Region', 'Country ISO3 Code',
                     'Sex', 'Year Start', 'Year End', 'Age Start', 'Age End',
                     'Parameter Value', 'Standard Error', 'Units', ]
    redundant_keys = ['_row', 'age_weights', 'id', 'value', 'condition', 'data_type', 'region']

    additional_keys = sorted(all_keys - set([clean(k) for k in required_keys] + redundant_keys))

    # true infinities as starting bounds, so that values of any magnitude
    # move the bounds (a finite sentinel silently clips the range)
    infinity = float('inf')

    cov_dict = {}
    for k in additional_keys:
        x_vals = set()
        x_min = infinity
        x_max = -infinity
        for x in [d.get(k) or 0. for d in data_list]:
            try:
                x = float(x)
            except ValueError:
                # not a number: remember it as a category value
                x_vals.add(x)
            else:
                x_min = min(x_min, x)
                x_max = max(x_max, x)

        # bounds never moved, i.e. no usable numeric value was seen
        if x_min > x_max:
            x_min = 0.
            x_max = 0.

        # TODO: consider how to get this right, in the case of users adding data in batches
        # for now, only allow numerical covariates (and only those that vary)
        if x_min == x_max:
            continue

        cov_dict[k] = dict(rate=dict(value=0, default=0),
                           error=dict(value=0, default=0),
                           value=dict(value='0', default='0'),
                           range=[x_min, x_max],
                           category=sorted(x_vals)
                           )

    if len(cov_dict) == 0:
        cov_dict['none'] = {
            'rate': dict(value=0, default=0),
            'error': dict(value=0, default=0),
            'value': dict(value='', default='0.'),  # value must be a string
            'range': [0, 1],
            'category': ['0', '.5', '1']
            }

    return cov_dict
def covariate_data_value_show(request, type, area, format='png'):
    """ Serve a representation of the covariate for the specified type and country

    One subplot is drawn per sex (male, female, total), plotting the
    covariate value against year.

    Parameters:
    -----------
      type : str
        the covariate_type
      area : str
        either the country code or the gbd region; a three-character
        value is treated as a country code, anything else as the cleaned
        name of a gbd region
      format : str, optional
        the format to return the results in, may be one of the following:
        json, csv, png, pdf

    Raises:
    -------
      Http404 if the covariate type does not exist, or if area is not a
      country code and matches no gbd region
    """
    ct = get_object_or_404(CovariateType, slug=type)

    # work out which model field the area refers to before drawing anything
    if len(area) == 3:
        area_field = 'iso3'
        area_value = area
    else:
        region_dict = {}
        for r in gbd_regions:
            region_dict[clean(r)] = r
        if area not in region_dict:
            # an unknown region is a bad URL, not a server error
            from django.http import Http404
            raise Http404('No region matches "%s"' % area)
        area_field = 'region'
        area_value = region_dict[area]

    fig_width = 18.
    fig_height = 4.5
    sexes = ['male', 'female', 'total']
    pl.figure(figsize=(fig_width, fig_height), dpi=100)
    # NOTE(review): nothing here closes the figure; presumably
    # view_utils.figure_data takes care of that -- confirm

    for i, s in enumerate(sexes):
        pl.subplot(1, 3, i + 1)

        lookup = {area_field: area_value, 'sex': s}
        rows = list(ct.covariate_set.filter(**lookup))
        X = pl.array(sorted([[c.year, c.value] for c in rows]))

        if len(X) > 0:
            # labels come from a fetched row; keep an explicit reference
            # rather than relying on the comprehension variable above
            # still being bound after the comprehension
            last = rows[-1]
            pl.plot(X[:, 0], X[:, 1], '.-')
            pl.ylabel(last.type)
            pl.xlabel('Time (Years)')
            pl.title('%s for %s in %s' % (last.type, s, getattr(last, area_field)))

    response = view_utils.figure_data(format)

    return HttpResponse(response, view_utils.MIMETYPE[format])
def covariate_data_count_show(request, id):
    """ Show amount of data for each area of the selected covariate type

    Areas are gbd regions when the covariate type is region-only, and
    countries otherwise.  An area is highlighted when it has fewer rows
    than three per year of the covariate type's year range.  For
    region-only types an error message is also produced unless exactly
    21 regions are found.

    Parameters:
    -----------
      id : int
        the id of the covariate type to display
    """
    ct = get_object_or_404(CovariateType, id=id)
    template = 'covariate_data_count_show.html'

    # country-level covariates
    if not ct.region_only:
        pm = ct.covariate_set.all().distinct().values('iso3')
        for entry in pm:
            n_rows = ct.covariate_set.filter(iso3=entry['iso3']).count()
            entry['count'] = n_rows
            too_few = n_rows < (ct.year_end - ct.year_start + 1) * 3
            entry['color'] = 'class=highlight' if too_few else ''

        context = {'ct': ct,
                   'level': 'country',
                   'paginated_models': view_utils.paginated_models(request, pm)}
        return render_to_response(template, context)

    # region-level covariates
    pm = ct.covariate_set.all().distinct().values('region')
    for entry in pm:
        entry['clean_region'] = clean(entry['region'])
        n_rows = ct.covariate_set.filter(region=entry['region']).count()
        entry['count'] = n_rows
        too_few = n_rows < (ct.year_end - ct.year_start + 1) * 3
        entry['color'] = 'class=highlight' if too_few else ''

    n_regions = len(pm)
    error = ''
    if n_regions != 21:
        error = 'Total number of regions are wrong. Found ' + str(n_regions) + '. Should be 21.'

    context = {'ct': ct,
               'level': 'region',
               'error': error,
               'paginated_models': view_utils.paginated_models(request, pm)}
    return render_to_response(template, context)
def covariate_type_show(request, id):
    """ Show an index page for the selected covariate type

    The page lists every (area, sex) combination that has data, where
    areas are gbd regions for region-only covariate types and countries
    otherwise.  A combination is highlighted when it has fewer rows than
    one per year of the covariate type's year range.

    Parameters:
    -----------
      id : int
        the id of the covariate type to display
    """
    ct = get_object_or_404(CovariateType, id=id)
    template = 'covariate_type_show.html'

    # country-level covariates
    if not ct.region_only:
        pm = ct.covariate_set.all().distinct().values('iso3', 'sex')
        for entry in pm:
            n_rows = ct.covariate_set.filter(iso3=entry['iso3'], sex=entry['sex']).count()
            entry['count'] = n_rows
            too_few = n_rows < ct.year_end - ct.year_start + 1
            entry['color'] = 'class=highlight' if too_few else ''

        context = {'ct': ct,
                   'level': 'country',
                   'paginated_models': view_utils.paginated_models(request, pm)}
        return render_to_response(template, context)

    # region-level covariates
    pm = ct.covariate_set.all().distinct().values('region', 'sex')
    for entry in pm:
        entry['clean_region'] = clean(entry['region'])
        n_rows = ct.covariate_set.filter(region=entry['region'], sex=entry['sex']).count()
        entry['count'] = n_rows
        too_few = n_rows < ct.year_end - ct.year_start + 1
        entry['color'] = 'class=highlight' if too_few else ''

    context = {'ct': ct,
               'level': 'region',
               'paginated_models': view_utils.paginated_models(request, pm)}
    return render_to_response(template, context)
def validate(self, lines):
    """ Turn rows of cells into a list of dicts, checking type and range of each field

    Parameters:
    -----------
      lines : iterator
        yields one sequence of cell strings per row; the first row holds
        the column names

    Results:
    --------
      list of dicts, one per non-blank data row, keyed by cleaned column
      name.  Checked fields are replaced by their converted value (Units
      is checked but left as entered), and '_row' holds the row number
      used in error messages (the first row after the header is row 2).

    Raises:
    -------
      forms.ValidationError on the first problem found: empty input, a
      missing required column, a row with the wrong number of cells, or
      a field that fails one of the checks below

    Required data fields:
    --------------------------------------------------------------------------------
    Name                     Type    Limit
    --------------------------------------------------------------------------------
    GBD Cause                str     same value in every row
    Region                   str     one of the GBD regions, or 'all'
    Parameter                str     standardize_data_type
    Sex                      str     standardize_sex
    Country ISO3 Code        str     an ISO3 code in the region (or blank to
                                     apply to all countries in region);
                                     must be 'all' when Region is 'all'
    Age Start                int     [0, 100], <= Age End
    Age End                  int     [0, 100], >= Age Start
    Year Start               int     [1950, 2010], <= Year End
    Year End                 int     [1950, 2010], >= Year Start
    Parameter Value          float   >= 0
    Units                    float   >= 1

    Recommended data fields:
    --------------------------------------------------------------------------------
    Name                     Type             Limit
    --------------------------------------------------------------------------------
    Study ID                 empty or int     >= 0
    Sequela                  empty or str     not checked
    Case Definition          empty or str     not checked
    Coverage                 empty or float   [0, 1]
    Effective Sample Size*   empty or int     > 0, <= Total Study Size N
    Lower CI*                empty or float   >= 0, <= Parameter Value
    Upper CI*                empty or float   > Parameter Value
    Standard Error*          empty or float   > 0 (or -99 for missing)
    Total Study Size N       empty or int     > 0
    Design Factor            empty or float   >= 1
    Citation                 empty or str     not checked
    Urbanicity               empty or float   [0, 1]
    Ignore                   empty or int     [0, 1]

    Optional data fields: No checks

    * Either of Effective Sample Size, Lower CI and Upper CI, or Standard Error must be given.
    """
    # next() rather than .next(): works for any iterator, and lets an
    # empty upload be reported as a validation error
    try:
        header = next(lines)
    except StopIteration:
        raise forms.ValidationError(_('No column names found (the data is empty)'))
    col_names = [clean(col) for col in header]

    # check that required fields appear
    for field in NewDataForm.required_data_fields:
        if clean(field) not in col_names:
            raise forms.ValidationError(_('Column "%s" is missing') % field)

    data_list = []
    for ii, cells in enumerate(lines):
        # skip blank lines
        if all(cell == '' for cell in cells):
            continue

        # ensure that something appears for each column
        if len(cells) != len(col_names):
            raise forms.ValidationError(
                _('Error loading row %d: found %d fields (expected %d))')
                % (ii+2, len(cells), len(col_names)))

        # make an associative array from the row data
        data = {}
        for key, val in zip(col_names, cells):
            data[clean(key)] = val.strip()
        data['_row'] = ii+2
        data_list.append(data)

    # ensure that certain cells are the right format
    error_str = _('Row %d: could not understand entry for %s')

    def fail(row, what):
        # reject the data, naming the offending row and field
        raise forms.ValidationError(error_str % (row['_row'], what))

    def convert(row, key, conv, label):
        # replace row[key] by conv(row[key]); reject the row if conv raises ValueError
        try:
            row[key] = conv(row[key])
        except ValueError:
            fail(row, label)
        return row[key]

    def given(row, key):
        # True when the column is present and this row has an entry for it
        return key in col_names and row[key] != ''

    # the same for every row, so build it once
    valid_regions = set(clean(region) for region in dismod3.gbd_regions)
    valid_regions.add('all')

    gbd_cause = ''
    for r in data_list:
        # check required data fields
        convert(r, 'gbd_cause', str, 'GBD Cause')
        if gbd_cause == '':
            gbd_cause = r['gbd_cause']
        elif gbd_cause != r['gbd_cause']:
            fail(r, 'GBD Cause (all GBD Causes must be the same)')

        convert(r, 'region', str, 'Region')
        if clean(r['region']) not in valid_regions:
            fail(r, 'Region')

        try:
            r['parameter'] = gbd.fields.standardize_data_type[r['parameter']]
        except KeyError:
            fail(r, 'Parameter')

        try:
            r['sex'] = gbd.fields.standardize_sex[r['sex']]
        except KeyError:
            fail(r, 'Sex')

        iso3 = convert(r, 'country_iso3_code', str, 'Country ISO3 Code')
        if r['region'] != 'all':
            if iso3 not in countries_for[clean(r['region'])] + ['']:
                fail(r, 'Country ISO3 Code (%s is not in %s)' % (iso3, r['region']))
        elif iso3 != 'all':
            fail(r, 'Country ISO3 Code (%s must be "all" if region is "all")' % iso3)

        for key, label in (('age_start', 'Age Start'), ('age_end', 'Age End')):
            age = convert(r, key, int, label)
            if age < 0 or age > 100:
                fail(r, label + ' (must be in range [0, 100])')
        if r['age_start'] > r['age_end']:
            fail(r, 'Age Start (must not be greater than Age End)')

        for key, label in (('year_start', 'Year Start'), ('year_end', 'Year End')):
            year = convert(r, key, int, label)
            if year < 1950 or year > 2010:
                fail(r, label + ' (must be in range [1950, 2010])')
        if r['year_start'] > r['year_end']:
            fail(r, 'Year Start (must not be greater than Year End)')

        # units may be entered like "per 100,000"; the entry itself is left as is
        try:
            units = float(r['units'].replace(',', '').replace('per ', ''))
        except ValueError:
            fail(r, 'Units')
        if units < 1:
            fail(r, 'Units (must be greater than 1)')

        convert(r, 'parameter_value', float, 'Parameter Value')
        if r['parameter_value'] < 0:
            fail(r, 'Parameter Value (must be greater than 0)')

        # limits that depend on the kind of parameter apply to the value per unit
        value = r['parameter_value'] / units
        param = r['parameter']
        if param == 'prevalence data' and value > 1:
            fail(r, 'Parameter Value of prevalence (must not be greater than 1)')
        if param == 'duration data' and value > 100:
            fail(r, 'Parameter Value of duration (must not be greater than 100)')
        if param == 'relative-risk data' and value < 1:
            fail(r, 'Parameter Value of relative-risk (must not be smaller than 1)')
        if param == 'smr data' and value < 1:
            fail(r, 'Parameter Value of smr (must not be smaller than 1)')

        # check recommended data fields
        # (Sequela, Case Definition and Citation are free text and are not checked)
        if given(r, 'study_id'):
            convert(r, 'study_id', int, 'Study ID')
            if r['study_id'] < 0:
                fail(r, 'Study ID (must be greater than 0)')

        if given(r, 'coverage'):
            convert(r, 'coverage', float, 'Coverage')
            if r['coverage'] < 0 or r['coverage'] > 1:
                fail(r, 'Coverage (must be in range [0, 1])')

        # at least one way of quantifying uncertainty is needed
        effective_sample_size = given(r, 'effective_sample_size')
        lower_ci = given(r, 'lower_ci')
        upper_ci = given(r, 'upper_ci')
        standard_error = given(r, 'standard_error')
        if not (effective_sample_size or (lower_ci and upper_ci) or standard_error):
            fail(r, 'Either Effective Sample Size or both Lower CI and Upper CI or Standard Error must be given')

        if effective_sample_size:
            convert(r, 'effective_sample_size', int, 'Effective Sample Size')
            if r['effective_sample_size'] <= 0:
                fail(r, 'Effective Sample Size (must be greater than 0)')

        if lower_ci:
            convert(r, 'lower_ci', float, 'Lower CI')
            if r['lower_ci'] < 0 or r['lower_ci'] > r['parameter_value']:
                fail(r, 'Lower CI (must be less than parameter value)')

        if upper_ci:
            convert(r, 'upper_ci', float, 'Upper CI')
            if r['upper_ci'] <= r['parameter_value']:
                fail(r, 'Upper CI (must be greater than Parameter Value)')

        if standard_error:
            convert(r, 'standard_error', float, 'Standard Error')
            if r['standard_error'] <= 0 and r['standard_error'] != -99:
                fail(r, 'Standard Error (must be greater than 0 or -99 for missing)')

        total_study_size_n = given(r, 'total_study_size_n')
        if total_study_size_n:
            convert(r, 'total_study_size_n', int, 'Total Study Size N')
            if r['total_study_size_n'] <= 0:
                fail(r, 'Total Study Size N (must be greater than 0)')

        if total_study_size_n and effective_sample_size:
            if r['effective_sample_size'] > r['total_study_size_n']:
                fail(r, 'Effective Sample Size (must be at most Total Study Size N)')

        if given(r, 'design_factor'):
            convert(r, 'design_factor', float, 'Design Factor')
            if r['design_factor'] < 1:
                fail(r, 'Design Factor (must be greater than 1)')

        if given(r, 'urbanicity'):
            convert(r, 'urbanicity', float, 'Urbanicity')
            if r['urbanicity'] < 0 or r['urbanicity'] > 1:
                fail(r, 'Urbanicity (must be in range [0, 1])')

        if given(r, 'ignore'):
            convert(r, 'ignore', int, 'Ignore')
            if r['ignore'] < 0 or r['ignore'] > 1:
                fail(r, 'Ignore (must be 0 or 1)')

    return data_list