def __init__(self, schema, name, obs_table, columns, task):
    '''
    columns: should be an ordereddict if you want to specify columns'
    order in the table
    '''
    # Logical id is "schema.name"; the physical tablename is a prefixed
    # sha1 of its slug so it stays a valid, stable SQL identifier.
    table_id = '.'.join([schema, name])
    hashed = sha1(underscore_slugify(table_id).encode('utf-8')).hexdigest()
    physical_name = '{prefix}{name}'.format(prefix=OBSERVATORY_PREFIX,
                                            name=hashed)

    obs_table.id = table_id
    obs_table.tablename = physical_name

    self._id = table_id
    self.table = '{schema}.{table}'.format(schema=OBSERVATORY_SCHEMA,
                                           table=physical_name)
    self.qualified_tablename = '"{schema}".{table}'.format(
        schema=OBSERVATORY_SCHEMA, table=physical_name)
    self.obs_table = obs_table
    self._tablename = physical_name
    self._schema = schema
    self._name = name
    # Snapshot of the table's attributes as of construction time.
    self._obs_dict = obs_table.__dict__.copy()
    self._columns = columns
    self._task = task
    # Reuse an already-reflected table object if the metadata knows it.
    if obs_table.tablename in metadata.tables:
        self._table = metadata.tables[obs_table.tablename]
    else:
        self._table = None
def output(self):
    """Yield a PostgresTarget for every OBSTable registered in metadata.

    For each table, try to locate a ``Register`` task class in the
    corresponding ``tasks.*`` module whose slugified name prefixes the
    table's task id; tables whose producing task has vanished are logged
    so they can eventually be dropped.

    Fix: removed a leftover ``pdb.set_trace()`` debugging breakpoint
    that halted execution whenever an orphaned table was found.
    """
    session = current_session()
    for table in session.query(OBSTable):
        split = table.id.split('.')
        schema, task_id = split[0:-1], split[-1]
        modname = 'tasks.' + '.'.join(schema)
        module = __import__(modname, fromlist=['*'])
        exists = False
        for name in dir(module):
            kls = getattr(module, name)
            if not isinstance(kls, Register):
                continue
            # Matching on the slugified class-name prefix only: the exact
            # parameter suffix cannot be recovered because of
            # underscore_slugify.
            if task_id.startswith(underscore_slugify(name)):
                exists = True
        if exists:
            LOGGER.info('{table} exists'.format(table=table))
        else:
            # TODO drop table: no task class produces it any more
            LOGGER.info(table)
        yield PostgresTarget(schema='observatory', tablename=table.tablename)
def _parse_columns(self, all_columns_result):
    """Transform raw catalog query rows into a dict of column metadata.

    Each row is indexed positionally: 0=id, 1=name, 2=description,
    3=type, 4=extra, 5=aggregate, 6=tags (dict), 7=suggested_name,
    8=timespan, 9=denoms, 10=geometry/timespan tuples, 11=envelope.
    Returns a dict keyed by column id.
    """
    parsed = {}
    for row in all_columns_result:
        col_id = row[0]
        tags = row[6]
        timespan = row[8]

        # Group the (geom_id, geom_name, timespan, tags_json) tuples
        # by geometry, accumulating timespans per geometry.
        geom_timespans = {}
        for gt in row[10]:
            geom_id = gt[0]
            if geom_id not in geom_timespans:
                geom_timespans[geom_id] = {
                    'geom_id': geom_id,
                    'geom_name': gt[1],
                    'timespans': [gt[2]],
                    'geom_tags': json.loads(gt[3]),
                }
            else:
                geom_timespans[geom_id]['timespans'].append(gt[2])

        parsed[col_id] = {
            'id': col_id,
            'latlng': catalog_latlng(col_id),
            'name': row[1],
            'description': row[2],
            'type': row[3],
            'extra': row[4],
            'aggregate': row[5],
            'tags': tags,
            'suggested_name': row[7],
            'timespan': timespan,
            'timespan_sluggified': underscore_slugify('_'.join(timespan)),
            # license/source tag ids look like 'license/<slug>'.
            'licenses': [tag_id.split('/')[1]
                         for tag_id in tags
                         if tag_id.startswith('license/')],
            'sources': [tag_id.split('/')[1]
                        for tag_id in tags
                        if tag_id.startswith('source/')],
            'denoms': row[9],
            'geom_timespans': geom_timespans,
            'envelope': row[11],
        }
    return parsed
def columns(self):
    """Assemble the OrderedDict of output columns for this table.

    The county geoid column comes first, followed by one measure column
    per (NAICS code, measure) pair, named ``<measure>_<code>_<industry>``.

    Fix: replaced Python-2-only ``dict.iteritems()`` with ``items()``
    (equivalent on Py2, required on Py3; matches the Py3 variant of this
    task elsewhere in the codebase).
    """
    input_ = self.input()
    # The column name
    cols = OrderedDict([('area_fips',
                         input_['geoid_cols']['county_geoid'])])
    for naics_code, naics_cols in input_['naics'].items():
        for key, coltarget in naics_cols.items():
            naics_name = NAICS_CODES[naics_code]
            colname = underscore_slugify(u'{}_{}_{}'.format(
                key, naics_code, naics_name))
            cols[colname] = coltarget
    return cols
def columns(self):
    """Build the OrderedDict of columns for this table.

    First the county geoid, then one column per NAICS code/measure
    combination.

    Fix: ``iteritems()`` is Python-2-only; ``items()`` works on both
    Python 2 and 3 and is what the modern sibling of this task uses.
    """
    input_ = self.input()
    # The column name
    cols = OrderedDict([
        ('area_fips', input_['geoid_cols']['county_geoid'])
    ])
    for naics_code, naics_cols in input_['naics'].items():
        for key, coltarget in naics_cols.items():
            naics_name = NAICS_CODES[naics_code]
            colname = underscore_slugify(u'{}_{}_{}'.format(
                key, naics_code, naics_name))
            cols[colname] = coltarget
    return cols
def columns(self):
    """Assemble the OrderedDict of columns for this table.

    Two geoid columns (summary-level and shoreline-clipped TIGER county
    ids) come first, then one measure column per NAICS code/measure pair.
    """
    input_ = self.input()
    geoid_cols = input_['geoid_cols']

    cols = OrderedDict()
    cols['area_fipssl'] = geoid_cols['county_{}{}'.format(
        TIGER_YEAR, GEOID_SUMLEVEL_COLUMN)]
    cols['area_fipssc'] = geoid_cols['county_{}{}'.format(
        TIGER_YEAR, GEOID_SHORELINECLIPPED_COLUMN)]

    for code, measures in input_['naics'].items():
        industry = NAICS_CODES[code]
        for measure, target in measures.items():
            slug = underscore_slugify('{}_{}_{}'.format(
                measure, code, industry))
            cols[slug] = target
    return cols
def output(self):
    """Yield a PostgresTarget for every OBSTable known to the metadata.

    For each table, check whether a ``Register`` task class whose
    slugified name prefixes the table's task id still exists in the
    matching ``tasks.*`` module, and report orphans.

    Fixes: removed a leftover ``pdb.set_trace()`` breakpoint, and
    replaced the Python-2 ``print table`` statement (a SyntaxError on
    Python 3) with the ``print()`` function already used above.
    """
    session = current_session()
    for table in session.query(OBSTable):
        split = table.id.split('.')
        schema, task_id = split[0:-1], split[-1]
        modname = 'tasks.' + '.'.join(schema)
        module = __import__(modname, fromlist=['*'])
        exists = False
        for name in dir(module):
            kls = getattr(module, name)
            if not isinstance(kls, Register):
                continue
            # Prefix match only: exact params are unrecoverable after
            # underscore_slugify.
            if task_id.startswith(underscore_slugify(name)):
                exists = True
        if exists:
            print('{table} exists'.format(table=table))
        else:
            # TODO drop table: producing task no longer exists
            print(table)
        yield PostgresTarget(schema='observatory', tablename=table.tablename)
def test_underscore_slugify():
    """underscore_slugify flattens a quoted, parameterized class path
    into a single lowercase underscore slug."""
    # Plain assert (pytest-idiomatic) instead of nose's deprecated
    # assert_equals; nose is unmaintained and broken on modern Python.
    assert underscore_slugify(
        '"path.to.schema"."ClassName(param1=100, param2=foobar)"') == \
        'path_to_schema_class_name_param1_100_param2_foobar'
def columns(self):
    """Build the six QCEW measure columns for one NAICS industry code.

    Returns an OrderedDict of OBSColumns: quarterly establishment
    count, average weekly wage, third-month employment level, and the
    location quotients of each.  The QCEW source tag and the
    no-restrictions license tag are appended to every column.

    Fixes: renamed the local ``license`` to avoid shadowing the
    builtin, and iterate ``cols.values()`` since the key was unused.
    """
    cols = OrderedDict()
    # NOTE(review): description is always '' here, so the generated
    # '{name} is {description}.' sentences are half-empty -- confirm
    # whether real NAICS descriptions should be wired in.
    code, name, description = self.naics_code, NAICS_CODES[
        self.naics_code], ''
    # This gives us easier access to the tags we defined as dependencies
    input_ = self.input()
    units = input_['units']
    sections = input_['sections']
    subsections = input_['subsections']
    parent = input_.get('parent')
    cols['qtrly_estabs'] = OBSColumn(
        id=underscore_slugify('qtrly_estabs_{}'.format(code)),
        type='Numeric',
        name='Establishments in {}'.format(name),
        description=
        'Count of establishments in a given quarter in the {name} industry (NAICS {code}).'
        '{name} is {description}.'.format(name=name,
                                          code=code,
                                          description=description),
        weight=5,
        aggregate='sum',
        tags=[
            units['businesses'], sections['united_states'],
            subsections['commerce_economy']
        ],
        targets={parent['qtrly_estabs']: DENOMINATOR} if parent else {},
    )
    cols['avg_wkly_wage'] = OBSColumn(
        # Make sure the column ID is unique within this module
        # If left blank, will be taken from this column's key in the output OrderedDict
        id=underscore_slugify('avg_wkly_wage_{}'.format(code)),
        # The PostgreSQL type of this column. Generally Numeric for numbers and Text
        # for categories.
        type='Numeric',
        # Human-readable name. Will be used as header in the catalog
        name='Average weekly wage for {} establishments'.format(name),
        # Human-readable description. Will be used as content in the catalog.
        description=
        'Average weekly wage for a given quarter in the {name} industry (NAICS {code}).'
        '{name} is {description}.'.format(name=name,
                                          code=code,
                                          description=description),
        # Ranking of importance, sometimes used to favor certain measures in auto-selection
        # Weight of 0 will hide this column from the user. We generally use between 0 and 10
        weight=5,
        # How this measure was derived, for example "sum", "median", "average", etc.
        # In cases of "sum", this means functions downstream can construct estimates
        # for arbitrary geographies
        aggregate='average',
        # Tags are our way of noting aspects of this measure like its unit, the country
        # it's relevant to, and which section(s) of the catalog it should appear in.
        tags=[
            units['money'], sections['united_states'],
            subsections['income']
        ],
        targets={cols['qtrly_estabs']: UNIVERSE},
    )
    cols['month3_emplvl'] = OBSColumn(
        id=underscore_slugify('month3_emplvl_{}'.format(code)),
        type='Numeric',
        name='Employees in {} establishments'.format(name),
        description=
        'Number of employees in the third month of a given quarter with the {name} '
        'industry (NAICS {code}). {name} is {description}.'.format(
            name=name, code=code, description=description),
        weight=5,
        aggregate='sum',
        tags=[
            units['people'], sections['united_states'],
            subsections['employment']
        ],
        targets={parent['month3_emplvl']: DENOMINATOR} if parent else {},
    )
    cols['lq_avg_wkly_wage'] = OBSColumn(
        id=underscore_slugify('lq_avg_wkly_wage_{}'.format(code)),
        type='Numeric',
        name='Average weekly wage location quotient for {} establishments'.
        format(name),
        description=
        'Location quotient of the average weekly wage for a given quarter relative to '
        'the U.S. (Rounded to the hundredths place) within the {name} industry (NAICS {code}).'
        '{name} is {description}.'.format(name=name,
                                          code=code,
                                          description=description),
        weight=3,
        aggregate=None,
        tags=[
            units['ratio'], sections['united_states'],
            subsections['income']
        ],
    )
    cols['lq_qtrly_estabs'] = OBSColumn(
        id=underscore_slugify('lq_qtrly_estabs_{}'.format(code)),
        type='Numeric',
        name='Location quotient of establishments in {}'.format(name),
        description=
        'Location quotient of the quarterly establishment count relative to '
        'the U.S. (Rounded to the hundredths place) within the {name} industry (NAICS {code}).'
        '{name} is {description}.'.format(name=name,
                                          code=code,
                                          description=description),
        weight=3,
        aggregate=None,
        tags=[
            units['ratio'], sections['united_states'],
            subsections['commerce_economy']
        ],
    )
    cols['lq_month3_emplvl'] = OBSColumn(
        id=underscore_slugify('lq_month3_emplvl_{}'.format(code)),
        type='Numeric',
        name='Employment level location quotient in {} establishments'.
        format(name),
        description=
        'Location quotient of the employment level for the third month of a given quarter '
        'relative to the U.S. (Rounded to the hundredths place) within the {name} '
        'industry (NAICS {code}). {name} is {description}.'.format(
            name=name, code=code, description=description),
        weight=3,
        aggregate=None,
        tags=[
            units['ratio'], sections['united_states'],
            subsections['employment']
        ],
    )
    source = input_['source']['qcew']
    # renamed from `license`, which shadowed the builtin
    license_tag = input_['license']['no-restrictions']
    for col in cols.values():
        col.tags.append(source)
        col.tags.append(license_tag)
    return cols
def output(self):
    """Local CSV target under tmp/<classpath>/, suffixed with the
    slugified last_time so each run gets its own file."""
    prefix = os.path.join('tmp', classpath(self), self.task_id)
    slug = underscore_slugify(self.last_time)
    return LocalTarget('%s_%s.csv' % (prefix, slug))
def columns(self):
    """Generate OBSColumns for every dimension combination of a Eurostat table.

    Queries the metabase for this table's non-geo/time dimensions, takes
    the cross product of their values, and emits one Numeric column per
    combination, naming/describing it from the .dic dictionary files.
    Columns are then filtered by eurostat_columns.json declarations,
    tagged with source/license, cross-linked to their 'TOTAL'
    denominators, and de-summed when the name implies a ratio.
    """
    columns = OrderedDict()
    input_ = self.input()
    subsectiontags = input_['subsection']
    unittags = input_['units']
    eu = input_['section']['eu']
    licensing = input_['license']['eurostat-license']
    source = input_['source']['eurostat-source']
    cache = CACHE
    dicttables_path = input_['DICTTables'].path
    session = current_session()
    # All dimensions for this table other than geography and time.
    # NOTE(review): SQL built by string interpolation; table_code comes
    # from self.table_name (internal), but parameterizing would be safer.
    resp = session.execute('''
        SELECT ARRAY_AGG(DISTINCT dimension) FROM {table}
        WHERE dimension NOT IN ('geo', 'time')
          AND table_code = '{table_code}';
    '''.format(table=input_['metabase'].table,
               table_code=self.table_name.lower()))
    dimensions = resp.fetchone()[0]
    # Cross product of all dimension values, one JSON object per combo.
    resp = session.execute('''
        WITH dimensions AS (SELECT value, dimension FROM {table}
        WHERE table_code = '{table_code}' AND dimension NOT IN ('time', 'geo'))
        SELECT ARRAY_AGG(JSON_BUILD_OBJECT({select}))
        FROM {from_}
        WHERE {where}
    '''.format(table=input_['metabase'].table,
               table_code=self.table_name.lower(),
               select=', '.join([
                   "'{}', {}.value".format(dim, dim) for dim in dimensions
               ]),
               from_=', '.join(
                   ['dimensions {}'.format(dim) for dim in dimensions]),
               where=' AND '.join([
                   "{}.dimension = '{}'".format(dim, dim)
                   for dim in dimensions
               ])))
    cross_prod = resp.fetchone()[0]
    tables = cache.get(dicttables_path, 'table_dic.dic')
    table_desc = tables[self.table_name]
    # Human-readable variable name is everything before the first 'by'.
    variable_name = table_desc.split('by')[0].strip()
    for i in cross_prod:
        dimdefs = []
        if len(cross_prod) > 1:
            # Multiple variables
            var_code = underscore_slugify(self.table_name +
                                          "_".join(list(i.values())))
            if len(i) == 1:
                # Only one dimension, usually "unit"
                for unit_dic, unit_value in i.items():
                    units = cache.get(
                        dicttables_path,
                        '{dimension}.dic'.format(dimension=unit_dic))
                    dimdefs.append(units[unit_value])
                description = "{} ".format(
                    variable_name) + "- " + ", ".join(
                        [str(x) for x in dimdefs])
            else:
                # multiple dimensions, ignore "unit" when building name
                for dimname, dimvalue in i.items():
                    if dimname != 'unit':
                        dim_dic = cache.get(
                            dicttables_path,
                            '{dimension}.dic'.format(dimension=dimname))
                        dimdefs.append(dim_dic[dimvalue])
                description = "{} ".format(
                    variable_name) + "- " + ", ".join(
                        [str(x) for x in dimdefs])
        else:
            # Only one variable
            var_code = underscore_slugify(self.table_name)
            for unit_dic, unit_value in i.items():
                units = cache.get(
                    dicttables_path,
                    '{dimension}.dic'.format(dimension=unit_dic))
                dimdefs.append(units[unit_value])
            description = "{} ".format(variable_name) + "- " + ", ".join(
                [str(x) for x in dimdefs])
        try:
            # Ratio-like units must not be summed across geographies.
            units = cache.get(dicttables_path, 'unit.dic')
            unitdef = units[i['unit']]
            if "percentage" in unitdef.lower() or "per" in unitdef.lower(
            ) or "rate" in unitdef.lower():
                final_unit_tag = "ratio"
                aggregate = None
            elif 'nama_aux_cra' in var_code:
                # NOTE(review): this branch never sets final_unit_tag; a
                # first-iteration NameError below would be swallowed by
                # the bare except. Looks accidental -- confirm.
                aggregate = None
            else:
                final_unit_tag = self.units
                aggregate = 'sum'
        except:
            # NOTE(review): bare except -- rescues missing 'unit' keys
            # AND the NameError above; narrow to (KeyError, NameError)?
            final_unit_tag = self.units
            aggregate = 'sum'
        tags = [
            eu, subsectiontags[self.subsection], unittags[final_unit_tag]
        ]
        # 'ths'/'th_t' codes are expressed in thousands; say so.
        if ('ths' in var_code or 'th_t'
                in var_code) and '(thousand persons)' not in description:
            description = description + ' (thousands)'
        columns[var_code] = OBSColumn(
            id=var_code,
            name=simplify_description(description),
            type='Numeric',
            description=description,
            weight=1,
            aggregate=aggregate,  # may be overridden to None below
            targets={},  # denominators filled in after filtering
            tags=tags,
            extra=i,
        )
    # Drop columns excluded by the declarations file for these params.
    columnsFilter = ColumnsDeclarations(
        os.path.join(os.path.dirname(__file__), 'eurostat_columns.json'))
    parameters = '{{"subsection":"{subsection}","units":"{units}","nuts_level":"{nuts_level}","year":"{year}"}}'.format(
        subsection=self.subsection,
        units=self.units,
        nuts_level=self.nuts_level,
        year=self.year)
    columns = columnsFilter.filter_columns(columns, parameters)
    for _, col in columns.items():
        col.tags.append(source)
        col.tags.append(licensing)
    # Map each non-total dimension combination to the column that holds
    # its TOTAL/T value, so detail columns can point at denominators.
    targets_dict = {}
    for colname, col in columns.items():
        for i, v in col.extra.items():
            if v == 'TOTAL' or v == 'T':
                temp = dict((key, value)
                            for key, value in col.extra.items()
                            if key != i)
                targets_dict[tuple(temp.items())] = colname
    for colname, col in columns.items():
        denoms = {}
        for nontotals, code in targets_dict.items():
            if all(item in col.extra.items()
                   for item in nontotals) and code != colname:
                denoms[columns.get(code)] = 'denominator'
        col.targets = denoms
    # Any column whose name implies a ratio must not be aggregated.
    nonsum = ['proportion', 'average', 'percentage', 'rate', r'%', 'share']
    for _, col in columns.items():
        if any(word in col.name.lower() for word in nonsum):
            col.aggregate = None
    return columns
def output(self):
    """Per-run local CSV target: tmp/<classpath>/<task_id>_<last_time>.csv."""
    base = os.path.join('tmp', classpath(self), self.task_id)
    suffix = underscore_slugify(self.last_time)
    return LocalTarget('{}_{}.csv'.format(base, suffix))
def test_underscore_slugify():
    """A quoted schema path plus parameterized class name collapses to
    one underscore-separated lowercase slug."""
    slug = underscore_slugify(
        '"path.to.schema"."ClassName(param1=100, param2=foobar)"')
    # Plain assert instead of nose's deprecated assert_equals.
    assert slug == 'path_to_schema_class_name_param1_100_param2_foobar'
def columns(self):
    """Build the six QCEW measure columns for one NAICS industry code.

    Returns an OrderedDict of OBSColumns: average weekly wage,
    quarterly establishment count, third-month employment level, and
    the location quotient of each.  Count-like measures link to the
    parent industry's column as DENOMINATOR when a parent exists.

    NOTE(review): description is always '' here, so the generated
    '{name} is {description}.' sentences end half-empty -- confirm
    whether real NAICS descriptions should be supplied.
    """
    cols = OrderedDict()
    code, name, description = self.naics_code, NAICS_CODES[self.naics_code], ''
    # This gives us easier access to the tags we defined as dependencies
    input_ = self.input()
    units = input_['units']
    sections = input_['sections']
    subsections = input_['subsections']
    # Optional: present only for sub-industries with a parent NAICS code.
    parent = input_.get('parent')
    cols['avg_wkly_wage'] = OBSColumn(
        # Make sure the column ID is unique within this module
        # If left blank, will be taken from this column's key in the output OrderedDict
        id=underscore_slugify(u'avg_wkly_wage_{}'.format(code)),
        # The PostgreSQL type of this column. Generally Numeric for numbers and Text
        # for categories.
        type='Numeric',
        # Human-readable name. Will be used as header in the catalog
        name=u'Average weekly wage for {} establishments'.format(name),
        # Human-readable description. Will be used as content in the catalog.
        # (Adjacent literals concatenate before .format is applied.)
        description=u'Average weekly wage for a given quarter in the {name} industry (NAICS {code}).'
                    u'{name} is {description}.'.format(name=name, code=code, description=description),
        # Ranking of importance, sometimes used to favor certain measures in auto-selection
        # Weight of 0 will hide this column from the user. We generally use between 0 and 10
        weight=5,
        # How this measure was derived, for example "sum", "median", "average", etc.
        # In cases of "sum", this means functions downstream can construct estimates
        # for arbitrary geographies
        aggregate='average',
        # Tags are our way of noting aspects of this measure like its unit, the country
        # it's relevant to, and which section(s) of the catalog it should appear in.
        tags=[units['money'], sections['united_states'], subsections['income']],
    )
    cols['qtrly_estabs'] = OBSColumn(
        id=underscore_slugify(u'qtrly_estabs_{}'.format(code)),
        type='Numeric',
        name=u'Establishments in {}'.format(name),
        description=u'Count of establishments in a given quarter in the {name} industry (NAICS {code}).'
                    u'{name} is {description}.'.format(name=name, code=code, description=description),
        weight=5,
        aggregate='sum',
        tags=[units['businesses'], sections['united_states'], subsections['commerce_economy']],
        # Sub-industry counts denominate against the parent industry.
        targets={parent['qtrly_estabs']: DENOMINATOR} if parent else {},
    )
    cols['month3_emplvl'] = OBSColumn(
        id=underscore_slugify(u'month3_emplvl_{}'.format(code)),
        type='Numeric',
        name=u'Employees in {} establishments'.format(name),
        description=u'Number of employees in the third month of a given quarter with the {name} '
                    u'industry (NAICS {code}). {name} is {description}.'.format(
                        name=name, code=code, description=description),
        weight=5,
        aggregate='sum',
        tags=[units['people'], sections['united_states'], subsections['employment']],
        targets={parent['month3_emplvl']: DENOMINATOR} if parent else {},
    )
    # Location quotients are ratios vs. the U.S., so aggregate=None.
    cols['lq_avg_wkly_wage'] = OBSColumn(
        id=underscore_slugify(u'lq_avg_wkly_wage_{}'.format(code)),
        type='Numeric',
        name=u'Average weekly wage location quotient for {} establishments'.format(name),
        description=u'Location quotient of the average weekly wage for a given quarter relative to '
                    u'the U.S. (Rounded to the hundredths place) within the {name} industry (NAICS {code}).'
                    u'{name} is {description}.'.format(name=name, code=code, description=description),
        weight=3,
        aggregate=None,
        tags=[units['ratio'], sections['united_states'], subsections['income']],
    )
    cols['lq_qtrly_estabs'] = OBSColumn(
        id=underscore_slugify(u'lq_qtrly_estabs_{}'.format(code)),
        type='Numeric',
        name=u'Location quotient of establishments in {}'.format(name),
        description=u'Location quotient of the quarterly establishment count relative to '
                    u'the U.S. (Rounded to the hundredths place) within the {name} industry (NAICS {code}).'
                    u'{name} is {description}.'.format(name=name, code=code, description=description),
        weight=3,
        aggregate=None,
        tags=[units['ratio'], sections['united_states'], subsections['commerce_economy']],
    )
    cols['lq_month3_emplvl'] = OBSColumn(
        id=underscore_slugify(u'lq_month3_emplvl_{}'.format(code)),
        type='Numeric',
        name=u'Employment level location quotient in {} establishments'.format(name),
        description=u'Location quotient of the employment level for the third month of a given quarter '
                    u'relative to the U.S. (Rounded to the hundredths place) within the {name} '
                    u'industry (NAICS {code}). {name} is {description}.'.format(
                        name=name, code=code, description=description),
        weight=3,
        aggregate=None,
        tags=[units['ratio'], sections['united_states'], subsections['employment']],
    )
    return cols