def required_tables(cls, api_obj):
    '''Greedily pick tables until every needed variable is covered.

    The needed variables are ``api_obj.vars_needed`` plus
    ``api_obj.where_vars()``, plus ``api_obj.order`` when it is one of
    ``cls.possible_variables``.

    On every round the candidates returned by ``cls.list_partial_tables``
    are ranked by score (highest first) and the best one is taken. Once a
    table has already been chosen, a candidate is only accepted if it shares
    at least one column name with the columns gathered so far (so the tables
    can be joined).

    Returns the chosen tables, in the order they were picked.
    Raises DataUSAException when no remaining candidate is joinable.
    '''
    needed = api_obj.vars_needed + api_obj.where_vars()
    if api_obj.order and api_obj.order in cls.possible_variables:
        needed = needed + [api_obj.order]

    remaining = set(needed)   # variables not yet covered by a chosen table
    chosen = []               # tables picked so far, in pick order
    gathered = set()          # column names of every chosen table

    while remaining:
        scored = cls.list_partial_tables(remaining, api_obj)
        ranked = sorted(scored.items(), key=operator.itemgetter(1),
                        reverse=True)

        # start with the highest scoring candidate
        position = 0
        table = ranked[position][0]

        if chosen:
            # walk down the ranking until a table shares a column name
            # with what has been gathered already
            while gathered.isdisjoint(str(col.key) for col in get_columns(table)):
                position += 1
                if position >= len(ranked):
                    raise DataUSAException("can't join tables!")
                table = ranked[position][0]

        chosen.append(table)
        fresh = set(str(col.key) for col in get_columns(table))
        gathered |= fresh
        # whatever this table provides no longer needs to be looked for
        remaining = remaining - fresh

    return chosen
def find_overlap(tbl1, tbl2):
    '''Return the set of column keys that ``tbl1`` and ``tbl2`` have in common.

    Columns are compared purely by their ``key`` attribute, as reported by
    ``get_columns`` for each table.
    '''
    first_keys = {column.key for column in get_columns(tbl1)}
    second_keys = {column.key for column in get_columns(tbl2)}
    return first_keys & second_keys
def query(table, api_obj):
    '''Build and run the query for ``api_obj`` against ``table``.

    Filters are collected from ``process_value_filters``, ``where_filters``
    and ``sumlevel_filtering``; when any show level differs from
    ``consts.ALL`` and the table provides ``gen_show_level_filters``, those
    filters are added as well.

    The selected columns are the table's primary keys plus
    ``api_obj.values`` when values were requested, otherwise everything
    ``get_columns(table)`` reports.

    Returns the result of ``simple_format(table, cols, data, api_obj.subs)``.
    Raises DataUSAException when ``api_obj.order`` is not a known variable.
    '''
    values = api_obj.values
    shows_and_levels = api_obj.shows_and_levels

    filters = process_value_filters(table, api_obj.vars_and_vals)
    filters += where_filters(table, api_obj.where)
    filters += sumlevel_filtering(table, api_obj)

    if values:
        pk = [col for col in table.__table__.columns if col.primary_key]
        cols = pk + values
    else:
        cols = get_columns(table)

    needs_show_filter = any(v != consts.ALL for v in shows_and_levels.values())
    if needs_show_filter and hasattr(table, "gen_show_level_filters"):
        filters += table.gen_show_level_filters(shows_and_levels)

    qry = table.query.with_entities(*cols).filter(*filters)

    if api_obj.order:
        # The order value is pasted into the ORDER BY clause as text, so it
        # must be checked against the known variable names first. This is
        # the same check (and the same error) the other versions of query()
        # in this file apply.
        order_is_known = (api_obj.order in TableManager.possible_variables
                          or api_obj.order == 'abs(pct_change)')
        if not order_is_known:
            raise DataUSAException("Bad order parameter", api_obj.order)
        sort = "desc" if api_obj.sort == "desc" else "asc"
        qry = qry.order_by("{} {}".format(api_obj.order, sort))

    if api_obj.limit:
        qry = qry.limit(api_obj.limit)

    data = qry.all()
    return simple_format(table, cols, data, api_obj.subs)
def query(table, api_obj, stream=False):
    '''Build the query for ``api_obj`` against ``table`` and format the result.

    Filters are collected from ``process_value_filters``, ``where_filters``
    and ``sumlevel_filtering``. The selected columns are the table's primary
    keys (those not already listed in ``api_obj.values``) plus the requested
    values, or everything from ``get_columns(table)`` when no values were
    requested; anything named in ``api_obj.exclude`` is then dropped.

    Optional table hooks are honoured when present: ``crosswalk_join`` is
    applied to the base query and ``JOINED_FILTER`` routes the filters
    through ``handle_join``.

    Returns ``stream_format(...)`` when ``stream`` is true, otherwise
    ``simple_format(...)``.
    Raises DataUSAException when ``api_obj.order`` is neither a known
    variable nor the literal ``'abs(pct_change)'``.
    '''
    vars_and_vals = api_obj.vars_and_vals
    values = api_obj.values
    exclude = api_obj.exclude

    filters = process_value_filters(table, vars_and_vals, api_obj)
    filters += where_filters(table, api_obj.where)
    filters += sumlevel_filtering(table, api_obj)

    if values:
        pk = [col for col in table.__table__.columns
              if col.primary_key and col.key not in values]
        cols = pk + values
    else:
        cols = get_columns(table)

    if exclude:
        # Entries of cols may be plain names (the requested values) or
        # column objects. Resolve the name first and test it once: the
        # previous "(is string and not excluded) or col.key not excluded"
        # form fell through to ``col.key`` on a string whenever that string
        # WAS excluded, raising AttributeError.
        cols = [col for col in cols
                if (col if isinstance(col, basestring) else col.key) not in exclude]

    qry = table.query
    if hasattr(table, "crosswalk_join"):
        qry = table.crosswalk_join(qry)

    if stream:
        qry, cols = use_attr_names(table, qry, cols)
    qry = qry.with_entities(*cols)

    if hasattr(table, "JOINED_FILTER"):
        qry, filters = handle_join(qry, filters, table, api_obj)

    qry = qry.filter(*filters)

    if api_obj.order:
        sort = "desc" if api_obj.sort == "desc" else "asc"
        # The order value ends up in a textual ORDER BY clause, so only
        # known variable names (or the one allowed expression) may pass.
        order_is_known = (api_obj.order in TableManager.possible_variables
                          or api_obj.order == 'abs(pct_change)')
        if not order_is_known:
            raise DataUSAException("Bad order parameter", api_obj.order)
        sort_stmt = text("{} {} NULLS LAST".format(api_obj.order, sort))
        qry = qry.order_by(sort_stmt)

    if api_obj.limit:
        qry = qry.limit(api_obj.limit)

    if stream:
        return stream_format(table, cols, qry, api_obj)
    return simple_format(table, cols, qry, api_obj)
def query(table, api_obj, stream=False):
    '''Build the query for ``api_obj`` against ``table`` and format the result.

    Filters are collected from ``process_value_filters``, ``where_filters``
    and ``sumlevel_filtering``. The selected columns are the table's primary
    keys (those not already listed in ``api_obj.values``) plus the table
    attributes named by the requested values, or everything from
    ``get_columns(table)`` when no values were requested; anything named in
    ``api_obj.exclude`` is then dropped.

    Optional table hooks are honoured when present: ``crosswalk_join`` is
    applied to the base query and ``JOINED_FILTER`` routes the filters
    through ``handle_join``. Columns go through ``use_attr_names`` when
    streaming or when ``api_obj.display_names`` is set.

    Returns ``stream_format(...)`` when ``stream`` is true, otherwise
    ``simple_format(...)``.
    Raises DataUSAException when ``api_obj.order`` is neither a known
    variable nor the literal ``'abs(pct_change)'``.
    '''
    vars_and_vals = api_obj.vars_and_vals
    values = api_obj.values
    exclude = api_obj.exclude

    filters = process_value_filters(table, vars_and_vals, api_obj)
    filters += where_filters(table, api_obj.where)
    filters += sumlevel_filtering(table, api_obj)

    if values:
        pk = [col for col in table.__table__.columns
              if col.primary_key and col.key not in values]
        cols = pk + [getattr(table, col_name) for col_name in values]
    else:
        cols = get_columns(table)

    if exclude:
        # Resolve each entry to its name first and test it once. The
        # previous "(is string and not excluded) or col.key not excluded"
        # form fell through to ``col.key`` on a string whenever that string
        # WAS excluded, raising AttributeError.
        cols = [col for col in cols
                if (col if isinstance(col, basestring) else col.key) not in exclude]

    qry = table.query
    if hasattr(table, "crosswalk_join"):
        qry = table.crosswalk_join(qry)

    if stream or api_obj.display_names:
        qry, cols = use_attr_names(table, qry, cols)
    qry = qry.with_entities(*cols)

    if hasattr(table, "JOINED_FILTER"):
        qry, filters = handle_join(qry, filters, table, api_obj)

    qry = qry.filter(*filters)

    if api_obj.order:
        sort = desc if api_obj.sort == "desc" else asc
        # Only the one allowed expression or a known variable name may be
        # used for ordering; everything else is rejected.
        if api_obj.order == 'abs(pct_change)':
            target_col = func.abs(table.pct_change)
        elif api_obj.order in TableManager.possible_variables:
            target_col = getattr(table, api_obj.order)
        else:
            raise DataUSAException("Bad order parameter", api_obj.order)
        qry = qry.order_by(sort(target_col).nullslast())

    if api_obj.limit:
        qry = qry.limit(api_obj.limit)

    if stream:
        return stream_format(table, cols, qry, api_obj)
    return simple_format(table, cols, qry, api_obj)
def table_has_some_cols(cls, table, vars_needed):
    '''Count how many of the needed variables are columns of ``table``.

    Columns are matched by their ``key`` as reported by ``get_columns``.

    Returns the size of the overlap as an int; ``0`` when the table has none
    of the variables. (Previously ``None`` was returned for "no overlap",
    which cannot be compared with ``> 0`` on Python 3; ``0`` is equally
    falsy, so truthiness checks behave as before.)
    '''
    table_keys = set(col.key for col in get_columns(table))
    return len(table_keys.intersection(vars_needed))
class TableManager(object):
    '''Chooses which of the registered tables can answer an API request.'''

    # every distinct column key found across the registered models
    possible_variables = list(set([col.key for t in registered_models for col in get_columns(t)]))
    table_years_set = tbl_years_set()
    table_years = tbl_years()
    # table_sizes = tbl_sizes()

    @classmethod
    def schema_selector(cls, api_obj):
        '''Rewrite ``api_obj.force`` so it names a concrete ACS schema.

        A forced table in the generic ``acs`` schema is pointed at the
        1-year schema when ``api_obj.force_schema`` is set (and returned
        immediately), otherwise at the 5-year schema. A forced table that
        does not start with the 5-year schema name may additionally be
        switched to the 1-year schema when only nation/state geos are
        requested and the 1-year table is listed in ``cls.table_years``.

        Mutates and returns ``api_obj``.
        '''
        # -- If there is a force to an "acs" table (defaults to 5-year)
        # determine if we can instead use the acs 1 year estimate
        # schema.
        has_force = hasattr(api_obj, "force") and api_obj.force
        if has_force:
            schema, tblname = api_obj.force.split(".")
            if schema == 'acs':
                if api_obj.force_schema:
                    schema = BaseAcs1.schema_name
                    api_obj.force = "{}.{}".format(schema, tblname)
                    api_obj.subs["force"] = schema
                    return api_obj
                else:
                    schema = BaseAcs5.schema_name
                    api_obj.force = "{}.{}".format(schema, tblname)
                    api_obj.subs["force"] = schema
        # The "geo" membership test keeps a request without a geo filter
        # from failing with KeyError below.
        if (has_force and api_obj.vars_and_vals
                and "geo" in api_obj.vars_and_vals
                and not api_obj.force.startswith(BaseAcs5.schema_name)):
            # Applied fix 5.9.16
            if schema and schema in [BaseAcs5.schema_name, BaseAcs3.schema_name]:
                gvals = api_obj.vars_and_vals["geo"].split(",")
                nation_state_only = all(v[:3] in ["010", "040"] for v in gvals)
                not_ygi_ygo = "ygo" not in tblname and "ygi" not in tblname
                if schema != BaseAcs1.schema_name and nation_state_only and not_ygi_ygo:
                    new_fullname = "{}.{}".format(BaseAcs1.schema_name, tblname)
                    if new_fullname in cls.table_years:
                        api_obj.force = new_fullname
                        api_obj.subs["force"] = api_obj.force
        return api_obj

    @classmethod
    def table_can_show(cls, table, api_obj):
        '''Tell whether ``table`` supports the levels and geos being requested.

        Returns False when the table is in an ACS schema other than the
        forced one, when an ACS table does not support the level of one of
        the requested geos, when a show column or its level is unsupported,
        or when a table is forced and this is not it. Otherwise True.
        '''
        shows_and_levels = api_obj.shows_and_levels
        supported_levels = table.get_supported_levels()
        vars_and_vals = api_obj.vars_and_vals
        required_geos = [] if "geo" not in vars_and_vals else vars_and_vals["geo"].split(",")
        if table.get_schema_name().startswith("acs"):
            if api_obj.force_schema and table.schema_name != api_obj.force_schema:
                return False
        if table.__table_args__["schema"] in [BaseAcs5.schema_name, BaseAcs1.schema_name, BaseAcs3.schema_name] and required_geos:
            # the first three characters of a geo id identify its level
            need_to_support = set([my_geo[:3] for my_geo in required_geos])
            required_levels = [consts.LEVEL_TO_GEO[slvl] for slvl in need_to_support]
            cond_check = [x in supported_levels["geo"] for x in required_levels]
            result = all(cond_check)
            if not result:
                return False
        for show_col, show_level in shows_and_levels.items():
            if show_col not in supported_levels:
                return False
            else:
                if show_level not in supported_levels[show_col]:
                    return False
        if api_obj.force and table.full_name() != api_obj.force:
            return False
        return True

    @classmethod
    def required_tables(cls, api_obj):
        '''Greedily pick tables until every needed variable is covered.

        The needed variables are ``api_obj.vars_needed`` plus
        ``api_obj.where_vars()``, plus ``api_obj.order`` when it is a known
        variable. On every round the candidates from
        ``list_partial_tables`` are ranked by score (highest first) and the
        best one is taken; once a table has been chosen, later tables must
        share at least one column name with the columns gathered so far.

        Returns the chosen tables in pick order.
        Raises DataUSAException when no remaining candidate is joinable.
        '''
        vars_needed = api_obj.vars_needed + api_obj.where_vars()
        if api_obj.order and api_obj.order in cls.possible_variables:
            vars_needed = vars_needed + [api_obj.order]
        universe = set(vars_needed)
        tables_to_use = []
        table_cols = set()
        while universe:
            # first find the tables with biggest overlap
            candidates = cls.list_partial_tables(universe, api_obj)
            top_choices = sorted(candidates.items(), key=operator.itemgetter(1), reverse=True)
            # take the table with the biggest overlap
            tbl, overlap = top_choices.pop(0)
            # ensure the tables are joinable, for now that means
            # having at least one column with the same name
            if tables_to_use:
                while table_cols.isdisjoint(str(c.key) for c in get_columns(tbl)):
                    if top_choices:
                        tbl, overlap = top_choices.pop(0)
                    else:
                        raise DataUSAException("can't join tables!")
            tables_to_use.append(tbl)
            tmp_cols = set(str(c.key) for c in get_columns(tbl))
            table_cols |= tmp_cols
            # remove the acquired columns from the universe
            universe = universe - tmp_cols
        return tables_to_use

    @classmethod
    def list_partial_tables(cls, vars_needed, api_obj):
        '''Score every registered table that has some of ``vars_needed``.

        Only tables accepted by ``table_can_show`` are kept. The score is
        the overlap size minus a penalty derived from ``median_moe``.

        Returns a dict mapping table -> score.
        Raises DataUSAException when no table qualifies.
        '''
        candidates = {}
        for table in registered_models:
            overlap_size = TableManager.table_has_some_cols(table, vars_needed)
            if not overlap_size:
                continue
            if TableManager.table_can_show(table, api_obj):
                # to break ties, we'll use median moe to penalize and subtract
                # since larger values will be chosen first.
                # NOTE(review): for 0 < median_moe < 1 this penalty is
                # negative and therefore raises the score - confirm intended.
                penalty = (1 - (1.0 / table.median_moe)) if table.median_moe > 0 else 0
                candidates[table] = overlap_size - penalty
        if not candidates:
            raise DataUSAException("No tables can match the specified query.")
        return candidates

    @classmethod
    def table_has_some_cols(cls, table, vars_needed):
        '''Count how many of ``vars_needed`` are column keys of ``table``.

        Returns an int, ``0`` when there is no overlap (previously ``None``,
        which ``list_partial_tables`` then compared with ``> 0``).
        '''
        cols = set(col.key for col in get_columns(table))
        return len(cols.intersection(vars_needed))

    @classmethod
    def table_has_cols(cls, table, vars_needed):
        '''Tell whether every one of ``vars_needed`` is a column key of ``table``.'''
        table_cols = get_columns(table)
        cols = set([col.key for col in table_cols])
        return set(vars_needed).issubset(cols)

    @classmethod
    def select_best(cls, table_list, api_obj):
        '''Pick the table to use from an already ordered ``table_list``.

        Normally this is simply the first entry. When the first entry is a
        special-cased 1-year table and specific geos were requested, the
        1-year table is returned only if every fetched value of the target
        column is truthy; otherwise its 5-year counterpart is returned.
        '''
        special_cases = {
            Acs1_Yg_Conflict.full_name(): (Acs1_Yg_Conflict, Acs5_Yg_Conflict, "conflict_total")
        }
        first_name = table_list[0].full_name()
        if first_name in special_cases:
            # if we are in a table with missing 1yr data
            has_specific_geo = api_obj.vars_and_vals and "geo" in api_obj.vars_and_vals
            if has_specific_geo:
                tbl1, tbl5, target_col = special_cases[first_name]
                geos = api_obj.vars_and_vals["geo"].split(consts.OR)
                # NOTE(review): all() over an empty result is True, so geos
                # with no rows at all in tbl1 still select tbl1 - confirm.
                all_have_data = all([datum for datum, in tbl1.query.with_entities(target_col).filter(tbl1.geo.in_(geos))])
                return tbl1 if all_have_data else tbl5
        # Ordering is sorted in table_list based on moe
        return table_list[0]

    @classmethod
    def all_tables(cls, api_obj):
        '''List every registered table that can fully answer the request.

        A table qualifies when it has all needed variables (including
        ``api_obj.order`` when that is a known variable) and passes
        ``table_can_show``.

        Returns the qualifying tables sorted by ``median_moe`` (ascending).
        Raises DataUSAException when none qualifies.
        '''
        vars_needed = api_obj.vars_needed
        # Extend the list once, up front; doing it inside the loop appended
        # another copy of the order column for every registered table.
        if api_obj.order and api_obj.order in cls.possible_variables:
            vars_needed = vars_needed + [api_obj.order]
        candidates = []
        for table in registered_models:
            if TableManager.table_has_cols(table, vars_needed):
                if TableManager.table_can_show(table, api_obj):
                    candidates.append(table)
        candidates = sorted(candidates, key=attrgetter('median_moe'))
        if not candidates:
            raise DataUSAException("No tables can match the specified query.")
        return candidates

    @classmethod
    def multi_crosswalk(cls, tables, api_obj):
        '''Run ``crosswalker.crosswalk`` for each table in turn, threading ``api_obj`` through.'''
        for tbl in tables:
            api_obj = crosswalker.crosswalk(tbl, api_obj)
        return api_obj

    @classmethod
    def crosswalk(cls, table, api_obj):
        '''Delegate to ``crosswalker.crosswalk`` for a single table.'''
        return crosswalker.crosswalk(table, api_obj)

    @classmethod
    def force_1yr_for_big_places(cls, api_obj):
        '''Set ``api_obj.force_schema`` to the ACS 1-year schema when possible.

        Nothing is changed for tract-level requests, for requests without a
        geo filter, for earnings-by-industry or language requests, or when a
        non-``acs.`` table is forced. Otherwise the 1-year schema is forced
        if every requested geo satisfies ``is_big_geo``.

        Mutates and returns ``api_obj``.
        '''
        # -- if we are trying to look at tracts, that data is only available
        # -- at 5yr resolution.
        if api_obj.shows_and_levels and "geo" in api_obj.shows_and_levels and api_obj.shows_and_levels["geo"] == "tract":
            return api_obj
        if api_obj.vars_and_vals and "geo" in api_obj.vars_and_vals:
            cond_a = "med_earnings" in api_obj.values or "med_earnings_moe" in api_obj.values
            cond_b = "acs_ind" in api_obj.shows_and_levels
            # The show key was previously spelled "languge", so this first
            # test could never be true.
            cond_c = any([
                "language" in api_obj.shows_and_levels,
                "language" in api_obj.values,
                "num_speakers" in api_obj.values,
                "num_speakers_moe" in api_obj.values,
                "num_speakers_rca" in api_obj.values,
            ])
            if (cond_a and cond_b) or cond_c:
                return api_obj
            geos = api_obj.vars_and_vals["geo"].split(consts.OR)
            if not api_obj.force or api_obj.force.startswith("acs."):
                can_use_1yr = all(is_big_geo(geo) for geo in geos)
                if can_use_1yr:
                    api_obj.force_schema = BaseAcs1.schema_name
        return api_obj
def table_has_cols(cls, table, vars_needed):
    '''Tell whether ``table`` has a column for every entry of ``vars_needed``.

    Columns are matched by their ``key`` as reported by ``get_columns``.
    Returns True when all needed variables are present, False otherwise.
    '''
    available = {column.key for column in get_columns(table)}
    return set(vars_needed) <= available
def table_has_cols(cls, table, vars_needed):
    '''Check that none of ``vars_needed`` is missing from ``table``.

    The table's columns come from ``get_columns`` and are compared by their
    ``key``. Returns True when nothing is missing, False otherwise.
    '''
    available = frozenset(column.key for column in get_columns(table))
    missing = set(vars_needed) - available
    return not missing
class TableManager(object):
    '''Chooses which of the registered tables can answer an API request.'''

    # column keys found across the registered models (may contain repeats)
    possible_variables = [
        col.key for t in registered_models for col in get_columns(t)
    ]
    table_years = tbl_years()
    # table_sizes = tbl_sizes()

    @classmethod
    def schema_selector(cls, api_obj):
        '''Rewrite ``api_obj.force`` so it names a concrete ACS schema.

        A forced table in the generic ``acs`` schema is pointed at the
        1-year schema when ``api_obj.force_schema`` is set (and returned
        immediately), otherwise at the 5-year schema. A forced table that
        does not start with the 5-year schema name may additionally be
        switched to the 1-year schema when only nation/state geos are
        requested and the 1-year table is listed in ``cls.table_years``.

        Mutates and returns ``api_obj``.
        '''
        # -- If there is a force to an "acs" table (defaults to 5-year)
        # determine if we can instead use the acs 1 year estimate
        # schema.
        has_force = hasattr(api_obj, "force") and api_obj.force
        if has_force:
            schema, tblname = api_obj.force.split(".")
            if schema == 'acs':
                if api_obj.force_schema:
                    schema = BaseAcs1.schema_name
                    api_obj.force = "{}.{}".format(schema, tblname)
                    api_obj.subs["force"] = schema
                    return api_obj
                else:
                    schema = BaseAcs5.schema_name
                    api_obj.force = "{}.{}".format(schema, tblname)
                    api_obj.subs["force"] = schema
        # The "geo" membership test keeps a request without a geo filter
        # from failing with KeyError below.
        if (has_force and api_obj.vars_and_vals
                and "geo" in api_obj.vars_and_vals
                and not api_obj.force.startswith(BaseAcs5.schema_name)):
            # Applied fix 5.9.16
            if schema and schema in [BaseAcs5.schema_name, BaseAcs3.schema_name]:
                gvals = api_obj.vars_and_vals["geo"].split(",")
                nation_state_only = all(v[:3] in ["010", "040"] for v in gvals)
                not_ygi_ygo = "ygo" not in tblname and "ygi" not in tblname
                if schema != BaseAcs1.schema_name and nation_state_only and not_ygi_ygo:
                    new_fullname = "{}.{}".format(BaseAcs1.schema_name, tblname)
                    if new_fullname in cls.table_years:
                        api_obj.force = new_fullname
                        api_obj.subs["force"] = api_obj.force
        return api_obj

    @classmethod
    def table_can_show(cls, table, api_obj):
        '''Tell whether ``table`` supports the levels and geos being requested.

        Returns False when the table is in an ACS schema other than the
        forced one, when an ACS table does not support the level of one of
        the requested geos, when a show column or its level is unsupported,
        or when a table is forced and this is not it. Otherwise True.
        '''
        shows_and_levels = api_obj.shows_and_levels
        supported_levels = table.get_supported_levels()
        vars_and_vals = api_obj.vars_and_vals
        required_geos = [] if "geo" not in vars_and_vals else vars_and_vals["geo"].split(",")
        if table.get_schema_name().startswith("acs"):
            if api_obj.force_schema and table.schema_name != api_obj.force_schema:
                return False
        if table.__table_args__["schema"] in [
                BaseAcs5.schema_name, BaseAcs1.schema_name, BaseAcs3.schema_name
        ] and required_geos:
            # the first three characters of a geo id identify its level
            need_to_support = set([my_geo[:3] for my_geo in required_geos])
            required_levels = [
                consts.LEVEL_TO_GEO[slvl] for slvl in need_to_support
            ]
            cond_check = [
                x in supported_levels["geo"] for x in required_levels
            ]
            result = all(cond_check)
            if not result:
                return False
        for show_col, show_level in shows_and_levels.items():
            if show_col not in supported_levels:
                return False
            else:
                if show_level not in supported_levels[show_col]:
                    return False
        if api_obj.force and table.full_name() != api_obj.force:
            return False
        return True

    @classmethod
    def table_has_cols(cls, table, vars_needed):
        '''Tell whether every one of ``vars_needed`` is a column key of ``table``.'''
        table_cols = get_columns(table)
        cols = set([col.key for col in table_cols])
        return set(vars_needed).issubset(cols)

    @classmethod
    def select_best(cls, table_list, api_obj):
        '''Return the first entry of ``table_list`` (``all_tables`` sorts it).'''
        # Ordering is sorted in all_tables
        return table_list[0]

    @classmethod
    def all_tables(cls, api_obj):
        '''List every registered table that can fully answer the request.

        A table qualifies when it has all needed variables (including
        ``api_obj.order`` when that is a known variable) and passes
        ``table_can_show``.

        Returns the qualifying tables sorted by ``median_moe`` (ascending).
        Raises DataUSAException when none qualifies.
        '''
        vars_needed = api_obj.vars_needed
        # Extend the list once, up front; doing it inside the loop appended
        # another copy of the order column for every registered table.
        if api_obj.order and api_obj.order in cls.possible_variables:
            vars_needed = vars_needed + [api_obj.order]
        candidates = []
        for table in registered_models:
            if TableManager.table_has_cols(table, vars_needed):
                if TableManager.table_can_show(table, api_obj):
                    candidates.append(table)
        candidates = sorted(candidates, key=attrgetter('median_moe'))
        if not candidates:
            raise DataUSAException("No tables can match the specified query.")
        return candidates

    @classmethod
    def crosswalk(cls, table, api_obj):
        '''Delegate to ``crosswalker.crosswalk`` for a single table.'''
        return crosswalker.crosswalk(table, api_obj)

    @classmethod
    def force_1yr_for_big_places(cls, api_obj):
        '''Set ``api_obj.force_schema`` to the ACS 1-year schema when possible.

        Nothing is changed for tract-level requests, for requests without a
        geo filter, for earnings-by-industry or language requests, or when a
        non-``acs.`` table is forced. Otherwise the 1-year schema is forced
        if every requested geo satisfies ``is_big_geo``.

        Mutates and returns ``api_obj``.
        '''
        # -- if we are trying to look at tracts, that data is only available
        # -- at 5yr resolution.
        if api_obj.shows_and_levels and "geo" in api_obj.shows_and_levels and api_obj.shows_and_levels["geo"] == "tract":
            return api_obj
        if api_obj.vars_and_vals and "geo" in api_obj.vars_and_vals:
            cond_a = "med_earnings" in api_obj.values or "med_earnings_moe" in api_obj.values
            cond_b = "acs_ind" in api_obj.shows_and_levels
            # The show key was previously spelled "languge", so this first
            # test could never be true.
            cond_c = any([
                "language" in api_obj.shows_and_levels,
                "language" in api_obj.values,
                "num_speakers" in api_obj.values,
                "num_speakers_moe" in api_obj.values,
                "num_speakers_rca" in api_obj.values,
            ])
            if (cond_a and cond_b) or cond_c:
                return api_obj
            geos = api_obj.vars_and_vals["geo"].split(consts.OR)
            if not api_obj.force or api_obj.force.startswith("acs."):
                can_use_1yr = all(is_big_geo(geo) for geo in geos)
                if can_use_1yr:
                    api_obj.force_schema = BaseAcs1.schema_name
        return api_obj