Esempio n. 1
0
 def required_tables(cls, api_obj):
     '''Greedily choose a set of mutually joinable tables whose columns
     together cover every variable the query requires.'''
     # Variables the query must produce: requested vars, WHERE vars, and
     # the ordering column when it is a recognized variable.
     needed = api_obj.vars_needed + api_obj.where_vars()
     if api_obj.order and api_obj.order in cls.possible_variables:
         needed = needed + [api_obj.order]
     remaining = set(needed)
     chosen = []
     known_cols = []
     # Keep picking tables until no required variable is left uncovered.
     while remaining:
         # Rank candidates by how many outstanding variables each covers.
         ranked = sorted(cls.list_partial_tables(remaining, api_obj).items(),
                         key=lambda pair: pair[1], reverse=True)
         table, score = ranked.pop(0)
         # Any table after the first must share at least one column name
         # with the tables already picked so that a join is possible;
         # otherwise fall back to the next-best candidate.
         if chosen:
             known = set(known_cols)
             while not any(str(c.key) in known for c in get_columns(table)):
                 if not ranked:
                     raise DataUSAException("can't join tables!")
                 table, score = ranked.pop(0)
         chosen.append(table)
         covered = [str(c.key) for c in get_columns(table)]
         known_cols += covered
         # These variables no longer need to be sourced from another table.
         remaining = remaining - set(covered)
     return chosen
Esempio n. 2
0
 def required_tables(cls, api_obj):
     '''Greedily select a set of joinable tables that together cover every
     variable the query needs (requested vars, WHERE vars, and the order
     column when it is a recognized variable).

     Raises DataUSAException when no joinable combination can be found.
     '''
     vars_needed = api_obj.vars_needed + api_obj.where_vars()
     # Only include the ORDER BY column if it is a known variable.
     if api_obj.order and api_obj.order in cls.possible_variables:
         vars_needed = vars_needed + [api_obj.order]
     universe = set(vars_needed)
     tables_to_use = []
     table_cols = []
     # Make a set of the variables that will be needed to answer the query
     while universe:
         # first find the tables with biggest overlap
         candidates = cls.list_partial_tables(universe, api_obj)
         # raise Exception(candidates)
         top_choices = sorted(candidates.items(), key=operator.itemgetter(1),
                              reverse=True)
         # take the table with the biggest overlap
         tbl, overlap = top_choices.pop(0)
         # ensure the tables are joinable, for now that means
         # having atleast one column with the same name
         if tables_to_use:
             while not set(table_cols).intersection([str(c.key) for c in get_columns(tbl)]):
                 if top_choices:
                     # no shared column: try the next-best candidate
                     tbl, overlap = top_choices.pop(0)
                 else:
                     raise DataUSAException("can't join tables!")
         tables_to_use.append(tbl)
         tmp_cols = [str(c.key) for c in get_columns(tbl)]
         table_cols += tmp_cols
         # remove the acquired columns from the universe
         universe = universe - set(tmp_cols)
     return tables_to_use
Esempio n. 3
0
def find_overlap(tbl1, tbl2):
    '''Return the set of column names shared by the two table objects.'''
    names1 = {col.key for col in get_columns(tbl1)}
    names2 = {col.key for col in get_columns(tbl2)}
    return names1.intersection(names2)
Esempio n. 4
0
def query(table, api_obj):
    '''Build a filtered, ordered, limited query against *table* and return
    the rows in the simple output format.'''
    vars_and_vals = api_obj.vars_and_vals
    shows_and_levels = api_obj.shows_and_levels
    values = api_obj.values

    # Accumulate value, WHERE, and summary-level filters in that order.
    filters = process_value_filters(table, vars_and_vals)
    filters += where_filters(table, api_obj.where)
    filters += sumlevel_filtering(table, api_obj)

    if values:
        # Specific value columns requested: keep the primary key columns
        # so each row stays identifiable.
        pk = [col for col in table.__table__.columns if col.primary_key]
        cols = pk + values
    else:
        cols = get_columns(table)

    # A show filter is required unless every show level is the ALL wildcard.
    needs_show_filter = any([v != consts.ALL for v in shows_and_levels.values()])

    if needs_show_filter and hasattr(table, "gen_show_level_filters"):
        filters += table.gen_show_level_filters(shows_and_levels)

    qry = table.query.with_entities(*cols).filter(*filters)
    if api_obj.order:
        # Direction is whitelisted to "asc"/"desc"; anything else means asc.
        sort = "desc" if api_obj.sort == "desc" else "asc"
        # NOTE(review): api_obj.order is interpolated into a raw order-by
        # string without validation here -- confirm it is sanitized
        # upstream, otherwise this is a SQL-injection vector.
        qry = qry.order_by("{} {}".format(api_obj.order, sort))
    if api_obj.limit:
        qry = qry.limit(api_obj.limit)

    data = qry.all()
    return simple_format(table, cols, data, api_obj.subs)
Esempio n. 5
0
def query(table, api_obj, stream=False):
    '''Build a filtered, ordered, limited query against *table*.

    When *stream* is true, column names are mapped via use_attr_names and
    the rows are returned in the streaming format; otherwise the simple
    format is used.
    '''
    vars_and_vals = api_obj.vars_and_vals
    shows_and_levels = api_obj.shows_and_levels
    values = api_obj.values
    exclude = api_obj.exclude

    # Accumulate value, WHERE, and summary-level filters in that order.
    filters = process_value_filters(table, vars_and_vals, api_obj)
    filters += where_filters(table, api_obj.where)
    filters += sumlevel_filtering(table, api_obj)

    if values:
        # Keep primary-key columns for row identity, but avoid duplicating
        # any that were explicitly requested as values.
        pk = [
            col for col in table.__table__.columns
            if col.primary_key and col.key not in values
        ]
        cols = pk + values
    else:
        cols = get_columns(table)

    if exclude:
        # Entries in cols may be plain strings or column objects, hence
        # the two-way membership test.
        cols = [
            col for col in cols
            if (isinstance(col, basestring) and col not in exclude)
            or col.key not in exclude
        ]

    # qry = table.query.with_entities(*cols)
    qry = table.query

    if hasattr(table, "crosswalk_join"):
        qry = table.crosswalk_join(qry)

    if stream:
        qry, cols = use_attr_names(table, qry, cols)
    qry = qry.with_entities(*cols)

    if hasattr(table, "JOINED_FILTER"):
        qry, filters = handle_join(qry, filters, table, api_obj)

    qry = qry.filter(*filters)

    if api_obj.order:
        # Direction is whitelisted to "asc"/"desc"; anything else means asc.
        sort = "desc" if api_obj.sort == "desc" else "asc"
        # Only known variables (or the special abs(pct_change)) may be
        # used as the order column.
        if api_obj.order not in TableManager.possible_variables:
            if api_obj.order == 'abs(pct_change)':
                pass  # allow this
            else:
                raise DataUSAException("Bad order parameter", api_obj.order)
        sort_stmt = text("{} {} NULLS LAST".format(api_obj.order, sort))
        qry = qry.order_by(sort_stmt)
    if api_obj.limit:
        qry = qry.limit(api_obj.limit)

    if stream:
        return stream_format(table, cols, qry, api_obj)

    return simple_format(table, cols, qry, api_obj)
Esempio n. 6
0
def query(table, api_obj, stream=False):
    '''Build a filtered, ordered, limited query against *table*.

    When *stream* is true (or display names are requested), column names
    are mapped via use_attr_names; streaming requests are returned in the
    streaming format, everything else in the simple format.
    '''
    vars_and_vals = api_obj.vars_and_vals
    values = api_obj.values
    exclude = api_obj.exclude

    # Accumulate value, WHERE, and summary-level filters in that order.
    filters = process_value_filters(table, vars_and_vals, api_obj)
    filters += where_filters(table, api_obj.where)
    filters += sumlevel_filtering(table, api_obj)

    if values:
        # Keep primary-key columns for row identity, but avoid duplicating
        # any that were explicitly requested as values.
        pk = [col for col in table.__table__.columns if col.primary_key and col.key not in values]
        cols = pk + [getattr(table, col_name) for col_name in values]
    else:
        cols = get_columns(table)

    if exclude:
        # Entries in cols may be plain strings or column objects, hence
        # the two-way membership test.
        cols = [col for col in cols
                if (isinstance(col, basestring) and col not in exclude) or col.key not in exclude]

    # qry = table.query.with_entities(*cols)
    qry = table.query

    if hasattr(table, "crosswalk_join"):
        qry = table.crosswalk_join(qry)

    if stream or api_obj.display_names:
        qry, cols = use_attr_names(table, qry, cols)
    qry = qry.with_entities(*cols)

    if hasattr(table, "JOINED_FILTER"):
        qry, filters = handle_join(qry, filters, table, api_obj)

    qry = qry.filter(*filters)

    if api_obj.order:
        # sort is the SQLAlchemy asc/desc callable, not a string.
        sort = desc if api_obj.sort == "desc" else asc
        # Only known variables (or the special abs(pct_change)) may be
        # used as the order column.
        if api_obj.order not in TableManager.possible_variables:
            if api_obj.order == 'abs(pct_change)':
                pass  # allow this
            else:
                raise DataUSAException("Bad order parameter", api_obj.order)
        # sort_stmt = text("{} {} NULLS LAST".format(api_obj.order, sort))
        if api_obj.order == 'abs(pct_change)':
            target_col = func.abs(table.pct_change)
        else:
            target_col = getattr(table, api_obj.order)

        # NULLS LAST keeps missing data at the bottom for either direction.
        qry = qry.order_by(sort(target_col).nullslast())
    if api_obj.limit:
        qry = qry.limit(api_obj.limit)

    if stream:
        return stream_format(table, cols, qry, api_obj)

    return simple_format(table, cols, qry, api_obj)
Esempio n. 7
0
    def table_has_some_cols(cls, table, vars_needed):
        '''
        Count how many of the required variables are columns of *table*.

        Returns the size of the overlap as an int; 0 when the table shares
        no columns with *vars_needed*.  The previous implementation
        returned None for the empty case, which callers compared with
        ``> 0`` -- that comparison raises TypeError on Python 3, so an
        integer 0 (same truthiness under ``> 0``) is returned instead.
        '''
        cols = {col.key for col in get_columns(table)}
        return len(set(vars_needed).intersection(cols))
Esempio n. 8
0
    def table_has_some_cols(cls, table, vars_needed):
        '''
        Measure how many of the required variables appear as columns of
        *table*.  Returns the overlap size, or None when nothing matches.
        '''
        available = set(col.key for col in get_columns(table))
        overlap = available.intersection(vars_needed)
        if not overlap:
            return None  # TODO review this sentinel vs returning 0
        return len(overlap)
Esempio n. 9
0
class TableManager(object):
    '''Selects and prepares the table(s) best able to answer an API query,
    based on column coverage, supported levels, and margin of error.'''
    # Every column name available across all registered models (deduped).
    possible_variables = list(set([col.key for t in registered_models
                          for col in get_columns(t)]))
    table_years_set = tbl_years_set()
    table_years = tbl_years()

    # table_sizes = tbl_sizes()
    @classmethod
    def schema_selector(cls, api_obj):
        '''Rewrite api_obj.force onto a concrete ACS schema.

        The generic "acs" schema is mapped to the 1-year estimates when
        force_schema is set (otherwise 5-year); 5yr/3yr forces are then
        upgraded to 1yr when the requested geos and table allow it.
        '''
        # -- If there is a force to an "acs" table (defaults to 5-year)
        #    determine if we can instead use the acs 1 year estimate
        #    schema.
        has_force =  hasattr(api_obj, "force") and api_obj.force
        if has_force:
            schema, tblname = api_obj.force.split(".")
            if schema == 'acs':
                if api_obj.force_schema:
                    schema = BaseAcs1.schema_name
                    api_obj.force = "{}.{}".format(schema, tblname)
                    api_obj.subs["force"] = schema
                    return api_obj
                else:
                    schema = BaseAcs5.schema_name
                    api_obj.force = "{}.{}".format(schema, tblname)
                    api_obj.subs["force"] = schema
        if has_force and api_obj.vars_and_vals and not api_obj.force.startswith(BaseAcs5.schema_name): # Applied fix 5.9.16
            if schema and schema in [BaseAcs5.schema_name, BaseAcs3.schema_name]:
                gvals = api_obj.vars_and_vals["geo"].split(",")
                # The "010"/"040" geo prefixes presumably denote nation and
                # state levels -- TODO confirm against consts.LEVEL_TO_GEO.
                nation_state_only = all([v[:3] in ["010", "040"] for v in gvals])
                not_ygi_ygo = all(["ygo" not in tblname, "ygi" not in tblname])
                if schema != BaseAcs1.schema_name and nation_state_only and not_ygi_ygo:
                    new_fullname = "{}.{}".format(BaseAcs1.schema_name, tblname)
                    # Only upgrade if the 1yr table actually exists.
                    if new_fullname in cls.table_years:
                        api_obj.force = new_fullname
                        api_obj.subs["force"] = api_obj.force
        return api_obj

    @classmethod
    def table_can_show(cls, table, api_obj):
        '''Return True when *table* supports every show/level, required
        geo level, and forced schema/table of the query.'''
        shows_and_levels = api_obj.shows_and_levels
        supported_levels = table.get_supported_levels()
        vars_and_vals = api_obj.vars_and_vals
        required_geos = [] if "geo" not in vars_and_vals else vars_and_vals["geo"].split(",")

        if table.get_schema_name().startswith("acs"):
            # A forced ACS schema excludes tables from any other ACS schema.
            if api_obj.force_schema and table.schema_name != api_obj.force_schema:
                return False

        if table.__table_args__["schema"] in [BaseAcs5.schema_name, BaseAcs1.schema_name, BaseAcs3.schema_name] and required_geos:
            # Each requested geo's level prefix must be supported by the table.
            need_to_support = set([my_geo[:3] for my_geo in required_geos])
            required_levels = [consts.LEVEL_TO_GEO[slvl] for slvl in need_to_support]
            cond_check = [x in supported_levels["geo"] for x in required_levels]
            result = all(cond_check)
            if not result:
                return False

        for show_col, show_level in shows_and_levels.items():
            if show_col not in supported_levels:
                # print show_col, supported_levels, "Supported Levels"
                return False
            else:
                if show_level not in supported_levels[show_col]:
                    return False

        # An explicit force restricts the match to exactly that table.
        if api_obj.force and table.full_name() != api_obj.force:
            return False


        return True

    @classmethod
    def required_tables(cls, api_obj):
        '''Greedily select a set of joinable tables that together cover
        every variable the query needs.  Raises DataUSAException when no
        joinable combination can be found.'''
        vars_needed = api_obj.vars_needed + api_obj.where_vars()
        # Only include the ORDER BY column if it is a known variable.
        if api_obj.order and api_obj.order in cls.possible_variables:
            vars_needed = vars_needed + [api_obj.order]
        universe = set(vars_needed)
        tables_to_use = []
        table_cols = []
        # Make a set of the variables that will be needed to answer the query
        while universe:
            # first find the tables with biggest overlap
            candidates = cls.list_partial_tables(universe, api_obj)
            # raise Exception(candidates)
            top_choices = sorted(candidates.items(), key=operator.itemgetter(1),
                                 reverse=True)
            # take the table with the biggest overlap
            tbl, overlap = top_choices.pop(0)
            # ensure the tables are joinable, for now that means
            # having atleast one column with the same name
            if tables_to_use:
                while not set(table_cols).intersection([str(c.key) for c in get_columns(tbl)]):
                    if top_choices:
                        # no shared column: try the next-best candidate
                        tbl, overlap = top_choices.pop(0)
                    else:
                        raise DataUSAException("can't join tables!")
            tables_to_use.append(tbl)
            tmp_cols = [str(c.key) for c in get_columns(tbl)]
            table_cols += tmp_cols
            # remove the acquired columns from the universe
            universe = universe - set(tmp_cols)
        return tables_to_use

    @classmethod
    def list_partial_tables(cls, vars_needed, api_obj):
        '''Map each showable table with any column overlap to a score:
        the overlap size minus a small margin-of-error penalty.'''
        candidates = {}
        for table in registered_models:
            overlap_size = TableManager.table_has_some_cols(table, vars_needed)
            # overlap_size may be None; None > 0 is False on Python 2 --
            # NOTE(review): this comparison breaks under Python 3.
            if overlap_size > 0:
                if TableManager.table_can_show(table, api_obj):
                    # to break ties, we'll use median moe to penalize and subtract
                    # since larger values will be chosen first.
                    penalty = (1 - (1.0 / table.median_moe)) if table.median_moe > 0 else 0
                    candidates[table] = overlap_size - penalty
        if not candidates:
            raise DataUSAException("No tables can match the specified query.")
        return candidates

    @classmethod
    def table_has_some_cols(cls, table, vars_needed):
        '''
        Go through the list of required variables find tables that have
        atleast 2 variables (if more than one variable is needed). The reason atleast
        2 are required is allow a join to occur (one for the value, one to potentially join).

        Returns the overlap size as an int, or None when there is no overlap.
        '''
        table_cols = get_columns(table)
        cols = set([col.key for col in table_cols])
        # min_overlap = 2 if len(vars_needed) > 1 else 1
        intersection = set(vars_needed).intersection(cols)

        if intersection:
            return len(intersection)
        return None # TODO review this

    @classmethod
    def table_has_cols(cls, table, vars_needed):
        '''Return True when every required variable is a column of *table*.'''
        table_cols = get_columns(table)
        cols = set([col.key for col in table_cols])
        return set(vars_needed).issubset(cols)

    @classmethod
    def select_best(cls, table_list, api_obj):
        '''Pick the best table from the sorted candidate list, special-casing
        a 1yr table with known data gaps: use it only when every requested
        geo actually has data, otherwise fall back to the 5yr table.'''
        special_cases = {
            Acs1_Yg_Conflict.full_name(): (Acs1_Yg_Conflict, Acs5_Yg_Conflict, "conflict_total")
        }
        if table_list[0].full_name() in special_cases:
            # if we are in a table with missing 1yr data
            has_specific_geo = api_obj.vars_and_vals and "geo" in api_obj.vars_and_vals
            if has_specific_geo:
                tbl1, tbl5, target_col = special_cases[table_list[0].full_name()]
                geos = api_obj.vars_and_vals["geo"].split(consts.OR)
                all_have_data = all([datum for datum, in tbl1.query.with_entities(target_col).filter(tbl1.geo.in_(geos))])
                return tbl1 if all_have_data else tbl5
        # Ordering is sorted in table_list based on moe
        return table_list[0]

    @classmethod
    def all_tables(cls, api_obj):
        '''List every showable table containing all needed variables,
        sorted by ascending median margin of error.'''
        vars_needed = api_obj.vars_needed
        candidates = []
        for table in registered_models:
            # NOTE(review): the order column is re-appended on every loop
            # iteration; harmless for the subset check but looks unintended.
            if api_obj.order and api_obj.order in cls.possible_variables:
                vars_needed = vars_needed + [api_obj.order]
            if TableManager.table_has_cols(table, vars_needed):
                if TableManager.table_can_show(table, api_obj):
                    candidates.append(table)
        candidates = sorted(candidates, key=attrgetter('median_moe'))
        if not candidates:
            raise DataUSAException("No tables can match the specified query.")
        return candidates

    @classmethod
    def multi_crosswalk(cls, tables, api_obj):
        '''Apply the crosswalk to api_obj once per table, in order.'''
        for tbl in tables:
            api_obj = crosswalker.crosswalk(tbl, api_obj)
        return api_obj

    @classmethod
    def crosswalk(cls, table, api_obj):
        '''Apply the crosswalk for a single table.'''
        return crosswalker.crosswalk(table, api_obj)

    @classmethod
    def force_1yr_for_big_places(cls, api_obj):
        '''Set force_schema to the ACS 1yr schema when every requested geo
        is large enough, except for cases only available at 5yr.'''
        # -- if we are trying to look at tracts, that data is only available
        # -- at 5yr resolution.
        if api_obj.shows_and_levels and "geo" in api_obj.shows_and_levels and api_obj.shows_and_levels["geo"] == "tract":
            return api_obj
        if api_obj.vars_and_vals and "geo" in api_obj.vars_and_vals:
            cond_a = "med_earnings" in api_obj.values or "med_earnings_moe" in api_obj.values
            cond_b = "acs_ind" in api_obj.shows_and_levels
            # NOTE(review): "languge" looks like a typo for "language" --
            # confirm whether the misspelled key is intentional.
            cond_c = any(["languge" in api_obj.shows_and_levels, "language" in api_obj.values, "num_speakers" in api_obj.values, "num_speakers_moe" in api_obj.values, "num_speakers_rca" in api_obj.values])
            if (cond_a and cond_b) or cond_c:
                return api_obj
            geos = api_obj.vars_and_vals["geo"].split(consts.OR)
            if not api_obj.force or api_obj.force.startswith("acs."):
                can_use_1yr = all([is_big_geo(geo) for geo in geos])
                if can_use_1yr:
                    api_obj.force_schema = BaseAcs1.schema_name
        return api_obj
Esempio n. 10
0
 def table_has_cols(cls, table, vars_needed):
     '''True when every required variable is a column of *table*.'''
     available = {col.key for col in get_columns(table)}
     return all(var in available for var in vars_needed)
Esempio n. 11
0
 def table_has_cols(cls, table, vars_needed):
     '''Return True when every variable in *vars_needed* is a column of
     *table*.'''
     table_cols = get_columns(table)
     cols = set([col.key for col in table_cols])
     return set(vars_needed).issubset(cols)
Esempio n. 12
0
 def table_has_cols(cls, table, vars_needed):
     '''Report whether *table* provides every variable in *vars_needed*.'''
     col_names = set(col.key for col in get_columns(table))
     return set(vars_needed) <= col_names
Esempio n. 13
0
class TableManager(object):
    '''Selects and prepares the table(s) best able to answer an API query,
    based on column coverage, supported levels, and margin of error.'''
    # Every column name across all registered models (may contain dupes).
    possible_variables = [
        col.key for t in registered_models for col in get_columns(t)
    ]
    table_years = tbl_years()
    # table_sizes = tbl_sizes()
    @classmethod
    def schema_selector(cls, api_obj):
        '''Rewrite api_obj.force onto a concrete ACS schema.

        The generic "acs" schema is mapped to the 1-year estimates when
        force_schema is set (otherwise 5-year); 5yr/3yr forces are then
        upgraded to 1yr when the requested geos and table allow it.
        '''
        # -- If there is a force to an "acs" table (defaults to 5-year)
        #    determine if we can instead use the acs 1 year estimate
        #    schema.
        has_force = hasattr(api_obj, "force") and api_obj.force
        if has_force:
            schema, tblname = api_obj.force.split(".")
            if schema == 'acs':
                if api_obj.force_schema:
                    schema = BaseAcs1.schema_name
                    api_obj.force = "{}.{}".format(schema, tblname)
                    api_obj.subs["force"] = schema
                    return api_obj
                else:
                    schema = BaseAcs5.schema_name
                    api_obj.force = "{}.{}".format(schema, tblname)
                    api_obj.subs["force"] = schema
        if has_force and api_obj.vars_and_vals and not api_obj.force.startswith(
                BaseAcs5.schema_name):  # Applied fix 5.9.16
            if schema and schema in [
                    BaseAcs5.schema_name, BaseAcs3.schema_name
            ]:
                gvals = api_obj.vars_and_vals["geo"].split(",")
                # The "010"/"040" geo prefixes presumably denote nation and
                # state levels -- TODO confirm against consts.LEVEL_TO_GEO.
                nation_state_only = all(
                    [v[:3] in ["010", "040"] for v in gvals])
                not_ygi_ygo = all(["ygo" not in tblname, "ygi" not in tblname])
                if schema != BaseAcs1.schema_name and nation_state_only and not_ygi_ygo:
                    new_fullname = "{}.{}".format(BaseAcs1.schema_name,
                                                  tblname)
                    # Only upgrade if the 1yr table actually exists.
                    if new_fullname in cls.table_years:
                        api_obj.force = new_fullname
                        api_obj.subs["force"] = api_obj.force
        return api_obj

    @classmethod
    def table_can_show(cls, table, api_obj):
        '''Return True when *table* supports every show/level, required
        geo level, and forced schema/table of the query.'''
        shows_and_levels = api_obj.shows_and_levels
        supported_levels = table.get_supported_levels()
        vars_and_vals = api_obj.vars_and_vals
        required_geos = [] if "geo" not in vars_and_vals else vars_and_vals[
            "geo"].split(",")

        if table.get_schema_name().startswith("acs"):
            # A forced ACS schema excludes tables from any other ACS schema.
            if api_obj.force_schema and table.schema_name != api_obj.force_schema:
                return False

        if table.__table_args__["schema"] in [
                BaseAcs5.schema_name, BaseAcs1.schema_name,
                BaseAcs3.schema_name
        ] and required_geos:
            # Each requested geo's level prefix must be supported by the table.
            need_to_support = set([my_geo[:3] for my_geo in required_geos])
            required_levels = [
                consts.LEVEL_TO_GEO[slvl] for slvl in need_to_support
            ]
            cond_check = [
                x in supported_levels["geo"] for x in required_levels
            ]
            result = all(cond_check)
            if not result:
                return False

        for show_col, show_level in shows_and_levels.items():
            if show_col not in supported_levels:
                # print show_col, supported_levels, "Supported Levels"
                return False
            else:
                if show_level not in supported_levels[show_col]:
                    return False

        # An explicit force restricts the match to exactly that table.
        if api_obj.force and table.full_name() != api_obj.force:
            return False

        return True

    @classmethod
    def table_has_cols(cls, table, vars_needed):
        '''Return True when every required variable is a column of *table*.'''
        table_cols = get_columns(table)
        cols = set([col.key for col in table_cols])
        return set(vars_needed).issubset(cols)

    @classmethod
    def select_best(cls, table_list, api_obj):
        '''Pick the best table from the pre-sorted candidate list.'''
        # Ordering is sorted in all_tables
        return table_list[0]

    @classmethod
    def all_tables(cls, api_obj):
        '''List every showable table containing all needed variables,
        sorted by ascending median margin of error.'''
        vars_needed = api_obj.vars_needed
        candidates = []
        for table in registered_models:
            # NOTE(review): the order column is re-appended on every loop
            # iteration; harmless for the subset check but looks unintended.
            if api_obj.order and api_obj.order in cls.possible_variables:
                vars_needed = vars_needed + [api_obj.order]
            if TableManager.table_has_cols(table, vars_needed):
                if TableManager.table_can_show(table, api_obj):
                    candidates.append(table)
        candidates = sorted(candidates, key=attrgetter('median_moe'))
        if not candidates:
            raise DataUSAException("No tables can match the specified query.")
        return candidates

    @classmethod
    def crosswalk(cls, table, api_obj):
        '''Apply the crosswalk for a single table.'''
        return crosswalker.crosswalk(table, api_obj)

    @classmethod
    def force_1yr_for_big_places(cls, api_obj):
        '''Set force_schema to the ACS 1yr schema when every requested geo
        is large enough, except for cases only available at 5yr.'''
        # -- if we are trying to look at tracts, that data is only available
        # -- at 5yr resolution.
        if api_obj.shows_and_levels and "geo" in api_obj.shows_and_levels and api_obj.shows_and_levels[
                "geo"] == "tract":
            return api_obj
        if api_obj.vars_and_vals and "geo" in api_obj.vars_and_vals:
            cond_a = "med_earnings" in api_obj.values or "med_earnings_moe" in api_obj.values
            cond_b = "acs_ind" in api_obj.shows_and_levels
            # NOTE(review): "languge" looks like a typo for "language" --
            # confirm whether the misspelled key is intentional.
            cond_c = any([
                "languge" in api_obj.shows_and_levels, "language"
                in api_obj.values, "num_speakers" in api_obj.values,
                "num_speakers_moe" in api_obj.values, "num_speakers_rca"
                in api_obj.values
            ])
            if (cond_a and cond_b) or cond_c:
                return api_obj
            geos = api_obj.vars_and_vals["geo"].split(consts.OR)
            if not api_obj.force or api_obj.force.startswith("acs."):
                can_use_1yr = all([is_big_geo(geo) for geo in geos])
                if can_use_1yr:
                    api_obj.force_schema = BaseAcs1.schema_name
        return api_obj