def create_pandas_bins(pandas_df, columns, bins, btype="quantile"):
    columns = du.aslist(columns)
    bins = du.aslist(bins)
    for column in columns:
        if btype == "quantile":
            # quantile-based binning; qcut drops duplicate edges and returns the bin boundaries it used
            pandas_df[f'{column}_bins'], retbins = pandas.qcut(pandas_df[column], bins, duplicates='drop', retbins=True)
            print(f"Bins: {retbins}")
        else:
            # fixed-edge binning using the bin boundaries provided
            pandas_df[f'{column}_bins'] = pandas.cut(x=pandas_df[column], bins=bins)
    return pandas_df
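A minimal, self-contained sketch (not part of the original source) of the pandas.qcut / pandas.cut calls the helper above relies on; the column name and bin edges are illustrative.

import pandas

df = pandas.DataFrame({"age": [3, 17, 25, 34, 42, 58, 63, 71, 80, 95]})

# quantile-based bins: 4 roughly equal-sized groups; retbins returns the edges qcut chose
df["age_qbins"], edges = pandas.qcut(df["age"], 4, duplicates="drop", retbins=True)
print(edges)  # 5 edges defining the 4 quantile bins

# fixed-edge bins: right-closed intervals (0, 18], (18, 65], (65, 100]
df["age_bins"] = pandas.cut(x=df["age"], bins=[0, 18, 65, 100])
print(df["age_bins"].value_counts())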
Example #2
def addMetadataColumns(df,
                       dasrun,
                       metadata_columns=[AC.PLB, AC.RUN_ID, AC.BUDGET_GROUP]):
    """
    adds additional experiment/run metadata columns to the Spark DataFrame

    Parameters
    ==========
    df : Spark DataFrame

    dasrun : DASRun object
        A wrapper for the run path and other important metadata for an experiment run

    metadata_columns : list of strings; default is [AC.PLB, AC.RUN_ID, AC.BUDGET_GROUP]
        Add the provided columns to the data frame if the data exists in the das run

    Returns
    =======
    Spark DataFrame
    """
    for column in du.aslist(metadata_columns):
        lit = getattr(dasrun, column)
        if lit is not None:
            df = df.withColumn(column, sf.lit(lit)).persist()
    return df
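A hedged usage sketch, assuming pyspark is available locally; the DummyRun namespace and column names below stand in for a DASRun and the AC.* constants and are illustrative only.

from types import SimpleNamespace
from pyspark.sql import SparkSession, functions as sf

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame([("0101", 12), ("0102", 7)], ["geocode", "count"])

# stand-in for a DASRun: attributes are looked up with getattr, as in addMetadataColumns
dummy_run = SimpleNamespace(plb="4/1", run_id="run_0001", budget_group="persons")

for column in ["plb", "run_id", "budget_group"]:
    value = getattr(dummy_run, column, None)
    if value is not None:
        # sf.lit attaches the same constant to every row of the DataFrame
        df = df.withColumn(column, sf.lit(value))

df.show()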
Example #3
def getGRFC(spark, columns=None):
    """
    returns the GRFC columns as a Spark DataFrame

    Parameters
    ==========
    spark : SparkSession

    columns : str or list of str
        Default: None - return all columns

    Returns
    =======
    a Spark DataFrame containing information from the GRFC file
    """
    grfc_loc = f"{DAS_S3ROOT}/2010/cefv2/pp10_grf_tab_ikeda_100219.csv"

    grfc = spark.read.option("header", "true").csv(grfc_loc)

    # build a 16-character block geocode: state + county + tract + block-group digit + block
    grfc = grfc.withColumn('BLOCK', sf.concat(sf.col("TABBLKST"), sf.col("TABBLKCOU"), sf.col("TABTRACTCE"), sf.col("TABBLK")[0:1], sf.col("TABBLK")))
    grfc = grfc.withColumn('geocode', sf.col('BLOCK')).persist()

    if columns is None:
        columns = grfc.columns
    else:
        # want geocode, at least, as the join column
        columns = np.unique(du.aslist(columns) + ['geocode']).tolist()

    grfc = grfc.select(columns)

    return grfc
Example #4
def getToyGeounitData_GeounitNode(schema,
                                  geocodes=[
                                      '000', '001', '002', '003', '010', '011',
                                      '012', '020', '022'
                                  ],
                                  geocode_dict={
                                      3: 'block',
                                      2: 'county'
                                  },
                                  raw_params={
                                      'low': 0,
                                      'high': 2
                                  },
                                  syn_params={
                                      'low': 0,
                                      'high': 5
                                  }):
    geounits = []
    for geocode in du.aslist(geocodes):
        raw, syn = None, None
        if raw_params is not None:
            raw = np.random.randint(low=raw_params['low'],
                                    high=raw_params['high'],
                                    size=schema.size).reshape(schema.shape)
        if syn_params is not None:
            syn = np.random.randint(low=syn_params['low'],
                                    high=syn_params['high'],
                                    size=schema.size).reshape(schema.shape)
        geounits.append(
            GeounitNode(geocode=geocode,
                        geocode_dict=geocode_dict,
                        raw=multiSparse(raw) if raw is not None else None,
                        syn=multiSparse(syn) if syn is not None else None))
    return geounits
Example #5
def getMicrodataDF_mapper(node, schema, privatized=True, mangled_names=True, recoders=None):
    # TODO: provide support for node = GeounitNode, in addition to node = dict
    GEOCODE = "geocode"
    ORIG = "raw"
    PRIV = "syn"
    DATATYPE = "data_type"
    import numpy as np
    if privatized:
        datakey = PRIV
    else:
        datakey = ORIG
    for item in [datakey, GEOCODE]:
        assert item in node, f"Cannot create microdata; '{item}' not found in the node."
    data = node[datakey].sparse_array
    all_nonzero_indices = data.indices.tolist()
    rows = []
    for ind in all_nonzero_indices:
        rowdict = {}
        rowdict[GEOCODE] = node[GEOCODE]
        rowdict[DATATYPE] = str(datakey)
        num_records = int(data[0,ind])
        cell = np.unravel_index(ind, schema.shape)
        for dim, level in enumerate(cell):
            if mangled_names:
                dimname = f"{schema.mangled_dimnames[dim]}"
            else:
                dimname = f"{schema.dimnames[dim]}"
            rowdict[dimname] = str(level)
        if recoders is not None:
            for recode in du.aslist(recoders):
                rowdict = recode(rowdict)
        row = rowdict
        rows += [row]*num_records
    return rows
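A self-contained sketch (toy shape and variable names, not the DAS schema) of the core step in the mapper above: np.unravel_index locates each nonzero histogram cell, which is then expanded into one record per counted unit.

import numpy as np
from scipy import sparse

shape = (2, 3)  # e.g. 2 sex levels x 3 age groups
hist = np.array([[0, 2, 0],
                 [1, 0, 3]])
data = sparse.csr_matrix(hist.reshape(1, -1))  # flattened, like a sparse_array above

rows = []
for ind in data.indices.tolist():
    num_records = int(data[0, ind])
    cell = np.unravel_index(ind, shape)  # flat index -> (sex_level, age_level)
    record = {"sex": str(cell[0]), "age": str(cell[1])}
    rows += [record] * num_records

print(rows)  # 2 + 1 + 3 = 6 microdata records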
def make_pandas_qcut_bin_labels(bins):
    bins = du.aslist(bins)
    # pandas bins are right-closed, so every label has the form '(low, high]'
    binstr = [f"'(-Inf, {bins[0]}]'"]
    for i in range(len(bins) - 1):
        binstr += [f"'({bins[i]}, {bins[i+1]}]'"]
    return binstr
Example #7
def expandKeywords(querynames):
    """
    looks through the querynames provided and expands any keywords found that refer to workloads

    Inputs:
        querynames: list of strings

    Outputs:
        list of strings (querynames)

    Notes:
        This allows for the use of keywords as shorthand for commonly used
        sets of queries (e.g. PL94, P12, etc.)

        Note that it does not check to see if the queries themselves are
        valid for a particular schema; this function only expands valid
        workload keywords into querynames and appends those querynames to the
        list of querynames that will be returned.
    """
    expandednames = []
    querynames = das_utils.aslist(querynames)
    for name in querynames:
        try:
            keynames = getWorkload(name)
        except (AssertionError, KeyError) as e:
            keynames = [name]

        expandednames += keynames

    return np.unique(expandednames).tolist()
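A rough, self-contained sketch of the keyword-expansion idea; the WORKLOADS table and query names are made up, standing in for getWorkload/getWorkloadByKey.

import numpy as np

# hypothetical workload table: keyword -> list of querynames
WORKLOADS = {
    "PL94": ["total", "votingage", "hispanic * cenrace"],
    "P12": ["total", "sex * agegroup"],
}

def expand(querynames):
    expanded = []
    for name in querynames:
        # keywords expand into their workload; anything else passes through unchanged
        expanded += WORKLOADS.get(name, [name])
    # np.unique also sorts, matching the dedup step in expandKeywords above
    return np.unique(expanded).tolist()

print(expand(["PL94", "P12", "sex"]))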
Example #8
def getToySparseHistDF(geounit_data, schema):
    records = []
    for geounit in du.aslist(geounit_data):
        records += mappers.getSparseDF_mapper(geounit, schema)
    df = pandas.DataFrame(records)

    column_order = ['geocode'] + schema.dimnames
    df = df[column_order + [x for x in df.columns if x not in column_order]]
    return df
def getTable(data, schema, querynames):
    querynames = das_utils.aslist(querynames)
    answerdict = {}
    leveldict = {}
    for name in querynames:
        #print(name)
        answerdict[name] = schema.getQuery(name).answerWithShape(data)
        leveldict[name] = schema.getQueryLevel(name, flatten=False)

    return Table(answerdict, leveldict)
Example #10
    def getCustomQuerynames(self, querynames):
        queries = []
        querynames = das_utils.aslist(querynames)
        for name in querynames:
            if name in self.tabledict:
                queries += self.tabledict[name]
            else:
                if self.schema.isValidQuery(name):
                    queries += [name]

        queries = np.unique(queries).tolist()
        return queries
    def getQueries(self, querynames):
        """
        returns a dictionary of querybase objects
        
        Inputs:
            querynames: a single string or list of strings referring to the queries to build
        """
        queries = {}
        querynames = das_utils.aslist(querynames)
        for name in querynames:
            queries[name] = self.getQuery(name)

        return queries
    def getWorkloadByTable(self, tablenames=None):
        if tablenames is None:
            tablenames = self.tablenames
        else:
            tablenames = das_utils.aslist(tablenames)

        querynames = []
        for name in tablenames:
            if name in self.tabledict:
                querynames += self.tabledict[name]

        querynames = np.unique(querynames).tolist()

        return querynames
    def getCustomTableTuples(self, querynames):
        """
        returns a list of tuples (query, level) in the custom table shell

        used for comparing rows that exist to the table shell to find all missing rows

        a list of tuples has a much smaller memory footprint than using a pandas dataframe or even a numpy array
        """
        tuples = []
        querynames = self.getCustomQuerynames(das_utils.aslist(querynames))
        for query in querynames:
            tuples += [(query, level)
                       for level in self.schema.getQueryLevel(query)]
        return tuples
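A small sketch of the "table shell as a list of tuples" idea described in the docstring above, using toy queries and levels in place of the schema.

# toy query -> levels mapping standing in for schema.getQueryLevel
levels = {"total": ["Total"], "sex": ["Male", "Female"]}

# the full table shell: every (query, level) pair the table should contain
shell = [(query, level) for query in ["total", "sex"] for level in levels[query]]

# rows actually observed in the data; a membership test finds the missing rows cheaply
existing = {("total", "Total"), ("sex", "Male")}
missing = [pair for pair in shell if pair not in existing]
print(missing)  # [('sex', 'Female')]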
Example #14
    def getCustomQuerynames(self, querynames):
        querynames = self.standardize_querynames(querynames)
        queries = []
        querynames = du.aslist(querynames)
        for name in querynames:
            if name in self.tabledict:
                queries += self.tabledict[name]
            elif self.isValidQuery(name):
                queries += [name]
            else:
                print(f"Removing '{name}' from the list of queries.")

        queries = np.unique(queries).tolist()
        return queries
    def getCustomQuerynames(self, querynames):
        queries = []
        querynames = das_utils.aslist(querynames)
        for name in querynames:
            if name in self.tabledict:
                queries += self.tabledict[name]
            else:
                if self.schema.isValidQuery(name):
                    queries += [name]
                else:
                    print(
                        f"'{name}' is not a valid query for this schema\nRemoving it from the list of queries."
                    )

        queries = np.unique(queries).tolist()
        return queries
    def getGroupingLevels(self, customlevels, groupings, keepdims):
        """
        returns a dictionary of levels for groupings
        this function is primarily used in the buildRecodeQuerySeed function
        
        Inputs:
            customlevels (dict): levels as defined by the user (and specified
                                 in the recode's levels attribute)
            groupings (dict): the groupings defined for the recoded variable
            keepdims (list): the dimensions in dimnames to keep (marginalize the others)
        
        Outputs:
            a dictionary of levels
        """
        levels = {}
        baselevels = self._getBaseLevels()
        for dim in keepdims:
            if dim in groupings:
                if dim in customlevels:
                    levels[dim] = customlevels[dim]
                else:
                    items = []
                    dimgroups = groupings[dim]
                    # To make automatic level-generation work for groupings, each group needs to be
                    # a list within the dim group. For example, for a dimension dim0 with 6 levels
                    # where we want to ignore the first one, it is simpler to write
                    #     { 'dim0': [1,2,3,4,5] }
                    # than
                    #     { 'dim0': [[1],[2],[3],[4],[5]] }
                    # so when levels are generated automatically (i.e. no custom levels were specified),
                    # the first form is translated into the second here.
                    # Groupings such as { 'dim1': [[1],[2,3,4]] } pass through unchanged, since
                    # aslist leaves existing lists as-is. (A standalone sketch of this translation
                    # follows this function.)
                    dimgroups = [das_utils.aslist(x) for x in dimgroups]

                    for j in range(len(dimgroups)):
                        items.append(".".join([
                            x for i, x in enumerate(baselevels[dim])
                            if i in dimgroups[j]
                        ]))
                    levels[dim] = items
            else:
                levels[dim] = baselevels[dim]

        return levels
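A self-contained illustration of the grouping-to-levels translation described in the comments above, with a toy dimension and a minimal stand-in for das_utils.aslist.

def aslist(x):
    # minimal stand-in for das_utils.aslist
    return x if isinstance(x, list) else [x]

baselevels = ["Under 5", "5-17", "18-24", "25-44", "45-64", "65+"]

# shorthand form: skip the first level, keep the rest as singleton groups
dimgroups = [aslist(g) for g in [1, 2, 3, 4, 5]]  # -> [[1], [2], [3], [4], [5]]
print([".".join(baselevels[i] for i in group) for group in dimgroups])

# an explicit grouping passes through unchanged; grouped labels are joined with "."
dimgroups = [aslist(g) for g in [[0], [1, 2, 3]]]
print([".".join(baselevels[i] for i in group) for group in dimgroups])  # ['Under 5', '5-17.18-24.25-44']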
    def getQueryLevels(self, querynames, order=None, flatten=True):
        """
        returns a dictionary of arrays corresponding to the levels found in the querynames
        
        Inputs:
            querynames: a string or list of strings
            order: the order of the dimensions for creating crosses
                   the default is None, which sets the order to be the ordering of the dimnames attribute
            flatten: boolean. If True, return the levels as a flattened array
                              If False, return the levels as a multidimensional numpy array
        """
        qlevels = {}
        order = self.dimnames if order is None else order
        querynames = das_utils.aslist(querynames)
        for name in querynames:
            seed = self.getQuerySeed(name)
            qlevels[name] = seed.getQueryLevels(order=order, flatten=flatten)

        return qlevels
Example #18
def getDASRunsNested(paths,
                     search_threads=20,
                     build_threads=20,
                     schema_name=None):
    """
    returns a list of DASRun objects that contain information about DAS Experiment data

    Parameters
    ==========
    paths: str or list of str
        List of s3 paths to DAS Experiment data

    search_threads: int (kwarg with default = 20)
        Number of parallel workers to use during the s3 search

    build_threads: int (kwarg with default = 20)
        Number of `multiprocessing.Pool` worker processes used to construct DASRun objects

    schema_name: str (kwarg with default = None)
        The name of the Schema associated with all of the DAS Experiments in the path_list
        Notes:
            - If None, the DASRun class will search for the schema_name within the config
              file and will throw an error if it can't be found.
            - If not None, the DASRun class will just use the schema_name provided.

    Returns
    =======
    A list of DASRun objects
    """
    t0 = time.time()
    config_paths = []
    paths = du.aslist(paths)
    for path in paths:
        config_paths += findDASRunConfigs(path, threads=search_threads)
    dasrun_ingredients = [(config_path, schema_name)
                          for config_path in config_paths]
    with mp.Pool(build_threads) as pool:
        runs = pool.map(makeDASRunNested, dasrun_ingredients)
    t1 = time.time()
    print(
        f"It took {t1-t0} seconds to build all DASRuns from the found config.ini files"
    )
    return runs
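A minimal sketch of the multiprocessing pattern used above; the worker and the (config_path, schema_name) tuples are illustrative stand-ins for makeDASRunNested and its inputs.

import multiprocessing as mp
import time

def build_run(ingredients):
    # stand-in for makeDASRunNested: unpack the (config_path, schema_name) tuple
    config_path, schema_name = ingredients
    return f"{schema_name}:{config_path}"

if __name__ == "__main__":
    ingredient_list = [(f"s3://bucket/run_{i}/config.ini", "DHCP") for i in range(8)]
    t0 = time.time()
    # mp.Pool starts worker processes; pool.map fans the tuples out and preserves order
    with mp.Pool(4) as pool:
        runs = pool.map(build_run, ingredient_list)
    print(f"built {len(runs)} runs in {time.time() - t0:.2f} seconds")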
Example #19
    def getQueryLevels(self, querynames: Union[Iterable[str], str], order=None, flatten=True, cross_marker=" BY "):
        """
        returns a dictionary of arrays corresponding to the levels found in the querynames

        Inputs:
            querynames: a string or list of strings
            order: the order of the dimensions for creating crosses
                   the default is None, which sets the order to be the ordering of the dimnames attribute
            flatten: boolean. If True, return the levels as a flattened array
                              If False, return the levels as a multidimensional numpy array
        """
        querynames = self.standardize_querynames(querynames)
        qlevels = {}
        order = self.dimnames if order is None else order
        querynames = du.aslist(querynames)
        querynames = self.getCustomQuerynames(querynames)
        for name in querynames:
            if self.isValidQuery(name):
                seed = self.getQuerySeed(name)
                qlevels[name] = seed.getQueryLevels(order=order, flatten=flatten, cross_marker=cross_marker)

        return qlevels
Example #20
def getWorkload(workload_keywords):
    """
    returns a list of unique queries based on one or more workloads

    Inputs:
        workload_keywords: a list of strings associated with the workloads desired

    Outputs:
        a list of strings/query names

    Notes:
        Since some workloads share queries (e.g. "total"), this function allows us to
        concatenate lists of queries (from multiple workloads) and remove any duplicates,
        as duplicates cause HDMM to not work as we might want/expect.
    """
    keys = das_utils.aslist(workload_keywords)
    querynames = []
    for key in keys:
        querynames += getWorkloadByKey(key)

    unique_querynames = np.unique(querynames).tolist()

    return unique_querynames
    def getQueryNames(self, nway=None, ignore=None, include=None):
        """
        returns a list of valid query names
        
        Inputs:
            nway: an int or list of ints that refer to the marginal querynames desired
            ignore: a str or list of strs that refer to the "crossable" dimensions to remove
                    from the queryname list
            include: a str or list of strs that refer to queries that must be part of the list (unless
                    the queries are invalid)
        
        Outputs:
            a list of strings referring to the queries asked for
        
        Notes:
            Not often used in practice; used primarily for testing purposes
            
            'detailed' always refers to the query expressed by the crosses between all of the dimname variables
            Example:
                if dimnames = ['a', 'b', 'c'] and there are two recoded variables ['a1', 'c6'], then, even though
                'a1_b_c', 'a_b_c6', and 'a_b_c' are all valid queries, only 'a_b_c' matches the crosses of the
                original (dimnames) variables, so it is the only one that will be renamed as 'detailed'
        """
        if nway is None:
            valid_names = self._getAllQueryNames()

        else:
            valid_names = []
            for n in das_utils.aslist(nway):
                if isinstance(n, int):
                    combos = list(
                        itertools.combinations(self._getCrossableQueries(), n))
                    if n == 0:
                        combos = ['total']
                    elif n == 1:
                        combos = [list(x)[0] for x in combos]
                    else:
                        combos = [
                            C.SCHEMA_CROSS_JOIN_DELIM.join(list(x))
                            for x in combos
                        ]
                    valid_names += combos

        nonignored_names = set(valid_names)
        if ignore is not None:
            for name in das_utils.aslist(ignore):
                nonignored_names = nonignored_names.intersection(
                    set([
                        x for x in valid_names
                        if name not in re.split(C.SCHEMA_CROSS_SPLIT_DELIM, x)
                    ]))
            valid_names = list(nonignored_names)

        if include is not None:
            for x in das_utils.aslist(include):
                if not any(isSameQuery(x, y) for y in valid_names):
                    valid_names.append(x)

        valid_names = list(set(valid_names))

        for i, name in enumerate(valid_names):
            detailed_name = C.SCHEMA_CROSS_JOIN_DELIM.join(self.dimnames)
            if isSameQuery(name, detailed_name):
                valid_names[i] = "detailed"

        valid_names = [x for x in valid_names if self._validQuerySeed(x)]
        valid_names.sort(
            key=lambda s: len(re.split(C.SCHEMA_CROSS_SPLIT_DELIM, s)))

        return valid_names
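A self-contained sketch of the n-way queryname construction in getQueryNames; the dimension names and the " * " join delimiter are illustrative stand-ins for the schema's crossable queries and C.SCHEMA_CROSS_JOIN_DELIM.

import itertools

crossable = ["sex", "agegroup", "cenrace"]
DELIM = " * "  # assumption: stand-in for C.SCHEMA_CROSS_JOIN_DELIM

names = []
for n in [0, 1, 2]:
    combos = list(itertools.combinations(crossable, n))
    if n == 0:
        combos = ["total"]                        # the empty cross is the total query
    elif n == 1:
        combos = [c[0] for c in combos]           # 1-way marginals keep their own name
    else:
        combos = [DELIM.join(c) for c in combos]  # n-way crosses join dimension names
    names += combos

print(names)
# ['total', 'sex', 'agegroup', 'cenrace', 'sex * agegroup', 'sex * cenrace', 'agegroup * cenrace']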
Example #22
def getCrosswalkDF(spark=None, columns=None, strong_mcd_states=STRONG_MCD_STATES, aian_areas=AIAN_AREAS, aian_ranges_path=AIAN_RANGES_PATH, fed_airs=FED_AIRS):
    """
    Loads the 2010 crosswalk files that Simson generated from the 2010 GRFC into a Spark DF

    Parameters
    ==========
    spark : SparkSession

    columns : str or list of str (default is None, which will return all columns in the file)
        - This determines which columns survive from the original crosswalk data file, as the function will
          only return a Spark DF with the columns listed here

    Returns
    =======
    a Spark DF containing crosswalk columns

    Notes
    =====
    - This function also generates a number of additional columns to expand the ease-of-use when aggregating
      blocks to form geographic units in different geographic levels.
        - e.g. Rather than COUNTY being the 3-digit FIPS code, the COUNTY column will concatenate both the
               2-digit STATE FIPS code and the 3-digit COUNTY FIPS code to create a 5-digit COUNTY code that
               is unique from all other 5-digit COUNTY codes.
    """
    crosswalk = f"{DAS_S3ROOT}/2010/geounit_crosswalks/24vars/"

    crossdf = spark.read.option("header", "true").csv(crosswalk)
    # add "geocode" column based on GEOID (which is the 16 digit block id)
    crossdf = crossdf.withColumn("geocode", crossdf['GEOID'])

    # generate unique counties
    crossdf = crossdf.withColumn("COUNTY", sf.concat(sf.col("STATE"), sf.col("COUNTY")))

    # generate unique tract groups
    crossdf = crossdf.withColumn("TRACT_GROUP", sf.concat(sf.col("County"), crossdf.TRACT[0:4]))

    # generate unique tracts
    crossdf = crossdf.withColumn("TRACT", sf.concat(sf.col("COUNTY"), sf.col("TRACT")))

    # generate block group column
    crossdf = crossdf.withColumn("BLOCK_GROUP", crossdf.BLOCK[0:1])

    # generate unique block groups
    crossdf = crossdf.withColumn("BLOCK_GROUP", sf.concat(sf.col("TRACT"), sf.col("BLOCK_GROUP")))

    # generate unique blocks
    crossdf = crossdf.withColumn("BLOCK", sf.concat(sf.col("BLOCK_GROUP"), sf.col("BLOCK")))

    # generate unique SLDLs (only unique if state fips has been prepended to the SLDL identifier)
    crossdf = crossdf.withColumn("SLDL", sf.concat(sf.col("STATE"), sf.col("SLDL")))

    # generate unique SLDUs (only unique if state fips has been prepended to the SLDU identifier)
    crossdf = crossdf.withColumn("SLDU", sf.concat(sf.col("STATE"), sf.col("SLDU")))

    # generate unique Congressional Districts (111th Congress) - only unique if state fips has been prepended to the CD identifier
    crossdf = crossdf.withColumn("CD", sf.concat(sf.col("STATE"), sf.col("CD")))

    # generate unique school districts (only unique if state fips has been prepended to the identifiers)
    crossdf = crossdf.withColumn("SDELM", sf.concat(sf.col("STATE"), sf.col("SDELM")))
    crossdf = crossdf.withColumn("SDSEC", sf.concat(sf.col("STATE"), sf.col("SDSEC")))
    crossdf = crossdf.withColumn("SDUNI", sf.concat(sf.col("STATE"), sf.col("SDUNI")))

    # generate unique urban areas and urban growth areas (only unique if state prepended)
    crossdf = crossdf.withColumn("UA", sf.concat(sf.col("STATE"), sf.col("UA")))
    crossdf = crossdf.withColumn("UGA", sf.concat(sf.col("STATE"), sf.col("UGA")))

    # generate unique puma and place ids (only unique if state prepended)
    crossdf = crossdf.withColumn("PUMA", sf.concat(sf.col("STATE"), sf.col("PUMA")))
    crossdf = crossdf.withColumn("PLACE", sf.concat(sf.col("STATE"), sf.col("PLACE")))

    # generate unique county subdivisions (only unique if state and county prepended)
    crossdf = crossdf.withColumn("COUSUB", sf.concat(sf.col("COUNTY"), sf.col("COUSUB")))

    # generate unique subminor civil divisions (only unique if state, county, and county subdivisions prepended)
    crossdf = crossdf.withColumn("SUBMCD", sf.concat(sf.col("COUSUB"), sf.col("SUBMCD")))

    # voting districts appear to have a floating space (" ") character in every VTD code, so we'll remove them as they
    # don't appear in the BlockAssign files for VTD
    ### Update - 2019-06-25 - The floating space is a valid character in the 6-character VTD codes; the first character
    #                         isn't always a " ", so " " is just another part of the code.
    #crossdf = crossdf.withColumn("VTD1st", crossdf.VTD[0:1])

    # generate unique voting districts (only unique if state and county prepended)
    crossdf = crossdf.withColumn("VTD", sf.concat(sf.col("COUNTY"), sf.col("VTD")))

    # create a column for the nation
    crossdf = crossdf.withColumn("US", sf.lit("Nation"))

    # Note: When using any of the columns from the next block, filter out IDs composed only of "9"'s
    aian_ranges_dict = make_aian_ranges_dict(aian_ranges_path, aian_areas)

    is_fed_air_udf = udf(lambda aiannhce: in_aian_class(aiannhce, fed_airs, aian_ranges_dict), BooleanType())
    is_aian_udf = udf(lambda aiannhce: in_aian_class(aiannhce, aian_areas, aian_ranges_dict), BooleanType())
    crossdf = add_aiannhce_col(spark, crossdf)
    # aian_areas:
    crossdf = crossdf.withColumn("AIAN_AREAS", sf.when(is_aian_udf("AIANNHCE"), sf.col("AIANNHCE")).otherwise(CC.NOT_AN_AIAN_AREA))
    crossdf = crossdf.withColumn("FED_AIRS", sf.when(is_fed_air_udf("AIANNHCE"), sf.col("AIANNHCE")).otherwise(CC.NOT_AN_AIAN_AREA))
    # portions of Blocks/Tracts/States within aian_areas:
    crossdf = crossdf.withColumn("AIANBlock", sf.when(sf.col("AIAN_AREAS") != CC.NOT_AN_AIAN_AREA, sf.col("BLOCK")).otherwise(CC.NOT_AN_AIAN_BLOCK))
    crossdf = crossdf.withColumn("AIANTract", sf.col("AIANBlock")[0:11])
    crossdf = crossdf.withColumn("AIANState", sf.col("AIANTract")[0:2])
    # Define an off-spine entity (OSE) as Place in AIAN areas/ non-strong-MCD states and MCD otherwise:
    crossdf = crossdf.withColumn("OSE", sf.when((sf.col("AIAN_AREAS") == CC.NOT_AN_AIAN_AREA) & (sf.col("STATE").isin(strong_mcd_states)), sf.col("COUSUB")).otherwise(sf.col("PLACE")))
    crossdf = crossdf.withColumn("COUNTY_NSMCD", sf.when(sf.col("STATE").isin(strong_mcd_states), CC.STRONG_MCD_COUNTY).otherwise(sf.col("COUNTY")))
    crossdf = crossdf.withColumn("MCD", sf.when(sf.col("STATE").isin(strong_mcd_states), sf.col("COUSUB")).otherwise(sf.lit(CC.NOT_A_MCD)))

    if columns is None:
        columns = crossdf.columns
    else:
        # always want 'geocode' (aka Block ID, GEOID) in the crosswalk dataframe
        columns = np.unique(du.aslist(columns) + ['geocode']).tolist()

    crossdf = crossdf.select(columns)
    return crossdf
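A hedged pyspark sketch (requires a local Spark session; the FIPS values are made up) of why state codes are prepended throughout getCrosswalkDF: the raw 3-digit county code repeats across states, while STATE + COUNTY is nationally unique.

from pyspark.sql import SparkSession, functions as sf

spark = SparkSession.builder.master("local[1]").getOrCreate()
crossdf = spark.createDataFrame(
    [("01", "001"), ("02", "001")],  # the same county code appears in two different states
    ["STATE", "COUNTY"])

# concatenating the 2-digit state FIPS yields a 5-digit, nationally unique county code
crossdf = crossdf.withColumn("COUNTY", sf.concat(sf.col("STATE"), sf.col("COUNTY")))
crossdf.show()  # COUNTY becomes 01001 and 02001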
    def getCustomTable(self, querynames, data=None):
        querynames = das_utils.aslist(querynames)
        if data is None:
            data = np.zeros(self.schema.shape)
        querynames = self.getCustomQuerynames(querynames)
        return getTable(data, self.schema, querynames).toDF()
Example #24
def getDASRunsFlat(data_paths, schema_name, budget_group=None, run_id=None):
    data_paths = du.aslist(data_paths)
    dasruns = [
        DASRunFlat(x, schema_name, budget_group, run_id) for x in data_paths
    ]
    return dasruns