def test_processPersonTable(reader_instance, spark: SparkSession, dd_das_stub):
    """Person-table processing should aggregate rows into one (2,2) histogram per geocode."""
    rows = [
        ("1", "2", 1, 0),
        ("1", "2", 1, 0),
        ("1", "2", 1, 1),
        ("2", "1", 1, 1),
    ]
    partitions = 10
    geo_vars = ('ga', 'gb',)
    hist_vars = ('a', 'b',)
    all_vars = geo_vars + hist_vars

    rdr = reader_instance
    rdr.num_reader_partitions = 8
    # Describe the PersonData table to the reader: two string geography
    # variables and two binary histogram variables.
    for option, value in (
        ("PersonData.variables", "ga gb a b"),
        ("PersonData.geography", "ga gb"),
        ("ga.type", "str"),
        ("ga.legal", "0-9"),
        ("gb.type", "str"),
        ("gb.legal", "0-9"),
        ("PersonData.histogram", "a b"),
        ("a.type", "int"),
        ("b.type", "int"),
        ("a.legal", "0,1"),
        ("b.legal", "0,1"),
    ):
        rdr.config.set(READER, option, value)

    table_cls = das_utils.class_from_config(rdr.config, f"PersonData.{TABLE_CLASS}", READER)
    table = table_cls(name="PersonData", das=dd_das_stub, config=rdr.config, reader_instance=rdr)
    table.data_shape = (2, 2)

    frame = spark.createDataFrame(rows, all_vars).repartition(partitions)
    nodes = dict(table.process(frame).collect())

    # Geocode ("1","2") has rows (1,0),(1,0),(1,1); geocode ("2","1") has (1,1).
    assert np.array_equal(das_utils.npArray(nodes[("1", "2")], (2, 2)), np.array([[0, 0], [2, 1]]))
    assert np.array_equal(das_utils.npArray(nodes[("2", "1")], (2, 2)), np.array([[0, 0], [0, 1]]))
def make_variables(self) -> typing.List[FWFTableVariable]:
    """
    Dynamically create variables based on the generated specification file
    """
    # Config keys naming the generated module and the table entry inside it.
    module_key = f"{self.name}.generated_module"
    table_key = f"{self.name}.generated_table"

    table_name = self.getconfig(table_key, section=C.READER)
    generated_module = das_utils.class_from_config(self.config, module_key, C.READER)
    self.generated_class = getattr(generated_module, table_name)

    # The spec dict carries one entry per fixed-width-file variable.
    table_spec = generated_module.SPEC_DICT["tables"][table_name]
    variables = []
    for entry in table_spec["variables"]:
        variables.append(
            FWFTableVariable(entry["name"].lower(),
                             column=entry["column"],
                             width=entry["width"],
                             vtype=entry["vtype"],
                             legal=entry["ranges"]))

    # Second pass: resolve declared types and legal-value ranges on each variable.
    for var in variables:
        var.set_vtype(var.vtype)
        var.set_legal_values_from_ranges(var.legal_values)
    return variables
def __init__(self, *, reader_instance: 'reader', **kwargs):
    """Set up one table: input locations, variables, CSV format, geography/histogram
    variable lists, and the optional pre-recoder.

    :param reader_instance: the reader module instance that owns this table
    :raises TypeError: if the configured recoder class cannot be constructed
        with the arguments derived from the recode-variable config lines
    """
    super().__init__(**kwargs)
    self.reader = reader_instance
    # Input file locations for this table (tuple from config, kept as a list).
    self.location = list(self.gettuplewsec(CC.PATH))
    self.variables = self.make_variables()
    # Variables produced by the (optional) recoder, each configured individually.
    self.recode_variables = [
        TableVariable(var_name).make_from_config(self.config)
        for var_name in self.gettuplewsec(CC.RECODE_VARS, default=())
    ]
    # Start from the reader-wide CSV format; a per-table delimiter may override "sep".
    # If we want these distinct for each table, then the option should include table name
    self.csv_file_format = self.reader.csv_file_format.copy()
    try:
        self.csv_file_format["sep"] = self.getconfig(
            f"{self.name}.{CC.DELIMITER}", section=CC.READER)
    except NoOptionError:
        pass  # no per-table delimiter configured; keep the reader-wide one
    self.csv_file_format['schema'] = self.set_schema()
    self.geography_variables = self.gettuplewsec(CC.GEOGRAPHY)
    self.histogram_variables = self.gettuplewsec(CC.HISTOGRAM)
    # Finally, set up the recoder, if there is any. This must be done in __init__()
    # so the recoder is included in the BOM.
    recoder_name = f"{self.name}.{CC.PRE_RECODER}"
    if not self.config.has_option(CC.READER, recoder_name):
        self.recoder = None
    else:
        # Each recode variable's config line lists the input variables it is built from.
        args = [
            self.gettuple(var.name, section=CC.READER, sep=" ")
            for var in self.recode_variables
        ]
        if self.getboolean(f"{self.name}.{CC.NEWRECODER}", section=CC.READER, default=False):
            # New-style recoders additionally receive the recode variable objects.
            args = args + [self.recode_variables]
        try:
            self.recoder = das_utils.class_from_config(
                self.config, recoder_name, CC.READER)(*args)
        except TypeError as err:
            # Chain the original exception so the mismatched recoder signature
            # is visible in the traceback.
            raise TypeError(
                f"Table {self.name} failed to create recoder, arguments: {args}, Error: {err.args}"
            ) from err
def test_class_from_config():
    """class_from_config: missing key, failed module import, missing attribute, and success."""
    with pytest.raises(KeyError) as err:
        class_from_config(config, "aaa", "A")
    assert 'Key "aaa" in config section [A] not found' in str(err.value)

    with pytest.raises(ImportError) as err:
        class_from_config(config, "aa", "A")
    assert 'Module aaa.bbb import failed.\nCurrent directory' in str(err.value)

    with pytest.raises(AttributeError) as err:
        class_from_config(config, "bb", "B")
    assert "[B]/bb option" in str(err.value)

    from das_framework.driver import AbstractDASModule as adm
    # Bug fix: the old `type(...) == type(adm)` only compared metaclasses and
    # would pass for ANY returned class; assert the exact class object instead.
    assert class_from_config(config, "adm", "B") is adm
def __init__(self, **kwargs):
    """Build the reader: CSV format options, per-table reader objects, table shapes,
    invariant/constraint names for the bottom geolevel, and the geocode dict.

    Reads everything from the [reader] config section via the AbstractDASModule
    accessors (getconfig/getint/getboolean/gettuple).
    """
    super().__init__(**kwargs)
    assert self.config
    # Spark CSV options shared by all tables; '#' is the default comment marker.
    self.csv_file_format = {
        "header": self.getboolean(C.HEADER),
        "sep": self.getconfig(C.DELIMITER),
        "comment": self.getconfig(C.CSV_COMMENT, default='#')
    }
    self.num_reader_partitions = self.getint(C.NUM_READER_PARTITIONS, default=100)
    self.range_partition = self.getboolean(C.RANGE_PARTITION, default=False)
    self.reader_partition_len = self.getint(C.READER_PARTITION_LEN, default=11)
    self.measure_rdd_times = self.getboolean(C.MEASURE_RDD_TIMES, default=False)
    table_names = self.gettuple(C.TABLES, sep=" ")
    logging.info("building table infrastructure")
    logging.debug("table names %s", table_names)
    logging.debug("Reading table module and class names from config")
    # Create the reader classes for each table.
    # They are subclasses of the AbstractDASModule, so they need to be properly setup
    self.tables = {name: das_utils.class_from_config(self.config, f"{name}.{C.TABLE_CLASS}", C.READER)(
        name=name, config=self.config, reader_instance=self, das=self.das) for name in table_names}
    self.shape_dict = {}
    # Find out recode variables and their dimensions to know the set of histogram variables and its shape
    for table in self.tables.values():
        logging.info(f"recode meta for table {table.name}")
        table.recode_meta_update()
        table.set_shape()
        self.shape_dict[table.name] = table.data_shape
    # Bottom geographical level
    bottom: str = self.setup.levels[0]
    # Invariants and Constraints for the bottom level
    ic_bottom: Dict[str, List[str]] = self.setup.inv_con_by_level[bottom]
    # Get invariants names from setup
    self.invar_names = ic_bottom["invar_names"]
    # Get constraints names from setup
    self.cons_names = ic_bottom["cons_names"]
    # Get the names of tables
    self.privacy_table_name = self.getconfig(C.PTABLE).strip()  # Person or Household table
    # it allows for extra-generality, but we only need one additional table for
    # invariants/constaints, i.e. the unit table below
    self.constraint_tables = self.gettuple(C.CTABLES)
    self.constraint_table_name = self.constraint_tables[0]  # Unit table
    self.data_names = [self.privacy_table_name] + list(self.constraint_tables)
    # Shape of the person histogram (save it in setup object for further use).
    # Warn on mismatch, then trust the shape derived from the data that was read.
    if self.setup.hist_shape != self.tables[self.privacy_table_name].data_shape:
        msg = (f"The histogram shape set in config file {self.tables[self.privacy_table_name].data_shape} that the data read is different from "
               f"the shape of schema {self.setup.schema} {self.setup.hist_shape}")
        warnings.warn(msg)
        self.log_warning_and_print(msg)
        self.setup.hist_shape = self.tables[self.privacy_table_name].data_shape
    # Save person tables histogram variables in setup.
    # NOTE(review): this comparison may be tuple-vs-list and thus spuriously
    # unequal; harmless here since the branch only re-assigns the value.
    if self.setup.hist_vars != self.tables[self.privacy_table_name].histogram_variables:
        # msg = f"The histogram variables set in config file {self.tables[self.privacy_table_name].histogram_variables} that the data read are " \
        #       f"different from the variables of schema {self.setup.schema} {self.setup.hist_vars}"
        # warnings.warn(msg)
        # self.log_warning_and_print(msg)
        self.setup.hist_vars = self.tables[self.privacy_table_name].histogram_variables
    # Create geocode dict: maps geocode string length -> geolevel name.
    geolevel_leng = self.gettuple(C.GEODICT_LENGTHS, section=C.GEODICT)
    assert len(geolevel_leng) == len(self.setup.levels), "Geolevel names and geolevel lengths differ in size"
    self.geocode_dict = {int(gl_length): gl_name for gl_name, gl_length in zip(self.setup.levels, geolevel_leng)}
def __init__(self, **kwargs):
    """Build the reader: CSV format options, per-table reader objects, table shapes,
    invariant/constraint names for the bottom geolevel, and the geocode dict
    (copied from the setup object).

    Reads everything from the [reader] config section via the AbstractDASModule
    accessors (getconfig/getint/getboolean/gettuple).
    """
    super().__init__(**kwargs)
    assert self.config
    # Comment marker for CSV files; None (no comment stripping) when not configured.
    try:
        comment_start = self.getconfig(CC.CSV_COMMENT)
    except NoOptionError:
        comment_start = None
    # Spark CSV options shared by all tables.
    self.csv_file_format = {
        "header": self.getboolean(CC.HEADER),
        "sep": self.getconfig(CC.DELIMITER),
        "comment": comment_start
    }
    self.num_reader_partitions = self.getint(CC.NUM_READER_PARTITIONS, default=100)
    self.range_partition = self.getboolean(CC.RANGE_PARTITION, default=False)
    self.reader_partition_len = self.getint(CC.READER_PARTITION_LEN, default=11)
    self.measure_rdd_times = self.getboolean(CC.MEASURE_RDD_TIMES, default=False)
    # The [reader] section of the config file specifies all of the tables to read in.
    # For each table a path and a class are provided.
    # [reader]
    # PersonData.path: $DAS_S3ROOT/title13_input_data/table8/ri44.txt
    # PersonData.class: programs.reader.sql_spar_table.SQLSparseHistogramTable
    #
    # This gets all of the table names:
    table_names = self.gettuple(CC.TABLES, sep=" ")
    self.annotate("building table infrastructure")
    self.annotate(f'table names {table_names}')
    self.annotate("Reading table module and class names from config")
    # Create the reader classes for each table.
    # They are subclasses of the AbstractDASModule, so they need to be properly setup
    # They are stored in self.tables.
    # The class definitions of the table classes are in sql_spar_table.py and spar_table.py
    self.tables = {
        name: das_utils.class_from_config(self.config, f"{name}.{CC.TABLE_CLASS}",
                                          CC.READER)(name=name,
                                                     config=self.config,
                                                     reader_instance=self,
                                                     das=self.das)
        for name in table_names
    }
    self.shape_dict = {}
    # Find out recode variables and their dimensions to know the set of histogram variables and its shape
    for table in self.tables.values():
        logging.info(f"recode meta for table {table.name}")
        table.recode_meta_update()
        table.set_shape()
        self.shape_dict[table.name] = table.data_shape
    # Bottom geographical level
    bottom: str = self.setup.levels[0]
    # Invariants and Constraints for the bottom level
    ic_bottom: Dict[str, List[str]] = self.setup.inv_con_by_level[bottom]
    # Get invariants names from setup
    self.invar_names = ic_bottom["invar_names"]
    # Get constraints names from setup
    self.cons_names = ic_bottom["cons_names"]
    # Get the names of tables
    self.privacy_table_name = self.getconfig(
        CC.PTABLE).strip()  # Person or Household table
    self.constraint_tables = self.gettuple(
        CC.CTABLES
    )  # Allows for extra-generality, but we only need one additional table for invariants/constaints, i.e. the unit table below
    self.constraint_table_name = self.constraint_tables[0]  # Unit table
    self.data_names = [self.privacy_table_name] + list(
        self.constraint_tables)
    # Shape of the person histogram (save it in setup object for further use).
    # Warn on mismatch, then trust the shape derived from the data that was read.
    if self.setup.hist_shape != self.tables[
            self.privacy_table_name].data_shape:
        msg = (
            f"The histogram shape set in config file {self.tables[self.privacy_table_name].data_shape} that the data read is different from "
            f"the shape of schema {self.setup.schema} {self.setup.hist_shape}"
        )
        warnings.warn(msg)
        self.log_warning_and_print(msg)
        self.setup.hist_shape = self.tables[
            self.privacy_table_name].data_shape
    # Save person tables histogram variables in setup.
    # Both sides are converted to tuple so list-vs-tuple differences don't
    # trigger a spurious warning.
    if tuple(self.setup.hist_vars) != tuple(
            self.tables[self.privacy_table_name].histogram_variables):
        msg = f"The histogram variables set in config file {self.tables[self.privacy_table_name].histogram_variables} that the data read are " \
              f"different from the variables of schema {self.setup.schema} {self.setup.hist_vars}"
        warnings.warn(msg)
        self.log_warning_and_print(msg)
        self.setup.hist_vars = self.tables[
            self.privacy_table_name].histogram_variables
    # Get geocode dict from setup object (copy: it is modified later — TODO confirm where)
    self.modified_geocode_dict = self.setup.geocode_dict.copy()