def test_processPersonTable(reader_instance, spark: SparkSession, dd_das_stub):
    """
    Feed a four-row person table through the table class named in the config
    and compare the result for each (ga, gb) key against a known 2x2 array.
    """
    geo_vars = ('ga', 'gb')
    hist_vars = ('a', 'b')
    columns = geo_vars + hist_vars

    # Each row is (ga, gb, a, b)
    rows = [
        ("1", "2", 1, 0),
        ("1", "2", 1, 0),
        ("1", "2", 1, 1),
        ("2", "1", 1, 1),
    ]
    partition_count = 10

    r = reader_instance
    r.num_reader_partitions = 8

    # Reader options describing the table layout and every variable in it
    reader_options = (
        ("PersonData.variables", "ga gb a b"),
        ("PersonData.geography", "ga gb"),
        ("ga.type", "str"),
        ("ga.legal", "0-9"),
        ("gb.type", "str"),
        ("gb.legal", "0-9"),
        ("PersonData.histogram", "a b"),
        ("a.type", "int"),
        ("b.type", "int"),
        ("a.legal", "0,1"),
        ("b.legal", "0,1"),
    )
    for option, value in reader_options:
        r.config.set(READER, option, value)

    # Instantiate whichever table class the config points at
    table_class = das_utils.class_from_config(r.config, f"PersonData.{TABLE_CLASS}", READER)
    table = table_class(name="PersonData", das=dd_das_stub, config=r.config, reader_instance=r)
    table.data_shape = (2, 2)

    frame = spark.createDataFrame(rows, columns).repartition(partition_count)
    nodes = dict(table.process(frame).collect())

    # Expected array per (ga, gb) key, after conversion with das_utils.npArray
    expected = {
        ("1", "2"): np.array([[0, 0], [2, 1]]),
        ("2", "1"): np.array([[0, 0], [0, 1]]),
    }
    for geo_key, histogram in expected.items():
        assert np.array_equal(das_utils.npArray(nodes[geo_key], (2, 2)), histogram)
    def make_variables(self) -> typing.List[FWFTableVariable]:
        """
        Build the list of FWFTableVariable objects for this table from the
        generated specification module.

        The options "<name>.generated_table" and "<name>.generated_module"
        (section C.READER) give the table name and the module to load. As a
        side effect, the attribute of that module named after the table is
        stored in self.generated_class.

        :return: one FWFTableVariable per entry in the table's "variables" spec
        """
        module_option = f"{self.name}.generated_module"
        table_option = f"{self.name}.generated_table"

        table_name = self.getconfig(table_option, section=C.READER)
        spec_module = das_utils.class_from_config(self.config, module_option, C.READER)

        self.generated_class = getattr(spec_module, table_name)
        table_spec = spec_module.SPEC_DICT["tables"][table_name]

        # Phase 1: construct every variable from its spec entry
        variables = []
        for entry in table_spec["variables"]:
            variables.append(
                FWFTableVariable(
                    entry["name"].lower(),
                    column=entry["column"],
                    width=entry["width"],
                    vtype=entry["vtype"],
                    legal=entry["ranges"],
                )
            )

        # Phase 2: run the type / legal-value setters on each constructed variable
        for variable in variables:
            variable.set_vtype(variable.vtype)
            variable.set_legal_values_from_ranges(variable.legal_values)

        return variables
Ejemplo n.º 3
0
    def __init__(self, *, reader_instance: 'reader', **kwargs):
        """
        Configure the table from its config options.

        Sets the table's location, variables, recode variables, CSV format
        (copied from the reader, optionally with a table-specific delimiter),
        geography / histogram variable names and, when configured, the recoder.

        :param reader_instance: the reader this table belongs to; its
            csv_file_format is copied as the starting point for this table.
        :param kwargs: passed unchanged to the parent constructor.
        :raises TypeError: if the configured recoder cannot be created with
            the collected arguments; the original error is chained as the cause.
        """
        super().__init__(**kwargs)

        self.reader = reader_instance

        # Earlier implementation, kept for reference:
        #self.location = [os.path.expandvars(x) for x in re.split(CC.REGEX_CONFIG_DELIM, self.getconfigwsec(CC.PATH)) if len(x)>0]
        self.location = list(self.gettuplewsec(CC.PATH))

        self.variables = self.make_variables()

        self.recode_variables = [
            TableVariable(var_name).make_from_config(self.config)
            for var_name in self.gettuplewsec(CC.RECODE_VARS, default=())
        ]

        # Copy so that per-table changes below do not touch the reader's dict
        self.csv_file_format = self.reader.csv_file_format.copy()

        # If we want these distinct for each table, then the option should include table name
        try:
            self.csv_file_format["sep"] = self.getconfig(
                f"{self.name}.{CC.DELIMITER}", section=CC.READER)
        except NoOptionError:
            # No table-specific delimiter configured: keep the reader's value
            pass

        self.csv_file_format['schema'] = self.set_schema()

        self.geography_variables = self.gettuplewsec(CC.GEOGRAPHY)
        self.histogram_variables = self.gettuplewsec(CC.HISTOGRAM)

        # Finally, set up the recoder, if there is any. This must be done in __init__() so the recoder is included in the BOM.

        recoder_name = f"{self.name}.{CC.PRE_RECODER}"
        if not self.config.has_option(CC.READER, recoder_name):
            self.recoder = None
        else:
            # One positional argument per recode variable: the tuple stored
            # under that variable's name in the reader section
            args = [
                self.gettuple(var.name, section=CC.READER, sep=" ")
                for var in self.recode_variables
            ]
            # With the new-recoder flag set, the recode variables themselves
            # are appended as one extra trailing argument
            if self.getboolean(f"{self.name}.{CC.NEWRECODER}",
                               section=CC.READER,
                               default=False):
                args = args + [self.recode_variables]
            try:
                self.recoder = das_utils.class_from_config(
                    self.config, recoder_name, CC.READER)(*args)
            except TypeError as err:
                # Re-raise with table context, explicitly chaining the original
                # error so the traceback shows it as the direct cause
                raise TypeError(
                    f"Table {self.name} failed to create recoder, arguments: {args}, Error: {err.args}"
                ) from err
Ejemplo n.º 4
0
def test_class_from_config():
    """Check the errors raised by class_from_config and one successful lookup."""
    # Option that is absent from the section
    with pytest.raises(KeyError) as missing_key:
        class_from_config(config, "aaa", "A")
    assert 'Key "aaa" in config section [A] not found' in str(missing_key.value)

    # Option whose module cannot be imported
    with pytest.raises(ImportError) as bad_module:
        class_from_config(config, "aa", "A")
    assert 'Module aaa.bbb import failed.\nCurrent directory' in str(bad_module.value)

    # Option for which the attribute lookup fails
    with pytest.raises(AttributeError) as bad_attribute:
        class_from_config(config, "bb", "B")
    assert "[B]/bb option" in str(bad_attribute.value)

    # Successful lookup.
    # NOTE(review): this compares type(...) of both objects, not the objects
    # themselves; if class_from_config returns a class, only the metaclasses
    # are compared. Confirm whether an identity check was intended.
    from das_framework.driver import AbstractDASModule as adm
    loaded = class_from_config(config, "adm", "B")
    assert type(loaded) == type(adm)
    def __init__(self, **kwargs):
        """
        Set up the reader: CSV format, partitioning options, one table object
        per configured table name, table shapes, invariant / constraint names
        for the bottom geographic level, and the geocode dictionary.

        When the privacy table's shape or histogram variables differ from the
        values held by self.setup, the setup values are overwritten with the
        table's values (a warning is issued for the shape only).

        :param kwargs: passed unchanged to the parent constructor.
        """
        super().__init__(**kwargs)
        assert self.config

        # CSV parsing options shared by the tables
        header = self.getboolean(C.HEADER)
        delimiter = self.getconfig(C.DELIMITER)
        comment = self.getconfig(C.CSV_COMMENT, default='#')
        self.csv_file_format = {"header": header, "sep": delimiter, "comment": comment}

        # Partitioning and timing options
        self.num_reader_partitions = self.getint(C.NUM_READER_PARTITIONS, default=100)
        self.range_partition = self.getboolean(C.RANGE_PARTITION, default=False)
        self.reader_partition_len = self.getint(C.READER_PARTITION_LEN, default=11)
        self.measure_rdd_times = self.getboolean(C.MEASURE_RDD_TIMES, default=False)

        table_names = self.gettuple(C.TABLES, sep=" ")

        logging.info("building table infrastructure")
        logging.debug("table names %s", table_names)
        logging.debug("Reading table module and class names from config")

        # Create the reader classes for each table.
        # They are subclasses of the AbstractDASModule, so they need to be properly setup
        tables = {}
        for name in table_names:
            table_class = das_utils.class_from_config(self.config, f"{name}.{C.TABLE_CLASS}", C.READER)
            tables[name] = table_class(name=name, config=self.config, reader_instance=self, das=self.das)
        self.tables = tables

        # Work out recode variables and their dimensions for every table, which
        # determines the histogram variables and the shape
        self.shape_dict = {}
        for table in self.tables.values():
            logging.info(f"recode meta for table {table.name}")
            table.recode_meta_update()
            table.set_shape()
            self.shape_dict[table.name] = table.data_shape

        # Invariant and constraint names for the bottom geographical level
        bottom_level: str = self.setup.levels[0]
        bottom_inv_con: Dict[str, List[str]] = self.setup.inv_con_by_level[bottom_level]
        self.invar_names = bottom_inv_con["invar_names"]
        self.cons_names = bottom_inv_con["cons_names"]

        # Table names: the privacy table (Person or Household) plus the
        # constraint tables. Several constraint tables are allowed, but only
        # the first one (the Unit table) is singled out.
        self.privacy_table_name = self.getconfig(C.PTABLE).strip()
        self.constraint_tables = self.gettuple(C.CTABLES)
        self.constraint_table_name = self.constraint_tables[0]
        self.data_names = [self.privacy_table_name, *self.constraint_tables]

        # Shape of the person histogram (save it in setup object for further use)
        schema_shape = self.setup.hist_shape
        privacy_table = self.tables[self.privacy_table_name]
        data_shape = privacy_table.data_shape
        if schema_shape != data_shape:
            msg = (f"The histogram shape set in config file {data_shape} that the data read is different from "
                   f"the shape of schema {self.setup.schema} {schema_shape}")
            warnings.warn(msg)
            self.log_warning_and_print(msg)
            self.setup.hist_shape = data_shape

        # Save the privacy table's histogram variables in setup. Unlike the
        # shape check above, a mismatch here is corrected without any warning.
        if self.setup.hist_vars != privacy_table.histogram_variables:
            self.setup.hist_vars = privacy_table.histogram_variables

        # Create geocode dict: geolevel length (as int) -> geolevel name
        geolevel_lengths = self.gettuple(C.GEODICT_LENGTHS, section=C.GEODICT)
        assert len(geolevel_lengths) == len(self.setup.levels), "Geolevel names and geolevel lengths differ in size"
        geocode_dict = {}
        for level_name, level_length in zip(self.setup.levels, geolevel_lengths):
            geocode_dict[int(level_length)] = level_name
        self.geocode_dict = geocode_dict
Ejemplo n.º 6
0
    def __init__(self, **kwargs):
        """
        Set up the reader from its config options.

        Reads the CSV format and partitioning options, creates one table
        object per configured table name, records each table's shape, picks up
        the invariant / constraint names for the bottom geographic level, and
        reconciles the privacy table's shape and histogram variables with the
        values held by self.setup (warning and overwriting the setup values on
        a mismatch). Finally takes a copy of the setup's geocode dict.

        :param kwargs: passed unchanged to the parent constructor.
        """
        super().__init__(**kwargs)
        assert self.config

        # The CSV comment option is optional; fall back to None when it is absent
        try:
            comment_start = self.getconfig(CC.CSV_COMMENT)
        except NoOptionError:
            comment_start = None

        # CSV parsing options shared by the tables
        self.csv_file_format = {
            "header": self.getboolean(CC.HEADER),
            "sep": self.getconfig(CC.DELIMITER),
            "comment": comment_start
        }

        # Partitioning and timing options, each with a default
        self.num_reader_partitions = self.getint(CC.NUM_READER_PARTITIONS,
                                                 default=100)
        self.range_partition = self.getboolean(CC.RANGE_PARTITION,
                                               default=False)
        self.reader_partition_len = self.getint(CC.READER_PARTITION_LEN,
                                                default=11)
        self.measure_rdd_times = self.getboolean(CC.MEASURE_RDD_TIMES,
                                                 default=False)

        # The [reader] section of the config file specifies all of the tables to read in.
        # For each table a path and a class are provided.
        # [reader]
        # PersonData.path: $DAS_S3ROOT/title13_input_data/table8/ri44.txt
        # PersonData.class: programs.reader.sql_spar_table.SQLSparseHistogramTable
        #

        # This gets all of the table names:
        table_names = self.gettuple(CC.TABLES, sep=" ")

        self.annotate("building table infrastructure")
        self.annotate(f'table names {table_names}')
        self.annotate("Reading table module and class names from config")

        # Create the reader classes for each table.
        # They are subclasses of the AbstractDASModule, so they need to be properly setup
        # They are stored in self.tables.
        # The class definitions of the table classes are in sql_spar_table.py and spar_table.py
        self.tables = {
            name: das_utils.class_from_config(self.config,
                                              f"{name}.{CC.TABLE_CLASS}",
                                              CC.READER)(name=name,
                                                         config=self.config,
                                                         reader_instance=self,
                                                         das=self.das)
            for name in table_names
        }

        # Maps table name -> that table's data_shape
        self.shape_dict = {}

        # Find out recode variables and their dimensions to know the set of histogram variables and its shape
        for table in self.tables.values():
            logging.info(f"recode meta for table {table.name}")
            table.recode_meta_update()
            table.set_shape()
            self.shape_dict[table.name] = table.data_shape

        # Bottom geographical level
        bottom: str = self.setup.levels[0]

        # Invariants and Constraints for the bottom level
        ic_bottom: Dict[str, List[str]] = self.setup.inv_con_by_level[bottom]

        # Get invariants names from setup
        self.invar_names = ic_bottom["invar_names"]

        # Get constraints names from setup
        self.cons_names = ic_bottom["cons_names"]

        # Get the names of tables
        self.privacy_table_name = self.getconfig(
            CC.PTABLE).strip()  # Person or Household table
        self.constraint_tables = self.gettuple(
            CC.CTABLES
        )  # Allows for extra generality, but we only need one additional table for invariants/constraints, i.e.
        self.constraint_table_name = self.constraint_tables[0]  # Unit table
        self.data_names = [self.privacy_table_name] + list(
            self.constraint_tables)

        # Shape of the person histogram (save it in setup object for further use)
        # On a mismatch, warn and let the table's shape override the setup's
        if self.setup.hist_shape != self.tables[
                self.privacy_table_name].data_shape:
            msg = (
                f"The histogram shape set in config file {self.tables[self.privacy_table_name].data_shape} that the data read is different from "
                f"the shape of schema {self.setup.schema} {self.setup.hist_shape}"
            )
            warnings.warn(msg)
            self.log_warning_and_print(msg)
            self.setup.hist_shape = self.tables[
                self.privacy_table_name].data_shape

        # Save person tables histogram variables in setup
        # Both sides are converted to tuples, so a list and a tuple holding
        # the same names compare equal
        if tuple(self.setup.hist_vars) != tuple(
                self.tables[self.privacy_table_name].histogram_variables):
            msg = f"The histogram variables set in config file {self.tables[self.privacy_table_name].histogram_variables} that the data read are " \
                f"different from the variables of schema {self.setup.schema} {self.setup.hist_vars}"
            warnings.warn(msg)
            self.log_warning_and_print(msg)
            self.setup.hist_vars = self.tables[
                self.privacy_table_name].histogram_variables

        # Get geocode dict from setup object
        # (a copy, so this attribute can be rebound or edited without changing the setup's dict)
        self.modified_geocode_dict = self.setup.geocode_dict.copy()