Ejemplo n.º 1
0
    def test_makeHistRowsFromMultiSparse(self, persons, as_dict, recode):
        """Check that makeHistRowsFromMultiSparse yields one row per person.

        Args:
            persons: input person records; the last element of each record
                is dropped before it is compared with the produced rows.
            as_dict: when true, the node is converted with ``toDict``
                (keeping SYN, GEOCODE and INVAR) before rows are generated.
            recode: when true, rows are produced through
                ``DHCPHHGQToMDFPersons2020Recoder`` and only the row count is
                checked; otherwise the produced rows are compared with the
                input records as an unordered collection.
        """
        # Local import so the method does not depend on the file's import block.
        from collections import Counter

        node = self.makeNode(persons)
        if as_dict:
            node = node.toDict(keep_attrs=(SYN, GEOCODE, INVAR))

        if recode:
            rows = makeHistRowsFromMultiSparse(
                node, self.schema, row_recoder=DHCPHHGQToMDFPersons2020Recoder)
            # Recoded rows use a different layout, so only the count is checked.
            assert len(rows) == len(persons)
            return

        rows = makeHistRowsFromMultiSparse(node,
                                           self.schema,
                                           add_schema_name=False)
        assert len(rows) == len(persons)

        # Order is not guaranteed and duplicates are legitimate, so compare
        # the two collections as multisets of "|"-joined field values.
        expected = Counter("|".join(map(str, row[:-1])) for row in persons)
        produced = Counter(
            "|".join([row[var] for var in self.schema.dimnames])
            for row in rows)
        assert produced == expected
Ejemplo n.º 2
0
    def test_makeHistRowsFromMultiSparse(self, hholds, units, as_dict, recode):
        """Check row generation for households and, when recoding, for units.

        Args:
            hholds: input household records; the last element of each record
                is dropped before it is compared with the produced rows.
            units: input unit records; only their count is used, as the
                expected number of rows after ``addEmptyAndGQ``.
            as_dict: when true, the node is converted with ``toDict``
                (keeping SYN, GEOCODE and INVAR) before rows are generated.
            recode: when true, rows are produced through
                ``Household2010ToMDFUnit2020Recoder`` and only row counts are
                checked; otherwise the produced rows are compared with the
                household records as an unordered collection.
        """
        # Local import so the method does not depend on the file's import block.
        from collections import Counter

        node = self.makeNode(hholds, units)
        if as_dict:
            node = node.toDict(keep_attrs=(SYN, GEOCODE, INVAR))

        if recode:
            rows = makeHistRowsFromMultiSparse(
                node,
                self.schema,
                row_recoder=Household2010ToMDFUnit2020Recoder)
            assert len(rows) == len(hholds)
            rows = addEmptyAndGQ(node,
                                 self.schema,
                                 rows,
                                 row_recoder=Household2010ToMDFUnit2020Recoder)
            # After addEmptyAndGQ the row count must equal the unit count.
            assert len(rows) == len(units)
            return

        rows = makeHistRowsFromMultiSparse(node,
                                           self.schema,
                                           add_schema_name=False)
        assert len(rows) == len(hholds)

        # Order is not guaranteed and duplicates are legitimate, so compare
        # the two collections as multisets of "|"-joined field values.
        expected = Counter("|".join(map(str, row[:-1])) for row in hholds)
        produced = Counter(
            "|".join([row[var] for var in self.schema.dimnames])
            for row in rows)
        assert produced == expected
Ejemplo n.º 3
0
 def node2SparkRows(node: dict):
     """Build the person rows for a node that is supplied as a dict.

     Only the SYN and GEOCODE entries of ``node`` are forwarded to
     ``makeHistRowsFromMultiSparse``, together with ``schema`` and
     ``self.row_recoder`` taken from the enclosing scope.
     """
     # The node is already a dict, so no toDict() call is needed here;
     # just pick out the entries that are used.
     trimmed = {key: node[key] for key in (SYN, GEOCODE)}
     return makeHistRowsFromMultiSparse(trimmed,
                                        schema,
                                        row_recoder=self.row_recoder)
Ejemplo n.º 4
0
 def node2SparkRows(node: GeounitNode):
     """Build the person rows for one geounit node.

     The node is reduced to a dict holding its SYN, INVAR and GEOCODE
     attributes, which is then passed to ``makeHistRowsFromMultiSparse``
     with ``schema``, ``self.row_recoder`` and ``inverted_geodict`` from the
     enclosing scope.
     """
     kept_attrs = (SYN, INVAR, GEOCODE)
     as_dict = node.toDict(kept_attrs)
     return makeHistRowsFromMultiSparse(as_dict,
                                        schema,
                                        row_recoder=self.row_recoder,
                                        geocode_dict=inverted_geodict)
Ejemplo n.º 5
0
 def test_makeHistRowsFromMultiSparseRecode(self, persons):
     """Recoded output must contain exactly one row per input person."""
     node_dict = self.makeNode(persons).toDict(
         keep_attrs=(SYN, GEOCODE, INVAR))
     recoded = makeHistRowsFromMultiSparse(
         node_dict,
         self.schema,
         row_recoder=DHCPHHGQToMDFPersons2020Recoder)
     # TODO: Consider testing the MDF spec output here, perhaps on a single
     #   case; alternatively leave that to the writer test.
     assert len(recoded) == len(persons)
Ejemplo n.º 6
0
 def node2SparkRows(node: GeounitNode):
     """Build the unit rows for one geounit node.

     Household rows are generated first with ``makeHistRowsFromMultiSparse``;
     ``addEmptyAndGQ`` then receives those rows plus the same node dict and
     its result is returned. ``schema``, ``self.row_recoder`` and
     ``inverted_geodict`` come from the enclosing scope.
     """
     kept_attrs = (SYN, INVAR, GEOCODE)
     as_dict = node.toDict(kept_attrs)
     hh_rows = makeHistRowsFromMultiSparse(as_dict,
                                           schema,
                                           row_recoder=self.row_recoder)
     return addEmptyAndGQ(as_dict,
                          schema,
                          hh_rows,
                          row_recoder=self.row_recoder,
                          gqtype_recoder=HHGQUnitDemoProductAttr.das2mdf,
                          geocode_dict=inverted_geodict)
Ejemplo n.º 7
0
        def node2SparkRows(node: dict):
            """Build the unit rows for a node that is supplied as a dict.

            A fresh dict with only the SYN, GEOCODE and INVAR entries is
            assembled; the invariants are read from ``node[INVAR]`` when that
            key is present and from ``node['_invar']`` otherwise. Household
            rows are generated from it and then handed to ``addEmptyAndGQ``,
            whose result is returned.
            """
            # The node already arrives as a dict; copy over only the entries
            # that are used and leave everything else behind.
            trimmed = {SYN: node[SYN], GEOCODE: node[GEOCODE]}
            if INVAR in node:
                trimmed[INVAR] = node[INVAR]
            else:
                trimmed[INVAR] = node['_invar']

            hh_rows = makeHistRowsFromMultiSparse(trimmed,
                                                  schema,
                                                  row_recoder=self.row_recoder)
            return addEmptyAndGQ(trimmed,
                                 schema,
                                 hh_rows,
                                 row_recoder=self.row_recoder,
                                 gqtype_recoder=gqtype_recoder)
Ejemplo n.º 8
0
 def test_makeHistRowsFromMultiSparseRecode(self, hholds, units):
     """Recoded rows must match the household count, then the unit count.

     The first count is checked right after ``makeHistRowsFromMultiSparse``
     and the second after ``addEmptyAndGQ`` has processed those rows.
     """
     node = self.makeNode(hholds, units)
     node_dict = node.toDict(keep_attrs=(SYN, GEOCODE, INVAR))
     hh_rows = makeHistRowsFromMultiSparse(
         node_dict,
         self.schema,
         row_recoder=Household2010ToMDFUnit2020Recoder)
     # TODO: Consider testing the MDF spec output here, perhaps on a single
     #   case; alternatively leave that to the writer test.
     assert len(hh_rows) == len(hholds)

     # The node object itself (not its dict form) is passed on here.
     unit_rows = addEmptyAndGQ(node,
                               self.schema,
                               hh_rows,
                               row_recoder=Household2010ToMDFUnit2020Recoder)
     assert len(unit_rows) == len(units)
Ejemplo n.º 9
0
 def node2SparkRows(node: GeounitNode):
     """Convert one geounit node into a list of ``Row`` objects, one per unit.

     The node is reduced to a dict holding its SYN, INVAR and GEOCODE
     attributes. Household rows are generated from it with
     ``makeHistRowsFromMultiSparse`` and passed through ``addGroupQuarters``.
     Each resulting unit is then emitted as a ``Row`` whose fields are
     ``self.var_list`` followed by ``'priv'``, in that order.

     ``schema``, ``self`` and ``inverted_geodict`` come from the enclosing
     scope.
     """
     nodedict = node.toDict((SYN, INVAR, GEOCODE))
     households = makeHistRowsFromMultiSparse(
         nodedict,
         schema,
         row_recoder=self.row_recoder,
         geocode_dict=inverted_geodict,
         microdata_field=None)
     units = addGroupQuarters(nodedict,
                              schema,
                              households,
                              row_recoder=self.row_recoder,
                              geocode_dict=inverted_geodict,
                              to_microdata=False)
     # 'priv' means "protected via the differential privacy routines in this
     # code base"; variable to be renamed after P.L.94-171 production.
     ordered_cols = self.var_list + ['priv']
     # The field-name template is the same for every unit, so build it once
     # instead of once per row.
     # NOTE(review): assumes Row is pyspark.sql.Row, where Row(*names) acts
     # as a reusable row factory - confirm against the file's imports.
     make_row = Row(*ordered_cols)
     return [
         make_row(*[unit[col] for col in ordered_cols])
         for unit in units
     ]