def test_configReaderValidate(s_config, remove_option, msg, errtype, dd_das_stub):
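    # pytest, ConfigParser and the READER section constant are assumed to be
    # imported at module level; s_config, dd_das_stub and the remove_option/
    # msg/errtype parameters are supplied by fixtures or parametrization in
    # the surrounding test module.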
    import programs.reader.table_reader as tr_spark
    import programs.das_setup as ds
    config = ConfigParser()
    config.read_string(s_config)
    config.remove_option(READER, remove_option)
    setup_instance = ds.DASDecennialSetup(config=config, name='setup', das=dd_das_stub)
    with pytest.raises(errtype) as err:
        tr_spark.DASDecennialReader(config=setup_instance.config, setup=setup_instance, name='reader', das=dd_das_stub)
    if errtype == KeyError:
        assert msg.lower() in err.value.args[0].lower()
    else:
        assert msg.lower() in err.value.message.lower()
# Example #2
    def test_transformRDDForSaving(self, spark, dd_das_stub):
        dd_das_stub.reader = get_reader_stub()

        config = ConfigParser()
        config.read_string(self.config)
        import programs.das_setup as ds
        setup_instance = ds.DASDecennialSetup(config=config,
                                              name='setup',
                                              das=dd_das_stub)
        w = MDF2020HouseholdWriter(config=setup_instance.config,
                                   setup=setup_instance,
                                   name='writer',
                                   das=dd_das_stub)

        hholds = hhdata['households']
        units = hhdata['units']
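        # Split the household/unit test data into two geounit nodes; the writer
        # is expected to emit one MDF row per housing unit.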
        node1 = self.makeNode(hholds[:4], units[:4], geocode='0')
        node2 = self.makeNode(hholds[4:], units[4:], geocode='1')
        # Reuse the SparkSession provided by the spark fixture
        node_rdd = spark.sparkContext.parallelize([node1, node2])
        df = w.transformRDDForSaving(node_rdd)
        df.show()

        assert df.count() == len(units)

        for val in df.select('P18').collect():
            assert val['P18'] == 9

        for val in df.select('PAC').collect():
            assert val['PAC'] == '9'

        def len_cond(cond):
            return len(np.where(cond)[0])

        num_gq = len_cond(np.array(units)[:, 0] > 1)

        rtype = np.array(df.select('RTYPE').collect())

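        # RTYPE '4' marks group-quarters facilities and '2' marks housing units
        # in the unit-level MDF, so the counts split by the GQ indicator above.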
        assert len_cond(rtype[:, 0] == '4') == num_gq
        assert len_cond(rtype[:, 0] == '2') == len(units) - num_gq
# Example #3
    def test_transformRDDForSaving(self, spark, dd_das_stub):
        dd_das_stub.reader = get_reader_stub()

        config = ConfigParser()
        config.read_string(self.config)
        import programs.das_setup as ds
        setup_instance = ds.DASDecennialSetup(config=config,
                                              name='setup',
                                              das=dd_das_stub)
        w = MDF2020PersonWriter(config=setup_instance.config,
                                setup=setup_instance,
                                name='writer',
                                das=dd_das_stub)

        persons = pdata['persons']
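        # Split the person test data into two geounit nodes; the writer is
        # expected to emit one MDF row per person.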
        node1 = self.makeNode(persons[:2], geocode='0123456789abcdef')
        node2 = self.makeNode(persons[2:], geocode='0123456789abcdeg')
        # Reuse the SparkSession provided by the spark fixture
        node_rdd = spark.sparkContext.parallelize([node1, node2])
        df = w.transformRDDForSaving(node_rdd)
        df.show()

        assert df.count() == len(persons)

        for val in df.select('EPNUM').collect():
            assert val['EPNUM'] == 999999999

        for val in df.select('RELSHIP').collect():
            assert val['RELSHIP'] == '99'

        def len_cond(cond):
            return len(np.where(cond)[0])

        num_gq = len_cond(np.array(persons)[:, 0] > 0)

        rtype = np.array(df.select('RTYPE').collect())

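        # In the person-level MDF, RTYPE '5' marks persons in group quarters
        # and '3' marks persons in housing units.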
        assert len_cond(rtype[:, 0] == '5') == num_gq
        assert len_cond(rtype[:, 0] == '3') == len(persons) - num_gq
                print(
                    f'section:schema: {str(list(config.items(section=CC.SCHEMA)))}'
                )

                print(
                    f'Converting experiment at (full path) {full_path} to {output_path}'
                )

                # print(f'str(nodes_dict_rdd.take(1)): {str(nodes_dict_rdd.take(1))}')

                das_stub = DASStub()
                das_stub.t0 = time.time()
                das_stub.output_paths = []

                setup_instance = ds.DASDecennialSetup(config=config,
                                                      name='setup',
                                                      das=das_stub)
                if not files_shipped:
                    # setup_func() ships the support files to Spark
                    setup_instance = setup_instance.setup_func()
                    files_shipped = True

                print(f"Reading pickled data: {full_path}")

                nodes_dict_rdd = spark.sparkContext.pickleFile(full_path)

                a_node_dict = nodes_dict_rdd.take(1)[0]
                #if not (experiment.type is PERSON):
                #if INVAR not in a_node_dict and '_invar' not in a_node_dict:
                #        if not invar_loaded:
                #            invar_rdd = spark\
def reader_instance(spark, config, dd_das_stub):
    import programs.reader.table_reader as tr_spark
    import programs.das_setup as ds
    setup_instance = ds.DASDecennialSetup(config=config, name='setup', das=dd_das_stub)
    return tr_spark.DASDecennialReader(config=setup_instance.config, setup=setup_instance, name='reader', das=dd_das_stub)
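
# A minimal sketch (not from the DAS codebase) of exposing reader_instance as a
# pytest fixture; it assumes the spark, config and dd_das_stub fixtures are
# defined elsewhere (e.g. in conftest.py).
import pytest

@pytest.fixture
def reader(spark, config, dd_das_stub):
    return reader_instance(spark, config, dd_das_stub)
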
def old_main(s3path, config_path):
    # Configure logging before the first logging call; otherwise the root
    # logger would get a default handler and this basicConfig would be a no-op.
    logging.basicConfig(
        filename="convert.log",
        format="%(asctime)s %(filename)s:%(lineno)d (%(funcName)s) %(message)s"
    )
    print('Beginning of pickle picker')
    logging.info("Beginning of pickle picker")

    spark = SparkSession.builder.getOrCreate()
    files_shipped = False
    invar_loaded = False
    print(f'Source data: {s3path}')
    print(f'Config file located at: {config_path}')

    config = ConfigParser()
    config.read_string(s3open(config_path).read())
    """
    print(f'existing writer section: {str(list(config.items(section=CC.WRITER_SECTION)))}')
    output_datafile_name = config.get(CC.WRITER_SECTION, CC.OUTPUT_DATAFILE_NAME)
    print(f'section:writer, output_datafile_name: {output_datafile_name}')
    output_path = f'{experiment.folder}_unpickled/{sub_folder}/run_000{str(run_number)}'
    config.set(CC.WRITER_SECTION, CC.OUTPUT_PATH, output_path)
    config.set(CC.WRITER_SECTION, CC.S3CAT, '1')
    config.set(CC.WRITER_SECTION, CC.S3CAT_SUFFIX, '.csv')
    config.set(CC.WRITER_SECTION, CC.OVERWRITE_FLAG, '0')
    config.set(CC.WRITER_SECTION, CC.WRITE_METADATA, '1')
    config.set(CC.WRITER_SECTION, CC.CLASSIFICATION_LEVEL, 'C_U_I//CENS')
    config.set(CC.WRITER_SECTION, CC.NUM_PARTS, '5000')
    print(f'modified writer section: {str(list(config.items(section=CC.WRITER_SECTION)))}')
    print(f'section:schema: {str(list(config.items(section=CC.SCHEMA)))}')

    # print(f'str(nodes_dict_rdd.take(1)): {str(nodes_dict_rdd.take(1))}')



    print(f"Reading pickled data: {s3path}")
    """

    # Ship the files to spark and get the setup object
    das_stub = DASStub()
    das_stub.t0 = time.time()
    das_stub.output_paths = []
    setup = ds.DASDecennialSetup(config=config, name='setup', das=das_stub)
    setup_data = setup.setup_func()
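    # Load the RDD of pickled block-node dictionaries saved by a previous DAS run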
    nodes_dict_rdd = spark.sparkContext.pickleFile(s3path)
    """
    a_node_dict = nodes_dict_rdd.take(1)[0]
    if not (experiment.type is PERSON):
        if INVAR not in a_node_dict and '_invar' not in a_node_dict:
            if not invar_loaded:
                invar_rdd = spark\
                    .sparkContext\
                    .pickleFile('s3://uscb-decennial-ite-das/users/sexto015/experiments/full_household/Sept12_TestMUD_VA_PLB_Experiment/td001/run_0000/data') \
                    .map(lambda nd: (nd[GEOCODE], nd['_invar']))
                invar_loaded = True
            nodes_dict_rdd = nodes_dict_rdd\
                .map(lambda nd: (nd[GEOCODE], nd[SYN]))\
                .join(invar_rdd)\
                .map(lambda g_sk: {GEOCODE: g_sk[0], SYN: g_sk[1][0], INVAR: g_sk[1][1]})

    # print(nodes_dict_rdd.count())
    # from rdd_like_list import RDDLikeList
    # nodes_dict_rdd = RDDLikeList(nodes_dict_rdd.take(10))

    if experiment.type is PERSON:
        print('Using Person Writer')
        w = NonConvertingMDF2020PersonWriter(config=config, setup=setup_data, name='writer', das=das_stub)
    else:
        print('Using Household Writer')
        w = NonConvertingMDF2020HouseholdWriter(config=config, setup=setup_data, name='writer', das=das_stub)

    print('Writing')
    """

    # calls programs.writer.write() which takes an engine_tuple
    # engine_tuple is (blocknoderdd, feas_dict)
    # w.write((nodes_dict_rdd, None))
    # For testing, just take the first record and print it
    record = nodes_dict_rdd.take(1)
    print("record:", record)