Ejemplo n.º 1
0
    def testAcsMetadata(self):
        """Checks that ACS metadata is parsed for the requested groups and
           that the variables of a group can be looked up from the result."""
        var_map = census.parse_acs_metadata(
            self._fake_metadata, ["B02001", "B01001"])

        # Spot-check one variable label from each requested group.
        expected_labels = (
            ("B01001_011E", "Estimate!!Total:!!Male:!!25 to 29 years"),
            ("B02001_008E", "Estimate!!Total:!!Two or more races:"),
        )
        for var_id, label in expected_labels:
            self.assertEqual(label, var_map[var_id]["label"])

        # This variable's group was not in the list of groups to include,
        # so it must be absent from the parsed map.
        self.assertIsNone(var_map.get("B01001B_029E"))

        expected_sex_by_age = {
            "B01001_011E": ["Male", "25 to 29 years"],
            "B01001_012E": ["Male", "30 to 34 years"],
            "B01001_041E": ["Female", "55 to 59 years"],
            "B01001_042E": ["Female", "60 and 61 years"]
        }
        sex_by_age_vars = census.get_vars_for_group("SEX BY AGE", var_map, 2)
        self.assertDictEqual(expected_sex_by_age, sex_by_age_vars)

        expected_race = {
            "B02001_005E": ["Asian alone"],
            "B02001_007E": ["Some other race alone"],
            "B02001_008E": ["Two or more races"]
        }
        race_vars = census.get_vars_for_group("RACE", var_map, 1)
        self.assertDictEqual(expected_race, race_vars)
    def get_sex_by_age_and_race(self, var_map, sex_by_age_frames):
        """Builds a single DataFrame of population by sex, age and race.

           Each concept in SEX_BY_AGE_CONCEPTS_TO_RACE is standardized, tagged
           with its race category id, and the pieces are concatenated. Total
           rows are then added for age and for sex before the result is
           sorted.

           var_map: ACS metadata variable map, as returned by
                    `parse_acs_metadata`
           sex_by_age_frames: Map of concept to non-standardized DataFrame for
                              that concept."""
        per_race_frames = []
        for concept, race_id in SEX_BY_AGE_CONCEPTS_TO_RACE.items():
            raw_frame = sex_by_age_frames[concept]
            concept_vars = get_vars_for_group(concept, var_map, 2)
            standardized = standardize_frame(
                raw_frame, concept_vars, [SEX_COL, AGE_COL],
                self.county_level, POPULATION_COL)

            # TODO reorder columns so population is last
            standardized[RACE_CATEGORY_ID_COL] = race_id
            per_race_frames.append(standardized)

        combined = pandas.concat(per_race_frames)
        combined[AGE_COL] = combined[AGE_COL].apply(rename_age_bracket)

        # Totals are added over age first and then over sex; the order is
        # kept because the second pass also sees the rows added by the first.
        for breakdown_col in (AGE_COL, SEX_COL):
            combined = add_sum_of_rows(
                combined, breakdown_col, POPULATION_COL, TOTAL_VALUE)

        # Return value is not used; presumably this adds the race columns to
        # `combined` in place -- confirm against the helper.
        add_race_columns_from_category_id(combined)
        return self.sort_sex_age_race_frame(combined)
    def write_local_files_debug(self):
        """Fetches the ACS tables and dumps them to the local file system,
           one csv and one json file per table. Intended purely as a
           debugging convenience; do not use in production."""
        var_map = parse_acs_metadata(
            fetch_acs_metadata(self.base_acs_url), list(GROUPS.keys()))

        # Fetch everything first: the hispanic-by-race group, then one
        # sex-by-age group per concept.
        hispanic_race_json = fetch_acs_group(
            self.base_acs_url, HISPANIC_BY_RACE_CONCEPT, var_map, 2,
            self.county_level)
        sex_by_age_frames = {
            concept: update_col_types(
                gcs_to_bq_util.values_json_to_dataframe(
                    fetch_acs_group(self.base_acs_url, concept, var_map, 2,
                                    self.county_level)))
            for concept in SEX_BY_AGE_CONCEPTS_TO_RACE
        }

        by_hispanic_and_race = update_col_types(
            gcs_to_bq_util.values_json_to_dataframe(hispanic_race_json))
        by_hispanic_and_race = standardize_frame(
            by_hispanic_and_race,
            get_vars_for_group(HISPANIC_BY_RACE_CONCEPT, var_map, 2),
            [HISPANIC_COL, RACE_COL], self.county_level, POPULATION_COL)

        # NOTE(review): write_to_bq passes a second (total population) frame
        # to get_all_races_frame; confirm the one-argument call here is still
        # valid for the current signature.
        race_table = self.get_table_name_by_race()
        race_frame = self.get_all_races_frame(by_hispanic_and_race)
        sex_age_race_table = self.get_table_name_by_sex_age_race()
        sex_age_race_frame = self.get_sex_by_age_and_race(
            var_map, sex_by_age_frames)
        tables = {
            race_table: race_frame,
            sex_age_race_table: sex_age_race_frame,
        }

        for table_name, frame in tables.items():
            file_stem = "table_" + table_name
            frame.to_csv(file_stem + ".csv", index=False)
            frame.to_json(file_stem + ".json", orient="records")
Ejemplo n.º 4
0
    def upload_to_gcs(self, gcs_bucket):
        """Uploads census population data to the given GCS bucket.

        One file is uploaded per sex-by-age concept, one for the
        hispanic-by-race concept, and one for the total population variable.

        gcs_bucket: The GCS bucket to upload the files to

        Returns the first truthy result reported by url_file_to_gcs, or the
        last falsy one if none was truthy (presumably signalling whether any
        uploaded file differed from what was already stored)."""
        var_map = parse_acs_metadata(
            fetch_acs_metadata(self.base_acs_url), list(GROUPS.keys()))

        concepts = [*SEX_BY_AGE_CONCEPTS_TO_RACE.keys(),
                    HISPANIC_BY_RACE_CONCEPT]

        any_changed = False
        for concept in concepts:
            variable_ids = list(get_vars_for_group(concept, var_map, 2).keys())
            params = get_census_params(variable_ids, self.county_level)
            concept_changed = url_file_to_gcs.url_file_to_gcs(
                self.base_acs_url, params, gcs_bucket,
                self.get_filename(concept))
            # Every upload is attempted; only the accumulated flag is
            # short-circuited once something has already been reported.
            if not any_changed:
                any_changed = concept_changed

        total_params = get_census_params(
            [TOTAL_POP_VARIABLE_ID], self.county_level)
        total_changed = url_file_to_gcs.url_file_to_gcs(
            self.base_acs_url, total_params, gcs_bucket,
            self.add_filename_suffix(TOTAL_POP_VARIABLE_ID))
        if not any_changed:
            any_changed = total_changed
        return any_changed
Ejemplo n.º 5
0
    def testStandarizeFrameTwoDims(self):
        """Checks standardizing an ACS DataFrame with two breakdown columns
           (sex and age)"""
        var_map = census.parse_acs_metadata(
            self._fake_metadata, ["B02001", "B01001"])
        sex_by_age_vars = census.get_vars_for_group("SEX BY AGE", var_map, 2)

        raw_json = json.dumps(self._fake_sex_by_age_data)
        raw_frame = gcs_to_bq_util.values_json_to_dataframe(raw_json)
        actual = census.standardize_frame(
            raw_frame, sex_by_age_vars, ["sex", "age"], False, "population")

        expected_json = json.dumps(self._expected_sex_by_age_data)
        expected = gcs_to_bq_util.values_json_to_dataframe(expected_json)
        # The expected frame's index is reset before the comparison.
        expected = expected.reset_index(drop=True)
        assert_frame_equal(expected, actual)
Ejemplo n.º 6
0
    def write_to_bq(self, dataset, gcs_bucket):
        """Reads the population files from the given GCS bucket, builds the
        by-race and by-sex/age/race tables, and writes them to BigQuery.

        dataset: The BigQuery dataset to write to
        gcs_bucket: The name of the gcs bucket to read the data from"""
        # TODO change this to have it read metadata from GCS bucket
        var_map = parse_acs_metadata(
            fetch_acs_metadata(self.base_acs_url), list(GROUPS.keys()))

        by_hispanic_and_race = update_col_types(
            gcs_to_bq_util.load_values_as_dataframe(
                gcs_bucket, self.get_filename(HISPANIC_BY_RACE_CONCEPT)))
        by_hispanic_and_race = standardize_frame(
            by_hispanic_and_race,
            get_vars_for_group(HISPANIC_BY_RACE_CONCEPT, var_map, 2),
            [HISPANIC_COL, RACE_COL], self.county_level, POPULATION_COL)

        # The total population file holds a single variable, so its variable
        # map is spelled out here instead of being read from the metadata.
        total_population = update_col_types(
            gcs_to_bq_util.load_values_as_dataframe(
                gcs_bucket, self.add_filename_suffix(TOTAL_POP_VARIABLE_ID)))
        total_population = standardize_frame(
            total_population, {TOTAL_POP_VARIABLE_ID: ['Total']},
            [RACE_OR_HISPANIC_COL], self.county_level, POPULATION_COL)

        sex_by_age_frames = {
            concept: update_col_types(
                gcs_to_bq_util.load_values_as_dataframe(
                    gcs_bucket, self.get_filename(concept)))
            for concept in SEX_BY_AGE_CONCEPTS_TO_RACE
        }

        race_table = self.get_table_name_by_race()
        race_frame = self.get_all_races_frame(
            by_hispanic_and_race, total_population)
        sex_age_race_table = self.get_table_name_by_sex_age_race()
        sex_age_race_frame = self.get_sex_by_age_and_race(
            var_map, sex_by_age_frames)
        tables = {
            race_table: race_frame,
            sex_age_race_table: sex_age_race_frame,
        }

        for table_name, frame in tables.items():
            # All breakdown columns are strings; the population column is
            # then overridden to be an integer.
            column_types = dict.fromkeys(frame.columns, 'STRING')
            column_types[POPULATION_COL] = 'INT64'
            gcs_to_bq_util.add_dataframe_to_bq(
                frame, dataset, table_name, column_types=column_types)