def clean_Twitter_csv(data_path):
    twitter_data = pd.read_csv(data_path)
    # Normalize county spellings so they match the FIPS lookup table
    twitter_data = twitter_data.replace('St. Tammany Parish',
                                        'St Tammany Parish')
    twitter_data = twitter_data.replace('St. Joseph County',
                                        'St Joseph County')
    results_data_list = []
    for no_accounts in [1, 10, 50, 100]:
        for no_tweets in [1, 10, 50, 100, 200, 500]:
            suffix = f'{no_accounts}_accounts_{no_tweets}_tweets '
            thresholded_accounts = threshold_accounts(twitter_data,
                                                      no_accounts, no_tweets)
            if len(thresholded_accounts) < 1:
                continue
            thresholded_accounts = thresholded_accounts.rename(columns={
                'county': 'County',
                'state': 'State'
            })
            fips_map = Geo().get_county_state_to_fips_map(unique_fips=False)
            twitter_data_with_fips = pd.merge(thresholded_accounts,
                                              fips_map,
                                              on=['County', 'State'],
                                              how='left')
            missing = twitter_data_with_fips[
                twitter_data_with_fips.fips_code.isna()][['County', 'State']]
            if len(missing) > 0:
                print("For ", suffix)
                print("Missing the following counties' fips codes")
                print(missing)
            # These are the data columns from the Twitter data file. We prepend
            # the threshold prefix (stored in `suffix`) to each of them, so the
            # output column name records the account/tweet thresholds used.
            columns = ('Mean % low-credibility,Stderr % low-credibility,'
                       'Min % low-credibility,Max % low-credibility,'
                       'No. accounts,No. tweets').split(',')
            output_columns = {c: suffix + c for c in columns}
            twitter_data_to_output = twitter_data_with_fips.rename(
                columns=output_columns)
            data = twitter_data_to_output.rename(columns={'fips_code': 'FIPS'})
            data = data.melt(id_vars=["FIPS", "State", "County"],
                             value_vars=list(output_columns.values()))
            data = data[~data.FIPS.isna()]
            results_data_list += [data]
    return pd.concat(results_data_list, sort=False)
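# `threshold_accounts` is defined elsewhere in this codebase. A minimal sketch
# of the behavior implied by the caller above -- keep accounts with at least
# `no_tweets` tweets, keep counties with at least `no_accounts` such accounts,
# and summarize each county's low-credibility share. This is an assumption,
# not the project's actual helper, and the raw column names (`county`,
# `state`, `no_tweets`, `perc_low_credibility`) are hypothetical.
def _threshold_accounts_sketch(twitter_data, no_accounts, no_tweets):
    """Hypothetical stand-in for threshold_accounts(); illustrative only."""
    active = twitter_data[twitter_data['no_tweets'] >= no_tweets]
    grouped = active.groupby(['county', 'state'])
    summary = grouped['perc_low_credibility'].agg(['mean', 'sem', 'min', 'max'])
    summary = summary.rename(
        columns={
            'mean': 'Mean % low-credibility',
            'sem': 'Stderr % low-credibility',
            'min': 'Min % low-credibility',
            'max': 'Max % low-credibility'
        })
    summary['No. accounts'] = grouped.size()
    summary['No. tweets'] = grouped['no_tweets'].sum()
    summary = summary[summary['No. accounts'] >= no_accounts]
    return summary.reset_index()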
def clean_Twitter_csv_state(data_path, just_state_identified_accounts=False):
    twitter_data = pd.read_csv(data_path)
    if just_state_identified_accounts:
        # Keep only accounts geolocated to a state but not to a county.
        # (The original referenced an undefined `df` here; it should be
        # `twitter_data`.)
        state_data = twitter_data[(twitter_data.county.isna()) |
                                  (twitter_data.county == 'None')].groupby(
                                      'state').apply(
                                          lambda x: get_summary_stats(x))
    else:
        state_data = twitter_data.groupby('state').apply(
            lambda x: get_summary_stats(x))
    state_data = state_data.reset_index()
    state_data = state_data.rename(columns={
        'county': 'County',
        'state': 'State'
    })
    fips_map = Geo().get_state_to_fips_map()
    data = pd.merge(state_data, fips_map, on='State')
    data['County'] = ''
    data = data.melt(id_vars=["FIPS", "State", "County"],
                     value_vars=[
                         'No. accounts',
                         'No. tweets',
                         'Mean % low-credibility',
                         'Stderr % low-credibility',
                         'Min % low-credibility',
                         'Max % low-credibility',
                     ])
    return data
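# `get_summary_stats` is another helper defined elsewhere; judging from the
# value_vars melted above, it reduces one state's accounts to a single summary
# row. A minimal sketch under that assumption (the raw column names
# `perc_low_credibility` and `no_tweets` are hypothetical):
def _get_summary_stats_sketch(group):
    """Hypothetical stand-in for get_summary_stats(); illustrative only."""
    perc = group['perc_low_credibility']
    return pd.Series({
        'No. accounts': len(group),
        'No. tweets': group['no_tweets'].sum(),
        'Mean % low-credibility': perc.mean(),
        'Stderr % low-credibility': perc.sem(),
        'Min % low-credibility': perc.min(),
        'Max % low-credibility': perc.max(),
    })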
def clean_OWID_vaccine_uptake_csv(config, early=False):
    """Clean OWID state-level vaccine uptake data."""
    data_path = os.path.join(config["PATHS"]["STATE_DATA_DIR"],
                             config["FILES"]["OWID_DATA_FILE"])
    cols = [
        "daily_vaccinations_per_million", "people_vaccinated_per_hundred",
        "people_fully_vaccinated_per_hundred", "share_doses_used"
    ]
    # Build the dtype mapping up front; dict.update() returns None, so it
    # cannot be chained inline inside the read_csv() call.
    dtypes = {"location": str, "date": str}
    dtypes.update({c: float for c in cols})
    data = pd.read_csv(data_path,
                       usecols=["date", "location"] + cols,
                       dtype=dtypes)
    data['DateTime'] = pd.to_datetime(data['date'])
    start_date = "UPTAKE_START"
    end_date = "UPTAKE_END"
    if early:
        start_date = "UPTAKE_EARLY_START"
        end_date = "UPTAKE_EARLY_END"
    start = pd.to_datetime(config["DATES"][start_date])
    end = pd.to_datetime(config["DATES"][end_date])
    time_period = data[(data.DateTime >= start)
                       & (data.DateTime <= end)].copy()
    # numeric_only=True keeps the mean from failing on the date columns
    grouped_data = time_period.groupby(
        time_period.location).mean(numeric_only=True).reset_index()
    # Fix NY bug in OWID data
    grouped_data.loc[grouped_data.location == 'New York State',
                     'location'] = 'New York'
    merged_data = pd.merge(grouped_data,
                           Geo().get_state_to_fips_map(),
                           left_on='location',
                           right_on='State')
    merged_data['County'] = ''
    if early:
        old_cols = cols
        merged_data = merged_data.rename(
            columns={col: 'early_' + col for col in old_cols}).copy()
        cols = ['early_' + col for col in old_cols]
    data = merged_data.melt(id_vars=["FIPS", "State", "County"],
                            value_vars=cols)
    return data
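# The date-window keys read above come straight from the config file. A sketch
# of the [DATES] section this function expects (the key names are taken from
# the code above; the values are illustrative placeholders, not the study's
# actual dates):
#
#   [DATES]
#   UPTAKE_START = 2021-04-01
#   UPTAKE_END = 2021-05-01
#   UPTAKE_EARLY_START = 2021-01-01
#   UPTAKE_EARLY_END = 2021-02-01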
def clean_FB_survey_csv(data_path, state_level=False):
    """Clean Facebook survey vaccine-acceptance data (county- or state-level).

    Relies on the module-level `config` object loaded in the script entry
    point below.
    """
    data = pd.read_csv(
        data_path,
        usecols=["geo_value", "time_value", "value", "stderr", "sample_size"],
        dtype={
            "geo_value": str,
            "time_value": str,
            "value": float,
            "stderr": float,
            "sample_size": float  # Must be float or the data doesn't load
        })
    data['DateTime'] = pd.to_datetime(data['time_value'])
    start_date = "REFUSAL_START"
    end_date = "REFUSAL_END"
    start = pd.to_datetime(config["DATES"][start_date])
    end = pd.to_datetime(config["DATES"][end_date])
    data = data[(data.DateTime >= start) & (data.DateTime <= end)].copy()
    data['num_accept'] = data.sample_size * (data.value / 100.0)
    # Sum only the numeric survey columns; summing the full frame would fail
    # on the string/date columns.
    aggregate = data.groupby('geo_value', as_index=False)[[
        'num_accept', 'sample_size'
    ]].sum()
    # Calculate the new variables for the aggregates
    aggregate['mean_smoothed_covid_vaccinated_or_accept'] = (
        aggregate.num_accept / aggregate.sample_size)
    aggregate['stderr_smoothed_covid_vaccinated_or_accept'] = aggregate.apply(
        lambda x: get_stderr(x['num_accept'], x['sample_size']), axis=1)
    if not state_level:
        fips_map = Geo().get_county_state_to_fips_map(unique_fips=True)
        data = pd.merge(aggregate,
                        fips_map,
                        left_on='geo_value',
                        right_on='fips_code')
    else:
        state_abbr = Geo().get_state_to_fips_map()
        data = pd.merge(aggregate,
                        state_abbr,
                        left_on='geo_value',
                        right_on='abbr_lower')
        data['County'] = ''
    data = data.rename(
        columns={
            'fips_code': 'FIPS',
            'num_accept': 'num_smoothed_covid_vaccinated_or_accept',
            'sample_size': 'sample_size_for_covid_vaccinated_or_accept_question'
        })
    # This gives every cleaner a standard output shape:
    # ["FIPS", "State", "County", "variable", "value"]
    data = data.melt(id_vars=["FIPS", "State", "County"],
                     value_vars=[
                         "mean_smoothed_covid_vaccinated_or_accept",
                         "stderr_smoothed_covid_vaccinated_or_accept",
                         "num_smoothed_covid_vaccinated_or_accept",
                         "sample_size_for_covid_vaccinated_or_accept_question"
                     ])
    return data
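# `get_stderr` is defined elsewhere. For a proportion estimated from
# `num_accept` acceptances out of `sample_size` responses, the standard
# binomial standard error is sqrt(p * (1 - p) / n); a minimal sketch under
# the assumption that the project's helper computes exactly that:
import math


def _get_stderr_sketch(num_accept, sample_size):
    """Hypothetical stand-in for get_stderr(); binomial stderr of a proportion."""
    p = num_accept / sample_size
    return math.sqrt(p * (1.0 - p) / sample_size)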
# This script must be run from the repository's `src` directory; the original
# raise was clearly guarded by a working-directory check, reconstructed here.
if os.path.basename(os.getcwd()) != 'src':
    raise Exception(
        "CHANGE CURRENT WORKING DIRECTORY TO THE `src` PATH BEFORE RUNNING!!")

# Load config_file_path from commandline input
args = parse_cl_args()
config_file_path = args.config_file
state_level = args.state_level
keywords_filter = args.keywords_filter

# Get config file object
config = parse_config_file(config_file_path)

# Initialize the Geo class and load lookup tables/dicts
g = Geo()
fip_lookup = g.load_fip_code_lookup()
state_lookup = g.load_state_abbrv_lookup(as_dict=True)

# Get base dirs for county/state-level data and set data file paths
county_data_dir = config["PATHS"]["COUNTY_DATA_DIR"]
state_data_dir = config["PATHS"]["STATE_DATA_DIR"]
covid_data_dir = config["PATHS"]["COVID_DATA_DIR"]
intermediate_data_dir = config["PATHS"]["INTERMEDIATE_DATA_DIR"]

people_file_path = os.path.join(county_data_dir,
                                config["FILES"]["COUNTY_PEOPLE"])
income_file_path = os.path.join(county_data_dir,
                                config["FILES"]["COUNTY_INCOME"])
gini_file_path = os.path.join(county_data_dir,
                              config["FILES"]["COUNTY_GINI"])
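# A sketch of the config sections the script reads above (section and key
# names are taken from the code; the values are illustrative placeholders):
#
#   [PATHS]
#   COUNTY_DATA_DIR = ../data/county
#   STATE_DATA_DIR = ../data/state
#   COVID_DATA_DIR = ../data/covid
#   INTERMEDIATE_DATA_DIR = ../data/intermediate
#
#   [FILES]
#   COUNTY_PEOPLE = people.csv
#   COUNTY_INCOME = income.csv
#   COUNTY_GINI = gini.csv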
    def __init__(self):
        """Set up logging, a shared HTTP session, and geographic lookups."""
        self._logger = logging.getLogger('POI')
        self._session = requests.Session()
        self._geo = Geo()