def clean_Twitter_csv_state(data_path, just_state_identified_accounts=False):
    """Clean state-level Twitter low-credibility data.

    Parameters
    ----------
    data_path : str
        Path to the Twitter CSV file.
    just_state_identified_accounts : bool
        If True, keep only rows with no county match (county is NaN or the
        literal string 'None'), i.e. accounts identified at state level only.

    Returns
    -------
    pd.DataFrame
        Long-format frame with columns FIPS, State, County, variable, value.
    """
    twitter_data = pd.read_csv(data_path)
    if just_state_identified_accounts:
        # BUG FIX: the original filtered on an undefined name `df`; the
        # loaded frame is `twitter_data`.
        state_only = twitter_data[(twitter_data.county.isna())
                                  | (twitter_data.county == 'None')]
        state_data = state_only.groupby('state').apply(
            lambda x: get_summary_stats(x))
    else:
        state_data = twitter_data.groupby('state').apply(
            lambda x: get_summary_stats(x))
    state_data = state_data.reset_index()
    state_data = state_data.rename(columns={
        'county': 'County',
        'state': 'State'
    })
    fips_map = Geo().get_state_to_fips_map()
    data = pd.merge(state_data, fips_map, on='State')
    # State-level rows carry an empty County field in the standard output.
    data['County'] = ''
    # Standard long format: ["FIPS", "State", "County", "variable", "value"]
    data = data.melt(id_vars=["FIPS", "State", "County"],
                     value_vars=[
                         'No. accounts',
                         'No. tweets',
                         'Mean % low-credibility',
                         'Stderr % low-credibility',
                         'Min % low-credibility',
                         'Max % low-credibility',
                     ])
    return data
def clean_Twitter_csv(data_path):
    """Clean county-level Twitter low-credibility data.

    Sweeps a grid of (account, tweet) count thresholds, computes the
    thresholded county aggregates for each combination, attaches FIPS codes,
    and stacks everything into one long-format frame whose variable names
    carry the threshold combination as a prefix.

    Parameters
    ----------
    data_path : str
        Path to the Twitter CSV file.

    Returns
    -------
    pd.DataFrame
        Concatenation of the per-threshold long-format frames.
    """
    twitter_data = pd.read_csv(data_path)
    # Normalize county spellings so they match the FIPS lookup table.
    twitter_data = twitter_data.replace('St. Tammany Parish',
                                        'St Tammany Parish')
    twitter_data = twitter_data.replace('St. Joseph County',
                                        'St Joseph County')
    # Loop-invariant work hoisted out of the 4x6 threshold grid below:
    # the FIPS map and the list of data columns never change per iteration.
    fips_map = Geo().get_county_state_to_fips_map(unique_fips=False)
    # These are the data columns from the Twitter data file; we add the
    # threshold suffix at the front of each of these for the output so the
    # different threshold runs stay distinguishable.
    columns = ('Mean % low-credibility,Stderr % low-credibility,'
               'Min % low-credibility,Max % low-credibility,'
               'No. accounts,No. tweets').split(',')
    results_data_list = []
    for no_accounts in [1, 10, 50, 100]:
        for no_tweets in [1, 10, 50, 100, 200, 500]:
            suffix = f'{no_accounts}_accounts_{no_tweets}_tweets '
            thresholded_accounts = threshold_accounts(twitter_data,
                                                      no_accounts, no_tweets)
            if len(thresholded_accounts) < 1:
                continue
            thresholded_accounts = thresholded_accounts.rename(columns={
                'county': 'County',
                'state': 'State'
            })
            twitter_data_with_fips = pd.merge(thresholded_accounts,
                                              fips_map,
                                              on=['County', 'State'],
                                              how='left')
            # Report counties the FIPS lookup could not resolve; they are
            # dropped from the output below.
            missing = twitter_data_with_fips[
                twitter_data_with_fips.fips_code.isna()][['County', 'State']]
            if len(missing) > 0:
                print("For ", suffix)
                print("Missing the following counties' fips codes")
                print(missing)
            output_columns = {c: suffix + c for c in columns}
            twitter_data_to_output = twitter_data_with_fips.rename(
                columns=output_columns)
            data = twitter_data_to_output.rename(columns={'fips_code': 'FIPS'})
            data = data.melt(id_vars=["FIPS", "State", "County"],
                             value_vars=list(output_columns.values()))
            data = data[~data.FIPS.isna()]
            results_data_list.append(data)
    return pd.concat(results_data_list, sort=False)
def clean_OWID_vaccine_uptake_csv(config, early=False):
    """Clean vaccine uptake data.

    Averages the OWID per-state vaccination metrics over a configured date
    window and returns them in the standard long format.

    Parameters
    ----------
    config : mapping
        Parsed config with PATHS, FILES and DATES sections.
    early : bool
        If True, use the UPTAKE_EARLY_* date window and prefix the output
        variable names with 'early_'.

    Returns
    -------
    pd.DataFrame
        Long-format frame with columns FIPS, State, County, variable, value.
    """
    data_path = os.path.join(config["PATHS"]["STATE_DATA_DIR"],
                             config["FILES"]["OWID_DATA_FILE"])
    cols = [
        "daily_vaccinations_per_million", "people_vaccinated_per_hundred",
        "people_fully_vaccinated_per_hundred", "share_doses_used"
    ]
    # BUG FIX: the original passed dtype={...}.update({...}) — dict.update()
    # returns None, so read_csv silently received dtype=None and the explicit
    # dtypes were discarded. Build the dict first, then pass it.
    dtypes = {"location": str, "date": str}
    dtypes.update({c: float for c in cols})
    data = pd.read_csv(data_path,
                       usecols=["date", "location"] + cols,
                       dtype=dtypes)
    data['DateTime'] = pd.to_datetime(data['date'])
    start_date = "UPTAKE_START"
    end_date = "UPTAKE_END"
    if early:
        start_date = "UPTAKE_EARLY_START"
        end_date = "UPTAKE_EARLY_END"
    start = pd.to_datetime(config["DATES"][start_date])
    end = pd.to_datetime(config["DATES"][end_date])
    time_period = data[(data.DateTime >= start)
                       & (data.DateTime <= end)].copy()
    # Mean of each metric per state over the selected window.
    grouped_data = time_period.groupby(
        time_period.location).mean().reset_index()
    # Fix NY bug in OWID data
    grouped_data.loc[grouped_data.location == 'New York State',
                     'location'] = 'New York'
    merged_data = pd.merge(grouped_data,
                           Geo().get_state_to_fips_map(),
                           left_on='location',
                           right_on='State')
    # State-level rows carry an empty County field in the standard output.
    merged_data['County'] = ''
    if early:
        old_cols = cols
        merged_data = merged_data.rename(
            columns={col: 'early_' + col for col in old_cols}).copy()
        cols = ['early_' + col for col in old_cols]
    data = merged_data.melt(id_vars=["FIPS", "State", "County"],
                            value_vars=cols)
    return data
def clean_FB_survey_csv(data_path, state_level=False):
    """Clean facebook survey county-level data.

    Aggregates the smoothed "vaccinated or would accept" survey signal over a
    configured date window, computes a sample-size-weighted mean and standard
    error per geography, attaches FIPS codes, and returns the standard long
    format ["FIPS", "State", "County", "variable", "value"].

    NOTE(review): this reads a module-level `config` global (unlike
    clean_OWID_vaccine_uptake_csv, which takes config as a parameter) —
    verify `config` is defined before this is called.

    Parameters
    ----------
    data_path : str
        Path to the FB survey CSV (geo_value, time_value, value, stderr,
        sample_size columns).
    state_level : bool
        If True, `geo_value` holds lowercase state abbreviations; otherwise
        it holds county FIPS codes.
    """
    data = pd.read_csv(
        data_path,
        usecols=["geo_value", "time_value", "value", "stderr", "sample_size"],
        dtype={
            "geo_value": str,
            "time_value": str,
            "value": float,
            "stderr": float,
            "sample_size": float  # This needs to be set as a float or the data doesn't load
        })
    data['DateTime'] = pd.to_datetime(data['time_value'])
    # Restrict to the configured refusal-survey date window.
    start_date = "REFUSAL_START"
    end_date = "REFUSAL_END"
    start = pd.to_datetime(config["DATES"][start_date])
    end = pd.to_datetime(config["DATES"][end_date])
    data = data[(data.DateTime >= start) & (data.DateTime <= end)].copy()
    # `value` is a percentage, so num_accept is the estimated count of
    # accepting respondents per row.
    data['num_accept'] = data.sample_size * (data.value / 100.0)
    aggregate = data.groupby('geo_value', as_index=False).sum()
    # Calculate the new variables for the aggregates.
    # NOTE(review): this mean is a fraction (num_accept / sample_size) even
    # though the input `value` was a percentage — presumably intentional;
    # confirm against downstream consumers.
    aggregate[
        'mean_smoothed_covid_vaccinated_or_accept'] = aggregate.num_accept / aggregate.sample_size
    aggregate['stderr_smoothed_covid_vaccinated_or_accept'] = aggregate.apply(
        lambda x: get_stderr(x['num_accept'], x['sample_size']), axis=1)
    if not state_level:
        # County rows: geo_value is the county FIPS code.
        fips_map = Geo().get_county_state_to_fips_map(unique_fips=True)
        data = pd.merge(aggregate,
                        fips_map,
                        left_on='geo_value',
                        right_on='fips_code')
    else:
        # State rows: geo_value is the lowercase state abbreviation; County
        # stays empty in the standard output.
        state_abbr = Geo().get_state_to_fips_map()
        data = pd.merge(aggregate,
                        state_abbr,
                        left_on='geo_value',
                        right_on='abbr_lower')
        data['County'] = ''
    data = data.rename(
        columns={
            'fips_code': 'FIPS',
            'num_accept': 'num_smoothed_covid_vaccinated_or_accept',
            'sample_size': 'sample_size_for_covid_vaccinated_or_accept_question'
        })
    # This will help to create a standard set of columns, which will be:
    # ["FIPS","State", "County", "variable", "value"]
    data = data.melt(id_vars=["FIPS", "State", "County"],
                     value_vars=[
                         "mean_smoothed_covid_vaccinated_or_accept",
                         "stderr_smoothed_covid_vaccinated_or_accept",
                         "num_smoothed_covid_vaccinated_or_accept",
                         "sample_size_for_covid_vaccinated_or_accept_question"
                     ])
    return data
# Guard: the relative data paths below assume the current working directory
# is the project's `src` directory.
# BUG FIX: the original raised unconditionally, which made every statement
# after it unreachable; check the actual working directory instead.
if os.path.basename(os.getcwd()) != 'src':
    raise Exception(
        "CHANGE CURRENT WORKING DIRECTORY TO THE `src` PATH BEFORE RUNNING!!"
    )

# Load config_file_path from commandline input
args = parse_cl_args()
config_file_path = args.config_file
state_level = args.state_level
keywords_filter = args.keywords_filter

# Get config file object
config = parse_config_file(config_file_path)

# Intialize the Geo class and load lookup tables/dicts
g = Geo()
fip_lookup = g.load_fip_code_lookup()
state_lookup = g.load_state_abbrv_lookup(as_dict=True)

# Get base dir for county-level data and set data file paths
county_data_dir = config["PATHS"]["COUNTY_DATA_DIR"]
state_data_dir = config["PATHS"]["STATE_DATA_DIR"]
covid_data_dir = config["PATHS"]["COVID_DATA_DIR"]
intermediate_data_dir = config["PATHS"]["INTERMEDIATE_DATA_DIR"]
people_file_path = os.path.join(county_data_dir,
                                config["FILES"]["COUNTY_PEOPLE"])
income_file_path = os.path.join(county_data_dir,
                                config["FILES"]["COUNTY_INCOME"])
gini_file_path = os.path.join(county_data_dir, config["FILES"]["COUNTY_GINI"])
def __init__(self):
    """Initialize the POI helper: geocoder, HTTP session and logger."""
    # The three members are independent; order of creation does not matter.
    self._geo = Geo()
    self._session = requests.session()
    self._logger = logging.getLogger('POI')
class POI(object):
    """Scrapes points of interest (name, address, coordinates) from
    poznan.pl category pages, handling both the old and new page layouts."""

    def __init__(self):
        self._logger = logging.getLogger('POI')
        self._session = requests.session()
        self._geo = Geo()

    def _fetch_and_parse(self, url):
        """GET *url* and return the parsed lxml HTML tree.

        Raises Exception on any non-200 response.
        """
        resp = self._session.get(url)
        if resp.status_code != 200:
            # BUG FIX: the original passed logging-style %-args straight to
            # Exception, so the message was never interpolated; format it.
            raise Exception("HTTP request <%s> returned status code %d" %
                            (url, resp.status_code))
        return html.fromstring(resp.text.encode("utf8"))

    def get_points(self, category_name, url):
        """Return a list of {name, address, lat, lon} dicts for the page at
        *url*; lat/lon are False when the address could not be geocoded.

        Raises Exception when the page matches neither known layout.
        """
        self._logger.info('Category: {}'.format(category_name))
        tree = self._fetch_and_parse(url)
        # Dispatch on which page layout the markup matches.
        if tree.xpath('//div[@class="Paragraph"]//li/p//a'):
            res = self._get_points_from_old_tree(tree)
        elif tree.xpath('//article[@class="object"]'):
            res = self._get_points_from_new_tree(tree)
        else:
            raise Exception('Unknown POI page format: <{}>'.format(url))
        points = []
        self._logger.info('Points: {}'.format(len(res)))
        for name, address in res:
            name = name.text.strip()
            address = address.text.strip()
            # Street is everything before the first comma, '(' or '-'.
            # (raw string: '\(' is an invalid escape in a plain str literal)
            street = re.split(r'[,\(-]', address)[0].strip()
            # no address, or a place outside Poznań
            if address == '' or ('Pozna' not in address
                                 and "\n" in address):
                self._logger.info("Skipping! - %s: %s", name, address)
                continue
            self._logger.debug('%s - %s', name, street)
            pos = self._geo.query(street + u', Poznań')
            points.append({
                "name": name,
                "address": street,
                "lat": pos['lat'] if pos is not None else False,
                "lon": pos['lon'] if pos is not None else False,
            })
        return points

    @staticmethod
    def _get_points_from_old_tree(tree):
        """
        @see http://www.poznan.pl/mim/inwestycje/biurowce,poi,4661/
        [old format]
        """
        names = tree.xpath('//div[@class="Paragraph"]//li/p//a')
        addresses = tree.xpath('//div[@class="Paragraph"]//li/p[2]')
        return zip(names, addresses)

    @staticmethod
    def _get_points_from_new_tree(tree):
        """
        @see http://www.poznan.pl/mim/osiedla/muzea-w-poznaniu,poi,202,12/
        [new format]
        """
        names = tree.xpath('//article[contains(@class, "object")]//h2')
        addresses = tree.xpath('//article[contains(@class, "object")]//p[1]')
        return zip(names, addresses)