def clean_Twitter_csv_state(data_path, just_state_identified_accounts=False):
    """Aggregate the Twitter low-credibility data to state level and melt it
    into the standard ["FIPS", "State", "County", "variable", "value"] format."""

    twitter_data = pd.read_csv(data_path)

    if just_state_identified_accounts:
        # Keep only accounts that could not be resolved to a county
        state_data = twitter_data[(twitter_data.county.isna()) | (
            twitter_data.county == 'None')].groupby('state').apply(
                lambda x: get_summary_stats(x))
    else:
        state_data = twitter_data.groupby('state').apply(
            lambda x: get_summary_stats(x))

    state_data = state_data.reset_index()
    state_data = state_data.rename(columns={
        'county': 'County',
        'state': 'State'
    })

    fips_map = Geo().get_state_to_fips_map()
    data = pd.merge(state_data, fips_map, on='State')
    data['County'] = ''

    data = data.melt(id_vars=["FIPS", "State", "County"],
                     value_vars=[
                         'No. accounts',
                         'No. tweets',
                         'Mean % low-credibility',
                         'Stderr % low-credibility',
                         'Min % low-credibility',
                         'Max % low-credibility',
                     ])
    return data
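

# Note (illustrative, not part of the original file): get_summary_stats is a
# project helper used above but not defined in this snippet. Judging by the
# melted columns, it reduces one county/state group to the six summary
# statistics; a minimal sketch under that assumption, with hypothetical input
# column names, might look like:
#
#     def get_summary_stats(group):
#         pct = group['pct_low_credibility']  # hypothetical column name
#         return pd.Series({
#             'No. accounts': group['user_id'].nunique(),  # hypothetical column
#             'No. tweets': group['n_tweets'].sum(),       # hypothetical column
#             'Mean % low-credibility': pct.mean(),
#             'Stderr % low-credibility': pct.sem(),
#             'Min % low-credibility': pct.min(),
#             'Max % low-credibility': pct.max(),
#         })
#
# The real helper lives elsewhere in the repository and may differ.
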
def clean_Twitter_csv(data_path):
    """Clean the county-level Twitter low-credibility data, sweeping over
    account and tweet thresholds and returning one long-format frame with
    threshold-suffixed variable names."""

    twitter_data = pd.read_csv(data_path)

    twitter_data = twitter_data.replace('St. Tammany Parish',
                                        'St Tammany Parish')
    twitter_data = twitter_data.replace('St. Joseph County',
                                        'St Joseph County')

    results_data_list = []

    for no_accounts in [1, 10, 50, 100]:
        for no_tweets in [1, 10, 50, 100, 200, 500]:
            suffix = f'{no_accounts}_accounts_{no_tweets}_tweets '

            thresholded_accounts = threshold_accounts(twitter_data,
                                                      no_accounts, no_tweets)
            if len(thresholded_accounts) < 1:
                continue

            thresholded_accounts = thresholded_accounts.rename(columns={
                'county': 'County',
                'state': 'State'
            })

            fips_map = Geo().get_county_state_to_fips_map(unique_fips=False)

            twitter_data_with_fips = pd.merge(thresholded_accounts,
                                              fips_map,
                                              on=['County', 'State'],
                                              how='left')

            missing = twitter_data_with_fips[
                twitter_data_with_fips.fips_code.isna()][['County', 'State']]
            if len(missing) > 0:
                print("For ", suffix)
                print("Missing the following counties' fips codes")
                print(missing)

            # Data columns from the Twitter data file; the suffix (which
            # encodes the account and tweet thresholds) is prepended to each
            # of these column names in the output.
            columns = [
                'Mean % low-credibility',
                'Stderr % low-credibility',
                'Min % low-credibility',
                'Max % low-credibility',
                'No. accounts',
                'No. tweets',
            ]

            output_columns = {c: suffix + c for c in columns}

            twitter_data_to_output = twitter_data_with_fips.rename(
                columns=output_columns)

            data = twitter_data_to_output.rename(columns={'fips_code': 'FIPS'})

            data = data.melt(id_vars=["FIPS", "State", "County"],
                             value_vars=list(output_columns.values()))
            data = data[~data.FIPS.isna()]
            results_data_list.append(data)

    return pd.concat(results_data_list, sort=False)
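

# Note (assumption, not part of the original file): threshold_accounts is a
# project helper that is not shown in this snippet. Given how it is called
# above, it presumably keeps only the counties whose totals meet both
# thresholds; a minimal sketch under that assumption:
#
#     def threshold_accounts(df, no_accounts, no_tweets):
#         mask = (df['No. accounts'] >= no_accounts) & \
#                (df['No. tweets'] >= no_tweets)
#         return df[mask].copy()
#
# The real implementation may apply the thresholds differently.
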
def clean_OWID_vaccine_uptake_csv(config, early=False):
    """Clean vaccine uptake data"""

    data_path = os.path.join(config["PATHS"]["STATE_DATA_DIR"],
                             config["FILES"]["OWID_DATA_FILE"])

    cols = [
        "daily_vaccinations_per_million", "people_vaccinated_per_hundred",
        "people_fully_vaccinated_per_hundred", "share_doses_used"
    ]

    data = pd.read_csv(data_path,
                       usecols=["date", "location"] + cols,
                       dtype={
                           "location": str,
                           "date": str,
                           # dict.update() returns None, so build one literal
                           # instead of chaining .update()
                           **{c: float for c in cols},
                       })

    data['DateTime'] = pd.to_datetime(data['date'])

    start_date = "UPTAKE_START"
    end_date = "UPTAKE_END"

    if early:
        start_date = "UPTAKE_EARLY_START"
        end_date = "UPTAKE_EARLY_END"

    start = pd.to_datetime(config["DATES"][start_date])
    end = pd.to_datetime(config["DATES"][end_date])

    time_period = data[(data.DateTime >= start)
                       & (data.DateTime <= end)].copy()

    # Average the numeric columns over the selected window; numeric_only drops
    # the raw date strings so the mean does not fail on them
    grouped_data = time_period.groupby('location').mean(
        numeric_only=True).reset_index()
    # Fix NY bug in OWID data
    grouped_data.loc[grouped_data.location == 'New York State',
                     'location'] = 'New York'

    merged_data = pd.merge(grouped_data,
                           Geo().get_state_to_fips_map(),
                           left_on='location',
                           right_on='State')
    merged_data['County'] = ''

    if early:
        old_cols = cols
        merged_data = merged_data.rename(
            columns={col: 'early_' + col
                     for col in old_cols}).copy()
        cols = ['early_' + col for col in old_cols]

    data = merged_data.melt(id_vars=["FIPS", "State", "County"],
                            value_vars=cols)

    return data
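

# Illustrative usage (not in the original file): both windows return the same
# long format ["FIPS", "State", "County", "variable", "value"], and the early
# window's variables are prefixed with 'early_', so the two can be cleaned
# separately and stacked. `config` is the parsed config object used above.
#
#     uptake = pd.concat([
#         clean_OWID_vaccine_uptake_csv(config, early=False),
#         clean_OWID_vaccine_uptake_csv(config, early=True),
#     ], sort=False)
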
def clean_FB_survey_csv(data_path, state_level=False):
    """Clean facebook survey county-level data."""

    data = pd.read_csv(
        data_path,
        usecols=["geo_value", "time_value", "value", "stderr", "sample_size"],
        dtype={
            "geo_value": str,
            "time_value": str,
            "value": float,
            "stderr": float,
            # sample_size needs to be read as a float or the data doesn't load
            "sample_size": float,
        })
    data['DateTime'] = pd.to_datetime(data['time_value'])

    start_date = "REFUSAL_START"
    end_date = "REFUSAL_END"

    # `config` is used as a module-level global here (it is parsed in the
    # script's main block and is not passed in as a parameter).
    start = pd.to_datetime(config["DATES"][start_date])
    end = pd.to_datetime(config["DATES"][end_date])

    data = data[(data.DateTime >= start) & (data.DateTime <= end)].copy()

    data['num_accept'] = data.sample_size * (data.value / 100.0)
    aggregate = data.groupby('geo_value',
                             as_index=False).sum(numeric_only=True)

    # Calculate the new variables for the aggregates
    aggregate['mean_smoothed_covid_vaccinated_or_accept'] = (
        aggregate.num_accept / aggregate.sample_size)
    aggregate['stderr_smoothed_covid_vaccinated_or_accept'] = aggregate.apply(
        lambda x: get_stderr(x['num_accept'], x['sample_size']), axis=1)

    if not state_level:
        fips_map = Geo().get_county_state_to_fips_map(unique_fips=True)
        data = pd.merge(aggregate,
                        fips_map,
                        left_on='geo_value',
                        right_on='fips_code')
    else:
        state_abbr = Geo().get_state_to_fips_map()
        data = pd.merge(aggregate,
                        state_abbr,
                        left_on='geo_value',
                        right_on='abbr_lower')
        data['County'] = ''

    data = data.rename(
        columns={
            'fips_code': 'FIPS',
            'num_accept': 'num_smoothed_covid_vaccinated_or_accept',
            'sample_size':
            'sample_size_for_covid_vaccinated_or_accept_question'
        })

    # This will help to create a standard set of columns, which will be:
    #     ["FIPS","State", "County", "variable", "value"]
    data = data.melt(id_vars=["FIPS", "State", "County"],
                     value_vars=[
                         "mean_smoothed_covid_vaccinated_or_accept",
                         "stderr_smoothed_covid_vaccinated_or_accept",
                         "num_smoothed_covid_vaccinated_or_accept",
                         "sample_size_for_covid_vaccinated_or_accept_question"
                     ])

    return data
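

# Note (assumption, not part of the original file): get_stderr is a project
# helper used above but not defined in this snippet. For a proportion
# p = num_accept / sample_size estimated from sample_size responses, the usual
# binomial standard error is sqrt(p * (1 - p) / sample_size); a minimal sketch
# consistent with how it is called above:
#
#     def get_stderr(num_accept, sample_size):
#         p = num_accept / sample_size
#         return (p * (1.0 - p) / sample_size) ** 0.5
#
# The real helper may scale the result differently (e.g. to a percentage).
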
if __name__ == "__main__":
    # NOTE: the guard condition was cut off in this snippet; checking that the
    # working directory is `src` is an assumption consistent with the message.
    if os.path.basename(os.getcwd()) != "src":
        raise Exception(
            "CHANGE CURRENT WORKING DIRECTORY TO THE `src` PATH BEFORE RUNNING!!"
        )

    # Load config_file_path from commandline input
    args = parse_cl_args()
    config_file_path = args.config_file

    state_level = args.state_level
    keywords_filter = args.keywords_filter

    # Get config file object
    config = parse_config_file(config_file_path)

    # Initialize the Geo class and load lookup tables/dicts
    g = Geo()
    fip_lookup = g.load_fip_code_lookup()
    state_lookup = g.load_state_abbrv_lookup(as_dict=True)

    # Get base dir for county-level data and set data file paths
    county_data_dir = config["PATHS"]["COUNTY_DATA_DIR"]
    state_data_dir = config["PATHS"]["STATE_DATA_DIR"]
    covid_data_dir = config["PATHS"]["COVID_DATA_DIR"]
    intermediate_data_dir = config["PATHS"]["INTERMEDIATE_DATA_DIR"]

    people_file_path = os.path.join(county_data_dir,
                                    config["FILES"]["COUNTY_PEOPLE"])
    income_file_path = os.path.join(county_data_dir,
                                    config["FILES"]["COUNTY_INCOME"])
    gini_file_path = os.path.join(county_data_dir,
                                  config["FILES"]["COUNTY_GINI"])
Example #6
    def __init__(self):
        self._logger = logging.getLogger('POI')
        self._session = requests.session()
        self._geo = Geo()
Example #7
import logging
import re

import requests
from lxml import html


class POI(object):
    def __init__(self):
        self._logger = logging.getLogger('POI')
        self._session = requests.session()
        self._geo = Geo()  # project-local geocoding helper; its import is not shown here

    def _fetch_and_parse(self, url):
        resp = self._session.get(url)

        if resp.status_code != 200:
            raise Exception("HTTP request <%s> returned status code %d" %
                            (url, resp.status_code))

        return html.fromstring(resp.text.encode("utf8"))

    def get_points(self, category_name, url):
        self._logger.info('Category: {}'.format(category_name))

        tree = self._fetch_and_parse(url)

        if tree.xpath('//div[@class="Paragraph"]//li/p//a'):
            res = self._get_points_from_old_tree(tree)
        elif tree.xpath('//article[@class="object"]'):
            res = self._get_points_from_new_tree(tree)
        else:
            raise Exception('Unknown POI page format: <{}>'.format(url))

        points = []

        self._logger.info('Points: {}'.format(len(res)))

        for name, address in res:
            name = name.text.strip()
            address = address.text.strip()

            street = re.split(r'[,\(-]', address)[0].strip()

            # Skip entries with no address or with a place outside Poznań
            if address == '' or ('Pozna' not in address and "\n" in address):
                self._logger.info("Skipping! - %s: %s", name, address)
                continue

            self._logger.debug('%s - %s', name, street)

            pos = self._geo.query(street + u', Poznań')

            points.append({
                "name": name,
                "address": street,
                "lat": pos['lat'] if pos is not None else False,
                "lon": pos['lon'] if pos is not None else False,
            })

        return points

    @staticmethod
    def _get_points_from_old_tree(tree):
        """
        @see http://www.poznan.pl/mim/inwestycje/biurowce,poi,4661/ [old format]
        """
        names = tree.xpath('//div[@class="Paragraph"]//li/p//a')
        addresses = tree.xpath('//div[@class="Paragraph"]//li/p[2]')

        # Materialise the pairs so len() works on the result in get_points
        return list(zip(names, addresses))

    @staticmethod
    def _get_points_from_new_tree(tree):
        """
        @see http://www.poznan.pl/mim/osiedla/muzea-w-poznaniu,poi,202,12/ [new format]
        """
        names = tree.xpath('//article[contains(@class, "object")]//h2')
        addresses = tree.xpath('//article[contains(@class, "object")]//p[1]')

        return list(zip(names, addresses))
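

# Illustrative usage sketch (not part of the original example): driving the
# POI scraper above. The URL is taken from the docstring of
# _get_points_from_new_tree; Geo() must be able to geocode Poznań addresses.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)

    poi = POI()
    museums = poi.get_points(
        'muzea w Poznaniu',  # category label is only used for logging
        'http://www.poznan.pl/mim/osiedla/muzea-w-poznaniu,poi,202,12/')

    for point in museums:
        print(point['name'], point['address'], point['lat'], point['lon'])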