Beispiel #1
0
def main():
    """Annotate the users shapefile with eBird display names and state codes.

    Reads the users and counties shapefiles named by the parsed arguments,
    looks up the eBird display name behind each user's sample checklist,
    finds the county polygon closest to each user's point, and writes the
    ``user_name`` ("<display name> / <county NAME_2>") and ``state`` columns
    back to the users shapefile in place.
    """
    # read arguments
    args = parse_args()
    users_shp = args.users_shp
    counties_shp = args.counties_shp

    # read user data
    users = gpd.read_file(users_shp)

    # get state/county shapefile; rows without a HASC_2 code cannot yield a state
    counties = gpd.read_file(counties_shp)
    counties = counties.dropna(subset=['HASC_2'])

    # load ebird client
    # NOTE(review): the API key is hard-coded in source; it should be moved to
    # configuration or the environment, and the committed key rotated.
    client = Client('6qmfvb8pg9dk', 'en_US')

    # find eBird display name for every user
    display_names = []
    for checklist in users.sample_che:
        try:
            display_name = client.get_checklist(
                checklist.split('/')[-1])['userDisplayName']
        except ValueError:
            print(f'No valid username for checklist {checklist}')
            display_name = 'None'
        display_names.append(display_name)

    # find state/county for every user
    # The polygons are materialized once instead of once per user, and names
    # are paired with points by position so the result does not depend on the
    # dataframe index labels being 0..n-1.
    county_polygons = list(counties.geometry)
    user_names = []
    states = []
    for display_name, point in zip(display_names, users.geometry):
        closest = np.argmin(
            [point.distance(polygon) for polygon in county_polygons])
        county = counties.iloc[closest]
        user_names.append(f"{display_name} / {county['NAME_2']}")
        # HASC_2 is split on '.' and the second component is used as the state
        states.append(county['HASC_2'].split('.')[1])

    # add user display name and state
    users['user_name'] = user_names
    users['state'] = states

    # write update shapefile
    users.to_file(users_shp)
    def __init__(self,
                 ebird_credential_path: Path,
                 xcache_path: Path = cache_path,
                 country: str = 'US'):
        """
        Load the eBird API key, create the API client and set up cache paths.

        :param ebird_credential_path: Path to YAML files for eBird API Key credentials
        :param xcache_path: Where files like subnational2 codes and taxonomy are cached
        :param country: This is only important when retrieving and caching subnational2 codes
        """
        self.ebird_credential_path = ebird_credential_path
        self.cache_path = xcache_path
        self.country = country
        self.ebird_client = None
        self.__ebird_api_key = get_credential(self.ebird_credential_path)
        if not self.__ebird_api_key:
            print('No API key found for eBird')

        # Use the caller-supplied cache directory. Previously the module-level
        # default was always used here, so the xcache_path argument was ignored.
        self._cache_path = xcache_path
        self._cached_visits_path = self._cache_path / 'visits'
        self._cached_historic_path = self._cache_path / 'historic'
        self._cached_details_path = self._cache_path / 'details'

        # Without a key the client stays None
        if self.__ebird_api_key:
            self.ebird_client = Client(self.__ebird_api_key,
                                       EBIRD_DEFAULT_LOCALE)

        # We do this as a side effect as a user convenience. The list of subnational2 codes
        # are saved in the cache for reference. subnational2 codes are the region codes
        # needed in the parameters file. Not fatal if we can't get it. Files saved e.g.:
        #      cache_path / 'regions-US-subnational2.csv'
        # Note that this also calls and caches subnational1 codes
        try:
            _ = self.get_subnational2_cached()
        except Exception as ee:
            print(f'Failed to get subnational2 codes: {ee}')
Beispiel #3
0
'''

(1) Check how much of the data I scraped before is still there
(2) See whether there is some way to match the missing ones


'''
# eBird API access: the key is read from the local eb_passwords module.
import eb_passwords
from ebird.api import Client, get_visits
import datetime

api_key = eb_passwords.ebird_api_key
locale = 'zh'
# Shared eBird API client, created with the 'zh' locale.
client = Client(api_key, locale)

# Stand-alone Django bootstrap so the project's models can be used from this
# script. The statements below are order-dependent: the project directory is
# put on sys.path first, then the settings are configured from the project's
# base settings, then django.setup() is called.
import sys
import os
import django
from django.conf import settings
sys.path.append(os.path.abspath('ebirdtaiwan'))
from ebirdtaiwan.settings.base import DATABASES, INSTALLED_APPS
settings.configure(DATABASES=DATABASES, INSTALLED_APPS=INSTALLED_APPS)
django.setup()

# Imported only after django.setup() has run.
from fall.models import AutumnChanllengeData

#################################
####  TEST 1 ####################
###############################
'''
today = datetime.date.today()
#feature test for ebird api
# Can I get access to what I want...

from ebird.api import Client
import time
import datetime

api_key = 'o1rng64r9e2b'
locale = 'zh'
client = Client(api_key, locale)

records = client.get_visits('TW', date=datetime.date.today())

region_codes = [
    'TW-TPE',  #台北
    'TW-TPQ',  #新北
    'TW-TAO',  #桃園
    'TW-HSQ',  #新竹
    'TW-MIA',  #苗栗
    'TW-TXG',  #台中
    'TW-CHA',  #彰化
    'TW-NAN',  #南投
    'TW-YUN',  #雲林
    'TW-CYQ',  #嘉義
    'TW-TNN',  #台南
    'TW-KHH',  #高雄
    'TW-PIF',  #屏東
    'TW-TTT',  #台東
    'TW-HUA',  #花蓮
    'TW-ILA',  #宜蘭
    'TW-PEN',  #澎湖
    'TW-HUA' : '花蓮',
    'TW-ILA' : '宜蘭',
    'TW-PEN' : '澎湖',
    'TW-KIN' : '金門',
    'TW-LIE' : '連江',
}

'''
There is no need to copy their data into my database;
 querying their data directly is good enough
'''

# Module-level holder that AllCheckListDashDT declares as global;
# it starts out as an empty string.
df_checklist = ''


# Shared eBird API client, keyed from eb_passwords and using the 'zh' locale.
client = Client(eb_passwords.ebird_api_key, 'zh')

def AllCheckListDashDT(date):
    """Fetch the eBird visits for region 'TW' on ``date``.

    When no checklists come back, an ``html.H3`` element with a
    "No data in this date" message is returned. Otherwise a set of empty
    per-field lists is prepared.

    NOTE(review): the remainder of this function is not visible in this
    excerpt, so how the lists and the global ``df_checklist`` are filled,
    and what is returned in the non-empty case, cannot be documented here.
    """
    
    global df_checklist

    checklists = client.get_visits('TW', date=date)
    if len(checklists) == 0:
        return html.H3(f'No data in this date: {date}')
    
    # One list per output field; presumably filled from `checklists` below.
    CLID     = []
    userName = []
    obsDT    = []
    county   = []
    locName  = []
    lat      = []
class EBirdExtra(object):
    def __init__(self,
                 ebird_credential_path: Path,
                 xcache_path: Path = cache_path,
                 country: str = 'US'):
        """
        Load the eBird API key, create the API client and set up cache paths.

        :param ebird_credential_path: Path to YAML files for eBird API Key credentials
        :param xcache_path: Where files like subnational2 codes and taxonomy are cached
        :param country: This is only important when retrieving and caching subnational2 codes
        """
        self.ebird_credential_path = ebird_credential_path
        self.cache_path = xcache_path
        self.country = country
        self.ebird_client = None
        self.__ebird_api_key = get_credential(self.ebird_credential_path)
        if not self.__ebird_api_key:
            print('No API key found for eBird')

        # Use the caller-supplied cache directory. Previously the module-level
        # default was always used here, so the xcache_path argument was ignored.
        self._cache_path = xcache_path
        self._cached_visits_path = self._cache_path / 'visits'
        self._cached_historic_path = self._cache_path / 'historic'
        self._cached_details_path = self._cache_path / 'details'

        # Without a key the client stays None
        if self.__ebird_api_key:
            self.ebird_client = Client(self.__ebird_api_key,
                                       EBIRD_DEFAULT_LOCALE)

        # We do this as a side effect as a user convenience. The list of subnational2 codes
        # are saved in the cache for reference. subnational2 codes are the region codes
        # needed in the parameters file. Not fatal if we can't get it. Files saved e.g.:
        #      cache_path / 'regions-US-subnational2.csv'
        # Note that this also calls and caches subnational1 codes
        try:
            _ = self.get_subnational2_cached()
        except Exception as ee:
            print(f'Failed to get subnational2 codes: {ee}')

    def get_taxonomy_from_ebird(self) -> Optional[pd.DataFrame]:
        """
        Download the taxonomy through the eBird client.

        :return: dataframe of the taxonomy with missing values replaced by '',
                 or None when no eBird client is available
        """
        if not self.ebird_client:
            return None

        raw_taxonomy = self.ebird_client.get_taxonomy()
        return pd.DataFrame(raw_taxonomy).fillna('')

    # These are very high level stats, so not particularly useful unless looking at the whole state
    # For example, https://api.ebird.org/v2/product/stats/US-CA-085/2019/12/15 returns
    # {'numChecklists': 0, 'numContributors': 0, 'numSpecies': 0}
    def get_regional_statistics_on_a_date(self, region_code: str, year: int,
                                          month: int, day: int):
        """
        Regional statistics on a date.

        Calls https://api.ebird.org/v2/product/stats/{{regionCode}}/{{y}}/{{m}}/{{d}}
        Note that it works for 'US-CA' but not 'US-CA-085'; it
        returns e.g. {'numChecklists': 1129, 'numContributors': 996, 'numSpecies': 299}

        :param region_code: eBird region code placed in the URL
        :param year: year component of the date
        :param month: month component of the date
        :param day: day component of the date
        :return: the decoded JSON response on success; an empty dataframe when
                 the request fails (errors are printed, not raised)
        """
        outcome = pd.DataFrame()
        try:
            endpoint = ('https://api.ebird.org/v2/product/stats/'
                        f'{region_code}/{year}/{month}/{day}')
            auth_header = {'X-eBirdApiToken': self.__ebird_api_key}

            print(endpoint)
            response = requests.get(endpoint,
                                    params=None,
                                    headers=auth_header,
                                    stream=True)
            if response.status_code == requests.codes.ok:
                outcome = response.json()
            # Any non-success status is turned into an exception and reported below
            response.raise_for_status()

        except Exception as ee:
            print(ee)
            traceback.print_exc(file=sys.stdout)

        return outcome

    # For a myriad of reasons we cannot ever expand the CBC period beyond the current date range
    # of December 14th through January 5th and CBC counts cannot take place outside of these dates.

    def get_historic_observations_on_a_date(self, region_code: str, year: int,
                                            month: int, day: int):
        """
        Historic observations for a region on a date.

        Calls https://api.ebird.org/v2/data/obs/{{regionCode}}/historic/{{y}}/{{m}}/{{d}}

        :param region_code: eBird region code placed in the URL
        :param year: year component of the date
        :param month: month component of the date
        :param day: day component of the date
        :return: dataframe built from the JSON response; empty when the
                 request fails (errors are printed, not raised)
        """
        observations = pd.DataFrame()
        try:
            endpoint = ('https://api.ebird.org/v2/data/obs/'
                        f'{region_code}/historic/{year}/{month}/{day}')
            auth_header = {'X-eBirdApiToken': self.__ebird_api_key}

            response = requests.get(endpoint,
                                    params=None,
                                    headers=auth_header,
                                    stream=True)
            if response.status_code == requests.codes.ok:
                observations = pd.DataFrame(response.json())
            # Any non-success status is turned into an exception and reported below
            response.raise_for_status()

        except Exception as ee:
            print(ee)
            traceback.print_exc(file=sys.stdout)

        return observations

    def get_checklist_feed_for_region_on_date(self, region_code: str,
                                              xdate: str) -> pd.DataFrame:
        """
        Fetch the checklist feed for a region on one date.

        Calls https://api.ebird.org/v2/product/lists/{{regionCode}}/{{y}}/{{m}}/{{d}}
        with maxResults=200.

        :param region_code: eBird region code placed in the URL
        :param xdate: date in YYYY-MM-DD format, e.g. '2020-12-26'
        :return: dataframe built from the JSON response; empty when the
                 request fails (errors are printed, not raised)
        :raises ValueError: if xdate does not match '%Y-%m-%d' (it is parsed
                 before the try block)
        """
        oxdate = datetime.strptime(xdate, '%Y-%m-%d')
        results = pd.DataFrame()
        try:
            api_url_base = 'https://api.ebird.org/v2/product/lists'
            api_auth_header = {'X-eBirdApiToken': self.__ebird_api_key}
            url = f'{api_url_base}/{region_code}/{oxdate.year}/{oxdate.month}/{oxdate.day}'
            xparams = {'maxResults': 200}

            rr = requests.get(url,
                              params=xparams,
                              headers=api_auth_header,
                              stream=True)
            if rr.status_code == requests.codes.ok:
                results = pd.DataFrame(rr.json())
            rr.raise_for_status()

        except Exception as ee:
            # Label the message with this method's own name; it previously
            # named get_recent_observations_for_region.
            print(f'get_checklist_feed_for_region_on_date: {ee}')

        return results

    def get_species_list_at_a_location(self, loc_id):
        """
        GET Species List at a Location [IN DEVELOPMENT]

        Calls https://api.ebird.org/v2/product/spplist/{{locId}}
        Seems to be returned in taxonomic order.

        :param loc_id: eBird location id, e.g. "L99381"
        :return: dataframe built from the JSON response; empty when the
                 request fails (errors are printed, not raised)
        """
        results = pd.DataFrame()
        try:
            api_url_base = 'https://api.ebird.org/v2/product/spplist'
            # The key is stored as the private attribute __ebird_api_key.
            # The previous self.ebird_api_key does not exist, so this method
            # always failed with AttributeError and returned an empty frame.
            api_auth_header = {'X-eBirdApiToken': self.__ebird_api_key}
            url = f'{api_url_base}/{loc_id}'

            rr = requests.get(url,
                              params=None,
                              headers=api_auth_header,
                              stream=True)
            if rr.status_code == requests.codes.ok:
                results = pd.DataFrame(rr.json())
            rr.raise_for_status()

        except Exception as ee:
            print(ee)
            traceback.print_exc(file=sys.stdout)

        return results

    def get_subnational1_cached(self) -> pd.DataFrame:
        """
        Return the subnational1 regions of self.country (in the US, the states).

        The list is read from 'regions-<country>-subnational1.csv' in the cache
        directory when that file exists; otherwise it is fetched through the
        eBird client and written there.

        :return: dataframe of subnational1 regions; empty if anything fails
                 (errors are printed, not raised)
        """
        regions = pd.DataFrame()
        csv_path = self._cache_path / f'regions-{self.country}-subnational1.csv'

        try:
            if csv_path.exists():
                regions = pd.read_csv(csv_path, index_col=False)
            else:
                print(f'Creating eBird subnational1 region cache...')
                fetched = self.ebird_client.get_regions('subnational1',
                                                        self.country)
                regions = pd.DataFrame(fetched).fillna('')
                regions.to_csv(csv_path, index=False)

        except Exception as ee:
            print(ee)
            traceback.print_exc(file=sys.stdout)

        return regions

    def get_subnational2_cached(self) -> pd.DataFrame:
        """
        Return the subnational2 regions ("counties") of self.country.

        Country is the two character ISO code, e.g. 'US'
        https://www.nationsonline.org/oneworld/country_code_list.htm

        The list is read from 'regions-<country>-subnational2.csv' in the cache
        directory when that file exists. Otherwise the subnational1 regions are
        obtained first, the subnational2 regions of each are fetched through
        the eBird client and tagged with a 'state' column, and the combined
        table is written to the cache.

        :return: dataframe of subnational2 regions; empty if anything fails
                 (errors are printed, not raised)
        """
        csv_path = self._cache_path / f'regions-{self.country}-subnational2.csv'
        counties = pd.DataFrame()

        try:
            if csv_path.exists():
                counties = pd.read_csv(csv_path, index_col=False)
            else:
                print(f'Creating eBird subnational2 region cache...')

                # Get "states"
                states = self.get_subnational1_cached()

                # Get "counties" for each "state"
                per_state_frames = []
                for region in states.itertuples():
                    state_name, state_code = region.name, region.code
                    state_counties = pd.DataFrame(
                        self.ebird_client.get_regions('subnational2',
                                                      state_code))
                    state_counties['state'] = state_name
                    per_state_frames.append(state_counties)

                counties = pd.concat(per_state_frames)
                counties.to_csv(csv_path, index=False)

        except Exception as ee:
            print(ee)
            traceback.print_exc(file=sys.stdout)

        return counties

    def get_visits_expanded(self, region_code: str,
                            date_of_count: str) -> pd.DataFrame:
        """
        A wrapper on top of eBird.api's get_visits that expands the
        loc field (which is a dictionary) into additional 'loc_'-prefixed
        columns in the dataframe, and adds a 'RegionCode' column.

        The result is cached as 'visits-<region>-<date>.csv' in the visits
        cache directory and read from there on later calls.

        :param region_code: eBird region code, e.g. 'US-CA-085'
        :param date_of_count: YYYY-MM-DD format e.g. '2019-12-28',
        :return: dataframe with every checklist filed in eBird on date for region;
                 errors are printed, not raised
        """
        visits_expanded = pd.DataFrame()
        csv_path = self._cached_visits_path / f'visits-{region_code}-{date_of_count}.csv'
        try:
            if not csv_path.is_file():
                raw_visits = self.ebird_client.get_visits(region_code,
                                                          date_of_count)
                visits = pd.DataFrame(raw_visits)

                # Make a dataframe out of the 'loc' column
                loc_records = [visit['loc'] for _, visit in visits.iterrows()]
                locs_df = pd.DataFrame(loc_records)
                locs_df.columns = ['loc_' + name for name in locs_df.columns]

                # Put the expanded columns beside the originals, then drop the dict column
                side_by_side = pd.concat([visits, locs_df], axis=1)
                visits_expanded = side_by_side.drop(
                    ['loc'], axis=1).reset_index(drop=True)

                visits_expanded['RegionCode'] = region_code

                visits_expanded.to_csv(csv_path, index=False)
            else:
                visits_expanded = pd.read_csv(csv_path, index_col=False)
        except Exception as ee:
            print(ee)
            traceback.print_exc(file=sys.stdout)

        return visits_expanded

    def get_checklist(self, sub_id: str):
        """
        Fetch one checklist through the eBird client.

        :param sub_id: checklist (submission) ID passed to the client
        :return: whatever the client's get_checklist returns for sub_id
        """
        return self.ebird_client.get_checklist(sub_id)

    # --------------------------- HOTSPOTS ---------------------------

    def get_hotspots(self, region_codes: List[str]):
        """
        Combine the (cached) hotspots of several regions into one GeoDataFrame.

        was: hotspot_data_for_regions

        :param region_codes: eBird region codes; must contain at least one entry
        :return: tuple of (GeoDataFrame of all hotspots with point geometry,
                 centroid of their convex hull with its two coordinates reversed)
        """
        head, *tail = region_codes
        hotspots = self._get_hotspots_for_region_cached(head)

        for code in tail:
            more_hotspots = self._get_hotspots_for_region_cached(code)
            hotspots = pd.concat([hotspots, more_hotspots],
                                 ignore_index=True)

        # Points are built as (lng, lat)
        points = list(map(Point, hotspots.lng, hotspots.lat))
        hotspots_gdf = gpd.GeoDataFrame(hotspots,
                                        geometry=points,
                                        crs='epsg:4269')

        hull = hotspots_gdf.unary_union.convex_hull
        lng_lat = hull.centroid.coords[0]
        center_pt = lng_lat[::-1]

        return hotspots_gdf, center_pt

    def get_hotspots_for_region(self, region_code: str) -> pd.DataFrame:
        """
        Download the hotspots of one region.

        https://ebird.org/ws2.0/ref/hotspot/geo?lat=37.4407&lng=-122.0936&fmt=json&dist=6
        https://api.ebird.org/v2/ref/hotspot/{{regionCode}}
        Sample returned row (includes geometry column too)
        L6530472	US	US-CA	US-CA-085	37.435150	-122.102342	Adobe Creek--north of US-101
        2020-06-06 10:03	113	POINT (-122.10234 37.43515)

        :param region_code: eBird region code placed in the URL
        :return: dataframe parsed from the CSV response with columns
                 locid, r1, r2, r3, lat, lng, name, date, num; empty when the
                 request fails (errors are printed, not raised)
        """
        hotspots = pd.DataFrame()
        column_names = [
            'locid', 'r1', 'r2', 'r3', 'lat', 'lng', 'name', 'date', 'num'
        ]
        try:
            # API token not currently required, but may be in future
            endpoint = 'https://api.ebird.org/v2/ref/hotspot' + f'/{region_code}'
            response = requests.get(endpoint, stream=True)
            if response.status_code == requests.codes.ok:
                hotspots = pd.read_csv(StringIO(response.text),
                                       names=column_names,
                                       index_col=False)
            # Any non-success status is turned into an exception and reported below
            response.raise_for_status()

        except Exception as ee:
            print(ee)

        return hotspots

    def _get_hotspots_for_region_cached(self,
                                        region_code: str) -> pd.DataFrame:
        """
        Return the hotspots of one region, using the on-disk cache.

        The table is read from 'hotspots-<region>.csv' in the cache directory
        when that file exists; otherwise it is downloaded and, if non-empty,
        written there.

        :param region_code: eBird region code
        :return: dataframe of hotspots for the region (may be empty)
        """
        fpath = self._cache_path / f'hotspots-{region_code}.csv'
        if fpath.is_file():
            return pd.read_csv(fpath, index_col=False)

        hs_df = self.get_hotspots_for_region(region_code)
        # get_hotspots_for_region returns an empty frame when the request
        # fails. Do not cache that, or the failure would be stored on disk
        # and the region would never be fetched again.
        if not hs_df.empty:
            hs_df.to_csv(fpath, index=False)
        return hs_df

    # --------------------------- VISITS ---------------------------

    def get_visits(self, region_codes: List[str], date_of_count: str):
        """
        Collect the expanded visits of several regions for one date.

        :param region_codes: eBird region codes; must contain at least one entry
        :param date_of_count: date passed through to get_visits_expanded
        :return: one dataframe with the visits of all regions
        """
        head, *tail = region_codes
        all_visits = self.get_visits_expanded(head, date_of_count)

        for code in tail:
            more_visits = self.get_visits_expanded(code, date_of_count)
            all_visits = pd.concat([all_visits, more_visits],
                                   ignore_index=True)

        return all_visits

    def get_visits_for_dates(self, region_codes: List[str], dates: List[str]):
        """
        Collect the visits of several regions over several dates.

        :param region_codes: eBird region codes passed through to get_visits
        :param dates: dates to query; must contain at least one entry
        :return: one dataframe with the visits of all regions on all dates
        """
        head, *tail = dates
        all_visits = self.get_visits(region_codes, head)

        for day in tail:
            day_visits = self.get_visits(region_codes, day)
            all_visits = pd.concat([all_visits, day_visits],
                                   ignore_index=True)

        return all_visits

    def get_recent_observations_for_region(
            self,
            region_code: str,
            back: int = 14,
            cat: str = '(all)',
            hotspot: bool = False,
            include_provisional: bool = False,
            max_results: Union[int, str] = '(all)') -> pd.DataFrame:
        """
        Fetch the recent observations of a region.

        Calls https://api.ebird.org/v2/data/obs/{{regionCode}}/recent

        The back, cat, hotspot, include_provisional and max_results arguments
        are accepted but not forwarded; the request is sent without query
        parameters.

        :param region_code: eBird region code placed in the URL
        :return: dataframe built from the JSON response; empty when the
                 request fails (errors are printed, not raised)
        """
        observations = pd.DataFrame()
        try:
            auth_header = {'X-eBirdApiToken': self.__ebird_api_key}
            endpoint = 'https://api.ebird.org/v2/data/obs' + f'/{region_code}/recent'
            response = requests.get(endpoint,
                                    params=None,
                                    headers=auth_header,
                                    stream=True)
            if response.status_code == requests.codes.ok:
                observations = pd.DataFrame(response.json())
            # Any non-success status is turned into an exception and reported below
            response.raise_for_status()

        except Exception as ee:
            print(f'get_recent_observations_for_region: {ee}')

        return observations

    # --------------------------- DETAILS ---------------------------

    @staticmethod
    def convert_date_range_to_date_str(drange) -> List[str]:
        """
        Format every entry of drange as a 'YYYY-MM-DD' string.

        :param drange: iterable of objects with a strftime method
        :return: list of formatted date strings, in the order of drange
        """
        day_format = '%Y-%m-%d'
        return [day.strftime(day_format) for day in drange]

    def get_details_for_dates(self, subids_by_date: Dict[str, List[str]],
                              dates: List[str]):
        """
        Collect checklist details for several dates.

        Note that by construction, visits only contains data for dates we care about
        so we don't need to filter for that

        :param subids_by_date: checklist IDs keyed by date string
        :param dates: dates to process; must contain at least one entry
        :return: one dataframe with the details of all dates
        """
        first_date, *remaining_dates = dates
        # A date with no entry (or a None entry) is treated as having no
        # checklists. Passing None on would raise TypeError in get_details,
        # which joins the IDs before its try block.
        subids = subids_by_date.get(first_date) or []
        combined = self.get_details(subids, first_date)

        for xdate in remaining_dates:
            subids = subids_by_date.get(xdate) or []
            details = self.get_details(subids, xdate)
            combined = pd.concat([combined, details], ignore_index=True)

        return combined

    def get_details(self, subids: List[str], date_of_count: str):
        """
        Return a dataframe with the "obs" fields flattened. Will cache on disk
        Leave enhancement and other expansions for elsewhere
        Pass in all subids for date

        The cache file is named S<date>-<hash>.csv in the details cache
        directory, where <hash> is computed from the concatenated subids.

        :param subids: list of checklist IDs
        :param date_of_count: This is used for caching; assumes all subids are for same date
        :return: dataframe with the "obs" fields flattened; errors while
                 fetching or caching are printed, not raised
        """
        cache_key = compute_hash(''.join(subids), 12)
        compact_date = datetime.strptime(date_of_count,
                                         '%Y-%m-%d').strftime('%Y%m%d')

        details = pd.DataFrame()

        # Look in cache first
        details_path = self._cached_details_path / f'S{compact_date}-{cache_key}.csv'

        try:
            if details_path.exists():
                details = pd.read_csv(details_path, index_col=False)
            else:
                frames = []
                for subid in subids:
                    payload = self.get_checklist(subid)
                    # Birdathon iOS version 1.4.1 adds the subAux field, which breaks
                    # turning this into a dataframe directly
                    if 'subAux' in payload.keys():
                        del payload['subAux']
                    frame = pd.DataFrame(payload)
                    # Not every checklist has groupId, so add if not there
                    # We need it later for detecting duplicate checklists (e.g. shared)
                    if 'groupId' not in frame.columns:
                        frame['groupId'] = None
                    if not frame.empty:
                        frames.append(frame)

                if frames:
                    if len(frames) == 1:
                        details = frames[0]
                    else:
                        details = pd.concat(frames,
                                            axis=0,
                                            ignore_index=True)
                    details = self.flatten_detail_observations(details)
                else:
                    details = pd.DataFrame()
                details.to_csv(details_path, index=False)

        except Exception as ee:
            print(ee)
            traceback.print_exc(file=sys.stdout)

        return details

    def get_api_key(self):
        """
        Return the eBird API key loaded in __init__.

        :return: the stored key; may be falsy when no key was found
        """
        return self.__ebird_api_key

    """
    Sample observation entry
        {'speciesCode': 'rocpig1',
        'hideFlags': [],
        'obsDt': '2019-12-15 11:16',
        'subnational1Code': 'US-CA',
        'howManyAtleast': 92,
        'howManyAtmost': 92,
        'subId': 'S62345617',
        'projId': 'EBIRD',
        'obsId': 'OBS838617722',
        'howManyStr': '92',
        'present': False}
    """

    @staticmethod
    def flatten_detail_observations(details: pd.DataFrame) -> pd.DataFrame:
        """
        Merge each row's 'obs' dictionary into the row itself.

        For every row, the fields of its 'obs' entry are copied into the row
        (overwriting row fields of the same name) and the 'obs' entry is removed.

        :param details: dataframe with an 'obs' column holding dictionaries
        :return: new dataframe with one flattened record per input row
        """

        def _merge_obs(row):
            # Start from the row's own fields, then overlay the observation's
            record = row.to_dict().copy()
            record.update(row.obs)
            del record['obs']
            return record

        flattened_records = [_merge_obs(row) for _, row in details.iterrows()]

        return pd.DataFrame(flattened_records)
Beispiel #7
0
from ebird.api import Client
import time
import datetime

# NOTE(review): the API key is hard-coded in source; consider loading it from
# configuration instead.
api_key = 'o1rng64r9e2b'
locale = 'zh'
client = Client(api_key, locale)

start_date = datetime.date(2020, 9, 5)

# Query the visits for region 'TW' on each of 15 consecutive days starting at
# start_date, pausing two seconds between requests.
for i in range(15):
    day = start_date + datetime.timedelta(days=i)
    print(day)
    records = client.get_visits('TW', date=day)
    print(f'have {len(records)} new checklist!')
    time.sleep(2)