def to_dwca(self, file_path):
    if self.df.empty:
        return

    def concatenateIssue(row):
        if row._error:
            try:
                is_nan = math.isnan(row.issue)
            except TypeError:
                is_nan = False
            if row.issue and not is_nan:
                row.issue += f';{row._error}'
            else:
                row.issue = row._error
        return row.issue

    df = self.df.copy()
    df['dynamicProperties'] = np.nan
    df['dynamicProperties'] = df.apply(
        lambda row: {
            'vessel': self.vessel,
            'distance': row['_distance'],
            'voyagerInferrences': row['_inferred_on']
        }, axis=1)
    df['issue'] = df.apply(concatenateIssue, axis=1)
    df.drop(self.additional_columns, axis=1, inplace=True)
    # We merged these in from dynamicProperties
    df.drop(['vessel', 'expedition', 'datetime'], axis=1, inplace=True)
    df.drop(df.filter(regex="Unname"), axis=1, inplace=True)
    df.to_csv(file_path, index=False)
    logger.info('Saved DWC-A %s', file_path)

def _parse_imma(self, vessel_name, years):
    cache_key = 'icoads_{0}_{1}'.format(
        vessel_name, '-'.join(years)
    ).lower()
    cache_path = CACHE_DIR / f'{cache_key}.csv'
    try:
        logger.info(f'Loading {cache_key} from cache')
        df = pd.read_csv(cache_path)
    except FileNotFoundError:
        logger.info(f'Cached {cache_key} not found - parsing data')
    else:
        return df

    re_search = re.compile(vessel_name, re.IGNORECASE)
    data = []
    for year in years:
        for record in self._read_imma(year):
            supd = record.get('SUPD')
            if not supd:
                continue
            try:
                rid = record['ID'].strip()
            except AttributeError:
                # No record ID??
                continue
            m = re_search.search(supd)
            if m:
                data.append({
                    'ship_id': rid,
                    'year': record['YR'],
                    'month': record['MO'],
                    'day': record['DY'],
                    'lat': record['LAT'],
                    'lon': record['LON']
                })

    df = pd.DataFrame(data)
    df['datetime'] = pd.to_datetime(df[['day', 'year', 'month']])
    df = df.sort_values(by=['datetime'])
    df.to_csv(cache_path)
    return df

def _get_occurences_by_geotemporal_proximity(self):
    # Occurrences must have a date, so filter on the voyage date range
    df = self.gbif[
        (self.gbif.datetime >= self.route.date_from) &
        (self.gbif.datetime <= self.route.date_to)
    ]
    df = self._add_distance_to_route(df)
    df = df[df['_distance'] < self.MAX_KM_TO_ROUTE]
    df['_inferred_on'] = 'route_proximity'
    logger.info('%s occurrences found within %skm of route',
                df.shape[0], self.MAX_KM_TO_ROUTE)
    self._frames.append(df)

def _get_occurences(self):
    self._get_occurences_by_vessel()
    if self.expedition:
        self._get_occurences_by_expedition()
    if self.collectors:
        self._get_occurences_by_collector()
    if self.inferred_collectors:
        self._get_occurences_by_inferred_collector()
    self._get_occurences_by_geotemporal_proximity()

    if self._frames:
        df = pd.concat(self._frames)
        df = df.drop_duplicates(subset='gbifID', keep="last")
        logger.info('TOTAL: %s occurrences with complete data', df.shape[0])
        return df
    else:
        logger.error('No occurrences found for %s - %s',
                     self.vessel, self.route.year_from)
        return pd.DataFrame()

def get_cached(self, func, **kwargs):
    cache_key = 'gbif'
    if kwargs:
        kwargs_key = '-'.join(map(str, kwargs.values()))
        cache_key = f'{cache_key}-{kwargs_key}'
    cache_path = CACHE_DIR / f'{cache_key}.csv'
    try:
        logger.info(f'Loading {cache_key} from cache')
        df = pd.read_csv(cache_path)
    except FileNotFoundError:
        logger.info(f'Cached {cache_key} not found')
        # If we haven't yet loaded the main GBIF file, load it now.
        # Only do so when kwargs are specified, to prevent an infinite loop.
        if self.dwca is None and kwargs:
            logger.info('Loading DWCA')
            self.dwca = self.get_cached(self.parse_dwca)
        df = func(**kwargs)
        df.to_csv(cache_path)
    return df

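# Usage sketch for get_cached: both the occurrences_for_vessel method and the
# parse function it wraps below are hypothetical, for illustration only. Any
# expensive parse can be routed through get_cached; repeat calls with the same
# kwargs re-use the CSV written to CACHE_DIR.
def occurrences_for_vessel(self, vessel):
    # First call parses and writes CACHE_DIR / 'gbif-<vessel>.csv';
    # later calls read that cached file instead
    return self.get_cached(self._parse_occurrences_for_vessel, vessel=vessel)
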
def get_location_by_date(self, date):
    location = self.interpolated.loc[
        self.interpolated.index == date, ['lat', 'lon']]
    try:
        location = location.iloc[0]
    except IndexError:
        if date > self.date_to:
            logger.debug(
                f'Occurrence dated {date} is after voyage end date {self.date_to}'
            )
        elif date < self.date_from:
            logger.debug(
                f'Occurrence dated {date} is before voyage start date {self.date_from}'
            )
        else:
            # Downgraded from error: not every date between journey start and
            # journey end has an interpolated position
            logger.info(f'Could not find location for date {date}')
    else:
        return location.tolist()

def _get_occurences_by_vessel(self):
    # Match data on the vessel and expedition fields
    for field in ['vessel', 'expedition']:
        if self.gbif[field].any():
            if '+' in self.vessel:
                vessels = self.vessel.split('+')
                df = self.gbif[self.gbif[field].str.contains(
                    '|'.join(vessels), case=False, na=False, regex=True)]
            else:
                df = self.gbif[self.gbif[field].str.contains(
                    self.vessel, case=False, na=False)]
            df['_inferred_on'] = field
            logger.info('Found %s occurrences for %s.', df.shape[0], field)
            self._process_occurrences_with_inferences(df)
            # As we have matched on vessel/expedition we can be sure these
            # collectors are correct, so update the property
            self._update_inferred_collectors(df)

def interpolate(self):
    df = self.df.copy()
    df['date_diff'] = (df['datetime'] - df['datetime'].shift(1)).dt.days
    df.reset_index(drop=True, inplace=True)
    interpolation_break_points = []
    previous_row = None
    for index, row in df.iterrows():
        if previous_row is not None and row['date_diff'] > self.INTERPOLATION_MAX_DAYS:
            point = (row.lat, row.lon)
            previous_point = (previous_row.lat, previous_row.lon)
            distance = geodesic(previous_point, point).kilometers
            if distance > self.INTERPOLATION_MAX_DISTANCE:
                interpolation_break_points.append(index)
        previous_row = row

    if interpolation_break_points:
        logger.info('Splitting voyage into %s stages.',
                    len(interpolation_break_points))
        frames = self._split_df_into_frames(df, interpolation_break_points)
        frames = [self._interpolate(f) for f in frames]
        df = pd.concat(frames)
    else:
        # Perform interpolation on the whole data frame
        df = self._interpolate(df)
    return df

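# _split_df_into_frames is called above but not shown in this section. A
# minimal sketch under the assumption that it slices the voyage data frame at
# the positional break points, producing one frame per stage:
def _split_df_into_frames(self, df, break_points):
    bounds = [0] + list(break_points) + [len(df)]
    # Each stage runs from one break point up to (but not including) the next
    return [df.iloc[start:end]
            for start, end in zip(bounds, bounds[1:]) if end > start]
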
def _add_distance_to_route(self, df):
    # Matching on date can include records that are too distant from the
    # route - calculate the distance so they can be removed.
    # Records must also have a lat/lon.
    df = df[
        df['decimalLatitude'].notnull() & df['decimalLongitude'].notnull()
    ]
    mask = df['_distance'].isnull()
    logger.info('Calculating distance for %s occurrences.', mask.sum())
    df['_distance'] = df.apply(
        lambda row: row['_distance'] if row['_distance'] >= 0
        else self._calc_geodesic_distance(
            row['datetime'],
            row['decimalLatitude'],
            row['decimalLongitude']
        ),
        axis=1
    )
    return df

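# _calc_geodesic_distance is called above but not shown in this section. A
# minimal sketch under the assumption that it measures how far an occurrence
# lies from the vessel's interpolated position on the same date, using
# geopy's geodesic (as in interpolate()) and the route's get_location_by_date():
def _calc_geodesic_distance(self, date, lat, lon):
    route_location = self.route.get_location_by_date(date)
    if route_location is None:
        # No route position for this date - treat the record as infinitely
        # distant so the MAX_KM_TO_ROUTE filter removes it
        return np.inf
    return geodesic(tuple(route_location), (lat, lon)).kilometers
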
def _process_occurrences_with_inferences(self, df):
    if df.empty:
        return

    has_date_mask = df.datetime.notnull()
    has_location_mask = df['decimalLatitude'].notnull() & df['decimalLongitude'].notnull()

    # Records with both lat/lon and date need no further processing
    with_date_location = df[has_date_mask & has_location_mask]
    logger.info('%s occurrences with both date and location.',
                with_date_location.shape[0])
    self._frames.append(with_date_location)

    # Records with a date but no lat/lon
    date_not_location = df[has_date_mask & ~has_location_mask].copy()
    if not date_not_location.empty:
        date_not_location['_error'] = 'COORDINATES_INFERRED'
        # We're calculating the lat/lon from the date, so distance will always be 0
        date_not_location['_distance'] = 0
        locations = date_not_location['datetime'].apply(
            self._get_location_by_date)
        date_not_location['decimalLatitude'] = locations.apply(
            lambda loc: loc[0] if loc else np.nan)
        date_not_location['decimalLongitude'] = locations.apply(
            lambda loc: loc[1] if loc else np.nan)
        # Make sure we don't keep any rows where no location could be inferred
        date_not_location = date_not_location[
            date_not_location['decimalLatitude'].notna() &
            date_not_location['decimalLongitude'].notna()
        ]
        logger.info('%s occurrences with date and inferred location.',
                    date_not_location.shape[0])
        self._frames.append(date_not_location)

    # Records with a lat/lon but no date
    location_not_date = df[has_location_mask & ~has_date_mask].copy()
    if not location_not_date.empty:
        location_not_date['_error'] = 'RECORDED_DATE_INFERRED'
        location_not_date['datetime'] = location_not_date.apply(
            lambda row: self._get_date_by_location(
                row['decimalLatitude'], row['decimalLongitude']),
            axis=1)
        logger.info('%s occurrences with location and inferred date.',
                    location_not_date.shape[0])
        self._frames.append(location_not_date)

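# _get_date_by_location is called above but not shown in this section. A
# minimal sketch under the assumption that it returns the datetime of the
# interpolated route point closest to the given coordinates (self.route and
# geopy's geodesic are used as in the surrounding code):
def _get_date_by_location(self, lat, lon):
    route = self.route.interpolated  # assumed: datetime index with lat/lon columns
    distances = route.apply(
        lambda row: geodesic((row['lat'], row['lon']), (lat, lon)).kilometers,
        axis=1)
    # idxmin() returns the index label, i.e. the datetime of the nearest point
    return distances.idxmin()
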
def app(limit, vessel_name):
    imma_files = _cli_imma_files(vessel_name)
    counter = 0
    js_voyages_file = APP_DATA_DIR / 'voyages.js'
    js_metadata_file = APP_DATA_DIR / 'metadata.js'
    js_occurrences_file = APP_DATA_DIR / 'occurrences.js'
    map_metadata = {'timestamp': {'min': 0, 'max': 0}}
    new_line = False
    logger.info(f'Exporting to {js_voyages_file}')

    def _update_map_metadata(coordinates):
        min_timestamp = coordinates.timestamp.min()
        max_timestamp = coordinates.timestamp.max()
        if min_timestamp < map_metadata['timestamp']['min']:
            map_metadata['timestamp']['min'] = min_timestamp
        if max_timestamp > map_metadata['timestamp']['max'] \
                or not map_metadata['timestamp']['max']:
            map_metadata['timestamp']['max'] = max_timestamp

    occurrence_count = 0
    with js_voyages_file.open('w') as f:
        f.write('export default [\n')
        occurence_records = {}
        for vessel, route, dwca in _cli_get_routes(vessel_name):
            logger.info(
                f'Exporting {vessel} {route.year_from} - {route.year_to}')
            if new_line:
                f.write(',\n')
            coordinates = route.get_coordinates()
            df = dwca.copy()
            df['datetime'] = pd.to_datetime(df['eventDate'],
                                            infer_datetime_format=True)
            df['timestamp'] = df.datetime.astype('int64') // 10**9
            df.rename(columns={
                'gbifID': 'id',
                'scientificName': 'name',
                'decimalLatitude': 'lat',
                'decimalLongitude': 'lon',
            }, inplace=True)
            # Ensure the occurrences fall within the published time frame
            df = df[(df['datetime'] > route.date_from) &
                    (df['datetime'] <= route.date_to)]
            df = df[['timestamp', 'id', 'name', 'lat', 'lon']]
            df = df.sort_values(by=['timestamp'])
            df['timestamp'] = df['timestamp'].astype(str)
            occurrence_count += df.shape[0]
            occurence_records[vessel] = df.values.tolist()
            _update_map_metadata(coordinates)
            coordinates['timestamp'] = coordinates['timestamp'].astype(str)
            voyage = {
                'coordinates': coordinates.values.tolist(),
                'metadata': {
                    'vessel': vessel,
                    'year_from': route.year_from,
                    'year_to': route.year_to,
                    'count': df.shape[0]
                }
            }
            f.write(json.dumps(voyage))
            new_line = True
            counter += 1
            if limit and counter >= limit:
                break
        f.write('\n]')

    # Give the map tiles time to load before rendering any lines
    dt = datetime.fromtimestamp(map_metadata['timestamp']['min'])
    timestamp_min = datetime.timestamp(dt + relativedelta(months=-24))

    # Output the metadata file
    with js_metadata_file.open('w') as f:
        f.write('export default {\n')
        f.write(f"\tminTimestamp: {timestamp_min},\n")
        f.write(f"\tmaxTimestamp: {map_metadata['timestamp']['max']}")
        f.write('\n}')

    with js_occurrences_file.open('w') as f:
        f.write('export default \n')
        f.write(json.dumps(occurence_records))
        f.write('\n')

    click.secho(f'Occurrence count: {occurrence_count}', fg='green')

def _update_inferred_collectors(self, df):
    for name, count in df['recordedBy'].value_counts().to_dict().items():
        self.inferred_collectors.setdefault(name, 0)
        self.inferred_collectors[name] += count

def _get_occurences_by_collector(self):
    df = self.gbif[self.gbif['recordedBy'].str.contains(
        '|'.join(self.collectors), case=False, na=False)]
    df['_inferred_on'] = 'collector'
    logger.info('Found %s occurrences by collector name.', df.shape[0])
    self._process_occurrences_with_inferences(df)

def _get_occurences_by_inferred_collector(self):
    name_blacklist = ['Anonymous', 'Unknown', 'Unnamed',
                      'Unidentified', 'Anon']
    # Number of times a collector name must have been seen for it to be included
    occurrence_threshold = 50
    collectors = [name for name, count in self.inferred_collectors.items()
                  if count > occurrence_threshold]
    collectors = [extract_surname(c) for c in collectors]
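
# extract_surname is used above but not defined in this section. A minimal
# sketch of the assumed behaviour - pulling the surname out of a
# "Surname, Initials" or "Initials Surname" collector string (the re module
# is assumed to be imported at module level, as in _parse_imma):
def extract_surname(name):
    name = re.sub(r'[^A-Za-z ,.\-]', '', name).strip()
    if ',' in name:
        # "Darwin, C." style - the surname comes first
        return name.split(',')[0].strip()
    # "C. Darwin" style - the surname is the last token
    return name.split()[-1] if name else ''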