def _sync_files(self, ftp, ftp_base_dir, expected_remote_files, num_of_expected_remote_files,
                monitor: Monitor) -> int:
    sync_files_number = 0
    checked_files_number = 0

    files_to_download = OrderedDict()
    file_set_size = 0

    for expected_dir_path, expected_filename_dict in expected_remote_files.items():
        if monitor.is_cancelled():
            raise Cancellation()
        ftp_dir = ftp_base_dir + '/' + expected_dir_path
        try:
            ftp.cwd(ftp_dir)
        except ftplib.Error:
            # Note: If we can't CWD to ftp_dir, this usually means that
            # expected_dir_path refers to a time range that is not covered remotely.
            monitor.progress(work=1)
            continue

        try:
            remote_dir_content = ftp.mlsd(facts=['type', 'size', 'modify'])
        except ftplib.Error:
            # Note: If we can't MLSD the CWD ftp_dir, we have a problem.
            monitor.progress(work=1)
            continue

        for existing_filename, facts in remote_dir_content:
            if monitor.is_cancelled():
                raise Cancellation()
            if facts.get('type', None) == 'file' and existing_filename in expected_filename_dict:
                # update expected_filename_dict with facts of existing_filename
                expected_filename_dict[existing_filename] = facts
                file_size = int(facts.get('size', '-1'))
                if file_size > 0:
                    file_set_size += file_size
                # TODO (forman, 20160619): put also 'modify' in file_info, to update outdated local files
                existing_file_info = dict(size=file_size, path=expected_dir_path)
                files_to_download[existing_filename] = existing_file_info

    last_cwd = None
    if files_to_download:
        dl_stat = _DownloadStatistics(file_set_size)
        for existing_filename, existing_file_info in files_to_download.items():
            checked_files_number += 1
            child_monitor = monitor.child(work=1.)
            if monitor.is_cancelled():
                raise Cancellation()
            if last_cwd != existing_file_info['path']:
                ftp.cwd(ftp_base_dir + '/' + existing_file_info['path'])
                last_cwd = existing_file_info['path']
            downloader = FtpDownloader(ftp,
                                       existing_filename,
                                       existing_file_info,
                                       self._file_set_data_store.root_dir,
                                       (checked_files_number, num_of_expected_remote_files),
                                       child_monitor,
                                       dl_stat)
            result = downloader.start()
            if result is DownloadStatus.SUCCESS:
                sync_files_number += 1
    return sync_files_number
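# Illustrative sketch (not from the original module; directory and file names are
# hypothetical): the shape of the `expected_remote_files` mapping consumed by
# _sync_files() above, inferred from how it is iterated. Keys are remote
# sub-directory paths relative to ftp_base_dir; values map expected file names to
# their MLSD facts, which are filled in during the scan (hence None initially).
from collections import OrderedDict

_example_expected_remote_files = OrderedDict([
    ('2007/01', {'ESACCI-EXAMPLE-20070101.nc': None,
                 'ESACCI-EXAMPLE-20070102.nc': None}),
    ('2007/02', {'ESACCI-EXAMPLE-20070201.nc': None}),
])
# _sync_files() would CWD to '<ftp_base_dir>/2007/01', MLSD the directory, and
# queue every entry of type 'file' whose name appears in the inner dict.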
def _do_json_rpc(web_socket, rpc_request: dict, monitor: Monitor) -> dict:
    web_socket.write_message(json.dumps(rpc_request))
    work_reported = None
    started = False
    while monitor is None or not monitor.is_cancelled():
        response_str = yield web_socket.read_message()
        rpc_response = json.loads(response_str)
        if 'progress' in rpc_response:
            if monitor:
                progress = rpc_response['progress']
                total = progress.get('total')
                label = progress.get('label')
                worked = progress.get('worked')
                msg = progress.get('message')
                if not started:
                    monitor.start(label or "start", total_work=total)
                    started = True
                if started:
                    if worked:
                        if work_reported is None:
                            work_reported = 0.0
                        work = worked - work_reported
                        work_reported = worked
                    else:
                        work = None
                    monitor.progress(work=work, msg=msg)
        else:
            if monitor and started:
                monitor.done()
            return rpc_response
    return {}
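# Illustrative sketch (assumption, not taken from the original source): the two
# kinds of messages _do_json_rpc() above distinguishes. A message containing a
# 'progress' object is forwarded to the Monitor and the loop keeps reading; the
# first message without 'progress' is treated as the final RPC response and is
# returned. All field names and values below are hypothetical.
_example_progress_message = {
    'progress': {'label': 'Loading data', 'total': 100, 'worked': 25,
                 'message': 'reading chunk 1 of 4'}
}
_example_final_response = {
    'id': 1, 'response': {'status': 'ok'}
}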
def _fetch_solr_json(base_url, query_args, offset=0, limit=3500, timeout=10,
                     monitor: Monitor = Monitor.NONE):
    """
    Return JSON value read from paginated Solr web-service.
    """
    combined_json_dict = None
    num_found = -1
    # we don't know ahead of time how many requests are necessary
    with monitor.starting("Loading", 10):
        while True:
            monitor.progress(work=1)
            if monitor.is_cancelled():
                raise InterruptedError
            paging_query_args = dict(query_args or {})
            paging_query_args.update(offset=offset, limit=limit,
                                     format='application/solr+json')
            url = base_url + '?' + urllib.parse.urlencode(paging_query_args)
            with urllib.request.urlopen(url, timeout=timeout) as response:
                json_text = response.read()
                json_dict = json.loads(json_text.decode('utf-8'))
                if num_found == -1:
                    num_found = json_dict.get('response', {}).get('numFound', 0)
                if not combined_json_dict:
                    combined_json_dict = json_dict
                    if num_found < limit:
                        break
                else:
                    docs = json_dict.get('response', {}).get('docs', [])
                    combined_json_dict.get('response', {}).get('docs', []).extend(docs)
                    if num_found < offset + limit:
                        break
            offset += limit
    return combined_json_dict
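# Illustrative sketch (hypothetical endpoint and query fields, not part of the
# original module): the paging URLs _fetch_solr_json() above would generate for
# successive requests. Paging stops once the server-reported 'numFound' drops
# below offset + limit, and each later page's response['docs'] list is merged
# into the JSON dict of the first page.
def _example_solr_paging_urls():
    import urllib.parse
    base_url = 'https://solr.example.org/esg-search/search'
    query_args = {'query': 'ESACCI', 'type': 'Dataset'}
    limit = 100
    return [base_url + '?' + urllib.parse.urlencode(dict(query_args,
                                                         offset=offset,
                                                         limit=limit,
                                                         format='application/solr+json'))
            for offset in (0, 100, 200)]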
def data_frame_aggregate(df: DataFrameLike.TYPE,
                         var_names: VarNamesLike.TYPE = None,
                         aggregate_geometry: bool = False,
                         monitor: Monitor = Monitor.NONE) -> pd.DataFrame:
    """
    Aggregate columns into count, mean, median, sum, std, min, and max. Return a
    new (Geo)DataFrame with a single row containing all aggregated values.
    Specify whether the geometries of the GeoDataFrame are to be aggregated.
    All geometries are merged union-like.

    The return data type will always be the same as the input data type.

    :param df: The (Geo)DataFrame to be analysed
    :param var_names: Variables to be aggregated ('None' uses all aggregatable columns)
    :param aggregate_geometry: Aggregate (union like) the geometry and add it to the resulting GeoDataFrame
    :param monitor: Monitor for progress bar
    :return: returns either DataFrame or GeoDataFrame. Keeps input data type
    """
    vns = VarNamesLike.convert(var_names)

    df_is_geo = isinstance(df, gpd.GeoDataFrame)
    aggregations = ["count", "mean", "median", "sum", "std", "min", "max"]

    # Check var names integrity (aggregatable, exists in data frame)
    types_accepted_for_agg = ['float64', 'int64', 'bool']
    agg_columns = list(df.select_dtypes(include=types_accepted_for_agg).columns)
    if df_is_geo:
        agg_columns.append('geometry')

    columns = list(df.columns)

    if vns is None:
        vns = agg_columns

    diff = list(set(vns) - set(columns))
    if len(diff) > 0:
        raise ValidationError('Variable ' + ','.join(diff) + ' not in data frame!')

    diff = list(set(vns) - set(agg_columns))
    if len(diff) > 0:
        raise ValidationError('Variable(s) ' + ','.join(diff) + ' not aggregatable!')

    if df_is_geo:
        try:
            df['geometry']
        except KeyError as e:
            raise ValidationError('Variable geometry not in GEO data frame!') from e

    # Aggregate columns
    if vns is None:
        df_buff = df.select_dtypes(include=types_accepted_for_agg).agg(aggregations)
    else:
        df_buff = df[vns].select_dtypes(include=types_accepted_for_agg).agg(aggregations)

    res = {}
    for n in df_buff.columns:
        for a in aggregations:
            val = df_buff[n][a]
            h = n + '_' + a
            res[h] = [val]

    df_agg = pd.DataFrame(res)

    # Aggregate (union) geometry if GeoDataFrame
    if df_is_geo and aggregate_geometry:
        total_work = 100
        num_work_rows = 1 + len(df) // total_work
        with monitor.starting('Aggregating geometry: ', total_work):
            multi_polygon = shapely.geometry.MultiPolygon()
            i = 0
            for rec in df.geometry:
                if monitor.is_cancelled():
                    break
                # noinspection PyBroadException
                try:
                    multi_polygon = multi_polygon.union(other=rec)
                except Exception:
                    pass
                if i % num_work_rows == 0:
                    monitor.progress(work=1)
                i += 1

        df_agg = gpd.GeoDataFrame(df_agg, geometry=[multi_polygon], crs=df.crs)

    return df_agg
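# Illustrative usage sketch (assumption: this module and its geopandas/shapely
# dependencies are importable; the column name 'temperature' and the point
# coordinates are made up). It aggregates one numeric column of a small
# GeoDataFrame with data_frame_aggregate() above; geometries are left untouched
# because aggregate_geometry defaults to False.
def _example_data_frame_aggregate():
    from shapely.geometry import Point
    gdf = gpd.GeoDataFrame({'temperature': [280.1, 281.4, 279.8]},
                           geometry=[Point(10.0, 50.0), Point(10.5, 50.5), Point(11.0, 51.0)])
    summary = data_frame_aggregate(gdf, var_names=['temperature'])
    # summary has a single row with columns 'temperature_count', 'temperature_mean',
    # 'temperature_median', 'temperature_sum', 'temperature_std', 'temperature_min',
    # and 'temperature_max'.
    return summary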
def _make_local(self,
                local_ds: LocalDataSource,
                time_range: TimeRangeLike.TYPE = None,
                region: PolygonLike.TYPE = None,
                var_names: VarNamesLike.TYPE = None,
                monitor: Monitor = Monitor.NONE):

    # local_name = local_ds.name
    local_id = local_ds.name

    time_range = TimeRangeLike.convert(time_range) if time_range else None
    region = PolygonLike.convert(region) if region else None
    var_names = VarNamesLike.convert(var_names) if var_names else None  # type: Sequence

    compression_level = get_config_value('NETCDF_COMPRESSION_LEVEL', NETCDF_COMPRESSION_LEVEL)
    compression_enabled = True if compression_level > 0 else False

    encoding_update = dict()
    if compression_enabled:
        encoding_update.update({'zlib': True, 'complevel': compression_level})

    if region or var_names:
        protocol = _ODP_PROTOCOL_OPENDAP
    else:
        protocol = _ODP_PROTOCOL_HTTP

    local_path = os.path.join(local_ds.data_store.data_store_path, local_id)
    if not os.path.exists(local_path):
        os.makedirs(local_path)

    selected_file_list = self._find_files(time_range)

    if protocol == _ODP_PROTOCOL_OPENDAP:
        files = self._get_urls_list(selected_file_list, protocol)
        monitor.start('Sync ' + self.name, total_work=len(files))
        for idx, dataset_uri in enumerate(files):
            child_monitor = monitor.child(work=1)

            file_name = os.path.basename(dataset_uri)
            local_filepath = os.path.join(local_path, file_name)

            time_coverage_start = selected_file_list[idx][1]
            time_coverage_end = selected_file_list[idx][2]

            remote_netcdf = None
            local_netcdf = None
            try:
                remote_netcdf = NetCDF4DataStore(dataset_uri)

                local_netcdf = NetCDF4DataStore(local_filepath, mode='w', persist=True)
                local_netcdf.set_attributes(remote_netcdf.get_attrs())

                remote_dataset = xr.Dataset.load_store(remote_netcdf)

                process_region = False
                if region:
                    geo_lat_min = self._get_harmonized_coordinate_value(remote_dataset.attrs,
                                                                        'geospatial_lat_min')
                    geo_lat_max = self._get_harmonized_coordinate_value(remote_dataset.attrs,
                                                                        'geospatial_lat_max')
                    geo_lon_min = self._get_harmonized_coordinate_value(remote_dataset.attrs,
                                                                        'geospatial_lon_min')
                    geo_lon_max = self._get_harmonized_coordinate_value(remote_dataset.attrs,
                                                                        'geospatial_lon_max')

                    geo_lat_res = self._get_harmonized_coordinate_value(remote_dataset.attrs,
                                                                        'geospatial_lon_resolution')
                    geo_lon_res = self._get_harmonized_coordinate_value(remote_dataset.attrs,
                                                                        'geospatial_lat_resolution')
                    if not (isnan(geo_lat_min) or isnan(geo_lat_max)
                            or isnan(geo_lon_min) or isnan(geo_lon_max)
                            or isnan(geo_lat_res) or isnan(geo_lon_res)):
                        process_region = True

                        [lat_min, lon_min, lat_max, lon_max] = region.bounds

                        lat_min = floor((lat_min - geo_lat_min) / geo_lat_res)
                        lat_max = ceil((lat_max - geo_lat_min) / geo_lat_res)
                        lon_min = floor((lon_min - geo_lon_min) / geo_lon_res)
                        lon_max = ceil((lon_max - geo_lon_min) / geo_lon_res)

                        # TODO (kbernat): check why dataset.sel fails!
                        remote_dataset = remote_dataset.isel(drop=False,
                                                             lat=slice(lat_min, lat_max),
                                                             lon=slice(lon_min, lon_max))

                        geo_lat_max = lat_max * geo_lat_res + geo_lat_min
                        geo_lat_min += lat_min * geo_lat_res
                        geo_lon_max = lon_max * geo_lon_res + geo_lon_min
                        geo_lon_min += lon_min * geo_lon_res

                if not var_names:
                    var_names = [var_name for var_name in remote_netcdf.variables.keys()]
                var_names.extend([coord_name for coord_name in remote_dataset.coords.keys()
                                  if coord_name not in var_names])

                child_monitor.start(label=file_name, total_work=len(var_names))
                for sel_var_name in var_names:
                    var_dataset = remote_dataset.drop(
                        [var_name for var_name in remote_dataset.variables.keys()
                         if var_name != sel_var_name])
                    if compression_enabled:
                        var_dataset.variables.get(sel_var_name).encoding.update(encoding_update)
                    local_netcdf.store_dataset(var_dataset)
                    child_monitor.progress(work=1, msg=sel_var_name)

                if process_region:
                    local_netcdf.set_attribute('geospatial_lat_min', geo_lat_min)
                    local_netcdf.set_attribute('geospatial_lat_max', geo_lat_max)
                    local_netcdf.set_attribute('geospatial_lon_min', geo_lon_min)
                    local_netcdf.set_attribute('geospatial_lon_max', geo_lon_max)
            finally:
                if remote_netcdf:
                    remote_netcdf.close()
                if local_netcdf:
                    local_netcdf.close()

            local_ds.add_dataset(os.path.join(local_id, file_name),
                                 (time_coverage_start, time_coverage_end))
            child_monitor.done()
    else:
        outdated_file_list = []
        for file_rec in selected_file_list:
            filename, _, _, file_size, url = file_rec
            dataset_file = os.path.join(local_path, filename)
            # todo (forman, 20160915): must perform better checks on dataset_file if it is...
            # ... outdated or incomplete or corrupted.
            # JSON also includes "checksum" and "checksum_type" fields.
            if not os.path.isfile(dataset_file) or (
                    file_size and os.path.getsize(dataset_file) != file_size):
                outdated_file_list.append(file_rec)

        if outdated_file_list:
            with monitor.starting('Sync ' + self.name, len(outdated_file_list)):
                bytes_to_download = sum([file_rec[3] for file_rec in outdated_file_list])
                dl_stat = _DownloadStatistics(bytes_to_download)

                file_number = 1
                for filename, coverage_from, coverage_to, file_size, url in outdated_file_list:
                    if monitor.is_cancelled():
                        raise InterruptedError
                    dataset_file = os.path.join(local_path, filename)
                    sub_monitor = monitor.child(work=1.0)

                    # noinspection PyUnusedLocal
                    def reporthook(block_number, read_size, total_file_size):
                        dl_stat.handle_chunk(read_size)
                        if monitor.is_cancelled():
                            raise InterruptedError
                        sub_monitor.progress(work=read_size, msg=str(dl_stat))

                    sub_monitor_msg = "file %d of %d" % (file_number, len(outdated_file_list))
                    with sub_monitor.starting(sub_monitor_msg, file_size):
                        urllib.request.urlretrieve(url[protocol],
                                                   filename=dataset_file,
                                                   reporthook=reporthook)
                    file_number += 1
                    local_ds.add_dataset(os.path.join(local_id, filename),
                                         (coverage_from, coverage_to))

    local_ds.save()
    monitor.done()
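# Illustrative sketch (all numbers are made up): the bounds-to-index arithmetic
# used in the OPeNDAP branch of _make_local() above. Requested geographic bounds
# are converted to integer cell indices with floor/ceil, and the effective
# geographic bounds are then recomputed from those indices so that the rewritten
# geospatial_* attributes match the actual isel() slice.
def _example_region_to_index_slice():
    from math import ceil, floor
    geo_lat_min, geo_lat_res = -90.0, 0.5   # grid origin and cell size (assumed)
    lat_lo, lat_hi = 35.2, 47.9             # requested latitude bounds (assumed)

    i_lo = floor((lat_lo - geo_lat_min) / geo_lat_res)   # -> 250
    i_hi = ceil((lat_hi - geo_lat_min) / geo_lat_res)    # -> 276

    effective_lat_min = geo_lat_min + i_lo * geo_lat_res  # -> 35.0
    effective_lat_max = geo_lat_min + i_hi * geo_lat_res  # -> 48.0
    return slice(i_lo, i_hi), (effective_lat_min, effective_lat_max)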