Example #1
0
    def _sync_files(self, ftp, ftp_base_dir, expected_remote_files, num_of_expected_remote_files,
                    monitor: Monitor) -> int:
        sync_files_number = 0
        checked_files_number = 0

        files_to_download = OrderedDict()
        file_set_size = 0
        for expected_dir_path, expected_filename_dict in expected_remote_files.items():
            if monitor.is_cancelled():
                raise Cancellation()
            ftp_dir = ftp_base_dir + '/' + expected_dir_path
            try:
                ftp.cwd(ftp_dir)
            except ftplib.Error:
                # Note: If we can't CWD to ftp_dir, this usually means,
                # expected_dir_path may refer to a time range that is not covered remotely.
                monitor.progress(work=1)
                continue

            try:
                remote_dir_content = ftp.mlsd(facts=['type', 'size', 'modify'])
            except ftplib.Error:
                # Note: If we can't MLSD the CWD ftp_dir, we have a problem.
                monitor.progress(work=1)
                continue

            for existing_filename, facts in remote_dir_content:
                if monitor.is_cancelled():
                    raise Cancellation()
                if facts.get('type', None) == 'file' and existing_filename in expected_filename_dict:
                    # update expected_filename_dict with facts of existing_filename
                    expected_filename_dict[existing_filename] = facts
                    file_size = int(facts.get('size', '-1'))
                    if file_size > 0:
                        file_set_size += file_size
                    # TODO (forman, 20160619): put also 'modify' in file_info, to update outdated local files
                    existing_file_info = dict(size=file_size, path=expected_dir_path)
                    files_to_download[existing_filename] = existing_file_info

        last_cwd = None
        if files_to_download:
            dl_stat = _DownloadStatistics(file_set_size)
            for existing_filename, existing_file_info in files_to_download.items():
                checked_files_number += 1
                child_monitor = monitor.child(work=1.)
                if monitor.is_cancelled():
                    raise Cancellation()
                if last_cwd is not existing_file_info['path']:
                    ftp.cwd(ftp_base_dir + '/' + existing_file_info['path'])
                    last_cwd = existing_file_info['path']
                downloader = FtpDownloader(ftp,
                                           existing_filename, existing_file_info, self._file_set_data_store.root_dir,
                                           (checked_files_number, num_of_expected_remote_files), child_monitor,
                                           dl_stat)
                result = downloader.start()
                if DownloadStatus.SUCCESS is result:
                    sync_files_number += 1
        return sync_files_number
Example #2
0
    def _sync_files(self, ftp, ftp_base_dir, expected_remote_files, num_of_expected_remote_files,
                    monitor: Monitor) -> int:
        sync_files_number = 0
        checked_files_number = 0

        files_to_download = OrderedDict()
        file_set_size = 0
        for expected_dir_path, expected_filename_dict in expected_remote_files.items():
            if monitor.is_cancelled():
                raise Cancellation()
            ftp_dir = ftp_base_dir + '/' + expected_dir_path
            try:
                ftp.cwd(ftp_dir)
            except ftplib.Error:
                # Note: If we can't CWD to ftp_dir, this usually means,
                # expected_dir_path may refer to a time range that is not covered remotely.
                monitor.progress(work=1)
                continue

            try:
                remote_dir_content = ftp.mlsd(facts=['type', 'size', 'modify'])
            except ftplib.Error:
                # Note: If we can't MLSD the CWD ftp_dir, we have a problem.
                monitor.progress(work=1)
                continue

            for existing_filename, facts in remote_dir_content:
                if monitor.is_cancelled():
                    raise Cancellation()
                if facts.get('type', None) == 'file' and existing_filename in expected_filename_dict:
                    # update expected_filename_dict with facts of existing_filename
                    expected_filename_dict[existing_filename] = facts
                    file_size = int(facts.get('size', '-1'))
                    if file_size > 0:
                        file_set_size += file_size
                    # TODO (forman, 20160619): put also 'modify' in file_info, to update outdated local files
                    existing_file_info = dict(size=file_size, path=expected_dir_path)
                    files_to_download[existing_filename] = existing_file_info

        last_cwd = None
        if files_to_download:
            dl_stat = _DownloadStatistics(file_set_size)
            for existing_filename, existing_file_info in files_to_download.items():
                checked_files_number += 1
                child_monitor = monitor.child(work=1.)
                if monitor.is_cancelled():
                    raise Cancellation()
                if last_cwd is not existing_file_info['path']:
                    ftp.cwd(ftp_base_dir + '/' + existing_file_info['path'])
                    last_cwd = existing_file_info['path']
                downloader = FtpDownloader(ftp,
                                           existing_filename, existing_file_info, self._file_set_data_store.root_dir,
                                           (checked_files_number, num_of_expected_remote_files), child_monitor,
                                           dl_stat)
                result = downloader.start()
                if DownloadStatus.SUCCESS is result:
                    sync_files_number += 1
        return sync_files_number
Example #3
0
def _do_json_rpc(web_socket, rpc_request: dict, monitor: Monitor) -> dict:
    web_socket.write_message(json.dumps(rpc_request))
    work_reported = None
    started = False
    while True and (monitor is None or not monitor.is_cancelled()):
        response_str = yield web_socket.read_message()
        rpc_response = json.loads(response_str)
        if 'progress' in rpc_response:
            if monitor:
                progress = rpc_response['progress']
                total = progress.get('total')
                label = progress.get('label')
                worked = progress.get('worked')
                msg = progress.get('message')

                if not started:
                    monitor.start(label or "start", total_work=total)
                    started = True

                if started:
                    if worked:
                        if work_reported is None:
                            work_reported = 0.0
                        work = worked - work_reported
                        work_reported = worked
                    else:
                        work = None
                    monitor.progress(work=work, msg=msg)
        else:
            if monitor and started:
                monitor.done()
            return rpc_response

    return {}
Example #4
0
def _do_json_rpc(web_socket, rpc_request: dict, monitor: Monitor) -> dict:
    web_socket.write_message(json.dumps(rpc_request))
    work_reported = None
    started = False
    while True and (monitor is None or not monitor.is_cancelled()):
        response_str = yield web_socket.read_message()
        rpc_response = json.loads(response_str)
        if 'progress' in rpc_response:
            if monitor:
                progress = rpc_response['progress']
                total = progress.get('total')
                label = progress.get('label')
                worked = progress.get('worked')
                msg = progress.get('message')

                if not started:
                    monitor.start(label or "start", total_work=total)
                    started = True

                if started:
                    if worked:
                        if work_reported is None:
                            work_reported = 0.0
                        work = worked - work_reported
                        work_reported = worked
                    else:
                        work = None
                    monitor.progress(work=work, msg=msg)
        else:
            if monitor and started:
                monitor.done()
            return rpc_response

    return {}
Example #5
0
def _fetch_solr_json(base_url,
                     query_args,
                     offset=0,
                     limit=3500,
                     timeout=10,
                     monitor: Monitor = Monitor.NONE):
    """
    Return JSON value read from paginated Solr web-service.
    """
    combined_json_dict = None
    num_found = -1
    # we don't know ahead of time how much request are necessary
    with monitor.starting("Loading", 10):
        while True:
            monitor.progress(work=1)
            if monitor.is_cancelled():
                raise InterruptedError
            paging_query_args = dict(query_args or {})
            paging_query_args.update(offset=offset,
                                     limit=limit,
                                     format='application/solr+json')
            url = base_url + '?' + urllib.parse.urlencode(paging_query_args)
            with urllib.request.urlopen(url, timeout=timeout) as response:
                json_text = response.read()
                json_dict = json.loads(json_text.decode('utf-8'))
                if num_found is -1:
                    num_found = json_dict.get('response',
                                              {}).get('numFound', 0)
                if not combined_json_dict:
                    combined_json_dict = json_dict
                    if num_found < limit:
                        break
                else:
                    docs = json_dict.get('response', {}).get('docs', [])
                    combined_json_dict.get('response', {}).get('docs',
                                                               []).extend(docs)
                    if num_found < offset + limit:
                        break
            offset += limit
    return combined_json_dict
Example #6
0
def data_frame_aggregate(df: DataFrameLike.TYPE,
                         var_names: VarNamesLike.TYPE = None,
                         aggregate_geometry: bool = False,
                         monitor: Monitor = Monitor.NONE) -> pd.DataFrame:
    """
    Aggregate columns into count, mean, median, sum, std, min, and max. Return a
    new (Geo)DataFrame with a single row containing all aggregated values. Specify whether the geometries of
    the GeoDataFrame are to be aggregated. All geometries are merged union-like.

    The return data type will always be the same as the input data type.

    :param df: The (Geo)DataFrame to be analysed
    :param var_names: Variables to be aggregated ('None' uses all aggregatable columns)
    :param aggregate_geometry: Aggregate (union like) the geometry and add it to the resulting GeoDataFrame
    :param monitor: Monitor for progress bar
    :return: returns either DataFrame or GeoDataFrame. Keeps input data type
    """
    vns = VarNamesLike.convert(var_names)

    df_is_geo = isinstance(df, gpd.GeoDataFrame)
    aggregations = ["count", "mean", "median", "sum", "std", "min", "max"]

    # Check var names integrity (aggregatable, exists in data frame)
    types_accepted_for_agg = ['float64', 'int64', 'bool']
    agg_columns = list(df.select_dtypes(include=types_accepted_for_agg).columns)

    if df_is_geo:
        agg_columns.append('geometry')

    columns = list(df.columns)

    if vns is None:
        vns = agg_columns

    diff = list(set(vns) - set(columns))
    if len(diff) > 0:
        raise ValidationError('Variable ' + ','.join(diff) + ' not in data frame!')

    diff = list(set(vns) - set(agg_columns))
    if len(diff) > 0:
        raise ValidationError('Variable(s) ' + ','.join(diff) + ' not aggregatable!')

    try:
        df['geometry']
    except KeyError as e:
        raise ValidationError('Variable geometry not in GEO data frame!') from e

    # Aggregate columns
    if vns is None:
        df_buff = df.select_dtypes(include=types_accepted_for_agg).agg(aggregations)
    else:
        df_buff = df[vns].select_dtypes(include=types_accepted_for_agg).agg(aggregations)

    res = {}
    for n in df_buff.columns:
        for a in aggregations:
            val = df_buff[n][a]
            h = n + '_' + a
            res[h] = [val]

    df_agg = pd.DataFrame(res)

    # Aggregate (union) geometry if GeoDataFrame
    if df_is_geo and aggregate_geometry:
        total_work = 100
        num_work_rows = 1 + len(df) // total_work
        with monitor.starting('Aggregating geometry: ', total_work):
            multi_polygon = shapely.geometry.MultiPolygon()
            i = 0
            for rec in df.geometry:
                if monitor.is_cancelled():
                    break
                # noinspection PyBroadException
                try:
                    multi_polygon = multi_polygon.union(other=rec)
                except Exception:
                    pass

                if i % num_work_rows == 0:
                    monitor.progress(work=1)
                i += 1

        df_agg = gpd.GeoDataFrame(df_agg, geometry=[multi_polygon], crs=df.crs)

    return df_agg
Example #7
0
    def _make_local(self,
                    local_ds: LocalDataSource,
                    time_range: TimeRangeLike.TYPE = None,
                    region: PolygonLike.TYPE = None,
                    var_names: VarNamesLike.TYPE = None,
                    monitor: Monitor = Monitor.NONE):

        # local_name = local_ds.name
        local_id = local_ds.name

        time_range = TimeRangeLike.convert(time_range) if time_range else None
        region = PolygonLike.convert(region) if region else None
        var_names = VarNamesLike.convert(
            var_names) if var_names else None  # type: Sequence

        compression_level = get_config_value('NETCDF_COMPRESSION_LEVEL',
                                             NETCDF_COMPRESSION_LEVEL)
        compression_enabled = True if compression_level > 0 else False

        encoding_update = dict()
        if compression_enabled:
            encoding_update.update({
                'zlib': True,
                'complevel': compression_level
            })

        if region or var_names:
            protocol = _ODP_PROTOCOL_OPENDAP
        else:
            protocol = _ODP_PROTOCOL_HTTP

        local_path = os.path.join(local_ds.data_store.data_store_path,
                                  local_id)
        if not os.path.exists(local_path):
            os.makedirs(local_path)

        selected_file_list = self._find_files(time_range)

        if protocol == _ODP_PROTOCOL_OPENDAP:

            files = self._get_urls_list(selected_file_list, protocol)
            monitor.start('Sync ' + self.name, total_work=len(files))
            for idx, dataset_uri in enumerate(files):
                child_monitor = monitor.child(work=1)

                file_name = os.path.basename(dataset_uri)
                local_filepath = os.path.join(local_path, file_name)

                time_coverage_start = selected_file_list[idx][1]
                time_coverage_end = selected_file_list[idx][2]

                remote_netcdf = None
                local_netcdf = None
                try:
                    remote_netcdf = NetCDF4DataStore(dataset_uri)

                    local_netcdf = NetCDF4DataStore(local_filepath,
                                                    mode='w',
                                                    persist=True)
                    local_netcdf.set_attributes(remote_netcdf.get_attrs())

                    remote_dataset = xr.Dataset.load_store(remote_netcdf)

                    process_region = False
                    if region:
                        geo_lat_min = self._get_harmonized_coordinate_value(
                            remote_dataset.attrs, 'geospatial_lat_min')
                        geo_lat_max = self._get_harmonized_coordinate_value(
                            remote_dataset.attrs, 'geospatial_lat_max')
                        geo_lon_min = self._get_harmonized_coordinate_value(
                            remote_dataset.attrs, 'geospatial_lon_min')
                        geo_lon_max = self._get_harmonized_coordinate_value(
                            remote_dataset.attrs, 'geospatial_lon_max')

                        geo_lat_res = self._get_harmonized_coordinate_value(
                            remote_dataset.attrs, 'geospatial_lon_resolution')
                        geo_lon_res = self._get_harmonized_coordinate_value(
                            remote_dataset.attrs, 'geospatial_lat_resolution')
                        if not (isnan(geo_lat_min) or isnan(geo_lat_max)
                                or isnan(geo_lon_min) or isnan(geo_lon_max)
                                or isnan(geo_lat_res) or isnan(geo_lon_res)):
                            process_region = True

                            [lat_min, lon_min, lat_max,
                             lon_max] = region.bounds

                            lat_min = floor(
                                (lat_min - geo_lat_min) / geo_lat_res)
                            lat_max = ceil(
                                (lat_max - geo_lat_min) / geo_lat_res)
                            lon_min = floor(
                                (lon_min - geo_lon_min) / geo_lon_res)
                            lon_max = ceil(
                                (lon_max - geo_lon_min) / geo_lon_res)

                            # TODO (kbernat): check why dataset.sel fails!
                            remote_dataset = remote_dataset.isel(
                                drop=False,
                                lat=slice(lat_min, lat_max),
                                lon=slice(lon_min, lon_max))

                            geo_lat_max = lat_max * geo_lat_res + geo_lat_min
                            geo_lat_min += lat_min * geo_lat_res
                            geo_lon_max = lon_max * geo_lon_res + geo_lon_min
                            geo_lon_min += lon_min * geo_lon_res

                    if not var_names:
                        var_names = [
                            var_name
                            for var_name in remote_netcdf.variables.keys()
                        ]
                    var_names.extend([
                        coord_name
                        for coord_name in remote_dataset.coords.keys()
                        if coord_name not in var_names
                    ])
                    child_monitor.start(label=file_name,
                                        total_work=len(var_names))
                    for sel_var_name in var_names:
                        var_dataset = remote_dataset.drop([
                            var_name
                            for var_name in remote_dataset.variables.keys()
                            if var_name != sel_var_name
                        ])
                        if compression_enabled:
                            var_dataset.variables.get(
                                sel_var_name).encoding.update(encoding_update)
                        local_netcdf.store_dataset(var_dataset)
                        child_monitor.progress(work=1, msg=sel_var_name)
                    if process_region:
                        local_netcdf.set_attribute('geospatial_lat_min',
                                                   geo_lat_min)
                        local_netcdf.set_attribute('geospatial_lat_max',
                                                   geo_lat_max)
                        local_netcdf.set_attribute('geospatial_lon_min',
                                                   geo_lon_min)
                        local_netcdf.set_attribute('geospatial_lon_max',
                                                   geo_lon_max)

                finally:
                    if remote_netcdf:
                        remote_netcdf.close()
                    if local_netcdf:
                        local_netcdf.close()
                        local_ds.add_dataset(
                            os.path.join(local_id, file_name),
                            (time_coverage_start, time_coverage_end))

                child_monitor.done()
        else:
            outdated_file_list = []
            for file_rec in selected_file_list:
                filename, _, _, file_size, url = file_rec
                dataset_file = os.path.join(local_path, filename)
                # todo (forman, 20160915): must perform better checks on dataset_file if it is...
                # ... outdated or incomplete or corrupted.
                # JSON also includes "checksum" and "checksum_type" fields.
                if not os.path.isfile(dataset_file) or (
                        file_size
                        and os.path.getsize(dataset_file) != file_size):
                    outdated_file_list.append(file_rec)

            if outdated_file_list:
                with monitor.starting('Sync ' + self.name,
                                      len(outdated_file_list)):
                    bytes_to_download = sum(
                        [file_rec[3] for file_rec in outdated_file_list])
                    dl_stat = _DownloadStatistics(bytes_to_download)

                    file_number = 1

                    for filename, coverage_from, coverage_to, file_size, url in outdated_file_list:
                        if monitor.is_cancelled():
                            raise InterruptedError
                        dataset_file = os.path.join(local_path, filename)
                        sub_monitor = monitor.child(work=1.0)

                        # noinspection PyUnusedLocal
                        def reporthook(block_number, read_size,
                                       total_file_size):
                            dl_stat.handle_chunk(read_size)
                            if monitor.is_cancelled():
                                raise InterruptedError
                            sub_monitor.progress(work=read_size,
                                                 msg=str(dl_stat))

                        sub_monitor_msg = "file %d of %d" % (
                            file_number, len(outdated_file_list))
                        with sub_monitor.starting(sub_monitor_msg, file_size):
                            urllib.request.urlretrieve(url[protocol],
                                                       filename=dataset_file,
                                                       reporthook=reporthook)
                        file_number += 1
                        local_ds.add_dataset(os.path.join(local_id, filename),
                                             (coverage_from, coverage_to))
        local_ds.save()
        monitor.done()
Example #8
0
def data_frame_aggregate(df: DataFrameLike.TYPE,
                         var_names: VarNamesLike.TYPE = None,
                         aggregate_geometry: bool = False,
                         monitor: Monitor = Monitor.NONE) -> pd.DataFrame:
    """
    Aggregate columns into count, mean, median, sum, std, min, and max. Return a
    new (Geo)DataFrame with a single row containing all aggregated values. Specify whether the geometries of
    the GeoDataFrame are to be aggregated. All geometries are merged union-like.

    The return data type will always be the same as the input data type.

    :param df: The (Geo)DataFrame to be analysed
    :param var_names: Variables to be aggregated ('None' uses all aggregatable columns)
    :param aggregate_geometry: Aggregate (union like) the geometry and add it to the resulting GeoDataFrame
    :param monitor: Monitor for progress bar
    :return: returns either DataFrame or GeoDataFrame. Keeps input data type
    """
    vns = VarNamesLike.convert(var_names)

    df_is_geo = isinstance(df, gpd.GeoDataFrame)
    aggregations = ["count", "mean", "median", "sum", "std", "min", "max"]

    # Check var names integrity (aggregatable, exists in data frame)
    types_accepted_for_agg = ['float64', 'int64', 'bool']
    agg_columns = list(
        df.select_dtypes(include=types_accepted_for_agg).columns)

    if df_is_geo:
        agg_columns.append('geometry')

    columns = list(df.columns)

    if vns is None:
        vns = agg_columns

    diff = list(set(vns) - set(columns))
    if len(diff) > 0:
        raise ValidationError('Variable ' + ','.join(diff) +
                              ' not in data frame!')

    diff = list(set(vns) - set(agg_columns))
    if len(diff) > 0:
        raise ValidationError('Variable(s) ' + ','.join(diff) +
                              ' not aggregatable!')

    try:
        df['geometry']
    except KeyError as e:
        raise ValidationError(
            'Variable geometry not in GEO data frame!') from e

    # Aggregate columns
    if vns is None:
        df_buff = df.select_dtypes(
            include=types_accepted_for_agg).agg(aggregations)
    else:
        df_buff = df[vns].select_dtypes(
            include=types_accepted_for_agg).agg(aggregations)

    res = {}
    for n in df_buff.columns:
        for a in aggregations:
            val = df_buff[n][a]
            h = n + '_' + a
            res[h] = [val]

    df_agg = pd.DataFrame(res)

    # Aggregate (union) geometry if GeoDataFrame
    if df_is_geo and aggregate_geometry:
        total_work = 100
        num_work_rows = 1 + len(df) // total_work
        with monitor.starting('Aggregating geometry: ', total_work):
            multi_polygon = shapely.geometry.MultiPolygon()
            i = 0
            for rec in df.geometry:
                if monitor.is_cancelled():
                    break
                # noinspection PyBroadException
                try:
                    multi_polygon = multi_polygon.union(other=rec)
                except Exception:
                    pass

                if i % num_work_rows == 0:
                    monitor.progress(work=1)
                i += 1

        df_agg = gpd.GeoDataFrame(df_agg, geometry=[multi_polygon], crs=df.crs)

    return df_agg