Code example #1
def run_timeseries2time_func(inps):

    # basic info
    atr = readfile.read_attribute(inps.timeseries_file)
    length, width = int(atr['LENGTH']), int(atr['WIDTH'])
    num_date = inps.numDate
    dates = np.array(inps.dateList)
    seconds = atr.get('CENTER_LINE_UTC', 0)

    # use the 1st date as reference if not found, e.g. timeseriesResidual.h5 file
    if "REF_DATE" not in atr.keys() and not inps.ref_date:
        inps.ref_date = inps.dateList[0]
        print(
            'WARNING: No REF_DATE found in the time-series file or given via the command line.'
        )
        print('  Set "--ref-date {}" and continue.'.format(inps.dateList[0]))

    # get deformation model from parsers
    model, num_param = read_inps2model(inps)

    ## output preparation

    # time_func_param: attributes
    atrV = dict(atr)
    atrV['FILE_TYPE'] = 'velocity'
    atrV['UNIT'] = 'm/year'
    atrV['START_DATE'] = inps.dateList[0]
    atrV['END_DATE'] = inps.dateList[-1]
    atrV['DATE12'] = '{}_{}'.format(inps.dateList[0], inps.dateList[-1])
    if inps.ref_yx:
        atrV['REF_Y'] = inps.ref_yx[0]
        atrV['REF_X'] = inps.ref_yx[1]
    if inps.ref_date:
        atrV['REF_DATE'] = inps.ref_date

    # time_func_param: config parameter
    print('add/update the following configuration metadata:\n{}'.format(
        configKeys))
    for key in configKeys:
        atrV[key_prefix + key] = str(vars(inps)[key])

    # time_func_param: instantiate output file
    ds_name_dict, ds_unit_dict = model2hdf5_dataset(model,
                                                    ds_shape=(length,
                                                              width))[1:]
    writefile.layout_hdf5(inps.outfile,
                          metadata=atrV,
                          ds_name_dict=ds_name_dict,
                          ds_unit_dict=ds_unit_dict)

    # timeseries_res: attributes + instantiate output file
    if inps.save_res:
        atrR = dict(atr)
        for key in ['REF_DATE']:
            if key in atrR.keys():
                atrR.pop(key)
        writefile.layout_hdf5(inps.res_file,
                              metadata=atrR,
                              ref_file=inps.timeseries_file)

    ## estimation

    # calc number of box based on memory limit
    memoryAll = (num_date + num_param * 2 + 2) * length * width * 4
    if inps.bootstrap:
        memoryAll += inps.bootstrapCount * num_param * length * width * 4
    num_box = int(np.ceil(memoryAll * 3 / (inps.maxMemory * 1024**3)))
    box_list = cluster.split_box2sub_boxes(box=(0, 0, width, length),
                                           num_split=num_box,
                                           dimension='y',
                                           print_msg=True)

    # loop for block-by-block IO
    for i, box in enumerate(box_list):
        box_wid = box[2] - box[0]
        box_len = box[3] - box[1]
        num_pixel = box_len * box_wid
        if num_box > 1:
            print('\n------- processing patch {} out of {} --------------'.
                  format(i + 1, num_box))
            print('box width:  {}'.format(box_wid))
            print('box length: {}'.format(box_len))

        # initiate output
        m = np.zeros((num_param, num_pixel), dtype=dataType)
        m_std = np.zeros((num_param, num_pixel), dtype=dataType)

        # read input
        print('reading data from file {} ...'.format(inps.timeseries_file))
        ts_data = readfile.read(inps.timeseries_file, box=box)[0]

        # referencing in time and space
        # for file w/o reference info. e.g. ERA5.h5
        if inps.ref_date:
            print('referencing to date: {}'.format(inps.ref_date))
            ref_ind = inps.dateList.index(inps.ref_date)
            ts_data -= np.tile(ts_data[ref_ind, :, :],
                               (ts_data.shape[0], 1, 1))

        if inps.ref_yx:
            print('referencing to point (y, x): ({}, {})'.format(
                inps.ref_yx[0], inps.ref_yx[1]))
            ref_box = (inps.ref_yx[1], inps.ref_yx[0], inps.ref_yx[1] + 1,
                       inps.ref_yx[0] + 1)
            ref_val = readfile.read(inps.timeseries_file, box=ref_box)[0]
            ts_data -= np.tile(ref_val.reshape(ts_data.shape[0], 1, 1),
                               (1, ts_data.shape[1], ts_data.shape[2]))

        ts_data = ts_data[inps.dropDate, :, :].reshape(inps.numDate, -1)
        if atrV['UNIT'] == 'mm':
            ts_data *= 1. / 1000.

        ts_std = None
        if inps.ts_std_file:
            ts_std = readfile.read(inps.ts_std_file, box=box)[0]
            ts_std = ts_std[inps.dropDate, :, :].reshape(inps.numDate, -1)
            # set zero values to a fixed small value to avoid divide-by-zero
            epsilon = 1e-5
            ts_std[ts_std < epsilon] = epsilon

        # mask invalid pixels
        print('skip pixels with zero/nan value in all acquisitions')
        ts_stack = np.nanmean(ts_data, axis=0)
        mask = np.multiply(~np.isnan(ts_stack), ts_stack != 0.)
        del ts_stack

        if ts_std is not None:
            print('skip pixels with nan STD value in any acquisition')
            num_std_nan = np.sum(np.isnan(ts_std), axis=0)
            mask *= num_std_nan == 0
            del num_std_nan

        ts_data = ts_data[:, mask]
        num_pixel2inv = int(np.sum(mask))
        idx_pixel2inv = np.where(mask)[0]
        print('number of pixels to invert: {} out of {} ({:.1f}%)'.format(
            num_pixel2inv, num_pixel, num_pixel2inv / num_pixel * 100))

        # go to next if no valid pixel found
        if num_pixel2inv == 0:
            continue

        ### estimation / solve Gm = d
        print('estimating time functions via linalg.lstsq ...')

        if inps.bootstrap:
            ## option 1 - least squares with bootstrapping
            # Bootstrapping is a resampling method which can be used to estimate properties
            # of an estimator. The method relies on independently sampling the data set with
            # replacement.
            print(
                'estimating time function STD with bootstrap resampling ({} times) ...'
                .format(inps.bootstrapCount))

            # calc model of all bootstrap sampling
            rng = np.random.default_rng()
            m_boot = np.zeros((inps.bootstrapCount, num_param, num_pixel2inv),
                              dtype=dataType)
            prog_bar = ptime.progressBar(maxValue=inps.bootstrapCount)
            for i in range(inps.bootstrapCount):
                # bootstrap resampling
                boot_ind = rng.choice(inps.numDate,
                                      size=inps.numDate,
                                      replace=True)
                boot_ind.sort()

                # estimation
                m_boot[i] = time_func.estimate_time_func(
                    model=model,
                    date_list=dates[boot_ind].tolist(),
                    dis_ts=ts_data[boot_ind],
                    seconds=seconds)[1]

                prog_bar.update(i + 1,
                                suffix='iteration {} / {}'.format(
                                    i + 1, inps.bootstrapCount))
            prog_bar.close()
            #del ts_data

            # get mean/std among all bootstrap sampling
            m[:, mask] = m_boot.mean(axis=0).reshape(num_param, -1)
            m_std[:, mask] = m_boot.std(axis=0).reshape(num_param, -1)
            del m_boot

        else:
            ## option 2 - least squares with uncertainty propagation
            G, m[:, mask], e2 = time_func.estimate_time_func(
                model=model,
                date_list=inps.dateList,
                dis_ts=ts_data,
                seconds=seconds)
            #del ts_data

            ## Compute the covariance matrix for model parameters: Gm = d
            # C_m_hat = (G.T * C_d^-1 * G)^-1  # linear propagation from the TS covariance matrix. (option 2.1)
            #         = sigma^2 * (G.T * G)^-1  # assuming obs errors are normally dist. in time.   (option 2.2a)
            # Based on the law of integrated expectation, we estimate the obs sigma^2 using
            # the OLS estimation residual e_hat_i = d_i - d_hat_i
            # sigma^2 = sigma_hat^2 * N / (N - P)                                                   (option 2.2b)
            #         = (e_hat.T * e_hat) / (N - P)  # sigma_hat^2 = (e_hat.T * e_hat) / N

            if ts_std is not None:
                # option 2.1 - linear propagation from time-series covariance matrix
                print(
                    'estimating time function STD from time-series STD pixel-by-pixel ...'
                )
                prog_bar = ptime.progressBar(maxValue=num_pixel2inv)
                for i in range(num_pixel2inv):
                    idx = idx_pixel2inv[i]

                    try:
                        C_ts_inv = np.diag(1. /
                                           np.square(ts_std[:, idx].flatten()))
                        m_var = np.diag(linalg.inv(
                            G.T.dot(C_ts_inv).dot(G))).astype(np.float32)
                        m_std[:, idx] = np.sqrt(m_var)
                    except linalg.LinAlgError:
                        m_std[:, idx] = np.nan

                    prog_bar.update(i + 1,
                                    every=200,
                                    suffix='{}/{} pixels'.format(
                                        i + 1, num_pixel2inv))
                prog_bar.close()

            else:
                # option 2.2a - assume obs errors following normal dist. in time
                print(
                    'estimating time function STD from time-series fitting residual ...'
                )
                G_inv = linalg.inv(np.dot(G.T, G))
                m_var = e2.reshape(1, -1) / (num_date - num_param)
                m_std[:, mask] = np.sqrt(
                    np.dot(np.diag(G_inv).reshape(-1, 1), m_var))

                # option 2.2b - simplified form for linear velocity (without matrix linear algebra)
                # The STD can also be calculated using Eq. (10) from Fattahi and Amelung (2015, JGR)
                # ts_diff = ts_data - np.dot(G, m)
                # t_diff = G[:, 1] - np.mean(G[:, 1])
                # vel_std = np.sqrt(np.sum(ts_diff ** 2, axis=0) / np.sum(t_diff ** 2)  / (num_date - 2))

        # write - time func params
        block = [box[1], box[3], box[0], box[2]]
        ds_dict = model2hdf5_dataset(model, m, m_std, mask=mask)[0]
        for ds_name, data in ds_dict.items():
            writefile.write_hdf5_block(inps.outfile,
                                       data=data.reshape(box_len, box_wid),
                                       datasetName=ds_name,
                                       block=block)

        # write - residual file
        if inps.save_res:
            block = [0, num_date, box[1], box[3], box[0], box[2]]
            ts_res = np.ones(
                (num_date, box_len * box_wid), dtype=np.float32) * np.nan
            ts_res[:, mask] = ts_data - np.dot(G, m)[:, mask]
            writefile.write_hdf5_block(inps.res_file,
                                       data=ts_res.reshape(
                                           num_date, box_len, box_wid),
                                       datasetName='timeseries',
                                       block=block)

    return inps.outfile
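
As a quick illustration of option 2.2a above (sigma^2 = e_hat.T * e_hat / (N - P), C_m_hat = sigma^2 * (G.T * G)^-1), here is a minimal, self-contained NumPy sketch on synthetic data; the variable names and numbers are illustrative and not part of MintPy:

import numpy as np

# synthetic linear-motion time series: d = intercept + velocity * t + noise
rng = np.random.default_rng(0)
t = np.linspace(0, 5, 30)                       # acquisition times [year]
G = np.stack([np.ones_like(t), t], axis=1)      # design matrix for [intercept, velocity]
d = G @ np.array([0.01, 0.02]) + rng.normal(0, 0.003, t.size)

# solve G m = d via ordinary least squares
m, e2 = np.linalg.lstsq(G, d, rcond=None)[:2]   # e2 = sum of squared residuals

# propagate the fitting residual into the parameter STD
num_obs, num_param = G.shape
m_var = e2 / (num_obs - num_param)              # unbiased estimate of sigma^2
m_std = np.sqrt(np.diag(np.linalg.inv(G.T @ G)) * m_var)
print('velocity = {:.4f} +/- {:.4f} m/year'.format(m[1], m_std[1]))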
Code example #2
    def prepare_geometry_definition_radar(self):
        """Get src_def and dest_def for lookup table in radar-coord (from ISCE, DORIS)"""
        def mark_lat_lon_anomoly(lat, lon):
            """Mask pixels with abnormal values (0, etc.).
            This is found in sentinelStack multiple-swath lookup table files.
            """
            # ignore pixels with zero value
            zero_mask = np.multiply(lat != 0., lon != 0.)

            # ignore anomalous non-zero values
            # by getting the most common data range (d_min, d_max) based on the histogram
            mask = np.array(zero_mask, np.bool_)
            for data in [lat, lon]:
                bin_value, bin_edge = np.histogram(data[mask], bins=10)
                # if there is an anomaly, the histogram won't be evenly distributed
                while np.max(bin_value) > np.sum(zero_mask) * 0.3:
                    # find the continuous bins where the largest bin is --> normal data range
                    bin_value_thres = ut.median_abs_deviation_threshold(
                        bin_value, cutoff=3)
                    bin_label = ndimage.label(bin_value > bin_value_thres)[0]
                    idx = np.where(
                        bin_label == bin_label[np.argmax(bin_value)])[0]
                    # convert to min/max data value
                    bin_step = bin_edge[1] - bin_edge[0]
                    d_min = bin_edge[idx[0]] - bin_step / 2.
                    d_max = bin_edge[idx[-1] + 1] + bin_step / 2.
                    mask *= np.multiply(data >= d_min, data <= d_max)
                    bin_value, bin_edge = np.histogram(data[mask], bins=10)

            # set invalid pixels to fixed values
            lat[mask == 0] = 90.
            lon[mask == 0] = 0.
            return lat, lon, mask

        # read lookup table: lat/lon at pixel center
        # src  for radar2geo
        # dest for geo2radar
        print('read latitude / longitude from lookup table file: {}'.format(
            self.lut_file))
        lat_file = self.lat_file if self.lat_file else self.lut_file
        lon_file = self.lon_file if self.lon_file else self.lut_file
        lut_lat = readfile.read(lat_file,
                                datasetName='latitude')[0].astype(np.float32)
        lut_lon = readfile.read(lon_file,
                                datasetName='longitude')[0].astype(np.float32)
        lut_lat, lut_lon, mask = mark_lat_lon_anomoly(lut_lat, lut_lon)

        # radar2geo (with block-by-block support)
        if 'Y_FIRST' not in self.src_meta.keys():

            # src_lat/lon0/1
            src_lat0 = np.nanmax(lut_lat[mask])
            src_lat1 = np.nanmin(lut_lat[mask])
            src_lon0 = np.nanmin(lut_lon[mask])
            src_lon1 = np.nanmax(lut_lon[mask])

            # parameter 1 - lalo_step (output grid)
            if self.lalo_step is None:
                try:
                    # ensure the same pixel area before / after geocoding
                    merged_meta = {**self.lut_meta, **self.src_meta}
                    lat_c = (src_lat0 + src_lat1) / 2.
                    lat_step, lon_step = ut.auto_lat_lon_step_size(
                        merged_meta, lat_c)

                except KeyError:
                    # ensure the same matrix shape before / after geocoding
                    # if not enough metadata found for the above
                    lat_step = (src_lat1 - src_lat0) / (lut_lat.shape[0] - 1)
                    lon_step = (src_lon1 - src_lon0) / (lut_lat.shape[1] - 1)
                self.lalo_step = (abs(lat_step) * -1., abs(lon_step))

            else:
                # ensure lat/lon step sign
                self.lalo_step = (abs(self.lalo_step[0]) * -1.,
                                  abs(self.lalo_step[1]) * 1.)
            print('output pixel size in (lat, lon) in degree: {}'.format(
                self.lalo_step))

            # parameter 2 / 3 - SNWE (at pixel outer boundary; output grid) / length & width
            if self.SNWE is None:
                self.SNWE = (src_lat1 + self.lalo_step[0] / 2.0,
                             src_lat0 - self.lalo_step[0] / 2.0,
                             src_lon0 - self.lalo_step[1] / 2.0,
                             src_lon1 + self.lalo_step[1] / 2.0)
            self.length = int(
                np.rint((self.SNWE[0] - self.SNWE[1]) / self.lalo_step[0]))
            self.width = int(
                np.rint((self.SNWE[3] - self.SNWE[2]) / self.lalo_step[1]))
            # adjust SNWE ending coordinate (S, E) for precise alignment
            self.SNWE = (self.SNWE[1] + self.lalo_step[0] * self.length,
                         self.SNWE[1], self.SNWE[2],
                         self.SNWE[2] + self.lalo_step[1] * self.width)
            print('output area extent in (S, N, W, E) in degree: {}'.format(
                self.SNWE))
            print('output file row / column number: ({}, {})'.format(
                self.length, self.width))

            # parameter 4 - list of boxes & geometry definitions
            self.src_box_list = []
            self.src_def_list = []
            self.dest_box_list = []
            self.dest_def_list = []

            # split dest_box (in grid)
            self.dest_box_list = split_box2sub_boxes(box=(0, 0, self.width,
                                                          self.length),
                                                     num_split=self.num_box,
                                                     dimension='y',
                                                     print_msg=True)

            # dest_box --> src_box / src_def / dest_def
            for i, dest_box in enumerate(self.dest_box_list):
                if self.num_box > 1:
                    print('preparing geometry for dest_box {}/{}: {}'.format(
                        i + 1, self.num_box, dest_box))

                # dest_lat/lon at pixel center
                lat_num = dest_box[3] - dest_box[1]
                lon_num = dest_box[2] - dest_box[0]
                lat0 = self.SNWE[1] + self.lalo_step[0] * (dest_box[1] + 0.5)
                lat1 = self.SNWE[1] + self.lalo_step[0] * (dest_box[3] - 0.5)
                lon0 = self.SNWE[2] + self.lalo_step[1] * (dest_box[0] + 0.5)
                lon1 = self.SNWE[2] + self.lalo_step[1] * (dest_box[2] - 0.5)
                dest_lat, dest_lon = np.mgrid[lat0:lat1:lat_num * 1j,
                                              lon0:lon1:lon_num * 1j]

                # src_box
                src_area = (src_lat1 - src_lat0) * (src_lon1 - src_lon0)
                dest_area = (lat1 - lat0) * (lon1 - lon0)
                if dest_area < src_area * 0.5:
                    # reduction of swath data
                    # https://pyresample.readthedocs.io/en/latest/data_reduce.html
                    # get src_box (in swath) from lat/lon (from dest_box in grid)
                    print('searching relevant box covering the current SNWE')
                    flag = pr.data_reduce.get_valid_index_from_lonlat_grid(
                        dest_lon,
                        dest_lat,
                        lut_lon,
                        lut_lat,
                        radius_of_influence=3000)
                    idx_row, idx_col = np.where(flag)
                    src_box = (np.min(idx_col), np.min(idx_row),
                               np.max(idx_col), np.max(idx_row))
                else:
                    src_box = (0, 0, lut_lat.shape[1], lut_lat.shape[0])

                # geometry definition
                src_def = pr.geometry.SwathDefinition(
                    lons=lut_lon[src_box[1]:src_box[3], src_box[0]:src_box[2]],
                    lats=lut_lat[src_box[1]:src_box[3], src_box[0]:src_box[2]])
                dest_def = pr.geometry.GridDefinition(lons=dest_lon,
                                                      lats=dest_lat)

                self.src_box_list.append(src_box)
                self.src_def_list.append(src_def)
                self.dest_def_list.append(dest_def)

        # geo2radar (WITHOUT block-by-block support)
        else:
            # parameter 1 - lalo_step (input grid)
            self.lalo_step = [
                float(self.src_meta['Y_STEP']),
                float(self.src_meta['X_STEP'])
            ]
            print('input pixel size in (lat, lon) in degree: {}'.format(
                self.lalo_step))

            # parameter 2 - SNWE (input grid)
            lat0 = float(self.src_meta['Y_FIRST'])
            lon0 = float(self.src_meta['X_FIRST'])
            if not self.SNWE:
                # default SNWE --> src_box
                src_box = (0, 0, int(self.src_meta['WIDTH']),
                           int(self.src_meta['LENGTH']))
            else:
                # custom input SNWE --> src_box
                # to align SNWE precisely to the source file in geo-coord
                src_box = (
                    int(np.rint(
                        (self.SNWE[2] - lon0) / self.lalo_step[1])),  # x0 - W
                    int(np.rint(
                        (self.SNWE[1] - lat0) / self.lalo_step[0])),  # y0 - N
                    int(np.rint(
                        (self.SNWE[3] - lon0) / self.lalo_step[1])),  # x1 - E
                    int(np.rint(
                        (self.SNWE[0] - lat0) / self.lalo_step[0])))  # y1 - S
            # src_box --> SNWE
            self.SNWE = (
                lat0 + self.lalo_step[0] * src_box[3],  # S - y1
                lat0 + self.lalo_step[0] * src_box[1],  # N - y0
                lon0 + self.lalo_step[1] * src_box[0],  # W - x0
                lon0 + self.lalo_step[1] * src_box[2])  # E - x1
            print('input area extent in (S, N, W, E) in degree: {}'.format(
                self.SNWE))

            # parameter 3 - length / width (output grid)
            self.length, self.width = lut_lat.shape

            # src_lat/lon (at pixel center)
            src_len = src_box[3] - src_box[1]
            src_wid = src_box[2] - src_box[0]
            src_lat0 = self.SNWE[1] + self.lalo_step[0] * (src_box[1] + 0.5)
            src_lat1 = self.SNWE[1] + self.lalo_step[0] * (src_box[3] - 0.5)
            src_lon0 = self.SNWE[2] + self.lalo_step[1] * (src_box[0] + 0.5)
            src_lon1 = self.SNWE[2] + self.lalo_step[1] * (src_box[2] - 0.5)
            src_lat, src_lon = np.mgrid[src_lat0:src_lat1:src_len * 1j,
                                        src_lon0:src_lon1:src_wid * 1j]

            # parameter 4 - list of boxes & geometry definitions
            self.src_box_list = [src_box]
            self.src_def_list = [
                pr.geometry.GridDefinition(lons=src_lon, lats=src_lat)
            ]
            self.dest_box_list = [(0, 0, self.width, self.length)]
            self.dest_def_list = [
                pr.geometry.SwathDefinition(lons=lut_lon, lats=lut_lat)
            ]
            self.num_box = 1

        return
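
The src_def / dest_def pairs prepared above are consumed by pyresample for the actual interpolation, which MintPy runs elsewhere. Below is a minimal sketch of that downstream step with synthetic coordinates, calling pyresample's kd_tree resampler directly; all names and values are illustrative:

import numpy as np
import pyresample as pr

# synthetic lookup table: lat/lon of every radar pixel (the swath)
lut_lat, lut_lon = np.mgrid[35.0:34.0:200j, -118.0:-117.0:300j]
src_def = pr.geometry.SwathDefinition(lons=lut_lon, lats=lut_lat)

# regular output grid sampled at pixel centers
dest_lat, dest_lon = np.mgrid[34.995:34.005:100j, -117.995:-117.005:150j]
dest_def = pr.geometry.GridDefinition(lons=dest_lon, lats=dest_lat)

# nearest-neighbor resampling from the swath onto the grid
data = np.random.rand(*lut_lat.shape).astype(np.float32)
data_geo = pr.kd_tree.resample_nearest(src_def, data, dest_def,
                                       radius_of_influence=3000,
                                       fill_value=np.nan)
print(data_geo.shape)  # (100, 150)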
Code example #3
def run_timeseries2time_func(inps):

    # basic info
    atr = readfile.read_attribute(inps.timeseries_file)
    length, width = int(atr['LENGTH']), int(atr['WIDTH'])
    num_date = inps.numDate
    dates = np.array(inps.dateList)

    # get deformation model from parsers
    model, num_param = read_inps2model(inps)


    ## output preparation

    # attributes
    atr['FILE_TYPE'] = 'velocity'
    atr['UNIT'] = 'm/year'
    atr['START_DATE'] = inps.dateList[0]
    atr['END_DATE'] = inps.dateList[-1]
    atr['DATE12'] = '{}_{}'.format(inps.dateList[0], inps.dateList[-1])
    if inps.ref_yx:
        atr['REF_Y'] = inps.ref_yx[0]
        atr['REF_X'] = inps.ref_yx[1]
    if inps.ref_date:
        atr['REF_DATE'] = inps.ref_date

    # config parameter
    print('add/update the following configuration metadata:\n{}'.format(configKeys))
    for key in configKeys:
        atr[key_prefix+key] = str(vars(inps)[key])

    # instantiate output file
    layout_hdf5(inps.outfile, atr, model)


    ## estimation

    # calc number of box based on memory limit
    memoryAll = (num_date + num_param * 2 + 2) * length * width * 4 
    if inps.bootstrap:
        memoryAll += inps.bootstrapCount * num_param * length * width * 4
    num_box = int(np.ceil(memoryAll * 3 / (inps.maxMemory * 1024**3)))
    box_list = cluster.split_box2sub_boxes(box=(0, 0, width, length),
                                           num_split=num_box,
                                           dimension='y',
                                           print_msg=True)

    # loop for block-by-block IO
    for i, box in enumerate(box_list):
        box_width  = box[2] - box[0]
        box_length = box[3] - box[1]
        num_pixel = box_length * box_width
        if num_box > 1:
            print('\n------- processing patch {} out of {} --------------'.format(i+1, num_box))
            print('box width:  {}'.format(box_width))
            print('box length: {}'.format(box_length))

        # initiate output
        m = np.zeros((num_param, num_pixel), dtype=dataType)
        m_std = np.zeros((num_param, num_pixel), dtype=dataType)

        # read input
        print('reading data from file {} ...'.format(inps.timeseries_file))
        ts_data = readfile.read(inps.timeseries_file, box=box)[0]
        # referencing in time and space
        # for file w/o reference info. e.g. ERA5.h5
        if inps.ref_date:
            print('referencing to date: {}'.format(inps.ref_date))
            ref_ind = inps.dateList.index(inps.ref_date)
            ts_data -= np.tile(ts_data[ref_ind, :, :], (ts_data.shape[0], 1, 1))
        if inps.ref_yx:
            print('referencing to point (y, x): ({}, {})'.format(inps.ref_yx[0], inps.ref_yx[1]))
            ref_box = (inps.ref_yx[1], inps.ref_yx[0], inps.ref_yx[1]+1, inps.ref_yx[0]+1)
            ref_val = readfile.read(inps.timeseries_file, box=ref_box)[0]
            ts_data -= np.tile(ref_val.reshape(ts_data.shape[0], 1, 1), (1, ts_data.shape[1], ts_data.shape[2]))

        ts_data = ts_data[inps.dropDate, :, :].reshape(inps.numDate, -1)
        if atr['UNIT'] == 'mm':
            ts_data *= 1./1000.

        # mask invalid pixels
        print('skip pixels with zero/nan value in all acquisitions')
        ts_stack = np.nanmean(ts_data, axis=0)
        mask = np.multiply(~np.isnan(ts_stack), ts_stack!=0.)
        del ts_stack

        ts_data = ts_data[:, mask]
        num_pixel2inv = int(np.sum(mask))
        print('number of pixels to invert: {} out of {} ({:.1f}%)'.format(
            num_pixel2inv, num_pixel, num_pixel2inv/num_pixel*100))

        # go to next if no valid pixel found
        if num_pixel2inv == 0:
            block = [box[1], box[3], box[0], box[2]]
            write_hdf5_block(inps.outfile, model, m, m_std,
                             mask=mask,
                             block=block)
            continue


        ### estimation / solve Gm = d

        if inps.bootstrap:
            ## option 1 - least squares with bootstrapping
            # Bootstrapping is a resampling method which can be used to estimate properties
            # of an estimator. The method relies on independently sampling the data set with
            # replacement.

            try:
                from sklearn.utils import resample
            except ImportError:
                raise ImportError('cannot import scikit-learn!')
            print('using bootstrap resampling {} times ...'.format(inps.bootstrapCount)) 

            # calc model of all bootstrap sampling
            m_boot = np.zeros((inps.bootstrapCount, num_param, num_pixel2inv), dtype=dataType)
            prog_bar = ptime.progressBar(maxValue=inps.bootstrapCount)
            for i in range(inps.bootstrapCount):
                # bootstrap resampling
                boot_ind = resample(np.arange(inps.numDate),
                                    replace=True,
                                    n_samples=inps.numDate)
                boot_ind.sort()

                # estimation
                m_boot[i] = estimate_time_func(dates[boot_ind].tolist(),
                                               ts_data[boot_ind],
                                               model)[1]

                prog_bar.update(i+1, suffix='iteration {} / {}'.format(i+1, inps.bootstrapCount))
            prog_bar.close()
            del ts_data

            # get mean/std among all bootstrap sampling
            print('calculate mean and standard deviation of bootstrap estimations')
            m[:, mask] = m_boot.mean(axis=0).reshape(num_param, -1)
            m_std[:, mask] = m_boot.std(axis=0).reshape(num_param, -1)
            del m_boot


        else:
            ## option 2 - least squares with uncertainty propagation

            print('estimate time functions via linalg.lstsq ...')
            G, m[:, mask], e2 = estimate_time_func(inps.dateList,
                                                   ts_data,
                                                   model)
            del ts_data

            ## Compute the covariance matrix for model parameters: Gm = d
            # C_m_hat = (G.T * C_d^-1 * G)^-1  # the most generic form
            #         = sigma^2 * (G.T * G)^-1  # assuming the obs error is normally distributed in time.
            # Based on the law of integrated expectation, we estimate the obs sigma^2 using
            # the OLS estimation residual e_hat_i = d_i - d_hat_i
            # sigma^2 = sigma_hat^2 * N / (N - P)
            #         = (e_hat.T * e_hat) / (N - P)  # sigma_hat^2 = (e_hat.T * e_hat) / N

            G_inv = linalg.inv(np.dot(G.T, G))
            m_var = e2.reshape(1, -1) / (num_date - num_param)
            m_std[:, mask] = np.sqrt(np.dot(np.diag(G_inv).reshape(-1, 1), m_var))

            ## for linear velocity, the STD can also be calculated 
            # using Eq. (10) from Fattahi and Amelung (2015, JGR)
            # ts_diff = ts_data - np.dot(G, m)
            # t_diff = G[:, 1] - np.mean(G[:, 1])
            # vel_std = np.sqrt(np.sum(ts_diff ** 2, axis=0) / np.sum(t_diff ** 2)  / (num_date - 2))

        # write
        block = [box[1], box[3], box[0], box[2]]
        write_hdf5_block(inps.outfile, model, m, m_std,
                         mask=mask,
                         block=block)

    return inps.outfile
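
For intuition, the bootstrap of option 1 can be reproduced in a few lines on a synthetic single-pixel series; the resample-and-refit loop below mirrors the m_boot accumulation above, with all values made up:

import numpy as np

rng = np.random.default_rng(0)
t = np.linspace(0, 5, 30)                        # acquisition times [year]
G = np.stack([np.ones_like(t), t], axis=1)       # design matrix for [intercept, velocity]
d = G @ np.array([0.0, 0.02]) + rng.normal(0, 0.005, t.size)

num_boot = 400
m_boot = np.zeros((num_boot, 2))
for i in range(num_boot):
    # resample the acquisitions with replacement, keeping chronological order
    ind = np.sort(rng.choice(t.size, size=t.size, replace=True))
    m_boot[i] = np.linalg.lstsq(G[ind], d[ind], rcond=None)[0]

# the spread among bootstrap estimates approximates the parameter uncertainty
print('velocity = {:.4f} +/- {:.4f}'.format(m_boot[:, 1].mean(), m_boot[:, 1].std()))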
Code example #4
File: reference_date.py  Project: scottstanie/MintPy
def change_timeseries_ref_date(ts_file, ref_date, outfile=None, max_memory=4.0, force=False):
    """Change input file reference date to a different one.
    Parameters: ts_file : str, timeseries file to be changed
                ref_date : str, date in YYYYMMDD format
                outfile  : if str, save to a different file
                           if None, modify the data value in the existing input file
    """
    ts_file = os.path.abspath(ts_file)
    if not outfile:
        outfile = ts_file
    outfile = os.path.abspath(outfile)

    print('-'*50)
    print('change reference date for file: {}'.format(ts_file))
    atr = readfile.read_attribute(ts_file)
    dsName = atr['FILE_TYPE']

    # if the input reference date is the same as the existing one.
    if ref_date == atr.get('REF_DATE', None) and not force:
        print('input refDate is the same as the existing REF_DATE.')
        if outfile == ts_file:
            print('Nothing to be done.')
            return ts_file
        else:
            print('Copy {} to {}'.format(ts_file, outfile))
            shutil.copy2(ts_file, outfile)
            return outfile

    # basic info
    obj = timeseries(ts_file)
    obj.open(print_msg=False)
    num_date = obj.numDate
    length = obj.length
    width = obj.width
    ref_idx = obj.dateList.index(ref_date)

    # get list of boxes for block-by-block IO
    num_box = int(np.ceil((num_date * length * width * 4 * 2) / (max_memory * 1024**3)))
    box_list = split_box2sub_boxes(box=(0, 0, width, length),
                                   num_split=num_box,
                                   dimension='y',
                                   print_msg=True)

    # update the existing file or write to a new file
    if outfile == ts_file:
        mode = 'r+'
    else:
        mode = 'a'
        # instantiate output file
        writefile.layout_hdf5(outfile, ref_file=ts_file)

    # loop for block-by-block IO
    for i, box in enumerate(box_list):
        box_width  = box[2] - box[0]
        box_length = box[3] - box[1]
        if num_box > 1:
            print('\n------- processing patch {} out of {} --------------'.format(i+1, num_box))
            print('box width:  {}'.format(box_width))
            print('box length: {}'.format(box_length))

        # reading
        print('reading data ...')
        ts_data = readfile.read(ts_file, box=box)[0]

        print('referencing in time ...')
        dshape = ts_data.shape
        ts_data -= np.tile(ts_data[ref_idx, :, :].reshape(1, dshape[1], dshape[2]), (dshape[0], 1, 1))

        # writing
        block = (0, num_date, box[1], box[3], box[0], box[2])
        writefile.write_hdf5_block(outfile,
                                   data=ts_data,
                                   datasetName=dsName,
                                   block=block,
                                   mode=mode)

    # update metadata
    print('update "REF_DATE" attribute value to {}'.format(ref_date))
    with h5py.File(outfile, 'r+') as f:
        f.attrs['REF_DATE'] = ref_date
        f.attrs['FILE_PATH'] = outfile

    return outfile
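
The core of the re-referencing is one subtraction per block; a toy-sized sketch with made-up shapes:

import numpy as np

num_date, length, width = 5, 3, 4
ts_data = np.random.rand(num_date, length, width).astype(np.float32)
ref_idx = 2

# subtract the reference acquisition from every acquisition
ts_data -= np.tile(ts_data[ref_idx, :, :].reshape(1, length, width),
                   (num_date, 1, 1))
assert np.allclose(ts_data[ref_idx], 0)  # the new reference date is all zeros

Plain broadcasting (ts_data - ts_data[ref_idx]) would give the same result; the explicit np.tile used above simply spells the shapes out.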
Code example #5
def correct_dem_error(inps):
    """Correct DEM error of input timeseries file"""

    start_time = time.time()

    # limit the number of threads to 1
    # for a slight speedup and a large saving in CPU usage
    num_threads_dict = cluster.set_num_threads("1")

    ## 1. input info

    # 1.1 read date info
    ts_obj = timeseries(inps.timeseries_file)
    ts_obj.open()
    num_date = ts_obj.numDate
    length, width = ts_obj.length, ts_obj.width

    num_step = len(inps.stepFuncDate)

    # exclude dates
    date_flag = read_exclude_date(inps.excludeDate, ts_obj.dateList)[0]
    if inps.polyOrder > np.sum(date_flag):
        raise ValueError(
            "input poly order {} > number of acquisition {}! Reduce it!".
            format(inps.polyOrder, np.sum(date_flag)))

    # 1.2 design matrix part 1 - time func for surface deformation
    G_defo = get_design_matrix4defo(inps)

    ## 2. prepare output

    # 2.1 metadata
    meta = dict(ts_obj.metadata)
    print(
        'add/update the following configuration metadata to file:\n{}'.format(
            configKeys))
    for key in configKeys:
        meta[key_prefix + key] = str(vars(inps)[key])

    # 2.2 instantiate est. DEM error
    dem_err_file = 'demErr.h5'
    meta['FILE_TYPE'] = 'dem'
    meta['UNIT'] = 'm'
    ds_name_dict = {'dem': [np.float32, (length, width), None]}
    writefile.layout_hdf5(dem_err_file, ds_name_dict, metadata=meta)

    # 2.3 instantiate corrected time-series
    ts_cor_file = inps.outfile
    meta['FILE_TYPE'] = 'timeseries'
    writefile.layout_hdf5(ts_cor_file,
                          metadata=meta,
                          ref_file=inps.timeseries_file)

    # 2.4 instantiate residual phase time-series
    ts_res_file = os.path.join(os.path.dirname(inps.outfile),
                               'timeseriesResidual.h5')
    writefile.layout_hdf5(ts_res_file,
                          metadata=meta,
                          ref_file=inps.timeseries_file)

    ## 3. run the estimation and write to disk

    # 3.1 split ts_file into blocks to save memory
    # 1st dimension size: ts (obs / cor / res / step) + dem_err/inc_angle/rg_dist (+pbase)
    num_epoch = num_date * 3 + num_step + 3
    if inps.geom_file:
        geom_obj = geometry(inps.geom_file)
        geom_obj.open(print_msg=False)
        if 'bperp' in geom_obj.datasetNames:
            num_epoch += num_date

    # split in row/line direction based on the input memory limit
    num_box = int(
        np.ceil((num_epoch * length * width * 4) * 2.5 /
                (inps.maxMemory * 1024**3)))
    box_list = cluster.split_box2sub_boxes(box=(0, 0, width, length),
                                           num_split=num_box,
                                           dimension='y')

    # 3.2 prepare the input arguments for *_patch()
    data_kwargs = {
        'G_defo': G_defo,
        'ts_file': inps.timeseries_file,
        'geom_file': inps.geom_file,
        'date_flag': date_flag,
        'phase_velocity': inps.phaseVelocity,
    }

    # 3.3 invert / write block-by-block
    for i, box in enumerate(box_list):
        box_wid = box[2] - box[0]
        box_len = box[3] - box[1]
        if num_box > 1:
            print('\n------- processing patch {} out of {} --------------'.
                  format(i + 1, num_box))
            print('box width:  {}'.format(box_wid))
            print('box length: {}'.format(box_len))

        # update box argument in the input data
        data_kwargs['box'] = box

        # invert
        if not inps.cluster:
            # non-parallel
            delta_z, ts_cor, ts_res = correct_dem_error_patch(
                **data_kwargs)[:-1]

        else:
            # parallel
            print('\n\n------- start parallel processing using Dask -------')

            # initiate the output data
            delta_z = np.zeros((box_len, box_wid), dtype=np.float32)
            ts_cor = np.zeros((num_date, box_len, box_wid), dtype=np.float32)
            ts_res = np.zeros((num_date, box_len, box_wid), dtype=np.float32)

            # initiate dask cluster and client
            cluster_obj = cluster.DaskCluster(inps.cluster,
                                              inps.numWorker,
                                              config_name=inps.config)
            cluster_obj.open()

            # run dask
            delta_z, ts_cor, ts_res = cluster_obj.run(
                func=correct_dem_error_patch,
                func_data=data_kwargs,
                results=[delta_z, ts_cor, ts_res])

            # close dask cluster and client
            cluster_obj.close()

            print('------- finished parallel processing -------\n\n')

        # write the block to disk
        # with 3D block in [z0, z1, y0, y1, x0, x1]
        # and  2D block in         [y0, y1, x0, x1]

        # DEM error - 2D
        block = [box[1], box[3], box[0], box[2]]
        writefile.write_hdf5_block(dem_err_file,
                                   data=delta_z,
                                   datasetName='dem',
                                   block=block)

        # corrected time-series - 3D
        block = [0, num_date, box[1], box[3], box[0], box[2]]
        writefile.write_hdf5_block(ts_cor_file,
                                   data=ts_cor,
                                   datasetName='timeseries',
                                   block=block)

        # residual time-series - 3D
        block = [0, num_date, box[1], box[3], box[0], box[2]]
        writefile.write_hdf5_block(ts_res_file,
                                   data=ts_res,
                                   datasetName='timeseries',
                                   block=block)

    # roll back to the original number of threads
    cluster.roll_back_num_threads(num_threads_dict)

    # time info
    m, s = divmod(time.time() - start_time, 60)
    print('time used: {:02.0f} mins {:02.1f} secs.'.format(m, s))

    return dem_err_file, ts_cor_file, ts_res_file
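
The num_box calculation above is a simple byte count against the memory cap; a standalone sketch with made-up sizes:

import numpy as np

num_date, length, width = 100, 3000, 2000
max_memory = 4.0                               # memory cap [GB]

# 1st dimension size: obs / cor / res stacks plus a few 2D layers
num_epoch = num_date * 3 + 3
memory_all = num_epoch * length * width * 4    # float32 --> 4 bytes per value
num_box = int(np.ceil(memory_all * 2.5 / (max_memory * 1024**3)))
print('split into {} box(es) along the row direction'.format(num_box))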
Code example #6
def run_timeseries2time_func(inps):

    # basic info
    atr = readfile.read_attribute(inps.timeseries_file)
    length, width = int(atr['LENGTH']), int(atr['WIDTH'])
    num_date = inps.numDate
    dates = np.array(inps.dateList)
    seconds = atr.get('CENTER_LINE_UTC', 0)

    # use the 1st date as reference if not found, e.g. timeseriesResidual.h5 file
    if "REF_DATE" not in atr.keys() and not inps.ref_date:
        inps.ref_date = inps.dateList[0]
        print(
            'WARNING: No REF_DATE found in the time-series file or given via the command line.'
        )
        print('  Set "--ref-date {}" and continue.'.format(inps.dateList[0]))

    # get deformation model from parsers
    model, num_param = read_inps2model(inps)

    ## output preparation

    # time_func_param: attributes
    atrV = dict(atr)
    atrV['FILE_TYPE'] = 'velocity'
    atrV['UNIT'] = 'm/year'
    atrV['START_DATE'] = inps.dateList[0]
    atrV['END_DATE'] = inps.dateList[-1]
    atrV['DATE12'] = '{}_{}'.format(inps.dateList[0], inps.dateList[-1])
    if inps.ref_yx:
        atrV['REF_Y'] = inps.ref_yx[0]
        atrV['REF_X'] = inps.ref_yx[1]
    if inps.ref_date:
        atrV['REF_DATE'] = inps.ref_date

    # time_func_param: config parameter
    print('add/update the following configuration metadata:\n{}'.format(
        configKeys))
    for key in configKeys:
        atrV[key_prefix + key] = str(vars(inps)[key])

    # time_func_param: instantiate output file
    ds_name_dict, ds_unit_dict = model2hdf5_dataset(model,
                                                    ds_shape=(length,
                                                              width))[1:]
    writefile.layout_hdf5(inps.outfile,
                          metadata=atrV,
                          ds_name_dict=ds_name_dict,
                          ds_unit_dict=ds_unit_dict)

    # timeseries_res: attributes + instantiate output file
    if inps.save_res:
        atrR = dict(atr)
        # remove REF_DATE attribute
        for key in ['REF_DATE']:
            if key in atrR.keys():
                atrR.pop(key)
        # prepare ds_name_dict manually, instead of using ref_file, to support --ex option
        date_len = len(inps.dateList[0])
        ds_name_dict = {
            "date": [
                np.dtype(f'S{date_len}'), (num_date, ),
                np.array(inps.dateList, dtype=np.string_)
            ],
            "timeseries": [np.float32, (num_date, length, width), None]
        }
        writefile.layout_hdf5(inps.res_file,
                              ds_name_dict=ds_name_dict,
                              metadata=atrR)

    ## estimation

    # calc number of box based on memory limit
    memoryAll = (num_date + num_param * 2 + 2) * length * width * 4
    if inps.bootstrap:
        memoryAll += inps.bootstrapCount * num_param * length * width * 4
    num_box = int(np.ceil(memoryAll * 3 / (inps.maxMemory * 1024**3)))
    box_list = cluster.split_box2sub_boxes(box=(0, 0, width, length),
                                           num_split=num_box,
                                           dimension='y',
                                           print_msg=True)

    # loop for block-by-block IO
    for i, box in enumerate(box_list):
        box_wid = box[2] - box[0]
        box_len = box[3] - box[1]
        num_pixel = box_len * box_wid
        if num_box > 1:
            print('\n------- processing patch {} out of {} --------------'.
                  format(i + 1, num_box))
            print('box width:  {}'.format(box_wid))
            print('box length: {}'.format(box_len))

        # initiate output
        m = np.zeros((num_param, num_pixel), dtype=dataType)
        m_std = np.zeros((num_param, num_pixel), dtype=dataType)

        # read input
        print('reading data from file {} ...'.format(inps.timeseries_file))
        ts_data = readfile.read(inps.timeseries_file, box=box)[0]

        # referencing in time and space
        # for file w/o reference info. e.g. ERA5.h5
        if inps.ref_date:
            print('referencing to date: {}'.format(inps.ref_date))
            ref_ind = inps.dateList.index(inps.ref_date)
            ts_data -= np.tile(ts_data[ref_ind, :, :],
                               (ts_data.shape[0], 1, 1))

        if inps.ref_yx:
            print('referencing to point (y, x): ({}, {})'.format(
                inps.ref_yx[0], inps.ref_yx[1]))
            ref_box = (inps.ref_yx[1], inps.ref_yx[0], inps.ref_yx[1] + 1,
                       inps.ref_yx[0] + 1)
            ref_val = readfile.read(inps.timeseries_file, box=ref_box)[0]
            ts_data -= np.tile(ref_val.reshape(ts_data.shape[0], 1, 1),
                               (1, ts_data.shape[1], ts_data.shape[2]))

        ts_data = ts_data[inps.dropDate, :, :].reshape(inps.numDate, -1)
        if atrV['UNIT'] == 'mm':
            ts_data *= 1. / 1000.

        ts_cov = None
        if inps.ts_cov_file:
            print(
                f'reading time-series covariance matrix from file {inps.ts_cov_file} ...'
            )
            ts_cov = readfile.read(inps.ts_cov_file, box=box)[0]
            if len(ts_cov.shape) == 4:
                # full covariance matrix in 4D --> 3D
                if inps.numDate < ts_cov.shape[0]:
                    ts_cov = ts_cov[inps.dropDate, :, :, :]
                    ts_cov = ts_cov[:, inps.dropDate, :, :]
                ts_cov = ts_cov.reshape(inps.numDate, inps.numDate, -1)

            elif len(ts_cov.shape) == 3:
                # diagonal variance matrix in 3D --> 2D
                if inps.numDate < ts_cov.shape[0]:
                    ts_cov = ts_cov[inps.dropDate, :, :]
                ts_cov = ts_cov.reshape(inps.numDate, -1)

            ## set zero value to a fixed small value to avoid divide by zero
            #epsilon = 1e-5
            #ts_cov[ts_cov<epsilon] = epsilon

        # mask invalid pixels
        print('skip pixels with zero/nan value in all acquisitions')
        ts_stack = np.nanmean(ts_data, axis=0)
        mask = np.multiply(~np.isnan(ts_stack), ts_stack != 0.)
        del ts_stack

        #if ts_cov is not None:
        #    print('skip pixels with nan STD value in any acquisition')
        #    num_std_nan = np.sum(np.isnan(ts_cov), axis=0)
        #    mask *= num_std_nan == 0
        #    del num_std_nan

        ts_data = ts_data[:, mask]
        num_pixel2inv = int(np.sum(mask))
        idx_pixel2inv = np.where(mask)[0]
        print('number of pixels to invert: {} out of {} ({:.1f}%)'.format(
            num_pixel2inv, num_pixel, num_pixel2inv / num_pixel * 100))

        # go to next if no valid pixel found
        if num_pixel2inv == 0:
            continue

        ### estimation / solve Gm = d
        print('estimating time functions via linalg.lstsq ...')

        if inps.bootstrap:
            ## option 1 - least squares with bootstrapping
            # Bootstrapping is a resampling method which can be used to estimate properties
            # of an estimator. The method relies on independently sampling the data set with
            # replacement.
            print(
                'estimating time function STD with bootstrap resampling ({} times) ...'
                .format(inps.bootstrapCount))

            # calc model of all bootstrap sampling
            rng = np.random.default_rng()
            m_boot = np.zeros((inps.bootstrapCount, num_param, num_pixel2inv),
                              dtype=dataType)
            prog_bar = ptime.progressBar(maxValue=inps.bootstrapCount)
            for i in range(inps.bootstrapCount):
                # bootstrap resampling
                boot_ind = rng.choice(inps.numDate,
                                      size=inps.numDate,
                                      replace=True)
                boot_ind.sort()

                # estimation
                m_boot[i] = time_func.estimate_time_func(
                    model=model,
                    date_list=dates[boot_ind].tolist(),
                    dis_ts=ts_data[boot_ind],
                    seconds=seconds)[1]

                prog_bar.update(i + 1,
                                suffix='iteration {} / {}'.format(
                                    i + 1, inps.bootstrapCount))
            prog_bar.close()
            #del ts_data

            # get mean/std among all bootstrap sampling
            m[:, mask] = m_boot.mean(axis=0).reshape(num_param, -1)
            m_std[:, mask] = m_boot.std(axis=0).reshape(num_param, -1)
            del m_boot

            # get design matrix to calculate the residual time series
            G = time_func.get_design_matrix4time_func(inps.dateList,
                                                      model=model,
                                                      ref_date=inps.ref_date,
                                                      seconds=seconds)

        else:
            ## option 2 - least squares with uncertainty propagation
            G, m[:, mask], e2 = time_func.estimate_time_func(
                model=model,
                date_list=inps.dateList,
                dis_ts=ts_data,
                seconds=seconds)
            #del ts_data

            ## Compute the covariance matrix for model parameters:
            #       G * m = d
            #     C_m_hat = G+ * C_d * G+.T
            #
            # For ordinary least squares estimation:
            #     G+ = (G.T * G)^-1 * G.T                       (option 2.1)
            #
            # For weighted least squares estimation:
            #          G+ = (G.T * C_d^-1 * G)^-1 * G.T * C_d^-1
            # =>  C_m_hat = (G.T * C_d^-1 * G)^-1               (option 2.2)
            #
            # Assuming normality of the observation errors (in the time domain) with a variance of sigma^2
            # we have C_d = sigma^2 * I, then the above equation is simplified into:
            #     C_m_hat = sigma^2 * (G.T * G)^-1              (option 2.3)
            #
            # Based on the law of integrated expectation, we estimate the obs sigma^2 using
            # the OLS estimation residual as:
            #           e_hat = d - d_hat
            # =>  sigma_hat^2 = (e_hat.T * e_hat) / N
            # =>      sigma^2 = sigma_hat^2 * N / (N - P)       (option 2.4)
            #                 = (e_hat.T * e_hat) / (N - P)
            # which is the equation (10) from Fattahi and Amelung (2015, JGR)

            if ts_cov is not None:
                # option 2.1 - linear propagation from time-series (co)variance matrix
                # TO DO: save the full covariance matrix of the time function parameters
                # only the STD is saved right now
                covar_flag = ts_cov.ndim == 3
                msg = 'estimating time function STD from time-series '
                msg += 'covariance pixel-by-pixel ...' if covar_flag else 'variance pixel-by-pixel ...'
                print(msg)

                # calc the common pseudo-inverse matrix
                Gplus = linalg.pinv(G)

                # loop over each pixel
                # or use multidimension matrix multiplication
                # m_cov = Gplus @ ts_cov @ Gplus.T
                prog_bar = ptime.progressBar(maxValue=num_pixel2inv)
                for i in range(num_pixel2inv):
                    idx = idx_pixel2inv[i]

                    # cov: time-series -> time func
                    ts_covi = ts_cov[:, :, idx] if covar_flag else np.diag(
                        ts_cov[:, idx])
                    m_cov = np.linalg.multi_dot([Gplus, ts_covi, Gplus.T])
                    m_std[:, idx] = np.sqrt(np.diag(m_cov))

                    prog_bar.update(i + 1,
                                    every=200,
                                    suffix='{}/{} pixels'.format(
                                        i + 1, num_pixel2inv))
                prog_bar.close()

            else:
                # option 2.3 - assume obs errors following normal dist. in time
                print(
                    'estimating time function STD from time-series fitting residual ...'
                )
                G_inv = linalg.inv(np.dot(G.T, G))
                m_var = e2.reshape(1, -1) / (num_date - num_param)
                m_std[:, mask] = np.sqrt(
                    np.dot(np.diag(G_inv).reshape(-1, 1), m_var))

                # option 2.4 - simplified form for linear velocity (without matrix linear algebra)
                # The STD can also be calculated using Eq. (10) from Fattahi and Amelung (2015, JGR)
                # ts_diff = ts_data - np.dot(G, m)
                # t_diff = G[:, 1] - np.mean(G[:, 1])
                # vel_std = np.sqrt(np.sum(ts_diff ** 2, axis=0) / np.sum(t_diff ** 2)  / (num_date - 2))

        # write - time func params
        block = [box[1], box[3], box[0], box[2]]
        ds_dict = model2hdf5_dataset(model, m, m_std, mask=mask)[0]
        for ds_name, data in ds_dict.items():
            writefile.write_hdf5_block(inps.outfile,
                                       data=data.reshape(box_len, box_wid),
                                       datasetName=ds_name,
                                       block=block)

        # write - residual file
        if inps.save_res:
            block = [0, num_date, box[1], box[3], box[0], box[2]]
            ts_res = np.ones(
                (num_date, box_len * box_wid), dtype=np.float32) * np.nan
            ts_res[:, mask] = ts_data - np.dot(G, m)[:, mask]
            writefile.write_hdf5_block(inps.res_file,
                                       data=ts_res.reshape(
                                           num_date, box_len, box_wid),
                                       datasetName='timeseries',
                                       block=block)

    return inps.outfile
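
Option 2.1 / 2.2 above is the textbook propagation C_m_hat = G+ * C_d * G+.T; here is a single-pixel sketch with a synthetic diagonal data covariance, with illustrative values only:

import numpy as np

t = np.linspace(0, 5, 30)                      # acquisition times [year]
G = np.stack([np.ones_like(t), t], axis=1)     # design matrix for [intercept, velocity]

ts_std = np.full(t.size, 0.005)                # per-date displacement STD [m]
C_d = np.diag(ts_std**2)                       # diagonal time-series covariance

Gplus = np.linalg.pinv(G)                      # pseudo-inverse of the design matrix
C_m = np.linalg.multi_dot([Gplus, C_d, Gplus.T])
m_std = np.sqrt(np.diag(C_m))                  # STD of [intercept, velocity]
print(m_std)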
Code example #7
File: diff.py  Project: scottstanie/MintPy
def diff_file(file1, file2, out_file=None, force=False, max_num_pixel=2e8):
    """calculate/write file1 - file2

    Parameters: file1   - str, path of file1
                file2   - list of str, path of file2(s)
                out_file - str, path of output file
                force   - bool, enforce the differencing even if the dates do not fully overlap
                max_num_pixel - float, maximum number of pixels for each block
    """
    start_time = time.time()

    if not out_file:
        fbase, fext = os.path.splitext(file1)
        if len(file2) > 1:
            raise ValueError(
                'Output file name is needed for more than 2 input files.')
        out_file = '{}_diff_{}{}'.format(
            fbase,
            os.path.splitext(os.path.basename(file2[0]))[0], fext)
    print('{} - {} --> {}'.format(file1, file2, out_file))

    # Read basic info
    atr1 = readfile.read_attribute(file1)
    k1 = atr1['FILE_TYPE']
    atr2 = readfile.read_attribute(file2[0])
    k2 = atr2['FILE_TYPE']
    print('input files are: {} and {}'.format(k1, k2))

    if k1 == 'timeseries':
        if k2 not in ['timeseries', 'giantTimeseries']:
            raise Exception(
                'Input multi-dataset files are not of the same file type!')
        if len(file2) > 1:
            raise Exception(
                ('Only the subtraction of 2 files is supported for time-series files,'
                 ' {} input.'.format(len(file2) + 1)))

        dateList1 = timeseries(file1).get_date_list()
        if k2 == 'timeseries':
            dateList2 = timeseries(file2[0]).get_date_list()
            unit_fac = 1.
        elif k2 == 'giantTimeseries':
            dateList2 = giantTimeseries(file2[0]).get_date_list()
            unit_fac = 0.001

        # check reference point
        ref_date, ref_y, ref_x = check_reference(atr1, atr2)

        # check dates shared by two timeseries files
        dateListShared = [i for i in dateList1 if i in dateList2]
        dateShared = np.ones((len(dateList1)), dtype=np.bool_)
        if dateListShared != dateList1:
            print('WARNING: {} does not contain all dates in {}'.format(
                file2, file1))
            if force:
                dateListEx = list(set(dateList1) - set(dateListShared))
                print(
                    'Continue and enforce the differencing for their shared dates only.'
                )
                print(
                    '\twith the following dates ignored for differencing:\n{}'.
                    format(dateListEx))
                dateShared[np.array([dateList1.index(i)
                                     for i in dateListEx])] = 0
            else:
                raise Exception(
                    'To enforce the differencing anyway, use --force option.')

        # instantiate the output file
        writefile.layout_hdf5(out_file, ref_file=file1)

        # block-by-block IO
        length, width = int(atr1['LENGTH']), int(atr1['WIDTH'])
        num_box = int(np.ceil(len(dateList1) * length * width / max_num_pixel))
        box_list = cluster.split_box2sub_boxes(box=(0, 0, width, length),
                                               num_split=num_box,
                                               dimension='y',
                                               print_msg=True)

        if ref_y and ref_x:
            ref_box = (ref_x, ref_y, ref_x + 1, ref_y + 1)
            ref_val = readfile.read(file2[0],
                                    datasetName=dateListShared,
                                    box=ref_box)[0] * unit_fac

        for i, box in enumerate(box_list):
            if num_box > 1:
                print('\n------- processing patch {} out of {} --------------'.
                      format(i + 1, num_box))
                print('box: {}'.format(box))

            # read data2 (consider different reference_date/pixel)
            print('read from file: {}'.format(file2[0]))
            data2 = readfile.read(
                file2[0], datasetName=dateListShared, box=box)[0] * unit_fac

            if ref_y and ref_x:
                print('* referencing data from {} to y/x: {}/{}'.format(
                    os.path.basename(file2[0]), ref_y, ref_x))
                data2 -= np.tile(ref_val.reshape(-1, 1, 1),
                                 (1, data2.shape[1], data2.shape[2]))

            if ref_date:
                print('* referencing data from {} to date: {}'.format(
                    os.path.basename(file2[0]), ref_date))
                ref_ind = dateListShared.index(ref_date)
                data2 -= np.tile(data2[ref_ind, :, :], (data2.shape[0], 1, 1))

            # read data1
            print('read from file: {}'.format(file1))
            data = readfile.read(file1, box=box)[0]

            # apply differencing
            mask = data == 0.
            data[dateShared] -= data2
            data[mask] = 0.  # Do not change zero phase value
            del data2

            # write the block
            block = [0, data.shape[0], box[1], box[3], box[0], box[2]]
            writefile.write_hdf5_block(out_file,
                                       data=data,
                                       datasetName=k1,
                                       block=block)

    elif all(i == 'ifgramStack' for i in [k1, k2]):
        obj1 = ifgramStack(file1)
        obj1.open()
        obj2 = ifgramStack(file2[0])
        obj2.open()
        dsNames = list(set(obj1.datasetNames) & set(obj2.datasetNames))
        if len(dsNames) == 0:
            raise ValueError('no common dataset between two files!')
        dsName = [i for i in ifgramDatasetNames if i in dsNames][0]

        # read data
        print('reading {} from file {} ...'.format(dsName, file1))
        data1 = readfile.read(file1, datasetName=dsName)[0]
        print('reading {} from file {} ...'.format(dsName, file2[0]))
        data2 = readfile.read(file2[0], datasetName=dsName)[0]

        # consider reference pixel
        if 'unwrapphase' in dsName.lower():
            print('referencing to pixel ({},{}) ...'.format(
                obj1.refY, obj1.refX))
            ref1 = data1[:, obj1.refY, obj1.refX]
            ref2 = data2[:, obj2.refY, obj2.refX]
            for i in range(data1.shape[0]):
                data1[i, :][data1[i, :] != 0.] -= ref1[i]
                data2[i, :][data2[i, :] != 0.] -= ref2[i]

        # operation and ignore zero values
        data1[data1 == 0] = np.nan
        data2[data2 == 0] = np.nan
        data = data1 - data2
        del data1, data2
        data[np.isnan(data)] = 0.

        # write to file
        dsDict = {}
        dsDict[dsName] = data
        writefile.write(dsDict, out_file=out_file, ref_file=file1)

    # Single dataset file
    else:
        data1 = readfile.read(file1)[0]
        data = np.array(data1, data1.dtype)
        for fname in file2:
            data2 = readfile.read(fname)[0]
            data = np.array(data, dtype=np.float32) - np.array(
                data2, dtype=np.float32)
            data = np.array(data, data1.dtype)
        print('writing >>> ' + out_file)
        writefile.write(data, out_file=out_file, metadata=atr1)

    m, s = divmod(time.time() - start_time, 60)
    print('time used: {:02.0f} mins {:02.1f} secs'.format(m, s))

    return out_file
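
The time-series branch above boils down to: bring file2 onto file1's reference pixel and date, then subtract. A toy-sized sketch of that referencing, with made-up shapes and indices:

import numpy as np

num_date, length, width = 4, 5, 6
data1 = np.random.rand(num_date, length, width).astype(np.float32)
data2 = np.random.rand(num_date, length, width).astype(np.float32)
ref_y, ref_x, ref_ind = 2, 3, 0

# reference data2 to the same pixel, then to the same date, as data1
data2 -= np.tile(data2[:, ref_y, ref_x].reshape(-1, 1, 1), (1, length, width))
data2 -= np.tile(data2[ref_ind, :, :], (num_date, 1, 1))

diff = data1 - data2
print(diff.shape)  # (4, 5, 6)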