def __init__(self, shp_in, out_srs=None): print "\nOpening and reading shapefile '{0}'.".format(shp_in) try: self.__shp_rd = shp.Reader(shp_in) except shp.ShapefileException: exit("\nERROR -> File '{0}' not found".format(shp_in)) # Import projection prj_file = splitext(shp_in)[0] + '.prj' try: shp_prj = open(prj_file) except IOError: exit("\nERROR -> Could not find projection file '{0}'.".format(prj_file)) else: with shp_prj: prj_txt = shp_prj.read() self.__shp_srs = osr.SpatialReference() if self.__shp_srs.ImportFromESRI([prj_txt]) != 0: exit("\nERROR -> Error importing the projection information from '{0}'.".format(shp_in)) # Store requested spatial reference self.__out_srs = out_srs # If destination coordinates are specified if self.__out_srs: # Define the coordinates transformation self.__trans = prjpnt(self.__shp_srs, self.__out_srs) else: self.__trans = None # Initialize data loaded indicator self.__dataLoaded = False # Define coordinate labels self.__coo_lbl = ['SHP_X', 'SHP_Y'] # An empty dictionary to store the data for internal representation self.__data = {}
def amp2xls(xls_in, # args.xls_in amp_in=None, # args.amp_in shp_in=None, # args.shp_in ts_in=None, # args.ts_in xls_epsg=4326, # args.xls_epsg xls_sheet_in=['IS', 'PR', 'SC'], # args.xls_sheet_in keep_bad=False, # args.keep_bad ndval=-9999.0, # args.ndval period="winter", # args.period corr=['NIRI Average', 'CCI', 'CCI Class'], # args.corr pkfile=None, # args.pkfile csvfile=None, # args.txtfile differential=False, # args.differential prepend="", # args.prepend verbose=False): # args.verbose # Some default values that can be turned into arguments later on slng = 'Start GPS Longitude' # XLS column containing the staring GPS longitude slat = 'Start GPS Latitude' # XLS column containing the staring GPS latitued dtest = 'Date Tested' # XLS column contining the date when the section of road was tested year = 'Year' # XLS column contining the official year for the dataset ts_keys = ['VEL_STDEV', 'VEL'] # Keywords identitying temporary raster files # Load the input excel file names xls_stack = glob(xls_in) if not xls_stack: exit("\nERROR -> No excel files were selected using '{0}'".format(xls_in)) else: print "\nAnalyzing following excel files:" for x in xls_stack: print "- {0}".format(x) # Check if at least another source is selected if amp_in is None and shp_in is None and ts_in is None: exit("\nERROR -> At least and additional source (amplitude, SqueeSAR or Temporarary Scatterer) should be chosen for data extraction.") # Define the destination (excel) spatial reference xls_srs = osr.SpatialReference() if xls_srs.ImportFromEPSG(int(xls_epsg)) != 0: exit("\nERROR -> Error setting the destination data spatial reference to EPSG:{0}".format(xls_epsg)) # Define prepend string if prepend != "": prepend += "_" # Load the SAR amplitude stack file names af = "" if amp_in is not None: sar_stack = [f for f in glob(amp_in) if f[-4:] == '.tif'] if not sar_stack: exit("\nERROR -> No SAR amplitude files were selected using '{0}'".format(amp_in)) sar_stack.sort() # Sort the files # Initialize SAR spatial reference sar_srs = osr.SpatialReference() af = "_AMP" # If selected, open the shapefile containing displacement data sf = "" if shp_in is not None: # Load shapefile data shp = shp2df(shp_in, out_srs=xls_srs) shp_dat = shp.getDF() # Initialize spatial search tree [shp_x_lbl, shp_y_lbl] = shp.getCooLabels() kdt = KDTree(shp_dat[[shp_x_lbl, shp_y_lbl]].values) sf = "_SHP" # If selected, load the temporary scatterer files tsf = "" if ts_in is not None: ts_stack = [f for f in glob(ts_in) if f[-4:] == '.tif'] if not ts_stack: exit("\nERROR -> No temporary scatterer files were selected using '{0}'".format(ts_in)) # Initialize the TS spatial reference ts_srs = osr.SpatialReference() tsf = "_TS" # Differential processing dif = "" if differential is True: dif = "_DIF" # Load the list of correlating values selected by the user clist = list(corr) ccl = None # Check if the user selected 'CCI Class' if clist.count('CCI Class') == 1: ccl = clist.index('CCI Class') clist[ccl] = 'CCI' # Define the last element of the output files name if len(clist) > 1: cf = "_many" else: cf = "_" + list(corr)[0] # Process all the files in the xls stack for xls_in in xls_stack: print "\nOpening excel file '{0}' as input".format(xls_in) try: xlfil = pd.ExcelFile(xls_in) except IOError: exit("\nERROR -> File '{0}' not found!".format(xls_in)) except Exception as e: exit("\nERROR -> Error: '{0}'".format(e)) # Load the list of sheets selected by the user xls_data = xlfil.parse(list(xls_sheet_in)) # Open output excel file # If using XLSX, pandas requires for openpyxl to be version 1.6.1 or # higher, but lower than 2.0.0. # When using XLS, the limit is 256 columns and 65535 rows. # TODO: move the decision at the end of the file where teh number of # columns and rows is known. Then select driver and extension # based on the amount of rows and columns. name, ext = splitext(xls_in) ext = '.xlsx' xls_out = prepend + name + "_" + period + af + sf + tsf + dif + ext print "- Creating excel file '{0}' as output".format(xls_out) try: writer = pd.ExcelWriter(xls_out) except IOError: exit("\nERROR -> File '{0}' not found!".format(xls_out)) except Exception as e: exit("\nERROR -> Error: '{0}'".format(e)) # Iterate over the selected sheets for sheet_in, xldata in xls_data.items(): print " - Processing input sheet '{0}'".format(sheet_in) # Clear NaN data xldata.fillna(0, inplace=True) lxldata = len(xldata) # Length of the xldata frame npout_fields = list(clist) # Copy of the corr list # Extract GPS start coordinates xls_coo = xldata[[slng, slat]].values # Evaluate the date range based on user selection meas_year = dt.strptime(str(xldata[year].irow(0)), "%Y") if period == "year": dates = [dt.strptime(str(d), "%Y%m%d") for d in xldata[dtest].values] max_date = max(dates) min_date = max(dates) - relativedelta(years=1) elif period == "winter": min_date = meas_year - relativedelta(months=3) max_date = meas_year + relativedelta(months=3) elif period == "all": min_date = dt.min max_date = dt.max else: exit("\nERROR -> The defined period ({0}) is not allowed".format(period)) if period == "all": print " - Using all available dates." else: print " - Range of dates ({0}): {1} -> {2}".format(period, dt.strftime(min_date, "%Y-%m-%d"), dt.strftime(max_date, "%Y-%m-%d")) # For each SAR amplitude file, extract the amplitude values at the GPS if amp_in is not None: # Find dates to process and define processing method dates = process_date_range(sar_stack, min_date, max_date, differential, prefix='A') if dates is None: exit("\nERROR -> No dates found inside {}".format(sar_stack)) # Process amplitude files diff_fields = [] diff_dict = {} for f in sar_stack: # Skip if date is out of range processing = dates[f]['processing'] if processing == 'skip': continue # Open SAR amplitude raster files if verbose: print " - Extracting amplitude values from '{0}'".format(f) tf = gdal.Open(f) if sar_srs.ImportFromWkt(tf.GetProjectionRef()) != 0: exit("\nERROR -> Error importing the projection information from '{0}'.".format(f)) xls2sar = prjpnt(xls_srs, sar_srs) geo = tf.GetGeoTransform() tfb = tf.GetRasterBand(1) sar_ndval = tfb.GetNoDataValue() lng_max = tfb.XSize - 1 lat_max = tfb.YSize - 1 # Loop through the coordinates in the xls amp = np.zeros(lxldata) for i in range(lxldata): # Convert xls coordinates to raster coordinates [lng, lat] = xls2sar.prj_coo(xls_coo[i]) # Calculate pixel location in raster coordinates plng = int((lng - geo[0]) / geo[1]) plat = int((lat - geo[3]) / geo[5]) # Check if it's inside the raster file. # It it is, gather amplitude. if plng < 0 or plng > lng_max or plat < 0 or plat > lat_max: amp[i] = np.nan else: amp[i] = tfb.ReadAsArray(plng, plat, 1, 1) # Check if amplitude is valid. if amp[i] == 0 or amp[i] == ndval or amp[i] == sar_ndval: amp[i] = np.nan # Mark row as bad if the corr value is negative or zero if np.any(xldata[clist].irow(i) < 0): amp[i] = np.nan # Close gdal handles tfb = None tf = None # If differential processing and differential frame, just # store the data. if differential: # If differential frame if processing == 'differential': prev_amp = amp.copy() continue # If regular frame, store data date = dates[f]['date'] xldata[date] = amp npout_fields.append(date) # If differential processing if differential: months = dates[f]['months'] if months != 0: diff_name = 'D' + date + "({})".format(months) diff_dict[diff_name] = (amp - prev_amp) / months diff_fields.append(diff_name) prev_amp = amp.copy() # If differential processing, add data if differential: # Insert differential data in xdata after amplitude xldata = pd.concat((xldata, pd.DataFrame(diff_dict)), axis=1) # Append field names to npout npout_fields.extend(diff_fields) # Remove rows marked as to be removed if not keep_bad: xldata.dropna(inplace=True) xldata.reset_index(drop=True, inplace=True) xls_coo = xldata[[slng, slat]].values lxldata = len(xldata) # If shapefile need to be analyzed if shp_in is not None: # Clear the xls data outside the shapefile bounding box shp_extent = shp.getExtent() xldata = xldata[xldata[slng] >= shp_extent[0]] xldata = xldata[xldata[slng] <= shp_extent[2]] xldata = xldata[xldata[slat] >= shp_extent[1]] xldata = xldata[xldata[slat] <= shp_extent[3]] xldata.reset_index(drop=True, inplace=True) xls_coo = xldata[[slng, slat]].values lxldata = len(xldata) # Look into the shapefile for the nearest point print " - Looking for neighbors in shapefile" neigh, neigh_idx = kdt.query(xls_coo) print " - Extracting SqueeSAR values from neighbors" n_shp = shp_dat.iloc[neigh_idx] # Extract processing information dates = process_date_range(n_shp.columns.values, min_date, max_date, differential, prefix='D') if dates is None: exit("\nERROR -> No dates found inside {}".format(n_shp.columns.values)) # Process the shapefile fields diff_fields = [] diff_dict = {} for lbl in n_shp.columns.values: # for date in n_shp.filter(regex='D[0-9]{8}').columns.values: # Process the date fields if re.search("D[0-9]{8}", lbl): # Skip if date is out of range processing = dates[lbl]['processing'] if processing == 'skip': continue # Get displacement data and store it disp = n_shp[lbl].values # If differential processing and differential frame, just # store the data. if differential: if processing == "differential": prev_disp = disp.copy() continue # If regular frame, store data xldata[lbl] = disp npout_fields.append(lbl) # If differential processing if differential: months = dates[lbl]['months'] if months != 0: diff_name = 'D' + lbl + "({})".format(months) diff_dict[diff_name] = (disp - prev_disp) / months diff_fields.append(diff_name) prev_disp = disp.copy() else: xldata[lbl] = n_shp[lbl].values npout_fields.append(lbl) # If differential processing, add data if differential: # Insert differential data in xdata after amplitude xldata = pd.concat((xldata, pd.DataFrame(diff_dict)), axis=1) # Append field names to npout npout_fields.extend(diff_fields) # Calculate the approximate distance between the points lt1 = np.radians(np.asarray(xls_coo[:, 1])) lt2 = np.radians(xldata[shp_y_lbl].values) ln1 = np.radians(np.asarray(xls_coo[:, 0])) ln2 = np.radians(xldata[shp_x_lbl].values) x = (ln2 - ln1) * np.cos(0.5 * (lt2 + lt1)) y = lt2 - lt1 dist = 6371000. * np.sqrt(x*x + y*y) xldata['Aprx. Distance (m)'] = dist npout_fields.append('Aprx. Distance (m)') # Add the TS information to output dataframe if ts_in is not None: # Look into the shapefile for the nearest point print " - Extracting temporary scatter values" # Initialize temporary dictionary to hold data ts_dict = dict.fromkeys(ts_stack, []) # For each file store the data corresponding to the xls coordinates for f in ts_stack: if verbose is True: print " - Extracting TS values from '{0}'".format(f) tf = gdal.Open(f) if ts_srs.ImportFromWkt(tf.GetProjectionRef()) != 0: exit("\nERROR -> Error importing the projection information from '{0}'.".format(f)) xls2ts = prjpnt(xls_srs, ts_srs) geo = tf.GetGeoTransform() tfb = tf.GetRasterBand(1) ts_ndval = tfb.GetNoDataValue() lng_max = tfb.XSize - 1 lat_max = tfb.YSize - 1 # Loop through the coordinates ts_val = np.zeros(lxldata) for i in range(lxldata): # Convert xls coordinates to TS raster coordinates [lng, lat] = xls2ts.prj_coo(xls_coo[i]) # Calculate pixel location in TS raster coordinates plng = int((lng - geo[0]) / geo[1]) plat = int((lat - geo[3]) / geo[5]) # Check if it's inside the raster file. # If it is, gather TS values if plng < 0 or plng > lng_max or plat < 0 or plat > lat_max: ts_val[i] = np.nan else: ts_val[i] = float(tfb.ReadAsArray(plng, plat, 1, 1)) # Check if the value is valid if ts_val[i] == ts_ndval or ts_val[i] == ndval: ts_val[i] = np.nan # Store values in temporary dictionary ts_dict[f] = ts_val # Close gdal handles tfb = None tf = None # TODO: For keys that can be included within other keys (for # example 'VEL' and 'VEL_STDEV') it is assumed that they are # provided in 'contains' order: 'VEL_STDEV' should come before # 'VEL'. See if there is a way to automatically accomplish this # without expecting the user to specify them in order. # Associate files to user provided keys ts_k_dict = {} ts_set = set(ts_stack) for k in ts_keys: ts_k_dict[k] = [f for f in ts_set if f.count(k)] ts_set = ts_set - set(ts_k_dict[k]) # Merge data and add to output dataframe for k, v in ts_k_dict.iteritems(): xldata["TS_" + k] = np.nan * np.ones(lxldata) npout_fields.append("TS_" + k) for f in v: xldata["TS_" + k] = np.fmax(xldata["TS_" + k], ts_dict[f]) # Remove bad data if not keep_bad: xldata.dropna(inplace=True) xldata.reset_index(drop=True, inplace=True) xls_coo = xldata[[slng, slat]].values # Extract subarray to pickle and convert to CSV npout = xldata[npout_fields].copy() # If the user selected 'CCI Class' if ccl is not None: # Function to map CCI values to classes def cci2class(cci): ccicl = np.zeros_like(cci) ccicl[cci <= 100] = 4 ccicl[cci < 90] = 3 ccicl[cci < 70] = 2 ccicl[cci < 60] = 1 ccicl[cci < 50] = 0 return ccicl npout.columns.values[ccl] = 'CCI Class' cci = npout.loc[:, 'CCI Class'].values npout.loc[:, 'CCI Class'] = cci2class(cci) # Add xls coordinates to output dataframe npout.insert(0, 'XLS Longitude', xls_coo[:, 0]) npout.insert(1, 'XLS Latitude', xls_coo[:, 1]) # Define basic name for output files name = prepend + dt.strftime(meas_year, "%Y") + "_" + sheet_in + "_" + period + af + sf + tsf + dif + cf # Store table in pickle file if pkfile is None: npkl = name + ".pkl" else: npkl = pkfile pth = join(dirname(xls_in), npkl) print " - Saving pickled dataframe to '{0}'".format(pth) npout.to_pickle(pth) # Store table in CSV file if csvfile is None: ncsv = name + ".csv" else: ncsv = csvfile pth = join(dirname(xls_in), ncsv) print " - Saving CSV dataframe to '{0}'".format(pth) npout.to_csv(pth, index=False) # Write to the corresponding sheet in new file print " - Writing to output sheet: '{0}'".format(sheet_in) xldata.to_excel(writer, sheet_name=sheet_in, index=False) # Save and close xls file writer.save() writer.close()