def test_event_offset():
    from numpy import asarray, rec
    data = [1, 2, 3]
    assert_array_equal(util.event_offset(data, 1), [2, 3, 4])
    assert_array_equal(util.event_offset(asarray(data), 2), [3, 4, 5])
    marked = rec.fromarrays((data, ['a', 'b', 'c']),
                            names=('start', 'names'))
    assert_array_equal(util.event_offset(marked, 1)['start'], [2, 3, 4])
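A hedged sketch of what the `util.event_offset` helper exercised above could look like, inferred only from the assertions in the test; it is not the project's actual implementation.

# Assumption: behaviour reconstructed from the test above, not from `util` itself.
from numpy import asarray


def event_offset(events, offset):
    """Shift plain event times, or the 'start' field of a record array."""
    events = asarray(events)
    if events.dtype.names and 'start' in events.dtype.names:
        shifted = events.copy()
        shifted['start'] = shifted['start'] + offset
        return shifted
    return events + offset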
def recarray_from_file(source, ifo=None, columns=None, loudest=False):
    """Read a `GWRecArray` from a PyCBC live HDF5 file
    """
    # read HDF5 file
    if isinstance(source, CacheEntry):
        source = source.path
    if isinstance(source, str):
        h5f = source = h5py.File(source, 'r')
        opened = True
    else:
        opened = False
    # find group
    if isinstance(source, h5py.File):
        if ifo is None:
            try:
                ifo, = list(source)
            except ValueError as e:
                e.args = ("PyCBC live HDF5 file contains multiple IFO groups, "
                          "please select ifo manually",)
                raise
        try:
            source = source[ifo]
        except KeyError as e:
            e.args = ("No group for ifo %r in PyCBC live HDF5 file" % ifo,)
            raise
    # at this stage, 'source' should be an HDF5 group in the pycbc live format
    if columns is None:
        columns = [c for c in source if c not in INVALID_COLUMNS]
    names, data = zip(*[(k, source[k][:]) for k in source if k in columns])
    names = list(map(str, names))
    if loudest:
        # recover only the 'loudest' events
        loudest = source['loudest'][:]
        data = [d[loudest] for d in data]
    else:
        data = list(data)
    # calculate new_snr on-the-fly
    if 'new_snr' in columns and 'new_snr' not in source:
        # get columns needed for newsnr
        snr = data[names.index('snr')]
        rchisq = data[names.index('chisq')]  # chisq is already reduced
        # calculate and append to column list
        data.append(get_new_snr(snr, rchisq))
        names.append('new_snr')
    # calculate mchirp
    if 'mchirp' in columns and 'mchirp' not in source:
        mass1 = data[names.index('mass1')]
        mass2 = data[names.index('mass2')]
        data.append(get_mchirp(mass1, mass2))
        names.append('mchirp')
    # read columns into numpy recarray
    out = rec.fromarrays(data, names=map(str, names)).view(GWRecArray)
    if 'end_time' in columns:
        out.sort(order='end_time')
    if opened:
        h5f.close()
    return out
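A hedged usage sketch for the reader above; the file name, the 'H1' group and the column list are illustrative assumptions, not part of any real dataset.

# Hypothetical call; file name, ifo and columns are assumptions for illustration.
events = recarray_from_file('H1-PYCBC_LIVE-1187008882-4.hdf', ifo='H1',
                            columns=['end_time', 'snr', 'chisq', 'new_snr'])
print(len(events), events['new_snr'].max())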
def is_sorted(self):
    idx_cols = list(self.schema.idx)
    if len(idx_cols) == 1:
        arr = self[idx_cols[0]]
        return all(arr[1:] >= arr[:-1])
    # Multi-column index: fall back on argsort
    arr = rec.fromarrays([self[n] for n in idx_cols], names=idx_cols)
    sort_mask = self.argsort()
    a_range = arange(len(sort_mask))
    return all(sort_mask == a_range)
def test_select(self):
    f0 = arange(10, dtype=int32)
    f1 = arange(10, dtype=float64)
    irregular = rec.fromarrays([f0, f1])
    f0 = irregular['f0']
    f1 = irregular['f1']
    i0 = evaluate('f0 < 5')
    i1 = evaluate('f1 < 5')
    assert_array_equal(f0[i0], arange(5, dtype=int32))
    assert_array_equal(f1[i1], arange(5, dtype=float64))
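Worth noting for the test above: when `names` is omitted, `numpy.rec.fromarrays` auto-generates field names 'f0', 'f1', ..., which is why `irregular['f0']` works. A minimal standalone illustration:

import numpy as np

a = np.arange(3, dtype=np.int32)
b = np.arange(3, dtype=np.float64)
r = np.rec.fromarrays([a, b])   # no names: fields default to 'f0', 'f1'
print(r.dtype.names)            # ('f0', 'f1')
print(r.f1)                     # [0. 1. 2.]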
def dict_list_to_frame(dict_list):
    df = pd.DataFrame(dict_list)
    d0 = dict(df.iloc[0])
    goodkeys = [k for k in d0.keys()
                if type(d0[k]) != fits.card.Undefined]
    df = df[goodkeys]
    # comb = pdplus.df_to_rec_strings(df)
    dfs = df.select_dtypes(include=['object'])
    dfns = df.select_dtypes(exclude=['object'])
    dfs = rec.fromarrays(np.array(dfs).astype('S100').T,
                         names=list(dfs.columns))
    names = list(dfns.columns)
    arrs = [dfns[n] for n in names]
    comb = mlab.rec_append_fields(dfs, names, arrs)
    return comb
def extrema(x, max=True, min=True, withend=True):
    """
    Return indexes, values, and sign of curvature of local extrema of 1-d array.

    The boolean arguments max, min, withend determine whether to include
    maxima and minima, and include the endpoints.

    Basic usage.

    >>> x = [2, 1, 0, 1, 2]
    >>> extrema(x)  # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
    rec.array([(0, 2, -1), (2, 0, 1), (4, 2, -1)],
        dtype=[('index', '<i...'), ('value', '<i...'), ('curv', '<i...')])

    Options to include only certain types of extrema.

    >>> extrema(x, withend=False)
    rec.array([(2, 0, 1)],...
    >>> extrema(x, max=False)
    rec.array([(2, 0, 1)],...
    >>> extrema(x, min=False)
    rec.array([(0, 2, -1), (4, 2, -1)],...
    >>> extrema(x, max=False, min=False)
    rec.array([],...

    The beginning and end of flat segments both count as extrema, except
    the first and last data point.

    >>> extrema([0, 0, 1, 1, 2, 2])
    rec.array([(1, 0, 1), (2, 1, -1), (3, 1, 1), (4, 2, -1)],...
    >>> extrema([0, 0, 0])
    rec.array([],...)
    >>> extrema([0, 0, 1, 1], withend=False)
    rec.array([(1, 0, 1), (2, 1, -1)],...

    @todo: Add options on how to handle flat segments.
    """
    x = squeeze(x)  # ensure 1-d numpy array
    xpad = r_[x[1], x, x[-2]]  # pad x so endpoints become minima or maxima
    curv = sign(diff(sign(diff(xpad))))  # +1 at minima, -1 at maxima
    i = curv.nonzero()[0]  # nonzero() wraps the indices in a 1-tuple
    ext = rec.fromarrays([i, x[i], curv[i]], names=["index", "value", "curv"])
    if not withend:
        ext = ext[(i > 0) & (i < len(x) - 1)]
    if not max:
        ext = ext[ext.curv >= 0]
    if not min:
        ext = ext[ext.curv <= 0]
    return ext
def torec(self):
    """
    Returns a recarray, averaging across chains as needed.
    """
    from numpy import sqrt, rec
    fields = ['time', 'p.sd', 'p.mmse']
    values = [self.tgrid, sqrt(_reduce(self.pvar)), _reduce(self.pmmse)]
    if self.nchains > 1:
        fields.append("p.mmse.sd")
        values.append(self.pmmse.std(1))
    for k, v in self.estimates.items():
        fields.append(k)
        values.append(_reduce(v))
        if v.ndim > 1:
            fields.append(k + ".sd")
            values.append(v.std(1))
    return rec.fromarrays(values, names=fields)
def load_recarray(group):
    """Read recarray from the given HDF5 group
    """
    columns = map(str, list(group))
    # read segments
    try:
        epoch = LIGOTimeGPS(group['segments'].attrs['epoch'])
    except TypeError:
        epoch = LIGOTimeGPS(float(group['segments'].attrs['epoch']))
    segments = SegmentList(Segment(epoch + x[0], epoch + x[1])
                           for x in group['segments'][:])
    columns.pop(columns.index('segments'))
    # read columns
    data = [group[c] for c in columns]
    # format and add
    table = rec.fromarrays(data, names=columns).view(GWRecArray)
    add_triggers(table, group.name.split('/')[-1], segments=segments)
    return table
def argsort(self):
    idx_cols = list(self.schema.idx)
    arr = rec.fromarrays([self[n] for n in idx_cols], names=idx_cols)
    # Mergesort is faster on pre-sorted arrays
    return argsort(arr, kind="mergesort")
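The `argsort`/`is_sorted` pair above relies on the fact that a structured array built with `rec.fromarrays` compares records field by field, so sorting it yields a lexicographic ordering over the index columns. A self-contained sketch of that pattern; the column names are made up for illustration:

import numpy as np

city = np.array(['b', 'a', 'a', 'b'])
year = np.array([2021, 2022, 2021, 2020])
keys = np.rec.fromarrays([city, year], names=['city', 'year'])
order = np.argsort(keys, kind='mergesort')  # stable, lexicographic by (city, year)
print(order)                                # [2 1 3 0]
# True only when the rows are already in index order, as in is_sorted() above
print(np.array_equal(order, np.arange(len(keys))))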
def reduce(self, *col_list, **col_dict):
    """
    Return a new frame containing the chosen columns. A column can be one
    of the existing columns or an s-expression that will be automatically
    evaluated.
    """
    # Merge all args in one dict
    columns = dict(zip(col_list, col_list))
    columns.update(col_dict)

    # Detect aggregations
    all_ast = {}
    for alias, expr in columns.items():
        if expr.startswith("("):
            all_ast[alias] = AST.parse(expr)
    agg_ast = {}
    other_ast = {}
    for alias, ast in all_ast.items():
        if ast.is_aggregate():
            agg_ast[alias] = ast
        else:
            other_ast[alias] = ast

    # Eval non-aggregated columns
    env = self.eval_env()
    non_agg = {}
    for alias, expr in columns.items():
        if alias in agg_ast:
            continue
        ast = other_ast.get(alias)
        if ast:
            arr = ast.eval(env=env)
        else:
            arr = self.columns[expr]
        if isinstance(arr, Alias):
            # un-pack alias
            arr, alias = arr.value, arr.name
        non_agg[alias] = arr

    # Early exit if we don't need to compute aggregates
    if not agg_ast:
        schema = Schema.from_frame(non_agg, idx_columns=list(non_agg))
        return Frame(schema, non_agg)

    res = {}
    if non_agg:
        # Compute binning
        records = rec.fromarrays(non_agg.values(), names=list(non_agg))
        keys, bins = unique(records, return_inverse=True)
        # Build resulting columns
        for alias in non_agg:
            arr = keys[alias]
            if isinstance(arr, Alias):
                # un-pack alias
                arr, alias = arr.value, arr.name
            res[alias] = arr
        env.update({"_keys": keys, "_bins": bins})

    # Compute aggregates
    for alias, expr in agg_ast.items():
        arr = expr.eval(env)
        if isinstance(arr, Alias):
            # un-pack alias
            arr, alias = arr.value, arr.name
        # Without bins, eval will return a scalar value
        res[alias] = arr if non_agg else asarray([arr])

    schema = Schema.from_frame(res, idx_columns=list(non_agg))
    return Frame(schema, res)
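The binning step in reduce() above uses a generic NumPy pattern that can be shown in isolation: pack the grouping columns into a structured array with rec.fromarrays, let unique(..., return_inverse=True) assign a bin index to every row, then aggregate per bin (here with bincount as a sum aggregate). This is a standalone sketch of that pattern with made-up data, not the Frame/AST machinery itself.

import numpy as np

city = np.array(['a', 'b', 'a', 'b', 'a'])
year = np.array([2020, 2020, 2021, 2020, 2020])
amount = np.array([1.0, 2.0, 3.0, 4.0, 5.0])

# Composite grouping key as a structured array
keys = np.rec.fromarrays([city, year], names=['city', 'year'])
uniq, bins = np.unique(keys, return_inverse=True)

# Sum `amount` per group
totals = np.bincount(bins, weights=amount)
for key, total in zip(uniq, totals):
    print(key['city'], key['year'], total)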
from numpy import array, rec
from numpy.random import normal as nprandom
from rpy2.robjects import numpy2ri, r

foo = array(range(10))
bar = foo + nprandom(0, 1, 10)

d = rec.fromarrays([foo, bar], names=('foo', 'bar'))
print d

fit = r.lm('bar ~ foo', data=d)
print fit.rx2('coefficients')
def read(self, var, x=None, y=None, radius=0., tlim=None, ylim=None,
        xlim=None, missions=None, sort=True, profile=True):
    """Reads dataset.

    PARAMETERS
        var (string) :
            Variable to be read from dataset. It also accepts special
            naming conventions in order to rename the original dataset
            variable and to load alternative variables in case of invalid
            data, according to the syntax '[new_var_name]:var[|other_var]'.
        x, y (array like, optional) :
            List of zonal and meridional point coordinates of interest.
        radius (float, optional) :
            Search radius in degrees.
        tlim, ylim, xlim (array like, optional) :
            The temporal, meridional and zonal limits (minimum, maximum)
            for which data will be read.
        missions (array like, optional) :
            List of missions to read data from. If omitted, defaults to
            the missions available at dataset class initialization.
        sort (boolean, optional) :
            If true, sorts the data record in order of ascending time,
            latitude and longitude.
        profile (boolean, optional) :
            Sets whether the status is sent to screen.

    RETURNS
        dat (record array) :
            Record time-series of 'time', 'latitude', 'longitude',
            selected variable and 'mission'.

    """
    t0 = time()
    # Checks input parameters.
    T = self.variables['time'].data
    if var.find(':') >= 0:
        # Checks special variable syntax
        var_name, var = var.split(':')
    else:
        var_name = var
    if tlim == None:
        tlim = (T.min(), T.max())
    if (x != None) | (y != None):
        x, y = asarray(x), asarray(y)
        if x.size != y.size:
            raise ValueError('Zonal and meridional coordinate dimensions '
                             'do not match.')
        npoints = x.size
        radius2 = radius ** 2
    else:
        npoints = 0
        x = y = []
    #
    if ylim == None:
        ylim = (-90., 90.)
    if xlim == None:
        xlim = (0., 360.)
    else:
        # Make sure longitude limits are between 0 and 360.
        xlim = list(lon360(asarray(xlim)))
    if missions == None:
        missions = self.params['missions']

    # First we have to select which files will be loaded, which will
    # depend on the temporal limits given in $t$.
    sel_time = flatnonzero((T >= floor(min(tlim))) & (T <= ceil(max(tlim))))
    N = len(sel_time)

    # Second we will walk through each of the selected times in the dataset
    # and load the corresponding file for the available missions.
    t1 = time()
    if profile:
        s = '\rLoading data...'
        stdout.write(s)
        stdout.flush()
    # Reset important variables
    TIME, LAT, LON, VAR, MISSION = [array([])] * 5
    #
    for i, tm in enumerate(T[sel_time]):
        t2 = time()
        for (mission, dset, fname, cycle,
                orbit) in self.attributes['time_dataset'][tm]:
            # Skips mission not in missions list.
            if mission not in missions:
                continue
            # Uncompresses gzipped file and opens NetCDF instance.
            data = self.read_file('%s/%s/%s' % (self.params['path'],
                                                mission, fname))
            # Reads variable from NetCDF file.
            raw_time = self.read_variable(data, 'time')
            raw_lat = self.read_variable(data, 'lat')
            raw_lon = self.read_variable(data, 'lon')
            raw_dat = self.read_variable(data, var)
            # Select relevant data range according to limit parameters
            sel_from_time = ((raw_time >= min(tlim)) &
                             (raw_time <= max(tlim)))
            if (ylim != None) | (xlim != None):
                sel_from_limits = ones(data.dimensions['time'], dtype=bool)
            else:
                sel_from_limits = zeros(data.dimensions['time'], dtype=bool)
            if ylim != None:
                sel_from_limits = (sel_from_limits &
                                   ((raw_lat >= min(ylim)) &
                                    (raw_lat <= max(ylim))))
            if xlim != None:
                sel_from_limits = (sel_from_limits &
                                   ((raw_lon >= min(xlim)) &
                                    (raw_lon <= max(xlim))))
            # Select relevant data according to points and search radius.
            sel_from_radius = zeros(data.dimensions['time'], dtype=bool)
            for xx, yy in zip(x, y):
                distance2 = ((raw_lat - yy) ** 2 +
                             (raw_lon - lon360(xx)) ** 2)
                sel_from_radius = sel_from_radius | (distance2 <= radius2)
            #
            sel_data = flatnonzero(sel_from_time &
                                   (sel_from_limits | sel_from_radius) &
                                   (~isnan(raw_dat)))
            _time = raw_time[sel_data]
            _lat = raw_lat[sel_data]
            _lon = raw_lon[sel_data]
            _dat = raw_dat[sel_data]
            #
            TIME = append(TIME, _time)
            LAT = append(LAT, _lat)
            LON = append(LON, _lon)
            VAR = append(VAR, _dat)
            MISSION = append(MISSION, [mission] * len(sel_data))
            #
            self.close_file(data)
        #
        # Profiling
        if profile:
            s = '\rLoading data... %s ' % (profiler(N, i + 1, t0, t1, t2),)
            stdout.write(s)
            stdout.flush()
    #
    if profile:
        stdout.write('\n')
        stdout.flush()

    # Converts the data to a structured array
    DAT = rec.fromarrays((TIME, LAT, LON, VAR, MISSION),
                         dtype=[('time', float64), ('latitude', float64),
                                ('longitude', float64), (var_name, float64),
                                ('mission', '|S3')])

    # Some data sorting?
    if sort:
        DAT.sort(order=('time', 'latitude', 'longitude'), axis=0)

    return DAT
def read(self, x=None, y=None, radius=0., tlim=None, ylim=None, xlim=None,
        missions=None, sort=True, profile=True):
    """Reads dataset.

    PARAMETERS
        x, y (array like, optional) :
            List of zonal and meridional point coordinates of interest.
        radius (float, optional) :
            Search radius in degrees.
        tlim, ylim, xlim (array like, optional) :
            The temporal, meridional and zonal limits (minimum, maximum)
            for which data will be read.
        missions (array like, optional) :
            List of missions to read data from. If omitted, defaults to
            the missions available at dataset class initialization.
        sort (boolean, optional) :
            If true, sorts the data record in order of ascending time,
            latitude and longitude.
        profile (boolean, optional) :
            Sets whether the status is sent to screen.

    RETURNS
        dat (record array) :
            Record time-series of 'time', 'latitude', 'longitude',
            selected variable and 'mission'.

    """
    t0 = time()
    # Checks input parameters.
    T = self.variables['time'].data
    if tlim == None:
        tlim = (T.min(), T.max())
    if (x != None) | (y != None):
        x, y = asarray(x), asarray(y)
        if x.size != y.size:
            raise ValueError('Zonal and meridional coordinate dimensions '
                             'do not match.')
        npoints = x.size
        radius2 = radius ** 2
    else:
        npoints = 0
        x = y = []
    #
    if ylim == None:
        ylim = (-90., 90.)
    if xlim == None:
        xlim = (0., 360.)
    else:
        # Make sure longitude limits are between 0 and 360.
        xlim = list(lon360(asarray(xlim)))
    if missions == None:
        missions = self.params['missions']

    # Aviso uses time in days since 1950-01-01 00:00:00 UTC, therefore
    # we have to calculate the initial time in matplotlib's format. We
    # also have to determine the proper variable using product name.
    T0 = dates.datestr2num('1950-01-01 00:00:00 UTC')
    var = self.params['product'].upper()

    # First we have to select which files will be loaded, which will
    # depend on the temporal limits given in $t$.
    sel_time = flatnonzero((T >= floor(min(tlim))) & (T <= ceil(max(tlim))))
    N = len(sel_time)

    # Second we will walk through each of the selected times in the dataset
    # and load the corresponding file for the available missions.
    t1 = time()
    if profile:
        s = '\rLoading data...'
        stdout.write(s)
        stdout.flush()
    # Reset important variables
    TIME, LAT, LON, VAR, MISSION = [array([])] * 5
    #
    for i, tm in enumerate(T[sel_time]):
        t2 = time()
        for (mission, fname) in self.attributes['time_dataset'][tm]:
            # Skips mission not in missions list.
            if mission not in missions:
                continue
            # Uncompresses gzipped file and opens NetCDF instance.
            data = self.read_file('%s/%s/%s' % (self.params['path'],
                                                mission, fname))
            # Retrieve the scale factor for each variable
            scale_lat = data.variables['latitude'].scale_factor
            scale_lon = data.variables['longitude'].scale_factor
            scale_dat = data.variables[var].scale_factor
            # Get the raw time, latitude and longitude
            raw_time = data.variables['time'].data + T0
            raw_lat = data.variables['latitude'].data * scale_lat
            raw_lon = data.variables['longitude'].data * scale_lon
            # Select relevant data range according to limit parameters
            sel_from_time = ((raw_time >= min(tlim)) &
                             (raw_time <= max(tlim)))
            sel_from_limits = zeros(data.dimensions['time'], dtype=bool)
            if ylim != None:
                sel_from_limits = (sel_from_limits |
                                   ((raw_lat >= min(ylim)) &
                                    (raw_lat <= max(ylim))))
            if xlim != None:
                sel_from_limits = (sel_from_limits |
                                   ((raw_lon >= min(xlim)) &
                                    (raw_lon <= max(xlim))))
            # Select relevant data according to points and search radius.
            sel_from_radius = zeros(data.dimensions['time'], dtype=bool)
            for xx, yy in zip(x, y):
                distance2 = ((raw_lat - yy) ** 2 +
                             (raw_lon - lon360(xx)) ** 2)
                sel_from_radius = sel_from_radius | (distance2 <= radius2)
            #
            sel_data = flatnonzero(sel_from_time &
                                   (sel_from_limits | sel_from_radius))
            _time = raw_time[sel_data]
            _lat = raw_lat[sel_data]
            _lon = raw_lon[sel_data]
            _dat = data.variables[var].data[sel_data] * scale_dat
            #
            TIME = append(TIME, _time)
            LAT = append(LAT, _lat)
            LON = append(LON, _lon)
            VAR = append(VAR, _dat)
            MISSION = append(MISSION, [mission] * len(sel_data))
            #
            self.close_file(data)
        #
        # Profiling
        if profile:
            s = '\rLoading data... %s ' % (profiler(N, i + 1, t0, t1, t2),)
            stdout.write(s)
            stdout.flush()
    #
    if profile:
        stdout.write('\n')
        stdout.flush()

    # Converts the data to a structured array
    DAT = rec.fromarrays((TIME, LAT, LON, VAR, MISSION),
                         dtype=[('time', float64), ('latitude', float64),
                                ('longitude', float64),
                                (self.params['product'], float64),
                                ('mission', '|S3')])
    # DAT = hstack((TIME[:, None], LAT[:, None], LON[:, None],
    #               VAR[:, None], MISSION[:, None])).view(
    #     dtype=[('time', float64), ('latitude', float64),
    #            ('longitude', float64), (self.params['product'], float64),
    #            ('mission', '|S3')])

    # Some data sorting?
    if sort:
        DAT.sort(order=('time', 'latitude', 'longitude'), axis=0)

    return DAT