def parse(self, stations, years, fields):
    '''
    Pass in some stations and years. For convenience (and to match
    CRN's data output) we'll only deal with data in complete years.

    Builds one DataObject per station and returns them all in a
    DataObjectCollection.

    stations -- iterable of station identifiers; each is handed to
                self._download and self._filename.
    years    -- iterable of years; one file is read per (station, year).
    fields   -- names of the fields to keep. When falsy, every entry of
                self.fields is kept except the ones in useless_fields.

    Every line of every file is whitespace-split and passed to
    DataObject.append together with self.fields. After all years of a
    station are loaded, each series is replaced by the result of
    interpolate_forward_backward(ts, missing_values).
    NOTE(review): missing_values is not defined in this function; it is
    presumably a module-level name -- confirm.
    '''
    # Fields we know we don't care about (only used when the caller did
    # not ask for specific fields). Built once instead of per iteration.
    useless_fields = ['WBANNO', 'UTC_DATE', 'UTC_TIME', 'LST_DATE',
                      'LST_TIME', 'CRX_VN', 'SUR_TEMP_TYPE']

    # First make sure we've got the data locally:
    self._download(stations, years)

    doc = DataObjectCollection()
    for station in stations:
        do = DataObject()
        for field in self.fields:
            if fields:
                wanted = field in fields
            else:
                wanted = field not in useless_fields
            if wanted:
                do[field] = TimeSeries([])

        for year in years:
            # The context manager closes each yearly file as soon as it
            # has been read; previously the handles were never closed.
            with open(self.storage_dir + self._filename(station, year)) as f:
                for line in f:
                    do.append(line.split(), self.fields)

        for ts in do.values():
            ts.replace_data(interpolate_forward_backward(ts, missing_values))
        doc.append(do)
    return doc
def parse(self, stations, years, fields):
    '''
    Pass in some stations and years. For convenience (and to match
    CRN's data output) we'll only deal with data in complete years.

    Returns a DataObjectCollection containing one DataObject per
    station.

    stations -- iterable of station identifiers (forwarded to
                self._download and self._filename).
    years    -- iterable of years; the file for every (station, year)
                pair is read.
    fields   -- field names to keep; if falsy, all of self.fields are
                kept apart from the entries of useless_fields.

    NOTE(review): missing_values is referenced but not defined here;
    presumably a module-level name -- confirm.
    '''
    # Skip some fields we know we don't care about. This list never
    # changes, so it is created once rather than inside the field loop.
    useless_fields = [
        'WBANNO', 'UTC_DATE', 'UTC_TIME', 'LST_DATE', 'LST_TIME',
        'CRX_VN', 'SUR_TEMP_TYPE'
    ]

    # First make sure we've got the data locally:
    self._download(stations, years)

    doc = DataObjectCollection()
    for station in stations:
        do = DataObject()
        for field in self.fields:
            if (fields and field not in fields) or (
                    not fields and field in useless_fields):
                continue
            do[field] = TimeSeries([])

        for year in years:
            path = self.storage_dir + self._filename(station, year)
            # "with" guarantees the file is closed even if a line fails
            # to parse; the file object used to be left open.
            with open(path) as f:
                for line in f:
                    values = line.split()
                    do.append(values, self.fields)

        # Fill gaps only after every year of this station is loaded.
        for ts in do.values():
            ts.replace_data(
                interpolate_forward_backward(ts, missing_values))
        doc.append(do)
    return doc
def parse(self, listofdicts):
    '''
    Turn a sequence of dict-like objects into a DataObjectCollection.

    Each element of listofdicts becomes one DataObject whose entries are
    copied from the element's items(); the DataObjects are appended to
    the collection in input order.
    '''
    def _to_data_object(mapping):
        # Copy every key/value pair of one mapping into a new DataObject.
        obj = DataObject()
        for name, value in mapping.items():
            obj[name] = value
        return obj

    collection = DataObjectCollection()
    for mapping in listofdicts:
        collection.append(_to_data_object(mapping))
    return collection
def parse(self, sineslist):
    '''
    Convert a sequence of {name: samples} mappings into a
    DataObjectCollection.

    Every mapping yields one DataObject; every value is wrapped in a
    TimeSeries whose sample_rate is set to 1 before it is stored under
    its key.
    '''
    def _as_series(samples):
        # Wrap raw samples and pin the sample rate to 1.
        series = TimeSeries(samples)
        series.sample_rate = 1
        return series

    collection = DataObjectCollection()
    for sine_map in sineslist:
        data_object = DataObject()
        for name, samples in sine_map.items():
            data_object[name] = _as_series(samples)
        collection.append(data_object)
    return collection
def parse(self, sines):
    '''
    Wrap a single {name: samples} mapping in a DataObjectCollection.

    All entries of sines go into one DataObject, each as a TimeSeries
    with sample_rate set to 1; the collection returned holds just that
    one DataObject.
    '''
    data_object = DataObject()
    collection = DataObjectCollection()
    for name, samples in sines.items():
        series = TimeSeries(samples)
        series.sample_rate = 1
        # The original author left this assignment disabled:
        # series.rangex = (-1,1)
        data_object[name] = series
    collection.append(data_object)
    return collection
def parse(self):
    '''
    Read one line from self.file, evaluate it as a Python expression and
    wrap the result in a DataObjectCollection.

    The evaluated value is iterated; each element must provide items()
    yielding (variable name, values) pairs. Every element becomes a
    DataObject whose entries are TimeSeries built from the values.

    Returns a DataObjectCollection created with sample_rate=1 / 3.0.
    '''
    # NOTE(review): eval() executes whatever expression the file
    # contains. If the file can come from an untrusted source, or only
    # ever holds plain literals, ast.literal_eval would be the safer
    # choice -- confirm the file format before switching.
    d = eval(self.file.readline().strip())

    doc = DataObjectCollection(sample_rate=1 / 3.0)
    # The positional index of each element was never used, so iterate
    # the elements directly.
    for octant in d:
        do = DataObject()
        for varname, values in octant.items():
            do[varname] = TimeSeries(values)
        doc.append(do)
    return doc
def test_doc_imposes_sample_rate():
    '''
    A DataObject fetched back from a DataObjectCollection built with
    sample_rate=60 must itself report a sample_rate of 60.
    '''
    expected_rate = 60

    # The DataObject starts out without any explicit sample rate...
    member = DataObject()

    # ...and is then placed in a collection that has one.
    collection = DataObjectCollection(sample_rate=expected_rate)
    collection.append(member)

    assert collection[0].sample_rate == expected_rate
def test_datamapper_1():
    '''
    Store a value in a TimeSeries inside a DataObject inside a
    DataObjectCollection, then read it back through all three levels.
    '''
    payload = 'datapoint'

    # TimeSeries -> DataObject
    series = TimeSeries([payload], sample_rate=60)
    data_object = DataObject()
    data_object['somedata'] = series
    assert data_object.keys() == ['somedata']

    # DataObject -> DataObjectCollection
    collection = DataObjectCollection()
    collection.append(data_object)

    # Walk back down: collection -> object -> series -> datapoint
    assert collection[0]['somedata'][0] == payload
def parse(self, input_filename, num_buoys=4, criterion_function=record_length,
          interpolation_function=interpolate_forward_backward, start=None,
          end=None, maxlines=None, print_heap=False):
    '''
    Parse a file from the Global Drifter buoy program.
    Keeps the num_buoys buoys that most closely match the criterion function
    (eg longest record, closest to some latitude, closest to some lat/long pair).
    Each buoy becomes a DataObject.

    input_filename         -- path of the data file to read.
    num_buoys              -- maximum number of buoys kept.
    criterion_function     -- called with a buoy's DataObject; its return
                              value ranks the buoy. Once num_buoys buoys
                              are held, the one with the smallest value
                              is dropped whenever a new buoy is pushed.
    interpolation_function -- called as f(timeseries, missing_values);
                              its result is passed to replace_data.
    start, end             -- optional datetimes; observations before
                              start or after end are skipped.
    maxlines               -- optional; reading stops at the first line
                              whose zero-based index exceeds maxlines.
    print_heap             -- when true, print the heap's criterion
                              values each time a push evicts a buoy with
                              a different criterion value.

    Returns a DataObjectCollection (sample_rate=1.0 / 360), or None when
    no buoy with data was kept. Each DataObject carries 'LAT', 'LON' and
    'TEMP' series and 'buoy_id', 'start' and 'end' metadata.

    NOTE(review): missing_values is not defined in this function;
    presumably a module-level name -- confirm.
    '''
    # Metadata for global drifter program:
    # VE and VN are eastward and northward velocity. SPD is speed. Last 3
    # are variance. Do I care about any of them?
    #
    # ID MM DD YY LAT LON TEMP VE VN SPD VAR. LAT VAR. LON VAR. TEMP
    #                     Deg C CM/S CM/S CM/S
    #
    # Note: file is very large (2+ GB)
    # Files can be obtained from ftp://ftp.aoml.noaa.gov/phod/pub/buoydata/
    # and must be gunzipped despite the odd .dat-gz suffix.
    column_names = 'ID MM DD YY LAT LON TEMP VE VN SPD VAR_LAT VAR_LON VAR_TEMP'.split()

    def _getDataObject():
        '''
        Convenience method to return a DataObject initialized to fit
        the buoy data.
        '''
        # The real buoy id is written into the metadata just before the
        # object is pushed onto the heap; use None as the placeholder
        # (this used to store the builtin function `id` by mistake).
        do = DataObject(metadata={'buoy_id': None})
        for key in ['LAT', 'LON', 'TEMP']:
            do[key] = TimeSeries([])
        return do

    def _push_to_heap(data, curdata):
        '''Push curdata onto the bounded heap unless it holds no data.'''
        # make sure curdata isn't empty:
        ts = list(curdata.values())[0]
        if not ts:
            return
        heapindex = criterion_function(curdata)
        # NOTE(review): entries are (heapindex, DataObject) tuples, so
        # two buoys with equal heapindex are ordered by comparing the
        # DataObjects themselves.
        if len(data) >= num_buoys:
            popped = heappushpop(data, (heapindex, curdata))
            if print_heap and heapindex != popped[0]:
                # Single-argument print(...) produces the same output
                # under the print statement and the print function.
                print('pushing %s' % (heapindex,))
                print('popping %s' % (popped[0],))
                print('')
                print('now:')
                for v in data:
                    print('  %s' % (v[0],))
                print('')
        else:
            # Still building our heap to the size we want
            heappush(data, (heapindex, curdata))

    data = []  # treat as heapq
    buoy_id = None
    curdata = _getDataObject()
    with open(input_filename) as input_file:
        for line_number, line in enumerate(input_file):
            if maxlines and line_number > maxlines:
                break
            splitline = line.split()
            if not splitline:
                continue  # blank line

            new_id = splitline[0]  # buoy_id for this line
            if new_id != buoy_id:
                # Have we moved on to a new buoy?
                if curdata:
                    curdata.metadata['buoy_id'] = buoy_id
                    _push_to_heap(data, curdata)
                buoy_id = new_id
                curdata = _getDataObject()

            # Start by stuffing all the data for this observation into a
            # dict. zip() pairs columns with names without reusing the
            # line counter; columns beyond the known names are ignored.
            temp_data_dict = dict(zip(column_names, splitline))

            # But we don't want to save all of it (there's a bunch of
            # stuff we don't care about). So we pick through it for the
            # stuff we want, parsing and transforming as necessary.
            # Right now they're all strings.

            # Date/time first
            # Day of month plus time of day is represented like:
            # 3.75 (3rd day, 3/4 of the way through)
            day_time = float(temp_data_dict['DD'])
            day = int(day_time)
            percent_of_day = day_time - day
            hour = int(24 * percent_of_day)  # leaves us with 0, 6, 12, or 18
            year = int(temp_data_dict['YY'])
            month = int(temp_data_dict['MM'])
            date_time = datetime(year, month, day, hour)

            if start and date_time < start:
                continue
            if end and date_time > end:
                continue

            # preserve first and last datetimes
            if 'start' not in curdata.metadata:
                curdata.metadata['start'] = date_time
            curdata.metadata['end'] = date_time

            curdata['LAT'].append(float(temp_data_dict['LAT']))
            curdata['LON'].append(float(temp_data_dict['LON']))
            curdata['TEMP'].append(float(temp_data_dict['TEMP']))

    # We hit EOF; push the current data
    curdata.metadata['buoy_id'] = buoy_id
    _push_to_heap(data, curdata)

    doc = DataObjectCollection(sample_rate=1.0 / 360)  # 1 sample per six hours
    for _, do in data:  # _ is the heap index
        doc.append(do)

    # Saner to return None than an empty DOC
    try:
        first_series = list(doc[0].values())[0]
    except IndexError:
        return None
    if not first_series:
        return None

    # interpolate
    for do in doc:
        for ts in do.values():
            ts.replace_data(interpolation_function(ts, missing_values))
    return doc