def station_annual_anomalies(data):
    """Pair each station record in *data* with its annual anomaly series.

    *data* should be a v2.mean file.  The result is a generator of
    ``(uid, annual_anomalies)`` pairs, one per record read from *data*,
    where *uid* is the station record identifier (12-digit).  Pass the
    result to ``dict()`` to obtain a mapping from identifier to series.

    The records are read with ``year_min=1880``, so all the data series
    start in 1880.
    """

    from code import series
    from gio import GHCNV2Reader

    def annual_pair(rec):
        # monthly_annual returns a pair; only its second element
        # (the annual anomalies) is kept.
        return (rec.uid, series.monthly_annual(rec.series)[1])

    # The reader is created immediately; the records themselves are
    # only consumed as the returned generator is iterated.
    reader = GHCNV2Reader(data, year_min=1880)
    return (annual_pair(rec) for rec in reader)
def station_annual_anomalies(data):
    """Pair each station record in *data* with its annual anomaly series.

    *data* should be a v2.mean file.  The result is a generator of
    ``(uid, annual_anomalies)`` pairs, one per record read from *data*,
    where *uid* is the station record identifier (12-digit).  Pass the
    result to ``dict()`` to obtain a mapping from identifier to series.

    The records are read with ``year_min=1880``, so all the data series
    start in 1880.
    """

    from code import series
    from gio import GHCNV2Reader

    def annual_pair(rec):
        # monthly_annual returns a pair; only its second element
        # (the annual anomalies) is kept.
        return (rec.uid, series.monthly_annual(rec.series)[1])

    # The reader is created immediately; the records themselves are
    # only consumed as the returned generator is iterated.
    reader = GHCNV2Reader(data, year_min=1880)
    return (annual_pair(rec) for rec in reader)
def do_combine(stream, log, select_func, combine_func):
    """Drive record combination.

    This is a filter driver function used by ``comb_records`` and
    ``comb_pieces``.  It is a generator: for each station it yields the
    combined records, followed by the single leftover record if exactly
    one remains uncombined.

    :Param stream:
        The stream of records to filter.  Consecutive records sharing
        the same ``station_uid`` are treated as one station.
    :Param log:
        Open log file.
    :Param select_func:
        A function to call to select the 'best' record from a
        collection of records (belonging to the same station).
    :Param combine_func:
        A function to call to perform the data combining.  It is
        handed the set of remaining records.

    """

    for station_id, group in itertools.groupby(
            stream, lambda rec: rec.station_uid):
        log.write('%s\n' % station_id)

        # Gather the station's records, annotating each one with its
        # annual mean and annual anomalies as we go.
        records = set()
        for rec in group:
            records.add(rec)
            ann_mean, ann_anoms = series.monthly_annual(rec.series)
            rec.set_ann_anoms(ann_anoms)
            rec.ann_mean = ann_mean

        begin, end = records_begin_end(records)
        years = end - begin + 1

        # Keep combining while there is more than one record left.
        while len(records) > 1:
            best = select_func(records)
            records.remove(best)
            sums, wgts = fresh_arrays(best, years)
            log.write("\t%s %s %s -- %s\n" % (
                best.uid,
                best.first_valid_year(),
                best.last_valid_year(),
                best.source))
            combine_func(sums, wgts, begin, records, log, best.uid)
            best.set_series(begin * 12 + 1, average(sums, wgts))
            yield best

        # Exactly one record left over (nothing to combine it with):
        # yield it unchanged.
        if records:
            yield records.pop()
def get_longest_overlap(target, begin, records):
    """Find the record in the *records* set that has the longest
    overlap with the *target* by considering annual anomalies.

    *target* is a sequence of monthly values starting in the year
    *begin*.  Each record's ``ann_anoms`` sequence is compared,
    position by position, with the annual anomalies of *target*; a year
    counts towards the overlap when both values are valid.

    A triple (record, diff, overlap) is returned; *diff* is the average
    difference in annual temperature (annual mean plus anomaly) between
    *record* and *target* (positive when *record* is higher); *overlap*
    is the number of years in the overlap.  Even when there is no
    overlap _some_ record is returned and in that case *diff* is None
    and *overlap* is 0.

    *begin* is not used here; the positional comparison relies on every
    record starting in the same year as *target*.  *records* must not
    be empty.
    """

    # Annual mean, and annual anomaly sequence, of the target.
    target_mean, target_anoms = series.monthly_annual(target)

    # :todo: the records are consulted in an essentially arbitrary
    # order (which depends on the implementation), but the order
    # may affect the result.  Tie breaks go to the last record consulted.
    # For exact compatibility with previous versions, we create a
    # temporary dict.
    by_uid = dict((rec.uid, rec) for rec in records)

    longest = 0
    diff = None
    for candidate in by_uid.values():
        pairs = [(theirs, ours)
                 for theirs, ours in zip(candidate.ann_anoms, target_anoms)
                 if valid(theirs) and valid(ours)]
        n_common = len(pairs)
        # ">=" so that a tie replaces the current best record.
        if n_common >= longest:
            longest = n_common
            best_record = candidate
            if pairs:
                total = sum((candidate.ann_mean + theirs)
                            - (target_mean + ours)
                            for theirs, ours in pairs)
                diff = total / n_common
    return best_record, diff, longest
def asdict(arg, inp, mode, axes, offset=None, scale=None):
    """Extract records from `inp` and return them as a dictionary.

    `arg` should be a list of 11-digit station identifiers or 12-digit
    record identifiers.  The result maps each record identifier found
    to a (data, begin, axis) triple, where *axis* is the entry of
    `axes` paired with the identifier in `arg` that matched.

    If `mode` is 'anom' then data are converted to monthly anomalies;
    if `mode` is 'annual' then data are converted to annual anomalies
    (using the GISTEMP algorithm that copes with missing months).

    *offset* can be used to offset each station: it is consumed in
    step with `arg`, one entry per station, and each entry is applied
    to the station's data with ``apply_data_offset``.  All of the
    duplicates for a given station are offset by the same amount.  The
    visual effect is to displace stations upward (if the offset is
    positive).  When *offset* is omitted (or empty) every station gets
    an offset of 0.0.

    *scale* is passed through to ``from_lines``.
    """

    # Clear Climate Code, tool directory
    import ghcnm_index
    # Clear Climate Code
    from code import series

    index = ghcnm_index.File(inp)
    if not offset:
        offset = [0.0] * len(arg)

    result = {}
    for ident, axis, bias in zip(arg, axes, offset):
        for record_id, rows in index.get(ident):
            data, begin = from_lines(rows, scale)
            if mode == 'anom':
                # Only the call's effect on *data* is used; its
                # return value is ignored.
                series.anomalize(data, None)
            elif mode == 'annual':
                _mean, data = series.monthly_annual(data)
            data = apply_data_offset(data, bias)
            result[record_id] = (data, begin, axis)
    return result
def asdict(arg, inp, mode, axes, offset=None, scale=0.1):
    """Extract records from `inp` and return them as a dictionary.

    `arg` should be a list of 11-digit station identifiers or 12-digit
    record identifiers.  The result maps each record identifier found
    to a (data, begin, axis) triple, where *axis* is the entry of
    `axes` paired with the identifier in `arg` that matched.

    If `mode` is 'anom' then data are converted to monthly anomalies;
    if `mode` is 'annual' then data are converted to annual anomalies
    (using the GISTEMP algorithm that copes with missing months).

    *offset* can be used to offset each station: it is consumed in
    step with `arg`, one entry per station, and each entry is applied
    to the station's data with ``apply_data_offset``.  All of the
    duplicates for a given station are offset by the same amount.  The
    visual effect is to displace stations upward (if the offset is
    positive).  When *offset* is omitted (or empty) every station gets
    an offset of 0.0.

    *scale* (default 0.1) is passed through to ``from_lines``.
    """

    # Clear Climate Code, tool directory
    import v2index
    # Clear Climate Code
    from code import series

    index = v2index.File(inp)
    if not offset:
        offset = [0.0] * len(arg)

    result = {}
    for ident, axis, bias in zip(arg, axes, offset):
        for record_id, rows in index.get(ident):
            data, begin = from_lines(rows, scale)
            if mode == 'anom':
                # Only the call's effect on *data* is used; its
                # return value is ignored.
                series.anomalize(data, None)
            elif mode == 'annual':
                _mean, data = series.monthly_annual(data)
            data = apply_data_offset(data, bias)
            result[record_id] = (data, begin, axis)
    return result
def find_quintuples(sums, wgts, record, new_id, log):
    """Decide whether *record* agrees well enough with the combined
    data, in a window around the middle of their common period, to be
    combined with it.

    The *sums* and *wgts* arrays are assumed to begin in the same
    year as *record*.  *new_id* is only used to label the log output,
    which is written to *log*.

    Returns a boolean: True when, for some radius up to
    ``parameters.station_combine_bucket_radius``, both series have at
    least ``parameters.station_combine_min_mid_years`` valid annual
    anomalies around the middle year *and* their average annual
    temperatures over that window differ by less than ``ann_std_dev``
    (the ``sigma`` of the combined annual anomalies).  False otherwise.
    """

    # An identifier common to all the log output.
    logid = "%s %s" % (new_id, record.uid)

    rec_begin = record.first_valid_year()
    rec_end = record.last_valid_year()

    actual_begin, actual_end = get_actual_endpoints(wgts, record.first_year)

    max_begin = max(actual_begin, rec_begin)
    min_end = min(actual_end, rec_end)
    # Since max_begin and min_end are integers, this rounds fractional
    # middle years up.
    middle_year = int(.5 * (max_begin + min_end) + 0.5)
    # Index of the middle year within the annual series.
    offset = (middle_year - record.first_year)
    log.write("max begin: %s\tmin end: %s\n" % (max_begin, min_end))

    new_data = average(sums, wgts)
    new_ann_mean, new_ann_anoms = series.monthly_annual(new_data)
    ann_std_dev = sigma(new_ann_anoms)
    log.write("ann_std_dev = %s\n" % ann_std_dev)

    rec_ann_anoms = record.ann_anoms
    rec_ann_mean = record.ann_mean

    # Whether we have an "overlap" or not.  We have an "overlap" if
    # within *rad* years either side of *middle_year* both records have
    # *parameters.station_combine_min_mid_years* valid annual anomalies.
    ov_success = False
    # The overlap is "okay" when the difference in annual temperature is
    # below a certain threshold.
    okay_flag = False

    # Bound up front so that the "counts" log line below is well defined
    # even when the radius loop runs zero times (bucket radius < 1).
    new_middle = []
    rec_middle = []

    min_mid_years = parameters.station_combine_min_mid_years
    for rad in range(1, parameters.station_combine_bucket_radius + 1):
        # For the two series, get data from -rad to rad (inclusive)
        # around the middle year.  The lower bound is clamped at 0 so
        # that a negative index does not wrap around.
        base = max(0, offset - rad)
        limit = offset + rad + 1
        new_middle = [x for x in new_ann_anoms[base:limit] if valid(x)]
        rec_middle = [x for x in rec_ann_anoms[base:limit] if valid(x)]
        if (len(new_middle) >= min_mid_years
                and len(rec_middle) >= min_mid_years):
            log.write("overlap success: %s\n" % logid)
            ov_success = True
            # Compare average annual temperatures (mean + anomaly)
            # over the window.
            avg1 = sum(anom + new_ann_mean for anom in new_middle) / float(
                len(new_middle))
            avg2 = sum(anom + rec_ann_mean for anom in rec_middle) / float(
                len(rec_middle))
            diff = abs(avg1 - avg2)
            log.write("diff = %s\n" % diff)
            if diff < ann_std_dev:
                okay_flag = True
                log.write("combination success: %s\n" % logid)
            else:
                log.write("combination failure: %s\n" % logid)
            # The first radius that gives an overlap decides the outcome.
            break

    if not ov_success:
        log.write("overlap failure: %s\n" % logid)
    log.write("counts: %d %d\n" % (len(new_middle), len(rec_middle)))
    return okay_flag