def calc_monthly_USHCN_offsets(u_record, g_record):
    """Given a USHCN record `u_record` and a GHCN record `g_record`,
    using any overlapping years, computes a set of 12 monthly offsets
    which can be added to the GHCN record to most closely approximate
    the USHCN record.

    Both records are sampled over the same span of years, from
    parameters.USHCN_offset_start_year to `u_record.last_year`.  For
    each calendar month the year pairs are scanned from the most recent
    backwards; a pair contributes (USHCN - GHCN) when both values are
    valid, and the scan for that month stops once
    parameters.USHCN_offset_max_months pairs have contributed.

    Returns a list of 12 floats; a month with no valid pair keeps the
    offset 0.0.
    """
    start_year = parameters.USHCN_offset_start_year
    u_years = u_record.get_set_of_years(start_year, u_record.last_year)
    g_years = g_record.get_set_of_years(start_year, u_record.last_year)
    # Materialise the pairs before reversing: reversed() needs a
    # sequence, and on Python 3 zip() only returns an iterator.
    reversed_year_pairs = list(zip(u_years, g_years))
    reversed_year_pairs.reverse()

    diffs = [0.0] * 12
    for month in range(12):
        # Named *total* so the builtin sum() is not shadowed.
        total = 0.0
        count = 0
        for u_year, g_year in reversed_year_pairs:
            u_temp, g_temp = u_year[month], g_year[month]
            if valid(u_temp) and valid(g_temp):
                total += u_temp - g_temp
                count += 1
                if count == parameters.USHCN_offset_max_months:
                    break
        if count > 0:
            diffs[month] = total / count
    return diffs
def record_correlation(s, t, overlap=300):
    """Return the correlation between the monthly anomalies of the
    records *s* and *t*, computed over the months where both have
    valid anomalies.

    The two records must start in the same year (this is asserted).
    Their series are copied before being anomalized, so the records
    themselves are not modified here.  When fewer than *overlap*
    common months exist, None is returned.
    """
    assert s.first_year == t.first_year
    s_anoms = list(s.series)
    t_anoms = list(t.series)
    # anomalize() works in place on the copies.
    series.anomalize(s_anoms)
    series.anomalize(t_anoms)
    shared = []
    for s_val, t_val in zip(s_anoms, t_anoms):
        if valid(s_val) and valid(t_val):
            shared.append((s_val, t_val))
    if len(shared) >= overlap:
        # Transpose the list of pairs into two parallel sequences.
        return correlation.pearson(*zip(*shared))
    return None
def get_longest_overlap(target, begin, records):
    """Find the record in *records* whose annual anomalies overlap
    those of *target* for the greatest number of years.

    *target* is a sequence of monthly values starting in the year
    *begin* (*begin* itself is not used in the body).  A triple
    (record, diff, overlap) is returned: *diff* is the average
    difference in annual values (annual mean plus anomaly) between
    *record* and *target*, positive when *record* is higher; *overlap*
    is the number of years in common.  Even when there is no overlap
    _some_ record is returned, and in that case *diff* is None and
    *overlap* is 0.

    *records* must not be empty: the returned record is only ever
    assigned inside the loop over the records.
    """
    # Annual mean, and annual anomaly sequence, of the target.
    mean, anoms = series.monthly_annual(target)
    overlap = 0
    diff = None
    # :todo: the records are consulted in an essentially arbitrary
    # order (which depends on the implementation), but the order may
    # affect the result.  Ties go to the last record consulted.  For
    # exact compatibility with previous versions the records are
    # iterated via a temporary dict keyed on uid.
    by_uid = dict((candidate.uid, candidate) for candidate in records)
    for record in by_uid.values():
        pairs = []
        for rec_anom, anom in zip(record.ann_anoms, anoms):
            if valid(rec_anom) and valid(anom):
                pairs.append((rec_anom, anom))
        npairs = len(pairs)
        # ">=" (not ">") so that a tie replaces the previous best.
        if npairs >= overlap:
            overlap = npairs
            best_record = record
            total = sum((record.ann_mean+rec_anom) - (mean+anom)
                        for rec_anom, anom in pairs)
            if pairs:
                diff = total / len(pairs)
    return best_record, diff, overlap
def fresh_arrays(record, years):
    """Make and return a fresh pair of lists: (*sums*, *wgts*).

    Each list has length 12 * *years*; *record* must not be longer
    than that (this is asserted).  Both lists start at the same month
    as *record*.  *sums* holds the record's values with invalid
    entries turned into zero; *wgts* holds 1 where the record has a
    valid value and 0 elsewhere.  Months beyond the end of the record
    are 0.0 in *sums* and 0 in *wgts*.
    """
    total_months = 12 * years
    # Number of months in the record.
    record_months = len(record)
    assert record_months <= total_months

    # Multiplying by the validity flag zeroes the invalid entries.
    data = [valid(value) * value for value in record.series]
    sums = [0.0] * total_months
    sums[:record_months] = data

    flags = [int(valid(value)) for value in record.series]
    wgts = [0] * total_months
    wgts[:record_months] = flags

    return sums, wgts
def adj(t, d):
    """Return *t* with *d* subtracted when *t* is valid; an invalid
    *t* is returned unchanged."""
    return t - d if valid(t) else t
def stationvalidmonths(record):
    """Return the set of months for which the record has valid data.

    Each month is a number: the index of the value in `record.series`
    plus `record.first_month - 1`.  (Per the original documentation,
    this encodes january of year 1 as 12.)
    """
    origin = record.first_month - 1
    months = set()
    for index, value in enumerate(record.series):
        if valid(value):
            months.add(origin + index)
    return months
def find_quintuples(sums, wgts, record, new_id, log):
    """Decide whether *record* agrees with the combined series held in
    *sums* and *wgts* around a middle year common to both.

    The *sums* and *wgts* arrays are assumed to begin in the same year
    as *record*.  Progress and the outcome are written to *log*.
    Returns a boolean: True only when an overlap was found and the
    difference between the two mid-period averages is below the
    standard deviation of the combined series' annual anomalies.
    """
    # An identifier common to the per-pair log output.
    logid = "%s %s" % (new_id, record.uid)

    rec_begin = record.first_valid_year()
    rec_end = record.last_valid_year()
    actual_begin, actual_end = get_actual_endpoints(wgts, record.first_year)

    # The period covered by both the combined data and the record.
    max_begin = max(actual_begin, rec_begin)
    min_end = min(actual_end, rec_end)
    # Since max_begin and min_end are integers, this rounds fractional
    # middle years up.
    middle_year = int(.5 * (max_begin + min_end) + 0.5)
    # Index of the middle year within the annual anomaly sequences.
    offset = middle_year - record.first_year
    log.write("max begin: %s\tmin end: %s\n" % (max_begin, min_end))

    combined = average(sums, wgts)
    new_ann_mean, new_ann_anoms = series.monthly_annual(combined)
    ann_std_dev = sigma(new_ann_anoms)
    log.write("ann_std_dev = %s\n" % ann_std_dev)
    rec_ann_anoms = record.ann_anoms
    rec_ann_mean = record.ann_mean

    # The combination is "okay" when an overlap exists and the
    # difference in annual temperature is below a certain threshold.
    okay_flag = False
    # Widen the window one year at a time.  We have an "overlap" as
    # soon as, within *radius* years either side of the middle year,
    # both series have at least
    # parameters.station_combine_min_mid_years valid annual anomalies.
    for radius in range(1, parameters.station_combine_bucket_radius + 1):
        # Window from -radius to +radius (inclusive) around the middle
        # year, clipped at the start of the series.
        lo = max(0, offset - radius)
        hi = offset + radius + 1
        new_window = [x for x in new_ann_anoms[lo:hi] if valid(x)]
        rec_window = [x for x in rec_ann_anoms[lo:hi] if valid(x)]
        if (len(new_window) < parameters.station_combine_min_mid_years
                or len(rec_window) <
                parameters.station_combine_min_mid_years):
            continue

        log.write("overlap success: %s\n" % logid)
        new_avg = sum(anom+new_ann_mean for anom in new_window) / float(
            len(new_window))
        rec_avg = sum(anom+rec_ann_mean for anom in rec_window) / float(
            len(rec_window))
        diff = abs(new_avg - rec_avg)
        log.write("diff = %s\n" % diff)
        if diff < ann_std_dev:
            okay_flag = True
            log.write("combination success: %s\n" % logid)
        else:
            log.write("combination failure: %s\n" % logid)
        # Only the first radius that gives an overlap is considered.
        break
    else:
        # No radius produced an overlap.
        log.write("overlap failure: %s\n" % logid)
    # Counts of valid years in the last window examined.
    log.write("counts: %d %d\n" % (len(new_window), len(rec_window)))
    return okay_flag