def testDataset2(self):
    """Dataset with no missing values: mean/std computed over every value."""
    # -9999 is the missing-value placeholder; dataset2 contains none of them.
    mean = compute_mean(self.dataset2, -9999)
    # assertEqual, not the deprecated assertEquals alias (removed in Py3.12).
    self.assertEqual(mean, 45.0)
    std = compute_std(self.dataset2, -9999)
    # std is irrational; compare to 4 decimal places via an absolute delta.
    self.assertAlmostEqual(std, 18.7082, delta=1e-3)
def testDataset1(self):
    """Dataset with one missing value: the -9999 entry must be skipped."""
    mean = compute_mean(self.dataset1, -9999)
    # assertEqual, not the deprecated assertEquals alias (removed in Py3.12).
    self.assertEqual(mean, 40.0)
    std = compute_std(self.dataset1, -9999)
    # std is irrational; compare to 4 decimal places via an absolute delta.
    self.assertAlmostEqual(std, 18.2574, delta=1e-3)
def testStd1(self):
    """Shouldn't be able to compute std if less than 2 values.

    With a single valid value, compute_std must fall back to returning
    the missing-value placeholder rather than a (meaningless) std.
    """
    miss = -9999
    data = [3.3]
    # valid=True asserts the data is already screened for missing values.
    std = compute_std(data, miss, valid=True)
    # assertEqual, not the deprecated assertEquals alias (removed in Py3.12).
    self.assertEqual(std, miss)
def testStd2(self):
    """Should return the missing value if empty dataset given."""
    miss = -9999
    data = []
    std = compute_std(data, miss)
    # assertEqual, not the deprecated assertEquals alias (removed in Py3.12).
    self.assertEqual(std, miss)
def find_correlations(
    cand_series, series_dict, neighborhood, corrlim=0.1,
    begyr=1900, endyr=2010, minpair=14, numcorr=20, **kwargs
):
    """Find which neighboring stations correlate well with a candidate.

    For every neighbor in `neighborhood`, align the candidate's and the
    neighbor's monthly series (dropping months missing in either), take
    first differences, and compute the correlation coefficient.  Neighbors
    with r > `corrlim` are kept; then, if more than `numcorr` neighbors
    survive, the weakest ones are pruned back — but only where pruning
    would not drop the per-month count of paired observations below
    `minpair`.

    Parameters
    ----------
    cand_series : series object for the candidate station; must expose
        .coop_id, .years, .series, .monthly_series and .MISSING_VAL.
    series_dict : maps coop_id -> series object for each neighbor.
    neighborhood : iterable of (coop_id, distance) pairs, assumed sorted
        by distance (the distance itself is unused here).
    corrlim : minimum first-difference correlation to keep a neighbor.
    begyr, endyr : year span; only used for nmonths = (endyr-begyr)*12.
    minpair : per-month floor on the number of candidate/neighbor pairs.
    numcorr : target maximum number of correlated neighbors to retain.
    kwargs : ignored; accepted so a shared parameter dict can be passed.

    Returns
    -------
    dict mapping coop_id -> correlation coefficient for the retained
    neighbors only.

    NOTE(review): `MISS` and `cand_data` are first bound inside the
    neighbor loop, so an empty `neighborhood` would raise NameError
    further down — presumably callers always pass >= 1 neighbor; confirm.
    """
    corr_dict = dict()

    coop_id1 = cand_series.coop_id
    print "...%s" % coop_id1

    # Distances are ignored; we only need the neighbor station ids.
    neighbors = [id for id, dist in neighborhood]
    for coop_id2 in neighbors:
        print "......%s" % coop_id2
        neighb_series = series_dict[coop_id2]

        # Get the data for these series. Note that up until this point,
        # we haven't corrected for the fact that temperature data is reported
        # in tenths of a degree in the USHCN database. Let's go ahead and
        # correct that factor; it turns out that if you don't, the correlation
        # doesn't work correctly. Note that computing anomalies is a linear
        # operation, so it doesn't matter for the math so far that we've used
        # tenths of a degree instead of whole degrees.
        cand_data = [val * 0.1 for val in cand_series.monthly_series]
        neighb_data = [val * 0.1 for val in neighb_series.monthly_series]

        # We SHOULD have read the same years of data, and have equal lengths
        # of data series.
        assert cand_series.years == neighb_series.years
        assert len(cand_series.series) == len(neighb_series.series)

        # What is the missing value placeholder? Correct for being in tenths
        # of a degree.
        MISS = cand_series.MISSING_VAL * 0.1

        # Align the candidate and network series by looping through every
        # value, and choosing only months where BOTH a candidate and neighbor
        # value are present. If either or both are missing, skip that month
        # and go on to the next.
        print ".........Aligning cand/neighb series"
        cand_align, neighb_align = [], []
        for (cand_val, neighb_val) in zip(cand_data, neighb_data):
            if cand_val != MISS and neighb_val != MISS:
                cand_align.append(cand_val)
                neighb_align.append(neighb_val)
        assert len(cand_align) == len(neighb_align)

        # We perform the correlation test on a first-difference series, so
        # compute that now. See util.compute_first_diff() for information on
        # what this operation entails.
        print ".........Computing first differences"
        cand_dif = compute_first_diff(cand_align, MISS)
        neighb_dif = compute_first_diff(neighb_align, MISS)

        # Now, we can actually compute the correlation coefficient. Again,
        # see util.compute_corr() for info on this mathematical operation.
        print ".........Computing correlation coefficient"
        r = compute_corr(cand_dif, neighb_dif, MISS, aligned=True)
        # r = compute_corr(cand_align, neighb_align, MISS)
        cand_std = compute_std(cand_dif, MISS)
        neighb_std = compute_std(neighb_dif, MISS)

        # If the correlation is above a threshold, we will keep it. In the
        # ushcn_corr_2004.v3 code, this threshold is 0.10.
        # NOTE(review): `if r:` treats r == 0.0 (and None) as "no
        # correlation"; the corrlim filter itself is applied later.
        if r:
            print " %1.3f %3.3f %3.3f" % (r, cand_std, neighb_std)
            corr_dict[coop_id2] = r
        else:
            print " poor or no correlation"

    # Rank the surviving neighbors by correlation, best first, and keep
    # only those above the corrlim threshold.
    sort_corrs = sorted(corr_dict.iteritems(), key=itemgetter(1),
                        reverse=True)
    good_corrs = [coop_id2 for (coop_id2, r) in sort_corrs if r > corrlim]

    # Per-month bookkeeping over the full analysis period:
    #   ksum[imo]   - running count of neighbors with data that month
    #   jsum[imo]   - snapshot of ksum before any pruning (for reporting)
    #   lowtoo[imo] - flag: month was already below minpair before pruning
    nmonths = (endyr - begyr) * 12
    ksum = [0] * nmonths
    jsum = [0] * nmonths
    lowtoo = [0] * nmonths
    kstns = 0

    # Determine ksum[nmonths], the number of neighbor data available to use
    # in homogenizing data for this station at each month
    # NOTE(review): cand_data/MISS here are the tenths-corrected values
    # from the LAST neighbor-loop iteration; cand_data is the same for
    # every neighbor, so this is safe as long as the loop ran at least once.
    for imo in xrange(nmonths):
        if cand_data[imo] != MISS:
            for (k, coop_id2) in zip(xrange(len(good_corrs)), good_corrs):
                neighb_series = series_dict[coop_id2]
                neighb_data = neighb_series.monthly_series
                if neighb_data[imo] != neighb_series.MISSING_VAL:
                    ksum[imo] = ksum[imo] + 1
                    # kstns ends up as the index of the last good neighbor
                    # seen with data, i.e. (number of stations - 1).
                    kstns = k
            # Snapshot the pre-pruning count for the final report.
            jsum[imo] = ksum[imo] * 1
            if ksum[imo] < minpair:
                # NOTE(review): prints literal 1900 rather than begyr —
                # wrong label if begyr != 1900; confirm and fix upstream.
                print " Total less than minpair: ", coop_id1, \
                    1900 + (imo / 12), 1 + (imo % 12)
                lowtoo[imo] = 1

    # If we have more neighbors than necessary, then let's see if we can adjust
    # the numbers somewhat to bolster the amount of data in low-info periods,
    # being careful not too delete other good data.
    useful_neighbors = [n for n in good_corrs]
    # jstns: pre-pruning station count (kept for the summary print below).
    jstns = kstns * 1
    if kstns > numcorr - 1:
        # Walk the candidates worst-correlation-first when deciding removal.
        good_corrs.reverse()
        for (k, coop_id2) in zip(xrange(len(good_corrs)), good_corrs):
            iremove = 1
            npair = 0
            neighb_series = series_dict[coop_id2]
            neighb_data = neighb_series.monthly_series
            imonths = xrange(nmonths)
            CMISS, NMISS = MISS, neighb_series.MISSING_VAL
            # NOTE(review): neighb_data is the RAW monthly series here
            # (no *0.1), so it is compared against the raw MISSING_VAL,
            # unlike cand_data which is tenths-corrected vs CMISS.
            iter_head = zip(imonths, cand_data, neighb_data)
            for (imo, c, n) in [(imo, c, n) for (imo, c, n) in iter_head]:
                if (c != CMISS) and (n != NMISS):
                    npair = npair + 1
                    # Removing this neighbor would push the month at or
                    # below the minpair floor -> this neighbor must stay.
                    if ksum[imo] <= minpair:
                        print " Cannot remove:", coop_id1, "-", coop_id2, \
                            1900 + (imo / 12), 1 + (imo % 12), ksum[imo], \
                            lowtoo[imo]
                        iremove = 0
                        break
            if iremove == 1:
                # Only prune while we still exceed the numcorr target.
                if kstns >= numcorr - 1:
                    print " Remove:", coop_id1, "-", coop_id2, npair, \
                        corr_dict[coop_id2]
                    kstns = kstns - 1
                    useful_neighbors.remove(coop_id2)
                    # Deduct this neighbor from every month it contributed to.
                    for imo in xrange(nmonths):
                        if cand_data[imo] != CMISS and \
                                neighb_data[imo] != NMISS:
                            ksum[imo] = ksum[imo] - 1

    # Report the before/after neighbor counts for each month with any data.
    for imo in xrange(nmonths):
        if jsum[imo] > 0:
            print "Original-Final:", coop_id1, 1900 + (imo / 12), \
                1 + (imo % 12), jsum[imo], ksum[imo]
    print "Original-Final Number stns:", coop_id1, jstns, kstns

    # Now, we know which neighbors a) are highly correlated with this station,
    # and b) add information where it is scarce in the temperature record.
    # (Python 2: .keys() returns a list, so deleting while iterating is safe.)
    for coop_id2 in corr_dict.keys():
        if not coop_id2 in useful_neighbors:
            del corr_dict[coop_id2]

    return corr_dict