def testDataset2(self):
     """Mean and standard deviation of a dataset with no missing values."""
     # -9999 is handed to both helpers as their second argument; the sibling
     # tests in this class treat that value as the missing-data placeholder.
     mean = compute_mean(self.dataset2, -9999)
     # assertEqual replaces the deprecated assertEquals alias, which was
     # removed from unittest in Python 3.12.
     self.assertEqual(mean, 45.0)

     std = compute_std(self.dataset2, -9999)
     # The reference value is rounded to four decimals, so compare with an
     # absolute tolerance instead of exact equality.
     self.assertAlmostEqual(std, 18.7082, delta=1e-3)
 def testDataset1(self):
     """Mean and standard deviation of a dataset with one missing value."""
     # -9999 is handed to both helpers as their second argument; the sibling
     # tests in this class treat that value as the missing-data placeholder.
     mean = compute_mean(self.dataset1, -9999)
     # assertEqual replaces the deprecated assertEquals alias, which was
     # removed from unittest in Python 3.12.
     self.assertEqual(mean, 40.0)

     std = compute_std(self.dataset1, -9999)
     # The reference value is rounded to four decimals, so compare with an
     # absolute tolerance instead of exact equality.
     self.assertAlmostEqual(std, 18.2574, delta=1e-3)
 def testStd1(self):
     """compute_std returns the missing value when given fewer than 2 values."""
     miss = -9999
     data = [3.3]
     # A single observation; the call is made with valid=True and is expected
     # to hand back the placeholder rather than a number.
     std = compute_std(data, miss, valid=True)
     # assertEqual replaces the deprecated assertEquals alias, which was
     # removed from unittest in Python 3.12.
     self.assertEqual(std, miss)
 def testStd2(self):
     """compute_std returns the missing value when given an empty dataset."""
     miss = -9999
     data = []
     std = compute_std(data, miss)
     # assertEqual replaces the deprecated assertEquals alias, which was
     # removed from unittest in Python 3.12.
     self.assertEqual(std, miss)
Example #5
0
def find_correlations(
    cand_series, series_dict, neighborhood, corrlim=0.1, begyr=1900, endyr=2010, minpair=14, numcorr=20, **kwargs
):

    corr_dict = dict()

    coop_id1 = cand_series.coop_id
    print "...%s" % coop_id1
    neighbors = [id for id, dist in neighborhood]

    for coop_id2 in neighbors:
        print "......%s" % coop_id2
        neighb_series = series_dict[coop_id2]

        # Get the data for these series. Note that up until this point,
        # we haven't corrected for the fact that temperature data is reported
        # in tenths of a degree in the USHCN database. Let's go ahead and
        # correct that factor; it turns out that if you don't, the correlation
        # doesn't work correctly. Note that computing anomalies is a linear
        # operation, so it doesn't matter for the math so far that we've used
        # tenths of a degree instead of whole degrees.
        cand_data = [val * 0.1 for val in cand_series.monthly_series]
        neighb_data = [val * 0.1 for val in neighb_series.monthly_series]

        # We SHOULD have read the same years of data, and have equal lengths
        # of data series.
        assert cand_series.years == neighb_series.years
        assert len(cand_series.series) == len(neighb_series.series)

        # What is the missing value placeholder? Correct for being in tenths
        # of a degree.
        MISS = cand_series.MISSING_VAL * 0.1

        # Align the candidate and network series by looping through every
        # value, and choosing only months where BOTH a candidate and neighbor
        # value are present. If either or both are missing, skip that month
        # and go on to the next.
        print ".........Aligning cand/neighb series"
        cand_align, neighb_align = [], []
        for (cand_val, neighb_val) in zip(cand_data, neighb_data):
            if cand_val != MISS and neighb_val != MISS:
                cand_align.append(cand_val)
                neighb_align.append(neighb_val)
        assert len(cand_align) == len(neighb_align)

        # We perform the correlation test on a first-difference series, so
        # compute that now. See util.compute_first_diff() for information on
        # what this operation entails.
        print ".........Computing first differences"
        cand_dif = compute_first_diff(cand_align, MISS)
        neighb_dif = compute_first_diff(neighb_align, MISS)

        # Now, we can actually compute the correlation coefficient. Again,
        # see util.compute_corr() for info on this mathematical operation.
        print ".........Computing correlation coefficient"
        r = compute_corr(cand_dif, neighb_dif, MISS, aligned=True)
        # r = compute_corr(cand_align, neighb_align, MISS)
        cand_std = compute_std(cand_dif, MISS)
        neighb_std = compute_std(neighb_dif, MISS)

        # If the correlation is above a threshold, we will keep it. In the
        # ushcn_corr_2004.v3 code, this threshold is 0.10.
        if r:
            print "            %1.3f %3.3f %3.3f" % (r, cand_std, neighb_std)
            corr_dict[coop_id2] = r

        else:
            print "            poor or no correlation"

    sort_corrs = sorted(corr_dict.iteritems(), key=itemgetter(1), reverse=True)
    good_corrs = [coop_id2 for (coop_id2, r) in sort_corrs if r > corrlim]

    nmonths = (endyr - begyr) * 12
    ksum = [0] * nmonths
    jsum = [0] * nmonths
    lowtoo = [0] * nmonths
    kstns = 0
    # Determine ksum[nmonths], the number of neighbor data available to use
    # in homogenizing data for this station at each month
    for imo in xrange(nmonths):
        if cand_data[imo] != MISS:
            for (k, coop_id2) in zip(xrange(len(good_corrs)), good_corrs):
                neighb_series = series_dict[coop_id2]
                neighb_data = neighb_series.monthly_series
                if neighb_data[imo] != neighb_series.MISSING_VAL:
                    ksum[imo] = ksum[imo] + 1
            kstns = k
            jsum[imo] = ksum[imo] * 1

            if ksum[imo] < minpair:
                print " Total less than minpair: ", coop_id1, 1900 + (imo / 12), 1 + (imo % 12)
                lowtoo[imo] = 1

    # If we have more neighbors than necessary, then let's see if we can adjust
    # the numbers somewhat to bolster the amount of data in low-info periods,
    # being careful not too delete other good data.
    useful_neighbors = [n for n in good_corrs]
    jstns = kstns * 1
    if kstns > numcorr - 1:

        good_corrs.reverse()
        for (k, coop_id2) in zip(xrange(len(good_corrs)), good_corrs):

            iremove = 1
            npair = 0
            neighb_series = series_dict[coop_id2]
            neighb_data = neighb_series.monthly_series

            imonths = xrange(nmonths)
            CMISS, NMISS = MISS, neighb_series.MISSING_VAL

            iter_head = zip(imonths, cand_data, neighb_data)
            for (imo, c, n) in [(imo, c, n) for (imo, c, n) in iter_head]:
                if (c != CMISS) and (n != NMISS):
                    npair = npair + 1
                    if ksum[imo] <= minpair:
                        print " Cannot remove:", coop_id1, "-", coop_id2, 1900 + (imo / 12), 1 + (imo % 12), ksum[
                            imo
                        ], lowtoo[imo]
                        iremove = 0
                        break

            if iremove == 1:
                if kstns >= numcorr - 1:
                    print " Remove:", coop_id1, "-", coop_id2, npair, corr_dict[coop_id2]
                    kstns = kstns - 1
                    useful_neighbors.remove(coop_id2)
                    for imo in xrange(nmonths):
                        if cand_data[imo] != CMISS and neighb_data[imo] != NMISS:
                            ksum[imo] = ksum[imo] - 1

    for imo in xrange(nmonths):
        if jsum[imo] > 0:
            print "Original-Final:", coop_id1, 1900 + (imo / 12), 1 + (imo % 12), jsum[imo], ksum[imo]

    print "Original-Final Number stns:", coop_id1, jstns, kstns

    # Now, we know which neighbors a) are highly correlated with this station,
    # and b) add information where it is scarce in the temperature record.
    for coop_id2 in corr_dict.keys():
        if not coop_id2 in useful_neighbors:
            del corr_dict[coop_id2]

    return corr_dict