Example 1
def estamt(network, minlenshf=24, **hom_params):
    """
    COPIED FROM ucpmonthly.v24a.f:
    
    The major steps in determining the best adjustment value for each station
    and changepoint. Entire network undergoes each of the following processes.
    In order:
        1) Remove unusable data. Align moves with respect to non-missing data
        and compress out changes that are too close AND the data between them.
        
        2) ISTEP=2 processing begins the adjustment process by removing the 
        non-significant changepoints to lengthen segments.
        
        3) NPASS (:= ISTEP=3) finishes the adjustment process by testing for the
        minimum number of months in a segment and number of neighbors with which
        the difference series can be examined.
        
        4) Final adjusted output is written.
    """

    ## FILTER 4
    ## Since the amplitude estimate MUST rely upon a minimum of MINLEN months to
    ## get even close to a reliable estimate at this point, it is assumed that
    ## the changepoints are as good as the station history files. Therefore,
    ## align moves with respect to non-missing data and compress out changes
    ## that are too close AND the data between them (i.e., less than MINLEN
    ## apart)

    # station_list = network.stations.keys()
    all_station_list = network.stations.keys()
    # station_list = ["215887", ]
    station_list = all_station_list

    # for each station...
    for id in station_list:
        station_index = station_list.index(id)
        station_series = network.raw_series[id]
        station_data = station_series.monthly_series[:]
        missing_val = station_series.MISSING_VAL

        # ... gen arrays for alignment
        move, amt, mday = [], [], []
        changepoints = station_series.changepoints
        cps = sorted(changepoints.keys())
        for cp in cps:
            print "  Hist move: ", len(move) + 1, station_index + 1, imo2iym(cp)
            move.append(cp)
            amt.append(changepoints[cp]["jsum"])
            mday.append(31)
        movnum = len(changepoints)

        if movnum > 0:
            ## At this point, the Fortran code executes alignmoves() in
            ## SHAPinp.v6c.f to reconcile the fact that station history files
            ## report dates of moves. It also removes segments that are too
            ## short - less than minlenshf. Instead of implementing alignmoves()
            ## in full, right now I'll only implement this second piece of
            ## functionality.
            # alignmoves()

            ####################################################################
            # Seek to find first and last month indices
            first_set = False
            for month in range(len(station_data)):
                # Skip first year
                if month < 12:
                    continue

                if station_data[month] != missing_val:
                    if not first_set:
                        first = month
                        first_set = True
                    last = month

            cps = sorted(changepoints.keys())
            cps.insert(0, first)
            cps.append(last)
            for (cp1, cp2) in zip(cps[:], cps[1:]):
                if (cp2 - cp1) < minlenshf:
                    months_to_delete = range(cp1 + 1, cp2 + 1)
                    network.raw_series[id].delete_months(months_to_delete)

                    if cp2 == last:
                        del_key = cp1
                    else:
                        del_key = cp2
                    if del_key in network.raw_series[id].changepoints:
                        # print len(network.raw_series[id].changepoints.keys()),
                        del network.raw_series[id].changepoints[del_key]
                        # print len(network.raw_series[id].changepoints.keys()),
                        # raw_input("pause")

                    del_str = "Del 1st segment: " if cp1 == first else "Delete segment: "

                    print id, station_index + 1, del_str, imo2iym(cp1), cp1, imo2iym(cp2), cp2
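                    # For example, with minlenshf=24, neighboring changepoints
                    # at months 100 and 110 bound a segment only 10 months long:
                    # months 101..110 are deleted, and the changepoint at 110 is
                    # dropped (or the one at 100, if 110 is the last valid month).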

            new_changepoints = network.raw_series[id].changepoints
            new_cps = sorted(new_changepoints.keys())
            print "  First data value: ", imo2iym(first)
            for cp in new_cps:
                print "  End seg:", new_cps.index(cp), " ym: ", imo2iym(cp), cp, new_changepoints[cp]["jsum"]
            print "    End segment ym: ", imo2iym(last), last

            # Finally, add first and last value to the list of changepoints.
            first_stats = dict(ahigh=0.0, astd=0.0, jsum=0)
            last_stats = dict(ahigh=0.0, astd=0.0, jsum=0)
            network.raw_series[id].changepoints[first] = first_stats
            network.raw_series[id].changepoints[last] = last_stats
            ####################################################################

    ## Series of debug print statements summarizing the final list of
    ## changepoints. Not necessary at the moment

    ############################################################################
    # The subnetwork processing became a multi-step process plus a "post-process
    # pass" to manage:
    #    1) problems with documented changepoints that have NO undocumented support
    #    2) determining the best amplitude estimate for each confirmed changepoint

    for step in [2, 3]:
        ## Setup output strings based on the step used. Only cosmetic differences
        ## really.

        iminlen = hom_params["minlen"]
        numclim = 3
        ## STEP 1 - NEVER USED (technically, the history-consideration was done previously)
        if step == 1:
            continue
        elif step == 2:
            ## STEP 2 - NOT SIG REMOVAL
            ## equivalent to ipass loopback for istep == 2 in Fortran PHA
            print " ---------------- NOT SIG REMOVAL --------------- "
            tstr = "Not sig: "
            outid = "NS"
            ipass = 1
        elif step == 3:
            ## STEP 3 - ADJUSTMENT OF DISCONTINUITIES
            # equivalent to ipass loopback for istep == 3 in Fortran PHA
            print " ---------------- ADJUST DISCONTINUITY STEP --------------- "
            print "Adjpass, iminlen, numclim", "--", iminlen, numclim
            print " ---------------- NPASS --------------- "
            tstr = "Dstep Dtrend: "
            outid = "WM"
            ipass = ipass + 1

        final_results = dict()
        # remember each station's first valid month so it can be restored at the end
        first_months = dict()
        print "  NET   STN    FILT TECH      ------ AFTER ------    ------ BEFORE ------"
        # Process each station and its network of neighbors
        for id in station_list:
            station_index = station_list.index(id)

            station_cp_dict = network.raw_series[id].changepoints
            sorted_cps = sorted(station_cp_dict.keys())
            ## If there are no breakpoints...
            if not sorted_cps:
                final_results[id] = dict()
                continue

            station_series = network.raw_series[id]
            # refresh this station's data; otherwise station_data still points
            # at the last station processed in the filter loop above
            station_data = station_series.monthly_series[:]
            missing_val = station_series.MISSING_VAL

            # compute monthly anomalies for this station data
            station_anomalies = station_series.monthly_anomaly_series

            # What are the first and last valid months in this station's data set?
            # We've saved them as the first and last changepoint before...
            first = sorted_cps[0]
            last = sorted_cps[-1]
            first_months[id] = first

            # What are the pairs to this station that we need to consider?
            station_pairs = []
            for other_id in all_station_list:
                pair = tuple(sorted([id, other_id]))
                if pair in hom_params["pairs"]:
                    station_pairs.append(pair)
            print station_pairs

            # List the remaining changepoints after the "confirmfilt" process
            for cp in sorted_cps:
                cp_stats = station_cp_dict[cp]
                hit_count = cp_stats["jsum"]
                iy, im = imo2iym(cp)
                print (
                    "%3d %5d %6s Estamt chgin: -- %4d %2d %4d %3d" % (ipass, station_index, id, iy, im, cp, hit_count)
                )

            ## ACCUMULATE PAIRED CHANGEPOINTS AND AMPLITUDE ESTIMATES
            # Loop over "brackets" of changepoints - that is, for changepoints
            # [a, b, c, d], consider the two brackets [a,b,c] and [b,c,d] with
            # the center value of the changepoints. Note that in the Fortran PHA,
            # we go through these brackets in reverse order - right to left.
            brackets = zip(sorted_cps[-3::-1], sorted_cps[-2::-1], sorted_cps[::-1])
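            # A quick sketch of the slicing above: for sorted_cps = [a, b, c, d],
            #   sorted_cps[-3::-1] -> [b, a]
            #   sorted_cps[-2::-1] -> [c, b, a]
            #   sorted_cps[::-1]   -> [d, c, b, a]
            # so brackets = [(b, c, d), (a, b, c)] - (left, cp, right) triples
            # visited from the rightmost changepoint to the leftmost.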
            final_results[id] = dict()
            for bracket in brackets:
                # for bracket in brackets[:1]:
                (left, cp, right) = bracket[:]

                ly, lm = imo2iym(left)
                cpy, cpm = imo2iym(cp)
                ry, rm = imo2iym(right)
                print "Oriented: ", "--", "--", "--", left, cp, cp + 1, right

                # setup the output string for this bracket's tests
                chgptstr = "  Win1: %5d %4d%2d %5d %4d%2d to Win2: %5d %4d%2d %5d %4d%2d" % (
                    left,
                    ly,
                    lm,
                    cp,
                    cpy,
                    cpm,
                    cp,
                    cpy,
                    cpm,
                    right,
                    ry,
                    rm,
                )

                ## THIS SECTION ACCUMULATES TARGET-NEIGHBOR COMPARISONS
                # See if there are enough homogeneous data in the target;
                # check each window
                valid_count_right = len(get_valid_data(station_data[cp + 1 : right + 1], missing_val))
                valid_count_left = len(get_valid_data(station_data[left : cp + 1], missing_val))
                # if the segment length (valid count) is too short, skip this
                # changepoint (for now)
                if valid_count_left < iminlen:
                    print "Adjpass seg2 short ", station_index, id, chgptstr, valid_count_left
                    continue
                if valid_count_right < iminlen:
                    print "Adjpass seg1 short ", station_index, id, chgptstr, valid_count_right
                    continue

                ## We've passed the too-little-data pitfall. Now, we are actually going
                ## to go back through our paired neighbors and compute some final
                ## statistics about these changepoints. We'll store them in a
                ## dictionary for later, just like the pair_results dictionary
                ## from splitmerge
                pair_results = dict()
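                # Assumed layout (matching how it is filled in below):
                # pair_results[neighb_id][changepoint month] = {"adj": ...,
                # "cor": ..., "bic": ..., "cmodel": ..., "trend": ...,
                # "spanob": ...}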
                # for (id1, id2) in [("215887", "200779")]:
                for (id1, id2) in station_pairs:
                    # Reset the left, cp, and right indices to the original
                    # bracket we're considering. We are going to be changing them
                    # while we look at this pair
                    (left, cp, right) = bracket[:]

                    ## Figure out which station is the neighbor (not the target
                    ## we're currently considering). At the same time, note that if
                    ## the target is the 2nd changepoint, the adjustments will be
                    ## flipped in sign, so we need to have a correction factor ready
                    correction = 1.0
                    if id == id1:
                        neighb_id = id2
                    else:
                        neighb_id = id1
                        # correction = -1.0

                    # Add this neighbor to pair_results if it's not already there
                    (ida, idb) = sorted([id1, id2])
                    pair_str = "%s-%s" % (ida, idb)
                    if neighb_id not in pair_results:
                        pair_results[neighb_id] = dict()
                    print pair_str

                    neighb_index = all_station_list.index(neighb_id)
                    neighb_cp_dict = network.raw_series[neighb_id].changepoints

                    neighb_series = network.raw_series[neighb_id]
                    neighb_anomalies = neighb_series.monthly_anomaly_series

                    ## Generate a difference data set for this pair of stations
                    diff_data = diff(station_anomalies, neighb_anomalies)
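                    ## (diff() is assumed to take target-minus-neighbor anomaly
                    ## differences month by month, propagating missing_val
                    ## wherever either series is missing.)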

                    ## It's possible that in the [left, right] bracket we're looking
                    ## at, there's a changepoint in the paired neighbor. We need
                    ## to adjust the endpoints of the bracket to exclude those
                    ## breakpoints
                    # Check right-hand side first and break out if ...
                    right_seg_len = len(get_valid_data(diff_data[cp + 1 : right + 1], missing_val))
                    # right_seg_len = len(diff_data[cp+1:right+1])
                    for month in range(cp + 1, right + 1):
                        if month == last:
                            continue

                        # ... we hit a changepoint in the neighbor ...
                        if month in neighb_cp_dict:
                            neighb_hits = neighb_cp_dict[month]["jsum"]
                            right_seg_len = len(get_valid_data(diff_data[cp + 1 : month + 1], missing_val))
                            # right_seg_len = len(diff_data[cp+1:month+1])
                            print "CHG2: ", neighb_index, neighb_id, \
                                "num,edit,2b,2e,imo,nhits", right_seg_len, "--", \
                                cp + 1, right, month, neighb_hits

                            right = month
                            break
                    # ... and the final right-segment is too short
                    print left, cp, right
                    if right_seg_len < iminlen:
                        print "Low2: ", neighb_index, neighb_id, \
                            "num,edit,2b,2e,imo,nhits", right_seg_len, "--", \
                            cp + 1, right, month, "--"
                        continue

                    # Now, check the left-hand side and break out if ...
                    left_seg_len = len(get_valid_data(diff_data[left : cp + 1], missing_val))
                    for month in range(cp - 1, left, -1):
                        if month == first:
                            continue

                        # ... we hit a changepoint in the neighbor ...
                        if month in neighb_cp_dict:
                            neighb_hits = neighb_cp_dict[month]["jsum"]
                            left_seg_len = len(get_valid_data(diff_data[month:cp], missing_val))
                            # left_seg_len = len(diff_data[month:cp])
                            print "CHG1: ", neighb_index, neighb_id, \
                                "num,edit,1b,1e,imo,nhits", left_seg_len, "--", \
                                cp + 1, left, month, neighb_hits

                            left = month
                            break
                    # ... and the final left-segment is too short
                    if left_seg_len < iminlen:
                        print "Low1: ", neighb_index, neighb_id, \
                            "num,edit,1b,1e,imo,nhits", left_seg_len, "--", \
                            cp + 1, left, month, "--"
                        continue

                    ## We can now estimate the raw changepoint amplitude using minbic.
                    ## However, we'll short-circuit a lot of the work by telling it to only
                    ## use the KTHTPR0 model (simple step-change model)
                    (seg_x, seg_data) = range(left + 1, right + 1), diff_data[left + 1 : right + 1]
                    bp_index = cp - (left + 1)
                    # print left, cp, right, "|", bp_index
                    # print left_seg_len, right_seg_len
                    bic_result = minbic(seg_x, seg_data, bp_index, missing_val, models=[("KTHTPR0", kthtpr0)])
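                    # bic_result is a dict carrying the fitted model's summary;
                    # the keys unpacked below are "cmodel", "bic", "test_stat",
                    # "crit_val", "offset", and "slopes" (left/right segment trends).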
                    ## Also check the first-difference correlation between the
                    ## monthly anomalies
                    station_first_diff = compute_first_diff(station_anomalies, missing_val)
                    neighb_first_diff = compute_first_diff(neighb_anomalies, missing_val)
                    corr = compute_corr(station_first_diff, neighb_first_diff, missing_val)

                    ## Write out the results of this testing process so far
                    cmodel = bic_result["cmodel"]
                    bic = bic_result["bic"]
                    test_stat = bic_result["test_stat"]
                    crit_val = bic_result["crit_val"]
                    offset = bic_result["offset"]
                    slopes = bic_result["slopes"]
                    left_slope, right_slope = slopes
                    print (
                        "%s %6s-%6s %s %7.2f %7.2f %7.2f %7.2f %7.3f %7.3f -- %d --"
                        % (
                            tstr,
                            id,
                            neighb_id,
                            chgptstr,
                            crit_val,
                            test_stat,
                            offset,
                            corr,
                            left_slope,
                            right_slope,
                            right_seg_len,
                        )
                    )

                    ## Analysis is done.
                    ## Keep the adjustment (offset) for each neighbor/segment,
                    ## set/reset trend for each neighbor/segment
                    ##     the first segment is the left-segment,
                    ##     the second segment is the right-segment
                    ##
                    ## Note that we may have shifted left/right above to avoid
                    ## conflicts within the paired neighbor data. However, our
                    ## estimates of trends/offsets associated with the "right"
                    ## adjacent changepoint actually refer to that original right
                    ## changepoint, so we reset left, cp, and right from the
                    ## bracket before continuing
                    (left, cp, right) = bracket[:]
                    # Do the left segment first
                    left_dict = dict()
                    left_dict["adj"] = offset * correction
                    left_dict["cor"] = corr
                    left_dict["bic"] = bic
                    left_dict["cmodel"] = cmodel
                    left_dict["trend"] = left_slope
                    left_dict["spanob"] = left_seg_len
                    pair_results[neighb_id][cp] = left_dict

                    # Do the right segment now
                    right_dict = dict()
                    right_dict["adj"] = offset * correction
                    right_dict["cor"] = corr
                    right_dict["bic"] = bic
                    right_dict["cmodel"] = cmodel
                    right_dict["trend"] = right_slope
                    right_dict["spanob"] = right_seg_len

                    if right not in pair_results[neighb_id]:
                        pair_results[neighb_id][right] = right_dict
                    else:
                        # We've already recorded this segment for a previous
                        # changepoint. Update the trend/spanob entries (length of
                        # the preceding segment) if the slopes differ and the new
                        # segment is longer.
                        new_trend = slopes[1]
                        new_spanob = right_seg_len
                        old_trend = pair_results[neighb_id][right]["trend"]
                        old_spanob = pair_results[neighb_id][right]["spanob"]
                        if old_trend != new_trend:
                            print (
                                " Seg2 diff: %s %4d old: %7.2f %4d new: %7.2f %4d"
                                % (pair_str, right, old_trend, old_spanob, new_trend, new_spanob)
                            )
                            # if the new count is greater than the old one, the slope
                            # is probably more robust so update those entries.
                            if new_spanob > old_spanob:
                                pair_results[neighb_id][right]["trend"] = new_trend
                                pair_results[neighb_id][right]["spanob"] = new_spanob

                    ## We're done with this pair/changepoint. Summary output -
                    if step == 2:
                        print "itarg,ipair,ichg,numc,iqt,adj,trends: -- -- -- --", cmodel, offset, slopes
                # raw_input("pause")

                ####################################################################
                ## ADJUSTMENT DETERMINATION SECTION
                # Recall the paired-changepoint analyses we just performed, and
                # determine if the potential adjustment is statistically valid
                (left, cp, right) = bracket[:]

                pair_data = []
                for neighb_id in pair_results:
                    if cp not in pair_results[neighb_id]:
                        continue

                    cp_stats = pair_results[neighb_id][cp]
                    adjacent_stats = pair_results[neighb_id][right]

                    trends = (cp_stats["trend"], adjacent_stats["trend"])

                    pair_dict = dict(
                        neighb_id=neighb_id, adj=cp_stats["adj"], cor=cp_stats["cor"], trends=trends, used=True
                    )
                    pair_data.append(pair_dict)

                npairs = len(pair_data)
                if npairs < numclim:
                    print "Adjpass numc low --", station_index, id, left, cp, right, npairs
                    continue

                # Process -
                #    1) Remove both adjustment and trend outliers
                #    2) Calculate median adjustment
                #
                #    filter around inter-quartile range
                qscale = hom_params["qscale"]
                pair_data = sorted(pair_data, key=operator.itemgetter("adj"))
                pair_chgs = [p["adj"] for p in pair_data]
                chg_25th, chg_median, chg_75th = tukey_med(pair_chgs)
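                # tukey_med is assumed to mirror the Fortran tukeymed routine,
                # returning the 25th, 50th, and 75th percentiles of its input;
                # e.g. for pair_chgs = [-0.4, -0.1, 0.0, 0.2, 0.5] it would give
                # roughly (-0.1, 0.0, 0.2).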

                chg_iqr = chg_75th - chg_25th
                chg_low = chg_25th - (chg_median - chg_25th) * 1.0 * qscale
                chg_high = chg_75th + (chg_75th - chg_median) * 1.0 * qscale
                print (
                    " TRIM p25, p75, pct50, rng, lo, hi: %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f"
                    % (chg_25th, chg_75th, chg_median, chg_iqr, chg_low, chg_high)
                )
                # If any of the estimated changepoints fall outside the statistically
                # robust range we just computed, flag them as unused when printing
                for data in pair_data:
                    neighb_id = data["neighb_id"]
                    neighb_index = all_station_list.index(neighb_id)
                    adj = data["adj"]
                    cor = data["cor"]
                    trends = data["trends"]

                    if not (chg_low < adj < chg_high):
                        data["used"] = False
                    flag = "U" if data["used"] else "X"
                    print ("%s %4d %7.2f %8.4f %8.4f %7.2f" % (flag, neighb_index, adj, trends[0], trends[1], cor))

                valid_adj_count = len([d for d in pair_data if d["used"]])
                if valid_adj_count < numclim:
                    if step == 2:
                        print (
                            "Insuff trimmed mean -- %4d %s %5d %5d %5d %5d"
                            % (station_index, id, left, cp, right, valid_adj_count)
                        )
                        continue

                ## BUG: The code here re-computes the inter-quartile range by
                ##     scaling qscale by 1.0. Curiously, it doesn't reject any
                ##     pairs based on this new range.
                chg_iqr = chg_75th - chg_25th
                chg_low = chg_25th - (chg_median - chg_25th) * qscale
                chg_high = chg_75th + (chg_75th - chg_median) * qscale

                ## Check whether the computed adjustment is significant: if 0 lies
                ## within the trimmed inter-quartile range we just computed, then
                ## we can't reject the null hypothesis of no step change, and the
                ## adjustment is zeroed out
                if chg_high * chg_low > 0.0:
                    # signs are the same, so 0 isn't included in the range.
                    procstr = "CONSHF"
                    sigadj = chg_median
                else:
                    procstr = "ZERSHF"
                    sigadj = 0.0
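                # e.g. chg_low = 0.12, chg_high = 0.58 -> product > 0, zero lies
                # outside the trimmed range, so the median change is kept (CONSHF);
                # chg_low = -0.15, chg_high = 0.40 -> product < 0 -> ZERSHF.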

                final_results[id][cp] = dict(adj=sigadj, std=chg_iqr * 1.0 * qscale, num=npairs)

                print ("%2d %s-%s %s %7.2f" % (station_index, id, procstr, chgptstr, sigadj))

            ## Print some final output about what changepoints remain for this station
            final_station_results = final_results[id]
            final_cps = sorted(final_station_results.keys())
            for cp in final_cps:
                adj = final_station_results[cp]["adj"]
                std = final_station_results[cp]["std"]

                cp_stats = station_cp_dict[cp]
                hit_count = cp_stats["jsum"]
                iy, im = imo2iym(cp)

                print (
                    "-- %5d %s Estamt chgout: -- %4d%2d %5d %5d %7.2f %7.2f"
                    % (station_index + 1, id, iy, im, cp, hit_count, adj, std)
                )
            # raw_input("pause")

        ## Remove the accumulated non-significant changepoints (non-sig either
        ## because there was too much missing data, the target segment was too
        ## short, or the trimmed mean test could not reject the null hypothesis
        ## of no change)
        for id in station_list:
            station_index = station_list.index(id)

            final_station_results = final_results[id]
            final_cps = sorted(final_station_results.keys())
            for cp in final_cps:
                iy, im = imo2iym(cp)
                cp_index = final_cps.index(cp)
                adj = final_station_results[cp]["adj"]
                std = final_station_results[cp]["std"]
                if adj == 0.0:
                    print ("%s %5d Remove chgpt %5d %4d %2d %4d" % (id, station_index, cp_index, iy, im, cp))
                    del network.raw_series[id].changepoints[cp]
                else:
                    # Update the network's record of changepoints with this new list
                    network.raw_series[id].changepoints[cp]["ahigh"] = adj
                    network.raw_series[id].changepoints[cp]["astd"] = std
            # the changepoint at this station's first month may have been
            # removed during processing; add it back in
            if id in first_months:
                first = first_months[id]
                network.raw_series[id].changepoints[first] = dict(ahigh=0.0, astd=0.0, jsum=0)
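
# The helper routines used above (imo2iym, get_valid_data, diff,
# compute_first_diff, compute_corr, minbic, tukey_med) come from the project's
# util modules and are not shown here. As a rough sketch of the conventions
# they rely on - a 0-based month index counted from January of the base year,
# and a sentinel missing value - two hypothetical reimplementations (not the
# project's actual code) might look like:
#
#     def imo2iym(imo, begyr=1900):
#         # month index -> (year, month); e.g. imo2iym(13) -> (1901, 2)
#         return begyr + imo // 12, 1 + imo % 12
#
#     def get_valid_data(data, missing_val=-9999):
#         # strip the missing-value sentinel from a series
#         return [v for v in data if v != missing_val]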
Example 2
def find_correlations(
    cand_series, series_dict, neighborhood, corrlim=0.1, begyr=1900, endyr=2010, minpair=14, numcorr=20, **kwargs
):

    corr_dict = dict()

    coop_id1 = cand_series.coop_id
    print "...%s" % coop_id1
    neighbors = [id for id, dist in neighborhood]

    for coop_id2 in neighbors:
        print "......%s" % coop_id2
        neighb_series = series_dict[coop_id2]

        # Get the data for these series. Note that up until this point,
        # we haven't corrected for the fact that temperature data is reported
        # in tenths of a degree in the USHCN database. Let's go ahead and
        # correct that factor; it turns out that if you don't, the correlation
        # doesn't work correctly. Note that computing anomalies is a linear
        # operation, so it doesn't matter for the math so far that we've used
        # tenths of a degree instead of whole degrees.
        cand_data = [val * 0.1 for val in cand_series.monthly_series]
        neighb_data = [val * 0.1 for val in neighb_series.monthly_series]

        # We SHOULD have read the same years of data, and have equal lengths
        # of data series.
        assert cand_series.years == neighb_series.years
        assert len(cand_series.series) == len(neighb_series.series)

        # What is the missing value placeholder? Correct for being in tenths
        # of a degree.
        MISS = cand_series.MISSING_VAL * 0.1

        # Align the candidate and network series by looping through every
        # value, and choosing only months where BOTH a candidate and neighbor
        # value are present. If either or both are missing, skip that month
        # and go on to the next.
        print ".........Aligning cand/neighb series"
        cand_align, neighb_align = [], []
        for (cand_val, neighb_val) in zip(cand_data, neighb_data):
            if cand_val != MISS and neighb_val != MISS:
                cand_align.append(cand_val)
                neighb_align.append(neighb_val)
        assert len(cand_align) == len(neighb_align)

        # We perform the correlation test on a first-difference series, so
        # compute that now. See util.compute_first_diff() for information on
        # what this operation entails.
        print ".........Computing first differences"
        cand_dif = compute_first_diff(cand_align, MISS)
        neighb_dif = compute_first_diff(neighb_align, MISS)
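        # (Assuming compute_first_diff takes simple successive differences,
        # e.g. [1.0, 1.5, 1.2] -> [0.5, -0.3], skipping over missing values;
        # differencing damps the shared low-frequency signal so the correlation
        # reflects month-to-month covariability.)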

        # Now, we can actually compute the correlation coefficient. Again,
        # see util.compute_corr() for info on this mathematical operation.
        print ".........Computing correlation coefficient"
        r = compute_corr(cand_dif, neighb_dif, MISS, aligned=True)
        # r = compute_corr(cand_align, neighb_align, MISS)
        cand_std = compute_std(cand_dif, MISS)
        neighb_std = compute_std(neighb_dif, MISS)

        # If a correlation could be computed, keep it for now; the corrlim
        # threshold (0.10 in the ushcn_corr_2004.v3 code) is applied below.
        if r:
            print "            %1.3f %3.3f %3.3f" % (r, cand_std, neighb_std)
            corr_dict[coop_id2] = r

        else:
            print "            poor or no correlation"

    sort_corrs = sorted(corr_dict.iteritems(), key=itemgetter(1), reverse=True)
    good_corrs = [coop_id2 for (coop_id2, r) in sort_corrs if r > corrlim]

    nmonths = (endyr - begyr) * 12
    ksum = [0] * nmonths
    jsum = [0] * nmonths
    lowtoo = [0] * nmonths
    kstns = 0
    # Determine ksum[nmonths], the number of neighbor data available to use
    # in homogenizing data for this station at each month
    for imo in xrange(nmonths):
        if cand_data[imo] != MISS:
            for (k, coop_id2) in enumerate(good_corrs):
                neighb_series = series_dict[coop_id2]
                neighb_data = neighb_series.monthly_series
                if neighb_data[imo] != neighb_series.MISSING_VAL:
                    ksum[imo] = ksum[imo] + 1
            kstns = k
            jsum[imo] = ksum[imo] * 1

            if ksum[imo] < minpair:
                print " Total less than minpair: ", coop_id1, 1900 + (imo / 12), 1 + (imo % 12)
                lowtoo[imo] = 1

    # If we have more neighbors than necessary, then let's see if we can adjust
    # the numbers somewhat to bolster the amount of data in low-info periods,
    # being careful not to delete other good data.
    useful_neighbors = [n for n in good_corrs]
    jstns = kstns * 1
    if kstns > numcorr - 1:

        good_corrs.reverse()
        for (k, coop_id2) in enumerate(good_corrs):

            iremove = 1
            npair = 0
            neighb_series = series_dict[coop_id2]
            neighb_data = neighb_series.monthly_series

            imonths = xrange(nmonths)
            CMISS, NMISS = MISS, neighb_series.MISSING_VAL

            for (imo, c, n) in zip(imonths, cand_data, neighb_data):
                if (c != CMISS) and (n != NMISS):
                    npair = npair + 1
                    if ksum[imo] <= minpair:
                        print " Cannot remove:", coop_id1, "-", coop_id2, 1900 + (imo / 12), 1 + (imo % 12), ksum[
                            imo
                        ], lowtoo[imo]
                        iremove = 0
                        break

            if iremove == 1:
                if kstns >= numcorr - 1:
                    print " Remove:", coop_id1, "-", coop_id2, npair, corr_dict[coop_id2]
                    kstns = kstns - 1
                    useful_neighbors.remove(coop_id2)
                    for imo in xrange(nmonths):
                        if cand_data[imo] != CMISS and neighb_data[imo] != NMISS:
                            ksum[imo] = ksum[imo] - 1

    for imo in xrange(nmonths):
        if jsum[imo] > 0:
            print "Original-Final:", coop_id1, 1900 + (imo / 12), 1 + (imo % 12), jsum[imo], ksum[imo]

    print "Original-Final Number stns:", coop_id1, jstns, kstns

    # Now, we know which neighbors a) are highly correlated with this station,
    # and b) add information where it is scarce in the temperature record.
    for coop_id2 in corr_dict.keys():
        if not coop_id2 in useful_neighbors:
            del corr_dict[coop_id2]

    return corr_dict
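
# A hypothetical usage sketch (station ids, distances, and setup assumed, not
# from the source):
#
#     cand = series_dict["215887"]
#     neighborhood = [("200779", 12.3), ("215615", 48.1)]  # (coop_id, dist)
#     corrs = find_correlations(cand, series_dict, neighborhood,
#                               corrlim=0.1, numcorr=20, minpair=14)
#     # corrs maps each retained neighbor's coop id to its first-difference
#     # correlation with the candidate series.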