def summary_table(station_ids, hits, all_data, print_missing=False,
                  missing_val=-9999, **hom_params):
    """Print out a summary of a specified working array in matrix form.

    The matrix has one column per station and one row per month. A month is
    only printed when at least one of its cells shows a hit count. If
    print_missing is flagged as True, a cell whose value in all_data equals
    missing_val is printed as "-X-" instead of its entry in hits.

    :Param station_ids:
        A list of the station_ids corresponding to the first axis in both
        hits and all_data. The first three characters of each id are printed
        on the first header row and the remainder on the second.
    :Param hits:
        The working array to print in the matrix.
    :Param all_data:
        The array to check against for missing values. Must be
        two-dimensional; its second axis gives the number of months.
    :Param print_missing:
        (optional) Enable printing of locations where "missing values" are
        detected in all_data, as denoted by the given missing_val
    :Param missing_val:
        (optional) Placeholder for missing values in all_data. Default is
        -9999.
    :Param hom_params:
        Accepted for interface compatibility; not read by this function.
    :Return:
        Nothing; prints the 'hits' array to console.

    """
    ## Print header - one row for each half of the station ids
    head1 = " |" + "|".join([sid[:3] for sid in station_ids]) + "|"
    head2 = " |" + "|".join([sid[3:] for sid in station_ids]) + "|"
    print(head1)
    print(head2)

    ## Print monthly series basic
    def con2str(data):
        """Convert entry in the working array into a string to print"""
        # missing_val is taken from the enclosing call. The previous version
        # re-declared it here with its own -9999 default, which silently
        # ignored whatever the caller had passed to summary_table().
        val, nhits = data
        if print_missing and val == missing_val:
            return "-X-"
        if nhits > 0:
            return "%3d" % nhits
        return "---"

    num_stations, num_months = all_data.shape
    for imo in range(num_months):
        cells = [con2str(cell) for cell in zip(all_data[:, imo], hits[:, imo])]
        month_strs = "|".join(cells) + "|"

        # Digits only ever come from a formatted hit count, so a row without
        # any digit has nothing to report and is skipped.
        if not any(ch.isdigit() for ch in month_strs):
            continue

        # The date is only needed for rows that are actually printed.
        year, month = imo2iym(imo)
        # NOTE(review): the third column is the month index shifted by 11;
        # the reason for that offset is not visible here - confirm.
        base_str = " %4d %2d %4d |" % (year, month, imo-11)
        print(base_str + month_strs)
def filter1(network, **hom_params):
    """Attempts to 'unconfound' the undocumented changepoints detected for
    each pair of stations and attribute them to a single station.

    Every changepoint found for a pair is first charged to *both* stations of
    that pair. Then, month by month, the station with the most hits is
    confirmed as a culprit for that month until no station has more than one
    unprocessed hit left.

    :Param network:
        The network object containing the station/series data and metadata,
        as well as the paired results from the splitmerge process, stored in
        a dictionary called pair_results,
        :Ivar stations:
            A dictionary keyed by station id; only its keys are used here.
        :Ivar raw_series:
            A dictionary keyed by station id. Each entry provides a
            monthly_series attribute and a delete_months() method.
        :Ivar pair_results:
            A dictionary containing the paired results. Should have keys of
            the form "%s-%s" % (id1, id2), and each key should be associated
            with a dictionary element. That dictionary is keyed by the month
            index of each breakpoint, whose value must provide 'offset' and
            'offset_z', plus the keys "nspan" (ignored here) and "del" (a
            list of months handed to delete_months() for both stations).
    :Param hom_params:
        Accepted for interface compatibility; no entry is read by this
        filter.
    :Return:
        Nothing is returned. Updates the network object with the following
        fields:
        :Ivar hits_array:
            A numpy array of dimensions [number of stations, number of months]
            recording the filtered set of changepoints. The array will be
            0 everywhere except where a station logs a changepoint, and in
            those instances, the array will reflect the number of times that
            changepoint was implicated in a paired neighbor.
        :Ivar amps_array:
            A float numpy array of the same dimensions. If there is no
            breakpoint at a given entry, records 0.0; else, this is the mean
            of the absolute 'offset_z' values of the pairs implicating the
            station in that month.

    """
    ## Setup the data in NumPy arrays for easy inspection.
    # list() keeps the ids indexable where keys() returns a view (Python 3).
    station_list = list(network.stations.keys())
    all_data = np.array([network.raw_series[sid].monthly_series
                         for sid in station_list])

    ############################################################################
    ## PRE FILTER 1
    ## Search through the record of pair results and initially attribute every
    ## detected changepoint to *both* stations in the pair. Also, delete spans
    ## of data which were flagged as suspect or troublesome during the
    ## changepoint detection process.
    hits = np.zeros_like(all_data)

    for pair in network.pair_results:
        id1, id2 = pair.split("-")
        id1_ind, id2_ind = station_list.index(id1), station_list.index(id2)
        pair_result = network.pair_results[pair]

        for bp in pair_result:
            if bp == "nspan":
                continue
            if bp == 'del':
                ## Store in network
                bad_months = pair_result['del']
                network.raw_series[id1].delete_months(bad_months)
                network.raw_series[id2].delete_months(bad_months)
                print("DEL %s %s" % (pair, bad_months))
                continue
            hits[id1_ind, bp] += 1
            hits[id2_ind, bp] += 1

    ## Print a summary of the initial list of suspect changepoints
    summary_table(station_list, hits, all_data, False)

    ############################################################################
    ## FILTER 1
    ## Unconfound the paired changepoints by repeatedly finding the station
    ## with the most hits in a month and confirming it as the culprit for that
    ## month. Repeat until no station has more than one hit left.
    ## NOTE(review): the hit counts of the culprit's neighbors are not
    ## reduced when it is confirmed - confirm this matches the reference
    ## algorithm.
    new_hits = np.zeros_like(hits)
    # Amplitudes are averages of floats; do not inherit the dtype of the data
    # array, which would truncate them if the series were integer-typed.
    amps = np.zeros(hits.shape, dtype=float)

    ## Loop forward through all of the months, analyzing one at a time
    for imo in range(hits.shape[1]):
        hits_month = hits[:, imo]    # a view: writes go through to hits
        while hits_month.max() > 1:
            max_in_hits = hits_month.max()
            station_index = hits_month.argmax()
            station_id = station_list[station_index]
            iy, im = imo2iym(imo)

            ## Find entries in pair_results with this station and a
            ## changepoint on this date. The ids are compared exactly; a
            ## substring test on the key would also match a station whose id
            ## is contained in a longer id.
            pr_keys = [key for key in network.pair_results
                       if station_id in key.split("-")
                       and imo in network.pair_results[key]]
            if not pr_keys:
                break

            offset_sum, offset_z_sum, count = 0.0, 0.0, 0.0
            for key in pr_keys:
                bp_summary = network.pair_results[key][imo]
                id1, id2 = key.split("-")
                offset, offset_z = bp_summary['offset'], bp_summary['offset_z']
                # The sign is flipped when this station is the second member
                # of the pair.
                if station_id == id2:
                    offset = offset*-1.0
                offset_sum += offset
                offset_z_sum += abs(offset_z)
                count += 1
            avg_offset = offset_sum/count
            avg_offset_z = offset_z_sum/count

            print(" -- %s-CONFRM MW1 at %5d %4d %2d AVG ADJ: %3.2f %2.2f %3d"
                  % (station_id, imo, iy, im, avg_offset, avg_offset_z,
                     max_in_hits))

            new_hits[station_index, imo] = max_in_hits
            amps[station_index, imo] = avg_offset_z
            # Mark the culprit as processed so the loop moves on to the
            # next-highest station (or ends).
            hits_month[station_index] = 1

    ## Print a summary of the filtered changepoints.
    summary_table(station_list, new_hits, all_data, False)

    ## Update network with this new data
    network.hits_array = new_hits
    network.amps_array = amps
def filter3(network, **hom_params):
    """Reduce the number of detected changepoints by condensing closely-related
    changepoints into one another, by choosing the biggest amplitude change.

    For each station the month with the most hits is taken repeatedly. It is
    either absorbed into an already accepted changepoint found within a
    search bracket around it, or accepted as a new changepoint.

    :Param network:
        The network object containing the station/series data and metadata,
        and a record of suspect, undocumented changepoints and their
        approximate amplitude:
        :Ivar hits_array:
            A numpy array of dimensions [number of stations, number of months]
            containing the timing of the set of undocumented changepoints
            filtered so far. Entries are zeroed in place as they are
            processed.
        :Ivar amps_array:
            A numpy array of dimensions [number of stations, number of months]
            containing the approximate amplitudes of the undocumented
            changepoints filtered so far. Entries are zeroed in place as they
            are processed.
    :Param hom_params:
        The parameters object containing important parameter settings for this
        analysis:
        :Ivar nmo:
            The number of months; sizes the interim arrays.
    :Return:
        Nothing is returned. Record the final set of changepoints for each
        series in the given network object as the following field:
        :Ivar changepoints:
            A dictionary containing the changepoint locations and amplitudes.
            Each key in the dictionary is the integer month index where a
            breakpoint occurs, and each dictionary has the following fields:
            :Param jsum:
                The accumulated number of hits for this changepoint.
            :Param ahigh:
                The hit-weighted mean amplitude of the changepoint.
            :Param astd:
                Currently set to the same value as ahigh.

    """
    def report(*items):
        # Space-separated debug output, identical on Python 2 and Python 3.
        print(" ".join(str(item) for item in items))

    ############################################################################
    ## FILTER 3 TEST
    ## Try to reduce the number of changepoints by condensing closely-related
    ## changepoints into one another.
    ##
    ## inconfirm := 2 | the min number of hits for a possible breakpoint to be
    ##                  valid
    ## nmrgyr := -2   | -2 selects the bracket from the amplitude table below;
    ##                  any other value would fix it at nmrgyr*2 + 1 months
    ## ihthres := 2   | the min accumulated hits for a changepoint to be kept
    inconfirm = 2
    nmrgyr = -2
    ihthres = 2

    ## These are PRE-DEFINED in inhomog.parm.system.mthly.incl. They will need
    ## to be re-factored later in to a more logical place (Parameters maybe?)
    arange = [0.4, 0.6, 0.8, 1.0, 1.5, 3.0, 5.0]
    mrgyr = [36, 18, 12, 8, 6, 5, 5]

    # list() keeps the ids indexable where keys() returns a view (Python 3).
    station_list = list(network.stations.keys())
    nmo = hom_params['nmo']
    ifound = 0

    (num_stations, num_months) = network.amps_array.shape
    for station_index in range(num_stations):
        station_id = station_list[station_index]
        station_series = network.raw_series[station_id]
        miss = station_series.MISSING_VAL
        data_monthly = station_series.monthly_series
        stdk = compute_monthly_avg_std(station_series.series)

        # Right-hand limit for the bracket search; never step past either
        # the interim arrays or the monthly data.
        search_end = min(nmo, len(data_monthly))

        ## setup temp arrays
        khits = np.zeros(nmo)
        ktests = np.zeros(nmo)
        akhigh = np.zeros(nmo)

        # iterate until there are no more high points
        while True:
            # ihighmo is reset on every pass; it used to be unbound (or stale
            # from an earlier pass) whenever no month qualified.
            ihighit, ahigh, ihighmo = 0.0, 0.0, None

            # find the highest count - the most number of hits at a possible
            # breakpoint
            for month in range(num_months):
                jhit = network.hits_array[station_index, month]
                if jhit < inconfirm:
                    continue
                isum = 0.0 + jhit
                asum = network.amps_array[station_index, month]*jhit
                # find the highest chgpt hit station by hits
                if isum > ihighit:
                    ihighit = isum
                    ihighmo = month
                    ahigh = asum / isum

            ## now -
            ## ihighmo := month of highest hit value
            ## ihighit := sum of hits over all tests
            ## ahigh := estimated adjustment
            report("----itarg,ihighit,ihighmo,ahigh,stdk", station_index,
                   ihighit, ihighmo, ahigh, stdk)

            # Keep going until there are no more hits
            if not ihighit > 0:
                break

            # bracket the highest hit +/- nmrgyr
            if nmrgyr != -2:
                ibracket = nmrgyr*2 + 1
            else:
                # else bracket using amplitude of chgpt to define month range.
                # The amplitude is compared against the table; the previous
                # version compared the hit count (ihighit) instead, which
                # always selected the narrowest bracket.
                astd = abs(ahigh)
                irange = len(arange) - 1
                for idx, upper in enumerate(arange):
                    if astd < upper:
                        irange = idx
                        break
                ibracket = mrgyr[irange]*2 + 1

            # go through the bracket, look for the highest hits already found
            # start at the hit-point index and expand outward in the series
            # i.e, if our ihighmo = 10, look at [10, 11, 9, 12, 8, 13, 7, ...]
            # Floor division keeps the radius an integer on Python 3 too.
            max_radius = ibracket // 2
            absorbed_into = None
            for radius in range(max_radius):
                ## DEAL WITH THE RIGHT MONTH
                right_month = ihighmo + radius
                # this month is missing, go to the next (but stay in bounds)
                while (right_month < search_end
                       and data_monthly[right_month] == miss):
                    right_month += 1
                if right_month >= search_end:
                    break
                # Absorb lesser hit into the closest higher hit
                if khits[right_month] > 0:
                    absorbed_into = right_month
                    break

                ## DEAL WITH THE LEFT MONTH
                left_month = ihighmo - radius
                if left_month == 0:
                    break
                # this month is missing, go to the previous one; stop at the
                # start of the series instead of wrapping to negative indices
                while left_month > 0 and data_monthly[left_month] == miss:
                    left_month -= 1
                if data_monthly[left_month] == miss:
                    break
                # Absorb lesser hit into the closest higher hit
                if khits[left_month] > 0:
                    absorbed_into = left_month
                    break

            if absorbed_into is not None:
                khits[absorbed_into] += ihighit
                akhigh[absorbed_into] += ahigh*ihighit
                report("Absorb hit: ", station_index, ihighmo, " to ",
                       absorbed_into, khits[absorbed_into],
                       ktests[absorbed_into],
                       akhigh[absorbed_into]/khits[absorbed_into])
            else:
                # if no hits found, setup new hit
                khits[ihighmo] = ihighit
                ktests[ihighmo] = 1
                akhigh[ihighmo] = ahigh*ihighit
                report("New CHG hit: ", station_index, ihighmo,
                       khits[ihighmo], ktests[ihighmo],
                       akhigh[ihighmo]/khits[ihighmo])

            # zero test array block for next iter
            network.hits_array[station_index, ihighmo] = 0
            network.amps_array[station_index, ihighmo] = 0.0

        report("----------------------------------------------")

        # examine interim khits array for station's filtered changepoints
        uchgpt_dict = dict()
        for month in range(nmo):
            if not khits[month] > 0:
                continue
            # fetch the number of pairs tested
            npair = ktests[month]
            jsum = khits[month]
            iy, im = imo2iym(month)
            report("itarg,imo,iym,npair,jsum,ihthres,stdk", station_index,
                   month, iy, im, npair, jsum, ihthres, stdk)

            if jsum >= ihthres:
                # passed threshold test - keep this changepoint
                ifound += 1
                ahigh = akhigh[month]/khits[month]
                astd = ahigh
                # ibracket is the bracket of the most recently processed hit,
                # shown for debugging only.
                print("%5d %6s-UCHGPT KW%1d at %4d %3d %4d %6.2f %6.2f %3d %3d %3d"
                      % (station_index, station_id, 1, iy, im, jsum, ahigh,
                         astd, ibracket, npair, ihthres))
                uchgpt_dict[month] = {'jsum': jsum,
                                      'ahigh': ahigh,
                                      'astd': astd}

        network.raw_series[station_id].changepoints = uchgpt_dict

    report("-------------------------------------------------")
    report("Undoc filter: ", ifound)
def filter2(network, **hom_params):
    """Placeholder for reconciling detected undocumented changepoints with
    documented ones by "absorbing" detected changepoints near known breaks.

    TODO: implement this functionality. For now the function only prints, for
    every logged changepoint, its amplitude and the search bracket that would
    be used to look through the station history.

    :Param network:
        The network object containing the station/series data and metadata,
        and a record of suspect, undocumented changepoints and their
        approximate amplitude:
        :Ivar hits_array:
            A numpy array of dimensions [number of stations, number of months]
            containing the timing of the set of undocumented changepoints
            filtered so far.
        :Ivar amps_array:
            A numpy array of dimensions [number of stations, number of months]
            containing the approximate amplitudes of the undocumented
            changepoints filtered so far.
    :Param hom_params:
        The parameters object containing important parameter settings for this
        analysis:
        <none so far>
    :Return:
        does nothing right now other than print out some summary information.

    """
    ############################################################################
    ## FILTER 2
    ## won't actually do anything but print things to console for now
    ## Use station history and metadata to absorb undocumented changepoints
    ## to known ones where possible

    ## These are PRE-DEFINED in inhomog.parm.system.mthly.incl. They will need
    ## to be re-factored later in to a more logical place (Parameters maybe?)
    amp_limits = [0.4, 0.6, 0.8, 1.0, 1.5, 3.0, 5.0]
    half_brackets = [36, 18, 12, 8, 6, 5, 5]

    station_list = list(network.stations.keys())
    n_stations, n_months = network.amps_array.shape

    for station_index in range(n_stations):
        station_id = station_list[station_index]
        station_series = network.raw_series[station_id]

        data = station_series.series
        # NOTE(review): the return value is discarded; presumably this scales
        # data in place - confirm against scale_series.
        scale_series(data, 0.1, station_series.MISSING_VAL)
        stdk = compute_monthly_avg_std(data)

        for month in range(n_months):
            if not network.hits_array[station_index, month] > 0:
                continue

            ahigh = network.amps_array[station_index, month]
            jsum = network.hits_array[station_index, month]
            astd = abs(ahigh)
            iy, im = imo2iym(month)

            # Create search bracket for looking at station history files: use
            # the first table entry whose limit exceeds the amplitude, or the
            # last entry when none does.
            irange = len(amp_limits) - 1
            for position, limit in enumerate(amp_limits):
                if astd < limit:
                    irange = position
                    break
            ibracket = half_brackets[irange]*2 + 1

            fields = ("ASTD: ", station_index, station_id, "1", iy, im, astd,
                      ahigh, stdk, jsum, ibracket)
            print(" ".join(str(field) for field in fields))
## Print monthly series basic from util import imo2iym #con2str = lambda val, miss=-9999: "---" if val != miss else "-x-" def con2str(data, missing_val=-9999): val, hits = data #if val == missing_val: # return "-x-" if hits > 0: return "%3d" % hits else: return "---" for imo in xrange(hom_params.nmo): #for imo in xrange(10): year, month = imo2iym(imo) base_str = "%4d %2d %4d |" % (year, month, imo) month_strs = "|".join(map(con2str, zip(all_data[:,imo], new_hits[:,imo])))+"|" print_month_strs = False for i in range(10): if str(i) in month_strs: print_month_strs = True break if print_month_strs: print base_str+month_strs # Test plot #pair_str = pair_results.keys()[-1]
def estamt(network, minlenshf=24, **hom_params): """ COPIED FROM ucpmonthly.v24a.f: The major steps in determining the best adjustment value for each station and changepoint. Entire network undergoes each of the following processes. In order: 1) Remove unusable data. Align move swith respect to non-missing data and compress out changes that are too close AND the data between them. 2) ISTEP=2 processing begins the adjustment process by removing the non-significant changepoints to lengthen segments. 3) NPASS (:= ISTEP=3) finishes the adjustment process by testing for the minimum number of months in a segment and number of neighbors with which the difference series can be examined. 4) Final adjusted output is written. """ ## FILTER 4 ## Since the amplitude estimate MUST rely upon a minimum of MINLEN months to ## get even close to a reliable estimate at this point, it is assumed that ## the changepoints are as good as the station history files. Therefore, ## align moves with respect to non-missing data and compress out changes ## that are too close AND the data between them (i.e., less than MINLEN ## apart) # station_list = network.stations.keys() all_station_list = network.stations.keys() # station_list = ["215887", ] station_list = all_station_list # for each station... for id in station_list: station_index = station_list.index(id) station_series = network.raw_series[id] station_data = station_series.monthly_series[:] missing_val = station_series.MISSING_VAL # ... gen arrays for alignment move, amt, mday = [], [], [] changepoints = station_series.changepoints cps = sorted(changepoints.keys()) for cp in cps: print " Hist move: ", len(move) + 1, station_index + 1, imo2iym(cp) move.append(cp) amt.append(changepoints[cp]["jsum"]) mday.append(31) movnum = len(changepoints) if movnum > 0: ## At this point, the Fortran code executes alignmoves() in ## SHAPinp.v6c.f to reconcile the fact that station history files ## report dates of moves. 
It also removes segments that eare too short ## - less than minlenshf. Instead of implementing alignmoves(). Right ## now, I'll only implement this second functionality. # alignmoves() #################################################################### # Seek to find first and last month indices first_set = False for month in range(len(station_data)): # Skip first year if month < 12: continue if station_data[month] != missing_val: if not first_set: first = month first_set = True last = month cps = sorted(changepoints.keys()) cps.insert(0, first) cps.append(last) for (cp1, cp2) in zip(cps[:], cps[1:]): if (cp2 - cp1) < minlenshf: months_to_delete = range(cp1 + 1, cp2 + 1) network.raw_series[id].delete_months(months_to_delete) if cp2 == last: del_key = cp1 else: del_key = cp2 if del_key in network.raw_series[id].changepoints: # print len(network.raw_series[id].changepoints.keys()), del network.raw_series[id].changepoints[del_key] # print len(network.raw_series[id].changepoints.keys()), # raw_input("pause") del_str = "Del 1st segment: " if cp1 == first else "Delete segment: " print id, station_index + 1, del_str, imo2iym(cp1), cp1, imo2iym(cp2), cp2 new_changepoints = network.raw_series[id].changepoints new_cps = sorted(new_changepoints.keys()) print " First data value: ", imo2iym(first) for cp in new_cps: print " End seg:", new_cps.index(cp), " ym: ", imo2iym(cp), cp, new_changepoints[cp]["jsum"] print " End segment ym: ", imo2iym(last), last # Finally, add first and last value to the list of changepoints. first_stats = dict(ahigh=0.0, astd=0.0, jsum=0) last_stats = dict(ahigh=0.0, astd=0.0, jsum=0) network.raw_series[id].changepoints[first] = first_stats network.raw_series[id].changepoints[last] = last_stats #################################################################### ## Series of debug print statements summarizing the final list of ## changepoints. 
Not necessary at the moment ############################################################################ # The subnetwork processing became a multi-step process plus a "post-process # pass" to manage: # 1) problems with documented changepoints with NO undocumented support # 2) determine the best amplitude estimation for each confirmed changepoint for step in [2, 3]: ## Setup output strings based on the step used. Only cosmetic differences ## really. iminlen = hom_params["minlen"] numclim = 3 ## STEP 1 - NEVER USED (technically the history-consideration done previously if step == 1: continue elif step == 2: ## STEP 2 - NOT SIG REMOVAL ## equivalent to ipass loopback for istep == 2 in Fortran PHA print " ---------------- NOT SIG REMOVAL --------------- " tstr = "Not sig: " outid = "NS" ipass = 1 elif step == 3: ## STEP 3 - ADJUSTMENT OF DISCONTINUITIES # equivalent to ipass loopback for istep == in FORTRAN PHA print " ---------------- ADJUST DISCONTINUITY STEP --------------- " print "Adjpass, iminlen, numclim", "--", iminlen, numclim print " ---------------- NPASS --------------- " tstr = "Dstep Dtrend: " outid = "WM" ipass = ipass + 1 final_results = dict() print " NET STN FILT TECH ------ AFTER ------ ------ BEFORE ------" # Process each station and its network of neighbors for id in station_list: station_index = station_list.index(id) station_cp_dict = network.raw_series[id].changepoints sorted_cps = sorted(station_cp_dict.keys()) ## If there are no breakpoints... if not sorted_cps: final_results[id] = dict() continue station_series = network.raw_series[id] missing_val = station_series.MISSING_VAL # compute monthly anomalies for this station data station_anomalies = station_series.monthly_anomaly_series # What are the first and last valid months in this station's data set? # We've saved them as the first and last changepoint before... first = sorted_cps[0] last = sorted_cps[-1] # What are the pairs to this station that we need to consider? 
station_pairs = [] for other_id in all_station_list: pair = tuple(sorted([id, other_id])) if pair in hom_params["pairs"]: station_pairs.append(pair) print station_pairs # List the remaining changepoints after the "confirmfilt" process for cp in sorted_cps: cp_stats = station_cp_dict[cp] hit_count = cp_stats["jsum"] iy, im = imo2iym(cp) print ( "%3d %5d %6s Estamt chgin: -- %4d %2d %4d %3d" % (ipass, station_index, id, iy, im, cp, hit_count) ) ## ACCUMULATE PAIRED CHANGEPOINTS AND AMPLITUDE ESTIMATES # Loop over "brackets" of changepoints - that is, for changepoints # [a, b, c, d], consider the two brackets [a,b,c] and [b,c,d] with # the center value of the changepoints. Note that in the Fortran PHA, # we go through these brackets in reverse order - right to left. brackets = zip(sorted_cps[-3::-1], sorted_cps[-2::-1], sorted_cps[::-1]) final_results[id] = dict() for bracket in brackets: # for bracket in brackets[:1]: (left, cp, right) = bracket[:] ly, lm = imo2iym(left) cpy, cpm = imo2iym(cp) ry, rm = imo2iym(right) print "Oriented: ", "--", "--", "--", left, cp, cp + 1, right # setup the output string for this bracket's tests chgptstr = " Win1: %5d %4d%2d %5d %4d%2dto Win2: %5d %4d%2d %5d %4d%2d" % ( left, ly, lm, cp, cpy, cpm, cp, cpy, cpm, right, ry, rm, ) ## THIS SECTION ACCUMULATES TARGET-NEIGHBOR COMPARISONS # See if there are enough homogeneous data in the target; # check each window valid_count_right = len(get_valid_data(station_data[cp + 1 : right + 1], missing_val)) valid_count_left = len(get_valid_data(station_data[left : cp + 1], missing_val)) # if the segment length (valid count) is too short, skip this # changepoint (for now) if valid_count_left < iminlen: print "Adjpass seg2 short ", station_index, id, chgptstr, valid_count_left continue if valid_count_right < iminlen: print "Adjpass seg1 short ", station_index, id, chgptstr, valid_count_right continue ## We've pass the too-little-data pitfall. 
Now, we are actually going ## to go back through our paired neighbors and compute some final ## statistics about these changepoints. We'll store them in a ## dictionary for later, just like the pair_results dictionary ## from splitmerge pair_results = dict() # for (id1, id2) in [("215887", "200779")]: for (id1, id2) in station_pairs: # Reset the left, cp, and right indices to the original # bracket we're considering. We are going to be changing them # while we look at this pair (left, cp, right) = bracket[:] ## Figure out which station is the neighbor (not the target ## we're currently considering). At the same time, note that if ## the target is the 2nd changepoint, the adjustments will be ## flipped in sign, so we need to have a correction factor ready correction = 1.0 if id == id1: neighb_id = id2 else: neighb_id = id1 # correction = -1.0 # Add this pair to pair_results if it's not already there (ida, idb) = sorted([id1, id2]) pair_str = "%s-%s" % (ida, idb) if pair_str not in pair_results: pair_results[neighb_id] = dict() print pair_str neighb_index = all_station_list.index(neighb_id) neighb_cp_dict = network.raw_series[neighb_id].changepoints neighb_series = network.raw_series[neighb_id] neighb_anomalies = neighb_series.monthly_anomaly_series ## Generature a difference data set for this pair of stations diff_data = diff(station_anomalies, neighb_anomalies) ## It's possible that in the [left, right] bracket we're looking ## at, there's a changepoint in the paired neighbor. We need ## to adjust the endpoints of the bracket to exclude those ## breakpoints # Check right-hand side first and break out if ... right_seg_len = len(get_valid_data(diff_data[cp + 1 : right + 1])) # right_seg_len = len(diff_data[cp+1:right+1]) for month in range(cp + 1, right + 1): if month == last: continue # ... we hit a changepoint in the neighbor ... 
if month in neighb_cp_dict: neighb_hits = neighb_cp_dict[month]["jsum"] right_seg_len = len(get_valid_data(diff_data[cp + 1 : month + 1])) # right_seg_len = len(diff_data[cp+1:month+1]) print ( "CHG2: ", neighb_index, neighb_id, "num,edit,2b,2e,imo,nhits", right_seg_len, "--", cp + 1, right, month, neighb_hits, ) right = month break # ... and the final right-segment is too short print left, cp, right if right_seg_len < iminlen: print ( "Low2: ", neighb_index, neighb_id, "num,edit,2b,2e,imo,nhits", right_seg_len, "--", cp + 1, right, month, "--", ) continue # Now, check the left-hand side and break out if ... left_seg_len = len(get_valid_data(diff_data[left : cp + 1])) for month in range(cp - 1, left, -1): if month == first: continue # ... we hit a changepoint in the neighbor ... if month in neighb_cp_dict: neighb_hits = neighb_cp_dict[month]["jsum"] left_seg_len = len(get_valid_data(diff_data[month:cp])) # left_seg_len = len(diff_data[month:cp]) print ( "CHG1: ", neighb_index, neighb_id, "num,edit,1b,1e,imo,nhits", left_seg_len, "--", cp + 1, left, month, neighb_hits, ) left = month break # ... and the final left-segment is too short if left_seg_len < iminlen: print ( "Low1: ", neighb_index, neighb_id, "num,edit,1b,1e,imo,nhits", left_seg_len, "--", cp + 1, left, month, "--", ) continue ## We can now estimate the raw changepoint amplitude using minbic. 
## However, we'll short-circuit a lot of the work by telling it to only ## use the KTHTPR0 model (simple step-change model) (seg_x, seg_data) = range(left + 1, right + 1), diff_data[left + 1 : right + 1] bp_index = cp - (left + 1) # print left, cp, right, "|", bp_index # print left_seg_len, right_seg_len bic_result = minbic(seg_x, seg_data, bp_index, missing_val, models=[("KTHTPR0", kthtpr0)]) ## Also check the first difference correlations between the ## monthly anomalies station_first_diff = compute_first_diff(station_anomalies, missing_val) neighb_first_diff = compute_first_diff(neighb_anomalies, missing_val) corr = compute_corr(station_anomalies, neighb_anomalies) ## Write out the results of this testing process so far cmodel = bic_result["cmodel"] bic = bic_result["bic"] test_stat = bic_result["test_stat"] crit_val = bic_result["crit_val"] offset = bic_result["offset"] slopes = bic_result["slopes"] left_slope, right_slope = slopes print ( "%s %6s-%6s %s %7.2f %7.2f %7.2f %7.2f %7.3f %7.3f -- %d --" % ( tstr, id, neighb_id, chgptstr, crit_val, test_stat, offset, corr, left_slope, right_slope, right_seg_len, ) ) ## Analysis is done. ## Keep the adjustment (offset) for each neighbor/segment, ## set/reset trend for each neighbor/segment ## the first segment is the left-segment, ## the second segment is the right-segment ## ## Note that we reset left/right potentially to avoid conflicts ## within the paired neighbor data. However, our estimates of ## trends/offsets associated with the "right" adjacent changepoint ## actually refers to that original right changepoint. 
We'll ## reset left, cp, and right from the bracket before continuing (left, cp, right) = bracket[:] # Do the left segment first left_dict = dict() left_dict["adj"] = offset * correction left_dict["cor"] = corr left_dict["bic"] = bic left_dict["cmodel"] = cmodel left_dict["trend"] = left_slope left_dict["spanob"] = left_seg_len pair_results[neighb_id][cp] = left_dict # Do the right segment now right_dict = dict() right_dict["adj"] = offset * correction right_dict["cor"] = corr right_dict["bic"] = bic right_dict["cmodel"] = cmodel right_dict["trend"] = right_slope right_dict["spanob"] = right_seg_len if right not in pair_results[neighb_id]: pair_results[neighb_id][right] = right_dict else: # We've already recorded this segment before for the last # changepoint. Update the slopes/spanob count (length of # preceding segment) if the slopes are different and the # length is different. new_trend = slopes[1] new_spanob = right_seg_len old_trend = pair_results[neighb_id][right]["trend"] old_spanob = pair_results[neighb_id][right]["spanob"] if old_trend != new_trend: print ( " Seg2 diff: %s %4d old: %7.2f %4d new: %7.2f %4d" % (pair_str, right, old_trend, old_spanob, new_trend, new_spanob) ) # if the new count is greater than the old one, the slope # is probably more robust so update those entries. if new_spanob > old_spanob: pair_results[neighb_id][right]["trend"] = new_trend pair_results[neighb_id][right]["spanob"] = new_spanob ## We're done with this pair/changepoint. 
Summary output - if step == 2: print "itarg,ipair,ichg,numc,iqt,adj,trends: -- -- -- --", cmodel, offset, slopes # raw_input("pause") #################################################################### ## ADJUSTMENT DETERMINATION SECTION # Recall the paired-changepoint analyses we just performed, and # determine if the potential adjustment is statistically valid (left, cp, right) = bracket[:] pair_data = [] for neighb_id in pair_results: if not cp in pair_results[neighb_id]: continue cp_stats = pair_results[neighb_id][cp] adjacent_stats = pair_results[neighb_id][right] trends = (cp_stats["trend"], adjacent_stats["trend"]) pair_dict = dict( neighb_id=neighb_id, adj=cp_stats["adj"], cor=cp_stats["cor"], trends=trends, used=True ) pair_data.append(pair_dict) npairs = len(pair_data) if npairs < numclim: print "Adjpass numc low --", station_index, id, left, cp, right, npairs continue # Process - # 1) Remove both adjustment and trend outliers # 2) Calculate median adjustment # # filter around inter-quartile range qscale = hom_params["qscale"] pair_data = sorted(pair_data, key=operator.itemgetter("adj")) pair_chgs = [p["adj"] for p in pair_data] chg_25th, chg_median, chg_75th = tukey_med(pair_chgs) chg_iqr = chg_75th - chg_25th chg_low = chg_25th - (chg_median - chg_25th) * 1.0 * qscale chg_high = chg_75th + (chg_75th - chg_median) * 1.0 * qscale print ( " TRIM p25, p75, pct50, rng, lo, hi: %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f" % (chg_25th, chg_75th, chg_median, chg_iqr, chg_low, chg_high) ) # If any of the estimated changepoints are outside the statistically # robust range we just computed, then flag them as we print them and for data in pair_data: neighb_id = data["neighb_id"] neighb_index = all_station_list.index(neighb_id) adj = data["adj"] cor = data["cor"] trends = data["trends"] if not (chg_low < adj < chg_high): data["used"] = False flag = "U" if data["used"] else "X" print ("%s %4d %7.2f %8.4f %8.4f %7.2f" % (flag, neighb_index, adj, trends[0], trends[1], cor)) 
valid_adj_count = len([d for d in pair_data if d["used"]]) if valid_adj_count < numclim: if step == 2: print ( "Insuff trimmed mean -- %4d %s %5d %5d %5d %5d" % (station_index, id, left, cp, right, valid_adj_count) ) continue ## BUG: The code here re-computes the inter-quartile range by ## scaling qscale by 1.0. Curiously, it doesn't reject any ## pairs based on this new range. chg_iqr = chg_75th - chg_25th chg_low = chg_25th - (chg_median - chg_25th) * qscale chg_high = chg_75th + (chg_75th - chg_median) * qscale ## Tweak the inter-quartile range to check if the adjustment is ## Check whether the computed adjustment is significant. That is, ## if 0 is included within the inter-quartile range we computed, then ## we can't reject the null hypothesis that the changepoint is significant if chg_high * chg_low > 0.0: # signs are the same, so 0 isn't included in the range. procstr = "CONSHF" sigadj = chg_median else: procstr = "ZERSHF" sigadj = 0.0 final_results[id][cp] = dict(adj=sigadj, std=chg_iqr * 1.0 * qscale, num=npairs) print ("%2d %s-%s %s %7.2f" % (station_index, id, procstr, chgptstr, sigadj)) ## Print some final output about what changepoints remain for this station final_station_results = final_results[id] final_cps = sorted(final_station_results.keys()) for cp in final_cps: adj = final_station_results[cp]["adj"] std = final_station_results[cp]["std"] cp_stats = station_cp_dict[cp] hit_count = cp_stats["jsum"] iy, im = imo2iym(cp) print ( "-- %5d %s Estamt chgout: -- %4d%2d %5d %5d %7.2f %7.2f" % (station_index + 1, id, iy, im, cp, hit_count, adj, std) ) # raw_input("pause") ## Remove the accumulated non-significant changepoints (either non-sig because ## there was too much missing data, the target segment was too short, or the ## trimmed mean test could not reject the null hypothesis of no change for id in station_list: station_index = station_list.index(id) final_station_results = final_results[id] final_cps = sorted(final_station_results.keys()) for cp in 
final_cps: iy, im = imo2iym(cp) cp_index = final_cps.index(cp) adj = final_station_results[cp]["adj"] std = final_station_results[cp]["std"] if adj == 0.0: print ("%s %5d Remove chgpt %5d %4d %2d %4d" % (id, station_index, cp_index, iy, im, cp)) del network.raw_series[id].changepoints[cp] else: # Update the network's record of changepoints with this new list network.raw_series[id].changepoints[cp]["ahigh"] = adj network.raw_series[id].changepoints[cp]["astd"] = std # the changepoint at first month has been removed; add it back in network.raw_series[id].changepoints[first] = dict(ahigh=0.0, astd=0.0, jsum=0)
def splitmerge(network, pairs=None, beg_year=1, end_year=2, **kwargs):
    """Run the split/merge changepoint search on each requested station pair.

    For every (id1, id2) tuple in pairs, a paired difference series is built
    from rescaled, anomaly-converted copies of the two stations' series.
    Candidate breakpoints are then located by repeatedly applying snht() to
    the segments between the current breakpoints (the "split" pass) and by
    re-testing segments that leapfrog one breakpoint (the "merge" pass), for
    at most 10 passes or until the breakpoint list stops changing. Each
    surviving breakpoint is evaluated with minbic(), breakpoints that lie too
    close together are reconciled, and the results are stored per pair.

    :Param network: Object exposing `stations` and `raw_series`, both indexed
        by station id.
    :Param pairs: Iterable of (id1, id2) station-id tuples to analyze. The
        default of None is not handled here; iterating over it raises a
        TypeError, so callers must supply a value.
    :Param beg_year: First year of the analysis window.
    :Param end_year: Last year of the analysis window; the number of months
        scanned is (end_year - beg_year)*12.
    :Param kwargs: Accepted, but not referenced in the visible code.
    :Return: pair_results, a dictionary keyed by "%6s-%6s" % (id1, id2). Each
        value maps a breakpoint month index to the dictionary returned by
        minbic() for that breakpoint, and additionally carries the keys
        'del' (month indices that were blanked out as unusable) and 'nspan'
        (a per-month list holding the ordinal number of the breakpoint that
        covers that month, 0 elsewhere).

    Besides returning, this function prints extensive diagnostics and pickles
    pair_results to a file named "pair_results" in the working directory.

    NOTE(review): this function's layout was reconstructed from a source in
    which line breaks and indentation had been collapsed; block nesting was
    inferred from the logic and should be confirmed against version control.
    """

    ## EXPERIMENTAL PLACEHOLDERS - will eventually be replaced with a master
    ## loop to do all the id pairs.
    # NOTE(review): id_list and dict_to_tuples are only used by the
    # commented-out pair generation below.
    id_list = network.stations.keys()
    pair_results = dict()

    def dict_to_tuples(d):
        keys = d.keys()
        return [(key, d[key]) for key in keys]

    ## Generate station pairs for use in splitmerge by iteratively going through the
    ## station_list and adding stations in order of decreasing correlation. Skip a
    ## neighbor if the pair is already present; want 20 stations or until all the
    ## correlated neighbors are used up.
    # pairs = []
    # for id1 in id_list:
    #     neighbors = dict_to_tuples(network.correlations[id1])
    #     sorted_neighbors = sorted(neighbors, key=operator.itemgetter(1))
    #     added_pairs = 0
    #     while sorted_neighbors and (added_pairs < 5):
    #         id2, _ = sorted_neighbors.pop()
    #         ordered_pair = tuple(sorted((id1, id2)))
    #         if not ordered_pair in pairs:
    #             pairs.append(ordered_pair)
    #             added_pairs += 1

    for (id1, id2) in pairs:
        print "Pair %s with %s" % (id1, id2)
        pair_str = "%6s-%6s" % (id1, id2)
        #if pair_str != "051528-298107":
        #    continue

        raw_series = network.raw_series
        stations = network.stations
        # Work on a deep copy so that the rescaling/anomaly conversion below
        # does not alter the series stored on the network. Note the copy (and
        # the conversion) is redone for every pair.
        series_copy = deepcopy(raw_series)

        min_ann = 5
        num_years = end_year - beg_year
        num_months = num_years*12

        for s in series_copy.itervalues():
            data = s.series
            scaled = scale_series(data, 0.1, s.MISSING_VAL)
            anomalies = compute_monthly_anomalies(scaled, s.MISSING_VAL)
            s.set_series(anomalies, s.years)

        ## Retrieve the data for each of the stations.
        station1 = stations[id1]
        series1 = series_copy[id1]
        data1 = series1.monthly_series
        station2 = stations[id2]
        series2 = series_copy[id2]
        data2 = series2.monthly_series

        #print data1[:50]
        #print data2[:50]
        #print "################################################################"

        ## Compute the difference series
        diff_data = diff(data1, data2)
        MISS = series1.MISSING_VAL # Missing value placeholder

        ## Quickly pass through the data to find where it starts. We need to do this
        ## because it's possible that beg_year is earlier than the first year of
        ## valid data in either data1 or data2. Furthermore, the original PHA code
        ## deliberately clipped off the first year of good data, so emulate that
        ## effect here as well.
        ##
        ## Ultimately, we save the extreme early and extreme late month with valid
        ## data to use as our first guess at the undocumented changepoints.
        first = 0
        first_set = False
        last = 0
        for (i, d1, d2) in zip(xrange(num_months), data1, data2):
            if d1!=MISS and d2!=MISS:
                # 'first' keeps advancing to each later valid month until it
                # holds an index of 12 or more, after which it is frozen.
                if first < 12:
                    first = i
                    #first_set = True
                #if not first_set:
                #    first = i
                #    first_set = True
                last = i

        ## Set the initial breakpoints and the list of already-found, homogeneous
        ## segments.
        breakpoints = [first, last, ]
        homog_segs = []

        #####################################################################
        ## BEGIN SPLITMERGE PROCESS TO GENERATE FIRST GUESS AT UNDOCUMENTED
        ## CHANGEPOINTS
        iter = 0 # counts how many times we've repeated the splitmerge process
        enter_BIC = False # break out of iterations into the BIC process?

        last_breakpoints = []
        while (iter < 10) and not enter_BIC:

            # Consecutive breakpoints delimit the segments tested in the split pass.
            seg_bounds = zip(breakpoints[:-1], breakpoints[1:])
            last_breakpoints = deepcopy(breakpoints)
            new_breakpoints = deepcopy(breakpoints)

            new_homog_segs = []

            # NOTE(review): the statement below is corrupted in the file - the
            # text between the two string literals has been replaced by
            # asterisks. The split-pass loop header over seg_bounds and the
            # definitions of l, r, y1, m1, y2, m2, adjust and segment that the
            # rest of this pass relies on are missing, as is the condition
            # guarding the "Too short" message. Restore from version control.
            print "Parse segments (isplit = 1), ipass: "******"Too short: ", imo2iym(l), imo2iym(r)
                    continue

                ## If we've previously found that this segment is homogeneous (has no
                ## potential changepoint), then we can skip it as well and proceed to
                ## the next one.
                # Set the within() method to check if this segment is within any
                # previously found homogeneous ones. Use lambda, since we can't pass
                # keyword or positional arguments to map().
                within_this_seg = lambda seg: within((l, r), seg)
                within_stable_segs = map(within_this_seg, homog_segs)
                if any(within_stable_segs):
                    print "Stable segment: ", imo2iym(l), imo2iym(r)
                    if l == first:
                        new_breakpoints.append(first)
                    continue

                ## The standard normal homogeneity test - which is the statistical test
                ## we'll use to see if there is a potential changepoint in this segment
                ## - requires us to normalize our paired difference series. We can do
                ## that in snht(), but we'll do it right now so we can inspect those
                ## standardized values later.
                z = standardize(segment, MISS)

                ## Apply standard normal homogeneity test.
                ## For mechanics, see Alexandersson and Moberg 1997, Int'l Jrnl of
                ## Climatology (pp 25-34)
                likelihood_ratios = snht(z, MISS, standardized=True)
                z_count = len(get_valid_data(z))

                ## We're left with the likelihood ratio for each value being a potential
                ## changepoint. Find the max ratio, and if that value is significant, let
                ## it be the newest potential changepoint.
                ind_max_ratio = 0
                max_ratio = 0.0
                clip_ratios = likelihood_ratios[2:-2] # clip the beginning and end,
                                                      # they can't be changepoints.
                for (ind, ratio) in zip(xrange(len(clip_ratios)), clip_ratios):
                    if ratio > max_ratio:
                        ind_max_ratio = ind
                        max_ratio = ratio

                ## Now we find the critical value for this data set, and check our max
                ## likelihood ratio against it
                crit_val = lrt_lookup(z_count)

                # The possible changepoint is the index of the max ratio we found.
                # We have to shift it the following ways to align it to the original
                # data -
                #    1) shift by 2 re-aligns it from clip_ratios to likelihood_ratios
                #    2) shift by adjust re-aligns it to this segment in diff_data
                #    3) shift by l re-aligns it to the first index in diff_data
                possible_changepoint = l + ind_max_ratio + 2 + adjust

                y_new, m_new = imo2iym(possible_changepoint) # year, month

                ## If this is the first iteration, we indicate as such, and add the new
                ## changepoint
                # On the first pass the candidate is accepted without comparing
                # max_ratio against crit_val, and it goes straight into
                # 'breakpoints' rather than 'new_breakpoints'.
                if iter == 0:
                    print "%6s-%6s MD FIRST series %4d %2d to %4d %2d | at %4d %2d ts: %4.2f limit >: %3.2f" % (id1,id2,y1,m1,y2,m2,y_new,m_new,max_ratio,crit_val)
                    breakpoints.append(possible_changepoint)
                    breakpoints = sorted(breakpoints)

                else:
                    ## Else, if we found a new possible changepoint, add it to our list.
                    if max_ratio > crit_val:
                        print "%6s-%6s MD Inhomogenity for series %4d %2d to %4d %2d | at %4d %2d ts: %4.2f limit >: %3.2f %4d" % (id1,id2,y1,m1,y2,m2,y_new,m_new,max_ratio,crit_val,z_count)
                        new_breakpoints.append(possible_changepoint)

                    ## If not, record that we found a homogeneous segment.
                    else:
                        print "%6s-%6s MD Homogeneous series %4d %2d to %4d %2d | at %4d %2d ts: %4.2f limit >: %3.2f %4d" % (id1,id2,y1,m1,y2,m2,y_new,m_new,max_ratio,crit_val,z_count)
                        new_homog_segs.append((l, r))

            ## Now we need to update our account of which segments were homogeneous,
            ## because we need to know during the next iteration. We will do this,
            ## as well as condense stable segments that lie adjacent to each other
            ## i.e, if we have the segments [(1,5), (5, 10), (12, 15)], then we
            ## really have [(1,10), (12, 15)].
            homog_segs.extend(new_homog_segs)
            if homog_segs:
                homog_segs = sorted(homog_segs, key=operator.itemgetter(0))
                final_homog_segs = [homog_segs[0], ] # this will be like a stack
                for seg in homog_segs[1:]:
                    last_seg = final_homog_segs[-1]
                    # Only segments that share an endpoint exactly are fused.
                    if last_seg[1] == seg[0]:
                        new_seg = (last_seg[0], seg[1])
                        final_homog_segs.pop()
                        final_homog_segs.append(new_seg)
                    else:
                        final_homog_segs.append(seg)
                homog_segs = final_homog_segs

            ## So we have new segments that can be generated from these new
            ## breakpoints. Now, the PHA routine enters a "merge" process
            ## to see whether or not to keep these newly found changepoints or throw
            ## them out as false alarms.
            ##
            ## We do this by "leapfrogging" every other breakpoint. This gives us
            ## a set of segments that all have another breakpoint in them. We want
            ## to see if these segments are homogeneous, because if they are, it
            ## means that the breakpoint we previously found in the segment has
            ## been superseded.
            new_breakpoints = sorted(new_breakpoints)
            seg_bounds = zip(new_breakpoints[:-2], new_breakpoints[2:])

            remove_breakpoints = set()
            merged_breakpoints = set()
            if iter > 0:

                # NOTE(review): the statement below is corrupted in the file in
                # the same way as the split pass - the text after "ipass: " has
                # been replaced by asterisks. The merge-pass loop header, which
                # must bind l, r and new_bp (all used further down), is missing.
                # Restore from version control.
                print "Merge segments (isplit = 0), ipass: "******"Stable segment: ", imo2iym(l), imo2iym(r)
                    #    if l == first:
                    #        new_breakpoints.append(first)
                    #    seg_lookup.append(((l, r), 'stable'))
                    #    continue

                    # Set the within() method to check if this segment is within any
                    # previously found homogeneous ones. Use lambda, since we can't pass
                    # keyword or positional arguments to map().
                    within_this_seg = lambda seg: within((l, r), seg)
                    within_stable_segs = map(within_this_seg, homog_segs)
                    if any(within_stable_segs):
                        print "Stable segment: ", imo2iym(l), imo2iym(r)
                        #if l == first:
                        #    new_breakpoints.append(first)
                        merged_breakpoints.update([l, r])
                        continue

                    ## Apply the same adjustments and the same standard normal homogeneity
                    ## test that we did in the previous splitting process. There is no
                    ## difference here until we consider what to do if we find a new
                    ## homogeneous segment.
                    # adjust is 1 for every segment except the first, so the
                    # left endpoint itself is left out of those segments.
                    adjust = int(seg_bounds.index((l, r)) > 0)
                    segment = diff_data[l+adjust:r+1]

                    z = standardize(segment, MISS)
                    likelihood_ratios = snht(z, MISS, standardized=True)
                    z_count = len(get_valid_data(z))
                    ind_max_ratio = 0
                    max_ratio = 0.0
                    clip_ratios = likelihood_ratios[2:-2] # We clip the beginning and end
                    for (ind, ratio) in zip(xrange(len(clip_ratios)), clip_ratios):
                        if ratio > max_ratio:
                            ind_max_ratio = ind
                            max_ratio = ratio
                    crit_val = lrt_lookup(z_count)
                    possible_changepoint = l + ind_max_ratio + 2 + adjust

                    y_new, m_new = imo2iym(possible_changepoint)

                    # Diagnostic only: nothing is removed here, and the
                    # significance test below still runs for this segment.
                    if z_count < 2:
                        y1, m1 = imo2iym(l)
                        y2, m2 = imo2iym(r)
                        print "%6s-%6s MD No found peaks %4d %2d to %4d %2d" % (id1,id2,y1,m1,y2,m2)
                        print "%6s-%6s MD Compress 1 out peak at %4d %2d" % (id1,id2,y_new,m_new)
                        #remove_breakpoints.add_

                    ## If we found a new breakpoint that is statistically significant, then
                    ## great! Let's keep it.
                    # Note that it is new_bp that is kept, not the freshly
                    # computed possible_changepoint, which is only printed.
                    if max_ratio > crit_val:
                        print "%6s-%6s MD Peak kept in merge at %4d %2d | ts: %4.2f limit >: %3.2f" % (id1,id2,y_new,m_new,max_ratio,crit_val)
                        merged_breakpoints.add(l)
                        merged_breakpoints.add(new_bp)
                        merged_breakpoints.add(r)

                    ## If not, then this segment was homogeneous, so the breakpoint which
                    ## already exists in it is no good.
                    else:
                        print "%6s-%6s MD Compress 2 out peak at %4d %2d | ts: %4.2f limit >: %3.2f" % (id1,id2,y_new,m_new,max_ratio,crit_val)
                        # Crap, if there are any potential breakpoints in this segment,
                        # we need to remove them because this segment is homogeneous. Let's
                        # remember this homogeneous segment for now and come back once
                        # we've found all of them.
                        merged_breakpoints.update([l, r])
                        remove_breakpoints.add(new_bp)

                ## At this point, we have a set of all the breakpoints we've accumulated
                ## during this iteration of split/merge, as well as a set of breakpoints
                ## which we've found to be of no further use. We can difference update
                ## our set of breakpoints to remove these guys, and let those merged
                ## breakpoints be the set of newest breakpoints for the next splitmerge
                ## iteration.
                merged_breakpoints.difference_update(remove_breakpoints)
                breakpoints = list(merged_breakpoints)

            breakpoints = sorted(breakpoints)

            ## Did we actually find new breakpoints? If not, then we're done
            ## with splitmerge and can move on to the BIC process.
            enter_BIC = (breakpoints == last_breakpoints)
            iter = iter + 1

        ## Okay wow, we've potentially made it to the BIC stage now... !
        if first not in breakpoints:
            breakpoints.insert(0, first)
        ym_breakpoints = map(imo2iym, breakpoints)
        #print ym_breakpoints

        ## ENTERING MINBIC
        bp_dictionary = dict()

        ####################################
        ##### MULTIPROCESS
        from multiprocessing import Pool

        # 'counter' is a module-level global bumped by the callback each time
        # an asynchronous minbic() job completes.
        global counter
        multi_bp_dict = {}
        counter = 0
        def cb(r):
            global counter
            #print counter, r
            counter += 1

        start = time.clock()
        # A new pool with a hard-coded four workers is created for every pair.
        po = Pool(processes=4)
        # Each interior breakpoint is analyzed together with its two neighbors.
        for left,bp,right in zip(breakpoints[0:], breakpoints[1:], breakpoints[2:]):

            if left != first:
                left = left + 1
            # recall that we only consider data after the first full year. we will be
            # computing regressions with the independent variable indexed from this
            # starting point, so we need to shift these indices. we also need to shift them
            # by +1 if this is any segment beyond the first one, so that we don't include
            # changepoints in more than one analysis.
            # TOTAL_SHIFT = -12 + 1 = -11
            #
            # However, this shift is only necessary while looking at the array indices that
            # we generate using range(). the data should already be aligned correctly.
            total_shift = -12 + 1
            left_shift, bp_shift, right_shift = left+total_shift, bp+total_shift, right+total_shift
            y1, m1 = imo2iym(left)
            yb, mb = imo2iym(bp)
            y2, m2 = imo2iym(right)
            #print "Entering MINBIC - %4d %2d %4d %2d %4d %2d" % (y1, m1, yb,
            #                                                      mb, y2, m2)
            (seg_x, seg_data) = range(left_shift, right_shift+1), diff_data[left:right+1]
            bp_index = bp-left
            #print len(seg_x), len(seg_data), bp_index
            #bp_analysis = minbic(seg_x, seg_data, bp_index, MISS)
            multi_bp_dict[bp] = po.apply_async(minbic,(seg_x,seg_data,bp_index,MISS,),callback=cb)
        po.close()
        po.join()
        # Swap each pending async result for the value it produced.
        for bp in multi_bp_dict:
            r = multi_bp_dict[bp]
            multi_bp_dict[bp] = r.get()
        #print "counter - %d" % counter
        elapsed = (time.clock() - start)
        print "ELAPSED TIME - %2.3e" % elapsed
        #print new_bp_dict

        ####################################
        ##### NORMAL
        ## Serial equivalent of the multiprocess loop above, kept for reference.
        # start = time.clock()
        # for left,bp,right in zip(breakpoints[0:], breakpoints[1:], breakpoints[2:]):
        #
        #     if left != first:
        #         left = left + 1
        #     # (same index-shift reasoning as in the multiprocess loop)
        #     total_shift = -12 + 1
        #     left_shift, bp_shift, right_shift = left+total_shift, bp+total_shift, right+total_shift
        #     y1, m1 = imo2iym(left)
        #     yb, mb = imo2iym(bp)
        #     y2, m2 = imo2iym(right)
        #     print "Entering MINBIC - %4d %2d %4d %2d %4d %2d" % (y1, m1, yb,
        #                                                           mb, y2, m2)
        #     (seg_x, seg_data) = range(left_shift, right_shift+1), diff_data[left:right+1]
        #     bp_index = bp-left
        #     #print len(seg_x), len(seg_data), bp_index
        #     bp_analysis = minbic(seg_x, seg_data, bp_index, MISS)
        #
        #     bp_dictionary[bp] = bp_analysis
        # elapsed2 = (time.clock() - start)
        # print "ELAPSED TIME = %3.2e" % elapsed2

        ####################################
        ## Print the adjustment summaries
        bp_dictionary = multi_bp_dict
        sorted_bps = sorted(bp_dictionary.keys())
        ndelete = []
        valid_bps = {}
        for bp in sorted_bps:
            stats = bp_dictionary[bp]

            cmodel=stats['cmodel']
            iqtype=stats['iqtype']
            asigx=stats['offset']
            azscr=stats['offset_z']
            rslp=stats['slopes']

            end1 = bp
            y_end1, m_end1 = imo2iym(end1)
            beg2 = bp+1
            y_beg2, m_beg2 = imo2iym(beg2)

            # If cmodel is *SLR*, then there is no breakpoint
            if 'SLR' in cmodel:
                print ("%s-%s -- -- MD TESTSEG SKIP: %7.2f %5d %5d %3d %5d %5d %3d" % (id1, id2, asigx, end1, y_end1, m_end1, beg2, y_beg2, m_beg2))
                # Don't store it!
            else:
                print ("%6s-%6s -- -- MD TESTSEG ADJ: %7.2f %7.2f %8.4f %8.4f %5d %5d %3d %5d %5d %3d %2d" % (id1,id2, asigx, azscr, rslp[0], rslp[1], end1, y_end1, m_end1, beg2, y_beg2, m_beg2, iqtype))
                # Store it!
                valid_bps[bp] = stats

        ###############################
        ## Go back and see if we can get rid of some of the change points.
        ## If 2 or more of the chgpts are within MINLEN,
        ##    a) if the chgpt estimates are the same sign, then test each
        ##        singly with same endpoints and keep lowest BIC
        ##    b) if not the same sign,
        ##        retain earliest changepoint
        # NOTE(review): MINLEN is not defined in this function; presumably a
        # module-level constant - confirm.

        # add the first, last to valid_bps
        interior_bps = valid_bps.keys()
        # Add first, last if not already in interior_bps
        for bp in [first, last]:
            if bp not in interior_bps:
                interior_bps.append(bp)
        sorted_bps = sorted(interior_bps)
        # sorted_bps is deliberately mutated inside this loop: rejected
        # breakpoints are removed so that later iterations never visit them.
        for left in sorted_bps:
            print sorted_bps, left
            ## We're looking for the next interim breakpoint that satisfies two
            ## conditions:
            ##    1) at least MINLEN valid data (non-missing to the right)
            ##    2) has at least one breakpoint between 'left' and it
            right = 0
            close_bps = []
            for right in sorted_bps:
                if right <= left:
                    continue
                if not close_bps:
                    close_bps.append(right)
                else:
                    valid_between_bps = diff_data[close_bps[-1]:right]
                    valid_length = len(get_valid_data(valid_between_bps, MISS))
                    print imo2iym(close_bps[-1]),valid_length,imo2iym(right)
                    if valid_length > MINLEN:
                        break
                    close_bps.append(right)
            # We could actually run out of things in sorted_bps, and wind up with
            # right == close_bps[-1]. Detect that and break out of this analysis
            # if that happens.
            # NOTE(review): if no entry of sorted_bps is greater than 'left',
            # close_bps is still empty here and close_bps[-1] raises IndexError.
            if close_bps[-1]==right:
                break

            if left != first:
                left = left + 1
            close_bp_results = {}
            for bp in close_bps:
                # recall that we only consider data after the first full year. we will be
                # computing regressions with the independent variable indexed from this
                # starting point, so we need to shift these indices. we also need to shift them
                # by +1 if this is any segment beyond the first one, so that we don't include
                # changepoints in more than one analysis.
                # TOTAL_SHIFT = -12 + 1 = -11
                #
                # However, this shift is only necessary while looking at the array indices that
                # we generate using range(). the data should already be aligned correctly.
                total_shift = -12 + 1
                left_shift, bp_shift, right_shift = left+total_shift, bp+total_shift, right+total_shift
                y1, m1 = imo2iym(left)
                yb, mb = imo2iym(bp)
                y2, m2 = imo2iym(right)

                print ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>"
                print y1,m1,"-",yb,mb,"-",y2,m2
                print "<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<"

                (seg_x, seg_data) = range(left_shift, right_shift+1), diff_data[left:right+1]
                bp_index = bp-left
                bp_analysis = minbic(seg_x, seg_data, bp_index, MISS, kthslr0_on=True)

                cmodel=bp_analysis['cmodel']
                iqtype= bp_analysis['iqtype']
                offset= bp_analysis['offset']
                rslp= bp_analysis['slopes']
                crit_val = bp_analysis['crit_val']
                test_stat = bp_analysis['test_stat']
                bic = bp_analysis['bic']

                print ("Interim chgpt: %s %4d %2d %4d %2d %4d %2d %8.2f %8.2f %8.2f %8.2f %7.3f %7.3f %2d" % (pair_str, y1, m1, yb, mb, y2, m2, bic, test_stat, crit_val, offset, rslp[0], rslp[1], iqtype))

                close_bp_results[bp] = bp_analysis

            # Now we have a small problem... we might have more than one breakpoint,
            # so we need to choose which one is best. We will check the sign of
            # the breakpoint amplitude changes:
            sign_of_amps = map(sign, [close_bp_results[bp]['offset'] for bp in close_bps])
            positive = lambda x: sign(x) >= 0
            negative = lambda x: sign(x) <= 0
            zero = lambda x: sign(x) == 0
            print "------------>",[close_bp_results[bp]['offset'] for bp in close_bps]

            if (all(map(positive, sign_of_amps)) or all(map(negative, sign_of_amps))):
                # Pick the best (minimum BIC)
                bics = [(bp, close_bp_results[bp]['bic']) for bp in close_bps]
                sorted_bics = sorted(bics, key=operator.itemgetter(1))
                smallest_bp = sorted_bics[0][0]

                # Remove this smallest-bic bp from the in-interval bps
                close_bps.remove(smallest_bp)
                valid_bps[smallest_bp] = close_bp_results[smallest_bp]

                #print "leftovers",close_bps
                for bp in close_bps: # The remaining bps which we will reject
                    sorted_bps.remove(bp) # Remove them from this loop
                    del valid_bps[bp] # Remove them as valid

                yb, mb = imo2iym(smallest_bp)
                print ("Same domain - Lowest Interim: %s %4d %2d" % (pair_str, yb, mb))
            # NOTE(review): this branch looks unreachable - 'positive' accepts
            # a sign of 0, so an all-zero list is already caught by the test
            # above.
            elif (all(map(zero, sign_of_amps))):
                # Choose the earliest changepoint; the rest of these have
                # amplitude changes which are 0.
                first_bp, last_bp = close_bps[0], close_bps[-1]

                # Remove the first interim bp and update valid_bps with this new
                # computation.
                close_bps.remove(first_bp)
                valid_bps[first_bp] = close_bp_results[first_bp]

                # Reject remaining interim bps
                for bp in close_bps:
                    sorted_bps.remove(bp)
                    del valid_bps[bp]

                yb, mb = imo2iym(first_bp)
                print ("Null domain - Earliest Interim : %s %4d %2d" % (pair_str, yb, mb))

            else:
                # We'll use the earliest interim changepoint, but we need
                # to get rid of bad data. Replace all the data between the
                # interim changepoints as missing and re-compute BIC.
                # seg_x and seg_data are the ones left over from the last pass
                # of the close_bps loop above; they span left..right for every
                # bp in that loop, so reusing them here is consistent.
                first_bp, last_bp = close_bps[0], close_bps[-1]
                first_bp_index = first_bp-left
                last_bp_index = last_bp-left

                print len(seg_x), len(seg_data)
                print first_bp_index+1, last_bp_index+1
                print left, bp, right
                for i in range(first_bp_index+1, last_bp_index+1):
                    print i, imo2iym(i), i+left, imo2iym(i+left)
                    seg_x[i] = MISS
                    seg_data[i] = MISS
                    # Recall that seg_data[0] == diff_data[left]. ndelete records
                    # the *true month where there is unviable data*, so it needs to
                    # point back to the original element in diff_data we are
                    # worried about.
                    ndelete.append(i+left)
                bp_analysis = minbic(seg_x, seg_data, first_bp_index, MISS, kthslr0_on=True)

                # Remove the first interim bp and update valid_bps with this new
                # computation.
                close_bps.remove(first_bp)
                valid_bps[first_bp] = bp_analysis

                # Reject remaining interim bps
                for bp in close_bps:
                    sorted_bps.remove(bp)
                    del valid_bps[bp]

                yb, mb = imo2iym(first_bp)
                print ("Diff domain - Earliest Interim : %s %4d %2d" % (pair_str, yb, mb))

        ## Remove changepoints which are an SLR model.
        nspan = [0]*num_months
        bp_count = 1
        # Iterating over a sorted copy of the keys makes it safe to delete
        # from and add to valid_bps inside the loop.
        for bp in sorted(valid_bps.keys()):
            bp_analysis = valid_bps[bp]

            if "SLR" in bp_analysis['cmodel']:
                del valid_bps[bp]
                continue

            print " IN: ",bp
            nspan[bp] = bp_count
            ## If adjacent months are missing next to this breakpoint, then
            ## assume that those could be a breakpoint as well and copy this
            ## breakpoint's analysis results for them.
            for month in range(bp+1, last):
                if (month in ndelete) or (diff_data[month] == MISS):
                    nspan[month] = bp_count
                    print " IN: ",month
                    valid_bps[month] = bp_analysis
                else:
                    break
            bp_count += 1

        # Alongside the integer breakpoint keys, store the deleted months and
        # the per-month span markers under string keys.
        valid_bps['del'] = ndelete
        valid_bps['nspan'] = nspan
        pair_results[pair_str] = valid_bps

        #print "ELAPSED TIMES = %3.2e %3.2e" % (elapsed1, elapsed2)

    print "done"
    ## Persist the results to disk as well as returning them.
    # NOTE(review): the file handle is never closed explicitly, and the file
    # is opened in text mode ('w') rather than binary mode.
    import pickle
    f = open("pair_results", 'w')
    pickle.dump(pair_results, f)
    return pair_results