def average_positions(filenames, chi2cutoff=1.15, write=True, plot=1):
    """Filter and average over positions in a capillary.

    Clusters the curves read from `filenames` by mutual chi-squared
    distance, averages the largest cluster, and (optionally) writes the
    result plus provenance metadata to a .clu.ydat file.

    Keyword arguments:
    `chi2cutoff` : chisq threshold passed to cluster_reps().
    `write` : write the averaged output file, if True.
    `plot` : plot the clustering result, if true.

    Returns the mean stack of the accepted (clustered) curves.
    """
    filenames.sort()
    stack = stack_datafiles(filenames)
    incinds, cdm, links = cluster_reps(stack, threshold=chi2cutoff, plot=plot)
    ms = mean_stack(stack[incinds, ...])
    # Positions not in the accepted cluster are recorded as discarded.
    # (A plain range() would not support remove() on Python 3.)
    disinds = [i for i in range(len(filenames)) if i not in incinds]
    included = [[filenames[i], md5_file(filenames[i])] for i in incinds]
    discarded = [[filenames[i], md5_file(filenames[i])] for i in disinds]
    ad = {
        'chi2cutoff': float(chi2cutoff),
        'included': included,
        'discarded': discarded,
        # Materialize as lists: map() returns a lazy iterator on Python 3,
        # which would serialize incorrectly in the output metadata.
        'chi2matrix': [float(x) for x in cdm],
        'incinds': [int(i) for i in incinds],
        'linkage': [[float(x) for x in ll] for ll in links],
    }
    # Rows: q, I, Ierr (cluster mean), I/Ierr of the first curve,
    # I/Ierr of the mean over all curves.
    outarr = np.zeros((7, ms.shape[1]))
    outarr[0:3, :] = ms
    outarr[3:5, :] = stack[0, 1:3, :]
    outarr[5:7, :] = mean_stack(stack)[1:3, :]
    if write:
        fname = filenames[0]
        # Output name is derived from the first file, truncated at '.p'.
        fname = "%s.clu.ydat" % fname[:(fname.find('.p'))]
        print(fname)
        write_ydat(outarr, fname, addict=ad,
                   cols=['q', 'I', 'Ierr', 'I_first', 'Ierr_first',
                         'I_all', 'Ierr_all'])
    return ms
def filter_matfile(fname, outstem, p_reject=0.001, plot=1):
    """Filter outlier repetitions at each position in a .mat file.

    For every position in `fname`, rejects outlier repetitions with
    filter_outliers() using a chi-squared cutoff derived from
    `p_reject`, then writes the averaged curve and provenance metadata
    to '<outstem>.pNN.out.ydat'.

    Keyword arguments:
    `p_reject` : probability of rejecting a valid repetition.
    `plot` : plot each filtering result, if true.
    """
    stack = read_mat(fname)
    md5 = md5_file(fname)
    print("Rejection probability: %0.3g" % p_reject)
    # Number of q-channels containing data (non-NaN) in the first curve.
    N = np.sum(np.logical_not(np.isnan(stack[0, 0, 1, :])))
    print("Number of valid channels: %d" % N)
    # Reduced chi-squared threshold corresponding to `p_reject`.
    threshold = chi2.ppf(1.0 - p_reject, N) / N
    print("Chisq rejection threshold: %0.3g" % threshold)
    for pos in range(stack.shape[0]):
        reps = stack[pos, ...]
        incinds, cdm = filter_outliers(reps, threshold=threshold, plot=plot)
        ms = mean_stack(reps[incinds, ...])
        # list() is required: range objects have no remove() on Python 3.
        disinds = list(range(reps.shape[0]))
        for i in incinds:
            disinds.remove(i)
        print("Pos %d, discarded: %s" % (pos, str(disinds)))
        ad = {
            'chi2cutoff': float(threshold),
            'rejection_prob': float(p_reject),
            # Lists, not map objects: map() is lazy on Python 3 and would
            # serialize incorrectly in the output metadata.
            'incinds': [int(i) for i in incinds],
            'disinds': [int(i) for i in disinds],
            'chi2matrix': [float(x) for x in cdm],
            'method': "filter_outliers",
            'inputfile': [fname, md5],
            'inputposition': int(pos),
            'q~unit': '1/nm',
            'I~unit': 'arb.',
            'Ierr~unit': 'arb.',
            'I_first~unit': 'arb.',
            'Ierr_first~unit': 'arb.',
            'I_all~unit': 'arb.',
            'Ierr_all~unit': 'arb.',
        }
        # Rows: filtered mean (q/I/Ierr), first repetition (I/Ierr),
        # unfiltered mean over all repetitions (I/Ierr).
        outarr = np.zeros((7, ms.shape[1]))
        outarr[0:3, :] = ms
        outarr[3:5, :] = reps[0, 1:3, :]
        outarr[5:7, :] = mean_stack(reps)[1:3, :]
        outname = "%s.p%02d.out.ydat" % (outstem, pos)
        print(outname)
        write_ydat(outarr, outname, addict=ad,
                   cols=['q', 'I', 'Ierr', 'I_first', 'Ierr_first',
                         'I_all', 'Ierr_all'],
                   attributes=['~unit'])
def filter_matfile(fname, outstem):
    """Point-filter every position in `fname` and write .fil.ydat files.

    NOTE(review): this redefines `filter_matfile` and shadows the earlier
    function of the same name in this module — confirm which definition
    is intended to survive.
    """
    stack = read_mat(fname)
    npos = stack.shape[0]
    for pos in range(npos):
        print("File: %s, pos %d" % (fname, pos))
        sys.stdout.flush()
        reps = stack[pos, ...]
        first = reps[0, ...]
        aver = mean_stack(reps)
        filt, inds = chifilter_points(reps)
        outname = "%s.p%02d.fil.ydat" % (outstem, pos)
        write_filtered(filt, first, aver, inds, outname,
                       os.path.basename(fname), pos)
        print(outname)
def cluster_reps(reps, threshold=1.0, plot=1):
    """Cluster repetitions by mutual chi-squared distance.

    Returns a tuple of
    - the indices of the largest cluster found,
    - the condensed distance matrix,
    - the cluster linkage.

    Keyword arguments:
    `threshold` : chisq threshold to use in discrimination.
    `plot` : Plot results, if True.
    """
    cdm = chi2cdm(reps)
    links = hc.linkage(cdm, method='complete')
    clusters = filter_with_linkage(links, threshold)
    print("Clusters: %s" % str(clusters))
    biggest = clusters[0]
    if plot:
        plot_clustering(mean_stack(reps[biggest, ...]), reps[0, ...],
                        mean_stack(reps), biggest, cdm, links, threshold)
    return (biggest, cdm, links)
def filter_outliers(reps, threshold=1.0, plot=1):
    """Filter by removing repetitions having mutual chisq above `threshold`.

    Returns a tuple containing the included indices and the condensed
    distance matrix.

    Repetitions are removed iteratively by checking which repetition
    contributes the largest number of over-the-threshold chi-squared
    values (outliers) in the chisq-distance matrix, and removing that
    point. If two repetitions cause an equal number of outliers, the
    repetition which has the highest chisq distance to a non-outlier
    distance matrix point is removed.
    """
    cdm = chi2cdm(reps)
    keep = filter_distmat(squareform(cdm), threshold)
    if plot:
        plot_outliers(mean_stack(reps[keep, ...]), reps[0, ...],
                      mean_stack(reps), keep, cdm, threshold)
    return keep, cdm
def chifilter_points(reps, chi2cutoff=1.1, winhw=25, plot=0):
    """Return an average of repetitions statistically similar to the first.

    Array of repetitions `reps` has the shape (nreps, q/I/Ierr, len(q))
    and contains nreps curves with the q-scale and errors. The q-scales
    must be identical in all repetitions.

    Repetitions are compared to the first one point by point. The chi**2
    between the first measurement and a repetition is calculated on an
    interval centered on the compared point with half-width `winhw`.
    Points which have chi**2 > `chi2cutoff` are discarded from the
    averaging.

    Returns a tuple (filt, incmap) where `filt` has shape (3, len(q))
    holding q, the filtered mean I, and the propagated error, and
    `incmap` is the boolean inclusion map of shape (nreps, len(q)).
    """
    nreps = reps.shape[0]
    qlen = reps.shape[2]
    # np.bool was removed in NumPy 1.24; the builtin bool is equivalent.
    incmap = np.zeros((nreps, qlen), dtype=bool)
    # The first repetition is the reference, so it is always included.
    incmap[0, :] = True

    def chi2wfilt(x, y, pos, winhw=winhw):
        # Windowed chi**2 acceptance test centered on channel `pos`,
        # clamped to the valid channel range at the edges.
        ind = slice(max(0, pos - winhw), min(qlen, pos + winhw + 1))
        return chivectors(x[:, ind], y[:, ind]) < chi2cutoff

    first = reps[0, ...]
    for rep in range(1, nreps):
        for qind in range(qlen):
            incmap[rep, qind] = chi2wfilt(first, reps[rep, ...], qind)

    filt = np.zeros((3, qlen))
    filt[0, :] = first[0, :]
    for qind in range(qlen):
        inc = incmap[:, qind]
        filt[1, qind] = np.mean(reps[inc, 1, qind])
        N = np.sum(inc)
        # Propagate the errors of the included points: sqrt(sum(err^2)) / N.
        filt[2, qind] = np.sqrt(np.sum(np.square(reps[inc, 2, qind]))) / N
    if plot:
        aver = mean_stack(reps)
        plot_filtered(filt, first, aver, incmap, figno=plot)
    return (filt, incmap)