def get_elbow(components, partial_mask):
    # Find the "elbow": the index of the first eigenimage whose angular autocorrelation
    # (at the first nonzero angular lag) drops below 0.5.
    masked_eigenimgs = reshape_unmasked_values_to_shots(
        components.astype(np.float64), partial_mask)
    qs = np.linspace(0, 1, 1)
    dc = DiffCorr(masked_eigenimgs, q_values=qs, k=0)
    eigenimg_ac = dc.autocorr()[:, :, 1]

    if np.abs(eigenimg_ac[0, 0]) < 0.5:
        cutoff = 1
    else:
        # fall back to keeping all components if none drops below the threshold
        cutoff = eigenimg_ac.shape[0]
        for ii, aa in enumerate(eigenimg_ac[:, 0]):
            if np.abs(aa) >= 0.5:
                continue
            else:
                cutoff = ii
                break
    return cutoff
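# --- Illustrative sketch (not part of the original scripts): one plausible way get_elbow
# could feed the PCA cutoff after fitting, reusing the names `Train` and `partial_mask`
# set up in the blocks below. The variable names here are hypothetical.
from sklearn.decomposition import PCA

pca = PCA(n_components=50, whiten=False)
new_Train = pca.fit_transform(Train)                        # project shots onto the eigenimages
num_pca_cutoff = get_elbow(pca.components_, partial_mask)   # first eigenimage that decorrelates
print('using the first %d components as the noise model' % num_pca_cutoff)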
def pair_diff_PI(norm_shots, mask_corr, phi_offset=0, pair_method='int'):
    if pair_method == 'corr':
        print("doing corr pairing...")
        # dummy qs
        num_phi = norm_shots.shape[-1]
        qs = np.array([1.0])
        dc = DiffCorr(norm_shots, qs, 0, pre_dif=True)
        corr = dc.autocorr()
        corr /= mask_corr
        corr = corr[:, :, phi_offset:num_phi // 2 - phi_offset]
        eps = distance.cdist(corr[:, 0], corr[:, 0], metric='euclidean')
    if pair_method == 'int':
        print("doing intensity pair...")
        eps = distance.cdist(norm_shots[:, 0], norm_shots[:, 0], metric='euclidean')

    # do this so the diagonals are not the minimum, i.e. don't pair shot with itself
    epsI = 1.1 * eps.max(1) * np.identity(eps.shape[0])
    eps += epsI

    shot_preference = np.roll(eps.argsort(1), 1, axis=1)
    pref_dict = {str(E[0]): list(E[1:]) for E in shot_preference.astype(str)}

    print("stable roommate pair....")
    pairs_dict = stable.stableroomate(prefs=pref_dict)
    pairing = np.array(MakeTagPairs._remove_duplicate_pairs(pairs_dict))

    print("computing difference intensities...")
    diff_norm = np.zeros(
        (norm_shots.shape[0] // 2, norm_shots.shape[1], norm_shots.shape[-1]),
        dtype=np.float64)
    for index, pp in enumerate(pairing):
        diff_norm[index] = norm_shots[pp[0]] - norm_shots[pp[1]]
    return diff_norm, pairing
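# --- Illustrative sketch (not part of the original scripts): a minimal call pattern for
# pair_diff_PI, assuming a binary polar mask `this_mask` of shape (1, num_phi) and
# normalized shots `norm_shots` of shape (num_shots, 1, num_phi), as in the blocks below.
import numpy as np

qs_dummy = np.array([1.0])
mask_dc = DiffCorr(this_mask[None, :, :].astype(float), qs_dummy, 0, pre_dif=True)
mask_corr = mask_dc.autocorr()      # mask autocorrelation used to normalize shot correlations
diff_PI, pairing = pair_diff_PI(norm_shots, mask_corr, phi_offset=10, pair_method='int')
# diff_PI holds one difference profile per stable-roommate pair; pairing maps pairs to shot indices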
        print("skipping run %d" % run)
        continue
    if 'run%d' % run in f_out.keys():
        print("already seen this run, skip!")
        continue

    ##### load the mask used for this run
    f_mask = h5py.File(os.path.join(args.mask_dir, 'run%d.tbl' % run), 'r')
    mask = f_mask['polar_mask_binned'].value
    mask = (mask == mask.max())
    mask.shape

    # do the mask cor
    qs = np.linspace(0, 1, mask.shape[0])
    dc = DiffCorr(mask[None, :, :], qs, 0, pre_dif=True)
    mask_cor = dc.autocorr().mean(0)
    f_mask.close()

    f_out.create_group('run%d' % run)

    all_ave_cors = []
    all_nums = []
    for qidx in range(35):
        print('run%d q%d' % (run, qidx))
        if 'num_pca_cutoff2' in f['q%d' % qidx].keys():
            pca_num = f['q%d' % qidx]['num_pca_cutoff2'].value
        else:
        print('Chunk %d: denoising with PCA critical num_pca_components = %d...' % (n_chunk, num_pca))
        # get back the masked images and components
        masked_mean_train = reshape_unmasked_values_to_shots(Train, partial_mask).mean(0)

        # denoise
        Train_noise = new_Train[:, :num_pca].dot(components[:num_pca])
        denoise_Train = reshape_unmasked_values_to_shots(
            Train - Train_noise - Train.mean(0)[None, :], partial_mask)
        if denoise_Train.shape[0] % 2 > 0:
            denoise_Train = denoise_Train[:-1]
        denoise_Train = denoise_Train[:-1][::2] - denoise_Train[1:][::2]

        dc = DiffCorr(denoise_Train, qvalues, 0, pre_dif=True)
        Train_difcor = dc.autocorr().mean(0)[0]

        all_corrs.append(Train_difcor)
        all_nums.append(Train.shape[0])
        f_out.create_dataset('q%d/dif_cor%d' % (qidx, n_chunk), data=Train_difcor)

    all_corrs = np.array(all_corrs)
    all_nums = np.array(all_nums)
    # print all_corrs.shape
    # print all_nums
    ave_corr = np.sum(all_corrs * (all_nums / float(all_nums.sum()))[:, None], axis=0)
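# --- Illustrative sketch (not part of the original scripts): the denoise-and-difference step
# above in isolation. The "noise" is the projection of each shot onto the first num_pca
# principal components; subtracting it (and the mean) leaves a residual that is then
# differenced between consecutive shots before correlating. All names here are hypothetical.
import numpy as np
from sklearn.decomposition import PCA

def pca_denoise_and_pair(shots_2d, num_pca):
    """shots_2d: (num_shots, num_unmasked_pixels) array of normalized, masked shots."""
    pca = PCA(n_components=num_pca, whiten=False)
    coeffs = pca.fit_transform(shots_2d)                     # per-shot projections
    noise = coeffs.dot(pca.components_)                      # rank-num_pca reconstruction
    residual = shots_2d - noise - shots_2d.mean(0)[None, :]  # remove the noise model and the mean
    if residual.shape[0] % 2 > 0:                            # need an even number of shots to pair
        residual = residual[:-1]
    return residual[::2] - residual[1::2]                    # difference consecutive shots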
    print('corr PCA clustering for qidx %d' % qidx)
    f_out.create_group('q%d' % qidx)
    shots = PI[:, qidx, :][:, None, :]
    this_mask = mask[qidx][None, :]

    norm_shots = np.zeros_like(shots)
    for idx, ss in enumerate(shots):
        norm_shots[idx] = normalize_shot(ss, this_mask)

    print("computing single shot correlations")
    phi_offset = 10
    num_phi = norm_shots.shape[-1]
    qs = np.array([1.0])
    dc = DiffCorr(this_mask[None, :, :], qs, 0, pre_dif=True)
    mask_corr = dc.autocorr()

    dc = DiffCorr(norm_shots, qs, 0, pre_dif=True)
    corr = dc.autocorr()
    corr /= mask_corr
    corr = corr[:, :, phi_offset:num_phi // 2 - phi_offset]

    pca = PCA(n_components=args.num_pca)
    new_corr = pca.fit_transform(corr[:, 0, :])

    kmeans = KMeans(n_clusters=args.num_clusters)
    kmeans.fit(new_corr)

    f_out.create_dataset('q%d/cluster_labels' % qidx, data=kmeans.labels_)
    norm_shots = np.zeros_like(shots)
    for idx, ss in enumerate(shots):
        norm_shots[idx] = normalize_shot(ss, this_mask)
    # do we want to normalize by the entire range of intensity?

    # divide into Train and test
    num_shots = norm_shots.shape[0]
    cutoff = int(num_pro_shots * 0.1)  # use 10% of the protein shots as testing set
    partial_mask = this_mask.copy()
    Train = norm_shots[cutoff:, partial_mask == 1]
    Test = norm_shots[:cutoff, partial_mask == 1]
    print("%d test shots" % (Test.shape[0]))
    print("%d train shots" % (Train.shape[0]))

    qvalues = np.linspace(0, 1, partial_mask.shape[0])
    mask_dc = DiffCorr(partial_mask, qvalues, 0, pre_dif=True)
    mask_cor = mask_dc.autocorr()

    if args.num_pca is None:
        num_pca = int(num_pca_components[qidx])
        max_pca = num_pca + 5
    else:
        num_pca = args.num_pca + 1
        max_pca = args.num_pca + 1

    print('denoising with PCA critical num_pca_components = %d...' % num_pca)
    if 'pca_components' not in f_out[q_group].keys():
        # if there is no pca component saved, then run it and save the components
        pca = PCA(n_components=50, whiten=False)
        new_Train = pca.fit_transform(Train)
        new_Test = pca.transform(Test)
    print("skipping run %d" % run_num)
    sys.exit()

out_file2 = run_file.replace('.tbl', '_chunks_intershot_uncertainty.h5')
f_out2 = h5py.File(os.path.join(save_dir, out_file2), 'w')

if 'polar_mask_binned' in f.keys():
    mask = np.array(
        f['polar_mask_binned'].value == f['polar_mask_binned'].value.max(),
        dtype=int)
else:
    print("there is no mask stored with the shots")
    sys.exit()
# mask = np.load('/reg/d/psdm/cxi/cxilp6715/results/shared_files/binned_pmask_basic.npy')

# do the mask cor
qs = np.linspace(0, 1, mask.shape[0])
dc = DiffCorr(mask[None, :, :], qs, 0, pre_dif=True)
mask_cor = dc.autocorr().mean(0)

PI = f['polar_imgs']

# filter by photon energy: if the photon energy of a shot is not within 100 eV of the average, do not use it
photon_energy = np.nan_to_num(f['ebeam']['photon_energy'].value)
mean_E = photon_energy.mean()
E_sigma = 100.
# keep only shots whose photon energy lies within +/- E_sigma of the mean
shot_tage_to_keep = np.where(
    (photon_energy > (mean_E - E_sigma)) & (photon_energy < (mean_E + E_sigma)))[0]
print('Num of shots to be used: %d' % (shot_tage_to_keep.size))

# figure out which qs are used for pairing
qmin = args.qmin
qmax = args.qmax
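# --- Illustrative sketch (not part of the original scripts): applying the photon-energy
# selection above when reading shots out of the polar-image dataset. h5py fancy indexing
# wants increasing indices, which np.where already returns; `qidx` is hypothetical here.
shots = PI[list(shot_tage_to_keep), qidx, :][:, None, :]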
    norm_shots = np.zeros_like(shots)
    for idx, ss in enumerate(shots):
        norm_shots[idx] = normalize_shot(ss, this_mask)
    # do we want to normalize by the entire range of intensity?

    # divide into Train and test
    num_shots = norm_shots.shape[0]
    cutoff = int(num_shots * 0.1)  # use 10% of the shots as testing set
    partial_mask = this_mask.copy()
    Train = norm_shots[cutoff:, partial_mask == 1]
    Test = norm_shots[:cutoff, partial_mask == 1]
    print("%d test shots" % (Test.shape[0]))
    print("%d train shots" % (Train.shape[0]))

    qvalues = np.linspace(0, 1, partial_mask.shape[0])
    mask_dc = DiffCorr(partial_mask, qvalues, 0, pre_dif=True)
    mask_cor = mask_dc.autocorr()

    max_pca_components = []
    max_pca_components.append(int(num_pca_components[qidx]))
    if 'num_pca_cutoff' in f_out[q_group].keys():
        max_pca_components.append(f_out[q_group]['num_pca_cutoff'].value)
    max_pca_components = list(set(max_pca_components))

    if 'pca_components' not in f_out[q_group].keys():
        # if there is no pca component saved, then run it and save the components
        pca = PCA(n_components=50, whiten=False)
        new_Train = pca.fit_transform(Train)
        new_Test = pca.transform(Test)
        masked_mean_train = reshape_unmasked_values_to_shots(Train, partial_mask).mean(0)

        #### this is just for saving to get error bars
        if save_cors:
            grp = f_out['q%d' % qidx]
            nn = grp['num_pca_cutoff'].value
            if 'all_difcors' in grp['pca%d' % nn].keys():
                print('already saved dif cors for this cutoff (%d) at q%d' % (nn, qidx))
            else:
                Train_noise = new_Train[:, :nn].dot(components[:nn])
                denoise_Train = reshape_unmasked_values_to_shots(
                    norm_shots - Train_noise - Train.mean(0)[None, :], partial_mask)
                dc = DiffCorr(denoise_Train, qvalues, 0, pre_dif=False)
                Train_difcor = dc.autocorr()
                f_out.create_dataset('%s/pca%d/all_train_difcors' % (cluster_group, nn),
                                     data=Train_difcor)
            del norm_shots
            continue

        # denoise
        for nn in range(1, max_pca):
            pca_group = '%s/pca%d' % (cluster_group, nn)
            if 'pca%d' % nn in f_out[cluster_group].keys():
                print("pca denoise at pca n_components = %d is already done. Skip!" % nn)
                continue
    norm_shots = np.zeros_like(shots)
    for idx, ss in enumerate(shots):
        norm_shots[idx] = normalize_shot(ss, this_mask)
    # do we want to normalize by the entire range of intensity?

    # divide into Train and test
    num_shots = norm_shots.shape[0]
    cutoff = int(num_shots * 0.1)  # use 10% of the shots as testing set
    partial_mask = this_mask.copy()
    Train = norm_shots[cutoff:, partial_mask == 1]
    Test = norm_shots[:cutoff, partial_mask == 1]
    print("%d test shots" % (Test.shape[0]))
    print("%d train shots" % (Train.shape[0]))

    qvalues = np.linspace(0, 1, partial_mask.shape[0])
    mask_dc = DiffCorr(partial_mask, qvalues, 0, pre_dif=True)
    mask_cor = mask_dc.autocorr()

    num_pca = int(num_pca_components[qidx])
    if num_pca > 0:
        # do PCA stuff
        print('denoising with PCA num_pca_components = %d...' % num_pca)
        pca = PCA(n_components=num_pca, whiten=False)
        new_Train = pca.fit_transform(Train)
        new_Test = pca.transform(Test)

        # get back the masked images and components
        components = pca.components_
        masked_mean_train = reshape_unmasked_values_to_shots(Train, partial_mask).mean(0)
        masked_mean_test = reshape_unmasked_values_to_shots(Test, partial_mask).mean(0)
        # get back the masked images and components
        masked_mean_train = reshape_unmasked_values_to_shots(
            Train, partial_mask).mean(0)

        # denoise
        Train_noise = new_Train[:, :num_pca].dot(components[:num_pca])
        denoise_Train = reshape_unmasked_values_to_shots(
            Train - Train_noise - Train.mean(0)[None, :], partial_mask)
        if denoise_Train.shape[0] % 2 > 0:
            denoise_Train = denoise_Train[:-1]
        denoise_Train_diff = denoise_Train[:-1][::2] - denoise_Train[1:][::2]

        dc = DiffCorr(denoise_Train_diff, qvalues, 0, pre_dif=True)
        Train_difcor = dc.autocorr().mean(0)[0]

        all_corrs.append(Train_difcor)
        all_nums.append(Train.shape[0])
        f_out.create_dataset('q%d/dif_cor%d' % (qidx, n_chunk), data=Train_difcor)

        ######### do clustering with corr PCA clustering #########
        print("computing single shot correlations")
        phi_offset = 10
        num_phi = denoise_Train.shape[-1]

        mask_dc = DiffCorr(partial_mask, qvalues, 0, pre_dif=True)
        mask_cor = mask_dc.autocorr()
        #### this is just for saving to get error bars
        if args.save:
            grp = f_out['q%d' % qidx]
            nn = grp['num_pca_cutoff'].value
            if 'all_test_difcors' in grp['pca%d' % nn].keys():
                print('already saved dif cors for this cutoff (%d) at q%d' % (nn, qidx))
            else:
                Test_noise = new_Test[:, :nn].dot(components[:nn])
                denoise_Test = reshape_unmasked_values_to_shots(
                    Test - Test_noise - Test.mean(0)[None, :], partial_mask)
                Train_noise = new_Train[:, :nn].dot(components[:nn])
                denoise_Train = reshape_unmasked_values_to_shots(
                    Train - Train_noise - Train.mean(0)[None, :], partial_mask)

                dc = DiffCorr(denoise_Train, qvalues, 0, pre_dif=False)
                Train_difcor = dc.autocorr()
                dc = DiffCorr(denoise_Test, qvalues, 0, pre_dif=False)
                Test_difcor = dc.autocorr()

                f_out.create_dataset('q%d/pca%d/all_test_difcors' % (qidx, nn), data=Test_difcor)
                f_out.create_dataset('q%d/pca%d/all_train_difcors' % (qidx, nn), data=Train_difcor)

            del shots
            del norm_shots
            continue
        # get back the masked images and components
        masked_mean_train = reshape_unmasked_values_to_shots(
            Train, partial_mask).mean(0)

        # denoise
        Train_noise = new_Train[:, :num_pca].dot(components[:num_pca])
        denoise_Train = reshape_unmasked_values_to_shots(
            Train - Train_noise - Train.mean(0)[None, :], partial_mask)
        if denoise_Train.shape[0] % 2 > 0:
            denoise_Train = denoise_Train[:-1]
        denoise_Train = denoise_Train[:-1][::2] - denoise_Train[1:][::2]

        dc = DiffCorr(denoise_Train, qvalues, 0, pre_dif=True)
        Train_difcor = dc.autocorr().mean(0)[0]

        all_corrs.append(Train_difcor)
        all_nums.append(Train.shape[0])
        f_out.create_dataset('q%d/dif_cor%d' % (qidx, n_chunk), data=Train_difcor)

    all_corrs = np.array(all_corrs)
    all_nums = np.array(all_nums)
    # print all_corrs.shape
    # print all_nums
    ave_corr = np.sum(all_corrs * (all_nums / float(all_nums.sum()))[:, None], axis=0)
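# --- Note (not part of the original scripts): the chunk averaging above is a weighted mean,
#
#     ave_corr = sum_i corr_i * n_i / sum_j n_j
#
# so chunks holding more shots contribute proportionally more to the run-level correlation.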
    norm_shots = np.zeros_like(shots)
    for idx, ss in enumerate(shots):
        norm_shots[idx] = normalize_shot(ss, this_mask)
    # do we want to normalize by the entire range of intensity?

    # divide into Train and test
    num_shots = norm_shots.shape[0]
    cutoff = int(num_shots * 0.1)  # use 10% of the shots as testing set
    partial_mask = this_mask.copy()
    Train = norm_shots[cutoff:, partial_mask == 1]
    Test = norm_shots[:cutoff, partial_mask == 1]
    print("%d test shots" % (Test.shape[0]))
    print("%d train shots" % (Train.shape[0]))

    qvalues = np.linspace(0, 1, partial_mask.shape[0])
    mask_dc = DiffCorr(partial_mask, qvalues, 0, pre_dif=True)
    mask_cor = mask_dc.autocorr()

    if args.num_pca > 0:
        # do PCA stuff
        print('denoising with PCA...')
        pca = PCA(n_components=args.num_pca, whiten=False)
        new_Train = pca.fit_transform(Train)
        new_Test = pca.transform(Test)

        # get back the masked images and components
        components = pca.components_
        masked_mean_train = reshape_unmasked_values_to_shots(
            Train, partial_mask).mean(0)
        masked_mean_test = reshape_unmasked_values_to_shots(
            Test, partial_mask).mean(0)
# output file to save data
cluster_file = run_file.replace('.tbl', '_PCA-cluster.h5')
f_cluster = h5py.File(os.path.join(cluster_dir, cluster_file), 'r')
cluster_set_keys = f_cluster.keys()

out_file = run_file.replace('.tbl', '_cor.h5')
f_out = h5py.File(os.path.join(save_dir, out_file), 'w')

if 'polar_mask_binned' in f.keys():
    mask = np.array(
        f['polar_mask_binned'].value == f['polar_mask_binned'].value.max(),
        dtype=int)
else:
    mask = np.load('/reg/d/psdm/cxi/cxilp6715/scratch/water_data/binned_pmask_basic.npy')

qs = np.linspace(0.2, 0.88, mask.shape[0])
dc = DiffCorr(mask[None, :, :], qs, 0, pre_dif=True)
mask_ac = dc.autocorr()

PI = f['polar_imgs']
shot_tags = np.arange(0, PI.shape[0])

for set_key in cluster_set_keys:
    print("computing diff cor for %s..." % set_key)
    qidx = int(set_key.split('q')[1])
    labels = f_cluster[set_key]['cluster_labels'].value.astype(int)
    f_out.create_group(set_key)

    unique_labels = np.unique(labels)
    cluster_corrs = []
    cluster_num_shots = []
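# --- Illustrative sketch (not part of the original scripts): once cluster_corrs and
# cluster_num_shots are filled in for a q index, the per-cluster correlations can be
# combined with the same shot-count weighting used for the chunked correlations above.
# Purely a sketch of that combination step.
import numpy as np

cluster_corrs = np.array(cluster_corrs)
cluster_num_shots = np.array(cluster_num_shots, dtype=float)
weights = cluster_num_shots / cluster_num_shots.sum()
ave_cluster_corr = np.sum(cluster_corrs * weights[:, None], axis=0)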
    print('corr PCA clustering for qidx %d' % qidx)
    f_out.create_group('q%d' % qidx)
    shots = PI[:, qidx, :][:, None, :]
    this_mask = mask[qidx][None, :]

    norm_shots = np.zeros_like(shots)
    for idx, ss in enumerate(shots):
        norm_shots[idx] = normalize_shot(ss, this_mask)

    print("computing single shot correlations")
    phi_offset = 10
    num_phi = norm_shots.shape[-1]
    qs = np.array([1.0])
    dc = DiffCorr(this_mask[None, :, :], qs, 0, pre_dif=True)
    mask_corr = dc.autocorr()

    dc = DiffCorr(norm_shots, qs, 0, pre_dif=True)
    corr = dc.autocorr()
    corr /= mask_corr
    corr = corr[:, :, phi_offset:num_phi // 2 - phi_offset]

    pca = PCA(n_components=args.num_pca)
    new_corr = pca.fit_transform(corr[:, 0, :])

    kmeans = KMeans(n_clusters=args.num_clusters)
    kmeans.fit(new_corr)
f = h5py.File(os.path.join(data_dir, run_file), 'r')

# output file to save data
out_file = run_file.replace('.tbl', '_pca0.h5')
f_out = h5py.File(os.path.join(save_dir, out_file), 'w')

if 'polar_mask_binned' in f.keys():
    mask = np.array(
        f['polar_mask_binned'].value == f['polar_mask_binned'].value.max(),
        dtype=int)
else:
    print("there is no mask stored with the shots")
    sys.exit()
# mask = np.load('/reg/d/psdm/cxi/cxilp6715/results/shared_files/binned_pmask_basic.npy')

qs = np.linspace(0, 1, mask.shape[0])
dc = DiffCorr(mask[None, :, :], qs, 0, pre_dif=True)
mask_cor = dc.autocorr().mean(0)

PI = f['polar_imgs']

# filter by photon energy: if the photon energy of a shot is not within 100 eV of the average, do not use it
photon_energy = np.nan_to_num(f['ebeam']['photon_energy'].value)
mean_E = photon_energy.mean()
E_sigma = 100.
# keep only shots whose photon energy lies within +/- E_sigma of the mean
shot_tage_to_keep = np.where(
    (photon_energy > (mean_E - E_sigma)) & (photon_energy < (mean_E + E_sigma)))[0]
print('Num of shots to be used: %d' % (shot_tage_to_keep.size))

# figure out which qs are used for pairing
qmin = args.qmin
qmax = args.qmax
        if 'num_pca_cutoff2' in grp.keys():
            nn = grp['num_pca_cutoff2'].value
        else:
            nn = grp['num_pca_cutoff'].value

        if 'all_test_difcors' in grp['pca%d' % nn].keys():
            print('already saved dif cors for this cutoff (%d) at q%d' % (nn, qidx))
        else:
            Test_noise = new_Test[:, :nn].dot(components[:nn])
            denoise_Test = reshape_unmasked_values_to_shots(
                Test - Test_noise - Test.mean(0)[None, :], partial_mask)
            Train_noise = new_Train[:, :nn].dot(components[:nn])
            denoise_Train = reshape_unmasked_values_to_shots(
                Train - Train_noise - Train.mean(0)[None, :], partial_mask)

            dc = DiffCorr(denoise_Train, qvalues, 0, pre_dif=False)
            Train_difcor = dc.autocorr()
            dc = DiffCorr(denoise_Test, qvalues, 0, pre_dif=False)
            Test_difcor = dc.autocorr()

            f_out.create_dataset('q%d/pca%d/all_test_difcors' % (qidx, nn), data=Test_difcor)
            f_out.create_dataset('q%d/pca%d/all_train_difcors' % (qidx, nn), data=Train_difcor)

        del norm_shots
    else:
    norm_shots = np.zeros_like(shots)
    for idx, ss in enumerate(shots):
        norm_shots[idx] = normalize_shot(ss, this_mask)
    # do we want to normalize by the entire range of intensity?

    # divide into Train and test
    num_shots = norm_shots.shape[0]
    cutoff = int(num_shots * 0.1)  # use 10% of the shots as testing set
    partial_mask = this_mask.copy()
    Train = norm_shots[cutoff:, partial_mask == 1]
    Test = norm_shots[:cutoff, partial_mask == 1]
    print("%d test shots" % (Test.shape[0]))
    print("%d train shots" % (Train.shape[0]))

    qvalues = np.linspace(0, 1, partial_mask.shape[0])
    mask_dc = DiffCorr(partial_mask, qvalues, 0, pre_dif=True)
    mask_cor = mask_dc.autocorr()

    if args.num_pca is None:
        num_pca = int(num_pca_components[qidx])
        max_pca = num_pca + 5
    else:
        num_pca = args.num_pca + 1
        max_pca = args.num_pca + 1

    print('denoising with PCA critical num_pca_components = %d...' % num_pca)
    if 'pca_components' not in f_out[q_group].keys():
        # if there is no pca component saved, then run it and save the components
        pca = PCA(n_components=50, whiten=False)
        new_Train = pca.fit_transform(Train)
        new_Test = pca.transform(Test)