def find_best_parameter(data):
    # Grid-search the pruning-set size: 10 folds x 9 pruning sizes x 9
    # window rotations = 810 rows of (pruning size, depth, accuracy)
    results = np.empty((810, 3))
    counter = 0
    bins = bin_data(data)
    for i in range(10):
        validation_set = bins[i]
        print("------------------------------")
        print("FOLD {}/{}".format(i + 1, 10))
        print("------------------------------")
        for pb in range(0, 9):
            for k in range(9):
                tree_gen_set = np.empty((0, data.shape[1]))
                pruning_set = np.empty((0, data.shape[1]))
                for j in range(1, 10):
                    # j falls in the circular window of pb bins after k
                    if (k < j <= (pb + k) % 9 or (pb + k) % 9 < k < j
                            or j <= (pb + k) % 9 < k):
                        pruning_set = np.vstack((pruning_set, bins[(i + j) % 10]))
                    else:
                        tree_gen_set = np.vstack((tree_gen_set, bins[(i + j) % 10]))
                tree = decision_tree_learning(tree_gen_set)
                # Prune repeatedly until no pruning step improves accuracy
                while pb > 0 and prune_tree(tree, pruning_set, tree):
                    pass
                recalculate_depth(tree)
                matrix = get_conf_matrix(tree, validation_set)
                results[counter][0] = pb
                results[counter][1] = tree['depth']
                results[counter][2] = classification_rate(matrix)
                counter += 1
            print("Completed assessment for pruning dataset size: {}".format(pb))
    print_best_parameter(results)
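# The circular-window condition above in isolation: for a window start k and
# size pb over 9 slots, j (1..9) is a pruning bin exactly when it falls in
# the pb slots following k, wrapping modulo 9. A quick standalone check:
def in_pruning_window(j, k, pb):
    end = (pb + k) % 9
    return k < j <= end or end < k < j or j <= end < k

# e.g. k=7, pb=4 selects j in {8, 9, 1, 2} (the window wraps past 9)
assert {j for j in range(1, 10) if in_pruning_window(j, 7, 4)} == {8, 9, 1, 2}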
def _get_node_split(node_data, edge_data, bin_bounds):
    flows = edge_data["flows"].values
    # (num_edges, 2) array indicating the two nodes each edge connects
    edge_idcs = edge_data.values[:, :2].astype(int)
    num_edges = len(edge_idcs)
    num_nodes = len(node_data)
    num_bins = len(bin_bounds)

    bin_idcs = bin_data(flows, num_bins, scale="custom", bin_bounds=bin_bounds)
    bin_counts = np.bincount(bin_idcs)
    # Special case: when computing the fraction of edges in the smallest bin,
    # exclude the huge number of zero-valued edges
    bin_counts[0] = np.sum((flows > 0) & (flows < bin_bounds[0]))

    smallest_bin_idx = np.argmin(bin_counts)
    bin_samples, = np.where(bin_idcs == smallest_bin_idx)
    np.random.shuffle(bin_samples)

    test_edge_set, test_node_set = _create_node_set(
        set(), set(), smallest_bin_idx, bin_samples, bin_idcs, edge_idcs,
        num_bins, 0.2 * bin_counts)
    val_edge_set, val_node_set = _create_node_set(
        test_node_set, test_edge_set, smallest_bin_idx, bin_samples,
        bin_idcs, edge_idcs, num_bins, 0.1 * bin_counts)

    # Create the training set from all non-zero-valued edges plus a limited
    # number of zero-valued edges
    non_train_node_set = val_node_set.union(test_node_set)
    train_edge_set = set()
    max_num_zero = 10000  # cap on zero-valued edges to include
    num_zero = 0
    for edge_idx, (flow, edge) in enumerate(zip(flows, edge_idcs)):
        if edge[0] not in non_train_node_set and edge[1] not in non_train_node_set:
            if flow >= 1.0:
                train_edge_set.add(edge_idx)
            elif num_zero < max_num_zero:
                train_edge_set.add(edge_idx)
                num_zero += 1

    # Sanity checks: the three splits must be disjoint
    assert len(test_edge_set.intersection(val_edge_set)) == 0
    assert len(test_edge_set.intersection(train_edge_set)) == 0
    assert len(val_edge_set.intersection(train_edge_set)) == 0
    assert len(test_node_set.intersection(val_node_set)) == 0

    val_node_idcs = np.array(list(val_node_set), dtype=int)
    test_node_idcs = np.array(list(test_node_set), dtype=int)
    val_edge_idcs = np.array(list(val_edge_set), dtype=int)
    test_edge_idcs = np.array(list(test_edge_set), dtype=int)
    train_edge_idcs = np.array(list(train_edge_set), dtype=int)

    return (val_node_idcs, test_node_idcs, val_edge_idcs, test_edge_idcs,
            train_edge_idcs, bin_idcs)
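# bin_data is defined elsewhere in this repo; a hypothetical sketch of the
# custom-bounds variant assumed above (illustrative only, not the real API):
import numpy as np

def bin_data_sketch(values, num_bins, bin_bounds):
    # index of the first bound each value falls below, clipped so every
    # value lands in [0, num_bins - 1]
    return np.clip(np.digitize(values, bin_bounds), 0, num_bins - 1)

# e.g. bounds (1, 10, 100) put flows 0.5, 5, 50, 500 into bins 0, 1, 2, 2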
def cross_validation(dataset):
    # Confusion matrix summed across all 10 folds
    confusion_matrix = np.zeros((4, 4), int)
    # Bin the dataset into 10 folds
    bins = bin_data(dataset)
    # Each bin takes a turn as the validation set
    for v_index, v_bin in enumerate(bins):
        training = np.empty((0, v_bin.shape[1]))
        # Stitch the remaining 9 bins together
        for index, fold in enumerate(bins):
            if index != v_index:
                training = np.vstack((training, fold))
        # Train the decision tree and sum the confusion matrices
        tree = decision_tree_learning(training)
        confusion_matrix += get_conf_matrix(tree, v_bin)
    return confusion_matrix
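# The summed matrix gives an overall classification rate via its diagonal;
# a self-contained check with an illustrative 4x4 matrix:
import numpy as np

cm = np.array([[50, 2, 1, 0],
               [3, 45, 2, 1],
               [0, 4, 48, 2],
               [1, 0, 3, 47]])
accuracy = np.trace(cm) / cm.sum()  # correct predictions / all predictions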
def cross_validation_with_pruning(training_data, test_data, pruning_bins):
    confusion_matrix = np.zeros((4, 4), int)
    bins = bin_data(training_data)
    for i in range(10):
        training_set = np.empty((0, test_data.shape[1]))
        validation_set = np.empty((0, test_data.shape[1]))
        # Rotate which `pruning_bins` bins form the validation set this fold
        for k in range(10):
            if (i + k) % 10 < pruning_bins:
                validation_set = np.vstack((validation_set, bins[k]))
            else:
                training_set = np.vstack((training_set, bins[k]))
        tree = decision_tree_learning(training_set)
        # Prune until no single prune improves validation accuracy
        while prune_tree(tree, validation_set, tree):
            pass
        recalculate_depth(tree)
        confusion_matrix += get_conf_matrix(tree, test_data)
    return confusion_matrix
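# The rotation above in isolation: for fold i, bins k with
# (i + k) % 10 < pruning_bins are held out for pruning. Quick check for
# pruning_bins = 3:
for i in range(3):
    held_out = [k for k in range(10) if (i + k) % 10 < 3]
    print(i, held_out)  # 0 -> [0, 1, 2]; 1 -> [0, 1, 9]; 2 -> [0, 8, 9]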
def _load_bin_data(bin_bounds, edge_labels_unscaled, num_bins, train_idcs,
                   val_idcs, test_idcs):
    # Get edge buckets (assign each edge to a bucket based on the magnitude
    # of its flow)
    edge_buckets = bin_data(edge_labels_unscaled, num_bins, scale="custom",
                            bin_bounds=bin_bounds)

    # Compute per-bucket weights to counterbalance the imbalanced class/bin
    # distribution
    train_bin_weights = class_weight.compute_class_weight(
        class_weight='balanced', classes=np.unique(edge_buckets),
        y=edge_buckets[train_idcs])
    val_bin_weights = class_weight.compute_class_weight(
        class_weight='balanced', classes=np.unique(edge_buckets),
        y=edge_buckets[val_idcs])
    test_bin_weights = class_weight.compute_class_weight(
        class_weight='balanced', classes=np.unique(edge_buckets),
        y=edge_buckets[test_idcs])
    train_bin_weights = train_bin_weights.astype(np.float32)
    val_bin_weights = val_bin_weights.astype(np.float32)
    test_bin_weights = test_bin_weights.astype(np.float32)

    return edge_buckets, train_bin_weights, val_bin_weights, test_bin_weights
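# sklearn's 'balanced' mode weights each class by
# n_samples / (n_classes * bincount(class)), so rare bins get larger weights;
# a self-contained check with illustrative labels:
import numpy as np
from sklearn.utils import class_weight

y = np.array([0, 0, 0, 0, 1, 1, 2])
w = class_weight.compute_class_weight(class_weight='balanced',
                                      classes=np.unique(y), y=y)
# w ~ [0.58, 1.17, 2.33], i.e. 7 / (3 * [4, 2, 1])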
def read_iue(models, lbdarr, wave0, flux0, sigma0, folder_data, folder_fig,
             star, cut_iue_regions, model):
    table = folder_data + str(star) + '/' + 'list_iue.txt'

    # Always (re)generate the list of IUE FITS files for this star
    os.system('ls ' + folder_data + str(star) +
              '/*.FITS | xargs -n1 basename > ' + table)

    iue_list = np.genfromtxt(table, comments='#', dtype='str')
    file_name = np.copy(iue_list)

    # Read wavelength, flux and sigma from every IUE spectrum
    fluxes, waves, errors = [], [], []
    for k in range(len(file_name)):
        file_iue = str(folder_data) + str(star) + '/' + str(file_name[k])
        hdulist = pyfits.open(file_iue)
        tbdata = hdulist[1].data
        wave = tbdata.field('WAVELENGTH') * 1e-4   # Angstrom -> mum
        flux = tbdata.field('FLUX') * 1e4          # erg/cm2/s/A -> erg/cm2/s/mum
        sigma = tbdata.field('SIGMA') * 1e4        # erg/cm2/s/A -> erg/cm2/s/mum

        # Filter out bad data (keep only points with QUALITY flag == 0)
        qualy = tbdata.field('QUALITY')
        idx = np.where(qualy == 0)
        wave = wave[idx]
        sigma = sigma[idx]
        flux = flux[idx]
        hdulist.close()

        fluxes = np.concatenate((fluxes, flux), axis=0)
        waves = np.concatenate((waves, wave), axis=0)
        errors = np.concatenate((errors, sigma), axis=0)

    if not os.path.isdir(folder_fig + str(star)):
        os.mkdir(folder_fig + str(star))

    # --------------------------------------------------------------------
    # Optionally cut the spectrum to the far-UV range plus the middle-UV
    # (2200 bump) region
    if cut_iue_regions:
        wave_lim_min_iue = 0.135
        wave_lim_max_iue = 0.180
        wave_lim_min_bump_iue = 0.20  # alternatives tried: 0.195, 0.210, 0.185
        wave_lim_max_bump_iue = 0.30  # alternatives tried: 0.230, 0.335
        indx = np.where((waves >= wave_lim_min_iue) &
                        (waves <= wave_lim_max_iue))
        indx2 = np.where((waves >= wave_lim_min_bump_iue) &
                         (waves <= wave_lim_max_bump_iue))
        indx3 = np.concatenate((indx, indx2), axis=1)[0]
        waves, fluxes, errors = waves[indx3], fluxes[indx3], errors[indx3]
    else:
        wave_lim_min_iue = min(waves)
        wave_lim_max_iue = 0.300
        indx = np.where((waves >= wave_lim_min_iue) &
                        (waves <= wave_lim_max_iue))
        waves, fluxes, errors = waves[indx], fluxes[indx], errors[indx]

    # Sort by wavelength and rebin to 200 points
    new_wave, new_flux, new_sigma = zip(*sorted(zip(waves, fluxes, errors)))
    nbins = 200
    xbin, ybin, dybin = bin_data(new_wave, new_flux, nbins, exclude_empty=True)

    ordem = xbin.argsort()
    wave = xbin[ordem]
    flux = ybin[ordem]
    sigma = dybin[ordem]

    # Merge with the previously accumulated arrays unless model is 'befavor'
    if model != 'befavor':
        wave = np.hstack([wave0, wave])
        flux = np.hstack([flux0, flux])
        sigma = np.hstack([sigma0, sigma])
        ordem = wave.argsort()
        wave = wave[ordem]
        flux = flux[ordem]
        sigma = sigma[ordem]

    # --------------------------------------------------------------------
    # Select lbdarr to coincide with lbd, then interpolate each model onto
    # the observed wavelengths (linear in log space)
    models_new = np.zeros([len(models), len(wave)])
    if model == 'beatlas' or model == 'aara':
        idx = np.where((wave >= np.min(lbdarr)) & (wave <= np.max(lbdarr)))
        wave = wave[idx]
        flux = flux[idx]
        sigma = sigma[idx]
        models_new = np.zeros([len(models), len(wave)])
    for i in range(len(models)):
        models_new[i, :] = 10.**griddata(np.log(lbdarr), np.log10(models[i]),
                                         np.log(wave), method='linear')

    # To log space
    logF = np.log10(flux)
    dlogF = sigma / flux
    logF_grid = np.log10(models_new)

    return logF, dlogF, logF_grid, wave
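# The regridding above interpolates log10(flux) linearly in log(lambda) and
# returns to linear flux; a minimal 1-D sketch with illustrative values:
import numpy as np
from scipy.interpolate import griddata

lbd_model = np.array([0.1, 0.2, 0.4])        # model wavelengths (mum)
flux_model = np.array([1e-9, 5e-10, 2e-10])  # model fluxes
wave_obs = np.array([0.15, 0.30])            # observed wavelengths (mum)
flux_interp = 10.**griddata(np.log(lbd_model), np.log10(flux_model),
                            np.log(wave_obs), method='linear')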
# plt.axvline(x=14.5, c='black')
# plt.axvline(x=13.5, c='black')
# plt.colorbar(orientation="vertical")
# rem = np.array([True if r < 1.02 and r > 0.97 else False for r in fluxes])
# ax2 = plt.subplot2grid((2,1),(0,0))
# plt.scatter(times[rem], xcenters[rem], c='black', s=0.2, marker='.', cmap='jet')
# plt.ylabel('x-centroid', fontsize=16)
# ax3 = plt.subplot2grid((2,1),(1,0), sharex=ax2)
# plt.scatter(times[rem], ycenters[rem], c='black', s=0.2, marker='.', cmap='jet')
# plt.xlabel('Time (days)', fontsize=16)
# plt.ylabel('y-centroid', fontsize=16)
# plt.show()

# Bin every series down by a factor of 50
times = utils.bin_data(times, 50)
xcenters = utils.bin_data(xcenters, 50)
ycenters = utils.bin_data(ycenters, 50)
fluxes = utils.bin_data(fluxes, 50)
flux_errs = utils.bin_data(flux_errs, 50)
flux_errs = flux_errs / 50
bf_full_model = utils.bin_data(bf_full_model, 50)
bf_transit_model = utils.bin_data(bf_transit_model, 50)
phase = utils.bin_data(phase, 50)

plt.subplot2grid((2, 1), (0, 0))
plt.scatter(times, xcenters, s=0.5, c='blue')
plt.ylim(14, 16)
plt.ylabel('x-centroid', fontsize=16)
plt.subplot2grid((2, 1), (1, 0))
plt.scatter(times, ycenters, s=0.5, c='blue')
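# utils.bin_data is defined elsewhere; the calls above suggest it block-
# averages a series by the given factor. A hypothetical sketch of that
# behaviour (illustrative only, not the repo's actual implementation):
import numpy as np

def bin_series_sketch(x, binsize):
    n = (len(x) // binsize) * binsize           # drop any ragged tail
    return np.asarray(x)[:n].reshape(-1, binsize).mean(axis=1)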
import configparser

import numpy as np

from evaluation import interpret_matrix, normalise_matrix
from training import (cross_validation, cross_validation_with_pruning,
                      decision_tree_learning, prune_tree, recalculate_depth)
from utils import bin_data, dimension_frequency
from visualisation import visualise_tree

if __name__ == "__main__":
    # First load the config
    config = configparser.ConfigParser()
    config.read('config.ini')

    # Then load the dataset
    dataset = np.loadtxt(config['Settings']['dataset'])

    # Section off one bin (200/2000 rows) as the held-out test set for the
    # final evaluation
    bins = bin_data(dataset)
    test_set = bins[0]

    # Stitch the rest of the bins back together
    training_data = np.empty((0, dataset.shape[1]))
    for i in bins[1:]:
        training_data = np.vstack((training_data, i))

    # get_best_parameter(training_data)

    # Run the evaluation
    print("--------------------------------------------------------------------")
    print("Performing simple cross validation on the dataset (without pruning):")
    print("--------------------------------------------------------------------")
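    # A plausible continuation, hedged: the imports above suggest the script
    # goes on to evaluate and display the summed confusion matrix, roughly:
    #
    #     matrix = cross_validation(training_data)
    #     interpret_matrix(normalise_matrix(matrix))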
# plt.ylabel('y-centroid', fontsize=16)
# plt.axhline(y=15.5, c='black')
# plt.axvline(x=15.5, c='black')
# plt.axhline(y=14.5, c='black')
# plt.axvline(x=14.5, c='black')
# plt.colorbar(orientation="vertical")
# ax2 = plt.subplot2grid((2,2),(0,0))
# plt.scatter(phase[rem], xcenters[rem], c=residuals[rem], s=1, marker='o', cmap='jet')
# plt.ylabel('x-centroid', fontsize=16)
# ax3 = plt.subplot2grid((2,2),(1,0))
# plt.scatter(phase[rem], ycenters[rem], c=residuals[rem], s=1, marker='o', cmap='jet')
# plt.xlabel('Phase', fontsize=16)
# plt.ylabel('y-centroid', fontsize=16)
# plt.show()

times = utils.bin_data(times, 100)
xcenters = utils.bin_data(xcenters, 100)
ycenters = utils.bin_data(ycenters, 100)
fluxes = utils.bin_data(fluxes, 100)
flux_errs = utils.bin_data(flux_errs, 100)
bf_full_model = utils.bin_data(bf_full_model, 100)
bf_transit_model = utils.bin_data(bf_transit_model, 100)
phase = utils.bin_data(phase, 100)
bf_sensitivity_map = utils.bin_data(bf_sensitivity_map, 100)

plt.subplot2grid((2, 1), (0, 0))
# plt.plot(times, fluxes, linewidth=1, color='black', alpha=1)
plt.plot(times, bf_full_model, color='red', linewidth=2)
plt.errorbar(times, fluxes, yerr=flux_errs, ecolor='blue', elinewidth=0.3,
             fmt='none')
plt.scatter(times, fluxes, color='black', s=7)
def background_map(self):
    ixr = self.ix_radial
    # BCG index
    i = self.iclose

    # No background subtraction if too few galaxies
    if self.Ngal <= 2:
        self.Ngal_c = self.Ngal
        print(color('Background -- Not enough galaxies found in cluster', 31, 5))
        return

    # Store radially ordered
    r = self.dist2BCG[ixr] * 60.0  # in arcmin
    Lr = self.Lr[ixr]  # we work in the r-band, as in Reyes et al.

    # Bin the Ngal/Lum data in log spacing
    n = 10
    rbin = mklogarray(0.0, r.max(), n)
    Nbin, rcenter = histo(r, rbin, center='yes')
    Lbin, rcenter = bin_data(r, Lr, rbin, center='yes')

    # Compute the area in each shell
    ir = numpy.indices(rbin.shape)[0]
    ir1 = ir[:-1]
    ir2 = ir[1:]
    r1 = rbin[ir1]
    r2 = rbin[ir2]
    abin = math.pi * (r2**2 - r1**2)
    PN = old_div(Nbin, abin)  # number surface density

    # Compute the background median density in both Lum and Ngal. Here is
    # where we make a couple of maps to compute the background areas.
    R1 = 3.0 * self.r1Mpc * 60.0
    R2 = r.max()  # go all the way out
    print("# Estimating Background @ r > 3mpc -- %.2f - %.2f [arcmin]" % (R1, R2))
    PN_bgr = PN[rcenter > R1]

    # The mean of the Ngal and Lr profiles gives the correction per arcmin^2
    PN_mean = numpy.mean(PN_bgr)
    print('\tmean number of BG galaxies -- {}'.format(PN_mean))

    # Total number in the background area
    N_bgr = PN_bgr.sum()

    # Get the area of the background: make a 'blank' boolean image the same
    # size as the input image, then sum over the pixels that are in or out
    # of the cluster region.
    # Cluster location
    a, b = round(self.x_image[self.iclose]), round(self.y_image[self.iclose])
    # Size of the image
    npix = self.jpg_array.shape[0]
    # Cluster radius in arcseconds converted to pixels
    rpix = R1 * 60 / self.pixscale
    # Create the pixel grid
    y, x = numpy.ogrid[-a:npix - a, -b:npix - b]
    # Mask the cluster region
    mask = x * x + y * y <= rpix * rpix
    # Create a new 'bool' image; the cluster region becomes False (zero)
    img_array = numpy.ones((npix, npix), dtype='bool')
    img_array[mask] = False
    # Summing the background region gives its pixel count; multiply by the
    # pixel scale and convert to arcminutes to get the total area.
    area_bgr = img_array.sum() * self.pixscale / 60

    # Get the correction for the number of galaxies and luminosity.
    # For R200 we need to recompute R200 and N200 based on the new R200 value.
    area_r1Mpc = math.pi * (self.r1Mpc * 60.)**2  # in arcmin2
    # Use the inverse of the cluster mask to find the cluster area
    area_r1mpc = (npix**2 - img_array.sum()) * self.pixscale / 60

    self.Ngal_c = self.Ngal - PN_mean * area_r1Mpc
    if self.Ngal_c < 0:
        self.Ngal_c = 0.0
    self.d_Ngal_c2 = self.Ngal_c + ((old_div(area_r1Mpc, area_bgr))**2) * N_bgr
    # Avoid sqrt of a negative number
    if self.d_Ngal_c2 < 0:
        self.d_Ngal_c = 0
    else:
        self.d_Ngal_c = math.sqrt(
            self.Ngal_c + ((old_div(area_r1Mpc, area_bgr))**2) * N_bgr)
    return
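# The masking trick above in miniature: ogrid builds broadcastable index
# vectors, so x*x + y*y <= r*r marks a filled circle without explicit loops.
# Self-contained check on a tiny grid:
import numpy as np

npix, cx, cy, radius = 8, 4, 4, 3
y, x = np.ogrid[-cx:npix - cx, -cy:npix - cy]
outside = np.ones((npix, npix), dtype=bool)
outside[x * x + y * y <= radius * radius] = False
background_pixels = outside.sum()  # pixels outside the circular mask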
def background(self):
    ixr = self.ix_radial

    # No background subtraction if too few galaxies
    if self.Ngal <= 2:
        self.Ngal_c = self.Ngal
        print(color('Background -- Not enough galaxies found in cluster', 31, 5))
        return

    # Store radially ordered
    r = self.dist2BCG[ixr] * 60.0  # in arcmin
    Lr = self.Lr[ixr]  # we work in the r-band, as in Reyes et al.

    # Bin the Ngal/Lum data in log spacing
    n = 10
    rbin = mklogarray(0.0, r.max(), n)
    Nbin, rcenter = histo(r, rbin, center='yes')
    Lbin, rcenter = bin_data(r, Lr, rbin, center='yes')

    # Compute the area in each shell
    ir = numpy.indices(rbin.shape)[0]
    ir1 = ir[:-1]
    ir2 = ir[1:]
    r1 = rbin[ir1]
    r2 = rbin[ir2]
    abin = math.pi * (r2**2 - r1**2)
    PN = old_div(Nbin, abin)  # number surface density

    # Compute the background median density in both Lum and Ngal,
    # between 4.0 and 9.0 x r1Mpc
    R1 = 4.0 * self.r1Mpc * 60.0
    R2 = 9.0 * self.r1Mpc * 60.0
    print("# Estimating Background between R1,R2 %.2f--%.2f [arcmin]" % (R1, R2))
    if R2 >= r.max():
        print(color('\tBackground R2 > image limits! -- recomputing', 31, 0))
        R2 = r2.max()
        R1 = R2 - 2.0 * self.r1Mpc * 60.0
        print("# Estimating Background between R1,R2 %.2f--%.2f [arcmin]" % (R1, R2))
    PN_bgr = PN[land(rcenter > R1, rcenter < R2)]

    # The mean of the Ngal and Lr profiles gives the correction per arcmin^2
    PN_mean = numpy.mean(PN_bgr)
    print('\tmean number of BG galaxies -- {}'.format(PN_mean))

    # Total number and area of the background annulus
    N_bgr = PN_bgr.sum()
    area_bgr = math.pi * (R2**2 - R1**2)

    # Get the correction for the number of galaxies and luminosity.
    # For R200 we need to recompute R200 and N200 based on the new R200 value.
    area_r1Mpc = math.pi * (self.r1Mpc * 60.)**2  # in arcmin2
    self.Ngal_c = self.Ngal - PN_mean * area_r1Mpc
    if self.Ngal_c < 0:
        self.Ngal_c = 0.0

    self.d_Ngal_c2 = self.Ngal_c + ((old_div(area_r1Mpc, area_bgr))**2) * N_bgr
    # Avoid sqrt of a negative number
    if self.d_Ngal_c2 < 0:
        self.d_Ngal_c = 0
    else:
        self.d_Ngal_c = math.sqrt(
            self.Ngal_c + ((old_div(area_r1Mpc, area_bgr))**2) * N_bgr)
    return
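# The variance above follows Poisson error propagation: with the area ratio
# f = area_r1Mpc / area_bgr scaling the background counts,
#     Var(Ngal_c) ~ Ngal_c + f**2 * N_bgr
# since Var(a*X) = a**2 * Var(X) and Var(X) ~ X for Poisson counts.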