Example #1
def find_best_parameter(data):
    results = np.empty((810, 3))
    counter = 0
    bins = bin_data(data)
    for i in range(10):
        validation_set = bins[i]
        print("------------------------------")
        print("FOLD {}/{}".format(i + 1, 10))
        print("------------------------------")
        for pb in range(0, 9):
            for k in range(9):
                tree_gen_set = np.empty((0, data.shape[1]))
                pruning_set = np.empty((0, data.shape[1]))
                for j in range(1, 10):
                    if k < j <= (pb + k) % 9 or (pb + k) % 9 < k < j or j <= (
                            pb + k) % 9 < k:
                        pruning_set = np.vstack(
                            (pruning_set, bins[(i + j) % 10]))
                    else:
                        tree_gen_set = np.vstack(
                            (tree_gen_set, bins[(i + j) % 10]))
                tree = decision_tree_learning(tree_gen_set)
                while pb > 0 and prune_tree(tree, pruning_set, tree):
                    pass
                recalculate_depth(tree)
                matrix = get_conf_matrix(tree, validation_set)

                results[counter][0] = pb
                results[counter][1] = tree['depth']
                results[counter][2] = classification_rate(matrix)
                counter += 1

            print(
                "Completed assessment for pruning dataset size: {}".format(pb))
    print_best_parameter(results)
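Examples #1, #3, #4 and #8 all call bin_data(dataset) with no explicit bin count, so the helper presumably shuffles the rows and splits them into ten equal folds. A minimal sketch of that assumed behaviour (the project's real utility may seed or stratify differently):

import numpy as np

def bin_data_sketch(dataset, n_bins=10, seed=0):
    # Shuffle the rows, then split them into n_bins roughly equal folds.
    rng = np.random.default_rng(seed)
    shuffled = dataset[rng.permutation(len(dataset))]
    return np.array_split(shuffled, n_bins)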
Example #2
def _get_node_split(node_data, edge_data, bin_bounds):
    flows = edge_data["flows"].values
    edge_idcs = edge_data.values[:, :2].astype(
        int)  # E x 2 array giving the two nodes each edge connects
    num_edges = len(edge_idcs)
    num_nodes = len(node_data)
    num_bins = len(bin_bounds)
    bin_idcs = bin_data(flows, num_bins, scale="custom", bin_bounds=bin_bounds)
    bin_counts = np.bincount(bin_idcs)
    bin_counts[0] = np.sum(
        (flows > 0) & (flows < bin_bounds[0])
    )  # Special case: exclude the huge number of zero-valued edges when sizing the smallest bin
    smallest_bin_idx = np.argmin(bin_counts)
    bin_samples, = np.where(bin_idcs == smallest_bin_idx)
    np.random.shuffle(bin_samples)

    test_edge_set, test_node_set = _create_node_set(set(), set(),
                                                    smallest_bin_idx,
                                                    bin_samples, bin_idcs,
                                                    edge_idcs, num_bins,
                                                    0.2 * bin_counts)
    val_edge_set, val_node_set = _create_node_set(test_node_set, test_edge_set,
                                                  smallest_bin_idx,
                                                  bin_samples, bin_idcs,
                                                  edge_idcs, num_bins,
                                                  0.1 * bin_counts)

    # Create training set by selecting all non-zero-valued edges and a limited
    # number of zero-valued edges
    non_train_node_set = val_node_set.union(test_node_set)
    train_edge_set = set()
    max_num_zero = 10000  # include limited number of zero-valued edges
    num_zero = 0
    for edge_idx, (flow, edge) in enumerate(zip(flows, edge_idcs)):
        if (edge[0] not in non_train_node_set
                and edge[1] not in non_train_node_set):
            if flow >= 1.0:
                train_edge_set.add(edge_idx)
            elif num_zero < max_num_zero:
                train_edge_set.add(edge_idx)
                num_zero += 1

    assert len(test_edge_set.intersection(val_edge_set)) == 0
    assert len(test_edge_set.intersection(train_edge_set)) == 0
    assert len(val_edge_set.intersection(train_edge_set)) == 0
    assert len(test_node_set.intersection(val_node_set)) == 0

    val_node_idcs = np.array(list(val_node_set), dtype=int)
    test_node_idcs = np.array(list(test_node_set), dtype=int)
    val_edge_idcs = np.array(list(val_edge_set), dtype=int)
    test_edge_idcs = np.array(list(test_edge_set), dtype=int)
    train_edge_idcs = np.array(list(train_edge_set), dtype=int)
    return (val_node_idcs, test_node_idcs, val_edge_idcs, test_edge_idcs,
            train_edge_idcs, bin_idcs)
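The rarest-bin bookkeeping above leans on np.bincount and np.where; a tiny worked example of that pattern:

import numpy as np

bin_idcs = np.array([0, 2, 2, 1, 2, 0, 1, 2])
bin_counts = np.bincount(bin_idcs)                     # array([2, 2, 4])
smallest_bin_idx = np.argmin(bin_counts)               # 0 (first of the tied bins)
bin_samples, = np.where(bin_idcs == smallest_bin_idx)  # array([0, 5])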
Example #3
def cross_validation(dataset):
    # create confusion_matrix
    confusion_matrix = np.zeros((4, 4), int)
    # bin the dataset (into 10)
    bins = bin_data(dataset)
    # each bin takes turns becoming the validation set
    for v_index, v_bin in enumerate(bins):
        training = np.empty((0, v_bin.shape[1]))
        # and stitch the remaining 9 bins together
        for index, data_bin in enumerate(bins):
            if index != v_index:
                training = np.vstack((training, data_bin))
        # train the decision tree and sum the confusion matrices
        tree = decision_tree_learning(training)
        confusion_matrix += get_conf_matrix(tree, v_bin)
    return confusion_matrix
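A hedged usage sketch: assuming the dataset rows end in an integer class label (as the other examples here suggest), the summed confusion matrix gives an overall 10-fold classification rate:

matrix = cross_validation(dataset)
accuracy = np.trace(matrix) / matrix.sum()  # correct predictions over all predictions
print("10-fold classification rate: {:.3f}".format(accuracy))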
Example #4
def cross_validation_with_pruning(training_data, test_data, pruning_bins):
    confusion_matrix = np.zeros((4, 4), int)
    bins = bin_data(training_data)

    for i in range(10):
        training_set = np.empty((0, test_data.shape[1]))
        validation_set = np.empty((0, test_data.shape[1]))
        for k in range(10):
            if (i + k) % 10 < pruning_bins:
                validation_set = np.vstack((validation_set, bins[k]))
            else:
                training_set = np.vstack((training_set, bins[k]))

        tree = decision_tree_learning(training_set)
        while prune_tree(tree, validation_set, tree):
            pass
        recalculate_depth(tree)
        confusion_matrix += get_conf_matrix(tree, test_data)
    return confusion_matrix
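A hypothetical call, assuming one bin has been held out as test_set and the remaining bins stacked into training_data (the pattern shown in Example #8):

matrix = cross_validation_with_pruning(training_data, test_set, pruning_bins=2)
print(matrix)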
Example #5
def _load_bin_data(bin_bounds, edge_labels_unscaled, num_bins, train_idcs,
                   val_idcs, test_idcs):
    # Get edge buckets (assign each edge to a bucket based on the magnitude of
    # its flow)
    edge_buckets = bin_data(edge_labels_unscaled,
                            num_bins,
                            scale="custom",
                            bin_bounds=bin_bounds)
    # Compute weights for each bucket to counterbalance the imbalanced
    # class/bin distribution
    train_bin_weights = class_weight.compute_class_weight(
        'balanced', classes=np.unique(edge_buckets),
        y=edge_buckets[train_idcs])
    val_bin_weights = class_weight.compute_class_weight(
        'balanced', classes=np.unique(edge_buckets),
        y=edge_buckets[val_idcs])
    test_bin_weights = class_weight.compute_class_weight(
        'balanced', classes=np.unique(edge_buckets),
        y=edge_buckets[test_idcs])
    train_bin_weights = train_bin_weights.astype(np.float32)
    val_bin_weights = val_bin_weights.astype(np.float32)
    test_bin_weights = test_bin_weights.astype(np.float32)
    return edge_buckets, train_bin_weights, val_bin_weights, test_bin_weights
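For reference, a small standalone example of scikit-learn's 'balanced' class weights as used above, with hypothetical bucket labels; rarer buckets receive proportionally larger weights:

import numpy as np
from sklearn.utils import class_weight

buckets = np.array([0, 0, 0, 1, 1, 2])
weights = class_weight.compute_class_weight(
    'balanced', classes=np.unique(buckets), y=buckets)
# n_samples / (n_classes * bincount) -> approximately [0.67, 1.0, 2.0]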
Example #6
def read_iue(models, lbdarr, wave0, flux0, sigma0, folder_data,
             folder_fig, star, cut_iue_regions, model):

    table = folder_data + str(star) + '/' + 'list_iue.txt'

    # os.chdir(folder_data + str(star) + '/')
    # Regenerate the list of IUE FITS files available for this star
    os.system('ls ' + folder_data + str(star) +
              '/*.FITS | xargs -n1 basename >' +
              folder_data + str(star) + '/' + 'list_iue.txt')
    iue_list = np.genfromtxt(table, comments='#', dtype='str')
    file_name = np.copy(iue_list)

    fluxes, waves, errors = [], [], []

    for k in range(len(file_name)):
        file_iue = str(folder_data) + str(star) + '/' + str(file_name[k])
        hdulist = pyfits.open(file_iue)
        tbdata = hdulist[1].data
        wave = tbdata.field('WAVELENGTH') * 1e-4  # Angstrom -> micron
        flux = tbdata.field('FLUX') * 1e4  # erg/cm2/s/A -> erg/cm2/s/micron
        sigma = tbdata.field('SIGMA') * 1e4  # erg/cm2/s/A -> erg/cm2/s/micron

        # Filter out bad data
        qualy = tbdata.field('QUALITY')
        idx = np.where((qualy == 0))
        wave = wave[idx]
        sigma = sigma[idx]
        flux = flux[idx]

        fluxes = np.concatenate((fluxes, flux), axis=0)
        waves = np.concatenate((waves, wave), axis=0)
        errors = np.concatenate((errors, sigma), axis=0)

    if os.path.isdir(folder_fig + str(star)) is False:
        os.mkdir(folder_fig + str(star))

# ------------------------------------------------------------------------------
    # Would you like to cut the spectrum?
    if cut_iue_regions is True:
        wave_lim_min_iue = 0.135
        wave_lim_max_iue = 0.180

        # Do you want to select a range to middle UV? (2200 bump region)
        wave_lim_min_bump_iue = 0.20  # 0.200 #0.195  #0.210 / 0.185
        wave_lim_max_bump_iue = 0.30  # 0.300 #0.230  #0.300 / 0.335

        indx = np.where(((waves >= wave_lim_min_iue) &
                         (waves <= wave_lim_max_iue)))
        indx2 = np.where(((waves >= wave_lim_min_bump_iue) &
                          (waves <= wave_lim_max_bump_iue)))
        indx3 = np.concatenate((indx, indx2), axis=1)[0]
        waves, fluxes, errors = waves[indx3], fluxes[indx3], errors[indx3]

    else:
        wave_lim_min_iue = min(waves)
        wave_lim_max_iue = 0.300
        indx = np.where(((waves >= wave_lim_min_iue) &
                         (waves <= wave_lim_max_iue)))
        waves, fluxes, errors = waves[indx], fluxes[indx], errors[indx]

    new_wave, new_flux, new_sigma = \
        zip(*sorted(zip(waves, fluxes, errors)))

    nbins = 200
    xbin, ybin, dybin = bin_data(new_wave, new_flux, nbins,
                                 exclude_empty=True)

    ordem = xbin.argsort()
    wave = xbin[ordem]
    flux = ybin[ordem]
    sigma = dybin[ordem]

    if model != 'befavor':
        wave = np.hstack([wave0, wave])
        flux = np.hstack([flux0, flux])
        sigma = np.hstack([sigma0, sigma])

        ordem = wave.argsort()
        wave = wave[ordem]
        flux = flux[ordem]
        sigma = sigma[ordem]

# ------------------------------------------------------------------------------
    # select lbdarr to coincide with lbd
    models_new = np.zeros([len(models), len(wave)])
    if model == 'beatlas' or model == 'aara':
        idx = np.where((wave >= np.min(lbdarr)) & (wave <= np.max(lbdarr)))
        wave = wave[idx]
        flux = flux[idx]
        sigma = sigma[idx]
        models_new = np.zeros([len(models), len(wave)])

    for i in range(len(models)):
        models_new[i, :] = 10.**griddata(np.log(lbdarr),
                                         np.log10(models[i]),
                                         np.log(wave), method='linear')
    # to log space
    logF = np.log10(flux)
    dlogF = sigma / flux
    logF_grid = np.log10(models_new)

    return logF, dlogF, logF_grid, wave
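Here bin_data(new_wave, new_flux, nbins, exclude_empty=True) is a different, three-return-value variant that rebins a spectrum. A minimal sketch of what it presumably does (mean flux and scatter per wavelength bin; the project's own routine may weight or clip differently):

import numpy as np

def bin_data_sketch(x, y, nbins, exclude_empty=True):
    x, y = np.asarray(x), np.asarray(y)
    edges = np.linspace(x.min(), x.max(), nbins + 1)
    which = np.digitize(x, edges[1:-1])  # bin index of every sample
    xbin, ybin, dybin = [], [], []
    for b in range(nbins):
        sel = which == b
        if exclude_empty and not sel.any():
            continue
        xbin.append(x[sel].mean())
        ybin.append(y[sel].mean())
        dybin.append(y[sel].std())
    return np.array(xbin), np.array(ybin), np.array(dybin)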
Example #7
#plt.axvline(x=14.5, c='black')
#plt.axvline(x=13.5, c='black')
#plt.colorbar(orientation="vertical")
#rem = np.array([True if r < 1.02 and r > 0.97 else False for r in fluxes])
#ax2 = plt.subplot2grid((2,1),(0,0))
#plt.scatter(times[rem], xcenters[rem], c='black', s=0.2, marker='.', cmap='jet')
#plt.ylabel('x-centroid', fontsize=16)
#
#
#ax3 = plt.subplot2grid((2,1),(1,0),sharex=ax2)
#plt.scatter(times[rem], ycenters[rem], c='black', s=0.2, marker='.', cmap='jet')
#plt.xlabel('Time (days)', fontsize=16)
#plt.ylabel('y-centroid', fontsize=16)
#plt.show()
#
times = utils.bin_data(times, 50)
xcenters = utils.bin_data(xcenters, 50)
ycenters = utils.bin_data(ycenters, 50)
fluxes = utils.bin_data(fluxes, 50)
flux_errs = utils.bin_data(flux_errs, 50)
flux_errs = flux_errs / 50
bf_full_model = utils.bin_data(bf_full_model, 50)
bf_transit_model = utils.bin_data(bf_transit_model, 50)
phase = utils.bin_data(phase, 50)

plt.subplot2grid((2, 1), (0, 0))
plt.scatter(times, xcenters, s=0.5, c='blue')
plt.ylim(14, 16)
plt.ylabel('x-centroid', fontsize=16)
plt.subplot2grid((2, 1), (1, 0))
plt.scatter(times, ycenters, s=0.5, c='blue')
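utils.bin_data(array, 50) here looks like simple block averaging of a light curve. A hypothetical sketch of that behaviour (the real helper may handle the trailing remainder or the error bars differently):

import numpy as np

def bin_data_sketch(arr, n):
    # Average every n consecutive samples, dropping any leftover tail.
    arr = np.asarray(arr, dtype=float)
    m = (len(arr) // n) * n
    return arr[:m].reshape(-1, n).mean(axis=1)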
Example #8
import configparser

import numpy as np

from evaluation import interpret_matrix, normalise_matrix
from training import cross_validation, cross_validation_with_pruning, decision_tree_learning, prune_tree, recalculate_depth
from utils import bin_data, dimension_frequency
from visualisation import visualise_tree

if __name__ == "__main__":
    # First load config
    config = configparser.ConfigParser()
    config.read('config.ini')

    # Then load dataset
    dataset = np.loadtxt(config['Settings']['dataset'])

    # Section off one bin (200/2000) to be used as the test/final evaluation
    bins = bin_data(dataset)
    test_set = bins[0]
    # Stitch the rest of the bins back together
    training_data = np.empty((0, dataset.shape[1]))
    for i in bins[1:]:
        training_data = np.vstack((training_data, i))

    # get_best_parameter(training_data)

    # Run the cross-validation experiments and report results
    print(
        "--------------------------------------------------------------------")
    print(
        "Performing simple cross validation on the dataset (without pruning):")
    print(
        "--------------------------------------------------------------------")
Example #9
# plt.ylabel('y-centroid', fontsize=16)
# plt.axhline(y=15.5, c='black')
# plt.axvline(x=15.5, c='black')
# plt.axhline(y=14.5, c='black')
# plt.axvline(x=14.5, c='black')
# plt.colorbar(orientation="vertical")
# ax2 = plt.subplot2grid((2,2),(0,0))
# plt.scatter(phase[rem], xcenters[rem], c=residuals[rem], s=1, marker='o', cmap='jet')
# plt.ylabel('x-centroid', fontsize=16)
# ax3 = plt.subplot2grid((2,2),(1,0))
# plt.scatter(phase[rem], ycenters[rem], c=residuals[rem], s=1, marker='o', cmap='jet')
# plt.xlabel('Phase', fontsize=16)
# plt.ylabel('y-centroid', fontsize=16)
# plt.show()

times = utils.bin_data(times, 100)
xcenters = utils.bin_data(xcenters, 100)
ycenters = utils.bin_data(ycenters, 100)
fluxes = utils.bin_data(fluxes, 100)
flux_errs = utils.bin_data(flux_errs, 100)
bf_full_model = utils.bin_data(bf_full_model, 100)
bf_transit_model = utils.bin_data(bf_transit_model, 100)
phase = utils.bin_data(phase, 100)
bf_sensitivity_map = utils.bin_data(bf_sensitivity_map, 100)


plt.subplot2grid((2, 1), (0, 0))
# plt.plot(times, fluxes, linewidth=1, color='black', alpha=1)
plt.plot(times, bf_full_model, color='red', linewidth=2)
plt.errorbar(times, fluxes, yerr=flux_errs, ecolor='blue', elinewidth=0.3, fmt='none')
plt.scatter(times, fluxes, color='black', s=7)
Example #10
def background_map(self):
    ixr = self.ix_radial

    # bcg index
    i = self.iclose

    # No background subtraction
    if self.Ngal <= 2:
        self.Ngal_c = self.Ngal
        print(color('Background -- Not enough galaxies found in cluster', 31, 5))
        return

    # Store radially ordered
    r = self.dist2BCG[ixr] * 60.0  # in arcmin
    Lr = self.Lr[ixr]  # We do this in the r-band, as in Reyes et al.

    # Bin the Ngal/Lum data in log spacing
    n = 10
    rbin = mklogarray(0.0, r.max(), n)
    Nbin, rcenter = histo(r, rbin, center='yes')
    Lbin, rcenter = bin_data(r, Lr, rbin, center='yes')

    # Compute the area in each shell
    ir = numpy.indices(rbin.shape)[0]
    ir1 = ir[:-1]
    ir2 = ir[1:]
    r1 = rbin[ir1]
    r2 = rbin[ir2]
    abin = math.pi * (r2**2 - r1**2)
    PN = old_div(Nbin, abin)  # Number Surface density

    # Compute the background median density in both Lum and Ngal.
    # Here is where we make a couple of maps to compute the areas
    # for the background.
    R1 = 3.0 * self.r1Mpc * 60.0
    R2 = r.max() # go all the way out
    print("# Estimating Background @ r > 3mpc -- %.2f - %.2f [arcmin]" % (R1, R2))

    PN_bgr = PN[rcenter > R1]

    # Get the mean values for the Ngal and Lr profiles, which will
    # be the correction per arcmin^2
    PN_mean = numpy.mean(PN_bgr)
    print('\tmean number of BG galaxies -- {}'.format(PN_mean))

    # Total number in background area
    N_bgr = PN_bgr.sum()

    # get the area of the background. We'll make a 'blank' image the same size as
    # our input image and then sum over the pixels that are either in or out of
    # the cluster region.
    # cluster location
    a, b = round(self.x_image[self.iclose]), round(self.y_image[self.iclose])
    # size of the image
    n = self.jpg_array.shape[0]
    # cluster radius: R1 (arcmin) -> arcsec -> pixels
    r = R1 * 60 / self.pixscale

    # create pixel grid
    y, x = numpy.ogrid[-a:n - a, -b:n - b]
    # mask the cluster region
    mask = x*x + y*y <= r*r
    # create new 'bool' image
    img_array = numpy.ones((n, n), dtype='bool')
    # the cluster region becomes 'false' or zero
    img_array[mask] = False

    # Summing the background region gives its pixel count; multiply by the
    # pixel scale and convert to arcminutes to get the total area.
    area_bgr = img_array.sum() * self.pixscale / 60

    # Get the correction for the number of galaxies and luminosity.
    # For R200 we need to recompute R200 and N200 based on the new
    # R200 value.
    area_r1Mpc = math.pi * (self.r1Mpc * 60.)**2  # in arcmin2
    # use the inverse of the cluster mask to find the cluster area
    area_r1mpc = (n**2 - img_array.sum()) * self.pixscale / 60

    self.Ngal_c = self.Ngal - PN_mean * area_r1Mpc
    if self.Ngal_c < 0:
        self.Ngal_c = 0.0

    self.d_Ngal_c2 = self.Ngal_c + (
        (old_div(area_r1Mpc, area_bgr))**2) * N_bgr

    # Avoid sqrt of negative number
    if self.d_Ngal_c2 < 0:
        self.d_Ngal_c = 0
    else:
        self.d_Ngal_c = math.sqrt(self.Ngal_c + (
            (old_div(area_r1Mpc, area_bgr))**2) * N_bgr)

    return
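The "blank image" area trick above is just a boolean circle mask built with numpy.ogrid; a tiny worked example of the same pattern:

import numpy as np

n, cx, cy, r = 10, 5, 5, 3                     # image size, circle centre, radius (pixels)
y, x = np.ogrid[-cy:n - cy, -cx:n - cx]
outside = np.ones((n, n), dtype=bool)
outside[x * x + y * y <= r * r] = False        # circle region set to False
print(outside.sum(), n * n - outside.sum())    # background pixels vs. circle pixels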
Example #11
def background(self):
    ixr = self.ix_radial

    # No background subtraction
    if self.Ngal <= 2:
        self.Ngal_c = self.Ngal
        print(color('Background -- Not enough galaxies found in cluster', 31, 5))
        return

    # Store radially ordered
    r = self.dist2BCG[ixr] * 60.0  # in arcmin
    Lr = self.Lr[ixr]  # We do this in the r-band, as in Reyes et al.

    # Bin the Ngal/Lum data in log spacing
    n = 10
    rbin = mklogarray(0.0, r.max(), n)
    Nbin, rcenter = histo(r, rbin, center='yes')
    Lbin, rcenter = bin_data(r, Lr, rbin, center='yes')

    # Compute the area in each shell
    ir = numpy.indices(rbin.shape)[0]
    ir1 = ir[:-1]
    ir2 = ir[1:]
    r1 = rbin[ir1]
    r2 = rbin[ir2]
    abin = math.pi * (r2**2 - r1**2)
    PN = old_div(Nbin, abin)  # Number Surface density

    # Compute the background median density in both Lum and Ngal,
    # between 4.0 and 9.0 r1Mpc
    R1 = 4.0 * self.r1Mpc * 60.0
    R2 = 9.0 * self.r1Mpc * 60.0
    print("# Estimating Background between R1,R2 %.2f--%2.f[arcmin]" %
          (R1, R2))

    if R2 >= r.max():
        print(color('\tBackground R2 > image limits! -- recomputing', 31, 0))
        R2 = r2.max()
        R1 = R2 - 2.0 * self.r1Mpc * 60.0
    print("# Estimating Background between R1,R2 %.2f--%2.f[arcmin]" %
          (R1, R2))

    PN_bgr = PN[land(rcenter > R1, rcenter < R2)]

    # Get the mean values for the Ngal and Lr profiles, which will
    # be the correction per arcmin^2
    PN_mean = numpy.mean(PN_bgr)
    print('\tmean number of BG galaxies -- {}'.format(PN_mean))

    # Total number in area
    N_bgr = PN_bgr.sum()
    area_bgr = math.pi * (R2**2 - R1**2)

    # Get the correction for the number of galaxies and luminosity.
    # For R200 we need to recompute R200 and N200 based on the new
    # R200 value.
    area_r1Mpc = math.pi * (self.r1Mpc * 60.)**2  # in arcmin2
    self.Ngal_c = self.Ngal - PN_mean * area_r1Mpc
    if self.Ngal_c < 0:
        self.Ngal_c = 0.0

    print('---- test stuff -----')

    print(self.iclose)
    print(self.x_image[self.iclose], self.y_image[self.iclose])


    # print(self.Ngal)
    # print(PN)
    # print(r1)
    # print(rcenter)
    # print(R1,R2)
    # print(r.min(),r.max())
    # print("PN_mean",PN_mean)
    # print(PN_bgr)
    # print(area_r1Mpc)
    # print("Ngal ", self.Ngal)
    # print("Ngal_c", self.Ngal_c)
    #print("r200_c",self.r200_c)
    #print("R200_c",self.R200_c)

    self.d_Ngal_c2 = self.Ngal_c + (
        (old_div(area_r1Mpc, area_bgr))**2) * N_bgr

    # Avoid sqrt of negative number
    if self.d_Ngal_c2 < 0:
        self.d_Ngal_c = 0
    else:
        self.d_Ngal_c = math.sqrt(self.Ngal_c + (
            (old_div(area_r1Mpc, area_bgr))**2) * N_bgr)

    return
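To make the background correction at the end concrete, hypothetical numbers plugged into the same two formulas:

import math

Ngal, PN_mean, area_r1Mpc = 40, 0.5, 20.0   # raw count, bg density [gal/arcmin^2], cluster area [arcmin^2]
area_bgr, N_bgr = 200.0, 100.0              # background area [arcmin^2] and raw background count
Ngal_c = Ngal - PN_mean * area_r1Mpc        # 40 - 10 = 30 background-corrected members
d_Ngal_c = math.sqrt(Ngal_c + (area_r1Mpc / area_bgr) ** 2 * N_bgr)  # sqrt(31) ~ 5.6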