Example #1
def optimize_trials(
    max_exponent, desired_count=3, thread_count=10, high_percentile=95, low_percentile=5, tolerance=0.01
):
    queue = multiprocessing.Queue()
    start_time = time.time()
    for exponent in xrange(1, max_exponent + 1):
        trials = 10 ** exponent
        threads = []
        for t in xrange(thread_count):
            thread = multiprocessing.Process(
                target=get_probability, kwargs={"trials": trials, "desired_heads": desired_count, "queue": queue}
            )
            threads.append(thread)
            thread.start()
        for thread in threads:
            thread.join()

        data = []
        while not queue.empty():
            data.append(queue.get())
        percentile_range = numpy.percentile(data, high_percentile) - numpy.percentile(data, low_percentile)
        if percentile_range <= tolerance:
            return DataTuple(
                probability=numpy.median(data),
                exponent=exponent,
                range=percentile_range,
                elapsed=time.time() - start_time,
            )

    return DataTuple(
        probability=numpy.median(data), exponent=exponent, range=percentile_range, elapsed=time.time() - start_time
    )
Example #2
def TL_from_sample(dat_sample, analysis = 'partition', out_folder = './out_files/'):
    """Obtain the empirical and simulated TL relationship given the output file from sample_var().
    
    Here only the summary statistics are recorded for each study, instead of results from each 
    individual sample, because the analysis can be quickly re-done given the input file, without
    going through the time-limiting step of generating samples from partitions.
    The input dat_sample is in the same format as defined by get_var_sample_file().
    The output file has the following columns: 
    study, empirical b, empirical intercept, empirical R-squared, empirical p-value, mean b, intercept, R-squared from samples, 
    percentage of significant TL in samples (at alpha = 0.05), z-score between empirical and sample b, 2.5 and 97.5 percentile of sample b,
    z-score between empirical and sample intercept, 2.5 and 97.5 percentile of sample intercept.
    
    """
    study_list = sorted(np.unique(dat_sample['study']))
    for study in study_list:
        dat_study = dat_sample[dat_sample['study'] == study]
        emp_b, emp_inter, emp_r, emp_p, emp_std_err = stats.linregress(np.log(dat_study['mean']), np.log(dat_study['var']))
        b_list = []
        inter_list = []
        psig = 0
        R2_list = []
        for i_sim in dat_sample.dtype.names[5:]:
            var_sim = dat_study[i_sim][dat_study[i_sim] > 0] # Omit samples of zero variance 
            mean_list = dat_study['mean'][dat_study[i_sim] > 0]
            sim_b, sim_inter, sim_r, sim_p, sim_std_error = stats.linregress(np.log(mean_list), np.log(var_sim))
            b_list.append(sim_b)
            inter_list.append(sim_inter)
            R2_list.append(sim_r ** 2)
            if sim_p < 0.05: psig += 1
        psig /= float(len(dat_sample.dtype.names[5:]))  # fraction of significant regressions
        out_file = open(out_folder + 'TL_form_' + analysis + '.txt', 'a')
        print>>out_file, study, emp_b, emp_inter, emp_r ** 2, emp_p, np.mean(b_list), np.mean(inter_list), np.mean(R2_list), \
             psig, get_z_score(emp_b, b_list), np.percentile(b_list, 2.5), np.percentile(b_list, 97.5), get_z_score(emp_inter, inter_list), \
             np.percentile(inter_list, 2.5), np.percentile(inter_list, 97.5)
        out_file.close()
Example #3
def signmag_plot(a, b, z, ref):
    imdata1 = np.sign(ref)
    cmap1 = plt.cm.RdBu
    cmap1.set_bad('k', 1)

    imdata2 = np.log10(np.abs(ref))
    cmap2 = plt.cm.YlOrRd
    cmap2.set_bad('k', 1)

    fig, axarr = plt.subplots(ncols=2, figsize=(12, 6))
    axarr[0].pcolormesh(a, b, imdata1, cmap=cmap1, vmin=-1, vmax=1)
    im = axarr[1].pcolormesh(a, b, imdata2, cmap=cmap2,
                             vmin=np.percentile(imdata2,  5),
                             vmax=np.percentile(imdata2, 95))

    for ax in axarr:
        ax.set_xlim((np.min(a), np.max(a)))
        ax.set_ylim((np.min(b), np.max(b)))
        ax.set_xlabel("a")
        ax.set_ylabel("b")
        ax.set(adjustable='box-forced', aspect='equal')

    fig.subplots_adjust(right=0.8)
    cbar_ax = fig.add_axes([0.85, 0.15, 0.03, 0.7])
    fig.colorbar(im, cax=cbar_ax)

    axarr[0].set_title("Sign of hyp1f1")
    axarr[1].set_title("Magnitude of hyp1f1")
    plt.suptitle("z = {:.2e}".format(np.float64(z)))

    return fig
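One plausible way to call signmag_plot, assuming scipy.special.hyp1f1 supplies the reference values and that the module already imports numpy as np and matplotlib.pyplot as plt as used above; the grid ranges are illustrative only:

import numpy as np
import scipy.special as sp

a = np.linspace(-20, 20, 200)
b = np.linspace(0.1, 20, 200)
A, B = np.meshgrid(a, b)
z = 10.0
ref = sp.hyp1f1(A, B, z)          # reference values whose sign/magnitude get plotted
fig = signmag_plot(a, b, z, ref)
fig.savefig("hyp1f1_signmag.png")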
Example #4
def iqr(data):
    return ",".join(
        (
            digits.format(numpy.percentile(data[column], 75) - numpy.percentile(data[column], 25))
            for column in data.columns
        )
    )
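iqr() references a module-level digits format string that is not shown above; an illustrative use, assuming digits is something like "{:.3f}":

import numpy
import pandas

digits = "{:.3f}"   # assumed module-level format string (not shown in the snippet)
data = pandas.DataFrame({"a": numpy.random.randn(1000), "b": numpy.random.rand(1000)})
print(iqr(data))    # e.g. "1.339,0.497" -- one IQR per column, comma-separated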
Example #5
def main(argv):
  map_utilizations = []
  reduce_utilizations = []
  all_utilizations = []
  dirname = argv[0]
  for filename in os.listdir(dirname):
    full_name = os.path.join(dirname, filename)
    if os.path.isfile(full_name) and filename.endswith("job_log"):
      print "Reading %s" % filename
      analyzer = parse_logs.Analyzer(full_name)

      for (id, stage) in analyzer.stages.iteritems():
        for task in stage.tasks:
          for name, block_device_numbers in task.disk_utilization.iteritems():
            if name in ["xvdb", "xvdf"]:
              effective_util = 0
              if block_device_numbers[0] > 0:
                effective_util = (block_device_numbers[1] + block_device_numbers[2]) / block_device_numbers[0]
              all_utilizations.append(effective_util)
              if task.has_fetch:
                reduce_utilizations.append(effective_util)
              else:
                map_utilizations.append(effective_util)

  output_filename = os.path.join(dirname, "disk_utilization_cdf")
  f = open(output_filename, "w")
  for percent in range(100):
    f.write("%s\t%s\t%s\t%s\n" % (percent / 100., numpy.percentile(map_utilizations, percent),
      numpy.percentile(reduce_utilizations, percent),
      numpy.percentile(all_utilizations, percent)))
  f.close()
Example #6
 def bppd_filter(self, images):    
     """
     1. BGR --> HSV
     2. Set minimum saturation equal to the mean saturation
     3. Set minimum value equal to the mean value
     4. Take hues within range from green-yellow to green-blue
     """
     if self.config['VERBOSE']: self.log_msg('BPPD', 'NOTE: Filtering for plants ...')
     if images == []: raise Exception("No input image(s)!")
     a = time.time()
     masks = []
     threshold_min = np.array([self.config['HUE_MIN'], self.config['SAT_MIN'], self.config['VAL_MIN']], np.uint8)
     threshold_max = np.array([self.config['HUE_MAX'], self.config['SAT_MAX'], self.config['VAL_MAX']], np.uint8)
     for bgr in images:
         if bgr is not None:
             try:
                 hsv = cv2.cvtColor(bgr, cv2.COLOR_BGR2HSV)
                 threshold_min[1] = np.percentile(hsv[:,:,1], 100 * self.config['SAT_MIN'] / 255.0)
                 threshold_min[2] = np.percentile(hsv[:,:,2], 100 * self.config['VAL_MIN'] / 255.0)
                 threshold_max[1] = np.percentile(hsv[:,:,1], 100 * self.config['SAT_MAX'] / 255.0)
                 threshold_max[2] = np.percentile(hsv[:,:,2], 100 * self.config['VAL_MAX'] / 255.0)
                 mask = cv2.inRange(hsv, threshold_min, threshold_max)
                 kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE,(self.config['KERNEL_XY'],self.config['KERNEL_XY']))
                 mask = cv2.morphologyEx(mask, cv2.MORPH_OPEN, kernel)
                 masks.append(mask)
                 if self.config['VERBOSE']: self.log_msg('BPPD', 'OK: Mask #%d was successful' % len(masks))                    
             except Exception as error:
                 self.log_msg('BPPD', str(error), important=True)
         else:
             if self.config['VERBOSE']: self.log_msg('BPPD', 'WARN: Mask #%d is blank' % len(masks), important=True)
             masks.append(None)
     b = time.time()
     if self.config['VERBOSE']: self.log_msg('BPPD', '... %.2f ms' % ((b - a) * 1000))
     return masks
Example #7
    def _auto_limits(self):

        if self.component_data is None:
            return

        exclude = (100 - self.percentile) / 2.

        # For subsets in 'data' mode, we want to compute the limits based on
        # the full dataset, not just the subset.
        if isinstance(self.data, Subset):
            data_values = self.data.data[self.component_id]
        else:
            data_values = self.data[self.component_id]

        try:
            lower = np.nanpercentile(data_values, exclude)
            upper = np.nanpercentile(data_values, 100 - exclude)
        except AttributeError:  # Numpy < 1.9
            data_values = data_values[~np.isnan(data_values)]
            lower = np.percentile(data_values, exclude)
            upper = np.percentile(data_values, 100 - exclude)

        if isinstance(self.data, Subset):
            lower = 0

        self.set_limits(lower, upper)
Example #8
    def _determine_cmap_params(self, plot_data, vmin, vmax,
                               cmap, center, robust):
        """Use some heuristics to set good defaults for colorbar and range."""
        calc_data = plot_data.data[~np.isnan(plot_data.data)]
        if vmin is None:
            vmin = np.percentile(calc_data, 2) if robust else calc_data.min()
        if vmax is None:
            vmax = np.percentile(calc_data, 98) if robust else calc_data.max()
        self.vmin, self.vmax = vmin, vmax

        # Choose default colormaps if not provided
        if cmap is None:
            if center is None:
                self.cmap = cm.rocket
            else:
                self.cmap = cm.icefire
        elif isinstance(cmap, string_types):
            self.cmap = mpl.cm.get_cmap(cmap)
        elif isinstance(cmap, list):
            self.cmap = mpl.colors.ListedColormap(cmap)
        else:
            self.cmap = cmap

        # Recenter a divergent colormap
        if center is not None:
            vrange = max(vmax - center, center - vmin)
            normlize = mpl.colors.Normalize(center - vrange, center + vrange)
            cmin, cmax = normlize([vmin, vmax])
            cc = np.linspace(cmin, cmax, 256)
            self.cmap = mpl.colors.ListedColormap(self.cmap(cc))
Example #9
    def plot_wavenvelope(self, ax, w_start, w_end):

        """ This function plots the envelope of the recording.

        :param ax: The axis in which you wish to plot.
        :param w_start: Start of the best window.
        :param w_end: End of the best window.
        """
        window_size = int(0.05 * self._sample_rate)  # 0.05 s (50 ms) envelope window
        w = 1.0 * np.ones(window_size) / window_size
        envelope = (np.sqrt((np.correlate(self._eod ** 2, w, mode='same') -
                    np.correlate(self._eod, w, mode='same') ** 2)).ravel()) * np.sqrt(2.)
        upper_bound = np.max(envelope) + np.percentile(envelope, 1)
        ax.fill_between(self._time[::500], y1=-envelope[::500], y2=envelope[::500], color='purple', alpha=0.5)
        ax.plot((w_start, w_start), (-upper_bound, upper_bound), 'k--', linewidth=2)
        ax.plot((w_end, w_end), (-upper_bound, upper_bound), 'k--', linewidth=2)
        ax.text((w_start + w_end) / 2., upper_bound - np.percentile(envelope, 10), 'Analysis Window',
                rotation='horizontal', horizontalalignment='center', verticalalignment='center', fontsize=14)

        ax.set_ylim(-upper_bound, upper_bound)
        ax.set_xlabel('Time [s]', fontsize=16)
        ax.set_ylabel('Signal Amplitude [au]', fontsize=16)
        ax.tick_params(axis='both', which='major', labelsize=14)

        pass
Example #10
def evaluate(im, algo, gt_illuminant, i, range_thresh, bin_num, dst_folder):
    new_im = None
    start_time = timeit.default_timer()
    if algo=="grayworld":
        new_im = cv2.xphoto.autowbGrayworld(im, 0.95)
    elif algo=="nothing":
        new_im = im
    elif algo=="learning_based":
        new_im = cv2.xphoto.autowbLearningBased(im, None, range_thresh, 0.98, bin_num)
    elif algo=="GT":
        gains = gt_illuminant / min(gt_illuminant)
        g1 = float(1.0 / gains[2])
        g2 = float(1.0 / gains[1])
        g3 = float(1.0 / gains[0])
        new_im = cv2.xphoto.applyChannelGains(im, g1, g2, g3)
    time = 1000*(timeit.default_timer() - start_time) #time in ms

    if len(dst_folder)>0:
        if not os.path.exists(dst_folder):
            os.makedirs(dst_folder)
        im_name = ("%04d_" % i) + algo + ".jpg"
        cv2.imwrite(os.path.join(dst_folder, im_name), stretch_to_8bit(new_im))

    #recover the illuminant from the color balancing result, assuming the standard model:
    estimated_illuminant = [0, 0, 0]
    eps = 0.01
    estimated_illuminant[2] = np.percentile((im[:,:,0] + eps) / (new_im[:,:,0] + eps), 50)
    estimated_illuminant[1] = np.percentile((im[:,:,1] + eps) / (new_im[:,:,1] + eps), 50)
    estimated_illuminant[0] = np.percentile((im[:,:,2] + eps) / (new_im[:,:,2] + eps), 50)

    res = np.arccos(np.dot(gt_illuminant,estimated_illuminant)/
                   (np.linalg.norm(gt_illuminant) * np.linalg.norm(estimated_illuminant)))
    return (time, (res / np.pi) * 180)
Example #11
def get_mean_vmax():
    hostvmaxs = []
    hostvmax25s = []
    hostvmax75s = []
    twentyfifth, fifty, seventyfifth = get_percentile()
    rootdir = "/Users/catherinefielder/Documents/Research_Halos/HaloDetail"
    for subdir, dirs, files in os.walk(rootdir):
        head, tail = os.path.split(subdir)
        haloname = tail
        for file in files:
            if file.endswith("_columnsadded_final"):
                values = ascii.read(
                    os.path.join(subdir, file), format="commented_header"
                )  # Get full path and access file
                hostvmax = values[1]["host_vmax"]
                hostvmaxs = np.append(hostvmaxs, hostvmax)
    twentyfifth = np.percentile(hostvmaxs, 25)
    seventyfifth = np.percentile(hostvmaxs, 75)
    for i in range(0, len(hostvmaxs)):
        if hostvmaxs[i] >= seventyfifth:
            hostvmax75s = np.append(hostvmax75s, hostvmaxs[i])
        elif hostvmaxs[i] < twentyfifth:
            hostvmax25s = np.append(hostvmax25s, hostvmaxs[i])
        else:
            continue
    sumvmax = np.sum(hostvmaxs)
    meanvmax = np.divide(sumvmax, len(hostvmaxs))
    mean75 = np.mean(hostvmax75s)
    mean25 = np.mean(hostvmax25s)
    print "mean"
    print meanvmax
    print mean75
    print mean25
    return meanvmax, mean75, mean25
Example #12
def plotKineticsScatter(kinArr, outputFileName):

    handles = []
    colors = ['red', 'green', 'blue', 'magenta']
    bases = ['A', 'C', 'G', 'T']

    fig, ax = _createFigTemplate(dims=(10, 8))

    for i in xrange(4):
        baseHits = kinArr[kinArr['base'] == bases[i]]

        if baseHits.shape[0] > 0:
            # Add a bit of scatter to avoid ugly aliasing in plot due to
            # integer quantization
            cov = baseHits['coverage'] + 0.25 * \
                np.random.randn(baseHits.shape[0])
            score = baseHits['score'] + 0.25 * \
                np.random.randn(baseHits.shape[0])

            pl = ax.scatter(cov, score, c=colors[i], label=bases[
                            i], lw=0, alpha=0.3, s=12)
            handles.append(pl)

    ax.set_xlabel('Per-Strand Coverage')
    ax.set_ylabel('Modification QV')
    plt.legend(handles, bases, loc='upper left')

    if kinArr.shape[0] > 0:
        ax.set_xlim(0, np.percentile(kinArr['coverage'], 95.0) * 1.4)
        ax.set_ylim(0, np.percentile(kinArr['score'], 99.9) * 1.3)

    fig.savefig(outputFileName, dpi=72)
    plt.close(fig)
Example #13
 def updateStats(self):
     if self.current_layer is not None:
         current_attribute = self.dlg.getCurrentAttribute()
         if current_attribute >= 0:
             attribute = self.layer_attributes[current_attribute]
             # check if stats have been calculated before
             idx = self.checkValuesAvailable(attribute)
             if idx == -1:
                 self.retrieveAttributeValues(attribute)
                 idx = len(self.attribute_statistics)-1
             stats = self.attribute_statistics[idx]
             # calculate stats of selected objects only
             select_stats = dict()
             if self.current_layer.selectedFeatureCount() > 0:
                 self.selection_values, self.selection_ids = uf.getFieldValues(self.current_layer, attribute['name'], null=False, selection=True)
                 sel_values = [val for val in self.selection_values if val != NULL]
                 select_stats['Number'] = len(sel_values)
                 select_stats['Mean'] = uf.truncateNumber(np.mean(sel_values))
                 select_stats['Std Dev'] = uf.truncateNumber(np.std(sel_values))
                 select_stats['Variance'] = uf.truncateNumber(np.var(sel_values))
                 select_stats['Median'] = uf.truncateNumber(np.median(sel_values))
                 select_stats['Minimum'] = np.min(sel_values)
                 select_stats['Maximum'] = np.max(sel_values)
                 select_stats['Range'] = uf.truncateNumber(select_stats['Maximum']-select_stats['Minimum'])
                 select_stats['1st Quart'] = uf.truncateNumber(np.percentile(sel_values,25))
                 select_stats['3rd Quart'] = uf.truncateNumber(np.percentile(sel_values,75))
                 select_stats['IQR'] = uf.truncateNumber(select_stats['3rd Quart']-select_stats['1st Quart'])
                 select_stats['Gini'] = uf.roundNumber(uf.calcGini(sel_values))
             else:
                 self.selection_values = []
                 self.selection_ids = []
             # update the dialog
             self.dlg.setStats(stats, select_stats)
Example #14
File: math.py Project: scholi/pySPM
def stat_info(data):
    import matplotlib.pyplot as plt
    D = np.ravel(data)
    U = np.unique(D)
    if len(U)>1:
        sep = np.min(U[1:]-U[:-1])
        N = min(100, int(np.ceil((np.max(D)-np.min(D))/sep)))
    else:
        N = 1
    
    mean = np.mean(D)
    std = np.std(D)
    
    fig, ax = plt.subplots(2,1,figsize=(21,4))
    ax[0].boxplot(D, 0, 'ro', 0);
    ax[1].hist(D, N, density=True);
    ax[1].axvline(mean, color='r', label='mean')
    ax[1].axvline(mean+std, color='r', linestyle='--', label='1$\\sigma$')
    ax[1].axvline(mean-std, color='r', linestyle='--', label='1$\\sigma$')
    if mean-2*std >= U[0]:
        ax[1].axvline(mean-2*std, color='r', linestyle=':', label='2$\\sigma$')
    if mean+2*std <= U[-1]:
        ax[1].axvline(mean+2*std, color='r', linestyle=':', label='2$\\sigma$')
    ax[1].legend();
    print("Stats")
    print("\tAverage:", mean)
    print("\tStandard-deviation:", std)
    print("\tMinimum:", np.min(D))
    print("\tQ1:", np.percentile(D, 25))
    print("\tMedian:", np.percentile(D, 50))
    print("\tQ3:", np.percentile(D, 75))
    print("\tMaximum:", np.max(D))
Example #15
def getMed(x):
    if len(x) == 0:
        x = np.array([0])
    median = np.percentile(x, 50)
    sigma_min = median - np.percentile(x, 16)
    sigma_max = np.percentile(x, 84) - median
    return median, sigma_min, sigma_max
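Typical use: summarize a sample by its median and the asymmetric 16th/84th-percentile offsets (roughly ±1 sigma for a Gaussian); the data here is illustrative:

import numpy as np

samples = np.random.lognormal(mean=0.0, sigma=0.5, size=10000)
median, sigma_min, sigma_max = getMed(samples)
print("median = {:.3f} (-{:.3f} / +{:.3f})".format(median, sigma_min, sigma_max))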
Example #16
def _plot_distribution(z, lat, lev, fig, ax, figpath, titlestr, xstr, xl=None,
                       xu=None, bins=None):
    """Plots a stack of histograms of log10(data) at all levels"""
    # Initialize the bins and the frequency
    num_bins = 100
    if bins is None:
        bins = np.linspace(np.percentile(z, .02), np.percentile(z, 99.98),
                           num_bins+1)
    n = np.zeros((num_bins, lev.size))
    # Calculate distribution at each level
    for i in range(lev.size):
        n[:, i], _ = np.histogram(z[:, i], bins=bins)
    # Take a logarithm and deal with case where we take log of 0
    n = np.log10(n)
    n_small = np.amin(n[np.isfinite(n)])
    n[np.isinf(n)] = n_small
    # Plot histogram
    ca = ax.contourf(bins[:-1], lev, n.T)
    ax.set_ylim(1, 0)
    if xl is not None:
        ax.set_xlim(xl, xu)
    plt.colorbar(ca, ax=ax)
    ax.set_xlabel(xstr)
    ax.set_ylabel(r'$\sigma$')
    ax.set_title(titlestr)
    xl, xr = ax.set_xlim()
    return xl, xr, bins
Example #17
    def handle_data(self):
        current_time = self.current_datetime
        try:
            location = self.date_index.get_loc(current_time)
        except KeyError:
            return

        if location >= 99:
            histories = self.signals.factor[location-99:location]
            current_level = histories[-1]
            upper = np.percentile(histories, 95)
            lower = np.percentile(histories, 5)

            mid_upper = np.percentile(histories, 75)
            mid_lower = np.percentile(histories, 25)

            if current_level > upper:
                self.order_to('ru.xsge', 1, 1)
            elif current_level < lower:
                self.order_to('ru.xsge', -1, 1)
            #elif mid_lower < current_level < mid_upper:
            #    self.order_to('ru.cffex', 1, 0)

            self.keep('factor', current_level)
            self.keep('factor (95%)', upper)
            self.keep('factor (5%)', lower)
            self.keep('factor (75%)', mid_upper)
            self.keep('factor (25%)', mid_lower)
            self.keep('ru.xsge', self.close['ru.xsge'])
        else:
            return
Example #18
    def meanPlot(self, scale, xIndex=0, yIndex=1):
        i = 0
        nxti = 0
        num = 0
        sumTimes = 0
        numItems = 0
        setNxt = False
        x = []
        y = []
        error = [[], []]
        tmp = []
        while i < len(self.log[xIndex]):
            if self.log[xIndex][i] > (num + 1) * scale:
                if numItems != 0:
                    x.append(num * scale)
                    y.append(np.percentile(tmp, 50))
                    error[0].append(np.percentile(tmp, 25))
                    error[1].append(np.percentile(tmp, 75))
                    i = nxti
                    num += 1
                    tmp = []
                    numItems = 0
                    setNxt = False
            if self.log[xIndex][i] >= (num - 1) * scale:
                tmp.append(self.log[yIndex][i])
                numItems += 1
                if not setNxt:
                    setNxt = True
                    nxti = i
            i += 1
        c = plt.plot(x, y, zorder=10)[0].get_color()
        plt.fill_between(x, error[0], error[1], color=c, alpha=0.25, zorder=0)

        plt.show(block=False)
Example #19
def viz_docwordfreq_sidebyside(P1, P2, title1='', title2='',
                                vmax=None, aspect=None, block=False):
  from matplotlib import pylab
  pylab.figure()

  if vmax is None:
    vmax = 1.0
    P1limit = np.percentile(P1.flatten(), 97)
    if P2 is not None:
      P2limit = np.percentile(P2.flatten(), 97)
    else:
      P2limit = P1limit
    while vmax > P1limit and vmax > P2limit:
      vmax = 0.8 * vmax

  if aspect is None:
    aspect = float(P1.shape[1])/P1.shape[0]
  pylab.subplot(1, 2, 1)
  pylab.imshow(P1, aspect=aspect, interpolation='nearest', vmin=0, vmax=vmax)
  if len(title1) > 0:
    pylab.title(title1)
  if P2 is not None:
    pylab.subplot(1, 2, 2)
    pylab.imshow(P2, aspect=aspect, interpolation='nearest', vmin=0, vmax=vmax)
    if len(title2) > 0:
      pylab.title(title2)
  pylab.show(block=block)
Example #20
def BootstrapSc(Method, Data, n=10000):
    """
    Bootstrap the calculation of the best fit Sc value n times to get the 95%
    confidence interval for the best fit Sc.

    Values of n larger than 10000 will take a long time to run.
    """
    tmp = []
    # need to convert the LH,R,CHT data into a serial 1D array before bootstrapping
    if Method == "raw":
        for i in range(len(Data[0])):
            tmp.append(SerializeData(Data[2][i], Data[4][i], Data[3][i]))
    if Method == "patches":
        for i in range(len(Data[0])):
            tmp.append(SerializeData(Data[2][i], Data[10][i], Data[6][i]))
    if Method == "basins":
        for i in range(len(Data[0])):
            tmp.append(SerializeData(Data[5][i], Data[7][i], Data[6][i]))

    ToSample = np.array(tmp)

    Scs = []
    i = 0
    while i < n:
        print i

        sample = np.random.choice(ToSample, len(ToSample), replace=True)
        LH, R, CHT = UnserializeList(sample)
        sc, _, _, _, _ = optimize.leastsq(Residuals, 0.8, args=(R, LH, CHT), full_output=True)
        if sc < 2.0:
            Scs.append(sc[0])
            i += 1

    #        mean          upper bound                               lower bound
    return np.mean(Scs), np.percentile(Scs, 97.5) - np.mean(Scs), np.mean(Scs) - np.percentile(Scs, 2.5)
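BootstrapSc assumes SerializeData and UnserializeList helpers that pack each (LH, R, CHT) triplet into a single element so np.random.choice can resample whole triplets; one plausible sketch of those helpers (an assumption, not the original project code):

import numpy as np

def SerializeData(LH, R, CHT):
    # Pack the three values into one string element so a triplet is sampled as a unit.
    return "{0} {1} {2}".format(LH, R, CHT)

def UnserializeList(sample):
    # Unpack an array of packed strings back into three float arrays.
    values = np.array([s.split() for s in sample], dtype=float)
    return values[:, 0], values[:, 1], values[:, 2]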
Example #21
def init_model(dataset, metadata, model_path, surprise_depth, experiment):
	#Initialise the VAE from the given file.
	dataset_changed = False
	print "Initalising a VAE from the model file at",model_path+"."
	#model = globals()[metadata['model_class']](dataset, "data/",selected_hypers=metadata["experiments"][experiment]["best_hypers"])
	model = globals()[metadata['model_class']](dataset, "data/",selected_hypers=metadata["best_hypers"])
	if "monary_type" in metadata.keys():
		model.set_monary_type(metadata["monary_type"])
	model.load(model_path)
	model.init_model_functions()
	model.metadata = metadata
	conditional_dists_file = model_path[:-4]+"_surpdist_"+str(surprise_depth)+".csv"
	metadata["experiments"][experiment]["surprise_distribution"] = model.precalculate_conditional_dists(from_file=True,file_path=conditional_dists_file, depth=surprise_depth)
	if any(k not in metadata["experiments"][experiment].keys() for k in ["plausibility_distribution","errors_by_length","hidden_rep_averages"]):
		print "Generating distribution over plausibility for each design in the dataset."
		plausibilities, errors_by_length,hidden_rep_averages = model.get_dataset_errors(metadata, return_averages_by_length=True, return_hidden_rep_averages=True)
		print "plausibilities.shape",plausibilities.shape
		plaus_dist = {}
		plaus_dist["min"] = float(np.amin(plausibilities))
		plaus_dist["max"] = float(np.amax(plausibilities))
		plaus_dist["5%"] = float(np.percentile(plausibilities, 5))
		plaus_dist["95%"] = float(np.percentile(plausibilities, 95))
		plaus_dist["mean"] = float(np.average(plausibilities))
		print "plaus_dist",plaus_dist
		metadata["experiments"][experiment]["plausibility_distribution"] = plaus_dist
		metadata["experiments"][experiment]["errors_by_length"] = errors_by_length
		metadata["experiments"][experiment]["hidden_rep_averages"] = hidden_rep_averages
		dataset_changed = True
	if dataset_changed:
		print "Saving updates to the",dataset,"dataset entry."
		client = pymongo.MongoClient()
		db = client.creeval
		db.datasets.save(metadata)
	return model
Example #22
    def test_random_posterior(self):
        ndraws = 100000
        ssqr_draws = np.empty(ndraws)
        for i in xrange(ndraws):
            ssqr_draws[i] = self.sigsqr.random_posterior()

        nu = self.sigsqr.nu
        prior_ssqr = self.sigsqr.lamb

        post_dof = nu + len(self.y)
        post_ssqr = (nu * prior_ssqr + self.y.size * np.var(self.sigsqr.bart_step.resids)) / post_dof

        igam_shape = post_dof / 2.0
        igam_scale = post_dof * post_ssqr / 2.0
        igamma = stats.distributions.invgamma(igam_shape, scale=igam_scale)

        # test draws from conditional posterior by comparing 1st and 2nd moments to true values
        true_mean = igamma.moment(1)
        frac_diff = np.abs(true_mean - ssqr_draws.mean()) / true_mean
        rpmsg = "Fractional difference in mean from BartVariance.random_posterior() is greater than 2%"
        self.assertLess(frac_diff, 0.02, msg=rpmsg)

        true_ssqr = igamma.moment(2)
        frac_diff = np.abs(true_ssqr - (ssqr_draws.var() + ssqr_draws.mean() ** 2)) / true_ssqr
        rpmsg = "Fractional difference in 2nd moment from BartVariance.random_posterior() is greater than 2%"
        self.assertLess(frac_diff, 0.02, msg=rpmsg)

        # make sure gibbs sampler constrains the correct value
        ssqr_low = np.percentile(ssqr_draws, 1.0)
        ssqr_high = np.percentile(ssqr_draws, 99.0)
        rpmsg = "Value of Variance parameter returned by Gibbs sampler is outside of 99% credibility interval."
        self.assertGreater(self.true_sigsqr, ssqr_low, msg=rpmsg)
        self.assertLess(self.true_sigsqr, ssqr_high, msg=rpmsg)
Example #23
def show_bootstrap_statistics(clf, X, y, features):
    num_features = len(features)

    coefs = []
    for i in range(num_features):
        coefs.append([])

    for _ in range(BOOTSTRAP_ITERATIONS):
        X_sample, y_sample = resample(X, y)
        clf.fit(X_sample, y_sample)
        for i, c in enumerate(get_normalized_coefs(clf)):
            coefs[i].append(c)

    poi_index = features.index('POI')
    building_index = features.index('Building')
    coefs[building_index] = coefs[poi_index]

    intervals = []

    print()
    print('***** Bootstrap statistics *****')
    print('{:<20}{:<20}{:<10}{:<10}'.format('Feature', '95% interval', 't-value', 'Pr(>|t|)'))
    print()
    for i, cs in enumerate(coefs):
        values = np.array(cs)
        lo = np.percentile(values, 2.5)
        hi = np.percentile(values, 97.5)
        interval = '({:.3f}, {:.3f})'.format(lo, hi)
        tv = np.mean(values) / np.std(values)
        pr = (1.0 - t.cdf(x=abs(tv), df=len(values))) * 2.0  # two-sided Pr(>|t|)

        stv = '{:.3f}'.format(tv)
        spr = '{:.3f}'.format(pr)
        print('{:<20}{:<20}{:<10}{:<10}'.format(features[i], interval, stv, spr))
Example #24
    def test_quantile(self):
        from numpy import percentile

        q = self.ts.quantile(0.1)
        self.assertEqual(q, percentile(self.ts.valid(), 10))

        q = self.ts.quantile(0.9)
        self.assertEqual(q, percentile(self.ts.valid(), 90))

        # object dtype
        q = Series(self.ts, dtype=object).quantile(0.9)
        self.assertEqual(q, percentile(self.ts.valid(), 90))

        # datetime64[ns] dtype
        dts = self.ts.index.to_series()
        q = dts.quantile(.2)
        self.assertEqual(q, Timestamp('2000-01-10 19:12:00'))

        # timedelta64[ns] dtype
        tds = dts.diff()
        q = tds.quantile(.25)
        self.assertEqual(q, pd.to_timedelta('24:00:00'))

        # GH7661
        result = Series([np.timedelta64('NaT')]).sum()
        self.assertTrue(result is pd.NaT)

        msg = 'percentiles should all be in the interval \\[0, 1\\]'
        for invalid in [-1, 2, [0.5, -1], [0.5, 2]]:
            with tm.assertRaisesRegexp(ValueError, msg):
                self.ts.quantile(invalid)
Example #25
def ampDiffStats(ampIm1, ampIm2, osIm1, osIm2, exptime=0.0):
    stats = np.zeros(shape=(1,),
                     dtype=statDtype)

    a_i = 0
    _s1 = np.median(ampIm1) - np.median(osIm1)
    _s2 = np.median(ampIm2) - np.median(osIm2)
    stats[a_i]['signal'] = signal = (_s1 + _s2)/2
    stats[a_i]['npix'] = ampIm1.size
    stats[a_i]['sqrtSig'] = np.sqrt(signal)
    stats[a_i]['bias'] = (np.median(osIm1) + np.median(osIm2))/2

    ampIm = ampIm2.astype('f4') - ampIm1
    osIm = osIm2.astype('f4') - osIm1

    sig1 = (0.741/np.sqrt(2)) * np.subtract.reduce(np.percentile(ampIm, [75,25]))
    sig2 = (0.741/np.sqrt(2)) * np.subtract.reduce(np.percentile(osIm, [75,25]))
    _, trusig1, _ = geom.clippedStats(ampIm) / np.sqrt(2)
    _, trusig2, _ = geom.clippedStats(osIm) / np.sqrt(2)

    stats[a_i]['readnoise'] = sig2
    stats[a_i]['readnoiseM'] = trusig2

    stats[a_i]['shotnoise'] = sig = np.sqrt(np.abs(sig1**2 - sig2**2))
    stats[a_i]['shotnoiseM'] = trusig = np.sqrt(np.abs(trusig1**2 - trusig2**2))

    stats[a_i]['gain'] = gain = signal/sig**2
    stats[a_i]['gainM'] = signal/trusig**2
    stats[a_i]['noise'] = sig2*gain
    stats[a_i]['flux'] = signal/exptime if exptime != 0 else 0.0

    return stats, ampIm, osIm
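The 0.741 factor above is the usual conversion from an interquartile range to a Gaussian sigma estimate (1/1.349 ≈ 0.741); a quick illustrative check:

import numpy as np

x = np.random.normal(scale=3.0, size=200000)
iqr = np.subtract.reduce(np.percentile(x, [75, 25]))  # p75 - p25
print(0.741 * iqr, x.std())  # both should be close to 3.0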
Example #26
def get_stat_function(statistics, perc=None):
    # Define personalized functions for binned_statistics
    if (statistics == 'mean') | (statistics == 'median'):
        stat_func = statistics
    elif statistics == 'std':
        stat_func = np.std
    elif statistics == 'mse':
        stat_func = lambda x: np.mean(x**2)
    elif statistics == 'frac':
        # stat_func = lambda x: 100.0*np.abs(np.mean(x))/(np.abs(np.mean(x)) + np.std(x)) Wrong decomposition
        stat_func = lambda x: np.sign(np.mean(x))*100.0*np.mean(x)**2/(np.mean(x)**2 + np.std(x)**2)
    elif statistics == 'cv':
        stat_func_ratio = lambda x: np.std(x)/np.mean(x)
        stat_func_diff = lambda x: np.std(x) - np.abs(np.mean(x)) # To compute the CV for an already multiplicative variable (GD)
    elif statistics == 'iqr':
        stat_func = lambda x: np.percentile(x,75) - np.percentile(x,25)
    elif statistics == 'percentile':
        if perc is None:
            print('No percentile given; using 50 by default.')
            perc = 50
        stat_func = lambda x: np.percentile(x, perc)
    else:
        print('Wrong statistics asked:', statistics)
        sys.exit(1)
    return(stat_func)
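The returned callable plugs directly into scipy.stats.binned_statistic; an illustrative example with synthetic data, assuming numpy is imported as np in the module as the lambdas expect:

import numpy as np
from scipy.stats import binned_statistic

x = np.random.uniform(0, 10, 5000)
y = np.sin(x) + np.random.normal(scale=0.3, size=x.size)
stat_func = get_stat_function('iqr')
iqr_per_bin, bin_edges, _ = binned_statistic(x, y, statistic=stat_func, bins=20)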
Example #27
def descriptive_stats(array, verbose=True, label='', mean=False, plot=False):
    """ Simple statistics from vector.
    """
    if mean:
        mean_ = np.mean(array)
    median = np.median(array)
    mini = np.min(array)
    maxi = np.max(array)
    first_qu = np.percentile(array, 25)
    third_qu = np.percentile(array, 75)
    
    if verbose:
        if mean:
            label += 'min={:.1f} / 1st QU={:.1f} / ave={:.1f} / med={:.1f} / '
            label += '3rd QU={:.1f} / max={:.1f}'
            print(label.format(mini, first_qu, mean_, median, third_qu, maxi))
        else:
            label += 'min={:.1f} / 1st QU={:.1f} / med={:.1f} / 3rd QU={:.1f} '
            label += '/ max={:.1f}'
            print(label.format(mini, first_qu, median, third_qu, maxi))
    
    if plot:
        boxplot(array, vert=False, meanline=mean, showfliers=True, sym='.')
    
    if mean:
        return mini, first_qu, mean_, median, third_qu, maxi
    else:
        return mini, first_qu, median, third_qu, maxi
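An illustrative call (with mean=True the mean is computed, printed, and returned as well); numpy is assumed to be imported as np in the module:

import numpy as np

values = np.random.gamma(shape=2.0, scale=3.0, size=2000)
mini, first_qu, mean_, median, third_qu, maxi = descriptive_stats(values, label='flux: ', mean=True)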
Example #28
	def write_parameters_outputvalues(self, P):		


		Mstar, SFR_opt, _ = model.stellar_info_array(self.chain.flatchain_sorted, self.data, self.out['realizations2int'])
		column_names = np.transpose(np.array(["P025","P16","P50","P84","P975"], dtype='|S3'))
		chain_pars = np.column_stack((self.chain.flatchain_sorted, Mstar, SFR_opt))		
											# np.mean(chain_pars, axis[0]),
											# np.std(chain_pars, axis[0]),
		if self.out['calc_intlum']:			


			SFR_IR = model.sfr_IR(self.int_lums[0]) #check that ['intlum_names'][0] is always L_IR(8-100)
			
			chain_others =np.column_stack((self.int_lums.T, SFR_IR))
			outputvalues = np.column_stack((np.transpose(map(lambda v: (v[0],v[1],v[2],v[3],v[4]), zip(*np.percentile(chain_pars, [2.5,16, 50, 84,97.5], axis=0)))),
											np.transpose(map(lambda v: (v[0],v[1],v[2],v[3],v[4]), zip(*np.percentile(chain_others, [2.5,16, 50, 84,97.5], axis=0))))											)) 


	
			outputvalues_header= ' '.join([ i for i in np.hstack((P.names, 'Mstar', 'SFR_opt', self.out['intlum_names'], 'SFR_IR',))] )

		else:
			outputvalues = np.column_stack((map(lambda v: (v[1], v[2]-v[1], v[1]-v[0]), zip(*np.percentile(chain_pars, [16, 50, 84],  axis=0))))) 
			outputvalues_header=' '.join( [ i for i in P.names] )
		return outputvalues, outputvalues_header
Example #29
 def arrivals(self, stories, state=6):
     ''' Chart a plot point for every arrival time in state
     '''
     arrivals = self.release.kanban().state_arrival_interval(state)
     dates = [a['date'] for a in arrivals]
     arrivals = [round(a['interval']/60./60., 1) for a in arrivals]
     average = numpy.median([arrivals])
     std = numpy.std([arrivals])
     iql = numpy.percentile([arrivals], 25)
     iqh = numpy.percentile([arrivals], 75)
     nsul = []
     nsuw = []
     nsll = []
     nslw = []
     avg = []
     for x in arrivals:
         nsul.append(average + (iqh * 3))
         nsuw.append(average + (iqh * 2))
         nslw.append(average - (iql * 2))
         nsll.append(average - (iql * 3))
         avg.append(average)
     pyplot.plot(dates, arrivals, '*', color='g')
     pyplot.plot(dates, nsul, 'o', linestyle='-', color='r')
     pyplot.plot(dates, nsuw, '.', linestyle=':', color='y')
     pyplot.plot(dates, nslw, '.', linestyle=':', color='y')
     pyplot.plot(dates, nsll, 'o', linestyle='-', color='r')
     pyplot.plot(dates, avg, '',linestyle='-.',  markerfacecolor='None')
     pyplot.show(block=False)
Example #30
    def test_random_posterior(self):
        # first get values of mu drawn from their conditional posterior
        ndraws = 100000
        nleaves = len(self.mu.value)
        mu_draws = np.empty((ndraws, nleaves))
        for i in xrange(ndraws):
            mu_draws[i, :] = self.mu.random_posterior()

        l_idx = 0
        for leaf in self.mu.treeparam.value.terminalNodes:
            ny = leaf.npts
            ybar = leaf.ybar
            post_var = 1.0 / (1.0 / self.mu.prior_var + ny / self.mu.sigsqr.value)
            post_mean = post_var * (self.mu.mubar / self.mu.prior_var + ny * ybar / self.mu.sigsqr.value)

            # test draws from conditional posterior by comparing 1st and 2nd moments to true values
            zscore = np.abs((post_mean - mu_draws[:, l_idx].mean())) / np.sqrt(post_var / ndraws)
            rpmsg = "Sample mean from BartMeanParameter.random_posterior() differs by more than 3-sigma."
            self.assertLess(zscore, 3.0, msg=rpmsg)

            frac_diff = np.abs(np.sqrt(post_var) - mu_draws[:, l_idx].std()) / np.sqrt(post_var)
            rpmsg = "Fractional difference in standard deviation from BartMeanParameter.random_posterior() is greater" \
                + " than 2%"
            self.assertLess(frac_diff, 0.02, msg=rpmsg)

            # make sure gibbs sampler constrains the correct value
            mu_low = np.percentile(mu_draws[:, l_idx], 1.0)
            mu_high = np.percentile(mu_draws[:, l_idx], 99.0)
            rpmsg = "Value of Terminal Node output parameter returned by Gibbs sampler is outside of 99% credibility" \
                + " interval.\n Violated: " + str(mu_low) + ' < ' + str(self.true_mu[l_idx]) + ' < ' + str(mu_high)
            self.assertGreater(self.true_mu[l_idx], mu_low, msg=rpmsg)
            self.assertLess(self.true_mu[l_idx], mu_high, msg=rpmsg)

            l_idx += 1
Example #31
def run(data_path, str_class, algs=["DT", "MLP", "RF"]):

    data = pd.read_csv(data_path)
    folds = 10
    rep = 1

    X = data.drop([str_class], axis=1).values
    y = data[str_class].values

    perceltil_inf = [0, 1.5, 2.5, 3.5] + [5 * i for i in range(1, 7)]
    perceltil_sup = [(100 - perceltil_inf[i])
                     for i in range(len(perceltil_inf))]
    range_high_TG = [
        np.percentile(y, perceltil_sup[i]) for i in range(len(perceltil_sup))
    ]
    # range_high_TG.reverse()
    range_low_TG = [
        np.percentile(y, perceltil_inf[i]) for i in range(len(perceltil_inf))
    ]

    range_high_TG = np.round(range_high_TG, 2)
    range_low_TG = np.round(range_low_TG, 2)

    print(range_low_TG)
    print(perceltil_inf)
    print(range_high_TG)
    print(perceltil_sup)

    dic_oracle = {}
    data = []
    for low, low_perc in zip(range_low_TG, perceltil_inf):
        for high, high_perc in zip(range_high_TG, perceltil_sup):
            for alg_low in algs:
                for alg_middle in algs:
                    for alg_high in algs:
                        line = []
                        line = [
                            low_perc, 100 - (low_perc + 100 - high_perc),
                            100 - high_perc
                        ]

                        for i in [alg_low, alg_middle, alg_high]:
                            line = line + [
                                1 if i == "MLP" else 0, 1 if i == "RF" else 0,
                                1 if i == "DT" else 0
                            ]
                        res = oracle(low, high, alg_low, alg_middle, alg_high)
                        line = line + res.tolist()
                        dic_oracle["{0}-{1}_{2}_{3}_{4}".format(
                            low, high, alg_low, alg_middle, alg_high)] = line
                        data.append(line)

    cols_name = [
        "S", "M", "E", "S_MLP", "S_RF", "S_DT", "M_MLP", "M_RF", "M_DT",
        "E_MLP", "E_RF", "E_DT", "Global_mean_MAE", "Global_mean_MSE",
        "Global_mean_R2_S", "Global_mean_RRMSE", "Global_mean_RMSE",
        "Global_mean_MARE", "Global_mean_R2", "Global_sd_MAE", "Global_sd_MSE",
        "Global_sd_R2_S", "Global_sd_RRMSE", "Global_sd_RMSE",
        "Global_sd_MARE", "Global_sd_R2", "Local_S_mean_MAE",
        "Local_S_mean_MSE", "Local_S_mean_R2_S", "Local_S_mean_RRMSE",
        "Local_S_mean_RMSE", "Local_S_mean_MARE", "Local_S_mean_R2",
        "Local_S_sd_MAE", "Local_S_sd_MSE", "Local_S_sd_R2_S",
        "Local_S_sd_RRMSE", "Local_S_sd_RMSE", "Local_S_sd_MARE",
        "Local_S_sd_R2", "Local_M_mean_MAE", "Local_M_mean_MSE",
        "Local_M_mean_R2_S", "Local_M_mean_RRMSE", "Local_M_mean_RMSE",
        "Local_M_mean_MARE", "Local_M_mean_R2", "Local_M_sd_MAE",
        "Local_M_sd_MSE", "Local_M_sd_R2_S", "Local_M_sd_RRMSE",
        "Local_M_sd_RMSE", "Local_M_sd_MARE", "Local_M_sd_R2",
        "Local_E_mean_MAE", "Local_E_mean_MSE", "Local_E_mean_R2_S",
        "Local_E_mean_RRMSE", "Local_E_mean_RMSE", "Local_E_mean_MARE",
        "Local_E_mean_R2", "Local_E_sd_MAE", "Local_E_sd_MSE",
        "Local_E_sd_R2_S", "Local_E_sd_RRMSE", "Local_E_sd_RMSE",
        "Local_E_sd_MARE", "Local_E_sd_R2"
    ]
    df = pd.DataFrame(data, columns=cols_name)
    df.to_csv('../result/evaluating_range/ranges2.csv')

    return df
Example #32
def combine_flat(
    files,
    instrument,
    mode,
    extension=None,
    bhead=None,
    bias=None,
    plot=False,
    plot_title=None,
    bias_scaling="number_of_files",
    **kwargs,
):
    """
    Combine several flat files into one master flat

    Parameters
    ----------
    files : list(str)
        flat files
    instrument : str
        instrument mode for modinfo
    extension: {int, str}, optional
        fits extension to use (default: 1)
    bias: array(int, float), optional
        bias image to subtract from master flat (default: 0)

    xr: 2-tuple(int), optional
        x range to use (default: None, i.e. whole image)
    yr: 2-tuple(int), optional
        y range to use (default: None, i.e. whole image)
    dtype : np.dtype, optional
        datatype of the combined flat frame (default: float32)
    Returns
    -------
    flat, fhead
        image and header of master flat
    """
    flat, fhead = combine_frames(files, instrument, mode, extension, **kwargs)
    # Subtract master dark
    # TODO: Why do we scale with number of files and not exposure time?
    if bias is not None:
        if bias_scaling == "number_of_files":
            flat -= bias * len(files)
        elif bias_scaling == "exposure_time":
            flat -= bias * fhead["exptime"] / bhead["exptime"]
        elif bias_scaling == "mean":
            flat -= bias * np.ma.mean(flat) / np.ma.mean(bias)
        elif bias_scaling == "median":
            flat -= bias * np.ma.median(flat) / np.ma.median(bias)
        else:
            raise ValueError(
                "Unexpected value for 'bias_scaling', expected one of ['number_of_files', 'exposure_time'], but got %s"
                % bias_scaling)

    if plot:  # pragma: no cover
        title = "Master Flat"
        if plot_title is not None:
            title = f"{plot_title}\n{title}"
        plt.title(title)
        plt.xlabel("x [pixel]")
        plt.ylabel("y [pixel]")
        bot, top = np.percentile(flat, (10, 90))
        plt.imshow(flat, vmin=bot, vmax=top, origin="lower")
        plt.show()

    return flat, fhead
Example #33
    print('Too many labels: ', too_many_labels)
    print('no_labelled_data: ', no_labelled_data)

tpl = [(bidirectional_flow_lengths, '_bidir.csv'),
       (forward_flow_lengths, '_fwd.csv'), (backward_flow_lengths, '_bwd.csv')]
for flow_lengths, output_ext in tpl:
    percentile = np.array([
        'Label', 'num_flows', 'Min', '20-th', '50-th', '90-th percentile',
        '95-th percentile', '99-th percentile', '99.9-th percentile',
        '100-th percentile'
    ])
    percentile = percentile.reshape((1, -1))

    for label in label_names:
        flow_lengths_for_label = flow_lengths[label]
        print("{:40s}-->{:10d}".format(label, len(flow_lengths_for_label)))

        if len(flow_lengths_for_label) < 1:
            continue
        flow_lengths_for_label = np.array(flow_lengths_for_label)

        row = np.array([label,len(flow_lengths_for_label),np.min(flow_lengths_for_label),\
           np.percentile(flow_lengths_for_label,20),np.percentile(flow_lengths_for_label,50),\
           np.percentile(flow_lengths_for_label,90),np.percentile(flow_lengths_for_label,95),np.percentile(flow_lengths_for_label,99),\
           np.percentile(flow_lengths_for_label,99.9), np.percentile(flow_lengths_for_label,100)])
        percentile = np.concatenate((percentile, row.reshape((1, -1))), axis=0)
        np.savetxt(output_filename.replace('.csv', output_ext),
                   percentile,
                   fmt='%s',
                   delimiter=',')
Example #34
@author: rian-van-den-ander
"""

import numpy as np
import pandas as pd

dataset = pd.read_csv('personality_data.csv', header=0, sep='\t')
"""
Data cleansing
----------
"""

dataset = dataset.dropna()  #drop any null data
dataset = dataset[dataset.age < 100]  # Removing bogus age
dataset = dataset[dataset.gender.isin([1, 2])]  # removing non specific genders
dataset = dataset[dataset.accuracy > np.percentile(
    dataset.accuracy, 5)]  # removing very low accuracies
dataset = dataset[dataset.accuracy <= 100]  # Removing very high accuracies
dataset = dataset[dataset.elapsed <= 5000]  # Removing very high accuracies
dataset = dataset[dataset.elapsed > 300]  # Removing very high accuracies

X = dataset.iloc[:, 0:-6].values
y_age = dataset.iloc[:, -6].values
y_gender = dataset.iloc[:, -5].values
y_accuracy = dataset.iloc[:, -4].values
y_elapsed = dataset.iloc[:, -1].values
"""
Data engineering
----------
"""

# Adding interaction between personality items to X
Example #35
def vertprofileplot(ifiles, args):
    if args.variables is None:
        raise ValueError('User must specify variable(s) to plot:\n%s' %
                         '\n\t'.join(ifiles[0].variables.keys()))
    from PseudoNetCDF.coordutil import getsigmamid, getpresmid, gettimes
    import pylab as pl
    from pylab import figure, NullFormatter, close, rcParams
    rcParams['text.usetex'] = False
    from matplotlib.colors import LinearSegmentedColormap, BoundaryNorm, LogNorm
    scale = args.scale
    minmax = eval(args.minmax)
    minmaxq = eval(args.minmaxq)
    sigma = args.sigma
    maskzeros = args.maskzeros
    outunit = args.outunit
    tespaths = args.tespaths
    omipaths = args.omipaths
    edges = args.edges
    try:
        f, = ifiles
    except:
        raise ValueError(
            'curtain plot expects one file when done. Try stack time --stack=time to concatenate'
        )

    # Add CF conventions if necessary
    if 'latitude_bounds' not in f.variables.keys():
        try:
            from PseudoNetCDF import getvarpnc
            from PseudoNetCDF.conventions.ioapi import add_cf_from_ioapi
            f = getvarpnc(f, None)
            add_cf_from_ioapi(f)
        except:
            pass
    if sigma:
        vertcrd = getsigmamid(f)
    else:
        vertcrd = getpresmid(f, pref=101325., ptop=getattr(f, 'VGTOP', 10000))
        if vertcrd.max() > 2000: vertcrd /= 100.

    try:
        lonb = f.variables['geos_longitude_bounds']
        latb = f.variables['geos_latitude_bounds']
    except:
        lonb = f.variables['longitude_bounds']
        latb = f.variables['latitude_bounds']
    for var_name in args.variables:
        temp = defaultdict(lambda: 1)
        try:
            eval(var_name, None, temp)
            var = eval(var_name, None, f.variables)[:]
        except:
            temp[var_name]
            var = f.variables[var_name][:]
        if maskzeros: var = np.ma.masked_values(var, 0)
        vkeys = [k for k in temp.keys()]
        unit = f.variables[vkeys[0]].units.strip()
        if unit in unitconvert:
            var = unitconvert.get((unit, outunit), lambda x: x)(var)
        else:
            outunit = unit
        bmap = None
        vmin, vmax = np.percentile(
            np.ma.compressed(var).ravel(), list(minmaxq))
        if minmax[0] is not None:
            vmin = minmax[0]
        if minmax[1] is not None:
            vmax = minmax[1]
        if edges:
            fig = pl.figure(figsize=(16, 4))
            offset = 0.05
            ax = fig.add_axes([.1 - offset, .15, .22, .725])
            ax = fig.add_axes([.325 - offset, .15, .22, .725])
            ax = fig.add_axes([.55 - offset, .15, .22, .725])
            ax = fig.add_axes([.775 - offset, .15, .22, .725])
            ss = 0
            se = ss + f.NCOLS + 1
            es = se
            ee = se + f.NROWS + 1
            ns = ee
            ne = ee + f.NCOLS + 1
            ws = ne
            we = ws + f.NROWS + 1
            axs = fig.axes
            for ax in fig.axes[1:]:
                ax.yaxis.set_major_formatter(pl.NullFormatter())

            vars = [
                var[:, :, ss:se], var[:, :, es:ee],
                var[:, :, ns:ne][:, :, ::-1], var[:, :, ws:we][:, :, ::-1]
            ]
            lonbss = [
                lonb[ss:se], lonb[es:ee], lonb[ns:ne][::-1], lonb[ws:we][::-1]
            ]
            latbss = [
                latb[ss:se], latb[es:ee], latb[ns:ne][::-1], latb[ws:we][::-1]
            ]

        else:
            fig = pl.figure()
            ax = fig.add_subplot(111)
            axs = fig.axes
            vars = [var]
            if lonb.dimensions == ('longitude',
                                   'nv') and latb.dimensions == ('latitude',
                                                                 'nv'):
                lonbss = [lonb[:][None, :, :]]
                latbss = [latb[:][:, None, :]]
            else:
                lonbss = [lonb[:]]
                latbss = [latb[:]]
        for ax, var, lonbs, latbs in zip(axs, vars, lonbss, latbss):
            vals = var.swapaxes(0, 1).reshape(var.shape[1], -1)
            modl, modr = minmaxmean(ax,
                                    vals,
                                    vertcrd,
                                    facecolor='k',
                                    edgecolor='k',
                                    alpha=.2,
                                    zorder=4,
                                    label='mod (%d)' % vals.shape[1],
                                    ls='-',
                                    lw=2,
                                    color='k')
            llines = [(modl, modr)]
            ymin, ymax = vertcrd.min(), vertcrd.max()
            ax.set_ylim(ymax, ymin)
            ax.set_xscale(scale)
            ax.set_xlim(vmin, vmax)
            #if scale == 'log':
            #    ax.set_xticklabels(['%.1f' % (10**x) for x in ax.get_xticks()])

            if 'TFLAG' in f.variables.keys():
                SDATE = f.variables['TFLAG'][:][0, 0, 0]
                EDATE = f.variables['TFLAG'][:][-1, 0, 0]
                STIME = f.variables['TFLAG'][:][0, 0, 1]
                ETIME = f.variables['TFLAG'][:][-1, 0, 1]
                if SDATE == 0:
                    SDATE = 1900001
                    EDATE = 1900001
                sdate = datetime.strptime('%07d %06d' % (SDATE, STIME),
                                          '%Y%j %H%M%S')
                edate = datetime.strptime('%07d %06d' % (EDATE, ETIME),
                                          '%Y%j %H%M%S')
            elif 'tau0' in f.variables.keys():
                sdate = datetime(1985, 1, 1,
                                 0) + timedelta(hours=f.variables['tau0'][0])
                edate = datetime(1985, 1, 1,
                                 0) + timedelta(hours=f.variables['tau1'][-1])
            else:
                times = gettimes(f)
                sdate = times[0]
                edate = times[-1]

            if len(tespaths) > 0:
                tesl, tesr = plot_tes(ax, lonbs, latbs, tespaths)
                if not tesl is None:
                    llines.append((tesl, tesr))
            if len(omipaths) > 0:
                omil, omir = plot_omi(
                    ax,
                    lonbs,
                    latbs,
                    omipaths,
                    airden=f.variables['AIRDEN'][:].mean(0).mean(1),
                    airdenvert=vertcrd)
                if not omil is None:
                    llines.append((omil, omir))

        try:
            title = '%s to %s' % (sdate.strftime('%Y-%m-%d'),
                                  edate.strftime('%Y-%m-%d'))
        except:
            title = var_name
        if sigma:
            axs[0].set_ylabel('sigma')
        else:
            axs[0].set_ylabel('pressure')

        xmax = -np.inf
        xmin = np.inf
        for ax in fig.axes:
            tmp_xmin, tmp_xmax = ax.get_xlim()
            xmax = max(tmp_xmax, xmax)
            xmin = min(tmp_xmin, xmin)
        for ax in fig.axes:
            ax.set_xlim(xmin, xmax)

        if len(axs) == 1:
            axs[0].set_xlabel('%s %s' % (var_name, outunit))
        else:
            axs[0].set_xlabel('South')
            axs[1].set_xlabel('East')
            axs[2].set_xlabel('North')
            axs[3].set_xlabel('West')
            fig.text(.5,
                     .90,
                     '%s %s' % (var_name, outunit),
                     horizontalalignment='center',
                     fontsize=16)
        nl = 0
        for ax in axs:
            if len(ax.get_lines()) > nl:
                nl = len(ax.get_lines())
                pl.sca(ax)

        llabels = [l[0].get_label() for l in llines]
        pl.legend(llines,
                  llabels,
                  bbox_to_anchor=(.1, 1),
                  loc='upper left',
                  bbox_transform=fig.transFigure,
                  ncol=6)
        if edges:
            fig.text(0.95,
                     0.975,
                     title,
                     horizontalalignment='right',
                     verticalalignment="top",
                     fontsize=16)
        else:
            fig.text(0.95,
                     0.025,
                     title,
                     horizontalalignment='right',
                     verticalalignment="bottom",
                     fontsize=16)
        figpath = args.outpath + var_name + '.' + args.figformat
        fig.savefig(figpath)
        if args.verbose > 0: print('Saved fig', figpath)
        #pl.close(fig)
    return fig
Example #36
    def make_score_vs_rmsd_plot(self, loop):
        """
        Create a score vs RMSD plot for the given loop.  In fact two plots are 
        made: one which includes every model and one which includes only the 
        top 75% best scoring models.  Normally the second plot is of more 
        interest, because it focuses better on the interesting lower-left 
        region of the plot.  The full plots often have outliers that stretch
        the score axis.
        """

        # This method would be much more concise if it used matplotlib.

        if not loop.has_data:
            return

        tsv_path = os.path.join(loop.latex_dir, 'score_vs_rmsd.tsv')
        gnu_path = os.path.join(loop.latex_dir, 'score_vs_rmsd.gnu')
        pdf_path_100 = os.path.join(loop.latex_dir, 'score_vs_rmsd_all.pdf')
        pdf_path_75 = os.path.join(loop.latex_dir, 'score_vs_rmsd_third_quartile.pdf')

        tsv_row = '{0.id}\t{0.rmsd}\t{0.score}\n'

        sorted_models = loop.models_sorted_by_score
        scores = loop.scores
        min_score, max_score = min(scores), max(scores)
        third_quartile = numpy.percentile(scores, 75)
        native_score = 0    # This isn't stored in the database yet.

        # Write score vs RMSD data to a tab-separated value (TSV) file that can 
        # easily be parsed by gnuplot.

        with open(tsv_path, 'w') as file:
            file.write('#Model\tLoop_rmsd\tTotal_score\n')
            file.write('input_structure\t0.0\t{0}\n'.format(native_score))

            # All models
            file.write('\n\n')
            for model in sorted_models:
                file.write(tsv_row.format(model))

            # Top 5 scoring models ("5 lowest energy models" in the gnuplot labels below)
            top_x = 5
            file.write('\n\n')
            for model in sorted_models[:top_x]:
                file.write(tsv_row.format(model))

            # Top scoring model
            file.write('\n\n')
            file.write(tsv_row.format(sorted_models[0]))

        # Write the gnuplot script and generate the PDF plots.

        gnuplot_script = '''\
set autoscale
set border 31
set tics out
set terminal pdf
set xtics autofreq
set xtics nomirror
set ytics autofreq
set ytics nomirror
set noy2tics
set nox2tics

set style line 1 lt 1 lc rgb "dark-magenta" lw 2
set style line 2 lt 1 lc rgb "{loop.benchmark.color}" lw 2 ps 0.5 pt 7
set style line 3 lt 1 lc rgb "forest-green" lw 2 ps 2 pt 13
set style line 4 lt 1 lc rgb "dark-gray" lw 2 ps 0.5 pt 7
set style line 5 lt 1 lc rgb "black" lw 2 ps 0.8 pt 13
set style line 6 lt 1 lc rgb "black" lw 2
set style line 7 lt 1 lc rgb "dark-gray" lw 2
set style line 8 lt 1 lc rgb "gray" lw 2
set style line 9 lt 2 lc rgb "dark-gray" lw 5

set boxwidth 0.75

set key below right
set xrange [0:]
set encoding iso_8859_1
set title "{loop.pdb_id}: {loop.percent_subangstrom:0.2f}% sub-\305 models"
set xlabel "r.m.s. deviation to crystal loop [\305]"
set arrow from 1, graph 0 to 1, graph 1 ls 9 nohead
set ylabel "Rosetta all-atom score"
set output "{pdf_path_100}"
plot "{tsv_path}" index 1 using ($2):($3) with points ls 2 title "all models" axes x1y1, \\
     "{tsv_path}" index 2 using ($2):($3) with points ls 4 title "5 lowest energy models" axes x1y1, \\
     "{tsv_path}" index 3 using ($2):($3) with points ls 5 title "top 5 best model" axes x1y1
set yrange [:{third_quartile}]
set output "{pdf_path_75}"
set xrange [0:]
plot "{tsv_path}" index 1 using ($2):($3) with points ls 2 title "75% lowest-scoring models" axes x1y1, \\
     "{tsv_path}" index 2 using ($2):($3) with points ls 4 title "5 lowest energy models" axes x1y1, \\
     "{tsv_path}" index 3 using ($2):($3) with points ls 5 title "top 5 best model" axes x1y1
'''
        with open(gnu_path, 'w') as file:
            file.write(gnuplot_script.format(**locals()))

        utilities.run_gnuplot(gnu_path, verbose=self.verbose)

        return pdf_path_100, pdf_path_75
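As the in-code comment notes, matplotlib would be more concise than gnuplot here. A hedged sketch of the same pair of plots (all models, then the score axis capped at the 75th percentile); the rmsds/scores arrays and output paths are illustrative, not part of the original class:

import numpy
import matplotlib.pyplot as plt

def score_vs_rmsd_plots(rmsds, scores, pdf_path_100, pdf_path_75):
    third_quartile = numpy.percentile(scores, 75)
    for path, ymax in ((pdf_path_100, None), (pdf_path_75, third_quartile)):
        fig, ax = plt.subplots()
        ax.scatter(rmsds, scores, s=6)
        if ymax is not None:
            ax.set_ylim(top=ymax)
        ax.set_xlim(left=0)
        ax.set_xlabel('r.m.s. deviation to crystal loop [Å]')
        ax.set_ylabel('Rosetta all-atom score')
        fig.savefig(path)
        plt.close(fig)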
Ejemplo n.º 37
0
def save(idstr, tractor, nlscale=1., debug=False, plotAll=False, imgi=0,
         chilo=-10., chihi=10., roi=None):
    #print "Index: ", imgi
    mod = tractor.getModelImage(imgi)
    chi = tractor.getChiImage(imgi=imgi)
    synthfn = 'synth-%s.fits' % idstr
    print('Writing synthetic image to', synthfn)
    fitsio.write(synthfn, mod, clobber=True)

    pfn = 'tractor-%s.pickle' % idstr
    print('Saving state to', pfn)
    pickle_to_file(tractor, pfn)

    plt.clf()
    plt.hist(chi.ravel(), range=(-10,10), bins=100)
    plt.savefig('chi2.png')

    timg = tractor.getImage(imgi)
    data = timg.getImage()
    print('Mod type:', mod.dtype)
    print('Chi type:', chi.dtype)
    print('Data type:', data.dtype)
    zr = timg.zr
    print('zr', zr)
    # Set up nonlinear mapping based on the statistics of the data image.
    #sigma = np.median(timg.getInvError())
    #print 'sigma', sigma
    ima = dict(interpolation='nearest', origin='lower')
    if nlscale == 0.:
        ima.update(vmin=zr[0], vmax=zr[1])
    else:
        print(data.shape)
        q1,q2,q3 = np.percentile(data.ravel(), [25, 50, 75])
        print('Data quartiles:', q1, q2, q3)
        ima.update(norm = ArcsinhNormalize(mean=q2, std=(1./nlscale) * (q3-q1)/2.,
                                           vmin=zr[0], vmax=zr[1]))

    if roi is not None:
        ima.update(extent=roi)

    imchi = ima.copy()
    if nlscale == 0.:
        imchi.update(vmin=chilo, vmax=chihi, norm=None)
    else:
        imchi.update(norm = ArcsinhNormalize(mean=0., std=1./nlscale, vmin=chilo, vmax=chihi))

    imdiff = ima.copy()
    dzr = (zr[1] - zr[0])/2.
    if nlscale == 0.:
        imdiff.update(vmin=-dzr, vmax=+dzr, norm=None)
    else:
        imdiff.update(norm = ArcsinhNormalize(mean=0., std=1./nlscale, vmin=-dzr, vmax=dzr))

    if debug:
        sources = tractor.getCatalog()
        wcs = timg.getWcs()
        allobjx = []
        allobjy = []
        allobjc = []
        pointx = []
        pointy = []
        xplotx = []
        xploty = []

        for obj in sources:
            if (isinstance(obj,PointSource)):
                xt,yt = wcs.positionToPixel(obj.getPosition(), obj)
                pointx.append(xt)
                pointy.append(yt)
                continue
            print(type(obj))
            shapes = []
            attrType = []
            if (isinstance(obj,st.CompositeGalaxy)):
                for attr in 'shapeExp', 'shapeDev':
                    shapes.append(getattr(obj, attr))
                    attrType.append(attr)
            else:
                shapes.append(getattr(obj,'shape'))
                attrType.append(' ')
            x0,y0 = wcs.positionToPixel(obj.getPosition(), obj)
            
            cd = timg.getWcs().cdAtPixel(x0,y0)
            print("CD",cd)
            for i,shape in enumerate(shapes):
                xplotx.append(x0)
                xploty.append(y0)
                T=np.linalg.inv(shape.getTensor(cd))
                print("Inverted tensor:",T)
                print(obj.getPosition())
                print(i)

                x,y = [],[]
                for theta in np.linspace(0,2*np.pi,100):
                    ux = np.cos(theta)
                    uy = np.sin(theta)
                    dx,dy = np.dot(T,np.array([ux,uy]))
                    x.append(x0+dx)
                    y.append(y0+dy)
                allobjx.append(x)
                allobjy.append(y)
                if (attrType[i] == 'shapeExp'):
                    allobjc.append('b')
                elif attrType[i] == 'shapeDev':
                    allobjc.append('g')
                else:
                    allobjc.append('r')

    def savepng(pre, img, title=None, **kwargs):
        fn = '%s-%s.png' % (pre, idstr)
        print('Saving', fn)
        plt.clf()
        plt.imshow(img, **kwargs)
        ax = plt.axis()
        if debug:
            print(len(xplotx),len(allobjx))
            for i,(objx,objy,objc) in enumerate(zip(allobjx,allobjy,allobjc)):
                plt.plot(objx,objy,'-',c=objc)
                tempx = []
                tempx.append(xplotx[i])
                tempx.append(objx[0])
                tempy = []
                tempy.append(xploty[i])
                tempy.append(objy[0])
                plt.plot(tempx,tempy,'-',c='purple')
            plt.plot(pointx,pointy,'y.')
            plt.plot(xplotx,xploty,'xg')
        plt.axis(ax)
        if title is not None:
            plt.title(title)
        plt.colorbar()
        plt.gray()
        plt.savefig(fn)

    savepng('data', data, title='Data ' + timg.name, **ima)
    savepng('model', mod, title='Model ' + timg.name, **ima)
    savepng('diff', data - mod, title='Data - Model, ' + timg.name, **imdiff)
    savepng('chi',  chi, title='Chi ' + timg.name, **imchi)
    if plotAll:
        debug = False
        for i,src in enumerate(tractor.getCatalog()):
            savepng('data-s%i'%(i+1),data - sky, title='Data '+timg.name,**ima)
            modelimg = tractor.getModelImage(timg, srcs=[src])
            savepng('model-s%i'%(i+1), modelimg - sky, title='Model-s%i'%(i+1),**ima) 
            savepng('diff-s%i'%(i+1), data - modelimg, title='Diff-s%i'%(i+1),**imdiff)
            savepng('chi-s%i'%(i+1),tractor.getChiImage(imgi,srcs=[src]),title='Chi',**imchi)
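The quartile-based ArcsinhNormalize used above comes from the Tractor codebase and is not reproduced here; a hedged, standalone illustration of the same idea (a robust display stretch derived from the interquartile range) is:

import numpy as np
import matplotlib.pyplot as plt

def robust_imshow(img, nsigma=5.0):
    # the median and IQR are insensitive to a handful of very bright pixels
    q1, q2, q3 = np.percentile(img, [25, 50, 75])
    sigma = (q3 - q1) / 1.349          # IQR of a Gaussian is about 1.349 sigma
    plt.imshow(img, vmin=q2 - nsigma * sigma, vmax=q2 + nsigma * sigma,
               interpolation='nearest', origin='lower')
    plt.colorbar()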
        range_spread_list = np.empty((0, k), float)
        range_spread_cc_list = np.empty((0, k), float)
        for test_sample in range(0, distances_order.shape[0]):
            range_spread = y_train['Spread bps'].iloc[distances_order.iloc[
                test_sample, :].values]
            range_spread_cc = y_train['Spread bps'].iloc[
                d_closest_cluster.iloc[test_sample, :].values]
            range_spread_list = \
                np.vstack((range_spread_list, range_spread))
            range_spread_cc_list = \
                np.vstack((range_spread_cc_list, range_spread_cc))
        ############
        df_range_spread = pd.DataFrame(range_spread_list)
        df_range_spread_cc = pd.DataFrame(range_spread_cc_list)

        df_range_spread_75p = df_range_spread.apply(
            lambda x: np.percentile(x, 75), axis=1)
        df_range_spread_50p = df_range_spread.apply(
            lambda x: np.percentile(x, 50), axis=1)
        df_range_spread_25p = df_range_spread.apply(
            lambda x: np.percentile(x, 25), axis=1)

        df_range_spread_75p_include_pred = np.column_stack(
            (df_range_spread_75p, predictions)).max(axis=1)
        df_range_spread_25p_include_pred = np.column_stack(
            (df_range_spread_25p, predictions)).min(axis=1)

        df_coverage_real_spread = \
            sum((df_range_spread_25p < y_test['Spread bps'].values) &
                (df_range_spread_75p > y_test['Spread bps'].values))/n_test_samples
        df_coverage_predicted_spread = \
            sum((df_range_spread_25p < predictions) &
#plotting the cumulative distribution (_cdf)
plt.hist(sep,
         bins,
         prange,
         color='cyan',
         histtype='step',
         rwidth=2,
         cumulative=True,
         density=True,
         label='BCG - SZ')

print(np.median(sep))
print(np.median(sep1))
print(np.median(simuk))
print(np.percentile(
    sep1,
    .1,
))
print(np.percentile(
    sep,
    .1,
))
print(np.percentile(
    simuk,
    .1,
))
# x-axis label
plt.xlabel('Separation Value (kpc)')
# frequency label
plt.ylabel('Cumulative Value (%)')
# plot title
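Note on the percentile calls above: np.percentile expects q on a 0-100 scale, so a value of .1 requests the 0.1th percentile; the 0-1 counterpart is np.quantile. A quick check:

import numpy as np

x = np.arange(101)                # 0 .. 100
print(np.percentile(x, 0.1))      # 0.1   (0.1th percentile)
print(np.percentile(x, 10))       # 10.0  (10th percentile)
print(np.quantile(x, 0.1))        # 10.0  (same point on a 0-1 scale)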
Ejemplo n.º 40
0
def raw_chunkify_with_remap_main(args):
    """ Main function for `chunkify.py raw_remap` producing batch file for model training
    """
    if not args.overwrite:
        if os.path.exists(args.output):
            print("Cowardly refusing to overwrite {}".format(args.output))
            sys.exit(1)
        if os.path.exists(args.output_strand_list):
            print("Cowardly refusing to overwrite {}".format(
                args.output_strand_list))
            sys.exit(2)

    fast5_files = fast5.iterate_fast5(args.input_folder,
                                      paths=True,
                                      limit=args.limit,
                                      strand_list=args.input_strand_list)

    references = util.fasta_file_to_dict(args.references)

    print('* Processing data using', args.jobs, 'threads')

    kwarg_names = [
        'trim', 'min_prob', 'kmer_len', 'min_length', 'prior', 'slip',
        'chunk_len', 'normalisation', 'downsample_factor', 'interpolation',
        'open_pore_fraction'
    ]
    kwargs = util.get_kwargs(args, kwarg_names)
    kwargs['references'] = references

    i = 0
    compiled_file = helpers.compile_model(args.model, args.compile)
    output_strand_list_entries = []
    bad_list = []
    chunk_list = []
    label_list = []
    with open(args.output_strand_list, 'w') as slfh:
        slfh.write(u'\t'.join([
            'filename', 'nblocks', 'score', 'nstay', 'seqlen', 'start', 'end'
        ]) + u'\n')
        for res in imap_mp(
                raw_chunk_remap_worker,
                fast5_files,
                threads=args.jobs,
                fix_kwargs=kwargs,
                unordered=True,
                init=batch.init_chunk_remap_worker,
                initargs=[compiled_file, args.kmer_len, args.alphabet]):
            if res is not None:
                i = util.progress_report(i)

                read, score, nblocks, path, seq, chunks, labels, bad_ev = res

                chunk_list.append(chunks)
                label_list.append(labels)
                bad_list.append(bad_ev)
                strand_data = [
                    read, nblocks, -score / nblocks,
                    np.sum(np.ediff1d(path, to_begin=1) == 0),
                    len(seq),
                    min(path),
                    max(path)
                ]
                slfh.write('\t'.join([str(x) for x in strand_data]) + '\n')

    if compiled_file != args.compile:
        os.remove(compiled_file)

    if chunk_list == []:
        print("no chunks were produced", file=sys.stderr)
        sys.exit(1)
    else:
        print('\n* Writing out to HDF5')
        hdf5_attributes = {
            'chunk': args.chunk_len,
            'downsample_factor': args.downsample_factor,
            'input_type': 'raw',
            'interpolation': args.interpolation,
            'kmer': args.kmer_len,
            'normalisation': args.normalisation,
            'section': 'template',
            'trim': args.trim,
            'alphabet': args.alphabet,
        }
        blanks_per_chunk = np.concatenate([(l == 0).mean(1)
                                           for l in label_list])
        blanks = np.percentile(blanks_per_chunk, args.blanks_percentile)
        util.create_labelled_chunks_hdf5(args.output, blanks, hdf5_attributes,
                                         chunk_list, label_list, bad_list)
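A hedged illustration of the blanks statistic computed at the end of the function above: each chunk's fraction of blank (label 0) positions, reduced to a single value with np.percentile. The toy label arrays are made up; the actual filtering happens inside util.create_labelled_chunks_hdf5, which is not shown here.

import numpy as np

label_list = [np.array([[0, 1, 2, 0],
                        [3, 0, 0, 0]]),
              np.array([[1, 2, 3, 4]])]
blanks_per_chunk = np.concatenate([(l == 0).mean(1) for l in label_list])
print(blanks_per_chunk)                      # [0.5  0.75 0.  ]
print(np.percentile(blanks_per_chunk, 50))   # 0.5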
Ejemplo n.º 41
0
def binning_data_split(df, var, global_bt, global_gt, min_sample, alpha=0.01):
    """
    Specify the data split level and return the split value list
    :return:
    """
    iv_var = InfoValue()
    # Calculate the IV of the current node before splitting
    gd = calulate_iv(df, var, global_bt, global_gt)

    woei, ivi = gd['woei'], gd['ivi']

    if np.unique(df[var]).__len__() <= 8:
        # print 'running into if'
        split = list(np.unique(df[var]))
        split.sort()
        # print 'split:',split
        #Segmentation point checking and processing
        split = check_point(df, var, split, min_sample)
        split.sort()
        # print 'after check:',split
        iv_var.split_list = split
        return node(split_point=split, iv=ivi)

    percent_value = list(np.unique(np.percentile(df[var], range(100))))
    percent_value.sort()

    if percent_value.__len__() <= 2:
        iv_var.split_list = sorted(np.unique(percent_value))
        return node(split_point=percent_value, iv=ivi)

    # A sentry that attempts to split the current node
    # Init bestSplit_iv with zero
    bestSplit_iv = 0
    bestSplit_woel = []
    bestSplit_woer = []
    bestSplit_ivl = 0
    bestSplit_ivr = 0
    bestSplit_point = []
    bestSplit_dataset_l = bestSplit_dataset_r = None

    # Exclude the maximum value so that neither dataset_r nor dataset_l ends up empty
    for point in percent_value[0:percent_value.__len__() - 1]:
        # If there is only a sample or a negative sample, skip
        if set(df[df[var] > point]['target']).__len__() == 1 or set(df[df[var] <= point]['target']).__len__() == 1 \
                or df[df[var] > point].shape[0] < min_sample or df[df[var] <= point].shape[0] < min_sample :
            continue

        woel, woer, iv, dataset_l, dataset_r, ivl, ivr = calculate_iv_split(
            df, var, point, global_bt, global_gt)

        if iv > bestSplit_iv:
            bestSplit_woel = woel
            bestSplit_woer = woer
            bestSplit_iv = iv
            bestSplit_point = point
            bestSplit_dataset_r = dataset_r
            bestSplit_dataset_l = dataset_l
            bestSplit_ivl = ivl
            bestSplit_ivr = ivr

    # If the IV after the split exceeds the IV of the current node by more than the
    # learning-rate factor alpha (0.01), the split is accepted and we recurse
    if bestSplit_dataset_r is not None and bestSplit_iv > ivi * (1 + alpha) \
            and bestSplit_dataset_r.shape[0] > min_sample \
            and bestSplit_dataset_l.shape[0] > min_sample:
        presplit_right = node()
        presplit_left = node()

        # Determine whether the right node satisfies the segmentation prerequisite
        if bestSplit_dataset_r.shape[0] < min_sample or set(
                bestSplit_dataset_r['target']).__len__() == 1:
            presplit_right.iv = bestSplit_ivr
            right = presplit_right
        else:
            right = binning_data_split(bestSplit_dataset_r,
                                       var,
                                       global_bt,
                                       global_gt,
                                       min_sample,
                                       alpha=0.01)

        # Determine whether the left node satisfies the segmentation prerequisite
        if bestSplit_dataset_l.shape[0] < min_sample or np.unique(
                bestSplit_dataset_l['target']).__len__() == 1:
            presplit_left.iv = bestSplit_ivl
            left = presplit_left
        else:
            left = binning_data_split(bestSplit_dataset_l,
                                      var,
                                      global_bt,
                                      global_gt,
                                      min_sample,
                                      alpha=0.01)

        return node(var_name=var,
                    split_point=bestSplit_point,
                    iv=ivi,
                    left=left,
                    right=right)
    else:
        # Returns the current node as the final leaf node
        return node(var_name=var, iv=ivi)
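The candidate split points above are deduplicated percentiles of the variable; in isolation that step is simply:

import numpy as np
import pandas as pd

x = pd.Series([1, 1, 2, 2, 2, 3, 5, 8, 13, 21])
candidates = sorted(np.unique(np.percentile(x, range(100))))
print(candidates)   # repeated percentile values collapse to a short list of split points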
def RSF_bootstrap(fp, num=False):
    df = pd.read_csv(fp, index_col=0)

    # configure bootstrap (sampling 50% of data)
    n_iterations = 100
    n_size = int(len(df) * 0.50)

    # parameters
    NUMESTIMATORS = 100
    TESTSIZE = 0.20
    random_state = 20

    # calculate population of statistics
    metrics = []
    for i in range(n_iterations):
        # prepare sample

        # if indicated, include number of mets (col 42)
        if num:
            sample = resample(df.iloc[:, np.r_[:20, 40, 41, 42]],
                              n_samples=n_size)
            X = sample.iloc[:, np.r_[:20, 42]].copy()

        else:
            sample = resample(df.iloc[:, np.r_[:20, 40, 41]], n_samples=n_size)
            X = sample.iloc[:, :20].copy()

        X = X.to_numpy().astype('float64')
        y = sample[['Event', 'Time']].copy()
        y['Event'] = y['Event'].astype('bool')
        y['Time'] = y['Time'].astype('float64')
        y = y.to_records(index=False)

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=TESTSIZE, random_state=random_state)
        rsf = RandomSurvivalForest(n_estimators=NUMESTIMATORS,
                                   min_samples_split=15,
                                   min_samples_leaf=8,
                                   max_features="sqrt",
                                   n_jobs=-1,
                                   random_state=random_state)
        rsf.fit(X_train, y_train)

        score = rsf.score(X_test, y_test)
        metrics.append(score)

    # calculate confidence interval
    alpha = 0.95
    p = ((1.0 - alpha) / 2.0) * 100
    lower = max(0.0, np.percentile(metrics, p))
    p = (alpha + ((1.0 - alpha) / 2.0)) * 100
    upper = min(1.0, np.percentile(metrics, p))
    med = np.percentile(metrics, 50)

    # identify aggregation method name
    if num:
        name = fp.split('/')[-1].split('_')[0] + ' + NumMets'
    else:
        name = fp.split('/')[-1].split('_')[0]

    return print(name, 'RSF', '%.3f (%.3f-%.3f)' % (med, lower, upper))
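The interval reported above is the standard percentile bootstrap; the arithmetic in isolation:

import numpy as np

def percentile_ci(metrics, alpha=0.95):
    """Median and central alpha-level interval of bootstrap scores."""
    lower = np.percentile(metrics, ((1.0 - alpha) / 2.0) * 100)
    upper = np.percentile(metrics, (alpha + (1.0 - alpha) / 2.0) * 100)
    return np.percentile(metrics, 50), lower, upper

# med, lo, hi = percentile_ci(metrics)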
Ejemplo n.º 43
0
    def __init__(self,
                 audio_dir,
                 sample_rate,
                 speakers_sub_list=None):
        self.audio_dir = os.path.expanduser(audio_dir)  # for the ~/
        self.sample_rate = sample_rate
        self.metadata = dict()  # small cache <SPEAKER_ID -> SENTENCE_ID, filename>
        self.cache = dict()  # big cache <filename, data:audio librosa, blanks.>

        logger.debug('Initializing AudioReader()')
        logger.debug('audio_dir = {}'.format(self.audio_dir))
        logger.debug('sample_rate = {}'.format(sample_rate))
        logger.debug('speakers_sub_list = {}'.format(speakers_sub_list))

        st = time()
        if len(find_files(TMP_DIR, pattern='*.pkl')) == 0:  # generate all the pickle files.
            logger.debug('Nothing found at {}. Generating all the caches now.'.format(TMP_DIR))
            files = find_files(self.audio_dir)
            assert len(files) != 0, 'Generate your cache please.'
            logger.debug('Found {} files in total in {}.'.format(len(files), self.audio_dir))
            if speakers_sub_list is not None:
                files = list(
                    filter(lambda x: any(word in extract_speaker_id(x) for word in speakers_sub_list), files))
                logger.debug('{} files correspond to the speaker list {}.'.format(len(files), speakers_sub_list))
            assert len(files) != 0

            bar = tqdm(files)
            for filename in bar:
                bar.set_description(filename)
                try:
                    speaker_id = extract_speaker_id(filename)
                    audio, _ = read_audio_from_filename(filename, self.sample_rate)
                    energy = np.abs(audio[:, 0])
                    silence_threshold = np.percentile(energy, 95)
                    offsets = np.where(energy > silence_threshold)[0]
                    left_blank_duration_ms = (1000.0 * offsets[0]) // self.sample_rate  # frame_id to duration (ms)
                    right_blank_duration_ms = (1000.0 * (len(audio) - offsets[-1])) // self.sample_rate
                    # _, left_blank, right_blank = trim_silence(audio[:, 0], silence_threshold)
                    logger.info('_' * 100)
                    logger.info('left_blank_duration_ms = {}, right_blank_duration_ms = {}, '
                                'audio_length = {} frames, silence_threshold = {}'.format(left_blank_duration_ms,
                                                                                          right_blank_duration_ms,
                                                                                          len(audio),
                                                                                          silence_threshold))
                    obj = {'audio': audio,
                           'audio_voice_only': audio[offsets[0]:offsets[-1]],
                           'left_blank_duration_ms': left_blank_duration_ms,
                           'right_blank_duration_ms': right_blank_duration_ms,
                           FILENAME: filename}
                    cache_filename = filename.split('/')[-1].split('.')[0] + '_cache'
                    tmp_filename = os.path.join(TMP_DIR, cache_filename) + '.pkl'
                    with open(tmp_filename, 'wb') as f:
                        dill.dump(obj, f)
                        logger.debug('[DUMP AUDIO] {}'.format(tmp_filename))
                    # commit to metadata dictionary when you're sure no errors occurred during processing.
                    if speaker_id not in self.metadata:
                        self.metadata[speaker_id] = {}
                    sentence_id = extract_sentence_id(filename)
                    if sentence_id not in self.metadata[speaker_id]:
                        self.metadata[speaker_id][sentence_id] = []
                    self.metadata[speaker_id][sentence_id] = {SPEAKER_ID: speaker_id,
                                                              SENTENCE_ID: sentence_id,
                                                              FILENAME: filename}
                except librosa.util.exceptions.ParameterError as e:
                    logger.error(e)
                    logger.error('[DUMP AUDIO ERROR SKIPPING FILENAME] {}'.format(filename))
            dill.dump(self.metadata, open(os.path.join(TMP_DIR, 'metadata.pkl'), 'wb'))

        logger.debug(
            'Using the generated files at {}. Using them to load the cache. Be sure to have enough memory.'.format(
                TMP_DIR))
        self.metadata = dill.load(open(os.path.join(TMP_DIR, 'metadata.pkl'), 'rb'))

        pickle_files = find_files(TMP_DIR, pattern='*.pkl')
        for pkl_file in tqdm(pickle_files, desc='reading cache'):
            if 'metadata' not in pkl_file:
                with open(pkl_file, 'rb') as f:
                    obj = dill.load(f)
                    self.cache[obj[FILENAME]] = obj
        logger.debug('Cache took {0:.2f} seconds to load. {1:} keys.'.format(time() - st, len(self.cache)))
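A hedged, standalone restatement of the silence heuristic used in the cache-building loop above (the 95th percentile of absolute amplitude as the voice/silence threshold); audio is assumed here to be a 1-D float array rather than the 2-D array used above:

import numpy as np

def trim_to_voice(audio, sample_rate, percentile=95):
    energy = np.abs(audio)
    threshold = np.percentile(energy, percentile)
    offsets = np.where(energy > threshold)[0]
    left_blank_ms = 1000.0 * offsets[0] / sample_rate
    right_blank_ms = 1000.0 * (len(audio) - offsets[-1]) / sample_rate
    return audio[offsets[0]:offsets[-1]], left_blank_ms, right_blank_ms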
Ejemplo n.º 44
0
n_inj = []
nsources = []
TS = []
beta = (0.5)  #For background ts
TS_beta = []  #Calculated from the total TS median after we get all the TS.
beta_err = []
gamma = []

for file in files:
    for item in range(len(file['n_inj'])):
        n_inj.append(file['n_inj'][item])
        nsources.append(file['nsources'][item])
        TS.append(file['TS'][item])
        gamma.append(file['gamma'][item])

TSs = file['TS']
TS_beta = np.percentile(TSs, 100. * (1. - beta))
m = np.count_nonzero(np.asarray(TSs) > (TS_beta))
i = len(TSs)
fraction = float(m) / float(i)
beta_err = (np.sqrt(fraction * (1. - fraction) /
                    float(i)) if 0 < beta < 1 else 1.)

##Now we have all the pieces of the original dictionary. Time to glue bckg_trials back in place, in their proper file type.##
bckg_trials = {
    'n_inj': n_inj,
    'nsources': np.asarray(nsources),
    'TS': np.asarray(TS),
    'beta': beta,
    'beta_err': beta_err,
    'TS_beta': TS_beta,
    'gamma': np.asarray(gamma)
}
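A hedged restatement of the threshold logic above: TS_beta is the background TS value exceeded by a fraction beta of trials, with a binomial error on that fraction.

import numpy as np

def ts_threshold(ts_values, beta=0.5):
    ts_values = np.asarray(ts_values)
    ts_beta = np.percentile(ts_values, 100.0 * (1.0 - beta))
    fraction = np.mean(ts_values > ts_beta)
    beta_err = (np.sqrt(fraction * (1.0 - fraction) / len(ts_values))
                if 0 < beta < 1 else 1.0)
    return ts_beta, fraction, beta_err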
def creative_boxplot(ax: matplotlib.axes,
                     data: List[np.ndarray or List[int or float]]
                     or np.ndarray,
                     bins: int = 10,
                     whis: float = 1.5,
                     labelset: list or bool = False,
                     showcaps: bool = True,
                     showfliers: bool = True,
                     showmeans: bool = True,
                     showtrend: bool = True,
                     variawidth: bool = True,
                     curfacecolor: str = 'white',
                     curlinecolor: str = 'black',
                     curalpha: int = 1,
                     outlierlinecolor: str = 'white',
                     outliercolor: str = 'steelblue',
                     outlierlinewidth: int = 1,
                     capcolor: str = 'black',
                     capwidth: int or float = 1,
                     whiskercolor: str = 'black',
                     whiskerwidth: int or float = 1,
                     boxfacecolor: str = 'white',
                     boxedgecolor: str = 'black',
                     boxedgewidth: int or float = 1,
                     mediancolor: str = 'orange',
                     medianwidth: int or float = 1,
                     medianlinestyle: str = '-',
                     meancolor: str = 'green',
                     meanwidth: int or float = 1,
                     meanlinestyle: str = '--',
                     trendcolor: str = 'blue',
                     trendwidth: int or float = 1.5,
                     trendlinestyle: str = ':',
                     rotation: int or float = 0) -> matplotlib.axes:
    """
    Make a creative mixed plot with various properties assignable, such as color, width and line style.
    The box plot is on the left half and the frequency area is on the right side.

    Make a box and whisker plot for each data set in the data list. The box extends from the lower to upper quartile
    values of the data, with a line at the median. The whiskers extend from the box to show the range of the data.
    Outliers are the points past the ends of the whiskers. The face color of the box and of the
    outliers, and the line colors of the box, whiskers, caps, outliers, and median can all be
    specified, as can whether to show the caps, outliers, means, and the trend line between boxes.
    Dataset labels and other properties such as line width and line style are also configurable.
    In addition, the widths of the boxes can be made proportional to the sample sizes when comparing
    grouped data, and for time-series data a dotted line between the boxes can show how the median
    varies across samples.


    Parameters
    ----------
    ax: matplotlib.axes

    data: List[np.ndarray or List[int or float]] or np.ndarray
        consists in a list of list and each item of data is a list containing multiple series of numerical values.

    bins: int, default: 10
        The number of bins used to build the frequency curve on the right half of each box.

    whis: float, default: 1.5
        The position of the whiskers.
        If a float, the lower whisker is at the lowest datum above Q1 - whis*(Q3-Q1),
        and the upper whisker at the highest datum below Q3 + whis*(Q3-Q1), where Q1 and Q3 are the first and third quartiles.
        The default value of whis = 1.5 corresponds to Tukey's original definition of boxplots.

    labelset: list or bool, optional, default: False
        Labels for each dataset (one per dataset); if False, the datasets are labelled 1, 2, 3, ...

    showcaps: bool, default: True
        If True, show the caps on the ends of whiskers.

    showfliers: bool, default: True
        If True, show the outliers beyond the caps.

    showmeans: bool, default: True
        If True, show the arithmetic means.

    showtrend: bool, default: True
        If True, show the broken line among medians of datasets

    variawidth: bool, default: True
        If True, change the widths of boxes according to the sizes of datasets

    capcolor: color, default: 'black'
        The color of caps (horizontal lines at the ends of the whiskers)

    capwidth: float or int, default: 1
        The width of caps (horizontal lines at the ends of the whiskers)

    whiskercolor: color, default: 'black'
        The color of whiskers (the vertical lines extending to the most extreme, non-outlier data points)

    whiskerwidth: float or int, default: 1
        The width of whiskers (the vertical lines extending to the most extreme, non-outlier data points)

    boxfacecolor: color, default: 'white'
        The color of the faces of the boxes

    boxedgecolor: color, default: 'black'
        The color of the edges of the boxes

    boxedgewidth: float or int, default: 1
        The width of the edges of the boxes

    mediancolor: color, default: 'orange'
        The color of the median lines in the boxes

    medianwidth: float or int, default: 1
        The width of the median lines in the boxes

    medianlinestyle: str, default: '-'
        The line style of the median lines in the boxes
            '-': solid line style
            '--': dashed line style
            '-.': dash-dot line style
            ':': dotted line style

    meancolor: color, default: 'green'
        The color of the mean lines in the boxes

    meanwidth: float or int, default: 1
        The width of the mean lines in the boxes

    meanlinestyle: str, default:'--'
         The line style of the mean lines in the boxes
             '-': solid line style
             '--': dashed line style
             '-.': dash-dot line style
             ':': dotted line style
    trendcolor: color, default: 'blue'
        The color of the line connecting the medians of the boxes

    trendwidth: float or int, default: 1.5
        The width of the line connecting the medians of the boxes

    trendlinestyle: str, default:':'
        The line style of the line connecting the medians of the boxes
            '-': solid line style
            '--': dashed line style
            '-.': dash-dot line style
            ':': dotted line style

    curlinecolor: str, default: 'black'
        The color of the edges of the curves

    curfacecolor: str, default: 'white'
        The color of the faces of the curves

    curalpha: int, default: 1
        The transparency of faces of the curves

    outliercolor: color, default: 'steelblue'
        The face color of the points representing the outliers

    outlierlinecolor: color, default: 'white'
        The edge color of the points representing the outliers

    outlierlinewidth: float or int, default: 1
        The edge width of the points representing the outliers


    Returns
    -------
        matplotlib.axes

    """

    try:
        bins += 0
    except TypeError as err:
        print("The bins should be integer")
        raise err
    if isinstance(data, np.ndarray):
        assert len(data.shape) == 2, "The input should be 2-D array"
        assert data.dtype != '<U11', "The element in 2-D array should be numerical values"
    else:
        data = input_checking(data)

    # set x-axis and y-axis
    labels = [i + 1 for i in range(len(data))]
    y_min = min(min(data[i]) for i in range(len(data)))
    y_max = max(max(data[i]) for i in range(len(data)))
    ax.set_ylim(y_min - 0.1 * (abs(y_max)), y_max + 0.1 * (abs(y_max)))
    ax.set_xlim(0, len(labels) + 1)

    ax.set_xticks(labels)
    if labelset:
        ax.set_xticklabels(labelset, rotation=rotation)

    proportion = []
    for index in data:
        proportion.append(len(index))

    # set a box for each list of data
    for index in range(len(data)):
        # set the width of the box and caps
        if variawidth:
            width = 0.5 * (proportion[index] / sum(proportion))
        else:
            width = 0.25
        # get the quantiles
        quantiles = np.percentile(data[index], (25, 50, 75))
        iqr = quantiles[2] - quantiles[0]
        # the lower bound of the box
        low_bound = quantiles[0] - whis * iqr
        # the upper bound of the box
        up_bound = quantiles[2] + whis * iqr
        # define the top of box
        box_top = min(max(data[index]), up_bound)
        # define the bottom of box
        box_bottom = max(min(data[index]), low_bound)

        height = max(data[index]) - min(data[index])
        ax.vlines(labels[index],
                  ymin=min(data[index]),
                  ymax=max(data[index]),
                  linewidth=1)
        inter = height / bins
        barwidth = height / bins
        total = []
        low = min(data[index])
        yli = []
        xli = []
        for m in range(bins):
            count = 0
            for n in data[index]:
                # the last bin gets a closed upper edge so the maximum value is counted
                if n >= low and (m == bins - 1 or n < low + inter):
                    count += 1
            low += inter
            total.append(count)
        total = [(x - min(total)) / (max(total) - min(total)) * 0.5
                 for x in total]  # scaler to(0,0.5)
        for p in range(len(total)):
            yli.append(min(data[index]) + p * barwidth + barwidth / 2)
            xli.append(total[p] + labels[index])
        xli.insert(0, labels[index])
        xli.append(labels[index])
        yli.insert(0, box_bottom)
        yli.append(box_top)

        from operator import itemgetter
        yli, xli = [
            list(x) for x in zip(*sorted(zip(yli, xli), key=itemgetter(0)))
        ]
        y = np.array(yli)
        ynew = np.linspace(min(y), max(y), 1000)

        from scipy.interpolate import make_interp_spline
        power_smooth = make_interp_spline(yli,
                                          xli,
                                          bc_type=([(1, 0.0)], [(1, 0.0)
                                                                ]))(ynew)
        for t in range(len(power_smooth)):
            if power_smooth[t] < labels[index]:
                power_smooth[t] = labels[index]
        ax.fill_betweenx(ynew,
                         labels[index],
                         power_smooth,
                         facecolor=curfacecolor,
                         edgecolor=curlinecolor,
                         alpha=curalpha)

        rect = plt.Rectangle((labels[index] - width, quantiles[0]),
                             width,
                             quantiles[2] - quantiles[0],
                             color=boxfacecolor)
        ax.add_patch(rect)

        # pick out and draw the outliers
        outliers = np.concatenate((data[index][low_bound > data[index]],
                                   data[index][up_bound < data[index]]))
        for o in outliers:
            if showfliers:
                trans = (ax.figure.dpi_scale_trans +
                         transforms.ScaledTranslation(labels[index], o,
                                                      ax.transData))
                circle = matplotlib.patches.Circle((0, 0),
                                                   0.04,
                                                   edgecolor=outlierlinecolor,
                                                   facecolor=outliercolor,
                                                   transform=trans,
                                                   linewidth=outlierlinewidth)
                ax.add_patch(circle)
            # do not consider outliers when drawing the boxplot
            data[index] = data[index][~np.isin(data[index], o)]
        # draw the bottom of box
        ax.hlines(quantiles[0],
                  labels[index] - width,
                  labels[index],
                  linewidth=boxedgewidth,
                  color=boxedgecolor)
        # draw the median of box
        ax.hlines(
            quantiles[1],
            labels[index] - width,
            labels[index],
            color=mediancolor,
            linewidth=medianwidth,
            ls=medianlinestyle,
        )
        # draw the top of box
        ax.hlines(quantiles[2],
                  labels[index] - width,
                  labels[index],
                  linewidth=boxedgewidth,
                  color=boxedgecolor)
        if showcaps:
            # draw the high cap
            ax.hlines(box_top,
                      labels[index] - width / 2,
                      labels[index] + width / 2,
                      linewidth=capwidth,
                      color=capcolor)
            # draw the low cap
            ax.hlines(box_bottom,
                      labels[index] - width / 2,
                      labels[index] + width / 2,
                      linewidth=capwidth,
                      color=capcolor)
        # draw the low whisker
        ax.vlines(labels[index],
                  ymin=box_bottom,
                  ymax=quantiles[0],
                  linewidth=whiskerwidth,
                  color=whiskercolor)
        # draw the high whisker
        ax.vlines(labels[index],
                  ymin=quantiles[2],
                  ymax=box_top,
                  linewidth=whiskerwidth,
                  color=whiskercolor)
        # draw the left bound of whisker
        ax.vlines(labels[index] - width,
                  ymin=quantiles[0],
                  ymax=quantiles[2],
                  linewidth=boxedgewidth,
                  color=boxedgecolor)
        if showtrend:
            if index > 0:
                verts = [
                    (labels[index - 1], lastme),
                    (labels[index], quantiles[1]),
                ]
                codes = [
                    Path.MOVETO,
                    Path.LINETO,
                ]
                path = Path(verts, codes)
                patch = patches.PathPatch(path,
                                          color=trendcolor,
                                          ls=trendlinestyle,
                                          lw=trendwidth)
                ax.add_patch(patch)
            lastme = quantiles[1]
        if showmeans:
            ax.hlines(np.mean(data[index]),
                      labels[index] - width,
                      labels[index],
                      color=meancolor,
                      ls=meanlinestyle,
                      linewidth=meanwidth)
    return ax
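A minimal usage sketch for the function above, assuming the module's input_checking helper (defined elsewhere and not shown here) accepts a plain list of NumPy arrays:

import numpy as np
import matplotlib.pyplot as plt

rng = np.random.default_rng(0)
series = [rng.normal(loc, 1.0, size=n) for loc, n in ((0, 200), (1, 80), (0.5, 150))]
fig, ax = plt.subplots()
creative_boxplot(ax, series, labelset=['a', 'b', 'c'], boxfacecolor='lightcyan')
plt.show()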
Ejemplo n.º 46
0
    def plot(self, background=False):

        # capval_percentile = 95%
        
        kwargs1 = {}
        kwargs2 = {}
        if 'fc' in self.plot_kwargs:
            kwargs1['fc'] = self.plot_kwargs['fc']

        if 'alpha' in self.plot_kwargs:
            kwargs1['alpha'] = self.plot_kwargs['alpha']
            
        if 'label_fontsize' in self.plot_kwargs:
            kwargs2['fontsize'] = self.plot_kwargs['label_fontsize']
        else:
            kwargs2['fontsize'] = 9

        if 'label' in self.plot_kwargs:
            label = self.plot_kwargs['label']
        else:
            label = self.df.columns[3]

        ly = self.layout

        if ly.chrms_plot is None:
            vals = [v for k,(c,beg,end,v) in self.df.iterrows()]
        else:
            vals = [v for k,(c,beg,end,v) in self.df.iterrows() if c in ly.chrms_plot]
            
        minval = np.min(vals)
        maxval = np.max(vals)
        if 'capval_percentile' in self.plot_kwargs:
            capval = np.percentile(vals, self.plot_kwargs['capval_percentile'])
        else:
            capval = maxval
        valrange = float(capval - minval)
        
        angle1s = []
        anglewids = []
        heights = []
        for k, (chrm, beg, end, value) in self.df.iterrows():
            if chrm not in ly.chrm2angles:
                continue

            if value > capval:
                value = capval
                
            angle1 = ly.loc2angle(chrm, beg)
            anglewid = ly.loc2angle(chrm, end) - angle1
            angle1s.append(angle1)
            anglewids.append(anglewid)
            heights.append(self.track_height*(value-minval)/valrange*self.maxr)

        # ha='center', va='center', rotation=normalize_text_angle(text_angle/(np.pi*2)*360,tangent=True),
        
        # plotting background
        bg_start = []
        bg_width = []
        bg_height = []
        if background:
            for chrm in ly.chrm2angles:
                angle_beg = ly.loc2angle(chrm,0)
                angle_end = ly.loc2angle(chrm,ly.chrm2len[chrm])
                bg_start.append(angle_beg)
                bg_width.append(angle_end-angle_beg)
                bg_height.append(self.track_height*self.maxr)
            ly.ax.bar(bg_start, bg_height, bg_width, bottom=self.track_bottom, ec='none', fc='grey', align='edge', alpha=0.1)
        
        # plot data
        ly.ax.bar(angle1s, heights, anglewids, bottom=self.track_bottom, ec='none', align='edge',  **kwargs1)
        if 'labelside_text_angle' in self.plot_kwargs:
            text_angle = self.plot_kwargs['labelside_text_angle'] / 180.0 * np.pi
        else:
            text_angle = ly.angle_beg

        if 'label' in self.plot_kwargs:
            ly.ax.text(text_angle, self.track_bottom + self.track_height/2.0, label,
                       fontname=ly.fontname, va='center', ha='left', **kwargs2)

        if 'labelside_circle' in self.plot_kwargs:
            fc = self.plot_kwargs['fc'] if 'fc' in self.plot_kwargs else 'r'
            if 'labelside_circ_angle' in self.plot_kwargs:
                circ_angle = self.plot_kwargs['labelside_circ_angle']
            else:
                circ_angle = ly.angle_beg
            ly.ax.add_artist(mpatches.Circle(
                polar2cart(circ_angle, self.track_bottom),
                self.plot_kwargs['labelside_circle'],
                edgecolor=fc, facecolor=fc, alpha=0.4, lw=0.1, transform=ly.ax.transData._b))
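A hedged illustration of the capping step above: values are clipped at capval (a chosen percentile) before being scaled to the track height, so a few extreme bins cannot flatten the rest of the track.

import numpy as np

vals = np.array([1.0, 2.0, 3.0, 2.0, 150.0, 4.0, 2.0])
capval = np.percentile(vals, 95)
clipped = np.minimum(vals, capval)
heights = (clipped - vals.min()) / float(capval - vals.min())
print(np.round(heights, 3))   # the extreme value saturates at 1.0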
def histobox_plot(ax: matplotlib.axes,
                  data: List[np.ndarray or List[int or float]] or np.ndarray,
                  bins: int = 10) -> matplotlib.axes:
    """

    Drawing function for a plot that mixes a box plot and a histogram.

    Draw a mixed plot for each data set in the data list. The left half is a traditional box plot,
    and the right half is a histogram reflecting the distribution.

    Parameters
    ----------
    ax: matplotlib.axes

    data: List[np.ndarray or List[int or float]] or np.ndarray
        consists in a list of list and each item of data is a list containing multiple series of numerical values.

    bins: int, default: 10
        The number of bins in the histogram drawn on the right half of each box.

    Returns
    -------
        matplotlib.axes

    """

    # input checking
    try:
        bins += 0
    except TypeError as err:
        print("The bins should be integer")
        raise err
    if isinstance(data, np.ndarray):
        assert len(data.shape) == 2, "The input should be 2-D array"
        assert data.dtype != '<U11', "The element in 2-D array should be numerical values"
    else:
        data = input_checking(data)

    # set x-axis and y-axis
    labels = [i + 1 for i in range(len(data))]
    ax.set_xticks(labels)
    y_min = min(min(data[i]) for i in range(len(data)))
    y_max = max(max(data[i]) for i in range(len(data)))
    ax.set_ylim(y_min - 0.1 * abs(y_max), y_max + 0.1 * (abs(y_max)))
    ax.set_xlim(0, len(labels) + 1)

    # set a box for each list of data
    for index in range(len(data)):
        # set the width of the box and caps
        width = 0.2
        # get the quantiles
        quantiles = np.percentile(data[index], (25, 50, 75))
        iqr = quantiles[2] - quantiles[0]
        # the lower bound of the box
        low_bound = quantiles[0] - 1.5 * iqr
        # the upper bound of the box
        up_bound = quantiles[2] + 1.5 * iqr

        # deal with the bar plot
        height = max(data[index]) - min(data[index])
        ax.vlines(labels[index],
                  ymin=min(data[index]),
                  ymax=max(data[index]),
                  linewidth=1)
        inter = height / bins
        barwidth = height / bins
        total = []
        low = min(data[index])
        for m in range(bins):
            count = 0
            for n in data[index]:
                # the last bin gets a closed upper edge so the maximum value is counted
                if n >= low and (m == bins - 1 or n < low + inter):
                    count += 1
            low += inter
            total.append(count)
        # scaler to(0,0.5)
        total = [(x - min(total)) / (max(total) - min(total)) * 0.5
                 for x in total]
        for p in range(len(total)):
            rect = plt.Rectangle(
                (labels[index], min(data[index]) + p * barwidth),
                total[p],
                barwidth,
                edgecolor='black',
                facecolor='silver')
            ax.add_patch(rect)

        # pick out and draw the outliers
        outliers = np.concatenate((data[index][low_bound > data[index]],
                                   data[index][up_bound < data[index]]))
        for o in outliers:
            trans = (
                ax.figure.dpi_scale_trans +
                transforms.ScaledTranslation(labels[index], o, ax.transData))
            circle = matplotlib.patches.Circle((0, 0),
                                               0.04,
                                               edgecolor='black',
                                               facecolor='white',
                                               transform=trans)
            ax.add_patch(circle)
            # do not consider outliers when drawing the boxplot
            data[index] = data[index][~np.isin(data[index], o)]

        # draw the whisker,caps and box
        # define the top of box
        box_top = min(max(data[index]), up_bound)
        # define the bottom of box
        box_bottom = max(min(data[index]), low_bound)
        # draw the bottom of box
        ax.hlines(quantiles[0],
                  labels[index] - width,
                  labels[index],
                  linewidth=1)
        # draw the median of box
        ax.hlines(quantiles[1],
                  labels[index] - width,
                  labels[index],
                  linewidth=1)
        # draw the top of box
        ax.hlines(quantiles[2],
                  labels[index] - width,
                  labels[index],
                  linewidth=1)
        # draw the high cap
        ax.hlines(box_top,
                  labels[index] - width / 2,
                  labels[index],
                  linewidth=1)
        # draw the low cap
        ax.hlines(box_bottom,
                  labels[index] - width / 2,
                  labels[index],
                  linewidth=1)
        # draw the low whisker
        ax.vlines(labels[index],
                  ymin=box_bottom,
                  ymax=quantiles[0],
                  linewidth=1)
        # draw the high whisker
        ax.vlines(labels[index], ymin=quantiles[2], ymax=box_top, linewidth=1)
        # draw the left bound of whisker
        ax.vlines(labels[index] - width,
                  ymin=quantiles[0],
                  ymax=quantiles[2],
                  linewidth=1)

    ax.set_xlim(0, len(labels) + 1)
    return ax
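Usage sketch for histobox_plot, under the same input_checking assumption as noted for creative_boxplot above:

import numpy as np
import matplotlib.pyplot as plt

rng = np.random.default_rng(1)
series = [rng.normal(0, 1, 300), rng.normal(2, 0.5, 120)]
fig, ax = plt.subplots()
histobox_plot(ax, series, bins=12)
plt.show()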
Ejemplo n.º 48
0
def find_bright_spots(image,
                      n_clusters=3,
                      blur_radius=21,
                      amount_of_bright_parts=0.8,
                      return_all_pos=False):
    """
    Find the pixel locations of the top-k brightest spots in a color image.

    :param image: input image. Must be a multi-channel RGB color image
    :type image: numpy.ndarray
    :param n_clusters: expected number of clusters/brightest spots in the input image
    :type n_clusters: int
    :param blur_radius: radius of the Gaussian blur kernel that used to smooth the image
    :type blur_radius: int
    :param amount_of_bright_parts: amount of bright parts in an image. Used to find the lower bound for \
    distinguishing the bright and non-bright part of the input image. Range of amount_of_bright_parts is \
    in [0, 1] (all non-bright -> all bright)
    :type amount_of_bright_parts: float
    :return: The location of centers of the top-k bright spots (with irregular shape), percentage of dominance of each spot \
    (relative size of the spot)
    :rtype: (numpy.ndarray, numpy.ndarray)
    """
    assert 0 <= amount_of_bright_parts <= 1, "amount_of_bright_parts must be in [0, 1]"
    assert n_clusters >= 1, "The number of bright spots must be larger or equal to 1"

    amount_of_bright_parts = amount_of_bright_parts * 100
    # Convert RGB to grayscale
    grayscale_img = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)

    # Blur the image with radius = blurRadius
    blurred_img = cv2.GaussianBlur(grayscale_img, (blur_radius, blur_radius),
                                   0)

    # Compute the lower bound threshold
    lower_bound = np.percentile(blurred_img, 100 - amount_of_bright_parts) - 1

    # Threshold the image by setting any pixel with a value above the lower bound to 255
    threshed_img = cv2.threshold(blurred_img, lower_bound, 255,
                                 cv2.THRESH_BINARY)[1]

    # Clean up the edges of the brightest spots.
    threshed_img = cv2.erode(threshed_img, None, iterations=2)
    threshed_img = cv2.dilate(threshed_img, None, iterations=4)

    # Get the location of all white pixel in binary threshold image
    locs = np.argwhere(threshed_img == 255)
    try:
        # convert to np.float32
        locs_above_threshold = np.float32(locs)

        # define criteria and apply kmeans()
        criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 10,
                    1.0)
        ret, label, center = cv2.kmeans(locs_above_threshold, n_clusters, None,
                                        criteria, 10,
                                        cv2.KMEANS_RANDOM_CENTERS)

        # Compute the percentage of each clusters in the image.
        percent_of_dominance = compute_percents_of_labels(label)

        if return_all_pos:
            return label, locs_above_threshold.astype(
                "int32"), percent_of_dominance
        else:
            return center.astype("int32"), percent_of_dominance

    except Exception:
        # On failure (e.g. no pixels above the threshold), return negative arrays.
        if return_all_pos:
            return np.array([-1]), np.array([-1]), np.array([-1])
        else:
            return np.ones(shape=(n_clusters,
                                  1)) * -1, np.ones(shape=(n_clusters, 1)) * -1
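A hedged illustration of the thresholding step inside find_bright_spots: roughly the brightest amount_of_bright_parts fraction of pixels is kept by thresholding at the complementary percentile.

import numpy as np

img = np.random.rand(100, 100)        # stand-in for the blurred grayscale image
amount_of_bright_parts = 0.2
lower_bound = np.percentile(img, 100 * (1 - amount_of_bright_parts))
mask = img >= lower_bound
print(mask.mean())                    # close to 0.2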
    def combine_tracks(self,
                       plot=True,
                       overwrite=False,
                       search_radius=8,
                       min_duration=8,
                       min_duration_in_start_area=3,
                       propagation_speed=-2,
                       propagation_length=10,
                       lat_restriction=[5, 35]):
        out_file = self._working_dir + str(self._identifier) + '_track_info.nc'
        if overwrite and os.path.isfile(out_file):
            os.system('rm ' + out_file)
            os.system('rm ' + self._working_dir + 'track_path/' +
                      str(self._identifier) + '_*_*.png')

        elif overwrite == False and os.path.isfile(out_file):
            self._aews = da.read_nc(out_file)
            return self._aews

        def unit_vector(vector):
            return vector / np.linalg.norm(vector)

        def angle_between(v1, v2):
            if sum(v1) == 0 or sum(v2) == 0:
                return 0
            v1_u = unit_vector(v1)
            v2_u = unit_vector(v2)
            return np.arccos(np.clip(np.dot(v1_u, v2_u), -1.0, 1.0))

        def v_len(v):
            return sum([zz**2 for zz in v])**0.5

        # convert distances from degrees into grid-cells
        search_radius = self.degree_to_step(search_radius)
        propagation_length = self.degree_to_step(propagation_length)

        found_id = 0
        found_tracks = {}

        postions = self._detected.copy().values
        used_pos = []
        print('combining tracks\n10------50-------100')
        for p_0, progress in zip(
                postions.tolist(),
                np.array([['-'] + [''] * (len(postions.tolist()) // 20 + 1)] *
                         20).flatten()[0:len(postions.tolist())]):
            sys.stdout.write(progress)
            sys.stdout.flush()
            track = [p_0, p_0]
            running = True
            #go backwards
            while True:
                p = track[0]
                p__ = track[1]
                candidates = {}
                v_1 = ((p[1] - p__[1]), (p[2] - p__[2]))
                for p_1 in postions[postions[:, 0] == p[0] - 1, :].tolist():
                    v_2 = ((p_1[1] - p[1]), (p_1[2] - p[2]))
                    if v_len(v_2) < search_radius:
                        candidates[v_len(v_2) * angle_between(v_1, v_2)] = p_1
                        end = False
                if len(candidates.keys()) > 0:
                    track = [candidates[min(candidates)]] + track
                else:
                    break

            #go forewards
            while True:
                p = track[-1]
                p__ = track[-2]
                candidates = {}
                v_1 = ((p[1] - p__[1]), (p[2] - p__[2]))
                for p_1 in postions[postions[:, 0] == p[0] + 1, :].tolist():
                    v_2 = ((p_1[1] - p[1]), (p_1[2] - p[2]))
                    if v_len(v_2) < search_radius:
                        candidates[v_len(v_2) * angle_between(v_1, v_2)] = p_1
                        end = False
                if len(candidates.keys()) > 0:
                    track = track + [candidates[min(candidates)]]
                else:
                    break

            track.remove(p_0)
            track = da.DimArray(track,
                                axes=[np.array(track)[:, 0], self._detected.z],
                                dims=['time', 'z'])

            # conditions:
            keep = True

            # duration
            if len(track.time) < min_duration:
                keep = False

            # duration in start domain
            if min_duration_in_start_area > 1 and keep:
                start_of_track = track[
                    self._lons[np.array(track[:, 'y'], int),
                               np.array(track[:, 'x'], int)] > -40]
                if len(start_of_track.time) < min_duration_in_start_area:
                    keep = False

            # propagation speed (in starting domain?)
            if propagation_speed is not None and len(track.time) > 1 and keep:
                c = np.array([
                    self.step_to_distance(track[i - 1, 'y'], track[i, 'y'],
                                          track[i - 1, 'x'], track[i, 'x'])[0]
                    / (6. * 60. * 60) for i in track.time[1:]
                    if i - 1 in track.time
                ])
                if np.percentile(c, 50) > propagation_speed:
                    keep = False

            # propagation length
            if propagation_length is not None and len(track.time) > 1 and keep:
                if abs(self._lons[int(track.ix[-1, 1]),
                                  int(track.ix[-1, 2])] -
                       self._lons[int(track.ix[0, 1]),
                                  int(track.ix[0, 2])]) < propagation_length:
                    keep = False

            # lat genesis restriction
            if lat_restriction is not None and keep:
                if self._lats[np.array(track.ix[0, 1], int),
                              np.array(track.ix[0, 2], int)] > max(
                                  lat_restriction) or self._lats[
                                      np.array(track.ix[0, 1], int),
                                      np.array(track.ix[0, 2], int)] < min(
                                          lat_restriction):
                    keep = False

            # delete duplicates
            if keep:
                for id__, track__ in found_tracks.items():
                    if sum([
                            pp in track__.values.tolist()
                            for pp in track.values.tolist()
                    ]) / float(len(track.values.tolist())) != 0:
                        if track[:,
                                 'members'].sum() > track__[:,
                                                            'members'].sum():
                            found_tracks.pop(id__)
                            break
                        else:
                            keep = False
                            break

            if keep:
                found_tracks[found_id] = track
                found_id += 1

        self._aews = {}
        self._id = 0
        for track in found_tracks.values():
            track = da.DimArray(track,
                                axes=[np.array(track)[:, 0], self._detected.z],
                                dims=['time', 'z'])
            self._aews[self._identifier + '_' + str(self._id)] = track
            if plot: self.plot_track_path(track)
            self._id += 1

        self._aews = da.Dataset(self._aews)
        self._aews.write_nc(out_file, mode='w')
        print('\ndone')
        return self._aews
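The backward and forward extensions above link detections greedily: among all positions at the neighbouring time step that lie within search_radius, they keep the candidate with the smallest distance weighted by the turning angle of the track. A minimal standalone sketch of that selection step follows; the original v_len and angle_between helpers are not shown in this listing, so the versions below, the best_candidate name and the sample points are assumptions, with positions laid out as (time, y, x) as in the code above.

import numpy as np

def v_len(v):
    # Euclidean length of a 2-D displacement vector (assumed helper)
    return float(np.hypot(v[0], v[1]))

def angle_between(v1, v2):
    # angle in radians between two vectors: 0 if parallel, pi if opposite (assumed helper)
    n1, n2 = np.linalg.norm(v1), np.linalg.norm(v2)
    if n1 == 0 or n2 == 0:
        return 0.0
    cos = np.clip(np.dot(v1, v2) / (n1 * n2), -1.0, 1.0)
    return float(np.arccos(cos))

def best_candidate(p_prev, p_cur, detections, search_radius):
    # Among detections at the next time step, keep the point that continues the
    # track most smoothly: cost = step length * turning angle.
    v_1 = (p_cur[1] - p_prev[1], p_cur[2] - p_prev[2])
    scored = {}
    for p_1 in detections:
        v_2 = (p_1[1] - p_cur[1], p_1[2] - p_cur[2])
        if v_len(v_2) < search_radius:
            scored[v_len(v_2) * angle_between(v_1, v_2)] = p_1
    return scored[min(scored)] if scored else None

# made-up example: a track heading east prefers the detection straight ahead
print(best_candidate((0, 10.0, 10.0), (1, 10.0, 12.0),
                     [(2, 10.0, 14.0), (2, 13.0, 12.0)], search_radius=5.0))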
def info_boxplot_v1(
    ax: matplotlib.axes, data: List[np.ndarray or List[int or float]]
    or np.ndarray) -> matplotlib.axes:
    """
    Drawing function for box plots.

    This is the 1st version of info_boxplot, which satisfies requirement 1.
    For each series the function draws a box showing the first quartile (Q1), the median,
    the third quartile (Q3), the lower fence (sometimes called the “minimum”) and the
    upper fence. Outliers are drawn as individual points. A usage sketch follows the
    function definition.

    Parameters
    ----------
    ax: matplotlib.axes.Axes

    data: List[np.ndarray or List[int or float]] or np.ndarray
        A list of lists (or a 2-D array) in which each item is a series of numerical values.

    Returns
    -------
    matplotlib.axes

    """

    # input checking
    if isinstance(data, np.ndarray):
        assert len(data.shape) == 2, "The input should be a 2-D array"
        assert data.dtype != '<U11', "The elements of the 2-D array should be numerical values"
    else:
        data = input_checking(data)

    # set x-axis and y-axis
    labels = [i + 1 for i in range(len(data))]
    y_min = min(min(data[i]) for i in range(len(data)))
    y_max = max(max(data[i]) for i in range(len(data)))
    ax.set_ylim(y_min - 0.1 * (abs(y_max)), y_max + 0.1 * (abs(y_max)))
    ax.set_xlim(0, len(labels) + 1)
    ax.set_xticks(labels)

    # set a box for each list of data
    for index in range(len(data)):
        # set the width of the box and caps
        width = 0.2
        # get the quantiles
        quantiles = np.percentile(data[index], (25, 50, 75))
        iqr = quantiles[2] - quantiles[0]
        # the lower outlier fence (Q1 - 1.5*IQR)
        low_bound = quantiles[0] - 1.5 * iqr
        # the upper outlier fence (Q3 + 1.5*IQR)
        up_bound = quantiles[2] + 1.5 * iqr

        # pick out and draw the outliers
        outliers = np.concatenate((data[index][low_bound > data[index]],
                                   data[index][up_bound < data[index]]))
        for o in outliers:
            trans = (
                ax.figure.dpi_scale_trans +
                transforms.ScaledTranslation(labels[index], o, ax.transData))
            circle = matplotlib.patches.Circle((0, 0),
                                               0.04,
                                               edgecolor='black',
                                               facecolor='white',
                                               transform=trans)
            ax.add_patch(circle)
            # do not consider outliers when drawing the boxplot
            data[index] = data[index][~np.isin(data[index], o)]

        # draw the whiskers, caps and box
        # the end of the upper whisker (where the upper cap is drawn)
        box_top = min(max(data[index]), up_bound)
        # the end of the lower whisker (where the lower cap is drawn)
        box_bottom = max(min(data[index]), low_bound)
        # draw the bottom of box
        ax.hlines(quantiles[0],
                  labels[index] - width,
                  labels[index] + width,
                  linewidth=1)
        # draw the median of box
        ax.hlines(quantiles[1],
                  labels[index] - width,
                  labels[index] + width,
                  color='orange',
                  linewidth=1)
        # draw the top of box
        ax.hlines(quantiles[2],
                  labels[index] - width,
                  labels[index] + width,
                  linewidth=1)
        # draw the high cap
        ax.hlines(box_top,
                  labels[index] - width / 2,
                  labels[index] + width / 2,
                  linewidth=1)
        # draw the low cap
        ax.hlines(box_bottom,
                  labels[index] - width / 2,
                  labels[index] + width / 2,
                  linewidth=1)
        # draw the low whisker
        ax.vlines(labels[index],
                  ymin=box_bottom,
                  ymax=quantiles[0],
                  linewidth=1)
        # draw the high whisker
        ax.vlines(labels[index], ymin=quantiles[2], ymax=box_top, linewidth=1)
        # draw the left side of the box
        ax.vlines(labels[index] - width,
                  ymin=quantiles[0],
                  ymax=quantiles[2],
                  linewidth=1)
        # draw the right side of the box
        ax.vlines(labels[index] + width,
                  ymin=quantiles[0],
                  ymax=quantiles[2],
                  linewidth=1)
    return ax
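A minimal usage sketch for info_boxplot_v1, assuming the function above (and matplotlib/numpy) is available in the same session; the uniform sample data is made up and chosen so that no 1.5*IQR outliers occur.

import numpy as np
import matplotlib.pyplot as plt

rng = np.random.default_rng(0)
# three made-up series as rows of a 2-D array
data = np.stack([rng.uniform(lo, lo + 4.0, size=200) for lo in (0.0, 1.0, 2.0)])

fig, ax = plt.subplots()
info_boxplot_v1(ax, data)  # one box per row of data
plt.show()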
Example No. 51
0
    def find_bad_by_correlation(
        self, corr_thresh=0.4, fraction_bad=0.1, corr_window_secs=1.0
    ):
        """Detect channels that do not correlate well with the other channels.

        Divide the whole signal into windows and compute window-wise
        correlations. If a channel correlates less than `corr_thresh` with the
        other channels in more than `fraction_bad` of the windows, that
        channel is considered `bad_by_correlation`. The measure of correlation
        with the other channels is the 98th percentile of the absolute values
        of its correlations with the other channels in each window. A
        standalone sketch of this measure follows the method.

        Parameters
        ----------
        corr_thresh : float
            The minimum correlation threshold that should be attained within a
            data window.

        fraction_bad : float
            If the fraction of data windows in which a channel fails to reach
            `corr_thresh` exceeds this value, classify the channel as
            `bad_by_correlation`.

        corr_window_secs : float
            Width of the correlation window in seconds.

        """
        # Based on the data, determine how many windows we need
        # and how large they should be
        correlation_frames = corr_window_secs * self.sfreq
        correlation_window = np.arange(0, correlation_frames)
        n = correlation_window.shape[0]
        correlation_offsets = np.arange(
            0, (self.signal_len - correlation_frames), correlation_frames
        )
        w_correlation = correlation_offsets.shape[0]

        # preallocate
        channel_correlations = np.ones((w_correlation, self.n_chans))

        # Cut the data into windows
        x_bp_window = self.x_bp[: self.n_chans, : n * w_correlation]
        x_bp_window = x_bp_window.reshape(self.n_chans, n, w_correlation)

        # Perform Pearson correlations across channels per window.
        # For each channel, take the 98th percentile of the absolute values of
        # its correlations with the other channels as a measure of how well
        # correlated that channel is with the others.
        for k in range(w_correlation):
            eeg_portion = x_bp_window[:, :, k]
            window_correlation = np.corrcoef(eeg_portion)
            abs_corr = np.abs(
                (window_correlation - np.diag(np.diag(window_correlation)))
            )
            channel_correlations[k, :] = np.percentile(abs_corr, 98, axis=0)

        # Perform thresholding to see which channels correlate badly with the
        # other channels in a certain fraction of windows (fraction_bad)
        thresholded_correlations = channel_correlations < corr_thresh
        frac_bad_corr_windows = np.mean(thresholded_correlations, axis=0)

        # find the corresponding channel names and return
        bad_idxs_bool = frac_bad_corr_windows > fraction_bad
        bad_idxs = np.argwhere(bad_idxs_bool)
        bads = self.ch_names[bad_idxs.astype(int)]
        bads = [i[0] for i in bads]
        bads.sort()
        self.bad_by_correlation = bads
        self._channel_correlations = channel_correlations
        return None
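The window-wise measure used above (per window, the 98th percentile of each channel's absolute correlations with the other channels) can be sketched in isolation. The function below and its synthetic data are made up for illustration and are not the library's API.

import numpy as np

def window_correlation_measure(data, sfreq, window_secs=1.0):
    # Per window: the 98th percentile of each channel's absolute correlation
    # with the other channels (the diagonal is zeroed out first).
    n_chans, n_samples = data.shape
    n = int(window_secs * sfreq)
    n_windows = n_samples // n
    measures = np.ones((n_windows, n_chans))
    windows = data[:, :n * n_windows].reshape(n_chans, n_windows, n)
    for k in range(n_windows):
        corr = np.corrcoef(windows[:, k, :])
        abs_corr = np.abs(corr - np.diag(np.diag(corr)))
        measures[k, :] = np.percentile(abs_corr, 98, axis=0)
    return measures

# synthetic example: three channels share a common signal, the fourth is pure noise
rng = np.random.default_rng(1)
common = rng.normal(size=1000)
data = np.vstack([common + 0.1 * rng.normal(size=1000) for _ in range(3)]
                 + [rng.normal(size=1000)])
measures = window_correlation_measure(data, sfreq=100.0)
frac_bad = np.mean(measures < 0.4, axis=0)  # fraction of low-correlation windows per channel
print(frac_bad)                             # the last (noise) channel stands out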
def info_boxplot_v3(ax: matplotlib.axes,
                    data: List[np.ndarray or List[int or float]] or np.ndarray,
                    facecolor: str = 'white',
                    outliercolor: str = 'steelblue',
                    boxlinecolor: str = 'black',
                    whiskercolor: str = 'black',
                    outlierlinecolor: str = 'white',
                    capcolor: str = 'black',
                    medianlinecolor: str = 'orange',
                    multiplebox: bool = True) -> matplotlib.axes:
    """
    Drawing function for box plots.

    This is the 3rd version of info_boxplot, which satisfies requirement 3. Building on the previous
    `info_boxplot`, `info_boxplot_v3` can draw a line at every 5th percentile between the 1st quartile (Q1)
    and the 3rd quartile (Q3), and its colors are configurable. A usage sketch follows the function definition.

    Parameters
    ----------
    ax: matplotlib.axes.Axes

    data: List[np.ndarray or List[int or float]] or np.ndarray
        A list of lists (or a 2-D array) in which each item is a series of numerical values.

    facecolor: str, default: 'white'
        The color of the faces of boxes.

    outliercolor: str, default: 'steelblue'
        The color of points which represent outliers.

    outlierlinecolor: str, default: 'white'
        The color of the edges of points which represent outliers.

    boxlinecolor: str, default: 'black'
        The color of the edges of the boxes.

    whiskercolor: str, default: 'black'
        The color of whiskers (the vertical lines extending to the most extreme, non-outlier data points).

    capcolor: str, default: 'black'
        The color of caps (horizontal lines at the ends of the whiskers).

    medianlinecolor: str, default: 'orange'
        The color of the median lines in the boxes.

    multiplebox: bool, default: True
        If True, draw a line at every 5th percentile between the 1st quartile (Q1) and the
        3rd quartile (Q3).

    Returns
    -------
        matplotlib.axes

    """

    # input checking
    if isinstance(data, np.ndarray):
        assert len(data.shape) == 2, "The input should be a 2-D array"
        assert data.dtype != '<U11', "The elements of the 2-D array should be numerical values"
    else:
        data = input_checking(data)

    # set x-axis and y-axis
    labels = [i + 1 for i in range(len(data))]
    y_min = min(min(data[i]) for i in range(len(data)))
    y_max = max(max(data[i]) for i in range(len(data)))
    ax.set_ylim(y_min - 0.1 * abs(y_max), y_max + 0.1 * (abs(y_max)))
    ax.set_xlim(0, len(labels) + 1)
    ax.set_xticks(labels)

    # set a box for each list of data
    for index in range(len(data)):
        width = 0.2  # set the width of the box and caps
        quantiles = np.percentile(data[index],
                                  (25, 50, 75))  # get the quantiles
        iqr = quantiles[2] - quantiles[0]
        low_bound = quantiles[0] - 1.5 * iqr  # the lower outlier fence (Q1 - 1.5*IQR)
        up_bound = quantiles[2] + 1.5 * iqr  # the upper outlier fence (Q3 + 1.5*IQR)

        # pick out and draw the outliers
        outliers = np.concatenate((data[index][low_bound > data[index]],
                                   data[index][up_bound < data[index]]))
        for o in outliers:
            trans = (
                ax.figure.dpi_scale_trans +
                transforms.ScaledTranslation(labels[index], o, ax.transData))
            circle = matplotlib.patches.Circle((0, 0),
                                               0.04,
                                               edgecolor=outlierlinecolor,
                                               facecolor=outliercolor,
                                               transform=trans)
            ax.add_patch(circle)
            # do not consider outliers when drawing the boxplot
            data[index] = data[index][~np.isin(data[index], o)]

        # draw the whiskers, caps and box
        # the end of the upper whisker (where the upper cap is drawn)
        box_top = min(max(data[index]), up_bound)
        # the end of the lower whisker (where the lower cap is drawn)
        box_bottom = max(min(data[index]), low_bound)
        # draw the bottom of box
        ax.hlines(quantiles[0],
                  labels[index] - width,
                  labels[index] + width,
                  linewidth=1,
                  color=boxlinecolor)
        # draw the median of box
        ax.hlines(quantiles[1],
                  labels[index] - width,
                  labels[index] + width,
                  color=medianlinecolor,
                  linewidth=1)
        # draw the top of box
        ax.hlines(quantiles[2],
                  labels[index] - width,
                  labels[index] + width,
                  linewidth=1,
                  color=boxlinecolor)
        # draw the high cap
        ax.hlines(box_top,
                  labels[index] - width / 2,
                  labels[index] + width / 2,
                  linewidth=1,
                  color=capcolor)
        # draw the low cap
        ax.hlines(box_bottom,
                  labels[index] - width / 2,
                  labels[index] + width / 2,
                  linewidth=1,
                  color=capcolor)
        # draw the low whisker
        ax.vlines(labels[index],
                  ymin=box_bottom,
                  ymax=quantiles[0],
                  linewidth=1,
                  color=whiskercolor)
        # draw the high whisker
        ax.vlines(labels[index],
                  ymin=quantiles[2],
                  ymax=box_top,
                  linewidth=1,
                  color=whiskercolor)
        # draw the left side of the box
        ax.vlines(labels[index] - width,
                  ymin=quantiles[0],
                  ymax=quantiles[2],
                  linewidth=1,
                  color=boxlinecolor)
        # draw the right side of the box
        ax.vlines(labels[index] + width,
                  ymin=quantiles[0],
                  ymax=quantiles[2],
                  linewidth=1,
                  color=boxlinecolor)

        if multiplebox:
            per5 = np.percentile(data[index],
                                 (30, 35, 40, 45, 50, 55, 60, 65, 70),
                                 interpolation='midpoint')
            for k in range(len(per5)):
                ax.hlines(per5[k],
                          labels[index] - width,
                          labels[index] + width,
                          linewidth=1,
                          color=boxlinecolor)
            ax.hlines(quantiles[1],
                      labels[index] - width,
                      labels[index] + width,
                      color=medianlinecolor,
                      linewidth=3)

        # define the color of the box's face
        rect = plt.Rectangle((labels[index] - width, quantiles[0]),
                             2 * width,
                             quantiles[2] - quantiles[0],
                             color=facecolor)
        ax.add_patch(rect)
    return ax
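A usage sketch for info_boxplot_v3, assuming the function above is available; the colors are arbitrary and the uniform sample data is made up (chosen so that no 1.5*IQR outliers occur).

import numpy as np
import matplotlib.pyplot as plt

rng = np.random.default_rng(2)
data = np.stack([rng.uniform(lo, lo + 4.0, size=300) for lo in (0.0, 2.0, 4.0)])

fig, ax = plt.subplots()
info_boxplot_v3(ax, data,
                facecolor='lightgrey',
                medianlinecolor='darkred',
                multiplebox=True)  # also draws the 5th-percentile lines between Q1 and Q3
plt.show()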
Example No. 53
0
        #To only consider the area inside the box for detecting the digit
        #roi = Region of Interest
        roi = gray[upper_left[1]:bottom_right[1],
                   upper_left[0]:bottom_right[0]]

        #convert cv2 to pil format
        im_pil = Image.fromarray(roi)

        #convert to grayscale image - 'L' format means each pixel is
        #represented by a single value from 0 to 255
        image_bw = im_pil.convert('L')
        image_bw_resized = image_bw.resize((28, 28), Image.ANTIALIAS)

        image_bw_resized_inverted = PIL.ImageOps.invert(image_bw_resized)
        pixel_filter = 20
        min_pixel = np.percentile(image_bw_resized_inverted, pixel_filter)
        image_bw_resized_inverted_scaled = np.clip(
            image_bw_resized_inverted - min_pixel, 0, 255)
        max_pixel = np.max(image_bw_resized_inverted)
        image_bw_resized_inverted_scaled = np.asarray(
            image_bw_resized_inverted_scaled) / max_pixel
        test_sample = np.array(image_bw_resized_inverted_scaled).reshape(
            1, 784)
        test_pred = clf.predict(test_sample)
        print("Predicted class is :-", test_pred)

        # Display the resulting frame
        cv2.imshow('frame', gray)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
    except Exception as e:
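The percentile-based contrast stretch in the snippet above (clip everything below the pixel_filter-th percentile, then scale by the image maximum) can be sketched on its own; the rescale_roi name and the 28x28 test patch below are made up.

import numpy as np

def rescale_roi(img, pixel_filter=20):
    # Clip values below the chosen percentile and scale by the image maximum,
    # mirroring the min_pixel / max_pixel steps in the snippet above.
    img = np.asarray(img, dtype=float)
    min_pixel = np.percentile(img, pixel_filter)
    scaled = np.clip(img - min_pixel, 0, 255)
    max_pixel = np.max(img)
    return scaled / max_pixel if max_pixel > 0 else scaled

# made-up 28x28 "digit" patch with a bright centre
patch = np.zeros((28, 28))
patch[10:18, 10:18] = 200.0
test_sample = rescale_roi(patch).reshape(1, 784)  # same flattening as above
print(test_sample.min(), test_sample.max())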
Example No. 54
0
import os
import numpy as np
import sys
sys.path.append('../')
from word_count_helpers import *

# Load the file names                                                                                                                                      
dataDir = "../../../data/word_count/"
fnames = os.listdir(dataDir)

allcounts = []
for fname in fnames:
    with open(dataDir + fname, 'r', encoding='utf-8') as f:
        text = cleantext(f.readlines())
    allcounts.append(countwords(text))

globalcounts = dict()
for counts in allcounts:
    globalcounts = { k: counts.get(k, 0) + globalcounts.get(k, 0) for k in set(counts) | set(globalcounts) }
    
for counts in allcounts:
    counts = { k:v for k, v in counts.items() if v > np.percentile(list(counts.values()),98) }
    normcounts = { k: counts.get(k, 0) / globalcounts.get(k, 0) for k in set(counts) & set(globalcounts) }
    top5 = sorted(normcounts, key=normcounts.get, reverse=True)[:5]
    line = ''
    for k in top5:
        line = line + "%s: %s" % (k, normcounts[k]) + "; "
    print(line[:-2])
Example No. 55
0
# the resolution and S/N cuts.   Use these distributions to make cuts.
for ind in range(len(fwhm_arr)):
    fwhm = fwhm_arr[ind]
    print 'Beginning work for FWHM=%.2f arcsec:' % fwhm

    # Get the resolution for this FWHM value.
    res = resolution_arr[:, ind]

    # Get the subsets that pass / fail the resolution>=1/3 cut.
    pass_cuts = res >= 1. / 3
    fail_cuts = (1 - pass_cuts).astype(bool)

    # Find the 5th percentile in half-light radius for galaxies that pass the resolution cut.  Then
    # check what fraction of the galaxies that fail the resolution cut would be eliminated if we
    # cut there.
    cut_val = np.percentile(gal_hlr[pass_cuts], 5.)
    elim_frac = float(np.sum(gal_hlr[fail_cuts] < cut_val)) / len(
        gal_hlr[fail_cuts])
    print '    Radius cut at %.3f arcsec eliminates a fraction %f of res failures' % (
        cut_val, elim_frac)

    if do_plot:
        fig = plt.figure()
        ax = fig.add_subplot(111)
        n, bins, patches = ax.hist(gal_hlr[pass_cuts],
                                   np.linspace(0., 1., 21),
                                   facecolor='green',
                                   alpha=0.75)
        n, bins, patches = ax.hist(gal_hlr[fail_cuts],
                                   np.linspace(0., 1., 21),
                                   facecolor='red',
Example No. 56
0
    'LW': LW,
}
fit2 = pystan.stan(model_code=model, data=data, iter=5000, chains=4)

la2 = fit2.extract()
fit2

plt.figure(figsize=(15, 7))
cmap = matplotlib.cm.get_cmap('tab10')

for j, player in enumerate(arr_target_player):

    samples = la2['mu'][:, j, :]

    medians = np.median(samples, axis=0)
    lower, upper = np.percentile(samples, q=[25.0, 75.0], axis=0)

    c = cmap(j)

    plt.plot(arr_target_year, medians, marker='o', label=player, color=c)
    plt.fill_between(arr_target_year, lower, upper, alpha=0.2, color=c)

plt.xlabel('year')
plt.ylabel('latent strength')
plt.legend(loc='lower left', bbox_to_anchor=(1, 0.5))
plt.show()


cmap = matplotlib.cm.get_cmap('tab10')

for j, player in enumerate(arr_target_player):
Example No. 57
0
def get_sum_metrics(batch_output,
                    batch_target,
                    metrics_type,
                    test=False,
                    printDice=False):

    if torch.is_tensor(batch_output):
        batch_output = batch_output.data.cpu().numpy()
    if torch.is_tensor(batch_target):
        batch_target = batch_target.data.cpu().numpy()
    assert batch_output.shape == batch_target.shape
    assert len(batch_output.shape) == 4
    spacing = (1, 1)
    size = batch_output.shape[0]
    metrics = dict.fromkeys(metrics_type, 0)
    dices = []
    for i in range(size):
        output = batch_output[i, 0]
        target = batch_target[i, 0]
        labelPred = sitk.GetImageFromArray(output, isVector=False)
        labelPred.SetSpacing(spacing)
        labelTrue = sitk.GetImageFromArray(target, isVector=False)
        labelTrue.SetSpacing(spacing)  # spacing order (x, y, z)
        # voxel_metrics
        pred = output.astype(int)
        gdth = target.astype(int)
        fp_array = copy.deepcopy(pred)  # keep pred unchanged
        fn_array = copy.deepcopy(gdth)
        gdth_sum = np.sum(gdth)
        pred_sum = np.sum(pred)
        intersection = gdth & pred
        union = gdth | pred
        intersection_sum = np.count_nonzero(intersection)
        union_sum = np.count_nonzero(union)

        tp_array = intersection

        tmp = pred - gdth
        fp_array[tmp < 1] = 0

        tmp2 = gdth - pred
        fn_array[tmp2 < 1] = 0

        tn_array = np.ones(gdth.shape) - union

        tp, fp, fn, tn = np.sum(tp_array), np.sum(fp_array), np.sum(
            fn_array), np.sum(tn_array)

        smooth = EPSILON
        precision = (tp) / (pred_sum + smooth)
        recall = (tp) / (gdth_sum + smooth)

        false_positive_rate = (fp) / (fp + tn + smooth)
        false_negative_rate = (fn) / (fn + tp + smooth)

        jaccard = (intersection_sum) / (union_sum + smooth)
        dice = (2 * intersection_sum) / (gdth_sum + pred_sum + smooth)
        ppv = (intersection_sum) / (pred_sum + smooth)
        dicecomputer = sitk.LabelOverlapMeasuresImageFilter()
        dicecomputer.Execute(labelTrue > 0.5, labelPred > 0.5)

        # distance_metrics
        signed_distance_map = sitk.SignedMaurerDistanceMap(
            labelTrue > 0.5, squaredDistance=False,
            useImageSpacing=True)  # It needs to be adapted.

        ref_distance_map = sitk.Abs(signed_distance_map)

        ref_surface = sitk.LabelContour(labelTrue > 0.5, fullyConnected=True)

        statistics_image_filter = sitk.StatisticsImageFilter()
        statistics_image_filter.Execute(ref_surface > 0.5)

        num_ref_surface_pixels = int(statistics_image_filter.GetSum())

        signed_distance_map_pred = sitk.SignedMaurerDistanceMap(
            labelPred > 0.5, squaredDistance=False, useImageSpacing=True)

        seg_distance_map = sitk.Abs(signed_distance_map_pred)

        seg_surface = sitk.LabelContour(labelPred > 0.5, fullyConnected=True)

        seg2ref_distance_map = ref_distance_map * sitk.Cast(
            seg_surface, sitk.sitkFloat32)

        ref2seg_distance_map = seg_distance_map * sitk.Cast(
            ref_surface, sitk.sitkFloat32)

        statistics_image_filter.Execute(seg_surface > 0.5)

        num_seg_surface_pixels = int(statistics_image_filter.GetSum())

        seg2ref_distance_map_arr = sitk.GetArrayViewFromImage(
            seg2ref_distance_map)
        seg2ref_distances = list(
            seg2ref_distance_map_arr[seg2ref_distance_map_arr != 0])
        seg2ref_distances = seg2ref_distances + list(
            np.zeros(num_seg_surface_pixels - len(seg2ref_distances)))
        ref2seg_distance_map_arr = sitk.GetArrayViewFromImage(
            ref2seg_distance_map)
        ref2seg_distances = list(
            ref2seg_distance_map_arr[ref2seg_distance_map_arr != 0])
        ref2seg_distances = ref2seg_distances + list(
            np.zeros(num_ref_surface_pixels - len(ref2seg_distances)))  #
        all_surface_distances = seg2ref_distances + ref2seg_distances

        metrics['dice'] += dice
        metrics['jaccard'] += jaccard
        metrics['precision'] += precision
        metrics['recall'] += recall
        metrics['fpr'] += false_positive_rate
        metrics['fnr'] += false_negative_rate
        metrics['vs'] += dicecomputer.GetVolumeSimilarity()
        metrics['ppv'] += ppv
        metrics["msd"] += np.mean(all_surface_distances)
        metrics["mdsd"] += np.median(all_surface_distances)
        metrics["stdsd"] += np.std(all_surface_distances)
        metrics["hd95"] += np.percentile(all_surface_distances, 95)
        metrics["hd"] += np.max(all_surface_distances)
        if printDice:
            dices.append(dice)
    if printDice:
        return metrics, dices
    return metrics
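The distance metrics accumulated above (msd, mdsd, stdsd, hd95, hd) are plain summaries of the pooled, symmetric surface-to-surface distances; a minimal sketch with made-up distance lists:

import numpy as np

# made-up surface distances in both directions (e.g. in voxel units)
seg2ref_distances = [0.0, 1.0, 1.0, 2.0, 5.0]
ref2seg_distances = [0.0, 0.0, 1.0, 3.0]
all_surface_distances = seg2ref_distances + ref2seg_distances

msd = np.mean(all_surface_distances)             # mean surface distance
mdsd = np.median(all_surface_distances)          # median surface distance
stdsd = np.std(all_surface_distances)            # standard deviation of surface distances
hd95 = np.percentile(all_surface_distances, 95)  # 95th-percentile (robust) Hausdorff distance
hd = np.max(all_surface_distances)               # maximum Hausdorff distance
print(msd, mdsd, stdsd, hd95, hd)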
Example No. 58
0
import time

output = sys.argv[1]

if not os.path.exists('../' + output):
    os.makedirs('../' + output)

with open('../inputs/mirror_roll/train.pickle', 'rb') as f:
    images = pickle.load(f)

list1 = []

for itr in range(len(images['Label'])):
    trla = images['Label'][itr]
    trlaa = trla[0:1, :, :, :]
    label_ratio = (trlaa > 0).sum() / (trlaa.shape[1] * trlaa.shape[2] * trlaa.shape[3] - (trlaa > 0).sum())
    list1.append(label_ratio)
list2 = np.sort(list1)
print('mean: ',np.mean(list2))
print('max: ',np.max(list2))
print('min: ', np.min(list2))
print('20%: ', np.percentile(list2,20))
print('80%: ', np.percentile(list2,80))

plt.hist(list2,bins=100)
plt.title('label ratio distribution')
plt.savefig('../' + output + '/dis.png')
df = pd.DataFrame(list2)
df.to_csv('../' + output + '/dis.csv',index_label = False)

Example No. 59
0
            csv_file.write('lat\tlng\tcluster\tname\tiso2\tnum_posts\n')
            for city, cid in sorted(zip(eligible_cities, cluster_ids)):
                (lat, lng), iso2 = locations[city]
                csv_file.write('%s\t%s\t%s\t%s\t%s\t%s\n' % (lat, lng, cid, city, iso2, city_density[city]))


elif args.pca:
    # vectors = np.array([model.docvecs[city] for city in eligible_cities])
    # subtract negatives to guarantee positive numbers
    vectors -= vectors.min()
    pca = NMF(n_components=3, init='nndsvd', shuffle=True)
    rgb = pca.fit_transform(vectors)

    # scale values in RGB to [0-1]
    for component in range(3):
        lower_bound = np.percentile(rgb[:, component], 1, axis=0)
        upper_bound = np.percentile(rgb[:, component], 90, axis=0)
        rgb[:, component] -= lower_bound
        rgb[:, component] /= upper_bound - lower_bound
    color_names = np.clip(rgb, 0.0, 1.0)
    cMap = 'Reds'

# make map from shape file

# plot background map
fs = (int(area.geometry.bounds.maxx.max() - area.geometry.bounds.minx.min())//1.5, int(area.geometry.bounds.maxy.max() - area.geometry.bounds.miny.min()))
fig, ax = plt.subplots(figsize=fs)
area.plot(ax=ax, edgecolor='black', facecolor='white', linewidth=1);

if args.nuts:
    area.plot(ax=ax, edgecolor='black', facecolor=color_names, linewidth=1, alpha=0.6);
Example No. 60
0
def main():
    ################################################################################
    ##########              READ IN THE RAW PHOTOMETRY          ####################
    #################################################################################
    numecl = 0
    plnm = 'WASP_101'
    verbose = 'false'
    fpath = '/Users/rahuljayaraman/Documents/Miscellany/Research (Tucker Group)/Python (Transits)/' + plnm
    aorlist = os.listdir(fpath)

    #aorlist= [item for item in aorlist if not item.startswith('.')]
    #aorlist=aor_from_list(plnm, 1)
    #aorlist=[50494976]
    aorlist = ['62158336', '62159360']
    #aorlist=np.delete(aorlist, [0,1, len(aorlist)-1])
    for aor in aorlist:
        print(aor)
        aor = str(aor)
        prisec = 'primary'
        ramp_style = 'none'
        fpathout = fpath + aor + '/apr_fits/' + ramp_style + '/'
        directory = os.path.dirname(fpathout)
        if not os.path.exists(directory):
            os.makedirs(directory)

        #dd=np.load('/Users/Brian/Desktop/Tucker_Group/t_1/outputs/'+plnm+'/'+aor)
        dd = np.load(fpath + '/' + aor + 'extraction.npz')
        t = dd['time']
        all_lc = dd['lc']
        #hp=dd['hp']
        cp = dd['cp']
        exptime = dd['exptime']
        framtime = 0.1
        orbparams = dd['op']
        holdpos = dd['hold_pos']
        npix = dd['beta_np']
        chnum = dd['ch']
        red_all = []
        orbparams[6] = 2456164.6934  #only for wasp-101b

        ################################################################################
        pred_ecl_time = get_pred_time(orbparams, t, prisec)
        print(orbparams)
        print(pred_ecl_time - t[0])

        freeparams = [pred_ecl_time - t[0], orbparams[2]]

        if prisec == 'secondary':
            freeparams[1] = 0.0011
            ldc = []
        else:
            ldc = find_coeffs(
                orbparams[10], orbparams[9], orbparams[8], 2,
                'quadratic')  #(temp, log_g, metallicity, channel, type_limb)

        for apr in range(0, all_lc.shape[1]):

            directory = os.path.dirname(fpathout)
            if not os.path.exists(directory):
                os.makedirs(directory)
            lc = np.squeeze(all_lc[:, apr] * 2.35481)
            time = (t - t[0])
            time = np.squeeze(time)
            norm = np.nanmedian(lc)
            #print('Photon Noise limit is: ',(np.sqrt(norm*1.002)/(norm*1.002)))

            err = 1.1 * lc**0.5
            lc = lc / norm
            err = err / norm
            err = np.ones(len(lc)) * 0.0045

            xpos = holdpos[:, 0]
            ypos = holdpos[:, 1]
            npix = dd['beta_np']

            ################################################################################
            ##########              NORMALIZE THE PIXEL VALUES          ####################
            ################################################################################
            timelength = len(t)
            #cp1=cp[1:4, 1:4, :]
            cp1 = cp
            dep_ind = cp1.shape[0] * cp1.shape[1]
            cp2 = np.reshape(cp1, (dep_ind, timelength))
            cp3 = cp2  #[:,start:end]
            for p in range(0, len(time)):
                norm = np.sum(cp3[:, p])
                cp3[:, p] /= norm
    ################################################################################
    ##########                  FILTER THE DATA                 ####################
    ################################################################################
    #fpathout='/Users/Brian/Desktop/Tucker_Group/Spitzer/mapping_files/outputs/'+plnm+'/'+aor+'/apr_fits/'
            filt_file = fpathout + 'post_filter_' + str(apr) + '.npz'

            #print(filt_file)
            if os.path.isfile(filt_file):
                if verbose == 'true': print('Found Filter File')
                ff = np.load(filt_file)
                lc = ff['lc']
                #cp3=ff['cp3']
                time = ff['time']
                xpos = ff['xpos']
                ypos = ff['ypos']
                npix = ff['npix']
                err = ff['err']
                found = 'true'

            else:
                found = 'false'
                if verbose == 'true': print('In Filter')
                lc, cp3, time, xpos, ypos, npix, err = filter_data(
                    lc, cp3, time, xpos, ypos, npix, dep_ind, err)
                if verbose == 'true': print('Out of Filter')

            plt.figure()
            plt.title(plnm + ' Ch: ' + str(chnum) + '\n' + str(aor) + '_' +
                      str(apr))
            plt.axvline(x=pred_ecl_time - t[0])
            plt.axvline(x=pred_ecl_time - orbparams[4] * 0.5 - t[0],
                        color='r',
                        linestyle='dashed')
            plt.axvline(x=pred_ecl_time + orbparams[4] * 0.5 - t[0],
                        color='r',
                        linestyle='dashed')
            plt.scatter(time, lc, s=1)
            if prisec == 'secondary': plt.ylim(0.95, 1.05)
            else: plt.ylim(0.95, 1.03)

            #plt.xlim(time[0], np.amax(time))
            plt.savefig(fpathout + 'raw_lc_plot_' + str(apr))
            if verbose == 'true':
                plt.draw()
                plt.pause(1200)
            plt.close('all')

            # time2=np.multiply(time, time)
            # time=time[np.newaxis]
            # time2=time2[np.newaxis]
            # t2hours=time2*24.0**2.0
            # thours=time*24.0

            ################################################################################
            ##########                  TRIM THE DATA                 ####################
            ################################################################################
            trim_time = 0.  #in minutes
            if trim_time != 0.:
                trim_time = trim_time / (60. * 24.0)  #convert to days
                start_index = int(trim_time / (exptime / 86400.0))
                end_ind = np.squeeze(lc)
                end_ind = end_ind.size

                print(exptime)

                lc = lc[start_index:end_ind]
                time = np.squeeze(time[start_index:end_ind])
                xpos = xpos[start_index:end_ind]
                ypos = ypos[start_index:end_ind]
                npix = npix[start_index:end_ind]
                err = err[start_index:end_ind]
                plt.figure()
                plt.scatter(time, lc, s=1)
                plt.draw()
################################################################################
##########             FIND NEIGHBORS                ####################
################################################################################

            if found == 'true':
                gw = ff['gw']
                nbr = ff['nbr']
            else:
                if verbose == 'true': print('In Find NBR')
                gw, nbr = find_nbr_qhull(xpos,
                                         ypos,
                                         npix,
                                         sm_num=50,
                                         a=1.0,
                                         b=1.7777,
                                         c=1.0,
                                         print_space=10000.)
                if verbose == 'true': print('Out of Find NBR')
            np.savez(fpathout + 'post_filter_' + str(apr),
                     lc=lc,
                     cp3=cp3,
                     time=time,
                     xpos=xpos,
                     ypos=ypos,
                     npix=npix,
                     err=err,
                     gw=gw,
                     nbr=nbr,
                     orbparams=orbparams,
                     pred_ecl_time=pred_ecl_time)
            ################################################################################
            ##########                  FIT THE DATA                 ####################
            ################################################################################

            if prisec == 'secondary':
                freeparams = [pred_ecl_time - t[0], orbparams[2], 0.005,
                              0.05]  #the last 2 free params are ramp terms
            else:
                if ramp_style == 'linear':
                    freeparams = [
                        pred_ecl_time - t[0], orbparams[2], 0.00001, 1.000001
                    ]
                if ramp_style == 'exp':
                    freeparams = [
                        pred_ecl_time - t[0], orbparams[2], 0.005, 0.05
                    ]
                if ramp_style == 'none':
                    freeparams = [pred_ecl_time - t[0], orbparams[2], 1.0, 1.0]
            params, m = initialize_model(np.squeeze(time), freeparams,
                                         orbparams, prisec, ldc)
            fluxcurve = m.light_curve(params)
            fit_params, pcov, infodict, mesg, ier = leastsq(
                nnbr_res,
                freeparams,
                args=(time, lc, err, gw, nbr, params, m, prisec, ramp_style),
                full_output=1)
            print('apr# ' + str(apr), fit_params)
            file_name = fpathout + 'apr_fit_' + str(apr)
            fileObject = open(file_name, 'wb')
            pickle.dump([lc, time, err, gw, nbr, fit_params], fileObject)
            fileObject.close()

            ################################################################################
            ##########                  PLOT THE FIT                ####################
            ################################################################################
            if prisec == 'secondary':
                params.t_secondary = fit_params[0]
                params.fp = fit_params[1]
            else:
                params.t0 = fit_params[0]
                params.rp = fit_params[1]
            eclipse_model = m.light_curve(params)
            ramp = ramp_model([fit_params[2], fit_params[3]], time, ramp_style)
            lc2 = np.squeeze(lc / eclipse_model / ramp)

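            # Nearest-neighbour systematics correction (gw and nbr presumably come from
            # find_nbr_qhull above): w1 gathers the residual flux of each point's
            # neighbours, w2/w3 form their Gaussian-weighted sum, w4 divides the residual
            # light curve by that sum to remove position-dependent systematics, and w5
            # multiplies the eclipse model back in for plotting.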
            w1 = lc2[nbr]
            w2 = np.multiply(w1, gw)
            w3 = np.sum(w2, 1)
            w4 = np.divide(lc2, w3)
            w5 = w4 * eclipse_model
            resids = (w4 - 1.)  #/err
            res2 = (lc / eclipse_model - 1.0) / err

            pltbins = 64

            blc = bin_anything(w5, pltbins)
            btime = bin_anything(time, pltbins)

            if prisec == 'secondary':
                phase = 0.5 + (time + t[0] - pred_ecl_time) / orbparams[5]
            if prisec == 'primary':
                phase = 0.0 + (time + t[0] - pred_ecl_time) / orbparams[5]
            bphase = bin_anything(phase, pltbins)

            plt.figure()
            plt.title(plnm + ' Ch: ' + str(chnum) + '\n' + str(aor) + '_' +
                      str(apr))
            plt.scatter(bphase, blc, s=10)
            #plt.scatter(time, lc, alpha=0.1, color='b', s=1)
            plt.plot(np.squeeze(phase), eclipse_model, color='r')
            if prisec == 'secondary':
                plt.ylim(0.9975, 1.0035)
                plt.text(
                    0.47, 1.003, 'T_center O-C (s): ' + str(
                        round((fit_params[0] + t[0] - pred_ecl_time) * 86400.,
                              1)) + '                   Depth: ' +
                    str(round(fit_params[1] * 1.0e6, 0)) + ' ppm')
                plt.text(0.49, 1.0025,
                         'SDNR:  ' + str(round(np.std(resids), 6)))
            else:
                plt.ylim(0.983, 1.005)
                plt.text(
                    0.43, 0.9925, 'T_center O-C (s): ' + str(
                        round((fit_params[0] + t[0] - pred_ecl_time) * 86400.,
                              1)))
                plt.text(
                    0.43, 0.990, 'Transit Depth: ' +
                    str(round(fit_params[1]**2. * 100, 4)) + ' %')
                plt.text(0.43, 0.9875,
                         'SDNR:  ' + str(round(np.std(resids), 6)))
            plt.xlabel('Phase Units')
            plt.ylabel('Relative Flux')

            plt.savefig(fpathout + 'apr_fit_plot_' + str(apr))
            if verbose == 'true':
                plt.draw()
                plt.pause(1.2)

################################################################################
##########                 Get Red Noise                    ####################
################################################################################
            sdnr, beta_red = est_rednoise(resids, framtime, fpathout, aor, apr,
                                          plnm, chnum, prisec)
            if red_all == []:
                red_all = np.ones(shape=(all_lc.shape[1], 5)) * 1000.
            red_all[apr, :] = [
                sdnr, beta_red * sdnr, beta_red,
                round(fit_params[1] * 1.e6, 1), fit_params[0]
            ]

        best = np.nanargmin(red_all, axis=0)
        best = best[1]

        np.save(fpathout + aor + '_summary', red_all)
        np.savetxt(fpathout + aor + '_summary', red_all)
        if verbose == 'true': print(best)

        ################################################################################
        ##########                 Load the best apr results        ####################
        ################################################################################

        filename = fpathout + 'apr_fit_' + str(best)
        with open(filename, 'rb') as fileObject:
            lc, time, err, gw, nbr, fit_params = pickle.load(fileObject)
        err = err * red_all[best, 2]

        print('Best Beta_red', red_all[best, 2])
        params, m = initialize_model(np.squeeze(time), freeparams, orbparams,
                                     prisec, ldc)

        ################################################################################
        ##########                        run_mcmc                 ####################
        ################################################################################
        theta = fit_params
        ndim, nwalkers = len(theta), 20
        sampler = emcee.EnsembleSampler(nwalkers,
                                        ndim,
                                        lnprob,
                                        args=(time, lc, err, gw, nbr, params,
                                              m, prisec, ramp_style))
        pos = [theta + 1.e-4 * np.random.randn(ndim) for i in range(nwalkers)]
        sampler.run_mcmc(pos, 1500)

        samples = sampler.chain[:, 50:, :].reshape((-1, ndim))
        np.save(fpathout + aor + '_samples', samples)
        if prisec == 'primary':
            fig = corner.corner(samples, labels=["t0", "rp", "a1",
                                                 "a2"])  #, "A/R", "inc"])
        else:
            fig = corner.corner(samples, labels=["t0", "Fp", "a1",
                                                 "a2"])  #, "A/R", "inc"])
        fig.savefig(fpathout + aor + '_corner_' + str(best) + '.png')
        #plt.show(block=False)
        #plt.pause(0.5)

        #Derive error bars
        t0_mcmc, rp_mcmc, a1_mcmc, a2_mcmc = map(
            lambda v: (v[1], v[2] - v[1], v[1] - v[0]),
            zip(*np.percentile(samples, [16, 50, 84], axis=0)))
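        # Each *_mcmc value is a tuple (median, +err, -err): the 50th percentile of the
        # posterior samples, with the distances to the 84th and 16th percentiles used as
        # the upper and lower error bars.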
        print(rp_mcmc, t0_mcmc)
        np.savez(fpathout + aor + '_mcmc_results',
                 rp_mcmc=rp_mcmc,
                 t0_mcmc=t0_mcmc,
                 a1_mcmc=a1_mcmc,
                 a2_mcmc=a2_mcmc,
                 best=best)
        phase = 0.0 + (time + t[0] - pred_ecl_time) / orbparams[5]
        bphase = bin_anything(phase, pltbins)

        plt.figure()
        for t0, rp, a1, a2 in samples[np.random.randint(len(samples),
                                                        size=100)]:
            params.rp = rp
            params.t0 = t0
            ecl_mod = m.light_curve(params)

            plt.plot(phase, ecl_mod, color='k', alpha=0.05)

            ramp = ramp_model([a1, a2], time, ramp_style)
            lc2 = np.squeeze(lc / ecl_mod / ramp)
            w1 = lc2[nbr]
            w2 = np.multiply(w1, gw)
            w3 = np.sum(w2, 1)
            w4 = np.divide(lc2, w3)
            w5 = w4 * ecl_mod
            resids = (w4 - 1.)  #/err
            res2 = (lc / ecl_mod - 1.0) / err

            blc = bin_anything(w5, pltbins)
            btime = bin_anything(time, pltbins)
        plt.scatter(bphase, blc, s=8, alpha=0.5)
        plt.xlabel("Phase Units")
        plt.ylabel("Relative Flux")
        plt.title(plnm + ' Ch: ' + str(chnum))
        plt.show()
        #plt.savefig('/Users/Brian/Desktop/W79_summary/'+str(chnum)+'_mcmc_fit')

    return None