Example #1
    def fit(self, df_agg):

        df = deepcopy(df_agg)
        for model in self.models:
            df['error'] = ((df[model] - df[self.target_col]) / df[model])
            grouped = df.groupby(self.region_col).agg({
                'error': ['mean', 'std']
            }).reset_index()
            grouped.columns = [self.region_col, 'mean', 'std']

            self.mean[model] = {
                grouped[self.region_col].iloc[i]: grouped['mean'].iloc[i]
                for i in range(grouped.shape[0])
            }
            self.std[model] = {
                grouped[self.region_col].iloc[i]: grouped['std'].iloc[i]
                for i in range(grouped.shape[0])
            }

            self.ci[model] = {
                grouped[self.region_col].iloc[i]:
                norm.interval(self.ci_range,
                              loc=grouped['mean'].iloc[i],
                              scale=grouped['std'].iloc[i])
                for i in range(grouped.shape[0])
            }
Example #2
    def predict_dist(self, X, interval=0.95, *args, **kwargs):

        # Expectation and variance in latent space
        Ey_t, Vy_t, ql, qu = super().predict_dist(X, interval)

        # Save computation if identity transform
        if type(self.target_transform) is transforms.Identity:
            return Ey_t, Vy_t, ql, qu

        # Save computation if standardise transform
        elif type(self.target_transform) is transforms.Standardise:
            Ey = self.target_transform.itransform(Ey_t)
            Vy = Vy_t * self.target_transform.ystd ** 2
            ql, qu = norm.interval(interval, loc=Ey, scale=np.sqrt(Vy))
            return Ey, Vy, ql, qu

        # All other transforms require quadrature
        Ey = np.empty_like(Ey_t)
        Vy = np.empty_like(Vy_t)

        # Use fixed-order quadrature to transform prob. estimates
        for i, (Eyi, Vyi) in enumerate(zip(Ey_t, Vy_t)):
            # Establish bounds
            Syi = np.sqrt(Vyi)
            a, b = Eyi - 3 * Syi, Eyi + 3 * Syi  # approx 99% bounds

            # Quadrature
            Ey[i], _ = fixed_quad(self.__expec_int, a, b, n=QUADORDER,
                                  args=(Eyi, Syi))
            Vy[i], _ = fixed_quad(self.__var_int, a, b, n=QUADORDER,
                                  args=(Ey[i], Eyi, Syi))

        ql, qu = norm.interval(interval, loc=Ey, scale=np.sqrt(Vy))

        return Ey, Vy, ql, qu
Example #3
def Tukey_outliers(set_of_means, FDR=0.005, supporting_interval=0.5, verbose=False):
    """
    Performs Tukey quintile test for outliers from a normal distribution with defined false discovery rate
    :param set_of_means:
    :param FDR:
    :return:
    """
    # false discovery rate v.s. expected falses v.s. power
    q1_q3 = norm.interval(supporting_interval)
    FDR_q1_q3 = norm.interval(1 - FDR)  # TODO: this is not necessary: we can perfectly well fit it with proper params to FDR
    multiplier = (FDR_q1_q3[1] - q1_q3[1]) / (q1_q3[1] - q1_q3[0])
    l_means = len(set_of_means)

    q1 = np.percentile(set_of_means, 50*(1-supporting_interval))
    q3 = np.percentile(set_of_means, 50*(1+supporting_interval))
    high_fence = q3 + multiplier*(q3 - q1)
    low_fence = q1 - multiplier*(q3 - q1)

    if verbose:
        print('FDR:', FDR)
        print('q1_q3', q1_q3)
        print('FDRq1_q3', FDR_q1_q3)
        print('q1, q3', q1, q3)
        print('fences', high_fence, low_fence)
        print("FDR: %s %%, expected outliers: %s, outlier 5%% confidence interval: %s" % (
            FDR * 100, FDR * l_means, poisson.interval(0.95, FDR * l_means)))

    ho = (set_of_means < low_fence).nonzero()[0]
    lo = (set_of_means > high_fence).nonzero()[0]

    return lo, ho
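
A minimal usage sketch (not from the source), assuming Tukey_outliers above is defined alongside its numpy/scipy imports; the data are made up:

import numpy as np

rng = np.random.default_rng(0)
values = np.concatenate([rng.normal(0.0, 1.0, 500), [8.0, -9.0]])  # mostly normal, two injected extremes
above_idx, below_idx = Tukey_outliers(values, FDR=0.005, verbose=False)
print('indices above the high fence:', above_idx)
print('indices below the low fence:', below_idx)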
Example #4
def robustness(graphclusters, permutations):
    """
    Compares vectors of cluster assignments to estimate cluster-wise robustness
    and node-wise robustness. These are returned as dictionaries.

    Inspired by reliability scores as proposed by:
    Frantz, T. L., & Carley, K. M. (2017).
    Reporting a network’s most-central actor with a confidence level.
    Computational and Mathematical Organization Theory, 23(2), 301-312.

    Because calculating the accuracy of a cluster assignment is not trivial,
    the function does not compare cluster labels directly.
    Instead, this function calculates the Jaccard similarity between cluster assignments.


    Parameters
    ----------
    :param graphclusters: Dictionary of original cluster assignments
    :param permutations: List of cluster assignments obtained from permuted networks.
    :return: Cluster-wise reliability scores, node-wise reliability scores,
        and node-wise confidence-interval widths.
    """
    rev_assignments = list()
    for assignment in permutations:
        subassignments = dict()
        for k, v in assignment.items():
            subassignments.setdefault(v, set()).add(k)
        rev_assignments.append(subassignments)
    revclusters = dict()
    for k, v in graphclusters.items():
        revclusters.setdefault(v, set()).add(k)
    # clusterwise jaccard
    clusjaccards = dict()
    for cluster in set(graphclusters.values()):
        true_composition = revclusters[cluster]
        jaccards = list()
        # keys don't have to match so both cluster assignments should be evaluated
        for rev_assignment in rev_assignments:
            scores = list()
            for key in rev_assignment:
                scores.append(jaccard_similarity_score(true_composition, rev_assignment[key]))
            bestmatch = np.max(scores)
            jaccards.append(bestmatch)
        clusjaccards[cluster] = np.round(norm.interval(0.95, np.mean(jaccards), np.std(jaccards)), 4)
    logger.info("Confidence intervals for Jaccard similarity of cluster assignments:")
    logger.info(str(clusjaccards))
    nodejaccards = dict.fromkeys(graphclusters.keys())
    ci_width = dict.fromkeys(graphclusters.keys())
    for node in nodejaccards:
        true_composition = revclusters[graphclusters[node]]
        jaccards = list()
        for i in range(len(permutations)):
            clusid = permutations[i][node]
            rev_assignment = rev_assignments[i][clusid]
            jaccards.append(jaccard_similarity_score(true_composition, rev_assignment))
        nodejaccards[node] = np.round(norm.interval(0.95, np.mean(jaccards), np.std(jaccards)), 4)
        ci_width[node] = np.round(nodejaccards[node][1] - nodejaccards[node][0], 4)
    return clusjaccards, nodejaccards, ci_width
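
The core bookkeeping step above is inverting a node-to-cluster mapping into cluster-to-members sets; a self-contained sketch with made-up labels:

graphclusters = {'a': 0, 'b': 0, 'c': 1, 'd': 1}
revclusters = {}
for node, cluster in graphclusters.items():
    revclusters.setdefault(cluster, set()).add(node)
print(revclusters)  # {0: {'a', 'b'}, 1: {'c', 'd'}}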
Example #5
def plot_samples(ax, data, colour="black"):

    # Materialise the per-column values so they can be indexed and reused under Python 3
    samples = [row[0] for row in data]
    Nx = np.array([row[1] for row in data], dtype=int)
    Na = np.array([row[2] for row in data], dtype=int)
    Rx = np.array([row[5] for row in data], dtype=float)
    Rx_CI = [row[6] for row in data]
    sex = [row[7] for row in data]
    Elx = np.array([row[8] for row in data], dtype=float)

    Rx_m = [x for x,sx in zip(Rx,sex) if sx=='M']
    Rx_f = [x for x,sx in zip(Rx,sex) if sx=='F']

    ax.vlines(0.5, -2, len(samples), linestyle=':')
    ax.vlines(1.0, -2, len(samples), linestyle=':')

    y_pos = np.arange(len(samples))
    ax.set_yticks(y_pos)
    ax.set_yticklabels(["{} ({})".format(s,x+a) for s,x,a in zip(samples,Nx,Na)])


    if len(Rx_m) > 1:
        ax.vlines(np.mean(Rx_m), -2, len(samples), linestyle='-', color='red')
        #ax.vlines(2*np.mean(Rx_m), -2, len(samples), linestyle='-.', color='blue')
        m_ci = norm.interval(0.99, np.mean(Rx_m), np.std(Rx_m))
        ax.fill_between(m_ci, -2, len(samples), alpha=0.2, color="red", edgecolor="none")
    else:
        ax.fill_between([0.4, 0.6], -2, len(samples), alpha=0.2, color="red", edgecolor="none")

    if len(Rx_f) > 1:
        ax.vlines(np.mean(Rx_f), -2, len(samples), linestyle='-', color='blue')
        f_ci = norm.interval(0.99, np.mean(Rx_f), np.std(Rx_f))
        ax.fill_between(f_ci, -2, len(samples), alpha=0.2, color="blue", edgecolor="none")
    else:
        ax.fill_between([0.8, 1.2], -2, len(samples), alpha=0.2, color="blue", edgecolor="none")

    sex_colour = {'M': "red", 'F': "blue", 'U': 'black'}
    ecol = [sex_colour[sx] for sx in sex]

    ax.scatter(Rx, y_pos, facecolor=ecol, edgecolors=colour, lw=0.5, s=60)

    err_low = Rx - np.array([ci[0] for ci in Rx_CI])
    err_high = np.array([ci[1] for ci in Rx_CI]) - Rx
    ax.errorbar(Rx, y_pos, xerr=[err_low, err_high], ecolor=colour, marker="none", fmt="none", capsize=0)

    ax.set_ylim([-0.5, len(samples)-0.5])
    ax.set_xlim([0, 1.5])

    ax.set_xlabel('Read dosage (X)', size=16)
    ax.set_ylabel('Sample (number of sequences)', size=16)
Example #6
def temporal_prior(traces, actmn, actvar, fwhm, outliers=None):
    """
    Generate temporal-dependent priors using basis sets and mexican-hat functions
    :param traces: matrix of traces, ncells by nframes
    :param actmn: mean activity
    :param actvar: variation above which we will consider it a guaranteed event
    :param fwhm: the full-width at half-maximum to use for the temporal prior
    :param outliers: cells with strongly outlying activity
    :return: prior vector
    """

    if outliers is None:
        outliers = np.zeros(np.shape(traces)[0]) > 1

    # Set the half-width of the convolution kernel
    xhalfwidth = 100

    # Determine a normal function sigma from the full-width at half-maximum
    def sigma(fwhm_):
        return fwhm_ / (2 * np.sqrt(2 * np.log(2)))

    # Generate the basis functions and correct population activity for baseline and variation
    basis = np.power(fwhm, np.arange(4) + 1)
    popact = (np.nanmean(traces[np.invert(outliers), :], axis=0) -
              actmn) / actvar
    fits = np.zeros((len(basis) - 1, len(popact)))

    # Get the first basis normal function
    defrange = int(norm.interval(0.99999, loc=0, scale=sigma(basis[0]))[1]) + 3
    defrange = min(xhalfwidth, defrange)
    b0 = np.zeros(2 * xhalfwidth + 1)
    b0[xhalfwidth - defrange:xhalfwidth + defrange + 1] = norm.pdf(
        range(-defrange, defrange + 1), loc=0, scale=sigma(basis[0]))

    # Generate the fits
    for b in range(1, len(basis)):
        defrange = int(
            norm.interval(0.99999, loc=0, scale=sigma(basis[b]))[1]) + 3
        defrange = min(xhalfwidth, defrange)
        bn = np.zeros(2 * xhalfwidth + 1)
        bn[xhalfwidth - defrange:xhalfwidth + defrange + 1] = norm.pdf(
            range(-defrange, defrange + 1), loc=0, scale=sigma(basis[b]))
        fits[b - 1, :] = np.convolve(popact, b0 - bn, 'same')

    # Return the weights, taking the minimum fit across basis functions and clipping to [0, 1]
    weights = np.clip(np.nanmin(fits, axis=0), 0, 1)

    return weights
Example #7
def get_stats(values, intervals=True):
    stats = {}

    values_array = np.array(values, dtype=np.float64)

    # np.asscalar was removed from NumPy; use .item() to get plain Python floats
    stats['min'] = np.amin(values_array).item()
    stats['max'] = np.amax(values_array).item()
    stats['mean'] = np.mean(values_array).item()
    stats['median'] = np.median(values_array).item()

    if values_array.size > 1:
        stats['std_dev'] = np.std(values_array, ddof=1).item()
    else:
        stats['std_dev'] = 0

    if intervals:
        stats['intervals'] = []
        loc = stats['mean']
        scale = stats['std_dev'] / sqrt(values_array.size)

        for alpha in (.95, .99, .90, .85, .80, .50):
            if values_array.size > 30:
                interval = norm.interval(alpha, loc=loc, scale=scale)
            else:
                interval = t.interval(alpha, values_array.size - 1, loc, scale)
            stats['intervals'].append(
                {'confidence': alpha, 'interval': interval})

    return stats
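
A short usage sketch (not from the source), assuming get_stats above is defined with its numpy/scipy/math imports; it shows the switch from the t interval to the normal interval at more than 30 samples:

import numpy as np

small_sample = list(np.random.normal(10.0, 2.0, size=12))   # size <= 30 -> t.interval
large_sample = list(np.random.normal(10.0, 2.0, size=200))  # size > 30  -> norm.interval
print(get_stats(small_sample)['intervals'][0])
print(get_stats(large_sample)['intervals'][0])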
Example #8
def test_find_confidence_interval(test_data):
    """
    """

    # z test case
    test_statistic, standard_error = one_samp_z(test_data)
    ci = find_confidence_interval(
        se=standard_error,
        df=np.inf,
        alpha=0.05,
        tails=True,
    )
    ci_s = norm.interval(
        alpha=0.95,
        loc=np.mean(test_data),
        scale=sem(test_data),
    )
    ci_s = ci_s[1] - np.mean(test_data)
    assert np.abs(ci - ci_s) <= 1e-10

    # student t test case
    test_statistic, standard_error, degrees_freedom = one_samp_t(test_data)
    ci = find_confidence_interval(
        se=standard_error,
        df=degrees_freedom,
        alpha=0.05,
        tails=True,
    )
    ci_s = t.interval(alpha=0.95,
                      df=len(test_data) - 1,
                      loc=np.mean(test_data),
                      scale=sem(test_data))
    ci_s = ci_s[1] - np.mean(test_data)
    assert np.abs(ci - ci_s) <= 1e-10
Example #9
def validate_area(ALPHA, areas, area, discarded_file, discarded_homographies):

    if len(areas) < 2: return True
    norm_areas_parameters = norm.fit(
        areas
    )  # returned a list of two parameters (mean=parameters[0] and std=parameters[1])
    areas_quantiles = norm.interval(ALPHA, norm_areas_parameters[0],
                                    norm_areas_parameters[1])

    ##print('-----')
    ##print('Area: '+str(areas))
    ##print('Mean: '+str(norm_areas_parameters[0]))
    ##print('Std: '+str(norm_areas_parameters[1]))
    ##print(str(areas_quantiles[0])+' < '+str(area)+' < '+str(areas_quantiles[1]))
    ##print('-----')

    if area >= areas_quantiles[0] and area <= areas_quantiles[1]: return True
    else:
        discarded_homographies[0] += 1
        discarded_file.write(
            "HOMOGRAPHY DISCARDED #" +
            str(discarded_homographies[0] + discarded_homographies[1] +
                discarded_homographies[2] + discarded_homographies[3]) +
            " (area too big)\n")
        discarded_file.write("Min bound: " + str(areas_quantiles[0]) +
                             "\nMax bound: " + str(areas_quantiles[1]) +
                             "\nArea: " + str(area) + "\n\n")
        return False
Example #10
def fit_normal(signal, tag):
    mu, std = norm.fit(signal["mean"].values)
    confidence_interval = norm.interval(CONFIDENCE, loc=mu, scale=std)
    if PLOTTING:
        # Plot the histogram.
        plt.subplots()
        plt.hist(signal["mean"].values,
                 bins=25,
                 density=True,
                 alpha=0.6,
                 color="g")
        # Plot the PDF.
        xmin, xmax = plt.xlim()
        x = np.linspace(xmin, xmax, 100)
        p = norm.pdf(x, mu, std)
        plt.plot(x, p, "k", linewidth=2)
        title = "Fit results normal: mu = %.2f,  std = %.2f" % (mu, std)
        plt.title(title)
        plt.axvline(x=confidence_interval[0])
        plt.axvline(x=confidence_interval[1])

        # Plot the confidence interval
        plt.savefig(f"analysis/images/{slugify(tag)}_fit_histogram.png",
                    format="png")

    return {
        "distribution": "normal",
        "params": [{
            "mu": mu,
            "std": std
        }],
        "confidence": [confidence_interval],
    }
Example #11
 def sumStats(self, pelistgroup, bclass, confint):
     outputMatrix = [[
         "Boyce Class", "Mean", "Median", "Range", "Lower Bound",
         "Upper Bound"
     ]]
     convertMatrix = []
     header = [["Mean", "Median", "Range", "Lower Bound", "Upper Bound"]]
     for i in range(len(pelistgroup[0])):
         tempList = []
         for list in pelistgroup:
             tempList.append(list[i])
         convertMatrix.append(tempList)
     for i in range(len(convertMatrix)):
         average = np.mean(convertMatrix[i])
         median = np.median(convertMatrix[i])
         convertMatrix[i].sort()
         ran = convertMatrix[i][-1] - convertMatrix[i][0]
         sterr = np.std(convertMatrix[i]) / math.sqrt(len(convertMatrix[i]))
         alpha = float(confint) / 100
         lbound = stats.norm.interval(alpha, average, sterr)[0]
         if lbound < 0:
             lbound = 0
         ubound = norm.interval(alpha, average, sterr)[1]  #Edited 10/9/2013
         outputMatrix.append(
             [str(i + 1), average, median, ran, lbound, ubound])
     return convertMatrix, outputMatrix
Example #12
def calculate_calibration_intervals(
    targets, predicted_means, predicted_stddevs,
    step=0.05, verbose=False
):
    """
    Computes calibration curve - how theoretical
    conf intervals correlate wti practical (assuming prediction is Normal)
    """
    real_errors = np.abs(targets - predicted_means)

    all_fractions = []

    q_list = np.arange(0.0, 1.0 + step, step)
    if verbose:
        q_list = tqdm(q_list)

    for q in q_list:
        predicted_error_bound = -predicted_means + \
            norm.interval(q, predicted_means, predicted_stddevs)[1]

        emp_fraction = np.mean((
            real_errors <= predicted_error_bound
        ).astype(float))
        all_fractions.append(emp_fraction)

    # Calculates area to diagonal (ideal calibration).
    c_auc_score = np.abs(
        np.array(all_fractions) - np.arange(0.0, 1.0 + step, step)
    ).sum() * step

    return all_fractions, c_auc_score
Example #13
    def is_in_confidence_region(self, x, alpha):
        """Check if sample is in alpha confidence region.

        Parameters
        ----------
        x : array, shape (n_features,)
            Sample

        alpha : float
            Value between 0 and 1 that defines the probability of the
            confidence region, e.g., 0.6827 for the 1-sigma confidence
            region or 0.9545 for the 2-sigma confidence region.

        Returns
        -------
        is_in_confidence_region : bool
            Is the sample in the alpha confidence region?
        """
        self._check_initialized()
        # we have one degree of freedom less than number of dimensions
        n_dof = len(x) - 1
        if n_dof >= 1:
            return self.squared_mahalanobis_distance(x) <= chi2(n_dof).ppf(alpha)
        else:  # 1D
            lo, hi = norm.interval(
                alpha, loc=self.mean[0],
                scale=self.covariance[0, 0] ** 0.5)  # scale expects a standard deviation
            return lo <= x[0] <= hi
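
A quick standalone check (not from the source) of why the 1D branch is consistent with the chi-squared test used above: for one degree of freedom, the chi-squared quantile equals the squared two-sided normal critical value.

from scipy.stats import chi2, norm

alpha = 0.6827  # 1-sigma confidence region
z = norm.interval(alpha, loc=0.0, scale=1.0)[1]
print(chi2(1).ppf(alpha))  # ~1.0
print(z ** 2)              # ~1.0, the same threshold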
Example #14
def get_intervals(xi_i, p_xi, frequencies, alpha=0.95):
    N = xi_i / p_xi
    dist_xi = norm.interval(alpha,
                            loc=xi_i,
                            scale=math.sqrt(xi_i * (1 - p_xi)))
    dist_xi = (dist_xi[0] / N, dist_xi[1] / N)
    return dist_xi
Example #15
    def calibration_test(self, x, y_norm):
        mean, var, shape, rate, mixture_var = self(x)
        y = y_norm * self.y_std + self.y_mean

        y_norm = y_norm.detach().numpy()
        y = y.detach().numpy()

        confidence_values = np.expand_dims(np.arange(0.1, 1, 0.1), axis=1)

        norm_lower, norm_upper = norm.interval(
            confidence_values,
            loc=mean.detach().numpy(),
            scale=np.sqrt(var.detach().numpy()),
        )
        gamma_lower, gamma_upper = gamma.interval(confidence_values,
                                                  shape.detach().numpy(),
                                                  scale=1 /
                                                  rate.detach().numpy())

        output = torch.zeros_like(norm_upper)
        normal_check = np.logical_and(norm_lower < y_norm[np.newaxis, :],
                                      y_norm[np.newaxis, :] < norm_upper)
        gamma_check = np.logical_and(gamma_lower < y[np.newaxis, :],
                                     y[np.newaxis, :] < gamma_upper)

        output[mixture_var < 0.5] = normal_check
        output[mixture_var > 0.5] = gamma_check

        return output
Example #16
    def get_decision(self):
        if not self.fed_data:
            return None
        self.fed_data = False
        # keep default period until we collect more data points
        if self.points_observed < 10:
            return None
        #can at most increase monitoring period to the next one in the list
        curr_index = self.mon_periods.index(self.curr_period)
        max_index = min(curr_index + 1, len(self.mon_periods) - 1)

        # pick largest period which doesn't cross thresholds with 'confidence' probability
        for i in reversed(range(max_index + 1)):
            period = self.mon_periods[i]
            mean = self.latest_val
            std = max(0.01,
                      np.sqrt(self.ewmv * (1 + self.weight * (period - 1))))
            interval = norm.interval(self.confidence, loc=mean, scale=std)
            #print(mean, std, interval)
            if interval[0] > self.ok_interval[0] and \
               interval[1] < self.ok_interval[1]:
                # period stays unchanged, so no decision to change
                if self.curr_period == period:
                    return None
                self.curr_period = period
                return period
        if self.curr_period == self.mon_periods[0]:
            return None
        self.curr_period = self.mon_periods[0]
        return self.mon_periods[0]
Example #17
def _std_tuple_of(var=None, std=None, interval=None):
    """
    Convenience function for plotting. Given one of var, standard
    deviation, or interval, return the std. Any of the three can be an
    iterable list.

    Examples
    --------
    >>>_std_tuple_of(var=[1, 3, 9])
    (1, 2, 3)

    """

    if std is not None:
        if np.isscalar(std):
            std = (std, )
        return std

    if interval is not None:
        if np.isscalar(interval):
            interval = (interval, )

        return norm.interval(interval)[1]

    if var is None:
        raise ValueError("no inputs were provided")

    if np.isscalar(var):
        var = (var, )
    return np.sqrt(var)
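
A usage sketch (values illustrative), assuming _std_tuple_of above is in scope with its numpy/scipy imports:

print(_std_tuple_of(std=2.5))              # (2.5,)
print(_std_tuple_of(var=[1.0, 4.0, 9.0]))  # array([1., 2., 3.])
print(_std_tuple_of(interval=0.6827))      # ~1.0, the matching z-score (returned as an array)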
Example #18
def success_count_trial_outliers(alpha, trials, p, n):
    μ, σ = params_binomal_to_normal(p, n)
    acceptance_interval = norm.interval(alpha, μ, σ)
    return [
        trial for trial in trials
        if trial < acceptance_interval[0] or trial > acceptance_interval[1]
    ]
Example #19
def load_data(data_path):
    data = np.load(data_path)

    T = data.shape[1]
    n = data.shape[0]
    #n_sample = data.shape[1]
    #n = bs#*n_sample

    means = []
    stds = []
    conf_intervals = []
    conf_intervals_max = []
    conf_intervals_min = []

    dof = n - 1
    for i in range(T):
        mean = np.mean(data[:, i])
        std = np.std(data[:, i])
        print(mean)
        print(std)
        std = std / math.sqrt(n)
        #conf_interval = ST.ppf(1-alpha/2., dof) * std*np.sqrt(1.+1./n)
        conf_interval = ST.interval(0.95, loc=mean, scale=std)
        print(conf_interval)

        means.append(mean)
        stds.append(std)
        conf_intervals.append(conf_interval)
        conf_intervals_max.append(mean + (conf_interval[1] - mean))
        conf_intervals_min.append(mean - (mean - conf_interval[0]))

    return means, conf_intervals_min, conf_intervals_max
Example #20
 def price(self, n, level = 0.95):
     x = asset.simulate(n)
     y = self.payoff(x)
     delta = np.mean(y)
     Var = np.var(y)
     CI = [delta + q * sqrt(Var/n) for q in norm.interval(level)]
     return {"option price":delta, "confidence interval":CI, "variance":Var}
Example #21
def _std_tuple_of(var=None, std=None, interval=None):
    """
    Convenience function for plotting. Given one of var, standard
    deviation, or interval, return the std. Any of the three can be an
    iterable list.

    Examples
    --------
    >>>_std_tuple_of(var=[1, 3, 9])
    (1, 2, 3)

    """

    if std is not None:
        if np.isscalar(std):
            std = (std,)
        return std


    if interval is not None:
        if np.isscalar(interval):
            interval = (interval,)

        return norm.interval(interval)[1]

    if var is None:
        raise ValueError("no inputs were provided")

    if np.isscalar(var):
        var = (var,)
    return np.sqrt(var)
Example #22
 def get_PI(self):
     mean = self.latest_val
     std = max(
         0.01,
         np.sqrt(self.ewmv * (1 + self.weight * (self.curr_period - 1))))
     interval = norm.interval(self.confidence, loc=mean, scale=std)
     return interval
Example #23
def make_vec(n):
    ranges = []
    for i in range(1, 2**n + 1):
        a, b = norm.interval(alpha=i / (2**n + 1), loc=0, scale=0.4)
        ranges.append(a)
        ranges.append(b)
    return sorted(ranges)
Example #24
    def predict_proba(self, x, interval=0.95, *args, **kwargs):

        # We can't make predictions until we have trained the model
        if not self._trained:
            print('Train first')
            return

        y_pred = np.zeros((x.shape[0], self.forests * self.n_estimators))

        for i in range(self.forests):
            if self.parallel:  # used in training
                pk_f = join(self.temp_dir, 'rf_model_{}.pk'.format(i))
            else:  # used when parallel is false, i.e., during x-val
                pk_f = join(self.temp_dir,
                            'rf_model_{}_{}.pk'.format(i, mpiops.chunk_index))
            with open(pk_f, 'rb') as fp:
                f = pickle.load(fp)
                for m, dt in enumerate(f.estimators_):
                    y_pred[:, i * self.n_estimators + m] = \
                        f.ytform.itransform(dt.predict(x))

        y_mean = np.mean(y_pred, axis=1)
        y_var = np.var(y_pred, axis=1)

        # Determine quantiles
        ql, qu = norm.interval(interval, loc=y_mean, scale=np.sqrt(y_var))

        return y_mean, y_var, ql, qu
Example #25
 def detect_signal(self, history_dict, pos=None, contrary=False):
     best_f = self.__lrfs.extract_best_feature(history_dict=history_dict)
     kfo = KalmanFilterOptimizer(y=best_f['series'],
                                 x0=self.__x0,
                                 v0=self.__v0,
                                 pmv_ratio=self.__pmv_ratio)
     q, r = kfo.optimize()
     kf = KalmanFilter(x0=self.__x0, v0=self.__v0, q=q, r=r)
     kf_res = kf.fit(y=best_f['series']).iloc[-1].to_dict()
     self.__logger.debug(f'kf_res:\t{kf_res}')
     gauss_mu = kf_res['x']
     gauss_ci = np.asarray(
         norm.interval(alpha=self.__ci_level,
                       loc=gauss_mu,
                       scale=np.sqrt(kf_res['v'] + q)))
     sig_side = 'short' if gauss_mu * [1, -1][int(contrary)] < 0 else 'long'
     if gauss_ci[1] < 0 or gauss_ci[0] > 0:
         sig_act = sig_side
     else:
         sig_act = None
     sig_log_str = '{:^40}|'.format('{0:>3}[{1:>3}]:{2:>9}{3:>18}'.format(
         self.__lrfs.code, best_f['granularity_str'], f'{gauss_mu:.1g}',
         np.array2string(gauss_ci,
                         formatter={'float_kind': lambda f: f'{f:.1g}'})))
     return {
         'sig_act': sig_act,
         'granularity': best_f['granularity'],
         'sig_log_str': sig_log_str,
         'sig_mu': gauss_mu,
         'sig_cil': gauss_ci[0],
         'sig_ciu': gauss_ci[1]
     }
Example #26
    def predict_proba(self, X, interval=0.95, *args, **kwargs):
        """
        Predictive mean and variance for a probabilistic regressor.

        Parameters
        ----------
        X: ndarray
            (Ns, d) array query dataset (Ns samples, d dimensions).
        interval: float, optional
            The percentile confidence interval (e.g. 95%) to return.
        fields: dict, optional
            dictionary of fields parsed from the shape file.
            ``indicator_field`` should be a key in this dictionary. If this is
            not present, then a Gaussian likelihood will be used for all
            predictions. The only time this may be input is for cross
            validation.

        Returns
        -------
        Ey: ndarray
            The expected value of ys for the query inputs, X of shape (Ns,).
        Vy: ndarray
            The expected variance of ys (excluding likelihood noise terms) for
            the query inputs, X of shape (Ns,).
        ql: ndarray
            The lower end point of the interval with shape (Ns,)
        qu: ndarray
            The upper end point of the interval with shape (Ns,)
        """

        Ey, Vy = self.predict_moments(X, *args, **kwargs)
        ql, qu = norm.interval(interval, loc=Ey, scale=np.sqrt(Vy))

        return Ey, Vy, ql, qu
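
norm.interval broadcasts over array-valued loc and scale, which is why the single call above returns per-sample quantiles; a self-contained illustration with made-up moments:

import numpy as np
from scipy.stats import norm

Ey = np.array([0.0, 1.0, 2.0])    # hypothetical predictive means
Vy = np.array([1.0, 0.25, 4.0])   # hypothetical predictive variances
ql, qu = norm.interval(0.95, loc=Ey, scale=np.sqrt(Vy))
print(ql)  # one lower bound per sample
print(qu)  # one upper bound per sample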
Example #27
    def get_decision(self):
        # keep default period until we collect more data points
        if len(self.val_window) < self.window_size:
            return None
        np_window = np.array(self.change_window.data)
        curr_val = self.val_window[0]
        change_mean = np.mean(np_window)
        change_var = np.var(np_window)

        # pick largest period which doesn't cross thresholds with 'confidence' probability
        for period in reversed(self.mon_periods):
            rnd_walk_std = np.sqrt(change_var * period)
            rnd_walk_std = max(0.01 * period, rnd_walk_std)
            change_interval = norm.interval(self.confidence,
                                            loc=change_mean,
                                            scale=rnd_walk_std)
            if curr_val + change_interval[0] > self.ok_interval[0] and \
               curr_val + change_interval[1] < self.ok_interval[1]:
                # period stays unchanged, so no decision to change
                if self.curr_period == period:
                    return None
                self.curr_period = period
                return period
        if self.curr_period == self.mon_periods[0]:
            return None
        self.curr_period = self.mon_periods[0]
        return self.mon_periods[0]
Example #28
def dPDF(pts,mu,sigma, distribuition, outlier = 0, data = 0, n=10, seed = None):
    import numpy as np
    from scipy.interpolate import interp1d
    from distAnalyze import dpdf, mediaMovel
    from scipy.stats import norm, lognorm
    
    eps = 5e-5
    ngrid = int(1e6)

    if distribuition == 'normal':
        outlier_inf = outlier_sup = outlier  
        if not data:  
              inf, sup = norm.interval(0.9999, loc = mu, scale = sigma)
              x = np.linspace(inf-outlier_inf,sup+outlier_sup,ngrid)
              y = dpdf(x,mu,sigma,distribuition)
              
        else:
              np.random.set_state(seed)
              d = np.random.normal(mu,sigma,data)
              inf,sup = min(d)-outlier_inf,max(d)+outlier_sup
              
              y,x = np.histogram(d,bins = 'fd',normed = True)
              x = np.mean(np.array([x[:-1],x[1:]]),0)
              
              y = abs(np.diff(mediaMovel(y,n)))
              x = x[:-1]+np.diff(x)[0]/2
              
    elif distribuition == 'lognormal':
        outlier_inf = 0
        outlier_sup = outlier
        if not data:
              inf, sup = lognorm.interval(0.9999, sigma, loc = 0, scale = np.exp(mu))
              x = np.linspace(inf-outlier_inf,sup+outlier_sup,ngrid)
              y = dpdf(x,mu,sigma,distribuition)
        else:
              np.random.set_state(seed)
              d = np.random.lognormal(mu,sigma,data)
              inf,sup = min(d)-outlier_inf,max(d)+outlier_sup
              
              y,x = np.histogram(d,bins = 'fd',normed = True)
              x = np.mean(np.array([x[:-1],x[1:]]),0)
              
              y = abs(np.diff(mediaMovel(y,n)))
              x = x[:-1]+np.diff(x)[0]/2
              y = y/(np.diff(x)[0]*sum(y))
    #dy = lambda x,u,s : abs(1/(s**3*sqrt(2*pi))*(u-x)*np.exp(-0.5*((u-x)/s)**2))
    
  
    cdf = np.cumsum(y)
       
    #cdf = np.sum(np.tri(len(x))*y,1)    
    #cdf = np.concatenate(cdf)
    cdf = cdf/max(cdf)
    #time.time()-t
    
    interp = interp1d(cdf,x, fill_value = 'extrapolate')
    Y = np.linspace(eps,1-eps,pts)
    X = interp(Y)
    
    return X,Y
Example #29
def qqplot(data, labels, n_quantiles=100, alpha=0.95, error_type='theoretical', distribution = 'binomial', log10conv=True, color=['k', 'r', 'b'], fill_dens=[0.1, 0.1, 0.1], type = 'uniform', title='title'):
    '''
    Function for plotting Quantile-Quantile (QQ) plots with a confidence interval (CI)
    :param data: list of NumPy 1D arrays with data
    :param labels: list of legend labels, one per array in data
    :param type: type of the plot
    :param n_quantiles: number of quantiles to plot
    :param alpha: confidence level for the interval
    :param log10conv: conversion to -log10(p) for the figure
    :return: nothing
    '''
    xmax = 0
    ymax = 0
    if type == 'uniform':
        # we expect distribution from 0 to 1
        for j in range(len(data)):
            # define quantiles positions:
            q_pos = np.concatenate([np.arange(99.)/len(data[j]), np.logspace(-np.log10(len(data[j]))+2, 0, n_quantiles)])
            # define quantiles in data
            q_data = mquantiles(data[j], prob=q_pos, alphap=0, betap=1, limit=(0, 1)) # linear interpolation
            # define theoretical predictions
            q_th = q_pos.copy()
            # evaluate errors
            q_err = np.zeros([len(q_pos),2])
            if np.sum(alpha) > 0:
                for i in range(0, len(q_pos)):
                    if distribution == 'binomial':
                        q_err[i, :] = binom.interval(alpha=alpha, n=len(data[j]), p=q_pos[i])
                    elif distribution == 'normal':
                        q_err[i, :] = norm.interval(alpha, len(data[j])*q_pos[i], np.sqrt(len(data[j])*q_pos[i]*(1.-q_pos[i])))
                        q_err[i, q_err[i, :] < 0] = 1e-12
                    else:
                        print('Distribution is not defined!')
                q_err /= 1.0*len(data[j])
                for i in range(0, 100):
                    q_err[i,:] += 1e-12
            # print(q_err[100:, :])
            slope, intercept, r_value, p_value, std_err = linregress(q_th, q_data)
            # print(labels[j], ' -- Slope: ', slope, " R-squared:", r_value**2)
            plt.plot(-np.log10(q_th[n_quantiles-1:]), -np.log10(q_data[n_quantiles-1:]), '-', color=color[j])
            plt.plot(-np.log10(q_th[:n_quantiles]), -np.log10(q_data[:n_quantiles]), '.', color=color[j], label=labels[j])
            xmax = np.max([xmax, - np.log10(q_th[1])])
            ymax = np.max([ymax, - np.log10(q_data[0])])
            # print(- np.log10(q_th[:]))
            if np.sum(alpha)>0:
                if error_type=='experimental':
                    plt.fill_between(-np.log10(q_th), -np.log10(q_data/q_th*q_err[:,0]), -np.log10(q_data/q_th*q_err[:,1]), color=color[j], alpha=fill_dens[j], label='%1.3f CI'%alpha)
        if np.sum(alpha)>0:
            if error_type=='theoretical':
                plt.fill_between(-np.log10(q_th), -np.log10(q_err[:,0]), -np.log10(q_err[:,1]), color=color[j], alpha=fill_dens[j], label='%1.3f CI'%alpha)
    plt.legend(loc=4)
    plt.xlabel('Theoretical -log10')
    plt.ylabel('Experimental -log10')
    plt.plot([0, 100], [0, 100],'--k')
    # print(xmax,ymax)
    plt.xlim([0, np.ceil(xmax)])
    plt.ylim([0, np.ceil(ymax*1.05)])
    plt.title(title)
    plt.tight_layout()
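
A minimal call sketch (not from the source), assuming qqplot above is defined together with its matplotlib/scipy imports; uniform p-values stand in for a null result:

import numpy as np
import matplotlib.pyplot as plt

pvals = np.random.default_rng(0).uniform(size=5000)
qqplot([pvals], labels=['null p-values'], n_quantiles=100,
       alpha=0.95, distribution='binomial', title='QQ plot under the null')
plt.show()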
Example #30
def MCerr(func, ins, params, errs, nums, conf, nproc=1):
    #func : function taking in parameters
    #ins : list of inputs to function
    #params : list of parameters to put into function
    #err : list of error associated with parameters
    #nums: list of number of trials to compute for each parameter
    #np.random.seed(0)

    from scipy.stats import norm

    #val = func(*(ins+params))
    n = len(params)
    val_errs = np.zeros(n)
    val_means = np.zeros(n)
    #val_means = np.zeros(n)
    #for each parameter
    for i in range(n):
        #print "computing parameter "+str(i+1)+"/"+str(n)
        #perturb parameter N times by STD
        trials = np.random.normal(params[i], errs[i], nums[i])
        #confidence interval
        conf_int = norm.interval(conf, loc=params[i], scale=errs[i])
        trials = trials[np.logical_and(trials > conf_int[0],
                                       trials < conf_int[1])]

        if nproc > 1:
            from multiprocessing import Pool
            pool = Pool(nproc)
            procs = []
            #vals = np.zeros(nums[i])
            #for each perturbation
            for j in range(len(trials)):
                #calculate value using perturbed parameter
                trial_params = np.copy(params)
                trial_params[i] = trials[j]
                #perform processes in parallel
                #vals[j] = func(*(ins+trial_params))
                procs.append(pool.apply_async(func, ins + list(trial_params)))
            vals = np.array([proc.get(timeout=10) for proc in procs])
            pool.terminate()
        else:
            vals = np.zeros(len(trials))
            #for each perturbation
            for j in range(len(trials)):
                #calculate value using perturbed parameter
                trial_params = np.copy(params)
                trial_params[i] = trials[j]
                #perform process
                vals[j] = func(*(ins + list(trial_params)))

        #error associated with perturbation of parameter
        val_errs[i] = vals.std()
        val_means[i] = vals.mean()
        #val_means[i] = vals.mean()
    #total summed error associated with all perturbation
    val_err = np.sqrt(np.square(val_errs).sum())
    val = val_means.mean()
    #return value and error
    return val, val_err
Example #31
    def predict_interval(self, X, confidence):
        assert np.isscalar(confidence), "Confidence should be a scalar"
        ensemble_mean, std = self.predict(X, return_std=True)
        std += 1e-6 # Avoid NaNs. TODO: Better solution? How are std=0 produced??
        interval_tuple = norm.interval(confidence, loc=ensemble_mean, scale=std)
        intervals = np.concatenate((interval_tuple[0][:, np.newaxis], interval_tuple[1][:, np.newaxis]), axis=1)

        return intervals
Example #32
def make_normal(n):
    ranges = []
    for i in range(1, n + 1, 2):
        a, _ = norm.interval(alpha=i / (n + 2), loc=0, scale=1)
        ranges.append(a)
    ranges = np.asarray(ranges)
    ranges /= abs(max(ranges, key=abs))
    return np.sort(ranges) * random.uniform(0.1, 0.7)
Example #33
    def predict_proba(self, x, interval=0.95):
        """ Predict the outputs and variances of the inputs
        This method predicts the output values that would correspond to
        each input in X. This method also returns the certainty of the
        model in each case, which is only sensible when the number of
        committee members is greater than one.

        This method also outputs quantile information along with the
        variance to establish the probability distribution clearly.

        Parameters
        ----------
        x: numpy.array
            The inputs for which the model should be evaluated
        interval: float
            The probability threshold for which the quantiles should
            be output.

        Returns
        -------
        y_mean: numpy.array
            An array of expected output values given the inputs
        y_var: numpy.array
            The variance of the outputs
        ql: numpy.array
            The lower quantiles for each input
        qu: numpy.array
            The upper quantiles for each input
        """

        n, m = x.shape

        # We can't make predictions until we have trained the model
        if not self._trained:
            print('Train first')
            return

        # Determine which rule to run on each row and then run the regression
        # on each row of x to get the regression output.
        y_pred = np.zeros((n, len(self.models)))
        for m, model in enumerate(self.models):
            for rule in model:

                # Determine which rows satisfy this rule
                mask = rule.satisfied(x)

                # Make the prediction for the whole matrix, and keep only the
                # rows that are correctly sized
                y_pred[mask, m] += rule.regress(x, mask)

        y_mean = np.mean(y_pred, axis=1)
        y_var = np.var(y_pred, axis=1)

        # Determine quantiles
        ql, qu = norm.interval(interval, loc=y_mean, scale=np.sqrt(y_var))

        # Convert the prediction to a numpy array and return it
        return y_mean, y_var, ql, qu
Example #34
def print_and_plot_results(count, results, verbose, plot_file_name):
    print("RPS calculated as 95% confidence interval")

    rps_mean_ar = []
    low_ar = []
    high_ar = []
    test_name_ar = []

    for test_name in sorted(results):
        data = results[test_name]
        rps = count / array(data)
        rps_mean = tmean(rps)
        rps_var = tvar(rps)
        low, high = norm.interval(0.95, loc=rps_mean, scale=rps_var**0.5)
        times = array(data) * 1000000 / count
        times_mean = tmean(times)
        times_stdev = tstd(times)
        print('Results for', test_name)
        print('RPS: {:d}: [{:d}, {:d}],\tmean: {:.3f} μs,'
              '\tstandard deviation {:.3f} μs'
              .format(int(rps_mean),
                      int(low),
                      int(high),
                      times_mean,
                      times_stdev))

        test_name_ar.append(test_name)
        rps_mean_ar.append(rps_mean)
        low_ar.append(low)
        high_ar.append(high)

        if verbose:
            print('    from', times)
        print()


    if plot_file_name is not None:
        import matplotlib.pyplot as plt
        from matplotlib import cm
        fig = plt.figure()
        ax = fig.add_subplot(111)
        L = len(rps_mean_ar)
        color = [cm.autumn(float(c) / (L - 1)) for c in arange(L)]
        bars = ax.bar(
            arange(L), rps_mean_ar,
            color=color, yerr=(low_ar, high_ar), ecolor='k')
        # order of legend is reversed for visual appeal
        ax.legend(
            reversed(bars), reversed(test_name_ar),
            loc='upper left')
        ax.get_xaxis().set_visible(False)
        plt.ylabel('Requests per Second', fontsize=16)
        print(plot_file_name)
        plt.savefig(plot_file_name, dpi=96)
        print("Plot is saved to {}".format(plot_file_name))
        if verbose:
            plt.show()
Example #35
def theoretical_stddev_and_confidence_intervals(n):
    """
    Returns an output helping to build the latex tables for the theoretical CIs
    """
    stddev = ((pi - 2) * 2 / (n * (pi ** 2))) ** 0.5
    output = ['{n}'.format(n=n), '{stddev:.4f}'.format(stddev=stddev)]

    for alpha in ALPHA_LEVELS:
        output.append(r'$\frac{2}{\pi} \pm ' + '{x:.4f}'.format(x=(norm.interval(alpha)[1] * stddev)) + '$')

    return ' & '.join(output) + r' \\'
Example #36
def confidence_interval(errors):
    # tvar is the sample variance
    from scipy.stats import norm, tvar
    import math

    mu = sum(errors) / float(len(errors))
    var = tvar(errors)
    std_dev = math.sqrt(var)
    std_error = std_dev / math.sqrt(len(errors))
    span_95 = norm.interval(0.95, loc=mu, scale=std_error)

    return span_95
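
A short usage sketch for the helper above, with made-up errors:

errors = [0.12, -0.05, 0.30, 0.07, -0.22, 0.15, 0.01, -0.09]
low, high = confidence_interval(errors)
print('95%% CI for the mean error: (%.4f, %.4f)' % (low, high))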
Example #37
def _95p_center(means_accumulator, errs_accumulator):

    def helper(initial, bounds, target):
        shift = target - initial
        if np.abs(shift) < bounds:
            return target
        else:
            scale = bounds / np.abs(shift)
            return initial + shift*scale

    target_percentile = np.sum(np.logical_not(np.isnan(means_accumulator)), axis=0)-1
    target_percentile = 1-np.power(np.ones_like(target_percentile)*0.05, 1/np.sqrt(target_percentile))

    _95p_constant = norm.interval(0.95)[1]

    contractor = np.vectorize(lambda x: _95p_constant/norm.interval(x)[1])
    contraction_interval = contractor(target_percentile)
    contraction_interval = errs_accumulator/contraction_interval[np.newaxis, :]

    contractor2 = np.vectorize(helper)

    new_means_accumulator = contractor2(means_accumulator, contraction_interval, np.nanmean(means_accumulator, axis=0)[np.newaxis, :])

    return new_means_accumulator
Example #38
    def exp_data_generator(
        self, exp_val, apparatus_uncer, first_abs=0.0,
        first_rel=0.0, conf=0.95
    ):
        """
            Predict experimental data according to the experimental
            method and apparatus uncertainty information with the
            assumption that the apparatus uncertainty is described
            by a 95% confidence interval under a normal distribution

            Parameters:
            ===========
            exp_val: float
                expected reading of the sensor
            apparatus_uncer: class exp_uncer.APPARATUS_UNCER
                uncertainty information of the measurement apparatus
            first_abs: float
                extra absolute uncertainty to the measurement
                due to noise, in the same engineering unit as the
                measurement
            first_rel: float
                extra relative uncertainty to the measurement
                due to noise
            conf: float, optional
                the confidence level of the apparatus uncertainty. Default 95%

            Returns:
            ===========
            data: numpy array
                readings in time-series

        """

        num = self.get_num()  # number of data points
        zero_order = apparatus_uncer.zero_order_uncer(exp_val)
        first_order = sqrt(
            first_abs**2+(first_rel*exp_val)**2
        )

        norm_std = sqrt(
            zero_order**2+first_order**2
        )/norm.interval(conf)[1]  # use the requested confidence level rather than a hardcoded 95%
        data = []
        for ii in range(num):
            data.append(normalvariate(exp_val, norm_std))

        return np.array(data)
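
The key conversion above is from a 95% apparatus uncertainty to a normal standard deviation, by dividing the half-width by the two-sided 95% quantile; a standalone sketch with an assumed half-width:

from scipy.stats import norm

half_width_95 = 0.5  # hypothetical apparatus uncertainty quoted at 95% confidence
sigma = half_width_95 / norm.interval(0.95)[1]
print(sigma)  # ~0.255, since the two-sided 95% z-value is ~1.96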
Example #39
def test_KL_divergence_for_normal_distributions(show_plot=True):

	mu_0 = 0
	sigma_0 = 1

	interval = norm.interval(.99,mu_0,sigma_0)

	support = numpy.linspace(interval[0], interval[1], num=2000)

	mus = numpy.linspace(0, 3, num=30)

	p_0 = norm.logpdf(support, mu_0, sigma_0)

	KL_inf = []
	KL_ana = []

	for mu in mus:
		p_1 = norm.logpdf(support, mu, sigma_0)

		kld = qtu.KL_divergence_arrays(support, p_0, p_1, False)

		KL_inf.append(float(kld))
		KL_ana.append(actual_KL(mu_0, sigma_0, mu, sigma_0))


	KL_inf = numpy.array(KL_inf)
	KL_ana = numpy.array(KL_ana)
	KL_diff = KL_ana-KL_inf


	if show_plot:
		pylab.subplot(1,2,1)
		pylab.plot(KL_inf, label='est')
		pylab.plot(KL_ana, label='analytical')
		pylab.title('estimated KL')
		pylab.legend()

		pylab.subplot(1,2,2)
		pylab.plot(KL_diff)
		pylab.title('KL error')

		pylab.show()


	_, p = pearsonr(KL_inf, KL_ana)

	return p
Example #40
def conf_interval(arr, confidence=0.95):
    N = arr.size

    # Use the requested confidence level and keep only the positive critical value,
    # since interval() returns a (lower, upper) tuple
    if N <= 30:
        z = t.interval(confidence, N - 1)[1]
    else:
        z = norm.interval(confidence)[1]

    s = arr.std()
    x_bar = arr.mean()

    return (x_bar - z*(s/np.sqrt(N)), x_bar + z*(s/np.sqrt(N)))
Example #41
 def sumStats(self, pelistgroup, bclass, confint):
     outputMatrix = [["Boyce Class", "Mean", "Median", "Range", "Lower Bound", "Upper Bound"]]
     convertMatrix = []
     header = [["Mean", "Median", "Range", "Lower Bound", "Upper Bound"]]
     for i in range(len(pelistgroup[0])):
         tempList = []
         for list in pelistgroup:
             tempList.append(list[i])
         convertMatrix.append(tempList)
     for i in range(len(convertMatrix)):
         average = np.mean(convertMatrix[i])
         median = np.median(convertMatrix[i])
         convertMatrix[i].sort()
         ran = convertMatrix[i][-1] - convertMatrix[i][0]
         sterr = np.std(convertMatrix[i])/math.sqrt(len(convertMatrix[i]))
         alpha = float(confint)/100
         lbound = stats.norm.interval(alpha, average, sterr)[0]
         if lbound < 0:
             lbound = 0
         ubound = norm.interval(alpha, average, sterr)[1] #Edited 10/9/2013
         outputMatrix.append([str(i+1), average, median, ran, lbound, ubound])
     return convertMatrix, outputMatrix
Example #42
    def generate_discrete_support(params, support=0.95, nbins=100):
        """
        returns a set of intervals over which the component model pdf is 
        supported. 
        Inputs:
            params: a dict with entries 'mu' and 'rho'
            nbins: cardinality of the set or the number of grid points in the 
                approximation
            support: a float in (0,1) that describes the amount of probability 
                we want in the range of support 
        """
        if type(nbins) is not int:
            raise TypeError("nbins should be an int")
            
        if nbins <= 0:
            raise ValueError("nbins should be greater than 0")
            
        support = check_type_force_float(support, "support")
        if support <= 0.0 or support >= 1.0:
            raise ValueError("support is a float st: 0 < support < 1")
            
        check_model_params_dict(params)
        
        mu = params['mu']
        sigma = (1.0/params['rho'])**.5
        
        interval = norm.interval(support,mu,sigma)
        
        a = interval[0]
        b = interval[1]
        
        support_range = b - a;
        support_bin_size = support_range/(nbins-1.0)
        
        bins = [a+i*support_bin_size for i in range(nbins)]

        return bins
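
The grid construction above amounts to spacing nbins points evenly over the central support mass of the fitted normal; a standalone sketch of that idea with made-up parameters:

import numpy as np
from scipy.stats import norm

mu, sigma = 0.0, 2.0
a, b = norm.interval(0.95, mu, sigma)
bins = np.linspace(a, b, 100)
print(bins[0], bins[-1])  # roughly mu -/+ 1.96 * sigma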
Example #43
def guess_param(current_val, lo_bound, hi_bound, SEARCH_RATE=5, BACKTRACK_PROB=.1,
                hard_lower_bound = .0001, hard_upper_bound = 10):
    lo_val = lo_bound
    hi_val = hi_bound

    # If we are missing either of the bounds, artificially create them using the search rate
    if(lo_val==None):
        #lo_val = current_val / SEARCH_RATE
        if(hi_val!=None):
            lo_val = hi_val / SEARCH_RATE
        else:
            lo_val = current_val / SEARCH_RATE
    if(hi_val==None):
        #hi_val = current_val * SEARCH_RATE
        if(lo_val!=None):
            hi_val = lo_val * SEARCH_RATE
        else:
            hi_val = current_val * SEARCH_RATE
    
    # Create a normal distribution centered between the two bounds, and
    # Ensure that there is only a BACKTRACK_PROB probability of generating
    # a point outside of that range
    mean = (hi_val+ lo_val) / 2
    (lo_conf_bound, hi_conf_bound) = norm.interval(1 - BACKTRACK_PROB)
    sd = (hi_val - lo_val) / (hi_conf_bound*2)
    
    # Draw a random guess from the distribution and update bounds if necessary
    current_val = normalvariate(mean, sd)
    current_val = max(current_val, hard_lower_bound)
    current_val = min(current_val, hard_upper_bound)
    if(lo_bound!=None):
        lo_bound = min(lo_bound, current_val)
    if(hi_bound!=None):
        hi_bound = max(hi_bound, current_val)
    
    
    return current_val, lo_bound, hi_bound
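
The standard deviation above is chosen so the (lo_val, hi_val) range carries probability 1 - BACKTRACK_PROB; a small standalone check of that relationship (values illustrative, not from the source):

from scipy.stats import norm

lo_val, hi_val, backtrack_prob = 1.0, 5.0, 0.1
mean = (hi_val + lo_val) / 2
hi_conf_bound = norm.interval(1 - backtrack_prob)[1]
sd = (hi_val - lo_val) / (hi_conf_bound * 2)
outside = norm.cdf(lo_val, mean, sd) + norm.sf(hi_val, mean, sd)
print(outside)  # ~0.1, matching BACKTRACK_PROB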
Example #44
def get_gaussian_model(residuals, n_bins=50):

    from scipy.stats import norm
    import matplotlib.mlab as mlab

    mu, std = norm.fit(residuals)

    plt.figure()
    plt.subplot(211)
    result = plt.hist(residuals, n_bins, histtype='bar')

    born_inf, born_supp = norm.interval(0.95, mu, std)

    x = numpy.linspace(min(residuals), max(residuals), 100)

    dx = result[1][1] - result[1][0]
    scale = len(residuals) * dx
    plt.plot(x, mlab.normpdf(x, mu, std) * scale, "r--", linewidth=2.0)
    plt.axvline(born_inf, color='g', linewidth=2.0)
    plt.axvline(born_supp, color='g', linewidth=2.0)
    plt.subplot(212)
    plt.plot(residuals)

    plt.show()
Example #45
# calculate prob, disregard weights
lpr = log_multivariate_normal_density(input_df,g_1.means_,g_1.covars_,g_1.covariance_type)
logprob = logsumexp(lpr,axis=1)
responsibilities = np.exp(lpr - logprob[:, np.newaxis])
probs = pd.DataFrame(responsibilities)
probs.set_index(input_df.index,inplace=True)
probs.columns = ['prob_0','prob_1']
probs.loc[:,'color'] = 'k'
probs.loc[probs.prob_0>=0.90, 'color'] = 'r'
probs.loc[probs.prob_1>=0.90, 'color'] = 'b'
# plot 1D GMM
delta= 0.0001
x = np.arange(0.5, 1.2, delta)
mu_1, sigma_1 = (g_1.means_[0][0],np.sqrt(g_1.covars_[0][0]))
mu_2, sigma_2 = (g_1.means_[1][0],np.sqrt(g_1.covars_[1][0]))
intervals_1 = norm.interval(0.95,loc=mu_1,scale=sigma_1)
intervals_2 = norm.interval(0.95,loc=mu_2,scale=sigma_2)
interval_1_x = np.arange(intervals_1[0][0],intervals_1[1][0],delta)
interval_2_x = np.arange(intervals_2[0][0],intervals_2[1][0],delta)
print(intervals_1)
print(intervals_2)
Z1 = mlab.normpdf(x,mu_1,sigma_1)
Z2 = mlab.normpdf(x,mu_2,sigma_2)
Z = (Z2-Z1)
diffpts = zip(x,Z)
diffpts = [(a,b) for a,b in diffpts if a > 0.8 and a < 1.4]
zeropt = sorted(diffpts, key=lambda x: abs(x[1]))[0][0]
min_interval = min(intervals_1[1][0],intervals_2[1][0])
max_interval = max(intervals_1[0][0],intervals_2[0][0])
input_df.plot(kind='density')
plt.plot(x,Z1,label='gaussian one')
Example #46
def bootstrap_residuals(data, model, num_samples=100, statistic=np.mean,
                        alpha=0.05, data_error=None, sigma=None, verbose=False):

    ''' Bootstraps data with models a given number of times and calculates the
    Goodness of fit for each run. The standard deviation of the Goodness-of-fit
    values is then used to estimate the confidence interval.

    Parameters
    ----------
    data : array_like
        The observed data, must be the same size as the model
    model : array_like
        The model data, must be the same size as the observed data.
    num_samples : int, optional
        Number of runs in the bootstrapping.
    alpha : float, optional
        Significance of confidence interval.
    data_error : float, array_like, optional
        If unset, the error will be the standard deviation of the data. If an
        array, it must have the same dimensions as the observed data.
    sigma : float, optional
        If set, the confidence interval will be calculated using the number of
        standard deviations from the mean.
    verbose : bool, optional
        Print out progress?

    Returns
    -------
    out : list
        A list, [confidence interval, goodness of fit array]

    '''

    import numpy as np
    from scipy.stats import norm

    data_list = data.ravel()
    model_list = model.ravel()
    residuals = data - model
    length = len(data_list)

    # Default to the standard deviation of the data as the data error
    if data_error is None:
        data_error_list = data_list.std()
    else:
        data_error_list = np.asarray(data_error).ravel()

    num_samples = int(num_samples)
    gofArray = np.zeros(num_samples)

    if verbose:
        print('Beginning bootstrapping')

    for i in range(num_samples):
        # randomly sample all values of data and model
        indices_sample = np.random.choice(length,size=length,replace=True)
        data_sample = data_list[indices_sample]
        model_sample = model_list[indices_sample]
        gofArray[i] = ((data_sample - model_sample)**2 / \
                data_error_list**2).sum()
        if verbose:
            if i%10 == 0:
                print(str(i) + 'th run complete.')

    mean, std = gofArray.mean(), gofArray.std()
    if sigma is not None:
        alpha = 1 - norm.cdf(sigma)
    confid_int = norm.interval(1 - alpha, loc=mean, scale=std)

    return (confid_int,gofArray)
Example #47
        # because it's a discrete distribution)
        low, high = binom.interval(alpha, N, p)
        if p==0:
            low = high = 0
        elif p==1:
            low = high = N
        q = binom.cdf(low-0.1, N, p)+binom.sf(high+0.1, N, p)
        low, high = binom.interval(alpha, num_Np_checks, q)
        if q==0:
            low = high = 0
        if num_Np_fails<low or num_Np_fails>high:
            print 'N=%d, p=%.3f failed %d of %d checks, outside range (%d, %d)' % (N, p, num_Np_fails,
                                                                                   num_Np_checks, low, high)
print
failrate = float(numfails)/numchecks
low, high = norm.interval(alpha, loc=mu, scale=sqrt(sigma2))
print '%d/%d=%.2f%% failed at %d%%' % (numfails, numchecks, numfails*100.0/numchecks, 100*alpha)
print 'Expected mean=%d, std dev=%d (mean fail rate=%.2f%%)' % (mu, sqrt(sigma2), 100*mu/numchecks)
if low<=numfails<=high:
    print 'Overall passed at %d%%: within range (%d, %d)' % (alpha*100, low, high)
else:
    print 'Overall failed at %d%%: outside range (%d, %d)' % (alpha*100, low, high)

figure(figsize=(10, 6))
plotnum = 0
for p in p_range:
    if p==0 or p==1:
        continue
    plotnum += 1
    subplot(2, 3, plotnum)
    n = arange(1, isi_max[p])
Example #48
def cross_validate_disc_version(algorithm, tab_file, min_support=-30, sample_pct=0.1, iterations=1, only_interesting_triples=False, restricted_triples=None, extra_id=''):
    from subprocess import call
    from parsers import Borgelt

    cv_start = time()

    # Create work folder
    _id = str(time()).replace('.','') + '_' + extra_id
    path = '../tmp/cv_' + _id + '/'
    os.mkdir(path)
    print "\n### Running cross validation cv_{}###".format(_id)

    total_transactions = 0
    for line in open(tab_file, 'rb'):
        total_transactions += 1
    print 'Total total_transactions: ', total_transactions

    # Get the total observed triples
    borgelt_start = time()
    observed_file_name = path + 'observed_frequent_items.out'
    args = [algorithm, tab_file, observed_file_name, '-s' + str(min_support), '-n3']
    # pro = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True, preexec_fn=os.setsid)
    # os.killpg(pro.pid, signal.SIGTERM)
    call(args)
    # sleep(20)
    print 'fpgrowth on all data done: {} secs'.format(time()-borgelt_start)

    freq = Borgelt.read_frequent_items(observed_file_name)

    # Create ds of all observed triplets
    # Saved as sorted keys for lookup,
    # and their frequency as value
    observed = {}
    count = 0
    for item in freq:
        if len(item[0]) == 3:
            sorted_trip = triple_sort(item[0])
            observed[sorted_trip] = item[1][0]
    print 'Total triplets observed:', len(observed)
    average_observed = sum(observed.values()) / float(len(observed))
    print 'Baseline: ', average_observed

    del freq

    avg_errors = []
    var_errors = []
    avg_errors_ext = []
    var_errors_ext = []
    avg_errors_heu = []
    var_errors_heu = []
    avg_errors_ind = []
    var_errors_ind = []
    avg_errors_baseline = []

    occurrences = [0 for i in range(100)]
    max_ent_acc_error = [0 for i in range(100)]
    ext_acc_error = [0 for i in range(100)]
    ind_acc_error = [0 for i in range(100)]
    heu_acc_error = [0 for i in range(100)]
    baseline_acc_error = [0 for i in range(100)]

    # Record triple counts (s123) for the best estimates
    max_ent_best = Counter()
    ext_best = Counter()
    ind_best = Counter()

    for index in range(iterations):

        # Create sample file
        sampling_start = time()
        if sample_pct > 0:
            sample_size = int(total_transactions * sample_pct)
        else:
            sample_size = abs(sample_pct)
        test_data_size = total_transactions - sample_size
        sample = random.sample(range(total_transactions), sample_size)
        assert len(sample) == sample_size, 'Sample size not equal to sample'
        sample.sort()
        sample_file_name = path + str(index) + '_sample.tab'
        with open(sample_file_name, 'a') as sample_file:
            sample_line = 0
            for line_num, line in enumerate(open(tab_file, 'rb')):
                if line_num == sample[sample_line]:
                    sample_file.write(line)
                    sample_line += 1
                    if sample_line == sample_size:
                        break

        del sample
        print 'Sample size: {} time: {}'.format(sample_size, time() - sampling_start)
        borgelt_start = time()
        sample_freq_name = path + str(index) + '_sample_frequent_items.out'
        args = [algorithm, sample_file_name, sample_freq_name, '-s-1', '-n3']
        call(args)
        print 'fpgrowth on sample data done: {} secs'.format(time()-borgelt_start)

        # Check any frequent items were found
        if not os.path.exists(sample_freq_name):
            print 'No frequent items found'
            print 'args', args
            continue

        min_support_trips = min_supported_trips(min_support, test_data_size)
        print 'Forward min_support_trips set to: ', min_support_trips
        triangles_start = time()
        triangle_tree, sample_triples = Forward.forward_compact(sample_freq_name, min_support_trips, observed, only_interesting_triples, restricted_triples)
        print 'Found triangles done: {}'.format(time() - triangles_start)

        #del sample_freq

        estimates = []
        extrapolations = []
        independences = []
        heurestics = []
        baselines = []
        observations = []

        triplets = []
        MAPE_errors = []
        MAPE_errors_ext = []
        MAPE_errors_ind = []
        MAPE_errors_heu = []
        MAPE_errors_baseline = []
        true_errors = []
        pair_triple_ratios = []

        triangle_counts = []

        # s1_list = []
        # s2_list = []
        # s3_list = []
        # s12_list = []
        # s13_list = []
        # s23_list = []

        # Recursion depth needed for the estimate to converge
        req_depth = int(math.log(total_transactions, 2)) + 1

        # DFS of the tree holding all triangles
        for n1 in triangle_tree.keys():
            s1, s2_dict = triangle_tree[n1]
            for n2 in s2_dict.keys():
                s2, s12, s3_dict = s2_dict[n2]
                for n3 in s3_dict.keys():
                    s3, s13, s23, s123 = s3_dict[n3]

                    triangle_counts.append((s1, s2, s3, s12, s13, s23, s123))

                    triangle = (n1, n2, n3)

                    pair_triple_ratio = s123 / float(min(s12, s13, s23))
                    pair_triple_ratios.append(pair_triple_ratio)

                    # Get the obs (test data) frequency minus those found in the sample (training data)
                    obs = 0
                    if triangle in observed:
                         # (triples in data) - (triples in sample). Calculating the number of triples in test data.
                        obs = observed[triangle] - s123

                    # maxent estimate
                    est = ent.maxent_est_rosa(s1, s2, s3, s12, s23, s13, float(sample_size), num=req_depth) * (test_data_size / float(sample_size))

                    if est < 0:
                        print 'max ent below 0'
                        print 's1 s2 s3 s12 s13 s23 s123', (s1, s2, s3, s12, s13, s23, s123)

                    # extrapolation estimate
                    est2 = s123 / float(sample_size) * test_data_size

                    # independence estimate
                    est3 = (s1 / float(sample_size)) * (s2 / float(sample_size)) * (s3 / float(sample_size)) * test_data_size
                    # est3 = (s1*s2*s3)/float(sample_size*sample_size) * test_data_size/float(sample_size)

                    # heuristic: use the max ent estimate when the sample has fewer
                    # than 5 occurrences of the triple, otherwise extrapolation
                    est4 = est if s123 < 5 else est2

                    # baseline estimate
                    est5 = average_observed

                    estimates.append(est)
                    extrapolations.append(est2)
                    independences.append(est3)
                    heurestics.append(est4)
                    baselines.append(est5)
                    observations.append(obs)
                    triplets.append(triangle)
                    # TODO Why save these? They already exist in the triangle tree
                    # (and take up a lot of space)
                    # s1_list.append(s1)
                    # s2_list.append(s2)
                    # s3_list.append(s3)
                    # s12_list.append(s12)
                    # s13_list.append(s13)
                    # s23_list.append(s23)
                    #end TODO

                    # MAPE-style error for max ent (note: normalised by sqrt(obs), not by obs)
                    error = abs(obs-est) / math.sqrt(obs) # * 100
                    MAPE_errors.append(error)
                    true_errors.append(obs-est)

                    # MAPE error extrapolation
                    error2 = 0
                    if est2 > 0:
                        error2 = abs(obs-est2) / math.sqrt(obs) # * 100
                    MAPE_errors_ext.append(error2)

                    # MAPE error independence
                    error3 = abs(obs-est3) / math.sqrt(obs) # * 100
                    MAPE_errors_ind.append(error3)

                    # MAPE error heuristic
                    error4 = abs(obs-est4) / math.sqrt(obs) # * 100
                    MAPE_errors_heu.append(error4)

                    # MAPE baseline error
                    error5 = abs(obs-est5) / math.sqrt(obs) #* 100
                    MAPE_errors_baseline.append(error5)

                    # Record the error for the estimate that performed best
                    if error < error2 and error < error3:
                        max_ent_best[s123] += 1
                    elif error2 < error and error2 < error3:
                        ext_best[s123] += 1
                    else:
                        ind_best[s123] += 1

                    try:
                        occurrences[s123] += 1
                        max_ent_acc_error[s123] += error
                        ext_acc_error[s123] += error2
                        ind_acc_error[s123] += error3
                        heu_acc_error[s123] += error4
                        baseline_acc_error[s123] += error5
                    except IndexError:
                        pass


        # print 'true errors: ', true_errors
        # print 'estimates: ', estimates
        # print 'observed: ', observed
        # print 'mape ', MAPE_errors
        del triangle_tree
        del sample_triples

        if len(MAPE_errors) > 0: #TODO handle this, probably when nothing has been found

            min_error = min(MAPE_errors)
            max_error = max(MAPE_errors)

            # max ent error
            avg_error = sum(MAPE_errors) / float(len(MAPE_errors))
            avg_errors.append(avg_error)

            # extrapolation error
            avg_error_ext = sum(MAPE_errors_ext) / float(len(MAPE_errors_ext))
            avg_errors_ext.append(avg_error_ext)

            # independence error
            avg_error_ind = sum(MAPE_errors_ind) / float(len(MAPE_errors_ind))
            avg_errors_ind.append(avg_error_ind)

            # heuristic error
            avg_error_heu = sum(MAPE_errors_heu) / float(len(MAPE_errors_heu))
            avg_errors_heu.append(avg_error_heu)

            # baseline error
            avg_error_baseline = sum(MAPE_errors_baseline) / float(len(MAPE_errors_baseline))
            avg_errors_baseline.append(avg_error_baseline)

            var_error = 0
            var_error_ext = 0
            var_error_heu = 0
            var_error_ind = 0
            # variance
            if len(MAPE_errors) > 1:
                var_error = tvar(MAPE_errors) #tvar is the sample variance
                var_error_ext = tvar(MAPE_errors_ext)
                var_error_heu = tvar(MAPE_errors_heu)
                var_error_ind = tvar(MAPE_errors_ind)


            # max_ent confidence interval
            std_dev = math.sqrt(var_error)
            std_error = std_dev / math.sqrt(sample_size)
            span_99 = norm.interval(0.99, avg_error, std_error)
            span_95 = norm.interval(0.95, avg_error, std_error)

            # ext confidence interval
            std_dev_ext = math.sqrt(var_error_ext)
            std_error_ext = std_dev_ext / math.sqrt(sample_size)
            span_99_ext = norm.interval(0.99, avg_error_ext, std_error_ext)
            span_95_ext = norm.interval(0.95, avg_error_ext, std_error_ext)

            # independence confidence interval
            std_dev_ind = math.sqrt(var_error_ind)
            std_error_ind = std_dev_ind / math.sqrt(sample_size)
            span_99_ind = norm.interval(0.99, avg_error_ind, std_error_ind)
            span_95_ind = norm.interval(0.95, avg_error_ind, std_error_ind)

            # heuristic confidence interval
            std_dev_heu = math.sqrt(var_error_heu)
            std_error_heu = std_dev_heu / math.sqrt(sample_size)
            span_99_heu = norm.interval(0.99, avg_error_heu, std_error_heu)
            span_95_heu = norm.interval(0.95, avg_error_heu, std_error_heu)

            var_errors.append(var_error)
            var_errors_ext.append(var_error_ext)
            var_errors_heu.append(var_error_heu)
            var_errors_ind.append(var_error_ind)

            res_string = "\nResult ({}):\nSample size:{} triangles:{} test_data:{}\n".format(index, sample_size, len(estimates), total_transactions-sample_size)
            # log max ent result
            res_string += "avg_error:{} var_error:{}\n".format(avg_error, var_error)
            res_string += '99% Confidence interval(-/+): {}\n'.format(str(span_99))
            res_string += '95% Confidence interval(-/+): {}\n'.format(str(span_95))

            res_string += 'avg_error_ext:{} var_error_ext:{}\n'.format(avg_error_ext, var_error_ext)
            res_string += '99% Confidence interval(-/+): {}\n'.format(str(span_99_ext))
            res_string += '95% Confidence interval(-/+): {}\n'.format(str(span_95_ext))

            res_string += 'avg_error_ind:{} var_error_ind:{}\n'.format(avg_error_ind, var_error_ind)
            res_string += '99% Confidence interval(-/+): {}\n'.format(str(span_99_ind))
            res_string += '95% Confidence interval(-/+): {}\n'.format(str(span_95_ind))

            res_string += 'avg_error_heu:{} var_error_heu:{}\n'.format(avg_error_heu, var_error_heu)
            res_string += '99% Confidence interval(-/+): {}\n'.format(str(span_99_heu))
            res_string += '95% Confidence interval(-/+): {}\n'.format(str(span_95_heu))

            res_string += 'avg_error_baseline:{}\n'.format(avg_error_baseline)

            with open(path + str(index) + '_log.txt', 'a') as log_file:
                log_file.write(res_string)
            print res_string

            # Write result data
            with open(path + str(index) + '_data.json', 'w') as fd:
                # triplet_key = ['triple' for t in estimates]
                # est_key = ['est' for t in estimates]
                # obs_key = ['obs' for t in observations]
                fd.write(json.dumps(zip(triplets, zip(estimates, observations))))
            with open(path + str(index) + '_data.tsv', 'w') as fd:
                fd.write('est\tobs\tn1\tn2\tn3\tpair_trip_ratio\ts1\ts2\ts3\ts12\ts13\ts23\ts123\n')
                for _index, i in enumerate(estimates):
                    row = (estimates[_index], observations[_index]) + triplets[_index] + (pair_triple_ratios[_index],) + triangle_counts[_index]
                    fd.write('\t'.join(str(value) for value in row) + '\n')
            with open(path + str(index) + '_data_extrapolation.tsv', 'w') as fd:
                fd.write('est\tobs\tn1\tn2\tn3\tpair_trip_ratio\ts1\ts2\ts3\ts12\ts13\ts23\ts123\n')
                for _index, i in enumerate(estimates):
                    row = (extrapolations[_index], observations[_index]) + triplets[_index] + (pair_triple_ratios[_index],) + triangle_counts[_index]
                    fd.write('\t'.join(str(value) for value in row) + '\n')
            with open(path + str(index) + '_data_heurestic.tsv', 'w') as fd:
                fd.write('est\tobs\tn1\tn2\tn3\tpair_trip_ratio\ts1\ts2\ts3\ts12\ts13\ts23\ts123\n')
                for _index, i in enumerate(heurestics):
                    row = (heurestics[_index], observations[_index]) + triplets[_index] + (pair_triple_ratios[_index],) + triangle_counts[_index]
                    fd.write('\t'.join(str(value) for value in row) + '\n')
            with open(path + str(index) + '_data_independece.tsv', 'w') as fd:
                fd.write('est\tobs\tn1\tn2\tn3\tpair_trip_ratio\ts1\ts2\ts3\ts12\ts13\ts23\ts123\n')
                for _index, i in enumerate(independences):
                    row = (independences[_index], observations[_index]) + triplets[_index] + (pair_triple_ratios[_index],) + triangle_counts[_index]
                    fd.write('\t'.join(str(value) for value in row) + '\n')

            # Save the errors
            with open(path + str(index) + '_MAPE_errors.pickle', 'wb') as fd:
                pickle.dump(MAPE_errors, fd)
            with open(path + str(index) + '_MAPE_errors_ext.pickle', 'wb') as fd:
                pickle.dump(MAPE_errors_ext, fd)
            with open(path + str(index) + '_MAPE_errors_heu.pickle', 'wb') as fd:
                pickle.dump(MAPE_errors_heu, fd)
            with open(path + str(index) + '_MAPE_errors_ind.pickle', 'wb') as fd:
                pickle.dump(MAPE_errors_ind, fd)
            with open(path + str(index) + '_MAPE_errors_baseline.pickle', 'wb') as fd:
                pickle.dump(MAPE_errors_baseline, fd)

            #saves amounts of all subsets of triples.
            # TODO this code does not run!
            # with open(path + str(index) + '_data_correlations.tsv', 'w') as fd:
            #     fd.write('s1\ts2\ts3\ts12\ts13\ts23\n')
            #     for _index, i in enumerate(s123):
            #         fd.write(str(s1[_index]) + '\t' + str(s2[_index]) + '\t' + str(s3[_index]) + '\t' + str(s12[_index]) + '\t' + str(s13[_index]) + '\t'+ str(s23[_index]) + '\n')

            #saves independence estimate for all triples.
            # TODO Why s123[_index] in the denominator?
            # TODO What is a 'double independence estimate'?
            # TODO Why not calculate and save estimates in the same way as ext and max_ent?
            # with open(path + str(index) + '_independence_estimate.tsv', 'w') as fd:
            #     fd.write('single independence estimate\tdouble independence estimate\n')
            #     for _index, i in enumerate(s123):
            #     	tempVal1 = sample_size/(s1[_index])
            #     	tempVal2=sample_size/(s2[_index])
            #     	tempVal3=sample_size/(s3[_index])
            #     	tempVal12=sample_size/(s12[_index])
            #     	tempVal13=sample_size/(s13[_index])
            #     	tempVal23=sample_size/(s23[_index])
            #         fd.write(str(s123[_index]/tempVal1*tempVal2*tempVal3*(total_transactions-sample_size) + '\t' + s123[_index]/tempVal12*tempVal13*tempVal23*(total_transactions-sample_size) + '\n'))


            del estimates
            del observations

            # remove tmp files
            # os.remove(sample_freq_name)
            # os.remove(sample_file_name)

        else:
            print 'No abs errors!'
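The confidence-interval step repeated above (mean error, standard error, norm.interval) reduces to the standalone sketch below. Note one deliberate difference: this sketch divides by the number of error terms, whereas the code above divides by sqrt(sample_size), so treat it as a reference for the pattern rather than a drop-in replacement.

import math
from scipy.stats import norm, tvar

def mean_error_interval(errors, confidence=0.95):
    # Normal-approximation confidence interval for the mean of `errors`.
    # Requires at least two values, since tvar is the sample variance.
    avg = sum(errors) / float(len(errors))
    std_error = math.sqrt(tvar(errors)) / math.sqrt(len(errors))
    return norm.interval(confidence, loc=avg, scale=std_error)

# e.g. mean_error_interval([0.8, 1.2, 0.9, 1.1, 1.0], 0.99)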
Example #49
0
def ci_norm(data, confidence):
    mean = np.mean(data)
    sigma = np.std(data)
    v1, v2 = norm.interval(confidence, loc=mean, scale=sigma)
    return v2 - v1
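A quick, illustrative usage check (the data below are made up): for normally distributed data, ci_norm returns the width of the interval mean +/- z*sigma, so at 95% confidence it should be close to 2 * 1.96 * sigma.

import numpy as np
from scipy.stats import norm

data = np.random.normal(loc=0.0, scale=2.0, size=10000)
width = ci_norm(data, 0.95)                # ci_norm as defined above
print(width, 2 * norm.ppf(0.975) * 2.0)    # both close to ~7.84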
Example #50
0
def est_all_data_disc_version(algorithm, tab_file, min_support=-30, iterations=1, only_interesting_triples=False, restricted_triples=None, extra_id=''):
    from subprocess import call
    from parsers import Borgelt

    cv_start = time()

    # Create work folder
    _id = str(time()).replace('.','') + '_' + extra_id
    path = '../tmp/cv_' + _id + '/'
    os.mkdir(path)
    print "\n### Running cross validation on ALL DATA cv_{}###".format(_id)

    total_transactions = 0
    for line in open(tab_file, 'rb'):
        total_transactions += 1
    print 'Total total_transactions: ', total_transactions
    sample_size = total_transactions

    avg_errors = []
    var_errors = []
    avg_errors_ext = []
    var_errors_ext = []
    avg_errors_heu = []
    var_errors_heu = []
    for index in range(iterations):

        borgelt_start = time()
        sample_freq_name = path + str(index) + '_sample_frequent_items.out'
        args = [algorithm, tab_file, sample_freq_name, '-s' + str(min_support), '-n3']
        call(args)
        print 'fpgrowth on sample data (ALL DATA) done: {} secs'.format(time()-borgelt_start)


        freq = Borgelt.read_frequent_items(sample_freq_name)
        # Create ds of all observed triplets
        # Saved as sorted keys for lookup,
        # and their frequency as value
        observed = {}
        count = 0
        for item in freq:
            if len(item[0]) == 3:
                sorted_trip = triple_sort(item[0])
                # * 2: a hack so that Forward calculates the
                # observed frequency correctly.
                observed[sorted_trip] = item[1][0] * 2
        print 'Total triplets observed:', len(observed)

        # Check any frequent items were found
        if not os.path.exists(sample_freq_name):
            print 'No frequent items found'
            print 'args', args
            continue

        min_support_trips = min_supported_trips(min_support, total_transactions)
        print 'Forward min_support_trips set to: ', min_support_trips
        triangles_start = time()
        triangle_tree, sample_triples = Forward.forward_compact(sample_freq_name, min_support_trips, observed, only_interesting_triples, restricted_triples)
        print 'Found triangles done: {}'.format(time() - triangles_start)

        #del sample_freq

        estimates = []
        extrapolations = []
        heurestics = []
        observations = []
        triplets = []
        MAPE_errors = []
        MAPE_errors_ext = []
        triangle_counts = []
        pair_triple_ratios = []

        # Recursion depth needed for the estimate to converge
        req_depth = int(math.log(total_transactions, 2))+1

        # DFS of the tree holding all triangles
        for n1 in triangle_tree.keys():
            s1, s2_dict = triangle_tree[n1]
            for n2 in s2_dict.keys():
                s2, s12, s3_dict = s2_dict[n2]                                                                                                                                                                                                                          
                for n3 in s3_dict.keys():                                                                                                                                       
                    s3, s13, s23, s123 = s3_dict[n3]

                    triangle = (n1, n2, n3)  
                    triplets.append(triangle)

                    triangle_counts.append((s1, s2, s3, s12, s13, s23, s123))   

                    pair_triple_ratio = s123 / float(min(s12, s13, s23))
                    pair_triple_ratios.append(pair_triple_ratio)                                                                                                                                                                                                                                                                                                                                                                                                                                   

                    # Observed is the triple support, since sample is all data
                    obs = s123

                    # maxent estimate
                    est = ent.maxent_est_rosa(s1, s2, s3, s12, s23, s13, float(total_transactions), num=req_depth)

                    # extrapolation estimate, does not make sense for all data
                    est2 = s123 / float(sample_size) * (total_transactions)

                    # heuristic, use max_ent for 0 triples in the sample; does not make sense for all data
                    # est3 = s123 == 0 and est or est2

                    estimates.append(est)
                    # extrapolations.append(est2)
                    # heurestics.append(est3)
                    observations.append(obs)

                    # MAPE error max ent
                    error = abs(obs-est) / math.sqrt(obs)
                    MAPE_errors.append(error)
                    # MAPE error extrapolation
                    error2 = abs(obs-est2) / math.sqrt(obs)
                    MAPE_errors_ext.append(error2)
                    # MAPE error heuristic
                    # error3 = abs(obs-est3) / float(obs) * 100
                    # MAPE_errors_heu.append(error3)

        
        del triangle_tree
        del sample_triples
                    
        if len(MAPE_errors) > 0: #TODO handle this, probably when nothing has been found

            min_error = min(MAPE_errors)
            max_error = max(MAPE_errors)

            # max ent error
            avg_error = sum(MAPE_errors) / float(len(MAPE_errors))
            avg_errors.append(avg_error)

            # extrapolation error
            # avg_error_ext = sum(MAPE_errors_ext) / float(len(MAPE_errors_ext))
            # avg_errors_ext.append(avg_error_ext)
            
            # heuristic error
            # avg_error_heu = sum(MAPE_errors_heu) / float(len(MAPE_errors_heu))
            # avg_errors_heu.append(avg_error_heu)
            
            # variance
            var_error = var(MAPE_errors)
            # var_error_ext = tvar(MAPE_errors_ext)
            # var_error_heu = tvar(MAPE_errors_heu)

            # max_ent confidence interval
            std_dev = math.sqrt(var_error)
            std_error = std_dev / math.sqrt(sample_size)
            span_99 = norm.interval(0.99, avg_error, std_error)
            span_95 = norm.interval(0.95, avg_error, std_error)

            # ext confidence interval
            # std_dev_ext = math.sqrt(var_error_ext)
            # std_error_ext = std_dev_ext / math.sqrt(sample_size)
            # span_99_ext = norm.interval(0.99, avg_error_ext, std_error_ext)
            # span_95_ext = norm.interval(0.95, avg_error_ext, std_error_ext)

            # heuristic confidence interval
            # std_dev_heu = math.sqrt(var_error_heu)
            # std_error_heu = std_dev_heu / math.sqrt(sample_size)
            # span_99_heu = norm.interval(0.99, avg_error_heu, std_error_heu)
            # span_95_heu = norm.interval(0.95, avg_error_heu, std_error_heu)

            var_errors.append(var_error)
            # var_errors_ext.append(var_error_ext)
            # var_errors_heu.append(var_error_heu)
            
            res_string = "\nResult ALL DATA({}):\nSample size:{} triangles:{} test_data:{}\n".format(index, sample_size, len(estimates), sample_size)
            # log max ent result
            res_string += "avg_error:{} var_error:{}\n".format(avg_error, var_error)
            res_string += '99% Confidence interval(-/+): {}\n'.format(str(span_99))
            res_string += '95% Confidence interval(-/+): {}\n'.format(str(span_95))

            # NOTE: avg_error_ext / var_error_ext are not computed in this all-data
            # variant (the extrapolation block above is commented out), so the ext
            # report is skipped here to avoid a NameError.
            # res_string += 'avg_error_ext:{} var_error_ext:{}\n'.format(avg_error_ext, var_error_ext)
            # res_string += '99% Confidence interval(-/+): {}\n'.format(str(span_99_ext))
            # res_string += '95% Confidence interval(-/+): {}\n'.format(str(span_95_ext))

            # res_string += 'avg_error_heu:{} var_error_heu:{}\n'.format(avg_error_heu, var_error_heu)
            # res_string += '99% Confidence interval(-/+): {}\n'.format(str(span_99_heu))
            # res_string += '95% Confidence interval(-/+): {}\n'.format(str(span_95_heu))

            with open(path + 'log.txt', 'a') as log_file:
                log_file.write(res_string)
            print res_string

            # Write result data
            with open(path + str(index) + '_data.json', 'w') as fd:
                # triplet_key = ['triple' for t in estimates]
                # est_key = ['est' for t in estimates]
                # obs_key = ['obs' for t in observations]
                fd.write(json.dumps(zip(triplets, zip(estimates, observations))))
            with open(path + str(index) + '_data.tsv', 'w') as fd:
                fd.write('est\tobs\tn1\tn2\tn3\tpair_trip_ratio\ts1\ts2\ts3\ts12\ts13\ts23\ts123\n')
                for _index, i in enumerate(estimates):
                    row = (estimates[_index], observations[_index]) + triplets[_index] + (pair_triple_ratios[_index],) + triangle_counts[_index]
                    fd.write('\t'.join(str(value) for value in row) + '\n')
            # NOTE: the extrapolation estimates are never collected in this all-data
            # variant (the appends above are commented out), so writing this file
            # would fail with an IndexError.
            # with open(path + str(index) + '_data_extrapolation.tsv', 'w') as fd:
            #     fd.write('est\tobs\tn1\tn2\tn3\tpair_trip_ratio\ts1\ts2\ts3\ts12\ts13\ts23\ts123\n')
            #     for _index, i in enumerate(estimates):
            #         row = (extrapolations[_index], observations[_index]) + triplets[_index] + (pair_triple_ratios[_index],) + triangle_counts[_index]
            #         fd.write('\t'.join(str(value) for value in row) + '\n')
            del estimates
            del observations

            # remove tmp files
            # os.remove(sample_freq_name)
            # os.remove(sample_file_name)

        else:
            print 'No abs errors!'

    print "Cross validation done!"
    print "time: ", (time() - cv_start)
    if len(avg_errors) > 0:
        total_avg_error = sum(avg_errors)/float(len(avg_errors))
        total_res_string = "Avg error:{}".format(total_avg_error)