Example no. 1
    def fit_transform(self, X, y=None):
        """Normalize numerical columns.
        
        Args:
            X (numpy.array): numerical columns to normalize

        Returns:
            X (numpy.array): normalized numerical columns
        """

        self.ecdfs = [None] * X.shape[1]

        for col in range(X.shape[1]):
            self.ecdfs[col] = ECDF(X[:, col])
            X[:, col] = self._transform_col(X[:, col], col)

        return X
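    # Note: self._transform_col is not shown in this snippet. A plausible
    # sketch, assuming a rank-Gauss style normalizer (empirical quantile
    # mapped through the inverse normal CDF); the clipping constant and the
    # whole body are assumptions, not the original implementation:
    def _transform_col(self, x, col):
        import numpy as np
        from scipy import stats
        # Empirical quantile of each value, clipped away from 0 and 1 so the
        # inverse normal CDF stays finite.
        q = np.clip(self.ecdfs[col](x), 1e-6, 1 - 1e-6)
        return stats.norm.ppf(q)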
Example no. 2
def summarize(data, name=None, units=None, plot=True):
    """Summarizes an array of posterior sample draws.
    
    Parameters:
    -----------
    data: pandas Series or numpy array
        A 1D array containing MCMC posterior sample draws
    name: string
        Name of the parameter; automatically pulled if
        data is a pd.Series and name=None
    units: string
        The units associated with the parameter
    plot: bool, default True
        Whether or not to display an empirical cumulative
        distribution (ECDF) plot
    
    Returns:
    --------
    summary: DataFrame
        Contains the summary stats of the samples
    """
    if isinstance(data, pd.Series) and name is None:
        name = data.name
    if units is not None:
        name = f'{name} ({units})'

    vals = np.round(np.quantile(data, [0.5, 0.025, 0.975]), 3)

    df = pd.DataFrame(
        {
            'value': name,
            'median': vals[0],
            '95% CR': f'[{vals[1]}, {vals[2]}]'
        },
        index=[0])

    if plot:
        ecdf = ECDF(np.asarray(data))  # handles both Series and ndarray input
        p = bokeh.plotting.figure(plot_width=400, plot_height=300)
        p.xaxis.axis_label = name
        p.yaxis.axis_label = 'ECDF'
        p.circle(ecdf.x, ecdf.y)
        bokeh.io.show(p)

    return df
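# A quick usage sketch with synthetic draws; plot=False so it runs without a
# bokeh display (the data and names here are illustrative assumptions):
import numpy as np
import pandas as pd

draws = pd.Series(np.random.default_rng(0).normal(1.0, 0.1, 4000), name='mu')
print(summarize(draws, units='mV', plot=False))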
Example no. 3
def PlotNumerical(DF, targetcols=None, figsize=(10, 5), ticksfontsize=12, titlefontsize=20, kde=True):

    if targetcols is None:
        SelectColumns = DF.columns.values
    else:
        SelectColumns = targetcols

    for col in SelectColumns:

        plt.figure(figsize=figsize)
        plt.suptitle(col, fontsize=titlefontsize, y=0.91)

        # Histogram (with optional KDE)
        plt.subplot(221)
        plt.grid()
        plt.xticks(fontsize=ticksfontsize)
        plt.yticks(fontsize=ticksfontsize)
        try:
            sns.distplot(DF[col], kde=kde)
        except Exception:
            # the KDE can fail on degenerate columns; fall back to a plain histogram
            sns.distplot(DF[col], kde=False)

        # Boxplot
        plt.subplot(222)
        plt.grid()
        plt.xticks(fontsize=ticksfontsize)
        plt.yticks(fontsize=ticksfontsize)
        sns.boxplot(x=col, data=DF)

        # Empirical CDF
        plt.subplot(212)
        plt.grid()
        plt.xticks(fontsize=ticksfontsize)
        plt.yticks(fontsize=ticksfontsize)
        ecdf = ECDF(DF[col])
        plt.plot(ecdf.x, ecdf.y)  # ecdf.y already runs from 0 to 1
        plt.xlabel('Value')
        plt.ylabel('ECDF')

        plt.show()
Example no. 4
def inv_ecdf_vs_pred_entropy(probabilities, label=None, color='b', linestyle='-', axis=None):
    pred_ent = predictive_entropy(probabilities)
    ecdf = ECDF(pred_ent)
    x_lim = np.log(probabilities.shape[1])
    entropy_range = np.linspace(0.0, x_lim, probabilities.shape[1] * 100)
    if axis is None:
        fig, ax = plt.subplots(figsize=(12, 7), tight_layout=True)
    else:
        ax = axis
    ax.plot(entropy_range, 1 - ecdf(entropy_range), c=color, ls=linestyle, lw=3, label=label, clip_on=False)
    ax.set_xlim(ax.get_xlim()[0], np.ceil(x_lim))
    ax.set_ylim(ax.get_ylim()[0], 1)
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.tick_params(direction='out', labelsize=14, right=False, top=False)
    ax.set_ylabel('1-ecdf', fontsize=16)
    ax.set_xlabel('Predictive Entropy', fontsize=16)
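# predictive_entropy is not defined in this snippet. A minimal sketch,
# assuming `probabilities` holds one softmax distribution per row; the
# epsilon guard is an assumption:
def predictive_entropy(probabilities):
    import numpy as np
    eps = 1e-12  # guards against log(0)
    # Shannon entropy of each row's predictive distribution
    return -np.sum(probabilities * np.log(probabilities + eps), axis=1)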
Example no. 5
def q4():
    # Return the answer to question 4 here.
    # Standardize the non-pulsar mean_profile values
    not_pulsar = stars.loc[stars['target'] == False]
    x = not_pulsar['mean_profile']
    false_pulsar_mean_profile_standardized = (x - x.mean()) / x.std()
    # Theoretical quantiles for mean = 0 and standard deviation = 1
    q_80 = sct.norm.ppf(0.8, loc=0, scale=1)
    q_90 = sct.norm.ppf(0.9, loc=0, scale=1)
    q_95 = sct.norm.ppf(0.95, loc=0, scale=1)
    # Probabilities associated with the quantiles
    ecdf = ECDF(false_pulsar_mean_profile_standardized)
    prob_quantis = (ecdf(q_80).round(3), ecdf(q_90).round(3),
                    ecdf(q_95).round(3))

    return prob_quantis
Example no. 6
def ecdf(x, N=100, inverse=False):
    """
    Thin wrapper around statsmodels' ECDF.
    Arguments:
        x: array of points for which to find ecdf
        N: The number of output points you want in your ecdf
        inverse: Return 1 - ecdf
    Returns:
        x_out: an array of values
        y_out: an array of percentiles
    """
    import numpy as np
    from statsmodels.distributions.empirical_distribution import ECDF
    x_out = np.linspace(min(x), max(x), N)
    y_out = ECDF(x)(x_out)
    if inverse:
        y_out = 1 - y_out
    return x_out, y_out
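# A quick usage sketch (synthetic data, illustrative names):
import numpy as np

sample = np.random.default_rng(42).normal(size=1000)
x_out, y_out = ecdf(sample, N=50)           # empirical CDF on a 50-point grid
x_inv, y_inv = ecdf(sample, inverse=True)   # survival function, 1 - ECDF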
Example no. 7
def quantile_correction(obs_data, mod_data, sce_data, modified=True):
    cdf = ECDF(mod_data)
    p = cdf(sce_data) * 100
    cor = np.subtract(*[np.nanpercentile(x, p) for x in [obs_data, mod_data]])
    if modified:
        mid = np.subtract(
            *[np.nanpercentile(x, 50) for x in [obs_data, mod_data]])
        g = np.true_divide(
            *[np.nanpercentile(x, 50) for x in [obs_data, mod_data]])

        iqr_obs_data = np.subtract(*np.nanpercentile(obs_data, [75, 25]))
        iqr_mod_data = np.subtract(*np.nanpercentile(mod_data, [75, 25]))

        f = np.true_divide(iqr_obs_data, iqr_mod_data)
        cor = g * mid + f * (cor - mid)
    return sce_data + cor
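# A minimal usage sketch with synthetic observed, modeled and scenario series
# (the distributions below are illustrative assumptions):
import numpy as np
from statsmodels.distributions.empirical_distribution import ECDF

rng = np.random.default_rng(0)
obs = rng.normal(10.0, 2.0, 5000)   # observations over the reference period
mod = rng.normal(12.0, 3.0, 5000)   # model output over the reference period
sce = rng.normal(13.0, 3.0, 1000)   # scenario series to bias-correct
corrected = quantile_correction(obs, mod, sce)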
Example no. 8
def q4():
    false_pulsar_mean_profile = stars.loc[stars['target'] == False,
                                          'mean_profile']

    false_pulsar_mean_profile_standardized = sct.zscore(
        false_pulsar_mean_profile)

    # Percent point function
    ppf_q80 = sct.norm.ppf(0.80, loc=0, scale=1)
    ppf_q90 = sct.norm.ppf(0.90, loc=0, scale=1)
    ppf_q95 = sct.norm.ppf(0.95, loc=0, scale=1)

    # Create the ecdf function with standardized star data
    compute_ecdf_stars = ECDF(false_pulsar_mean_profile_standardized)

    return (round(compute_ecdf_stars(ppf_q80), 3),
            round(compute_ecdf_stars(ppf_q90), 3),
            round(compute_ecdf_stars(ppf_q95), 3))
Example no. 9
def q2():
    """Answer of question 02
    
    Returns
    -------
    float
        Probability that a value falls within one standard deviation
        of the mean
    """
    
    mean = dataframe['normal'].mean()
    std = dataframe['normal'].std()
    
    ecdf = ECDF(dataframe['normal'])
    prob_inf = ecdf(mean - std)
    prob_sup = ecdf(mean + std)
    
    return float(prob_sup - prob_inf)
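# Sanity check: for a normal variable this probability should be close to the
# theoretical 68.3%. A sketch, assuming `dataframe` is the module-level frame
# the function reads:
import numpy as np
import pandas as pd
from statsmodels.distributions.empirical_distribution import ECDF

dataframe = pd.DataFrame({'normal': np.random.default_rng(1).normal(20, 4, 10000)})
print(q2())  # ~0.683, the 68-95-99.7 rule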
Example no. 10
def CDF_band(n_samples):
    samples = np.random.standard_cauchy(n_samples)
    # Estimated Empirical CDF
    ecdf = ECDF(samples)
    line = np.linspace(-20, 20, 100000)
    ecdf_points = ecdf(line)  # ECDF objects are vectorized callables
    plt.plot(line, ecdf_points)
    plt.show()
    variance, mean = variance_mean(samples)
    skewness = get_skewness(samples)
    # Plugin Mean
    print('Plugin Estimator for Mean is:', mean)
    # Plugin Variance
    print('Plugin Estimator for Variance is:', variance)
    # Plugin Skewness
    print('Plugin Estimator for Skewness is:', skewness)
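# variance_mean and get_skewness are not shown in this snippet. A minimal
# sketch of plugin (empirical-distribution) estimators under those assumed
# names; note that for standard Cauchy samples these moments do not converge:
import numpy as np

def variance_mean(samples):
    mean = np.mean(samples)
    variance = np.mean((samples - mean) ** 2)  # plugin (biased) variance
    return variance, mean

def get_skewness(samples):
    variance, mean = variance_mean(samples)
    return np.mean((samples - mean) ** 3) / variance ** 1.5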
Example no. 11
    def fit_transform(self, X, y=None):
        """Normalize numerical columns.

        Args:
            X (pandas.DataFrame) : numerical columns to normalize

        Returns:
            (pandas.DataFrame): normalized numerical columns
        """

        self.ecdfs = [None] * X.shape[1]

        X = X.copy()
        for i in range(X.shape[1]):
            self.ecdfs[i] = ECDF(X.iloc[:, i].values)
            X.iloc[:, i] = self._transform_col(X.iloc[:, i], i)

        return X
Example no. 12
def q4():
    """Answer of question 04
    
    Returns
    -------
    tuple
        Probabilities associated with the 0.80, 0.90 and 0.95 quantiles of the
        standardized mean_profile values of false pulsar stars
    """
    
    false_pulsar = stars[stars['target'] == False]['mean_profile']
    false_pulsar_mean_profile_standardized = (false_pulsar - false_pulsar.mean()) / false_pulsar.std()
    
    ecdf = ECDF(false_pulsar_mean_profile_standardized)
    theoretical_quantiles = sct.norm.ppf([.80, .90, .95], loc=0, scale=1)
    prob_quantiles = ecdf(theoretical_quantiles).round(3)
    
    return tuple(prob_quantiles)
Example no. 13
    def calculate_baseline_scores(self):
        # TODO: should use the get_reachable_geoms function?
        print('\t Calculating baseline scores')
        base_scores = {}
        self.base_attributes = {}
        self.score_ecdfs = {}
        stats_to_aggregate = [
            col for col in self.zones.columns
            if (('res_' in col) or ('emp_' in col))
        ]
        # get the baseline reachable attributes and scores for every zone
        for ind, row in self.zones.loc[
                self.zones['reference_area']].iterrows():
            reachable_zones = self.zone_to_reachable[ind]['zones']
            self.base_attributes[ind] = self.zones.loc[reachable_zones][
                stats_to_aggregate].sum().to_dict()
            self.base_attributes[ind]['source_res'] = row['res_total']
            self.base_attributes[ind]['source_emp'] = row['emp_total']
            # get scores for individual zones- weighting cancels out
            base_scores[ind] = self.attributes_to_scores(
                [self.base_attributes[ind]])

        # Create the ECDFs for each score using only the zones (not the grid cells)
        self.base_zones_scores = pd.DataFrame.from_dict(base_scores,
                                                        orient='index')
        for score in self.base_zones_scores.columns:
            base_scores_no_nan = [
                x for x in self.base_zones_scores[score] if x == x  # NaN != NaN drops NaNs
            ]
            self.score_ecdfs[score] = ECDF(base_scores_no_nan)

        # get weighted scores across the simulation area zones
        # (ignore the grid which is empty in reference and therefore would be weighted zero)
        ref_scores = self.attributes_to_scores(
            [self.base_attributes[ind] for ind in self.overlapping_geoids])
        self.ref_ind = self.normalise_ind(ref_scores)

        # get the base reachable attributes for every grid cell location
        for i_c in range(len(self.geogrid)):
            reachable_zones = self.grid_to_reachable[i_c]['zones']
            self.base_attributes[i_c] = self.zones.loc[reachable_zones][
                stats_to_aggregate].sum().to_dict()
Example no. 14
    def plot_inter_limitorder_time_ecdf(self, show=True):
        """
        Plot the empirical cumulative distribution function of the time gaps
        between limit orders in self.orderbook.
        :return: the fitted ECDF
        """
        self.limitorders = self.orderbook.loc[self.orderbook.Type == 1].copy()
        self.limitorders["IntertradeTimes"] = (self.limitorders.loc[:,"Timestamp"] - self.limitorders.loc[:,"Timestamp"].shift(1))
        self.limitorders.loc[:,"IntertradeTimes"] = self.limitorders.IntertradeTimes.astype('timedelta64[ns]')
        self.limitorders.loc[:,"IntertradeTimes"] = self.limitorders.IntertradeTimes.apply(lambda x: self.get_intertrade_times(x,'seconds'))

        ecdf = ECDF(self.limitorders.IntertradeTimes.values)

        if show:
            fig = plt.figure(figsize=(10,7))
            ax = fig.add_subplot(111)
            ax.ticklabel_format(axis='y', style='sci', scilimits=(-2,-1))
            ax.semilogx(ecdf.x, ecdf.y)
            plt.show()
        return ecdf
Example no. 15
def main():

    ### verbs

    with open("verbs/" + args.reg + "_verbs.pickle", "rb") as f:
        verb_counter = pickle.load(f)

    fileids = get_filenames("merged-" + args.reg)
    results_file = open("%s_results.csv" % args.reg, "w+")
    results_csvwriter = csv.writer(results_file, delimiter="\t")
    results_csvwriter.writerow(
        ["name", "age", "causative", "random", "percentile"])

    for f in fileids:
        print(f)
        name = f.split("/")[-1].split("_")[0]
        age = f.split("_")[1].split(".")[0]
        sents = read_sents(f)

        if (args.build):
            vocab_size = we_on_merged_sessions(sents, name + "-" + age)

        caus_counter, caus_result = get_similarity(name, age, args.reg)
        print(caus_counter)
        random_results = baseline(name, age, caus_counter, 1000, verb_counter,
                                  args.reg)
        random_result = statistics.median(random_results)
        # percentile of causative in the distribution

        percentile = ECDF(random_results)(caus_result)

        print(percentile)

        results_csvwriter.writerow(
            [name, age, caus_result, random_result, percentile])
Example no. 16
def q2():
    # Return the answer to question 2 here.

    # Mean and standard deviation
    x_mean = df['normal'].mean()
    x_std = df['normal'].std()

    # Interval of one standard deviation around the mean
    data_point = (x_mean - x_std, x_mean + x_std)

    # Empirical CDF of the 'normal' variable
    ecdf = ECDF(df['normal'])

    # The probability over the interval is the probability at the upper
    # endpoint minus the probability at the lower endpoint
    p1 = ecdf(data_point[0])
    p2 = ecdf(data_point[1])

    return round(float(p2 - p1), 3)
Example no. 17
def ECDFFitting(r, W, plot=False):
    Vlist = list(
        GenMultiRN(r, W, Type="P", Size=10000, Warn=False,
                   HisGen=True).ravel())
    Vlist = sorted(Vlist)
    ecdf = ECDF(Vlist)
    Fx = ecdf(Vlist)
    x = np.arange(min(Vlist) - 0.05, max(Vlist) + 0.05, 0.01)
    Fxx = ecdf(x)
    if plot:
        name = "ECDF"
        plt.figure()
        plt.plot(Vlist, Fx, 'o', label=name + ': Origin', markersize=3)
        plt.plot(x, Fxx, 'r', label=name + ': Fitted ecdf')
        plt.title('ECDF ' + " fitting: r = " + str(r))
        plt.legend()
        plt.show()
    return ecdf
Example no. 18
    def _single_cdf(self, trial_name):
        data = []
        for drop_rate in self.result[trial_name].keys():
            for stat in self.result[trial_name][drop_rate]:
                data += self._prepare_histogram_data(stat)
        # optionally rescale: data = [v / max(data) for v in data]
        ecdf = ECDF(data)
        return [
            go.Scatter(
                name="eCDF",
                x=np.unique(data),
                y=ecdf(np.unique(data)) * 100,
                line_shape='hv',
                line_color='darkgreen',
            )
        ]
Example no. 19
def quantile_mapping(obs_cube, mod_cube, sce_cubes, *args, **kwargs):
    """
    Quantile Mapping

    apply quantile mapping to all scenario cubes using the distributions
    of obs_cube and mod_cube

    Args:

    * obs_cube (:class:`iris.cube.Cube`):
        the observational data

    * mod_cube (:class:`iris.cube.Cube`):
        the model data at the reference period

    * sce_cubes (:class:`iris.cube.CubeList`):
        the scenario data that shall be corrected
    """
    from statsmodels.distributions.empirical_distribution import ECDF

    obs_cube_mask = np.ma.getmask(obs_cube.data)
    cell_iterator = np.nditer(obs_cube.data[0], flags=['multi_index'])
    while not cell_iterator.finished:
        index_list = list(cell_iterator.multi_index)
        cell_iterator.iternext()

        index_list.insert(0, 0)
        index = tuple(index_list)
        if obs_cube_mask is not np.ma.nomask and obs_cube_mask[index]:
            continue

        index_list[0] = slice(0, None, 1)
        index = tuple(index_list)
        obs_data = obs_cube.data[index]
        mod_data = mod_cube.data[index]
        mod_ecdf = ECDF(mod_data)

        for sce_cube in sce_cubes:
            sce_data = sce_cube[index].data
            p = mod_ecdf(sce_data) * 100
            corr = np.percentile(obs_data, p) - \
                np.percentile(mod_data, p)
            sce_cube.data[index] += corr
Example no. 20
def q4():
    # Return the answer to question 4 here.
    false_pulsar_mean_profile_standardized = (
        stars[stars["target"] == 0]["mean_profile"] -
        stars[stars["target"] == 0]["mean_profile"].mean()
    ) / stars[stars["target"] == 0]["mean_profile"].std()

    vals = sct.norm.ppf([0.8, 0.9, 0.95])  # the theoretical quantiles

    ecdf = ECDF(false_pulsar_mean_profile_standardized)

    resposta = tuple(round(float(x), 3) for x in ecdf(vals))

    return resposta
Example no. 21
def permutationFWE(diff_arr, nullmean=0, permutations=1000, nproc=1):
    """
    Performs family-wise error correction using permutation testing (Nichols & Holmes 2002)

    Parameters:
        diff_arr = MxN matrix of a set of M independent tests for condition 1 minus condition 2 across N subjects
        nullmean = mean of the null distribution to test against (default 0)
        permutations = Number of permutations to perform
        nproc = number of processes to run in parallel

    Returns:
        t: Array of t-values of the contrast map (Mx1 vector, for M tests)
        p: Array of FWE-corrected p-values (Mx1 vector, for M tests);
           Note, p-values correspond to values on the CDF. One-sided or two-sided p-values can be computed accordingly.

    N.B.: Only works for paired one-sample t-tests
    """
    # Focus on difference matrix -- more computationally feasible (and less data to feed into parallel processing)

    # Prepare inputs for multiprocessing
    inputs = []
    for i in range(permutations):
        seed = np.random.randint(0, 100000, 1)[0]
        inputs.append((diff_arr, nullmean, seed))

    pool = mp.Pool(processes=nproc)
    result = pool.map_async(_permutation, inputs).get()
    pool.close()
    pool.join()

    # Returns an array of T-values distributions (our null distribution of "max-T" values)
    maxT_dist = np.asarray(result)

    # Obtain real t-values
    t = stats.ttest_1samp(diff_arr, nullmean, axis=1)[0]
    #t = np.mean(diff_arr,axis=1)

    # Construct ECDF from maxT_dist
    ecdf = ECDF(maxT_dist)

    # Return p-values from maxT_dist using our empirical CDF (FWE-corrected p-values)
    p_fwe = ecdf(t)

    return t, p_fwe
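# _permutation is not shown here. A minimal sketch of one max-T permutation
# under a sign-flipping scheme for paired tests (the exact scheme used by the
# original is an assumption):
import numpy as np
from scipy import stats

def _permutation(params):
    diff_arr, nullmean, seed = params
    rng = np.random.RandomState(seed)
    # Randomly flip each subject's sign (valid under the null for paired
    # tests), then keep the maximum t-value across all M tests ("max-T").
    signs = rng.choice([-1, 1], size=diff_arr.shape[1])
    t_perm = stats.ttest_1samp(diff_arr * signs, nullmean, axis=1)[0]
    return np.max(t_perm)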
Example no. 22
def plot_ccdf(sample):
	ecdf = ECDF(sample)
	x = np.linspace(min(sample), max(sample))
	y = 1 - ecdf(x)  # complementary CDF

	plt.plot(x, y, 'bx')
	plt.gca().set_xscale('log')
	plt.gca().set_yscale('log')
	plt.gca().set_aspect('equal')
	axes = plt.gca()
	axes.set_xlim([min(x),max(x)])
	axes.set_ylim([min(y),max(y)])

	plt.show()
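# A quick usage sketch on a heavy-tailed sample, where the log-log CCDF is
# most informative (the Pareto draw is illustrative):
import numpy as np

plot_ccdf(np.random.default_rng(3).pareto(2.0, size=5000) + 1)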
Example no. 23
def counterfactual_ranks(points_to_predict,
                         points_for_distribution,
                         method="smoothed"):
    r"""
    counterfactual ranks:
        compute \widehat U, the value of the CDF at each element of points_to_predict,
        using the empirical CDF defined by 'points_for_distribution'.

    :param points_to_predict: points for which to get the rank in the distribution
    :param points_for_distribution: points from which to compute the CDF
    :param method: either "smoothed" or "standard", selecting how the CDF is computed
    """
    if method == "smoothed":
        y = smoothed_ecdf(new_points=points_to_predict,
                          x=points_for_distribution)
    elif method == "standard":
        ecdf = ECDF(points_for_distribution)
        y = ecdf(points_to_predict)
    else:
        raise ValueError("method must be 'smoothed' or 'standard'")
    return y
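# smoothed_ecdf is not defined in this snippet. A minimal kernel-smoothed CDF
# sketch; the signature follows the call above, while the Gaussian kernel and
# Silverman-style bandwidth are assumptions:
import numpy as np
from scipy.stats import norm

def smoothed_ecdf(new_points, x):
    x = np.asarray(x)
    h = 1.06 * x.std() * len(x) ** (-1 / 5)  # rule-of-thumb bandwidth
    # Average of Gaussian CDFs centered at the data: a smooth, strictly
    # increasing estimate that replaces the ECDF's steps.
    return norm.cdf((np.asarray(new_points)[:, None] - x[None, :]) / h).mean(axis=1)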
Example no. 24
    def vol_control_window_selection(self):
        vol_rolling_window = self.z_score_rolling_window
        vol_data = self.y
        vol_data = vol_data.diff().ewm(span=vol_rolling_window).std()
        vol_ecdf = ECDF(vol_data.values)

        def window_sample(x, ecdf):
            return np.floor((1 - ecdf(x)) * 12 + 2)

        self.df["aug_window"] = window_sample(vol_data)

        # Create a z-score dataframe containing all columns of possible windows
        all_possible_windows = map(int, list(set(self.df.aug_window.values)))

        data = self.df.kalman_hedged_spread.copy()
        dynamic_rolling_mean = pd.DataFrame(index=data.index)
        dynamic_rolling_std = dynamic_rolling_mean.copy()

        for window in all_possible_windows:
            dynamic_rolling_mean[f"window_{window}"] = data.rolling(
                window=window).mean()
            dynamic_rolling_std[f"window_{window}"] = data.rolling(
                window=window).std()

        for s in dynamic_rolling_mean.iterrows():
            roll_window = "window_" + str(int(self.df.loc[s[0], "aug_window"]))
            self.df.loc[s[0], "dy_roll_mean"] = s[1][roll_window]

        for s in dynamic_rolling_std.iterrows():
            roll_window = "window_" + str(int(self.df.loc[s[0], "aug_window"]))
            self.df.loc[s[0], "dy_roll_std"] = s[1][roll_window]

        self.df["z_score"] = (
            self.df.hedged_spread.values -
            self.df.dy_roll_mean.values) / self.df.dy_roll_std.values
        self.mean_spread = self.df.dy_roll_mean
        self.std_spread = self.df.dy_roll_std

        # Show window sampling profile
        plot_space = np.arange(0, vol_data.max(), 0.01)
        plt.plot(plot_space, vol_ecdf(plot_space))
        plt.title("Window sampling profile")
        plt.show()
Example no. 25
def Empirical_ICDF(x, p):
    '''
    Returns the inverse empirical cumulative distribution function evaluated at p
    '''

    # TODO: check that fill_value works correctly

    # fit ECDF
    ecdf = ECDF(x)
    cdf = ecdf(x)

    # interpolate the ECDF to recover support values
    fint = interp1d(
        cdf, x,
        fill_value=(np.nanmin(x), np.nanmax(x)),
        bounds_error=False
    )
    return fint(p)
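# A quick usage sketch recovering empirical quantiles from synthetic data:
import numpy as np
from scipy.interpolate import interp1d
from statsmodels.distributions.empirical_distribution import ECDF

sample = np.random.default_rng(7).exponential(scale=2.0, size=10000)
print(Empirical_ICDF(sample, [0.5, 0.9, 0.99]))  # median and upper-tail quantiles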
Example no. 26
def plot_examine_diff_diff(start_time, fname="", loops=2, gap=0,
        thresh=10, **kwas):
    '''
    lines:  1) domain independent cdf of ALL matches
            2-n) cdf of matches for domain with answer space > thresh
            n-m) cdf of matches for ALL domains with answer space < thresh
    '''
    kwas['start_time'] = start_time
    kwas['return_ccache'] = False
    svld, allsvl, allfmt, anssets = mv.arrange_self_data(start_time, gap, loops, **kwas)

    sm = mv.examine_diff_diff(svld)
    vals = [[], []]
    labels = ['all', 'small']
    for dom in sm:
        vals[0] = vals[0] + sm[dom]
        if len(anssets[dom]) < thresh:
            vals[1] += sm[dom]
        else:
            vals.append(sm[dom])
            labels.append(dom)

    fig, ax = plt.subplots(1, 1)
    for i in range(len(vals)):
        print("*****************" + labels[i] + "*********************")
        print(vals[i])
        ecdf = ECDF(vals[i])
        x = list(ecdf.x)
        y = list(ecdf.y)
        ax.plot(x, y, label=labels[i])
    ps.set_dim(fig, ax, xdim=13, ydim=7.5)
    plt.xlabel("diff mask match by domain")
    plt.ylabel("CDF of clients")
    lgd = ps.legend_setup(ax, 4, "top center", True)
    filename = plotsdir+"diff_mask"+fname
    fig.savefig(filename+'.png', bbox_extra_artists=(lgd,), bbox_inches='tight')
    fig.savefig(filename+'.pdf', bbox_extra_artists=(lgd,), bbox_inches='tight')
    plt.close(fig)

    print "saving data..."
    for i in xrange(0, len(vals)):
        outstr = df.overwrite(plotsdir+labels[i]+'_diff_jaccard.csv',
                df.list2col(vals[i]))
Example no. 27
def graficar_sueldo_neto(esi):
    sueldo_col_name = 'sueldo_neto'

    cdf_function = ECDF(esi[sueldo_col_name].dropna().values)
    layout = dict(
        title="<span style='font-size:26px'>Study of age groups</span><br><span style='color:#999; font-size: 16px; font-weight:200'>students and professionals</span>",
        plot_bgcolor='#f5f5f5',
        margin=dict(t=50, l=0, r=0),
        legend=dict(yanchor='top', xanchor='right', x=0.992, y=0.98, font=dict(size=12), traceorder='normal'),
        xaxis=dict(domain=[0, 1]),
        barmode="overlay",
        bargap=0.1,
        width=765
    )
    sueldo_range = np.linspace(0, esi[sueldo_col_name].max(), 10000)
    # plotly express figures take layout settings via update_layout
    fig = px.line(x=sueldo_range, y=100 * cdf_function(sueldo_range),
                  title=f'curva de densidad acumulada de {sueldo_col_name}')
    fig.update_layout(layout)
    fig.update_layout(xaxis=dict(title='Sueldo neto'), yaxis=dict(title='Percentil'))
    st.plotly_chart(fig)
Example no. 28
def calculate_epsilon_per_pair_parallel(values, delta, precision):
    values = sorted(map(abs, values))
    R_ij = max(values)
    epsilons = []
    r_ij = R_ij * precision
    cdf = ECDF(values)

    epsilon = inf
    # flag stays 1 only when every value is identical (degenerate ECDF)
    flag = 1
    prev = values[0]
    for i in values:
        if i != prev:
            flag = 0
        prev = i

    if not flag:
        for t_k in values:
            p_k = calculate_cdf(cdf, t_k + r_ij) - calculate_cdf(
                cdf, t_k - r_ij)

            # covering the case with risk less than or equal to 1 - p_k;
            # the round absorbs tiny float noise (e.g. 0.050000000000000044)
            if round(1 - p_k, 2) <= delta:
                epsilons.append(inf)
            else:
                eps = -log(p_k / (1.0 - p_k) *
                           (1.0 / (delta + p_k) - 1.0)) / R_ij
                epsilons.append(eps)

        if len(epsilons) > 0:
            epsilon = min(epsilons)
    else:
        # The ECDF degenerates when all values are equal; report inf and let
        # the caller decide how to handle that case.
        epsilon = inf
    return epsilon
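# calculate_cdf is not shown in this snippet. A thin sketch under the
# assumption that it simply evaluates the fitted step-function ECDF:
def calculate_cdf(cdf, value):
    return cdf(value)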
Example no. 29
def plotGwCorrmat(label, gldas, gwcorrmat, masking=True):
    '''
    plot correlation matrix between gw and CNN-learned model mismatch 
    '''
    print('max gw-grace correlation', np.nanmax(gwcorrmat))
    print('min gw-grace correlation', np.nanmin(gwcorrmat))

    cmap = ListedColormap((sns.diverging_palette(240, 10, s=80, l=55,
                                                 n=9)).as_hex())

    fig, ax = plt.subplots(figsize=(10, 6), dpi=300)
    pp = np.zeros(gldas.mask.shape) + np.NaN
    pp[gldas.mask == 1] = 1.0
    if masking:
        temp = np.multiply(gwcorrmat, pp)
        im = ax.imshow(temp, cmap, origin='lower', vmax=0.8, vmin=-0.8)
        #plot CDF
        corrvec = gwcorrmat[gldas.gwvalidcells]
        #remove nan cells
        corrvec = corrvec[~np.isnan(corrvec)]
        cdf = ECDF(corrvec)
        #[left, bottom, width, height]
        figpos = [0.52, 0.7, 0.18, 0.16]
        #sns.set_style("whitegrid")
        sns.set_style("ticks", {"xtick.major.size": 6, "ytick.major.size": 6})
        ax2 = fig.add_axes(figpos)
        ax2.set_ylim(0, 1)
        ax2.grid(True)
        ax2.plot(cdf.x, cdf.y, linewidth=1.5)
        ax2.set_xlabel('Correlation')
        ax2.set_ylabel('CDF')
    else:
        im = ax.imshow(gwcorrmat, cmap, origin='lower')

    cx, cy = gldas.getBasinBoundForPlotting()
    ax.plot(cx, cy, '-', color='#7B7D7D')
    plt.colorbar(im, orientation="horizontal", fraction=0.046, pad=0.1, ax=ax)

    plt.savefig('gwcorr{0}.png'.format(label),
                dpi=fig.dpi,
                transparent=True)
Example no. 30
def filter_variants(boost_obj, args):

    logger.info("Filtering Variants")
    path = args.out

    if args.predictor.lower() == 'classifier':
        df1 = pd.DataFrame()
        df1['xgb_score'] = boost_obj.y_pred
        df1['chr:pos'] = boost_obj.X_data.index
        real_snps = df1[df1['xgb_score'] == 1]['chr:pos']

    elif args.predictor.lower() == 'regressor':
        fitted_values = boost_obj.y_pred

        RS1 = list(boost_obj.y_data.to_numpy().nonzero()[0])
        ecdf_func = ECDF([fitted_values[idx] for idx in RS1])
        fitted_value_scores = ecdf_func(fitted_values)

        min_qscore = 0.05
        real_snps_idx = [i > min_qscore for i in fitted_value_scores]
        real_snps = boost_obj.X_data.loc[real_snps_idx].index

    ## Plot ECDF
    if args.predictor.lower() == 'regressor':
        logger.info("Plotting ECDF")
        plt.figure()
        plt.plot(ecdf_func.x, ecdf_func.y, '.')
        plt.xlabel('Boosting_Score')
        plt.ylabel('ECDF_Score')
        plt.title('ECDF')
        if args.snps:
            name = 'ecdf_' + args.model + '_' + args.predictor + '_snps.png'
        elif args.indels:
            name = 'ecdf_' + args.model + '_' + args.predictor + '_indels.png'
        elif args.all:
            name = 'ecdf_' + args.model + '_' + args.predictor + '.png'

        plt.savefig(os.path.join(path, name))

    return real_snps