Example #1
def frecuencia_relativa_con_ancho(conjuntoDeDatos, ancho=0.1, titulo='Histograma de frecuencia relativa'):
    inicio = int(min(conjuntoDeDatos))
    fin = int(max(conjuntoDeDatos))
    n = int((fin - inicio) / ancho)
    res = stats.relfreq(conjuntoDeDatos, numbins=n)
    x = res.lowerlimit + np.linspace(0, res.binsize * res.frequency.size, res.frequency.size)
    graficar_frecuencia_relativa(x, res, titulo)
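For context, a minimal self-contained sketch of the stats.relfreq pattern these examples share (the sample values below are made up; graficar_frecuencia_relativa above is assumed to be defined elsewhere in the original module):

import numpy as np
from scipy import stats

datos = np.array([1.2, 1.5, 1.7, 2.1, 2.3, 2.8, 3.0, 3.4])  # made-up sample
res = stats.relfreq(datos, numbins=5)
# x positions for the bars, the pattern used throughout these examples
x = res.lowerlimit + np.linspace(0, res.binsize * res.frequency.size, res.frequency.size)
print(res.frequency.sum())  # relative frequencies sum to 1.0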
Example #2
def findCounts(arr: np.ndarray) -> np.ndarray:
    """
    Replaces the elements of the array by their relative frequencies (only 2D-arrays).
    :param arr: array of the dataset as numpy ndarray.
    :return: numpy ndarray.
    """
    array_shape = arr.shape
    # trivial case: an empty array is returned unchanged
    if len(arr) == 0:
        return arr

    # the array must be exactly two-dimensional
    if len(array_shape) > 2:
        raise ValueError("Dimension of ndarray must be exactly two.")

    # init the output array
    frequencies = np.empty(shape=array_shape, dtype=float)
    # calculate the relative frequencies column by column
    # (np.append returns a new array and would discard the result,
    # so assign each column in place instead)
    for i in range(array_shape[1]):
        res = stats.relfreq(arr[:, i], len(arr[:, i]))
        frequencies[:, i] = res.frequency
    # return numpy array with frequencies
    return frequencies
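A hypothetical call, assuming numpy as np and scipy.stats as stats are imported as the snippet requires:

data = np.array([[1.0, 10.0],
                 [2.0, 20.0],
                 [2.0, 30.0]])
print(findCounts(data))  # one column of relative frequencies per input column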
Example #3
def cdf_vals_from_data(data, numbins=None, maxbins=None):

    # make sure data is a numpy array
    data = numpy.array(data)
    
    # by default, use numbins equal to number of distinct values
    # TODO: shouldn't this be one per possible x val?
    if numbins is None:
        numbins = numpy.unique(data).size

    if maxbins is not None and numbins > maxbins:
        numbins = maxbins
    
    # bin the data and count fraction of points in each bin (for PDF)
    rel_bin_counts, min_bin_x, bin_size, _ =\
        stats.relfreq(data, numbins, (data.min(), data.max()))
    
    # bin the data and count each bin (cumulatively) (for CDF)
    cum_bin_counts, min_bin_x, bin_size, _ =\
        stats.cumfreq(data, numbins, (data.min(), data.max()))

    # normalize bin counts so rightmost count is 1
    cum_bin_counts /= cum_bin_counts.max()

    # make array of x-vals (lower end of each bin)
    x_vals = numpy.linspace(min_bin_x, min_bin_x+bin_size*numbins, numbins)

    # CDF always starts at y=0
    cum_bin_counts = numpy.insert(cum_bin_counts, 0, 0)  # y = 0
    cdf_x_vals = numpy.insert(x_vals, 0, x_vals[0])  # x = min x


    return cum_bin_counts, cdf_x_vals, rel_bin_counts, x_vals
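A quick hypothetical check of cdf_vals_from_data (values chosen arbitrarily):

data = [1, 2, 2, 3, 3, 3, 4]
cum_y, cdf_x, rel_y, x = cdf_vals_from_data(data)
# cum_y climbs from 0 to 1 and rel_y sums to 1 across the bins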
Example #4
    def fit_stage3_(self, kappa, gamma):
        kwargs = {'confidence': gamma}
        self.set_params(**kwargs)

        x = self.encode_train_np
        y = self.y_train_np
        k = int(self.num_classes * kappa)
        sample_ratio = self._params['sample_ratio']
        logger.debug('k for Stage 3: %d', k)

        self._s3_model = knn.KNeighborsClassifier(
            n_neighbors=k,
            n_jobs=-1,
        )
        self._s3_model.fit(x, y)

        # compute the likelihood
        sample_size = int(np.floor(len(x) * sample_ratio))
        logger.debug('[AD Stage 3]: Size of train set: %d', sample_size)
        x_sub = np.random.permutation(x)[:sample_size]
        neigh_indices = self._s3_model.kneighbors(x_sub,
                                                  n_neighbors=k,
                                                  return_distance=False)
        neigh_labels = np.array(
            [self.y_train_np[n_i] for n_i in neigh_indices], dtype=np.int16)
        bins = np.zeros((sample_size, self.num_classes), dtype=np.float32)
        # Is there a vectorized way to compute the histogram?
        for i in range(sample_size):
            bins[i] = stats.relfreq(neigh_labels[i],
                                    numbins=self.num_classes,
                                    defaultreallimits=(0, self.num_classes -
                                                       1))[0]
        self._s3_likelihood = np.mean(np.amax(bins, axis=1))
        logger.debug('Train set likelihood: %f', self._s3_likelihood)
Example #5
def meq_relfreq(df_dict, colname, l_limit, h_limit, step, numbins=10):
    range_df = create_range(df_dict, colname, l_limit, h_limit, step)
    rangeX = np.arange(l_limit, h_limit, step)
    X = np.zeros((len(rangeX),numbins))
    for x in range(len(rangeX)):
        X[x] = rangeX[x]
    Y = []
    Z = []
    for x in rangeX: 
        relfreq, startpoint, binsize, extrap = stats.relfreq(
            range_df.loc[x].values, numbins=numbins,
            defaultreallimits=(min(range_df.loc[x]), max(range_df.loc[x])))
        Yline = [startpoint]
        Z.append(list(relfreq))
        for _ in range(1, len(relfreq)):
            next_y = Yline[-1] + binsize
            Yline.append(next_y)
        Y.append(Yline)
    Y = np.array(Y)
    Z = np.array(Z)
    fig = plt.figure()
    ax = fig.add_subplot(projection='3d')
    cset = ax.contourf(X, Y, Z, alpha=0.5)
    #ax.clabel(cset, fontsize=9, inline=1)
    ax.set_zlim3d(0, 1)
    plt.show()
    return (X, Y, Z)
Example #6
    def entropy_bin(self, points):
        fq = stats.relfreq(points,
                           numbins=100,
                           defaultreallimits=(np.amin(points),
                                              np.amax(points)))
        # print('entropy frequency', fq.frequency)
        return stats.entropy(fq.frequency, base=2)
Example #7
def entropy(data, lower, upper):
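    # plug-in entropy estimate over sqrt(N) bins; the max(..., 1/data_size)
    # floor keeps empty bins from producing 0 * log(0) = nan terms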
    data_size = len(data)
    num_bins = int(np.sqrt(data_size))
    s_dist = stats.relfreq(data, num_bins, (lower, upper))
    H = 0
    for i in range(num_bins):
        H += -s_dist.frequency[i] * np.log(
            max(s_dist.frequency[i], 1 / data_size))
    return H
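As a rough sanity check on this estimator, a uniform sample should give an entropy close to log(num_bins) (illustrative values only):

import numpy as np

rng = np.random.default_rng(0)
sample = rng.uniform(0, 1, size=10_000)
print(entropy(sample, 0, 1))              # roughly log(100) ~ 4.6
print(np.log(int(np.sqrt(len(sample)))))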
Example #8
def get_rel_freq(lst_len, title):
    res = stats.relfreq(lst_len, numbins=10)
    x = res.lowerlimit + np.linspace(0, res.binsize * res.frequency.size, res.frequency.size)
    fig = plt.figure(figsize=(5, 4))
    ax = fig.add_subplot(1, 1, 1)
    ax.bar(x, res.frequency, width=res.binsize)
    ax.set_title(title)
    ax.set_xlim([x.min(), x.max()])
    plt.savefig(target_dir_path + '/' + title + '.jpg')
Example #9
def get_hist(tuple_list):
    array = []
    for item in tuple_list:
        array.append(item[1])
    res = stats.relfreq(array, numbins=75)
    x = res.lowerlimit + np.linspace(0, res.binsize * res.frequency.size,
                                     res.frequency.size)
    fig = plt.figure(figsize=(6, 4))
    ax = fig.add_subplot(1, 1, 1)
    ax.bar(x, res.frequency, width=res.binsize)
    plt.show()
Example #10
def main(argv):

    with open(argv[0], 'r') as f:
        hyperparameters = json.load(f)

    dataset_meta_json_path = hyperparameters['dataset_meta_json_path']
    writer = SummaryWriter(log_dir='../runs/' +
                           hyperparameters['experiment_name'],
                           flush_secs=10)

    standardization = RapidECalibrationStandardize(dataset_meta_json_path)
    dataset = RapidECalibrationDataset(dataset_meta_json_path,
                                       transform=standardization)

    model = RapidEClassifier(
        number_of_classes=hyperparameters['num_of_classes'],
        dropout_rate=hyperparameters['dropout_rate'],
        name=hyperparameters['experiment_name'])

    # SET GPU
    if hyperparameters['gpu']:
        if torch.cuda.is_available():
            device = torch.device("cuda")
            model = torch.nn.DataParallel(model)
        else:
            device = torch.device("cpu")
            logging.warning('CUDA is not available on this machine')
    else:
        device = torch.device("cpu")
    model.to(device)

    # OBJECTIVE LOSS
    labels = dataset.gettargets()
    set_l = set(labels)
    freq = stats.relfreq(np.array(labels), numbins=len(set_l)).frequency
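    # inverse-frequency class weights, normalized to sum to 1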
    weights = torch.tensor((1 / freq) / np.sum(1 / freq), dtype=torch.float32)
    weights = weights.to(device)
    loss = nn.CrossEntropyLoss(weight=weights, reduction='sum')

    splitter = StratifiedSplitterForRapidECalibDataset(
        hyperparameters['num_of_folds'], dataset)

    # NESTED CROSS-VALIDATION
    nestedcrossvalidator = NestedCrossValidator(model=model,
                                                device=device,
                                                objectiveloss=loss,
                                                splitter=splitter,
                                                hyperparams=hyperparameters,
                                                tbwriter=writer)
    nestedcrossvalidator()
    writer.close()
Example #11
def EstDesc(datos):
    vals = stats.describe(datos)
    #valores.append(vals)

    freqs = stats.relfreq(datos, 100)
    #valores.append(freqs)

    print("Descriptive statistics: \n")
    print(vals)
    print("\nRelative frequencies: \n")
    print(freqs)
    freqrel = freqs[0]
    return freqrel
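A hypothetical call with synthetic data (the sample is made up):

datos = np.random.normal(0, 1, 500)
freqrel = EstDesc(datos)  # prints the summaries, returns the 100 relative frequencies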
Example #12
def probwin(data, window, nn, overlap, l_low, l_high):
    """
       Creates an array displaying the histogram of a variable over time
       Input: data - a 1D array
       window - int corresponding to size of window
       nn - number of bins in histogram, determines histogram's resolution
       overlap - number of overlapping frames between each window
       l_low, l_high - lower and upper limits of the histogram range
    """

    tt = time.time()

    N = data.shape[0]
    pdf_array = np.zeros((int(N / window), nn))
    vec_var = np.zeros((nn, ))
    time_var = np.array([i * window for i in range(int(N / window))])
    max_vec = np.zeros(int(N / window))
    skew_vec = np.zeros(int(N / window))
    min_lim = l_low
    max_lim = l_high
    s = 1 / 2 * (max_lim - min_lim) / int(N)
    for i in range(0, int(N / window) - 1):

        pdf1, a, b, c = stats.relfreq(data[i * window:(i + 1) * window], nn,
                                      (min_lim - s, max_lim + s))
        pdf2, low_lim, bin_s, c = stats.relfreq(
            data[(i + 1) * window - overlap:(i + 2) * window - overlap], nn,
            (min_lim - s, max_lim + s))

        pdf_array[i, :] = pdf1

        skew_vec[i] = np.nansum(
            (data[i * window:(i + 1) * window] -
             np.nanmean(data[i * window:(i + 1) * window]))**3) / window
        max_vec[i] = bin_s * np.argmax(pdf1) + low_lim

    vec_var = np.array([i * bin_s + low_lim for i in range(nn)])
    print(time.time() - tt)

    return skew_vec, max_vec, vec_var, time_var, pdf_array
Example #13
def init(F):
    A = []
    with open(F, encoding='utf-8') as f:
        data = np.loadtxt(f, str, delimiter=",")
        for i in data:
            A.append(float(i[2]))
    A = np.array(A)
    print(max(A))
    res = stats.relfreq(A, numbins=1000, defaultreallimits=(0, 2.5))
    x = res.lowerlimit + np.linspace(0, res.binsize * res.frequency.size,
                                     res.frequency.size)
    y = np.cumsum(res.frequency)
    return x, y
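Given a CSV whose third column is numeric, the returned arrays trace an empirical CDF (the filename is hypothetical, and matplotlib.pyplot imported as plt is assumed):

x, y = init('measurements.csv')
plt.plot(x, y)
plt.show()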
Example #14
def rel_pdf(self, df_dict, numbins=10):
    to_return = []
    for k, v in df_dict.items():
        # list.append() returns None, so build each entry explicitly
        to_return.append([k, stats.relfreq(v, numbins=numbins)])
    #plt.ion()
    #fig = plt.figure()
    #ax = fig.add_subplot(111, projection='3d')
    #X, Y, Z = axes3d.get_test_data(0.1)
    #ax.plot_wireframe(X, Y, Z, rstride=5, cstride=5)
    #for angle in range(0, 360):
    #    ax.view_init(30, angle)
    #plt.draw()
    return to_return
Example #15
def draw_hist_rel_frec(data, title):
    res = stats.relfreq(data,
                        numbins=150,
                        defaultreallimits=None,
                        weights=None)
    x = res.lowerlimit + np.linspace(0, res.binsize * res.frequency.size,
                                     res.frequency.size)
    fig = plt.figure(figsize=(30, 20))

    ax = fig.add_subplot(1, 1, 1)
    ax.bar(x, res.frequency, width=res.binsize)
    ax.set_title(title)
    ax.set_xlim([x.min(), x.max() + 5])
    plt.show()
Example #16
    def histMaker(self, data, nmax, limits, debug=0, caller=None):
        '''
        return a histogram of input data with nmax bins between limits.
        Overflows are put in the uppermost bin
        '''
        upperlimit = limits[1]
        trunc = numpy.minimum(data, numpy.ones(len(data)) * upperlimit)
        hist, lowerlimit, binsize, extrapoints = relfreq(trunc, nmax, limits)
        hist *= len(data)
        words = 'pmtcal.histMaker'
        if caller is not None: words = caller
        if debug > 0:
            print(words, 'rebinned result #bins,limits,lowerlimit, binsize, extrapoints', nmax, limits, lowerlimit, binsize, extrapoints)
        if extrapoints > 0:
            print(words, 'ERROR rebinning, extrapoints', extrapoints, '. It should be zero!')
        return hist
Example #17
def relativeFreq(data):
    '''
    plots a relative frequency histogram of the data
    '''
    a = np.array(data)  # convert to np array
    recounted = Counter(data)  # count how many colunns to have by making dict
    res = stats.relfreq(a, numbins=len(recounted))
    # res.frequency holds the relative frequency of each bin

    x = res.lowerlimit + np.linspace(0, res.binsize * res.frequency.size,
                                     res.frequency.size)
    fig = plt.pyplot.figure(figsize=(5, 4))
    ax = fig.add_subplot(1, 1, 1)
    ax.bar(x, res.frequency, width=res.binsize)
    ax.set_title('Relative frequency histogram')
    ax.set_xlim([x.min(), x.max()])
    plt.pyplot.show()
Example #18
    def __get_property_stats(self, property, bin_count=10):
        """
        Used to print statistical properties of a requested property, using a provided number of bins

        :param property: Property of data to plot
        :param bin_count: Number of bins to use (set automatically for str properties)
        """

        # --> Gather requested property
        property_lst = self.list_property(property)

        # str properties get one bin per distinct value
        if type(property_lst[0]) is str:
            bin_count = len(set(property_lst))

        # --> Get binned statistical properties
        if type(property_lst[0]) is str:
            property_lst.sort()

            binned_item_frequency = [
                len(list(group)) / len(property_lst)
                for key, group in groupby(property_lst)
            ]

            bin_labels = list(set(property_lst))
            bin_labels.sort()

        else:
            binned_item_frequency = relfreq(property_lst,
                                            numbins=bin_count).frequency
            bin_size = (max(property_lst) - min(property_lst)) / bin_count
            bin_labels = []

            tracker = min(property_lst)

            for _ in range(bin_count):
                bin_labels.append(
                    str(int(tracker)) + " <-> " + str(int(tracker + bin_size)))
                tracker += bin_size

        return property_lst, bin_count, bin_labels, binned_item_frequency
Example #19
    def def_stage3_(self, adv, pred_adv, passed):
        """
        Checking the class distribution of k nearest neighbours without predicting
        the inputs. Compute the likelihood using one-against-all approach.

        pred_adv : numpy.ndarray
            A dummy variable
        """
        passed_indices = np.where(passed == 1)[0]
        if len(passed_indices) == 0:
            return passed

        x = adv[passed_indices]
        kappa = self._params['kappa']
        k = int(self.num_classes * kappa)
        gamma = self._params['confidence']

        model = self._s3_model  # KNeighborsClassifier for entire train set
        neigh_indices = model.kneighbors(x,
                                         n_neighbors=k,
                                         return_distance=False)
        neigh_labels = np.array(
            [self.y_train_np[n_i] for n_i in neigh_indices], dtype=np.int16)
        bins = np.zeros((len(x), self.num_classes), dtype=np.float32)

        for i in range(len(x)):
            bins[i] = stats.relfreq(neigh_labels[i],
                                    numbins=self.num_classes,
                                    defaultreallimits=(0, self.num_classes -
                                                       1))[0]

        likelihood = np.amax(bins, axis=1)
        logger.debug('Mean likelihood on adv: %f', likelihood.mean())
        threshold = self._s3_likelihood * gamma
        blocked_indices = np.where(likelihood < threshold)[0]
        passed[blocked_indices] = 0

        return passed
Example #20
# Simulate 1,000 observations from a Binomial
# distribution with n = 5, prob = 0.5
np.random.seed(1234)

n, p, N = 5, 0.5, 1000
x = np.random.binomial(n=n, p=p, size=N)

# print the sample mean and variances
print(x.mean())
print(x.var())

# Print the first few simulated values
print(x[:10])

# Plot a histogram of simulated binomial observations
plt.hist(x)
plt.ylabel('Count')
plt.xlabel('$k$')
plt.title(r'Binomial with $n =$ {}, $\pi = ${}, and {} observations'.format(
    n, p, N))
plt.show()

# Plot a histogram of simulated binomial observations with relative frequency
y = stats.relfreq(x, numbins=6)
k = y.lowerlimit + np.linspace(0, y.binsize * y.frequency.size,
                               y.frequency.size)

plt.bar(k, y.frequency, width=y.binsize)
plt.ylabel('Relative frequency')
plt.xlabel('$k$')
plt.title(r'Binomial with $n =$ {}, $\pi = ${}, and {} observations'.format(
    n, p, N))
plt.show()
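For comparison, the theoretical pmf from scipy.stats can be overlaid on the empirical relative frequencies (a sketch reusing n and p from above):

pmf = stats.binom.pmf(np.arange(n + 1), n, p)
plt.plot(np.arange(n + 1), pmf, 'ro')  # theoretical probabilities
plt.show()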
Example #21
    def plot_energy_efficiency(
            self, UAV_trajectory_ris, UAV_trajectory_ris_no_shift,
            UAV_trajectory_no_ris, GT_schedule_ris, GT_schedule_ris_no_shift,
            GT_schedule_no_ris, UAV_flight_time_ris,
            UAV_flight_time_ris_no_shift, UAV_flight_time_no_ris, eps,
            slot_ris, slot_ris_no_shift, slot_no_ris):
        [Th_ris,
         rate_ris] = self.env.throughput(UAV_trajectory_ris,
                                         UAV_flight_time_ris, GT_schedule_ris,
                                         eps, 1, 1, slot_ris)
        [Th_ris_no_shift, rate_ris_no_shift] = self.env.throughput(
            UAV_trajectory_ris_no_shift, UAV_flight_time_ris_no_shift,
            GT_schedule_ris_no_shift, eps, 1, 0, slot_ris_no_shift)
        [Th_no_ris, rate_no_ris
         ] = self.env.throughput(UAV_trajectory_no_ris, UAV_flight_time_no_ris,
                                 GT_schedule_no_ris, eps, 0, 0, slot_no_ris)
        PEnergy_ris = self.env.flight_energy(UAV_trajectory_ris,
                                             UAV_flight_time_ris, eps,
                                             slot_ris)
        PEnergy_ris_shift = self.env.flight_energy(
            UAV_trajectory_ris_no_shift, UAV_flight_time_ris_no_shift, eps,
            slot_ris_no_shift)
        PEnergy_no_ris = self.env.flight_energy(UAV_trajectory_no_ris,
                                                UAV_flight_time_no_ris, eps,
                                                slot_no_ris)

        plot_ee = np.zeros((3, eps), dtype=float)
        for i in range(eps):
            plot_ee[0, i] = 1000 * np.sum(Th_ris[i, :]) / np.sum(
                PEnergy_ris[i, :])
            plot_ee[1, i] = 1000 * np.sum(Th_ris_no_shift[i, :]) / np.sum(
                PEnergy_ris_shift[i, :])
            plot_ee[2, i] = 1000 * np.sum(Th_no_ris[i, :]) / np.sum(
                PEnergy_no_ris[i, :])

        myfont = matplotlib.font_manager.FontProperties(
            fname=
            r"/usr/local/lib/python2.7/site-packages/matplotlib/mpl-data/fonts/ttf/SimHei.ttf"
        )
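        # empirical CDF: cumulative sum of the per-bin relative frequencies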
        res_1 = stats.relfreq(plot_ee[0, :], numbins=25)
        x_1 = res_1.lowerlimit + np.linspace(
            0, res_1.binsize * res_1.frequency.size, res_1.frequency.size)
        y_1 = np.cumsum(res_1.frequency)
        res_2 = stats.relfreq(plot_ee[1, :], numbins=25)
        x_2 = res_2.lowerlimit + np.linspace(
            0, res_2.binsize * res_2.frequency.size, res_2.frequency.size)
        y_2 = np.cumsum(res_2.frequency)
        res_3 = stats.relfreq(plot_ee[2, :], numbins=25)
        x_3 = res_3.lowerlimit + np.linspace(
            0, res_3.binsize * res_3.frequency.size, res_3.frequency.size)
        y_3 = np.cumsum(res_3.frequency)

        plt.plot(x_1,
                 y_1,
                 c='g',
                 linestyle='-',
                 marker='<',
                 label=u"RIS-Assisted UAV")
        plt.plot(x_2, y_2, c='b', linestyle='-', marker='>', label=u"UAV-R/P")
        plt.plot(x_3, y_3, c='r', linestyle='-', marker='o', label=u"UAV/R")
        #plt.plot(range(eps), plot_ee[0,:].T, c='r',linestyle='-', marker='<', label=u"RIS-Assisted UAV")
        #plt.plot(range(eps), plot_ee[1,:].T, c='b', linestyle='-', marker='>',label=u"RIS-Assisted UAV without passive shift")
        #plt.plot(range(eps), plot_ee[2,:].T, c='g',linestyle='-', marker='o', label=u"UAV system without RIS")
        font = {
            'family': 'Times New Roman',
            'weight': 'normal',
            'size': 12,
        }

        plt.xlabel(u'Energy-Efficiency(bits/J)', font)
        plt.ylabel(u'CDF', font)
        plt.legend(prop=font)
        plt.grid()
        plt.savefig('EE.eps')
        plt.show()

        ave_ris = np.sum(plot_ee[0, :]) / eps
        ave_ris_no_shift = np.sum(plot_ee[1, :]) / eps
        ave_no_ris = np.sum(plot_ee[2, :]) / eps
        print("Energy efficieny: : RIS:%f;RIS_NO_SHIFT:%f;NO_RIS:%f" %
              (ave_ris, ave_ris_no_shift, ave_no_ris))
        return
Example #22
def entropy(list_ecg, numbins=100, base=None):
    """ numbins : number of bins to use for the histogram
    base : base of the log for entropy calculation """
    return stats.entropy(
        stats.relfreq(list_ecg, numbins).frequency, None, base)
Example #23
hist_perc, bin_edges_perc = np.histogram(iris_targets, bins=3, density=True)
print('np.histogram', hist)
print('np.histogram', hist_perc, hist_perc * np.diff(bin_edges_perc))

# https://numpy.org/doc/stable/reference/generated/numpy.unique.html
# similar to bincount, but you may also use a string array
# good to parse a string array into an int array
# may also return frequency
targets, index_1st_occurrence, target_array, target_frequency = np.unique(
    data['target'], return_index=True, return_inverse=True, return_counts=True)
print('np.unique', targets, index_1st_occurrence, target_array,
      target_frequency)

# https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.relfreq.html
# similar to bincount, but returns the fraction of observations per bin instead of counts
frequency, lower_limit, binsize, extra_points = stats.relfreq(iris_targets,
                                                              numbins=3)
print('sp.stats.relfreq', frequency, lower_limit, binsize, extra_points)

print('\nMean, Median, Mode, Quantile, Percentile')
mean = np.mean(data['sepal length'])
print('np.mean', mean)

trimmed_mean = stats.trim_mean(data['sepal length'], .25)
print('sp.stats.trim_mean (25%)', trimmed_mean)

median = np.median(data['sepal length'])
print('np.median', median)

quantile = np.quantile(data['sepal length'], .25)
print('np.quantile 1st (25%)', quantile)
Example #24
    def plot_propulsion_energy(self, UAV_trajectory_ris,
                               UAV_trajectory_ris_no_shift,
                               UAV_trajectory_no_ris, UAV_flight_time_ris,
                               UAV_flight_time_ris_no_shift,
                               UAV_flight_time_no_ris, eps, slot_ris,
                               slot_ris_no_shift, slot_no_ris):
        PEnergy_ris = self.env.flight_energy(UAV_trajectory_ris,
                                             UAV_flight_time_ris, eps,
                                             slot_ris)
        PEnergy_ris_no_shift = self.env.flight_energy(
            UAV_trajectory_ris_no_shift, UAV_flight_time_ris_no_shift, eps,
            slot_ris_no_shift)
        PEnergy_no_ris = self.env.flight_energy(UAV_trajectory_no_ris,
                                                UAV_flight_time_no_ris, eps,
                                                slot_no_ris)

        plot_energy = np.zeros((3, eps), dtype=float)
        for i in range(eps):
            plot_energy[0,
                        i] = plot_energy[0,
                                         i] + np.sum(PEnergy_ris[i, :]) / 1000
            plot_energy[1, i] = plot_energy[1, i] + np.sum(
                PEnergy_ris_no_shift[i, :]) / 1000
            plot_energy[
                2, i] = plot_energy[2, i] + np.sum(PEnergy_no_ris[i, :]) / 1000

        myfont = matplotlib.font_manager.FontProperties(
            fname=
            r"/usr/local/lib/python2.7/site-packages/matplotlib/mpl-data/fonts/ttf/SimHei.ttf"
        )

        res_1 = stats.relfreq(plot_energy[0, :], numbins=25)
        x_1 = res_1.lowerlimit + np.linspace(
            0, res_1.binsize * res_1.frequency.size, res_1.frequency.size)
        y_1 = np.cumsum(res_1.frequency)
        res_2 = stats.relfreq(plot_energy[1, :], numbins=25)
        x_2 = res_2.lowerlimit + np.linspace(
            0, res_2.binsize * res_2.frequency.size, res_2.frequency.size)
        y_2 = np.cumsum(res_2.frequency)
        res_3 = stats.relfreq(plot_energy[2, :], numbins=25)
        x_3 = res_3.lowerlimit + np.linspace(
            0, res_3.binsize * res_3.frequency.size, res_3.frequency.size)
        y_3 = np.cumsum(res_3.frequency)

        plt.plot(x_1,
                 y_1,
                 c='g',
                 linestyle='-',
                 marker='<',
                 label=u"RIS-Assisted UAV")
        plt.plot(x_2, y_2, c='b', linestyle='-', marker='>', label=u"UAV-R/P")
        plt.plot(x_3, y_3, c='r', linestyle='-', marker='o', label=u"UAV/R")
        #plt.plot(np.arange(eps), plot_energy[0,:].T, c='r', linestyle='-', marker='<',label=u"RIS-Assisted UAV")
        #plt.plot(np.arange(eps), plot_energy[1,:].T, c='b', linestyle='-', marker='>',label=u"RIS-Assisted UAV without passive shift")
        #plt.plot(np.arange(eps), plot_energy[2,:].T, c='g', linestyle='-', marker='o',label=u"UAV system without RIS")

        font = {
            'family': 'Times New Roman',
            'weight': 'normal',
            'size': 12,
        }

        plt.xlabel(u'Propulsion Energy(KJ)', font)
        plt.ylabel(u'CDF', font)
        plt.legend(prop=font)
        plt.grid()
        plt.savefig('PE.eps')
        plt.show()

        sum_ris = np.sum(plot_energy[0, :]) / eps
        sum_ris_no_shift = np.sum(plot_energy[1, :]) / eps
        sum_no_ris = np.sum(plot_energy[2, :]) / eps
        print("Propulsion Energy: RIS:%f;RIS_NO_SHIFT:%f;NO_RIS:%f" %
              (sum_ris, sum_ris_no_shift, sum_no_ris))
        return
Example #25
pdf['day_of_week'] = pdf['Date'].apply(
    lambda x: x.weekday())  # get the weekday index, between 0 and 6
pdf['day_of_week'] = pdf['day_of_week'].apply(lambda x: calendar.day_name[x])

fig, ax = plt.subplots()
pdf.groupby(['Date']).count()['Item'].plot(ax=ax)
ax.set_title('Sales by day')

# Part B Describe the customer
# How many times do they sell every item?
item_sold = pdf.groupby(['Item']).count()
item_sold = item_sold.drop(columns=['Time', 'DateTime', 'Date', 'day_of_week'])
print(item_sold)

# What is the relative frequency of sales? (graph)
res = stats.relfreq(pdf.Transaction, numbins=9684)
res.frequency
x = res.lowerlimit + np.linspace(0, res.binsize * res.frequency.size,
                                 res.frequency.size)

fig = plt.figure(figsize=(9, 7))
ax = fig.add_subplot(1, 1, 1)
ax.bar(x, res.frequency, width=res.binsize)
ax.set_title('Relative frequency histogram')
plt.show()

# How often do people buy tea with coffee? How about Coffee and croissant?  Coffee and something from the bakery?
cof = pdf.loc[pdf['Item'] == 'Coffee']
tea = pdf.loc[pdf['Item'] == 'Tea']
cro = pdf.loc[pdf['Item'] == 'Croissant']
Example #26
from scipy import stats
import matplotlib.pyplot as plot

# Loads information from a Json file
input_file = open('Ch1Data.json')
data = json.load(input_file)
assign_name = 'Ch1.4'

# Parse the Ch1.4 Data into a local array
values = data[assign_name]

# stats.relfreq bins the values into the requested number of bins and
# returns the relative frequency (fraction of observations) of each bin.
histogram = stats.relfreq(values, numbins=10)
# sanity check: the relative frequencies sum to 1
numpy.sum(histogram.frequency)

x = histogram.lowerlimit + numpy.linspace(
    0, histogram.binsize * histogram.frequency.size, histogram.frequency.size)

# Configure the histogram that will be shown.
fig = plot.figure(figsize=(5, 4))
ax = fig.add_subplot(1, 1, 1)
# Creates the bars used in the Histogram.
ax.bar(x, histogram.frequency, width=histogram.binsize)
ax.set_title('Relative frequency histogram for {}'.format(assign_name))
ax.set_xlim([x.min(), x.max()])

mean = "mean     : {}".format(numpy.mean(values))
variance = "variance : {}".format(numpy.var(values, ddof=1))
Example #27
# compute histogram
n, low_range, binsize, extrapoints = st.histogram(x)
upper_range  = low_range+binsize*len(n)
bins = np.linspace(low_range, upper_range, len(n)+1)
#bins = 0.5*(bins[:-1] + bins[1:])

# plot the histogram
plt.clf()
plt.bar(bins[:-1], n, width=0.4, color='red')
plt.xlabel('X', fontsize=20)
plt.ylabel('number of data points in the bin', fontsize=15)
plt.savefig('/home/tomer/my_books/python_in_hydrology/images/hist.png')

# compute and plot the relfreq
relfreqs, lowlim, binsize, extrapoints = st.relfreq(x)
plt.clf()
plt.bar(bins[:-1], relfreqs, width=0.4, color='magenta')
plt.xlabel('X', fontsize=20)
plt.ylabel('Relative frequencies', fontsize=15)
plt.savefig('/home/tomer/my_books/python_in_hydrology/images/relfreq.png')

# compute and plot pdf
plt.clf()
n, bins, patches = plt.hist(x, 10, normed=1, facecolor='yellow', alpha=0.5)
plt.xlabel('X', fontsize=15)
plt.ylabel('PDF', fontsize=15)
plt.savefig('/home/tomer/my_books/python_in_hydrology/images/pdf.png')

# compute and plot cdf
cumfreqs, lowlim, binsize, extrapoints = st.cumfreq(x)
Example #28
              bins=bins[i],
              color='gray',
              hist_kws={'edgecolor': 'black'},
              kde_kws={
                  'linewidth': 1,
                  'color': 'blue'
              },
              ax=ax)
 ax.set_xlim((-1.1 * span, 1.1 * span))
 ax.set_ylim((0, 0.05))
 ax.set_ylabel('Density')
 ax.set_xlabel(f'Change in PIP ({bins[i]} bins)')
 x = np.linspace(-span, span, 100)
 ax.plot(x, stats.norm.pdf(x, mu, sigma), color='red')
 qqplot(np.array(values), line='s', ax=axs[1, i], marker='.', color='gray')
 h = stats.relfreq(values, numbins=bins[i])
 xp = h.lowerlimit + np.linspace(0, h.binsize * h.frequency.size,
                                 h.frequency.size)
 ax = axs[2, i]
 y = []
 x = []
 for j in range(len(xp)):
     v = h.frequency[j]
     if v >= threshold:
         y.append(v)
         x.append(xp[j])
 ax.set_xlim((-1.1 * span, 1.1 * span))
 ax.set_ylim((0, 0.7))
 ax.set_ylabel('Relative frequency')
Example #29
            "cases", "infection", 'maskenpflicht', 'stayhomestaysafe',
            'stayhome', 'distancing', 'covidiots'
        ]
        abstimmung = [
            "kampfjets", 'armee', 'begrenzungsinitiative', 'svp', 'srf',
            'initiative', 'abstimmung', 'streik'
        ]

        cleaned_tweets = keywordTweets(abstimmung, tweets)

        #print(cleaned_tweets)

        for i, data in enumerate(scores):
            overall = sum(data)

            d = stats.relfreq(data, numbins=20)
            #print(d)
            #plt.bar(np.arange(len(data)), d)
            #df = pd.DataFrame(data)
            #df.plot.hist(bins=20)

            #plt.plot(d.frequency)
            #plt.savefig(f'{i}.png')

        with open('TweetAnalysis/cleaned_tweets_en.json', 'w') as f:
            f.write(json.dumps(cleaned_tweets))

    if False:
        tweets = tweet_query()
        with open(f'TweetAnalysis/tweets.json', 'w', encoding='utf-8') as f:
            json.dump(tweets, f, ensure_ascii=False)
Example #30
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--infile", required=True, help="Tabular file.")
    parser.add_argument("-o", "--outfile", required=True, help="Path to the output file.")
    parser.add_argument("--sample_one_cols", help="Input format, like smi, sdf, inchi")
    parser.add_argument("--sample_two_cols", help="Input format, like smi, sdf, inchi")
    parser.add_argument("--sample_cols", help="Input format, like smi, sdf, inchi,separate arrays using ;")
    parser.add_argument("--test_id", help="statistical test method")
    parser.add_argument(
        "--mwu_use_continuity",
        action="store_true",
        default=False,
        help="Whether a continuity correction (1/2.) should be taken into account.",
    )
    parser.add_argument(
        "--equal_var",
        action="store_true",
        default=False,
        help="If set perform a standard independent 2 sample test that assumes equal population variances. If not set, perform Welch's t-test, which does not assume equal population variance.",
    )
    parser.add_argument(
        "--reta", action="store_true", default=False, help="Whether or not to return the internally computed a values."
    )
    parser.add_argument("--fisher", action="store_true", default=False, help="if true then Fisher definition is used")
    parser.add_argument(
        "--bias",
        action="store_true",
        default=False,
        help="if false,then the calculations are corrected for statistical bias",
    )
    parser.add_argument("--inclusive1", action="store_true", default=False, help="if false,lower_limit will be ignored")
    parser.add_argument(
        "--inclusive2", action="store_true", default=False, help="if false,higher_limit will be ignored"
    )
    parser.add_argument("--inclusive", action="store_true", default=False, help="if false,limit will be ignored")
    parser.add_argument(
        "--printextras",
        action="store_true",
        default=False,
        help="If True, if there are extra points a warning is raised saying how many of those points there are",
    )
    parser.add_argument(
        "--initial_lexsort",
        action="store_true",
        default="False",
        help="Whether to use lexsort or quicksort as the sorting method for the initial sort of the inputs.",
    )
    parser.add_argument("--correction", action="store_true", default=False, help="continuity correction ")
    parser.add_argument(
        "--axis",
        type=int,
        default=0,
        help="Axis can equal None (ravel array first), or an integer (the axis over which to operate on a and b)",
    )
    parser.add_argument(
        "--n",
        type=int,
        default=0,
        help="the number of trials. This is ignored if x gives both the number of successes and failures",
    )
    parser.add_argument("--b", type=int, default=0, help="The number of bins to use for the histogram")
    parser.add_argument("--N", type=int, default=0, help="Score that is compared to the elements in a.")
    parser.add_argument("--ddof", type=int, default=0, help="Degrees of freedom correction")
    parser.add_argument("--score", type=int, default=0, help="Score that is compared to the elements in a.")
    parser.add_argument("--m", type=float, default=0.0, help="limits")
    parser.add_argument("--mf", type=float, default=2.0, help="lower limit")
    parser.add_argument("--nf", type=float, default=99.9, help="higher_limit")
    parser.add_argument(
        "--p",
        type=float,
        default=0.5,
        help="The hypothesized probability of success. 0 <= p <= 1. The default value is p = 0.5",
    )
    parser.add_argument("--alpha", type=float, default=0.9, help="probability")
    parser.add_argument("--new", type=float, default=0.0, help="Value to put in place of values in a outside of bounds")
    parser.add_argument(
        "--proportiontocut",
        type=float,
        default=0.0,
        help="Proportion (in range 0-1) of total data set to trim of each end.",
    )
    parser.add_argument(
        "--lambda_",
        type=float,
        default=1.0,
        help="lambda_ gives the power in the Cressie-Read power divergence statistic",
    )
    parser.add_argument(
        "--imbda",
        type=float,
        default=0,
        help="If lmbda is not None, do the transformation for that value.If lmbda is None, find the lambda that maximizes the log-likelihood function and return it as the second output argument.",
    )
    parser.add_argument("--base", type=float, default=1.6, help="The logarithmic base to use, defaults to e")
    parser.add_argument("--dtype", help="dtype")
    parser.add_argument("--med", help="med")
    parser.add_argument("--cdf", help="cdf")
    parser.add_argument("--zero_method", help="zero_method options")
    parser.add_argument("--dist", help="dist options")
    parser.add_argument("--ties", help="ties options")
    parser.add_argument("--alternative", help="alternative options")
    parser.add_argument("--mode", help="mode options")
    parser.add_argument("--method", help="method options")
    parser.add_argument("--md", help="md options")
    parser.add_argument("--center", help="center options")
    parser.add_argument("--kind", help="kind options")
    parser.add_argument("--tail", help="tail options")
    parser.add_argument("--interpolation", help="interpolation options")
    parser.add_argument("--statistic", help="statistic options")

    args = parser.parse_args()
    infile = args.infile
    outfile = open(args.outfile, "w+")
    test_id = args.test_id
    nf = args.nf
    mf = args.mf
    imbda = args.imbda
    inclusive1 = args.inclusive1
    inclusive2 = args.inclusive2
    sample0 = 0
    sample1 = 0
    sample2 = 0
    if args.sample_cols is not None:
        sample0 = 1
        barlett_samples = []
        for sample in args.sample_cols.split(";"):
            barlett_samples.append(map(int, sample.split(",")))
    if args.sample_one_cols is not None:
        sample1 = 1
        sample_one_cols = args.sample_one_cols.split(",")
    if args.sample_two_cols is not None:
        sample_two_cols = args.sample_two_cols.split(",")
        sample2 = 1
    for line in open(infile):
        sample_one = []
        sample_two = []
        cols = line.strip().split("\t")
        if sample0 == 1:
            b_samples = columns_to_values(barlett_samples, line)
        if sample1 == 1:
            for index in sample_one_cols:
                sample_one.append(cols[int(index) - 1])
        if sample2 == 1:
            for index in sample_two_cols:
                sample_two.append(cols[int(index) - 1])
        if test_id.strip() == "describe":
            size, min_max, mean, uv, bs, bk = stats.describe(map(float, sample_one))
            cols.append(size)
            cols.append(min_max)
            cols.append(mean)
            cols.append(uv)
            cols.append(bs)
            cols.append(bk)
        elif test_id.strip() == "mode":
            vals, counts = stats.mode(map(float, sample_one))
            cols.append(vals)
            cols.append(counts)
        elif test_id.strip() == "nanmean":
            m = stats.nanmean(map(float, sample_one))
            cols.append(m)
        elif test_id.strip() == "nanmedian":
            m = stats.nanmedian(map(float, sample_one))
            cols.append(m)
        elif test_id.strip() == "kurtosistest":
            z_value, p_value = stats.kurtosistest(map(float, sample_one))
            cols.append(z_value)
            cols.append(p_value)
        elif test_id.strip() == "variation":
            ra = stats.variation(map(float, sample_one))
            cols.append(ra)
        elif test_id.strip() == "itemfreq":
            freq = stats.itemfreq(map(float, sample_one))
            for list in freq:
                elements = ",".join(map(str, list))
                cols.append(elements)
        elif test_id.strip() == "nanmedian":
            m = stats.nanmedian(map(float, sample_one))
            cols.append(m)
        elif test_id.strip() == "variation":
            ra = stats.variation(map(float, sample_one))
            cols.append(ra)
        elif test_id.strip() == "boxcox_llf":
            IIf = stats.boxcox_llf(imbda, map(float, sample_one))
            cols.append(IIf)
        elif test_id.strip() == "tiecorrect":
            fa = stats.tiecorrect(map(float, sample_one))
            cols.append(fa)
        elif test_id.strip() == "rankdata":
            r = stats.rankdata(map(float, sample_one), method=args.md)
            cols.append(r)
        elif test_id.strip() == "nanstd":
            s = stats.nanstd(map(float, sample_one), bias=args.bias)
            cols.append(s)
        elif test_id.strip() == "anderson":
            A2, critical, sig = stats.anderson(map(float, sample_one), dist=args.dist)
            cols.append(A2)
            for list in critical:
                cols.append(list)
            cols.append(",")
            for list in sig:
                cols.append(list)
        elif test_id.strip() == "binom_test":
            p_value = stats.binom_test(map(float, sample_one), n=args.n, p=args.p)
            cols.append(p_value)
        elif test_id.strip() == "gmean":
            gm = stats.gmean(map(float, sample_one), dtype=args.dtype)
            cols.append(gm)
        elif test_id.strip() == "hmean":
            hm = stats.hmean(map(float, sample_one), dtype=args.dtype)
            cols.append(hm)
        elif test_id.strip() == "kurtosis":
            k = stats.kurtosis(map(float, sample_one), axis=args.axis, fisher=args.fisher, bias=args.bias)
            cols.append(k)
        elif test_id.strip() == "moment":
            n_moment = stats.moment(map(float, sample_one), n=args.n)
            cols.append(n_moment)
        elif test_id.strip() == "normaltest":
            k2, p_value = stats.normaltest(map(float, sample_one))
            cols.append(k2)
            cols.append(p_value)
        elif test_id.strip() == "skew":
            skewness = stats.skew(map(float, sample_one), bias=args.bias)
            cols.append(skewness)
        elif test_id.strip() == "skewtest":
            z_value, p_value = stats.skewtest(map(float, sample_one))
            cols.append(z_value)
            cols.append(p_value)
        elif test_id.strip() == "sem":
            s = stats.sem(map(float, sample_one), ddof=args.ddof)
            cols.append(s)
        elif test_id.strip() == "zscore":
            z = stats.zscore(map(float, sample_one), ddof=args.ddof)
            for list in z:
                cols.append(list)
        elif test_id.strip() == "signaltonoise":
            s2n = stats.signaltonoise(map(float, sample_one), ddof=args.ddof)
            cols.append(s2n)
        elif test_id.strip() == "percentileofscore":
            p = stats.percentileofscore(map(float, sample_one), score=args.score, kind=args.kind)
            cols.append(p)
        elif test_id.strip() == "bayes_mvs":
            c_mean, c_var, c_std = stats.bayes_mvs(map(float, sample_one), alpha=args.alpha)
            cols.append(c_mean)
            cols.append(c_var)
            cols.append(c_std)
        elif test_id.strip() == "sigmaclip":
            c, c_low, c_up = stats.sigmaclip(map(float, sample_one), low=args.m, high=args.n)
            cols.append(c)
            cols.append(c_low)
            cols.append(c_up)
        elif test_id.strip() == "kstest":
            d, p_value = stats.kstest(
                map(float, sample_one), cdf=args.cdf, N=args.N, alternative=args.alternative, mode=args.mode
            )
            cols.append(d)
            cols.append(p_value)
        elif test_id.strip() == "chi2_contingency":
            chi2, p, dof, ex = stats.chi2_contingency(
                map(float, sample_one), correction=args.correction, lambda_=args.lambda_
            )
            cols.append(chi2)
            cols.append(p)
            cols.append(dof)
            cols.append(ex)
        elif test_id.strip() == "tmean":
            if nf == 0 and mf == 0:
                mean = stats.tmean(map(float, sample_one))
            else:
                mean = stats.tmean(map(float, sample_one), (mf, nf), (inclusive1, inclusive2))
            cols.append(mean)
        elif test_id.strip() == "tmin":
            if mf == 0:
                min = stats.tmin(map(float, sample_one))
            else:
                min = stats.tmin(map(float, sample_one), lowerlimit=mf, inclusive=args.inclusive)
            cols.append(min)
        elif test_id.strip() == "tmax":
            if nf == 0:
                max = stats.tmax(map(float, sample_one))
            else:
                max = stats.tmax(map(float, sample_one), upperlimit=nf, inclusive=args.inclusive)
            cols.append(max)
        elif test_id.strip() == "tvar":
            if nf == 0 and mf == 0:
                var = stats.tvar(map(float, sample_one))
            else:
                var = stats.tvar(map(float, sample_one), (mf, nf), (inclusive1, inclusive2))
            cols.append(var)
        elif test_id.strip() == "tstd":
            if nf == 0 and mf == 0:
                std = stats.tstd(map(float, sample_one))
            else:
                std = stats.tstd(map(float, sample_one), (mf, nf), (inclusive1, inclusive2))
            cols.append(std)
        elif test_id.strip() == "tsem":
            if nf == 0 and mf == 0:
                s = stats.tsem(map(float, sample_one))
            else:
                s = stats.tsem(map(float, sample_one), (mf, nf), (inclusive1, inclusive2))
            cols.append(s)
        elif test_id.strip() == "scoreatpercentile":
            if nf == 0 and mf == 0:
                s = stats.scoreatpercentile(
                    map(float, sample_one), map(float, sample_two), interpolation_method=args.interpolation
                )
            else:
                s = stats.scoreatpercentile(
                    map(float, sample_one), map(float, sample_two), (mf, nf), interpolation_method=args.interpolation
                )
            for list in s:
                cols.append(list)
        elif test_id.strip() == "relfreq":
            if nf == 0 and mf == 0:
                rel, low_range, binsize, ex = stats.relfreq(map(float, sample_one), args.b)
            else:
                rel, low_range, binsize, ex = stats.relfreq(map(float, sample_one), args.b, (mf, nf))
            for list in rel:
                cols.append(list)
            cols.append(low_range)
            cols.append(binsize)
            cols.append(ex)
        elif test_id.strip() == "binned_statistic":
            if nf == 0 and mf == 0:
                st, b_edge, b_n = stats.binned_statistic(
                    map(float, sample_one), map(float, sample_two), statistic=args.statistic, bins=args.b
                )
            else:
                st, b_edge, b_n = stats.binned_statistic(
                    map(float, sample_one),
                    map(float, sample_two),
                    statistic=args.statistic,
                    bins=args.b,
                    range=(mf, nf),
                )
            cols.append(st)
            cols.append(b_edge)
            cols.append(b_n)
        elif test_id.strip() == "threshold":
            if nf == 0 and mf == 0:
                o = stats.threshold(map(float, sample_one), newval=args.new)
            else:
                o = stats.threshold(map(float, sample_one), mf, nf, newval=args.new)
            for list in o:
                cols.append(list)
        elif test_id.strip() == "trimboth":
            o = stats.trimboth(map(float, sample_one), proportiontocut=args.proportiontocut)
            for list in o:
                cols.append(list)
        elif test_id.strip() == "trim1":
            t1 = stats.trim1(map(float, sample_one), proportiontocut=args.proportiontocut, tail=args.tail)
            for list in t1:
                cols.append(list)
        elif test_id.strip() == "histogram":
            if nf == 0 and mf == 0:
                hi, low_range, binsize, ex = stats.histogram(map(float, sample_one), args.b)
            else:
                hi, low_range, binsize, ex = stats.histogram(map(float, sample_one), args.b, (mf, nf))
            cols.append(hi)
            cols.append(low_range)
            cols.append(binsize)
            cols.append(ex)
        elif test_id.strip() == "cumfreq":
            if nf == 0 and mf == 0:
                cum, low_range, binsize, ex = stats.cumfreq(map(float, sample_one), args.b)
            else:
                cum, low_range, binsize, ex = stats.cumfreq(map(float, sample_one), args.b, (mf, nf))
            cols.append(cum)
            cols.append(low_range)
            cols.append(binsize)
            cols.append(ex)
        elif test_id.strip() == "boxcox_normmax":
            if nf == 0 and mf == 0:
                ma = stats.boxcox_normmax(map(float, sample_one))
            else:
                ma = stats.boxcox_normmax(map(float, sample_one), (mf, nf), method=args.method)
            cols.append(ma)
        elif test_id.strip() == "boxcox":
            if imbda == 0:
                box, ma, ci = stats.boxcox(map(float, sample_one), alpha=args.alpha)
                cols.append(box)
                cols.append(ma)
                cols.append(ci)
            else:
                box = stats.boxcox(map(float, sample_one), imbda, alpha=args.alpha)
                cols.append(box)
        elif test_id.strip() == "histogram2":
            h2 = stats.histogram2(map(float, sample_one), map(float, sample_two))
            for list in h2:
                cols.append(list)
        elif test_id.strip() == "ranksums":
            z_statistic, p_value = stats.ranksums(map(float, sample_one), map(float, sample_two))
            cols.append(z_statistic)
            cols.append(p_value)
        elif test_id.strip() == "ttest_1samp":
            t, prob = stats.ttest_1samp(map(float, sample_one), map(float, sample_two))
            for list in t:
                cols.append(list)
            for list in prob:
                cols.append(list)
        elif test_id.strip() == "ansari":
            AB, p_value = stats.ansari(map(float, sample_one), map(float, sample_two))
            cols.append(AB)
            cols.append(p_value)
        elif test_id.strip() == "linregress":
            slope, intercept, r_value, p_value, stderr = stats.linregress(
                map(float, sample_one), map(float, sample_two)
            )
            cols.append(slope)
            cols.append(intercept)
            cols.append(r_value)
            cols.append(p_value)
            cols.append(stderr)
        elif test_id.strip() == "pearsonr":
            cor, p_value = stats.pearsonr(map(float, sample_one), map(float, sample_two))
            cols.append(cor)
            cols.append(p_value)
        elif test_id.strip() == "pointbiserialr":
            r, p_value = stats.pointbiserialr(map(float, sample_one), map(float, sample_two))
            cols.append(r)
            cols.append(p_value)
        elif test_id.strip() == "ks_2samp":
            d, p_value = stats.ks_2samp(map(float, sample_one), map(float, sample_two))
            cols.append(d)
            cols.append(p_value)
        elif test_id.strip() == "mannwhitneyu":
            mw_stats_u, p_value = stats.mannwhitneyu(
                map(float, sample_one), map(float, sample_two), use_continuity=args.mwu_use_continuity
            )
            cols.append(mw_stats_u)
            cols.append(p_value)
        elif test_id.strip() == "zmap":
            z = stats.zmap(map(float, sample_one), map(float, sample_two), ddof=args.ddof)
            for list in z:
                cols.append(list)
        elif test_id.strip() == "ttest_ind":
            mw_stats_u, p_value = stats.ttest_ind(
                map(float, sample_one), map(float, sample_two), equal_var=args.equal_var
            )
            cols.append(mw_stats_u)
            cols.append(p_value)
        elif test_id.strip() == "ttest_rel":
            t, prob = stats.ttest_rel(map(float, sample_one), map(float, sample_two), axis=args.axis)
            cols.append(t)
            cols.append(prob)
        elif test_id.strip() == "mood":
            z, p_value = stats.mood(map(float, sample_one), map(float, sample_two), axis=args.axis)
            cols.append(z)
            cols.append(p_value)
        elif test_id.strip() == "shapiro":
            W, p_value, a = stats.shapiro(map(float, sample_one), map(float, sample_two), args.reta)
            cols.append(W)
            cols.append(p_value)
            for list in a:
                cols.append(list)
        elif test_id.strip() == "kendalltau":
            k, p_value = stats.kendalltau(
                map(float, sample_one), map(float, sample_two), initial_lexsort=args.initial_lexsort
            )
            cols.append(k)
            cols.append(p_value)
        elif test_id.strip() == "entropy":
            s = stats.entropy(map(float, sample_one), map(float, sample_two), base=args.base)
            cols.append(s)
        elif test_id.strip() == "spearmanr":
            if sample2 == 1:
                rho, p_value = stats.spearmanr(map(float, sample_one), map(float, sample_two))
            else:
                rho, p_value = stats.spearmanr(map(float, sample_one))
            cols.append(rho)
            cols.append(p_value)
        elif test_id.strip() == "wilcoxon":
            if sample2 == 1:
                T, p_value = stats.wilcoxon(
                    map(float, sample_one),
                    map(float, sample_two),
                    zero_method=args.zero_method,
                    correction=args.correction,
                )
            else:
                T, p_value = stats.wilcoxon(
                    map(float, sample_one), zero_method=args.zero_method, correction=args.correction
                )
            cols.append(T)
            cols.append(p_value)
        elif test_id.strip() == "chisquare":
            if sample2 == 1:
                rho, p_value = stats.chisquare(map(float, sample_one), map(float, sample_two), ddof=args.ddof)
            else:
                rho, p_value = stats.chisquare(map(float, sample_one), ddof=args.ddof)
            cols.append(rho)
            cols.append(p_value)
        elif test_id.strip() == "power_divergence":
            if sample2 == 1:
                stat, p_value = stats.power_divergence(
                    map(float, sample_one), map(float, sample_two), ddof=args.ddof, lambda_=args.lambda_
                )
            else:
                stat, p_value = stats.power_divergence(map(float, sample_one), ddof=args.ddof, lambda_=args.lambda_)
            cols.append(stat)
            cols.append(p_value)
        elif test_id.strip() == "theilslopes":
            if sample2 == 1:
                mpe, met, lo, up = stats.theilslopes(map(float, sample_one), map(float, sample_two), alpha=args.alpha)
            else:
                mpe, met, lo, up = stats.theilslopes(map(float, sample_one), alpha=args.alpha)
            cols.append(mpe)
            cols.append(met)
            cols.append(lo)
            cols.append(up)
        elif test_id.strip() == "combine_pvalues":
            if sample2 == 1:
                stat, p_value = stats.combine_pvalues(
                    map(float, sample_one), method=args.med, weights=map(float, sample_two)
                )
            else:
                stat, p_value = stats.combine_pvalues(map(float, sample_one), method=args.med)
            cols.append(stat)
            cols.append(p_value)
        elif test_id.strip() == "obrientransform":
            ob = stats.obrientransform(*b_samples)
            for list in ob:
                elements = ",".join(map(str, list))
                cols.append(elements)
        elif test_id.strip() == "f_oneway":
            f_value, p_value = stats.f_oneway(*b_samples)
            cols.append(f_value)
            cols.append(p_value)
        elif test_id.strip() == "kruskal":
            h, p_value = stats.kruskal(*b_samples)
            cols.append(h)
            cols.append(p_value)
        elif test_id.strip() == "friedmanchisquare":
            fr, p_value = stats.friedmanchisquare(*b_samples)
            cols.append(fr)
            cols.append(p_value)
        elif test_id.strip() == "fligner":
            xsq, p_value = stats.fligner(center=args.center, proportiontocut=args.proportiontocut, *b_samples)
            cols.append(xsq)
            cols.append(p_value)
        elif test_id.strip() == "bartlett":
            T, p_value = stats.bartlett(*b_samples)
            cols.append(T)
            cols.append(p_value)
        elif test_id.strip() == "levene":
            w, p_value = stats.levene(center=args.center, proportiontocut=args.proportiontocut, *b_samples)
            cols.append(w)
            cols.append(p_value)
        elif test_id.strip() == "median_test":
            stat, p_value, m, table = stats.median_test(
                ties=args.ties, correction=args.correction, lambda_=args.lambda_, *b_samples
            )
            cols.append(stat)
            cols.append(p_value)
            cols.append(m)
            cols.append(table)
            for list in table:
                elements = ",".join(map(str, list))
                cols.append(elements)
        outfile.write("%s\n" % "\t".join(map(str, cols)))
    outfile.close()
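A portability note on the dispatcher above: it passes map(float, ...) objects straight into the SciPy calls, which only behaves as intended on Python 2, where map returns a list. Under Python 3, map yields a one-shot iterator, so a hypothetical port of the recurring pattern would materialize the values first, for example:

# Hypothetical Python 3 variant of the recurring map(float, ...) pattern:
one = [float(v) for v in sample_one]
two = [float(v) for v in sample_two]
T, p_value = stats.wilcoxon(one, two,
                            zero_method=args.zero_method,
                            correction=args.correction)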
Exemple #31
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--infile", required=True, help="Tabular file.")
    parser.add_argument("-o",
                        "--outfile",
                        required=True,
                        help="Path to the output file.")
    parser.add_argument("--sample_one_cols",
                        help="Input format, like smi, sdf, inchi")
    parser.add_argument("--sample_two_cols",
                        help="Input format, like smi, sdf, inchi")
    parser.add_argument(
        "--sample_cols",
        help="Input format, like smi, sdf, inchi,separate arrays using ;",
    )
    parser.add_argument("--test_id", help="statistical test method")
    parser.add_argument(
        "--mwu_use_continuity",
        action="store_true",
        default=False,
        help=
        "Whether a continuity correction (1/2.) should be taken into account.",
    )
    parser.add_argument(
        "--equal_var",
        action="store_true",
        default=False,
        help=
        "If set, perform a standard independent two-sample test that assumes equal population variances. If not set, perform Welch's t-test, which does not assume equal population variance.",
    )
    parser.add_argument(
        "--reta",
        action="store_true",
        default=False,
        help="Whether or not to return the internally computed a values.",
    )
    parser.add_argument(
        "--fisher",
        action="store_true",
        default=False,
        help="If set, Fisher's definition of kurtosis is used (normal ==> 0.0)",
    )
    parser.add_argument(
        "--bias",
        action="store_true",
        default=False,
        help=
        "If not set, the calculations are corrected for statistical bias",
    )
    parser.add_argument(
        "--inclusive1",
        action="store_true",
        default=False,
        help="If set, values exactly equal to the lower limit are included",
    )
    parser.add_argument(
        "--inclusive2",
        action="store_true",
        default=False,
        help="If set, values exactly equal to the upper limit are included",
    )
    parser.add_argument(
        "--inclusive",
        action="store_true",
        default=False,
        help="If set, values exactly equal to the limit are included",
    )
    parser.add_argument(
        "--printextras",
        action="store_true",
        default=False,
        help=
        "If set, a warning is raised when there are extra points, saying how many of those points there are",
    )
    parser.add_argument(
        "--initial_lexsort",
        action="store_true",
        default=False,
        help=
        "Whether to use lexsort or quicksort as the sorting method for the initial sort of the inputs.",
    )
    parser.add_argument(
        "--correction",
        action="store_true",
        default=False,
        help="Whether to apply continuity correction",
    )
    parser.add_argument(
        "--axis",
        type=int,
        default=0,
        help=
        "Axis can equal None (ravel array first), or an integer (the axis over which to operate on a and b)",
    )
    parser.add_argument(
        "--n",
        type=int,
        default=0,
        help=
        "the number of trials. This is ignored if x gives both the number of successes and failures",
    )
    parser.add_argument("--b",
                        type=int,
                        default=0,
                        help="The number of bins to use for the histogram")
    parser.add_argument("--N",
                        type=int,
                        default=0,
                        help="Score that is compared to the elements in a.")
    parser.add_argument("--ddof",
                        type=int,
                        default=0,
                        help="Degrees of freedom correction")
    parser.add_argument(
        "--score",
        type=int,
        default=0,
        help="Score that is compared to the elements in a.",
    )
    parser.add_argument("--m", type=float, default=0.0, help="limits")
    parser.add_argument("--mf", type=float, default=2.0, help="lower limit")
    parser.add_argument("--nf", type=float, default=99.9, help="higher_limit")
    parser.add_argument(
        "--p",
        type=float,
        default=0.5,
        help=
        "The hypothesized probability of success. 0 <= p <= 1. The default value is p = 0.5",
    )
    parser.add_argument("--alpha", type=float, default=0.9, help="probability")
    parser.add_argument(
        "--new",
        type=float,
        default=0.0,
        help="Value to put in place of values in a outside of bounds",
    )
    parser.add_argument(
        "--proportiontocut",
        type=float,
        default=0.0,
        help="Proportion (in range 0-1) of total data set to trim off each end.",
    )
    parser.add_argument(
        "--lambda_",
        type=float,
        default=1.0,
        help=
        "lambda_ gives the power in the Cressie-Read power divergence statistic",
    )
    parser.add_argument(
        "--imbda",
        type=float,
        default=0,
        help=
        "If lmbda is not None, do the transformation for that value. If lmbda is None, find the lambda that maximizes the log-likelihood function and return it as the second output argument.",
    )
    parser.add_argument(
        "--base",
        type=float,
        default=1.6,
        help="The logarithmic base to use (default: 1.6)",
    )
    parser.add_argument("--dtype", help="dtype")
    parser.add_argument("--med", help="med")
    parser.add_argument("--cdf", help="cdf")
    parser.add_argument("--zero_method", help="zero_method options")
    parser.add_argument("--dist", help="dist options")
    parser.add_argument("--ties", help="ties options")
    parser.add_argument("--alternative", help="alternative options")
    parser.add_argument("--mode", help="mode options")
    parser.add_argument("--method", help="method options")
    parser.add_argument("--md", help="md options")
    parser.add_argument("--center", help="center options")
    parser.add_argument("--kind", help="kind options")
    parser.add_argument("--tail", help="tail options")
    parser.add_argument("--interpolation", help="interpolation options")
    parser.add_argument("--statistic", help="statistic options")

    args = parser.parse_args()
    infile = args.infile
    outfile = open(args.outfile, "w+")
    test_id = args.test_id
    nf = args.nf
    mf = args.mf
    imbda = args.imbda
    inclusive1 = args.inclusive1
    inclusive2 = args.inclusive2
    sample0 = 0
    sample1 = 0
    sample2 = 0
    if args.sample_cols is not None:
        sample0 = 1
        bartlett_samples = []
        for sample in args.sample_cols.split(";"):
            bartlett_samples.append(map(int, sample.split(",")))
    if args.sample_one_cols is not None:
        sample1 = 1
        sample_one_cols = args.sample_one_cols.split(",")
    if args.sample_two_cols is not None:
        sample_two_cols = args.sample_two_cols.split(",")
        sample2 = 1
    for line in open(infile):
        sample_one = []
        sample_two = []
        cols = line.strip().split("\t")
        if sample0 == 1:
            b_samples = columns_to_values(bartlett_samples, line)
        if sample1 == 1:
            for index in sample_one_cols:
                sample_one.append(cols[int(index) - 1])
        if sample2 == 1:
            for index in sample_two_cols:
                sample_two.append(cols[int(index) - 1])
        if test_id.strip() == "describe":
            size, min_max, mean, uv, bs, bk = stats.describe(
                map(float, sample_one))
            cols.append(size)
            cols.append(min_max)
            cols.append(mean)
            cols.append(uv)
            cols.append(bs)
            cols.append(bk)
        elif test_id.strip() == "mode":
            vals, counts = stats.mode(map(float, sample_one))
            cols.append(vals)
            cols.append(counts)
        elif test_id.strip() == "nanmean":
            m = stats.nanmean(map(float, sample_one))
            cols.append(m)
        elif test_id.strip() == "nanmedian":
            m = stats.nanmedian(map(float, sample_one))
            cols.append(m)
        elif test_id.strip() == "kurtosistest":
            z_value, p_value = stats.kurtosistest(map(float, sample_one))
            cols.append(z_value)
            cols.append(p_value)
        elif test_id.strip() == "variation":
            ra = stats.variation(map(float, sample_one))
            cols.append(ra)
        elif test_id.strip() == "itemfreq":
            freq = stats.itemfreq(map(float, sample_one))
            for list in freq:
                elements = ",".join(map(str, list))
                cols.append(elements)
        elif test_id.strip() == "nanmedian":
            m = stats.nanmedian(map(float, sample_one))
            cols.append(m)
        elif test_id.strip() == "variation":
            ra = stats.variation(map(float, sample_one))
            cols.append(ra)
        elif test_id.strip() == "boxcox_llf":
            IIf = stats.boxcox_llf(imbda, map(float, sample_one))
            cols.append(IIf)
        elif test_id.strip() == "tiecorrect":
            fa = stats.tiecorrect(map(float, sample_one))
            cols.append(fa)
        elif test_id.strip() == "rankdata":
            r = stats.rankdata(map(float, sample_one), method=args.md)
            cols.append(r)
        elif test_id.strip() == "nanstd":
            s = stats.nanstd(map(float, sample_one), bias=args.bias)
            cols.append(s)
        elif test_id.strip() == "anderson":
            A2, critical, sig = stats.anderson(map(float, sample_one),
                                               dist=args.dist)
            cols.append(A2)
            for list in critical:
                cols.append(list)
            cols.append(",")
            for list in sig:
                cols.append(list)
        elif test_id.strip() == "binom_test":
            p_value = stats.binom_test(map(float, sample_one),
                                       n=args.n,
                                       p=args.p)
            cols.append(p_value)
        elif test_id.strip() == "gmean":
            gm = stats.gmean(map(float, sample_one), dtype=args.dtype)
            cols.append(gm)
        elif test_id.strip() == "hmean":
            hm = stats.hmean(map(float, sample_one), dtype=args.dtype)
            cols.append(hm)
        elif test_id.strip() == "kurtosis":
            k = stats.kurtosis(
                map(float, sample_one),
                axis=args.axis,
                fisher=args.fisher,
                bias=args.bias,
            )
            cols.append(k)
        elif test_id.strip() == "moment":
            n_moment = stats.moment(map(float, sample_one), n=args.n)
            cols.append(n_moment)
        elif test_id.strip() == "normaltest":
            k2, p_value = stats.normaltest(map(float, sample_one))
            cols.append(k2)
            cols.append(p_value)
        elif test_id.strip() == "skew":
            skewness = stats.skew(map(float, sample_one), bias=args.bias)
            cols.append(skewness)
        elif test_id.strip() == "skewtest":
            z_value, p_value = stats.skewtest(map(float, sample_one))
            cols.append(z_value)
            cols.append(p_value)
        elif test_id.strip() == "sem":
            s = stats.sem(map(float, sample_one), ddof=args.ddof)
            cols.append(s)
        elif test_id.strip() == "zscore":
            z = stats.zscore(map(float, sample_one), ddof=args.ddof)
            for list in z:
                cols.append(list)
        elif test_id.strip() == "signaltonoise":
            s2n = stats.signaltonoise(map(float, sample_one), ddof=args.ddof)
            cols.append(s2n)
        elif test_id.strip() == "percentileofscore":
            p = stats.percentileofscore(map(float, sample_one),
                                        score=args.score,
                                        kind=args.kind)
            cols.append(p)
        elif test_id.strip() == "bayes_mvs":
            c_mean, c_var, c_std = stats.bayes_mvs(map(float, sample_one),
                                                   alpha=args.alpha)
            cols.append(c_mean)
            cols.append(c_var)
            cols.append(c_std)
        elif test_id.strip() == "sigmaclip":
            c, c_low, c_up = stats.sigmaclip(map(float, sample_one),
                                             low=args.m,
                                             high=args.n)
            cols.append(c)
            cols.append(c_low)
            cols.append(c_up)
        elif test_id.strip() == "kstest":
            d, p_value = stats.kstest(
                map(float, sample_one),
                cdf=args.cdf,
                N=args.N,
                alternative=args.alternative,
                mode=args.mode,
            )
            cols.append(d)
            cols.append(p_value)
        elif test_id.strip() == "chi2_contingency":
            chi2, p, dof, ex = stats.chi2_contingency(
                map(float, sample_one),
                correction=args.correction,
                lambda_=args.lambda_)
            cols.append(chi2)
            cols.append(p)
            cols.append(dof)
            cols.append(ex)
        elif test_id.strip() == "tmean":
            if nf == 0 and mf == 0:
                mean = stats.tmean(map(float, sample_one))
            else:
                mean = stats.tmean(map(float, sample_one), (mf, nf),
                                   (inclusive1, inclusive2))
            cols.append(mean)
        elif test_id.strip() == "tmin":
            if mf == 0:
                min = stats.tmin(map(float, sample_one))
            else:
                min = stats.tmin(map(float, sample_one),
                                 lowerlimit=mf,
                                 inclusive=args.inclusive)
            cols.append(min)
        elif test_id.strip() == "tmax":
            if nf == 0:
                max = stats.tmax(map(float, sample_one))
            else:
                max = stats.tmax(map(float, sample_one),
                                 upperlimit=nf,
                                 inclusive=args.inclusive)
            cols.append(max)
        elif test_id.strip() == "tvar":
            if nf == 0 and mf == 0:
                var = stats.tvar(map(float, sample_one))
            else:
                var = stats.tvar(map(float, sample_one), (mf, nf),
                                 (inclusive1, inclusive2))
            cols.append(var)
        elif test_id.strip() == "tstd":
            if nf == 0 and mf == 0:
                std = stats.tstd(map(float, sample_one))
            else:
                std = stats.tstd(map(float, sample_one), (mf, nf),
                                 (inclusive1, inclusive2))
            cols.append(std)
        elif test_id.strip() == "tsem":
            if nf == 0 and mf == 0:
                s = stats.tsem(map(float, sample_one))
            else:
                s = stats.tsem(map(float, sample_one), (mf, nf),
                               (inclusive1, inclusive2))
            cols.append(s)
        elif test_id.strip() == "scoreatpercentile":
            if nf == 0 and mf == 0:
                s = stats.scoreatpercentile(
                    map(float, sample_one),
                    map(float, sample_two),
                    interpolation_method=args.interpolation,
                )
            else:
                s = stats.scoreatpercentile(
                    map(float, sample_one),
                    map(float, sample_two),
                    (mf, nf),
                    interpolation_method=args.interpolation,
                )
            for list in s:
                cols.append(list)
        elif test_id.strip() == "relfreq":
            if nf == 0 and mf == 0:
                rel, low_range, binsize, ex = stats.relfreq(
                    map(float, sample_one), args.b)
            else:
                rel, low_range, binsize, ex = stats.relfreq(
                    map(float, sample_one), args.b, (mf, nf))
            for list in rel:
                cols.append(list)
            cols.append(low_range)
            cols.append(binsize)
            cols.append(ex)
        elif test_id.strip() == "binned_statistic":
            if nf == 0 and mf == 0:
                st, b_edge, b_n = stats.binned_statistic(
                    map(float, sample_one),
                    map(float, sample_two),
                    statistic=args.statistic,
                    bins=args.b,
                )
            else:
                st, b_edge, b_n = stats.binned_statistic(
                    map(float, sample_one),
                    map(float, sample_two),
                    statistic=args.statistic,
                    bins=args.b,
                    range=(mf, nf),
                )
            cols.append(st)
            cols.append(b_edge)
            cols.append(b_n)
        elif test_id.strip() == "threshold":
            if nf == 0 and mf == 0:
                o = stats.threshold(map(float, sample_one), newval=args.new)
            else:
                o = stats.threshold(map(float, sample_one),
                                    mf,
                                    nf,
                                    newval=args.new)
            for list in o:
                cols.append(list)
        elif test_id.strip() == "trimboth":
            o = stats.trimboth(map(float, sample_one),
                               proportiontocut=args.proportiontocut)
            for list in o:
                cols.append(list)
        elif test_id.strip() == "trim1":
            t1 = stats.trim1(
                map(float, sample_one),
                proportiontocut=args.proportiontocut,
                tail=args.tail,
            )
            for list in t1:
                cols.append(list)
        elif test_id.strip() == "histogram":
            if nf == 0 and mf == 0:
                hi, low_range, binsize, ex = stats.histogram(
                    map(float, sample_one), args.b)
            else:
                hi, low_range, binsize, ex = stats.histogram(
                    map(float, sample_one), args.b, (mf, nf))
            cols.append(hi)
            cols.append(low_range)
            cols.append(binsize)
            cols.append(ex)
        elif test_id.strip() == "cumfreq":
            if nf == 0 and mf == 0:
                cum, low_range, binsize, ex = stats.cumfreq(
                    map(float, sample_one), args.b)
            else:
                cum, low_range, binsize, ex = stats.cumfreq(
                    map(float, sample_one), args.b, (mf, nf))
            cols.append(cum)
            cols.append(low_range)
            cols.append(binsize)
            cols.append(ex)
        elif test_id.strip() == "boxcox_normmax":
            if nf == 0 and mf == 0:
                ma = stats.boxcox_normmax(map(float, sample_one))
            else:
                ma = stats.boxcox_normmax(map(float, sample_one), (mf, nf),
                                          method=args.method)
            cols.append(ma)
        elif test_id.strip() == "boxcox":
            if imbda == 0:
                box, ma, ci = stats.boxcox(map(float, sample_one),
                                           alpha=args.alpha)
                cols.append(box)
                cols.append(ma)
                cols.append(ci)
            else:
                box = stats.boxcox(map(float, sample_one),
                                   imbda,
                                   alpha=args.alpha)
                cols.append(box)
        elif test_id.strip() == "histogram2":
            h2 = stats.histogram2(map(float, sample_one),
                                  map(float, sample_two))
            for list in h2:
                cols.append(list)
        elif test_id.strip() == "ranksums":
            z_statistic, p_value = stats.ranksums(map(float, sample_one),
                                                  map(float, sample_two))
            cols.append(z_statistic)
            cols.append(p_value)
        elif test_id.strip() == "ttest_1samp":
            t, prob = stats.ttest_1samp(map(float, sample_one),
                                        map(float, sample_two))
            for list in t:
                cols.append(list)
            for list in prob:
                cols.append(list)
        elif test_id.strip() == "ansari":
            AB, p_value = stats.ansari(map(float, sample_one),
                                       map(float, sample_two))
            cols.append(AB)
            cols.append(p_value)
        elif test_id.strip() == "linregress":
            slope, intercept, r_value, p_value, stderr = stats.linregress(
                map(float, sample_one), map(float, sample_two))
            cols.append(slope)
            cols.append(intercept)
            cols.append(r_value)
            cols.append(p_value)
            cols.append(stderr)
        elif test_id.strip() == "pearsonr":
            cor, p_value = stats.pearsonr(map(float, sample_one),
                                          map(float, sample_two))
            cols.append(cor)
            cols.append(p_value)
        elif test_id.strip() == "pointbiserialr":
            r, p_value = stats.pointbiserialr(map(float, sample_one),
                                              map(float, sample_two))
            cols.append(r)
            cols.append(p_value)
        elif test_id.strip() == "ks_2samp":
            d, p_value = stats.ks_2samp(map(float, sample_one),
                                        map(float, sample_two))
            cols.append(d)
            cols.append(p_value)
        elif test_id.strip() == "mannwhitneyu":
            mw_stats_u, p_value = stats.mannwhitneyu(
                map(float, sample_one),
                map(float, sample_two),
                use_continuity=args.mwu_use_continuity,
            )
            cols.append(mw_stats_u)
            cols.append(p_value)
        elif test_id.strip() == "zmap":
            z = stats.zmap(map(float, sample_one),
                           map(float, sample_two),
                           ddof=args.ddof)
            for list in z:
                cols.append(list)
        elif test_id.strip() == "ttest_ind":
            mw_stats_u, p_value = stats.ttest_ind(map(float, sample_one),
                                                  map(float, sample_two),
                                                  equal_var=args.equal_var)
            cols.append(mw_stats_u)
            cols.append(p_value)
        elif test_id.strip() == "ttest_rel":
            t, prob = stats.ttest_rel(map(float, sample_one),
                                      map(float, sample_two),
                                      axis=args.axis)
            cols.append(t)
            cols.append(prob)
        elif test_id.strip() == "mood":
            z, p_value = stats.mood(map(float, sample_one),
                                    map(float, sample_two),
                                    axis=args.axis)
            cols.append(z)
            cols.append(p_value)
        elif test_id.strip() == "shapiro":
            W, p_value, a = stats.shapiro(map(float, sample_one),
                                          map(float, sample_two), args.reta)
            cols.append(W)
            cols.append(p_value)
            for list in a:
                cols.append(list)
        elif test_id.strip() == "kendalltau":
            k, p_value = stats.kendalltau(
                map(float, sample_one),
                map(float, sample_two),
                initial_lexsort=args.initial_lexsort,
            )
            cols.append(k)
            cols.append(p_value)
        elif test_id.strip() == "entropy":
            s = stats.entropy(map(float, sample_one),
                              map(float, sample_two),
                              base=args.base)
            cols.append(s)
        elif test_id.strip() == "spearmanr":
            if sample2 == 1:
                rho, p_value = stats.spearmanr(map(float, sample_one),
                                               map(float, sample_two))
            else:
                rho, p_value = stats.spearmanr(map(float, sample_one))
            cols.append(rho)
            cols.append(p_value)
        elif test_id.strip() == "wilcoxon":
            if sample2 == 1:
                T, p_value = stats.wilcoxon(
                    map(float, sample_one),
                    map(float, sample_two),
                    zero_method=args.zero_method,
                    correction=args.correction,
                )
            else:
                T, p_value = stats.wilcoxon(
                    map(float, sample_one),
                    zero_method=args.zero_method,
                    correction=args.correction,
                )
            cols.append(T)
            cols.append(p_value)
        elif test_id.strip() == "chisquare":
            if sample2 == 1:
                rho, p_value = stats.chisquare(map(float, sample_one),
                                               map(float, sample_two),
                                               ddof=args.ddof)
            else:
                rho, p_value = stats.chisquare(map(float, sample_one),
                                               ddof=args.ddof)
            cols.append(rho)
            cols.append(p_value)
        elif test_id.strip() == "power_divergence":
            if sample2 == 1:
                stat, p_value = stats.power_divergence(
                    map(float, sample_one),
                    map(float, sample_two),
                    ddof=args.ddof,
                    lambda_=args.lambda_,
                )
            else:
                stat, p_value = stats.power_divergence(map(float, sample_one),
                                                       ddof=args.ddof,
                                                       lambda_=args.lambda_)
            cols.append(stat)
            cols.append(p_value)
        elif test_id.strip() == "theilslopes":
            if sample2 == 1:
                mpe, met, lo, up = stats.theilslopes(map(float, sample_one),
                                                     map(float, sample_two),
                                                     alpha=args.alpha)
            else:
                mpe, met, lo, up = stats.theilslopes(map(float, sample_one),
                                                     alpha=args.alpha)
            cols.append(mpe)
            cols.append(met)
            cols.append(lo)
            cols.append(up)
        elif test_id.strip() == "combine_pvalues":
            if sample2 == 1:
                stat, p_value = stats.combine_pvalues(
                    map(float, sample_one),
                    method=args.med,
                    weights=map(float, sample_two),
                )
            else:
                stat, p_value = stats.combine_pvalues(map(float, sample_one),
                                                      method=args.med)
            cols.append(stat)
            cols.append(p_value)
        elif test_id.strip() == "obrientransform":
            ob = stats.obrientransform(*b_samples)
            for list in ob:
                elements = ",".join(map(str, list))
                cols.append(elements)
        elif test_id.strip() == "f_oneway":
            f_value, p_value = stats.f_oneway(*b_samples)
            cols.append(f_value)
            cols.append(p_value)
        elif test_id.strip() == "kruskal":
            h, p_value = stats.kruskal(*b_samples)
            cols.append(h)
            cols.append(p_value)
        elif test_id.strip() == "friedmanchisquare":
            fr, p_value = stats.friedmanchisquare(*b_samples)
            cols.append(fr)
            cols.append(p_value)
        elif test_id.strip() == "fligner":
            xsq, p_value = stats.fligner(center=args.center,
                                         proportiontocut=args.proportiontocut,
                                         *b_samples)
            cols.append(xsq)
            cols.append(p_value)
        elif test_id.strip() == "bartlett":
            T, p_value = stats.bartlett(*b_samples)
            cols.append(T)
            cols.append(p_value)
        elif test_id.strip() == "levene":
            w, p_value = stats.levene(center=args.center,
                                      proportiontocut=args.proportiontocut,
                                      *b_samples)
            cols.append(w)
            cols.append(p_value)
        elif test_id.strip() == "median_test":
            stat, p_value, m, table = stats.median_test(
                ties=args.ties,
                correction=args.correction,
                lambda_=args.lambda_,
                *b_samples)
            cols.append(stat)
            cols.append(p_value)
            cols.append(m)
            cols.append(table)
            for list in table:
                elements = ",".join(map(str, list))
                cols.append(elements)
        outfile.write("%s\n" % "\t".join(map(str, cols)))
    outfile.close()
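Assuming the script above is saved as statistics_tool.py (a hypothetical name) and run under the Python 2 environment it was written for, a Wilcoxon run over a tab-separated file might look like the following; the column flags are 1-based indices into each input line:

python statistics_tool.py \
    --infile data.tabular \
    --outfile result.tabular \
    --test_id wilcoxon \
    --sample_one_cols 1,2,3 \
    --sample_two_cols 4,5,6 \
    --zero_method wilcox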
Exemple #32
0
b = np.linalg.lstsq(X, Y, rcond=None)[0]
residuals = Y - X.dot(b)
print("Linear regression coefficients:")
print(b, '\n')
print("Expected value of the error vector:")
print(np.mean(residuals), '\n')
print("Variance of the error vector:")
print(np.var(residuals, ddof=1), '\n\n\n')

print("Хи-квадрат критерий Пирсона:")
print("Хи-квадрат критерий Пирсона для k от 3 до 100")

for k in range(3, 100):
    print('k = ', k)
    res = ss.relfreq(residuals,
                     numbins=k,
                     defaultreallimits=(np.amin(residuals),
                                        np.amax(residuals)))
    observed = res.frequency * len(residuals)

    mu = np.mean(residuals)
    sigma = np.std(residuals, ddof=1)
    inter = np.linspace(np.amin(residuals), np.amax(residuals), k + 1)
    expected = np.array([])
    for i in range(k):
        n = (ss.norm.cdf(inter[i + 1], mu, sigma) -
             ss.norm.cdf(inter[i], mu, sigma)) * len(residuals)
        expected = np.append(expected, n)
    print('Built-in chi-square function:')
    print(ss.chisquare(observed, expected, ddof=k - 2), '\n')
    print("Our chi-square:")
    stat, p_val = my_chi_square(observed, expected, ddof=k - 2)
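my_chi_square is defined elsewhere in that script. A minimal sketch consistent with how it is called here, following scipy.stats.chisquare's convention that the degrees of freedom are k - 1 - ddof, could be:

import numpy as np
from scipy.stats import chi2

def my_chi_square(observed, expected, ddof=0):
    # Pearson statistic: squared deviations scaled by the expected counts.
    observed = np.asarray(observed, dtype=float)
    expected = np.asarray(expected, dtype=float)
    stat = np.sum((observed - expected) ** 2 / expected)
    # scipy's convention: df = number of bins - 1 - ddof.
    df = observed.size - 1 - ddof
    p_value = chi2.sf(stat, df)
    return stat, p_value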
Exemple #33
0
def syncFLIR_diagnostics():

    try:
        logfilepath, logfile = read_csv_logfile()
    except Exception:
        return

    pdf, fileName = initialize_pdf(logfilepath)

    # Split logfile by Serial number
    grouped = logfile.groupby(logfile.SerialNumber)

    # positioning for output text and figures in pdf file
    htext = 610
    wtext = 50
    whist = 30
    himage = 270

    print("Writing diagnostics report ...")

    # suppress output
    with contextlib.redirect_stderr(None):
        # analyze all cameras from csv
        for serial in grouped.grouper.levels[0]:
            # Diagnose recording
            group = grouped.get_group(serial)
            group.sort_values(by=['FrameID'], inplace=True)
            lastFrame = group['FrameID'].max()
            timespan = (group['Timestamp'].max() -
                        group['Timestamp'].min()) / 1e9
            group['IntFramesInt'] = group['Timestamp'].diff() / 1e9
            group['FrameSkip'] = group['FrameID'].diff() - 1
            avgfps = lastFrame / timespan
            meanfps = 1 / group.IntFramesInt.mean()
            critFPS = group.IntFramesInt[group.IntFramesInt > .04].count()
            skipFrames = group.FrameSkip.sum()
            missingFrames = logfile['FrameID'].max() - lastFrame

            # save output
            serialnum = 'Camera:         #' + str(serial)
            numframes = 'Total frames:    ' + str(lastFrame)
            duration = 'Recording time:  ' + time.strftime(
                "%M:%S", time.gmtime(timespan))
            avgfps = 'Frames/Time:     ' + str("{:.2f}".format(avgfps))
            meanfps = 'Mean FPS:        ' + str("{:.2f}".format(meanfps))
            critical = 'Critical frames: ' + str(critFPS)
            skipped = 'Skipped frames:  ' + str(int(skipFrames))
            missing = 'Missing frames:  ' + str(int(missingFrames))

            textLinesReport = [
                serialnum, numframes, duration, avgfps, meanfps, critical,
                skipped, missing
            ]

            # Plot FPS time series
            plt.rcParams['font.size'] = '12'
            timeseries = 'timeseries-' + str(serial) + '.png'
            fig, ax = plt.subplots(figsize=(10, 2))
            ax.plot(group.FrameID,
                    group.IntFramesInt,
                    marker='.',
                    alpha=0.3,
                    color='black',
                    linestyle='solid')
            ax.axhline(y=.04, color='r', linestyle='-', lw=2)
            plt.text(0, 0.045, 'FPS = 25', color='r', rotation=0)
            ax.axhline(y=.02, color='y', linestyle='-', lw=2)
            plt.text(0, 0.025, 'FPS = 50', color='y', rotation=0)
            ax.axhline(y=.005, color='g', linestyle='-', lw=2)
            plt.text(0, .01, 'FPS = 200', color='g', rotation=0)
            plt.ylabel('Inter Frame Interval')
            plt.xlabel('Frame ID')
            plt.title(serial)
            plt.savefig(timeseries)

            # Plot FPS Histogram
            plt.rcParams['font.size'] = '34'
            histogram = 'histogram-' + str(serial) + '.png'
            res = stats.relfreq(group.IntFramesInt.dropna(), numbins=30)
            x = res.lowerlimit + np.linspace(
                0, res.binsize * res.frequency.size, res.frequency.size)
            fig, ax = plt.subplots(figsize=(18, 12))
            ax.bar(x, res.frequency, width=res.binsize)
            ax.axvline(x=.04, color='r', linestyle='-', lw=1)
            plt.text(.05, .4, 'FPS = 25', color='r', rotation=0)
            ax.axvline(x=.02, color='y', linestyle='-', lw=1)
            plt.text(.05, .45, 'FPS = 50', color='y', rotation=0)
            ax.axvline(x=.005, color='g', linestyle='-', lw=1)
            plt.text(.05, .5, 'FPS = 200', color='g', rotation=0)
            plt.xlabel('Inter Frame Interval')
            plt.xlim(0, 0.075)
            plt.ylabel('Relative Frequency')
            plt.title(serial)
            plt.savefig(histogram)

            # write diagnostics to pdf
            text = pdf.beginText(wtext, htext)
            text.setFont("Times-Roman", 11)
            for line in textLinesReport:
                text.textLine(line)
            pdf.drawText(text)

            # write timeseries figures to pdf
            pdf.drawInlineImage(timeseries, 0, himage, width=600, height=120)
            os.remove(timeseries)

            # write histograms to pdf
            pdf.drawInlineImage(histogram, whist, 390, width=180, height=120)
            os.remove(histogram)

            # move next text to the right
            wtext = wtext + 180
            # move histogram to the right
            whist = whist + 180
            # move image down
            himage = himage - 125
        pdf.save()

    return fileName
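The per-camera diagnostics above reduce to two pandas diffs on the sorted log. A tiny self-contained illustration (synthetic nanosecond timestamps, as the code assumes) behaves like this:

import pandas as pd

# Toy log: frames 40 ms apart (25 fps), with FrameID 2 dropped.
group = pd.DataFrame({'FrameID': [0, 1, 3],
                      'Timestamp': [0, 40_000_000, 120_000_000]})
group['IntFramesInt'] = group['Timestamp'].diff() / 1e9  # seconds between frames
group['FrameSkip'] = group['FrameID'].diff() - 1         # frames lost at each step
print(group.FrameSkip.sum())  # 1.0 -> one skipped frame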
Exemple #34
0
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats as st

testscores = np.random.normal(9.5, 2.5, 500)
print('describe = ', st.describe(testscores))
print('mean = ', np.mean(testscores))
print('mode = ', st.mode(testscores))
print('tmean = ', st.tmean(testscores,[5,15]))
print('variation = ', st.variation(testscores))
print('skewness = ', st.skew(testscores))
print('kurtosis = ', st.kurtosis(testscores))
print('zscore = ', st.zscore(testscores)[:20])

relfr = st.relfreq(testscores, 20)
print('relfreq = ', relfr)
x1 = relfr.lowerlimit + np.linspace(0, relfr.binsize*relfr.frequency.size, relfr.frequency.size)

cumfr = st.cumfreq(testscores, 20)
print('cumfreq = ', cumfr)
x2 = cumfr.lowerlimit + np.linspace(0, cumfr.binsize*cumfr.cumcount.size, cumfr.cumcount.size)

plt.subplot(2,2,1)
plt.hist(testscores, 20)
plt.title('Histogram')
plt.xticks(())

plt.subplot(2,2,2)
plt.bar(x1, relfr.frequency, width = relfr.binsize)
plt.title('Relative histogram')
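The listing is cut off before the remaining panels; assuming the notebook continued the same pattern, a plausible completion for the cumulative panel would be:

plt.subplot(2, 2, 3)
plt.bar(x2, cumfr.cumcount, width=cumfr.binsize)
plt.title('Cumulative histogram')
plt.show()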
Exemple #35
0
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats

plt.figure(num=1, figsize=[10, 7])

ns = 1000
dist = np.random.randn(ns)

dmin = np.amin(dist)
dmax = np.amax(dist)

nbins = 10

relfreqs, lowlim, binsize, extrapoints = stats.relfreq(dist, nbins)

print(relfreqs.shape)
print(lowlim)
print(binsize)
print((lowlim + np.linspace(0, binsize * nbins, nbins)).shape)

plt.plot(lowlim + np.linspace(0, binsize * nbins, nbins), relfreqs)

# histogram 1
nbins = 9
n1, bins, patches = plt.hist(dist,
                             bins=nbins,
                             histtype='step',
                             range=(dmin, dmax),
                             color='white')
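For reference, stats.relfreq is just a count-normalized histogram, so (under the assumption that both calls get the same explicit limits) it can be cross-checked against np.histogram:

# Sanity check: relfreq should match np.histogram counts divided by n.
counts, edges = np.histogram(dist, bins=10, range=(dmin, dmax))
rel = stats.relfreq(dist, numbins=10, defaultreallimits=(dmin, dmax))
print(np.allclose(rel.frequency, counts / dist.size))  # expected: True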