Example #1
import pandas as pd
import matplotlib.pyplot as plt
import tikzplotlib
from scipy import stats


def accuracy_pdf():
    df = pd.read_csv(
        "/Users/maksim/dev_projects/merf/figures/accuracy_results/precision_recall.csv"
    )
    prec = df['cpu_precision'] - df['gpu_precision']
    rec = df['cpu_recall'] - df['gpu_recall']

    print(stats.describe(prec))
    print(stats.describe(rec))

    plt.hist(prec, bins=20, label="$CPU_p - GPU_p$", alpha=.5)
    plt.hist(rec, bins=100, label="$CPU_r - GPU_r$", alpha=.5)
    plt.legend()
    tikzplotlib.save("accuracy.tex")
Example #2
    def get_freq_distn_stats(self):
        pred_list = self.pred_info[2]
        # obtains data from predicted contact dictionary
        pred_list.sort(key=lambda x: float(x[3]), reverse=True)

        # get native contact (epitope) information
        nat_list, nat_dict = get_epitopes_above_ths(self.nat_info[2])

        tp_reslist, tp_resind = [], []
        # get true predictions and their indices
        for ind, res in enumerate(pred_list):
            if float(res[3]) <= 0.0: continue
            res_name = '%s_%s_%s'%(res[0], res[1], res[2])
            if res_name in nat_dict: 
                # if pred res with >0 score is in native epitope
                tp_reslist.append(res[3])
                tp_resind.append(ind)
        
        #print (len(pred_list))
        #print (len(nat_list))
        #print (len(tp_reslist))
        #print (tp_resind)
        #yfreq = [i[3] for i in pred_list[:len(y)] if i[3]>0.0]
        yfreq = [float(i[3]) for i in pred_list if float(i[3])>0.0]
        xfreq = [i for i in range(1,len(yfreq)+1)]
        if len(yfreq) == 0:
            return xfreq, yfreq, tp_reslist, tp_resind, 0, 0, 0
        curr_stats = stats.describe(yfreq)
        kur, skew, var = curr_stats.kurtosis, curr_stats.skewness, curr_stats.variance

        return xfreq, yfreq, tp_reslist, tp_resind, kur, skew, var
Example #3
def generate_csv():
    cases_list = unpickle_data()
    csv_name = 'complete_data.csv'
    FULL_CSV = pd.DataFrame(columns=CSV_COLS)
    for c in cases_list:
        print(f"    > Case {c._case_name}")
        for r in c:
            print(f"\t\t + RECORD {r.name}", end="")
            values = list()
            for k, v in r.N_LINEAR.items():
                s = stats.describe(v)
                values.extend([
                    s[2],  # Mean
                    s[3],  # Variance
                    s[4],  # Skewness
                    spectral_entropy(v, sf=r.fs,
                                     method='fft')  # Spectral Entropy
                ])
            row_data = [
                c._case_name,  # Case
                r.name,  # Record
                c.pathology,  # Condition
                COND_ID[c.pathology],  # Condition ID
                len(r.rr),  # RR Length
            ] + values
            FULL_CSV = FULL_CSV.append(pd.Series(data=row_data,
                                                 index=CSV_COLS),
                                       ignore_index=True)
            print("[v]")
    FULL_CSV.to_csv(csv_name, index=False)
Example #4
def edit_row(row):
    rr = row['rr']
    s = stats.describe(rr)
    new_row = row[['record', 'condition']]
    new_row['mean'] = s[2]
    new_row['variance'] = s[3]
    new_row['skewness'] = s[4]
    new_row['kurtosis'] = s[5]
    return new_row
Example #5
def compute(pred_scores):
    describe = stats.describe(pred_scores)
    metrics = {
        "min": describe.minmax[0],
        "max": describe.minmax[1],
        "mean": describe.mean,
        "variance": describe.variance,
        "skewness": describe.skewness,
        "kurtosis": describe.kurtosis
    }
    return metrics
Example #6
def process_row(row: pd.Series) -> pd.Series:
    data = dict(row[[m["tag"] for m in NL_METHODS]])
    for tag, vec in data.items():
        s = stats.describe(vec)
        values = [
            s[2], s[3], s[4],
            spectral_entropy(vec, sf=row['fs'], method='fft')
        ]
        for n, v in zip(punctual_names, values):
            row[tag + n] = v
    return row
Example #7
def save_test():
    TEST_DIRS = list(Path('.').glob('Test_*ws/'))
    for td in TEST_DIRS:
        t_cases = test_unpickle(td)

        pdir = "Test/"

        csv_name = pdir + td.stem + '.csv'
        pkl_name = pdir + td.stem + '.pkl'

        csv_data = pd.DataFrame(columns=CSV_COLS)
        pkl_data = pd.DataFrame(columns=CSV_COLS[:5])

        for c in t_cases:
            for r in c:
                # Process for CSV
                values = list()
                row_data = [
                    c._case_name,
                    r.name,
                    c.pathology,
                    COND_ID[c.pathology],
                    len(r.rr_int),
                ]
                for k, v in r.N_LINEAR.items():
                    s = stats.describe(v)
                    row_data.extend([
                        s[2], s[3], s[4],
                        spectral_entropy(v, sf=r.fs, method='fft')
                    ])
                csv_data = csv_data.append(pd.Series(
                    data=row_data,
                    index=CSV_COLS,
                ),
                                           ignore_index=True)
                # Process for pickle
                pkl_row = {
                    'case': c._case_name,
                    'record': r.name,
                    'condition': c.pathology,
                    'cond_id': COND_ID[c.pathology],
                    'length': len(r.rr_int)
                }
                pkl_row.update(r.N_LINEAR)
                pkl_data = pkl_data.append(pd.DataFrame(pkl_row))

        # DATA IS SAVED IN BOTH FORMATS
        csv_data.to_csv(csv_name, index=False)
        with open(pkl_name, 'wb') as pf:
            pickle.dump(pkl_data, pf)
Example #8
def linearWindowing(rr_signal: np.ndarray):
    """
    Evaluates rr with linear functions based on a rolling window.

    rr_signal   :: RR vector of time in seconds
    """
    means, var, skew, kurt = list(), list(), list(), list()

    for idx in range(0, len(rr_signal) - RR_WLEN, RR_STEP):
        window_slice = slice(idx, idx + RR_WLEN)
        rr_window = rr_signal[window_slice]
        ds = stats.describe(rr_window)
        means.append(ds[2])
        var.append(ds[3])
        skew.append(ds[4])
        kurt.append(ds[5])

    return means, var, skew, kurt
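A minimal usage sketch for linearWindowing above; RR_WLEN and RR_STEP are module-level constants in the original source, so the values below are placeholder assumptions for illustration only:

import numpy as np
from scipy import stats

RR_WLEN, RR_STEP = 50, 10  # assumed window length and step, in samples

rng = np.random.default_rng(0)
rr = 0.8 + 0.05 * rng.standard_normal(500)  # synthetic RR intervals in seconds

means, var, skew, kurt = linearWindowing(rr)
print(len(means), means[0])  # 45 windows with these settings; mean of the first window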
Example #9
def compute_pairwise_embedding_distance_features(a_mat, b_mat) -> List:
    """
    Computes pairwise embedding distance features.
    :param a_mat:
    :param b_mat:
    :return: Note that the order of values is the same as in `create_feature_names` (mean, variance, min, max)
    """
    if a_mat is None or b_mat is None or a_mat.size == 0 or b_mat.size == 0:
        return [None] * 4
    else:
        dists = cdist(a_mat, b_mat, "cosine")
        if dists.size == 1:
            # scipy would raise "FloatingPointError: invalid value encountered in double_scalars" when calling describe on a 1x1 matrix, so we use this workaround
            return [dists.item(),
                    0,
                    dists.item(),
                    dists.item()]
        else:
            dists_stats = stats.describe(dists, axis=None)  # type: DescribeResult
            return [dists_stats.mean,
                    0 if dists_stats.variance is None else dists_stats.variance,
                    dists_stats.minmax[0],
                    dists_stats.minmax[1]]
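A small usage sketch for the function above, assuming the imports it relies on (numpy, scipy.spatial.distance.cdist, scipy.stats, typing.List) and using random placeholder matrices:

from typing import List

import numpy as np
from scipy import stats
from scipy.spatial.distance import cdist

a_mat = np.random.rand(3, 8)  # 3 embeddings of dimension 8
b_mat = np.random.rand(5, 8)  # 5 embeddings of dimension 8

# [mean, variance, min, max] of the 15 pairwise cosine distances
print(compute_pairwise_embedding_distance_features(a_mat, b_mat))
# 1x1 case: the variance is reported as 0 (the workaround in the code above)
print(compute_pairwise_embedding_distance_features(a_mat[:1], b_mat[:1]))
# missing input: [None, None, None, None]
print(compute_pairwise_embedding_distance_features(None, b_mat))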
Example #10
    def extract_from_instance(instance_file, features_file):
        aset = AuctionSet.load(instance_file)

        # shorthand variables:
        b = aset.bid_set.values
        r = aset.bid_set.quantities
        a = aset.ask_set.values
        s = aset.ask_set.quantities

        ### stats for average bid prices
        nobs, b_minmax, b_mean, b_var, b_skew, b_kurt = st.describe(
            b / np.sum(r, axis=1), ddof=0)
        ### stats for average ask prices
        nobs, a_minmax, a_mean, a_var, a_skew, a_kurt = st.describe(
            a / np.sum(s, axis=1), ddof=0)
        ### stats for bid bundle size
        nobs, r_minmax, r_mean, r_var, r_skew, r_kurt = st.describe(np.sum(
            r, axis=1),
                                                                    ddof=0)
        ### stats for ask bundle size
        nobs, s_minmax, s_mean, s_var, s_skew, s_kurt = st.describe(np.sum(
            s, axis=1),
                                                                    ddof=0)
        ####### heterogeneity -> resource type axis (stats inside a bundle)
        # stats for resource quantities demanded for each resource type: sum, mean, min, max per res type, then describe
        nobs, rt_sum_minmax, rt_sum_mean, rt_sum_var, rt_sum_skew, rt_sum_kurt = st.describe(
            np.sum(r, axis=0), ddof=0)
        nobs, rt_mean_minmax, rt_mean_mean, rt_mean_var, rt_mean_skew, rt_mean_kurt = st.describe(
            np.mean(r, axis=0), ddof=0)
        nobs, rt_min_minmax, rt_min_mean, rt_min_var, rt_min_skew, rt_min_kurt = st.describe(
            np.min(r, axis=0), ddof=0)
        nobs, rt_max_minmax, rt_max_mean, rt_max_var, rt_max_skew, rt_max_kurt = st.describe(
            np.max(r, axis=0), ddof=0)
        # stats for resource quantities offered for each resource type
        nobs, st_sum_minmax, st_sum_mean, st_sum_var, st_sum_skew, st_sum_kurt = st.describe(
            np.sum(s, axis=0), ddof=0)
        nobs, st_mean_minmax, st_mean_mean, st_mean_var, st_mean_skew, st_mean_kurt = st.describe(
            np.mean(s, axis=0), ddof=0)
        nobs, st_min_minmax, st_min_mean, st_min_var, st_min_skew, st_min_kurt = st.describe(
            np.min(s, axis=0), ddof=0)
        nobs, st_max_minmax, st_max_mean, st_max_var, st_max_skew, st_max_kurt = st.describe(
            np.max(s, axis=0), ddof=0)
        # stats for demand/supply ratio by resource types: total, mean
        nobs, qratio_sum_minmax, qratio_sum_mean, qratio_sum_var, qratio_sum_skew, qratio_sum_kurt = st.describe(
            np.sum(r, axis=0) / np.sum(s, axis=0), ddof=0)
        nobs, qratio_mean_minmax, qratio_mean_mean, qratio_mean_var, qratio_mean_skew, qratio_mean_kurt = st.describe(
            np.mean(r, axis=0) / np.mean(s, axis=0), ddof=0)
        # stats for surplus quantity by resource types
        nobs, qsurplus_sum_minmax, qsurplus_sum_mean, qsurplus_sum_var, qsurplus_sum_skew, qsurplus_sum_kurt = st.describe(
            np.sum(s, axis=0) - np.sum(r, axis=0), ddof=0)
        # quantity spread by resource type (max requested quantity of resource k - min offered quantity of resource k)
        nobs, qspread_minmax, qspread_mean, qspread_var, qspread_skew, qspread_kurt = st.describe(
            np.max(r, axis=0) - np.min(s, axis=0), ddof=0)
        # mid price
        bid_max = (b / r.sum(axis=1)).max()
        ask_min = (a / s.sum(axis=1)).min()
        mid_price = (bid_max + ask_min) / 2
        # bid-ask spread
        ba_spread = bid_max - ask_min
        # total demand quantity
        r_total = r.sum()
        # total supply quantity
        s_total = s.sum()
        # total demand value
        b_total = b.sum()
        # total supply value
        a_total = a.sum()
        # surplus value per surplus unit
        surplus_value_per_surplus_unit = 0 if r_total == s_total else (
            b_total - a_total) / (r_total - s_total)
        ### append features
        features = np.array([
            ## instance name to be used as index
            instance_file
            ### group 1: instance - price related
            ,
            b_mean  # average_bid_price_mean
            ,
            math.sqrt(b_var)  # average_bid_price_stddev
            ,
            b_skew  # average_bid_price_skewness
            ,
            b_kurt  # average_bid_price_kurtosis
            ,
            a_mean  # average_ask_price_mean
            ,
            math.sqrt(a_var)  # average_ask_price_stddev
            ,
            a_skew  # average_ask_price_skewness
            ,
            a_kurt  # average_ask_price_kurtosis
            ,
            bid_max  # average_bid_price_max
            ,
            ask_min  # average_ask_price_min
            ,
            mid_price  # mid_price
            ,
            ba_spread  # bid_ask_spread
            ,
            ba_spread / mid_price  # bid_ask_spread_over_mid_price
            ### group 2: instance - quantity related
            ,
            r_mean  # bid_bundle_size_mean
            ,
            math.sqrt(r_var)  # bid_bundle_size_stddev
            ,
            r_skew  # bid_bundle_size_skewness
            ,
            r_kurt  # bid_bundle_size_kurtosis
            ,
            s_mean  # ask_bundle_size_mean
            ,
            math.sqrt(s_var)  # ask_bundle_size_stddev
            ,
            s_skew  # ask_bundle_size_skewness
            ,
            s_kurt  # ask_bundle_size_kurtosis
            ### group 3: instance - quantity per resource related (measure of heterogeneity)
            # --> demand side
            ,
            rt_sum_mean  # total_demand_per_resource_mean
            ,
            math.sqrt(rt_sum_var)  # total_demand_per_resource_stddev
            ,
            rt_sum_skew  # total_demand_per_resource_skewness
            ,
            rt_sum_kurt  # total_demand_per_resource_kurtosis
            ,
            rt_mean_mean  # average_demand_per_resource_mean
            ,
            math.sqrt(rt_mean_var)  # average_demand_per_resource_stddev
            ,
            rt_mean_skew  # average_demand_per_resource_skewness
            ,
            rt_mean_kurt  # average_demand_per_resource_kurtosis
            ,
            rt_min_mean  # minimum_demand_per_resource_mean
            ,
            math.sqrt(rt_min_var)  # minimum_demand_per_resource_stddev
            ,
            rt_min_skew  # minimum_demand_per_resource_skewness
            ,
            rt_min_kurt  # minimum_demand_per_resource_kurtosis
            ,
            rt_max_mean  # maximum_demand_per_resource_mean
            ,
            math.sqrt(rt_max_var)  # maximum_demand_per_resource_stddev
            ,
            rt_max_skew  # maximum_demand_per_resource_skewness
            ,
            rt_max_kurt  # maximum_demand_per_resource_kurtosis
            # --> supply side
            ,
            st_sum_mean  # total_supply_per_resource_mean
            ,
            math.sqrt(st_sum_var)  # total_supply_per_resource_stddev
            ,
            st_sum_skew  # total_supply_per_resource_skewness
            ,
            st_sum_kurt  # total_supply_per_resource_kurtosis
            ,
            st_mean_mean  # average_supply_per_resource_mean
            ,
            math.sqrt(st_mean_var)  # average_supply_per_resource_stddev
            ,
            st_mean_skew  # average_supply_per_resource_skewness
            ,
            st_mean_kurt  # average_supply_per_resource_kurtosis
            ,
            st_min_mean  # minimum_supply_per_resource_mean
            ,
            math.sqrt(st_min_var)  # minimum_supply_per_resource_stddev
            ,
            st_min_skew  # minimum_supply_per_resource_skewness
            ,
            st_min_kurt  # minimum_supply_per_resource_kurtosis
            ,
            st_max_mean  # maximum_supply_per_resource_mean
            ,
            math.sqrt(st_max_var)  # maximum_supply_per_resource_stddev
            ,
            st_max_skew  # maximum_supply_per_resource_skewness
            ,
            st_max_kurt  # maximum_supply_per_resource_kurtosis
            ### group 4: instance - demand-supply balance related
            ,
            surplus_value_per_surplus_unit  # surplus_value_per_surplus_unit
            ,
            b_total / a_total  # demand_supply_ratio_value
            ,
            r_total / s_total  # demand_supply_ratio_quantity
            ,
            qratio_sum_mean  # demand_supply_ratio_total_quantity_per_resource_mean
            ,
            math.sqrt(
                qratio_sum_var
            )  # demand_supply_ratio_total_quantity_per_resource_stddev
            ,
            qratio_sum_skew  # demand_supply_ratio_total_quantity_per_resource_skewness
            ,
            qratio_sum_kurt  # demand_supply_ratio_total_quantity_per_resource_kurtosis
            ,
            qratio_mean_mean  # demand_supply_ratio_mean_quantity_per_resource_mean
            ,
            math.sqrt(
                qratio_mean_var
            )  # demand_supply_ratio_mean_quantity_per_resource_stddev
            ,
            qratio_mean_skew  # demand_supply_ratio_mean_quantity_per_resource_skewness
            ,
            qratio_mean_kurt  # demand_supply_ratio_mean_quantity_per_resource_kurtosis
            ,
            s_total - r_total  # surplus_quantity
            ,
            qsurplus_sum_mean  # surplus_total_quantity_per_resource_mean
            ,
            math.sqrt(
                qsurplus_sum_var)  # surplus_total_quantity_per_resource_stddev
            ,
            qsurplus_sum_skew  # surplus_total_quantity_per_resource_skewness
            ,
            qsurplus_sum_kurt  # surplus_total_quantity_per_resource_kurtosis
            ,
            qspread_mean  # quantity_spread_per_resource_mean
            ,
            math.sqrt(qspread_var)  # quantity_spread_per_resource_stddev
            ,
            qspread_skew  # quantity_spread_per_resource_skewness
            ,
            qspread_kurt  # quantity_spread_per_resource_kurtosis
            ,
            b_mean / a_mean  # ratio_average_price_bid_to_ask
            ,
            r_mean / s_mean  # ratio_bundle_size_bid_to_ask
        ])

        fpi = pd.DataFrame(
            features.reshape((1, features.shape[0])),
            columns=["instance",
                     *[x.name for x in Feature_Names]]).set_index('instance')

        with open(features_file, "a") as f:
            fpi.to_csv(f, header=False, float_format='%g')
Example #11
data_n = data[[0, 4, 8]]
print('Data info    :\n', data_n.info())
print('Data describe:\n', data_n.describe())
print('Data         :\n', data_n.head(), '\n')

# Assign column names
data_n.columns = ['Tempt', 'Gender', "Beats"]
print('Data         :\n', data_n.head(), '\n')

# 2. Plot a histogram of the Tempt column.
plt.figure(figsize = (6, 6))
sns.distplot(data_n.Tempt)
plt.show()

# 3. Compute summary statistics for Tempt.
print('\nSummary statistics:\n', stats.describe(data_n.Tempt))

# 4. Find mean, median and mode => comment on the result
mean_T   = data_n.Tempt.mean()
print('Mean:  ', mean_T)
median_T = data_n.Tempt.median()
print('Median:', median_T)
mode_T   = data_n.Tempt.mode()
print('Mode:  ', mode_T)

# Observation: the distribution is roughly normal

# 5. Tempt values at the [0, 1, 2, 2.5, 97.5, 98, 99, 100] percentiles
percentiles = np.array([0, 1, 2, 2.5, 97.5, 98, 99, 100])
x = np.percentile(data_n.Tempt, percentiles)
print('Percentiles[]:', x)
Example #12
from sklearn.cluster import KMeans
from sklearn import datasets
from sklearn.metrics import confusion_matrix, accuracy_score
from scipy import stats
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter

iris = datasets.load_iris()
df_data = iris.data
df_class = iris.target

clusters, values = np.unique(df_class, return_counts=True)
# or pd.DataFrame({ 'col': df_class })['col'].value_counts()
# or Counter(df_class)

stats.describe(df_data)

model = KMeans(len(clusters))
model.fit(df_data)

model.cluster_centers_

previsoes = model.labels_

confusion_matrix(df_class, previsoes)

accuracy_score(df_class, previsoes)

plt.scatter(df_data[previsoes == 0, 0],
            df_data[previsoes == 0, 1],
            c='green',
Example #13
def summary(self):
    self.to1d()
    self.summ = stats.describe(self._oneDdata)
    # print self.summ
    return
Example #14
def process_annotations(args):
    li = []

    for filename in args["source_csvs"]:
        df = pandas.read_csv(filename, index_col=None, header=0)
        li.append(df)

    source_csv = pandas.concat(li, ignore_index=True, sort=False, axis=0)

    # print(tabulate(source_csv, headers='keys', tablefmt='psql'))

    print(source_csv.columns)

    accept_time_col = source_csv["AcceptTime"]
    submit_time_col = source_csv["SubmitTime"]

    # Highlight those that are too short.
    suspiciously_quick = []
    for accept_time, submit_time in zip(accept_time_col, submit_time_col):
        accept_time = accept_time.replace("PDT", "").strip()
        submit_time = submit_time.replace("PDT", "").strip()

        mturk_date_format = "%a %b %d %H:%M:%S %Y"
        accept_time = datetime.datetime.strptime(accept_time,
                                                 mturk_date_format)
        submit_time = datetime.datetime.strptime(submit_time,
                                                 mturk_date_format)

        time_taken = submit_time - accept_time

        if time_taken.seconds / 60.0 < args["min_time"]:
            suspiciously_quick.append(True)
        else:
            suspiciously_quick.append(False)
    source_csv = source_csv.assign(too_quick=pandas.Series(suspiciously_quick))

    # Story summary
    token_length = []
    too_short = []
    for summary in source_csv["Answer.storySummary"]:
        num_tokens = len(summary.split(" "))
        token_length.append(num_tokens)
        if num_tokens < args["min_tokens"]:
            too_short.append(True)
        else:
            too_short.append(False)

    source_csv = source_csv.assign(
        num_summary_tokens=pandas.Series(token_length))
    source_csv = source_csv.assign(too_short=pandas.Series(too_short))

    genres = []
    for index, row in source_csv.iterrows():
        added = False
        for g in genre_categories:
            if row[g] == True and not added:
                added = True
                genre_name = g.split(".")[1]
                genres.append(genre_name)

        if not added:
            genres.append("other")
    source_csv = source_csv.assign(genre=pandas.Series(genres))

    source_csv.to_csv(f"{args['target']}_processed.csv")

    # print(f"Prefiltered: {len(source_csv)}")
    # source_csv = source_csv.loc[(source_csv['too_quick'] == True) & (source_csv['too_short'] == True)]
    # print(f"Postfiltered: {len(source_csv)}")

    stats_dict = defaultdict(dict)
    for col in stats_columns:
        figures = source_csv[col]

        nobs, minmax, mean, variance, skewness, kurtosis = stats.describe(
            figures)

        stats_dict[col]["nobs"] = nobs
        stats_dict[col]["min"] = minmax[0]
        stats_dict[col]["max"] = minmax[1]
        stats_dict[col]["mean"] = mean
        stats_dict[col]["variance"] = variance
        stats_dict[col]["skewness"] = skewness
        stats_dict[col]["kurtosis"] = kurtosis

        stats_dict[col]["25_perc"] = numpy.percentile(figures, 25)
        stats_dict[col]["median"] = numpy.percentile(figures, 50)
        stats_dict[col]["75_perc"] = numpy.percentile(figures, 75)

        triples = []
        for index, row in source_csv.iterrows():
            worker = row[worker_id_col]
            story = row[story_id_col]
            metrics_col = row[col]
            triples.append((str(worker), str(story), int(metrics_col)))
            print(worker, story, metrics_col)
        t = AnnotationTask(data=triples, distance=interval_distance)
        stats_dict[col]["krippendorff_alpha"] = t.alpha()
        stats_dict[col]["average_agreement"] = t.alpha()

    pandas.DataFrame.from_dict(
        stats_dict, orient="index").to_csv(f"{args['target']}_stats.csv")

    genre_dict = defaultdict(dict)
    genre_desc_count = source_csv[genre_column].value_counts(normalize=False)
    genre_desc = source_csv[genre_column].value_counts(normalize=True)
    for (n, v), (nc, vc) in zip(genre_desc.items(),
                                genre_desc_count.items()):
        genre_dict[n]["count"] = vc
        genre_dict[n]["proportion"] = v

    pandas.DataFrame.from_dict(
        genre_dict, orient="index").to_csv(f"{args['target']}_genres.csv")

    corr_cov_df = source_csv[stats_columns]

    for method in ('pearson', 'kendall', 'spearman'):
        correlation_df = corr_cov_df.corr(method=method)
        correlation_df.to_csv(f"{args['target']}_{method}_corr.csv")
    covariance_df = corr_cov_df.cov()

    covariance_df.to_csv(f"{args['target']}_cov.csv")

    print(source_csv.columns)
Example #15
    def __init__(self,data):

        self.N, (self.min, self.max),self.mean,self.variance,self.skewness,self.kurtosis = describe(data)
        self.median = median(data)
        self.std  = std(data)

        # quartiles
        self.q1   = percentile(data,25)
        self.q2   = self.median
        self.q3   = percentile(data,75)

        # percentiles
        self.p01  = percentile(data,1)
        self.p025 = percentile(data,2.5)
        self.p05  = percentile(data,5)
        self.p10  = percentile(data,10)
        self.p90  = percentile(data,90)
        self.p95  = percentile(data,95)
        self.p975 = percentile(data,97.5)
        self.p99  = percentile(data,99)
Example #16
import math

import seaborn as sns
from scipy.stats import binom
from scipy import stats

# n large enough, p = 0.5 ==> approximately a normal distribution
n    = 12
p    = 0.5
size = 1000
probs = [0.3, 0.5, 0.8] 
# data_binom = [binom.rvs(n = n, p = p, size = size) for p in probs]
data_binom = binom.rvs(n = n, p = p, size = size)

ax = sns.distplot(data_binom, kde = False, color = 'blue',
                  hist_kws = {'linewidth': 15, 'alpha':1})
ax.set(xlabel = 'Binomial Distribution', ylabel = 'Frequency')

print('\nSummary statistics:\n', stats.describe(data_binom))

# Coin-toss experiment: heads or tails
#    - Suppose a 'fair' coin is tossed 12 times. Compute the probability of getting 7 heads.
#
#    P(X = k) = n! / (k! * (n - k)!) * p^k * (1 - p)^(n - k)

k = 7
C_n_k = math.factorial(n)/(math.factorial(k) * math.factorial(n - k))
P_X_k =  C_n_k * math.pow(p, k) * math.pow(1 - p, n - k)

print('P(X = 7) = %.4f' %P_X_k)

# Using Python's library function
print('P(X = 7) = %.4f (PYTHON)' %binom.pmf(k, n, p, loc = 0))
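A quick hand check of the two computations above (no new assumptions, just the formula already stated):
# C(12, 7) = 12! / (7! * 5!) = 792
# P(X = 7) = 792 * 0.5**12 = 792 / 4096 ≈ 0.1934, which binom.pmf(7, 12, 0.5) reproduces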
Example #17
# Try to find correlations between columns in the test and train data
# Now we have a_test: 4041 x 859
# aX_household_train is 8203 x 859

temp = np.shape(bX_household_train)
temp = temp[1]
train_stat = np.zeros((temp, 6))
test_stat = np.zeros((temp, 6))
feature_corr = np.zeros(temp)
for j in range(0, temp):
    A = bX_household_train[bX_household_train.columns[j]]
    B = b_test[b_test.columns[j]]

    train_stat[j, :] = [
        describe(A).minmax[0],
        describe(A).minmax[1],
        describe(A).mean,
        describe(A).variance,
        describe(A).skewness,
        describe(A).kurtosis
    ]
    test_stat[j, :] = [
        describe(B).minmax[0],
        describe(B).minmax[1],
        describe(B).mean,
        describe(B).variance,
        describe(B).skewness,
        describe(B).kurtosis
    ]
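    # NOTE (assumption, not in the original excerpt): feature_corr is allocated above but never
    # filled in this snippet; one plausible completion is the correlation between the per-column
    # summary statistics of the train and test data:
    feature_corr[j] = np.corrcoef(train_stat[j, :], test_stat[j, :])[0, 1]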
Example #18
    times = {system: [] for system in systems}
    
    with open('results/times') as f:
        for line in f.readlines():
            values = line.split()
            times[values[0]].append(float(values[2]))
    
    for system in systems:
        folder = 'results/{}/'.format(system)
            
        totals = []
        maxs = []
            
        for file in os.listdir(folder):
            values = []
            with open(os.path.join(folder, file)) as f:
                values = [ float(v) for i, v in enumerate(f.readlines()[1:]) if i > 0 and float(v) != 0]
            totals.append(describe(values).mean)
            maxs.append(max(values))    
        ddata = describe(totals)
        dmaxs = describe(maxs)
        print('Simulation time for {}'.format(system))
        print('\tAvg: {:.0f}'.format(describe(times[system]).mean))
        print('\tMax (stdev): {:.1f}'.format(sqrt(describe(times[system]).variance)))
        print('Memory Consumption for {}'.format(system))
        print('\tAvg: {:.0f}'.format(ddata.mean))
        print('\tstdev: {:.1f}'.format(sqrt(ddata.variance)))
        print('\tMax (avg): {:.0f}'.format(dmaxs.mean))
        print('\tMax (stdev): {:.1f}'.format(sqrt(dmaxs.variance)))
        print()
Example #19
from scipy import stats


def describe_stats(x):
    test_data = x.columns[0:len(x.columns)]
    for i in test_data:
        my_stats = stats.describe(x[i])
        print(i, '\n', my_stats, '\n')