def calculateMeanEstimate(stats, alpha):
    results = {}
    for key, values in stats.items():
        results[key] = meanEstimate(values, alpha)

    # print(results)
    return results
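The helper meanEstimate is not defined in this snippet; a minimal stand-in, assuming alpha is a significance level and the estimate is the sample mean with a Student-t confidence interval:

import numpy as np
import scipy.stats

def meanEstimate(values, alpha):
    # Hypothetical stand-in: sample mean plus the half-width of a
    # (1 - alpha) Student-t confidence interval.
    values = np.asarray(values, dtype=float)
    mean = values.mean()
    half_width = scipy.stats.sem(values) * scipy.stats.t.ppf(1 - alpha / 2, df=len(values) - 1)
    return mean, half_width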
def plot_aq_stats():
    with open(base_filename + ".txt") as file_obj:
        stats = json.load(file_obj)
    for metric, y in stats.items():
        figures.append(plt.figure())
        plt.scatter(np.arange(len(y)), y)
        plt.xlim([0, len(y)])
        add_texts(texts[metric])
        plt_save(base_filename + "-" + metric)
Example #3
    def __get_curr_logl_stats(self):
        # compute an (R, p) pair for each comparison distribution
        # using powerlaw.Fit.distribution_compare
        logl_stats = {key:
                      {stat: val for stat, val in
                       zip(('R', 'p'),
                           self.curr_fit.distribution_compare(
                               'power_law', distro,
                               normalized_ratio=True))}
                      for key, distro in self._distros_to_compare.items()}
        return {('log-likelihoods', f"{dist}_{st}"): val for dist, stats
                in logl_stats.items() for st, val in stats.items()}
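For context, distribution_compare comes from the powerlaw package; with normalized_ratio=True it returns an (R, p) pair, where R is the normalized log-likelihood ratio (positive favors the power law) and p its significance. A standalone sketch on synthetic data:

import numpy as np
import powerlaw

data = np.random.pareto(2.5, size=1000) + 1  # heavy-tailed synthetic sample
fit = powerlaw.Fit(data)
# R > 0 favors the power law over the alternative; p is the significance of R.
R, p = fit.distribution_compare('power_law', 'lognormal', normalized_ratio=True)
print(f"R={R:.3f}, p={p:.3f}")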
def plot_execution_times(results: List[List[Dict[str, Any]]], cockroach_profiling: List[Dict[str, Any]], query_types: List[int], output_file: Optional[str]):
    with plt.style.context('ggplot'):
        
        fig, ax = plt.subplots()
        colors = plt.rcParams['axes.prop_cycle'].by_key()['color']

        stats = defaultdict(list)
        for query_type, profile_results in zip(query_types, cockroach_profiling):

            stats['Cockroach'].append(dict(avg=np.average(profile_results['latency']), std=np.std(profile_results['latency'])))

            optimizer_times = defaultdict(list)
            for results_list in results:
                for opt_result in results_list:
                    opt_name = opt_result['optimizer_name']
                    opt_times = [r['elapsed_time'] for r in opt_result['stats'] if r['query_type'] == query_type]
                    
                    optimizer_times[opt_name].extend(opt_times)

            for opt_name, times in optimizer_times.items():
                stats[opt_name].append(dict(avg=np.average(times), std=np.std(times)))

        num_series = len(stats)
        offset = -float(num_series) / 2.0 + 0.5
        for i, (label, results) in enumerate(sorted(stats.items(), key=lambda t: t[0])):
            xs = [i + offset * BAR_WIDTH + 1 for i in range(len(results))]
            
            averages = [r['avg'] for r in results]
            errors = [r['std'] for r in results]

            ax.bar(x=xs, height=averages, yerr=errors, width=BAR_WIDTH, capsize=2, color=colors[i], label=label)

            for x, y in zip(xs, averages):
                ax.annotate(f'{y:.2f}', xy=(x, y), xytext=(0, 2), textcoords='offset points')  # small fixed offset above each bar

            offset += 1

        ax.set_xticks(list(range(1, len(cockroach_profiling) + 1)))

        ax.set_xlabel('Query Type')
        ax.set_ylabel('Execution Time (ms)')
        ax.set_title('Average Query Execution Times')

        ax.legend(fontsize='x-small')

        if output_file is not None:
            plt.savefig(output_file)
        else:
            plt.show()
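A minimal sketch of the input shapes this function assumes; the field names mirror the ones it reads, while the values and the module-level BAR_WIDTH are made up:

BAR_WIDTH = 0.2  # module-level constant the function relies on

cockroach_profiling = [{'latency': [12.1, 13.4, 11.8]}]  # one entry per query type
results = [[{
    'optimizer_name': 'baseline',
    'stats': [{'query_type': 0, 'elapsed_time': 10.5}],
}]]
plot_execution_times(results, cockroach_profiling, query_types=[0], output_file=None)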
Example #5
    def _df_stats(self):
        """
        Compute the stats on aggregated values
        """
        df = self._orig_df
        stats = self._stats.copy()
        tag_cols = self._restrict_cols(self._stat_tag_cols, df)

        # Specific handling for the mean, as it has to be handled per group
        special_stats = {
            stat
            for stat in ('mean', 'sem', 'std')
            if stat in stats and stats[stat] is None
        }
        if special_stats:
            df_mean = self._df_mean(df, special_stats)
            for stat in special_stats:
                stats.pop(stat)
        else:
            df_mean = df_make_empty_clone(df)
            df_mean.drop(columns=self._agg_cols, inplace=True)

        # Create a DataFrame with stats for the groups
        funcs = {name: func or name for name, func in stats.items()}
        if funcs:
            grouped = df.groupby(tag_cols, observed=True, sort=False)
            df = grouped[self._val_col].agg(**funcs).reset_index()
            # Transform the newly created stats columns into rows
            df = self._melt(df)
        else:
            df = pd.DataFrame()

        df = pd.concat([df, df_mean])
        df = self._df_remove_tweak_cols(df)

        unit_col = self._unit_col
        default_unit = ''
        if unit_col in df:
            df[unit_col] = df[unit_col].fillna(default_unit)
        else:
            df[unit_col] = default_unit

        for stat, unit in self._STATS_UNIT.items():
            df.loc[df[self._stat_col] == stat, unit_col] = unit.name

        return df
Example #6
def generate_results_string(target, exp_name, results, latexify, drop=None):
    stats = results[exp_name]["results"][target]
    print(f"Filling template values for {exp_name}")
    tokens = []
    prepad = False
    for metric, values in stats.items():
        mean, std = values
        if drop and metric in drop:
            continue
        print(f"{metric}: {mean} ({std})")
        if latexify:
            str_tokens = ["&$", f"{mean}_{{\\pm{std}}}$"]
            if prepad:
                str_tokens.insert(1, r"\prepad")
            tokens.append(" ".join(str_tokens))
        else:
            tokens += [f"{mean}<sub>({std})</sub>"]
    return small_font_str(tokens)
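A usage sketch with a stand-in for the undefined small_font_str and a made-up results dictionary, just to show the expected nesting:

def small_font_str(tokens):  # stand-in; the real helper is not shown here
    return " ".join(tokens)

results = {"exp1": {"results": {"acc": {"f1": (0.91, 0.02), "auc": (0.95, 0.01)}}}}
print(generate_results_string("acc", "exp1", results, latexify=True))
# returns: &$ 0.91_{\pm0.02}$ &$ 0.95_{\pm0.01}$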
Example #7
def main():
    global USE_PERCENTAGE, USE_LABELS, SELECTION
    parser = get_parser()
    args = parser.parse_args()

    treatments = args.treatments.split(",")

    USE_PERCENTAGE = args.use_percentage

    USE_LABELS = args.use_labels

    SELECTION = args.selection

    results = dict()
    lst_stats = []
    for treatment in treatments:
        if "." in treatment:
            treatment, SELECTION = treatment.split(".")
        else:
            SELECTION = args.selection
        #con, dfs = get_con_and_dfs(treatment)
        con = ALL_CONS[treatment]
        dfs = ALL_DFS
        stats = generate_stats(treatment, con, dfs)
        for k, v in stats.items():
            lst = results.get(k, [])
            lst.append(v)
            results[k] = lst

    print("\n\n")
    for k, items in results.items():
        columns = None
        if items and isinstance(items[0], (tuple, list, set, dict)):
            columns = list(items[0])
        df = pd.DataFrame(items, index=treatments, columns=columns)

        if args.transpose:
            df = df.T
        print(k)
        if args.use_latex:
            print(df.T.to_latex())
        else:
            print(df.T)
    print("DONE")
def hypothesis_1_3_analysis(stats, boundary=0.2):
    new_stats = {}
    for k_main, v_main in stats.items():
        these_stats = []
        for k, v in v_main.items():
            val = 0
            for H in stats[k_main][k]["y"]:
                print(H)
                if H >= boundary:
                    val += 1
            these_stats.append(val)
            print("------------")
        new_stats[k_main] = these_stats

    print("H(I_3) < H(I_c) - p = {}".format(scipy.stats.ttest_ind(new_stats["c"],new_stats["3"],equal_var=False)))
    print("H(I_3) < H(I_1) - p = {}".format(scipy.stats.ttest_ind(new_stats["1"],new_stats["3"],equal_var=False)))
    print("H(I_3) < H(I_2) - p = {}".format(scipy.stats.ttest_ind(new_stats["2"],new_stats["3"],equal_var=False)))
    print("H(I_2) < H(I_1) - p = {}".format(scipy.stats.ttest_ind(new_stats["2"],new_stats["1"],equal_var=False)))
    return new_stats
def hypothesis_3_1_plot(stats):
    x = []
    y = []
    for k, v in stats.items():
        for stat in v:
            for s in stat:
                x.append(int(s[1]))
                y.append(s[0])

    m1, c1, r1, p1, stderr1 = scipy.stats.linregress(x, y)
    plt.scatter(x, y)
    plt.plot([0, 1024], [c1, c1 + m1 * 1024], color="r")  # regression line across the full x-range
    plt.title("Experiment 3.1")
    plt.ylabel("Average Dove Score Per Transaction (D)")
    plt.xlabel("Altruistic Punishment value (A)")
    plt.xlim([0,1024])
    plt.ylim([0,2])
    plt.show()

    print("m = {}, c = {}, r = {}, p = {}, stderr = {}".format(m1,c1,r1,p1,stderr1))
Example #10
def pairwise_regress_stats(data, features):
    """Return R², intercept, slope, Pval."""
    N = len(features)
    # Attributes and indices to get the statistics:
    stats = {
        'R2': ('rsquared', None),
        'const': ('params', 0),
        'slope': ('params', 1),
        'Pval': ('f_pvalue', None)
    }
    mats = {stat: np.full((N, N), np.nan) for stat in stats}  # np.nan: np.NaN was removed in NumPy 2.0
    for j, fty in enumerate(features[:-1]):
        for i, ftx in enumerate(features[j + 1:], start=j + 1):
            fit = sm.OLS(data[fty], sm.add_constant(data[[ftx]])).fit()
            for stat, (attr, idx) in stats.items():
                value = getattr(fit, attr)
                if idx is not None:
                    value = value[idx]
                mats[stat][j, i] = value  # Upper triangular.
    # TODO: add the 1 against 2 combinations
    return mats
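A quick usage sketch on random data (the column names are made up); each returned matrix is upper triangular, with entry [j, i] holding the statistic for regressing features[j] on features[i]:

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
data = pd.DataFrame(rng.normal(size=(100, 3)), columns=['a', 'b', 'c'])
mats = pairwise_regress_stats(data, features=['a', 'b', 'c'])
print(mats['R2'])  # NaN below the diagonal, R-squared values above it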
Example #11
def build_dataset(stats, template, destroot, workdir, fnameformat, year=0, month=0):
    """Load in raster files in chunks to reduce memory demands,
    calculate statistics, and save to file.

    Keyword arguments:
    stats -- dictionary with numpy arrays to store as datasets
    template -- a gdal dataset to be used as template to create new ones
    destroot -- the folder to store the final zip file in
    workdir -- some scratch location with enough free disk space
    fnameformat -- flag that determines the filename formatting
    year -- optional argument used to supply the year for formatting
    month -- optional argument used to supply month for formatting

    Returns: None.
    """
    # generate new file name
    if fnameformat == 'global':
        descriptor = "2000-2014"
    elif fnameformat == 'growyearly':
        descriptor = "{:04d}-{:04d}".format(year, year + 1)
    elif fnameformat == 'calyearly':
        descriptor = "{:04d}".format(year)
    elif fnameformat == 'monthly':
        descriptor = month
    else:
        raise ValueError("unknown fnameformat: {0}".format(fnameformat))
    # create zip root
    ziproot = os.path.join(workdir, "fpar.{0}.stats.aust".format(descriptor))
    check_or_create_target_dir(ziproot)

    # Write the results to raster format with appropriate filenames
    for stattype, statarr in stats.items():
        outfile = os.path.join(ziproot, 'data', "fpar.{0}.{1}.aust.tif".format(descriptor, stattype))
        write_array_to_raster(outfile, statarr, template)

    # Write the metadata.json file
    write_metadatadotjson(ziproot, fnameformat, year, month)
    # Zip up the dataset
    zip_dataset(ziproot, destroot)

    # Clean up the directories
    shutil.rmtree(ziproot)
Example #12
def main():
    parser = argparse.ArgumentParser(
        description='LOL HI THERE',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--phi-hat-threshold',
                        type=float,
                        default=1 - 1e-2,
                        help='Blah')
    parser.add_argument('--quantile', type=float, default=0.5, help='Blah')
    parser.add_argument('--print-bad-data', action='store_true')
    parser.add_argument('in_ssm_fn')
    parser.add_argument('in_params_fn')
    parser.add_argument('out_params_fn')
    args = parser.parse_args()

    np.set_printoptions(linewidth=400,
                        precision=3,
                        threshold=sys.maxsize,
                        suppress=True)
    np.seterr(divide='raise', invalid='raise', over='raise')

    ssms = inputparser.load_ssms(args.in_ssm_fn)
    params = inputparser.load_params(args.in_params_fn)
    ssms = inputparser.remove_garbage(ssms, params['garbage'])

    bad_vids, bad_samp_prop = _remove_bad(ssms, args.phi_hat_threshold,
                                          args.quantile, args.print_bad_data)
    bad_ssm_prop = len(bad_vids) / len(ssms)
    if len(bad_vids) > 0:
        params['garbage'] = common.sort_vids(params['garbage'] + bad_vids)
        with open(args.out_params_fn, 'w') as F:
            json.dump(params, F)

    stats = {
        'bad_ssms': common.sort_vids(bad_vids),
        'bad_samp_prop': '%.3f' % bad_samp_prop,
        'bad_ssm_prop': '%.3f' % bad_ssm_prop,
    }
    for K, V in stats.items():
        print('%s=%s' % (K, V))
Example #13
def get_class(stats, class_prob, item):
    """
    :param stats: statystyki w słowniku z klasami.
                    Każde pole posiada k atrybutów zawierających mean i std
    :param item: przedmiot do klasyfikacji. Ostatni element to klasa przewidywana
    :return: klasa wyliczona na podstawie naivnego bayesa
    """
    prob = dict()
    result = []
    for key, atr in stats.items():
        for x in range(len(item) - 1):
            prob.setdefault(key, []).append(
                scipy.stats.norm(stats[key][x][0],
                                 stats[key][x][1]).pdf(item[x]))
        buff = 1
        for x in prob[key]:
            buff *= x
        result.append([key, buff * class_prob[key]])
    best = result[0]  # avoid shadowing the built-in max
    for x in result:
        if x[1] > best[1]:
            best = x
    return best[0]
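A minimal sketch of the data layout this classifier expects: stats maps each class to per-attribute (mean, std) pairs, class_prob holds the priors, and the item's last element is the label slot (all values here are made up):

stats = {
    'setosa':     [(5.0, 0.35), (3.4, 0.38)],
    'versicolor': [(5.9, 0.52), (2.8, 0.31)],
}
class_prob = {'setosa': 0.5, 'versicolor': 0.5}
item = [5.1, 3.5, None]  # last element is the (unused) label slot
print(get_class(stats, class_prob, item))  # -> 'setosa'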
def hypothesis_3_2_analysis(stats):
    stat_arrays = [
            "avg_dove_score",
            "avg_hawk_score",
            "avg_score" ,
            "avg_dove_sd",
            "avg_hawk_sd",
            "avg_score_sd",
    ]
    new_stats = {}
    stat_compilation = {}
    for k, v in stats.items():
        new_stats[k] = {}
        for stat in stat_arrays:
            new_stats[k][stat] = []
        for k_n, v_n in stats[k].items():
            for stat in stat_arrays:
                new_stats[k][stat].append(stats[k][k_n][stat])

    for k, v in new_stats.items():
        stat_compilation[k] = {}
        for stat in stat_arrays:
            stat_compilation[k][stat] = np.mean(new_stats[k][stat])

    print("k = c, mean = {}, sd = {}".format(np.mean(new_stats["c"]["avg_dove_score"]),np.std(new_stats["c"]["avg_dove_score"])))
    print("k = 1, mean = {}, sd = {}".format(np.mean(new_stats["1"]["avg_dove_score"]),np.std(new_stats["1"]["avg_dove_score"])))
    print("k = 2, mean = {}, sd = {}".format(np.mean(new_stats["2"]["avg_dove_score"]),np.std(new_stats["2"]["avg_dove_score"])))
    print("k = 3, mean = {}, sd = {}".format(np.mean(new_stats["3"]["avg_dove_score"]),np.std(new_stats["3"]["avg_dove_score"])))

    print("ANOVA: p = {}".format(scipy.stats.f_oneway(new_stats["3"]["avg_dove_score"],new_stats["2"]["avg_dove_score"],new_stats["1"]["avg_dove_score"],new_stats["c"]["avg_dove_score"])[1]))

    print("T-test: D(I_3) > D(I_c) - p = {}".format(scipy.stats.ttest_ind(new_stats["c"]["avg_dove_score"],new_stats["3"]["avg_dove_score"],equal_var=False)[1]))
    print("T-test: D(I_3) > D(I_1) - p = {}".format(scipy.stats.ttest_ind(new_stats["1"]["avg_dove_score"],new_stats["3"]["avg_dove_score"],equal_var=False)[1]))
    print("T-test: D(I_3) > D(I_2) - p = {}".format(scipy.stats.ttest_ind(new_stats["2"]["avg_dove_score"],new_stats["3"]["avg_dove_score"],equal_var=False)[1]))
    print("T-test: D(I_2) > D(I_1) - p = {}".format(scipy.stats.ttest_ind(new_stats["1"]["avg_dove_score"],new_stats["2"]["avg_dove_score"],equal_var=False)[1]))

    return new_stats, stat_compilation
def hypothesis_3_3_analysis(stats):
    new_stats = {}
    for k, v in stats.items():
        new_stats[k] = {}
        new_stats[k]["strategy_breakdown"] = []
        for k_n, v_n in stats[k].items():
            new_stats[k]["strategy_breakdown"].append([])
            for n in range(0, len(stats[k][k_n]["strategy_breakdown"]), 5):
                new_stats[k]["strategy_breakdown"][k_n].append(stats[k][k_n]["strategy_breakdown"][n])

        new_stats[k]["H"] = []
        new_stats[k]["Hawks"] = []
        for k_n, v_n in stats[k].items():
            new_stats[k]["Hawks"].append([])
            h_stat = 0
            for breakdown in new_stats[k]["strategy_breakdown"][k_n]:
                new_stats[k]["Hawks"][k_n].append(breakdown[1])
                if breakdown[1] > 20:
                    h_stat += 1
            new_stats[k]["H"].append(h_stat)



    return new_stats
    dataset_cut = {"X": X[cens == 0], "w": w[cens == 0], "y": y_cut[cens == 0]}

    # metrics for the dataset, evaluated as dichotomous outcome
    for _ in tqdm(range(args.bootstrap_samples)):

        idxs = bootstrap_dataset(dataset_cut)
        arr_ben, arr_noben = bucket_arr(pred_rr[cens == 0][idxs],
                                        y_cut[cens == 0][idxs],
                                        w[cens == 0][idxs])
        stats["arr_ben"].append(arr_ben)
        stats["arr_noben"].append(arr_noben)
        stats["c_stat"].append(
            c_statistic(pred_rr[cens == 0][idxs], y_cut[cens == 0][idxs],
                        w[cens == 0][idxs]))

    for k, v in stats.items():
        print(f"{k}: {[np.round(u, 2) for u in get_range(v)]}")

    # metrics for the dataset, evaluated on the entire sample
    for _ in tqdm(range(args.bootstrap_samples)):

        idxs = bootstrap_dataset(dataset_all)
        stats["rmst"].append(
            decision_value_rmst(pred_rr[idxs], y[idxs], w[idxs], t[idxs],
                                args.cens_time))
    slope, intercept, _, _ = calibration(pred_rr[idxs],
                                              y[idxs],
                                              w[idxs],
                                              t[idxs],
                                              args.cens_time,
                                              n_bins=5)
Example #17
                                denom,
                                'upper90':
                                predictions[int(len(predictions) * 0.95)] /
                                denom,
                                'upper95':
                                predictions[int(len(predictions) * 0.975)] /
                                denom
                            }
                    else:
                        stats[state] = {
                            # 'sortorder': 0,
                            'positive': 0,
                            'negative': 0,
                        }
                if R0 is None and CFR is None:
                    stats_sorted = sorted(stats.items(),
                                          key=lambda x: -x[1].get('median', 0))
                else:
                    state_order = [
                        x[0] for x in allstats["None,None,{}".format(
                            norm_by_population)]
                    ]
                    stats_sorted = sorted(
                        stats.items(), key=lambda x: state_order.index(x[0]))

                allstats["{},{},{}".format(R0, CFR,
                                           norm_by_population)] = stats_sorted

    # Update webpage
    dateint = max(d['date'] for d in data)
    datestr = "{}-{}-{}".format(
Example #18
    # Task 59 is for dataset 61, the iris dataset, which is good for numerical tests;
    # Task 60 is for dataset 62, a zoo dataset, which contains a lot of categorical information.
    task = tasks.get_task(60)
    data = task.get_dataset()
    X, y, categorical = data.get_data(target=data.default_target_attribute,
                                      return_categorical_indicator=True)

    # We want to do cross-validation for some landmarkers, so we take a cv-10 fold.
    # We need to unroll the generator into a list because it is iterated over multiple times.
    folds = list(next(task.iterate_repeats()))

    simple = simple_metafeatures(X, y, categorical)
    stats = statistical_metafeatures(X, y, categorical)
    info = information_theoretic_metafeatures(X, y, categorical)
    landmarkers = landmarker_metafeatures(X, y, categorical, folds)

    for key, val in simple.items():
        print("{}: {}".format(key, val))

    for key, val in stats.items():
        print("{}: {}".format(key, val))

    for key, val in info.items():
        print("{}: {}".format(key, val))

    for key, val in landmarkers.items():
        print("{}: {}".format(key, val))

    print("Total of {} metafeatres".format(
        len(simple) + len(stats) + len(info) + len(landmarkers)))
Example #19
                    stats[state] = {
                        "positive": get_positive(allrecords[-1]),
                        "deaths": deaths,
                        "lower95": predictions[int(len(predictions) * 0.025)],
                        "lower50": predictions[int(len(predictions) * 0.25)],
                        "median": predictions[int(len(predictions) * 0.50)],
                        "upper50": predictions[int(len(predictions) * 0.75)],
                        "upper95": predictions[int(len(predictions) * 0.975)],
                    }
                else:
                    stats[state] = {
                        "positive": 0,
                    }
            stats_sorted = sorted(
                stats.items(),
                key=lambda x:
                (x[1].get("deaths", 0), x[1].get("positive", 0), x[0]),
                reverse=True,
            )
            allstats["{},{}".format(R0, CFR)] = stats_sorted

    # Update webpage
    dateint = max(x["date"] for x in allrecords)
    datestr = "{}-{}-{}".format(
        str(dateint)[:4],
        str(dateint)[4:6],
        str(dateint)[6:8])

    with open("index_template.md", "r") as f:
        template = f.read()
Example #21
def main():
    parser = argparse.ArgumentParser(
        description=
        'Find variants with likely incorrect var_read_prob by comparing model with provided var_read_prob to haploid (LOH) model using Bayes factors',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        '--logbf-threshold',
        type=float,
        default=10.,
        help=
        'Logarithm of Bayes factor threshold at which the haploid model is accepted as more likely model than the model using the provided var_read_prob'
    )
    parser.add_argument('--verbose',
                        action='store_true',
                        help='Print debugging messages')
    parser.add_argument(
        '--ignore-existing-garbage',
        action='store_true',
        help=
        'Ignore any existing garbage variants listed in in_params_fn and test all variants. If not specified, any existing garbage variants will be kept as garbage and not tested again.'
    )
    parser.add_argument('--action',
                        choices=('add_to_garbage', 'modify_var_read_prob'),
                        default='add_to_garbage')
    parser.add_argument('--var-read-prob-alt', type=float, default=1.)
    parser.add_argument('in_ssm_fn', help='Input SSM file with mutations')
    parser.add_argument(
        'in_params_fn',
        help=
        'Input params file listing sample names and any existing garbage mutations'
    )
    parser.add_argument(
        'out_ssm_fn',
        help='Output SSM file with modified list of garbage mutations')
    parser.add_argument(
        'out_params_fn',
        help='Output params file with modified list of garbage mutations')
    args = parser.parse_args()

    np.set_printoptions(linewidth=400,
                        precision=3,
                        threshold=sys.maxsize,
                        suppress=True)
    np.seterr(divide='raise', invalid='raise', over='raise')

    if args.ignore_existing_garbage:
        variants, params = inputparser.load_ssms_and_params(args.in_ssm_fn,
                                                            args.in_params_fn,
                                                            remove_garb=False)
        params['garbage'] = []
    else:
        variants, params = inputparser.load_ssms_and_params(
            args.in_ssm_fn, args.in_params_fn)

    bad_vids, bad_samp_prop = _remove_bad(variants, args.logbf_threshold,
                                          args.var_read_prob_alt, args.verbose)
    bad_ssm_prop = len(bad_vids) / len(variants)

    if args.action == 'add_to_garbage':
        params['garbage'] = common.sort_vids(
            set(bad_vids) | set(params['garbage']))
    elif args.action == 'modify_var_read_prob':
        for vid in bad_vids:
            variants[vid]['omega_v'][:] = args.var_read_prob_alt
    else:
        raise Exception('Unknown action: %s' % args.action)

    inputparser.write_ssms(variants, args.out_ssm_fn)
    with open(args.out_params_fn, 'w') as F:
        json.dump(params, F)

    stats = {
        'num_bad_ssms': len(bad_vids),
        'bad_ssms': common.sort_vids(bad_vids),
        'bad_samp_prop': '%.3f' % bad_samp_prop,
        'bad_ssm_prop': '%.3f' % bad_ssm_prop,
    }
    for K, V in stats.items():
        print('%s=%s' % (K, V))
def collectStats(collectedStats, stats):
    for key, value in stats.items():
        if key not in collectedStats:
            collectedStats[key] = np.array([])
        collectedStats[key] = np.append(collectedStats[key], value)
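A usage sketch; note that np.append copies the whole array on every call, so for long runs accumulating into plain lists and converting once at the end is cheaper:

collected = {}
for epoch_stats in ({'loss': 0.9, 'acc': 0.6}, {'loss': 0.7, 'acc': 0.7}):
    collectStats(collected, epoch_stats)
print(collected['loss'])  # -> [0.9 0.7]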
Example #23
def create_statistics_dictionary(df, aggregate_df):
    """
    Function is used to create lists that can be used in creating a box-and-whisker
    plot using the bxp function.
    
    Parameters
    ----------
    df (dataframe): dataframe that shows each site's statistical values (of a 
        particular measurement set, for the most popular unit_concept)
    
    aggregate_df (dataframe): dataframe that contains information (of a particular measurement
        set, for the most popular unit_concept) across all of the applicable sites
        
    Returns
    -------
    lst (list): contains a series of dictionaries. each dictionary is used to represent one
        site (or the information across all of the sites). the key:value pair of this dictionary
        has a particular statistic:value (e.g. ninetieth percentile:150)
    
    names (list): list of the HPO names that make up the rows of the dataframe
    """
    stats = {}

    tot_min, tot_max = float('inf'), float('-inf')

    for idx, row in df.iterrows():
        hpo = row['src_hpo_id']

        stats[hpo] = {}
        stats[hpo]['mean'] = row['mean']
        stats[hpo]['whislo'] = row['tenth_perc']
        stats[hpo]['q1'] = row['first_quartile']
        stats[hpo]['med'] = row['median']
        stats[hpo]['q3'] = row['third_quartile']
        stats[hpo]['whishi'] = row['ninetieth_perc']

        minimum, maximum = row['min'], row['max']

        if minimum < tot_min:
            tot_min = minimum
        if maximum > tot_max:
            tot_max = maximum

        stats[hpo]['fliers'] = np.array([row['min'], row['max']])

    stats['aggregate_info'] = {}
    stats['aggregate_info']['mean'] = aggregate_df['total_mean'].iloc[0]
    stats['aggregate_info']['whislo'] = aggregate_df['total_tenth_perc'].iloc[
        0]
    stats['aggregate_info']['q1'] = aggregate_df['total_first_quartile'].iloc[
        0]
    stats['aggregate_info']['med'] = aggregate_df['total_median'].iloc[0]
    stats['aggregate_info']['q3'] = aggregate_df['total_third_quartile'].iloc[
        0]
    stats['aggregate_info']['whishi'] = aggregate_df[
        'total_ninetieth_perc'].iloc[0]
    # use the global extremes gathered above, not the last row's min/max
    stats['aggregate_info']['fliers'] = np.array([tot_min, tot_max])

    lst = []
    names = []

    for key, value in stats.items():
        names.append(key)
        lst.append(value)

    return lst, names
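The per-site dictionaries use the key names matplotlib's Axes.bxp expects (med, q1, q3, whislo, whishi, mean, fliers), so plotting is direct; a sketch assuming df and aggregate_df are shaped as the docstring describes:

import matplotlib.pyplot as plt

lst, names = create_statistics_dictionary(df, aggregate_df)
fig, ax = plt.subplots()
ax.bxp(lst, showmeans=True)  # draws boxes straight from the precomputed stats
ax.set_xticklabels(names, rotation=45)
plt.show()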
Example #24
def run(config, _id, logger):
    ''' Top-level function for running experiment.

    Args:
        config (dict): Parameters for modeling, execution levels, and error calculations loaded from config.yaml
        _id (str): Unique id for each processing run generated from current time
        logger (obj): Logger obj from logging module

    Returns:
        None

    '''

    stats = {"true_positives": 0, "false_positives": 0, "false_negatives": 0}

    with open("labeled_anomalies.csv", "rU") as f:
        reader = csv.DictReader(f)

        with open("results/%s.csv" % _id, "a") as out:

            writer = csv.DictWriter(
                out, config.header)  # line by line results written to csv
            writer.writeheader()

            for i, anom in enumerate(reader):
                if reader.line_num >= 1:

                    anom['run_id'] = _id
                    logger.info("Stream # %s: %s" %
                                (reader.line_num - 1, anom['chan_id']))
                    model = None

                    X_train, y_train, X_test, y_test = helpers.load_data(anom)

                    # Generate or load predictions
                    # ===============================
                    y_hat = []
                    if config.predict:
                        model = models.get_model(anom,
                                                 X_train,
                                                 y_train,
                                                 logger,
                                                 train=config.train)
                        y_hat = models.predict_in_batches(
                            y_test, X_test, model, anom)

                    else:
                        y_hat = [
                            float(x) for x in list(
                                np.load(
                                    os.path.join("data", config.use_id,
                                                 "y_hat", anom["chan_id"] +
                                                 ".npy")))
                        ]

                    # Error calculations
                    # ====================================================================================================
                    e = err.get_errors(y_test, y_hat, anom, smoothed=False)
                    e_s = err.get_errors(y_test, y_hat, anom, smoothed=True)

                    anom["normalized_error"] = np.mean(e) / np.ptp(y_test)
                    logger.info("normalized prediction error: %s" %
                                anom["normalized_error"])

                    # Error processing (batch)
                    # =========================

                    E_seq, E_seq_scores = err.process_errors(
                        y_test, y_hat, e_s, anom, logger)
                    anom['scores'] = E_seq_scores

                    anom = err.evaluate_sequences(E_seq, anom)
                    anom["num_values"] = y_test.shape[
                        0] + config.l_s + config.n_predictions

                    for key, value in stats.items():
                        stats[key] += anom[key]

                    helpers.anom_stats(stats, anom, logger)
                    writer.writerow(anom)

    helpers.final_stats(stats, logger)
Example #25
def run(config, _id, logger):

    stats = {"true_positives": 0, "false_positives": 0, "false_negatives": 0}

    with open("labeled_anomalies.csv", "rU") as f:
        reader = csv.DictReader(f)

        with open("results/%s.csv" % _id, "a") as out:

            writer = csv.DictWriter(out, config.header)
            writer.writeheader()

            for i, anom in enumerate(reader):
                if reader.line_num >= 1:

                    anom['run_id'] = _id
                    logger.info("Поток # %s: %s" %
                                (reader.line_num - 1, anom['chan_id']))
                    model = None

                    X_train, y_train, X_test, y_test = helpers.load_data(anom)

                    # Generate or load predictions
                    # ===============================
                    y_hat = []
                    if config.predict:
                        model = models.get_model(anom,
                                                 X_train,
                                                 y_train,
                                                 logger,
                                                 train=config.train)
                        y_hat = models.predict_in_batches(
                            y_test, X_test, model, anom)

                    else:
                        y_hat = [
                            float(x) for x in list(
                                np.load(
                                    os.path.join("data", config.use_id,
                                                 "y_hat", anom["chan_id"] +
                                                 ".npy")))
                        ]

                    # Error calculations
                    # ====================================================================================================
                    e = err.get_errors(y_test, y_hat, anom, smoothed=False)
                    e_s = err.get_errors(y_test, y_hat, anom, smoothed=True)

                    anom["normalized_error"] = np.mean(e) / np.ptp(y_test)
                    logger.info("нормализованная ошибка предсказания: %s" %
                                anom["normalized_error"])

                    # Error processing (batch)
                    # =========================

                    E_seq, E_seq_scores = err.process_errors(
                        y_test, y_hat, e_s, anom, logger)
                    anom['scores'] = E_seq_scores

                    anom = err.evaluate_sequences(E_seq, anom)
                    anom["num_values"] = y_test.shape[0]

                    for key, value in stats.items():
                        stats[key] += anom[key]

                    helpers.anom_stats(stats, anom, logger)
                    writer.writerow(anom)

    helpers.final_stats(stats, logger)