def calculateMeanEstimate(stats, alpha):
    """Apply ``meanEstimate`` to every entry of ``stats``.

    :param stats: mapping of key -> values; each ``values`` object is passed
        unchanged to ``meanEstimate``.
    :param alpha: forwarded as the second argument of ``meanEstimate``.
    :return: new dict with the same keys as ``stats`` whose values are the
        corresponding ``meanEstimate(values, alpha)`` results.
    """
    return {
        name: meanEstimate(samples, alpha)
        for name, samples in stats.items()
    }
def plot_aq_stats():
    """Draw and save one scatter plot per metric stored in the stats file.

    Reads ``base_filename + ".txt"`` as JSON (a mapping of metric name to a
    sequence of values).  For each metric a new figure is created and
    recorded in the module-level ``figures`` list, the values are plotted
    against their position, the annotations registered under
    ``texts[metric]`` are added and the figure is saved through ``plt_save``
    as ``base_filename + "-" + metric``.
    """
    stats_path = base_filename + ".txt"
    with open(stats_path) as stats_file:
        series_by_metric = json.load(stats_file)

    for metric in series_by_metric:
        values = series_by_metric[metric]
        n_points = len(values)

        figures.append(plt.figure())
        # x is simply the sample position 0..n-1.
        plt.scatter(np.arange(n_points), values)
        plt.xlim([0, n_points])
        add_texts(texts[metric])
        plt_save(base_filename + "-" + metric)
def __get_curr_logl_stats(self):
    """Compare the current fit's power law against each candidate distribution.

    For every ``(key, distro)`` pair in ``self._distros_to_compare`` this
    calls ``self.curr_fit.distribution_compare('power_law', distro,
    normalized_ratio=True)`` and labels the first two returned values
    ``R`` and ``p``.

    :return: flat dict keyed by ``('log-likelihoods', '<key>_R')`` and
        ``('log-likelihoods', '<key>_p')``.
    """
    flat_stats = {}
    for name, candidate in self._distros_to_compare.items():
        comparison = self.curr_fit.distribution_compare(
            'power_law', candidate, normalized_ratio=True)
        # zip() pairs the labels with the leading values of the result.
        for label, value in zip(('R', 'p'), comparison):
            flat_stats[('log-likelihoods', f"{name}_{label}")] = value
    return flat_stats
def plot_execution_times(results: List[List[Dict[str, Any]]],
                         cockroach_profiling: List[Dict[str, Any]],
                         query_types: List[int],
                         output_file: Optional[str]):
    """Plot a grouped bar chart of average execution time per query type.

    One series is drawn for ``'Cockroach'`` (from ``cockroach_profiling``)
    and one per optimizer name found in ``results``.  Bars show the mean,
    error bars the standard deviation, and every bar is annotated with its
    mean value.

    Args:
        results: lists of optimizer results; each result is a dict with an
            ``'optimizer_name'`` and a ``'stats'`` list whose entries carry
            ``'query_type'`` and ``'elapsed_time'``.
        cockroach_profiling: one dict per query type with a ``'latency'``
            collection; paired position-wise with ``query_types``.
        query_types: the query type identifiers, in the same order as
            ``cockroach_profiling`` (the two are zipped together).
        output_file: path handed to ``plt.savefig``; when ``None`` the
            figure is shown interactively instead.
    """
    with plt.style.context('ggplot'):
        _fig, ax = plt.subplots()
        colors = plt.rcParams['axes.prop_cycle'].by_key()['color']

        # series label -> list of {'avg', 'std'} dicts, one per query type
        stats = defaultdict(list)
        for query_type, profile_results in zip(query_types, cockroach_profiling):
            stats['Cockroach'].append(
                dict(avg=np.average(profile_results['latency']),
                     std=np.std(profile_results['latency'])))

            # Pool the timings of every run of the same optimizer.
            optimizer_times = defaultdict(list)
            for results_list in results:
                for opt_result in results_list:
                    optimizer_times[opt_result['optimizer_name']].extend(
                        r['elapsed_time'] for r in opt_result['stats']
                        if r['query_type'] == query_type)

            for opt_name, times in optimizer_times.items():
                stats[opt_name].append(
                    dict(avg=np.average(times), std=np.std(times)))

        # Centre the group of bars around each integer tick.
        offset = -float(len(stats)) / 2.0 + 0.5
        ordered_series = sorted(stats.items(), key=lambda t: t[0])
        for series_idx, (label, series) in enumerate(ordered_series):
            xs = [pos + offset * BAR_WIDTH + 1 for pos in range(len(series))]
            averages = [entry['avg'] for entry in series]
            errors = [entry['std'] for entry in series]
            ax.bar(x=xs, height=averages, yerr=errors, width=BAR_WIDTH,
                   capsize=2,
                   # Wrap around so more series than cycle colours cannot
                   # raise IndexError.
                   color=colors[series_idx % len(colors)],
                   label=label)
            for x, y in zip(xs, averages):
                # NOTE(review): with textcoords='offset points' the label is
                # shifted by (x, y) *points*, i.e. proportionally to the bar
                # height — confirm this placement is intended.
                ax.annotate(f'{y:.2f}', xy=(x, y), xytext=(x, y),
                            xycoords='data', textcoords='offset points')
            offset += 1

        ax.set_xticks(list(range(1, len(cockroach_profiling) + 1)))
        ax.set_xlabel('Query Type')
        ax.set_ylabel('Execution Time (ms)')
        ax.set_title('Average Query Execution Times')
        ax.legend(fontsize='x-small')

        if output_file is not None:
            plt.savefig(output_file)
        else:
            plt.show()
def _df_stats(self):
    """
    Compute the stats on aggregated values.

    The stats requested in ``self._stats`` are split in two groups:

    * ``mean``, ``sem`` and ``std`` entries whose function is ``None`` are
      delegated to ``self._df_mean``;
    * everything else is computed with a ``groupby`` over the tag columns
      on ``self._val_col`` and then reshaped with ``self._melt``.

    Both results are concatenated, passed through
    ``self._df_remove_tweak_cols`` and the unit column is filled in.

    :returns: the resulting :class:`pandas.DataFrame`.
    """
    df = self._orig_df
    # Work on a copy: the special stats are popped from it below.
    stats = self._stats.copy()
    tag_cols = self._restrict_cols(self._stat_tag_cols, df)

    # Specific handling for the mean, as it has to be handled per group
    special_stats = {
        stat
        for stat in ('mean', 'sem', 'std')
        if stat in stats and stats[stat] is None
    }
    if special_stats:
        df_mean = self._df_mean(df, special_stats)
        for stat in special_stats:
            stats.pop(stat)
    else:
        df_mean = df_make_empty_clone(df)
        df_mean.drop(columns=self._agg_cols, inplace=True)

    # Create a DataFrame with stats for the groups. A falsy function means
    # "use the stat name itself" as the aggregation.
    funcs = {name: func or name for name, func in stats.items()}
    if funcs:
        grouped = df.groupby(tag_cols, observed=True, sort=False)
        df = grouped[self._val_col].agg(**funcs).reset_index()
        # Transform the newly created stats columns into rows
        df = self._melt(df)
    else:
        df = pd.DataFrame()

    df = pd.concat([df, df_mean])
    df = self._df_remove_tweak_cols(df)

    unit_col = self._unit_col
    default_unit = ''
    if unit_col in df:
        # Assign the result back instead of calling fillna(inplace=True) on
        # the column: the chained in-place form operates on a temporary and
        # does not update ``df`` when pandas Copy-on-Write is active.
        df[unit_col] = df[unit_col].fillna(default_unit)
    else:
        df[unit_col] = default_unit

    # Stats listed in _STATS_UNIT always use their dedicated unit.
    for stat, unit in self._STATS_UNIT.items():
        df.loc[df[self._stat_col] == stat, unit_col] = unit.name

    return df
def generate_results_string(target, exp_name, results, latexify, drop=None):
    """Render the ``(mean, std)`` metrics of one experiment as a string.

    :param target: key selecting the metric table inside
        ``results[exp_name]["results"]``.
    :param exp_name: experiment whose results are rendered.
    :param results: nested mapping ``exp_name -> "results" -> target ->
        {metric: (mean, std)}``.
    :param latexify: when truthy emit LaTeX table cells, otherwise
        HTML-style ``<sub>`` markup.
    :param drop: optional container of metric names to leave out.
    :return: whatever ``small_font_str`` returns for the list of tokens.
    """
    metric_table = results[exp_name]["results"][target]
    print(f"Filling template values for {exp_name}")

    prepad = False  # never enabled in this function
    tokens = []
    # Unpacking happens before the drop check, so every entry must be a pair.
    for metric, (mean, std) in metric_table.items():
        if drop and metric in drop:
            continue
        print(f"{metric}: {mean} ({std})")

        if not latexify:
            tokens.append(f"{mean}<sub>({std})</sub>")
            continue

        cell_parts = ["&$"]
        if prepad:
            cell_parts.append(r"\prepad")
        cell_parts.append(f"{mean}_{{\\pm{std}}}$")
        tokens.append(" ".join(cell_parts))

    return small_font_str(tokens)
def main():
    """Print one table per statistic, with one row per requested treatment.

    Command-line arguments come from ``get_parser()``.  ``--treatments`` is a
    comma-separated list; an entry of the form ``name.selection`` overrides
    the module-level ``SELECTION`` for that treatment, otherwise
    ``args.selection`` is used.  Statistics are gathered with
    ``generate_stats`` and printed either as plain DataFrames or as LaTeX.
    """
    global USE_PERCENTAGE, USE_LABELS, SELECTION

    args = get_parser().parse_args()
    treatments = args.treatments.split(",")
    USE_PERCENTAGE = args.use_percentage
    USE_LABELS = args.use_labels
    SELECTION = args.selection

    # statistic name -> list with one value per treatment
    collected = {}
    for treatment in treatments:
        if "." in treatment:
            treatment, SELECTION = treatment.split(".")
        else:
            SELECTION = args.selection
        treatment_stats = generate_stats(
            treatment, ALL_CONS[treatment], ALL_DFS)
        for name, value in treatment_stats.items():
            collected.setdefault(name, []).append(value)

    print("\n\n")
    for name, rows in collected.items():
        first_is_container = bool(rows) and isinstance(
            rows[0], (tuple, list, set, dict))
        columns = list(rows[0]) if first_is_container else None
        table = pd.DataFrame(rows, index=treatments, columns=columns)
        if args.transpose:
            table = table.T
        print(name)
        # The table is always printed transposed relative to ``table``.
        rendered = table.T
        print(rendered.to_latex() if args.use_latex else rendered)
    print("DONE")
def hypothesis_1_3_analysis(stats, boundary=0.2):
    """Count, per run, how many ``"y"`` values reach ``boundary``.

    :param stats: mapping ``group -> run -> {"y": iterable of values}``.
        The groups ``"c"``, ``"1"``, ``"2"`` and ``"3"`` must be present
        because they are compared below.
    :param boundary: inclusive threshold a value must reach to be counted.
    :return: dict ``group -> [count for each run]``.

    Every inspected value is printed, and Welch t-tests between selected
    groups are printed at the end.
    """
    counts_by_group = {}
    for group, runs in stats.items():
        run_counts = []
        for run in runs.values():
            reached = 0
            for value in run["y"]:
                print(value)
                if value >= boundary:
                    reached += 1
            run_counts.append(reached)
        print("------------")
        counts_by_group[group] = run_counts

    # (message template, first sample, second sample)
    comparisons = (
        ("H(I_3) < H(I_c) - p = {}", "c", "3"),
        ("H(I_3) < H(I_1) - p = {}", "1", "3"),
        ("H(I_3) < H(I_2) - p = {}", "2", "3"),
        ("H(I_2) < H(I_1) - p = {}", "2", "1"),
    )
    for template, first, second in comparisons:
        outcome = scipy.stats.ttest_ind(
            counts_by_group[first], counts_by_group[second], equal_var=False)
        print(template.format(outcome))
    return counts_by_group
def hypothesis_3_1_plot(stats):
    """Scatter-plot the samples in ``stats`` with a linear regression line.

    :param stats: mapping whose values are iterables of runs; each run is an
        iterable of samples ``s`` where ``s[1]`` is the x value (converted
        with ``int``) and ``s[0]`` is the y value.

    Shows the figure and prints slope, intercept, r, p and standard error of
    the fit.  Returns ``None``.
    """
    x_max = 1024  # right edge of the x axis

    x = []
    y = []
    for runs in stats.values():
        for run in runs:
            for sample in run:
                x.append(int(sample[1]))
                y.append(sample[0])

    m1, c1, r1, p1, stderr1 = scipy.stats.linregress(x, y)

    plt.scatter(x, y)
    # Evaluate the fitted line at the same x values it is drawn between; the
    # right-hand end was previously evaluated at 1023 but drawn at 1024.
    plt.plot([0, x_max], [c1, c1 + m1 * x_max], color="r")
    plt.title("Experiment 3.1")
    plt.ylabel("Average Dove Score Per Transaction (D)")
    plt.xlabel("Altruistic Punishment value (A)")
    plt.xlim([0, x_max])
    plt.ylim([0, 2])
    plt.show()
    print("m = {}, c = {}, r = {}, p = {}, stderr = {}".format(
        m1, c1, r1, p1, stderr1))
def pairwise_regress_stats(data, features):
    """Return R², intercept, slope, Pval for every pair of features.

    For each pair ``(fty, ftx)`` with ``ftx`` listed after ``fty`` in
    ``features``, an OLS model ``data[fty] ~ const + data[ftx]`` is fitted.

    :param data: table indexable by feature name (``data[fty]`` and
        ``data[[ftx]]`` are both used).
    :param features: sequence of feature names.
    :return: dict with keys ``'R2'``, ``'const'``, ``'slope'`` and ``'Pval'``;
        each value is an ``(N, N)`` array, ``N = len(features)``, filled in
        the strict upper triangle ``[j, i]`` (row = dependent feature,
        column = regressor) and NaN elsewhere.
    """
    N = len(features)
    # Attributes and indices to get the statistics:
    stats = {
        'R2': ('rsquared', None),
        'const': ('params', 0),
        'slope': ('params', 1),
        'Pval': ('f_pvalue', None)
    }
    # np.nan rather than np.NaN: the capitalised alias was removed in
    # NumPy 2.0.
    mats = {stat: np.full((N, N), np.nan) for stat in stats}
    for j, fty in enumerate(features[:-1]):
        for i, ftx in enumerate(features[j + 1:], start=j + 1):
            fit = sm.OLS(data[fty], sm.add_constant(data[[ftx]])).fit()
            for stat, (attr, idx) in stats.items():
                value = getattr(fit, attr)
                if idx is not None:
                    # ``params`` may be a label-indexed Series; go through an
                    # array so ``idx`` is always positional.
                    value = np.asarray(value)[idx]
                mats[stat][j, i] = value  # Upper triangular.
    # TODO: add the 1 against 2 combinations
    return mats
def build_dataset(stats, template, destroot, workdir, fnameformat, year=0,
                  month=0):
    """Write the statistics arrays to rasters and package them as a zip.

    Each array in ``stats`` is written to a GeoTIFF under a scratch folder
    in ``workdir``, a ``metadata.json`` is added, the folder is zipped into
    ``destroot`` and the scratch folder is removed.

    Keyword arguments:
    stats -- dictionary mapping statistic name to the numpy array to store
    template -- a gdal dataset to be used as template to create new ones
    destroot -- the folder to store the final zip file in
    workdir -- some scratch location with enough free disk space
    fnameformat -- flag that determines formatting; one of 'global',
                   'growyearly', 'calyearly' or 'monthly'
    year -- optional argument used to supply year for formatting
    month -- optional argument used to supply month for formatting

    Returns: None.

    Raises:
    ValueError -- if ``fnameformat`` is not one of the supported flags.
    """
    # generate new file name
    if fnameformat == 'global':
        descriptor = "2000-2014"
    elif fnameformat == 'growyearly':
        descriptor = "{:04d}-{:04d}".format(year, year + 1)
    elif fnameformat == 'calyearly':
        descriptor = "{:04d}".format(year)
    elif fnameformat == 'monthly':
        descriptor = month
    else:
        # Fail early and explicitly instead of hitting an unbound
        # ``descriptor`` further down.
        raise ValueError("unknown fnameformat: {0!r}".format(fnameformat))

    # create zip root
    ziproot = os.path.join(workdir, "fpar.{0}.stats.aust".format(descriptor))
    check_or_create_target_dir(ziproot)

    # Write the results to raster format with appropriate filenames
    for stattype, statarr in stats.items():
        outfile = os.path.join(
            ziproot, 'data',
            "fpar.{0}.{1}.aust.tif".format(descriptor, stattype))
        write_array_to_raster(outfile, statarr, template)

    # Write the metadata.json file
    write_metadatadotjson(ziproot, fnameformat, year, month)

    # Zip up the dataset
    zip_dataset(ziproot, destroot)

    # Clean up the directories
    shutil.rmtree(ziproot)
def main():
    """Flag bad SSMs, add them to the garbage list and print a summary.

    Loads SSMs and params from the paths given on the command line, drops
    variants already listed under ``params['garbage']``, and asks
    ``_remove_bad`` which of the remaining ones are bad.  When any are
    found they are merged into the garbage list and the params are written
    to ``out_params_fn``.  Finally ``key=value`` summary lines are printed.
    """
    cli = argparse.ArgumentParser(
        description='LOL HI THERE',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    cli.add_argument('--phi-hat-threshold', type=float, default=1 - 1e-2,
                     help='Blah')
    cli.add_argument('--quantile', type=float, default=0.5, help='Blah')
    cli.add_argument('--print-bad-data', action='store_true')
    for positional in ('in_ssm_fn', 'in_params_fn', 'out_params_fn'):
        cli.add_argument(positional)
    args = cli.parse_args()

    np.set_printoptions(linewidth=400, precision=3, threshold=sys.maxsize,
                        suppress=True)
    # Turn floating-point problems into exceptions instead of warnings.
    np.seterr(divide='raise', invalid='raise', over='raise')

    params = None
    ssms = inputparser.load_ssms(args.in_ssm_fn)
    params = inputparser.load_params(args.in_params_fn)
    ssms = inputparser.remove_garbage(ssms, params['garbage'])

    bad_vids, bad_samp_prop = _remove_bad(
        ssms, args.phi_hat_threshold, args.quantile, args.print_bad_data)
    bad_ssm_prop = len(bad_vids) / len(ssms)

    # NOTE(review): the params file is only written when something was
    # flagged — confirm that is the intended behaviour.
    if len(bad_vids) > 0:
        params['garbage'] = common.sort_vids(params['garbage'] + bad_vids)
        with open(args.out_params_fn, 'w') as params_out:
            json.dump(params, params_out)

    summary = {
        'bad_ssms': common.sort_vids(bad_vids),
        'bad_samp_prop': '%.3f' % bad_samp_prop,
        'bad_ssm_prop': '%.3f' % bad_ssm_prop,
    }
    for name, value in summary.items():
        print('%s=%s' % (name, value))
def get_class(stats, class_prob, item):
    """Classify ``item`` with a Gaussian naive Bayes rule.

    :param stats: statistics in a dict keyed by class.  Each entry holds one
        ``(mean, std)`` pair per attribute.
    :param class_prob: dict mapping each class to its prior probability.
    :param item: item to classify.  The last element is the class slot and
        is ignored; all preceding elements are attribute values.
    :return: the class with the highest posterior score; on ties the class
        that comes first in ``stats`` wins.
    :raises ValueError: if ``stats`` is empty.

    Scores are accumulated as log-probabilities.  Multiplying many densities
    underflows to ``0.0`` for every class once there are enough attributes,
    which made the result depend only on dict order.
    """
    import math

    n_attributes = len(item) - 1
    values = [item[x] for x in range(n_attributes)]

    scores = []
    for key, attributes in stats.items():
        means = [attributes[x][0] for x in range(n_attributes)]
        stds = [attributes[x][1] for x in range(n_attributes)]
        log_likelihood = scipy.stats.norm.logpdf(
            values, loc=means, scale=stds).sum()
        prior = class_prob[key]
        # A zero prior rules the class out without calling log(0).
        log_prior = math.log(prior) if prior > 0 else -math.inf
        scores.append((key, log_likelihood + log_prior))

    if not scores:
        raise ValueError("stats must contain at least one class")
    return max(scores, key=lambda pair: pair[1])[0]
def hypothesis_3_2_analysis(stats):
    """Collect per-run score statistics per group and print significance tests.

    :param stats: mapping ``group -> run -> {statistic name: value}``.  The
        groups ``"c"``, ``"1"``, ``"2"`` and ``"3"`` must be present because
        they are referenced explicitly in the printed tests.
    :return: ``(new_stats, stat_compilation)`` where ``new_stats[group][stat]``
        is the list of that statistic over the group's runs and
        ``stat_compilation[group][stat]`` is the ``np.mean`` of that list.

    Prints mean/sd of ``avg_dove_score`` per group, a one-way ANOVA p-value
    and Welch t-test p-values between selected groups.
    """
    stat_arrays = [
        "avg_dove_score", "avg_hawk_score", "avg_score",
        "avg_dove_sd", "avg_hawk_sd", "avg_score_sd",
    ]

    new_stats = {
        group: {
            stat: [run[stat] for run in runs.values()]
            for stat in stat_arrays
        }
        for group, runs in stats.items()
    }
    stat_compilation = {
        group: {stat: np.mean(samples) for stat, samples in per_stat.items()}
        for group, per_stat in new_stats.items()
    }

    def dove(group):
        # avg_dove_score samples of one group
        return new_stats[group]["avg_dove_score"]

    summaries = (
        ("k = c, mean = {}, sd = {}", "c"),
        ("k = 1, mean = {}, sd = {}", "1"),
        ("k = 2, mean = {}, sd = {}", "2"),
        ("k = 3, mean = {}, sd = {}", "3"),
    )
    for template, group in summaries:
        print(template.format(np.mean(dove(group)), np.std(dove(group))))

    anova = scipy.stats.f_oneway(dove("3"), dove("2"), dove("1"), dove("c"))
    print("ANOVA: p = {}".format(anova[1]))

    # (message template, first sample, second sample)
    t_tests = (
        ("T-test: D(I_3) > D(I_c) - p = {}", "c", "3"),
        ("T-test: D(I_3) > D(I_1) - p = {}", "1", "3"),
        ("T-test: D(I_3) > D(I_2) - p = {}", "2", "3"),
        ("T-test: D(I_2) > D(I_1) - p = {}", "1", "2"),
    )
    for template, first, second in t_tests:
        outcome = scipy.stats.ttest_ind(
            dove(first), dove(second), equal_var=False)
        print(template.format(outcome[1]))

    return new_stats, stat_compilation
def hypothesis_3_3_analysis(stats):
    """Down-sample each run's strategy breakdown and count hawk-heavy samples.

    :param stats: mapping ``group -> run -> {"strategy_breakdown": sequence}``
        where every breakdown entry is indexable and element ``[1]`` is the
        value reported under ``"Hawks"``.
    :return: dict ``group -> {"strategy_breakdown", "H", "Hawks"}``; each is
        a list with one element per run, in the group's iteration order:

        * ``"strategy_breakdown"``: every 5th entry of the run's breakdown,
          starting with the first;
        * ``"Hawks"``: element ``[1]`` of each retained entry;
        * ``"H"``: number of retained entries whose ``[1]`` exceeds 20.

    Runs are addressed by position, so run keys may be of any hashable type
    (e.g. strings from JSON), not only ``0..n-1``.
    """
    step = 5
    hawk_threshold = 20

    new_stats = {}
    for group, runs in stats.items():
        sampled_runs = []
        for run in runs.values():
            breakdown = run["strategy_breakdown"]
            sampled_runs.append(
                [breakdown[n] for n in range(0, len(breakdown), step)])

        hawks_per_run = [[entry[1] for entry in sampled]
                         for sampled in sampled_runs]
        h_per_run = [sum(1 for hawks in run_hawks if hawks > hawk_threshold)
                     for run_hawks in hawks_per_run]

        new_stats[group] = {
            "strategy_breakdown": sampled_runs,
            "H": h_per_run,
            "Hawks": hawks_per_run,
        }
    return new_stats
dataset_cut = {"X": X[cens == 0], "w": w[cens == 0], "y": y_cut[cens == 0]} # metrics for the dataset, evaluated as dichotomous outcome for _ in tqdm(range(args.bootstrap_samples)): idxs = bootstrap_dataset(dataset_cut) arr_ben, arr_noben = bucket_arr(pred_rr[cens == 0][idxs], y_cut[cens == 0][idxs], w[cens == 0][idxs]) stats["arr_ben"].append(arr_ben) stats["arr_noben"].append(arr_noben) stats["c_stat"].append( c_statistic(pred_rr[cens == 0][idxs], y_cut[cens == 0][idxs], w[cens == 0][idxs])) for k, v in stats.items(): print(f"{k}: {[np.round(u, 2) for u in get_range(v)]}") # metrics for the dataset, evaluated on the entire sample for _ in tqdm(range(args.bootstrap_samples)): idxs = bootstrap_dataset(dataset_all) stats["rmst"].append( decision_value_rmst(pred_rr[idxs], y[idxs], w[idxs], t[idxs], args.cens_time)) slope, intercept, _, _, = calibration(pred_rr[idxs], y[idxs], w[idxs], t[idxs], args.cens_time, n_bins=5)
denom, 'upper90': predictions[int(len(predictions) * 0.95)] / denom, 'upper95': predictions[int(len(predictions) * 0.975)] / denom } else: stats[state] = { # 'sortorder': 0, 'positive': 0, 'negative': 0, } if R0 is None and CFR is None: stats_sorted = sorted(stats.items(), key=lambda x: -x[1].get('median', 0)) else: state_order = [ x[0] for x in allstats["None,None,{}".format( norm_by_population)] ] stats_sorted = sorted( stats.items(), key=lambda x: state_order.index(x[0])) allstats["{},{},{}".format(R0, CFR, norm_by_population)] = stats_sorted # Update webpage dateint = max(d['date'] for d in data) datestr = "{}-{}-{}".format(
# Task 59 is for dataset 61, the iris dataset, which is good for numerical tests,
# Task 60 is for dataset 62, a zoo dataset, which contains a lot of categorical information.
task = tasks.get_task(60)
data = task.get_dataset()
X, y, categorical = data.get_data(
    target=data.default_target_attribute,
    return_categorical_indicator=True)

# Some landmarkers are cross-validated, so take the folds of the first repeat.
# The generator is unrolled into a list because it is iterated over multiple times.
folds = list(next(task.iterate_repeats()))

simple = simple_metafeatures(X, y, categorical)
stats = statistical_metafeatures(X, y, categorical)
info = information_theoretic_metafeatures(X, y, categorical)
landmarkers = landmarker_metafeatures(X, y, categorical, folds)

# Print every metafeature, one family after the other.
for group in (simple, stats, info, landmarkers):
    for key, val in group.items():
        print("{}: {}".format(key, val))

print("Total of {} metafeatres".format(
    sum(len(group) for group in (simple, stats, info, landmarkers))))
stats[state] = { "positive": get_positive(allrecords[-1]), "deaths": deaths, "lower95": predictions[int(len(predictions) * 0.025)], "lower50": predictions[int(len(predictions) * 0.25)], "median": predictions[int(len(predictions) * 0.50)], "upper50": predictions[int(len(predictions) * 0.75)], "upper95": predictions[int(len(predictions) * 0.975)], } else: stats[state] = { "positive": 0, } stats_sorted = sorted( stats.items(), key=lambda x: (x[1].get("deaths", 0), x[1].get("positive", 0), x[0]), reverse=True, ) allstats["{},{}".format(R0, CFR)] = stats_sorted # Update webpage dateint = max(x["date"] for x in allrecords) datestr = "{}-{}-{}".format( str(dateint)[:4], str(dateint)[4:6], str(dateint)[6:8]) with open("index_template.md", "r") as f: template = f.read()
utils.log("Running tests - Importing...")
from openml import datasets, tasks

# Task 59 is for dataset 61, the iris dataset, which is good for numerical tests,
# Task 60 is for dataset 62, a zoo dataset, which contains a lot of categorical information.
task = tasks.get_task(60)
data = task.get_dataset()
X, y, categorical = data.get_data(
    target=data.default_target_attribute,
    return_categorical_indicator=True)

# Some landmarkers are cross-validated, so take the folds of the first repeat.
# The generator is unrolled into a list because it is iterated over multiple times.
folds = list(next(task.iterate_repeats()))

simple = simple_metafeatures(X, y, categorical)
stats = statistical_metafeatures(X, y, categorical)
info = information_theoretic_metafeatures(X, y, categorical)
landmarkers = landmarker_metafeatures(X, y, categorical, folds)

# Print every metafeature, one family after the other.
for group in (simple, stats, info, landmarkers):
    for key, val in group.items():
        print("{}: {}".format(key, val))

print("Total of {} metafeatres".format(
    sum(len(group) for group in (simple, stats, info, landmarkers))))
def main():
    """Detect variants whose var_read_prob looks wrong and act on them.

    Variants and params are loaded from the given files; ``_remove_bad``
    returns the ids of the bad variants.  Depending on ``--action`` those
    ids are either merged into ``params['garbage']`` or their ``omega_v`` is
    overwritten with ``--var-read-prob-alt``.  The SSMs and params are then
    written out and ``key=value`` summary lines are printed.
    """
    cli = argparse.ArgumentParser(
        description='Find variants with likely incorrect var_read_prob by comparing model with provided var_read_prob to haploid (LOH) model using Bayes factors',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    cli.add_argument(
        '--logbf-threshold', type=float, default=10.,
        help='Logarithm of Bayes factor threshold at which the haploid model is accepted as more likely model than the model using the provided var_read_prob')
    cli.add_argument('--verbose', action='store_true',
                     help='Print debugging messages')
    cli.add_argument(
        '--ignore-existing-garbage', action='store_true',
        help='Ignore any existing garbage variants listed in in_params_fn and test all variants. If not specified, any existing garbage variants will be kept as garbage and not tested again.')
    cli.add_argument('--action',
                     choices=('add_to_garbage', 'modify_var_read_prob'),
                     default='add_to_garbage')
    cli.add_argument('--var-read-prob-alt', type=float, default=1.)
    positionals = (
        ('in_ssm_fn', 'Input SSM file with mutations'),
        ('in_params_fn',
         'Input params file listing sample names and any existing garbage mutations'),
        ('out_ssm_fn',
         'Output SSM file with modified list of garbage mutations'),
        ('out_params_fn',
         'Output params file with modified list of garbage mutations'),
    )
    for arg_name, arg_help in positionals:
        cli.add_argument(arg_name, help=arg_help)
    args = cli.parse_args()

    np.set_printoptions(linewidth=400, precision=3, threshold=sys.maxsize,
                        suppress=True)
    # Turn floating-point problems into exceptions instead of warnings.
    np.seterr(divide='raise', invalid='raise', over='raise')

    if not args.ignore_existing_garbage:
        variants, params = inputparser.load_ssms_and_params(
            args.in_ssm_fn, args.in_params_fn)
    else:
        # Keep the garbage variants in the loaded set and start from an
        # empty garbage list so everything is tested again.
        variants, params = inputparser.load_ssms_and_params(
            args.in_ssm_fn, args.in_params_fn, remove_garb=False)
        params['garbage'] = []

    bad_vids, bad_samp_prop = _remove_bad(
        variants, args.logbf_threshold, args.var_read_prob_alt, args.verbose)
    bad_ssm_prop = len(bad_vids) / len(variants)

    action = args.action
    if action == 'add_to_garbage':
        merged = set(bad_vids) | set(params['garbage'])
        params['garbage'] = common.sort_vids(merged)
    elif action == 'modify_var_read_prob':
        for vid in bad_vids:
            variants[vid]['omega_v'][:] = args.var_read_prob_alt
    else:
        raise Exception('Unknown action: %s' % args.action)

    inputparser.write_ssms(variants, args.out_ssm_fn)
    with open(args.out_params_fn, 'w') as params_out:
        json.dump(params, params_out)

    summary = {
        'num_bad_ssms': len(bad_vids),
        'bad_ssms': common.sort_vids(bad_vids),
        'bad_samp_prop': '%.3f' % bad_samp_prop,
        'bad_ssm_prop': '%.3f' % bad_ssm_prop,
    }
    for name, value in summary.items():
        print('%s=%s' % (name, value))
def collectStats(collectedStats, stats):
    """Append the values in ``stats`` to the arrays in ``collectedStats``.

    :param collectedStats: dict of key -> numpy array, updated in place.  A
        key seen for the first time starts from an empty array.
    :param stats: dict of key -> value (scalar or sequence) to append with
        ``np.append``.
    :return: ``None``; the result is stored in ``collectedStats``.
    """
    for name in stats:
        if name in collectedStats:
            history = collectedStats[name]
        else:
            history = np.array([])
        collectedStats[name] = np.append(history, stats[name])
def create_statistics_dictionary(df, aggregate_df):
    """
    Function is used to create lists that can be used in creating
    a box-and-whisker plot using the bxp function.

    Parameters
    ----------
    df (dataframe): dataframe that shows each site's statistical
        values (of a particular measurement set, for the most
        popular unit_concept). Read columns: src_hpo_id, mean,
        tenth_perc, first_quartile, median, third_quartile,
        ninetieth_perc, min, max.

    aggregate_df (dataframe): dataframe that contains information
        (of a particular measurement set, for the most popular
        unit_concept) across all of the applicable sites. Only its
        first row is read (columns total_mean, total_tenth_perc,
        total_first_quartile, total_median, total_third_quartile,
        total_ninetieth_perc).

    Returns
    -------
    lst (list): contains a series of dictionaries. each dictionary
        is used to represent one site (or the information across
        all of the sites). the key:value pair of this dictionary
        has a particular statistic:value
        (e.g. ninetieth percentile:150). the last dictionary is the
        aggregate one; its 'fliers' are the smallest min and largest
        max over all sites (an empty array when df has no rows).

    names (list): list of the HPO names that make up the rows
        of the dataframe, followed by 'aggregate_info'
    """
    stats = {}
    tot_min, tot_max = float('inf'), float('-inf')

    for _, row in df.iterrows():
        minimum, maximum = row['min'], row['max']
        stats[row['src_hpo_id']] = {
            'mean': row['mean'],
            'whislo': row['tenth_perc'],
            'q1': row['first_quartile'],
            'med': row['median'],
            'q3': row['third_quartile'],
            'whishi': row['ninetieth_perc'],
            'fliers': np.array([minimum, maximum]),
        }
        # Track the extremes across every site for the aggregate entry.
        if minimum < tot_min:
            tot_min = minimum
        if maximum > tot_max:
            tot_max = maximum

    # The aggregate fliers must span all sites; using the last row's min/max
    # (as before) under-reported the range and failed for an empty df.
    if len(df):
        aggregate_fliers = np.array([tot_min, tot_max])
    else:
        aggregate_fliers = np.array([])

    stats['aggregate_info'] = {
        'mean': aggregate_df['total_mean'].iloc[0],
        'whislo': aggregate_df['total_tenth_perc'].iloc[0],
        'q1': aggregate_df['total_first_quartile'].iloc[0],
        'med': aggregate_df['total_median'].iloc[0],
        'q3': aggregate_df['total_third_quartile'].iloc[0],
        'whishi': aggregate_df['total_ninetieth_perc'].iloc[0],
        'fliers': aggregate_fliers,
    }

    names = list(stats)
    lst = list(stats.values())
    return lst, names
def run(config, _id, logger):
    ''' Top-level function for running experiment.

    Every row of ``labeled_anomalies.csv`` is processed in turn: data is
    loaded, predictions are generated (or loaded from disk), errors are
    computed and evaluated, and the enriched row is appended to
    ``results/<_id>.csv``.  Running totals of true/false positives and false
    negatives are logged per row and once more at the end.

    Args:
        config (dict): Parameters for modeling, execution levels, and error calculations loaded from config.yaml
        _id (str): Unique id for each processing run generated from current time
        logger (obj): Logger obj from logging module

    Returns:
        None
    '''

    stats = {"true_positives": 0, "false_positives": 0, "false_negatives": 0}

    # The legacy "rU" mode raises ValueError on Python >= 3.11; the csv
    # module asks for files opened with newline="" instead.
    with open("labeled_anomalies.csv", "r", newline="") as f:
        reader = csv.DictReader(f)

        with open("results/%s.csv" % _id, "a", newline="") as out:
            writer = csv.DictWriter(out, config.header)  # line by line results written to csv
            writer.writeheader()

            for anom in reader:
                anom['run_id'] = _id
                logger.info("Stream # %s: %s" % (reader.line_num - 1, anom['chan_id']))

                X_train, y_train, X_test, y_test = helpers.load_data(anom)

                # Generate or load predictions
                # ===============================
                if config.predict:
                    model = models.get_model(anom, X_train, y_train, logger,
                                             train=config.train)
                    y_hat = models.predict_in_batches(y_test, X_test, model, anom)
                else:
                    y_hat_path = os.path.join("data", config.use_id, "y_hat",
                                              anom["chan_id"] + ".npy")
                    y_hat = [float(x) for x in list(np.load(y_hat_path))]

                # Error calculations
                # ====================================================================================================
                e = err.get_errors(y_test, y_hat, anom, smoothed=False)
                e_s = err.get_errors(y_test, y_hat, anom, smoothed=True)

                anom["normalized_error"] = np.mean(e) / np.ptp(y_test)
                logger.info("normalized prediction error: %s" % anom["normalized_error"])

                # Error processing (batch)
                # =========================
                E_seq, E_seq_scores = err.process_errors(y_test, y_hat, e_s, anom, logger)
                anom['scores'] = E_seq_scores

                anom = err.evaluate_sequences(E_seq, anom)
                anom["num_values"] = y_test.shape[0] + config.l_s + config.n_predictions

                # Accumulate the per-row counts into the running totals.
                for key in stats:
                    stats[key] += anom[key]

                helpers.anom_stats(stats, anom, logger)
                writer.writerow(anom)

    helpers.final_stats(stats, logger)
def run(config, _id, logger):
    ''' Top-level function for running experiment.

    Every row of ``labeled_anomalies.csv`` is processed in turn: data is
    loaded, predictions are generated (or loaded from disk), errors are
    computed and evaluated, and the enriched row is appended to
    ``results/<_id>.csv``.  Running totals of true/false positives and false
    negatives are logged per row and once more at the end.

    Args:
        config: object providing ``header``, ``predict``, ``train`` and
            ``use_id``
        _id (str): run identifier, used for the results file name and stored
            in every row as ``run_id``
        logger (obj): logger used for progress messages

    Returns:
        None
    '''

    stats = {"true_positives": 0, "false_positives": 0, "false_negatives": 0}

    # The legacy "rU" mode raises ValueError on Python >= 3.11; the csv
    # module asks for files opened with newline="" instead.
    with open("labeled_anomalies.csv", "r", newline="") as f:
        reader = csv.DictReader(f)

        with open("results/%s.csv" % _id, "a", newline="") as out:
            writer = csv.DictWriter(out, config.header)
            writer.writeheader()

            for anom in reader:
                anom['run_id'] = _id
                logger.info("Поток # %s: %s" % (reader.line_num - 1, anom['chan_id']))

                X_train, y_train, X_test, y_test = helpers.load_data(anom)

                # Generate or load predictions
                # ===============================
                if config.predict:
                    model = models.get_model(anom, X_train, y_train, logger,
                                             train=config.train)
                    y_hat = models.predict_in_batches(y_test, X_test, model, anom)
                else:
                    y_hat_path = os.path.join("data", config.use_id, "y_hat",
                                              anom["chan_id"] + ".npy")
                    y_hat = [float(x) for x in list(np.load(y_hat_path))]

                # Error calculations
                # ====================================================================================================
                e = err.get_errors(y_test, y_hat, anom, smoothed=False)
                e_s = err.get_errors(y_test, y_hat, anom, smoothed=True)

                anom["normalized_error"] = np.mean(e) / np.ptp(y_test)
                logger.info("нормализованная ошибка предсказания: %s" % anom["normalized_error"])

                # Error processing (batch)
                # =========================
                E_seq, E_seq_scores = err.process_errors(y_test, y_hat, e_s, anom, logger)
                anom['scores'] = E_seq_scores

                anom = err.evaluate_sequences(E_seq, anom)
                anom["num_values"] = y_test.shape[0]

                # Accumulate the per-row counts into the running totals.
                for key in stats:
                    stats[key] += anom[key]

                helpers.anom_stats(stats, anom, logger)
                writer.writerow(anom)

    helpers.final_stats(stats, logger)