def do_json(args):
    versions = {}
    for path in args.logdirs:
        if os.path.isdir(path):
            for root, dirs, files in os.walk(path):
                version = os.path.basename(root)
                if version not in versions:
                    versions[version] = {}
                for filename in files:
                    if filename.endswith(".txt"):
                        m = re.match(r'^([^#]+)(#.*)?\.txt$', filename)
                        domain = m.group(1)
                        if domain not in versions[version]:
                            versions[version][domain] = {}
                        read_stats(os.path.join(root, filename),
                                   versions[version][domain], args)
    for version, domains in versions.items():
        if args.aggregate:
            create_total_page_stats(domains, args)
        for domain, entries in domains.items():
            stats = []
            for name, value in entries.items():
                # We don't want the calculated sum in the JSON file.
                if name == "Sum":
                    continue
                entry = [name]
                for x in ['time_list', 'count_list']:
                    s = statistics(entries[name][x])
                    entry.append(round(s['average'], 1))
                    entry.append(round(s['ci']['abs'], 1))
                    entry.append(round(s['ci']['perc'], 2))
                stats.append(entry)
            domains[domain] = stats
    print(json.dumps(versions, separators=(',', ':')))
def bootstrap(m, query, fn):
    L, R = get_data(query)
    stats = []
    for _ in range(50):
        Lb, Rb = make_boot_sample(L, R)
        stats.append(fn(m, Lb, Rb))
    return avg(stats), stddev(stats)
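make_boot_sample, avg and stddev above are project helpers that are not shown here. A minimal sketch of what they might look like, stated purely as an assumption (paired resampling with replacement and simple summary statistics):

import numpy as np

def make_boot_sample(L, R):
    # Hypothetical helper: resample the paired lists L and R with replacement.
    idx = np.random.randint(len(L), size=len(L))
    return [L[i] for i in idx], [R[i] for i in idx]

def avg(xs):
    # Hypothetical helper: arithmetic mean.
    return float(np.mean(xs))

def stddev(xs):
    # Hypothetical helper: sample standard deviation.
    return float(np.std(xs, ddof=1))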
def get_peruser_stats(df, metrics, mean_suffix='mean', std_suffix='std',
                      test_group_column='test_group', control='CONTROL',
                      test='TEST', alpha=0.05):
    stats = list()
    for metric in metrics:
        test_mean, control_mean, diff, confint, zstat, pvalue = get_peruser_diff_zstat(
            df,
            'units',
            '{0} {1}'.format(metric, mean_suffix),
            '{0} {1}'.format(metric, std_suffix),
            test_group_column=test_group_column,
            test=test,
            control=control,
            alpha=alpha,
        )
        stats.append(OrderedDict(
            metric=metric,
            test_mean=test_mean,
            control_mean=control_mean,
            diff=diff,
            lcl=confint[0],
            ucl=confint[1],
            zstat=zstat,
            pvalue=pvalue,
        ))
    return pd.DataFrame(stats)
def parse_model_file(filepath):
    with open(filepath, 'r') as f:
        lines = [l.strip() for l in f.readlines() if l.strip() != '']

    started_stats = False
    active_joints = None
    vid_length = None
    num_rests = None
    warp_indices = None
    connection_lengths = None
    stats = []

    for line in lines:
        if line == '--stats--':
            started_stats = True
            continue
        if started_stats:
            stat_dict = eval(line)
            stats.append(stat_dict)
            continue
        if line.startswith('ActiveJoints='):
            active_joints = eval(line.split('=', 1)[1])
            continue
        if line.startswith('VidLength='):
            vid_length = eval(line.split('=', 1)[1])
            continue
        if line.startswith('NumRests='):
            num_rests = eval(line.split('=', 1)[1])
            continue
        if line.startswith('WarpIndices='):
            warp_indices = eval(line.split('=', 1)[1])
            continue
        if line.startswith('ConnectionLengths='):
            connection_lengths = eval(line.split('=', 1)[1])
            continue

    return vid_length, active_joints, num_rests, warp_indices, connection_lengths, stats
def evaluate_by_lc(ar_p, ar_t, ar_lc, mask, nodata_lc, out_dir):
    '''Return a dataframe of the agreement between ar_p and ar_t by class in ar_lc.'''
    print('Getting unique values...')
    classes = np.unique(ar_lc[(ar_lc != nodata_lc) & (ar_lc != 0)])  # Take 0's out too
    #import pdb; pdb.set_trace()

    stats = []
    for lc in classes:
        print('Calculating statistics for class', lc)
        this_mask = (ar_lc == lc) & mask
        this_t = ar_t[this_mask]
        this_p = ar_p[this_mask]
        ac, ac_s, ac_u, ssd, spod = calc_agree_coef(this_t, this_p,
                                                    this_t.mean(), this_p.mean())
        rmspe = calc_rmspe(this_t, this_p)
        class_stats = {'lc_class': lc, 'aggree_coef': ac, 'AC_sys': ac_s,
                       'AC_unsys': ac_u, 'rmspe': rmspe}
        stats.append(class_stats)

    df = pd.DataFrame(stats).reindex(columns=['lc_class', 'aggree_coef', 'AC_sys',
                                              'AC_unsys', 'rmspe'])
    out_txt = os.path.join(out_dir, 'lc_stats.txt')
    df.to_csv(out_txt, sep='\t', index=False)

    out_png = os.path.join(out_dir, 'agreement_per_lc_no0.png')
    class_labels = ['Water', 'Ice/Snow', 'Developed', 'Bare Ground',
                    'Deciduous Forest', 'Coniferous Forest', 'Shrubland']
    plot_agreement(df, out_png, class_labels=class_labels)

    return df
def main():
    _dir = os.path.dirname(__file__)
    _path = glob.glob(_dir + '//..//data_and_results//HYP_CELL_NWB//Naive//*.nwb')
    full_qc = [0, 0, 0, 0]
    for fp in _path:
        realX, realY, realC = loadNWB(fp)
        temp_qc = run_qc(realY, realC)
        full_qc = np.vstack((full_qc, temp_qc))
    df = pd.DataFrame(data=full_qc[1:, :],
                      columns=['Mean RMS', 'Max RMS', 'Mean Drift', 'Max Drift'],
                      index=_path)
    df.to_csv('qc.csv')
    stats = []
    for col in df.columns.values:
        stats.append(df[col].quantile(0.1))
    qc_stats = pd.DataFrame(data=stats,
                            index=['10 percentile Mean RMS',
                                   '10 percentile Max RMS',
                                   '10 percentile Mean Drift',
                                   '10 percentile Max Drift'])
    qc_stats.to_csv('qc_stats.csv')
def bootstrap_CI(sample, conf=0.95):
    '''TODO: finish this - make it work, but also decide if this is the
    right algorithm to use. Default confidence level is 0.95.'''
    values = sample.values
    # configure bootstrap
    n_iterations = 1000
    n_size = int(len(sample) * 0.50)  # ?
    print(n_size)
    # run bootstrap
    stats = list()
    for i in range(n_iterations):
        # prepare train and test sets
        train = resample(values, n_samples=n_size)
        test = np.array([x for x in values if x.tolist() not in train.tolist()])
        # fit model
        model = DecisionTreeClassifier()
        model.fit(train[:, :-1], train[:, -1])
        # evaluate model
        predictions = model.predict(test[:, :-1])
        score = accuracy_score(test[:, -1], predictions)
        print(score)
        stats.append(score)
    # confidence intervals (use the conf argument rather than a hard-coded level)
    alpha = conf
    p = ((1.0 - alpha) / 2.0) * 100
    lower = max(0.0, np.percentile(stats, p))
    p = (alpha + ((1.0 - alpha) / 2.0)) * 100
    upper = min(1.0, np.percentile(stats, p))
    print('%.1f confidence interval %.1f%% and %.1f%%' %
          (alpha * 100, lower * 100, upper * 100))
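bootstrap_CI above reads sample.values and treats the last column as the class label. A minimal usage sketch, assuming scikit-learn is available and that resample, DecisionTreeClassifier, accuracy_score and numpy are imported in the function's module; the iris data is illustrative only:

import numpy as np
import pandas as pd
from sklearn.datasets import load_iris

iris = load_iris()
# Features plus the label as the last column, as bootstrap_CI expects.
toy = pd.DataFrame(np.column_stack([iris.data, iris.target]))
bootstrap_CI(toy, conf=0.95)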
def compute_stats(output, labels):
    stats = []
    for c in range(labels.shape[1]):
        # Average precision
        avg_precision = metrics.average_precision_score(
            labels[:, c], output[:, c], average=None)

        # AUC
        auc = metrics.roc_auc_score(labels[:, c], output[:, c], average=None)

        # Precisions, recalls
        (precisions, recalls, thresholds) = metrics.precision_recall_curve(
            labels[:, c], output[:, c])

        # FPR, TPR
        (fpr, tpr, thresholds) = metrics.roc_curve(labels[:, c], output[:, c])

        save_every_steps = 1000  # Sample statistics to reduce size
        stat_dict = {'precisions': precisions[0::save_every_steps],
                     'recalls': recalls[0::save_every_steps],
                     'AP': avg_precision,
                     'fpr': fpr[0::save_every_steps],
                     'fnr': 1. - tpr[0::save_every_steps],
                     'auc': auc}
        stats.append(stat_dict)

    return stats
def compute(self, inp=None):
    stats = []
    layer_stats = [None, None]

    if self.interface.do_center is False:
        oldpos = np.copy(self.interface.universe.atoms.positions)
        self.interface.center()

    surface = self.interface._surfaces[0]
    surface.triangulation()

    if self.return_statistics is True:
        for side in [0, 1]:
            layer_stats[side] = utilities.triangulated_surface_stats(
                surface.trimmed_surf_triangs[side],
                surface.triangulation_points[side])
        # this average depends on what is in the stats, it can't be done
        # automatically
        stats.append(layer_stats[0][0] + layer_stats[1][0])
        # add here new stats other than total area

    if self.interface.do_center is False:
        # restore the original atom positions saved above
        self.interface.universe.atoms.positions = np.copy(oldpos)

    if self.return_triangulation is False:
        return stats
    else:
        return [
            stats, surface.surf_triang, surface.triangulation_points,
            surface.trimmed_surf_triangs
        ]
def load_patches_db(path):
    masks = []
    stats = []
    nuclei_list = []
    files = os.listdir(path)

    # Count the total number of patches across all archives so the image
    # array can be preallocated (the original hard-coded exactly six files).
    n_patches = 0
    for f in files:
        n_patches += len(np.load(os.path.join(path, f),
                                 encoding="latin1", allow_pickle=True))

    images = np.zeros((n_patches, 32, 32, 3))
    c = 0
    for f in files:
        patches = np.load(os.path.join(path, f), encoding="latin1", allow_pickle=True)
        for patch in patches:
            image, mask, nuclei, stat = patch
            images[c] = image[:, :, :3]
            masks.append(mask)
            stats.append(stat)
            nuclei_list.append(nuclei)
            c += 1

    return images, masks, nuclei_list, stats
def get_stats():
    # make a list to hold tuples of features from each file. We'll use this to build the dataframe
    stats = []
    # loop over each path and extract our features
    for path in PATHS:
        with open(path, 'r') as f:
            # the child class is indicated by which folder inside data it's in
            child_class = path.split('/')[2]
            # read the whole file
            data = f.read()
            # get the individual lines (could have done readlines, but we need the whole text too)
            lines = data.split('\n')
            # get the participants for this file
            participants = get_target_participants(data)
            # loop over the participants and collect features for each of them
            for participant in participants:
                # initialize collector variables
                tokens = []
                tags = []
                num_tokens = 0
                num_utterances = 0
                stage = get_stage(path)
                # loop over the lines
                for i, line in enumerate(lines):
                    # when we find a line with our participant, get the tokens and tags
                    if f'*{participant[0]}:' in line:
                        # get the lines containing tag data and extract tags
                        tag_lines = get_tag_lines(lines, i)
                        tags += get_tags(tag_lines)
                        # get any corresponding errors and corrections
                        errors = get_errors(lines, i)
                        # get the lines containing tokens and clean / tokenize
                        token_lines = get_token_lines(lines, i)
                        tokens += get_tokens(token_lines, errors)
                        # increment for each utterance we see
                        num_utterances += 1
                # append the tuple with our data for this participant
                stats.append((path.split('/')[-1], child_class, participant, stage,
                              tokens, Counter(tokens), Counter(tags), len(tokens),
                              len(Counter(tokens)), len(Counter(tags)),
                              len(tokens) / num_utterances))
    # make the dataframe from stats and return it
    df = pd.DataFrame(stats, columns=['file_name', 'child_class', 'child_name', 'stage',
                                      'tokens', 'types', 'tags', 'num_tokens',
                                      'num_types', 'num_tags', 'mlu'])
    return df
def get_coverage_stats(self):
    stats = []
    stats.append(["Average Cvg", self.get_average_coverage()])
    stats.append(["Cvg Std Dev", self.get_coverage_stddev()])
    stats.append(["Median Cvg", self.get_median_coverage()])
    stats.append(["Cvg Mode", self.get_coverage_mode()])
    stats.append(["Cvg Coeff Var", self.get_coefficient_of_variation()])
    return stats
def explore_statistical_measures_across_repetitions(iterations=5, ndatapoints=1000):
    stats = []
    for i in range(iterations):
        data1, data2 = get_two_datasets(ndatapoints)
        s = get_statistical_measures(data1, data2)
        stats.append(s)
    return stats
def stat_df(df):
    stats = []
    for col in df.columns:
        stats.append((col,
                      df[col].nunique(),
                      df[col].isnull().sum() * 100 / df.shape[0],
                      df[col].value_counts(normalize=True, dropna=False).values[0] * 100,
                      df[col].dtype))
    stats_df = pd.DataFrame(stats, columns=['Feature', 'Unique_values',
                                            'Percentage of missing values',
                                            'Percentage of values in the biggest category',
                                            'type'])
    stats_df.sort_values('Percentage of missing values', ascending=False, inplace=True)
    return stats_df
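A quick usage sketch for stat_df above, assuming pandas and numpy are imported as pd and np in the same module; the toy frame is illustrative only:

import numpy as np
import pandas as pd

toy = pd.DataFrame({'a': [1, 1, 2, np.nan], 'b': ['x', 'y', 'y', 'y']})
# One row per column: unique values, % missing, % in the biggest category, dtype.
print(stat_df(toy))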
def convert_stat(self, x):
    x = np.atleast_2d(x)
    n = len(x)
    stats = []
    for i in range(n):
        stat = self.problem.sufficient_stat(x[i])
        stats.append(stat)
    s = np.vstack(stats)
    return s
def test():
    """ test """
    indata = [0.1, 0.2, 0.3, 0.4, 0.4, 0.5, 0.1, 0.4]
    stats = RollingStats()
    for i in indata:
        stats.append(i)
    print(stats.get())
    print(calculate_stats(indata))
def calculate_stats(output, target, threshold):
    """Calculate statistics including mAP, AUC, etc.

    Args:
      output: 3d array, (samples_num, time, classes_num)
      target: 3d array, (samples_num, time, classes_num)

    Returns:
      stats: list of statistic of each class.
    """
    # Per the docstring, axis 1 is time and the last axis holds the classes.
    timestep_num = target.shape[1]
    classes_num = target.shape[2]
    print(classes_num)
    print(timestep_num)

    stats = []

    # Class-wise statistics
    for j, k in [(j, k) for j in range(timestep_num) for k in range(classes_num)]:
        # Piecewise comparison
        output_rounded = output > threshold

        # Average precision
        avg_precision = metrics.average_precision_score(target[:, j, k],
                                                        output_rounded[:, j, k],
                                                        average=None)

        # AUC
        #auc = metrics.roc_auc_score(target[:, j, k], output_rounded[:, j, k], average=None)

        # Precisions, recalls
        (precisions, recalls, thresholds) = metrics.precision_recall_curve(
            target[:, j, k], output_rounded[:, j, k])

        # FPR, TPR
        (fpr, tpr, thresholds) = metrics.roc_curve(target[:, j, k],
                                                   output_rounded[:, j, k])

        save_every_steps = 1000  # Sample statistics to reduce size
        stat_dict = {'precisions': precisions[0::save_every_steps],
                     'recalls': recalls[0::save_every_steps],
                     'AP': avg_precision,
                     'fpr': fpr[0::save_every_steps],
                     'fnr': 1. - tpr[0::save_every_steps]}
                     #'auc': auc}
        stats.append(stat_dict)

    return stats
def get_stats(self):
    stats = []
    for current_id in self.ids:
        current_rating = self.ratings[current_id]
        mu = current_rating.mu
        if self.float_precision is not None and isinstance(self.float_precision, int):
            mu = truncate_float(mu, self.float_precision)
        c = (self.names[current_id], mu, current_rating.sigma)
        stats.append(c)
    return stats
def get_stats(y):
    stats = []
    for i in range(len(y)):
        if y[i] > -10000:
            stats.append(y[i])
    mean = np.mean(stats)
    medi = np.median(stats)
    std = np.std(stats)
    return mean, medi, std
def getSRs():
    from smodels.experiment.databaseObj import Database
    db = Database("official")
    ers = db.getExpResults(dataTypes=["efficiencyMap"])
    stats = []
    for er in ers:
        for ds in er.datasets:
            D = {"obsN": ds.dataInfo.observedN,
                 "expectedBG": ds.dataInfo.expectedBG,
                 "bgError": ds.dataInfo.bgError,
                 "upperLimit": ds.dataInfo.upperLimit,
                 "expectedUpperLimit": ds.dataInfo.expectedUpperLimit}
            stats.append(D)
    return stats
def bootstrap_func(df, group_col, y):
    n_iterations = 1000
    n_size = int(len(df) * 0.50)
    stats = list()
    for i in range(n_iterations):
        sample = resample(df, n_samples=n_size)
        mean_dict = {grp[0]: grp[1][y].mean() for grp in sample.groupby(group_col)}
        stats.append(mean_dict['treatment'] - mean_dict['control'])
    est_diff = sum(stats) / len(stats)
    # np.percentile expects percentages in [0, 100], so use 2.5 / 97.5.
    lower = np.percentile(stats, 2.5)
    upper = np.percentile(stats, 97.5)
    return {'est': est_diff, 'lower': lower, 'upper': upper}
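A usage sketch for bootstrap_func above, assuming the resample it calls is sklearn.utils.resample and that the group column contains the literal labels 'treatment' and 'control' the function expects; the synthetic data is illustrative:

import numpy as np
import pandas as pd
from sklearn.utils import resample  # the resample assumed inside bootstrap_func

rng = np.random.default_rng(0)
toy = pd.DataFrame({
    'group': ['treatment'] * 200 + ['control'] * 200,
    'y': np.concatenate([rng.normal(1.0, 1.0, 200), rng.normal(0.0, 1.0, 200)]),
})
# Returns the bootstrap estimate of the treatment-control mean difference
# together with a 95% percentile interval.
print(bootstrap_func(toy, 'group', 'y'))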
def validation_end(self, outputs):
    truth = torch.cat([o['batch_truth'] for o in outputs], dim=0).reshape(-1)
    logits = torch.cat([o['batch_logits'] for o in outputs], dim=0).reshape(
        truth.shape[0], outputs[0]['batch_logits'].shape[1])
    loss_sum = torch.cat([o['batch_loss'].reshape(-1) for o in outputs], dim=0).reshape(-1)
    loss_sum = torch.sum(loss_sum, dim=0).reshape(-1)
    assert truth.shape[0] == sum([o['batch_logits'].shape[0] for o in outputs]), "Mismatch size"
    loss = self.loss(truth, logits)
    assert math.isclose(loss.item(), loss_sum.item(), abs_tol=0.01), \
        f"Loss not equal: {loss.item()} VS. {loss_sum.item()}"
    loss /= truth.shape[0]
    loss_sum /= truth.shape[0]
    proba = F.softmax(logits, dim=-1)
    pred = torch.argmax(proba, dim=-1).reshape(-1)

    with open(os.path.join(self.hparams.output_dir, "dev-labels.lst"), "w") as output_file:
        output_file.write("\n".join(map(str, (
            truth + self.task_config[self.hparams.task_name]['label_offset']
        ).cpu().numpy().tolist())))
    with open(os.path.join(self.hparams.output_dir, "dev-predictions.lst"), "w") as output_file:
        output_file.write("\n".join(map(str, (
            pred + self.task_config[self.hparams.task_name]['label_offset']
        ).cpu().numpy().tolist())))
    with open(os.path.join(self.hparams.output_dir, "dev-probabilities.lst"), "w") as output_file:
        output_file.write("\n".join(map(lambda l: '\t'.join(map(str, l)),
                                        proba.cpu().detach().numpy().tolist())))

    # Bootstrap a confidence interval for dev accuracy.
    stats = []
    predl = pred.cpu().detach().numpy().tolist()
    truthl = truth.cpu().detach().numpy().tolist()
    for _ in range(100):
        indices = np.random.randint(len(predl), size=len(predl))
        sampled_pred = [predl[i] for i in indices]
        sampled_truth = [truthl[i] for i in indices]
        stats.append(accuracy_score(sampled_truth, sampled_pred))
    _, lower, upper = mean_confidence_interval(stats, self.hparams.ci_alpha)

    return {
        'val_loss': loss.item(),
        'val_acc': accuracy_score(truthl, predl),
        'val_cil': lower,
        'val_ciu': upper,
    }
def getStats(self):
    '''Gets statistics of the recovered parameters'''
    stats = []
    for idx in arange(len(self.recparams)):
        data = self.chain[idx]
        stats.append([
            min(data), max(data), mean(data), stdev(data),
            getConfidenceIntervals(
                hist(data, 1000)[0],
                linspace(min(data), max(data), 1000),
                [68.0, 95.0, 99.0])
        ])
    return stats
def test(self, sample1: np.ndarray, sample2: np.ndarray, alpha=0.05):
    iters = self.options.get('iters', 1000)
    gt = compute_pr_x_ge_y(sample1, sample2)
    sample = np.concatenate((sample1, sample2))
    n = len(sample1)
    stats = []
    for _ in range(iters):
        np.random.shuffle(sample)
        sample1 = sample[:n]
        sample2 = sample[n:]
        stats.append(compute_pr_x_ge_y(sample1, sample2))
    p = np.mean(np.array(stats) <= gt)
    return p < alpha, p, p
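compute_pr_x_ge_y is not shown in this permutation test. A plausible sketch of the Pr[X >= Y] statistic the name suggests, labeled explicitly as an assumption rather than the project's actual helper:

import numpy as np

def compute_pr_x_ge_y(x: np.ndarray, y: np.ndarray) -> float:
    # Hypothetical helper: fraction of all (x_i, y_j) pairs with x_i >= y_j.
    return float(np.mean(x[:, None] >= y[None, :]))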
def getStatsForDataframe(df):
    # https://www.kaggle.com/artgor/is-this-malware-eda-fe-and-lgb-updated
    stats = []
    for col in df.columns:
        stats.append(
            (col,
             df[col].nunique(),
             df[col].isnull().sum() * 100 / df.shape[0],
             df[col].value_counts(normalize=True, dropna=False).values[0] * 100,
             df[col].dtype))
    stats_df = pd.DataFrame(
        stats,
        columns=['Feature', 'Unique_values', '%Missing', '%Biggest', 'type'])
    return stats_df.sort_values('%Missing', ascending=False)
def record(n_iters: int, outfile: str, command: List[str]) -> None:
    stats: List[Stat] = []
    for i in range(n_iters):
        print(f'\r[{i+1} / {n_iters}] Running {command}', end='', flush=True)
        stats.append(perf_stat(command))
    with open(outfile, 'w', encoding='utf-8') as f:
        f.write('cycles\tinstructions\tseconds_elapsed\tseconds_user\tseconds_sys\n')
        for s in stats:
            f.write('\t'.join(s) + '\n')
    print(f'\nResults written to {outfile}.')
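A usage sketch for record above, assuming perf_stat returns one Stat (a sequence of the five string fields) per run; the command and output path are illustrative:

# Measure `ls -l` ten times and write a tab-separated summary.
record(10, 'perf_results.tsv', ['ls', '-l'])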
def calculate_stats(output, target):
    """Calculate statistics including mAP, AUC, etc.

    Args:
      output: 2d array, (samples_num, classes_num)
      target: 2d array, (samples_num, classes_num)

    Returns:
      stats: list of statistic of each class.
    """
    classes_num = target.shape[-1]
    stats = []

    # Class-wise statistics
    for k in range(classes_num):
        # Average precision
        avg_precision = metrics.average_precision_score(target[:, k], output[:, k],
                                                        average=None)

        # AUC (undefined when a class has only one label value present)
        try:
            auc = metrics.roc_auc_score(target[:, k], output[:, k], average=None)
        except ValueError:
            auc = None

        # Precisions, recalls
        (precisions, recalls, thresholds) = metrics.precision_recall_curve(
            target[:, k], output[:, k])

        # FPR, TPR
        (fpr, tpr, thresholds) = metrics.roc_curve(target[:, k], output[:, k])

        save_every_steps = 1000  # Sample statistics to reduce size
        stat_dict = {'precisions': precisions[0::save_every_steps],
                     'recalls': recalls[0::save_every_steps],
                     'AP': avg_precision,
                     'fpr': fpr[0::save_every_steps],
                     'fnr': 1. - tpr[0::save_every_steps]
                     #'auc': auc
                     }
        stats.append(stat_dict)

    return stats
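A usage sketch for the 2d calculate_stats above, assuming `from sklearn import metrics` at module level (as the function requires); the random scores and labels are illustrative:

import numpy as np

rng = np.random.default_rng(0)
scores = rng.random((500, 10))                      # (samples_num, classes_num)
labels = (rng.random((500, 10)) > 0.5).astype(int)  # binary targets per class
per_class = calculate_stats(scores, labels)
print(per_class[0]['AP'], per_class[0]['fnr'])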
def calc_missing_stat(df, missing_only=True):
    stats = []
    for col in df.columns:
        stats.append(
            (col,
             df[col].nunique(),
             df[col].isnull().sum() * 100 / df.shape[0],
             df[col].dtype))
    stats_df = pd.DataFrame(
        stats, columns=['feature', 'num_of_unique', 'pct_of_missing', 'type'])
    stats_df = stats_df.sort_values('pct_of_missing',
                                    ascending=False).reset_index(drop=True)
    if missing_only:
        stats_df = stats_df[stats_df['pct_of_missing'] > 0]
    return stats_df
def matrix_stats(X, n=6):
    '''
    Order of stats: mean, median, skewness, kurtosis, standard dev, sum of max,
    sum of min, RMS, interquartile range, l-scale, l-skewness, l-kurtosis.
    That is for n=4; for larger n the order is the same but additional
    L-moments are appended to the end, one per unit increase in n.

    Input: matrix of rows of data
    Output: a 'summary' of each row as a row vector, joined into a matrix of
    stats summarizing the data matrix
    '''
    X = np.array(X)
    stats = []
    try:
        for x in X:
            stats.append(onerowstats(x, n))
    except TypeError:
        # X was a single row rather than a matrix
        stats.append(onerowstats(X, n))
    return np.array(stats)
def bootstrappedConfidenceInterval(trials, computeStatisticFunction, numResamples, pValue):
    stats = []
    t = time.time()
    for i in range(0, numResamples):
        # This is just to know when it'll be finished when it runs on a laptop.
        update(i, numResamples, t)
        resample = trials.sample(True, 1.0)
        stats.append(computeStatisticFunction(resample))
    # sorted() returns a new list and discards it here; sort in place so the
    # percentile indices below refer to ordered statistics.
    stats.sort()
    lowerIndex = int(numResamples * pValue / 2 - 1)
    upperIndex = int(np.ceil(numResamples * (1 - pValue / 2)))
    return (stats[lowerIndex], stats[upperIndex])
def answerCDF(api_site_name):
    try:
        key = bucket.get_key("QA/%s.json.zlib" % api_site_name)
        items = json.loads(zlib.decompress(key.get_contents_as_string()))
    except:
        print("data not found on S3. Please crawl data from stackexchange website first")
        return

    A = []
    S = []
    Sa = []
    rank = []
    score = []
    time = []
    for item in items:
        a = 0
        s = 0
        t0 = item['creation_date']
        if 'answers' in item:
            for i, answer in enumerate(item['answers']):
                a += 1
                dt = answer['creation_date'] - t0
                time.append(dt)
                score.append(answer['score'])
                s += answer['score']
                rank.append(i + 1)
                #t0 = answer['creation_date']
        A.append(a)
        Sa.append(s)
        S.append(item['score'])

    return {'rank': np.array(rank),
            'score': np.array(score),
            'time': np.array(time),
            'A': np.array(A),
            'S': np.array(S),
            'Sa': np.array(Sa)}
def run_expts(self, config: SchedulerConfig, num_srvs: int, num_expts: int,
              seed_num: int):
    """Runs a number of experiments with the specified configuration.

    Args:
        config: The configuration of the Scheduler within the experiments.
        num_srvs: The total number of servers.
        num_expts: The number of experiments to be run.
        seed_num: A seed used to update the job generator.

    Returns:
        list: A list of scheduling statistics.
    """
    stats = []
    for i in range(num_expts):
        expt_stats = self._run_expt(config, num_srvs, seed_num + i)
        stats.append(expt_stats)
    return stats
def loop_show_asymmetry(prefix, grouping_keys=['Gender', 'FDH_23_Handedness_Prtcpnt'],
                        xaxis_key='Age_At_IMGExam', plots='regressions'):
    """ Loop over all properties to show asymmetry."""
    data = get_all_data()
    data.filter(lambda k, v: 'fuzzy' not in k)  # Remove 'fuzzy'
    data.filter([partial(lambda k, v, p: (k.startswith(p) or
                                          k in grouping_keys or
                                          k == xaxis_key),
                         p=p)
                 for p in prefix])

    # Process & plot the data.
    stats = []
    regressions = []
    group_samples = []
    measure_keys = data.get_twohemi_keys()
    for pi, key in enumerate(sorted(measure_keys)):
        print("Comparing %d (%s)..." % (pi, key))
        gn, ss, rv, gs = compare_group_asymmetry(data.data_dict, xaxis_key=xaxis_key,
                                                 yaxis_key=key, plots=plots,
                                                 grouping_keys=grouping_keys,
                                                 measure_key=key)
        stats.append(ss)
        regressions.append(rv)
        group_samples.append(gs)

    if 'regression_stats' in plots:
        dump_regressions_csv(regressions, group_names=gn,
                             measure_names=measure_keys)
        plot_regressions_scatter(regressions, group_names=gn,
                                 measure_names=measure_keys)

    if 'stat_distributions' in plots:
        plot_stat_distributions(stats, group_names=gn)

    plt.show()
def test_search2():
    # Test that the search region works.
    search = rft.IntrinsicVolumes([3, 4, 5])
    x = np.linspace(0.1, 10, 100)

    stats = [rft.Gaussian(search=search)]
    ostats = [rft.Gaussian()]
    for dfn in range(5, 10):
        for dfd in [40, 50, np.inf]:
            stats.append(rft.FStat(dfn=dfn, dfd=dfd, search=search))
            ostats.append(rft.FStat(dfn=dfn, dfd=dfd))
            stats.append(rft.TStat(dfd=dfd, search=search))
            ostats.append(rft.TStat(dfd=dfd))
        stats.append(rft.ChiSquared(dfn=dfn, search=search))
        ostats.append(rft.ChiSquared(dfn=dfn))

    for i in range(len(stats)):
        stat = stats[i]
        ostat = ostats[i]
        v1 = stat(x)
        v2 = 0
        for j in range(search.mu.shape[0]):
            v2 += ostat.density(x, j) * search.mu[j]
        assert_almost_equal(v1, v2)
def _test_binary_predictor(self, features, responses):
    final_stats = []
    final_probs = []
    final_responses = []

    for predictors_array, ct_prefix, celltype_idx in [(self.ct1_predictor, 'ct1', 0),
                                                      (self.ct2_predictor, 'ct2', 1)]:
        echo('Constructing examples for testing for ' + ct_prefix)

        stats = []
        probs = []
        resp = []

        formatted_features, formatted_responses = \
            self.format_features_and_responses(
                features={'primary_mark_signal': features[ct_prefix + '_mark_signal']},
                responses=responses[ct_prefix + '_target_regions'])

        tp, fp, tn, fn, p = self._eval_predictor(predictors_array,
                                                 formatted_features,
                                                 formatted_responses,
                                                 return_probs=True,
                                                 celltype_idx=celltype_idx,
                                                 bagged_idx='all')
        stats.append((tp, fp, tn, fn))
        probs.extend(p)
        resp.extend(formatted_responses)

        echo('Global Evaluation')
        print_eval(*map(sum, zip(*stats)))

        if len(resp) != len(probs):
            print('ERROR', len(resp), len(probs))
            exit(1)

        final_stats.append(stats)
        final_probs.append(probs)
        final_responses.append(resp)

    return final_stats, final_probs, final_responses
def calc_multigof(data, model):
    #choice_limit = 1e-2 / len(data)
    #choices = numpy.array([[scipy.stats.poisson(m).pmf(i)
    #    for i in range(
    #        int(scipy.stats.poisson(m).ppf(choice_limit)),
    #        int(scipy.stats.poisson(m).ppf(1 - choice_limit) + 1))]
    #    for m in model])
    stats = []
    for i in range(20):
        n = 4**i
        if n > len(data):
            break
        for j in range(int(numpy.ceil(len(data) * 1. / n))):
            dpart = data[j*n:(j + 1)*n]
            mpart = model[j*n:(j + 1)*n]
            #cpart = choices[j*n:(j + 1)*n]
            k = int(dpart.sum()) if len(dpart) > 0 else 0
            m = mpart.sum()
            # compare this probability to all the other k
            #probs = gen_choices(cpart, k)
            if m > 0:
                probs = scipy.stats.poisson(m).pmf(k)
            else:
                probs = 1 if k == 0 else 1e-10
            stats.append([n, j, probs, m, k])  # * len(data) / n])
            assert not numpy.isnan(probs), [mpart.sum(), k]
    return numpy.array(stats)
def test_search1():
    # Test that the search region works.
    # XXX - we are not testing anything
    search = rft.IntrinsicVolumes([3, 4, 5])
    x = np.linspace(0.1, 10, 100)

    stats = [rft.Gaussian()]
    for dfn in range(5, 10):
        for dfd in [40, 50, np.inf]:
            stats.append(rft.FStat(dfn=dfn, dfd=dfd))
            stats.append(rft.TStat(dfd=dfd))
        stats.append(rft.ChiSquared(dfn=dfn))

    for dim in range(7):
        for stat in stats:
            # XXX - v1 appears to be unused
            v1 = stat(x, search=search)
            v2 = 0
            for i in range(search.mu.shape[0]):
                v2 += stat.density(x, i) * search.mu[i]
dataT89 = []
for i in range(nQual):
    dataT89.append(results[i]['Lgm_B_T89']['time'])
dataT89 = array(dataT89)

dataOP77 = []
for i in range(nQual):
    dataOP77.append(results[i]['Lgm_B_OP77']['time'])
dataOP77 = array(dataOP77)

data = hstack([dataT89.transpose(), dataOP77.transpose()])
data = hstack([data[0:, 0::2], data[0:, 1::2]])  # T89 and OP77 alternate now

# compute statistics on if the medians are different in the two runs
stats = []
for val in range(nQual):
    # for each pair of data do the test
    stats.append(scipy.stats.mannwhitneyu(data[:, val*2], data[:, val*2+1])[1] * 2)  # get p value *2 for 2 sided
for i, val in enumerate(stats):
    if val < 0.05:
        stats[i] = 'Different'
    else:
        stats[i] = 'Same'

figure()
boxplot(data, notch=True, positions=range(nQual*2))
ax = gca()
ax.set_xlabel('Quality number')
ax.set_ylabel('Run time')
ax.set_title(socket.gethostname() + ' LstarVersusPA Calcs ' +
             str(datetime.datetime.now().month) + '-' +
             str(datetime.datetime.now().day) + '-' +
             str(datetime.datetime.now().year))
    c_name = "%s_%s" % (clust, cl)
    clust_col_names.append(c_name)
    coords = clust_coords[clust][cl]
    for ss in subj_list:
        lg.info("get peak for Clust %s, subj %s" % (clust, ss))
        L.append(mask_dump_peak(clust, coords, ss))
    clust_dat = clust_dat.append(pd.Series(L))

out_dat = pd.DataFrame(clust_dat.reshape(7, 17).T, columns=clust_col_names)
outname = os.path.join(stdoutdir, "peak_voxel_data.csv")
out_dat.to_csv(outname, index=False)
lg.info(out_dat.corr())

# Doing the tests
import itertools
import scipy.stats

conds = []
stats = []
for combo in itertools.combinations(range(out_dat.shape[1]), 2):
    conds.append(list([out_dat.columns[combo[0]], out_dat.columns[combo[1]]]))
    stats.append(list(scipy.stats.pearsonr(out_dat.iloc[:, combo[0]],
                                           out_dat.iloc[:, combo[1]])))

corr_res = pd.concat([pd.DataFrame(conds), pd.DataFrame(stats)], axis=1)
col_heads = ["condition1", "condition2", "rvalue", "pvalue"]
corr_res.columns = col_heads
outname_corr = os.path.join(stdoutdir, "corr_tests_on_peak_voxel_data.csv")
corr_res.to_csv(outname_corr, index=False)
def plotPhaseTransition_d05(descDic, percentile=25, plot=False):
    resultDir = "/Users/maithoma/work/compute/pgames_d05_transition/results/"
    #print listRootFilenames()
    rt_fnames = selectRootFilenames(descDic)['list_rt']
    rt_variables = selectRootFilenames(descDic)['var_dic']

    S = []
    C = []
    s = []
    cMedian = []
    cDown = []
    cUp = []
    cCountUp = []
    cCountDown = []
    cCountMiddle = []

    for r, rt in enumerate(rt_fnames):
        i = 0
        c = []
        s.append(rt_variables['s'][r])
        while True:
            try:
                filename = rt + "_%s.csv" % i
                coop = parseSummary(filename)['coop_level'][-1]
                c = np.append(c, coop)
                S.append(rt_variables['s'][r])
                C.append(coop)
                i += 1
            except IOError:
                break
        print(rt, c)

        cMedian.append(np.mean(c))
        cDown.append(np.percentile(c, percentile))
        cUp.append(np.percentile(c, 100 - percentile))
        cCountUp.append(len(c[c > 0.8]) / float(len(c)))
        cCountDown.append(len(c[c < 0.2]) / float(len(c)))
        cCountMiddle.append(len(c[(c >= 0.2) * (c <= 0.8)]) / float(len(c)))

    dic = {'s': s, 'cMedian': cMedian, 'cUp': cUp, 'cDown': cDown,
           'C': C, 'S': S, 'cCountDown': cCountDown,
           'cCountUp': cCountUp, 'cCountMiddle': cCountMiddle}

    if plot:
        pl.close("all")
        pl.figure(1)
        #pl.plot(dic['s'],dic['cCountDown'],'r-+',lw=1)
        #pl.plot(dic['s'],dic['cCountUp'],'g-x',lw=1)
        #pl.plot(dic['s'],dic['cCountMiddle'],'b-')
        pl.xlabel("Property Violation s")
        #pl.ylabel("Probability that cooperation wins (green) or disappears (red), \n or intermediary state (blue)")
        pl.ylim(-0.05, 1.05)
        #pl.xlim(xmax=0.05)

        pl.plot(dic['s'], dic['cMedian'], 'k-.', lw=2)
        pl.fill_between(dic['s'], dic['cDown'], dic['cUp'], color='k', alpha=0.2)
        #pl.plot(dic['s'],dic['cUp'],'k-.',lw=2)
        #pl.plot(dic['s'],dic['cDown'],'k-.',lw=2)
        pl.plot(S, C, 'ko')
        pl.xlabel("Property Violation s")
        pl.ylabel("Median Cooperation Level")

    return dic
def ttest(dsets, sa_labels=None, return_values='mt', set_NaN_to=0., compare_to=0.):
    '''Runs a one-sample t-test across datasets

    Parameters
    ----------
    dsets: str or list of dicts
        (filenames of) NIML dsets, each referring to PxQ data for
        P nodes (features) and Q values per node (samples)
    sa_labels: list of (int or str)
        indices or labels of columns to compare
    return_values: str (default: 'mt')
        'm' or 't' or 'mt' to return sample mean, t-value, or both
    set_NaN_to: float or None (default: 0.)
        the value that NaNs in dsets are replaced by. If None then NaNs
        are kept.
    compare_to: float (default: 0.)
        t-tests are compared against the null hypothesis of a mean of
        compare_to.

    Returns
    -------
    dset: dict
        NIML dset-compatible dict with fields 'data', 'labels', 'stats'
        and 'node_indices' set.
    '''
    do_m = 'm' in return_values
    do_t = 't' in return_values

    if not (do_m or do_t):
        raise ValueError("Have to return at least m or t")

    ns = len(dsets)
    for i, dset in enumerate(dsets):
        dset = from_any(dset)
        dset_data = dset['data']

        if i == 0:
            sh = dset_data.shape
            if sa_labels is None:
                if 'labels' in dset:
                    sa_labels = dset['labels']
                    dset_labels = sa_labels
                else:
                    sa_labels = range(sh[1])
                    dset_labels = ['%d' % j for j in sa_labels]
            else:
                dset_labels = sa_labels

            nc = len(dset_labels) if dset_labels else sh[1]
            nn = sh[0]

            # number of nodes, columns, subjects
            data = np.zeros((nn, nc, ns), dset_data.dtype)

        if 'node_indices' in dset:
            node_idxs = np.reshape(dset['node_indices'], (-1,))
        else:
            node_idxs = np.arange(nn)

        if i == 0:
            node_idxs0 = node_idxs
        else:
            if set(node_idxs0) != set(node_idxs):
                raise ValueError("non-matching node indices for %d and %d" % (0, i))

        col_idxs = np.asarray(label2index(dset, sa_labels))
        data[node_idxs, :, i] = dset_data[:, col_idxs]

    # subtract the value it is compared to
    # so that it now tests against a mean of zero
    if do_m:
        m = np.mean(data, axis=2)
    if do_t:
        from scipy import stats
        t = stats.ttest_1samp(data - compare_to, 0., axis=2)[0]

    if do_m and do_t:
        r = np.zeros((nn, 2 * nc), dtype=m.dtype)
        r[:, np.arange(0, 2 * nc, 2)] = m
        r[:, np.arange(1, 2 * nc, 2)] = t
    elif do_t:
        r = t
    elif do_m:
        r = m

    pf = []
    stats = []
    if do_m:
        pf.append('m')
        stats.append('None')
    if do_t:
        pf.append('t')
        stats.append('Ttest(%d)' % (ns - 1))

    labs = sum([['%s_%s' % (p, lab) for p in pf] for lab in dset_labels], [])
    stats = stats * nc

    if set_NaN_to is not None:
        r[np.logical_not(np.isfinite(r))] = set_NaN_to

    return dict(data=r, labels=labs, stats=stats, node_indices=node_idxs0)
def build_ffnet(sorted_data, training_set_size):
    logging.info('starting new run! -----------------------------')
    print('defining network')

    from ffnet import ffnet, imlgraph, mlgraph, loadnet, savenet
    from time import time
    from multiprocessing import cpu_count
    import networkx
    import pylab

    #data_in_training2 = sorted_data[:training_set_size,10:-2].astype(float).tolist()
    data_target_training2 = [[i] for i in sorted_data[:training_set_size, 0].astype(float)]

    new_data_in = sorted_data[:training_set_size, col_training_set[0]]
    for i in col_training_set[1:]:
        new_data_in = numpy.column_stack((new_data_in, sorted_data[:training_set_size, i]))
    data_in_training2 = new_data_in.astype(float).tolist()

    # Define net (large one)
    conec = mlgraph(network_config, biases=False)  # skipping first 11 cols
    net = ffnet(conec)

    print('saving initialized net')
    savenet(net, 'starting_net.n')
    #net = loadnet('starting_net.n')  # this way we can init a complex net just once

    #print 'draw network'
    #networkx.draw_graphviz(net.graph, prog='dot')
    #pylab.show()

    graph_weekly(net, sorted_data, training_set_size)  # just saving a pic
    logging.info('network built as: ' + str(network_config))

    print("TRAINING NETWORK...")
    # there are many different training algos
    #net.train_rprop(data_in_training2, data_target_training2, a=1.9, b=0.1, mimin=1e-06, mimax=15.0, xmi=0.5, maxiter=max_functions, disp=1)
    ###net.train_momentum(data_in_training2, data_target_training2, eta=0.2, momentum=0.1, maxiter=max_functions, disp=1)

    stats = []
    smallest_error = 1000
    total = 0
    try:
        for i in range(min_loops, max_loops):
            total += max_functions + i
            if total > max_total:
                break
            print('training for:', max_functions + i, "total is:", total)
            net.train_tnc(data_in_training2, data_target_training2,
                          maxfun=max_functions + i, messages=1)
            #net.train_rprop(data_in_training2, data_target_training2, a=1.2, b=0.5, mimin=1e-06, mimax=50.0, xmi=0.1, maxiter=max_functions*20, disp=1)
            graph_weekly(net, sorted_data, training_set_size)  # just saving a pic
            in0, out0, s1, s2, mape_weekly_all = calc_stats(net, sorted_data, training_set_size)
            stats.append((in0, out0, total, s1, s2, mape_weekly_all))
            #if out0<=(biggest/1.4) and in0>.7:
            #if out0<=(smallest_error/4) and in0>overfitting_threshold:
            #    print 'we hit overfitting threshold - breaking out early'
            #    break
            if mape_weekly_all < smallest_error:
                # found a new best
                smallest_error = mape_weekly_all
                savenet(net, 'best_net.n')
    except KeyboardInterrupt:
        # this way command-c just breaks out of this loop
        pass

    #net.train_cg(data_in_training2, data_target_training2, maxiter=max_functions, disp=1)
    #net.train_genetic(data_in_training2, data_target_training2, individuals=max_population, generations=max_functions)
    #net.train_bfgs(data_in_training2, data_target_training2, maxfun=max_functions, disp=1)

    stats = sorted(stats, reverse=True, key=lambda x: x[1])
    for i in stats:
        temp_string = ''
        for x in i:
            temp_string += str(x) + ','
        print(temp_string)

    net = loadnet('best_net.n')
    return net