def test_1D_array(self):
    a = array((1, 2, 3, 4), float64)
    actual = stats.hmean(a)
    desired = 4. / (1. / 1 + 1. / 2 + 1. / 3 + 1. / 4)
    assert_almost_equal(actual, desired, decimal=14)

    desired1 = stats.hmean(a, axis=-1)
    assert_almost_equal(actual, desired1, decimal=14)
def test_2D_array_default(self):
    a = array(((1, 2, 3, 4),
               (1, 2, 3, 4),
               (1, 2, 3, 4)))
    actual = stats.hmean(a)
    desired = array((1., 2., 3., 4.))
    assert_array_almost_equal(actual, desired, decimal=14)

    actual1 = stats.hmean(a, axis=0)
    assert_array_almost_equal(actual1, desired, decimal=14)
def test_hmean(self):
    for n in self.get_n():
        x, y, xm, ym = self.generate_xy_sample(n)
        r = stats.hmean(abs(x))
        rm = stats.mstats.hmean(abs(xm))
        assert_almost_equal(r, rm, 10)

        r = stats.hmean(abs(y))
        rm = stats.mstats.hmean(abs(ym))
        assert_almost_equal(r, rm, 10)
def sset_sim(ssset_1, ssset_2):
    sim = []
    if len(ssset_1) == 0 or len(ssset_2) == 0:
        return 0.0
    for synset_1 in ssset_1:
        for synset_2 in ssset_2:
            s1, s2, s3 = synset_1.path_similarity(synset_2), \
                synset_1.wup_similarity(synset_2), \
                1.0
            if s1 > 0 and s2 > 0 and s3 > 0:
                new_sim = stats.hmean((s1, s2, s3))
                if new_sim > 0:
                    sim.append(new_sim)
    return stats.hmean(sim) if len(sim) > 0 else 0.0
def reprocess_classifier(x_vals, x_sub, prior_train, prior_test, y_vals, j,
                         skf, clf, scores_only=False):
    # train classifiers and return out-of-fold train predictions
    # and blended submission predictions
    blend_train = prior_train
    blend_test = prior_test
    blend_test_j = np.zeros((x_sub.shape[0], len(skf)))
    for i, (train, test) in enumerate(skf):
        X_train = x_vals[train]
        y_train = y_vals[train]
        X_test = x_vals[test]
        clf.fit(X_train, y_train)
        blend_train[test, j] = clf.predict_proba(X_test)[:, 1]
        blend_test_j[:, i] = clf.predict_proba(x_sub)[:, 1]
    blend_test[:, j] = hmean(clip_probabilities(blend_test_j), axis=1)
    print "clf:", j, "logloss:", logloss(y_vals, blend_train[:, j])
    dataset_blend_train = clip_probabilities(blend_train)
    dataset_blend_test = clip_probabilities(blend_test)
    if scores_only:
        return dataset_blend_train, dataset_blend_test
    else:
        X_stacked = np.hstack([x_vals, dataset_blend_train])
        X_submission_stacked = np.hstack([x_sub, dataset_blend_test])
        return X_stacked, X_submission_stacked
def _school_similarity(school1, school2, threshold=-sys.float_info.max,
                       proximity_only=False, explain=False):
    loc1 = school1['address']['coordinates'] if 'coordinates' in school1['address'] else None
    loc2 = school2['address']['coordinates'] if 'coordinates' in school2['address'] else None
    sim_loc = location_similarity(loc1, loc2, radius=_radius)
    if proximity_only:
        sim_name = None
        if sim_loc:
            sims = [sim_loc]
        else:
            sims = [-sys.float_info.max]
    else:
        sim_name = token_similarity(school1['name'], school2['name'])
        sims = [sim_name, sim_loc]
    score = stats.hmean(sims) if all([s and s > 0 for s in sims]) else -sys.float_info.max
    explanation = None
    if explain:
        explanation = {}
        explanation['model'] = school1
        explanation['entity'] = school2
        explanation['Score explanation'] = {}
        explanation['Score explanation']['1 - distance (km)'] = distance(loc1, loc2)
        explanation['Score explanation']['2 - _radius for similarity (km)'] = _radius
        if sim_name:
            explanation['Score explanation']['3 - similarity of names'] = sim_name
        explanation['Score explanation']['4 - similarity of locations'] = sim_loc
        explanation['Score explanation']['5 - final score'] = score
        explanation['Score explanation']['6 - _score_threshold'] = threshold
    return score, explanation
def find_error(predicted, actual, fold_stats):
    recall = [0.0, 0.0, 0.0]
    precision = [0.0, 0.0, 0.0]
    fstat = [0.0, 0.0, 0.0]
    for i in range(0, 3):
        tp = 0
        fp = 0
        fn = 0
        tn = 0
        for j in range(0, len(actual)):
            if predicted[j] == i:
                if actual[j] == i:
                    tp = tp + 1
                else:
                    fp = fp + 1
            else:
                if actual[j] == i:
                    fn = fn + 1
                else:
                    tn = tn + 1
        recall[i] = tp / (tp + fn)
        precision[i] = tp / (tp + fp)
        fstat[i] = stats.hmean([recall[i], precision[i]])
    recall_avg = (recall[0] + recall[1] + recall[2]) / 3
    precision_avg = (precision[0] + precision[1] + precision[2]) / 3
    fstat_avg = (fstat[0] + fstat[1] + fstat[2]) / 3
    fold_stats.append([recall_avg, precision_avg, fstat_avg])
def do_cohort(case, model, N0, nindiv, corr_name):
    last = 0.5
    fig, ax = plt.subplots(figsize=(16, 9))
    nb = Nbs[(model, N0)]
    # fig.suptitle("Nb: %d (N1: %d) - different cohorts - 100 SNPs -%s" %
    #              (nb, N0, corr_name), fontsize=18)
    fig.suptitle("Nb: %d - different cohorts - 100 SNPs - %s" % (nb, corr_name),
                 fontsize=24)
    box_vals = []
    labels = []
    tops = []
    bottoms = []
    hmeans = []
    bname = get_bname(model)
    for cohort in cohorts:
        vals, ci, r2, sr2, j, ssize = \
            case[cohort][(model, N0)][(None, nindiv, 100, "SNP")]
        for cname, corrections in get_corrs(N0, bname, nindiv, vals, ci,
                                            r2, sr2, j):
            if cname != corr_name:
                continue
            cvals, cci = corrections
            vals = cvals
            ci = cci
            break
        box_vals.append(vals)
        hmeans.append(hmean(vals))
        bottom, top = list(zip(*ci))
        top = [100000 if x is None else x for x in top]
        bottom = [100000 if x is None else x for x in bottom]
        tops.append(np.percentile(top, 90))
        bottoms.append(np.percentile(bottom, 10))
        if cohort == 'c2c':
            labels.append("2 cohorts")
        elif cohort == 'c3c':
            labels.append("3 cohorts")
        else:
            labels.append("%s" % cohort)
        if cohort == cohorts[-1]:
            pos = len(labels) + 0.5
            ax.axvline(pos, color="k", lw=0.2)
            ax.text(last + (pos - last) / 2, 0,
                    "%d Individuals sampled" % nindiv,
                    ha="center", va="bottom", size=24, rotation="horizontal")
            last = pos
    ax.set_ylim(0, nb * 3)
    ax.set_ylabel(r'$\hat{N}_{e}$', fontsize=32)
    ax.axhline(nb, color="k", lw=0.3)
    sns.boxplot(box_vals, notch=0, sym="")
    ax.set_xticks(1 + np.arange(len(labels)))
    ax.set_xticklabels(labels, fontsize=24)
    ax.plot([1 + x for x in range(len(tops))], tops, "rx")
    ax.plot([1 + x for x in range(len(bottoms))], bottoms, "rx")
    ax.plot([1 + x for x in range(len(hmeans))], hmeans, "k+")
    yticks = [0, nb // 2, nb, 2 * nb, 3 * nb]
    ax.set_yticks(yticks)
    ax.set_yticklabels([str(y) for y in yticks], fontsize=14)
    # fig.savefig("output/cohort-%s-%s-%d.png" % (model, corr_name, N0))
    return fig
def calculate_avg_condnum(grad_tensor, qoi_set):
    r"""
    Given gradient vectors at some points (centers) in the parameter space
    and given a specific set of QoIs, calculate the average condition number
    of the matrices formed by the gradient vectors of each QoI map at each
    center.

    :param grad_tensor: Gradient vectors at each center in the parameter
        space :math:`\Lambda` for each QoI map.
    :type grad_tensor: :class:`np.ndarray` of shape (num_centers, num_qois,
        Lambda_dim) where num_centers is the number of points in
        :math:`\Lambda` we have approximated the gradient vectors and
        num_qois is the number of QoIs we are given.
    :param list qoi_set: List of QoI indices
    :rtype: tuple
    :returns: (condnum, singvals) where condnum is a float and singvals
        has shape (num_centers, Data_dim)
    """
    # Calculate the singular values of the matrix formed by the gradient
    # vectors of each QoI map.  This gives a set of singular values for
    # each center.
    singvals = np.linalg.svd(grad_tensor[:, qoi_set, :], compute_uv=False)
    indz = singvals[:, -1] == 0
    if np.sum(indz) == singvals.shape[0]:
        hmean_condnum = np.inf
    else:
        singvals[indz, 0] = np.inf
        singvals[indz, -1] = 1
        condnums = singvals[:, 0] / singvals[:, -1]
        hmean_condnum = stats.hmean(condnums)
    return hmean_condnum, singvals
def find_error(predicted, actual, fold_stats):
    recall = [0.0, 0.0]
    precision = [0.0, 0.0]
    fstat = [0.0, 0.0]
    for i in range(0, 2):
        tp = 0
        fp = 0
        fn = 0
        tn = 0
        for j in range(0, len(actual)):
            if predicted[j] == i:
                if actual[j] == i:
                    tp = tp + 1
                else:
                    fp = fp + 1
            else:
                if actual[j] == i:
                    fn = fn + 1
                else:
                    tn = tn + 1
        # print(i, j, tp, fp, fn, tn)
        if tp > 0:
            recall[i] = tp / (tp + fn)
            precision[i] = tp / (tp + fp)
        else:
            recall[i] = 0.0
            precision[i] = 0.0
        fstat[i] = stats.hmean([recall[i], precision[i]]) \
            if recall[i] > 0 and precision[i] > 0 else 0.0
    recall_avg = numpy.mean(recall)
    precision_avg = numpy.mean(precision)
    fstat_avg = numpy.mean(fstat)
    fold_stats.append([recall_avg, precision_avg, fstat_avg])
def create_simple_record(sequence):
    features = np.zeros(1, SimpleRecordType)
    features["mean"] = sequence.mean()
    features["var"] = sequence.var()
    features["skewness"] = stats.skew(sequence)
    features["kurtosis"] = stats.kurtosis(sequence)
    features["first"] = sequence[0]
    features["sign"] = np.sign(sequence).mean()
    features["zeros"] = (sequence == 0).mean()
    if features["zeros"] == 0.0:
        features["harmonic_mean"] = stats.hmean(abs(sequence))
        features["geometric_mean"] = stats.gmean(abs(sequence))
    else:
        features["harmonic_mean"] = np.nan
        features["geometric_mean"] = np.nan
    for m in [2, 3, 5]:
        seqm = sequence % m
        for v in range(m):
            features["val_%dmod%d" % (v, m)] = (seqm == v).mean()
    return features
def hyperloglog(multiset, b=5):
    """Compute the estimate of the number of unique elements in multiset."""
    m = 2 ** b
    registers = [0] * (m + 1)
    for item in multiset:
        # zlib.adler32 needs bytes, so encode the string representation
        x = zlib.adler32(str(item).encode())
        bin_x = bin(x)[2:]  # truncate the leading "0b"
        leftbits = bin_x[:b]
        j = int("0b" + leftbits, 2) + 1
        rightbits = bin_x[b:]
        w = rightbits
        p = w.find("1") + 1
        print("w = ", w)
        print("p = ", p)
        registers[j] = max(registers[j], p)
    print(registers)
    print(harmonic_mean(registers))
    print(stats.hmean(registers))
    return alpha(m) * (m ** 2) * harmonic_mean(registers)
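# Hedged usage sketch for hyperloglog() above: alpha() and harmonic_mean()
# are helpers from the original source that are not shown in this excerpt,
# so this only illustrates the call shape, not a verified estimate.
items = ["user%d" % (i % 700) for i in range(1000)]  # 700 unique strings
estimate = hyperloglog(items, b=5)
print("estimated cardinality:", estimate)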
def get_f(lex, gs_corpus):
    gs = squeeze(asarray(gs_corpus.sents))
    gs_num_mappings = shape(gs)[0]
    links = nonzero(lex)
    obj_is = links[0]
    word_is = links[1]
    lex_num_mappings = size(obj_is)

    # compute precision: what portion of the target lex is composed of
    # gold pairings
    p_count = 0
    for pair in range(lex_num_mappings):
        this_obj = obj_is[pair]
        this_word = word_is[pair]
        # loop over gold standard
        if size(where((gs[:, 0] == this_obj) & (gs[:, 1] == this_word))) > 0:
            p_count = p_count + 1
    if lex_num_mappings == 0:  # special case
        precision = 0
    else:
        precision = float(p_count) / float(lex_num_mappings)

    # compute recall: how many of the total gold pairings are in the target lex
    recall = float(p_count) / float(gs_num_mappings)

    # now F is just the harmonic mean
    f = stats.hmean([recall, precision])
    return (precision, recall, f)
def base_harmo_ave(arry):
    """
    Compute the harmonic mean of a one-dimensional list and return it
    rounded to two decimal places.
    """
    result = stats.hmean(arry)
    return round(result, 2)
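# Quick check of base_harmo_ave() above (assuming `stats` is scipy.stats):
# the harmonic mean of 1, 2 and 4 is 3 / (1/1 + 1/2 + 1/4) = 12/7.
from scipy import stats

print(base_harmo_ave([1, 2, 4]))  # 1.71 (12/7 rounded to two decimals)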
def test_2D_array_dim1(self):
    a = array(((1, 2, 3, 4),
               (1, 2, 3, 4),
               (1, 2, 3, 4)))
    v = 4. / (1. / 1 + 1. / 2 + 1. / 3 + 1. / 4)
    desired1 = array((v, v, v))
    actual1 = stats.hmean(a, axis=1)
    assert_array_almost_equal(actual1, desired1, decimal=14)
def do_pcrit(case, model, N0, isSNP):
    if isSNP:
        markers = [100, 200, 400]
        marker_name = 'SNP'
    else:
        markers = [15, 50, 100]
        marker_name = 'MSAT'
    cohort = "Newb"
    sampCase = {}
    for nmarkers in markers:
        sampCase[nmarkers] = {}
        for pcrit in pcrits:
            # print case[cohort][N0].keys()
            try:
                vals, ci, r2, sr2, j, ssize = case[cohort][
                    (model, N0)][(pcrit, 50, nmarkers, marker_name)]
            except KeyError:
                vals, ci, r2, sr2, j, ssize = [], [], [], [], [], []
            sampCase[nmarkers][pcrit] = vals, ci, r2, sr2, j, ssize
    fig, ax = plt.subplots()
    fig.suptitle("pcrit : %s - %d - Newb - %ss - 50 individuals" % (
        model, N0, marker_name))
    box = []
    cnt = 0
    labels = []
    tops = []
    bottoms = []
    hmeans = []
    for nmarkers in markers:
        ax.text(1 + cnt, 0, str(nmarkers) + " %ss" % marker_name,
                rotation="vertical", ha="left", va="bottom")
        critCases = sampCase[nmarkers]
        for pcrit in pcrits:
            vals, ci, r2, sr2, j, ssize = critCases[pcrit]
            if len(vals) > 0:
                hmeans.append(hmean(vals))
                bottom, top = list(zip(*ci))
                tops.append(np.percentile(
                    [x if x is not None else 100000 for x in top], 90))
                bottoms.append(np.percentile(
                    [x if x is not None else 0.1 for x in bottom], 10))
            else:
                hmeans.append(None)
                tops.append(None)
                bottoms.append(None)
            labels.append(str(pcrit) if pcrit is not None else "std")
            box.append(vals)
            cnt += 1
    sns.boxplot(box, notch=0, sym="")
    ax.plot([1 + x for x in range(len(tops))], tops, "r.")
    ax.plot([1 + x for x in range(len(bottoms))], bottoms, "r.")
    ax.plot([1 + x for x in range(len(hmeans))], hmeans, "k.")
    ax.set_ylabel(r"$\hat{N}_{e}$")
    ax.set_ylim(0, Nbs[(model, N0)] * 2)
    ax.axhline(Nbs[(model, N0)], color="k", lw=2)
    ax.set_xticks(1 + np.arange(len(labels)))
    ax.set_xticklabels(labels, rotation="vertical")
def summary_stats(movie_ratings):
    r_ids = [r.ID for r in movie_ratings]
    count = len(movie_ratings)
    if count > 0:
        amean = np.mean([r.rating for r in movie_ratings])
        hmean = spstats.hmean([r.rating for r in movie_ratings])
        var = np.var([r.rating for r in movie_ratings])
    else:
        amean = np.NaN
        hmean = np.NaN
        var = np.NaN
    return r_ids, count, amean, hmean, var
def dependency2(i, N_j, node2Index, index2Node, table):
    values = []
    for neighbor in N_j:
        f = 0.0
        if i < node2Index[neighbor]:
            f = table[i][node2Index[neighbor]]
        else:
            f = table[node2Index[neighbor]][i]
        if f == 0.0:
            return 0.0
        else:
            values.append(f)
    return stats.hmean(values)
def plot_pingpong_lineplot(xpos, dir, lang, lab, ls, marker, tp='lat'):
    stddev = {}
    statistic = {}
    for fname in os.listdir(dir):
        basename, ext = os.path.splitext(fname)
        if ext != ".dat":
            continue
        junk1, junk2, language, msglen = basename.split('_')
        if lang != language:
            continue
        data = np.loadtxt(f"{dir}/{fname}")
        # outlier removal (Tukey, 1.5 param)
        d_25 = np.quantile(data, 0.25)
        iqr = stats.iqr(data)
        d_75 = np.quantile(data, 0.75)
        lower = d_25 - 1.5 * iqr
        upper = d_75 + 1.5 * iqr
        cdata = np.array([x for x in data if x > lower and x < upper])
        ml = int(msglen)
        if tp == 'lat':
            statistic[ml] = np.mean(cdata) / 1e6  # convert to ms
            stddev[ml] = np.std(cdata) / 1e6  # stddev is in same units as data
        elif tp == 'tput':
            statistic[ml] = stats.hmean(2 * ml / cdata) * 1e9  # B/s
            stddev[ml] = np.std(2 * ml / cdata) * 1e9
        else:
            print(f"Unknown measure: {tp}")
            exit()
    ys = [float(statistic[k]) for k in sorted(statistic.keys())]
    stds = [float(stddev[k]) for k in sorted(stddev.keys())]
    plt.errorbar(xpos, ys, yerr=stds, xerr=None, label=lab,
                 linestyle=ls, marker=marker)
    plt.legend(loc='best')
def plot_model(fig, model):
    bname = get_bname(model)
    vals = []
    ldnes = {}
    errs = {}
    labels = []
    cnb = case['Newb']
    nbks = sorted(list(Nbs.keys()), key=lambda x: x[1])
    for cname, cdata in get_corrs(bname, [], []):
        ldnes[cname] = []
        errs[cname] = []
    nobs = 0
    for name, N0 in nbks:
        if name != model:
            continue
        nobs += 1
        labels.append(str(N0))
        val = Nbs[(model, N0)]
        ldne, ci = cnb[(model, N0)][None, 50, 100, 'SNP']
        for cname, cdata in get_corrs(bname, ldne, ci):
            cldne, ccis = cdata
            hmean = stats.hmean([x if x > 0 else 10000 for x in cldne])
            ldnes[cname].append(hmean)
            err = hmean / val
            errs[cname].append(err)
        vals.append(val)
    ax = fig.add_subplot(2, 1, 1)
    ax.set_title("Nb and estimators %s" % bname)
    ax.plot(vals, '+', label="Nb")
    for name, lvals in list(ldnes.items()):
        ax.plot(lvals, '-', label=name)
        print(name)
        print(vals)
        print(lvals)
    ax.set_xticklabels(labels)
    ax.legend()
    ax = fig.add_subplot(2, 1, 2)
    ax.set_title("Fraction of error %s" % bname)
    ax.plot([1.0] * nobs, '+', label="Nb")
    for name, cvals in list(errs.items()):
        ax.plot(cvals, '-', label=name)
    ax.set_xticklabels(labels)
    ax.legend()
    fig.savefig("output/correct.png")
def ranking_features(df_score, list_tracks, method='mean'):
    if len(list_tracks) == 1:
        df_score = df_score.sort_values(by=list_tracks, ascending=False)
    else:
        if method == 'gmean':
            df_combine = pd.DataFrame(
                {'score_global': stats.mstats.gmean(df_score.iloc[:, 1:], axis=1)})
        elif method == 'hmean':
            df_combine = pd.DataFrame(
                {'score_global': stats.hmean(df_score.iloc[:, 1:], axis=1)})
        else:
            df_combine = pd.DataFrame(
                {'score_global': df_score.mean(axis=1)})
        df_score = pd.concat([df_score, df_combine], axis=1)
        df_score = df_score.sort_values(by=['score_global'], ascending=False)
    return df_score
def extract_keywords_harmonic(self):
    # Maximum sum of edge weights among all edges in the graph
    max_edge_weight = max(self.node_weight.values())
    max_degree = max(dict(self.G.degree).values())
    ranks = {
        cand: hmean([
            self.G.degree(cand) / max_degree,
            self.node_weight[cand] / max_edge_weight,
            self.candidate_doc_occur[cand] / self.num_docs
        ])
        for cand in list(self.G.nodes()) if self.G.degree(cand) > 0
    }
    sortRank = self.sortKeywords(ranks)
    self.ranks_hmean = sortRank
    return (self.ranks_hmean)
def get_weight_attr(cov_u, cov_v, reads_weight, db_weight):
    cov_diff = 1.0 / (abs(cov_u - cov_v) + sys.float_info.epsilon)
    weight_attr = {
        'cov_diff': cov_diff,
        'reads_and_db': reads_weight + db_weight,
        'geometric_mean': gmean([cov_diff, reads_weight, db_weight]),
        'harmonic_mean': hmean([
            cov_diff,
            reads_weight + sys.float_info.epsilon,
            db_weight + sys.float_info.epsilon
        ])
    }
    return weight_attr
def num_hm(seed, fname_contains_seed=False):
    lo = sg.self_affine_psd_based_ext(L, hrms, 0.8, N, seed=seed)
    up = sg.self_affine_psd_based_ext(L, hrms, 0.8, N, seed=seed,
                                      lambda_L_over_lambda_mm=lambda_L_over_lambda_mm)

    # compute numerical results and save to binary
    O = [0., 1000., 3000.]
    P = [10.0E6, 20.0E6, 30.0E6, 40.0E6, 50.0E6]
    # O = [3000.]
    # P = [10.0E6]
    AHs, AH_perps, AH_paras, RCs = [], [], [], []
    for offset in O:
        sco = sm.composite_surface_in_x(up.h, lo.h, lo.a, offset)
        # sm.plot2D(sco, ticks=False)
        AH, AH_perp, AH_para, RC = [], [], [], []
        for p in P:
            contact = sc.contact_FFT(sco, p, E, nu,
                                     store_aperture_field=True, verbose=True)
            if offset == 3000. and p == 10.0E6:
                cfname = 'cont_num_3mm_10MPa'
                if fname_contains_seed:
                    cfname += '_seed' + str(seed)
                sc.save(contact, cfname)
                # sc.plot_aperture_field(contact.aperture_field, contact.dxy,
                #                        save_as='am_3mm_10MPa')
                # comp_contacts()
                # sys.exit()
            # sc.plot_clusters(contact)
            # sc.plot_contact_cluster_areas(contact)
            flow_x, flow_y = sf.hydraulic_aperture(contact.aperture_field,
                                                   contact.dxy, verbose=True)
            ah_iso = scst.hmean([flow_x.a, flow_y.a])
            AH += [su.to_meter(ah_iso)]
            AH_perp += [su.to_meter(flow_y.a)]
            AH_para += [su.to_meter(flow_x.a)]
            RC += [contact.contact_ratio()]
        AHs += [AH]
        AH_perps += [AH_perp]
        AH_paras += [AH_para]
        RCs += [RC]

    bfname = 'results'
    if fname_contains_seed:
        bfname += '_seed' + str(seed)
    f = open(bfname, 'wb')
    pc.dump(P, f)
    pc.dump(AHs, f)
    pc.dump(AH_perps, f)
    pc.dump(AH_paras, f)
    pc.dump(RCs, f)
    f.close()
def skewness():
    columns = ['column name', 'value']
    aircrafts_data = pd.read_csv(
        "F:/Academics/Sem 1/Big Data/Homewok/HM2/cs5614-hw-master/data/aircrafts_data.csv")
    airports_data = pd.read_csv(
        'F:/Academics/Sem 1/Big Data/Homewok/HM2/cs5614-hw-master/data/airports_data.csv')
    boarding_passes = pd.read_csv(
        'F:/Academics/Sem 1/Big Data/Homewok/HM2/cs5614-hw-master/data/boarding_passes.csv')
    bookings = pd.read_csv(
        'F:/Academics/Sem 1/Big Data/Homewok/HM2/cs5614-hw-master/data/bookings.csv')
    flights = pd.read_csv(
        'F:/Academics/Sem 1/Big Data/Homewok/HM2/cs5614-hw-master/data/flights.csv')
    seats = pd.read_csv(
        'F:/Academics/Sem 1/Big Data/Homewok/HM2/cs5614-hw-master/data/seats.csv')
    ticket_flights = pd.read_csv(
        'F:/Academics/Sem 1/Big Data/Homewok/HM2/cs5614-hw-master/data/ticket_flights.csv')
    tickets = pd.read_csv(
        'F:/Academics/Sem 1/Big Data/Homewok/HM2/cs5614-hw-master/data/tickets.csv')

    aircrafts_data_skew = aircrafts_data.skew().abs().sum()
    # aircrafts_data_s = aircrafts_data_skew.sum()
    boarding_passes_skew = boarding_passes.skew().abs().sum()
    bookings_skew = bookings.skew().abs().sum()
    flights_skew = flights.skew().abs().sum()
    seats_skew = seats.skew().abs().sum()
    ticket_flights_skew = ticket_flights.skew().abs().sum()
    tickets_skew = tickets.skew().abs().sum()
    airports_data_skew = airports_data.skew().abs().sum()

    data = np.array([
        aircrafts_data_skew, airports_data_skew, boarding_passes_skew,
        bookings_skew, flights_skew, seats_skew, ticket_flights_skew,
        tickets_skew
    ])
    # print(data)
    mean = data.mean()
    h_mean = hmean(data)
    print('The average skew in the dataset is: ', np.round(mean, 2))
    return mean
def swc(labels, distances):
    if len(np.unique(labels)) > 1:
        silhouette = round(
            silhouette_score(squareform(distances), labels,
                             metric='precomputed'), 3)
        unique, counts = np.unique(labels, return_counts=True)
        stds = np.std(counts)
        armonicas = hmean(counts)
        armonicas_1 = 1 / np.sum(1.0 / counts, axis=0)
    else:
        silhouette = 0.0
        stds = 'no'
        armonicas = 'no'
        armonicas_1 = 'no'
    return silhouette, stds, armonicas, armonicas_1
def inspect_confusion_matrix(cmatrix):
    cmatrix = np.array(cmatrix)
    total_count = np.sum(cmatrix)
    tp = np.diag(cmatrix)
    fp = np.sum(cmatrix, axis=0) - tp
    fn = np.sum(cmatrix, axis=1) - tp
    tn = total_count - fp - fn - tp
    averaging = {
        'microavg': lambda score: score(*[np.mean(x) for x in (tp, fp, tn, fn)]),
        'macroavg': lambda score: np.mean(score(tp, fp, tn, fn))
    }
    results = {}
    results['noavg'] = {
        'total_count': total_count,
        'accuracy': tp.sum() / total_count
    }
    base_accuracy = (np.sum(cmatrix, axis=0) * np.sum(cmatrix, axis=1)).sum() \
        / (total_count * total_count)
    results['noavg']['kappa'] = (results['noavg']['accuracy'] -
                                 base_accuracy) / (1 - base_accuracy)
    for averaging_name, averaging_f in averaging.items():
        scores = {
            'precision': averaging_f(lambda tp, fp, tn, fn: tp / (tp + fp)),
            'recall': averaging_f(lambda tp, fp, tn, fn: tp / (tp + fn)),
            'specificity': averaging_f(lambda tp, fp, tn, fn: tn / (tn + fp))
        }
        scores['sensitivity'] = scores['recall']
        try:
            scores['f1'] = hmean([scores['precision'], scores['recall']])
        except ValueError:
            scores['f1'] = None
        scores['ppv'] = scores['precision']
        scores['npv'] = averaging_f(lambda tp, fp, tn, fn: tn / (tn + fn))
        results[averaging_name] = scores
    return results
def massAvg(massList, method='weighted', weights=None):
    """
    Compute the average mass of massList according to method.

    If method=weighted but weights were not properly defined, switch
    method to harmonic.  If massList contains a zero mass, switch method
    to mean.

    :parameter method: possible values: harmonic, mean, weighted
    :parameter weights: weights of elements (only for weighted average)
    """
    if not massList:
        return massList
    if len(massList) == 1:
        return massList[0]
    if method == 'weighted' and (not weights or len(weights) != len(massList)):
        method = 'harmonic'
    flatList = [mass / GeV for mass in _flattenList(massList)]
    if method == 'harmonic' and 0. in flatList:
        method = 'mean'
    for mass in massList:
        if len(mass) != len(massList[0]) \
                or len(mass[0]) != len(massList[0][0]) \
                or len(mass[1]) != len(massList[0][1]):
            logger.error('Mass shape mismatch in mass list:\n' + str(mass) +
                         ' and ' + str(massList[0]))
            import sys
            sys.exit()
    avgmass = copy.deepcopy(massList[0])
    for ib, branch in enumerate(massList[0]):
        for ival in enumerate(branch):
            vals = [float(mass[ib][ival[0]] / GeV) for mass in massList]
            if method == 'mean':
                avg = np.mean(vals)
            elif method == 'harmonic':
                avg = stats.hmean(vals)
            elif method == 'weighted':
                weights = [float(weight) for weight in weights]
                avg = np.average(vals, weights=weights)
            avgmass[ib][ival[0]] = float(avg) * GeV
    return avgmass
def quantize_fast(toas, toaerrs, flags=None, dt=0.1):
    r"""
    Function to quantize and average TOAs by observation epoch.  Used
    especially for NANOGrav multiband data.  Pulled from `[3]`_.

    .. _[3]: https://github.com/vallis/libstempo/blob/master/libstempo/toasim.py

    Parameters
    ----------
    toas : array
        TOAs for a pulsar.
    toaerrs : array
        TOA errors for a pulsar.
    flags : array, optional
        Flags for TOAs.
    dt : float
        Coarse graining time [days].
    """
    isort = np.argsort(toas)

    bucket_ref = [toas[isort[0]]]
    bucket_ind = [[isort[0]]]
    dt *= (24 * 3600)  # convert days to seconds
    for i in isort[1:]:
        if toas[i] - bucket_ref[-1] < dt:
            bucket_ind[-1].append(i)
        else:
            bucket_ref.append(toas[i])
            bucket_ind.append([i])

    avetoas = np.array([np.mean(toas[l]) for l in bucket_ind], 'd')
    avetoaerrs = np.array([sps.hmean(toaerrs[l]) for l in bucket_ind], 'd')
    if flags is not None:
        aveflags = np.array([flags[l[0]] for l in bucket_ind])

    U = np.zeros((len(toas), len(bucket_ind)), 'd')
    for i, l in enumerate(bucket_ind):
        U[l, i] = 1

    if flags is not None:
        return avetoas, avetoaerrs, aveflags, U, bucket_ind
    else:
        return avetoas, avetoaerrs, U, bucket_ind
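# Hedged usage sketch for quantize_fast() with synthetic TOAs in seconds:
# the two clusters below are about a day apart, so the default dt = 0.1
# days splits them into two epochs.
import numpy as np

toas = np.array([0.0, 10.0, 20.0, 90000.0, 90010.0])
toaerrs = np.array([1e-6, 2e-6, 1e-6, 1e-6, 3e-6])
avetoas, avetoaerrs, U, bucket_ind = quantize_fast(toas, toaerrs)
print(avetoas)  # [   10. 90005.]  one averaged TOA per epoch
print(U.shape)  # (5, 2)           maps each TOA to its epoch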
def calallstats(self, nparr1, nparr2=None):
    self.amean = np.mean(nparr1)
    self.hmean = stats.hmean(nparr1)
    self.gmean = stats.gmean(nparr1)
    self.median = np.median(nparr1)
    self.mode = stats.mode(nparr1)
    self.min = np.min(nparr1)
    self.max = np.max(nparr1)
    self.q1 = np.percentile(nparr1, 25)
    self.q2 = np.percentile(nparr1, 50)
    self.q3 = np.percentile(nparr1, 75)
    self.var = np.var(nparr1)
    self.std = np.std(nparr1)
    self.cov = np.cov(nparr1, nparr2)
    self.corr = np.correlate(nparr1, nparr2)
def recommend_for_user(user, **kwargs):
    global uu_enh, userids
    docs, scores = [], []
    item_sim_min_thr = kwargs['item_sim_min_thr'] \
        if kwargs.has_key('item_sim_min_thr') else 0.2
    item_maxn = kwargs['max_items'] if kwargs.has_key('max_items') else 3
    # first, locate top N similar users...
    res = similar_users(user, **kwargs)
    if res.size > 0:
        user_row = __get_user_row(user)
        ri = np.where(user_row.data == 1.0)
        user_read = user_row.indices[ri]
        sim_hists = [__get_user_row(e).todense().tolist()[0]
                     for e in res[:, 0]]
        sim_hists = sps.csr_matrix(sim_hists)
        print "For ", user
        print user_row
        print "similar users: "
        print res
        print sim_hists
        for i, d in enumerate(docids):
            if sim_hists.getcol(i).size > 0 and i not in user_read:
                y_hat = hmean(sim_hists.getcol(i).data)
                y = user_row[0, i]
                s = 1.0 - np.abs(y_hat - y)
                # print d, ": hmean=", y_hat, ", score=", s
                if s >= item_sim_min_thr:
                    docs.append(d)
                    scores.append(s)
    docs = np.array(docs)
    scores = np.array(scores)
    ids = np.argsort(-scores)[0:item_maxn + 1]
    docs = docs[ids]
    scores = scores[ids]
    ret = np.array(zip(docs, scores))
    return ret
def get_fft_stats(z):
    avg = np.average(z)
    std = np.std(z)
    median = np.median(z)
    var = np.var(z)
    kurt = stats.kurtosis(z)
    hmean = stats.hmean(z)
    gmean = stats.gmean(z)
    skew = stats.skew(z)
    median_dev_abs = np.sum(np.abs(z - median))
    std_dev_abs = np.sum(np.abs(z - std))
    stats_array = [avg, std, median, var, kurt, hmean, gmean, skew,
                   median_dev_abs, std_dev_abs]
    return stats_array
def do_hz_comp(pref, mydir, model, N0):
    snps = [100]  # , 200, 400]
    cohort = "Newb"
    cutCase = {}
    for cut in cuts:
        cutCase[cut] = {}
        case = load_file(pref, cut * 100, mydir)
        for nsnp in snps:
            vals, ci, r2, sr2, j, ssize = case[cohort][
                (model, N0)][(None, 50, nsnp, "SNP")]
            cutCase[cut][nsnp] = vals, ci, r2, sr2, j, ssize
    fig, ax = plt.subplots(figsize=(16, 9))
    fig.suptitle("Hz comparison: %s - %d - Newb - SNPs - 50 indivs " % (model, N0))
    box = []
    cnt = 0
    labels = []
    tops = []
    bottoms = []
    hmeans = []
    for cut in cuts:
        # ax.text(cnt, 0, str(cut), rotation="vertical")
        cases = cutCase[cut]
        for nsnp in snps:
            vals, ci, r2, sr2, j, ssize = cases[nsnp]
            hmeans.append(hmean(vals))
            bottom, top = list(zip(*ci))
            top = [100000 if x is None else x for x in top]
            bottom = [100000 if x is None else x for x in bottom]
            tops.append(np.percentile(top, 90))
            bottoms.append(np.percentile(bottom, 10))
            # labels.append(str(nsnp))
            labels.append(str(cut))
            box.append(vals)
            cnt += 1
    sns.boxplot(box, notch=0, sym="")
    ax.set_ylabel(r"$\hat{N}_{e}$", fontsize=24)
    ax.set_xlabel('Hz', fontsize=24)
    ax.plot([1 + x for x in range(len(tops))], tops, "rx")
    ax.plot([1 + x for x in range(len(bottoms))], bottoms, "rx")
    ax.plot([1 + x for x in range(len(hmeans))], hmeans, "k+")
    ax.axhline(Nbs[(model, N0)], color="k", lw=0.3)
    ax.set_ylim(0, Nbs[(model, N0)] * 3)
    ax.set_xticks(1 + np.arange(len(labels)))
    ax.set_xticklabels(labels)
def f1_micro_average(y_test, y_pred):
    '''
    Calculates the so-called micro-averaged F1.
    Input: y_test, y_pred - arrays with test and prediction values
    Output: micro-averaged F1
    '''
    # Per-label confusion-matrix entries.  The original mislabelled these
    # lists as TN/FP/FN; cm[1, 1] is the true-positive cell, cm[1, 0] the
    # false negatives and cm[0, 1] the false positives.  The computed
    # values are unchanged.
    TP = []
    FP = []
    FN = []
    for i in range(y_pred.shape[1]):
        cm = confusion_matrix(np.array(y_test)[:, i], y_pred[:, i])
        TP.append(cm[1, 1])
        FN.append(cm[1, 0])
        FP.append(cm[0, 1])
    precision = np.sum(TP) / (np.sum(TP) + np.sum(FP))
    recall = np.sum(TP) / (np.sum(TP) + np.sum(FN))
    return hmean([precision, recall])
def combine_predictions(all_interp):
    y_true = to_np(all_interp[0][1].y_true)
    all_preds = np.stack([to_np(interp.preds) for _, interp in all_interp])
    preds = np.mean(all_preds, axis=0)
    acc_m = compute_acc(preds, y_true)
    preds = np.median(all_preds, axis=0)
    acc_med = compute_acc(preds, y_true)
    preds = gmean(all_preds, axis=0)
    acc_g = compute_acc(preds, y_true)
    preds = hmean(all_preds, axis=0)
    acc_h = compute_acc(preds, y_true)
    print(f'accuracy -- mean: {acc_m:0.3f} median: {acc_med:0.3f} '
          f'gmean: {acc_g:0.3f} hmean: {acc_h:0.3f}')
    return acc_m, acc_med, acc_g, acc_h
def calculate_means(df):
    """
    Return a DataFrame of column means: NumPy's ``average`` for the
    weighted mean, and SciPy's ``gmean`` and ``hmean`` for the geometric
    and harmonic means.

    :param df:
    :return:
    """
    df_means = pd.DataFrame(
        index=['Arithmetic', 'Weighted', 'Geometric', 'Harmonic'])
    cols = ['SATMTMID', 'SATVRMID']
    for col in cols:
        arithmetic = df[col].mean()
        weighted = np.average(df[col], weights=df['UGDS'])
        geometric = gmean(df[col])
        harmonic = hmean(df[col])
        df_means[col] = [arithmetic, weighted, geometric, harmonic]
    df_means['count'] = len(df)
    return df_means.astype(int)
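# Hedged usage sketch for calculate_means() with a tiny synthetic frame;
# the column names (SATMTMID, SATVRMID, UGDS) suggest College Scorecard
# data, but any frame with those columns works.
import numpy as np
import pandas as pd
from scipy.stats import gmean, hmean

df = pd.DataFrame({'SATMTMID': [500, 550, 600],
                   'SATVRMID': [480, 520, 640],
                   'UGDS': [1000, 2000, 3000]})
print(calculate_means(df))  # four means per column, plus a row count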
def perf_metrics_2X2(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    TN = cm[0][0]
    FN = cm[1][0]
    TP = cm[1][1]
    FP = cm[0][1]
    precision = TP / (TP + FP)  # pos_pred_value
    recall = TP / (TP + FN)  # TP_rate/sensitivity
    false_discovery_rate = FP / (TP + FP)
    true_negative_rate = TN / (FP + TN)
    f1_score = hmean((precision, recall))
    # return {'Precision': precision, 'Recall': recall,
    #         'True Negative Rate': true_negative_rate,
    #         'False Discovery Rate': false_discovery_rate,
    #         'F1 Score': f1_score}
    return 'True Positive Rate: {:.2f}\nTrue Negative Rate: {:.2f}\n' \
           'Precision: {:.2f}\nFalse Discovery Rate: {:.2f}\n' \
           'F1 Score: {:.2f}\n'.format(recall, true_negative_rate, precision,
                                       false_discovery_rate, f1_score)
def getNumberSummary(series):
    # Handle the case where input is empty, then just return all 0s.
    if len(series) == 0:
        series = pd.Series([0, 0])
    return pd.DataFrame({
        "min": [np.min(series)],
        "lower_quartile": [np.percentile(series, 25)],
        "median": [np.median(series)],
        "upper_quartile": [np.percentile(series, 75)],
        "max": [np.max(series)],
        "mean": [np.mean(series)],
        "std": [np.std(series, ddof=1 if len(series) > 1 else 0)],
        "skewness": [st.skew(series)],
        "kurtosis": [st.kurtosis(series, fisher=False)],
        "gmean": [st.gmean(series + 1e-6)],
        "hmean": [st.hmean(series + 1e-6)],
        "sum": [np.sum(series)]
    })
def plot_age_imputed_dist(data):
    fig, ax = plt.subplots(3, 2, figsize=(15, 15))
    plot_age_and_sex_distribution(data, "Original", ax[0][0])
    plot_age_and_sex_distribution(data.fillna(data["Age"].mean()),
                                  "Impute With Mean", ax[0][1])
    plot_age_and_sex_distribution(data.fillna(stats.gmean(data["Age"].dropna())),
                                  "Impute With GMean", ax[1][0])
    plot_age_and_sex_distribution(data.fillna(stats.hmean(data["Age"].dropna())),
                                  "Impute With HMean", ax[1][1])
    plot_age_and_sex_distribution(data.fillna(data["Age"].mode()[0]),
                                  "Impute With Mode", ax[2][0])
    plot_age_and_sex_distribution(data.fillna(data["Age"].median()),
                                  "Impute With Median", ax[2][1])
    sns.despine(trim=True)
    plt.tight_layout()
    plt.show()
def evaluate():
    model_path, data_dir = _input()
    model_home = model_path.split('.')[0] + "_out"
    pos_val = '{}_val_pos.txt'.format(data_dir)
    neg_val = '{}_val_bg.txt'.format(data_dir)
    loc_score = np.array([])
    rec_score = np.array([])

    # Create dirs to store results
    none_path = path.join(model_home, 'none')
    good_path = path.join(model_home, 'good')
    mid_path = path.join(model_home, 'mid')
    bad_path = path.join(model_home, 'bad')
    os.popen("rm -rf {0}; mkdir {0}".format(model_home))
    os.popen("mkdir {0}".format(none_path))
    os.popen("mkdir {0}".format(good_path))
    os.popen("mkdir {0}".format(mid_path))
    os.popen("mkdir {0}".format(bad_path))

    loc_score_p, rec_score_p, goods, mids, bads, nones = evaluate_positives(
        data_dir, pos_val, model_path)
    _copy_data(goods, good_path)
    _copy_data(mids, mid_path)
    _copy_data(bads, bad_path)
    _copy_data(nones, none_path)
    loc_score_n, rec_score_n = evaluate_negatives(data_dir, neg_val, model_path)

    # np.append takes (arr, values); the original passed a single tuple
    loc_score = np.append(loc_score_p, loc_score_n)
    rec_score = np.append(rec_score_p, rec_score_n)

    print "Total positives evaluated {}".format(len(loc_score_p))
    print "loc_score {}, rec_score {}".format(np.mean(loc_score_p),
                                              np.mean(rec_score_p))
    print "Negatives evaluated {}".format(len(loc_score_n))
    print "loc_score {}, rec_score {}".format(np.mean(loc_score_n),
                                              np.mean(rec_score_n))
    print "Total evaluated {}".format(len(loc_score))
    print "loc_score {}, rec_score {}".format(np.mean(loc_score),
                                              np.mean(rec_score))
    print "Score: {}".format(
        stats.hmean([np.mean(loc_score), np.mean(rec_score)]))
def calculateVideoVMAF(self, modelPath):
    print("Calculating quality ...")

    # VMAF CSV file name
    vmafOut = os.path.splitext(self.outputFile)[0] + "-vmaf.csv"

    # Assemble select, scale and vmaf filter strings
    scaleStringMain = "[0:v]scale=1920x1080:flags=bicubic,settb=AVTB,setpts=PTS-STARTPTS[main]; "
    scaleStringRef = "[1:v]scale=1920x1080:flags=bicubic,settb=AVTB,setpts=PTS-STARTPTS[ref]; "
    vmafFilterString = "[main][ref]libvmaf=model_path=" + modelPath
    vmafFilterString += ":log_path=" + vmafOut
    vmafFilterString += ":log_fmt=csv"

    # Assemble final filter string
    filterString = scaleStringMain + scaleStringRef + vmafFilterString

    # Assemble FFmpeg command
    vmafCommand = [
        "ffmpeg", "-hide_banner", "-v", "error", "-stats",
        "-r", "24", "-i", self.outputFile,
        "-r", "24", "-i", self.videoFile,
        "-filter_complex", filterString,
        "-f", "null", "-"
    ]

    # Run FFmpeg command
    a = run(vmafCommand)

    # Read in VMAF CSV file
    vmaf_df = pd.read_csv(vmafOut, usecols=['Frame', 'vmaf'])

    # Averages & percentiles
    vmafMean = vmaf_df['vmaf'].mean()
    vmafHMean = stats.hmean(vmaf_df['vmaf'], axis=0)
    vmafMax = vmaf_df['vmaf'].max()
    vmafP75 = np.percentile(vmaf_df['vmaf'], q=75)
    vmafP25 = np.percentile(vmaf_df['vmaf'], q=25)
    vmafMin = vmaf_df['vmaf'].min()

    # Print results
    print("VMAF mean = " + str(vmafMean))
    print("VMAF harmonic mean = " + str(vmafHMean))
    print("VMAF maximum = " + str(vmafMax))
    print("VMAF 75th percentile = " + str(vmafP75))
    print("VMAF 25th percentile = " + str(vmafP25))
    print("VMAF minimum = " + str(vmafMin))
def transliterate_one_word(word, language1, language2):
    # look up the pronunciation of the word
    response = json.loads(get_all_words(language1, word))['result']
    if response == []:
        return False
    word = response[0]
    syllables_ipa1 = word['syllables_ipa']
    syllables2 = []
    syllables_ipa2 = []
    translitteration_score = []
    for syll in syllables_ipa1:
        try:
            # look for the matching syllable in the second language
            response = json.loads(get_all_syllables(language1, syll))['result']
            # response = requests.get(f'{API_URL}/{language1}_syllables/{syll}').json()['result']
            syll1 = response[language2]
            score = response[f'{language2}_score']
            if score < 0.8:
                # handle the syllable differently if it has no good
                # equivalent; the 0.8 cutoff is arbitrary
                raise KeyError
            syllables_ipa2 += [syll1]
            translitteration_score += [score]
            # syll2 = requests.get(f'{API_URL}/{language2}_syllables/{syll1}').json()['result']['orthographical_syllable']
            syll2 = json.loads(get_all_syllables(
                language2, syll1))['result']['orthographical_syllable']
            syllables2 += [syll2]
        except (KeyError, TypeError):
            syll1 = ""
            syll2 = ""
            for phonem in syll:
                try:
                    item = json.loads(get_phonem(language1, phonem))['result']
                    equivalent = item[language2]
                    writing = json.loads(get_phonem(
                        language2, equivalent))['result']['written']
                    syll1 += equivalent
                    syll2 += writing
                except (KeyError, TypeError):
                    continue
            syllables_ipa2 += [syll1]
            syllables2 += [syll2]
            translitteration_score += [0.001]
    harmonic_mean = int(100 * round(stats.hmean(translitteration_score), 2))
    return word["syllables"], syllables2, syllables_ipa1, syllables_ipa2, \
        harmonic_mean
def recommendation_mf(userArray, numUsers, movieIds):
    ratings_dict = {
        'itemID': list(ratings.movie_id_ml) + list(numUsers * movieIds),
        'userID': list(ratings.user_id) + [
            max(ratings.user_id) + 1 + x
            for x in range(numUsers) for y in range(15)
        ],
        'rating': list(ratings.rating) +
                  [item for sublist in userArray for item in sublist]
    }
    df = pd.DataFrame(ratings_dict)
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']], reader)
    trainset = data.build_full_trainset()
    nmf = NMF()
    nmf.fit(trainset)
    userIds = [
        trainset.to_inner_uid(max(ratings.user_id) + 1 + x)
        for x in range(numUsers)
    ]
    mat = np.dot(nmf.pu, nmf.qi.T)
    scores = hmean(mat[userIds, :], axis=0)
    best_movies = scores.argsort()
    best_movies = best_movies[-9:][::-1]
    scores = scores[best_movies]
    movie_ind = [trainset.to_raw_iid(x) for x in best_movies]
    recommendation = list(
        zip(
            list(df_ML_movies[df_ML_movies.movie_id_ml.isin(movie_ind)].title),
            list(df_ML_movies[df_ML_movies.movie_id_ml.isin(movie_ind)].poster_url),
            list(scores)))
    print(recommendation)
    print(len(recommendation[0]))
    return recommendation
def print_iris_statistics(data):
    # --------------------------------------------------- Create data frame <
    # Column and row labels are Polish: the columns are sepal length/width
    # and petal length/width; the rows are minimum, maximum, range,
    # quartiles, median, harmonic/geometric/arithmetic means, power means
    # of order 2 and 3, variance, standard deviation and kurtosis.
    df = pandas.DataFrame(
        columns=["Dł. d. k.", "Sz. d. k.", "Dł. pł.", "Sz. pł."])

    # ------------------------- Calculate different statistical information <
    df.loc["Minimum [cm]", :] = [i for i in data.iloc[:, 0:4].min()]
    df.loc["Maksimum [cm]", :] = [i for i in data.iloc[:, 0:4].max()]
    df.loc["Rozstęp [cm]", :] = [i for i in df.loc["Maksimum [cm]"]
                                 - df.loc["Minimum [cm]"]]
    df.loc["Pierwszy kwartyl [cm]", :] = [
        quantile(data.iloc[:, i], 0.25) for i in range(4)]
    df.loc["Mediana [cm]", :] = [
        median(data.iloc[:, i]) for i in range(4)]
    df.loc["Trzeci kwartyl [cm]", :] = [
        quantile(data.iloc[:, i], 0.75) for i in range(4)]
    df.loc["Średnia harmoniczna [cm]", :] = stats.hmean(data.iloc[:, 0:4])
    df.loc["Średnia geometryczna [cm]", :] = stats.gmean(data.iloc[:, 0:4])
    df.loc["Średnia arytmetyczna [cm]", :] = [i for i in data.mean()]

    # Operator ** means power() method.
    # The shape attribute for numpy arrays returns the dimensions of the
    # array: if Y has n rows and m columns, then Y.shape is (n, m), so
    # Y.shape[0] is n.
    df.loc["Średnia potęgowa 2 rzędu [cm]", :] = [i for i in (
        ((data.iloc[:, 0:4] ** 2).sum() / data.shape[0]) ** (1 / 2))]
    df.loc["Średnia potęgowa 3 rzędu [cm]", :] = [i for i in (
        ((data.iloc[:, 0:4] ** 3).sum() / data.shape[0]) ** (1 / 3))]
    df.loc["Wariancja [cm^2]", :] = [i for i in data.var()]
    df.loc["Odchylenie standardowe [cm]", :] = [i for i in data.std()]

    # If True, Fisher's definition is used (normal ==> 0.0)
    # If False, Pearson's definition is used (normal ==> 3.0)
    df.loc["Kurtoza", :] = stats.kurtosis(data.iloc[:, 0:4], fisher=False)

    pandas.set_option('display.max_rows', 1000)
    pandas.set_option('display.max_columns', 1000)
    pandas.set_option('display.width', 1000)
    print(df.astype(float).round(1))
def compute_semeval_score(pearson_score, spearman_score):
    """
    Return NaN if a dataset can't be evaluated on a given frame.  Return 0
    if at least one similarity measure was 0 or negative.  Otherwise, take
    a harmonic mean of a Pearson correlation coefficient and a Spearman
    correlation coefficient.
    """
    intervals = ['acc', 'low', 'high']
    scores = []
    for interval in intervals:
        if any(np.isnan(x)
               for x in [spearman_score[interval], pearson_score[interval]]):
            scores.append(float('NaN'))
        elif any(x <= 0
                 for x in [spearman_score[interval], pearson_score[interval]]):
            scores.append(0)
        else:
            scores.append(hmean([spearman_score[interval],
                                 pearson_score[interval]]))
    return pd.Series(scores, index=intervals)
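# Worked example for compute_semeval_score(): the inputs behave like
# mappings with 'acc', 'low' and 'high' keys (pandas Series here, an
# assumption consistent with the indexing above).
import pandas as pd

pearson = pd.Series({'acc': 0.6, 'low': 0.5, 'high': 0.7})
spearman = pd.Series({'acc': 0.5, 'low': 0.4, 'high': 0.6})
print(compute_semeval_score(pearson, spearman))
# acc ~ 0.545, low ~ 0.444, high ~ 0.646 (harmonic means of the two scores)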
def calc_scores(final_probes, all_strings, l_c_ratio=1.):
    coverage_score, success_match_list = zip(*final_probes)
    length_score = []  # harmonic mean of the lengths
    final_score = []
    for cr, sm in final_probes:
        l = hmean(list(map(len, sm)))
        length_score.append(l)
        fs = final_score_function(cr, 0, l, len(sm), l_c_ratio)
        final_score.append(fs)
    ret = {
        'final': final_score,
        'coverage': coverage_score,
        'overlap': [0] * len(coverage_score),
        'length': length_score,
        'match_list': success_match_list
    }
    return ret
def evaluations2df(data_two_stage):
    if not isinstance(data_two_stage, np.ndarray):
        data_two_stage = np.array(data_two_stage)
    data = {
        'thr': data_two_stage[:, 0],
        'thr_exh': data_two_stage[:, 1],
        'TP': data_two_stage[:, 4].astype(np.int32),
        'FP': data_two_stage[:, 5].astype(np.int32),
        'TN': data_two_stage[:, 6].astype(np.int32),
        'FN': data_two_stage[:, 7].astype(np.int32),
        'precision': data_two_stage[:, 8],
        'recall': data_two_stage[:, 9],
        'F1': stats.hmean(np.ma.masked_equal(data_two_stage[:, 8:10], 0),
                          axis=1),
        'acc': data_two_stage[:, 10],
    }
    frame_two_stage = pd.DataFrame(data)
    return frame_two_stage
def evaluate(stats: EvaluationStatistics) -> EvaluationResult:
    EPS = 1e-6
    precision = stats.tp / (stats.tp + stats.fp)
    recall = stats.tp / (stats.tp + stats.fn)
    precision[np.where(np.isnan(precision))] = 1
    recall[np.where(np.isnan(recall))] = 1
    accuracy = (stats.tp.sum() + stats.tn.sum()) / (
        stats.tp.sum() + stats.tn.sum() + stats.fp.sum() + stats.fn.sum())
    f1 = hmean(np.concatenate([precision, recall]) + EPS)
    predicted_dist = stats.predicted_count / stats.predicted_count.sum()
    actual_dist = stats.actual_count / stats.actual_count.sum()
    expected_accuracy = np.sum(predicted_dist * actual_dist)
    kappa = (accuracy - expected_accuracy) / (1 - expected_accuracy)
    return EvaluationResult(precision, recall, accuracy, f1, kappa)
def main():
    # documents = load()
    sc = SentComparator()
    path = "../ClaimComparator/testLogicParseCorpus/"
    files = glob(os.path.join(path, "*logic.csv"))
    # w2v = sc.loadW2V()
    for filename in files:
        with open(filename, "r", encoding='utf-8') as logic, \
                open('results.txt', 'a', encoding='utf-8') as res:
            print('Processing:', filename)
            if 'claim 1' in filename:
                target = 'Bolsanaro won the Brazilian election'
            elif 'claim 2' in filename:
                target = 'Climate change is predominantly caused by human activity'
            else:
                target = 'Michael Jordan is the greatest basketball player of all time'
            lines = logic.readlines()
            match = sc.oneToManyCompare(lines, target)
            res.write(filename + ': ' + str(match) + '\n')
    # print(sc.compare('The sandwich ate the rat', 'The rat ate the sandwich'))
    # print(documents['./testCorpus\claim 3-7.txt'][4])
    # posCCM = ClaimComparatorModel(documents['./testCorpus\claim 3-3.txt'][-1])
    # print(posCCM.go())
    # ['[', 'russell', 'may', 'have', 'won', 'more', 'championships', ']',
    #  '[', ',', 'Chamberlain', 'may', 'have', 'averaged', '50', 'points',
    #  'per', 'game', 'in', 'a', 'single', 'season', ']', '[', 'and',
    #  'Kareem', 'may', 'be', 'the', 'all-time', 'scoring', 'leader', ']',
    #  ',', '[', 'but', 'if', '[', 'you', 'take', 'into', 'consideration',
    #  'the', 'entire', 'package', ']', ',', 'Michael', 'Jordan', 'is',
    #  'the', 'greatest', 'of', 'all', 'time', '.', ']']
    # negCCM = ClaimComparatorModel(documents['./testCorpus\claim 3-7.txt'][4])
    # print(negCCM.go())
    # ['[', 'tonight', 'is', 'the', 'commencement', 'of', 'the', 'fourth',
    #  'installment', 'of', 'the', 'Warriors-Cavs', 'Finals', ']', ',', '[',
    #  'which', 'resurrects', 'the', 'biggest', 'debate', 'in', 'basketball',
    #  ':', 'Is', 'LeBron', 'James', 'or', 'the', 'Bulls', "'", 'Michael',
    #  'Jordan', 'the', 'Greatest', 'Player', 'of', 'All-Time', '?', ']']
    TP, FP, FN = LogicComparator.run_on_sts()
    precision = (1.0 * TP) / (TP + FP)
    recall = (1.0 * TP) / (TP + FN)
    print("Precision: {}\nRecall: {}\nF1 Score: {}".format(
        precision, recall, hmean([precision, recall])))
def cv_validation(self):
    from scipy import stats

    # self.network.load_state_dict(self.best_weights.state_dict())
    folds_accuracy = []
    self.network.eval()
    with torch.no_grad():
        for loader in self.folds_loaders:
            test_loss = 0.
            full_preds = []
            full_labels = []
            full_logits = []
            for data, target in loader:
                data, target = data.to(self.device), target.to(self.device)
                # bs, ncrops, c, h, w = data.size()
                labels = target.data
                if self.one_hot:
                    target = self._convert_int_onehot(target)
                logits = self.network(data)
                if self.prob_est:
                    logits = F.softmax(logits, dim=1)
                full_logits.append(logits)
                test_loss += self.loss_func(logits, target).item()
                # get the index of the max log-probability
                pred = logits.data.max(1, keepdim=True)[1]
                full_preds.extend(pred.view(-1).cpu())
                full_labels.extend(labels.cpu())
            test_accuracy = 100 * metrics.accuracy_score(
                np.array(full_labels), np.array(full_preds))
            folds_accuracy.append(test_accuracy)
    hmean = stats.hmean(folds_accuracy)
    self.logger.info('# Task {} # Harmonic Accuracy: {:.2f}%'.format(
        self.task_id, hmean))
    result = {'id': self.task_id, 'acc': hmean}
    all_hmean = self.exploit_comm.allgather(result)
    return all_hmean
def calculate_avg_condnum(input_set, qoi_set=None):
    r"""
    Given gradient vectors at some points (centers) in the input space and
    given a specific set of QoIs, calculate the average condition number of
    the matrices formed by the gradient vectors of each QoI map at each
    center.

    :param input_set: The input sample set.  Make sure the attribute
        _jacobians is not None.
    :type input_set: :class:`~bet.sample.sample_set`
    :param list qoi_set: List of QoI indices
    :rtype: tuple
    :returns: (condnum, singvals) where condnum is a float and singvals
        has shape (num_centers, output_dim)
    """
    if input_set._jacobians is None:
        raise ValueError("You must have jacobians to use this method.")
    if qoi_set is None:
        G = input_set._jacobians
    else:
        G = input_set._jacobians[:, qoi_set, :]
    if G.shape[1] > G.shape[2]:
        msg = "Condition number is not defined for more outputs than inputs."
        msg += " Try adding a qoi_set to evaluate the condition number of."
        raise ValueError(msg)

    # Calculate the singular values of the matrix formed by the gradient
    # vectors of each QoI map.  This gives a set of singular values for
    # each center.
    singvals = np.linalg.svd(G, compute_uv=False)
    indz = singvals[:, -1] == 0
    if np.sum(indz) == singvals.shape[0]:
        hmean_condnum = np.inf
    else:
        singvals[indz, 0] = np.inf
        singvals[indz, -1] = 1
        condnums = singvals[:, 0] / singvals[:, -1]
        hmean_condnum = stats.hmean(condnums)
    return hmean_condnum, singvals
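# Minimal numeric sketch of the quantity computed above: the harmonic mean
# of per-center condition numbers of a (num_centers, num_qois, input_dim)
# Jacobian stack.  Shapes here are illustrative only.
import numpy as np
from scipy import stats

rng = np.random.default_rng(0)
G = rng.standard_normal((10, 3, 3))     # 10 centers, 3 QoIs, 3 inputs
s = np.linalg.svd(G, compute_uv=False)  # singular values per center
condnums = s[:, 0] / s[:, -1]           # largest over smallest singular value
print(stats.hmean(condnums))            # average (harmonic) condition number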
def computeF1_macro(confusion_matrix, matching, num_clusters):
    """
    Computes the macro F1 score.

    confusion_matrix: requires a permutation `matching` according to which
    the matrix columns must be permuted.
    """
    # Permute the matrix columns
    permuted_confusion_matrix = np.zeros([num_clusters, num_clusters])
    for cluster in xrange(num_clusters):
        matched_cluster = matching[cluster]
        permuted_confusion_matrix[:, cluster] = \
            confusion_matrix[:, matched_cluster]
    # Compute the F1 score for every cluster
    F1_score = 0
    for cluster in xrange(num_clusters):
        TP = permuted_confusion_matrix[cluster, cluster]
        FP = np.sum(permuted_confusion_matrix[:, cluster]) - TP
        FN = np.sum(permuted_confusion_matrix[cluster, :]) - TP
        precision = TP / (TP + FP)
        recall = TP / (TP + FN)
        f1 = stats.hmean([precision, recall])
        F1_score += f1
    F1_score /= num_clusters
    return F1_score
def token_similarity(string1, string2):
    set1 = set(normalize(string1).split())
    set2 = set(normalize(string2).split())
    sims = []
    diffs = []
    for item1 in set1:
        for item2 in set2:
            sim = 1 - Levenshtein.distance(item1, item2) / (len(item1) + len(item2))
            if sim > _tksim_threshold:
                sims.append(item1)
                sims.append(item2)
    sims = list(set(sims))
    for item1 in set1:
        if item1 not in sims:
            diffs.append(item1)
    for item2 in set2:
        if item2 not in sims:
            diffs.append(item2)
    diffs = list(set(diffs))
    sim = len(sims) / (len(sims) + len(diffs)) if len(sims) + len(diffs) > 0 else 0
    diff = 1 - (len(diffs) / (len(sims) + len(diffs))
                if len(sims) + len(diffs) > 0 else 0)
    scores = [sim, diff]
    return stats.hmean(scores) if all([s > 0 for s in scores]) \
        else -sys.float_info.max
def harmonic_mean(value, *args, **kwargs):
    """
    Wrapper for :func:`scipy.stats.hmean`
    """
    return hmean(value, *args, **kwargs)
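# Quick check of the thin wrapper above: hmean here is scipy.stats.hmean,
# so extra arguments such as axis pass straight through via **kwargs.
print(harmonic_mean([1, 2, 4]))                 # 1.7142857... == 12/7
print(harmonic_mean([[1, 4], [2, 8]], axis=0))  # column-wise: [1.333..., 5.333...]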
from pylab import *
from scipy import stats

# generate a normal distribution sample with 100 elements
sample = np.random.randn(100)

# harmonic mean
out = stats.hmean(sample[sample > 0])
print "Harmonic mean = " + str(out)

# the mean, where values below -1 and above 1 are removed
out = stats.tmean(sample, limits=(-1, 1))
print "Trimmed mean = " + str(out)

# calculate the skewness of the sample
out = stats.skew(sample)
print "Skewness = " + str(out)

out = stats.describe(sample)
print out
# print out some information
print "Number of introns: %d" % intron_count
print "Repeat count: %d" % repeat_count
print "Repeat length: %d" % repeat_length
print "Repeats per intron: %.2f" % (float(repeat_count) / float(intron_count))
print "Number of introns containing repeats: %d" % (
    len(unique_intron_sizes) - len(non_rep_intron_sizes))
print "Raw unique intron sizes outputted to %s" % output

print "\nNon repeat introns:"
print "Count:", len(non_rep_intron_sizes)
print "Max:", max(non_rep_intron_sizes)
print "Min:", min(non_rep_intron_sizes)
print "Mean: %.2f" % numpy.mean(non_rep_intron_sizes)
print "Median: %d" % numpy.median(non_rep_intron_sizes)
print "Mode: %d" % stats.mode(non_rep_intron_sizes)[0]
print "GMean: %.2f" % stats.gmean(non_rep_intron_sizes)
print "HMean: %.2f" % stats.hmean(non_rep_intron_sizes)

print "\nRepeat introns:"
print "Count:", len(rep_intron_sizes)
print "Max:", max(rep_intron_sizes)
print "Min:", min(rep_intron_sizes)
print "Mean: %.2f" % numpy.mean(rep_intron_sizes)
print "Median: %d" % numpy.median(rep_intron_sizes)
print "Mode: %d" % stats.mode(rep_intron_sizes)[0]
print "GMean: %.2f" % stats.gmean(rep_intron_sizes)
print "HMean: %.2f" % stats.hmean(rep_intron_sizes)

intron_count = 0
repeat_count = 0
repeat_length = 0
cum_rep_len = 0
header = 0
def combine_harmonic(lst):
    if len(lst) == 1:
        return lst[0]
    return stats.hmean(np.array(lst))
def stackedTrain(X, y, projectName, scoring):
    print("\nTraining stacked...")
    model_dir = "models"
    basePath = os.path.join(model_dir, projectName)
    models = os.listdir(basePath)
    df = pd.DataFrame()
    # y = pd.DataFrame(data=y, columns=["y"])
    skipName = "ensemble"
    for model_name in models:
        model_name_base = model_name.split(".")[0]
        suffix = model_name.split(".")[1]
        if model_name_base != skipName and suffix == "sav":
            print("\n" + model_name_base)
            path = os.path.join(basePath, model_name)
            model = pickle.load(open(path, "rb"))
            y_hat = model.predict(X)
            df[model_name_base] = y_hat
            tn, fp, fn, tp = confusion_matrix(y, y_hat).ravel()
            n = tn + fp + fn + tp
            precision = tp / (tp + fp)
            recall = tp / (tp + fn)
            accuracy = (tp + tn) / n
            f1 = stats.hmean([precision, recall])
            print(accuracy)
            path = os.path.join("models", projectName, model_name_base + ".txt")
            f = open(path, "w")
            f.write("N:\t\t" + str(n))
            f.write("\n\nTrue positive:\t" + str(tp) + "\t(" + str(tp / n) + ")")
            f.write("\nTrue negative:\t" + str(tn) + "\t(" + str(tn / n) + ")")
            f.write("\nFalse positive:\t" + str(fp) + "\t(" + str(fp / n) + ")")
            f.write("\nFalse negative:\t" + str(fn) + "\t(" + str(fn / n) + ")")
            f.write("\n\nAccuracy:\t" + str(accuracy))
            f.write("\n\nPrecision:\t" + str(precision))
            f.write("\nRecall:\t\t" + str(recall))
            f.write("\nF1:\t\t" + str(f1))
            f.close()
    kSplits = 2
    param_grid = {}
    model = RandomForestClassifier()
    # transformPipeline = getTransformPipeline()
    # pipelineArray = transformPipeline[:]
    # pipelineArray.append(("clf", model))
    # pipeline = Pipeline(pipelineArray)
    grid_search = GridSearchCV(model, param_grid=param_grid, cv=kSplits,
                               verbose=2, scoring=scoring)
    grid_search.fit(df, y)
    bestParameters = grid_search.best_params_
    model.set_params(**bestParameters)
    model.fit(df, y)
    path = os.path.join("models", projectName, skipName + ".sav")
    f = open(path, "wb")
    pickle.dump(model, f)
    f.close()
    return
res = {}
ys = []
for t in cfg.futureGens:
    y = cfg.A * math.cos(2 * math.pi * (t - cfg.seasonGen) / cfg.T) + cfg.B
    res[t] = {}
    # print t, y
    for numIndivs, numLoci in cfg.sampleStrats:
        fname = myUtils.getStatName(cfg, numIndivs, numLoci)
        for rec in myUtils.getStat(open(fname)):
            if rec["type"] != "temp":
                continue
            g1l = res[rec["g2"]].setdefault(rec["g1"], [])
            g1l.append(rec[tempStat][-1])

xs = res.keys()
xs.sort()
plt = {}
for x in xs:
    for g1 in res[x].keys():
        g1l = plt.setdefault(g1, [])
        g1l.append(stats.hmean([v if v >= 0 else 100000 for v in res[x][g1]]))
print len(xs), plt.keys(), len(plt[56])
pylab.title(tempStat)
for g1, vals in plt.items():
    print g1
    pylab.plot(xs, vals, label=str(g1))
pylab.legend()
pylab.show()