Example #1
    def test_1D_array(self):
        a = array((1,2,3,4), float64)
        actual= stats.hmean(a)
        desired =  4. / (1./1 + 1./2 + 1./3 + 1./4)
        assert_almost_equal(actual, desired, decimal=14)

        desired1 = stats.hmean(a,axis=-1)
        assert_almost_equal(actual, desired1, decimal=14)
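For reference, the identity these assertions check, written as a standalone sketch (not part of the test suite): the harmonic mean of n values is n divided by the sum of their reciprocals.
import numpy as np
from scipy import stats

x = np.array([1., 2., 3., 4.])
# n / sum(1/x_i) -- the same quantity the test computes by hand
assert np.isclose(stats.hmean(x), len(x) / np.sum(1.0 / x))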
Example #2
    def test_2D_array_default(self):
        a = array(((1,2,3,4),
                   (1,2,3,4),
                   (1,2,3,4)))
        actual = stats.hmean(a)
        desired = array((1.,2.,3.,4.))
        assert_array_almost_equal(actual, desired, decimal=14)

        actual1 = stats.hmean(a,axis=0)
        assert_array_almost_equal(actual1, desired, decimal=14)
Example #3
    def test_hmean(self):
        for n in self.get_n():
            x, y, xm, ym = self.generate_xy_sample(n)

            r = stats.hmean(abs(x))
            rm = stats.mstats.hmean(abs(xm))
            assert_almost_equal(r, rm, 10)

            r = stats.hmean(abs(y))
            rm = stats.mstats.hmean(abs(ym))
            assert_almost_equal(r, rm, 10)
Example #4
def sset_sim(ssset_1, ssset_2):
    sim = []
    if len(ssset_1) == 0 or len(ssset_2) == 0:
        return 0.0
    for synset_1 in ssset_1:
        for synset_2 in ssset_2:
            s1, s2, s3 = synset_1.path_similarity(synset_2), \
                         synset_1.wup_similarity(synset_2), \
                         1.0
            if s1 and s2 and s3:  # path/wup similarity may return None
                new_sim = stats.hmean((s1, s2, s3))
                if new_sim > 0:
                    sim.append(new_sim)
        
    return stats.hmean(sim) if len(sim) > 0 else 0.0
Example #5
def reprocess_classifier(x_vals, x_sub, prior_train, prior_test, y_vals, j, skf, clf, scores_only=False):
    # train classifiers and return out of fold train
    # predictions and blended submission predictions
    blend_train = prior_train
    blend_test = prior_test

    blend_test_j = np.zeros((x_sub.shape[0], len(skf)))
    for i, (train, test) in enumerate(skf):
        X_train = x_vals[train]
        y_train = y_vals[train]
        X_test = x_vals[test]
        clf.fit(X_train, y_train)
        blend_train[test, j] = clf.predict_proba(X_test)[:, 1]
        blend_test_j[:, i] = clf.predict_proba(x_sub)[:, 1]
    blend_test[:, j] = hmean(clip_probabilities(blend_test_j), axis=1)
    print("clf:", j, "logloss:", logloss(y_vals, blend_train[:, j]))

    dataset_blend_train = clip_probabilities(blend_train)
    dataset_blend_test = clip_probabilities(blend_test)
    if scores_only:
        return dataset_blend_train, dataset_blend_test
    else:
        X_stacked = np.hstack([x_vals, dataset_blend_train])
        X_submission_stacked = np.hstack([x_sub, dataset_blend_test])
        return X_stacked, X_submission_stacked
Example #6
def _school_similarity(school1, school2, threshold=-sys.float_info.max, proximity_only=False, explain=False):
    loc1 = school1['address']['coordinates'] if 'coordinates' in school1['address'] else None
    loc2 = school2['address']['coordinates'] if 'coordinates' in school2['address'] else None 
    sim_loc = location_similarity(loc1, loc2, radius=_radius)
    
    if proximity_only == True:
        sim_name = None
        if sim_loc: sims = [sim_loc] 
        else: sims = [-sys.float_info.max]
    else:
        sim_name = token_similarity(school1['name'], school2['name'])
        sims = [sim_name, sim_loc]
    score = stats.hmean(sims) if all([s and s > 0 for s in sims]) else -sys.float_info.max
    
    explanation = None
    if explain == True:
        explanation = {}
        explanation['model'] = school1
        explanation['entity'] = school2
        explanation['Score explanation'] = {}
        explanation['Score explanation']['1 - distance (km)'] = distance(loc1, loc2)
        explanation['Score explanation']['2 - _radius for similarity (km)'] = _radius
        if sim_name: explanation['Score explanation']['3 - similarity of names'] = sim_name
        explanation['Score explanation']['4 - similarity of locations'] = sim_loc
        explanation['Score explanation']['5 - final score'] = score
        explanation['Score explanation']['6 - _score_threshold'] = threshold
    
    return score, explanation
Example #7
def find_error(predicted, actual, fold_stats) :
    recall = [0.0,0.0,0.0]
    precision = [0.0,0.0,0.0]
    fstat = [0.0,0.0,0.0]
    for i in range (0,3) :
        tp = 0
        fp = 0
        fn = 0
        tn = 0
        for j in range (0,len(actual)) :
            if (predicted[j] == i) :
                if (actual[j] == i) :
                    tp = tp + 1
                else :
                    fp = fp + 1
            else :
                if (actual[j] == i) :
                    fn = fn + 1
                else :
                    tn = tn + 1
        recall[i] = tp/(tp+fn)
        precision[i] = tp/(tp+fp)
        fstat[i] = stats.hmean([recall[i], precision[i]])
    recall_avg = (recall[0]+recall[1]+recall[2])/3
    precision_avg = (precision[0]+precision[1]+precision[2])/3
    fstat_avg = (fstat[0]+fstat[1]+fstat[2])/3
    fold_stats.append([recall_avg, precision_avg, fstat_avg])
Example #8
def do_cohort(case, model, N0, nindiv, corr_name):
    last = 0.5
    fig, ax = plt.subplots(figsize=(16, 9))
    nb = Nbs[(model, N0)]
    #fig.suptitle("Nb: %d (N1: %d) - different cohorts - 100 SNPs -%s" %
    #            (nb, N0, corr_name), fontsize=18)
    fig.suptitle("Nb: %d - different cohorts - 100 SNPs - %s" %
                (nb, corr_name), fontsize=24)
    box_vals = []
    labels = []
    tops = []
    bottoms = []
    hmeans = []
    bname = get_bname(model)

    for cohort in cohorts:
        vals, ci, r2, sr2, j, ssize = \
            case[cohort][(model, N0)][(None, nindiv, 100, "SNP")]
        for cname, corrections in get_corrs(N0, bname, nindiv, vals,
                                            ci, r2, sr2, j):
            if cname != corr_name:
                continue
            cvals, cci = corrections
            vals = cvals
            ci = cci
            break
        box_vals.append(vals)
        hmeans.append(hmean(vals))
        bottom, top = list(zip(*ci))
        top = [100000 if x is None else x for x in top]
        bottom = [100000 if x is None else x for x in bottom]
        tops.append(np.percentile(top, 90))
        bottoms.append(np.percentile(bottom, 10))
        if cohort == 'c2c':
            labels.append("2 cohorts")
        elif cohort == 'c3c':
            labels.append("3 cohorts")
        else:
            labels.append("%s" % cohort)
        if cohort == cohorts[-1]:
            pos = len(labels) + 0.5
            ax.axvline(pos, color="k", lw=0.2)
            ax.text(last + (pos - last) / 2, 0, "%d Individuals sampled" % nindiv,
                    ha="center", va="bottom", size=24,
                    rotation="horizontal")
            last = pos
    ax.set_ylim(0, nb * 3)
    ax.set_ylabel(r'$\hat{N}_{e}$', fontsize=32)
    ax.axhline(nb, color="k", lw=0.3)
    sns.boxplot(box_vals, notch=0, sym="")
    ax.set_xticks(1 + np.arange(len(labels)))
    ax.set_xticklabels(labels, fontsize=24)
    ax.plot([1 + x for x in range(len(tops))], tops, "rx")
    ax.plot([1 + x for x in range(len(bottoms))], bottoms, "rx")
    ax.plot([1 + x for x in range(len(hmeans))], hmeans, "k+")
    yticks = [0, nb // 2, nb, 2 * nb, 3 * nb]
    ax.set_yticks(yticks)
    ax.set_yticklabels([str(y) for y in yticks], fontsize=14)
    #fig.savefig("output/cohort-%s-%s-%d.png" % (model, corr_name, N0))
    return fig
Example #9
def calculate_avg_condnum(grad_tensor, qoi_set):
    r"""
    Given gradient vectors at some points (centers) in the parameter space and
    given a specific set of QoIs, calculate the average condition number of the
    matrices formed by the gradient vectors of each QoI map at each center.
    :param grad_tensor: Gradient vectors at each center in the parameter space
        :math:`\Lambda` for each QoI map.
    :type grad_tensor: :class:`np.ndarray` of shape (num_centers, num_qois,
        Lambda_dim) where num_centers is the number of points in :math:`\Lambda`
        we have approximated the gradient vectors and num_qois is the number of
        QoIs we are given.
    :param list qoi_set: List of QoI indices
    :rtype: tuple
    :returns: (condnum, singvals) where condnum is a float and singvals
        has shape (num_centers, Data_dim)
    """
    # Calculate the singular values of the matrix formed by the gradient
    # vectors of each QoI map.  This gives a set of singular values for each
    # center.
    singvals = np.linalg.svd(grad_tensor[:, qoi_set, :], compute_uv=False)
    indz = singvals[:, -1] == 0
    if np.sum(indz) == singvals.shape[0]:
        hmean_condnum = np.inf
    else:
        singvals[indz, 0] = np.inf
        singvals[indz, -1] = 1
        condnums = singvals[:, 0] / singvals[:, -1]
        hmean_condnum = stats.hmean(condnums)

    return hmean_condnum, singvals
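A hedged usage sketch for the function above with made-up shapes (10 centers, 3 QoI maps, 2 input dimensions), assuming the numpy and scipy.stats imports the example already relies on; it is illustrative only, not from the source project.
import numpy as np

rng = np.random.default_rng(0)
grad_tensor = rng.standard_normal((10, 3, 2))    # (num_centers, num_qois, Lambda_dim)
condnum, singvals = calculate_avg_condnum(grad_tensor, qoi_set=[0, 2])
print(condnum, singvals.shape)                   # harmonic-mean condition number, (10, 2)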
Example #10
def find_error(predicted, actual, fold_stats) :
    recall = [0.0,0.0]
    precision = [0.0,0.0]
    fstat = [0.0,0.0]
    for i in range (0,2) :
        tp = 0
        fp = 0
        fn = 0
        tn = 0
        for j in range (0,len(actual)) :
            if (predicted[j] == i) :
                if (actual[j] == i) :
                    tp = tp + 1
                else :
                    fp = fp + 1
            else :
                if (actual[j] == i) :
                    fn = fn + 1
                else :
                    tn = tn + 1
        #print(i,j,tp,fp,fn,tn)
        if (tp > 0) :
            recall[i] = tp/(tp+fn)
            precision[i] = tp/(tp+fp)
        else :
            recall[i] = 0.0
            precision[i] = 0.0
        fstat[i] = stats.hmean([recall[i], precision[i]]) if recall[i] > 0 and precision[i] > 0 else 0.0
    recall_avg = numpy.mean(recall)
    precision_avg = numpy.mean(precision)
    fstat_avg = numpy.mean(fstat)
    fold_stats.append([recall_avg, precision_avg, fstat_avg])
Example #11
def create_simple_record(sequence):
    features = np.zeros(1, SimpleRecordType)
    
    features["mean"] = sequence.mean()
    features["var"] = sequence.var()
    features["skewness"] = stats.skew(sequence)
    features["kurtosis"] = stats.kurtosis(sequence)
    
    features["first"] = sequence[0]
    features["sign"] = np.sign(sequence).mean()
    features["zeros"] = (sequence == 0).mean()
    
    if (features["zeros"] == 0.0):
        features["harmonic_mean"] = stats.hmean(abs(sequence))
        features["geometric_mean"] = stats.gmean(abs(sequence))
    else:
        features["harmonic_mean"] = np.nan
        features["geometric_mean"] = np.nan
    
    for m in [2, 3, 5]:
        seqm = sequence % m
        for v in range(m):
            features["val_%dmod%d" % (v, m)] = (seqm == v).mean()
    
    return features
Example #12
def hyperloglog(multiset, b = 5):
    """Compute the estimate of the number of unique elements in multiset
    """
    m = 2 ** b
    registers = [0] * (m + 1)

    for item in multiset:
        x = zlib.adler32(str(item).encode())  # adler32 needs bytes in Python 3
        bin_x = bin(x)[2:] # Truncating 0b
        leftbits = bin_x[:b]
        j = int("0b" + leftbits, 2) + 1

        rightbits = bin_x[b:]
        w = rightbits

        p = w.find("1") + 1
        print("w = ", w)
        print("p = ", p)

        registers[j] = max(registers[j], p)

    print(registers)
    print(harmonic_mean(registers))
    print(stats.hmean(registers))

    return alpha(m) * (m ** 2) * harmonic_mean(registers)
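The snippet calls two helpers, alpha() and harmonic_mean(), that are not shown. The stand-ins below are assumptions consistent with the standard HyperLogLog raw estimate E = alpha_m * m^2 / sum_j 2^(-M_j) (the bias constant is the usual approximation for large m, and the example allocates m + 1 registers, so slot 0 is never written); they are sketches to make the example runnable, not the original project's code.
def alpha(m):
    # approximate HyperLogLog bias-correction constant (good for m >= 128)
    return 0.7213 / (1 + 1.079 / m)

def harmonic_mean(registers):
    # reciprocal of the sum of 2^-M_j, the "harmonic" combination HyperLogLog uses
    return 1.0 / sum(2.0 ** -r for r in registers)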
Example #13
def get_f(lex, gs_corpus):
    gs = squeeze(asarray(gs_corpus.sents))
    gs_num_mappings = shape(gs)[0]

    links = nonzero(lex)
    obj_is = links[0]
    word_is = links[1]
    lex_num_mappings = size(obj_is)

    # compute precision, what portion of the target lex is composed of gold pairings
    p_count = 0
    for pair in range(lex_num_mappings):
        this_obj = obj_is[pair]
        this_word = word_is[pair]

        #loop over gold standard
        if size(where((gs[:,0] == this_obj) & (gs[:,1] == this_word))) > 0:
            p_count = p_count + 1

    if (lex_num_mappings == 0): #special case
      precision = 0
    else:
      precision = float(p_count) / float(lex_num_mappings)

    # compute recall, how many of the total gold pairings are in the target lex
    recall = float(p_count) / float(gs_num_mappings)

    # now F is just the harmonic mean
    f =  stats.hmean([recall, precision])
    return (precision, recall, f)
Example #14
def base_harmo_ave(arry):
    """
    Compute the harmonic mean of a one-dimensional list.
    Returns the harmonic mean of the list, rounded to two decimal places.
    """
    result = stats.hmean(arry)
    return round(result,2)
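A quick usage sketch: for [1, 2, 4] the harmonic mean is 3 / (1/1 + 1/2 + 1/4) = 12/7, so the rounded result is 1.71.
print(base_harmo_ave([1, 2, 4]))   # 1.71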
Example #15
    def test_2D_array_dim1(self):
        a = array(((1,2,3,4),
                   (1,2,3,4),
                   (1,2,3,4)))

        v = 4. / (1./1 + 1./2 + 1./3 + 1./4)
        desired1 = array((v,v,v))
        actual1 = stats.hmean(a, axis=1)
        assert_array_almost_equal(actual1, desired1, decimal=14)
Example #16
def do_pcrit(case, model, N0, isSNP):
    if isSNP:
        markers = [100, 200, 400]
        marker_name = 'SNP'
    else:
        markers = [15, 50, 100]
        marker_name = 'MSAT'
    cohort = "Newb"
    sampCase = {}
    for nmarkers in markers:
        sampCase[nmarkers] = {}
        for pcrit in pcrits:
            #print case[cohort][N0].keys()
            try:
                vals, ci, r2, sr2, j, ssize = case[cohort][
                    (model, N0)][(pcrit, 50, nmarkers, marker_name)]
            except KeyError:
                vals, ci, r2, sr2, j, ssize = [], [], [], [], [], []
            sampCase[nmarkers][pcrit] = vals, ci, r2, sr2, j, ssize
    fig, ax = plt.subplots()
    fig.suptitle("pcrit : %s - %d - Newb - %ss - 50 individuals" % (
        model, N0, marker_name))
    box = []
    cnt = 0
    labels = []
    tops = []
    bottoms = []
    hmeans = []

    for nmarkers in markers:
        ax.text(1 + cnt, 0, str(nmarkers) + " %ss" % marker_name, rotation="vertical", ha="left", va="bottom")
        critCases = sampCase[nmarkers]
        for pcrit in pcrits:
            vals, ci, r2, sr2, j, ssize = critCases[pcrit]
            if len(vals) > 0:
                hmeans.append(hmean(vals))
                bottom, top = list(zip(*ci))
                tops.append(np.percentile([x if x is not None else 100000
                                           for x in top], 90))
                bottoms.append(np.percentile([x if x is not None else 0.1
                                              for x in bottom], 10))
            else:
                hmeans.append(None)
                tops.append(None)
                bottoms.append(None)
            labels.append(str(pcrit) if pcrit is not None else "std")
            box.append(vals)
            cnt += 1
    sns.boxplot(box, notch=0, sym="",)
    ax.plot([1 + x for x in range(len(tops))], tops, "r.")
    ax.plot([1 + x for x in range(len(bottoms))], bottoms, "r.")
    ax.plot([1 + x for x in range(len(hmeans))], hmeans, "k.")
    ax.set_ylabel(r"$\hat{N}_{e}$")
    ax.set_ylim(0, Nbs[(model, N0)] * 2)
    ax.axhline(Nbs[(model, N0)], color="k", lw=2)
    ax.set_xticks(1 + np.arange(len(labels)))
    ax.set_xticklabels(labels, rotation="vertical")
Example #17
def summary_stats(movie_ratings):
    r_ids = [r.ID for r in movie_ratings]
    count = len(movie_ratings)
    if count > 0:
        amean = np.mean([r.rating for r in movie_ratings])
        hmean = spstats.hmean([r.rating for r in movie_ratings])
        var = np.var([r.rating for r in movie_ratings])
    else:
        amean = np.NaN
        hmean = np.NaN
        var = np.NaN
    return r_ids, count, amean, hmean, var
Example #18
def dependency2(i, N_j, node2Index, index2Node, table):
    values = []
    for neighbor in N_j:
        f = 0.0
        if i < node2Index[neighbor]:
            f = table[i][node2Index[neighbor]]
        else:
            f = table[node2Index[neighbor]][i]
        if f == 0.0:
            return 0.0
        else:
            values.append(f)
    return stats.hmean(values)
Example #19
def plot_pingpong_lineplot(xpos, dir, lang, lab, ls, marker, tp='lat'):

    stddev = {}
    statistic = {}

    for fname in os.listdir(dir):
        basename, ext = os.path.splitext(fname)
        if ext != ".dat":
            continue

        junk1, junk2, language, msglen = basename.split('_')

        if lang != language:
            continue

        data = np.loadtxt(f"{dir}/{fname}")

        # outlier removal (Tukey, 1.5 param)
        d_25 = np.quantile(data, 0.25)
        iqr = stats.iqr(data)
        d_75 = np.quantile(data, 0.75)
        lower = d_25 - 1.5 * iqr
        upper = d_75 + 1.5 * iqr
        cdata = np.array([x for x in data if x > lower and x < upper])

        ml = int(msglen)

        if tp == 'lat':
            statistic[ml] = np.mean(cdata) / 1e6  # convert to ms
            stddev[ml] = np.std(cdata) / 1e6  # stddev is in same units as data
        elif tp == 'tput':
            statistic[ml] = stats.hmean(2 * ml / cdata) * 1e9  # B/s
            stddev[ml] = np.std(2 * ml / cdata) * 1e9
        else:
            print(f"Unknown measure: {tp}")
            exit()

    ys = [float(statistic[k]) for k in sorted(statistic.keys())]
    stds = [float(stddev[k]) for k in sorted(stddev.keys())]

    plt.errorbar(xpos,
                 ys,
                 yerr=stds,
                 xerr=None,
                 label=lab,
                 linestyle=ls,
                 marker=marker)
    plt.legend(loc='best')
Example #20
def plot_model(fig, model):
    bname = get_bname(model)
    vals = []
    ldnes = {}
    errs = {}
    labels = []
    cnb = case['Newb']
    nbks = sorted(list(Nbs.keys()), key=lambda x: x[1])
    for cname, cdata in get_corrs(bname, [], []):
        ldnes[cname] = []
        errs[cname] = []

    nobs = 0
    for name, N0 in nbks:
        if name != model:
            continue
        nobs += 1
        labels.append(str(N0))
        val = Nbs[(model, N0)]
        ldne, ci = cnb[(model, N0)][None, 50, 100, 'SNP']
        for cname, cdata in get_corrs(bname, ldne, ci):
            cldne, ccis = cdata
            hmean = stats.hmean([x if x > 0 else 10000 for x in cldne])
            ldnes[cname].append(hmean)
            err = hmean / val
            errs[cname].append(err)
        vals.append(val)

    ax = fig.add_subplot(2, 1, 1)
    ax.set_title("Nb and estimators %s" % bname)
    ax.plot(vals, '+', label="Nb")
    for name, lvals in list(ldnes.items()):
        ax.plot(lvals, '-', label=name)
        print(name)
        print(vals)
        print(lvals)
    ax.set_xticklabels(labels)
    ax.legend()

    ax = fig.add_subplot(2, 1, 2)
    ax.set_title("Fraction of error %s" % bname)
    ax.plot([1.0] * nobs, '+', label="Nb")
    for name, cvals in list(errs.items()):
        ax.plot(cvals, '-', label=name)
    ax.set_xticklabels(labels)
    ax.legend()

    fig.savefig("output/correct.png")
Example #21
def ranking_features(df_score, list_tracks, method = 'mean'):
	if(len(list_tracks)==1):
		df_score = df_score.sort_values(by = list_tracks, ascending = False)

	else:
		if(method == 'gmean'):
			df_combine = pd.DataFrame({'score_global': stats.mstats.gmean(df_score.iloc[:,1:], axis=1)})
		elif(method == 'hmean'):
			df_combine = pd.DataFrame({'score_global': stats.hmean(df_score.iloc[:,1:], axis=1)})
		else:
			df_combine = pd.DataFrame({'score_global': df_score.mean(axis=1)})

		df_score = pd.concat([df_score, df_combine], axis=1)
		df_score = df_score.sort_values(by = ['score_global'], ascending = False)

	return df_score
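A hedged usage sketch for ranking_features with toy data; the column names below are invented, and the call assumes the example's own pandas and scipy.stats imports.
import pandas as pd

df_score = pd.DataFrame({'feature': ['a', 'b', 'c'],
                         't1': [0.2, 0.9, 0.5],
                         't2': [0.4, 0.5, 0.6]})
print(ranking_features(df_score, ['t1', 't2'], method='hmean'))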
Example #22
    def extract_keywords_harmonic(self):
        max_edge_weight = max(self.node_weight.values(
        ))  # Maximum sum of edge weights among all edges in the graph
        max_degree = max(dict(self.G.degree).values())

        ranks = {
            cand: hmean([
                self.G.degree(cand) / max_degree,
                self.node_weight[cand] / max_edge_weight,
                self.candidate_doc_occur[cand] / self.num_docs
            ])
            for cand in list(self.G.nodes()) if self.G.degree(cand) > 0
        }
        sortRank = self.sortKeywords(ranks)
        self.ranks_hmean = sortRank
        return (self.ranks_hmean)
Example #23
def get_weight_attr(cov_u, cov_v, reads_weight, db_weight):
    cov_diff = 1.0 / (abs(cov_u - cov_v) + sys.float_info.epsilon)
    weight_attr = {
        'cov_diff':
        cov_diff,
        'reads_and_db':
        reads_weight + db_weight,
        'geometric_mean':
        gmean([cov_diff, reads_weight, db_weight]),
        'harmonic_mean':
        hmean([
            cov_diff, reads_weight + sys.float_info.epsilon,
            db_weight + sys.float_info.epsilon
        ])
    }
    return weight_attr
Example #24
def num_hm(seed, fname_contains_seed=False):
    lo = sg.self_affine_psd_based_ext(L, hrms, 0.8, N, seed=seed)
    up = sg.self_affine_psd_based_ext(L, hrms, 0.8, N, seed=seed, lambda_L_over_lambda_mm=lambda_L_over_lambda_mm)
    
    # compute numerical results and save to binary
    O = [0., 1000., 3000.] 
    P = [10.0E6,20.0E6,30.0E6,40.0E6,50.0E6]
    #O = [3000.] 
    #P = [10.0E6]
    AHs, AH_perps, AH_paras, RCs = [], [], [], []
    for offset in O:
        sco = sm.composite_surface_in_x(up.h, lo.h, lo.a, offset)
        #sm.plot2D(sco, ticks=False)            
        AH, AH_perp, AH_para, RC = [], [], [], []
        for p in P:
            contact = sc.contact_FFT(sco, p, E, nu, store_aperture_field=True, verbose=True)
            if offset == 3000. and p == 10.0E6:
                cfname = 'cont_num_3mm_10MPa'
                if fname_contains_seed:
                    cfname += '_seed'+str(seed)
                sc.save(contact, cfname)
            #sc.plot_aperture_field(contact.aperture_field, contact.dxy, save_as='am_3mm_10MPa')
            #comp_contacts()
            #sys.exit()
            #sc.plot_clusters(contact)
            #sc.plot_contact_cluster_areas(contact)
            flow_x, flow_y = sf.hydraulic_aperture(contact.aperture_field, contact.dxy, verbose=True)
            ah_iso = scst.hmean([flow_x.a, flow_y.a])
            AH += [su.to_meter(ah_iso)]
            AH_perp += [su.to_meter(flow_y.a)]
            AH_para += [su.to_meter(flow_x.a)]
            RC += [contact.contact_ratio()]
        AHs += [AH]
        AH_perps += [AH_perp]
        AH_paras += [AH_para]
        RCs += [RC]

    bfname = 'results'
    if fname_contains_seed:
        bfname += '_seed'+str(seed)
    f = open(bfname, 'wb')
    pc.dump(P, f)
    pc.dump(AHs, f)
    pc.dump(AH_perps, f)
    pc.dump(AH_paras, f)
    pc.dump(RCs, f)
    f.close()
Example #25
def skewness():
    columns = ['column name', 'value']
    aircrafts_data = pd.read_csv(
        "F:/Academics/Sem 1/Big Data/Homewok/HM2/cs5614-hw-master/data/aircrafts_data.csv"
    )
    airports_data = pd.read_csv(
        'F:/Academics/Sem 1/Big Data/Homewok/HM2/cs5614-hw-master/data/airports_data.csv'
    )
    boarding_passes = pd.read_csv(
        'F:/Academics/Sem 1/Big Data/Homewok/HM2/cs5614-hw-master/data/boarding_passes.csv'
    )
    bookings = pd.read_csv(
        'F:/Academics/Sem 1/Big Data/Homewok/HM2/cs5614-hw-master/data/bookings.csv'
    )
    flights = pd.read_csv(
        'F:/Academics/Sem 1/Big Data/Homewok/HM2/cs5614-hw-master/data/flights.csv'
    )
    seats = pd.read_csv(
        'F:/Academics/Sem 1/Big Data/Homewok/HM2/cs5614-hw-master/data/seats.csv'
    )
    ticket_flights = pd.read_csv(
        'F:/Academics/Sem 1/Big Data/Homewok/HM2/cs5614-hw-master/data/ticket_flights.csv'
    )
    tickets = pd.read_csv(
        'F:/Academics/Sem 1/Big Data/Homewok/HM2/cs5614-hw-master/data/tickets.csv'
    )
    aircrafts_data_skew = aircrafts_data.skew().abs().sum()

    #aircrafts_data_s = aircrafts_data_skew.sum()
    boarding_passes_skew = boarding_passes.skew().abs().sum()

    bookings_skew = bookings.skew().abs().sum()
    flights_skew = flights.skew().abs().sum()
    seats_skew = seats.skew().abs().sum()
    ticket_flights_skew = ticket_flights.skew().abs().sum()
    tickets_skew = tickets.skew().abs().sum()
    airports_data_skew = airports_data.skew().abs().sum()
    data = np.array([
        aircrafts_data_skew, airports_data_skew, boarding_passes_skew,
        bookings_skew, flights_skew, seats_skew, ticket_flights_skew,
        tickets_skew
    ])
    #print(data)
    mean = data.mean()
    h_mean = hmean(data)
    print('The average skew in the dataset is: ', np.round(mean, 2))
    return mean
Example #26
def swc(labels, distances):
    if len(np.unique(labels)) > 1:
        silhouette = round(
            silhouette_score(squareform(distances),
                             labels,
                             metric='precomputed'), 3)
        unique, counts = np.unique(labels, return_counts=True)
        stds = np.std(counts)
        armonicas = hmean(counts)
        armonicas_1 = 1 / np.sum(1.0 / counts, axis=0)
    else:
        silhouette = 0.0
        stds = 'no'
        armonicas = 'no'
        armonicas_1 = 'no'

    return silhouette, stds, armonicas, armonicas_1
Example #27
def inspect_confusion_matrix(cmatrix):
    cmatrix = np.array(cmatrix)

    total_count = np.sum(cmatrix)
    tp = np.diag(cmatrix)
    fp = np.sum(cmatrix, axis=0) - tp
    fn = np.sum(cmatrix, axis=1) - tp
    tn = total_count - fp - fn - tp

    averaging = {
        'microavg':
        lambda score: score(*[np.mean(x) for x in (tp, fp, tn, fn)]),
        'macroavg': lambda score: np.mean(score(tp, fp, tn, fn))
    }

    results = {}

    results['noavg'] = {
        'total_count': total_count,
        'accuracy': tp.sum() / total_count
    }

    base_accuracy = (np.sum(cmatrix, axis=0) * np.sum(
        cmatrix, axis=1)).sum() / (total_count * total_count)
    results['noavg']['kappa'] = (results['noavg']['accuracy'] -
                                 base_accuracy) / (1 - base_accuracy)

    for averaging_name, averaging_f in averaging.items():
        scores = {
            'precision': averaging_f(lambda tp, fp, tn, fn: tp / (tp + fp)),
            'recall': averaging_f(lambda tp, fp, tn, fn: tp / (tp + fn)),
            'specificity': averaging_f(lambda tp, fp, tn, fn: tn / (tn + fp))
        }

        scores['sensitivity'] = scores['recall']
        try:
            scores['f1'] = hmean([scores['precision'], scores['recall']])
        except ValueError:
            scores['f1'] = None

        scores['ppv'] = scores['precision']
        scores['npv'] = averaging_f(lambda tp, fp, tn, fn: tn / (tn + fn))

        results[averaging_name] = scores

    return results
Example #28
def massAvg(massList, method='weighted', weights=None):
    """
    Compute the average mass of massList according to method.

    If method=weighted but weights were not properly defined,
    switch method to harmonic.    
    If massList contains a zero mass, switch method to mean.
    
    :parameter method: possible values: harmonic, mean, weighted
    :parameter weights: weights of elements (only for weighted average)
    
    """
    if not massList:
        return massList
    if len(massList) == 1:
        return massList[0]

    if method == 'weighted' and (not weights or len(weights) != len(massList)):
        method = 'harmonic'
    flatList = [ mass / GeV for mass in _flattenList(massList)]
    if method == 'harmonic' and 0. in flatList:
        method = 'mean'

    for mass in massList:
        if len(mass) != len(massList[0]) \
                or len(mass[0]) != len(massList[0][0]) \
                or len(mass[1]) != len(massList[0][1]):
            logger.error('Mass shape mismatch in mass list:\n' + str(mass) +
                         ' and ' + str(massList[0]))
            import sys
            sys.exit()

    avgmass = copy.deepcopy(massList[0])
    for ib, branch in enumerate(massList[0]):
        for ival in enumerate(branch):
            vals = [ float(mass[ib][ival[0]] / GeV) for mass in massList]
            if method == 'mean':
                avg = np.mean(vals)
            elif method == 'harmonic':
                avg = stats.hmean(vals)
            elif method == 'weighted':
                weights = [ float(weight) for weight in weights ]
                avg = np.average(vals,weights=weights)                
            avgmass[ib][ival[0]] = float(avg)*GeV

    return avgmass
Example #29
def quantize_fast(toas, toaerrs, flags=None, dt=0.1):
    r"""
    Function to quantize and average TOAs by observation epoch. Used especially
    for NANOGrav multiband data.

    Pulled from `[3]`_.

    .. _[3]: https://github.com/vallis/libstempo/blob/master/libstempo/toasim.py

    Parameters
    ----------

    toas : array
        TOAs for a pulsar.

    toaerrs : array
        TOA uncertainties, combined per epoch with a harmonic mean.

    flags : array, optional
        Flags for TOAs.

    dt : float
        Coarse graining time [days].
    """
    isort = np.argsort(toas)

    bucket_ref = [toas[isort[0]]]
    bucket_ind = [[isort[0]]]
    dt *= (24 * 3600)
    for i in isort[1:]:
        if toas[i] - bucket_ref[-1] < dt:
            bucket_ind[-1].append(i)
        else:
            bucket_ref.append(toas[i])
            bucket_ind.append([i])

    avetoas = np.array([np.mean(toas[l]) for l in bucket_ind], 'd')
    avetoaerrs = np.array([sps.hmean(toaerrs[l]) for l in bucket_ind], 'd')
    if flags is not None:
        aveflags = np.array([flags[l[0]] for l in bucket_ind])

    U = np.zeros((len(toas), len(bucket_ind)), 'd')
    for i, l in enumerate(bucket_ind):
        U[l, i] = 1

    if flags is not None:
        return avetoas, avetoaerrs, aveflags, U, bucket_ind
    else:
        return avetoas, avetoaerrs, U, bucket_ind
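A minimal sketch with synthetic numbers (not from the source): three TOAs in seconds, two of which fall inside the same 0.1-day bucket, so their uncertainties are combined with a harmonic mean.
import numpy as np

toas = np.array([0.0, 30.0, 20000.0])     # seconds; 0.1 day = 8640 s
toaerrs = np.array([1.0, 2.0, 3.0])
avetoas, avetoaerrs, U, bucket_ind = quantize_fast(toas, toaerrs, dt=0.1)
print(avetoas, avetoaerrs)                # roughly [15. 20000.] and [1.33... 3.]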
Example #30
    def calallstats(self, nparr1, nparr2=None):
        self.amean = np.mean(nparr1)
        self.hmean = stats.hmean(nparr1)
        self.gmean = stats.gmean(nparr1)
        self.median = np.median(nparr1)
        self.mode = stats.mode(nparr1)
        self.min = np.min(nparr1)
        self.max = np.max(nparr1)
        self.q1 = np.percentile(nparr1, 25)
        self.q2 = np.percentile(nparr1, 50)
        self.q3 = np.percentile(nparr1, 75)
        self.var = np.var(nparr1)
        self.std = np.std(nparr1)
        self.cov = np.cov(nparr1, nparr2)
        self.corr = np.correlate(nparr1, nparr2)

        pass
Example #31
def massAvg(massList, method='weighted', weights=None):
    """
    Compute the average mass of massList according to method.

    If method=weighted but weights were not properly defined,
    switch method to harmonic.    
    If massList contains a zero mass, switch method to mean.
    
    :parameter method: possible values: harmonic, mean, weighted
    :parameter weights: weights of elements (only for weighted average)
    
    """
    if not massList:
        return massList
    if len(massList) == 1:
        return massList[0]

    if method == 'weighted' and (not weights or len(weights) != len(massList)):
        method = 'harmonic'
    flatList = [mass / GeV for mass in _flattenList(massList)]
    if method == 'harmonic' and 0. in flatList:
        method = 'mean'

    for mass in massList:
        if len(mass) != len(massList[0]) \
                or len(mass[0]) != len(massList[0][0]) \
                or len(mass[1]) != len(massList[0][1]):
            logger.error('Mass shape mismatch in mass list:\n' + str(mass) +
                         ' and ' + str(massList[0]))
            import sys
            sys.exit()

    avgmass = copy.deepcopy(massList[0])
    for ib, branch in enumerate(massList[0]):
        for ival in enumerate(branch):
            vals = [float(mass[ib][ival[0]] / GeV) for mass in massList]
            if method == 'mean':
                avg = np.mean(vals)
            elif method == 'harmonic':
                avg = stats.hmean(vals)
            elif method == 'weighted':
                weights = [float(weight) for weight in weights]
                avg = np.average(vals, weights=weights)
            avgmass[ib][ival[0]] = float(avg) * GeV

    return avgmass
Example #32
def recommend_for_user(user, **kwargs):
    global uu_enh, userids
    docs, scores = [], []

    item_sim_min_thr = kwargs.get('item_sim_min_thr', 0.2)
    item_maxn = kwargs.get('max_items', 3)

    # first, locate top N similar users...
    res = similar_users(user, **kwargs)

    if res.size > 0:
        user_row = __get_user_row(user)
        ri = np.where(user_row.data == 1.0)
        user_read = user_row.indices[ri]

        sim_hists = [
            __get_user_row(e).todense().tolist()[0] for e in res[:, 0]
        ]
        sim_hists = sps.csr_matrix(sim_hists)

        print("For ", user)
        print(user_row)

        print("similar users: ")
        print(res)
        print(sim_hists)

        for i, d in enumerate(docids):
            if sim_hists.getcol(i).size > 0 and i not in user_read:
                y_hat = hmean(sim_hists.getcol(i).data)
                y = user_row[0, i]
                s = 1.0 - np.abs(y_hat - y)
                #print d, ": hmean=", y_hat, ", score=", s
                if s >= item_sim_min_thr:
                    docs.append(d)
                    scores.append(s)

        docs = np.array(docs)
        scores = np.array(scores)
        ids = np.argsort(-scores)[0:item_maxn + 1]
        docs = docs[ids]
        scores = scores[ids]

    ret = np.array(list(zip(docs, scores)))
    return ret
Example #33
def get_fft_stats(z):

	avg = np.average(z)
	std = np.std(z)
	median = np.median(z)
	var = np.var(z)
	kurt = stats.kurtosis(z)
	hmean = stats.hmean(z)
	gmean = stats.gmean(z)
	skew = stats.skew(z)
	median_dev_abs = np.sum(np.abs(z - median)) 
	std_dev_abs = np.sum(np.abs(z - std)) 

	
	stats_array = [avg, std, median, var, kurt, hmean, gmean, skew, median_dev_abs, std_dev_abs]

	return stats_array
Example #34
def do_hz_comp(pref, mydir, model, N0):
    snps = [100]  # , 200, 400]
    cohort = "Newb"
    cutCase = {}
    for cut in cuts:
        cutCase[cut] = {}
        case = load_file(pref, cut * 100, mydir)
        for nsnp in snps:
            vals, ci, r2, sr2, j, ssize = case[cohort][
                (model, N0)][(None, 50, nsnp, "SNP")]
            cutCase[cut][nsnp] = vals, ci, r2, sr2, j, ssize
    fig, ax = plt.subplots(figsize=(16, 9))
    fig.suptitle("Hz comparison: %s - %d - Newb - SNPs - 50 indivs " % (model, N0))
    box = []
    cnt = 0
    labels = []
    tops = []
    bottoms = []
    hmeans = []

    for cut in cuts:
        # ax.text(cnt, 0, str(cut), rotation="vertical")
        cases = cutCase[cut]
        for nsnp in snps:
            vals, ci, r2, sr2, j, ssize = cases[nsnp]
            hmeans.append(hmean(vals))
            bottom, top = list(zip(*ci))
            top = [100000 if x is None else x for x in top]
            bottom = [100000 if x is None else x for x in bottom]
            tops.append(np.percentile(top, 90))
            bottoms.append(np.percentile(bottom, 10))
            # labels.append(str(nsnp))
            labels.append(str(cut))
            box.append(vals)
            cnt += 1
    sns.boxplot(box, notch=0, sym="",)
    ax.set_ylabel(r"$\hat{N}_{e}$", fontsize=24)
    ax.set_xlabel('Hz', fontsize=24)
    ax.plot([1 + x for x in range(len(tops))], tops, "rx")
    ax.plot([1 + x for x in range(len(bottoms))], bottoms, "rx")
    ax.plot([1 + x for x in range(len(hmeans))], hmeans, "k+")
    ax.axhline(Nbs[(model, N0)], color="k", lw=0.3)
    ax.set_ylim(0, Nbs[(model, N0)] * 3)
    ax.set_xticks(1 + np.arange(len(labels)))
    ax.set_xticklabels(labels)
Example #35
def f1_micro_average(y_test, y_pred):
    '''
    Calculates the micro-averaged F1 score
    Input: y_test, y_pred - arrays with test and prediction values
    Output: micro-averaged F1
    '''

    TN = []
    FP = []
    FN = []
    for i in range(y_pred.shape[1]):
        TN.append(confusion_matrix(np.array(y_test)[:, i], y_pred[:, i])[1, 1])
        FP.append(confusion_matrix(np.array(y_test)[:, i], y_pred[:, i])[1, 0])
        FN.append(confusion_matrix(np.array(y_test)[:, i], y_pred[:, i])[0, 1])
    precision = np.sum(TN) / (np.sum(TN) + np.sum(FN))
    recall = np.sum(TN) / (np.sum(TN) + np.sum(FP))

    return hmean([precision, recall])
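A quick numeric check of the identity the return statement relies on: the harmonic mean of precision and recall is the familiar F1 formula 2PR / (P + R).
from scipy.stats import hmean

p, r = 0.5, 1.0
assert abs(hmean([p, r]) - 2 * p * r / (p + r)) < 1e-12   # both equal 2/3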
Example #36
def combine_predictions(all_interp):
    y_true = to_np(all_interp[0][1].y_true)
    all_preds = np.stack([to_np(interp.preds) for _, interp in all_interp])
    
    preds = np.mean(all_preds, axis=0)
    acc_m = compute_acc(preds, y_true) 
    
    preds = np.median(all_preds, axis=0)
    acc_med = compute_acc(preds, y_true)
    
    preds = gmean(all_preds, axis=0)
    acc_g = compute_acc(preds, y_true)
    
    preds = hmean(all_preds, axis=0)
    acc_h = compute_acc(preds, y_true)
    
    print(f'accuracy -- mean: {acc_m:0.3f}   median: {acc_med:0.3f}   gmean: {acc_g:0.3f}   hmean: {acc_h:0.3f}')
    return acc_m, acc_med, acc_g, acc_h
Example #37
def calculate_means(df):
    """
    Build a DataFrame of means: use NumPy's average for the weighted mean, and SciPy's gmean and hmean for the geometric and harmonic means.
    :param df:
    :return:
    """
    df_means = pd.DataFrame(
        index=['Arithmetic', 'Weighted', 'Geometric', 'Harmonic'])
    cols = ['SATMTMID', 'SATVRMID']
    for col in cols:
        arithmetic = df[col].mean()
        weighted = np.average(df[col], weights=df['UGDS'])
        geometric = gmean(df[col])
        harmonic = hmean(df[col])
        df_means[col] = [arithmetic, weighted, geometric, harmonic]

    df_means['count'] = len(df)
    return df_means.astype(int)
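A hedged usage sketch with made-up rows; SATMTMID, SATVRMID and UGDS are the column names the function expects, and the call assumes the example's own numpy, pandas and scipy imports.
import pandas as pd

df = pd.DataFrame({'SATMTMID': [500, 600],
                   'SATVRMID': [480, 620],
                   'UGDS': [1000, 3000]})
print(calculate_means(df))   # one row per kind of mean, values truncated to int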
Example #38
def perf_metrics_2X2(y_true, y_pred):
	cm = confusion_matrix(y_true, y_pred)
	TN = cm[0][0]
	FN = cm[1][0]
	TP = cm[1][1]
	FP = cm[0][1]

	precision = TP / (TP + FP)  # pos_pred_value
	recall = TP / (TP + FN)  # TP_rate/sensitivity
	false_discovery_rate = FP / (TP + FP)
	true_negative_rate = TN / (FP + TN)

	f1_score = hmean((precision, recall))

	# return {'Precision': precision, 'Recall': recall, 'True Negative Rate': true_negative_rate,
	#         'False Discovery Rate': false_discovery_rate, 'F1 Score': f1_score}
	return 'True Positive Rate: {:.2f}\nTrue Negative Rate: {:.2f}\nPrecision: {:.2f}\nFalse Discovery Rate: {:.2f}\nF1 ' \
	       'Score: {:.2f}\n'.format(recall, true_negative_rate, precision, false_discovery_rate, f1_score)
Example #39
def getNumberSummary(series):
    # Handle the case where input is empty, then just return all 0s.
    if len(series) == 0:
        series = pd.Series([0, 0])
    return pd.DataFrame({
        "min": [np.min(series)],
        "lower_quartile": [np.percentile(series, 25)],
        "median": [np.median(series)],
        "upper_quartile": [np.percentile(series, 75)],
        "max": [np.max(series)],
        "mean": [np.mean(series)],
        "std": [np.std(series, ddof=1 if len(series) > 1 else 0)],
        "skewness": [st.skew(series)],
        "kurtosis": [st.kurtosis(series, fisher=False)],
        "gmean": [st.gmean(series + 1e-6)],
        "hmean": [st.hmean(series + 1e-6)],
        "sum": [np.sum(series)]
    })
Example #40
def plot_age_imputed_dist(data):
    fig, ax = plt.subplots(3, 2, figsize=(15, 15))
    plot_age_and_sex_distribution(data, "Original", ax[0][0])
    plot_age_and_sex_distribution(data.fillna(data["Age"].mean()),
                                  "Impute With Mean", ax[0][1])
    plot_age_and_sex_distribution(
        data.fillna(stats.gmean(data["Age"].dropna())), "Impute With GMean",
        ax[1][0])
    plot_age_and_sex_distribution(
        data.fillna(stats.hmean(data["Age"].dropna())), "Impute With HMean",
        ax[1][1])
    plot_age_and_sex_distribution(data.fillna(data["Age"].mode()[0]),
                                  "Impute With Mode", ax[2][0])
    plot_age_and_sex_distribution(data.fillna(data["Age"].median()),
                                  "Impute With Median", ax[2][1])
    sns.despine(trim=True)
    plt.tight_layout()
    plt.show()
Example #41
def evaluate():
    model_path, data_dir = _input()
    model_home = model_path.split('.')[0] + "_out"
    pos_val = '{}_val_pos.txt'.format(data_dir)
    neg_val = '{}_val_bg.txt'.format(data_dir)
    loc_score = np.array([])
    rec_score = np.array([])

    # Create dirs to store results
    none_path = path.join(model_home, 'none')
    good_path = path.join(model_home, 'good')
    mid_path = path.join(model_home, 'mid')
    bad_path = path.join(model_home, 'bad')
    os.popen("rm -rf {0}; mkdir {0}".format(model_home))
    os.popen("mkdir {0}".format(none_path))
    os.popen("mkdir {0}".format(good_path))
    os.popen("mkdir {0}".format(mid_path))
    os.popen("mkdir {0}".format(bad_path))

    loc_score_p, rec_score_p, goods, mids, bads, nones = evaluate_positives(data_dir, \
    pos_val, model_path)
    _copy_data(goods, good_path)
    _copy_data(mids, mid_path)
    _copy_data(bads, bad_path)
    _copy_data(nones, none_path)

    loc_score_n, rec_score_n = evaluate_negatives(data_dir, neg_val,
                                                  model_path)

    loc_score = np.append(loc_score_p, loc_score_n)
    rec_score = np.append(rec_score_p, rec_score_n)

    print("Total positives evaluated {}".format(len(loc_score_p)))
    print("loc_score {}, rec_score {}".format(np.mean(loc_score_p),
                                              np.mean(rec_score_p)))
    print("Negatives evaluated {}".format(len(loc_score_n)))
    print("loc_score {}, rec_score {}".format(np.mean(loc_score_n),
                                              np.mean(rec_score_n)))
    print("Total evaluated {}".format(len(loc_score)))
    print("loc_score {}, rec_score {}".format(np.mean(loc_score),
                                              np.mean(rec_score)))
    print("Score: {}".format(
        stats.hmean([np.mean(loc_score),
                     np.mean(rec_score)])))
Example #42
    def calculateVideoVMAF(self, modelPath):
        print("Calculating quality ...")

        # VMAF CSV File Name
        vmafOut = os.path.splitext(self.outputFile)[0] + "-vmaf.csv"

        # Assemble select, scale and vmaf filter strings
        scaleStringMain = "[0:v]scale=1920x1080:flags=bicubic,settb=AVTB,setpts=PTS-STARTPTS[main]; "
        scaleStringRef = "[1:v]scale=1920x1080:flags=bicubic,settb=AVTB,setpts=PTS-STARTPTS[ref]; "
        vmafFilterString = "[main][ref]libvmaf=model_path=" + modelPath
        vmafFilterString += ":log_path=" + vmafOut
        vmafFilterString += ":log_fmt=csv"

        # Assemble final filter string
        filterString = scaleStringMain + scaleStringRef + vmafFilterString

        # Assemble FFmpeg command
        vmafCommand = [
            "ffmpeg", "-hide_banner", "-v", "error", "-stats", "-r", "24",
            "-i", self.outputFile, "-r", "24", "-i", self.videoFile,
            "-filter_complex", filterString, "-f", "null", "-"
        ]

        # Run FFmpeg command
        a = run(vmafCommand)

        # Read in VMAF CSV file
        vmaf_df = pd.read_csv(vmafOut, usecols=['Frame', 'vmaf'])

        # Averages & Percentiles
        vmafMean = vmaf_df['vmaf'].mean()
        vmafHMean = stats.hmean(vmaf_df['vmaf'], axis=0)
        vmafMax = vmaf_df['vmaf'].max()
        vmafP75 = np.percentile(vmaf_df['vmaf'], q=75)
        vmafP25 = np.percentile(vmaf_df['vmaf'], q=25)
        vmafMin = vmaf_df['vmaf'].min()

        # Print results
        print("VMAF mean = " + str(vmafMean))
        print("VMAF harmonic mean = " + str(vmafHMean))
        print("VMAF maximum = " + str(vmafMax))
        print("VMAF 75th percentile = " + str(vmafP75))
        print("VMAF 25th percentile = " + str(vmafP25))
        print("VMAF minimum = " + str(vmafMin))
Example #43
def transliterate_one_word(word, language1, language2):
    # retrieve the word's pronunciation
    response = json.loads(get_all_words(language1, word))['result']
    if response == []:
        return False
    word = response[0]
    syllables_ipa1 = word['syllables_ipa']
    syllables2 = []
    syllables_ipa2 = []
    translitteration_score = []
    for syll in syllables_ipa1:
        try:
            # look up each syllable's counterpart in the second language
            response = json.loads(get_all_syllables(language1, syll))['result']
            # response = requests.get(f'{API_URL}/{language1}_syllables/{syll}').json()['result']
            syll1 = response[language2]
            score = response[f'{language2}_score']
            if score < 0.8:  # handle the syllable differently if it has no good equivalent; threshold is arbitrary
                raise KeyError
            syllables_ipa2 += [syll1]
            translitteration_score += [score]
            # syll2 = requests.get(f'{API_URL}/{language2}_syllables/{syll1}').json()['result']['orthographical_syllable']
            syll2 = json.loads(get_all_syllables(
                language2, syll1))['result']['orthographical_syllable']
            syllables2 += [syll2]
        except (KeyError, TypeError):
            syll1 = ""
            syll2 = ""
            for phonem in syll:
                try:
                    item = json.loads(get_phonem(language1, phonem))['result']
                    equivalent = item[language2]
                    writing = json.loads(get_phonem(
                        language2, equivalent))['result']['written']
                    syll1 += equivalent
                    syll2 += writing
                except (KeyError, TypeError):
                    continue
            syllables_ipa2 += [syll1]
            syllables2 += [syll2]
            translitteration_score += [0.001]
    harmonic_mean = int(100 * round(stats.hmean(translitteration_score), 2))
    return word[
        "syllables"], syllables2, syllables_ipa1, syllables_ipa2, harmonic_mean
Example #44
def recommendation_mf(userArray, numUsers, movieIds):

    ratings_dict = {
        'itemID':
        list(ratings.movie_id_ml) + list(numUsers * movieIds),
        'userID':
        list(ratings.user_id) + [
            max(ratings.user_id) + 1 + x for x in range(numUsers)
            for y in range(15)
        ],
        'rating':
        list(ratings.rating) +
        [item for sublist in userArray for item in sublist]
    }

    df = pd.DataFrame(ratings_dict)
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']], reader)
    trainset = data.build_full_trainset()

    nmf = NMF()
    nmf.fit(trainset)

    userIds = [
        trainset.to_inner_uid(max(ratings.user_id) + 1 + x)
        for x in range(numUsers)
    ]

    mat = np.dot(nmf.pu, nmf.qi.T)

    scores = hmean(mat[userIds, :], axis=0)
    best_movies = scores.argsort()
    best_movies = best_movies[-9:][::-1]
    scores = scores[best_movies]
    movie_ind = [trainset.to_raw_iid(x) for x in best_movies]

    recommendation = list(
        zip(
            list(df_ML_movies[df_ML_movies.movie_id_ml.isin(movie_ind)].title),
            list(df_ML_movies[df_ML_movies.movie_id_ml.isin(
                movie_ind)].poster_url), list(scores)))
    print(recommendation)
    print(len(recommendation[0]))
    return recommendation
Example #45
def print_iris_statistics(data):
    # --------------------------------------------------- Create data frame < #
    df = pandas.DataFrame(
            columns = ["Sepal len.",
                       "Sepal wid.",
                       "Petal len.",
                       "Petal wid."])

    # ------------------------- Calculate different statistical information < #
    df.loc["Minimum [cm]", :] = [i for i in data.iloc[:, 0:4].min()]
    df.loc["Maximum [cm]", :] = [i for i in data.iloc[:, 0:4].max()]
    df.loc["Range [cm]", :] = [i for i in df.loc["Maximum [cm]"]
                               - df.loc["Minimum [cm]"]]

    df.loc["First quartile [cm]", :] = [
        quantile(data.iloc[:, i], 0.25) for i in range(4)]
    df.loc["Median [cm]", :] = [
        median(data.iloc[:, i]) for i in range(4)]
    df.loc["Third quartile [cm]", :] = [
        quantile(data.iloc[:, i], 0.75) for i in range(4)]

    df.loc["Harmonic mean [cm]", :] = stats.hmean(data.iloc[:, 0:4])
    df.loc["Geometric mean [cm]", :] = stats.gmean(data.iloc[:, 0:4])
    df.loc["Arithmetic mean [cm]", :] = [i for i in data.mean()]

    # Operator ** means power() method
    # The shape attribute for numpy arrays returns the dimensions of the array
    # If Y has n rows and m columns, then Y.shape is (n,m). So Y.shape[0] is n
    df.loc["Power mean of order 2 [cm]", :] = [i for i in (
            ((data.iloc[:, 0:4] ** 2).sum() / data.shape[0]) ** (1 / 2))]
    df.loc["Power mean of order 3 [cm]", :] = [i for i in (
            ((data.iloc[:, 0:4] ** 3).sum() / data.shape[0]) ** (1 / 3))]

    df.loc["Variance [cm^2]", :] = [i for i in data.var()]
    df.loc["Standard deviation [cm]", :] = [i for i in data.std()]

    # If True, Fisher’s definition is used (normal ==> 0.0)
    # If False, Pearson’s definition is used (normal ==> 3.0)
    df.loc["Kurtosis", :] = stats.kurtosis(data.iloc[:, 0:4], fisher = False)

    pandas.set_option('display.max_rows', 1000)
    pandas.set_option('display.max_columns', 1000)
    pandas.set_option('display.width', 1000)
    print(df.astype(float).round(1))
Example #46
def compute_semeval_score(pearson_score, spearman_score):
    """
    Return NaN if a dataset can't be evaluated on a given frame. Return 0 if at least one similarity
    measure was 0 or negative. Otherwise, take a harmonic mean of a Pearson correlation coefficient
    and a Spearman correlation coefficient.
    """
    intervals = ['acc', 'low', 'high']
    scores = []
    for interval in intervals:
        if any(
            np.isnan(x) for x in [spearman_score[interval], pearson_score[interval]]
        ):
            scores.append(float('NaN'))
        elif any(x <= 0 for x in [spearman_score[interval], pearson_score[interval]]):
            scores.append(0)
        else:
            scores.append(hmean([spearman_score[interval], pearson_score[interval]]))

    return pd.Series(scores, index=intervals)
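An illustrative call with invented correlation values, keyed by the 'acc', 'low' and 'high' intervals the function expects; it assumes the example's own numpy, pandas and scipy imports.
pearson = {'acc': 0.60, 'low': 0.55, 'high': 0.65}
spearman = {'acc': 0.58, 'low': 0.50, 'high': 0.66}
print(compute_semeval_score(pearson, spearman))   # harmonic mean per interval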
Example #47
def calc_scores(final_probes, all_strings, l_c_ratio=1.):
    coverage_score, success_match_list = zip(*final_probes)
    length_score = [] # harmonic mean of the lengths
    final_score = []

    for cr, sm in final_probes:
        l = hmean(list(map(len,sm)))
        length_score.append(l)
        fs = final_score_function(cr,0,l,len(sm), l_c_ratio)
        final_score.append(fs)
    
    ret = {
        'final': final_score,
        'coverage': coverage_score,
        'overlap': [0]*len(coverage_score),
        'length': length_score,
        'match_list': success_match_list
    }
    return ret
Example #48
def evaluations2df(data_two_stage):
    if not isinstance(data_two_stage, np.ndarray):
        data_two_stage = np.array(data_two_stage)

    data = {
        'thr': data_two_stage[:, 0],
        'thr_exh': data_two_stage[:, 1],
        'TP': data_two_stage[:, 4].astype(np.int32),
        'FP': data_two_stage[:, 5].astype(np.int32),
        'TN': data_two_stage[:, 6].astype(np.int32),
        'FN': data_two_stage[:, 7].astype(np.int32),
        'precision': data_two_stage[:, 8],
        'recall': data_two_stage[:, 9],
        'F1': stats.hmean(np.ma.masked_equal(data_two_stage[:, 8:10], 0),
                          axis=1),
        'acc': data_two_stage[:, 10],
    }
    frame_two_stage = pd.DataFrame(data)
    return frame_two_stage
Example #49
def evaluate(stats: EvaluationStatistics) -> EvaluationResult:
    EPS = 1e-6

    precision = stats.tp / (stats.tp + stats.fp)
    recall = stats.tp / (stats.tp + stats.fn)
    precision[np.where(np.isnan(precision))] = 1
    recall[np.where(np.isnan(recall))] = 1

    accuracy = (stats.tp.sum() + stats.tn.sum()) / (
        stats.tp.sum() + stats.tn.sum() + stats.fp.sum() + stats.fn.sum())

    f1 = hmean(np.concatenate([precision, recall]) + EPS)

    predicted_dist = stats.predicted_count / stats.predicted_count.sum()
    actual_dist = stats.actual_count / stats.actual_count.sum()
    expected_accuracy = np.sum(predicted_dist * actual_dist)
    kappa = (accuracy - expected_accuracy) / (1 - expected_accuracy)

    return EvaluationResult(precision, recall, accuracy, f1, kappa)
Example #50
def main():
    # documents = load()
    sc = SentComparator()
    path = "../ClaimComparator/testLogicParseCorpus/"
    files = glob(os.path.join(path, "*logic.csv"))
    # w2v = sc.loadW2V()
    for filename in files:
        with open(filename, "r",
                  encoding='utf-8') as logic, open('results.txt',
                                                   'a',
                                                   encoding='utf-8') as res:
            print('Processing:', filename)
            if 'claim 1' in filename:
                target = 'Bolsanaro won the Brazilian election'
            elif 'claim 2' in filename:
                target = 'Climate change is predominantly caused by human activity'
            else:
                target = 'Michael Jordan is the greatest basketball player of all time'
            lines = logic.readlines()
            match = sc.oneToManyCompare(lines, target)
            res.write(filename + ': ' + str(match) + '\n')

    # print(sc.compare('The sandwich ate the rat', 'The rat ate the sandwich'))
    # print(documents['./testCorpus\claim 3-7.txt'][4])
    # posCCM = ClaimComparatorModel(documents['./testCorpus\claim 3-3.txt'][-1])
    # print(posCCM.go())
    # ['[', 'russell', 'may', 'have', 'won', 'more', 'championships', ']', '[', ',', 'Chamberlain', 'may', 'have',
    #  'averaged', '50', 'points', 'per', 'game', 'in', 'a', 'single', 'season', ']', '[', 'and', 'Kareem', 'may', 'be',
    #  'the', 'all-time', 'scoring', 'leader', ']', ',', '[', 'but', 'if', '[', 'you', 'take', 'into', 'consideration',
    #  'the', 'entire', 'package', ']', ',', 'Michael', 'Jordan', 'is', 'the', 'greatest', 'of', 'all', 'time', '.', ']']

    # negCCM = ClaimComparatorModel(documents['./testCorpus\claim 3-7.txt'][4])
    # print(negCCM.go())
    # ['[', 'tonight', 'is', 'the', 'commencement', 'of', 'the', 'fourth', 'installment', 'of', 'the', 'Warriors-Cavs',
    #  'Finals', ']', ',', '[', 'which', 'resurrects', 'the', 'biggest', 'debate', 'in', 'basketball', ':', 'Is',
    #  'LeBron', 'James', 'or', 'the', 'Bulls', "'", 'Michael', 'Jordan', 'the', 'Greatest', 'Player', 'of', 'All-Time',
    #  '?', ']']

    TP, FP, FN = LogicComparator.run_on_sts()
    precision = (1.0 * TP) / (TP + FP)
    recall = (1.0 * TP) / (TP + FN)
    print("Precision: {}\nRecall: {}\nF1 Score: {}".format(
        precision, recall, hmean([precision, recall])))
Example #51
    def cv_validation(self):
        from scipy import stats
        # self.network.load_state_dict(self.best_weights.state_dict())
        folds_accuracy = []

        self.network.eval()
        with torch.no_grad():
            for loader in self.folds_loaders:
                test_loss = 0.
                full_preds = []
                full_labels = []
                full_logits = []
                for data, target in loader:
                    data, target = data.to(self.device), target.to(self.device)
                    # bs, ncrops, c, h, w = data.size()
                    labels = target.data
                    if self.one_hot:
                        target = self._convert_int_onehot(target)

                    logits = self.network(data)

                    if self.prob_est:
                        logits = F.softmax(logits, dim=1)
                    full_logits.append(logits)

                    test_loss += self.loss_func(logits, target).item()
                    # get the index of the max log-probability
                    pred = logits.data.max(1, keepdim=True)[1]
                    # full_preds.extend(pred.view(-1).cpu())
                    full_preds.extend(pred.view(-1).cpu())
                    full_labels.extend(labels.cpu())

                test_accuracy = 100 * metrics.accuracy_score(
                    np.array(full_labels), np.array(full_preds))
                folds_accuracy.append(test_accuracy)

        hmean = stats.hmean(folds_accuracy)
        self.logger.info('# Task {} # Harmonic Accuracy: {:.2f}%'.format(
            self.task_id, hmean))
        result = {'id': self.task_id, 'acc': hmean}
        all_hmean = self.exploit_comm.allgather(result)

        return all_hmean
Example #52
def calculate_avg_condnum(input_set, qoi_set=None):
    r"""
    Given gradient vectors at some points (centers) in the input space and
    given a specific set of QoIs, calculate the average condition number of the
    matrices formed by the gradient vectors of each QoI map at each center.

    :param input_set: The input sample set.  Make sure the attribute _jacobians
        is not None
    :type input_set: :class:`~bet.sample.sample_set`
    :param list qoi_set: List of QoI indices

    :rtype: tuple
    :returns: (condnum, singvals) where condnum is a float and singvals
        has shape (num_centers, output_dim)

    """

    if input_set._jacobians is None:
        raise ValueError("You must have jacobians to use this method.")
    if qoi_set is None:
        G = input_set._jacobians
    else:
        G = input_set._jacobians[:, qoi_set, :]
    if G.shape[1] > G.shape[2]:
        msg = "Condition number is not defined for more outputs than inputs."
        msg += " Try adding a qoi_set to evaluate the condition number of."
        raise ValueError(msg)

    # Calculate the singular values of the matrix formed by the gradient
    # vectors of each QoI map.  This gives a set of singular values for each
    # center.
    singvals = np.linalg.svd(G, compute_uv=False)
    indz = singvals[:, -1] == 0
    if np.sum(indz) == singvals.shape[0]:
        hmean_condnum = np.inf
    else:
        singvals[indz, 0] = np.inf
        singvals[indz, -1] = 1
        condnums = singvals[:, 0] / singvals[:, -1]
        hmean_condnum = stats.hmean(condnums)

    return hmean_condnum, singvals
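A self-contained illustration of the core computation (random Jacobians standing in for a bet.sample.sample_set): the singular values at each center give a condition number, and hmean aggregates across centers.

import numpy as np
from scipy import stats

# random stand-in for input_set._jacobians, shape (num_centers, output_dim, input_dim)
rng = np.random.default_rng(0)
G = rng.standard_normal((5, 3, 4))

singvals = np.linalg.svd(G, compute_uv=False)   # shape (num_centers, output_dim)
condnums = singvals[:, 0] / singvals[:, -1]     # largest / smallest singular value
print(stats.hmean(condnums))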
Example #53
0
def computeF1_macro(confusion_matrix, matching, num_clusters):
    """
    Computes the macro F1 score.
    confusion_matrix : confusion matrix whose columns require permutation
    matching : the matching according to which the columns must be permuted
    """
    ## Permute the matrix columns
    permuted_confusion_matrix = np.zeros([num_clusters, num_clusters])
    for cluster in xrange(num_clusters):
        matched_cluster = matching[cluster]
        permuted_confusion_matrix[:, cluster] = confusion_matrix[:, matched_cluster]
    ## Compute the F1 score for every cluster
    F1_score = 0
    for cluster in xrange(num_clusters):
        TP = permuted_confusion_matrix[cluster, cluster]
        FP = np.sum(permuted_confusion_matrix[:, cluster]) - TP
        FN = np.sum(permuted_confusion_matrix[cluster, :]) - TP
        precision = TP / (TP + FP)
        recall = TP / (TP + FN)
        f1 = stats.hmean([precision, recall])
        F1_score += f1
    F1_score /= num_clusters
    return F1_score
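A small, self-contained illustration of the column-permutation step, using a made-up 2x2 confusion matrix and a matching that swaps the two clusters:

import numpy as np

confusion_matrix = np.array([[2., 8.],
                             [9., 1.]])
matching = {0: 1, 1: 0}   # cluster 0 matches column 1 and vice versa
num_clusters = 2

permuted = np.zeros([num_clusters, num_clusters])
for cluster in range(num_clusters):
    permuted[:, cluster] = confusion_matrix[:, matching[cluster]]
print(permuted)   # [[8. 2.], [1. 9.]] -- the diagonal now holds the true positives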
Example #54
0
def token_similarity(string1, string2):
    set1 = set(normalize(string1).split())
    set2 = set(normalize(string2).split())
    sims = []
    diffs = []
    for item1 in set1:
        for item2 in set2:
            sim = 1 - Levenshtein.distance(item1, item2) / (len(item1) + len(item2))
            if sim > _tksim_threshold:
                sims.append(item1)
                sims.append(item2)
    sims = list(set(sims))
    for item1 in set1:
        if item1 not in sims:
            diffs.append(item1)
    for item2 in set2:
        if item2 not in sims:
            diffs.append(item2)
    diffs = list(set(diffs))
    sim = len(sims) / (len(sims) + len(diffs)) if len(sims) + len(diffs) > 0 else 0
    diff = 1 - (len(diffs) / (len(sims) + len(diffs)) if len(sims) + len(diffs) > 0 else 0)
    scores = [sim, diff]
    return stats.hmean(scores) if all([s > 0 for s in scores]) else -sys.float_info.max
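For context, a self-contained sketch of the guard on the last line (guarded_hmean is a hypothetical helper, not part of the snippet): scipy.stats.hmean is only defined for strictly positive values, so a zero score falls through to the sentinel.

import sys
from scipy import stats

def guarded_hmean(scores):
    # hmean requires strictly positive inputs; otherwise return the sentinel
    return stats.hmean(scores) if all(s > 0 for s in scores) else -sys.float_info.max

print(guarded_hmean([0.6, 0.8]))   # ~0.686
print(guarded_hmean([0.6, 0.0]))   # -1.7976931348623157e+308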
Example #55
0
def harmonic_mean(value, *args, **kwargs):
    """
    Wrapper for :func:`scipy.stats.hmean`
    """
    return hmean(value, *args, **kwargs)
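For reference, a couple of direct calls showing what the *args/**kwargs pass-through forwards to scipy.stats.hmean (for example, the axis keyword):

from scipy.stats import hmean

print(hmean([1, 2, 4]))                  # 3 / (1/1 + 1/2 + 1/4) = 12/7
print(hmean([[1, 2], [4, 8]], axis=0))   # per-column harmonic means: [1.6, 3.2]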
Example #56
0
from pylab import *
from scipy import stats

# generate a normal distribution sample with 100 elements
sample = np.random.randn(100)

# harmonic mean
out = stats.hmean(sample[sample > 0])
print "Harmonic mean = " + str(out)

# the mean, where values below -1 and above 1 are removed
out = stats.tmean(sample, limits = (-1, 1))
print "Trimmed mean = " + str(out)

# calculate the skewness of the sample
out = stats.skew(sample)
print "Skewness = " + str(out)

out = stats.describe(sample)
print out
Example #57
0
	# print out some information
	print "Number of introns: %d" % intron_count
	print "Repeat count: %d" % repeat_count
	print "Repeat length: %d" % repeat_length
	print "Repeats per intron: %.2f" % (float(repeat_count) / float(intron_count))
	print "Number of introns containing repeats: %d" % (len(unique_intron_sizes) - len(non_rep_intron_sizes))
	print "Raw unique intron sizes outputted to %s" % output
	print "\nNon repeat introns:"
	print "Count:", len(non_rep_intron_sizes)
	print "Max:", max(non_rep_intron_sizes)
	print "Min:", min(non_rep_intron_sizes)
	print "Mean: %.2f" % numpy.mean(non_rep_intron_sizes)
	print "Median: %d" % numpy.median(non_rep_intron_sizes)
	print "Mode: %d" % stats.mode(non_rep_intron_sizes)[0]
	print "GMean: %.2f" % stats.gmean(non_rep_intron_sizes)
	print "HMean: %.2f" % stats.hmean(non_rep_intron_sizes)
	print "\nRepeat introns:"
	print "Count:", len(rep_intron_sizes)
	print "Max:", max(rep_intron_sizes)
	print "Min:", min(rep_intron_sizes)
	print "Mean: %.2f" % numpy.mean(rep_intron_sizes)
	print "Median %d" % numpy.median(rep_intron_sizes)
	print "Mode: %d" % stats.mode(rep_intron_sizes)[0]
	print "GMean: %.2f" % stats.gmean(rep_intron_sizes)
	print "HMean: %.2f" % stats.hmean(rep_intron_sizes)
	
	intron_count = 0
	repeat_count = 0
	repeat_length = 0
	cum_rep_len = 0
	header = 0
Example #58
0
def combine_harmonic(lst):
	if len(lst) == 1:
		return lst[0]
	return stats.hmean(np.array(lst))
Example #59
0
def stackedTrain(X, y, projectName, scoring):

    print ("\nTraining stacked...")
    model_dir = "models"
    basePath = os.path.join(model_dir, projectName)
    models = os.listdir(basePath)

    df = pd.DataFrame()
    #y = pd.DataFrame(data = y, columns = ["y"])
    skipName = "ensemble"

    for model_name in models:
        model_name_base = model_name.split(".")[0]
        suffix          = model_name.split(".")[1]
        if model_name_base != skipName and suffix == "sav":
            print ("\n" + model_name_base)
            
            path = os.path.join(basePath, model_name)
            model = pickle.load(open(path, "rb"))

            y_hat = model.predict(X)
            df[model_name_base] = y_hat

            tn, fp, fn, tp = confusion_matrix(y, y_hat).ravel()
            
            n = tn + fp + fn + tp
            precision = tp / (tp + fp)
            recall = tp / (tp + fn)
            accuracy = (tp + tn) / n
            f1 = stats.hmean([precision, recall])

            print (accuracy)
            
            path = os.path.join("models", projectName, model_name_base + ".txt")
            f = open(path, "w")
            
            f.write("N:\t\t" + str(n))
            f.write("\n\nTrue positive:\t" + str(tp) + "\t(" + str(tp/n) + ")")
            f.write("\nTrue negative:\t" + str(tn) + "\t(" + str(tn/n) + ")")
            f.write("\nFalse positive:\t" + str(fp) + "\t(" + str(fp/n) + ")")
            f.write("\nFalse negative:\t" + str(fn) + "\t(" + str(fn/n) + ")")
            
            f.write("\n\nAccuracy:\t" + str(accuracy))
            
            f.write("\n\nPrecision:\t" + str(precision))
            f.write("\nRecall:\t\t" + str(recall))
            f.write("\nF1:\t\t" + str(f1))
            
            f.close()

    kSplits = 2
    param_grid = {}
    model = RandomForestClassifier()            

    #transformPipeline = getTransformPipeline()
    #pipelineArray = transformPipeline[:]
    #pipelineArray.append(("clf", model))
    #pipeline = Pipeline(pipelineArray)
       
    grid_search = GridSearchCV(model, param_grid = param_grid, cv = kSplits, verbose = 2, scoring = scoring)
    grid_search.fit(df, y)
    bestParameters = grid_search.best_params_

    model.set_params(**bestParameters)

    model.fit(df, y)

    path = os.path.join("models", projectName, skipName + ".sav")
    f = open(path, "wb")
    pickle.dump(model, f)
    f.close()  
    return
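A self-contained check of the metric computation used inside the loop (toy labels, not project data): sklearn's confusion_matrix(...).ravel() unpacks as (tn, fp, fn, tp) for binary labels, and hmean of precision and recall reproduces sklearn's f1_score.

import numpy as np
from scipy import stats
from sklearn.metrics import confusion_matrix, f1_score

y_true = np.array([0, 0, 1, 1, 1, 0, 1, 0])
y_hat  = np.array([0, 1, 1, 1, 0, 0, 1, 0])

tn, fp, fn, tp = confusion_matrix(y_true, y_hat).ravel()
precision = tp / (tp + fp)
recall = tp / (tp + fn)
print(stats.hmean([precision, recall]))   # 0.75
print(f1_score(y_true, y_hat))            # 0.75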
Example #60
0
res = {}
ys = []
for t in cfg.futureGens:
    y = cfg.A * math.cos(2 * math.pi * (t - cfg.seasonGen) / cfg.T) + cfg.B
    res[t] = {}
    #print t, y

for numIndivs, numLoci in cfg.sampleStrats:
    fname = myUtils.getStatName(cfg, numIndivs, numLoci)
    for rec in myUtils.getStat(open(fname)):
        if rec["type"] != "temp":
            continue
        g1l = res[rec["g2"]].setdefault(rec["g1"], [])
        g1l.append(rec[tempStat][-1])

xs = res.keys()
xs.sort()
plt = {}
for x in xs:
    for g1 in res[x].keys():
        g1l = plt.setdefault(g1, [])
        g1l.append(stats.hmean([v if v >= 0 else 100000 for v in res[x][g1]]))
print len(xs), plt.keys(), len(plt[56])
pylab.title(tempStat)
for g1, vals in plt.items():
    print g1
    pylab.plot(xs, vals, label=str(g1))
pylab.legend()
pylab.show()
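A short self-contained check of the sentinel used in the hmean call above: the negative estimates are presumably placeholders for "effectively infinite" values, and capping them at a large number makes their reciprocals negligible, so the exact cap barely changes the harmonic mean.

from scipy import stats

raw = [40.0, 55.0, -1.0]   # made-up estimates; the negative one stands in for "infinite"
capped_small = [v if v >= 0 else 100000.0 for v in raw]
capped_large = [v if v >= 0 else 1e9 for v in raw]
print(stats.hmean(capped_small))   # ~69.5
print(stats.hmean(capped_large))   # ~69.5 -- the exact cap hardly matters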