Code Example #1
    def test_weightstats_ddof_tests(self):
        # explicit test that ttest and confint are independent of ddof
        # one sample case
        x1_2d = self.x1_2d
        w1 = self.w1

        d1w_d0 = DescrStatsW(x1_2d, weights=w1, ddof=0)
        d1w_d1 = DescrStatsW(x1_2d, weights=w1, ddof=1)
        d1w_d2 = DescrStatsW(x1_2d, weights=w1, ddof=2)

        #check ttest independent of user ddof
        res0 = d1w_d0.ttest_mean()
        res1 = d1w_d1.ttest_mean()
        res2 = d1w_d2.ttest_mean()
        # concatenate into one array with np.r_
        assert_almost_equal(np.r_[res1], np.r_[res0], 14)
        assert_almost_equal(np.r_[res2], np.r_[res0], 14)

        res0 = d1w_d0.ttest_mean(0.5)
        res1 = d1w_d1.ttest_mean(0.5)
        res2 = d1w_d2.ttest_mean(0.5)
        assert_almost_equal(np.r_[res1], np.r_[res0], 14)
        assert_almost_equal(np.r_[res2], np.r_[res0], 14)

        #check confint independent of user ddof
        res0 = d1w_d0.tconfint_mean()
        res1 = d1w_d1.tconfint_mean()
        res2 = d1w_d2.tconfint_mean()
        assert_almost_equal(res1, res0, 14)
        assert_almost_equal(res2, res0, 14)
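A note on what this test pins down: the user-supplied ddof changes the reported var and std, but ttest_mean and tconfint_mean are insensitive to it. A minimal self-contained sketch of the same property (assuming only numpy and statsmodels; the data here are made up):

import numpy as np
from statsmodels.stats.weightstats import DescrStatsW

x = np.array([1.0, 2.0, 2.5, 3.0, 4.0])
w = np.array([1, 2, 1, 3, 1])
for ddof in (0, 1, 2):
    d = DescrStatsW(x, weights=w, ddof=ddof)
    # var changes with ddof; the t-test and confidence interval do not
    print(ddof, round(d.var, 4), d.ttest_mean(2.0)[:2], d.tconfint_mean())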
Code Example #2
def weighted_stat(stock_trading_df):
    index = ['price_var', 'price_std', 'price_mean',
             'price_min', 'price_max', 'no_of_txn', 'turnover']
    if stock_trading_df.shape[0] == 1:
        # single trade: no dispersion, all price stats equal the lone price
        price = stock_trading_df['price'].iloc[0]
        return pd.Series(
            [0, 0, price, price, price, 1,
             stock_trading_df['turnover'].iloc[0]],
            index=index)
    else:
        # build the volume-weighted stats object once instead of once per field
        dsw = DescrStatsW(stock_trading_df['price'],
                          stock_trading_df['volume'])
        return pd.Series(
            [dsw.var, dsw.std, dsw.mean,
             stock_trading_df['price'].min(),
             stock_trading_df['price'].max(),
             stock_trading_df.shape[0],
             stock_trading_df['turnover'].sum()],
            index=index)
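In practice this helper would presumably be applied per stock via groupby; a hedged usage sketch (the frame and the column names stock_code, price, volume, turnover are assumptions inferred from the code above):

import pandas as pd
from statsmodels.stats.weightstats import DescrStatsW

trades = pd.DataFrame({
    'stock_code': ['A', 'A', 'B'],
    'price': [10.0, 10.5, 20.0],
    'volume': [100, 300, 50],
    'turnover': [1000.0, 3150.0, 1000.0],
})
stats_df = trades.groupby('stock_code').apply(weighted_stat)  # one row of stats per stock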
Code Example #3
def test_ztest_ztost():
    # compare weightstats with separately tested proportion ztest ztost
    import statsmodels.stats.proportion as smprop

    x1 = [0, 1]
    w1 = [5, 15]

    res2 = smprop.proportions_ztest(15, 20., value=0.5)
    d1 = DescrStatsW(x1, w1)
    res1 = d1.ztest_mean(0.5)
    assert_allclose(res1, res2, rtol=0.03, atol=0.003)

    d2 = DescrStatsW(x1, np.array(w1) * 21. / 20)
    res1 = d2.ztest_mean(0.5)
    assert_almost_equal(res1, res2, decimal=12)

    res1 = d2.ztost_mean(0.4, 0.6)
    res2 = smprop.proportions_ztost(15, 20., 0.4, 0.6)
    assert_almost_equal(res1[0], res2[0], decimal=12)

    x2 = [0, 1]
    w2 = [10, 10]
    #d2 = DescrStatsW(x1, np.array(w1)*21./20)
    d2 = DescrStatsW(x2, w2)
    res1 = ztest(d1.asrepeats(), d2.asrepeats())
    res2 = smprop.proportions_chisquare(np.asarray([15, 10]),
                                        np.asarray([20., 20]))
    #TODO: check whether this difference is expected; see test_proportion
    assert_allclose(res1[1], res2[1], rtol=0.03)

    res1a = CompareMeans(d1, d2).ztest_ind()
    assert_allclose(res1a[1], res2[1], rtol=0.03)
    assert_almost_equal(res1a, res1, decimal=12)
Code Example #4
    def setup_class(cls):
        cls.x1 = np.array(
            [7.8, 6.6, 6.5, 7.4, 7.3, 7., 6.4, 7.1, 6.7, 7.6, 6.8])
        cls.x2 = np.array([4.5, 5.4, 6.1, 6.1, 5.4, 5., 4.1, 5.5])
        cls.d1 = DescrStatsW(cls.x1)
        cls.d2 = DescrStatsW(cls.x2)
        cls.cm = CompareMeans(cls.d1, cls.d2)
Code Example #5
    def test_weightstats_2(self):
        x1, x2 = self.x1, self.x2
        w1, w2 = self.w1, self.w2

        d1 = DescrStatsW(x1)
        d1w = DescrStatsW(x1, weights=w1)
        d2w = DescrStatsW(x2, weights=w2)
        x1r = d1w.asrepeats()
        x2r = d2w.asrepeats()
        #        print 'random weights'
        #        print ttest_ind(x1, x2, weights=(w1, w2))
        #        print stats.ttest_ind(x1r, x2r)
        assert_almost_equal(
            ttest_ind(x1, x2, weights=(w1, w2))[:2], stats.ttest_ind(x1r, x2r),
            14)
        #not the same as new version with random weights/replication
        #        assert x1r.shape[0] == d1w.sum_weights
        #        assert x2r.shape[0] == d2w.sum_weights

        assert_almost_equal(x2r.mean(0), d2w.mean, 14)
        assert_almost_equal(x2r.var(), d2w.var, 14)
        assert_almost_equal(x2r.std(), d2w.std, 14)
        #note: the following is for 1d
        assert_almost_equal(np.cov(x2r, bias=1), d2w.cov, 14)
        #assert_almost_equal(np.corrcoef(np.x2r), d2w.corrcoef, 19)
        #TODO: exception in corrcoef (scalar case)

        #one-sample tests
        #        print d1.ttest_mean(3)
        #        print stats.ttest_1samp(x1, 3)
        #        print d1w.ttest_mean(3)
        #        print stats.ttest_1samp(x1r, 3)
        assert_almost_equal(d1.ttest_mean(3)[:2], stats.ttest_1samp(x1, 3), 11)
        assert_almost_equal(
            d1w.ttest_mean(3)[:2], stats.ttest_1samp(x1r, 3), 11)
Code Example #6
    def test_weightstats_3(self):
        x1_2d, x2_2d = self.x1_2d, self.x2_2d
        w1, w2 = self.w1, self.w2

        d1w_2d = DescrStatsW(x1_2d, weights=w1)
        d2w_2d = DescrStatsW(x2_2d, weights=w2)
        x1r_2d = d1w_2d.asrepeats()
        x2r_2d = d2w_2d.asrepeats()

        assert_almost_equal(x2r_2d.mean(0), d2w_2d.mean, 14)
        assert_almost_equal(x2r_2d.var(0), d2w_2d.var, 14)
        assert_almost_equal(x2r_2d.std(0), d2w_2d.std, 14)
        assert_almost_equal(np.cov(x2r_2d.T, bias=1), d2w_2d.cov, 14)
        assert_almost_equal(np.corrcoef(x2r_2d.T), d2w_2d.corrcoef, 14)

        #        print d1w_2d.ttest_mean(3)
        #        #scipy.stats.ttest is also vectorized
        #        print stats.ttest_1samp(x1r_2d, 3)
        t, p, d = d1w_2d.ttest_mean(3)
        assert_almost_equal([t, p], stats.ttest_1samp(x1r_2d, 3), 11)
        #print [stats.ttest_1samp(xi, 3) for xi in x1r_2d.T]
        cm = CompareMeans(d1w_2d, d2w_2d)
        ressm = cm.ttest_ind()
        resss = stats.ttest_ind(x1r_2d, x2r_2d)
        assert_almost_equal(ressm[:2], resss, 14)
Code Example #7
    def fit(self):
        """Once the exposure and outcome models are specified, we can estimate the risk ratio and risk difference.

        Returns
        -------
        Sets the `risk_difference`, `risk_difference_ci`, and `risk_ratio`
        attributes (`average_treatment_effect` and its confidence interval
        when the outcome is continuous)
        """
        if (self._fit_exposure_ is False) or (self._fit_outcome_ is False):
            raise ValueError(
                'The exposure and outcome models must be specified before the doubly robust estimate can '
                'be generated')

        # Doubly robust estimator under all treated
        a_obs = self.df[self.exposure]
        y_obs = self.df[self.outcome]
        ps = self.df['_ps_']
        py_a1 = self.df['_pY1_']
        py_a0 = self.df['_pY0_']
        dr_a1 = np.where(a_obs == 1, (y_obs / ps) - ((py_a1 * (1 - ps)) / ps),
                         py_a1)

        # Doubly robust estimator under all untreated
        dr_a0 = np.where(a_obs == 1, py_a0,
                         (y_obs / (1 - ps) - ((py_a0 * ps) / (1 - ps))))

        # Generating estimates for the risk difference and risk ratio
        zalpha = norm.ppf(1 - self.alpha / 2, loc=0, scale=1)

        if self._weight_ is None:
            if self._continuous_outcome:
                self.average_treatment_effect = np.mean(dr_a1) - np.mean(dr_a0)
                var_ic = np.var(
                    (dr_a1 - dr_a0) - self.average_treatment_effect,
                    ddof=1) / self.df.shape[0]
                self.average_treatment_effect_se = np.sqrt(var_ic)
                self.average_treatment_effect_ci = [
                    self.average_treatment_effect - zalpha * np.sqrt(var_ic),
                    self.average_treatment_effect + zalpha * np.sqrt(var_ic)
                ]

            else:
                self.risk_difference = np.mean(dr_a1) - np.mean(dr_a0)
                self.risk_ratio = np.mean(dr_a1) / np.mean(dr_a0)
                var_ic = np.var((dr_a1 - dr_a0) - self.risk_difference,
                                ddof=1) / self.df.shape[0]
                self.risk_difference_se = np.sqrt(var_ic)
                self.risk_difference_ci = [
                    self.risk_difference - zalpha * np.sqrt(var_ic),
                    self.risk_difference + zalpha * np.sqrt(var_ic)
                ]
        else:
            dr_m1 = DescrStatsW(dr_a1, weights=self.df[self._weight_]).mean
            dr_m0 = DescrStatsW(dr_a0, weights=self.df[self._weight_]).mean

            if self._continuous_outcome:
                self.average_treatment_effect = dr_m1 - dr_m0
            else:
                self.risk_difference = dr_m1 - dr_m0
                self.risk_ratio = dr_m1 / dr_m0
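For reference, the dr_a1 and dr_a0 arrays above are the standard AIPW (doubly robust) pseudo-outcomes. Writing $A_i$ for the observed exposure `a_obs`, $Y_i$ for the observed outcome `y_obs`, $\hat\pi_i$ for the propensity score `_ps_`, and $\hat m_1, \hat m_0$ for the outcome-model predictions `_pY1_`, `_pY0_`, the two np.where expressions are algebraically equivalent to

$$DR_i^{a=1} = \frac{A_i Y_i}{\hat\pi_i} - \frac{\hat m_1(X_i)\,(A_i - \hat\pi_i)}{\hat\pi_i}, \qquad DR_i^{a=0} = \frac{(1 - A_i)\,Y_i}{1 - \hat\pi_i} + \frac{\hat m_0(X_i)\,(A_i - \hat\pi_i)}{1 - \hat\pi_i},$$

and the unweighted risk difference is the sample mean of $DR_i^{a=1} - DR_i^{a=0}$.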
Code Example #8
File: Ball.py Project: svejlgaard/AppStat2020Project
    def get_angle(self, flip=True):
        angles = get_angles()

        angle_dict = dict()
        for a in ['r', 'l']:
            phi = angles[[f'inc_{a}']]
            # renamed from `flip` to avoid shadowing the method argument
            phi_flip = angles[[f'inc_flip_{a}']]
            err = 1

            phi_des = DescrStatsW(phi, weights=1/(np.array([err, err, err]))**2)
            phi_mean = phi_des.mean
            phi_mean_err = phi_des.std / np.sqrt(3)

            flip_des = DescrStatsW(phi_flip, weights=1/(np.array([err, err, err]))**2)
            flip_mean = flip_des.mean
            flip_mean_err = flip_des.std / np.sqrt(3)

            true_phi = (phi_mean + flip_mean) / 2

            true_phi_err = np.sqrt(phi_mean_err**2 + flip_mean_err**2) / 2
            
            angle_dict.update({f'phi_{a}': [true_phi, true_phi_err]})
        

        self.phi_r_mu = angle_dict['phi_r'][0]
        self.phi_r_std = angle_dict['phi_r'][1]


        self.phi_l_mu = angle_dict['phi_l'][0]
        self.phi_l_std = angle_dict['phi_l'][1]

        self.dphi_mu_g = (self.phi_r_mu - self.phi_l_mu) / 2
        self.dphi_std_g = np.sqrt(self.phi_r_std**2 + self.phi_l_std**2)
        self.dphi_mu_err_g = self.dphi_std_g / np.sqrt(2) + 0.05


        upper = (self.a_dict['a_L'][0] - self.a_dict['a_R'][0]) * np.sin(np.radians(angle_dict[f'phi_{self.orientation.lower()}'][0]))
        lower = (self.a_dict['a_L'][0] + self.a_dict['a_R'][0]) * np.cos(np.radians(angle_dict[f'phi_{self.orientation.lower()}'][0]))
        
        self.dphi_mu_a = upper / lower
        
        x = np.radians(angle_dict[f'phi_{self.orientation.lower()}'][0])
        L = self.a_dict['a_L'][0]
        R = self.a_dict['a_R'][0]

        err1 = (np.sin(x)/((L+R)*np.cos(x))-(L-R)*np.sin(x)/((L+R)**2*np.cos(x)))**2 * self.phi_l_std**2
        err2 = (-np.sin(x)/((L+R)*np.cos(x))-(L-R)*np.sin(x)/((L+R)**2*np.cos(x)))**2 * self.phi_r_std**2
        err3 = ((L-R)/(L+R)+(L-R)*np.sin(x)**2/((L+R)*np.cos(x)**2))**2 * angle_dict[f'phi_{self.orientation.lower()}'][1]**2
        
        self.dphi_std_a = np.sqrt(err1 + err2 + err3) + 0.05
        
        combined_dphi = DescrStatsW([self.dphi_mu_g, self.dphi_mu_a], weights=1/(np.array([self.dphi_mu_err_g, self.dphi_std_a]))**2)


        self.dphi_mu = combined_dphi.mean
        self.dphi_std = combined_dphi.std / np.sqrt(2)

        self.get_chi2(f'dphi_{self.orientation}', np.array([self.dphi_mu_a,self.dphi_mu_g]), np.array([self.dphi_std_a, self.dphi_mu_err_g]))
Code Example #9
def _similarity_helper_limited(user1_id, user2_id, solr):
    user1_vector, user2_vector, weight_vector = get_vector_limited(
        user1_id, user2_id)

    data = column_stack((user1_vector, user2_vector))
    result = DescrStatsW(data).corrcoef[1][0]  # unweighted Pearson correlation
    result2 = DescrStatsW(data, weights=weight_vector).corrcoef[1][0]

    print('Pearson similarity ' + str(user2_id) + ' ' + str(result))
    print('DescrStatsW ' + str(user2_id) + ' ' + str(result2))
Code Example #10
def trades_to_bar(ticks: pd.DataFrame, bar_trigger: str='fixed') -> dict:
    
    if not isinstance(ticks, pd.DataFrame):
        ticks = pd.DataFrame(ticks)
    
    bar = {'bar_trigger': bar_trigger}
    # time
    bar['open_at'] = ticks['utc_dt'].iloc[0]
    bar['close_at'] = ticks['utc_dt'].iloc[-1]
    bar['duration_td'] = bar['close_at'] - bar['open_at']
    # volume
    bar['tick_count'] = ticks.shape[0]
    bar['volume'] = ticks.volume.sum()
    bar['dollars'] = (ticks.volume * ticks.price).sum()
    # price
    bar['price_open'] = ticks.price.values[0]
    bar['price_close'] = ticks.price.values[-1]
    bar['price_low'] = ticks.price.min()
    bar['price_high'] = ticks.price.max()
    bar['price_range'] = bar['price_high'] - bar['price_low']
    bar['price_return'] = bar['price_close'] - bar['price_open']
    # volume weighted price
    dsw = DescrStatsW(data=ticks.price, weights=ticks.volume)
    qtiles = dsw.quantile(probs=[0.1, 0.5, 0.9]).values
    bar['price_wq10'] = qtiles[0]
    bar['price_wq50'] = qtiles[1]
    bar['price_wq90'] = qtiles[2]
    bar['price_wq_range'] = bar['price_wq90'] - bar['price_wq10']
    bar['price_wmean'] = dsw.mean
    bar['price_wstd'] = dsw.std
    # jma
    bar['jma_open'] = ticks.jma.values[0]
    bar['jma_close'] = ticks.jma.values[-1]
    bar['jma_low'] = ticks.jma.min()
    bar['jma_high'] = ticks.jma.max()
    bar['jma_range'] = bar['jma_high'] - bar['jma_low']
    bar['jma_return'] = bar['jma_close'] - bar['jma_open']
    # volume weighted jma
    dsw = DescrStatsW(data=ticks.jma, weights=ticks.volume)
    qtiles = dsw.quantile(probs=[0.1, 0.5, 0.9]).values
    bar['jma_wq10'] = qtiles[0]
    bar['jma_wq50'] = qtiles[1]
    bar['jma_wq90'] = qtiles[2]
    bar['jma_wq_range'] = bar['jma_wq90'] - bar['jma_wq10']
    bar['jma_wmean'] = dsw.mean
    bar['jma_wstd'] = dsw.std
    # tick/vol/dollar/imbalance
    bar['tick_imbalance'] = ticks.side.sum()
    bar['volume_imbalance'] = (ticks.volume * ticks.side).sum()
    bar['dollar_imbalance'] = (ticks.volume * ticks.price * ticks.side).sum()

    return bar
Code Example #11
def title_len_stat(mongo_db):
    len_counter_db = collections.Counter()
    len_counter_cr = collections.Counter()
    for col_name in mongo_db.collection_names():
        if col_name not in PAPER_COLLECTIONS:
            continue
        col = mongo_db[col_name]
        query_w_doi = col.find({'doi': {'$exists': True}})
        for doc in query_w_doi:
            if ('metadata' in doc and 'title' in doc['metadata']
                    and isinstance(doc['metadata']['title'], str)):
                len_counter_db[len(doc['metadata']['title'])] += 1
            if ('crossref_raw_result' in doc
                    and 'title' in doc['crossref_raw_result']
                    and isinstance(doc['crossref_raw_result']['title'], list)
                    and len(doc['crossref_raw_result']['title']) == 1):
                len_counter_cr[len(
                    doc['crossref_raw_result']['title'][0])] += 1

    # stat for db titles
    sorted_len = sorted(len_counter_db.keys())
    weights = [len_counter_db[l] for l in sorted_len]
    weighted_stats = DescrStatsW(sorted_len, weights=weights)
    sns.barplot(sorted_len, weights)
    percentile = weighted_stats.quantile(probs=[
        0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.1, 0.25, 0.5, 0.75, 0.95, 0.97, 0.99
    ])
    print('len_counter_db')
    pprint(len_counter_db)
    print('weighted_stats.mean', weighted_stats.mean)
    print('weighted_stats.std', weighted_stats.std)
    print('percentile')
    print(percentile)

    # stat for cr titles
    sorted_len = sorted(len_counter_cr.keys())
    weights = [len_counter_cr[l] for l in sorted_len]
    weighted_stats = DescrStatsW(sorted_len, weights=weights)
    #     sns.barplot(sorted_len, weights)
    percentile = weighted_stats.quantile(probs=[
        0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.1, 0.25, 0.5, 0.75, 0.95, 0.97, 0.99
    ])
    print('len_counter_cr')
    pprint(len_counter_cr)
    print('weighted_stats.mean', weighted_stats.mean)
    print('weighted_stats.std', weighted_stats.std)
    print('percentile')
    print(percentile)

    return len_counter_db, len_counter_cr
Code Example #12
    def angular_linear_correlation(self,
                                   angles,
                                   data,
                                   weights=None,
                                   double_peak=False):
        """
        This function computes an angular-linear correlation.
        When expecting the data to have two symmetrical, opposite peaks (e.g.
        a non-direction selective effect such as horizontal vs. vertical
        instead of up vs down), double_peak should be set to True.

        :param angles: input angles
        :type angles: 1-D array
        :param data: input data
        :type data: 1-D array, same shape as angles
        :param weights: weights to use for correlation
        :type weights: 1-D array, same shape as angles
        :param double_peak: when True, angles are doubled
        :type double_peak: bool

        :return corr: circular correlation
        :type corr: float

        """

        # set weights to one
        if weights is None:
            weights = np.ones_like(angles)

        # In cases of expected periodicity (e.g. data peaks at two opposite angles),
        # the angular data should be scaled:
        if double_peak:
            angles = self.collapse_angles_symmetrically(angles)

        # use formula from the pycircstat package to calculate circular correlation:
        rxs = DescrStatsW(data=np.vstack([data, np.sin(angles)]).T,
                          weights=weights).corrcoef[0, 1]
        rxc = DescrStatsW(data=np.vstack([data, np.cos(angles)]).T,
                          weights=weights).corrcoef[0, 1]
        rcs = DescrStatsW(data=np.vstack([np.sin(angles),
                                          np.cos(angles)]).T,
                          weights=weights).corrcoef[0, 1]

        # rxs = self.functions.wpearson(these_data,np.sin(doubled_angles),weights)
        # rxc = self.functions.wpearson(these_data,np.cos(doubled_angles),weights)
        # rcs = self.functions.wpearson(np.sin(doubled_angles),np.cos(doubled_angles),weights)
        # compute angular-linear correlation (equ. 27.47)
        corr = np.sqrt((rxc**2 + rxs**2 - 2 * rxc * rxs * rcs) / (1 - rcs**2))

        return corr
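A quick sanity check (hypothetical usage; `obj` stands in for whatever instance exposes this method) is to feed in data with a known single-peaked angular dependence and confirm the correlation comes out near 1:

import numpy as np

rng = np.random.default_rng(0)
angles = rng.uniform(0, 2 * np.pi, 500)
data = np.cos(angles - 0.7) + 0.1 * rng.standard_normal(500)
# corr = obj.angular_linear_correlation(angles, data)  # expect corr close to 1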
Code Example #13
    def setup_class(self):
        np.random.seed(9876789)
        n1, n2 = 20, 30
        m1, m2 = 1, 1.2
        x1 = m1 + np.random.randn(n1, 3)
        x2 = m2 + np.random.randn(n2, 3)
        w1 = np.random.randint(1, 4, n1)
        w2 = np.random.randint(1, 4, n2)

        self.x1, self.x2 = x1, x2
        self.w1, self.w2 = w1, w2
        self.d1w = DescrStatsW(x1, weights=w1, ddof=0)
        self.d2w = DescrStatsW(x2, weights=w2, ddof=1)
        self.x1r = self.d1w.asrepeats()
        self.x2r = self.d2w.asrepeats()
Code Example #14
    def _get_weighted_stats(self, X, y, weights):
        """Gets the weighted mean and standard deviation for each variable
        in X and y, based on an array of weights."""
        # use the arguments rather than instance attributes, matching the
        # signature and docstring
        Xw_stat_obj = DescrStatsW(X, weights=weights, ddof=1)

        # Weighted standard deviation for X vars:
        std_Xw = np.sqrt(np.abs(Xw_stat_obj.var_ddof(1)))  # abs for w_sum < 1
        mean_Xw = Xw_stat_obj.mean  # NumPy array, shape: (regressors,)

        yw_stat_obj = DescrStatsW(y, weights=weights, ddof=1)
        # Weighted standard deviation for y:
        std_yw = np.sqrt(np.abs(yw_stat_obj.var_ddof(1)))  # abs for w_sum < 1
        mean_yw = yw_stat_obj.mean

        return mean_Xw, mean_yw, std_Xw, std_yw
Code Example #15
    def setup_class(cls):
        np.random.seed(9876789)
        n1, n2 = 20, 20
        m1, m2 = 1, 1.2
        x1 = m1 + np.random.randn(n1, 3)
        x2 = m2 + np.random.randn(n2, 3)
        w1 = np.random.randint(1, 4, n1)
        w2 = np.random.randint(1, 4, n2)

        cls.x1, cls.x2 = x1, x2
        cls.w1, cls.w2 = w1, w2
        cls.d1w = DescrStatsW(x1, weights=w1, ddof=1)
        cls.d2w = DescrStatsW(x2, weights=w2, ddof=1)
        cls.x1r = cls.d1w.asrepeats()
        cls.x2r = cls.d2w.asrepeats()
Code Example #16
def globaldepth(coverage_hist):

    coverage_hist['cumsum'] = 1 - coverage_hist.frequency.cumsum()

    weighted_stats = DescrStatsW(coverage_hist.DP - 1,
                                 weights=coverage_hist.BPs,
                                 ddof=0)
    # how does this differ from coverage_hist.DP.mean()?

    global_depth = {}
    b, bases_20x, depth_20X = depth_fraction(coverage_hist, thr=20)
    global_depth.update({'bases_totales': int(b)})

    global_depth.update({'mean_DP': round(weighted_stats.mean, signif)})
    global_depth.update({'median_DP': weighted_stats.quantile(0.5).values[0]})
    #global_depth.update({'std_DP':round(weighted_stats.std,signif)})
    #global_depth.update({'q25_DP':weighted_stats.quantile(0.25).values[0]})
    #global_depth.update({'q75_DP':weighted_stats.quantile(0.75).values[0]})
    #global_depth.update({'q95_DP':weighted_stats.quantile(0.95).values[0]})

    #global_depth.update({'dp>=1':round(depth_fraction(coverage_hist,thr=1),signif)})
    #global_depth.update({'dp>=10':round(depth_fraction(coverage_hist,thr=10),signif)})
    global_depth.update({'bases_20X': int(bases_20x)})
    #global_depth.update({'bases_20X(%)':(100*(bases_20x/b)})
    global_depth.update({'dp>=20': round(depth_20X, 3)})

    #global_depth.update({'dp>=20':round(depth_fraction(coverage_hist,thr=20),signif)})
    #global_depth.update({'dp>=30':round(depth_fraction(coverage_hist,thr=30),signif)})
    #global_depth.update({'dp>=50':round(depth_fraction(coverage_hist,thr=50),signif)})
    #global_depth.update({'dp>=100':round(depth_fraction(coverage_hist,thr=100),signif)})

    return (global_depth)
Code Example #17
    def fit_single(self,
                   pos_left,
                   pos_right,
                   weights,
                   tol=1e-4,
                   maxiter=4000,
                   verbose=False):
        left, right = np.asarray(pos_left), np.asarray(pos_right)
        debugs = list() if verbose else None
        centers = (left + right) / 2.0
        statsW = DescrStatsW(centers, weights=np.array(weights))
        init_paras = self._paras_compose_([statsW.mean], [statsW.cov], [1.0])

        method = 'Nelder-Mead'
        res = opt.minimize(self._single_optpara,
                           init_paras,
                           args=(left, right, weights, debugs),
                           method=method,
                           tol=tol,
                           options={
                               'maxiter': maxiter,
                               'disp': verbose
                           })
        if verbose:
            print("Method:{}; Initial parameter: {};".format(
                method, init_paras))
            print("Converged Parameter: {}".format(res.x))

        mus, covs, ws = self._paras_decompose_(res.x, 1)
        if det(covs[0]) == 0.0:
            print("Warning: covariance processed:")
            print("\t pre-optimal mus: {}, cov: {}".format(mus[0], covs[0]))
            covs[0] = self._cov_process_(covs[0])

        return mus, covs, ws, res.fun
Code Example #18
File: utils.py Project: zzzcy-coder/Barra
def Newey_West(ret, q=2, tao=252):
    '''
    Newey-West covariance adjustment.
    When factor returns are serially correlated, use the Newey-West
    adjustment of the covariance estimate.
    ret: DataFrame, rows are dates, columns are factor returns
    q: assume factor returns follow an MA(q) process
    tao: half-life used for the exponentially decaying weights
    '''
    from functools import reduce
    from statsmodels.stats.weightstats import DescrStatsW

    T = ret.shape[0]  # length of the time series
    K = ret.shape[1]  # number of factors
    if T <= q or T <= K:
        raise Exception("T <= q or T <= K")

    names = ret.columns
    weights = 0.5**(np.arange(T - 1, -1, -1) / tao)  # exponentially decaying weights
    weights = weights / sum(weights)

    w_stats = DescrStatsW(ret, weights)
    ret = ret - w_stats.mean

    ret = np.matrix(ret.values)
    Gamma0 = [weights[t] * ret[t].T @ ret[t] for t in range(T)]
    Gamma0 = reduce(np.add, Gamma0)

    V = Gamma0  # adjusted covariance matrix
    for i in range(1, q + 1):
        Gammai = [weights[i + t] * ret[t].T @ ret[i + t] for t in range(T - i)]
        Gammai = reduce(np.add, Gammai)
        V = V + (1 - i / (1 + q)) * (Gammai + Gammai.T)

    return pd.DataFrame(V, columns=names, index=names)
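A hedged usage sketch with simulated daily factor returns (the factor names are made up; numpy and pandas must be importable at module level, as the function body requires):

import numpy as np
import pandas as pd

rng = np.random.default_rng(42)
ret = pd.DataFrame(rng.standard_normal((500, 3)),
                   columns=['mkt', 'size', 'value'])
V = Newey_West(ret, q=2, tao=252)  # K x K adjusted covariance DataFrame
print(V.round(4))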
Code Example #19
File: stats_helper.py Project: ejpjapan/TSMOM
def compute_summary_statistics(dbm: database_manager.DatabaseManager, tbl_name: str) -> Optional[Dict[str, Tuple]]:
    """
    Computes summary statistics for given table.
    :param dbm: A DatabaseManager instance.
    :param tbl_name: name of the table to compute monthly return for.
    :return: dictionary containing various statistics.
    """
    df, info, start_date = finance_metrics.compute_monthly_returns(dbm, tbl_name)

    if df is not None and info is not None:
        stat = {}

        dsw = DescrStatsW(df['Monthly_Return'].values)

        stat['table_name'] = tbl_name
        stat['contract_name'] = info[1]
        stat['type'] = info[3] if info[3] is not None else None
        stat['subtype'] = info[4] if info[4] is not None else None
        stat['start-date'] = start_date
        stat['ar'] = df['Monthly_Return'].mean() * 12
        stat['vol'] = df['Monthly_Return'].std() * np.sqrt(12)
        tstat, pvalue, dof = dsw.ttest_mean(alternative='larger')
        stat['t-stat'] = tstat
        stat['p-value'] = pvalue
        stat['kurt'] = df['Monthly_Return'].kurt()
        stat['skew'] = df['Monthly_Return'].skew()

        return stat

    return None
Code Example #20
def calc_statistics(_totalVals, _distances, name, mask):
    if len(_totalVals) == 0 or len(mask) == 0:
        print(name, "- No data available!\n")
        # match the shape of the normal return value: (name, mean, SD)
        return (name, np.nan, np.nan)

    totalVals = _totalVals[mask, :]
    distances = _distances[mask]

    #distances = np.array(distances);
    if (len(distances) != len(totalVals)):
        raise ValueError("distances must be the same length as data values")

    weights = np.transpose(np.array([distances] * totalVals.shape[1]))

    notnan = np.where(~np.isnan(totalVals))
    totalStats = DescrStatsW(totalVals[notnan],
                             weights=weights[notnan],
                             ddof=0)
    totalGrandMean = totalStats.mean
    totalGrandSD = totalStats.std

    #print name, "total onto-shelf current (m/s):", totalGrandMean, "+/-", totalGrandSD;

    #Return the mean ontoshelf current for this region (weighted by segment distance)
    #Return the SD of the same.
    return (name, totalGrandMean, totalGrandSD)
Code Example #21
def weighted_percentiles(data, weights, percentiles):
    """Return the weighted percentiles.

    Args:
      data (np.ndarray) : Bin variable (e.g. temperature, salinity)
      weights (np.ndarray): Weights (e.g. cell volume, area)
      percentiles (np.ndarray): Array of requested percentiles (e.g. 0-1 by 0.01)

    """

    assert percentiles.max() <= 1.0
    assert percentiles.min() >= 0.0

    wq = DescrStatsW(data=data, weights=weights)
    bin_edges = wq.quantile(probs=percentiles, return_pandas=False)

    # the manual method below does not give clean results...
    #ix = np.argsort(data)
    #data = data[ix] # sort data
    #weights = weights[ix] # sort weights
    #cdf = (np.cumsum(weights) - 0.5 * weights) / np.sum(weights) # 'like' a CDF function
    #perc = np.arange(0, 1.01, 0.01)
    #test2 = np.interp(perc, cdf, data)

    return bin_edges
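For example (a made-up four-point dataset; DescrStatsW imported as in the other snippets on this page):

import numpy as np

data = np.array([1.0, 2.0, 3.0, 4.0])
weights = np.array([1.0, 1.0, 1.0, 5.0])
edges = weighted_percentiles(data, weights, np.array([0.25, 0.5, 0.75]))
# the heavy weight on 4.0 pulls the upper percentiles toward it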
Code Example #22
File: bc_analysis_fields.py Project: AJueling/CESM
    def spatial_correlation(self,
                            field_A,
                            field_B,
                            method=None,
                            selection=None):
        """ correlate two 2D fields """
        if np.shape(field_A) != np.shape(field_B):  # have to regrid
            A, B = self.regrid_to_lower_resolution(field_A, field_B)
        else:
            A, B = field_A, field_B
        assert np.shape(A) == np.shape(B)
        domain = self.determine_domain(A)

        AREA = xr_AREA(domain)
        MASK = boolean_mask(domain=domain, mask_nr=0)
        if type(selection) == int:
            MASK = boolean_mask(domain=domain, mask_nr=selection)
        elif type(selection) == dict:
            MASK, AREA = MASK.sel(selection), AREA.sel(selection)
            A, B = A.sel(selection), B.sel(selection)

        D = np.any(np.array(
            [np.isnan(A).values,
             np.isnan(B).values, (MASK == 0).values]),
                   axis=0)
        A = xr.where(D, np.nan,
                     A).stack(z=('latitude', 'longitude')).dropna(dim='z')
        B = xr.where(D, np.nan,
                     B).stack(z=('latitude', 'longitude')).dropna(dim='z')
        C = xr.where(D, np.nan,
                     AREA).stack(z=('latitude', 'longitude')).dropna(dim='z')
        d = DescrStatsW(np.array([A.values, B.values]).T, weights=C)
        spatial_corr_coef = d.corrcoef[0, 1]

        return spatial_corr_coef
Code Example #23
def branch_scan_MinVar_general(modifier, ds_dists, us_dists, all_weights):
    """
    This is the function to minimize in order to optimally situate the root on the putative
    branch. Note that this function is only valid for minimizing the variance of schemes
    where the weights do not change with regard to changing the root.

    Input/s:
    modifier - This is the parameter to be optimized! Essentially a float of how much to shift the
                root left or right so as to minimize the root-to-tip variance
    ds_dists - array of downstream root-to-tip distances
    us_dists - array of upstream root-to-tip distances
    all_weights - array of downstream and upstream terminal weights
    
    Output/s:
    dsw.var - weighted variance
    
    """
    #Adjust the downstream and upstream root-to-tip distances with the modifier
    temp_ds_dists = ds_dists + modifier
    temp_us_dists = us_dists - modifier
    all_dists = np.concatenate((temp_ds_dists, temp_us_dists))

    #Calculate weighted variance and return
    dsw = DescrStatsW(all_dists, all_weights)
    return dsw.var
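Since the modifier is a single scalar, scipy.optimize.minimize_scalar is a natural way to drive this objective; a hedged sketch with made-up distances and unit weights:

import numpy as np
from scipy.optimize import minimize_scalar

ds_dists = np.array([0.12, 0.15, 0.11])  # downstream root-to-tip distances
us_dists = np.array([0.14, 0.13])        # upstream root-to-tip distances
all_weights = np.ones(5)                 # one weight per terminal
res = minimize_scalar(branch_scan_MinVar_general,
                      args=(ds_dists, us_dists, all_weights))
# res.x is the optimal shift of the root along the branch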
Code Example #24
def plot_fill_between(data, data_dir, label, n_dis, color='magenta'):
    plt.figure(figsize=(10, 8))
    left_edge = []
    right_edge = []
    mean_res = []
    iters = []
    for key, value in data.items():
        left, right = DescrStatsW(value).tconfint_mean()
        left_edge.append(left)
        right_edge.append(right)
        iters.append(key + 1)
        mean_res.append(np.mean(value))

    plt.fill_between(iters, left_edge, right_edge, color='violet')
    plt.plot(iters, mean_res, color=color, lw=5)

    plt.xlabel('iteration', fontsize=18)
    plt.ylabel(label, fontsize=18)
    plt.xlim([1, len(iters)])
    plt.ylim([min(mean_res) - 0.05, max(mean_res) + 0.05])
    plt.xticks(list(plt.xticks()[0][1:]) + [1])
    plt.tick_params(axis='both', which='major', labelsize=18)
    plt.legend(['TestCV'], fontsize=18, loc=2)
    plt.savefig('{}/{}:{}_ARTM_smart.eps'.format(data_dir, label, n_dis))
    plt.show()
    return mean_res
Code Example #25
File: Ball.py Project: svejlgaard/AppStat2020Project
    def get_diameter(self):
        self.get_chi2('Diameter', self.diameter.values, np.array([0.05, 0.05, 0.05]))
        all_diameter = DescrStatsW(self.diameter, weights=1/(np.array([0.05, 0.05, 0.05]))**2)
        self.diameter_mean = all_diameter.mean
        self.diameter_mean_err = all_diameter.std/np.sqrt(len(self.diameter))
        self.diameter_std = all_diameter.std
        print(f'The diameter of the {self.ball} is {1000*self.diameter_mean:.2f} +- {1000*self.diameter_mean_err:.2f}')
Code Example #26
File: MLutils.py Project: littlewine/snorkel-ml
def get_lower_upper_CI(scores):
    if scores.shape[1] > 1:  #then 2-D
        lower_bound, upper_bound = DescrStatsW(
            scores.T).tconfint_mean() - scores.mean(axis=1)
    else:
        lower_bound, upper_bound = 0, 0
    return abs(lower_bound)
Code Example #27
def globaldepth(coverage_hist):

    coverage_hist['cumsum'] = 1 - coverage_hist.frequency.cumsum()
    weighted_stats = DescrStatsW(coverage_hist.DP - 1,
                                 weights=coverage_hist.BPs,
                                 ddof=0)

    global_depth = {}
    global_depth.update({'mean_DP': round(weighted_stats.mean, signif)})
    global_depth.update({'median_DP': weighted_stats.quantile(0.5).values[0]})
    global_depth.update({'std_DP': round(weighted_stats.std, signif)})
    global_depth.update({'q25_DP': weighted_stats.quantile(0.25).values[0]})
    global_depth.update({'q75_DP': weighted_stats.quantile(0.75).values[0]})
    global_depth.update({'q95_DP': weighted_stats.quantile(0.95).values[0]})

    global_depth.update(
        {'dp>=1': round(depth_fraction(coverage_hist, thr=1), signif)})
    global_depth.update(
        {'dp>=10': round(depth_fraction(coverage_hist, thr=10), signif)})
    global_depth.update(
        {'dp>=20': round(depth_fraction(coverage_hist, thr=20), signif)})
    global_depth.update(
        {'dp>=30': round(depth_fraction(coverage_hist, thr=30), signif)})
    global_depth.update(
        {'dp>=50': round(depth_fraction(coverage_hist, thr=50), signif)})
    global_depth.update(
        {'dp>=100': round(depth_fraction(coverage_hist, thr=100), signif)})
    return (global_depth)
Code Example #28
    def test_comparemeans_convenient_interface(self):
        x1_2d, x2_2d = self.x1_2d, self.x2_2d
        d1 = DescrStatsW(x1_2d)
        d2 = DescrStatsW(x2_2d)
        cm1 = CompareMeans(d1, d2)

        # smoke test for summary
        from statsmodels.iolib.table import SimpleTable
        for use_t in [True, False]:
            for usevar in ['pooled', 'unequal']:
                smry = cm1.summary(use_t=use_t, usevar=usevar)
                assert_(isinstance(smry, SimpleTable))

        # test for from_data method
        cm2 = CompareMeans.from_data(x1_2d, x2_2d)
        assert_(str(cm1.summary()) == str(cm2.summary()))
Code Example #29
def localdepth(coverage_hist):

    coverage_hist['cumsum'] = 1 - coverage_hist.frequency.cumsum()
    weighted_stats = DescrStatsW(coverage_hist.DP - 1,
                                 weights=coverage_hist.BPs,
                                 ddof=0)

    local_depth = {}
    local_depth.update({'mean_DP': round(weighted_stats.mean, signif)})
    #local_depth.update({'median_DP':weighted_stats.quantile(0.5).values[0]})
    local_depth.update({'std_DP': round(weighted_stats.std, signif)})

    local_depth.update(
        {'dp>=1': (round(depth_fraction(coverage_hist, thr=1), signif)) * 100})
    local_depth.update(
        {'dp>=5': (round(depth_fraction(coverage_hist, thr=5), signif)) * 100})
    local_depth.update({
        'dp>=10': (round(depth_fraction(coverage_hist, thr=10), signif)) * 100
    })
    local_depth.update({
        'dp>=20': (round(depth_fraction(coverage_hist, thr=20), signif)) * 100
    })
    local_depth.update({
        'dp>=30': (round(depth_fraction(coverage_hist, thr=30), signif)) * 100
    })
    #local_depth.update({'mean_DP':round(weighted_stats.mean,signif)})

    #local_depth.update({'dp>=50':round(depth_fraction(coverage_hist,thr=50),signif)})
    #local_depth.update({'dp>=100':round(depth_fraction(coverage_hist,thr=100),signif)})
    return pd.Series(local_depth)
Code Example #30
def _similarity_helper_2(user1_id, user2_id, user1_vector, solr):
    query = 'doc_type:score AND users:({} AND {})'.format(user1_id, user2_id)
    solr.delete(q=query)

    user2_vector, depth_vector = get_vector_tf_idf(user2_id)
    data = column_stack((user1_vector, user2_vector))

    # result = DescrStatsW(data).corrcoef[1][0]
    result2 = DescrStatsW(data, weights=depth_vector).corrcoef[1][0]

    mutual_friends = get_mutual_friends(user1_id, user2_id, solr)

    new_score = [{
        'doc_type': 'score',
        'users': [user1_id, user2_id],
        'similarity': result2,
        'mutual_friends': mutual_friends,
        'friends_count': len(mutual_friends)
    }]

    solr.add(new_score)
    solr.commit()

    # print('Pearson similarity ' + str(user2_id) + ' ' + str(result))
    print('DescrStatsW ' + str(user2_id) + ' ' + str(result2))