def test_weightstats_ddof_tests(self):
        # explicit test that ttest and confint are independent of ddof
        # one sample case
        x1_2d = self.x1_2d
        w1 = self.w1

        d1w_d0 = DescrStatsW(x1_2d, weights=w1, ddof=0)
        d1w_d1 = DescrStatsW(x1_2d, weights=w1, ddof=1)
        d1w_d2 = DescrStatsW(x1_2d, weights=w1, ddof=2)

        # check that ttest_mean is independent of the user-specified ddof
        res0 = d1w_d0.ttest_mean()
        res1 = d1w_d1.ttest_mean()
        res2 = d1w_d2.ttest_mean()
        # concatenate into one array with np.r_
        assert_almost_equal(np.r_[res1], np.r_[res0], 14)
        assert_almost_equal(np.r_[res2], np.r_[res0], 14)

        res0 = d1w_d0.ttest_mean(0.5)
        res1 = d1w_d1.ttest_mean(0.5)
        res2 = d1w_d2.ttest_mean(0.5)
        assert_almost_equal(np.r_[res1], np.r_[res0], 14)
        assert_almost_equal(np.r_[res2], np.r_[res0], 14)

        #check confint independent of user ddof
        res0 = d1w_d0.tconfint_mean()
        res1 = d1w_d1.tconfint_mean()
        res2 = d1w_d2.tconfint_mean()
        assert_almost_equal(res1, res0, 14)
        assert_almost_equal(res2, res0, 14)
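A minimal standalone sketch of the property this test pins down, using made-up data: the user-supplied ddof changes .var and .std, but ttest_mean and tconfint_mean are unaffected.

import numpy as np
from statsmodels.stats.weightstats import DescrStatsW

x = np.array([1.1, 2.3, 0.7, 1.9, 2.8])
w = np.array([1., 2., 1., 3., 2.])

for ddof in (0, 1, 2):
    d = DescrStatsW(x, weights=w, ddof=ddof)
    # .std moves with ddof; the test statistic and interval do not
    print(ddof, round(d.std, 4), d.ttest_mean(1.5)[:2], d.tconfint_mean())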
Example #2
def weighted_stat(stock_trading_df):
    index = ['price_var', 'price_std', 'price_mean',
             'price_min', 'price_max', 'no_of_txn', 'turnover']
    if stock_trading_df.shape[0] == 1:
        # a single trade has no spread; use .iloc for positional access
        price = stock_trading_df['price'].iloc[0]
        return pd.Series(
            [0, 0, price, price, price, 1,
             stock_trading_df['turnover'].iloc[0]],
            index=index)
    else:
        # build the volume-weighted statistics object once and reuse it
        dsw = DescrStatsW(stock_trading_df['price'],
                          stock_trading_df['volume'])
        return pd.Series(
            [dsw.var, dsw.std, dsw.mean,
             stock_trading_df['price'].min(),
             stock_trading_df['price'].max(),
             stock_trading_df.shape[0],
             stock_trading_df['turnover'].sum()],
            index=index)
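A hypothetical usage sketch: apply weighted_stat per ticker with groupby. The trade values below are invented; 'price', 'volume', and 'turnover' are the columns the function expects.

import pandas as pd

trades = pd.DataFrame({
    'ticker':   ['A', 'A', 'A', 'B'],
    'price':    [10.0, 10.5, 9.8, 20.1],
    'volume':   [100, 50, 200, 75],
    'turnover': [1000.0, 525.0, 1960.0, 1507.5],
})
per_ticker = trades.groupby('ticker').apply(weighted_stat)
print(per_ticker)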
def test_ztest_ztost():
    # compare weightstats with separately tested proportion ztest ztost
    import statsmodels.stats.proportion as smprop

    x1 = [0, 1]
    w1 = [5, 15]

    res2 = smprop.proportions_ztest(15, 20., value=0.5)
    d1 = DescrStatsW(x1, w1)
    res1 = d1.ztest_mean(0.5)
    assert_allclose(res1, res2, rtol=0.03, atol=0.003)

    d2 = DescrStatsW(x1, np.array(w1) * 21. / 20)
    res1 = d2.ztest_mean(0.5)
    assert_almost_equal(res1, res2, decimal=12)

    res1 = d2.ztost_mean(0.4, 0.6)
    res2 = smprop.proportions_ztost(15, 20., 0.4, 0.6)
    assert_almost_equal(res1[0], res2[0], decimal=12)

    x2 = [0, 1]
    w2 = [10, 10]
    #d2 = DescrStatsW(x1, np.array(w1)*21./20)
    d2 = DescrStatsW(x2, w2)
    res1 = ztest(d1.asrepeats(), d2.asrepeats())
    res2 = smprop.proportions_chisquare(np.asarray([15, 10]),
                                        np.asarray([20., 20]))
    # TODO: check whether this difference is expected, see test_proportion
    assert_allclose(res1[1], res2[1], rtol=0.03)

    res1a = CompareMeans(d1, d2).ztest_ind()
    assert_allclose(res1a[1], res2[1], rtol=0.03)
    assert_almost_equal(res1a, res1, decimal=12)
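The encoding the test relies on, shown in isolation: a binomial count (15 successes out of 20) written as a weighted 0/1 sample, so the weighted mean is the sample proportion and the total weight is the sample size.

from statsmodels.stats.weightstats import DescrStatsW

d = DescrStatsW([0, 1], weights=[5, 15])
print(d.mean)  # 0.75, i.e. 15/20
print(d.nobs)  # 20.0, the total weight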
    def setup_class(cls):
        cls.x1 = np.array(
            [7.8, 6.6, 6.5, 7.4, 7.3, 7., 6.4, 7.1, 6.7, 7.6, 6.8])
        cls.x2 = np.array([4.5, 5.4, 6.1, 6.1, 5.4, 5., 4.1, 5.5])
        cls.d1 = DescrStatsW(cls.x1)
        cls.d2 = DescrStatsW(cls.x2)
        cls.cm = CompareMeans(cls.d1, cls.d2)
    def test_weightstats_2(self):
        x1, x2 = self.x1, self.x2
        w1, w2 = self.w1, self.w2

        d1 = DescrStatsW(x1)
        d1w = DescrStatsW(x1, weights=w1)
        d2w = DescrStatsW(x2, weights=w2)
        x1r = d1w.asrepeats()
        x2r = d2w.asrepeats()
        #        print 'random weights'
        #        print ttest_ind(x1, x2, weights=(w1, w2))
        #        print stats.ttest_ind(x1r, x2r)
        assert_almost_equal(
            ttest_ind(x1, x2, weights=(w1, w2))[:2], stats.ttest_ind(x1r, x2r),
            14)
        #not the same as new version with random weights/replication
        #        assert x1r.shape[0] == d1w.sum_weights
        #        assert x2r.shape[0] == d2w.sum_weights

        assert_almost_equal(x2r.mean(0), d2w.mean, 14)
        assert_almost_equal(x2r.var(), d2w.var, 14)
        assert_almost_equal(x2r.std(), d2w.std, 14)
        #note: the following is for 1d
        assert_almost_equal(np.cov(x2r, bias=1), d2w.cov, 14)
        #assert_almost_equal(np.corrcoef(np.x2r), d2w.corrcoef, 19)
        #TODO: exception in corrcoef (scalar case)

        #one-sample tests
        #        print d1.ttest_mean(3)
        #        print stats.ttest_1samp(x1, 3)
        #        print d1w.ttest_mean(3)
        #        print stats.ttest_1samp(x1r, 3)
        assert_almost_equal(d1.ttest_mean(3)[:2], stats.ttest_1samp(x1, 3), 11)
        assert_almost_equal(
            d1w.ttest_mean(3)[:2], stats.ttest_1samp(x1r, 3), 11)
    def test_weightstats_3(self):
        x1_2d, x2_2d = self.x1_2d, self.x2_2d
        w1, w2 = self.w1, self.w2

        d1w_2d = DescrStatsW(x1_2d, weights=w1)
        d2w_2d = DescrStatsW(x2_2d, weights=w2)
        x1r_2d = d1w_2d.asrepeats()
        x2r_2d = d2w_2d.asrepeats()

        assert_almost_equal(x2r_2d.mean(0), d2w_2d.mean, 14)
        assert_almost_equal(x2r_2d.var(0), d2w_2d.var, 14)
        assert_almost_equal(x2r_2d.std(0), d2w_2d.std, 14)
        assert_almost_equal(np.cov(x2r_2d.T, bias=1), d2w_2d.cov, 14)
        assert_almost_equal(np.corrcoef(x2r_2d.T), d2w_2d.corrcoef, 14)

        #        print d1w_2d.ttest_mean(3)
        #        #scipy.stats.ttest is also vectorized
        #        print stats.ttest_1samp(x1r_2d, 3)
        t, p, d = d1w_2d.ttest_mean(3)
        assert_almost_equal([t, p], stats.ttest_1samp(x1r_2d, 3), 11)
        #print [stats.ttest_1samp(xi, 3) for xi in x1r_2d.T]
        cm = CompareMeans(d1w_2d, d2w_2d)
        ressm = cm.ttest_ind()
        resss = stats.ttest_ind(x1r_2d, x2r_2d)
        assert_almost_equal(ressm[:2], resss, 14)
Example #7
    def fit(self):
        """Once the exposure and outcome models are specified, we can estimate the risk ratio and risk difference.

        Returns
        -------
        Sets the `risk_difference`, `risk_difference_ci`, and `risk_ratio` attributes
        (for a continuous outcome, `average_treatment_effect` and its interval instead)
        """
        if (self._fit_exposure_ is False) or (self._fit_outcome_ is False):
            raise ValueError(
                'The exposure and outcome models must be specified before the doubly robust estimate can '
                'be generated')

        # Doubly robust estimator under all treated
        a_obs = self.df[self.exposure]
        y_obs = self.df[self.outcome]
        ps = self.df['_ps_']
        py_a1 = self.df['_pY1_']
        py_a0 = self.df['_pY0_']
        dr_a1 = np.where(a_obs == 1, (y_obs / ps) - ((py_a1 * (1 - ps)) / ps),
                         py_a1)

        # Doubly robust estimator under all untreated
        dr_a0 = np.where(a_obs == 1, py_a0,
                         (y_obs / (1 - ps) - ((py_a0 * ps) / (1 - ps))))

        # Generating estimates for the risk difference and risk ratio
        zalpha = norm.ppf(1 - self.alpha / 2, loc=0, scale=1)

        if self._weight_ is None:
            if self._continuous_outcome:
                self.average_treatment_effect = np.mean(dr_a1) - np.mean(dr_a0)
                var_ic = np.var(
                    (dr_a1 - dr_a0) - self.average_treatment_effect,
                    ddof=1) / self.df.shape[0]
                self.average_treatment_effect_se = np.sqrt(var_ic)
                self.average_treatment_effect_ci = [
                    self.average_treatment_effect - zalpha * np.sqrt(var_ic),
                    self.average_treatment_effect + zalpha * np.sqrt(var_ic)
                ]

            else:
                self.risk_difference = np.mean(dr_a1) - np.mean(dr_a0)
                self.risk_ratio = np.mean(dr_a1) / np.mean(dr_a0)
                var_ic = np.var((dr_a1 - dr_a0) - self.risk_difference,
                                ddof=1) / self.df.shape[0]
                self.risk_difference_se = np.sqrt(var_ic)
                self.risk_difference_ci = [
                    self.risk_difference - zalpha * np.sqrt(var_ic),
                    self.risk_difference + zalpha * np.sqrt(var_ic)
                ]
        else:
            dr_m1 = DescrStatsW(dr_a1, weights=self.df[self._weight_]).mean
            dr_m0 = DescrStatsW(dr_a0, weights=self.df[self._weight_]).mean

            if self._continuous_outcome:
                self.average_treatment_effect = dr_m1 - dr_m0
            else:
                self.risk_difference = dr_m1 - dr_m0
                self.risk_ratio = dr_m1 / dr_m0
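A self-contained numeric sketch of the doubly robust pseudo-outcomes that fit constructs; every array below is an invented toy value, not output from the exposure or outcome models.

import numpy as np

a_obs = np.array([1, 0, 1, 0])           # observed treatment
y_obs = np.array([1.0, 0.0, 1.0, 1.0])   # observed outcome
ps    = np.array([0.6, 0.4, 0.7, 0.5])   # P(A=1 | X), exposure model
py_a1 = np.array([0.8, 0.5, 0.9, 0.6])   # E[Y | A=1, X], outcome model
py_a0 = np.array([0.3, 0.2, 0.4, 0.5])   # E[Y | A=0, X], outcome model

# treated units: outcome model corrected by the inverse-probability residual;
# untreated units: outcome model prediction alone (and vice versa for dr_a0)
dr_a1 = np.where(a_obs == 1, y_obs / ps - py_a1 * (1 - ps) / ps, py_a1)
dr_a0 = np.where(a_obs == 1, py_a0, y_obs / (1 - ps) - py_a0 * ps / (1 - ps))
print(np.mean(dr_a1) - np.mean(dr_a0))   # unweighted risk difference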
Example #8
    def get_angle(self, flip=True):
        angles = get_angles()

        angle_dict = dict()
        for a in ['r', 'l']:
            phi = angles[[f'inc_{a}']]
            phi_flip = angles[[f'inc_flip_{a}']]  # renamed so it no longer shadows the `flip` argument
            err = 1

            # inverse-variance weights for the three repeated measurements
            phi_des = DescrStatsW(phi, weights=1 / np.array([err, err, err])**2)
            phi_mean = phi_des.mean
            phi_mean_err = phi_des.std / np.sqrt(3)

            flip_des = DescrStatsW(phi_flip, weights=1 / np.array([err, err, err])**2)
            flip_mean = flip_des.mean
            flip_mean_err = flip_des.std / np.sqrt(3)

            # combine the direct and flipped measurements and propagate their errors
            true_phi = (phi_mean + flip_mean) / 2
            true_phi_err = np.sqrt(phi_mean_err**2 + flip_mean_err**2) / 2

            angle_dict.update({f'phi_{a}': [true_phi, true_phi_err]})
        

        self.phi_r_mu = angle_dict['phi_r'][0]
        self.phi_r_std = angle_dict['phi_r'][1]


        self.phi_l_mu = angle_dict['phi_l'][0]
        self.phi_l_std = angle_dict['phi_l'][1]

        self.dphi_mu_g = (self.phi_r_mu - self.phi_l_mu) / 2
        self.dphi_std_g = np.sqrt(self.phi_r_std**2 + self.phi_l_std**2)
        self.dphi_mu_err_g = self.dphi_std_g / np.sqrt(2) + 0.05


        upper = (self.a_dict['a_L'][0] - self.a_dict['a_R'][0]) * np.sin(np.radians(angle_dict[f'phi_{self.orientation.lower()}'][0]))
        lower = (self.a_dict['a_L'][0] + self.a_dict['a_R'][0]) * np.cos(np.radians(angle_dict[f'phi_{self.orientation.lower()}'][0]))
        
        self.dphi_mu_a = upper / lower
        
        x = np.radians(angle_dict[f'phi_{self.orientation.lower()}'][0])
        L = self.a_dict['a_L'][0]
        R = self.a_dict['a_R'][0]

        err1 = (np.sin(x)/((L+R)*np.cos(x))-(L-R)*np.sin(x)/((L+R)**2*np.cos(x)))**2 * self.phi_l_std**2
        err2 = (-np.sin(x)/((L+R)*np.cos(x))-(L-R)*np.sin(x)/((L+R)**2*np.cos(x)))**2 * self.phi_r_std**2
        err3 = ((L-R)/(L+R)+(L-R)*np.sin(x)**2/((L+R)*np.cos(x)**2))**2 * angle_dict[f'phi_{self.orientation.lower()}'][1]**2
        
        self.dphi_std_a = np.sqrt(err1 + err2 + err3) + 0.05
        
        combined_dphi = DescrStatsW([self.dphi_mu_g, self.dphi_mu_a], weights=1/(np.array([self.dphi_mu_err_g, self.dphi_std_a]))**2)


        self.dphi_mu = combined_dphi.mean
        self.dphi_std = combined_dphi.std / np.sqrt(2)

        self.get_chi2(f'dphi_{self.orientation}', np.array([self.dphi_mu_a,self.dphi_mu_g]), np.array([self.dphi_std_a, self.dphi_mu_err_g]))
Example #9
def _similarity_helper_limited(user1_id, user2_id, solr):
    user1_vector, user2_vector, weight_vector = get_vector_limited(
        user1_id, user2_id)

    data = column_stack((user1_vector, user2_vector))
    result = DescrStatsW(data).corrcoef[1][0]
    result2 = DescrStatsW(data, weights=weight_vector).corrcoef[1][0]

    print('Pearson similarity ' + str(user2_id) + ' ' + str(result))
    print('DescrStatsW ' + str(user2_id) + ' ' + str(result2))
Example #10
def trades_to_bar(ticks: pd.DataFrame, bar_trigger: str='fixed') -> dict:
    
    if not isinstance(ticks, pd.DataFrame):
        ticks = pd.DataFrame(ticks)
    
    bar = {'bar_trigger': bar_trigger}
    # time
    bar['open_at'] = ticks['utc_dt'].iloc[0]
    bar['close_at'] = ticks['utc_dt'].iloc[-1]
    bar['duration_td'] = bar['close_at'] - bar['open_at']
    # volume
    bar['tick_count'] = ticks.shape[0]
    bar['volume'] = ticks.volume.sum()
    bar['dollars'] = (ticks.volume * ticks.price).sum()
    # price
    bar['price_open'] = ticks.price.values[0]
    bar['price_close'] = ticks.price.values[-1]
    bar['price_low'] = ticks.price.min()
    bar['price_high'] = ticks.price.max()
    bar['price_range'] = bar['price_high'] - bar['price_low']
    bar['price_return'] = bar['price_close'] - bar['price_open']
    # volume weighted price
    dsw = DescrStatsW(data=ticks.price, weights=ticks.volume)
    qtiles = dsw.quantile(probs=[0.1, 0.5, 0.9]).values
    bar['price_wq10'] = qtiles[0]
    bar['price_wq50'] = qtiles[1]
    bar['price_wq90'] = qtiles[2]
    bar['price_wq_range'] = bar['price_wq90'] - bar['price_wq10']
    bar['price_wmean'] = dsw.mean
    bar['price_wstd'] = dsw.std
    # jma
    bar['jma_open'] = ticks.jma.values[0]
    bar['jma_close'] = ticks.jma.values[-1]
    bar['jma_low'] = ticks.jma.min()
    bar['jma_high'] = ticks.jma.max()
    bar['jma_range'] = bar['jma_high'] - bar['jma_low']
    bar['jma_return'] = bar['jma_close'] - bar['jma_open']
    # volume weighted jma
    dsw = DescrStatsW(data=ticks.jma, weights=ticks.volume)
    qtiles = dsw.quantile(probs=[0.1, 0.5, 0.9]).values
    bar['jma_wq10'] = qtiles[0]
    bar['jma_wq50'] = qtiles[1]
    bar['jma_wq90'] = qtiles[2]
    bar['jma_wq_range'] = bar['jma_wq90'] - bar['jma_wq10']
    bar['jma_wmean'] = dsw.mean
    bar['jma_wstd'] = dsw.std
    # tick/vol/dollar/imbalance
    bar['tick_imbalance'] = ticks.side.sum()
    bar['volume_imbalance'] = (ticks.volume * ticks.side).sum()
    bar['dollar_imbalance'] = (ticks.volume * ticks.price * ticks.side).sum()

    return bar
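A hypothetical call to trades_to_bar; the tick values are invented, and the 'jma' column (a smoothed price series the function summarizes) is assumed to be precomputed upstream.

import pandas as pd

ticks = pd.DataFrame({
    'utc_dt': pd.to_datetime(['2021-01-04 14:30:00',
                              '2021-01-04 14:30:01',
                              '2021-01-04 14:30:03']),
    'price':  [100.0, 100.2, 99.9],
    'volume': [50, 120, 80],
    'jma':    [100.0, 100.1, 100.0],
    'side':   [1, 1, -1],  # +1 buyer-initiated, -1 seller-initiated
})
bar = trades_to_bar(ticks)
print(bar['price_wmean'], bar['price_wq50'], bar['volume_imbalance'])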
def title_len_stat(mongo_db):
    len_counter_db = collections.Counter()
    len_counter_cr = collections.Counter()
    for col_name in mongo_db.collection_names():
        if col_name not in PAPER_COLLECTIONS:
            continue
        col = mongo_db[col_name]
        query_w_doi = col.find({'doi': {'$exists': True}})
        for doc in query_w_doi:
            if ('metadata' in doc and 'title' in doc['metadata']
                    and isinstance(doc['metadata']['title'], str)):
                len_counter_db[len(doc['metadata']['title'])] += 1
            if ('crossref_raw_result' in doc
                    and 'title' in doc['crossref_raw_result']
                    and isinstance(doc['crossref_raw_result']['title'], list)
                    and len(doc['crossref_raw_result']['title']) == 1):
                len_counter_cr[len(
                    doc['crossref_raw_result']['title'][0])] += 1

    # stat for db titles
    sorted_len = sorted(len_counter_db.keys())
    weights = [len_counter_db[l] for l in sorted_len]
    weighted_stats = DescrStatsW(sorted_len, weights=weights)
    sns.barplot(sorted_len, weights)
    percentile = weighted_stats.quantile(probs=[
        0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.1, 0.25, 0.5, 0.75, 0.95, 0.97, 0.99
    ])
    print('len_counter_db')
    pprint(len_counter_db)
    print('weighted_stats.mean', weighted_stats.mean)
    print('weighted_stats.std', weighted_stats.std)
    print('percentile')
    print(percentile)

    # stat for cr titles
    sorted_len = sorted(len_counter_cr.keys())
    weights = [len_counter_cr[l] for l in sorted_len]
    weighted_stats = DescrStatsW(sorted_len, weights=weights)
    #     sns.barplot(sorted_len, weights)
    percentile = weighted_stats.quantile(probs=[
        0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.1, 0.25, 0.5, 0.75, 0.95, 0.97, 0.99
    ])
    print('len_counter_cr')
    pprint(len_counter_cr)
    print('weighted_stats.mean', weighted_stats.mean)
    print('weighted_stats.std', weighted_stats.std)
    print('percentile')
    print(percentile)

    return len_counter_db, len_counter_cr
Example #12
    def angular_linear_correlation(self,
                                   angles,
                                   data,
                                   weights=None,
                                   double_peak=False):
        """
        This function computes an angular-linear correlation.
        When expecting the data to have two symmetrical, opposite peaks (e.g.
        a non-direction selective effect such as horizontal vs. vertical
        instead of up vs down), double_peak should be set to True.

        :param angles: input angles
        :type angles: 1-D array
        :param data: input data
        :type data: 1-D array, same shape as angles
        :param weights: weights to use for correlation
        :type weights: 1-D array, same shape as angles
        :param double_peak: when True, angles are doubled
        :type double_peak: bool

        :return corr: circular correlation
        :type corr: float

        """

        # set weights to one
        if weights is None:
            weights = np.ones_like(angles)

        # In cases of expected periodicity (e.g. data peaks at two opposite angles),
        # the angular data should be scaled:
        if double_peak:
            angles = self.collapse_angles_symmetrically(angles)

        # use formula from the pycircstat package to calculate circular correlation:
        rxs = DescrStatsW(data=np.vstack([data, np.sin(angles)]).T,
                          weights=weights).corrcoef[0, 1]
        rxc = DescrStatsW(data=np.vstack([data, np.cos(angles)]).T,
                          weights=weights).corrcoef[0, 1]
        rcs = DescrStatsW(data=np.vstack([np.sin(angles),
                                          np.cos(angles)]).T,
                          weights=weights).corrcoef[0, 1]

        # rxs = self.functions.wpearson(these_data,np.sin(doubled_angles),weights)
        # rxc = self.functions.wpearson(these_data,np.cos(doubled_angles),weights)
        # rcs = self.functions.wpearson(np.sin(doubled_angles),np.cos(doubled_angles),weights)
        # compute angular-linear correlation (equ. 27.47)
        corr = np.sqrt((rxc**2 + rxs**2 - 2 * rxc * rxs * rcs) / (1 - rcs**2))

        return corr
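The same computation outside the class, on synthetic data, as a sanity check of the formula: a strongly cosine-tuned signal should give a correlation near 1.

import numpy as np
from statsmodels.stats.weightstats import DescrStatsW

rng = np.random.default_rng(0)
angles = rng.uniform(0, 2 * np.pi, 200)
data = np.cos(angles) + 0.1 * rng.standard_normal(200)
w = np.ones_like(angles)

rxs = DescrStatsW(np.vstack([data, np.sin(angles)]).T, weights=w).corrcoef[0, 1]
rxc = DescrStatsW(np.vstack([data, np.cos(angles)]).T, weights=w).corrcoef[0, 1]
rcs = DescrStatsW(np.vstack([np.sin(angles), np.cos(angles)]).T, weights=w).corrcoef[0, 1]
print(np.sqrt((rxc**2 + rxs**2 - 2 * rxc * rxs * rcs) / (1 - rcs**2)))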
    def setup_class(self):
        np.random.seed(9876789)
        n1, n2 = 20, 30
        m1, m2 = 1, 1.2
        x1 = m1 + np.random.randn(n1, 3)
        x2 = m2 + np.random.randn(n2, 3)
        w1 = np.random.randint(1, 4, n1)
        w2 = np.random.randint(1, 4, n2)

        self.x1, self.x2 = x1, x2
        self.w1, self.w2 = w1, w2
        self.d1w = DescrStatsW(x1, weights=w1, ddof=0)
        self.d2w = DescrStatsW(x2, weights=w2, ddof=1)
        self.x1r = self.d1w.asrepeats()
        self.x2r = self.d2w.asrepeats()
Example #14
    def _get_weighted_stats(self, X, y, weights):
        """Gets the weighted mean and standard deviation for each variable
        in X and y, based on an array of weights."""
        Xw_stat_obj = DescrStatsW(X, weights=weights, ddof=1)

        # Weighted standard deviation for X vars:
        std_Xw = np.sqrt(np.abs(Xw_stat_obj.var_ddof(1)))  # abs for w_sum < 1
        mean_Xw = Xw_stat_obj.mean  # NumPy array, shape (n_regressors,)

        yw_stat_obj = DescrStatsW(y, weights=weights, ddof=1)
        # Weighted standard deviation for y:
        std_yw = np.sqrt(np.abs(yw_stat_obj.var_ddof(1)))  # abs for w_sum < 1
        mean_yw = yw_stat_obj.mean  # scalar for a 1-D y

        return mean_Xw, mean_yw, std_Xw, std_yw
    def setup_class(cls):
        np.random.seed(9876789)
        n1, n2 = 20, 20
        m1, m2 = 1, 1.2
        x1 = m1 + np.random.randn(n1, 3)
        x2 = m2 + np.random.randn(n2, 3)
        w1 = np.random.randint(1, 4, n1)
        w2 = np.random.randint(1, 4, n2)

        cls.x1, cls.x2 = x1, x2
        cls.w1, cls.w2 = w1, w2
        cls.d1w = DescrStatsW(x1, weights=w1, ddof=1)
        cls.d2w = DescrStatsW(x2, weights=w2, ddof=1)
        cls.x1r = cls.d1w.asrepeats()
        cls.x2r = cls.d2w.asrepeats()
def globaldepth(coverage_hist):

    coverage_hist['cumsum'] = 1 - coverage_hist.frequency.cumsum()

    weighted_stats = DescrStatsW(coverage_hist.DP - 1,
                                 weights=coverage_hist.BPs,
                                 ddof=0)
    # how does this differ from coverage_hist.DP.mean()?

    global_depth = {}
    b, bases_20x, depth_20X = depth_fraction(coverage_hist, thr=20)
    global_depth.update({'bases_totales': int(b)})

    global_depth.update({'mean_DP': round(weighted_stats.mean, signif)})
    global_depth.update({'median_DP': weighted_stats.quantile(0.5).values[0]})
    #global_depth.update({'std_DP':round(weighted_stats.std,signif)})
    #global_depth.update({'q25_DP':weighted_stats.quantile(0.25).values[0]})
    #global_depth.update({'q75_DP':weighted_stats.quantile(0.75).values[0]})
    #global_depth.update({'q95_DP':weighted_stats.quantile(0.95).values[0]})

    #global_depth.update({'dp>=1':round(depth_fraction(coverage_hist,thr=1),signif)})
    #global_depth.update({'dp>=10':round(depth_fraction(coverage_hist,thr=10),signif)})
    global_depth.update({'bases_20X': int(bases_20x)})
    #global_depth.update({'bases_20X(%)':(100*(bases_20x/b)})
    global_depth.update({'dp>=20': round(depth_20X, 3)})

    #global_depth.update({'dp>=20':round(depth_fraction(coverage_hist,thr=20),signif)})
    #global_depth.update({'dp>=30':round(depth_fraction(coverage_hist,thr=30),signif)})
    #global_depth.update({'dp>=50':round(depth_fraction(coverage_hist,thr=50),signif)})
    #global_depth.update({'dp>=100':round(depth_fraction(coverage_hist,thr=100),signif)})

    return (global_depth)
Example #17
    def fit_single(self,
                   pos_left,
                   pos_right,
                   weights,
                   tol=1e-4,
                   maxiter=4000,
                   verbose=False):
        left, right = np.asarray(pos_left), np.asarray(pos_right)
        debugs = list() if verbose else None
        centers = (left + right) / 2.0
        statsW = DescrStatsW(centers, weights=np.array(weights))
        init_paras = self._paras_compose_([statsW.mean], [statsW.cov], [1.0])

        method = 'Nelder-Mead'
        res = opt.minimize(self._single_optpara,
                           init_paras,
                           args=(left, right, weights, debugs),
                           method=method,
                           tol=tol,
                           options={
                               'maxiter': maxiter,
                               'disp': verbose
                           })
        if verbose:
            print("Method:{}; Initial parameter: {};".format(
                method, init_paras))
            print("Converged Parameter: {}".format(res.x))

        mus, covs, ws = self._paras_decompose_(res.x, 1)
        if det(covs[0]) == 0.0:
            print("Warning: covariance processed:")
            print("\t pre-optimal mus: {}, cov: {}".format(mus[0], covs[0]))
            covs[0] = self._cov_process_(covs[0])

        return mus, covs, ws, res.fun
Example #18
def Newey_West(ret, q=2, tao=252):
    '''
    Newey-West variance adjustment.
    When factor returns are serially correlated, use the Newey-West
    adjustment of the covariance estimate.
    ret: DataFrame, rows are dates, columns are factor returns
    q: assume factor returns follow an MA(q) process
    tao: half-life for the exponentially decaying covariance weights
    '''
    from functools import reduce
    from statsmodels.stats.weightstats import DescrStatsW

    T = ret.shape[0]  # length of the time series
    K = ret.shape[1]  # number of factors
    if T <= q or T <= K:
        raise Exception("T <= q or T <= K")

    names = ret.columns
    weights = 0.5**(np.arange(T - 1, -1, -1) / tao)  # exponentially decaying weights
    weights = weights / sum(weights)

    w_stats = DescrStatsW(ret, weights)
    ret = np.asarray(ret - w_stats.mean)

    # lag-0 weighted covariance
    Gamma0 = [weights[t] * np.outer(ret[t], ret[t]) for t in range(T)]
    Gamma0 = reduce(np.add, Gamma0)

    V = Gamma0  # adjusted covariance matrix
    for i in range(1, q + 1):
        Gammai = [weights[i + t] * np.outer(ret[t], ret[i + t])
                  for t in range(T - i)]
        Gammai = reduce(np.add, Gammai)
        V = V + (1 - i / (1 + q)) * (Gammai + Gammai.T)

    return (pd.DataFrame(V, columns=names, index=names))
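A hypothetical call on simulated factor returns; with i.i.d. data the adjusted matrix should stay close to the plain weighted covariance.

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
ret = pd.DataFrame(rng.standard_normal((500, 3)),
                   columns=['factor_a', 'factor_b', 'factor_c'])
V = Newey_West(ret, q=2, tao=252)
print(V.round(4))  # 3x3 Newey-West adjusted covariance matrix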
Example #19
def compute_summary_statistics(dbm: database_manager.DatabaseManager, tbl_name: str) -> Optional[Dict[str, Tuple]]:
    """
    Computes summary statistics for given table.
    :param dbm: A DatabaseManager instance.
    :param tbl_name: name of the table to compute monthly return for.
    :return: dictionary containing various statistics.
    """
    df, info, start_date = finance_metrics.compute_monthly_returns(dbm, tbl_name)

    if df is not None and info is not None:
        stat = {}

        dsw = DescrStatsW(df['Monthly_Return'].values)

        stat['table_name'] = tbl_name
        stat['contract_name'] = info[1]
        stat['type'] = info[3]
        stat['subtype'] = info[4]
        stat['start-date'] = start_date
        stat['ar'] = df['Monthly_Return'].mean() * 12
        stat['vol'] = df['Monthly_Return'].std() * np.sqrt(12)
        tstat, pvalue, _ = dsw.ttest_mean(alternative='larger')
        stat['t-stat'] = tstat
        stat['p-value'] = pvalue
        stat['kurt'] = df['Monthly_Return'].kurt()
        stat['skew'] = df['Monthly_Return'].skew()

        return stat

    return None
Example #20
def calc_statistics(_totalVals, _distances, name, mask):
    if len(_totalVals) == 0 or len(mask) == 0:
        print(name, "- No data available!\n")
        # match the (name, mean, SD) tuple returned on the success path
        return (name, np.nan, np.nan)

    totalVals = _totalVals[mask, :]
    distances = _distances[mask]

    #distances = np.array(distances);
    if (len(distances) != len(totalVals)):
        raise ValueError("distances must be the same length as data values")

    weights = np.transpose(np.array([distances] * totalVals.shape[1]))

    notnan = np.where(~np.isnan(totalVals))
    totalStats = DescrStatsW(totalVals[notnan],
                             weights=weights[notnan],
                             ddof=0)
    totalGrandMean = totalStats.mean
    totalGrandSD = totalStats.std

    #print name, "total onto-shelf current (m/s):", totalGrandMean, "+/-", totalGrandSD;

    #Return the mean ontoshelf current for this region (weighted by segment distance)
    #Return the SD of the same.
    return (name, totalGrandMean, totalGrandSD)
def weighted_percentiles(data, weights, percentiles):
    """Return the weighted percentiles.

    Args:
      data (np.ndarray) : Bin variable (e.g. temperature, salinity)
      weights (np.ndarray): Weights (e.g. cell volume, area)
      percentiles (np.ndarray): Array of requested percentiles (e.g. 0-1 by 0.01)

    """

    assert percentiles.max() <= 1.0
    assert percentiles.min() >= 0.0

    wq = DescrStatsW(data=data, weights=weights)
    bin_edges = wq.quantile(probs=percentiles, return_pandas=False)

    # manual method does not give a clean results...
    #ix = np.argsort(data)
    #data = data[ix] # sort data
    #weights = weights[ix] # sort weights
    #cdf = (np.cumsum(weights) - 0.5 * weights) / np.sum(weights) # 'like' a CDF function
    #perc = np.arange(0, 1.01, 0.01)
    #test2 = np.interp(perc, cdf, data)

    return bin_edges
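A hypothetical call with made-up bin values and weights; the weight of 2 on the value 6.0 makes it count twice, which pulls the weighted median up.

import numpy as np

data = np.array([2.0, 4.0, 6.0, 8.0])      # e.g. temperature bins
weights = np.array([1.0, 1.0, 2.0, 1.0])   # e.g. cell volume per bin
percentiles = np.array([0.25, 0.5, 0.75])
print(weighted_percentiles(data, weights, percentiles))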
Example #22
    def spatial_correlation(self,
                            field_A,
                            field_B,
                            method=None,
                            selection=None):
        """ correlate two 2D fields """
        if np.shape(field_A) != np.shape(field_B):  # have to regrid
            A, B = self.regrid_to_lower_resolution(field_A, field_B)
        else:
            A, B = field_A, field_B
        assert np.shape(A) == np.shape(B)
        domain = self.determine_domain(A)

        AREA = xr_AREA(domain)
        MASK = boolean_mask(domain=domain, mask_nr=0)
        if isinstance(selection, int):
            MASK = boolean_mask(domain=domain, mask_nr=selection)
        elif isinstance(selection, dict):
            MASK, AREA = MASK.sel(selection), AREA.sel(selection)
            A, B = A.sel(selection), B.sel(selection)

        D = np.any(np.array(
            [np.isnan(A).values,
             np.isnan(B).values, (MASK == 0).values]),
                   axis=0)
        A = xr.where(D, np.nan,
                     A).stack(z=('latitude', 'longitude')).dropna(dim='z')
        B = xr.where(D, np.nan,
                     B).stack(z=('latitude', 'longitude')).dropna(dim='z')
        C = xr.where(D, np.nan,
                     AREA).stack(z=('latitude', 'longitude')).dropna(dim='z')
        d = DescrStatsW(np.array([A.values, B.values]).T, weights=C)
        spatial_corr_coef = d.corrcoef[0, 1]

        return spatial_corr_coef
def branch_scan_MinVar_general(modifier, ds_dists, us_dists, all_weights):
    """
    This is the function to minimize in order to optimally situate the root on the putative
    branch. Note that this function is only valid for minimizing the variance of schemes
    where the weights do not change as the root moves.

    Input/s:
    modifier - This is the parameter to be optimized! Essentially a float of how much to shift the
                root left or right so as to minimize the root-to-tip variance
    ds_dists - array of downstream root-to-tip distances
    us_dists - array of upstream root-to-tip distances
    all_weights - array of downstream and upstream terminal weights
    
    Output/s:
    dsw.var - weighted variance
    
    """
    #Adjust the downstream and upstream root-to-tip distances with the modifier
    temp_ds_dists = ds_dists + modifier
    temp_us_dists = us_dists - modifier
    all_dists = np.concatenate((temp_ds_dists, temp_us_dists))

    #Calculate weighted variance and return
    dsw = DescrStatsW(all_dists, all_weights)
    return dsw.var
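One way to drive it, as a hedged sketch: hand the function to scipy.optimize.minimize_scalar with invented root-to-tip distances and unit weights.

import numpy as np
from scipy.optimize import minimize_scalar

ds_dists = np.array([1.2, 1.5, 1.1])  # downstream root-to-tip distances
us_dists = np.array([1.8, 2.0])       # upstream root-to-tip distances
all_weights = np.ones(5)              # one weight per terminal

res = minimize_scalar(branch_scan_MinVar_general,
                      args=(ds_dists, us_dists, all_weights),
                      bounds=(-1.0, 1.0), method='bounded')
print(res.x, res.fun)  # optimal shift and its weighted variance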
Example #24
def plot_fill_between(data, data_dir, label, n_dis, color='magenta'):
    plt.figure(figsize=(10, 8))
    left_edge = []
    right_edge = []
    mean_res = []
    iters = []
    for key, value in data.items():
        left, right = DescrStatsW(value).tconfint_mean()
        left_edge.append(left)
        right_edge.append(right)
        iters.append(key + 1)
        mean_res.append(np.mean(value))

    plt.fill_between(iters, left_edge, right_edge, color='violet')
    plt.plot(iters, mean_res, color=color, lw=5)

    plt.xlabel('iteration', fontsize=18)
    plt.ylabel(label, fontsize=18)
    plt.xlim([1, len(iters)])
    plt.ylim([min(mean_res) - 0.05, max(mean_res) + 0.05])
    plt.xticks(list(plt.xticks()[0][1:]) + [1])
    plt.tick_params(axis='both', which='major', labelsize=18)
    plt.legend(['TestCV'], fontsize=18, loc=2)
    plt.savefig('{}/{}:{}_ARTM_smart.eps'.format(data_dir, label, n_dis))
    plt.show()
    return mean_res
Example #25
    def get_diameter(self):
        self.get_chi2('Diameter', self.diameter.values, np.array([0.05, 0.05, 0.05]))
        # inverse-variance weights from a common 0.05 measurement error
        all_diameter = DescrStatsW(self.diameter, weights=1 / np.array([0.05, 0.05, 0.05])**2)
        self.diameter_mean = all_diameter.mean
        self.diameter_mean_err = all_diameter.std / np.sqrt(len(self.diameter))
        self.diameter_std = all_diameter.std
        print(f'The diameter of the {self.ball} is {1000*self.diameter_mean:.2f} +- {1000*self.diameter_mean_err:.2f}')
Example #26
def get_lower_upper_CI(scores):
    if scores.shape[1] > 1:  #then 2-D
        lower_bound, upper_bound = DescrStatsW(
            scores.T).tconfint_mean() - scores.mean(axis=1)
    else:
        lower_bound, upper_bound = 0, 0
    return abs(lower_bound)
def globaldepth(coverage_hist):

    coverage_hist['cumsum'] = 1 - coverage_hist.frequency.cumsum()
    weighted_stats = DescrStatsW(coverage_hist.DP - 1,
                                 weights=coverage_hist.BPs,
                                 ddof=0)

    global_depth = {}
    global_depth.update({'mean_DP': round(weighted_stats.mean, signif)})
    global_depth.update({'median_DP': weighted_stats.quantile(0.5).values[0]})
    global_depth.update({'std_DP': round(weighted_stats.std, signif)})
    global_depth.update({'q25_DP': weighted_stats.quantile(0.25).values[0]})
    global_depth.update({'q75_DP': weighted_stats.quantile(0.75).values[0]})
    global_depth.update({'q95_DP': weighted_stats.quantile(0.95).values[0]})

    global_depth.update(
        {'dp>=1': round(depth_fraction(coverage_hist, thr=1), signif)})
    global_depth.update(
        {'dp>=10': round(depth_fraction(coverage_hist, thr=10), signif)})
    global_depth.update(
        {'dp>=20': round(depth_fraction(coverage_hist, thr=20), signif)})
    global_depth.update(
        {'dp>=30': round(depth_fraction(coverage_hist, thr=30), signif)})
    global_depth.update(
        {'dp>=50': round(depth_fraction(coverage_hist, thr=50), signif)})
    global_depth.update(
        {'dp>=100': round(depth_fraction(coverage_hist, thr=100), signif)})
    return (global_depth)
Example #28
    def test_comparemeans_convenient_interface(self):
        x1_2d, x2_2d = self.x1_2d, self.x2_2d
        d1 = DescrStatsW(x1_2d)
        d2 = DescrStatsW(x2_2d)
        cm1 = CompareMeans(d1, d2)

        # smoke test for summary
        from statsmodels.iolib.table import SimpleTable
        for use_t in [True, False]:
            for usevar in ['pooled', 'unequal']:
                smry = cm1.summary(use_t=use_t, usevar=usevar)
                assert_(isinstance(smry, SimpleTable))

        # test for from_data method
        cm2 = CompareMeans.from_data(x1_2d, x2_2d)
        assert_(str(cm1.summary()) == str(cm2.summary()))
Example #29
def localdepth(coverage_hist):

    coverage_hist['cumsum'] = 1 - coverage_hist.frequency.cumsum()
    weighted_stats = DescrStatsW(coverage_hist.DP - 1,
                                 weights=coverage_hist.BPs,
                                 ddof=0)

    local_depth = {}
    local_depth.update({'mean_DP': round(weighted_stats.mean, signif)})
    #local_depth.update({'median_DP':weighted_stats.quantile(0.5).values[0]})
    local_depth.update({'std_DP': round(weighted_stats.std, signif)})

    local_depth.update(
        {'dp>=1': (round(depth_fraction(coverage_hist, thr=1), signif)) * 100})
    local_depth.update(
        {'dp>=5': (round(depth_fraction(coverage_hist, thr=5), signif)) * 100})
    local_depth.update({
        'dp>=10': (round(depth_fraction(coverage_hist, thr=10), signif)) * 100
    })
    local_depth.update({
        'dp>=20': (round(depth_fraction(coverage_hist, thr=20), signif)) * 100
    })
    local_depth.update({
        'dp>=30': (round(depth_fraction(coverage_hist, thr=30), signif)) * 100
    })
    #local_depth.update({'mean_DP':round(weighted_stats.mean,signif)})

    #local_depth.update({'dp>=50':round(depth_fraction(coverage_hist,thr=50),signif)})
    #local_depth.update({'dp>=100':round(depth_fraction(coverage_hist,thr=100),signif)})
    return pd.Series(local_depth)
Example #30
def _similarity_helper_2(user1_id, user2_id, user1_vector, solr):
    query = 'doc_type:score AND users:({} AND {})'.format(user1_id, user2_id)
    solr.delete(q=query)

    user2_vector, depth_vector = get_vector_tf_idf(user2_id)
    data = column_stack((user1_vector, user2_vector))

    # result = DescrStatsW(data).corrcoef[1][0]
    result2 = DescrStatsW(data, weights=depth_vector).corrcoef[1][0]

    mutual_friends = get_mutual_friends(user1_id, user2_id, solr)

    new_score = [{
        'doc_type': 'score',
        'users': [user1_id, user2_id],
        'similarity': result2,
        'mutual_friends': mutual_friends,
        'friends_count': len(mutual_friends)
    }]

    solr.add(new_score)
    solr.commit()

    # print('Pearson similarity ' + str(user2_id) + ' ' + str(result))
    print('DescrStatsW ' + str(user2_id) + ' ' + str(result2))