def globaldepth(coverage_hist):
    """Weighted global depth-of-coverage statistics from a depth histogram."""
    # fraction of bases covered at depth greater than each DP value
    coverage_hist['cumsum'] = 1 - coverage_hist.frequency.cumsum()
    weighted_stats = DescrStatsW(coverage_hist.DP - 1,
                                 weights=coverage_hist.BPs,
                                 ddof=0)

    global_depth = {}
    global_depth.update({'mean_DP': round(weighted_stats.mean, signif)})
    global_depth.update({'median_DP': weighted_stats.quantile(0.5).values[0]})
    global_depth.update({'std_DP': round(weighted_stats.std, signif)})
    global_depth.update({'q25_DP': weighted_stats.quantile(0.25).values[0]})
    global_depth.update({'q75_DP': weighted_stats.quantile(0.75).values[0]})
    global_depth.update({'q95_DP': weighted_stats.quantile(0.95).values[0]})

    global_depth.update(
        {'dp>=1': round(depth_fraction(coverage_hist, thr=1), signif)})
    global_depth.update(
        {'dp>=10': round(depth_fraction(coverage_hist, thr=10), signif)})
    global_depth.update(
        {'dp>=20': round(depth_fraction(coverage_hist, thr=20), signif)})
    global_depth.update(
        {'dp>=30': round(depth_fraction(coverage_hist, thr=30), signif)})
    global_depth.update(
        {'dp>=50': round(depth_fraction(coverage_hist, thr=50), signif)})
    global_depth.update(
        {'dp>=100': round(depth_fraction(coverage_hist, thr=100), signif)})
    return global_depth
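
A minimal sketch to exercise the snippet, with stand-ins for the undefined `signif` precision and `depth_fraction` helper (both are assumptions, not part of the original):

import pandas as pd
from statsmodels.stats.weightstats import DescrStatsW

signif = 2  # assumed rounding precision (not defined in the snippet)

def depth_fraction(coverage_hist, thr=1):
    # assumed helper: fraction of base pairs covered at depth >= thr
    covered = coverage_hist.loc[coverage_hist.DP >= thr, 'BPs'].sum()
    return covered / coverage_hist.BPs.sum()

hist = pd.DataFrame({'DP': [1, 2, 11, 21, 31],
                     'BPs': [100, 400, 300, 150, 50]})
hist['frequency'] = hist.BPs / hist.BPs.sum()
print(globaldepth(hist))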
Example #2
def trades_to_bar(ticks: pd.DataFrame, bar_trigger: str='fixed') -> dict:
    """Aggregate a sequence of trades into a single OHLC-style bar dict."""
    if not isinstance(ticks, pd.DataFrame):
        ticks = pd.DataFrame(ticks)
    
    bar = {'bar_trigger': bar_trigger}
    # time
    bar['open_at'] = ticks['utc_dt'].iloc[0]
    bar['close_at'] = ticks['utc_dt'].iloc[-1]
    bar['duration_td'] = bar['close_at'] - bar['open_at']
    # volume
    bar['tick_count'] = ticks.shape[0]
    bar['volume'] = ticks.volume.sum()
    bar['dollars'] = (ticks.volume * ticks.price).sum()
    # price
    bar['price_open'] = ticks.price.values[0]
    bar['price_close'] = ticks.price.values[-1]
    bar['price_low'] = ticks.price.min()
    bar['price_high'] = ticks.price.max()
    bar['price_range'] = bar['price_high'] - bar['price_low']
    bar['price_return'] = bar['price_close'] - bar['price_open']
    # volume weighted price
    dsw = DescrStatsW(data=ticks.price, weights=ticks.volume)
    qtiles = dsw.quantile(probs=[0.1, 0.5, 0.9]).values
    bar['price_wq10'] = qtiles[0]
    bar['price_wq50'] = qtiles[1]
    bar['price_wq90'] = qtiles[2]
    bar['price_wq_range'] = bar['price_wq90'] - bar['price_wq10']
    bar['price_wmean'] = dsw.mean
    bar['price_wstd'] = dsw.std
    # jma
    bar['jma_open'] = ticks.jma.values[0]
    bar['jma_close'] = ticks.jma.values[-1]
    bar['jma_low'] = ticks.jma.min()
    bar['jma_high'] = ticks.jma.max()
    bar['jma_range'] = bar['jma_high'] - bar['jma_low']
    bar['jma_return'] = bar['jma_close'] - bar['jma_open']
    # volume weighted jma
    dsw = DescrStatsW(data=ticks.jma, weights=ticks.volume)
    qtiles = dsw.quantile(probs=[0.1, 0.5, 0.9]).values
    bar['jma_wq10'] = qtiles[0]
    bar['jma_wq50'] = qtiles[1]
    bar['jma_wq90'] = qtiles[2]
    bar['jma_wq_range'] = bar['jma_wq90'] - bar['jma_wq10']
    bar['jma_wmean'] = dsw.mean
    bar['jma_wstd'] = dsw.std
    # tick/vol/dollar/imbalance
    bar['tick_imbalance'] = ticks.side.sum()
    bar['volume_imbalance'] = (ticks.volume * ticks.side).sum()
    bar['dollar_imbalance'] = (ticks.volume * ticks.price * ticks.side).sum()

    return bar
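
A toy invocation, assuming the tick frame carries the `utc_dt`, `price`, `jma`, `volume`, and `side` columns the function reads (the values are made up):

import pandas as pd
from statsmodels.stats.weightstats import DescrStatsW

ticks = pd.DataFrame({
    'utc_dt': pd.to_datetime(['2021-01-04 14:30:00',
                              '2021-01-04 14:30:01',
                              '2021-01-04 14:30:03']),
    'price': [100.0, 100.5, 100.2],
    'jma': [100.0, 100.3, 100.25],  # smoothed price series
    'volume': [200, 100, 300],
    'side': [1, 1, -1],             # trade sign: +1 buy, -1 sell
})
bar = trades_to_bar(ticks)
print(bar['price_wmean'], bar['duration_td'])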
def title_len_stat(mongo_db):
    len_counter_db = collections.Counter()
    len_counter_cr = collections.Counter()
    for col_name in mongo_db.collection_names():
        if col_name not in PAPER_COLLECTIONS:
            continue
        col = mongo_db[col_name]
        query_w_doi = col.find({'doi': {'$exists': True}})
        for doc in query_w_doi:
            if ('metadata' in doc and 'title' in doc['metadata']
                    and isinstance(doc['metadata']['title'], str)):
                len_counter_db[len(doc['metadata']['title'])] += 1
            if ('crossref_raw_result' in doc
                    and 'title' in doc['crossref_raw_result']
                    and isinstance(doc['crossref_raw_result']['title'], list)
                    and len(doc['crossref_raw_result']['title']) == 1):
                len_counter_cr[len(
                    doc['crossref_raw_result']['title'][0])] += 1

    # stat for db titles
    sorted_len = sorted(len_counter_db.keys())
    weights = [len_counter_db[l] for l in sorted_len]
    weighted_stats = DescrStatsW(sorted_len, weights=weights)
    sns.barplot(x=sorted_len, y=weights)
    percentile = weighted_stats.quantile(probs=[
        0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.1, 0.25, 0.5, 0.75, 0.95, 0.97, 0.99
    ])
    print('len_counter_db')
    pprint(len_counter_db)
    print('weighted_stats.mean', weighted_stats.mean)
    print('weighted_stats.std', weighted_stats.std)
    print('percentile')
    print(percentile)

    # stat for cr titles
    sorted_len = sorted(len_counter_cr.keys())
    weights = [len_counter_cr[l] for l in sorted_len]
    weighted_stats = DescrStatsW(sorted_len, weights=weights)
    #     sns.barplot(sorted_len, weights)
    percentile = weighted_stats.quantile(probs=[
        0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.1, 0.25, 0.5, 0.75, 0.95, 0.97, 0.99
    ])
    print('len_counter_cr')
    pprint(len_counter_cr)
    print('weighted_stats.mean', weighted_stats.mean)
    print('weighted_stats.std', weighted_stats.std)
    print('percentile')
    print(percentile)

    return len_counter_db, len_counter_cr
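
The Counter-to-DescrStatsW pattern above works for any length distribution; a standalone toy version:

import collections
from statsmodels.stats.weightstats import DescrStatsW

lengths = collections.Counter([12, 15, 15, 20, 20, 20])
keys = sorted(lengths)
stats = DescrStatsW(keys, weights=[lengths[k] for k in keys])
print(stats.mean, stats.std, stats.quantile([0.25, 0.5, 0.75]))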
def weighted_percentiles(data, weights, percentiles):
    """Return the weighted percentiles.

    Args:
      data (np.ndarray) : Bin variable (e.g. temperature, salinity)
      weights (np.ndarray): Weights (e.g. cell volume, area)
      percentiles (np.ndarray): Array of requested percentiles (e.g. 0-1 by 0.01)

    """

    assert percentiles.max() <= 1.0
    assert percentiles.min() >= 0.0

    wq = DescrStatsW(data=data, weights=weights)
    bin_edges = wq.quantile(probs=percentiles, return_pandas=False)

    # manual method does not give a clean results...
    #ix = np.argsort(data)
    #data = data[ix] # sort data
    #weights = weights[ix] # sort weights
    #cdf = (np.cumsum(weights) - 0.5 * weights) / np.sum(weights) # 'like' a CDF function
    #perc = np.arange(0, 1.01, 0.01)
    #test2 = np.interp(perc, cdf, data)

    return bin_edges
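
A quick check on toy data (values chosen arbitrarily):

import numpy as np
from statsmodels.stats.weightstats import DescrStatsW

data = np.array([1.0, 2.0, 3.0, 4.0])
weights = np.array([1.0, 1.0, 2.0, 1.0])
print(weighted_percentiles(data, weights, np.array([0.25, 0.5, 0.75])))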
def globaldepth(coverage_hist):

    coverage_hist['cumsum'] = 1 - coverage_hist.frequency.cumsum()

    weighted_stats = DescrStatsW(coverage_hist.DP - 1,
                                 weights=coverage_hist.BPs,
                                 ddof=0)
    # How does this differ from coverage_hist.DP.mean()? The weighted mean
    # counts each depth once per base pair (BPs), not once per histogram row.

    global_depth = {}
    b, bases_20x, depth_20X = depth_fraction(coverage_hist, thr=20)
    global_depth.update({'bases_totales': int(b)})

    global_depth.update({'mean_DP': round(weighted_stats.mean, signif)})
    global_depth.update({'median_DP': weighted_stats.quantile(0.5).values[0]})
    #global_depth.update({'std_DP':round(weighted_stats.std,signif)})
    #global_depth.update({'q25_DP':weighted_stats.quantile(0.25).values[0]})
    #global_depth.update({'q75_DP':weighted_stats.quantile(0.75).values[0]})
    #global_depth.update({'q95_DP':weighted_stats.quantile(0.95).values[0]})

    #global_depth.update({'dp>=1':round(depth_fraction(coverage_hist,thr=1),signif)})
    #global_depth.update({'dp>=10':round(depth_fraction(coverage_hist,thr=10),signif)})
    global_depth.update({'bases_20X': int(bases_20x)})
    #global_depth.update({'bases_20X(%)': 100 * (bases_20x / b)})
    global_depth.update({'dp>=20': round(depth_20X, 3)})

    #global_depth.update({'dp>=20':round(depth_fraction(coverage_hist,thr=20),signif)})
    #global_depth.update({'dp>=30':round(depth_fraction(coverage_hist,thr=30),signif)})
    #global_depth.update({'dp>=50':round(depth_fraction(coverage_hist,thr=50),signif)})
    #global_depth.update({'dp>=100':round(depth_fraction(coverage_hist,thr=100),signif)})

    return global_depth
def describe_cluster(cluster, columns):
    """ Generate descriptive statistics for a cluster

    Parameters:
        cluster (DataFrame): A dataframe that contains density information for every bin in the cluster
        columns (list of string): The names of the columns for which to generate statistics

    Returns: 
        Series: All statistics for the selected columns
    """

    # restrict stats to the requested columns so positional indices line up
    values = cluster[columns].values
    dstats = DescrStatsW(
        values, cluster["DENSITY"].values if len(values) > 1 else None)
    mean = dstats.mean
    std = dstats.std
    quantiles = dstats.quantile(0.5, return_pandas=False)

    result_columns = [[
        mean[i],
        std[i],
        std[i] / abs(mean[i]) * 100,
        cluster[columns[i]].min(),
        quantiles[0][i],
        cluster[columns[i]].max(),
    ] for i in range(len(columns))]
    result = list(itertools.chain(*result_columns)) + [
        cluster["DENSITY"].count(),
        cluster["DENSITY"].sum() * 100,
    ]

    value_columns = [[
        (col, "mean"),
        (col, "std"),
        (col, "varC (%)"),
        (col, "min"),
        (col, "median"),
        (col, "max"),
    ] for col in columns]
    index = list(itertools.chain(*value_columns)) + [
        ("DENSITY", "count"),
        ("DENSITY", "total"),
    ]

    return pd.Series(result, index=pd.MultiIndex.from_tuples(index))
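
A self-contained toy cluster to show the shape of the output (the column names here are made up):

import itertools
import pandas as pd
from statsmodels.stats.weightstats import DescrStatsW

cluster = pd.DataFrame({'TEMP': [1.0, 2.0, 4.0],
                        'PRESSURE': [10.0, 20.0, 40.0],
                        'DENSITY': [0.25, 0.50, 0.25]})
print(describe_cluster(cluster, ['TEMP', 'PRESSURE']))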
Example #7
def output_new_bar(state):
    new_bar = {}
    if state['tick_count'] == 0:
        return new_bar
    new_bar['bar_trigger'] = state['trigger_yet?!']
    # time
    new_bar['open_epoch'] = state['trades']['epoch'][0]
    new_bar['close_epoch'] = state['trades']['epoch'][-1]
    new_bar['open_at'] = pd.to_datetime(state['trades']['epoch'][0], unit='ns')
    new_bar['close_at'] = pd.to_datetime(state['trades']['epoch'][-1], unit='ns')
    new_bar['duration_dt'] = new_bar['close_at'] - new_bar['open_at']    
    new_bar['duration_sec'] = state['duration_sec']
    new_bar['duration_min'] = new_bar['duration_sec'] / 60
    # price
    new_bar['price_open'] = state['trades']['price'][0]
    new_bar['price_close'] = state['trades']['price'][-1]
    new_bar['price_low'] = state['price_min']
    new_bar['price_high'] = state['price_max']
    new_bar['price_mean'] = np.array(state['trades']['price']).mean() 
    new_bar['price_std'] = np.array(state['trades']['price']).std()
    new_bar['price_q10'] = np.quantile(state['trades']['price'], q=0.1)
    new_bar['price_q50'] = np.quantile(state['trades']['price'], q=0.5)
    new_bar['price_q90'] = np.quantile(state['trades']['price'], q=0.9)
    new_bar['price_range'] = state['price_range']
    new_bar['bar_return'] = state['bar_return']
    # volume weighted price
    dsw = DescrStatsW(data=state['trades']['price'], weights=state['trades']['volume'])
    qtiles = dsw.quantile(probs=[0.1, 0.5, 0.9]).values
    new_bar['price_wq10'] = qtiles[0]
    new_bar['price_wq50'] = qtiles[1]
    new_bar['price_wq90'] = qtiles[2]
    new_bar['price_wmean'] = dsw.mean
    new_bar['price_wstd'] = dsw.std
    # tick/vol/dollar/imbalance
    new_bar['tick_count'] = state['tick_count']
    new_bar['volume_sum'] = state['volume_sum']
    new_bar['dollar_sum'] = state['dollar_sum']
    new_bar['tick_imbalance'] = state['tick_imbalance']
    new_bar['volume_imbalance'] = state['volume_imbalance']
    new_bar['dollar_imbalance'] = state['dollar_imbalance']
    new_bar['tick_imbalance_run'] = state['tick_run']
    new_bar['volume_imbalance_run'] = state['volume_run']
    new_bar['dollar_imbalance_run'] = state['dollar_run']
    return new_bar
def abs_len_stat(mongo_db):
    len_counter = collections.Counter()
    for col_name in mongo_db.collection_names():
        if col_name not in PAPER_COLLECTIONS:
            continue
        col = mongo_db[col_name]
        query_w_doi = col.find({'abstract': {'$exists': True}})
        for doc in query_w_doi:
            # get abstract
            abstract = None
            if 'abstract' in doc and len(doc['abstract']) > 0:
                abstract = ''
                for fragment in doc['abstract']:
                    if ('text' in fragment
                            and isinstance(fragment['text'], str)
                            and len(fragment['text']) > 0):
                        abstract += fragment['text'].strip() + ' '

                abstract = abstract.strip()
                if len(abstract) == 0:
                    abstract = None

            if abstract is not None:
                #                 print(abstract)
                len_counter[len(abstract)] += 1

    # stat for db abs
    sorted_len = sorted(len_counter.keys())
    weights = [len_counter[l] for l in sorted_len]
    weighted_stats = DescrStatsW(sorted_len, weights=weights)
    sns.barplot(x=sorted_len, y=weights)
    percentile = weighted_stats.quantile(probs=[
        0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.1, 0.25, 0.5, 0.75, 0.95, 0.97, 0.99
    ])
    print('len_counter')
    pprint(len_counter)
    print('weighted_stats.mean', weighted_stats.mean)
    print('weighted_stats.std', weighted_stats.std)
    print('percentile')
    print(percentile)

    return len_counter
Example #9
    def update_thrs(self):
        thrs = {}
        # firstly compute and collect thresholds
        for vs_type in self.vs_types:
            if self.require_wp_vs_others:  # apply loosest WP from the previous iteration
                taus = self.apply_wp_vs_others(vs_type)
            else:
                taus = self._taus
            weighted_score = DescrStatsW(
                data=np.array(taus[f'score_vs_{vs_type}'], dtype=np.float32),
                weights=np.array(taus['weight'], dtype=np.float32))
            thrs[vs_type] = weighted_score.quantile(probs=1 - self.tpr,
                                                    return_pandas=False)

        # then update them in the class
        for vs_type, WPs in self.wp_definitions.items():
            for wp_cfg in WPs.values():
                idx = (self.tpr >= wp_cfg["eff"]).argmax()
                wp_cfg['thrs'].append(thrs[vs_type][idx])
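
The core operation in isolation: the threshold that keeps a target efficiency `tpr` of the weighted score distribution is its weighted (1 - tpr) quantile. A sketch with synthetic scores and unit weights:

import numpy as np
from statsmodels.stats.weightstats import DescrStatsW

scores = np.random.default_rng(0).uniform(size=1000).astype(np.float32)
weights = np.ones_like(scores)
tpr = np.array([0.9, 0.7, 0.5])  # target efficiencies
thrs = DescrStatsW(data=scores, weights=weights).quantile(
    probs=1 - tpr, return_pandas=False)
print(thrs)  # cutting at thrs[i] keeps ~tpr[i] of the weight above it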
Example #10
def state_to_bar(state: dict) -> dict:
    """Convert accumulated tick state into a bar dict (empty if under 11 ticks)."""
    new_bar = {}
    if state['stat']['tick_count'] < 11:
        return new_bar

    new_bar['bar_trigger'] = state['bar_trigger']
    # time
    new_bar['open_at'] = state['trades']['utc_dt'][0]
    new_bar['close_at'] = state['trades']['utc_dt'][-1]
    new_bar['duration_td'] = new_bar['close_at'] - new_bar['open_at']
    # volume
    new_bar['tick_count'] = state['stat']['tick_count']
    new_bar['volume'] = state['stat']['volume']
    new_bar['dollars'] = state['stat']['dollars']
    # price
    new_bar['price_open'] = state['trades']['price'][0]
    new_bar['price_close'] = state['trades']['price'][-1]
    new_bar['price_low'] = state['stat']['price_min']
    new_bar['price_high'] = state['stat']['price_max']
    new_bar['price_range'] = state['stat']['price_range']
    new_bar['price_return'] = state['stat']['price_return']
    # volume weighted price
    dsw = DescrStatsW(data=state['trades']['price'], weights=state['trades']['volume'])
    qtiles = dsw.quantile(probs=[0.1, 0.5, 0.9]).values
    new_bar['price_wq10'] = qtiles[0]
    new_bar['price_wq50'] = qtiles[1]
    new_bar['price_wq90'] = qtiles[2]
    new_bar['price_wq_range'] = new_bar['price_wq90'] - new_bar['price_wq10']
    new_bar['price_wmean'] = dsw.mean
    new_bar['price_wstd'] = dsw.std
    # jma
    new_bar['jma_open'] = state['trades']['jma'][0]
    new_bar['jma_close'] = state['trades']['jma'][-1]
    new_bar['jma_low'] = state['stat']['jma_min']
    new_bar['jma_high'] = state['stat']['jma_max']
    new_bar['jma_range'] = state['stat']['jma_range']
    new_bar['jma_return'] = state['stat']['jma_return']
    # volume weighted jma
    dsw = DescrStatsW(data=state['trades']['jma'], weights=state['trades']['volume'])
    qtiles = dsw.quantile(probs=[0.1, 0.5, 0.9]).values
    new_bar['jma_wq10'] = qtiles[0]
    new_bar['jma_wq50'] = qtiles[1]
    new_bar['jma_wq90'] = qtiles[2]
    new_bar['jma_wq_range'] = new_bar['jma_wq90'] - new_bar['jma_wq10']
    new_bar['jma_wmean'] = dsw.mean
    new_bar['jma_wstd'] = dsw.std
    # tick/vol/dollar/imbalance
    new_bar['tick_imbalance'] = state['stat']['tick_imbalance']
    new_bar['volume_imbalance'] = state['stat']['volume_imbalance']
    new_bar['dollar_imbalance'] = state['stat']['dollar_imbalance']
    if False:  # disabled sanity checks recomputing stats from raw trades
        new_bar['n_tick_count'] = len(state['trades']['price'])
        new_bar['n_volume'] = sum(state['trades']['volume'])
        new_bar['n_dollars'] = new_bar['price_wq50'] * new_bar['volume']
        new_bar['n_tick_imbalance'] = sum(state['trades']['side'])
        new_bar['n_open_at'] = state['trades']['utc_dt'][0]
        new_bar['n_close_at'] = state['trades']['utc_dt'][-1]
        # new_bar['n_volume_imbalance'] = 
        # new_bar['n_dollar_imbalance'] = 

    return new_bar
def test_weightstats_len_1():
    x1 = [1]
    w1 = [1]
    d1 = DescrStatsW(x1, w1)
    assert (d1.quantile([0.0, 0.5, 1.0]) == 1).all()
Example #12
def weighted(x):
    stats = DescrStatsW(x["quantity"], x["sold"])
    return {"median": stats.quantile(0.5)[0.5], "std": stats.std}
Example #13
def tabulate_march_inequality(year):
    """
    #
    For years 1964-2009 (year is March year, not earnings year), tabulate:

    These inequality metrics:

    - 90/50, 50/10, 90/10, Vln
    - 60/50, 70/50, 80/50, 95/50, 97/50
    - 50/3, 50/5, 50/20, 50/30, 50/40

    For these samples

    - Males
    - Females
    - Both

    For these wage measures

    - All hourly

    For these conditioning variables

    - raw wage inequality
    - residual wage inequality

    Also note:

    - Always dropping allocators where possible

    D. Autor, 2/24/2004
    D. Autor, 6/15/2004 - Updated for consistency of controls for quantile simulation methods
    M. Anderson, 12/13/2005 - Updated for new quantiles and years
    D. Autor, 9/5/2006. Updated for 2005 March
    M. Wasserman, 10/14/2009 Updated for 2007/8 March
    #
    """

    df = tabulate_march_basic(year)
    df = df.eval("""
            lnwinc = log(winc_ws) + log(gdp)
            lnhinc = log(hinc_ws) + log(gdp)
        """)

    # Full-time and hourly samples
    df = df.eval("ftfy = fulltime*fullyear")
    df.ftfy.describe().to_frame().T
    df = df.eval("""
            ftsamp = (lnwinc == lnwinc) * ftfy * abs(bcwkwgkm-1)
            hrsamp = (lnhinc == lnhinc) * abs(bchrwgkm-1)
        """)
    # @ ftsamp: weekly real wage not none + ftfy + above weekly real wage limit
    # @ hrsamp: hourly real wage not none + above hourly real wage limit

    df.loc[df.ftsamp == 0, "lnwinc"] = np.nan
    df.loc[df.hrsamp == 0, "lnhinc"] = np.nan
    df.query("ftsamp == 1")["lnwinc"].describe().to_frame().T
    df.query("hrsamp == 1")["lnhinc"].describe().to_frame().T
    df = df.query("ftsamp == 1 | hrsamp == 1")

    # Generate experience categories
    df = df.assign(expcat=(df.exp/3).astype(int) + 1)
    df.loc[df.expcat == 17, "expcat"] = 16
    assert df.eval("1<= expcat <= 16").all()

    df.groupby("expcat")["exp"].agg(["mean", "min", "max"])

    # interaction terms - 80 of these
    # @ move to residual wage part

    # Drop reference group's interaction term: HSG with 0-2 years of experience
    # @ similarly skip now

    df = df.filter(["year", "wgt", "wgt_hrs", "female", "lnwinc", "lnhinc", "hrsamp", "ftsamp", "edcat", "expcat"])

    ######################################################################
    # Summarize raw inequality
    ######################################################################

    pctiles = pd.Series([3, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 95, 97])
    pctiles_ = pctiles / 100
    tot_pct = pd.DataFrame(index=pctiles)
    tot_stat = pd.DataFrame(index=["mn", "vln"])

    dt = df.query("ftsamp==1")
    wq = DescrStatsW(data=dt.lnwinc, weights=dt.wgt)
    tot_pct["tot_ft_mf"] = wq.quantile(probs=pctiles_, return_pandas=False)
    tot_stat["tot_ft_mf"] = [wq.mean, wq.var]

    dt = df.query("ftsamp==1 & female==0")
    wq = DescrStatsW(data=dt.lnwinc, weights=dt.wgt)
    tot_pct["tot_ft_m"] = wq.quantile(probs=pctiles_, return_pandas=False)
    tot_stat["tot_ft_m"] = [wq.mean, wq.var]

    dt = df.query("ftsamp==1 & female==1")
    wq = DescrStatsW(data=dt.lnwinc, weights=dt.wgt)
    tot_pct["tot_ft_f"] = wq.quantile(probs=pctiles_, return_pandas=False)
    tot_stat["tot_ft_f"] = [wq.mean, wq.var]

    dt = df.query("hrsamp==1")
    wq = DescrStatsW(data=dt.lnhinc, weights=dt.wgt_hrs)
    tot_pct["tot_hr_mf"] = wq.quantile(probs=pctiles_, return_pandas=False)
    tot_stat["tot_hr_mf"] = [wq.mean, wq.var]

    dt = df.query("hrsamp==1 & female==0")
    wq = DescrStatsW(data=dt.lnhinc, weights=dt.wgt_hrs)
    tot_pct["tot_hr_m"] = wq.quantile(probs=pctiles_, return_pandas=False)
    tot_stat["tot_hr_m"] = [wq.mean, wq.var]

    dt = df.query("hrsamp==1 & female==1")
    wq = DescrStatsW(data=dt.lnhinc, weights=dt.wgt_hrs)
    tot_pct["tot_hr_f"] = wq.quantile(probs=pctiles_, return_pandas=False)
    tot_stat["tot_hr_f"] = [wq.mean, wq.var]

    df_stat = pd.concat([tot_stat, tot_pct], axis=0, sort=False)

    ######################################################################
    # Summarize residual inequality - Weekly & Hourly
    ######################################################################

    res_pct = pd.DataFrame(index=pctiles)
    res_stat = pd.DataFrame(index=["mn", "vln"])

    dt = df.query("ftsamp==1")
    y, X = dmatrices('lnwinc ~ female + C(edcat) : C(expcat) - 1', dt, return_type="dataframe")
    X = sm.add_constant(X.drop("C(edcat)[2]:C(expcat)[1]", axis=1))
    res = sm.WLS(y, X, weights=dt.wgt).fit()
    resid = res.resid
    wq = DescrStatsW(data=resid, weights=dt.wgt)
    res_stat["res_ft_mf"] = [wq.mean, wq.var]  # @ mean is not necessary but to be consistent
    res_pct["res_ft_mf"] = wq.quantile(probs=pctiles_, return_pandas=False)

    dt = df.query("ftsamp==1 & female==0")
    y, X = dmatrices('lnwinc ~ C(edcat) : C(expcat) - 1', dt, return_type="dataframe")
    X = sm.add_constant(X.drop("C(edcat)[2]:C(expcat)[1]", axis=1))
    res = sm.WLS(y, X, weights=dt.wgt).fit()
    resid = res.resid
    wq = DescrStatsW(data=resid, weights=dt.wgt)
    res_stat["res_ft_m"] = [wq.mean, wq.var]
    res_pct["res_ft_m"] = wq.quantile(probs=pctiles_, return_pandas=False)

    dt = df.query("ftsamp==1 & female==1")
    y, X = dmatrices('lnwinc ~ C(edcat) : C(expcat) - 1', dt, return_type="dataframe")
    X = sm.add_constant(X.drop("C(edcat)[2]:C(expcat)[1]", axis=1))
    res = sm.WLS(y, X, weights=dt.wgt).fit()
    resid = res.resid
    wq = DescrStatsW(data=resid, weights=dt.wgt)
    res_stat["res_ft_f"] = [wq.mean, wq.var]
    res_pct["res_ft_f"] = wq.quantile(probs=pctiles_, return_pandas=False)

    dt = df.query("hrsamp==1")
    y, X = dmatrices('lnhinc ~ female + C(edcat) : C(expcat) - 1', dt, return_type="dataframe")
    X = sm.add_constant(X.drop("C(edcat)[2]:C(expcat)[1]", axis=1))
    res = sm.WLS(y, X, weights=dt.wgt_hrs).fit()
    resid = res.resid
    wq = DescrStatsW(data=resid, weights=dt.wgt_hrs)
    res_stat["res_hr_mf"] = [wq.mean, wq.var]
    res_pct["res_hr_mf"] = wq.quantile(probs=pctiles_, return_pandas=False)

    dt = df.query("hrsamp==1 & female==0")
    y, X = dmatrices('lnhinc ~ C(edcat) : C(expcat) - 1', dt, return_type="dataframe")
    X = sm.add_constant(X.drop("C(edcat)[2]:C(expcat)[1]", axis=1))
    res = sm.WLS(y, X, weights=dt.wgt_hrs).fit()
    resid = res.resid
    wq = DescrStatsW(data=resid, weights=dt.wgt_hrs)
    res_stat["res_hr_m"] = [wq.mean, wq.var]
    res_pct["res_hr_m"] = wq.quantile(probs=pctiles_, return_pandas=False)

    dt = df.query("hrsamp==1 & female==1")
    y, X = dmatrices('lnhinc ~ C(edcat) : C(expcat) - 1', dt, return_type="dataframe")
    X = sm.add_constant(X.drop("C(edcat)[2]:C(expcat)[1]", axis=1))
    res = sm.WLS(y, X, weights=dt.wgt_hrs).fit()
    resid = res.resid
    wq = DescrStatsW(data=resid, weights=dt.wgt_hrs)
    res_stat["res_hr_f"] = [wq.mean, wq.var]
    res_pct["res_hr_f"] = wq.quantile(probs=pctiles_, return_pandas=False)

    df_stat_ = pd.concat([res_stat, res_pct], axis=0)
    df_stat = pd.concat([df_stat, df_stat_], axis=1)

    # march-ineq-data-`1'
    df_stat = df_stat.T.rename_axis('sample').reset_index().assign(year=year)  # @ tidy data

    ######################################################################
    # Percentiles of weekly earnings
    ######################################################################

    # @ simply generate more percentiles under full-time samples
    # @ note here year is march census year thus minus one to be earnings year

    pctiles = pd.Series(range(3, 98))
    pctiles_ = pctiles / 100
    tot_pct = pd.DataFrame(index=pctiles)

    dt = df.query("ftsamp==1")
    wq = DescrStatsW(data=dt.lnwinc, weights=dt.wgt)
    tot_pct["tot_ft_mf"] = wq.quantile(probs=pctiles_, return_pandas=False)

    dt = df.query("ftsamp==1 & female==0")
    wq = DescrStatsW(data=dt.lnwinc, weights=dt.wgt)
    tot_pct["tot_ft_m"] = wq.quantile(probs=pctiles_, return_pandas=False)

    dt = df.query("ftsamp==1 & female==1")
    wq = DescrStatsW(data=dt.lnwinc, weights=dt.wgt)
    tot_pct["tot_ft_f"] = wq.quantile(probs=pctiles_, return_pandas=False)

    # march-pctile-`yr'
    tot_pct = tot_pct.T.rename_axis('sample').reset_index().assign(year=year-1)  # @ tidy data

    # @ the code then combine 1963-2008 generated files
    # @ we remove this as not sure necessary
    # @ actually this part can be combined with #Summarize raw inequality#

    return df_stat, tot_pct
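
The ratio metrics listed in the docstring can then be read off the tidy `df_stat`, whose percentile columns are integers; a sketch (assuming the underlying March extracts are available to `tabulate_march_basic`):

df_stat, tot_pct = tabulate_march_inequality(1990)
p = df_stat.set_index('sample')
gap_90_50 = p[90] - p[50]  # upper-tail inequality, in log points
gap_50_10 = p[50] - p[10]  # lower-tail inequality
gap_90_10 = p[90] - p[10]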
Example #14
def getWeightedMeanQuantiles(feature, weights):
    weighted_stats = DescrStatsW(feature, weights=weights, ddof=0)
    return weighted_stats.mean, weighted_stats.quantile([0.25, 0.50, 0.75], return_pandas=False)
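
For example:

mean, (q25, q50, q75) = getWeightedMeanQuantiles([1, 2, 3, 4], [1, 1, 1, 7])
print(mean, q25, q50, q75)  # weighted mean is (1+2+3+4*7)/10 = 3.4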
Example #15
# weights : if None is selected or a blank value, then the algorithm will
#     replace with an array of 1's with length equal to the endog.
#     WARNING: Using weights is not verified yet for all possible options
#     and results, see Notes.

# In[182]:


df1 = pd.DataFrame({'x': range(1, 101), 'wt': range(1, 101)})

from statsmodels.stats.weightstats import DescrStatsW
wdf = DescrStatsW(df1.x, weights=df1.wt, ddof=1) 
print('without weight, the mean value is: ', np.mean(df1.x))
print( 'with weight, the mean value is: ', wdf.mean )
print( wdf.std )
print( wdf.quantile([0.25,0.50,0.75]) )
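# Sanity check: with x = 1..100 and w = x, the weighted mean is
# sum(x**2) / sum(x) = (100*101*201/6) / (100*101/2) = 201/3 = 67.0,
# versus the unweighted mean of 50.5.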


# In[201]:


# COVIDFOL_W66: How closely have you been following news about the outbreak of the coronavirus...
import statsmodels.api as sm
X = sm.add_constant(df[['F_INCOME', 'F_AGECAT', 'edu', 'republic', 'COVIDCOVER1_W66',
                        'MH_TRACK_a_W66', 'MH_TRACK_b_W66', 'MH_TRACK_c_W66',
                        'MH_TRACK_d_W66', 'MH_TRACK_e_W66']])
y = df['COVIDFOL_W66']
# freq_weights is a GLM argument; OLS does not accept it. For a weighted
# linear fit, use WLS with the survey weights instead.
reg = sm.WLS(y, X, weights=df['WEIGHT_W66'])
results = reg.fit()
reg1 = sm.GLM(y,X)
results1 = reg1.fit()
reg2 = sm.GLM(y,X, freq_weights=df['WEIGHT_W66'])
Example #16
# ## Compute global statistics

bases_without_reads = depth_fraction(coverage,ZeroDepth=True)
bases_greater1 = depth_fraction(coverage,thr=1)
bases_greater10 = depth_fraction(coverage,thr=10)
bases_greater20 = depth_fraction(coverage,thr=20)
bases_greater30 = depth_fraction(coverage,thr=30)

# use statistics weighted by the length of each constant-depth interval
weighted_stats = DescrStatsW(coverage['count'], weights=coverage.count_length, ddof=0)


global_depth={}
global_depth.update({'mean_DP':round(weighted_stats.mean,signif)})
global_depth.update({'median_DP':weighted_stats.quantile(0.5).values[0]})
global_depth.update({'std_DP':round(weighted_stats.std,signif)})
global_depth.update({'q25_DP':weighted_stats.quantile(0.25).values[0]})
global_depth.update({'q75_DP':weighted_stats.quantile(0.75).values[0]})
global_depth.update({'q95_DP':weighted_stats.quantile(0.95).values[0]})

global_depth.update({'dp>=1':bases_greater1})
global_depth.update({'dp>=10':bases_greater10})
global_depth.update({'dp>=20':bases_greater20})
global_depth.update({'dp>=30':bases_greater30})
#global_depth.update({'DP=0':round(bases_without_reads,signif)})

res = pd.Series(global_depth).to_frame()
res.columns = [sample]
global_statistics = res.loc[[u'dp>=1', u'dp>=10', u'dp>=20', u'dp>=30']]
def test_weightstats_2d_w1():
    x1 = [[1], [2]]
    w1 = [[1], [2]]
    d1 = DescrStatsW(x1, w1)
    assert (d1.quantile([0.5, 1.0]) == 2).all().all()
def test_weightstats_2d_w2():
    x1 = [[1]]
    w1 = [[1]]
    d1 = DescrStatsW(x1, w1)
    assert (d1.quantile([0, 0.5, 1.0]) == 1).all().all()
Example #19
    weights.append(1)  # last is last minutes before * 0.5

    #start analysis
    allPrint(latestFileHandler, "Analysis for " + item)
    allPrint(latestFileHandler, "=============" + "=" * len(item))
    allPrint(latestFileHandler, "Data count: " + str(len(times)))

    stats = DescrStatsW(prices, weights)
    allPrint(latestFileHandler,
             "Weighted Average Price: " + "{:,.2f}".format(stats.mean))
    allPrint(latestFileHandler,
             "Weighted Stdev: " + "{:,.2f}".format(stats.std))
    allPrint(latestFileHandler, "")
    allPrint(latestFileHandler, "Percentiles:")
    allPrint(latestFileHandler,
             "5% : " + "{:,.2f}".format(stats.quantile(0.05, False)[0]))
    allPrint(latestFileHandler,
             "15% : " + "{:,.2f}".format(stats.quantile(0.15, False)[0]))
    allPrint(latestFileHandler,
             "50% : " + "{:,.2f}".format(stats.quantile(0.50, False)[0]))
    allPrint(latestFileHandler,
             "85% : " + "{:,.2f}".format(stats.quantile(0.85, False)[0]))
    allPrint(latestFileHandler,
             "95% : " + "{:,.2f}".format(stats.quantile(0.95, False)[0]))
    allPrint(latestFileHandler, "")
    profits[item] = round(stats.quantile(tresholdPercentile, False)[0],
                          2) - round(prices[-1], 2)
    profitsPercent[item] = (
        round(stats.quantile(tresholdPercentile, False)[0], 2) -
        round(prices[-1], 2)) / prices[-1]
    allPrint(latestFileHandler,
Example #20
def describe_cluster(cluster_df, features, weight_column, oven_refills):
    """ Create the statistics for a cluster. Datapoints that are part of a breakdown
    period are excluded.

    Parameters
    ----------
    cluster_df : DataFrame
        A dataframe that contains all points of the cluster you want to describe.
    features : list of source features
        All source feature for which the statistics should be generated
    weight_column : string
        Name of the column to use for weighting data points, typically
        `datapoint_duration` (``ProcessingFeatures.DATAPOINT_DURATION``)
    oven_refills : list of timestamp
        End of the oven refill periods

    Returns
    -------
    Series
        A Series of the following statistics

        For each parameter in `features`:

            1. mean
            2. std
            3. std% (std in percent of mean)
            4. avg_dev (average deviation of mean)
            5. min
            6. 25% (lower quartile)
            7. median
            8. 75% (upper quartile)
            9. max

        Once for the cluster:

            10. Density/count (number of data points in the cluster)
            11. Duration/in_hours (total duration of cluster)
            12. Duration/longest (duration of longest fragment)
            13. Duration/num_splits (number of fragments)
            14. Refill/index (index of oven refill that came directly before 
                the beginning of the longest fragment)
            15. Refill/delta_in_hours (delta from the end of the closest oven refill)
            16. num_breakdowns/per_hour (number of breakdowns per hour)
    """

    values = ["mean", "std", "std%", "avg_dev", "min", "25%", "median", "75%", "max"]
    index = pd.MultiIndex.from_tuples(
        [(p, v) for p in features for v in values]
        + [
            ("DENSITY", "count"),
            ("DURATION", "in_hours"),
            ("DURATION", "longest_in_hours"),
            ("DURATION", "num_splits"),
            ("REFILL", "index"),
            ("REFILL", "delta_in_hours"),
            ("num_breakdowns", "per_hour"),
        ]
    )

    data = cluster_df.loc[
        (cluster_df[ProcessingFeatures.HT_VOLTAGE_BREAKDOWN] == 0), features
    ].values  # TODO maybe only include non breakdown here???
    weights = cluster_df.loc[
        (cluster_df[ProcessingFeatures.HT_VOLTAGE_BREAKDOWN] == 0), weight_column
    ].values
    if data.size == 0:
        return None

    stats = DescrStatsW(data, weights, ddof=1)

    mean = np.array(stats.mean)  # np.mean(data, axis=0)
    std = np.array(stats.std)  # np.std(data, axis=0)
    quantiles = stats.quantile([0, 0.25, 0.5, 0.75, 1], return_pandas=False)
    # np.quantile(data, [0, 0.25, 0.5, 0.75, 1], axis=0)
    avg_dev = np.dot(weights, np.absolute(data - mean)) / np.sum(weights)

    count = len(data)

    duration_in_seconds = cluster_df[ProcessingFeatures.DATAPOINT_DURATION].sum()
    duration_in_hours = duration_in_seconds / 3600

    (
        duration_longest_start,
        duration_longest,
        duration_num_splits,
    ) = get_cluster_duration(cluster_df, weight_column)
    duration_longest /= 3600

    closest_refill = None
    for i, refill in reversed(list(enumerate(oven_refills))):
        if duration_longest_start > refill:
            closest_refill = i
            break

    refill_delta = -1
    if closest_refill is not None:
        refill_delta = (
            pd.Timestamp(duration_longest_start) - oven_refills[closest_refill]
        ).total_seconds() / 3600

    description = [
        [
            mean[i],
            std[i],
            np.abs(std[i] / mean[i]) * 100,
            avg_dev[i],
            quantiles[0][i],
            quantiles[1][i],
            quantiles[2][i],
            quantiles[3][i],
            quantiles[4][i],
        ]
        for i in range(len(features))
    ]
    description = [item for sublist in description for item in sublist]
    description.append(count)
    description.append(duration_in_hours)
    description.append(duration_longest)
    description.append(duration_num_splits)

    description.append(closest_refill)
    description.append(refill_delta)

    description.append(
        cluster_df.loc[
            cluster_df[ProcessingFeatures.HT_SPARKS_COUNTER] > 0,
            ProcessingFeatures.HT_SPARKS_COUNTER,
        ].nunique()
        / duration_in_hours
    )

    return pd.Series(description, index=index)
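
The weighted average-deviation step above, isolated on toy data:

import numpy as np
from statsmodels.stats.weightstats import DescrStatsW

data = np.array([[1.0, 10.0], [2.0, 20.0], [4.0, 40.0]])
weights = np.array([1.0, 2.0, 1.0])
stats = DescrStatsW(data, weights, ddof=1)
# weighted mean absolute deviation per column, as in avg_dev above
avg_dev = np.dot(weights, np.absolute(data - stats.mean)) / np.sum(weights)
print(stats.mean, avg_dev)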