Example #1
 def summary_df(self, thresholds=None, lower_quantile=None, upper_quantile=None):
     """
     Calculates the pair of metrics for each threshold for each result.
     """
     if thresholds is None:
         thresholds = self.thresholds
     if lower_quantile is None:
         lower_quantile = self.config['lower_quantile']
     if upper_quantile is None:
         upper_quantile = self.config['upper_quantile']
     
     if self.n_current_results > self.n_cached_curves:
         # If there are new curves, recompute
         colnames = ['_'.join([metric, stat])
                     for metric in [self.metric1.name, self.metric2.name] 
                     for stat in ['Mean', 'Median',
                                  '%d_Percentile' % (100*lower_quantile),
                                  '%d_Percentile' % (upper_quantile*100)]]
         self.ret = pd.DataFrame(columns=colnames, index=thresholds, dtype='float64')
         
         for threshold in thresholds:
             m1s = Series([self.metric1.score(result, threshold) for result in self.results])
             m2s = Series([self.metric2.score(result, threshold) for result in self.results])
         self.ret.loc[threshold] = (m1s.mean(), m1s.quantile(.5),
                                    m1s.quantile(lower_quantile), m1s.quantile(upper_quantile),
                                    m2s.mean(), m2s.quantile(.5),
                                    m2s.quantile(lower_quantile), m2s.quantile(upper_quantile))
     return self.ret
Example #2
    def test_nanmean_overflow(self):
        # GH 10155
        # In the previous implementation mean can overflow for int dtypes, it
        # is now consistent with numpy
        from pandas import Series

        # numpy < 1.9.0 is not computing this correctly
        from distutils.version import LooseVersion
        if LooseVersion(np.__version__) >= '1.9.0':
            for a in [2 ** 55, -2 ** 55, 20150515061816532]:
                s = Series(a, index=range(500), dtype=np.int64)
                result = s.mean()
                np_result = s.values.mean()
                self.assertEqual(result, a)
                self.assertEqual(result, np_result)
                self.assertTrue(result.dtype == np.float64)

        # check returned dtype
        for dtype in [np.int16, np.int32, np.int64, np.float16, np.float32, np.float64]:
            s = Series(range(10), dtype=dtype)
            result = s.mean()
            if is_integer_dtype(dtype):
                self.assertTrue(result.dtype == np.float64)
            else:
                self.assertTrue(result.dtype == dtype)
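The overflow this test guards against is easy to reproduce with NumPy directly. A minimal sketch (values taken from the test; assumes NumPy >= 1.9, as the test itself notes):

import numpy as np

vals = np.full(500, 2 ** 55, dtype=np.int64)
# a plain int64 sum wraps around: 500 * 2**55 exceeds the int64 maximum (~9.2e18)
print(vals.sum())                      # wrapped, incorrect total
# np.mean accumulates in float64 for integer input, so the mean survives
print(vals.mean() == float(2 ** 55))   # True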
Example #3
def onDataArrived(data,trader):
    '''
    Called each time new market data arrives.
    '''
    global count,bid0,bid1,ask0,ask1,diffs,lastDirection

    count += 1
    #print 'instrumentID =',data['instrumentID'],'count=',count
    #if count == 10:
    #    print 'bid =',data['bid'],'ask =',data['ask']
    #    result = trader.open(instrumentID=data['instrumentID'],directionCode = 'buy',volume=1)
    #    print result
    #    print result[2].openPrice

    #if count in (15,25):
    #    print 'total volume=',trader.getTotalVolume()

    #if count == 20:
    #    trader.closeAll()


    if data['instrumentID'] == instrumentID0:
        bid0 = data['bid']
        ask0 = data['ask']
    if data['instrumentID'] == instrumentID1:
        bid1 = data['bid']
        ask1 = data['ask']

    if bid0 != 0 and bid1 !=0:
        diff = bid1 - bid0
        diffs.append(diff)
        if len(diffs) > 350:
            s = Series(diffs)
            pts = diff - s.mean()
            if abs(pts) > 3 :
                if pts > 0 and lastDirection <= 0:
                    lastDirection = 1
                    print 'open signal triggered, position direction:',lastDirection
                    print 'bid0 =',bid0,'bid1 =',bid1,'ask0 =',ask0,'ask1 =',ask1
                    print 'diff =',diff,'deviation =',pts
                    trader.closeAll()
                    price0,price1=openPair(trader,instrumentID0,instrumentID1,lastDirection)
                    print 'actual spread =',price1-price0-s.mean()
                    print 'buy slippage =',price0-bid0
                    print 'sell slippage =',ask1-price1
                if pts < 0 and lastDirection >= 0:
                    lastDirection = -1
                    print 'open signal triggered, position direction:',lastDirection
                    print 'bid0 =',bid0,'bid1 =',bid1,'ask0 =',ask0,'ask1 =',ask1
                    print 'diff =',diff,'deviation =',pts
                    trader.closeAll()
                    price0,price1=openPair(trader,instrumentID0,instrumentID1,lastDirection)
                    print 'actual spread =',price0-price1-s.mean()
                    print 'buy slippage =',price1-bid1
                    print 'sell slippage =',ask0-price0
            else:
                if count % 30 == 0:
                    print 'average spread =',s.mean(),'diff =',diff,'deviation =',pts
Example #4
def query_rent_avgprice(filename):
    """
        query mongo calculate the avg price of each city
        return { city : price }
    """
    # { city : [price1,price2...]}
    rent_avgprice = {}
    res = RentHouse.objects.only('city', 'price')
    for item in res:
        city = item.city
        price = clean_price(item.price)
        if price == -1:
            continue
        if city in rent_avgprice:
            rent_avgprice[city].append(price)
        else:
            rent_avgprice[city] = [price]
    for key, value in rent_avgprice.items():
        s = Series(value)
        rent_avgprice[key] = s.mean()
    for key, value in rent_avgprice.items():
        rent_avgprice[key] = int(value)
    # write the result to cache file
    with open('./cache/' + filename, "w", encoding="utf-8") as fd:
        fd.write(json.dumps(rent_avgprice))
    return rent_avgprice
Example #5
def get_red_yellow_bins(series: pd.Series,
                        method: str,
                        red_bin=None,
                        yellow_bin=None):
    if method == "Percentile":
        red = round(series.quantile(q=red_bin / 100), 6)
        yellow = round(series.quantile(q=(red_bin + yellow_bin) / 100), 6)
        return red, yellow

    elif method == "Count":

        red = round(series.sort_values().values[int(round(red_bin, 0)) - 1], 6)
        yellow = round(
            series.sort_values().values[int(round(red_bin + yellow_bin, 0)) -
                                        1], 6)
        return red, yellow

    elif method == "NAVF":
        mean = series.mean()
        sd = series.std()
        red = round(mean - 3 * sd, 6)
        yellow = round(mean - 2 * sd, 6)

        return red, yellow

    elif method == "fuzzy AVF":
        y = fuzzy_smf(series)
        red, yellow = get_fuzzy_bins(series, y, red_bin, yellow_bin)
        return red, yellow

    else:
        red = None
        yellow = None
        return red, yellow
Example #6
def describe_numeric_1d(series: Series,
                        percentiles: Sequence[float]) -> Series:
    """Describe series containing numerical data.

    Parameters
    ----------
    series : Series
        Series to be described.
    percentiles : list-like of numbers
        The percentiles to include in the output.
    """
    from pandas import Series

    # error: Argument 1 to "format_percentiles" has incompatible type "Sequence[float]";
    # expected "Union[ndarray, List[Union[int, float]], List[float], List[Union[str,
    # float]]]"
    formatted_percentiles = format_percentiles(
        percentiles)  # type: ignore[arg-type]

    stat_index = ["count", "mean", "std", "min"
                  ] + formatted_percentiles + ["max"]
    d = ([series.count(),
          series.mean(),
          series.std(),
          series.min()] + series.quantile(percentiles).tolist() +
         [series.max()])
    return Series(d, index=stat_index, name=series.name)
Example #7
def z_score(
    s: pd.Series,
    moments_dict: dict = None,
    keys: Tuple[str, str] = ("mean", "std")) -> pd.Series:
    """
    Transforms the Series into z-scores

    :param s: Input Series
    :type s: pd.Series
    :param moments_dict: If not None, then the mean and standard
        deviation used to compute the z-score transformation is
        saved as entries in moments_dict with keys determined by
        the keys argument, defaults to None
    :type moments_dict: dict, optional
    :param keys: Determines the keys saved in moments_dict
        if moments are saved, defaults to ("mean", "std")
    :type keys: Tuple[str], optional
    :return: Transformed Series
    :rtype: pd.Series
    """
    mean = s.mean()
    std = s.std()
    if std == 0:
        # constant series: return all zeros rather than a bare scalar,
        # keeping the declared pd.Series return type
        return s - mean
    if moments_dict is not None:
        moments_dict[keys[0]] = mean
        moments_dict[keys[1]] = std
    return (s - mean) / std
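A quick usage sketch for z_score as defined above; the series values are invented for illustration:

import pandas as pd

s = pd.Series([10.0, 12.0, 8.0, 14.0, 6.0])
moments = {}
z = z_score(s, moments_dict=moments)
print(round(z.mean(), 12), z.std())  # ~0.0 and 1.0 by construction
print(moments)                       # {'mean': 10.0, 'std': 3.1622...}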
Example #8
 def column_info(labels: pd.Series) -> tuple:
     count = len(labels)
     gini = gini_ratio(labels)
     prob = labels.astype(bool).sum() / len(labels)
     avg = labels.mean()
     std = labels.std()
     return count, gini, prob, avg, std
Example #9
def _check_Xy(X: pd.DataFrame,
              y: pd.Series,
              *,
              norm_y=False) -> Tuple[pd.Series, pd.Series]:
    if np.ndim(X) == 1:
        X = pd.Series(X).to_frame()
    elif np.ndim(X) == 2:
        X = pd.DataFrame(X)

    assert X.ndim == 2
    assert np.ndim(y) == 1
    assert len(X) == len(y)

    valid = ~X.isnull().any(1).values
    X = pd.Series(list(zip(*X.values[valid].T)),
                  name=tuple(X.columns)).astype('category')
    y = pd.Series(y).reset_index(drop=True)[valid]

    if is_object_dtype(y):
        y = pd.Categorical(y)

    if norm_y:
        assert is_numeric_dtype(y)
        y = (y - y.mean()) / y.std()

    return X, y
Example #10
    def infer_posterior(data: Series,
                        alpha: Optional[float] = None,
                        beta: Optional[float] = None) -> Gamma:
        """
        Return a new Gamma distribution of the posterior most likely to
        generate the given data.

        :param data: Series of float values representing duration of,
                     or between each observation.
        :param alpha: Value for the α hyper-parameter of the prior Gamma
                      distribution (number of observations). Defaults to Vague.
        :param beta: Value for the β hyper-parameter of the prior Gamma
                     distribution (sum of observations). Defaults to Vague.
        """
        if alpha is None:
            alpha = VaguePrior.Gamma.alpha
        if beta is None:
            beta = VaguePrior.Gamma.beta
        data = data.dropna()
        n = len(data)
        x_mean = data.mean()
        return GammaExponentialConjugate(n=n,
                                         x_mean=x_mean,
                                         alpha=alpha,
                                         beta=beta).posterior()
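For context, the conjugate update behind this is short: for exponential observations with a Gamma(alpha, beta) prior on the rate, the posterior is Gamma(alpha + n, beta + n * x_mean). A sketch with made-up numbers (the real defaults live in VaguePrior.Gamma, whose values are not shown here):

alpha, beta = 0.001, 0.001   # hypothetical vague prior
n, x_mean = 50, 2.3          # sample size and sample mean of the durations
alpha_post = alpha + n           # posterior shape: prior "observations" plus n
beta_post = beta + n * x_mean    # posterior rate: prior "sum" plus sum of data
print(alpha_post, beta_post)     # 50.001 115.001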
Example #11
def get_moments_annotation(
    s: pd.Series,
    xref: str,
    yref: str,
    x: float,
    y: float,
    xanchor: str,
    title: str,
    labels: List,
) -> go.layout.Annotation:
    """Calculates summary statistics for a series and returns and
    Annotation object.
    """
    moments = list(stats.describe(s.to_numpy()))
    moments[3] = np.sqrt(moments[3])

    sharpe = s.mean() / s.std()

    return go.layout.Annotation(
        text=(f"<b>sharpe: {sharpe:>8.4f}</b><br>" + ("<br>").join(
            [f"{k[0]:<9}{k[1](moments[i])}" for i, k in enumerate(labels)])),
        align="left",
        showarrow=False,
        xref=xref,
        yref=yref,
        x=x,
        y=y,
        bordercolor="black",
        borderwidth=0.5,
        borderpad=2,
        bgcolor="white",
        xanchor=xanchor,
        yanchor="top",
    )
Example #12
def relative_absolute_error(y_true: pd.Series, y_pred: pd.Series):
    y_true_mean = y_true.mean()
    n = len(y_true)
    # Relative Absolute Error
    # err = math.sqrt(sum(np.square(y_true - y_pred)) / math.sqrt(sum(np.square(y_true-y_true_mean))))
    err = sum(abs(y_true - y_pred)) / sum(abs(y_true - y_true_mean))
    return err
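A small worked example of the ratio above, with made-up values:

import pandas as pd

y_true = pd.Series([3.0, 5.0, 7.0])   # mean is 5.0
y_pred = pd.Series([2.0, 5.0, 9.0])
# numerator:   |3-2| + |5-5| + |7-9| = 3
# denominator: |3-5| + |5-5| + |7-5| = 4
print(relative_absolute_error(y_true, y_pred))  # 0.75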
Example #13
def statistics_imputer(X: Series, method: str = 'mean', null_value: List = None) -> Series:
    """
    统计指标填充,例如:均值、中位数、众数等
    :param X:
    :param method: 目前仅支持均值、中位数、众数、最大值、最小值
    :param null_value: 缺失值列表
    :return:
    """
    X = X.copy()
    if null_value is not None:
        X[X.isin(null_value)] = np.nan

    if method == 'mean':
        fill_value = X.mean()
    elif method == 'median':
        fill_value = X.median()
    elif method == 'mode':
        fill_value = X.mode()[0]
    elif method == 'max':
        fill_value = X.max()
    elif method == 'min':
        fill_value = X.min()
    else:
        raise ValueError('unsupported fill method: %s' % method)

    X.fillna(fill_value, inplace=True)

    return X
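Usage sketch for the imputer above; the values are invented and -999 stands in for an encoded missing value:

import numpy as np
from pandas import Series

X = Series([1.0, 2.0, -999.0, 4.0, np.nan])
filled = statistics_imputer(X, method='mean', null_value=[-999.0])
print(filled.tolist())  # [1.0, 2.0, 2.333..., 4.0, 2.333...]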
Example #14
def query_rent_avgzone(filename):
    """
        query mongo calculate the avg price of each zone
        return { zone : num}
    """
    # { zone : [price1,price2...]}
    rent_avgzone = {}
    res = RentHouse.objects.only('city', 'price', 'location')
    for item in res:
        tmp = item.location.split('-')
        zone = item.city + '-' + tmp[0]
        price = clean_price(item.price)
        if price == -1:
            continue
        if zone in rent_avgzone:
            rent_avgzone[zone].append(price)
        else:
            rent_avgzone[zone] = [price]
    for key, value in rent_avgzone.items():
        s = Series(value)
        rent_avgzone[key] = s.mean()
    # write the result to cache file
    with open('./cache/' + filename, "w") as fd:
        fd.write(json.dumps(rent_avgzone))
    return rent_avgzone
Example #15
def update_enhanced_meta(serie: pd.Series, catalog_id: str, distribution_id: str):
    """Crea o actualiza los metadatos enriquecidos de la serie pasada. El título de
    la misma DEBE ser el ID de la serie en la base de datos"""

    field = Field.objects.get(distribution__dataset__catalog__identifier=catalog_id,
                              distribution__identifier=distribution_id,
                              identifier=serie.name)
    periodicity = meta_keys.get(field.distribution, meta_keys.PERIODICITY)
    days_since_update = (datetime.now() - _get_last_day_of_period(serie, periodicity)).days

    last = serie[-1]
    second_to_last = serie[-2] if serie.index.size > 1 else None
    last_pct_change = last / second_to_last - 1 if second_to_last is not None else None

    # Calculations
    meta = {
        meta_keys.INDEX_START: serie.first_valid_index().date(),
        meta_keys.INDEX_END: serie.last_valid_index().date(),
        meta_keys.PERIODICITY: periodicity,
        meta_keys.INDEX_SIZE: _get_index_size(serie),
        meta_keys.DAYS_SINCE_LAST_UPDATE: days_since_update,
        meta_keys.LAST_VALUE: last,
        meta_keys.SECOND_TO_LAST_VALUE: second_to_last,
        meta_keys.LAST_PCT_CHANGE: last_pct_change,
        meta_keys.IS_UPDATED: _is_series_updated(days_since_update, periodicity),
        meta_keys.MAX: serie.max(),
        meta_keys.MIN: serie.min(),
        meta_keys.AVERAGE: serie.mean(),
    }

    for meta_key, value in meta.items():
        field.enhanced_meta.update_or_create(key=meta_key, defaults={'value': value})
Example #16
def basic_stat_map(s: pd.Series) -> dict:
    return {
        "mean": s.mean(),
        "median": s.median(),
        "std": s.std(),
        "count": s.count(),
    }
Example #17
 def standardize(self, s: pd.Series):
     mean, std = s.mean(), s.std()
     self.logger.info("{name}'s mean: {m}, std: {s}".format(name=s.name,
                                                            m=mean,
                                                            s=std))
     stdized = ((s - mean) / std).rename("stdized_price")
     return stdized, mean, std
Example #18
    def _sortino_ratio(self, returns: pd.Series) -> float:
        """
        Return the Sortino ratio for a given series of returns.

        The Sortino ratio follows the same idea as the Sharpe ratio, but
        adjusts both numerator and denominator: the numerator becomes the
        excess return, and the denominator becomes the lower partial
        standard deviation (downside deviation). This addresses problems
        with assuming a normal distribution: return distributions are not
        actually symmetric, and when returns are left-skewed (negative
        skewness) a normal distribution underestimates risk, so a skewed
        distribution is more appropriate. The floor of a portfolio should
        be the risk-free instrument, so using the standard deviation of
        all returns (their deviation from the mean return) in the
        denominator, as the traditional Sharpe ratio does, is
        inappropriate; the deviation from the risk-free return should be
        used instead. Overall, the Sortino ratio focuses on expected
        losses in the (left) tail, while the Sharpe ratio analyzes the
        whole sample; conversely, keeping only the samples above the
        risk-free return analyzes excess gains in the (right) tail.
        Financial history cannot fully guide the future, especially for
        extreme risk, which makes the Sortino ratio the more prudent
        evaluation tool. Note that it still does not address leptokurtic
        (fat-tailed) distributions.

        Source: https://www.zhihu.com/question/37128695/answer/230508370


        https://en.wikipedia.org/wiki/Sortino_ratio
        """
        # squared downside returns
        downside_returns = (returns[returns < self._target_returns])**2

        # expected return of the asset
        expected_return = returns.mean()

        # downside deviation
        downside_std = np.sqrt(downside_returns.mean())

        return (expected_return - self._risk_free_rate) / (downside_std + 1E-9)
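Since _sortino_ratio is a method, a standalone sketch of the same arithmetic may be easier to poke at; _target_returns and _risk_free_rate are replaced with hypothetical zeros:

import numpy as np
import pandas as pd

returns = pd.Series([0.01, -0.02, 0.003, 0.015, -0.01])  # made-up returns
target_returns, risk_free_rate = 0.0, 0.0                # hypothetical parameters

downside_returns = (returns[returns < target_returns]) ** 2
downside_std = np.sqrt(downside_returns.mean())
print((returns.mean() - risk_free_rate) / (downside_std + 1E-9))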
Example #19
 def _sharpe_ratio(self, returns: pd.Series) -> float:
     """
     Return the Sharpe ratio for a given series of returns.
     https://en.wikipedia.org/wiki/Sharpe_ratio
     """
     return (returns.mean() - self._risk_free_rate) / (returns.std() + 1E-9)
Example #20
def dist_table(diff: pd.Series):
    diff = diff.map(lambda x: trueround_precision(x, 3))
    head = '<tr><td>Discrimination</td><td>Description</td><td>Item count</td></tr>'
    rtn = [
        head,
    ]
    groups = [(0, 0.199), (0.2, 0.299), (0.3, 0.399), (0.4, 1)]
    labels = [
        'needs revision',
        'better after revision',
        'acceptable',
        'good',
    ]

    for i, g in enumerate(groups):
        label = labels[i]
        n = sum((diff >= g[0]) & (diff <= g[1]))
        row = f'<tr><td>{g[0]}~{g[1]}</td><td>{label}</td><td>{n}</td></tr>'
        rtn.append(row)
    describe = {}
    describe['max discrimination'] = diff.max()
    describe['min discrimination'] = diff.min()
    describe['mean discrimination'] = diff.mean()
    for k, v in describe.items():
        row = f'<tr><td>{k}</td><td>{v}</td></tr>'
        rtn.append(row)
    rows = '\n'.join(rtn)
    return f'<table class="table table-striped">{rows}</table>'
Example #21
def calc_avg_time(timeseries, month=0, wake=True):
    seconds_list = []
    if wake is True:
        label = 'wake_time'
    else:
        label = 'bed_time'
    if month == 0:
        abrv_month = 'all time'
        timeseries = timeseries[label]
    else:
        given_month = '2017/' + str(month)
        abrv_month = month_dict[month]
        timeseries = timeseries.loc[given_month][label]
    for t_stamp in timeseries:
        if t_stamp.hour <= 12:
            ts_seconds = t_stamp.hour * 3600 + t_stamp.minute * 60 + t_stamp.second * 1.
            seconds_list.append(ts_seconds)
        else:
            ts_seconds = t_stamp.hour * 3600 + t_stamp.minute * 60 + t_stamp.second * 1. - 24 * 3600
            seconds_list.append(ts_seconds)
    timeseries_temp = Series(seconds_list, index=timeseries.index)
    ts_m = timeseries_temp.mean()
    if ts_m > 0:
        ts_m = int(ts_m)
    else:
        ts_m = int(ts_m) + 24 * 3600
    result = time(ts_m // 3600, (ts_m % 3600) // 60, (ts_m % 3600) % 60)
    result = result.strftime('%H:%M:%S')
    return result
Example #22
def calculate_enhanced_meta(serie: pd.Series, periodicity: str) -> dict:
    """Crea o actualiza los metadatos enriquecidos de la serie pasada. El título de
    la misma DEBE ser el ID de la serie en la base de datos"""

    days_since_update = (datetime.now() -
                         _get_last_day_of_period(serie, periodicity)).days

    last = serie[-1]
    second_to_last = serie[-2] if serie.index.size > 1 else None
    last_pct_change = last / second_to_last - 1 if second_to_last is not None else None

    # Calculations
    meta = {
        meta_keys.INDEX_START: serie.first_valid_index().date(),
        meta_keys.INDEX_END: serie.last_valid_index().date(),
        meta_keys.PERIODICITY: periodicity,
        meta_keys.INDEX_SIZE: _get_index_size(serie),
        meta_keys.DAYS_SINCE_LAST_UPDATE: days_since_update,
        meta_keys.LAST_VALUE: last,
        meta_keys.SECOND_TO_LAST_VALUE: second_to_last,
        meta_keys.LAST_PCT_CHANGE: last_pct_change,
        meta_keys.IS_UPDATED: _is_series_updated(days_since_update,
                                                 periodicity),
        meta_keys.MAX: serie.max(),
        meta_keys.MIN: serie.min(),
        meta_keys.AVERAGE: serie.mean(),
        meta_keys.SIGNIFICANT_FIGURES: significant_figures(serie.values)
    }

    return meta
Example #23
    def test_all_values_single_bin(self):
        # 2070
        index = period_range(start="2012-01-01", end="2012-12-31", freq="M")
        s = Series(np.random.randn(len(index)), index=index)

        result = s.resample("A", how='mean')
        tm.assert_almost_equal(result[0], s.mean())
Example #24
def plot_norm(data: pd.Series, bins=10, ax=None, is_show_plot=None):
    """
    显示当前数据的正太分布曲线
    :param data:
    :param bins: bar 数量
    :param ax: 如果为None,则新建一个画布
    :param is_show_plot: 是否展示
    :return: n, bins_v, mean, std
    """
    if ax is None:
        fig, ax = plt.subplots()
        if is_show_plot is None:
            is_show_plot = True

    if is_show_plot is None:
        is_show_plot = False

    n, bins_v = np.histogram(data, bins=bins)

    mu = data.mean()  # mean of distribution
    sigma = data.std()  # standard deviation of distribution
    # def norm_func(x, mu, sigma):
    #     pdf = np.exp(-((x - mu)**2)/(2*sigma**2)) / (sigma * np.sqrt(2*np.pi))
    #     return pdf
    # y = norm_func(bins, mu, sigma)  # same as mlab.normpdf(bins, mu, sigma)
    # y = mlab.normpdf(bins, mu, sigma)
    y = stats.norm.pdf(bins_v, loc=mu, scale=sigma)
    ax.plot(bins_v, y, '--')
    plt.grid(True)
    if is_show_plot:
        plt.show()

    return n, bins_v, mu, sigma
Example #26
def get_descriptive_stats(data: pd.Series) -> dict:
    """Calculate descriptive statistics for the supplied `data`.

    Args:
        data (pd.Series): An array of the values to summarise.

    Returns:
        dict: A dictionary of summary statistics.
    """
    stats = [
        "Mean",
        "Standard Deviation",
        "Minimum",
        "Maximum",
        "Median",
        "Mode",
    ]
    values = resolve_integer_or_float([
        data.mean(),
        data.std(),
        data.min(),
        data.max(),
        median(data),
        mode(data),
    ])
    return dict(zip(stats, values))
Example #27
    def test_all_values_single_bin(self):
        # 2070
        index = period_range(start="2012-01-01", end="2012-12-31", freq="M")
        s = Series(np.random.randn(len(index)), index=index)

        result = s.resample("A").mean()
        tm.assert_almost_equal(result[0], s.mean())
Example #28
def zscore(s: Series) -> Series:
    """
    Returns the z-score for every value in the series.

    Z = (x - mu) / sigma

    Example
    -------

    >>> x = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9])
    >>> x
    0    1
    1    2
    2    3
    3    4
    4    5
    5    6
    6    7
    7    8
    8    9
    dtype: int64
    >>> x.zscore()
    0   -1.460593
    1   -1.095445
    2   -0.730297
    3   -0.365148
    4    0.000000
    5    0.365148
    6    0.730297
    7    1.095445
    8    1.460593
    dtype: float64
    """
    return (s - s.mean()) / s.std()
Example #29
def calc_avg_time(timeseries,month=0, wake=True):
	seconds_list = []
	if wake is True:
		label = 'wake_time'
	else:
		label = 'bed_time'
	if month == 0:
		abrv_month = 'all time'
		timeseries = timeseries[label]
	else:
		given_month = '2016/' + str(month)
		abrv_month = month_dict[month]
		timeseries = timeseries.ix[given_month][label]
	for t_stamp in timeseries:
		if t_stamp.hour <= 12:
			ts_seconds = t_stamp.hour*3600+t_stamp.minute*60+t_stamp.second*1.
			seconds_list.append(ts_seconds)
		else:
			ts_seconds = t_stamp.hour*3600+t_stamp.minute*60+t_stamp.second*1. - 24*3600
			seconds_list.append(ts_seconds)	
	timeseries_temp = Series(seconds_list,index = timeseries.index)
	ts_m = timeseries_temp.mean()
	if ts_m > 0:
		ts_m = int(ts_m)
	else:
		ts_m = int(ts_m) + 24*3600
	result = time(ts_m/3600,(ts_m%3600)/60,(ts_m%3600)%60)
	result = result.strftime('%H:%M:%S')
	return result
Example #30
def calc_long_short_prec(pred: pd.Series,
                         label: pd.Series,
                         date_col="datetime",
                         quantile: float = 0.2,
                         dropna=False,
                         is_alpha=False) -> Tuple[pd.Series, pd.Series]:
    """
    calculate the precision for long and short operations


    :param pred/label: index is **pd.MultiIndex**, index name is **[datetime, instruments]**; column names are **[score]**.

            .. code-block:: python
                                                  score
                datetime            instrument
                2020-12-01 09:30:00 SH600068    0.553634
                                    SH600195    0.550017
                                    SH600276    0.540321
                                    SH600584    0.517297
                                    SH600715    0.544674
    label :
        label
    date_col :
        date_col

    Returns
    -------
    (pd.Series, pd.Series)
        long precision and short precision in time level
    """
    if is_alpha:
        label = label - label.mean(level=date_col)
    if int(1 / quantile) >= len(label.index.get_level_values(1).unique()):
        raise ValueError("Need more instruments to calculate precision")

    df = pd.DataFrame({"pred": pred, "label": label})
    if dropna:
        df.dropna(inplace=True)

    group = df.groupby(level=date_col)

    N = lambda x: int(len(x) * quantile)
    # find the top/low quantile of prediction and treat them as long and short target
    long = group.apply(
        lambda x: x.nlargest(N(x), columns="pred").label).reset_index(
            level=0, drop=True)
    short = group.apply(
        lambda x: x.nsmallest(N(x), columns="pred").label).reset_index(
            level=0, drop=True)

    groupll = long.groupby(date_col)
    l_dom = groupll.apply(lambda x: x > 0)
    l_c = groupll.count()

    groups = short.groupby(date_col)
    s_dom = groups.apply(lambda x: x < 0)
    s_c = groups.count()
    return (l_dom.groupby(date_col).sum() /
            l_c), (s_dom.groupby(date_col).sum() / s_c)
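A usage sketch for calc_long_short_prec with the MultiIndex layout the docstring describes; the instrument list and scores are invented:

import numpy as np
import pandas as pd

idx = pd.MultiIndex.from_product(
    [pd.to_datetime(["2020-12-01", "2020-12-02"]),
     ["SH600068", "SH600195", "SH600276", "SH600584", "SH600715", "SH600900"]],
    names=["datetime", "instruments"])
pred = pd.Series(np.random.randn(12), index=idx)
label = pd.Series(np.random.randn(12), index=idx)
long_prec, short_prec = calc_long_short_prec(pred, label, quantile=0.2)
print(long_prec)   # per-day precision of the long bucket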
Example #31
 def fill_outliers(col: pd.Series):
     """ Remove outliers of each col
     """
     mean, std = col.mean(), col.std()
     upper, lower = mean + 3 * std, mean - 3 * std
     col[col > upper] = np.floor(upper)
     col[col < lower] = np.floor(lower)
     return col.values
Example #32
        def _mask_outliers(vec: pd.Series, stdv_times):
            vec_mean = vec.mean()
            vec_stdv = vec.std()
            upper = vec_mean + vec_stdv * stdv_times
            lower = vec_mean - vec_stdv * stdv_times
            vec[((lower > vec) | (vec > upper))] = np.nan

            return vec
Example #33
 def from_series(feature_name: str, series: Series):
     """从pandas.Series中构造"""
     assert types.is_numeric_dtype(series), series.dtypes
     return NumericColumn(feature_name=feature_name,
                          min_value=series.min(),
                          max_value=series.max(),
                          mean_value=series.mean(),
                          std_value=series.std())
Example #34
def sharpe_ratio(corrs: pd.Series) -> np.float32:
    """
    Calculate the Sharpe ratio for Numerai by using grouped per-era data

    :param corrs: A Pandas Series containing the Spearman correlations for each era
    :return: A float denoting the Sharpe ratio of your predictions.
    """
    return corrs.mean() / corrs.std()
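Usage sketch; the per-era correlations are invented:

import pandas as pd

era_corrs = pd.Series([0.02, 0.035, -0.01, 0.05])  # hypothetical per-era scores
print(sharpe_ratio(era_corrs))  # mean / std of the era correlations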
Example #35
 def before_after_3sigma(data: pd.Series) -> pd.Series:
     miu = data.mean()
     sigma = data.std()
     threshold_down = miu - 3 * sigma
     threshold_up = miu + 3 * sigma
     data[data.ge(threshold_up)] = threshold_up
     data[data.le(threshold_down)] = threshold_down
     return data
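A quick check of the 3-sigma clipping above; .copy() avoids mutating the caller's series, since the function assigns through boolean masks in place:

import pandas as pd

data = pd.Series([1.0] * 30 + [10.0])   # one obvious outlier
clipped = before_after_3sigma(data.copy())
print(clipped.max() < 10.0)             # True: outlier capped at mean + 3*std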
Example #36
def saveDictionaryToFile(my_dict, file_name):
    '''This function will write the values of a dictionary into a csv, but it will
    also append the mean value as the last row'''
    data = Series(my_dict, index=my_dict.keys())
    mean_value = data.mean()
    data['AVG'] = mean_value
    data.sort_index(axis=0, inplace=True)
    data.to_csv(file_name)
Example #37
def pd_03():
    df=DataFrame(np.random.randn(6,3))
    df.ix[2:,1]=np.nan
    df.ix[4:,2]=np.nan
    print df
    print df.fillna(method='ffill')
    print df.fillna(method='ffill',limit=2)
    data=Series([1.,None,3.5,None,7])
    print data.fillna(data.mean())
    print df.fillna(df.mean())
Example #38
def print_avg_time(timeseries,month=0, wake=True):
	seconds_list = []
	if wake == True:
		if month == 0:
			abrv_month = 'all time'
			timeseries = timeseries['wake_time']
		else:
			given_month = '2016/' + str(month)
			abrv_month = month_dict[month]
			timeseries = timeseries.ix[given_month]['wake_time']
		for t_stamp in timeseries:
			ts_seconds = t_stamp.hour*3600+t_stamp.minute*60+t_stamp.second*1.
			seconds_list.append(ts_seconds)
		timeseries_temp = Series(seconds_list,index = timeseries.index)
		ts_m = timeseries_temp.mean()
		ts_m = int(ts_m)
		result = time(ts_m/3600,(ts_m%3600)/60,(ts_m%3600)%60)
		print 'average wake time in ' + abrv_month + ': ' + result.strftime('%H:%M:%S')
	else:
		if month == 0:
			abrv_month = 'all time'
			timeseries = timeseries['bed_time']
		else:
			given_month = '2016/' + str(month)
			abrv_month = month_dict[month]
			timeseries = timeseries.ix[given_month]['bed_time']
		for t_stamp in timeseries:
			if t_stamp.hour <= 12:
				ts_seconds = t_stamp.hour*3600+t_stamp.minute*60+t_stamp.second*1.
				seconds_list.append(ts_seconds)
			else:
				ts_seconds = t_stamp.hour*3600+t_stamp.minute*60+t_stamp.second*1. - 24*3600
				seconds_list.append(ts_seconds)				
		timeseries_temp = Series(seconds_list,index = timeseries.index)
		ts_m = timeseries_temp.mean()
		if ts_m > 0:
			ts_m = int(ts_m)
		else:
			ts_m = int(ts_m) + 24*3600
		result = time(ts_m/3600,(ts_m%3600)/60,(ts_m%3600)%60)
		print 'average bed time in %s is %s' % (abrv_month, result.strftime('%H:%M:%S'))	
	return result
Example #39
    def Calls(self):
        rows = []
        for name,callTimes in self.times['call'].iteritems():
            s = Series(callTimes)
            func,loc = formatName(name)
            callCount = s.count()
            meanTime = s.mean()
            totalTime = s.sum()
            rows.append((func,loc,callCount,meanTime,totalTime))

        columns = ('FUNCTION', 'SOURCE', 'COUNT', 'MEAN', 'TOTAL')
        return DataFrame.from_records(rows, columns=columns, index=('FUNCTION', 'SOURCE'))
Example #40
    def Phases(self):
        rows = []
        for prefix in ('parse', 'compile', 'run'):
            for name,callTimes in self.times[prefix].iteritems():
                s = Series(callTimes)
                callCount = s.count()
                meanTime = s.mean()
                totalTime = s.sum()
                rows.append(("%s:%s" % (prefix,name),callCount,meanTime,totalTime))

        columns = ('PHASE', 'COUNT', 'MEAN', 'TOTAL')
        return DataFrame.from_records(rows, columns=columns, index=('PHASE'))
Example #41
    def test_nanmean_overflow(self):
        # GH 10155
        # In the previous implementation mean can overflow for int dtypes, it
        # is now consistent with numpy

        for a in [2 ** 55, -2 ** 55, 20150515061816532]:
            s = Series(a, index=range(500), dtype=np.int64)
            result = s.mean()
            np_result = s.values.mean()
            assert result == a
            assert result == np_result
            assert result.dtype == np.float64
Example #42
def FollowsInfoByCode(code):
    filename = 'follows_history/'+code+'.csv'
    if not os.path.exists(filename):
        # print filename, 'not exit'
        return None
    reader = csv.reader(file(filename,'rb'))
    follows_chg_list = []
    rt_line = []
    for row in reader:
        name, date, follows, follows_chg = row
        follows_chg = int(follows_chg)
        follows_chg_list.append((follows_chg))
        if date > '2014-10-08' and date < '2014-12-01':
            df = Series(follows_chg_list)
            FollowsMultiple = round((follows_chg)/df.mean(), 1)
            if FollowsMultiple > 4 and FollowsMultiple < 50:
                d = dateutil.parser.parse(date)
                if d.weekday() < 5:
                    print code, name, d, ',', follows, follows_chg, round(df.mean(),1), str(FollowsMultiple)+'x'
                    line = AnalyseHistoryPrice(code, name, FollowsMultiple, date)
                    if line != None:
                        rt_line.append(line)
    return rt_line
Example #43
    def test_nanmean_overflow(self):
        # GH 10155
        # In the previous implementation mean can overflow for int dtypes, it
        # is now consistent with numpy

        # numpy < 1.9.0 is not computing this correctly
        if not _np_version_under1p9:
            for a in [2 ** 55, -2 ** 55, 20150515061816532]:
                s = Series(a, index=range(500), dtype=np.int64)
                result = s.mean()
                np_result = s.values.mean()
                assert result == a
                assert result == np_result
                assert result.dtype == np.float64
Example #44
  def get_summary_indicators_from_hist(sf, hist, int_index=False):
    seriesHist = Series(hist)
    maxs = {
      'freq': dict()
    }
    
    means = {'freq': seriesHist.mean()}
    medians = {'freq': seriesHist.median()}
    stds = {'freq': seriesHist.std()}
    maxs['freq']['freq'] = seriesHist.max()
    maxs['freq']['index'] = seriesHist.idxmax()
    index_total = 'NA'

    if int_index:
      index = seriesHist.index
      index = index.astype(int)
      index_list = index.tolist()
      index_total = sum([seriesHist[i] * index_list[i] for i in range(len(index_list))])
      index_series = Series(index_list)

      means['index'] = index_series.mean()
      medians['index'] = index_series.median()
      stds['index'] = index_series.std()
      
      maxs['freq']['index'] = int(maxs['freq']['index'])

      maxs['index'] = dict()
      maxs['index']['index'] = max(index_list)
      maxs['index']['freq'] = hist[str(maxs['index']['index'])]

    return {
      'means': means,
      'medians': medians,
      'stds': stds,
      'max': maxs,
      'index_total': index_total
    }
Example #45
 def summary_df(self):
     lower_quantile = self.config['lower_quantile']
     upper_quantile = self.config['upper_quantile']
     
     vals = Series(self.summary)
     
     lower_bound = vals.quantile(lower_quantile)
     upper_bound = vals.quantile(upper_quantile)
     median = vals.quantile(0.5)
     mean = vals.mean()
     
     column_names = [ "Mean" , "Median" , "%d_Percentile" % (lower_quantile*100), "%d_Percentile" % (upper_quantile*100)]
     df = pd.DataFrame(dict(zip(column_names, [mean, median, lower_bound, upper_bound])), index=[0])
     
     return df
Example #46
def calc_avg(data: pd.Series, prec=None) -> float:
    """Calculate average.
    
    Args:
        data: data to analyze (pd.Series)
        prec: precision if rounding (int)
    Returns:
        average (float)
    """
    
    average = data.mean(axis=0)
    if prec is not None:
        average = round(number=average, ndigits=prec)

    return average
Example #47
    def test_nanmean_overflow(self):
        # GH 10155
        # In the previous implementation mean can overflow for int dtypes, it
        # is now consistent with numpy

        # numpy < 1.9.0 is not computing this correctly
        from distutils.version import LooseVersion
        if LooseVersion(np.__version__) >= '1.9.0':
            for a in [2 ** 55, -2 ** 55, 20150515061816532]:
                s = Series(a, index=range(500), dtype=np.int64)
                result = s.mean()
                np_result = s.values.mean()
                self.assertEqual(result, a)
                self.assertEqual(result, np_result)
                self.assertTrue(result.dtype == np.float64)
Example #48
def main():
    """
    Handling of not applicable values
    """

    string_data = Series(['aardvark', 'artichoke', np.nan, 'avocado'])
    print string_data
    print string_data.isnull()
    string_data[0] = None
    print string_data.isnull()
    print None is np.nan, None == np.nan # not same

    # Exclude N/A
    print '',''
    NA = np.nan
    data = Series([1, NA, 3.5, NA, 7])
    print data.dropna()
    print data[data.notnull()]

    data = DataFrame([
        [1., 6.5, 3.],
        [1., NA, NA],
        [NA, NA, NA],
        [NA, 6.5, 3.]
    ])
    cleaned = data.dropna() # row that all value is not NA
    print data
    print cleaned
    print data.dropna(how='all')
    data[4] = None
    print data.dropna(axis=1, how='all')
    print data.dropna(thresh=2) # non NA is more 2

    # Fill NA
    print '',''
    print data.fillna(0)
    print data.fillna({1: 0.5, 2: -1})
    _ = data.fillna(0, inplace=True)
    print data
    print '',''
    df = DataFrame(np.arange(18).reshape((6, 3)))
    df.ix[2:, 1] = NA; df.ix[4:, 2] = NA
    print df
    print df.fillna(method='ffill')
    print df.fillna(method='ffill', limit=2)
    data = Series([1., NA, 3.5, NA, 7])
    print data.fillna(data.mean())
Example #49
    def generate_value_length_distribution(self, variable_value):
        """
        variable maps to its distribution (mean, variance).
            self.__value_length_distribution

        :param variable_value: {path: {variable: [value]}}
        :return: {path: {variable: {'mean': m, 'variance': v}}}
        """
        for path, variable_dict in variable_value.items():
            self._m['value_length_distribution'][path] = {}
            for variable, value_list in variable_dict.items():
                length_list = [len(value) for value in value_list]
                length_series = Series(length_list)
                mean = length_series.mean()
                var = length_series.var()

                self._m['value_length_distribution'][path][variable] = {'mean': mean, 'variance': var}
Example #50
def GetFollowsMeanByCode(dirfilelist, code = '(SH:600036)'):
    follows_chg_list = []
    last_follows = 0
    for one in dirfilelist:
        # print one
        df_curr = pd.read_csv(one, names = ['name', 'code', 'follows'], skiprows=[0])
        name, follows = GetFollowsByCode(df_curr, code)
        if follows > 0:
            if last_follows == 0:
                last_follows = follows
            else:
                diff = abs(follows - last_follows)
                last_follows = follows
                follows_chg_list.append(diff)
    # print follows_chg_list
    df = Series(follows_chg_list)
    # print df.mean()   
    return (df.mean())
Example #51
    def __report_bots_metadata_results_excel(self, writer):
        """
        writer -> None
        writer: ExcelWriter | ExcelWriter object containing buffer for eventual
           output .xlsx file
        """
        npsim = self.get_npsim()

        ## Some bookkeeping
            # get percent unique bots
        percentUniqueBots = round(
            float(self._calc_num_unique_bots()) / float(npsim.get_n()), 4)
        percentUniqueBotsString = "{0:.0f}%".format(100 * percentUniqueBots)
            # get percent mulligans used
        numMulUsed = 0
        for bot in npsim.get_bots():
            if bot.has_used_mulligan():
                numMulUsed += 1
        percentMulUsed = round(float(numMulUsed) / float(npsim.get_n()), 4)
        percentMulUsedString = "{0:.0f}%".format(100 * percentMulUsed)
            # for constructing "one item columns"
        enoughEmptyRows = ["" for i in range(npsim.get_n()-1)] 

        ## Create series that correspond to columns in output excel file
        ## takes advantage of the fact that self.get_bots() is in sorted order
        npsim.get_bots().sort(key=lambda bot: bot.get_max_streak_length(), 
            reverse=True)
        botS = Series([bot.get_index() for bot in npsim.get_bots()], name='Bot')
        maxStreakS = Series([bot.get_max_streak_length() for bot in \
            npsim.get_bots()], name='maxStreak')
        aveStreakS = Series([maxStreakS.mean()] + enoughEmptyRows, 
            name="aveMaxStreak")
        uniqueBotS = Series([percentUniqueBotsString] + \
            enoughEmptyRows, name='Unique Bots(%)')
        percentMulUsedS = Series([percentMulUsedString] + \
            enoughEmptyRows, name='Mul Used (%)')

        ## construct dataframe to write to excel file
        df = concat(
            [botS, maxStreakS, aveStreakS, uniqueBotS, percentMulUsedS], axis=1)

        ## put df info on excel buffer
        df.to_excel(writer, index=False, sheet_name='Bots Meta')
Example #52
# look up by index
s2[["a", "d"]]
# or by numerical index
s2[[0, 2]]
s2.index

# use series to introduce time series data
dates = pd.date_range("2014-08-01", "2014-08-06")
dates

# must be the same length
temps1 = Series([80, 82, 85, 90, 83, 87], index=dates)
temps1

temps1.mean()

temps2 = Series([70, 75, 69, 83, 79, 77], index=dates)
temp_diffs = temps1 - temps2
temp_diffs

# or by date
temp_diffs["2014-08-03"]
# or by integer position
temp_diffs[2]

#####
# DateFrame
# multiple columns of heterogenous data, but each column of same type
#####
Example #53
# Check for missingness
totaldf.isnull().sum()
	# Lots of missing data for Smoking, Physical Activity, Malaria and HIV, so won't use
	# 14 missing values in Life Expectancy so will delete them
	# Will delete all non-complete rows once I get rid of the above 4 columns

totaldf = totaldf.drop(['Smoking', 'PhysicalActivity', 'Malaria', 'HIV'], axis=1) 
totaldf = totaldf.dropna()
totaldf.isnull().sum()
totaldf.shape

# Explore the data a bit
totaldf.ix[totaldf['AlcConsumption'].idxmax()]
totaldf.ix[totaldf['ImprovedWater'].idxmin()]

Series.mean(totaldf['Suicide'])
print(totaldf.loc[totaldf['Country'].isin(['Panama', 'Guatemala', 'Australia'])])

# Work out if can use linear regression
corr_df = totaldf.corr(method='pearson')
print("--------------- CORRELATIONS ---------------")
print(corr_df.head(corr_df.shape[1]))

s = corr_df.unstack()
so = DataFrame(s.sort_values(kind="quicksort"))
so.loc[(so[0] >= .8) & (so[0] < 1)]

import matplotlib.pyplot as plt

plt.hist(totaldf['LifeExpectancy'])
plt.title("Life Expectancy Histogram")
Example #54
frame['e'].map(format)
## Sorting and ranking
obj = Series(range(4), index=['d', 'a', 'b', 'c'])
obj.sort_index()
frame = DataFrame(np.arange(8).reshape((2, 4)), index=['three', 'one'],
columns=['d', 'a', 'b', 'c'])
frame
frame.sort_index()
frame.sort_index(axis=1)
frame.sort_index(axis=1, ascending=False)
frame.sort_index(axis=1, ascending=True)
frame
obj = Series([4, 7, -3, 2])
obj.order()
obj = Series([4, np.nan, 7, np.nan, -3, 2])
obj.order()
frame = DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
frame
frame.sort_index(by='b')
frame.sort_index(by=['a', 'b']) # sort by multiple columns
obj = Series([7, -5, 7, 4, 2, 0, 4])
obj.rank()
obj.mean()
obj.rank(method='first')
obj.rank(ascending=False, method='max')
frame = DataFrame({'b': [4.3, 7, -3, 2], 'a': [0, 1, 0, 1],
'c': [-2, 5, 8, -2.5]})
frame
frame.rank(axis=1)
Example #55
df = DataFrame(np.random.randn(6,3))
df.ix[2:, 1]=NA 
df.ix[4:, 2]=NA

print(df)
print('\n')
print(df.fillna(method='ffill'))
print('\n')
print(df.fillna(method='ffill', limit=2))
print('\n')

###############################################################

data = Series([1.,NA,3.5,NA,7])

print(data.fillna(data.mean()))
print('\n')

###############################################################

data = Series(np.random.randn(10),
              index = [['a','a','a','b','b','b','c','c','d','d'],
                       [1,2,3,1,2,3,1,2,2,3]])


print(data)
print('\n')
print(data.index)
print('\n')
print(data['b'])
print('\n')

Example #56
        def adfTest(spread):
            ADF_p_value = ts.adfuller(spread, 1)[1]
            return ADF_p_value

        ADF_p_value = adfTest(spread)


        if ADF_p_value <= 0.05:
            print "The spread is likely mean-reverting."
        else:
            print "The spread is not mean-reverting."

        spread = Series(spread)
        signalMean = spread.mean()
        signalDev = spread.std()

        openMult = 1.0
        closeMult = 0.5
        stopLossMult = 4.0

        openSignal = signalDev * openMult;
        closeSignal = signalDev * closeMult;
        stopLossSignal = signalDev * stopLossMult;

        residSpread = spread - signalMean
        residSpread.plot()

        openSignalUp = openSignal * (residSpread * 0 + 1)
        openSignalDown = -openSignal * (residSpread * 0 + 1)
Example #57
            test_data_class = test_data['class']
            test_data = test_data[test_columns]
            
            correct_num = 0
            all_num = len(test_data_index)
            for i in test_data_index:
                # print i, test_data.ix[i]
                if predict(test_data.ix[i], train_data, k) == test_data_class[i]:
                    correct_num += 1
            
            knn_accuracy = float(correct_num) / all_num
            print 'accuracy:', knn_accuracy
            knn_accuracy_list.append(knn_accuracy)
        
        knn_accuracy_series = Series(knn_accuracy_list)
        print 'k: ', k, 'knn average accuracy:', knn_accuracy_series.mean()
        average_result.append(knn_accuracy_series.mean())
    print 'all average accuracy result for each k: ', average_result
    
    fig, axes = plt.subplots(nrows = 2, ncols = 1)
    fig.suptitle('KNN Result')

    axes[0].set_title('simple line plot result')
    axes[0].plot(k_list, average_result)
    axes[0].set_xlabel('k')
    axes[0].set_ylabel('Average Accuracy')
    axes[0].tick_params(axis='x', labelsize=11)
    axes[0].tick_params(axis='y', labelsize=11)

    axes[1].set_title('scatter plot result')
    axes[1].scatter(k_list, average_result)
Example #58
    account.userID,
    account.password
)
traderChannel


#%% Fixed-length queue
from collections import deque
q = deque(maxlen=10)
for i in range(10):
    q.append(i)

#%%
from pandas import Series
s = Series(range(10))
s.mean()
s.std()


#%% Export trading results to Excel
import os
os.chdir('/home/duhan/github/CTPTrader')
from comhelper import setDjangoEnvironment
setDjangoEnvironment()
from database.models import *
from django_pandas.io import read_frame
from pandas.io.excel import ExcelWriter
df = read_frame(ModelPosition.objects.filter(state='close'))
writer = ExcelWriter('/tmp/output.xls')
df.to_excel(writer)
writer.save()
Example #59
grouped = frame.data2.groupby(factor)
grouped.apply(get_stats).unstack()

grouping = pd.qcut(frame.data1, 10, labels=False)

grouped = frame.data2.groupby(grouping)
grouped.apply(get_stats).unstack()


# ### Fill missing values with group-specific values
s = Series(np.random.randn(6))
s[::2] = np.nan
s

s.fillna(s.mean())

states = ['Ohio', 'New York', 'Vermont', 'Florida',
          'Oregon', 'Nevada', 'California', 'Idaho']
group_key = ['East'] * 4 + ['West'] * 4
data = Series(np.random.randn(8), index=states)
data[['Vermont', 'Nevada', 'Idaho']] = np.nan
data

data.groupby(group_key).mean()

fill_mean = lambda g: g.fillna(g.mean())
data.groupby(group_key).apply(fill_mean)

fill_values = {'East': 0.5, 'West': -1}
fill_func = lambda g: g.fillna(fill_values[g.name])
Example #60
format = lambda x: '%.2f' % x 
data['popm'].map(format) 

# sort on values
data2 = data.sort_index(by='popm')
data2.head()
data2.tail()

# summary statistics
# d1990.sum()
data.describe()
data.std()

# how much did total population change between 1990 and 2010? 
# TODO: how many people did the average congressperson represent in 1990?

# we could also represent a single variable as a series with hierarchical indexing
p = Series(data['pop'].values, index=[data['st'], data['yr']])
p['North Carolina']
p.mean(level='st')
# TODO: calculate standard deviation by year

p.swaplevel('st', 'yr')

# correlation
data['pop'].corr(data['ev'])

# estimate a linear model
model = pd.ols(y=data['ev'], x=data['popm'])
print model 
model.beta