Example #1
def _calc_avg(ver, data):
    # note: despite the name, this reports the per-column medians
    ret = {'version': ver}
    if not data:
        return ret
    header = ('dem1', 'dem2', 'dem3', 'dem4', 'dem5')
    df = DataFrame(data, columns=header)
    avg_dict = dict(zip(header, df.median().round(2)))
    ret.update(avg_dict)
    ret.update({'avg': round(df.median().mean(), 2)})
    return ret
Example #2
def compare(df1: pd.DataFrame, df2: pd.DataFrame):
    """同じ列を持つ二つのdfの値を色々比べた結果をdfに入れて返す。"""
    assert (df1.columns == df2.columns).all()

    std = (df1.std() + df2.std()) / 2
    df_result = pd.DataFrame(index=df1.columns)
    df_result["mean_ae/std"] = np.abs(df1.mean() - df2.mean()) / std
    df_result["median_ae/std"] = np.abs(df1.median() - df2.median()) / std
    df_result["mode1"] = df1.mode().transpose()[0]
    df_result["mode2"] = df2.mode().transpose()[0]

    df_result = df_result.sort_values("median_ae/std", ascending=False)
    return df_result
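As an illustrative call (not part of the original source), compare() can flag distribution drift between a train and a test split:

# Hypothetical usage sketch for compare(); the data here is made up.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
train = pd.DataFrame(rng.normal(size=(100, 3)), columns=list("abc"))
test = pd.DataFrame(rng.normal(0.5, 1.0, size=(100, 3)), columns=list("abc"))
print(compare(train, test))  # columns with the largest *_ae/std values differ most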
Example #3
def work_on_collection(samp_to_map, min_samp_cutoff, delsdetectthresh,
                       real_del_thresh, dels_cooc_thresh, vsgv_dissim_thresh,
                       vsgv_clip_quantile, vsgv_fit_interval, vsgv_fit_method,
                       x_coverage, rate_param, vsgv_dense_perc, browser_path):
    binsize = int(rate_param / float(x_coverage))
    dichotomize = True
    dichotomize_thresh = 0.5
    max_spacing = 10
    taxonomy = read_pickle(taxonomypath)
    genepos = read_pickle(genepospath)
    bac_samps_map = defaultdict(dict)
    for samp, bacid_maps in samp_to_map.items():
        for bacname, bacmap in bacid_maps.items():
            bac_samps_map[bacname][samp] = bacmap

    sgvregions_all = []
    delsregions_all = []
    for bacname, bacdict in bac_samps_map.items():
        bacdf = DataFrame(bacdict).T
        if bacdf.shape[0] < min_samp_cutoff: continue
        if (bacdf.median() < 1).sum() / float(bacdf.shape[1]) > 0.3:
            continue
        delsregions, deldf = find_deletions(bacdf, bacname, dichotomize, dichotomize_thresh, \
                                            delsdetectthresh, max_spacing, dels_cooc_thresh)
        delsregions_all.extend(delsregions)
        sgvregions, normdf = find_sgvs(bacdf, max_spacing, vsgv_dense_perc, bacname, deldf, \
                               real_del_thresh, vsgv_clip_quantile, vsgv_fit_interval, \
                               vsgv_fit_method, vsgv_dissim_thresh)
        sgvregions_all.extend(sgvregions)

        if browser_path is not None:
            draw_one_region(bacname, binsize, taxonomy, normdf, \
                            deldf, delsregions, bacdf, sgvregions, browser_path, genepos)
    return concat(sgvregions_all, axis = 1) if len(sgvregions_all) > 0 else DataFrame(), \
        concat(delsregions_all, axis = 1) if len(delsregions_all) > 0 else DataFrame()
Example #4
    def construct_portfolio(self):
        """
        Compute the weights of the different styles from the style-factor
        returns and the Kalman-filter (kf) predictions.
        :return:
        """
        pre_date_data = w.tdaysoffset(-self.window, self.date, "Period=M")
        pre_date = pre_date_data.Data[0][0].strftime("%Y-%m-%d")
        tradedays_data = w.tdays(pre_date, self.date, "Period=M")
        tradedayslist = tradedays_data[0]
        tradedays = [td.strftime("%Y-%m-%d") for td in tradedayslist]
        # extract the factor data
        style_return = DataFrame()
        for f in self.factors:
            f_data = []
            for dt in tradedays:
                stockcodes = StockPool(dt).select_stock()
                f_values = f(dt, stockcodes).getdata()
                f_ret = FactorProcess.get_alpha(stockcodes, dt,
                                                -1)  # use one month of alpha
                df = DataFrame(data=[f_values, f_ret],
                               columns=[f.windLabel, 'ret'])
                long_only, long_short = FactorStyle.compute_style_return_month(
                    df, f.windLabel)
                f_data.append(long_only)
            style_return[f.windLabel] = f_data

        performance = FactorStyle.performance_curve(style_return)
        sign = FactorStyle.kpredict(performance)
        sign1 = np.where(sign == 1, sign, 0)
        style_weight = (style_return.median() -
                        self.risk_free_rate) / style_return.var()
        k = style_weight / style_weight.sum()
        weight = DataFrame(data=[k.values * sign1], columns=k.index)

        return weight
Example #5
    def construct_portfolio(self):
        """
        Allocate weights to the factors from each factor's style return; the
        core idea is the Kelly criterion.
        :return: the weight of each factor computed with the Kelly formula
        """
        pre_date_data = w.tdaysoffset(-self.window, self.date, "Period=M")
        pre_date = pre_date_data.Data[0][0].strftime("%Y-%m-%d")
        tradedays_data = w.tdays(pre_date, self.date, "Period=M")
        tradedayslist = tradedays_data[0]
        tradedays = [td.strftime("%Y-%m-%d") for td in tradedayslist]

        # extract the factor data
        style_return = DataFrame()
        for f in self.factors:
            f_data = []
            for dt in tradedays:
                stockcodes = StockPool(dt).select_stock()
                f_values = f(dt, stockcodes).getdata()
                f_ret = FactorProcess.get_alpha(stockcodes, dt,
                                                -1)  # use one month of alpha
                df = DataFrame(data=[f_values, f_ret],
                               columns=[f.windLabel, 'ret'])
                long_only, long_short = FactorStyle.compute_style_return_month(
                    df, f.windLabel)
                f_data.append(long_only)
            style_return[f.windLabel] = f_data
        style_weight = style_return.median() / style_return.var()
        k = style_weight / style_weight.sum()
        performance = FactorStyle.performance_curve(style_return)
        sign = np.sign(performance.values[-1] - performance.mean())
        sign1 = np.where(sign == 1, sign, 0)
        weight = DataFrame(data=[k.values * sign1], columns=k.index)
        return weight
Example #6
def impute(df: pd.DataFrame,
           columns: List[str],
           impute_values: List[float],
           method: Optional[str] = None,
           fit: bool = True,
           add_columns: bool = False) -> pd.DataFrame:
    if fit:
        if not method:
            raise ValueError("'method' has to be specified when fitting")
        if impute_values:
            raise ValueError("'impute value' argument cannot be used when " +
                             "fitting")

        if method == 'median':
            impute_values.extend(df[columns].median())
        elif method == 'mean':
            impute_values.extend(df[columns].mean())
        elif method == 'zero':
            impute_values.append(0)
        else:
            raise ValueError("Method not supported")
    else:
        if method:
            pass  # TODO: Print warning instead
            # raise ValueError("'method' argument cannot be used when not fitting")
        if not impute_values:
            raise ValueError("'impute value' has to be specified when not fitting")

    if add_columns:
        add_imputed_columns(df, columns)

    fill = (impute_values[0] if len(impute_values) == 1
            else pd.Series(impute_values, index=columns))
    df[columns] = df[columns].fillna(fill)
    return df
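A minimal usage sketch (an assumption, not from the source): `impute_values` doubles as an output argument during fitting, so the values learned on a training frame can be replayed on a test frame:

# Hypothetical usage sketch: fit on train, reuse the learned values on test.
import pandas as pd

train = pd.DataFrame({"a": [1.0, None, 3.0], "b": [4.0, 5.0, None]})
test = pd.DataFrame({"a": [None, 2.0], "b": [None, 6.0]})

learned = []  # filled in as a side effect of fit=True
train = impute(train, columns=["a", "b"], impute_values=learned,
               method="median", fit=True)
test = impute(test, columns=["a", "b"], impute_values=learned, fit=False)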
Example #7
class LogAggregate:
    def __init__(self, dataset):
        self.dataset = DataFrame(dataset)

    def get_median(self, *arg, **kwarg):
        if 'group_by' in kwarg:
            return self.dataset.groupby(kwarg['group_by']).median()[kwarg['key']]
        else:
            return self.dataset.median()[kwarg['key']]

    def get_average(self, *arg, **kwarg):
        if 'group_by' in kwarg:
            return self.dataset.groupby(kwarg['group_by']).mean()[kwarg['key']]
        else:
            return self.dataset.mean()[kwarg['key']]

    def get_min(self, *arg, **kwarg):
        if 'group_by' in kwarg:
            return self.dataset.groupby(kwarg['group_by']).min()[kwarg['key']]
        else:
            return self.dataset.min()[kwarg['key']]
    
    def get_max(self, *arg, **kwarg):
        if 'group_by' in kwarg:
            return self.dataset.groupby(kwarg['group_by']).max()[kwarg['key']]
        else:
            return self.dataset.max()[kwarg['key']]

    def get_count(self, *arg, **kwarg):
        if 'group_by' in kwarg:
            return self.dataset.groupby(kwarg['group_by']).count()[kwarg['key']]
        else:
            return self.dataset.count()[kwarg['key']]
Example #8
    def pre_processing_regression(self, data_frame: pd.DataFrame, **kwargs):
        """
        Receives a full data frame with the y axis (prediction) as the last
        feature.
        Accepted kwargs:
            test_size = for the split.
            random_state = for the split.
        :param data_frame:
        :param kwargs:
        :return:
        """
        test_size = kwargs.get('test_size', 0.33)
        random_state = kwargs.get('random_state', 42)
        data_frame.fillna(data_frame.median(), inplace=True)
        X = data_frame.iloc[:, 1:-1].values
        y = data_frame.iloc[:, -1].values
        data = self.normalization_model.fit_transform(X)
        X_train, X_test, y_train, y_test = model_selection.train_test_split(
            data,
            y,
            test_size=test_size,
            random_state=random_state,
            shuffle=True)
        return X_train, X_test, y_train, y_test
Example #9
def createAALstats(df_aal: pd.DataFrame) -> pd.DataFrame:
    """Group together some basic statistics from each AAL Group"""
    stat_array = np.array([df_aal.mean(), df_aal.median(), df_aal.min(), df_aal.max()]).T
    stat_cols = ['Average', 'Median', 'Minimum', 'Maximum']
    stat_index = df_aal.mean().index
    df_stats = pd.DataFrame(stat_array, columns=stat_cols, index=stat_index)
    return df_stats
Example #10
def impute(data: pd.DataFrame, strategy="mode") -> pd.DataFrame:
    """Impute missing values according to `strategy`.

    Parameters
    ----------
    data : DataFrame
        Data for imputation.
    strategy : str, optional
        Method for calculating fill values, by default "mode".

    Returns
    -------
    DataFrame
        Data with missing values filled.

    Raises
    ------
    ValueError
        Could not fill values in some columns using `strategy`.
    """
    if strategy == "mode":
        filler = data.mode().loc[0]
    elif strategy == "mean":
        filler = data.mean()
    elif strategy == "median":
        filler = data.median()
    data = data.fillna(filler)
    has_na = data.isna().any(axis=0)
    if has_na.any():
        failed = has_na[has_na].index.to_list()
        raise ValueError(f"Could not fill values in {failed} with {strategy}")
    return data
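A quick illustrative call (assumed, not from the source); the default mode strategy also covers non-numeric columns:

# Hypothetical usage sketch for impute() with the default strategy.
import numpy as np
import pandas as pd

raw = pd.DataFrame({"color": ["red", None, "red"], "size": [1.0, np.nan, 3.0]})
clean = impute(raw)  # mode handles object and numeric columns alike
# impute(raw, strategy='median') would fail here: a median cannot
# fill the object-typed 'color' column, so the function raises.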
Example #11
def descriptive_stats(df: pd.DataFrame, percentiles=(.25, .5, .75), exclude=None, datetime_is_numeric=False):
    """
    The pandas.DataFrame.describe() function omits datatype, missing value count, and median value per column when
    generating descriptive statistics for a DataFrame. These are important to get a general overview of a dataset. This
    descriptive_stats() function generates those and the stats already provided in pandas.DataFrame.describe() for a
    DataFrame passed in as an input parameter. All other parameters are default describe() parameters.

    :param df: pandas.DataFrame for which to generate descriptive statistics
    :return: a pandas.DataFrame containing the descriptive stats, similar to pandas.DataFrame.describe()
    """
    dstats_df = pd.concat(
        [
            df.dtypes,
            df.isna().sum(),
            df.median(numeric_only=True)
        ],
        axis=1
    )
    dstats_df = dstats_df.rename(columns={
        0: "type",
        1: "missing",
        2: "median"
    })
    dstats_df = dstats_df.T
    dstats_df = pd.concat([dstats_df, df.describe(percentiles, include='all', exclude=exclude)])  # TODO: describe might return a Series, not a df
    return dstats_df
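An illustrative call (hypothetical data, not from the source):

# Hypothetical usage sketch for descriptive_stats().
import pandas as pd

df = pd.DataFrame({"age": [21, 35, None, 52], "city": ["NY", "LA", "NY", None]})
print(descriptive_stats(df))  # adds type/missing/median rows on top of describe()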
Example #12
    def test_fillna_categorical_nan(self):
        # GH 14021
        # np.nan should always be a valid filler
        cat = Categorical([np.nan, 2, np.nan])
        val = Categorical([np.nan, np.nan, np.nan])
        df = DataFrame({"cats": cat, "vals": val})
        with tm.assert_produces_warning(RuntimeWarning):
            res = df.fillna(df.median())
        v_exp = [np.nan, np.nan, np.nan]
        df_exp = DataFrame({"cats": [2, 2, 2], "vals": v_exp}, dtype="category")
        tm.assert_frame_equal(res, df_exp)

        result = df.cats.fillna(np.nan)
        tm.assert_series_equal(result, df.cats)

        result = df.vals.fillna(np.nan)
        tm.assert_series_equal(result, df.vals)

        idx = pd.DatetimeIndex(
            ["2011-01-01 09:00", "2016-01-01 23:45", "2011-01-01 09:00", pd.NaT, pd.NaT]
        )
        df = DataFrame({"a": Categorical(idx)})
        tm.assert_frame_equal(df.fillna(value=pd.NaT), df)

        idx = pd.PeriodIndex(
            ["2011-01", "2011-01", "2011-01", pd.NaT, pd.NaT], freq="M"
        )
        df = DataFrame({"a": Categorical(idx)})
        tm.assert_frame_equal(df.fillna(value=pd.NaT), df)

        idx = pd.TimedeltaIndex(["1 days", "2 days", "1 days", pd.NaT, pd.NaT])
        df = DataFrame({"a": Categorical(idx)})
        tm.assert_frame_equal(df.fillna(value=pd.NaT), df)
Example #13
def analyze(df: pd.DataFrame):
    """中身を適当に分析してDataFrameに詰めて返す。"""
    if isinstance(df, pd.DataFrame):
        df_result = pd.DataFrame(index=df.columns)
        df_result["dtype"] = df.dtypes
        df_result["null"] = df.isnull().sum()
        df_result["nunique"] = df.nunique()
        df_result["min"] = df.min()
        df_result["median"] = df.median()
        df_result["max"] = df.max()
        df_result["mode"] = df.mode().transpose()[0]
        df_result["mean"] = df.mean()
        df_result["std"] = df.std()
        # # To gauge how extreme the outliers are, look at the absolute values after RobustScaler.
        # numeric_columns = df.select_dtypes(include=np.number).columns
        # df_result["outlier_size"] = np.nan
        # df_result.loc[numeric_columns, "outlier_size"] = (
        #     tk.preprocessing.SafeRobustScaler(clip_range=None)
        #     .fit_transform(df.loc[:, numeric_columns])
        #     .fillna(0)
        #     .abs()
        #     .max()
        #     .round(decimals=1)
        # )
        return df_result
    else:
        raise NotImplementedError()
Example #14
    def test_fillna_categorical_nan(self):
        # GH 14021
        # np.nan should always be a valid filler
        cat = Categorical([np.nan, 2, np.nan])
        val = Categorical([np.nan, np.nan, np.nan])
        df = DataFrame({"cats": cat, "vals": val})
        res = df.fillna(df.median())
        v_exp = [np.nan, np.nan, np.nan]
        df_exp = DataFrame({"cats": [2, 2, 2], "vals": v_exp},
                           dtype='category')
        tm.assert_frame_equal(res, df_exp)

        result = df.cats.fillna(np.nan)
        tm.assert_series_equal(result, df.cats)
        result = df.vals.fillna(np.nan)
        tm.assert_series_equal(result, df.vals)

        idx = pd.DatetimeIndex(['2011-01-01 09:00', '2016-01-01 23:45',
                                '2011-01-01 09:00', pd.NaT, pd.NaT])
        df = DataFrame({'a': Categorical(idx)})
        tm.assert_frame_equal(df.fillna(value=pd.NaT), df)

        idx = pd.PeriodIndex(['2011-01', '2011-01', '2011-01',
                              pd.NaT, pd.NaT], freq='M')
        df = DataFrame({'a': Categorical(idx)})
        tm.assert_frame_equal(df.fillna(value=pd.NaT), df)

        idx = pd.TimedeltaIndex(['1 days', '2 days',
                                 '1 days', pd.NaT, pd.NaT])
        df = DataFrame({'a': Categorical(idx)})
        tm.assert_frame_equal(df.fillna(value=pd.NaT), df)
Example #16
def remove_nan(df: pd.DataFrame) -> pd.DataFrame:

    if df.isnull().values.any():
        print('Data not OK, removing NaN values...')
        print()
        nan_values = []
        indices = list(np.arange(df.shape[1]))
        for j in range(df.shape[1]):
            nan_values.append(df[j].isnull().sum())

        print('Before:')
        print(f"Indices:    {indices}")  # index of each feature
        print(f"NaN values: {nan_values}")  # NaN count per feature
        print()

        df = df.fillna(df.median())  #replacing nan with median

        nan_values = []
        indices = list(np.arange(df.shape[1]))
        for j in range(df.shape[1]):
            nan_values.append(df[j].isnull().sum())

        print('After:')
        print(f"Indices:    {indices}")  # index of each feature
        print(f"NaN values: {nan_values}")  # NaN count per feature
        print()

    else:
        print("Data has no NaN values")

    return df
Example #17
def plot_variation_distn(gene_vars: pd.DataFrame):
    """
    Plot the distributions of variation means and medians across genes.
    :param gene_vars: DF with genes in rows and genes' variations as values across columns.
    """
    plt.hist(gene_vars.median(axis=1), bins=100, alpha=0.4, label='median')
    plt.hist(gene_vars.mean(axis=1), bins=100, alpha=0.4, label='mean')
    plt.legend()
Example #18
def describe(df: pd.DataFrame) -> pd.DataFrame:
    return pd.concat([
        df.mean().rename('mean'),
        df.median().rename('median'),
        df.max().rename('max'),
        df.min().rename('min')
    ], axis=1).T
Example #19
    def get_grades_avg(cls, proj, ver, reso):
        """
        Get the average-score information for the images.
        """
        imgs = Image.objects.filter(project=proj, version=ver, resolution=reso)
        rows = [Grade.get_average(img) for img in imgs]
        df = DataFrame(rows, index=[r.pop('version') for r in rows])
        df.loc['total'] = df.median()  # note: the 'total' row holds the per-column medians
        return df
Example #20
def remove_zero_median(gene_vars: pd.DataFrame) -> pd.DataFrame:
    """
    Remove rows with 0 median.
    :param gene_vars: DF with genes in rows and genes' variations as values across columns.
    :return: DF without rows with 0 median
    """
    var_medians = gene_vars.median(axis=1)
    remove_genes = var_medians[var_medians == 0].index
    return gene_vars.drop(remove_genes)
Example #21
def transform_to_numeric_form(df: pd.DataFrame) -> pd.DataFrame:
    """ Transform all data to be in numeric form

    A few steps are performed:
    1. Remove timestamp from features if any, because it can be easily used to
    distinguish the train set from the test set in case of time split.
    2. Category/object columns are transformed to one-hot encoded (dummy)
    variables. It deals well with <NA>.
    3. <NA> are filled with median.

    Those steps may not be suitable for every dataset.
    They are designed to be as general as possible and to work with any dataset.
    Consider more hand-crafted preprocessing if any warnings appear.

    Parameters
    ----------
    df: pd.DataFrame
        Initially preprocessed data that should be transformed.

    Returns
    -------
    df: pd.DataFrame
        A DataFrame with all transformation performed.

    """
    STANDARD_WARNING = '\nKeep in mind that the dataset should be initially preprocessed.'

    # remove timestamp
    columns_before = set(df.columns)
    df = df.select_dtypes(exclude='datetime')
    columns_after = set(df.columns)
    if columns_after != columns_before:
        warn('Columns with dtype=datetime were removed. You may want to use '
             'timedelta features instead.' + STANDARD_WARNING)

    # encode all categorical features as one-hot
    num_cols_before = len(df.columns)
    df = get_dummies(df)
    num_cols_after = len(df.columns)
    if num_cols_after > 10 * num_cols_before:
        warn(
            f'Number of columns after dummy encoding changed from '
            f'{num_cols_before} to {num_cols_after}. It can mean that:\n'
            f'(1) you have category type columns with a lot of unique values (this is fine), or \n'
            f'(2) a numeric column was treated as category-like and every value is one-hot encoded (this is wrong).'
            f'{STANDARD_WARNING}')

    # fill <NA>
    if df.isna().any().any():
        df.fillna(df.median(), inplace=True)
        warn('Missing values are filled with medians.' + STANDARD_WARNING)

    return df
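A toy run (hypothetical data, not from the source) exercising all three steps; expect the datetime and missing-value warnings:

# Hypothetical usage sketch for transform_to_numeric_form().
import pandas as pd

df = pd.DataFrame({
    "ts": pd.to_datetime(["2021-01-01", "2021-01-02"]),
    "cat": ["a", "b"],
    "num": [1.0, None],
})
df = transform_to_numeric_form(df)  # drops "ts", one-hot encodes "cat", fills "num"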
Example #22
def get_stats(csv: pd.DataFrame) -> Dict[str, Dict[str, float]]:
    """Získá všechny hledané statistické hodnoty."""
    output = {}
    for key in csv.keys():
        data = {
            'mean': csv.mean()[key],
            'median': csv.median()[key],
            'first': quartiles(csv)[0][key],
            'last': quartiles(csv)[1][key],  # TODO: differs
            'passed': len(csv[(csv[key] > 0)])
        }
        output[key] = data
    return output
Example #23
def descriptive_stats_report(given_data: pd.DataFrame) -> pd.DataFrame:
    """ Create a report with descriptive stats of the data. """
    description = given_data.describe().T
    description.drop(columns=["count"], inplace=True)
    jarque_bera_test = given_data.apply(
        lambda attribute: stats.jarque_bera(attribute))
    description.insert(loc=1, column="median", value=given_data.median())
    description.insert(loc=2,
                       column="jarque_bera",
                       value=jarque_bera_test.iloc[0, 0:])
    description.insert(loc=3,
                       column="p_value",
                       value=jarque_bera_test.iloc[1, 0:])
    return description
Example #24
def impute(df: pd.DataFrame):
    """Returns df with imputed features.
    Note: lots of things have filled na with "unknown"
    """

    # fill in values for some vars from unknown -> None
    df.loc[df['AbdomenTender'].isin(['no', 'unknown']),
           'AbdTenderDegree'] = 'None'

    # impute remaining missing numeric values with the column medians
    df = df.fillna(df.median())

    df.GCSScore = df.GCSScore.fillna(df.GCSScore.median())
    return df
Example #25
    def test_quantile(self, datetime_frame):
        from numpy import percentile

        df = datetime_frame
        q = df.quantile(0.1, axis=0, numeric_only=True)
        assert q["A"] == percentile(df["A"], 10)
        tm.assert_index_equal(q.index, df.columns)

        q = df.quantile(0.9, axis=1, numeric_only=True)
        assert q["2000-01-17"] == percentile(df.loc["2000-01-17"], 90)
        tm.assert_index_equal(q.index, df.index)

        # test degenerate case
        q = DataFrame({
            "x": [],
            "y": []
        }).quantile(0.1, axis=0, numeric_only=True)
        assert np.isnan(q["x"]) and np.isnan(q["y"])

        # non-numeric exclusion
        df = DataFrame({"col1": ["A", "A", "B", "B"], "col2": [1, 2, 3, 4]})
        rs = df.quantile(0.5, numeric_only=True)
        with tm.assert_produces_warning(FutureWarning,
                                        match="Select only valid"):
            xp = df.median().rename(0.5)
        tm.assert_series_equal(rs, xp)

        # axis
        df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3])
        result = df.quantile(0.5, axis=1)
        expected = Series([1.5, 2.5, 3.5], index=[1, 2, 3], name=0.5)
        tm.assert_series_equal(result, expected)

        result = df.quantile([0.5, 0.75], axis=1)
        expected = DataFrame({
            1: [1.5, 1.75],
            2: [2.5, 2.75],
            3: [3.5, 3.75]
        },
                             index=[0.5, 0.75])
        tm.assert_frame_equal(result, expected, check_index_type=True)

        # We may want to break API in the future to change this
        # so that we exclude non-numeric along the same axis
        # See GH #7312
        df = DataFrame([[1, 2, 3], ["a", "b", 4]])
        result = df.quantile(0.5, axis=1, numeric_only=True)
        expected = Series([3.0, 4.0], index=[0, 1], name=0.5)
        tm.assert_series_equal(result, expected)
Example #26
def desc(df: pd.DataFrame):
	"""Produces a summary of the input DataFrame

	Arguments:
		df {pd.DataFrame} -- DataFrame to summarize

	Returns:
		pd.DataFrame -- DataFrame of summary statistics
	"""

	desc = df.describe(percentiles = None).T
	desc['missing'] = len(df.index) - desc['count']
	# desc = desc.astype('int')
	desc['median'] = df.median()
	desc['missing %'] = desc.missing / len(df.index) * 100
	return desc.T
Example #27
    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        Checks that input is a dataframe.
        Parameters
        ----------
        X : Pandas DataFrame
        y : Pandas Series, np.array. Default = None
            Parameter is necessary for compatibility with sklearn.pipeline.Pipeline.
        Raises
        ------
        TypeError
            If the input is not a Pandas DataFrame
        ValueError
            If there are no numerical variables in the df or the df is empty
        Returns
        -------
        X : Pandas DataFrame
            The same dataframe entered as parameter
        """

        # check input dataframe
        X = _to_dataframe(X)

        if self.method == 'feature':
            new_index, indexer = X.columns.sort_values(return_indexer=True)
            pos = new_index.get_loc(self.feature, method='pad')
            pos = indexer[pos]
            self.scaling_factors_ = X.iloc[:, pos]

        elif self.method in ['total', 'sum']:
            self.scaling_factors_ = X.sum(axis=1)

        elif self.method == 'PQN':
            # "Build" the reference sample based and compute quotients
            if self.ref_sample == 'mean':  # Mean spectre of all samples
                ref_sample2 = X / X.mean(axis=0)
            elif self.ref_sample == 'median':  # Median spectre of all samples
                ref_sample2 = X / X.median(axis=0)
            elif self.ref_sample in X.index:  # Sample name to use as a reference
                ref_sample2 = X / X.loc[self.ref_sample, :]
            else:  # Actual sample given (ref_sample is array like)
                ref_sample2 = X / self.ref_sample
            # Normalization Factors
            self.scaling_factors_ = ref_sample2.median(axis=1)

        self.input_shape_ = X.shape
        return self
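For intuition, a standalone sketch (an assumption, mirroring only the `ref_sample == 'median'` branch above) of how the PQN scaling factors are computed:

# Minimal PQN sketch with a median reference spectrum; illustrative only.
import pandas as pd

def pqn_factors(X: pd.DataFrame) -> pd.Series:
    quotients = X / X.median(axis=0)  # divide every sample by the reference spectrum
    return quotients.median(axis=1)   # per-sample factor = median of its quotients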
Example #28
def get_totals(df: pd.DataFrame):
    """
    The function takes a pandas DataFrame and creates a dictionary with selected summary statistics.
    """
    out = dict()
    out['min'] = df.min()
    out['per15'] = df.quantile(0.15)
    out['qr1'] = df.quantile(0.25)
    out['median'] = df.median()
    out['qr3'] = df.quantile(0.75)
    out['per85'] = df.quantile(0.85)
    out['max'] = df.max()
    out['count'] = df.count()
    out['mean'] = df.mean()
    out['iqr'] = out['qr3'] - out['qr1']

    return pd.DataFrame(out)
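An illustrative call (hypothetical data, not from the source); the result has one row per input column and one column per statistic:

# Hypothetical usage sketch for get_totals().
import pandas as pd

df = pd.DataFrame({"x": range(10), "y": range(0, 20, 2)})
print(get_totals(df))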
Example #29
    def test_quantile(self):
        from numpy import percentile

        q = self.tsframe.quantile(0.1, axis=0)
        assert q['A'] == percentile(self.tsframe['A'], 10)
        tm.assert_index_equal(q.index, self.tsframe.columns)

        q = self.tsframe.quantile(0.9, axis=1)
        assert (q['2000-01-17'] == percentile(self.tsframe.loc['2000-01-17'],
                                              90))
        tm.assert_index_equal(q.index, self.tsframe.index)

        # test degenerate case
        q = DataFrame({'x': [], 'y': []}).quantile(0.1, axis=0)
        assert (np.isnan(q['x']) and np.isnan(q['y']))

        # non-numeric exclusion
        df = DataFrame({'col1': ['A', 'A', 'B', 'B'], 'col2': [1, 2, 3, 4]})
        rs = df.quantile(0.5)
        xp = df.median().rename(0.5)
        assert_series_equal(rs, xp)

        # axis
        df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3])
        result = df.quantile(.5, axis=1)
        expected = Series([1.5, 2.5, 3.5], index=[1, 2, 3], name=0.5)
        assert_series_equal(result, expected)

        result = df.quantile([.5, .75], axis=1)
        expected = DataFrame({
            1: [1.5, 1.75],
            2: [2.5, 2.75],
            3: [3.5, 3.75]
        },
                             index=[0.5, 0.75])
        assert_frame_equal(result, expected, check_index_type=True)

        # We may want to break API in the future to change this
        # so that we exclude non-numeric along the same axis
        # See GH #7312
        df = DataFrame([[1, 2, 3], ['a', 'b', 4]])
        result = df.quantile(.5, axis=1)
        expected = Series([3., 4.], index=[0, 1], name=0.5)
        assert_series_equal(result, expected)
Example #30
def transform_df(df: pd.DataFrame) -> pd.DataFrame:
    '''Accepts a pd.DataFrame of survey results as input, applies a
    scoring system to the responses, fills NaNs with the median score,
    and renames the columns for brevity. Returns the modified pd.DataFrame
    as output.
    '''
    df.replace(
        ("Strongly Disagree", "Disagree", "Slightly Disagree",
         "Unsure/No Opinion", "Slightly Agree", "Agree", "Strongly Agree"),
        (-3, -2, -1, 0, 1, 2, 3),
        inplace=True)
    df.replace(("Yes", "No"), (1, 0), inplace=True)
    df = df.dropna(thresh=len(df) * .9, axis=1)
    df = df.fillna(df.median())
    df.columns = df.columns.str.replace(
        'For each of the statements below, please indicate how strongly you agree or disagree.',
        "",
        regex=True)
    return df
Example #31
    def get_mean_row(self, arr_res, col):
        """ Return the row holding the median of the given column in the results array. """
        # get the median row of seido
        arr_target = []

        for val in arr_res:
            arr_target.append(val[col])

        df_target = DataFrame(arr_target)
        df_target.columns = ['A']
        med = df_target.median()
        med = med[0]  # extract just the value

        # return df_target[df_target.A == med]  # rows matching the median (there can be several)
        # return the first row whose value matches the median
        for val in arr_res:
            if val[col] == med:
                return val
        return []
Example #32
    def test_quantile(self, datetime_frame):
        from numpy import percentile

        df = datetime_frame
        q = df.quantile(0.1, axis=0)
        assert q['A'] == percentile(df['A'], 10)
        tm.assert_index_equal(q.index, df.columns)

        q = df.quantile(0.9, axis=1)
        assert (q['2000-01-17'] ==
                percentile(df.loc['2000-01-17'], 90))
        tm.assert_index_equal(q.index, df.index)

        # test degenerate case
        q = DataFrame({'x': [], 'y': []}).quantile(0.1, axis=0)
        assert(np.isnan(q['x']) and np.isnan(q['y']))

        # non-numeric exclusion
        df = DataFrame({'col1': ['A', 'A', 'B', 'B'], 'col2': [1, 2, 3, 4]})
        rs = df.quantile(0.5)
        xp = df.median().rename(0.5)
        assert_series_equal(rs, xp)

        # axis
        df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3])
        result = df.quantile(.5, axis=1)
        expected = Series([1.5, 2.5, 3.5], index=[1, 2, 3], name=0.5)
        assert_series_equal(result, expected)

        result = df.quantile([.5, .75], axis=1)
        expected = DataFrame({1: [1.5, 1.75], 2: [2.5, 2.75],
                              3: [3.5, 3.75]}, index=[0.5, 0.75])
        assert_frame_equal(result, expected, check_index_type=True)

        # We may want to break API in the future to change this
        # so that we exclude non-numeric along the same axis
        # See GH #7312
        df = DataFrame([[1, 2, 3],
                        ['a', 'b', 4]])
        result = df.quantile(.5, axis=1)
        expected = Series([3., 4.], index=[0, 1], name=0.5)
        assert_series_equal(result, expected)
Example #33
def cross_validate_trades(trades, N=20, subset_fraction=0.7):

    tickers = trades.tickers
    sample_size = round(len(tickers) * subset_fraction)
    summary = DataFrame(dtype=float)

    for n in range(N):
        sample_tickers = list(
            random.choice(tickers, sample_size, replace=False))
        trade_subset = trades.find(lambda T: T.ticker in sample_tickers)
        summary[n] = summary_report(trade_subset)

    result = DataFrame(dtype=float)
    result['Base'] = summary_report(trades)
    result['Mean'] = summary.mean(axis=1)
    result['Std'] = summary.std(axis=1)
    result['Median'] = summary.median(axis=1)
    result['Max'] = summary.max(axis=1)
    result['Min'] = summary.min(axis=1)

    return (result, summary)
Example #35
# after preparing the data, time to plot it:

for new_counter in range(file_counter + 1):
    # print(new_counter)
    Qbers = final_data[(final_data["Dataset"] == new_counter) & (final_data["Qber"] > 0)]
    x1 = Qbers.index.tolist()
    y1 = Qbers["Qber"].tolist()
    x1_average = Qbers["Qber"].mean()
    x1_std_dev = Qbers["Qber"].std()
    # preparing the proper time axis:
    x1[:] = [x - quelle_initialTimestamps[new_counter] for x in x1]
    
    Raws = final_data[(final_data["Dataset"] == new_counter) & (final_data["Raw key"] > 0)]
    x2_average = Raws["Raw key"].mean()
    x2_median = Raws["Raw key"].median()
    x2_max = Raws["Raw key"].max()

    Raws = Raws[Raws["Raw key"] < (x2_max - (x2_max / 100) * 20)]
    
    x2 = Raws.index.tolist()
    y2 = Raws["Raw key"].tolist()

    print(x2_average)
    # x2_std_dev = 3
    # once again correcting the counter:
    x2[:] = [x - quelle_initialTimestamps[new_counter] for x in x2]
    # print(x1[0], x2[0], quelle_initialTimestamps[new_counter])
    # Two subplots, the axes array is 1-d http://matplotlib.org/examples/pylab_examples/subplots_demo.html
    f, axarr = plt.subplots(2, sharex=True)
    axarr[0].grid()
Example #36
import numpy as np
import scipy as sc
from scipy import stats
import matplotlib.pyplot as plt
import pandas as pd
from pandas import DataFrame, Series
db = pd.read_csv("/users/rosiezou/Desktop/mortgage-stanley/FMAC-5US.csv")
table = DataFrame(db, columns=['Date', 'Value'])
plt.plot(db['Date'], db['Value'], 'bo')
regressionline = sc.stats.linregress(db['Date'], db['Value'])
m = regressionline[0]
b = regressionline[1]
x = np.linspace(0, 18, 100)
plt.plot(x, m*x + b)
plt.show()
print(table.median(axis=0))
print(table.mode(axis=0))