def _calc_avg(ver, data): ret = {'version': ver} if not data: return ret header = ('dem1', 'dem2', 'dem3', 'dem4', 'dem5') df = DataFrame(data, columns=header) avg_dict = dict(zip(header, df.median().round(2))) ret.update(avg_dict) ret.update({'avg': round(df.median().mean(), 2)}) return ret
def compare(df1: pd.DataFrame, df2: pd.DataFrame):
    """Compare two DataFrames with identical columns.

    Returns a frame indexed by column with absolute mean/median
    differences scaled by the pooled std, plus each frame's first mode,
    sorted by 'median_ae/std' descending.
    """
    assert (df1.columns == df2.columns).all()
    pooled_std = (df1.std() + df2.std()) / 2
    out = pd.DataFrame(index=df1.columns)
    out["mean_ae/std"] = (df1.mean() - df2.mean()).abs() / pooled_std
    out["median_ae/std"] = (df1.median() - df2.median()).abs() / pooled_std
    out["mode1"] = df1.mode().T[0]
    out["mode2"] = df2.mode().T[0]
    return out.sort_values("median_ae/std", ascending=False)
def work_on_collection(samp_to_map, min_samp_cutoff, delsdetectthresh, real_del_thresh,
                       dels_cooc_thresh, vsgv_dissim_thresh, vsgv_clip_quantile,
                       vsgv_fit_interval, vsgv_fit_method, x_coverage, rate_param,
                       vsgv_dense_perc, browser_path):
    """Detect deletion regions and variable-SGV regions per bacterium.

    Regroups sample->bacterium coverage maps by bacterium, filters
    bacteria with too few samples or too many low-coverage bins, then
    runs deletion and SGV detection on each. Optionally renders a
    per-bacterium browser figure when ``browser_path`` is given.

    NOTE: Python 2 code (``dict.iteritems``). ``taxonomypath`` and
    ``genepospath`` are read from enclosing/module scope (defined
    elsewhere in the file).

    Returns a 2-tuple ``(sgv_regions_df, deletion_regions_df)``; either
    is an empty DataFrame when nothing was found.
    """
    # bin width in bp implied by the sampling rate and coverage depth
    binsize = int(rate_param / float(x_coverage))
    dichotomize = True
    dichotomize_thresh = 0.5
    max_spacing = 10
    taxonomy = read_pickle(taxonomypath)
    genepos = read_pickle(genepospath)
    # invert samp -> {bac: map} into bac -> {samp: map}
    bac_samps_map = defaultdict(dict)
    for samp, bacid_maps in samp_to_map.iteritems():
        for bacname, bacmap in bacid_maps.iteritems():
            bac_samps_map[bacname][samp] = bacmap
    sgvregions_all = []
    delsregions_all = []
    for bacname, bacdict in bac_samps_map.iteritems():
        bacdf = DataFrame(bacdict).T  # rows = samples, cols = bins
        # skip bacteria observed in too few samples
        if bacdf.shape[0] < min_samp_cutoff:
            continue
        # skip bacteria where >30% of bins have median coverage < 1
        if (bacdf.median() < 1).sum() / float(bacdf.shape[1]) > 0.3:
            continue
        delsregions, deldf = find_deletions(bacdf, bacname, dichotomize, dichotomize_thresh,
                                            delsdetectthresh, max_spacing, dels_cooc_thresh)
        delsregions_all.extend(delsregions)
        sgvregions, normdf = find_sgvs(bacdf, max_spacing, vsgv_dense_perc, bacname, deldf,
                                       real_del_thresh, vsgv_clip_quantile, vsgv_fit_interval,
                                       vsgv_fit_method, vsgv_dissim_thresh)
        sgvregions_all.extend(sgvregions)
        if browser_path is not None:
            draw_one_region(bacname, binsize, taxonomy, normdf,
                            deldf, delsregions, bacdf, sgvregions, browser_path, genepos)
    return concat(sgvregions_all, axis = 1) if len(sgvregions_all) > 0 else DataFrame(), \
           concat(delsregions_all, axis = 1) if len(delsregions_all) > 0 else DataFrame()
def construct_portfolio(self):
    """Compute per-style weights from style-factor returns and KF predictions.

    (Translated from Chinese: "根据风格因子的收益和kf预测的结果,计算不同风格的权重")

    :return: DataFrame of weights, one column per factor.
    """
    # window of monthly trade dates ending at self.date (WindPy API)
    pre_date_data = w.tdaysoffset(-self.window, self.date, "Period=M")
    pre_date = pre_date_data.Data[0][0].strftime("%Y-%m-%d")
    tradedays_data = w.tdays(pre_date, self.date, "Period=M")
    tradedayslist = tradedays_data[0]
    tradedays = [td.strftime("%Y-%m-%d") for td in tradedayslist]
    # extract factor data
    style_return = DataFrame()
    for f in self.factors:
        f_data = []
        for dt in tradedays:
            stockcodes = StockPool(dt).select_stock()
            # NOTE(review): this reassigns the accumulator list `f_data`
            # to the factor's raw data every iteration; the later
            # `f_data.append(long_only)` then appends to that object
            # instead of the month-by-month accumulator — looks like a
            # naming-collision bug; confirm against the original intent.
            f_data = f(dt, stockcodes).getdata()
            f_ret = FactorProcess.get_alpha(stockcodes, dt, -1)  # one month of alpha
            df = DataFrame(data=[f_data, f_ret], columns=[f.windLabel, 'ret'])
            long_only, long_short = FactorStyle.compute_style_return_month(
                df, f.windLabel)
            f_data.append(long_only)
        style_return[f.windLabel] = f_data
    performance = FactorStyle.performance_curve(style_return)
    # KF-based direction prediction; keep only styles predicted up (==1)
    sign = FactorStyle.kpredict(performance)
    sign1 = np.where(sign == 1, sign, 0)
    # Kelly-style weight: excess median return over variance, normalised
    style_weight = (style_return.median() - self.risk_free_rate) / style_return.var()
    k = style_weight / style_weight.sum().values
    # NOTE(review): `k` is a Series here, which has no `.columns`
    # attribute — `k.columns` would raise; presumably `k.index` (or a
    # DataFrame upstream) was intended. Confirm before relying on this.
    weight = DataFrame(data=k.values * sign1, columns=k.columns)
    return weight
def construct_portfolio(self):
    """Allocate per-factor weights from style returns via the Kelly formula.

    (Translated from Chinese: "根据每个因子的风格收益,来配置不同的权重,
    核心是根据凯利公式法则")

    :return: DataFrame of Kelly-formula weights, one column per factor.
    """
    # window of monthly trade dates ending at self.date (WindPy API)
    pre_date_data = w.tdaysoffset(-self.window, self.date, "Period=M")
    pre_date = pre_date_data.Data[0][0].strftime("%Y-%m-%d")
    tradedays_data = w.tdays(pre_date, self.date, "Period=M")
    tradedayslist = tradedays_data[0]
    tradedays = [td.strftime("%Y-%m-%d") for td in tradedayslist]
    # extract factor data
    style_return = DataFrame()
    for f in self.factors:
        f_data = []
        for dt in tradedays:
            stockcodes = StockPool(dt).select_stock()
            # NOTE(review): reassigning `f_data` here clobbers the
            # accumulator initialised above; the later append targets
            # the raw-data object instead — suspected bug, confirm.
            f_data = f(dt, stockcodes).getdata()
            f_ret = FactorProcess.get_alpha(stockcodes, dt, -1)  # one month of alpha
            df = DataFrame(data=[f_data, f_ret], columns=[f.windLabel, 'ret'])
            long_only, long_short = FactorStyle.compute_style_return_month(
                df, f.windLabel)
            f_data.append(long_only)
        style_return[f.windLabel] = f_data
    # Kelly-style weight: median return over variance, normalised to sum 1
    style_weight = style_return.median() / style_return.var()
    k = style_weight / style_weight.sum().values
    # keep only styles whose latest performance exceeds their mean
    performance = FactorStyle.performance_curve(style_return)
    sign = np.sign(performance.values[-1] - performance.mean())
    sign1 = np.where(sign == 1, sign, 0)
    # NOTE(review): `k` is a Series; `k.columns` would raise
    # AttributeError — presumably `k.index` was intended. Confirm.
    weight = DataFrame(data=k.values * sign1, columns=k.columns)
    return weight
def impute(df: pd.DataFrame, columns: List[str], impute_values: List[float],
           method: Optional[str] = None, fit: bool = True,
           add_columns: bool = False) -> pd.DataFrame:
    """Fill NaNs in `columns` of `df`.

    When ``fit`` is True, fill values are computed with ``method``
    ('median', 'mean' or 'zero') and appended to ``impute_values``
    in place, so the caller can reuse them later with ``fit=False``.
    When ``fit`` is False, the caller-supplied ``impute_values`` are
    applied as-is.

    :param df: frame to impute (modified in the listed columns).
    :param columns: columns to fill.
    :param impute_values: out-parameter when fitting; fill values otherwise.
    :param method: fit strategy; required iff ``fit`` is True.
    :param fit: compute fill values (True) or apply given ones (False).
    :param add_columns: also add indicator columns via ``add_imputed_columns``.
    :raises ValueError: on inconsistent method/impute_values combinations.
    :return: the (mutated) frame.
    """
    if fit:
        if not method:
            raise ValueError("'method' has to be specified when fitting")
        if impute_values:
            raise ValueError("'impute value' argument cannot be used when " +
                             "fitting")
        if method == 'median':
            # BUG FIX: compute statistics only over the columns being
            # imputed (was df.median(), which misaligns with `columns`).
            impute_values.extend(df[columns].median())
        elif method == 'mean':
            impute_values.extend(df[columns].mean())
        elif method == 'zero':
            impute_values.append(0)
        else:
            raise ValueError("Method not supported")
    else:
        if method:
            pass  # TODO: Print warning instead
            # raise ValueError("'method' argument cannot be used when fitting")
        if not impute_values:
            # BUG FIX: message said "when fitting" in the non-fit branch.
            raise ValueError("'impute value' has to be specified when not fitting")
    if add_columns:
        add_imputed_columns(df, columns)
    if len(impute_values) == 1:
        fill = impute_values[0]
    else:
        # BUG FIX: DataFrame.fillna does not accept a bare list; map each
        # target column to its fitted value instead.
        fill = dict(zip(columns, impute_values))
    df[columns] = df[columns].fillna(fill)
    return df
class LogAggregate:
    """Aggregate statistics over a tabular log dataset.

    Each getter takes ``key=<column>`` and an optional
    ``group_by=<column>`` keyword; with ``group_by`` the statistic is
    computed per group, otherwise over the whole dataset.
    """

    def __init__(self, dataset):
        # dataset: anything the DataFrame constructor accepts
        # (dict of lists, list of records, ...).
        self.dataset = DataFrame(dataset)

    def _aggregate(self, func_name, kwarg):
        """Dispatch a named pandas aggregation, optionally grouped.

        BUG FIX: the original used dict.has_key(), which was removed in
        Python 3; membership is tested with `in` instead.
        """
        if 'group_by' in kwarg:
            grouped = self.dataset.groupby(kwarg['group_by'])
            return getattr(grouped, func_name)()[kwarg['key']]
        return getattr(self.dataset, func_name)()[kwarg['key']]

    def get_median(self, *arg, **kwarg):
        return self._aggregate('median', kwarg)

    def get_average(self, *arg, **kwarg):
        return self._aggregate('mean', kwarg)

    def get_min(self, *arg, **kwarg):
        return self._aggregate('min', kwarg)

    def get_max(self, *arg, **kwarg):
        return self._aggregate('max', kwarg)

    def get_count(self, *arg, **kwarg):
        return self._aggregate('count', kwarg)
def pre_proceesing_regression(self, data_frame: pd.DataFrame, **kwargs):
    """Split a regression frame into scaled train/test sets.

    The last column is the target; features are columns 1..-2 (column 0
    is excluded). NaNs are filled in place with column medians, and the
    features are scaled with ``self.normalization_model``.

    Optional kwargs: ``test_size`` (default 0.33) and ``random_state``
    (default 42) for the split.

    :param data_frame: full frame with the target as the last column.
    :return: (X_train, X_test, y_train, y_test)
    """
    test_size = kwargs.get('test_size')
    if test_size is None:
        test_size = 0.33
    random_state = kwargs.get('random_state')
    if random_state is None:
        random_state = 42

    data_frame.fillna(data_frame.median(), inplace=True)
    features = data_frame.iloc[:, 1:-1].values
    target = data_frame.iloc[:, -1].values
    scaled = self.normalization_model.fit_transform(features)

    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        scaled, target, test_size=test_size, random_state=random_state,
        shuffle=True)
    return X_train, X_test, y_train, y_test
def createAALstats(df_aal: pd.DataFrame) -> pd.DataFrame:
    """Group together some basic statistics from each AAL Group.

    :param df_aal: frame with AAL groups as columns.
    :return: frame indexed by group with Average/Median/Minimum/Maximum columns.
    """
    aggregates = [df_aal.mean(), df_aal.median(), df_aal.min(), df_aal.max()]
    labels = ['Average', 'Median', 'Minimum', 'Maximum']
    data = np.array(aggregates).T
    return pd.DataFrame(data, columns=labels, index=aggregates[0].index)
def impute(data: pd.DataFrame, strategy="mode") -> pd.DataFrame:
    """Impute missing values according to `strategy`.

    Parameters
    ----------
    data : DataFrame
        Data for imputation.
    strategy : str, optional
        Method for calculating fill values ('mode', 'mean' or 'median'),
        by default "mode".

    Returns
    -------
    DataFrame
        Data with missing values filled.

    Raises
    ------
    ValueError
        Unknown `strategy`, or values in some columns could not be
        filled using `strategy`.
    """
    if strategy == "mode":
        # first row of mode(): one most-frequent value per column
        filler = data.mode().loc[0]
    elif strategy == "mean":
        filler = data.mean()
    elif strategy == "median":
        filler = data.median()
    else:
        # BUG FIX: an unrecognised strategy previously fell through and
        # crashed with NameError on the unbound `filler`.
        raise ValueError(f"Unknown imputation strategy: {strategy!r}")
    data = data.fillna(filler)
    has_na = data.isna().any(axis=0)
    if has_na.any():
        failed = has_na[has_na].index.to_list()
        raise ValueError(f"Could not fill values in {failed} with {strategy}")
    return data
def descriptive_stats(df: pd.DataFrame, percentiles=(.25, .5, .75),
                      exclude=None, datetime_is_numeric=False):
    """
    The pandas.DataFrame.describe() function omits datatype, missing value
    count, and median value per column when generating descriptive statistics
    for a DataFrame. These are important to get a general overview of a
    dataset. This descriptive_stats() function generates those and the stats
    already provided in pandas.DataFrame.describe() for a DataFrame passed in
    as an input parameter. All other parameters are default describe()
    parameters.

    :param df: pandas.DataFrame for which to generate descriptive statistics
    :param percentiles: percentiles passed through to describe()
    :param exclude: dtypes to exclude, passed through to describe()
    :param datetime_is_numeric: kept for backward compatibility.
        NOTE(review): currently unused by this implementation — confirm
        whether it should be forwarded to describe().
    :return: a pandas.DataFrame containing the descriptive stats, similar to
        pandas.DataFrame.describe()
    """
    # BUG FIX: the percentiles default was a mutable list ([.25, .5, .75]);
    # a tuple avoids the shared-mutable-default pitfall with no caller impact.
    dstats_df = pd.concat(
        [
            df.dtypes,                    # per-column dtype
            df.isna().sum(),              # per-column missing count
            df.median(numeric_only=True)  # per-column median (numeric only)
        ],
        axis=1
    )
    dstats_df = dstats_df.rename(columns={
        0: "type",
        1: "missing",
        2: "median"
    })
    dstats_df = dstats_df.T
    # describe() on a DataFrame returns a DataFrame, so concat is safe here.
    dstats_df = pd.concat([dstats_df,
                           df.describe(percentiles, include='all',
                                       exclude=exclude)])
    return dstats_df
def test_fillna_categorical_nan(self):
    # GH 14021
    # np.nan should always be a valid filler
    cat = Categorical([np.nan, 2, np.nan])
    val = Categorical([np.nan, np.nan, np.nan])
    df = DataFrame({"cats": cat, "vals": val})

    # median() over categoricals warns; the fill still applies the
    # numeric median (2) to 'cats' and leaves all-NaN 'vals' untouched.
    with tm.assert_produces_warning(RuntimeWarning):
        res = df.fillna(df.median())
    v_exp = [np.nan, np.nan, np.nan]
    df_exp = DataFrame({"cats": [2, 2, 2], "vals": v_exp}, dtype="category")
    tm.assert_frame_equal(res, df_exp)

    # filling with np.nan is a no-op on categorical Series
    result = df.cats.fillna(np.nan)
    tm.assert_series_equal(result, df.cats)

    result = df.vals.fillna(np.nan)
    tm.assert_series_equal(result, df.vals)

    # fillna(pd.NaT) is a no-op for datetime-backed categoricals
    idx = pd.DatetimeIndex(
        ["2011-01-01 09:00", "2016-01-01 23:45", "2011-01-01 09:00", pd.NaT, pd.NaT]
    )
    df = DataFrame({"a": Categorical(idx)})
    tm.assert_frame_equal(df.fillna(value=pd.NaT), df)

    # same for period-backed categoricals
    idx = pd.PeriodIndex(
        ["2011-01", "2011-01", "2011-01", pd.NaT, pd.NaT], freq="M"
    )
    df = DataFrame({"a": Categorical(idx)})
    tm.assert_frame_equal(df.fillna(value=pd.NaT), df)

    # and for timedelta-backed categoricals
    idx = pd.TimedeltaIndex(["1 days", "2 days", "1 days", pd.NaT, pd.NaT])
    df = DataFrame({"a": Categorical(idx)})
    tm.assert_frame_equal(df.fillna(value=pd.NaT), df)
def analyze(df: pd.DataFrame):
    """Summarise each column of a DataFrame into a stats frame.

    Columns of the result: dtype, null count, nunique, min, median, max,
    first mode, mean and std.

    :raises NotImplementedError: if `df` is not a pandas DataFrame.
    """
    if not isinstance(df, pd.DataFrame):
        raise NotImplementedError()
    result = pd.DataFrame(index=df.columns)
    result["dtype"] = df.dtypes
    result["null"] = df.isnull().sum()
    result["nunique"] = df.nunique()
    result["min"] = df.min()
    result["median"] = df.median()
    result["max"] = df.max()
    result["mode"] = df.mode().transpose()[0]
    result["mean"] = df.mean()
    result["std"] = df.std()
    return result
def test_fillna_categorical_nan(self):
    # GH 14021
    # np.nan should always be a valid filler
    cat = Categorical([np.nan, 2, np.nan])
    val = Categorical([np.nan, np.nan, np.nan])
    df = DataFrame({"cats": cat, "vals": val})
    # fillna with the frame medians: 'cats' becomes all 2, the all-NaN
    # 'vals' column stays NaN
    res = df.fillna(df.median())
    v_exp = [np.nan, np.nan, np.nan]
    df_exp = DataFrame({"cats": [2, 2, 2], "vals": v_exp}, dtype='category')
    tm.assert_frame_equal(res, df_exp)

    # filling with np.nan is a no-op on categorical Series
    result = df.cats.fillna(np.nan)
    tm.assert_series_equal(result, df.cats)
    result = df.vals.fillna(np.nan)
    tm.assert_series_equal(result, df.vals)

    # fillna(pd.NaT) is a no-op for datetime-backed categoricals
    idx = pd.DatetimeIndex(['2011-01-01 09:00', '2016-01-01 23:45',
                            '2011-01-01 09:00', pd.NaT, pd.NaT])
    df = DataFrame({'a': Categorical(idx)})
    tm.assert_frame_equal(df.fillna(value=pd.NaT), df)

    # same for period-backed categoricals
    idx = pd.PeriodIndex(['2011-01', '2011-01', '2011-01',
                          pd.NaT, pd.NaT], freq='M')
    df = DataFrame({'a': Categorical(idx)})
    tm.assert_frame_equal(df.fillna(value=pd.NaT), df)

    # and for timedelta-backed categoricals
    idx = pd.TimedeltaIndex(['1 days', '2 days', '1 days', pd.NaT, pd.NaT])
    df = DataFrame({'a': Categorical(idx)})
    tm.assert_frame_equal(df.fillna(value=pd.NaT), df)
def remove_nan(df: pd.DataFrame) -> pd.DataFrame: if df.isnull().values.any(): print(f'Data not OK, removing nan values..') print() nan_values = [] indices = list(np.arange(df.shape[1])) for j in range(df.shape[1]): nan_values.append(df[j].isnull().sum().sum()) print(f'Before:') print(f"Indices: {indices}") #index of feature print(f"NaN values: {nan_values}" ) #number of nan values corresponding to each feature print() df = df.fillna(df.median()) #replacing nan with median nan_values = [] indices = list(np.arange(df.shape[1])) for j in range(df.shape[1]): nan_values.append(df[j].isnull().sum().sum()) print(f'After:') print(f"Indices: {indices}") #index of feature print(f"NaN values: {nan_values}" ) #number of nan values corresponding to each feature print() else: print(f"Data has no NaN values") return df
def plot_variation_distn(gene_vars: pd.DataFrame):
    """Plot histograms of per-gene median and mean variation.

    :param gene_vars: DF with genes in rows and genes' variations as
        values across columns.
    """
    for label, series in (('median', gene_vars.median(axis=1)),
                          ('mean', gene_vars.mean(axis=1))):
        plt.hist(series, bins=100, alpha=0.4, label=label)
    plt.legend()
def describe(df: pd.DataFrame) -> pd.DataFrame:
    """Return mean/median/max/min per column, one statistic per row."""
    named_stats = [
        ('mean', df.mean()),
        ('median', df.median()),
        ('max', df.max()),
        ('min', df.min()),
    ]
    columns = [series.rename(name) for name, series in named_stats]
    return pd.concat(columns, axis=1).T
def get_grades_avg(cls, proj, ver, reso):
    """Fetch the images' average-score table for a project/version/resolution.

    (Translated from Chinese: "获取图片平均分信息")

    Builds one row per image from Grade.get_average(img), indexed by the
    row's 'version' value (popped from each row dict — the dicts are
    mutated), and appends a 'total' row.

    NOTE(review): the 'total' row is the column-wise *median*, not a sum
    or mean as the name suggests — confirm this is intended.
    """
    imgs = Image.objects.filter(project=proj, version=ver, resolution=reso)
    rows = [Grade.get_average(img) for img in imgs]
    df = DataFrame(rows, index=[r.pop('version') for r in rows])
    df.loc['total'] = df.median()
    return df
def remove_zero_median(gene_vars: pd.DataFrame) -> pd.DataFrame:
    """Drop rows whose median across columns is 0.

    :param gene_vars: DF with genes in rows and genes' variations as
        values across columns.
    :return: DF without rows with 0 median.
    """
    medians = gene_vars.median(axis=1)
    zero_median_index = medians[medians == 0].index
    return gene_vars.drop(zero_median_index)
def transform_to_numeric_form(df: pd.DataFrame) -> pd.DataFrame:
    """Make every column of *df* numeric.

    Three steps are applied, each emitting a warning when it changed the
    data:

    1. Drop datetime columns (they can leak the train/test time split).
    2. One-hot encode category/object columns (handles <NA> well).
    3. Fill remaining <NA> with column medians.

    Parameters
    ----------
    df: pd.DataFrame
        Initially preprocessed data that should be transformed.

    Returns
    -------
    df: pd.DataFrame
        A DataFrame with all transformation performed.
    """
    STANDARD_WARNING = '\nKeep in mind that the dataset should be initially preprocessed.'

    # step 1: drop timestamp columns
    original_columns = set(df.columns)
    df = df.select_dtypes(exclude='datetime')
    if set(df.columns) != original_columns:
        warn('Columns with dtype=datetime were removed. You may want to use '
             'timedelta features instead.' + STANDARD_WARNING)

    # step 2: one-hot encode categorical features
    n_cols_before = len(df.columns)
    df = get_dummies(df)
    n_cols_after = len(df.columns)
    if n_cols_after > 10 * n_cols_before:
        warn(
            f'Number of columns after dummy encoding changed from '
            f'{n_cols_before} to {n_cols_after}. It can mean that:\n'
            f'(1) you have category type columns with a lof of unique variables (this is fine), or \n'
            f'(2) a numeric column was treated as category-like and every value is one-hot encoded (this is wrong).'
            f'{STANDARD_WARNING}')

    # step 3: fill <NA> with medians
    if df.isna().any().any():
        df.fillna(df.median(), inplace=True)
        warn('Missing values are filled with medians.' + STANDARD_WARNING)

    return df
def get_stats(csv: pd.DataFrame) -> Dict[str, Dict[str, float]]:
    """Gather all the requested statistics per column.

    (Translated from Czech: "Získá všechny hledané statistické hodnoty.")

    :param csv: frame to summarise.
    :return: mapping column -> {mean, median, first, last, passed}.
    """
    # PERF FIX: the original recomputed csv.mean(), csv.median() and
    # quartiles(csv) (twice!) for every column; compute each aggregate
    # once and index into it per column.
    means = csv.mean()
    medians = csv.median()
    quarts = quartiles(csv)
    output = {}
    for key in csv.keys():
        output[key] = {
            'mean': means[key],
            'median': medians[key],
            'first': quarts[0][key],
            'last': quarts[1][key],  # TODO: Liší se
            # number of strictly-positive entries in the column
            'passed': len(csv[(csv[key] > 0)]),
        }
    return output
def descriptive_stats_report(given_data: pd.DataFrame) -> pd.DataFrame:
    """Create a report with descriptive stats of the data.

    Starts from describe().T (without 'count') and inserts the median
    plus the Jarque-Bera statistic and p-value per column.
    """
    report = given_data.describe().T
    report.drop(columns=["count"], inplace=True)
    jb_results = given_data.apply(
        lambda attribute: stats.jarque_bera(attribute))
    insertions = (
        ("median", given_data.median()),
        ("jarque_bera", jb_results.iloc[0, 0:]),
        ("p_value", jb_results.iloc[1, 0:]),
    )
    for position, (name, values) in enumerate(insertions, start=1):
        report.insert(loc=position, column=name, value=values)
    return report
def impute(df: pd.DataFrame): """Returns df with imputed features. Note: lots of things have filled na with "unknown" """ # fill in values for some vars from unknown -> None df.loc[df['AbdomenTender'].isin(['no', 'unknown']), 'AbdTenderDegree'] = 'None' # pandas impute missing values with median df = df.fillna(df.median()) df.GCSScore = df.GCSScore.fillna(df.GCSScore.median()) return df
def test_quantile(self, datetime_frame):
    # Exercises DataFrame.quantile over both axes, empty frames and
    # mixed-dtype frames (numeric_only paths).
    from numpy import percentile

    df = datetime_frame
    # column-wise quantile matches numpy percentile
    q = df.quantile(0.1, axis=0, numeric_only=True)
    assert q["A"] == percentile(df["A"], 10)
    tm.assert_index_equal(q.index, df.columns)

    # row-wise quantile
    q = df.quantile(0.9, axis=1, numeric_only=True)
    assert q["2000-01-17"] == percentile(df.loc["2000-01-17"], 90)
    tm.assert_index_equal(q.index, df.index)

    # test degenerate case
    q = DataFrame({
        "x": [],
        "y": []
    }).quantile(0.1, axis=0, numeric_only=True)
    assert np.isnan(q["x"]) and np.isnan(q["y"])

    # non-numeric exclusion: quantile(0.5) equals the (numeric) median
    df = DataFrame({"col1": ["A", "A", "B", "B"], "col2": [1, 2, 3, 4]})
    rs = df.quantile(0.5, numeric_only=True)
    with tm.assert_produces_warning(FutureWarning, match="Select only valid"):
        xp = df.median().rename(0.5)
    tm.assert_series_equal(rs, xp)

    # axis
    df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3])
    result = df.quantile(0.5, axis=1)
    expected = Series([1.5, 2.5, 3.5], index=[1, 2, 3], name=0.5)
    tm.assert_series_equal(result, expected)

    # list of quantiles -> DataFrame indexed by quantile
    result = df.quantile([0.5, 0.75], axis=1)
    expected = DataFrame({
        1: [1.5, 1.75],
        2: [2.5, 2.75],
        3: [3.5, 3.75]
    }, index=[0.5, 0.75])
    tm.assert_frame_equal(result, expected, check_index_type=True)

    # We may want to break API in the future to change this
    # so that we exclude non-numeric along the same axis
    # See GH #7312
    df = DataFrame([[1, 2, 3], ["a", "b", 4]])
    result = df.quantile(0.5, axis=1, numeric_only=True)
    expected = Series([3.0, 4.0], index=[0, 1], name=0.5)
    tm.assert_series_equal(result, expected)
def desc(df: pd.DataFrame):
    """Produces a summary of the input DataFrame.

    Extends describe() with 'missing', 'median' and 'missing %' rows;
    statistics are the rows of the returned frame.

    Arguments:
        df {pd.DataFrame} -- frame to summarise

    Returns:
        pd.DataFrame -- DataFrame of summary statistics
    """
    summary = df.describe(percentiles=None).T
    n_rows = len(df.index)
    summary['missing'] = n_rows - summary['count']
    summary['median'] = df.median()
    summary['missing %'] = summary.missing / n_rows * 100
    return summary.T
def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): """ Checks that input is a dataframe. Parameters ---------- X : Pandas DataFrame y : Pandas Series, np.array. Default = None Parameter is necessary for compatibility with sklearn.pipeline.Pipeline. Raises ------ TypeError If the input is not a Pandas DataFrame ValueError If there are no numerical variables in the df or the df is empty Returns ------- X : Pandas DataFrame The same dataframe entered as parameter """ # check input dataframe X = _to_dataframe(X) if self.method == 'feature': new_index, indexer = X.columns.sort_values(return_indexer=True) pos = new_index.get_loc(self.feature, method='pad') pos = indexer[pos] self.scaling_factors_ = X.iloc[:, pos] elif self.method in ['total', 'sum']: self.scaling_factors_ = X.sum(axis=1) elif self.method == 'PQN': # "Build" the reference sample based and compute quotients if self.ref_sample == 'mean': # Mean spectre of all samples ref_sample2 = X / X.mean(axis=0) elif self.ref_sample == 'median': # Median spectre of all samples ref_sample2 = X / X.median(axis=0) elif self.ref_sample in X.index: # Sample name to use as a reference ref_sample2 = X / X.loc[self.ref_sample, :] else: # Actual sample given (ref_sample is array like) ref_sample2 = X / self.ref_sample # Normalization Factors self.scaling_factors_ = ref_sample2.median(axis=1) self.input_shape_ = X.shape return self
def get_totals(df: pd.DataFrame):
    """
    Build a DataFrame of selected summary statistics (one statistic per
    column, one input column per row).
    """
    summary = {
        'min': df.min(),
        'per15': df.quantile(0.15),
        'qr1': df.quantile(0.25),
        'median': df.median(),
        'qr3': df.quantile(0.75),
        'per85': df.quantile(0.85),
        'max': df.max(),
        'count': df.count(),
        'mean': df.mean(),
    }
    summary['iqr'] = summary['qr3'] - summary['qr1']
    return pd.DataFrame(summary)
def test_quantile(self):
    # Exercises DataFrame.quantile over both axes, empty frames and
    # mixed-dtype frames.
    from numpy import percentile

    # column-wise quantile matches numpy percentile
    q = self.tsframe.quantile(0.1, axis=0)
    assert q['A'] == percentile(self.tsframe['A'], 10)
    tm.assert_index_equal(q.index, self.tsframe.columns)

    # row-wise quantile
    q = self.tsframe.quantile(0.9, axis=1)
    assert (q['2000-01-17'] ==
            percentile(self.tsframe.loc['2000-01-17'], 90))
    tm.assert_index_equal(q.index, self.tsframe.index)

    # test degenerate case
    q = DataFrame({'x': [], 'y': []}).quantile(0.1, axis=0)
    assert (np.isnan(q['x']) and np.isnan(q['y']))

    # non-numeric exclusion: quantile(0.5) equals the (numeric) median
    df = DataFrame({'col1': ['A', 'A', 'B', 'B'], 'col2': [1, 2, 3, 4]})
    rs = df.quantile(0.5)
    xp = df.median().rename(0.5)
    assert_series_equal(rs, xp)

    # axis
    df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3])
    result = df.quantile(.5, axis=1)
    expected = Series([1.5, 2.5, 3.5], index=[1, 2, 3], name=0.5)
    assert_series_equal(result, expected)

    # list of quantiles -> DataFrame indexed by quantile
    result = df.quantile([.5, .75], axis=1)
    expected = DataFrame({
        1: [1.5, 1.75],
        2: [2.5, 2.75],
        3: [3.5, 3.75]
    }, index=[0.5, 0.75])
    assert_frame_equal(result, expected, check_index_type=True)

    # We may want to break API in the future to change this
    # so that we exclude non-numeric along the same axis
    # See GH #7312
    df = DataFrame([[1, 2, 3], ['a', 'b', 4]])
    result = df.quantile(.5, axis=1)
    expected = Series([3., 4.], index=[0, 1], name=0.5)
    assert_series_equal(result, expected)
def transform_df(df: pd.DataFrame) -> (pd.DataFrame):
    '''Score survey responses and clean the resulting frame.

    Maps Likert responses to -3..3 and Yes/No to 1/0, drops columns with
    more than 10% missing answers, fills the remaining NaNs with column
    medians, and strips the boilerplate question prefix from column
    names. Mutates *df* in place for the replacements and returns the
    cleaned frame.
    '''
    agreement_levels = ("Strongly Disagree", "Disagree", "Slightly Disagree",
                        "Unsure/No Opinion", "Slightly Agree", "Agree",
                        "Strongly Agree")
    df.replace(agreement_levels, (-3, -2, -1, 0, 1, 2, 3), inplace=True)
    df.replace(("Yes", "No"), (1, 0), inplace=True)
    # keep only columns answered by at least 90% of respondents
    df = df.dropna(thresh=len(df) * .9, axis=1)
    df = df.fillna(df.median())
    question_prefix = ('For each of the statements below, please indicate '
                       'how strongly you agree or disagree.')
    df.columns = df.columns.str.replace(question_prefix, "", regex=True)
    return df
def get_mean_row(self, arr_res, col):
    """Return the first row whose value in column *col* equals that
    column's median; [] when no row matches.

    Note: despite the name, this uses the *median* (e.g. with an even
    row count the interpolated median may match no row, yielding []).

    :param arr_res: result rows (indexable by *col*).
    :param col: column position to take the median over.
    """
    column_values = [row[col] for row in arr_res]
    frame = DataFrame(column_values)
    frame.columns = ['A']
    median_value = frame.median().iloc[0]  # scalar median of column A
    # first row matching the median (there may be several)
    for row in arr_res:
        if row[col] == median_value:
            return row
    return []
def test_quantile(self, datetime_frame):
    # Exercises DataFrame.quantile over both axes, empty frames and
    # mixed-dtype frames.
    from numpy import percentile

    df = datetime_frame
    # column-wise quantile matches numpy percentile
    q = df.quantile(0.1, axis=0)
    assert q['A'] == percentile(df['A'], 10)
    tm.assert_index_equal(q.index, df.columns)

    # row-wise quantile
    q = df.quantile(0.9, axis=1)
    assert (q['2000-01-17'] ==
            percentile(df.loc['2000-01-17'], 90))
    tm.assert_index_equal(q.index, df.index)

    # test degenerate case
    q = DataFrame({'x': [], 'y': []}).quantile(0.1, axis=0)
    assert(np.isnan(q['x']) and np.isnan(q['y']))

    # non-numeric exclusion: quantile(0.5) equals the (numeric) median
    df = DataFrame({'col1': ['A', 'A', 'B', 'B'], 'col2': [1, 2, 3, 4]})
    rs = df.quantile(0.5)
    xp = df.median().rename(0.5)
    assert_series_equal(rs, xp)

    # axis
    df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3])
    result = df.quantile(.5, axis=1)
    expected = Series([1.5, 2.5, 3.5], index=[1, 2, 3], name=0.5)
    assert_series_equal(result, expected)

    # list of quantiles -> DataFrame indexed by quantile
    result = df.quantile([.5, .75], axis=1)
    expected = DataFrame({1: [1.5, 1.75], 2: [2.5, 2.75],
                          3: [3.5, 3.75]}, index=[0.5, 0.75])
    assert_frame_equal(result, expected, check_index_type=True)

    # We may want to break API in the future to change this
    # so that we exclude non-numeric along the same axis
    # See GH #7312
    df = DataFrame([[1, 2, 3], ['a', 'b', 4]])
    result = df.quantile(.5, axis=1)
    expected = Series([3., 4.], index=[0, 1], name=0.5)
    assert_series_equal(result, expected)
def cross_validate_trades(trades, N = 20, subset_fraction = 0.7):
    """Bootstrap the trade summary over random ticker subsets.

    Runs summary_report on N random subsets of the ticker universe
    (each covering subset_fraction of the tickers, sampled without
    replacement) and reports the spread of the resulting statistics.

    :return: (result, summary) — result has the Base report plus
        Mean/Std/Median/Max/Min across runs; summary holds one column
        per run.
    """
    tickers = trades.tickers
    subset_size = round(len(tickers) * subset_fraction)
    per_run = DataFrame(dtype = float)
    for run in range(N):
        chosen = list(random.choice(tickers, subset_size, replace = False))
        subset = trades.find(lambda T: T.ticker in chosen)
        per_run[run] = summary_report(subset)
    result = DataFrame(dtype = float)
    result['Base'] = summary_report(trades)
    spread_stats = (('Mean', per_run.mean(axis = 1)),
                    ('Std', per_run.std(axis = 1)),
                    ('Median', per_run.median(axis = 1)),
                    ('Max', per_run.max(axis = 1)),
                    ('Min', per_run.min(axis = 1)))
    for label, series in spread_stats:
        result[label] = series
    return (result, per_run)
def cross_validate_trades(trades, N=20, subset_fraction=0.7):
    """Bootstrap the trade summary over random ticker subsets.

    NOTE: duplicate of the other cross_validate_trades definition in
    this file (identical logic, different formatting).

    :param trades: trade collection exposing .tickers and .find(pred).
    :param N: number of random subsets to evaluate.
    :param subset_fraction: fraction of tickers per subset.
    :return: (result, summary) — result has the Base report plus
        Mean/Std/Median/Max/Min across runs; summary holds one column
        per run.
    """
    tickers = trades.tickers
    sample_size = round(len(tickers) * subset_fraction)
    summary = DataFrame(dtype=float)
    for n in range(N):
        # sample without replacement (numpy random.choice)
        sample_tickers = list(
            random.choice(tickers, sample_size, replace=False))
        trade_subset = trades.find(lambda T: T.ticker in sample_tickers)
        summary[n] = summary_report(trade_subset)
    result = DataFrame(dtype=float)
    result['Base'] = summary_report(trades)
    result['Mean'] = summary.mean(axis=1)
    result['Std'] = summary.std(axis=1)
    result['Median'] = summary.median(axis=1)
    result['Max'] = summary.max(axis=1)
    result['Min'] = summary.min(axis=1)
    return (result, summary)
# after prepaired data, time to plot it: for new_counter in range(file_counter+1): #print new_counter Qbers = final_data[(final_data["Dataset"]==new_counter) & (final_data["Qber"] > 0) ] x1 = Qbers.index.tolist() y1 = Qbers["Qber"].tolist() x1_average = DataFrame.mean(Qbers)["Qber"] x1_std_dev = DataFrame.std(Qbers)["Qber"] #prepairing proper time: x1[:] = [x - quelle_initialTimestamps[new_counter] for x in x1] Raws = final_data[(final_data["Dataset"]==new_counter) & (final_data["Raw key"] > 0) ] x2_average = DataFrame.mean(Raws)["Raw key"] x2_median = DataFrame.median(Raws)["Raw key"] x2_max = DataFrame.max(Raws)["Raw key"] Raws = Raws[Raws["Raw key"]<(x2_max - (x2_max/100)*20)] x2 = Raws.index.tolist() y2 = Raws["Raw key"].tolist() print x2_average #x2_std_dev = 3 #once again correcting counter: x2[:] = [x - quelle_initialTimestamps[new_counter] for x in x2] #print x1[0], x2[0], quelle_initialTimestamps[new_counter] # Two subplots, the axes array is 1-d http://matplotlib.org/examples/pylab_examples/subplots_demo.html f, axarr = plt.subplots(2, sharex=True) axarr[0].grid()
import numpy as np
import scipy as sc
from scipy import stats
import matplotlib.pyplot as plt
import pandas as pd
from pandas import DataFrame, Series

# Load Freddie Mac (FMAC) rate data from a hard-coded local path and
# fit/plot a simple linear regression of Value against Date.
db = pd.read_csv("/users/rosiezou/Desktop/mortgage-stanley/FMAC-5US.csv")
table = DataFrame(db, columns = ['Date', 'Value'])
plt.plot(db['Date'], db['Value'], 'bo')
# NOTE(review): linregress requires numeric x; this assumes the CSV's
# 'Date' column parses as numeric — confirm against the data file.
regressionline = sc.stats.linregress(db['Date'], db['Value'])
m = regressionline[0]  # slope
b = regressionline[1]  # intercept
x = np.linspace(0, 18, 100)
plt.plot(x, m*x + b)
plt.show()
print(table.median(0))
print(table.mode(0))