def total_energy(positions: pd.DataFrame, velocities: pd.DataFrame) -> int:
    """Return the total energy of the system.

    For every row, the absolute position components are summed into a
    potential term and the absolute velocity components into a kinetic
    term.  The result is the sum over all rows of ``potential * kinetic``.
    """
    potential = positions.abs().sum(axis=1)
    kinetic = velocities.abs().sum(axis=1)
    per_row_energy = potential * kinetic
    return sum(per_row_energy)
def linear_envelope(
    signal_df: pandas.DataFrame,
    critical_freqs: Union[float, Sequence[float]],
    sampling_frequency: int,
    order: int,
    filter_type: str = "butter",
    zero_lag: bool = True,
    cheby_param: Optional[float] = None,
    zero_center_: bool = True,
    inplace: bool = False,
) -> pandas.DataFrame:
    """Compute the linear envelope of every signal in a frame.

    The envelope is obtained in three stages:

    1. optional zero-centering of each signal,
    2. full-wave rectification (absolute value of every sample),
    3. low-pass filtering through :py:func:`digital_filter`.

    Args:
        signal_df: a :py:class:`~pandas.DataFrame` holding one discrete-time
            signal per column.
        critical_freqs: forwarded to :py:func:`digital_filter`.
        sampling_frequency: forwarded to :py:func:`digital_filter`.
        order: forwarded to :py:func:`digital_filter`.
        filter_type: forwarded to :py:func:`digital_filter`.
        zero_lag: forwarded to :py:func:`digital_filter`.
        cheby_param: forwarded to :py:func:`digital_filter`.
        zero_center_: when `True`, the data is zero-centered before it is
            rectified.
        inplace: when `True`, the original
            :py:class:`~pandas.DataFrame` is modified directly; when `False`
            every transformation works on a copy.

    Returns:
        Whatever :py:func:`digital_filter` returns for the rectified data
        with ``band_type="lowpass"``.
    """
    # Stage 1: optional zero-centering.
    centered = zero_center(signal_df, inplace=inplace) if zero_center_ else signal_df

    # Stage 2: rectification, either writing into the existing frame or
    # producing a fresh one.
    if not inplace:
        rectified = centered.abs()
    else:
        centered[:] = centered.abs()
        rectified = centered

    # Stage 3: low-pass filtering.
    filter_options = dict(
        critical_freqs=critical_freqs,
        sampling_frequency=sampling_frequency,
        order=order,
        filter_type=filter_type,
        band_type="lowpass",
        zero_lag=zero_lag,
        cheby_param=cheby_param,
        inplace=inplace,
    )
    return digital_filter(signal_df=rectified, **filter_options)
def reduce_correlation_matrix(correlations: pd.DataFrame, reduction_size: int):
    """Greedily select the least mutually correlated indicators.

    The first indicator is the one whose row of absolute correlations has
    the smallest sum.  Every further indicator is the remaining one whose
    summed absolute correlation with the already selected indicators is
    smallest.

    Parameters
    ----------
    correlations : pd.DataFrame
        Square correlation matrix whose index and columns carry the same
        labels in the same order.  It is not modified.
    reduction_size : int
        Number of indicators to keep.  At least one indicator is always
        selected, and the value is capped at the number of available
        indicators.

    Returns
    -------
    pd.DataFrame
        The absolute correlations restricted to the selected indicators,
        in selection order.

    Raises
    ------
    ValueError
        If ``correlations`` is empty (raised by ``np.argmin``).
    """
    abs_corr = correlations.abs()
    remaining = abs_corr.copy()

    # Asking for more indicators than exist used to crash with an opaque
    # "attempt to get argmin of an empty sequence"; cap the target instead.
    target = min(reduction_size, len(abs_corr.columns))

    # Seed with the indicator that is least correlated with everything.
    seed_pos = int(np.argmin(np.sum(remaining.to_numpy(), axis=1)))
    best_indicators = [remaining.columns[seed_pos]]
    remaining = remaining.drop(index=remaining.index[seed_pos],
                               columns=remaining.columns[seed_pos])

    while len(best_indicators) < target and not remaining.empty:
        # Score each candidate by its total correlation with the picks so far.
        scores = [abs_corr.loc[label, best_indicators].sum()
                  for label in remaining.index]
        pos = int(np.argmin(scores))
        candidate = remaining.columns[pos]
        if candidate not in best_indicators:
            best_indicators.append(candidate)
        remaining = remaining.drop(index=remaining.index[pos],
                                   columns=remaining.columns[pos])

    return abs_corr.loc[best_indicators, best_indicators]
def plot_feature_importance(coefficients: DataFrame, limit: int = None) -> None:
    """Draw a horizontal bar chart of the ``'mean'`` column of ``coefficients``.

    Rows are ordered by the absolute value of ``'mean'`` (ascending) and only
    the last ``limit`` rows, i.e. the largest magnitudes, are plotted.  Each
    bar is annotated with its value formatted to three decimals.

    Args:
        coefficients: frame with a ``'mean'`` column; its index supplies the
            bar labels.
        limit: number of rows to plot; a falsy value plots every row.

    Side effects:
        Creates a new pyplot figure and sets
        ``matplotlib.rcParams['figure.dpi']`` globally.
    """
    # NOTE(review): the display options only affect how pandas prints frames;
    # nothing is printed in this block — confirm whether the context manager
    # is still needed.
    with pd.option_context('display.max_rows', None, 'display.max_columns', None):
        if not limit:
            limit = len(coefficients)
        # Reorder rows by |mean| so the largest magnitudes end up last.
        coefficients = coefficients.reindex(coefficients.abs().sort_values(
            ascending=True, by='mean').index)
        coefficients = coefficients[-limit:]
        # Figure height scales linearly with the number of bars.
        plt.figure(figsize=(4, 7 * (limit / 25)))
        plt.tick_params(
            axis='y',  # changes apply to the y-axis
            which='both',  # both major and minor ticks are affected
            left=False,  # ticks along the left edge are off
        )
        # plt.tick_params(axis='x', labelcolor='#414141', color='#b9b8b9')
        rects = plt.barh(
            coefficients.index,
            coefficients['mean'],
            color="#f89f76",
        )
        # Widest bar; used to line up all value labels in one column.
        # NOTE(review): `map` is called with a single argument, so it is
        # presumably a curried map (e.g. toolz.curried) — TODO confirm.
        max_width = pipe(
            rects,
            map(lambda rect: rect.get_width()),
            max,
        )
        for index, rect in enumerate(rects):
            number = coefficients.iloc[index]['mean']
            plt.text(
                # Negative numbers are nudged left to make room for the sign.
                max_width * 1.1 + (-0.02 if number < 0 else 0),
                rect.get_y() + 0.2,
                f'{number:.3f}',
                # color='#060606',
                ha='left',
            )
        # plt.gcf().patch.set_facecolor('#fdeadd')
        plt.margins(y=0.01)
        # plt.gca().patch.set_facecolor('white')
        # Hide every spine except the left one.
        plt.gca().spines['top'].set_visible(False)
        plt.gca().spines['bottom'].set_visible(False)
        plt.gca().spines['right'].set_visible(False)
        # NOTE(review): the right spine was hidden just above, so styling it
        # has no visible effect.
        plt.gca().spines['right'].set_linewidth(1)
        plt.gca().spines['right'].set_color('#b9b8b9')
        plt.gca().spines['left'].set_linewidth(1)
        plt.gca().spines['left'].set_color('#b9b8b9')
        # Draw grid lines underneath the bars.
        plt.gca().set_axisbelow(True)
        import matplotlib as mpl
        # NOTE(review): this changes the global default after the figure was
        # already created — confirm it is meant for subsequent figures.
        mpl.rcParams['figure.dpi'] = 100
        plt.grid(axis='x')
        plt.gca().xaxis.grid(linestyle='--', which='major', linewidth=1)
        # Make the second vertical grid line solid; this raises IndexError if
        # fewer than two x grid lines exist.
        plt.gca().get_xgridlines()[1].set_linestyle('-')
def test_where_complex(self): # GH 6345 expected = DataFrame([[1 + 1j, 2], [np.nan, 4 + 1j]], columns=["a", "b"]) df = DataFrame([[1 + 1j, 2], [5 + 1j, 4 + 1j]], columns=["a", "b"]) df[df.abs() >= 5] = np.nan tm.assert_frame_equal(df, expected)
def process(self, data, num_phonemes=None, wavelets=False):
    """Sum CWT energies from EMG data into one window per phoneme.

    The rows of ``data`` are split evenly into ``num_phonemes`` consecutive
    windows (any remainder rows at the end are ignored) and, for every
    window, the squared absolute amplitudes of each column are summed.

    Args:
        data: a pandas DataFrame containing subvocalization EMG data
            processed by CWT, with any number of columns.
        num_phonemes: number of windows to split ``data`` into.  When given,
            ``self.samples_per_window`` is updated accordingly.
        wavelets: the windowed energies are only computed when this is
            truthy.

    Returns:
        A DataFrame with one row per window and one column per input column
        holding the summed energies, or ``None`` when ``wavelets`` is falsy.

    Raises:
        ValueError: if ``wavelets`` is truthy but ``num_phonemes`` is not
            given; previously this surfaced as a ``NameError``.
    """
    if num_phonemes:
        self.samples_per_window = int(data.shape[0] // num_phonemes)

    if not wavelets:
        return None
    if not num_phonemes:
        raise ValueError("num_phonemes is required when wavelets=True")

    # Collect one energy row per window and build the frame once at the end;
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame
    # on every call.
    energy_rows = []
    for window in range(num_phonemes):
        first_index = window * self.samples_per_window
        last_index = first_index + self.samples_per_window
        data_window = data.iloc[first_index:last_index]
        # Sum up the squares of amplitudes in each column.
        energy_rows.append(data_window.abs().pow(2).sum(axis=0))

    if not energy_rows:
        return DataFrame()
    return DataFrame(energy_rows).reset_index(drop=True)
def create_df_with_most_and_least_correlated_features(
        corr_df: pd.DataFrame,
        corr_column: str,
        max_variables: int,
        return_most_correlated: bool = False,
        return_least_correlated: bool = False) -> pd.DataFrame:
    """
    Build a frame with the strongest and/or weakest correlated features.

    Rows are ranked by the absolute value of ``corr_column``; the original
    (signed) values are returned.

    :param corr_df: table with correlations
    :param corr_column: name of the column holding the correlations
    :param max_variables: maximum number of rows in the combined output;
        half of them come from each end of the ranking
    :param return_most_correlated: return only the most correlated half
    :param return_least_correlated: return only the least correlated half
        (ignored when ``return_most_correlated`` is set)

    Return: data frame with most or/and least correlated features
    """
    half = int(max_variables / 2)

    # Rank by magnitude, then look the rows up in the untouched frame so the
    # sign of each correlation is kept.
    ranking = corr_df.abs().sort_values(by=[corr_column], ascending=False).index
    ranked = corr_df.loc[ranking]
    strongest = ranked.head(half)
    weakest = ranked.tail(half)

    if return_most_correlated:
        return strongest
    if return_least_correlated:
        return weakest
    return pd.concat([strongest, weakest])
def true_range(high, low, close, drift=None, offset=None, **kwargs):
    """Indicator: True Range

    For every row, takes the largest absolute value among the high-low
    range, ``high - previous close`` and ``previous close - low``, where the
    previous close is ``close`` shifted by ``drift``.

    Recognised keyword arguments:
        fillna: value passed to ``Series.fillna``.
        fill_method: method name passed to ``Series.fillna(method=...)``.
    """
    # Validate arguments
    high = verify_series(high)
    low = verify_series(low)
    close = verify_series(close)
    high_low_range = non_zero_range(high, low)
    drift = get_drift(drift)
    offset = get_offset(offset)

    # Calculate Result
    prev_close = close.shift(drift)
    candidates = DataFrame([
        high_low_range,
        high - prev_close,
        prev_close - low,
    ]).T
    result = candidates.abs().max(axis=1)

    # Offset
    if offset != 0:
        result = result.shift(offset)

    # Handle fills
    if 'fillna' in kwargs:
        result.fillna(kwargs['fillna'], inplace=True)
    if 'fill_method' in kwargs:
        result.fillna(method=kwargs['fill_method'], inplace=True)

    # Name and Categorize it
    result.name = f"TRUERANGE_{drift}"
    result.category = 'volatility'

    return result
def compute_top_correlations_features(corr: pd.DataFrame,
                                      max_features: int) -> list:
    """
    Collect the features that take part in the strongest correlations.

    Feature pairs are visited from the highest absolute correlation
    downwards (self-pairs are skipped) and both members of each pair are
    collected until ``max_features`` features have been gathered.

    Parameters
    ----------
    corr: pd.DataFrame
        Correlation matrix.
    max_features : int
        Maximum number of features to return.

    Returns
    -------
    list
        The selected feature labels, in no particular order.
    """
    ranked_pairs = corr.abs().unstack().sort_values(kind="quicksort")[::-1]

    selected = set()
    for pair in ranked_pairs.index:
        if len(selected) >= max_features:
            break
        first, second = pair[0], pair[1]
        if first == second:
            continue
        selected.add(first)
        # Adding the second member could overshoot the limit by one, so it
        # is only added while there is still room.
        if len(selected) != max_features:
            selected.add(second)

    return list(selected)
def classify_open_closed_loci_with_quant(
        df: pd.DataFrame,
        quant: float = 0.1) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Split rows by whether their absolute intensity exceeds a quantile.

    Every value of ``df`` is replaced by its absolute value; the ``quant``
    quantile of the ``intensity`` column then acts as the threshold.

    Returns:
        ``(open_areas, closed_areas)`` — the rows with intensity at or below
        the threshold and the rows strictly above it, both taken from the
        absolute-valued frame.
    """
    magnitudes = df.abs()
    threshold = magnitudes["intensity"].quantile(quant)
    at_or_below = magnitudes["intensity"] <= threshold
    above = magnitudes["intensity"] > threshold
    return magnitudes[at_or_below], magnitudes[above]
def up_down_from_characteristic_direction(expr: pd.DataFrame, top_n=600):
    '''
    Split the strongest characteristic-direction genes into `up` and `down`.

    Takes the output of `characteristic_direction`, keeps the `top_n` rows
    with the highest absolute 'CD-coefficient' and returns a lightweight
    `UpDownGeneset` class whose `up` attribute lists the labels with positive
    values and whose `down` attribute lists the labels with negative values.
    '''
    strongest_labels = expr.abs().sort_values(
        'CD-coefficient', ascending=False)[:top_n].index
    strongest = expr.loc[strongest_labels]

    # Masking turns non-matching cells into NaN; dropna then removes them.
    positive = strongest[strongest > 0].dropna()
    negative = strongest[strongest < 0].dropna()

    members = dict(
        up=list(positive.index),
        down=list(negative.index),
    )
    return type('UpDownGeneset', tuple(), members)
def plot_feature_permutation_importances(
    X_train_coefs: DataFrame,
    X_test: DataFrame,
    y_test: Series,
    est: BaseEstimator,
    sort_by: str = "coefficient",
    figsize: Tuple = (12, 8),
    ptitle: str = "plot title",
    savefig: Path = Path().cwd() / "reports" / "figures" / "manual_fi.png",
    save_pref: bool = False,
) -> DataFrame:
    """Plot model coefficients next to permutation importances.

    The left panel is a horizontal bar chart of ``X_train_coefs`` ordered by
    the absolute value of ``sort_by``; the right panel is a box plot of the
    permutation importances of ``est`` on the test data (10 repeats, fixed
    random state).

    When ``save_pref`` is true the figure is written below ``savefig`` under
    a timestamped file name, unless that file already exists.

    Returns the re-ordered coefficient frame.
    """
    fig, axes = plt.subplots(figsize=figsize, nrows=1, ncols=2)
    plt.subplots_adjust(wspace=0.6)
    coef_ax = axes[0]
    perm_ax = axes[1]

    # Left panel: coefficients sorted by absolute value.
    magnitude_order = X_train_coefs.abs().sort_values(
        by=sort_by, ascending=True).index
    X_train_coefs = X_train_coefs.reindex(magnitude_order)
    X_train_coefs.plot(kind="barh", ax=coef_ax, legend=False)
    coef_ax.set_ylabel(None)
    coef_ax.set_title(ptitle, fontweight="bold")
    # Show feature names with spaces instead of underscores.
    readable_labels = []
    for tick_label in coef_ax.get_yticklabels():
        readable_labels.append(tick_label.get_text().replace("_", " "))
    coef_ax.set_yticklabels(readable_labels)

    # Right panel: permutation importances, least important first.
    importances = permutation_importance(
        est, X=X_test, y=y_test, n_repeats=10, random_state=42, n_jobs=-1)
    order = importances.importances_mean.argsort()
    perm_ax.boxplot(
        importances.importances[order].T,
        vert=False,
        labels=X_test.columns[order],
        patch_artist=True,
    )
    perm_ax.set_title("Permutation Importances (test data)",
                      fontweight="bold",
                      loc="left")

    # Optionally persist the figure under a timestamped name.
    timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    target = savefig / f"feature_importances__{timestamp}.png"
    if save_pref and not target.is_file():
        fig.savefig(target, bbox_inches="tight", dpi=300)
    return X_train_coefs
def explain_local(self, x_explain: pd.DataFrame, n_cols: Optional[int] = None):
    """Summarise each row of ``x_explain`` by its largest absolute entries.

    For every row the absolute values are ranked in descending order and
    passed through ``self._regularize``.  The first ``n_cols`` entries are
    kept (all of them when ``n_cols`` is ``None``) and whatever is left of
    the row total is reported under the key ``'rest'``.

    Returns a list with one such dictionary per row.
    """
    magnitude_of = operator.itemgetter(1)
    explanations = []
    for record in x_explain.abs().to_dict(orient='records'):
        ranked = sorted(record.items(), key=magnitude_of, reverse=True)
        ranked = self._regularize(ranked)
        total = sum(magnitude_of(entry) for entry in ranked)

        summary = dict(ranked[:n_cols])
        summary['rest'] = total - sum(summary.values())
        explanations.append(summary)
    return explanations
def comprehensive_analysis(records, **conf):
    """Print the five codes with the largest summed absolute change rate.

    Args:
        records: iterable of ``(code, values)`` pairs.
        **conf: must contain ``'col'``, ``'slide'`` and ``'days'``, which are
            forwarded to ``change_rate`` for every record.
    """
    rates = DataFrame()
    for code, values in records:
        rates[code] = change_rate(
            values, conf['col'], conf['slide'], conf['days'])

    # print('==== plot rates ====')
    # plot_items(rates, 2, 400)

    # print('==== highest correlations ====')
    # smoothed = rates.applymap(lambda x: round(x / 5.0))
    # sims = smoothed.corr()
    # sims = sims[sims != 1]
    # print(sims.head().stack().nlargest(5))

    # print() with a single parenthesised argument behaves the same under
    # Python 2 and Python 3, whereas the print statement is a SyntaxError on
    # Python 3.
    print('==== highest liquidities(variances) ====')
    var_highest = rates.abs().sum().nlargest(5)
    print(var_highest)
def process(self,
            data: pd.DataFrame,
            factWeight: pd.DataFrame,
            method: str = 'Equal',
            rp: int = 60,
            hp: int = 5,
            **kwargs) -> pd.DataFrame:
    """Combine factors using the requested weighting method.

    Some weights are computed from future data, so the weights have to be
    shifted in order to line up with the matching factor values.

    Parameters
    ----------
    data : factor set
    factWeight : factor weights
    method : factor combination method; ``"RetWeight"`` and ``"OPT"`` are
        supported, ``None`` returns ``data`` unchanged
    rp : rolling period
    hp : holding period
    kwargs : forwarded to the selected combination method

    Returns
    -------
    The result of the selected combination method, or ``data`` itself when
    ``method`` is ``None``.

    Raises
    ------
    KeyError
        If ``method`` is neither ``None`` nor a supported method name.
        NOTE(review): this includes the default ``'Equal'`` — confirm whether
        an equal-weight method is supposed to be registered.
    """
    self.rp, self.hp = rp, hp

    # Nothing to combine: skip the rolling computations entirely.
    if method is None:
        return data

    method_dict = {"RetWeight": self.retWeighted,
                   "OPT": self.MAX_IC_IR}
    if method not in method_dict:
        raise KeyError(
            f"Unsupported method {method!r}; expected one of "
            f"{sorted(method_dict)} or None")

    factDir = np.sign(factWeight.rolling(rp, min_periods=1).mean())
    # Returns are the label (the predicted value), so weighting by historical
    # return data needs one extra period of shift.
    factDir = factDir.shift(hp + 1)

    # Flip every factor to a positive direction; the factor-return style
    # metrics are made sign-free to match.
    factNew = data.mul(factDir, level=0).dropna()
    factWeightNew = factWeight.abs()

    return method_dict[method](fact=factNew, factWeight=factWeightNew, **kwargs)
def dataFrameMathTest():
    """Print a tour of DataFrame math methods on columns 1-6 of ``mlg.csv``.

    Note: the methods that return a Series default to working on columns.
    """
    # Load a DataFrame from a CSV file and keep columns 1..6.
    org_df = pd.read_csv('mlg.csv')
    df = org_df.iloc[:, 1:7]

    # Each entry is evaluated and printed in turn, in this order.
    demonstrations = (
        lambda: df.abs(),       # absolute values
        # lambda: df.add(o),    # add df, Series or value
        lambda: df.count(),     # non NA/null values
        lambda: df.cummax(),    # (cols default axis)
        lambda: df.cummin(),    # (cols default axis)
        lambda: df.cumsum(),    # (cols default axis)
        lambda: df.diff(),      # 1st diff (col def axis)
        lambda: df.div(12),     # div by df, Series, value
        # lambda: df.dot(13),   # matrix dot product
        lambda: df.max(),       # max of axis (col def)
        lambda: df.mean(),      # mean (col default axis)
        lambda: df.median(),    # median (col default)
        lambda: df.min(),       # min of axis (col def)
        lambda: df.mul(2),      # mul by df Series val
        lambda: df.sum(),       # sum axis (cols default)
        lambda: df.where(df > 0.5, other=np.nan),
    )
    for compute in demonstrations:
        print(compute())
def remove_duplicate_columns(df: pd.DataFrame, tol: float = 1e-8) -> List[str]:
    """Find the columns that remain after dropping duplicates.

    Columns are compared after dividing each by the sum of its absolute
    values, so columns that differ only by a positive scale factor count as
    duplicates.  Of each group of duplicates the first column is kept.

    Parameters
    ----------
    df : DataFrame
        Input dataframe.
    tol : float, optional
        Tolerance to assess duplicate columns.
        Default is 1e-8.

    Returns
    -------
    columns: list of str
        Columns to keep after removing duplicates.
    """
    column_mass = df.abs().sum(0)
    # Scaling by 1 / tol lets round() quantise the values to multiples of tol.
    scaled = (df / column_mass) * (1 / tol)
    quantised = scaled.round().values

    # keep = scaled.round().T.drop_duplicates(keep="last").T.columns  # Slow!!
    _, first_positions = np.unique(quantised, axis=1, return_index=True)
    return df.columns[sorted(first_positions)]
import numpy as np
from pandas import DataFrame

# A 5 x 3 frame of standard-normal samples with named columns.
npdata = np.random.randn(5, 3)
columnNames = ['x1', 'x2', 'x3']
data = DataFrame(npdata, columns=columnNames)


def f(x):
    """Return the spread (max - min) of ``x``."""
    return x.max() - x.min()


absolute_values = data.abs()
print(absolute_values)

print('\nMaximum value per column:')
column_maxima = data.max()
print(column_maxima)

print('\nMinimum value per row:')
row_minima = data.min(axis=1)
print(row_minima)

print('\nSum of values per column:')
column_sums = data.sum()
print(column_sums)

print('\nAverage value per row:')
row_means = data.mean(axis=1)
print(row_means)

print('\nCalculate max - min per column')
column_spread = data.apply(f)
print(column_spread)

print('\nCalculate max - min per row')
row_spread = data.apply(f, axis=1)
print(row_spread)
class Portfolio(object):
    """This class represents portfolio and its events.

    Prices, volumes, trades and fees are frames indexed by time with one
    column per symbol.  Derived quantities (fees, positions, position
    values, free capital) are recomputed from the trades by :meth:`refresh`.
    """

    def __init__(self, ohlcs, starting_capital=100000, price_type='cprices',
                 transaction_fee_bps=15., transaction_fee_min=7):
        """Build an empty portfolio (no trades) over the given instruments.

        Args:
            ohlcs: objects exposing ``symbol``, ``timestamps``, ``volumes``
                and the attribute named by ``price_type``.
            starting_capital: cash available before any trade.
            price_type: name of the ohlc attribute used as price.
            transaction_fee_bps: proportional fee in basis points.
            transaction_fee_min: minimum fee charged for any non-zero trade.
        """
        self.price_type = price_type
        self.transaction_fee_bps = transaction_fee_bps
        self.transaction_fee_min = transaction_fee_min
        self.prices = self.from_ohlcs(ohlcs, price_type)
        self.volumes = self.from_ohlcs(ohlcs, 'volumes')
        self.trades = DataFrame(zeros(self.prices.shape),
                                self.prices.index, self.prices.columns)
        self.fees = DataFrame(zeros(self.prices.shape),
                              self.prices.index, self.prices.columns)
        self.starting_capital = starting_capital
        self.capital = []
        self.quantities = []
        self.values = []
        self.refresh()

    def __repr__(self):
        return '<Portfolio {}>'.format(self.prices.shape)

    def from_ohlcs(self, ohlcs, price_type):
        """Set prices using a list of ohlc classes.

        Returns one column per ohlc (named after its symbol), outer-joined
        on the timestamps, with gaps forward-filled.
        """
        dfs = []
        for ohlc in ohlcs:
            df = DataFrame(getattr(ohlc, price_type),
                           posix_as_dt(ohlc.timestamps))
            dfs.append(df)
        prices = concat(dfs, join='outer', axis=1)
        prices.columns = [ohlc.symbol for ohlc in ohlcs]
        # ffill() replaces the deprecated fillna(method='pad').
        return prices.ffill()

    def refresh(self):
        """Calculates positions, values, free capital and costs from trades.

        Fees of short positions (if any) are same as cost for long.
        This is not realistic, but the class is intended to represent
        long only portfolios.
        """
        traded = self.trades.abs()
        self.fees = self.transaction_fee_bps * traded * \
            self.prices / 10000
        # Every actual trade costs at least the minimum fee; rows without a
        # trade stay free.
        below_minimum = self.fees < self.transaction_fee_min
        is_trade = traded > 0
        self.fees[below_minimum & is_trade] = self.transaction_fee_min
        self.quantities = self.trades.cumsum()
        self.values = self.quantities * self.prices
        self.capital = self.starting_capital + self.total_trade_values - \
            self.total_fees

    def trade(self, timestring, symbol, quantity):
        """Convenience function to enter trades and refresh.

        Raises:
            KeyError: if ``timestring`` does not match any row of the trade
                table.
        """
        if timestring not in self.trades.index:
            raise KeyError(timestring)
        # A single .loc assignment writes into the frame itself; the chained
        # form ``trades[symbol][timestring] = ...`` is silently lost under
        # pandas Copy-on-Write.
        self.trades.loc[timestring, symbol] = quantity
        self.refresh()

    def trade_max(self):
        """Trade all capital on first day, equal sized positions."""
        first_day = dt_as_str(self.prices.index[0])
        symbols = self.prices.columns
        capital_per_position = self.starting_capital / float(len(symbols))
        # Whole units only: truncate the affordable quantity per symbol.
        trade_sizes = trunc(capital_per_position / self.prices.iloc[0].values)
        for symbol, size in zip(symbols, trade_sizes):
            self.trade(first_day, symbol, size)
        self.refresh()

    @property
    def market_value(self):
        """Value of equity positions at each time."""
        return self.values.sum(axis=1)

    @property
    def total_value(self):
        """Total value of portfolio at each time."""
        return self.market_value + self.capital

    @property
    def trade_values(self):
        """Trade values for each trade.

        Purchases are negative (cash leaves the portfolio); missing and zero
        entries are normalised to 0.
        """
        tvals = -self.trades * self.prices
        to_0 = (tvals == 0) | isnull(tvals)
        tvals[to_0] = 0
        return tvals

    @property
    def total_trade_values(self):
        """Cumulative sum of all trades."""
        return self.trade_values.sum(axis=1).cumsum()

    @property
    def total_fees(self):
        """Cumulative sum of fees."""
        return self.fees.sum(axis=1).cumsum()
#%%
# STEP 4
# This cell calculates the absolute differences between the proportions of
# one race and another in the wards.
# The assign function from pandas is used to create new columns that are
# populated with these calculations.

# Reads the Excel file.
# NOTE(review): the returned frame is not bound to a name, so this call only
# verifies that the workbook can be read — confirm whether its result was
# meant to be used.
pd.read_excel(
    r"S:\GEOG 6293.10 Special Topics 201603\Volpe, Travis - TVolpe1\Final Project\GIS Python Project\Project Data\Data\Wards_2010_DCcounts.xlsx"
)

# Calculates the proportion differences and uses assign to create the new
# columns; only the first nine rows are kept.
disindx_assign2 = (
    df.abs(
        disindx_assign1.assign(
            AbsWB=disindx_assign1['wi_WT'] - disindx_assign1['bi_BT']).assign(
                AbsWA=disindx_assign1['wi_WT'] - disindx_assign1['ai_AT'])
        #.assign(AbsBW = disindx_assign1[ 'bi_BT' ] - disindx_assign1[ 'wi_WT' ]) : Not needed; used to confirm absolute value was working
        .assign(AbsBA=disindx_assign1['bi_BT'] -
                disindx_assign1['ai_AT']).head(n=9)))

# Creates an Excel file populated with the data calculated for
# disindx_assign2.  The context manager saves and closes the workbook;
# ExcelWriter.save() no longer exists in pandas 2.0.
with pd.ExcelWriter(
        r"S:\GEOG 6293.10 Special Topics 201603\Volpe, Travis - TVolpe1\Final Project\GIS Python Project\Project Data\Data\Wards_2010_DCratio.xlsx"
) as writer:
    disindx_assign2.to_excel(writer, )

# Step 4a: in the exported Excel file calculate the dissimilarity index
# D = 1/2(SUM(Abs_Races))
# As the dissimilarity index is a global measure and in this context is only
# being calculated for the entire city, it did not seem necessary to use
# pandas to do this.
def test_operators_timedelta64(self):
    """Check min/max/abs reductions on timedelta64 columns.

    Covers pure timedelta frames, a frame mixing timedeltas with strings,
    numbers and timestamps, and (GH 3106) that timedelta columns keep their
    dtype after the frame is consolidated.
    """
    df = DataFrame(
        dict(
            A=date_range("2012-1-1", periods=3, freq="D"),
            B=date_range("2012-1-2", periods=3, freq="D"),
            C=Timestamp("20120101") - timedelta(minutes=5, seconds=5),
        ))

    # A - C grows by one day per row; A - B is the same negative one-day
    # difference in every row.
    diffs = DataFrame(dict(A=df["A"] - df["C"], B=df["A"] - df["B"]))

    # min
    result = diffs.min()
    assert result[0] == diffs.loc[0, "A"]
    assert result[1] == diffs.loc[0, "B"]

    result = diffs.min(axis=1)
    assert (result == diffs.loc[0, "B"]).all()

    # max
    result = diffs.max()
    assert result[0] == diffs.loc[2, "A"]
    assert result[1] == diffs.loc[2, "B"]

    result = diffs.max(axis=1)
    assert (result == diffs["A"]).all()

    # abs: both the method and the builtin must flip the negative column
    result = diffs.abs()
    result2 = abs(diffs)
    expected = DataFrame(dict(A=df["A"] - df["C"], B=df["B"] - df["A"]))
    tm.assert_frame_equal(result, expected)
    tm.assert_frame_equal(result2, expected)

    # mixed frame
    mixed = diffs.copy()
    mixed["C"] = "foo"
    mixed["D"] = 1
    mixed["E"] = 1.0
    mixed["F"] = Timestamp("20130101")

    # results in an object array
    result = mixed.min()
    expected = Series(
        [
            pd.Timedelta(timedelta(seconds=5 * 60 + 5)),
            pd.Timedelta(timedelta(days=-1)),
            "foo",
            1,
            1.0,
            Timestamp("20130101"),
        ],
        index=mixed.columns,
    )
    tm.assert_series_equal(result, expected)

    # row-wise reduction: the expected values come from the numeric columns
    # D and E only
    result = mixed.min(axis=1)
    expected = Series([1, 1, 1.0], index=[0, 1, 2])
    tm.assert_series_equal(result, expected)

    # works when only those columns are selected
    result = mixed[["A", "B"]].min(1)
    expected = Series([timedelta(days=-1)] * 3)
    tm.assert_series_equal(result, expected)

    result = mixed[["A", "B"]].min()
    expected = Series([timedelta(seconds=5 * 60 + 5),
                       timedelta(days=-1)],
                      index=["A", "B"])
    tm.assert_series_equal(result, expected)

    # GH 3106: timedelta columns must keep their dtype through consolidation
    df = DataFrame({
        "time": date_range("20130102", periods=5),
        "time2": date_range("20130105", periods=5),
    })
    df["off1"] = df["time2"] - df["time"]
    assert df["off1"].dtype == "timedelta64[ns]"

    df["off2"] = df["time"] - df["time2"]
    df._consolidate_inplace()
    assert df["off1"].dtype == "timedelta64[ns]"
    assert df["off2"].dtype == "timedelta64[ns]"
# NOTE(review): yc10 is defined earlier in the script, outside this excerpt.
yc10 - yc10[1]

# Arithmetic between a DataFrame and a Series.
frame7 = DataFrame(np.arange(12.).reshape((4, 3)),
                   columns=list('abc'),
                   index=['Utah', 'Ohio', 'Texas', 'Oregon'])
series_7 = frame7.iloc[0]
frame7 - series_7  # the Series is subtracted from every row; this is called broadcasting
series_7_1 = frame7['b']
frame7.sub(series_7_1, axis=0)

## Function application and mapping
frame8 = DataFrame(np.random.randn(4, 3),
                   columns=list('bde'),
                   index=['Ohio', 'Utah', 'Texas', 'Oregon'])
frame8
frame8.abs()  # = np.abs(frame8), takes the absolute value
f = lambda x: x.max() - x.min()
frame8.apply(f)
frame8.apply(f, axis=1)


def f(x):
    return Series([x.min(), x.max()], index=['min', 'max'])


frame8.apply(f)  # the function passed in may also return a Series of several values
f = lambda x: '%.2f' % x
frame8.applymap(f)  # note that applymap is used here
frame8['b'].map(f)  # a Series offers the element-wise map method, so map is used for a single column
def correlation_matrix_to_sorted_pairs(corr: pd.DataFrame):
    """List every unordered feature pair once, sorted by absolute correlation.

    Parameters
    ----------
    corr : pd.DataFrame
        Correlation matrix with unnamed index and columns.

    Returns
    -------
    pd.DataFrame
        Columns ``level_0`` and ``level_1`` hold the two labels of a pair and
        column ``0`` its absolute correlation, in ascending order.  Self
        pairs are excluded and each pair appears exactly once.
    """
    pairs = corr.abs().stack().reset_index()

    # Drop entries labelled 'level_0'.
    # NOTE(review): presumably removes a stray header row/column left over
    # from an earlier round trip — confirm.
    pairs = pairs.loc[(pairs['level_0'] != 'level_0')
                      & (pairs['level_1'] != 'level_0')]
    pairs = pairs.loc[pairs['level_0'] != pairs['level_1']]

    # Keep the first occurrence of each unordered pair.  Taking every second
    # row of the sorted frame only works when the two mirrored entries end
    # up adjacent, which ties between different pairs break.
    seen = set()
    is_first = []
    for left, right in zip(pairs['level_0'], pairs['level_1']):
        key = frozenset((left, right))
        is_first.append(key not in seen)
        seen.add(key)
    pairs = pairs.loc[is_first]

    return pairs.sort_values([0], kind="stable").reset_index(drop=True)
def plot_comparatives(data: DataFrame):
    """Plot a comparison of three treatments of a windowed correlation matrix.

    The data is cut into windows starting at every 512th index label; the
    absolute Kendall correlation matrix of each window is computed and the
    element-wise median over all windows is taken.  That matrix is then
    shown three ways, each step working on the output of the previous one:
    unchanged, Gaussian filtered, and contrast adjusted.  Each heat map gets
    a line plot of its column sums above it.  A density plot, two box plots
    and a scatter plot of the clustering result complete the figure.

    Args:
        data: frame with one channel per column.

    Returns:
        The figure that was created.
    """
    columns = data.columns
    fig = figure(figsize=[14, 7])

    # Heat maps (row 1), their column-sum plots (row 0) and the colour bar.
    gs = GridSpec(3, 5,
                  width_ratios=[1.5, 1.5, 1.5, .06, 2.5],
                  height_ratios=[.3, 1, 1])
    gs.update(left=0.05, right=0.95, top=.95, wspace=0.3, hspace=0)
    hmap_original = subplot(gs[1, 0], xticks=[])
    hmap_original_x = subplot(gs[0, 0], yticks=[], xticks=[])
    hmap_blurred = subplot(gs[1, 1], xticks=[], yticks=[])
    hmap_blurred_x = subplot(gs[0, 1], yticks=[], xticks=[])
    hmap_contrast = subplot(gs[1, 2], xticks=[], yticks=[])
    hmap_contrast_x = subplot(gs[0, 2], yticks=[], xticks=[])
    hmap_cbar = subplot(gs[1, 3])

    # Density plot and scatter plot on the right-hand side.
    gs_s = GridSpec(3, 2, width_ratios=[5, 2], height_ratios=[.5, 1, 1])
    gs_s.update(left=0.05, right=0.95, top=.95, wspace=0.5, hspace=0.3)
    hist = subplot(gs_s[0, 1], yticks=[])
    scat = subplot(gs_s[1, 1], xticks=[], yticks=[])

    # Box plots along the bottom.
    gs2 = GridSpec(1, 2)
    gs2.update(left=0.05, right=0.95, top=.4, wspace=0.05)
    box1 = subplot(gs2[0, 0], xticks=[])
    box2 = subplot(gs2[0, 1], yticks=[], xticks=[])

    # ------------------------------------------------------------------
    # One entry per treatment.  NOTE(review): the field names used below
    # (c, title, ax, upper_ax, func, hist_label) are assumed to follow this
    # positional order — confirm against the definition of `template`.
    functions = (
        template('r', 'Modulus of medians for\n2 sec windows Kendall Tau',
                 hmap_original, hmap_original_x,
                 lambda x: x,
                 'original'),
        template('g', 'Modulus of Gaussian for\ncorrelation matrix',
                 hmap_blurred, hmap_blurred_x,
                 lambda x: gaussian_filter(x, sigma=(1, 1), order=0),
                 'gaussian'),
        template('b', 'Modulus of Gaussian\nwith increased contrast',
                 hmap_contrast, hmap_contrast_x,
                 lambda x: _adjust_contrast(x, lower=10, upper=90),
                 'high contrast'),
    )

    # Median over all windows of the absolute Kendall correlation matrices.
    dm = list()
    for start in data.index[::512]:
        m = data[:][start:start + 512].corr('kendall').abs()
        # .values replaces DataFrame.as_matrix(), which pandas 1.0 removed.
        dm.append(m.values)

    dm = asarray(dm)
    dm = nanmedian(dm, axis=0)

    hmap_kws = dict(xticklabels=10, yticklabels=10, square=True,
                    vmin=0, vmax=1)

    # HEATMAPS ----------------------------------------------
    dfm = DataFrame(dm, columns=columns, index=columns).abs()
    results = dict()

    for index, item in enumerate(functions):
        # Each treatment is applied to the result of the previous one.
        dfm = DataFrame(item.func(dfm.values),
                        columns=columns,
                        index=columns).abs()
        results[item.hist_label] = dfm.abs()

        sns.heatmap(dfm, ax=item.ax, cbar_ax=hmap_cbar, **hmap_kws)
        item.ax.set_yticklabels(item.ax.get_yticklabels(),
                                rotation=60, fontsize=8)
        item.ax.set_xticklabels(item.ax.get_xticklabels(),
                                rotation=30, fontsize=8)

        item.upper_ax.set_title(item.title)

        sum_dt = _smooth_line(dfm.values.sum(axis=0))
        item.upper_ax.plot(sum_dt)
        item.upper_ax.set_xlim(0, sum_dt.size)

        # Exact ones are excluded from the density estimate.
        d_nan = dfm.abs().copy()
        d_nan[d_nan == 1] = NaN
        sns.kdeplot(d_nan.values.ravel(), ax=hist, c=item.c,
                    label=item.hist_label, lw=0.8)
        hist.set_xticks([0])
        hist.legend(fontsize=8)

        # Only the first heat map keeps its y ticks and gets a y label.
        if index == 0:
            item.upper_ax.set_ylabel('Sum', fontsize=8)
            continue

        item.ax.set_yticks([])
    else:
        # Runs once after the loop has finished (the loop never breaks).
        hist.set_title(
            'Distributions of the medians\nof 2 sec Kendall kendall')

    sns.despine(left=True, right=True, top=True, bottom=False,
                offset=5, ax=hist)
    hmap_cbar.set_aspect(10)

    # BOX PLOTS -------------------------------------
    # dfm aliases the stored result, so zeroing the ones also affects the
    # frame that is plotted on the next line.
    dfm = results['original']
    dfm[dfm == 1] = 0

    sns.boxplot(data=results['original'], ax=box1, linewidth=0.5,
                fliersize=3)
    box1.set_title('Distribution of each channel (original)')
    sns.despine(left=False, right=True, top=True, bottom=True,
                ax=box1, offset=5)
    box1.set_yticks([0, 1])
    box1.set_yticklabels([0, 1])
    box1.set_xticks([])
    box1.set_xlabel('Channels', fontsize=8)
    box1.set_ylim(0, 1)

    # NOTE(review): unlike the 'original' panel, the ones in
    # results['high contrast'] are only zeroed after this box plot is drawn
    # — confirm that is intended.
    sns.boxplot(data=results['high contrast'], ax=box2, linewidth=0.5,
                fliersize=3)
    sns.despine(left=False, right=True, top=True, bottom=True,
                ax=box2, offset=5)
    box2.set_title('Distribution of each channel (high contrast)')
    box2.set_yticks([0, 1])
    box2.set_yticklabels([])
    box2.set_xticks([])
    box2.set_xlabel('Channels', fontsize=8)
    box2.set_ylim(0, 1)

    # SCATTER PLOT ----------------------------------
    dfm = results['high contrast']
    dfm[dfm == 1] = 0
    clustered_dt = _cluster(dfm.values)
    # NOTE(review): the 'spectral' colormap was renamed 'nipy_spectral' in
    # newer matplotlib releases — confirm against the installed version.
    scat.scatter(clustered_dt['X0'], clustered_dt['X1'],
                 c=clustered_dt['columns_list'], cmap='spectral', s=15)
    scat.set_title('Ward linkage')
    scat.set_xticks([])
    scat.set_yticks([])

    return fig
'nova lox': 'salmon'
}

# Map every food to the animal it comes from; the lookup keys are lower case.
data['animal'] = data['food'].map(str.lower).map(meat_to_animal)
print(data)
# The same mapping expressed with a single function.
print(data['food'].map(lambda x: meat_to_animal[x.lower()]))
print('-------------------------')

# Data normalization
datafile = './data/normalization_data.xls'  # parameter initialisation
data = pd.read_excel(datafile, header=None)  # read the data

print((data - data.min()) / (data.max() - data.min()))  # min-max normalization
print((data - data.mean()) / data.std())  # zero-mean (z-score) normalization
print(data / 10 ** np.ceil(np.log10(data.abs().max())))  # decimal-scaling normalization
print('-------------------------')

# Replacing values
data = Series([1., -999., 2., -999., -1000., 3.])
print(data)

print(data.replace(-999, np.nan))  # one value -> NaN
print(data.replace([-999, -1000], np.nan))  # several values -> NaN
print(data.replace([-999, -1000], [np.nan, 0]))  # pairwise replacement from two lists
print(data.replace({-999: np.nan, -1000: 0}))  # pairwise replacement from a dict
print('-------------------------')
def compute_diff_size(diff: pd.DataFrame):
    """Return the sum of the absolute values of every cell in ``diff``."""
    magnitudes = diff.abs()
    return magnitudes.values.sum()
# NOTE(review): df1 and df2 are defined earlier in the script, outside this
# excerpt.
df1+df2
df1.add(df2, fill_value=0)

# Operations between DataFrame and Series
frame = DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),
                  index=['Utah', 'Ohio', 'Texas', 'Oregon'])
# First row by position.  .ix was removed in pandas 1.0; with a string index
# an integer key was resolved positionally, which is exactly .iloc.
series = frame.iloc[0]
frame - series  # broadcast over every row
series2 = frame['d']
frame.sub(series2, axis=0)  # broadcast over every column

#Function application and mapping------------------------
# NumPy ufuncs are applied element-wise
frame = DataFrame(np.random.randn(4, 3), columns=list('bde'),
                  index=['Utah', 'Ohio', 'Texas', 'Oregon'])
np.abs(frame)
frame.abs()

# DataFrame.apply applies the function to each column by default
f = lambda x: x.max() - x.min()  # x is one column (or row) passed as a Series
frame.apply(f)
frame.apply(f,axis=1)  # apply to each row


def f(x):
    return Series([x.min(), x.max()], index = ['min', 'max'])


frame.apply(f)

# Element-wise Python functions go through applymap; a Series uses map
format = lambda x: '%.2f' % x
frame.applymap(format)
frame['e'].map(format)

#Hierarchical indexing=====================================
data = Series(np.random.randn(10),
              index=[['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'd', 'd'],
                     [1, 2, 3, 1, 2, 3, 1, 2, 2, 3]])
def _generate_absolute_upper_triangle_corrs(self, corrs: DataFrame,
                                            mask: ndarray) -> DataFrame:
    """Return the absolute correlations, keeping only the masked cells.

    Cells where ``mask`` is true keep their absolute value; every other cell
    becomes NaN.  Judging by the name, ``mask`` is expected to select the
    upper triangle — the caller supplies it.
    """
    magnitudes = corrs.abs()
    return magnitudes.where(mask, np.nan)
import numpy as np
from pandas import DataFrame

# Create a 5 by 3 matrix of random values and wrap it in a labelled frame.
npdata = np.random.randn(5, 3)
columnNames = ['x1', 'x2', 'x3']
data = DataFrame(npdata, columns=columnNames)


def f(x):
    """Return the spread (max - min) of ``x``."""
    return x.max() - x.min()


# Absolute value of each element.
absolute_values = data.abs()
print(absolute_values)

# Maximum value of each column.
print("\nMaximum value per column : ")
column_maxima = data.max()
print(column_maxima)

# Minimum value of each row.
print('\nMinimum value per row : ')
row_minima = data.min(axis=1)
print(row_minima)

# Sum of the values in each column.
print('\nSum of values per column : ')
column_sums = data.sum()
print(column_sums)

# Average value of each row.
print('\nAverage value per row : ')
row_means = data.mean(axis=1)
print(row_means)

print('\nCalculate max - min per column')
column_spread = data.apply(f)
print(column_spread)

print('\nCalculate max - min per row')
row_spread = data.apply(f, axis=1)
print(row_spread)
'honey ham': 'pig',
'nova lox': 'salmon'
}

# Map every food to the animal it comes from; the lookup keys are lower case.
data['animal'] = data['food'].map(str.lower).map(meat_to_animal)
data
# The same mapping expressed with a single function.
data['food'].map(lambda x: meat_to_animal[x.lower()])

# Data normalization
datafile = 'd:/data/normalization_data.xls'  # parameter initialisation
data = pd.read_excel(datafile, header = None)  # read the data

(data - data.min())/(data.max() - data.min())  # min-max normalization
(data - data.mean())/data.std()  # zero-mean (z-score) normalization
data/10**np.ceil(np.log10(data.abs().max()))  # decimal-scaling normalization

### Replacing values
data = Series([1., -999., 2., -999., -1000., 3.])
data

data.replace(-999, np.nan)  # one value -> NaN
data.replace([-999, -1000], np.nan)  # several values -> NaN
data.replace([-999, -1000], [np.nan, 0])  # pairwise replacement from two lists
data.replace({-999: np.nan, -1000: 0})  # pairwise replacement from a dict