def brar(open_, high, low, close, length=None, scalar=None, drift=None, offset=None, **kwargs):
    """Indicator: BRAR (BRAR)

    Sentiment pair: AR compares (high - open) vs (open - low) strength,
    BR compares (high - prev close) vs (prev close - low) strength, each
    summed over a rolling window and scaled.

    Args:
        open_, high, low, close: price Series (validated via verify_series).
        length: rolling window length, defaults to 26.
        scalar: magnitude multiplier, defaults to 100.
        drift: periods to shift close for BR, via get_drift.
        offset: periods to shift the final results, via get_offset.

    Kwargs:
        fillna / fill_method: forwarded to Series.fillna.

    Returns:
        DataFrame with columns AR_{length} and BR_{length}.
    """
    # Validate Arguments
    open_ = verify_series(open_)
    high = verify_series(high)
    low = verify_series(low)
    close = verify_series(close)
    length = int(length) if length and length > 0 else 26
    scalar = float(scalar) if scalar else 100
    high_open_range = non_zero_range(high, open_)
    open_low_range = non_zero_range(open_, low)
    drift = get_drift(drift)
    offset = get_offset(offset)

    # Calculate Result
    hcy = non_zero_range(high, close.shift(drift))
    cyl = non_zero_range(close.shift(drift), low)
    hcy[hcy < 0] = 0  # Zero negative values
    cyl[cyl < 0] = 0  # ""

    ar = scalar * high_open_range.rolling(length).sum()
    ar /= open_low_range.rolling(length).sum()

    br = scalar * hcy.rolling(length).sum()
    br /= cyl.rolling(length).sum()

    # Offset
    if offset != 0:
        ar = ar.shift(offset)
        # BUG FIX: previously `br = ar.shift(offset)`, which silently
        # replaced the BR line with a shifted copy of AR.
        br = br.shift(offset)

    # Handle fills
    if 'fillna' in kwargs:
        ar.fillna(kwargs['fillna'], inplace=True)
        br.fillna(kwargs['fillna'], inplace=True)
    if 'fill_method' in kwargs:
        ar.fillna(method=kwargs['fill_method'], inplace=True)
        br.fillna(method=kwargs['fill_method'], inplace=True)

    # Name and Categorize it
    _props = f"_{length}"
    ar.name = f"AR{_props}"
    br.name = f"BR{_props}"
    ar.category = br.category = 'momentum'

    # Prepare DataFrame to return
    brardf = DataFrame({ar.name: ar, br.name: br})
    brardf.name = f"BRAR{_props}"
    brardf.category = 'momentum'
    return brardf
def test_cythonized_aggers(op_name):
    """Compare a cythonized groupby aggregation with a per-group Python loop."""
    frame = DataFrame({
        "A": [0, 0, 0, 0, 1, 1, 1, 1, 1, 1.0, np.nan, np.nan],
        "B": ["A", "B"] * 6,
        "C": np.random.randn(12),
    })
    frame.loc[2:10:2, "C"] = np.nan

    def op(obj):
        return getattr(obj, op_name)()

    # single column
    by_a = frame.drop(["B"], axis=1).groupby("A")
    expected = DataFrame({"C": {key: op(grp["C"]) for key, grp in by_a}})
    expected.index.name = "A"
    tm.assert_frame_equal(op(by_a), expected)

    # multiple columns
    by_ab = frame.groupby(["A", "B"])
    nested = {}
    for (key_a, key_b), grp in by_ab:
        nested.setdefault(key_a, {})[key_b] = op(grp["C"])
    expected = DataFrame(nested).T.stack(dropna=False)
    expected.index.names = ["A", "B"]
    expected.name = "C"
    result = op(by_ab)["C"]
    if op_name in ["sum", "prod"]:
        tm.assert_series_equal(result, expected)
def _testit(name):
    """Check groupby aggregation `name` against a hand-rolled per-group result."""
    def op(obj):
        return getattr(obj, name)()

    # single column
    by_a = df.drop(['B'], axis=1).groupby('A')
    expected = DataFrame({'C': {key: op(grp['C']) for key, grp in by_a}})
    expected.index.name = 'A'
    assert_frame_equal(op(by_a), expected)

    # multiple columns
    by_ab = df.groupby(['A', 'B'])
    nested = {}
    for (key_a, key_b), grp in by_ab:
        nested.setdefault(key_a, {})[key_b] = op(grp['C'])
    expected = DataFrame(nested).T.stack(dropna=False)
    expected.index.names = ['A', 'B']
    expected.name = 'C'
    result = op(by_ab)['C']
    if not tm._incompat_bottleneck_version(name):
        assert_series_equal(result, expected)
def join_closest_index(df: pd.DataFrame, other: pd.DataFrame, other_name: str = 'other') -> pd.DataFrame:
    """
    Join `df` with the closest row (by index) of `other`

    :param df: index in time
    :param other: index in time, constant intervals (the first two index
        values define the bucketing interval)
    :param other_name: name of the joined columns (used as a suffix on
        collisions when `other` has multiple columns)
    :return: joined DataFrame carrying `df`'s original index; `df` itself
        is left unmodified
    """
    original_index = df.index
    # BUG FIX: work on a copy. The temporary integer index below used to be
    # assigned onto the caller's DataFrame; since `join` returns a new
    # object, the restored index was only ever set on the result, leaving
    # the caller's `df` permanently rebucketed.
    df = df.copy()

    # Bucket df's index into `other`'s (assumed constant) sampling interval.
    round_index = other.index.values[1] - other.index.values[0]
    df.index = np.floor(df.index / round_index).astype(int)
    other_offset = (other.index / round_index).astype(int).min() - 1
    # `other` is rebound here, so the caller's object is not mutated below.
    other = other.reset_index(
        drop=True)  # make sure whole int span is exactly covered
    other.index = other.index + other_offset
    other.name = other_name
    if len(other.columns) == 1:
        df = df.join(other)
    else:
        df = df.join(other, rsuffix=f'_{other_name}')
    df.index = original_index
    return df
def true_range(high, low, close, drift=None, offset=None, **kwargs):
    """Indicator: True Range"""
    # Validate arguments
    high = verify_series(high)
    low = verify_series(low)
    close = verify_series(close)
    high_low_range = non_zero_range(high, low)
    drift = get_drift(drift)
    offset = get_offset(offset)

    # True range: the largest absolute spread among the three candidates.
    prev_close = close.shift(drift)
    candidates = [high_low_range, high - prev_close, prev_close - low]
    tr = DataFrame(candidates).T.abs().max(axis=1)

    # Offset
    if offset != 0:
        tr = tr.shift(offset)

    # Handle fills
    if 'fillna' in kwargs:
        tr.fillna(kwargs['fillna'], inplace=True)
    if 'fill_method' in kwargs:
        tr.fillna(method=kwargs['fill_method'], inplace=True)

    # Name and Categorize it
    tr.name = f"TRUERANGE_{drift}"
    tr.category = 'volatility'
    return tr
def test_cythonized_aggers(op_name):
    """Verify a cythonized groupby op against a pure-Python per-group loop."""
    frame = DataFrame({'A': [0, 0, 0, 0, 1, 1, 1, 1, 1, 1., np.nan, np.nan],
                       'B': ['A', 'B'] * 6,
                       'C': np.random.randn(12)})
    frame.loc[2:10:2, 'C'] = np.nan
    op = lambda x: getattr(x, op_name)()

    # single column
    grouped_single = frame.drop(['B'], axis=1).groupby('A')
    expected = DataFrame({'C': {cat: op(group['C'])
                                for cat, group in grouped_single}})
    expected.index.name = 'A'
    tm.assert_frame_equal(op(grouped_single), expected)

    # multiple columns
    grouped_multi = frame.groupby(['A', 'B'])
    collected = {}
    for (cat1, cat2), group in grouped_multi:
        collected.setdefault(cat1, {})[cat2] = op(group['C'])
    expected = DataFrame(collected).T.stack(dropna=False)
    expected.index.names = ['A', 'B']
    expected.name = 'C'
    result = op(grouped_multi)['C']
    if op_name in ['sum', 'prod']:
        tm.assert_series_equal(result, expected)
def ppo(close, fast=None, slow=None, signal=None, scalar=None, offset=None, **kwargs):
    """Indicator: Percentage Price Oscillator (PPO)

    PPO = scalar * (fast SMA - slow SMA) / slow SMA, with an EMA signal line
    and a histogram (ppo - signal).

    Args:
        close: price Series.
        fast / slow: MA lengths, defaults 12 / 26 (swapped if slow < fast).
        signal: signal EMA length, default 9.
        scalar: magnitude multiplier, default 100.
        offset: periods to shift the results, via get_offset.

    Kwargs:
        fillna / fill_method: forwarded to Series.fillna.

    Returns:
        DataFrame with PPO, PPOh (histogram) and PPOs (signal) columns.
    """
    # Validate Arguments
    close = verify_series(close)
    fast = int(fast) if fast and fast > 0 else 12
    slow = int(slow) if slow and slow > 0 else 26
    signal = int(signal) if signal and signal > 0 else 9
    scalar = float(scalar) if scalar else 100
    if slow < fast:
        fast, slow = slow, fast
    # NOTE: a `min_periods` kwarg was previously parsed here but never used;
    # the dead computation has been removed.
    offset = get_offset(offset)

    # Calculate Result
    fastma = sma(close, length=fast)
    slowma = sma(close, length=slow)
    ppo = scalar * (fastma - slowma)
    ppo /= slowma
    signalma = ema(ppo, length=signal)
    histogram = ppo - signalma

    # Offset
    if offset != 0:
        ppo = ppo.shift(offset)
        histogram = histogram.shift(offset)
        signalma = signalma.shift(offset)

    # Handle fills
    if "fillna" in kwargs:
        ppo.fillna(kwargs["fillna"], inplace=True)
        histogram.fillna(kwargs["fillna"], inplace=True)
        signalma.fillna(kwargs["fillna"], inplace=True)
    if "fill_method" in kwargs:
        ppo.fillna(method=kwargs["fill_method"], inplace=True)
        histogram.fillna(method=kwargs["fill_method"], inplace=True)
        signalma.fillna(method=kwargs["fill_method"], inplace=True)

    # Name and Categorize it
    _props = f"_{fast}_{slow}_{signal}"
    ppo.name = f"PPO{_props}"
    histogram.name = f"PPOh{_props}"
    signalma.name = f"PPOs{_props}"
    ppo.category = histogram.category = signalma.category = "momentum"

    # Prepare DataFrame to return
    data = {ppo.name: ppo, histogram.name: histogram, signalma.name: signalma}
    df = DataFrame(data)
    df.name = f"PPO{_props}"
    df.category = ppo.category
    return df
def test_factor_weights(self, factor_vals, tickers, groups, demeaned, group_adjust, equal_weight, expected_vals):
    """factor_weights should reproduce the expected per-asset weights."""
    dates = date_range('1/12/2000', periods=len(factor_vals))
    factor = DataFrame(index=dates, columns=tickers, data=factor_vals).stack()
    factor.index = factor.index.set_names(['date', 'asset'])
    factor.name = 'factor'

    factor_data = DataFrame()
    factor_data['factor'] = factor
    group_map = Series(groups)
    asset_groups = group_map[factor.index.get_level_values('asset')].values
    factor_data['group'] = Series(index=factor.index, data=asset_groups)

    result = factor_weights(factor_data, demeaned, group_adjust, equal_weight)
    expected = Series(data=expected_vals,
                      index=factor_data.index,
                      name='factor')
    assert_series_equal(result, expected)
def accbands(high, low, close, length=None, c=None, drift=None, mamode=None, offset=None, **kwargs):
    """Indicator: Acceleration Bands (ACCBANDS)"""
    # Validate arguments
    high = verify_series(high)
    low = verify_series(low)
    close = verify_series(close)
    high_low_range = non_zero_range(high, low)
    length = int(length) if length and length > 0 else 20
    c = float(c) if c and c > 0 else 4
    mamode = mamode if isinstance(mamode, str) else "sma"
    drift = get_drift(drift)
    offset = get_offset(offset)

    # Band seeds: prices scaled by c * (range / (high + low)), then smoothed.
    hl_ratio = c * (high_low_range / (high + low))
    lower = ma(mamode, low * (1 - hl_ratio), length=length)
    mid = ma(mamode, close, length=length)
    upper = ma(mamode, high * (1 + hl_ratio), length=length)

    # Offset
    if offset != 0:
        lower = lower.shift(offset)
        mid = mid.shift(offset)
        upper = upper.shift(offset)

    # Handle fills
    if "fillna" in kwargs:
        for band in (lower, mid, upper):
            band.fillna(kwargs["fillna"], inplace=True)
    if "fill_method" in kwargs:
        for band in (lower, mid, upper):
            band.fillna(method=kwargs["fill_method"], inplace=True)

    # Name and Categorize it
    lower.name = f"ACCBL_{length}"
    mid.name = f"ACCBM_{length}"
    upper.name = f"ACCBU_{length}"
    mid.category = upper.category = lower.category = "volatility"

    # Prepare DataFrame to return
    accbandsdf = DataFrame({lower.name: lower, mid.name: mid, upper.name: upper})
    accbandsdf.name = f"ACCBANDS_{length}"
    accbandsdf.category = mid.category
    return accbandsdf
def kc(high, low, close, length=None, scalar=None, mamode=None, offset=None, **kwargs):
    """Indicator: Keltner Channels (KC)

    Basis is a moving average of close; the bands are basis +/- scalar * a
    smoothed range (true range by default, plain high-low range when the
    kwarg tr=False is passed).

    BUG FIX: an unrecognized `mamode` string previously matched neither the
    "sma" branch nor the `None`/"ema" branch, leaving `basis`/`band`
    undefined and raising NameError. Any mode other than "sma" now takes
    the EMA path, which also preserves the old default behavior.
    """
    # Validate arguments
    high = verify_series(high)
    low = verify_series(low)
    close = verify_series(close)
    length = int(length) if length and length > 0 else 20
    scalar = float(scalar) if scalar and scalar > 0 else 2
    mamode = mamode.lower() if mamode else None
    offset = get_offset(offset)

    # Calculate Result
    use_tr = kwargs.pop("tr", True)
    if use_tr:
        range_ = true_range(high, low, close)
    else:
        range_ = high_low_range(high, low)

    _mode = ""
    if mamode == "sma":
        basis = sma(close, length)
        band = sma(range_, length=length)
        _mode += "s"
    else:  # None, "ema", and fallback for unknown modes
        basis = ema(close, length=length)
        band = ema(range_, length=length)

    lower = basis - scalar * band
    upper = basis + scalar * band

    # Offset
    if offset != 0:
        lower = lower.shift(offset)
        basis = basis.shift(offset)
        upper = upper.shift(offset)

    # Handle fills
    if "fillna" in kwargs:
        lower.fillna(kwargs["fillna"], inplace=True)
        basis.fillna(kwargs["fillna"], inplace=True)
        upper.fillna(kwargs["fillna"], inplace=True)
    if "fill_method" in kwargs:
        lower.fillna(method=kwargs["fill_method"], inplace=True)
        basis.fillna(method=kwargs["fill_method"], inplace=True)
        upper.fillna(method=kwargs["fill_method"], inplace=True)

    # Name and Categorize it
    _props = f"{_mode if len(_mode) else ''}_{length}_{scalar}"
    lower.name = f"KCL{_props}"
    basis.name = f"KCB{_props}"
    upper.name = f"KCU{_props}"
    basis.category = upper.category = lower.category = "volatility"

    # Prepare DataFrame to return
    data = {lower.name: lower, basis.name: basis, upper.name: upper}
    kcdf = DataFrame(data)
    kcdf.name = f"KC{_props}"
    kcdf.category = basis.category
    return kcdf
def test_cythonized_aggers(op_name):
    """Cythonized groupby aggregations must match looped per-group results."""
    values = {
        'A': [0, 0, 0, 0, 1, 1, 1, 1, 1, 1., np.nan, np.nan],
        'B': ['A', 'B'] * 6,
        'C': np.random.randn(12)
    }
    df = DataFrame(values)
    df.loc[2:10:2, 'C'] = np.nan

    def op(obj):
        return getattr(obj, op_name)()

    # single column
    grouped = df.drop(['B'], axis=1).groupby('A')
    exp = DataFrame({'C': {label: op(part['C']) for label, part in grouped}})
    exp.index.name = 'A'
    tm.assert_frame_equal(op(grouped), exp)

    # multiple columns
    grouped = df.groupby(['A', 'B'])
    accum = {}
    for (lvl1, lvl2), part in grouped:
        accum.setdefault(lvl1, {})[lvl2] = op(part['C'])
    exp = DataFrame(accum).T.stack(dropna=False)
    exp.index.names = ['A', 'B']
    exp.name = 'C'
    result = op(grouped)['C']
    if op_name in ['sum', 'prod']:
        tm.assert_series_equal(result, exp)
def test_factor_weights(
    self,
    factor_vals,
    tickers,
    groups,
    demeaned,
    group_adjust,
    equal_weight,
    expected_vals,
):
    """Weights from factor_weights must equal the parametrized expectation."""
    idx = date_range("1/12/2000", periods=len(factor_vals))
    factor = DataFrame(index=idx, columns=tickers, data=factor_vals).stack()
    factor.index = factor.index.set_names(["date", "asset"])
    factor.name = "factor"

    factor_data = DataFrame()
    factor_data["factor"] = factor
    grouping = Series(groups)
    factor_data["group"] = Series(
        index=factor.index,
        data=grouping[factor.index.get_level_values("asset")].values,
    )

    weights = factor_weights(factor_data, demeaned, group_adjust, equal_weight)
    expected = Series(data=expected_vals, index=factor_data.index, name="factor")
    assert_series_equal(weights, expected)
def adx(high, low, close, length=None, scalar=None, drift=None, offset=None, **kwargs):
    """Indicator: ADX"""
    # Validate Arguments
    high = verify_series(high)
    low = verify_series(low)
    close = verify_series(close)
    length = length if length and length > 0 else 14
    scalar = float(scalar) if scalar else 100
    drift = get_drift(drift)
    offset = get_offset(offset)

    # Calculate Result
    atr_ = atr(high=high, low=low, close=close, length=length)

    # Directional movement: keep only the dominant, positive moves.
    up_move = high - high.shift(drift)  # high.diff(drift)
    down_move = low.shift(drift) - low  # low.diff(-drift).shift(drift)
    pos = ((up_move > down_move) & (up_move > 0)) * up_move
    neg = ((down_move > up_move) & (down_move > 0)) * down_move
    pos = pos.apply(zero)
    neg = neg.apply(zero)

    scale = scalar / atr_
    dmp = scale * rma(close=pos, length=length)
    dmn = scale * rma(close=neg, length=length)
    dx = scalar * (dmp - dmn).abs() / (dmp + dmn)
    adx = rma(close=dx, length=length)

    # Offset
    if offset != 0:
        dmp = dmp.shift(offset)
        dmn = dmn.shift(offset)
        adx = adx.shift(offset)

    # Handle fills
    if "fillna" in kwargs:
        for series in (adx, dmp, dmn):
            series.fillna(kwargs["fillna"], inplace=True)
    if "fill_method" in kwargs:
        for series in (adx, dmp, dmn):
            series.fillna(method=kwargs["fill_method"], inplace=True)

    # Name and Categorize it
    adx.name = f"ADX_{length}"
    dmp.name = f"DMP_{length}"
    dmn.name = f"DMN_{length}"
    adx.category = dmp.category = dmn.category = "trend"

    # Prepare DataFrame to return
    adxdf = DataFrame({adx.name: adx, dmp.name: dmp, dmn.name: dmn})
    adxdf.name = f"ADX_{length}"
    adxdf.category = "trend"
    return adxdf
def get_productForm(self):
    """Query product rows and property names from the DB and build
    self.productForm, a DataFrame of products (rows) by properties (columns)
    with NaN filled as 0.

    FIX: the frame was previously built inside a `finally` block, so it was
    constructed from partial/empty lists even when a query raised (before
    the exception propagated); construction now happens only on success.
    The no-op `except Exception as e: raise e` was also removed.
    """
    products_list = []
    prop_list = []
    # NOTE(review): the templates are formatted with a positional dict,
    # which presumably use the "{0[product]}" style — confirm the templates.
    self_SQL_PRODUCT = SQL_PRODUCT.format({'product': self.product})
    self_SQL_PROP = SQL_PROP.format({'prop': self.prop})

    CUR.execute(self_SQL_PRODUCT)
    for row in CUR.fetchall():
        products_list.append(row)

    CUR.execute(self_SQL_PROP)
    for row in CUR.fetchall():
        prop_list.append(row[0])

    # Use the prop table's values as the product table's column names
    frame = DataFrame(products_list, columns=prop_list[:])
    frame.name = self.product + "_productForm"
    frame.index.names = ['product id']
    frame.columns.names = ['attribute']
    self.productForm = frame.fillna(0)
    print("Info: The {0[product]}'productForm is up to date.".format(
        {'product': self.product}))
def factor_alpha_beta(
    factor_data: pd.DataFrame,
    returns: pd.DataFrame = None,
    demeaned: bool = True,
    group_adjust: bool = False,
    equal_weight: bool = False,
):
    """
    Compute the factor's alpha (excess return), the alpha t-statistic,
    and the beta.

    Parameters
    ----------
    :param factor_data: MultiIndex frame indexed by ['date', 'asset'] whose
        values include the factor value, forward returns, factor quantile,
        and optionally the factor group
    :param returns: forward factor returns; defaults to None, in which case
        they are computed by calling `factor_returns`
    :param demeaned: whether to evaluate against a long/short portfolio
    :param group_adjust: whether to apply industry/group neutralization
    :param equal_weight: presumably whether the portfolio is equal-weighted
        (forwarded to `factor_returns`) — confirm against its documentation

    Returns
    -------
    DataFrame with rows "Ann. alpha" and "beta" and one column per period.
    """
    if returns is None:
        returns = factor_returns(
            factor_data, demeaned, group_adjust, equal_weight
        )
    # Mean forward return of the whole universe per date, aligned to
    # the dates present in `returns`.
    universe_ret = (
        factor_data.groupby(level="datetime")[get_forward_returns_columns(
            factor_data.columns
        )].mean().loc[returns.index]
    )
    # Normalize a Series of returns into a one-column frame so the loop
    # below can treat both cases uniformly.
    if isinstance(returns, pd.Series):
        returns.name = universe_ret.columns.values[0]
        returns = pd.DataFrame(returns)
    alpha_beta = pd.DataFrame()
    for period in returns.columns.values:
        # OLS of factor returns (y) on universe returns (x) with intercept:
        # intercept = per-period alpha, slope = beta.
        x = universe_ret[period].values
        y = returns[period].values
        x = add_constant(x)
        reg_fit = OLS(y, x).fit()
        try:
            alpha, beta = reg_fit.params
        except ValueError:
            # Degenerate fit (e.g. constant column collapsed) — record NaNs.
            alpha_beta.loc["Ann. alpha", period] = np.nan
            alpha_beta.loc["beta", period] = np.nan
        else:
            # Compound the per-period alpha up to an annualized figure.
            freq_adjust = pd.Timedelta(days=DAYS_PER_YEAR) / pd.Timedelta(
                utils.get_period(period.replace("period_", ""))
            )
            alpha_beta.loc["Ann. alpha", period] = (1 + alpha)**freq_adjust - 1.0
            alpha_beta.loc["beta", period] = beta
    return alpha_beta
def _testit(name):
    """Compare groupby aggregation `name` with explicit per-group loops."""
    op = lambda x: getattr(x, name)()

    # single column
    grouped = df.drop(['B'], axis=1).groupby('A')
    expected = DataFrame({'C': {label: op(part['C'])
                                for label, part in grouped}})
    expected.index.name = 'A'
    assert_frame_equal(op(grouped), expected)

    # multiple columns
    grouped = df.groupby(['A', 'B'])
    collected = {}
    for (lvl1, lvl2), part in grouped:
        collected.setdefault(lvl1, {})[lvl2] = op(part['C'])
    expected = DataFrame(collected).T.stack(dropna=False)
    expected.index.names = ['A', 'B']
    expected.name = 'C'
    result = op(grouped)['C']
    if name in ['sum', 'prod']:
        assert_series_equal(result, expected)
def aroon(high, low, length=None, scalar=None, talib=None, offset=None, **kwargs):
    """Indicator: Aroon & Aroon Oscillator"""
    # Validate Arguments
    length = length if length and length > 0 else 14
    scalar = float(scalar) if scalar else 100
    high = verify_series(high, length)
    low = verify_series(low, length)
    offset = get_offset(offset)
    mode_tal = bool(talib) if isinstance(talib, bool) else True

    if high is None or low is None:
        return

    # Calculate Result
    if Imports["talib"] and mode_tal:
        from talib import AROON, AROONOSC
        aroon_down, aroon_up = AROON(high, low, length)
        aroon_osc = AROONOSC(high, low, length)
    else:
        # Bars since the most recent extreme within the window.
        bars_since_high = high.rolling(length + 1).apply(recent_maximum_index, raw=True)
        bars_since_low = low.rolling(length + 1).apply(recent_minimum_index, raw=True)
        aroon_up = scalar * (1 - (bars_since_high / length))
        aroon_down = scalar * (1 - (bars_since_low / length))
        aroon_osc = aroon_up - aroon_down

    # Handle fills
    if "fillna" in kwargs:
        for series in (aroon_up, aroon_down, aroon_osc):
            series.fillna(kwargs["fillna"], inplace=True)
    if "fill_method" in kwargs:
        for series in (aroon_up, aroon_down, aroon_osc):
            series.fillna(method=kwargs["fill_method"], inplace=True)

    # Offset
    if offset != 0:
        aroon_up = aroon_up.shift(offset)
        aroon_down = aroon_down.shift(offset)
        aroon_osc = aroon_osc.shift(offset)

    # Name and Categorize it
    aroon_up.name = f"AROONU_{length}"
    aroon_down.name = f"AROOND_{length}"
    aroon_osc.name = f"AROONOSC_{length}"
    aroon_down.category = aroon_up.category = aroon_osc.category = "trend"

    # Prepare DataFrame to return
    aroondf = DataFrame({
        aroon_down.name: aroon_down,
        aroon_up.name: aroon_up,
        aroon_osc.name: aroon_osc,
    })
    aroondf.name = f"AROON_{length}"
    aroondf.category = aroon_down.category
    return aroondf
def bbands(close, length=None, std=None, mamode=None, offset=None, **kwargs):
    """Indicator: Bollinger Bands (BBANDS)"""
    # Validate arguments
    close = verify_series(close)
    length = int(length) if length and length > 0 else 5
    std = float(std) if std and std > 0 else 2.0
    mamode = mamode if isinstance(mamode, str) else "sma"
    offset = get_offset(offset)

    # Mid is a moving average; bands sit +/- std rolling deviations away.
    deviations = std * stdev(close=close, length=length)
    mid = ma(mamode, close, length=length, **kwargs)
    lower = mid - deviations
    upper = mid + deviations
    bandwidth = 100 * (upper - lower) / mid

    # Offset
    if offset != 0:
        lower = lower.shift(offset)
        mid = mid.shift(offset)
        upper = upper.shift(offset)
        bandwidth = bandwidth.shift(offset)

    # Handle fills
    if "fillna" in kwargs:
        for series in (lower, mid, upper, bandwidth):
            series.fillna(kwargs["fillna"], inplace=True)
    if "fill_method" in kwargs:
        for series in (lower, mid, upper, bandwidth):
            series.fillna(method=kwargs["fill_method"], inplace=True)

    # Name and Categorize it
    lower.name = f"BBL_{length}_{std}"
    mid.name = f"BBM_{length}_{std}"
    upper.name = f"BBU_{length}_{std}"
    bandwidth.name = f"BBB_{length}_{std}"
    upper.category = lower.category = "volatility"
    mid.category = bandwidth.category = upper.category

    # Prepare DataFrame to return
    bbandsdf = DataFrame({
        lower.name: lower,
        mid.name: mid,
        upper.name: upper,
        bandwidth.name: bandwidth
    })
    bbandsdf.name = f"BBANDS_{length}_{std}"
    bbandsdf.category = mid.category
    return bbandsdf
def kvo(high, low, close, volume, fast=None, slow=None, signal=None, mamode=None, drift=None, offset=None, **kwargs):
    """Indicator: Klinger Volume Oscillator (KVO)"""
    # Validate arguments
    fast = int(fast) if fast and fast > 0 else 34
    slow = int(slow) if slow and slow > 0 else 55
    signal = int(signal) if signal and signal > 0 else 13
    mamode = mamode.lower() if mamode and isinstance(mamode, str) else "ema"
    _length = max(fast, slow, signal)
    high = verify_series(high, _length)
    low = verify_series(low, _length)
    close = verify_series(close, _length)
    volume = verify_series(volume, _length)
    drift = get_drift(drift)
    offset = get_offset(offset)

    if high is None or low is None or close is None or volume is None:
        return

    # Volume signed by the direction of the typical price (hlc3); drop the
    # leading NaN run before smoothing.
    signed_volume = volume * signed_series(hlc3(high, low, close), 1)
    sv = signed_volume.loc[signed_volume.first_valid_index():, ]
    osc = ma(mamode, sv, length=fast) - ma(mamode, sv, length=slow)
    osc_signal = ma(mamode, osc.loc[osc.first_valid_index():, ], length=signal)

    # Offset
    if offset != 0:
        osc = osc.shift(offset)
        osc_signal = osc_signal.shift(offset)

    # Handle fills
    if "fillna" in kwargs:
        osc.fillna(kwargs["fillna"], inplace=True)
        osc_signal.fillna(kwargs["fillna"], inplace=True)
    if "fill_method" in kwargs:
        osc.fillna(method=kwargs["fill_method"], inplace=True)
        osc_signal.fillna(method=kwargs["fill_method"], inplace=True)

    # Name and Categorize it
    _props = f"_{fast}_{slow}_{signal}"
    osc.name = f"KVO{_props}"
    osc_signal.name = f"KVOs{_props}"
    osc.category = osc_signal.category = "volume"

    # Prepare DataFrame to return
    df = DataFrame({osc.name: osc, osc_signal.name: osc_signal})
    df.name = f"KVO{_props}"
    df.category = osc.category
    return df
def aberration(high, low, close, length=None, atr_length=None, offset=None, **kwargs):
    """Indicator: Aberration (ABER)"""
    # Validate arguments
    high = verify_series(high)
    low = verify_series(low)
    close = verify_series(close)
    length = int(length) if length and length > 0 else 5
    atr_length = int(atr_length) if atr_length and atr_length > 0 else 15
    offset = get_offset(offset)

    # Mid line (zg) is an SMA of the typical price; upper (sg) and
    # lower (xg) are the mid +/- ATR.
    atr_ = atr(high=high, low=low, close=close, length=atr_length)
    jg = hlc3(high=high, low=low, close=close)
    zg = sma(jg, length)
    sg = zg + atr_
    xg = zg - atr_

    # Offset
    if offset != 0:
        zg, sg, xg, atr_ = (s.shift(offset) for s in (zg, sg, xg, atr_))

    # Handle fills
    if "fillna" in kwargs:
        for s in (zg, sg, xg, atr_):
            s.fillna(kwargs["fillna"], inplace=True)
    if "fill_method" in kwargs:
        for s in (zg, sg, xg, atr_):
            s.fillna(method=kwargs["fill_method"], inplace=True)

    # Name and Categorize it
    _props = f"_{length}_{atr_length}"
    zg.name = f"ABER_ZG{_props}"
    sg.name = f"ABER_SG{_props}"
    xg.name = f"ABER_XG{_props}"
    atr_.name = f"ABER_ATR{_props}"
    zg.category = sg.category = "volatility"
    xg.category = atr_.category = zg.category

    # Prepare DataFrame to return
    aberdf = DataFrame({zg.name: zg, sg.name: sg, xg.name: xg, atr_.name: atr_})
    aberdf.name = f"ABER{_props}"
    aberdf.category = zg.category
    return aberdf
def fisher(high, low, length=None, signal=None, offset=None, **kwargs): """Indicator: Fisher Transform (FISHT)""" # Validate Arguments length = int(length) if length and length > 0 else 9 signal = int(signal) if signal and signal > 0 else 1 _length = max(length, signal) high = verify_series(high, _length) low = verify_series(low, _length) offset = get_offset(offset) if high is None or low is None: return # Calculate Result hl2_ = hl2(high, low) highest_hl2 = hl2_.rolling(length).max() lowest_hl2 = hl2_.rolling(length).min() hlr = high_low_range(highest_hl2, lowest_hl2) hlr[hlr < 0.001] = 0.001 position = ((hl2_ - lowest_hl2) / hlr) - 0.5 v = 0 m = high.size result = [npNaN for _ in range(0, length - 1)] + [0] for i in range(length, m): v = 0.66 * position.iloc[i] + 0.67 * v if v < -0.99: v = -0.999 if v > 0.99: v = 0.999 result.append(0.5 * (nplog((1 + v) / (1 - v)) + result[i - 1])) fisher = Series(result, index=high.index) signalma = fisher.shift(signal) # Offset if offset != 0: fisher = fisher.shift(offset) signalma = signalma.shift(offset) # Handle fills if "fillna" in kwargs: fisher.fillna(kwargs["fillna"], inplace=True) signalma.fillna(kwargs["fillna"], inplace=True) if "fill_method" in kwargs: fisher.fillna(method=kwargs["fill_method"], inplace=True) signalma.fillna(method=kwargs["fill_method"], inplace=True) # Name and Categorize it _props = f"_{length}_{signal}" fisher.name = f"FISHERT{_props}" signalma.name = f"FISHERTs{_props}" fisher.category = signalma.category = "momentum" # Prepare DataFrame to return data = {fisher.name: fisher, signalma.name: signalma} df = DataFrame(data) df.name = f"FISHERT{_props}" df.category = fisher.category return df
def pvo(volume, fast=None, slow=None, signal=None, scalar=None, offset=None, **kwargs):
    """Indicator: Percentage Volume Oscillator (PVO)"""
    # Validate Arguments
    fast = int(fast) if fast and fast > 0 else 12
    slow = int(slow) if slow and slow > 0 else 26
    signal = int(signal) if signal and signal > 0 else 9
    scalar = float(scalar) if scalar else 100
    if slow < fast:
        fast, slow = slow, fast
    volume = verify_series(volume, max(fast, slow, signal))
    offset = get_offset(offset)

    if volume is None:
        return

    # PVO = scalar * (fast EMA - slow EMA) / slow EMA of volume.
    fast_ema = ema(volume, length=fast)
    slow_ema = ema(volume, length=slow)
    pvo = scalar * (fast_ema - slow_ema)
    pvo /= slow_ema
    signalma = ema(pvo, length=signal)
    histogram = pvo - signalma

    # Offset
    if offset != 0:
        pvo = pvo.shift(offset)
        histogram = histogram.shift(offset)
        signalma = signalma.shift(offset)

    # Handle fills
    if "fillna" in kwargs:
        for series in (pvo, histogram, signalma):
            series.fillna(kwargs["fillna"], inplace=True)
    if "fill_method" in kwargs:
        for series in (pvo, histogram, signalma):
            series.fillna(method=kwargs["fill_method"], inplace=True)

    # Name and Categorize it
    _props = f"_{fast}_{slow}_{signal}"
    pvo.name = f"PVO{_props}"
    histogram.name = f"PVOh{_props}"
    signalma.name = f"PVOs{_props}"
    pvo.category = histogram.category = signalma.category = "momentum"

    # Prepare DataFrame to return
    data = {pvo.name: pvo, histogram.name: histogram, signalma.name: signalma}
    df = DataFrame(data)
    df.name = pvo.name
    df.category = pvo.category
    return df
def _gather_frames_column(self, key): results = {} for name, df in self.frames.items(): results[name] = df[key] df = DataFrame(results) df.name = key return df
def get_pandas_df(self):
    """Build a DataFrame of the mass-spectrum rows: label columns first,
    then one column per used atom; named after the output file."""
    all_columns = (self.columns_label
                   + self.get_all_used_atoms_in_order(self.mass_spectrum))
    rows = self.get_list_dict_data(self.mass_spectrum)
    frame = DataFrame(rows, columns=all_columns)
    frame.name = self.output_file
    return frame
def adx(high, low, close, length=None, drift=None, offset=None, **kwargs):
    """Indicator: ADX"""
    # Validate Arguments
    high = verify_series(high)
    low = verify_series(low)
    close = verify_series(close)
    length = length if length and length > 0 else 14
    drift = get_drift(drift)
    offset = get_offset(offset)

    # Calculate Result
    _atr = atr(high=high, low=low, close=close, length=length)

    # Directional movement: keep only the dominant, positive moves.
    up_move = high - high.shift(drift)
    down_move = low.shift(drift) - low
    pos = ((up_move > down_move) & (up_move > 0)) * up_move
    neg = ((down_move > up_move) & (down_move > 0)) * down_move
    pos = pos.apply(zero)
    neg = neg.apply(zero)

    dmp = (100 / _atr) * rma(close=pos, length=length)
    dmn = (100 / _atr) * rma(close=neg, length=length)
    dx = 100 * (dmp - dmn).abs() / (dmp + dmn)
    adx = rma(close=dx, length=length)

    # Offset
    if offset != 0:
        dmp = dmp.shift(offset)
        dmn = dmn.shift(offset)
        adx = adx.shift(offset)

    # Handle fills
    if 'fillna' in kwargs:
        for series in (adx, dmp, dmn):
            series.fillna(kwargs['fillna'], inplace=True)
    if 'fill_method' in kwargs:
        for series in (adx, dmp, dmn):
            series.fillna(method=kwargs['fill_method'], inplace=True)

    # Name and Categorize it
    adx.name = f"ADX_{length}"
    dmp.name = f"DMP_{length}"
    dmn.name = f"DMN_{length}"
    adx.category = dmp.category = dmn.category = 'trend'

    # Prepare DataFrame to return
    adxdf = DataFrame({adx.name: adx, dmp.name: dmp, dmn.name: dmn})
    adxdf.name = f"ADX_{length}"
    adxdf.category = 'trend'
    return adxdf
def donchian(high, low, lower_length=None, upper_length=None, offset=None, **kwargs):
    """Indicator: Donchian Channels (DC)"""
    # Validate arguments
    high = verify_series(high)
    low = verify_series(low)
    lower_length = int(lower_length) if lower_length and lower_length > 0 else 20
    upper_length = int(upper_length) if upper_length and upper_length > 0 else 20

    # min_periods default to the corresponding window length.
    _lmp = kwargs['lower_min_periods'] if 'lower_min_periods' in kwargs else None
    lower_min_periods = int(_lmp) if _lmp is not None else lower_length
    _ump = kwargs['upper_min_periods'] if 'upper_min_periods' in kwargs else None
    upper_min_periods = int(_ump) if _ump is not None else upper_length

    offset = get_offset(offset)

    # Channel: rolling extremes, mid is their average.
    lower = low.rolling(lower_length, min_periods=lower_min_periods).min()
    upper = high.rolling(upper_length, min_periods=upper_min_periods).max()
    mid = 0.5 * (lower + upper)

    # Handle fills
    if 'fillna' in kwargs:
        for series in (lower, mid, upper):
            series.fillna(kwargs['fillna'], inplace=True)
    if 'fill_method' in kwargs:
        for series in (lower, mid, upper):
            series.fillna(method=kwargs['fill_method'], inplace=True)

    # Offset
    if offset != 0:
        lower = lower.shift(offset)
        mid = mid.shift(offset)
        upper = upper.shift(offset)

    # Name and Categorize it
    lower.name = f"DCL_{lower_length}_{upper_length}"
    mid.name = f"DCM_{lower_length}_{upper_length}"
    upper.name = f"DCU_{lower_length}_{upper_length}"
    mid.category = upper.category = lower.category = 'volatility'

    # Prepare DataFrame to return
    dcdf = DataFrame({lower.name: lower, mid.name: mid, upper.name: upper})
    dcdf.name = f"DC_{lower_length}_{upper_length}"
    dcdf.category = 'volatility'
    return dcdf
def amat(close=None, fast=None, slow=None, mamode=None, lookback=None, offset=None, **kwargs):
    """Indicator: Archer Moving Averages Trends (AMAT)"""
    # Validate Arguments
    close = verify_series(close)
    fast = int(fast) if fast and fast > 0 else 8
    slow = int(slow) if slow and slow > 0 else 21
    lookback = int(lookback) if lookback and lookback > 0 else 2
    mamode = mamode.lower() if mamode else "ema"
    offset = get_offset(offset)

    # Select the moving-average function; anything unrecognized falls back
    # to "ema", matching the old if/elif chain's default branch.
    ma_fn = {
        "hma": hma,
        "linreg": linreg,
        "rma": rma,
        "sma": sma,
        "wma": wma,
    }.get(mamode, ema)
    fast_ma = ma_fn(close=close, length=fast, **kwargs)
    slow_ma = ma_fn(close=close, length=slow, **kwargs)

    mas_long = long_run(fast_ma, slow_ma, length=lookback)
    mas_short = short_run(fast_ma, slow_ma, length=lookback)

    # Offset
    if offset != 0:
        mas_long = mas_long.shift(offset)
        mas_short = mas_short.shift(offset)

    # Handle fills
    if "fillna" in kwargs:
        mas_long.fillna(kwargs["fillna"], inplace=True)
        mas_short.fillna(kwargs["fillna"], inplace=True)
    if "fill_method" in kwargs:
        mas_long.fillna(method=kwargs["fill_method"], inplace=True)
        mas_short.fillna(method=kwargs["fill_method"], inplace=True)

    # Prepare DataFrame to return
    amatdf = DataFrame({
        f"AMAT_{mas_long.name}": mas_long,
        f"AMAT_{mas_short.name}": mas_short
    })

    # Name and Categorize it
    amatdf.name = f"AMAT_{mamode.upper()}_{fast}_{slow}_{lookback}"
    amatdf.category = "trend"
    return amatdf
def accbands(high, low, close, length=None, c=None, drift=None, mamode=None, offset=None, **kwargs):
    """Indicator: Acceleration Bands (ACCBANDS)

    Bands are moving averages of high/low scaled by ``c`` times the
    high-low range ratio; the middle band is a moving average of close.

    Args:
        high, low, close: input series.
        length: MA window (default 20).
        c: band-width multiplier (default 4).
        mamode: 'sma' (default) or 'ema'.
        offset: bars to shift the result.

    Kwargs:
        min_periods, fillna, fill_method.
    """
    # Validate arguments
    high = verify_series(high)
    low = verify_series(low)
    close = verify_series(close)
    high_low_range = non_zero_range(high, low)
    length = int(length) if length and length > 0 else 20
    c = float(c) if c and c > 0 else 4
    min_periods = int(kwargs['min_periods']) if 'min_periods' in kwargs and kwargs['min_periods'] is not None else length
    mamode = mamode.lower() if mamode else 'sma'
    drift = get_drift(drift)
    offset = get_offset(offset)

    # Calculate Result
    hl_ratio = high_low_range / (high + low)
    hl_ratio *= c
    _lower = low * (1 - hl_ratio)
    _upper = high * (1 + hl_ratio)

    if mamode == 'ema':
        lower = _lower.ewm(span=length, min_periods=min_periods).mean()
        mid = close.ewm(span=length, min_periods=min_periods).mean()
        upper = _upper.ewm(span=length, min_periods=min_periods).mean()
    else:
        # 'sma' and any unrecognized mamode use a simple MA. Previously an
        # unknown mamode left lower/mid/upper unbound and raised NameError.
        lower = _lower.rolling(length, min_periods=min_periods).mean()
        mid = close.rolling(length, min_periods=min_periods).mean()
        upper = _upper.rolling(length, min_periods=min_periods).mean()

    # Offset
    if offset != 0:
        lower = lower.shift(offset)
        mid = mid.shift(offset)
        upper = upper.shift(offset)

    # Handle fills
    if 'fillna' in kwargs:
        lower.fillna(kwargs['fillna'], inplace=True)
        mid.fillna(kwargs['fillna'], inplace=True)
        upper.fillna(kwargs['fillna'], inplace=True)
    if 'fill_method' in kwargs:
        lower.fillna(method=kwargs['fill_method'], inplace=True)
        mid.fillna(method=kwargs['fill_method'], inplace=True)
        upper.fillna(method=kwargs['fill_method'], inplace=True)

    # Name and Categorize it
    lower.name = f"ACCBL_{length}"
    mid.name = f"ACCBM_{length}"
    upper.name = f"ACCBU_{length}"
    mid.category = upper.category = lower.category = 'volatility'

    # Prepare DataFrame to return
    data = {lower.name: lower, mid.name: mid, upper.name: upper}
    accbandsdf = DataFrame(data)
    accbandsdf.name = f"ACCBANDS_{length}"
    accbandsdf.category = 'volatility'

    return accbandsdf
def cdl_z(open_, high, low, close, length=None, full=None, ddof=None, offset=None, **kwargs):
    """Candle Type: Z Score

    Z-scores each OHLC series over a rolling window of ``length`` bars,
    or over the whole series when ``full`` is truthy.
    """
    # Validate Arguments
    length = int(length) if length and length > 0 else 30
    ddof = int(ddof) if ddof and ddof >= 0 and ddof < length else 1
    open_ = verify_series(open_, length)
    high = verify_series(high, length)
    low = verify_series(low, length)
    close = verify_series(close, length)
    offset = get_offset(offset)
    full = bool(full) if full is not None and full else False

    if open_ is None or high is None or low is None or close is None:
        return

    # Calculate Result: full mode widens the window to the whole series
    if full:
        length = close.size

    _props = "a" if full else f"_{length}_{ddof}"
    df = DataFrame({
        f"open_Z{_props}": zscore(open_, length=length, ddof=ddof),
        f"high_Z{_props}": zscore(high, length=length, ddof=ddof),
        f"low_Z{_props}": zscore(low, length=length, ddof=ddof),
        f"close_Z{_props}": zscore(close, length=length, ddof=ddof),
    })

    if full:
        df.fillna(method="backfill", axis=0, inplace=True)

    # Offset
    if offset != 0:
        df = df.shift(offset)

    # Handle fills
    if "fillna" in kwargs:
        df.fillna(kwargs["fillna"], inplace=True)
    if "fill_method" in kwargs:
        df.fillna(method=kwargs["fill_method"], inplace=True)

    # Name and Categorize it
    df.name = f"CDL_Z{_props}"
    df.category = "candles"

    return df
def stoch(high, low, close, k=None, d=None, smooth_k=None, offset=None, **kwargs):
    """Indicator: Stochastic Oscillator (STOCH)

    %K is the close's position within the k-bar high/low range (smoothed
    by ``smooth_k``); %D is an SMA of %K.
    """
    # Validate arguments
    k = k if k and k > 0 else 14
    d = d if d and d > 0 else 3
    smooth_k = smooth_k if smooth_k and smooth_k > 0 else 3
    _length = max(k, d, smooth_k)
    high = verify_series(high, _length)
    low = verify_series(low, _length)
    close = verify_series(close, _length)
    offset = get_offset(offset)

    if high is None or low is None or close is None:
        return

    # Calculate Result
    ll = low.rolling(k).min()
    hh = high.rolling(k).max()
    raw = 100 * (close - ll) / non_zero_range(hh, ll)

    stoch_k = sma(raw, length=smooth_k)
    stoch_d = sma(stoch_k, length=d)

    # Offset
    if offset != 0:
        stoch_k = stoch_k.shift(offset)
        stoch_d = stoch_d.shift(offset)

    # Handle fills
    if "fillna" in kwargs:
        stoch_k.fillna(kwargs["fillna"], inplace=True)
        stoch_d.fillna(kwargs["fillna"], inplace=True)
    if "fill_method" in kwargs:
        stoch_k.fillna(method=kwargs["fill_method"], inplace=True)
        stoch_d.fillna(method=kwargs["fill_method"], inplace=True)

    # Name and Categorize it
    _name = "STOCH"
    _props = f"_{k}_{d}_{smooth_k}"
    stoch_k.name = f"{_name}k{_props}"
    stoch_d.name = f"{_name}d{_props}"
    stoch_k.category = stoch_d.category = "momentum"

    # Prepare DataFrame to return
    df = DataFrame({stoch_k.name: stoch_k, stoch_d.name: stoch_d})
    df.name = f"{_name}{_props}"
    df.category = stoch_k.category

    return df
def kvo(high, low, close, volume, fast=None, slow=None, length_sig=None, mamode=None, drift=None, offset=None, **kwargs): """Indicator: Klinger Volume Oscillator (KVO)""" # Validate arguments fast = int(fast) if fast and fast > 0 else 34 slow = int(slow) if slow and slow > 0 else 55 length_sig = int(length_sig) if length_sig and length_sig > 0 else 13 mamode = mamode.lower() if mamode and isinstance(mamode, str) else "ema" _length = max(fast, slow, length_sig) high = verify_series(high, _length) low = verify_series(low, _length) close = verify_series(close, _length) volume = verify_series(volume, _length) drift = get_drift(drift) offset = get_offset(offset) if high is None or low is None or close is None or volume is None: return # Calculate Result mom = hlc3(high, low, close).diff(drift) trend = npWhere(mom > 0, 1, 0) + npWhere(mom < 0, -1, 0) dm = non_zero_range(high, low) m = high.size cm = [0] * m for i in range(1, m): cm[i] = (cm[i - 1] + dm[i]) if trend[i] == trend[i - 1] else (dm[i - 1] + dm[i]) vf = 100 * volume * trend * abs(2 * dm / cm - 1) kvo = ma(mamode, vf, length=fast) - ma(mamode, vf, length=slow) kvo_signal = ma(mamode, kvo, length=length_sig) # Offset if offset != 0: kvo = kvo.shift(offset) kvo_signal = kvo_signal.shift(offset) # Handle fills if "fillna" in kwargs: kvo.fillna(kwargs["fillna"], inplace=True) kvo_signal.fillna(kwargs["fillna"], inplace=True) if "fill_method" in kwargs: kvo.fillna(method=kwargs["fill_method"], inplace=True) kvo_signal.fillna(method=kwargs["fill_method"], inplace=True) # Name and Categorize it kvo.name = f"KVO_{fast}_{slow}" kvo_signal.name = f"KVOSig_{length_sig}" kvo.category = kvo_signal.category = "volume" # Prepare DataFrame to return data = {kvo.name: kvo, kvo_signal.name: kvo_signal} kvoandsig = DataFrame(data) kvoandsig.name = f"KVO_{fast}_{slow}_{length_sig}" kvoandsig.category = kvo.category return kvoandsig
def test_get_columns(self):
    """_get_column_names should stringify int and float column/Series names."""
    frame = DataFrame([[1, 0], [2, 0], [3, 0]], columns = [1, 2])
    self.assertEqual(["1", "2"], _get_column_names(frame).tolist())
    frame.columns = numpy.asarray([1.0, 2.0])
    self.assertEqual(["1.0", "2.0"], _get_column_names(frame).tolist())
    series = Series([1, 2, 3], name = 1)
    self.assertEqual("1", _get_column_names(series).tolist())
    series.name = 1.0
    self.assertEqual("1.0", _get_column_names(series).tolist())
def _init_dataframe(self, df, name=None):
    """Split *df* column-by-column into single-column frames in ``self.frames``.

    Each stored frame holds one original column under the key *name*
    (falling back to ``df.name``), and is tagged with the source column
    via its ``.name`` attribute.

    Raises:
        Exception: when no name is given and *df* has no usable ``name``.
    """
    # Resolve and validate the name BEFORE mutating any state, so a
    # failure leaves ``self`` untouched (the original set self.columns
    # first, and df.name could raise AttributeError prematurely).
    name = name or getattr(df, "name", None)
    if name is None:
        raise Exception("need a name for df")
    self.columns = [name]
    # .items() replaces .iteritems(), which was removed in pandas 2.0.
    for col, series in df.items():
        frame = DataFrame({name: series})
        frame.name = col
        self.frames[col] = frame
def _gather_column(self, key):
    """Collect column *key* from every frame in ``self.frames``.

    Returns a DataFrame whose columns are the frame names, memoized in
    ``self._cache`` so repeated lookups return the same object.
    """
    if key in self._cache:
        return self._cache[key]
    # .items() replaces dict.iteritems(), which does not exist on
    # Python 3 and made this method raise AttributeError there.
    columns = {name: frame[key] for name, frame in self.frames.items()}
    result = DataFrame(columns)
    result.name = key
    self._cache[key] = result
    return result
def test_common_start_returns(self, before, after, mean_by_date, demeaned, expected_vals):
    """Each ticker's price compounds daily at its own rate; the factor is a
    constant cross-sectional ranking over a shorter date range."""
    dr = date_range(start='2015-1-17', end='2015-2-2')
    dr.name = 'date'
    tickers = ['A', 'B', 'C', 'D']
    rates = (1.20, 1.40, 0.90, 0.80)
    # Row for day i (1-based) is [r**i for each ticker's rate] — identical
    # to the original hand-written 17x4 table.
    prices = DataFrame(index=dr, columns=tickers,
                       data=[[r ** day for r in rates]
                             for day in range(1, len(dr) + 1)])
    dr2 = date_range(start='2015-1-21', end='2015-1-29')
    factor = DataFrame(index=dr2, columns=tickers,
                       data=[[3, 4, 2, 1]] * len(dr2)).stack()
    factor.index = factor.index.set_names(['date', 'asset'])
    factor.name = 'factor'

    cmrt = common_start_returns(
        factor, prices, before, after, False, mean_by_date,
        factor if demeaned else None)
    cmrt = DataFrame({'mean': cmrt.mean(axis=1), 'std': cmrt.std(axis=1)})
    expected = DataFrame(index=range(-before, after + 1),
                         columns=['mean', 'std'], data=expected_vals)
    assert_frame_equal(cmrt, expected)
def compute_one(t, df, **kwargs):
    """Compute a grouped aggregation expression *t* against DataFrame *df*.

    Resolves the grouper, pre-computes the values being aggregated, groups,
    aggregates, and reshapes the result back into *t*'s column layout.

    NOTE(review): if ``t.grouper`` matches neither branch below, ``grouper``
    is unbound and the groupby raises NameError; likewise ``result`` when
    ``t.apply`` is neither a Summary nor a Reduction — presumably callers
    guarantee these cases; confirm upstream.
    """
    # Resolve what to group by: either a computed Series, or plain column
    # names when the grouper is a projection of the same child table.
    if t.grouper.iscolumn:
        grouper = compute(t.grouper, {t.child: df})  # a Series
    elif isinstance(t.grouper, Projection) and t.grouper.child is t.child:
        grouper = t.grouper.columns  # list of column names

    if isinstance(t.apply, Summary):
        # Multiple named reductions: precompute each value column, then
        # aggregate each with its own reduction symbol.
        names = t.apply.names
        preapply = DataFrame(dict(zip(
            names,
            [compute(v.child, {t.child: df}) for v in t.apply.values])))
        df2 = concat_nodup(df, preapply)
        groups = df2.groupby(grouper)
        d = defaultdict(list)
        for name, v in zip(names, t.apply.values):
            d[name].append(getattr(Series, v.symbol))
        result = groups.agg(dict(d))
        # Rearrange columns to match names order
        # (the lambda's ``t`` intentionally shadows the outer ``t``; it is
        # a (column, agg) tuple here)
        result = result[sorted(list(result.columns),
                               key=lambda t: names.index(t[0]))]
        result.columns = t.apply.names  # flatten down multiindex

    if isinstance(t.apply, Reduction):
        names = t.apply.dshape[0].names
        preapply = compute(t.apply.child, {t.child: df})

        # Pandas and Blaze column naming schemes differ
        # Coerce DataFrame column names to match Blaze's names
        preapply = preapply.copy()
        if isinstance(preapply, Series):
            preapply.name = names[0]
        else:
            preapply.columns = names
        df2 = concat_nodup(df, preapply)

        # Select a single column or a column list before reducing.
        if t.apply.child.iscolumn:
            groups = df2.groupby(grouper)[names[0]]
        else:
            groups = df2.groupby(grouper)[names]

        result = compute_one(t.apply, groups)  # do reduction

    # Move the group keys out of the index and restore t's column names.
    result = DataFrame(result).reset_index()
    result.columns = t.columns
    return result
def collector2df(collector, station, sos_name):
    """Request CSV response from SOS and convert to a Pandas DataFrame.

    The station's long name is attached to the frame as ``.name``; an
    ExceptionReport yields an empty frame instead of raising.
    """
    collector.features = [station]
    collector.variables = [sos_name]
    long_name = get_station_longName(station)
    try:
        raw_csv = collector.raw(responseFormat="text/csv")
        data_df = read_csv(BytesIO(raw_csv.encode("utf-8")),
                           index_col="date_time", parse_dates=True)
    except ExceptionReport as e:
        # warn("Station %s is not NAVD datum. %s" % (long_name, e))
        print(str(e))
        data_df = DataFrame()  # Assign an empty DataFrame for now.
    data_df.name = long_name
    return data_df
def ForITOL(H):
    """Build iTOL (Interactive Tree Of Life) annotation tables from H.

    Bins the per-branch turnover of H["MIByBranch"]["I(Ti,G)"] into NBINS
    color-coded ranges, adds a significance clade table, and a per-leaf
    histogram table. Returns (XITOL, HIST, H, bin category labels).

    NOTE(review): this is Python-2-era pandas code — ``range`` objects are
    summed as lists, ``DataFrame.append`` and ``MultiIndex(labels=...)``
    are used — it will not run unmodified on Python 3 / modern pandas.
    """
    NBINS=10
    # Bin turnover values into NBINS categories (and keep the bin edges).
    values, bins=cut(H["MIByBranch"]["I(Ti,G)"].TurnOver,bins=NBINS, retbins=True)
    try:
        from matplotlib import pyplot as plt
        import matplotlib
    except ImportError:
        # Without matplotlib, fall back to a hard-coded 10-color YlOrRd ramp.
        if NBINS==10:
            zz=Series(["#FFF1A9", "#FEE187", "#FECA66", "#FEAB49", "#FD8C3C","#FC5B2E","#ED2E21", "#D41020", "#B00026", "#800026"], index=values.cat.categories)
        else:
            raise ImportError
    else:
        # With matplotlib, sample NBINS colors from the YlOrRd colormap.
        cm = plt.get_cmap('YlOrRd')
        z=arange(1,(NBINS+1),1)/float(NBINS)
        zz=Series([matplotlib.colors.rgb2hex(x).upper() for x in cm(z)], index=values.cat.categories)
    # One "range"-mode iTOL row per branch, colored by its turnover bin.
    XITOL=DataFrame({"branch name":list(values.index.get_level_values("Name")), "mode":"range","label":list(values.values), "color":zz[values]})
    H["MIByBranch"].loc[:,("I(Ti,G)","Color")]=zz[values].values
    # Move the new Color column next to the first four columns.
    L=len(H["MIByBranch"].columns)
    H["MIByBranch"]=H["MIByBranch"].iloc[:,sum([range(4),[L-1],range(4,L-1)],[])]
    XITOL.set_index("branch name",inplace=True)
    # Reformat bin labels like "(a, b]" into "a_to_b".
    XITOL["label"]=["_to_".join(x.split(", "))[1:-1] for x in XITOL["label"]]
    # Clade rows: significance flag from the multiple-testing column.
    label=numpy.array(["NotSignificant","Significant"])[(H["MIByBranch"]["I(Ti,G)"].MultTest*1).values]
    color=numpy.array(["#000000","#00FFFF"])[(H["MIByBranch"]["I(Ti,G)"].MultTest*1).values]
    XITOLbis=DataFrame({"mode":"clade","label":label, "color":color}, index=list(values.index.get_level_values("Name")) )
    XITOLbis.name="branch name"
    XITOL=XITOL.append(XITOLbis)
    XITOL=XITOL[[ "mode", "color","label"]]
    # Histogram table: per-leaf group frequencies with color-paired columns.
    Pie=H["MIByBranch"]["By Group Relative Frequency"].query("Is_Leaf==True")
    color=spacedColors(Pie.shape[1])
    Pie.index=Pie.index.get_level_values("Name")
    Pie.columns=MultiIndex(levels=[[Pie.columns],[color]],labels=[range(Pie.shape[1])]*2, names=["LABELS","COLORS"])
    Pie.index.name=""
    #Transform in integer to do not upset ITOL
    HIST=(Pie*H["counts"].index.get_level_values("Total Counts")[0]).astype(int)
    return XITOL, HIST,H,values.cat.categories.tolist()
def collector2df(collector, station, sos_name, provider='COOPS'):
    """Request CSV response from SOS and convert to a Pandas DataFrame.

    Adds an 'Observed Data' alias of the sea-water-temperature column.
    On an ExceptionReport an empty frame is returned instead; either way
    the station's long name is attached as ``.name``.
    """
    collector.features = [station]
    collector.variables = [sos_name]
    long_name = get_station_longName(station, provider)
    try:
        raw_csv = collector.raw(responseFormat="text/csv")
        data_df = read_csv(BytesIO(raw_csv.encode('utf-8')),
                           index_col='date_time', parse_dates=True)
        data_df['Observed Data'] = data_df['sea_water_temperature (C)']
    except ExceptionReport as e:
        # warn("Station %s is not NAVD datum. %s" % (long_name, e))
        print(str(e))
        data_df = DataFrame()  # Assign an empty DataFrame for now.
    data_df.name = long_name
    return data_df
def coops2df(collector, coops_id, sos_name):
    """Request CSV response from SOS and convert to a Pandas DataFrame.

    Adds an 'Observed Data' alias of the wind-speed column. On an
    ExceptionReport a warning is emitted and an empty frame returned;
    either way the station's long name is attached as ``.name``.
    """
    collector.features = [coops_id]
    collector.variables = [sos_name]
    long_name = get_Coops_longName(coops_id)
    try:
        raw_csv = collector.raw(responseFormat="text/csv")
        data_df = read_csv(BytesIO(raw_csv.encode('utf-8')),
                           index_col='date_time', parse_dates=True)
        data_df['Observed Data'] = data_df['wind_speed (m/s)']
    except ExceptionReport as e:
        warn("Station %s is not NAVD datum. %s" % (long_name, e))
        data_df = DataFrame()  # Assign an empty DataFrame for now.
    data_df.name = long_name
    return data_df
def editor(interrogation, operation=None, denominator=False, sort_by=False, keep_stats=False, keep_top=False, just_totals=False, threshold='medium', just_entries=False, skip_entries=False, merge_entries=False, just_subcorpora=False, skip_subcorpora=False, span_subcorpora=False, merge_subcorpora=False, replace_names=False, replace_subcorpus_names=False, projection=False, remove_above_p=False, p=0.05, print_info=False, spelling=False, selfdrop=True, calc_all=True, keyword_measure='ll', **kwargs ): """ See corpkit.interrogation.Interrogation.edit() for docstring """ # grab arguments, in case we get dict input and have to iterate locs = locals() import corpkit import re import collections import pandas as pd import numpy as np from pandas import DataFrame, Series from time import localtime, strftime try: get_ipython().getoutput() except TypeError: have_ipython = True except NameError: have_ipython = False try: from IPython.display import display, clear_output except ImportError: pass # to use if we also need to worry about concordance lines return_conc = False from corpkit.interrogation import Interrodict, Interrogation, Concordance if interrogation.__class__ == Interrodict: locs.pop('interrogation', None) from collections import OrderedDict outdict = OrderedDict() for i, (k, v) in enumerate(interrogation.items()): # only print the first time around if i != 0: locs['print_info'] = False if isinstance(denominator, STRINGTYPE) and denominator.lower() == 'self': denominator = interrogation # if df2 is also a dict, get the relevant entry if isinstance(denominator, (dict, Interrodict)): #if sorted(set([i.lower() for i in list(dataframe1.keys())])) == \ # sorted(set([i.lower() for i in list(denominator.keys())])): # locs['denominator'] = denominator[k] # fix: this repeats itself for every key, when it doesn't need to # denominator_sum: if kwargs.get('denominator_sum'): locs['denominator'] = denominator.collapse(axis='key') if kwargs.get('denominator_totals'): 
locs['denominator'] = denominator[k].totals else: locs['denominator'] = denominator[k].results outdict[k] = v.results.edit(**locs) if print_info: thetime = strftime("%H:%M:%S", localtime()) print("\n%s: Finished! Output is a dictionary with keys:\n\n '%s'\n" % (thetime, "'\n '".join(sorted(outdict.keys())))) return Interrodict(outdict) elif isinstance(interrogation, (DataFrame, Series)): dataframe1 = interrogation elif isinstance(interrogation, Interrogation): #if interrogation.__dict__.get('concordance', None) is not None: # concordances = interrogation.concordance branch = kwargs.pop('branch', 'results') if branch.lower().startswith('r') : dataframe1 = interrogation.results elif branch.lower().startswith('t'): dataframe1 = interrogation.totals elif branch.lower().startswith('c'): dataframe1 = interrogation.concordance return_conc = True else: dataframe1 = interrogation.results elif isinstance(interrogation, Concordance) or \ all(x in list(dataframe1.columns) for x in [ 'l', 'm', 'r']): return_conc = True print('heree') dataframe1 = interrogation # hope for the best else: dataframe1 = interrogation the_time_started = strftime("%Y-%m-%d %H:%M:%S") pd.options.mode.chained_assignment = None try: from process import checkstack except ImportError: from corpkit.process import checkstack if checkstack('pythontex'): print_info=False def combiney(df, df2, operation='%', threshold='medium', prinf=True): """mash df and df2 together in appropriate way""" totals = False # delete under threshold if just_totals: if using_totals: if not single_totals: to_drop = list(df2[df2['Combined total'] < threshold].index) df = df.drop([e for e in to_drop if e in list(df.index)]) if prinf: to_show = [] [to_show.append(w) for w in to_drop[:5]] if len(to_drop) > 10: to_show.append('...') [to_show.append(w) for w in to_drop[-5:]] if len(to_drop) > 0: print('Removing %d entries below threshold:\n %s' % (len(to_drop), '\n '.join(to_show))) if len(to_drop) > 10: print('... and %d more ... 
\n' % (len(to_drop) - len(to_show) + 1)) else: print('') else: denom = df2 else: denom = list(df2) if single_totals: if operation == '%': totals = df.sum() * 100.0 / float(df.sum().sum()) df = df * 100.0 try: df = df.div(denom, axis=0) except ValueError: thetime = strftime("%H:%M:%S", localtime()) print('%s: cannot combine DataFrame 1 and 2: different shapes' % thetime) elif operation == '+': try: df = df.add(denom, axis=0) except ValueError: thetime = strftime("%H:%M:%S", localtime()) print('%s: cannot combine DataFrame 1 and 2: different shapes' % thetime) elif operation == '-': try: df = df.sub(denom, axis=0) except ValueError: thetime = strftime("%H:%M:%S", localtime()) print('%s: cannot combine DataFrame 1 and 2: different shapes' % thetime) elif operation == '*': totals = df.sum() * float(df.sum().sum()) try: df = df.mul(denom, axis=0) except ValueError: thetime = strftime("%H:%M:%S", localtime()) print('%s: cannot combine DataFrame 1 and 2: different shapes' % thetime) elif operation == '/': try: totals = df.sum() / float(df.sum().sum()) df = df.div(denom, axis=0) except ValueError: thetime = strftime("%H:%M:%S", localtime()) print('%s: cannot combine DataFrame 1 and 2: different shapes' % thetime) elif operation == 'a': for c in [c for c in list(df.columns) if int(c) > 1]: df[c] = df[c] * (1.0 / int(c)) df = df.sum(axis=1) / df2 elif operation.startswith('c'): import warnings with warnings.catch_warnings(): warnings.simplefilter("ignore") df = pandas.concat([df, df2], axis=1) return df, totals elif not single_totals: if not operation.startswith('a'): # generate totals if operation == '%': totals = df.sum() * 100.0 / float(df2.sum().sum()) if operation == '*': totals = df.sum() * float(df2.sum().sum()) if operation == '/': totals = df.sum() / float(df2.sum().sum()) if operation.startswith('c'): # add here the info that merging will not work # with identical colnames import warnings with warnings.catch_warnings(): warnings.simplefilter("ignore") d = 
pd.concat([df.T, df2.T]) # make index nums d = d.reset_index() # sum and remove duplicates d = d.groupby('index').sum() dx = d.reset_index('index') dx.index = list(dx['index']) df = dx.drop('index', axis=1).T def editf(datum): meth = {'%': datum.div, '*': datum.mul, '/': datum.div, '+': datum.add, '-': datum.sub} if datum.name in list(df2.columns): method = meth[operation] mathed = method(df2[datum.name], fill_value=0.0) if operation == '%': return mathed * 100.0 else: return mathed else: return datum * 0.0 df = df.apply(editf) else: for c in [c for c in list(df.columns) if int(c) > 1]: df[c] = df[c] * (1.0 / int(c)) df = df.sum(axis=1) / df2.T.sum() return df, totals def parse_input(df, the_input): """turn whatever has been passed in into list of words that can be used as pandas indices---maybe a bad way to go about it""" parsed_input = False import re if the_input == 'all': the_input = r'.*' if isinstance(the_input, int): try: the_input = str(the_input) except: pass the_input = [the_input] elif isinstance(the_input, STRINGTYPE): regex = re.compile(the_input) parsed_input = [w for w in list(df) if re.search(regex, w)] return parsed_input from corpkit.dictionaries.process_types import Wordlist if isinstance(the_input, Wordlist) or the_input.__class__ == Wordlist: the_input = list(the_input) if isinstance(the_input, list): if isinstance(the_input[0], int): parsed_input = [word for index, word in enumerate(list(df)) if index in the_input] elif isinstance(the_input[0], STRINGTYPE): try: parsed_input = [word for word in the_input if word in df.columns] except AttributeError: # if series parsed_input = [word for word in the_input if word in df.index] return parsed_input def synonymise(df, pos='n'): """pass a df and a pos and convert df columns to most common synonyms""" from nltk.corpus import wordnet as wn #from dictionaries.taxonomies import taxonomies from collections import Counter fixed = [] for w in list(df.columns): try: syns = [] for syns in wn.synsets(w, 
pos=pos): for w in syns: synonyms.append(w) top_syn = Counter(syns).most_common(1)[0][0] fixed.append(top_syn) except: fixed.append(w) df.columns = fixed return df def convert_spell(df, convert_to='US', print_info=print_info): """turn dataframes into us/uk spelling""" from dictionaries.word_transforms import usa_convert if print_info: print('Converting spelling ... \n') if convert_to == 'UK': usa_convert = {v: k for k, v in list(usa_convert.items())} fixed = [] for val in list(df.columns): try: fixed.append(usa_convert[val]) except: fixed.append(val) df.columns = fixed return df def merge_duplicates(df, print_info=print_info): if print_info: print('Merging duplicate entries ... \n') # now we have to merge all duplicates for dup in df.columns.get_duplicates(): #num_dupes = len(list(df[dup].columns)) temp = df[dup].sum(axis=1) #df = df.drop([dup for d in range(num_dupes)], axis=1) df = df.drop(dup, axis=1) df[dup] = temp return df def name_replacer(df, replace_names, print_info=print_info): """replace entry names and merge""" import re # get input into list of tuples # if it's a string, we want to delete it if isinstance(replace_names, STRINGTYPE): replace_names = [(replace_names, '')] # this is for some malformed list if not isinstance(replace_names, dict): if isinstance(replace_names[0], STRINGTYPE): replace_names = [replace_names] # if dict, make into list of tupes if isinstance(replace_names, dict): replace_names = [(v, k) for k, v in replace_names.items()] for to_find, replacement in replace_names: if print_info: if replacement: print('Replacing "%s" with "%s" ...\n' % (to_find, replacement)) else: print('Deleting "%s" from entry names ...\n' % to_find) to_find = re.compile(to_find) if not replacement: replacement = '' df.columns = [re.sub(to_find, replacement, l) for l in list(df.columns)] df = merge_duplicates(df, print_info=False) return df def just_these_entries(df, parsed_input, prinf=True): entries = [word for word in list(df) if word not in parsed_input] 
if prinf: print('Keeping %d entries:\n %s' % \ (len(parsed_input), '\n '.join(parsed_input[:10]))) if len(parsed_input) > 10: print('... and %d more ... \n' % (len(parsed_input) - 10)) else: print('') df = df.drop(entries, axis=1) return df def skip_these_entries(df, parsed_input, prinf=True): if prinf: print('Skipping %d entries:\n %s' % \ (len(parsed_input), '\n '.join(parsed_input[:10]))) if len(parsed_input) > 10: print('... and %d more ... \n' % (len(parsed_input) - 10)) else: print('') df = df.drop(parsed_input, axis=1) return df def newname_getter(df, parsed_input, newname='combine', prinf=True, merging_subcorpora=False): """makes appropriate name for merged entries""" if merging_subcorpora: if newname is False: newname = 'combine' if isinstance(newname, int): the_newname = list(df.columns)[newname] elif isinstance(newname, STRINGTYPE): if newname == 'combine': if len(parsed_input) <= 3: the_newname = '/'.join(parsed_input) elif len(parsed_input) > 3: the_newname = '/'.join(parsed_input[:3]) + '...' else: the_newname = newname if not newname: # revise this code import operator sumdict = {} for item in parsed_input: summed = sum(list(df[item])) sumdict[item] = summed the_newname = max(iter(sumdict.items()), key=operator.itemgetter(1))[0] if not isinstance(the_newname, STRINGTYPE): the_newname = str(the_newname, errors='ignore') return the_newname def merge_these_entries(df, parsed_input, the_newname, prinf=True, merging='entries'): # make new entry with sum of parsed input if len(parsed_input) == 0: import warnings warnings.warn('No %s could be automatically merged.\n' % merging) else: if prinf: print('Merging %d %s as "%s":\n %s' % \ (len(parsed_input), merging, the_newname, '\n '.join(parsed_input[:10]))) if len(parsed_input) > 10: print('... and %d more ... 
\n' % (len(parsed_input) - 10)) else: print('') # remove old entries temp = sum([df[i] for i in parsed_input]) if isinstance(df, Series): df = df.drop(parsed_input, errors='ignore') nms = list(df.index) else: df = df.drop(parsed_input, axis=1, errors='ignore') nms = list(df.columns) if the_newname in nms: df[the_newname] = df[the_newname] + temp else: df[the_newname] = temp return df def just_these_subcorpora(df, lst_of_subcorpora, prinf=True): if isinstance(lst_of_subcorpora[0], int): lst_of_subcorpora = [str(l) for l in lst_of_subcorpora] good_years = [subcorpus for subcorpus in list(df.index) if subcorpus in lst_of_subcorpora] if prinf: print('Keeping %d subcorpora:\n %s' % (len(good_years), '\n '.join(good_years[:10]))) if len(good_years) > 10: print('... and %d more ... \n' % (len(good_years) - 10)) else: print('') df = df.drop([subcorpus for subcorpus in list(df.index) if subcorpus not in good_years], axis=0) return df def skip_these_subcorpora(df, lst_of_subcorpora, prinf=True): if isinstance(lst_of_subcorpora, int): lst_of_subcorpora = [lst_of_subcorpora] if isinstance(lst_of_subcorpora[0], int): lst_of_subcorpora = [str(l) for l in lst_of_subcorpora] bad_years = [subcorpus for subcorpus in list(df.index) if subcorpus in lst_of_subcorpora] if len(bad_years) == 0: import warnings warnings.warn('No subcorpora skipped.\n') else: if prinf: print('Skipping %d subcorpora:\n %s' % (len(bad_years), '\n '.join([str(i) for i in bad_years[:10]]))) if len(bad_years) > 10: print('... and %d more ... 
\n' % (len(bad_years) - 10)) else: print('') df = df.drop([subcorpus for subcorpus in list(df.index) if subcorpus in bad_years], axis=0) return df def span_these_subcorpora(df, lst_of_subcorpora, prinf=True): """select only a span of suborpora (first, last)""" fir, sec = lst_of_subcorpora if len(lst_of_subcorpora) == 0: import warnings warnings.warn('Span not identified.\n') else: if prinf: print('Keeping subcorpora:\n %d--%d\n' % (int(fir), int(sec))) sbs = list(df.index) df = df.ix[sbs.index(fir):sbs.index(sec) + 1] return df def projector(df, list_of_tuples, prinf=True): """project abs values""" if isinstance(list_of_tuples, list): tdict = {} for a, b in list_of_tuples: tdict[a] = b list_of_tuples = tdict for subcorpus, projection_value in list(list_of_tuples.items()): if isinstance(subcorpus, int): subcorpus = str(subcorpus) df.ix[subcorpus] = df.ix[subcorpus] * projection_value if prinf: if isinstance(projection_value, float): print('Projection: %s * %s' % (subcorpus, projection_value)) if isinstance(projection_value, int): print('Projection: %s * %d' % (subcorpus, projection_value)) if prinf: print('') return df def do_stats(df): """do linregress and add to df""" try: from scipy.stats import linregress except ImportError: thetime = strftime("%H:%M:%S", localtime()) print('%s: sort type not available in this verion of corpkit.' 
% thetime) return False indices = list(df.index) first_year = list(df.index)[0] try: x = [int(y) - int(first_year) for y in indices] except ValueError: x = list(range(len(indices))) statfields = ['slope', 'intercept', 'r', 'p', 'stderr'] stats = [] if isinstance(df, Series): y = list(df.values) sl = Series(list(linregress(x, y)), index=statfields) else: for entry in list(df.columns): y = list(df[entry]) stats.append(list(linregress(x, y))) sl = DataFrame(zip(*stats), index=statfields, columns=list(df.columns)) df = df.append(sl) # drop infinites and nans df = df.replace([np.inf, -np.inf], np.nan) df = df.fillna(0.0) return df def resort(df, sort_by = False, keep_stats = False): """sort results, potentially using scipy's linregress""" # translate options and make sure they are parseable stat_field = ['slope', 'intercept', 'r', 'p', 'stderr'] easy_sorts = ['total', 'infreq', 'name', 'most', 'least'] stat_sorts = ['increase', 'decrease', 'static', 'turbulent'] options = stat_field + easy_sorts + stat_sorts sort_by_convert = {'most': 'total', True: 'total', 'least': 'infreq'} sort_by = sort_by_convert.get(sort_by, sort_by) # probably broken :( if just_totals: if sort_by == 'name': return df.sort_index() else: return df.sort_values(by='Combined total', ascending=sort_by != 'total', axis=1) stats_done = False if keep_stats or sort_by in stat_field + stat_sorts: df = do_stats(df) stats_done = True if isinstance(df, bool): if df is False: return False if isinstance(df, Series): if stats_done: stats = df.ix[range(-5, 0)] df = df.drop(list(stats.index)) if sort_by == 'name': df = df.sort_index() else: df = df.sort_values(ascending=sort_by != 'total') if stats_done: df = df.append(stats) return df if sort_by == 'name': # currently case sensitive df = df.reindex_axis(sorted(df.columns), axis=1) elif sort_by in ['total', 'infreq']: if df1_istotals: df = df.T df = df[list(df.sum().sort_values(ascending=sort_by != 'total').index)] # sort by slope etc., or search by subcorpus name 
if sort_by in stat_field or sort_by not in options: asc = kwargs.get('reverse', False) df = df.T.sort_values(by=sort_by, ascending=asc).T if sort_by in ['increase', 'decrease', 'static', 'turbulent']: slopes = df.ix['slope'] if sort_by == 'increase': df = df[slopes.argsort()[::-1]] elif sort_by == 'decrease': df = df[slopes.argsort()] elif sort_by == 'static': df = df[slopes.abs().argsort()] elif sort_by == 'turbulent': df = df[slopes.abs().argsort()[::-1]] if remove_above_p: df = df.T df = df[df['p'] <= p] df = df.T # remove stats field by default if not keep_stats: df = df.drop(stat_field, axis=0, errors='ignore') return df def set_threshold(big_list, threshold, prinf=True): if isinstance(threshold, STRINGTYPE): if threshold.startswith('l'): denominator = 10000 if threshold.startswith('m'): denominator = 5000 if threshold.startswith('h'): denominator = 2500 if isinstance(big_list, DataFrame): tot = big_list.sum().sum() if isinstance(big_list, Series): tot = big_list.sum() tshld = float(tot) / float(denominator) else: tshld = threshold if prinf: print('Threshold: %d\n' % tshld) return tshld # copy dataframe to be very safe df = dataframe1.copy() # make cols into strings try: df.columns = [str(c) for c in list(df.columns)] except: pass if operation is None: operation = 'None' if isinstance(interrogation, Concordance): return_conc = True # do concordance work if return_conc: if just_entries: if isinstance(just_entries, int): just_entries = [just_entries] if isinstance(just_entries, STRINGTYPE): df = df[df['m'].str.contains(just_entries)] if isinstance(just_entries, list): if all(isinstance(e, STRINGTYPE) for e in just_entries): mp = df['m'].map(lambda x: x in just_entries) df = df[mp] else: df = df.ix[just_entries] if skip_entries: if isinstance(skip_entries, int): skip_entries = [skip_entries] if isinstance(skip_entries, STRINGTYPE): df = df[~df['m'].str.contains(skip_entries)] if isinstance(skip_entries, list): if all(isinstance(e, STRINGTYPE) for e in 
skip_entries): mp = df['m'].map(lambda x: x not in skip_entries) df = df[mp] else: df = df.drop(skip_entries, axis=0) if just_subcorpora: if isinstance(just_subcorpora, int): just_subcorpora = [just_subcorpora] if isinstance(just_subcorpora, STRINGTYPE): df = df[df['c'].str.contains(just_subcorpora)] if isinstance(just_subcorpora, list): if all(isinstance(e, STRINGTYPE) for e in just_subcorpora): mp = df['c'].map(lambda x: x in just_subcorpora) df = df[mp] else: df = df.ix[just_subcorpora] if skip_subcorpora: if isinstance(skip_subcorpora, int): skip_subcorpora = [skip_subcorpora] if isinstance(skip_subcorpora, STRINGTYPE): df = df[~df['c'].str.contains(skip_subcorpora)] if isinstance(skip_subcorpora, list): if all(isinstance(e, STRINGTYPE) for e in skip_subcorpora): mp = df['c'].map(lambda x: x not in skip_subcorpora) df = df[mp] else: df = df.drop(skip_subcorpora, axis=0) return Concordance(df) if print_info: print('\n***Processing results***\n========================\n') df1_istotals = False if isinstance(df, Series): df1_istotals = True df = DataFrame(df) # if just a single result else: df = DataFrame(df) if operation.startswith('k'): if sort_by is False: if not df1_istotals: sort_by = 'turbulent' if df1_istotals: df = df.T # figure out if there's a second list # copy and remove totals if there is single_totals = True using_totals = False outputmode = False if denominator.__class__ == Interrogation: try: denominator = denominator.results except AttributeError: denominator = denominator.totals if denominator is not False and not isinstance(denominator, STRINGTYPE): df2 = denominator.copy() using_totals = True if isinstance(df2, DataFrame): if len(df2.columns) > 1: single_totals = False else: df2 = Series(df2) elif isinstance(df2, Series): single_totals = True #if operation == 'k': #raise ValueError('Keywording requires a DataFrame for denominator. 
Use "self"?') else: if operation in ['k', 'a', '%', '/', '*', '-', '+']: denominator = 'self' if denominator == 'self': outputmode = True if operation.startswith('a') or operation.startswith('A'): if list(df.columns)[0] != '0' and list(df.columns)[0] != 0: df = df.T if using_totals: if not single_totals: df2 = df2.T if projection: # projection shouldn't do anything when working with '%', remember. df = projector(df, projection) if using_totals: df2 = projector(df2, projection) if spelling: df = convert_spell(df, convert_to=spelling) df = merge_duplicates(df, print_info=False) if not single_totals: df2 = convert_spell(df2, convert_to=spelling, print_info=False) df2 = merge_duplicates(df2, print_info=False) if not df1_istotals: sort_by = 'total' if replace_names: df = name_replacer(df, replace_names) df = merge_duplicates(df) if not single_totals: df2 = name_replacer(df2, replace_names, print_info=False) df2 = merge_duplicates(df2, print_info=False) if not sort_by: sort_by = 'total' if replace_subcorpus_names: df = name_replacer(df.T, replace_subcorpus_names) df = merge_duplicates(df).T df = df.sort_index() if not single_totals: if isinstance(df2, DataFrame): df2 = df2.T df2 = name_replacer(df2, replace_subcorpus_names, print_info=False) df2 = merge_duplicates(df2, print_info=False) if isinstance(df2, DataFrame): df2 = df2.T df2 = df2.sort_index() if not sort_by: sort_by = 'total' # remove old stats if they're there: statfields = ['slope', 'intercept', 'r', 'p', 'stderr'] try: df = df.drop(statfields, axis=0) except: pass if using_totals: try: df2 = df2.drop(statfields, axis=0) except: pass # remove totals and tkinter order for name, ax in zip(['Total'] * 2 + ['tkintertable-order'] * 2, [0, 1, 0, 1]): if name == 'Total' and df1_istotals: continue try: df = df.drop(name, axis=ax, errors='ignore') except: pass for name, ax in zip(['Total'] * 2 + ['tkintertable-order'] * 2, [0, 1, 0, 1]): if name == 'Total' and single_totals: continue try: df2 = df2.drop(name, axis=ax, 
errors='ignore') except: pass # merging: make dicts if they aren't already, so we can iterate if merge_entries: if not isinstance(merge_entries, list): if isinstance(merge_entries, STRINGTYPE): merge_entries = {'combine': merge_entries} # for newname, criteria for name, the_input in sorted(merge_entries.items()): pin = parse_input(df, the_input) the_newname = newname_getter(df, pin, newname=name, prinf=print_info) df = merge_these_entries(df, pin, the_newname, prinf=print_info) if not single_totals: pin2 = parse_input(df2, the_input) df2 = merge_these_entries(df2, pin2, the_newname, prinf=False) else: for i in merge_entries: pin = parse_input(df, merge_entries) the_newname = newname_getter(df, pin, prinf=print_info) df = merge_these_entries(df, pin, the_newname, prinf=print_info) if not single_totals: pin2 = parse_input(df2, merge_entries) df2 = merge_these_entries(df2, pin2, the_newname, prinf=False) if merge_subcorpora: if not isinstance(merge_subcorpora, dict): if isinstance(merge_subcorpora, list): if isinstance(merge_subcorpora[0], tuple): merge_subcorpora = {x: y for x, y in merge_subcorpora} elif isinstance(merge_subcorpora[0], STRINGTYPE): merge_subcorpora = {'combine': [x for x in merge_subcorpora]} elif isinstance(merge_subcorpora[0], int): merge_subcorpora = {'combine': [str(x) for x in merge_subcorpora]} else: merge_subcorpora = {'combine': merge_subcorpora} for name, the_input in sorted(merge_subcorpora.items()): pin = parse_input(df.T, the_input) the_newname = newname_getter(df.T, pin, newname=name, \ merging_subcorpora=True, prinf=print_info) df = merge_these_entries(df.T, pin, the_newname, merging='subcorpora', prinf=print_info).T if using_totals: pin2 = parse_input(df2.T, the_input) df2 = merge_these_entries(df2.T, pin2, the_newname, merging='subcorpora', prinf=False).T if just_subcorpora: df = just_these_subcorpora(df, just_subcorpora, prinf=print_info) if using_totals: df2 = just_these_subcorpora(df2, just_subcorpora, prinf=False) if 
skip_subcorpora: df = skip_these_subcorpora(df, skip_subcorpora, prinf=print_info) if using_totals: df2 = skip_these_subcorpora(df2, skip_subcorpora, prinf=False) if span_subcorpora: df = span_these_subcorpora(df, span_subcorpora, prinf=print_info) if using_totals: df2 = span_these_subcorpora(df2, span_subcorpora, prinf=False) if just_entries: df = just_these_entries(df, parse_input(df, just_entries), prinf=print_info) if not single_totals: df2 = just_these_entries(df2, parse_input(df2, just_entries), prinf=False) if skip_entries: df = skip_these_entries(df, parse_input(df, skip_entries), prinf=print_info) if not single_totals: df2 = skip_these_entries(df2, parse_input(df2, skip_entries), prinf=False) # drop infinites and nans df = df.replace([np.inf, -np.inf], np.nan) df = df.fillna(0.0) if just_totals: df = DataFrame(df.sum(), columns=['Combined total']) if using_totals: if not single_totals: df2 = DataFrame(df2.sum(), columns=['Combined total']) else: df2 = df2.sum() tots = df.sum(axis=1) if using_totals or outputmode: if not operation.startswith('k'): tshld = 0 # set a threshold if just_totals if outputmode is True: df2 = df.T.sum() if not just_totals: df2.name = 'Total' else: df2.name = 'Combined total' using_totals = True single_totals = True if just_totals: if not single_totals: tshld = set_threshold(df2, threshold, prinf=print_info) df, tots = combiney(df, df2, operation=operation, threshold=tshld, prinf=print_info) # if doing keywording... 
if operation.startswith('k'): if isinstance(denominator, STRINGTYPE): if denominator == 'self': df2 = df.copy() else: df2 = denominator from corpkit.keys import keywords df = keywords(df, df2, selfdrop=selfdrop, threshold=threshold, print_info=print_info, editing=True, calc_all=calc_all, sort_by=sort_by, measure=keyword_measure, **kwargs) # drop infinites and nans df = df.replace([np.inf, -np.inf], np.nan) df = df.fillna(0.0) # resort data if sort_by or keep_stats: df = resort(df, keep_stats=keep_stats, sort_by=sort_by) if isinstance(df, bool): if df is False: return 'linregress' if keep_top: if not just_totals: df = df[list(df.columns)[:keep_top]] else: df = df.head(keep_top) if just_totals: # turn just_totals into series: df = Series(df['Combined total'], name='Combined total') if df1_istotals: if operation.startswith('k'): try: df = Series(df.ix[dataframe1.name]) df.name = '%s: keyness' % df.name except: df = df.iloc[0, :] df.name = 'keyness' % df.name # generate totals branch if not percentage results: # fix me if df1_istotals or operation.startswith('k'): if not just_totals: try: total = Series(df['Total'], name='Total') except: total = 'none' pass #total = df.copy() else: total = 'none' else: # might be wrong if using division or something... 
try: total = df.T.sum(axis=1) except: total = 'none' if not isinstance(tots, DataFrame) and not isinstance(tots, Series): total = df.sum(axis=1) else: total = tots if isinstance(df, DataFrame): datatype = df.iloc[0].dtype else: datatype = df.dtype locs['datatype'] = datatype # TURN INT COL NAMES INTO STR try: df.results.columns = [str(d) for d in list(df.results.columns)] except: pass def add_tkt_index(df): """add an order for tkintertable if using gui""" if isinstance(df, Series): df = df.T df = df.drop('tkintertable-order', errors='ignore', axis=0) df = df.drop('tkintertable-order', errors='ignore', axis=1) dat = [i for i in range(len(df.index))] df['tkintertable-order'] = Series(dat, index=list(df.index)) df = df.T return df # while tkintertable can't sort rows if checkstack('tkinter'): df = add_tkt_index(df) if kwargs.get('df1_always_df'): if isinstance(df, Series): df = DataFrame(df) # delete non-appearing conc lines if not hasattr(interrogation, 'concordance'): lns = None elif hasattr(interrogation, 'concordance') and interrogation.concordance is None: lns = None else: col_crit = interrogation.concordance['m'].map(lambda x: x in list(df.columns)) ind_crit = interrogation.concordance['c'].map(lambda x: x in list(df.index)) lns = interrogation.concordance[col_crit] lns = lns.loc[ind_crit] lns = Concordance(lns) output = Interrogation(results=df, totals=total, query=locs, concordance=lns) if print_info: print('***Done!***\n========================\n') return output
else: prob = prob * (1 - row.iloc[ix]) return prob for elo_name in list(elo_names): allchunks = [] for cb in chunk_bounds: model = chunkmodels[elo_name, cb] newcol_name = 'cb_' + str(elo_name.translate(None, ' ()[],')) + '_' + str(cb) like_colnames.append(newcol_name) msg('Predicting %s' % newcol_name) preds = model.predict_proba(X) preds_series = DataFrame(preds).iloc[:,1] preds_series.index = X.index preds_series.name = cb allchunks.append(preds_series) allchunks.append(fit_df['movergain']) allchunks_df = concat(allchunks, axis=1) fit_df[elo_name] = allchunks_df.apply(gain_likelihood, axis=1) if diagnose: cols_to_show = list(elo_names) cols_to_show.extend(['gamenum','side','halfply','elo','movergain']) print fit_df[cols_to_show].transpose() # group by player-game, and combine all the likelihoods into a single # likelihood for that ELO def exp_sum_log(foo):
def df_loads(stream):
    """Deserialize a DataFrame (plus custom attributes) from an in-memory
    pickle stream produced by the matching df_dumps() helper.

    The stream holds a small wrapper object carrying the raw DataFrame and
    a dict of custom attributes; plain pickling of a DataFrame would drop
    those attributes, so they are re-attached here one by one.
    """
    tempobj = cPickle.loads(stream)  # loads not load
    df = tempobj.dataframe
    # Restore each custom attribute (e.g. .name) recorded at dump time.
    for attr, value in tempobj._metadict.items():
        setattr(df, attr, value)
    return df


if __name__ == "__main__":
    # Demo / smoke test for the dump+load round trip (Python 2 prints).

    ### Make a random dataframe, add some attributes
    df = DataFrame(((randn(3, 3))), columns=["a", "b", "c"])
    print_customattr(df)

    print "adding some attributes"
    df.name = "Billy"
    df.junk = "in the trunk"
    print_customattr(df)

    ### Serialize into memory
    stream = df_dumps(df)
    print "wrote dataframe to memory"

    ### Restore from memory
    dfnew = df_loads(stream)
    print "restored from memory"
    print_customattr(dfnew)

    ### Serialize into file
    outfile = "dftest.df"  # What file extension is commonly used for this?
    df_dump(df, outfile)
    print "wrote dataframe to file %s" % outfile
# Create a list of obs dataframes, one for each station: # <codecell> obs_df = [] sta_names = [] sta_failed = [] for sta in stations: b = coops2df(collector, sta, sos_name) name = b.name sta_names.append(name) print(name) if b.empty: sta_failed.append(name) b = DataFrame(np.arange(len(ts)) * np.NaN, index=ts.index, columns=['Observed Data']) b.name = name # Limit interpolation to 10 points (10 @ 6min = 1 hour). col = 'Observed Data' concatenated = concat([b, ts], axis=1).interpolate(limit=10)[col] obs_df.append(DataFrame(concatenated)) obs_df[-1].name = b.name # <codecell> geodetic = ccrs.Geodetic(globe=ccrs.Globe(datum='WGS84')) tiler = MapQuestOpenAerial() fig, ax = plt.subplots(figsize=(8, 8), subplot_kw=dict(projection=tiler.crs)) # Open Source Imagery from MapQuest (max zoom = 16?) zoom = 8 extent = [box[0], box[2], box[1], box[3]] ax.set_extent(extent, geodetic)
def test_timegrouper_with_reg_groups(self):
    """Grouping by a time-based pd.Grouper combined with regular column
    groupers, across yearly/6-month/daily/monthly frequencies, with the
    Grouper specified positionally, by key, or by index level.

    The same expectations are checked on a sorted and an unsorted copy of
    the input to verify grouping is order-independent.
    """
    # GH 3794
    # allow combination of timegrouper/reg groups

    df_original = DataFrame({
        'Branch': 'A A A A A A A B'.split(),
        'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(),
        'Quantity': [1, 3, 5, 1, 8, 1, 9, 3],
        'Date': [
            datetime(2013, 1, 1, 13, 0),
            datetime(2013, 1, 1, 13, 5),
            datetime(2013, 10, 1, 20, 0),
            datetime(2013, 10, 2, 10, 0),
            datetime(2013, 10, 1, 20, 0),
            datetime(2013, 10, 2, 10, 0),
            datetime(2013, 12, 2, 12, 0),
            datetime(2013, 12, 2, 14, 0),
        ]
    }).set_index('Date')

    df_sorted = df_original.sort_values(by='Quantity', ascending=False)

    for df in [df_original, df_sorted]:
        # Annual frequency: all rows collapse into year-end 2013 buckets.
        expected = DataFrame({
            'Buyer': 'Carl Joe Mark'.split(),
            'Quantity': [10, 18, 3],
            'Date': [
                datetime(2013, 12, 31, 0, 0),
                datetime(2013, 12, 31, 0, 0),
                datetime(2013, 12, 31, 0, 0),
            ]
        }).set_index(['Date', 'Buyer'])

        result = df.groupby([pd.Grouper(freq='A'), 'Buyer']).sum()
        assert_frame_equal(result, expected)

        # 6-month-start frequency: two buckets (Jan and Jul starts).
        expected = DataFrame({
            'Buyer': 'Carl Mark Carl Joe'.split(),
            'Quantity': [1, 3, 9, 18],
            'Date': [
                datetime(2013, 1, 1, 0, 0),
                datetime(2013, 1, 1, 0, 0),
                datetime(2013, 7, 1, 0, 0),
                datetime(2013, 7, 1, 0, 0),
            ]
        }).set_index(['Date', 'Buyer'])
        result = df.groupby([pd.Grouper(freq='6MS'), 'Buyer']).sum()
        assert_frame_equal(result, expected)

    # Second fixture: all dates within October 2013 so daily/monthly
    # frequencies produce a small number of buckets.
    df_original = DataFrame({
        'Branch': 'A A A A A A A B'.split(),
        'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(),
        'Quantity': [1, 3, 5, 1, 8, 1, 9, 3],
        'Date': [
            datetime(2013, 10, 1, 13, 0),
            datetime(2013, 10, 1, 13, 5),
            datetime(2013, 10, 1, 20, 0),
            datetime(2013, 10, 2, 10, 0),
            datetime(2013, 10, 1, 20, 0),
            datetime(2013, 10, 2, 10, 0),
            datetime(2013, 10, 2, 12, 0),
            datetime(2013, 10, 2, 14, 0),
        ]
    }).set_index('Date')
    df_sorted = df_original.sort_values(by='Quantity', ascending=False)

    for df in [df_original, df_sorted]:
        # Daily frequency.
        expected = DataFrame({
            'Buyer': 'Carl Joe Mark Carl Joe'.split(),
            'Quantity': [6, 8, 3, 4, 10],
            'Date': [
                datetime(2013, 10, 1, 0, 0),
                datetime(2013, 10, 1, 0, 0),
                datetime(2013, 10, 1, 0, 0),
                datetime(2013, 10, 2, 0, 0),
                datetime(2013, 10, 2, 0, 0),
            ]
        }).set_index(['Date', 'Buyer'])
        result = df.groupby([pd.Grouper(freq='1D'), 'Buyer']).sum()
        assert_frame_equal(result, expected)

        # Month-end frequency: everything lands in the 2013-10-31 bucket.
        result = df.groupby([pd.Grouper(freq='1M'), 'Buyer']).sum()
        expected = DataFrame({
            'Buyer': 'Carl Joe Mark'.split(),
            'Quantity': [10, 18, 3],
            'Date': [
                datetime(2013, 10, 31, 0, 0),
                datetime(2013, 10, 31, 0, 0),
                datetime(2013, 10, 31, 0, 0),
            ]
        }).set_index(['Date', 'Buyer'])
        assert_frame_equal(result, expected)

        # passing the name (Grouper selects the column via key=)
        df = df.reset_index()
        result = df.groupby([pd.Grouper(freq='1M', key='Date'), 'Buyer'
                             ]).sum()
        assert_frame_equal(result, expected)

        # An unknown key must raise, not silently group on nothing.
        with self.assertRaises(KeyError):
            df.groupby([pd.Grouper(freq='1M', key='foo'), 'Buyer']).sum()

        # passing the level (Grouper selects the index level instead)
        df = df.set_index('Date')
        result = df.groupby([pd.Grouper(freq='1M', level='Date'), 'Buyer'
                             ]).sum()
        assert_frame_equal(result, expected)
        result = df.groupby([pd.Grouper(freq='1M', level=0), 'Buyer']).sum(
        )
        assert_frame_equal(result, expected)

        with self.assertRaises(ValueError):
            df.groupby([pd.Grouper(freq='1M', level='foo'), 'Buyer']).sum()

        # multi names: a 'Date' column distinct from the 'Date' index;
        # key='Date' must pick the column (shifted two month-ends ahead).
        df = df.copy()
        df['Date'] = df.index + pd.offsets.MonthEnd(2)
        result = df.groupby([pd.Grouper(freq='1M', key='Date'), 'Buyer'
                             ]).sum()
        expected = DataFrame({
            'Buyer': 'Carl Joe Mark'.split(),
            'Quantity': [10, 18, 3],
            'Date': [
                datetime(2013, 11, 30, 0, 0),
                datetime(2013, 11, 30, 0, 0),
                datetime(2013, 11, 30, 0, 0),
            ]
        }).set_index(['Date', 'Buyer'])
        assert_frame_equal(result, expected)

        # error as we have both a level and a name!
        with self.assertRaises(ValueError):
            df.groupby([pd.Grouper(freq='1M', key='Date',
                                   level='Date'), 'Buyer']).sum()

        # single groupers (Grouper alone, bare or wrapped in a list)
        expected = DataFrame({'Quantity': [31],
                              'Date': [datetime(2013, 10, 31, 0, 0)
                                       ]}).set_index('Date')
        result = df.groupby(pd.Grouper(freq='1M')).sum()
        assert_frame_equal(result, expected)

        result = df.groupby([pd.Grouper(freq='1M')]).sum()
        assert_frame_equal(result, expected)

        # With key='Date' the shifted 'Date' column is used → Nov bucket.
        expected = DataFrame({'Quantity': [31],
                              'Date': [datetime(2013, 11, 30, 0, 0)
                                       ]}).set_index('Date')
        result = df.groupby(pd.Grouper(freq='1M', key='Date')).sum()
        assert_frame_equal(result, expected)

        result = df.groupby([pd.Grouper(freq='1M', key='Date')]).sum()
        assert_frame_equal(result, expected)

    # GH 6764 multiple grouping with/without sort
    df = DataFrame({
        'date': pd.to_datetime([
            '20121002', '20121007', '20130130', '20130202', '20130305',
            '20121002', '20121207', '20130130', '20130202', '20130305',
            '20130202', '20130305'
        ]),
        'user_id': [1, 1, 1, 1, 1, 3, 3, 3, 5, 5, 5, 5],
        'whole_cost': [1790, 364, 280, 259, 201, 623, 90, 312, 359, 301,
                       359, 801],
        'cost1': [12, 15, 10, 24, 39, 1, 0, 90, 45, 34, 1, 12]
    }).set_index('date')

    for freq in ['D', 'M', 'A', 'Q-APR']:
        # Reference result via per-user resample; TimeGrouper-based
        # groupby must match it whether or not the input is pre-sorted.
        expected = df.groupby('user_id')[
            'whole_cost'].resample(
            freq).sum().dropna().reorder_levels(
            ['date', 'user_id']).sort_index().astype('int64')
        expected.name = 'whole_cost'

        result1 = df.sort_index().groupby([pd.TimeGrouper(freq=freq),
                                           'user_id'])['whole_cost'].sum()
        assert_series_equal(result1, expected)

        result2 = df.groupby([pd.TimeGrouper(freq=freq), 'user_id'])[
            'whole_cost'].sum()
        assert_series_equal(result2, expected)