def test_replace_mixed(self):
    mf = self.mixed_frame
    mf.iloc[5:20, mf.columns.get_loc('foo')] = nan
    mf.iloc[-10:, mf.columns.get_loc('A')] = nan

    result = self.mixed_frame.replace(np.nan, -18)
    expected = self.mixed_frame.fillna(value=-18)
    assert_frame_equal(result, expected)
    assert_frame_equal(result.replace(-18, nan), self.mixed_frame)

    result = self.mixed_frame.replace(np.nan, -1e8)
    expected = self.mixed_frame.fillna(value=-1e8)
    assert_frame_equal(result, expected)
    assert_frame_equal(result.replace(-1e8, nan), self.mixed_frame)

    # int block upcasting
    df = DataFrame({'A': Series([1.0, 2.0], dtype='float64'),
                    'B': Series([0, 1], dtype='int64')})
    expected = DataFrame({'A': Series([1.0, 2.0], dtype='float64'),
                          'B': Series([0.5, 1], dtype='float64')})
    result = df.replace(0, 0.5)
    assert_frame_equal(result, expected)

    df.replace(0, 0.5, inplace=True)
    assert_frame_equal(df, expected)

    # int block splitting
    df = DataFrame({'A': Series([1.0, 2.0], dtype='float64'),
                    'B': Series([0, 1], dtype='int64'),
                    'C': Series([1, 2], dtype='int64')})
    expected = DataFrame({'A': Series([1.0, 2.0], dtype='float64'),
                          'B': Series([0.5, 1], dtype='float64'),
                          'C': Series([1, 2], dtype='int64')})
    result = df.replace(0, 0.5)
    assert_frame_equal(result, expected)

    # to object block upcasting
    df = DataFrame({'A': Series([1.0, 2.0], dtype='float64'),
                    'B': Series([0, 1], dtype='int64')})
    expected = DataFrame({'A': Series([1, 'foo'], dtype='object'),
                          'B': Series([0, 1], dtype='int64')})
    result = df.replace(2, 'foo')
    assert_frame_equal(result, expected)

    expected = DataFrame({'A': Series(['foo', 'bar'], dtype='object'),
                          'B': Series([0, 'foo'], dtype='object')})
    result = df.replace([1, 2], ['foo', 'bar'])
    assert_frame_equal(result, expected)

    # test case from
    df = DataFrame({'A': Series([3, 0], dtype='int64'),
                    'B': Series([0, 3], dtype='int64')})
    result = df.replace(3, df.mean().to_dict())
    expected = df.copy().astype('float64')
    m = df.mean()
    expected.iloc[0, 0] = m[0]
    expected.iloc[1, 1] = m[1]
    assert_frame_equal(result, expected)
def kurtosis(path, columns):
    # Renamed parameters to avoid shadowing the builtins `str` and `list`;
    # print statements updated to Python 3.
    frame = DataFrame(pd.read_csv(path, usecols=columns))
    h = len(frame)
    print(h)
    t = frame.mean()
    d = frame.std()
    e = ((frame - t) / d) ** 4
    g = e.sum()
    # Bias-corrected sample excess kurtosis
    p1 = h * (h + 1)
    p2 = float((h - 1) * (h - 2) * (h - 3))
    p3 = float(3 * ((h - 1) ** 2))
    p4 = (h - 2) * (h - 3)
    i = ((p1 / p2) * g) - (p3 / p4)
    print('kurtosis =', i)
def summarize_he(analytical_sets):
    results = {}
    he = {}

    for analytical_set in analytical_sets:
        he[analytical_set.label] = calculate_he(analytical_set.allele_df)

    he_df = DataFrame(he)
    labels = list(he_df.columns)

    if len(labels) == 2:
        # use Mann-Whitney / Wilcoxon test
        results['test'] = 'Wilcoxon test (paired)'
        results['stats'] = wilcoxon(he_df[labels[0]], he_df[labels[1]])
    elif len(labels) > 2:
        # use Kruskal-Wallis
        results['test'] = 'Kruskal-Wallis test'
        results['stats'] = kruskal(*[he_df[x] for x in labels])

    results['warning'] = ''
    results['data'] = he_df
    results['mean'] = he_df.mean()
    results['stddev'] = he_df.std()

    return results
def mydeviate(path, columns, Deviation=0, MeanAbsDeviation=1, MeanSqDev=0):
    # Renamed parameters to avoid shadowing the builtins `str` and `list`.
    frame = DataFrame(pd.read_csv(path, usecols=columns))
    t = frame.mean()
    if Deviation == 1:
        print(frame - t)           # deviation from the mean
    if MeanAbsDeviation == 1:
        # the original computed abs(data) - mean; the absolute deviation
        # needs abs(data - mean)
        print((frame - t).abs())
    if MeanSqDev == 1:
        print((frame - t) ** 2)    # squared deviation
class LogAggregate:
    def __init__(self, dataset):
        self.dataset = DataFrame(dataset)

    # dict.has_key() was removed in Python 3; use the `in` operator instead.
    def get_median(self, *arg, **kwarg):
        if 'group_by' in kwarg:
            return self.dataset.groupby(kwarg['group_by']).median()[kwarg['key']]
        else:
            return self.dataset.median()[kwarg['key']]

    def get_average(self, *arg, **kwarg):
        if 'group_by' in kwarg:
            return self.dataset.groupby(kwarg['group_by']).mean()[kwarg['key']]
        else:
            return self.dataset.mean()[kwarg['key']]

    def get_min(self, *arg, **kwarg):
        if 'group_by' in kwarg:
            return self.dataset.groupby(kwarg['group_by']).min()[kwarg['key']]
        else:
            return self.dataset.min()[kwarg['key']]

    def get_max(self, *arg, **kwarg):
        if 'group_by' in kwarg:
            return self.dataset.groupby(kwarg['group_by']).max()[kwarg['key']]
        else:
            return self.dataset.max()[kwarg['key']]

    def get_count(self, *arg, **kwarg):
        if 'group_by' in kwarg:
            return self.dataset.groupby(kwarg['group_by']).count()[kwarg['key']]
        else:
            return self.dataset.count()[kwarg['key']]
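# Usage sketch for LogAggregate (the toy log records below are purely
# illustrative, not from the original code). Numeric columns are used so
# DataFrame.mean()/median() work on recent pandas versions:
from pandas import DataFrame

logs = [{'status': 200, 'latency_ms': 120.0},
        {'status': 200, 'latency_ms': 80.0},
        {'status': 500, 'latency_ms': 40.0}]
agg = LogAggregate(logs)
print(agg.get_median(key='latency_ms'))                      # overall median
print(agg.get_average(group_by='status', key='latency_ms'))  # mean per status code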
def skewness(path, columns):
    # Renamed parameters to avoid shadowing the builtins `str` and `list`;
    # print statement updated to Python 3.
    frame = DataFrame(pd.read_csv(path, usecols=columns))
    h = len(frame)
    t = frame.mean()
    d = frame.std()
    e = ((frame - t) / d) ** 3
    g = e.sum()
    # Adjusted Fisher-Pearson standardized moment coefficient
    i = (h * g) / ((h - 1) * (h - 2))
    print('skewness =', i)
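# Sanity check for the two moment functions above (a minimal sketch on
# synthetic data): both formulas apply the same bias corrections as pandas'
# built-ins, so the manual results should match DataFrame.skew() and
# DataFrame.kurt() up to floating-point error.
import numpy as np
import pandas as pd

frame = pd.DataFrame({'x': np.random.randn(100)})
h, t, d = len(frame), frame.mean(), frame.std()
z3 = (((frame - t) / d) ** 3).sum()
z4 = (((frame - t) / d) ** 4).sum()
skew_manual = (h * z3) / ((h - 1) * (h - 2))
kurt_manual = (h * (h + 1) / ((h - 1) * (h - 2) * (h - 3))) * z4 \
    - 3 * (h - 1) ** 2 / ((h - 2) * (h - 3))
print(skew_manual - frame.skew())  # ~0
print(kurt_manual - frame.kurt())  # ~0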
def preprocessing(self, df: pd.DataFrame):
    # Features wrangling
    df_features = df.iloc[:, 5:]
    df_features = df_features.fillna(df.mean())
    df_features = np.array(df_features)
    # np.apply_along_axis returns a new array; keep the result instead of
    # discarding it (it does not reliably modify its input in place).
    df_features = np.apply_along_axis(self._normalize, 0, df_features)

    # Labels wrangling
    df_labels = np.array(df.loc[:, "Hogwarts House"])
    return df_features, df_labels
def summary_statistics(data_set: pd.DataFrame) -> pd.DataFrame:
    summary_data = dict()
    summary_data['mean'] = data_set.mean(numeric_only=True)
    summary_data['std'] = data_set.std(ddof=1, numeric_only=True)
    summary_data['min'] = data_set.min(numeric_only=True)
    summary_data['max'] = data_set.max(numeric_only=True)
    return pd.DataFrame(summary_data).T
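# Minimal usage sketch for summary_statistics (synthetic data, assuming only
# the pandas import above):
import pandas as pd

toy = pd.DataFrame({'a': [1.0, 2.0, 3.0], 'b': [10, 20, 30], 'c': list('xyz')})
print(summary_statistics(toy))
# Rows are mean/std/min/max; the non-numeric column 'c' is excluded by
# numeric_only=True.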
class CalWeight:
    def __init__(self, step, risk_aversion):
        self.risk_aversion = risk_aversion
        if step == 0:
            self.start = '2012-01-01'
            self.end = '2014-12-31'
        elif step == 1:
            self.start = '2012-01-01'
            self.end = '2015-2-28'
        elif step == 2:
            self.start = '2012-01-01'
            self.end = '2015-4-30'
        secIDs = ['000300.ZICN', '000905.ZICN', '399006.ZICN',
                  'SPX.ZIUS', '000012.ZICN', '000013.ZICN']
        self.rtn_table = DataFrame()
        for secID in secIDs:
            cp = self.get_return(secID)
            cp.name = secID
            self.rtn_table = pd.concat([self.rtn_table, cp], axis=1)
        self.rtn_table.fillna(0, inplace=True)
        self.cov_mat = self.rtn_table.cov() * 250
        self.exp_rtn = self.rtn_table.mean() * 250

    def get_return(self, ticker):
        tmp_lst = []
        fname = PERFIX + 'data_' + ticker + '.csv'
        with open(fname, 'r') as f:
            reader = csv.reader(f)
            for row in reader:
                tmp_lst.append(row)
        df = pd.DataFrame(tmp_lst[1:], columns=tmp_lst[0])
        df['Date'] = pd.to_datetime(df['Date'])
        df = df.set_index("Date")
        df = df[self.start:self.end]
        temp = df['Close'].astype('float64').pct_change().fillna(0.)
        return temp

    def get_weight(self):
        risk_aversion = self.risk_aversion
        P = risk_aversion * matrix(self.cov_mat.values)
        q = -1 * matrix(self.exp_rtn.values)
        G = matrix(np.vstack((np.diag(np.ones(len(self.exp_rtn))),
                              np.diag(-np.ones(len(self.exp_rtn))))))
        h = matrix(np.array([np.ones(len(self.exp_rtn)),
                             np.zeros(len(self.exp_rtn))]).reshape(len(self.exp_rtn) * 2, 1))
        A = matrix(np.ones(len(self.exp_rtn)), (1, len(self.exp_rtn)))
        b = matrix([1.0])
        solvers.options['show_progress'] = False
        sol = solvers.qp(P, q, G, h, A, b)
        return DataFrame(index=self.exp_rtn.index,
                         data=np.round(sol['x'], 2),
                         columns=['weight'])  # weights rounded to two decimal places
def _plot_stats_attribute(stats_list: Sequence[Stats], attribute: str, label, ax=None):
    """Plot a certain attribute of a collection of histories."""
    data = np.asarray([getattr(h, attribute) for h in stats_list])
    df = DataFrame(data.T)
    df_mean = df.mean(axis=1)
    df_std = df.std(axis=1)
    sns_ax = sns.lineplot(df_mean.index, df_mean, label=label, ax=ax)
    sns_ax.fill_between(df_mean.index, df_mean - df_std, df_mean + df_std,
                        alpha=0.3)
def plot_mean_std(real: pd.DataFrame, fake: pd.DataFrame, ax=None):
    """
    Plot the means and standard deviations of each dataset.

    :param real: DataFrame containing the real data
    :param fake: DataFrame containing the fake data
    :param ax: Axis to plot on. If None, a new figure is made.
    """
    # Remember whether we created the figure ourselves; `ax` is reassigned
    # below, so the original `ax is None` test cannot be repeated at the end.
    own_figure = ax is None
    if own_figure:
        fig, ax = plt.subplots(1, 2, figsize=(10, 5))
        fig.suptitle('Absolute Log Mean and STDs of numeric data\n', fontsize=16)

    ax[0].grid(True)
    ax[1].grid(True)
    real = real._get_numeric_data()
    fake = fake._get_numeric_data()

    real_mean = np.log(np.add(abs(real.mean()).values, 1e-5))
    fake_mean = np.log(np.add(abs(fake.mean()).values, 1e-5))
    min_mean = min(real_mean) - 1
    max_mean = max(real_mean) + 1
    line = np.arange(min_mean, max_mean)
    sns.lineplot(x=line, y=line, ax=ax[0])
    sns.scatterplot(x=real_mean, y=fake_mean, ax=ax[0])
    ax[0].set_title('Means of real and fake data')
    ax[0].set_xlabel('real data mean (log)')
    ax[0].set_ylabel('fake data mean (log)')

    real_std = np.log(np.add(real.std().values, 1e-5))
    fake_std = np.log(np.add(fake.std().values, 1e-5))
    min_std = min(real_std) - 1
    max_std = max(real_std) + 1
    line = np.arange(min_std, max_std)
    sns.lineplot(x=line, y=line, ax=ax[1])
    sns.scatterplot(x=real_std, y=fake_std, ax=ax[1])
    ax[1].set_title('Stds of real and fake data')
    ax[1].set_xlabel('real data std (log)')
    ax[1].set_ylabel('fake data std (log)')

    if own_figure:
        plt.show()
def variation_statistic(gene_data: pd.DataFrame) -> pd.Series:
    """
    Calculate std/mean for each gene and replace nan with 0.

    :gene_data: Expression DF with genes in rows. Calculations are performed
        for each row across features.
    :return: Series with statistic for each row
    """
    # TODO How to deal with genes with zero expression? Are they informative?
    statistic = gene_data.std(axis=1) / gene_data.mean(axis=1)
    return statistic.replace(np.nan, 0)
def fit(self, X: pd.DataFrame, y=None):
    self._reset()
    if self.with_mean:
        self.mean_ = X.mean()
    if self.with_std:
        self.scale_ = X.std(ddof=0)
    return self
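# The fit method above only learns the statistics; a matching transform step
# is the natural counterpart. A minimal sketch of such a method (the name
# `transform` and the with_mean/with_std semantics mirror scikit-learn's
# StandardScaler, which this class appears to imitate; this is an assumption,
# not the original author's code):
def transform(self, X: pd.DataFrame) -> pd.DataFrame:
    result = X
    if self.with_mean:
        result = result - self.mean_   # center using the fitted column means
    if self.with_std:
        result = result / self.scale_  # scale using the fitted column stds
    return result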
def test_mean_datetimelike_numeric_only_false(self):
    df = DataFrame(
        {
            "A": np.arange(3),
            "B": pd.date_range("2016-01-01", periods=3),
            "C": pd.timedelta_range("1D", periods=3),
        }
    )

    # datetime(tz) and timedelta work
    result = df.mean(numeric_only=False)
    expected = Series({"A": 1, "B": df.loc[1, "B"], "C": df.loc[1, "C"]})
    tm.assert_series_equal(result, expected)

    # mean of period is not allowed
    df["D"] = pd.period_range("2016", periods=3, freq="A")

    with pytest.raises(TypeError, match="mean is not implemented for Period"):
        df.mean(numeric_only=False)
def mean_centered(self, utility_matrix: pd.DataFrame) -> pd.DataFrame:
    """
    Subtract each column's mean from the utility matrix, then compute the
    cosine similarity matrix on the centered values.

    :param utility_matrix: user-item utility matrix
    :return: cosine similarity matrix of the mean-centered utilities
    """
    mean_centered_utility_matrix = utility_matrix.sub(utility_matrix.mean())
    return self.data.similarity_matrix_cosine(mean_centered_utility_matrix)
def table(self, extractedData: DataFrame, name: str):
    filename = self.filename
    self.filename = filename + '_all.txt'
    self._table(extractedData, name)
    mean = extractedData.mean(axis=0)
    std = extractedData.std(axis=0)
    self.filename = filename + '_average.txt'
    self._table(mean, name, std)
def get_topwords(self, countries, thresh=10, tf_idf=False):
    # DataFrame.append and Series.order were removed from pandas; collect the
    # rows first and sort with sort_values instead.
    rows = []
    for r in range(len(self.df)):
        if self.df.loc[r, 'country_id'] in countries:
            if tf_idf:
                rows.append(self.tf_idf.loc[r, :])
            else:
                rows.append(self.df.loc[r, :])
    tw = DataFrame(rows)
    return tw.mean().sort_values(ascending=False)[:thresh]
def scatter_peaks_no_peaks(
    top_eco: pd.DataFrame,
    top_naked: pd.DataFrame,
    non_top_eco: pd.DataFrame,
    non_top_naked: pd.DataFrame,
    ax: plt.Axes = None,
):
    if not ax:
        _, ax = plt.subplots(figsize=(12, 12))
    ax.set_xlabel("Chromatin")
    ax.set_ylabel("Naked")
    ax.scatter(non_top_eco, non_top_naked, alpha=0.2, label="All Points")
    ax.scatter(top_eco, top_naked, label="Open ATAC")
    ax.axvline(non_top_eco.mean(), color="C0")
    ax.axvline(top_eco.mean(), color="C1")
    ax.axhline(non_top_naked.mean(), color="C0")
    ax.axhline(top_naked.mean(), color="C1")
    ax.legend(loc="upper right", frameon=False, shadow=False)
    # Concatenate the two series into a single frame so that the dropna()
    # call stays synchronized between the two columns.
    top = pd.DataFrame({"chrom": top_eco, "naked": top_naked}).dropna(axis=0)
    all_ = pd.DataFrame({"chrom": non_top_eco,
                         "naked": non_top_naked}).dropna(axis=0)
    r_top, _ = scipy.stats.pearsonr(top.loc[:, "chrom"], top.loc[:, "naked"])
    r_all, _ = scipy.stats.pearsonr(all_.loc[:, "chrom"], all_.loc[:, "naked"])
    ax.text(0.01, 0.8, f"R (top) = {r_top} \nR (rest) = {r_all}",
            transform=ax.transAxes)
    return ax
def get_sharpe_ratios(df_returns: DataFrame,
                      risk_free_rate: float = 0.0,
                      periods_per_annum: int = 252) -> Series:
    """
    Helper function to calculate the (annualized) Sharpe Ratios of the
    financial instruments contained in the input dataframe.
    """
    numer = (df_returns.mean(axis=0) - risk_free_rate) * periods_per_annum
    denom = np.sqrt(df_returns.var(axis=0) * periods_per_annum)
    return numer / denom
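# Usage sketch for get_sharpe_ratios with synthetic daily returns (the
# numbers are purely illustrative):
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
rets = pd.DataFrame(rng.normal(5e-4, 0.01, size=(252, 2)), columns=['A', 'B'])
print(get_sharpe_ratios(rets))  # one annualized Sharpe ratio per column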
def pd_03():
    df = DataFrame(np.random.randn(6, 3))
    # .ix was removed from pandas; use .iloc for positional indexing.
    df.iloc[2:, 1] = np.nan
    df.iloc[4:, 2] = np.nan
    print(df)
    print(df.fillna(method='ffill'))
    print(df.fillna(method='ffill', limit=2))
    data = Series([1., None, 3.5, None, 7])
    print(data.fillna(data.mean()))
    print(df.fillna(df.mean()))
def get_mean_by_bin(df: pd.DataFrame) -> pd.Series:
    """
    Takes all the sweep data from the input dataframe as returned by
    read_hackrf_sweep_file_and_merge and gets the average db for each bin.
    Returns as a pandas Series.

    :param df: pd.DataFrame from experiment in question
    :return: pd.Series of averages
    """
    return df.mean(axis=0)
def test_replace_series_dict(self):
    # from GH 3064
    df = DataFrame({"zero": {"a": 0.0, "b": 1}, "one": {"a": 2.0, "b": 0}})
    result = df.replace(0, {"zero": 0.5, "one": 1.0})
    expected = DataFrame({"zero": {"a": 0.5, "b": 1}, "one": {"a": 2.0, "b": 1.0}})
    tm.assert_frame_equal(result, expected)

    result = df.replace(0, df.mean())
    tm.assert_frame_equal(result, expected)

    # series to series/dict
    df = DataFrame({"zero": {"a": 0.0, "b": 1}, "one": {"a": 2.0, "b": 0}})
    s = Series({"zero": 0.0, "one": 2.0})
    result = df.replace(s, {"zero": 0.5, "one": 1.0})
    expected = DataFrame({"zero": {"a": 0.5, "b": 1}, "one": {"a": 1.0, "b": 0.0}})
    tm.assert_frame_equal(result, expected)

    result = df.replace(s, df.mean())
    tm.assert_frame_equal(result, expected)
def test_mean_datetimelike(self):
    # GH#24757 check that datetimelike are excluded by default, handled
    # correctly with numeric_only=True
    df = DataFrame({
        "A": np.arange(3),
        "B": pd.date_range("2016-01-01", periods=3),
        "C": pd.timedelta_range("1D", periods=3),
        "D": pd.period_range("2016", periods=3, freq="A"),
    })
    result = df.mean(numeric_only=True)
    expected = Series({"A": 1.0})
    tm.assert_series_equal(result, expected)

    with tm.assert_produces_warning(FutureWarning):
        # in the future datetime columns will be included
        result = df.mean()
    expected = Series({"A": 1.0, "C": df.loc[1, "C"]})
    tm.assert_series_equal(result, expected)
def normalize_data(self, data: pd.DataFrame, idx) -> pd.DataFrame:
    """
    All values should be normalized to range(-1,1).

    :param data: The data to normalize.
    :param idx: An id for remembering normalization values in class.
    :return: Normalized DataFrame.
    """
    self.mean[idx] = data.mean()
    self.ptp[idx] = data.max() - data.min()
    return (data - self.mean[idx]) / self.ptp[idx]
def test_mean_excludes_datetimes(self, tz):
    # https://github.com/pandas-dev/pandas/issues/24752
    # Our long-term desired behavior is unclear, but the behavior in
    # 0.24.0rc1 was buggy.
    df = DataFrame({"A": [pd.Timestamp("2000", tz=tz)] * 2})
    with tm.assert_produces_warning(FutureWarning):
        result = df.mean()

    expected = Series(dtype=np.float64)
    tm.assert_series_equal(result, expected)
def remove_outliers(df: pd.DataFrame, zscore: int = 3) -> pd.DataFrame:
    """
    Removes all rows from the given DataFrame containing outliers in any of
    the columns.

    :param df: Input DataFrame.
    :param zscore: z-score to use when calculating outliers.
    :return: The DataFrame with all outliers removed.
    """
    scores = (df - df.mean()) / df.std(ddof=0).values
    return df[(np.abs(scores) < zscore).all(axis=1)]
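# Usage sketch for remove_outliers: rows whose value in any column lies more
# than `zscore` population standard deviations from that column's mean are
# dropped. Note that with only four rows the population z-score is bounded by
# sqrt(3) ~ 1.73, so a threshold below that is needed to see the effect:
import numpy as np
import pandas as pd

toy = pd.DataFrame({'x': [1.0, 1.1, 0.9, 50.0], 'y': [2.0, 2.1, 1.9, 2.0]})
print(remove_outliers(toy, zscore=1.5))  # the row with x == 50.0 is removed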
def moments_features(path):
    if not os.path.exists(path):
        logger.error(path + " does not exist!")
        return
    im = cv2.imread(path)
    [b, g, r] = cv2.split(im)
    moments = []
    # For each colour channel collect the first three moments:
    # mean, standard deviation, and skewness.
    for n in [b, g, r]:
        df = DataFrame(np.array(n.flatten()))
        moments.extend(float(x) for x in [df.mean()[0], df.std()[0], df.skew()[0]])
    return moments
def __init__(self, df):
    scaler = MinMaxScaler(feature_range=(0, 100))
    # Keep the columns and, in particular, the index identical to df.
    df_scaled = DataFrame(scaler.fit_transform(df), columns=df.columns, index=df.index)
    df_corr = df_scaled.corr()
    # DataFrame.append was removed from pandas; stack the statistic rows with
    # pd.concat instead.
    df_stat = DataFrame([df_scaled.apply(lambda x: np.sqrt(np.var(x)))],
                        index=['sd'])  # standard deviation
    df_stat = pd.concat([df_stat,
                         Series(df_scaled.mean(), name='mean').to_frame().T])  # mean
    df_stat = pd.concat([df_stat,
                         Series(df_scaled.apply(self.get_entropy), name='entropy').to_frame().T])  # entropy
    self.df_stat = pd.concat([df_stat,
                              Series(df_corr.applymap(lambda x: 1 - x).sum(),
                                     name='critic_part').to_frame().T])  # CRITIC part
    self.df = df
    self.scaler = scaler          # normalization scaler
    self.df_scaled = df_scaled    # normalized data, same layout as df
def normalize(
        data: pd.DataFrame,
        normalization_type: NormalizationType = NormalizationType.STANDARD):
    if normalization_type == NormalizationType.STANDARD:
        data_mean = data.mean(axis=0)
        data_std = data.std(axis=0)
        return (data - data_mean) / data_std, data_mean, data_std
    else:
        data_min = data.min(axis=0)
        data_max = data.max(axis=0)
        return (data - data_min) / (data_max - data_min), data_min, data_max
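# Usage sketch for normalize. NormalizationType.STANDARD z-scores each
# column; any other enum value falls through to min-max scaling (the MINMAX
# member name below is an assumption; only STANDARD appears in the original):
import pandas as pd

toy = pd.DataFrame({'a': [1.0, 2.0, 3.0], 'b': [10.0, 20.0, 30.0]})
z, mu, sigma = normalize(toy, NormalizationType.STANDARD)
mm, lo, hi = normalize(toy, NormalizationType.MINMAX)  # hypothetical member
# z has zero mean / unit std per column; mm is scaled into [0, 1], with the
# column minima (lo) and maxima (hi) returned for inverting the transform.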
from typing import Tuple

def rescale(data: pd.DataFrame) -> Tuple[pd.DataFrame, pd.Series, pd.Series]:
    # The original annotation said np.ndarray, but a 3-tuple is returned.
    means = data.mean(axis=0)
    variances = data.var(axis=0)
    scaled_data = data.copy(deep=True)
    for row in range(data.shape[0]):
        for col in range(data.shape[1]):
            scaled_data.iloc[row, col] = (
                (data.iloc[row, col] - means[col]) / np.sqrt(variances[col]))
    return scaled_data, means, np.sqrt(variances)
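# The element-wise double loop above is slow Python; pandas broadcasts the
# same computation column-wise in one expression. A minimal equivalent sketch
# (same statistics, same output, no loop):
import numpy as np
import pandas as pd

def rescale_vectorized(data: pd.DataFrame):
    means = data.mean(axis=0)
    stds = np.sqrt(data.var(axis=0))
    return (data - means) / stds, means, stds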
def _print_full_performance(performance: pd.DataFrame, verbosity='mean') -> None:
    """
    Prints the accuracy, precision, recall and F1 score.

    Args:
        performance: performance of the predictor as a DataFrame
        verbosity: if 'mean', prints only the mean value(s); otherwise prints
            the scores for each label
    """
    if verbosity == 'mean':
        print(performance.mean().astype(float).round(3))
    else:
        print(performance.astype(float).round(3))
def combine_spread(file_set, shift, drop_return_data=False):
    """
    Combine the spread of input files, return with mean and standard
    deviation calculated.
    """
    data = []
    values = {}
    for val in ('left', 'right', 'com', 'dist', 'radius', 'diameter'):
        values[val] = {}

    # Collect data from all files into dictionaries
    for i, _file in enumerate(file_set):
        data.append(Spread().read(_file))
        for val in values.keys():
            values[val][i] = Series(data=data[i].spread[val]['val'],
                                    index=data[i].times)
        data[i].times = (np.array(data[i].times) - shift[i])

    spread = Spread()
    spread.spread['num'] = len(file_set)

    for val in values.keys():
        # Shift time as per synchronisation
        for i in values[val]:
            values[val][i].index = np.array(values[val][i].index) - shift[i]

        # Convert to DataFrame
        df = DataFrame(data=values[val])

        # If not a single file, keep only indices with at least two non-NaN
        if len(file_set) > 1:
            df = df.dropna()

        # If return data dropped, fill data here
        if drop_return_data:
            for i in df.columns:
                data[i].spread[val]['val'] = df[i].tolist()

        # Get times, mean and standard error as lists
        mean = list(df.mean(axis=1))
        std_error = list(df.std(axis=1))
        times = list(df.index)

        # Add to Spread object
        spread.spread[val]['val'] = mean
        spread.spread[val]['std'] = std_error
        spread.spread['times'] = times

    return spread, data
def cv(df: pd.DataFrame, fill_value: Optional[float] = None) -> pd.Series:
    """
    Computes the Coefficient of variation for each column.
    Used by DataContainer objects to compute metrics.
    """
    res = df.std() / df.mean()
    if fill_value is not None:
        res = res.fillna(fill_value)
    return res
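# Usage sketch for cv: the coefficient of variation is std/mean per column;
# fill_value handles columns where the ratio is undefined.
import pandas as pd

toy = pd.DataFrame({'a': [1.0, 2.0, 3.0], 'b': [0.0, 0.0, 0.0]})
print(cv(toy, fill_value=0.0))  # column 'b' has 0/0 -> NaN -> filled with 0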
def avg_columns(df: pd.DataFrame) -> pd.Series:
    '''Calculates mean of all columns in DataFrame.

    Args:
        df: Data frame of values to average.

    Returns:
        Series of means indexed by columns.
    '''
    return df.mean()
def get_average_curve(input_csv: pd.DataFrame) -> pd.DataFrame:
    r"""
    Find the generalized curve to represent the class

    :param input_csv: raw class data
    :return: data points for generalized curve
    """
    average_series = input_csv.mean(axis=0)
    generalised = pd.DataFrame(average_series).transpose()
    return generalised
def normalize(self, signal: pd.DataFrame) -> pd.DataFrame:
    """Apply normalization

    Args:
        signal (pd.DataFrame): Raw signal

    Returns:
        signal (pd.DataFrame): Normalized signal
    """
    df_mean = signal.mean()
    df_std = signal.std()
    signal = (signal - df_mean) / df_std
    return signal
def select_centroids_smart(df: pd.DataFrame, k: int,
                           get_dist=get_euclidean_distances) -> np.ndarray:
    # Start from the mean point, then repeatedly add the point furthest (by
    # summed distance) from the centroids chosen so far.
    points = pd.DataFrame(df.mean(axis=0)).T
    i = 1
    while i < k:
        dists = get_dist(df, points).sum(axis=1)
        furthest = np.argmax(dists)
        next_point = pd.DataFrame(df.iloc[furthest]).T
        # DataFrame.append was removed from pandas; use pd.concat instead.
        points = pd.concat([points, next_point])
        df = drop_df(df, df.iloc[furthest])
        i += 1
    return points.values
def fillNan(matrix: pd.DataFrame, fill_type: str = 'value', value: float = 0):
    """
    :param matrix: input matrix, possibly containing NaNs
    :param fill_type: one of ['value', 'col_avg', 'row_avg']
    :param value: float used when fill_type == 'value'
    :return: (filled matrix, boolean mask of the cells that were NaN)
    """
    # The parameter was renamed from `type` to avoid shadowing the builtin.
    filled_matrix = matrix.isna()
    result_matrix = matrix.copy()
    if fill_type == 'value':
        result_matrix = matrix.fillna(value)
    elif fill_type == 'col_avg':
        col_avg = matrix.mean(axis=0)
        result_matrix = matrix.fillna(col_avg)
    elif fill_type == 'row_avg':
        # fillna broadcasts along columns, so transpose to fill by row mean.
        row_avg = matrix.mean(axis=1)
        result_matrix = matrix.T.fillna(row_avg).T
    return result_matrix, filled_matrix
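# Usage sketch for fillNan (synthetic 2x2 matrix):
import numpy as np
import pandas as pd

m = pd.DataFrame({'a': [1.0, np.nan], 'b': [3.0, 4.0]})
filled, mask = fillNan(m, fill_type='col_avg')
# filled.loc[1, 'a'] == 1.0 (the column mean of 'a');
# mask marks the single cell that was NaN.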
def _maximum_likelihood_pairs(pairings: DataFrame, ret_largest: bool = True):
    """
    Given a pairings frame, choose the most likely pairing.
    """
    pm = pairings.mean(axis=1)
    if pm.gt(0).sum() == 0:
        raise ValueError("There is no crossover between these datasets")
    elif pm.gt(0).sum() == 1 or ret_largest:
        return pm.idxmax(), pm.max()
    else:
        return pm[pm.gt(0)]
def resumo_disciplinas(dados: pd.DataFrame) -> pd.DataFrame:
    media_disciplina = dados.mean(axis=0).round(2)
    numero_acessos = dados[dados != 0].count(axis=0)
    tabela = pd.DataFrame(
        columns=media_disciplina.index,
        data=[media_disciplina.values, numero_acessos],
        index=['Média de Acesso por Caderno', 'Número de Acesso por Caderno'])
    return tabela
def test_replace_series_dict(self):
    # from GH 3064
    df = DataFrame({'zero': {'a': 0.0, 'b': 1}, 'one': {'a': 2.0, 'b': 0}})
    result = df.replace(0, {'zero': 0.5, 'one': 1.0})
    expected = DataFrame(
        {'zero': {'a': 0.5, 'b': 1}, 'one': {'a': 2.0, 'b': 1.0}})
    assert_frame_equal(result, expected)

    result = df.replace(0, df.mean())
    assert_frame_equal(result, expected)

    # series to series/dict
    df = DataFrame({'zero': {'a': 0.0, 'b': 1}, 'one': {'a': 2.0, 'b': 0}})
    s = Series({'zero': 0.0, 'one': 2.0})
    result = df.replace(s, {'zero': 0.5, 'one': 1.0})
    expected = DataFrame(
        {'zero': {'a': 0.5, 'b': 1}, 'one': {'a': 1.0, 'b': 0.0}})
    assert_frame_equal(result, expected)

    result = df.replace(s, df.mean())
    assert_frame_equal(result, expected)
def BackTestSignal(dfXAlpha, dfXReturn, XPrice, strategy, riskmgr=None, freq=252):
    dfAlphaWeight = strategy.GenSingleAlphaWeight(dfXAlpha)
    if riskmgr is not None:
        dfAlphaWeight = riskmgr.AdjustAlphaWeight(dfAlphaWeight)
    dfSignalReturn = GenSingleFactorReturn(dfAlphaWeight, dfXReturn)
    # (simple_sharpe, geo_sharpe, sim_mean * N, geo_mean * N, vol)
    sharpe = CalcSharpeRatio(dfSignalReturn['Return'], freq)

    # Detailed data
    dfLongCount = DataFrame(columns=['LongCount'],
                            data=dfAlphaWeight.apply(lambda s: s[s > 0].count(), axis=1))
    dfShortCount = DataFrame(columns=['ShortCount'],
                             data=dfAlphaWeight.apply(lambda s: s[s < 0].count(), axis=1))
    dfLongExposure = DataFrame(columns=['LongExposure'],
                               data=dfAlphaWeight.apply(lambda s: s[s > 0].sum(), axis=1))
    dfShortExposure = DataFrame(columns=['ShortExposure'],
                                data=dfAlphaWeight.apply(lambda s: s[s < 0].sum(), axis=1))
    dfNetExposure = DataFrame(columns=['NetExposure'],
                              data=dfAlphaWeight.apply(sum, axis=1))
    dfTotalDollarInvest = DataFrame(columns=['I'],
                                    data=dfAlphaWeight.apply(lambda s: abs(s).sum(), axis=1))
    dfTotalDollarTraded = DataFrame(columns=['D'],
                                    data=(dfAlphaWeight - dfAlphaWeight.shift(1)).apply(
                                        lambda s: abs(s).sum(), axis=1))
    dfSharesTraded = dfAlphaWeight / XPrice
    dfTotalSharesTraded = DataFrame(columns=['Q'],
                                    data=(dfSharesTraded - dfSharesTraded.shift(1)).apply(
                                        lambda s: abs(s).sum(), axis=1))
    TurnOver = dfTotalDollarTraded.mean()[0] / dfTotalDollarInvest.mean()[0]
    CentsPerShare = 100 * dfSignalReturn['Return'].iloc[1:].mean() / dfTotalSharesTraded.mean()[0]

    dfMetrics = DataFrame(list(sharpe)).T
    dfMetrics.columns = ['Simple Sharpe', 'Geo. Sharpe', 'Simple Mean',
                         'Geo. Mean', 'Annual Vol']
    dfMetrics['Turnover'] = TurnOver
    dfMetrics['CentsPerShare'] = CentsPerShare
    dfMetrics['AvgHolding'] = 1.0 / TurnOver
    dfMetrics.index = [dfXAlpha.index.name]

    # Attach each per-date detail column to the signal returns frame.
    for detail in (dfLongCount, dfShortCount, dfLongExposure, dfShortExposure,
                   dfNetExposure, dfTotalDollarInvest, dfTotalDollarTraded,
                   dfTotalSharesTraded):
        dfSignalReturn = pd.merge(dfSignalReturn, detail,
                                  left_index=True, right_index=True, how='outer')

    return dfMetrics, dfSignalReturn, dfAlphaWeight
def test_align_int_fill_bug(self):
    # GH #910
    X = np.arange(10 * 10, dtype='float64').reshape(10, 10)
    Y = np.ones((10, 1), dtype=int)

    df1 = DataFrame(X)
    df1['0.X'] = Y.squeeze()

    df2 = df1.astype(float)

    result = df1 - df1.mean()
    expected = df2 - df2.mean()
    assert_frame_equal(result, expected)
def stndize(path, columns):
    # Renamed parameters to avoid shadowing the builtins `str` and `list`;
    # print statements updated to Python 3.
    frame = DataFrame(pd.read_csv(path, usecols=columns))
    t = frame.mean()
    print(t)
    z = frame.std()
    print(z)
    print((frame - t) / z)  # z-scored data
def test_common_start_returns(self, before, after, mean_by_date, demeaned,
                              expected_vals):
    dr = date_range(start='2015-1-17', end='2015-2-2')
    dr.name = 'date'
    tickers = ['A', 'B', 'C', 'D']
    r1, r2, r3, r4 = (1.20, 1.40, 0.90, 0.80)
    # 17 daily prices per ticker: r**1, r**2, ..., r**17
    prices = DataFrame(index=dr, columns=tickers,
                       data=[[r1**i, r2**i, r3**i, r4**i]
                             for i in range(1, 18)])
    dr2 = date_range(start='2015-1-21', end='2015-1-29')
    # the same factor row [3, 4, 2, 1] repeated for each of the 9 dates
    factor = DataFrame(index=dr2, columns=tickers,
                       data=[[3, 4, 2, 1]] * len(dr2)).stack()
    factor.index = factor.index.set_names(['date', 'asset'])
    factor.name = 'factor'

    cmrt = common_start_returns(
        factor, prices, before, after, False, mean_by_date,
        factor if demeaned else None)
    cmrt = DataFrame({'mean': cmrt.mean(axis=1), 'std': cmrt.std(axis=1)})
    expected = DataFrame(index=range(-before, after + 1),
                         columns=['mean', 'std'], data=expected_vals)
    assert_frame_equal(cmrt, expected)
def calc_kelly_leverages(securities: Set[str],
                         start_date: date,
                         end_date: date,
                         risk_free_rate: float = 0.04) -> Dict[str, float]:
    """Calculates the optimal leverages for the given securities and
    time frame. Returns a dict of (security, leverage) pairs with the
    calculated optimal leverages.

    Note: risk_free_rate is annualized
    """
    f = {}
    ret = {}
    excess_return = {}

    # Download the historical prices from Yahoo Finance and calculate the
    # excess return (return of security - risk free rate) for each security.
    for symbol in securities:
        try:
            hist_prices = get_historical_data(
                symbol, start=start_date, end=end_date,
                output_format='pandas')
        except IOError as e:
            raise ValueError(f'Unable to download data for {symbol}. '
                             f'Reason: {str(e)}')

        f[symbol] = hist_prices
        ret[symbol] = hist_prices['close'].pct_change()
        # risk_free_rate is annualized
        excess_return[symbol] = (ret[symbol] - (risk_free_rate / 252))

    # Create a new DataFrame based on the excess returns.
    df = DataFrame(excess_return).dropna()

    # Calculate the covariance and mean of the DataFrame
    C = 252 * df.cov()
    M = 252 * df.mean()

    # Calculate the Kelly-optimal leverages using matrix multiplication
    F = inv(C).dot(M)

    # Return a dict of (security, leverage) pairs
    return {security: leverage
            for security, leverage in zip(df.columns.values.tolist(), F)}
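# The heart of the calculation above is F = C^{-1} M: the inverse of the
# annualized covariance matrix times the annualized mean excess returns.
# A self-contained sketch with synthetic returns (no data download; the
# numbers are illustrative only):
import numpy as np
import pandas as pd
from numpy.linalg import inv

rng = np.random.default_rng(1)
excess = pd.DataFrame(rng.normal(4e-4, 0.01, size=(500, 2)), columns=['X', 'Y'])
C = 252 * excess.cov()
M = 252 * excess.mean()
F = inv(C).dot(M)  # Kelly-optimal leverage per security
print(dict(zip(excess.columns, F)))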
def test_ops(self):
    # test ops and reversed ops in evaluation
    # GH7198

    # smaller hits python, larger hits numexpr
    for n in [4, 4000]:
        df = DataFrame(1, index=range(n), columns=list('abcd'))
        df.iloc[0] = 2
        m = df.mean()

        for op_str, op, rop in [('+', '__add__', '__radd__'),
                                ('-', '__sub__', '__rsub__'),
                                ('*', '__mul__', '__rmul__'),
                                ('/', '__truediv__', '__rtruediv__')]:
            base = (DataFrame(np.tile(m.values, n)  # noqa
                              .reshape(n, -1),
                              columns=list('abcd')))
            expected = eval("base{op}df".format(op=op_str))

            # ops as strings
            result = eval("m{op}df".format(op=op_str))
            assert_frame_equal(result, expected)

            # these are commutative
            if op in ['+', '*']:
                result = getattr(df, op)(m)
                assert_frame_equal(result, expected)
            # these are not
            elif op in ['-', '/']:
                result = getattr(df, rop)(m)
                assert_frame_equal(result, expected)

    # GH7192
    df = DataFrame(dict(A=np.random.randn(25000)))
    df.iloc[0:5] = np.nan
    expected = (1 - np.isnan(df.iloc[0:25]))
    result = (1 - np.isnan(df)).iloc[0:25]
    assert_frame_equal(result, expected)
def cross_validate_trades(trades, N=20, subset_fraction=0.7):
    tickers = trades.tickers
    sample_size = round(len(tickers) * subset_fraction)
    summary = DataFrame(dtype=float)
    for n in range(N):
        sample_tickers = list(random.choice(tickers, sample_size, replace=False))
        trade_subset = trades.find(lambda T: T.ticker in sample_tickers)
        summary[n] = summary_report(trade_subset)
    result = DataFrame(dtype=float)
    result['Base'] = summary_report(trades)
    result['Mean'] = summary.mean(axis=1)
    result['Std'] = summary.std(axis=1)
    result['Median'] = summary.median(axis=1)
    result['Max'] = summary.max(axis=1)
    result['Min'] = summary.min(axis=1)
    return (result, summary)
def avg_medal_count():
    '''
    Using the dataframe's apply method, create a new Series called
    avg_medal_count that indicates the average number of gold, silver, and
    bronze medals earned amongst countries who earned at least one medal of
    any kind at the 2014 Sochi olympics. Note that the countries list already
    only includes countries that have earned at least one medal. No additional
    filtering is necessary.

    You do not need to call the function in your code when running it in the
    browser - the grader will do that automatically when you submit or test it.
    '''
    countries = ['Russian Fed.', 'Norway', 'Canada', 'United States',
                 'Netherlands', 'Germany', 'Switzerland', 'Belarus',
                 'Austria', 'France', 'Poland', 'China', 'Korea',
                 'Sweden', 'Czech Republic', 'Slovenia', 'Japan',
                 'Finland', 'Great Britain', 'Ukraine', 'Slovakia',
                 'Italy', 'Latvia', 'Australia', 'Croatia', 'Kazakhstan']

    gold = [13, 11, 10, 9, 8, 8, 6, 5, 4, 4, 4, 3, 3, 2, 2, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
    silver = [11, 5, 10, 7, 7, 6, 3, 0, 8, 4, 1, 4, 3, 7, 4, 2, 4, 3, 1, 0, 0, 2, 2, 2, 1, 0]
    bronze = [9, 10, 5, 12, 9, 5, 2, 1, 5, 7, 1, 2, 2, 6, 2, 4, 3, 1, 2, 1, 0, 6, 2, 1, 0, 1]

    olympic_medal_counts = {'country_name': countries,
                            'gold': Series(gold),
                            'silver': Series(silver),
                            'bronze': Series(bronze)}
    df = DataFrame(olympic_medal_counts)

    # YOUR CODE HERE
    # df['average_medal_count'] = df.mean(axis=1)
    # avg_medal_count_by_country = df[['country_name', 'average_medal_count']]
    avg_medal_count = df.mean()
    # Or, we could do it this way
    avg_medal_count = df[['gold', 'silver', 'bronze']].apply(numpy.mean)
    print(avg_medal_count)
    return avg_medal_count
class GetGenes(object):
    def __init__(self, data):
        self.dataframe = DataFrame(data)

    # Read a text file and return a data frame. Records should be separated
    # by TAB. There should not be duplicate column names.
    def import_file(self, filename):
        # convert numeric strings to float, leave everything else as-is
        def convert(x):
            try:
                x = float(x)
            except ValueError:
                pass
            return x

        table = []
        for line in open(filename):
            if line.strip():  # skip empty lines
                line = line.rstrip('\n').split('\t')
                line = list(map(convert, line))
                table.append(line)
        self.dataframe = DataFrame(table[1:], columns=table[0])
        return

    def houseKeepingGenes(self, geneNum):
        # compute the CV (coefficient of variation) of the data
        std = array(self.dataframe.std(axis=1))
        mean = array(self.dataframe.mean(axis=1))
        CV = std / mean
        CV = list(map(abs, CV))  # convert to positive numbers
        # get the first N minimum values
        mins = nsmallest(geneNum, CV)
        print("The GOOD genes are:\n")
        for item in mins:
            # .ix was removed from pandas; use .iloc for positional access.
            print(self.dataframe.iloc[CV.index(item)][0])
        return
## Axis indexes with duplicate values
obj = Series(range(5), index=['a', 'a', 'b', 'b', 'c'])
# Check whether the index is unique
obj.index.is_unique
# If an index label maps to multiple values, selecting it returns all of them.
obj['a']

## Summarizing and computing descriptive statistics
df = DataFrame([[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]],
               index=['a', 'b', 'c', 'd'], columns=['one', 'two'])
# Over columns
df.sum()
# Over rows
df.sum(axis=1)
# NA values are excluded by default; disable that with skipna
df.mean(axis=1, skipna=False)
# Index of the maximum value
df.idxmax()
# Cumulative sum
df.cumsum()
df.describe()
# Correlation coefficients
returns.MSFT.corr(returns.IBM)
returns.corr()
returns.cov()
returns.corrwith(returns.IBM)

## Unique values, value counts, and membership
obj = Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])
uniques = obj.unique()
# Count occurrences
def run(self, Model='svc', kernel='linear', is_cross_validation=True,
        cross_validationMethod='LOO', DecompositionMethod='PCA',
        decompositionLevel='FeatureType', n_components=30,
        FeatureSelection='TopExplainedVarianceComponents', n_features=10,
        isPerm=0, isBetweenSubjects=True, isConcatTwoLabels=False,
        isSaveCsv=None, isSavePickle=None, isSaveFig=None,
        isSelectSubFeatures=False, SubFeatures='ExpressionLevel'):
    # -- TODO:
    # -- # Greedy selection on features + other feature selection types...
    # -- # Make sure features are selected based on train data only!!!
    # -- # Keep a list of n_train, n_test from each Label and scoring (accuracy, f1..) in each cross validation iteration
    # -- # Plot results summary (see CARS paper for desired results for Ein Gedi Poster 22-1-2015)
    # -- # Remove irrelevant data using 'Tracking Success' and consider 'TimeStamps' for feature calculation
    # -- # Add feature analysis by facial part (see excel)
    # -- # Select best model (svm, otherwise ridge regression)
    # -- # Compare svc results with regression results (using LOO and different params for regression - params for unbalanced data, different kernels, etc.; model evaluation - http://scikit-learn.org/stable/modules/model_evaluation.html)
    # -- # Check how the model weights behave - feature selection analysis
    # -- # Calc model error
    # -- # Divide data into subparts for training and testing - try within/between subject, and analyze distribution of features when data is divided
    # -- # LOO - also on bool labels (patients vs controls and mental status bool)
    # -- # Add mental status rank scores (0-4)
    # -- # Make sure p-val returns the right value in 'scores'
    # -- # Run it over random data (permutation test)
    # -- # Continue here - check regression results - make sure regression works (not so good).. check what happens in svc for G7 (high train R, negative test R)

    ## init
    if isSelectSubFeatures:
        print('Features : ' + SubFeatures)
        f = self.FeaturesDF.copy()
        featureNames = self.FeaturesDF.index.names
        try:
            f = f.loc[SubFeatures]
            f.index = MultiIndex.from_product([[SubFeatures], f.index],
                                              names=featureNames)
        except KeyError:
            f.index = f.index.swaplevel(0, 1)
            f = f.loc[SubFeatures]
            f.index = MultiIndex.from_product([f.index, [SubFeatures]],
                                              names=featureNames)
        self.FeaturesDF = f.copy()
    else:
        SubFeatures = 'allFeatureTypes'
    FeatureTypeList = [j for j in tuple(self.FeaturesDF.index)]
    self.FullResults = DF()

    # set learning params (cross validation method, and model for learning)
    isBoolLabel = self.LabelsObject.isBoolLabel
    isBoolScores = isBoolLabel
    if DecompositionMethod is None and FeatureSelection in (
            'TopExplainedVarianceComponents', 'TopNComponents'):
        print("ERROR - feature selection method cannot be '" + FeatureSelection
              + "' when X is not decomposed")
        FeatureSelection = input("Choose a different feature selection method "
                                 "('RFE','f_regression','dPrime','AllFeatures'): ")
    model, isBoolModel = learningUtils.setModel(Model)
    selectFeatures = learningUtils.setFeatureSelection(FeatureSelection, n_features)
    # presumably min(n_components, n_features) was intended (the original
    # min(n_features, n_features) is a no-op) - a model cannot have more
    # components than features
    n_components = min(n_components, n_features)
    decompositionTitle, decomposeFunction = learningUtils.setDecomposition(
        DecompositionMethod, n_components, decompositionLevel)
    isDecompose = decompositionTitle != 'noDecomposition'

    # save learning params
    self.Learningdetails = {'Model': Model, 'Kernel': kernel,
                            'CrossVal': cross_validationMethod,
                            'FeatureSelection': FeatureSelection,
                            'Decomposition': decompositionTitle,
                            # dict.keys() is not indexable in Python 3
                            'LabelBy': list(self.Details['LabelDetails'].keys())[0],
                            'FeatureMethod': self.Details['FeatureMethod'],
                            'PieceLength': self.Details['PieceLength']}
    print('\n------------Learning Details------------')
    print(DF.from_dict(self.Learningdetails, orient='index'))
    print('\n----' + cross_validationMethod + ' Cross validation Results:----')

    # define global variables over modules (to be used in myUtils)
    globalVars.transformMargins = 0  # lambda x: x
    globalVars.isBoolLabel = isBoolLabel
    globalVars.isBoolModel = isBoolModel
    global trainLabels_all, testLabels_all, TrueLabels, isAddDroppedSubjects
    trainLabels_all, testLabels_all, TrueLabels, isAddDroppedSubjects = \
        labelUtils.initTrainTestLabels_all(self.LabelsObject)
    trainLabels_all2, testLabels_all2, TrueLabels2, isAddDroppedSubjects2 = \
        labelUtils.initTrainTestLabels_all(self.LabelsObject2)
    LabelingList = trainLabels_all.columns  # ['N1']
    self.ResultsDF = DF()
    self.BestFeatures = DF(columns=LabelingList)  # dict of BestFeaturesDF according to Labeling methods
    YpredictedOverAllLabels = pandas.Panel(
        items=range(len(trainLabels_all)), major_axis=LabelingList,
        minor_axis=TrueLabels.index)  # panel: items=cv_ind, major=labels, minor=#TODO

    ## Create train and test sets according to LabelBy, repeat learning each
    ## time on different Labels from LabelingList
    isMultivarLabels = False
    LabelingIndex = enumerate(LabelingList)
    if isMultivarLabels:
        LabelingIndex = enumerate([LabelingList])
    for label_ind, Labeling in LabelingIndex:
        """if isPerm:  # TODO - fix this to work with continuous / bool data
            try:
                trainLabels = self.LabelsObject.permedLabelsDF[Labeling]
            except AttributeError:
                self.LabelsObject.permLabels()
                trainLabels = self.LabelsObject.permedLabelsDF[Labeling]"""
        # set subjects list according to labels and features
        X, SubjectsList, droppedSubjects, Xdropped = featuresUtils.initX(
            self.FeaturesDF, trainLabels_all, Labeling)
        X2, SubjectsList2, droppedSubjects2, Xdropped2 = featuresUtils.initX(
            self.FeaturesDF, trainLabels_all2, Labeling, is2=1)

        # init train and test labels
        trainLabels, testLabels, LabelRange = labelUtils.initTrainTestLabels(
            Labeling, SubjectsList, trainLabels_all, testLabels_all)
        trainLabels2, testLabels2, LabelRange2 = labelUtils.initTrainTestLabels(
            Labeling, SubjectsList2, trainLabels_all2, testLabels_all2)

        # make sure only labeled subjects are used for classification
        X = X.query('subject == ' + str(list(trainLabels.index)))
        X.index.get_level_values(X.index.names[0])
        SubjectIndex = list(set(X.index.get_level_values('subject')))
        X2 = X2.query('subject == ' + str(list(trainLabels2.index)))
        X2.index.get_level_values(X2.index.names[0])
        SubjectIndex2 = list(set(X2.index.get_level_values('subject')))

        # init vars
        if isBetweenSubjects:
            cv_param = len(SubjectIndex)
            self.Learningdetails['CrossValSubjects'] = 'between'
            isWithinSubjects = False
        else:
            isWithinSubjects = True
            X = X.swaplevel(0, 1)
            PieceIndex = list(set(X.index.get_level_values('Piece_ind')))
            cv_param = len(PieceIndex)
            self.Learningdetails['CrossValSubjects'] = 'within'
        self.Learningdetails['NumOfFeatures'] = n_features
        try:
            print('\n**' + Labeling + '**')
        except TypeError:
            print('\n*******')
            print(Labeling)
        cv, crossValScores = learningUtils.setCrossValidation(
            cross_validationMethod, cv_param, trainLabels, isWithinSubjects)

        ## Learning - feature selection
        ## for different scoring types, with cross validation
        BestFeaturesForLabel = self.BestFeaturesForLabel(
            FeatureTypeList, LabelingList, n_features)  # saves dataframe with best features for each label, for later analysis
        cv_ind = 0  # used for transforming from margins returned from svm to continuous labels (e.g. PANSS)
        trainScores = DF()
        test_index = X.index
        testScores = concat([DF(index=test_index), DF(index=['std_train_err'])])
        testScores2 = concat([DF(index=testLabels.index), DF(index=['std_train_err'])])
        testProbas = DF(index=X.index)
        testProbas2 = DF(index=SubjectIndex)
        # impt = Imputer(missing_values='NaN', strategy='median', axis=0)
        globalVars.LabelRange = LabelRange
        ModelWeights1 = DF(columns=range(len(cv)), index=X.columns)
        Components = pandas.Panel(items=range(len(cv)), major_axis=X.columns,
                                  minor_axis=range(n_features))  # TODO fix this for 1st and 2nd learning
        ExplainedVar = DF(columns=range(len(cv)))
        ModelWeights2 = DF(columns=range(len(cv)))
        bestNfeaturesPanel = Panel(items=LabelingList, minor_axis=range(len(cv)),
                                   major_axis=range(n_features))
        # bestNfeaturesPanel = Panel(items=LabelingList, major_axis=range(len(cv)), minor_axis=MultiIndex.from_tuples(('a','b')))

        for train, test in cv:
            if not is_cross_validation:
                train = np.append(train, test)
                # test = np.append(train, test)
                self.Learningdetails['CrossVal'] = 'NONE'
            # if cv_ind > 0:
            #     break
            if isBetweenSubjects:
                # set X and Y
                train_subjects = trainLabels.iloc[train].index
                test_subjects = testLabels.iloc[test].index
                Xtrain, Xtest, Ytrain, YtrainTrue, Ytest = \
                    learningUtils.setXYTrainXYTest(
                        X, Labeling, trainLabels, testLabels, TrueLabels,
                        train_subjects, test_subjects)
                Xtrain2, Xtest2, Ytrain2, YtrainTrue2, Ytest2 = \
                    learningUtils.setXYTrainXYTest(
                        X2, Labeling, trainLabels2, testLabels2, TrueLabels2,
                        train_subjects, test_subjects)
                if isConcatTwoLabels:  # used when there is more than one doctor
                    Xtrain = concat([Xtrain, Xtrain2])
                    Xtest = concat([Xtest, Xtest2])
                    Ytrain = concat([Ytrain, Ytrain2])
                    YtrainTrue = concat([YtrainTrue, YtrainTrue2])
                    Ytest = concat([Ytest, Ytest2])
                    Xdropped = concat([Xdropped, Xdropped2])
                    SubjectsList = list(set(SubjectsList).intersection(set(SubjectsList2)))
                    # diff from SubjectsList to make sure no subjects are both
                    # in train and test
                    droppedSubjects = list(set(droppedSubjects)
                                           .union(set(droppedSubjects2))
                                           .difference(set(SubjectsList)))

                # select N best features:
                Xtrain, Xtest, bestNfeatures, components, explainedVar = \
                    learningUtils.decomposeAndSelectBestNfeatures(
                        Xtrain, Xtest, Ytrain, n_features, selectFeatures,
                        decomposeFunction)
                BestFeaturesForLabel.add(bestNfeatures)  # TODO - delete this??
                bestNfeaturesPanel[Labeling][cv_ind] = bestNfeatures
                """for feature_ind, feature_name in enumerate(bestNfeatures):
                    try:
                        bestNfeaturesPanel[Labeling][feature_name].loc[cv_ind] = feature_ind
                    except KeyError:
                        bestNfeaturesPanel[Labeling].columns = bestNfeaturesPanel[Labeling].columns.append(feature_name)  # continue here!!
                use bestNfeaturesPanel[Labeling][feature_name].loc[cv_ind]=feature_ind
                [bestNfeatures].iloc[cv_ind]=range(len(bestNfeatures))"""

                # train 1
                TrainModel = model
                TrainModel.fit(Xtrain.sort_index(), Ytrain.T.sort_index())
                """try:
                    #Components[cv_ind]=components.T
                    #ExplainedVar[cv_ind]=explainedVar
                    isDecompose=True"""
                if cv_ind == 0:
                    ModelWeights1 = DF(columns=range(len(cv)),
                                       index=range(len(bestNfeatures)))
                ModelWeights1[cv_ind] = TrainModel.coef_.flatten()

                # get ROC scores without cross validation:
                # train 2
                if isBoolLabel:
                    PiecePrediction_train = DF(TrainModel.predict_proba(Xtrain).T[1],
                                               index=Xtrain.index, columns=['prediction'])
                    TrainModel2 = svm.SVC(kernel='linear', probability=True,
                                          class_weight={0: 1, 1: 1})
                else:
                    PiecePrediction_train = DF(TrainModel.decision_function(Xtrain),
                                               index=Xtrain.index, columns=['prediction'])
                    TrainModel2 = linear_model.LinearRegression()
                Xtrain2, Ytrain2, YtrainTrue2 = learningUtils.getX2Y2(
                    Xtrain, Ytrain, YtrainTrue, PiecePrediction_train, isBoolLabel)
                TrainModel2.fit(Xtrain2, Ytrain2)
                if cv_ind == 0:
                    ModelWeights2 = DF(columns=range(len(cv)), index=Xtrain2.columns)
                ModelWeights2[cv_ind] = TrainModel2.coef_.flatten()

                # test 1
                if isAddDroppedSubjects:
                    # take test subjects from cv + subjects that were dropped
                    # for the labeling used for test
                    if isDecompose:
                        dXdropped = DF(decomposeFunc(Xdropped).values,
                                       index=Xdropped.index)
                    XtestDropped = dXdropped[bestNfeatures]
                    YtestDropped = Series(XtestDropped.copy().icol(0))
                    # YTrueDropped = Series(Xdropped.copy().icol(0))
                    for subject in droppedSubjects:
                        YtestDropped[subject] = testLabels_all[Labeling].loc[subject]
                        # YTrueAll.loc[subject] = TrueLabels[Labeling].loc[subject]
                    Ytest = concat([Ytest, YtestDropped]).sort_index()
                    Xtest = concat([Xtest, XtestDropped]).sort_index()
                if isPerm:  # TODO - check this!!
                    Ytest = y_perms.loc[Ytest.index]
                Xtest = Xtest.fillna(0.)
            elif isWithinSubjects:
                # train 1
                train_pieces = PieceIndex[train]
                test_pieces = PieceIndex[test]
                # TODO - make sure that if test/train > piece index, it is
                # ignored and the process repeated
                XtrainAllFeatures = X.query('Piece_ind == ' + str(list(train_pieces)))
                Ytrain = Series(index=X.index)
                Ytest = Series(index=X.index)
                YtrainTrue = Series(index=X.index)
                for subject in PieceIndex:
                    for piece in train_pieces:
                        Ytrain.loc[piece].loc[subject] = trainLabels[subject]
                        YtrainTrue.loc[piece].loc[subject] = TrueLabels[Labeling].loc[subject]
                        Ytest.loc[piece].loc[subject] = testLabels[subject]
                Ytrain = Ytrain.dropna()
                YtrainTrue = YtrainTrue.dropna()
                for subject in test_subjects:
                    Ytest.loc[piece].loc[subject] = testLabels[subject]

            # train scores 1
            if cv_ind == 0:
                trainScores, YtrainPredicted = learningUtils.getTrainScores(
                    Ytrain, Xtrain, YtrainTrue, TrainModel)
                plt.figure(1)
                if len(LabelingList) > 1:
                    plt.subplot(round(len(LabelingList) / 2), 2, label_ind + 1)
                if isBoolLabel:
                    testScores, testProbas = learningUtils.getTestScores(
                        Ytest, Xtest, TrainModel)
                else:
                    testScores[cv_ind], testProbas = learningUtils.getTestScores(
                        Ytest, Xtest, TrainModel)
                    plt.title(Labeling, fontsize=10)
            else:
                plt.figure(3)
                new_trainScores, YtrainPredicted = learningUtils.getTrainScores(
                    Ytrain, Xtrain, YtrainTrue, TrainModel)
                trainScores = concat([trainScores, new_trainScores], axis=1)
                # test 1
                testScores[cv_ind], testProbas_new = learningUtils.getTestScores(
                    Ytest, Xtest, TrainModel)
                testProbas = concat([testProbas, testProbas_new])

            # train 2
            if isBoolLabel:
                PiecePrediction_test = DF(TrainModel.predict_proba(Xtest).T[1],
                                          index=Xtest.index, columns=['prediction'])
            else:
                PiecePrediction_test = DF(TrainModel.decision_function(Xtest),
                                          index=Xtest.index, columns=['prediction'])
            Xtest2, Ytest2, YtestTrue2 = learningUtils.getX2Y2(
                Xtest, Ytest, Ytest, PiecePrediction_test, isBoolLabel)
            if cv_ind == 0:
                trainScores2, YtrainPredicted2 = learningUtils.getTrainScores(
                    Ytrain2, Xtrain2, YtrainTrue2, TrainModel2)
                YpredictedOverAllLabels[cv_ind].loc[Labeling] = YtrainPredicted2
                # plt.figure(1)
                # if len(LabelingList) > 1:
                #     plt.subplot(round(len(LabelingList)/2), 2, label_ind+1)
                # test 2
                if isBoolLabel:
                    testScores2, testProbas2 = learningUtils.getTestScores(
                        Ytest2, Xtest2, TrainModel2)
                else:
                    testScores2[cv_ind], testProbas2 = learningUtils.getTestScores(
                        Ytest2, Xtest2, TrainModel2)
                # plt.title(Labeling, fontsize=10)
            else:
                new_trainScores2, YtrainPredicted2 = learningUtils.getTrainScores(
                    Ytrain2, Xtrain2, YtrainTrue2, TrainModel2)
                YpredictedOverAllLabels[cv_ind].loc[Labeling] = YtrainPredicted2
                trainScores2 = concat([trainScores2, new_trainScores2], axis=1)
                if len(Xtest2) > 0:  # if there is more than one segment for subject
                    testScores2[cv_ind], testProbas2_new = learningUtils.getTestScores(
                        Ytest2, Xtest2, TrainModel2)
                    testProbas2 = concat([testProbas2, testProbas2_new])
            cv_ind += 1

        # crossValScores = crossValScores.append(CVscoresDF, ignore_index=True)
        # information about entire train-test data
        fig2 = plt.figure(2)
        if len(LabelingList) > 1:
            plt.subplot(round(len(LabelingList) / 2), 2, label_ind + 1)
        # if isAddDroppedSubjects:
        #     testLabelsSummary = testLabels_all[Labeling].loc[AllSubjects]
        # else:
        #     testLabelsSummary = testLabels
        scoresSummary, rocDF = learningUtils.getScoresSummary(
            trainScores2, testScores2, testProbas2, TrueLabels[Labeling])

        # reset global vars
        globalVars.fitYscale = 'notDefined'
        globalVars.beta = DF()
        plt.title(Labeling, fontsize=10)
        plt.xlabel('Ytrue', fontsize=8)
        plt.ylabel('Ypredicted', fontsize=8)
        plt.tick_params(labelsize=6)
        # print(crossValScores.T)
        scores = scoresSummary.fillna(0.)
        # analyze feature weights
        ModelWeights1 = ModelWeights1.dropna(how='all')
        # FeatureAnalysisIndex: 0 for featureType, 1 for AUs (if not
        # decomposed) or component rank (if decomposed)
        WeightedFeatures1_index0 = analysisUtils.getFeaturesWeights(
            0, bestNfeaturesPanel[Labeling], ModelWeights1)
        WeightedFeatures1_index1 = analysisUtils.getFeaturesWeights(
            1, bestNfeaturesPanel[Labeling], ModelWeights1)
        WeightedFeatures1 = concat([DF(index=['-------(A) Index0-------']),
                                    WeightedFeatures1_index0,
                                    DF(index=['-------(B) Index1 -------']),
                                    WeightedFeatures1_index1])
        WeightedFeatures2 = DF(ModelWeights2.mean(axis=1)).fillna(0)
        # WeightedFeatures2 = DF([ModelWeights2.mean(axis=1), ModelWeights2.std(axis=1)], index=['mean', 'std']).T.fillna(0)
        BestFeatures = concat([DF(index=['------------- Learning 1 -------------']),
                               WeightedFeatures1,
                               DF(index=['------------- Learning 2 -------------']),
                               WeightedFeatures2])
        self.BestFeatures[Labeling] = Series(BestFeatures.values.flatten(),
                                             index=BestFeatures.index)

        # analyze decomposition
        if isDecompose:
            Components_mean = Components.mean(axis=0)
            Components_std = Components.std(axis=0)
            normalize = lambda df: DF(StandardScaler().fit_transform(df.T).T,
                                      index=df.index, columns=df.columns)
            """#componentsMeanFeatureType=normalize(Components.mean(axis=1,level='FeatureType'))
            #componentsMeanFeatureTypeABS=normalize(componentsDF.abs().mean(axis=1,level='FeatureType'))
            #componentsMeanFSsignal=normalize(componentsDF.mean(axis=1,level='fs-signal'))
            #componentsMeanFSsignalABS=normalize(componentsDF.abs().mean(axis=1,level='fs-signal'))
            #ExplainedVar_mean = DF(ExplainedVar.mean(axis=1)).T #todo- check!
            #ExplainedVar_mean.index=['ExplainedVar_mean']
            #ExplainedVar_std = DF(ExplainedVar.std(axis=1)).T #todo- check!
            #ExplainedVar_std.index=['ExplainedVar_std']
            #componentsToCSV=concat([DF(index='---meanFeatureType----'),componentsMeanFeatureType,DF(index='---meanFeatureType - abs ----'),componentsMeanFeatureTypeABS,DF(index='---mean fs-signal ----'),componentsMeanFSsignal,DF(index='---mean fs-signal - abs ----'),componentsMeanFSsignalABS])
            try:
                self.LabelComponents[Labeling]=concat([DF(index=['---components mean---']),Components_mean,ExplainedVar_mean,DF(index=['---components std over cross validation---']),Components_std,ExplainedVar_std])
            except AttributeError:
                self.LabelComponents=dict.fromkeys(LabelingList)
                self.LabelComponents[Labeling]=concat([DF(index=['---components mean---']),Components_mean,ExplainedVar_mean,DF(index=['---components std over cross validation---']),Components_std,ExplainedVar_std])"""
            """print(Components_mean)
            print(ExplainedVar_mean)
            print(WeightedFeatures1)"""

        # BestFeaturesForLabel.analyze(ByLevel=0)  # TODO change to regression coeff
        LabelFullResults = concat([DF(index=[Labeling]), scores])
        self.FullResults = concat([self.FullResults, LabelFullResults])
        self.ResultsDF = concat([self.ResultsDF,
                                 DF(scores[0], columns=[Labeling])], axis=1)
        # self.BestFeatures[Labeling] = BestFeaturesForLabel.WeightedMean
        # plt.savefig('C:\\Users\\taliat01\\Desktop\\TALIA\\Code-Python\\Results\\'+Labeling+'png')

    testScores3 = pandas.Panel(items=range(len(X2.index)))  # for each cv score...
    FullSubjectsList = YpredictedOverAllLabels[0].columns
    YdroppNans = YpredictedOverAllLabels.dropna(axis=0, how='all')
    YdroppNans = YdroppNans.dropna(axis=1, how='all')
    YpredictedOverAllLabels = YdroppNans.dropna(axis=2, how='all')
    notNans_cv_ind = YpredictedOverAllLabels.items
    notNans_trainSubjects = YpredictedOverAllLabels.minor_axis
    notNans_LabelsList = YpredictedOverAllLabels.major_axis
    notNans_TrueLabels = TrueLabels.T[notNans_trainSubjects].loc[notNans_LabelsList]
    cv_ind = 0
    for train, test in cv:
        if cv_ind in notNans_cv_ind:
            print(test)
            train = list(set(FullSubjectsList[train]).intersection(set(notNans_trainSubjects)))
            test = list(set(FullSubjectsList[test]).intersection(set(notNans_trainSubjects)))
            if len(train) > 0 and len(test) > 0:
                AllLabelsYTrainPredicted = YpredictedOverAllLabels[cv_ind][train]
                AllLabelsYTrainPredicted = AllLabelsYTrainPredicted.fillna(0)
                AllLabelsYTrainTrue = notNans_TrueLabels[train]
                AllLabelsYTestPredicted = YpredictedOverAllLabels[cv_ind][test]
                AllLabelsYTestTrue = notNans_TrueLabels[test]
                pseudoInverse_AllLabelsYTrainTrue = DF(
                    np.linalg.pinv(AllLabelsYTrainTrue),
                    columns=AllLabelsYTrainTrue.index,
                    index=AllLabelsYTrainTrue.columns)
                global AllLabelsTransformationMatrix
                AllLabelsTransformationMatrix = DF(
                    AllLabelsYTrainPredicted.dot(pseudoInverse_AllLabelsYTrainTrue),
                    columns=pseudoInverse_AllLabelsYTrainTrue.columns)  # change to real code!!
                TrainModel3 = lambda y: y.T.dot(AllLabelsTransformationMatrix)
                # testScores3[cv_ind] = learningUtils.getTestScores(AllLabelsYTrainTrue, AllLabelsYTrainPredicted, TrainModel3)
        cv_ind += 1
    self.BestNFeaturesAll = bestNfeaturesPanel
    self.ResultsDF = self.ResultsDF.fillna(0.)

    ## Print and save results
    print('\n')
    print(self.ResultsDF)
    print('\n')
    D = self.Learningdetails
    savePath = resultsPath + '\\' + D['Model'] + '_' + D['CrossVal'] \
        + '_LabelBy' + D['LabelBy'] + '_FSelection' + FeatureSelection \
        + '_Decompostion' + D['Decomposition'] + 'PieceSize' + D['PieceLength'] \
        + '_' + SubFeatures
    if isPerm:
        savePath = savePath + '_PERMStest'
    saveName = savePath + '\\' + str(n_features) + '_features'
    self.Learningdetails['saveDir'] = savePath
    dir = os.path.dirname(saveName)
    if not os.path.exists(dir):
        os.makedirs(dir)
    # raw_input was renamed to input in Python 3
    if isSavePickle is None:
        isSavePickle = int(input('Save Results to pickle? '))
    if isSaveCsv is None:
        isSaveCsv = int(input('save Results to csv? '))
    if isSaveFig is None:
        isSaveFig = int(input('save Results to figure? '))
    if isSavePickle:
        self.ResultsDF.to_pickle(saveName + '.pickle')
        self.BestFeatures.to_pickle(saveName + '_bestFeatures.pickle')
    if isSaveCsv:
        DetailsDF = DF.from_dict(self.Learningdetails, orient='index')
        ResultsCSV = concat([self.ResultsDF,
                             DF(index=['-------Label Details-------']), self.N,
                             DF(index=['-------Learning Details-------']), DetailsDF,
                             DF(index=['-------Selected Features Analysis------']),
                             self.BestFeatures])
        ResultsCSV.to_csv(saveName + '.csv')
    if isBoolLabel:
        ROCfig = learningUtils.save_plotROC(rocDF, isSave=True,
                                            saveName=saveName, title=SubFeatures)
    if isSaveCsv or isSavePickle:
        print('successfully saved as:\n' + saveName)
    if isSaveFig:
        plt.figure(1)
        plt.savefig(saveName + 'Train.png')
        plt.figure(2)
        plt.savefig(saveName + 'Test.png')
    plt.close()
    plt.close()
sys.stdout = Logger(title)
if t == 2:
    authors = y_options.get(0)[1]
    recipients = y_options.get(1)[1]
    accuracies = []
    results = {}
    clf, author_score = load_classifier(clfs.get(7), X, y_options.get(0))
    for a in np.unique(authors):
        s_targets = list(compress(recipients, authors == a))
        s_data = list(compress(X, authors == a))
        _, score = classify(clfs.get(c),
                            ("Recipients of {}".format(class_labels[a]), s_targets),
                            s_data)
        accuracies.append(np.mean(score))
        results[a] = dict(zip(np.unique(s_targets), np.atleast_1d(score)))
    df = DataFrame(results).T
    plot_accuracy_matrix(df, class_labels[np.unique(recipients)],
                         class_labels[np.unique(authors)], title)
    df = df.T.fillna(df.mean(axis=1)).T
    plot_accuracy_matrix(df, class_labels[np.unique(recipients)],
                         class_labels[np.unique(authors)], title + " (filled)")
    print("##################################")
    print("Mean Recipient Score: {}".format(np.mean(accuracies)))
    print("##################################")
    print("FINAL SCORE: {}".format(np.mean(author_score * np.mean(accuracies))))
    print("##################################")
else:
    classify(clfs.get(c), y, X, save=save_results if t == 0 else False)
# reductions or summary statistics
df = DataFrame([[1.4, np.nan], [7.1, -4.5],
                [np.nan, np.nan], [0.75, -1.3]],
               index=['a', 'b', 'c', 'd'],
               columns=['one', 'two'])
df
df.sum()        # column sums
df.sum(axis=1)  # sum row by row
df
(7.10 - 4.5) / 2
df.mean(axis=1, skipna=False)
df
df.idxmax()
df
df.cumsum()     # accumulation
df.describe()   # multiple summary statistics in one shot

obj = Series(['a', 'a', 'b', 'c'] * 4)
obj
obj.describe()

## Correlation and Covariance
import pandas.io.data as web

all_data = {}
for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']:
    all_data[ticker] = web.get_data_yahoo(ticker, '1/1/2000', '1/1/2010')
price = DataFrame({tic: data['Adj Close']
                   for tic, data in all_data.items()})
BPbin = []
for y in frame2.sbp:
    BPbin.append(int((y - min(frame2.sbp)) / 3))
frame2['bin'] = BPbin
frame2 = frame2[:(len(frame2)) - 2]  # removes trailing incomplete cardiac cycle
print(frame2)

groupedRR = frame2['RR'].groupby(frame2['bin'])
RRarray = groupedRR.mean()
groupedSBP = frame2['sbp'].groupby(frame2['bin'])
SBParray = np.asarray(groupedSBP.mean())
print(SBParray)
bin_weight = groupedSBP.size() / frame2['hb'].max()
frame3 = frame2.mean()

# linear regression: RR vs SBP
slope, intercept, r_value, p_value, std_err = linregress(SBParray, RRarray)
frame3['BRS slope'] = slope
frame3['R^2'] = r_value**2
print(frame3)
# use the fitted slope rather than the hard-coded 0.012020 of the original
bestfit = [(i * slope) + intercept for i in SBParray]

# plots
fig = plt.figure()
# ECG plot
ax1 = fig.add_subplot(2, 1, 1)
# 'Panel' objects are 3D.
wp = Panel({'Item1': DataFrame(randn(4, 3)),
            'Item2': DataFrame(randn(4, 2))})
pprint(wp)
# There are also 'TimeSeries', 'SparseSeries', and 'SparsePanel' objects.
# In newer versions, there is experimental support for higher-dimensional
# panels.

# Stats can also be performed on Pandas objects.
df = DataFrame(randn(6, 4), columns=['A', 'B', 'C', 'D'])
pprint(df)
# You can choose which axis number to perform the operation along.
pprint(df.mean(0))
pprint(df.mean(1))
# Much more to Pandas, but that's the basic idea.
# For more information, see:
#   http://pandas.pydata.org/pandas-docs/stable/index.html
# Also, definitely have a look at StatsModels:
#   http://statsmodels.sourceforge.net/
#   http://statsmodels.sourceforge.net/stable/
# <demo> --- stop ---
# Returns all repos data from a given user
def getRepos(user):
    myrepos = requests.get(
        "https://api.github.com/users/" + user + "/repos",
        headers={'Authorization': 'token 5218551eb082bffa572318de0c2de10d255170b1'}
    ).json()
    return myrepos

# Getting number of stars
data = DataFrame()
i = 0
for user in topGitUsers:
    userRepos = getRepos(user)
    i += 1
    print(i)  # check progress
    if len(userRepos) > 0:
        stars = []
        listUserStars = [('', 0)]
        for repo in userRepos:
            stars.append(repo['stargazers_count'])
        userStars = DataFrame(stars)
        userMeanSt = userStars.mean(axis=0)
        listUserStars.append((user, userMeanSt))
        result = DataFrame({'userId': user, 'Mean of stars': userMeanSt})
        # DataFrame.append was removed from pandas; use pd.concat instead.
        data = pd.concat([data, result])
    else:
        print(user + ': No repos found for this user')
data.to_csv('gitTopUsersMean.csv')
def main():
    """
    Calculation and aggregation of summary statistics
    """
    # Summary of statistics (the return value is not an ndarray)
    df = DataFrame([[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]],
                   index=list('abcd'), columns=['one', 'two'])
    print(df)
    print(df.sum())
    print(df.sum(axis=1))
    print(df.mean(axis=1))  # excludes NaN
    print(df.mean(axis=1, skipna=False))
    print(df.idxmin())
    print(df.idxmax())
    print(df.cumsum())
    print(df.describe())

    # values that are not numbers
    obj = Series(list('aabc') * 4)
    print(obj.describe())

    methods = ['count', 'min', 'max',
               # 'argmin', 'argmax',
               'quantile', 'median', 'mad', 'var', 'std', 'skew', 'kurt',
               'cummin', 'cummax', 'cumprod', 'diff', 'pct_change']
    for method in methods:
        print(u'「{0}」'.format(method))
        print(getattr(df, method)())
        print('')

    # Correlation and covariance
    all_data = {}
    lst = []  # ['AAPL', 'IBM', 'MSFT']  # , 'GOOG']
    for ticket in lst:
        # IOError: after 3 tries, Yahoo! did not return a 200 for url
        # 'http://ichart.finance.yahoo.com/table.csv?s=GOOG&a=0&b=1&c=2000&d=0&e=1&f=2010&g=d&ignore=.csv'
        all_data[ticket] = pd.io.data.get_data_yahoo(ticket, '1/1/2000', '1/1/2010')
    price = DataFrame({tic: data['Adj Close']
                       for tic, data in all_data.items()})
    volume = DataFrame({tic: data['Volume']
                        for tic, data in all_data.items()})
    if all_data:
        returns = price.pct_change()
        print(returns.tail())
        print('')
        print(returns.MSFT.corr(returns.IBM))
        print(returns.MSFT.cov(returns.IBM))
        print('')
        print(returns.corr())
        print(returns.cov())
        print('')
        print(returns.corrwith(returns.IBM))
        print(returns.corrwith(volume))

    # unique values, frequencies, membership
    print('', '')
    obj = Series(list('cadaabbcc'))
    uniques = obj.unique()
    print(uniques)
    print(obj.value_counts())
    print(pd.value_counts(obj.values, sort=False))
    mask = obj.isin(['b', 'c'])
    print(mask)
    print(obj[mask])
    data = DataFrame({
        'Qu1': [1, 3, 4, 3, 4],
        'Qu2': [2, 3, 1, 2, 3],
        'Qu3': [1, 5, 2, 4, 4],
    })
    print(data)
    print(data.apply(pd.value_counts).fillna(0))
if False:
    kw = dict(method='time')
    df = df.reindex(index).interpolate(**kw).ix[index]
dfs.update({model: df})

dfs = Panel.fromDict(dfs).swapaxes(0, 2)

# In[ ]:

from pandas import DataFrame

means = dict()
for station, df in dfs.iteritems():
    df.dropna(axis=1, how='all', inplace=True)
    mean = df.mean()
    df = df - mean + mean['OBS_DATA']
    means.update({station: mean['OBS_DATA'] - mean.drop('OBS_DATA')})

bias = DataFrame.from_dict(means).dropna(axis=1, how='all')
bias = bias.applymap('{:.2f}'.format).replace('nan', '--')

columns = dict()
[columns.update({station: get_coops_longname(station)})
 for station in bias.columns.values]
bias.rename(columns=columns, inplace=True)
to_html(bias.T, 'style.css')
import numpy as np
from pandas import DataFrame, Series
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats

df = DataFrame(abs(np.random.randn(30).reshape(6, 5)) * 100)
plt.bar(np.arange(len(df.mean())), df.mean(),
        align='center', color='white', linewidth=1.5)
# plt.hold was removed from matplotlib; repeated plotting calls now draw on
# the same axes by default. fmt=None is also no longer accepted - use 'none'.
plt.errorbar(np.arange(len(df.mean())), df.mean(), df.std(),
             elinewidth=1.2, capsize=7.5, fmt='none')
plt.show()