def sd_ratio(df1: pd.DataFrame, df2: pd.DataFrame, robust: bool = False, fill_value: Optional[float] = None) -> pd.Series:
    """
    Computes the ratio between the standard deviation of the columns of
    DataFrame1 and DataFrame2. Used to compute the D-Ratio metric. NaN values
    are replaced with `fill_value` if it is provided.

    Parameters
    ----------
    df1 : DataFrame with shape (n1, m)
    df2 : DataFrame with shape (n2, m)
    robust : bool
        If True, uses the MAD as an estimator of the standard deviation.
        Otherwise, computes the sample standard deviation.
    fill_value : Number used to impute NaNs.

    Returns
    -------
    ratio : pd.Series
    """
    if robust:
        ratio = mad(df1) / mad(df2)
    else:
        ratio = df1.std() / df2.std()
    if fill_value is not None:
        ratio = ratio.fillna(fill_value)
    return ratio
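# A minimal usage sketch of the non-robust branch above. The toy DataFrames and
# the np.inf sentinel are illustrative assumptions, not part of the original code.
import numpy as np
import pandas as pd

df1 = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [2.0, 2.0, 2.0]})
df2 = pd.DataFrame({"a": [1.0, 1.2, 0.8, 1.1], "b": [5.0, 5.0, 5.0, 5.0]})

ratio = df1.std() / df2.std()   # column-wise ratio of sample standard deviations
ratio = ratio.fillna(np.inf)    # a 0/0 column yields NaN; fill it as fill_value would
print(ratio)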
def attributes_sanity_check(df: pd.DataFrame):
    """Utility function to check if the standard deviation of one (or more) attributes is zero.

    An attribute with a standard deviation of zero would lead to NaNs when normalizing the
    features and thus to NaNs when training the model. The function raises a `RuntimeError`
    if one or more such attributes are detected and lists the corresponding attribute names
    in the error message.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame of catchment attributes as columns.

    Raises
    ------
    RuntimeError
        If one or more attributes have a standard deviation of zero.
    """
    # Collect attributes whose standard deviation is zero or NaN
    attributes = []
    if any(df.std() == 0.0) or any(df.std().isnull()):
        for k, v in df.std().iteritems():
            if (v == 0) or (np.isnan(v)):
                attributes.append(k)

    if attributes:
        msg = [
            "The following attributes have a std of zero or NaN, which results in NaNs ",
            "when normalizing the features. Remove the attributes from the attribute feature list ",
            "and restart the run.\n",
            f"Attributes: {attributes}"
        ]
        raise RuntimeError("".join(msg))
def scale(df: pd.DataFrame, method: str) -> pd.DataFrame: """ scales features using different methods. Parameters ---------- df: pandas.DataFrame method: {"autoscaling", "rescaling", "pareto"} Scaling method. `autoscaling` performs mean centering scaling of features to unitary variance. `rescaling` scales data to a 0-1 range. `pareto` performs mean centering and scaling using the square root of the standard deviation Returns ------- scaled: pandas.DataFrame """ if method == "autoscaling": scaled = (df - df.mean()) / df.std() elif method == "rescaling": scaled = (df - df.min()) / (df.max() - df.min()) elif method == "pareto": scaled = (df - df.mean()) / df.std().apply(np.sqrt) else: msg = "Available methods are `autoscaling`, `rescaling` and `pareto`." raise ValueError(msg) # replace nans generated when dividing by zero scaled[scaled.isna()] = 0 return scaled
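# A short sketch of the three scaling modes on a toy frame (the data is an
# illustrative assumption). A constant column produces NaNs, which the function
# above resets to 0 after scaling.
import numpy as np
import pandas as pd

df = pd.DataFrame({"x": [1.0, 2.0, 3.0, 4.0], "y": [10.0, 10.0, 10.0, 10.0]})

autoscaled = (df - df.mean()) / df.std()               # zero mean, unit variance
rescaled = (df - df.min()) / (df.max() - df.min())     # 0-1 range
pareto = (df - df.mean()) / df.std().apply(np.sqrt)    # mean centering, sqrt(std) scaling

autoscaled[autoscaled.isna()] = 0                      # constant column "y" becomes all zeros
print(autoscaled)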
def stats_nodules_deca(values_dic):
    """
    Function that calculates the extremes, mean and std of several metrics given by values_dic

    Parameters:
        values_dic (dictionary):
            volume - list with the volumes of the nodules
            volume% - list with the percentage volumes of the nodules
            larg_diam - list with the largest diameters of the nodules
            Feret diam - list with the Feret diameters of the nodules

    Returns:
        stats_dic_out (dictionary):
            'volume' : mean and std of the volume
            'volume extremes' : largest and smallest value of the volume
            'volume%' : mean and std of the volume percentage
            'larg_diam' : mean and std of the largest diameters
            'larg_diam_extremes' : largest and smallest value of the diameters
            'Feret diam' : mean and std of the Feret diameters
            'Feret diam extremes' : largest and smallest value of the Feret diameters
    """
    # ------- Calculate stats
    stats_dic_out = {'volume': [0, 0],               # mean first, std second
                     'volume extremes': [0, 0],      # largest and smallest
                     'volume%': [0, 0],              # mean first, std second
                     'larg_diam': [0, 0],            # mean first, std second
                     'larg_diam_extremes': [0, 0],   # largest and smallest
                     'Feret diam': [0, 0],           # mean first, std second
                     'Feret diam extremes': [0, 0]}  # largest and smallest

    stats_dic_out['volume'][0] = pd.Series(values_dic['volume']).mean()
    stats_dic_out['volume'][1] = pd.Series(values_dic['volume']).std()
    stats_dic_out['volume extremes'][0] = max(values_dic['volume'])
    stats_dic_out['volume extremes'][1] = min(values_dic['volume'])

    stats_dic_out['volume%'][0] = pd.Series(values_dic['volume%']).mean()
    stats_dic_out['volume%'][1] = pd.Series(values_dic['volume%']).std()

    stats_dic_out['larg_diam'][0] = pd.Series(values_dic['larg_diam']).mean()
    stats_dic_out['larg_diam'][1] = pd.Series(values_dic['larg_diam']).std()
    stats_dic_out['larg_diam_extremes'][0] = max(values_dic['larg_diam'])
    stats_dic_out['larg_diam_extremes'][1] = min(values_dic['larg_diam'])

    stats_dic_out['Feret diam'][0] = pd.Series(values_dic['Feret diam']).mean()
    stats_dic_out['Feret diam'][1] = pd.Series(values_dic['Feret diam']).std()
    stats_dic_out['Feret diam extremes'][0] = max(values_dic['Feret diam'])
    stats_dic_out['Feret diam extremes'][1] = min(values_dic['Feret diam'])

    return stats_dic_out
def standardize_features(train: pd.DataFrame, test: pd.DataFrame) -> (pd.DataFrame, pd.DataFrame):
    """
    Standardize all features except the label to mean 0 and variance 1. The test set is
    processed with the mean and std of the training set.

    :param train: feature array of the training set, from which the mean and std are estimated
    :param test: feature array of the test set
    :return: tuple of the standardized train and test arrays
    """
    train_std = (train - train.mean()) / train.std()
    test_std = (test - train.mean()) / train.std()
    return train_std, test_std
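# A hedged usage sketch: the toy train/test frames below are assumptions for illustration.
# The key point is that the test split reuses the training mean and std, avoiding leakage.
import pandas as pd

train = pd.DataFrame({"f1": [1.0, 2.0, 3.0, 4.0], "f2": [10.0, 20.0, 30.0, 40.0]})
test = pd.DataFrame({"f1": [2.5, 3.5], "f2": [15.0, 25.0]})

train_std, test_std = standardize_features(train, test)
print(train_std.mean())  # ~0 for every column
print(train_std.std())   # ~1 for every column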
def compare(df1: pd.DataFrame, df2: pd.DataFrame):
    """Compare the values of two DataFrames with the same columns in several ways and return the results as a DataFrame."""
    assert (df1.columns == df2.columns).all()
    std = (df1.std() + df2.std()) / 2
    df_result = pd.DataFrame(index=df1.columns)
    df_result["mean_ae/std"] = np.abs(df1.mean() - df2.mean()) / std
    df_result["median_ae/std"] = np.abs(df1.median() - df2.median()) / std
    df_result["mode1"] = df1.mode().transpose()[0]
    df_result["mode2"] = df2.mode().transpose()[0]
    df_result = df_result.sort_values("median_ae/std", ascending=False)
    return df_result
def _create_summary(data: pd.DataFrame, original): summary = pd.DataFrame(0, index=data.columns, columns=[ "original", "mean", "std.error", "perc.025", "perc.975", "t stat." ]) summary.loc[:, "mean"] = data.mean(axis=0) summary.loc[:, "std.error"] = data.std(axis=0) summary.loc[:, "perc.025"] = data.quantile(0.025, axis=0) summary.loc[:, "perc.975"] = data.quantile(0.975, axis=0) summary.loc[:, "original"] = original summary.loc[:, "t stat."] = original / data.std(axis=0) return summary
def test_std_timedelta64_skipna_false(self): # GH#37392 tdi = pd.timedelta_range("1 Day", periods=10) df = DataFrame({"A": tdi, "B": tdi}) df.iloc[-2, -1] = pd.NaT result = df.std(skipna=False) expected = Series([df["A"].std(), pd.NaT], index=["A", "B"], dtype="timedelta64[ns]") tm.assert_series_equal(result, expected) result = df.std(axis=1, skipna=False) expected = Series([pd.Timedelta(0)] * 8 + [pd.NaT, pd.Timedelta(0)]) tm.assert_series_equal(result, expected)
def standardize(
        data: pd.DataFrame) -> Tuple[pd.DataFrame, Tuple[float, float]]:
    """
    Applies standardization to input data. The result has mean zero and
    standard deviation one.

    Args
    ----
    data: pd.DataFrame

    Returns
    -------
    Tuple[pd.DataFrame, Tuple[float, float]]
        data: pd.DataFrame
            standardized data with zero mean and std of one.
        Tuple[float, float]
            mean and standard deviation used on each column of the input data
            to perform the standardization. These values can be used to
            recover the original dataframe.

    Raises
    ------
    ValueError: if data has only one row.
    """
    if data.shape[0] == 1:
        raise ValueError('Input data must have more than one row')
    mu = data.mean(skipna=True)
    std = data.std(skipna=True, ddof=0)
    data = (data - mu) / std.fillna(1)
    return data, (mu, std)
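# A brief sketch of undoing the transform with the returned (mu, std) pair; the
# toy frame is an assumption. Note std was computed with ddof=0, so the same
# population std is reused when reconstructing.
import pandas as pd

raw = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [10.0, 20.0, 30.0]})
scaled, (mu, std) = standardize(raw)
restored = scaled * std.fillna(1) + mu     # inverts the standardization
print((restored - raw).abs().max())        # ~0 up to floating-point error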
def analyze(df: pd.DataFrame):
    """Analyze the contents of a DataFrame and return a summary DataFrame."""
    if isinstance(df, pd.DataFrame):
        df_result = pd.DataFrame(index=df.columns)
        df_result["dtype"] = df.dtypes
        df_result["null"] = df.isnull().sum()
        df_result["nunique"] = df.nunique()
        df_result["min"] = df.min()
        df_result["median"] = df.median()
        df_result["max"] = df.max()
        df_result["mode"] = df.mode().transpose()[0]
        df_result["mean"] = df.mean()
        df_result["std"] = df.std()
        # # To gauge how extreme the outliers are, look at the absolute values after RobustScaler.
        # numeric_columns = df.select_dtypes(include=np.number).columns
        # df_result["outlier_size"] = np.nan
        # df_result.loc[numeric_columns, "outlier_size"] = (
        #     tk.preprocessing.SafeRobustScaler(clip_range=None)
        #     .fit_transform(df.loc[:, numeric_columns])
        #     .fillna(0)
        #     .abs()
        #     .max()
        #     .round(decimals=1)
        # )
        return df_result
    else:
        raise NotImplementedError()
def kurtosis(path, columns):
    # Read the selected columns and compute the sample (excess) kurtosis
    w = pd.read_csv(path, usecols=columns)
    frame = DataFrame(w)
    h = len(w)
    print(h)
    t = frame.mean()
    d = frame.std()
    e = ((w - t) / d) ** 4
    g = e.sum()
    p1 = h * (h + 1)
    p2 = float((h - 1) * (h - 2) * (h - 3))
    p3 = float(3 * ((h - 1) ** 2))
    p4 = (h - 2) * (h - 3)
    i = (((p1 / p2) * g) - (p3 / p4))
    print('kurtosis=', i)
def construct_portfolio(self):
    """
    :return:
    """
    pre_date_data = w.tdaysoffset(-self.window, self.date, "Period=M")
    pre_date = pre_date_data.Data[0][0].strftime("%Y-%m-%d")
    tradedays_data = w.tdays(pre_date, self.date, "Period=M")
    tradedayslist = tradedays_data[0]
    tradedays = [td.strftime("%Y-%m-%d") for td in tradedayslist]
    # Extract the factor data
    style_return = DataFrame()
    for f in self.factors:
        f_data = []
        for dt in tradedays:
            stockcodes = StockPool(dt).select_stock()
            f_data = f(dt, stockcodes).getdata()
            f_ret = FactorProcess.get_alpha(stockcodes, dt, -1)  # take one month of alpha
            df = DataFrame(data=[f_data, f_ret], columns=[f.windLabel, 'ret'])
            long_only, long_short = FactorStyle.compute_style_return_month(
                df, f.windLabel)
            f_data.append(long_only)
        style_return[f.windLabel] = f_data

    style_sigma = style_return.std()
    weight = style_sigma / self.target
    weight[weight > 1] = 1
    weight = weight / len(self.factors)
    return weight
def summarize_he( analytical_sets ): results = {} he = {} for analytical_set in analytical_sets: he[analytical_set.label] = calculate_he(analytical_set.allele_df) he_df = DataFrame( he ) labels = list(he_df.columns) if len(labels) == 2: # use Mann-Whitney / Wilcoxon test results['test'] = 'Wilcoxon test (paired)' results['stats'] = wilcoxon( he_df[labels[0]], he_df[labels[1]]) elif len(labels) > 2: # use Kruskal Wallis results['test'] = 'Kruskal-Wallis test' results['stats'] = kruskal( * [he_df[x] for x in labels]) results['warning'] = '' results['data'] = he_df results['mean'] = he_df.mean() results['stddev'] = he_df.std() #raise RuntimeError return results
def perform_pca(returns_df: pd.DataFrame, n):
    '''
    Uses the sklearn library to perform principal components analysis on the stocks.
    '''
    stocks = returns_df.columns
    stdev_returns = returns_df.std(ddof=1, axis=0)
    # Standardise the data
    scaled = StandardScaler().fit_transform(returns_df)
    # Conduct principal components analysis and project onto the principal components
    pca = PCA(n_components=len(returns_df.columns))
    print(scaled.shape)
    transformed = pca.fit_transform(scaled)
    eigenvalues = pca.explained_variance_
    pc_df = pd.DataFrame(pca.components_,
                         columns=['PC{}'.format(i) for i in range(1, len(returns_df.columns) + 1)],
                         index=returns_df.columns)
    # Divide rows by the STDEV of each coin's returns to get eigenportfolio weights
    eigenportfolios = pc_df.div(stdev_returns, axis=0)
    # The columns of 'eigenportfolios' are the relevant eigenportfolios.
    # Add the returns of the first n eigenportfolios to the returns dataframe.
    for i in range(1, n + 1):
        pc = 'PC{}'.format(i)
        returns_df[pc] = np.sum(returns_df[stocks].multiply(
            eigenportfolios[pc].to_list()), axis=1) / np.sqrt(eigenvalues[i - 1])
    return returns_df
def skewness(path, columns):
    # Read the selected columns and compute the sample skewness
    w = pd.read_csv(path, usecols=columns)
    frame = DataFrame(w)
    h = len(w)
    t = frame.mean()
    d = frame.std()
    e = ((w - t) / d) ** 3
    g = e.sum()
    i = (h * g) / ((h - 1) * (h - 2))
    print('skewness=', i)
def vars_gaussian(rets: pd.DataFrame, modified: bool = False) -> dict:
    """Returns a dictionary with the 4 parametric VaRs of the returns dataframe
    'rets': 95%, 97%, 99% and 99.9%. If modified=True, the skewness and kurtosis
    of 'rets' are taken into account via the Cornish-Fisher correction.

    Args:
        rets (pd.DataFrame): dataframe of returns.

    Returns:
        dict: {95: ..., 97: ..., 99: ..., 99.9: ...}
    """
    lvls = (95, 97, 99, 99.9)
    # z-scores
    zs = [norm.ppf(1 - lvl / 100) for lvl in lvls]
    if modified:
        s, k = skew(rets), kurtosis(rets)
        zs = [cornish_fisher_z(z, s, k) for z in zs]
    vol = rets.std()
    var = {v[0]: (rets.mean() + v[1] * vol)[0] for v in zip(lvls, zs)}
    return var
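# A hedged sketch of the plain Gaussian (non-modified) branch: VaR_alpha = mean + z_alpha * std,
# with z_alpha = norm.ppf(1 - alpha). The simulated returns below are an assumption.
import numpy as np
import pandas as pd
from scipy.stats import norm

rng = np.random.default_rng(0)
rets = pd.DataFrame({"asset": rng.normal(0.0005, 0.02, 1000)})

z95 = norm.ppf(1 - 0.95)                            # about -1.645
var95 = (rets.mean() + z95 * rets.std()).iloc[0]    # a negative value, i.e. a loss threshold
print(var95)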
def construct_portfolio(self):
    """
    :return:
    """
    pre_date_data = w.tdaysoffset(-self.window, self.date, "Period=M")
    pre_date = pre_date_data.Data[0][0].strftime("%Y-%m-%d")
    tradedays_data = w.tdays(pre_date, self.date, "Period=M")
    tradedayslist = tradedays_data[0]
    tradedays = [td.strftime("%Y-%m-%d") for td in tradedayslist]
    # Extract the factor data
    style_return = DataFrame()
    for f in self.factors:
        f_data = []
        for dt in tradedays:
            stockcodes = StockPool(dt).select_stock()
            f_data = f(dt, stockcodes).getdata()
            f_ret = FactorProcess.get_alpha(stockcodes, dt, -1)  # take one month of alpha
            df = DataFrame(data=[f_data, f_ret], columns=[f.windLabel, 'ret'])
            long_only, long_short = FactorStyle.compute_style_return_month(
                df, f.windLabel)
            f_data.append(long_only)
        style_return[f.windLabel] = f_data

    S = matrix(style_return.cov().values)
    pbar = matrix(np.zeros_like(style_return.std().values))
    n = len(self.factors)
    G = matrix(0.0, (n, n))
    G[::n + 1] = -1.0
    h = matrix(0.0, (n, 1))
    A = matrix(1.0, (1, n))
    b = matrix(1.0)
    portfolio_weight = qp(S, -pbar, G, h, A, b)['x']
def bse(data: pd.DataFrame, weight_name: Optional[str]=None, ignore: List[str]=None) -> pd.DataFrame: """ Calculate the Block Standard Error (BSE). Parameters ---------- data : Dataframe with CV data over time and weights. weight_name : Name of the weight column. ignore : List of column names to ignore. Returns ------- bse : Dataframe containing BSEs over all iterations. References ---------- Flyvbjerg, H., Petersen, H. G. Error estimates on averages of correlated data. The Journal of Chemical Physics, 91(1), 461 (1989) """ if ignore is None: ignore = [] if 'time' not in ignore: ignore.append('time') # Prepare input, first element if weight_name is not None: weights = data[weight_name].values ignore.append(weight_name) length = data.shape[0] width = data.shape[1] index = data.T.index data = data.values blist = [data.std(axis=0) / np.sqrt(length)] length //= 2 # Iteratively increase block size while length > 2: halved = np.empty((length, width)) # Each iteration, we halve the dataset for i in range(0, length): if weight_name is not None: halved[i] = (1 / (weights[2 * i - 1] + weights[2 * i]) * (data[2 * i - 1] * weights[2 * i - 1] + data[2 * i] * weights[2 * i])) else: halved[i] = 0.5 * (data[2 * i - 1] + data[2 * i]) # Calculate the BSE bse = halved.std(axis=0) / np.sqrt(length) blist.append(bse) length //= 2 # Reconstruct Dataframe return pd.DataFrame(np.asarray(blist), columns=index).drop(ignore, axis=1)
def calc_bse(data: pd.DataFrame, weight_name: Optional[str] = None, ignore: List[str] = None) -> pd.DataFrame: ''' Calculate the Block Standard Error (BSE). Parameters ---------- data : Dataframe with CV data over time and weights. weight_name : Name of the weight column. ignore : List of column names to ignore. Returns ------- bse : Dataframe containing BSEs over all iterations. References ---------- Flyvbjerg, H., Petersen, H. G. Error estimates on averages of correlated data. The Journal of Chemical Physics, 91(1), 461 (1989) ''' if ignore is None: ignore = [] if 'time' not in ignore: ignore.append('time') # Prepare input, first element if weight_name is not None: weights = data[weight_name].values ignore.append(weight_name) length = data.shape[0] width = data.shape[1] index = data.T.index data = data.values blist = [data.std(axis=0) / np.sqrt(length)] length = length // 2 # Iteratively increase block size while length > 2: halved = np.empty((length, width)) # Each iteration, we halve the dataset for i in range(0, length): if weight_name is not None: halved[i] = (1 / (weights[2 * i - 1] + weights[2 * i]) * (data[2 * i - 1] * weights[2 * i - 1] + data[2 * i] * weights[2 * i])) else: halved[i] = 0.5 * (data[2 * i - 1] + data[2 * i]) # Calculate the BSE bse = halved.std(axis=0) / np.sqrt(length) blist.append(bse) length = length // 2 # Reconstruct Dataframe return pd.DataFrame(np.asarray(blist), columns=index).drop(ignore, axis=1)
def structural_adj(self,
                   cov: pd.DataFrame,
                   spec_ret: pd.DataFrame,
                   fact_exp: pd.DataFrame,
                   liq_mv: pd.DataFrame,
                   liq_mv_name: PVN.LIQ_MV.value,
                   time_window: int = 120):
    """
    :param cov: Newey-West-adjusted idiosyncratic return matrix of individual stocks
    :param spec_ret: idiosyncratic return series of individual stocks
    :param fact_exp: factor exposures
    :param liq_mv: free-float market value
    :param liq_mv_name: name of the free-float market value column
    :param time_window: time window of the idiosyncratic returns (may later be replaced
        by the length of the idiosyncratic return series)
    :return:
    """
    # Compute the coordination parameters
    h_n = spec_ret.count()  # number of non-missing observations
    V_n = (h_n - 20 / 4) / 20 * 2  # degree of missing data (20 used for testing for now)

    sigma_n = spec_ret.std().fillna(1)  # equal-weighted sample std (set to 1 where it cannot be computed)  TODO
    sigma_n_steady = (spec_ret.quantile(.75) - spec_ret.quantile(0.25)) / 1.35  # robust estimate of the sample std
    Z_n = abs((sigma_n - sigma_n_steady) / sigma_n_steady)  # degree of fat tails in the data

    # Replace infinite values with 0
    Z_n[np.isinf(Z_n)] = 0
    Z_n.fillna(0, inplace=True)

    left_, right_ = V_n.where(V_n > 0, 0), np.exp(1 - Z_n)
    left_, right_ = left_.where(left_ < 1, 1), right_.where(right_ < 1, 1)
    gam_n = left_ * right_  # per-stock coordination parameter in [0, 1]

    reg_data = pd.concat([np.log(sigma_n), liq_mv, gam_n, fact_exp], axis=1)
    reg_data.columns = ['sigma', liq_mv_name, 'gam_n'] + fact_exp.columns.tolist()
    ref_data_com = reg_data[reg_data['gam_n'] == 1]

    # Weighted (free-float market value) least squares on the high-quality stocks to estimate
    # the factors' contribution to idiosyncratic volatility
    model = sm.WLS(ref_data_com['sigma'], ref_data_com[fact_exp.columns], weights=ref_data_com['gam_n']).fit()

    # Structural forecast of each stock's idiosyncratic volatility
    sigma_STR = pd.DataFrame(np.diag(np.exp(np.dot(fact_exp, model.params)) * 1.05),
                             index=fact_exp.index,
                             columns=fact_exp.index)

    # Apply the structural adjustment to the idiosyncratic return matrix
    F_STR = sigma_STR.mul((1 - gam_n), axis=0) + cov.mul(gam_n, axis=0)

    return F_STR
def normalize_factor(factor: pd.DataFrame, mean=None, std=None) -> pd.DataFrame: if mean is None: mean = factor.mean() if std is None: std = factor.std() return (factor - mean) / std
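# A short usage note (the toy data is an assumption): computing mean/std once on a
# training window and passing them in keeps later normalizations consistent.
import pandas as pd

train_factor = pd.DataFrame({"f": [1.0, 2.0, 3.0, 4.0]})
live_factor = pd.DataFrame({"f": [2.5, 5.0]})

mu, sigma = train_factor.mean(), train_factor.std()
train_norm = normalize_factor(train_factor)                    # fit mean/std from the data itself
live_norm = normalize_factor(live_factor, mean=mu, std=sigma)  # reuse the training statistics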
def attributes_sanity_check(df: pd.DataFrame): """Utility function to check the suitability of the attributes for model training. This utility function can be used to check if any attribute has a standard deviation of zero. This would lead to NaN's when normalizing the features and thus would lead to NaN's when training the model. It also checks if any attribute for any basin contains a NaN, which would also cause NaNs during model training. Parameters ---------- df : pd.DataFrame DataFrame of catchment attributes as columns. Raises ------ RuntimeError If one or more attributes have a standard deviation of zero or any attribute for any basin is NaN. """ # Check for NaNs in standard deviation of attributes. attributes = [] if any(df.std() == 0.0) or any(df.std().isnull()): for k, v in df.std().iteritems(): if (v == 0) or (np.isnan(v)): attributes.append(k) if attributes: msg = [ "The following attributes have a std of zero or NaN, which results in NaN's ", "when normalizing the features. Remove the attributes from the attribute feature list ", "and restart the run. \n", f"Attributes: {attributes}" ] raise RuntimeError("".join(msg)) # Check for NaNs in any attribute of any basin nan_df = df[df.isnull().any(axis=1)] if len(nan_df) > 0: failure_cases = defaultdict(list) for basin, row in nan_df.iterrows(): for feature, value in row.iteritems(): if np.isnan(value): failure_cases[basin].append(feature) # create verbose error message msg = [ "The following basins/attributes are NaN, which can't be used as input:" ] for basin, features in failure_cases.items(): msg.append(f"{basin}: {features}") raise RuntimeError("\n".join(msg))
def form_valid(self, form): document = form.save(commit=False) project = form.cleaned_data['project'] document.save() filename = settings.MEDIA_ROOT+"/"+document.document.name sniffer = csv.Sniffer() dialect = sniffer.sniff(open(filename, 'r').read(), delimiters='\t,;') # defining the separator of the csv file df = read_csv(filename, delimiter=dialect.delimiter) tumour_cols = [col for col in df.columns if 'Tumour' in col] norm_cols = [col for col in df.columns if 'Norm' in col] document.sample_num = len(tumour_cols) document.norm_num = len(norm_cols) document.row_num = len(df) document.save() """ Use PANDAS to preprocess input file(calculate Mean_norm CNR and STD) and save to process folder Create ProcessDocument instance to store the file in database""" path = os.path.join('users', str(document.project.owner), str(document.project),'process', 'process_'+str(document.get_filename())) if not os.path.exists(settings.MEDIA_ROOT+'/'+os.path.join('users', str(document.project.owner), str(document.project),'process')): os.mkdir(settings.MEDIA_ROOT+'/'+os.path.join('users', str(document.project.owner), str(document.project),'process')) process_doc = ProcessDocument() process_doc.document = path process_doc.input_doc = document process_doc.created_by = self.request.user process_doc.save() new_file = settings.MEDIA_ROOT+"/"+path df = df.set_index('SYMBOL') #create index by SYMBOL column df = df.groupby(df.index, level=0).mean() #deal with duplicate genes by taking mean value mean_norm = df[[norm for norm in norm_cols]].mean(axis=1) from scipy.stats.mstats import gmean gmean_norm = df[[norm for norm in norm_cols]].apply(gmean, axis=1) df1 = DataFrame(df[[norm for norm in norm_cols]], index=df.index) df1 = df1.std(axis=1) df['Mean_norm'] = mean_norm df = df.div(df.Mean_norm, axis='index') df['Mean_norm'] = mean_norm df['gMean_norm'] = gmean_norm df['std'] = df1 df.to_csv(new_file, sep='\t') return HttpResponseRedirect(self.success_url+project.name)
def plot_mean_std(real: pd.DataFrame, fake: pd.DataFrame, ax=None, fname=None): """ Plot the means and standard deviations of each dataset. :param real: DataFrame containing the real data :param fake: DataFrame containing the fake data :param ax: Axis to plot on. If none, a new figure is made. :param fname: If not none, saves the plot with this file name. """ if ax is None: fig, ax = plt.subplots(1, 2, figsize=(10, 5)) fig.suptitle('Absolute Log Mean and STDs of numeric data\n', fontsize=16) ax[0].grid(True) ax[1].grid(True) real = real._get_numeric_data() fake = fake._get_numeric_data() real_mean = np.log(np.add(abs(real.mean()).values, 1e-5)) fake_mean = np.log(np.add(abs(fake.mean()).values, 1e-5)) min_mean = min(real_mean) - 1 max_mean = max(real_mean) + 1 line = np.arange(min_mean, max_mean) sns.lineplot(x=line, y=line, ax=ax[0]) sns.scatterplot(x=real_mean, y=fake_mean, ax=ax[0]) ax[0].set_title('Means of real and fake data') ax[0].set_xlabel('real data mean (log)') ax[0].set_ylabel('fake data mean (log)') real_std = np.log(np.add(real.std().values, 1e-5)) fake_std = np.log(np.add(fake.std().values, 1e-5)) min_std = min(real_std) - 1 max_std = max(real_std) + 1 line = np.arange(min_std, max_std) sns.lineplot(x=line, y=line, ax=ax[1]) sns.scatterplot(x=real_std, y=fake_std, ax=ax[1]) ax[1].set_title('Stds of real and fake data') ax[1].set_xlabel('real data std (log)') ax[1].set_ylabel('fake data std (log)') if fname is not None: plt.savefig(fname) if ax is None: plt.show()
def annualized_volatility(self, df: pd.DataFrame) -> float:
    """
    Calculates annualized volatility for a date-indexed pandas DataFrame.
    It works for any sampling interval, whether the data are prices or returns.
    """
    years_past = self.get_years_past()
    entries_per_year = df.shape[0] / years_past
    return df.std() * np.sqrt(entries_per_year)
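# A worked sketch of the scaling rule used above: with roughly 252 trading days per
# year, annualized volatility is the per-period std times sqrt(periods per year).
# The simulated daily returns are an assumption.
import numpy as np
import pandas as pd

rng = np.random.default_rng(1)
daily = pd.Series(rng.normal(0.0, 0.01, 252 * 3))   # three years of daily returns

ann_vol = daily.std() * np.sqrt(252)                 # roughly 0.01 * sqrt(252) ~= 0.159
print(ann_vol)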
def _plot_stats_attribute(stats_list: Sequence[Stats], attribute: str, label, ax=None): """Plot a certain attribute of a collection of histories.""" data = np.asarray([getattr(h, attribute) for h in stats_list]) df = DataFrame(data.T) df_mean = df.mean(axis=1) df_std = df.std(axis=1) sns_ax = sns.lineplot(df_mean.index, df_mean, label=label, ax=ax) sns_ax.fill_between(df_mean.index, df_mean - df_std, df_mean + df_std, alpha=0.3)
def summary_statistics(data_set: pd.DataFrame) -> pd.DataFrame: summary_data = dict() summary_data['mean'] = data_set.mean(numeric_only=True) summary_data['std'] = data_set.std(ddof=1, numeric_only=True) summary_data['min'] = data_set.min(numeric_only=True) summary_data['max'] = data_set.max(numeric_only=True) return pd.DataFrame(summary_data).T
def testZscore(self, df: pd.DataFrame, stdev_cutoff: float = 5.0): """ Checks to make sure there are no outliers using z score cutoff. """ z_scores = ((df - df.mean(axis=0, skipna=True)) / df.std(axis=0, skipna=True)).abs() self.assertGreater(0, (z_scores > stdev_cutoff).to_numpy().sum(), "There are outlier values!")
def drop_high_volatility(df: pd.DataFrame, threshold=2) -> pd.DataFrame:
    """
    Remove columns with excessive volatility.
    :param df:
    :param threshold: how many standard deviations count as excessive volatility
    :return: the original data with the over-volatile columns removed
    Contributed by Gu Chengyang
    """
    return df[df.columns[df.min() < df.mean() - threshold * df.std()]]
def get_portfolio_risk(cls, weights: list, ror: pd.DataFrame) -> float: """ Computes the std of portfolio returns. """ # cls.weights_sum_is_one(weights) if isinstance(ror, pd.Series): # required for a single asset portfolio return ror.std() weights = np.array(weights) covmat = ror.cov() return math.sqrt(weights.T @ covmat @ weights)
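# A hedged numeric check of sqrt(w' @ cov @ w) against a direct std of the weighted
# portfolio returns; the toy return series are assumptions.
import math
import numpy as np
import pandas as pd

rng = np.random.default_rng(2)
ror = pd.DataFrame(rng.normal(0.001, 0.02, size=(500, 2)), columns=["A", "B"])
weights = np.array([0.6, 0.4])

covmat = ror.cov()
risk = math.sqrt(weights.T @ covmat @ weights)
direct = (ror @ weights).std(ddof=1)
print(risk, direct)   # the two values match (both use the sample covariance, ddof=1)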
def mean_and_std(cls, df: DataFrame) -> DataFrame: """ Standard.Specification. Args: df: The data to be standardized. Returns: The mean and std (standard deviation) of data. """ scale = df.std() scale.name = 'std' return cls._stack_as_rows(cls._mean(df), scale)
def remove_outliers(df: pd.DataFrame, zscore: int = 3) -> pd.DataFrame: """ Removes all rows from the given DataFrame containing outliers in any of the columns. :param df: Input DataFrame. :param zscore: z-score to use when calculating outliers. :return: The DataFrame with all outliers removed. """ scores = (df - df.mean()) / df.std(ddof=0).values return df[(np.abs(scores) < zscore).all(axis=1)]
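# A small usage sketch; the toy series with one injected outlier is an assumption.
# Note that with very few rows a single outlier cannot exceed |z| = 3, so the toy
# frame uses 21 rows.
import numpy as np
import pandas as pd

df = pd.DataFrame({"x": [1.0] * 10 + [1.1] * 10 + [50.0]})
clean = remove_outliers(df, zscore=3)
print(len(df), len(clean))   # 21 -> 20: the row with x == 50.0 is dropped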
def testWLS(self): X = DataFrame(np.random.randn(30, 4), columns=['A', 'B', 'C', 'D']) Y = Series(np.random.randn(30)) weights = X.std(1) self._check_wls(X, Y, weights) weights.ix[[5, 15]] = np.nan Y[[2, 21]] = np.nan self._check_wls(X, Y, weights)
def moments_features(path): if not os.path.exists(path): logger.error(path + " is not exist!") return im = cv2.imread(path) [b, g, r] = cv2.split(im) moments = [] for n in [b, g, r]: df = DataFrame(np.array(n.flatten())) moments.extend(float(x) for x in [df.mean()[0], df.std()[0], df.skew()[0]]) return moments
def combine_spread(file_set, shift, drop_return_data=False): """ Combine the spread of input files, return with mean and standard deviation calculated. """ data = [] values = {} for val in ('left', 'right', 'com', 'dist', 'radius', 'diameter'): values[val] = {} # Collect data from all files into dictionaries for i, _file in enumerate(file_set): data.append(Spread().read(_file)) for val in values.keys(): values[val][i] = Series( data=data[i].spread[val]['val'], index=data[i].times ) data[i].times = (np.array(data[i].times) - shift[i]) spread = Spread() spread.spread['num'] = len(file_set) for val in values.keys(): # Shift time as per synchronisation for i in values[val]: values[val][i].index = np.array(values[val][i].index) - shift[i] # Convert to DataFrame df = DataFrame(data=values[val]) # If not a single file, keep only indices with at least two non-NaN if len(file_set) > 1: df = df.dropna() # If return data dropped, fill data here if drop_return_data: for i in df.columns: data[i].spread[val]['val'] = df[i].tolist() # Get times, mean and standard error as lists mean = list(df.mean(axis=1)) std_error = list(df.std(axis=1)) times = list(df.index) # Add to Spread object spread.spread[val]['val'] = mean spread.spread[val]['std'] = std_error spread.spread['times'] = times return spread, data
def stndize(path, columns):
    # Standardize the selected columns: subtract the mean and divide by the std
    w = pd.read_csv(path, usecols=columns)
    frame = DataFrame(w)
    t = frame.mean()
    print(t)
    z = frame.std()
    print(z)
    print((w - t) / z)
    return
def testWLS(self): # WLS centered SS changed (fixed) in 0.5.0 if sm.version.version < '0.5.0': raise nose.SkipTest X = DataFrame(np.random.randn(30, 4), columns=['A', 'B', 'C', 'D']) Y = Series(np.random.randn(30)) weights = X.std(1) self._check_wls(X, Y, weights) weights.ix[[5, 15]] = np.nan Y[[2, 21]] = np.nan self._check_wls(X, Y, weights)
def test_common_start_returns(self, before, after, mean_by_date, demeaned, expected_vals): dr = date_range(start='2015-1-17', end='2015-2-2') dr.name = 'date' tickers = ['A', 'B', 'C', 'D'] r1, r2, r3, r4 = (1.20, 1.40, 0.90, 0.80) prices = DataFrame(index=dr, columns=tickers, data=[[r1**1, r2**1, r3**1, r4**1], [r1**2, r2**2, r3**2, r4**2], [r1**3, r2**3, r3**3, r4**3], [r1**4, r2**4, r3**4, r4**4], [r1**5, r2**5, r3**5, r4**5], [r1**6, r2**6, r3**6, r4**6], [r1**7, r2**7, r3**7, r4**7], [r1**8, r2**8, r3**8, r4**8], [r1**9, r2**9, r3**9, r4**9], [r1**10, r2**10, r3**10, r4**10], [r1**11, r2**11, r3**11, r4**11], [r1**12, r2**12, r3**12, r4**12], [r1**13, r2**13, r3**13, r4**13], [r1**14, r2**14, r3**14, r4**14], [r1**15, r2**15, r3**15, r4**15], [r1**16, r2**16, r3**16, r4**16], [r1**17, r2**17, r3**17, r4**17]]) dr2 = date_range(start='2015-1-21', end='2015-1-29') factor = DataFrame(index=dr2, columns=tickers, data=[[3, 4, 2, 1], [3, 4, 2, 1], [3, 4, 2, 1], [3, 4, 2, 1], [3, 4, 2, 1], [3, 4, 2, 1], [3, 4, 2, 1], [3, 4, 2, 1], [3, 4, 2, 1]]).stack() factor.index = factor.index.set_names(['date', 'asset']) factor.name = 'factor' cmrt = common_start_returns( factor, prices, before, after, False, mean_by_date, factor if demeaned else None) cmrt = DataFrame({'mean': cmrt.mean(axis=1), 'std': cmrt.std(axis=1)}) expected = DataFrame(index=range(-before, after + 1), columns=['mean', 'std'], data=expected_vals) assert_frame_equal(cmrt, expected)
def testWLS(self): # WLS centered SS changed (fixed) in 0.5.0 sm_version = sm.version.version if sm_version < LooseVersion("0.5.0"): raise nose.SkipTest("WLS centered SS not fixed in statsmodels" " version {0}".format(sm_version)) X = DataFrame(np.random.randn(30, 4), columns=["A", "B", "C", "D"]) Y = Series(np.random.randn(30)) weights = X.std(1) self._check_wls(X, Y, weights) weights.ix[[5, 15]] = np.nan Y[[2, 21]] = np.nan self._check_wls(X, Y, weights)
def testWLS(self): # WLS centered SS changed (fixed) in 0.5.0 if sm.version.version < '0.5.0': raise nose.SkipTest print( "Make sure you're using statsmodels 0.5.0.dev-cec4f26 or later.") X = DataFrame(np.random.randn(30, 4), columns=['A', 'B', 'C', 'D']) Y = Series(np.random.randn(30)) weights = X.std(1) self._check_wls(X, Y, weights) weights.ix[[5, 15]] = np.nan Y[[2, 21]] = np.nan self._check_wls(X, Y, weights)
def cross_validate_trades(trades, N = 20, subset_fraction = 0.7): tickers = trades.tickers sample_size = round(len(tickers) * subset_fraction) summary = DataFrame(dtype = float) for n in range(N): sample_tickers = list(random.choice(tickers, sample_size, replace = False)) trade_subset = trades.find(lambda T: T.ticker in sample_tickers) summary[n] = summary_report(trade_subset) result = DataFrame(dtype = float) result['Base'] = summary_report(trades) result['Mean'] = summary.mean(axis = 1) result['Std'] = summary.std(axis = 1) result['Median'] = summary.median(axis = 1) result['Max'] = summary.max(axis = 1) result['Min'] = summary.min(axis = 1) return (result, summary)
class GetGenes(object):
    def __init__(self, data):
        self.dataframe = DataFrame(data)

    # Read a text file and return a data frame. Records should be separated by TAB.
    # There should not be duplicate column names.
    def import_file(self, filename):
        # helper used to convert strings to floats where possible
        def convert(x):
            try:
                x = float(x)
            except ValueError:
                pass
            return x

        table = []
        for line in open(filename):
            if line.strip():  # if not an empty line
                line = line.rstrip('\n').split('\t')
                line = list(map(convert, line))
                table.append(line)
        self.dataframe = DataFrame(table[1:], columns=table[0])
        return

    def houseKeepingGenes(self, geneNum):
        # compute the CV (coefficient of variation) of the data
        std = array(self.dataframe.std(axis=1))
        mean = array(self.dataframe.mean(axis=1))
        CV = std / mean
        CV = list(map(abs, CV))  # convert to positive numbers
        # get the first N minimum values
        mins = nsmallest(geneNum, CV)
        print("The GOOD genes are:\n")
        for item in mins:
            print(self.dataframe.ix[CV.index(item)][0])
        return
experiment_data_Raw = DataFrame({"Timestamp": quelle_timestampsRaws, "Raw key": quelle_raws, "Dataset": quelle_datasetR})
experiment_data_Raw = experiment_data_Raw.set_index("Timestamp")

final_data = concat([experiment_data_Qber, experiment_data_Raw])
final_data = final_data.sort_index()

# after preparing the data, time to plot it:
for new_counter in range(file_counter + 1):
    # print(new_counter)
    Qbers = final_data[(final_data["Dataset"] == new_counter) & (final_data["Qber"] > 0)]
    x1 = Qbers.index.tolist()
    y1 = Qbers["Qber"].tolist()
    x1_average = DataFrame.mean(Qbers)["Qber"]
    x1_std_dev = DataFrame.std(Qbers)["Qber"]
    # preparing proper time:
    x1[:] = [x - quelle_initialTimestamps[new_counter] for x in x1]

    Raws = final_data[(final_data["Dataset"] == new_counter) & (final_data["Raw key"] > 0)]
    x2_average = DataFrame.mean(Raws)["Raw key"]
    x2_median = DataFrame.median(Raws)["Raw key"]
    x2_max = DataFrame.max(Raws)["Raw key"]
    Raws = Raws[Raws["Raw key"] < (x2_max - (x2_max / 100) * 20)]
    x2 = Raws.index.tolist()
    y2 = Raws["Raw key"].tolist()
    print(x2_average)
    # x2_std_dev = 3
def test():
    # a : single adult
    # b : couple
    # c : child living with a couple
    # d : child living with a single adult
    # e : teenager living with a couple
    # f : teenager living with a single adult
    # g : child's bedroom

    # A: 2a, 2e
    # b + 2*c + g
    fa = [0, 1, 2, 0, 0, 0, 1]
    ma = 2754.74
    # B : 2a, 2ea, supp:
    # b + 2*e + 2*g
    fb = [0, 1, 0, 0, 2, 0, 2]
    mb = 3165.15
    # C : 1a, 2e:
    # a + 2*d + g
    fc = [1, 0, 0, 2, 0, 0, 1]
    mc = 2291.04
    # D: 2a, 2e, 2ea, 2*supp :
    # b + 2*c + 2*e + 3*g
    fd = [0, 1, 2, 0, 2, 0, 3]
    md = 3969.81
    # E : 2a, 1ea
    # b + e + g
    fe = [0, 1, 0, 0, 1, 0, 1]
    me = 2549.17
    # F : 2a, 1e, 2ea
    # b + c + 2*e + 2*g
    ff = [0, 1, 1, 0, 2, 0, 2]
    mf = 3514.12
    # G: 2a, 1e, 1ea, supp
    # b + c + e + 2*g
    fg = [0, 1, 1, 0, 1, 0, 2]
    mg = 3042.39
    # H: 1a, 1ea
    # a + f + g
    fh = [1, 0, 0, 0, 0, 1, 1]
    mh = 2103.91
    # solve f*x = m
    # A supplementary equation is needed because the system is inconsistent
    fsup = [1, -1 / 1.5, 0, 0, 0, 0, 0]
    msup = 0

    f = [fa, fb, fc, fd, fe, ff, fg, fh, fsup]
    m = [ma, mb, mc, md, me, mf, mg, mh, msup]

    results = DataFrame()
    for i in range(8):
        selected_f1 = list(f)
        selected_m1 = list(m)
        selected_f1.pop(i)
        selected_m1.pop(i)
        for j in range(7):
            selected_f = list(selected_f1)
            selected_m = list(selected_m1)
            selected_f.pop(j)
            selected_m.pop(j)
            f_mat = np.array(selected_f)
            m_vec = np.array(selected_m)
            # print(i, np.linalg.det(f_mat))
            try:
                x = DataFrame({str(i) + str(j): np.linalg.solve(f_mat, m_vec)}).T
            except Exception:
                x = None
            from pandas import concat
            if x is not None:
                results = concat([results, x])

    print(results)
    print(results.mean())
    print(results.std())
    print(results.std() / results.mean())
def ExerciseCheckerAlmostCorrect(path): # Gather the solution solution_path = path + "Solutions/Week1.xlsx" solution = load_workbook(solution_path, read_only=True, use_iterators=False, keep_vba=False, guess_types=False, data_only=True) solution_rows = prepare_book(solution) num_solution_records = len(solution_rows) print "The number of solution records is: " + str(num_solution_records) + "\n" num_responses = 0 all_accuracy_array = [] almost_accuracy_array = [] # Gather the response answer_path = path + "Response/Week_1/" files = os.listdir(answer_path) for file in files: file_type_array = file.split(".") file_type = file_type_array[len(file_type_array)-1] if file_type not in ["xlsx", "xlsm", "xltx", "xltm"]: continue print file num_responses += 1 num_check = 0 num_contain = 0 try: answer = load_workbook(answer_path + file, read_only=True, use_iterators=False, keep_vba=False, guess_types=False, data_only=True) # Gather each sheet in the answer file for sheet in answer: answer_rows = {} for row in sheet.rows: full_address = str.lower(str.strip(str(row[0].value))) remaining_elements = set() for i in range(1, len(row)): remaining_elements.add(str.lower(str.strip(str(row[i].value)))) answer_rows[full_address] = remaining_elements # Compare the answer and the solution for row in solution_rows: full_address = str.lower(str.strip(str(row[0].value))) if answer_rows.has_key(full_address): for i in range(1, len(row)): num_check += 1 row_element = str.lower(str.strip(str(row[i].value))) if row_element in answer_rows[full_address] or row_element[1:len(row_element)] in answer_rows[full_address]: num_contain += 1 except Exception as e: print "False\t" + str(e) if num_check > 0: accuracy = float(num_contain) / num_check all_accuracy_array.append(accuracy) if accuracy < 1: almost_accuracy_array.append(accuracy) else: almost_accuracy_array.append(accuracy) print df1 = DataFrame(all_accuracy_array) print "ALl: " + str(num_responses) print df1.mean() print df1.std() print df2 = DataFrame(almost_accuracy_array) print "Almost: " + str(len(almost_accuracy_array)) print df2.mean() print df2.std()
'corned beef': 'cow',
'honey ham': 'pig',
'nova lox': 'salmon'
}
data['animal'] = data['food'].map(str.lower).map(meat_to_animal)
data
data['food'].map(lambda x: meat_to_animal[x.lower()])

# Data normalization
datafile = 'd:/data/normalization_data.xls'  # parameter initialization
data = pd.read_excel(datafile, header=None)  # read the data

(data - data.min()) / (data.max() - data.min())    # min-max normalization
(data - data.mean()) / data.std()                  # zero-mean (z-score) normalization
data / 10**np.ceil(np.log10(data.abs().max()))     # decimal scaling normalization

### Replacing values
data = Series([1., -999., 2., -999., -1000., 3.])
data
data.replace(-999, np.nan)
data.replace([-999, -1000], np.nan)
data.replace([-999, -1000], [np.nan, 0])
data.replace({-999: np.nan, -1000: 0})
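# A self-contained sketch of the three normalizations above on a small toy frame
# (the values are an assumption, since the Excel file is not available here).
import numpy as np
import pandas as pd

toy = pd.DataFrame({"a": [78.0, 144.0, 95.0, 69.0], "b": [521.0, -600.0, -457.0, 596.0]})

min_max = (toy - toy.min()) / (toy.max() - toy.min())            # values in [0, 1]
z_score = (toy - toy.mean()) / toy.std()                         # zero mean, unit std
decimal_scaled = toy / 10 ** np.ceil(np.log10(toy.abs().max()))  # values in (-1, 1]
print(min_max, z_score, decimal_scaled, sep="\n\n")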
timedeltas_above_double_average_percent = float(len(timedeltas_above_double_average)) / len(delta_t) * 100
print("Timedeltas above double average",
      len(timedeltas_above_double_average),
      timedeltas_above_double_average_percent)

last_timestamp = timestamps[-1]
print("Last timestamp", timestamps[-1])
print("Maximal timestamp", max(timestamps))
print("Average frequency", float(len(timestamps)) / (float(last_timestamp) / 1000))

delta_t = DataFrame(delta_t)
delta_t.plot()
pyplot.show()
print("Timedelta standard deviation", float(delta_t.std()))

font = {'family': 'Consolas', 'weight': 'x-small', 'size': 11.0, 'stretch': 0}
pyplot.rc('font', **font)
pyplot.show(block=True)
WEEKCOLS=[WEEKDF,updf,dwndf] WEEKDF=pd.concat(WEEKCOLS,axis=1) WEEKDF.columns=['PLUSMINUSWEEK','UP RATE','DOWN RATE'] ''' print(' ') print (WEEKDF) ''' #find the current high low of futures POSITION=DataFrame([0]) VALUE=FIVEPATTERNCHANGE.mean()*CLWEEK.iloc[0,0]+CLWEEK.iloc[0,0] WEEKSTDUP=float((VALUE*FIVEPATTERNCHANGE.std())+VALUE) WEEKSTDDOWN=float(VALUE-(VALUE*FIVEPATTERNCHANGE.std())) print ("The last week through the last five weeks have done the following") print (FINALDF) print (' ') VALUE=FIVEPATTERNCHANGE.mean()*CLWEEK.iloc[0,0]+CLWEEK.iloc[0,0] VALUE=DataFrame(VALUE) print (' ') print ('Value price based on pattern') print ("%.2f" % VALUE.iloc[0,0]) print (' ') print ('One Standard Deviation Up') print ((VALUE*FIVEPATTERNCHANGE.std())+VALUE)
import numpy as np
from pandas import DataFrame, Series
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats

df = DataFrame(abs(np.random.randn(30).reshape(6, 5)) * 100)

plt.bar(np.arange(len(df.mean())), df.mean(), align='center', color='white', linewidth=1.5)
# plt.hold(True) is no longer needed (it was removed from matplotlib); later calls draw on the same axes
plt.errorbar(np.arange(len(df.mean())), df.mean(), df.std(),
             elinewidth=1.2, capsize=7.5, fmt='none')
plt.show()
def run(self,Model='ridge',kernel='linear', cross_validationMethod='KFold',FeatureSelection='PCA',n_features=20,scoringList=['specificity','sensitivity','precision','f1','accuracy','ss_mean'],isSaveCsv=None,isSavePickle=None, isSaveFig=None, isPerm=0,isBetweenSubjects=True,isConcatTwoLabels=False): # -- TODO : # -- # Greedy selection on features + Other feature selection types... # -- # Make sure featuers are Best only based on train data!!! # -- # Keep a list of n_train, n_test from each Label and scoring (accuracy, f1..) in each cross validation iteration # -- # Plot results summary (see CARS paper for desired results for Ein Gedi Poster 22-1-2015) # -- # remove irelevant data using 'Tracking Success' and consider 'TimeStamps' for feature calculation # -- # add f feature analysis by facial part (see excel) # -- # select best model (svm, otherwise ridge regression) # -- # compare svc results with regerssion results (using LOO and different Params for regression - params for unbalanced data, different kernels, etc.), model evaluation - http://scikit-learn.org/stable/modules/model_evaluation.html) # -- # check how the model weights behave - feature selection analysis # -- # calc model error # -- # divide data to subparts for training and testing - try within/ between subject, and analyze distribution of features when data is divided # -- # LOO - also on bool labels (patients vs controls and mental status bool) # -- # add mental status rank scores (0-4) # -- # make sure p-val returns the right value in 'scores' # -- # run it over random data (permutation test) # -- # continoue here - check regression results-Make sure regression works (not so good).. check what happens in svc for G7 (high train R, negative test R) ## init FeatureTypeList=[j for j in tuple(self.FeaturesDF.index)] self.FullResults=DF() self.Learningdetails={'Model':Model,'Kernel':kernel,'CrossVal':cross_validationMethod,'FeatureSelection':FeatureSelection,'LabelBy':self.Details['LabelDetails'].keys()[0],'FeatureMethod':self.Details['FeatureMethod'],'PieceLength':self.Details['PieceLength']} print('\n------------Learning Details------------') print(DF.from_dict(self.Learningdetails,orient='index')) print('\n----' + cross_validationMethod + ' Cross validation Results:----') # Set learning params (cross validation method, and model for learning) isBoolLabel=self.LabelsObject.isBoolLabel isBoolScores=isBoolLabel model, isBoolModel, featureSelectionMethod,selectFeaturesFunction= learningUtils.setModel(Model,FeatureSelection,n_features) #define global variables over modules (to be used in myUtils) globalVars.transformMargins=0#lambda x:x globalVars.isBoolLabel=isBoolLabel globalVars.isBoolModel=isBoolModel global trainLabels_all, testLabels_all, TrueLabels,isAddDroppedSubjects trainLabels_all, testLabels_all, TrueLabels,isAddDroppedSubjects=labelUtils.initTrainTestLabels_all(self.LabelsObject) trainLabels_all2, testLabels_all2, TrueLabels2,isAddDroppedSubjects2=labelUtils.initTrainTestLabels_all(self.LabelsObject2) LabelingList=['N1']#trainLabels_all.columns self.ResultsDF=DF() self.BestFeatures=DF(columns=LabelingList) #dict of BestFeaturesDF according to Labeling methods YpredictedOverAllLabels=pandas.Panel(items=range(len(trainLabels_all)),major_axis=LabelingList,minor_axis=TrueLabels.index) #panel: items=cv_ind, major=labels, minor=#TODO ## Create train and test sets according to LabelBy, repeat learning each time on different Labels from LabelingList. 
for label_ind, Labeling in enumerate(LabelingList): """if isPerm: #TODO - fix this to work with continous / bool data try: trainLabels=self.LabelsObject.permedLabelsDF[Labeling] except AttributeError: self.LabelsObject.permLabels() trainLabels=self.LabelsObject.permedLabelsDF[Labeling]""" #set subjects list according to labels and features X,SubjectsList,droppedSubjects,Xdropped=featuresUtils.initX(self.FeaturesDF,trainLabels_all,Labeling) X2,SubjectsList2,droppedSubjects2,Xdropped2=featuresUtils.initX(self.FeaturesDF,trainLabels_all2,Labeling,is2=1) #init train and test labels trainLabels, testLabels, LabelRange = labelUtils.initTrainTestLabels(Labeling,SubjectsList,trainLabels_all, testLabels_all) trainLabels2, testLabels2, LabelRange2 = labelUtils.initTrainTestLabels(Labeling,SubjectsList2,trainLabels_all2, testLabels_all2) #make sure only labeled subjects are used for classification X=X.query('subject == '+ str(list(trainLabels.index)) ) X.index.get_level_values(X.index.names[0]) SubjectIndex=list(set(X.index.get_level_values('subject'))) X2=X2.query('subject == '+ str(list(trainLabels2.index)) ) X2.index.get_level_values(X2.index.names[0]) SubjectIndex2=list(set(X2.index.get_level_values('subject'))) #init vars if isBetweenSubjects: cv_param=len(SubjectIndex) self.Learningdetails['CrossValSubjects']='between' isWithinSubjects=False else: isWithinSubjects=True X=X.swaplevel(0,1) PieceIndex=list(set(X.index.get_level_values('Piece_ind'))) cv_param=len(PieceIndex) self.Learningdetails['CrossValSubjects']='within' self.Learningdetails['NumOfFeatures']=n_features print('\n**' + Labeling + '**') cv, crossValScores= learningUtils.setCrossValidation(cross_validationMethod,cv_param,trainLabels,isWithinSubjects) ## Learning - feature selection for different scoring types, with cross validation - BestFeaturesForLabel=self.BestFeaturesForLabel(FeatureTypeList,LabelingList,n_features) #saves dataframe with best features for each label, for later analysis cv_ind=0 #used for transforming from margins returned from svm to continouse labels (e.g . 
PANSS) trainScores=DF() test_index=X.index testScores=concat([DF(index=test_index),DF(index=['std_train_err'])]) testScores2=concat([DF(index=testLabels.index),DF(index=['std_train_err'])]) #impt=Imputer(missing_values='NaN', strategy='median', axis=0) globalVars.LabelRange=LabelRange ModelWeights1=DF(columns=range(len(cv)),index=X.columns) Components=pandas.Panel(items=range(len(cv)),major_axis=X.columns,minor_axis=range(n_features)) #todo fix this for 1st and second learning ExplainedVar=DF(columns=range(len(cv))) ModelWeights2=DF(columns=range(len(cv))) for train, test in cv: if isBetweenSubjects: #set X and Y train_subjects=trainLabels.iloc[train].index test_subjects=testLabels.iloc[test].index Xtrain,Xtest, Ytrain, YtrainTrue, Ytest=learningUtils.setXYTrainXYTest(X,Labeling,trainLabels,testLabels,TrueLabels,train_subjects,test_subjects) Xtrain2,Xtest2, Ytrain2, YtrainTrue2, Ytest2=learningUtils.setXYTrainXYTest(X2,Labeling,trainLabels2,testLabels2,TrueLabels2,train_subjects,test_subjects) if isConcatTwoLabels: #used when there is more than one doctor Xtrain=concat([Xtrain,Xtrain2]) Xtest=concat([Xtest,Xtest2]) Ytrain=concat([Ytrain,Ytrain2]) YtrainTrue=concat([YtrainTrue,YtrainTrue2]) Ytest=concat([Ytest,Ytest2]) Xdropped=concat([Xdropped,Xdropped2]) SubjectsList=list(set(SubjectsList).intersection(set(SubjectsList2))) droppedSubjects=list(set(droppedSubjects).union(set(droppedSubjects2)).difference(set(SubjectsList)))#diff from SubjectsList to make sure no subjects are both in train and test. """else: Xtrain=Xtrain1 Xtest=Xtest1 Xdropped=Xdropped1 Ytrain=Ytrain1 YtrainTrue=YtrainTrue1 Ytest=Ytest1""" #select N best features: Xtrain, Xtest, bestNfeatures, components, explainedVar,decomposeFunc=learningUtils.selectBestNfeatures(Xtrain,Xtest,Ytrain,n_features,selectFeaturesFunction) BestFeaturesForLabel.add(bestNfeatures) #todo - delete this?? 
#train 1 TrainModel=model TrainModel.fit(Xtrain.sort_index(),Ytrain.T.sort_index()) try: Components[cv_ind]=components.T ExplainedVar[cv_ind]=explainedVar isDecompose=True if cv_ind==0: ModelWeights1=DF(columns=range(len(cv)),index=range(len(bestNfeatures))) ModelWeights1[cv_ind]=TrainModel.coef_.flatten() except AttributeError: isDecompose=False ModelWeights1[cv_ind].loc[bestNfeatures]=TrainModel.coef_.flatten() self.isDecompose=isDecompose #train 2 if isBoolLabel: PiecePrediction_train=DF(TrainModel.predict(Xtrain),index=Xtrain.index,columns=['prediction']) TrainModel2=svm.SVC(kernel='linear', probability=True,class_weight={0:1,1:1}) else: PiecePrediction_train=DF(TrainModel.decision_function(Xtrain),index=Xtrain.index,columns=['prediction']) TrainModel2=linear_model.LinearRegression() Xtrain2, Ytrain2, YtrainTrue2=learningUtils.getX2Y2(Xtrain,Ytrain,YtrainTrue,PiecePrediction_train, isBoolLabel) TrainModel2.fit(Xtrain2, Ytrain2) if cv_ind==0: ModelWeights2=DF(columns=range(len(cv)),index= Xtrain2.columns) ModelWeights2[cv_ind]=TrainModel2.coef_.flatten() #test 1 if isAddDroppedSubjects: #take test subjects from cv + subjects that were dropped for labeling used for test if isDecompose: dXdropped=DF(decomposeFunc(Xdropped).values,index=Xdropped.index) XtestDropped=dXdropped[bestNfeatures] YtestDropped=Series(XtestDropped.copy().icol(0)) #YTrueDropped=Series(Xdropped.copy().icol(0)) for subject in droppedSubjects: YtestDropped[subject]=testLabels_all[Labeling].loc[subject] #YTrueAll.loc[subject]=TrueLabels[Labeling].loc[subject] Ytest=concat([Ytest,YtestDropped]).sort_index() Xtest=concat([Xtest,XtestDropped]).sort_index() if isPerm: #TODO- Check this!! Ytest=y_perms.loc[Ytest.index] Xtest=Xtest.fillna(0.) elif isWithinSubjects: #train 1 train_pieces=PieceIndex[train] test_pieces=PieceIndex[test] #TODO - make sure that if test/train> piece index, it ignores it and repeate the process XtrainAllFeatures=X.query('Piece_ind == '+ str(list(train_pieces))) Ytrain=Series(index=X.index) Ytest=Series(index=X.index) YtrainTrue=Series(index=X.index) for subject in PieceIndex: for piece in train_pieces: Ytrain.loc[piece].loc[subject]=trainLabels[subject] YtrainTrue.loc[piece].loc[subject]=TrueLabels[Labeling].loc[subject] Ytest.loc[piece].loc[subject]=testLabels[subject] Ytrain=Ytrain.dropna() YtrainTrue=YtrainTrue.dropna() for subject in test_subjects: Ytest.loc[piece].loc[subject]=testLabels[subject] #train scores 1 if cv_ind==0: trainScores,YtrainPredicted=learningUtils.getTrainScores(Ytrain,Xtrain,YtrainTrue,TrainModel) plt.figure(1) if len(LabelingList)>1: plt.subplot(round(len(LabelingList)/2),2,label_ind+1) if isBoolLabel: testScores=learningUtils.getTestScores(Ytest,Xtest,TrainModel) else: testScores[cv_ind]=learningUtils.getTestScores(Ytest,Xtest,TrainModel) plt.title(Labeling,fontsize=10) else: plt.figure(3) new_trainScores,YtrainPredicted=learningUtils.getTrainScores(Ytrain,Xtrain,YtrainTrue,TrainModel) trainScores=concat([trainScores,new_trainScores],axis=1) #test 1 testScores[cv_ind]=learningUtils.getTestScores(Ytest,Xtest,TrainModel) #train2 if isBoolLabel: PiecePrediction_test=DF(TrainModel.predict(Xtest),index=Xtest.index,columns=['prediction']) else: PiecePrediction_test=DF(TrainModel.decision_function(Xtest),index=Xtest.index,columns=['prediction']) Xtest2, Ytest2 , YtestTrue2 =learningUtils.getX2Y2(Xtest,Ytest,Ytest,PiecePrediction_test, isBoolLabel) if cv_ind==0: trainScores2,YtrainPredicted2=learningUtils.getTrainScores(Ytrain2,Xtrain2,YtrainTrue2,TrainModel2) 
YpredictedOverAllLabels[cv_ind].loc[Labeling]=YtrainPredicted2 #plt.figure(1) #if len(LabelingList)>1: #plt.subplot(round(len(LabelingList)/2),2,label_ind+1) #test2 if isBoolLabel: testScores2=learningUtils.getTestScores(Ytest2,Xtest2,TrainModel2) else: testScores2[cv_ind]=learningUtils.getTestScores(Ytest2,Xtest2,TrainModel2) #plt.title(Labeling,fontsize=10) else: new_trainScores2,YtrainPredicted2=learningUtils.getTrainScores(Ytrain2,Xtrain2,YtrainTrue2,TrainModel2) YpredictedOverAllLabels[cv_ind].loc[Labeling]=YtrainPredicted2 trainScores2=concat([trainScores2,new_trainScores2],axis=1) testScores2[cv_ind]=learningUtils.getTestScores(Ytest2,Xtest2,TrainModel2) cv_ind+=1 #crossValScores=crossValScores.append(CVscoresDF,ignore_index=True) #information about entire train test data. fig2=plt.figure(2) if len(LabelingList)>1: plt.subplot(round(len(LabelingList)/2),2,label_ind+1) #if isAddDroppedSubjects: # testLabelsSummary=testLabels_all[Labeling].loc[AllSubjects] # else: # testLabelsSummary=testLabels scoresSummary = learningUtils.getScoresSummary(trainScores2,testScores2,TrueLabels[Labeling]) # reset global vars globalVars.fitYscale='notDefined' globalVars.beta=DF() plt.title(Labeling,fontsize=10) plt.xlabel('Ytrue',fontsize=8) plt.ylabel('Ypredicted',fontsize=8) plt.tick_params(labelsize=6) #print(crossValScores.T) scores=scoresSummary.fillna(0.) #analyze feature weightsL WeightedFeatures1=DF([ModelWeights1.mean(axis=1),ModelWeights1.std(axis=1)],index=['mean','std']).T.fillna(0) if isDecompose==0: WeightedFeatures1FeatureType=WeightedFeatures1.mean(level='FeatureType') WeightedFeatures1FsSingal=WeightedFeatures1.mean(level='fs-signal') WeightedFeatures1=concat([DF(index=['-------(A) FeatureType-------']),WeightedFeatures1FeatureType,DF(index=['-------(B) faceshift signal-------']),WeightedFeatures1FsSingal]) WeightedFeatures2=DF([ModelWeights2.mean(axis=1),ModelWeights2.std(axis=1)],index=['mean','std']).T.fillna(0) BestFeatures=concat([DF(index=['------------- Learning 1 -------------']),WeightedFeatures1,DF(index=['------------- Learning 2 -------------']),WeightedFeatures2]) self.BestFeatures[Labeling]=BestFeatures['mean'] #analyze decomposition if isDecompose: Components_mean = Components.mean(axis=0) Components_std = Components.std(axis=0) ExplainedVar_mean = DF(ExplainedVar.mean(axis=1)).T#todo- check! ExplainedVar_mean.index=['ExplainedVar_mean'] ExplainedVar_std = DF(ExplainedVar.std(axis=1)).T#todo- check! ExplainedVar_std.index=['ExplainedVar_std'] try: self.LabelComponents[Labeling]=concat([DF(index=['---components mean---']),Components_mean,ExplainedVar_mean,DF(index=['---components std over cross validation---']),Components_std,ExplainedVar_std]) except AttributeError: self.LabelComponents=dict.fromkeys(LabelingList) self.LabelComponents[Labeling]=concat([DF(index=['---components mean---']),Components_mean,ExplainedVar_mean,DF(index=['---components std over cross validation---']),Components_std,ExplainedVar_std]) """print(Components_mean) print(ExplainedVar_mean) print(WeightedFeatures1)""" #BestFeaturesForLabel.analyze(ByLevel=0) #TODO change to regression coeff LabelFullResults=concat([DF(index=[Labeling]),scores]) self.FullResults=concat([self.FullResults,LabelFullResults]) self.ResultsDF=concat([self.ResultsDF,DF(scores[0],columns=[Labeling])],axis=1) #continue here!! to build pseudo inverse matrix from predicted to true - make sure columns + rows are set! 
#self.BestFeatures[Labeling]=BestFeaturesForLabel.WeightedMean #plt.savefig('C:\\Users\\taliat01\\Desktop\\TALIA\\Code-Python\\Results\\'+Labeling+'png') testScores3=pandas.Panel(items=range(len(X2.index))) #for each cv score... FullSubjectsList=YpredictedOverAllLabels[0].columns YdroppNans=YpredictedOverAllLabels.dropna(axis=0,how='all') YdroppNans=YdroppNans.dropna(axis=1,how='all') YpredictedOverAllLabels=YdroppNans.dropna(axis=2,how='all') notNans_cv_ind=YpredictedOverAllLabels.items notNans_trainSubjects=YpredictedOverAllLabels.minor_axis notNans_LabelsList=YpredictedOverAllLabels.major_axis notNans_TrueLabels=TrueLabels.T[notNans_trainSubjects].loc[notNans_LabelsList] cv_ind=0 for train, test in cv: if cv_ind in notNans_cv_ind: print(test) train=list(set(FullSubjectsList[train]).intersection(set(notNans_trainSubjects))) test=list(set(FullSubjectsList[test]).intersection(set(notNans_trainSubjects))) if len(train)>0 and len(test)>0: AllLabelsYTrainPredicted=YpredictedOverAllLabels[cv_ind][train] AllLabelsYTrainPredicted=AllLabelsYTrainPredicted.fillna(0) AllLabelsYTrainTrue=notNans_TrueLabels[train] AllLabelsYTestPredicted=YpredictedOverAllLabels[cv_ind][test] AllLabelsYTestTrue=notNans_TrueLabels[test] pseudoInverse_AllLabelsYTrainTrue=DF(np.linalg.pinv(AllLabelsYTrainTrue),columns=AllLabelsYTrainTrue.index,index=AllLabelsYTrainTrue.columns) global AllLabelsTransformationMatrix AllLabelsTransformationMatrix=DF(AllLabelsYTrainPredicted.dot(pseudoInverse_AllLabelsYTrainTrue),columns=pseudoInverse_AllLabelsYTrainTrue.columns)#change to real code!! TrainModel3=lambda y: y.T.dot(AllLabelsTransformationMatrix) testscores3[cv_ind]=learningUtils.getTestScores(AllLabelsYTrainTrue,AllLabelsYTrainPredicted,TrainModel3) cv_ind+=1 self.ResultsDF=self.ResultsDF.fillna(0.) ## Print and save results print('\n') print(self.ResultsDF) print('\n') D=self.Learningdetails savePath=resultsPath+'\\'+D['Model']+'_'+D['CrossVal']+'_LabelBy'+D['LabelBy']+'_Features'+D['FeatureMethod']+ '_FS'+FeatureSelection+'_Kernel'+D['Kernel']+'_'+D['CrossValSubjects']+'Subjects_PieceSize'+D['PieceLength'] if isPerm: savePath=savePath+'_PERMStest' saveName=savePath+'\\'+str(n_features)+'_features' self.Learningdetails['saveDir']=savePath dir=os.path.dirname(saveName) if not os.path.exists(dir): os.makedirs(dir) if isSavePickle is None: isSavePickle=int(raw_input('Save Results to pickle? ')) if isSaveCsv is None: isSaveCsv=int(raw_input('save Results to csv? ')) if isSaveFig is None: isSaveFig=int(raw_input('save Results to figure? ')) if isSavePickle: self.ResultsDF.to_pickle(saveName+'.pickle') self.BestFeatures.to_pickle(saveName+'_bestFeatures.pickle') if isSaveCsv: DetailsDF=DF.from_dict(self.Learningdetails,orient='index') ResultsCSV=concat([self.ResultsDF,DF(index=['-------Label Details-------']),self.N,DF(index=['-------Learning Details-------']),DetailsDF,DF(index=['-------Selected Features Analysis------']),self.BestFeatures]) ResultsCSV.to_csv(saveName+'.csv') if isSaveCsv or isSavePickle: print('successfully saved as:\n' + saveName) if isSaveFig: plt.figure(1) plt.savefig(saveName + 'Train.png') plt.figure(2) plt.savefig(saveName + 'Test.png') plt.close() plt.close()
test.ix[ser_id2].value_counts(sort=False).plot(kind='bar')
test.ix[ser_id1].value_counts(sort=False).plot(kind='bar')

# Sampling from the overlapped rated movies to calculate the correlation
periods_test = DataFrame(np.zeros((20, 7)),
                         columns=[int(ser_max / 100), int(ser_max / 50), int(ser_max / 20),
                                  int(ser_max / 10), int(ser_max / 5), int(ser_max / 2), ser_max])

for i in periods_test.index:  # sampling 20 times
    for j in periods_test.columns:
        sample = test.reindex(columns=np.random.permutation(test.columns)[:j])
        periods_test.ix[i, j] = sample.iloc[0].corr(sample.iloc[1])  # ix is for label index, iloc is for int index

print(periods_test[:5])
print(periods_test.describe())

threshold = 0.1
temp_std = 0
# Take the threshold num which makes the sampling correlation stable
for i, std in enumerate(periods_test.std()):
    if std < 0.1 and temp_std >= 0.1:
        mini_period = periods_test.columns[i]
        break
    temp_std = std

# Decide the value of min_periods. Set std 0.05 as threshold
# mini_period = 200
check_size = int(len(data.index) * 0.2)  # 20% of the dataset for testing
check = {}
check_data = data.copy()  # avoid changes to the original data
check_data = check_data.ix[check_data.count(axis=1) > mini_period]  # filter users with few ratings; without an axis, the sum is over the whole matrix
for user in np.random.permutation(check_data.index):
    movie = np.random.permutation(check_data.ix[user].dropna().index)[0]
    check[(user, movie)] = check_data.ix[user, movie]
    check_data.ix[user, movie] = np.nan
def discretise_cnv(matrix, filter_sd=True, lower_bound=-1, upper_bound=1): matrix_discrete = DataFrame(0, index=matrix.axes[0], columns=matrix.axes[1]) matrix_discrete[matrix <= lower_bound] = -1.2 matrix_discrete[matrix >= upper_bound] = 1.2 return matrix_discrete.loc[:, matrix_discrete.std() != 0] if filter_sd else matrix_discrete
import numpy as np from pandas import DataFrame, Series import pandas as pd import matplotlib.pyplot as plt import scipy.stats df=DataFrame(abs(np.random.randn(30).reshape(6,5))*100) plt.bar(np.arange(len(df.mean())), df.mean(), align='center', color='white', yerr=df.std(), ecolor='black', capsize=5, linewidth=1,) plt.grid() plt.show()