def main():
    """Produce every exploratory plot and correlation table for the red-wine data.

    Reads ``data/winequality-red.csv`` through ``utils.read_data_from_csv``,
    calls ``plot_histogram`` once per (attribute, binning function) pair, then
    the scatter-matrix, parallel-coordinates, PCA (raw and normalized) and MDS
    plot helpers, and finally writes the Pearson and Kendall correlation
    matrices to ``build/pearson.csv`` and ``build/kendall.csv``.
    """
    data = utils.read_data_from_csv('data/winequality-red.csv')

    # ``items()`` exists on both Python 2 and Python 3 dictionaries, whereas
    # ``iteritems()`` is Python 2 only.
    for attribute in data[0].keys():
        for name, func in BIN_FUNCTIONS.items():
            plot_histogram(data, attribute, func, name)

    data_frame = DataFrame(data)
    plot_scatter_matrix(data_frame)
    plot_parallel_coordinates(data_frame)

    plot_pca_projection(data)
    plot_pca_projection(data, normalized=True)
    plot_mds(data)

    data_frame.corr(method='pearson').to_csv('build/pearson.csv')
    data_frame.corr(method='kendall').to_csv('build/kendall.csv')
def pandas_pearson_compute(config: Settings, df: pd.DataFrame, summary: dict) -> Optional[pd.DataFrame]:
    """Return the pairwise Pearson correlation matrix of ``df``.

    ``config`` and ``summary`` are not read by this implementation.
    """
    pearson_matrix = df.corr(method="pearson")
    return pearson_matrix
def create_model(self, dataset: pd.DataFrame) -> None:
    """
    Description:
        Create the model, train it and compute the error distribution.

    Parameters
    ----------
    dataset : pandas DataFrame
        raw dataset that model will use to train itself
        index: integers
        columns: Date, High, Low, Open, Close, Volume, Adj Close

    Returns
    -------
    None
        Returns early, without training or evaluating, when ``get_xy_sets``
        yields no samples for the train or the test split.

    Raises
    ------
    Exception
        If the RMSE of any feature on the test split exceeds
        ``self.rmse_threshold * 100``.
    """

    def _is_missing(samples) -> bool:
        """Return True when ``samples`` is None or contains no rows."""
        return samples is None or len(samples) == 0

    self.raw_dataset = dataset.copy()

    # Work on a private copy so the derived columns below are not added to
    # the frame owned by the caller.
    dataset = dataset.copy()

    # Additional features
    dataset['Open_Close diff'] = dataset['Open'] - dataset['Close']
    dataset['High_Low diff'] = dataset['High'] - dataset['Low']

    # Creating correlation matrix, extracting useful features for training.
    # Numeric columns are selected explicitly: pandas >= 2.0 no longer drops
    # non-numeric columns (such as ``Date``) from ``corr()`` on its own.
    correlation_matrix = dataset.select_dtypes(include=[np.number]).corr()
    close_correlation = correlation_matrix['Close']
    is_significant = ((close_correlation >= self.correlation_threshold)
                      | (close_correlation <= -self.correlation_threshold))
    self.significant_features = list(close_correlation.index[is_significant])
    self.number_of_features = len(self.significant_features)
    dataset = dataset[self.significant_features]

    # Splitting dataset into train and test sets
    dataset = np.array(dataset)
    split_index = int(dataset.shape[0] * self.split_ratio)
    self.train_data = dataset[:split_index, :].copy()
    self.test_data = dataset[split_index:, :].copy()

    # The scaler is fitted on the training split only and reused for the test split.
    self.train_data = self.scaler.fit_transform(self.train_data)
    self.test_data = self.scaler.transform(self.test_data)

    x_train, y_train = self.get_xy_sets(self.train_data, self.backword_days)
    # The previous ``x is not np.array([])`` test compared identities and was
    # therefore always true; check for "no samples" explicitly instead.
    if _is_missing(x_train) or _is_missing(y_train):
        return None

    # Model initialization
    input_shape = (self.backword_days, self.number_of_features)
    self.initialize_model(input_shape)

    # Model training
    print("First training:")
    start_time = time.time()
    self.model.fit(x_train,
                   y_train,
                   epochs=self.epochs_number,
                   batch_size=self.batch,
                   validation_split=0.08)
    self.first_training_time = time.time() - start_time
    print("First training time: {:.2f} minutes ({:.3f}s)".format(
        self.first_training_time / 60, self.first_training_time))

    # Testing model on test set
    x_test, y_test = self.get_xy_sets(self.test_data, self.backword_days)
    if _is_missing(x_test) or _is_missing(y_test):
        return None
    y_predictions = self.model.predict(x_test)

    # Model evaluation (back in the original, unscaled units)
    y_predictions = self.scaler.inverse_transform(y_predictions)
    y_test = self.scaler.inverse_transform(y_test)
    self.rmse = pd.DataFrame(
        np.sqrt(np.mean((y_test - y_predictions)**2, axis=0)),
        index=self.significant_features,
        columns=['RMSE [%]'])
    print("RMSE:")
    print(self.rmse)
    # A NaN RMSE compares as False and is therefore rejected as well.
    if not (self.rmse['RMSE [%]'] <= (self.rmse_threshold * 100)).all():
        raise Exception(
            'RMSE value exceeded threshold ({}). Model is not usable.'.
            format(self.rmse_threshold))

    # Error distribution, keeping only rows whose z-score is below 3 in every feature
    self.error_distribution = y_test - y_predictions
    self.error_distribution = self.error_distribution[(np.abs(
        stats.zscore(self.error_distribution)) < 3).all(axis=1)]

    # Final training (optional): refit the scaler and the model on the whole dataset
    if self.second_train:
        final_dataset = self.scaler.fit_transform(dataset)
        final_x, final_y = self.get_xy_sets(final_dataset,
                                            self.backword_days)
        print("\nFinal training:")
        start_time = time.time()
        self.model.fit(final_x,
                       final_y,
                       epochs=self.epochs_number,
                       batch_size=self.batch,
                       validation_split=0.1)
        self.final_training_time = time.time() - start_time
        print("Final traning time: {:.2f} minutes ({:.3f}s)".format(
            self.final_training_time / 60, self.final_training_time))
        # NOTE(review): the total is only updated when the final training ran,
        # because ``final_training_time`` is only assigned in this branch.
        self.total_training_time = self.final_training_time + self.first_training_time
# Build one column of returns per security, aligned on the shared index;
# gaps left by the alignment are treated as zero return.
rtn_table = DataFrame()
for secID in secIDs:
    cp = get_return(secID)
    cp.name = secID
    rtn_table = pd.concat([rtn_table, cp], axis=1)
rtn_table.fillna(0, inplace=True)

# Scale the mean return by 250 and show it next to the correlation matrix.
exp_rtn = rtn_table.mean() * 250
print(exp_rtn)
print(rtn_table.corr())
print("*************************************************")

from cvxopt import matrix, solvers

portfolio1 = [0, 1, 2, 4, 5]
portfolio2 = range(6)
cov_mat = rtn_table.cov() * 250


def cal_efficient_frontier(portfolio):
    # Only portfolios holding 3 to 6 entries are accepted.
    if not 2 < len(portfolio) <= 6:
        raise Exception('portfolio必须为长度大于2小于7的list!')
def plot_corr_heatmap(df: pd.DataFrame) -> "pd.io.formats.style.Styler":
    """Return the correlation matrix of a dataframe styled as a heatmap.

    The matrix from ``df.corr()`` is coloured with the ``coolwarm`` colour map
    and displayed with two decimals. The styled object is returned, not drawn.
    """
    corr_matrix = df.corr()
    styler = corr_matrix.style.background_gradient(cmap='coolwarm')
    # ``Styler.set_precision`` was deprecated in pandas 1.3 and removed in 2.0
    # in favour of ``format(precision=...)``; older releases reject the
    # keyword with a TypeError, in which case the legacy method is used.
    try:
        return styler.format(precision=2)
    except TypeError:
        return styler.set_precision(2)
def _fill_peptide_parameter_frames(protein_records, records_attribute, total_peptides_number, label):
    """Collect the array-like parameters of every peptide, one column per peptide.

    :param protein_records: iterable of protein records
    :param records_attribute: name of the protein-record attribute holding the
        peptide records to walk ('received_peptide_records' or
        'missed_peptide_records')
    :param total_peptides_number: number of peptides, i.e. number of columns
    :param label: text passed to ``show_progress``
    :return: dict mapping parameter family name to a DataFrame whose columns
        are peptides and whose rows are the family's values
    """
    amino_acids = 'AGVMDYNSWLFIKPQCERTH'
    secondary_structure_fraction_names = ['Helix', 'Turn', 'Sheet']

    def empty_table(rows_number):
        return zeros((rows_number, total_peptides_number), dtype=float64)

    # Plain arrays are filled and wrapped afterwards: chained assignment of
    # the form ``frame[column][row] = value`` is not guaranteed to write
    # through to the frame.
    kidera_factors = empty_table(len(kidera_factor_names))
    acid_percents = empty_table(len(amino_acids))
    acid_compounds = empty_table(len(amino_acid_group_names))
    hydrophobic_moments = empty_table(len(hydrophobic_moments_names))
    secondary_structure_fractions = empty_table(len(secondary_structure_fraction_names))

    show_progress(label, 35, 0.0)
    column = 0
    for protein_record in protein_records:
        for peptide_record in getattr(protein_record, records_attribute):
            parameters = peptide_record.peptide_parameters

            for row, kidera_factor in enumerate(parameters.kidera_factors):
                kidera_factors[row, column] = kidera_factor['value']

            for row, acid in enumerate(amino_acids):
                acid_percents[row, column] = parameters.amino_acid_percents[acid]

            for row, group in enumerate(parameters.amino_acids_composition):
                acid_compounds[row, column] = group['percent']

            # The 'Polygly-polypro helix' moment is skipped; the remaining
            # moments are stored in consecutive rows. (The row counter used
            # to be left at 0, so every moment overwrote the first row.)
            moment_row = 0
            for moment in parameters.hydrophobic_moments:
                if moment['name'] != 'Polygly-polypro helix':
                    hydrophobic_moments[moment_row, column] = moment['moment']
                    moment_row += 1

            for row, fraction in enumerate(parameters.secondary_structure_fraction):
                secondary_structure_fractions[row, column] = fraction['value']

            column += 1
            show_progress(label, 35, column / total_peptides_number)
    print()

    return {
        'Kidera factors': DataFrame(kidera_factors),
        'Amino acid percents': DataFrame(acid_percents),
        'Amino acid compositions': DataFrame(acid_compounds),
        'Hydrophobic moments': DataFrame(hydrophobic_moments),
        'Secondary structure fractions': DataFrame(secondary_structure_fractions),
    }


def fill_per_peptide_correlations(protein_records):
    """Compute pairwise (peptide vs. peptide) Pearson correlations per parameter family.

    For the received and for the missed peptides separately, every
    array-like parameter family (Kidera factors, amino acid percents, amino
    acid compositions, hydrophobic moments, secondary structure fractions) is
    laid out with one column per peptide, correlated column against column and
    flattened with ``convert_correlation_matrix_to_serie``.

    :param protein_records: iterable of protein records exposing
        ``received_peptide_records`` and ``missed_peptide_records``
    :return: tuple ``(received_per_peptide_correlations,
        missed_per_peptide_correlations)`` of DataFrames with one row per
        peptide pair and one column per label built from
        ``per_peptide_correlation_parameter_names``
    """
    per_peptide_correlation_parameter_labels = ['{0} per peptide correlation (Pearson)'.format(name) for name in
                                                per_peptide_correlation_parameter_names]

    total_received_peptides_number = 0
    total_missed_peptides_number = 0
    for protein_record in protein_records:
        total_received_peptides_number += len(protein_record.received_peptide_records)
        total_missed_peptides_number += len(protein_record.missed_peptide_records)

    # n peptides give n * (n - 1) / 2 unordered pairs.
    total_received_pairs_number = total_received_peptides_number * (total_received_peptides_number - 1) // 2
    received_per_peptide_correlations = DataFrame(
        zeros((total_received_pairs_number, len(per_peptide_correlation_parameter_labels)), dtype=float64),
        columns=per_peptide_correlation_parameter_labels)

    total_missed_pairs_number = total_missed_peptides_number * (total_missed_peptides_number - 1) // 2
    missed_per_peptide_correlations = DataFrame(
        zeros((total_missed_pairs_number, len(per_peptide_correlation_parameter_labels)), dtype=float64),
        columns=per_peptide_correlation_parameter_labels)

    received_frames = _fill_peptide_parameter_frames(
        protein_records, 'received_peptide_records', total_received_peptides_number,
        'Filling received peptides array-like parameter lists: ')
    missed_frames = _fill_peptide_parameter_frames(
        protein_records, 'missed_peptide_records', total_missed_peptides_number,
        'Filling missed peptides array-like parameter lists: ')

    # (family name used for labels, spelling used in the progress message)
    families = [
        ('Kidera factors', 'Kidera factors'),
        ('Amino acid percents', 'amino acid percents'),
        ('Amino acid compositions', 'amino acid compositions'),
        ('Hydrophobic moments', 'hydrophobic moments'),
        ('Secondary structure fractions', 'secondary structure fractions'),
    ]
    targets = [
        ('received', received_frames, received_per_peptide_correlations),
        ('missed', missed_frames, missed_per_peptide_correlations),
    ]
    # NOTE: the per-peptide charges (Kendall) correlation is not computed.
    for family, message_name in families:
        for kind, frames, correlations in targets:
            print('Calculating {0} per peptide Pearson correlation ({1} peptides): '.format(message_name, kind),
                  end='')
            correlations['{0} per peptide correlation (Pearson)'.format(family)] = \
                convert_correlation_matrix_to_serie(frames[family].corr(method='pearson'), family)
            print('done')

    return received_per_peptide_correlations, missed_per_peptide_correlations
def file_commit_correlation(file_commit_frame: pd.DataFrame, corr_method='spearman') -> pd.DataFrame:
    """Return the column-by-column correlation matrix of ``file_commit_frame``.

    ``corr_method`` is forwarded to ``DataFrame.corr`` and defaults to
    Spearman rank correlation.
    """
    correlation = file_commit_frame.corr(method=corr_method)
    return correlation
0.50, 0.75, 1.00, 1.25, 1.50, 1.75, 1.75, 2.00, 2.25, 2.50, 2.75, 3.00, 3.25, 3.50, 4.00, 4.25, 4.50, 4.75, 5.00, 5.50 ], '分数': [ 10, 22, 13, 43, 20, 22, 33, 50, 62, 48, 55, 75, 62, 73, 81, 76, 64, 82, 90, 93 ] } # 转换为DataFrame的数据格式 examDf = DataFrame(examDict) plt.scatter(examDf.分数, examDf.学习时间, color='b', label="Exam Data") plt.xlabel("Hours") plt.ylabel("Score") plt.show() rDf = examDf.corr() print(rDf) exam_X = examDf.分数 exam_Y = examDf.学习时间 X_train, X_test, Y_train, Y_test = train_test_split(exam_X, exam_Y, train_size=.8) # X_train为训练数据标签,X_test为测试数据标签,exam_X为样本特征,exam_y为样本标签,train_size 训练数据占比 print("原始数据特征:", exam_X.shape, ",训练数据特征:", X_train.shape, ",测试数据特征:", X_test.shape) print("原始数据标签:", exam_Y.shape, ",训练数据标签:", Y_train.shape, ",测试数据标签:", Y_test.shape) plt.scatter(X_train, Y_train, color="blue", label="train data") plt.scatter(X_test, Y_test, color="red", label="test data")
print(df5)

import math


def int_float_squares(series):
    """Return the squares of a row's ``int_col`` and ``float_col`` values as a Series."""
    squares = {
        "int_sq": series["int_col"] ** 2,
        "flt_sq": series["float_col"] ** 2,
    }
    return pd.Series(squares)


print(df.apply(int_float_squares, axis=1))

### 7. Basic Stats ###
# Each statistic is computed right before it is printed.
for compute_stat in (df.describe, df.cov, df.corr):
    print(compute_stat())

### 8. Merge and Join ###
print(df)
other = DataFrame({"str_col": ["a", "b"], "some_val": [1, 2]})
print(other)
# Show the same join on ``str_col`` with each join type in turn.
for how in ("inner", "outer", "left", "right"):
    print(pd.merge(df, other, on="str_col", how=how))

### 9. Plot ###
plot_df = DataFrame(np.random.randn(1000, 2), columns=["x", "y"])
# Shift the ``y`` column up by one.
plot_df["y"] = plot_df["y"] + 1
# NOTE(review): this looks like a scratch list of pandas / NumPy method names
# rather than a runnable script -- several calls are written without the
# arguments they need (for example ``np.argsort()`` is given no array).
# Confirm before relying on it.
obj.reindex()
# Small 2 x 3 frame used by the DataFrame calls below.
data = DataFrame([[1,2,3],[4,5,6]])
data.drop()
np.argsort()
# Ranking and sorting of ``obj``.
obj.rank()
obj.sort_values()
data.tail()
# Covariance (listed twice) and correlation of the frame.
data.cov()
data.cov()
data.corr()
# Missing-data handling, label indexing and reshaping.
data.dropna()
data.loc
data.fillna()
data.unstack()
def preprocess(directory, n_entries):
    """Summarise the parsed logs stored in ``logs.h5`` and save frequency tables.

    Prints an overview of the log frame (time span, entry rate, entries per
    level) and stores, through ``directory.save``:

    * ``details``       -- result of ``get_details(df)``
    * ``lfl_sorted``    -- entry count per (level, file, line), descending
    * ``entry_types``   -- the same counts as an ordered mapping
    * ``lfl_freq``      -- per-minute counts of every ``file:line`` message
      that occurs at least ``min(100, len(df) // 1000)`` times
    * ``lfl_freq_corr`` -- correlation matrix of ``lfl_freq``

    :param directory: object providing ``get_path`` and ``save``
    :param n_entries: keep only the first ``n_entries`` rows; a negative
        value keeps every row
    """
    # Every print below takes a single pre-formatted string so that the
    # function runs unchanged on Python 2 and Python 3.
    hdf_path = directory.get_path("logs.h5", temp=False)
    print("hdf_path: %s" % hdf_path)

    store = HDFStore(hdf_path)
    try:
        print("Keys: %s" % store.keys())
        print(store)
    finally:
        # Release the file even if listing the store fails.
        store.close()

    df = pd.read_hdf(hdf_path, "logs")
    # df = directory.load('logs.h5')
    print("df: %s" % df)

    if n_entries >= 0:
        df = df[:n_entries]

    secs = (df.index.max() - df.index.min()).total_seconds()
    hours = secs / 3600
    levels = df.level.unique()

    print("%.1f hours of logs" % hours)
    # A frame whose entries share one timestamp spans zero hours; skip the
    # rates instead of dividing by zero.
    if hours > 0:
        entries_per_hour = int(len(df) / hours)
        print("%d log entries/hour" % entries_per_hour)
        print("%.1f thousand log entries/hour" % (entries_per_hour / 1000.0))
    print("%s %s" % (df.shape, df.columns))
    for level in levels:
        print("%-5s : %5d" % (level, len(df[df.level == level])))
    print("df : %s" % str(df.shape))

    if False:

        def get_peak(counts):
            """Retun the peak value in Series counts"""
            if len(counts) == 0:
                return None
            return counts.idxmax()
            # return counts.index[counts.argmax()]

    start_time, end_time = df.index.min(), df.index.max()
    print("orginal: start_time, end_time = %s, %s" % (start_time, end_time))

    # Start time and end time trunctated to whole minutes
    start_time = truncate_to_minutes(start_time + timedelta(minutes=2))
    end_time = truncate_to_minutes(end_time - timedelta(minutes=2))
    print("cleaned: start_time, end_time = %s, %s" % (start_time, end_time))

    details = get_details(df)
    directory.save("details", details)

    # The counts for each 1 minute bin
    minute_counts = get_minute_counts(df, start_time, end_time)
    print("minute_counts: %s\n%s" % (type(minute_counts), minute_counts.describe()))
    print("total entries: %s" % minute_counts.sum())

    level_counts = {level: get_minute_counts(df[df.level == level], start_time, end_time)
                    for level in levels}
    # level_peaks = {level: get_peak(level_counts[level]) for level in levels}
    # print 'level_peaks: %s' % level_peaks

    if False:
        unique_files = df.file.unique()
        print("%d source files" % len(unique_files))
        for i, fl in enumerate(sorted(unique_files)[:5]):
            print("%3d: %s" % (i, fl))
        directory.save("unique_files", unique_files)

    #
    # Get all the unique log messages
    #
    level_file_line = df.groupby(["level", "file", "line"])
    lfl_size = level_file_line.size()
    # ``Series.order`` was removed from pandas; use ``sort_values`` when it
    # exists and fall back to ``order`` only on very old releases.
    sort_by_count = getattr(lfl_size, "sort_values", None) or lfl_size.order
    lfl_sorted = sort_by_count(ascending=False)
    print("lfl_sorted: %s" % str(lfl_sorted.shape))
    # directory.save('level_file_line', tuple(level_file_line))
    directory.save("lfl_sorted", lfl_sorted)

    # file:line uniquely identifies each level,file,line
    # Construct mappings in both directions
    lfl_to_string = OrderedDict(((lvl, fl, ln), "%s:%d" % (fl, ln)) for lvl, fl, ln in lfl_sorted.index)
    string_to_lfl = OrderedDict(("%s:%d" % (fl, ln), (lvl, fl, ln)) for lvl, fl, ln in lfl_sorted.index)
    print("string_to_lfl: %s" % len(string_to_lfl))

    # [((level,file,line),count)] sorted by count in descending order
    entry_types_list = zip(lfl_sorted.index, lfl_sorted)

    # {(level,file,line) : count}
    entry_types = OrderedDict(entry_types_list)
    directory.save("entry_types", entry_types)
    print("entry_types: %s" % len(entry_types))

    #
    # Build the correlation table
    #
    threshold = min(100, len(df) // 1000)
    lfl_freq_dict = {}
    for s, (lvl, fl, ln) in string_to_lfl.items():
        # Select the rows of this message once and reuse them for both the
        # threshold test and the per-minute counts.
        entries = df[(df.file == fl) & (df.line == ln)]
        if len(entries) >= threshold:
            lfl_freq_dict[s] = get_minute_counts(entries, start_time, end_time)
    print("++++")
    lfl_freq = DataFrame(lfl_freq_dict, columns=list(string_to_lfl.keys()))
    directory.save("lfl_freq", lfl_freq)

    lfl_freq_corr = lfl_freq.corr()
    directory.save("lfl_freq_corr", lfl_freq_corr)
    print("lfl_freq_corr: %s" % str(lfl_freq_corr.shape))
def cell_type_corr(self):
    '''Compute correlation coefficient between cell type

    For every cell type the 0-hour and 24-hour columns of ``self.data`` are
    stacked into one series; the correlation matrix between the four stacked
    series is printed. Nothing is returned.
    '''
    # Local import keeps this block independent of how the module imports
    # pandas. ``Series.append`` was removed in pandas 2.0, so the columns are
    # stacked with ``pd.concat`` instead.
    import pandas as pd

    # cell type -> (0-hour column, 24-hour column) in ``self.data``
    time_point_columns = [
        ('HL60', ('HL60_0_hrs', 'HL60_24_hrs')),
        ('U937', ('U937_0_hrs', 'U937_24_hrs')),
        ('Jurkat', ('Jurkat_0_hrs', 'Jurkat_24_hrs call')),
        ('NB4', ('NB4_0_hrs', 'NB4_24_hrs')),
    ]

    # Single-argument prints behave the same on Python 2 and Python 3.
    print("The correlation matrix for cell types is:\n")
    celltype = pd.DataFrame(dict(
        (cell, pd.concat([self.data[early], self.data[late]]))
        for cell, (early, late) in time_point_columns))
    print(celltype.corr())
    print('\n')
# Try different lags: correlate the smoothed sentiment, shifted by -90..89
# rows, with the favorable / unfavorable / difference columns.
# NOTE(review): ``data``, ``k`` and ``lglist`` are defined by surrounding code
# that is not visible here -- confirm how this fragment is nested.
favcorrs=[]
unfavcorrs=[]
diffcorrs=[]
approvcorrs=[]
disappcorrs=[]
appdiffcorrs=[]
ovotecorrs=[]
rvotecorrs=[]
votediffcorrs=[]
lags=[]
# ``range`` works on both Python 2 and Python 3 (``xrange`` is Python 2 only).
for x in range(-90, 90):
    data['lag']=data.ma_sentiment.shift(x)
    lags.append(x)
    # Compute the correlation matrix once per lag and read the three values
    # from its 'lag' column instead of rebuilding the matrix three times.
    lag_corrs=data.corr()['lag']
    favcorrs.append(lag_corrs['favorable'])
    diffcorrs.append(lag_corrs['difference'])
    unfavcorrs.append(lag_corrs['unfavorable'])
favcorrs=Series(favcorrs, index=lags)
unfavcorrs=Series(unfavcorrs, index=lags)
diffcorrs=Series(diffcorrs, index=lags)
lagged_corrs=DataFrame({'favorable': favcorrs, 'unfavorable': unfavcorrs, 'difference': diffcorrs})
lagged_corrs.to_csv(os.path.join('data', 'lexicon_lagged_corrs' + str(k) + '.csv'), sep='\t')
lglist.append(lagged_corrs)
sm7_laggedcorrs=lglist[0]
sm15_laggedcorrs=lglist[1]
sm30_laggedcorrs=lglist[2]
# Containers for five data sets: one 1000 x 100 block of X per data set and
# one 1000-long column of Y per data set.
# NOTE(review): presumably rows are samples and columns are features --
# confirm against the DataX*.txt / DataY*.txt files.
X = zeros((5,1000,100))
Y = zeros((1000,5))
# Load the data files into the array
for i in range(5):
    X[i,:,:] = loadtxt('DataX{}.txt'.format(i))
    Y[:,i] = loadtxt('DataY{}.txt'.format(i))
# Save the correlation matrices as png files to be used in the LaTeX document
for i in range(5):
    X_df = DataFrame(X[i,:,:])
    corr = X_df.corr()
    # NOTE(review): the message announces ``X_{}_Corr.png`` while the figure
    # is written to ``X_{}_stan_Corr.png`` below -- confirm which is intended.
    print '\n' + 'Creating a Correlation Matrix Heat Map for X_{} and saving to PNG file X_{}_Corr.png'.format(i,i)
    fig=plt.figure()
    # The colour scale is clipped to [0, 1], so negative correlations all
    # map to the lowest colour.
    plt.imshow(corr, cmap='hot', vmin=0, vmax=1)
    plt.colorbar()
    # One tick every 10 columns/rows.
    plt.xticks(range(0,len(corr)+1,10), range(0,len(corr)+1,10))
    plt.yticks(range(0,len(corr)+1,10), range(0,len(corr)+1,10));
    plt.savefig('X_{}_stan_Corr.png'.format(i), format='png')
    plt.close(fig)
# Second pass over the data sets; only the recomputation of the correlation
# matrix is visible here.
for i in range(5):
    X_df = DataFrame(X[i,:,:])
    corr = X_df.corr()
# NOTE(review): this looks like a scratch list of pandas / NumPy names rather
# than a runnable script -- several calls are written without the arguments
# they need (for example ``np.argsort()`` is given no array). Confirm before
# relying on it.
pd.Index
# Small Series and 2 x 3 frame used by the calls below.
obj = Series([1, 2, 3])
obj.reindex()
data = DataFrame([[1, 2, 3], [4, 5, 6]])
data.drop()
np.argsort()
# Ranking and sorting of ``obj``.
obj.rank()
obj.sort_values()
data.tail()
# Covariance (listed twice) and correlation of the frame.
data.cov()
data.cov()
data.corr()
# Missing-data handling, label indexing and reshaping.
data.dropna()
data.loc
data.fillna()
data.unstack()
# Re-create the example frame with two rows and columns A and B.
print "Definimos de nuevo el dataframe"
df = pd.DataFrame(data={"A":[1,2], "B":[2.6,1.3]})
print df
# Add columns derived from the existing ones: a sum, a multiple and a square root.
print "añadimos columnas combinando las actuales"
df["C"] = df["A"]+df["B"]
df["D"] = df["A"]*3
df["E"] = np.sqrt(df["A"])
print df
print "*"*15
# Summary statistics available on a frame: description, covariance, correlation.
print "Datos disponibles de un dataframe"
print " descripcion del dataframe"
print df.describe()
print " covarianza "
print df.cov()
print " correlación "
print df.corr()
print "*"*15
# Build a second frame of 1000 x 2 values drawn with ``np.random.randn``.
print " Creamos otro dataframe con valores aleatorios (1000 filas y 2 columnas "
print " DataFrame(np.random.randn(1000,2),columns=['x','y'])"
plot_df = DataFrame(np.random.randn(1000,2),columns=['x','y'])
print plot_df
# Line plot and histograms of the random frame.
print "Mostramos las graficas"
plot_df.plot()
plot_df.hist()
class MultiFactor:
    """Combine several stored factors into one composite factor and evaluate it.

    The single factors are read from ``<gc.FACTORBASE_PATH>/Data/<name>.csv``.
    They can be combined with equal weights (``method == 'ew'``) or with the
    loadings of a principal component of their correlation matrix
    (``method == 'pca_<k>'``). ``factor_analysis`` writes diagnostic plots
    below ``<gc.MULTIFACTOR_PATH>/Results/<factor_name>/`` and
    ``update_factor`` stores the neutralized composite factor as a CSV file.

    ``set_factor`` resets ``factor_list``, ``method`` and ``quantile_nl`` to
    ``None``; presumably subclasses override it with real settings (TODO
    confirm), because ``get_factor`` needs a non-empty ``factor_list``.
    """

    def __init__(self, factor_name, stocks, start_date=None, end_date=None):
        self.factor_name = factor_name
        self.start_date = start_date
        self.end_date = end_date
        self.stocks = stocks
        self.factor = None
        self.factor_list = None
        self.method = None
        self.quantile_nl = None

    def set_factor(self):
        """Reset the combination settings."""
        self.factor_list = None
        self.method = None
        self.quantile_nl = None

    def get_factor(self):
        """Load every factor in ``factor_list`` and decompose their correlation matrix.

        Sets ``factor_dict``, ``df`` (one flattened column per factor),
        ``corr``, and ``e_value`` / ``e_vector`` sorted by descending eigenvalue.
        """
        self.factor_dict = {
            factor: pd.read_csv('%s/Data/%s.csv' % (gc.FACTORBASE_PATH, factor),
                                index_col=[0])
            for factor in self.factor_list
        }
        self.df = DataFrame({
            factor: self.factor_dict[factor].values.reshape(-1)
            for factor in self.factor_list
        })
        self.corr = self.df.corr()
        self.e_value, self.e_vector = np.linalg.eig(self.corr)
        # Sort eigenpairs by descending eigenvalue. ``argsort`` yields the
        # gather indices directly; indexing with ranks would apply the inverse
        # permutation and leave the values unsorted in general.
        order = np.argsort(self.e_value)[::-1]
        self.e_value = self.e_value[order]
        self.e_vector = self.e_vector[:, order]

    def pairplot(self):
        """Save a seaborn pair plot of the flattened factors as ``pair.png``."""
        plt.figure(figsize=(16, 12))
        sns.pairplot(self.df)
        plt.savefig('%s/Results/%s/pair.png' % (gc.MULTIFACTOR_PATH, self.factor_name))

    def corrplot(self):
        """Save a heat map of the factor correlation matrix as ``corr.png``."""
        plt.figure(figsize=(16, 12))
        sns.heatmap(self.corr)
        plt.savefig('%s/Results/%s/corr.png' % (gc.MULTIFACTOR_PATH, self.factor_name))

    def screeplot(self):
        """Save each eigenvalue's share of the eigenvalue sum as ``scree.png``."""
        plt.figure(figsize=(16, 12))
        plt.plot(self.e_value / self.e_value.sum())
        plt.savefig('%s/Results/%s/scree.png' % (gc.MULTIFACTOR_PATH, self.factor_name))

    def multi_analysis(self):
        """Create the result directory if needed and draw the three factor-set plots."""
        # ``makedirs(exist_ok=True)`` also creates missing parents and does
        # not fail when the directory appears between a check and the call.
        os.makedirs('%s/Results/%s' % (gc.MULTIFACTOR_PATH, self.factor_name),
                    exist_ok=True)
        self.pairplot()
        self.corrplot()
        self.screeplot()

    def combine_factor(self):
        """Build ``self.factor`` from the loaded factors according to ``self.method``.

        ``'ew'`` adds the factors with equal weight; ``'pca_<k>'`` weights
        factor ``i`` with ``e_vector[i, k]``. When ``quantile_nl`` is set, the
        squared distance to that cross-sectional (row-wise) quantile replaces
        the factor.
        """
        self.factor = DataFrame()
        if self.method == 'ew':
            for factor in self.factor_list:
                self.factor = self.factor.add(self.factor_dict[factor], fill_value=0)
        elif self.method[:4] == 'pca_':
            # Read the whole suffix so that components 10 and above work too.
            pca_num = int(self.method[4:])
            for i in range(len(self.factor_list)):
                self.factor = self.factor.add(
                    self.e_vector[i, pca_num] * self.factor_dict[self.factor_list[i]],
                    fill_value=0)
        if self.quantile_nl:
            self.factor = self.factor.subtract(
                self.factor.quantile(self.quantile_nl, axis=1), axis=0)**2

    def inf_to_nan(self, factor):
        """Replace +/- infinity with NaN in place and return ``factor``."""
        factor[factor == np.inf] = np.nan
        factor[factor == -np.inf] = np.nan
        return factor

    def _get_nonempty_industrys(self):
        """Return the level-1 industry mapping without the empty industries."""
        industrys = tools.get_industrys('L1', self.stocks)
        return {
            k: industrys[k]
            for k in industrys.keys() if len(industrys[k]) > 0
        }

    def _get_log_market_capitalization(self):
        """Load ``TOTMKTCAP`` for every stock, take logs and cut to the date range."""
        market_capitalization = DataFrame({
            stock: pd.read_csv('%s/StockTradingDerivativeData/Stock/%s.csv' %
                               (gc.DATABASE_PATH, stock),
                               index_col=[0],
                               parse_dates=[0]).loc[:, 'TOTMKTCAP']
            for stock in self.stocks
        })
        market_capitalization = np.log(market_capitalization)
        if self.start_date:
            market_capitalization = market_capitalization.loc[
                market_capitalization.index >= self.start_date, :]
        if self.end_date:
            market_capitalization = market_capitalization.loc[
                market_capitalization.index <= self.end_date, :]
        return market_capitalization

    @staticmethod
    def _size_neutralize(values, market_capitalization):
        """Remove, row by row, the no-intercept regression of ``values`` on market cap."""
        beta = (values * market_capitalization).sum(1) / (
            market_capitalization * market_capitalization).sum(1)
        return values - market_capitalization.mul(beta, axis=0)

    def factor_analysis(self, industry_neutral=True, size_neutral=True, num_group=10):
        """Evaluate ``self.factor`` against the five label files y1..y5.

        Optionally neutralizes the factor (and the labels) by industry and by
        log market capitalization, then computes per-date IC, a 20-row rolling
        IR and a grouped backtest with ``num_group`` quantile groups. Results
        are stored in ``self.IC``, ``self.IR``, ``self.group_pos`` and
        ``self.group_backtest``; the plots ``hist.png``, ``IC.png``, ``IR.png``
        and ``groupbacktest<i>.png`` are written below
        ``<gc.MULTIFACTOR_PATH>/Results/<factor_name>/<method>/``.
        """
        self.factor = self.inf_to_nan(self.factor)
        stocks = self.stocks
        start_date = self.start_date
        end_date = self.end_date

        # Labels y1..y5, restricted to the requested stocks and dates.
        ys = []
        for horizon in range(1, 6):
            y = pd.read_csv('%s/Data/y%s.csv' % (gc.LABELBASE_PATH, horizon),
                            index_col=[0],
                            parse_dates=[0]).loc[:, stocks]
            if start_date:
                y = y.loc[y.index >= start_date, :]
            if end_date:
                y = y.loc[y.index <= end_date, :]
            ys.append(y)
        self.y1, self.y2, self.y3, self.y4, self.y5 = ys

        os.makedirs('%s/Results/%s/%s' %
                    (gc.MULTIFACTOR_PATH, self.factor_name, self.method),
                    exist_ok=True)

        factor = self.factor.copy()
        # Industry neutralization
        if industry_neutral:
            industrys = self._get_nonempty_industrys()
            factor = tools.standardize_industry(self.factor, industrys)
            self.factor_industry_neutral = factor.copy()
        # Market-capitalization (size) neutralization
        if size_neutral:
            market_capitalization = self._get_log_market_capitalization()
            if industry_neutral:
                market_capitalization = tools.standardize_industry(
                    market_capitalization, industrys)
            factor = self._size_neutralize(factor, market_capitalization)
            self.factor_industry_size_neutral = factor.copy()

        # self.factor_industry_neutral.fillna(0, inplace=True)
        # self.factor_industry_size_neutral.fillna(0, inplace=True)
        # factor.fillna(0, inplace=True)

        # Factor distribution
        plt.figure(figsize=(16, 12))
        plt.hist(factor.fillna(0).values.flatten())
        plt.savefig('%s/Results/%s/%s/hist.png' %
                    (gc.MULTIFACTOR_PATH, self.factor_name, self.method))

        # Cross-sectional quantile of every factor value. ``DataFrame.rank``
        # leaves NaN as NaN and ranks the remaining values with averaged ties;
        # ``scipy.stats.rankdata`` propagates NaN through whole rows on recent
        # SciPy releases. The quantile does not depend on the label, so it is
        # computed once.
        factor_quantile = factor.rank(axis=1).div(factor.notna().sum(1), axis=0)

        # IC, IR and grouped backtest
        IC = {}
        IR = {}
        group_backtest = {}
        group_pos = {}
        for i in range(len(ys)):
            # Start from the raw label so that ``y_neutral`` is defined for
            # every combination of the two neutralization flags.
            y_neutral = ys[i]
            if industry_neutral:
                y_neutral = tools.standardize_industry(ys[i], industrys)
            if size_neutral:
                y_neutral = self._size_neutralize(y_neutral, market_capitalization)
            IC[i] = (y_neutral * factor).mean(1) / factor.std(1) / y_neutral.std(1)
            IR[i] = IC[i].rolling(20).mean() / IC[i].rolling(20).std()

            group_backtest[i] = {}
            group_pos[i] = {}
            for n in range(num_group):
                in_group = ((n / num_group <= factor_quantile)
                            & (factor_quantile <= (n + 1) / num_group))
                # 1.0 for members of the group, NaN for everything else.
                group_pos[i][n] = in_group.astype(float).where(in_group)
                group_backtest[i][n] = (
                    (group_pos[i][n] * ys[i]).mean(1) -
                    ys[i].mean(1)).cumsum().rename('%s' % (n / num_group))
        self.IC = IC
        self.IR = IR
        self.group_pos = group_pos
        self.group_backtest = group_backtest

        plt.figure(figsize=(16, 12))
        for i in range(len(ys)):
            IC[i].cumsum().plot()
        plt.legend(['%s' % i for i in range(len(ys))])
        plt.savefig('%s/Results/%s/%s/IC.png' %
                    (gc.MULTIFACTOR_PATH, self.factor_name, self.method))

        plt.figure(figsize=(16, 12))
        for i in range(len(ys)):
            IR[i].cumsum().plot()
        plt.legend(['%s' % i for i in range(len(ys))])
        plt.savefig('%s/Results/%s/%s/IR.png' %
                    (gc.MULTIFACTOR_PATH, self.factor_name, self.method))

        for i in range(len(ys)):
            plt.figure(figsize=(16, 12))
            for n in range(num_group):
                group_backtest[i][n].plot()
            plt.legend(['%s' % group for group in range(num_group)])
            plt.savefig('%s/Results/%s/%s/groupbacktest%s.png' %
                        (gc.MULTIFACTOR_PATH, self.factor_name, self.method, i))

    def update_factor(self):
        """Recompute the neutralized composite factor and append it to its CSV file.

        Runs ``set_factor``, ``get_factor`` and ``combine_factor``, neutralizes
        the result by industry and log market capitalization, and writes it to
        ``<gc.FACTORBASE_PATH>/Data/<factor_name>.csv``. When the file already
        exists, only rows after its last index value are appended.
        """
        self.set_factor()
        self.get_factor()
        self.combine_factor()
        self.factor = self.inf_to_nan(self.factor)

        # Industry neutralization (always applied here)
        industrys = self._get_nonempty_industrys()
        factor = tools.standardize_industry(self.factor, industrys)

        # Market-capitalization neutralization (always applied here)
        market_capitalization = self._get_log_market_capitalization()
        market_capitalization = tools.standardize_industry(
            market_capitalization, industrys)
        factor = self._size_neutralize(factor, market_capitalization)
        # factor.fillna(0, inplace=True)

        factor_path = '%s/Data/%s.csv' % (gc.FACTORBASE_PATH, self.factor_name)
        if os.path.exists(factor_path):
            # Parse the stored index as dates only when the new index is not
            # made of strings, so both indexes are comparable.
            if isinstance(factor.index[0], str):
                factor_old = pd.read_csv(factor_path, index_col=[0])
            else:
                factor_old = pd.read_csv(factor_path,
                                         index_col=[0],
                                         parse_dates=[0])
            factor = pd.concat([
                factor_old, factor.loc[factor.index > factor_old.index[-1], :]
            ],
                               axis=0)
        factor.sort_index(axis=0, inplace=True)
        factor.sort_index(axis=1, inplace=True)
        factor.to_csv(factor_path)
def StdCorr(data_frame: pd.DataFrame) -> pd.DataFrame:
    """Return the standard (Pearson) correlation matrix of ``data_frame``."""
    return data_frame.corr(method="pearson")
def _fit_definition(self, data_x: pd.DataFrame) -> dict:
    """Choose the features to drop so that no pairwise correlation exceeds the threshold.

    This method is described in APM on pg 47 as the following steps:
        - calculate the correlation matrix of features
        - determine the two features associated with the largest absolute pairwise correlation
          (call them features `A` and `B`)
        - determine the average correlation between `A` and the other variables
        - do the same for `B`
        - if `A` has a larger average correlation, remove it; otherwise, remove feature `B`
        - repeat until no absolute correlations are above the threshold
          (``self._max_correlation_threshold``)

    :param data_x: features to analyse; the correlation matrix comes from ``data_x.corr()``
    :return: ``{'columns_to_remove': [...]}`` listing the dropped column labels in removal order
    """
    columns_to_remove = list()

    # noinspection PyUnresolvedReferences
    # The correlation matrix is computed once, outside the loop, and features are dropped from it as
    # we go, because `corr()` is a very expensive call for large datasets.
    correlation_matrix = data_x.corr()

    while True:
        features = correlation_matrix.columns.values
        # `np.abs` returns a fresh array, so blanking the diagonal does not touch the DataFrame.
        abs_correlations = np.abs(correlation_matrix.values)
        # `np.nan` instead of the `np.NaN` alias, which NumPy 2.0 removed.
        np.fill_diagonal(abs_correlations, np.nan)

        # Nothing left to compare (fewer than two features, or only NaN correlations).
        if abs_correlations.size == 0 or np.isnan(abs_correlations).all():
            break

        highest_abs_pairwise_correlation = np.nanmax(abs_correlations)
        if not highest_abs_pairwise_correlation > self._max_correlation_threshold:
            break

        # Take the first (row, column) position holding the maximum. The matrix is symmetric, so
        # this is a genuine (A, B) pair even when several pairs tie for the maximum; taking the
        # first two *row* indices instead can pair a feature with itself under ties.
        index_a, index_b = np.argwhere(
            abs_correlations == highest_abs_pairwise_correlation)[0]
        mean_a_correlation = np.nanmean(abs_correlations[index_a, ])
        mean_b_correlation = np.nanmean(abs_correlations[index_b, ])

        # Correlations can vary slightly between runs (e.g. when resampling or after scaling). If
        # those small differences flipped which feature is removed, downstream steps that expect a
        # consistent final feature set would become inconsistent. So the means are rounded
        # (arbitrarily to 3 places) before being compared.
        if round(float(mean_a_correlation), 3) > round(float(mean_b_correlation), 3):
            column_to_remove = features[index_a]
        else:
            column_to_remove = features[index_b]

        columns_to_remove.append(column_to_remove)
        correlation_matrix.drop(index=column_to_remove, columns=column_to_remove, inplace=True)

    return {'columns_to_remove': columns_to_remove}
def test_corr_nullable_integer(self, nullable_column, other_column, method): # https://github.com/pandas-dev/pandas/issues/33803 data = DataFrame({"a": nullable_column, "b": other_column}) result = data.corr(method=method) expected = DataFrame(np.ones((2, 2)), columns=["a", "b"], index=["a", "b"]) tm.assert_frame_equal(result, expected)
# Preview the first ten rows.
bizframe_sub.head(n=10)

# In[ ]:

# Order the businesses by review count, largest first.
bsort = bizframe_sub.sort_values(by='review_count', ascending=False)

# In[ ]:

bsort[:10]

# In[ ]:

# Pairwise correlation of the columns.
bizframe_sub.corr()

# In[ ]:

# Keep only the 5-star rows. `.ix` has been removed from pandas; `.loc` performs the
# same boolean-mask row selection.
top_biz = bizframe_sub.loc[bizframe_sub.stars == 5.0]

# In[ ]:

tb = top_biz.sort_values(by='review_count', ascending=False)

# In[ ]:

tb[:10]
3 3.5 0.5 ] calculate var, cov, corr, cov-matrix, corr-matrix """ data = {'x1': [1, 2], 'x2': [2, 3], 'x3': [2, 1]} A = DataFrame(data) print A data1 = Series([3, 3.5, 0.5], index=['x1', 'x2', 'x3']) A = A.append(data1, ignore_index=True) print '\n', A, '\n' var_x1 = A['x1'].var() var_x2 = A['x2'].var() var_x3 = A['x3'].var() print 'var_x1: %f' % var_x1 print 'var_x2: %f' % var_x2 print 'var_x3: %f' % var_x3 cov_matrix = A.cov() print '\ncov_matrix:\n', cov_matrix corr_matrix = A.corr() print '\ncorr_matrix:\n', corr_matrix print "\nA['x1'].corr(A['x2']): %f" % A['x1'].corr(A['x2']) print "A['x1'].corr(A['x3']): %f" % A['x1'].corr(A['x3'])
# Annotated correlation heatmap of `autumn_season`.
plt.figure(figsize=(10, 10))
plt.title('Sales_rev- Client_Room_household', y=1.05, size=15)
sns.heatmap(autumn_season.corr(), linewidths=0.1, vmax=1.0, square=True, cmap='CMRmap', linecolor='white', annot=True)
plt.show()
# not correlated

# Same heatmap for the full `df`.
plt.figure(figsize=(10, 10))
plt.title('Sales_rev- Client_Room_household', y=1.05, size=15)
sns.heatmap(df.corr(), linewidths=0.1, vmax=1.0, square=True, cmap='CMRmap', linecolor='white', annot=True)
plt.show()
# not correlated

# Per-region subsets of `df`, selected on the Client_Region column.
Client_Region = df[df.Client_Region == 'Hague']
g = df[df.Client_Region == 'Groningen']
Aarhus = df[df.Client_Region == 'Aarhus']

# Start the next figure (its plotting calls follow this fragment).
plt.figure(figsize=(10, 10))
plt.title('Sales_rev- Client_Room_household', y=1.05, size=15)
def correlation(col1, col2):
    """Return the Pearson correlation coefficient between ``col1`` and ``col2``.

    The two inputs become the columns of a two-column frame and the coefficient
    is read from its correlation matrix (computed with ``min_periods=1``).
    """
    paired = DataFrame({'col1': col1, 'col2': col2})
    # Row 0 / column 1 of the matrix is the col1-vs-col2 entry.
    return paired.corr(method='pearson', min_periods=1).iloc[0, 1]
import pandas as pd
import numpy as np
# Disabled normalization example (the path below is a local Excel workbook):
# datafile = 'D:/新建 Microsoft Office Excel 工作表.xlsx'
# data = pd.read_excel(datafile,header=None)
# min = (data-data.min())/(data.max()-data.min())
# zero = (data - data.mean())/data.std()
# float = data/10**np.ceil(np.log10(data.abs().max())) # decimal-scaling normalization
# print("原始数据为:\n",data)  # message reads "The original data is:"
# print('--------------------')
# print('最小-最大规范化后的数据:\n',min)  # message reads "Data after min-max normalization:"
from pandas import Series, DataFrame

# 4x3 frame of standard-normal samples, rows labelled a-d.
df = DataFrame(np.random.randn(4, 3), index=list('abcd'), columns=['frist', 'second', 'third'])
print(df)
print(df.describe())
print(df.sum())
print(df.sum(axis=1))
print('-----------')
print(df.idxmax(), df.idxmin(), df.idxmin(axis=1))
print(df.cumsum())
print(df.var())
print(df.std())
print(df.pct_change())
print(df.cov())
print(df.corr())
def check_corr(df: pd.DataFrame) -> None:
    """Plot the correlation matrix of ``df`` as an annotated heatmap and save it as a PNG.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose ``corr()`` matrix is drawn.
    """
    correlation_matrix = df.corr()

    plt.figure(figsize=(10,10))
    sns.heatmap(
        correlation_matrix,
        annot=True,
        square=True,
    )
    plt.savefig(f'src/sample_data/Kaggle/predict_target_of_bank/corr_heatmap.png')
def _calc_correlation_matrix(ts_df: DataFrame) -> DataFrame:
    """Return the pairwise column correlation matrix of ``ts_df`` (``DataFrame.corr`` defaults)."""
    correlation_matrix = ts_df.corr()
    return correlation_matrix
def corr_mat(
    data: pd.DataFrame,
    split: Optional[str] = None,  # Optional[Literal['pos', 'neg', 'high', 'low']] = None,
    threshold: float = 0,
    target: Optional[Union[pd.DataFrame, pd.Series, np.ndarray, str]] = None,
    method: str = "pearson",  # Literal['pearson', 'spearman', 'kendall'] = "pearson",
    colored: bool = True,
) -> Union[pd.DataFrame, Any]:
    """
    Build a correlation matrix, optionally styled so negative values show in red.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset; it is passed through ``pd.DataFrame(data)`` before use.
    split : Optional[str], optional
        Type of split, forwarded to ``_corr_selector``, by default None
        {None, "pos", "neg", "high", "low"}
    threshold : float, optional
        Correlation threshold forwarded to ``_corr_selector``, by default 0. It is \
        validated against the range [-1, 1].
    target : Optional[Union[pd.DataFrame, str]], optional
        Target for the correlation. A string names a column of ``data``; that column \
        is removed from the features and every remaining column is correlated with \
        it. A list, Series or array is wrapped in a ``pd.Series`` and used directly. \
        With any other value the full matrix is computed, by default None
    method : str, optional
        {"pearson", "spearman", "kendall"}, by default "pearson"
    colored : bool, optional
        If True a Styler is returned in which negative values are colored red, by \
        default True

    Returns
    -------
    Union[pd.DataFrame, pd.Styler]
        If colored = True - corr: Pandas Styler object
        If colored = False - corr: Pandas DataFrame
    """
    # Validate Inputs
    _validate_input_range(threshold, "threshold", -1, 1)
    _validate_input_bool(colored, "colored")

    def color_negative_red(val):
        return "color: %s" % ("#FF3344" if val < 0 else None)

    data = pd.DataFrame(data)

    if not isinstance(target, (str, list, pd.Series, np.ndarray)):
        # No usable target: full matrix, filtered by the requested split.
        corr = _corr_selector(data.corr(method=method), split=split, threshold=threshold)
    else:
        if isinstance(target, str):
            target_data = data[target]
            data = data.drop(target, axis=1)
        else:
            target_data = pd.Series(target)
            target = target_data.name

        # One column of feature-vs-target correlations, strongest positive first.
        corr = pd.DataFrame(data.corrwith(target_data, method=method))
        corr = corr.sort_values(corr.columns[0], ascending=False)
        corr.columns = [target]

    if not colored:
        return corr
    return corr.style.applymap(color_negative_red).format("{:.2f}", na_rep="-")
def calculate_correlations(df: pd.DataFrame, variables: dict) -> dict:
    """Compute every correlation matrix that is switched on in the config.

    The candidate measures are pearson, spearman, kendall, phi_k, cramers and recoded.

    Args:
        variables: A dict with column names and variable types.
        df: The DataFrame with variables.

    Returns:
        A dictionary mapping each active correlation measure to its matrix. A measure
        that fails is reported through ``warn_correlation`` and left out.
    """
    correlations = {}

    for correlation_name in ("pearson", "spearman", "kendall"):
        if not config["correlations"][correlation_name].get(bool):
            continue
        try:
            correlation = df.corr(method=correlation_name)
            if len(correlation) > 0:
                correlations[correlation_name] = correlation
        except (ValueError, AssertionError) as e:
            warn_correlation(correlation_name, e)

    if config["correlations"]["phi_k"].get(bool):
        # NOTE(review): presumably imported for its side effect of providing
        # `DataFrame.phik_matrix` - confirm before removing.
        import phik

        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            # Phi_k does not filter non-numerical with high cardinality
            selcols = []
            intcols = []
            max_distinct = config["categorical_maximum_correlation_distinct"]
            for col in df.columns.tolist():
                try:
                    value_counts = df[col].value_counts(dropna=False).reset_index()
                    tmp = value_counts.dropna().set_index("index").iloc[:, 0]
                    if tmp.index.inferred_type == "mixed":
                        continue

                    if pd.api.types.is_numeric_dtype(df[col]):
                        intcols.append(col)
                        selcols.append(col)
                    elif df[col].nunique() <= max_distinct.get(int):
                        selcols.append(col)
                except (TypeError, ValueError):
                    continue

            try:
                correlations["phi_k"] = df[selcols].phik_matrix(interval_cols=intcols)

                # Only do this if the column_order is set
                with suppress(NotFoundError):
                    # Get the preferred order
                    preferred_order = config["column_order"].get(list)

                    # Get the Phi_k sorted order
                    current_order = (
                        correlations["phi_k"].index.get_level_values("var1").tolist()
                    )

                    # Intersection (some columns are not used in correlation)
                    column_order = [x for x in preferred_order if x in current_order]

                    # Override the Phi_k sorting
                    correlations["phi_k"] = correlations["phi_k"].reindex(
                        index=column_order, columns=column_order
                    )
            except (ValueError, DataError) as e:
                warn_correlation("phi_k", e)

    categorical_correlations = {"cramers": cramers_matrix, "recoded": recoded_matrix}
    for correlation_name, get_matrix in categorical_correlations.items():
        if not config["correlations"][correlation_name].get(bool):
            continue
        try:
            correlation = get_matrix(df, variables)
            if len(correlation) > 0:
                correlations[correlation_name] = correlation
        except ValueError as e:
            warn_correlation(correlation_name, e)

    return correlations
class Tasks:
    """
    a class to read 'actionhistory' json file and convert it to numpy array
    it involves two loops over the actions.
    in the first loop it extracts the name of all the sites and actions and also the error codes.
    in the second loop, for each entry a Task item is created and stored
    """

    def __init__(self, _file, binary=False, TiersOnly=False, all_sites=None, all_errors=None, all_actions=None):
        """
        initialize an instance of Tasks
        :param str _file: the full path of the actionhistory json file
        :param bool binary: if true, converts actions to acdc/non-acdc
        :param bool TiersOnly: if true, only the tier index of the site is stored instead of the full name
        :param all_sites, all_errors, all_actions: optional lists of values known in advance.
            A list passed in is used as-is, so it is extended and sorted in place.
            When omitted, a fresh empty list is created for each instance.
        """
        self.TiersOnly = TiersOnly
        self.IsBinary = binary
        # Close the file as soon as it is parsed; `self.fIn` is kept for
        # backward compatibility and refers to the (now closed) handle.
        with open(_file) as handle:
            self.fIn = handle
            self.js = json.load(handle)

        # `None` defaults replace the former mutable `[]` defaults, which were
        # shared between every instance created without explicit lists.
        self.all_sites = [] if all_sites is None else all_sites
        self.all_errors = [] if all_errors is None else all_errors
        self.all_actions = [] if all_actions is None else all_actions

        self.FillSiteErrors()
        if binary:
            self.all_actions = ["non-acdc", "acdc"]

        # Second loop: one Task per entry. The list is grown incrementally
        # because each Task receives `self` while it is being built.
        self.AllData = []
        for tsk in self.js:
            self.AllData.append(Task(self.js[tsk], tsk, self))

        self.ErrorsGoodBadSites = np.array(
            [tsk.Get2DArrayOfErrors() for tsk in self.AllData])
        self.AllActions = np.array([tsk.action.code() for tsk in self.AllData])
        self.df = DataFrame(data=[tsk.GetInfo() for tsk in self.AllData],
                            columns=self.AllData[0].GetInfo(True))

    def GetShuffledDS(self, n):
        """
        return the first n samples of a random permutation of the data
        :param int n: number of samples to return
        :ret: errors array, actions array (rows aligned with each other)
        """
        p = np.random.permutation(len(self.AllData))
        return self.ErrorsGoodBadSites[p[:n]], self.AllActions[p[:n]]

    def GetTrainTestDS(self, train_ratio, shuffle=False):
        """
        convert the information to train/test
        :param float train_ratio: number between 0 and 1, the fraction to go for the training
        :param bool shuffle: if true, the stored arrays are first replaced by a shuffled copy
        :ret: train_x, train_y, test_x , test_y
        """
        if shuffle:
            self.ErrorsGoodBadSites, self.AllActions = self.GetShuffledDS(
                len(self.AllData))
        n = int(train_ratio * len(self.AllData))
        return (self.ErrorsGoodBadSites[:n], self.AllActions[:n],
                self.ErrorsGoodBadSites[n:], self.AllActions[n:])

    def FillSiteErrors(self, Print=False):
        """
        For the first loop and fill the lists of errors, sites and actions
        :param bool Print: do printing after it has been done
        """
        for tsk in self.js:
            errors = self.js[tsk]["errors"]
            for site_status in ["good_sites", "bad_sites"]:
                sites = errors[site_status]
                for err in sites:
                    if int(err) not in self.all_errors:
                        self.all_errors.append(int(err))
                    for site in sites[err]:
                        if site not in self.all_sites:
                            self.all_sites.append(site)

            # Convert before the membership test: the list stores strings, so
            # testing the raw value would add a non-string action on every task.
            action = str(self.js[tsk]['parameters']['action'])
            if action not in self.all_actions:
                self.all_actions.append(action)

        self.all_sites.sort()
        self.all_errors.sort()
        self.all_actions.sort()

        if Print:
            print(self.all_sites)
            print(self.all_errors)
            print(self.all_actions)

    def PlotCorrelation(self):
        """
        produce and show the correlation plot, based on the output of GetInfo method of the Task object
        """
        plt.matshow(self.df.corr())
        plt.show()

    def GroupBy(self, var1, var2):
        """
        group by var1 and var2 and plot the counts
        """
        groupby = self.df.groupby([var1, var2])
        # Count a column that is not one of the grouping keys.
        var3 = "nErrorsInGoodSites" if "nErrorsInBadSites" in [
            var1, var2
        ] else "nErrorsInBadSites"
        df_action_error_count = groupby[var3].count().reset_index()
        df_action_error_count.plot.scatter(x=var1,
                                           y=var2,
                                           s=df_action_error_count[var3])
        plt.show()
def pandas_kendall_compute(config: Settings, df: pd.DataFrame, summary: dict) -> Optional[pd.DataFrame]:
    """Return the Kendall correlation matrix of ``df``.

    ``config`` and ``summary`` are accepted but not used by this implementation.
    """
    kendall_matrix = df.corr(method="kendall")
    return kendall_matrix
def get_feature_clusters(X: pd.DataFrame, dependence_metric: str, distance_metric: str = None,
                         linkage_method: str = None, n_clusters: int = None,
                         critical_threshold: float = 0.0) -> list:
    """
    Machine Learning for Asset Managers
    Snippet 6.5.2.1 , page 85. Step 1: Features Clustering

    Split the given set of features into clustered subsets.

    :param X: (pd.DataFrame) of features.
    :param dependence_metric: (str) method used to build the dependence matrix, either 'linear' or
                              'information_variation' or 'mutual_information' or 'distance_correlation'.
                              'linear' uses ``X.corr()``.
    :param distance_metric: (str) the distance operator used to build the distance matrix.
                            The methods that can be applied are: 'angular', 'squared_angular',
                            'absolute_angular'. Set it to None to take the clusters as produced by
                            the ONC algorithm.
    :param linkage_method: (str) linkage method used for clustering. Methods include: 'single',
                           'ward', 'complete', 'average', 'weighted' and 'centroid'. Set it to None
                           to take the clusters as produced by the ONC algorithm.
    :param n_clusters: (int) number of clusters to form. Must be less than the total number of
                       features. If None, the number of clusters found by the ONC algorithm is used.
    :param critical_threshold: (float) threshold passed to the low-silhouette check applied to X
                               before clustering. Any real number in [-1, +1]; default is 0.
    :return: (list) of feature subsets.
    """
    # Checking if dataset contains features low silhouette
    X = _check_for_low_silhouette_scores(X, critical_threshold)

    # Get the dependence matrix
    if dependence_metric == 'linear':
        dep_matrix = X.corr()
    else:
        dep_matrix = get_dependence_matrix(X, dependence_method=dependence_metric)

    if n_clusters is None:
        onc_clusters = get_onc_clusters(dep_matrix.fillna(0))[1]
        if distance_metric is None or linkage_method is None:
            # Without both a distance and a linkage, the ONC clusters are the answer.
            return list(onc_clusters.values())
        # Otherwise ONC only supplies the optimal number of clusters.
        n_clusters = len(onc_clusters)

    # Check if number of clusters exceeds number of features
    if n_clusters >= len(X.columns):
        raise ValueError('Number of clusters must be less than the number of features')

    # Apply distance operator on the dependence matrix
    dist_matrix = get_distance_matrix(dep_matrix, distance_metric=distance_metric)

    # Get the linkage
    link = linkage(squareform(dist_matrix), method=linkage_method)
    labels = fcluster(link, t=n_clusters, criterion='maxclust')

    # Cluster labels start at 1; collect the features carrying each label in turn.
    clustered_subsets = []
    for cluster_id in range(1, n_clusters + 1):
        members = [feature for label, feature in zip(labels, X.columns) if label == cluster_id]
        clustered_subsets.append(members)
    return clustered_subsets
# Mean of 'Sales' for each 'Item', sorted from highest to lowest.
bike_d = y.groupby(['Item'])['Sales'].mean()
days = pd.DataFrame(data=bike_d)
bike_Item = days.sort_values(by='Sales', ascending=False, axis=0)
# NOTE(review): the title says "per month" but the grouping above is by Item - confirm.
fig = px.bar(bike_Item, x="Sales", y=bike_Item.index, color='Sales', color_continuous_scale='Blues', title="Average sales per month")
#plotly.offline.plot(fig, filename='bike')

#-----------------------CORRELATIONS
plt.figure(figsize=(15, 15))
# The mask is True strictly above the diagonal (np.triu with k=1).
sns.heatmap(df.corr(), annot=True, cmap='Blues_r', mask=np.triu(df.corr(), k=1))

#-----------------------------ROI-------------------------------------------------------------
"""ROI ON 2020 in a pandemic it was anticipated a larger use of echo transport including bikes instead of public transport so the investment was higher"""
#2019 ROI
# Keep only the rows of the year 2019.
Year2019 = df[df.Year == 2019]
investment = 40000 #received investment
# Pull the cost and loss columns for 2019.
bike_costs = Item_cost_month = Year2019['Item_cost_month']
loss = Loss_item = Year2019['Loss_item']
def RPC_correlation_matrix(data: pd.DataFrame, *args, **kwargs):
    """Return the pairwise column correlation matrix of ``data``.

    Extra positional and keyword arguments are accepted and ignored.
    """
    correlation_matrix = data.corr()
    return correlation_matrix
def check_corr(df: pd.DataFrame) -> None:
    """Plot the correlation matrix of ``df`` as an annotated heatmap and save it under ``SAVE_DIR``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose ``corr()`` matrix is drawn.
    """
    correlation_matrix = df.corr()

    plt.figure(figsize=(20, 20))
    sns.heatmap(
        correlation_matrix,
        annot=True,
        square=True,
    )
    plt.savefig(f'{SAVE_DIR}/corr_heatmap.png')
# Color scale with one step per entry of `years`.
cNorm = colors.Normalize(vmin=0, vmax=len(years))
scalarMap = cmx.ScalarMappable(norm=cNorm, cmap=jet)
# 0,1,2,3,4, #lable1 5 lowest 6 final mean
names = ['Month', 'Goals', 'Candidates', 'First Mean', 'Second Mean']
dataList = pricePridictTest.dataListManage()
print(dataList)
lablesLowest = [] # ,'Final Lowest','Final Mean'
lablesMean = []
data = DataFrame(dataList)
print(data.describe())
dataT = data.T
print(data.shape)
# Pairwise correlation of the columns of `data`.
corrMat = DataFrame(data.corr())
print(names)
# plt.figure()
# for i in range(len(dataList)):
# colorVal = scalarMap.to_rgba(i)
# plt.plot(dataList[i], c=colorVal)
# plt.grid(True)
# plt.legend(loc='upper right')
# plt.figure()
# plt.pcolor(corrMat) # draw the correlation heat map
# plt.colorbar() # show the color bar
# plt.figure()
# plt.boxplot(dataT)
# # plt.show()
def corr_finder(df: pd.DataFrame, threshold=0.3, print_corr=True, get_list=False, p_value=False, method='pearson') -> list:
    '''Find pairs of features whose absolute correlation exceeds a threshold.

    Parameters
    ----------
    df : pd.DataFrame
        The dataframe to find correlations in
    threshold : float, default=0.3
        The abs(threshold) at which to flag a relationship between two features as being correlated
    print_corr : bool, default=True
        Prints the results of the correlation finder if True
    get_list : bool, default=False
        Returns the list of correlations in the format [x, y, corr_value, p_value]
        p_value is only returned if p_value=True
    p_value : bool, default=False
        Prints the corresponding p value of a correlation if print_corr=True, and adds it
        to the list returned if get_list=True
    method : {'pearson', 'spearman', 'kendall'} or callable, default='pearson'
        The method with which to calculate the correlation matrix. None is treated as 'pearson'.

    Returns
    -------
    corr_list: list or None, shape (n_correlations, 3 or 4)
        List is returned only upon request via get_list in the format [x, y, corr_value[, p_value]]
        n_correlations = number of correlations found with the given threshold
        p_value is optional. Returned only if given argument p_value is True

    Raises
    ------
    ValueError
        If the correlation matrix is not square over all columns of df (e.g. categorical
        columns were left out of it), or if p_value=True is combined with a method other
        than 'pearson', 'spearman' or 'kendall'.

    Notes
    -----
    Only accepts DataFrames without categorical features (Or has been OneHotEncoded properly)
    Each pair is visited once, as the entry at row x / column y with x < y, where x and y are
    column positions in df.columns.
    '''
    if method is None:
        method = 'pearson'

    # Keep the input data and the correlation matrix in separate names: the p-values
    # must be computed from the data, not from the correlation matrix.
    corr = df.corr(method=method)

    # If the correlation matrix is not square with side df.shape[1], some columns were
    # dropped and the positional coordinates would no longer match df.columns.
    n_columns = df.shape[1]
    if corr.shape != (n_columns, n_columns):
        raise ValueError(
            'Correlation matrix shape should equal ({0}, {1}), it is instead {2}. Are there categorical features inside?'
            .format(n_columns, n_columns, corr.shape))

    # Calculate p-values if requested
    df_pv = None
    if p_value:
        if method == 'pearson':
            df_pv = df.corr(method=pearsonr_pval)
        elif method == 'spearman':
            df_pv = df.corr(method=spearmanr_pval)
        elif method == 'kendall':
            df_pv = df.corr(method=kendall_pval)
        else:
            raise ValueError(
                "p-values are only available for method 'pearson', 'spearman' or 'kendall'")

    corr_list = list()

    for y in range(1, corr.shape[1]):
        for x in range(y):
            value = corr.iloc[x, y]
            # NaN correlations compare False here and are skipped.
            if not abs(value) > threshold:
                continue

            # Print anything only if requested (Default)
            # If p-value is desired, print it together with correlation and coordinates
            if print_corr and p_value:
                print('({}, {})'.format(x, y),
                      '{} has a correlation of'.format(corr.columns[x]),
                      round(value, 4),
                      'with {}'.format(corr.columns[y]),
                      'with p-value of', round(df_pv.iloc[x, y], 4))
            elif print_corr:
                print('({}, {})'.format(x, y),
                      '{} has a correlation of'.format(corr.columns[x]),
                      round(value, 4),
                      'with {}'.format(corr.columns[y]))

            # If a list was requested to be returned
            if get_list:
                entry = [x, y, round(value, 4)]
                # Add p-value into list if it is desired
                if p_value:
                    entry.append(round(df_pv.iloc[x, y], 4))
                corr_list.append(entry)

    if get_list:
        return corr_list
def test_corr_invalid_method(self):
    """Check that ``DataFrame.corr`` raises ``ValueError`` for an unrecognised method string."""
    # GH#22298
    frame = DataFrame(np.random.normal(size=(10, 2)))
    expected_message = "method must be either 'pearson', 'spearman', 'kendall', or a callable, "

    with pytest.raises(ValueError, match=expected_message):
        frame.corr(method="____")
def real_data_net_import_analysis():
    """Produce the September "schedule vs real" figures, tables and statistics.

    Reads the HAL and OEMOF results below ``<cwd>/result_data/schedule vs real - sept``
    and writes into ``praktikumsbericht/images``:

    - ``sept_real_total.pdf``: bar chart of the summed import per scenario,
    - ``sept_real_correlation.tex``: correlation table of storage balance and PV output,
    - ``sept_real_example.pdf``: the series for 2016-09-01,
    - ``sept_real_peaks.pdf``: box plot of the import per scenario.

    ``dataframe_to_stat_table`` is also called on ``stats.csv`` in the setup directory.
    """
    base_path = Path().cwd() / 'result_data'
    sept_setup_path = base_path / 'schedule vs real - sept'

    oemof_baseline = load_oemof_costs(get_oemof_results(sept_setup_path, 'baseline_pred.oemof', True))
    hal_baseline = hal_load_import_kwh(sept_setup_path / 'baseline_pred', 'baseline_pred')
    oemof_sept = load_oemof_costs(get_oemof_results(sept_setup_path, 'real_data_offline.oemof', True))
    hal_sept = hal_load_import_kwh(sept_setup_path / 'real_data_online', 'real_data_online')

    # 'wh_total' divided by 1000 (Wh -> kWh, going by the column name and axis label).
    total_hal_baseline = hal_baseline['wh_total'] / 1000
    total_oemof_baseline = oemof_baseline['wh_total'] / 1000
    total_hal_sept = hal_sept['wh_total'] / 1000
    total_oemof_sept = oemof_sept['wh_total'] / 1000

    mixed_costs = pd.DataFrame({
        'Pred HAL': total_hal_baseline,
        'Pred OEMOF': total_oemof_baseline,
        'Real HAL': total_hal_sept,
        'Real OEMOF': total_oemof_sept,
    }, index=total_oemof_baseline.index)

    fig, axes = plt.subplots(nrows=1)
    mixed_costs.sum().plot.bar(ax=axes, grid=True).set_ylabel('KWh')
    fig.subplots_adjust(left=0.12, right=0.97, top=0.92, bottom=0.19, hspace=0.53)
    plt.savefig("praktikumsbericht/images/sept_real_total.pdf")
    dataframe_to_stat_table(sept_setup_path / 'stats.csv', mixed_costs)

    hal_stor_load = load_hal_storage_df(sept_setup_path / 'real_data_online', 'real_data_online')[0]['power[W]']
    pred_oemof_results = get_oemof_results(sept_setup_path, 'baseline_pred.oemof', exclude_storage=True)
    real_oemof_results = get_oemof_results(sept_setup_path, 'real_data_offline.oemof', exclude_storage=True)
    # Storage balance: flow into the storage sink minus flow out of the storage source.
    oemof_stor_load = (real_oemof_results['b1_data'][(('b1', 'sink_storage'), 'flow')]
                       - real_oemof_results['b1_data'][(('source_storage', 'b1'), 'flow')])
    pv_pred = pred_oemof_results['b1_data'][(('source_pv', 'b1'), 'flow')]
    pv_real = real_oemof_results['b1_data'][(('source_pv', 'b1'), 'flow')]

    pl = DataFrame({
        'HAL controlled Storage Balance': hal_stor_load,
        'Schedule controlled Storage Balance': oemof_stor_load,
        'Pred PV Output': pv_pred,
        'Actual PV Output': pv_real,
    }, index=real_oemof_results['b1_data'].index)

    corr = pl.corr()
    # print(corr.to_latex())
    # Write the table through a context manager so the file is flushed and closed.
    # (The former surrounding print() only ever printed None, because to_latex
    # returns None when it is given a buffer.)
    with open("praktikumsbericht/images/sept_real_correlation.tex", 'w') as tex_file:
        corr.to_latex(tex_file,
                      label='t/res/real',
                      header=['HAL', 'Schedule', 'Pred PV', 'Actual PV'],
                      caption='Correlation between PV input and storage consumption',
                      float_format="%.2f")

    fig, axes = plt.subplots(nrows=1)
    # Select the rows of one day. `.loc` is required for row selection by date string;
    # `pl["..."]` is a column lookup in current pandas.
    pl.loc["2016-09-01"].plot(ax=axes)
    axes.set_ylabel('W')
    plt.savefig("praktikumsbericht/images/sept_real_example.pdf")

    # Percentiles
    fig, axes = plt.subplots(nrows=1)
    boxplot = mixed_costs.boxplot(grid=True, showfliers=False, ax=axes)
    boxplot.set_ylabel('KWh')
    fig.subplots_adjust(left=0.12, right=0.96, top=0.93, bottom=0.07, hspace=0.20)
    plt.savefig("praktikumsbericht/images/sept_real_peaks.pdf")
    dataframe_to_stat_table(sept_setup_path / 'stats.csv', mixed_costs)
def test_corr_int(self): # dtypes other than float64 GH#1761 df3 = DataFrame({"a": [1, 2, 3, 4], "b": [1, 2, 3, 4]}) df3.cov() df3.corr()
# prepare data
test_set = pd.read_csv('raw/TestSet.csv')
test_subset = pd.read_csv('raw/TestSubset.csv')
train_set = pd.read_csv('raw/TrainingSet.csv')
train_subset = pd.read_csv('raw/TrainingSubset.csv')

train = train_set.drop(['EbayID','QuantitySold','SellerName'], axis=1)
train_target = train_set['QuantitySold']

_, n_features = train.shape

# Append the target as the last column. `.values[:, None]` turns the Series into an
# (n, 1) array (indexing a Series with `[:, None]` is no longer supported by pandas),
# and `list(range(...))` keeps the label concatenation valid on Python 3.
df = DataFrame(np.hstack((train, train_target.values[:, None])),
               columns=list(range(n_features)) + ["isSold"])

# NOTE(review): newer seaborn releases call this argument `height` instead of `size` -
# confirm against the installed version.
_ = sns.pairplot(df[:50], vars=[2,3,4,10,13], hue="isSold", size=1.5)

plt.figure(figsize=(10,10))
# compute the correlation matrix
corr = df.corr()

# generate a mask for the upper triangle
# (builtin `bool`: the `np.bool` alias has been removed from NumPy)
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)

sns.heatmap(corr, mask=mask, cmap=cmap, vmax = .3,
            square=True, xticklabels=5, yticklabels=2,
            linewidths=.5, cbar_kws={"shrink":.5})
plt.yticks(rotation=0)
plt.show()
#-*- coding: utf-8 -*- import matplotlib.pyplot as plt import numpy as np import pandas as pd from pandas import Series, DataFrame from pandas.tools.plotting import scatter_matrix df = DataFrame(np.random.randn(1000, 4), columns=['a', 'b', 'c', 'd']) corr_mat = df.corr() print corr_mat scatter_matrix(df, alpha=0.2, figsize=(16, 16), diagonal='kde') plt.show() #plt.savefig('features.png')
def get_mode(arr):
    """Return the list of modes of ``arr``, or None when there is no mode.

    A value is a mode when it occurs the maximum number of times. If every value
    occurs exactly once, or ``arr`` is empty, there is no mode and None is returned.
    Modes are listed in order of first appearance.
    """
    from collections import Counter

    # Count every element in a single pass.
    occurrences = Counter(arr)
    if not occurrences:
        return None

    highest = max(occurrences.values())
    if highest == 1:
        # Every element appears once, so there is no mode.
        return None

    # Otherwise the values that appear most often are the modes.
    return [value for value, seen in occurrences.items() if seen == highest]


get_mode(a)
var(a)
std(a)
a=Series(a)
a.skew()
a.kurt()
a.describe()
df = DataFrame({'data1' : np.random.randn(5),
                'data2' : np.random.randn(5)})
df.cov()
df.corr()

### Hypothesis testing
from scipy import stats as ss
df=DataFrame({'data':[10.1,10,9.8,10.5,9.7,10.1,9.9,10.2,10.3,9.9]})
ss.ttest_1samp(a = df, popmean = 10)