def main():
    data = utils.read_data_from_csv('data/winequality-red.csv')

    for attribute in data[0].keys():
        for name, func in BIN_FUNCTIONS.items():
            plot_histogram(data, attribute, func, name)

    data_frame = DataFrame(data)
    plot_scatter_matrix(data_frame)
    plot_parallel_coordinates(data_frame)

    plot_pca_projection(data)
    plot_pca_projection(data, normalized=True)

    plot_mds(data)

    data_frame.corr(method='pearson').to_csv('build/pearson.csv')
    data_frame.corr(method='kendall').to_csv('build/kendall.csv')
Example #2
def pandas_pearson_compute(config: Settings, df: pd.DataFrame,
                           summary: dict) -> Optional[pd.DataFrame]:
    return df.corr(method="pearson")
    def create_model(self, dataset: pd.DataFrame) -> None:
        """
            Description:
                Method creates the model, trains it, and computes the error distribution

            Parameters
            ----------
            dataset : pandas DataFrame
                raw dataset that model will use to train itself
                    index: integers 
                    columns: Date, High, Low, Open, Close, Volume, Adj Close
        """
        self.raw_dataset = dataset.copy()

        # Additional features
        dataset['Open_Close diff'] = dataset['Open'] - dataset['Close']
        dataset['High_Low diff'] = dataset['High'] - dataset['Low']

        # Creating correlation matrix, extracting useful features for training
        correlation_matrix = dataset.corr()
        self.significant_features = list(
            correlation_matrix.loc[(
                (correlation_matrix.Close >= self.correlation_threshold) |
                (correlation_matrix.Close <= -self.correlation_threshold)),
                                   ['Close']].index)
        self.number_of_features = len(self.significant_features)
        dataset = dataset[self.significant_features]

        # Splitting dataset into train and test sets
        dataset = np.array(dataset)
        split_index = int(dataset.shape[0] * self.split_ratio)
        self.train_data = dataset[:split_index, :].copy()
        self.test_data = dataset[split_index:, :].copy()
        self.train_data = self.scaler.fit_transform(self.train_data)
        self.test_data = self.scaler.transform(self.test_data)
        x_train, y_train = self.get_xy_sets(self.train_data,
                                            self.backword_days)

        # `is not np.array([])` always evaluates True; check emptiness via .size instead
        if (x_train is None or y_train is None
                or x_train.size == 0 or y_train.size == 0):
            return None

        # Model initialization
        input_shape = (self.backword_days, self.number_of_features)
        self.initialize_model(input_shape)

        # Model training
        print("First training:")
        start_time = time.time()
        self.model.fit(x_train,
                       y_train,
                       epochs=self.epochs_number,
                       batch_size=self.batch,
                       validation_split=0.08)
        self.first_training_time = time.time() - start_time
        print("First training time: {:.2f} minutes ({:.3f}s)"\
            .format(self.first_training_time/60, self.first_training_time))

        # Testing model on test set
        x_test, y_test = self.get_xy_sets(self.test_data, self.backword_days)
        if x_test is None or y_test is None:
            return None
        y_predictions = self.model.predict(x_test)

        # Model evaluation
        y_predictions = self.scaler.inverse_transform(y_predictions)
        y_test = self.scaler.inverse_transform(y_test)
        self.rmse = pd.DataFrame([
            np.sqrt(np.mean((y_test[:, i] - y_predictions[:, i])**2))
            for i in range(y_test.shape[1])
        ],
                                 index=self.significant_features,
                                 columns=['RMSE [%]'])
        print("RMSE:")
        print(self.rmse)
        if not all(row[0] <= (self.rmse_threshold * 100)
                   for idx, row in self.rmse.iterrows()):
            raise Exception(
                'RMSE value exceeded threshold ({}). Model is not usable.'.
                format(self.rmse_threshold))

        # Error distribution
        self.error_distribution = y_test - y_predictions
        self.error_distribution = self.error_distribution[(np.abs(
            stats.zscore(self.error_distribution)) < 3).all(axis=1)]

        # Final training (optional)
        if self.second_train:
            final_dataset = self.scaler.fit_transform(dataset)
            final_x, final_y = self.get_xy_sets(final_dataset,
                                                self.backword_days)
            print("\nFinal training:")
            start_time = time.time()
            self.model.fit(final_x,
                           final_y,
                           epochs=self.epochs_number,
                           batch_size=self.batch,
                           validation_split=0.1)
            self.final_training_time = time.time() - start_time
            print("Final traning time: {:.2f} minutes ({:.3f}s)"\
                .format(self.final_training_time/60, self.final_training_time))
        self.total_training_time = self.first_training_time + getattr(
            self, 'final_training_time', 0)
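get_xy_sets is not shown in this snippet; a typical sliding-window helper consistent with how it is used above might look like the following sketch (an assumption, not the original code):

import numpy as np

def get_xy_sets(data, backward_days):
    # Hypothetical windowing: (samples, backward_days, n_features) inputs and next-step targets
    x, y = [], []
    for i in range(backward_days, len(data)):
        x.append(data[i - backward_days:i, :])
        y.append(data[i, :])
    if not x:
        return None, None
    return np.array(x), np.array(y)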
rtn_table = DataFrame()

for secID in secIDs:
    cp = get_return(secID)
    cp.name = secID
    rtn_table = pd.concat([rtn_table, cp], axis=1)

rtn_table.fillna(0, inplace=True)

#rtn_table.head(5)

#rtn_table.mean()*250

#rtn_table.corr()
print(rtn_table.mean() * 250)
print(rtn_table.corr())

print("*************************************************")

from cvxopt import matrix, solvers

portfolio1 = [0, 1, 2, 4, 5]
portfolio2 = range(6)
cov_mat = rtn_table.cov() * 250
exp_rtn = rtn_table.mean() * 250


def cal_efficient_frontier(portfolio):

    if len(portfolio) <= 2 or len(portfolio) > 6:
        raise Exception('portfolio must be a list with length greater than 2 and less than 7!')
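The rest of cal_efficient_frontier is truncated here; below is a minimal sketch of the quadratic-programming step such a function typically performs with cvxopt (helper name and the long-only constraint are our assumptions, not the original code):

import numpy as np
from cvxopt import matrix, solvers

def min_variance_weights(cov, rtn, target):
    # Hypothetical helper: minimum-variance, long-only weights hitting a target return
    n = len(rtn)
    P = matrix(np.asarray(cov, dtype=float))            # objective: w' * Sigma * w
    q = matrix(0.0, (n, 1))
    G = matrix(-np.eye(n))                              # -w <= 0  (long-only)
    h = matrix(0.0, (n, 1))
    A = matrix(np.vstack([np.ones(n), np.asarray(rtn, dtype=float)]))
    b = matrix([1.0, float(target)])                    # sum(w) = 1, w' * mu = target
    sol = solvers.qp(P, q, G, h, A, b)
    return np.array(sol['x']).flatten()

# e.g. weights = min_variance_weights(cov_mat.values, exp_rtn.values, exp_rtn.mean())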
Example #5
def plot_corr_heatmap(df: pd.DataFrame):
    """Return the correlation matrix of a dataframe styled as a heatmap"""
    corr_matrix = df.corr()
    return corr_matrix.style.background_gradient(
        cmap='coolwarm').set_precision(2)
Example #6
def fill_per_peptide_correlations(protein_records):
    per_peptide_correlation_parameter_labels = ['{0} per peptide correlation (Pearson)'.format(name) for name in
                                                per_peptide_correlation_parameter_names]

    total_received_peptides_number = 0
    total_missed_peptides_number = 0
    for protein_record in protein_records:
        total_received_peptides_number += len(protein_record.received_peptide_records)
        total_missed_peptides_number += len(protein_record.missed_peptide_records)

    total_received_pairs_number = total_received_peptides_number * (total_received_peptides_number - 1) // 2
    received_per_peptide_correlations = DataFrame(zeros((total_received_pairs_number,
                                                         len(per_peptide_correlation_parameter_labels)),
                                                        dtype=float64),
                                                  columns=per_peptide_correlation_parameter_labels)
    total_missed_pairs_number = total_missed_peptides_number * (total_missed_peptides_number - 1) // 2
    missed_per_peptide_correlations = DataFrame(zeros((total_missed_pairs_number,
                                                       len(per_peptide_correlation_parameter_labels)),
                                                      dtype=float64),
                                                columns=per_peptide_correlation_parameter_labels)

    received_kidera_factors = DataFrame(zeros((len(kidera_factor_names), total_received_peptides_number),
                                              dtype=float64))
    missed_kidera_factors = DataFrame(zeros((len(kidera_factor_names), total_missed_peptides_number),
                                            dtype=float64))

    received_acid_percents = DataFrame(zeros((len('AGVMDYNSWLFIKPQCERTH'), total_received_peptides_number),
                                             dtype=float64))
    missed_acid_percents = DataFrame(zeros((len('AGVMDYNSWLFIKPQCERTH'), total_missed_peptides_number),
                                           dtype=float64))

    received_acid_compounds = DataFrame(zeros((len(amino_acid_group_names), total_received_peptides_number),
                                              dtype=float64))
    missed_acid_compounds = DataFrame(zeros((len(amino_acid_group_names), total_missed_peptides_number),
                                            dtype=float64))

    # received_charges = []
    # missed_charges = []

    received_hydrophobic_moments = DataFrame(zeros((len(hydrophobic_moments_names), total_received_peptides_number),
                                                   dtype=float64))
    missed_hydrophobic_moments = DataFrame(zeros((len(hydrophobic_moments_names), total_missed_peptides_number),
                                                 dtype=float64))

    secondary_structure_fraction_names = ['Helix', 'Turn', 'Sheet']
    received_secondary_structure_fractions = DataFrame(
        zeros((len(secondary_structure_fraction_names), total_received_peptides_number),
              dtype=float64))
    missed_secondary_structure_fractions = DataFrame(
        zeros((len(secondary_structure_fraction_names), total_missed_peptides_number),
              dtype=float64))

    label = 'Filling received peptides array-like parameter lists: '
    show_progress(label, 35, 0.0)
    index = 1
    for protein_record in protein_records:
        for received_peptide_record in protein_record.received_peptide_records:
            kidera_factor_index = 0
            for kidera_factor in received_peptide_record.peptide_parameters.kidera_factors:
                received_kidera_factors[index - 1][kidera_factor_index] = kidera_factor['value']
                kidera_factor_index += 1

            acid_index = 0
            for acid in 'AGVMDYNSWLFIKPQCERTH':
                received_acid_percents[index - 1][acid_index] = \
                    received_peptide_record.peptide_parameters.amino_acid_percents[acid]
                acid_index += 1

            group_index = 0
            for group in received_peptide_record.peptide_parameters.amino_acids_composition:
                received_acid_compounds[index - 1][group_index] = group['percent']
                group_index += 1

            # charges = []
            # for charge in received_peptide_record.peptide_parameters.charges:
            #     charges.append(charge['charge'])
            # received_charges.append(charges)

            moment_index = 0
            for moment in received_peptide_record.peptide_parameters.hydrophobic_moments:
                if moment['name'] != 'Polygly-polypro helix':
                    received_hydrophobic_moments[index - 1][moment_index] = moment['moment']
                    moment_index += 1

            fraction_index = 0
            for fraction in received_peptide_record.peptide_parameters.secondary_structure_fraction:
                received_secondary_structure_fractions[index - 1][fraction_index] = fraction['value']
                fraction_index += 1

            show_progress(label, 35, index / total_received_peptides_number)
            index += 1
    print()

    label = 'Filling missed peptides array-like parameter lists: '
    show_progress(label, 35, 0.0)
    index = 1
    for protein_record in protein_records:
        for missed_peptide_record in protein_record.missed_peptide_records:
            kidera_factor_index = 0
            for kidera_factor in missed_peptide_record.peptide_parameters.kidera_factors:
                missed_kidera_factors[index - 1][kidera_factor_index] = kidera_factor['value']
                kidera_factor_index += 1

            acid_index = 0
            for acid in 'AGVMDYNSWLFIKPQCERTH':
                missed_acid_percents[index - 1][acid_index] = \
                    missed_peptide_record.peptide_parameters.amino_acid_percents[acid]
                acid_index += 1

            group_index = 0
            for group in missed_peptide_record.peptide_parameters.amino_acids_composition:
                missed_acid_compounds[index - 1][group_index] = group['percent']
                group_index += 1

                # charges = []
                # for charge in missed_peptide_record.peptide_parameters.charges:
                #     charges.append(charge['charge'])
                # missed_charges.append(charges)
                #
            moment_index = 0
            for moment in missed_peptide_record.peptide_parameters.hydrophobic_moments:
                if moment['name'] != 'Polygly-polypro helix':
                    missed_hydrophobic_moments[index - 1][moment_index] = moment['moment']
                    moment_index += 1

            fraction_index = 0
            for fraction in missed_peptide_record.peptide_parameters.secondary_structure_fraction:
                missed_secondary_structure_fractions[index - 1][fraction_index] = fraction['value']
                fraction_index += 1

            show_progress(label, 35, index / total_missed_peptides_number)
            index += 1
    print()

    print('Calculating Kidera factors per peptide Pearson correlation (received peptides): ', end='')
    received_per_peptide_correlations['Kidera factors per peptide correlation (Pearson)'] = \
        convert_correlation_matrix_to_serie(received_kidera_factors.corr(method='pearson'), 'Kidera factors')
    print('done')

    print('Calculating Kidera factors per peptide Pearson correlation (missed peptides): ', end='')
    missed_per_peptide_correlations['Kidera factors per peptide correlation (Pearson)'] = \
        convert_correlation_matrix_to_serie(missed_kidera_factors.corr(method='pearson'), 'Kidera factors')
    print('done')

    print('Calculating amino acid percents per peptide Pearson correlation (received peptides): ', end='')
    received_per_peptide_correlations['Amino acid percents per peptide correlation (Pearson)'] = \
        convert_correlation_matrix_to_serie(received_acid_percents.corr(method='pearson'), 'Amino acid percents')
    print('done')

    print('Calculating amino acid percents per peptide Pearson correlation (missed peptides): ', end='')
    missed_per_peptide_correlations['Amino acid percents per peptide correlation (Pearson)'] = \
        convert_correlation_matrix_to_serie(missed_acid_percents.corr(method='pearson'), 'Amino acid percents')
    print('done')

    print('Calculating amino acid compositions per peptide Pearson correlation (received peptides): ', end='')
    received_per_peptide_correlations['Amino acid compositions per peptide correlation (Pearson)'] = \
        convert_correlation_matrix_to_serie(received_acid_compounds.corr(method='pearson'), 'Amino acid compositions')
    print('done')

    print('Calculating amino acid compositions per peptide Pearson correlation (missed peptides): ', end='')
    missed_per_peptide_correlations['Amino acid compositions per peptide correlation (Pearson)'] = \
        convert_correlation_matrix_to_serie(missed_acid_compounds.corr(method='pearson'), 'Amino acid compositions')
    print('done')

    #
    # label = 'Calculating charges Kendall correlation (missed peptides): '
    # show_progress(label, 40, 0.0)
    # index = 1
    # for first_charges in range(0, len(missed_charges)):
    #     for second_charges in range(first_charges + 1, len(missed_charges)):
    #         missed['Charges per peptide correlation (Kendall)'].append(
    #             statistics.kendalltau(missed_charges[first_charges], missed_charges[second_charges]).correlation)
    #     show_progress(label, 40, index / len(missed_charges))
    #     index += 1
    # print()

    print('Calculating hydrophobic moments per peptide Pearson correlation (received peptides): ', end='')
    received_per_peptide_correlations['Hydrophobic moments per peptide correlation (Pearson)'] = \
        convert_correlation_matrix_to_serie(received_hydrophobic_moments.corr(method='pearson'), 'Hydrophobic moments')
    print('done')

    print('Calculating hydrophobic moments per peptide Pearson correlation (missed peptides): ', end='')
    missed_per_peptide_correlations['Hydrophobic moments per peptide correlation (Pearson)'] = \
        convert_correlation_matrix_to_serie(missed_hydrophobic_moments.corr(method='pearson'), 'Hydrophobic moments')
    print('done')

    print('Calculating secondary structure fractions per peptide Pearson correlation (received peptides): ', end='')
    received_per_peptide_correlations['Secondary structure fractions per peptide correlation (Pearson)'] = \
        convert_correlation_matrix_to_serie(received_secondary_structure_fractions.corr(method='pearson'),
                                            'Secondary structure fractions')
    print('done')

    print('Calculating secondary structure fractions per peptide Pearson correlation (missed peptides): ', end='')
    missed_per_peptide_correlations['Secondary structure fractions per peptide correlation (Pearson)'] = \
        convert_correlation_matrix_to_serie(missed_secondary_structure_fractions.corr(method='pearson'),
                                            'Secondary structure fractions')
    print('done')

    return received_per_peptide_correlations, missed_per_peptide_correlations
Example #7
0
def file_commit_correlation(file_commit_frame: pd.DataFrame,
                            corr_method='spearman') -> pd.DataFrame:
    return file_commit_frame.corr(method=corr_method)
Example #8
from pandas import DataFrame
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# Study hours vs. exam scores
examDict = {
    '学习时间': [
        0.50, 0.75, 1.00, 1.25, 1.50, 1.75, 1.75, 2.00, 2.25, 2.50, 2.75, 3.00,
        3.25, 3.50, 4.00, 4.25, 4.50, 4.75, 5.00, 5.50
    ],
    '分数': [
        10, 22, 13, 43, 20, 22, 33, 50, 62, 48, 55, 75, 62, 73, 81, 76, 64, 82,
        90, 93
    ]
}
# Convert to a DataFrame
examDf = DataFrame(examDict)
plt.scatter(examDf.学习时间, examDf.分数, color='b', label="Exam Data")
plt.xlabel("Hours")
plt.ylabel("Score")
plt.show()

rDf = examDf.corr()
print(rDf)

exam_X = examDf.学习时间
exam_Y = examDf.分数
X_train, X_test, Y_train, Y_test = train_test_split(exam_X,
                                                    exam_Y,
                                                    train_size=.8)
# exam_X holds the sample features (study hours), exam_Y the sample labels (scores);
# train_size is the fraction of the data used for training
print("Original feature shape:", exam_X.shape, ", training feature shape:", X_train.shape,
      ", test feature shape:", X_test.shape)
print("Original label shape:", exam_Y.shape, ", training label shape:", Y_train.shape,
      ", test label shape:", Y_test.shape)

plt.scatter(X_train, Y_train, color="blue", label="train data")
plt.scatter(X_test, Y_test, color="red", label="test data")
Example #9
print(df5)

import math


def int_float_squares(series):
    return pd.Series({"int_sq": series["int_col"] ** 2, "flt_sq": series["float_col"] ** 2})


print(df.apply(int_float_squares, axis=1))

### 7. Basic Stats ###

print(df.describe())
print(df.cov())
print(df.corr())

### 8. Merge and Join ###

print(df)
other = DataFrame({"str_col": ["a", "b"], "some_val": [1, 2]})
print(other)
print(pd.merge(df, other, on="str_col", how="inner"))
print(pd.merge(df, other, on="str_col", how="outer"))
print(pd.merge(df, other, on="str_col", how="left"))
print(pd.merge(df, other, on="str_col", how="right"))

### 9. Plot ###

plot_df = DataFrame(np.random.randn(1000, 2), columns=["x", "y"])
plot_df["y"] = plot_df["y"].map(lambda x: x + 1)

def preprocess(directory, n_entries):

    hdf_path = directory.get_path("logs.h5", temp=False)
    print "hdf_path: %s" % hdf_path

    store = HDFStore(hdf_path)
    print "Keys: %s" % store.keys()
    print store
    store.close()
    df = pd.read_hdf(hdf_path, "logs")

    # df = directory.load('logs.h5')
    print "df: %s" % df

    if n_entries >= 0:
        df = df[:n_entries]

    secs = (df.index.max() - df.index.min()).total_seconds()
    hours = secs / 3600
    levels = df.level.unique()

    print "%.1f hours of logs" % hours

    print "%d log entries/hour" % int(len(df) / hours)
    print "%.1f thousand log entries/hour" % (int(len(df) / hours) / 1000.0)
    print df.shape, df.columns
    for level in levels:
        print "%-5s : %5d" % (level, len(df[df.level == level]))
    print "df : %s" % str(df.shape)

    if False:

        def get_peak(counts):
            """Return the index of the peak value in Series counts"""
            if len(counts) == 0:
                return None
            return counts.idxmax()

    start_time, end_time = df.index.min(), df.index.max()
    print "orginal: start_time, end_time = %s, %s" % (start_time, end_time)

    # Start time and end time truncated to whole minutes
    start_time = truncate_to_minutes(start_time + timedelta(minutes=2))
    end_time = truncate_to_minutes(end_time - timedelta(minutes=2))
    print "cleaned: start_time, end_time = %s, %s" % (start_time, end_time)

    details = get_details(df)
    directory.save("details", details)

    # The counts for each 1 minute bin
    minute_counts = get_minute_counts(df, start_time, end_time)
    print "minute_counts: %s\n%s" % (type(minute_counts), minute_counts.describe())
    print "total entries: %s" % minute_counts.sum()

    level_counts = {level: get_minute_counts(df[df.level == level], start_time, end_time) for level in levels}

    # level_peaks = {level: get_peak(level_counts[level])  for level in levels}
    # print 'level_peaks: %s' % level_peaks

    if False:
        unique_files = df.file.unique()
        print "%d source files" % len(unique_files)
        for i, fl in enumerate(sorted(unique_files)[:5]):
            print "%3d: %s" % (i, fl)

        directory.save("unique_files", unique_files)

    #
    # Get all the unique log messages
    #
    level_file_line = df.groupby(["level", "file", "line"])
    lfl_size = level_file_line.size()
    lfl_sorted = lfl_size.order(ascending=False)
    print "lfl_sorted: %s" % str(lfl_sorted.shape)

    # directory.save('level_file_line', tuple(level_file_line))
    directory.save("lfl_sorted", lfl_sorted)

    # file:line uniquely identifies each level,file,line
    # Construct mappings in both directions
    lfl_to_string = OrderedDict(((lvl, fl, ln), "%s:%d" % (fl, ln)) for lvl, fl, ln in lfl_sorted.index)
    string_to_lfl = OrderedDict(("%s:%d" % (fl, ln), (lvl, fl, ln)) for lvl, fl, ln in lfl_sorted.index)
    print "string_to_lfl: %s" % len(string_to_lfl)

    # [((level,file,line),count)] sorted by count in descending order
    entry_types_list = zip(lfl_sorted.index, lfl_sorted)

    # {(level,file,line) : count}
    entry_types = OrderedDict(entry_types_list)
    directory.save("entry_types", entry_types)
    print "entry_types: %s" % len(entry_types)

    #
    # Build the correlation table
    #
    threshold = min(100, len(df) // 1000)
    lfl_freq_dict = {
        s: get_minute_counts(df[(df.file == fl) & (df.line == ln)], start_time, end_time)
        for s, (lvl, fl, ln) in string_to_lfl.items()
        if len(df[(df.file == fl) & (df.line == ln)]) >= threshold
    }
    print "++++"
    lfl_freq = DataFrame(lfl_freq_dict, columns=string_to_lfl.keys())
    directory.save("lfl_freq", lfl_freq)

    lfl_freq_corr = lfl_freq.corr()
    directory.save("lfl_freq_corr", lfl_freq_corr)
    print "lfl_freq_corr: %s" % str(lfl_freq_corr.shape)
    def cell_type_corr(self):
        '''Compute correlation coefficient between cell types'''
        print "The correlation matrix for cell types is:\n"
        celltype = DataFrame({
            'HL60': self.data['HL60_0_hrs'].append(self.data['HL60_24_hrs']),
            'U937': self.data['U937_0_hrs'].append(self.data['U937_24_hrs']),
            'Jurkat': self.data['Jurkat_0_hrs'].append(self.data['Jurkat_24_hrs call']),
            'NB4': self.data['NB4_0_hrs'].append(self.data['NB4_24_hrs'])})
        print celltype.corr()
        print '\n'
Example #13
	#try different lags
	favcorrs=[]
	unfavcorrs=[]
	diffcorrs=[]
	approvcorrs=[]
	disappcorrs=[]
	appdiffcorrs=[]
	ovotecorrs=[]
	rvotecorrs=[]
	votediffcorrs=[]
	lags=[]

	for x in xrange(-90, 90): 
		data['lag']=data.ma_sentiment.shift(x)
		lags.append(x) 
		favcorrs.append(data.corr()['lag']['favorable'])
		diffcorrs.append(data.corr()['lag']['difference'])
		unfavcorrs.append(data.corr()['lag']['unfavorable'])

	favcorrs=Series(favcorrs, index=lags)
	unfavcorrs=Series(unfavcorrs, index=lags)
	diffcorrs=Series(diffcorrs, index=lags)

	lagged_corrs=DataFrame({'favorable': favcorrs, 'unfavorable': unfavcorrs, 'difference': diffcorrs})
	lagged_corrs.to_csv(os.path.join('data', 'lexicon_lagged_corrs' + str(k) + '.csv'), sep='\t')
	lglist.append(lagged_corrs)

sm7_laggedcorrs=lglist[0]
sm15_laggedcorrs=lglist[1]
sm30_laggedcorrs=lglist[2]
X = zeros((5,1000,100))
Y = zeros((1000,5))

# Load the data files into the array

for i in range(5):
    X[i,:,:] = loadtxt('DataX{}.txt'.format(i))
    Y[:,i] = loadtxt('DataY{}.txt'.format(i))


# Save the correlation matrices as png files to be used in the LaTeX document

for i in range(5):
    X_df = DataFrame(X[i,:,:])
    corr = X_df.corr()
    
    print '\n' + 'Creating a Correlation Matrix Heat Map for X_{} and saving to PNG file X_{}_Corr.png'.format(i,i)
    fig=plt.figure()    
    plt.imshow(corr, cmap='hot', vmin=0, vmax=1)
    plt.colorbar()
    plt.xticks(range(0,len(corr)+1,10), range(0,len(corr)+1,10))
    plt.yticks(range(0,len(corr)+1,10), range(0,len(corr)+1,10));
    
    plt.savefig('X_{}_stan_Corr.png'.format(i), format='png')
    plt.close(fig)

for i in range(5):
    X_df = DataFrame(X[i,:,:])
    corr = X_df.corr()
Example #15
pd.Index

obj = Series([1, 2, 3])

obj.reindex()

data = DataFrame([[1, 2, 3], [4, 5, 6]])
data.drop()

np.argsort()

obj.rank()

obj.sort_values()

data.tail()

data.cov()

data.cov()

data.corr()

data.dropna()

data.loc

data.fillna()

data.unstack()
print "Definimos de nuevo el dataframe"
df = pd.DataFrame(data={"A":[1,2], "B":[2.6,1.3]})
print df
print "añadimos columnas combinando las actuales"
df["C"] = df["A"]+df["B"]
df["D"] = df["A"]*3
df["E"] = np.sqrt(df["A"])
print df
print "*"*15
print "Datos disponibles de un dataframe"
print " descripcion del dataframe"
print df.describe()
print " covarianza "
print df.cov()
print " correlación "
print df.corr()
print "*"*15

print " Creamos otro dataframe con valores aleatorios (1000 filas y 2 columnas "
print " DataFrame(np.random.randn(1000,2),columns=['x','y'])"
plot_df = DataFrame(np.random.randn(1000,2),columns=['x','y'])
print plot_df
print "Mostramos las graficas"
plot_df.plot()
plot_df.hist()





Example #17
class MultiFactor:
    def __init__(self, factor_name, stocks, start_date=None, end_date=None):
        self.factor_name = factor_name
        self.start_date = start_date
        self.end_date = end_date
        self.stocks = stocks
        self.factor = None
        self.factor_list = None
        self.method = None
        self.quantile_nl = None

    def set_factor(self):
        self.factor_list = None
        self.method = None
        self.quantile_nl = None

    def get_factor(self):
        self.factor_dict = {
            factor:
            pd.read_csv('%s/Data/%s.csv' % (gc.FACTORBASE_PATH, factor),
                        index_col=[0])
            for factor in self.factor_list
        }
        self.df = DataFrame({
            factor: self.factor_dict[factor].values.reshape(-1)
            for factor in self.factor_list
        })
        self.corr = self.df.corr()
        self.e_value, self.e_vector = np.linalg.eig(self.corr)
        r = np.array(len(self.e_value) - rankdata(self.e_value),
                     dtype=np.int32)
        self.e_value = self.e_value[r]
        self.e_vector = self.e_vector[:, r]

    def pairplot(self):
        plt.figure(figsize=(16, 12))
        sns.pairplot(self.df)
        plt.savefig('%s/Results/%s/pair.png' %
                    (gc.MULTIFACTOR_PATH, self.factor_name))

    def corrplot(self):
        plt.figure(figsize=(16, 12))
        sns.heatmap(self.corr)
        plt.savefig('%s/Results/%s/corr.png' %
                    (gc.MULTIFACTOR_PATH, self.factor_name))

    def screeplot(self):
        plt.figure(figsize=(16, 12))
        plt.plot(self.e_value / self.e_value.sum())
        plt.savefig('%s/Results/%s/scree.png' %
                    (gc.MULTIFACTOR_PATH, self.factor_name))

    def multi_analysis(self):
        if not os.path.exists('%s/Results/%s' %
                              (gc.MULTIFACTOR_PATH, self.factor_name)):
            os.mkdir('%s/Results/%s' % (gc.MULTIFACTOR_PATH, self.factor_name))
        self.pairplot()
        self.corrplot()
        self.screeplot()

    def combine_factor(self):
        self.factor = DataFrame()
        if self.method == 'ew':
            for factor in self.factor_list:
                self.factor = self.factor.add(self.factor_dict[factor],
                                              fill_value=0)
        elif self.method[:4] == 'pca_':
            pca_num = int(self.method[4])
            for i in range(len(self.factor_list)):
                self.factor = self.factor.add(
                    self.e_vector[i, pca_num] *
                    self.factor_dict[self.factor_list[i]],
                    fill_value=0)
        if self.quantile_nl:
            self.factor = self.factor.subtract(self.factor.quantile(
                self.quantile_nl, axis=1),
                                               axis=0)**2

    def inf_to_nan(self, factor):
        factor[factor == np.inf] = np.nan
        factor[factor == -np.inf] = np.nan
        return factor

    def factor_analysis(self,
                        industry_neutral=True,
                        size_neutral=True,
                        num_group=10):
        self.factor = self.inf_to_nan(self.factor)
        stocks = self.stocks
        start_date = self.start_date
        end_date = self.end_date
        y1 = pd.read_csv('%s/Data/y1.csv' % gc.LABELBASE_PATH,
                         index_col=[0],
                         parse_dates=[0]).loc[:, stocks]
        y2 = pd.read_csv('%s/Data/y2.csv' % gc.LABELBASE_PATH,
                         index_col=[0],
                         parse_dates=[0]).loc[:, stocks]
        y3 = pd.read_csv('%s/Data/y3.csv' % gc.LABELBASE_PATH,
                         index_col=[0],
                         parse_dates=[0]).loc[:, stocks]
        y4 = pd.read_csv('%s/Data/y4.csv' % gc.LABELBASE_PATH,
                         index_col=[0],
                         parse_dates=[0]).loc[:, stocks]
        y5 = pd.read_csv('%s/Data/y5.csv' % gc.LABELBASE_PATH,
                         index_col=[0],
                         parse_dates=[0]).loc[:, stocks]

        if start_date:
            y1 = y1.loc[y1.index >= start_date, :]
            y2 = y2.loc[y2.index >= start_date, :]
            y3 = y3.loc[y3.index >= start_date, :]
            y4 = y4.loc[y4.index >= start_date, :]
            y5 = y5.loc[y5.index >= start_date, :]

        if end_date:
            y1 = y1.loc[y1.index <= end_date, :]
            y2 = y2.loc[y2.index <= end_date, :]
            y3 = y3.loc[y3.index <= end_date, :]
            y4 = y4.loc[y4.index <= end_date, :]
            y5 = y5.loc[y5.index <= end_date, :]

        self.y1 = y1
        self.y2 = y2
        self.y3 = y3
        self.y4 = y4
        self.y5 = y5

        if not os.path.exists(
                '%s/Results/%s/%s' %
            (gc.MULTIFACTOR_PATH, self.factor_name, self.method)):
            os.mkdir('%s/Results/%s/%s' %
                     (gc.MULTIFACTOR_PATH, self.factor_name, self.method))
        factor = self.factor.copy()

        # Industry neutralization
        if industry_neutral:
            industrys = tools.get_industrys('L1', self.stocks)
            tmp = {}
            for k in industrys.keys():
                if len(industrys[k]) > 0:
                    tmp[k] = industrys[k]
            industrys = tmp
            factor = tools.standardize_industry(self.factor, industrys)
            self.factor_industry_neutral = factor.copy()

        # Market-cap neutralization
        if size_neutral:
            market_capitalization = DataFrame({
                stock:
                pd.read_csv('%s/StockTradingDerivativeData/Stock/%s.csv' %
                            (gc.DATABASE_PATH, stock),
                            index_col=[0],
                            parse_dates=[0]).loc[:, 'TOTMKTCAP']
                for stock in self.stocks
            })
            market_capitalization = np.log(market_capitalization)
            if self.start_date:
                market_capitalization = market_capitalization.loc[
                    market_capitalization.index >= self.start_date, :]
            if self.end_date:
                market_capitalization = market_capitalization.loc[
                    market_capitalization.index <= self.end_date, :]
            if industry_neutral:
                market_capitalization = tools.standardize_industry(
                    market_capitalization, industrys)
            beta = (factor * market_capitalization).sum(1) / (
                market_capitalization * market_capitalization).sum(1)
            factor = factor - market_capitalization.mul(beta, axis=0)
            self.factor_industry_size_neutral = factor.copy()

        # self.factor_industry_neutral.fillna(0, inplace=True)
        # self.factor_industry_size_neutral.fillna(0, inplace=True)
        # factor.fillna(0, inplace=True)
        # Factor distribution
        plt.figure(figsize=(16, 12))
        plt.hist(factor.fillna(0).values.flatten())
        plt.savefig('%s/Results/%s/%s/hist.png' %
                    (gc.MULTIFACTOR_PATH, self.factor_name, self.method))

        # IC, IR, grouped backtest
        ys = [self.y1, self.y2, self.y3, self.y4, self.y5]
        IC = {}
        IR = {}
        group_backtest = {}
        group_pos = {}

        for i in range(len(ys)):
            if industry_neutral:
                y_neutral = tools.standardize_industry(ys[i], industrys)
            if size_neutral:
                y_neutral = y_neutral - market_capitalization.mul(
                    (y_neutral * market_capitalization).sum(1) /
                    (market_capitalization * market_capitalization).sum(1),
                    axis=0)
            IC[i] = (y_neutral *
                     factor).mean(1) / factor.std(1) / y_neutral.std(1)
            IR[i] = IC[i].rolling(20).mean() / IC[i].rolling(20).std()
            factor_quantile = DataFrame(
                rankdata(factor, axis=1),
                index=factor.index,
                columns=factor.columns).div(factor.notna().sum(1),
                                            axis=0)  # / len(factor.columns)
            factor_quantile[factor.isna()] = np.nan
            group_backtest[i] = {}
            group_pos[i] = {}
            for n in range(num_group):
                group_pos[i][n] = DataFrame((n / num_group <= factor_quantile)
                                            & (factor_quantile <=
                                               (n + 1) / num_group))
                group_pos[i][n][~group_pos[i][n]] = np.nan
                group_pos[i][n] = 1 * group_pos[i][n]
                group_backtest[i][n] = ((group_pos[i][n] * ys[i]).mean(1) -
                                        ys[i].mean(1)).cumsum().rename(
                                            '%s' % (n / num_group))
        self.IC = IC
        self.IR = IR
        self.group_pos = group_pos
        self.group_backtest = group_backtest

        plt.figure(figsize=(16, 12))
        for i in range(len(ys)):
            IC[i].cumsum().plot()
        plt.legend(['%s' % i for i in range(len(ys))])
        plt.savefig('%s/Results/%s/%s/IC.png' %
                    (gc.MULTIFACTOR_PATH, self.factor_name, self.method))

        plt.figure(figsize=(16, 12))
        for i in range(len(ys)):
            IR[i].cumsum().plot()
        plt.legend(['%s' % i for i in range(len(ys))])
        plt.savefig('%s/Results/%s/%s/IR.png' %
                    (gc.MULTIFACTOR_PATH, self.factor_name, self.method))

        for i in range(len(ys)):
            plt.figure(figsize=(16, 12))
            for n in range(num_group):
                group_backtest[i][n].plot()
            plt.legend(['%s' % i for i in range(num_group)])
            plt.savefig(
                '%s/Results/%s/%s/groupbacktest%s.png' %
                (gc.MULTIFACTOR_PATH, self.factor_name, self.method, i))

    def update_factor(self):
        self.set_factor()
        self.get_factor()
        self.combine_factor()
        self.factor = self.inf_to_nan(self.factor)
        #if 'industry' in self.neutral_list:
        if True:
            industrys = tools.get_industrys('L1', self.stocks)
            tmp = {}
            for k in industrys.keys():
                if len(industrys[k]) > 0:
                    tmp[k] = industrys[k]
            industrys = tmp
            factor = tools.standardize_industry(self.factor, industrys)
        #if 'market_capitalization' in self.neutral_list:
        if True:
            market_capitalization = DataFrame({
                stock:
                pd.read_csv('%s/StockTradingDerivativeData/Stock/%s.csv' %
                            (gc.DATABASE_PATH, stock),
                            index_col=[0],
                            parse_dates=[0]).loc[:, 'TOTMKTCAP']
                for stock in self.stocks
            })
            market_capitalization = np.log(market_capitalization)
            if self.start_date:
                market_capitalization = market_capitalization.loc[
                    market_capitalization.index >= self.start_date, :]
            if self.end_date:
                market_capitalization = market_capitalization.loc[
                    market_capitalization.index <= self.end_date, :]
            #if 'industry' in self.neutral_list:
            if True:
                market_capitalization = tools.standardize_industry(
                    market_capitalization, industrys)
            beta = (factor * market_capitalization).sum(1) / (
                market_capitalization * market_capitalization).sum(1)
            factor = factor - market_capitalization.mul(beta, axis=0)
        # factor.fillna(0, inplace=True)
        if os.path.exists('%s/Data/%s.csv' %
                          (gc.FACTORBASE_PATH, self.factor_name)):
            if isinstance(factor.index[0], str):
                factor_old = pd.read_csv(
                    '%s/Data/%s.csv' % (gc.FACTORBASE_PATH, self.factor_name),
                    index_col=[0])
            else:
                factor_old = pd.read_csv(
                    '%s/Data/%s.csv' % (gc.FACTORBASE_PATH, self.factor_name),
                    index_col=[0],
                    parse_dates=[0])
            factor = pd.concat([
                factor_old, factor.loc[factor.index > factor_old.index[-1], :]
            ],
                               axis=0)
            factor.sort_index(axis=0, inplace=True)
        factor.sort_index(axis=1, inplace=True)
        factor.to_csv('%s/Data/%s.csv' %
                      (gc.FACTORBASE_PATH, self.factor_name))
Example #18
0
def StdCorr(data_frame: pd.DataFrame) -> pd.DataFrame:
    # Standard Correlation Coefficient - Pearson Correlation
    std_corr_frame = data_frame.corr(method="pearson")
    return std_corr_frame
    def _fit_definition(self, data_x: pd.DataFrame) -> dict:
        # This method is described in APM on pg 47 as the following steps:
        #   - calculate the correlation matrix of features
        #   - determine the two features associated with the largest absolute pairwise correlation
        #     (call them features `A` and `B`)
        #   - Determine the average correlation between `A` and the other variables.
        #     - Do the same for `B`
        #   - If `A` has a larger average correlation, remove it; otherwise, remove feature `B`
        #   - Repeat until no absolute correlations are above the threshold (``correlation_threshold``)
        columns_to_remove = list()

        # noinspection PyUnresolvedReferences
        # `corr()` automatically excludes categorical features
        # we'll get the correlation outside the loop and remove features as we go because it is a very
        # expensive function call for large datasets
        correlation_matrix = data_x.corr()

        while True:

            local_correlation_matrix = correlation_matrix
            features = local_correlation_matrix.columns.values
            local_correlation_matrix = np.abs(local_correlation_matrix.values)
            np.fill_diagonal(local_correlation_matrix, np.NaN)

            # local_correlation_matrix.unique()
            # sorted(np.abs(np.unique(local_correlation_matrix)), reverse=True)

            highest_abs_pairwise_correlation = np.nanmax(
                local_correlation_matrix)

            if highest_abs_pairwise_correlation > self._max_correlation_threshold:
                # `where()` will always be 2 instances for correlation matrices, grab the first
                indexes = np.where(local_correlation_matrix ==
                                   highest_abs_pairwise_correlation)[0]

                mean_a_correlation = np.nanmean(
                    local_correlation_matrix[indexes[0], ])
                mean_b_correlation = np.nanmean(
                    local_correlation_matrix[indexes[1], ])

                # A potential problem is that when we are e.g. resampling, there can be slight variations
                # depending on the scaling/etc.. and if, for example, the 'RemoveCorrelationsTransformer'
                # chooses (at "random") different features to remove, this messes with the functionality
                # that detects which features we should end with (e.g. a resampling training split doesn't
                # contain an uncommon value for a particular column, and is subsequently encoded
                # (e.g. one-hot) and then the training dataset does contain that value, and things either break
                # or becomes inconsistent when predicting on two different transformed dataset
                # SO: we have to round (arbitrarily to 3) so that slight variations in correlations (e.g.
                # between the same two features when resampling) are consistent.
                if round(float(mean_a_correlation), 3) > round(
                        float(mean_b_correlation), 3):
                    column_to_remove = features[indexes[0]]
                else:
                    column_to_remove = features[indexes[1]]
                columns_to_remove.append(column_to_remove)
                correlation_matrix.drop(index=column_to_remove,
                                        columns=column_to_remove,
                                        inplace=True)
            else:
                break

        return {'columns_to_remove': columns_to_remove}
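A compact, self-contained sketch of the same removal loop described in the comments above (hypothetical helper, not part of the original class):

import numpy as np
import pandas as pd

def columns_to_drop(data: pd.DataFrame, threshold: float = 0.95) -> list:
    # Absolute correlations with the diagonal masked out
    corr = data.corr().abs()
    corr = corr.mask(np.eye(len(corr), dtype=bool))
    dropped = []
    while corr.shape[0] > 1 and np.nanmax(corr.values) > threshold:
        i, j = np.unravel_index(np.nanargmax(corr.values), corr.shape)
        a, b = corr.columns[i], corr.columns[j]
        # Remove whichever of the pair has the larger mean correlation with the rest
        victim = a if np.nanmean(corr[a]) > np.nanmean(corr[b]) else b
        dropped.append(victim)
        corr = corr.drop(index=victim, columns=victim)
    return dropped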
Example #20
 def test_corr_nullable_integer(self, nullable_column, other_column, method):
     # https://github.com/pandas-dev/pandas/issues/33803
     data = DataFrame({"a": nullable_column, "b": other_column})
     result = data.corr(method=method)
     expected = DataFrame(np.ones((2, 2)), columns=["a", "b"], index=["a", "b"])
     tm.assert_frame_equal(result, expected)
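A quick illustration of the behaviour this test covers (nullable extension dtypes; data is illustrative):

import pandas as pd

data = pd.DataFrame({"a": pd.array([1, 2, 3], dtype="Int64"),
                     "b": pd.array([2, 4, 6], dtype="Int64")})
print(data.corr(method="pearson"))   # 2x2 matrix of ones, matching the expected frame above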
Example #21
bizframe_sub.head(n=10)


# In[ ]:

bsort = bizframe_sub.sort_values(by='review_count', ascending=False)


# In[ ]:

bsort[:10]


# In[ ]:

bizframe_sub.corr()


# In[ ]:

top_biz = bizframe_sub.loc[bizframe_sub.stars == 5.0]


# In[ ]:

tb = top_biz.sort_values(by='review_count', ascending=False)


# In[ ]:

tb[:10]
Example #22
"""
    Given the data

        x1  x2   x3
        1   2    2
        2   3    1
        3   3.5  0.5

    calculate var, cov, corr, cov-matrix, corr-matrix
"""

from pandas import Series, DataFrame

data = {'x1': [1, 2], 'x2': [2, 3], 'x3': [2, 1]}
A = DataFrame(data)
print A

data1 = Series([3, 3.5, 0.5], index=['x1', 'x2', 'x3'])
A = A.append(data1, ignore_index=True)
print '\n', A, '\n'

var_x1 = A['x1'].var()
var_x2 = A['x2'].var()
var_x3 = A['x3'].var()

print 'var_x1: %f' % var_x1
print 'var_x2: %f' % var_x2
print 'var_x3: %f' % var_x3

cov_matrix = A.cov()
print '\ncov_matrix:\n', cov_matrix

corr_matrix = A.corr()
print '\ncorr_matrix:\n', corr_matrix

print "\nA['x1'].corr(A['x2']): %f" % A['x1'].corr(A['x2'])
print "A['x1'].corr(A['x3']): %f" % A['x1'].corr(A['x3'])
Example #23
plt.figure(figsize=(10, 10))
plt.title('Sales_rev- Client_Room_household', y=1.05, size=15)
sns.heatmap(autumn_season.corr(),
            linewidths=0.1,
            vmax=1.0,
            square=True,
            cmap='CMRmap',
            linecolor='white',
            annot=True)
plt.show()

#not correlated

plt.figure(figsize=(10, 10))
plt.title('Sales_rev- Client_Room_household', y=1.05, size=15)
sns.heatmap(df.corr(),
            linewidths=0.1,
            vmax=1.0,
            square=True,
            cmap='CMRmap',
            linecolor='white',
            annot=True)
plt.show()

# not correlated

Client_Region = df[df.Client_Region == 'Hague']
g = df[df.Client_Region == 'Groningen']
Aarhus = df[df.Client_Region == 'Aarhus']
plt.figure(figsize=(10, 10))
plt.title('Sales_rev- Client_Room_household', y=1.05, size=15)
Example #24
def correlation(col1, col2):
    df = DataFrame({'col1': col1, 'col2': col2})
    corr_matrix = df.corr(method='pearson', min_periods=1)
    result = corr_matrix.iloc[0, 1]
    return result
Example #25
import pandas as pd
import numpy as np

# datafile = 'D:/新建 Microsoft Office Excel 工作表.xlsx'
# data = pd.read_excel(datafile, header=None)
# min = (data - data.min()) / (data.max() - data.min())
# zero = (data - data.mean()) / data.std()
# float = data / 10**np.ceil(np.log10(data.abs().max()))  # decimal scaling normalization
# print("Original data:\n", data)
# print('--------------------')
# print('Data after min-max normalization:\n', min)

from pandas import Series, DataFrame

df = DataFrame(np.random.randn(4, 3), index=list('abcd'), columns=['first', 'second', 'third'])
print(df)
print(df.describe())
print(df.sum())
print(df.sum(axis=1))
print('-----------')
print(df.idxmax(), df.idxmin(), df.idxmin(axis=1))
print(df.cumsum())
print(df.var())
print(df.std())
print(df.pct_change())
print(df.cov())
print(df.corr())
def check_corr(df: pd.DataFrame) -> None:
    corr = df.corr()
    plt.figure(figsize=(10,10))
    sns.heatmap(corr, square=True, annot=True)
    plt.savefig(f'src/sample_data/Kaggle/predict_target_of_bank/corr_heatmap.png')
Example #27
def _calc_correlation_matrix(ts_df: DataFrame) -> DataFrame:
    return ts_df.corr()
Example #28
def corr_mat(
    data: pd.DataFrame,
    split: Optional[
        str] = None,  # Optional[Literal['pos', 'neg', 'high', 'low']] = None,
    threshold: float = 0,
    target: Optional[Union[pd.DataFrame, pd.Series, np.ndarray, str]] = None,
    method:
    str = "pearson",  # Literal['pearson', 'spearman', 'kendall'] = "pearson",
    colored: bool = True,
) -> Union[pd.DataFrame, Any]:
    """ Returns a color-encoded correlation matrix.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame. If a Pandas DataFrame \
        is provided, the index/column information is used to label the plots
    split : Optional[str], optional
        Type of split to be performed, by default None
        {None, "pos", "neg", "high", "low"}
    threshold : float, optional
        Value between 0 and 1 to set the correlation threshold, by default 0 unless \
        split = "high" or split = "low", in which case default is 0.3
    target : Optional[Union[pd.DataFrame, str]], optional
        Specify target for correlation. E.g. label column to generate only the \
        correlations between each feature and the label, by default None
    method : str, optional
        method: {"pearson", "spearman", "kendall"}, by default "pearson"
        * pearson: measures linear relationships and requires normally distributed \
            and homoscedastic data.
        * spearman: ranked/ordinal correlation, measures monotonic relationships.
        * kendall: ranked/ordinal correlation, measures monotonic relationships. \
            Computationally more expensive but more robust in smaller datasets than \
            "spearman"
    colored : bool, optional
        If True the negative values in the correlation matrix are colored in red, by \
        default True

    Returns
    -------
    Union[pd.DataFrame, pd.Styler]
        If colored = True - corr: Pandas Styler object
        If colored = False - corr: Pandas DataFrame
    """

    # Validate Inputs
    _validate_input_range(threshold, "threshold", -1, 1)
    _validate_input_bool(colored, "colored")

    def color_negative_red(val):
        color = "#FF3344" if val < 0 else None
        return "color: %s" % color

    data = pd.DataFrame(data)

    if isinstance(target, (str, list, pd.Series, np.ndarray)):
        target_data = []
        if isinstance(target, str):
            target_data = data[target]
            data = data.drop(target, axis=1)

        elif isinstance(target, (list, pd.Series, np.ndarray)):
            target_data = pd.Series(target)
            target = target_data.name

        corr = pd.DataFrame(data.corrwith(target_data, method=method))
        corr = corr.sort_values(corr.columns[0], ascending=False)
        corr.columns = [target]

    else:
        corr = data.corr(method=method)

    corr = _corr_selector(corr, split=split, threshold=threshold)

    if colored:
        return corr.style.applymap(color_negative_red).format("{:.2f}",
                                                              na_rep="-")
    else:
        return corr
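A couple of illustrative calls (toy column names; assumes the module-level helpers referenced above are importable alongside corr_mat):

df = pd.DataFrame({"a": [1, 2, 3, 4], "b": [2, 4, 6, 8], "c": [4, 3, 2, 1]})
corr_mat(df, colored=False)                                  # plain symmetric correlation DataFrame
corr_mat(df, split="high", threshold=0.5, colored=False)     # keep only the strong correlations
corr_mat(df, target="a", method="spearman", colored=False)   # each remaining feature vs. column "a"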
Example #29
def calculate_correlations(df: pd.DataFrame, variables: dict) -> dict:
    """Calculate the correlation coefficients between variables for the correlation types selected in the config
    (pearson, spearman, kendall, phi_k, cramer).

    Args:
        variables: A dict with column names and variable types.
        df: The DataFrame with variables.

    Returns:
        A dictionary containing the correlation matrices for each of the active correlation measures.
    """
    correlations = {}
    for correlation_name in ["pearson", "spearman", "kendall"]:
        if config["correlations"][correlation_name].get(bool):
            try:
                correlation = df.corr(method=correlation_name)
                if len(correlation) > 0:
                    correlations[correlation_name] = correlation
            except (ValueError, AssertionError) as e:
                warn_correlation(correlation_name, e)

    if config["correlations"]["phi_k"].get(bool):
        import phik

        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            # Phi_k does not filter non-numerical with high cardinality
            selcols = []
            intcols = []
            for col in df.columns.tolist():
                try:
                    tmp = (df[col].value_counts(dropna=False).reset_index().
                           dropna().set_index("index").iloc[:, 0])
                    if tmp.index.inferred_type == "mixed":
                        continue

                    if pd.api.types.is_numeric_dtype(df[col]):
                        intcols.append(col)
                        selcols.append(col)
                    elif df[col].nunique() <= config[
                            "categorical_maximum_correlation_distinct"].get(
                                int):
                        selcols.append(col)
                except TypeError:
                    continue
                except ValueError:
                    continue

            try:
                correlations["phi_k"] = df[selcols].phik_matrix(
                    interval_cols=intcols)

                # Only do this if the column_order is set
                with suppress(NotFoundError):
                    # Get the preferred order
                    column_order = config["column_order"].get(list)

                    # Get the Phi_k sorted order
                    current_order = (correlations["phi_k"].index.
                                     get_level_values("var1").tolist())

                    # Intersection (some columns are not used in correlation)
                    column_order = [
                        x for x in column_order if x in current_order
                    ]

                    # Override the Phi_k sorting
                    correlations["phi_k"] = correlations["phi_k"].reindex(
                        index=column_order, columns=column_order)
            except ValueError as e:
                warn_correlation("phi_k", e)
            except DataError as e:
                warn_correlation("phi_k", e)

    categorical_correlations = {
        "cramers": cramers_matrix,
        "recoded": recoded_matrix
    }
    for correlation_name, get_matrix in categorical_correlations.items():
        if config["correlations"][correlation_name].get(bool):
            try:
                correlation = get_matrix(df, variables)
                if len(correlation) > 0:
                    correlations[correlation_name] = correlation
            except ValueError as e:
                warn_correlation(correlation_name, e)

    return correlations
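For reference, a minimal standalone version of the pearson/spearman/kendall part, without the config and warning machinery (helper name is ours):

import pandas as pd

def simple_correlations(df: pd.DataFrame) -> dict:
    # Compute the three built-in correlation measures on the numeric columns only
    numeric = df.select_dtypes("number")
    out = {}
    for method in ("pearson", "spearman", "kendall"):
        corr = numeric.corr(method=method)
        if len(corr) > 0:
            out[method] = corr
    return out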
Example #30
class Tasks:
    """
    a class to read 'actionhistory' json file and convert it to numpy array
    it involves two loops over the actions. in the first loop it extracts the name of all the sites and actions and also the error codes.
    in the second loop, for each entry a Task item is created and stored
    """
    def __init__(self,
                 _file,
                 binary=False,
                 TiersOnly=False,
                 all_sites=[],
                 all_errors=[],
                 all_actions=[]):
        """
        initialize an instance of Tasks
        :param str _file: the full path of the actionhistory json file
        :param bool binary: if true, converts actions to acdc/non-acdc
        :param bool TiersOnly: if true, only the tier index of the site is stored instead of the full name
        :param all_sites, all_errors, all_actions: to be able to add additional values to the list
        """
        self.TiersOnly = TiersOnly
        self.IsBinary = binary
        self.fIn = open(_file)
        self.js = json.load(self.fIn)

        self.all_sites = all_sites
        self.all_errors = all_errors
        self.all_actions = all_actions

        self.FillSiteErrors()

        if binary:
            self.all_actions = ["non-acdc", "acdc"]

        self.AllData = []

        for tsk in self.js:
            self.AllData.append(Task(self.js[tsk], tsk, self))

        self.ErrorsGoodBadSites = np.array(
            [tsk.Get2DArrayOfErrors() for tsk in self.AllData])
        self.AllActions = np.array([tsk.action.code() for tsk in self.AllData])
        self.df = DataFrame(data=[tsk.GetInfo() for tsk in self.AllData],
                            columns=self.AllData[0].GetInfo(True))

    def GetShuffledDS(self, n):
        p = np.random.permutation(len(self.AllData))
        return self.ErrorsGoodBadSites[p[:n]], self.AllActions[p[:n]]

    def GetTrainTestDS(self, train_ratio, shuffle=False):
        """
        convert the information to train/test
        :param float train_ratio: number between 0 and 1, the fraction to go for the training
        :ret: train_x, train_y, test_x , test_y
        """
        if shuffle:
            self.ErrorsGoodBadSites, self.AllActions = self.GetShuffledDS(
                len(self.AllData))
        n = int(train_ratio * len(self.AllData))
        return (self.ErrorsGoodBadSites[:n], self.AllActions[:n],
                self.ErrorsGoodBadSites[n:], self.AllActions[n:])

    def FillSiteErrors(self, Print=False):
        """
        Run the first loop and fill the lists of errors, sites and actions
        :param bool Print: do printing after it has been done
        """
        for tsk in self.js:
            errors = self.js[tsk]["errors"]
            for site_status in ["good_sites", "bad_sites"]:
                sites = errors[site_status]
                for err in sites:
                    if int(err) not in self.all_errors:
                        self.all_errors.append(int(err))
                    for site in sites[err]:
                        if site not in self.all_sites:
                            self.all_sites.append(site)
            action = self.js[tsk]['parameters']['action']
            if action not in self.all_actions:
                self.all_actions.append(str(action))
        self.all_sites.sort()
        self.all_errors.sort()
        self.all_actions.sort()

        if Print:
            print(self.all_sites)
            print(self.all_errors)
            print(self.all_actions)

    def PlotCorrelation(self):
        """
        produce and show the correlation plot, based on the output of the GetInfo method of the Task object
        """
        plt.matshow(self.df.corr())
        plt.show()

    def GroupBy(self, var1, var2):
        """
        group by var1 and var2 and plot the counts
        """
        groupby = self.df.groupby([var1, var2])
        var3 = "nErrorsInGoodSites" if "nErrorsInBadSites" in [
            var1, var2
        ] else "nErrorsInBadSites"
        df_action_error_count = groupby[var3].count().reset_index()
        df_action_error_count.plot.scatter(x=var1,
                                           y=var2,
                                           s=df_action_error_count[var3])
        plt.show()
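A standalone sketch of the permutation-then-slice split that GetShuffledDS and GetTrainTestDS perform, shown on synthetic arrays since the real class needs an actionhistory json file.

import numpy as np

X = np.random.rand(100, 4, 2)        # stand-in for ErrorsGoodBadSites
y = np.random.randint(0, 2, 100)     # stand-in for AllActions (binary acdc / non-acdc)

p = np.random.permutation(len(X))    # shuffle once, then slice
X, y = X[p], y[p]
n = int(0.8 * len(X))                # train_ratio = 0.8
train_x, train_y, test_x, test_y = X[:n], y[:n], X[n:], y[n:]
print(train_x.shape, test_x.shape)   # (80, 4, 2) (20, 4, 2)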
Example #31
0
def pandas_kendall_compute(config: Settings, df: pd.DataFrame,
                           summary: dict) -> Optional[pd.DataFrame]:
    return df.corr(method="kendall")
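A quick check of how the two methods differ on a monotonic but non-linear relationship: Kendall's tau reaches 1.0 while Pearson's r stays below it.

import numpy as np
import pandas as pd

x = np.arange(1, 101, dtype=float)
demo = pd.DataFrame({"x": x, "y": np.exp(x / 20.0)})
print(demo.corr(method="pearson").loc["x", "y"])   # below 1.0, the relationship is not linear
print(demo.corr(method="kendall").loc["x", "y"])   # 1.0, the relationship is perfectly monotonic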
Example #32
0
def get_feature_clusters(X: pd.DataFrame,
                         dependence_metric: str,
                         distance_metric: str = None,
                         linkage_method: str = None,
                         n_clusters: int = None,
                         critical_threshold: float = 0.0) -> list:
    """
    Machine Learning for Asset Managers
    Snippet 6.5.2.1 , page 85. Step 1: Features Clustering

    Get clustered features subsets from the given set of features.

    :param X: (pd.DataFrame) of features.
    :param dependence_metric: (str) method to be used for generating the dependence matrix, one of 'linear',
                              'information_variation', 'mutual_information' or 'distance_correlation'.
    :param distance_metric: (str) the distance operator to be used for generating the distance matrix. The methods that
                            can be applied are: 'angular', 'squared_angular', 'absolute_angular'. Set it to None if the
                            clusters are to be generated as-is by the ONC algorithm.
    :param linkage_method: (str) method of linkage to be used for clustering. Methods include: 'single', 'ward',
                           'complete', 'average', 'weighted' and 'centroid'. Set it to None if the clusters are to
                           be generated as-is by the ONC algorithm.
    :param n_clusters: (int) number of clusters to form. Must be less than the total number of features. If None, the
                       optimal number of clusters decided by the ONC algorithm is used.
    :param critical_threshold: (float) threshold for determining a low silhouette score in the dataset. It can be any
                               real number in [-1, +1]; the default is 0, which means any feature with a silhouette
                               score below 0 is identified as having a low silhouette and the required transformation
                               is applied to correct it.
    :return: (list) of feature subsets.
    """
    # Check whether the dataset contains features with low silhouette scores
    X = _check_for_low_silhouette_scores(X, critical_threshold)

    # Get the dependence matrix
    if dependence_metric != 'linear':
        dep_matrix = get_dependence_matrix(X,
                                           dependence_method=dependence_metric)
    else:
        dep_matrix = X.corr()

    if n_clusters is None and (distance_metric is None
                               or linkage_method is None):
        return list(get_onc_clusters(dep_matrix.fillna(0))
                    [1].values())  # Get optimal number of clusters
    if distance_metric is not None and (linkage_method is not None
                                        and n_clusters is None):
        n_clusters = len(get_onc_clusters(dep_matrix.fillna(0))[1])
    if n_clusters >= len(
            X.columns
    ):  # Check if number of clusters exceeds number of features
        raise ValueError(
            'Number of clusters must be less than the number of features')

    # Apply distance operator on the dependence matrix
    dist_matrix = get_distance_matrix(dep_matrix,
                                      distance_metric=distance_metric)

    # Get the linkage
    link = linkage(squareform(dist_matrix), method=linkage_method)
    clusters = fcluster(link, t=n_clusters, criterion='maxclust')
    clustered_subsets = [[f for c, f in zip(clusters, X.columns) if c == ci]
                         for ci in range(1, n_clusters + 1)]

    return clustered_subsets
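Since get_dependence_matrix, get_distance_matrix and the ONC helpers are not shown here, the following is a minimal sketch of the 'linear' branch with a fixed n_clusters: a correlation matrix, the angular distance sqrt(0.5 * (1 - rho)), then scipy hierarchical clustering.

import numpy as np
import pandas as pd
from scipy.cluster.hierarchy import fcluster, linkage
from scipy.spatial.distance import squareform

rng = np.random.default_rng(0)
X = pd.DataFrame(rng.normal(size=(500, 6)), columns=list("abcdef"))
X["b"] = X["a"] + 0.1 * X["b"]                      # create two correlated pairs
X["d"] = X["c"] + 0.1 * X["d"]

dep_matrix = X.corr()
dist_matrix = np.sqrt(0.5 * (1.0 - dep_matrix))     # angular distance
np.fill_diagonal(dist_matrix.values, 0.0)           # guard against floating-point noise
link = linkage(squareform(dist_matrix.to_numpy(), checks=False), method="single")
clusters = fcluster(link, t=3, criterion="maxclust")
print([[f for c, f in zip(clusters, X.columns) if c == ci] for ci in range(1, 4)])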
Example #33
0
bike_d = y.groupby(['Item'])['Sales'].mean()
days = pd.DataFrame(data=bike_d)
bike_Item = days.sort_values(by='Sales', ascending=False, axis=0)

fig = px.bar(bike_Item,
             x="Sales",
             y=bike_Item.index,
             color='Sales',
             color_continuous_scale='Blues',
             title="Average sales per month")
#plotly.offline.plot(fig, filename='bike')

#-----------------------CORRELATIONS
plt.figure(figsize=(15, 15))
sns.heatmap(df.corr(),
            annot=True,
            cmap='Blues_r',
            mask=np.triu(np.ones_like(df.corr(), dtype=bool), k=1))

#-----------------------------ROI-------------------------------------------------------------
"""ROI ON 2020 in a pandemic it was anticipated a larger use of echo transport including bikes 
instead of public transport so the investment was higher"""

#2019 ROI
#filtering the year
Year2019 = df[df.Year == 2019]
investment = 40000  #received investment
# assigning variables to the desired columns
bike_costs = Item_cost_month = Year2019['Item_cost_month']
loss = Loss_item = Year2019['Loss_item']
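The original ROI calculation is cut off at this point; one plausible continuation, assuming a 'Sales' revenue column exists in the data, is sketched below. The formula and the column name are assumptions, not the original computation.

# hedged sketch only: 'Sales' and the ROI formula are assumed, not taken from the original
revenue_2019 = Year2019['Sales'].sum()
total_costs_2019 = bike_costs.sum() + loss.sum()
roi_2019 = (revenue_2019 - total_costs_2019 - investment) / investment
print('2019 ROI: {:.2%}'.format(roi_2019))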
Example #34
0
def RPC_correlation_matrix(data: pd.DataFrame, *args, **kwargs):
    return data.corr()
Example #35
0
def check_corr(df: pd.DataFrame) -> None:
    corr = df.corr()
    plt.figure(figsize=(20, 20))
    sns.heatmap(corr, square=True, annot=True)
    plt.savefig(f'{SAVE_DIR}/corr_heatmap.png')
Example #36
0
cNorm = colors.Normalize(vmin=0, vmax=len(years))
scalarMap = cmx.ScalarMappable(norm=cNorm, cmap=jet)

# columns 0-4 correspond to the names listed below
# label 5: lowest, 6: final mean
names = ['Month', 'Goals', 'Candidates', 'First Mean', 'Second Mean']
dataList = pricePridictTest.dataListManage()
print(dataList)
lablesLowest = []  # ,'Final Lowest','Final Mean'
lablesMean = []

data = DataFrame(dataList)
print(data.describe())
dataT = data.T
print(data.shape)
corrMat = DataFrame(data.corr())
print(names)
# plt.figure()
# for i in range(len(dataList)):
#     colorVal = scalarMap.to_rgba(i)
#     plt.plot(dataList[i], c=colorVal)
#     plt.grid(True)
#     plt.legend(loc='upper right')

# plt.figure()
# plt.pcolor(corrMat)   # plot the correlation heatmap
# plt.colorbar()  # show the colorbar
# plt.figure()
# plt.boxplot(dataT)
#
# plt.show()
Example #37
0
def corr_finder(df: pd.DataFrame,
                threshold=0.3,
                print_corr=True,
                get_list=False,
                p_value=False,
                method='pearson') -> list:
    '''Returns a list of [x, y, corr_value[, p_value]] where [x, y] are the coordinates of the value on the correlation matrix
    
    Parameters
    ----------
    df : pd.DataFrame
        The dataframe to find correlations in

    threshold : float, default=0.3
        The abs(threshold) at which to flag a relationship between two features as being correlated
        
    print_corr : bool, default=True
        Prints the results of the correlation finder if True
    
    get_list : bool, default=False
        Returns the list of correlations in the format [x, y, corr_value, p_value]
        p_value is only returned if p_value=True

    p_value : bool, default=False
        Prints the corresponding p value of a correlation if print_corr=True, and adds it to the list returned if get_list=True

    method : {'pearson', 'spearman', 'kendall'} or callable, default='pearson'
        The method with which to calculate the correlation matrix

    Returns
    -------
    corr_list: list or None, shape (n_correlations, 3 or 4)
        List is returned only upon request via get_list in the format [x, y, corr_value[, p_value]]
        n_correlations = number of correlations found with the given threshold
        p_value is optional. Returned only if given argument p_value is True
        
    Notes
    -----
    Only accepts DataFrames without categorical features (or ones that have been one-hot encoded properly)
    Checks the correlations in the lower-left triangle of the correlation matrix
    If categorical features are present, coordinates x and y will no longer match the column positions in df.columns
    '''
    # If df.corr() is not a square matrix with side length df.shape[1], there are categorical features
    corr_shape = df.corr().shape
    assert corr_shape == (df.shape[1], df.shape[1]), \
        'Correlation matrix shape should equal ({0}, {1}), it is instead {2}. Are there categorical features inside?'\
        .format(df.shape[1], df.shape[1], corr_shape)

    if method is None:
        method = 'pearson'

    # Calculate p-values if requested, from the raw data (before df is replaced by its correlation matrix)
    if p_value:
        if method == 'pearson':
            df_pv = df.corr(method=pearsonr_pval)
        elif method == 'spearman':
            df_pv = df.corr(method=spearmanr_pval)
        elif method == 'kendall':
            df_pv = df.corr(method=kendall_pval)

    df = df.corr(method=method)

    corr_list = list()

    # Combination of for statements iterate through all matrices of the
    # bottom-left triangular half of the correlation matrix
    for y in range(1, df.shape[1]):
        for x in range(0, y):

            # If correlation is above given threshold
            if abs(df.iloc[x, y]) > threshold:

                # Print anything only if requested (Default)
                # If p-value is desired, print it together with correlation and coordinates
                if print_corr and p_value:
                    print('({}, {})'.format(x, y),
                          '{} has a correlation of'.format(df.columns[x]),
                          round(df.iloc[x, y], 4),
                          'with {}'.format(df.columns[y]), 'with p-value of',
                          round(df_pv.iloc[x, y], 4))
                elif print_corr:
                    print('({}, {})'.format(x, y),
                          '{} has a correlation of'.format(df.columns[x]),
                          round(df.iloc[x, y],
                                4), 'with {}'.format(df.columns[y]))

                # If a list was requested to be returned
                if get_list:
                    # Add p-value into list if it is desired
                    if p_value:
                        corr_list.append([
                            x, y,
                            round(df.iloc[x, y], 4),
                            round(df_pv.iloc[x, y], 4)
                        ])
                    else:
                        corr_list.append([x, y, round(df.iloc[x, y], 4)])

    if get_list:
        return corr_list
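The p-value callables referenced above (pearsonr_pval, spearmanr_pval, kendall_pval) are not defined in this snippet; a minimal sketch of how they could be written with scipy.stats follows, since df.corr accepts any callable that takes two 1-D arrays and returns a scalar.

from scipy import stats


def pearsonr_pval(x, y):
    # p-value of the Pearson correlation between two 1-D arrays
    return stats.pearsonr(x, y)[1]


def spearmanr_pval(x, y):
    # p-value of the Spearman rank correlation
    return stats.spearmanr(x, y)[1]


def kendall_pval(x, y):
    # p-value of Kendall's tau
    return stats.kendalltau(x, y)[1]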
Example #38
0
    def test_corr_invalid_method(self):
        # GH#22298
        df = DataFrame(np.random.normal(size=(10, 2)))
        msg = "method must be either 'pearson', 'spearman', 'kendall', or a callable, "
        with pytest.raises(ValueError, match=msg):
            df.corr(method="____")
Example #39
0
def real_data_net_import_analysis():
    base_path = Path().cwd() / 'result_data'
    sept_setup_path = base_path / 'schedule vs real - sept'

    oemof_baseline = load_oemof_costs(get_oemof_results(sept_setup_path, 'baseline_pred.oemof', True))
    hal_baseline = hal_load_import_kwh(sept_setup_path / 'baseline_pred', 'baseline_pred')

    oemof_sept = load_oemof_costs(get_oemof_results(sept_setup_path, 'real_data_offline.oemof', True))
    hal_sept = hal_load_import_kwh(sept_setup_path / 'real_data_online', 'real_data_online')

    total_hal_baseline = hal_baseline['wh_total'] / 1000
    total_oemof_baseline = oemof_baseline['wh_total'] / 1000
    total_hal_sept = hal_sept['wh_total'] / 1000
    total_oemof_sept = oemof_sept['wh_total'] / 1000

    mixed_costs = pd.DataFrame({
        'Pred HAL': total_hal_baseline,
        'Pred OEMOF': total_oemof_baseline,
        'Real HAL': total_hal_sept,
        'Real OEMOF': total_oemof_sept,
    }, index=total_oemof_baseline.index)

    fig, axes = plt.subplots(nrows=1)

    mixed_costs.sum().plot.bar(ax=axes, grid=True).set_ylabel('KWh')
    fig.subplots_adjust(left=0.12, right=0.97, top=0.92, bottom=0.19, hspace=0.53)
    plt.savefig("praktikumsbericht/images/sept_real_total.pdf")
    dataframe_to_stat_table(sept_setup_path / 'stats.csv', mixed_costs)

    hal_stor_load = load_hal_storage_df(sept_setup_path / 'real_data_online', 'real_data_online')[0]['power[W]']
    pred_oemof_results = get_oemof_results(sept_setup_path, 'baseline_pred.oemof', exclude_storage=True)
    real_oemof_results = get_oemof_results(sept_setup_path, 'real_data_offline.oemof', exclude_storage=True)
    oemof_stor_load = real_oemof_results['b1_data'][(('b1', 'sink_storage'), 'flow')] - real_oemof_results['b1_data'][(('source_storage', 'b1'), 'flow')]
    pv_pred = pred_oemof_results['b1_data'][(('source_pv', 'b1'), 'flow')]
    pv_real = real_oemof_results['b1_data'][(('source_pv', 'b1'), 'flow')]

    pl = DataFrame({
        'HAL controlled Storage Balance': hal_stor_load,
        'Schedule controlled Storage Balance': oemof_stor_load,
        'Pred PV Output': pv_pred,
        'Actual PV Output': pv_real,
    }, index=real_oemof_results['b1_data'].index)
    corr = pl.corr()
    # print(corr.to_latex())
    print(corr.to_latex(open("praktikumsbericht/images/sept_real_correlation.tex", 'w'),
                        label='t/res/real',
                        header=['HAL', 'Schedule', 'Pred PV', 'Actual PV'],
                        caption='Correlation between PV input and storage consumption',
                        float_format="%.2f"))
    fig, axes = plt.subplots(nrows=1)
    pl["2016-09-01"].plot(ax=axes)
    axes.set_ylabel('W')
    plt.savefig("praktikumsbericht/images/sept_real_example.pdf")

    # Percentiles
    fig, axes = plt.subplots(nrows=1)
    boxplot = mixed_costs.boxplot(grid=True, showfliers=False, ax=axes)
    boxplot.set_ylabel('KWh')
    fig.subplots_adjust(left=0.12, right=0.96, top=0.93, bottom=0.07, hspace=0.20)
    plt.savefig("praktikumsbericht/images/sept_real_peaks.pdf")

    dataframe_to_stat_table(sept_setup_path / 'stats.csv', mixed_costs)
Example #40
0
    def test_corr_int(self):
        # dtypes other than float64 GH#1761
        df3 = DataFrame({"a": [1, 2, 3, 4], "b": [1, 2, 3, 4]})

        df3.cov()
        df3.corr()
Example #41
0
# prepare data
test_set = pd.read_csv('raw/TestSet.csv')
test_subset = pd.read_csv('raw/TestSubset.csv')
train_set = pd.read_csv('raw/TrainingSet.csv')
train_subset = pd.read_csv('raw/TrainingSubset.csv')

train = train_set.drop(['EbayID','QuantitySold','SellerName'], axis=1)
train_target = train_set['QuantitySold']
_, n_features = train.shape

df = DataFrame(np.hstack((train, train_target.values[:, None])),
               columns=list(range(n_features)) + ["isSold"])
_ = sns.pairplot(df[:50], vars=[2, 3, 4, 10, 13], hue="isSold", height=1.5)
plt.figure(figsize=(10,10))

# compute the correlation matrix
corr = df.corr()

# generate a mask for the upper triangle
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)

sns.heatmap(corr, mask=mask, cmap=cmap, vmax = .3,
                square=True, xticklabels=5, yticklabels=2,
                linewidths=.5, cbar_kws={"shrink":.5})

plt.yticks(rotation=0)

plt.show()
Example #42
0
# -*- coding: utf-8 -*-
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
from pandas.plotting import scatter_matrix

df = DataFrame(np.random.randn(1000, 4), columns=['a', 'b', 'c', 'd'])
corr_mat = df.corr()
print(corr_mat)

scatter_matrix(df, alpha=0.2, figsize=(16, 16), diagonal='kde')

plt.show()
#plt.savefig('features.png')
Example #43
0
def get_mode(arr):
    mode = []
    arr_appear = dict((a, arr.count(a)) for a in arr)  # count occurrences of each element
    if max(arr_appear.values()) == 1:  # if every element appears only once
        return  # there is no mode
    else:
        for k, v in arr_appear.items():  # otherwise, the values with the highest count are the mode
            if v == max(arr_appear.values()):
                mode.append(k)
    return mode

get_mode(a)

var(a)
std(a)

a=Series(a)
a.skew()
a.kurt()
a.describe()

df = DataFrame({'data1' : np.random.randn(5),
                'data2' : np.random.randn(5)})
df.cov()
df.corr()

### hypothesis testing
from scipy import stats as ss
df=DataFrame({'data':[10.1,10,9.8,10.5,9.7,10.1,9.9,10.2,10.3,9.9]})
ss.ttest_1samp(a = df, popmean = 10)
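ttest_1samp applied to the whole DataFrame returns per-column arrays; selecting the single column and unpacking the result makes it easier to read. A small follow-up sketch, not part of the original:

result = ss.ttest_1samp(df['data'], popmean=10)
print('t = %.3f, p = %.3f' % (result.statistic, result.pvalue))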
Example #44
0
"""
    data (rows are observations of x1, x2, x3, the last row is appended below):
        [
            1   2   2
            2   3   1
            3   3.5 0.5
        ]

    calculate var, cov, corr, cov-matrix, corr-matrix
"""

data = {'x1': [1, 2], 'x2': [2, 3], 'x3': [2, 1]}
A = DataFrame(data)
print(A)

data1 = Series([3, 3.5, 0.5], index=['x1', 'x2', 'x3'])
A.loc[len(A)] = data1  # DataFrame.append was removed in pandas 2.0; assign the new row by label instead
print('\n', A, '\n')

var_x1 = A['x1'].var()
var_x2 = A['x2'].var()
var_x3 = A['x3'].var()

print('var_x1: %f' % var_x1)
print('var_x2: %f' % var_x2)
print('var_x3: %f' % var_x3)

cov_matrix = A.cov()
print('\ncov_matrix:\n', cov_matrix)

corr_matrix = A.corr()
print('\ncorr_matrix:\n', corr_matrix)

print("\nA['x1'].corr(A['x2']): %f" % A['x1'].corr(A['x2']))
print("A['x1'].corr(A['x3']): %f" % A['x1'].corr(A['x3']))