def data_correaltion(
        data_path="/Users/vaishnaviv/PycharmProjects/Assignment3_NeuralNetworks/BSOM_DataSet_for_HW3.csv"):
    """Print and plot correlations between the summary score columns.

    Reads the CSV at ``data_path``, keeps seven summary columns, replaces
    ``LEVEL`` with its integer category codes, prints the Spearman
    correlation of every column with ``LEVEL`` and shows two corrplot
    figures (Spearman first, then Pearson).

    :param data_path: location of the CSV file; defaults to the path that
        was previously hard-coded in the function body.
    """
    selected_columns = [
        'all_mcqs_avg_n20', 'all_NBME_avg_n4', 'CBSE_01', 'CBSE_02', 'SA_NBME',
        'STEP_1', 'LEVEL'
    ]
    raw = pd.read_csv(data_path)
    # Explicit copy: LEVEL is rewritten below, and doing that on a slice of
    # ``raw`` would trigger pandas' SettingWithCopy warning.
    scores = raw[selected_columns].copy()
    print(raw.columns.tolist())

    # Encode the categorical LEVEL column as integers so it can take part
    # in the correlation computation.
    scores['LEVEL'] = scores['LEVEL'].astype("category").cat.codes
    print(scores)

    # Spearman correlation
    spearman = scores.corr(method="spearman")
    print(spearman['LEVEL'].sort_values())
    c = corrplot.Corrplot(spearman)
    c.plot(colorbar=False, method="square", shrink=.9, rotation=45)
    plt.show()

    # Pearson correlation (the pandas default)
    pearson = scores.corr()
    c = corrplot.Corrplot(pearson)
    c.plot(colorbar=False, method="square", shrink=.9, rotation=45)
    plt.show()
コード例 #2
0
def myinstance():
    """Build a ``Corrplot`` from ten random columns labelled A..J.

    :return: the ``Corrplot`` constructed from the raw (non-correlation)
        data frame.
    """
    # ``string.ascii_uppercase`` exists on both Python 2 and Python 3, so the
    # former try / bare-except fallback (which could hide unrelated errors)
    # is not needed.
    letters = string.ascii_uppercase[0:10]
    df = pd.DataFrame({k: np.random.random(10) + ord(k) - 65 for k in letters})
    # The first instance is discarded; presumably it is kept so that the
    # "already a correlation matrix" input is exercised as well.
    klass = corrplot.Corrplot(df.corr())
    klass = corrplot.Corrplot(df)
    return klass
コード例 #3
0
def test_correlation(myinstance):
    """Raw data and its precomputed correlation must yield the same ``df``."""
    rows = [[1, 2, 3, 4], [4, 5, 1, 2]]

    # Raw input: the correlation is computed inside Corrplot.
    from_raw = corrplot.Corrplot(pd.DataFrame(rows))

    # Precomputed input: the correlation matrix is passed in directly.
    from_matrix = corrplot.Corrplot(pd.DataFrame(rows).corr())

    identical = (from_raw.df == from_matrix.df).all().all()
    assert identical
コード例 #4
0
def heatmap_plot(df, x, path):
    """Draw a Kendall-correlation plot of ``df`` and store it at ``path``.

    ``df`` is modified in place: the columns named in ``x`` are deleted and
    every remaining ``object`` column is replaced by its label encoding.

    :param df: the dataframe
    :param x: list of columns to be dropped before computing correlations
    :param path: the path to store the heatmap plot; nothing is written
        when it is empty or ``None``
    """
    # Third-party imports stay local to the function (requires
    # ``pip install biokit``); they are done first so that a missing package
    # fails before ``df`` has been touched.
    import matplotlib.pyplot as plt
    from biokit.viz import corrplot
    from sklearn import preprocessing

    # Drop irrelevant columns.
    for column in x:
        del df[column]

    # Correlation needs numbers: label-encode the string-valued columns.
    le = preprocessing.LabelEncoder()
    for column in df.columns:
        if df[column].dtypes == 'object':
            df[column] = le.fit_transform(df[column])

    # method is one of {'pearson', 'kendall', 'spearman'}
    cor = df.corr(method='kendall')
    c = corrplot.Corrplot(cor)
    c.plot(colorbar=True, method='square', shrink=.99, rotation=90)

    # ``path`` used to be accepted but ignored, so the plot was never stored.
    if path:
        plt.savefig(path)
コード例 #5
0
ファイル: plotters.py プロジェクト: Cloud-PG/smart-cache
def metric_corr(results: list):
    """Show a correlation plot for the metric values extracted from ``results``.

    The frame returned by ``get_all_metric_values(results)`` is handed to
    ``Corrplot`` as is and drawn with square glyphs and a colorbar.
    """
    # ref: https://nbviewer.jupyter.org/github/biokit/biokit/blob/master/notebooks/viz/corrplot.ipynb
    plot_options = dict(method="square", colorbar=True, shrink=0.9, rotation=45)
    metric_frame = get_all_metric_values(results)
    corrplot.Corrplot(metric_frame).plot(**plot_options)
    plt.show()
コード例 #6
0
def generate_correlation(data):
    """Print and plot the pairwise correlations between ten measures.

    For every ordered pair of measures, ``get_correlations`` is called on
    the two corresponding entries of ``data``; the resulting square matrix
    is printed row by row and drawn with ``Corrplot``.

    :param data: object indexable by the measure names listed below
        (presumably a dict or DataFrame of per-node values; confirm against
        the caller).
    """
    measures = [
        'dg', 'stg', 'sp', 'sp_w', 'pr', 'pr_w', 'accs', 'gaccs', 'sym', 'at'
    ]

    # Single-argument print calls produce the same output on Python 2 and 3
    # (the former print statements were a syntax error on Python 3).
    for index, measure in enumerate(measures):
        print("%d %s" % (index, measure))

    matrix_correlations = [
        [get_correlations(data[measure], data[other]) for other in measures]
        for measure in measures
    ]

    for row in matrix_correlations:
        print(row)

    # Rows and columns carry matching positional labels (0..9), the same
    # frame the plot was built from before.
    df = pd.DataFrame(matrix_correlations)

    c = corrplot.Corrplot(df)
    c.plot(lower='ellipse', cmap='hsv')  # alternatives: gist_rainbow, jet

    plt.show()
コード例 #7
0
def plot_correlations(blups, traits, outprefix):
    """Save a correlation plot of the selected traits as a PNG.

    :param blups: table indexable by a list of trait names
    :param traits: names of the columns to include
    :param outprefix: output prefix; the image is written to
        ``<outprefix>.corrplot.png``
    """
    # Restrict the table to the requested traits and fix the output name.
    selected = blups[traits]
    target = outprefix + ".corrplot.png"

    # The canvas grows with the number of traits (slightly wider than tall).
    n_traits = len(traits)
    fig = plt.figure(figsize=(n_traits * 1.25, n_traits))
    ax = fig.add_subplot(111)

    # Ellipses below the diagonal, numbers above it.
    corrplot.Corrplot(selected).plot(ax=ax, lower='ellipse', upper='number')
    fig.savefig(target, dpi=100)
コード例 #8
0
ファイル: hinton.py プロジェクト: yuelianghaoyuana/biokit
def hinton(df,
           fig=1,
           shrink=2,
           method='square',
           bgcolor='grey',
           cmap='gray_r',
           binarise_color=True):
    """Draw a Hinton diagram, a simplified form of the correlation plot.

    :param df: input data, either a dataframe or a list of items (list,
        array). See :class:`~biokit.viz.corrplot.Corrplot` for details.
    :param fig: figure in which the data is plotted
    :param shrink: factor used to enlarge or reduce the symbols
    :param method: symbol type drawn at each coordinate (``'square'`` by
        default). See :class:`~biokit.viz.corrplot.Corrplot` for more
        details.
    :param bgcolor: colour used for both the background and the labels
        (grey by default)
    :param cmap: colour map; a gray one is used by default
    :param binarise_color: restrict the plot to two colours, one for
        positive values and one for negative values.

    .. plot::
        :include-source:
        :width: 80%

        from biokit.viz import hinton
        df = np.random.rand(20, 20) - 0.5
        hinton(df)


    .. note:: The idea comes from a matplotlib recipe,
        http://matplotlib.org/examples/specialty_plots/hinton_demo.html,
        but everything is delegated to :class:`~biokit.viz.corrplot.Corrplot`.

    .. seealso:: :class:`biokit.viz.corrplot.Corrplot`

    .. note:: Values must be between -1 and 1. No sanity check performed.
    """
    from biokit.viz import corrplot

    # bgcolor feeds both the face colour and the label colour.
    plot_options = dict(
        colorbar=False,
        cmap=cmap,
        fig=fig,
        method=method,
        facecolor=bgcolor,
        shrink=shrink,
        label_color=bgcolor,
        binarise_color=binarise_color,
    )
    corrplot.Corrplot(df).plot(**plot_options)
コード例 #9
0
def correlation_matrix(df, features=FEATURES, output_file=None):
    """Plot the Spearman correlation matrix of ``features``.

    :param df: frame containing at least the columns in ``features``
    :param features: column names to correlate; also the source of the
        tick labels
    :param output_file: when truthy the figure is saved there, otherwise
        it is shown
    """
    spearman = df[features].corr(method="spearman")
    corr = corrplot.Corrplot(spearman)
    corr.plot(grid=False, method='text', colorbar=True,
              lower='ellipse', upper='text')

    figure, ax = plt.gcf(), plt.gca()

    # Human-readable tick labels: strip the "sonar_" prefix and title-case.
    labels = []
    for raw_name in features:
        cleaned = raw_name.replace("sonar_", "").replace("_", " ")
        labels.append(cleaned.title())
    ax.set_xticklabels(labels)
    ax.set_yticklabels(labels)

    # Hide the frame around the plot.
    for side in ('left', 'top', 'right', 'bottom'):
        plt.gca().spines[side].set_visible(False)

    if not output_file:
        figure.show()
        return
    figure.tight_layout()
    figure.savefig(output_file)
コード例 #10
0
"""

Corrplot example
==================

"""
# Imports used by this example
import string

import numpy as np
import pandas as pd

from biokit.viz import corrplot

# Random data: 15 columns labelled A..O, each shifted by its letter's rank,
# then reduced to their correlation matrix.
letters = string.ascii_uppercase[0:15]
df = pd.DataFrame({k: np.random.random(10) + ord(k) - 65 for k in letters})
df = df.corr()

# Corrplot accepts the correlation matrix directly; when the input is not a
# square matrix or its index does not match its column names, the
# correlation is computed on the fly instead.
c = corrplot.Corrplot(df)

c.plot(colorbar=False, method='square', shrink=.9, rotation=45)
コード例 #11
0
ファイル: test_corrplot.py プロジェクト: josepablog/biokit
 def setup_class(klass):
     """Attach a ``Corrplot`` built from ten random columns (A..J) as ``klass.s``."""
     labels = string.ascii_uppercase[:10]
     frame = pd.DataFrame(
         {name: np.random.random(10) + ord(name) - 65 for name in labels})
     # Built twice: first from the correlation matrix, then from the raw
     # frame; only the second instance is kept on ``klass.s``.
     klass.s = corrplot.Corrplot(frame.corr())
     klass.s = corrplot.Corrplot(frame)
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from biokit.viz import corrplot

# Load the data and compute the pairwise correlation of its columns.
dataframe = pd.read_csv("preprocessed_data.csv")
co_relation = dataframe.corr()

# Correlation plot (pie glyphs), saved from the current figure.
cp = corrplot.Corrplot(co_relation)
cp.plot(method='pie', shrink=.9, grid=False)
plt.savefig('correlation.png')

# Pair plot of all attributes.
# pyplot is used through the file's own ``plt`` import: current seaborn
# releases no longer expose it as ``sns.plt``, so the former
# ``sns.plt.savefig`` / ``sns.plt.clf`` calls raise AttributeError.
sns.pairplot(dataframe)
plt.savefig('data_distribution.png')
plt.clf()

# Heatmap of the correlation matrix, drawn on the freshly cleared figure.
sns.heatmap(co_relation, linewidths=.5, cmap="YlGnBu")
plt.savefig('correlation_heatmap.png')
コード例 #13
0
ファイル: rfecvNanoExample.py プロジェクト: wanesta/rfecvNano
                  'PartZeta', # included in dimensionless ratio of zetas
                  'PartIEP', # included in dimensionless ratio of pH to IEP
                  'PartDiam', # included in dimensionless aspect ratio
                  'CollecDiam', # included in dimensionless aspect ratio
                  'CollecZeta',# included in dimensionless aspect ratio
                  'IonStr', # included in Debye Length
                  'SaltType',# included in Debye Length
                  'pH'# included in dimensionless ratio of pH to IEP
                 ],1)

# print list(data) # print out the remaining data field headers

# Make sure to install biokit dependencies with requirements.txt
# https://pypi.python.org/pypi/biokit/0.0.5

# Correlation plot of the remaining features.
c = corrplot.Corrplot(data)
c.plot(upper='circle', fontsize=10)

# Assign the remaining data to the training data set.
trainingData = data


# Store the training data and target data as matrices for import into ML.
# ``.values`` replaces ``as_matrix()``, which was removed from pandas; it
# returns the same plain array (all numbers, no headers).
trainingDataMatrix = trainingData.values
targetDataRPShapeMatrix = targetDataRPShape
targetDataRFMatrix = targetDataRF.values

# Get a list of the trainingData features remaining. This is used later for plotting etc.
trainingDataNames = list(trainingData)
# print trainingDataNames
コード例 #14
0
# pandas is needed for the numeric coercion below; imported here because the
# visible part of this script does not show its import section.
import pandas as pd

# Injury column -> commute-mode column used as the denominator of its rate.
injuries = {
    'NUMBER_KILLED': 'Total',
    'COUNT_PED_KILLED': 'Walked',
    'COUNT_PED_INJURED': 'Walked',
    'COUNT_BICYCLIST_KILLED': 'Bicycle',
    'COUNT_BICYCLIST_INJURED': 'Bicycle',
    'COUNT_MC_KILLED': 'Taxicab, motorcycle, or other means',
    'COUNT_MC_INJURED': 'Taxicab, motorcycle, or other means',
}


def _county_rows(county):
    """Return the rows of ``co`` with CNTY_CITY_LOC in [county*100, county*100 + 100)."""
    low = int(county) * 100
    return co[(low <= co['CNTY_CITY_LOC']) & (co['CNTY_CITY_LOC'] < low + 100)]


def _coerce_numeric(column):
    """Convert an object column to numbers, leaving it alone if nothing converts.

    Mirrors the removed ``DataFrame.convert_objects(convert_numeric=True)``:
    only object columns are touched, unparseable entries become NaN, and a
    column that would turn entirely into NaN is returned unchanged.
    """
    if column.dtype != object:
        return column
    converted = pd.to_numeric(column, errors='coerce')
    return column if converted.isnull().all() else converted


# Per-county totals of every injury column.
for injury in injuries:
    df[injury] = df.index.map(lambda county: _county_rows(county)[injury].sum())

# Injuries per 100 units of the matching commute mode.
for injury, mode in injuries.items():
    df[injury + ' Rate'] = df[injury].apply(int) / df[mode].apply(int) * 100

# Notebook-cell residue: the sorted view is only displayed in an interactive
# session and has no effect when run as a script.
df[['NAME','Bicycle','Bicycle Rate','COUNT_BICYCLIST_KILLED','COUNT_BICYCLIST_KILLED Rate']].sort_values(['Bicycle Rate'], ascending=False)

dft = df.apply(_coerce_numeric)
# cor = list(set(injuries.values())) + list(injuries)
cor = list(modes) + list(injuries)
b = [a + ' Rate' for a in cor]
c = corrplot.Corrplot(dft[b])
matplotlib.rcParams.update({'font.size': 8})
c.plot()
pyplot.savefig('/Users/david/Desktop/fig.svg')
# pyplot.show()

# df.to_csv('modes.csv')

pyplot.scatter(df['Bicycle Rate'],df['COUNT_BICYCLIST_KILLED Rate'])
pyplot.show()

# Per-county report. The sum used to be computed and silently discarded;
# it is now printed next to the injury name.
for county in df.index:
    print(df['NAME'][county])
    for injury in injuries:
        print(injury)
        print(_county_rows(county)[injury].sum())
raw_correlations = np.load('correlation-matrix-nohbond-poster-new.npy')
# Round to two decimals for display.
correlation_matrix = np.round(raw_correlations, 2)
# Entries that round to zero can come out as -0.0; force them to plain 0.0.
# (The original loop used ``==`` here, a comparison with no effect, so the
# intended assignment never happened.)
correlation_matrix[np.abs(raw_correlations) < 0.005] = 0.0

fig, ax = plt.subplots(figsize=(16, 12))
# Alternative rendering with seaborn, kept for reference (disabled):
# ax = sns.heatmap(correlation_matrix, xticklabels=variables, yticklabels=variables,
#                  annot=True, cmap='coolwarm', vmin=-1.0, vmax=1.0, center=0.0,
#                  square=True, annot_kws={'size': 16, 'weight': 'semibold'},
#                  cbar_kws={'ticks': [-1, -0.5, 0, 0.5, 1], 'shrink': 0.68,
#                  'label': 'Pearson correlation coefficient'})
c = corrplot.Corrplot(correlation_matrix)
c.order(inplace=True)
c.plot(fig=fig,
       grid=True,
       rotation=30,
       upper='circle',
       lower=None,
       shrink=0.9,
       facecolor='white',
       colorbar=True,
       label_color='black',
       fontsize='large',
       edgecolor='black',
       method='circle',
       cmap='coolwarm',
       ax=ax)
コード例 #16
0
def plot_corr(df_corr, method='square'):
    """Draw ``df_corr`` as a corrplot using the given glyph ``method``."""
    plot_options = {'method': method, 'shrink': .9, 'rotation': 45}
    corrplot.Corrplot(df_corr).plot(**plot_options)