print('')

# Encode the categorical columns as integers.
# Work on a full (deep) copy so the original frame is left untouched.
df_encoded = df.copy()
# One scikit-learn LabelEncoder instance is refitted for each column in turn.
le = preprocessing.LabelEncoder()
for col in categoricalCol:
    df_encoded[col] = le.fit_transform(df[col])
    # Report which integer code each original label was assigned.
    le_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
    print('Feature: ', col)
    print(le_mapping)

# Columns removed from the encoded frame before the summary and plot below.
drop_col = [
    'DailyRate',
    'EmployeeCount',
    'EmployeeNumber',
    'MonthlyRate',
    'Over18',
]
df_encoded = df_encoded.drop(columns=drop_col)
print(df_encoded.head())

# The display limit is changed only after head() has been printed.
pd.set_option('display.max_columns', 10)
print(df_encoded.describe().T)
print('')

# Number of missing values per column.
print(df_encoded.isna().sum())

# Correlation matrix of the encoded frame, drawn on an 8x8 inch figure.
df_corr = df_encoded.corr()
plt.figure(figsize=(8, 8))
corrplot(df_corr, size_scale=300)
Ejemplo n.º 2
0
     "cwl2(lr:0.015)":'{} ({})'.format(r'$CW\_l_2$', r'$lr: 0.015$'),
     "dfl2(os:3/255)":'{} ({})'.format(r'$DF\_l_2$', r'$overshoot: 3$'),
     "dfl2(os:8/255)":'{} ({})'.format(r'$DF\_l_2$', r'$overshoot: 8$'),
     "dfl2(os:20/255)":'{} ({})'.format(r'$DF\_l_2$', r'$overshoot: 20$'),
     "fgsm(eps:0.1)":'{} ({})'.format(r'$FGSM$', r'$\epsilon: 0.1$'),
     "fgsm(eps:0.2)":'{} ({})'.format(r'$FGSM$', r'$\epsilon: 0.2$'),
     "fgsm(eps:0.3)":'{} ({})'.format(r'$FGSM$', r'$\epsilon: 0.3$'),
     "jsma(theta:0.15)":'{} ({})'.format(r'$JSMA$', r'$\theta: 0.15$'),
     "jsma(theta:0.18)":'{} ({})'.format(r'$JSMA$', r'$\theta: 0.18$'),
     "jsma(theta:0.21)":'{} ({})'.format(r'$JSMA$', r'$\theta: 0.21$'),
     "mim(eps:0.05)":'{} ({})'.format(r'$MIM$', r'$\epsilon: 0.05$'),
     "mim(eps:0.075)":'{} ({})'.format(r'$MIM$', r'$\epsilon: 0.075$'),
     "mim(eps:0.1)":'{} ({})'.format(r'$MIM$', r'$\epsilon: 0.1$'),
     "onepixel(pxCnt:5)":'{} ({})'.format(r'$OP$', r'$px~count: 5$'),
     "onepixel(pxCnt:15)":'{} ({})'.format(r'$OP$', r'$px~count: 15$'),
     "onepixel(pxCnt:30)":'{} ({})'.format(r'$OP$', r'$px~count: 30$'),
     "pgd(eps:0.075)":'{} ({})'.format(r'$PGD$', r'$\epsilon: 0.075$'),
     "pgd(eps:0.09)":'{} ({})'.format(r'$PGD$', r'$\epsilon: 0.09$'),
     "pgd(eps:0.1)":'{} ({})'.format(r'$PGD$', r'$\epsilon: 0.1$'),
     "BS":'Benign Samples',
}

# Replace the raw attack identifiers with their display titles on both axes.
rank_corr = rank_corr.rename(
    index=titles_for_attacks,
    columns=titles_for_attacks,
)

# Draw the matrix on a 50x50 inch figure; the output path is passed to
# corrplot as its second positional argument.
filepath = os.path.join(resultDir, "rank_correlation_plot.pdf")
plt.figure(figsize=(50, 50))
corrplot(
    rank_corr,
    filepath,
    size_scale=36,
    palette=sns.diverging_palette(5, 250, n=256),
)


Ejemplo n.º 3
0
print(DF.tail(20))

# Remove the two element columns, then order the rows by class.
DF = DF.drop(['Element1', 'Element2'], axis=1).sort_values(by='Class')
#DF["name"] = DF["compound"] + '_group' + DF["Class"].astype(str)
#DF = DF[['name', 'Pauling EN', 'Sum of Valence e-', 'Mean atomic number', 'Mean atomic radius', 'Atomic radius ratio', 'Group Number difference', 'Quantum number difference']]

#print(DF.tail(20))
#DF.to_csv('np.txt', sep=' ', index=False, header=True)


# Correlation heatmap over the columns at positions 3 through 57.
corr = DF.iloc[:, list(range(3, 58))].corr()
plt.figure(figsize=(30, 30))
heatmap.corrplot(corr)
# Write the image to disk before show().
plt.savefig('pairplot.png', dpi=400)
plt.show()

'''
#Define features and target
X = DF.iloc[:, list(range(5, 55))]
y = DF.iloc[:, 3]

#30/70 Test/Train stratified split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=20, stratify=y)

#Standardize Features
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
Ejemplo n.º 4
0
def corr_heatmap(data, figsize=(10, 10)):
    """Draw a corrplot of the pairwise correlations of ``data``.

    A new matplotlib figure of size ``figsize`` is opened first, then
    ``data.corr()`` is passed to ``corrplot``. Nothing is returned.

    Args:
        data: Object exposing a ``corr()`` method (presumably a pandas
            DataFrame -- confirm against callers).
        figsize: ``(width, height)`` forwarded to ``plt.figure``.
    """
    # medium/Better_Heatmaps_and_Correlation_Matrix_Plots_in_Python
    plt.figure(figsize=figsize)
    correlation_matrix = data.corr()
    corrplot(correlation_matrix)
Ejemplo n.º 5
0
# Total hours of data collection

# In[14]:


# Largest "Time" value divided by 3600, rounded to one decimal place
# (presumably "Time" is in seconds -- confirm against the dataset).
# Series.max() is vectorised and skips NaN, whereas the builtin max() walks
# the column element by element and gives order-dependent results with NaN.
round(df["Time"].max() / (60 * 60), 1)


# Creating the correlation plot

# In[15]:


# Correlation plot of every column of df on a 14x14 inch figure, with the
# grid switched off on the current axes afterwards.
plt.figure(figsize=(14, 14))
corrplot(df.corr())
plt.grid(False)


# Distribution of payment amounts, two panels side by side:
#   left  -- rows with Class == 0 (the non-fraudulent payments)
#   right -- rows with Amount <= 250, regardless of class

# In[16]:


fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(14, 4))
sns.distplot(
    df.loc[df["Class"] == 0, "Amount"],
    bins=100,
    kde=False,
    hist_kws={"color": "#3f8094", "linewidth": 0.4, "alpha": 1},
    ax=ax[0],
)
sns.distplot(
    df.loc[df["Amount"] <= 250, "Amount"],
    bins=100,
    kde=False,
    hist_kws={"color": "#3f8094", "linewidth": 0.4, "alpha": 1},
    ax=ax[1],
)
Ejemplo n.º 6
0
ds.describe()

# Correlation matrix ordered by each variable's correlation with salary,
# strongest positive first; the salary column is then selected.
correlation = ds.corr().sort_values(by='2018_2019_Salary', ascending=False)
correlation['2018_2019_Salary']


"""Variable Correlation Plot"""

# Full Pearson correlation matrix (pandas defaults) and its element-wise
# square.
correlation = ds.corr()
rs_val = correlation.pow(2)

# Heatmap of the Pearson correlation values
plt.figure(figsize=(12, 12))
corrplot(correlation, size_scale=300)
plt.title("Heatmap 1 – Pearson Correlation", x=-8, y=1)
plt.show()



"""Top 8 variables"""

# The nine largest squared correlations with salary are requested --
# presumably salary itself ranks first (r**2 == 1), leaving the top eight
# other variables; confirm.
num_vals = 9
larg = rs_val.nlargest(num_vals, '2018_2019_Salary')['2018_2019_Salary']
c = larg.index
csquared_val = ds[c].corr().pow(2)

# Figure and axes for the heatmap of the top features correlated with salary
f, ax = plt.subplots(figsize=(12, 12))
Ejemplo n.º 7
0
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from heatmap import heatmap, corrplot

data = pd.read_csv('met-data.csv')

# Unpivot the correlation matrix into one row per (x, y) pair so that the
# three columns can be fed to heatmap() as parallel arrays.
corr = data.corr().reset_index().melt(id_vars='index')
corr.columns = ['x', 'y', 'value']
# Marker size follows the magnitude of the correlation, ignoring its sign.
heatmap(x=corr['x'], y=corr['y'], size=corr['value'].abs())

# Same matrix again through the corrplot helper, on its own 10x10 figure.
plt.figure(figsize=(10, 10))
corrplot(data.corr())

plt.show()
Ejemplo n.º 8
0
dataset = pd.read_csv('cov19_clean.csv')

#Visualize the data
dataset.info()
dataset.describe()
dataset.head()

#Correlation of every column except the last with the infection rate,
#drawn as a line plot with one tick per column
dataset.iloc[:,:-1].corrwith(dataset.InfectRate).plot.line()
plt.xticks(ticks = range(len(dataset.columns)-1), labels = ['population_density', 'median_age', 'aged_65_older', 'aged_70_older',
       'gdp_per_capita', 'cvd_death_rate', 'diabetes_prevalence',
       'female_smokers', 'male_smokers', 'handwashing_facilities',
       'hospital_beds_per_100k'],rotation = 45)
plt.title('Variable correlating to the Infection Rate ')

#Correlation Matrix
#Open a fresh figure first: without it corrplot would draw onto the figure
#that already holds the line plot above.
plt.figure(figsize=(10, 10))
corrplot(dataset.corr(), size_scale=500, marker='s')

#Data Preprocessing
#Features are every column except the first and the last; the target is the
#last column.
column_names_x = pd.DataFrame(dataset.iloc[:, 1:-1].columns.values)
x = pd.DataFrame(dataset.iloc[:, 1:-1].values)
y = dataset.iloc[:, -1].values
#column_names_x.values has shape (n_features, 1); column labels must be 1-D,
#so flatten it before assigning.
x.columns = column_names_x.values.ravel()
#Feature scaling is not necessary for this Linear Regression 

#Split into training and testing sets (80/20, fixed seed for repeatability)
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size = .2, random_state = 0)

#Apply Linear Regression
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
Ejemplo n.º 9
0
# Append the argmax position of new_helix[1], new_helix[2], ... to maxsort.
# NOTE(review): maxsort is initialised above this excerpt -- presumably from
# new_helix[0], which this loop skips; confirm.
for i in range(1, len(new_helix)):
    maxsort = np.append(maxsort, np.argmax(new_helix[i]))

# Pairwise histogram-intersection matrix over hair_d.array[1:]. Entry 0 of
# hair_d.array is skipped, so matrix index k corresponds to array entry k + 1.
mat = np.zeros((len(hair_d.array) - 1, len(hair_d.array) - 1))
for i in range(1, len(hair_d.array)):
    for j in range(1, i):
        # Compute each pair once and mirror it across the diagonal.
        mat[i - 1, j - 1] = histogram_intersection(hair_d.array[i],
                                                   hair_d.array[j])
        mat[j - 1, i - 1] = mat[i - 1, j - 1]
# Set the whole diagonal to 1 in one step instead of rewriting it inside the
# inner loop and patching mat[0, 0] afterwards (which also indexed an empty
# matrix when only one entry was present).
np.fill_diagonal(mat, 1)
plt.imshow(mat)
plt.show()

# Same matrix rendered through corrplot.
df = pd.DataFrame(mat)
corrplot(df)
plt.show()

# Pairwise histogram-intersection matrix between the entries of new_helix.T.
mat = np.zeros((len(new_helix.T), len(new_helix.T)))
for i in range(0, len(new_helix.T)):
    for j in range(0, i):
        # Compute each pair once and mirror it across the diagonal.
        mat[i, j] = histogram_intersection(new_helix.T[i], new_helix.T[j])
        mat[j, i] = mat[i, j]
# Set the whole diagonal to 1 in one step instead of rewriting it inside the
# inner loop and patching mat[0, 0] afterwards (which also indexed an empty
# matrix when new_helix.T had no entries).
np.fill_diagonal(mat, 1)
plt.imshow(mat)
plt.show()

# Same matrix rendered through corrplot.
df = pd.DataFrame(mat)
corrplot(df)  #, segid=i+1)
plt.show()
         margin=dict(r=0, l=210, t=25, b=210),
         yaxis=dict(tickfont=dict(size=9)),
         xaxis=dict(tickfont=dict(size=9))))

# Wrap the single trace in a list, build the figure from it together with
# the layout defined above, and render it through py.iplot.
data = [trace]
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

# https://towardsdatascience.com/better-heatmaps-and-correlation-matrix-plots-in-python-41445d0f2bec
# Correlation matrix with heatmapz

from heatmap import heatmap, corrplot

data = pd.read_csv("telco_customer_churn.csv")
plt.figure(figsize=(8, 8))
# The frame holds string columns (e.g. "Contract"), and DataFrame.corr()
# raises on non-numeric data in pandas >= 2.0. Selecting numeric and boolean
# columns explicitly gives the same matrix that older pandas produced by
# silently dropping the rest, and works on every pandas version.
corrplot(data.select_dtypes(include=['number', 'bool']).corr(), size_scale=300)

# Contracts information (month-to-month contract, two year contract, one year contract)
# Tenure histogram for the customers on a month-to-month contract.
ax1 = sns.distplot(
    data.loc[data["Contract"] == "Month-to-month", "tenure"],
    hist=True,
    kde=False,
    bins=180 // 5,
    hist_kws={'edgecolor': 'black'},
    kde_kws={'linewidth': 4},
)
ax1.set(
    ylabel='Number of Customers',
    xlabel='Tenure (months)',
    title='Month-to-month Contract',
)

ax2 = sns.distplot(data[data["Contract"] == "One year"]["tenure"],
                   hist=True,
                   kde=False,