# NOTE(review): Python 2 print statements. Neither `corr_hr_df` nor
# `end_of_question` is defined in this chunk -- presumably the correlation
# matrix and a section marker computed earlier in the file; confirm upstream.
print corr_hr_df  # and then print out the correlation matrix
print end_of_question

# you'll note in this case that the correlation matrix is not as useful. We
# are concerned with how the variables interact with one another, not their
# correlation in isolation; to compensate, we'll use a logistic regression
formula = "left ~ S + LPE + NP + ANH + TIC + C(Newborn)"
print "Linear Regression Formula: {}".format(formula)
logitreg_model = smf.glm(formula=formula,
    data=hr_df, family=sm.families.Binomial()).fit()
print logitreg_model.summary()  # print the model summary

# a fitted logistic regression yields a probability per observation; a
# histogram of those fitted values shows how the probabilities distribute
fitted_probs = logitreg_model.fittedvalues
plt.hist(fitted_probs)
plt.xlabel('Probability of Employee Having Left The Company')
plt.ylabel('Number of Employees Projected')
plt.show()

# we can use this histogram to help set a cutoff value above which an employee
# is assumed likely to leave the company; let's set our cutoff value as 0.5
cutoff = 0.5
# and compute the percentage of correctly classified employees who stayed,
correct_stayed = sum((logitreg_model.fittedvalues <= cutoff) &  \
    (hr_df.left == 0)) / float(sum(hr_df.left == 0))
print "Percentage of correctly classed employees who stayed: {}".format(
    correct_stayed)
# the percentage of correctly classified employees who left,
correct_left = sum((logitreg_model.fittedvalues > cutoff) & \
    (hr_df.left == 1)) / float(sum(hr_df.left == 1))
print "Percentage of correctly classed employees who left: {}".format(
# standardize the SKU features before clustering so distances are not
# dominated by raw magnitudes; `scale` is presumably the project's own
# scaling helper defined elsewhere in this file -- TODO confirm
sku_df_scaled = scale(sku_df)  # let's scale sku_df using our scale function

# the AgglomerativeClustering class from sklearn.cluster can perform
# hierarchical clustering using the ward method

n_clusters = 2  # let's set the number of clusters we want to show as 2
# and perform the clustering of the scaled dataset using the fit method
ward = AgglomerativeClustering(n_clusters=n_clusters, affinity='euclidean',
    linkage='ward').fit(sku_df_scaled)
colors = ward.labels_  # let's set our plot colors to be the cluster labels

# then let's plot the data with CV on the x_axis and ADS on the y_axis
plt.scatter(sku_df['CV'], sku_df['ADS'], s=100, c=colors)
# BUG FIX: the axis labels were swapped -- x carries CV (Coefficient of
# Variance) and y carries ADS (Average Daily Sales), but the labels said
# the opposite
plt.xlabel('Coefficient of Variance')
plt.ylabel('Average Daily Sales')
# and then show the plot
plt.show()


# third, we are asked which graph reports the correct plot of the last project
# evaluation as a function of the number of projects done for the HR dataset

# in order to do this, let's load the data into memory
hr_df = pd.read_csv(SBA_FILE_LOADER('clustering/DATA_2.02_HR.csv'))

# then let's plot the data with CV on the x_axis and ADS on the y_axis
plt.scatter(hr_df['LPE'], hr_df['NP'], s=100)
# add some axes for clarity
plt.xlabel('Last Project Evaluation')