print end_of_question # you'll note in this case that the correlation matrix is not as useful. We # are concerned with how the variables interact with one another, not their # correlation in isolation; to compensate, we'll use a logistic regression formula = "left ~ S + LPE + NP + ANH + TIC + C(Newborn)" print "Linear Regression Formula: {}".format(formula) logitreg_model = smf.glm(formula=formula, data=hr_df, family=sm.families.Binomial()).fit() print logitreg_model.summary() # print the model summary # the output of a logistic regression model is a probability # and we can plot a histogram of these fitted probabilities plt.hist(logitreg_model.fittedvalues) plt.xlabel('Probability of Employee Having Left The Company') plt.ylabel('Number of Employees Projected') plt.show() # we can use this histogram to help set a cutoff value above which an employee # is assumed likely to leave the company; let's set our cutoff value as 0.5 cutoff = 0.5 # and compute the percentage of correctly classified employees who stayed, correct_stayed = sum((logitreg_model.fittedvalues <= cutoff) & \ (hr_df.left == 0)) / float(sum(hr_df.left == 0)) print "Percentage of correctly classed employees who stayed: {}".format( correct_stayed) # the percentage of correctly classified employees who left, correct_left = sum((logitreg_model.fittedvalues > cutoff) & \ (hr_df.left == 1)) / float(sum(hr_df.left == 1)) print "Percentage of correctly classed employees who left: {}".format( correct_left)
sku_df_scaled = scale(sku_df)  # scale sku_df using our scale function

# the AgglomerativeClustering class from sklearn.cluster can perform
# hierarchical clustering using the ward method
n_clusters = 2  # number of clusters we want to show
# perform the clustering of the scaled dataset using the fit method
# NOTE(review): the `affinity` kwarg was renamed `metric` in newer
# scikit-learn releases — confirm the installed version still accepts it
ward = AgglomerativeClustering(n_clusters=n_clusters, affinity='euclidean',
                               linkage='ward').fit(sku_df_scaled)
colors = ward.labels_  # use the cluster labels as the plot colors

# plot the data with CV on the x-axis and ADS on the y-axis
plt.scatter(sku_df['CV'], sku_df['ADS'], s=100, c=colors)
# FIX: the axis labels were swapped relative to the plotted columns —
# x carries CV and y carries ADS, so label them that way
plt.xlabel('Coefficient of Variance')
plt.ylabel('Average Daily Sales')
# and then show the plot
plt.show()

# third, we are asked which graph reports the correct plot of the last project
# evaluation as a function of the number of projects done for the HR dataset
# in order to do this, let's load the data into memory
hr_df = pd.read_csv(SBA_FILE_LOADER('clustering/DATA_2.02_HR.csv'))

# plot the data with LPE on the x-axis and NP on the y-axis
# (the original comment here was copy-pasted from the CV/ADS plot above)
# NOTE(review): "LPE as a function of NP" would normally put NP on x —
# the code plots LPE on x; confirm against the exercise's expected graph
plt.scatter(hr_df['LPE'], hr_df['NP'], s=100)
# add some axes for clarity
plt.xlabel('Last Project Evaluation')
plt.ylabel('Number of Projects')