def create_biplot(sIn, lWanted):
    """Run a PCA on the columns named in ``lWanted`` and show its biplot.

    The data is loaded with ``csv2np(sIn)``, which also returns the name of
    a component to investigate and a value label used in the figure title.
    The data array is reshaped to one column per entry of ``lWanted``, each
    column is standardised (its mean subtracted, then divided by its
    standard deviation), and the result is handed to ``pcasvd``.  The biplot
    is drawn on figure 2 and displayed with ``plt.show()``.

    When ``csv2np`` returns a non-empty component name, the dot product of
    the (x, y) pair that ``biplot`` returned for that variable with the pair
    returned for ``"TotalSpend"`` is printed and written into the plot title.

    Args:
        sIn: Input passed unchanged to ``csv2np`` (presumably a CSV path --
            confirm against ``csv2np``).
        lWanted: List of column names.  Its length fixes the number of
            columns the data is reshaped to.  It must contain
            ``"TotalSpend"`` and the component name whenever the latter is
            non-empty.

    Raises:
        ValueError: If the data cannot be reshaped to ``len(lWanted)``
            columns, or if the component name or ``"TotalSpend"`` is missing
            from ``lWanted``.
    """
    from statsmodels.sandbox.tools.tools_pca import pcasvd

    aData, sComponent, sValue = csv2np(sIn)

    # One column per wanted name; iterating the transpose yields the columns
    # in order, so each name is paired with its own column.
    aDataReshape = aData.reshape((-1, len(lWanted)))
    df = pd.DataFrame(dict(zip(lWanted, aDataReshape.T)))

    # Selecting by lWanted restores the requested column order before the
    # columns are standardised.
    data = df[lWanted]
    data = (data - data.mean()) / data.std()
    # demean=False because the columns were already centred above.
    pca = pcasvd(data, keepdim=0, demean=False)

    plt.figure(2)
    lX, lY = biplot(plt, pca, labels=data.index, xpc=0, ypc=1, bPoints=False)

    if sComponent != "":
        # Get the dot product for a variable -- an investigative tool for
        # categorical units.
        iIndex1 = lWanted.index(sComponent)
        xVar = lX[iIndex1]
        yVar = lY[iIndex1]
        print('{}: {}, {}'.format(sComponent, xVar, yVar))

        sTot = "TotalSpend"
        iIndex2 = lWanted.index(sTot)
        xTot = lX[iIndex2]
        yTot = lY[iIndex2]
        print('{}: {}, {}'.format(sTot, xTot, yTot))

        rDotProd = (xVar * xTot) + (yVar * yTot)
        print("Dot product = ", rDotProd)

        plt.suptitle('"{}" for ({} . {})'.format(sValue, sComponent, sTot))
        plt.title("Dot product = {}".format(round(rDotProd, 4)))

    # NOTE(review): the original indentation was lost; the axis limits and
    # show() are assumed to apply whether or not a component was given.
    plt.axis([-1.2, 1.2, -1.2, 1.2])
    plt.show()
def create_biplot(sIn, lWanted):
    """Run a PCA on the columns named in ``lWanted`` and show its biplot.

    The data is loaded with ``csv2np(sIn)``, which also returns the name of
    a component to investigate and a value label used in the figure title.
    The data array is reshaped to one column per entry of ``lWanted``, each
    column is standardised (its mean subtracted, then divided by its
    standard deviation), and the result is handed to ``pcasvd``.  The biplot
    is drawn on figure 2 and displayed with ``plt.show()``.

    When ``csv2np`` returns a non-empty component name, the dot product of
    the (x, y) pair that ``biplot`` returned for that variable with the pair
    returned for ``"TotalSpend"`` is printed and written into the plot title.

    Args:
        sIn: Input passed unchanged to ``csv2np`` (presumably a CSV path --
            confirm against ``csv2np``).
        lWanted: List of column names.  Its length fixes the number of
            columns the data is reshaped to.  It must contain
            ``"TotalSpend"`` and the component name whenever the latter is
            non-empty.

    Raises:
        ValueError: If the data cannot be reshaped to ``len(lWanted)``
            columns, or if the component name or ``"TotalSpend"`` is missing
            from ``lWanted``.
    """
    from statsmodels.sandbox.tools.tools_pca import pcasvd

    aData, sComponent, sValue = csv2np(sIn)

    # One column per wanted name; iterating the transpose yields the columns
    # in order, so each name is paired with its own column.
    aDataReshape = aData.reshape((-1, len(lWanted)))
    df = pd.DataFrame(dict(zip(lWanted, aDataReshape.T)))

    # Selecting by lWanted restores the requested column order before the
    # columns are standardised.
    data = df[lWanted]
    data = (data - data.mean()) / data.std()
    # demean=False because the columns were already centred above.
    pca = pcasvd(data, keepdim=0, demean=False)

    plt.figure(2)
    lX, lY = biplot(plt, pca, labels=data.index, xpc=0, ypc=1, bPoints=False)

    if sComponent != "":
        # Get the dot product for a variable -- an investigative tool for
        # categorical units.
        iIndex1 = lWanted.index(sComponent)
        xVar = lX[iIndex1]
        yVar = lY[iIndex1]
        print('{}: {}, {}'.format(sComponent, xVar, yVar))

        sTot = "TotalSpend"
        iIndex2 = lWanted.index(sTot)
        xTot = lX[iIndex2]
        yTot = lY[iIndex2]
        print('{}: {}, {}'.format(sTot, xTot, yTot))

        rDotProd = (xVar * xTot) + (yVar * yTot)
        print("Dot product = ", rDotProd)

        plt.suptitle('"{}" for ({} . {})'.format(sValue, sComponent, sTot))
        plt.title("Dot product = {}".format(round(rDotProd, 4)))

    # NOTE(review): the original indentation was lost; the axis limits and
    # show() are assumed to apply whether or not a component was given.
    plt.axis([-1.2, 1.2, -1.2, 1.2])
    plt.show()
def main():
    """Run a PCA on the state.x77 dataset from R and show its biplot.

    Reads ``data/state.x77`` as CSV, prints summary statistics and the first
    rows, then standardises six selected columns (mean subtracted, divided by
    the standard deviation) and runs ``pcasvd`` on them.  Observations are
    grouped with k-means (k=3) on the standardised values and each point is
    coloured green, blue or yellow according to its cluster index.  The
    biplot is drawn on figure 1 and displayed with ``plt.show()``.
    """
    df = pd.read_csv('data/state.x77')
    # print() calls rather than Python 2 print statements, so the module
    # also parses on Python 3.
    print(df.describe())
    print(df.head())

    columns = [
        'Population',
        'Income',
        'Illiteracy',
        'Life Exp',
        'Murder',
        'HS Grad',
    ]
    data = df[columns]
    data = (data - data.mean()) / data.std()
    # demean=False because the columns were already centred above.
    pca = pcasvd(data, keepdim=0, demean=False)

    # Cluster the standardised observations and map each cluster index to a
    # single-letter matplotlib colour code.
    values = data.values
    centroids, _ = kmeans(values, 3)
    idx, _ = vq(values, centroids)
    colors = ['gby'[i] for i in idx]

    plt.figure(1)
    biplot(plt, pca, labels=data.index, colors=colors, xpc=1, ypc=2)
    plt.show()
def main():
    """Run a PCA on the state.x77 dataset from R and show its biplot.

    Reads ``data/state.x77`` as CSV, prints summary statistics and the first
    rows, then standardises six selected columns (mean subtracted, divided by
    the standard deviation) and runs ``pcasvd`` on them.  Observations are
    grouped with k-means (k=3) on the standardised values and each point is
    coloured green, blue or yellow according to its cluster index.  The
    biplot is drawn on figure 1 and displayed with ``plt.show()``.
    """
    df = pd.read_csv('data/state.x77')
    # print() calls rather than Python 2 print statements, so the module
    # also parses on Python 3.
    print(df.describe())
    print(df.head())

    columns = [
        'Population',
        'Income',
        'Illiteracy',
        'Life Exp',
        'Murder',
        'HS Grad',
    ]
    data = df[columns]
    data = (data - data.mean()) / data.std()
    # demean=False because the columns were already centred above.
    pca = pcasvd(data, keepdim=0, demean=False)

    # Cluster the standardised observations and map each cluster index to a
    # single-letter matplotlib colour code.
    values = data.values
    centroids, _ = kmeans(values, 3)
    idx, _ = vq(values, centroids)
    colors = ['gby'[i] for i in idx]

    plt.figure(1)
    biplot(plt, pca, labels=data.index, colors=colors, xpc=1, ypc=2)
    plt.show()
# f.write("];\n") # f.write("criteria = {") # for crit in criteria: # f.write("'" + crit + "' ") # f.write('};') # f.close() # uninetflows.insert(0,criteria) with open("uninetflows.csv", "w") as f: writer = csv.writer(f) writer.writerows(uninetflows) df = pd.io.parsers.read_csv('uninetflows.csv') data = df[criteria] # data = (data - data.mean()) / data.std() pca = pcasvd(data, keepdim=0, demean=False) colors = ['kcgrbmykcgrbmykcgrbmykcgrbmykcgrbmykcgrbmykcgrbmykcgrbmykcgrbmykcgrbmykcgrbmykcgrbmy'[i] for i in clust_repart] labels = ['************************************************************************************'[i] for i in clust_repart] plt.figure(1) # biplot(plt, pca, labels=data.index, colors=colors, xpc=1, ypc=2) biplot(plt, pca, labels=labels, colors=colors, xpc=1, ypc=2) plt.show() # iter = 0 # sols = [] # while iter < 5: # prob.solve(pulp.GLPK()) # print(LpStatus[prob.status]) # sols.append(prob.variables()) # # iter += 1