def kmeansInitialClusters(dataset):
    """Train k-means from preset initial centers, then plot and summarise.

    A ``KMeansModel`` built from ``CENTER_VECTORS`` is used as the initial
    model for ``KMeans.train`` (4 clusters, at most 1000 iterations).  Every
    collected row of ``dataset`` is then assigned a cluster with the trained
    model, and the resulting list of dicts (keys ``"features"``,
    ``"prediction"``, ``"label"``) is handed to
    ``plotDiversitySizeClustering`` and ``centroidArtistSongCount`` together
    with ``CENTERS``.

    :param dataset: object exposing ``.rdd`` and ``.collect()`` whose rows
        can be indexed by ``'features'`` and ``'label'`` (presumably a Spark
        DataFrame -- confirm against the caller).
    :return: None; the function is called for its plotting/reporting side
        effects.
    """
    seed_model = KMeansModel(CENTER_VECTORS)

    # Each feature vector is round-tripped through its string form before
    # being used for training/prediction (presumably to convert between
    # vector classes -- TODO confirm).
    training_vectors = dataset.rdd.map(
        lambda row: Vectors.parse(Vectors.stringify(row['features'])))
    fitted_model = KMeans.train(
        training_vectors, 4, maxIterations=1000, initialModel=seed_model)

    # collect() brings every row back to the driver; predictions are made
    # one row at a time on the collected data.
    assignments = [
        {
            "features": row["features"],
            "prediction": fitted_model.predict(
                Vectors.parse(Vectors.stringify(row['features']))),
            "label": row['label'],
        }
        for row in dataset.collect()
    ]

    plotDiversitySizeClustering(
        assignments, CENTERS, "Size", "Diversity",
        "Song Analysis by Size and Diversity with Initial Clusters")
    centroidArtistSongCount(assignments, CENTERS)
def map_to_libsvm_format_descr(tp):
    """Map one training record to a libsvm-format line.

    The output has the shape ``label index:value index:value...``.

    NOTE(review): a second definition with the same name and logic follows
    this one in the file and is the one the name ends up bound to.

    :param tp: pair-like record; ``tp[0]`` is the label (converted with
        ``int``) and ``tp[1]`` is the vector passed to ``Vectors.stringify``.
        The slicing below assumes the stringified form looks like
        ``(size,[indices],[values])`` -- i.e. a sparse vector.
    :return: the libsvm line as a string; indexes are shifted to be 1-based
        and values are truncated to ``int``.
    """
    # Drop the surrounding parentheses, then cut at every "[" to isolate
    # the size, the index list and the value list.
    inner_text = Vectors.stringify(tp[1])[1:-1]
    (_size_text, index_text, value_text) = inner_text.split("[")

    # index_text still ends with "],", so the literal parses as a 1-tuple
    # holding the list; value_text ends with "]" and parses as a plain list.
    index_list = ast.literal_eval("[" + index_text)[0]
    value_list = ast.literal_eval("[" + value_text)

    print("sparse vector-------------------------------")
    print(tp[1])
    print("indexes-------------------------------")
    print(index_list)
    print("vals-------------------------------")
    print(value_list)

    libsvm_pairs = []
    for position, raw_index in enumerate(index_list):
        print(position)
        # for this format indexes start from 1
        one_based_index = int(raw_index) + 1
        truncated_value = int(value_list[position])
        libsvm_pairs.append("{0}:{1}".format(one_based_index, truncated_value))

    print("spVectorList is")
    print(libsvm_pairs)

    joined_pairs = " ".join(libsvm_pairs)
    return "{0} {1}".format(int(tp[0]), joined_pairs)
def map_to_libsvm_format_descr(tp):
    """Convert a (label, vector) record into one libsvm text line.

    Result layout: ``label index:value index:value...``.

    NOTE(review): an earlier definition with the same name and logic
    precedes this one in the file; this later one replaces it.

    :param tp: indexable record where ``tp[0]`` is the label (passed through
        ``int``) and ``tp[1]`` is handed to ``Vectors.stringify``.  The
        parsing assumes that string looks like ``(size,[indices],[values])``,
        as produced for a sparse vector.
    :return: string with the integer label followed by space-separated
        ``index:value`` pairs, using 1-based indexes and ``int``-truncated
        values.
    """
    stringified = Vectors.stringify(tp[1])
    # Strip the leading "(" and trailing ")" before splitting on "[".
    pieces = stringified[1:-1].split("[")
    (_dim, indexes_src, values_src) = pieces

    # "[" + indexes_src reads as "[...]," -- a 1-tuple wrapping the list.
    parsed_indexes = ast.literal_eval("[" + indexes_src)[0]
    parsed_values = ast.literal_eval("[" + values_src)

    print("sparse vector-------------------------------")
    print(tp[1])
    print("indexes-------------------------------")
    print(parsed_indexes)
    print("vals-------------------------------")
    print(parsed_values)

    entries = []
    for i, idx in enumerate(parsed_indexes):
        print(i)
        # for this format indexes start from 1
        entries.append("%d:%d" % (int(idx) + 1, int(parsed_values[i])))

    print("spVectorList is")
    print(entries)

    return "%d %s" % (int(tp[0]), " ".join(entries))
def __str__(self):
    """Return ``(<label>,<features>)`` as a string.

    The label is rendered with ``str`` and the features with
    ``Vectors.stringify``; the two are separated by a comma and wrapped in
    parentheses.
    """
    label_text = str(self.label)
    features_text = Vectors.stringify(self.features)
    return "(" + label_text + "," + features_text + ")"