Code Example #1
File: tests.py  Project: vidur89/spark
    def test_kmeans_deterministic(self):
        from pyspark.mllib.clustering import KMeans

        X = range(0, 100, 10)
        Y = range(0, 100, 10)
        data = [[x, y] for x, y in zip(X, Y)]
        clusters1 = KMeans.train(self.sc.parallelize(data), 3, initializationMode="k-means||", seed=42)
        clusters2 = KMeans.train(self.sc.parallelize(data), 3, initializationMode="k-means||", seed=42)
        centers1 = clusters1.centers
        centers2 = clusters2.centers
        for c1, c2 in zip(centers1, centers2):
            # TODO: Allow small numeric difference.
            self.assertTrue(array_equal(c1, c2))
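The TODO above asks for a tolerance-based comparison instead of exact equality. A minimal sketch of how the check could be relaxed, assuming numpy.allclose; the helper name and the 1e-6 tolerance are illustrative choices, not part of the Spark test suite:

import numpy as np

def centers_match(centers1, centers2, atol=1e-6):
    # Compare cluster centers pairwise, tolerating small numeric differences.
    return all(np.allclose(c1, c2, atol=atol) for c1, c2 in zip(centers1, centers2))

# The strict assertion could then become:
#     self.assertTrue(centers_match(clusters1.centers, clusters2.centers))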
Code Example #2
File: aggregator.py  Project: luca-zamboni/Big-Data
def clusterKMeanSpark(matrix,k):
	m = transformInRealMatrix(matrix)
	sc = SparkContext(appName="Jsonizer: Remove stop words")
	parsedData = sc.parallelize(m)
	y = []
	x = []
	clustersControl = range(k,k+1)
	for kc in clustersControl:
		clusters = KMeans.train(parsedData, kc, maxIterations=50000,runs=200, initializationMode="k-means||",epsilon=0.0001)
		clu = []

		def error(point,clust):
		    center = clust.centers[clust.predict(point)]
		    return sqrt(sum([x**2 for x in (point - center)]))


		WSSSE = parsedData.map(lambda point: error(point,clusters)).reduce(lambda x, y: x + y)
		for n in m:
			clu += [clusters.predict(np.array(n))]

		x += [kc]
		y += [WSSSE]

		#print(kc,WSSSE)

	#plt.plot(x,y)
	#plt.ylabel('some numbers')
	#plt.show()

	ret = [[] for i in range(0,max(clu)+1)]
	for i in range(0,len(clu)):
		ret[clu[i]] += [i]
	sc.stop()
	return ret
Code Example #3
def train_subquantizers(sc, split_vecs, M, subquantizer_clusters, model, seed=None):
    """
    Project each data point into its local space and compute subquantizers by clustering
    each fine split of the locally projected data.
    """
    b = sc.broadcast(model)

    def project_local(x):
        x = np.concatenate(x)
        coarse = b.value.predict_coarse(x)
        return b.value.project(x, coarse)

    projected = split_vecs.map(project_local)

    # Split the vectors into the subvectors
    split_vecs = projected.map(lambda x: np.split(x, M))
    split_vecs.cache()

    subquantizers = []
    for split in xrange(M):
        data = split_vecs.map(lambda x: x[split])
        data.cache()
        sub = KMeans.train(data, subquantizer_clusters, initializationMode='random', maxIterations=10, seed=seed)
        data.unpersist()
        subquantizers.append(np.vstack(sub.clusterCenters))

    return (subquantizers[:len(subquantizers) / 2], subquantizers[len(subquantizers) / 2:])
Code Example #4
File: ml.py  Project: aditcoding/zfs
def main(sc):

    stopset = set(stopwords.words('english'))

    tweets = sc.textFile('hdfs:/adi/sample.txt')
    words = tweets.map(lambda word: word.split(" "))
    wordArr = []
    for wArr in words.collect():
        tempArr = []
        for w in wArr:
                if not w in stopset:
                        tempArr.append(w)
        wordArr.append(tempArr)
    # Open a file
   # print wordArr
    #tokens = sc.textFile("hdfs:/adi/tokens1.txt")

    # Load documents (one per line).
    documents = sc.textFile("hdfs:/adi/tokens1.txt").map(lambda line: line.split(" "))
    numDims = 100000
    hashingTF = HashingTF(numDims)
    tf = hashingTF.transform(documents)
    tf.cache()
    idf = IDF().fit(tf)
    tfidf = idf.transform(tf)
    tfidf.count()
    model = KMeans.train(tfidf, 5)
    model.save(sc,"tweetModel1")
    print("Final centers: " + str(model.clusterCenters))
#    print("Total Cost: " + str(model.computeCost(data)))
    sc.stop()
Code Example #5
def main():
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logger.info('Loading pickled noun to vector dictionary')
    # Load noun to vector dictionary
    with open(NOUN_TO_VECT_DICT_FILE_LOC, 'rb') as pickled:
        noun_to_vect_dict = pickle.load(pickled)
    # Create vector array from mapping
    vectors = np.array(noun_to_vect_dict.values())
    max_k = int(sqrt(len(vectors) / 2.0))

    # Define search space for k
    numbers_of_clusters = reversed(range(MIN_K, max_k))

    # For each k
    for i, k in enumerate(numbers_of_clusters):
        # Initialize Spark Context
        sc = ps.SparkContext()
        # Load data
        data = sc.parallelize(vectors, 1024)

        logger.info('Trial %i of %i, %i clusters', (i + 1), max_k - 1, k)
        # Calculate cluster
        kmeans_model = KMeans.train(data, k, maxIterations=10, runs=10,
                                    initializationMode='k-means||')
        logger.info('Calculating WSSSE')
        # Calculate WSSSE
        WSSSE = data.map(lambda point: error(kmeans_model, point)) \
                    .reduce(lambda x, y: x + y)
        logger.info('Writing WSSSE')
        # Write k and WSSSE
        with open(path.join(OUT_FILES_LOC, 'elbow_data.txt'), 'a') as elbow_data:
            elbow_data.write(str(k) + '\t' + str(WSSSE) + '\n')

        sc.stop()
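The loop above calls an error(model, point) helper that is not part of this excerpt. A minimal sketch of what it presumably looks like, mirroring the WSSSE helpers defined in other examples on this page (an assumption, not the original code):

import numpy as np

def error(model, point):
    # Euclidean distance from a point to its assigned cluster center.
    center = model.centers[model.predict(point)]
    return np.sqrt(np.sum((np.asarray(point) - np.asarray(center)) ** 2))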
Code Example #6
File: kmeans_analyse.py  Project: summer-apple/spark
    def train_model(self, dataframe, k, model_name):
        '''
        use data to train model
        :param dataframe: all columns for train
        :param k:k value
        :param model_name:the trained model
        :return:None
        '''

        data = self.prepare_data(dataframe)

        # train to get model
        model = KMeans.train(data, k)

        # create model saving path
        path = self.base + model_name

        # try to delete the old model if it exists
        try:
            import subprocess
            subprocess.call(["hadoop", "fs", "-rm", "-f", path])
        except:
            pass
        # save new model on hdfs
        model.save(self.sc, path)
        # print all cluster of the model
        for c in model.clusterCenters:
            l = []
            for i in c:
                i = decimal.Decimal(i).quantize(decimal.Decimal('0.01'))
                l.append(float(i))
            print(l)
Code Example #7
def main():
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    # Load in pickled noun to vector dictionary
    logger.info('Loading pickled noun to vector dictionary')
    # Load noun to vector dictionary
    with open(NOUN_TO_VECT_DICT_FILE_LOC, 'rb') as f:
        noun_to_vect_dict = pickle.load(f)

    # Create vectors array
    vectors = noun_to_vect_dict.values()

    # Initialize Spark Context
    sc = ps.SparkContext('local[*]')
    # Load data
    data = sc.parallelize(vectors, 1024)

    # Create and fit a KMeans model to the data
    logger.info('Fitting KMeans model')
    kmeans_model = KMeans.train(data, N_CLUSTERS, maxIterations=10, runs=10,
                                initializationMode='k-means||')

    # Create a list of labels corresponding to vectors
    logger.info('Labeling vectors')
    labels = [kmeans_model.predict(vector) for vector in vectors]
    # Write to text file
    logger.info('Writing labels to file')
    with open(path.join(OUT_FILE_LOC, 'labels.txt'), 'w') as f:
        for label in labels:
            f.write(str(label) + '\n')
Code Example #8
File: tests.py  Project: vidur89/spark
    def test_kmeans(self):
        from pyspark.mllib.clustering import KMeans

        data = [[0, 1.1], [0, 1.2], [1.1, 0], [1.2, 0]]
        clusters = KMeans.train(self.sc.parallelize(data), 2, initializationMode="k-means||")
        self.assertEquals(clusters.predict(data[0]), clusters.predict(data[1]))
        self.assertEquals(clusters.predict(data[2]), clusters.predict(data[3]))
Code Example #9
    def fit(self, Z):
        """Compute k-means clustering.

        Parameters
        ----------
        Z : ArrayRDD or DictRDD containing array-like or sparse matrix
            Train data.

        Returns
        -------
        self
        """
        X = Z[:, 'X'] if isinstance(Z, DictRDD) else Z
        check_rdd(X, (np.ndarray, sp.spmatrix))
        if self.init == 'k-means||':
            self._mllib_model = MLlibKMeans.train(
                X.unblock(),
                self.n_clusters,
                maxIterations=self.max_iter,
                initializationMode="k-means||")
            self.cluster_centers_ = self._mllib_model.centers
        else:
            models = X.map(lambda X: super(SparkKMeans, self).fit(X))
            models = models.map(lambda model: model.cluster_centers_).collect()
            return super(SparkKMeans, self).fit(np.concatenate(models))
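A hedged usage sketch for the fit() above, assuming sparkit-learn's wrappers; the import paths and the ArrayRDD construction shown here are assumptions based on that library's documented API, and sc is an existing SparkContext:

import numpy as np
from splearn.rdd import ArrayRDD          # assumed import path (sparkit-learn)
from splearn.cluster import SparkKMeans   # assumed import path (sparkit-learn)

# Two well-separated blobs as toy data.
X = np.concatenate([np.random.randn(50, 2), np.random.randn(50, 2) + 10])
Z = ArrayRDD(sc.parallelize(X, 4))        # blocked RDD wrapper around the data

km = SparkKMeans(n_clusters=2, init='k-means||', max_iter=10)
km.fit(Z)                                 # takes the MLlib branch shown above
print(km.cluster_centers_)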
Code Example #10
def kMeans(vecs, clusterNum):
	clusters = KMeans.train(vecs, clusterNum, maxIterations=10, runs=10, initializationMode="random")

	if pv.outputDebugMsg:
		Utils.logMessage("\nKmean cluster finished")

	return clusters
Code Example #11
File: mlKmeans.py  Project: Tomlong/MLlib-UI
def KMeansModel(dataPath, label, k, character, master):
    sc = SparkContext(master)
    data = sc.textFile(dataPath).map(lambda line: line.replace(character, ','))

    if label == 0:
        label_sum = data.map(lambda line: line.split(',')).map(lambda data: (float(data[0]), 1)).reduceByKey(add).collect()
        label = data.map(lambda line: line.split(',')).map(lambda data: float(data[0])).collect()        
        train_data = data.map(lambda line: line.split(',')).map(lambda x: map(lambda part: float(part), x[1:len(x)]))
    else:
        label_sum = data.map(lambda line: line.split(',')).map(lambda data: (float(data[-1]), 1)).reduceByKey(add).collect()
        label = data.map(lambda line: line.split(',')).map(lambda data: float(data[-1])).collect()        
        train_data = data.map(lambda line: line.split(',')).map(lambda x: map(lambda part: float(part) if part is not None else '', x[:len(x) - 1]))
    model = km.train(train_data, k)
    predict_data = train_data.collect()
    train = len(predict_data)
    acc = 0
    
    for i in range(len(label_sum)):
        ksum = np.zeros(k, dtype = int)
        cur_label = label_sum[i][0]
        for j in range(train):
            if label[j] == cur_label:
                ksum[model.predict(predict_data[j])] += 1
        acc += max(ksum)

    string = "KMeans Result: \n"
    center = model.centers
    for i in range(k):
        cur = str(i) + ":" + str(center[i]) + '\n'
        string += cur  
    string = string + "Acc: " + str((float(acc)/train) * 100) + "%"    
    sc.stop()
    return string
Code Example #12
def kmeans(iterations, theRdd):
    def error(point):
        center = clusters.centers[clusters.predict(point)]
        return sqrt(sum([x**2 for x in (point - center)]))
    clusters = KMeans.train(theRdd, iterations, maxIterations=10,
            runs=10, initializationMode="random")
    WSSSE = theRdd.map(lambda point: error(point)).reduce(lambda x, y: x + y)
    return WSSSE, clusters
Code Example #13
File: kmeansSpark.py  Project: Riuchando/Spark
def main(arg1, arg2):
    sc = SparkContext(appName="KMeans")
    lines = sc.textFile(arg1)
    data = lines.map(parseVector)
    k = int(arg2)
    model = KMeans.train(data, k)
    print("Final centers: " + str(model.clusterCenters))
    print("Total Cost: " + str(model.computeCost(data)))
    sc.stop()
Code Example #14
def spark_KMeans(train_data):
    maxIterations = 10
    runs = 20
    numClusters = [2,3,4,5,6,7,8,9,10,11,12,13,14]
    errors = []
    for k in numClusters:
        model = KMeans.train(train_data, k, maxIterations=maxIterations, runs=runs,initializationMode='random', seed=10, initializationSteps=5, epsilon=1e-4)
        WSSSE = model.computeCost(train_data)
        errors.append(WSSSE)

    plt.plot(numClusters, errors, 'ro')
    plt.xlabel(r'k')
    plt.ylabel(r'inertia')
    plt.title(r'inertia v.s. k')
    plt.savefig('kmeans_cross_validation.png')

    bestModel = KMeans.train(train_data, 6, maxIterations=maxIterations, runs=runs,initializationMode='random', seed=10, initializationSteps=5, epsilon=1e-4)
    return bestModel
Code Example #15
def cluster_data(sc, qc):
	drivers = read_file_path(BASE_PATH)
	print "Number of drivers: %d" % len(drivers)

	# Load and parse the data
	for i, dr in enumerate(drivers):
		# extract driver number from path
		dr_num = re.search("[0-9]+$", dr.strip())

		if dr_num:
			dr_num = dr_num.group(0)
			if dr_num == '1018':
				continue
		else:
			print 'driver number error for %s' % dr 
			continue

		dr_data = sc.textFile("hdfs://" + dr + "/" + dr_num + "_all_trips.txt")

		data = dr_data.map(lambda row: [float(x) for x in row.split(',')])

		if i == 0:
			all_data = data
		else:
			all_data = all_data.union(data)

		data.unpersist()

	print 'Total number of records: %d' % all_data.count()

	# Build the model (cluster the data), k = Number of clusters
	k = 5 
	t = time()
	clusters = KMeans.train(all_data, k, maxIterations=100, runs=100, initializationMode="random", )
	print 'KMeans took %.2f seconds' % (time() - t)

	# Compute cost
	WSSSE_map = all_data.map(lambda point: error(point, clusters))

	# Join cluster ID to original data
	all_data_w_cluster = all_data.map(lambda point: np.hstack((point, get_cluster_id(clusters, point))))

	# all_data_w_cluster.saveAsTextFile("hdfs:///usr/local/spark/kmeans/results.txt")

	for i in xrange(0,k):
		subset = all_data_w_cluster.filter(lambda x: x[-1] == i)
		print "Number of items in cluster %d: %d" % (i, subset.count())
		# Compute functions on different features:
		all_features_average = subset.sum() / subset.count()
		print 'Average of all features'
		print all_features_average
	
	WSSSE = all_data.map(lambda point: error(point, clusters)).reduce(lambda x, y: x + y)
	print("Within set sum of squared error: " + str(WSSSE))
Code Example #16
File: test_linalg.py  Project: drewrobb/spark
 def test_clustering(self):
     from pyspark.mllib.clustering import KMeans
     data = [
         self.scipy_matrix(3, {1: 1.0}),
         self.scipy_matrix(3, {1: 1.1}),
         self.scipy_matrix(3, {2: 1.0}),
         self.scipy_matrix(3, {2: 1.1})
     ]
     clusters = KMeans.train(self.sc.parallelize(data), 2, initializationMode="k-means||")
     self.assertEqual(clusters.predict(data[0]), clusters.predict(data[1]))
     self.assertEqual(clusters.predict(data[2]), clusters.predict(data[3]))
Code Example #17
File: k_means.py  Project: honeycombcmu/SparkService
def k_means(loadTrainingFilePath, sc):
	# Load and parse the data
	loadTrainingFilePath = "../data/kmeans_data.txt"
	data = sc.textFile(loadTrainingFilePath)
	parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))
	# Build the model (cluster the data)
	clusters = KMeans.train(parsedData, 3, maxIterations=10, runs=30, initializationMode="random")

	WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)

	print("Within Set Sum of Squared Error = " + str(WSSSE))
Code Example #18
def build_cluster_model(tfidf_vectors_rdd, num_clusters, max_iterations, runs):
    """Perform the clustering of vectors using K-means.

    Returns:
        k means model learned from the training data in
            tfidf_vectors_rdd

    """

    # Build the model (cluster the training data)
    return KMeans.train(tfidf_vectors_rdd, num_clusters, maxIterations=max_iterations, runs=runs)
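A hedged usage sketch for build_cluster_model, mirroring the TF-IDF pipeline used elsewhere on this page; the document contents, feature dimension, and parameter values are illustrative, and runs assumes a Spark 1.x MLlib where that argument is still accepted:

from pyspark import SparkContext
from pyspark.mllib.feature import HashingTF, IDF

sc = SparkContext(appName="tfidf-kmeans-sketch")
documents = sc.parallelize([
    ["spark", "mllib", "kmeans"],
    ["tfidf", "feature", "vectors"],
    ["spark", "clustering", "example"],
])
tf = HashingTF(numFeatures=1 << 18).transform(documents)
tfidf = IDF().fit(tf).transform(tf)

model = build_cluster_model(tfidf, num_clusters=2, max_iterations=20, runs=10)
print(model.clusterCenters)
sc.stop()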
Code Example #19
def main(noun_file_loc, model_file_loc, percent, n_trials, out_files_loc):
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logger.info('Loading Word2Vec model')
    # Load trained Word2Vec model
    model = Word2Vec.load(model_file_loc)

    logger.info('Reading in list of nouns')
    # Read in list of sorted nouns
    sorted_nouns = []
    with open(noun_file_loc, 'r') as f:
        for line in f:
            sorted_nouns.append(line.strip())
    # Count number of nouns
    n_nouns = len(sorted_nouns)

    # Create dictionary to map nouns to vectors
    noun_to_vect_dict = {}
    # Calculate index to stop slice as percentage of total nouns
    n_nouns_to_keep = int(n_nouns * percent / 100.)
    logger.info('Keeping %i nouns, %i percent of %i',
                n_nouns_to_keep, percent, n_nouns)
    # Add nouns and vectors to dictionary
    for noun in sorted_nouns[0:n_nouns_to_keep]:
        noun_to_vect_dict[noun] = model[noun]

    vectors = np.array(noun_to_vect_dict.values())

    # Initialize Spark Context
    sc = ps.SparkContext('local[4]')
    # Load data
    data = sc.parallelize(vectors)

    # Define search space for k
    ns_clusters = [int(x) for x in np.linspace(2, n_nouns, n_trials)]
    # Open WSSSEs output file
    with open(path.join(out_files_loc, 'elbow_data.txt'), 'w') as elbow_data:
        # For each k
        for i, k in enumerate(ns_clusters):
            logger.info('Trial %i of %i, %i clusters', (i + 1), n_trials, k)
            # Calculate cluster
            kmeans_model = KMeans.train(data, k, maxIterations=10, runs=10,
                                        initializationMode='k-means||')
            # Calculate WSSSE
            WSSSE = data.map(lambda point: error(kmeans_model, point)) \
                        .reduce(lambda x, y: x + y)
            # Save centroids
            with open(path.join(out_files_loc, '_' + str(k) + '.pkl'), 'w') as f:
                pickle.dump(kmeans_model.clusterCenters, f)
            # Write k and WSSSE
            elbow_data.write('%i, %f\n' % (k, WSSSE))
Code Example #20
def main():
    sc = SparkContext()
    filename = sys.argv[1]
    clusters=int(sys.argv[2])
    outmodelname = sys.argv[3]
    dataset = gdal.Open(filename, GA_ReadOnly)
    driver = dataset.GetDriver().ShortName
    x, y, data = tiff_to_array(dataset, weights)
    print "after change to array"
    clusterdata = sc.parallelize(data)
    print "parallelize done"
    kmeanmodel = KMeans.train(clusterdata, clusters, maxIterations=50, runs=10)
    kmeanmodel.save(sc, outmodelname)
    print kmeanmodel.clusterCenters
Code Example #21
def train_coarse(sc, split_vecs, V, seed=None):
    """
    Perform KMeans on each split of the data with V clusters each.
    """

    # Cluster first split
    first = split_vecs.map(lambda x: x[0])
    first.cache()
    print 'Total training set size: %d' % first.count()
    print 'Starting training coarse quantizer...'
    C0 = KMeans.train(first, V, initializationMode='random', maxIterations=10, seed=seed)
    print '... done training coarse quantizer.'
    first.unpersist()

    # Cluster second split
    second = split_vecs.map(lambda x: x[1])
    second.cache()
    print 'Starting training coarse quantizer...'
    C1 = KMeans.train(second, V, initializationMode='random', maxIterations=10, seed=seed)
    print '... done training coarse quantizer.'
    second.unpersist()

    return np.vstack(C0.clusterCenters), np.vstack(C1.clusterCenters)
Code Example #22
File: es_pyspark.py  Project: Jios/bd2015hw01
def kmeans(k=2):
    """ kmeans """

    # Load and parse training data
    data = getTrainData(dataFilename)
    parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')])) # pyspark.rdd.PipelinedRDD

    # Build the model (cluster the data)
    #  KMeans.train(cls, data, k, maxIterations=100, runs=1, initializationMode="k-means||")
    clf = KMeans.train(parsedData, k, maxIterations=10, runs=10, initializationMode="random") # pyspark.mllib.clustering.KMeansModel

    WSSSE = parsedData.map(lambda point: error(point, clf)).reduce(lambda x, y: x + y) # float
    print("Within Set Sum of Squared Error = " + str(WSSSE))
    print "###  cluster centers  ###:"
    print clf.centers
    return clf
Code Example #23
File: recognition.py  Project: jethrotan/bobo
def bagofwords(imtrain, imtest=None, features=_features, outdir=None):
    cache = Cache(cacheroot=outdir)

    # Unique labels
    labels = imtrain.map(lambda x: x.category).distinct().collect()
    print labels

    # Features: each returns a row array of features 
    X = imtrain.map(features)  
    
    # Clustering: kmeans clustering to generate words
    # http://spark.apache.org/docs/0.9.0/mllib-guide.html
    model = KMeans.train(X, 2, maxIterations=10, runs=30, initializationMode='random')
    
    # construct bag of words representation
    print model.clusterCenters
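The example stops right after the "construct bag of words representation" comment. One plausible continuation (an assumption, not the project's code) quantizes each image's feature rows against the learned cluster centers and builds a normalized histogram:

import numpy as np

def bow_histogram(feature_rows, model, k=2):
    # feature_rows: the feature vectors extracted from one image.
    counts = np.zeros(k)
    for f in feature_rows:
        counts[model.predict(f)] += 1
    total = counts.sum()
    return counts / total if total > 0 else counts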
Code Example #24
File: kmeans_CSV2.py  Project: angelamuliu/PSC
def kmeans_CSV():
    try:
	# creating a parsedData RDD to do kmeans on
	servernum = sys.argv[1]
	serverpath = "hdfs://10.0.0.4:8020/opentsdb/" + servernum
	print "Attempting to create SparkContext"
	sconf = SparkConf().setAppName("Kmeans for files")
	print "Sconf set..."
	sc = SparkContext(conf=sconf)
	print "SparkContext created"
	
	# making parsedData RDD: [ array([filewrites, filereads, CPU, diskIOBW, net bytes]), array([...]), ... ]
	# kmeans iteratively passes over data multiple times - cache parsedData
	
	if len(sys.argv) == 2: #user just specified server - do full server kmeans
	    filepaths = get_file_paths(serverpath)# Array of string file paths to all files within folder
	    parsedData = compile_RDD(sc, filepaths).cache()
	    CSV_filename =  make_name(filepaths) + "_" + servernum
	elif len(sys.argv) == 3: #user put in server and single timeframe - do single file kmeans
	    timeframe = sys.argv[2] #ex: 2014-07-09
	    filepaths = get_singlefile_path(timeframe, serverpath)
	    parsedData = compile_RDD(sc, filepaths).cache()
	    CSV_filename = str(timeframe) + "_" + servernum
	else: #user put in server and start/end timeframe - do timeframe kmeans
	    start_timeframe = sys.argv[2]
	    end_timeframe = sys.argv[3]
	    filepaths = get_timeframefile_paths(start_timeframe, end_timeframe, serverpath)
	    parsedData = compile_RDD(sc, filepaths).cache()
	    CSV_filename =  make_name(filepaths)+ "_" + servernum
	k = findk(parsedData.count())
	
	clusters = KMeans.train(parsedData, k, maxIterations=10, runs=10, initializationMode="random")
	centers = clusters.clusterCenters
	
	# Creating two CSVs (one has data points, one has centers) for later visualization
	compile_CSV(CSV_filename, parsedData)
	compile_centers_CSV(CSV_filename, centers)
	print "SUCCESS: Kmeans done"
    except:
	print "---------------------------------"
	print "Usage: ./bin/spark-submit kmeans_CSV.py <servername> <start_timeframe> <end_timeframe>"
	print "<servername> must be specified. EX: sense0 "
	print "Timeframes are optional. Specify just one timeframe for single file kmeans. Specify start and end for kmeans over timeframe."
	print "Timeframes must be in format yyyy-mm-DD"
	print "---------------------------------"
	raise
Code Example #25
def choose_k(sub_df):
    wssse_list = []
    for i in range(1, 11):
        clusters = KMeans.train(sub_df,
                                i,
                                maxIterations=10,
                                initializationMode="random")
        WSSSE = sub_df.map(lambda point: sqrt(
            sum([
                x**2 for x in
                (point - clusters.centers[clusters.predict(point)]) / sd
            ]))).reduce(add)
        wssse_list.append(WSSSE)
    wssse_minus = [(x - y) / (x + 0.001)
                   for x, y in zip(wssse_list[:-2], wssse_list[1:])]
    zipped = zip(range(1, 10), np.abs(wssse_minus))
    k = sorted(zipped, key=itemgetter(1), reverse=True)[0][0]
    return k
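To make the selection rule above concrete, here is a tiny plain-Python illustration of the relative-drop formula with made-up WSSSE values (not data from the original project):

# Hypothetical WSSSE values for k = 1..4 (made up for illustration):
wssse_list = [100.0, 60.0, 50.0, 48.0]
wssse_minus = [(x - y) / (x + 0.001)
               for x, y in zip(wssse_list[:-2], wssse_list[1:])]
# wssse_minus -> [0.40, 0.17]: the relative drop from k=1 to k=2 dominates,
# so sorting by drop size returns k=1, i.e. the k right before the sharpest
# fall in WSSSE.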
Code Example #26
    def test_kmeans(self):
        from pyspark.mllib.clustering import KMeans

        data = [
            [0, 1.1],
            [0, 1.2],
            [1.1, 0],
            [1.2, 0],
        ]
        clusters = KMeans.train(
            self.sc.parallelize(data),
            2,
            initializationMode="k-means||",
            initializationSteps=7,
            epsilon=1e-4,
        )
        self.assertEqual(clusters.predict(data[0]), clusters.predict(data[1]))
        self.assertEqual(clusters.predict(data[2]), clusters.predict(data[3]))
Code Example #27
def get_cluster_ids(data_to_cluster, K):
    """Gets cluster ids for data to be clustered"""

    #~ print('Traning KMeans')
    model = KMeans.train(data_to_cluster,
                         K,
                         maxIterations=10,
                         initializationMode='random')
    #~ print('Finished Traning')
    #~ print('Predicting Cluster IDs')
    cluster_ids = model.predict(data_to_cluster)
    #~ print('Finished Prediction')
    #~ print('10 Sample Cluster IDs:')
    #~ print(cluster_ids.takeSample(False, 10))
    #~ print('10 Sample Data (nutrient-nutrient) Clusters')
    #~ print(data_to_cluster.takeSample(False, 10))
    #~ print('Fetched Cluster IDs')

    return cluster_ids, data_to_cluster
Code Example #28
 def __kmeans_clustering(self):
     # get the whole population without fitness value, then flat it
     rdd_aux = self.__rdd.flatMap(lambda x: x.get_population(fitness=False))
     # train the kmeans
     kmeans_cluster = KMeans.train(
         rdd_aux,
         self.__colonies,
         maxIterations=self.__cluster_iterations,
         initializationMode="random")  # acepta "k-means||"
     # create a new rdd with the labels
     rdd_labels = kmeans_cluster.predict(rdd_aux)
     # zip each result with its class
     rdd_aux = rdd_labels.zip(rdd_aux)
     # input serialization
     cols = self.__colonies
     self.__sc.broadcast(cols)
     # divide into partitions
     rdd_aux = rdd_aux.partitionBy(cols, partitionFunc=lambda x: x).glom()
     # remove the index of each element
     rdd_aux = rdd_aux.map(lambda x: [y[1] for y in x])
     # input serialization
     evaluation = self.__evaluation
     generation = self.__generation
     cross = self.__cross
     mutation = self.__mutation
     selection = self.__selection
     survival = self.__survival
     mut_ratio = self.__mut_ratio
     survival_ratio = self.__survival_ratio
     control_obj = self.__control_obj
     # create the new colonies
     self.__rdd = rdd_aux.map(
         lambda x: Colony(evaluation,
                          generation,
                          cross=cross,
                          mutation=mutation,
                          selection=selection,
                          mut_ratio=mut_ratio,
                          survival_ratio=survival_ratio,
                          survival=survival,
                          control_obj=control_obj,
                          population=x))
Code Example #29
    def detect(self, k, t):
        # Encoding categorical features using one-hot.
        df1 = self.cat2Num(self.rawDF, [0, 1]).cache()
        df1.show(n=2, truncate=False)

        # Clustering points using KMeans
        features = df1.select("features").rdd.map(lambda row: row[0]).cache()
        model = KMeans.train(features, k, maxIterations=40, runs=10, initializationMode="random", seed=20)

        # Adding the prediction column to df1
        modelBC = sparkCt.broadcast(model)
        predictUDF = udf(lambda x: modelBC.value.predict(x), StringType())
        df2 = df1.withColumn("prediction", predictUDF(df1.features)).cache()
        df2.show(n=3, truncate=False)

        # Adding the score column to df2; The higher the score, the more likely it is an anomaly
        df3 = self.addScore(df2).cache()
        df3.show(n=3, truncate=False)

        return df3.where(df3.score > t)
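addScore is not shown in this excerpt. One plausible implementation (an assumption; the function name, column names, and formula are illustrative) scores each row by how rare its cluster is, so points in small clusters look more anomalous:

from pyspark.sql import functions as F

def add_score_sketch(df):
    # Count cluster sizes, then score each row relative to the largest cluster.
    counts = df.groupBy("prediction").count()
    n_max = counts.agg(F.max("count")).first()[0]
    n_min = counts.agg(F.min("count")).first()[0]
    joined = df.join(counts, on="prediction")
    return joined.withColumn(
        "score",
        (F.lit(n_max) - F.col("count")) / F.lit(float(max(n_max - n_min, 1))))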
Code Example #30
def get_clusters(data_rdd, num_clusters=NUM_CLUSTERS, max_iterations=MAX_ITERATIONS,
                 initialization_mode=INITIALIZATION_MODE, seed=SEED):
    # TODO:
    # Use the given data and the cluster parameters to train a K-Means model
    # Find the cluster id corresponding to data point (a car)
    # Return a list of lists of the titles which belong to the same cluster
    # For example, if the output is [["Mercedes", "Audi"], ["Honda", "Hyundai"]]
    # Then "Mercedes" and "Audi" should have the same cluster id, and "Honda" and
    # "Hyundai" should have the same cluster id
    features = data_rdd.map(lambda line: array([float(x) for x in line.split(',')[1:]]))
    clusters = KMeans.train(features, num_clusters, maxIterations=max_iterations, initializationMode=initialization_mode, seed=seed)
    
    res = data_rdd.map(lambda line: (clusters.predict(array([float(x) for x in line.split(',')[1:]])), [line.split(',')[0]])).reduceByKey(lambda a, b: a + b)
    result = [[]]
    res = res.collect()
    for c in res:
        result.append(c[1])
    if [] in result:
        result.remove([])
    return result
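A hedged usage sketch for get_clusters with toy CSV lines (a car title followed by numeric features); the values and constants here are illustrative, not from the original assignment:

from pyspark import SparkContext

sc = SparkContext(appName="car-clusters-sketch")
cars = sc.parallelize([
    "Mercedes,2.0,250",
    "Audi,2.1,245",
    "Honda,1.5,130",
    "Hyundai,1.6,128",
])
groups = get_clusters(cars, num_clusters=2, max_iterations=10,
                      initialization_mode="k-means||", seed=42)
print(groups)   # e.g. [['Mercedes', 'Audi'], ['Honda', 'Hyundai']] (cluster order may vary)
sc.stop()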
Code Example #31
def calculate_wssse(data_to_cluster):
    """Calculates Within Set Sum of Squared Error (WSSSE)"""

    K = []
    wssse = []

    for k in range(2, 12):

        print('Computing WSSE for {}'.format(k))
        K.append(k)
        model = KMeans.train(data_to_cluster,
                             k,
                             maxIterations=10,
                             initializationMode='random')
        wssse_value = model.computeCost(data_to_cluster)
        wssse.append(wssse_value)

    # Plot the WSSSE for different values of k
    plt.plot(K, wssse)
    plt.show()
Code Example #32
File: anomaly_detection.py  Project: bslc/cmpt733
    def detect(self, k, t):
        #Encoding categorical features using one-hot.
        df1 = self.cat2Num(self.rawDF, [0, 1])
        df1.show()

        #Clustering points using KMeans
        features = df1.select("features").rdd.map(lambda row: row[0]).cache()
        model = KMeans.train(features, k, maxIterations=40, runs=10, initializationMode="random", seed=20)

        #Adding the prediction column to df1
        modelBC = sc.broadcast(model)
        predictUDF = udf(lambda x: modelBC.value.predict(x), StringType())
        df2 = df1.withColumn("prediction", predictUDF(df1.features))
        df2.show()

        #Adding the score column to df2; The higher the score, the more likely it is an anomaly
        df3 = self.addScore(df2)
        df3.show()

        return df3.where(df3.score > t)
Code Example #33
def run():
    # Set up
    sc = SparkContext()
    records = sc.textFile(os.path.realpath(__file__+'/..') + '/data-scraper/data')
    # Build clusters
    kvpairs = records.map(keyAndParse)
    cts = kvpairs.groupByKey().map(lambda (name, statList): (name, len(statList))).collectAsMap()
    kvpairs = kvpairs.reduceByKey(combine)

    # Filter outliers with too few records
    kvpairs = kvpairs.filter(lambda (k,v): cts[k] > 2)
    kvpairs = kvpairs.map(lambda (name, statline): (name, normalize(statline, cts[name])))
    
    numClusters = 20
    clusters = KMeans.train(kvpairs.map(lambda (k,v): v),numClusters,10)
    groupedClusters = kvpairs.groupBy(lambda (k,v): clusters.predict(v)).map(lambda x: (x[0], getNames(list(x[1])))).collect()
    # Rank clusters
    centers = avg(clusters.clusterCenters)
    centers.sort(key=lambda x: x['score'], reverse=True)
    # Save sorted clusters
    save(groupedClusters, centers)
Code Example #34
def cluster(filename, k, indices):
    stat, data = format_input("input/" + filename, indices)
    output = file("output/" + filename, "w")
    model = KMeans.train(data, k)
    #print(model)
    #pickle.dump(model,open(filename+".p","wb"))

    P = dict()

    cluster_centers = model.clusterCenters
    with file("output/" + filename, "w") as f:
        for x in stat.collect():
            name = str(x[0])
            num = str(model.predict(x[1]))
            centers = str(' '.join(
                '{:.3f}'.format(i)
                for i in cluster_centers[model.predict(x[1])]))

            P[name] = num
            f.write(name + "," + num + "\n")
        pickle.dump(P, open(filename + ".p", "wb"))
Code Example #35
File: recommend.py  Project: summer-apple/spark
    def kmeans_demo(self):

        file = self.sc.textFile(self.base+'k_data.csv')

        # transform to an RDD of numeric feature vectors
        data = file.map(lambda line: [float(x) for x in line.split(',')]).cache()
        print(type(data))

        # train data to get the model
        model = KMeans.train(data,k=3)

        # print to check all clusters
        cluster = model.clusterCenters
        for c in cluster:
            print(c)


        # predict new data  return the data belong to which cluster(index of the cluster)
        predict = model.predict([1.3,.1,1.1])

        print(predict)
Code Example #36
File: recommend.py  Project: summer-apple/spark
    def clustering_score(data,k):
        model = KMeans.train(data, k=k,maxIterations=200)

        def distance(v1, v2):
            s = 0
            # [1,2,3] [4,5,6] --> [(1,4),(2,5),(3,6)]
            pairs = zip(v1,v2)
            for p in pairs:
                sub = float(p[0]) - float(p[1])
                s = s + sub * sub
            return math.sqrt(s)

        def dist_to_centroid(datum):
            # predict the data
            cluster = model.predict(datum)
            # get the current centroid --> means center point
            centroid = model.clusterCenters[cluster]
            # call distance method
            return distance(centroid, datum)

        return data.map(dist_to_centroid).mean()
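A hedged usage sketch for the scoring function above, treating clustering_score as a standalone function; data is assumed to be an RDD of numeric vectors and the candidate k values are illustrative:

scores = [(k, clustering_score(data, k)) for k in (2, 3, 4, 5, 8, 10)]
for k, score in scores:
    print('k=%d  mean distance to centroid=%.4f' % (k, score))
# The score shrinks as k grows, so one would typically look for the k where it
# stops improving quickly (an elbow) rather than simply taking the minimum.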
Code Example #37
File: iestimate.py  Project: sijuaugustin/ingts
 def build_classifier(self, dataset, kmeans_dataset, feature_keys):
     self.logger.info('building classifier')
     kmeans_train_set = []
     for item in kmeans_dataset:
         features = [item[column] for column in feature_keys]
         kmeans_train_set.append(array(features))
     self.logger.debug("kmeans_train_set %d", len(kmeans_train_set))
     kmeans_train_set = sc.parallelize(kmeans_train_set)
     clusters = KMeans.train(kmeans_train_set,
                             100,
                             maxIterations=500,
                             runs=10,
                             initializationMode="random")
     del kmeans_dataset
     del kmeans_train_set
     data = []
     for item in dataset:
         features = [item[column] for column in feature_keys]
         data.append(LabeledPoint(int(item['classifier_label']), features))
     del dataset
     data = sc.parallelize(data)
     (trainingData, testData) = data.randomSplit([0.7, 0.3])
     del data
     model = RandomForest.trainClassifier(
         trainingData,
         numClasses=self.total_splits,
         categoricalFeaturesInfo={},
         numTrees=self.rfc_config['num_trees'],
         featureSubsetStrategy=self.
         rfr_config['feature_subset_strategy'],  # "all",
         impurity='gini',
         maxDepth=self.rfc_config['max_depth'],
         maxBins=32)
     predictions = model.predict(testData.map(lambda x: x.features))
     labelsAndPredictions = testData.map(lambda lp: lp.label).zip(
         predictions)
     testErr = labelsAndPredictions.filter(
         lambda (v, p): v != p).count() / float(testData.count())
     self.logger.info('classifier build finished')
     return model, clusters, testErr
Code Example #38
def kmeans_w2v():
    df_path = "hdfs:///user/rmusters/lambert_w2v_data_jan"
    df = sqlContext.read.parquet(df_path)
    data = df.select("vectors")
    parsedData = data.dropna().map(lambda line: line[0])

    errors = []
    cluster_sizes = []

    for n_clusters in range(10, 1000, 50):
        # Build the model (cluster the data)
        clusters = KMeans.train(parsedData,
                                n_clusters,
                                maxIterations=10,
                                runs=10,
                                initializationMode="random")

        # Evaluate clustering by computing Within Set Sum of Squared Errors
        def error(point):
            center = clusters.centers[clusters.predict(point)]
            return sqrt(sum([x**2 for x in (point - center)]))

        WSSSE = parsedData.map(lambda point: error(point)).reduce(
            lambda x, y: x + y)
        errors.append(WSSSE)
        cluster_sizes.append(n_clusters)
        logger.info("Within Set Sum of Squared Error = " + str(n_clusters) +
                    "&" + str(WSSSE))

        # Save and load model
        if n_clusters == 520:
            clusters.save(sc, "hdfs:///user/rmusters/lambert_kmeans_w2v_jan")
    df = sc.parallelize(errors).map(lambda x: (x, )).toDF().withColumnRenamed(
        "_1", "error")
    df2 = sc.parallelize(cluster_sizes).map(
        lambda x: (x, )).toDF().withColumnRenamed("_1", "n_cluster")
    res = df.join(df2).dropDuplicates(["n_cluster"])
    res.write.format("com.databricks.spark.csv").mode("overwrite").save(
        "errors_kmeans.csv")
Code Example #39
File: kmeans.py  Project: skandg/rough-work
def main():
    '''
    '''
    # set up environment
    conf = SparkConf() \
            .setAppName("kMeans") \
            .set("spark.executor.memory", "2g")
    sc = SparkContext(conf=conf)

    # Load and parse the data
    data = sc.textFile("data/kmeans_data.txt")
    parsedData = data.map( \
            lambda line: array([float(x) for x in line.split(' ')]))

    # Build the model (cluster the data)
    clusters = KMeans.train(parsedData, 2, maxIterations=10, \
            runs=10, initializationMode="random")

    WSSSE = parsedData.map(lambda point: error(clusters, point)) \
            .reduce(lambda x, y: x + y)

    print("Within Set Sum of Squared Error = " + str(WSSSE))
Code Example #40
File: kmeans.py  Project: Ather23/machine_learning
    def kmeans_train(self, data_rdd, n_clusters):
        """
        This method is used to train the model
        """

        data_splits = data_rdd.randomSplit([.50, .25, .25], seed=0)
        training_set = data_splits[0].repartition(numPartitions=4).cache()
        validation_set = data_splits[1].repartition(numPartitions=4).cache()
        test_set = data_splits[2].repartition(numPartitions=4).cache()
        max_iter_arr = [50,60,80]
        max_runs =  [50,60,80]
        k_list = n_clusters
        best_model = None
        best_rmse = float("inf")
        best_run = 0
        best_k = 0
        for itera, run, k in itertools.product(max_iter_arr, max_runs, k_list):
            try:
                model = KMeans.train(training_set, k, itera, run, "random")
                validation_rmse = model.computeCost(validation_set)
                print("#of clusters k %d\n" % (k))

                if validation_rmse < best_rmse:
                    best_model = model
                    best_rmse = validation_rmse
                    best_run = run
                    best_iter = itera
                    best_k = k
            except Exception as e:
                print(e)
                continue

        # test_preds = best_model.predict(test_set.first())
        print("K-means results...")
        print(str(best_rmse))
        print(str(best_k))

        return best_model
Code Example #41
def train_subquantizers(sc,
                        split_vecs,
                        M,
                        subquantizer_clusters,
                        model,
                        seed=None):
    """
    Project each data point into its local space and compute subquantizers by clustering
    each fine split of the locally projected data.
    """
    b = sc.broadcast(model)

    def project_local(x):
        x = np.concatenate(x)
        coarse = b.value.predict_coarse(x)
        return b.value.project(x, coarse)

    projected = split_vecs.map(project_local)

    # Split the vectors into the subvectors
    split_vecs = projected.map(lambda x: np.split(x, M))
    split_vecs.cache()

    subquantizers = []
    for split in xrange(M):
        data = split_vecs.map(lambda x: x[split])
        data.cache()
        sub = KMeans.train(data,
                           subquantizer_clusters,
                           initializationMode='random',
                           maxIterations=10,
                           seed=seed)
        data.unpersist()
        subquantizers.append(np.vstack(sub.clusterCenters))

    return (subquantizers[:len(subquantizers) / 2],
            subquantizers[len(subquantizers) / 2:])
Code Example #42
def main():
    #Reading the json file
    reviews_data = sqlContext.read.json(input)
    reviews=reviews_data.select('reviewText')
    rdd_data=reviews.rdd.map(lambda line:str(line.reviewText))
    transformed_data=rdd_data.map(transform_data).cache()
    #Transforming the words
    model = word2vec.fit(transformed_data)
    #Finding distinct words
    unique_words=transformed_data.flatMap(lambda l:l).map(lambda l:str(l)).distinct()
    # print unique_words.collect()
    dict1={}
    for a in unique_words.collect():
        try:
            dict1[a]=model.transform(a)
        except Exception:
            pass

    # Saving word2vec model
    pickle.dump(dict1, open(output+'/output_vector_sample.txt', "wb"))

    # dict2=pickle.load(open(output+'/output4.txt', "rb"))
    #finding synonyms
    # synonyms = model.findSynonyms('happy', 10)
    # print synonyms
    feature_vectors=dict1.values()
    feature_vectors_rdd=sc.parallelize(feature_vectors)
    clusters = KMeans.train(feature_vectors_rdd, 2000, maxIterations=1,runs=1, initializationMode="random")
    # WSSSE=feature_vectors_rdd.map(lambda point: error(clusters,point)).reduce(lambda x, y: x + y)
    # print("Within Set Sum of Squared Error = " + str(WSSSE))
    cluster_predictions={}
    for key in dict1.keys():
        cluster_predictions[key]=clusters.predict(dict1[key])

    # Saving word to cluster index model
    pickle.dump(cluster_predictions,open(output+'/cluster_data.txt', "wb"))
Code Example #43
 def kmeans_check(self, T, k=3, normalize=True):
     '''
     #:param: indegree  threshold. if T<1, at least one vector in each group would be removed
     #:return: outlier list
     '''
     if not self.transform:
         self.column_datatype(self._column)
     trans_df = self._df.select(self._column).rdd.map(lambda x: np.array(x))
     clusters = KMeans.train(trans_df.map(lambda x: x[:-1]),
                             k,
                             maxIterations=10,
                             runs=1,
                             initializationMode='random')
     maxIngroup = trans_df.map(lambda x: (clusters.predict(x[:-1]), \
     np.linalg.norm(clusters.centers[clusters.predict(x[:-1])]-x[:-1]))).reduceByKey(lambda x,y: x if x>y else y).collect()
     maxIngroup = sorted(maxIngroup)
     distForAll = trans_df.map(lambda x: (x[-1],np.linalg.norm(clusters.centers[clusters.predict(x[:-1])]-x[:-1])/ \
     maxIngroup[clusters.predict(x[:-1])][1]))
     outlier_index = distForAll.filter(lambda x: x[1] > T).map(
         lambda x: int(x[0])).collect()
     print('Around %.2f of rows are outliers.' %
           (len(outlier_index) / self.rownum))
     self.transform = False
     return outlier_index
Code Example #44
def my_test(sc, util, data):
    dat = tcg.tc_gen(100)

    train_data = [np.array(sf.softmax(x)) for x in dat]
    clusters = KMeans.train(sc.parallelize(train_data),
                            20,
                            maxIterations=10,
                            initializationMode="random")

    def error(point):
        center = clusters.centers[clusters.predict(point)]
        return sqrt(sum([x**2 for x in (point - center)]))

    WSSSE = map(lambda point: error(point), train_data)
    WSSSE = reduce(lambda x, y: x + y, WSSSE)
    print("Within Set Sum of Squared Error = " + str(WSSSE))

    clustered = collections.defaultdict(list)

    for i, point in enumerate(train_data):
        clustered[clusters.predict(point)].append(dat[i][0])
    #print len(train_data)
    print clustered.keys()
    return clustered
Code Example #45
def kmeans_model(file_path, file_out):
    global SPARK_MASTER
    y = pyspark.SparkConf()
    y.setMaster(SPARK_MASTER)
    # y.setSparkHome('/usr/local/spark')
    print file_path
    print y.getAll()
    sc = pyspark.SparkContext(conf=y)
    # print sc.pythonExec
    # print sc.pythonVer
    textfile = sc.textFile(file_path)
    print textfile.collect()
    print textfile.count()
    y = textfile.map(lambda each: each.split(' ')[1:])
    p = re.compile('\d:')
    z = y.map(lambda x: transform(x, p))
    z = z.map(lambda x: [float(each) for each in x])
    print z.collect()
    model = KMeans.train(z, 2)
    print model.clusterCenters
    # textfile.saveAsTextFile(file_out)
    model.save(sc, file_out)
    sc.stop()
    """
Code Example #46
rdd_split_int = rdd_split.map(lambda x: [int(x[0]), int(x[1])])

## Count the number of rows in RDD
print("There are {} rows in the rdd_split_int dataset".format(rdd_split_int.count()))

##### K-Means Training #####
## Error Function
def error(point):
    center = model.centers[model.predict(point)]
    return sqrt(sum([x ** 2 for x in (point - center)]))

## Train the model with cluster counts from 1 to 20 and compute WSSSE
clusters_wssse = []

for clst in range(1, 21):
    model = KMeans.train(rdd_split_int, clst, seed=1)
    WSSSE = rdd_split_int.map(lambda point: error(point)).reduce(lambda x, y: x + y)
    clusters_wssse.append([clst, WSSSE])
    print("The cluster {} has Within Set Sum of Squared Error {}".format(clst, WSSSE))

## Train the model again with the best k
model = KMeans.train(rdd_split_int, k=15, seed=1)

## Get cluster centers
cluster_centers = model.clusterCenters

##### Visualize the Clusters #####
## Convert rdd_split_int RDD into Spark DataFrame
rdd_split_int_df = spark.createDataFrame(rdd_split_int, schema=["col1", "col2"])

## Convert Spark DataFrame into Pandas DataFrame
Code Example #47
# iris.csv is from https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data
# this file is uploaded to the S3 bucket used in sc.textFile() below
# FORMAT:
# 5.1,3.5,1.4,0.2,setosa
# 4.9,3.0,1.4,0.2,setosa
# 4.7,3.2,1.3,0.2,setosa
# 4.6,3.1,1.5,0.2,setosa

sc = SparkContext()
# data = sc.textFile("s3://com.lifetech.ampliseq.dev.transfer/iris.csv") # publicly accessible file
data = sc.textFile(input_file)  # local file or publicly accessible file in S3
p = data.map(lambda line: array([float(x) for x in line.split(',')[0:4]]))

# print RDD
for x in p.collect():
    print x

clusters = KMeans.train(p, 3, maxIterations=100, initializationMode="random")


# Get within-cluster sum-of-squares
def error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))


WSSSE = p.map(lambda point: error(point)).reduce(lambda x, y: x + y)

print("Within Set Sum of Squared Error (WSSSE) = " + str(WSSSE))
Code Example #48

#*******************************************************************************************************#

if __name__ == "__main__":
    if len(sys.argv) != 1:
        print("Not a correct no. of arguments", file=sys.stderr)
        exit(-1)
    #*********Initiating the spark context for the application Kmean *****************#
    sc = SparkContext(appName="KMeansApp")
    #************partitioning the data into RDD with the help of function .textFile and parsing it***************#
    inplines = sc.textFile('input.csv')
    inpdata = inplines.map(ParseAndDrop)
    # ********** k defines the number of cluster ********************************************#
    k = int(2)
    model = KMeans.train(inpdata, k)

    #***************reading the data points for calculating the accuracy and predicting the trained model **********#
    with open('Input.csv') as file:
        rows = file.readlines()

    #*************Initialization of the list *******************************************#
    DProw = []
    PredVal_arrays = []
    ActVal_arrays = []
    #***********************************************************************************#
    for row in rows:
        row = row.rstrip("\n")

        #***********************Initializing an output list *************************************#
        Out = []
Code Example #49
    # Path to log input file
    logFile = "/user/root/src/Project - Developer - apache-access-log (4).txt.gz"

    # Read log text file and parse based on Apache log standard
    parsed_logs, access_logs = parseLogs(sc, logFile)

    # Process data for feature columns to be used in training
    df4 = dataProcessing(access_logs)
    df4.show()

    # Format DataFrame into Dense Vector for mllib K-means clustering
    data7 = df4.rdd.map(lambda row: Vectors.dense(row[2], row[3]))
    data7.cache()

    # Train Data for kmeans model
    kmeans = KMeans.train(data7, 3, 10)

    # Print the centers to check
    centers = kmeans.clusterCenters
    for center in centers:
        print(center)
    WSSSE = data7.map(lambda point: error(point, kmeans)).reduce(
        lambda x, y: x + y)
    print "Within Set Sum of Squared Error = " + str(WSSSE)
    # Convert DataFrame object to RDD object to add cluster predictions
    rowsRDD = df4.rdd.map(lambda r: (r[0], r[1], r[2], r[3], r[4]))
    rowsRDD.cache()
    predictions = rowsRDD.map(lambda r: (r[0], r[1], r[2], r[3], r[
        4], kmeans.predict(Vectors.dense(r[2], r[3]))))
    predDF = predictions.toDF()
    predDF.show()
Code Example #50
# Spark initialization
conf = pyspark.SparkConf().setAppName("kmeans").setMaster("local")
sc = pyspark.SparkContext(conf=conf)

# Arguments parsing
file_name = sys.argv[1]
k = int(sys.argv[2])
output_file_name = sys.argv[3]

# Initialization
points=sc.textFile(file_name).map(lambda x: x.split(" ")).\
        map(lambda (x,y): (float(x), float(y)))

clusters = KMeans.train(points,
                        k,
                        maxIterations=100,
                        initializationMode="kmeans||")

points.map(lambda x: "{0} {1} {2}".format(clusters.predict(x), x[0], x[1])).\
    saveAsTextFile(output_file_name)

write_centroids(clusters.centers,
                os.path.join(output_file_name, "centroids_final.txt"))


def error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))


wsse = points.map(lambda point: error(point)).reduce(lambda x, y: x + y)
Code Example #51
from pyspark import SparkContext
from pyspark.mllib.clustering import KMeans
from numpy import array
from math import sqrt

sc = SparkContext()

#4 data points (0.0, 0.0), (1.0, 1.0), (9.0, 8.0) (8.0, 9.0)
data = array([0.0, 0.0, 1.0, 1.0, 9.0, 8.0, 8.0, 9.0]).reshape(4, 2)

#Generate K means
model = KMeans.train(sc.parallelize(data),
                     2,
                     maxIterations=10,
                     runs=30,
                     initializationMode="random")

#Print out the cluster of each data point
print(model.predict(array([0.0, 0.0])))
print(model.predict(array([1.0, 1.0])))
print(model.predict(array([8.0, 0.0])))
print(model.predict(array([9.0, 8.0])))
print(model.predict(array([8.0, 9.0])))
print(model.predict(array([8.0, 7.0])))
Code Example #52
# NOT MY CODE, modified from Apache Spark Python example of MLlib - Clustering
# http://spark.apache.org/docs/latest/mllib-clustering.html
###############################################################################

from pyspark.mllib.clustering import KMeans, KMeansModel
from numpy import array
#from math import sqrt

# Load and parse the data
#fileName = "data/mllib/kmeans_data.txt"
fileName = "data.txt"
data = sc.textFile(fileName, 8) #partition goes here
parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')])).cache()

# Build the models with different seeders: random or fariest spots
c1_clusters = KMeans.train(parsedData, 10, maxIterations=20, runs=1, initializationMode="random")

c2_clusters = KMeans.train(parsedData, 10, maxIterations=20, runs=1, initializationMode='k-means||')

#c1_initials=sc.textFile('c1.txt').map(lambda line: array([float(x) for x in line.split(' ')]))
#c1_preset_clusters = KMeans.train(parsedData, 10, maxIterations=20, initialModel=c1_initials) #new parameters in Spark v1.5.0

# Evaluate clustering by computing Within Set Sum of Squared Errors
def error(point, model):
    center = model.centers[model.predict(point)]
    return sum([x**2 for x in (point - center)])**0.5

def wssse(dataRDD, model):
  return dataRDD.map(lambda point: error(point, model)).reduce(lambda x, y: x + y)

c1_WSSSE = wssse(parsedData, c1_clusters)
Code Example #53
'Convert to a distributed RowMatrix'
total_outlier_matrix = RowMatrix(total_outlier_sample)

'Generate summary statistics for the DenseVectors'
desc_total_outlier_matrix = MultivariateStatisticalSummary(
    total_outlier_matrix.rows)

# print ('outlier factors mean:',desc_total_outlier_matrix.mean())
# print ('outlier factors variance:',desc_total_outlier_matrix.variance())

# Training
num_clusters = 9
num_iterations = 20
num_runs = 3

outlier_cluster_model = KMeans.train(total_outlier_sample, num_clusters,
                                     num_iterations, num_runs)
outlier_predictions = outlier_cluster_model.predict(total_outlier_sample)

print('Predicted labels for the first ten outlier samples: ' +
      ",".join([str(i) for i in outlier_predictions.take(10)]))

# Evaluate the model
# Internal metric: WCSS (within-cluster sum of squares)
outlier_cost = outlier_cluster_model.computeCost(total_outlier_sample)
print("WCSS for outlier_sample: %f" % outlier_cost)

# External metrics
# Classification metrics on labeled data

# Tune K via cross-validation
train_test_split_outlier = total_outlier_sample.randomSplit([0.6, 0.4], 123)
Code Example #54
    .builder \
    .appName("KMeans") \
    .config("spark.some.config.option", "Angadpreet-KMeans") \
    .getOrCreate()
today = dt.datetime.today()
spark_df = sc.parallelize(spark.read.json("Data/yelp_academic_dataset_business.json").select("stars","review_count","is_open").take(1700))
scaler = MinMaxScaler(inputCol="_1",\
         outputCol="scaled_1")
trial_df = spark_df.map(lambda x: pyspark.ml.linalg.Vectors.dense(x)).map(lambda x:(x, )).toDF()
scalerModel = scaler.fit(trial_df)
vector_df = scalerModel.transform(trial_df).select("scaled_1").rdd.map(lambda x:Vectors.dense(x))
num_clusters = 3

#Input into the Algorithm
km = KMeans()
kme = km.train(vector_df, k = num_clusters, maxIterations = 10, seed=2018)
centers = kme.clusterCenters

err = vector_df.map(lambda x:(x[0], findCenter(x[0], centers))).collect()

#Silhoutte Value comparison
ag = 0
agi = 0
for er in err:
    avg = [0] * num_clusters
    avgi = [0] * num_clusters
    for e in err:
        avg[e[1]] += Vectors.squared_distance(er[0], e[0])
        avgi[e[1]] += 1
    a = avg[er[1]] / avgi[er[1]]
    b = sys.maxint
Code Example #55
                random.normal(incomecentroid, 10000.0),
                random.normal(agecentroid, 2.0)
            ])
    X = array(X)
    return X


def error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))


#Load data and normalize it with scale
data = sc.parallelize(scale(createClusteredData(100, K)))
clusters = KMeans.train(data,
                        K,
                        maxIterations=10,
                        runs=10,
                        initializationMode="random")
resultRDD = data.map(lambda point: clusters.predict(point)).cache()
print "Counts by value"
counts = resultRDD.countByValue()
print counts
print "Actual assignments"
result = resultRDD.collect()
print result

#within set sum of squared errors
WSSE = data.map(lambda point: error(point)).reduce(lambda x, y: x + y)
print str(WSSE)
Code Example #56
    currTime = strftime("%Y-%m-%d-%H-%M-%S")
    sc = SparkContext(appName="KMeans")
    lines = sc.textFile("hdfs://masterNode:9000/user/spark/dataset_observatory/initial_centroids.csv")
    dataset = sc.textFile("hdfs://masterNode:9000/user/spark/dataset_observatory/training_data.csv")
    predict_data = sc.textFile("hdfs://masterNode:9000/user/spark/dataset_observatory/predict_data/Semestres/Semestre1-2016.csv")

    average_per_year = average_year(lines) # 2014 and 2015
    average_per_month = average_month(average_per_year)
    data = parseDataset(dataset)
    k = int(sys.argv[1])
    initial_centroids = generate_initial_centroids(average_per_month.collect(), k)

    # KMeans
    start = time()
    kmeans_model = KMeans.train(data, k, maxIterations = 100, initialModel = KMeansModel(initial_centroids))
    end = time()
    elapsed_time = end - start
    kmeans_output = [
        "====================== KMeans ====================\n",
        "Final centers: " + str(kmeans_model.clusterCenters),
        "Total Cost: " + str(kmeans_model.computeCost(data)),
        "Value of K: " + str(k),
        "Elapsed time: %0.10f seconds." % elapsed_time
    ]

    # Predicting
    points = parseDataset(predict_data)
    count_lines = float(len(points.collect()))
    probabilities = generate_probabilities(points, k, kmeans_model, count_lines)
    print("Prob: ", probabilities)
Code Example #57
from pyspark.mllib.clustering import KMeans, KMeansModel
from pyspark import SparkContext
import json
import sys

# Load and parse the data
sc = SparkContext("local", "Python K-Means Amazon Reviews")

# First arg must be the filename
filename = sys.argv[1]
data = sc.textFile(filename)
parsedData = data.map(lambda line: json.loads(line)).\
             map(lambda line: (float(line['overall']),
                               float(len(line['reviewText'])),
                               float(line['unixReviewTime'])))

# Build the model (cluster the data)
clusters = KMeans.train(parsedData,
                        2,
                        maxIterations=10,
                        runs=10,
                        initializationMode="random")

# Evaluate clustering by computing Within Set Sum of Squared Errors
WSSSE = clusters.computeCost(parsedData)
print("Within Set Sum of Squared Error = " + str(WSSSE))
# print clusters.centers

# Save and load model
# clusters.save(sc, "myModelPath")
# sameModel = KMeansModel.load(sc, "myModelPath")
Code Example #58
from numpy import array
from math import sqrt
import json
from pyspark import SparkContext, SparkConf
from pyspark.mllib.clustering import KMeans

conf = SparkConf().setAppName("KMeans WSSSE")
sc = SparkContext(conf=conf)

coordinates = sc.textFile(
    "hdfs:///user/emilojkovic/data/az_businesses_kmeans/part-00000")


def error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))


errors = []
# Build the model (cluster the data)
for i in range(1, 15):
    clusters = KMeans.train(data,
                            i,
                            maxIterations=300,
                            runs=10,
                            initializationMode="k-means")
    WSSSE = data.map(lambda point: error(point)).reduce(lambda x, y: x + y)
    errors.append((i, str(WSSSE)))

sc.parallelize(errors).coalesce(1).saveAsTextFile(
    'hdfs:///user/emilojkovic/kmeans_wssse')
Code Example #59
    if l[2] == '-':
        l[2] = 0
    return np.array([float(l[1]), float(l[2])])


if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: kmeans <file> <k>", file=sys.stderr)
        exit(-1)
    fp = open('ballout.csv', 'w')
    writer = csv.writer(fp)
    sc = SparkContext(appName="KMeans")
    lines = sc.textFile(sys.argv[1])
    data = lines.map(parseVector)
    k = int(sys.argv[2])
    model = KMeans.train(data, k)  #batsman on ave and strike rate
    #model=KMeans.train(sc.parallelize(data),k,maxIterations=10,runs=30,initialzationMode="random")
    print("labels : ",
          data.map(model.predict))  #bowler on ave and no of wickets
    print("Final centers: " + str(model.clusterCenters))
    cluster_ind = model.predict(data)
    lis = []
    f = open('bowl.csv', 'r')
    st = f.read()
    some = st.split('\n')
    i = 0

    for x in cluster_ind.collect():
        print(x)
        l1 = []
        l1.append(x)
Code Example #60
    observation_group_1.append(randrange(5, 8))

observation_group_2=[]
for i in range(n_in_each_group*n_of_feature):
    observation_group_2.append(randrange(55, 58))

observation_group_3=[]
for i in range(n_in_each_group*n_of_feature):
    observation_group_3.append(randrange(105, 108))

data = array([observation_group_1, observation_group_2, observation_group_3]).reshape(n_in_each_group*3, 5)
data = sc.parallelize(data)


# Run the K-Means algorithm -----------------------------------------------------


# Build the K-Means model
clusters = KMeans.train(data, 3, maxIterations=10, initializationMode="random")  # the initializationMode can also be "k-means||" or set by users.

# Collect the clustering result
result=data.map(lambda point: clusters.predict(point)).collect()
print result

# Evaluate clustering by computing Within Set Sum of Squared Errors
def error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))

WSSSE = data.map(lambda point: error(point)).reduce(lambda x, y: x + y)
print("Within Set Sum of Squared Error = " + str(WSSSE))